cua-agent 0.3.2 → 0.4.0 (py3-none-any.whl)

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: the registry flags this version of cua-agent as potentially problematic.
Files changed (112):
  1. agent/__init__.py +21 -12
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +229 -0
  5. agent/agent.py +594 -0
  6. agent/callbacks/__init__.py +19 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/telemetry.py +210 -0
  13. agent/callbacks/trajectory_saver.py +305 -0
  14. agent/cli.py +297 -0
  15. agent/computer_handler.py +107 -0
  16. agent/decorators.py +90 -0
  17. agent/loops/__init__.py +11 -0
  18. agent/loops/anthropic.py +728 -0
  19. agent/loops/omniparser.py +339 -0
  20. agent/loops/openai.py +95 -0
  21. agent/loops/uitars.py +688 -0
  22. agent/responses.py +207 -0
  23. agent/telemetry.py +135 -14
  24. agent/types.py +79 -0
  25. agent/ui/__init__.py +7 -1
  26. agent/ui/__main__.py +2 -13
  27. agent/ui/gradio/__init__.py +6 -19
  28. agent/ui/gradio/app.py +94 -1313
  29. agent/ui/gradio/ui_components.py +721 -0
  30. cua_agent-0.4.0.dist-info/METADATA +424 -0
  31. cua_agent-0.4.0.dist-info/RECORD +33 -0
  32. agent/core/__init__.py +0 -27
  33. agent/core/agent.py +0 -210
  34. agent/core/base.py +0 -217
  35. agent/core/callbacks.py +0 -200
  36. agent/core/experiment.py +0 -249
  37. agent/core/factory.py +0 -122
  38. agent/core/messages.py +0 -332
  39. agent/core/provider_config.py +0 -21
  40. agent/core/telemetry.py +0 -142
  41. agent/core/tools/__init__.py +0 -21
  42. agent/core/tools/base.py +0 -74
  43. agent/core/tools/bash.py +0 -52
  44. agent/core/tools/collection.py +0 -46
  45. agent/core/tools/computer.py +0 -113
  46. agent/core/tools/edit.py +0 -67
  47. agent/core/tools/manager.py +0 -56
  48. agent/core/tools.py +0 -32
  49. agent/core/types.py +0 -88
  50. agent/core/visualization.py +0 -197
  51. agent/providers/__init__.py +0 -4
  52. agent/providers/anthropic/__init__.py +0 -6
  53. agent/providers/anthropic/api/client.py +0 -360
  54. agent/providers/anthropic/api/logging.py +0 -150
  55. agent/providers/anthropic/api_handler.py +0 -140
  56. agent/providers/anthropic/callbacks/__init__.py +0 -5
  57. agent/providers/anthropic/callbacks/manager.py +0 -65
  58. agent/providers/anthropic/loop.py +0 -568
  59. agent/providers/anthropic/prompts.py +0 -23
  60. agent/providers/anthropic/response_handler.py +0 -226
  61. agent/providers/anthropic/tools/__init__.py +0 -33
  62. agent/providers/anthropic/tools/base.py +0 -88
  63. agent/providers/anthropic/tools/bash.py +0 -66
  64. agent/providers/anthropic/tools/collection.py +0 -34
  65. agent/providers/anthropic/tools/computer.py +0 -396
  66. agent/providers/anthropic/tools/edit.py +0 -326
  67. agent/providers/anthropic/tools/manager.py +0 -54
  68. agent/providers/anthropic/tools/run.py +0 -42
  69. agent/providers/anthropic/types.py +0 -16
  70. agent/providers/anthropic/utils.py +0 -381
  71. agent/providers/omni/__init__.py +0 -8
  72. agent/providers/omni/api_handler.py +0 -42
  73. agent/providers/omni/clients/anthropic.py +0 -103
  74. agent/providers/omni/clients/base.py +0 -35
  75. agent/providers/omni/clients/oaicompat.py +0 -195
  76. agent/providers/omni/clients/ollama.py +0 -122
  77. agent/providers/omni/clients/openai.py +0 -155
  78. agent/providers/omni/clients/utils.py +0 -25
  79. agent/providers/omni/image_utils.py +0 -34
  80. agent/providers/omni/loop.py +0 -990
  81. agent/providers/omni/parser.py +0 -307
  82. agent/providers/omni/prompts.py +0 -64
  83. agent/providers/omni/tools/__init__.py +0 -30
  84. agent/providers/omni/tools/base.py +0 -29
  85. agent/providers/omni/tools/bash.py +0 -74
  86. agent/providers/omni/tools/computer.py +0 -179
  87. agent/providers/omni/tools/manager.py +0 -61
  88. agent/providers/omni/utils.py +0 -236
  89. agent/providers/openai/__init__.py +0 -6
  90. agent/providers/openai/api_handler.py +0 -456
  91. agent/providers/openai/loop.py +0 -472
  92. agent/providers/openai/response_handler.py +0 -205
  93. agent/providers/openai/tools/__init__.py +0 -15
  94. agent/providers/openai/tools/base.py +0 -79
  95. agent/providers/openai/tools/computer.py +0 -326
  96. agent/providers/openai/tools/manager.py +0 -106
  97. agent/providers/openai/types.py +0 -36
  98. agent/providers/openai/utils.py +0 -98
  99. agent/providers/uitars/__init__.py +0 -1
  100. agent/providers/uitars/clients/base.py +0 -35
  101. agent/providers/uitars/clients/mlxvlm.py +0 -263
  102. agent/providers/uitars/clients/oaicompat.py +0 -214
  103. agent/providers/uitars/loop.py +0 -660
  104. agent/providers/uitars/prompts.py +0 -63
  105. agent/providers/uitars/tools/__init__.py +0 -1
  106. agent/providers/uitars/tools/computer.py +0 -283
  107. agent/providers/uitars/tools/manager.py +0 -60
  108. agent/providers/uitars/utils.py +0 -264
  109. cua_agent-0.3.2.dist-info/METADATA +0 -295
  110. cua_agent-0.3.2.dist-info/RECORD +0 -87
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0.dist-info}/WHEEL +0 -0
  112. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0.dist-info}/entry_points.txt +0 -0
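
The headline change in 0.4.0 is the removal of the AgentLoop / LLM / LLMProvider configuration objects (the agent/core and agent/providers trees deleted above) in favor of a single provider-prefixed model string passed to ComputerAgent. The following is a rough sketch of what calling code looks like after the migration, assembled from the new imports and the create_agent() keyword arguments visible in the app.py diff below; the Computer() setup and the shape of agent.run() are assumed to carry over from 0.3.2 and may differ in the released package.

    import asyncio

    from computer import Computer
    from agent import ComputerAgent  # 0.4.0: one provider-prefixed model string, no LLM/AgentLoop/LLMProvider

    async def main():
        # Assumption: Computer() is still an async context manager, as in the 0.3.2 examples.
        async with Computer() as computer:
            agent = ComputerAgent(
                model="anthropic/claude-3-5-sonnet-20241022",  # or "openai/computer-use-preview", "omniparser+ollama_chat/gemma3", ...
                tools=[computer],               # replaces the old computer=... argument
                only_n_most_recent_images=3,
                # max_trajectory_budget=5.0,    # new optional kwarg surfaced in create_agent()
            )
            # Assumption: run() still yields results as an async iterator, as it did in 0.3.2.
            async for result in agent.run("Open Safari and search for 'trycua'"):
                print(result)

    if __name__ == "__main__":
        asyncio.run(main())

Model strings such as huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B or omniparser+anthropic/claude-3-5-sonnet-20241022 select the other backends listed in the new module docstring.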
agent/ui/gradio/app.py CHANGED
@@ -1,27 +1,18 @@
  """
- Advanced Gradio UI for Computer-Use Agent
+ Advanced Gradio UI for Computer-Use Agent (cua-agent)

- This is a Gradio interface for the Computer-Use Agent
+ This is a Gradio interface for the Computer-Use Agent v0.4.x (cua-agent)
  with an advanced UI for model selection and configuration.

- Supported Agent Loops and Models:
- - AgentLoop.OPENAI: Uses OpenAI Operator CUA model
- computer-use-preview
-
- - AgentLoop.ANTHROPIC: Uses Anthropic Computer-Use models
- • claude-3-5-sonnet-20240620
- • claude-3-7-sonnet-20250219
-
- - AgentLoop.OMNI (experimental): Uses OmniParser for element pixel-detection
- • claude-3-5-sonnet-20240620
- • claude-3-7-sonnet-20250219
- • gpt-4.5-preview
- • gpt-4o
- • gpt-4
+ Supported Agent Models:
+ - OpenAI: openai/computer-use-preview
+ - Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
+ - UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
+ - Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3

  Requirements:
- - Mac with Apple Silicon (M1/M2/M3/M4)
- - macOS 14 (Sonoma) or newer
+ - Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
+ - macOS 14 (Sonoma) or newer / Ubuntu 20.04+
  - Python 3.11+
  - Lume CLI installed (https://github.com/trycua/cua)
  - OpenAI or Anthropic API key
@@ -39,19 +30,21 @@ from gradio.components.chatbot import MetadataDict
  from typing import cast

  # Import from agent package
- from agent.core.types import AgentResponse
- from agent.core.callbacks import DefaultCallbackHandler
+ from agent import ComputerAgent
+ from agent.types import Messages, AgentResponse
  from computer import Computer

- from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
-
  # Global variables
  global_agent = None
  global_computer = None
  SETTINGS_FILE = Path(".gradio_settings.json")

- # We'll use asyncio.run() instead of a persistent event loop

+ import dotenv
+ if dotenv.load_dotenv():
+ print(f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}")
+ else:
+ print("DEBUG - No .env file found")

  # --- Settings Load/Save Functions ---
  def load_settings() -> Dict[str, Any]:
@@ -60,7 +53,6 @@ def load_settings() -> Dict[str, Any]:
  try:
  with open(SETTINGS_FILE, "r") as f:
  settings = json.load(f)
- # Basic validation (can be expanded)
  if isinstance(settings, dict):
  print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
  return settings
@@ -71,7 +63,6 @@ def load_settings() -> Dict[str, Any]:

  def save_settings(settings: Dict[str, Any]):
  """Saves settings to the JSON file."""
- # Ensure sensitive keys are not saved
  settings.pop("provider_api_key", None)
  try:
  with open(SETTINGS_FILE, "w") as f:
@@ -81,246 +72,90 @@ def save_settings(settings: Dict[str, Any]):
  print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")


- # --- End Settings Load/Save ---
-
-
- # Custom Screenshot Handler for Gradio chat
- class GradioChatScreenshotHandler(DefaultCallbackHandler):
- """Custom handler that adds screenshots to the Gradio chatbot and updates annotated image."""
-
- def __init__(self, chatbot_history: List[gr.ChatMessage]):
- """Initialize with reference to chat history and annotated image component.
-
- Args:
- chatbot_history: Reference to the Gradio chatbot history list
- annotated_image: Reference to the annotated image component
- """
- self.chatbot_history = chatbot_history
- print("GradioChatScreenshotHandler initialized")
+ # # Custom Screenshot Handler for Gradio chat
+ # class GradioChatScreenshotHandler:
+ # """Custom handler that adds screenshots to the Gradio chatbot."""

- async def on_screenshot(
- self,
- screenshot_base64: str,
- action_type: str = "",
- parsed_screen: Optional[dict] = None,
- ) -> None:
- """Add screenshot to chatbot when a screenshot is taken and update the annotated image.
+ # def __init__(self, chatbot_history: List[gr.ChatMessage]):
+ # self.chatbot_history = chatbot_history
+ # print("GradioChatScreenshotHandler initialized")

- Args:
- screenshot_base64: Base64 encoded screenshot
- action_type: Type of action that triggered the screenshot
-
- Returns:
- Original screenshot (does not modify it)
- """
- # Create a markdown image element for the screenshot
- image_markdown = (
- f"![Screenshot after {action_type}](data:image/png;base64,{screenshot_base64})"
- )
-
- # Simply append the screenshot as a new message
- if self.chatbot_history is not None:
- self.chatbot_history.append(
- gr.ChatMessage(
- role="assistant",
- content=image_markdown,
- metadata={"title": f"🖥️ Screenshot - {action_type}", "status": "done"},
- )
- )
+ # async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
+ # """Add screenshot to chatbot when a screenshot is taken."""
+ # image_markdown = f"![Screenshot after {action_type}](data:image/png;base64,{screenshot_base64})"
+
+ # if self.chatbot_history is not None:
+ # self.chatbot_history.append(
+ # gr.ChatMessage(
+ # role="assistant",
+ # content=image_markdown,
+ # metadata={"title": f"🖥️ Screenshot - {action_type}", "status": "done"},
+ # )
+ # )


- # Detect if current device is MacOS
+ # Detect platform capabilities
  is_mac = platform.system().lower() == "darwin"
-
- # Detect if lume is available (host device is macOS)
  is_lume_available = is_mac or (os.environ.get("PYLUME_HOST", "localhost") != "localhost")

  print("PYLUME_HOST: ", os.environ.get("PYLUME_HOST", "localhost"))
  print("is_mac: ", is_mac)
  print("Lume available: ", is_lume_available)

- # Map model names to specific provider model names
+ # Map model names to agent model strings
  MODEL_MAPPINGS = {
  "openai": {
- # Default to operator CUA model
- "default": "computer-use-preview",
- "OpenAI: Computer-Use Preview": "computer-use-preview",
- # Map standard OpenAI model names to CUA-specific model names
- "gpt-4-turbo": "computer-use-preview",
- "gpt-4o": "computer-use-preview",
- "gpt-4": "computer-use-preview",
- "gpt-4.5-preview": "computer-use-preview",
- "gpt-4o-mini": "gpt-4o-mini",
+ "default": "openai/computer-use-preview",
+ "OpenAI: Computer-Use Preview": "openai/computer-use-preview",
  },
  "anthropic": {
- # Default to newest model
- "default": "claude-3-7-sonnet-20250219",
- # New Claude 4 models
- "Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
- "Anthropic: Claude 4 Sonnet (20250514)": "claude-sonnet-4-20250514",
- "claude-opus-4-20250514": "claude-opus-4-20250514",
- "claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
-
- # Specific Claude models for CUA
- "Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-5-sonnet-20240620",
- "Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-7-sonnet-20250219",
- "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
- "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
- # Map standard model names to CUA-specific model names
- "claude-3-opus": "claude-3-7-sonnet-20250219",
- "claude-3-sonnet": "claude-3-5-sonnet-20240620",
- "claude-3-5-sonnet": "claude-3-5-sonnet-20240620",
- "claude-3-7-sonnet": "claude-3-7-sonnet-20250219",
+ "default": "anthropic/claude-3-7-sonnet-20250219",
+ "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
+ "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
+ "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
+ "Anthropic: Claude 3.5 Sonnet (20240620)": "anthropic/claude-3-5-sonnet-20240620",
  },
  "omni": {
- # OMNI works with any of these models
- "default": "gpt-4o",
- "gpt-4o": "gpt-4o",
- "gpt-4o-mini": "gpt-4o-mini",
- "gpt-4": "gpt-4",
- "gpt-4.5-preview": "gpt-4.5-preview",
- "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
- "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
+ "default": "omniparser+openai/gpt-4o",
+ "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
+ "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
+ "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
+ "OMNI: Claude 3.5 Sonnet (20240620)": "omniparser+anthropic/claude-3-5-sonnet-20240620",
  },
  "uitars": {
- # UI-TARS models using MLXVLM provider
- "default": "mlx-community/UI-TARS-1.5-7B-4bit" if is_mac else "tgi",
- "mlx-community/UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit",
- "mlx-community/UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit"
- },
- "ollama": {
- # For Ollama models, we keep the original name
- "default": "llama3", # A common default model
- # Don't map other models - we'll use the original name
- },
- "oaicompat": {
- # Default for OpenAI-compatible providers like VLLM
- "default": "Qwen2.5-VL-7B-Instruct",
+ "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
+ "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
  },
  }


- def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
- """
- Determine the provider and actual model name to use based on the input.
-
- Args:
- model_name: The requested model name
- loop_provider: The requested agent loop provider
-
- Returns:
- tuple: (provider, model_name_to_use, agent_loop)
- """
- # Get the agent loop
- loop_provider_map = {
- "OPENAI": AgentLoop.OPENAI,
- "ANTHROPIC": AgentLoop.ANTHROPIC,
- "OMNI": AgentLoop.OMNI,
- "OMNI-OLLAMA": AgentLoop.OMNI, # Special case for Ollama models with OMNI parser
- "UITARS": AgentLoop.UITARS, # UI-TARS implementation
- }
- agent_loop = loop_provider_map.get(loop_provider, AgentLoop.OPENAI)
-
- # Set up the provider and model based on the loop and model_name
- if agent_loop == AgentLoop.OPENAI:
- provider = LLMProvider.OPENAI
- model_name_to_use = MODEL_MAPPINGS["openai"].get(
- model_name, MODEL_MAPPINGS["openai"]["default"]
- )
- elif agent_loop == AgentLoop.ANTHROPIC:
- provider = LLMProvider.ANTHROPIC
- model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
- model_name, MODEL_MAPPINGS["anthropic"]["default"]
- )
- elif agent_loop == AgentLoop.OMNI:
- # Determine provider and clean model name based on the full string from UI
- cleaned_model_name = model_name # Default to using the name as-is (for custom)
-
- if model_name == "Custom model (OpenAI compatible API)":
- # Actual model name comes from custom_model_value via model_to_use.
- # Assume OAICOMPAT for custom models unless overridden by URL/key later?
- # get_provider_and_model determines the *initial* provider/model.
- # The custom URL/key in process_response ultimately dictates the OAICOMPAT setup.
- provider = LLMProvider.OAICOMPAT
- # We set cleaned_model_name below outside the checks based on model_to_use
- cleaned_model_name = "" # Placeholder, will be set by custom value later
- elif model_name.startswith("OMNI: Ollama "):
- provider = LLMProvider.OLLAMA
- # Extract the part after "OMNI: Ollama "
- cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
- elif model_name.startswith("OMNI: Claude "):
- provider = LLMProvider.ANTHROPIC
-
- model_name = model_name.replace("OMNI: ", "Anthropic: ")
- cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
- model_name, MODEL_MAPPINGS["anthropic"]["default"]
- )
- elif model_name.startswith("OMNI: OpenAI "):
- provider = LLMProvider.OPENAI
- # Extract the model part, e.g., "GPT-4o mini"
- model_key_part = model_name.replace("OMNI: OpenAI ", "")
- # Normalize the extracted part: "gpt4omini"
- model_key_part_norm = model_key_part.lower().replace("-", "").replace(" ", "")
-
- cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
- # Find the canonical name in the main OMNI map for OpenAI models
- for key_omni, val_omni in MODEL_MAPPINGS["omni"].items():
- # Normalize the omni map key: "gpt-4o-mini" -> "gpt4omini"
- key_omni_norm = key_omni.lower().replace("-", "").replace(" ", "")
- # Check if the normalized omni key matches the normalized extracted part
- if key_omni_norm == model_key_part_norm:
- cleaned_model_name = (
- val_omni # Use the value from the OMNI map (e.g., gpt-4o-mini)
- )
- break
- # Note: No fallback needed here as we explicitly check against omni keys
-
- else: # Handles unexpected formats or the raw custom name if "Custom model (OpenAI compatible API)" selected
- # Should only happen if user selected "Custom model (OpenAI compatible API)"
- # Or if a model name format isn't caught above
- provider = LLMProvider.OAICOMPAT
- cleaned_model_name = (
- model_name.strip() if model_name.strip() else MODEL_MAPPINGS["oaicompat"]["default"]
- )
-
- # Assign the determined model name
- model_name_to_use = cleaned_model_name
- # agent_loop remains AgentLoop.OMNI
- elif agent_loop == AgentLoop.UITARS:
- # For UITARS, use MLXVLM for mlx-community models, OAICOMPAT for custom
- if model_name == "Custom model (OpenAI compatible API)":
- provider = LLMProvider.OAICOMPAT
- model_name_to_use = "tgi"
- else:
- provider = LLMProvider.MLXVLM
- # Get the model name from the mappings or use as-is if not found
- model_name_to_use = MODEL_MAPPINGS["uitars"].get(
- model_name, model_name if model_name else MODEL_MAPPINGS["uitars"]["default"]
- )
- else:
- # Default to OpenAI if unrecognized loop
- provider = LLMProvider.OPENAI
- model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
- agent_loop = AgentLoop.OPENAI
-
- print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
-
- return provider, model_name_to_use, agent_loop
+ def get_model_string(model_name: str, loop_provider: str) -> str:
+ """Determine the agent model string based on the input."""
+ if model_name == "Custom model (OpenAI compatible API)":
+ return "custom_oaicompat"
+ elif model_name == "Custom model (ollama)":
+ return "custom_ollama"
+ elif loop_provider == "OMNI-OLLAMA" or model_name.startswith("OMNI: Ollama "):
+ if model_name.startswith("OMNI: Ollama "):
+ ollama_model = model_name.split("OMNI: Ollama ", 1)[1]
+ return f"omniparser+ollama_chat/{ollama_model}"
+ return "omniparser+ollama_chat/llama3"
+
+ # Map based on loop provider
+ mapping = MODEL_MAPPINGS.get(loop_provider.lower(), MODEL_MAPPINGS["openai"])
+ return mapping.get(model_name, mapping["default"])


  def get_ollama_models() -> List[str]:
  """Get available models from Ollama if installed."""
  try:
  import subprocess
-
  result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
  if result.returncode == 0:
  lines = result.stdout.strip().split("\n")
- if len(lines) < 2: # No models or just header
+ if len(lines) < 2:
  return []
-
  models = []
- # Skip header line
  for line in lines[1:]:
  parts = line.split()
  if parts:
@@ -342,7 +177,6 @@ def create_computer_instance(
  ) -> Computer:
  """Create or get the global Computer instance."""
  global global_computer
-
  if global_computer is None:
  global_computer = Computer(
  verbosity=verbosity,
@@ -351,29 +185,25 @@
  name=name if name else "",
  api_key=api_key
  )
-
  return global_computer


  def create_agent(
- provider: LLMProvider,
- agent_loop: AgentLoop,
- model_name: str,
- api_key: Optional[str] = None,
+ model_string: str,
  save_trajectory: bool = True,
  only_n_most_recent_images: int = 3,
  verbosity: int = logging.INFO,
- use_oaicompat: bool = False,
- provider_base_url: Optional[str] = None,
+ custom_model_name: Optional[str] = None,
  computer_os: str = "macos",
  computer_provider: str = "lume",
  computer_name: Optional[str] = None,
  computer_api_key: Optional[str] = None,
+ max_trajectory_budget: Optional[float] = None,
  ) -> ComputerAgent:
  """Create or update the global agent with the specified parameters."""
  global global_agent

- # Create the computer if not already done
+ # Create the computer
  computer = create_computer_instance(
  verbosity=verbosity,
  os_type=computer_os,
@@ -382,1086 +212,37 @@ def create_agent(
382
212
  api_key=computer_api_key
383
213
  )
384
214
 
385
- # Get API key from environment if not provided
386
- if api_key is None:
387
- if provider == LLMProvider.OPENAI:
388
- api_key = os.environ.get("OPENAI_API_KEY", "")
389
- elif provider == LLMProvider.ANTHROPIC:
390
- api_key = os.environ.get("ANTHROPIC_API_KEY", "")
391
-
392
- # Use provided provider_base_url if available, otherwise use default
393
- default_base_url = "http://localhost:1234/v1" if use_oaicompat else None
394
- custom_base_url = provider_base_url or default_base_url
395
-
396
- if use_oaicompat:
397
- # Special handling for OAICOMPAT - use OAICOMPAT provider with custom base URL
398
- print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}, URL: {custom_base_url}")
399
- llm = LLM(
400
- provider=LLMProvider.OAICOMPAT, # Set to OAICOMPAT instead of using original provider
401
- name=model_name,
402
- provider_base_url=custom_base_url,
403
- )
404
- print(f"DEBUG - LLM provider is now: {llm.provider}, base URL: {llm.provider_base_url}")
405
- # Note: Don't pass use_oaicompat to the agent, as it doesn't accept this parameter
406
- elif provider == LLMProvider.OAICOMPAT:
407
- # This path is unlikely to be taken with our current approach
408
- llm = LLM(provider=provider, name=model_name, provider_base_url=custom_base_url)
409
- else:
410
- # For other providers, just use standard parameters
411
- llm = LLM(provider=provider, name=model_name)
412
-
413
- # Create or update the agent
414
- global_agent = ComputerAgent(
415
- computer=computer,
416
- loop=agent_loop,
417
- model=llm,
418
- api_key=api_key,
419
- save_trajectory=save_trajectory,
420
- only_n_most_recent_images=only_n_most_recent_images,
421
- verbosity=verbosity,
422
- )
423
-
424
- return global_agent
425
-
426
-
427
- def create_gradio_ui(
428
- provider_name: str = "openai",
429
- model_name: str = "gpt-4o",
430
- ) -> gr.Blocks:
431
- """Create a Gradio UI for the Computer-Use Agent.
432
-
433
- Args:
434
- provider_name: The provider to use (e.g., "openai", "anthropic")
435
- model_name: The model to use (e.g., "gpt-4o", "claude-3-7-sonnet")
436
-
437
- Returns:
438
- A Gradio Blocks application
439
- """
440
- # --- Load Settings ---
441
- saved_settings = load_settings()
442
- # --- End Load Settings ---
443
-
444
- # Check for API keys
445
- openai_api_key = os.environ.get("OPENAI_API_KEY", "")
446
- anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
447
- cua_api_key = os.environ.get("CUA_API_KEY", "")
448
-
449
- # Always show models regardless of API key availability
450
- openai_models = ["OpenAI: Computer-Use Preview"]
451
- anthropic_models = [
452
- "Anthropic: Claude 4 Opus (20250514)",
453
- "Anthropic: Claude 4 Sonnet (20250514)",
454
-
455
- "Anthropic: Claude 3.7 Sonnet (20250219)",
456
- "Anthropic: Claude 3.5 Sonnet (20240620)",
457
- ]
458
- omni_models = [
459
- "OMNI: OpenAI GPT-4o",
460
- "OMNI: OpenAI GPT-4o mini",
461
- "OMNI: OpenAI GPT-4.5-preview",
462
- "OMNI: Claude 4 Opus (20250514)",
463
- "OMNI: Claude 4 Sonnet (20250514)",
464
- "OMNI: Claude 3.7 Sonnet (20250219)",
465
- "OMNI: Claude 3.5 Sonnet (20240620)"
466
- ]
467
-
468
- # Check if API keys are available
469
- has_openai_key = bool(openai_api_key)
470
- has_anthropic_key = bool(anthropic_api_key)
471
- has_cua_key = bool(cua_api_key)
472
-
473
- print("has_openai_key", has_openai_key)
474
- print("has_anthropic_key", has_anthropic_key)
475
- print("has_cua_key", has_cua_key)
476
-
477
- # Get Ollama models for OMNI
478
- ollama_models = get_ollama_models()
479
- if ollama_models:
480
- omni_models += ollama_models
481
-
482
- # Detect if current device is MacOS
483
- is_mac = platform.system().lower() == "darwin"
484
-
485
- # Format model choices
486
- provider_to_models = {
487
- "OPENAI": openai_models,
488
- "ANTHROPIC": anthropic_models,
489
- "OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"], # Add custom model options
490
- "UITARS": ([
491
- "mlx-community/UI-TARS-1.5-7B-4bit",
492
- "mlx-community/UI-TARS-1.5-7B-6bit",
493
- ] if is_mac else []) + ["Custom model (OpenAI compatible API)"], # UI-TARS options with MLX models
215
+ # Handle custom models
216
+ if model_string == "custom_oaicompat" and custom_model_name:
217
+ model_string = custom_model_name
218
+ elif model_string == "custom_ollama" and custom_model_name:
219
+ model_string = f"omniparser+ollama_chat/{custom_model_name}"
220
+
221
+ # Create agent kwargs
222
+ agent_kwargs = {
223
+ "model": model_string,
224
+ "tools": [computer],
225
+ "only_n_most_recent_images": only_n_most_recent_images,
226
+ "verbosity": verbosity,
494
227
  }
495
-
496
- # --- Apply Saved Settings (override defaults if available) ---
497
- initial_loop = saved_settings.get("agent_loop", "OMNI")
498
- # Ensure the saved model is actually available in the choices for the loaded loop
499
- available_models_for_loop = provider_to_models.get(initial_loop, [])
500
- saved_model_choice = saved_settings.get("model_choice")
501
- if saved_model_choice and saved_model_choice in available_models_for_loop:
502
- initial_model = saved_model_choice
503
- else:
504
- # If saved model isn't valid for the loop, reset to default for that loop
505
- if initial_loop == "OPENAI":
506
- initial_model = (
507
- "OpenAI: Computer-Use Preview" if openai_models else "No models available"
508
- )
509
- elif initial_loop == "ANTHROPIC":
510
- initial_model = anthropic_models[0] if anthropic_models else "No models available"
511
- else: # OMNI
512
- initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
513
- if "Custom model (OpenAI compatible API)" in available_models_for_loop:
514
- initial_model = (
515
- "Custom model (OpenAI compatible API)" # Default to custom if available and no other default fits
516
- )
517
-
518
- initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
519
- initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
520
- initial_save_trajectory = saved_settings.get("save_trajectory", True)
521
- initial_recent_images = saved_settings.get("recent_images", 3)
522
- # --- End Apply Saved Settings ---
523
-
524
- # Example prompts
525
- example_messages = [
526
- "Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
527
- "Open a PDF in Preview, add annotations, and save it as a compressed version",
528
- "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
529
- "Configure SSH keys and set up a connection to a remote server",
530
- ]
531
228
 
532
- # Function to generate Python code based on configuration and tasks
533
- def generate_python_code(agent_loop_choice, provider, model_name, tasks, provider_url, recent_images=3, save_trajectory=True, computer_os="macos", computer_provider="lume", container_name="", cua_cloud_api_key=""):
534
- """Generate Python code for the current configuration and tasks.
535
-
536
- Args:
537
- agent_loop_choice: The agent loop type (e.g., UITARS, OPENAI, ANTHROPIC, OMNI)
538
- provider: The provider type (e.g., OPENAI, ANTHROPIC, OLLAMA, OAICOMPAT, MLXVLM)
539
- model_name: The model name
540
- tasks: List of tasks to execute
541
- provider_url: The provider base URL for OAICOMPAT providers
542
- recent_images: Number of recent images to keep in context
543
- save_trajectory: Whether to save the agent trajectory
544
- computer_os: Operating system type for the computer
545
- computer_provider: Provider type for the computer
546
- container_name: Optional VM name
547
- cua_cloud_api_key: Optional CUA Cloud API key
548
-
549
- Returns:
550
- Formatted Python code as a string
551
- """
552
- # Format the tasks as a Python list
553
- tasks_str = ""
554
- for task in tasks:
555
- if task and task.strip():
556
- tasks_str += f' "{task}",\n'
557
-
558
- # Create the Python code template with computer configuration
559
- computer_args = []
560
- if computer_os != "macos":
561
- computer_args.append(f'os_type="{computer_os}"')
562
- if computer_provider != "lume":
563
- computer_args.append(f'provider_type="{computer_provider}"')
564
- if container_name:
565
- computer_args.append(f'name="{container_name}"')
566
- if cua_cloud_api_key:
567
- computer_args.append(f'api_key="{cua_cloud_api_key}"')
568
-
569
- computer_args_str = ", ".join(computer_args)
570
- if computer_args_str:
571
- computer_args_str = f"({computer_args_str})"
572
- else:
573
- computer_args_str = "()"
574
-
575
- code = f'''import asyncio
576
- from computer import Computer
577
- from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
578
-
579
- async def main():
580
- async with Computer{computer_args_str} as macos_computer:
581
- agent = ComputerAgent(
582
- computer=macos_computer,
583
- loop=AgentLoop.{agent_loop_choice},
584
- only_n_most_recent_images={recent_images},
585
- save_trajectory={save_trajectory},'''
586
-
587
- # Add the model configuration based on provider and agent loop
588
- if agent_loop_choice == "OPENAI":
589
- # For OPENAI loop, always use OPENAI provider with computer-use-preview
590
- code += f'''
591
- model=LLM(
592
- provider=LLMProvider.OPENAI,
593
- name="computer-use-preview"
594
- )'''
595
- elif agent_loop_choice == "ANTHROPIC":
596
- # For ANTHROPIC loop, always use ANTHROPIC provider
597
- code += f'''
598
- model=LLM(
599
- provider=LLMProvider.ANTHROPIC,
600
- name="{model_name}"
601
- )'''
602
- elif agent_loop_choice == "UITARS":
603
- # For UITARS, use MLXVLM for mlx-community models, OAICOMPAT for others
604
- if provider == LLMProvider.MLXVLM:
605
- code += f'''
606
- model=LLM(
607
- provider=LLMProvider.MLXVLM,
608
- name="{model_name}"
609
- )'''
610
- else: # OAICOMPAT
611
- code += f'''
612
- model=LLM(
613
- provider=LLMProvider.OAICOMPAT,
614
- name="{model_name}",
615
- provider_base_url="{provider_url}"
616
- )'''
617
- elif agent_loop_choice == "OMNI":
618
- # For OMNI, provider can be OPENAI, ANTHROPIC, OLLAMA, or OAICOMPAT
619
- if provider == LLMProvider.OAICOMPAT:
620
- code += f'''
621
- model=LLM(
622
- provider=LLMProvider.OAICOMPAT,
623
- name="{model_name}",
624
- provider_base_url="{provider_url}"
625
- )'''
626
- else: # OPENAI, ANTHROPIC, OLLAMA
627
- code += f'''
628
- model=LLM(
629
- provider=LLMProvider.{provider.name},
630
- name="{model_name}"
631
- )'''
632
- else:
633
- # Default case - just use the provided provider and model
634
- code += f'''
635
- model=LLM(
636
- provider=LLMProvider.{provider.name},
637
- name="{model_name}"
638
- )'''
639
-
640
- code += """
641
- )
642
- """
643
-
644
- # Add tasks section if there are tasks
645
- if tasks_str:
646
- code += f'''
647
- # Prompts for the computer-use agent
648
- tasks = [
649
- {tasks_str.rstrip()}
650
- ]
651
-
652
- for task in tasks:
653
- print(f"Executing task: {{task}}")
654
- async for result in agent.run(task):
655
- print(result)'''
656
- else:
657
- # If no tasks, just add a placeholder for a single task
658
- code += f'''
659
- # Execute a single task
660
- task = "Search for information about CUA on GitHub"
661
- print(f"Executing task: {{task}}")
662
- async for result in agent.run(task):
663
- print(result)'''
664
-
665
-
666
-
667
- # Add the main block
668
- code += '''
669
-
670
- if __name__ == "__main__":
671
- asyncio.run(main())'''
672
-
673
- return code
674
-
675
- # Create the Gradio interface with advanced UI
676
- with gr.Blocks(title="Computer-Use Agent") as demo:
677
- with gr.Row():
678
- # Left column for settings
679
- with gr.Column(scale=1):
680
- # Logo with theme-aware styling
681
- gr.HTML(
682
- """
683
- <style>
684
- .light-logo, .dark-logo {
685
- display: block;
686
- margin: 0 auto;
687
- width: 80px;
688
- }
689
- /* Hide dark logo in light mode */
690
- .dark-logo {
691
- display: none;
692
- }
693
- /* In dark mode, hide light logo and show dark logo */
694
- .dark .light-logo {
695
- display: none;
696
- }
697
- .dark .dark-logo {
698
- display: block;
699
- }
700
- </style>
701
- <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
702
- <img class="light-logo" alt="CUA Logo"
703
- src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
704
- <img class="dark-logo" alt="CUA Logo"
705
- src="https://github.com/trycua/cua/blob/main/img/logo_white.png?raw=true" />
706
- </div>
707
- """
708
- )
709
-
710
- # Add accordion for Python code
711
- with gr.Accordion("Python Code", open=False):
712
- code_display = gr.Code(
713
- language="python",
714
- value=generate_python_code(
715
- initial_loop,
716
- LLMProvider.OPENAI,
717
- "gpt-4o",
718
- [],
719
- "https://openrouter.ai/api/v1",
720
- 3, # recent_images default
721
- True, # save_trajectory default
722
- "macos",
723
- "lume",
724
- "",
725
- ""
726
- ),
727
- interactive=False,
728
- )
729
-
730
- with gr.Accordion("Computer Configuration", open=True):
731
- # Computer configuration options
732
- computer_os = gr.Radio(
733
- choices=["macos", "linux", "windows"],
734
- label="Operating System",
735
- value="macos",
736
- info="Select the operating system for the computer",
737
- )
738
-
739
- is_windows = platform.system().lower() == "windows"
740
- is_mac = platform.system().lower() == "darwin"
741
-
742
- providers = ["cloud"]
743
- if is_lume_available:
744
- providers += ["lume"]
745
- if is_windows:
746
- providers += ["winsandbox"]
747
-
748
- computer_provider = gr.Radio(
749
- choices=providers,
750
- label="Provider",
751
- value="lume" if is_mac else "cloud",
752
- info="Select the computer provider",
753
- )
754
-
755
- container_name = gr.Textbox(
756
- label="Container Name",
757
- placeholder="Enter container name (optional)",
758
- value="",
759
- info="Optional name for the container",
760
- )
761
-
762
- cua_cloud_api_key = gr.Textbox(
763
- label="CUA Cloud API Key",
764
- placeholder="Enter your CUA Cloud API key",
765
- value="",
766
- type="password",
767
- info="Required for cloud provider",
768
- visible=(not has_cua_key)
769
- )
770
-
771
- with gr.Accordion("Agent Configuration", open=True):
772
- # Configuration options
773
- agent_loop = gr.Dropdown(
774
- choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
775
- label="Agent Loop",
776
- value=initial_loop,
777
- info="Select the agent loop provider",
778
- )
779
-
780
-
781
- # Create separate model selection dropdowns for each provider type
782
- # This avoids the Gradio bug with updating choices
783
- with gr.Group() as model_selection_group:
784
- # OpenAI models dropdown
785
- openai_model_choice = gr.Dropdown(
786
- choices=openai_models,
787
- label="OpenAI Model",
788
- value=openai_models[0] if openai_models else "No models available",
789
- info="Select OpenAI model",
790
- interactive=True,
791
- visible=(initial_loop == "OPENAI")
792
- )
793
-
794
- # Anthropic models dropdown
795
- anthropic_model_choice = gr.Dropdown(
796
- choices=anthropic_models,
797
- label="Anthropic Model",
798
- value=anthropic_models[0] if anthropic_models else "No models available",
799
- info="Select Anthropic model",
800
- interactive=True,
801
- visible=(initial_loop == "ANTHROPIC")
802
- )
803
-
804
- # OMNI models dropdown
805
- omni_model_choice = gr.Dropdown(
806
- choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
807
- label="OMNI Model",
808
- value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
809
- info="Select OMNI model or choose a custom model option",
810
- interactive=True,
811
- visible=(initial_loop == "OMNI")
812
- )
813
-
814
- # UITARS models dropdown
815
- uitars_model_choice = gr.Dropdown(
816
- choices=provider_to_models.get("UITARS", ["No models available"]),
817
- label="UITARS Model",
818
- value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
819
- info="Select UITARS model",
820
- interactive=True,
821
- visible=(initial_loop == "UITARS")
822
- )
823
-
824
- # Hidden field to store the selected model (for compatibility with existing code)
825
- model_choice = gr.Textbox(visible=False)
826
-
827
- # Add API key inputs for OpenAI and Anthropic
828
- with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
829
- openai_api_key_input = gr.Textbox(
830
- label="OpenAI API Key",
831
- placeholder="Enter your OpenAI API key",
832
- value="",
833
- interactive=True,
834
- type="password",
835
- info="Required for OpenAI models"
836
- )
837
-
838
- with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
839
- anthropic_api_key_input = gr.Textbox(
840
- label="Anthropic API Key",
841
- placeholder="Enter your Anthropic API key",
842
- value="",
843
- interactive=True,
844
- type="password",
845
- info="Required for Anthropic models"
846
- )
847
-
848
- # Function to set OpenAI API key environment variable
849
- def set_openai_api_key(key):
850
- if key and key.strip():
851
- os.environ["OPENAI_API_KEY"] = key.strip()
852
- print(f"DEBUG - Set OpenAI API key environment variable")
853
- return key
854
-
855
- # Function to set Anthropic API key environment variable
856
- def set_anthropic_api_key(key):
857
- if key and key.strip():
858
- os.environ["ANTHROPIC_API_KEY"] = key.strip()
859
- print(f"DEBUG - Set Anthropic API key environment variable")
860
- return key
861
-
862
- # Add change event handlers for API key inputs
863
- openai_api_key_input.change(
864
- fn=set_openai_api_key,
865
- inputs=[openai_api_key_input],
866
- outputs=[openai_api_key_input],
867
- queue=False
868
- )
869
-
870
- anthropic_api_key_input.change(
871
- fn=set_anthropic_api_key,
872
- inputs=[anthropic_api_key_input],
873
- outputs=[anthropic_api_key_input],
874
- queue=False
875
- )
876
-
877
- # Combined function to update UI based on selections
878
- def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
879
- # Default values if not provided
880
- loop = loop or agent_loop.value
881
-
882
- # Determine which model value to use for custom model checks
883
- model_value = None
884
- if loop == "OPENAI" and openai_model:
885
- model_value = openai_model
886
- elif loop == "ANTHROPIC" and anthropic_model:
887
- model_value = anthropic_model
888
- elif loop == "OMNI" and omni_model:
889
- model_value = omni_model
890
- elif loop == "UITARS" and uitars_model:
891
- model_value = uitars_model
892
-
893
- # Show/hide appropriate model dropdown based on loop selection
894
- openai_visible = (loop == "OPENAI")
895
- anthropic_visible = (loop == "ANTHROPIC")
896
- omni_visible = (loop == "OMNI")
897
- uitars_visible = (loop == "UITARS")
898
-
899
- # Show/hide API key inputs based on loop selection
900
- show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
901
- show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))
902
-
903
- # Determine custom model visibility
904
- is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
905
- is_custom_ollama = model_value == "Custom model (ollama)"
906
- is_any_custom = is_custom_openai_api or is_custom_ollama
907
-
908
- # Update the hidden model_choice field based on the visible dropdown
909
- model_choice_value = model_value if model_value else ""
910
-
911
- # Return all UI updates
912
- return [
913
- # Model dropdowns visibility
914
- gr.update(visible=openai_visible),
915
- gr.update(visible=anthropic_visible),
916
- gr.update(visible=omni_visible),
917
- gr.update(visible=uitars_visible),
918
- # API key inputs visibility
919
- gr.update(visible=show_openai_key),
920
- gr.update(visible=show_anthropic_key),
921
- # Custom model fields visibility
922
- gr.update(visible=is_any_custom), # Custom model name always visible for any custom option
923
- gr.update(visible=is_custom_openai_api), # Provider base URL only for OpenAI compatible API
924
- gr.update(visible=is_custom_openai_api), # Provider API key only for OpenAI compatible API
925
- # Update the hidden model_choice field
926
- gr.update(value=model_choice_value)
927
- ]
928
-
929
- # Add custom model textbox (visible for both custom model options)
930
- custom_model = gr.Textbox(
931
- label="Custom Model Name",
932
- placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
933
- value=initial_custom_model,
934
- visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
935
- interactive=True,
936
- )
937
-
938
- # Add custom provider base URL textbox (only visible for OpenAI compatible API)
939
- provider_base_url = gr.Textbox(
940
- label="Provider Base URL",
941
- placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
942
- value=initial_provider_base_url,
943
- visible=(initial_model == "Custom model (OpenAI compatible API)"),
944
- interactive=True,
945
- )
946
-
947
- # Add custom API key textbox (only visible for OpenAI compatible API)
948
- provider_api_key = gr.Textbox(
949
- label="Provider API Key",
950
- placeholder="Enter provider API key (if required)",
951
- value="",
952
- visible=(initial_model == "Custom model (OpenAI compatible API)"),
953
- interactive=True,
954
- type="password",
955
- )
956
-
957
- # Connect agent_loop changes to update all UI elements
958
- agent_loop.change(
959
- fn=update_ui,
960
- inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
961
- outputs=[
962
- openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
963
- openai_key_group, anthropic_key_group,
964
- custom_model, provider_base_url, provider_api_key,
965
- model_choice # Add model_choice to outputs
966
- ],
967
- queue=False # Process immediately without queueing
968
- )
969
-
970
- # Connect each model dropdown to update UI
971
- omni_model_choice.change(
972
- fn=update_ui,
973
- inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
974
- outputs=[
975
- openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
976
- openai_key_group, anthropic_key_group,
977
- custom_model, provider_base_url, provider_api_key,
978
- model_choice # Add model_choice to outputs
979
- ],
980
- queue=False
981
- )
982
-
983
- uitars_model_choice.change(
984
- fn=update_ui,
985
- inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
986
- outputs=[
987
- openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
988
- openai_key_group, anthropic_key_group,
989
- custom_model, provider_base_url, provider_api_key,
990
- model_choice # Add model_choice to outputs
991
- ],
992
- queue=False
993
- )
994
-
995
- openai_model_choice.change(
996
- fn=update_ui,
997
- inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
998
- outputs=[
999
- openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
1000
- openai_key_group, anthropic_key_group,
1001
- custom_model, provider_base_url, provider_api_key,
1002
- model_choice # Add model_choice to outputs
1003
- ],
1004
- queue=False
1005
- )
1006
-
1007
- anthropic_model_choice.change(
1008
- fn=update_ui,
1009
- inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
1010
- outputs=[
1011
- openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
1012
- openai_key_group, anthropic_key_group,
1013
- custom_model, provider_base_url, provider_api_key,
1014
- model_choice # Add model_choice to outputs
1015
- ],
1016
- queue=False
1017
- )
1018
-
1019
- save_trajectory = gr.Checkbox(
1020
- label="Save Trajectory",
1021
- value=initial_save_trajectory,
1022
- info="Save the agent's trajectory for debugging",
1023
- interactive=True,
1024
- )
1025
-
1026
- recent_images = gr.Slider(
1027
- label="Recent Images",
1028
- minimum=1,
1029
- maximum=10,
1030
- value=initial_recent_images,
1031
- step=1,
1032
- info="Number of recent images to keep in context",
1033
- interactive=True,
1034
- )
1035
-
1036
-
1037
- # Right column for chat interface
1038
- with gr.Column(scale=2):
1039
- # Add instruction text before the chat interface
1040
- gr.Markdown(
1041
- "Ask me to perform tasks in a virtual macOS environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
1042
- )
1043
-
1044
- chatbot_history = gr.Chatbot(type="messages")
1045
- msg = gr.Textbox(
1046
- placeholder="Ask me to perform tasks in a virtual macOS environment"
1047
- )
1048
- clear = gr.Button("Clear")
1049
-
1050
- # Add cancel button
1051
- cancel_button = gr.Button("Cancel", variant="stop")
1052
-
1053
- # Add examples
1054
- example_group = gr.Examples(examples=example_messages, inputs=msg)
1055
-
1056
- # Function to handle chat submission
1057
- def chat_submit(message, history):
1058
- # Add user message to history
1059
- history.append(gr.ChatMessage(role="user", content=message))
1060
- return "", history
1061
-
1062
- # Function to cancel the running agent
1063
- async def cancel_agent_task(history):
1064
- global global_agent
1065
- if global_agent and hasattr(global_agent, '_loop'):
1066
- print("DEBUG - Cancelling agent task")
1067
- # Cancel the agent loop
1068
- if hasattr(global_agent._loop, 'cancel') and callable(global_agent._loop.cancel):
1069
- await global_agent._loop.cancel()
1070
- history.append(gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "❌ Cancelled"}))
1071
- else:
1072
- history.append(gr.ChatMessage(role="assistant", content="Could not cancel task: cancel method not found", metadata={"title": "⚠️ Warning"}))
1073
- else:
1074
- history.append(gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": "ℹ️ Info"}))
1075
- return history
1076
-
1077
- # Function to process agent response after user input
1078
- async def process_response(
1079
- history,
1080
- openai_model_value,
1081
- anthropic_model_value,
1082
- omni_model_value,
1083
- uitars_model_value,
1084
- custom_model_value,
1085
- agent_loop_choice,
1086
- save_traj,
1087
- recent_imgs,
1088
- custom_url_value=None,
1089
- custom_api_key=None,
1090
- openai_key_input=None,
1091
- anthropic_key_input=None,
1092
- computer_os="macos",
1093
- computer_provider="lume",
1094
- container_name="",
1095
- cua_cloud_api_key="",
1096
- ):
1097
- if not history:
1098
- yield history
1099
- return
1100
-
1101
- # Get the last user message
1102
- last_user_message = history[-1]["content"]
1103
-
1104
- # Get the appropriate model value based on the agent loop
1105
- if agent_loop_choice == "OPENAI":
1106
- model_choice_value = openai_model_value
1107
- elif agent_loop_choice == "ANTHROPIC":
1108
- model_choice_value = anthropic_model_value
1109
- elif agent_loop_choice == "OMNI":
1110
- model_choice_value = omni_model_value
1111
- elif agent_loop_choice == "UITARS":
1112
- model_choice_value = uitars_model_value
1113
- else:
1114
- model_choice_value = "No models available"
1115
-
1116
- # Determine if this is a custom model selection and which type
1117
- is_custom_openai_api = model_choice_value == "Custom model (OpenAI compatible API)"
1118
- is_custom_ollama = model_choice_value == "Custom model (ollama)"
1119
- is_custom_model_selected = is_custom_openai_api or is_custom_ollama
1120
-
1121
- # Determine the model name string to analyze: custom or from dropdown
1122
- if is_custom_model_selected:
1123
- model_string_to_analyze = custom_model_value
1124
- else:
1125
- model_string_to_analyze = model_choice_value # Use the full UI string initially
1126
-
1127
- try:
1128
- # Special case for UITARS - use MLXVLM provider or OAICOMPAT for custom
1129
- if agent_loop_choice == "UITARS":
1130
- if is_custom_openai_api:
1131
- provider = LLMProvider.OAICOMPAT
1132
- cleaned_model_name_from_func = custom_model_value
1133
- agent_loop_type = AgentLoop.UITARS
1134
- print(f"Using OAICOMPAT provider for custom UITARS model: {custom_model_value}")
1135
- else:
1136
- provider = LLMProvider.MLXVLM
1137
- cleaned_model_name_from_func = model_string_to_analyze
1138
- agent_loop_type = AgentLoop.UITARS
1139
- print(f"Using MLXVLM provider for UITARS model: {model_string_to_analyze}")
1140
- # Special case for Ollama custom model
1141
- elif is_custom_ollama and agent_loop_choice == "OMNI":
1142
- provider = LLMProvider.OLLAMA
1143
- cleaned_model_name_from_func = custom_model_value
1144
- agent_loop_type = AgentLoop.OMNI
1145
- print(f"Using Ollama provider for custom model: {custom_model_value}")
1146
- else:
1147
- # Get the provider, *cleaned* model name, and agent loop type
1148
- provider, cleaned_model_name_from_func, agent_loop_type = (
1149
- get_provider_and_model(model_string_to_analyze, agent_loop_choice)
1150
- )
1151
-
1152
- print(f"provider={provider} cleaned_model_name_from_func={cleaned_model_name_from_func} agent_loop_type={agent_loop_type} agent_loop_choice={agent_loop_choice}")
1153
-
1154
- # Determine the final model name to send to the agent
1155
- # If custom selected, use the custom text box value, otherwise use the cleaned name
1156
- final_model_name_to_send = (
1157
- custom_model_value
1158
- if is_custom_model_selected
1159
- else cleaned_model_name_from_func
- )
-
- # Determine if OAICOMPAT should be used (for OpenAI compatible API custom model)
- is_oaicompat = is_custom_openai_api
-
- # Get API key based on provider determined by get_provider_and_model
- if is_oaicompat and custom_api_key:
- # Use custom API key if provided for OpenAI compatible API custom model
- api_key = custom_api_key
- print(
- f"DEBUG - Using custom API key for OpenAI compatible API model: {final_model_name_to_send}"
- )
- elif provider == LLMProvider.OLLAMA:
- # No API key needed for Ollama
- api_key = ""
- print(f"DEBUG - No API key needed for Ollama model: {final_model_name_to_send}")
- elif provider == LLMProvider.OPENAI:
- # Use OpenAI key from input if provided, otherwise use environment variable
- api_key = openai_key_input if openai_key_input else (openai_api_key or os.environ.get("OPENAI_API_KEY", ""))
- if openai_key_input:
- # Set the environment variable for the OpenAI API key
- os.environ["OPENAI_API_KEY"] = openai_key_input
- print(f"DEBUG - Using provided OpenAI API key from UI and set as environment variable")
- elif provider == LLMProvider.ANTHROPIC:
- # Use Anthropic key from input if provided, otherwise use environment variable
- api_key = anthropic_key_input if anthropic_key_input else (anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", ""))
- if anthropic_key_input:
- # Set the environment variable for the Anthropic API key
- os.environ["ANTHROPIC_API_KEY"] = anthropic_key_input
- print(f"DEBUG - Using provided Anthropic API key from UI and set as environment variable")
- else:
- # For Ollama or default OAICOMPAT (without custom key), no key needed/expected
- api_key = ""
-
- cua_cloud_api_key = cua_cloud_api_key or os.environ.get("CUA_API_KEY", "")
-
- # --- Save Settings Before Running Agent ---
- current_settings = {
- "agent_loop": agent_loop_choice,
- "model_choice": model_choice_value,
- "custom_model": custom_model_value,
- "provider_base_url": custom_url_value,
- "save_trajectory": save_traj,
- "recent_images": recent_imgs,
- "computer_os": computer_os,
- "computer_provider": computer_provider,
- "container_name": container_name,
- "cua_cloud_api_key": cua_cloud_api_key,
- }
- save_settings(current_settings)
- # --- End Save Settings ---
-
- # Create or update the agent
- create_agent(
- # Provider determined by special cases and get_provider_and_model
- provider=provider,
- agent_loop=agent_loop_type,
- # Pass the FINAL determined model name (cleaned or custom)
- model_name=final_model_name_to_send,
- api_key=api_key,
- save_trajectory=save_traj,
- only_n_most_recent_images=recent_imgs,
- use_oaicompat=is_oaicompat, # Set flag if custom model was selected
- # Pass custom URL only if custom model was selected
- provider_base_url=custom_url_value if is_oaicompat else None,
- computer_os=computer_os,
- computer_provider=computer_provider,
- computer_name=container_name,
- computer_api_key=cua_cloud_api_key,
- verbosity=logging.DEBUG, # Added verbosity here
- )
-
- if global_agent is None:
- # Add initial empty assistant message
- history.append(
- gr.ChatMessage(
- role="assistant",
- content="Failed to create agent. Check API keys and configuration.",
- )
- )
- yield history
- return
-
- # Add the screenshot handler to the agent's loop if available
- if global_agent and hasattr(global_agent, "_loop"):
- print("DEBUG - Adding screenshot handler to agent loop")
-
- # Create the screenshot handler with references to UI components
- screenshot_handler = GradioChatScreenshotHandler(history)
-
- # Add the handler to the callback manager if it exists AND is not None
- if (
- hasattr(global_agent._loop, "callback_manager")
- and global_agent._loop.callback_manager is not None
- ):
- global_agent._loop.callback_manager.add_handler(screenshot_handler)
- print(
- f"DEBUG - Screenshot handler added to callback manager with history: {id(history)}"
- )
- else:
- # Optional: Log a warning if the callback manager is missing/None for a specific loop
- print(
- f"WARNING - Callback manager not found or is None for loop type: {type(global_agent._loop)}. Screenshot handler not added."
- )
-
- # Stream responses from the agent
- async for result in global_agent.run(last_user_message):
- print(f"DEBUG - Agent response ------- START")
- from pprint import pprint
- pprint(result)
- print(f"DEBUG - Agent response ------- END")
-
- def generate_gradio_messages():
- if result.get("content"):
- yield gr.ChatMessage(
- role="assistant",
- content=result.get("content", ""),
- metadata=cast(MetadataDict, result.get("metadata", {}))
- )
- else:
- outputs = result.get("output", [])
- for output in outputs:
- if output.get("type") == "message":
- content = output.get("content", [])
- for content_part in content:
- if content_part.get("text"):
- yield gr.ChatMessage(
- role=output.get("role", "assistant"),
- content=content_part.get("text", ""),
- metadata=content_part.get("metadata", {})
- )
- elif output.get("type") == "reasoning":
- # if it's openAI, we only have access to a summary of the reasoning
- summary_content = output.get("summary", [])
- if summary_content:
- for summary_part in summary_content:
- if summary_part.get("type") == "summary_text":
- yield gr.ChatMessage(
- role="assistant",
- content=summary_part.get("text", "")
- )
- else:
- summary_content = output.get("text", "")
- if summary_content:
- yield gr.ChatMessage(
- role="assistant",
- content=summary_content,
- )
- elif output.get("type") == "computer_call":
- action = output.get("action", {})
- action_type = action.get("type", "")
- if action_type:
- action_title = f"🛠️ Performing {action_type}"
- if action.get("x") and action.get("y"):
- action_title += f" at ({action['x']}, {action['y']})"
- yield gr.ChatMessage(
- role="assistant",
- content=f"```json\n{json.dumps(action)}\n```",
- metadata={"title": action_title}
- )
-
- for message in generate_gradio_messages():
- history.append(message)
- yield history
-
- except Exception as e:
- import traceback
-
- traceback.print_exc()
- # Update with error message
- history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
- yield history
-
- # Connect the submit button to the process_response function
- submit_event = msg.submit(
- fn=chat_submit,
- inputs=[msg, chatbot_history],
- outputs=[msg, chatbot_history],
- queue=False,
- ).then(
- fn=process_response,
- inputs=[
- chatbot_history,
- openai_model_choice,
- anthropic_model_choice,
- omni_model_choice,
- uitars_model_choice,
- custom_model,
- agent_loop,
- save_trajectory,
- recent_images,
- provider_base_url,
- provider_api_key,
- openai_api_key_input,
- anthropic_api_key_input,
- computer_os,
- computer_provider,
- container_name,
- cua_cloud_api_key,
- ],
- outputs=[chatbot_history],
- queue=True,
- )
-
- # Clear button functionality
- clear.click(lambda: None, None, chatbot_history, queue=False)
-
- # Connect cancel button to cancel function
- cancel_button.click(
- cancel_agent_task,
- [chatbot_history],
- [chatbot_history],
- queue=False # Process immediately without queueing
- )
-
-
- # Function to update the code display based on configuration and chat history
- def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, provider_base_url, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key):
- # Extract messages from chat history
- messages = []
- if chat_history:
- for msg in chat_history:
- if isinstance(msg, dict) and msg.get("role") == "user":
- messages.append(msg.get("content", ""))
-
- # Determine provider and model based on current selection
- provider, model_name, _ = get_provider_and_model(
- model_choice_val or custom_model_val or "gpt-4o",
- agent_loop
- )
-
- return generate_python_code(
- agent_loop,
- provider,
- model_name,
- messages,
- provider_base_url,
- recent_images_val,
- save_trajectory_val,
- computer_os,
- computer_provider,
- container_name,
- cua_cloud_api_key
- )
-
- # Update code display when configuration changes
- agent_loop.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
- model_choice.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
- custom_model.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
- chatbot_history.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
- recent_images.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
- save_trajectory.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
- computer_os.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
- computer_provider.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
- container_name.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
- cua_cloud_api_key.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, provider_base_url, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key],
- outputs=[code_display]
- )
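End of the removed 0.3.2 block. That block folded provider selection, API-key lookup, settings persistence, agent construction, and response streaming into a single Gradio callback. For orientation only, the inline key-lookup portion amounts to a small helper like the sketch below; the helper name, provider strings, and mapping are hypothetical and belong to neither release:

```python
import os

# Illustrative condensation of the per-provider key selection the removed
# block performed inline. Names here are assumptions, not package code.
PROVIDER_ENV_VARS = {
    "openai": "OPENAI_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
}

def resolve_api_key(provider: str, ui_key: str = "") -> str:
    """Prefer a key entered in the UI, then the environment; providers such
    as Ollama that need no key resolve to an empty string."""
    if ui_key:
        return ui_key
    env_var = PROVIDER_ENV_VARS.get(provider)
    return os.environ.get(env_var, "") if env_var else ""
```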
+ if save_trajectory:
+ agent_kwargs["trajectory_dir"] = "trajectories"
+
+ if max_trajectory_budget:
+ agent_kwargs["max_trajectory_budget"] = {"max_budget": max_trajectory_budget, "raise_error": True}
 
- return demo
+ global_agent = ComputerAgent(**agent_kwargs)
+ return global_agent
 
 
- def test_cua():
+ def launch_ui():
  """Standalone function to launch the Gradio app."""
+ from agent.ui.gradio.ui_components import create_gradio_ui
+ print(f"Starting Gradio app for CUA Agent...")
  demo = create_gradio_ui()
- demo.launch(share=False, inbrowser=True) # Don't create a public link
+ demo.launch(share=False, inbrowser=True)
 
 
 if __name__ == "__main__":
- test_cua()
+ launch_ui()
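The added lines replace the old wiring with a thin `launch_ui()` entry point and a `ComputerAgent(**agent_kwargs)` constructor that accepts a `trajectory_dir` and a `max_trajectory_budget` dict. A minimal usage sketch follows; the `from agent import ComputerAgent` import path and the `model` value are assumptions made for illustration and are not confirmed by this diff, while the `trajectory_dir` and `max_trajectory_budget` keys mirror the added lines exactly:

```python
# Sketch of 0.4.0-style agent construction, mirrored from the added lines above.
from agent import ComputerAgent  # assumed public export

agent_kwargs = {
    "model": "anthropic/claude-3-5-sonnet-20241022",  # hypothetical model id
    "trajectory_dir": "trajectories",  # set when trajectory saving is enabled
    "max_trajectory_budget": {"max_budget": 5.0, "raise_error": True},
}
agent = ComputerAgent(**agent_kwargs)

# The rebuilt Gradio UI is started the same way __main__ now does:
# launch_ui()  # defined in this module; builds the app via create_gradio_ui()
```

With `raise_error=True`, exceeding `max_budget` presumably surfaces as an exception rather than a silently truncated run; either way, the budget now travels with the agent configuration instead of the UI callback.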