cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/cli.py CHANGED
@@ -3,55 +3,75 @@ CLI chat interface for agent - Computer Use Agent
3
3
 
4
4
  Usage:
5
5
  python -m agent.cli <model_string>
6
-
6
+
7
7
  Examples:
8
8
  python -m agent.cli openai/computer-use-preview
9
- python -m agent.cli anthropic/claude-3-5-sonnet-20241022
10
- python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
9
+ python -m agent.cli anthropic/claude-sonnet-4-5-20250929
10
+ python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
11
11
  """
12
12
 
13
13
  try:
14
- import asyncio
15
14
  import argparse
15
+ import asyncio
16
+ import base64
17
+ import json
16
18
  import os
19
+ import platform
17
20
  import sys
18
- import json
19
- from typing import List, Dict, Any
21
+ import time
22
+ from pathlib import Path
23
+ from typing import Any, Dict, List
24
+
20
25
  import dotenv
26
+
27
+ try:
28
+ from PIL import Image, ImageDraw
29
+
30
+ PIL_AVAILABLE = True
31
+ except Exception:
32
+ PIL_AVAILABLE = False
21
33
  from yaspin import yaspin
22
34
  except ImportError:
23
35
  if __name__ == "__main__":
24
36
  raise ImportError(
25
- "CLI dependencies not found. "
26
- "Please install with: pip install \"cua-agent[cli]\""
37
+ "CLI dependencies not found. " 'Please install with: pip install "cua-agent[cli]"'
27
38
  )
28
39
 
29
40
  # Load environment variables
30
41
  dotenv.load_dotenv()
31
42
 
43
+
32
44
  # Color codes for terminal output
33
45
  class Colors:
34
- RESET = '\033[0m'
35
- BOLD = '\033[1m'
36
- DIM = '\033[2m'
37
-
46
+ RESET = "\033[0m"
47
+ BOLD = "\033[1m"
48
+ DIM = "\033[2m"
49
+
38
50
  # Text colors
39
- RED = '\033[31m'
40
- GREEN = '\033[32m'
41
- YELLOW = '\033[33m'
42
- BLUE = '\033[34m'
43
- MAGENTA = '\033[35m'
44
- CYAN = '\033[36m'
45
- WHITE = '\033[37m'
46
- GRAY = '\033[90m'
47
-
51
+ RED = "\033[31m"
52
+ GREEN = "\033[32m"
53
+ YELLOW = "\033[33m"
54
+ BLUE = "\033[34m"
55
+ MAGENTA = "\033[35m"
56
+ CYAN = "\033[36m"
57
+ WHITE = "\033[37m"
58
+ GRAY = "\033[90m"
59
+
48
60
  # Background colors
49
- BG_RED = '\033[41m'
50
- BG_GREEN = '\033[42m'
51
- BG_YELLOW = '\033[43m'
52
- BG_BLUE = '\033[44m'
61
+ BG_RED = "\033[41m"
62
+ BG_GREEN = "\033[42m"
63
+ BG_YELLOW = "\033[43m"
64
+ BG_BLUE = "\033[44m"
53
65
 
54
- def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n", right: str = ""):
66
+
67
+ def print_colored(
68
+ text: str,
69
+ color: str = "",
70
+ bold: bool = False,
71
+ dim: bool = False,
72
+ end: str = "\n",
73
+ right: str = "",
74
+ ):
55
75
  """Print colored text to terminal with optional right-aligned text."""
56
76
  prefix = ""
57
77
  if bold:
@@ -60,24 +80,25 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
60
80
  prefix += Colors.DIM
61
81
  if color:
62
82
  prefix += color
63
-
83
+
64
84
  if right:
65
85
  # Get terminal width (default to 80 if unable to determine)
66
86
  try:
67
87
  import shutil
88
+
68
89
  terminal_width = shutil.get_terminal_size().columns
69
90
  except:
70
91
  terminal_width = 80
71
92
 
72
93
  # Add right margin
73
94
  terminal_width -= 1
74
-
95
+
75
96
  # Calculate padding needed
76
97
  # Account for ANSI escape codes not taking visual space
77
98
  visible_left_len = len(text)
78
99
  visible_right_len = len(right)
79
100
  padding = terminal_width - visible_left_len - visible_right_len
80
-
101
+
81
102
  if padding > 0:
82
103
  output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
83
104
  else:
@@ -85,7 +106,7 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
85
106
  output = f"{prefix}{text} {right}{Colors.RESET}"
86
107
  else:
87
108
  output = f"{prefix}{text}{Colors.RESET}"
88
-
109
+
89
110
  print(output, end=end)
90
111
 
91
112
 
@@ -104,29 +125,34 @@ def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
104
125
  args_str = f"('{details['text']}')"
105
126
  elif action_type == "scroll" and "x" in details and "y" in details:
106
127
  args_str = f"({details['x']}, {details['y']})"
107
-
128
+
108
129
  if total_cost > 0:
109
130
  print_colored(f"🛠️ {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
110
131
  else:
111
132
  print_colored(f"🛠️ {action_type}{args_str}", dim=True)
112
133
 
134
+
113
135
  def print_welcome(model: str, agent_loop: str, container_name: str):
114
136
  """Print welcome message."""
115
137
  print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
116
138
  print_colored("Type 'exit' to quit.", dim=True)
117
139
 
140
+
118
141
  async def ainput(prompt: str = ""):
119
142
  return await asyncio.to_thread(input, prompt)
120
143
 
121
- async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
144
+
145
+ async def chat_loop(
146
+ agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True
147
+ ):
122
148
  """Main chat loop with the agent."""
123
149
  print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
124
-
150
+
125
151
  history = []
126
-
152
+
127
153
  if initial_prompt:
128
154
  history.append({"role": "user", "content": initial_prompt})
129
-
155
+
130
156
  total_cost = 0
131
157
 
132
158
  while True:
@@ -134,31 +160,31 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
134
160
  # Get user input with prompt
135
161
  print_colored("> ", end="")
136
162
  user_input = await ainput()
137
-
138
- if user_input.lower() in ['exit', 'quit', 'q']:
163
+
164
+ if user_input.lower() in ["exit", "quit", "q"]:
139
165
  print_colored("\n👋 Goodbye!")
140
166
  break
141
-
167
+
142
168
  if not user_input:
143
169
  continue
144
-
170
+
145
171
  # Add user message to history
146
172
  history.append({"role": "user", "content": user_input})
147
-
173
+
148
174
  # Stream responses from the agent with spinner
149
175
  with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
150
176
  spinner.hide()
151
-
177
+
152
178
  async for result in agent.run(history):
153
179
  # Add agent responses to history
154
180
  history.extend(result.get("output", []))
155
181
 
156
182
  if show_usage:
157
183
  total_cost += result.get("usage", {}).get("response_cost", 0)
158
-
184
+
159
185
  # Process and display the output
160
186
  for item in result.get("output", []):
161
- if item.get("type") == "message":
187
+ if item.get("type") == "message" and item.get("role") == "assistant":
162
188
  # Display agent text response
163
189
  content = item.get("content", [])
164
190
  for content_part in content:
@@ -167,7 +193,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
167
193
  if text:
168
194
  spinner.hide()
169
195
  print_colored(text)
170
-
196
+
171
197
  elif item.get("type") == "computer_call":
172
198
  # Display computer action
173
199
  action = item.get("action", {})
@@ -177,7 +203,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
177
203
  print_action(action_type, action, total_cost)
178
204
  spinner.text = f"Performing {action_type}..."
179
205
  spinner.show()
180
-
206
+
181
207
  elif item.get("type") == "function_call":
182
208
  # Display function call
183
209
  function_name = item.get("name", "")
@@ -185,121 +211,141 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
185
211
  print_colored(f"🔧 Calling function: {function_name}", dim=True)
186
212
  spinner.text = f"Calling {function_name}..."
187
213
  spinner.show()
188
-
214
+
189
215
  elif item.get("type") == "function_call_output":
190
216
  # Display function output (dimmed)
191
217
  output = item.get("output", "")
192
218
  if output and len(output.strip()) > 0:
193
219
  spinner.hide()
194
220
  print_colored(f"📤 {output}", dim=True)
195
-
221
+
196
222
  spinner.hide()
197
223
  if show_usage and total_cost > 0:
198
224
  print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
199
-
225
+
200
226
 
201
227
  async def main():
202
228
  """Main CLI function."""
203
229
  parser = argparse.ArgumentParser(
204
- description="CUA Agent CLI - Interactive computer use assistant",
230
+ description="Cua Agent CLI - Interactive computer use assistant",
205
231
  formatter_class=argparse.RawDescriptionHelpFormatter,
206
232
  epilog="""
207
233
  Examples:
208
234
  python -m agent.cli openai/computer-use-preview
209
- python -m agent.cli anthropic/claude-3-5-sonnet-20241022
210
- python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
235
+ python -m agent.cli anthropic/claude-sonnet-4-5-20250929
236
+ python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
211
237
  python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
212
- """
238
+ """,
213
239
  )
214
-
240
+
215
241
  parser.add_argument(
216
242
  "model",
217
- help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
243
+ help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-sonnet-4-5-20250929')",
218
244
  )
219
-
245
+
246
+ parser.add_argument(
247
+ "--provider",
248
+ choices=["cloud", "lume", "winsandbox", "docker"],
249
+ default="cloud",
250
+ help="Computer provider to use: cloud (default), lume, winsandbox, or docker",
251
+ )
252
+
220
253
  parser.add_argument(
221
254
  "--images",
222
255
  type=int,
223
256
  default=3,
224
- help="Number of recent images to keep in context (default: 3)"
225
- )
226
-
227
- parser.add_argument(
228
- "--trajectory",
229
- action="store_true",
230
- help="Save trajectory for debugging"
257
+ help="Number of recent images to keep in context (default: 3)",
231
258
  )
232
-
259
+
260
+ parser.add_argument("--trajectory", action="store_true", help="Save trajectory for debugging")
261
+
262
+ parser.add_argument("--budget", type=float, help="Maximum budget for the session (in dollars)")
263
+
264
+ parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
265
+
233
266
  parser.add_argument(
234
- "--budget",
235
- type=float,
236
- help="Maximum budget for the session (in dollars)"
267
+ "-p",
268
+ "--prompt",
269
+ type=str,
270
+ help="Initial prompt to send to the agent. Leave blank for interactive mode.",
237
271
  )
238
-
272
+
239
273
  parser.add_argument(
240
- "--verbose",
241
- action="store_true",
242
- help="Enable verbose logging"
274
+ "--prompt-file",
275
+ type=Path,
276
+ help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt.",
243
277
  )
244
278
 
245
279
  parser.add_argument(
246
- "-p", "--prompt",
280
+ "--predict-click",
281
+ dest="predict_click",
247
282
  type=str,
248
- help="Initial prompt to send to the agent. Leave blank for interactive mode."
283
+ help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it.",
249
284
  )
250
285
 
251
- parser.add_argument(
252
- "-c", "--cache",
253
- action="store_true",
254
- help="Tell the API to enable caching"
255
- )
286
+ parser.add_argument("-c", "--cache", action="store_true", help="Tell the API to enable caching")
256
287
 
257
288
  parser.add_argument(
258
- "-u", "--usage",
259
- action="store_true",
260
- help="Show total cost of the agent runs"
289
+ "-u", "--usage", action="store_true", help="Show total cost of the agent runs"
261
290
  )
262
291
 
263
292
  parser.add_argument(
264
- "-r", "--max-retries",
293
+ "-r",
294
+ "--max-retries",
265
295
  type=int,
266
296
  default=3,
267
- help="Maximum number of retries for the LLM API calls"
297
+ help="Maximum number of retries for the LLM API calls",
268
298
  )
269
-
299
+
300
+ # Provider override credentials
301
+ parser.add_argument(
302
+ "--api-key",
303
+ dest="api_key",
304
+ type=str,
305
+ help="API key override for the model provider (passed to ComputerAgent)",
306
+ )
307
+ parser.add_argument(
308
+ "--api-base",
309
+ dest="api_base",
310
+ type=str,
311
+ help="API base URL override for the model provider (passed to ComputerAgent)",
312
+ )
313
+
270
314
  args = parser.parse_args()
271
-
315
+
272
316
  # Check for required environment variables
273
317
  container_name = os.getenv("CUA_CONTAINER_NAME")
274
318
  cua_api_key = os.getenv("CUA_API_KEY")
275
-
276
- # Prompt for missing environment variables
319
+
320
+ # Prompt for missing environment variables (container name always required)
277
321
  if not container_name:
278
- print_colored("CUA_CONTAINER_NAME not set.", dim=True)
279
- print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
280
- container_name = input("Enter your CUA container name: ").strip()
281
- if not container_name:
282
- print_colored("❌ Container name is required.")
283
- sys.exit(1)
284
-
285
- if not cua_api_key:
322
+ if args.provider == "cloud":
323
+ print_colored("CUA_CONTAINER_NAME not set.", dim=True)
324
+ print_colored("You can get a Cua container at https://cua.ai/", dim=True)
325
+ container_name = input("Enter your Cua container name: ").strip()
326
+ if not container_name:
327
+ print_colored("❌ Container name is required.")
328
+ sys.exit(1)
329
+ else:
330
+ container_name = "cli-sandbox"
331
+
332
+ # Only require API key for cloud provider
333
+ if args.provider == "cloud" and not cua_api_key:
286
334
  print_colored("CUA_API_KEY not set.", dim=True)
287
- cua_api_key = input("Enter your CUA API key: ").strip()
335
+ cua_api_key = input("Enter your Cua API key: ").strip()
288
336
  if not cua_api_key:
289
- print_colored("❌ API key is required.")
337
+ print_colored("❌ API key is required for cloud provider.")
290
338
  sys.exit(1)
291
-
339
+
292
340
  # Check for provider-specific API keys based on model
293
341
  provider_api_keys = {
294
342
  "openai/": "OPENAI_API_KEY",
295
343
  "anthropic/": "ANTHROPIC_API_KEY",
296
- "omniparser+": "OPENAI_API_KEY",
297
- "omniparser+": "ANTHROPIC_API_KEY",
298
344
  }
299
-
345
+
300
346
  # Find matching provider and check for API key
301
347
  for prefix, env_var in provider_api_keys.items():
302
- if args.model.startswith(prefix):
348
+ if prefix in args.model:
303
349
  if not os.getenv(env_var):
304
350
  print_colored(f"{env_var} not set.", dim=True)
305
351
  api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
@@ -309,7 +355,7 @@ Examples:
309
355
  # Set the environment variable for the session
310
356
  os.environ[env_var] = api_key
311
357
  break
312
-
358
+
313
359
  # Import here to avoid import errors if dependencies are missing
314
360
  try:
315
361
  from agent import ComputerAgent
@@ -318,48 +364,152 @@ Examples:
318
364
  print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
319
365
  print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
320
366
  sys.exit(1)
321
-
367
+
368
+ # Resolve provider -> os_type, provider_type, api key requirement
369
+ provider_map = {
370
+ "cloud": ("linux", "cloud", True),
371
+ "lume": ("macos", "lume", False),
372
+ "winsandbox": ("windows", "winsandbox", False),
373
+ "docker": ("linux", "docker", False),
374
+ }
375
+ os_type, provider_type, needs_api_key = provider_map[args.provider]
376
+
377
+ computer_kwargs = {
378
+ "os_type": os_type,
379
+ "provider_type": provider_type,
380
+ "name": container_name,
381
+ }
382
+ if needs_api_key:
383
+ computer_kwargs["api_key"] = cua_api_key # type: ignore
384
+
322
385
  # Create computer instance
323
- async with Computer(
324
- os_type="linux",
325
- provider_type="cloud",
326
- name=container_name,
327
- api_key=cua_api_key
328
- ) as computer:
329
-
386
+ async with Computer(**computer_kwargs) as computer: # type: ignore
387
+
330
388
  # Create agent
331
389
  agent_kwargs = {
332
390
  "model": args.model,
333
391
  "tools": [computer],
392
+ "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
334
393
  "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
335
- "max_retries": args.max_retries
394
+ "max_retries": args.max_retries,
336
395
  }
337
396
 
397
+ # Thread API credentials to agent if provided
398
+ if args.api_key:
399
+ agent_kwargs["api_key"] = args.api_key
400
+ if args.api_base:
401
+ agent_kwargs["api_base"] = args.api_base
402
+
338
403
  if args.images > 0:
339
404
  agent_kwargs["only_n_most_recent_images"] = args.images
340
-
405
+
341
406
  if args.trajectory:
342
407
  agent_kwargs["trajectory_dir"] = "trajectories"
343
-
408
+
344
409
  if args.budget:
345
410
  agent_kwargs["max_trajectory_budget"] = {
346
411
  "max_budget": args.budget,
347
412
  "raise_error": True,
348
- "reset_after_each_run": False
413
+ "reset_after_each_run": False,
349
414
  }
350
415
 
351
416
  if args.cache:
352
417
  agent_kwargs["use_prompt_caching"] = True
353
-
418
+
354
419
  agent = ComputerAgent(**agent_kwargs)
355
-
356
- # Start chat loop
357
- await chat_loop(agent, args.model, container_name, args.prompt, args.usage)
358
420
 
421
+ # If predict-click mode is requested, run once and exit
422
+ if args.predict_click:
423
+ if not PIL_AVAILABLE:
424
+ print_colored(
425
+ "❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow",
426
+ Colors.RED,
427
+ bold=True,
428
+ )
429
+ sys.exit(1)
430
+
431
+ instruction = args.predict_click
432
+ print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN)
433
+
434
+ # Take a fresh screenshot FIRST
435
+ try:
436
+ img_bytes = await computer.interface.screenshot()
437
+ except Exception as e:
438
+ print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True)
439
+ sys.exit(1)
440
+
441
+ # Encode screenshot to base64 for predict_click
442
+ try:
443
+ image_b64 = base64.b64encode(img_bytes).decode("utf-8")
444
+ except Exception as e:
445
+ print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True)
446
+ sys.exit(1)
447
+
448
+ try:
449
+ coords = await agent.predict_click(instruction, image_b64=image_b64)
450
+ except Exception as e:
451
+ print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True)
452
+ sys.exit(1)
453
+
454
+ if not coords:
455
+ print_colored("⚠️ No coordinates returned.", Colors.YELLOW)
456
+ sys.exit(2)
457
+
458
+ x, y = coords
459
+ print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN)
460
+
461
+ try:
462
+ from io import BytesIO
463
+
464
+ with Image.open(BytesIO(img_bytes)) as img:
465
+ img = img.convert("RGB")
466
+ draw = ImageDraw.Draw(img)
467
+ # Draw crosshair
468
+ size = 12
469
+ color = (255, 0, 0)
470
+ draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
471
+ draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
472
+ # Optional small circle
473
+ r = 6
474
+ draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
475
+
476
+ out_path = Path.cwd() / f"predict_click_{int(time.time())}.png"
477
+ img.save(out_path)
478
+ print_colored(f"🖼️ Saved to {out_path}")
479
+
480
+ # Open the image with default viewer
481
+ try:
482
+ system = platform.system().lower()
483
+ if system == "windows":
484
+ os.startfile(str(out_path)) # type: ignore[attr-defined]
485
+ elif system == "darwin":
486
+ os.system(f'open "{out_path}"')
487
+ else:
488
+ os.system(f'xdg-open "{out_path}"')
489
+ except Exception:
490
+ pass
491
+ except Exception as e:
492
+ print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True)
493
+ sys.exit(1)
494
+
495
+ # Done
496
+ sys.exit(0)
497
+
498
+ # Resolve initial prompt from --prompt-file or --prompt
499
+ initial_prompt = args.prompt or ""
500
+ if args.prompt_file:
501
+ try:
502
+ initial_prompt = args.prompt_file.read_text(encoding="utf-8")
503
+ except Exception as e:
504
+ print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
505
+ sys.exit(1)
506
+
507
+ # Start chat loop (default interactive mode)
508
+ await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
359
509
 
360
510
 
361
511
  if __name__ == "__main__":
362
512
  try:
363
513
  asyncio.run(main())
364
514
  except (KeyboardInterrupt, EOFError) as _:
365
- print_colored("\n\n👋 Goodbye!")
515
+ print_colored("\n\n👋 Goodbye!")
agent/computers/__init__.py CHANGED
@@ -6,27 +6,32 @@ computer interface types, supporting both the ComputerHandler protocol and the
6
6
  Computer library interface.
7
7
  """
8
8
 
9
+ from computer import Computer as cuaComputer
10
+
9
11
  from .base import AsyncComputerHandler
10
12
  from .cua import cuaComputerHandler
11
13
  from .custom import CustomComputerHandler
12
- from computer import Computer as cuaComputer
14
+
13
15
 
14
16
  def is_agent_computer(computer):
15
- """Check if the given computer is a ComputerHandler or CUA Computer."""
16
- return isinstance(computer, AsyncComputerHandler) or \
17
- isinstance(computer, cuaComputer) or \
18
- (isinstance(computer, dict)) #and "screenshot" in computer)
17
+ """Check if the given computer is a ComputerHandler or Cua Computer."""
18
+ return (
19
+ isinstance(computer, AsyncComputerHandler)
20
+ or isinstance(computer, cuaComputer)
21
+ or (isinstance(computer, dict))
22
+ ) # and "screenshot" in computer)
23
+
19
24
 
20
25
  async def make_computer_handler(computer):
21
26
  """
22
27
  Create a computer handler from a computer interface.
23
-
28
+
24
29
  Args:
25
30
  computer: Either a ComputerHandler instance, Computer instance, or dict of functions
26
-
31
+
27
32
  Returns:
28
33
  ComputerHandler: A computer handler instance
29
-
34
+
30
35
  Raises:
31
36
  ValueError: If the computer type is not supported
32
37
  """
@@ -38,4 +43,4 @@ async def make_computer_handler(computer):
38
43
  return computer_handler
39
44
  if isinstance(computer, dict):
40
45
  return CustomComputerHandler(computer)
41
- raise ValueError(f"Unsupported computer type: {type(computer)}")
46
+ raise ValueError(f"Unsupported computer type: {type(computer)}")