cua-agent 0.4.31__py3-none-any.whl → 0.4.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of cua-agent has been flagged as potentially problematic.

@@ -188,6 +188,8 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
         model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
         if "+" in model:
             model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
+        # strip non-alphanumeric characters from model_name_short
+        model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
 
         # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
         now = datetime.now()
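
For illustration, a minimal sketch of what the added sanitization does to the short model name used in trajectory ids; the composed model string below is hypothetical, not taken from the package:

# Illustrative only: reproduces the logic from the hunk above on a hypothetical model string.
model = "omniparser+anthropic/claude-3-5-sonnet-20241022"
model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]  # "claude-3-5-sonne"
if "+" in model:
    model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short  # "omni_claude-3-5-sonne"
# New in 0.4.33: keep only alphanumerics and underscores.
model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
print(model_name_short)  # omni_claude35sonne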
agent/cli.py CHANGED
@@ -18,6 +18,15 @@ try:
     import json
     from typing import List, Dict, Any
     import dotenv
+    import base64
+    import time
+    import platform
+    from pathlib import Path
+    try:
+        from PIL import Image, ImageDraw
+        PIL_AVAILABLE = True
+    except Exception:
+        PIL_AVAILABLE = False
     from yaspin import yaspin
 except ImportError:
     if __name__ == "__main__":
@@ -158,7 +167,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
 
         # Process and display the output
         for item in result.get("output", []):
-            if item.get("type") == "message":
+            if item.get("type") == "message" and item.get("role") == "assistant":
                 # Display agent text response
                 content = item.get("content", [])
                 for content_part in content:
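
The tightened condition above only prints assistant messages. A minimal sketch of the kind of item it matches; the item shape is assumed from the keys read in chat_loop and is not verified against the agent library:

# Assumed item shape (illustrative); only the keys checked by chat_loop matter here.
item = {
    "type": "message",
    "role": "assistant",
    "content": [{"type": "output_text", "text": "Opened the browser and navigated to the page."}],
}
if item.get("type") == "message" and item.get("role") == "assistant":
    for content_part in item.get("content", []):
        print(content_part.get("text", ""))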
@@ -217,6 +226,13 @@ Examples:
         help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
     )
 
+    parser.add_argument(
+        "--provider",
+        choices=["cloud", "lume", "winsandbox", "docker"],
+        default="cloud",
+        help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
+    )
+
     parser.add_argument(
         "--images",
         type=int,
@@ -248,6 +264,19 @@ Examples:
         help="Initial prompt to send to the agent. Leave blank for interactive mode."
     )
 
+    parser.add_argument(
+        "--prompt-file",
+        type=Path,
+        help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
+    )
+
+    parser.add_argument(
+        "--predict-click",
+        dest="predict_click",
+        type=str,
+        help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
+    )
+
     parser.add_argument(
         "-c", "--cache",
         action="store_true",
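
A self-contained sketch of how the new options parse; this mirrors the argument definitions above but is not the package's actual parser, and the example values are illustrative:

# Minimal stand-in parser mirroring the new flags (not the package's real CLI object).
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("--provider", choices=["cloud", "lume", "winsandbox", "docker"], default="cloud")
parser.add_argument("--prompt-file", type=Path)
parser.add_argument("--predict-click", dest="predict_click", type=str)

args = parser.parse_args(["--provider", "docker", "--prompt-file", "task.txt"])
print(args.provider, args.prompt_file, args.predict_click)  # docker task.txt None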
@@ -273,33 +302,35 @@ Examples:
     container_name = os.getenv("CUA_CONTAINER_NAME")
     cua_api_key = os.getenv("CUA_API_KEY")
 
-    # Prompt for missing environment variables
+    # Prompt for missing environment variables (container name always required)
     if not container_name:
-        print_colored("CUA_CONTAINER_NAME not set.", dim=True)
-        print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
-        container_name = input("Enter your CUA container name: ").strip()
-        if not container_name:
-            print_colored("❌ Container name is required.")
-            sys.exit(1)
-
-    if not cua_api_key:
+        if args.provider == "cloud":
+            print_colored("CUA_CONTAINER_NAME not set.", dim=True)
+            print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
+            container_name = input("Enter your CUA container name: ").strip()
+            if not container_name:
+                print_colored("❌ Container name is required.")
+                sys.exit(1)
+        else:
+            container_name = "cli-sandbox"
+
+    # Only require API key for cloud provider
+    if args.provider == "cloud" and not cua_api_key:
         print_colored("CUA_API_KEY not set.", dim=True)
         cua_api_key = input("Enter your CUA API key: ").strip()
         if not cua_api_key:
-            print_colored("❌ API key is required.")
+            print_colored("❌ API key is required for cloud provider.")
             sys.exit(1)
 
     # Check for provider-specific API keys based on model
     provider_api_keys = {
         "openai/": "OPENAI_API_KEY",
         "anthropic/": "ANTHROPIC_API_KEY",
-        "omniparser+": "OPENAI_API_KEY",
-        "omniparser+": "ANTHROPIC_API_KEY",
     }
 
     # Find matching provider and check for API key
     for prefix, env_var in provider_api_keys.items():
-        if args.model.startswith(prefix):
+        if prefix in args.model:
             if not os.getenv(env_var):
                 print_colored(f"{env_var} not set.", dim=True)
                 api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
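
Two details in this hunk benefit from a worked example. The removed lines were a latent bug, since a Python dict literal keeps only the last value for a duplicated key, and the switch from startswith to a substring test lets composed model strings match a provider anywhere in the string. A minimal sketch with an illustrative model string:

# Duplicate keys in a dict literal: only the last assignment survives.
provider_api_keys = {"omniparser+": "OPENAI_API_KEY", "omniparser+": "ANTHROPIC_API_KEY"}
print(provider_api_keys)  # {'omniparser+': 'ANTHROPIC_API_KEY'}

# Prefix test vs. substring test on a composed model string (illustrative value).
model = "omniparser+anthropic/claude-3-5-sonnet-20241022"
print(model.startswith("anthropic/"))  # False - the old check would skip the Anthropic key prompt
print("anthropic/" in model)           # True  - the new check catches it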
@@ -319,18 +350,31 @@ Examples:
         print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
         sys.exit(1)
 
+    # Resolve provider -> os_type, provider_type, api key requirement
+    provider_map = {
+        "cloud": ("linux", "cloud", True),
+        "lume": ("macos", "lume", False),
+        "winsandbox": ("windows", "winsandbox", False),
+        "docker": ("linux", "docker", False),
+    }
+    os_type, provider_type, needs_api_key = provider_map[args.provider]
+
+    computer_kwargs = {
+        "os_type": os_type,
+        "provider_type": provider_type,
+        "name": container_name,
+    }
+    if needs_api_key:
+        computer_kwargs["api_key"] = cua_api_key  # type: ignore
+
     # Create computer instance
-    async with Computer(
-        os_type="linux",
-        provider_type="cloud",
-        name=container_name,
-        api_key=cua_api_key
-    ) as computer:
+    async with Computer(**computer_kwargs) as computer:  # type: ignore
 
         # Create agent
         agent_kwargs = {
             "model": args.model,
             "tools": [computer],
+            "trust_remote_code": True,  # needed for some local models (e.g., InternVL, OpenCUA)
             "verbosity": 20 if args.verbose else 30,  # DEBUG vs WARNING
             "max_retries": args.max_retries
         }
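
A short sketch of what the new provider mapping resolves to for a non-cloud provider; it copies the table above and reuses the fallback container name, but it is only illustrative:

# Mirrors the provider table above; non-cloud providers skip the CUA API key.
provider_map = {
    "cloud": ("linux", "cloud", True),
    "lume": ("macos", "lume", False),
    "winsandbox": ("windows", "winsandbox", False),
    "docker": ("linux", "docker", False),
}
os_type, provider_type, needs_api_key = provider_map["lume"]
computer_kwargs = {"os_type": os_type, "provider_type": provider_type, "name": "cli-sandbox"}
if needs_api_key:
    computer_kwargs["api_key"] = "<CUA_API_KEY>"  # only reached for the cloud provider
print(computer_kwargs)  # {'os_type': 'macos', 'provider_type': 'lume', 'name': 'cli-sandbox'}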
@@ -353,8 +397,89 @@ Examples:
 
         agent = ComputerAgent(**agent_kwargs)
 
-        # Start chat loop
-        await chat_loop(agent, args.model, container_name, args.prompt, args.usage)
+        # If predict-click mode is requested, run once and exit
+        if args.predict_click:
+            if not PIL_AVAILABLE:
+                print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True)
+                sys.exit(1)
+
+            instruction = args.predict_click
+            print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN)
+
+            # Take a fresh screenshot FIRST
+            try:
+                img_bytes = await computer.interface.screenshot()
+            except Exception as e:
+                print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+            # Encode screenshot to base64 for predict_click
+            try:
+                image_b64 = base64.b64encode(img_bytes).decode("utf-8")
+            except Exception as e:
+                print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+            try:
+                coords = await agent.predict_click(instruction, image_b64=image_b64)
+            except Exception as e:
+                print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+            if not coords:
+                print_colored("⚠️ No coordinates returned.", Colors.YELLOW)
+                sys.exit(2)
+
+            x, y = coords
+            print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN)
+
+            try:
+                from io import BytesIO
+                with Image.open(BytesIO(img_bytes)) as img:
+                    img = img.convert("RGB")
+                    draw = ImageDraw.Draw(img)
+                    # Draw crosshair
+                    size = 12
+                    color = (255, 0, 0)
+                    draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
+                    draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
+                    # Optional small circle
+                    r = 6
+                    draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
+
+                    out_path = Path.cwd() / f"predict_click_{int(time.time())}.png"
+                    img.save(out_path)
+                    print_colored(f"🖼️ Saved to {out_path}")
+
+                    # Open the image with default viewer
+                    try:
+                        system = platform.system().lower()
+                        if system == "windows":
+                            os.startfile(str(out_path))  # type: ignore[attr-defined]
+                        elif system == "darwin":
+                            os.system(f"open \"{out_path}\"")
+                        else:
+                            os.system(f"xdg-open \"{out_path}\"")
+                    except Exception:
+                        pass
+            except Exception as e:
+                print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+            # Done
+            sys.exit(0)
+
+        # Resolve initial prompt from --prompt-file or --prompt
+        initial_prompt = args.prompt or ""
+        if args.prompt_file:
+            try:
+                initial_prompt = args.prompt_file.read_text(encoding="utf-8")
+            except Exception as e:
+                print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+        # Start chat loop (default interactive mode)
+        await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
 
 
 
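The crosshair rendering in the block above only depends on Pillow; here is a stripped-down, standalone sketch of that drawing step with a synthetic image and coordinates, purely for illustration:

# Standalone sketch of the --predict-click crosshair drawing (synthetic inputs).
from PIL import Image, ImageDraw

img = Image.new("RGB", (640, 400), (255, 255, 255))  # stand-in for the screenshot
x, y = 320, 200                                      # stand-in for the predicted coordinates
draw = ImageDraw.Draw(img)
size, r, color = 12, 6, (255, 0, 0)
draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
img.save("predict_click_example.png")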
agent/loops/__init__.py CHANGED
@@ -10,5 +10,23 @@ from . import omniparser
 from . import gta1
 from . import composed_grounded
 from . import glm45v
+from . import opencua
+from . import internvl
+from . import holo
+from . import moondream3
+from . import gemini
 
-__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded", "glm45v"]
+__all__ = [
+    "anthropic",
+    "openai",
+    "uitars",
+    "omniparser",
+    "gta1",
+    "composed_grounded",
+    "glm45v",
+    "opencua",
+    "internvl",
+    "holo",
+    "moondream3",
+    "gemini"
+]
agent/loops/anthropic.py CHANGED
@@ -33,7 +33,7 @@ from ..responses import (
 MODEL_TOOL_MAPPING = [
     # Claude 4 models
     {
-        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
+        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
         "tool_version": "computer_20250124",
         "beta_flag": "computer-use-2025-01-24"
     },
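
A quick check of the widened regular expression itself (how the loop applies it is not shown in this diff, and the model id strings are illustrative):

import re

# Pattern from the hunk above; claude-haiku-4 ids now map to the computer_20250124 tool entry.
pattern = r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4"
print(bool(re.search(pattern, "claude-haiku-4-20250101")))    # True (illustrative id)
print(bool(re.search(pattern, "claude-3-5-sonnet-20241022"))) # False (not a Claude 4 family id)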
@@ -1577,11 +1577,10 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
                     isinstance(item.get("action"), dict)):
 
                     action = item["action"]
-                    if action.get("type") == "click":
+                    if action.get("x") and action.get("y"):
                         x = action.get("x")
                         y = action.get("y")
-                        if x is not None and y is not None:
-                            return (int(x), int(y))
+                        return (int(x), int(y))
 
         return None
 
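A minimal sketch of the coordinate extraction after this change; the item dict is an assumed shape, and only the keys the code actually reads are meaningful:

# Assumed action item shape (illustrative); the new condition accepts any action that carries coordinates.
item = {"type": "tool_use", "action": {"type": "left_click", "x": 412, "y": 233}}
if isinstance(item.get("action"), dict):
    action = item["action"]
    if action.get("x") and action.get("y"):  # truthiness check: an x or y of 0 would be skipped
        print((int(action["x"]), int(action["y"])))  # (412, 233)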
agent/loops/composed_grounded.py CHANGED
@@ -126,7 +126,7 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str
 
 
 @register_agent(r".*\+.*", priority=1)
-class ComposedGroundedConfig:
+class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.