cua-agent: cua_agent-0.4.31-py3-none-any.whl → cua_agent-0.4.33-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/adapters/huggingfacelocal_adapter.py +15 -66
- agent/adapters/models/__init__.py +33 -0
- agent/adapters/models/generic.py +75 -0
- agent/adapters/models/internvl.py +254 -0
- agent/adapters/models/opencua.py +100 -0
- agent/adapters/models/qwen2_5_vl.py +75 -0
- agent/agent.py +5 -1
- agent/callbacks/trajectory_saver.py +2 -0
- agent/cli.py +147 -22
- agent/loops/__init__.py +19 -1
- agent/loops/anthropic.py +3 -4
- agent/loops/composed_grounded.py +1 -1
- agent/loops/gemini.py +391 -0
- agent/loops/glm45v.py +3 -2
- agent/loops/gta1.py +1 -1
- agent/loops/holo.py +216 -0
- agent/loops/internvl.py +185 -0
- agent/loops/moondream3.py +464 -0
- agent/loops/openai.py +1 -2
- agent/loops/opencua.py +142 -0
- agent/loops/uitars.py +1 -1
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/METADATA +23 -4
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/RECORD +25 -15
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/entry_points.txt +0 -0
|
@@ -188,6 +188,8 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
188
188
|
model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
|
|
189
189
|
if "+" in model:
|
|
190
190
|
model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
|
|
191
|
+
# strip non-alphanumeric characters from model_name_short
|
|
192
|
+
model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
|
|
191
193
|
|
|
192
194
|
# id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
|
|
193
195
|
now = datetime.now()
|
agent/cli.py
CHANGED
|
@@ -18,6 +18,15 @@ try:
|
|
|
18
18
|
import json
|
|
19
19
|
from typing import List, Dict, Any
|
|
20
20
|
import dotenv
|
|
21
|
+
import base64
|
|
22
|
+
import time
|
|
23
|
+
import platform
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
try:
|
|
26
|
+
from PIL import Image, ImageDraw
|
|
27
|
+
PIL_AVAILABLE = True
|
|
28
|
+
except Exception:
|
|
29
|
+
PIL_AVAILABLE = False
|
|
21
30
|
from yaspin import yaspin
|
|
22
31
|
except ImportError:
|
|
23
32
|
if __name__ == "__main__":
|
|
@@ -158,7 +167,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
158
167
|
|
|
159
168
|
# Process and display the output
|
|
160
169
|
for item in result.get("output", []):
|
|
161
|
-
if item.get("type") == "message":
|
|
170
|
+
if item.get("type") == "message" and item.get("role") == "assistant":
|
|
162
171
|
# Display agent text response
|
|
163
172
|
content = item.get("content", [])
|
|
164
173
|
for content_part in content:
|
|
@@ -217,6 +226,13 @@ Examples:
|
|
|
217
226
|
help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
|
|
218
227
|
)
|
|
219
228
|
|
|
229
|
+
parser.add_argument(
|
|
230
|
+
"--provider",
|
|
231
|
+
choices=["cloud", "lume", "winsandbox", "docker"],
|
|
232
|
+
default="cloud",
|
|
233
|
+
help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
|
|
234
|
+
)
|
|
235
|
+
|
|
220
236
|
parser.add_argument(
|
|
221
237
|
"--images",
|
|
222
238
|
type=int,
|
|
@@ -248,6 +264,19 @@ Examples:
|
|
|
248
264
|
help="Initial prompt to send to the agent. Leave blank for interactive mode."
|
|
249
265
|
)
|
|
250
266
|
|
|
267
|
+
parser.add_argument(
|
|
268
|
+
"--prompt-file",
|
|
269
|
+
type=Path,
|
|
270
|
+
help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
parser.add_argument(
|
|
274
|
+
"--predict-click",
|
|
275
|
+
dest="predict_click",
|
|
276
|
+
type=str,
|
|
277
|
+
help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
|
|
278
|
+
)
|
|
279
|
+
|
|
251
280
|
parser.add_argument(
|
|
252
281
|
"-c", "--cache",
|
|
253
282
|
action="store_true",
|
|
@@ -273,33 +302,35 @@ Examples:
|
|
|
273
302
|
container_name = os.getenv("CUA_CONTAINER_NAME")
|
|
274
303
|
cua_api_key = os.getenv("CUA_API_KEY")
|
|
275
304
|
|
|
276
|
-
# Prompt for missing environment variables
|
|
305
|
+
# Prompt for missing environment variables (container name always required)
|
|
277
306
|
if not container_name:
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
307
|
+
if args.provider == "cloud":
|
|
308
|
+
print_colored("CUA_CONTAINER_NAME not set.", dim=True)
|
|
309
|
+
print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
|
|
310
|
+
container_name = input("Enter your CUA container name: ").strip()
|
|
311
|
+
if not container_name:
|
|
312
|
+
print_colored("❌ Container name is required.")
|
|
313
|
+
sys.exit(1)
|
|
314
|
+
else:
|
|
315
|
+
container_name = "cli-sandbox"
|
|
316
|
+
|
|
317
|
+
# Only require API key for cloud provider
|
|
318
|
+
if args.provider == "cloud" and not cua_api_key:
|
|
286
319
|
print_colored("CUA_API_KEY not set.", dim=True)
|
|
287
320
|
cua_api_key = input("Enter your CUA API key: ").strip()
|
|
288
321
|
if not cua_api_key:
|
|
289
|
-
print_colored("❌ API key is required.")
|
|
322
|
+
print_colored("❌ API key is required for cloud provider.")
|
|
290
323
|
sys.exit(1)
|
|
291
324
|
|
|
292
325
|
# Check for provider-specific API keys based on model
|
|
293
326
|
provider_api_keys = {
|
|
294
327
|
"openai/": "OPENAI_API_KEY",
|
|
295
328
|
"anthropic/": "ANTHROPIC_API_KEY",
|
|
296
|
-
"omniparser+": "OPENAI_API_KEY",
|
|
297
|
-
"omniparser+": "ANTHROPIC_API_KEY",
|
|
298
329
|
}
|
|
299
330
|
|
|
300
331
|
# Find matching provider and check for API key
|
|
301
332
|
for prefix, env_var in provider_api_keys.items():
|
|
302
|
-
if args.model
|
|
333
|
+
if prefix in args.model:
|
|
303
334
|
if not os.getenv(env_var):
|
|
304
335
|
print_colored(f"{env_var} not set.", dim=True)
|
|
305
336
|
api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
|
|
@@ -319,18 +350,31 @@ Examples:
|
|
|
319
350
|
print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
|
|
320
351
|
sys.exit(1)
|
|
321
352
|
|
|
353
|
+
# Resolve provider -> os_type, provider_type, api key requirement
|
|
354
|
+
provider_map = {
|
|
355
|
+
"cloud": ("linux", "cloud", True),
|
|
356
|
+
"lume": ("macos", "lume", False),
|
|
357
|
+
"winsandbox": ("windows", "winsandbox", False),
|
|
358
|
+
"docker": ("linux", "docker", False),
|
|
359
|
+
}
|
|
360
|
+
os_type, provider_type, needs_api_key = provider_map[args.provider]
|
|
361
|
+
|
|
362
|
+
computer_kwargs = {
|
|
363
|
+
"os_type": os_type,
|
|
364
|
+
"provider_type": provider_type,
|
|
365
|
+
"name": container_name,
|
|
366
|
+
}
|
|
367
|
+
if needs_api_key:
|
|
368
|
+
computer_kwargs["api_key"] = cua_api_key # type: ignore
|
|
369
|
+
|
|
322
370
|
# Create computer instance
|
|
323
|
-
async with Computer(
|
|
324
|
-
os_type="linux",
|
|
325
|
-
provider_type="cloud",
|
|
326
|
-
name=container_name,
|
|
327
|
-
api_key=cua_api_key
|
|
328
|
-
) as computer:
|
|
371
|
+
async with Computer(**computer_kwargs) as computer: # type: ignore
|
|
329
372
|
|
|
330
373
|
# Create agent
|
|
331
374
|
agent_kwargs = {
|
|
332
375
|
"model": args.model,
|
|
333
376
|
"tools": [computer],
|
|
377
|
+
"trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
|
|
334
378
|
"verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
|
|
335
379
|
"max_retries": args.max_retries
|
|
336
380
|
}
|
|
@@ -353,8 +397,89 @@ Examples:
|
|
|
353
397
|
|
|
354
398
|
agent = ComputerAgent(**agent_kwargs)
|
|
355
399
|
|
|
356
|
-
#
|
|
357
|
-
|
|
400
|
+
# If predict-click mode is requested, run once and exit
|
|
401
|
+
if args.predict_click:
|
|
402
|
+
if not PIL_AVAILABLE:
|
|
403
|
+
print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True)
|
|
404
|
+
sys.exit(1)
|
|
405
|
+
|
|
406
|
+
instruction = args.predict_click
|
|
407
|
+
print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN)
|
|
408
|
+
|
|
409
|
+
# Take a fresh screenshot FIRST
|
|
410
|
+
try:
|
|
411
|
+
img_bytes = await computer.interface.screenshot()
|
|
412
|
+
except Exception as e:
|
|
413
|
+
print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True)
|
|
414
|
+
sys.exit(1)
|
|
415
|
+
|
|
416
|
+
# Encode screenshot to base64 for predict_click
|
|
417
|
+
try:
|
|
418
|
+
image_b64 = base64.b64encode(img_bytes).decode("utf-8")
|
|
419
|
+
except Exception as e:
|
|
420
|
+
print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True)
|
|
421
|
+
sys.exit(1)
|
|
422
|
+
|
|
423
|
+
try:
|
|
424
|
+
coords = await agent.predict_click(instruction, image_b64=image_b64)
|
|
425
|
+
except Exception as e:
|
|
426
|
+
print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True)
|
|
427
|
+
sys.exit(1)
|
|
428
|
+
|
|
429
|
+
if not coords:
|
|
430
|
+
print_colored("⚠️ No coordinates returned.", Colors.YELLOW)
|
|
431
|
+
sys.exit(2)
|
|
432
|
+
|
|
433
|
+
x, y = coords
|
|
434
|
+
print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN)
|
|
435
|
+
|
|
436
|
+
try:
|
|
437
|
+
from io import BytesIO
|
|
438
|
+
with Image.open(BytesIO(img_bytes)) as img:
|
|
439
|
+
img = img.convert("RGB")
|
|
440
|
+
draw = ImageDraw.Draw(img)
|
|
441
|
+
# Draw crosshair
|
|
442
|
+
size = 12
|
|
443
|
+
color = (255, 0, 0)
|
|
444
|
+
draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
|
|
445
|
+
draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
|
|
446
|
+
# Optional small circle
|
|
447
|
+
r = 6
|
|
448
|
+
draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
|
|
449
|
+
|
|
450
|
+
out_path = Path.cwd() / f"predict_click_{int(time.time())}.png"
|
|
451
|
+
img.save(out_path)
|
|
452
|
+
print_colored(f"🖼️ Saved to {out_path}")
|
|
453
|
+
|
|
454
|
+
# Open the image with default viewer
|
|
455
|
+
try:
|
|
456
|
+
system = platform.system().lower()
|
|
457
|
+
if system == "windows":
|
|
458
|
+
os.startfile(str(out_path)) # type: ignore[attr-defined]
|
|
459
|
+
elif system == "darwin":
|
|
460
|
+
os.system(f"open \"{out_path}\"")
|
|
461
|
+
else:
|
|
462
|
+
os.system(f"xdg-open \"{out_path}\"")
|
|
463
|
+
except Exception:
|
|
464
|
+
pass
|
|
465
|
+
except Exception as e:
|
|
466
|
+
print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True)
|
|
467
|
+
sys.exit(1)
|
|
468
|
+
|
|
469
|
+
# Done
|
|
470
|
+
sys.exit(0)
|
|
471
|
+
|
|
472
|
+
# Resolve initial prompt from --prompt-file or --prompt
|
|
473
|
+
initial_prompt = args.prompt or ""
|
|
474
|
+
if args.prompt_file:
|
|
475
|
+
try:
|
|
476
|
+
initial_prompt = args.prompt_file.read_text(encoding="utf-8")
|
|
477
|
+
except Exception as e:
|
|
478
|
+
print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
|
|
479
|
+
sys.exit(1)
|
|
480
|
+
|
|
481
|
+
# Start chat loop (default interactive mode)
|
|
482
|
+
await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
|
|
358
483
|
|
|
359
484
|
|
|
360
485
|
|
agent/loops/__init__.py
CHANGED
|
@@ -10,5 +10,23 @@ from . import omniparser
|
|
|
10
10
|
from . import gta1
|
|
11
11
|
from . import composed_grounded
|
|
12
12
|
from . import glm45v
|
|
13
|
+
from . import opencua
|
|
14
|
+
from . import internvl
|
|
15
|
+
from . import holo
|
|
16
|
+
from . import moondream3
|
|
17
|
+
from . import gemini
|
|
13
18
|
|
|
14
|
-
__all__ = [
|
|
19
|
+
__all__ = [
|
|
20
|
+
"anthropic",
|
|
21
|
+
"openai",
|
|
22
|
+
"uitars",
|
|
23
|
+
"omniparser",
|
|
24
|
+
"gta1",
|
|
25
|
+
"composed_grounded",
|
|
26
|
+
"glm45v",
|
|
27
|
+
"opencua",
|
|
28
|
+
"internvl",
|
|
29
|
+
"holo",
|
|
30
|
+
"moondream3",
|
|
31
|
+
"gemini"
|
|
32
|
+
]
|
agent/loops/anthropic.py
CHANGED
|
@@ -33,7 +33,7 @@ from ..responses import (
|
|
|
33
33
|
MODEL_TOOL_MAPPING = [
|
|
34
34
|
# Claude 4 models
|
|
35
35
|
{
|
|
36
|
-
"pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
|
|
36
|
+
"pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
|
|
37
37
|
"tool_version": "computer_20250124",
|
|
38
38
|
"beta_flag": "computer-use-2025-01-24"
|
|
39
39
|
},
|
|
@@ -1577,11 +1577,10 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
|
|
|
1577
1577
|
isinstance(item.get("action"), dict)):
|
|
1578
1578
|
|
|
1579
1579
|
action = item["action"]
|
|
1580
|
-
if action.get("
|
|
1580
|
+
if action.get("x") and action.get("y"):
|
|
1581
1581
|
x = action.get("x")
|
|
1582
1582
|
y = action.get("y")
|
|
1583
|
-
|
|
1584
|
-
return (int(x), int(y))
|
|
1583
|
+
return (int(x), int(y))
|
|
1585
1584
|
|
|
1586
1585
|
return None
|
|
1587
1586
|
|
agent/loops/composed_grounded.py
CHANGED
|
@@ -126,7 +126,7 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str
|
|
|
126
126
|
|
|
127
127
|
|
|
128
128
|
@register_agent(r".*\+.*", priority=1)
|
|
129
|
-
class ComposedGroundedConfig:
|
|
129
|
+
class ComposedGroundedConfig(AsyncAgentConfig):
|
|
130
130
|
"""
|
|
131
131
|
Composed-grounded agent configuration that uses both grounding and thinking models.
|
|
132
132
|
|