cua-agent 0.4.32__tar.gz → 0.4.33__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- {cua_agent-0.4.32 → cua_agent-0.4.33}/PKG-INFO +6 -3
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/cli.py +57 -21
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/__init__.py +4 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/anthropic.py +1 -1
- cua_agent-0.4.33/agent/loops/gemini.py +391 -0
- cua_agent-0.4.33/agent/loops/moondream3.py +464 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/openai.py +1 -2
- {cua_agent-0.4.32 → cua_agent-0.4.33}/pyproject.toml +7 -3
- {cua_agent-0.4.32 → cua_agent-0.4.33}/README.md +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/__init__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/__main__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/adapters/__init__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/adapters/huggingfacelocal_adapter.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/adapters/human_adapter.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/adapters/mlxvlm_adapter.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/adapters/models/__init__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/adapters/models/generic.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/adapters/models/internvl.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/adapters/models/opencua.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/adapters/models/qwen2_5_vl.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/agent.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/__init__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/base.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/budget_manager.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/image_retention.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/logging.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/operator_validator.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/pii_anonymization.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/prompt_instructions.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/telemetry.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/callbacks/trajectory_saver.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/computers/__init__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/computers/base.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/computers/cua.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/computers/custom.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/decorators.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/human_tool/__init__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/human_tool/__main__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/human_tool/server.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/human_tool/ui.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/integrations/hud/__init__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/integrations/hud/agent.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/integrations/hud/proxy.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/base.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/composed_grounded.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/glm45v.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/gta1.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/holo.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/internvl.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/model_types.csv +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/omniparser.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/opencua.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/loops/uitars.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/proxy/examples.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/proxy/handlers.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/responses.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/types.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/ui/__init__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/ui/__main__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/ui/gradio/__init__.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/ui/gradio/app.py +0 -0
- {cua_agent-0.4.32 → cua_agent-0.4.33}/agent/ui/gradio/ui_components.py +0 -0
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.32
+Version: 0.4.33
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.12
@@ -49,7 +49,9 @@ Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
 Provides-Extra: cli
 Requires-Dist: yaspin>=3.1.0; extra == "cli"
 Provides-Extra: hud
-Requires-Dist: hud-python==0.4.
+Requires-Dist: hud-python==0.4.52; extra == "hud"
+Provides-Extra: gemini
+Requires-Dist: google-genai>=1.41.0; extra == "gemini"
 Provides-Extra: all
 Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
 Requires-Dist: accelerate; extra == "all"
@@ -62,7 +64,8 @@ Requires-Dist: blobfile>=3.0.0; extra == "all"
 Requires-Dist: gradio>=5.23.3; extra == "all"
 Requires-Dist: python-dotenv>=1.0.1; extra == "all"
 Requires-Dist: yaspin>=3.1.0; extra == "all"
-Requires-Dist: hud-python==0.4.
+Requires-Dist: hud-python==0.4.52; extra == "all"
+Requires-Dist: google-genai>=1.41.0; extra == "all"
 Description-Content-Type: text/markdown
 
 <div align="center">
```
agent/cli.py

```diff
@@ -167,7 +167,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
 
         # Process and display the output
         for item in result.get("output", []):
-            if item.get("type") == "message":
+            if item.get("type") == "message" and item.get("role") == "assistant":
                 # Display agent text response
                 content = item.get("content", [])
                 for content_part in content:
@@ -226,6 +226,13 @@ Examples:
         help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
     )
 
+    parser.add_argument(
+        "--provider",
+        choices=["cloud", "lume", "winsandbox", "docker"],
+        default="cloud",
+        help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
+    )
+
     parser.add_argument(
         "--images",
         type=int,
@@ -257,6 +264,12 @@ Examples:
         help="Initial prompt to send to the agent. Leave blank for interactive mode."
     )
 
+    parser.add_argument(
+        "--prompt-file",
+        type=Path,
+        help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
+    )
+
     parser.add_argument(
         "--predict-click",
         dest="predict_click",
@@ -289,33 +302,35 @@ Examples:
     container_name = os.getenv("CUA_CONTAINER_NAME")
     cua_api_key = os.getenv("CUA_API_KEY")
 
-    # Prompt for missing environment variables
+    # Prompt for missing environment variables (container name always required)
     if not container_name:
-
-
-
-
-
-
-
-
+        if args.provider == "cloud":
+            print_colored("CUA_CONTAINER_NAME not set.", dim=True)
+            print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
+            container_name = input("Enter your CUA container name: ").strip()
+            if not container_name:
+                print_colored("❌ Container name is required.")
+                sys.exit(1)
+        else:
+            container_name = "cli-sandbox"
+
+    # Only require API key for cloud provider
+    if args.provider == "cloud" and not cua_api_key:
         print_colored("CUA_API_KEY not set.", dim=True)
         cua_api_key = input("Enter your CUA API key: ").strip()
         if not cua_api_key:
-            print_colored("❌ API key is required.")
+            print_colored("❌ API key is required for cloud provider.")
             sys.exit(1)
 
     # Check for provider-specific API keys based on model
     provider_api_keys = {
         "openai/": "OPENAI_API_KEY",
         "anthropic/": "ANTHROPIC_API_KEY",
-        "omniparser+": "OPENAI_API_KEY",
-        "omniparser+": "ANTHROPIC_API_KEY",
     }
 
     # Find matching provider and check for API key
     for prefix, env_var in provider_api_keys.items():
-        if args.model
+        if prefix in args.model:
             if not os.getenv(env_var):
                 print_colored(f"{env_var} not set.", dim=True)
                 api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
@@ -335,13 +350,25 @@ Examples:
         print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
         sys.exit(1)
 
+    # Resolve provider -> os_type, provider_type, api key requirement
+    provider_map = {
+        "cloud": ("linux", "cloud", True),
+        "lume": ("macos", "lume", False),
+        "winsandbox": ("windows", "winsandbox", False),
+        "docker": ("linux", "docker", False),
+    }
+    os_type, provider_type, needs_api_key = provider_map[args.provider]
+
+    computer_kwargs = {
+        "os_type": os_type,
+        "provider_type": provider_type,
+        "name": container_name,
+    }
+    if needs_api_key:
+        computer_kwargs["api_key"] = cua_api_key  # type: ignore
+
     # Create computer instance
-    async with Computer(
-        os_type="linux",
-        provider_type="cloud",
-        name=container_name,
-        api_key=cua_api_key
-    ) as computer:
+    async with Computer(**computer_kwargs) as computer:  # type: ignore
 
         # Create agent
         agent_kwargs = {
@@ -442,8 +469,17 @@ Examples:
         # Done
         sys.exit(0)
 
+    # Resolve initial prompt from --prompt-file or --prompt
+    initial_prompt = args.prompt or ""
+    if args.prompt_file:
+        try:
+            initial_prompt = args.prompt_file.read_text(encoding="utf-8")
+        except Exception as e:
+            print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
+            sys.exit(1)
+
     # Start chat loop (default interactive mode)
-    await chat_loop(agent, args.model, container_name,
+    await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
 
 
 
```
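Taken together, the CLI changes make non-cloud runs self-contained: with `--provider lume`, `--provider winsandbox`, or `--provider docker`, the container name defaults to "cli-sandbox" and no CUA API key is requested. A hypothetical invocation (assuming the CLI is launched via its module entry point, which this diff does not show) might look like `python -m agent.cli --model anthropic/claude-3-5-sonnet-20241022 --provider docker --prompt-file task.txt`, where the prompt file, if given, overrides `--prompt`.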
agent/loops/__init__.py

```diff
@@ -13,6 +13,8 @@ from . import glm45v
 from . import opencua
 from . import internvl
 from . import holo
+from . import moondream3
+from . import gemini
 
 __all__ = [
     "anthropic",
@@ -25,4 +27,6 @@ __all__ = [
     "opencua",
     "internvl",
     "holo",
+    "moondream3",
+    "gemini"
 ]
```
@@ -33,7 +33,7 @@ from ..responses import (
|
|
|
33
33
|
MODEL_TOOL_MAPPING = [
|
|
34
34
|
# Claude 4 models
|
|
35
35
|
{
|
|
36
|
-
"pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
|
|
36
|
+
"pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
|
|
37
37
|
"tool_version": "computer_20250124",
|
|
38
38
|
"beta_flag": "computer-use-2025-01-24"
|
|
39
39
|
},
|
|
agent/loops/gemini.py (new file, +391 lines)

```python
"""
Gemini 2.5 Computer Use agent loop

Maps internal Agent SDK message format to Google's Gemini Computer Use API and back.

Key features:
- Lazy import of google.genai
- Configure Computer Use tool with excluded browser-specific predefined functions
- Optional custom function declarations hook for computer-call specific functions
- Convert Gemini function_call parts into internal computer_call actions
"""

from __future__ import annotations

import base64
import io
import uuid
from typing import Any, Dict, List, Optional, Tuple

from PIL import Image

from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..types import AgentCapability


def _lazy_import_genai():
    """Import google.genai lazily to avoid hard dependency unless used."""
    try:
        from google import genai  # type: ignore
        from google.genai import types  # type: ignore
        return genai, types
    except Exception as e:  # pragma: no cover
        raise RuntimeError(
            "google.genai is required for the Gemini Computer Use loop. Install the Google Gemini SDK."
        ) from e


def _data_url_to_bytes(data_url: str) -> Tuple[bytes, str]:
    """Convert a data URL to raw bytes and mime type."""
    if not data_url.startswith("data:"):
        # Assume it's base64 png payload
        try:
            return base64.b64decode(data_url), "image/png"
        except Exception:
            return b"", "application/octet-stream"
    header, b64 = data_url.split(",", 1)
    mime = "image/png"
    if ";" in header:
        mime = header.split(";")[0].split(":", 1)[1] or "image/png"
    return base64.b64decode(b64), mime


def _bytes_image_size(img_bytes: bytes) -> Tuple[int, int]:
    try:
        img = Image.open(io.BytesIO(img_bytes))
        return img.size
    except Exception:
        return (1024, 768)


def _find_last_user_text(messages: List[Dict[str, Any]]) -> List[str]:
    texts: List[str] = []
    for msg in reversed(messages):
        if msg.get("type") in (None, "message") and msg.get("role") == "user":
            content = msg.get("content")
            if isinstance(content, str):
                return [content]
            elif isinstance(content, list):
                for c in content:
                    if c.get("type") in ("input_text", "output_text") and c.get("text"):
                        texts.append(c["text"])  # newest first
            if texts:
                return list(reversed(texts))
    return []


def _find_last_screenshot(messages: List[Dict[str, Any]]) -> Optional[bytes]:
    for msg in reversed(messages):
        if msg.get("type") == "computer_call_output":
            out = msg.get("output", {})
            if isinstance(out, dict) and out.get("type") in ("input_image", "computer_screenshot"):
                image_url = out.get("image_url", "")
                if image_url:
                    data, _ = _data_url_to_bytes(image_url)
                    return data
    return None


def _denormalize(v: int, size: int) -> int:
    # Gemini returns 0-999 normalized
    try:
        return max(0, min(size - 1, int(round(v / 1000 * size))))
    except Exception:
        return 0


def _map_gemini_fc_to_computer_call(
    fc: Dict[str, Any],
    screen_w: int,
    screen_h: int,
) -> Optional[Dict[str, Any]]:
    name = fc.get("name")
    args = fc.get("args", {}) or {}

    action: Dict[str, Any] = {}
    if name == "click_at":
        x = _denormalize(int(args.get("x", 0)), screen_w)
        y = _denormalize(int(args.get("y", 0)), screen_h)
        action = {"type": "click", "x": x, "y": y, "button": "left"}
    elif name == "type_text_at":
        x = _denormalize(int(args.get("x", 0)), screen_w)
        y = _denormalize(int(args.get("y", 0)), screen_h)
        text = args.get("text", "")
        if args.get("press_enter") == True:
            text += "\n"
        action = {"type": "type", "x": x, "y": y, "text": text}
    elif name == "hover_at":
        x = _denormalize(int(args.get("x", 0)), screen_w)
        y = _denormalize(int(args.get("y", 0)), screen_h)
        action = {"type": "move", "x": x, "y": y}
    elif name == "key_combination":
        keys = str(args.get("keys", ""))
        action = {"type": "keypress", "keys": keys}
    elif name == "scroll_document":
        direction = args.get("direction", "down")
        magnitude = 800
        dx, dy = 0, 0
        if direction == "down":
            dy = magnitude
        elif direction == "up":
            dy = -magnitude
        elif direction == "right":
            dx = magnitude
        elif direction == "left":
            dx = -magnitude
        action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": int(screen_w / 2), "y": int(screen_h / 2)}
    elif name == "scroll_at":
        x = _denormalize(int(args.get("x", 500)), screen_w)
        y = _denormalize(int(args.get("y", 500)), screen_h)
        direction = args.get("direction", "down")
        magnitude = int(args.get("magnitude", 800))
        dx, dy = 0, 0
        if direction == "down":
            dy = magnitude
        elif direction == "up":
            dy = -magnitude
        elif direction == "right":
            dx = magnitude
        elif direction == "left":
            dx = -magnitude
        action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": x, "y": y}
    elif name == "drag_and_drop":
        x = _denormalize(int(args.get("x", 0)), screen_w)
        y = _denormalize(int(args.get("y", 0)), screen_h)
        dx = _denormalize(int(args.get("destination_x", x)), screen_w)
        dy = _denormalize(int(args.get("destination_y", y)), screen_h)
        action = {"type": "drag", "start_x": x, "start_y": y, "end_x": dx, "end_y": dy, "button": "left"}
    elif name == "wait_5_seconds":
        action = {"type": "wait"}
    else:
        # Unsupported / excluded browser-specific or custom function; ignore
        return None

    return {
        "type": "computer_call",
        "call_id": uuid.uuid4().hex,
        "status": "completed",
        "action": action,
    }


@register_agent(models=r"^gemini-2\.5-computer-use-preview-10-2025$")
class GeminiComputerUseConfig(AsyncAgentConfig):
    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        genai, types = _lazy_import_genai()

        client = genai.Client()

        # Build excluded predefined functions for browser-specific behavior
        excluded = [
            "open_web_browser",
            "search",
            "navigate",
            "go_forward",
            "go_back",
            "scroll_document",
        ]
        # Optional custom functions: can be extended by host code via `tools` parameter later if desired
        CUSTOM_FUNCTION_DECLARATIONS: List[Any] = []

        # Compose tools config
        generate_content_config = types.GenerateContentConfig(
            tools=[
                types.Tool(
                    computer_use=types.ComputerUse(
                        environment=types.Environment.ENVIRONMENT_BROWSER,
                        excluded_predefined_functions=excluded,
                    )
                ),
                # types.Tool(function_declarations=CUSTOM_FUNCTION_DECLARATIONS),  # enable when custom functions needed
            ]
        )

        # Prepare contents: last user text + latest screenshot
        user_texts = _find_last_user_text(messages)
        screenshot_bytes = _find_last_screenshot(messages)

        parts: List[Any] = []
        for t in user_texts:
            parts.append(types.Part(text=t))

        screen_w, screen_h = 1024, 768
        if screenshot_bytes:
            screen_w, screen_h = _bytes_image_size(screenshot_bytes)
            parts.append(types.Part.from_bytes(data=screenshot_bytes, mime_type="image/png"))

        # If we don't have any content, at least pass an empty user part to prompt reasoning
        if not parts:
            parts = [types.Part(text="Proceed to the next action.")]

        contents = [types.Content(role="user", parts=parts)]

        api_kwargs = {
            "model": model,
            "contents": contents,
            "config": generate_content_config,
        }

        if _on_api_start:
            await _on_api_start({
                "model": api_kwargs["model"],
                # "contents": api_kwargs["contents"],  # Disabled for now
                "config": api_kwargs["config"],
            })

        response = client.models.generate_content(**api_kwargs)

        if _on_api_end:
            await _on_api_end({
                "model": api_kwargs["model"],
                # "contents": api_kwargs["contents"],  # Disabled for now
                "config": api_kwargs["config"],
            }, response)

        # Usage (Gemini SDK may not always provide token usage; populate when available)
        usage: Dict[str, Any] = {}
        try:
            # Some SDKs expose response.usage; if available, copy
            if getattr(response, "usage_metadata", None):
                md = response.usage_metadata
                usage = {
                    "prompt_tokens": getattr(md, "prompt_token_count", None) or 0,
                    "completion_tokens": getattr(md, "candidates_token_count", None) or 0,
                    "total_tokens": getattr(md, "total_token_count", None) or 0,
                }
        except Exception:
            pass

        if _on_usage and usage:
            await _on_usage(usage)

        # Parse output into internal items
        output_items: List[Dict[str, Any]] = []

        candidate = response.candidates[0]
        # Text parts from the model (assistant message)
        text_parts: List[str] = []
        function_calls: List[Dict[str, Any]] = []
        for p in candidate.content.parts:
            if getattr(p, "text", None):
                text_parts.append(p.text)
            if getattr(p, "function_call", None):
                # p.function_call has name and args
                fc = {
                    "name": getattr(p.function_call, "name", None),
                    "args": dict(getattr(p.function_call, "args", {}) or {}),
                }
                function_calls.append(fc)

        if text_parts:
            output_items.append(
                {
                    "type": "message",
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": "\n".join(text_parts)}],
                }
            )

        # Map function calls to internal computer_call actions
        for fc in function_calls:
            item = _map_gemini_fc_to_computer_call(fc, screen_w, screen_h)
            if item is not None:
                output_items.append(item)

        return {"output": output_items, "usage": usage}

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs,
    ) -> Optional[Tuple[float, float]]:
        """Ask Gemini CUA to output a single click action for the given instruction.

        Excludes all predefined tools except `click_at` and sends the screenshot.
        Returns pixel (x, y) if a click is proposed, else None.
        """
        genai, types = _lazy_import_genai()

        client = genai.Client()

        # Exclude all but click_at
        exclude_all_but_click = [
            "open_web_browser",
            "wait_5_seconds",
            "go_back",
            "go_forward",
            "search",
            "navigate",
            "hover_at",
            "type_text_at",
            "key_combination",
            "scroll_document",
            "scroll_at",
            "drag_and_drop",
        ]

        config = types.GenerateContentConfig(
            tools=[
                types.Tool(
                    computer_use=types.ComputerUse(
                        environment=types.Environment.ENVIRONMENT_BROWSER,
                        excluded_predefined_functions=exclude_all_but_click,
                    )
                )
            ]
        )

        # Prepare prompt parts
        try:
            img_bytes = base64.b64decode(image_b64)
        except Exception:
            img_bytes = b""

        w, h = _bytes_image_size(img_bytes) if img_bytes else (1024, 768)

        parts: List[Any] = [types.Part(text=f"Click {instruction}.")]
        if img_bytes:
            parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/png"))

        contents = [types.Content(role="user", parts=parts)]

        response = client.models.generate_content(
            model=model,
            contents=contents,
            config=config,
        )

        # Parse first click_at
        try:
            candidate = response.candidates[0]
            for p in candidate.content.parts:
                fc = getattr(p, "function_call", None)
                if fc and getattr(fc, "name", None) == "click_at":
                    args = dict(getattr(fc, "args", {}) or {})
                    x = _denormalize(int(args.get("x", 0)), w)
                    y = _denormalize(int(args.get("y", 0)), h)
                    return float(x), float(y)
        except Exception:
            return None

        return None

    def get_capabilities(self) -> List[AgentCapability]:
        return ["click", "step"]
```
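As a quick smoke test of the new loop, the grounding entry point can be called directly. The following is a hedged sketch, not from the package docs: it assumes a local `screenshot.png` and that google-genai can authenticate from the environment (for example via `GEMINI_API_KEY`).

```python
import asyncio
import base64

from agent.loops.gemini import GeminiComputerUseConfig

async def main() -> None:
    # Encode a local screenshot as base64 PNG, as predict_click expects.
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    cfg = GeminiComputerUseConfig()
    coords = await cfg.predict_click(
        model="gemini-2.5-computer-use-preview-10-2025",
        image_b64=image_b64,
        instruction="the blue Submit button",
    )
    # Pixel (x, y) floats if Gemini proposed a click_at call, otherwise None.
    print(coords)

asyncio.run(main())
```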
agent/loops/moondream3.py (new file, +464 lines)

```python
"""
Moondream3+ composed-grounded agent loop implementation.
Grounding is handled by a local Moondream3 preview model via Transformers.
Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>".

Differences from composed_grounded:
- Provides a singleton Moondream3 client outside the class.
- predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates.
- If the last image was a screenshot (or we take one), run model.detect(image, "all form ui") to get bboxes, then
  run model.caption on each cropped bbox to label it. Overlay labels on the screenshot and emit via _on_screenshot.
- Add a user message listing all detected form UI names so the thinker can reference them.
- If the thinking model doesn't support vision, filter out image content before calling litellm.
"""

from __future__ import annotations

import uuid
import base64
import io
from typing import Dict, List, Any, Optional, Tuple, Any

from PIL import Image, ImageDraw, ImageFont
import torch
from transformers import AutoModelForCausalLM
import litellm

from ..decorators import register_agent
from ..types import AgentCapability
from ..loops.base import AsyncAgentConfig
from ..responses import (
    convert_computer_calls_xy2desc,
    convert_responses_items_to_completion_messages,
    convert_completion_messages_to_responses_items,
    convert_computer_calls_desc2xy,
    get_all_element_descriptions,
)

_MOONDREAM_SINGLETON = None

def get_moondream_model() -> Any:
    """Get a singleton instance of the Moondream3 preview model."""
    global _MOONDREAM_SINGLETON
    if _MOONDREAM_SINGLETON is None:
        _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
            "moondream/moondream3-preview",
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            device_map="cuda",
        )
    return _MOONDREAM_SINGLETON


def _decode_image_b64(image_b64: str) -> Image.Image:
    data = base64.b64decode(image_b64)
    return Image.open(io.BytesIO(data)).convert("RGB")


def _image_to_b64(img: Image.Image) -> str:
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def _supports_vision(model: str) -> bool:
    """Heuristic vision support detection for thinking model."""
    m = model.lower()
    vision_markers = [
        "gpt-4o",
        "gpt-4.1",
        "o1",
        "o3",
        "claude-3",
        "claude-3.5",
        "sonnet",
        "haiku",
        "opus",
        "gemini-1.5",
        "llava",
    ]
    return any(v in m for v in vision_markers)


def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    filtered: List[Dict[str, Any]] = []
    for msg in messages:
        msg_copy = {**msg}
        content = msg_copy.get("content")
        if isinstance(content, list):
            msg_copy["content"] = [c for c in content if c.get("type") != "image_url"]
        filtered.append(msg_copy)
    return filtered

def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
    """Detect UI elements with Moondream, caption each, draw labels with backgrounds.

    Args:
        base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally.
        model_md: Moondream model instance with .detect() and .query() methods.

    Returns:
        A tuple of (annotated_image_base64_png, detected_names)
    """
    # Ensure RGBA for semi-transparent fills
    if base_img.mode != "RGBA":
        base_img = base_img.convert("RGBA")
    W, H = base_img.width, base_img.height

    # Detect objects
    try:
        detect_result = model_md.detect(base_img, "all ui elements")
        objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else []
    except Exception:
        objects = []

    draw = ImageDraw.Draw(base_img)
    try:
        font = ImageFont.load_default()
    except Exception:
        font = None

    detected_names: List[str] = []

    for i, obj in enumerate(objects):
        try:
            # Clamp normalized coords and crop
            x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0))))
            y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
            x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
            y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
            left, top, right, bottom = int(x_min * W), int(y_min * H), int(x_max * W), int(y_max * H)
            left, top = max(0, left), max(0, top)
            right, bottom = min(W - 1, right), min(H - 1, bottom)
            crop = base_img.crop((left, top, right, bottom))

            # Prompted short caption
            try:
                result = model_md.query(crop, "Caption this UI element in few words.")
                caption_text = (result or {}).get("answer", "")
            except Exception:
                caption_text = ""

            name = (caption_text or "").strip() or f"element_{i+1}"
            detected_names.append(name)

            # Draw bbox
            draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2)

            # Label background with padding and rounded corners
            label = f"{i+1}. {name}"
            padding = 3
            if font:
                text_bbox = draw.textbbox((0, 0), label, font=font)
            else:
                text_bbox = draw.textbbox((0, 0), label)
            text_w = text_bbox[2] - text_bbox[0]
            text_h = text_bbox[3] - text_bbox[1]

            tx = left + 3
            ty = top - (text_h + 2 * padding + 4)
            if ty < 0:
                ty = top + 3

            bg_left = tx - padding
            bg_top = ty - padding
            bg_right = tx + text_w + padding
            bg_bottom = ty + text_h + padding
            try:
                draw.rounded_rectangle(
                    [bg_left, bg_top, bg_right, bg_bottom],
                    radius=4,
                    fill=(0, 0, 0, 160),
                    outline=(255, 215, 0, 200),
                    width=1,
                )
            except Exception:
                draw.rectangle(
                    [bg_left, bg_top, bg_right, bg_bottom],
                    fill=(0, 0, 0, 160),
                    outline=(255, 215, 0, 200),
                    width=1,
                )

            text_fill = (255, 255, 255, 255)
            if font:
                draw.text((tx, ty), label, fill=text_fill, font=font)
            else:
                draw.text((tx, ty), label, fill=text_fill)
        except Exception:
            continue

    # Encode PNG base64
    annotated = base_img
    if annotated.mode not in ("RGBA", "RGB"):
        annotated = annotated.convert("RGBA")
    annotated_b64 = _image_to_b64(annotated)
    return annotated_b64, detected_names

GROUNDED_COMPUTER_TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": "computer",
        "description": (
            "Control a computer by taking screenshots and interacting with UI elements. "
            "The screenshot action will include a list of detected form UI element names when available. "
            "Use element descriptions to locate and interact with UI elements on the screen."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "action": {
                    "type": "string",
                    "enum": [
                        "screenshot",
                        "click",
                        "double_click",
                        "drag",
                        "type",
                        "keypress",
                        "scroll",
                        "move",
                        "wait",
                        "get_current_url",
                        "get_dimensions",
                        "get_environment",
                    ],
                    "description": "The action to perform (required for all actions)",
                },
                "element_description": {
                    "type": "string",
                    "description": "Description of the element to interact with (required for click/double_click/move/scroll)",
                },
                "start_element_description": {
                    "type": "string",
                    "description": "Description of the element to start dragging from (required for drag)",
                },
                "end_element_description": {
                    "type": "string",
                    "description": "Description of the element to drag to (required for drag)",
                },
                "text": {
                    "type": "string",
                    "description": "The text to type (required for type)",
                },
                "keys": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Key(s) to press (required for keypress)",
                },
                "button": {
                    "type": "string",
                    "enum": ["left", "right", "wheel", "back", "forward"],
                    "description": "The mouse button to use for click/double_click",
                },
                "scroll_x": {
                    "type": "integer",
                    "description": "Horizontal scroll amount (required for scroll)",
                },
                "scroll_y": {
                    "type": "integer",
                    "description": "Vertical scroll amount (required for scroll)",
                },
            },
            "required": ["action"],
        },
    },
}

@register_agent(r"moondream3\+.*", priority=2)
class Moondream3PlusConfig(AsyncAgentConfig):
    def __init__(self):
        self.desc2xy: Dict[str, Tuple[float, float]] = {}

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        # Parse composed model: moondream3+<thinking_model>
        if "+" not in model:
            raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}")
        _, thinking_model = model.split("+", 1)

        pre_output_items: List[Dict[str, Any]] = []

        # Acquire last screenshot; if missing, take one
        last_image_b64: Optional[str] = None
        for message in reversed(messages):
            if (
                isinstance(message, dict)
                and message.get("type") == "computer_call_output"
                and isinstance(message.get("output"), dict)
                and message["output"].get("type") == "input_image"
            ):
                image_url = message["output"].get("image_url", "")
                if image_url.startswith("data:image/png;base64,"):
                    last_image_b64 = image_url.split(",", 1)[1]
                    break

        if last_image_b64 is None and computer_handler is not None:
            # Take a screenshot
            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
            if screenshot_b64:
                call_id = uuid.uuid4().hex
                pre_output_items += [
                    {
                        "type": "message",
                        "role": "assistant",
                        "content": [
                            {"type": "output_text", "text": "Taking a screenshot to analyze the current screen."}
                        ],
                    },
                    {"type": "computer_call", "call_id": call_id, "status": "completed", "action": {"type": "screenshot"}},
                    {
                        "type": "computer_call_output",
                        "call_id": call_id,
                        "output": {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
                    },
                ]
                last_image_b64 = screenshot_b64
                if _on_screenshot:
                    await _on_screenshot(screenshot_b64)

        # If we have a last screenshot, run Moondream detection and labeling
        detected_names: List[str] = []
        if last_image_b64 is not None:
            base_img = _decode_image_b64(last_image_b64)
            model_md = get_moondream_model()
            annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md)
            if _on_screenshot:
                await _on_screenshot(annotated_b64, "annotated_form_ui")

            # Also push a user message listing all detected names
            if detected_names:
                names_text = "\n".join(f"- {n}" for n in detected_names)
                pre_output_items.append(
                    {
                        "type": "message",
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": "Detected form UI elements on screen:"},
                            {"type": "input_text", "text": names_text},
                            {"type": "input_text", "text": "Please continue with the next action needed to perform your task."}
                        ],
                    }
                )

        tool_schemas = []
        for schema in (tools or []):
            if schema.get("type") == "computer":
                tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
            else:
                tool_schemas.append(schema)

        # Step 1: Convert computer calls from xy to descriptions
        input_messages = messages + pre_output_items
        messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)

        # Step 2: Convert responses items to completion messages
        completion_messages = convert_responses_items_to_completion_messages(
            messages_with_descriptions,
            allow_images_in_tool_results=False,
        )

        # Optionally filter images if model lacks vision
        if not _supports_vision(thinking_model):
            completion_messages = _filter_images_from_completion_messages(completion_messages)

        # Step 3: Call thinking model with litellm.acompletion
        api_kwargs = {
            "model": thinking_model,
            "messages": completion_messages,
            "tools": tool_schemas,
            "max_retries": max_retries,
            "stream": stream,
            **kwargs,
        }
        if use_prompt_caching:
            api_kwargs["use_prompt_caching"] = use_prompt_caching

        if _on_api_start:
            await _on_api_start(api_kwargs)

        response = await litellm.acompletion(**api_kwargs)

        if _on_api_end:
            await _on_api_end(api_kwargs, response)

        usage = {
            **response.usage.model_dump(),  # type: ignore
            "response_cost": response._hidden_params.get("response_cost", 0.0),
        }
        if _on_usage:
            await _on_usage(usage)

        # Step 4: Convert completion messages back to responses items format
        response_dict = response.model_dump()  # type: ignore
        choice_messages = [choice["message"] for choice in response_dict["choices"]]
        thinking_output_items: List[Dict[str, Any]] = []
        for choice_message in choice_messages:
            thinking_output_items.extend(
                convert_completion_messages_to_responses_items([choice_message])
            )

        # Step 5: Use Moondream to get coordinates for each description
        element_descriptions = get_all_element_descriptions(thinking_output_items)
        if element_descriptions and last_image_b64:
            for desc in element_descriptions:
                for _ in range(3):  # try 3 times
                    coords = await self.predict_click(
                        model=model,
                        image_b64=last_image_b64,
                        instruction=desc,
                    )
                    if coords:
                        self.desc2xy[desc] = coords
                        break

        # Step 6: Convert computer calls from descriptions back to xy coordinates
        final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)

        # Step 7: Return output and usage
        return {"output": pre_output_items + final_output_items, "usage": usage}

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs,
    ) -> Optional[Tuple[float, float]]:
        """Predict click coordinates using Moondream3's point API.

        Returns pixel coordinates (x, y) as floats.
        """
        img = _decode_image_b64(image_b64)
        W, H = img.width, img.height
        model_md = get_moondream_model()
        try:
            result = model_md.point(img, instruction, settings={"max_objects": 1})
        except Exception:
            return None

        try:
            pt = (result or {}).get("points", [])[0]
            x_norm = float(pt.get("x", 0.0))
            y_norm = float(pt.get("y", 0.0))
            x_px = max(0.0, min(float(W - 1), x_norm * W))
            y_px = max(0.0, min(float(H - 1), y_norm * H))
            return (x_px, y_px)
        except Exception:
            return None

    def get_capabilities(self) -> List[AgentCapability]:
        return ["click", "step"]
```
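The Moondream grounding half can be exercised the same way. A hedged sketch, assuming a CUDA GPU is available (the singleton loads moondream/moondream3-preview with `device_map="cuda"`) and that a local `screenshot.png` exists; the thinking half of the composed model string is never contacted by `predict_click`.

```python
import asyncio
import base64

from agent.loops.moondream3 import Moondream3PlusConfig

async def main() -> None:
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    cfg = Moondream3PlusConfig()
    coords = await cfg.predict_click(
        model="moondream3+openai/gpt-4o",  # only the "moondream3" half matters for grounding
        image_b64=image_b64,
        instruction="the search text field",
    )
    print(coords)  # pixel (x, y) floats, or None if Moondream returned no point

asyncio.run(main())
```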
agent/loops/openai.py

```diff
@@ -53,8 +53,7 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
 
     return openai_tools
 
-
-@register_agent(models=r".*computer-use-preview.*")
+@register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
```
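The tightened pattern looks like a disambiguation fix: the old regex `.*computer-use-preview.*` would also have matched the newly registered `gemini-2.5-computer-use-preview-10-2025`, whereas `.*(^|/)computer-use-preview` only matches model strings where `computer-use-preview` starts the string or directly follows a provider prefix such as `openai/`.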
pyproject.toml

```diff
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "cua-agent"
-version = "0.4.32"
+version = "0.4.33"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
@@ -70,7 +70,10 @@ cli = [
     "yaspin>=3.1.0",
 ]
 hud = [
-    "hud-python==0.4.
+    "hud-python==0.4.52",
+]
+gemini = [
+    "google-genai>=1.41.0",
 ]
 all = [
     "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
@@ -84,7 +87,8 @@ all = [
     "gradio>=5.23.3",
     "python-dotenv>=1.0.1",
     "yaspin>=3.1.0",
-    "hud-python==0.4.
+    "hud-python==0.4.52",
+    "google-genai>=1.41.0",
 ]
 
 [tool.uv]
```
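With the new extra published in both PKG-INFO and pyproject.toml, the Gemini loop's google-genai dependency can be pulled in with the standard extras syntax, e.g. `pip install "cua-agent[gemini]"`; the `all` extra now also includes google-genai>=1.41.0 and pins hud-python to 0.4.52.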