cua-agent 0.4.31__py3-none-any.whl → 0.4.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/adapters/huggingfacelocal_adapter.py +15 -66
- agent/adapters/models/__init__.py +33 -0
- agent/adapters/models/generic.py +75 -0
- agent/adapters/models/internvl.py +254 -0
- agent/adapters/models/opencua.py +100 -0
- agent/adapters/models/qwen2_5_vl.py +75 -0
- agent/agent.py +5 -1
- agent/callbacks/trajectory_saver.py +2 -0
- agent/cli.py +147 -22
- agent/loops/__init__.py +19 -1
- agent/loops/anthropic.py +3 -4
- agent/loops/composed_grounded.py +1 -1
- agent/loops/gemini.py +391 -0
- agent/loops/glm45v.py +3 -2
- agent/loops/gta1.py +1 -1
- agent/loops/holo.py +216 -0
- agent/loops/internvl.py +185 -0
- agent/loops/moondream3.py +464 -0
- agent/loops/openai.py +1 -2
- agent/loops/opencua.py +142 -0
- agent/loops/uitars.py +1 -1
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/METADATA +23 -4
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/RECORD +25 -15
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/entry_points.txt +0 -0
agent/loops/internvl.py
ADDED
@@ -0,0 +1,185 @@
"""
InternVL agent loop implementation for click prediction using litellm.acompletion.

Implements the ScreenSpot InternVL grounding baseline behavior:
- Uses the exact grounding prompt format with <image> and <ref> tags
- Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]]
- Converts to pixel coordinates relative to the original screenshot size

Note: We do NOT manually load the InternVL model; acompletions (via HuggingFaceLocalAdapter)
will handle loading based on the provided model name.
"""

from __future__ import annotations

import base64
import math
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple

from PIL import Image
import litellm

from ..decorators import register_agent
from .composed_grounded import ComposedGroundedConfig
from ..types import AgentCapability


# Regex patterns for extracting coordinates
# Accept optional whitespace and optional decimal fractions
_NUM = r"(\d+(?:\.\d+)?)"
_POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
_BBOX_PATTERN = re.compile(
    r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
)


def _extract_first_point(text: str) -> Optional[Tuple[float, float]]:
    """Extract the first [[x,y]] as normalized (0-1000) floats."""
    m = _POINT_PATTERN.search(text)
    if not m:
        return None
    try:
        x = float(m.group(1))
        y = float(m.group(2))
        return x, y
    except Exception:
        return None


def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]:
    """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats."""
    matches = list(_BBOX_PATTERN.finditer(text))
    if not matches:
        return None
    m = matches[-1]
    try:
        x1 = float(m.group(1))
        y1 = float(m.group(2))
        x2 = float(m.group(3))
        y2 = float(m.group(4))
        return x1, y1, x2, y2
    except Exception:
        return None


def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]:
    """Scale 0-1000 normalized coordinates to pixel coordinates for given image size."""
    x_px = int(math.floor((x_norm / 1000.0) * width))
    y_px = int(math.floor((y_norm / 1000.0) * height))
    # Clamp to image bounds just in case
    x_px = max(0, min(width - 1, x_px))
    y_px = max(0, min(height - 1, y_px))
    return x_px, y_px


@register_agent(models=r"(?i).*InternVL.*")
class InternVLConfig(ComposedGroundedConfig):
    """InternVL agent configuration reusing ComposedGroundedConfig for steps and
    overriding predict_click to implement ScreenSpot InternVL grounding baseline."""

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs
    ) -> Dict[str, Any]:
        """Fallback to a self-composed model"""
        return await super().predict_step(
            messages=messages,
            model=f"{model}+{model}",
            tools=tools,
            max_retries=max_retries,
            stream=stream,
            computer_handler=computer_handler,
            _on_api_start=_on_api_start,
            _on_api_end=_on_api_end,
            _on_usage=_on_usage,
            _on_screenshot=_on_screenshot,
            **kwargs
        )

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using InternVL via litellm.acompletion.

        Behavior mirrors the ScreenSpot InternVL baseline:
        - Prompt: "<image>\nPlease provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. Answer in the format of [[x1, y1, x2, y2]]"
        - Parse either [[x,y]] point or [[x1,y1,x2,y2]] bbox, using bbox center if point missing
        - Coordinates are 0-1000 normalized; convert to pixel coordinates for the original screenshot
        """
        try:
            # Decode image dimensions to scale the normalized outputs
            img_bytes = base64.b64decode(image_b64)
            image = Image.open(BytesIO(img_bytes))
            width, height = image.size
        except Exception:
            # If decoding fails, proceed with a safe default size to avoid crash
            width, height = 1920, 1080

        # Build grounding prompt exactly like the baseline
        grounding_prompt = (
            f"Please provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. "
            f"Answer in the format of [[x1, y1, x2, y2]]"
        )

        # Prepare messages for LiteLLM
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                    {"type": "text", "text": grounding_prompt},
                ],
            }
        ]

        # Call acompletion; HuggingFaceLocalAdapter/model handler will handle InternVL loading
        api_kwargs = {
            "model": model,
            "messages": messages,
            # Conservative generation params akin to baseline (deterministic)
            "max_tokens": kwargs.get("max_tokens", 256),
            "temperature": kwargs.get("temperature", 0.0),
        }

        response = await litellm.acompletion(**api_kwargs)
        output_text = (response.choices[0].message.content or "").strip()  # type: ignore

        # print(f"InternVL output: {output_text}")

        # Try to parse a point first; if absent, parse bbox and take center
        point = _extract_first_point(output_text)
        if point is None:
            bbox = _extract_last_bbox(output_text)
            if bbox is None:
                return None
            x1, y1, x2, y2 = bbox
            cx = (x1 + x2) / 2.0
            cy = (y1 + y2) / 2.0
            point = (cx, cy)

        x_norm, y_norm = point
        x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height)
        return (x_px, y_px)

    def get_capabilities(self) -> List[AgentCapability]:
        return ["click", "step"]
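For reference, a minimal sketch of how the parsing helpers above turn a typical InternVL reply into a click point; the reply text and the 1920x1080 screen size below are made-up examples, not values from the package:

    # Illustrative only: exercises the helpers defined in agent/loops/internvl.py above.
    reply = "The button is at [[100, 200, 300, 400]]"
    assert _extract_first_point(reply) is None           # no [[x, y]] point in this reply
    x1, y1, x2, y2 = _extract_last_bbox(reply)           # (100.0, 200.0, 300.0, 400.0), 0-1000 normalized
    cx, cy = (x1 + x2) / 2.0, (y1 + y2) / 2.0            # bbox center, still normalized: (200.0, 300.0)
    print(_scale_norm_to_pixels(cx, cy, 1920, 1080))     # -> (384, 324) pixels on a 1920x1080 screenshot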
agent/loops/moondream3.py
ADDED
@@ -0,0 +1,464 @@
"""
Moondream3+ composed-grounded agent loop implementation.
Grounding is handled by a local Moondream3 preview model via Transformers.
Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>".

Differences from composed_grounded:
- Provides a singleton Moondream3 client outside the class.
- predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates.
- If the last image was a screenshot (or we take one), run model.detect(image, "all form ui") to get bboxes, then
  run model.caption on each cropped bbox to label it. Overlay labels on the screenshot and emit via _on_screenshot.
- Add a user message listing all detected form UI names so the thinker can reference them.
- If the thinking model doesn't support vision, filter out image content before calling litellm.
"""

from __future__ import annotations

import uuid
import base64
import io
from typing import Dict, List, Any, Optional, Tuple, Any

from PIL import Image, ImageDraw, ImageFont
import torch
from transformers import AutoModelForCausalLM
import litellm

from ..decorators import register_agent
from ..types import AgentCapability
from ..loops.base import AsyncAgentConfig
from ..responses import (
    convert_computer_calls_xy2desc,
    convert_responses_items_to_completion_messages,
    convert_completion_messages_to_responses_items,
    convert_computer_calls_desc2xy,
    get_all_element_descriptions,
)

_MOONDREAM_SINGLETON = None

def get_moondream_model() -> Any:
    """Get a singleton instance of the Moondream3 preview model."""
    global _MOONDREAM_SINGLETON
    if _MOONDREAM_SINGLETON is None:
        _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
            "moondream/moondream3-preview",
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            device_map="cuda",
        )
    return _MOONDREAM_SINGLETON


def _decode_image_b64(image_b64: str) -> Image.Image:
    data = base64.b64decode(image_b64)
    return Image.open(io.BytesIO(data)).convert("RGB")


def _image_to_b64(img: Image.Image) -> str:
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def _supports_vision(model: str) -> bool:
    """Heuristic vision support detection for thinking model."""
    m = model.lower()
    vision_markers = [
        "gpt-4o",
        "gpt-4.1",
        "o1",
        "o3",
        "claude-3",
        "claude-3.5",
        "sonnet",
        "haiku",
        "opus",
        "gemini-1.5",
        "llava",
    ]
    return any(v in m for v in vision_markers)


def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    filtered: List[Dict[str, Any]] = []
    for msg in messages:
        msg_copy = {**msg}
        content = msg_copy.get("content")
        if isinstance(content, list):
            msg_copy["content"] = [c for c in content if c.get("type") != "image_url"]
        filtered.append(msg_copy)
    return filtered

def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
    """Detect UI elements with Moondream, caption each, draw labels with backgrounds.

    Args:
        base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally.
        model_md: Moondream model instance with .detect() and .query() methods.

    Returns:
        A tuple of (annotated_image_base64_png, detected_names)
    """
    # Ensure RGBA for semi-transparent fills
    if base_img.mode != "RGBA":
        base_img = base_img.convert("RGBA")
    W, H = base_img.width, base_img.height

    # Detect objects
    try:
        detect_result = model_md.detect(base_img, "all ui elements")
        objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else []
    except Exception:
        objects = []

    draw = ImageDraw.Draw(base_img)
    try:
        font = ImageFont.load_default()
    except Exception:
        font = None

    detected_names: List[str] = []

    for i, obj in enumerate(objects):
        try:
            # Clamp normalized coords and crop
            x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0))))
            y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
            x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
            y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
            left, top, right, bottom = int(x_min * W), int(y_min * H), int(x_max * W), int(y_max * H)
            left, top = max(0, left), max(0, top)
            right, bottom = min(W - 1, right), min(H - 1, bottom)
            crop = base_img.crop((left, top, right, bottom))

            # Prompted short caption
            try:
                result = model_md.query(crop, "Caption this UI element in few words.")
                caption_text = (result or {}).get("answer", "")
            except Exception:
                caption_text = ""

            name = (caption_text or "").strip() or f"element_{i+1}"
            detected_names.append(name)

            # Draw bbox
            draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2)

            # Label background with padding and rounded corners
            label = f"{i+1}. {name}"
            padding = 3
            if font:
                text_bbox = draw.textbbox((0, 0), label, font=font)
            else:
                text_bbox = draw.textbbox((0, 0), label)
            text_w = text_bbox[2] - text_bbox[0]
            text_h = text_bbox[3] - text_bbox[1]

            tx = left + 3
            ty = top - (text_h + 2 * padding + 4)
            if ty < 0:
                ty = top + 3

            bg_left = tx - padding
            bg_top = ty - padding
            bg_right = tx + text_w + padding
            bg_bottom = ty + text_h + padding
            try:
                draw.rounded_rectangle(
                    [bg_left, bg_top, bg_right, bg_bottom],
                    radius=4,
                    fill=(0, 0, 0, 160),
                    outline=(255, 215, 0, 200),
                    width=1,
                )
            except Exception:
                draw.rectangle(
                    [bg_left, bg_top, bg_right, bg_bottom],
                    fill=(0, 0, 0, 160),
                    outline=(255, 215, 0, 200),
                    width=1,
                )

            text_fill = (255, 255, 255, 255)
            if font:
                draw.text((tx, ty), label, fill=text_fill, font=font)
            else:
                draw.text((tx, ty), label, fill=text_fill)
        except Exception:
            continue

    # Encode PNG base64
    annotated = base_img
    if annotated.mode not in ("RGBA", "RGB"):
        annotated = annotated.convert("RGBA")
    annotated_b64 = _image_to_b64(annotated)
    return annotated_b64, detected_names

GROUNDED_COMPUTER_TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": "computer",
        "description": (
            "Control a computer by taking screenshots and interacting with UI elements. "
            "The screenshot action will include a list of detected form UI element names when available. "
            "Use element descriptions to locate and interact with UI elements on the screen."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "action": {
                    "type": "string",
                    "enum": [
                        "screenshot",
                        "click",
                        "double_click",
                        "drag",
                        "type",
                        "keypress",
                        "scroll",
                        "move",
                        "wait",
                        "get_current_url",
                        "get_dimensions",
                        "get_environment",
                    ],
                    "description": "The action to perform (required for all actions)",
                },
                "element_description": {
                    "type": "string",
                    "description": "Description of the element to interact with (required for click/double_click/move/scroll)",
                },
                "start_element_description": {
                    "type": "string",
                    "description": "Description of the element to start dragging from (required for drag)",
                },
                "end_element_description": {
                    "type": "string",
                    "description": "Description of the element to drag to (required for drag)",
                },
                "text": {
                    "type": "string",
                    "description": "The text to type (required for type)",
                },
                "keys": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Key(s) to press (required for keypress)",
                },
                "button": {
                    "type": "string",
                    "enum": ["left", "right", "wheel", "back", "forward"],
                    "description": "The mouse button to use for click/double_click",
                },
                "scroll_x": {
                    "type": "integer",
                    "description": "Horizontal scroll amount (required for scroll)",
                },
                "scroll_y": {
                    "type": "integer",
                    "description": "Vertical scroll amount (required for scroll)",
                },
            },
            "required": ["action"],
        },
    },
}

@register_agent(r"moondream3\+.*", priority=2)
class Moondream3PlusConfig(AsyncAgentConfig):
    def __init__(self):
        self.desc2xy: Dict[str, Tuple[float, float]] = {}

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        # Parse composed model: moondream3+<thinking_model>
        if "+" not in model:
            raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}")
        _, thinking_model = model.split("+", 1)

        pre_output_items: List[Dict[str, Any]] = []

        # Acquire last screenshot; if missing, take one
        last_image_b64: Optional[str] = None
        for message in reversed(messages):
            if (
                isinstance(message, dict)
                and message.get("type") == "computer_call_output"
                and isinstance(message.get("output"), dict)
                and message["output"].get("type") == "input_image"
            ):
                image_url = message["output"].get("image_url", "")
                if image_url.startswith("data:image/png;base64,"):
                    last_image_b64 = image_url.split(",", 1)[1]
                    break

        if last_image_b64 is None and computer_handler is not None:
            # Take a screenshot
            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
            if screenshot_b64:
                call_id = uuid.uuid4().hex
                pre_output_items += [
                    {
                        "type": "message",
                        "role": "assistant",
                        "content": [
                            {"type": "output_text", "text": "Taking a screenshot to analyze the current screen."}
                        ],
                    },
                    {"type": "computer_call", "call_id": call_id, "status": "completed", "action": {"type": "screenshot"}},
                    {
                        "type": "computer_call_output",
                        "call_id": call_id,
                        "output": {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
                    },
                ]
                last_image_b64 = screenshot_b64
                if _on_screenshot:
                    await _on_screenshot(screenshot_b64)

        # If we have a last screenshot, run Moondream detection and labeling
        detected_names: List[str] = []
        if last_image_b64 is not None:
            base_img = _decode_image_b64(last_image_b64)
            model_md = get_moondream_model()
            annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md)
            if _on_screenshot:
                await _on_screenshot(annotated_b64, "annotated_form_ui")

            # Also push a user message listing all detected names
            if detected_names:
                names_text = "\n".join(f"- {n}" for n in detected_names)
                pre_output_items.append(
                    {
                        "type": "message",
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": "Detected form UI elements on screen:"},
                            {"type": "input_text", "text": names_text},
                            {"type": "input_text", "text": "Please continue with the next action needed to perform your task."}
                        ],
                    }
                )

        tool_schemas = []
        for schema in (tools or []):
            if schema.get("type") == "computer":
                tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
            else:
                tool_schemas.append(schema)

        # Step 1: Convert computer calls from xy to descriptions
        input_messages = messages + pre_output_items
        messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)

        # Step 2: Convert responses items to completion messages
        completion_messages = convert_responses_items_to_completion_messages(
            messages_with_descriptions,
            allow_images_in_tool_results=False,
        )

        # Optionally filter images if model lacks vision
        if not _supports_vision(thinking_model):
            completion_messages = _filter_images_from_completion_messages(completion_messages)

        # Step 3: Call thinking model with litellm.acompletion
        api_kwargs = {
            "model": thinking_model,
            "messages": completion_messages,
            "tools": tool_schemas,
            "max_retries": max_retries,
            "stream": stream,
            **kwargs,
        }
        if use_prompt_caching:
            api_kwargs["use_prompt_caching"] = use_prompt_caching

        if _on_api_start:
            await _on_api_start(api_kwargs)

        response = await litellm.acompletion(**api_kwargs)

        if _on_api_end:
            await _on_api_end(api_kwargs, response)

        usage = {
            **response.usage.model_dump(),  # type: ignore
            "response_cost": response._hidden_params.get("response_cost", 0.0),
        }
        if _on_usage:
            await _on_usage(usage)

        # Step 4: Convert completion messages back to responses items format
        response_dict = response.model_dump()  # type: ignore
        choice_messages = [choice["message"] for choice in response_dict["choices"]]
        thinking_output_items: List[Dict[str, Any]] = []
        for choice_message in choice_messages:
            thinking_output_items.extend(
                convert_completion_messages_to_responses_items([choice_message])
            )

        # Step 5: Use Moondream to get coordinates for each description
        element_descriptions = get_all_element_descriptions(thinking_output_items)
        if element_descriptions and last_image_b64:
            for desc in element_descriptions:
                for _ in range(3):  # try 3 times
                    coords = await self.predict_click(
                        model=model,
                        image_b64=last_image_b64,
                        instruction=desc,
                    )
                    if coords:
                        self.desc2xy[desc] = coords
                        break

        # Step 6: Convert computer calls from descriptions back to xy coordinates
        final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)

        # Step 7: Return output and usage
        return {"output": pre_output_items + final_output_items, "usage": usage}

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs,
    ) -> Optional[Tuple[float, float]]:
        """Predict click coordinates using Moondream3's point API.

        Returns pixel coordinates (x, y) as floats.
        """
        img = _decode_image_b64(image_b64)
        W, H = img.width, img.height
        model_md = get_moondream_model()
        try:
            result = model_md.point(img, instruction, settings={"max_objects": 1})
        except Exception:
            return None

        try:
            pt = (result or {}).get("points", [])[0]
            x_norm = float(pt.get("x", 0.0))
            y_norm = float(pt.get("y", 0.0))
            x_px = max(0.0, min(float(W - 1), x_norm * W))
            y_px = max(0.0, min(float(H - 1), y_norm * H))
            return (x_px, y_px)
        except Exception:
            return None

    def get_capabilities(self) -> List[AgentCapability]:
        return ["click", "step"]
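A short sketch of how the composed model string and the vision heuristic above fit together; the model names are examples only, not defaults shipped in the package:

    # Illustrative only: the composed model splits into a grounding half and a thinking half.
    model = "moondream3+gpt-4o"
    _, thinking_model = model.split("+", 1)    # thinking_model == "gpt-4o"
    _supports_vision("gpt-4o")                 # True  -> screenshots stay in the completion messages
    _supports_vision("mistral-small")          # False -> images are filtered out before litellm.acompletion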
agent/loops/openai.py
CHANGED
@@ -53,8 +53,7 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
 
     return openai_tools
 
-
-@register_agent(models=r".*computer-use-preview.*")
+@register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
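The effect of the tightened registration pattern can be seen with Python's re module; which matching function the agent registry actually uses is not shown in this diff, so re.fullmatch is assumed here purely for illustration:

    import re

    old_pattern = r".*computer-use-preview.*"
    new_pattern = r".*(^|/)computer-use-preview"

    re.fullmatch(old_pattern, "computer-use-preview+moondream3")  # matched before (too broad)
    re.fullmatch(new_pattern, "computer-use-preview+moondream3")  # no longer matches
    re.fullmatch(new_pattern, "openai/computer-use-preview")      # still matches
    re.fullmatch(new_pattern, "computer-use-preview")             # still matches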
|