cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
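The largest addition is the new Moondream3 composed-grounded loop; the 493-line hunk below is the full content of the added agent/loops/moondream3.py.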
@@ -0,0 +1,493 @@
+"""
+Moondream3+ composed-grounded agent loop implementation.
+Grounding is handled by a local Moondream3 preview model via Transformers.
+Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>".
+
+Differences from composed_grounded:
+- Provides a singleton Moondream3 client outside the class.
+- predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates.
+- If the last image was a screenshot (or we take one), run model.detect(image, "all form ui") to get bboxes, then
+  run model.caption on each cropped bbox to label it. Overlay labels on the screenshot and emit via _on_screenshot.
+- Add a user message listing all detected form UI names so the thinker can reference them.
+- If the thinking model doesn't support vision, filter out image content before calling litellm.
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+import uuid
+from typing import Any, Dict, List, Optional, Tuple
+
+import litellm
+from PIL import Image, ImageDraw, ImageFont
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..responses import (
+    convert_completion_messages_to_responses_items,
+    convert_computer_calls_desc2xy,
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
+    get_all_element_descriptions,
+)
+from ..types import AgentCapability
+
+_MOONDREAM_SINGLETON = None
+
+
+def get_moondream_model() -> Any:
+    """Get a singleton instance of the Moondream3 preview model."""
+    global _MOONDREAM_SINGLETON
+    if _MOONDREAM_SINGLETON is None:
+        try:
+            import torch
+            from transformers import AutoModelForCausalLM
+
+            _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
+                "moondream/moondream3-preview",
+                trust_remote_code=True,
+                torch_dtype=torch.bfloat16,
+                device_map="cuda",
+            )
+        except ImportError as e:
+            raise RuntimeError(
+                "moondream3 requires torch and transformers. Install with: pip install cua-agent[moondream3]"
+            ) from e
+    return _MOONDREAM_SINGLETON
+
+
+def _decode_image_b64(image_b64: str) -> Image.Image:
+    data = base64.b64decode(image_b64)
+    return Image.open(io.BytesIO(data)).convert("RGB")
+
+
+def _image_to_b64(img: Image.Image) -> str:
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+
+def _supports_vision(model: str) -> bool:
+    """Heuristic vision support detection for thinking model."""
+    m = model.lower()
+    vision_markers = [
+        "gpt-4o",
+        "gpt-4.1",
+        "o1",
+        "o3",
+        "claude-3",
+        "claude-3.5",
+        "sonnet",
+        "haiku",
+        "opus",
+        "gemini-1.5",
+        "llava",
+    ]
+    return any(v in m for v in vision_markers)
+
+
+def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    filtered: List[Dict[str, Any]] = []
+    for msg in messages:
+        msg_copy = {**msg}
+        content = msg_copy.get("content")
+        if isinstance(content, list):
+            msg_copy["content"] = [c for c in content if c.get("type") != "image_url"]
+        filtered.append(msg_copy)
+    return filtered
+
+
+def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
+    """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
+
+    Args:
+        base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally.
+        model_md: Moondream model instance with .detect() and .query() methods.
+
+    Returns:
+        A tuple of (annotated_image_base64_png, detected_names)
+    """
+    # Ensure RGBA for semi-transparent fills
+    if base_img.mode != "RGBA":
+        base_img = base_img.convert("RGBA")
+    W, H = base_img.width, base_img.height
+
+    # Detect objects
+    try:
+        detect_result = model_md.detect(base_img, "all ui elements")
+        objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else []
+    except Exception:
+        objects = []
+
+    draw = ImageDraw.Draw(base_img)
+    try:
+        font = ImageFont.load_default()
+    except Exception:
+        font = None
+
+    detected_names: List[str] = []
+
+    for i, obj in enumerate(objects):
+        try:
+            # Clamp normalized coords and crop
+            x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0))))
+            y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
+            x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
+            y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
+            left, top, right, bottom = (
+                int(x_min * W),
+                int(y_min * H),
+                int(x_max * W),
+                int(y_max * H),
+            )
+            left, top = max(0, left), max(0, top)
+            right, bottom = min(W - 1, right), min(H - 1, bottom)
+            crop = base_img.crop((left, top, right, bottom))
+
+            # Prompted short caption
+            try:
+                result = model_md.query(crop, "Caption this UI element in few words.")
+                caption_text = (result or {}).get("answer", "")
+            except Exception:
+                caption_text = ""
+
+            name = (caption_text or "").strip() or f"element_{i+1}"
+            detected_names.append(name)
+
+            # Draw bbox
+            draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2)
+
+            # Label background with padding and rounded corners
+            label = f"{i+1}. {name}"
+            padding = 3
+            if font:
+                text_bbox = draw.textbbox((0, 0), label, font=font)
+            else:
+                text_bbox = draw.textbbox((0, 0), label)
+            text_w = text_bbox[2] - text_bbox[0]
+            text_h = text_bbox[3] - text_bbox[1]
+
+            tx = left + 3
+            ty = top - (text_h + 2 * padding + 4)
+            if ty < 0:
+                ty = top + 3
+
+            bg_left = tx - padding
+            bg_top = ty - padding
+            bg_right = tx + text_w + padding
+            bg_bottom = ty + text_h + padding
+            try:
+                draw.rounded_rectangle(
+                    [bg_left, bg_top, bg_right, bg_bottom],
+                    radius=4,
+                    fill=(0, 0, 0, 160),
+                    outline=(255, 215, 0, 200),
+                    width=1,
+                )
+            except Exception:
+                draw.rectangle(
+                    [bg_left, bg_top, bg_right, bg_bottom],
+                    fill=(0, 0, 0, 160),
+                    outline=(255, 215, 0, 200),
+                    width=1,
+                )
+
+            text_fill = (255, 255, 255, 255)
+            if font:
+                draw.text((tx, ty), label, fill=text_fill, font=font)
+            else:
+                draw.text((tx, ty), label, fill=text_fill)
+        except Exception:
+            continue
+
+    # Encode PNG base64
+    annotated = base_img
+    if annotated.mode not in ("RGBA", "RGB"):
+        annotated = annotated.convert("RGBA")
+    annotated_b64 = _image_to_b64(annotated)
+    return annotated_b64, detected_names
+
+
+GROUNDED_COMPUTER_TOOL_SCHEMA = {
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": (
+            "Control a computer by taking screenshots and interacting with UI elements. "
+            "The screenshot action will include a list of detected form UI element names when available. "
+            "Use element descriptions to locate and interact with UI elements on the screen."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform (required for all actions)",
+                },
+                "element_description": {
+                    "type": "string",
+                    "description": "Description of the element to interact with (required for click/double_click/move/scroll)",
+                },
+                "start_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to start dragging from (required for drag)",
+                },
+                "end_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to drag to (required for drag)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type)",
+                },
+                "keys": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Key(s) to press (required for keypress)",
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "right", "wheel", "back", "forward"],
+                    "description": "The mouse button to use for click/double_click",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount (required for scroll)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount (required for scroll)",
+                },
+            },
+            "required": ["action"],
+        },
+    },
+}
+
+
+@register_agent(r"moondream3\+.*", priority=2)
+class Moondream3PlusConfig(AsyncAgentConfig):
+    def __init__(self):
+        self.desc2xy: Dict[str, Tuple[float, float]] = {}
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        # Parse composed model: moondream3+<thinking_model>
+        if "+" not in model:
+            raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}")
+        _, thinking_model = model.split("+", 1)
+
+        pre_output_items: List[Dict[str, Any]] = []
+
+        # Acquire last screenshot; if missing, take one
+        last_image_b64: Optional[str] = None
+        for message in reversed(messages):
+            if (
+                isinstance(message, dict)
+                and message.get("type") == "computer_call_output"
+                and isinstance(message.get("output"), dict)
+                and message["output"].get("type") == "input_image"
+            ):
+                image_url = message["output"].get("image_url", "")
+                if image_url.startswith("data:image/png;base64,"):
+                    last_image_b64 = image_url.split(",", 1)[1]
+                    break
+
+        if last_image_b64 is None and computer_handler is not None:
+            # Take a screenshot
+            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
+            if screenshot_b64:
+                call_id = uuid.uuid4().hex
+                pre_output_items += [
+                    {
+                        "type": "message",
+                        "role": "assistant",
+                        "content": [
+                            {
+                                "type": "output_text",
+                                "text": "Taking a screenshot to analyze the current screen.",
+                            }
+                        ],
+                    },
+                    {
+                        "type": "computer_call",
+                        "call_id": call_id,
+                        "status": "completed",
+                        "action": {"type": "screenshot"},
+                    },
+                    {
+                        "type": "computer_call_output",
+                        "call_id": call_id,
+                        "output": {
+                            "type": "input_image",
+                            "image_url": f"data:image/png;base64,{screenshot_b64}",
+                        },
+                    },
+                ]
+                last_image_b64 = screenshot_b64
+                if _on_screenshot:
+                    await _on_screenshot(screenshot_b64)
+
+        # If we have a last screenshot, run Moondream detection and labeling
+        detected_names: List[str] = []
+        if last_image_b64 is not None:
+            base_img = _decode_image_b64(last_image_b64)
+            model_md = get_moondream_model()
+            annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md)
+            if _on_screenshot:
+                await _on_screenshot(annotated_b64, "annotated_form_ui")
+
+            # Also push a user message listing all detected names
+            if detected_names:
+                names_text = "\n".join(f"- {n}" for n in detected_names)
+                pre_output_items.append(
+                    {
+                        "type": "message",
+                        "role": "user",
+                        "content": [
+                            {"type": "input_text", "text": "Detected form UI elements on screen:"},
+                            {"type": "input_text", "text": names_text},
+                            {
+                                "type": "input_text",
+                                "text": "Please continue with the next action needed to perform your task.",
+                            },
+                        ],
+                    }
+                )
+
+        tool_schemas = []
+        for schema in tools or []:
+            if schema.get("type") == "computer":
+                tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
+            else:
+                tool_schemas.append(schema)
+
+        # Step 1: Convert computer calls from xy to descriptions
+        input_messages = messages + pre_output_items
+        messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
+
+        # Step 2: Convert responses items to completion messages
+        completion_messages = convert_responses_items_to_completion_messages(
+            messages_with_descriptions,
+            allow_images_in_tool_results=False,
+        )
+
+        # Optionally filter images if model lacks vision
+        if not _supports_vision(thinking_model):
+            completion_messages = _filter_images_from_completion_messages(completion_messages)
+
+        # Step 3: Call thinking model with litellm.acompletion
+        api_kwargs = {
+            "model": thinking_model,
+            "messages": completion_messages,
+            "tools": tool_schemas,
+            "max_retries": max_retries,
+            "stream": stream,
+            **kwargs,
+        }
+        if use_prompt_caching:
+            api_kwargs["use_prompt_caching"] = use_prompt_caching
+
+        if _on_api_start:
+            await _on_api_start(api_kwargs)
+
+        response = await litellm.acompletion(**api_kwargs)
+
+        if _on_api_end:
+            await _on_api_end(api_kwargs, response)
+
+        usage = {
+            **response.usage.model_dump(),  # type: ignore
+            "response_cost": response._hidden_params.get("response_cost", 0.0),
+        }
+        if _on_usage:
+            await _on_usage(usage)
+
+        # Step 4: Convert completion messages back to responses items format
+        response_dict = response.model_dump()  # type: ignore
+        choice_messages = [choice["message"] for choice in response_dict["choices"]]
+        thinking_output_items: List[Dict[str, Any]] = []
+        for choice_message in choice_messages:
+            thinking_output_items.extend(
+                convert_completion_messages_to_responses_items([choice_message])
+            )
+
+        # Step 5: Use Moondream to get coordinates for each description
+        element_descriptions = get_all_element_descriptions(thinking_output_items)
+        if element_descriptions and last_image_b64:
+            for desc in element_descriptions:
+                for _ in range(3):  # try 3 times
+                    coords = await self.predict_click(
+                        model=model,
+                        image_b64=last_image_b64,
+                        instruction=desc,
+                    )
+                    if coords:
+                        self.desc2xy[desc] = coords
+                        break
+
+        # Step 6: Convert computer calls from descriptions back to xy coordinates
+        final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
+
+        # Step 7: Return output and usage
+        return {"output": pre_output_items + final_output_items, "usage": usage}
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs,
+    ) -> Optional[Tuple[float, float]]:
+        """Predict click coordinates using Moondream3's point API.
+
+        Returns pixel coordinates (x, y) as floats.
+        """
+        img = _decode_image_b64(image_b64)
+        W, H = img.width, img.height
+        model_md = get_moondream_model()
+        try:
+            result = model_md.point(img, instruction, settings={"max_objects": 1})
+        except Exception:
+            return None
+
+        try:
+            pt = (result or {}).get("points", [])[0]
+            x_norm = float(pt.get("x", 0.0))
+            y_norm = float(pt.get("y", 0.0))
+            x_px = max(0.0, min(float(W - 1), x_norm * W))
+            y_px = max(0.0, min(float(H - 1), y_norm * H))
+            return (x_px, y_px)
+        except Exception:
+            return None
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click", "step"]