cua-agent 0.4.31__py3-none-any.whl → 0.4.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.

@@ -0,0 +1,185 @@
+ """
+ InternVL agent loop implementation for click prediction using litellm.acompletion.
+
+ Implements the ScreenSpot InternVL grounding baseline behavior:
+ - Uses the exact grounding prompt format with <image> and <ref> tags
+ - Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]]
+ - Converts to pixel coordinates relative to the original screenshot size
+
+ Note: We do NOT manually load the InternVL model; acompletion (via HuggingFaceLocalAdapter)
+ will handle loading based on the provided model name.
+ """
+
+ from __future__ import annotations
+
+ import base64
+ import math
+ import re
+ from io import BytesIO
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from PIL import Image
+ import litellm
+
+ from ..decorators import register_agent
+ from .composed_grounded import ComposedGroundedConfig
+ from ..types import AgentCapability
+
+
+ # Regex patterns for extracting coordinates
+ # Accept optional whitespace and optional decimal fractions
+ _NUM = r"(\d+(?:\.\d+)?)"
+ _POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
+ _BBOX_PATTERN = re.compile(
+     r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
+ )
+
+
+ def _extract_first_point(text: str) -> Optional[Tuple[float, float]]:
+     """Extract the first [[x,y]] as normalized (0-1000) floats."""
+     m = _POINT_PATTERN.search(text)
+     if not m:
+         return None
+     try:
+         x = float(m.group(1))
+         y = float(m.group(2))
+         return x, y
+     except Exception:
+         return None
+
+
+ def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]:
+     """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats."""
+     matches = list(_BBOX_PATTERN.finditer(text))
+     if not matches:
+         return None
+     m = matches[-1]
+     try:
+         x1 = float(m.group(1))
+         y1 = float(m.group(2))
+         x2 = float(m.group(3))
+         y2 = float(m.group(4))
+         return x1, y1, x2, y2
+     except Exception:
+         return None
+
+
+ def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]:
+     """Scale 0-1000 normalized coordinates to pixel coordinates for given image size."""
+     x_px = int(math.floor((x_norm / 1000.0) * width))
+     y_px = int(math.floor((y_norm / 1000.0) * height))
+     # Clamp to image bounds just in case
+     x_px = max(0, min(width - 1, x_px))
+     y_px = max(0, min(height - 1, y_px))
+     return x_px, y_px
+
+
+ @register_agent(models=r"(?i).*InternVL.*")
+ class InternVLConfig(ComposedGroundedConfig):
+     """InternVL agent configuration reusing ComposedGroundedConfig for steps and
+     overriding predict_click to implement the ScreenSpot InternVL grounding baseline."""
+
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """Fallback to a self-composed model"""
+         return await super().predict_step(
+             messages=messages,
+             model=f"{model}+{model}",
+             tools=tools,
+             max_retries=max_retries,
+             stream=stream,
+             computer_handler=computer_handler,
+             _on_api_start=_on_api_start,
+             _on_api_end=_on_api_end,
+             _on_usage=_on_usage,
+             _on_screenshot=_on_screenshot,
+             **kwargs
+         )
+
+     async def predict_click(
+         self,
+         model: str,
+         image_b64: str,
+         instruction: str,
+         **kwargs
+     ) -> Optional[Tuple[int, int]]:
+         """
+         Predict click coordinates using InternVL via litellm.acompletion.
+
+         Behavior mirrors the ScreenSpot InternVL baseline:
+         - Prompt: "<image>\nPlease provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. Answer in the format of [[x1, y1, x2, y2]]"
+         - Parse either [[x,y]] point or [[x1,y1,x2,y2]] bbox, using bbox center if point missing
+         - Coordinates are 0-1000 normalized; convert to pixel coordinates for the original screenshot
+         """
+         try:
+             # Decode image dimensions to scale the normalized outputs
+             img_bytes = base64.b64decode(image_b64)
+             image = Image.open(BytesIO(img_bytes))
+             width, height = image.size
+         except Exception:
+             # If decoding fails, proceed with a safe default size to avoid crash
+             width, height = 1920, 1080
+
+         # Build grounding prompt exactly like the baseline
+         grounding_prompt = (
+             f"Please provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. "
+             f"Answer in the format of [[x1, y1, x2, y2]]"
+         )
+
+         # Prepare messages for LiteLLM
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                     },
+                     {"type": "text", "text": grounding_prompt},
+                 ],
+             }
+         ]
+
+         # Call acompletion; HuggingFaceLocalAdapter/model handler will handle InternVL loading
+         api_kwargs = {
+             "model": model,
+             "messages": messages,
+             # Conservative generation params akin to baseline (deterministic)
+             "max_tokens": kwargs.get("max_tokens", 256),
+             "temperature": kwargs.get("temperature", 0.0),
+         }
+
+         response = await litellm.acompletion(**api_kwargs)
+         output_text = (response.choices[0].message.content or "").strip()  # type: ignore
+
+         # print(f"InternVL output: {output_text}")
+
+         # Try to parse a point first; if absent, parse bbox and take center
+         point = _extract_first_point(output_text)
+         if point is None:
+             bbox = _extract_last_bbox(output_text)
+             if bbox is None:
+                 return None
+             x1, y1, x2, y2 = bbox
+             cx = (x1 + x2) / 2.0
+             cy = (y1 + y2) / 2.0
+             point = (cx, cy)
+
+         x_norm, y_norm = point
+         x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height)
+         return (x_px, y_px)
+
+     def get_capabilities(self) -> List[AgentCapability]:
+         return ["click", "step"]
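
The new InternVL module above resolves a click target in two steps: parse the model's text output for a [[x,y]] point or a [[x1,y1,x2,y2]] box, then scale the 0-1000 normalized result to screenshot pixels. Below is a small standalone sketch of that flow (not part of the package diff; the regexes are copied from the module, while the helper name to_pixels and the sample output string are invented for illustration):

import re

_NUM = r"(\d+(?:\.\d+)?)"
POINT = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
BBOX = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")


def to_pixels(text, width, height):
    # Prefer a [[x,y]] point; otherwise fall back to the center of the last [[x1,y1,x2,y2]] box.
    m = POINT.search(text)
    if m:
        x, y = float(m.group(1)), float(m.group(2))
    else:
        boxes = list(BBOX.finditer(text))
        if not boxes:
            return None
        b = boxes[-1]
        x = (float(b.group(1)) + float(b.group(3))) / 2.0
        y = (float(b.group(2)) + float(b.group(4))) / 2.0
    # Coordinates are on a 0-1000 grid, so scale to the screenshot size and clamp to bounds.
    return (
        max(0, min(width - 1, int(x / 1000.0 * width))),
        max(0, min(height - 1, int(y / 1000.0 * height))),
    )


print(to_pixels("The element is at [[100, 200, 300, 400]]", 1920, 1080))  # (384, 324)

Running this prints (384, 324): the bbox center (200, 300) on the 0-1000 grid mapped onto a 1920x1080 screenshot.
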
@@ -0,0 +1,464 @@
+ """
+ Moondream3+ composed-grounded agent loop implementation.
+ Grounding is handled by a local Moondream3 preview model via Transformers.
+ Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>".
+
+ Differences from composed_grounded:
+ - Provides a singleton Moondream3 client outside the class.
+ - predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates.
+ - If the last image was a screenshot (or we take one), run model.detect(image, "all form ui") to get bboxes, then
+   run model.caption on each cropped bbox to label it. Overlay labels on the screenshot and emit via _on_screenshot.
+ - Add a user message listing all detected form UI names so the thinker can reference them.
+ - If the thinking model doesn't support vision, filter out image content before calling litellm.
+ """
+
+ from __future__ import annotations
+
+ import uuid
+ import base64
+ import io
+ from typing import Dict, List, Any, Optional, Tuple
+
+ from PIL import Image, ImageDraw, ImageFont
+ import torch
+ from transformers import AutoModelForCausalLM
+ import litellm
+
+ from ..decorators import register_agent
+ from ..types import AgentCapability
+ from ..loops.base import AsyncAgentConfig
+ from ..responses import (
+     convert_computer_calls_xy2desc,
+     convert_responses_items_to_completion_messages,
+     convert_completion_messages_to_responses_items,
+     convert_computer_calls_desc2xy,
+     get_all_element_descriptions,
+ )
+
+ _MOONDREAM_SINGLETON = None
+
+ def get_moondream_model() -> Any:
+     """Get a singleton instance of the Moondream3 preview model."""
+     global _MOONDREAM_SINGLETON
+     if _MOONDREAM_SINGLETON is None:
+         _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
+             "moondream/moondream3-preview",
+             trust_remote_code=True,
+             torch_dtype=torch.bfloat16,
+             device_map="cuda",
+         )
+     return _MOONDREAM_SINGLETON
+
+
+ def _decode_image_b64(image_b64: str) -> Image.Image:
+     data = base64.b64decode(image_b64)
+     return Image.open(io.BytesIO(data)).convert("RGB")
+
+
+ def _image_to_b64(img: Image.Image) -> str:
+     buf = io.BytesIO()
+     img.save(buf, format="PNG")
+     return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+
+ def _supports_vision(model: str) -> bool:
+     """Heuristic vision support detection for thinking model."""
+     m = model.lower()
+     vision_markers = [
+         "gpt-4o",
+         "gpt-4.1",
+         "o1",
+         "o3",
+         "claude-3",
+         "claude-3.5",
+         "sonnet",
+         "haiku",
+         "opus",
+         "gemini-1.5",
+         "llava",
+     ]
+     return any(v in m for v in vision_markers)
+
+
+ def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     filtered: List[Dict[str, Any]] = []
+     for msg in messages:
+         msg_copy = {**msg}
+         content = msg_copy.get("content")
+         if isinstance(content, list):
+             msg_copy["content"] = [c for c in content if c.get("type") != "image_url"]
+         filtered.append(msg_copy)
+     return filtered
+
+ def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
+     """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
+
+     Args:
+         base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally.
+         model_md: Moondream model instance with .detect() and .query() methods.
+
+     Returns:
+         A tuple of (annotated_image_base64_png, detected_names)
+     """
+     # Ensure RGBA for semi-transparent fills
+     if base_img.mode != "RGBA":
+         base_img = base_img.convert("RGBA")
+     W, H = base_img.width, base_img.height
+
+     # Detect objects
+     try:
+         detect_result = model_md.detect(base_img, "all ui elements")
+         objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else []
+     except Exception:
+         objects = []
+
+     draw = ImageDraw.Draw(base_img)
+     try:
+         font = ImageFont.load_default()
+     except Exception:
+         font = None
+
+     detected_names: List[str] = []
+
+     for i, obj in enumerate(objects):
+         try:
+             # Clamp normalized coords and crop
+             x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0))))
+             y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
+             x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
+             y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
+             left, top, right, bottom = int(x_min * W), int(y_min * H), int(x_max * W), int(y_max * H)
+             left, top = max(0, left), max(0, top)
+             right, bottom = min(W - 1, right), min(H - 1, bottom)
+             crop = base_img.crop((left, top, right, bottom))
+
+             # Prompted short caption
+             try:
+                 result = model_md.query(crop, "Caption this UI element in few words.")
+                 caption_text = (result or {}).get("answer", "")
+             except Exception:
+                 caption_text = ""
+
+             name = (caption_text or "").strip() or f"element_{i+1}"
+             detected_names.append(name)
+
+             # Draw bbox
+             draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2)
+
+             # Label background with padding and rounded corners
+             label = f"{i+1}. {name}"
+             padding = 3
+             if font:
+                 text_bbox = draw.textbbox((0, 0), label, font=font)
+             else:
+                 text_bbox = draw.textbbox((0, 0), label)
+             text_w = text_bbox[2] - text_bbox[0]
+             text_h = text_bbox[3] - text_bbox[1]
+
+             tx = left + 3
+             ty = top - (text_h + 2 * padding + 4)
+             if ty < 0:
+                 ty = top + 3
+
+             bg_left = tx - padding
+             bg_top = ty - padding
+             bg_right = tx + text_w + padding
+             bg_bottom = ty + text_h + padding
+             try:
+                 draw.rounded_rectangle(
+                     [bg_left, bg_top, bg_right, bg_bottom],
+                     radius=4,
+                     fill=(0, 0, 0, 160),
+                     outline=(255, 215, 0, 200),
+                     width=1,
+                 )
+             except Exception:
+                 draw.rectangle(
+                     [bg_left, bg_top, bg_right, bg_bottom],
+                     fill=(0, 0, 0, 160),
+                     outline=(255, 215, 0, 200),
+                     width=1,
+                 )
+
+             text_fill = (255, 255, 255, 255)
+             if font:
+                 draw.text((tx, ty), label, fill=text_fill, font=font)
+             else:
+                 draw.text((tx, ty), label, fill=text_fill)
+         except Exception:
+             continue
+
+     # Encode PNG base64
+     annotated = base_img
+     if annotated.mode not in ("RGBA", "RGB"):
+         annotated = annotated.convert("RGBA")
+     annotated_b64 = _image_to_b64(annotated)
+     return annotated_b64, detected_names
+
+ GROUNDED_COMPUTER_TOOL_SCHEMA = {
+     "type": "function",
+     "function": {
+         "name": "computer",
+         "description": (
+             "Control a computer by taking screenshots and interacting with UI elements. "
+             "The screenshot action will include a list of detected form UI element names when available. "
+             "Use element descriptions to locate and interact with UI elements on the screen."
+         ),
+         "parameters": {
+             "type": "object",
+             "properties": {
+                 "action": {
+                     "type": "string",
+                     "enum": [
+                         "screenshot",
+                         "click",
+                         "double_click",
+                         "drag",
+                         "type",
+                         "keypress",
+                         "scroll",
+                         "move",
+                         "wait",
+                         "get_current_url",
+                         "get_dimensions",
+                         "get_environment",
+                     ],
+                     "description": "The action to perform (required for all actions)",
+                 },
+                 "element_description": {
+                     "type": "string",
+                     "description": "Description of the element to interact with (required for click/double_click/move/scroll)",
+                 },
+                 "start_element_description": {
+                     "type": "string",
+                     "description": "Description of the element to start dragging from (required for drag)",
+                 },
+                 "end_element_description": {
+                     "type": "string",
+                     "description": "Description of the element to drag to (required for drag)",
+                 },
+                 "text": {
+                     "type": "string",
+                     "description": "The text to type (required for type)",
+                 },
+                 "keys": {
+                     "type": "array",
+                     "items": {"type": "string"},
+                     "description": "Key(s) to press (required for keypress)",
+                 },
+                 "button": {
+                     "type": "string",
+                     "enum": ["left", "right", "wheel", "back", "forward"],
+                     "description": "The mouse button to use for click/double_click",
+                 },
+                 "scroll_x": {
+                     "type": "integer",
+                     "description": "Horizontal scroll amount (required for scroll)",
+                 },
+                 "scroll_y": {
+                     "type": "integer",
+                     "description": "Vertical scroll amount (required for scroll)",
+                 },
+             },
+             "required": ["action"],
+         },
+     },
+ }
+
+ @register_agent(r"moondream3\+.*", priority=2)
+ class Moondream3PlusConfig(AsyncAgentConfig):
+     def __init__(self):
+         self.desc2xy: Dict[str, Tuple[float, float]] = {}
+
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         use_prompt_caching: Optional[bool] = False,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
+         **kwargs,
+     ) -> Dict[str, Any]:
+         # Parse composed model: moondream3+<thinking_model>
+         if "+" not in model:
+             raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}")
+         _, thinking_model = model.split("+", 1)
+
+         pre_output_items: List[Dict[str, Any]] = []
+
+         # Acquire last screenshot; if missing, take one
+         last_image_b64: Optional[str] = None
+         for message in reversed(messages):
+             if (
+                 isinstance(message, dict)
+                 and message.get("type") == "computer_call_output"
+                 and isinstance(message.get("output"), dict)
+                 and message["output"].get("type") == "input_image"
+             ):
+                 image_url = message["output"].get("image_url", "")
+                 if image_url.startswith("data:image/png;base64,"):
+                     last_image_b64 = image_url.split(",", 1)[1]
+                     break
+
+         if last_image_b64 is None and computer_handler is not None:
+             # Take a screenshot
+             screenshot_b64 = await computer_handler.screenshot()  # type: ignore
+             if screenshot_b64:
+                 call_id = uuid.uuid4().hex
+                 pre_output_items += [
+                     {
+                         "type": "message",
+                         "role": "assistant",
+                         "content": [
+                             {"type": "output_text", "text": "Taking a screenshot to analyze the current screen."}
+                         ],
+                     },
+                     {"type": "computer_call", "call_id": call_id, "status": "completed", "action": {"type": "screenshot"}},
+                     {
+                         "type": "computer_call_output",
+                         "call_id": call_id,
+                         "output": {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
+                     },
+                 ]
+                 last_image_b64 = screenshot_b64
+                 if _on_screenshot:
+                     await _on_screenshot(screenshot_b64)
+
+         # If we have a last screenshot, run Moondream detection and labeling
+         detected_names: List[str] = []
+         if last_image_b64 is not None:
+             base_img = _decode_image_b64(last_image_b64)
+             model_md = get_moondream_model()
+             annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md)
+             if _on_screenshot:
+                 await _on_screenshot(annotated_b64, "annotated_form_ui")
+
+             # Also push a user message listing all detected names
+             if detected_names:
+                 names_text = "\n".join(f"- {n}" for n in detected_names)
+                 pre_output_items.append(
+                     {
+                         "type": "message",
+                         "role": "user",
+                         "content": [
+                             {"type": "input_text", "text": "Detected form UI elements on screen:"},
+                             {"type": "input_text", "text": names_text},
+                             {"type": "input_text", "text": "Please continue with the next action needed to perform your task."}
+                         ],
+                     }
+                 )
+
+         tool_schemas = []
+         for schema in (tools or []):
+             if schema.get("type") == "computer":
+                 tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
+             else:
+                 tool_schemas.append(schema)
+
+         # Step 1: Convert computer calls from xy to descriptions
+         input_messages = messages + pre_output_items
+         messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
+
+         # Step 2: Convert responses items to completion messages
+         completion_messages = convert_responses_items_to_completion_messages(
+             messages_with_descriptions,
+             allow_images_in_tool_results=False,
+         )
+
+         # Optionally filter images if model lacks vision
+         if not _supports_vision(thinking_model):
+             completion_messages = _filter_images_from_completion_messages(completion_messages)
+
+         # Step 3: Call thinking model with litellm.acompletion
+         api_kwargs = {
+             "model": thinking_model,
+             "messages": completion_messages,
+             "tools": tool_schemas,
+             "max_retries": max_retries,
+             "stream": stream,
+             **kwargs,
+         }
+         if use_prompt_caching:
+             api_kwargs["use_prompt_caching"] = use_prompt_caching
+
+         if _on_api_start:
+             await _on_api_start(api_kwargs)
+
+         response = await litellm.acompletion(**api_kwargs)
+
+         if _on_api_end:
+             await _on_api_end(api_kwargs, response)
+
+         usage = {
+             **response.usage.model_dump(),  # type: ignore
+             "response_cost": response._hidden_params.get("response_cost", 0.0),
+         }
+         if _on_usage:
+             await _on_usage(usage)
+
+         # Step 4: Convert completion messages back to responses items format
+         response_dict = response.model_dump()  # type: ignore
+         choice_messages = [choice["message"] for choice in response_dict["choices"]]
+         thinking_output_items: List[Dict[str, Any]] = []
+         for choice_message in choice_messages:
+             thinking_output_items.extend(
+                 convert_completion_messages_to_responses_items([choice_message])
+             )
+
+         # Step 5: Use Moondream to get coordinates for each description
+         element_descriptions = get_all_element_descriptions(thinking_output_items)
+         if element_descriptions and last_image_b64:
+             for desc in element_descriptions:
+                 for _ in range(3):  # try 3 times
+                     coords = await self.predict_click(
+                         model=model,
+                         image_b64=last_image_b64,
+                         instruction=desc,
+                     )
+                     if coords:
+                         self.desc2xy[desc] = coords
+                         break
+
+         # Step 6: Convert computer calls from descriptions back to xy coordinates
+         final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
+
+         # Step 7: Return output and usage
+         return {"output": pre_output_items + final_output_items, "usage": usage}
+
+     async def predict_click(
+         self,
+         model: str,
+         image_b64: str,
+         instruction: str,
+         **kwargs,
+     ) -> Optional[Tuple[float, float]]:
+         """Predict click coordinates using Moondream3's point API.
+
+         Returns pixel coordinates (x, y) as floats.
+         """
+         img = _decode_image_b64(image_b64)
+         W, H = img.width, img.height
+         model_md = get_moondream_model()
+         try:
+             result = model_md.point(img, instruction, settings={"max_objects": 1})
+         except Exception:
+             return None
+
+         try:
+             pt = (result or {}).get("points", [])[0]
+             x_norm = float(pt.get("x", 0.0))
+             y_norm = float(pt.get("y", 0.0))
+             x_px = max(0.0, min(float(W - 1), x_norm * W))
+             y_px = max(0.0, min(float(H - 1), y_norm * H))
+             return (x_px, y_px)
+         except Exception:
+             return None
+
+     def get_capabilities(self) -> List[AgentCapability]:
+         return ["click", "step"]
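
For reference, predict_click above assumes model.point returns a dict shaped like {"points": [{"x": ..., "y": ...}]} with coordinates normalized to 0-1, and clamps the scaled result to the image bounds. A minimal standalone sketch of that conversion (not part of the package diff; the function name and sample values are invented for illustration):

def point_to_pixels(result, width, height):
    # Take the first predicted point and scale its 0-1 normalized coords to pixels, clamped to bounds.
    try:
        pt = (result or {}).get("points", [])[0]
        x = max(0.0, min(float(width - 1), float(pt.get("x", 0.0)) * width))
        y = max(0.0, min(float(height - 1), float(pt.get("y", 0.0)) * height))
        return (x, y)
    except (IndexError, AttributeError, TypeError, ValueError):
        return None


print(point_to_pixels({"points": [{"x": 0.25, "y": 0.5}]}, 1920, 1080))  # (480.0, 540.0)

With a 1920x1080 screenshot this prints (480.0, 540.0).
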
agent/loops/openai.py CHANGED
@@ -53,8 +53,7 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
  
      return openai_tools
  
- 
- @register_agent(models=r".*computer-use-preview.*")
+ @register_agent(models=r".*(^|/)computer-use-preview")
  class OpenAIComputerUseConfig:
      """
      OpenAI computer-use-preview agent configuration using liteLLM responses.
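
The only change to agent/loops/openai.py tightens the registration pattern: the old regex matched any model string merely containing "computer-use-preview", while the new one requires the name to appear at the start of the string or immediately after a provider prefix ending in "/". A quick check with re.search (model names invented for the comparison; how the registry anchors the pattern is not shown in this diff):

import re

old_pattern = re.compile(r".*computer-use-preview.*")
new_pattern = re.compile(r".*(^|/)computer-use-preview")

for name in ("computer-use-preview",
             "openai/computer-use-preview",
             "my-computer-use-preview-fork"):
    # Print whether each pattern finds a match in the candidate model name.
    print(name, bool(old_pattern.search(name)), bool(new_pattern.search(name)))
# computer-use-preview          True True
# openai/computer-use-preview   True True
# my-computer-use-preview-fork  True False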