cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,493 @@
+"""
+Moondream3+ composed-grounded agent loop implementation.
+Grounding is handled by a local Moondream3 preview model via Transformers.
+Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>".
+
+Differences from composed_grounded:
+- Provides a singleton Moondream3 client outside the class.
+- predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates.
+- If the last image was a screenshot (or we take one), run model.detect(image, "all form ui") to get bboxes, then
+  run model.caption on each cropped bbox to label it. Overlay labels on the screenshot and emit via _on_screenshot.
+- Add a user message listing all detected form UI names so the thinker can reference them.
+- If the thinking model doesn't support vision, filter out image content before calling litellm.
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+import uuid
+from typing import Any, Dict, List, Optional, Tuple
+
+import litellm
+from PIL import Image, ImageDraw, ImageFont
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..responses import (
+    convert_completion_messages_to_responses_items,
+    convert_computer_calls_desc2xy,
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
+    get_all_element_descriptions,
+)
+from ..types import AgentCapability
+
+_MOONDREAM_SINGLETON = None
+
+
+def get_moondream_model() -> Any:
+    """Get a singleton instance of the Moondream3 preview model."""
+    global _MOONDREAM_SINGLETON
+    if _MOONDREAM_SINGLETON is None:
+        try:
+            import torch
+            from transformers import AutoModelForCausalLM
+
+            _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
+                "moondream/moondream3-preview",
+                trust_remote_code=True,
+                torch_dtype=torch.bfloat16,
+                device_map="cuda",
+            )
+        except ImportError as e:
+            raise RuntimeError(
+                "moondream3 requires torch and transformers. Install with: pip install cua-agent[moondream3]"
+            ) from e
+    return _MOONDREAM_SINGLETON
+
+
+def _decode_image_b64(image_b64: str) -> Image.Image:
+    data = base64.b64decode(image_b64)
+    return Image.open(io.BytesIO(data)).convert("RGB")
+
+
+def _image_to_b64(img: Image.Image) -> str:
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+
+def _supports_vision(model: str) -> bool:
+    """Heuristic vision support detection for thinking model."""
+    m = model.lower()
+    vision_markers = [
+        "gpt-4o",
+        "gpt-4.1",
+        "o1",
+        "o3",
+        "claude-3",
+        "claude-3.5",
+        "sonnet",
+        "haiku",
+        "opus",
+        "gemini-1.5",
+        "llava",
+    ]
+    return any(v in m for v in vision_markers)
+
+
+def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    filtered: List[Dict[str, Any]] = []
+    for msg in messages:
+        msg_copy = {**msg}
+        content = msg_copy.get("content")
+        if isinstance(content, list):
+            msg_copy["content"] = [c for c in content if c.get("type") != "image_url"]
+        filtered.append(msg_copy)
+    return filtered
+
+
+def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
+    """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
+
+    Args:
+        base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally.
+        model_md: Moondream model instance with .detect() and .query() methods.
+
+    Returns:
+        A tuple of (annotated_image_base64_png, detected_names)
+    """
+    # Ensure RGBA for semi-transparent fills
+    if base_img.mode != "RGBA":
+        base_img = base_img.convert("RGBA")
+    W, H = base_img.width, base_img.height
+
+    # Detect objects
+    try:
+        detect_result = model_md.detect(base_img, "all ui elements")
+        objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else []
+    except Exception:
+        objects = []
+
+    draw = ImageDraw.Draw(base_img)
+    try:
+        font = ImageFont.load_default()
+    except Exception:
+        font = None
+
+    detected_names: List[str] = []
+
+    for i, obj in enumerate(objects):
+        try:
+            # Clamp normalized coords and crop
+            x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0))))
+            y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
+            x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
+            y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
+            left, top, right, bottom = (
+                int(x_min * W),
+                int(y_min * H),
+                int(x_max * W),
+                int(y_max * H),
+            )
+            left, top = max(0, left), max(0, top)
+            right, bottom = min(W - 1, right), min(H - 1, bottom)
+            crop = base_img.crop((left, top, right, bottom))
+
+            # Prompted short caption
+            try:
+                result = model_md.query(crop, "Caption this UI element in few words.")
+                caption_text = (result or {}).get("answer", "")
+            except Exception:
+                caption_text = ""
+
+            name = (caption_text or "").strip() or f"element_{i+1}"
+            detected_names.append(name)
+
+            # Draw bbox
+            draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2)
+
+            # Label background with padding and rounded corners
+            label = f"{i+1}. {name}"
+            padding = 3
+            if font:
+                text_bbox = draw.textbbox((0, 0), label, font=font)
+            else:
+                text_bbox = draw.textbbox((0, 0), label)
+            text_w = text_bbox[2] - text_bbox[0]
+            text_h = text_bbox[3] - text_bbox[1]
+
+            tx = left + 3
+            ty = top - (text_h + 2 * padding + 4)
+            if ty < 0:
+                ty = top + 3
+
+            bg_left = tx - padding
+            bg_top = ty - padding
+            bg_right = tx + text_w + padding
+            bg_bottom = ty + text_h + padding
+            try:
+                draw.rounded_rectangle(
+                    [bg_left, bg_top, bg_right, bg_bottom],
+                    radius=4,
+                    fill=(0, 0, 0, 160),
+                    outline=(255, 215, 0, 200),
+                    width=1,
+                )
+            except Exception:
+                draw.rectangle(
+                    [bg_left, bg_top, bg_right, bg_bottom],
+                    fill=(0, 0, 0, 160),
+                    outline=(255, 215, 0, 200),
+                    width=1,
+                )
+
+            text_fill = (255, 255, 255, 255)
+            if font:
+                draw.text((tx, ty), label, fill=text_fill, font=font)
+            else:
+                draw.text((tx, ty), label, fill=text_fill)
+        except Exception:
+            continue
+
+    # Encode PNG base64
+    annotated = base_img
+    if annotated.mode not in ("RGBA", "RGB"):
+        annotated = annotated.convert("RGBA")
+    annotated_b64 = _image_to_b64(annotated)
+    return annotated_b64, detected_names
+
+
+GROUNDED_COMPUTER_TOOL_SCHEMA = {
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": (
+            "Control a computer by taking screenshots and interacting with UI elements. "
+            "The screenshot action will include a list of detected form UI element names when available. "
+            "Use element descriptions to locate and interact with UI elements on the screen."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform (required for all actions)",
+                },
+                "element_description": {
+                    "type": "string",
+                    "description": "Description of the element to interact with (required for click/double_click/move/scroll)",
+                },
+                "start_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to start dragging from (required for drag)",
+                },
+                "end_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to drag to (required for drag)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type)",
+                },
+                "keys": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Key(s) to press (required for keypress)",
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "right", "wheel", "back", "forward"],
+                    "description": "The mouse button to use for click/double_click",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount (required for scroll)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount (required for scroll)",
+                },
+            },
+            "required": ["action"],
+        },
+    },
+}
+
+
+@register_agent(r"moondream3\+.*", priority=2)
+class Moondream3PlusConfig(AsyncAgentConfig):
+    def __init__(self):
+        self.desc2xy: Dict[str, Tuple[float, float]] = {}
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        # Parse composed model: moondream3+<thinking_model>
+        if "+" not in model:
+            raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}")
+        _, thinking_model = model.split("+", 1)
+
+        pre_output_items: List[Dict[str, Any]] = []
+
+        # Acquire last screenshot; if missing, take one
+        last_image_b64: Optional[str] = None
+        for message in reversed(messages):
+            if (
+                isinstance(message, dict)
+                and message.get("type") == "computer_call_output"
+                and isinstance(message.get("output"), dict)
+                and message["output"].get("type") == "input_image"
+            ):
+                image_url = message["output"].get("image_url", "")
+                if image_url.startswith("data:image/png;base64,"):
+                    last_image_b64 = image_url.split(",", 1)[1]
+                    break
+
+        if last_image_b64 is None and computer_handler is not None:
+            # Take a screenshot
+            screenshot_b64 = await computer_handler.screenshot() # type: ignore
+            if screenshot_b64:
+                call_id = uuid.uuid4().hex
+                pre_output_items += [
+                    {
+                        "type": "message",
+                        "role": "assistant",
+                        "content": [
+                            {
+                                "type": "output_text",
+                                "text": "Taking a screenshot to analyze the current screen.",
+                            }
+                        ],
+                    },
+                    {
+                        "type": "computer_call",
+                        "call_id": call_id,
+                        "status": "completed",
+                        "action": {"type": "screenshot"},
+                    },
+                    {
+                        "type": "computer_call_output",
+                        "call_id": call_id,
+                        "output": {
+                            "type": "input_image",
+                            "image_url": f"data:image/png;base64,{screenshot_b64}",
+                        },
+                    },
+                ]
+                last_image_b64 = screenshot_b64
+                if _on_screenshot:
+                    await _on_screenshot(screenshot_b64)
+
+        # If we have a last screenshot, run Moondream detection and labeling
+        detected_names: List[str] = []
+        if last_image_b64 is not None:
+            base_img = _decode_image_b64(last_image_b64)
+            model_md = get_moondream_model()
+            annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md)
+            if _on_screenshot:
+                await _on_screenshot(annotated_b64, "annotated_form_ui")
+
+            # Also push a user message listing all detected names
+            if detected_names:
+                names_text = "\n".join(f"- {n}" for n in detected_names)
+                pre_output_items.append(
+                    {
+                        "type": "message",
+                        "role": "user",
+                        "content": [
+                            {"type": "input_text", "text": "Detected form UI elements on screen:"},
+                            {"type": "input_text", "text": names_text},
+                            {
+                                "type": "input_text",
+                                "text": "Please continue with the next action needed to perform your task.",
+                            },
+                        ],
+                    }
+                )
+
+        tool_schemas = []
+        for schema in tools or []:
+            if schema.get("type") == "computer":
+                tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
+            else:
+                tool_schemas.append(schema)
+
+        # Step 1: Convert computer calls from xy to descriptions
+        input_messages = messages + pre_output_items
+        messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
+
+        # Step 2: Convert responses items to completion messages
+        completion_messages = convert_responses_items_to_completion_messages(
+            messages_with_descriptions,
+            allow_images_in_tool_results=False,
+        )
+
+        # Optionally filter images if model lacks vision
+        if not _supports_vision(thinking_model):
+            completion_messages = _filter_images_from_completion_messages(completion_messages)
+
+        # Step 3: Call thinking model with litellm.acompletion
+        api_kwargs = {
+            "model": thinking_model,
+            "messages": completion_messages,
+            "tools": tool_schemas,
+            "max_retries": max_retries,
+            "stream": stream,
+            **kwargs,
+        }
+        if use_prompt_caching:
+            api_kwargs["use_prompt_caching"] = use_prompt_caching
+
+        if _on_api_start:
+            await _on_api_start(api_kwargs)
+
+        response = await litellm.acompletion(**api_kwargs)
+
+        if _on_api_end:
+            await _on_api_end(api_kwargs, response)
+
+        usage = {
+            **response.usage.model_dump(), # type: ignore
+            "response_cost": response._hidden_params.get("response_cost", 0.0),
+        }
+        if _on_usage:
+            await _on_usage(usage)
+
+        # Step 4: Convert completion messages back to responses items format
+        response_dict = response.model_dump() # type: ignore
+        choice_messages = [choice["message"] for choice in response_dict["choices"]]
+        thinking_output_items: List[Dict[str, Any]] = []
+        for choice_message in choice_messages:
+            thinking_output_items.extend(
+                convert_completion_messages_to_responses_items([choice_message])
+            )
+
+        # Step 5: Use Moondream to get coordinates for each description
+        element_descriptions = get_all_element_descriptions(thinking_output_items)
+        if element_descriptions and last_image_b64:
+            for desc in element_descriptions:
+                for _ in range(3): # try 3 times
+                    coords = await self.predict_click(
+                        model=model,
+                        image_b64=last_image_b64,
+                        instruction=desc,
+                    )
+                    if coords:
+                        self.desc2xy[desc] = coords
+                        break
+
+        # Step 6: Convert computer calls from descriptions back to xy coordinates
+        final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
+
+        # Step 7: Return output and usage
+        return {"output": pre_output_items + final_output_items, "usage": usage}
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs,
+    ) -> Optional[Tuple[float, float]]:
+        """Predict click coordinates using Moondream3's point API.
+
+        Returns pixel coordinates (x, y) as floats.
+        """
+        img = _decode_image_b64(image_b64)
+        W, H = img.width, img.height
+        model_md = get_moondream_model()
+        try:
+            result = model_md.point(img, instruction, settings={"max_objects": 1})
+        except Exception:
+            return None
+
+        try:
+            pt = (result or {}).get("points", [])[0]
+            x_norm = float(pt.get("x", 0.0))
+            y_norm = float(pt.get("y", 0.0))
+            x_px = max(0.0, min(float(W - 1), x_norm * W))
+            y_px = max(0.0, min(float(H - 1), y_norm * H))
+            return (x_px, y_px)
+        except Exception:
+            return None
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click", "step"]