cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.



Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/adapters/models/internvl.py (new file)
@@ -0,0 +1,290 @@
+ from __future__ import annotations
+
+ from typing import Any, Dict, List, Optional
+
+ # Hugging Face imports are local to avoid hard dependency at module import
+ try:
+     import base64  # type: ignore
+     from io import BytesIO  # type: ignore
+
+     # Attempt to import InternVL's model dependencies
+     import einops as _  # type: ignore
+     import requests  # type: ignore
+     import timm as _  # type: ignore
+     import torch  # type: ignore
+     import torchvision.transforms as T  # type: ignore
+     from PIL import Image  # type: ignore
+     from torchvision.transforms.functional import InterpolationMode  # type: ignore
+     from transformers import AutoModel, AutoTokenizer  # type: ignore
+
+     HF_AVAILABLE = True
+ except Exception:
+     HF_AVAILABLE = False
+
+
+ class InternVLModel:
+     """Generic Hugging Face vision-language model handler.
+     Uses InternVL's native `model.chat()` interface with `AutoTokenizer`.
+     Provides preprocessing to support multi-turn conversations with multiple images.
+     """
+
+     def __init__(
+         self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+     ) -> None:
+         if not HF_AVAILABLE:
+             raise ImportError(
+                 'InternVL dependencies not found. Install with: pip install "cua-agent[internvl-hf]"'
+             )
+         self.model_name = model_name
+         self.device = device
+         self.model = None
+         self.tokenizer = None
+         self.trust_remote_code = trust_remote_code
+         self._load()
+
+     def _load(self) -> None:
+         # Load model
+         self.model = AutoModel.from_pretrained(
+             self.model_name,
+             torch_dtype=torch.bfloat16,
+             low_cpu_mem_usage=True,
+             use_flash_attn=True,
+             device_map=self.device,
+             trust_remote_code=self.trust_remote_code,
+         ).eval()
+         # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False)
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             self.model_name,
+             trust_remote_code=self.trust_remote_code,
+             use_fast=False,
+         )
+
+     # ---- Image preprocessing utilities adapted from InternVL docs ----
+     IMAGENET_MEAN = (0.485, 0.456, 0.406)
+     IMAGENET_STD = (0.229, 0.224, 0.225)
+
+     def _build_transform(self, input_size: int) -> T.Compose:
+         MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
+         transform = T.Compose(
+             [
+                 T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                 T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+                 T.ToTensor(),
+                 T.Normalize(mean=MEAN, std=STD),
+             ]
+         )
+         return transform
+
+     def _find_closest_aspect_ratio(
+         self,
+         aspect_ratio: float,
+         target_ratios: List[tuple],
+         width: int,
+         height: int,
+         image_size: int,
+     ):
+         best_ratio_diff = float("inf")
+         best_ratio = (1, 1)
+         area = width * height
+         for ratio in target_ratios:
+             target_aspect_ratio = ratio[0] / ratio[1]
+             ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+             if ratio_diff < best_ratio_diff:
+                 best_ratio_diff = ratio_diff
+                 best_ratio = ratio
+             elif ratio_diff == best_ratio_diff:
+                 if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                     best_ratio = ratio
+         return best_ratio
+
+     def _dynamic_preprocess(
+         self,
+         image: Image.Image,
+         min_num: int = 1,
+         max_num: int = 12,
+         image_size: int = 448,
+         use_thumbnail: bool = True,
+     ) -> List[Image.Image]:
+         orig_width, orig_height = image.size
+         aspect_ratio = orig_width / orig_height
+
+         target_ratios = set(
+             (i, j)
+             for n in range(min_num, max_num + 1)
+             for i in range(1, n + 1)
+             for j in range(1, n + 1)
+             if i * j <= max_num and i * j >= min_num
+         )
+         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+         target_aspect_ratio = self._find_closest_aspect_ratio(
+             aspect_ratio, target_ratios, orig_width, orig_height, image_size
+         )
+
+         target_width = image_size * target_aspect_ratio[0]
+         target_height = image_size * target_aspect_ratio[1]
+         blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+         resized_img = image.resize((target_width, target_height))
+         processed_images: List[Image.Image] = []
+         for i in range(blocks):
+             box = (
+                 (i % (target_width // image_size)) * image_size,
+                 (i // (target_width // image_size)) * image_size,
+                 ((i % (target_width // image_size)) + 1) * image_size,
+                 ((i // (target_width // image_size)) + 1) * image_size,
+             )
+             split_img = resized_img.crop(box)
+             processed_images.append(split_img)
+         assert len(processed_images) == blocks
+         if use_thumbnail and len(processed_images) != 1:
+             thumbnail_img = image.resize((image_size, image_size))
+             processed_images.append(thumbnail_img)
+         return processed_images
+
+     def _load_image_from_source(self, src: str) -> Image.Image:
+         """Load PIL image from various sources: data URL, http(s), or local path."""
+         if src.startswith("data:image/"):
+             # data URL base64
+             header, b64data = src.split(",", 1)
+             img_bytes = base64.b64decode(b64data)
+             return Image.open(BytesIO(img_bytes)).convert("RGB")
+         if src.startswith("http://") or src.startswith("https://"):
+             resp = requests.get(src, timeout=10)
+             resp.raise_for_status()
+             return Image.open(BytesIO(resp.content)).convert("RGB")
+         # Assume local file path
+         return Image.open(src).convert("RGB")
+
+     def _images_to_pixel_values(
+         self, images: List[Image.Image], input_size: int = 448, max_num: int = 12
+     ):
+         transform = self._build_transform(input_size=input_size)
+         pixel_values_list = []
+         num_patches_list: List[int] = []
+         for img in images:
+             tiles = self._dynamic_preprocess(
+                 img, image_size=input_size, use_thumbnail=True, max_num=max_num
+             )
+             pv = [transform(tile) for tile in tiles]
+             pv = torch.stack(pv)
+             num_patches_list.append(pv.shape[0])
+             pixel_values_list.append(pv)
+         if not pixel_values_list:
+             return None, []
+         pixel_values = torch.cat(pixel_values_list)
+         return pixel_values, num_patches_list
+
+     def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+         """Generate text for the given HF-format messages.
+         messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+
+         This implementation constructs InternVL-compatible inputs and uses
+         `model.chat(tokenizer, pixel_values, question, history=...)` to avoid
+         relying on AutoProcessor (which fails for some tokenizers).
+         """
+         assert self.model is not None and self.tokenizer is not None
+
+         # Build textual context and collect images and the final question
+         context_lines: List[str] = []
+         all_images: List[Image.Image] = []
+         last_user_text_parts: List[str] = []
+
+         for msg in messages:
+             role = msg.get("role", "user")
+             content = msg.get("content", [])
+             if isinstance(content, str):
+                 content_items = [{"type": "text", "text": content}]
+             else:
+                 content_items = content
+
+             if role == "user":
+                 # Collect text and images
+                 parts_text: List[str] = []
+                 for item in content_items:
+                     if item.get("type") == "text":
+                         t = item.get("text", "")
+                         if t:
+                             parts_text.append(t)
+                     elif item.get("type") == "image":
+                         url = item.get("image", "")
+                         if url:
+                             try:
+                                 all_images.append(self._load_image_from_source(url))
+                             except Exception:
+                                 # Ignore failed image loads but keep going
+                                 pass
+                 text = "\n".join(parts_text).strip()
+                 if text:
+                     context_lines.append(f"User: {text}")
+                 # Track last user text separately for question
+                 last_user_text_parts = parts_text or last_user_text_parts
+             elif role == "assistant":
+                 # Only keep text content for history
+                 parts_text = [
+                     item.get("text", "") for item in content_items if item.get("type") == "text"
+                 ]
+                 text = "\n".join(parts_text).strip()
+                 if text:
+                     context_lines.append(f"Assistant: {text}")
+
+         # Prepare pixel values for all collected images (across turns)
+         pixel_values = None
+         num_patches_list: List[int] = []
+         if all_images:
+             pixel_values, num_patches_list = self._images_to_pixel_values(
+                 all_images, input_size=448, max_num=12
+             )
+             if pixel_values is not None:
+                 # Convert dtype/device as in docs
+                 pixel_values = pixel_values.to(torch.bfloat16)
+                 # Chat API expects tensors on CUDA when model is on CUDA
+                 try:
+                     pixel_values = pixel_values.to(self.model.device)
+                 except Exception:
+                     pass
+
+         # Build question with any prior context and numbered image placeholders
+         if all_images:
+             # Separate images layout: Image-1: <image> ... then question text
+             prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))]
+             prefix = "\n".join(prefix_lines) + "\n"
+         else:
+             prefix = ""
+
+         last_user_text = "\n".join(last_user_text_parts).strip()
+         # Combine prior text-only turns as context to emulate multi-turn
+         context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else ""
+         base_question = last_user_text if last_user_text else "Describe the image(s) in detail."
+         if context_text:
+             question = (context_text + "\n" + prefix + base_question).strip()
+         else:
+             question = (prefix + base_question).strip()
+
+         # Generation config
+         generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False)
+
+         # Call InternVL chat
+         try:
+             if pixel_values is None:
+                 # Pure-text conversation (embed prior turns in question)
+                 response = self.model.chat(self.tokenizer, None, question, generation_config)
+             else:
+                 # Multi-image: pass num_patches_list if >1 image
+                 if len(num_patches_list) > 1:
+                     response = self.model.chat(
+                         self.tokenizer,
+                         pixel_values,
+                         question,
+                         generation_config,
+                         num_patches_list=num_patches_list,
+                     )
+                 else:
+                     response = self.model.chat(
+                         self.tokenizer, pixel_values, question, generation_config
+                     )
+         except Exception as e:
+             # Fallback: return empty string to avoid crashing the adapter
+             return ""
+
+         return response or ""
agent/adapters/models/opencua.py (new file)
@@ -0,0 +1,115 @@
+ import base64
+ import re
+ from io import BytesIO
+ from typing import Any, Dict, List
+
+ try:
+     import blobfile as _  # assert blobfile is installed
+     import torch  # type: ignore
+     from PIL import Image  # type: ignore
+     from transformers import (  # type: ignore
+         AutoImageProcessor,
+         AutoModel,
+         AutoTokenizer,
+     )
+
+     OPENCUA_AVAILABLE = True
+ except Exception:
+     OPENCUA_AVAILABLE = False
+
+
+ class OpenCUAModel:
+     """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
+
+     def __init__(
+         self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+     ) -> None:
+         if not OPENCUA_AVAILABLE:
+             raise ImportError(
+                 'OpenCUA requirements not found. Install with: pip install "cua-agent[opencua-hf]"'
+             )
+         self.model_name = model_name
+         self.device = device
+         self.model = None
+         self.tokenizer = None
+         self.image_processor = None
+         self.trust_remote_code = trust_remote_code
+         self._load()
+
+     def _load(self) -> None:
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             self.model_name, trust_remote_code=self.trust_remote_code
+         )
+         self.model = AutoModel.from_pretrained(
+             self.model_name,
+             torch_dtype="auto",
+             device_map=self.device,
+             trust_remote_code=self.trust_remote_code,
+             attn_implementation="sdpa",
+         )
+         self.image_processor = AutoImageProcessor.from_pretrained(
+             self.model_name, trust_remote_code=self.trust_remote_code
+         )
+
+     @staticmethod
+     def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
+         # Expect HF-format messages with content items type: "image" with data URL
+         for msg in reversed(messages):
+             for item in reversed(msg.get("content", [])):
+                 if isinstance(item, dict) and item.get("type") == "image":
+                     url = item.get("image", "")
+                     if isinstance(url, str) and url.startswith("data:image/"):
+                         return url.split(",", 1)[1]
+         return ""
+
+     def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
+         assert (
+             self.model is not None
+             and self.tokenizer is not None
+             and self.image_processor is not None
+         )
+
+         # Tokenize text side using chat template
+         input_ids = self.tokenizer.apply_chat_template(
+             messages, tokenize=True, add_generation_prompt=True
+         )
+         input_ids = torch.tensor([input_ids]).to(self.model.device)
+
+         # Prepare image inputs from last data URL image
+         image_b64 = self._extract_last_image_b64(messages)
+         pixel_values = None
+         grid_thws = None
+         if image_b64:
+             image = Image.open(BytesIO(base64.b64decode(image_b64))).convert("RGB")
+             image_info = self.image_processor.preprocess(images=[image])
+             pixel_values = torch.tensor(image_info["pixel_values"]).to(
+                 dtype=torch.bfloat16, device=self.model.device
+             )
+             grid_thws = (
+                 torch.tensor(image_info["image_grid_thw"])
+                 if "image_grid_thw" in image_info
+                 else None
+             )
+
+         gen_kwargs: Dict[str, Any] = {
+             "max_new_tokens": max_new_tokens,
+             "temperature": 0,
+         }
+         if pixel_values is not None:
+             gen_kwargs["pixel_values"] = pixel_values
+         if grid_thws is not None:
+             gen_kwargs["grid_thws"] = grid_thws
+
+         with torch.no_grad():
+             generated_ids = self.model.generate(
+                 input_ids,
+                 **gen_kwargs,
+             )
+
+         # Remove prompt tokens
+         prompt_len = input_ids.shape[1]
+         generated_ids = generated_ids[:, prompt_len:]
+         output_text = self.tokenizer.batch_decode(
+             generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+         )[0]
+         return output_text
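
As `_extract_last_image_b64` shows, OpenCUAModel consumes only the most recent data-URL image in the conversation, so callers must inline screenshots as base64 data URLs rather than file paths or http(s) URLs. A minimal usage sketch under the same caveats as above (the checkpoint name is an illustrative assumption):

```python
import base64
from io import BytesIO

from PIL import Image

from agent.adapters.models.opencua import OpenCUAModel

# Placeholder screenshot as a data URL; only the last image in the
# message list is actually fed to the model.
buf = BytesIO()
Image.new("RGB", (1280, 720), "white").save(buf, format="PNG")
data_url = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()

# Checkpoint name is an assumption for illustration; the remote-code
# model and tokenizer need trust_remote_code=True.
model = OpenCUAModel("xlangai/OpenCUA-7B", device="auto", trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": data_url},
            {"type": "text", "text": "Click the search field."},
        ],
    }
]
print(model.generate(messages, max_new_tokens=512))
```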
agent/adapters/models/qwen2_5_vl.py (new file)
@@ -0,0 +1,78 @@
+ from typing import Any, Dict, List, Optional
+
+ # Hugging Face imports are local to avoid hard dependency at module import
+ try:
+     import torch  # type: ignore
+     from transformers import AutoModelForImageTextToText, AutoProcessor  # type: ignore
+
+     HF_AVAILABLE = True
+ except Exception:
+     HF_AVAILABLE = False
+
+
+ class Qwen2_5_VLModel:
+     """Qwen2.5-VL Hugging Face vision-language model handler.
+     Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
+     """
+
+     def __init__(
+         self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+     ) -> None:
+         if not HF_AVAILABLE:
+             raise ImportError(
+                 'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
+             )
+         self.model_name = model_name
+         self.device = device
+         self.model = None
+         self.processor = None
+         self.trust_remote_code = trust_remote_code
+         self._load()
+
+     def _load(self) -> None:
+         # Load model
+         self.model = AutoModelForImageTextToText.from_pretrained(
+             self.model_name,
+             torch_dtype=torch.bfloat16,
+             device_map=self.device,
+             attn_implementation="sdpa",
+             trust_remote_code=self.trust_remote_code,
+         )
+         # Load processor
+         self.processor = AutoProcessor.from_pretrained(
+             self.model_name,
+             min_pixels=3136,
+             max_pixels=4096 * 2160,
+             device_map=self.device,
+             trust_remote_code=self.trust_remote_code,
+         )
+
+     def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+         """Generate text for the given HF-format messages.
+         messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+         """
+         assert self.model is not None and self.processor is not None
+         # Apply chat template and tokenize
+         inputs = self.processor.apply_chat_template(
+             messages,
+             add_generation_prompt=True,
+             tokenize=True,
+             return_dict=True,
+             return_tensors="pt",
+         )
+         # Move inputs to the same device as model
+         inputs = inputs.to(self.model.device)
+         # Generate
+         with torch.no_grad():
+             generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+         # Trim prompt tokens from output
+         generated_ids_trimmed = [
+             out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         # Decode
+         output_text = self.processor.batch_decode(
+             generated_ids_trimmed,
+             skip_special_tokens=True,
+             clean_up_tokenization_spaces=False,
+         )
+         return output_text[0] if output_text else ""
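
Unlike the InternVL handler, Qwen2_5_VLModel delegates both tokenization and image handling to `AutoProcessor.apply_chat_template`, so the same HF-format messages flow through end to end. A minimal sketch, assuming the `uitars-hf` extras are installed (the checkpoint name is an illustrative assumption):

```python
import base64
from io import BytesIO

from PIL import Image

from agent.adapters.models.qwen2_5_vl import Qwen2_5_VLModel

# Placeholder screenshot as a data URL; the Qwen processor resolves
# image entries in the chat template itself.
buf = BytesIO()
Image.new("RGB", (448, 448), "white").save(buf, format="PNG")
data_url = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()

# Checkpoint name is an assumption for illustration.
model = Qwen2_5_VLModel("Qwen/Qwen2.5-VL-7B-Instruct", device="auto")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": data_url},
            {"type": "text", "text": "What text is shown in this image?"},
        ],
    }
]
print(model.generate(messages, max_new_tokens=128))
```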