cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/adapters/mlxvlm_adapter.py (new file)
@@ -0,0 +1,370 @@
+import asyncio
+import base64
+import functools
+import io
+import math
+import re
+import warnings
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Tuple, cast
+
+from litellm import acompletion, completion
+from litellm.llms.custom_llm import CustomLLM
+from litellm.types.utils import GenericStreamingChunk, ModelResponse
+from PIL import Image
+
+# Try to import MLX dependencies
+try:
+    import mlx.core as mx
+    from mlx_vlm import generate, load
+    from mlx_vlm.prompt_utils import apply_chat_template
+    from mlx_vlm.utils import load_config
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+    MLX_AVAILABLE = True
+except ImportError:
+    MLX_AVAILABLE = False
+
+# Constants for smart_resize
+IMAGE_FACTOR = 28
+MIN_PIXELS = 100 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+
+
+def round_by_factor(number: float, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: float, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: float, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+
+class MLXVLMAdapter(CustomLLM):
+    """MLX VLM Adapter for running vision-language models locally using MLX."""
+
+    def __init__(self, **kwargs):
+        """Initialize the adapter.
+
+        Args:
+            **kwargs: Additional arguments
+        """
+        super().__init__()
+
+        self.models = {}  # Cache for loaded models
+        self.processors = {}  # Cache for loaded processors
+        self.configs = {}  # Cache for loaded configs
+        self._executor = ThreadPoolExecutor(max_workers=1)  # Single thread pool
+
+    def _load_model_and_processor(self, model_name: str):
+        """Load model and processor if not already cached.
+
+        Args:
+            model_name: Name of the model to load
+
+        Returns:
+            Tuple of (model, processor, config)
+        """
+        if not MLX_AVAILABLE:
+            raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.")
+
+        if model_name not in self.models:
+            # Load model and processor
+            model_obj, processor = load(
+                model_name, processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
+            )
+            config = load_config(model_name)
+
+            # Cache them
+            self.models[model_name] = model_obj
+            self.processors[model_name] = processor
+            self.configs[model_name] = config
+
+        return self.models[model_name], self.processors[model_name], self.configs[model_name]
+
+    def _process_coordinates(
+        self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]
+    ) -> str:
+        """Process coordinates in box tokens based on image resizing using smart_resize approach.
+
+        Args:
+            text: Text containing box tokens
+            original_size: Original image size (width, height)
+            model_size: Model processed image size (width, height)
+
+        Returns:
+            Text with processed coordinates
+        """
+        # Find all box tokens
+        box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"
+
+        def process_coords(match):
+            model_x, model_y = int(match.group(1)), int(match.group(2))
+            # Scale coordinates from model space to original image space
+            # Both original_size and model_size are in (width, height) format
+            new_x = int(model_x * original_size[0] / model_size[0])  # Width
+            new_y = int(model_y * original_size[1] / model_size[1])  # Height
+            return f"<|box_start|>({new_x},{new_y})<|box_end|>"
+
+        return re.sub(box_pattern, process_coords, text)
+
+    def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[
+        List[Dict[str, Any]],
+        List[Image.Image],
+        Dict[int, Tuple[int, int]],
+        Dict[int, Tuple[int, int]],
+    ]:
+        """Convert OpenAI format messages to MLX VLM format and extract images.
+
+        Args:
+            messages: Messages in OpenAI format
+
+        Returns:
+            Tuple of (processed_messages, images, original_sizes, model_sizes)
+        """
+        processed_messages = []
+        images = []
+        original_sizes = {}  # Track original sizes of images for coordinate mapping
+        model_sizes = {}  # Track model processed sizes
+        image_index = 0
+
+        for message in messages:
+            processed_message = {"role": message["role"], "content": []}
+
+            content = message.get("content", [])
+            if isinstance(content, str):
+                # Simple text content
+                processed_message["content"] = content
+            elif isinstance(content, list):
+                # Multi-modal content
+                processed_content = []
+                for item in content:
+                    if item.get("type") == "text":
+                        processed_content.append({"type": "text", "text": item.get("text", "")})
+                    elif item.get("type") == "image_url":
+                        image_url = item.get("image_url", {}).get("url", "")
+                        pil_image = None
+
+                        if image_url.startswith("data:image/"):
+                            # Extract base64 data
+                            base64_data = image_url.split(",")[1]
+                            # Convert base64 to PIL Image
+                            image_data = base64.b64decode(base64_data)
+                            pil_image = Image.open(io.BytesIO(image_data))
+                        else:
+                            # Handle file path or URL
+                            pil_image = Image.open(image_url)
+
+                        # Store original image size for coordinate mapping
+                        original_size = pil_image.size
+                        original_sizes[image_index] = original_size
+
+                        # Use smart_resize to determine model size
+                        # Note: smart_resize expects (height, width) but PIL gives (width, height)
+                        height, width = original_size[1], original_size[0]
+                        new_height, new_width = smart_resize(height, width)
+                        # Store model size in (width, height) format for consistent coordinate processing
+                        model_sizes[image_index] = (new_width, new_height)
+
+                        # Resize the image using the calculated dimensions from smart_resize
+                        resized_image = pil_image.resize((new_width, new_height))
+                        images.append(resized_image)
+
+                        # Add image placeholder to content
+                        processed_content.append({"type": "image"})
+
+                        image_index += 1
+
+                processed_message["content"] = processed_content
+
+            processed_messages.append(processed_message)
+
+        return processed_messages, images, original_sizes, model_sizes
+
+    def _generate(self, **kwargs) -> str:
+        """Generate response using the local MLX VLM model.
+
+        Args:
+            **kwargs: Keyword arguments containing messages and model info
+
+        Returns:
+            Generated text response
+        """
+        messages = kwargs.get("messages", [])
+        model_name = kwargs.get("model", "mlx-community/UI-TARS-1.5-7B-4bit")
+        max_tokens = kwargs.get("max_tokens", 128)
+
+        # Warn about ignored kwargs
+        ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
+        if ignored_kwargs:
+            warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
+
+        # Load model and processor
+        model, processor, config = self._load_model_and_processor(model_name)
+
+        # Convert messages and extract images
+        processed_messages, images, original_sizes, model_sizes = self._convert_messages(messages)
+
+        # Process user text input with box coordinates after image processing
+        # Swap original_size and model_size arguments for inverse transformation
+        for msg_idx, msg in enumerate(processed_messages):
+            if msg.get("role") == "user" and isinstance(msg.get("content"), str):
+                content = msg.get("content", "")
+                if (
+                    "<|box_start|>" in content
+                    and original_sizes
+                    and model_sizes
+                    and 0 in original_sizes
+                    and 0 in model_sizes
+                ):
+                    orig_size = original_sizes[0]
+                    model_size = model_sizes[0]
+                    # Swap arguments to perform inverse transformation for user input
+                    processed_messages[msg_idx]["content"] = self._process_coordinates(
+                        content, model_size, orig_size
+                    )
+
+        try:
+            # Format prompt according to model requirements using the processor directly
+            prompt = processor.apply_chat_template(
+                processed_messages, tokenize=False, add_generation_prompt=True, return_tensors="pt"
+            )
+            tokenizer = cast(PreTrainedTokenizer, processor)
+
+            # Generate response
+            text_content, usage = generate(
+                model,
+                tokenizer,
+                str(prompt),
+                images,  # type: ignore
+                verbose=False,
+                max_tokens=max_tokens,
+            )
+
+        except Exception as e:
+            raise RuntimeError(f"Error generating response: {str(e)}") from e
+
+        # Process coordinates in the response back to original image space
+        if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
+            # Get original image size and model size (using the first image)
+            orig_size = original_sizes[0]
+            model_size = model_sizes[0]
+
+            # Check if output contains box tokens that need processing
+            if "<|box_start|>" in text_content:
+                # Process coordinates from model space back to original image space
+                text_content = self._process_coordinates(text_content, orig_size, model_size)
+
+        return text_content
+
+    def completion(self, *args, **kwargs) -> ModelResponse:
+        """Synchronous completion method.
+
+        Returns:
+            ModelResponse with generated text
+        """
+        generated_text = self._generate(**kwargs)
+
+        result = completion(
+            model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
+            mock_response=generated_text,
+        )
+        return cast(ModelResponse, result)
+
+    async def acompletion(self, *args, **kwargs) -> ModelResponse:
+        """Asynchronous completion method.
+
+        Returns:
+            ModelResponse with generated text
+        """
+        # Run _generate in thread pool to avoid blocking
+        loop = asyncio.get_event_loop()
+        generated_text = await loop.run_in_executor(
+            self._executor, functools.partial(self._generate, **kwargs)
+        )
+
+        result = await acompletion(
+            model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
+            mock_response=generated_text,
+        )
+        return cast(ModelResponse, result)
+
+    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
+        """Synchronous streaming method.
+
+        Returns:
+            Iterator of GenericStreamingChunk
+        """
+        generated_text = self._generate(**kwargs)
+
+        generic_streaming_chunk: GenericStreamingChunk = {
+            "finish_reason": "stop",
+            "index": 0,
+            "is_finished": True,
+            "text": generated_text,
+            "tool_use": None,
+            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
+        }
+
+        yield generic_streaming_chunk
+
+    async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
+        """Asynchronous streaming method.
+
+        Returns:
+            AsyncIterator of GenericStreamingChunk
+        """
+        # Run _generate in thread pool to avoid blocking
+        loop = asyncio.get_event_loop()
+        generated_text = await loop.run_in_executor(
+            self._executor, functools.partial(self._generate, **kwargs)
+        )
+
+        generic_streaming_chunk: GenericStreamingChunk = {
+            "finish_reason": "stop",
+            "index": 0,
+            "is_finished": True,
+            "text": generated_text,
+            "tool_use": None,
+            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
+        }
+
+        yield generic_streaming_chunk
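
For orientation, a minimal sketch of how a litellm CustomLLM adapter like this one is typically registered and called. The registration pattern follows litellm's documented custom_provider_map mechanism; the provider prefix "mlx" and the base64 placeholder are illustrative assumptions (only the default model ID is taken from the diff above), not something this package is confirmed to do:

    import litellm
    from litellm import completion

    from agent.adapters.mlxvlm_adapter import MLXVLMAdapter

    # Register the adapter as a custom provider (provider name assumed for illustration).
    adapter = MLXVLMAdapter()
    litellm.custom_provider_map = [{"provider": "mlx", "custom_handler": adapter}]

    # Call it like any other litellm model; the model ID mirrors the default used in _generate.
    response = completion(
        model="mlx/mlx-community/UI-TARS-1.5-7B-4bit",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Click the Submit button."},
                    # Placeholder screenshot; a real call needs a full base64 data URL.
                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
                ],
            }
        ],
        max_tokens=128,
    )
    print(response.choices[0].message.content)
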
agent/adapters/models/__init__.py (new file)
@@ -0,0 +1,41 @@
+from typing import Optional
+
+try:
+    from transformers import AutoConfig
+
+    HF_AVAILABLE = True
+except ImportError:
+    HF_AVAILABLE = False
+
+from .generic import GenericHFModel
+from .internvl import InternVLModel
+from .opencua import OpenCUAModel
+from .qwen2_5_vl import Qwen2_5_VLModel
+
+
+def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False):
+    """Factory function to load and return the right model handler instance.
+
+    - If the underlying transformers config class matches OpenCUA, return OpenCUAModel
+    - Otherwise, return GenericHFModel
+    """
+    if not HF_AVAILABLE:
+        raise ImportError(
+            'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
+        )
+    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+    cls = cfg.__class__.__name__
+    print(f"cls: {cls}")
+    if "OpenCUA" in cls:
+        return OpenCUAModel(
+            model_name=model_name, device=device, trust_remote_code=trust_remote_code
+        )
+    elif "Qwen2_5_VL" in cls:
+        return Qwen2_5_VLModel(
+            model_name=model_name, device=device, trust_remote_code=trust_remote_code
+        )
+    elif "InternVL" in cls:
+        return InternVLModel(
+            model_name=model_name, device=device, trust_remote_code=trust_remote_code
+        )
+    return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
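
A short sketch of how the factory dispatch above plays out; only the class-name matching is taken from the diff, the Qwen checkpoint is a real public model used purely as an illustration, and the fallback model ID is hypothetical:

    from agent.adapters.models import load_model

    # Config class name contains "Qwen2_5_VL" -> Qwen2_5_VLModel handler.
    qwen_handler = load_model("Qwen/Qwen2.5-VL-7B-Instruct", device="auto", trust_remote_code=True)

    # A config class that matches none of the branches falls back to GenericHFModel.
    generic_handler = load_model("some-org/some-vlm")  # hypothetical model ID
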
agent/adapters/models/generic.py (new file)
@@ -0,0 +1,78 @@
+from typing import Any, Dict, List, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch  # type: ignore
+    from transformers import AutoModel, AutoProcessor  # type: ignore
+
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class GenericHFModel:
+    """Generic Hugging Face vision-language model handler.
+    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
+    """
+
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.float16,
+            device_map=self.device,
+            attn_implementation="sdpa",
+            trust_remote_code=self.trust_remote_code,
+        )
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_name,
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+        """
+        assert self.model is not None and self.processor is not None
+        # Apply chat template and tokenize
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        # Move inputs to the same device as model
+        inputs = inputs.to(self.model.device)
+        # Generate
+        with torch.no_grad():
+            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+        # Trim prompt tokens from output
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        # Decode
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return output_text[0] if output_text else ""
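
A minimal usage sketch for GenericHFModel, assuming a checkpoint whose processor supports apply_chat_template with interleaved image and text content; the model ID and image path are placeholders, not values from this diff:

    from agent.adapters.models.generic import GenericHFModel

    model = GenericHFModel("some-org/some-vlm", device="auto")  # hypothetical checkpoint

    # HF-format messages: each content item is {"type": "text" | "image", ...}.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "screenshot.png"},
                {"type": "text", "text": "Where is the Submit button?"},
            ],
        }
    ]
    print(model.generate(messages, max_new_tokens=64))
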