cua-agent 0.4.33 → 0.4.35 (py3-none-any.whl)

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.



Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +49 -20
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/METADATA +22 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.33.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/adapters/models/internvl.py (+66 -30)

@@ -1,19 +1,22 @@
 from __future__ import annotations
-from typing import List, Dict, Any, Optional
+
+from typing import Any, Dict, List, Optional
 
 # Hugging Face imports are local to avoid hard dependency at module import
 try:
-    import torch  # type: ignore
-    from transformers import AutoModel, AutoTokenizer  # type: ignore
+    import base64  # type: ignore
+    from io import BytesIO  # type: ignore
+
     # Attempt to import InternVL's model dependencies
     import einops as _  # type: ignore
+    import requests  # type: ignore
     import timm as _  # type: ignore
-    from PIL import Image  # type: ignore
+    import torch  # type: ignore
     import torchvision.transforms as T  # type: ignore
+    from PIL import Image  # type: ignore
     from torchvision.transforms.functional import InterpolationMode  # type: ignore
-    import base64  # type: ignore
-    from io import BytesIO  # type: ignore
-    import requests  # type: ignore
+    from transformers import AutoModel, AutoTokenizer  # type: ignore
+
     HF_AVAILABLE = True
 except Exception:
     HF_AVAILABLE = False
@@ -25,10 +28,12 @@ class InternVLModel:
     Provides preprocessing to support multi-turn conversations with multiple images.
     """
 
-    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not HF_AVAILABLE:
             raise ImportError(
-                "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\""
+                'InternVL dependencies not found. Install with: pip install "cua-agent[internvl-hf]"'
             )
         self.model_name = model_name
         self.device = device
@@ -60,16 +65,25 @@ class InternVLModel:
 
     def _build_transform(self, input_size: int) -> T.Compose:
         MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
-        transform = T.Compose([
-            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
-            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
-            T.ToTensor(),
-            T.Normalize(mean=MEAN, std=STD)
-        ])
+        transform = T.Compose(
+            [
+                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+                T.ToTensor(),
+                T.Normalize(mean=MEAN, std=STD),
+            ]
+        )
         return transform
 
-    def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int):
-        best_ratio_diff = float('inf')
+    def _find_closest_aspect_ratio(
+        self,
+        aspect_ratio: float,
+        target_ratios: List[tuple],
+        width: int,
+        height: int,
+        image_size: int,
+    ):
+        best_ratio_diff = float("inf")
         best_ratio = (1, 1)
         area = width * height
         for ratio in target_ratios:
@@ -83,17 +97,29 @@ class InternVLModel:
                 best_ratio = ratio
         return best_ratio
 
-    def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]:
+    def _dynamic_preprocess(
+        self,
+        image: Image.Image,
+        min_num: int = 1,
+        max_num: int = 12,
+        image_size: int = 448,
+        use_thumbnail: bool = True,
+    ) -> List[Image.Image]:
         orig_width, orig_height = image.size
         aspect_ratio = orig_width / orig_height
 
         target_ratios = set(
-            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-            i * j <= max_num and i * j >= min_num)
+            (i, j)
+            for n in range(min_num, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num and i * j >= min_num
+        )
         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
 
         target_aspect_ratio = self._find_closest_aspect_ratio(
-            aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+        )
 
         target_width = image_size * target_aspect_ratio[0]
         target_height = image_size * target_aspect_ratio[1]
@@ -106,7 +132,7 @@ class InternVLModel:
                 (i % (target_width // image_size)) * image_size,
                 (i // (target_width // image_size)) * image_size,
                 ((i % (target_width // image_size)) + 1) * image_size,
-                ((i // (target_width // image_size)) + 1) * image_size
+                ((i // (target_width // image_size)) + 1) * image_size,
             )
             split_img = resized_img.crop(box)
             processed_images.append(split_img)
@@ -122,20 +148,24 @@ class InternVLModel:
             # data URL base64
             header, b64data = src.split(",", 1)
             img_bytes = base64.b64decode(b64data)
-            return Image.open(BytesIO(img_bytes)).convert('RGB')
+            return Image.open(BytesIO(img_bytes)).convert("RGB")
         if src.startswith("http://") or src.startswith("https://"):
             resp = requests.get(src, timeout=10)
             resp.raise_for_status()
-            return Image.open(BytesIO(resp.content)).convert('RGB')
+            return Image.open(BytesIO(resp.content)).convert("RGB")
         # Assume local file path
-        return Image.open(src).convert('RGB')
+        return Image.open(src).convert("RGB")
 
-    def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12):
+    def _images_to_pixel_values(
+        self, images: List[Image.Image], input_size: int = 448, max_num: int = 12
+    ):
         transform = self._build_transform(input_size=input_size)
         pixel_values_list = []
         num_patches_list: List[int] = []
         for img in images:
-            tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
+            tiles = self._dynamic_preprocess(
+                img, image_size=input_size, use_thumbnail=True, max_num=max_num
+            )
             pv = [transform(tile) for tile in tiles]
             pv = torch.stack(pv)
             num_patches_list.append(pv.shape[0])
@@ -191,7 +221,9 @@ class InternVLModel:
                 last_user_text_parts = parts_text or last_user_text_parts
             elif role == "assistant":
                 # Only keep text content for history
-                parts_text = [item.get("text", "") for item in content_items if item.get("type") == "text"]
+                parts_text = [
+                    item.get("text", "") for item in content_items if item.get("type") == "text"
+                ]
                 text = "\n".join(parts_text).strip()
                 if text:
                     context_lines.append(f"Assistant: {text}")
@@ -200,7 +232,9 @@ class InternVLModel:
         pixel_values = None
         num_patches_list: List[int] = []
        if all_images:
-            pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12)
+            pixel_values, num_patches_list = self._images_to_pixel_values(
+                all_images, input_size=448, max_num=12
+            )
             if pixel_values is not None:
                 # Convert dtype/device as in docs
                 pixel_values = pixel_values.to(torch.bfloat16)
@@ -246,7 +280,9 @@ class InternVLModel:
                     num_patches_list=num_patches_list,
                 )
             else:
-                response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
+                response = self.model.chat(
+                    self.tokenizer, pixel_values, question, generation_config
+                )
         except Exception as e:
             # Fallback: return empty string to avoid crashing the adapter
             return ""
agent/adapters/models/opencua.py (+23 -8)

@@ -1,13 +1,18 @@
-from typing import List, Dict, Any
-import re
 import base64
+import re
 from io import BytesIO
+from typing import Any, Dict, List
 
 try:
+    import blobfile as _  # assert blobfile is installed
     import torch  # type: ignore
-    from transformers import AutoTokenizer, AutoModel, AutoImageProcessor  # type: ignore
     from PIL import Image  # type: ignore
-    import blobfile as _  # assert blobfile is installed
+    from transformers import (  # type: ignore
+        AutoImageProcessor,
+        AutoModel,
+        AutoTokenizer,
+    )
+
     OPENCUA_AVAILABLE = True
 except Exception:
     OPENCUA_AVAILABLE = False
@@ -16,10 +21,12 @@ except Exception:
 class OpenCUAModel:
     """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
 
-    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not OPENCUA_AVAILABLE:
             raise ImportError(
-                "OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\""
+                'OpenCUA requirements not found. Install with: pip install "cua-agent[opencua-hf]"'
             )
         self.model_name = model_name
         self.device = device
@@ -56,7 +63,11 @@ class OpenCUAModel:
         return ""
 
     def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
-        assert self.model is not None and self.tokenizer is not None and self.image_processor is not None
+        assert (
+            self.model is not None
+            and self.tokenizer is not None
+            and self.image_processor is not None
+        )
 
         # Tokenize text side using chat template
         input_ids = self.tokenizer.apply_chat_template(
@@ -74,7 +85,11 @@ class OpenCUAModel:
         pixel_values = torch.tensor(image_info["pixel_values"]).to(
             dtype=torch.bfloat16, device=self.model.device
         )
-        grid_thws = torch.tensor(image_info["image_grid_thw"]) if "image_grid_thw" in image_info else None
+        grid_thws = (
+            torch.tensor(image_info["image_grid_thw"])
+            if "image_grid_thw" in image_info
+            else None
+        )
 
         gen_kwargs: Dict[str, Any] = {
             "max_new_tokens": max_new_tokens,
agent/adapters/models/qwen2_5_vl.py (+7 -4)

@@ -1,9 +1,10 @@
-from typing import List, Dict, Any, Optional
+from typing import Any, Dict, List, Optional
 
 # Hugging Face imports are local to avoid hard dependency at module import
 try:
     import torch  # type: ignore
     from transformers import AutoModelForImageTextToText, AutoProcessor  # type: ignore
+
     HF_AVAILABLE = True
 except Exception:
     HF_AVAILABLE = False
@@ -14,10 +15,12 @@ class Qwen2_5_VLModel:
     Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
     """
 
-    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not HF_AVAILABLE:
             raise ImportError(
-                "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+                'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
             )
         self.model_name = model_name
         self.device = device
@@ -64,7 +67,7 @@ class Qwen2_5_VLModel:
         generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
         # Trim prompt tokens from output
         generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         # Decode
         output_text = self.processor.batch_decode(
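
The only change in the last hunk is Black's slice spacing (out_ids[len(in_ids) :]); the trimming logic is unchanged. For readers unfamiliar with the pattern: model.generate() returns each prompt's tokens followed by the newly generated tokens, so dropping the first len(in_ids) positions leaves only the completion. A toy illustration with plain lists:

# Toy illustration of prompt-token trimming (values are made up).
prompt_ids = [101, 2023, 2003]          # stands in for one row of inputs.input_ids
full_output = prompt_ids + [2307, 102]  # stands in for the matching row of generated_ids
completion = full_output[len(prompt_ids):]
assert completion == [2307, 102]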