cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of cua-agent has been flagged as a potentially problematic release.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +32 -19
- agent/computers/cua.py +33 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +215 -210
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +510 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
- cua_agent-0.4.36.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/adapters/models/internvl.py
CHANGED
@@ -1,19 +1,22 @@
 from __future__ import annotations
-
+
+from typing import Any, Dict, List, Optional
 
 # Hugging Face imports are local to avoid hard dependency at module import
 try:
-    import
-    from
+    import base64  # type: ignore
+    from io import BytesIO  # type: ignore
+
     # Attempt to import InternVL's model dependencies
     import einops as _  # type: ignore
+    import requests  # type: ignore
     import timm as _  # type: ignore
-
+    import torch  # type: ignore
     import torchvision.transforms as T  # type: ignore
+    from PIL import Image  # type: ignore
     from torchvision.transforms.functional import InterpolationMode  # type: ignore
-    import
-
-    import requests  # type: ignore
+    from transformers import AutoModel, AutoTokenizer  # type: ignore
+
     HF_AVAILABLE = True
 except Exception:
     HF_AVAILABLE = False
@@ -25,10 +28,12 @@ class InternVLModel:
     Provides preprocessing to support multi-turn conversations with multiple images.
     """
 
-    def __init__(
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not HF_AVAILABLE:
             raise ImportError(
-
+                'InternVL dependencies not found. Install with: pip install "cua-agent[internvl-hf]"'
             )
         self.model_name = model_name
         self.device = device
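For readers skimming the diff: the two hunks above follow this package's usual optional-dependency pattern. The heavy imports run inside a try/except that sets a module-level availability flag, and the constructor raises an ImportError naming the matching pip extra when the flag is false. A minimal sketch of that pattern follows; the names VISION_AVAILABLE, VisionAdapter, and the "my-pkg[vision]" extra are illustrative and not part of cua-agent.

# Minimal sketch of the guarded-import pattern; names here are illustrative only.
try:
    import torch  # noqa: F401  # heavy optional dependency
    from PIL import Image  # noqa: F401

    VISION_AVAILABLE = True
except Exception:
    VISION_AVAILABLE = False


class VisionAdapter:
    def __init__(self, model_name: str) -> None:
        if not VISION_AVAILABLE:
            # Point users at the extra that installs the optional dependencies.
            raise ImportError(
                'Vision dependencies not found. Install with: pip install "my-pkg[vision]"'
            )
        self.model_name = model_name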
@@ -60,16 +65,25 @@ class InternVLModel:
 
     def _build_transform(self, input_size: int) -> T.Compose:
         MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
-        transform = T.Compose(
-
-
-
-
-
+        transform = T.Compose(
+            [
+                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+                T.ToTensor(),
+                T.Normalize(mean=MEAN, std=STD),
+            ]
+        )
         return transform
 
-    def _find_closest_aspect_ratio(
-
+    def _find_closest_aspect_ratio(
+        self,
+        aspect_ratio: float,
+        target_ratios: List[tuple],
+        width: int,
+        height: int,
+        image_size: int,
+    ):
+        best_ratio_diff = float("inf")
         best_ratio = (1, 1)
         area = width * height
         for ratio in target_ratios:
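The rebuilt _build_transform above is a standard torchvision pipeline: force RGB, resize to a square tile, convert to a tensor, normalize with ImageNet statistics. A self-contained sketch of applying such a pipeline follows; the mean/std constants are the usual ImageNet values and are assumed to match the class's IMAGENET_MEAN/IMAGENET_STD, and the sample image is generated in memory.

# Sketch of applying a _build_transform-style pipeline to one tile; illustrative only.
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode

IMAGENET_MEAN = (0.485, 0.456, 0.406)  # assumed to match the class constants
IMAGENET_STD = (0.229, 0.224, 0.225)

transform = T.Compose(
    [
        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        T.Resize((448, 448), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

tile = Image.new("RGB", (640, 360), color=(120, 180, 240))  # sample input image
tensor = transform(tile)
print(tensor.shape)  # torch.Size([3, 448, 448])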
@@ -83,17 +97,29 @@ class InternVLModel:
                 best_ratio = ratio
         return best_ratio
 
-    def _dynamic_preprocess(
+    def _dynamic_preprocess(
+        self,
+        image: Image.Image,
+        min_num: int = 1,
+        max_num: int = 12,
+        image_size: int = 448,
+        use_thumbnail: bool = True,
+    ) -> List[Image.Image]:
         orig_width, orig_height = image.size
         aspect_ratio = orig_width / orig_height
 
         target_ratios = set(
-            (i, j)
-
+            (i, j)
+            for n in range(min_num, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num and i * j >= min_num
+        )
         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
 
         target_aspect_ratio = self._find_closest_aspect_ratio(
-            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+        )
 
         target_width = image_size * target_aspect_ratio[0]
         target_height = image_size * target_aspect_ratio[1]
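The _dynamic_preprocess rewrite above keeps the same tiling logic: enumerate candidate (columns, rows) grids within a tile budget and pick the one whose aspect ratio is closest to the image's. A standalone sketch of that selection step follows; the closest_grid function name and the simplified selection (no tie-breaking) are illustrative, while the real method lives on the class as shown in the diff.

# Standalone sketch of the grid-selection idea behind _dynamic_preprocess; illustrative only.
def closest_grid(width: int, height: int, min_num: int = 1, max_num: int = 12) -> tuple:
    aspect = width / height
    candidates = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    # Pick the (cols, rows) grid whose aspect ratio is closest to the image's.
    return min(candidates, key=lambda r: abs(aspect - r[0] / r[1]))


# A 1920x1080 (16:9) screenshot resolves to a 2:1 grid such as (2, 1) here,
# because an exact 16:9 grid would need more than 12 tiles.
print(closest_grid(1920, 1080))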
@@ -106,7 +132,7 @@ class InternVLModel:
                 (i % (target_width // image_size)) * image_size,
                 (i // (target_width // image_size)) * image_size,
                 ((i % (target_width // image_size)) + 1) * image_size,
-                ((i // (target_width // image_size)) + 1) * image_size
+                ((i // (target_width // image_size)) + 1) * image_size,
             )
             split_img = resized_img.crop(box)
             processed_images.append(split_img)
@@ -122,20 +148,24 @@ class InternVLModel:
             # data URL base64
             header, b64data = src.split(",", 1)
             img_bytes = base64.b64decode(b64data)
-            return Image.open(BytesIO(img_bytes)).convert(
+            return Image.open(BytesIO(img_bytes)).convert("RGB")
         if src.startswith("http://") or src.startswith("https://"):
             resp = requests.get(src, timeout=10)
             resp.raise_for_status()
-            return Image.open(BytesIO(resp.content)).convert(
+            return Image.open(BytesIO(resp.content)).convert("RGB")
         # Assume local file path
-        return Image.open(src).convert(
+        return Image.open(src).convert("RGB")
 
-    def _images_to_pixel_values(
+    def _images_to_pixel_values(
+        self, images: List[Image.Image], input_size: int = 448, max_num: int = 12
+    ):
         transform = self._build_transform(input_size=input_size)
         pixel_values_list = []
         num_patches_list: List[int] = []
         for img in images:
-            tiles = self._dynamic_preprocess(
+            tiles = self._dynamic_preprocess(
+                img, image_size=input_size, use_thumbnail=True, max_num=max_num
+            )
             pv = [transform(tile) for tile in tiles]
             pv = torch.stack(pv)
             num_patches_list.append(pv.shape[0])
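The image-loading hunk above handles three sources: base64 data URLs, http(s) URLs, and local paths. A small self-contained sketch of the data-URL branch follows, using an in-memory image as sample input; the sample image and variable names are illustrative.

# Sketch of decoding a base64 data URL into a PIL image, as in the hunk above.
import base64
from io import BytesIO

from PIL import Image

# Build a sample data URL from an in-memory image (illustrative input).
buf = BytesIO()
Image.new("RGB", (8, 8), color=(200, 30, 30)).save(buf, format="PNG")
data_url = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("ascii")

# Mirror the decoding branch: strip the header, decode, force RGB.
header, b64data = data_url.split(",", 1)
img = Image.open(BytesIO(base64.b64decode(b64data))).convert("RGB")
print(img.size)  # (8, 8)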
@@ -191,7 +221,9 @@ class InternVLModel:
                 last_user_text_parts = parts_text or last_user_text_parts
             elif role == "assistant":
                 # Only keep text content for history
-                parts_text = [
+                parts_text = [
+                    item.get("text", "") for item in content_items if item.get("type") == "text"
+                ]
                 text = "\n".join(parts_text).strip()
                 if text:
                     context_lines.append(f"Assistant: {text}")
@@ -200,7 +232,9 @@ class InternVLModel:
         pixel_values = None
         num_patches_list: List[int] = []
         if all_images:
-            pixel_values, num_patches_list = self._images_to_pixel_values(
+            pixel_values, num_patches_list = self._images_to_pixel_values(
+                all_images, input_size=448, max_num=12
+            )
         if pixel_values is not None:
             # Convert dtype/device as in docs
             pixel_values = pixel_values.to(torch.bfloat16)
@@ -246,7 +280,9 @@ class InternVLModel:
                     num_patches_list=num_patches_list,
                 )
             else:
-                response = self.model.chat(
+                response = self.model.chat(
+                    self.tokenizer, pixel_values, question, generation_config
+                )
         except Exception as e:
             # Fallback: return empty string to avoid crashing the adapter
             return ""
agent/adapters/models/opencua.py
CHANGED
@@ -1,13 +1,18 @@
-from typing import List, Dict, Any
-import re
 import base64
+import re
 from io import BytesIO
+from typing import Any, Dict, List
 
 try:
+    import blobfile as _  # assert blobfile is installed
     import torch  # type: ignore
-    from transformers import AutoTokenizer, AutoModel, AutoImageProcessor  # type: ignore
     from PIL import Image  # type: ignore
-
+    from transformers import (  # type: ignore
+        AutoImageProcessor,
+        AutoModel,
+        AutoTokenizer,
+    )
+
     OPENCUA_AVAILABLE = True
 except Exception:
     OPENCUA_AVAILABLE = False
@@ -16,10 +21,12 @@ except Exception:
 class OpenCUAModel:
     """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
 
-    def __init__(
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not OPENCUA_AVAILABLE:
             raise ImportError(
-
+                'OpenCUA requirements not found. Install with: pip install "cua-agent[opencua-hf]"'
             )
         self.model_name = model_name
         self.device = device
@@ -56,7 +63,11 @@ class OpenCUAModel:
         return ""
 
     def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
-        assert
+        assert (
+            self.model is not None
+            and self.tokenizer is not None
+            and self.image_processor is not None
+        )
 
         # Tokenize text side using chat template
         input_ids = self.tokenizer.apply_chat_template(
@@ -74,7 +85,11 @@ class OpenCUAModel:
         pixel_values = torch.tensor(image_info["pixel_values"]).to(
             dtype=torch.bfloat16, device=self.model.device
         )
-        grid_thws =
+        grid_thws = (
+            torch.tensor(image_info["image_grid_thw"])
+            if "image_grid_thw" in image_info
+            else None
+        )
 
         gen_kwargs: Dict[str, Any] = {
             "max_new_tokens": max_new_tokens,
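The grid_thws change above wraps the optional image_grid_thw lookup in a conditional expression so processors that omit the key do not raise. A toy sketch of that optional-field guard follows; the image_info dict is fabricated sample data, not real processor output.

# Toy sketch of guarding an optional key before building a tensor; sample data only.
import torch

image_info = {"pixel_values": [[0.0, 0.5, 1.0]], "image_grid_thw": [[1, 2, 2]]}

pixel_values = torch.tensor(image_info["pixel_values"])
grid_thws = (
    torch.tensor(image_info["image_grid_thw"])
    if "image_grid_thw" in image_info
    else None
)
print(pixel_values.shape, None if grid_thws is None else grid_thws.shape)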
agent/adapters/models/qwen2_5_vl.py
CHANGED
@@ -1,9 +1,10 @@
-from typing import
+from typing import Any, Dict, List, Optional
 
 # Hugging Face imports are local to avoid hard dependency at module import
 try:
     import torch  # type: ignore
     from transformers import AutoModelForImageTextToText, AutoProcessor  # type: ignore
+
     HF_AVAILABLE = True
 except Exception:
     HF_AVAILABLE = False
@@ -14,10 +15,12 @@ class Qwen2_5_VLModel:
     Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
     """
 
-    def __init__(
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
         if not HF_AVAILABLE:
             raise ImportError(
-
+                'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
             )
         self.model_name = model_name
         self.device = device
@@ -64,7 +67,7 @@ class Qwen2_5_VLModel:
         generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
         # Trim prompt tokens from output
         generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         # Decode
         output_text = self.processor.batch_decode(
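The last hunk only reformats the slice (out_ids[len(in_ids) :]) that drops prompt tokens from each generated sequence before decoding. A toy illustration of that trimming step follows, using made-up token ids and no model or processor.

# Toy illustration of trimming prompt tokens from generated sequences; token ids are made up.
import torch

input_ids = [torch.tensor([101, 7592]), torch.tensor([101, 2129, 2024])]
generated_ids = [
    torch.tensor([101, 7592, 999, 102]),
    torch.tensor([101, 2129, 2024, 2204, 102]),
]

# Keep only the newly generated tail of each sequence, as in the diff.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
]
print([t.tolist() for t in generated_ids_trimmed])  # [[999, 102], [2204, 102]]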