cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/gta1.py
CHANGED

@@ -5,75 +5,80 @@ Code: https://github.com/Yan98/GTA1
 """
 
 import asyncio
+import base64
 import json
+import math
 import re
-import base64
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
-from io import BytesIO
 import uuid
-from PIL import Image
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+
 import litellm
-import math
+from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 SYSTEM_PROMPT = """
 You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.
 
 Output the coordinate pair exactly:
 (x,y)
-"""
+""".strip()
+
 
 def extract_coordinates(raw_string: str) -> Tuple[float, float]:
     """Extract coordinates from model output."""
     try:
         matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
-        return tuple(map(float, matches[0]))
+        return tuple(map(float, matches[0]))  # type: ignore
     except:
         return (0.0, 0.0)
 
-def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360) -> Tuple[int, int]:
+
+def smart_resize(
+    height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
+) -> Tuple[int, int]:
     """Smart resize function similar to qwen_vl_utils."""
     # Calculate the total pixels
     total_pixels = height * width
-
+
     # If already within bounds, return original dimensions
     if min_pixels <= total_pixels <= max_pixels:
         # Round to nearest factor
         new_height = (height // factor) * factor
         new_width = (width // factor) * factor
         return new_height, new_width
-
+
     # Calculate scaling factor
     if total_pixels > max_pixels:
         scale = (max_pixels / total_pixels) ** 0.5
     else:
         scale = (min_pixels / total_pixels) ** 0.5
-
+
     # Apply scaling
     new_height = int(height * scale)
     new_width = int(width * scale)
-
+
     # Round to nearest factor
     new_height = (new_height // factor) * factor
     new_width = (new_width // factor) * factor
-
+
     # Ensure minimum size
     new_height = max(new_height, factor)
     new_width = max(new_width, factor)
-
+
     return new_height, new_width
 
+
 @register_agent(models=r".*GTA1.*")
 class GTA1Config(AsyncAgentConfig):
     """GTA1 agent configuration implementing AsyncAgentConfig protocol for click prediction."""
-
+
     def __init__(self):
         self.current_model = None
         self.last_screenshot_b64 = None
-
 
     async def predict_step(
         self,

@@ -87,25 +92,21 @@ class GTA1Config(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         raise NotImplementedError()
 
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[float, float]]:
         """
         Predict click coordinates using GTA1 model via litellm.acompletion.
-
+
         Args:
             model: The GTA1 model name
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """

@@ -113,66 +114,62 @@
         image_data = base64.b64decode(image_b64)
         image = Image.open(BytesIO(image_data))
         width, height = image.width, image.height
-
+
         # Smart resize the image (similar to qwen_vl_utils)
         resized_height, resized_width = smart_resize(
-            height, width,
+            height,
+            width,
             factor=28,  # Default factor for Qwen models
             min_pixels=3136,
-            max_pixels=4096 * 2160
+            max_pixels=4096 * 2160,
         )
         resized_image = image.resize((resized_width, resized_height))
         scale_x, scale_y = width / resized_width, height / resized_height
-
+
         # Convert resized image back to base64
         buffered = BytesIO()
         resized_image.save(buffered, format="PNG")
         resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
-
+
         # Prepare system and user messages
         system_message = {
             "role": "system",
-            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width)
+            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width),
        }
-
+
        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
-                   "image_url": {
-                       "url": f"data:image/png;base64,{resized_image_b64}"
-                   }
+                   "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
-               {
-                   "type": "text",
-                   "text": instruction
-               }
-           ]
+               {"type": "text", "text": instruction},
+           ],
        }
-
+
        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
-           **kwargs
+           **kwargs,
        }
-
+
        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)
-
+
        # Extract response text
-       output_text = response.choices[0].message.content
-
+       output_text = response.choices[0].message.content  # type: ignore
+
        # Extract and rescale coordinates
-       pred_x, pred_y = extract_coordinates(output_text)
+       pred_x, pred_y = extract_coordinates(output_text)  # type: ignore
        pred_x *= scale_x
        pred_y *= scale_y
-
+
        return (math.floor(pred_x), math.floor(pred_y))
-
+
    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]
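Beneath the reformatting, the logic of predict_click is unchanged: downscale the screenshot with smart_resize, ask the model for a point on the resized image, then multiply the answer back by the per-axis scale factors. A minimal sketch of that round trip follows; smart_resize is condensed from the hunk above (same behavior, fewer lines), and the predicted point is a made-up stand-in for the output that extract_coordinates would parse.

import math
from typing import Tuple

def smart_resize(
    height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
) -> Tuple[int, int]:
    # Condensed from the diff above: keep total pixels within
    # [min_pixels, max_pixels], snapped to multiples of `factor`.
    total = height * width
    if min_pixels <= total <= max_pixels:
        return (height // factor) * factor, (width // factor) * factor
    scale = ((max_pixels if total > max_pixels else min_pixels) / total) ** 0.5
    new_h = max(int(height * scale) // factor * factor, factor)
    new_w = max(int(width * scale) // factor * factor, factor)
    return new_h, new_w

width, height = 5120, 2880                    # hypothetical screenshot size
resized_h, resized_w = smart_resize(height, width, max_pixels=4096 * 2160)
scale_x, scale_y = width / resized_w, height / resized_h

pred_x, pred_y = 800.0, 450.0                 # stand-in for the model's "(x,y)" answer
print(math.floor(pred_x * scale_x), math.floor(pred_y * scale_y))  # 1037 585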
agent/loops/holo.py
CHANGED

@@ -21,8 +21,8 @@ import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from .base import AsyncAgentConfig
 from ..types import AgentCapability
+from .base import AsyncAgentConfig
 
 
 def _strip_hf_prefix(model: str) -> str:

@@ -53,7 +53,9 @@ def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]:
     if image_processor is None:
         return image, (orig_w, orig_h)
 
-    factor = getattr(image_processor, "patch_size", 14) * getattr(image_processor, "merge_size", 1)
+    factor = getattr(image_processor, "patch_size", 14) * getattr(
+        image_processor, "merge_size", 1
+    )
     min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
     max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)
 
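The one substantive line reflowed here computes the dimension snap factor from the processor's patch_size times merge_size, with getattr fallbacks for processors that lack either attribute. A small sketch under that assumption; FakeProcessor is a hypothetical stand-in for a HuggingFace image processor, not part of the package.

# FakeProcessor is a hypothetical stand-in; getattr falls back to the
# defaults used in _maybe_smart_resize when an attribute is missing.
class FakeProcessor:
    patch_size = 14
    merge_size = 2  # e.g. Qwen-style processors merge 2x2 patches

image_processor = FakeProcessor()
factor = getattr(image_processor, "patch_size", 14) * getattr(
    image_processor, "merge_size", 1
)
min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)
print(factor, min_pixels, max_pixels)  # 28 65536 2359296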
agent/loops/internvl.py
CHANGED

@@ -18,13 +18,12 @@ import re
 from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple
 
-from PIL import Image
 import litellm
+from PIL import Image
 
 from ..decorators import register_agent
-from .composed_grounded import ComposedGroundedConfig
 from ..types import AgentCapability
-
+from .composed_grounded import ComposedGroundedConfig
 
 # Regex patterns for extracting coordinates
 # Accept optional whitespace and optional decimal fractions

@@ -91,7 +90,7 @@ class InternVLConfig(ComposedGroundedConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """Fallback to a self-composed model"""
         return await super().predict_step(

@@ -105,15 +104,11 @@ class InternVLConfig(ComposedGroundedConfig):
             _on_api_end=_on_api_end,
             _on_usage=_on_usage,
             _on_screenshot=_on_screenshot,
-            **kwargs
+            **kwargs,
         )
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using InternVL via litellm.acompletion.
agent/loops/moondream3.py
CHANGED

@@ -14,27 +14,28 @@ Differences from composed_grounded:
 
 from __future__ import annotations
 
-import uuid
 import base64
 import io
-
+import uuid
+from typing import Any, Dict, List, Optional, Tuple
 
-from PIL import Image, ImageDraw, ImageFont
 import litellm
+from PIL import Image, ImageDraw, ImageFont
 
 from ..decorators import register_agent
-from ..types import AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_computer_calls_xy2desc,
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
     convert_computer_calls_desc2xy,
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
     get_all_element_descriptions,
 )
+from ..types import AgentCapability
 
 _MOONDREAM_SINGLETON = None
 
+
 def get_moondream_model() -> Any:
     """Get a singleton instance of the Moondream3 preview model."""
     global _MOONDREAM_SINGLETON

@@ -42,6 +43,7 @@ def get_moondream_model() -> Any:
     try:
         import torch
         from transformers import AutoModelForCausalLM
+
         _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
             "moondream/moondream3-preview",
             trust_remote_code=True,

@@ -95,6 +97,7 @@ def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         filtered.append(msg_copy)
     return filtered
 
+
 def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
     """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
 

@@ -132,7 +135,12 @@ def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
         y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
         x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
         y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
-        left, top, right, bottom = int(x_min * W), int(y_min * H), int(x_max * W), int(y_max * H)
+        left, top, right, bottom = (
+            int(x_min * W),
+            int(y_min * H),
+            int(x_max * W),
+            int(y_max * H),
+        )
         left, top = max(0, left), max(0, top)
         right, bottom = min(W - 1, right), min(H - 1, bottom)
         crop = base_img.crop((left, top, right, bottom))

@@ -200,6 +208,7 @@ def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
     annotated_b64 = _image_to_b64(annotated)
     return annotated_b64, detected_names
 
+
 GROUNDED_COMPUTER_TOOL_SCHEMA = {
     "type": "function",
     "function": {

@@ -270,6 +279,7 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
     },
 }
 
+
 @register_agent(r"moondream3\+.*", priority=2)
 class Moondream3PlusConfig(AsyncAgentConfig):
     def __init__(self):

@@ -321,14 +331,25 @@ class Moondream3PlusConfig(AsyncAgentConfig):
                     "type": "message",
                     "role": "assistant",
                     "content": [
-                        {"type": "output_text", "text": "Taking a screenshot to analyze the current screen."}
+                        {
+                            "type": "output_text",
+                            "text": "Taking a screenshot to analyze the current screen.",
+                        }
                     ],
                 },
-                {"type": "computer_call", "call_id": call_id, "status": "completed", "action": {"type": "screenshot"}},
+                {
+                    "type": "computer_call",
+                    "call_id": call_id,
+                    "status": "completed",
+                    "action": {"type": "screenshot"},
+                },
                 {
                     "type": "computer_call_output",
                     "call_id": call_id,
-                    "output": {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
+                    "output": {
+                        "type": "input_image",
+                        "image_url": f"data:image/png;base64,{screenshot_b64}",
+                    },
                },
            ]
            last_image_b64 = screenshot_b64

@@ -354,13 +375,16 @@
                    "content": [
                        {"type": "input_text", "text": "Detected form UI elements on screen:"},
                        {"type": "input_text", "text": names_text},
-                       {"type": "input_text", "text": "Please continue with the next action needed to perform your task."},
+                       {
+                           "type": "input_text",
+                           "text": "Please continue with the next action needed to perform your task.",
+                       },
                    ],
                }
            )
 
            tool_schemas = []
-           for schema in tools:
+           for schema in tools or []:
                if schema.get("type") == "computer":
                    tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
                else: