cua-agent 0.4.11__tar.gz → 0.4.12__tar.gz
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of cua-agent might be problematic; see the registry's advisory for more details.
- {cua_agent-0.4.11 → cua_agent-0.4.12}/PKG-INFO +10 -1
- {cua_agent-0.4.11 → cua_agent-0.4.12}/README.md +1 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/__init__.py +2 -1
- cua_agent-0.4.12/agent/loops/glm45v.py +902 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/pyproject.toml +10 -1
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/__init__.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/__main__.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/adapters/__init__.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/adapters/huggingfacelocal_adapter.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/agent.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/callbacks/__init__.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/callbacks/base.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/callbacks/budget_manager.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/callbacks/image_retention.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/callbacks/logging.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/callbacks/pii_anonymization.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/callbacks/telemetry.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/callbacks/trajectory_saver.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/cli.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/computer_handler.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/decorators.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/anthropic.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/base.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/composed_grounded.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/gta1.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/model_types.csv +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/omniparser.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/openai.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/uitars.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/responses.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/telemetry.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/types.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/ui/__init__.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/ui/__main__.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/ui/gradio/__init__.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/ui/gradio/app.py +0 -0
- {cua_agent-0.4.11 → cua_agent-0.4.12}/agent/ui/gradio/ui_components.py +0 -0
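
The substantive change in 0.4.12 is a new GLM-4.5V agent loop (agent/loops/glm45v.py) plus a matching glm45v-hf extra. A minimal usage sketch follows; it assumes the ComputerAgent and Computer entry points described in the cua-agent and cua-computer READMEs, and the model string is illustrative only, not taken from this diff:

```python
# Sketch only: the ComputerAgent/Computer API is assumed from the cua READMEs,
# and the model name below is illustrative.
import asyncio
from agent import ComputerAgent
from computer import Computer

async def main():
    async with Computer(os_type="linux") as computer:
        agent = ComputerAgent(
            model="openrouter/z-ai/glm-4.5v",  # any name containing "GLM-4.5V" selects the new loop
            tools=[computer],
        )
        async for result in agent.run("Open the file manager and create a folder named demo"):
            print(result)

asyncio.run(main())
```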
{cua_agent-0.4.11 → cua_agent-0.4.12}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.11
+Version: 0.4.12
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.11

@@ -25,16 +25,24 @@ Provides-Extra: uitars
 Provides-Extra: uitars-mlx
 Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "uitars-mlx"
 Provides-Extra: uitars-hf
+Requires-Dist: accelerate; extra == "uitars-hf"
+Requires-Dist: torch; extra == "uitars-hf"
 Requires-Dist: transformers>=4.54.0; extra == "uitars-hf"
 Provides-Extra: ui
 Requires-Dist: gradio>=5.23.3; extra == "ui"
 Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
 Provides-Extra: cli
 Requires-Dist: yaspin>=3.1.0; extra == "cli"
+Provides-Extra: glm45v-hf
+Requires-Dist: accelerate; extra == "glm45v-hf"
+Requires-Dist: torch; extra == "glm45v-hf"
+Requires-Dist: transformers-v4.55.0-GLM-4.5V-preview; extra == "glm45v-hf"
 Provides-Extra: all
 Requires-Dist: ultralytics>=8.0.0; extra == "all"
 Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
 Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
+Requires-Dist: accelerate; extra == "all"
+Requires-Dist: torch; extra == "all"
 Requires-Dist: transformers>=4.54.0; extra == "all"
 Requires-Dist: gradio>=5.23.3; extra == "all"
 Requires-Dist: python-dotenv>=1.0.1; extra == "all"

@@ -80,6 +88,7 @@ pip install "cua-agent[omni]" # Omniparser + any LLM support
 pip install "cua-agent[uitars]" # UI-TARS
 pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support
 pip install "cua-agent[uitars-hf]" # UI-TARS + Huggingface support
+pip install "cua-agent[glm45v-hf]" # GLM-4.5V + Huggingface support
 pip install "cua-agent[ui]" # Gradio UI support
 ```

{cua_agent-0.4.11 → cua_agent-0.4.12}/README.md

@@ -37,6 +37,7 @@ pip install "cua-agent[omni]" # Omniparser + any LLM support
 pip install "cua-agent[uitars]" # UI-TARS
 pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support
 pip install "cua-agent[uitars-hf]" # UI-TARS + Huggingface support
+pip install "cua-agent[glm45v-hf]" # GLM-4.5V + Huggingface support
 pip install "cua-agent[ui]" # Gradio UI support
 ```

{cua_agent-0.4.11 → cua_agent-0.4.12}/agent/loops/__init__.py

@@ -9,5 +9,6 @@ from . import uitars
 from . import omniparser
 from . import gta1
 from . import composed_grounded
+from . import glm45v

-__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded"]
+__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded", "glm45v"]

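The new module registers itself through the same register_agent decorator as the other loops; the regex below is copied from the @register_agent call in glm45v.py (shown in full in the next diff), and the test strings are just examples of model names it would or would not match:

```python
import re

GLM_PATTERN = r"(?i).*GLM-4\.5V.*"  # copied from @register_agent in glm45v.py
for name in ("zai-org/GLM-4.5V", "openrouter/z-ai/glm-4.5v", "gpt-4o"):
    print(name, bool(re.match(GLM_PATTERN, name)))
# zai-org/GLM-4.5V True
# openrouter/z-ai/glm-4.5v True
# gpt-4o False
```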
cua_agent-0.4.12/agent/loops/glm45v.py (new file)

@@ -0,0 +1,902 @@
+"""
+GLM-4.5V agent loop implementation using liteLLM for GLM-4.5V model.
+Supports vision-language models for computer control with bounding box parsing.
+"""
+
+import asyncio
+import json
+import base64
+import re
+from typing import Dict, List, Any, Optional, Tuple
+from io import BytesIO
+from PIL import Image
+import litellm
+from litellm.types.utils import ModelResponse
+from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
+
+from ..decorators import register_agent
+from ..types import Messages, AgentResponse, Tools, AgentCapability
+from ..loops.base import AsyncAgentConfig
+from ..responses import (
+    convert_responses_items_to_completion_messages,
+    convert_completion_messages_to_responses_items,
+    make_reasoning_item,
+    make_output_text_item,
+    make_click_item,
+    make_double_click_item,
+    make_drag_item,
+    make_keypress_item,
+    make_scroll_item,
+    make_type_item,
+    make_wait_item,
+    make_input_image_item
+)
+
+# GLM-4.5V specific constants
+GLM_ACTION_SPACE = """
+### {left,right,middle}_click
+
+Call rule: `{left,right,middle}_click(start_box='[x,y]', element_info='')`
+{
+    'name': ['left_click', 'right_click', 'middle_click'],
+    'description': 'Perform a left/right/middle mouse click at the specified coordinates on the screen.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Coordinates [x,y] where to perform the click, normalized to 0-999 range.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being clicked.'
+            }
+        },
+        'required': ['start_box']
+    }
+}
+
+### hover
+
+Call rule: `hover(start_box='[x,y]', element_info='')`
+{
+    'name': 'hover',
+    'description': 'Move the mouse pointer to the specified coordinates without performing any click action.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Coordinates [x,y] where to move the mouse pointer, normalized to 0-999 range.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being hovered over.'
+            }
+        },
+        'required': ['start_box']
+    }
+}
+
+### left_double_click
+
+Call rule: `left_double_click(start_box='[x,y]', element_info='')`
+{
+    'name': 'left_double_click',
+    'description': 'Perform a left mouse double-click at the specified coordinates on the screen.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Coordinates [x,y] where to perform the double-click, normalized to 0-999 range.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being double-clicked.'
+            }
+        },
+        'required': ['start_box']
+    }
+}
+
+### left_drag
+
+Call rule: `left_drag(start_box='[x1,y1]', end_box='[x2,y2]', element_info='')`
+{
+    'name': 'left_drag',
+    'description': 'Drag the mouse from starting coordinates to ending coordinates while holding the left mouse button.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Starting coordinates [x1,y1] for the drag operation, normalized to 0-999 range.'
+            },
+            'end_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Ending coordinates [x2,y2] for the drag operation, normalized to 0-999 range.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being dragged.'
+            }
+        },
+        'required': ['start_box', 'end_box']
+    }
+}
+
+### key
+
+Call rule: `key(keys='')`
+{
+    'name': 'key',
+    'description': 'Simulate pressing a single key or combination of keys on the keyboard.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'keys': {
+                'type': 'string',
+                'description': 'The key or key combination to press. Use '+' to separate keys in combinations (e.g., 'ctrl+c', 'alt+tab').'
+            }
+        },
+        'required': ['keys']
+    }
+}
+
+### type
+
+Call rule: `type(content='')`
+{
+    'name': 'type',
+    'description': 'Type text content into the currently focused text input field. This action only performs typing and does not handle field activation or clearing.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'content': {
+                'type': 'string',
+                'description': 'The text content to be typed into the active text field.'
+            }
+        },
+        'required': ['content']
+    }
+}
+
+### scroll
+
+Call rule: `scroll(start_box='[x,y]', direction='', step=5, element_info='')`
+{
+    'name': 'scroll',
+    'description': 'Scroll an element at the specified coordinates in the specified direction by a given number of wheel steps.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Coordinates [x,y] of the element or area to scroll, normalized to 0-999 range.'
+            },
+            'direction': {
+                'type': 'string',
+                'enum': ['down', 'up'],
+                'description': 'The direction to scroll: 'down' or 'up'.'
+            },
+            'step': {
+                'type': 'integer',
+                'default': 5,
+                'description': 'Number of wheel steps to scroll, default is 5.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being scrolled.'
+            }
+        },
+        'required': ['start_box', 'direction']
+    }
+}
+
+### WAIT
+
+Call rule: `WAIT()`
+{
+    'name': 'WAIT',
+    'description': 'Wait for 5 seconds before proceeding to the next action.',
+    'parameters': {
+        'type': 'object',
+        'properties': {},
+        'required': []
+    }
+}
+
+### DONE
+
+Call rule: `DONE()`
+{
+    'name': 'DONE',
+    'description': 'Indicate that the current task has been completed successfully and no further actions are needed.',
+    'parameters': {
+        'type': 'object',
+        'properties': {},
+        'required': []
+    }
+}
+
+### FAIL
+
+Call rule: `FAIL()`
+{
+    'name': 'FAIL',
+    'description': 'Indicate that the current task cannot be completed or is impossible to accomplish.',
+    'parameters': {
+        'type': 'object',
+        'properties': {},
+        'required': []
+    }
+}"""
+
+def encode_image_to_base64(image_path: str) -> str:
+    """Encode image file to base64 string with data URI."""
+    with open(image_path, "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+    return f"data:image/png;base64,{encoded_string}"
+
+def parse_glm_response(response: str) -> Dict[str, Any]:
+    """
+    Parse GLM-4.5V response to extract action and memory.
+
+    The special tokens <|begin_of_box|> and <|end_of_box|> mark bounding boxes.
+    Coordinates are normalized values between 0 and 1000.
+    """
+    # Extract action from between special tokens
+    pattern = r"<\|begin_of_box\|>(.*?)<\|end_of_box\|>"
+    match = re.search(pattern, response)
+    if match:
+        action = match.group(1).strip()
+    else:
+        # Fallback: look for function call patterns
+        action_pattern = r"[\w_]+\([^)]*\)"
+        matches = re.findall(action_pattern, response)
+        action = matches[0] if matches else None
+
+    # Extract memory section
+    memory_pattern = r"Memory:(.*?)$"
+    memory_match = re.search(memory_pattern, response, re.DOTALL)
+    memory = memory_match.group(1).strip() if memory_match else "[]"
+
+    # Extract action text (everything before Memory:)
+    action_text_pattern = r'^(.*?)Memory:'
+    action_text_match = re.search(action_text_pattern, response, re.DOTALL)
+    action_text = action_text_match.group(1).strip() if action_text_match else response
+
+    # Clean up action text by removing special tokens
+    if action_text:
+        action_text = action_text.replace("<|begin_of_box|>", "").replace("<|end_of_box|>", "")
+
+    return {
+        "action": action,
+        "action_text": action_text,
+        "memory": memory
+    }
+
+def get_last_image_from_messages(messages: Messages) -> Optional[str]:
+    """Extract the last image from messages for processing."""
+    for message in reversed(messages):
+        if isinstance(message, dict):
+            if message.get("type") == "computer_call_output":
+                output = message.get("output", {})
+                if isinstance(output, dict) and output.get("type") == "input_image":
+                    image_url = output.get("image_url", "")
+                    if isinstance(image_url, str) and image_url.startswith("data:image/"):
+                        # Extract base64 part
+                        return image_url.split(",", 1)[1]
+            elif message.get("role") == "user":
+                content = message.get("content", [])
+                if isinstance(content, list):
+                    for item in reversed(content):
+                        if isinstance(item, dict) and item.get("type") == "image_url":
+                            image_url_obj = item.get("image_url", {})
+                            if isinstance(image_url_obj, dict):
+                                image_url = image_url_obj.get("url", "")
+                                if isinstance(image_url, str) and image_url.startswith("data:image/"):
+                                    return image_url.split(",", 1)[1]
+    return None
+
+def convert_responses_items_to_glm45v_pc_prompt(messages: Messages, task: str, memory: str = "") -> List[Dict[str, Any]]:
+    """Convert responses items to GLM-4.5V PC prompt format with historical actions.
+
+    Args:
+        messages: List of message items from the conversation
+        task: The task description
+        memory: Current memory state
+
+    Returns:
+        List of content items for the prompt (text and image_url items)
+    """
+    action_space = GLM_ACTION_SPACE
+
+    # Template head
+    head_text = f"""You are a GUI Agent, and your primary task is to respond accurately to user requests or questions. In addition to directly answering the user's queries, you can also use tools or perform GUI operations directly until you fulfill the user's request or provide a correct answer. You should carefully read and understand the images and questions provided by the user, and engage in thinking and reflection when appropriate. The coordinates involved are all represented in thousandths (0-999).
+
+# Task:
+{task}
+
+# Task Platform
+Ubuntu
+
+# Action Space
+{action_space}
+
+# Historical Actions and Current Memory
+History:"""
+
+    # Template tail
+    tail_text = f"""
+Memory:
+{memory}
+# Output Format
+Plain text explanation with action(param='...')
+Memory:
+[{{"key": "value"}}, ...]
+
+# Some Additional Notes
+- I'll give you the most recent 4 history screenshots(shrunked to 50%*50%) along with the historical action steps.
+- You should put the key information you *have to remember* in a seperated memory part and I'll give it to you in the next round. The content in this part should be a dict list. If you no longer need some given information, you should remove it from the memory. Even if you don't need to remember anything, you should also output an empty list.
+- My computer's password is "password", feel free to use it when you need sudo rights.
+- For the thunderbird account "anonym-x2024@outlook.com", the password is "gTCI";=@y7|QJ0nDa_kN3Sb&>".
+
+Current Screenshot:
+"""
+
+    # Build history from messages
+    history = []
+    history_images = []
+
+    # Group messages into steps
+    current_step = []
+    step_num = 0
+
+    for message in messages:
+        msg_type = message.get("type")
+
+        if msg_type == "reasoning":
+            current_step.append(message)
+        elif msg_type == "message" and message.get("role") == "assistant":
+            current_step.append(message)
+        elif msg_type == "computer_call":
+            current_step.append(message)
+        elif msg_type == "computer_call_output":
+            current_step.append(message)
+            # End of step - process it
+            if current_step:
+                step_num += 1
+
+                # Extract bot thought from message content
+                bot_thought = ""
+                for item in current_step:
+                    if item.get("type") == "message" and item.get("role") == "assistant":
+                        content = item.get("content", [])
+                        for content_item in content:
+                            if content_item.get("type") == "output_text":
+                                bot_thought = content_item.get("text", "")
+                                break
+                        break
+
+                # Extract action from computer_call
+                action_text = ""
+                for item in current_step:
+                    if item.get("type") == "computer_call":
+                        action = item.get("action", {})
+                        action_type = action.get("type", "")
+
+                        if action_type == "click":
+                            x, y = action.get("x", 0), action.get("y", 0)
+                            # Convert to 0-999 range (assuming screen dimensions)
+                            # For now, use direct coordinates - this may need adjustment
+                            action_text = f"left_click(start_box='[{x},{y}]')"
+                        elif action_type == "double_click":
+                            x, y = action.get("x", 0), action.get("y", 0)
+                            action_text = f"left_double_click(start_box='[{x},{y}]')"
+                        elif action_type == "right_click":
+                            x, y = action.get("x", 0), action.get("y", 0)
+                            action_text = f"right_click(start_box='[{x},{y}]')"
+                        elif action_type == "drag":
+                            # Handle drag with path
+                            path = action.get("path", [])
+                            if len(path) >= 2:
+                                start = path[0]
+                                end = path[-1]
+                                action_text = f"left_drag(start_box='[{start.get('x', 0)},{start.get('y', 0)}]', end_box='[{end.get('x', 0)},{end.get('y', 0)}]')"
+                        elif action_type == "keypress":
+                            key = action.get("key", "")
+                            action_text = f"key(keys='{key}')"
+                        elif action_type == "type":
+                            text = action.get("text", "")
+                            action_text = f"type(content='{text}')"
+                        elif action_type == "scroll":
+                            x, y = action.get("x", 0), action.get("y", 0)
+                            direction = action.get("direction", "down")
+                            action_text = f"scroll(start_box='[{x},{y}]', direction='{direction}')"
+                        elif action_type == "wait":
+                            action_text = "WAIT()"
+                        break
+
+                # Extract screenshot from computer_call_output
+                screenshot_url = None
+                for item in current_step:
+                    if item.get("type") == "computer_call_output":
+                        output = item.get("output", {})
+                        if output.get("type") == "input_image":
+                            screenshot_url = output.get("image_url", "")
+                        break
+
+                # Store step info
+                step_info = {
+                    "step_num": step_num,
+                    "bot_thought": bot_thought,
+                    "action_text": action_text,
+                    "screenshot_url": screenshot_url
+                }
+                history.append(step_info)
+
+                # Store screenshot for last 4 steps
+                if screenshot_url:
+                    history_images.append(screenshot_url)
+
+                current_step = []
+
+    # Build content array with head, history, and tail
+    content = []
+    current_text = head_text
+
+    total_history_steps = len(history)
+    history_image_count = min(4, len(history_images))  # Last 4 images
+
+    for step_idx, step_info in enumerate(history):
+        step_num = step_info["step_num"]
+        bot_thought = step_info["bot_thought"]
+        action_text = step_info["action_text"]
+
+        if step_idx < total_history_steps - history_image_count:
+            # For steps beyond the last 4, use text placeholder
+            current_text += f"\nstep {step_num}: Screenshot:(Omitted in context.) Thought: {bot_thought}\nAction: {action_text}"
+        else:
+            # For the last 4 steps, insert images
+            current_text += f"\nstep {step_num}: Screenshot:"
+            content.append({"type": "text", "text": current_text})
+
+            # Add image
+            img_idx = step_idx - (total_history_steps - history_image_count)
+            if img_idx < len(history_images):
+                content.append({"type": "image_url", "image_url": {"url": history_images[img_idx]}})
+
+            current_text = f" Thought: {bot_thought}\nAction: {action_text}"
+
+    # Add tail
+    current_text += tail_text
+    content.append({"type": "text", "text": current_text})
+
+    return content
+
+def model_dump(obj) -> Dict[str, Any]:
+    if isinstance(obj, dict):
+        return {k: model_dump(v) for k, v in obj.items()}
+    elif hasattr(obj, "model_dump"):
+        return obj.model_dump()
+    else:
+        return obj
+
+def convert_glm_completion_to_responses_items(response: ModelResponse, image_width: int, image_height: int) -> List[Dict[str, Any]]:
+    """
+    Convert GLM-4.5V completion response to responses items format.
+
+    Args:
+        response: LiteLLM ModelResponse from GLM-4.5V
+        image_width: Original image width for coordinate scaling
+        image_height: Original image height for coordinate scaling
+
+    Returns:
+        List of response items in the proper format
+    """
+    import uuid
+
+    response_items = []
+
+    if not response.choices or not response.choices[0].message:
+        return response_items
+
+    message = response.choices[0].message
+    content = message.content or ""
+    reasoning_content = getattr(message, 'reasoning_content', None)
+
+    # Add reasoning item if present
+    if reasoning_content:
+        reasoning_item = model_dump(make_reasoning_item(reasoning_content))
+        response_items.append(reasoning_item)
+
+    # Parse the content to extract action and text
+    parsed_response = parse_glm_response(content)
+    action = parsed_response.get("action", "")
+    action_text = parsed_response.get("action_text", "")
+
+    # Add message item with text content (excluding action and memory)
+    if action_text:
+        # Remove action from action_text if it's there
+        clean_text = action_text
+        if action and action in clean_text:
+            clean_text = clean_text.replace(action, "").strip()
+
+        # Remove memory section
+        memory_pattern = r"Memory:\s*\[.*?\]\s*$"
+        clean_text = re.sub(memory_pattern, "", clean_text, flags=re.DOTALL).strip()
+
+        if clean_text:
+            message_item = model_dump(make_output_text_item(clean_text))
+            response_items.append(message_item)
+
+    # Convert action to computer call if present
+    if action:
+        call_id = f"call_{uuid.uuid4().hex[:8]}"
+
+        # Parse different action types and create appropriate computer calls
+        if action.startswith("left_click"):
+            coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+            if coord_match:
+                x, y = int(coord_match.group(1)), int(coord_match.group(2))
+                # Convert from 0-999 to actual pixel coordinates
+                actual_x = int((x / 999.0) * image_width)
+                actual_y = int((y / 999.0) * image_height)
+                computer_call = model_dump(make_click_item(actual_x, actual_y))
+                computer_call["call_id"] = call_id
+                computer_call["status"] = "completed"
+                response_items.append(computer_call)
+
+        elif action.startswith("right_click"):
+            coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+            if coord_match:
+                x, y = int(coord_match.group(1)), int(coord_match.group(2))
+                actual_x = int((x / 999.0) * image_width)
+                actual_y = int((y / 999.0) * image_height)
+                computer_call = model_dump(make_click_item(actual_x, actual_y, button="right"))
+                computer_call["call_id"] = call_id
+                computer_call["status"] = "completed"
+                response_items.append(computer_call)
+
+        elif action.startswith("left_double_click"):
+            coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+            if coord_match:
+                x, y = int(coord_match.group(1)), int(coord_match.group(2))
+                actual_x = int((x / 999.0) * image_width)
+                actual_y = int((y / 999.0) * image_height)
+                computer_call = model_dump(make_double_click_item(actual_x, actual_y))
+                computer_call["call_id"] = call_id
+                computer_call["status"] = "completed"
+                response_items.append(computer_call)
+
+        elif action.startswith("left_drag"):
+            start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+            end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action)
+            if start_match and end_match:
+                x1, y1 = int(start_match.group(1)), int(start_match.group(2))
+                x2, y2 = int(end_match.group(1)), int(end_match.group(2))
+                actual_x1 = int((x1 / 999.0) * image_width)
+                actual_y1 = int((y1 / 999.0) * image_height)
+                actual_x2 = int((x2 / 999.0) * image_width)
+                actual_y2 = int((y2 / 999.0) * image_height)
+                # Create path for drag operation
+                drag_path = [{"x": actual_x1, "y": actual_y1}, {"x": actual_x2, "y": actual_y2}]
+                computer_call = model_dump(make_drag_item(drag_path))
+                computer_call["call_id"] = call_id
+                computer_call["status"] = "completed"
+                response_items.append(computer_call)
+
+        elif action.startswith("key"):
+            key_match = re.search(r"keys='([^']+)'", action)
+            if key_match:
+                keys = key_match.group(1)
+                # Split keys by '+' for key combinations, or use as single key
+                key_list = keys.split('+') if '+' in keys else [keys]
+                computer_call = model_dump(make_keypress_item(key_list))
+                computer_call["call_id"] = call_id
+                computer_call["status"] = "completed"
+                response_items.append(computer_call)
+
+        elif action.startswith("type"):
+            content_match = re.search(r"content='([^']*)'", action)
+            if content_match:
+                content = content_match.group(1)
+                computer_call = model_dump(make_type_item(content))
+                computer_call["call_id"] = call_id
+                computer_call["status"] = "completed"
+                response_items.append(computer_call)
+
+        elif action.startswith("scroll"):
+            coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+            direction_match = re.search(r"direction='([^']+)'", action)
+            if coord_match and direction_match:
+                x, y = int(coord_match.group(1)), int(coord_match.group(2))
+                direction = direction_match.group(1)
+                actual_x = int((x / 999.0) * image_width)
+                actual_y = int((y / 999.0) * image_height)
+                # Convert direction to scroll amounts
+                scroll_x, scroll_y = 0, 0
+                if direction == "up":
+                    scroll_y = -5
+                elif direction == "down":
+                    scroll_y = 5
+                elif direction == "left":
+                    scroll_x = -5
+                elif direction == "right":
+                    scroll_x = 5
+                computer_call = model_dump(make_scroll_item(actual_x, actual_y, scroll_x, scroll_y))
+                computer_call["call_id"] = call_id
+                computer_call["status"] = "completed"
+                response_items.append(computer_call)
+
+        elif action == "WAIT()":
+            computer_call = model_dump(make_wait_item())
+            computer_call["call_id"] = call_id
+            computer_call["status"] = "completed"
+            response_items.append(computer_call)
+
+    return response_items
+
+@register_agent(models=r"(?i).*GLM-4\.5V.*")
+class Glm4vConfig(AsyncAgentConfig):
+    """GLM-4.5V agent configuration using liteLLM."""
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Predict the next step using GLM-4.5V model.
+
+        Args:
+            messages: Input messages following Responses format
+            model: Model name to use
+            tools: Optional list of tool schemas
+            max_retries: Maximum number of retries for API calls
+            stream: Whether to stream the response
+            computer_handler: Computer handler for taking screenshots
+            use_prompt_caching: Whether to use prompt caching
+            _on_api_start: Callback for API start
+            _on_api_end: Callback for API end
+            _on_usage: Callback for usage tracking
+            _on_screenshot: Callback for screenshot events
+
+        Returns:
+            Dict with "output" and "usage" keys
+        """
+        # Get the user instruction from the last user message
+        user_instruction = ""
+        for message in reversed(messages):
+            if isinstance(message, dict) and message.get("role") == "user":
+                content = message.get("content", "")
+                if isinstance(content, str):
+                    user_instruction = content
+                elif isinstance(content, list):
+                    for item in content:
+                        if isinstance(item, dict) and item.get("type") == "text":
+                            user_instruction = item.get("text", "")
+                            break
+                break
+
+        # Get the last image for processing
+        last_image_b64 = get_last_image_from_messages(messages)
+        if not last_image_b64 and computer_handler:
+            # Take a screenshot if no image available
+            screenshot_b64 = await computer_handler.screenshot()
+            if screenshot_b64:
+                last_image_b64 = screenshot_b64
+                if _on_screenshot:
+                    await _on_screenshot(screenshot_b64)
+
+        if not last_image_b64:
+            raise ValueError("No image available for GLM-4.5V processing")
+
+        # Convert responses items to GLM-4.5V PC prompt format with historical actions
+        prompt_content = convert_responses_items_to_glm45v_pc_prompt(
+            messages=messages,
+            task=user_instruction,
+            memory="[]"  # Initialize with empty memory for now
+        )
+
+        # Add the current screenshot to the end
+        prompt_content.append({
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{last_image_b64}"}
+        })
+
+        # Prepare messages for liteLLM
+        litellm_messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful GUI agent assistant."
+            },
+            {
+                "role": "user",
+                "content": prompt_content
+            }
+        ]
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": litellm_messages,
+            # "max_tokens": 2048,
+            # "temperature": 0.001,
+            # "extra_body": {
+            #     "skip_special_tokens": False,
+            # }
+        }
+
+        # Add API callbacks
+        if _on_api_start:
+            await _on_api_start(api_kwargs)
+
+        # Call liteLLM
+        response = await litellm.acompletion(**api_kwargs)
+
+        if _on_api_end:
+            await _on_api_end(api_kwargs, response)
+
+        # Get image dimensions for coordinate scaling
+        image_width, image_height = 1920, 1080  # Default dimensions
+
+        # Try to get actual dimensions from the image
+        try:
+            image_data = base64.b64decode(last_image_b64)
+            image = Image.open(BytesIO(image_data))
+            image_width, image_height = image.size
+        except Exception:
+            pass  # Use default dimensions
+
+        # Convert GLM completion response to responses items
+        response_items = convert_glm_completion_to_responses_items(response, image_width, image_height)
+
+        # Extract usage information
+        response_usage = {
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+            "response_cost": response._hidden_params.get("response_cost", 0.0),
+        }
+        if _on_usage:
+            await _on_usage(response_usage)
+
+        # Create agent response
+        agent_response = {
+            "output": response_items,
+            "usage": response_usage
+        }
+
+        return agent_response
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates using GLM-4.5V model.
+
+        Args:
+            model: Model name to use
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple with (x, y) coordinates or None
+        """
+        try:
+            # Create a simple click instruction prompt
+            click_prompt = f"""You are a GUI agent. Look at the screenshot and identify where to click for: {instruction}
+
+Respond with a single click action in this format:
+left_click(start_box='[x,y]')
+
+Where x,y are coordinates normalized to 0-999 range."""
+
+            # Prepare messages for liteLLM
+            litellm_messages = [
+                {
+                    "role": "system",
+                    "content": "You are a helpful GUI agent assistant."
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": click_prompt},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
+                    ]
+                }
+            ]
+
+            # Prepare API call kwargs
+            api_kwargs = {
+                "model": model,
+                "messages": litellm_messages,
+                "max_tokens": 100,
+                "temperature": 0.001,
+                "extra_body": {
+                    "skip_special_tokens": False,
+                }
+            }
+
+            # Call liteLLM
+            response = await litellm.acompletion(**api_kwargs)
+
+            # Extract response content
+            response_content = response.choices[0].message.content.strip()
+
+            # Parse response for click coordinates
+            # Look for coordinates in the response, handling special tokens
+            coord_pattern = r"<\|begin_of_box\|>.*?left_click\(start_box='?\[(\d+),(\d+)\]'?\).*?<\|end_of_box\|>"
+            match = re.search(coord_pattern, response_content)
+
+            if not match:
+                # Fallback: look for coordinates without special tokens
+                coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
+                match = re.search(coord_pattern, response_content)
+
+            if match:
+                x, y = int(match.group(1)), int(match.group(2))
+
+                # Get actual image dimensions for scaling
+                try:
+                    image_data = base64.b64decode(image_b64)
+                    image = Image.open(BytesIO(image_data))
+                    image_width, image_height = image.size
+                except Exception:
+                    # Use default dimensions
+                    image_width, image_height = 1920, 1080
+
+                # Convert from 0-999 normalized coordinates to actual pixel coordinates
+                actual_x = int((x / 999.0) * image_width)
+                actual_y = int((y / 999.0) * image_height)
+
+                return (actual_x, actual_y)
+
+            return None
+
+        except Exception as e:
+            # Log error and return None
+            print(f"Error in predict_click: {e}")
+            return None
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """
+        Get list of capabilities supported by this agent config.
+
+        Returns:
+            List of capability strings
+        """
+        return ["step", "click"]

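GLM-4.5V emits coordinates normalized to the 0-999 range, and the loop above rescales them to pixels before building computer calls. A standalone check of that mapping (the resolution values are examples):

```python
# Mirrors the conversion used in convert_glm_completion_to_responses_items.
def to_pixels(norm_x: int, norm_y: int, width: int, height: int) -> tuple:
    return int((norm_x / 999.0) * width), int((norm_y / 999.0) * height)

print(to_pixels(500, 500, 1920, 1080))  # (960, 540)
print(to_pixels(999, 999, 1920, 1080))  # (1920, 1080)
```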
{cua_agent-0.4.11 → cua_agent-0.4.12}/pyproject.toml

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"

 [project]
 name = "cua-agent"
-version = "0.4.11"
+version = "0.4.12"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [

@@ -40,6 +40,8 @@ uitars-mlx = [
     "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
 ]
 uitars-hf = [
+    "accelerate",
+    "torch",
     "transformers>=4.54.0",
 ]
 ui = [

@@ -49,10 +51,17 @@ ui = [
 cli = [
     "yaspin>=3.1.0",
 ]
+glm45v-hf = [
+    "accelerate",
+    "torch",
+    "transformers-v4.55.0-GLM-4.5V-preview",
+]
 all = [
     "ultralytics>=8.0.0",
     "cua-som>=0.1.0,<0.2.0",
     "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
+    "accelerate",
+    "torch",
     "transformers>=4.54.0",
     "gradio>=5.23.3",
     "python-dotenv>=1.0.1",