cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +32 -19
- agent/computers/cua.py +33 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +215 -210
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +510 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
- cua_agent-0.4.36.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py
CHANGED
|
@@ -5,100 +5,102 @@ Code: https://github.com/microsoft/OmniParser
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import asyncio
|
|
8
|
+
import base64
|
|
9
|
+
import inspect
|
|
8
10
|
import json
|
|
9
|
-
from typing import
|
|
11
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
|
|
12
|
+
|
|
10
13
|
import litellm
|
|
11
|
-
import inspect
|
|
12
|
-
import base64
|
|
13
14
|
|
|
14
15
|
from ..decorators import register_agent
|
|
15
|
-
from ..types import Messages, AgentResponse, Tools, AgentCapability
|
|
16
16
|
from ..loops.base import AsyncAgentConfig
|
|
17
|
+
from ..types import AgentCapability, AgentResponse, Messages, Tools
|
|
17
18
|
|
|
18
19
|
SOM_TOOL_SCHEMA = {
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
20
|
+
"type": "function",
|
|
21
|
+
"name": "computer",
|
|
22
|
+
"description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
|
|
23
|
+
"parameters": {
|
|
24
|
+
"type": "object",
|
|
25
|
+
"properties": {
|
|
26
|
+
"action": {
|
|
27
|
+
"type": "string",
|
|
28
|
+
"enum": [
|
|
29
|
+
"screenshot",
|
|
30
|
+
"click",
|
|
31
|
+
"double_click",
|
|
32
|
+
"drag",
|
|
33
|
+
"type",
|
|
34
|
+
"keypress",
|
|
35
|
+
"scroll",
|
|
36
|
+
"move",
|
|
37
|
+
"wait",
|
|
38
|
+
"get_current_url",
|
|
39
|
+
"get_dimensions",
|
|
40
|
+
"get_environment",
|
|
41
|
+
],
|
|
42
|
+
"description": "The action to perform",
|
|
43
|
+
},
|
|
44
|
+
"element_id": {
|
|
45
|
+
"type": "integer",
|
|
46
|
+
"description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
|
|
47
|
+
},
|
|
48
|
+
"start_element_id": {
|
|
49
|
+
"type": "integer",
|
|
50
|
+
"description": "The ID of the element to start dragging from (required for drag action)",
|
|
51
|
+
},
|
|
52
|
+
"end_element_id": {
|
|
53
|
+
"type": "integer",
|
|
54
|
+
"description": "The ID of the element to drag to (required for drag action)",
|
|
55
|
+
},
|
|
56
|
+
"text": {
|
|
57
|
+
"type": "string",
|
|
58
|
+
"description": "The text to type (required for type action)",
|
|
59
|
+
},
|
|
60
|
+
"keys": {
|
|
61
|
+
"type": "string",
|
|
62
|
+
"description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
|
|
63
|
+
},
|
|
64
|
+
"button": {
|
|
65
|
+
"type": "string",
|
|
66
|
+
"description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
|
|
67
|
+
},
|
|
68
|
+
"scroll_x": {
|
|
69
|
+
"type": "integer",
|
|
70
|
+
"description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
|
|
71
|
+
},
|
|
72
|
+
"scroll_y": {
|
|
73
|
+
"type": "integer",
|
|
74
|
+
"description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
|
|
75
|
+
},
|
|
76
|
+
},
|
|
77
|
+
"required": ["action"],
|
|
75
78
|
},
|
|
76
|
-
"required": [
|
|
77
|
-
"action"
|
|
78
|
-
]
|
|
79
|
-
}
|
|
80
79
|
}
|
|
81
80
|
|
|
82
81
|
OMNIPARSER_AVAILABLE = False
|
|
83
82
|
try:
|
|
84
83
|
from som import OmniParser
|
|
84
|
+
|
|
85
85
|
OMNIPARSER_AVAILABLE = True
|
|
86
86
|
except ImportError:
|
|
87
87
|
pass
|
|
88
88
|
OMNIPARSER_SINGLETON = None
|
|
89
89
|
|
|
90
|
+
|
|
90
91
|
def get_parser():
|
|
91
92
|
global OMNIPARSER_SINGLETON
|
|
92
93
|
if OMNIPARSER_SINGLETON is None:
|
|
93
94
|
OMNIPARSER_SINGLETON = OmniParser()
|
|
94
95
|
return OMNIPARSER_SINGLETON
|
|
95
|
-
|
|
96
|
+
|
|
97
|
+
|
|
96
98
|
def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
|
|
97
99
|
"""Get the last computer_call_output message from a messages list.
|
|
98
|
-
|
|
100
|
+
|
|
99
101
|
Args:
|
|
100
102
|
messages: List of messages to search through
|
|
101
|
-
|
|
103
|
+
|
|
102
104
|
Returns:
|
|
103
105
|
The last computer_call_output message dict, or None if not found
|
|
104
106
|
"""
|
|
@@ -107,11 +109,12 @@ def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Di
|
|
|
107
109
|
return message
|
|
108
110
|
return None
|
|
109
111
|
|
|
112
|
+
|
|
110
113
|
def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
|
|
111
114
|
"""Prepare tools for OpenAI API format"""
|
|
112
115
|
omniparser_tools = []
|
|
113
116
|
id2xy = dict()
|
|
114
|
-
|
|
117
|
+
|
|
115
118
|
for schema in tool_schemas:
|
|
116
119
|
if schema["type"] == "computer":
|
|
117
120
|
omniparser_tools.append(SOM_TOOL_SCHEMA)
|
|
@@ -122,72 +125,80 @@ def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[T
|
|
|
122
125
|
elif schema["type"] == "function":
|
|
123
126
|
# Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
|
|
124
127
|
# Schema should be: {type, name, description, parameters}
|
|
125
|
-
omniparser_tools.append({
|
|
126
|
-
|
|
128
|
+
omniparser_tools.append({"type": "function", **schema["function"]})
|
|
129
|
+
|
|
127
130
|
return omniparser_tools, id2xy
|
|
128
131
|
|
|
129
|
-
async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
|
|
130
|
-
item_type = item.get("type")
|
|
131
|
-
|
|
132
|
-
def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
|
|
133
|
-
if element_id is None:
|
|
134
|
-
return (None, None)
|
|
135
|
-
return id2xy.get(element_id, (None, None))
|
|
136
|
-
|
|
137
|
-
if item_type == "function_call":
|
|
138
|
-
fn_name = item.get("name")
|
|
139
|
-
fn_args = json.loads(item.get("arguments", "{}"))
|
|
140
|
-
|
|
141
|
-
item_id = item.get("id")
|
|
142
|
-
call_id = item.get("call_id")
|
|
143
|
-
|
|
144
|
-
if fn_name == "computer":
|
|
145
|
-
action = fn_args.get("action")
|
|
146
|
-
element_id = fn_args.get("element_id")
|
|
147
|
-
start_element_id = fn_args.get("start_element_id")
|
|
148
|
-
end_element_id = fn_args.get("end_element_id")
|
|
149
|
-
text = fn_args.get("text")
|
|
150
|
-
keys = fn_args.get("keys")
|
|
151
|
-
button = fn_args.get("button")
|
|
152
|
-
scroll_x = fn_args.get("scroll_x")
|
|
153
|
-
scroll_y = fn_args.get("scroll_y")
|
|
154
|
-
|
|
155
|
-
x, y = _get_xy(element_id)
|
|
156
|
-
start_x, start_y = _get_xy(start_element_id)
|
|
157
|
-
end_x, end_y = _get_xy(end_element_id)
|
|
158
|
-
|
|
159
|
-
action_args = {
|
|
160
|
-
"type": action,
|
|
161
|
-
"x": x,
|
|
162
|
-
"y": y,
|
|
163
|
-
"start_x": start_x,
|
|
164
|
-
"start_y": start_y,
|
|
165
|
-
"end_x": end_x,
|
|
166
|
-
"end_y": end_y,
|
|
167
|
-
"text": text,
|
|
168
|
-
"keys": keys,
|
|
169
|
-
"button": button,
|
|
170
|
-
"scroll_x": scroll_x,
|
|
171
|
-
"scroll_y": scroll_y
|
|
172
|
-
}
|
|
173
|
-
# Remove None values to keep the JSON clean
|
|
174
|
-
action_args = {k: v for k, v in action_args.items() if v is not None}
|
|
175
132
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
133
|
+
async def replace_function_with_computer_call(
|
|
134
|
+
item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]
|
|
135
|
+
):
|
|
136
|
+
item_type = item.get("type")
|
|
137
|
+
|
|
138
|
+
def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
|
|
139
|
+
if element_id is None:
|
|
140
|
+
return (None, None)
|
|
141
|
+
return id2xy.get(element_id, (None, None))
|
|
142
|
+
|
|
143
|
+
if item_type == "function_call":
|
|
144
|
+
fn_name = item.get("name")
|
|
145
|
+
fn_args = json.loads(item.get("arguments", "{}"))
|
|
146
|
+
|
|
147
|
+
item_id = item.get("id")
|
|
148
|
+
call_id = item.get("call_id")
|
|
183
149
|
|
|
184
|
-
|
|
150
|
+
if fn_name == "computer":
|
|
151
|
+
action = fn_args.get("action")
|
|
152
|
+
element_id = fn_args.get("element_id")
|
|
153
|
+
start_element_id = fn_args.get("start_element_id")
|
|
154
|
+
end_element_id = fn_args.get("end_element_id")
|
|
155
|
+
text = fn_args.get("text")
|
|
156
|
+
keys = fn_args.get("keys")
|
|
157
|
+
button = fn_args.get("button")
|
|
158
|
+
scroll_x = fn_args.get("scroll_x")
|
|
159
|
+
scroll_y = fn_args.get("scroll_y")
|
|
185
160
|
|
|
186
|
-
|
|
161
|
+
x, y = _get_xy(element_id)
|
|
162
|
+
start_x, start_y = _get_xy(start_element_id)
|
|
163
|
+
end_x, end_y = _get_xy(end_element_id)
|
|
164
|
+
|
|
165
|
+
action_args = {
|
|
166
|
+
"type": action,
|
|
167
|
+
"x": x,
|
|
168
|
+
"y": y,
|
|
169
|
+
"start_x": start_x,
|
|
170
|
+
"start_y": start_y,
|
|
171
|
+
"end_x": end_x,
|
|
172
|
+
"end_y": end_y,
|
|
173
|
+
"text": text,
|
|
174
|
+
"keys": keys,
|
|
175
|
+
"button": button,
|
|
176
|
+
"scroll_x": scroll_x,
|
|
177
|
+
"scroll_y": scroll_y,
|
|
178
|
+
}
|
|
179
|
+
# Remove None values to keep the JSON clean
|
|
180
|
+
action_args = {k: v for k, v in action_args.items() if v is not None}
|
|
181
|
+
|
|
182
|
+
return [
|
|
183
|
+
{
|
|
184
|
+
"type": "computer_call",
|
|
185
|
+
"action": action_args,
|
|
186
|
+
"id": item_id,
|
|
187
|
+
"call_id": call_id,
|
|
188
|
+
"status": "completed",
|
|
189
|
+
}
|
|
190
|
+
]
|
|
191
|
+
|
|
192
|
+
return [item]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
async def replace_computer_call_with_function(
|
|
196
|
+
item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]
|
|
197
|
+
):
|
|
187
198
|
"""
|
|
188
199
|
Convert computer_call back to function_call format.
|
|
189
200
|
Also handles computer_call_output -> function_call_output conversion.
|
|
190
|
-
|
|
201
|
+
|
|
191
202
|
Args:
|
|
192
203
|
item: The item to convert
|
|
193
204
|
xy2id: Mapping from (x, y) coordinates to element IDs
|
|
@@ -202,12 +213,12 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
|
|
|
202
213
|
|
|
203
214
|
if item_type == "computer_call":
|
|
204
215
|
action_data = item.get("action", {})
|
|
205
|
-
|
|
216
|
+
|
|
206
217
|
# Extract coordinates and convert back to element IDs
|
|
207
218
|
element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
|
|
208
219
|
start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
|
|
209
220
|
end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
|
|
210
|
-
|
|
221
|
+
|
|
211
222
|
# Build function arguments
|
|
212
223
|
fn_args = {
|
|
213
224
|
"action": action_data.get("type"),
|
|
@@ -218,33 +229,38 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
|
|
|
218
229
|
"keys": action_data.get("keys"),
|
|
219
230
|
"button": action_data.get("button"),
|
|
220
231
|
"scroll_x": action_data.get("scroll_x"),
|
|
221
|
-
"scroll_y": action_data.get("scroll_y")
|
|
232
|
+
"scroll_y": action_data.get("scroll_y"),
|
|
222
233
|
}
|
|
223
|
-
|
|
234
|
+
|
|
224
235
|
# Remove None values to keep the JSON clean
|
|
225
236
|
fn_args = {k: v for k, v in fn_args.items() if v is not None}
|
|
226
|
-
|
|
227
|
-
return [
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
237
|
+
|
|
238
|
+
return [
|
|
239
|
+
{
|
|
240
|
+
"type": "function_call",
|
|
241
|
+
"name": "computer",
|
|
242
|
+
"arguments": json.dumps(fn_args),
|
|
243
|
+
"id": item.get("id"),
|
|
244
|
+
"call_id": item.get("call_id"),
|
|
245
|
+
"status": "completed",
|
|
246
|
+
}
|
|
247
|
+
]
|
|
248
|
+
|
|
239
249
|
elif item_type == "computer_call_output":
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
250
|
+
output = item.get("output")
|
|
251
|
+
|
|
252
|
+
if isinstance(output, dict):
|
|
253
|
+
output = [output]
|
|
254
|
+
|
|
255
|
+
return [
|
|
256
|
+
{
|
|
257
|
+
"type": "function_call_output",
|
|
258
|
+
"call_id": item.get("call_id"),
|
|
259
|
+
"output": output,
|
|
260
|
+
"id": item.get("id"),
|
|
261
|
+
"status": "completed",
|
|
262
|
+
}
|
|
263
|
+
]
|
|
248
264
|
|
|
249
265
|
return [item]
|
|
250
266
|
|
|
@@ -252,7 +268,7 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
|
|
|
252
268
|
@register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
|
|
253
269
|
class OmniparserConfig(AsyncAgentConfig):
|
|
254
270
|
"""Omniparser agent configuration implementing AsyncAgentConfig protocol."""
|
|
255
|
-
|
|
271
|
+
|
|
256
272
|
async def predict_step(
|
|
257
273
|
self,
|
|
258
274
|
messages: List[Dict[str, Any]],
|
|
@@ -266,25 +282,27 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
266
282
|
_on_api_end=None,
|
|
267
283
|
_on_usage=None,
|
|
268
284
|
_on_screenshot=None,
|
|
269
|
-
**kwargs
|
|
285
|
+
**kwargs,
|
|
270
286
|
) -> Dict[str, Any]:
|
|
271
287
|
"""
|
|
272
288
|
OpenAI computer-use-preview agent loop using liteLLM responses.
|
|
273
|
-
|
|
289
|
+
|
|
274
290
|
Supports OpenAI's computer use preview models.
|
|
275
291
|
"""
|
|
276
292
|
if not OMNIPARSER_AVAILABLE:
|
|
277
|
-
raise ValueError(
|
|
278
|
-
|
|
293
|
+
raise ValueError(
|
|
294
|
+
"omniparser loop requires som to be installed. Install it with `pip install cua-som`."
|
|
295
|
+
)
|
|
296
|
+
|
|
279
297
|
tools = tools or []
|
|
280
|
-
|
|
281
|
-
llm_model = model.split(
|
|
298
|
+
|
|
299
|
+
llm_model = model.split("+")[-1]
|
|
282
300
|
|
|
283
301
|
# Prepare tools for OpenAI API
|
|
284
302
|
openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
|
|
285
303
|
|
|
286
304
|
# Find last computer_call_output
|
|
287
|
-
last_computer_call_output = get_last_computer_call_output(messages)
|
|
305
|
+
last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
|
|
288
306
|
if last_computer_call_output:
|
|
289
307
|
image_url = last_computer_call_output.get("output", {}).get("image_url", "")
|
|
290
308
|
image_data = image_url.split(",")[-1]
|
|
@@ -294,14 +312,17 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
294
312
|
if _on_screenshot:
|
|
295
313
|
await _on_screenshot(result.annotated_image_base64, "annotated_image")
|
|
296
314
|
for element in result.elements:
|
|
297
|
-
id2xy[element.id] = (
|
|
298
|
-
|
|
315
|
+
id2xy[element.id] = (
|
|
316
|
+
(element.bbox.x1 + element.bbox.x2) / 2,
|
|
317
|
+
(element.bbox.y1 + element.bbox.y2) / 2,
|
|
318
|
+
)
|
|
319
|
+
|
|
299
320
|
# handle computer calls -> function calls
|
|
300
321
|
new_messages = []
|
|
301
322
|
for message in messages:
|
|
302
323
|
if not isinstance(message, dict):
|
|
303
324
|
message = message.__dict__
|
|
304
|
-
new_messages += await replace_computer_call_with_function(message, id2xy)
|
|
325
|
+
new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
|
|
305
326
|
messages = new_messages
|
|
306
327
|
|
|
307
328
|
# Prepare API call kwargs
|
|
@@ -312,13 +333,13 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
312
333
|
"stream": stream,
|
|
313
334
|
"truncation": "auto",
|
|
314
335
|
"num_retries": max_retries,
|
|
315
|
-
**kwargs
|
|
336
|
+
**kwargs,
|
|
316
337
|
}
|
|
317
|
-
|
|
338
|
+
|
|
318
339
|
# Call API start hook
|
|
319
340
|
if _on_api_start:
|
|
320
341
|
await _on_api_start(api_kwargs)
|
|
321
|
-
|
|
342
|
+
|
|
322
343
|
print(str(api_kwargs)[:1000])
|
|
323
344
|
|
|
324
345
|
# Use liteLLM responses
|
|
@@ -330,60 +351,50 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
330
351
|
|
|
331
352
|
# Extract usage information
|
|
332
353
|
usage = {
|
|
333
|
-
**response.usage.model_dump(),
|
|
334
|
-
"response_cost": response._hidden_params.get("response_cost", 0.0),
|
|
354
|
+
**response.usage.model_dump(), # type: ignore
|
|
355
|
+
"response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
|
|
335
356
|
}
|
|
336
357
|
if _on_usage:
|
|
337
358
|
await _on_usage(usage)
|
|
338
359
|
|
|
339
360
|
# handle som function calls -> xy computer calls
|
|
340
361
|
new_output = []
|
|
341
|
-
for i in range(len(response.output)):
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
return {
|
|
345
|
-
|
|
346
|
-
"usage": usage
|
|
347
|
-
}
|
|
348
|
-
|
|
362
|
+
for i in range(len(response.output)): # type: ignore
|
|
363
|
+
new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
|
|
364
|
+
|
|
365
|
+
return {"output": new_output, "usage": usage}
|
|
366
|
+
|
|
349
367
|
async def predict_click(
|
|
350
|
-
self,
|
|
351
|
-
model: str,
|
|
352
|
-
image_b64: str,
|
|
353
|
-
instruction: str,
|
|
354
|
-
**kwargs
|
|
368
|
+
self, model: str, image_b64: str, instruction: str, **kwargs
|
|
355
369
|
) -> Optional[Tuple[float, float]]:
|
|
356
370
|
"""
|
|
357
371
|
Predict click coordinates using OmniParser and LLM.
|
|
358
|
-
|
|
372
|
+
|
|
359
373
|
Uses OmniParser to annotate the image with element IDs, then uses LLM
|
|
360
374
|
to identify the correct element ID based on the instruction.
|
|
361
375
|
"""
|
|
362
376
|
if not OMNIPARSER_AVAILABLE:
|
|
363
377
|
return None
|
|
364
|
-
|
|
378
|
+
|
|
365
379
|
# Parse the image with OmniParser to get annotated image and elements
|
|
366
380
|
parser = get_parser()
|
|
367
381
|
result = parser.parse(image_b64)
|
|
368
|
-
|
|
382
|
+
|
|
369
383
|
# Extract the LLM model from composed model string
|
|
370
|
-
llm_model = model.split(
|
|
371
|
-
|
|
384
|
+
llm_model = model.split("+")[-1]
|
|
385
|
+
|
|
372
386
|
# Create system prompt for element ID prediction
|
|
373
|
-
SYSTEM_PROMPT =
|
|
387
|
+
SYSTEM_PROMPT = """
|
|
374
388
|
You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.
|
|
375
389
|
|
|
376
390
|
The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.
|
|
377
391
|
|
|
378
392
|
Output only the element ID as a single integer.
|
|
379
|
-
|
|
380
|
-
|
|
393
|
+
""".strip()
|
|
394
|
+
|
|
381
395
|
# Prepare messages for LLM
|
|
382
396
|
messages = [
|
|
383
|
-
{
|
|
384
|
-
"role": "system",
|
|
385
|
-
"content": SYSTEM_PROMPT
|
|
386
|
-
},
|
|
397
|
+
{"role": "system", "content": SYSTEM_PROMPT},
|
|
387
398
|
{
|
|
388
399
|
"role": "user",
|
|
389
400
|
"content": [
|
|
@@ -391,31 +402,25 @@ Output only the element ID as a single integer.
|
|
|
391
402
|
"type": "image_url",
|
|
392
403
|
"image_url": {
|
|
393
404
|
"url": f"data:image/png;base64,{result.annotated_image_base64}"
|
|
394
|
-
}
|
|
405
|
+
},
|
|
395
406
|
},
|
|
396
|
-
{
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
}
|
|
400
|
-
]
|
|
401
|
-
}
|
|
407
|
+
{"type": "text", "text": f"Find the element: {instruction}"},
|
|
408
|
+
],
|
|
409
|
+
},
|
|
402
410
|
]
|
|
403
|
-
|
|
411
|
+
|
|
404
412
|
# Call LLM to predict element ID
|
|
405
413
|
response = await litellm.acompletion(
|
|
406
|
-
model=llm_model,
|
|
407
|
-
messages=messages,
|
|
408
|
-
max_tokens=10,
|
|
409
|
-
temperature=0.1
|
|
414
|
+
model=llm_model, messages=messages, max_tokens=10, temperature=0.1
|
|
410
415
|
)
|
|
411
|
-
|
|
416
|
+
|
|
412
417
|
# Extract element ID from response
|
|
413
|
-
response_text = response.choices[0].message.content.strip()
|
|
414
|
-
|
|
418
|
+
response_text = response.choices[0].message.content.strip() # type: ignore
|
|
419
|
+
|
|
415
420
|
# Try to parse the element ID
|
|
416
421
|
try:
|
|
417
422
|
element_id = int(response_text)
|
|
418
|
-
|
|
423
|
+
|
|
419
424
|
# Find the element with this ID and return its center coordinates
|
|
420
425
|
for element in result.elements:
|
|
421
426
|
if element.id == element_id:
|
|
@@ -425,9 +430,9 @@ Output only the element ID as a single integer.
|
|
|
425
430
|
except ValueError:
|
|
426
431
|
# If we can't parse the ID, return None
|
|
427
432
|
pass
|
|
428
|
-
|
|
433
|
+
|
|
429
434
|
return None
|
|
430
|
-
|
|
435
|
+
|
|
431
436
|
def get_capabilities(self) -> List[AgentCapability]:
|
|
432
437
|
"""Return the capabilities supported by this agent."""
|
|
433
438
|
return ["step"]
|