cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py
CHANGED
|
@@ -5,100 +5,102 @@ Code: https://github.com/microsoft/OmniParser
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import asyncio
|
|
8
|
+
import base64
|
|
9
|
+
import inspect
|
|
8
10
|
import json
|
|
9
|
-
from typing import
|
|
11
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
|
|
12
|
+
|
|
10
13
|
import litellm
|
|
11
|
-
import inspect
|
|
12
|
-
import base64
|
|
13
14
|
|
|
14
15
|
from ..decorators import register_agent
|
|
15
|
-
from ..types import Messages, AgentResponse, Tools, AgentCapability
|
|
16
16
|
from ..loops.base import AsyncAgentConfig
|
|
17
|
+
from ..types import AgentCapability, AgentResponse, Messages, Tools
|
|
17
18
|
|
|
18
19
|
SOM_TOOL_SCHEMA = {
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
20
|
+
"type": "function",
|
|
21
|
+
"name": "computer",
|
|
22
|
+
"description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
|
|
23
|
+
"parameters": {
|
|
24
|
+
"type": "object",
|
|
25
|
+
"properties": {
|
|
26
|
+
"action": {
|
|
27
|
+
"type": "string",
|
|
28
|
+
"enum": [
|
|
29
|
+
"screenshot",
|
|
30
|
+
"click",
|
|
31
|
+
"double_click",
|
|
32
|
+
"drag",
|
|
33
|
+
"type",
|
|
34
|
+
"keypress",
|
|
35
|
+
"scroll",
|
|
36
|
+
"move",
|
|
37
|
+
"wait",
|
|
38
|
+
"get_current_url",
|
|
39
|
+
"get_dimensions",
|
|
40
|
+
"get_environment",
|
|
41
|
+
],
|
|
42
|
+
"description": "The action to perform",
|
|
43
|
+
},
|
|
44
|
+
"element_id": {
|
|
45
|
+
"type": "integer",
|
|
46
|
+
"description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
|
|
47
|
+
},
|
|
48
|
+
"start_element_id": {
|
|
49
|
+
"type": "integer",
|
|
50
|
+
"description": "The ID of the element to start dragging from (required for drag action)",
|
|
51
|
+
},
|
|
52
|
+
"end_element_id": {
|
|
53
|
+
"type": "integer",
|
|
54
|
+
"description": "The ID of the element to drag to (required for drag action)",
|
|
55
|
+
},
|
|
56
|
+
"text": {
|
|
57
|
+
"type": "string",
|
|
58
|
+
"description": "The text to type (required for type action)",
|
|
59
|
+
},
|
|
60
|
+
"keys": {
|
|
61
|
+
"type": "string",
|
|
62
|
+
"description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
|
|
63
|
+
},
|
|
64
|
+
"button": {
|
|
65
|
+
"type": "string",
|
|
66
|
+
"description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
|
|
67
|
+
},
|
|
68
|
+
"scroll_x": {
|
|
69
|
+
"type": "integer",
|
|
70
|
+
"description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
|
|
71
|
+
},
|
|
72
|
+
"scroll_y": {
|
|
73
|
+
"type": "integer",
|
|
74
|
+
"description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
|
|
75
|
+
},
|
|
76
|
+
},
|
|
77
|
+
"required": ["action"],
|
|
75
78
|
},
|
|
76
|
-
"required": [
|
|
77
|
-
"action"
|
|
78
|
-
]
|
|
79
|
-
}
|
|
80
79
|
}
|
|
81
80
|
|
|
82
81
|
OMNIPARSER_AVAILABLE = False
|
|
83
82
|
try:
|
|
84
83
|
from som import OmniParser
|
|
84
|
+
|
|
85
85
|
OMNIPARSER_AVAILABLE = True
|
|
86
86
|
except ImportError:
|
|
87
87
|
pass
|
|
88
88
|
OMNIPARSER_SINGLETON = None
|
|
89
89
|
|
|
90
|
+
|
|
90
91
|
def get_parser():
|
|
91
92
|
global OMNIPARSER_SINGLETON
|
|
92
93
|
if OMNIPARSER_SINGLETON is None:
|
|
93
94
|
OMNIPARSER_SINGLETON = OmniParser()
|
|
94
95
|
return OMNIPARSER_SINGLETON
|
|
95
|
-
|
|
96
|
+
|
|
97
|
+
|
|
96
98
|
def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
|
|
97
99
|
"""Get the last computer_call_output message from a messages list.
|
|
98
|
-
|
|
100
|
+
|
|
99
101
|
Args:
|
|
100
102
|
messages: List of messages to search through
|
|
101
|
-
|
|
103
|
+
|
|
102
104
|
Returns:
|
|
103
105
|
The last computer_call_output message dict, or None if not found
|
|
104
106
|
"""
|
|
@@ -107,11 +109,12 @@ def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Di
|
|
|
107
109
|
return message
|
|
108
110
|
return None
|
|
109
111
|
|
|
112
|
+
|
|
110
113
|
def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
|
|
111
114
|
"""Prepare tools for OpenAI API format"""
|
|
112
115
|
omniparser_tools = []
|
|
113
116
|
id2xy = dict()
|
|
114
|
-
|
|
117
|
+
|
|
115
118
|
for schema in tool_schemas:
|
|
116
119
|
if schema["type"] == "computer":
|
|
117
120
|
omniparser_tools.append(SOM_TOOL_SCHEMA)
|
|
@@ -122,72 +125,80 @@ def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[T
|
|
|
122
125
|
elif schema["type"] == "function":
|
|
123
126
|
# Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
|
|
124
127
|
# Schema should be: {type, name, description, parameters}
|
|
125
|
-
omniparser_tools.append({
|
|
126
|
-
|
|
128
|
+
omniparser_tools.append({"type": "function", **schema["function"]})
|
|
129
|
+
|
|
127
130
|
return omniparser_tools, id2xy
|
|
128
131
|
|
|
129
|
-
async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
|
|
130
|
-
item_type = item.get("type")
|
|
131
|
-
|
|
132
|
-
def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
|
|
133
|
-
if element_id is None:
|
|
134
|
-
return (None, None)
|
|
135
|
-
return id2xy.get(element_id, (None, None))
|
|
136
|
-
|
|
137
|
-
if item_type == "function_call":
|
|
138
|
-
fn_name = item.get("name")
|
|
139
|
-
fn_args = json.loads(item.get("arguments", "{}"))
|
|
140
|
-
|
|
141
|
-
item_id = item.get("id")
|
|
142
|
-
call_id = item.get("call_id")
|
|
143
|
-
|
|
144
|
-
if fn_name == "computer":
|
|
145
|
-
action = fn_args.get("action")
|
|
146
|
-
element_id = fn_args.get("element_id")
|
|
147
|
-
start_element_id = fn_args.get("start_element_id")
|
|
148
|
-
end_element_id = fn_args.get("end_element_id")
|
|
149
|
-
text = fn_args.get("text")
|
|
150
|
-
keys = fn_args.get("keys")
|
|
151
|
-
button = fn_args.get("button")
|
|
152
|
-
scroll_x = fn_args.get("scroll_x")
|
|
153
|
-
scroll_y = fn_args.get("scroll_y")
|
|
154
|
-
|
|
155
|
-
x, y = _get_xy(element_id)
|
|
156
|
-
start_x, start_y = _get_xy(start_element_id)
|
|
157
|
-
end_x, end_y = _get_xy(end_element_id)
|
|
158
|
-
|
|
159
|
-
action_args = {
|
|
160
|
-
"type": action,
|
|
161
|
-
"x": x,
|
|
162
|
-
"y": y,
|
|
163
|
-
"start_x": start_x,
|
|
164
|
-
"start_y": start_y,
|
|
165
|
-
"end_x": end_x,
|
|
166
|
-
"end_y": end_y,
|
|
167
|
-
"text": text,
|
|
168
|
-
"keys": keys,
|
|
169
|
-
"button": button,
|
|
170
|
-
"scroll_x": scroll_x,
|
|
171
|
-
"scroll_y": scroll_y
|
|
172
|
-
}
|
|
173
|
-
# Remove None values to keep the JSON clean
|
|
174
|
-
action_args = {k: v for k, v in action_args.items() if v is not None}
|
|
175
132
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
133
|
+
async def replace_function_with_computer_call(
|
|
134
|
+
item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]
|
|
135
|
+
):
|
|
136
|
+
item_type = item.get("type")
|
|
137
|
+
|
|
138
|
+
def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
|
|
139
|
+
if element_id is None:
|
|
140
|
+
return (None, None)
|
|
141
|
+
return id2xy.get(element_id, (None, None))
|
|
142
|
+
|
|
143
|
+
if item_type == "function_call":
|
|
144
|
+
fn_name = item.get("name")
|
|
145
|
+
fn_args = json.loads(item.get("arguments", "{}"))
|
|
146
|
+
|
|
147
|
+
item_id = item.get("id")
|
|
148
|
+
call_id = item.get("call_id")
|
|
149
|
+
|
|
150
|
+
if fn_name == "computer":
|
|
151
|
+
action = fn_args.get("action")
|
|
152
|
+
element_id = fn_args.get("element_id")
|
|
153
|
+
start_element_id = fn_args.get("start_element_id")
|
|
154
|
+
end_element_id = fn_args.get("end_element_id")
|
|
155
|
+
text = fn_args.get("text")
|
|
156
|
+
keys = fn_args.get("keys")
|
|
157
|
+
button = fn_args.get("button")
|
|
158
|
+
scroll_x = fn_args.get("scroll_x")
|
|
159
|
+
scroll_y = fn_args.get("scroll_y")
|
|
183
160
|
|
|
184
|
-
|
|
161
|
+
x, y = _get_xy(element_id)
|
|
162
|
+
start_x, start_y = _get_xy(start_element_id)
|
|
163
|
+
end_x, end_y = _get_xy(end_element_id)
|
|
185
164
|
|
|
186
|
-
|
|
165
|
+
action_args = {
|
|
166
|
+
"type": action,
|
|
167
|
+
"x": x,
|
|
168
|
+
"y": y,
|
|
169
|
+
"start_x": start_x,
|
|
170
|
+
"start_y": start_y,
|
|
171
|
+
"end_x": end_x,
|
|
172
|
+
"end_y": end_y,
|
|
173
|
+
"text": text,
|
|
174
|
+
"keys": keys,
|
|
175
|
+
"button": button,
|
|
176
|
+
"scroll_x": scroll_x,
|
|
177
|
+
"scroll_y": scroll_y,
|
|
178
|
+
}
|
|
179
|
+
# Remove None values to keep the JSON clean
|
|
180
|
+
action_args = {k: v for k, v in action_args.items() if v is not None}
|
|
181
|
+
|
|
182
|
+
return [
|
|
183
|
+
{
|
|
184
|
+
"type": "computer_call",
|
|
185
|
+
"action": action_args,
|
|
186
|
+
"id": item_id,
|
|
187
|
+
"call_id": call_id,
|
|
188
|
+
"status": "completed",
|
|
189
|
+
}
|
|
190
|
+
]
|
|
191
|
+
|
|
192
|
+
return [item]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
async def replace_computer_call_with_function(
|
|
196
|
+
item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]
|
|
197
|
+
):
|
|
187
198
|
"""
|
|
188
199
|
Convert computer_call back to function_call format.
|
|
189
200
|
Also handles computer_call_output -> function_call_output conversion.
|
|
190
|
-
|
|
201
|
+
|
|
191
202
|
Args:
|
|
192
203
|
item: The item to convert
|
|
193
204
|
xy2id: Mapping from (x, y) coordinates to element IDs
|
|
@@ -202,12 +213,12 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
|
|
|
202
213
|
|
|
203
214
|
if item_type == "computer_call":
|
|
204
215
|
action_data = item.get("action", {})
|
|
205
|
-
|
|
216
|
+
|
|
206
217
|
# Extract coordinates and convert back to element IDs
|
|
207
218
|
element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
|
|
208
219
|
start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
|
|
209
220
|
end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
|
|
210
|
-
|
|
221
|
+
|
|
211
222
|
# Build function arguments
|
|
212
223
|
fn_args = {
|
|
213
224
|
"action": action_data.get("type"),
|
|
@@ -218,33 +229,36 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
|
|
|
218
229
|
"keys": action_data.get("keys"),
|
|
219
230
|
"button": action_data.get("button"),
|
|
220
231
|
"scroll_x": action_data.get("scroll_x"),
|
|
221
|
-
"scroll_y": action_data.get("scroll_y")
|
|
232
|
+
"scroll_y": action_data.get("scroll_y"),
|
|
222
233
|
}
|
|
223
|
-
|
|
234
|
+
|
|
224
235
|
# Remove None values to keep the JSON clean
|
|
225
236
|
fn_args = {k: v for k, v in fn_args.items() if v is not None}
|
|
226
|
-
|
|
227
|
-
return [
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
237
|
+
|
|
238
|
+
return [
|
|
239
|
+
{
|
|
240
|
+
"type": "function_call",
|
|
241
|
+
"name": "computer",
|
|
242
|
+
"arguments": json.dumps(fn_args),
|
|
243
|
+
"id": item.get("id"),
|
|
244
|
+
"call_id": item.get("call_id"),
|
|
245
|
+
"status": "completed",
|
|
246
|
+
# Fall back to string representation
|
|
247
|
+
"content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
|
|
248
|
+
}
|
|
249
|
+
]
|
|
250
|
+
|
|
239
251
|
elif item_type == "computer_call_output":
|
|
240
252
|
# Simple conversion: computer_call_output -> function_call_output
|
|
241
|
-
return [
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
253
|
+
return [
|
|
254
|
+
{
|
|
255
|
+
"type": "function_call_output",
|
|
256
|
+
"call_id": item.get("call_id"),
|
|
257
|
+
"content": [item.get("output")],
|
|
258
|
+
"id": item.get("id"),
|
|
259
|
+
"status": "completed",
|
|
260
|
+
}
|
|
261
|
+
]
|
|
248
262
|
|
|
249
263
|
return [item]
|
|
250
264
|
|
|
@@ -252,7 +266,7 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
|
|
|
252
266
|
@register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
|
|
253
267
|
class OmniparserConfig(AsyncAgentConfig):
|
|
254
268
|
"""Omniparser agent configuration implementing AsyncAgentConfig protocol."""
|
|
255
|
-
|
|
269
|
+
|
|
256
270
|
async def predict_step(
|
|
257
271
|
self,
|
|
258
272
|
messages: List[Dict[str, Any]],
|
|
@@ -266,25 +280,27 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
266
280
|
_on_api_end=None,
|
|
267
281
|
_on_usage=None,
|
|
268
282
|
_on_screenshot=None,
|
|
269
|
-
**kwargs
|
|
283
|
+
**kwargs,
|
|
270
284
|
) -> Dict[str, Any]:
|
|
271
285
|
"""
|
|
272
286
|
OpenAI computer-use-preview agent loop using liteLLM responses.
|
|
273
|
-
|
|
287
|
+
|
|
274
288
|
Supports OpenAI's computer use preview models.
|
|
275
289
|
"""
|
|
276
290
|
if not OMNIPARSER_AVAILABLE:
|
|
277
|
-
raise ValueError(
|
|
278
|
-
|
|
291
|
+
raise ValueError(
|
|
292
|
+
"omniparser loop requires som to be installed. Install it with `pip install cua-som`."
|
|
293
|
+
)
|
|
294
|
+
|
|
279
295
|
tools = tools or []
|
|
280
|
-
|
|
281
|
-
llm_model = model.split(
|
|
296
|
+
|
|
297
|
+
llm_model = model.split("+")[-1]
|
|
282
298
|
|
|
283
299
|
# Prepare tools for OpenAI API
|
|
284
300
|
openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
|
|
285
301
|
|
|
286
302
|
# Find last computer_call_output
|
|
287
|
-
last_computer_call_output = get_last_computer_call_output(messages)
|
|
303
|
+
last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
|
|
288
304
|
if last_computer_call_output:
|
|
289
305
|
image_url = last_computer_call_output.get("output", {}).get("image_url", "")
|
|
290
306
|
image_data = image_url.split(",")[-1]
|
|
@@ -294,14 +310,17 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
294
310
|
if _on_screenshot:
|
|
295
311
|
await _on_screenshot(result.annotated_image_base64, "annotated_image")
|
|
296
312
|
for element in result.elements:
|
|
297
|
-
id2xy[element.id] = (
|
|
298
|
-
|
|
313
|
+
id2xy[element.id] = (
|
|
314
|
+
(element.bbox.x1 + element.bbox.x2) / 2,
|
|
315
|
+
(element.bbox.y1 + element.bbox.y2) / 2,
|
|
316
|
+
)
|
|
317
|
+
|
|
299
318
|
# handle computer calls -> function calls
|
|
300
319
|
new_messages = []
|
|
301
320
|
for message in messages:
|
|
302
321
|
if not isinstance(message, dict):
|
|
303
322
|
message = message.__dict__
|
|
304
|
-
new_messages += await replace_computer_call_with_function(message, id2xy)
|
|
323
|
+
new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
|
|
305
324
|
messages = new_messages
|
|
306
325
|
|
|
307
326
|
# Prepare API call kwargs
|
|
@@ -312,13 +331,13 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
312
331
|
"stream": stream,
|
|
313
332
|
"truncation": "auto",
|
|
314
333
|
"num_retries": max_retries,
|
|
315
|
-
**kwargs
|
|
334
|
+
**kwargs,
|
|
316
335
|
}
|
|
317
|
-
|
|
336
|
+
|
|
318
337
|
# Call API start hook
|
|
319
338
|
if _on_api_start:
|
|
320
339
|
await _on_api_start(api_kwargs)
|
|
321
|
-
|
|
340
|
+
|
|
322
341
|
print(str(api_kwargs)[:1000])
|
|
323
342
|
|
|
324
343
|
# Use liteLLM responses
|
|
@@ -330,60 +349,50 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
330
349
|
|
|
331
350
|
# Extract usage information
|
|
332
351
|
usage = {
|
|
333
|
-
**response.usage.model_dump(),
|
|
334
|
-
"response_cost": response._hidden_params.get("response_cost", 0.0),
|
|
352
|
+
**response.usage.model_dump(), # type: ignore
|
|
353
|
+
"response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
|
|
335
354
|
}
|
|
336
355
|
if _on_usage:
|
|
337
356
|
await _on_usage(usage)
|
|
338
357
|
|
|
339
358
|
# handle som function calls -> xy computer calls
|
|
340
359
|
new_output = []
|
|
341
|
-
for i in range(len(response.output)):
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
return {
|
|
345
|
-
|
|
346
|
-
"usage": usage
|
|
347
|
-
}
|
|
348
|
-
|
|
360
|
+
for i in range(len(response.output)): # type: ignore
|
|
361
|
+
new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
|
|
362
|
+
|
|
363
|
+
return {"output": new_output, "usage": usage}
|
|
364
|
+
|
|
349
365
|
async def predict_click(
|
|
350
|
-
self,
|
|
351
|
-
model: str,
|
|
352
|
-
image_b64: str,
|
|
353
|
-
instruction: str,
|
|
354
|
-
**kwargs
|
|
366
|
+
self, model: str, image_b64: str, instruction: str, **kwargs
|
|
355
367
|
) -> Optional[Tuple[float, float]]:
|
|
356
368
|
"""
|
|
357
369
|
Predict click coordinates using OmniParser and LLM.
|
|
358
|
-
|
|
370
|
+
|
|
359
371
|
Uses OmniParser to annotate the image with element IDs, then uses LLM
|
|
360
372
|
to identify the correct element ID based on the instruction.
|
|
361
373
|
"""
|
|
362
374
|
if not OMNIPARSER_AVAILABLE:
|
|
363
375
|
return None
|
|
364
|
-
|
|
376
|
+
|
|
365
377
|
# Parse the image with OmniParser to get annotated image and elements
|
|
366
378
|
parser = get_parser()
|
|
367
379
|
result = parser.parse(image_b64)
|
|
368
|
-
|
|
380
|
+
|
|
369
381
|
# Extract the LLM model from composed model string
|
|
370
|
-
llm_model = model.split(
|
|
371
|
-
|
|
382
|
+
llm_model = model.split("+")[-1]
|
|
383
|
+
|
|
372
384
|
# Create system prompt for element ID prediction
|
|
373
|
-
SYSTEM_PROMPT =
|
|
385
|
+
SYSTEM_PROMPT = """
|
|
374
386
|
You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.
|
|
375
387
|
|
|
376
388
|
The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.
|
|
377
389
|
|
|
378
390
|
Output only the element ID as a single integer.
|
|
379
|
-
|
|
380
|
-
|
|
391
|
+
""".strip()
|
|
392
|
+
|
|
381
393
|
# Prepare messages for LLM
|
|
382
394
|
messages = [
|
|
383
|
-
{
|
|
384
|
-
"role": "system",
|
|
385
|
-
"content": SYSTEM_PROMPT
|
|
386
|
-
},
|
|
395
|
+
{"role": "system", "content": SYSTEM_PROMPT},
|
|
387
396
|
{
|
|
388
397
|
"role": "user",
|
|
389
398
|
"content": [
|
|
@@ -391,31 +400,25 @@ Output only the element ID as a single integer.
|
|
|
391
400
|
"type": "image_url",
|
|
392
401
|
"image_url": {
|
|
393
402
|
"url": f"data:image/png;base64,{result.annotated_image_base64}"
|
|
394
|
-
}
|
|
403
|
+
},
|
|
395
404
|
},
|
|
396
|
-
{
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
}
|
|
400
|
-
]
|
|
401
|
-
}
|
|
405
|
+
{"type": "text", "text": f"Find the element: {instruction}"},
|
|
406
|
+
],
|
|
407
|
+
},
|
|
402
408
|
]
|
|
403
|
-
|
|
409
|
+
|
|
404
410
|
# Call LLM to predict element ID
|
|
405
411
|
response = await litellm.acompletion(
|
|
406
|
-
model=llm_model,
|
|
407
|
-
messages=messages,
|
|
408
|
-
max_tokens=10,
|
|
409
|
-
temperature=0.1
|
|
412
|
+
model=llm_model, messages=messages, max_tokens=10, temperature=0.1
|
|
410
413
|
)
|
|
411
|
-
|
|
414
|
+
|
|
412
415
|
# Extract element ID from response
|
|
413
|
-
response_text = response.choices[0].message.content.strip()
|
|
414
|
-
|
|
416
|
+
response_text = response.choices[0].message.content.strip() # type: ignore
|
|
417
|
+
|
|
415
418
|
# Try to parse the element ID
|
|
416
419
|
try:
|
|
417
420
|
element_id = int(response_text)
|
|
418
|
-
|
|
421
|
+
|
|
419
422
|
# Find the element with this ID and return its center coordinates
|
|
420
423
|
for element in result.elements:
|
|
421
424
|
if element.id == element_id:
|
|
@@ -425,9 +428,9 @@ Output only the element ID as a single integer.
|
|
|
425
428
|
except ValueError:
|
|
426
429
|
# If we can't parse the ID, return None
|
|
427
430
|
pass
|
|
428
|
-
|
|
431
|
+
|
|
429
432
|
return None
|
|
430
|
-
|
|
433
|
+
|
|
431
434
|
def get_capabilities(self) -> List[AgentCapability]:
|
|
432
435
|
"""Return the capabilities supported by this agent."""
|
|
433
436
|
return ["step"]
|