cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py
CHANGED
|
@@ -5,100 +5,108 @@ Code: https://github.com/microsoft/OmniParser
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import asyncio
|
|
8
|
+
import base64
|
|
9
|
+
import inspect
|
|
8
10
|
import json
|
|
9
|
-
from typing import
|
|
11
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
|
|
12
|
+
|
|
10
13
|
import litellm
|
|
11
|
-
import inspect
|
|
12
|
-
import base64
|
|
13
14
|
|
|
14
15
|
from ..decorators import register_agent
|
|
15
|
-
from ..types import Messages, AgentResponse, Tools, AgentCapability
|
|
16
16
|
from ..loops.base import AsyncAgentConfig
|
|
17
|
+
from ..responses import (
|
|
18
|
+
convert_completion_messages_to_responses_items,
|
|
19
|
+
convert_responses_items_to_completion_messages,
|
|
20
|
+
)
|
|
21
|
+
from ..types import AgentCapability, AgentResponse, Messages, Tools
|
|
17
22
|
|
|
18
23
|
SOM_TOOL_SCHEMA = {
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
24
|
+
"type": "function",
|
|
25
|
+
"function": {
|
|
26
|
+
"name": "computer",
|
|
27
|
+
"description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
|
|
28
|
+
"parameters": {
|
|
29
|
+
"type": "object",
|
|
30
|
+
"properties": {
|
|
31
|
+
"action": {
|
|
32
|
+
"type": "string",
|
|
33
|
+
"enum": [
|
|
34
|
+
"screenshot",
|
|
35
|
+
"click",
|
|
36
|
+
"double_click",
|
|
37
|
+
"drag",
|
|
38
|
+
"type",
|
|
39
|
+
"keypress",
|
|
40
|
+
"scroll",
|
|
41
|
+
"move",
|
|
42
|
+
"wait",
|
|
43
|
+
"get_current_url",
|
|
44
|
+
"get_dimensions",
|
|
45
|
+
"get_environment",
|
|
46
|
+
],
|
|
47
|
+
"description": "The action to perform",
|
|
48
|
+
},
|
|
49
|
+
"element_id": {
|
|
50
|
+
"type": "integer",
|
|
51
|
+
"description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
|
|
52
|
+
},
|
|
53
|
+
"start_element_id": {
|
|
54
|
+
"type": "integer",
|
|
55
|
+
"description": "The ID of the element to start dragging from (required for drag action)",
|
|
56
|
+
},
|
|
57
|
+
"end_element_id": {
|
|
58
|
+
"type": "integer",
|
|
59
|
+
"description": "The ID of the element to drag to (required for drag action)",
|
|
60
|
+
},
|
|
61
|
+
"text": {
|
|
62
|
+
"type": "string",
|
|
63
|
+
"description": "The text to type (required for type action)",
|
|
64
|
+
},
|
|
65
|
+
"keys": {
|
|
66
|
+
"type": "string",
|
|
67
|
+
"description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
|
|
68
|
+
},
|
|
69
|
+
"button": {
|
|
70
|
+
"type": "string",
|
|
71
|
+
"description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
|
|
72
|
+
},
|
|
73
|
+
"scroll_x": {
|
|
74
|
+
"type": "integer",
|
|
75
|
+
"description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
|
|
76
|
+
},
|
|
77
|
+
"scroll_y": {
|
|
78
|
+
"type": "integer",
|
|
79
|
+
"description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
|
|
80
|
+
},
|
|
81
|
+
},
|
|
82
|
+
"required": ["action", "element_id"],
|
|
83
|
+
},
|
|
75
84
|
},
|
|
76
|
-
"required": [
|
|
77
|
-
"action"
|
|
78
|
-
]
|
|
79
|
-
}
|
|
80
85
|
}
|
|
81
86
|
|
|
82
87
|
OMNIPARSER_AVAILABLE = False
|
|
83
88
|
try:
|
|
84
89
|
from som import OmniParser
|
|
90
|
+
|
|
85
91
|
OMNIPARSER_AVAILABLE = True
|
|
86
92
|
except ImportError:
|
|
87
93
|
pass
|
|
88
94
|
OMNIPARSER_SINGLETON = None
|
|
89
95
|
|
|
96
|
+
|
|
90
97
|
def get_parser():
|
|
91
98
|
global OMNIPARSER_SINGLETON
|
|
92
99
|
if OMNIPARSER_SINGLETON is None:
|
|
93
100
|
OMNIPARSER_SINGLETON = OmniParser()
|
|
94
101
|
return OMNIPARSER_SINGLETON
|
|
95
|
-
|
|
102
|
+
|
|
103
|
+
|
|
96
104
|
def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
|
|
97
105
|
"""Get the last computer_call_output message from a messages list.
|
|
98
|
-
|
|
106
|
+
|
|
99
107
|
Args:
|
|
100
108
|
messages: List of messages to search through
|
|
101
|
-
|
|
109
|
+
|
|
102
110
|
Returns:
|
|
103
111
|
The last computer_call_output message dict, or None if not found
|
|
104
112
|
"""
|
|
@@ -107,11 +115,12 @@ def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Di
|
|
|
107
115
|
return message
|
|
108
116
|
return None
|
|
109
117
|
|
|
118
|
+
|
|
110
119
|
def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
|
|
111
120
|
"""Prepare tools for OpenAI API format"""
|
|
112
121
|
omniparser_tools = []
|
|
113
122
|
id2xy = dict()
|
|
114
|
-
|
|
123
|
+
|
|
115
124
|
for schema in tool_schemas:
|
|
116
125
|
if schema["type"] == "computer":
|
|
117
126
|
omniparser_tools.append(SOM_TOOL_SCHEMA)
|
|
@@ -122,72 +131,80 @@ def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[T
|
|
|
122
131
|
elif schema["type"] == "function":
|
|
123
132
|
# Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
|
|
124
133
|
# Schema should be: {type, name, description, parameters}
|
|
125
|
-
omniparser_tools.append({
|
|
126
|
-
|
|
134
|
+
omniparser_tools.append({"type": "function", **schema["function"]})
|
|
135
|
+
|
|
127
136
|
return omniparser_tools, id2xy
|
|
128
137
|
|
|
129
|
-
async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
|
|
130
|
-
item_type = item.get("type")
|
|
131
|
-
|
|
132
|
-
def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
|
|
133
|
-
if element_id is None:
|
|
134
|
-
return (None, None)
|
|
135
|
-
return id2xy.get(element_id, (None, None))
|
|
136
|
-
|
|
137
|
-
if item_type == "function_call":
|
|
138
|
-
fn_name = item.get("name")
|
|
139
|
-
fn_args = json.loads(item.get("arguments", "{}"))
|
|
140
|
-
|
|
141
|
-
item_id = item.get("id")
|
|
142
|
-
call_id = item.get("call_id")
|
|
143
|
-
|
|
144
|
-
if fn_name == "computer":
|
|
145
|
-
action = fn_args.get("action")
|
|
146
|
-
element_id = fn_args.get("element_id")
|
|
147
|
-
start_element_id = fn_args.get("start_element_id")
|
|
148
|
-
end_element_id = fn_args.get("end_element_id")
|
|
149
|
-
text = fn_args.get("text")
|
|
150
|
-
keys = fn_args.get("keys")
|
|
151
|
-
button = fn_args.get("button")
|
|
152
|
-
scroll_x = fn_args.get("scroll_x")
|
|
153
|
-
scroll_y = fn_args.get("scroll_y")
|
|
154
|
-
|
|
155
|
-
x, y = _get_xy(element_id)
|
|
156
|
-
start_x, start_y = _get_xy(start_element_id)
|
|
157
|
-
end_x, end_y = _get_xy(end_element_id)
|
|
158
|
-
|
|
159
|
-
action_args = {
|
|
160
|
-
"type": action,
|
|
161
|
-
"x": x,
|
|
162
|
-
"y": y,
|
|
163
|
-
"start_x": start_x,
|
|
164
|
-
"start_y": start_y,
|
|
165
|
-
"end_x": end_x,
|
|
166
|
-
"end_y": end_y,
|
|
167
|
-
"text": text,
|
|
168
|
-
"keys": keys,
|
|
169
|
-
"button": button,
|
|
170
|
-
"scroll_x": scroll_x,
|
|
171
|
-
"scroll_y": scroll_y
|
|
172
|
-
}
|
|
173
|
-
# Remove None values to keep the JSON clean
|
|
174
|
-
action_args = {k: v for k, v in action_args.items() if v is not None}
|
|
175
138
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
139
|
+
async def replace_function_with_computer_call(
|
|
140
|
+
item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]
|
|
141
|
+
):
|
|
142
|
+
item_type = item.get("type")
|
|
143
|
+
|
|
144
|
+
def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
|
|
145
|
+
if element_id is None:
|
|
146
|
+
return (None, None)
|
|
147
|
+
return id2xy.get(element_id, (None, None))
|
|
148
|
+
|
|
149
|
+
if item_type == "function_call":
|
|
150
|
+
fn_name = item.get("name")
|
|
151
|
+
fn_args = json.loads(item.get("arguments", "{}"))
|
|
183
152
|
|
|
184
|
-
|
|
153
|
+
item_id = item.get("id")
|
|
154
|
+
call_id = item.get("call_id")
|
|
185
155
|
|
|
186
|
-
|
|
156
|
+
if fn_name == "computer":
|
|
157
|
+
action = fn_args.get("action")
|
|
158
|
+
element_id = fn_args.get("element_id")
|
|
159
|
+
start_element_id = fn_args.get("start_element_id")
|
|
160
|
+
end_element_id = fn_args.get("end_element_id")
|
|
161
|
+
text = fn_args.get("text")
|
|
162
|
+
keys = fn_args.get("keys")
|
|
163
|
+
button = fn_args.get("button")
|
|
164
|
+
scroll_x = fn_args.get("scroll_x")
|
|
165
|
+
scroll_y = fn_args.get("scroll_y")
|
|
166
|
+
|
|
167
|
+
x, y = _get_xy(element_id)
|
|
168
|
+
start_x, start_y = _get_xy(start_element_id)
|
|
169
|
+
end_x, end_y = _get_xy(end_element_id)
|
|
170
|
+
|
|
171
|
+
action_args = {
|
|
172
|
+
"type": action,
|
|
173
|
+
"x": x,
|
|
174
|
+
"y": y,
|
|
175
|
+
"start_x": start_x,
|
|
176
|
+
"start_y": start_y,
|
|
177
|
+
"end_x": end_x,
|
|
178
|
+
"end_y": end_y,
|
|
179
|
+
"text": text,
|
|
180
|
+
"keys": keys,
|
|
181
|
+
"button": button,
|
|
182
|
+
"scroll_x": scroll_x,
|
|
183
|
+
"scroll_y": scroll_y,
|
|
184
|
+
}
|
|
185
|
+
# Remove None values to keep the JSON clean
|
|
186
|
+
action_args = {k: v for k, v in action_args.items() if v is not None}
|
|
187
|
+
|
|
188
|
+
return [
|
|
189
|
+
{
|
|
190
|
+
"type": "computer_call",
|
|
191
|
+
"action": action_args,
|
|
192
|
+
"id": item_id,
|
|
193
|
+
"call_id": call_id,
|
|
194
|
+
"status": "completed",
|
|
195
|
+
}
|
|
196
|
+
]
|
|
197
|
+
|
|
198
|
+
return [item]
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
async def replace_computer_call_with_function(
|
|
202
|
+
item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]
|
|
203
|
+
):
|
|
187
204
|
"""
|
|
188
205
|
Convert computer_call back to function_call format.
|
|
189
206
|
Also handles computer_call_output -> function_call_output conversion.
|
|
190
|
-
|
|
207
|
+
|
|
191
208
|
Args:
|
|
192
209
|
item: The item to convert
|
|
193
210
|
xy2id: Mapping from (x, y) coordinates to element IDs
|
|
@@ -202,12 +219,12 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
|
|
|
202
219
|
|
|
203
220
|
if item_type == "computer_call":
|
|
204
221
|
action_data = item.get("action", {})
|
|
205
|
-
|
|
222
|
+
|
|
206
223
|
# Extract coordinates and convert back to element IDs
|
|
207
224
|
element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
|
|
208
225
|
start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
|
|
209
226
|
end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
|
|
210
|
-
|
|
227
|
+
|
|
211
228
|
# Build function arguments
|
|
212
229
|
fn_args = {
|
|
213
230
|
"action": action_data.get("type"),
|
|
@@ -218,33 +235,38 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
|
|
|
218
235
|
"keys": action_data.get("keys"),
|
|
219
236
|
"button": action_data.get("button"),
|
|
220
237
|
"scroll_x": action_data.get("scroll_x"),
|
|
221
|
-
"scroll_y": action_data.get("scroll_y")
|
|
238
|
+
"scroll_y": action_data.get("scroll_y"),
|
|
222
239
|
}
|
|
223
|
-
|
|
240
|
+
|
|
224
241
|
# Remove None values to keep the JSON clean
|
|
225
242
|
fn_args = {k: v for k, v in fn_args.items() if v is not None}
|
|
226
|
-
|
|
227
|
-
return [
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
243
|
+
|
|
244
|
+
return [
|
|
245
|
+
{
|
|
246
|
+
"type": "function_call",
|
|
247
|
+
"name": "computer",
|
|
248
|
+
"arguments": json.dumps(fn_args),
|
|
249
|
+
"id": item.get("id"),
|
|
250
|
+
"call_id": item.get("call_id"),
|
|
251
|
+
"status": "completed",
|
|
252
|
+
}
|
|
253
|
+
]
|
|
254
|
+
|
|
239
255
|
elif item_type == "computer_call_output":
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
256
|
+
output = item.get("output")
|
|
257
|
+
|
|
258
|
+
if isinstance(output, dict):
|
|
259
|
+
output = [output]
|
|
260
|
+
|
|
261
|
+
return [
|
|
262
|
+
{
|
|
263
|
+
"type": "function_call_output",
|
|
264
|
+
"call_id": item.get("call_id"),
|
|
265
|
+
"output": item.get("output"),
|
|
266
|
+
"id": item.get("id"),
|
|
267
|
+
"status": "completed",
|
|
268
|
+
}
|
|
269
|
+
]
|
|
248
270
|
|
|
249
271
|
return [item]
|
|
250
272
|
|
|
@@ -252,7 +274,7 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
|
|
|
252
274
|
@register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
|
|
253
275
|
class OmniparserConfig(AsyncAgentConfig):
|
|
254
276
|
"""Omniparser agent configuration implementing AsyncAgentConfig protocol."""
|
|
255
|
-
|
|
277
|
+
|
|
256
278
|
async def predict_step(
|
|
257
279
|
self,
|
|
258
280
|
messages: List[Dict[str, Any]],
|
|
@@ -266,63 +288,124 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
266
288
|
_on_api_end=None,
|
|
267
289
|
_on_usage=None,
|
|
268
290
|
_on_screenshot=None,
|
|
269
|
-
**kwargs
|
|
291
|
+
**kwargs,
|
|
270
292
|
) -> Dict[str, Any]:
|
|
271
293
|
"""
|
|
272
294
|
OpenAI computer-use-preview agent loop using liteLLM responses.
|
|
273
|
-
|
|
295
|
+
|
|
274
296
|
Supports OpenAI's computer use preview models.
|
|
275
297
|
"""
|
|
276
298
|
if not OMNIPARSER_AVAILABLE:
|
|
277
|
-
raise ValueError(
|
|
278
|
-
|
|
299
|
+
raise ValueError(
|
|
300
|
+
"omniparser loop requires som to be installed. Install it with `pip install cua-som`."
|
|
301
|
+
)
|
|
302
|
+
|
|
279
303
|
tools = tools or []
|
|
280
|
-
|
|
281
|
-
llm_model = model.split(
|
|
304
|
+
|
|
305
|
+
llm_model = model.split("+")[-1]
|
|
306
|
+
|
|
307
|
+
# Get screen dimensions from computer handler
|
|
308
|
+
try:
|
|
309
|
+
width, height = await computer_handler.get_dimensions()
|
|
310
|
+
except Exception:
|
|
311
|
+
# Fallback to default dimensions if method fails
|
|
312
|
+
width, height = 1024, 768
|
|
282
313
|
|
|
283
314
|
# Prepare tools for OpenAI API
|
|
284
315
|
openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
|
|
285
316
|
|
|
286
|
-
#
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
317
|
+
# Build per-screenshot element mappings for historical consistency
|
|
318
|
+
screenshot_mappings = [] # (message_index, xy2id)
|
|
319
|
+
|
|
320
|
+
parser = get_parser()
|
|
321
|
+
|
|
322
|
+
for idx, message in enumerate(messages):
|
|
323
|
+
if not isinstance(message, dict):
|
|
324
|
+
message = message.__dict__
|
|
325
|
+
|
|
326
|
+
if message.get("type") == "computer_call_output":
|
|
327
|
+
image_url = message.get("output", {}).get("image_url", "")
|
|
328
|
+
if not image_url:
|
|
329
|
+
continue
|
|
330
|
+
|
|
331
|
+
image_data = image_url.split(",")[-1]
|
|
332
|
+
if not image_data:
|
|
333
|
+
continue
|
|
334
|
+
|
|
293
335
|
result = parser.parse(image_data)
|
|
336
|
+
|
|
294
337
|
if _on_screenshot:
|
|
295
338
|
await _on_screenshot(result.annotated_image_base64, "annotated_image")
|
|
339
|
+
|
|
340
|
+
local_id2xy = {}
|
|
341
|
+
|
|
296
342
|
for element in result.elements:
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
343
|
+
norm_x = (element.bbox.x1 + element.bbox.x2) / 2
|
|
344
|
+
norm_y = (element.bbox.y1 + element.bbox.y2) / 2
|
|
345
|
+
pixel_x = int(norm_x * width)
|
|
346
|
+
pixel_y = int(norm_y * height)
|
|
347
|
+
local_id2xy[element.id] = (pixel_x, pixel_y)
|
|
348
|
+
|
|
349
|
+
xy2id = {v: k for k, v in local_id2xy.items()}
|
|
350
|
+
screenshot_mappings.append((idx, xy2id))
|
|
351
|
+
|
|
352
|
+
# Replace screenshot with annotated image
|
|
353
|
+
message["output"]["image_url"] = (
|
|
354
|
+
f"data:image/png;base64,{result.annotated_image_base64}"
|
|
355
|
+
)
|
|
356
|
+
|
|
357
|
+
def get_mapping_for_index(index):
|
|
358
|
+
applicable = [m for i, m in screenshot_mappings if i <= index]
|
|
359
|
+
return applicable[-1] if applicable else {}
|
|
360
|
+
|
|
361
|
+
messages_with_element_ids = []
|
|
362
|
+
|
|
363
|
+
for i, message in enumerate(messages):
|
|
302
364
|
if not isinstance(message, dict):
|
|
303
365
|
message = message.__dict__
|
|
304
|
-
|
|
305
|
-
|
|
366
|
+
|
|
367
|
+
xy2id = get_mapping_for_index(i)
|
|
368
|
+
converted = await replace_computer_call_with_function(message, xy2id)
|
|
369
|
+
messages_with_element_ids.extend(converted)
|
|
370
|
+
|
|
371
|
+
completion_messages = convert_responses_items_to_completion_messages(
|
|
372
|
+
messages_with_element_ids, allow_images_in_tool_results=False
|
|
373
|
+
)
|
|
306
374
|
|
|
307
375
|
# Prepare API call kwargs
|
|
308
376
|
api_kwargs = {
|
|
309
377
|
"model": llm_model,
|
|
310
|
-
"
|
|
378
|
+
"messages": completion_messages,
|
|
311
379
|
"tools": openai_tools if openai_tools else None,
|
|
312
380
|
"stream": stream,
|
|
313
|
-
"truncation": "auto",
|
|
314
381
|
"num_retries": max_retries,
|
|
315
|
-
**kwargs
|
|
382
|
+
**kwargs,
|
|
316
383
|
}
|
|
317
|
-
|
|
384
|
+
|
|
385
|
+
# Add Vertex AI specific parameters if using vertex_ai models
|
|
386
|
+
if llm_model.startswith("vertex_ai/"):
|
|
387
|
+
import os
|
|
388
|
+
|
|
389
|
+
# Pass vertex_project and vertex_location to liteLLM
|
|
390
|
+
if "vertex_project" not in api_kwargs:
|
|
391
|
+
api_kwargs["vertex_project"] = os.getenv("GOOGLE_CLOUD_PROJECT")
|
|
392
|
+
if "vertex_location" not in api_kwargs:
|
|
393
|
+
api_kwargs["vertex_location"] = "global"
|
|
394
|
+
|
|
395
|
+
# Pass through Gemini 3-specific parameters if provided
|
|
396
|
+
if "thinking_level" in kwargs:
|
|
397
|
+
api_kwargs["thinking_level"] = kwargs["thinking_level"]
|
|
398
|
+
if "media_resolution" in kwargs:
|
|
399
|
+
api_kwargs["media_resolution"] = kwargs["media_resolution"]
|
|
400
|
+
|
|
318
401
|
# Call API start hook
|
|
319
402
|
if _on_api_start:
|
|
320
403
|
await _on_api_start(api_kwargs)
|
|
321
|
-
|
|
404
|
+
|
|
322
405
|
print(str(api_kwargs)[:1000])
|
|
323
406
|
|
|
324
|
-
# Use liteLLM
|
|
325
|
-
response = await litellm.
|
|
407
|
+
# Use liteLLM completion
|
|
408
|
+
response = await litellm.acompletion(**api_kwargs)
|
|
326
409
|
|
|
327
410
|
# Call API end hook
|
|
328
411
|
if _on_api_end:
|
|
@@ -330,60 +413,83 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
330
413
|
|
|
331
414
|
# Extract usage information
|
|
332
415
|
usage = {
|
|
333
|
-
**response.usage.model_dump(),
|
|
334
|
-
"response_cost": response._hidden_params.get("response_cost", 0.0),
|
|
416
|
+
**response.usage.model_dump(), # type: ignore
|
|
417
|
+
"response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
|
|
335
418
|
}
|
|
336
419
|
if _on_usage:
|
|
337
420
|
await _on_usage(usage)
|
|
338
421
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
422
|
+
response_dict = response.model_dump() # type: ignore
|
|
423
|
+
choice_messages = [choice["message"] for choice in response_dict["choices"]]
|
|
424
|
+
responses_items = []
|
|
425
|
+
for choice_message in choice_messages:
|
|
426
|
+
responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
|
|
427
|
+
|
|
428
|
+
# Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
|
|
429
|
+
final_output = []
|
|
430
|
+
for item in responses_items:
|
|
431
|
+
if item.get("type") == "computer_call" and "action" in item:
|
|
432
|
+
action = item["action"].copy()
|
|
433
|
+
|
|
434
|
+
# Handle single element_id
|
|
435
|
+
if "element_id" in action:
|
|
436
|
+
element_id = action["element_id"]
|
|
437
|
+
if element_id in id2xy:
|
|
438
|
+
x, y = id2xy[element_id]
|
|
439
|
+
action["x"] = x
|
|
440
|
+
action["y"] = y
|
|
441
|
+
del action["element_id"]
|
|
442
|
+
|
|
443
|
+
# Handle start_element_id and end_element_id for drag operations
|
|
444
|
+
elif "start_element_id" in action and "end_element_id" in action:
|
|
445
|
+
start_id = action["start_element_id"]
|
|
446
|
+
end_id = action["end_element_id"]
|
|
447
|
+
if start_id in id2xy and end_id in id2xy:
|
|
448
|
+
start_x, start_y = id2xy[start_id]
|
|
449
|
+
end_x, end_y = id2xy[end_id]
|
|
450
|
+
action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
|
|
451
|
+
del action["start_element_id"]
|
|
452
|
+
del action["end_element_id"]
|
|
453
|
+
|
|
454
|
+
converted_item = item.copy()
|
|
455
|
+
converted_item["action"] = action
|
|
456
|
+
final_output.append(converted_item)
|
|
457
|
+
else:
|
|
458
|
+
final_output.append(item)
|
|
459
|
+
|
|
460
|
+
return {"output": final_output, "usage": usage}
|
|
461
|
+
|
|
349
462
|
async def predict_click(
|
|
350
|
-
self,
|
|
351
|
-
model: str,
|
|
352
|
-
image_b64: str,
|
|
353
|
-
instruction: str,
|
|
354
|
-
**kwargs
|
|
463
|
+
self, model: str, image_b64: str, instruction: str, **kwargs
|
|
355
464
|
) -> Optional[Tuple[float, float]]:
|
|
356
465
|
"""
|
|
357
466
|
Predict click coordinates using OmniParser and LLM.
|
|
358
|
-
|
|
467
|
+
|
|
359
468
|
Uses OmniParser to annotate the image with element IDs, then uses LLM
|
|
360
469
|
to identify the correct element ID based on the instruction.
|
|
361
470
|
"""
|
|
362
471
|
if not OMNIPARSER_AVAILABLE:
|
|
363
472
|
return None
|
|
364
|
-
|
|
473
|
+
|
|
365
474
|
# Parse the image with OmniParser to get annotated image and elements
|
|
366
475
|
parser = get_parser()
|
|
367
476
|
result = parser.parse(image_b64)
|
|
368
|
-
|
|
477
|
+
|
|
369
478
|
# Extract the LLM model from composed model string
|
|
370
|
-
llm_model = model.split(
|
|
371
|
-
|
|
479
|
+
llm_model = model.split("+")[-1]
|
|
480
|
+
|
|
372
481
|
# Create system prompt for element ID prediction
|
|
373
|
-
SYSTEM_PROMPT =
|
|
482
|
+
SYSTEM_PROMPT = """
|
|
374
483
|
You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.
|
|
375
484
|
|
|
376
485
|
The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.
|
|
377
486
|
|
|
378
487
|
Output only the element ID as a single integer.
|
|
379
|
-
|
|
380
|
-
|
|
488
|
+
""".strip()
|
|
489
|
+
|
|
381
490
|
# Prepare messages for LLM
|
|
382
491
|
messages = [
|
|
383
|
-
{
|
|
384
|
-
"role": "system",
|
|
385
|
-
"content": SYSTEM_PROMPT
|
|
386
|
-
},
|
|
492
|
+
{"role": "system", "content": SYSTEM_PROMPT},
|
|
387
493
|
{
|
|
388
494
|
"role": "user",
|
|
389
495
|
"content": [
|
|
@@ -391,31 +497,25 @@ Output only the element ID as a single integer.
|
|
|
391
497
|
"type": "image_url",
|
|
392
498
|
"image_url": {
|
|
393
499
|
"url": f"data:image/png;base64,{result.annotated_image_base64}"
|
|
394
|
-
}
|
|
500
|
+
},
|
|
395
501
|
},
|
|
396
|
-
{
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
}
|
|
400
|
-
]
|
|
401
|
-
}
|
|
502
|
+
{"type": "text", "text": f"Find the element: {instruction}"},
|
|
503
|
+
],
|
|
504
|
+
},
|
|
402
505
|
]
|
|
403
|
-
|
|
506
|
+
|
|
404
507
|
# Call LLM to predict element ID
|
|
405
508
|
response = await litellm.acompletion(
|
|
406
|
-
model=llm_model,
|
|
407
|
-
messages=messages,
|
|
408
|
-
max_tokens=10,
|
|
409
|
-
temperature=0.1
|
|
509
|
+
model=llm_model, messages=messages, max_tokens=10, temperature=0.1
|
|
410
510
|
)
|
|
411
|
-
|
|
511
|
+
|
|
412
512
|
# Extract element ID from response
|
|
413
|
-
response_text = response.choices[0].message.content.strip()
|
|
414
|
-
|
|
513
|
+
response_text = response.choices[0].message.content.strip() # type: ignore
|
|
514
|
+
|
|
415
515
|
# Try to parse the element ID
|
|
416
516
|
try:
|
|
417
517
|
element_id = int(response_text)
|
|
418
|
-
|
|
518
|
+
|
|
419
519
|
# Find the element with this ID and return its center coordinates
|
|
420
520
|
for element in result.elements:
|
|
421
521
|
if element.id == element_id:
|
|
@@ -425,9 +525,9 @@ Output only the element ID as a single integer.
|
|
|
425
525
|
except ValueError:
|
|
426
526
|
# If we can't parse the ID, return None
|
|
427
527
|
pass
|
|
428
|
-
|
|
528
|
+
|
|
429
529
|
return None
|
|
430
|
-
|
|
530
|
+
|
|
431
531
|
def get_capabilities(self) -> List[AgentCapability]:
|
|
432
532
|
"""Return the capabilities supported by this agent."""
|
|
433
533
|
return ["step"]
|