cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/openai.py
CHANGED
@@ -6,12 +6,14 @@ import asyncio
 import base64
 import json
 from io import BytesIO
-from typing import
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from ..types import
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
 
 async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     """Map a computer tool to OpenAI's computer-use-preview tool schema"""
@@ -21,26 +23,26 @@ async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     except Exception:
         # Fallback to default dimensions if method fails
         width, height = 1024, 768
-
+
     # Get environment from the computer handler
     try:
         environment = await computer_handler.get_environment()
     except Exception:
         # Fallback to default environment if method fails
         environment = "linux"
-
+
     return {
         "type": "computer_use_preview",
         "display_width": width,
         "display_height": height,
-        "environment": environment  # mac, windows, linux, browser
+        "environment": environment,  # mac, windows, linux, browser
     }
 
 
 async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
     """Prepare tools for OpenAI API format"""
     openai_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to OpenAI format
@@ -49,19 +51,19 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
         elif schema["type"] == "function":
             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
             # Schema should be: {type, name, description, parameters}
-            openai_tools.append({
-
+            openai_tools.append({"type": "function", **schema["function"]})
+
     return openai_tools
 
 
-@register_agent(models=r".*computer-use-preview
+@register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
-
+
     Supports OpenAI's computer use preview models.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -75,11 +77,11 @@ class OpenAIComputerUseConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format
             model: Model name to use
@@ -92,12 +94,12 @@ class OpenAIComputerUseConfig:
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Prepare tools for OpenAI API
         openai_tools = await _prepare_tools_for_openai(tools)
 
@@ -110,16 +112,16 @@ class OpenAIComputerUseConfig:
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
             "num_retries": max_retries,
-            **kwargs
+            **kwargs,
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Use liteLLM responses
         response = await litellm.aresponses(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
@@ -136,24 +138,21 @@ class OpenAIComputerUseConfig:
         output_dict = response.model_dump()
         output_dict["usage"] = usage
         return output_dict
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Uses OpenAI computer-use-preview with manually constructed input items
         and a prompt that instructs the agent to only output clicks.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """
@@ -161,7 +160,7 @@
         # Manually construct input items with image and click instruction
         input_items = [
             {
-            "role": "user",
+                "role": "user",
                 "content": f"""You are a UI grounding expert. Follow these guidelines:
 
 1. NEVER ask for confirmation. Complete all tasks autonomously.
@@ -173,19 +172,16 @@
 7. Be decisive and action-oriented. Complete the requested task fully.
 
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
-Task: Click {instruction}. Output ONLY a click action on the target element."""
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
             },
             {
                 "role": "user",
                 "content": [
-                    {
-
-
-                    }
-                ]
-            }
+                    {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
+                ],
+            },
         ]
-
+
         # Get image dimensions from base64 data
         try:
             image_data = base64.b64decode(image_b64)
@@ -194,15 +190,15 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
         except Exception:
             # Fallback to default dimensions if image parsing fails
             display_width, display_height = 1024, 768
-
+
         # Prepare computer tool for click actions
         computer_tool = {
            "type": "computer_use_preview",
            "display_width": display_width,
            "display_height": display_height,
-            "environment": "windows"
+            "environment": "windows",
        }
-
+
        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
@@ -211,32 +207,35 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
             "stream": False,
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
-            "max_tokens": 200  # Keep response short for click prediction
+            "max_tokens": 200,  # Keep response short for click prediction
+            **kwargs,
         }
-
+
         # Use liteLLM responses
         response = await litellm.aresponses(**api_kwargs)
-
+
         # Extract click coordinates from response output
         output_dict = response.model_dump()
-        output_items = output_dict.get("output", [])
-
+        output_items = output_dict.get("output", [])
+
         # Look for computer_call with click action
         for item in output_items:
-            if (
-                item
-
-
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
+
                 action = item["action"]
                 if action.get("x") is not None and action.get("y") is not None:
                     return (int(action.get("x")), int(action.get("y")))
-
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
agent/loops/opencua.py
ADDED
@@ -0,0 +1,134 @@
"""
OpenCUA agent loop implementation for click prediction using litellm.acompletion
Based on OpenCUA model for GUI grounding tasks.
"""

import asyncio
import base64
import json
import math
import re
import uuid
from io import BytesIO
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union

import litellm
from PIL import Image

from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..types import AgentCapability, AgentResponse, Messages, Tools
from .composed_grounded import ComposedGroundedConfig


def extract_coordinates_from_click(text: str) -> Optional[Tuple[int, int]]:
    """Extract coordinates from click(x=..., y=...) or pyautogui.click(x=..., y=...) format.

    This function supports parsing both generic click() and legacy pyautogui.click() formats
    for backwards compatibility with models that may still output pyautogui format.
    """
    try:
        # Look for click(x=1443, y=343) or pyautogui.click(x=1443, y=343) pattern
        pattern = r"(?:pyautogui\.)?click\(x=(\d+),\s*y=(\d+)\)"
        match = re.search(pattern, text)
        if match:
            x, y = int(match.group(1)), int(match.group(2))
            return (x, y)
        return None
    except Exception:
        return None


@register_agent(models=r"(?i).*OpenCUA.*")
class OpenCUAConfig(ComposedGroundedConfig):
    """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        super().__init__()
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Fallback to a self-composed model"""
        return await super().predict_step(
            messages=messages,
            model=f"{model}+{model}",
            tools=tools,
            max_retries=max_retries,
            stream=stream,
            computer_handler=computer_handler,
            _on_api_start=_on_api_start,
            _on_api_end=_on_api_end,
            _on_usage=_on_usage,
            _on_screenshot=_on_screenshot,
            **kwargs,
        )

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using OpenCUA model via litellm.acompletion.

        Args:
            model: The OpenCUA model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        # Prepare system message
        system_prompt = (
            "You are a GUI agent. You are given a task and a screenshot of the screen. "
            "You need to perform a series of click actions to complete the task."
        )

        system_message = {"role": "system", "content": system_prompt}

        # Prepare user message with image and instruction
        user_message = {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
                {"type": "text", "text": f"Click on {instruction}"},
            ],
        }

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_new_tokens": 2056,
            "temperature": 0,
            **kwargs,
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Extract response text
        output_text = response.choices[0].message.content
        # print(output_text)

        # Extract coordinates from click format (supports both click() and pyautogui.click() for backwards compatibility)
        coordinates = extract_coordinates_from_click(output_text)

        return coordinates

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]
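A small usage sketch of the extract_coordinates_from_click helper added above, assuming the wheel's agent package is importable as listed in the RECORD; the input strings are illustrative:

from agent.loops.opencua import extract_coordinates_from_click

# Both the generic and the legacy pyautogui formats are accepted by the regex.
assert extract_coordinates_from_click("click(x=1443, y=343)") == (1443, 343)
assert extract_coordinates_from_click("pyautogui.click(x=10, y=20)") == (10, 20)
# Anything without a click(x=..., y=...) call yields None.
assert extract_coordinates_from_click("scroll down") is None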
agent/loops/uiins.py
ADDED
@@ -0,0 +1,175 @@
"""
UI-Ins agent loop implementation for click prediction using litellm.acompletion
Paper: https://arxiv.org/pdf/2510.202861
Code: https://github.com/alibaba/UI-Ins
"""

import asyncio
import base64
import json
import math
import re
import uuid
from io import BytesIO
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union

import litellm
from PIL import Image

from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..types import AgentCapability, AgentResponse, Messages, Tools

SYSTEM_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\nReturn a json object with a reasoning process in tags, a function name and arguments within XML tags:\n```\n\n...\n\n\n{"name": "grounding", "arguments": }\n\n```\n represents the following item of the action space:\n## Action Space{"action": "click", "coordinate": [x, y]}\nYour task is to accurately locate a UI element based on the instruction. You should first analyze instruction in tags and finally output the function in tags.\n"""


def parse_coordinates(raw_string: str) -> tuple[int, int]:
    matches = re.findall(r"\[(\d+),\s*(\d+)\]", raw_string)
    if matches:
        return tuple(map(int, matches[0]))
    return -1, -1


def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 3136,
    max_pixels: int = 8847360,
) -> Tuple[int, int]:
    """Smart resize function similar to qwen_vl_utils."""
    # Calculate the total pixels
    total_pixels = height * width

    # If already within bounds, return original dimensions
    if min_pixels <= total_pixels <= max_pixels:
        # Round to nearest factor
        new_height = (height // factor) * factor
        new_width = (width // factor) * factor
        return new_height, new_width

    # Calculate scaling factor
    if total_pixels > max_pixels:
        scale = (max_pixels / total_pixels) ** 0.5
    else:
        scale = (min_pixels / total_pixels) ** 0.5

    # Apply scaling
    new_height = int(height * scale)
    new_width = int(width * scale)

    # Round to nearest factor
    new_height = (new_height // factor) * factor
    new_width = (new_width // factor) * factor

    # Ensure minimum size
    new_height = max(new_height, factor)
    new_width = max(new_width, factor)

    return new_height, new_width


@register_agent(models=r".*UI-Ins.*")
class UIInsConfig(AsyncAgentConfig):
    """UI-Ins agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        raise NotImplementedError()

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[float, float]]:
        """
        Predict click coordinates using UI-Ins model via litellm.acompletion.

        Args:
            model: The UI-Ins model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        # Decode base64 image
        image_data = base64.b64decode(image_b64)
        image = Image.open(BytesIO(image_data))
        width, height = image.width, image.height

        # Smart resize the image (similar to qwen_vl_utils)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=28,  # Default factor for Qwen models
            min_pixels=3136,
            max_pixels=4096 * 2160,
        )
        resized_image = image.resize((resized_width, resized_height))
        scale_x, scale_y = width / resized_width, height / resized_height

        # Convert resized image back to base64
        buffered = BytesIO()
        resized_image.save(buffered, format="PNG")
        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()

        # Prepare system and user messages
        system_message = {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful assistant."},
                {"type": "text", "text": SYSTEM_PROMPT},
            ],
        }

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
                {"type": "text", "text": instruction},
            ],
        }

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
            **kwargs,
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Extract response text
        output_text = response.choices[0].message.content  # type: ignore

        # Extract and rescale coordinates
        pred_x, pred_y = parse_coordinates(output_text)  # type: ignore
        pred_x *= scale_x
        pred_y *= scale_y

        return (math.floor(pred_x), math.floor(pred_y))

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]
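A worked sketch of the smart_resize plus rescale round trip that predict_click uses to map model output back to original screenshot coordinates; the 1920x1080 screenshot and the model coordinates are illustrative values, and the import assumes the agent package from this wheel is installed:

from agent.loops.uiins import smart_resize

height, width = 1080, 1920  # original screenshot size (illustrative)
# 1080 * 1920 = 2,073,600 pixels lies within [3136, 4096 * 2160], so the
# dimensions are only rounded down to multiples of the factor 28.
resized_height, resized_width = smart_resize(
    height, width, factor=28, min_pixels=3136, max_pixels=4096 * 2160
)
assert (resized_height, resized_width) == (1064, 1904)

# predict_click sends the resized image to the model, then rescales the
# predicted point back into original-image coordinates.
scale_x, scale_y = width / resized_width, height / resized_height
model_x, model_y = 952, 532  # hypothetical coordinates in the resized image
screen_x, screen_y = model_x * scale_x, model_y * scale_y  # -> (960.0, 540.0)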