cua-agent 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +2 -2
- agent/adapters/huggingfacelocal_adapter.py +8 -5
- agent/agent.py +85 -15
- agent/cli.py +9 -3
- agent/computer_handler.py +3 -1
- agent/decorators.py +28 -66
- agent/loops/__init__.py +3 -1
- agent/loops/anthropic.py +200 -84
- agent/loops/base.py +76 -0
- agent/loops/composed_grounded.py +318 -0
- agent/loops/gta1.py +178 -0
- agent/loops/model_types.csv +6 -0
- agent/loops/omniparser.py +178 -84
- agent/loops/openai.py +198 -58
- agent/loops/uitars.py +305 -178
- agent/responses.py +477 -1
- agent/types.py +7 -5
- agent/ui/gradio/app.py +14 -7
- agent/ui/gradio/ui_components.py +18 -1
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/METADATA +3 -3
- cua_agent-0.4.8.dist-info/RECORD +37 -0
- cua_agent-0.4.6.dist-info/RECORD +0 -33
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py
CHANGED
@@ -1,5 +1,7 @@
 """
 OpenAI computer-use-preview agent loop implementation using liteLLM
+Paper: https://arxiv.org/abs/2408.00203
+Code: https://github.com/microsoft/OmniParser
 """
 
 import asyncio
@@ -9,8 +11,9 @@ import litellm
 import inspect
 import base64
 
-from ..decorators import
-from ..types import Messages, AgentResponse, Tools
+from ..decorators import register_agent
+from ..types import Messages, AgentResponse, Tools, AgentCapability
+from ..loops.base import AsyncAgentConfig
 
 SOM_TOOL_SCHEMA = {
     "type": "function",
@@ -246,94 +249,185 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
     return [item]
 
 
-@
-
-
-    model: str,
-    tools: Optional[List[Dict[str, Any]]] = None,
-    max_retries: Optional[int] = None,
-    stream: bool = False,
-    computer_handler=None,
-    use_prompt_caching: Optional[bool] = False,
-    _on_api_start=None,
-    _on_api_end=None,
-    _on_usage=None,
-    _on_screenshot=None,
-    **kwargs
-) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
-    """
-    OpenAI computer-use-preview agent loop using liteLLM responses.
-
-    Supports OpenAI's computer use preview models.
-    """
-    if not OMNIPARSER_AVAILABLE:
-        raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
-
-    tools = tools or []
+@register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
+class OmniparserConfig(AsyncAgentConfig):
+    """Omniparser agent configuration implementing AsyncAgentConfig protocol."""
 
-
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        OpenAI computer-use-preview agent loop using liteLLM responses.
+
+        Supports OpenAI's computer use preview models.
+        """
+        if not OMNIPARSER_AVAILABLE:
+            raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
+
+        tools = tools or []
+
+        llm_model = model.split('+')[-1]
 
-
-
+        # Prepare tools for OpenAI API
+        openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Find last computer_call_output
+        last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
+        if last_computer_call_output:
+            image_url = last_computer_call_output.get("output", {}).get("image_url", "")
+            image_data = image_url.split(",")[-1]
+            if image_data:
+                parser = get_parser()
+                result = parser.parse(image_data)
+                if _on_screenshot:
+                    await _on_screenshot(result.annotated_image_base64, "annotated_image")
+                for element in result.elements:
+                    id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
+
+        # handle computer calls -> function calls
+        new_messages = []
+        for message in messages:
+            if not isinstance(message, dict):
+                message = message.__dict__
+            new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
+        messages = new_messages
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": llm_model,
+            "input": messages,
+            "tools": openai_tools if openai_tools else None,
+            "stream": stream,
+            "truncation": "auto",
+            "num_retries": max_retries,
+            **kwargs
+        }
+
+        # Call API start hook
+        if _on_api_start:
+            await _on_api_start(api_kwargs)
+
+        print(str(api_kwargs)[:1000])
+
+        # Use liteLLM responses
+        response = await litellm.aresponses(**api_kwargs)
 
-
-
+        # Call API end hook
+        if _on_api_end:
+            await _on_api_end(api_kwargs, response)
 
-
-
-
+        # Extract usage information
+        usage = {
+            **response.usage.model_dump(), # type: ignore
+            "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
+        }
+        if _on_usage:
+            await _on_usage(usage)
 
-
-
-
-
-
-
-
+        # handle som function calls -> xy computer calls
+        new_output = []
+        for i in range(len(response.output)): # type: ignore
+            new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
+
+        return {
+            "output": new_output,
+            "usage": usage
+        }
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs
+    ) -> Optional[Tuple[float, float]]:
+        """
+        Predict click coordinates using OmniParser and LLM.
+
+        Uses OmniParser to annotate the image with element IDs, then uses LLM
+        to identify the correct element ID based on the instruction.
+        """
+        if not OMNIPARSER_AVAILABLE:
+            return None
+
+        # Parse the image with OmniParser to get annotated image and elements
+        parser = get_parser()
+        result = parser.parse(image_b64)
+
+        # Extract the LLM model from composed model string
+        llm_model = model.split('+')[-1]
+
+        # Create system prompt for element ID prediction
+        SYSTEM_PROMPT = f'''
+You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.
 
-
-    new_output = []
-    for i in range(len(response.output)):
-        new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)
-    response.output = new_output
+The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.
 
-
+Output only the element ID as a single integer.
+'''.strip()
+
+        # Prepare messages for LLM
+        messages = [
+            {
+                "role": "system",
+                "content": SYSTEM_PROMPT
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{result.annotated_image_base64}"
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": f"Find the element: {instruction}"
+                    }
+                ]
+            }
+        ]
+
+        # Call LLM to predict element ID
+        response = await litellm.acompletion(
+            model=llm_model,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.1
+        )
+
+        # Extract element ID from response
+        response_text = response.choices[0].message.content.strip() # type: ignore
+
+        # Try to parse the element ID
+        try:
+            element_id = int(response_text)
+
+            # Find the element with this ID and return its center coordinates
+            for element in result.elements:
+                if element.id == element_id:
+                    center_x = (element.bbox.x1 + element.bbox.x2) / 2
+                    center_y = (element.bbox.y1 + element.bbox.y2) / 2
+                    return (center_x, center_y)
+        except ValueError:
+            # If we can't parse the ID, return None
+            pass
+
+        return None
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """Return the capabilities supported by this agent."""
+        return ["step"]
agent/loops/openai.py
CHANGED
@@ -3,31 +3,49 @@ OpenAI computer-use-preview agent loop implementation using liteLLM
 """
 
 import asyncio
+import base64
 import json
-from
+from io import BytesIO
+from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
 import litellm
+from PIL import Image
 
-from ..decorators import
-from ..types import Messages, AgentResponse, Tools
+from ..decorators import register_agent
+from ..types import Messages, AgentResponse, Tools, AgentCapability
 
-def _map_computer_tool_to_openai(
+async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     """Map a computer tool to OpenAI's computer-use-preview tool schema"""
+    # Get dimensions from the computer handler
+    try:
+        width, height = await computer_handler.get_dimensions()
+    except Exception:
+        # Fallback to default dimensions if method fails
+        width, height = 1024, 768
+
+    # Get environment from the computer handler
+    try:
+        environment = await computer_handler.get_environment()
+    except Exception:
+        # Fallback to default environment if method fails
+        environment = "linux"
+
     return {
         "type": "computer_use_preview",
-        "display_width":
-        "display_height":
-        "environment":
+        "display_width": width,
+        "display_height": height,
+        "environment": environment # mac, windows, linux, browser
     }
 
 
-def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
+async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
     """Prepare tools for OpenAI API format"""
     openai_tools = []
 
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to OpenAI format
-
+            computer_tool = await _map_computer_tool_to_openai(schema["computer"])
+            openai_tools.append(computer_tool)
         elif schema["type"] == "function":
             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
             # Schema should be: {type, name, description, parameters}
@@ -36,60 +54,182 @@ def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
     return openai_tools
 
 
-@
-
-    messages: Messages,
-    model: str,
-    tools: Optional[List[Dict[str, Any]]] = None,
-    max_retries: Optional[int] = None,
-    stream: bool = False,
-    computer_handler=None,
-    use_prompt_caching: Optional[bool] = False,
-    _on_api_start=None,
-    _on_api_end=None,
-    _on_usage=None,
-    _on_screenshot=None,
-    **kwargs
-) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+@register_agent(models=r".*computer-use-preview.*")
+class OpenAIComputerUseConfig:
     """
-    OpenAI computer-use-preview agent
+    OpenAI computer-use-preview agent configuration using liteLLM responses.
 
     Supports OpenAI's computer use preview models.
     """
-    tools = tools or []
 
-
-
-
-
-
-
-
-
-
-
-
-
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
         **kwargs
-
-
-
-
-
-
-
-
-
-
-
-
+    ) -> Dict[str, Any]:
+        """
+        Predict the next step based on input items.
+
+        Args:
+            messages: Input items following Responses format
+            model: Model name to use
+            tools: Optional list of tool schemas
+            max_retries: Maximum number of retries
+            stream: Whether to stream responses
+            computer_handler: Computer handler instance
+            _on_api_start: Callback for API start
+            _on_api_end: Callback for API end
+            _on_usage: Callback for usage tracking
+            _on_screenshot: Callback for screenshot events
+            **kwargs: Additional arguments
+
+        Returns:
+            Dictionary with "output" (output items) and "usage" array
+        """
+        tools = tools or []
+
+        # Prepare tools for OpenAI API
+        openai_tools = await _prepare_tools_for_openai(tools)
 
-
-
-
-
-
-
-
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "input": messages,
+            "tools": openai_tools if openai_tools else None,
+            "stream": stream,
+            "reasoning": {"summary": "concise"},
+            "truncation": "auto",
+            "num_retries": max_retries,
+            **kwargs
+        }
+
+        # Call API start hook
+        if _on_api_start:
+            await _on_api_start(api_kwargs)
+
+        # Use liteLLM responses
+        response = await litellm.aresponses(**api_kwargs)
+
+        # Call API end hook
+        if _on_api_end:
+            await _on_api_end(api_kwargs, response)
+
+        # Extract usage information
+        usage = {
+            **response.usage.model_dump(),
+            "response_cost": response._hidden_params.get("response_cost", 0.0),
+        }
+        if _on_usage:
+            await _on_usage(usage)
+
+        # Return in the expected format
+        output_dict = response.model_dump()
+        output_dict["usage"] = usage
+        return output_dict
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates based on image and instruction.
+
+        Uses OpenAI computer-use-preview with manually constructed input items
+        and a prompt that instructs the agent to only output clicks.
+
+        Args:
+            model: Model name to use
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # TODO: use computer tool to get dimensions + environment
+        # Manually construct input items with image and click instruction
+        input_items = [
+            {
+                "role": "user",
+                "content": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_image",
+                        "image_url": f"data:image/png;base64,{image_b64}"
+                    }
+                ]
+            }
+        ]
+
+        # Get image dimensions from base64 data
+        try:
+            image_data = base64.b64decode(image_b64)
+            image = Image.open(BytesIO(image_data))
+            display_width, display_height = image.size
+        except Exception:
+            # Fallback to default dimensions if image parsing fails
+            display_width, display_height = 1024, 768
+
+        # Prepare computer tool for click actions
+        computer_tool = {
+            "type": "computer_use_preview",
+            "display_width": display_width,
+            "display_height": display_height,
+            "environment": "windows"
+        }
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "input": input_items,
+            "tools": [computer_tool],
+            "stream": False,
+            "reasoning": {"summary": "concise"},
+            "truncation": "auto",
+            "max_tokens": 100 # Keep response short for click prediction
+        }
+
+        # Use liteLLM responses
+        response = await litellm.aresponses(**api_kwargs)
+
+        # Extract click coordinates from response output
+        output_dict = response.model_dump()
+        output_items = output_dict.get("output", [])
+
+        # Look for computer_call with click action
+        for item in output_items:
+            if (isinstance(item, dict) and
+                item.get("type") == "computer_call" and
+                isinstance(item.get("action"), dict)):
+
+                action = item["action"]
+                if action.get("type") == "click":
+                    x = action.get("x")
+                    y = action.get("y")
+                    if x is not None and y is not None:
+                        return (int(x), int(y))
+
+        return None
 
-
+    def get_capabilities(self) -> List[AgentCapability]:
+        """
+        Get list of capabilities supported by this agent config.
+
+        Returns:
+            List of capability strings
+        """
+        return ["click", "step"]