cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/base.py
CHANGED
```diff
@@ -2,13 +2,15 @@
 Base protocol for async agent configurations
 """
 
-from typing import Protocol, List, Dict, Any, Optional, Tuple, Union
 from abc import abstractmethod
+from typing import Any, Dict, List, Optional, Protocol, Tuple, Union
+
 from ..types import AgentCapability
 
+
 class AsyncAgentConfig(Protocol):
     """Protocol defining the interface for async agent configurations."""
-
+
     @abstractmethod
     async def predict_step(
         self,
@@ -22,11 +24,11 @@ class AsyncAgentConfig(Protocol):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **
+        **generation_config,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format (message, function_call, computer_call)
             model: Model name to use
@@ -38,38 +40,40 @@ class AsyncAgentConfig(Protocol):
             _on_api_end: Callback for API end
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
-            **
-
+            **generation_config: Additional arguments to pass to the model provider
+                - api_key: Optional API key for the provider
+                - api_base: Optional API base URL for the provider
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         ...
-
+
     @abstractmethod
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **generation_config
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+            **generation_config: Additional arguments to pass to the model provider
+                - api_key: Optional API key for the provider
+                - api_base: Optional API base URL for the provider
+
         Returns:
             None or tuple with (x, y) coordinates
         """
         ...
-
+
     @abstractmethod
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings (e.g., ["step", "click"])
         """
```
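The substantive change in this file is that `predict_step` and `predict_click` now accept `**generation_config`, documented as carrying provider options such as `api_key` and `api_base`. As a rough illustration only (none of this code is from the package), the sketch below shows a toy class with the same three methods and a caller forwarding `api_key` through the new keyword arguments; the class name, model string, and coordinates are all made up, and the `predict_step` signature is simplified.

```python
# Hypothetical sketch: a toy stand-in for an AsyncAgentConfig implementation.
# Only the method names, the predict_click signature, and the **generation_config
# idea are taken from the diff above; everything else is invented for illustration.
import asyncio
from typing import Any, Dict, List, Optional, Tuple


class FixedPointConfig:
    """Toy config that always 'clicks' the center of a 1920x1080 screen."""

    async def predict_step(
        self, messages: List[Dict[str, Any]], model: str, **generation_config
    ) -> Dict[str, Any]:
        # A real loop would call a model provider here, forwarding options such
        # as api_key / api_base from **generation_config.
        return {"output": [], "usage": []}

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **generation_config
    ) -> Optional[Tuple[int, int]]:
        # A real loop would ground `instruction` against the screenshot.
        return (960, 540)

    def get_capabilities(self) -> List[str]:
        return ["step", "click"]


async def demo() -> None:
    config = FixedPointConfig()
    coords = await config.predict_click(
        model="example/model",            # placeholder model name
        image_b64="",                     # placeholder screenshot
        instruction="the Submit button",
        api_key="sk-example",             # forwarded via **generation_config
    )
    print(coords, config.get_capabilities())


asyncio.run(demo())
```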
agent/loops/composed_grounded.py
CHANGED
```diff
@@ -3,122 +3,117 @@ Composed-grounded agent loop implementation that combines grounding and thinking
 Uses a two-stage approach: grounding model for element detection, thinking model for reasoning.
 """
 
-import uuid
 import asyncio
-import json
 import base64
-
+import json
+import uuid
 from io import BytesIO
-from
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
+from PIL import Image
 
+from ..agent import find_agent_config
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_computer_calls_xy2desc,
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
     convert_computer_calls_desc2xy,
-
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
+    get_all_element_descriptions,
 )
-from ..
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 GROUNDED_COMPUTER_TOOL_SCHEMA = {
-    … (45 lines of the old schema body, collapsed in this view)
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform (required for all actions)",
+                },
+                "element_description": {
+                    "type": "string",
+                    "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)",
+                },
+                "start_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to start dragging from (required for drag action)",
+                },
+                "end_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to drag to (required for drag action)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type action)",
+                },
+                "keys": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Key(s) to press (required for keypress action)",
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "right", "wheel", "back", "forward"],
+                    "description": "The mouse button to use for click action (required for click and double_click action)",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount for scroll action (required for scroll action)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount for scroll action (required for scroll action)",
+                },
             },
-            "
-        },
-        "button": {
-            "type": "string",
-            "enum": [
-                "left",
-                "right",
-                "wheel",
-                "back",
-                "forward"
-            ],
-            "description": "The mouse button to use for click action (required for click and double_click action)",
-        },
-        "scroll_x": {
-            "type": "integer",
-            "description": "Horizontal scroll amount for scroll action (required for scroll action)",
+            "required": ["action"],
         },
-
-            "type": "integer",
-            "description": "Vertical scroll amount for scroll action (required for scroll action)",
-        },
-    },
-    "required": [
-        "action"
-    ]
-    }
-}
+    },
 }
 
+
 def _prepare_tools_for_grounded(tool_schemas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Prepare tools for grounded API format"""
     grounded_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             grounded_tools.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
         else:
             grounded_tools.append(schema)
-
+
     return grounded_tools
 
+
 def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
     """Get the last computer call output image from messages."""
     for message in reversed(messages):
-        if (
-            message
-
-            message
+        if (
+            isinstance(message, dict)
+            and message.get("type") == "computer_call_output"
+            and isinstance(message.get("output"), dict)
+            and message["output"].get("type") == "input_image"
+        ):
             image_url = message["output"].get("image_url", "")
             if image_url.startswith("data:image/png;base64,"):
                 return image_url.split(",", 1)[1]
@@ -126,17 +121,17 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
 
 
 @register_agent(r".*\+.*", priority=1)
-class ComposedGroundedConfig:
+class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.
-
+
     The model parameter should be in format: "grounding_model+thinking_model"
     e.g., "huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro"
     """
-
+
     def __init__(self):
         self.desc2xy: Dict[str, Tuple[float, float]] = {}
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -150,11 +145,11 @@ class ComposedGroundedConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Composed-grounded predict step implementation.
-
+
         Process:
         0. Store last computer call image, if none then take a screenshot
         1. Convert computer calls from xy to descriptions
@@ -167,18 +162,20 @@ class ComposedGroundedConfig:
         """
         # Parse the composed model
         if "+" not in model:
-            raise ValueError(
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         pre_output_items = []
-
+
         # Step 0: Store last computer call image, if none then take a screenshot
         last_image_b64 = get_last_computer_call_image(messages)
         if last_image_b64 is None:
             # Take a screenshot
-            screenshot_b64 = await computer_handler.screenshot()
+            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
             if screenshot_b64:
-
+
                 call_id = uuid.uuid4().hex
                 pre_output_items += [
                     {
@@ -187,45 +184,42 @@ class ComposedGroundedConfig:
                         "content": [
                             {
                                 "type": "output_text",
-                                "text": "Taking a screenshot to see the current computer screen."
+                                "text": "Taking a screenshot to see the current computer screen.",
                             }
-                        ]
+                        ],
                     },
                     {
-                        "action": {
-                            "type": "screenshot"
-                        },
+                        "action": {"type": "screenshot"},
                         "call_id": call_id,
                         "status": "completed",
-                        "type": "computer_call"
+                        "type": "computer_call",
                     },
                     {
                         "type": "computer_call_output",
                         "call_id": call_id,
                         "output": {
                             "type": "input_image",
-                            "image_url": f"data:image/png;base64,{screenshot_b64}"
-                        }
+                            "image_url": f"data:image/png;base64,{screenshot_b64}",
+                        },
                     },
                 ]
                 last_image_b64 = screenshot_b64
-
+
                 # Call screenshot callback if provided
                 if _on_screenshot:
                     await _on_screenshot(screenshot_b64)
-
-        tool_schemas = _prepare_tools_for_grounded(tools)
+
+        tool_schemas = _prepare_tools_for_grounded(tools)  # type: ignore
 
         # Step 1: Convert computer calls from xy to descriptions
         input_messages = messages + pre_output_items
         messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
-
+
         # Step 2: Convert responses items to completion messages
         completion_messages = convert_responses_items_to_completion_messages(
-            messages_with_descriptions,
-            allow_images_in_tool_results=False
+            messages_with_descriptions, allow_images_in_tool_results=False
         )
-
+
         # Step 3: Call thinking model with litellm.acompletion
         api_kwargs = {
             "model": thinking_model,
@@ -233,98 +227,90 @@ class ComposedGroundedConfig:
             "tools": tool_schemas,
             "max_retries": max_retries,
             "stream": stream,
-            **kwargs
+            **kwargs,
         }
 
         if use_prompt_caching:
             api_kwargs["use_prompt_caching"] = use_prompt_caching
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Make the completion call
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract usage information
         usage = {
-            **response.usage.model_dump(),
+            **response.usage.model_dump(),  # type: ignore
            "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(usage)
-
+
         # Step 4: Convert completion messages back to responses items format
-        response_dict = response.model_dump()
+        response_dict = response.model_dump()  # type: ignore
         choice_messages = [choice["message"] for choice in response_dict["choices"]]
         thinking_output_items = []
-
+
         for choice_message in choice_messages:
-            thinking_output_items.extend(
-
+            thinking_output_items.extend(
+                convert_completion_messages_to_responses_items([choice_message])
+            )
+
         # Step 5: Get all element descriptions and populate desc2xy mapping
         element_descriptions = get_all_element_descriptions(thinking_output_items)
-
+
         if element_descriptions and last_image_b64:
             # Use grounding model to predict coordinates for each description
             grounding_agent_conf = find_agent_config(grounding_model)
             if grounding_agent_conf:
                 grounding_agent = grounding_agent_conf.agent_class()
-
+
                 for desc in element_descriptions:
-                    for _ in range(3):
+                    for _ in range(3):  # try 3 times
                         coords = await grounding_agent.predict_click(
-                            model=grounding_model,
-                            image_b64=last_image_b64,
-                            instruction=desc
+                            model=grounding_model, image_b64=last_image_b64, instruction=desc
                         )
                         if coords:
                             self.desc2xy[desc] = coords
                             break
-
+
         # Step 6: Convert computer calls from descriptions back to xy coordinates
         final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
-
+
         # Step 7: Return output and usage
-        return {
-
-            "usage": usage
-        }
-
+        return {"output": pre_output_items + final_output_items, "usage": usage}
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using the grounding model.
-
+
         For composed models, uses only the grounding model part for click prediction.
         """
         # Parse the composed model to get grounding model
         if "+" not in model:
-            raise ValueError(
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         # Find and use the grounding agent
         grounding_agent_conf = find_agent_config(grounding_model)
         if grounding_agent_conf:
             grounding_agent = grounding_agent_conf.agent_class()
             return await grounding_agent.predict_click(
-                model=grounding_model,
-                image_b64=image_b64,
-                instruction=instruction,
-                **kwargs
+                model=grounding_model, image_b64=image_b64, instruction=instruction, **kwargs
             )
-
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
         return ["click", "step"]
```
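To make the composed flow concrete, here is a hedged usage sketch: the `+` in the model string is split into a grounding model and a thinking model, and `predict_click` exercises only the grounding half, as the diff above shows. The model string is the example from the class docstring; running this for real would require the corresponding grounding loop and weights to be available, so treat it as a sketch rather than a verified recipe.

```python
# Hedged sketch of calling the composed-grounded loop above.
# The model string is the docstring example; the screenshot path is a placeholder.
import asyncio
import base64

from agent.loops.composed_grounded import ComposedGroundedConfig


async def locate(image_path: str) -> None:
    config = ComposedGroundedConfig()
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode()

    # The "+" splits the string into grounding_model and thinking_model;
    # predict_click delegates to the grounding half only.
    coords = await config.predict_click(
        model="huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro",
        image_b64=image_b64,
        instruction="close button in top right corner",
    )
    print(coords)  # None, or an (x, y) tuple


# asyncio.run(locate("screenshot.png"))  # needs the grounding model available locally
```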