cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/base.py
CHANGED

@@ -2,13 +2,15 @@
 Base protocol for async agent configurations
 """
 
-from typing import Protocol, List, Dict, Any, Optional, Tuple, Union
 from abc import abstractmethod
+from typing import Any, Dict, List, Optional, Protocol, Tuple, Union
+
 from ..types import AgentCapability
 
+
 class AsyncAgentConfig(Protocol):
     """Protocol defining the interface for async agent configurations."""
-
+
     @abstractmethod
     async def predict_step(
         self,
@@ -22,11 +24,11 @@ class AsyncAgentConfig(Protocol):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **
+        **generation_config,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format (message, function_call, computer_call)
             model: Model name to use
@@ -38,38 +40,40 @@ class AsyncAgentConfig(Protocol):
             _on_api_end: Callback for API end
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
-            **
-
+            **generation_config: Additional arguments to pass to the model provider
+                - api_key: Optional API key for the provider
+                - api_base: Optional API base URL for the provider
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         ...
-
+
     @abstractmethod
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **generation_config
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+            **generation_config: Additional arguments to pass to the model provider
+                - api_key: Optional API key for the provider
+                - api_base: Optional API base URL for the provider
+
         Returns:
             None or tuple with (x, y) coordinates
         """
         ...
-
+
     @abstractmethod
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings (e.g., ["step", "click"])
         """
agent/loops/composed_grounded.py
CHANGED

@@ -3,112 +3,117 @@ Composed-grounded agent loop implementation that combines grounding and thinking
 Uses a two-stage approach: grounding model for element detection, thinking model for reasoning.
 """
 
-import uuid
 import asyncio
-import json
 import base64
-
+import json
+import uuid
 from io import BytesIO
-from
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
+from PIL import Image
 
+from ..agent import find_agent_config
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_computer_calls_xy2desc,
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
     convert_computer_calls_desc2xy,
-
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
+    get_all_element_descriptions,
 )
-from ..
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 GROUNDED_COMPUTER_TOOL_SCHEMA = {
-    … (old lines 28-63 were not captured in the source view)
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform (required for all actions)",
+                },
+                "element_description": {
+                    "type": "string",
+                    "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)",
+                },
+                "start_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to start dragging from (required for drag action)",
+                },
+                "end_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to drag to (required for drag action)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type action)",
+                },
+                "keys": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Key(s) to press (required for keypress action)",
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "right", "wheel", "back", "forward"],
+                    "description": "The mouse button to use for click action (required for click and double_click action)",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount for scroll action (required for scroll action)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount for scroll action (required for scroll action)",
+                },
+            },
+            "required": ["action"],
         },
-
-            "type": "string",
-            "description": "The text to type (required for type action)"
-        },
-        "keys": {
-            "type": "string",
-            "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
-        },
-        "button": {
-            "type": "string",
-            "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
-        },
-        "scroll_x": {
-            "type": "integer",
-            "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
-        },
-        "scroll_y": {
-            "type": "integer",
-            "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
-        },
-    },
-    "required": [
-        "action"
-    ]
-    }
-}
+    },
 }
 
+
 def _prepare_tools_for_grounded(tool_schemas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Prepare tools for grounded API format"""
     grounded_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             grounded_tools.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
         else:
             grounded_tools.append(schema)
-
+
     return grounded_tools
 
+
 def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
     """Get the last computer call output image from messages."""
     for message in reversed(messages):
-        if (
-            message
-
-            message
+        if (
+            isinstance(message, dict)
+            and message.get("type") == "computer_call_output"
+            and isinstance(message.get("output"), dict)
+            and message["output"].get("type") == "input_image"
+        ):
             image_url = message["output"].get("image_url", "")
             if image_url.startswith("data:image/png;base64,"):
                 return image_url.split(",", 1)[1]
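To make the reconstructed schema concrete, here are a few argument payloads that conform to GROUNDED_COMPUTER_TOOL_SCHEMA; the field names and constraints come from the schema above, while the values are invented for illustration.

click_args = {
    "action": "click",                           # required for every action
    "element_description": "red submit button",  # required for click/double_click/move/scroll
    "button": "left",                            # one of: left, right, wheel, back, forward
}

drag_args = {
    "action": "drag",
    "start_element_description": "file icon on the desktop",
    "end_element_description": "trash can in the corner",
}

keypress_args = {
    "action": "keypress",
    "keys": ["ctrl", "c"],  # now an array of strings; the old schema took a single "ctrl+c" combo string
}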
@@ -116,17 +121,17 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str
 
 
 @register_agent(r".*\+.*", priority=1)
-class ComposedGroundedConfig:
+class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.
-
+
     The model parameter should be in format: "grounding_model+thinking_model"
     e.g., "huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro"
     """
-
+
     def __init__(self):
         self.desc2xy: Dict[str, Tuple[float, float]] = {}
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -140,11 +145,11 @@ class ComposedGroundedConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Composed-grounded predict step implementation.
-
+
         Process:
         0. Store last computer call image, if none then take a screenshot
         1. Convert computer calls from xy to descriptions
@@ -157,18 +162,20 @@
         """
         # Parse the composed model
         if "+" not in model:
-            raise ValueError(
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         pre_output_items = []
-
+
         # Step 0: Store last computer call image, if none then take a screenshot
         last_image_b64 = get_last_computer_call_image(messages)
         if last_image_b64 is None:
             # Take a screenshot
-            screenshot_b64 = await computer_handler.screenshot()
+            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
             if screenshot_b64:
-
+
                 call_id = uuid.uuid4().hex
                 pre_output_items += [
                     {
@@ -177,45 +184,42 @@
                         "content": [
                             {
                                 "type": "output_text",
-                                "text": "Taking a screenshot to see the current computer screen."
+                                "text": "Taking a screenshot to see the current computer screen.",
                             }
-                        ]
+                        ],
                     },
                     {
-                        "action": {
-                            "type": "screenshot"
-                        },
+                        "action": {"type": "screenshot"},
                         "call_id": call_id,
                         "status": "completed",
-                        "type": "computer_call"
+                        "type": "computer_call",
                     },
                     {
                         "type": "computer_call_output",
                         "call_id": call_id,
                         "output": {
                             "type": "input_image",
-                            "image_url": f"data:image/png;base64,{screenshot_b64}"
-                        }
+                            "image_url": f"data:image/png;base64,{screenshot_b64}",
+                        },
                     },
                 ]
                 last_image_b64 = screenshot_b64
-
+
                 # Call screenshot callback if provided
                 if _on_screenshot:
                     await _on_screenshot(screenshot_b64)
-
-        tool_schemas = _prepare_tools_for_grounded(tools)
+
+        tool_schemas = _prepare_tools_for_grounded(tools)  # type: ignore
 
         # Step 1: Convert computer calls from xy to descriptions
         input_messages = messages + pre_output_items
         messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
-
+
         # Step 2: Convert responses items to completion messages
         completion_messages = convert_responses_items_to_completion_messages(
-            messages_with_descriptions,
-            allow_images_in_tool_results=False
+            messages_with_descriptions, allow_images_in_tool_results=False
         )
-
+
         # Step 3: Call thinking model with litellm.acompletion
         api_kwargs = {
             "model": thinking_model,
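The screenshot bootstrap above appends exactly the computer_call_output shape that get_last_computer_call_image matches on a later step. A small self-contained illustration (the import path follows the file layout, and the call id and base64 payload are placeholders):

from agent.loops.composed_grounded import get_last_computer_call_image

messages = [
    {
        "type": "computer_call_output",
        "call_id": "abc123",  # placeholder call id
        "output": {
            "type": "input_image",
            "image_url": "data:image/png;base64,iVBORw0...",  # placeholder data
        },
    },
]

# Returns the base64 part after the data-URL prefix, or None if no
# screenshot output is present in the messages.
assert get_last_computer_call_image(messages) == "iVBORw0..."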
@@ -223,96 +227,90 @@
             "tools": tool_schemas,
             "max_retries": max_retries,
             "stream": stream,
-            **kwargs
+            **kwargs,
         }
 
         if use_prompt_caching:
             api_kwargs["use_prompt_caching"] = use_prompt_caching
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Make the completion call
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract usage information
         usage = {
-            **response.usage.model_dump(),
+            **response.usage.model_dump(),  # type: ignore
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(usage)
-
+
         # Step 4: Convert completion messages back to responses items format
-        response_dict = response.model_dump()
+        response_dict = response.model_dump()  # type: ignore
         choice_messages = [choice["message"] for choice in response_dict["choices"]]
         thinking_output_items = []
-
+
         for choice_message in choice_messages:
-            thinking_output_items.extend(
-
+            thinking_output_items.extend(
+                convert_completion_messages_to_responses_items([choice_message])
+            )
+
         # Step 5: Get all element descriptions and populate desc2xy mapping
         element_descriptions = get_all_element_descriptions(thinking_output_items)
-
+
         if element_descriptions and last_image_b64:
             # Use grounding model to predict coordinates for each description
             grounding_agent_conf = find_agent_config(grounding_model)
             if grounding_agent_conf:
                 grounding_agent = grounding_agent_conf.agent_class()
-
+
                 for desc in element_descriptions:
-                    … (old lines 269-276 were not captured in the source view)
+                    for _ in range(3):  # try 3 times
+                        coords = await grounding_agent.predict_click(
+                            model=grounding_model, image_b64=last_image_b64, instruction=desc
+                        )
+                        if coords:
+                            self.desc2xy[desc] = coords
+                            break
+
         # Step 6: Convert computer calls from descriptions back to xy coordinates
         final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
-
+
         # Step 7: Return output and usage
-        return {
-
-            "usage": usage
-        }
-
+        return {"output": pre_output_items + final_output_items, "usage": usage}
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using the grounding model.
-
+
         For composed models, uses only the grounding model part for click prediction.
         """
         # Parse the composed model to get grounding model
         if "+" not in model:
-            raise ValueError(
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         # Find and use the grounding agent
         grounding_agent_conf = find_agent_config(grounding_model)
         if grounding_agent_conf:
             grounding_agent = grounding_agent_conf.agent_class()
             return await grounding_agent.predict_click(
-                model=grounding_model,
-                image_b64=image_b64,
-                instruction=instruction,
-                **kwargs
+                model=grounding_model, image_b64=image_b64, instruction=instruction, **kwargs
             )
-
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
         return ["click", "step"]