openadapt-ml 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -115
- openadapt_ml/benchmarks/agent.py +265 -421
- openadapt_ml/benchmarks/azure.py +28 -19
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1722 -4847
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +22 -5
- openadapt_ml/benchmarks/vm_monitor.py +530 -29
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +2038 -487
- openadapt_ml/cloud/ssh_tunnel.py +68 -26
- openadapt_ml/datasets/next_action.py +40 -30
- openadapt_ml/evals/grounding.py +8 -3
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +41 -26
- openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
- openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/runner.py +29 -14
- openadapt_ml/export/parquet.py +36 -24
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +8 -6
- openadapt_ml/ingest/capture.py +25 -22
- openadapt_ml/ingest/loader.py +7 -4
- openadapt_ml/ingest/synthetic.py +189 -100
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/demo_retriever.py +50 -24
- openadapt_ml/retrieval/embeddings.py +9 -8
- openadapt_ml/retrieval/retriever.py +3 -1
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +18 -5
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +9 -0
- openadapt_ml/schema/converters.py +74 -27
- openadapt_ml/schema/episode.py +31 -18
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +85 -54
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +15 -9
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +3 -1
- openadapt_ml/scripts/train.py +21 -9
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +52 -41
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +143 -86
- openadapt_ml/training/trl_trainer.py +70 -21
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
- openadapt_ml-0.2.2.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/live_tracker.py +0 -180
- openadapt_ml/benchmarks/runner.py +0 -418
- openadapt_ml/benchmarks/waa.py +0 -761
- openadapt_ml/benchmarks/waa_live.py +0 -619
- openadapt_ml-0.2.0.dist-info/RECORD +0 -86
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,922 @@
|
|
|
1
|
+
"""Response parsing for baseline adapters.
|
|
2
|
+
|
|
3
|
+
Extracts structured actions from VLM responses with support for:
|
|
4
|
+
- JSON format extraction
|
|
5
|
+
- Function-call syntax (CLICK(x, y))
|
|
6
|
+
- PyAutoGUI format (OSWorld compatible)
|
|
7
|
+
- UFO format (Observation/Thought/ControlLabel)
|
|
8
|
+
- Element ID to coordinate normalization
|
|
9
|
+
- Robust fallback parsing
|
|
10
|
+
|
|
11
|
+
Based on patterns from:
|
|
12
|
+
- Claude Computer Use
|
|
13
|
+
- OSWorld benchmark
|
|
14
|
+
- Microsoft UFO
|
|
15
|
+
- Agent-S
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import re
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from typing import TYPE_CHECKING, Any
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from openadapt_ml.baselines.config import ScreenConfig
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class UIElement:
    """A single UI element together with its optional bounding box.

    Instances are what the registry stores so that an action expressed as
    an element ID can later be turned into a coordinate action.
    """

    element_id: int | str
    role: str = ""
    name: str = ""
    bbox: tuple[float, float, float, float] | None = None  # (x1, y1, x2, y2)

    @property
    def center(self) -> tuple[float, float] | None:
        """Return the midpoint of ``bbox`` as ``(x, y)``, or ``None`` without a bbox."""
        box = self.bbox
        if box is not None:
            left, top, right, bottom = box
            return ((left + right) / 2, (top + bottom) / 2)
        return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class ElementRegistry:
    """Registry of UI elements for element_id to coordinate conversion.

    Used by the parser to convert Track C (SoM) element IDs to
    Track A coordinates when needed.

    Attributes:
        elements: Mapping from integer element ID to its ``UIElement``.
        screen_width: Screen width in pixels, used for pixel<->normalized
            conversion.
        screen_height: Screen height in pixels, used for pixel<->normalized
            conversion.
        is_normalized: Whether stored bbox coordinates are normalized (0-1)
            rather than pixels.
    """

    elements: dict[int, UIElement] = field(default_factory=dict)
    screen_width: int = 1920
    screen_height: int = 1080
    is_normalized: bool = True  # Whether bbox coordinates are normalized (0-1)

    def add_element(
        self,
        element_id: int | str,
        bbox: tuple[float, float, float, float],
        role: str = "",
        name: str = "",
    ) -> None:
        """Add an element to the registry, replacing any entry with the same ID.

        Args:
            element_id: Element ID; a string is converted with ``int()``.
            bbox: Bounding box as (x1, y1, x2, y2).
            role: Optional role of the element.
            name: Optional name of the element.

        Raises:
            ValueError: If ``element_id`` is a string that ``int()`` rejects.
        """
        eid = int(element_id) if isinstance(element_id, str) else element_id
        self.elements[eid] = UIElement(element_id=eid, role=role, name=name, bbox=bbox)

    def get_element(self, element_id: int) -> UIElement | None:
        """Get element by ID, or None when the ID is not registered."""
        return self.elements.get(element_id)

    def get_center_coords(
        self, element_id: int, normalize: bool = True
    ) -> tuple[float, float] | None:
        """Get center coordinates for an element.

        Args:
            element_id: Element ID to look up.
            normalize: Whether to return normalized (0-1) coordinates.

        Returns:
            (x, y) center coordinates, or None if the element is not found
            or has no bounding box.
        """
        element = self.get_element(element_id)
        if element is None or element.bbox is None:
            return None

        center = element.center
        if center is None:
            return None

        x, y = center

        # Convert only when the stored space differs from the requested one.
        if self.is_normalized and not normalize:
            x = x * self.screen_width
            y = y * self.screen_height
        elif not self.is_normalized and normalize:
            x = x / self.screen_width
            y = y / self.screen_height

        return (x, y)

    def _register_node(self, node: dict[str, Any]) -> None:
        """Register one tree node if it has a usable ID and bounding box."""
        node_id = node.get("id", node.get("node_id", node.get("element_id")))
        if node_id is None:
            return

        bbox = node.get("bbox", node.get("bounds"))
        # Only sequence-style boxes can be sliced into (x1, y1, x2, y2);
        # dict-shaped bounds are skipped instead of raising on ``bbox[:4]``.
        if not isinstance(bbox, (list, tuple)) or len(bbox) < 4:
            return

        try:
            # Accept IDs written as 17, "17", "e17" or "[17]".
            eid = int(str(node_id).replace("e", "").replace("[", "").replace("]", ""))
        except (ValueError, TypeError):
            # Best effort: nodes whose ID is not numeric are ignored.
            return

        self.add_element(
            element_id=eid,
            bbox=tuple(bbox[:4]),
            role=node.get("role", ""),
            name=node.get("name", ""),
        )

    @classmethod
    def from_a11y_tree(
        cls,
        tree: dict[str, Any] | list[dict[str, Any]],
        screen_width: int = 1920,
        screen_height: int = 1080,
        is_normalized: bool = True,
    ) -> "ElementRegistry":
        """Build registry from accessibility tree.

        Nodes are visited depth-first in document order, so when two nodes
        share an ID the later one wins. Nodes without an ID, without a
        4-element list/tuple bbox, or with a non-numeric ID are skipped.

        Args:
            tree: Accessibility tree as dict or list of element dicts.
            screen_width: Screen width for coordinate conversion.
            screen_height: Screen height for coordinate conversion.
            is_normalized: Whether the bboxes in ``tree`` are normalized (0-1).
                Pass False for pixel-space trees so conversions are correct.

        Returns:
            ElementRegistry with all elements from tree.
        """
        registry = cls(
            screen_width=screen_width,
            screen_height=screen_height,
            is_normalized=is_normalized,
        )

        if isinstance(tree, dict):
            pending = [tree]
        elif isinstance(tree, list):
            pending = [node for node in reversed(tree) if isinstance(node, dict)]
        else:
            pending = []

        # Explicit stack instead of recursion: deep trees cannot hit the
        # interpreter recursion limit. Children are pushed reversed so they
        # are popped in their original order.
        while pending:
            node = pending.pop()
            registry._register_node(node)
            children = node.get("children") or []  # tolerate "children": null
            if isinstance(children, (list, tuple)):
                pending.extend(
                    child for child in reversed(children) if isinstance(child, dict)
                )

        return registry
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@dataclass
class ParsedAction:
    """Parsed action from model response.

    Attributes:
        action_type: Action type (click, type, key, scroll, done, wait, fail, unknown).
        x: X coordinate (normalized 0-1) for coordinate actions.
        y: Y coordinate (normalized 0-1) for coordinate actions.
        element_id: Element ID for SoM actions.
        text: Text content for type actions.
        key: Key name for key actions.
        modifiers: Key modifiers (ctrl, shift, alt) for key/hotkey actions.
        direction: Scroll direction for scroll actions.
        amount: Scroll amount for scroll actions.
        observation: Observed state description (ReAct/UFO format).
        thought: Reasoning text (ReAct/UFO format).
        plan: Multi-step plan (UFO format).
        status: Execution status (UFO format: CONTINUE, FINISH, ERROR).
        raw_response: Original model response.
        parse_error: Error message if parsing failed.
        confidence: Parser confidence score (0-1).
        metadata: Additional parsed data.
    """

    action_type: str
    x: float | None = None
    y: float | None = None
    element_id: int | None = None
    text: str | None = None
    key: str | None = None
    modifiers: list[str] | None = None
    direction: str | None = None
    amount: int | None = None
    observation: str | None = None
    thought: str | None = None
    plan: list[str] | None = None
    status: str | None = None
    raw_response: str | None = None
    parse_error: str | None = None
    confidence: float = 1.0
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def is_valid(self) -> bool:
        """Check if the action was successfully parsed."""
        return self.parse_error is None and self.action_type != "unknown"

    @property
    def is_terminal(self) -> bool:
        """Check if this action terminates the episode."""
        return self.action_type in ("done", "fail")

    @property
    def has_coordinates(self) -> bool:
        """Check if action has coordinate data."""
        return self.x is not None and self.y is not None

    @property
    def has_element_id(self) -> bool:
        """Check if action has element ID."""
        return self.element_id is not None

    def to_dict(self) -> dict[str, Any]:
        """Convert to action dictionary for benchmark integration.

        Only populated fields are emitted. ``modifiers`` and ``plan`` are
        omitted when empty; every other field is omitted only when ``None``.
        ``raw_response``, ``parse_error``, ``confidence`` and ``metadata``
        are never included.
        """
        result: dict[str, Any] = {"type": self.action_type}

        # (field name, omit-when-falsy) in the order keys are emitted.
        layout = (
            ("x", False),
            ("y", False),
            ("element_id", False),
            ("text", False),
            ("key", False),
            ("modifiers", True),
            ("direction", False),
            ("amount", False),
            ("observation", False),
            ("thought", False),
            ("plan", True),
            ("status", False),
        )
        for name, skip_if_falsy in layout:
            value = getattr(self, name)
            if value is None or (skip_if_falsy and not value):
                continue
            result[name] = value

        return result

    def to_pyautogui(
        self,
        screen_width: int = 1920,
        screen_height: int = 1080,
    ) -> str:
        """Convert to PyAutoGUI code string.

        String arguments are rendered with ``repr`` so quotes, backslashes
        and newlines in the text cannot break or alter the generated code.

        Args:
            screen_width: Screen width for coordinate conversion.
            screen_height: Screen height for coordinate conversion.

        Returns:
            PyAutoGUI code string, one of the sentinels ``DONE`` / ``WAIT`` /
            ``FAIL``, or a ``#`` comment when the action cannot be rendered.
        """
        kind = self.action_type

        if kind == "click":
            if self.x is not None and self.y is not None:
                px = int(self.x * screen_width)
                py = int(self.y * screen_height)
                # Keep the double-click flag the PyAutoGUI parser records.
                func = "doubleClick" if self.metadata.get("double_click") else "click"
                return f"pyautogui.{func}({px}, {py})"
            if self.element_id is not None:
                return f"# CLICK element {self.element_id} (needs coordinate conversion)"
            # A click with no target falls through to the generic comment.
        elif kind == "type":
            text = str(self.text or "")
            return f"pyautogui.write({text!r})"
        elif kind == "key":
            key = str(self.key or "")
            if self.modifiers:
                keys = ", ".join(repr(str(k)) for k in [*self.modifiers, key])
                return f"pyautogui.hotkey({keys})"
            return f"pyautogui.press({key!r})"
        elif kind == "scroll":
            direction = str(self.direction or "down").lower()
            amount = self.amount or 3
            # Horizontal scrolling has its own call; negative means left.
            if direction == "left":
                return f"pyautogui.hscroll({-amount})"
            if direction == "right":
                return f"pyautogui.hscroll({amount})"
            clicks = -amount if direction == "down" else amount
            return f"pyautogui.scroll({clicks})"
        elif kind == "done":
            return "DONE"
        elif kind == "wait":
            return "WAIT"
        elif kind == "fail":
            return "FAIL"

        return f"# Unknown action: {self.action_type}"

    def with_coordinates(
        self,
        x: float,
        y: float,
        source: str = "conversion",
    ) -> "ParsedAction":
        """Create a copy with coordinates added.

        Useful for converting element_id actions to coordinate actions.
        All other fields are carried over unchanged; ``metadata`` is copied
        and gains a ``coord_source`` entry.

        Args:
            x: X coordinate (normalized 0-1).
            y: Y coordinate (normalized 0-1).
            source: Source of coordinates for metadata.

        Returns:
            New ParsedAction with coordinates.
        """
        return ParsedAction(
            action_type=self.action_type,
            x=x,
            y=y,
            element_id=self.element_id,
            text=self.text,
            key=self.key,
            modifiers=self.modifiers,
            direction=self.direction,
            amount=self.amount,
            observation=self.observation,
            thought=self.thought,
            plan=self.plan,
            status=self.status,
            raw_response=self.raw_response,
            parse_error=self.parse_error,
            confidence=self.confidence,
            metadata={**self.metadata, "coord_source": source},
        )
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
class UnifiedResponseParser:
|
|
346
|
+
"""Parser for VLM responses across all tracks and formats.
|
|
347
|
+
|
|
348
|
+
Supports:
|
|
349
|
+
- JSON format: {"action": "CLICK", "x": 0.5, "y": 0.3}
|
|
350
|
+
- Function format: CLICK(0.5, 0.3) or CLICK([17])
|
|
351
|
+
- PyAutoGUI format: pyautogui.click(960, 540)
|
|
352
|
+
- UFO format: {"Observation": ..., "Thought": ..., "ControlLabel": 17}
|
|
353
|
+
- Mixed format: ReAct-style with thought + action
|
|
354
|
+
|
|
355
|
+
Example:
|
|
356
|
+
parser = UnifiedResponseParser()
|
|
357
|
+
action = parser.parse('{"action": "CLICK", "x": 0.5, "y": 0.3}')
|
|
358
|
+
print(action.x, action.y) # 0.5, 0.3
|
|
359
|
+
|
|
360
|
+
# With element registry for ID->coordinate conversion
|
|
361
|
+
registry = ElementRegistry.from_a11y_tree(tree)
|
|
362
|
+
parser = UnifiedResponseParser(element_registry=registry)
|
|
363
|
+
action = parser.parse('{"action": "CLICK", "element_id": 17}')
|
|
364
|
+
action = parser.resolve_element_id(action)
|
|
365
|
+
print(action.x, action.y) # Converted coordinates
|
|
366
|
+
"""
|
|
367
|
+
|
|
368
|
+
def __init__(
|
|
369
|
+
self,
|
|
370
|
+
element_registry: ElementRegistry | None = None,
|
|
371
|
+
screen_config: "ScreenConfig | None" = None,
|
|
372
|
+
normalize_coordinates: bool = True,
|
|
373
|
+
):
|
|
374
|
+
"""Initialize parser.
|
|
375
|
+
|
|
376
|
+
Args:
|
|
377
|
+
element_registry: Optional registry for element_id conversion.
|
|
378
|
+
screen_config: Optional screen configuration for coordinate handling.
|
|
379
|
+
normalize_coordinates: Whether to normalize coordinates to 0-1.
|
|
380
|
+
"""
|
|
381
|
+
self.element_registry = element_registry
|
|
382
|
+
self.screen_config = screen_config
|
|
383
|
+
self.normalize_coordinates = normalize_coordinates
|
|
384
|
+
|
|
385
|
+
# Default screen dimensions
|
|
386
|
+
self._screen_width = screen_config.width if screen_config else 1920
|
|
387
|
+
self._screen_height = screen_config.height if screen_config else 1080
|
|
388
|
+
|
|
389
|
+
def parse(self, response: str) -> ParsedAction:
    """Turn a raw model response into a structured action.

    Strategies are attempted in a fixed order and the first one that
    yields a valid action wins:
        1. JSON extraction (most reliable)
        2. PyAutoGUI code patterns
        3. Function-style patterns (CLICK, TYPE, etc.)
        4. Special keywords (DONE, WAIT, FAIL)

    Args:
        response: Raw model response string.

    Returns:
        ParsedAction with extracted fields. When the response is empty or
        no strategy succeeds, the action type is "unknown" and
        ``parse_error`` explains why.
    """
    if not response:
        return ParsedAction(
            action_type="unknown",
            raw_response=response,
            parse_error="Empty response",
        )

    cleaned = response.strip()

    # Looked up by name one at a time so a strategy is only touched when
    # every earlier one has failed.
    strategy_names = (
        "_try_json_parse",
        "_try_pyautogui_parse",
        "_try_regex_parse",
        "_try_keyword_parse",
    )
    for strategy_name in strategy_names:
        candidate = getattr(self, strategy_name)(cleaned)
        if candidate.is_valid:
            candidate.raw_response = cleaned
            return candidate

    # Nothing matched: report an unknown action with zero confidence.
    return ParsedAction(
        action_type="unknown",
        raw_response=cleaned,
        parse_error="No action pattern found in response",
        confidence=0.0,
    )
|
|
444
|
+
|
|
445
|
+
def _try_json_parse(self, response: str) -> ParsedAction:
    """Try to extract and parse JSON from response.

    Candidate objects are located with the regex patterns below, in order.
    If none of them produces a valid action, every ``{`` in the response is
    tried as the start of a JSON document, which covers objects nested
    deeper than the regexes can express.

    A candidate that decodes but cannot be converted (wrong value types,
    non-object JSON) is skipped rather than allowed to raise, so a
    malformed payload never aborts parsing.

    Args:
        response: Model response text that may contain a JSON object.

    Returns:
        The first valid action found, otherwise an "unknown" action whose
        ``parse_error`` is "No valid JSON found".
    """
    json_patterns = [
        r"```json\s*(\{[^`]*\})\s*```",  # Markdown code block
        r"```\s*(\{[^`]*\})\s*```",  # Plain code block
        r"(\{[^{}]*\})",  # Simple JSON object
        r"(\{[^{}]*\{[^{}]*\}[^{}]*\})",  # Nested JSON (max 1 level)
    ]

    def convert(data: Any) -> ParsedAction | None:
        """Return a valid action built from decoded JSON, else None."""
        if not isinstance(data, dict):
            return None
        try:
            action = self._dict_to_action(data)
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # Decodable JSON with unusable values (e.g. a numeric "action"
            # or non-numeric coordinates); try the next candidate.
            return None
        return action if action.is_valid else None

    for pattern in json_patterns:
        for candidate in re.findall(pattern, response, re.DOTALL):
            try:
                data = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            action = convert(data)
            if action is not None:
                return action

    # Fallback for arbitrarily nested objects: let the JSON decoder find
    # the end of the document starting at each opening brace.
    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            data, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            pass
        else:
            action = convert(data)
            if action is not None:
                return action
        start = response.find("{", start + 1)

    return ParsedAction(action_type="unknown", parse_error="No valid JSON found")
|
|
467
|
+
|
|
468
|
+
def _dict_to_action(self, data: dict[str, Any]) -> ParsedAction:
    """Convert parsed dict to ParsedAction.

    Handles multiple formats:
    - Standard: {"action": "CLICK", "x": 0.5, "y": 0.3}
    - UFO: {"Observation": ..., "Thought": ..., "ControlLabel": 17}
    - ReAct: {"observation": ..., "thought": ..., "action": "CLICK"}

    The ReAct/UFO context fields (observation, thought, plan, status) are
    attached to every returned action. "done" and "fail" actions always
    report status "FINISH" and "ERROR" respectively.

    Args:
        data: Decoded JSON object.

    Returns:
        The parsed action. Unusable input yields an action whose
        ``parse_error`` is set; an unrecognized action type yields
        ``action_type="unknown"``.
    """
    if not isinstance(data, dict):
        return ParsedAction(
            action_type="unknown",
            parse_error=f"Expected a JSON object, got {type(data).__name__}",
        )

    def first_present(*keys: str) -> Any:
        """Return the first non-None value among ``keys``, else None."""
        for name in keys:
            value = data.get(name)
            if value is not None:
                return value
        return None

    # Extract ReAct/UFO fields first; they accompany every result.
    common: dict[str, Any] = {
        "observation": first_present("observation", "Observation"),
        "thought": first_present("thought", "Thought"),
        "plan": first_present("plan", "Plan"),
        "status": first_present("status", "Status"),
    }

    # Get action type (handle various key names). Non-string values are
    # ignored so a nested object under "action" cannot crash ``.lower()``.
    action_type = ""
    for name in ("action", "type", "Function"):  # "Function" is UFO format
        value = data.get(name)
        if isinstance(value, str) and value.strip():
            action_type = value.strip().lower()
            break

    # UFO positional arguments, normalized to a list.
    raw_args = first_present("Args", "args")
    if isinstance(raw_args, (list, tuple)):
        args = list(raw_args)
    elif isinstance(raw_args, str):
        args = [raw_args]
    else:
        args = []

    # Handle UFO ControlLabel as element click
    control_label = first_present("ControlLabel", "control_label")
    if control_label is not None and not action_type:
        action_type = "click"

    if action_type == "click":
        # Check for element_id first (SoM/UFO); either spelling of the
        # control label is accepted as the element reference.
        element_id = data.get("element_id")
        if element_id is None:
            element_id = control_label
        if element_id is not None:
            return ParsedAction(
                action_type="click",
                element_id=self._normalize_element_id(element_id),
                **common,
            )

        # Then check for coordinates: separate x/y keys, or an array.
        raw_xy = None
        if "x" in data and "y" in data:
            raw_xy = (data["x"], data["y"])
        else:
            coords = data.get("coordinate")
            if isinstance(coords, (list, tuple)) and len(coords) >= 2:
                raw_xy = (coords[0], coords[1])

        if raw_xy is None:
            return ParsedAction(
                action_type="click",
                parse_error="CLICK missing coordinates or element_id",
                **common,
            )

        try:
            raw_x, raw_y = float(raw_xy[0]), float(raw_xy[1])
        except (TypeError, ValueError):
            return ParsedAction(
                action_type="click",
                parse_error="CLICK has non-numeric coordinates",
                **common,
            )

        x, y = self._normalize_coords(raw_x, raw_y)
        return ParsedAction(action_type="click", x=x, y=y, **common)

    if action_type in ("type", "input_text", "write"):
        text = data.get("text", "")
        # Handle UFO Args format
        if not text and args:
            text = args[0]
        return ParsedAction(action_type="type", text=text, **common)

    if action_type in ("key", "press", "hotkey"):
        key = data.get("key", "")
        modifiers = data.get("modifiers", [])
        if isinstance(modifiers, str):
            modifiers = [modifiers]
        elif not isinstance(modifiers, (list, tuple)):
            modifiers = []

        # Handle UFO Args format for hotkey: last entry is the key, any
        # preceding entries are modifiers.
        if args and not key:
            if len(args) == 1:
                key = args[0]
            else:
                modifiers = args[:-1]
                key = args[-1]

        return ParsedAction(
            action_type="key",
            key=key,
            modifiers=list(modifiers) if modifiers else None,
            **common,
        )

    if action_type == "scroll":
        direction = data.get("direction") or data.get("scroll_direction")
        # Handle UFO Args format; consulted before the "down" default so
        # the default cannot mask an explicit argument.
        if not direction and args:
            direction = args[0]
        direction = direction or "down"
        if isinstance(direction, str):
            direction = direction.strip().lower()

        amount = data.get("amount", data.get("scroll_amount", 3))
        try:
            amount = int(amount)
        except (TypeError, ValueError):
            amount = 3  # fall back to the default step

        return ParsedAction(
            action_type="scroll",
            direction=direction,
            amount=amount,
            **common,
        )

    if action_type in ("done", "finish", "complete"):
        return ParsedAction(action_type="done", **{**common, "status": "FINISH"})

    if action_type in ("wait", "pause"):
        return ParsedAction(action_type="wait", **common)

    if action_type in ("fail", "error", "impossible"):
        return ParsedAction(action_type="fail", **{**common, "status": "ERROR"})

    return ParsedAction(
        action_type="unknown",
        parse_error=f"Unknown action type: {action_type}",
        **common,
    )
|
|
621
|
+
|
|
622
|
+
def _try_pyautogui_parse(self, response: str) -> ParsedAction:
    """Try to parse PyAutoGUI-style code.

    Recognizes ``click``, ``doubleClick``, ``write``, ``press``, ``hotkey``
    and ``scroll`` calls. When the response contains several calls, the one
    that appears first in the text is returned, so a multi-line snippet is
    interpreted in the order the model wrote it.

    Args:
        response: Model response text that may contain PyAutoGUI code.

    Returns:
        The parsed action, or an "unknown" action with ``parse_error`` set
        when no supported call is found.
    """
    number = r"(\d+(?:\.\d+)?)"  # pixel literal, integer or decimal
    quoted = r"[\'\"](.+?)[\'\"]"  # single- or double-quoted argument
    call_patterns = (
        ("click", rf"pyautogui\.click\s*\(\s*{number}\s*,\s*{number}\s*\)"),
        ("double_click", rf"pyautogui\.doubleClick\s*\(\s*{number}\s*,\s*{number}\s*\)"),
        ("write", rf"pyautogui\.write\s*\(\s*{quoted}\s*\)"),
        ("press", rf"pyautogui\.press\s*\(\s*{quoted}\s*\)"),
        ("hotkey", r"pyautogui\.hotkey\s*\(\s*(.+?)\s*\)"),
        ("scroll", r"pyautogui\.scroll\s*\(\s*(-?\d+)\s*\)"),
    )

    def to_number(literal: str) -> int | float:
        """Keep integer literals as int; only decimals become float."""
        return float(literal) if "." in literal else int(literal)

    # First occurrence of each call kind, ordered by position in the text.
    # The sort is stable, so equal positions keep the table's priority.
    found = []
    for kind, pattern in call_patterns:
        match = re.search(pattern, response, re.IGNORECASE)
        if match:
            found.append((match.start(), kind, match))
    found.sort(key=lambda item: item[0])

    for _, kind, match in found:
        if kind in ("click", "double_click"):
            x, y = self._normalize_coords(
                to_number(match.group(1)), to_number(match.group(2))
            )
            if kind == "double_click":
                return ParsedAction(
                    action_type="click",
                    x=x,
                    y=y,
                    metadata={"double_click": True},
                )
            return ParsedAction(action_type="click", x=x, y=y)

        if kind == "write":
            return ParsedAction(action_type="type", text=match.group(1))

        if kind == "press":
            return ParsedAction(action_type="key", key=match.group(1))

        if kind == "hotkey":
            # Extract keys from quotes; the last one is the key itself.
            keys = re.findall(r'[\'"]([^\'"]+)[\'"]', match.group(1))
            if not keys:
                continue  # no quoted keys: try the next call in the text
            return ParsedAction(
                action_type="key",
                key=keys[-1],
                modifiers=keys[:-1] if len(keys) > 1 else None,
            )

        if kind == "scroll":
            clicks = int(match.group(1))
            return ParsedAction(
                action_type="scroll",
                direction="up" if clicks > 0 else "down",
                amount=abs(clicks),
            )

    return ParsedAction(
        action_type="unknown", parse_error="No PyAutoGUI pattern matched"
    )
|
|
708
|
+
|
|
709
|
+
def _try_regex_parse(self, response: str) -> ParsedAction:
    """Scan ``response`` for a function-style action call.

    Patterns are searched case-insensitively anywhere in the text, in a
    fixed order, and the first one that hits decides the result:

    1. ``CLICK(.x, .y)`` / ``CLICK(0.x, 0.y)`` -> click, values used as-is.
    2. ``CLICK(x, y)`` with any other non-negative numbers -> click, values
       passed through ``self._normalize_coords``.
    3. ``CLICK([id])`` -> click on that element ID.
    4. ``CLICK(id)`` -> click on that element ID, but only when the number
       is below 1000; larger values fall through to the later patterns.
    5. ``TYPE("text")`` / ``TYPE('text')`` -> type action.
    6. ``KEY(key)`` / ``KEY(mod+key)`` -> key action (lower-cased).
    7. ``SCROLL(direction)`` / ``SCROLL(direction, amount)`` -> scroll.

    Args:
        response: Raw model output to search.

    Returns:
        The ParsedAction for the first matching pattern, or an ``unknown``
        action carrying a ``parse_error`` when nothing matches.
    """
    flags = re.IGNORECASE

    # 1. Fractional coordinates (optional leading zero, mandatory decimals).
    found = re.search(
        r"CLICK\s*\(\s*(0?\.\d+)\s*,\s*(0?\.\d+)\s*\)", response, flags
    )
    if found:
        frac_x, frac_y = (float(group) for group in found.groups())
        return ParsedAction(action_type="click", x=frac_x, y=frac_y)

    # 2. Any other numeric pair; _normalize_coords decides how to scale it.
    found = re.search(
        r"CLICK\s*\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)",
        response,
        flags,
    )
    if found:
        px, py = self._normalize_coords(
            float(found.group(1)), float(found.group(2))
        )
        return ParsedAction(action_type="click", x=px, y=py)

    # 3. Bracketed element reference.
    found = re.search(r"CLICK\s*\(\s*\[\s*(\d+)\s*\]\s*\)", response, flags)
    if found:
        return ParsedAction(
            action_type="click", element_id=int(found.group(1))
        )

    # 4. Bare integer: treated as an element ID only when it is small.
    found = re.search(r"CLICK\s*\(\s*(\d+)\s*\)", response, flags)
    if found:
        bare_id = int(found.group(1))
        if bare_id < 1000:
            return ParsedAction(action_type="click", element_id=bare_id)
        # 1000 or more: deliberately keep trying the remaining patterns.

    # 5. Quoted text to type.
    found = re.search(r'TYPE\s*\(\s*["\'](.+?)["\']\s*\)', response, flags)
    if found:
        return ParsedAction(action_type="type", text=found.group(1))

    # 6. Single key or a "+"-joined combination.
    found = re.search(r"KEY\s*\(\s*([a-zA-Z0-9_+]+)\s*\)", response, flags)
    if found:
        combo = found.group(1).lower()
        if "+" not in combo:
            return ParsedAction(action_type="key", key=combo)
        # The last segment is the key; everything before it is a modifier.
        *held, pressed = combo.split("+")
        return ParsedAction(action_type="key", key=pressed, modifiers=held)

    # 7. Scroll with an optional amount (3 when the amount is omitted).
    found = re.search(
        r"SCROLL\s*\(\s*([a-zA-Z]+)(?:\s*,\s*(\d+))?\s*\)", response, flags
    )
    if found:
        raw_amount = found.group(2)
        return ParsedAction(
            action_type="scroll",
            direction=found.group(1).lower(),
            amount=int(raw_amount) if raw_amount else 3,
        )

    return ParsedAction(
        action_type="unknown", parse_error="No regex pattern matched"
    )
|
|
800
|
+
|
|
801
|
+
def _try_keyword_parse(self, response: str) -> ParsedAction:
    """Recognize terminal keywords (DONE / WAIT / FAIL) and completion phrases.

    A keyword counts when the whole response, upper-cased and stripped, is
    exactly that keyword, or when the response ends with the keyword written
    as an empty call such as ``DONE()`` (case-insensitive, trailing
    whitespace allowed). Keywords are checked in the order DONE, WAIT, FAIL.

    If no keyword applies, a phrase like "task is complete" / "task done" /
    "task finished" anywhere in the response yields an inferred ``done``
    action with confidence 0.7.

    Args:
        response: Raw model output to inspect.

    Returns:
        The matching ParsedAction, or an ``unknown`` action carrying a
        ``parse_error`` when nothing is recognized.
    """
    bare = response.upper().strip()

    # (call-form pattern, bare keyword, resulting action type)
    terminals = (
        (r"\bDONE\s*\(\s*\)\s*$", "DONE", "done"),
        (r"\bWAIT\s*\(\s*\)\s*$", "WAIT", "wait"),
        (r"\bFAIL\s*\(\s*\)\s*$", "FAIL", "fail"),
    )
    for call_pattern, keyword, action_type in terminals:
        if bare == keyword or re.search(call_pattern, response, re.IGNORECASE):
            return ParsedAction(action_type=action_type)

    # Natural-language completion statement: lower confidence, flagged as inferred.
    completion_phrase = re.search(
        r"task\s+(?:is\s+)?(?:complete|done|finished)", response, re.IGNORECASE
    )
    if completion_phrase:
        return ParsedAction(
            action_type="done",
            confidence=0.7,
            metadata={"inferred": True},
        )

    return ParsedAction(action_type="unknown", parse_error="No keyword matched")
|
|
837
|
+
|
|
838
|
+
def _normalize_coords(self, x: float, y: float) -> tuple[float, float]:
|
|
839
|
+
"""Normalize coordinates to 0-1 range if needed."""
|
|
840
|
+
if not self.normalize_coordinates:
|
|
841
|
+
return (x, y)
|
|
842
|
+
|
|
843
|
+
# If coordinates are large, assume they're pixels
|
|
844
|
+
if x > 1.5 or y > 1.5:
|
|
845
|
+
x = x / self._screen_width
|
|
846
|
+
y = y / self._screen_height
|
|
847
|
+
|
|
848
|
+
# Clamp to valid range
|
|
849
|
+
x = max(0.0, min(1.0, x))
|
|
850
|
+
y = max(0.0, min(1.0, y))
|
|
851
|
+
|
|
852
|
+
return (x, y)
|
|
853
|
+
|
|
854
|
+
def _normalize_element_id(self, element_id: Any) -> int | None:
|
|
855
|
+
"""Normalize element_id to integer format."""
|
|
856
|
+
if element_id is None:
|
|
857
|
+
return None
|
|
858
|
+
|
|
859
|
+
if isinstance(element_id, int):
|
|
860
|
+
return element_id
|
|
861
|
+
|
|
862
|
+
if isinstance(element_id, str):
|
|
863
|
+
# Extract number from "e17", "[17]", "element_17" etc.
|
|
864
|
+
match = re.search(r"\d+", element_id)
|
|
865
|
+
if match:
|
|
866
|
+
return int(match.group())
|
|
867
|
+
|
|
868
|
+
try:
|
|
869
|
+
return int(element_id)
|
|
870
|
+
except (ValueError, TypeError):
|
|
871
|
+
return None
|
|
872
|
+
|
|
873
|
+
def resolve_element_id(
    self,
    action: ParsedAction,
    registry: ElementRegistry | None = None,
) -> ParsedAction:
    """Convert element_id to coordinates if registry available.

    The action is returned unchanged when it has no element ID, already
    has coordinates, no registry is available, or the registry has no
    center coordinates for the element.

    Args:
        action: ParsedAction with element_id.
        registry: Element registry (uses self.element_registry if None).

    Returns:
        ParsedAction with coordinates added if conversion succeeded,
        original action otherwise.
    """
    if not action.has_element_id or action.has_coordinates:
        return action

    # Fall back to the instance registry only when no registry was passed.
    # An explicit `is not None` test is used instead of `registry or ...`
    # so that a caller-supplied registry that evaluates falsy (e.g. an
    # empty container) is still honoured rather than silently replaced.
    reg = registry if registry is not None else self.element_registry
    if reg is None:
        return action

    coords = reg.get_center_coords(action.element_id, normalize=True)
    if coords is not None:
        return action.with_coordinates(
            x=coords[0],
            y=coords[1],
            # Record which element the coordinates were derived from.
            source=f"element_{action.element_id}",
        )

    return action
|
|
904
|
+
|
|
905
|
+
def parse_and_resolve(
    self,
    response: str,
    registry: ElementRegistry | None = None,
) -> ParsedAction:
    """Parse a response, then resolve its element ID in one call.

    Shorthand for feeding the result of ``parse()`` straight into
    ``resolve_element_id()``.

    Args:
        response: Raw model response.
        registry: Optional element registry, forwarded unchanged to
            ``resolve_element_id()``.

    Returns:
        Whatever ``resolve_element_id()`` returns for the parsed action.
    """
    return self.resolve_element_id(self.parse(response), registry)
|