openadapt-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/__init__.py +0 -0
- openadapt_ml/benchmarks/__init__.py +125 -0
- openadapt_ml/benchmarks/agent.py +825 -0
- openadapt_ml/benchmarks/azure.py +761 -0
- openadapt_ml/benchmarks/base.py +366 -0
- openadapt_ml/benchmarks/cli.py +884 -0
- openadapt_ml/benchmarks/data_collection.py +432 -0
- openadapt_ml/benchmarks/runner.py +381 -0
- openadapt_ml/benchmarks/waa.py +704 -0
- openadapt_ml/cloud/__init__.py +5 -0
- openadapt_ml/cloud/azure_inference.py +441 -0
- openadapt_ml/cloud/lambda_labs.py +2445 -0
- openadapt_ml/cloud/local.py +790 -0
- openadapt_ml/config.py +56 -0
- openadapt_ml/datasets/__init__.py +0 -0
- openadapt_ml/datasets/next_action.py +507 -0
- openadapt_ml/evals/__init__.py +23 -0
- openadapt_ml/evals/grounding.py +241 -0
- openadapt_ml/evals/plot_eval_metrics.py +174 -0
- openadapt_ml/evals/trajectory_matching.py +486 -0
- openadapt_ml/grounding/__init__.py +45 -0
- openadapt_ml/grounding/base.py +236 -0
- openadapt_ml/grounding/detector.py +570 -0
- openadapt_ml/ingest/__init__.py +43 -0
- openadapt_ml/ingest/capture.py +312 -0
- openadapt_ml/ingest/loader.py +232 -0
- openadapt_ml/ingest/synthetic.py +1102 -0
- openadapt_ml/models/__init__.py +0 -0
- openadapt_ml/models/api_adapter.py +171 -0
- openadapt_ml/models/base_adapter.py +59 -0
- openadapt_ml/models/dummy_adapter.py +42 -0
- openadapt_ml/models/qwen_vl.py +426 -0
- openadapt_ml/runtime/__init__.py +0 -0
- openadapt_ml/runtime/policy.py +182 -0
- openadapt_ml/schemas/__init__.py +53 -0
- openadapt_ml/schemas/sessions.py +122 -0
- openadapt_ml/schemas/validation.py +252 -0
- openadapt_ml/scripts/__init__.py +0 -0
- openadapt_ml/scripts/compare.py +1490 -0
- openadapt_ml/scripts/demo_policy.py +62 -0
- openadapt_ml/scripts/eval_policy.py +287 -0
- openadapt_ml/scripts/make_gif.py +153 -0
- openadapt_ml/scripts/prepare_synthetic.py +43 -0
- openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
- openadapt_ml/scripts/train.py +174 -0
- openadapt_ml/training/__init__.py +0 -0
- openadapt_ml/training/benchmark_viewer.py +1538 -0
- openadapt_ml/training/shared_ui.py +157 -0
- openadapt_ml/training/stub_provider.py +276 -0
- openadapt_ml/training/trainer.py +2446 -0
- openadapt_ml/training/viewer.py +2970 -0
- openadapt_ml-0.1.0.dist-info/METADATA +818 -0
- openadapt_ml-0.1.0.dist-info/RECORD +55 -0
- openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
- openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1102 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import random
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import List, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
from PIL import Image, ImageDraw, ImageFont
|
|
10
|
+
|
|
11
|
+
from openadapt_ml.schemas.sessions import Action, Episode, Observation, Session, Step
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Size, in pixels, of every synthetic screenshot rendered by this module.
# Normalized coordinates elsewhere in the file are divided by these values.
IMG_WIDTH, IMG_HEIGHT = 800, 600
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _load_font(size: int = 16) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:  # type: ignore[name-defined]
    """Load a font at the requested size, degrading gracefully.

    Arial is tried first. When it cannot be opened (``OSError``), Pillow's
    built-in default font is used instead, and the requested ``size`` is passed
    along where the installed Pillow supports it, so that callers asking for
    different sizes (e.g. body text vs. title) still get different fonts.

    Args:
        size: Desired font size in points.

    Returns:
        A Pillow font object usable with ``ImageDraw`` text calls.
    """
    try:
        return ImageFont.truetype("arial.ttf", size)
    except OSError:
        # Arial is not installed on this machine; fall through to the default.
        pass

    try:
        # Newer Pillow releases accept a size for the bundled default font.
        return ImageFont.load_default(size=size)
    except TypeError:
        # Older Pillow: load_default() takes no arguments and returns a
        # fixed-size bitmap font, which is the best that is available.
        return ImageFont.load_default()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Module-wide fonts, loaded once at import time: FONT for labels, field
# contents and button captions; FONT_TITLE for screen headings.
FONT, FONT_TITLE = _load_font(16), _load_font(24)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _normalize(x_px: int, y_px: int) -> Tuple[float, float]:
    """Scale a pixel position to fractions of the image width and height.

    Args:
        x_px: Horizontal pixel coordinate.
        y_px: Vertical pixel coordinate.

    Returns:
        ``(x_px / IMG_WIDTH, y_px / IMG_HEIGHT)``.
    """
    x_fraction = x_px / IMG_WIDTH
    y_fraction = y_px / IMG_HEIGHT
    return x_fraction, y_fraction
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _text_size(draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont) -> Tuple[int, int]:
|
|
36
|
+
"""Compute text width/height using textbbox for Pillow compatibility."""
|
|
37
|
+
|
|
38
|
+
left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
|
|
39
|
+
return right - left, bottom - top
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class LoginUIElements:
    """Pixel-space geometry of the clickable parts of the login screen.

    Every field is an ``(x, y, w, h)`` tuple: the top-left corner followed by
    width and height, all in absolute pixels.
    """

    # Text box the username is typed into.
    username_box: Tuple[int, int, int, int]
    # Text box the (masked) password is typed into.
    password_box: Tuple[int, int, int, int]
    # The "Login" submit button.
    login_button: Tuple[int, int, int, int]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _compute_login_layout(max_offset: int = 10, jitter: bool = True) -> LoginUIElements:
    """Build the pixel layout of the login form, optionally randomly shifted.

    The layout is computed once so the same geometry can be reused for every
    frame of an episode.

    Args:
        max_offset: Largest shift, in pixels, applied independently to the x
            and y position of each element when ``jitter`` is true.
        jitter: When false, the base positions are used without random shifts.

    Returns:
        The ``(x, y, w, h)`` bounds of the username box, password box and
        login button.
    """
    left_margin = 200
    username_label_top = 160
    field_w, field_h = 360, 40
    button_w, button_h = 140, 45

    def _clamp(value: int, lowest: int, highest: int) -> int:
        return max(lowest, min(highest, value))

    def _perturb(px: int, py: int) -> tuple[int, int]:
        if jitter:
            shift_x = random.randint(-max_offset, max_offset)
            shift_y = random.randint(-max_offset, max_offset)
            # Keep the shifted point on the canvas.
            return _clamp(px + shift_x, 0, IMG_WIDTH), _clamp(py + shift_y, 0, IMG_HEIGHT)
        return px, py

    # Username box: sits 24px below its label, kept clear of the image edges.
    user_x, user_y = _perturb(left_margin, username_label_top + 24)
    user_x = _clamp(user_x, 20, IMG_WIDTH - field_w - 20)
    user_y = _clamp(user_y, username_label_top + 10, IMG_HEIGHT - field_h - 100)

    # Password box: positioned relative to wherever the username box ended up.
    password_label_top = user_y + field_h + 30
    pass_x, pass_y = _perturb(left_margin, password_label_top + 24)
    pass_x = _clamp(pass_x, 20, IMG_WIDTH - field_w - 20)
    pass_y = _clamp(pass_y, password_label_top + 10, IMG_HEIGHT - field_h - 80)

    # Login button: horizontally centred, below the password box.
    button_x, button_y = _perturb((IMG_WIDTH - button_w) // 2, pass_y + field_h + 50)
    button_x = _clamp(button_x, 20, IMG_WIDTH - button_w - 20)
    button_y = _clamp(button_y, pass_y + field_h + 20, IMG_HEIGHT - button_h - 40)

    return LoginUIElements(
        username_box=(user_x, user_y, field_w, field_h),
        password_box=(pass_x, pass_y, field_w, field_h),
        login_button=(button_x, button_y, button_w, button_h),
    )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _draw_login_screen(
    username: str = "",
    password: str = "",
    layout: Optional[LoginUIElements] = None,
    jitter: bool = True,
) -> tuple[Image.Image, LoginUIElements]:
    """Render the login screen, including a decoy "Help" button.

    Args:
        username: Text shown inside the username box (nothing drawn if empty).
        password: Password whose length determines how many ``*`` characters
            are shown inside the password box (nothing drawn if empty).
        layout: Pre-computed element bounds. When ``None`` a layout is sampled
            with ``_compute_login_layout(jitter=jitter)``.
        jitter: Only used when ``layout`` is ``None``; forwarded to the layout
            sampler.

    Returns:
        The rendered image and the ``(x, y, w, h)`` pixel bounds of the
        username box, password box and login button that were drawn.
    """

    img = Image.new("RGB", (IMG_WIDTH, IMG_HEIGHT), color=(230, 230, 230))
    draw = ImageDraw.Draw(img)

    # Title, horizontally centred.
    title_text = "Welcome Back!"
    tw, _ = _text_size(draw, title_text, FONT_TITLE)
    draw.text(((IMG_WIDTH - tw) // 2, 80), title_text, fill="black", font=FONT_TITLE)

    # Determine layout once; if not provided, sample it (optionally with jitter).
    if layout is None:
        layout = _compute_login_layout(jitter=jitter)

    label_x = 200
    uname_label_y = 160

    # Username label and box. The rectangle uses the width/height stored in the
    # layout so that what is drawn always matches the bounds returned to the
    # caller (and used for click targets / bbox evaluation).
    draw.text((label_x, uname_label_y), "Username:", fill="black", font=FONT)
    uname_x, uname_y, uname_w, uname_h = layout.username_box
    draw.rectangle(
        [(uname_x, uname_y), (uname_x + uname_w, uname_y + uname_h)],
        outline="black",
        fill="white",
    )
    if username:
        draw.text((uname_x + 8, uname_y + 10), username, fill="black", font=FONT)

    # Password label (24px above its box) and box, again sized from the layout.
    pw_x, pw_y, pw_w, pw_h = layout.password_box
    draw.text((label_x, pw_y - 24), "Password:", fill="black", font=FONT)
    draw.rectangle(
        [(pw_x, pw_y), (pw_x + pw_w, pw_y + pw_h)],
        outline="black",
        fill="white",
    )
    if password:
        # Never render the real password; show one asterisk per character.
        masked = "*" * len(password)
        draw.text((pw_x + 8, pw_y + 10), masked, fill="black", font=FONT)

    # Login button with a centred caption.
    btn_x, btn_y, btn_w, btn_h = layout.login_button
    draw.rectangle(
        [(btn_x, btn_y), (btn_x + btn_w, btn_y + btn_h)],
        outline="black",
        fill="green",
    )
    btn_text = "Login"
    btw, bth = _text_size(draw, btn_text, FONT)
    draw.text(
        (btn_x + (btn_w - btw) // 2, btn_y + (btn_h - bth) // 2),
        btn_text,
        fill="white",
        font=FONT,
    )

    # Decoy clickable button (e.g., Help) near the right edge, level with the
    # login button. It is intentionally not part of the returned elements.
    decoy_w, decoy_h = 110, 35
    decoy_x = IMG_WIDTH - decoy_w - 40
    decoy_y = btn_y
    draw.rectangle(
        [(decoy_x, decoy_y), (decoy_x + decoy_w, decoy_y + decoy_h)],
        outline="black",
        fill=(180, 180, 180),
    )
    decoy_text = "Help"
    dtw, dth = _text_size(draw, decoy_text, FONT)
    draw.text(
        (decoy_x + (decoy_w - dtw) // 2, decoy_y + (decoy_h - dth) // 2),
        decoy_text,
        fill="black",
        font=FONT,
    )

    elements = LoginUIElements(
        username_box=layout.username_box,
        password_box=layout.password_box,
        login_button=(btn_x, btn_y, btn_w, btn_h),
    )

    return img, elements
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _overlay_som_marks(
    img: Image.Image,
    elements: List[Tuple[int, Tuple[int, int, int, int]]],
) -> Image.Image:
    """Overlay Set-of-Marks numbered labels on interactive elements.

    Each label is a black box containing white ``[n]`` text. It is placed just
    above the element's top-left corner; when there is no room above, it is
    moved to just below the element's top edge instead.

    Args:
        img: The base screenshot image. It is not modified.
        elements: List of (index, (x, y, w, h)) tuples for each interactive
            element. Index is the number shown in the label.

    Returns:
        A copy of the image with [1], [2], [3], etc. labels overlaid.
    """
    img = img.copy()
    draw = ImageDraw.Draw(img)

    # Reuse the module's font loader so SoM labels get the same fallback
    # behaviour as every other piece of text in the file.
    som_font = _load_font(14)
    padding = 4

    for idx, bounds in elements:
        x, y, _w, _h = bounds
        label = f"[{idx}]"

        # textbbox reports where the glyphs land relative to the anchor; the
        # left/top values are generally non-zero and must be compensated for
        # when positioning the text inside the background box.
        left, top, right, bottom = draw.textbbox((0, 0), label, font=som_font)
        box_width = (right - left) + padding * 2
        box_height = (bottom - top) + padding * 2

        # Preferred position: above and slightly left of the element so the
        # label does not obscure the element's content.
        box_x = x - 4
        box_y = y - box_height - 2

        # Keep the box on the canvas at the top and left edges.
        if box_y < 0:
            # No room above: fall back to just inside the element's top edge.
            box_y = y + 4
        if box_x < 0:
            box_x = 4

        # Black rectangle background (SoM paper style).
        draw.rectangle(
            [box_x, box_y, box_x + box_width, box_y + box_height],
            fill="black",
        )

        # White text with equal padding on every side of the glyph box.
        draw.text(
            (box_x + padding - left, box_y + padding - top),
            label,
            fill="white",
            font=som_font,
        )

    return img
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# Set-of-Marks label numbers for the login screen's interactive elements.
# Numbering starts at 1 so the labels read naturally to humans.
SOM_USERNAME_FIELD, SOM_PASSWORD_FIELD, SOM_LOGIN_BUTTON = 1, 2, 3
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _draw_logged_in_screen(username: str) -> Image.Image:
    """Render the post-login confirmation frame.

    Args:
        username: Name inserted into the "Welcome, ...!" greeting.

    Returns:
        A new image with the greeting centred using its measured text size.
    """
    canvas = Image.new("RGB", (IMG_WIDTH, IMG_HEIGHT), color=(210, 230, 210))
    pen = ImageDraw.Draw(canvas)

    greeting = f"Welcome, {username}!"
    text_w, text_h = _text_size(pen, greeting, FONT_TITLE)
    origin = ((IMG_WIDTH - text_w) // 2, (IMG_HEIGHT - text_h) // 2)
    pen.text(origin, greeting, fill="darkgreen", font=FONT_TITLE)
    return canvas
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _save_image(img: Image.Image, path: Path) -> None:
|
|
305
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
306
|
+
img.save(path)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _center(bounds: Tuple[int, int, int, int]) -> Tuple[float, float]:
    """Return the normalized centre point of an ``(x, y, w, h)`` pixel box.

    The centre is computed in whole pixels (half-sizes are floored) and then
    passed through ``_normalize``.
    """
    left, top, width, height = bounds
    return _normalize(left + width // 2, top + height // 2)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _bbox_normalized(bounds: Tuple[int, int, int, int]) -> Tuple[float, float, float, float]:
    """Turn an ``(x, y, w, h)`` pixel box into a normalized corner box.

    Returns:
        ``(x_min, y_min, x_max, y_max)`` where x values are divided by
        ``IMG_WIDTH`` and y values by ``IMG_HEIGHT``.
    """
    left, top, width, height = bounds
    return (
        left / IMG_WIDTH,
        top / IMG_HEIGHT,
        (left + width) / IMG_WIDTH,
        (top + height) / IMG_HEIGHT,
    )
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _script_login_episode(
    root: Path,
    episode_id: str,
    username: str,
    password: str,
    jitter: bool = True,
) -> Episode:
    """Generate the six-step scripted login episode and its screenshots.

    Frames are written to ``root`` as ``{episode_id}_step_{n}.png``:

    - Step 0: blank login screen → click username field.
    - Step 1: username field focused → type username.
    - Step 2: username typed → click password field.
    - Step 3: password field focused → type password.
    - Step 4: password typed → click login button.
    - Step 5: logged-in screen → DONE.

    Click actions carry the normalized centre of the target plus its
    normalized bounding box, to support bbox-based click hit evaluation.

    Args:
        root: Directory the step images are saved into.
        episode_id: Episode identifier; also the image filename prefix.
        username: Username typed during the episode.
        password: Password typed during the episode.
        jitter: Whether the (single, episode-wide) layout is randomly shifted.

    Returns:
        The assembled ``Episode``.
    """
    # One layout for the whole episode so every frame shares the same geometry.
    layout = _compute_login_layout(jitter=jitter)

    # One row per login-form frame:
    # (username shown, password shown, click target or None, typed text, thought)
    script = [
        ("", "", layout.username_box, None, "Focus the username field."),
        ("", "", None, username, "Type the username."),
        (username, "", layout.password_box, None, "Focus the password field."),
        (username, "", None, password, "Type the password."),
        (username, password, layout.login_button, None, "Submit the login form."),
    ]

    steps: List[Step] = []
    for index, (shown_user, shown_pw, target, typed, thought) in enumerate(script):
        frame, _ = _draw_login_screen(username=shown_user, password=shown_pw, layout=layout, jitter=False)
        frame_path = root / f"{episode_id}_step_{index}.png"
        _save_image(frame, frame_path)

        if target is None:
            action = Action(type="type", text=typed)
        else:
            click_x, click_y = _center(target)
            action = Action(type="click", x=click_x, y=click_y, bbox=_bbox_normalized(target))

        steps.append(
            Step(
                t=float(index),
                observation=Observation(image_path=str(frame_path)),
                action=action,
                thought=thought,
            )
        )

    # Final frame: the logged-in confirmation screen, closed out with DONE.
    done_index = len(script)
    done_path = root / f"{episode_id}_step_{done_index}.png"
    _save_image(_draw_logged_in_screen(username=username), done_path)
    steps.append(
        Step(
            t=float(done_index),
            observation=Observation(image_path=str(done_path)),
            action=Action(type="done"),
            thought="Login successful; workflow complete.",
        )
    )

    return Episode(
        id=episode_id,
        goal=f"Log in with username '{username}' and password '{password}'",
        steps=steps,
        summary="Successful login via username and password.",
        success=True,
        workflow_id="login_basic",
    )
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def _script_login_episode_som(
    root: Path,
    episode_id: str,
    username: str,
    password: str,
    jitter: bool = True,
) -> Episode:
    """Generate the scripted login episode with Set-of-Marks (SoM) overlays.

    Login-form frames are saved with numbered labels [1], [2], [3] drawn on
    the interactive elements, and every click/type action records the
    ``element_index`` of the element it refers to (click actions also keep
    their normalized coordinates and bounding box).

    Steps (6 total), saved to ``root`` as ``{episode_id}_step_{n}.png``:

    - Step 0: SoM login screen → click element [1] (username field)
    - Step 1: username field focused → type username
    - Step 2: username typed → click element [2] (password field)
    - Step 3: password field focused → type password
    - Step 4: password typed → click element [3] (login button)
    - Step 5: logged-in screen → DONE (no overlay)

    Args:
        root: Directory the step images are saved into.
        episode_id: Episode identifier; also the image filename prefix.
        username: Username typed during the episode.
        password: Password typed during the episode.
        jitter: Whether the (single, episode-wide) layout is randomly shifted.

    Returns:
        The assembled ``Episode``.
    """
    # One layout for the whole episode so every frame shares the same geometry.
    layout = _compute_login_layout(jitter=jitter)

    # Label number → element bounds, as consumed by the overlay helper.
    marks = [
        (SOM_USERNAME_FIELD, layout.username_box),
        (SOM_PASSWORD_FIELD, layout.password_box),
        (SOM_LOGIN_BUTTON, layout.login_button),
    ]

    # One row per login-form frame:
    # (username shown, password shown, click target or None, typed text,
    #  element index, thought)
    script = [
        ("", "", layout.username_box, None, SOM_USERNAME_FIELD,
         "Focus the username field by clicking element [1]."),
        ("", "", None, username, SOM_USERNAME_FIELD,
         "Type the username into element [1]."),
        (username, "", layout.password_box, None, SOM_PASSWORD_FIELD,
         "Focus the password field by clicking element [2]."),
        (username, "", None, password, SOM_PASSWORD_FIELD,
         "Type the password into element [2]."),
        (username, password, layout.login_button, None, SOM_LOGIN_BUTTON,
         "Submit the login form by clicking element [3]."),
    ]

    steps: List[Step] = []
    for index, (shown_user, shown_pw, target, typed, mark, thought) in enumerate(script):
        frame, _ = _draw_login_screen(username=shown_user, password=shown_pw, layout=layout, jitter=False)
        marked_frame = _overlay_som_marks(frame, marks)
        frame_path = root / f"{episode_id}_step_{index}.png"
        _save_image(marked_frame, frame_path)

        if target is None:
            action = Action(type="type", text=typed, element_index=mark)
        else:
            click_x, click_y = _center(target)
            action = Action(
                type="click",
                x=click_x,
                y=click_y,
                bbox=_bbox_normalized(target),
                element_index=mark,
            )

        steps.append(
            Step(
                t=float(index),
                observation=Observation(image_path=str(frame_path)),
                action=action,
                thought=thought,
            )
        )

    # Final frame: the logged-in screen has nothing to click, so no overlay.
    done_index = len(script)
    done_path = root / f"{episode_id}_step_{done_index}.png"
    _save_image(_draw_logged_in_screen(username=username), done_path)
    steps.append(
        Step(
            t=float(done_index),
            observation=Observation(image_path=str(done_path)),
            action=Action(type="done"),
            thought="Login successful; workflow complete.",
        )
    )

    return Episode(
        id=episode_id,
        goal=f"Log in with username '{username}' and password '{password}'",
        steps=steps,
        summary="Successful login via username and password (SoM mode).",
        success=True,
        workflow_id="login_basic_som",
    )
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
@dataclass
class RegistrationUIElements:
    """Pixel-space geometry of the clickable parts of the registration form.

    Every field is an ``(x, y, w, h)`` tuple: the top-left corner followed by
    width and height, all in absolute pixels.
    """

    # Text boxes, in the order they appear on the form.
    first_name_box: Tuple[int, int, int, int]
    last_name_box: Tuple[int, int, int, int]
    email_box: Tuple[int, int, int, int]
    password_box: Tuple[int, int, int, int]
    confirm_password_box: Tuple[int, int, int, int]
    # The "Register" submit button.
    register_button: Tuple[int, int, int, int]
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
# Set-of-Marks label numbers for the registration screen's interactive
# elements, numbered from 1 in top-to-bottom form order.
(
    SOM_FIRST_NAME_FIELD,
    SOM_LAST_NAME_FIELD,
    SOM_EMAIL_FIELD,
    SOM_REG_PASSWORD_FIELD,
    SOM_CONFIRM_PASSWORD_FIELD,
    SOM_REGISTER_BUTTON,
) = (1, 2, 3, 4, 5, 6)
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> RegistrationUIElements:
    """Build the pixel layout of the registration form, optionally shifted.

    Args:
        max_offset: Largest shift, in pixels, applied independently to the x
            and y position of each element when ``jitter`` is true.
        jitter: When false, the base positions are used unchanged.

    Returns:
        The ``(x, y, w, h)`` bounds of the five text boxes and the register
        button.
    """
    left_margin = 180
    field_w, field_h = 400, 36
    first_label_top = 100
    row_pitch = 70
    button_w, button_h = 160, 45

    def _place(px: int, py: int) -> tuple[int, int]:
        if not jitter:
            return px, py
        shift_x = random.randint(-max_offset, max_offset)
        shift_y = random.randint(-max_offset, max_offset)
        # Shifted positions are clamped to stay away from the image edges.
        clamped_x = max(20, min(IMG_WIDTH - field_w - 20, px + shift_x))
        clamped_y = max(20, min(IMG_HEIGHT - 60, py + shift_y))
        return clamped_x, clamped_y

    # Five stacked text boxes (first name, last name, email, password,
    # confirm password), each 24px below its row's label position.
    field_boxes = []
    for row in range(5):
        box_x, box_y = _place(left_margin, first_label_top + row * row_pitch + 24)
        field_boxes.append((box_x, box_y, field_w, field_h))
    first_box, last_box, mail_box, pw_box, confirm_box = field_boxes

    # Register button: horizontally centred below the last field.
    button_x, button_y = _place((IMG_WIDTH - button_w) // 2, first_label_top + 5 * row_pitch + 40)

    return RegistrationUIElements(
        first_name_box=first_box,
        last_name_box=last_box,
        email_box=mail_box,
        password_box=pw_box,
        confirm_password_box=confirm_box,
        register_button=(button_x, button_y, button_w, button_h),
    )
|
|
691
|
+
|
|
692
|
+
|
|
693
|
+
def _draw_registration_screen(
    first_name: str = "",
    last_name: str = "",
    email: str = "",
    password: str = "",
    confirm_password: str = "",
    layout: Optional[RegistrationUIElements] = None,
    jitter: bool = True,
) -> tuple[Image.Image, RegistrationUIElements]:
    """Render the registration form, including a decoy "Clear" button.

    Args:
        first_name: Text shown in the first-name box (nothing if empty).
        last_name: Text shown in the last-name box (nothing if empty).
        email: Text shown in the email box (nothing if empty).
        password: Shown as one ``*`` per character in the password box.
        confirm_password: Shown as one ``*`` per character in the confirm box.
        layout: Pre-computed element bounds. When ``None`` a layout is sampled
            with ``_compute_registration_layout(jitter=jitter)``.
        jitter: Only used when ``layout`` is ``None``.

    Returns:
        The rendered image and the layout that was used to draw it.
    """
    canvas = Image.new("RGB", (IMG_WIDTH, IMG_HEIGHT), color=(235, 240, 245))
    pen = ImageDraw.Draw(canvas)

    # Heading, horizontally centred.
    heading = "Create Account"
    heading_w, _ = _text_size(pen, heading, FONT_TITLE)
    pen.text(((IMG_WIDTH - heading_w) // 2, 40), heading, fill="darkblue", font=FONT_TITLE)

    if layout is None:
        layout = _compute_registration_layout(jitter=jitter)

    label_left = 180
    first_label_top = 100
    row_pitch = 70

    # (caption, box bounds, current value, mask the value?)
    rows = (
        ("First Name:", layout.first_name_box, first_name, False),
        ("Last Name:", layout.last_name_box, last_name, False),
        ("Email:", layout.email_box, email, False),
        ("Password:", layout.password_box, password, True),
        ("Confirm Password:", layout.confirm_password_box, confirm_password, True),
    )
    for row, (caption, (left, top, width, height), content, masked) in enumerate(rows):
        # Captions sit on a fixed grid; only the boxes follow the layout.
        pen.text((label_left, first_label_top + row * row_pitch), caption, fill="black", font=FONT)
        pen.rectangle([(left, top), (left + width, top + height)], outline="black", fill="white")
        if not content:
            continue
        shown = "*" * len(content) if masked else content
        pen.text((left + 8, top + 8), shown, fill="black", font=FONT)

    # Register button with a centred caption.
    reg_x, reg_y, reg_w, reg_h = layout.register_button
    pen.rectangle([(reg_x, reg_y), (reg_x + reg_w, reg_y + reg_h)], outline="black", fill="darkblue")
    reg_caption = "Register"
    cap_w, cap_h = _text_size(pen, reg_caption, FONT)
    pen.text((reg_x + (reg_w - cap_w) // 2, reg_y + (reg_h - cap_h) // 2), reg_caption, fill="white", font=FONT)

    # Decoy "Clear" button near the right edge, roughly level with Register.
    decoy_w, decoy_h = 100, 35
    decoy_x = IMG_WIDTH - decoy_w - 30
    decoy_y = reg_y + 5
    pen.rectangle([(decoy_x, decoy_y), (decoy_x + decoy_w, decoy_y + decoy_h)], outline="gray", fill=(200, 200, 200))
    decoy_caption = "Clear"
    decoy_cap_w, decoy_cap_h = _text_size(pen, decoy_caption, FONT)
    pen.text(
        (decoy_x + (decoy_w - decoy_cap_w) // 2, decoy_y + (decoy_h - decoy_cap_h) // 2),
        decoy_caption,
        fill="gray",
        font=FONT,
    )

    return canvas, layout
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
def _draw_registration_success_screen(first_name: str, email: str) -> Image.Image:
    """Render the post-registration confirmation screen.

    Draws two horizontally centred lines on a pale green canvas: a welcome
    title containing ``first_name`` and, below it, a note that a confirmation
    was sent to ``email``.

    Returns:
        The rendered image.
    """
    canvas = Image.new("RGB", (IMG_WIDTH, IMG_HEIGHT), color=(210, 235, 210))
    pen = ImageDraw.Draw(canvas)
    mid_y = IMG_HEIGHT // 2

    # (message, font, fill colour, vertical offset from the canvas midline)
    lines = (
        (f"Welcome, {first_name}!", FONT_TITLE, "darkgreen", -40),
        (f"Confirmation sent to {email}", FONT, "gray", 20),
    )
    for message, font, colour, dy in lines:
        # Only the measured width is needed to centre the text horizontally.
        width, _ = _text_size(pen, message, font)
        pen.text(((IMG_WIDTH - width) // 2, mid_y + dy), message, fill=colour, font=font)

    return canvas
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
def _script_registration_episode(
    root: Path,
    episode_id: str,
    first_name: str,
    last_name: str,
    email: str,
    password: str,
    jitter: bool = True,
) -> Episode:
    """Build the scripted 12-step registration episode (plain screenshots).

    Step layout:
        - 0-9: for each field in the order first name, last name, email,
          password, confirm password: a click on the field followed by a
          type action (the confirm-password field is typed with the same
          ``password`` value)
        - 10: click on the register button
        - 11: DONE, observed on the success screen

    The layout is computed once (honouring ``jitter``) and reused for every
    frame. Each form frame shows only the values typed in *earlier* steps and
    is saved as ``{episode_id}_step_{k}.png`` under ``root``.
    """
    layout = _compute_registration_layout(jitter=jitter)

    # (field key, pixel box, text to type, SoM element index)
    plan = [
        ("first_name", layout.first_name_box, first_name, SOM_FIRST_NAME_FIELD),
        ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
        ("email", layout.email_box, email, SOM_EMAIL_FIELD),
        ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
        ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
    ]

    # What has been typed into each field so far; keys match the keyword
    # arguments of _draw_registration_screen.
    typed = {
        "first_name": "",
        "last_name": "",
        "email": "",
        "password": "",
        "confirm_password": "",
    }
    recorded: List[Step] = []

    def _observe_form() -> Observation:
        """Render the form in its current state and save it as the next frame."""
        frame, _ = _draw_registration_screen(**typed, layout=layout, jitter=False)
        # The next step index always equals the number of steps recorded so far.
        frame_path = root / f"{episode_id}_step_{len(recorded)}.png"
        _save_image(frame, frame_path)
        return Observation(image_path=str(frame_path))

    for key, field_box, text_value, element in plan:
        label = key.replace("_", " ")

        # Click to focus the field.
        click_x, click_y = _center(field_box)
        norm_box = _bbox_normalized(field_box)
        recorded.append(
            Step(
                t=float(len(recorded)),
                observation=_observe_form(),
                action=Action(
                    type="click",
                    x=click_x,
                    y=click_y,
                    bbox=norm_box,
                    element_index=element,
                ),
                thought=f"Focus the {label} field.",
            )
        )

        # Type the value; the frame is captured before the text appears.
        recorded.append(
            Step(
                t=float(len(recorded)),
                observation=_observe_form(),
                action=Action(type="type", text=text_value, element_index=element),
                thought=f"Type the {label}.",
            )
        )
        typed[key] = text_value

    # Submit the completed form.
    submit_x, submit_y = _center(layout.register_button)
    submit_box = _bbox_normalized(layout.register_button)
    recorded.append(
        Step(
            t=float(len(recorded)),
            observation=_observe_form(),
            action=Action(
                type="click",
                x=submit_x,
                y=submit_y,
                bbox=submit_box,
                element_index=SOM_REGISTER_BUTTON,
            ),
            thought="Submit the registration form.",
        )
    )

    # Terminal step, observed on the success screen.
    final_frame = _draw_registration_success_screen(first_name, email)
    final_path = root / f"{episode_id}_step_{len(recorded)}.png"
    _save_image(final_frame, final_path)
    recorded.append(
        Step(
            t=float(len(recorded)),
            observation=Observation(image_path=str(final_path)),
            action=Action(type="done"),
            thought="Registration successful; workflow complete.",
        )
    )

    return Episode(
        id=episode_id,
        goal=f"Register with first name '{first_name}', last name '{last_name}', email '{email}', and password",
        steps=recorded,
        summary="Successful registration.",
        success=True,
        workflow_id="registration",
    )
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
def _script_registration_episode_som(
    root: Path,
    episode_id: str,
    first_name: str,
    last_name: str,
    email: str,
    password: str,
    jitter: bool = True,
) -> Episode:
    """Build the scripted registration episode with Set-of-Marks overlays.

    Produces the same 12-step sequence as the plain variant: a click and a
    type step per field (first name, last name, email, password, confirm
    password), then a click on the register button, then DONE. Every form
    frame is passed through ``_overlay_som_marks`` with the five fields and
    the register button before being saved. The final success screen is
    saved without an overlay.

    Frames are written as ``{episode_id}_step_{k}.png`` under ``root``.
    """
    layout = _compute_registration_layout(jitter=jitter)

    # (field key, pixel box, text to type, SoM element index)
    plan = [
        ("first_name", layout.first_name_box, first_name, SOM_FIRST_NAME_FIELD),
        ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
        ("email", layout.email_box, email, SOM_EMAIL_FIELD),
        ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
        ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
    ]

    # Elements that receive a numbered mark: every field, then the button.
    marks = [(element, field_box) for _, field_box, _, element in plan]
    marks.append((SOM_REGISTER_BUTTON, layout.register_button))

    # What has been typed into each field so far; keys match the keyword
    # arguments of _draw_registration_screen.
    typed = {
        "first_name": "",
        "last_name": "",
        "email": "",
        "password": "",
        "confirm_password": "",
    }
    recorded: List[Step] = []

    def _observe_form() -> Observation:
        """Render the current form, overlay the marks and save the next frame."""
        frame, _ = _draw_registration_screen(**typed, layout=layout, jitter=False)
        marked = _overlay_som_marks(frame, marks)
        # The next step index always equals the number of steps recorded so far.
        frame_path = root / f"{episode_id}_step_{len(recorded)}.png"
        _save_image(marked, frame_path)
        return Observation(image_path=str(frame_path))

    for key, field_box, text_value, element in plan:
        # Click to focus the field.
        click_x, click_y = _center(field_box)
        norm_box = _bbox_normalized(field_box)
        recorded.append(
            Step(
                t=float(len(recorded)),
                observation=_observe_form(),
                action=Action(
                    type="click",
                    x=click_x,
                    y=click_y,
                    bbox=norm_box,
                    element_index=element,
                ),
                thought=f"Focus element [{element}] ({key.replace('_', ' ')} field).",
            )
        )

        # Type the value; the frame is captured before the text appears.
        recorded.append(
            Step(
                t=float(len(recorded)),
                observation=_observe_form(),
                action=Action(type="type", text=text_value, element_index=element),
                thought=f"Type into element [{element}].",
            )
        )
        typed[key] = text_value

    # Submit the completed form.
    submit_x, submit_y = _center(layout.register_button)
    submit_box = _bbox_normalized(layout.register_button)
    recorded.append(
        Step(
            t=float(len(recorded)),
            observation=_observe_form(),
            action=Action(
                type="click",
                x=submit_x,
                y=submit_y,
                bbox=submit_box,
                element_index=SOM_REGISTER_BUTTON,
            ),
            thought=f"Click element [{SOM_REGISTER_BUTTON}] to submit registration.",
        )
    )

    # Terminal step, observed on the (unmarked) success screen.
    final_frame = _draw_registration_success_screen(first_name, email)
    final_path = root / f"{episode_id}_step_{len(recorded)}.png"
    _save_image(final_frame, final_path)
    recorded.append(
        Step(
            t=float(len(recorded)),
            observation=Observation(image_path=str(final_path)),
            action=Action(type="done"),
            thought="Registration successful; workflow complete.",
        )
    )

    return Episode(
        id=episode_id,
        goal=f"Register with first name '{first_name}', last name '{last_name}', email '{email}', and password",
        steps=recorded,
        summary="Successful registration (SoM mode).",
        success=True,
        workflow_id="registration_som",
    )
|
|
1017
|
+
|
|
1018
|
+
|
|
1019
|
+
def generate_synthetic_sessions(
    num_sessions: int = 10,
    seed: int | None = None,
    output_dir: str | os.PathLike[str] | None = None,
    jitter: bool = True,
    use_som: bool = False,
    scenario: str = "login",
) -> List[Session]:
    """Generate a list of synthetic Sessions with semantic UI episodes.

    Each Session contains a single Episode. Images for the steps of session
    ``i`` are written to ``<output_dir>/session_{i:04d}``.

    Args:
        num_sessions: Number of sessions to generate. A value of zero or
            less yields an empty list.
        seed: Random seed for reproducibility. When given, the global
            ``random`` module is seeded with it.
        output_dir: Directory to write images to. Defaults to
            ``synthetic/data_<scenario>`` (with a ``_som`` suffix when
            ``use_som`` is True), relative to the working directory.
        jitter: Whether to apply slight position jitter to UI elements.
        use_som: If True, generate Set-of-Marks (SoM) annotated screenshots
            with numbered element labels.
        scenario: Type of UI scenario to generate. Options:
            - "login": Simple login form (6 steps, 3 elements)
            - "registration": Registration form (12 steps, 6 elements)

    Returns:
        One Session per generated episode, in generation order.

    Raises:
        ValueError: If ``scenario`` is not one of the supported options.
            This is checked before seeding or generating anything, so it
            is raised even when ``num_sessions`` is 0.
    """
    # Fail fast: validating inside the per-session loop would let a bad
    # scenario pass silently when num_sessions <= 0, and would reseed the
    # global RNG before reporting the error.
    if scenario not in ("login", "registration"):
        raise ValueError(f"Unknown scenario: {scenario}. Options: login, registration")

    if seed is not None:
        random.seed(seed)

    if output_dir is None:
        suffix = "_som" if use_som else ""
        output_root = Path("synthetic") / f"data_{scenario}{suffix}"
    else:
        output_root = Path(output_dir)

    sessions: List[Session] = []

    for i in range(num_sessions):
        session_id = f"session_{i:04d}"
        session_dir = output_root / session_id

        if scenario == "login":
            episode_id = f"{session_id}_login"
            username = f"user{i}"
            password = f"pass{i}123"

            if use_som:
                episode = _script_login_episode_som(
                    session_dir, episode_id, username, password, jitter=jitter
                )
            else:
                episode = _script_login_episode(
                    session_dir, episode_id, username, password, jitter=jitter
                )
        else:
            # scenario == "registration" (guaranteed by the check above)
            episode_id = f"{session_id}_registration"
            first_name = f"John{i}"
            last_name = f"Doe{i}"
            email = f"john{i}@example.com"
            password = f"SecurePass{i}!"

            if use_som:
                episode = _script_registration_episode_som(
                    session_dir, episode_id, first_name, last_name, email, password, jitter=jitter
                )
            else:
                episode = _script_registration_episode(
                    session_dir, episode_id, first_name, last_name, email, password, jitter=jitter
                )

        session = Session(
            id=session_id,
            episodes=[episode],
            meta={"scenario": scenario, "use_som": use_som},
        )
        sessions.append(session)

    return sessions
|
|
1101
|
+
|
|
1102
|
+
|