openadapt-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. openadapt_ml/__init__.py +0 -0
  2. openadapt_ml/benchmarks/__init__.py +125 -0
  3. openadapt_ml/benchmarks/agent.py +825 -0
  4. openadapt_ml/benchmarks/azure.py +761 -0
  5. openadapt_ml/benchmarks/base.py +366 -0
  6. openadapt_ml/benchmarks/cli.py +884 -0
  7. openadapt_ml/benchmarks/data_collection.py +432 -0
  8. openadapt_ml/benchmarks/runner.py +381 -0
  9. openadapt_ml/benchmarks/waa.py +704 -0
  10. openadapt_ml/cloud/__init__.py +5 -0
  11. openadapt_ml/cloud/azure_inference.py +441 -0
  12. openadapt_ml/cloud/lambda_labs.py +2445 -0
  13. openadapt_ml/cloud/local.py +790 -0
  14. openadapt_ml/config.py +56 -0
  15. openadapt_ml/datasets/__init__.py +0 -0
  16. openadapt_ml/datasets/next_action.py +507 -0
  17. openadapt_ml/evals/__init__.py +23 -0
  18. openadapt_ml/evals/grounding.py +241 -0
  19. openadapt_ml/evals/plot_eval_metrics.py +174 -0
  20. openadapt_ml/evals/trajectory_matching.py +486 -0
  21. openadapt_ml/grounding/__init__.py +45 -0
  22. openadapt_ml/grounding/base.py +236 -0
  23. openadapt_ml/grounding/detector.py +570 -0
  24. openadapt_ml/ingest/__init__.py +43 -0
  25. openadapt_ml/ingest/capture.py +312 -0
  26. openadapt_ml/ingest/loader.py +232 -0
  27. openadapt_ml/ingest/synthetic.py +1102 -0
  28. openadapt_ml/models/__init__.py +0 -0
  29. openadapt_ml/models/api_adapter.py +171 -0
  30. openadapt_ml/models/base_adapter.py +59 -0
  31. openadapt_ml/models/dummy_adapter.py +42 -0
  32. openadapt_ml/models/qwen_vl.py +426 -0
  33. openadapt_ml/runtime/__init__.py +0 -0
  34. openadapt_ml/runtime/policy.py +182 -0
  35. openadapt_ml/schemas/__init__.py +53 -0
  36. openadapt_ml/schemas/sessions.py +122 -0
  37. openadapt_ml/schemas/validation.py +252 -0
  38. openadapt_ml/scripts/__init__.py +0 -0
  39. openadapt_ml/scripts/compare.py +1490 -0
  40. openadapt_ml/scripts/demo_policy.py +62 -0
  41. openadapt_ml/scripts/eval_policy.py +287 -0
  42. openadapt_ml/scripts/make_gif.py +153 -0
  43. openadapt_ml/scripts/prepare_synthetic.py +43 -0
  44. openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
  45. openadapt_ml/scripts/train.py +174 -0
  46. openadapt_ml/training/__init__.py +0 -0
  47. openadapt_ml/training/benchmark_viewer.py +1538 -0
  48. openadapt_ml/training/shared_ui.py +157 -0
  49. openadapt_ml/training/stub_provider.py +276 -0
  50. openadapt_ml/training/trainer.py +2446 -0
  51. openadapt_ml/training/viewer.py +2970 -0
  52. openadapt_ml-0.1.0.dist-info/METADATA +818 -0
  53. openadapt_ml-0.1.0.dist-info/RECORD +55 -0
  54. openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
  55. openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1102 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import random
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import List, Optional, Tuple
8
+
9
+ from PIL import Image, ImageDraw, ImageFont
10
+
11
+ from openadapt_ml.schemas.sessions import Action, Episode, Observation, Session, Step
12
+
13
+
14
# Canvas size, in pixels, of every synthetic screenshot produced by this module.
IMG_WIDTH, IMG_HEIGHT = 800, 600
16
+
17
+
18
def _load_font(size: int = 16) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:  # type: ignore[name-defined]
    """Load a font at the requested size, falling back to Pillow's default.

    Arial is tried first. When it cannot be found, Pillow's bundled default
    font is requested at ``size`` so that differently sized fonts (for example
    body text versus titles) stay distinguishable. Pillow releases whose
    ``load_default`` does not accept a ``size`` argument fall back to the
    fixed-size default font.

    Args:
        size: Desired font size in points.

    Returns:
        A font object usable with ``ImageDraw`` text calls.
    """
    try:
        return ImageFont.truetype("arial.ttf", size)
    except OSError:
        # Arial is not installed on this system; use Pillow's bundled font.
        pass

    try:
        return ImageFont.load_default(size=size)
    except TypeError:
        # Older Pillow: load_default() takes no size argument.
        return ImageFont.load_default()
23
+
24
+
25
# Shared fonts, loaded once at import time: FONT for labels, field contents and
# button captions; FONT_TITLE for screen headings.
FONT, FONT_TITLE = _load_font(16), _load_font(24)
27
+
28
+
29
def _normalize(x_px: int, y_px: int) -> Tuple[float, float]:
    """Express a pixel position as fractions of the canvas width and height."""
    norm_x = x_px / IMG_WIDTH
    norm_y = y_px / IMG_HEIGHT
    return norm_x, norm_y
33
+
34
+
35
def _text_size(draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont) -> Tuple[int, int]:
    """Return the (width, height) of ``text`` as measured by ``draw.textbbox``.

    The bounding box is measured with the text anchored at the origin and
    only its extent is returned.
    """
    bbox = draw.textbbox((0, 0), text, font=font)
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width, height
40
+
41
+
42
@dataclass
class LoginUIElements:
    """Pixel-space rectangles of the login form's interactive elements.

    Every rectangle is stored as ``(x, y, w, h)``: top-left corner followed by
    width and height, all in pixels.
    """

    # Username text field.
    username_box: Tuple[int, int, int, int]
    # Password text field.
    password_box: Tuple[int, int, int, int]
    # "Login" submit button.
    login_button: Tuple[int, int, int, int]
52
+
53
+
54
def _compute_login_layout(max_offset: int = 10, jitter: bool = True) -> LoginUIElements:
    """Work out where the login form's fields and button go.

    The layout is computed once so the same bounds can be reused for every
    frame of an episode. With ``jitter`` enabled, each element's base position
    is shifted independently by up to ``max_offset`` pixels per axis and then
    clamped so the elements keep their vertical order and stay on the canvas.

    Args:
        max_offset: Largest per-axis random shift, in pixels.
        jitter: When False the base positions are used unchanged.

    Returns:
        Pixel bounds ``(x, y, w, h)`` for the username box, password box and
        login button.
    """
    field_w, field_h = 360, 40
    button_w, button_h = 140, 45
    column_x = 200
    username_label_top = 160

    def _clamp(value: int, low: int, high: int) -> int:
        return max(low, min(high, value))

    def _perturb(px: int, py: int) -> tuple[int, int]:
        if jitter:
            # Horizontal offset is drawn before the vertical one.
            px = px + random.randint(-max_offset, max_offset)
            py = py + random.randint(-max_offset, max_offset)
            px = _clamp(px, 0, IMG_WIDTH)
            py = _clamp(py, 0, IMG_HEIGHT)
        return px, py

    # Username field sits 24 px below its label.
    user_x, user_y = _perturb(column_x, username_label_top + 24)
    user_x = _clamp(user_x, 20, IMG_WIDTH - field_w - 20)
    user_y = _clamp(user_y, username_label_top + 10, IMG_HEIGHT - field_h - 100)

    # Password label/field are positioned relative to the final username field.
    password_label_top = user_y + field_h + 30
    pass_x, pass_y = _perturb(column_x, password_label_top + 24)
    pass_x = _clamp(pass_x, 20, IMG_WIDTH - field_w - 20)
    pass_y = _clamp(pass_y, password_label_top + 10, IMG_HEIGHT - field_h - 80)

    # Button is horizontally centred and placed below the password field.
    button_x, button_y = _perturb((IMG_WIDTH - button_w) // 2, pass_y + field_h + 50)
    button_x = _clamp(button_x, 20, IMG_WIDTH - button_w - 20)
    button_y = _clamp(button_y, pass_y + field_h + 20, IMG_HEIGHT - button_h - 40)

    return LoginUIElements(
        username_box=(user_x, user_y, field_w, field_h),
        password_box=(pass_x, pass_y, field_w, field_h),
        login_button=(button_x, button_y, button_w, button_h),
    )
104
+
105
+
106
def _draw_login_screen(
    username: str = "",
    password: str = "",
    layout: Optional[LoginUIElements] = None,
    jitter: bool = True,
) -> tuple[Image.Image, LoginUIElements]:
    """Render the login form and report where its interactive elements are.

    The screen contains a title, username and password fields, a green
    "Login" button and a grey "Help" decoy button whose top edge is aligned
    with the login button. Every field and button is drawn using the size
    stored in ``layout``, so the image always matches the returned bounds.

    Args:
        username: Text shown in the username field; nothing is drawn if empty.
        password: Password value; rendered as one ``*`` per character.
        layout: Pre-computed element bounds. When ``None`` a layout is sampled
            with ``_compute_login_layout``.
        jitter: Forwarded to ``_compute_login_layout``; has no effect when
            ``layout`` is supplied.

    Returns:
        ``(image, elements)`` where ``elements`` holds the ``(x, y, w, h)``
        pixel bounds of the username box, password box and login button.
    """
    img = Image.new("RGB", (IMG_WIDTH, IMG_HEIGHT), color=(230, 230, 230))
    draw = ImageDraw.Draw(img)

    # Title, horizontally centred.
    title_text = "Welcome Back!"
    tw, _ = _text_size(draw, title_text, FONT_TITLE)
    draw.text(((IMG_WIDTH - tw) // 2, 80), title_text, fill="black", font=FONT_TITLE)

    # Determine layout once; if not provided, sample it (optionally with jitter).
    if layout is None:
        layout = _compute_login_layout(jitter=jitter)

    label_x = 200
    uname_label_y = 160

    # Username label (fixed position) and field (from the layout).
    draw.text((label_x, uname_label_y), "Username:", fill="black", font=FONT)
    uname_x, uname_y, uname_w, uname_h = layout.username_box
    draw.rectangle(
        [(uname_x, uname_y), (uname_x + uname_w, uname_y + uname_h)],
        outline="black",
        fill="white",
    )
    if username:
        draw.text((uname_x + 8, uname_y + 10), username, fill="black", font=FONT)

    # Password label sits 24 px above the field, wherever the layout put it.
    pw_x, pw_y, pw_w, pw_h = layout.password_box
    draw.text((label_x, pw_y - 24), "Password:", fill="black", font=FONT)
    draw.rectangle(
        [(pw_x, pw_y), (pw_x + pw_w, pw_y + pw_h)],
        outline="black",
        fill="white",
    )
    if password:
        # Never render the clear-text password.
        draw.text((pw_x + 8, pw_y + 10), "*" * len(password), fill="black", font=FONT)

    # Login button with a centred caption.
    btn_x, btn_y, btn_w, btn_h = layout.login_button
    draw.rectangle(
        [(btn_x, btn_y), (btn_x + btn_w, btn_y + btn_h)],
        outline="black",
        fill="green",
    )
    btn_text = "Login"
    btw, bth = _text_size(draw, btn_text, FONT)
    draw.text(
        (btn_x + (btn_w - btw) // 2, btn_y + (btn_h - bth) // 2),
        btn_text,
        fill="white",
        font=FONT,
    )

    # Decoy clickable button (e.g., Help) in the lower-right area.
    decoy_w, decoy_h = 110, 35
    decoy_x = IMG_WIDTH - decoy_w - 40
    decoy_y = btn_y
    draw.rectangle(
        [(decoy_x, decoy_y), (decoy_x + decoy_w, decoy_y + decoy_h)],
        outline="black",
        fill=(180, 180, 180),
    )
    decoy_text = "Help"
    dtw, dth = _text_size(draw, decoy_text, FONT)
    draw.text(
        (decoy_x + (decoy_w - dtw) // 2, decoy_y + (decoy_h - dth) // 2),
        decoy_text,
        fill="black",
        font=FONT,
    )

    # Return a fresh object so callers cannot mutate the layout they passed in.
    elements = LoginUIElements(
        username_box=layout.username_box,
        password_box=layout.password_box,
        login_button=layout.login_button,
    )
    return img, elements
217
+
218
+
219
def _overlay_som_marks(
    img: Image.Image,
    elements: List[Tuple[int, Tuple[int, int, int, int]]],
) -> Image.Image:
    """Overlay Set-of-Marks numbered labels on interactive elements.

    Each label is a black box containing white ``[n]`` text. The box is placed
    just above the element's top-left corner; if that would leave the image at
    the top, it is moved just inside the element's top edge instead.

    Args:
        img: The base screenshot image. It is not modified.
        elements: List of ``(index, (x, y, w, h))`` tuples, one per element.
            ``index`` is the number shown in the label.

    Returns:
        A copy of the image with the labels drawn on it.
    """
    img = img.copy()
    draw = ImageDraw.Draw(img)

    # Reuse the module's font loader so SoM labels share its fallback logic.
    som_font = _load_font(14)
    padding = 4

    for idx, (x, y, _w, _h) in elements:
        label = f"[{idx}]"

        # textbbox reports where the ink lands relative to the draw origin;
        # left/top are usually non-zero and must be compensated when centring.
        left, top, right, bottom = draw.textbbox((0, 0), label, font=som_font)
        box_width = (right - left) + padding * 2
        box_height = (bottom - top) + padding * 2

        # Default position: above the element, nudged slightly to the left.
        box_x = x - 4
        box_y = y - box_height - 2

        # Keep the box inside the image.
        if box_y < 0:
            # No room above: drop the label just inside the element's top edge.
            box_y = y + 4
        if box_x < 0:
            box_x = 4

        # Black rectangle background.
        draw.rectangle(
            [box_x, box_y, box_x + box_width, box_y + box_height],
            fill="black",
        )

        # White text; subtract the ink offset so the padding is even on all sides.
        draw.text(
            (box_x + padding - left, box_y + padding - top),
            label,
            fill="white",
            font=som_font,
        )

    return img
283
+
284
+
285
# Set-of-Marks label numbers for the login screen's interactive elements.
# Numbering starts at 1 so the labels read naturally to a human.
SOM_USERNAME_FIELD, SOM_PASSWORD_FIELD, SOM_LOGIN_BUTTON = 1, 2, 3
289
+
290
+
291
def _draw_logged_in_screen(username: str) -> Image.Image:
    """Render the post-login confirmation screen greeting ``username``.

    The greeting is centred both horizontally and vertically on a pale green
    canvas.
    """
    canvas = Image.new("RGB", (IMG_WIDTH, IMG_HEIGHT), color=(210, 230, 210))
    pen = ImageDraw.Draw(canvas)

    greeting = f"Welcome, {username}!"
    text_w, text_h = _text_size(pen, greeting, FONT_TITLE)
    origin = ((IMG_WIDTH - text_w) // 2, (IMG_HEIGHT - text_h) // 2)
    pen.text(origin, greeting, fill="darkgreen", font=FONT_TITLE)
    return canvas
302
+
303
+
304
def _save_image(img: Image.Image, path: Path) -> None:
    """Write ``img`` to ``path``, creating missing parent directories first."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    img.save(path)
307
+
308
+
309
def _center(bounds: Tuple[int, int, int, int]) -> Tuple[float, float]:
    """Return the normalized centre point of an ``(x, y, w, h)`` pixel rectangle.

    The centre is computed on whole pixels (half-sizes are floored) before
    being normalized by the canvas size.
    """
    left, top, width, height = bounds
    return _normalize(left + width // 2, top + height // 2)
314
+
315
+
316
def _bbox_normalized(bounds: Tuple[int, int, int, int]) -> Tuple[float, float, float, float]:
    """Turn ``(x, y, w, h)`` pixel bounds into a normalized corner box.

    Returns:
        ``(x_min, y_min, x_max, y_max)`` with each coordinate divided by the
        canvas width (x) or height (y).
    """
    left, top, width, height = bounds
    right = left + width
    bottom = top + height
    return (
        left / IMG_WIDTH,
        top / IMG_HEIGHT,
        right / IMG_WIDTH,
        bottom / IMG_HEIGHT,
    )
324
+
325
+
326
def _script_login_episode(
    root: Path,
    episode_id: str,
    username: str,
    password: str,
    jitter: bool = True,
) -> Episode:
    """Build the scripted six-step login episode and write its screenshots.

    One layout is sampled for the whole episode, so every frame shows the
    elements in the same place. Screenshots are saved as
    ``<root>/<episode_id>_step_<n>.png``.

    Steps:
        0. Empty form -> click the username field.
        1. Empty form -> type the username.
        2. Username shown -> click the password field.
        3. Username shown -> type the password.
        4. Username and masked password shown -> click the login button.
        5. Logged-in screen -> done.

    Click actions carry the normalized centre of the target plus its
    normalized bounding box, for bbox-based click hit evaluation.
    """
    layout = _compute_login_layout(jitter=jitter)

    user_cx, user_cy = _center(layout.username_box)
    pass_cx, pass_cy = _center(layout.password_box)
    button_cx, button_cy = _center(layout.login_button)

    # One row per login-screen step:
    # (form contents to render, Action keyword arguments, thought).
    script = [
        (
            {},
            {"type": "click", "x": user_cx, "y": user_cy, "bbox": _bbox_normalized(layout.username_box)},
            "Focus the username field.",
        ),
        (
            {"username": ""},
            {"type": "type", "text": username},
            "Type the username.",
        ),
        (
            {"username": username},
            {"type": "click", "x": pass_cx, "y": pass_cy, "bbox": _bbox_normalized(layout.password_box)},
            "Focus the password field.",
        ),
        (
            {"username": username},
            {"type": "type", "text": password},
            "Type the password.",
        ),
        (
            {"username": username, "password": password},
            {"type": "click", "x": button_cx, "y": button_cy, "bbox": _bbox_normalized(layout.login_button)},
            "Submit the login form.",
        ),
    ]

    steps: List[Step] = []
    for index, (form_state, action_fields, thought) in enumerate(script):
        # The layout is fixed, so jitter is disabled for the individual frames.
        frame, _ = _draw_login_screen(layout=layout, jitter=False, **form_state)
        frame_path = root / f"{episode_id}_step_{index}.png"
        _save_image(frame, frame_path)
        steps.append(
            Step(
                t=float(index),
                observation=Observation(image_path=str(frame_path)),
                action=Action(**action_fields),
                thought=thought,
            )
        )

    # Final frame: the logged-in confirmation screen.
    final_index = len(steps)
    final_path = root / f"{episode_id}_step_{final_index}.png"
    _save_image(_draw_logged_in_screen(username=username), final_path)
    steps.append(
        Step(
            t=float(final_index),
            observation=Observation(image_path=str(final_path)),
            action=Action(type="done"),
            thought="Login successful; workflow complete.",
        )
    )

    return Episode(
        id=episode_id,
        goal=f"Log in with username '{username}' and password '{password}'",
        steps=steps,
        summary="Successful login via username and password.",
        success=True,
        workflow_id="login_basic",
    )
454
+
455
+
456
def _script_login_episode_som(
    root: Path,
    episode_id: str,
    username: str,
    password: str,
    jitter: bool = True,
) -> Episode:
    """Build the scripted login episode with Set-of-Marks (SoM) overlays.

    Same six-step flow as the plain login episode, but every login-screen
    frame has numbered labels [1], [2], [3] drawn over the username field,
    password field and login button, and every click/type action also carries
    the ``element_index`` of its target. Screenshots are saved as
    ``<root>/<episode_id>_step_<n>.png``.

    Steps:
        0. Empty form -> click element [1] (username field).
        1. Empty form -> type the username into element [1].
        2. Username shown -> click element [2] (password field).
        3. Username shown -> type the password into element [2].
        4. Username and masked password shown -> click element [3].
        5. Logged-in screen (no overlay) -> done.
    """
    layout = _compute_login_layout(jitter=jitter)

    # Labels drawn on every login-screen frame.
    marks = [
        (SOM_USERNAME_FIELD, layout.username_box),
        (SOM_PASSWORD_FIELD, layout.password_box),
        (SOM_LOGIN_BUTTON, layout.login_button),
    ]

    def _click_fields(box: Tuple[int, int, int, int], element: int) -> dict:
        centre_x, centre_y = _center(box)
        return {
            "type": "click",
            "x": centre_x,
            "y": centre_y,
            "bbox": _bbox_normalized(box),
            "element_index": element,
        }

    # One row per login-screen step:
    # (form contents to render, Action keyword arguments, thought).
    script = [
        (
            {},
            _click_fields(layout.username_box, SOM_USERNAME_FIELD),
            "Focus the username field by clicking element [1].",
        ),
        (
            {"username": ""},
            {"type": "type", "text": username, "element_index": SOM_USERNAME_FIELD},
            "Type the username into element [1].",
        ),
        (
            {"username": username},
            _click_fields(layout.password_box, SOM_PASSWORD_FIELD),
            "Focus the password field by clicking element [2].",
        ),
        (
            {"username": username},
            {"type": "type", "text": password, "element_index": SOM_PASSWORD_FIELD},
            "Type the password into element [2].",
        ),
        (
            {"username": username, "password": password},
            _click_fields(layout.login_button, SOM_LOGIN_BUTTON),
            "Submit the login form by clicking element [3].",
        ),
    ]

    steps: List[Step] = []
    for index, (form_state, action_fields, thought) in enumerate(script):
        base_frame, _ = _draw_login_screen(layout=layout, jitter=False, **form_state)
        marked_frame = _overlay_som_marks(base_frame, marks)
        frame_path = root / f"{episode_id}_step_{index}.png"
        _save_image(marked_frame, frame_path)
        steps.append(
            Step(
                t=float(index),
                observation=Observation(image_path=str(frame_path)),
                action=Action(**action_fields),
                thought=thought,
            )
        )

    # Final frame: logged-in screen, which has nothing to mark.
    final_index = len(steps)
    final_path = root / f"{episode_id}_step_{final_index}.png"
    _save_image(_draw_logged_in_screen(username=username), final_path)
    steps.append(
        Step(
            t=float(final_index),
            observation=Observation(image_path=str(final_path)),
            action=Action(type="done"),
            thought="Login successful; workflow complete.",
        )
    )

    return Episode(
        id=episode_id,
        goal=f"Log in with username '{username}' and password '{password}'",
        steps=steps,
        summary="Successful login via username and password (SoM mode).",
        success=True,
        workflow_id="login_basic_som",
    )
617
+
618
+
619
@dataclass
class RegistrationUIElements:
    """Pixel-space rectangles of the registration form's interactive elements.

    Every rectangle is stored as ``(x, y, w, h)``: top-left corner followed by
    width and height, all in pixels.
    """

    # Text fields, in the order they appear on the form.
    first_name_box: Tuple[int, int, int, int]
    last_name_box: Tuple[int, int, int, int]
    email_box: Tuple[int, int, int, int]
    password_box: Tuple[int, int, int, int]
    confirm_password_box: Tuple[int, int, int, int]
    # "Register" submit button.
    register_button: Tuple[int, int, int, int]
632
+
633
+
634
# Set-of-Marks label numbers for the registration screen (numbering starts at 1),
# in top-to-bottom form order.
(
    SOM_FIRST_NAME_FIELD,
    SOM_LAST_NAME_FIELD,
    SOM_EMAIL_FIELD,
    SOM_REG_PASSWORD_FIELD,
    SOM_CONFIRM_PASSWORD_FIELD,
    SOM_REGISTER_BUTTON,
) = (1, 2, 3, 4, 5, 6)
641
+
642
+
643
def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> RegistrationUIElements:
    """Work out where the registration form's fields and button go.

    The five text fields are stacked 70 px apart, each 24 px below its label
    row; the register button is horizontally centred below them. With
    ``jitter`` enabled, each element is shifted independently by up to
    ``max_offset`` pixels per axis and clamped to stay on the canvas.

    Args:
        max_offset: Largest per-axis random shift, in pixels.
        jitter: When False the base positions are used unchanged.

    Returns:
        Pixel bounds ``(x, y, w, h)`` for every field and the register button.
    """
    column_x = 180
    field_w, field_h = 400, 36
    first_row_y = 100
    row_pitch = 70
    button_w, button_h = 160, 45

    def _perturb(px: int, py: int) -> tuple[int, int]:
        if not jitter:
            return px, py
        # Horizontal offset is drawn before the vertical one.
        shifted_x = px + random.randint(-max_offset, max_offset)
        shifted_y = py + random.randint(-max_offset, max_offset)
        clamped_x = max(20, min(IMG_WIDTH - field_w - 20, shifted_x))
        clamped_y = max(20, min(IMG_HEIGHT - 60, shifted_y))
        return clamped_x, clamped_y

    # Fields are sampled top to bottom: first name, last name, email,
    # password, confirm password.
    field_boxes = []
    for row in range(5):
        fx, fy = _perturb(column_x, first_row_y + row * row_pitch + 24)
        field_boxes.append((fx, fy, field_w, field_h))

    button_x, button_y = _perturb((IMG_WIDTH - button_w) // 2, first_row_y + 5 * row_pitch + 40)

    first_name, last_name, email, password, confirm_password = field_boxes
    return RegistrationUIElements(
        first_name_box=first_name,
        last_name_box=last_name,
        email_box=email,
        password_box=password,
        confirm_password_box=confirm_password,
        register_button=(button_x, button_y, button_w, button_h),
    )
691
+
692
+
693
def _draw_registration_screen(
    first_name: str = "",
    last_name: str = "",
    email: str = "",
    password: str = "",
    confirm_password: str = "",
    layout: Optional[RegistrationUIElements] = None,
    jitter: bool = True,
) -> tuple[Image.Image, RegistrationUIElements]:
    """Render the registration form with the given field contents.

    Labels are drawn at fixed rows; field boxes and the register button are
    drawn at the positions and sizes stored in ``layout``. Password values are
    rendered as one ``*`` per character. A grey "Clear" decoy button is drawn
    near the right edge, 5 px below the register button's top.

    Args:
        first_name, last_name, email, password, confirm_password: Field
            contents; an empty string leaves the field blank.
        layout: Pre-computed element bounds. When ``None`` a layout is sampled
            with ``_compute_registration_layout``.
        jitter: Forwarded to ``_compute_registration_layout``; has no effect
            when ``layout`` is supplied.

    Returns:
        ``(image, layout)`` where ``layout`` is the layout that was drawn.
    """
    canvas = Image.new("RGB", (IMG_WIDTH, IMG_HEIGHT), color=(235, 240, 245))
    pen = ImageDraw.Draw(canvas)

    # Title, horizontally centred.
    heading = "Create Account"
    heading_w, _ = _text_size(pen, heading, FONT_TITLE)
    pen.text(((IMG_WIDTH - heading_w) // 2, 40), heading, fill="darkblue", font=FONT_TITLE)

    if layout is None:
        layout = _compute_registration_layout(jitter=jitter)

    label_x = 180
    first_label_y = 100
    row_pitch = 70

    # (label text, field bounds, field contents, mask contents?)
    rows = [
        ("First Name:", layout.first_name_box, first_name, False),
        ("Last Name:", layout.last_name_box, last_name, False),
        ("Email:", layout.email_box, email, False),
        ("Password:", layout.password_box, password, True),
        ("Confirm Password:", layout.confirm_password_box, confirm_password, True),
    ]

    label_y = first_label_y
    for caption, (fx, fy, fw, fh), content, masked in rows:
        pen.text((label_x, label_y), caption, fill="black", font=FONT)
        pen.rectangle([(fx, fy), (fx + fw, fy + fh)], outline="black", fill="white")
        if content:
            if masked:
                shown = "*" * len(content)
            else:
                shown = content
            pen.text((fx + 8, fy + 8), shown, fill="black", font=FONT)
        label_y += row_pitch

    # Register button with a centred caption.
    reg_x, reg_y, reg_w, reg_h = layout.register_button
    pen.rectangle([(reg_x, reg_y), (reg_x + reg_w, reg_y + reg_h)], outline="black", fill="darkblue")
    reg_caption = "Register"
    cap_w, cap_h = _text_size(pen, reg_caption, FONT)
    pen.text(
        (reg_x + (reg_w - cap_w) // 2, reg_y + (reg_h - cap_h) // 2),
        reg_caption,
        fill="white",
        font=FONT,
    )

    # Decoy "Clear" button.
    clear_w, clear_h = 100, 35
    clear_x = IMG_WIDTH - clear_w - 30
    clear_y = reg_y + 5
    pen.rectangle(
        [(clear_x, clear_y), (clear_x + clear_w, clear_y + clear_h)],
        outline="gray",
        fill=(200, 200, 200),
    )
    clear_caption = "Clear"
    clear_cap_w, clear_cap_h = _text_size(pen, clear_caption, FONT)
    pen.text(
        (clear_x + (clear_w - clear_cap_w) // 2, clear_y + (clear_h - clear_cap_h) // 2),
        clear_caption,
        fill="gray",
        font=FONT,
    )

    return canvas, layout
754
+
755
+
756
def _draw_registration_success_screen(first_name: str, email: str) -> Image.Image:
    """Render the screen shown after a successful registration.

    A centred greeting for ``first_name`` is drawn 40 px above the vertical
    midpoint, with a smaller grey line naming ``email`` 20 px below it.
    """
    canvas = Image.new("RGB", (IMG_WIDTH, IMG_HEIGHT), color=(210, 235, 210))
    pen = ImageDraw.Draw(canvas)
    mid_y = IMG_HEIGHT // 2

    greeting = f"Welcome, {first_name}!"
    greeting_w, _ = _text_size(pen, greeting, FONT_TITLE)
    pen.text(((IMG_WIDTH - greeting_w) // 2, mid_y - 40), greeting, fill="darkgreen", font=FONT_TITLE)

    notice = f"Confirmation sent to {email}"
    notice_w, _ = _text_size(pen, notice, FONT)
    pen.text(((IMG_WIDTH - notice_w) // 2, mid_y + 20), notice, fill="gray", font=FONT)

    return canvas
767
+
768
+
769
def _script_registration_episode(
    root: Path,
    episode_id: str,
    first_name: str,
    last_name: str,
    email: str,
    password: str,
    jitter: bool = True,
) -> Episode:
    """Build the scripted 12-step registration episode and write its screenshots.

    For each of the five fields (first name, last name, email, password,
    confirm password) there is a click step followed by a type step; both
    screenshots show the form as it was before the value was typed. Then the
    register button is clicked and a final "done" step shows the success
    screen. ``password`` is typed into both password fields. Screenshots are
    saved as ``<root>/<episode_id>_step_<n>.png``.

    Steps:
        0/1: click / type first name
        2/3: click / type last name
        4/5: click / type email
        6/7: click / type password
        8/9: click / type confirm password
        10: click register button
        11: done
    """
    layout = _compute_registration_layout(jitter=jitter)
    steps: List[Step] = []

    # Form contents as currently visible; keys match the drawing function's
    # keyword arguments.
    form = {"first_name": "", "last_name": "", "email": "", "password": "", "confirm_password": ""}

    def _render_form() -> Image.Image:
        frame, _ = _draw_registration_screen(layout=layout, jitter=False, **form)
        return frame

    def _emit(frame: Image.Image, thought: str, **action_fields) -> None:
        # The next step number is always the number of steps recorded so far.
        index = len(steps)
        frame_path = root / f"{episode_id}_step_{index}.png"
        _save_image(frame, frame_path)
        steps.append(Step(
            t=float(index),
            observation=Observation(image_path=str(frame_path)),
            action=Action(**action_fields),
            thought=thought,
        ))

    # (field name, field bounds, value to type, element index)
    sequence = [
        ("first_name", layout.first_name_box, first_name, SOM_FIRST_NAME_FIELD),
        ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
        ("email", layout.email_box, email, SOM_EMAIL_FIELD),
        ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
        ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
    ]

    for field_name, box, value, elem_idx in sequence:
        readable = field_name.replace("_", " ")

        # Click step: focus the field.
        centre_x, centre_y = _center(box)
        _emit(
            _render_form(),
            f"Focus the {readable} field.",
            type="click",
            x=centre_x,
            y=centre_y,
            bbox=_bbox_normalized(box),
            element_index=elem_idx,
        )

        # Type step: the value only appears in later screenshots.
        _emit(
            _render_form(),
            f"Type the {readable}.",
            type="type",
            text=value,
            element_index=elem_idx,
        )
        form[field_name] = value

    # Click the register button on the fully filled-in form.
    centre_x, centre_y = _center(layout.register_button)
    _emit(
        _render_form(),
        "Submit the registration form.",
        type="click",
        x=centre_x,
        y=centre_y,
        bbox=_bbox_normalized(layout.register_button),
        element_index=SOM_REGISTER_BUTTON,
    )

    # Done step on the success screen.
    _emit(
        _draw_registration_success_screen(first_name, email),
        "Registration successful; workflow complete.",
        type="done",
    )

    return Episode(
        id=episode_id,
        goal=f"Register with first name '{first_name}', last name '{last_name}', email '{email}', and password",
        steps=steps,
        summary="Successful registration.",
        success=True,
        workflow_id="registration",
    )
894
+
895
+
896
def _script_registration_episode_som(
    root: Path,
    episode_id: str,
    first_name: str,
    last_name: str,
    email: str,
    password: str,
    jitter: bool = True,
) -> Episode:
    """Script a successful registration episode with SoM-marked screenshots.

    The episode has 12 steps: a click followed by a type action for each of
    the five form fields, a click on the register button, and a final
    ``done`` step. Each step's screenshot is saved as
    ``root / f"{episode_id}_step_{n}.png"``. Form screenshots are passed
    through ``_overlay_som_marks``; the final success screenshot is not.

    Args:
        root: Directory under which the step screenshots are saved.
        episode_id: Episode identifier, also used as the image file prefix.
        first_name: Text typed into the first-name field.
        last_name: Text typed into the last-name field.
        email: Text typed into the email field.
        password: Text typed into both the password and the
            confirm-password fields.
        jitter: Forwarded to ``_compute_registration_layout``. The layout is
            computed once; every render reuses it with ``jitter=False``.

    Returns:
        An ``Episode`` with ``success=True`` and
        ``workflow_id="registration_som"``.
    """
    layout = _compute_registration_layout(jitter=jitter)

    # Elements that receive a numbered mark on every form screenshot.
    marks = [
        (SOM_FIRST_NAME_FIELD, layout.first_name_box),
        (SOM_LAST_NAME_FIELD, layout.last_name_box),
        (SOM_EMAIL_FIELD, layout.email_box),
        (SOM_REG_PASSWORD_FIELD, layout.password_box),
        (SOM_CONFIRM_PASSWORD_FIELD, layout.confirm_password_box),
        (SOM_REGISTER_BUTTON, layout.register_button),
    ]

    # (form key, target box, text to type, SoM index), in fill order.
    fill_plan = [
        ("first_name", layout.first_name_box, first_name, SOM_FIRST_NAME_FIELD),
        ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
        ("email", layout.email_box, email, SOM_EMAIL_FIELD),
        ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
        ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
    ]

    # Text currently shown in each field; updated after each type step.
    form_state = {
        "first_name": "",
        "last_name": "",
        "email": "",
        "password": "",
        "confirm_password": "",
    }
    steps: List[Step] = []

    def _snapshot_form():
        """Render the form as it stands, mark it, save it as the next step image.

        Returns the index of the step about to be recorded and the saved
        image path as a string.
        """
        # One Step is appended per snapshot, so the next index is len(steps).
        idx = len(steps)
        frame, _ = _draw_registration_screen(
            first_name=form_state["first_name"],
            last_name=form_state["last_name"],
            email=form_state["email"],
            password=form_state["password"],
            confirm_password=form_state["confirm_password"],
            layout=layout,
            jitter=False,
        )
        marked = _overlay_som_marks(frame, marks)
        target = root / f"{episode_id}_step_{idx}.png"
        _save_image(marked, target)
        return idx, str(target)

    for key, box, text, mark in fill_plan:
        label = key.replace("_", " ")

        # Click to focus the field.
        click_x, click_y = _center(box)
        norm_box = _bbox_normalized(box)
        idx, shot = _snapshot_form()
        steps.append(
            Step(
                t=float(idx),
                observation=Observation(image_path=shot),
                action=Action(
                    type="click",
                    x=click_x,
                    y=click_y,
                    bbox=norm_box,
                    element_index=mark,
                ),
                thought=f"Focus element [{mark}] ({label} field).",
            )
        )

        # Type the value; the screenshot still shows the field before typing.
        idx, shot = _snapshot_form()
        steps.append(
            Step(
                t=float(idx),
                observation=Observation(image_path=shot),
                action=Action(type="type", text=text, element_index=mark),
                thought=f"Type into element [{mark}].",
            )
        )
        form_state[key] = text

    # Submit the completed form.
    click_x, click_y = _center(layout.register_button)
    norm_box = _bbox_normalized(layout.register_button)
    idx, shot = _snapshot_form()
    steps.append(
        Step(
            t=float(idx),
            observation=Observation(image_path=shot),
            action=Action(
                type="click",
                x=click_x,
                y=click_y,
                bbox=norm_box,
                element_index=SOM_REGISTER_BUTTON,
            ),
            thought=f"Click element [{SOM_REGISTER_BUTTON}] to submit registration.",
        )
    )

    # Terminal step: success screen, saved without SoM marks.
    idx = len(steps)
    success_frame = _draw_registration_success_screen(first_name, email)
    success_path = root / f"{episode_id}_step_{idx}.png"
    _save_image(success_frame, success_path)
    steps.append(
        Step(
            t=float(idx),
            observation=Observation(image_path=str(success_path)),
            action=Action(type="done"),
            thought="Registration successful; workflow complete.",
        )
    )

    return Episode(
        id=episode_id,
        goal=f"Register with first name '{first_name}', last name '{last_name}', email '{email}', and password",
        steps=steps,
        summary="Successful registration (SoM mode).",
        success=True,
        workflow_id="registration_som",
    )
1017
+
1018
+
1019
def generate_synthetic_sessions(
    num_sessions: int = 10,
    seed: int | None = None,
    output_dir: str | os.PathLike[str] | None = None,
    jitter: bool = True,
    use_som: bool = False,
    scenario: str = "login",
) -> List[Session]:
    """Generate a list of synthetic Sessions with semantic UI episodes.

    Each Session contains a single Episode. Every session gets its own image
    root, ``<output_dir>/session_NNNN``, which is handed to the episode
    scripting helper.

    Args:
        num_sessions: Number of sessions to generate. Zero or a negative
            value yields an empty list.
        seed: Random seed for reproducibility. When given, it reseeds the
            module-level ``random`` generator (a process-wide side effect).
        output_dir: Directory under which per-session image directories are
            placed. Defaults to ``synthetic/data_<scenario>`` (with a
            ``_som`` suffix when ``use_som`` is True), relative to the
            current working directory.
        jitter: Whether to apply slight position jitter to UI elements.
        use_som: If True, use the Set-of-Marks (SoM) scripting helpers, which
            produce screenshots annotated with numbered element labels.
        scenario: Type of UI scenario to generate. Options:
            - "login": Simple login form (6 steps, 3 elements)
            - "registration": Registration form (12 steps, 6 elements)

    Returns:
        One ``Session`` per generated episode, in generation order. Each
        session's ``meta`` records ``scenario`` and ``use_som``.

    Raises:
        ValueError: If ``scenario`` is not "login" or "registration". The
            check runs before seeding or any generation, so it fires even
            when ``num_sessions`` is 0.
    """
    # Fail fast: validating inside the loop would let a bad scenario slip
    # through silently when there are no sessions to generate.
    if scenario not in ("login", "registration"):
        raise ValueError(f"Unknown scenario: {scenario}. Options: login, registration")

    if seed is not None:
        random.seed(seed)

    if output_dir is None:
        suffix = "_som" if use_som else ""
        output_root = Path("synthetic") / f"data_{scenario}{suffix}"
    else:
        output_root = Path(output_dir)

    sessions: List[Session] = []

    for i in range(num_sessions):
        session_id = f"session_{i:04d}"
        session_dir = output_root / session_id

        if scenario == "login":
            episode_id = f"{session_id}_login"
            username = f"user{i}"
            password = f"pass{i}123"

            if use_som:
                episode = _script_login_episode_som(
                    session_dir, episode_id, username, password, jitter=jitter
                )
            else:
                episode = _script_login_episode(
                    session_dir, episode_id, username, password, jitter=jitter
                )

        else:
            # Only "registration" can reach here; scenario was validated above.
            episode_id = f"{session_id}_registration"
            first_name = f"John{i}"
            last_name = f"Doe{i}"
            email = f"john{i}@example.com"
            password = f"SecurePass{i}!"

            if use_som:
                episode = _script_registration_episode_som(
                    session_dir, episode_id, first_name, last_name, email, password, jitter=jitter
                )
            else:
                episode = _script_registration_episode(
                    session_dir, episode_id, first_name, last_name, email, password, jitter=jitter
                )

        sessions.append(
            Session(
                id=session_id,
                episodes=[episode],
                meta={"scenario": scenario, "use_som": use_som},
            )
        )

    return sessions
1101
+
1102
+