openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -8,7 +8,7 @@ from typing import List, Optional, Tuple
8
8
 
9
9
  from PIL import Image, ImageDraw, ImageFont
10
10
 
11
- from openadapt_ml.schemas.sessions import Action, Episode, Observation, Session, Step
11
+ from openadapt_ml.schema import Action, ActionType, Episode, Observation, Step
12
12
 
13
13
 
14
14
  IMG_WIDTH = 800
@@ -32,7 +32,9 @@ def _normalize(x_px: int, y_px: int) -> Tuple[float, float]:
32
32
  return x_px / IMG_WIDTH, y_px / IMG_HEIGHT
33
33
 
34
34
 
35
- def _text_size(draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont) -> Tuple[int, int]:
35
+ def _text_size(
36
+ draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont
37
+ ) -> Tuple[int, int]:
36
38
  """Compute text width/height using textbbox for Pillow compatibility."""
37
39
 
38
40
  left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
@@ -313,7 +315,9 @@ def _center(bounds: Tuple[int, int, int, int]) -> Tuple[float, float]:
313
315
  return _normalize(cx, cy)
314
316
 
315
317
 
316
- def _bbox_normalized(bounds: Tuple[int, int, int, int]) -> Tuple[float, float, float, float]:
318
+ def _bbox_normalized(
319
+ bounds: Tuple[int, int, int, int],
320
+ ) -> Tuple[float, float, float, float]:
317
321
  """Convert pixel bounds (x, y, w, h) to normalized bbox (x_min, y_min, x_max, y_max)."""
318
322
  x, y, w, h = bounds
319
323
  x_min = x / IMG_WIDTH
@@ -333,12 +337,12 @@ def _script_login_episode(
333
337
  """Create a scripted login episode with a fixed sequence of steps.
334
338
 
335
339
  Steps (6 total):
336
- - Step 0: blank login screen click username field.
337
- - Step 1: username field focused type username.
338
- - Step 2: username typed click password field.
339
- - Step 3: password field focused type password.
340
- - Step 4: password typed click login button.
341
- - Step 5: logged-in screen DONE.
340
+ - Step 0: blank login screen -> click username field.
341
+ - Step 1: username field focused -> type username.
342
+ - Step 2: username typed -> click password field.
343
+ - Step 3: password field focused -> type password.
344
+ - Step 4: password typed -> click login button.
345
+ - Step 5: logged-in screen -> DONE.
342
346
 
343
347
  Each step includes bounding boxes for clickable elements to support
344
348
  bbox-based click hit evaluation.
@@ -354,100 +358,122 @@ def _script_login_episode(
354
358
  password_bbox = _bbox_normalized(layout.password_box)
355
359
  login_bbox = _bbox_normalized(layout.login_button)
356
360
 
357
- # Step 0: blank login screen click username field
361
+ # Step 0: blank login screen -> click username field
358
362
  cx, cy = _center(layout.username_box)
359
363
  img0, _ = _draw_login_screen(layout=layout, jitter=False)
360
364
  img0_path = root / f"{episode_id}_step_0.png"
361
365
  _save_image(img0, img0_path)
362
- obs0 = Observation(image_path=str(img0_path))
366
+ obs0 = Observation(screenshot_path=str(img0_path))
363
367
  steps.append(
364
368
  Step(
365
- t=0.0,
369
+ step_index=0,
370
+ timestamp=0.0,
366
371
  observation=obs0,
367
- action=Action(type="click", x=cx, y=cy, bbox=username_bbox),
368
- thought="Focus the username field.",
372
+ action=Action(
373
+ type=ActionType.CLICK,
374
+ normalized_coordinates=(cx, cy),
375
+ raw={"bbox": username_bbox},
376
+ ),
377
+ reasoning="Focus the username field.",
369
378
  )
370
379
  )
371
380
 
372
- # Step 1: username field focused type username
381
+ # Step 1: username field focused -> type username
373
382
  img1, _ = _draw_login_screen(username="", layout=layout, jitter=False)
374
383
  img1_path = root / f"{episode_id}_step_1.png"
375
384
  _save_image(img1, img1_path)
376
- obs1 = Observation(image_path=str(img1_path))
385
+ obs1 = Observation(screenshot_path=str(img1_path))
377
386
  steps.append(
378
387
  Step(
379
- t=1.0,
388
+ step_index=1,
389
+ timestamp=1.0,
380
390
  observation=obs1,
381
- action=Action(type="type", text=username),
382
- thought="Type the username.",
391
+ action=Action(type=ActionType.TYPE, text=username),
392
+ reasoning="Type the username.",
383
393
  )
384
394
  )
385
395
 
386
- # Step 2: username typed click password field
396
+ # Step 2: username typed -> click password field
387
397
  cx_pw, cy_pw = _center(layout.password_box)
388
398
  img2, _ = _draw_login_screen(username=username, layout=layout, jitter=False)
389
399
  img2_path = root / f"{episode_id}_step_2.png"
390
400
  _save_image(img2, img2_path)
391
- obs2 = Observation(image_path=str(img2_path))
401
+ obs2 = Observation(screenshot_path=str(img2_path))
392
402
  steps.append(
393
403
  Step(
394
- t=2.0,
404
+ step_index=2,
405
+ timestamp=2.0,
395
406
  observation=obs2,
396
- action=Action(type="click", x=cx_pw, y=cy_pw, bbox=password_bbox),
397
- thought="Focus the password field.",
407
+ action=Action(
408
+ type=ActionType.CLICK,
409
+ normalized_coordinates=(cx_pw, cy_pw),
410
+ raw={"bbox": password_bbox},
411
+ ),
412
+ reasoning="Focus the password field.",
398
413
  )
399
414
  )
400
415
 
401
- # Step 3: password field focused type password
416
+ # Step 3: password field focused -> type password
402
417
  img3, _ = _draw_login_screen(username=username, layout=layout, jitter=False)
403
418
  img3_path = root / f"{episode_id}_step_3.png"
404
419
  _save_image(img3, img3_path)
405
- obs3 = Observation(image_path=str(img3_path))
420
+ obs3 = Observation(screenshot_path=str(img3_path))
406
421
  steps.append(
407
422
  Step(
408
- t=3.0,
423
+ step_index=3,
424
+ timestamp=3.0,
409
425
  observation=obs3,
410
- action=Action(type="type", text=password),
411
- thought="Type the password.",
426
+ action=Action(type=ActionType.TYPE, text=password),
427
+ reasoning="Type the password.",
412
428
  )
413
429
  )
414
430
 
415
- # Step 4: password typed click login button
431
+ # Step 4: password typed -> click login button
416
432
  cx_btn, cy_btn = _center(layout.login_button)
417
- img4, _ = _draw_login_screen(username=username, password=password, layout=layout, jitter=False)
433
+ img4, _ = _draw_login_screen(
434
+ username=username, password=password, layout=layout, jitter=False
435
+ )
418
436
  img4_path = root / f"{episode_id}_step_4.png"
419
437
  _save_image(img4, img4_path)
420
- obs4 = Observation(image_path=str(img4_path))
438
+ obs4 = Observation(screenshot_path=str(img4_path))
421
439
  steps.append(
422
440
  Step(
423
- t=4.0,
441
+ step_index=4,
442
+ timestamp=4.0,
424
443
  observation=obs4,
425
- action=Action(type="click", x=cx_btn, y=cy_btn, bbox=login_bbox),
426
- thought="Submit the login form.",
444
+ action=Action(
445
+ type=ActionType.CLICK,
446
+ normalized_coordinates=(cx_btn, cy_btn),
447
+ raw={"bbox": login_bbox},
448
+ ),
449
+ reasoning="Submit the login form.",
427
450
  )
428
451
  )
429
452
 
430
- # Step 5: logged-in screen DONE
453
+ # Step 5: logged-in screen -> DONE
431
454
  img5 = _draw_logged_in_screen(username=username)
432
455
  img5_path = root / f"{episode_id}_step_5.png"
433
456
  _save_image(img5, img5_path)
434
- obs5 = Observation(image_path=str(img5_path))
457
+ obs5 = Observation(screenshot_path=str(img5_path))
435
458
  steps.append(
436
459
  Step(
437
- t=5.0,
460
+ step_index=5,
461
+ timestamp=5.0,
438
462
  observation=obs5,
439
- action=Action(type="done"),
440
- thought="Login successful; workflow complete.",
463
+ action=Action(type=ActionType.DONE),
464
+ reasoning="Login successful; workflow complete.",
441
465
  )
442
466
  )
443
467
 
444
468
  episode = Episode(
445
- id=episode_id,
446
- goal=f"Log in with username '{username}' and password '{password}'",
469
+ episode_id=episode_id,
470
+ instruction=f"Log in with username '{username}' and password '{password}'",
447
471
  steps=steps,
448
- summary="Successful login via username and password.",
449
472
  success=True,
450
- workflow_id="login_basic",
473
+ metadata={
474
+ "summary": "Successful login via username and password.",
475
+ "workflow_id": "login_basic",
476
+ },
451
477
  )
452
478
 
453
479
  return episode
@@ -467,12 +493,12 @@ def _script_login_episode_som(
467
493
  for click actions.
468
494
 
469
495
  Steps (6 total):
470
- - Step 0: SoM login screen click element [1] (username field)
471
- - Step 1: username field focused type username
472
- - Step 2: username typed click element [2] (password field)
473
- - Step 3: password field focused type password
474
- - Step 4: password typed click element [3] (login button)
475
- - Step 5: logged-in screen DONE
496
+ - Step 0: SoM login screen -> click element [1] (username field)
497
+ - Step 1: username field focused -> type username
498
+ - Step 2: username typed -> click element [2] (password field)
499
+ - Step 3: password field focused -> type password
500
+ - Step 4: password typed -> click element [3] (login button)
501
+ - Step 5: logged-in screen -> DONE
476
502
  """
477
503
 
478
504
  steps: List[Step] = []
@@ -492,81 +518,89 @@ def _script_login_episode_som(
492
518
  (SOM_LOGIN_BUTTON, layout.login_button),
493
519
  ]
494
520
 
495
- # Step 0: SoM login screen click username field [1]
521
+ # Step 0: SoM login screen -> click username field [1]
496
522
  cx, cy = _center(layout.username_box)
497
523
  img0, _ = _draw_login_screen(layout=layout, jitter=False)
498
524
  img0_som = _overlay_som_marks(img0, som_elements)
499
525
  img0_path = root / f"{episode_id}_step_0.png"
500
526
  _save_image(img0_som, img0_path)
501
- obs0 = Observation(image_path=str(img0_path))
527
+ obs0 = Observation(screenshot_path=str(img0_path))
502
528
  steps.append(
503
529
  Step(
504
- t=0.0,
530
+ step_index=0,
531
+ timestamp=0.0,
505
532
  observation=obs0,
506
533
  action=Action(
507
- type="click",
508
- x=cx,
509
- y=cy,
510
- bbox=username_bbox,
511
- element_index=SOM_USERNAME_FIELD,
534
+ type=ActionType.CLICK,
535
+ normalized_coordinates=(cx, cy),
536
+ raw={"bbox": username_bbox, "element_index": SOM_USERNAME_FIELD},
512
537
  ),
513
- thought="Focus the username field by clicking element [1].",
538
+ reasoning="Focus the username field by clicking element [1].",
514
539
  )
515
540
  )
516
541
 
517
- # Step 1: username field focused type username into element [1]
542
+ # Step 1: username field focused -> type username into element [1]
518
543
  img1, _ = _draw_login_screen(username="", layout=layout, jitter=False)
519
544
  img1_som = _overlay_som_marks(img1, som_elements)
520
545
  img1_path = root / f"{episode_id}_step_1.png"
521
546
  _save_image(img1_som, img1_path)
522
- obs1 = Observation(image_path=str(img1_path))
547
+ obs1 = Observation(screenshot_path=str(img1_path))
523
548
  steps.append(
524
549
  Step(
525
- t=1.0,
550
+ step_index=1,
551
+ timestamp=1.0,
526
552
  observation=obs1,
527
- action=Action(type="type", text=username, element_index=SOM_USERNAME_FIELD),
528
- thought="Type the username into element [1].",
553
+ action=Action(
554
+ type=ActionType.TYPE,
555
+ text=username,
556
+ raw={"element_index": SOM_USERNAME_FIELD},
557
+ ),
558
+ reasoning="Type the username into element [1].",
529
559
  )
530
560
  )
531
561
 
532
- # Step 2: username typed click password field [2]
562
+ # Step 2: username typed -> click password field [2]
533
563
  cx_pw, cy_pw = _center(layout.password_box)
534
564
  img2, _ = _draw_login_screen(username=username, layout=layout, jitter=False)
535
565
  img2_som = _overlay_som_marks(img2, som_elements)
536
566
  img2_path = root / f"{episode_id}_step_2.png"
537
567
  _save_image(img2_som, img2_path)
538
- obs2 = Observation(image_path=str(img2_path))
568
+ obs2 = Observation(screenshot_path=str(img2_path))
539
569
  steps.append(
540
570
  Step(
541
- t=2.0,
571
+ step_index=2,
572
+ timestamp=2.0,
542
573
  observation=obs2,
543
574
  action=Action(
544
- type="click",
545
- x=cx_pw,
546
- y=cy_pw,
547
- bbox=password_bbox,
548
- element_index=SOM_PASSWORD_FIELD,
575
+ type=ActionType.CLICK,
576
+ normalized_coordinates=(cx_pw, cy_pw),
577
+ raw={"bbox": password_bbox, "element_index": SOM_PASSWORD_FIELD},
549
578
  ),
550
- thought="Focus the password field by clicking element [2].",
579
+ reasoning="Focus the password field by clicking element [2].",
551
580
  )
552
581
  )
553
582
 
554
- # Step 3: password field focused type password into element [2]
583
+ # Step 3: password field focused -> type password into element [2]
555
584
  img3, _ = _draw_login_screen(username=username, layout=layout, jitter=False)
556
585
  img3_som = _overlay_som_marks(img3, som_elements)
557
586
  img3_path = root / f"{episode_id}_step_3.png"
558
587
  _save_image(img3_som, img3_path)
559
- obs3 = Observation(image_path=str(img3_path))
588
+ obs3 = Observation(screenshot_path=str(img3_path))
560
589
  steps.append(
561
590
  Step(
562
- t=3.0,
591
+ step_index=3,
592
+ timestamp=3.0,
563
593
  observation=obs3,
564
- action=Action(type="type", text=password, element_index=SOM_PASSWORD_FIELD),
565
- thought="Type the password into element [2].",
594
+ action=Action(
595
+ type=ActionType.TYPE,
596
+ text=password,
597
+ raw={"element_index": SOM_PASSWORD_FIELD},
598
+ ),
599
+ reasoning="Type the password into element [2].",
566
600
  )
567
601
  )
568
602
 
569
- # Step 4: password typed click login button [3]
603
+ # Step 4: password typed -> click login button [3]
570
604
  cx_btn, cy_btn = _center(layout.login_button)
571
605
  img4, _ = _draw_login_screen(
572
606
  username=username, password=password, layout=layout, jitter=False
@@ -574,43 +608,45 @@ def _script_login_episode_som(
574
608
  img4_som = _overlay_som_marks(img4, som_elements)
575
609
  img4_path = root / f"{episode_id}_step_4.png"
576
610
  _save_image(img4_som, img4_path)
577
- obs4 = Observation(image_path=str(img4_path))
611
+ obs4 = Observation(screenshot_path=str(img4_path))
578
612
  steps.append(
579
613
  Step(
580
- t=4.0,
614
+ step_index=4,
615
+ timestamp=4.0,
581
616
  observation=obs4,
582
617
  action=Action(
583
- type="click",
584
- x=cx_btn,
585
- y=cy_btn,
586
- bbox=login_bbox,
587
- element_index=SOM_LOGIN_BUTTON,
618
+ type=ActionType.CLICK,
619
+ normalized_coordinates=(cx_btn, cy_btn),
620
+ raw={"bbox": login_bbox, "element_index": SOM_LOGIN_BUTTON},
588
621
  ),
589
- thought="Submit the login form by clicking element [3].",
622
+ reasoning="Submit the login form by clicking element [3].",
590
623
  )
591
624
  )
592
625
 
593
- # Step 5: logged-in screen DONE (no SoM needed)
626
+ # Step 5: logged-in screen -> DONE (no SoM needed)
594
627
  img5 = _draw_logged_in_screen(username=username)
595
628
  img5_path = root / f"{episode_id}_step_5.png"
596
629
  _save_image(img5, img5_path)
597
- obs5 = Observation(image_path=str(img5_path))
630
+ obs5 = Observation(screenshot_path=str(img5_path))
598
631
  steps.append(
599
632
  Step(
600
- t=5.0,
633
+ step_index=5,
634
+ timestamp=5.0,
601
635
  observation=obs5,
602
- action=Action(type="done"),
603
- thought="Login successful; workflow complete.",
636
+ action=Action(type=ActionType.DONE),
637
+ reasoning="Login successful; workflow complete.",
604
638
  )
605
639
  )
606
640
 
607
641
  episode = Episode(
608
- id=episode_id,
609
- goal=f"Log in with username '{username}' and password '{password}'",
642
+ episode_id=episode_id,
643
+ instruction=f"Log in with username '{username}' and password '{password}'",
610
644
  steps=steps,
611
- summary="Successful login via username and password (SoM mode).",
612
645
  success=True,
613
- workflow_id="login_basic_som",
646
+ metadata={
647
+ "summary": "Successful login via username and password (SoM mode).",
648
+ "workflow_id": "login_basic_som",
649
+ },
614
650
  )
615
651
 
616
652
  return episode
@@ -640,7 +676,9 @@ SOM_CONFIRM_PASSWORD_FIELD = 5
640
676
  SOM_REGISTER_BUTTON = 6
641
677
 
642
678
 
643
- def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> RegistrationUIElements:
679
+ def _compute_registration_layout(
680
+ max_offset: int = 8, jitter: bool = True
681
+ ) -> RegistrationUIElements:
644
682
  """Compute registration form layout with optional jitter."""
645
683
 
646
684
  label_x = 180
@@ -653,7 +691,9 @@ def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> Re
653
691
  return x, y
654
692
  dx = random.randint(-max_offset, max_offset)
655
693
  dy = random.randint(-max_offset, max_offset)
656
- return max(20, min(IMG_WIDTH - box_w - 20, x + dx)), max(20, min(IMG_HEIGHT - 60, y + dy))
694
+ return max(20, min(IMG_WIDTH - box_w - 20, x + dx)), max(
695
+ 20, min(IMG_HEIGHT - 60, y + dy)
696
+ )
657
697
 
658
698
  # First name
659
699
  fn_x, fn_y = _maybe_jitter(label_x, start_y + 24)
@@ -677,7 +717,9 @@ def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> Re
677
717
 
678
718
  # Register button
679
719
  btn_w, btn_h = 160, 45
680
- btn_x, btn_y = _maybe_jitter((IMG_WIDTH - btn_w) // 2, start_y + 5 * field_spacing + 40)
720
+ btn_x, btn_y = _maybe_jitter(
721
+ (IMG_WIDTH - btn_w) // 2, start_y + 5 * field_spacing + 40
722
+ )
681
723
  register_button = (btn_x, btn_y, btn_w, btn_h)
682
724
 
683
725
  return RegistrationUIElements(
@@ -713,7 +755,7 @@ def _draw_registration_screen(
713
755
  layout = _compute_registration_layout(jitter=jitter)
714
756
 
715
757
  label_x = 180
716
- box_w, box_h = 400, 36
758
+ _box_w, _box_h = 400, 36
717
759
  start_y = 100
718
760
  field_spacing = 70
719
761
 
@@ -736,19 +778,37 @@ def _draw_registration_screen(
736
778
 
737
779
  # Register button
738
780
  btn_x, btn_y, btn_w, btn_h = layout.register_button
739
- draw.rectangle([(btn_x, btn_y), (btn_x + btn_w, btn_y + btn_h)], outline="black", fill="darkblue")
781
+ draw.rectangle(
782
+ [(btn_x, btn_y), (btn_x + btn_w, btn_y + btn_h)],
783
+ outline="black",
784
+ fill="darkblue",
785
+ )
740
786
  btn_text = "Register"
741
787
  btw, bth = _text_size(draw, btn_text, FONT)
742
- draw.text((btn_x + (btn_w - btw) // 2, btn_y + (btn_h - bth) // 2), btn_text, fill="white", font=FONT)
788
+ draw.text(
789
+ (btn_x + (btn_w - btw) // 2, btn_y + (btn_h - bth) // 2),
790
+ btn_text,
791
+ fill="white",
792
+ font=FONT,
793
+ )
743
794
 
744
795
  # Decoy "Clear Form" button
745
796
  decoy_w, decoy_h = 100, 35
746
797
  decoy_x = IMG_WIDTH - decoy_w - 30
747
798
  decoy_y = btn_y + 5
748
- draw.rectangle([(decoy_x, decoy_y), (decoy_x + decoy_w, decoy_y + decoy_h)], outline="gray", fill=(200, 200, 200))
799
+ draw.rectangle(
800
+ [(decoy_x, decoy_y), (decoy_x + decoy_w, decoy_y + decoy_h)],
801
+ outline="gray",
802
+ fill=(200, 200, 200),
803
+ )
749
804
  decoy_text = "Clear"
750
805
  dtw, dth = _text_size(draw, decoy_text, FONT)
751
- draw.text((decoy_x + (decoy_w - dtw) // 2, decoy_y + (decoy_h - dth) // 2), decoy_text, fill="gray", font=FONT)
806
+ draw.text(
807
+ (decoy_x + (decoy_w - dtw) // 2, decoy_y + (decoy_h - dth) // 2),
808
+ decoy_text,
809
+ fill="gray",
810
+ font=FONT,
811
+ )
752
812
 
753
813
  return img, layout
754
814
 
@@ -759,10 +819,17 @@ def _draw_registration_success_screen(first_name: str, email: str) -> Image.Imag
759
819
  draw = ImageDraw.Draw(img)
760
820
  text = f"Welcome, {first_name}!"
761
821
  tw, th = _text_size(draw, text, FONT_TITLE)
762
- draw.text(((IMG_WIDTH - tw) // 2, IMG_HEIGHT // 2 - 40), text, fill="darkgreen", font=FONT_TITLE)
822
+ draw.text(
823
+ ((IMG_WIDTH - tw) // 2, IMG_HEIGHT // 2 - 40),
824
+ text,
825
+ fill="darkgreen",
826
+ font=FONT_TITLE,
827
+ )
763
828
  subtext = f"Confirmation sent to {email}"
764
829
  stw, sth = _text_size(draw, subtext, FONT)
765
- draw.text(((IMG_WIDTH - stw) // 2, IMG_HEIGHT // 2 + 20), subtext, fill="gray", font=FONT)
830
+ draw.text(
831
+ ((IMG_WIDTH - stw) // 2, IMG_HEIGHT // 2 + 20), subtext, fill="gray", font=FONT
832
+ )
766
833
  return img
767
834
 
768
835
 
@@ -800,10 +867,21 @@ def _script_registration_episode(
800
867
  ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
801
868
  ("email", layout.email_box, email, SOM_EMAIL_FIELD),
802
869
  ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
803
- ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
870
+ (
871
+ "confirm_password",
872
+ layout.confirm_password_box,
873
+ password,
874
+ SOM_CONFIRM_PASSWORD_FIELD,
875
+ ),
804
876
  ]
805
877
 
806
- current_values = {"first_name": "", "last_name": "", "email": "", "password": "", "confirm_password": ""}
878
+ current_values = {
879
+ "first_name": "",
880
+ "last_name": "",
881
+ "email": "",
882
+ "password": "",
883
+ "confirm_password": "",
884
+ }
807
885
  step_idx = 0
808
886
 
809
887
  for field_name, box, value, elem_idx in field_sequence:
@@ -821,12 +899,19 @@ def _script_registration_episode(
821
899
  )
822
900
  img_path = root / f"{episode_id}_step_{step_idx}.png"
823
901
  _save_image(img, img_path)
824
- steps.append(Step(
825
- t=float(step_idx),
826
- observation=Observation(image_path=str(img_path)),
827
- action=Action(type="click", x=cx, y=cy, bbox=bbox, element_index=elem_idx),
828
- thought=f"Focus the {field_name.replace('_', ' ')} field.",
829
- ))
902
+ steps.append(
903
+ Step(
904
+ step_index=step_idx,
905
+ timestamp=float(step_idx),
906
+ observation=Observation(screenshot_path=str(img_path)),
907
+ action=Action(
908
+ type=ActionType.CLICK,
909
+ normalized_coordinates=(cx, cy),
910
+ raw={"bbox": bbox, "element_index": elem_idx},
911
+ ),
912
+ reasoning=f"Focus the {field_name.replace('_', ' ')} field.",
913
+ )
914
+ )
830
915
  step_idx += 1
831
916
 
832
917
  # Type step
@@ -841,12 +926,19 @@ def _script_registration_episode(
841
926
  )
842
927
  img2_path = root / f"{episode_id}_step_{step_idx}.png"
843
928
  _save_image(img2, img2_path)
844
- steps.append(Step(
845
- t=float(step_idx),
846
- observation=Observation(image_path=str(img2_path)),
847
- action=Action(type="type", text=value, element_index=elem_idx),
848
- thought=f"Type the {field_name.replace('_', ' ')}.",
849
- ))
929
+ steps.append(
930
+ Step(
931
+ step_index=step_idx,
932
+ timestamp=float(step_idx),
933
+ observation=Observation(screenshot_path=str(img2_path)),
934
+ action=Action(
935
+ type=ActionType.TYPE,
936
+ text=value,
937
+ raw={"element_index": elem_idx},
938
+ ),
939
+ reasoning=f"Type the {field_name.replace('_', ' ')}.",
940
+ )
941
+ )
850
942
  current_values[field_name] = value
851
943
  step_idx += 1
852
944
 
@@ -864,32 +956,44 @@ def _script_registration_episode(
864
956
  )
865
957
  img_path = root / f"{episode_id}_step_{step_idx}.png"
866
958
  _save_image(img, img_path)
867
- steps.append(Step(
868
- t=float(step_idx),
869
- observation=Observation(image_path=str(img_path)),
870
- action=Action(type="click", x=cx, y=cy, bbox=bbox, element_index=SOM_REGISTER_BUTTON),
871
- thought="Submit the registration form.",
872
- ))
959
+ steps.append(
960
+ Step(
961
+ step_index=step_idx,
962
+ timestamp=float(step_idx),
963
+ observation=Observation(screenshot_path=str(img_path)),
964
+ action=Action(
965
+ type=ActionType.CLICK,
966
+ normalized_coordinates=(cx, cy),
967
+ raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
968
+ ),
969
+ reasoning="Submit the registration form.",
970
+ )
971
+ )
873
972
  step_idx += 1
874
973
 
875
974
  # Done step
876
975
  img_done = _draw_registration_success_screen(first_name, email)
877
976
  img_done_path = root / f"{episode_id}_step_{step_idx}.png"
878
977
  _save_image(img_done, img_done_path)
879
- steps.append(Step(
880
- t=float(step_idx),
881
- observation=Observation(image_path=str(img_done_path)),
882
- action=Action(type="done"),
883
- thought="Registration successful; workflow complete.",
884
- ))
978
+ steps.append(
979
+ Step(
980
+ step_index=step_idx,
981
+ timestamp=float(step_idx),
982
+ observation=Observation(screenshot_path=str(img_done_path)),
983
+ action=Action(type=ActionType.DONE),
984
+ reasoning="Registration successful; workflow complete.",
985
+ )
986
+ )
885
987
 
886
988
  return Episode(
887
- id=episode_id,
888
- goal=f"Register with first name '{first_name}', last name '{last_name}', email '{email}', and password",
989
+ episode_id=episode_id,
990
+ instruction=f"Register with first name '{first_name}', last name '{last_name}', email '{email}', and password",
889
991
  steps=steps,
890
- summary="Successful registration.",
891
992
  success=True,
892
- workflow_id="registration",
993
+ metadata={
994
+ "summary": "Successful registration.",
995
+ "workflow_id": "registration",
996
+ },
893
997
  )
894
998
 
895
999
 
@@ -920,10 +1024,21 @@ def _script_registration_episode_som(
920
1024
  ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
921
1025
  ("email", layout.email_box, email, SOM_EMAIL_FIELD),
922
1026
  ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
923
- ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
1027
+ (
1028
+ "confirm_password",
1029
+ layout.confirm_password_box,
1030
+ password,
1031
+ SOM_CONFIRM_PASSWORD_FIELD,
1032
+ ),
924
1033
  ]
925
1034
 
926
- current_values = {"first_name": "", "last_name": "", "email": "", "password": "", "confirm_password": ""}
1035
+ current_values = {
1036
+ "first_name": "",
1037
+ "last_name": "",
1038
+ "email": "",
1039
+ "password": "",
1040
+ "confirm_password": "",
1041
+ }
927
1042
  step_idx = 0
928
1043
 
929
1044
  for field_name, box, value, elem_idx in field_sequence:
@@ -942,12 +1057,19 @@ def _script_registration_episode_som(
942
1057
  img_som = _overlay_som_marks(img, som_elements)
943
1058
  img_path = root / f"{episode_id}_step_{step_idx}.png"
944
1059
  _save_image(img_som, img_path)
945
- steps.append(Step(
946
- t=float(step_idx),
947
- observation=Observation(image_path=str(img_path)),
948
- action=Action(type="click", x=cx, y=cy, bbox=bbox, element_index=elem_idx),
949
- thought=f"Focus element [{elem_idx}] ({field_name.replace('_', ' ')} field).",
950
- ))
1060
+ steps.append(
1061
+ Step(
1062
+ step_index=step_idx,
1063
+ timestamp=float(step_idx),
1064
+ observation=Observation(screenshot_path=str(img_path)),
1065
+ action=Action(
1066
+ type=ActionType.CLICK,
1067
+ normalized_coordinates=(cx, cy),
1068
+ raw={"bbox": bbox, "element_index": elem_idx},
1069
+ ),
1070
+ reasoning=f"Focus element [{elem_idx}] ({field_name.replace('_', ' ')} field).",
1071
+ )
1072
+ )
951
1073
  step_idx += 1
952
1074
 
953
1075
  # Type step
@@ -963,12 +1085,19 @@ def _script_registration_episode_som(
963
1085
  img2_som = _overlay_som_marks(img2, som_elements)
964
1086
  img2_path = root / f"{episode_id}_step_{step_idx}.png"
965
1087
  _save_image(img2_som, img2_path)
966
- steps.append(Step(
967
- t=float(step_idx),
968
- observation=Observation(image_path=str(img2_path)),
969
- action=Action(type="type", text=value, element_index=elem_idx),
970
- thought=f"Type into element [{elem_idx}].",
971
- ))
1088
+ steps.append(
1089
+ Step(
1090
+ step_index=step_idx,
1091
+ timestamp=float(step_idx),
1092
+ observation=Observation(screenshot_path=str(img2_path)),
1093
+ action=Action(
1094
+ type=ActionType.TYPE,
1095
+ text=value,
1096
+ raw={"element_index": elem_idx},
1097
+ ),
1098
+ reasoning=f"Type into element [{elem_idx}].",
1099
+ )
1100
+ )
972
1101
  current_values[field_name] = value
973
1102
  step_idx += 1
974
1103
 
@@ -987,50 +1116,62 @@ def _script_registration_episode_som(
987
1116
  img_som = _overlay_som_marks(img, som_elements)
988
1117
  img_path = root / f"{episode_id}_step_{step_idx}.png"
989
1118
  _save_image(img_som, img_path)
990
- steps.append(Step(
991
- t=float(step_idx),
992
- observation=Observation(image_path=str(img_path)),
993
- action=Action(type="click", x=cx, y=cy, bbox=bbox, element_index=SOM_REGISTER_BUTTON),
994
- thought=f"Click element [{SOM_REGISTER_BUTTON}] to submit registration.",
995
- ))
1119
+ steps.append(
1120
+ Step(
1121
+ step_index=step_idx,
1122
+ timestamp=float(step_idx),
1123
+ observation=Observation(screenshot_path=str(img_path)),
1124
+ action=Action(
1125
+ type=ActionType.CLICK,
1126
+ normalized_coordinates=(cx, cy),
1127
+ raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
1128
+ ),
1129
+ reasoning=f"Click element [{SOM_REGISTER_BUTTON}] to submit registration.",
1130
+ )
1131
+ )
996
1132
  step_idx += 1
997
1133
 
998
1134
  # Done step
999
1135
  img_done = _draw_registration_success_screen(first_name, email)
1000
1136
  img_done_path = root / f"{episode_id}_step_{step_idx}.png"
1001
1137
  _save_image(img_done, img_done_path)
1002
- steps.append(Step(
1003
- t=float(step_idx),
1004
- observation=Observation(image_path=str(img_done_path)),
1005
- action=Action(type="done"),
1006
- thought="Registration successful; workflow complete.",
1007
- ))
1138
+ steps.append(
1139
+ Step(
1140
+ step_index=step_idx,
1141
+ timestamp=float(step_idx),
1142
+ observation=Observation(screenshot_path=str(img_done_path)),
1143
+ action=Action(type=ActionType.DONE),
1144
+ reasoning="Registration successful; workflow complete.",
1145
+ )
1146
+ )
1008
1147
 
1009
1148
  return Episode(
1010
- id=episode_id,
1011
- goal=f"Register with first name '{first_name}', last name '{last_name}', email '{email}', and password",
1149
+ episode_id=episode_id,
1150
+ instruction=f"Register with first name '{first_name}', last name '{last_name}', email '{email}', and password",
1012
1151
  steps=steps,
1013
- summary="Successful registration (SoM mode).",
1014
1152
  success=True,
1015
- workflow_id="registration_som",
1153
+ metadata={
1154
+ "summary": "Successful registration (SoM mode).",
1155
+ "workflow_id": "registration_som",
1156
+ },
1016
1157
  )
1017
1158
 
1018
1159
 
1019
- def generate_synthetic_sessions(
1020
- num_sessions: int = 10,
1160
+ def generate_synthetic_episodes(
1161
+ num_episodes: int = 10,
1021
1162
  seed: int | None = None,
1022
1163
  output_dir: str | os.PathLike[str] | None = None,
1023
1164
  jitter: bool = True,
1024
1165
  use_som: bool = False,
1025
1166
  scenario: str = "login",
1026
- ) -> List[Session]:
1027
- """Generate a list of synthetic Sessions with semantic UI episodes.
1167
+ ) -> List[Episode]:
1168
+ """Generate a list of synthetic Episodes with semantic UI episodes.
1028
1169
 
1029
- Each Session contains a single Episode. Images for all steps are written
1030
- to `output_dir`.
1170
+ Each Episode contains steps for a complete UI workflow. Images for all
1171
+ steps are written to `output_dir`.
1031
1172
 
1032
1173
  Args:
1033
- num_sessions: Number of sessions to generate.
1174
+ num_episodes: Number of episodes to generate.
1034
1175
  seed: Random seed for reproducibility.
1035
1176
  output_dir: Directory to write images to.
1036
1177
  jitter: Whether to apply slight position jitter to UI elements.
@@ -1040,6 +1181,9 @@ def generate_synthetic_sessions(
1040
1181
  scenario: Type of UI scenario to generate. Options:
1041
1182
  - "login": Simple login form (6 steps, 3 elements)
1042
1183
  - "registration": Registration form (12 steps, 6 elements)
1184
+
1185
+ Returns:
1186
+ List of Episode objects.
1043
1187
  """
1044
1188
 
1045
1189
  if seed is not None:
@@ -1051,28 +1195,28 @@ def generate_synthetic_sessions(
1051
1195
  else:
1052
1196
  output_root = Path(output_dir)
1053
1197
 
1054
- sessions: List[Session] = []
1198
+ episodes: List[Episode] = []
1055
1199
 
1056
- for i in range(num_sessions):
1057
- session_id = f"session_{i:04d}"
1058
- session_dir = output_root / session_id
1200
+ for i in range(num_episodes):
1201
+ episode_id = f"episode_{i:04d}"
1202
+ episode_dir = output_root / episode_id
1059
1203
 
1060
1204
  if scenario == "login":
1061
- episode_id = f"{session_id}_login"
1205
+ episode_id_full = f"{episode_id}_login"
1062
1206
  username = f"user{i}"
1063
1207
  password = f"pass{i}123"
1064
1208
 
1065
1209
  if use_som:
1066
1210
  episode = _script_login_episode_som(
1067
- session_dir, episode_id, username, password, jitter=jitter
1211
+ episode_dir, episode_id_full, username, password, jitter=jitter
1068
1212
  )
1069
1213
  else:
1070
1214
  episode = _script_login_episode(
1071
- session_dir, episode_id, username, password, jitter=jitter
1215
+ episode_dir, episode_id_full, username, password, jitter=jitter
1072
1216
  )
1073
1217
 
1074
1218
  elif scenario == "registration":
1075
- episode_id = f"{session_id}_registration"
1219
+ episode_id_full = f"{episode_id}_registration"
1076
1220
  first_name = f"John{i}"
1077
1221
  last_name = f"Doe{i}"
1078
1222
  email = f"john{i}@example.com"
@@ -1080,23 +1224,30 @@ def generate_synthetic_sessions(
1080
1224
 
1081
1225
  if use_som:
1082
1226
  episode = _script_registration_episode_som(
1083
- session_dir, episode_id, first_name, last_name, email, password, jitter=jitter
1227
+ episode_dir,
1228
+ episode_id_full,
1229
+ first_name,
1230
+ last_name,
1231
+ email,
1232
+ password,
1233
+ jitter=jitter,
1084
1234
  )
1085
1235
  else:
1086
1236
  episode = _script_registration_episode(
1087
- session_dir, episode_id, first_name, last_name, email, password, jitter=jitter
1237
+ episode_dir,
1238
+ episode_id_full,
1239
+ first_name,
1240
+ last_name,
1241
+ email,
1242
+ password,
1243
+ jitter=jitter,
1088
1244
  )
1089
1245
 
1090
1246
  else:
1091
- raise ValueError(f"Unknown scenario: {scenario}. Options: login, registration")
1092
-
1093
- session = Session(
1094
- id=session_id,
1095
- episodes=[episode],
1096
- meta={"scenario": scenario, "use_som": use_som},
1097
- )
1098
- sessions.append(session)
1099
-
1100
- return sessions
1247
+ raise ValueError(
1248
+ f"Unknown scenario: {scenario}. Options: login, registration"
1249
+ )
1101
1250
 
1251
+ episodes.append(episode)
1102
1252
 
1253
+ return episodes