openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/ingest/synthetic.py
@@ -32,7 +32,9 @@ def _normalize(x_px: int, y_px: int) -> Tuple[float, float]:
     return x_px / IMG_WIDTH, y_px / IMG_HEIGHT


-def _text_size(draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont) -> Tuple[int, int]:
+def _text_size(
+    draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont
+) -> Tuple[int, int]:
     """Compute text width/height using textbbox for Pillow compatibility."""

     left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
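Note: the `_text_size` helper in this hunk is only re-wrapped, but its behaviour is worth spelling out: it measures text with Pillow's `ImageDraw.textbbox`, the compatible replacement for the removed `ImageDraw.textsize`. A minimal standalone sketch (not part of the package):

# Minimal sketch: width/height from textbbox, as _text_size computes them.
from PIL import Image, ImageDraw, ImageFont

img = Image.new("RGB", (200, 60), "white")
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()

# textbbox returns (left, top, right, bottom) for text anchored at (0, 0).
left, top, right, bottom = draw.textbbox((0, 0), "Register", font=font)
width, height = right - left, bottom - top
print(width, height)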
@@ -313,7 +315,9 @@ def _center(bounds: Tuple[int, int, int, int]) -> Tuple[float, float]:
     return _normalize(cx, cy)


-def _bbox_normalized(bounds: Tuple[int, int, int, int]) -> Tuple[float, float, float, float]:
+def _bbox_normalized(
+    bounds: Tuple[int, int, int, int],
+) -> Tuple[float, float, float, float]:
     """Convert pixel bounds (x, y, w, h) to normalized bbox (x_min, y_min, x_max, y_max)."""
     x, y, w, h = bounds
     x_min = x / IMG_WIDTH
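The wrapped `_bbox_normalized` converts pixel bounds (x, y, w, h) into a normalized (x_min, y_min, x_max, y_max) box by dividing by the canvas size. A self-contained sketch of the same arithmetic (the canvas constants below are assumptions for illustration, not the module's actual IMG_WIDTH/IMG_HEIGHT values):

# Sketch of the pixel-to-normalized conversion; constants are illustrative.
from typing import Tuple

IMG_WIDTH, IMG_HEIGHT = 1280, 800  # assumed canvas size

def bbox_normalized(bounds: Tuple[int, int, int, int]) -> Tuple[float, float, float, float]:
    x, y, w, h = bounds
    return (x / IMG_WIDTH, y / IMG_HEIGHT, (x + w) / IMG_WIDTH, (y + h) / IMG_HEIGHT)

print(bbox_normalized((180, 100, 400, 36)))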
@@ -426,7 +430,9 @@ def _script_login_episode(

     # Step 4: password typed -> click login button
     cx_btn, cy_btn = _center(layout.login_button)
-    img4, _ = _draw_login_screen(username=username, password=password, layout=layout, jitter=False)
+    img4, _ = _draw_login_screen(
+        username=username, password=password, layout=layout, jitter=False
+    )
     img4_path = root / f"{episode_id}_step_4.png"
     _save_image(img4, img4_path)
     obs4 = Observation(screenshot_path=str(img4_path))
@@ -670,7 +676,9 @@ SOM_CONFIRM_PASSWORD_FIELD = 5
 SOM_REGISTER_BUTTON = 6


-def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> RegistrationUIElements:
+def _compute_registration_layout(
+    max_offset: int = 8, jitter: bool = True
+) -> RegistrationUIElements:
     """Compute registration form layout with optional jitter."""

     label_x = 180
@@ -683,7 +691,9 @@ def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> Re
             return x, y
         dx = random.randint(-max_offset, max_offset)
         dy = random.randint(-max_offset, max_offset)
-        return max(20, min(IMG_WIDTH - box_w - 20, x + dx)), max(20, min(IMG_HEIGHT - 60, y + dy))
+        return max(20, min(IMG_WIDTH - box_w - 20, x + dx)), max(
+            20, min(IMG_HEIGHT - 60, y + dy)
+        )

     # First name
     fn_x, fn_y = _maybe_jitter(label_x, start_y + 24)
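The `_maybe_jitter` call being re-wrapped in this hunk offsets a layout anchor by a small random amount and clamps the result so the box stays on screen. A standalone sketch of that idea (canvas size, margins, and the helper name are assumptions for illustration, not the package's exact code):

# Illustrative jitter-and-clamp helper; constants are assumed, not the package's.
import random
from typing import Tuple

IMG_WIDTH, IMG_HEIGHT = 1280, 800
MAX_OFFSET = 8

def maybe_jitter(x: int, y: int, box_w: int = 400, jitter: bool = True) -> Tuple[int, int]:
    if not jitter:
        return x, y
    dx = random.randint(-MAX_OFFSET, MAX_OFFSET)
    dy = random.randint(-MAX_OFFSET, MAX_OFFSET)
    # Clamp so the jittered box never leaves the drawable area.
    return (
        max(20, min(IMG_WIDTH - box_w - 20, x + dx)),
        max(20, min(IMG_HEIGHT - 60, y + dy)),
    )

print(maybe_jitter(180, 124))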
@@ -707,7 +717,9 @@ def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> Re

     # Register button
     btn_w, btn_h = 160, 45
-    btn_x, btn_y = _maybe_jitter((IMG_WIDTH - btn_w) // 2, start_y + 5 * field_spacing + 40)
+    btn_x, btn_y = _maybe_jitter(
+        (IMG_WIDTH - btn_w) // 2, start_y + 5 * field_spacing + 40
+    )
     register_button = (btn_x, btn_y, btn_w, btn_h)

     return RegistrationUIElements(
@@ -743,7 +755,7 @@ def _draw_registration_screen(
     layout = _compute_registration_layout(jitter=jitter)

     label_x = 180
-    box_w, box_h = 400, 36
+    _box_w, _box_h = 400, 36
     start_y = 100
     field_spacing = 70

@@ -766,19 +778,37 @@

     # Register button
     btn_x, btn_y, btn_w, btn_h = layout.register_button
-    draw.rectangle([(btn_x, btn_y), (btn_x + btn_w, btn_y + btn_h)], outline="black", fill="darkblue")
+    draw.rectangle(
+        [(btn_x, btn_y), (btn_x + btn_w, btn_y + btn_h)],
+        outline="black",
+        fill="darkblue",
+    )
     btn_text = "Register"
     btw, bth = _text_size(draw, btn_text, FONT)
-    draw.text((btn_x + (btn_w - btw) // 2, btn_y + (btn_h - bth) // 2), btn_text, fill="white", font=FONT)
+    draw.text(
+        (btn_x + (btn_w - btw) // 2, btn_y + (btn_h - bth) // 2),
+        btn_text,
+        fill="white",
+        font=FONT,
+    )

     # Decoy "Clear Form" button
     decoy_w, decoy_h = 100, 35
     decoy_x = IMG_WIDTH - decoy_w - 30
     decoy_y = btn_y + 5
-    draw.rectangle([(decoy_x, decoy_y), (decoy_x + decoy_w, decoy_y + decoy_h)], outline="gray", fill=(200, 200, 200))
+    draw.rectangle(
+        [(decoy_x, decoy_y), (decoy_x + decoy_w, decoy_y + decoy_h)],
+        outline="gray",
+        fill=(200, 200, 200),
+    )
     decoy_text = "Clear"
     dtw, dth = _text_size(draw, decoy_text, FONT)
-    draw.text((decoy_x + (decoy_w - dtw) // 2, decoy_y + (decoy_h - dth) // 2), decoy_text, fill="gray", font=FONT)
+    draw.text(
+        (decoy_x + (decoy_w - dtw) // 2, decoy_y + (decoy_h - dth) // 2),
+        decoy_text,
+        fill="gray",
+        font=FONT,
+    )

     return img, layout

@@ -789,10 +819,17 @@ def _draw_registration_success_screen(first_name: str, email: str) -> Image.Imag
     draw = ImageDraw.Draw(img)
     text = f"Welcome, {first_name}!"
     tw, th = _text_size(draw, text, FONT_TITLE)
-    draw.text(((IMG_WIDTH - tw) // 2, IMG_HEIGHT // 2 - 40), text, fill="darkgreen", font=FONT_TITLE)
+    draw.text(
+        ((IMG_WIDTH - tw) // 2, IMG_HEIGHT // 2 - 40),
+        text,
+        fill="darkgreen",
+        font=FONT_TITLE,
+    )
     subtext = f"Confirmation sent to {email}"
     stw, sth = _text_size(draw, subtext, FONT)
-    draw.text(((IMG_WIDTH - stw) // 2, IMG_HEIGHT // 2 + 20), subtext, fill="gray", font=FONT)
+    draw.text(
+        ((IMG_WIDTH - stw) // 2, IMG_HEIGHT // 2 + 20), subtext, fill="gray", font=FONT
+    )
     return img


@@ -830,10 +867,21 @@ def _script_registration_episode(
         ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
         ("email", layout.email_box, email, SOM_EMAIL_FIELD),
         ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
-        ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
+        (
+            "confirm_password",
+            layout.confirm_password_box,
+            password,
+            SOM_CONFIRM_PASSWORD_FIELD,
+        ),
     ]

-    current_values = {"first_name": "", "last_name": "", "email": "", "password": "", "confirm_password": ""}
+    current_values = {
+        "first_name": "",
+        "last_name": "",
+        "email": "",
+        "password": "",
+        "confirm_password": "",
+    }
     step_idx = 0

     for field_name, box, value, elem_idx in field_sequence:
@@ -851,17 +899,19 @@
         )
         img_path = root / f"{episode_id}_step_{step_idx}.png"
         _save_image(img, img_path)
-        steps.append(Step(
-            step_index=step_idx,
-            timestamp=float(step_idx),
-            observation=Observation(screenshot_path=str(img_path)),
-            action=Action(
-                type=ActionType.CLICK,
-                normalized_coordinates=(cx, cy),
-                raw={"bbox": bbox, "element_index": elem_idx},
-            ),
-            reasoning=f"Focus the {field_name.replace('_', ' ')} field.",
-        ))
+        steps.append(
+            Step(
+                step_index=step_idx,
+                timestamp=float(step_idx),
+                observation=Observation(screenshot_path=str(img_path)),
+                action=Action(
+                    type=ActionType.CLICK,
+                    normalized_coordinates=(cx, cy),
+                    raw={"bbox": bbox, "element_index": elem_idx},
+                ),
+                reasoning=f"Focus the {field_name.replace('_', ' ')} field.",
+            )
+        )
         step_idx += 1

         # Type step
@@ -876,17 +926,19 @@
         )
         img2_path = root / f"{episode_id}_step_{step_idx}.png"
         _save_image(img2, img2_path)
-        steps.append(Step(
-            step_index=step_idx,
-            timestamp=float(step_idx),
-            observation=Observation(screenshot_path=str(img2_path)),
-            action=Action(
-                type=ActionType.TYPE,
-                text=value,
-                raw={"element_index": elem_idx},
-            ),
-            reasoning=f"Type the {field_name.replace('_', ' ')}.",
-        ))
+        steps.append(
+            Step(
+                step_index=step_idx,
+                timestamp=float(step_idx),
+                observation=Observation(screenshot_path=str(img2_path)),
+                action=Action(
+                    type=ActionType.TYPE,
+                    text=value,
+                    raw={"element_index": elem_idx},
+                ),
+                reasoning=f"Type the {field_name.replace('_', ' ')}.",
+            )
+        )
         current_values[field_name] = value
         step_idx += 1

@@ -904,30 +956,34 @@
     )
     img_path = root / f"{episode_id}_step_{step_idx}.png"
     _save_image(img, img_path)
-    steps.append(Step(
-        step_index=step_idx,
-        timestamp=float(step_idx),
-        observation=Observation(screenshot_path=str(img_path)),
-        action=Action(
-            type=ActionType.CLICK,
-            normalized_coordinates=(cx, cy),
-            raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
-        ),
-        reasoning="Submit the registration form.",
-    ))
+    steps.append(
+        Step(
+            step_index=step_idx,
+            timestamp=float(step_idx),
+            observation=Observation(screenshot_path=str(img_path)),
+            action=Action(
+                type=ActionType.CLICK,
+                normalized_coordinates=(cx, cy),
+                raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
+            ),
+            reasoning="Submit the registration form.",
+        )
+    )
     step_idx += 1

     # Done step
     img_done = _draw_registration_success_screen(first_name, email)
     img_done_path = root / f"{episode_id}_step_{step_idx}.png"
     _save_image(img_done, img_done_path)
-    steps.append(Step(
-        step_index=step_idx,
-        timestamp=float(step_idx),
-        observation=Observation(screenshot_path=str(img_done_path)),
-        action=Action(type=ActionType.DONE),
-        reasoning="Registration successful; workflow complete.",
-    ))
+    steps.append(
+        Step(
+            step_index=step_idx,
+            timestamp=float(step_idx),
+            observation=Observation(screenshot_path=str(img_done_path)),
+            action=Action(type=ActionType.DONE),
+            reasoning="Registration successful; workflow complete.",
+        )
+    )

     return Episode(
         episode_id=episode_id,
@@ -968,10 +1024,21 @@ def _script_registration_episode_som(
         ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
         ("email", layout.email_box, email, SOM_EMAIL_FIELD),
         ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
-        ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
+        (
+            "confirm_password",
+            layout.confirm_password_box,
+            password,
+            SOM_CONFIRM_PASSWORD_FIELD,
+        ),
     ]

-    current_values = {"first_name": "", "last_name": "", "email": "", "password": "", "confirm_password": ""}
+    current_values = {
+        "first_name": "",
+        "last_name": "",
+        "email": "",
+        "password": "",
+        "confirm_password": "",
+    }
     step_idx = 0

     for field_name, box, value, elem_idx in field_sequence:
@@ -990,17 +1057,19 @@
         img_som = _overlay_som_marks(img, som_elements)
         img_path = root / f"{episode_id}_step_{step_idx}.png"
         _save_image(img_som, img_path)
-        steps.append(Step(
-            step_index=step_idx,
-            timestamp=float(step_idx),
-            observation=Observation(screenshot_path=str(img_path)),
-            action=Action(
-                type=ActionType.CLICK,
-                normalized_coordinates=(cx, cy),
-                raw={"bbox": bbox, "element_index": elem_idx},
-            ),
-            reasoning=f"Focus element [{elem_idx}] ({field_name.replace('_', ' ')} field).",
-        ))
+        steps.append(
+            Step(
+                step_index=step_idx,
+                timestamp=float(step_idx),
+                observation=Observation(screenshot_path=str(img_path)),
+                action=Action(
+                    type=ActionType.CLICK,
+                    normalized_coordinates=(cx, cy),
+                    raw={"bbox": bbox, "element_index": elem_idx},
+                ),
+                reasoning=f"Focus element [{elem_idx}] ({field_name.replace('_', ' ')} field).",
+            )
+        )
         step_idx += 1

         # Type step
@@ -1016,17 +1085,19 @@
         img2_som = _overlay_som_marks(img2, som_elements)
         img2_path = root / f"{episode_id}_step_{step_idx}.png"
         _save_image(img2_som, img2_path)
-        steps.append(Step(
-            step_index=step_idx,
-            timestamp=float(step_idx),
-            observation=Observation(screenshot_path=str(img2_path)),
-            action=Action(
-                type=ActionType.TYPE,
-                text=value,
-                raw={"element_index": elem_idx},
-            ),
-            reasoning=f"Type into element [{elem_idx}].",
-        ))
+        steps.append(
+            Step(
+                step_index=step_idx,
+                timestamp=float(step_idx),
+                observation=Observation(screenshot_path=str(img2_path)),
+                action=Action(
+                    type=ActionType.TYPE,
+                    text=value,
+                    raw={"element_index": elem_idx},
+                ),
+                reasoning=f"Type into element [{elem_idx}].",
+            )
+        )
         current_values[field_name] = value
         step_idx += 1

@@ -1045,30 +1116,34 @@
     img_som = _overlay_som_marks(img, som_elements)
     img_path = root / f"{episode_id}_step_{step_idx}.png"
     _save_image(img_som, img_path)
-    steps.append(Step(
-        step_index=step_idx,
-        timestamp=float(step_idx),
-        observation=Observation(screenshot_path=str(img_path)),
-        action=Action(
-            type=ActionType.CLICK,
-            normalized_coordinates=(cx, cy),
-            raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
-        ),
-        reasoning=f"Click element [{SOM_REGISTER_BUTTON}] to submit registration.",
-    ))
+    steps.append(
+        Step(
+            step_index=step_idx,
+            timestamp=float(step_idx),
+            observation=Observation(screenshot_path=str(img_path)),
+            action=Action(
+                type=ActionType.CLICK,
+                normalized_coordinates=(cx, cy),
+                raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
+            ),
+            reasoning=f"Click element [{SOM_REGISTER_BUTTON}] to submit registration.",
+        )
+    )
     step_idx += 1

     # Done step
     img_done = _draw_registration_success_screen(first_name, email)
     img_done_path = root / f"{episode_id}_step_{step_idx}.png"
     _save_image(img_done, img_done_path)
-    steps.append(Step(
-        step_index=step_idx,
-        timestamp=float(step_idx),
-        observation=Observation(screenshot_path=str(img_done_path)),
-        action=Action(type=ActionType.DONE),
-        reasoning="Registration successful; workflow complete.",
-    ))
+    steps.append(
+        Step(
+            step_index=step_idx,
+            timestamp=float(step_idx),
+            observation=Observation(screenshot_path=str(img_done_path)),
+            action=Action(type=ActionType.DONE),
+            reasoning="Registration successful; workflow complete.",
+        )
+    )

     return Episode(
         episode_id=episode_id,
@@ -1149,15 +1224,29 @@ def generate_synthetic_episodes(

         if use_som:
             episode = _script_registration_episode_som(
-                episode_dir, episode_id_full, first_name, last_name, email, password, jitter=jitter
+                episode_dir,
+                episode_id_full,
+                first_name,
+                last_name,
+                email,
+                password,
+                jitter=jitter,
             )
         else:
             episode = _script_registration_episode(
-                episode_dir, episode_id_full, first_name, last_name, email, password, jitter=jitter
+                episode_dir,
+                episode_id_full,
+                first_name,
+                last_name,
+                email,
+                password,
+                jitter=jitter,
             )

     else:
-        raise ValueError(f"Unknown scenario: {scenario}. Options: login, registration")
+        raise ValueError(
+            f"Unknown scenario: {scenario}. Options: login, registration"
+        )

     episodes.append(episode)

openadapt_ml/models/api_adapter.py
@@ -50,7 +50,9 @@ class ApiVLMAdapter(BaseVLMAdapter):
                 "Install with `uv sync --extra api`."
             ) from exc

-        key = api_key or settings.anthropic_api_key or os.getenv("ANTHROPIC_API_KEY")
+        key = (
+            api_key or settings.anthropic_api_key or os.getenv("ANTHROPIC_API_KEY")
+        )
         if not key:
             raise RuntimeError(
                 "ANTHROPIC_API_KEY is required but not found. "
@@ -87,10 +89,14 @@ class ApiVLMAdapter(BaseVLMAdapter):
         super().__init__(model=model, processor=processor, device=device)

     def prepare_inputs(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:  # type: ignore[override]
-        raise NotImplementedError("ApiVLMAdapter does not support training (prepare_inputs)")
+        raise NotImplementedError(
+            "ApiVLMAdapter does not support training (prepare_inputs)"
+        )

     def compute_loss(self, inputs: Dict[str, Any]) -> torch.Tensor:  # type: ignore[override]
-        raise NotImplementedError("ApiVLMAdapter does not support training (compute_loss)")
+        raise NotImplementedError(
+            "ApiVLMAdapter does not support training (compute_loss)"
+        )

     def generate(self, sample: Dict[str, Any], max_new_tokens: int = 64) -> str:  # type: ignore[override]
         images = sample.get("images", [])
@@ -138,7 +144,11 @@ class ApiVLMAdapter(BaseVLMAdapter):

             # Anthropic messages API returns a list of content blocks.
             parts = getattr(resp, "content", [])
-            texts = [getattr(p, "text", "") for p in parts if getattr(p, "type", "") == "text"]
+            texts = [
+                getattr(p, "text", "")
+                for p in parts
+                if getattr(p, "type", "") == "text"
+            ]
             return "\n".join([t for t in texts if t]).strip()

         if self.provider == "openai":
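The reformatted list comprehension above gathers only the text blocks from an Anthropic messages response before joining them. A standalone sketch of that extraction against a stand-in response object (no real API call; the block shapes mirror what the diff shows):

# Stand-in response object; the real one comes from the Anthropic messages API.
from types import SimpleNamespace

resp = SimpleNamespace(
    content=[
        SimpleNamespace(type="text", text="CLICK(0.42, 0.55)"),
        SimpleNamespace(type="tool_use", text=""),  # non-text blocks are skipped
    ]
)

parts = getattr(resp, "content", [])
texts = [getattr(p, "text", "") for p in parts if getattr(p, "type", "") == "text"]
print("\n".join(t for t in texts if t).strip())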
openadapt_ml/models/base_adapter.py
@@ -14,7 +14,10 @@ def get_default_device() -> torch.device:

     if torch.cuda.is_available():
         return torch.device("cuda")
-    if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():  # type: ignore[attr-defined]
+    if (
+        getattr(torch.backends, "mps", None) is not None
+        and torch.backends.mps.is_available()
+    ):  # type: ignore[attr-defined]
         return torch.device("mps")
     return torch.device("cpu")

@@ -28,7 +31,12 @@ class BaseVLMAdapter(ABC):
     - generating assistant text given a single sample at inference time
     """

-    def __init__(self, model: torch.nn.Module, processor: Any, device: Optional[torch.device] = None) -> None:
+    def __init__(
+        self,
+        model: torch.nn.Module,
+        processor: Any,
+        device: Optional[torch.device] = None,
+    ) -> None:
         self.model = model
         self.processor = processor
         self.device = device or get_default_device()
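The `device` fallback above defers to `get_default_device()`, whose wrapped condition in the previous hunk prefers CUDA, then Apple MPS, then CPU. A minimal sketch of that fallback order (mirrors the diff, not the full module):

# CUDA -> MPS -> CPU selection, as in get_default_device.
import torch

def default_device() -> torch.device:
    if torch.cuda.is_available():
        return torch.device("cuda")
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")

print(default_device())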