openadapt-ml 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -115
- openadapt_ml/benchmarks/agent.py +265 -421
- openadapt_ml/benchmarks/azure.py +28 -19
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1722 -4847
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +22 -5
- openadapt_ml/benchmarks/vm_monitor.py +530 -29
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +2038 -487
- openadapt_ml/cloud/ssh_tunnel.py +68 -26
- openadapt_ml/datasets/next_action.py +40 -30
- openadapt_ml/evals/grounding.py +8 -3
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +41 -26
- openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
- openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/runner.py +29 -14
- openadapt_ml/export/parquet.py +36 -24
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +8 -6
- openadapt_ml/ingest/capture.py +25 -22
- openadapt_ml/ingest/loader.py +7 -4
- openadapt_ml/ingest/synthetic.py +189 -100
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/demo_retriever.py +50 -24
- openadapt_ml/retrieval/embeddings.py +9 -8
- openadapt_ml/retrieval/retriever.py +3 -1
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +18 -5
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +9 -0
- openadapt_ml/schema/converters.py +74 -27
- openadapt_ml/schema/episode.py +31 -18
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +85 -54
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +15 -9
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +3 -1
- openadapt_ml/scripts/train.py +21 -9
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +52 -41
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +143 -86
- openadapt_ml/training/trl_trainer.py +70 -21
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
- openadapt_ml-0.2.2.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/live_tracker.py +0 -180
- openadapt_ml/benchmarks/runner.py +0 -418
- openadapt_ml/benchmarks/waa.py +0 -761
- openadapt_ml/benchmarks/waa_live.py +0 -619
- openadapt_ml-0.2.0.dist-info/RECORD +0 -86
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/ingest/synthetic.py
CHANGED
@@ -32,7 +32,9 @@ def _normalize(x_px: int, y_px: int) -> Tuple[float, float]:
     return x_px / IMG_WIDTH, y_px / IMG_HEIGHT


-def _text_size(draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont) -> Tuple[int, int]:
+def _text_size(
+    draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont
+) -> Tuple[int, int]:
     """Compute text width/height using textbbox for Pillow compatibility."""

     left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
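The helper exists because Pillow 10 removed ImageDraw.textsize, leaving textbbox as the measurement API. A minimal standalone sketch of the same measurement (illustrative, not part of the diff):

    from PIL import Image, ImageDraw, ImageFont

    draw = ImageDraw.Draw(Image.new("RGB", (200, 50)))
    font = ImageFont.load_default()
    # textbbox returns (left, top, right, bottom); width/height are the deltas.
    left, top, right, bottom = draw.textbbox((0, 0), "Register", font=font)
    width, height = right - left, bottom - top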
@@ -313,7 +315,9 @@ def _center(bounds: Tuple[int, int, int, int]) -> Tuple[float, float]:
     return _normalize(cx, cy)


-def _bbox_normalized(bounds: Tuple[int, int, int, int]) -> Tuple[float, float, float, float]:
+def _bbox_normalized(
+    bounds: Tuple[int, int, int, int],
+) -> Tuple[float, float, float, float]:
     """Convert pixel bounds (x, y, w, h) to normalized bbox (x_min, y_min, x_max, y_max)."""
     x, y, w, h = bounds
     x_min = x / IMG_WIDTH
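The conversion is plain division by the canvas size. A self-contained check with assumed canvas constants (the real IMG_WIDTH/IMG_HEIGHT are defined elsewhere in synthetic.py):

    IMG_WIDTH, IMG_HEIGHT = 800, 600  # assumed values for illustration

    def bbox_normalized(bounds):
        # (x, y, w, h) in pixels -> (x_min, y_min, x_max, y_max) in [0, 1]
        x, y, w, h = bounds
        return (x / IMG_WIDTH, y / IMG_HEIGHT, (x + w) / IMG_WIDTH, (y + h) / IMG_HEIGHT)

    assert bbox_normalized((200, 150, 400, 300)) == (0.25, 0.25, 0.75, 0.75)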
@@ -426,7 +430,9 @@ def _script_login_episode(

     # Step 4: password typed -> click login button
     cx_btn, cy_btn = _center(layout.login_button)
-    img4, _ = _draw_login_screen(username=username, password=password, layout=layout, jitter=False)
+    img4, _ = _draw_login_screen(
+        username=username, password=password, layout=layout, jitter=False
+    )
     img4_path = root / f"{episode_id}_step_4.png"
     _save_image(img4, img4_path)
     obs4 = Observation(screenshot_path=str(img4_path))
@@ -670,7 +676,9 @@ SOM_CONFIRM_PASSWORD_FIELD = 5
 SOM_REGISTER_BUTTON = 6


-def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> RegistrationUIElements:
+def _compute_registration_layout(
+    max_offset: int = 8, jitter: bool = True
+) -> RegistrationUIElements:
     """Compute registration form layout with optional jitter."""

     label_x = 180
@@ -683,7 +691,9 @@ def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> Re
             return x, y
         dx = random.randint(-max_offset, max_offset)
         dy = random.randint(-max_offset, max_offset)
-        return max(20, min(IMG_WIDTH - box_w - 20, x + dx)), max(20, min(IMG_HEIGHT - 60, y + dy))
+        return max(20, min(IMG_WIDTH - box_w - 20, x + dx)), max(
+            20, min(IMG_HEIGHT - 60, y + dy)
+        )

     # First name
     fn_x, fn_y = _maybe_jitter(label_x, start_y + 24)
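The max/min pair clamps a jittered coordinate back inside the canvas margins. Worked through with assumed values (IMG_WIDTH=800, IMG_HEIGHT=600, box_w=400):

    x = max(20, min(800 - 400 - 20, 700 + 8))  # min(380, 708) -> 380: pinned to the right margin
    y = max(20, min(600 - 60, 10 - 8))         # max(20, 2)    -> 20:  pinned to the top margin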
@@ -707,7 +717,9 @@ def _compute_registration_layout(max_offset: int = 8, jitter: bool = True) -> Re

     # Register button
     btn_w, btn_h = 160, 45
-    btn_x, btn_y = _maybe_jitter((IMG_WIDTH - btn_w) // 2, start_y + 5 * field_spacing + 40)
+    btn_x, btn_y = _maybe_jitter(
+        (IMG_WIDTH - btn_w) // 2, start_y + 5 * field_spacing + 40
+    )
     register_button = (btn_x, btn_y, btn_w, btn_h)

     return RegistrationUIElements(
@@ -743,7 +755,7 @@ def _draw_registration_screen(
     layout = _compute_registration_layout(jitter=jitter)

     label_x = 180
-
+    _box_w, _box_h = 400, 36
     start_y = 100
     field_spacing = 70
@@ -766,19 +778,37 @@ def _draw_registration_screen(

     # Register button
     btn_x, btn_y, btn_w, btn_h = layout.register_button
-    draw.rectangle([(btn_x, btn_y), (btn_x + btn_w, btn_y + btn_h)], outline="black", fill="darkblue")
+    draw.rectangle(
+        [(btn_x, btn_y), (btn_x + btn_w, btn_y + btn_h)],
+        outline="black",
+        fill="darkblue",
+    )
     btn_text = "Register"
     btw, bth = _text_size(draw, btn_text, FONT)
-    draw.text((btn_x + (btn_w - btw) // 2, btn_y + (btn_h - bth) // 2), btn_text, fill="white", font=FONT)
+    draw.text(
+        (btn_x + (btn_w - btw) // 2, btn_y + (btn_h - bth) // 2),
+        btn_text,
+        fill="white",
+        font=FONT,
+    )

     # Decoy "Clear Form" button
     decoy_w, decoy_h = 100, 35
     decoy_x = IMG_WIDTH - decoy_w - 30
     decoy_y = btn_y + 5
-    draw.rectangle([(decoy_x, decoy_y), (decoy_x + decoy_w, decoy_y + decoy_h)], outline="gray", fill=(200, 200, 200))
+    draw.rectangle(
+        [(decoy_x, decoy_y), (decoy_x + decoy_w, decoy_y + decoy_h)],
+        outline="gray",
+        fill=(200, 200, 200),
+    )
     decoy_text = "Clear"
     dtw, dth = _text_size(draw, decoy_text, FONT)
-    draw.text((decoy_x + (decoy_w - dtw) // 2, decoy_y + (decoy_h - dth) // 2), decoy_text, fill="gray", font=FONT)
+    draw.text(
+        (decoy_x + (decoy_w - dtw) // 2, decoy_y + (decoy_h - dth) // 2),
+        decoy_text,
+        fill="gray",
+        font=FONT,
+    )

     return img, layout

@@ -789,10 +819,17 @@ def _draw_registration_success_screen(first_name: str, email: str) -> Image.Imag
     draw = ImageDraw.Draw(img)
     text = f"Welcome, {first_name}!"
     tw, th = _text_size(draw, text, FONT_TITLE)
-    draw.text(((IMG_WIDTH - tw) // 2, IMG_HEIGHT // 2 - 40), text, fill="darkgreen", font=FONT_TITLE)
+    draw.text(
+        ((IMG_WIDTH - tw) // 2, IMG_HEIGHT // 2 - 40),
+        text,
+        fill="darkgreen",
+        font=FONT_TITLE,
+    )
     subtext = f"Confirmation sent to {email}"
     stw, sth = _text_size(draw, subtext, FONT)
-    draw.text(((IMG_WIDTH - stw) // 2, IMG_HEIGHT // 2 + 20), subtext, fill="gray", font=FONT)
+    draw.text(
+        ((IMG_WIDTH - stw) // 2, IMG_HEIGHT // 2 + 20), subtext, fill="gray", font=FONT
+    )
     return img

@@ -830,10 +867,21 @@ def _script_registration_episode(
         ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
         ("email", layout.email_box, email, SOM_EMAIL_FIELD),
         ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
-        ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
+        (
+            "confirm_password",
+            layout.confirm_password_box,
+            password,
+            SOM_CONFIRM_PASSWORD_FIELD,
+        ),
     ]

-    current_values = {"first_name": "", "last_name": "", "email": "", "password": "", "confirm_password": ""}
+    current_values = {
+        "first_name": "",
+        "last_name": "",
+        "email": "",
+        "password": "",
+        "confirm_password": "",
+    }
     step_idx = 0

     for field_name, box, value, elem_idx in field_sequence:
@@ -851,17 +899,19 @@ def _script_registration_episode(
         )
         img_path = root / f"{episode_id}_step_{step_idx}.png"
         _save_image(img, img_path)
-        steps.append(Step(
-            step_index=step_idx,
-            timestamp=float(step_idx),
-            observation=Observation(screenshot_path=str(img_path)),
-            action=Action(
-                type=ActionType.CLICK,
-                normalized_coordinates=(cx, cy),
-                raw={"bbox": bbox, "element_index": elem_idx},
-            ),
-            reasoning=f"Focus the {field_name.replace('_', ' ')} field.",
-        ))
+        steps.append(
+            Step(
+                step_index=step_idx,
+                timestamp=float(step_idx),
+                observation=Observation(screenshot_path=str(img_path)),
+                action=Action(
+                    type=ActionType.CLICK,
+                    normalized_coordinates=(cx, cy),
+                    raw={"bbox": bbox, "element_index": elem_idx},
+                ),
+                reasoning=f"Focus the {field_name.replace('_', ' ')} field.",
+            )
+        )
         step_idx += 1

         # Type step
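Every scripted click/type/done step in these hunks follows the same Step/Action shape; isolated here for reference (the import path is an assumption based on the schema/episode.py entry in the file list above):

    from openadapt_ml.schema.episode import Action, ActionType, Observation, Step

    step = Step(
        step_index=0,
        timestamp=0.0,
        observation=Observation(screenshot_path="ep_step_0.png"),
        action=Action(type=ActionType.CLICK, normalized_coordinates=(0.5, 0.25)),
        reasoning="Focus the first name field.",
    )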
@@ -876,17 +926,19 @@ def _script_registration_episode(
         )
         img2_path = root / f"{episode_id}_step_{step_idx}.png"
         _save_image(img2, img2_path)
-        steps.append(Step(
-            step_index=step_idx,
-            timestamp=float(step_idx),
-            observation=Observation(screenshot_path=str(img2_path)),
-            action=Action(
-                type=ActionType.TYPE,
-                text=value,
-                raw={"element_index": elem_idx},
-            ),
-            reasoning=f"Type the {field_name.replace('_', ' ')}.",
-        ))
+        steps.append(
+            Step(
+                step_index=step_idx,
+                timestamp=float(step_idx),
+                observation=Observation(screenshot_path=str(img2_path)),
+                action=Action(
+                    type=ActionType.TYPE,
+                    text=value,
+                    raw={"element_index": elem_idx},
+                ),
+                reasoning=f"Type the {field_name.replace('_', ' ')}.",
+            )
+        )
         current_values[field_name] = value
         step_idx += 1

@@ -904,30 +956,34 @@ def _script_registration_episode(
     )
     img_path = root / f"{episode_id}_step_{step_idx}.png"
     _save_image(img, img_path)
-    steps.append(Step(
-        step_index=step_idx,
-        timestamp=float(step_idx),
-        observation=Observation(screenshot_path=str(img_path)),
-        action=Action(
-            type=ActionType.CLICK,
-            normalized_coordinates=(cx, cy),
-            raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
-        ),
-        reasoning="Submit the registration form.",
-    ))
+    steps.append(
+        Step(
+            step_index=step_idx,
+            timestamp=float(step_idx),
+            observation=Observation(screenshot_path=str(img_path)),
+            action=Action(
+                type=ActionType.CLICK,
+                normalized_coordinates=(cx, cy),
+                raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
+            ),
+            reasoning="Submit the registration form.",
+        )
+    )
     step_idx += 1

     # Done step
     img_done = _draw_registration_success_screen(first_name, email)
     img_done_path = root / f"{episode_id}_step_{step_idx}.png"
     _save_image(img_done, img_done_path)
-    steps.append(Step(
-        step_index=step_idx,
-        timestamp=float(step_idx),
-        observation=Observation(screenshot_path=str(img_done_path)),
-        action=Action(type=ActionType.DONE),
-        reasoning="Registration successful; workflow complete.",
-    ))
+    steps.append(
+        Step(
+            step_index=step_idx,
+            timestamp=float(step_idx),
+            observation=Observation(screenshot_path=str(img_done_path)),
+            action=Action(type=ActionType.DONE),
+            reasoning="Registration successful; workflow complete.",
+        )
+    )

     return Episode(
         episode_id=episode_id,
@@ -968,10 +1024,21 @@ def _script_registration_episode_som(
         ("last_name", layout.last_name_box, last_name, SOM_LAST_NAME_FIELD),
         ("email", layout.email_box, email, SOM_EMAIL_FIELD),
         ("password", layout.password_box, password, SOM_REG_PASSWORD_FIELD),
-        ("confirm_password", layout.confirm_password_box, password, SOM_CONFIRM_PASSWORD_FIELD),
+        (
+            "confirm_password",
+            layout.confirm_password_box,
+            password,
+            SOM_CONFIRM_PASSWORD_FIELD,
+        ),
     ]

-    current_values = {"first_name": "", "last_name": "", "email": "", "password": "", "confirm_password": ""}
+    current_values = {
+        "first_name": "",
+        "last_name": "",
+        "email": "",
+        "password": "",
+        "confirm_password": "",
+    }
     step_idx = 0

     for field_name, box, value, elem_idx in field_sequence:
@@ -990,17 +1057,19 @@ def _script_registration_episode_som(
         img_som = _overlay_som_marks(img, som_elements)
         img_path = root / f"{episode_id}_step_{step_idx}.png"
         _save_image(img_som, img_path)
-        steps.append(Step(
-            step_index=step_idx,
-            timestamp=float(step_idx),
-            observation=Observation(screenshot_path=str(img_path)),
-            action=Action(
-                type=ActionType.CLICK,
-                normalized_coordinates=(cx, cy),
-                raw={"bbox": bbox, "element_index": elem_idx},
-            ),
-            reasoning=f"Focus element [{elem_idx}] ({field_name.replace('_', ' ')} field).",
-        ))
+        steps.append(
+            Step(
+                step_index=step_idx,
+                timestamp=float(step_idx),
+                observation=Observation(screenshot_path=str(img_path)),
+                action=Action(
+                    type=ActionType.CLICK,
+                    normalized_coordinates=(cx, cy),
+                    raw={"bbox": bbox, "element_index": elem_idx},
+                ),
+                reasoning=f"Focus element [{elem_idx}] ({field_name.replace('_', ' ')} field).",
+            )
+        )
         step_idx += 1

         # Type step
@@ -1016,17 +1085,19 @@ def _script_registration_episode_som(
         img2_som = _overlay_som_marks(img2, som_elements)
         img2_path = root / f"{episode_id}_step_{step_idx}.png"
         _save_image(img2_som, img2_path)
-        steps.append(Step(
-            step_index=step_idx,
-            timestamp=float(step_idx),
-            observation=Observation(screenshot_path=str(img2_path)),
-            action=Action(
-                type=ActionType.TYPE,
-                text=value,
-                raw={"element_index": elem_idx},
-            ),
-            reasoning=f"Type into element [{elem_idx}].",
-        ))
+        steps.append(
+            Step(
+                step_index=step_idx,
+                timestamp=float(step_idx),
+                observation=Observation(screenshot_path=str(img2_path)),
+                action=Action(
+                    type=ActionType.TYPE,
+                    text=value,
+                    raw={"element_index": elem_idx},
+                ),
+                reasoning=f"Type into element [{elem_idx}].",
+            )
+        )
         current_values[field_name] = value
         step_idx += 1

@@ -1045,30 +1116,34 @@ def _script_registration_episode_som(
     img_som = _overlay_som_marks(img, som_elements)
     img_path = root / f"{episode_id}_step_{step_idx}.png"
     _save_image(img_som, img_path)
-    steps.append(Step(
-        step_index=step_idx,
-        timestamp=float(step_idx),
-        observation=Observation(screenshot_path=str(img_path)),
-        action=Action(
-            type=ActionType.CLICK,
-            normalized_coordinates=(cx, cy),
-            raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
-        ),
-        reasoning=f"Click element [{SOM_REGISTER_BUTTON}] to submit registration.",
-    ))
+    steps.append(
+        Step(
+            step_index=step_idx,
+            timestamp=float(step_idx),
+            observation=Observation(screenshot_path=str(img_path)),
+            action=Action(
+                type=ActionType.CLICK,
+                normalized_coordinates=(cx, cy),
+                raw={"bbox": bbox, "element_index": SOM_REGISTER_BUTTON},
+            ),
+            reasoning=f"Click element [{SOM_REGISTER_BUTTON}] to submit registration.",
+        )
+    )
     step_idx += 1

     # Done step
     img_done = _draw_registration_success_screen(first_name, email)
     img_done_path = root / f"{episode_id}_step_{step_idx}.png"
     _save_image(img_done, img_done_path)
-    steps.append(Step(
-        step_index=step_idx,
-        timestamp=float(step_idx),
-        observation=Observation(screenshot_path=str(img_done_path)),
-        action=Action(type=ActionType.DONE),
-        reasoning="Registration successful; workflow complete.",
-    ))
+    steps.append(
+        Step(
+            step_index=step_idx,
+            timestamp=float(step_idx),
+            observation=Observation(screenshot_path=str(img_done_path)),
+            action=Action(type=ActionType.DONE),
+            reasoning="Registration successful; workflow complete.",
+        )
+    )

     return Episode(
         episode_id=episode_id,
@@ -1149,15 +1224,29 @@ def generate_synthetic_episodes(

         if use_som:
             episode = _script_registration_episode_som(
-                episode_dir, episode_id_full, first_name, last_name, email, password, jitter=jitter
+                episode_dir,
+                episode_id_full,
+                first_name,
+                last_name,
+                email,
+                password,
+                jitter=jitter,
             )
         else:
             episode = _script_registration_episode(
-                episode_dir, episode_id_full, first_name, last_name, email, password, jitter=jitter
+                episode_dir,
+                episode_id_full,
+                first_name,
+                last_name,
+                email,
+                password,
+                jitter=jitter,
             )

     else:
-        raise ValueError(f"Unknown scenario: {scenario}. Options: login, registration")
+        raise ValueError(
+            f"Unknown scenario: {scenario}. Options: login, registration"
+        )

     episodes.append(episode)
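A hedged usage sketch of the generator these hunks touch; scenario, use_som, and jitter all appear in the function body above, but the exact public signature and Episode attributes are assumptions:

    from openadapt_ml.ingest.synthetic import generate_synthetic_episodes

    # Hypothetical call; keyword names beyond scenario/use_som/jitter are assumed.
    episodes = generate_synthetic_episodes(scenario="registration", use_som=True, jitter=True)
    for ep in episodes:
        print(ep.episode_id, len(ep.steps))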
openadapt_ml/models/api_adapter.py
CHANGED
@@ -50,7 +50,9 @@ class ApiVLMAdapter(BaseVLMAdapter):
                 "Install with `uv sync --extra api`."
             ) from exc

-        key = api_key or settings.anthropic_api_key or os.getenv("ANTHROPIC_API_KEY")
+        key = (
+            api_key or settings.anthropic_api_key or os.getenv("ANTHROPIC_API_KEY")
+        )
         if not key:
             raise RuntimeError(
                 "ANTHROPIC_API_KEY is required but not found. "
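The fallback order above (explicit argument, then configured settings, then environment) as a standalone sketch; resolve_key and its parameters are illustrative stand-ins, not part of the package:

    import os

    def resolve_key(api_key=None, settings_key=None):
        # Explicit argument wins, then settings, then the environment.
        key = api_key or settings_key or os.getenv("ANTHROPIC_API_KEY")
        if not key:
            raise RuntimeError("ANTHROPIC_API_KEY is required but not found.")
        return key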
@@ -87,10 +89,14 @@ class ApiVLMAdapter(BaseVLMAdapter):
         super().__init__(model=model, processor=processor, device=device)

     def prepare_inputs(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:  # type: ignore[override]
-        raise NotImplementedError("ApiVLMAdapter does not support training (prepare_inputs)")
+        raise NotImplementedError(
+            "ApiVLMAdapter does not support training (prepare_inputs)"
+        )

     def compute_loss(self, inputs: Dict[str, Any]) -> torch.Tensor:  # type: ignore[override]
-        raise NotImplementedError("ApiVLMAdapter does not support training (compute_loss)")
+        raise NotImplementedError(
+            "ApiVLMAdapter does not support training (compute_loss)"
+        )

     def generate(self, sample: Dict[str, Any], max_new_tokens: int = 64) -> str:  # type: ignore[override]
         images = sample.get("images", [])
@@ -138,7 +144,11 @@ class ApiVLMAdapter(BaseVLMAdapter):

             # Anthropic messages API returns a list of content blocks.
             parts = getattr(resp, "content", [])
-            texts = [getattr(p, "text", "") for p in parts if getattr(p, "type", "") == "text"]
+            texts = [
+                getattr(p, "text", "")
+                for p in parts
+                if getattr(p, "type", "") == "text"
+            ]
             return "\n".join([t for t in texts if t]).strip()

         if self.provider == "openai":
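The defensive getattr parsing skips non-text content blocks (e.g. tool_use). The shape it handles, demonstrated with stand-in objects rather than real SDK types:

    from types import SimpleNamespace

    resp = SimpleNamespace(content=[
        SimpleNamespace(type="text", text="Hello"),
        SimpleNamespace(type="tool_use"),  # no .text attribute; filtered out
        SimpleNamespace(type="text", text="world"),
    ])
    texts = [getattr(p, "text", "") for p in resp.content if getattr(p, "type", "") == "text"]
    assert "\n".join(t for t in texts if t).strip() == "Hello\nworld"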
openadapt_ml/models/base_adapter.py
CHANGED
@@ -14,7 +14,10 @@ def get_default_device() -> torch.device:

     if torch.cuda.is_available():
         return torch.device("cuda")
-    if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():  # type: ignore[attr-defined]
+    if (
+        getattr(torch.backends, "mps", None) is not None
+        and torch.backends.mps.is_available()
+    ):  # type: ignore[attr-defined]
         return torch.device("mps")
     return torch.device("cpu")
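The same CUDA -> MPS -> CPU preference as a standalone helper; a sketch mirroring the guarded check above, not the package's own API:

    import torch

    def pick_device() -> torch.device:
        if torch.cuda.is_available():
            return torch.device("cuda")
        mps = getattr(torch.backends, "mps", None)  # guard: older torch builds lack this backend
        if mps is not None and mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")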
@@ -28,7 +31,12 @@ class BaseVLMAdapter(ABC):
     - generating assistant text given a single sample at inference time
     """

-    def __init__(self, model: torch.nn.Module, processor: Any, device: Optional[torch.device] = None) -> None:
+    def __init__(
+        self,
+        model: torch.nn.Module,
+        processor: Any,
+        device: Optional[torch.device] = None,
+    ) -> None:
         self.model = model
         self.processor = processor
         self.device = device or get_default_device()
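A hedged sketch of the adapter contract implied by these two files: subclasses supply prepare_inputs/compute_loss for training and generate for inference. Method names are taken from ApiVLMAdapter above; the exact abstract surface and import path are assumptions:

    from openadapt_ml.models.base_adapter import BaseVLMAdapter  # path assumed from the file list

    class EchoAdapter(BaseVLMAdapter):
        """Toy adapter: no training support, trivially 'generates' the prompt back."""

        def prepare_inputs(self, batch):
            raise NotImplementedError("EchoAdapter does not support training")

        def compute_loss(self, inputs):
            raise NotImplementedError("EchoAdapter does not support training")

        def generate(self, sample, max_new_tokens=64):
            return sample.get("prompt", "")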