openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in the public registry.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -107
- openadapt_ml/benchmarks/agent.py +297 -374
- openadapt_ml/benchmarks/azure.py +62 -24
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1874 -751
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +1236 -0
- openadapt_ml/benchmarks/vm_monitor.py +1111 -0
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +3194 -89
- openadapt_ml/cloud/ssh_tunnel.py +595 -0
- openadapt_ml/datasets/next_action.py +125 -96
- openadapt_ml/evals/grounding.py +32 -9
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +120 -57
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +732 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +277 -0
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +11 -10
- openadapt_ml/ingest/capture.py +97 -86
- openadapt_ml/ingest/loader.py +120 -69
- openadapt_ml/ingest/synthetic.py +344 -193
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +843 -0
- openadapt_ml/retrieval/embeddings.py +630 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +162 -0
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +27 -14
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +113 -0
- openadapt_ml/schema/converters.py +588 -0
- openadapt_ml/schema/episode.py +470 -0
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +102 -61
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +19 -14
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +16 -17
- openadapt_ml/scripts/train.py +98 -75
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +3255 -19
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +255 -441
- openadapt_ml/training/trl_trainer.py +403 -0
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
- openadapt_ml-0.2.1.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/runner.py +0 -381
- openadapt_ml/benchmarks/waa.py +0 -704
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/datasets/next_action.py

CHANGED

@@ -3,10 +3,9 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any, Dict, List
 
-import torch
 from torch.utils.data import Dataset
 
-from openadapt_ml.
+from openadapt_ml.schema import Action, ActionType, Episode, Step, UIElement
 
 
 # Coordinate-based DSL system prompt (original)
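The import swap above is the headline change: the old `openadapt_ml.schemas` package (deleted in this release, per the file list) gives way to `openadapt_ml.schema`. A minimal sketch of an episode under the new schema, using only field names that appear in the hunks below (`instruction`, `task_id`, `step_index`, `screenshot_path`, `normalized_coordinates`); the `Observation` class name and the exact constructor signatures are assumptions, not confirmed by this diff:

```python
from openadapt_ml.schema import Action, ActionType, Episode, Step, UIElement
from openadapt_ml.schema import Observation  # name assumed; holds screenshot_path below

episode = Episode(
    task_id="login_demo",                  # consulted by _detect_scenario below
    instruction="Log into the demo app",   # replaces the old goal field
    steps=[
        Step(
            step_index=0,
            observation=Observation(screenshot_path="frames/000.png"),
            action=Action(
                type=ActionType.CLICK,
                normalized_coordinates=(0.42, 0.17),
                element=UIElement(element_id="1"),
            ),
        )
    ],
)
```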
@@ -20,7 +19,7 @@ SYSTEM_PROMPT = (
     "- Example: An element in the middle of the screen would be approximately x=0.5, y=0.5\n\n"
     "ALLOWED ACTIONS (use exactly this format):\n"
     "- CLICK(x=0.XX, y=0.XX) → click at normalized coordinates\n"
-
+    '- TYPE(text="...") → type text into the currently focused field\n'
     "- WAIT() → wait for UI to update\n"
     "- DONE() → task is complete\n\n"
     "RESPONSE FORMAT (required):\n"
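The prompt above pins the model to a two-line reply grammar. An illustrative response under the coordinate DSL (values made up for the example):

```
Thought: The username field sits in the top-left region of the screen, so I should click it first.
Action: CLICK(x=0.31, y=0.22)
```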
@@ -42,14 +41,14 @@ SYSTEM_PROMPT_SOM = (
     "[3] = Login button\n\n"
     "ALLOWED ACTIONS (use exactly this format):\n"
     "- CLICK([N]) → click element with number N to focus/activate it\n"
-
+    '- TYPE([N], "text") → type text into element N (e.g., TYPE([2], "hello"))\n'
     "- WAIT() → wait for UI to update\n"
     "- DONE() → task is complete\n\n"
     "ACTION SEQUENCE FOR LOGIN:\n"
     "1. CLICK([1]) to focus username field\n"
-
+    '2. TYPE([1], "username") to enter username\n'
     "3. CLICK([2]) to focus password field\n"
-
+    '4. TYPE([2], "password") to enter password\n'
     "5. CLICK([3]) to submit login\n"
     "6. DONE() when login is complete\n\n"
     "RESPONSE FORMAT (required):\n"
@@ -74,20 +73,20 @@ SYSTEM_PROMPT_SOM_REGISTRATION = (
     "[6] = Register button\n\n"
     "ALLOWED ACTIONS (use exactly this format):\n"
     "- CLICK([N]) → click element with number N to focus/activate it\n"
-
+    '- TYPE([N], "text") → type text into element N (e.g., TYPE([2], "hello"))\n'
     "- WAIT() → wait for UI to update\n"
     "- DONE() → task is complete\n\n"
     "ACTION SEQUENCE FOR REGISTRATION:\n"
     "1. CLICK([1]) to focus first name field\n"
-
+    '2. TYPE([1], "name") to enter first name\n'
     "3. CLICK([2]) to focus last name field\n"
-
+    '4. TYPE([2], "name") to enter last name\n'
     "5. CLICK([3]) to focus email field\n"
-
+    '6. TYPE([3], "email") to enter email\n'
     "7. CLICK([4]) to focus password field\n"
-
+    '8. TYPE([4], "pass") to enter password\n'
     "9. CLICK([5]) to focus confirm password field\n"
-
+    '10. TYPE([5], "pass") to enter confirmation\n'
     "11. CLICK([6]) to submit registration\n"
     "12. DONE() when registration is complete\n\n"
     "RESPONSE FORMAT (required):\n"
@@ -97,6 +96,13 @@ SYSTEM_PROMPT_SOM_REGISTRATION = (
 )
 
 
+def _get_element_id(action: Action) -> str | None:
+    """Extract element ID from action's element field."""
+    if action.element is not None and action.element.element_id is not None:
+        return action.element.element_id
+    return None
+
+
 def format_action(action: Action, use_som: bool = False) -> str:
     """Serialize an Action into a simple textual command.
 
@@ -110,53 +116,55 @@ def format_action(action: Action, use_som: bool = False) -> str:
     Args:
         action: The action to format.
         use_som: If True, use Set-of-Marks (SoM) index-based format instead of
-            coordinate-based format. Requires
+            coordinate-based format. Requires element with element_id to be set.
     """
 
     t = action.type
+    element_id = _get_element_id(action)
     if use_som:
         # SoM mode: use element indices instead of coordinates
-        if t ==
-            return f"CLICK([{
-        if t ==
-            escaped = action.text.replace("\\", "\\\\").replace("
-            if
-                return f
+        if t == ActionType.CLICK and element_id is not None:
+            return f"CLICK([{element_id}])"
+        if t == ActionType.TYPE and action.text is not None:
+            escaped = action.text.replace("\\", "\\\\").replace('"', '\\"')
+            if element_id is not None:
+                return f'TYPE([{element_id}], "{escaped}")'
             else:
                 # Fallback: TYPE without element reference (for focused field)
-                return f
-        if t ==
+                return f'TYPE("{escaped}")'
+        if t == ActionType.WAIT:
             return "WAIT()"
-        if t ==
+        if t == ActionType.DONE:
             return "DONE()"
         # Fallback
-        return f"ACTION(type={t})"
+        return f"ACTION(type={t.value if isinstance(t, ActionType) else t})"
     else:
         # Coordinate mode (original)
-        if t ==
-
-
-
-
-
+        if t == ActionType.CLICK and action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+            return f"CLICK(x={x:.2f}, y={y:.2f})"
+        if t == ActionType.TYPE and action.text is not None:
+            escaped = action.text.replace("\\", "\\\\").replace('"', '\\"')
+            return f'TYPE(text="{escaped}")'
+        if t == ActionType.WAIT:
             return "WAIT()"
-        if t ==
+        if t == ActionType.DONE:
             return "DONE()"
         # Fallback
-        return f"ACTION(type={t})"
+        return f"ACTION(type={t.value if isinstance(t, ActionType) else t})"
 
 
 def parse_action_som(text: str) -> Action:
     """Parse a SoM-style action string into an Action object.
 
     Supported formats:
-    - CLICK([N])
-    - TYPE([N], "text")
-    - TYPE("text")
-    - WAIT()
-    - DONE()
+    - CLICK([N]) -> click element N
+    - TYPE([N], "text") -> type text into element N
+    - TYPE("text") -> type text into focused field
+    - WAIT() -> wait
+    - DONE() -> done
 
-    Returns Action with
+    Returns Action with element set for click/type actions.
     """
     import re
 
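A quick sketch of what the rewritten `format_action` emits in each mode. The module path follows the file list above, and the `Action`/`UIElement` constructors are assumed to accept these keyword arguments with defaults for the rest:

```python
from openadapt_ml.schema import Action, ActionType, UIElement
from openadapt_ml.datasets.next_action import format_action  # module path assumed

click = Action(type=ActionType.CLICK, normalized_coordinates=(0.5, 0.5))
typed = Action(type=ActionType.TYPE, text='say "hi"', element=UIElement(element_id="2"))

print(format_action(click))                # CLICK(x=0.50, y=0.50)
print(format_action(typed, use_som=True))  # TYPE([2], "say \"hi\"")
```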
@@ -165,35 +173,39 @@ def parse_action_som(text: str) -> Action:
     # CLICK([N])
     match = re.match(r"CLICK\(\[(\d+)\]\)", text)
     if match:
-        idx =
-        return Action(type=
+        idx = match.group(1)
+        return Action(type=ActionType.CLICK, element=UIElement(element_id=idx))
 
     # TYPE([N], "text") or TYPE([N], 'text')
     match = re.match(r'TYPE\(\[(\d+)\],\s*["\'](.*)["\']\)', text, re.DOTALL)
     if match:
-        idx =
-        content = match.group(2).replace("
-        return Action(
+        idx = match.group(1)
+        content = match.group(2).replace('\\"', '"').replace("\\\\", "\\")
+        return Action(
+            type=ActionType.TYPE, text=content, element=UIElement(element_id=idx)
+        )
 
     # TYPE("text") - no element index
     match = re.match(r'TYPE\(["\'](.*)["\']\)', text, re.DOTALL)
     if match:
-        content = match.group(1).replace("
-        return Action(type=
+        content = match.group(1).replace('\\"', '"').replace("\\\\", "\\")
+        return Action(type=ActionType.TYPE, text=content)
 
     # WAIT()
     if text.upper().startswith("WAIT"):
-        return Action(type=
+        return Action(type=ActionType.WAIT)
 
     # DONE()
     if text.upper().startswith("DONE"):
-        return Action(type=
+        return Action(type=ActionType.DONE)
 
     # Failed to parse
-    return Action(type=
+    return Action(type=ActionType.FAIL, raw={"text": text})
 
 
-def _generate_generic_thought(
+def _generate_generic_thought(
+    step_index: int, step: Step, goal: str, total_steps: int
+) -> str:
     """Generate a thought for real captures (non-synthetic scenarios).
 
     This creates action-appropriate thoughts that teach the model to output
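Round-tripping through the parser shows the quote unescaping and the FAIL fallback in action (only the import path is assumed; the behavior is exactly what the hunk above adds):

```python
from openadapt_ml.schema import ActionType
from openadapt_ml.datasets.next_action import parse_action_som  # module path assumed

action = parse_action_som('TYPE([2], "say \\"hi\\"")')
assert action.type == ActionType.TYPE
assert action.text == 'say "hi"'          # quote escapes are unwound
assert action.element.element_id == "2"

# Anything unrecognized falls through to the FAIL marker, raw text preserved.
bad = parse_action_som("SWIPE(left)")
assert bad.type == ActionType.FAIL
assert bad.raw == {"text": "SWIPE(left)"}
```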
@@ -205,10 +217,10 @@ def _generate_generic_thought(step_index: int, step: Step, goal: str, total_step
     # Progress context
     progress = f"Step {step_index + 1} of {total_steps}."
 
-    if t ==
-        if action.
+    if t == ActionType.CLICK:
+        if action.normalized_coordinates is not None:
             # Describe the click location relative to screen regions
-            x, y = action.
+            x, y = action.normalized_coordinates
             h_pos = "left" if x < 0.33 else ("center" if x < 0.66 else "right")
             v_pos = "top" if y < 0.33 else ("middle" if y < 0.66 else "bottom")
             return (
@@ -217,28 +229,30 @@ def _generate_generic_thought(step_index: int, step: Step, goal: str, total_step
             )
         return f"{progress} I need to click on the relevant UI element to continue toward '{goal}'."
 
-    if t ==
+    if t == ActionType.DOUBLE_CLICK:
         return f"{progress} I need to double-click to select or activate this element for '{goal}'."
 
-    if t ==
+    if t == ActionType.TYPE:
         if action.text:
             # Don't reveal the actual text, just indicate typing is needed
             return f"{progress} I need to type text into the focused input field to continue toward '{goal}'."
         return f"{progress} I need to enter text in the current field."
 
-    if t ==
+    if t == ActionType.SCROLL:
         return f"{progress} I need to scroll to reveal more content or reach the target element for '{goal}'."
 
-    if t ==
-        return
+    if t == ActionType.DRAG:
+        return (
+            f"{progress} I need to drag an element to complete this part of '{goal}'."
+        )
 
-    if t ==
+    if t == ActionType.KEY:
         return f"{progress} I need to press a key to continue the workflow."
 
-    if t ==
+    if t == ActionType.WAIT:
         return f"{progress} I should wait for the UI to update before the next action."
 
-    if t ==
+    if t == ActionType.DONE:
         return f"The goal '{goal}' has been achieved. The workflow is complete."
 
     # Fallback
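The click branch above buckets normalized coordinates into a 3x3 grid of screen regions. The same thresholds as a standalone helper:

```python
def describe_region(x: float, y: float) -> str:
    # Same 0.33 / 0.66 cut points as _generate_generic_thought's click branch.
    h_pos = "left" if x < 0.33 else ("center" if x < 0.66 else "right")
    v_pos = "top" if y < 0.33 else ("middle" if y < 0.66 else "bottom")
    return f"{v_pos} {h_pos}"

assert describe_region(0.8, 0.2) == "top right"
assert describe_region(0.5, 0.5) == "middle center"
```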
@@ -260,9 +274,6 @@ def _generate_thought_for_step(
     actions back to the stated objective.
     """
 
-    action = step.action
-    t = action.type
-
     if scenario == "registration":
         return _generate_registration_thought(step_index, step, goal, total_steps)
     elif scenario == "login" and total_steps <= 7:
@@ -273,48 +284,50 @@ def _generate_thought_for_step(
         return _generate_generic_thought(step_index, step, goal, total_steps)
 
 
-def _generate_login_thought(
+def _generate_login_thought(
+    step_index: int, step: Step, goal: str, total_steps: int
+) -> str:
     """Generate thought for login scenario (6 steps)."""
     action = step.action
     t = action.type
 
     # Step 0: click username field
-    if step_index == 0 and t ==
+    if step_index == 0 and t == ActionType.CLICK:
         return (
             "I see a login screen with empty username and password fields and a Login button. "
             f"To start logging in, I need to click on the username field to focus it ({goal})."
         )
 
     # Step 1: type username
-    if step_index == 1 and t ==
+    if step_index == 1 and t == ActionType.TYPE:
         return (
             "The username field is focused. To move toward the login goal, I should type the "
             "username into this field."
         )
 
     # Step 2: click password field
-    if step_index == 2 and t ==
+    if step_index == 2 and t == ActionType.CLICK:
         return (
             "The username has been entered. Next, I need to focus the password field so that I can "
             "enter the password for this login. I will click on the password input box."
         )
 
     # Step 3: type password
-    if step_index == 3 and t ==
+    if step_index == 3 and t == ActionType.TYPE:
         return (
             "The password field is focused. To continue the login process, I should type the "
             "password (which will appear as masked characters on the screen)."
         )
 
     # Step 4: click Login button
-    if step_index == 4 and t ==
+    if step_index == 4 and t == ActionType.CLICK:
         return (
             "Both the username and password have been entered. To submit the form and attempt the "
             "login, I should click the Login button."
         )
 
     # Step 5: DONE on logged-in screen
-    if step_index == 5 and t ==
+    if step_index == 5 and t == ActionType.DONE:
         return (
             "I now see a logged-in confirmation screen indicating the goal has been satisfied. "
             "The task is complete, so I should emit DONE()."
@@ -327,48 +340,50 @@ def _generate_login_thought(step_index: int, step: Step, goal: str, total_steps:
     )
 
 
-def _generate_registration_thought(
+def _generate_registration_thought(
+    step_index: int, step: Step, goal: str, total_steps: int
+) -> str:
     """Generate thought for registration scenario (12 steps)."""
     action = step.action
     t = action.type
 
     # Registration step mapping (pairs of click + type for 5 fields, then submit + done)
     thoughts = {
-        (0,
+        (0, ActionType.CLICK): (
             "I see a registration form with empty fields for name, email, and password. "
             f"To start registration, I need to click on the First Name field ({goal})."
         ),
-        (1,
+        (1, ActionType.TYPE): (
             "The First Name field is focused. I should type the first name."
         ),
-        (2,
+        (2, ActionType.CLICK): (
             "First name entered. Now I need to focus the Last Name field to enter it."
         ),
-        (3,
+        (3, ActionType.TYPE): (
             "The Last Name field is focused. I should type the last name."
        ),
-        (4,
+        (4, ActionType.CLICK): (
             "Last name entered. Now I need to focus the Email field to enter the email address."
         ),
-        (5,
+        (5, ActionType.TYPE): (
             "The Email field is focused. I should type the email address."
         ),
-        (6,
+        (6, ActionType.CLICK): (
             "Email entered. Now I need to focus the Password field to create a password."
         ),
-        (7,
+        (7, ActionType.TYPE): (
             "The Password field is focused. I should type the password."
         ),
-        (8,
+        (8, ActionType.CLICK): (
             "Password entered. Now I need to focus the Confirm Password field to verify the password."
         ),
-        (9,
+        (9, ActionType.TYPE): (
             "The Confirm Password field is focused. I should type the same password again."
         ),
-        (10,
+        (10, ActionType.CLICK): (
             "All form fields are filled. I should click the Register button to submit the form."
         ),
-        (11,
+        (11, ActionType.DONE): (
             "Registration is complete - I see a success screen. The task is finished."
         ),
     }
@@ -385,10 +400,16 @@ def _generate_registration_thought(step_index: int, step: Step, goal: str, total
 
 
 def _detect_scenario(episode: Episode) -> str:
-    """Detect scenario from episode
-
-
+    """Detect scenario from episode task_id or metadata."""
+    # Check task_id first
+    task_id = episode.task_id or ""
+    if "registration" in task_id.lower():
         return "registration"
+    # Check metadata for workflow_id (backward compatibility)
+    if episode.metadata and "workflow_id" in episode.metadata:
+        workflow_id = episode.metadata.get("workflow_id", "")
+        if "registration" in str(workflow_id).lower():
+            return "registration"
     return "login"
 
 
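`_detect_scenario` now prefers `episode.task_id` and falls back to a legacy `workflow_id` in metadata. A sketch of both paths; the `Episode` constructor arguments are assumptions beyond the fields the function reads:

```python
from openadapt_ml.schema import Episode
from openadapt_ml.datasets.next_action import _detect_scenario  # module path assumed

modern = Episode(task_id="registration_form_01", instruction="Create an account", steps=[])
assert _detect_scenario(modern) == "registration"

legacy = Episode(
    task_id=None,
    instruction="Create an account",
    steps=[],
    metadata={"workflow_id": "REGISTRATION-7"},
)
assert _detect_scenario(legacy) == "registration"  # backward-compatible path
```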
@@ -417,7 +438,8 @@ def build_next_action_sft_samples(
     samples: List[Dict[str, Any]] = []
 
     for episode in episodes:
-        goal
+        # Use instruction as the goal (new schema field name)
+        goal = episode.instruction
         total_steps = len(episode.steps)
         scenario = _detect_scenario(episode)
@@ -430,18 +452,21 @@ def build_next_action_sft_samples(
         else:
             system_prompt = SYSTEM_PROMPT
 
-        for
-
+        for step in episode.steps:
+            # Use step_index from the Step model
+            step_index = step.step_index
+            # Use screenshot_path instead of image_path
+            image_path = step.observation.screenshot_path
             if not image_path:
                 # Skip steps without an associated image
                 continue
 
             # Build action history from previous steps
             action_history = []
-            for
-                prev_step
-
-
+            for prev_step in episode.steps:
+                if prev_step.step_index < step_index:
+                    prev_action_text = format_action(prev_step.action, use_som=use_som)
+                    action_history.append(prev_action_text)
 
             # Build history section for both modes - use actual step count
             if action_history:
@@ -450,7 +475,9 @@ def build_next_action_sft_samples(
                     history_text += f" {i}. {action_text}\n"
                 history_text += f"\nThis is step {step_index + 1} of {total_steps}. "
             else:
-                history_text =
+                history_text = (
+                    f"This is step 1 of {total_steps} (no actions completed yet). "
+                )
 
             if use_som:
                 user_content = (
@@ -458,7 +485,7 @@ def build_next_action_sft_samples(
                     f"{history_text}"
                     "Look at the screenshot and determine the NEXT action.\n\n"
                     "Thought: [which numbered element to interact with and why]\n"
-
+                    'Action: [CLICK([N]) or TYPE([N], "text") or WAIT() or DONE()]'
                 )
             else:
                 user_content = (
@@ -466,13 +493,15 @@ def build_next_action_sft_samples(
                     f"{history_text}"
                     "Look at the screenshot and determine the NEXT action.\n\n"
                     "Thought: [what element to interact with and why]\n"
-
+                    'Action: [CLICK(x=..., y=...) or TYPE(text="...") or WAIT() or DONE()]'
                 )
 
             # Provide a deterministic, semantically meaningful Thought while supervising
             # the exact DSL Action.
             action_text = format_action(step.action, use_som=use_som)
-            thought_text = _generate_thought_for_step(
+            thought_text = _generate_thought_for_step(
+                step_index, step, goal, scenario, total_steps
+            )
             assistant_content = f"Thought: {thought_text}\nAction: {action_text}"
 
             sample = {
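The hunk above is truncated where the `sample` dict begins, but the calling convention is visible. A sketch of driving the builder; the `use_som` keyword is assumed from its use inside the loop:

```python
from openadapt_ml.datasets.next_action import build_next_action_sft_samples  # path per file list

episodes: list = []  # populated by the ingest/capture pipeline in practice
samples = build_next_action_sft_samples(episodes, use_som=True)  # use_som kwarg assumed

# Each sample supervises one step: a screenshot plus a chat target of the form
#   "Thought: Step 2 of 6. ...\nAction: TYPE([1], \"username\")"
```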
openadapt_ml/evals/grounding.py

CHANGED

@@ -19,6 +19,7 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from PIL import Image
 
+    from openadapt_ml.data.types import Episode
     from openadapt_ml.grounding.base import GroundingModule, RegionCandidate
 
 
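The hunk below replaces the old bbox lookup with a two-source scheme: `element.bounds` (an x/y/width/height record) first, then a legacy `raw["bbox"]` fallback. The conversion in isolation, as a sketch with `SimpleNamespace` standing in for the schema's bounds record:

```python
from types import SimpleNamespace

def bounds_to_bbox(b) -> tuple:
    # Mirrors the element.bounds branch: (x1, y1, x2, y2) from x/y/width/height.
    return (b.x, b.y, b.x + b.width, b.y + b.height)

b = SimpleNamespace(x=10, y=20, width=200, height=40)
assert bounds_to_bbox(b) == (10, 20, 210, 60)
```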
@@ -212,30 +213,52 @@ def evaluate_grounder_on_episode(
     """
     from PIL import Image
 
-    from openadapt_ml.
+    from openadapt_ml.schema import ActionType
 
     test_cases = []
 
     for step in episode.steps:
         action = step.action
 
+        # Get action type as string for comparison
+        action_type_str = (
+            action.type.value if isinstance(action.type, ActionType) else action.type
+        )
+
         # Only evaluate clicks with bboxes
-        if
+        if action_type_str not in ("click", "double_click"):
             continue
-
+
+        # Check for bbox - in new schema, bbox is in element.bounds or raw
+        bbox = None
+        if action.element and action.element.bounds:
+            b = action.element.bounds
+            bbox = (b.x, b.y, b.x + b.width, b.y + b.height)
+        elif action.raw and "bbox" in action.raw:
+            bbox = action.raw["bbox"]
+
+        if bbox is None:
             continue
-        if step.observation.
+        if step.observation.screenshot_path is None:
             continue
 
         # Load image
         try:
-            image = Image.open(step.observation.
+            image = Image.open(step.observation.screenshot_path)
         except Exception:
             continue
 
-        # Create target description from
-
-
-
+        # Create target description from reasoning or action coordinates
+        coords_x, coords_y = None, None
+        if action.normalized_coordinates:
+            coords_x, coords_y = action.normalized_coordinates
+        if coords_x is not None and coords_y is not None:
+            target_desc = (
+                step.reasoning or f"element at ({coords_x:.2f}, {coords_y:.2f})"
+            )
+        else:
+            target_desc = step.reasoning or "target element"
+
+        test_cases.append((image, target_desc, bbox))
 
     return evaluate_grounder(grounder, test_cases, k=k)

openadapt_ml/evals/plot_eval_metrics.py

CHANGED

@@ -73,7 +73,7 @@ def plot_eval_metrics(
     fig.suptitle(
         "VLM Model Comparison (Offline fine-tuned vs API models)",
         fontsize=12,
-        fontweight=
+        fontweight="bold",
     )
     if num_metrics == 1:
         axes = [axes]
@@ -96,36 +96,38 @@ def plot_eval_metrics(
         hatches.append(hatch)
 
         x = range(num_models)
-        bars = ax.bar(
+        bars = ax.bar(
+            x, values, tick_label=labels, color=colors, edgecolor="black", linewidth=1.2
+        )
 
         # Apply hatch patterns
         for bar, hatch in zip(bars, hatches):
             bar.set_hatch(hatch)
 
-        ax.set_title(title, fontsize=11, fontweight=
+        ax.set_title(title, fontsize=11, fontweight="bold")
         ax.set_ylabel(key, fontsize=9)
         ax.set_ylim(bottom=0.0)
         # Rotate x-axis labels to prevent crowding
-        ax.tick_params(axis=
+        ax.tick_params(axis="x", labelrotation=45, labelsize=8)
         # Align labels to the right for better readability when rotated
         for tick in ax.get_xticklabels():
-            tick.set_horizontalalignment(
+            tick.set_horizontalalignment("right")
 
     fig.tight_layout()
 
     # Add legend explaining color coding and hatch patterns
     legend_elements = [
-        Patch(facecolor=
-        Patch(facecolor=
-        Patch(facecolor=
-        Patch(facecolor=
-        Patch(facecolor=
-        Patch(facecolor=
+        Patch(facecolor="#4A90E2", edgecolor="black", label="Qwen3-VL-2B"),
+        Patch(facecolor="#2E5C8A", edgecolor="black", label="Qwen3-VL-8B"),
+        Patch(facecolor="#FF6B35", edgecolor="black", label="Claude (API)"),
+        Patch(facecolor="#C1121F", edgecolor="black", label="GPT (API)"),
+        Patch(facecolor="gray", edgecolor="black", hatch="///", label="Fine-tuned"),
+        Patch(facecolor="gray", edgecolor="black", label="Base/Pretrained"),
     ]
 
     fig.legend(
         handles=legend_elements,
-        loc=
+        loc="lower center",
         bbox_to_anchor=(0.5, -0.05),
         ncol=3,
         fontsize=9,
@@ -133,7 +135,7 @@ def plot_eval_metrics(
     )
 
     output_path.parent.mkdir(parents=True, exist_ok=True)
-    fig.savefig(output_path, dpi=150, bbox_inches=
+    fig.savefig(output_path, dpi=150, bbox_inches="tight")
     plt.close(fig)
 
 
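For reference, the bar styling and legend scheme these hunks settle on (family colors, hatching to mark fine-tuned checkpoints) can be reproduced standalone. A sketch with made-up scores, not project results:

```python
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(6, 3))
labels = ["Qwen3-VL-2B (ft)", "Qwen3-VL-8B", "Claude (API)", "GPT (API)"]
values = [0.62, 0.55, 0.71, 0.68]               # illustrative values only
colors = ["#4A90E2", "#2E5C8A", "#FF6B35", "#C1121F"]
hatches = ["///", "", "", ""]                   # hatch marks fine-tuned checkpoints

bars = ax.bar(range(len(values)), values, tick_label=labels,
              color=colors, edgecolor="black", linewidth=1.2)
for bar, hatch in zip(bars, hatches):
    bar.set_hatch(hatch)
ax.tick_params(axis="x", labelrotation=45, labelsize=8)
for tick in ax.get_xticklabels():
    tick.set_horizontalalignment("right")
fig.tight_layout()
fig.savefig("legend_demo.png", dpi=150, bbox_inches="tight")
```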