openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. openadapt_ml/benchmarks/__init__.py +8 -0
  2. openadapt_ml/benchmarks/agent.py +90 -11
  3. openadapt_ml/benchmarks/azure.py +35 -6
  4. openadapt_ml/benchmarks/cli.py +4449 -201
  5. openadapt_ml/benchmarks/live_tracker.py +180 -0
  6. openadapt_ml/benchmarks/runner.py +41 -4
  7. openadapt_ml/benchmarks/viewer.py +1219 -0
  8. openadapt_ml/benchmarks/vm_monitor.py +610 -0
  9. openadapt_ml/benchmarks/waa.py +61 -4
  10. openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  11. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  12. openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  13. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  14. openadapt_ml/benchmarks/waa_live.py +619 -0
  15. openadapt_ml/cloud/local.py +1555 -1
  16. openadapt_ml/cloud/ssh_tunnel.py +553 -0
  17. openadapt_ml/datasets/next_action.py +87 -68
  18. openadapt_ml/evals/grounding.py +26 -8
  19. openadapt_ml/evals/trajectory_matching.py +84 -36
  20. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  21. openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  22. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  23. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  24. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  25. openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  26. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  27. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  28. openadapt_ml/experiments/waa_demo/runner.py +717 -0
  29. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  30. openadapt_ml/export/__init__.py +9 -0
  31. openadapt_ml/export/__main__.py +6 -0
  32. openadapt_ml/export/cli.py +89 -0
  33. openadapt_ml/export/parquet.py +265 -0
  34. openadapt_ml/ingest/__init__.py +3 -4
  35. openadapt_ml/ingest/capture.py +89 -81
  36. openadapt_ml/ingest/loader.py +116 -68
  37. openadapt_ml/ingest/synthetic.py +221 -159
  38. openadapt_ml/retrieval/README.md +226 -0
  39. openadapt_ml/retrieval/USAGE.md +391 -0
  40. openadapt_ml/retrieval/__init__.py +91 -0
  41. openadapt_ml/retrieval/demo_retriever.py +817 -0
  42. openadapt_ml/retrieval/embeddings.py +629 -0
  43. openadapt_ml/retrieval/index.py +194 -0
  44. openadapt_ml/retrieval/retriever.py +160 -0
  45. openadapt_ml/runtime/policy.py +10 -10
  46. openadapt_ml/schema/__init__.py +104 -0
  47. openadapt_ml/schema/converters.py +541 -0
  48. openadapt_ml/schema/episode.py +457 -0
  49. openadapt_ml/scripts/compare.py +26 -16
  50. openadapt_ml/scripts/eval_policy.py +4 -5
  51. openadapt_ml/scripts/prepare_synthetic.py +14 -17
  52. openadapt_ml/scripts/train.py +81 -70
  53. openadapt_ml/training/benchmark_viewer.py +3225 -0
  54. openadapt_ml/training/trainer.py +120 -363
  55. openadapt_ml/training/trl_trainer.py +354 -0
  56. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
  57. openadapt_ml-0.2.0.dist-info/RECORD +86 -0
  58. openadapt_ml/schemas/__init__.py +0 -53
  59. openadapt_ml/schemas/sessions.py +0 -122
  60. openadapt_ml/schemas/validation.py +0 -252
  61. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  62. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
  63. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/datasets/next_action.py
@@ -6,7 +6,7 @@ from typing import Any, Dict, List
 import torch
 from torch.utils.data import Dataset
 
-from openadapt_ml.schemas.sessions import Action, Episode, Step
+from openadapt_ml.schema import Action, ActionType, Episode, Step, UIElement
 
 
 # Coordinate-based DSL system prompt (original)
@@ -97,6 +97,13 @@ SYSTEM_PROMPT_SOM_REGISTRATION = (
 )
 
 
+def _get_element_id(action: Action) -> str | None:
+    """Extract element ID from action's element field."""
+    if action.element is not None and action.element.element_id is not None:
+        return action.element.element_id
+    return None
+
+
 def format_action(action: Action, use_som: bool = False) -> str:
     """Serialize an Action into a simple textual command.
 
@@ -110,53 +117,55 @@ def format_action(action: Action, use_som: bool = False) -> str:
     Args:
         action: The action to format.
         use_som: If True, use Set-of-Marks (SoM) index-based format instead of
-            coordinate-based format. Requires element_index to be set.
+            coordinate-based format. Requires element with element_id to be set.
     """
 
     t = action.type
+    element_id = _get_element_id(action)
     if use_som:
         # SoM mode: use element indices instead of coordinates
-        if t == "click" and action.element_index is not None:
-            return f"CLICK([{action.element_index}])"
-        if t == "type" and action.text is not None:
+        if t == ActionType.CLICK and element_id is not None:
+            return f"CLICK([{element_id}])"
+        if t == ActionType.TYPE and action.text is not None:
             escaped = action.text.replace("\\", "\\\\").replace("\"", "\\\"")
-            if action.element_index is not None:
-                return f"TYPE([{action.element_index}], \"{escaped}\")"
+            if element_id is not None:
+                return f"TYPE([{element_id}], \"{escaped}\")"
             else:
                 # Fallback: TYPE without element reference (for focused field)
                 return f"TYPE(\"{escaped}\")"
-        if t == "wait":
+        if t == ActionType.WAIT:
            return "WAIT()"
-        if t == "done":
+        if t == ActionType.DONE:
            return "DONE()"
        # Fallback
-        return f"ACTION(type={t})"
+        return f"ACTION(type={t.value if isinstance(t, ActionType) else t})"
    else:
        # Coordinate mode (original)
-        if t == "click" and action.x is not None and action.y is not None:
-            return f"CLICK(x={action.x:.2f}, y={action.y:.2f})"
-        if t == "type" and action.text is not None:
+        if t == ActionType.CLICK and action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+            return f"CLICK(x={x:.2f}, y={y:.2f})"
+        if t == ActionType.TYPE and action.text is not None:
            escaped = action.text.replace("\\", "\\\\").replace("\"", "\\\"")
            return f"TYPE(text=\"{escaped}\")"
-        if t == "wait":
+        if t == ActionType.WAIT:
            return "WAIT()"
-        if t == "done":
+        if t == ActionType.DONE:
            return "DONE()"
        # Fallback
-        return f"ACTION(type={t})"
+        return f"ACTION(type={t.value if isinstance(t, ActionType) else t})"
 
 
 def parse_action_som(text: str) -> Action:
     """Parse a SoM-style action string into an Action object.
 
     Supported formats:
-    - CLICK([N]) click element N
-    - TYPE([N], "text") type text into element N
-    - TYPE("text") type text into focused field
-    - WAIT() wait
-    - DONE() done
+    - CLICK([N]) -> click element N
+    - TYPE([N], "text") -> type text into element N
+    - TYPE("text") -> type text into focused field
+    - WAIT() -> wait
+    - DONE() -> done
 
-    Returns Action with element_index set for click/type actions.
+    Returns Action with element set for click/type actions.
     """
     import re
 
@@ -165,32 +174,32 @@ def parse_action_som(text: str) -> Action:
     # CLICK([N])
     match = re.match(r"CLICK\(\[(\d+)\]\)", text)
     if match:
-        idx = int(match.group(1))
-        return Action(type="click", element_index=idx)
+        idx = match.group(1)
+        return Action(type=ActionType.CLICK, element=UIElement(element_id=idx))
 
     # TYPE([N], "text") or TYPE([N], 'text')
     match = re.match(r'TYPE\(\[(\d+)\],\s*["\'](.*)["\']\)', text, re.DOTALL)
     if match:
-        idx = int(match.group(1))
+        idx = match.group(1)
         content = match.group(2).replace("\\\"", "\"").replace("\\\\", "\\")
-        return Action(type="type", text=content, element_index=idx)
+        return Action(type=ActionType.TYPE, text=content, element=UIElement(element_id=idx))
 
     # TYPE("text") - no element index
     match = re.match(r'TYPE\(["\'](.*)["\']\)', text, re.DOTALL)
     if match:
         content = match.group(1).replace("\\\"", "\"").replace("\\\\", "\\")
-        return Action(type="type", text=content)
+        return Action(type=ActionType.TYPE, text=content)
 
     # WAIT()
     if text.upper().startswith("WAIT"):
-        return Action(type="wait")
+        return Action(type=ActionType.WAIT)
 
     # DONE()
     if text.upper().startswith("DONE"):
-        return Action(type="done")
+        return Action(type=ActionType.DONE)
 
     # Failed to parse
-    return Action(type="failed", raw={"text": text})
+    return Action(type=ActionType.FAIL, raw={"text": text})
 
 
 def _generate_generic_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
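Example — the SoM DSL now round-trips through parse_action_som and format_action with string element IDs carried on UIElement instead of the old integer element_index. A minimal sketch, assuming these functions live in openadapt_ml/datasets/next_action.py (as the change list suggests) and that Action and UIElement accept the keyword arguments shown in the hunks above:

    from openadapt_ml.datasets.next_action import format_action, parse_action_som
    from openadapt_ml.schema import ActionType

    # Parsing keeps the element ID as a string on a UIElement
    action = parse_action_som('TYPE([7], "hello")')
    assert action.type == ActionType.TYPE
    assert action.element.element_id == "7"
    assert action.text == "hello"

    # SoM-mode formatting reproduces the index-based command exactly
    assert format_action(action, use_som=True) == 'TYPE([7], "hello")'

    # Unparseable input degrades to a FAIL action with the raw text preserved
    failed = parse_action_som("open the settings menu")
    assert failed.type == ActionType.FAIL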
@@ -205,10 +214,10 @@ def _generate_generic_thought(step_index: int, step: Step, goal: str, total_step
     # Progress context
     progress = f"Step {step_index + 1} of {total_steps}."
 
-    if t == "click":
-        if action.x is not None and action.y is not None:
+    if t == ActionType.CLICK:
+        if action.normalized_coordinates is not None:
             # Describe the click location relative to screen regions
-            x, y = action.x, action.y
+            x, y = action.normalized_coordinates
             h_pos = "left" if x < 0.33 else ("center" if x < 0.66 else "right")
             v_pos = "top" if y < 0.33 else ("middle" if y < 0.66 else "bottom")
             return (
@@ -217,28 +226,28 @@ def _generate_generic_thought(step_index: int, step: Step, goal: str, total_step
         )
         return f"{progress} I need to click on the relevant UI element to continue toward '{goal}'."
 
-    if t == "double_click":
+    if t == ActionType.DOUBLE_CLICK:
         return f"{progress} I need to double-click to select or activate this element for '{goal}'."
 
-    if t == "type":
+    if t == ActionType.TYPE:
         if action.text:
             # Don't reveal the actual text, just indicate typing is needed
             return f"{progress} I need to type text into the focused input field to continue toward '{goal}'."
         return f"{progress} I need to enter text in the current field."
 
-    if t == "scroll":
+    if t == ActionType.SCROLL:
         return f"{progress} I need to scroll to reveal more content or reach the target element for '{goal}'."
 
-    if t == "drag":
+    if t == ActionType.DRAG:
         return f"{progress} I need to drag an element to complete this part of '{goal}'."
 
-    if t == "key_press":
+    if t == ActionType.KEY:
         return f"{progress} I need to press a key to continue the workflow."
 
-    if t == "wait":
+    if t == ActionType.WAIT:
         return f"{progress} I should wait for the UI to update before the next action."
 
-    if t == "done":
+    if t == ActionType.DONE:
         return f"The goal '{goal}' has been achieved. The workflow is complete."
 
     # Fallback
@@ -279,42 +288,42 @@ def _generate_login_thought(step_index: int, step: Step, goal: str, total_steps:
     t = action.type
 
     # Step 0: click username field
-    if step_index == 0 and t == "click":
+    if step_index == 0 and t == ActionType.CLICK:
         return (
             "I see a login screen with empty username and password fields and a Login button. "
             f"To start logging in, I need to click on the username field to focus it ({goal})."
         )
 
     # Step 1: type username
-    if step_index == 1 and t == "type":
+    if step_index == 1 and t == ActionType.TYPE:
         return (
             "The username field is focused. To move toward the login goal, I should type the "
             "username into this field."
         )
 
     # Step 2: click password field
-    if step_index == 2 and t == "click":
+    if step_index == 2 and t == ActionType.CLICK:
         return (
             "The username has been entered. Next, I need to focus the password field so that I can "
             "enter the password for this login. I will click on the password input box."
         )
 
     # Step 3: type password
-    if step_index == 3 and t == "type":
+    if step_index == 3 and t == ActionType.TYPE:
         return (
             "The password field is focused. To continue the login process, I should type the "
             "password (which will appear as masked characters on the screen)."
         )
 
     # Step 4: click Login button
-    if step_index == 4 and t == "click":
+    if step_index == 4 and t == ActionType.CLICK:
         return (
             "Both the username and password have been entered. To submit the form and attempt the "
             "login, I should click the Login button."
         )
 
     # Step 5: DONE on logged-in screen
-    if step_index == 5 and t == "done":
+    if step_index == 5 and t == ActionType.DONE:
         return (
             "I now see a logged-in confirmation screen indicating the goal has been satisfied. "
             "The task is complete, so I should emit DONE()."
@@ -334,41 +343,41 @@ def _generate_registration_thought(step_index: int, step: Step, goal: str, total
 
     # Registration step mapping (pairs of click + type for 5 fields, then submit + done)
     thoughts = {
-        (0, "click"): (
+        (0, ActionType.CLICK): (
             "I see a registration form with empty fields for name, email, and password. "
             f"To start registration, I need to click on the First Name field ({goal})."
         ),
-        (1, "type"): (
+        (1, ActionType.TYPE): (
             "The First Name field is focused. I should type the first name."
         ),
-        (2, "click"): (
+        (2, ActionType.CLICK): (
             "First name entered. Now I need to focus the Last Name field to enter it."
         ),
-        (3, "type"): (
+        (3, ActionType.TYPE): (
             "The Last Name field is focused. I should type the last name."
         ),
-        (4, "click"): (
+        (4, ActionType.CLICK): (
             "Last name entered. Now I need to focus the Email field to enter the email address."
         ),
-        (5, "type"): (
+        (5, ActionType.TYPE): (
             "The Email field is focused. I should type the email address."
         ),
-        (6, "click"): (
+        (6, ActionType.CLICK): (
             "Email entered. Now I need to focus the Password field to create a password."
         ),
-        (7, "type"): (
+        (7, ActionType.TYPE): (
             "The Password field is focused. I should type the password."
         ),
-        (8, "click"): (
+        (8, ActionType.CLICK): (
             "Password entered. Now I need to focus the Confirm Password field to verify the password."
         ),
-        (9, "type"): (
+        (9, ActionType.TYPE): (
             "The Confirm Password field is focused. I should type the same password again."
         ),
-        (10, "click"): (
+        (10, ActionType.CLICK): (
             "All form fields are filled. I should click the Register button to submit the form."
         ),
-        (11, "done"): (
+        (11, ActionType.DONE): (
             "Registration is complete - I see a success screen. The task is finished."
         ),
     }
@@ -385,10 +394,16 @@ def _generate_registration_thought(step_index: int, step: Step, goal: str, total
 
 
 def _detect_scenario(episode: Episode) -> str:
-    """Detect scenario from episode workflow_id."""
-    workflow_id = episode.workflow_id or ""
-    if "registration" in workflow_id.lower():
+    """Detect scenario from episode task_id or metadata."""
+    # Check task_id first
+    task_id = episode.task_id or ""
+    if "registration" in task_id.lower():
         return "registration"
+    # Check metadata for workflow_id (backward compatibility)
+    if episode.metadata and "workflow_id" in episode.metadata:
+        workflow_id = episode.metadata.get("workflow_id", "")
+        if "registration" in str(workflow_id).lower():
+            return "registration"
     return "login"
 
 
@@ -417,7 +432,8 @@ def build_next_action_sft_samples(
     samples: List[Dict[str, Any]] = []
 
     for episode in episodes:
-        goal = episode.goal
+        # Use instruction as the goal (new schema field name)
+        goal = episode.instruction
         total_steps = len(episode.steps)
         scenario = _detect_scenario(episode)
 
@@ -430,18 +446,21 @@ def build_next_action_sft_samples(
         else:
             system_prompt = SYSTEM_PROMPT
 
-        for step_index, step in enumerate(episode.steps):
-            image_path = step.observation.image_path
+        for step in episode.steps:
+            # Use step_index from the Step model
+            step_index = step.step_index
+            # Use screenshot_path instead of image_path
+            image_path = step.observation.screenshot_path
             if not image_path:
                 # Skip steps without an associated image
                 continue
 
             # Build action history from previous steps
             action_history = []
-            for prev_idx in range(step_index):
-                prev_step = episode.steps[prev_idx]
-                prev_action_text = format_action(prev_step.action, use_som=use_som)
-                action_history.append(prev_action_text)
+            for prev_step in episode.steps:
+                if prev_step.step_index < step_index:
+                    prev_action_text = format_action(prev_step.action, use_som=use_som)
+                    action_history.append(prev_action_text)
 
             # Build history section for both modes - use actual step count
             if action_history:
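Example — the loop above depends on several 0.1.0 → 0.2.0 schema renames. A hedged summary of the renames visible in this diff, plus a small helper mirroring the skip-steps-without-images logic (field names come from the hunks, not from package docs):

    # 0.1.0 (openadapt_ml.schemas.sessions)  ->  0.2.0 (openadapt_ml.schema)
    #   episode.goal                         ->  episode.instruction
    #   episode.id                           ->  episode.episode_id
    #   step.observation.image_path          ->  step.observation.screenshot_path
    #   step.thought                         ->  step.reasoning
    #   enumerate(episode.steps)             ->  step.step_index (stored on Step)
    #   action.x, action.y                   ->  action.normalized_coordinates
    #   action.element_index                 ->  action.element.element_id
    #   action.bbox                          ->  action.element.bounds or action.raw["bbox"]

    def iter_steps_with_images(episode):
        """Yield (step_index, screenshot_path, action) under the new schema."""
        for step in episode.steps:
            if step.observation.screenshot_path:
                yield step.step_index, step.observation.screenshot_path, step.action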
openadapt_ml/evals/grounding.py
@@ -212,30 +212,48 @@ def evaluate_grounder_on_episode(
     """
     from PIL import Image
 
-    from openadapt_ml.schemas.sessions import Episode
+    from openadapt_ml.schema import Episode, ActionType
 
     test_cases = []
 
     for step in episode.steps:
         action = step.action
 
+        # Get action type as string for comparison
+        action_type_str = action.type.value if isinstance(action.type, ActionType) else action.type
+
         # Only evaluate clicks with bboxes
-        if action.type not in ("click", "double_click"):
+        if action_type_str not in ("click", "double_click"):
             continue
-        if action.bbox is None:
+
+        # Check for bbox - in new schema, bbox is in element.bounds or raw
+        bbox = None
+        if action.element and action.element.bounds:
+            b = action.element.bounds
+            bbox = (b.x, b.y, b.x + b.width, b.y + b.height)
+        elif action.raw and "bbox" in action.raw:
+            bbox = action.raw["bbox"]
+
+        if bbox is None:
             continue
-        if step.observation.image_path is None:
+        if step.observation.screenshot_path is None:
             continue
 
         # Load image
         try:
-            image = Image.open(step.observation.image_path)
+            image = Image.open(step.observation.screenshot_path)
         except Exception:
             continue
 
-        # Create target description from thought or action
-        target_desc = step.thought or f"element at ({action.x:.2f}, {action.y:.2f})"
+        # Create target description from reasoning or action coordinates
+        coords_x, coords_y = None, None
+        if action.normalized_coordinates:
+            coords_x, coords_y = action.normalized_coordinates
+        if coords_x is not None and coords_y is not None:
+            target_desc = step.reasoning or f"element at ({coords_x:.2f}, {coords_y:.2f})"
+        else:
+            target_desc = step.reasoning or "target element"
 
-        test_cases.append((image, target_desc, action.bbox))
+        test_cases.append((image, target_desc, bbox))
 
     return evaluate_grounder(grounder, test_cases, k=k)
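Example — the new schema stores element bounds in x/y/width/height form, while the grounder evaluation wants corner-form (x_min, y_min, x_max, y_max) boxes; the hunk above converts between the two. A standalone sketch of that conversion, using a stand-in Bounds dataclass (the real type lives in openadapt_ml.schema):

    from dataclasses import dataclass

    @dataclass
    class Bounds:
        """Stand-in for the schema's element bounds (x/y/width/height form)."""
        x: float
        y: float
        width: float
        height: float

    def to_corner_bbox(b: Bounds) -> tuple[float, float, float, float]:
        # Same arithmetic as the hunk: (x, y, w, h) -> (x_min, y_min, x_max, y_max)
        return (b.x, b.y, b.x + b.width, b.y + b.height)

    # An 80x24 box anchored at (100, 40) becomes corner form (100, 40, 180, 64)
    assert to_corner_bbox(Bounds(100, 40, 80, 24)) == (100, 40, 180, 64)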
openadapt_ml/evals/trajectory_matching.py
@@ -5,7 +5,7 @@ from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional
 
 from openadapt_ml.runtime.policy import AgentPolicy
-from openadapt_ml.schemas.sessions import Action, Episode
+from openadapt_ml.schema import Action, Episode, ActionType
 
 
 @dataclass
@@ -92,22 +92,46 @@ class AggregateMetrics:
     element_accuracy: Optional[float] = None  # SoM element index accuracy
 
 
+def _get_action_type_str(action: Action) -> str:
+    """Get action type as string, handling both enum and string types."""
+    return action.type.value if isinstance(action.type, ActionType) else action.type
+
+
+def _get_normalized_coords(action: Action) -> tuple[Optional[float], Optional[float]]:
+    """Extract normalized coordinates from action."""
+    if action.normalized_coordinates:
+        return action.normalized_coordinates
+    return None, None
+
+
+def _get_bbox(action: Action) -> Optional[tuple[float, float, float, float]]:
+    """Extract bounding box from action, checking element.bounds or raw."""
+    if action.element and action.element.bounds:
+        b = action.element.bounds
+        return (b.x, b.y, b.x + b.width, b.y + b.height)
+    elif action.raw and "bbox" in action.raw:
+        return action.raw["bbox"]
+    return None
+
+
 def compute_coordinate_error(pred_action: Action, gt_action: Action) -> Optional[float]:
     """Compute normalized L2 distance between predicted and ground-truth coords.
 
     Returns None if either action is missing coordinates.
     """
+    pred_x, pred_y = _get_normalized_coords(pred_action)
+    gt_x, gt_y = _get_normalized_coords(gt_action)
 
     if (
-        pred_action.x is None
-        or pred_action.y is None
-        or gt_action.x is None
-        or gt_action.y is None
+        pred_x is None
+        or pred_y is None
+        or gt_x is None
+        or gt_y is None
     ):
         return None
 
-    dx = pred_action.x - gt_action.x
-    dy = pred_action.y - gt_action.y
+    dx = pred_x - gt_x
+    dy = pred_y - gt_y
     return math.sqrt(dx * dx + dy * dy)
 
 
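Example — a worked check of the coordinate-error path, assuming these helpers ship in openadapt_ml.evals.trajectory_matching (per the change list) and that Action accepts normalized_coordinates and defaults it to None:

    import math

    from openadapt_ml.evals.trajectory_matching import compute_coordinate_error
    from openadapt_ml.schema import Action, ActionType

    pred = Action(type=ActionType.CLICK, normalized_coordinates=(0.50, 0.50))
    gt = Action(type=ActionType.CLICK, normalized_coordinates=(0.53, 0.54))

    # L2 distance in normalized units: sqrt(0.03**2 + 0.04**2) = 0.05
    err = compute_coordinate_error(pred, gt)
    assert err is not None and math.isclose(err, 0.05)

    # Missing coordinates on either side yield None rather than a distance
    assert compute_coordinate_error(pred, Action(type=ActionType.WAIT)) is None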
@@ -119,14 +143,16 @@ def is_click_in_bbox(pred_action: Action, gt_action: Action) -> Optional[bool]:
     - False if prediction is outside bbox
     - None if no bbox is available (fall back to coord distance)
     """
-    if gt_action.bbox is None:
+    gt_bbox = _get_bbox(gt_action)
+    if gt_bbox is None:
         return None
 
-    if pred_action.x is None or pred_action.y is None:
+    pred_x, pred_y = _get_normalized_coords(pred_action)
+    if pred_x is None or pred_y is None:
         return False
 
-    x_min, y_min, x_max, y_max = gt_action.bbox
-    return (x_min <= pred_action.x <= x_max) and (y_min <= pred_action.y <= y_max)
+    x_min, y_min, x_max, y_max = gt_bbox
+    return (x_min <= pred_x <= x_max) and (y_min <= pred_y <= y_max)
 
 
 def evaluate_episode(
@@ -177,7 +203,7 @@ def evaluate_episode(
 
     for step_idx, step in enumerate(episode.steps):
         # Skip steps without an image; the dataset builder does the same.
-        if not step.observation.image_path:
+        if not step.observation.screenshot_path:
             continue
 
         if sample_idx >= len(samples):
@@ -189,13 +215,17 @@ def evaluate_episode(
         pred_action, _thought, pred_state, raw_text = policy.predict_action_from_sample(sample)
         gt_action = step.action
 
+        # Get action types as strings for comparison
+        pred_type_str = _get_action_type_str(pred_action)
+        gt_type_str = _get_action_type_str(gt_action)
+
         # Track state-based success from final step
         if pred_state and isinstance(pred_state, dict):
             success_val = pred_state.get("success")
             if isinstance(success_val, bool):
                 last_state_success = success_val
 
-        type_match = pred_action.type == gt_action.type
+        type_match = pred_type_str == gt_type_str
         if type_match:
             step_matches += 1
         else:
@@ -206,14 +236,28 @@ def evaluate_episode(
         bbox_hit = False
         element_hit = False
 
+        # Helper to get element index - check element.element_id or raw field
+        def _get_element_index(action: Action) -> Optional[int]:
+            if action.element and action.element.element_id:
+                try:
+                    return int(action.element.element_id)
+                except (ValueError, TypeError):
+                    pass
+            if action.raw and "element_index" in action.raw:
+                return action.raw["element_index"]
+            return None
+
+        gt_element_index = _get_element_index(gt_action)
+        pred_element_index = _get_element_index(pred_action)
+
         # SoM mode: evaluate by element index for click/drag/type actions
-        if use_som and gt_action.type in {"click", "drag", "type"}:
-            if gt_action.element_index is not None:
+        if use_som and gt_type_str in {"click", "drag", "type"}:
+            if gt_element_index is not None:
                 element_total += 1
-                if pred_action.element_index == gt_action.element_index:
+                if pred_element_index == gt_element_index:
                     element_hits += 1
                     element_hit = True
-        elif gt_action.type in {"click", "drag"}:
+        elif gt_type_str in {"click", "drag"}:
             # Coordinate mode: evaluate by coordinate distance
             coord_error = compute_coordinate_error(pred_action, gt_action)
             if coord_error is not None:
@@ -233,11 +277,11 @@ def evaluate_episode(
 
         # Full step correctness: type matches AND element/coord match for relevant actions
         if type_match:
-            if use_som and gt_action.type in {"click", "drag", "type"}:
+            if use_som and gt_type_str in {"click", "drag", "type"}:
                 # SoM mode: require element index match
                 if element_hit:
                     full_step_correct += 1
-            elif gt_action.type in {"click", "drag"}:
+            elif gt_type_str in {"click", "drag"}:
                 # Coordinate mode: require click hit
                 if click_hit:
                     full_step_correct += 1
@@ -247,8 +291,8 @@ def evaluate_episode(
 
         # Track semantic milestones using the milestone spec
         for milestone in milestones:
-            if step_idx == milestone.step_index and gt_action.type == milestone.expected_type:
-                if pred_action.type == milestone.expected_type:
+            if step_idx == milestone.step_index and gt_type_str == milestone.expected_type:
+                if pred_type_str == milestone.expected_type:
                     # Check coord threshold if specified (for click actions)
                     if milestone.coord_threshold is not None:
                         if coord_error is not None and coord_error < milestone.coord_threshold:
@@ -258,9 +302,13 @@ def evaluate_episode(
                         milestones_achieved[milestone.name] = True
 
         # Ensure DONE is correct at the DONE step.
-        if gt_action.type == "done" and pred_action.type != "done":
+        if gt_type_str == "done" and pred_type_str != "done":
            success_pred = False
 
+        # Get normalized coordinates for logging
+        pred_x, pred_y = _get_normalized_coords(pred_action)
+        gt_x, gt_y = _get_normalized_coords(gt_action)
+
         # Optional logging of this step.
         if log_fn is not None and (log_limit is None or logged_count < log_limit):
             messages = sample.get("messages", [])
@@ -273,30 +321,30 @@ def evaluate_episode(
                     user_prompt = m.get("content")
 
             record: Dict[str, Any] = {
-                "episode_id": episode.id,
+                "episode_id": episode.episode_id,
                 "step_index": step_idx,
-                "goal": episode.goal,
+                "goal": episode.instruction,
                 "system_prompt": system_prompt,
                 "user_prompt": user_prompt,
                 "model_output_raw": raw_text,
                 "pred_action": {
-                    "type": pred_action.type,
-                    "x": pred_action.x,
-                    "y": pred_action.y,
+                    "type": pred_type_str,
+                    "x": pred_x,
+                    "y": pred_y,
                     "text": pred_action.text,
-                    "element_index": pred_action.element_index,
+                    "element_index": pred_element_index,
                 },
                 "ground_truth_action": {
-                    "type": gt_action.type,
-                    "x": gt_action.x,
-                    "y": gt_action.y,
+                    "type": gt_type_str,
+                    "x": gt_x,
+                    "y": gt_y,
                     "text": gt_action.text,
-                    "element_index": gt_action.element_index,
+                    "element_index": gt_element_index,
                 },
-                "correct_type": pred_action.type == gt_action.type,
+                "correct_type": pred_type_str == gt_type_str,
                 "coord_error_norm": coord_error,
-                "element_match": pred_action.element_index == gt_action.element_index
-                if gt_action.element_index is not None
+                "element_match": pred_element_index == gt_element_index
+                if gt_element_index is not None
                 else None,
             }
 
@@ -306,7 +354,7 @@ def evaluate_episode(
             step_total += 1
 
     metrics = EpisodeMetrics(
-        episode_id=episode.id,
+        episode_id=episode.episode_id,
         step_matches=step_matches,
         step_total=step_total,
         coord_errors=coord_errors,
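Example — with the helpers above, is_click_in_bbox can fall back to a corner-form bbox stashed in Action.raw when no UIElement bounds exist. A hedged sketch (module path and default-None fields are assumptions based on the hunks):

    from openadapt_ml.evals.trajectory_matching import is_click_in_bbox
    from openadapt_ml.schema import Action, ActionType

    # Ground truth carries only a raw corner-form bbox (no element bounds)
    gt = Action(type=ActionType.CLICK, raw={"bbox": (0.40, 0.10, 0.60, 0.20)})

    hit = Action(type=ActionType.CLICK, normalized_coordinates=(0.45, 0.15))
    miss = Action(type=ActionType.CLICK, normalized_coordinates=(0.90, 0.90))

    assert is_click_in_bbox(hit, gt) is True     # inside the box
    assert is_click_in_bbox(miss, gt) is False   # outside the box
    assert is_click_in_bbox(hit, Action(type=ActionType.CLICK)) is None  # no bbox available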
openadapt_ml/experiments/demo_prompt/__init__.py (new file)
@@ -0,0 +1,19 @@
+"""Demo-conditioned prompt experiment.
+
+Tests whether including a human demonstration in the prompt
+improves VLM agent performance on similar tasks.
+"""
+
+from openadapt_ml.experiments.demo_prompt.format_demo import (
+    format_episode_as_demo,
+    format_action,
+)
+from openadapt_ml.experiments.demo_prompt.run_experiment import (
+    DemoPromptExperiment,
+)
+
+__all__ = [
+    "format_episode_as_demo",
+    "format_action",
+    "DemoPromptExperiment",
+]
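Example — downstream use of the new experiment package. The internals of format_demo.py and run_experiment.py are not shown in this diff, so the call shapes below are illustrative assumptions only (demo_episode stands for an openadapt_ml.schema.Episode you already have):

    from openadapt_ml.experiments.demo_prompt import (
        DemoPromptExperiment,
        format_episode_as_demo,
    )

    # Render a recorded episode as an in-context demonstration string
    demo_text = format_episode_as_demo(demo_episode)

    # DemoPromptExperiment encapsulates the with/without-demo comparison
    experiment = DemoPromptExperiment()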