openadapt-ml 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.2.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/cloud/ssh_tunnel.py

@@ -51,9 +51,8 @@ import signal
 import socket
 import subprocess
 import time
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path
-from typing import Any

 logger = logging.getLogger(__name__)
 
@@ -97,9 +96,11 @@ class SSHTunnelManager:
     """

     # Default tunnel configurations
+    # Note: WAA uses local_port=5001 to avoid conflicts with any local WAA server on 5000
+    # The remote port is still 5000 (where WAA Flask runs inside Windows)
     DEFAULT_TUNNELS = [
         TunnelConfig(name="vnc", local_port=8006, remote_port=8006),
-        TunnelConfig(name="waa", local_port=5000, remote_port=5000),
+        TunnelConfig(name="waa", local_port=5001, remote_port=5000),
     ]

     # Auto-reconnect settings
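Note: after this change, local clients reach the WAA server at localhost:5001 while the Flask server inside the Windows VM still listens on 5000. A minimal sketch of probing the remapped local end (illustrative only; this probe is not part of the package):

    import socket

    # Connect to the local end of the "waa" tunnel (localhost:5001 -> remote 5000).
    # A successful connect only shows the forward is listening locally; it does
    # not prove the WAA Flask server inside the VM is healthy.
    with socket.create_connection(("localhost", 5001), timeout=5):
        print("waa tunnel is accepting connections on localhost:5001")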
@@ -125,7 +126,9 @@ class SSHTunnelManager:
         self._current_vm_ip: str | None = None
         self._current_ssh_user: str | None = None
         self._auto_reconnect = auto_reconnect
-        self._reconnect_attempts: dict[str, int] = {}  # Track reconnect attempts per tunnel
+        self._reconnect_attempts: dict[
+            str, int
+        ] = {}  # Track reconnect attempts per tunnel

     def start_tunnels_for_vm(
         self,
@@ -198,7 +201,9 @@ class SSHTunnelManager:
                 pid=None,  # We don't know the PID of the external tunnel
             )
         else:
-            logger.warning(f"Port {config.local_port} already in use by unknown process")
+            logger.warning(
+                f"Port {config.local_port} already in use by unknown process"
+            )
             return TunnelStatus(
                 name=config.name,
                 active=False,
@@ -213,16 +218,25 @@ class SSHTunnelManager:
         # TCPKeepAlive=yes: Enable TCP-level keepalive as additional safeguard
         ssh_cmd = [
             "ssh",
-            "-o", "StrictHostKeyChecking=no",
-            "-o", "UserKnownHostsFile=/dev/null",
-            "-o", "LogLevel=ERROR",
-            "-o", "ServerAliveInterval=60",
-            "-o", "ServerAliveCountMax=10",
-            "-o", "TCPKeepAlive=yes",
-            "-o", "ExitOnForwardFailure=yes",
-            "-i", str(self.ssh_key_path),
+            "-o",
+            "StrictHostKeyChecking=no",
+            "-o",
+            "UserKnownHostsFile=/dev/null",
+            "-o",
+            "LogLevel=ERROR",
+            "-o",
+            "ServerAliveInterval=60",
+            "-o",
+            "ServerAliveCountMax=10",
+            "-o",
+            "TCPKeepAlive=yes",
+            "-o",
+            "ExitOnForwardFailure=yes",
+            "-i",
+            str(self.ssh_key_path),
             "-N",  # Don't execute remote command
-            "-L", f"{config.local_port}:{config.remote_host}:{config.remote_port}",
+            "-L",
+            f"{config.local_port}:{config.remote_host}:{config.remote_port}",
             f"{ssh_user}@{vm_ip}",
         ]
 
@@ -253,7 +267,9 @@ class SSHTunnelManager:

         # Tunnel started successfully
         self._active_tunnels[config.name] = (config, proc)
-        logger.info(f"Started tunnel {config.name}: localhost:{config.local_port} -> {vm_ip}:{config.remote_port}")
+        logger.info(
+            f"Started tunnel {config.name}: localhost:{config.local_port} -> {vm_ip}:{config.remote_port}"
+        )

         return TunnelStatus(
             name=config.name,
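Note: the options above make tunnel death observable: ServerAliveInterval=60 with ServerAliveCountMax=10 lets ssh give up after roughly ten minutes of unanswered keepalives, and ExitOnForwardFailure=yes turns a failed -L forward into a process exit. A rough sketch of the kind of liveness check that pairs with this (the real _check_tunnel_works used elsewhere in this file is not shown in the diff, so this body is an assumption):

    import socket

    def tunnel_accepts_connections(local_port: int, timeout: float = 3.0) -> bool:
        """Return True if something accepts TCP connections on the tunnel's local end."""
        try:
            with socket.create_connection(("127.0.0.1", local_port), timeout=timeout):
                return True
        except OSError:
            return False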
@@ -340,24 +356,36 @@ class SSHTunnelManager:
                     name=config.name,
                     active=True,
                     local_port=config.local_port,
-                    remote_endpoint=f"{self._current_vm_ip}:{config.remote_port}" if self._current_vm_ip else "unknown",
+                    remote_endpoint=f"{self._current_vm_ip}:{config.remote_port}"
+                    if self._current_vm_ip
+                    else "unknown",
                     pid=proc.pid,
                 )
             else:
                 # Process died - but check if port is still working
                 # (could be another tunnel on the same port)
                 del self._active_tunnels[config.name]
-                if self._is_port_in_use(config.local_port) and self._check_tunnel_works(config.local_port, config.remote_port):
+                if self._is_port_in_use(
+                    config.local_port
+                ) and self._check_tunnel_works(
+                    config.local_port, config.remote_port
+                ):
                     results[config.name] = TunnelStatus(
                         name=config.name,
                         active=True,
                         local_port=config.local_port,
-                        remote_endpoint=f"{self._current_vm_ip}:{config.remote_port}" if self._current_vm_ip else "external",
+                        remote_endpoint=f"{self._current_vm_ip}:{config.remote_port}"
+                        if self._current_vm_ip
+                        else "external",
                         pid=None,  # External tunnel, PID unknown
                     )
                 else:
                     # Tunnel is dead - mark for restart if auto_reconnect enabled
-                    if self._auto_reconnect and auto_restart and self._current_vm_ip:
+                    if (
+                        self._auto_reconnect
+                        and auto_restart
+                        and self._current_vm_ip
+                    ):
                         tunnels_to_restart.append(config)
                         results[config.name] = TunnelStatus(
                             name=config.name,
@@ -369,13 +397,19 @@ class SSHTunnelManager:
             else:
                 # Not tracked internally - but check if an external tunnel exists
                 # This handles tunnels started by other processes or after manager restart
-                if self._is_port_in_use(config.local_port) and self._check_tunnel_works(config.local_port, config.remote_port):
-                    logger.debug(f"Found working external tunnel on port {config.local_port}")
+                if self._is_port_in_use(config.local_port) and self._check_tunnel_works(
+                    config.local_port, config.remote_port
+                ):
+                    logger.debug(
+                        f"Found working external tunnel on port {config.local_port}"
+                    )
                     results[config.name] = TunnelStatus(
                         name=config.name,
                         active=True,
                         local_port=config.local_port,
-                        remote_endpoint=f"{self._current_vm_ip}:{config.remote_port}" if self._current_vm_ip else "external",
+                        remote_endpoint=f"{self._current_vm_ip}:{config.remote_port}"
+                        if self._current_vm_ip
+                        else "external",
                         pid=None,  # External tunnel, PID unknown
                     )
                 else:
@@ -390,16 +424,22 @@ class SSHTunnelManager:
         for config in tunnels_to_restart:
             attempts = self._reconnect_attempts.get(config.name, 0)
             if attempts < self.MAX_RECONNECT_ATTEMPTS:
-                logger.info(f"Auto-reconnecting tunnel {config.name} (attempt {attempts + 1}/{self.MAX_RECONNECT_ATTEMPTS})")
+                logger.info(
+                    f"Auto-reconnecting tunnel {config.name} (attempt {attempts + 1}/{self.MAX_RECONNECT_ATTEMPTS})"
+                )
                 time.sleep(self.RECONNECT_DELAY_SECONDS)
                 self._reconnect_attempts[config.name] = attempts + 1
-                status = self._start_tunnel(config, self._current_vm_ip, self._current_ssh_user or "azureuser")
+                status = self._start_tunnel(
+                    config, self._current_vm_ip, self._current_ssh_user or "azureuser"
+                )
                 results[config.name] = status
                 if status.active:
                     logger.info(f"Successfully reconnected tunnel {config.name}")
                     self._reconnect_attempts[config.name] = 0  # Reset on success
             else:
-                logger.warning(f"Tunnel {config.name} exceeded max reconnect attempts ({self.MAX_RECONNECT_ATTEMPTS})")
+                logger.warning(
+                    f"Tunnel {config.name} exceeded max reconnect attempts ({self.MAX_RECONNECT_ATTEMPTS})"
+                )
                 results[config.name] = TunnelStatus(
                     name=config.name,
                     active=False,
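Note: the restart loop is a capped retry: each dead tunnel gets at most MAX_RECONNECT_ATTEMPTS restarts with a fixed RECONNECT_DELAY_SECONDS pause, and the per-tunnel counter resets to zero on success, so a later outage gets a fresh budget. The pattern in isolation (a generic restatement, not code from the package):

    import time
    from typing import Callable

    def retry_with_cap(restart: Callable[[], bool], max_attempts: int, delay: float) -> bool:
        """Retry `restart` until it reports success or the attempt cap is hit."""
        for _ in range(max_attempts):
            time.sleep(delay)  # pause before each attempt, like RECONNECT_DELAY_SECONDS
            if restart():
                return True  # success; the caller resets its per-tunnel counter
        return False  # cap exceeded; the caller records an inactive status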
@@ -455,7 +495,9 @@ class SSHTunnelManager:
         """
         # If VM changed, stop old tunnels and reset reconnect attempts
         if self._current_vm_ip and self._current_vm_ip != vm_ip:
-            logger.info(f"VM IP changed from {self._current_vm_ip} to {vm_ip}, restarting tunnels")
+            logger.info(
+                f"VM IP changed from {self._current_vm_ip} to {vm_ip}, restarting tunnels"
+            )
             self.stop_all_tunnels()
             self.reset_reconnect_attempts()  # Fresh start for new VM
 
openadapt_ml/datasets/next_action.py

@@ -3,7 +3,6 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any, Dict, List

-import torch
 from torch.utils.data import Dataset

 from openadapt_ml.schema import Action, ActionType, Episode, Step, UIElement
@@ -20,7 +19,7 @@ SYSTEM_PROMPT = (
     "- Example: An element in the middle of the screen would be approximately x=0.5, y=0.5\n\n"
     "ALLOWED ACTIONS (use exactly this format):\n"
     "- CLICK(x=0.XX, y=0.XX) → click at normalized coordinates\n"
-    "- TYPE(text=\"...\") → type text into the currently focused field\n"
+    '- TYPE(text="...") → type text into the currently focused field\n'
     "- WAIT() → wait for UI to update\n"
     "- DONE() → task is complete\n\n"
     "RESPONSE FORMAT (required):\n"
@@ -42,14 +41,14 @@ SYSTEM_PROMPT_SOM = (
     "[3] = Login button\n\n"
     "ALLOWED ACTIONS (use exactly this format):\n"
     "- CLICK([N]) → click element with number N to focus/activate it\n"
-    "- TYPE([N], \"text\") → type text into element N (e.g., TYPE([2], \"hello\"))\n"
+    '- TYPE([N], "text") → type text into element N (e.g., TYPE([2], "hello"))\n'
     "- WAIT() → wait for UI to update\n"
     "- DONE() → task is complete\n\n"
     "ACTION SEQUENCE FOR LOGIN:\n"
     "1. CLICK([1]) to focus username field\n"
-    "2. TYPE([1], \"username\") to enter username\n"
+    '2. TYPE([1], "username") to enter username\n'
     "3. CLICK([2]) to focus password field\n"
-    "4. TYPE([2], \"password\") to enter password\n"
+    '4. TYPE([2], "password") to enter password\n'
     "5. CLICK([3]) to submit login\n"
     "6. DONE() when login is complete\n\n"
     "RESPONSE FORMAT (required):\n"
@@ -74,20 +73,20 @@ SYSTEM_PROMPT_SOM_REGISTRATION = (
     "[6] = Register button\n\n"
     "ALLOWED ACTIONS (use exactly this format):\n"
     "- CLICK([N]) → click element with number N to focus/activate it\n"
-    "- TYPE([N], \"text\") → type text into element N (e.g., TYPE([2], \"hello\"))\n"
+    '- TYPE([N], "text") → type text into element N (e.g., TYPE([2], "hello"))\n'
     "- WAIT() → wait for UI to update\n"
     "- DONE() → task is complete\n\n"
     "ACTION SEQUENCE FOR REGISTRATION:\n"
     "1. CLICK([1]) to focus first name field\n"
-    "2. TYPE([1], \"name\") to enter first name\n"
+    '2. TYPE([1], "name") to enter first name\n'
     "3. CLICK([2]) to focus last name field\n"
-    "4. TYPE([2], \"name\") to enter last name\n"
+    '4. TYPE([2], "name") to enter last name\n'
     "5. CLICK([3]) to focus email field\n"
-    "6. TYPE([3], \"email\") to enter email\n"
+    '6. TYPE([3], "email") to enter email\n'
     "7. CLICK([4]) to focus password field\n"
-    "8. TYPE([4], \"pass\") to enter password\n"
+    '8. TYPE([4], "pass") to enter password\n'
     "9. CLICK([5]) to focus confirm password field\n"
-    "10. TYPE([5], \"pass\") to enter confirmation\n"
+    '10. TYPE([5], "pass") to enter confirmation\n'
     "11. CLICK([6]) to submit registration\n"
     "12. DONE() when registration is complete\n\n"
     "RESPONSE FORMAT (required):\n"
@@ -127,12 +126,12 @@ def format_action(action: Action, use_som: bool = False) -> str:
         if t == ActionType.CLICK and element_id is not None:
             return f"CLICK([{element_id}])"
         if t == ActionType.TYPE and action.text is not None:
-            escaped = action.text.replace("\\", "\\\\").replace("\"", "\\\"")
+            escaped = action.text.replace("\\", "\\\\").replace('"', '\\"')
             if element_id is not None:
-                return f"TYPE([{element_id}], \"{escaped}\")"
+                return f'TYPE([{element_id}], "{escaped}")'
             else:
                 # Fallback: TYPE without element reference (for focused field)
-                return f"TYPE(\"{escaped}\")"
+                return f'TYPE("{escaped}")'
         if t == ActionType.WAIT:
             return "WAIT()"
         if t == ActionType.DONE:
@@ -145,8 +144,8 @@ def format_action(action: Action, use_som: bool = False) -> str:
         x, y = action.normalized_coordinates
         return f"CLICK(x={x:.2f}, y={y:.2f})"
     if t == ActionType.TYPE and action.text is not None:
-        escaped = action.text.replace("\\", "\\\\").replace("\"", "\\\"")
-        return f"TYPE(text=\"{escaped}\")"
+        escaped = action.text.replace("\\", "\\\\").replace('"', '\\"')
+        return f'TYPE(text="{escaped}")'
     if t == ActionType.WAIT:
         return "WAIT()"
     if t == ActionType.DONE:
@@ -181,13 +180,15 @@ def parse_action_som(text: str) -> Action:
     match = re.match(r'TYPE\(\[(\d+)\],\s*["\'](.*)["\']\)', text, re.DOTALL)
     if match:
         idx = match.group(1)
-        content = match.group(2).replace("\\\"", "\"").replace("\\\\", "\\")
-        return Action(type=ActionType.TYPE, text=content, element=UIElement(element_id=idx))
+        content = match.group(2).replace('\\"', '"').replace("\\\\", "\\")
+        return Action(
+            type=ActionType.TYPE, text=content, element=UIElement(element_id=idx)
+        )

     # TYPE("text") - no element index
     match = re.match(r'TYPE\(["\'](.*)["\']\)', text, re.DOTALL)
     if match:
-        content = match.group(1).replace("\\\"", "\"").replace("\\\\", "\\")
+        content = match.group(1).replace('\\"', '"').replace("\\\\", "\\")
         return Action(type=ActionType.TYPE, text=content)

     # WAIT()
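Note: the quote flips in format_action and parse_action_som are formatter-only (Black prefers double-quoted strings); the escape/unescape logic is unchanged. A quick round trip through the SoM DSL, assuming these helpers are importable from openadapt_ml.datasets.next_action and that Action and UIElement accept the fields used in the hunks above:

    from openadapt_ml.datasets.next_action import format_action, parse_action_som
    from openadapt_ml.schema import Action, ActionType, UIElement

    original = Action(
        type=ActionType.TYPE,
        text='say "hi"',  # embedded quotes exercise the escaping
        element=UIElement(element_id="2"),
    )
    dsl = format_action(original, use_som=True)  # TYPE([2], "say \"hi\"")
    parsed = parse_action_som(dsl)
    assert parsed.text == original.text  # escaping and unescaping are inverses here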
@@ -202,7 +203,9 @@ def parse_action_som(text: str) -> Action:
     return Action(type=ActionType.FAIL, raw={"text": text})


-def _generate_generic_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
+def _generate_generic_thought(
+    step_index: int, step: Step, goal: str, total_steps: int
+) -> str:
     """Generate a thought for real captures (non-synthetic scenarios).

     This creates action-appropriate thoughts that teach the model to output
@@ -239,7 +242,9 @@ def _generate_generic_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
         return f"{progress} I need to scroll to reveal more content or reach the target element for '{goal}'."

     if t == ActionType.DRAG:
-        return f"{progress} I need to drag an element to complete this part of '{goal}'."
+        return (
+            f"{progress} I need to drag an element to complete this part of '{goal}'."
+        )

     if t == ActionType.KEY:
         return f"{progress} I need to press a key to continue the workflow."
@@ -269,9 +274,6 @@ def _generate_thought_for_step(
     actions back to the stated objective.
     """

-    action = step.action
-    t = action.type
-
     if scenario == "registration":
         return _generate_registration_thought(step_index, step, goal, total_steps)
     elif scenario == "login" and total_steps <= 7:
@@ -282,7 +284,9 @@
         return _generate_generic_thought(step_index, step, goal, total_steps)


-def _generate_login_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
+def _generate_login_thought(
+    step_index: int, step: Step, goal: str, total_steps: int
+) -> str:
     """Generate thought for login scenario (6 steps)."""
     action = step.action
     t = action.type
@@ -336,7 +340,9 @@ def _generate_login_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
     )


-def _generate_registration_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
+def _generate_registration_thought(
+    step_index: int, step: Step, goal: str, total_steps: int
+) -> str:
     """Generate thought for registration scenario (12 steps)."""
     action = step.action
     t = action.type
@@ -469,7 +475,9 @@ def build_next_action_sft_samples(
                 history_text += f" {i}. {action_text}\n"
             history_text += f"\nThis is step {step_index + 1} of {total_steps}. "
         else:
-            history_text = f"This is step 1 of {total_steps} (no actions completed yet). "
+            history_text = (
+                f"This is step 1 of {total_steps} (no actions completed yet). "
+            )

         if use_som:
             user_content = (
@@ -477,7 +485,7 @@
                 f"{history_text}"
                 "Look at the screenshot and determine the NEXT action.\n\n"
                 "Thought: [which numbered element to interact with and why]\n"
-                "Action: [CLICK([N]) or TYPE([N], \"text\") or WAIT() or DONE()]"
+                'Action: [CLICK([N]) or TYPE([N], "text") or WAIT() or DONE()]'
             )
         else:
             user_content = (
@@ -485,13 +493,15 @@
                 f"{history_text}"
                 "Look at the screenshot and determine the NEXT action.\n\n"
                 "Thought: [what element to interact with and why]\n"
-                "Action: [CLICK(x=..., y=...) or TYPE(text=\"...\") or WAIT() or DONE()]"
+                'Action: [CLICK(x=..., y=...) or TYPE(text="...") or WAIT() or DONE()]'
             )

         # Provide a deterministic, semantically meaningful Thought while supervising
         # the exact DSL Action.
         action_text = format_action(step.action, use_som=use_som)
-        thought_text = _generate_thought_for_step(step_index, step, goal, scenario, total_steps)
+        thought_text = _generate_thought_for_step(
+            step_index, step, goal, scenario, total_steps
+        )
         assistant_content = f"Thought: {thought_text}\nAction: {action_text}"

         sample = {
openadapt_ml/evals/grounding.py

@@ -19,6 +19,7 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from PIL import Image

+    from openadapt_ml.data.types import Episode
     from openadapt_ml.grounding.base import GroundingModule, RegionCandidate

 
@@ -212,7 +213,7 @@ def evaluate_grounder_on_episode(
     """
     from PIL import Image

-    from openadapt_ml.schema import Episode, ActionType
+    from openadapt_ml.schema import ActionType

     test_cases = []
 
@@ -220,7 +221,9 @@
         action = step.action

         # Get action type as string for comparison
-        action_type_str = action.type.value if isinstance(action.type, ActionType) else action.type
+        action_type_str = (
+            action.type.value if isinstance(action.type, ActionType) else action.type
+        )

         # Only evaluate clicks with bboxes
         if action_type_str not in ("click", "double_click"):
@@ -250,7 +253,9 @@
         if action.normalized_coordinates:
             coords_x, coords_y = action.normalized_coordinates
             if coords_x is not None and coords_y is not None:
-                target_desc = step.reasoning or f"element at ({coords_x:.2f}, {coords_y:.2f})"
+                target_desc = (
+                    step.reasoning or f"element at ({coords_x:.2f}, {coords_y:.2f})"
+                )
             else:
                 target_desc = step.reasoning or "target element"
 
openadapt_ml/evals/plot_eval_metrics.py

@@ -73,7 +73,7 @@ def plot_eval_metrics(
     fig.suptitle(
         "VLM Model Comparison (Offline fine-tuned vs API models)",
         fontsize=12,
-        fontweight='bold',
+        fontweight="bold",
     )
     if num_metrics == 1:
         axes = [axes]
@@ -96,36 +96,38 @@
             hatches.append(hatch)

         x = range(num_models)
-        bars = ax.bar(x, values, tick_label=labels, color=colors, edgecolor='black', linewidth=1.2)
+        bars = ax.bar(
+            x, values, tick_label=labels, color=colors, edgecolor="black", linewidth=1.2
+        )

         # Apply hatch patterns
         for bar, hatch in zip(bars, hatches):
             bar.set_hatch(hatch)

-        ax.set_title(title, fontsize=11, fontweight='bold')
+        ax.set_title(title, fontsize=11, fontweight="bold")
         ax.set_ylabel(key, fontsize=9)
         ax.set_ylim(bottom=0.0)
         # Rotate x-axis labels to prevent crowding
-        ax.tick_params(axis='x', labelrotation=45, labelsize=8)
+        ax.tick_params(axis="x", labelrotation=45, labelsize=8)
         # Align labels to the right for better readability when rotated
         for tick in ax.get_xticklabels():
-            tick.set_horizontalalignment('right')
+            tick.set_horizontalalignment("right")

     fig.tight_layout()

     # Add legend explaining color coding and hatch patterns
     legend_elements = [
-        Patch(facecolor='#4A90E2', edgecolor='black', label='Qwen3-VL-2B'),
-        Patch(facecolor='#2E5C8A', edgecolor='black', label='Qwen3-VL-8B'),
-        Patch(facecolor='#FF6B35', edgecolor='black', label='Claude (API)'),
-        Patch(facecolor='#C1121F', edgecolor='black', label='GPT (API)'),
-        Patch(facecolor='gray', edgecolor='black', hatch='///', label='Fine-tuned'),
-        Patch(facecolor='gray', edgecolor='black', label='Base/Pretrained'),
+        Patch(facecolor="#4A90E2", edgecolor="black", label="Qwen3-VL-2B"),
+        Patch(facecolor="#2E5C8A", edgecolor="black", label="Qwen3-VL-8B"),
+        Patch(facecolor="#FF6B35", edgecolor="black", label="Claude (API)"),
+        Patch(facecolor="#C1121F", edgecolor="black", label="GPT (API)"),
+        Patch(facecolor="gray", edgecolor="black", hatch="///", label="Fine-tuned"),
+        Patch(facecolor="gray", edgecolor="black", label="Base/Pretrained"),
     ]

     fig.legend(
         handles=legend_elements,
-        loc='lower center',
+        loc="lower center",
         bbox_to_anchor=(0.5, -0.05),
         ncol=3,
         fontsize=9,
@@ -133,7 +135,7 @@
     )

     output_path.parent.mkdir(parents=True, exist_ok=True)
-    fig.savefig(output_path, dpi=150, bbox_inches='tight')
+    fig.savefig(output_path, dpi=150, bbox_inches="tight")
     plt.close(fig)

 
openadapt_ml/evals/trajectory_matching.py

@@ -15,10 +15,15 @@ class MilestoneSpec:
     A milestone is achieved when, at a specific step, the predicted action
     matches certain criteria (type match + optional coord threshold).
     """
+
     name: str
     step_index: int  # Which step in the episode (0-indexed)
-    expected_type: str  # Expected ground truth action type ("click", "type", "done", etc.)
-    coord_threshold: Optional[float] = None  # If set, coord error must be < this for clicks
+    expected_type: (
+        str  # Expected ground truth action type ("click", "type", "done", etc.)
+    )
+    coord_threshold: Optional[float] = (
+        None  # If set, coord error must be < this for clicks
+    )


 # Predefined milestone specs per scenario
@@ -28,7 +33,9 @@ class MilestoneSpec:
 LOGIN_MILESTONES = [
     MilestoneSpec("typed_username", step_index=1, expected_type="type"),
     MilestoneSpec("typed_password", step_index=3, expected_type="type"),
-    MilestoneSpec("clicked_login", step_index=4, expected_type="click", coord_threshold=0.10),
+    MilestoneSpec(
+        "clicked_login", step_index=4, expected_type="click", coord_threshold=0.10
+    ),
     MilestoneSpec("emitted_done", step_index=5, expected_type="done"),
 ]
 
@@ -81,14 +88,22 @@ class AggregateMetrics:
     action_type_accuracy: float
     mean_coord_error: Optional[float]
     coord_error_count: int
-    episode_success_rate: Optional[float]  # Strict: all steps must match (renamed from success_pred)
+    episode_success_rate: Optional[
+        float
+    ]  # Strict: all steps must match (renamed from success_pred)
     click_hit_rate: Optional[float]  # Point-based: within 5% of center
-    mean_episode_progress: Optional[float]  # Partial credit: avg(step_matches/step_total)
+    mean_episode_progress: Optional[
+        float
+    ]  # Partial credit: avg(step_matches/step_total)
     # New partial-credit metrics
-    mean_episode_step_score: Optional[float]  # Strict partial: avg(full_step_correct/step_total)
+    mean_episode_step_score: Optional[
+        float
+    ]  # Strict partial: avg(full_step_correct/step_total)
     weak_episode_success_rate: Optional[float]  # Semantic milestones all achieved
     state_success_rate: Optional[float] = None  # From model's State: {"success": true}
-    bbox_hit_rate: Optional[float] = None  # Bbox-based: click anywhere in element bounds
+    bbox_hit_rate: Optional[float] = (
+        None  # Bbox-based: click anywhere in element bounds
+    )
     element_accuracy: Optional[float] = None  # SoM element index accuracy

 
@@ -122,12 +137,7 @@ def compute_coordinate_error(pred_action: Action, gt_action: Action) -> Optional[float]:
     pred_x, pred_y = _get_normalized_coords(pred_action)
     gt_x, gt_y = _get_normalized_coords(gt_action)

-    if (
-        pred_x is None
-        or pred_y is None
-        or gt_x is None
-        or gt_y is None
-    ):
+    if pred_x is None or pred_y is None or gt_x is None or gt_y is None:
         return None

     dx = pred_x - gt_x
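Note: collapsing the guard leaves the metric untouched: coordinate error is the Euclidean distance between predicted and ground-truth points in normalized [0, 1] coordinates, which is what coord_threshold values like 0.10 are compared against. On toy numbers (math.hypot stands in for whatever distance expression follows dx and dy):

    import math

    pred_x, pred_y = 0.52, 0.48  # toy predicted click
    gt_x, gt_y = 0.50, 0.50      # toy ground-truth click
    coord_error = math.hypot(pred_x - gt_x, pred_y - gt_y)  # ~0.028, under a 0.10 threshold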
@@ -212,7 +222,9 @@ def evaluate_episode(
         sample = samples[sample_idx]
         sample_idx += 1

-        pred_action, _thought, pred_state, raw_text = policy.predict_action_from_sample(sample)
+        pred_action, _thought, pred_state, raw_text = policy.predict_action_from_sample(
+            sample
+        )
         gt_action = step.action

         # Get action types as strings for comparison
@@ -233,7 +245,6 @@

         coord_error: Optional[float] = None
         click_hit = False
-        bbox_hit = False
         element_hit = False

         # Helper to get element index - check element.element_id or raw field
@@ -273,7 +284,6 @@
                 bbox_total += 1
                 if in_bbox:
                     bbox_hits += 1
-                    bbox_hit = True

         # Full step correctness: type matches AND element/coord match for relevant actions
         if type_match:
@@ -291,11 +301,17 @@

         # Track semantic milestones using the milestone spec
         for milestone in milestones:
-            if step_idx == milestone.step_index and gt_type_str == milestone.expected_type:
+            if (
+                step_idx == milestone.step_index
+                and gt_type_str == milestone.expected_type
+            ):
                 if pred_type_str == milestone.expected_type:
                     # Check coord threshold if specified (for click actions)
                     if milestone.coord_threshold is not None:
-                        if coord_error is not None and coord_error < milestone.coord_threshold:
+                        if (
+                            coord_error is not None
+                            and coord_error < milestone.coord_threshold
+                        ):
                             milestones_achieved[milestone.name] = True
                     else:
                         # No coord threshold - type match is sufficient
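Note: a milestone such as clicked_login is credited only when the ground-truth step has the expected type, the predicted type matches, and, if a coord_threshold is set, the coordinate error beats it. The predicate restated compactly (a sketch of the logic above, not a drop-in replacement):

    from typing import Optional

    def milestone_hit(
        spec: MilestoneSpec,
        step_idx: int,
        gt_type: str,
        pred_type: str,
        coord_error: Optional[float],
    ) -> bool:
        """True when this step's prediction satisfies the milestone spec."""
        if step_idx != spec.step_index or gt_type != spec.expected_type:
            return False  # milestone does not apply at this step
        if pred_type != spec.expected_type:
            return False  # wrong predicted action type
        if spec.coord_threshold is None:
            return True  # type match alone suffices
        return coord_error is not None and coord_error < spec.coord_threshold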
@@ -428,18 +444,16 @@ def aggregate_metrics(episodes_metrics: List[EpisodeMetrics]) -> AggregateMetrics:

     # Partial credit: average episode progress (step_matches / step_total per episode)
     if eval_episodes:
-        episode_progress_scores = [
-            m.step_matches / m.step_total for m in eval_episodes
-        ]
-        mean_episode_progress = sum(episode_progress_scores) / len(episode_progress_scores)
+        episode_progress_scores = [m.step_matches / m.step_total for m in eval_episodes]
+        mean_episode_progress = sum(episode_progress_scores) / len(
+            episode_progress_scores
+        )
     else:
         mean_episode_progress = None

     # Strict partial: avg(full_step_correct / step_total) - requires type match + click hit
     if eval_episodes:
-        step_scores = [
-            m.full_step_correct / m.step_total for m in eval_episodes
-        ]
+        step_scores = [m.full_step_correct / m.step_total for m in eval_episodes]
         mean_episode_step_score = sum(step_scores) / len(step_scores)
     else:
         mean_episode_step_score = None
@@ -447,7 +461,8 @@
     # Weak episode success: all milestones achieved
     if eval_episodes:
         weak_success_count = sum(
-            1 for m in eval_episodes
+            1
+            for m in eval_episodes
             if m.milestones_achieved and all(m.milestones_achieved.values())
         )
         weak_episode_success_rate = weak_success_count / len(eval_episodes)
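Note: the episode-level rates in aggregate_metrics differ only in what counts as a correct step: mean_episode_progress credits type-level matches, mean_episode_step_score also requires the click to land, and episode_success_rate demands a perfect episode. On toy counts (the tuples are simplified stand-ins for EpisodeMetrics fields):

    # (step_matches, full_step_correct, step_total) for two toy episodes
    episodes = [(4, 3, 5), (5, 5, 5)]

    mean_episode_progress = sum(m / t for m, _, t in episodes) / len(episodes)    # 0.9
    mean_episode_step_score = sum(f / t for _, f, t in episodes) / len(episodes)  # 0.8
    episode_success_rate = sum(m == t for m, _, t in episodes) / len(episodes)    # 0.5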