openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff shows the content changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the versions as they appear in the public registry.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -107
- openadapt_ml/benchmarks/agent.py +297 -374
- openadapt_ml/benchmarks/azure.py +62 -24
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1874 -751
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +1236 -0
- openadapt_ml/benchmarks/vm_monitor.py +1111 -0
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +3194 -89
- openadapt_ml/cloud/ssh_tunnel.py +595 -0
- openadapt_ml/datasets/next_action.py +125 -96
- openadapt_ml/evals/grounding.py +32 -9
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +120 -57
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +732 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +277 -0
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +11 -10
- openadapt_ml/ingest/capture.py +97 -86
- openadapt_ml/ingest/loader.py +120 -69
- openadapt_ml/ingest/synthetic.py +344 -193
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +843 -0
- openadapt_ml/retrieval/embeddings.py +630 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +162 -0
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +27 -14
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +113 -0
- openadapt_ml/schema/converters.py +588 -0
- openadapt_ml/schema/episode.py +470 -0
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +102 -61
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +19 -14
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +16 -17
- openadapt_ml/scripts/train.py +98 -75
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +3255 -19
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +255 -441
- openadapt_ml/training/trl_trainer.py +403 -0
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
- openadapt_ml-0.2.1.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/runner.py +0 -381
- openadapt_ml/benchmarks/waa.py +0 -704
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/evals/trajectory_matching.py

@@ -5,7 +5,7 @@ from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional

 from openadapt_ml.runtime.policy import AgentPolicy
-from openadapt_ml.
+from openadapt_ml.schema import Action, Episode, ActionType


 @dataclass
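For downstream code, the user-visible change in this hunk is the schema migration: the removed openadapt_ml.schemas package (see the deleted schemas/ files in the list above) is replaced by the new openadapt_ml.schema module. The old import line is truncated in this diff, so its exact names are unknown; a hedged sketch of the consumer-side update:

# Before (0.1.0), assuming the symbols lived in the removed openadapt_ml.schemas
# package (exact old names are truncated in this diff):
# from openadapt_ml.schemas import ...        # removed in 0.2.1
# After (0.2.1), per this hunk:
from openadapt_ml.schema import Action, Episode, ActionType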
@@ -15,10 +15,15 @@ class MilestoneSpec:
     A milestone is achieved when, at a specific step, the predicted action
     matches certain criteria (type match + optional coord threshold).
     """
+
     name: str
     step_index: int  # Which step in the episode (0-indexed)
-    expected_type: str  # Expected ground truth action type ("click", "type", "done", etc.)
-    coord_threshold: Optional[float] = None  # If set, coord error must be < this for clicks
+    expected_type: (
+        str  # Expected ground truth action type ("click", "type", "done", etc.)
+    )
+    coord_threshold: Optional[float] = (
+        None  # If set, coord error must be < this for clicks
+    )


 # Predefined milestone specs per scenario
@@ -28,7 +33,9 @@ class MilestoneSpec:
 LOGIN_MILESTONES = [
     MilestoneSpec("typed_username", step_index=1, expected_type="type"),
     MilestoneSpec("typed_password", step_index=3, expected_type="type"),
-    MilestoneSpec("clicked_login", step_index=4, expected_type="click", coord_threshold=0.10),
+    MilestoneSpec(
+        "clicked_login", step_index=4, expected_type="click", coord_threshold=0.10
+    ),
     MilestoneSpec("emitted_done", step_index=5, expected_type="done"),
 ]

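A minimal sketch of how one of these specs is satisfied, mirroring the per-step milestone check in the @@ -247,20 +301,30 @@ hunk of this file further down (the Spec class and achieved function here are illustrative stand-ins, not the package's API):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Spec:
    name: str
    step_index: int
    expected_type: str
    coord_threshold: Optional[float] = None

def achieved(spec: Spec, step_idx: int, pred_type: str, gt_type: str,
             coord_error: Optional[float]) -> bool:
    # Milestone only applies at its step, and only if the ground truth agrees.
    if step_idx != spec.step_index or gt_type != spec.expected_type:
        return False
    if pred_type != spec.expected_type:
        return False
    if spec.coord_threshold is None:
        return True  # type match alone is sufficient
    return coord_error is not None and coord_error < spec.coord_threshold

login_click = Spec("clicked_login", step_index=4, expected_type="click", coord_threshold=0.10)
print(achieved(login_click, 4, "click", "click", coord_error=0.06))  # True: within 10%
print(achieved(login_click, 4, "click", "click", coord_error=0.25))  # False: too far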
@@ -81,33 +88,60 @@ class AggregateMetrics:
     action_type_accuracy: float
     mean_coord_error: Optional[float]
     coord_error_count: int
-    episode_success_rate: Optional[float]  # Strict: all steps must match (renamed from success_pred)
+    episode_success_rate: Optional[
+        float
+    ]  # Strict: all steps must match (renamed from success_pred)
     click_hit_rate: Optional[float]  # Point-based: within 5% of center
-    mean_episode_progress: Optional[float]  # Partial credit: avg(step_matches/step_total)
+    mean_episode_progress: Optional[
+        float
+    ]  # Partial credit: avg(step_matches/step_total)
     # New partial-credit metrics
-    mean_episode_step_score: Optional[float]  # Strict partial: avg(full_step_correct/step_total)
+    mean_episode_step_score: Optional[
+        float
+    ]  # Strict partial: avg(full_step_correct/step_total)
     weak_episode_success_rate: Optional[float]  # Semantic milestones all achieved
     state_success_rate: Optional[float] = None  # From model's State: {"success": true}
-    bbox_hit_rate: Optional[float] = None  # Bbox-based: click anywhere in element bounds
+    bbox_hit_rate: Optional[float] = (
+        None  # Bbox-based: click anywhere in element bounds
+    )
     element_accuracy: Optional[float] = None  # SoM element index accuracy


+def _get_action_type_str(action: Action) -> str:
+    """Get action type as string, handling both enum and string types."""
+    return action.type.value if isinstance(action.type, ActionType) else action.type
+
+
+def _get_normalized_coords(action: Action) -> tuple[Optional[float], Optional[float]]:
+    """Extract normalized coordinates from action."""
+    if action.normalized_coordinates:
+        return action.normalized_coordinates
+    return None, None
+
+
+def _get_bbox(action: Action) -> Optional[tuple[float, float, float, float]]:
+    """Extract bounding box from action, checking element.bounds or raw."""
+    if action.element and action.element.bounds:
+        b = action.element.bounds
+        return (b.x, b.y, b.x + b.width, b.y + b.height)
+    elif action.raw and "bbox" in action.raw:
+        return action.raw["bbox"]
+    return None
+
+
 def compute_coordinate_error(pred_action: Action, gt_action: Action) -> Optional[float]:
     """Compute normalized L2 distance between predicted and ground-truth coords.

     Returns None if either action is missing coordinates.
     """
+    pred_x, pred_y = _get_normalized_coords(pred_action)
+    gt_x, gt_y = _get_normalized_coords(gt_action)

-    if (
-        pred_action.x is None
-        or pred_action.y is None
-        or gt_action.x is None
-        or gt_action.y is None
-    ):
+    if pred_x is None or pred_y is None or gt_x is None or gt_y is None:
         return None

-    dx = pred_action.x - gt_action.x
-    dy = pred_action.y - gt_action.y
+    dx = pred_x - gt_x
+    dy = pred_y - gt_y
     return math.sqrt(dx * dx + dy * dy)

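The new _get_* helpers centralize schema access that was previously inlined as direct attribute reads (pred_action.x and friends). A self-contained check of the distance metric's behavior under the new accessors; StubAction is a hypothetical stand-in exposing only the field the helper reads:

import math
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class StubAction:
    # Hypothetical stand-in: only the field _get_normalized_coords reads.
    normalized_coordinates: Optional[Tuple[float, float]] = None


def get_normalized_coords(action: StubAction):
    # Mirrors _get_normalized_coords from this hunk.
    if action.normalized_coordinates:
        return action.normalized_coordinates
    return None, None


def coordinate_error(pred: StubAction, gt: StubAction) -> Optional[float]:
    # Mirrors compute_coordinate_error: normalized L2 distance, or None.
    pred_x, pred_y = get_normalized_coords(pred)
    gt_x, gt_y = get_normalized_coords(gt)
    if pred_x is None or pred_y is None or gt_x is None or gt_y is None:
        return None
    dx = pred_x - gt_x
    dy = pred_y - gt_y
    return math.sqrt(dx * dx + dy * dy)


print(coordinate_error(StubAction((0.52, 0.31)), StubAction((0.50, 0.30))))  # ~0.0224
print(coordinate_error(StubAction((0.52, 0.31)), StubAction()))              # None: missing coords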
@@ -119,14 +153,16 @@ def is_click_in_bbox(pred_action: Action, gt_action: Action) -> Optional[bool]:
     - False if prediction is outside bbox
     - None if no bbox is available (fall back to coord distance)
     """
-
+    gt_bbox = _get_bbox(gt_action)
+    if gt_bbox is None:
         return None

-
+    pred_x, pred_y = _get_normalized_coords(pred_action)
+    if pred_x is None or pred_y is None:
         return False

-    x_min, y_min, x_max, y_max =
-    return (x_min <=
+    x_min, y_min, x_max, y_max = gt_bbox
+    return (x_min <= pred_x <= x_max) and (y_min <= pred_y <= y_max)


 def evaluate_episode(
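The tri-state return is the contract callers rely on: None means "no ground-truth bbox available, fall back to coordinate distance". A runnable sketch of that contract (function and names here are illustrative, not the module's API; the 0.05 fallback threshold echoes the "within 5% of center" comment on click_hit_rate above):

from typing import Optional

def click_in_bbox_sketch(pred_xy, gt_bbox) -> Optional[bool]:
    if gt_bbox is None:
        return None                      # no bbox: caller falls back to coord error
    if pred_xy is None:
        return False
    x, y = pred_xy
    x_min, y_min, x_max, y_max = gt_bbox
    return (x_min <= x <= x_max) and (y_min <= y <= y_max)

print(click_in_bbox_sketch((0.5, 0.5), (0.4, 0.4, 0.6, 0.6)))  # True
print(click_in_bbox_sketch((0.9, 0.9), (0.4, 0.4, 0.6, 0.6)))  # False
print(click_in_bbox_sketch((0.5, 0.5), None))                  # None -> score by distance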
@@ -177,7 +213,7 @@ def evaluate_episode(

     for step_idx, step in enumerate(episode.steps):
         # Skip steps without an image; the dataset builder does the same.
-        if not step.observation.
+        if not step.observation.screenshot_path:
             continue

         if sample_idx >= len(samples):
@@ -186,16 +222,22 @@
         sample = samples[sample_idx]
         sample_idx += 1

-        pred_action, _thought, pred_state, raw_text = policy.predict_action_from_sample(sample)
+        pred_action, _thought, pred_state, raw_text = policy.predict_action_from_sample(
+            sample
+        )
         gt_action = step.action

+        # Get action types as strings for comparison
+        pred_type_str = _get_action_type_str(pred_action)
+        gt_type_str = _get_action_type_str(gt_action)
+
         # Track state-based success from final step
         if pred_state and isinstance(pred_state, dict):
             success_val = pred_state.get("success")
             if isinstance(success_val, bool):
                 last_state_success = success_val

-        type_match =
+        type_match = pred_type_str == gt_type_str
         if type_match:
             step_matches += 1
         else:
@@ -203,17 +245,30 @@

         coord_error: Optional[float] = None
         click_hit = False
-        bbox_hit = False
         element_hit = False

+        # Helper to get element index - check element.element_id or raw field
+        def _get_element_index(action: Action) -> Optional[int]:
+            if action.element and action.element.element_id:
+                try:
+                    return int(action.element.element_id)
+                except (ValueError, TypeError):
+                    pass
+            if action.raw and "element_index" in action.raw:
+                return action.raw["element_index"]
+            return None
+
+        gt_element_index = _get_element_index(gt_action)
+        pred_element_index = _get_element_index(pred_action)
+
         # SoM mode: evaluate by element index for click/drag/type actions
-        if use_som and
-            if
+        if use_som and gt_type_str in {"click", "drag", "type"}:
+            if gt_element_index is not None:
                 element_total += 1
-                if
+                if pred_element_index == gt_element_index:
                     element_hits += 1
                     element_hit = True
-        elif
+        elif gt_type_str in {"click", "drag"}:
             # Coordinate mode: evaluate by coordinate distance
             coord_error = compute_coordinate_error(pred_action, gt_action)
             if coord_error is not None:
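In SoM (Set-of-Marks) mode the evaluator compares element indices rather than pixels. The nested helper prefers a numeric element.element_id and falls back to raw["element_index"]. A runnable sketch of that resolution order, using duck-typed stand-ins for the schema objects:

from types import SimpleNamespace

def get_element_index(action):
    # Mirrors the nested helper above: element.element_id first,
    # then the raw-dict fallback, else None.
    if action.element and action.element.element_id:
        try:
            return int(action.element.element_id)
        except (ValueError, TypeError):
            pass
    if action.raw and "element_index" in action.raw:
        return action.raw["element_index"]
    return None

a = SimpleNamespace(element=SimpleNamespace(element_id="7"), raw=None)
b = SimpleNamespace(element=None, raw={"element_index": 3})
c = SimpleNamespace(element=SimpleNamespace(element_id="btn-ok"), raw={"element_index": 3})
print(get_element_index(a), get_element_index(b), get_element_index(c))  # 7 3 3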
@@ -229,15 +284,14 @@
                 bbox_total += 1
                 if in_bbox:
                     bbox_hits += 1
-                    bbox_hit = True

         # Full step correctness: type matches AND element/coord match for relevant actions
         if type_match:
-            if use_som and
+            if use_som and gt_type_str in {"click", "drag", "type"}:
                 # SoM mode: require element index match
                 if element_hit:
                     full_step_correct += 1
-            elif
+            elif gt_type_str in {"click", "drag"}:
                 # Coordinate mode: require click hit
                 if click_hit:
                     full_step_correct += 1
@@ -247,20 +301,30 @@

         # Track semantic milestones using the milestone spec
         for milestone in milestones:
-            if
-
+            if (
+                step_idx == milestone.step_index
+                and gt_type_str == milestone.expected_type
+            ):
+                if pred_type_str == milestone.expected_type:
                     # Check coord threshold if specified (for click actions)
                     if milestone.coord_threshold is not None:
-                        if
+                        if (
+                            coord_error is not None
+                            and coord_error < milestone.coord_threshold
+                        ):
                             milestones_achieved[milestone.name] = True
                     else:
                         # No coord threshold - type match is sufficient
                         milestones_achieved[milestone.name] = True

         # Ensure DONE is correct at the DONE step.
-        if
+        if gt_type_str == "done" and pred_type_str != "done":
             success_pred = False

+        # Get normalized coordinates for logging
+        pred_x, pred_y = _get_normalized_coords(pred_action)
+        gt_x, gt_y = _get_normalized_coords(gt_action)
+
         # Optional logging of this step.
         if log_fn is not None and (log_limit is None or logged_count < log_limit):
             messages = sample.get("messages", [])
@@ -273,30 +337,30 @@
                     user_prompt = m.get("content")

             record: Dict[str, Any] = {
-                "episode_id": episode.
+                "episode_id": episode.episode_id,
                 "step_index": step_idx,
-                "goal": episode.
+                "goal": episode.instruction,
                 "system_prompt": system_prompt,
                 "user_prompt": user_prompt,
                 "model_output_raw": raw_text,
                 "pred_action": {
-                    "type":
-                    "x":
-                    "y":
+                    "type": pred_type_str,
+                    "x": pred_x,
+                    "y": pred_y,
                     "text": pred_action.text,
-                    "element_index":
+                    "element_index": pred_element_index,
                 },
                 "ground_truth_action": {
-                    "type":
-                    "x":
-                    "y":
+                    "type": gt_type_str,
+                    "x": gt_x,
+                    "y": gt_y,
                     "text": gt_action.text,
-                    "element_index":
+                    "element_index": gt_element_index,
                 },
-                "correct_type":
+                "correct_type": pred_type_str == gt_type_str,
                 "coord_error_norm": coord_error,
-                "element_match":
-                if
+                "element_match": pred_element_index == gt_element_index
+                if gt_element_index is not None
                 else None,
             }

@@ -306,7 +370,7 @@
             step_total += 1

     metrics = EpisodeMetrics(
-        episode_id=episode.
+        episode_id=episode.episode_id,
         step_matches=step_matches,
         step_total=step_total,
         coord_errors=coord_errors,
@@ -380,18 +444,16 @@ def aggregate_metrics(episodes_metrics: List[EpisodeMetrics]) -> AggregateMetric

     # Partial credit: average episode progress (step_matches / step_total per episode)
     if eval_episodes:
-        episode_progress_scores = [
-            m.step_matches / m.step_total for m in eval_episodes
-        ]
-        mean_episode_progress = sum(episode_progress_scores) / len(episode_progress_scores)
+        episode_progress_scores = [m.step_matches / m.step_total for m in eval_episodes]
+        mean_episode_progress = sum(episode_progress_scores) / len(
+            episode_progress_scores
+        )
     else:
         mean_episode_progress = None

     # Strict partial: avg(full_step_correct / step_total) - requires type match + click hit
     if eval_episodes:
-        step_scores = [
-            m.full_step_correct / m.step_total for m in eval_episodes
-        ]
+        step_scores = [m.full_step_correct / m.step_total for m in eval_episodes]
         mean_episode_step_score = sum(step_scores) / len(step_scores)
     else:
         mean_episode_step_score = None
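The two aggregate scores differ only in the numerator: mean_episode_progress counts type-only matches, while mean_episode_step_score counts fully correct steps (type plus coord/element hit). A toy computation with invented numbers, purely for illustration:

# Toy EpisodeMetrics-like rows: (step_matches, full_step_correct, step_total)
episodes = [(4, 3, 5), (5, 5, 5), (1, 0, 4)]

mean_episode_progress = sum(m / t for m, _, t in episodes) / len(episodes)
mean_episode_step_score = sum(f / t for _, f, t in episodes) / len(episodes)

print(round(mean_episode_progress, 3))    # 0.683: type matches only
print(round(mean_episode_step_score, 3))  # 0.533: type + coord/element must match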
@@ -399,7 +461,8 @@ def aggregate_metrics(episodes_metrics: List[EpisodeMetrics]) -> AggregateMetric
     # Weak episode success: all milestones achieved
     if eval_episodes:
         weak_success_count = sum(
-            1 for m in eval_episodes
+            1
+            for m in eval_episodes
             if m.milestones_achieved and all(m.milestones_achieved.values())
         )
         weak_episode_success_rate = weak_success_count / len(eval_episodes)
openadapt_ml/experiments/demo_prompt/__init__.py (new file)

@@ -0,0 +1,19 @@
+"""Demo-conditioned prompt experiment.
+
+Tests whether including a human demonstration in the prompt
+improves VLM agent performance on similar tasks.
+"""
+
+from openadapt_ml.experiments.demo_prompt.format_demo import (
+    format_episode_as_demo,
+    format_action,
+)
+from openadapt_ml.experiments.demo_prompt.run_experiment import (
+    DemoPromptExperiment,
+)
+
+__all__ = [
+    "format_episode_as_demo",
+    "format_action",
+    "DemoPromptExperiment",
+]
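Typical usage of the re-exported API; the loader that produces the episode value is not part of this diff, so the call shape here is illustrative:

# Illustrative only: how the re-exports are meant to be combined.
from openadapt_ml.experiments.demo_prompt import format_episode_as_demo

def build_prompt(episode, task: str) -> str:
    # Prepend a formatted human demonstration to the task instruction.
    demo_text = format_episode_as_demo(episode, max_steps=10, include_screenshots=False)
    return f"{demo_text}\n\nNow complete this task: {task}"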
openadapt_ml/experiments/demo_prompt/format_demo.py (new file)

@@ -0,0 +1,236 @@
+"""Demo formatting utilities for few-shot prompting."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from openadapt_ml.schema import Action, Episode, Step
+
+
+def format_action(action: "Action") -> str:
+    """Format an Action as a string for the prompt.
+
+    Args:
+        action: Action to format.
+
+    Returns:
+        String representation like "CLICK(0.5, 0.3)" or "TYPE('hello')".
+    """
+    # Get action type value (handle both enum and string)
+    action_type = action.type.value if hasattr(action.type, "value") else action.type
+
+    if action_type == "click":
+        if action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+            return f"CLICK({x:.3f}, {y:.3f})"
+        return "CLICK()"
+
+    elif action_type == "double_click":
+        if action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+            return f"DOUBLE_CLICK({x:.3f}, {y:.3f})"
+        return "DOUBLE_CLICK()"
+
+    elif action_type == "type":
+        text = action.text or ""
+        # Escape quotes and truncate if very long
+        text = text.replace('"', '\\"')
+        if len(text) > 50:
+            text = text[:47] + "..."
+        return f'TYPE("{text}")'
+
+    elif action_type == "key":
+        key = action.key or "unknown"
+        if action.modifiers:
+            mods = "+".join(action.modifiers)
+            return f"KEY({mods}+{key})"
+        return f"KEY({key})"
+
+    elif action_type == "scroll":
+        direction = action.scroll_direction or "down"
+        return f"SCROLL({direction})"
+
+    elif action_type == "drag":
+        if (
+            action.normalized_coordinates is not None
+            and action.normalized_end is not None
+        ):
+            x, y = action.normalized_coordinates
+            end_x, end_y = action.normalized_end
+            return f"DRAG({x:.3f}, {y:.3f}, {end_x:.3f}, {end_y:.3f})"
+        return "DRAG()"
+
+    else:
+        return f"{action_type.upper()}()"
+
+
+def format_step(step: "Step", step_num: int) -> str:
+    """Format a single step for the demo.
+
+    Args:
+        step: Step to format.
+        step_num: Step number (1-indexed).
+
+    Returns:
+        Formatted step string.
+    """
+    lines = [f"Step {step_num}:"]
+
+    # Add window context if available
+    if step.observation and step.observation.window_title:
+        lines.append(f" Window: {step.observation.window_title}")
+
+    # Add action
+    if step.action:
+        action_str = format_action(step.action)
+        lines.append(f" Action: {action_str}")
+
+    return "\n".join(lines)
+
+
+def format_episode_as_demo(
+    episode: "Episode",
+    max_steps: int = 10,
+    include_screenshots: bool = False,
+) -> str:
+    """Convert an Episode to a few-shot demo format.
+
+    Args:
+        episode: Episode containing the demonstration.
+        max_steps: Maximum number of steps to include.
+        include_screenshots: Whether to include screenshot paths (for multi-image).
+
+    Returns:
+        Formatted demo string for prompt injection.
+    """
+    lines = [
+        "DEMONSTRATION:",
+        f"Task: {episode.instruction}",
+        "",
+    ]
+
+    for i, step in enumerate(episode.steps[:max_steps], 1):
+        lines.append(format_step(step, i))
+
+        # Optionally include screenshot reference
+        if (
+            include_screenshots
+            and step.observation
+            and step.observation.screenshot_path
+        ):
+            lines.append(f" [Screenshot: {step.observation.screenshot_path}]")
+
+        lines.append("")
+
+    lines.append("---")
+    return "\n".join(lines)
+
+
+def format_episode_verbose(
+    episode: "Episode",
+    max_steps: int = 10,
+) -> str:
+    """Format episode with more context per step.
+
+    Includes:
+    - Screen summary
+    - User intent (inferred)
+    - Action taken
+    - Observed result
+
+    Args:
+        episode: Episode to format.
+        max_steps: Maximum steps to include.
+
+    Returns:
+        Verbose demo string.
+    """
+    lines = [
+        "DEMONSTRATION:",
+        f"Goal: {episode.instruction}",
+        "",
+        "The following shows the step-by-step procedure:",
+        "",
+    ]
+
+    for i, step in enumerate(episode.steps[:max_steps], 1):
+        lines.append(f"Step {i}:")
+
+        # Screen summary
+        if step.observation:
+            if step.observation.window_title:
+                lines.append(f" [Screen: {step.observation.window_title}]")
+
+        # Action taken
+        if step.action:
+            action_str = format_action(step.action)
+            lines.append(f" [Action: {action_str}]")
+
+        # Observed result (inferred from next step's observation)
+        if i < len(episode.steps):
+            next_step = episode.steps[i]
+            if next_step.observation and next_step.observation.window_title:
+                if (
+                    not step.observation
+                    or next_step.observation.window_title
+                    != step.observation.window_title
+                ):
+                    lines.append(
+                        f" [Result: Window changed to {next_step.observation.window_title}]"
+                    )
+
+        lines.append("")
+
+    lines.append("---")
+    return "\n".join(lines)
+
+
+def get_demo_screenshot_paths(
+    episode: "Episode",
+    max_steps: int = 10,
+) -> list[str]:
+    """Get screenshot paths from episode for multi-image prompting.
+
+    Args:
+        episode: Episode to extract screenshots from.
+        max_steps: Maximum steps to include.
+
+    Returns:
+        List of screenshot paths.
+    """
+    paths = []
+    for step in episode.steps[:max_steps]:
+        if step.observation and step.observation.screenshot_path:
+            path = step.observation.screenshot_path
+            if Path(path).exists():
+                paths.append(path)
+    return paths
+
+
+def generate_length_matched_control(demo: str) -> str:
+    """Generate a control prompt with the same token count but no trajectory info.
+
+    Used to control for prompt length effects.
+
+    Args:
+        demo: The demo string to match length of.
+
+    Returns:
+        Control string of similar length with irrelevant content.
+    """
+    # Use generic placeholder text
+    placeholder = (
+        "This is placeholder text that serves as a control condition. "
+        "It contains no relevant information about the task or demonstration. "
+        "The purpose is to match the token count of the demonstration prompt. "
+    )
+
+    # Repeat to match approximate length
+    target_len = len(demo)
+    control = ""
+    while len(control) < target_len:
+        control += placeholder
+
+    return control[:target_len]
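A quick demonstration of the formatter output and the length-matched control, using a hypothetical one-step episode; the SimpleNamespace objects are duck-typed stand-ins for the real Episode/Step/Action classes in openadapt_ml.schema:

from types import SimpleNamespace
from openadapt_ml.experiments.demo_prompt.format_demo import (
    format_episode_as_demo,
    generate_length_matched_control,
)

# Duck-typed stand-ins exposing only the fields the formatters read.
step = SimpleNamespace(
    observation=SimpleNamespace(window_title="System Settings", screenshot_path=None),
    action=SimpleNamespace(type="click", normalized_coordinates=(0.031, 0.012), text=None),
)
episode = SimpleNamespace(instruction="Turn off Night Shift", steps=[step])

demo = format_episode_as_demo(episode, max_steps=10)
print(demo)
# DEMONSTRATION:
# Task: Turn off Night Shift
#
# Step 1:
#  Window: System Settings
#  Action: CLICK(0.031, 0.012)
#
# ---

control = generate_length_matched_control(demo)
assert len(control) == len(demo)  # same length, no trajectory information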
openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json (new file)

@@ -0,0 +1,83 @@
+{
+  "timestamp": "20251231_002125",
+  "provider": "anthropic",
+  "screenshot": "/Users/abrichr/oa/src/openadapt-capture/turn-off-nightshift/screenshots/capture_31807990_step_0.png",
+  "demo": "DEMONSTRATION:\nGoal: Turn off Night Shift in macOS System Settings\n\nThe following shows the step-by-step procedure:\n\nStep 1:\n [Screen: Desktop with Terminal window visible]\n [Action: CLICK(0.01, 0.01) - Click Apple menu icon in top-left]\n [Result: Apple menu dropdown opened]\n\nStep 2:\n [Screen: Apple menu visible with options]\n [Action: CLICK on \"System Settings...\" menu item]\n [Result: System Settings application opened]\n\nStep 3:\n [Screen: System Settings window with sidebar]\n [Action: CLICK on \"Displays\" in the sidebar]\n [Result: Displays panel shown in main area]\n\nStep 4:\n [Screen: Displays panel showing display settings]\n [Action: CLICK on \"Night Shift...\" button]\n [Result: Night Shift popup/sheet appeared]\n\nStep 5:\n [Screen: Night Shift popup with Schedule dropdown]\n [Action: CLICK on Schedule dropdown, select \"Off\"]\n [Result: Night Shift schedule set to Off, Night Shift disabled]\n\n---",
+  "test_cases": [
+    {
+      "test_case": {
+        "name": "near_toggle",
+        "task": "Turn ON Night Shift in macOS System Settings",
+        "similarity": "near",
+        "notes": "Same procedure, just toggle to opposite state"
+      },
+      "results": {
+        "zero_shot": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to turn ON Night Shift in macOS. Based on the terminal window showing an attempt to control Night Shift, I should access System Settings to properly enable it. Let me click on the Apple menu.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        },
+        "with_demo": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to turn ON Night Shift in macOS System Settings. Let me start by clicking on the Apple menu icon to access System Settings.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        },
+        "control": {
+          "action": "CLICK(1243, 8)",
+          "response": "I need to analyze the screenshot to understand how to turn on Night Shift in macOS System Settings.\n\nThe screenshot shows a Terminal window with some command output. To access Night Shift settings in macOS, I need to open System Settings. I can see various system icons in the menu bar at the top of the screen.\n\nACTION: CLICK(1243, 8)",
          "error": null
+        }
+      }
+    },
+    {
+      "test_case": {
+        "name": "medium_same_panel",
+        "task": "Adjust Night Shift color temperature to warmer setting",
+        "similarity": "medium",
+        "notes": "Same navigation path, different final action"
+      },
+      "results": {
+        "zero_shot": {
+          "action": "CLICK(1218, 8)",
+          "response": "The task requires adjusting the Night Shift color temperature to a warmer setting. Looking at the screenshot, I can see a macOS Terminal window. The system menu bar at the top right contains various control icons. I need to find the Night Shift or display settings controls.\n\nACTION: CLICK(1218, 8)",
+          "error": null
+        },
+        "with_demo": {
+          "action": "CLICK(19, 8)",
+          "response": "I need to adjust the Night Shift color temperature to make it warmer. First, I should access the System Settings to find the Night Shift controls.\n\nACTION: CLICK(19, 8)",
+          "error": null
+        },
+        "control": {
+          "action": "CLICK(1114, 8)",
+          "response": "I need to analyze the screenshot and the goal to determine the next action.\n\nThe screenshot shows a Terminal window on macOS. The goal is to \"Adjust Night Shift color temperature to warmer setting.\"\n\nLooking at the top menu bar, I can see various system icons. To adjust Night Shift settings, I would need to access System Preferences or use the Control Center. I can see what appears to be system control icons in the top-right corner of the screen.\n\nThe most direct way to adjust Night Shift would be to click on the Control Center icon or the display settings in the menu bar.\n\nACTION: CLICK(1114, 8)",
+          "error": null
+        }
+      }
+    },
+    {
+      "test_case": {
+        "name": "far_different_setting",
+        "task": "Turn on True Tone display in macOS System Settings",
+        "similarity": "far",
+        "notes": "Same app but different panel (True Tone is in Displays)"
+      },
+      "results": {
+        "zero_shot": {
+          "action": "CLICK(1217, 8)",
+          "response": "I need to access System Settings to turn on True Tone display. Let me analyze the screen to find how to access it.\n\nACTION: CLICK(1217, 8)",
+          "error": null
+        },
+        "with_demo": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to turn on True Tone display in macOS System Settings. Let me start by clicking on the Apple menu in the top-left corner.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        },
+        "control": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to open System Settings to enable True Tone display. I'll click on the Apple menu in the top-left corner to access System Settings.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        }
+      }
+    }
+  ]
+}