openadapt_ml-0.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. openadapt_ml/__init__.py +0 -0
  2. openadapt_ml/benchmarks/__init__.py +125 -0
  3. openadapt_ml/benchmarks/agent.py +825 -0
  4. openadapt_ml/benchmarks/azure.py +761 -0
  5. openadapt_ml/benchmarks/base.py +366 -0
  6. openadapt_ml/benchmarks/cli.py +884 -0
  7. openadapt_ml/benchmarks/data_collection.py +432 -0
  8. openadapt_ml/benchmarks/runner.py +381 -0
  9. openadapt_ml/benchmarks/waa.py +704 -0
  10. openadapt_ml/cloud/__init__.py +5 -0
  11. openadapt_ml/cloud/azure_inference.py +441 -0
  12. openadapt_ml/cloud/lambda_labs.py +2445 -0
  13. openadapt_ml/cloud/local.py +790 -0
  14. openadapt_ml/config.py +56 -0
  15. openadapt_ml/datasets/__init__.py +0 -0
  16. openadapt_ml/datasets/next_action.py +507 -0
  17. openadapt_ml/evals/__init__.py +23 -0
  18. openadapt_ml/evals/grounding.py +241 -0
  19. openadapt_ml/evals/plot_eval_metrics.py +174 -0
  20. openadapt_ml/evals/trajectory_matching.py +486 -0
  21. openadapt_ml/grounding/__init__.py +45 -0
  22. openadapt_ml/grounding/base.py +236 -0
  23. openadapt_ml/grounding/detector.py +570 -0
  24. openadapt_ml/ingest/__init__.py +43 -0
  25. openadapt_ml/ingest/capture.py +312 -0
  26. openadapt_ml/ingest/loader.py +232 -0
  27. openadapt_ml/ingest/synthetic.py +1102 -0
  28. openadapt_ml/models/__init__.py +0 -0
  29. openadapt_ml/models/api_adapter.py +171 -0
  30. openadapt_ml/models/base_adapter.py +59 -0
  31. openadapt_ml/models/dummy_adapter.py +42 -0
  32. openadapt_ml/models/qwen_vl.py +426 -0
  33. openadapt_ml/runtime/__init__.py +0 -0
  34. openadapt_ml/runtime/policy.py +182 -0
  35. openadapt_ml/schemas/__init__.py +53 -0
  36. openadapt_ml/schemas/sessions.py +122 -0
  37. openadapt_ml/schemas/validation.py +252 -0
  38. openadapt_ml/scripts/__init__.py +0 -0
  39. openadapt_ml/scripts/compare.py +1490 -0
  40. openadapt_ml/scripts/demo_policy.py +62 -0
  41. openadapt_ml/scripts/eval_policy.py +287 -0
  42. openadapt_ml/scripts/make_gif.py +153 -0
  43. openadapt_ml/scripts/prepare_synthetic.py +43 -0
  44. openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
  45. openadapt_ml/scripts/train.py +174 -0
  46. openadapt_ml/training/__init__.py +0 -0
  47. openadapt_ml/training/benchmark_viewer.py +1538 -0
  48. openadapt_ml/training/shared_ui.py +157 -0
  49. openadapt_ml/training/stub_provider.py +276 -0
  50. openadapt_ml/training/trainer.py +2446 -0
  51. openadapt_ml/training/viewer.py +2970 -0
  52. openadapt_ml-0.1.0.dist-info/METADATA +818 -0
  53. openadapt_ml-0.1.0.dist-info/RECORD +55 -0
  54. openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
  55. openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,241 @@
+"""Grounding-specific evaluation metrics.
+
+This module provides metrics for evaluating grounding accuracy independent
+of policy performance, as described in the architecture document.
+
+Metrics:
+- bbox_iou: Intersection over Union with ground-truth element bbox
+- centroid_hit_rate: Whether click point lands inside correct element
+- oracle_hit_rate@k: Any of top-k candidates correct
+- grounding_latency: Time per grounding call
+"""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from PIL import Image
+
+    from openadapt_ml.grounding.base import GroundingModule, RegionCandidate
+
+
+@dataclass
+class GroundingResult:
+    """Result of a single grounding evaluation."""
+
+    target_description: str
+    ground_truth_bbox: tuple[float, float, float, float] | None
+    predicted_candidates: list["RegionCandidate"]
+    latency_ms: float
+
+    # Computed metrics
+    best_iou: float = 0.0
+    centroid_hit: bool = False
+    oracle_hit_at_k: dict[int, bool] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        """Compute metrics from predictions and ground truth."""
+        if not self.ground_truth_bbox or not self.predicted_candidates:
+            return
+
+        gt_x1, gt_y1, gt_x2, gt_y2 = self.ground_truth_bbox
+
+        for k, candidate in enumerate(self.predicted_candidates, start=1):
+            # IoU
+            iou = self._compute_iou(candidate.bbox, self.ground_truth_bbox)
+            if iou > self.best_iou:
+                self.best_iou = iou
+
+            # Centroid hit
+            cx, cy = candidate.centroid
+            if gt_x1 <= cx <= gt_x2 and gt_y1 <= cy <= gt_y2:
+                if not self.centroid_hit:
+                    self.centroid_hit = True
+
+            # Oracle hit at k (if any candidate up to k is a hit)
+            hit = iou > 0.5 or (gt_x1 <= cx <= gt_x2 and gt_y1 <= cy <= gt_y2)
+            if hit:
+                # Mark all k >= current k as hits
+                for check_k in range(k, max(len(self.predicted_candidates) + 1, 6)):
+                    self.oracle_hit_at_k[check_k] = True
+
+    def _compute_iou(
+        self,
+        bbox1: tuple[float, float, float, float],
+        bbox2: tuple[float, float, float, float],
+    ) -> float:
+        """Compute IoU between two bboxes."""
+        x1, y1, x2, y2 = bbox1
+        ox1, oy1, ox2, oy2 = bbox2
+
+        # Intersection
+        ix1 = max(x1, ox1)
+        iy1 = max(y1, oy1)
+        ix2 = min(x2, ox2)
+        iy2 = min(y2, oy2)
+
+        if ix1 >= ix2 or iy1 >= iy2:
+            return 0.0
+
+        intersection = (ix2 - ix1) * (iy2 - iy1)
+        area1 = (x2 - x1) * (y2 - y1)
+        area2 = (ox2 - ox1) * (oy2 - oy1)
+        union = area1 + area2 - intersection
+
+        return intersection / union if union > 0 else 0.0
+
+
+@dataclass
+class GroundingMetrics:
+    """Aggregated grounding metrics across multiple evaluations."""
+
+    results: list[GroundingResult] = field(default_factory=list)
+
+    @property
+    def count(self) -> int:
+        """Number of evaluated samples."""
+        return len(self.results)
+
+    @property
+    def mean_iou(self) -> float:
+        """Mean IoU across all samples."""
+        if not self.results:
+            return 0.0
+        return sum(r.best_iou for r in self.results) / len(self.results)
+
+    @property
+    def centroid_hit_rate(self) -> float:
+        """Fraction of samples where centroid hit ground truth."""
+        if not self.results:
+            return 0.0
+        return sum(1 for r in self.results if r.centroid_hit) / len(self.results)
+
+    def oracle_hit_rate(self, k: int = 1) -> float:
+        """Fraction of samples where any of top-k candidates hit.
+
+        Args:
+            k: Number of candidates to consider.
+
+        Returns:
+            Hit rate in [0, 1].
+        """
+        if not self.results:
+            return 0.0
+        hits = sum(1 for r in self.results if r.oracle_hit_at_k.get(k, False))
+        return hits / len(self.results)
+
+    @property
+    def mean_latency_ms(self) -> float:
+        """Mean grounding latency in milliseconds."""
+        if not self.results:
+            return 0.0
+        return sum(r.latency_ms for r in self.results) / len(self.results)
+
+    def summary(self) -> dict:
+        """Return summary dict of all metrics."""
+        return {
+            "count": self.count,
+            "mean_iou": self.mean_iou,
+            "centroid_hit_rate": self.centroid_hit_rate,
+            "oracle_hit_rate@1": self.oracle_hit_rate(1),
+            "oracle_hit_rate@3": self.oracle_hit_rate(3),
+            "oracle_hit_rate@5": self.oracle_hit_rate(5),
+            "mean_latency_ms": self.mean_latency_ms,
+        }
+
+    def __str__(self) -> str:
+        """Pretty-print metrics summary."""
+        s = self.summary()
+        return (
+            f"Grounding Metrics (n={s['count']}):\n"
+            f"  Mean IoU: {s['mean_iou']:.3f}\n"
+            f"  Centroid Hit Rate: {s['centroid_hit_rate']:.3f}\n"
+            f"  Oracle Hit @1: {s['oracle_hit_rate@1']:.3f}\n"
+            f"  Oracle Hit @3: {s['oracle_hit_rate@3']:.3f}\n"
+            f"  Oracle Hit @5: {s['oracle_hit_rate@5']:.3f}\n"
+            f"  Mean Latency: {s['mean_latency_ms']:.1f}ms"
+        )
+
+
+def evaluate_grounder(
+    grounder: "GroundingModule",
+    test_cases: list[tuple["Image", str, tuple[float, float, float, float]]],
+    k: int = 5,
+) -> GroundingMetrics:
+    """Evaluate a grounding module on test cases.
+
+    Args:
+        grounder: GroundingModule to evaluate.
+        test_cases: List of (image, target_description, ground_truth_bbox) tuples.
+        k: Number of candidates to request from grounder.
+
+    Returns:
+        GroundingMetrics with aggregated results.
+    """
+    metrics = GroundingMetrics()
+
+    for image, target_desc, gt_bbox in test_cases:
+        start = time.perf_counter()
+        candidates = grounder.ground(image, target_desc, k=k)
+        latency_ms = (time.perf_counter() - start) * 1000
+
+        result = GroundingResult(
+            target_description=target_desc,
+            ground_truth_bbox=gt_bbox,
+            predicted_candidates=candidates,
+            latency_ms=latency_ms,
+        )
+        metrics.results.append(result)
+
+    return metrics
+
+
+def evaluate_grounder_on_episode(
+    grounder: "GroundingModule",
+    episode: "Episode",
+    k: int = 5,
+) -> GroundingMetrics:
+    """Evaluate a grounding module on an Episode's click actions.
+
+    Only evaluates steps with click actions that have ground-truth bboxes.
+
+    Args:
+        grounder: GroundingModule to evaluate.
+        episode: Episode with Steps containing Actions with bboxes.
+        k: Number of candidates to request.
+
+    Returns:
+        GroundingMetrics for click actions with bboxes.
+    """
+    from PIL import Image
+
+    from openadapt_ml.schemas.sessions import Episode
+
+    test_cases = []
+
+    for step in episode.steps:
+        action = step.action
+
+        # Only evaluate clicks with bboxes
+        if action.type not in ("click", "double_click"):
+            continue
+        if action.bbox is None:
+            continue
+        if step.observation.image_path is None:
+            continue
+
+        # Load image
+        try:
+            image = Image.open(step.observation.image_path)
+        except Exception:
+            continue
+
+        # Create target description from thought or action
+        target_desc = step.thought or f"element at ({action.x:.2f}, {action.y:.2f})"
+
+        test_cases.append((image, target_desc, action.bbox))
+
+    return evaluate_grounder(grounder, test_cases, k=k)
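
The hunk above adds openadapt_ml/evals/grounding.py (file 18 in the list). For orientation, here is a minimal usage sketch of evaluate_grounder. Everything named _Candidate, DummyGrounder, and the test-case values are hypothetical, written only to satisfy the interface the metrics code reads (a ground(image, target, k) call returning candidates with bbox and centroid attributes); the package's real grounders live under openadapt_ml/grounding/.

from dataclasses import dataclass

from openadapt_ml.evals.grounding import evaluate_grounder


@dataclass
class _Candidate:
    # Provides the two attributes the metrics code reads: bbox and centroid.
    bbox: tuple[float, float, float, float]

    @property
    def centroid(self) -> tuple[float, float]:
        x1, y1, x2, y2 = self.bbox
        return (x1 + x2) / 2, (y1 + y2) / 2


class DummyGrounder:
    # Hypothetical stand-in: always proposes the same region.
    def ground(self, image, target, k=5):
        return [_Candidate((0.10, 0.20, 0.30, 0.40))][:k]


# test_cases are (image, target_description, ground_truth_bbox) tuples;
# a real run would pass PIL images, e.g. Image.open("screenshot.png").
test_cases = [
    (None, "the Submit button", (0.12, 0.22, 0.28, 0.38)),
]
metrics = evaluate_grounder(DummyGrounder(), test_cases, k=5)
print(metrics)            # pretty summary via __str__
print(metrics.summary())  # dict with mean_iou, hit rates, latency

Because the dummy candidate overlaps the ground-truth box and its centroid falls inside it, this sketch reports a nonzero mean IoU, a centroid hit, and oracle hits at k=1 through 5.
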
@@ -0,0 +1,174 @@
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from typing import Any, Dict, List
+
+import json
+
+import matplotlib.pyplot as plt
+from matplotlib.patches import Patch
+
+
+METRIC_KEYS = [
+    ("action_type_accuracy", "Action Type Accuracy"),
+    ("mean_coord_error", "Mean Coord Error"),
+    ("click_hit_rate", "Click Hit Rate"),
+    ("episode_success_rate", "Strict Episode Success"),
+    ("mean_episode_progress", "Episode Progress"),
+    ("mean_episode_step_score", "Step Score (Type+Click)"),
+    ("weak_episode_success_rate", "Weak Episode Success"),
+]
+
+
+def _load_metrics(path: Path) -> Dict[str, Any]:
+    with path.open("r", encoding="utf-8") as f:
+        payload = json.load(f)
+    return payload.get("metrics", payload)
+
+
+def _get_bar_style(label: str) -> tuple[str, str]:
+    """Determine bar color and hatch pattern based on model label.
+
+    Returns:
+        (color, hatch): color string and hatch pattern
+    """
+    label_lower = label.lower()
+
+    # Determine color based on model type
+    if "claude" in label_lower:
+        color = "#FF6B35"  # Orange for Claude
+    elif "gpt" in label_lower or "openai" in label_lower:
+        color = "#C1121F"  # Red for GPT
+    elif "2b" in label_lower:
+        color = "#4A90E2"  # Light blue for 2B
+    elif "8b" in label_lower:
+        color = "#2E5C8A"  # Dark blue for 8B
+    else:
+        color = "#6C757D"  # Gray for unknown
+
+    # Determine hatch pattern for fine-tuned models
+    if "ft" in label_lower or "fine" in label_lower or "finetuned" in label_lower:
+        hatch = "///"  # Diagonal lines for fine-tuned
+    else:
+        hatch = ""  # Solid for base/API models
+
+    return color, hatch
+
+
+def plot_eval_metrics(
+    metric_files: List[Path],
+    labels: List[str],
+    output_path: Path,
+) -> None:
+    if len(metric_files) != len(labels):
+        raise ValueError("Number of labels must match number of metric files")
+
+    metrics_list = [_load_metrics(p) for p in metric_files]
+
+    num_models = len(metrics_list)
+    num_metrics = len(METRIC_KEYS)
+
+    fig, axes = plt.subplots(1, num_metrics, figsize=(4 * num_metrics, 5))
+    fig.suptitle(
+        "VLM Model Comparison (Offline fine-tuned vs API models)",
+        fontsize=12,
+        fontweight='bold',
+    )
+    if num_metrics == 1:
+        axes = [axes]
+
+    for idx, (key, title) in enumerate(METRIC_KEYS):
+        ax = axes[idx]
+        values: List[float] = []
+        colors: List[str] = []
+        hatches: List[str] = []
+
+        for m, label in zip(metrics_list, labels):
+            v = m.get(key)
+            if v is None:
+                values.append(0.0)
+            else:
+                values.append(float(v))
+
+            color, hatch = _get_bar_style(label)
+            colors.append(color)
+            hatches.append(hatch)
+
+        x = range(num_models)
+        bars = ax.bar(x, values, tick_label=labels, color=colors, edgecolor='black', linewidth=1.2)
+
+        # Apply hatch patterns
+        for bar, hatch in zip(bars, hatches):
+            bar.set_hatch(hatch)
+
+        ax.set_title(title, fontsize=11, fontweight='bold')
+        ax.set_ylabel(key, fontsize=9)
+        ax.set_ylim(bottom=0.0)
+        # Rotate x-axis labels to prevent crowding
+        ax.tick_params(axis='x', labelrotation=45, labelsize=8)
+        # Align labels to the right for better readability when rotated
+        for tick in ax.get_xticklabels():
+            tick.set_horizontalalignment('right')
+
+    fig.tight_layout()
+
+    # Add legend explaining color coding and hatch patterns
+    legend_elements = [
+        Patch(facecolor='#4A90E2', edgecolor='black', label='Qwen3-VL-2B'),
+        Patch(facecolor='#2E5C8A', edgecolor='black', label='Qwen3-VL-8B'),
+        Patch(facecolor='#FF6B35', edgecolor='black', label='Claude (API)'),
+        Patch(facecolor='#C1121F', edgecolor='black', label='GPT (API)'),
+        Patch(facecolor='gray', edgecolor='black', hatch='///', label='Fine-tuned'),
+        Patch(facecolor='gray', edgecolor='black', label='Base/Pretrained'),
+    ]
+
+    fig.legend(
+        handles=legend_elements,
+        loc='lower center',
+        bbox_to_anchor=(0.5, -0.05),
+        ncol=3,
+        fontsize=9,
+        frameon=True,
+    )
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    fig.savefig(output_path, dpi=150, bbox_inches='tight')
+    plt.close(fig)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Plot evaluation metrics (base vs fine-tuned or cross-model).",
+    )
+    parser.add_argument(
+        "--files",
+        type=str,
+        nargs="+",
+        required=True,
+        help="Paths to one or more JSON metric files produced by eval_policy.py.",
+    )
+    parser.add_argument(
+        "--labels",
+        type=str,
+        nargs="+",
+        required=True,
+        help="Labels for each metrics file (e.g. base ft).",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        required=True,
+        help="Output PNG path for the plot.",
+    )
+    args = parser.parse_args()
+
+    files = [Path(p) for p in args.files]
+    labels = list(args.labels)
+    output_path = Path(args.output)
+
+    plot_eval_metrics(files, labels, output_path)
+
+
+if __name__ == "__main__":
+    main()
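
The hunk above adds openadapt_ml/evals/plot_eval_metrics.py (file 19 in the list). A minimal usage sketch follows; the JSON file names, labels, and output path are illustrative only (the metric files would be the per-model outputs of eval_policy.py referenced in the --files help text), while the keyword arguments and CLI flags match the code above.

from pathlib import Path

from openadapt_ml.evals.plot_eval_metrics import plot_eval_metrics

# Hypothetical file names and labels; labels drive the bar color/hatch
# heuristics in _get_bar_style ("2b" -> light blue, "ft" -> hatched, etc.).
plot_eval_metrics(
    metric_files=[Path("metrics_base.json"), Path("metrics_ft.json")],
    labels=["Qwen3-VL-2B", "Qwen3-VL-2B-ft"],
    output_path=Path("plots/eval_comparison.png"),
)

# Roughly equivalent CLI invocation (the module has a __main__ entry point):
#   python -m openadapt_ml.evals.plot_eval_metrics \
#       --files metrics_base.json metrics_ft.json \
#       --labels Qwen3-VL-2B Qwen3-VL-2B-ft \
#       --output plots/eval_comparison.png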