openadapt-ml 0.2.0-py3-none-any.whl → 0.2.2-py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.2.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/data_collection.py (removed)
@@ -1,432 +0,0 @@
-"""Data collection for benchmark viewer integration.
-
-This module handles saving execution traces during benchmark runs for later
-replay in the benchmark viewer. It creates a structured directory layout with
-screenshots, metadata, and execution traces.
-
-Directory structure:
-    benchmark_results/
-    ├── waa_eval_YYYYMMDD_HHMMSS/
-    │   ├── metadata.json
-    │   ├── tasks/
-    │   │   ├── task_001/
-    │   │   │   ├── task.json
-    │   │   │   ├── screenshots/
-    │   │   │   │   ├── step_000.png
-    │   │   │   │   ├── step_001.png
-    │   │   │   │   └── ...
-    │   │   │   └── execution.json
-    │   │   └── task_002/
-    │   │       └── ...
-    │   └── summary.json
-
-Example:
-    from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
-
-    collector = ExecutionTraceCollector(
-        benchmark_name="waa",
-        run_name="waa_eval_20241214",
-        model_id="qwen3vl-2b-epoch5"
-    )
-
-    # During evaluation
-    collector.start_task(task)
-    for step_idx, (obs, action) in enumerate(trajectory):
-        collector.record_step(step_idx, obs, action, reasoning="...")
-    collector.finish_task(result)
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-from dataclasses import asdict, dataclass
-from datetime import datetime
-from pathlib import Path
-from typing import Any
-
-from openadapt_ml.benchmarks.base import (
-    BenchmarkAction,
-    BenchmarkObservation,
-    BenchmarkResult,
-    BenchmarkTask,
-)
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class ExecutionStep:
-    """Single step in execution trace.
-
-    Attributes:
-        step_idx: Step index in the trajectory.
-        screenshot_path: Relative path to screenshot image.
-        action: Action taken at this step.
-        reasoning: Optional reasoning/thought from the agent.
-        timestamp: Timestamp when step was recorded.
-    """
-
-    step_idx: int
-    screenshot_path: str | None
-    action: dict[str, Any]  # Serialized BenchmarkAction
-    reasoning: str | None = None
-    timestamp: float | None = None
-
-
-class ExecutionTraceCollector:
-    """Collects execution traces during benchmark runs.
-
-    This class handles:
-    - Creating the directory structure for a benchmark run
-    - Saving screenshots at each step
-    - Recording actions and reasoning
-    - Saving task results and metadata
-
-    Args:
-        benchmark_name: Name of the benchmark (e.g., "waa", "webarena").
-        run_name: Unique name for this evaluation run (e.g., "waa_eval_20241214").
-        model_id: Identifier for the model being evaluated.
-        output_dir: Base directory for benchmark results (default: "./benchmark_results").
-    """
-
-    def __init__(
-        self,
-        benchmark_name: str,
-        run_name: str | None = None,
-        model_id: str = "unknown",
-        output_dir: str | Path = "benchmark_results",
-    ):
-        self.benchmark_name = benchmark_name
-        self.model_id = model_id
-
-        # Auto-generate run_name if not provided
-        if run_name is None:
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            run_name = f"{benchmark_name}_eval_{timestamp}"
-        self.run_name = run_name
-
-        # Set up directory structure
-        self.output_dir = Path(output_dir)
-        self.run_dir = self.output_dir / run_name
-        self.tasks_dir = self.run_dir / "tasks"
-
-        # Current task tracking
-        self._current_task: BenchmarkTask | None = None
-        self._current_task_dir: Path | None = None
-        self._current_screenshots_dir: Path | None = None
-        self._current_steps: list[ExecutionStep] = []
-
-        # Initialize run
-        self._initialize_run()
-
-    def _initialize_run(self) -> None:
-        """Initialize the benchmark run directory and metadata."""
-        self.run_dir.mkdir(parents=True, exist_ok=True)
-        self.tasks_dir.mkdir(exist_ok=True)
-
-        # Save run metadata
-        metadata = {
-            "benchmark_name": self.benchmark_name,
-            "run_name": self.run_name,
-            "model_id": self.model_id,
-            "created_at": datetime.now().isoformat(),
-        }
-
-        metadata_path = self.run_dir / "metadata.json"
-        with open(metadata_path, "w") as f:
-            json.dump(metadata, f, indent=2)
-
-        logger.info(f"Initialized benchmark run at: {self.run_dir}")
-
-    def start_task(self, task: BenchmarkTask) -> None:
-        """Start collecting data for a new task.
-
-        Args:
-            task: The benchmark task being executed.
-        """
-        if self._current_task is not None:
-            logger.warning(
-                f"Starting new task {task.task_id} without finishing {self._current_task.task_id}"
-            )
-
-        self._current_task = task
-        self._current_steps = []
-
-        # Create task directory
-        task_dir_name = self._sanitize_task_id(task.task_id)
-        self._current_task_dir = self.tasks_dir / task_dir_name
-        self._current_task_dir.mkdir(parents=True, exist_ok=True)
-
-        # Create screenshots directory
-        self._current_screenshots_dir = self._current_task_dir / "screenshots"
-        self._current_screenshots_dir.mkdir(exist_ok=True)
-
-        # Save task definition
-        task_data = {
-            "task_id": task.task_id,
-            "instruction": task.instruction,
-            "domain": task.domain,
-            "initial_state_ref": task.initial_state_ref,
-            "time_limit_steps": task.time_limit_steps,
-            "raw_config": task.raw_config,
-            "evaluation_spec": task.evaluation_spec,
-        }
-
-        task_path = self._current_task_dir / "task.json"
-        with open(task_path, "w") as f:
-            json.dump(task_data, f, indent=2)
-
-        logger.info(f"Started collecting data for task: {task.task_id}")
-
-    def record_step(
-        self,
-        step_idx: int,
-        observation: BenchmarkObservation,
-        action: BenchmarkAction,
-        reasoning: str | None = None,
-    ) -> None:
-        """Record a single step in the execution trace.
-
-        Args:
-            step_idx: Index of this step in the trajectory.
-            observation: Observation at this step.
-            action: Action taken at this step.
-            reasoning: Optional reasoning/thought from the agent.
-        """
-        if self._current_task is None:
-            raise RuntimeError("No task started. Call start_task() first.")
-
-        # Save screenshot if available
-        screenshot_path = None
-        if observation.screenshot is not None:
-            screenshot_path = self._save_screenshot(step_idx, observation.screenshot)
-        elif observation.screenshot_path is not None:
-            # Copy existing screenshot
-            screenshot_path = self._copy_screenshot(step_idx, observation.screenshot_path)
-
-        # Create execution step record
-        step = ExecutionStep(
-            step_idx=step_idx,
-            screenshot_path=screenshot_path,
-            action=self._serialize_action(action),
-            reasoning=reasoning,
-            timestamp=datetime.now().timestamp(),
-        )
-
-        self._current_steps.append(step)
-
-    def finish_task(self, result: BenchmarkResult) -> None:
-        """Finish collecting data for the current task and save execution trace.
-
-        Args:
-            result: The evaluation result for the task.
-        """
-        if self._current_task is None:
-            raise RuntimeError("No task started. Call start_task() first.")
-
-        # Save execution trace
-        execution_data = {
-            "task_id": result.task_id,
-            "model_id": self.model_id,
-            "success": result.success,
-            "score": result.score,
-            "num_steps": result.num_steps,
-            "total_time_seconds": result.total_time_seconds,
-            "error": result.error,
-            "reason": result.reason,
-            "steps": [asdict(step) for step in self._current_steps],
-        }
-
-        execution_path = self._current_task_dir / "execution.json"
-        with open(execution_path, "w") as f:
-            json.dump(execution_data, f, indent=2)
-
-        logger.info(
-            f"Saved execution trace for task {result.task_id}: "
-            f"{'SUCCESS' if result.success else 'FAIL'} ({result.num_steps} steps)"
-        )
-
-        # Clear current task
-        self._current_task = None
-        self._current_task_dir = None
-        self._current_screenshots_dir = None
-        self._current_steps = []
-
-    def save_summary(self, all_results: list[BenchmarkResult]) -> None:
-        """Save summary of all task results.
-
-        Args:
-            all_results: List of all BenchmarkResult objects from the run.
-        """
-        summary = {
-            "benchmark_name": self.benchmark_name,
-            "run_name": self.run_name,
-            "model_id": self.model_id,
-            "num_tasks": len(all_results),
-            "num_success": sum(1 for r in all_results if r.success),
-            "success_rate": sum(1 for r in all_results if r.success) / len(all_results) if all_results else 0.0,
-            "avg_score": sum(r.score for r in all_results) / len(all_results) if all_results else 0.0,
-            "avg_steps": sum(r.num_steps for r in all_results) / len(all_results) if all_results else 0.0,
-            "avg_time_seconds": sum(r.total_time_seconds for r in all_results) / len(all_results) if all_results else 0.0,
-            "tasks": [
-                {
-                    "task_id": r.task_id,
-                    "success": r.success,
-                    "score": r.score,
-                    "num_steps": r.num_steps,
-                    "error": r.error,
-                }
-                for r in all_results
-            ],
-        }
-
-        summary_path = self.run_dir / "summary.json"
-        with open(summary_path, "w") as f:
-            json.dump(summary, f, indent=2)
-
-        logger.info(
-            f"Saved summary: {summary['num_success']}/{summary['num_tasks']} tasks succeeded "
-            f"({summary['success_rate']:.1%})"
-        )
-
-    def _save_screenshot(self, step_idx: int, screenshot_bytes: bytes) -> str:
-        """Save screenshot bytes to file.
-
-        Args:
-            step_idx: Step index for naming the file.
-            screenshot_bytes: PNG image bytes.
-
-        Returns:
-            Relative path to the saved screenshot.
-        """
-        if self._current_screenshots_dir is None:
-            raise RuntimeError("No task started")
-
-        filename = f"step_{step_idx:03d}.png"
-        screenshot_path = self._current_screenshots_dir / filename
-
-        with open(screenshot_path, "wb") as f:
-            f.write(screenshot_bytes)
-
-        # Return relative path from task directory
-        return f"screenshots/{filename}"
-
-    def _copy_screenshot(self, step_idx: int, source_path: str) -> str:
-        """Copy screenshot from existing path.
-
-        Args:
-            step_idx: Step index for naming the file.
-            source_path: Path to existing screenshot.
-
-        Returns:
-            Relative path to the copied screenshot.
-        """
-        if self._current_screenshots_dir is None:
-            raise RuntimeError("No task started")
-
-        filename = f"step_{step_idx:03d}.png"
-        dest_path = self._current_screenshots_dir / filename
-
-        # Copy file
-        import shutil
-        shutil.copy2(source_path, dest_path)
-
-        return f"screenshots/{filename}"
-
-    def _serialize_action(self, action: BenchmarkAction) -> dict[str, Any]:
-        """Serialize BenchmarkAction to dict.
-
-        Args:
-            action: Action to serialize.
-
-        Returns:
-            Dictionary representation of the action.
-        """
-        return {
-            "type": action.type,
-            "x": action.x,
-            "y": action.y,
-            "target_node_id": action.target_node_id,
-            "target_bbox": action.target_bbox,
-            "target_role": action.target_role,
-            "target_name": action.target_name,
-            "text": action.text,
-            "key": action.key,
-            "modifiers": action.modifiers,
-            "scroll_direction": action.scroll_direction,
-            "scroll_amount": action.scroll_amount,
-            "end_x": action.end_x,
-            "end_y": action.end_y,
-            "answer": action.answer,
-            "raw_action": action.raw_action,
-        }
-
-    def _sanitize_task_id(self, task_id: str) -> str:
-        """Sanitize task ID for use as directory name.
-
-        Args:
-            task_id: Original task ID.
-
-        Returns:
-            Sanitized task ID safe for filesystem.
-        """
-        # Replace unsafe characters with underscores
-        safe_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in task_id)
-        return safe_id
-
-
-def save_execution_trace(
-    task: BenchmarkTask,
-    result: BenchmarkResult,
-    trajectory: list[tuple[BenchmarkObservation, BenchmarkAction]],
-    benchmark_name: str,
-    model_id: str = "unknown",
-    output_dir: str | Path = "benchmark_results",
-    run_name: str | None = None,
-    reasoning_map: dict[int, str] | None = None,
-) -> Path:
-    """Convenience function to save a complete execution trace.
-
-    This is a simpler alternative to using ExecutionTraceCollector directly
-    when you have the complete trajectory available.
-
-    Args:
-        task: The benchmark task.
-        result: The evaluation result.
-        trajectory: List of (observation, action) pairs.
-        benchmark_name: Name of the benchmark.
-        model_id: Identifier for the model.
-        output_dir: Base directory for results.
-        run_name: Optional run name (auto-generated if None).
-        reasoning_map: Optional map of step_idx -> reasoning text.
-
-    Returns:
-        Path to the task directory.
-
-    Example:
-        save_execution_trace(
-            task=task,
-            result=result,
-            trajectory=trajectory,
-            benchmark_name="waa",
-            model_id="qwen3vl-2b-epoch5",
-            reasoning_map={0: "I should click the button", 1: "Now type the text"}
-        )
-    """
-    collector = ExecutionTraceCollector(
-        benchmark_name=benchmark_name,
-        run_name=run_name,
-        model_id=model_id,
-        output_dir=output_dir,
-    )
-
-    collector.start_task(task)
-
-    for step_idx, (obs, action) in enumerate(trajectory):
-        reasoning = reasoning_map.get(step_idx) if reasoning_map else None
-        collector.record_step(step_idx, obs, action, reasoning)
-
-    collector.finish_task(result)
-
-    return collector._current_task_dir or collector.tasks_dir
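The deleted module above documents the on-disk layout it writes: run-level metadata.json and summary.json, plus per-task task.json, execution.json, and a screenshots/ directory. As a rough illustration of consuming that layout, here is a minimal read-back sketch in Python; it assumes a run directory produced by the 0.2.0 collector, and the directory name shown is only an example of the auto-generated "<benchmark_name>_eval_<YYYYMMDD_HHMMSS>" pattern.

import json
from pathlib import Path

# Example run directory; real names follow the collector's auto-generated pattern.
run_dir = Path("benchmark_results/waa_eval_20241214_120000")

# Run-level summary written by ExecutionTraceCollector.save_summary().
summary = json.loads((run_dir / "summary.json").read_text())
print(f"{summary['num_success']}/{summary['num_tasks']} tasks succeeded ({summary['success_rate']:.1%})")

# Per-task execution traces written by finish_task().
for task_dir in sorted((run_dir / "tasks").iterdir()):
    execution = json.loads((task_dir / "execution.json").read_text())
    status = "SUCCESS" if execution["success"] else "FAIL"
    print(f"{execution['task_id']}: {status} in {execution['num_steps']} steps")
    for step in execution["steps"]:
        # screenshot_path is stored relative to the task directory, e.g. "screenshots/step_000.png".
        screenshot = task_dir / step["screenshot_path"] if step["screenshot_path"] else None
        print(f"  step {step['step_idx']}: {step['action']['type']} -> {screenshot}")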
openadapt_ml/benchmarks/live_tracker.py (removed)
@@ -1,180 +0,0 @@
-"""Live evaluation progress tracker for benchmark viewer.
-
-This module provides a tracker that writes real-time evaluation progress
-to a JSON file that the viewer can poll via /api/benchmark-live.
-"""
-
-from __future__ import annotations
-
-import json
-from dataclasses import asdict, dataclass
-from pathlib import Path
-from typing import Any
-
-from openadapt_ml.benchmarks.base import (
-    BenchmarkAction,
-    BenchmarkObservation,
-    BenchmarkResult,
-    BenchmarkTask,
-)
-
-
-@dataclass
-class LiveStepData:
-    """Data for a single step in live evaluation."""
-
-    step_idx: int
-    action: dict[str, Any]
-    reasoning: str | None = None
-    screenshot_url: str | None = None
-
-
-@dataclass
-class LiveTaskData:
-    """Data for current task being evaluated."""
-
-    task_id: str
-    instruction: str
-    domain: str
-    steps: list[LiveStepData]
-    result: dict[str, Any] | None = None
-
-
-class LiveEvaluationTracker:
-    """Tracks live evaluation progress and writes to benchmark_live.json.
-
-    This class is designed to be used alongside ExecutionTraceCollector
-    to provide real-time progress updates to the viewer.
-
-    Args:
-        output_file: Path to output JSON file (default: benchmark_live.json).
-        total_tasks: Total number of tasks to evaluate.
-    """
-
-    def __init__(
-        self,
-        output_file: str | Path = "benchmark_live.json",
-        total_tasks: int = 0,
-    ):
-        self.output_file = Path(output_file)
-        self.total_tasks = total_tasks
-        self.tasks_completed = 0
-        self.current_task: LiveTaskData | None = None
-
-        # Initialize with idle state
-        self._write_state({"status": "idle"})
-
-    def start_task(self, task: BenchmarkTask) -> None:
-        """Start tracking a new task.
-
-        Args:
-            task: The benchmark task being evaluated.
-        """
-        self.current_task = LiveTaskData(
-            task_id=task.task_id,
-            instruction=task.instruction,
-            domain=task.domain or "unknown",
-            steps=[],
-            result=None,
-        )
-
-        self._write_state({
-            "status": "running",
-            "total_tasks": self.total_tasks,
-            "tasks_completed": self.tasks_completed,
-            "current_task": asdict(self.current_task),
-        })
-
-    def record_step(
-        self,
-        step_idx: int,
-        observation: BenchmarkObservation,
-        action: BenchmarkAction,
-        reasoning: str | None = None,
-    ) -> None:
-        """Record a step in the current task.
-
-        Args:
-            step_idx: Index of this step.
-            observation: Observation at this step.
-            action: Action taken at this step.
-            reasoning: Optional reasoning/thought from agent.
-        """
-        if self.current_task is None:
-            raise RuntimeError("No task started. Call start_task() first.")
-
-        # Serialize action
-        action_data = {
-            "type": action.type,
-            "x": action.x,
-            "y": action.y,
-            "target_node_id": action.target_node_id,
-            "text": action.text,
-            "key": action.key,
-        }
-
-        # Create step data
-        step = LiveStepData(
-            step_idx=step_idx,
-            action=action_data,
-            reasoning=reasoning,
-            screenshot_url=None,  # Could be populated if we serve screenshots
-        )
-
-        self.current_task.steps.append(step)
-
-        # Write updated state
-        self._write_state({
-            "status": "running",
-            "total_tasks": self.total_tasks,
-            "tasks_completed": self.tasks_completed,
-            "current_task": asdict(self.current_task),
-        })
-
-    def finish_task(self, result: BenchmarkResult) -> None:
-        """Finish tracking the current task.
-
-        Args:
-            result: The evaluation result for the task.
-        """
-        if self.current_task is None:
-            raise RuntimeError("No task started. Call start_task() first.")
-
-        # Add result to current task
-        self.current_task.result = {
-            "success": result.success,
-            "score": result.score,
-            "num_steps": result.num_steps,
-            "total_time_seconds": result.total_time_seconds,
-        }
-
-        # Increment completed count
-        self.tasks_completed += 1
-
-        # Write updated state
-        self._write_state({
-            "status": "running",
-            "total_tasks": self.total_tasks,
-            "tasks_completed": self.tasks_completed,
-            "current_task": asdict(self.current_task),
-        })
-
-        # Clear current task
-        self.current_task = None
-
-    def finish(self) -> None:
-        """Mark evaluation as complete."""
-        self._write_state({
-            "status": "complete",
-            "total_tasks": self.total_tasks,
-            "tasks_completed": self.tasks_completed,
-        })
-
-    def _write_state(self, state: dict[str, Any]) -> None:
-        """Write current state to JSON file.
-
-        Args:
-            state: State dictionary to write.
-        """
-        with open(self.output_file, "w") as f:
-            json.dump(state, f, indent=2)
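Neither deleted module ships a combined usage example, but their matching start_task / record_step / finish_task signatures show how they were designed to run side by side, with LiveEvaluationTracker feeding the viewer while ExecutionTraceCollector persists the full trace. The following is a minimal sketch against the 0.2.0 wheel (where both modules still exist); tasks, run_agent, and evaluate stand in for the caller's own benchmark harness and are not part of the package.

from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker

collector = ExecutionTraceCollector(benchmark_name="waa", model_id="qwen3vl-2b-epoch5")
tracker = LiveEvaluationTracker(output_file="benchmark_live.json", total_tasks=len(tasks))

results = []
for task in tasks:
    collector.start_task(task)
    tracker.start_task(task)
    # run_agent is a placeholder generator yielding (observation, action, reasoning) per step.
    for step_idx, (obs, action, reasoning) in enumerate(run_agent(task)):
        # Both helpers take the same (step_idx, observation, action, reasoning) arguments.
        collector.record_step(step_idx, obs, action, reasoning)
        tracker.record_step(step_idx, obs, action, reasoning)
    result = evaluate(task)  # placeholder returning a BenchmarkResult
    collector.finish_task(result)
    tracker.finish_task(result)
    results.append(result)

collector.save_summary(results)  # writes summary.json for the run
tracker.finish()                 # sets status to "complete" in benchmark_live.json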