PyPI - openadapt-ml - Versions diffs - 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

openadapt_ml/benchmarks/__init__.py +8 -0
openadapt_ml/benchmarks/agent.py +90 -11
openadapt_ml/benchmarks/azure.py +35 -6
openadapt_ml/benchmarks/cli.py +4449 -201
openadapt_ml/benchmarks/live_tracker.py +180 -0
openadapt_ml/benchmarks/runner.py +41 -4
openadapt_ml/benchmarks/viewer.py +1219 -0
openadapt_ml/benchmarks/vm_monitor.py +610 -0
openadapt_ml/benchmarks/waa.py +61 -4
openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
openadapt_ml/benchmarks/waa_live.py +619 -0
openadapt_ml/cloud/local.py +1555 -1
openadapt_ml/cloud/ssh_tunnel.py +553 -0
openadapt_ml/datasets/next_action.py +87 -68
openadapt_ml/evals/grounding.py +26 -8
openadapt_ml/evals/trajectory_matching.py +84 -36
openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
openadapt_ml/experiments/waa_demo/__init__.py +10 -0
openadapt_ml/experiments/waa_demo/demos.py +357 -0
openadapt_ml/experiments/waa_demo/runner.py +717 -0
openadapt_ml/experiments/waa_demo/tasks.py +151 -0
openadapt_ml/export/__init__.py +9 -0
openadapt_ml/export/__main__.py +6 -0
openadapt_ml/export/cli.py +89 -0
openadapt_ml/export/parquet.py +265 -0
openadapt_ml/ingest/__init__.py +3 -4
openadapt_ml/ingest/capture.py +89 -81
openadapt_ml/ingest/loader.py +116 -68
openadapt_ml/ingest/synthetic.py +221 -159
openadapt_ml/retrieval/README.md +226 -0
openadapt_ml/retrieval/USAGE.md +391 -0
openadapt_ml/retrieval/__init__.py +91 -0
openadapt_ml/retrieval/demo_retriever.py +817 -0
openadapt_ml/retrieval/embeddings.py +629 -0
openadapt_ml/retrieval/index.py +194 -0
openadapt_ml/retrieval/retriever.py +160 -0
openadapt_ml/runtime/policy.py +10 -10
openadapt_ml/schema/__init__.py +104 -0
openadapt_ml/schema/converters.py +541 -0
openadapt_ml/schema/episode.py +457 -0
openadapt_ml/scripts/compare.py +26 -16
openadapt_ml/scripts/eval_policy.py +4 -5
openadapt_ml/scripts/prepare_synthetic.py +14 -17
openadapt_ml/scripts/train.py +81 -70
openadapt_ml/training/benchmark_viewer.py +3225 -0
openadapt_ml/training/trainer.py +120 -363
openadapt_ml/training/trl_trainer.py +354 -0
{openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
openadapt_ml-0.2.0.dist-info/RECORD +86 -0
openadapt_ml/schemas/__init__.py +0 -53
openadapt_ml/schemas/sessions.py +0 -122
openadapt_ml/schemas/validation.py +0 -252
openadapt_ml-0.1.0.dist-info/RECORD +0 -55
{openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
{openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0

openadapt_ml/benchmarks/vm_monitor.py ADDED Viewed

@@ -0,0 +1,610 @@
+"""VM monitoring utilities for WAA benchmark evaluation.
+This module provides reusable classes for monitoring Windows VMs running WAA.
+Can be used by the viewer, CLI, or as a standalone tool.
+Usage:
+    # Monitor a single VM
+    from openadapt_ml.benchmarks.vm_monitor import VMMonitor, VMConfig
+    config = VMConfig(
+        name="azure-waa-vm",
+        ssh_host="172.171.112.41",
+        ssh_user="azureuser",
+        docker_container="winarena",
+        internal_ip="20.20.20.21",
+    )
+    monitor = VMMonitor(config)
+    status = monitor.check_status()
+    print(f"VNC: {status.vnc_reachable}, WAA: {status.waa_ready}")
+    # Or run continuous monitoring
+    monitor.run_monitor(callback=lambda s: print(s))
+"""
+from __future__ import annotations
+import json
+import subprocess
+import time
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+from pathlib import Path
+from typing import Callable
+import urllib.request
+import urllib.error
+import socket
+@dataclass
+class VMConfig:
+    """Connection settings describing one WAA VM.
+    Only ``name`` and ``ssh_host`` are required; every other field has a default.
+    """
+    name: str  # Label for the VM; VMRegistry looks entries up by this value
+    ssh_host: str  # Host used for SSH commands and for the VNC HTTP check
+    ssh_user: str = "azureuser"
+    vnc_port: int = 8006  # Port requested over HTTP by VMMonitor.check_vnc
+    waa_port: int = 5000  # Port of the WAA /probe endpoint on internal_ip
+    qmp_port: int = 7200  # Not read anywhere in this module
+    docker_container: str = "winarena"
+    internal_ip: str = "20.20.20.21"  # Address curl'ed from the SSH host for /probe
+    def to_dict(self) -> dict:
+        """Return all fields as a plain dict suitable for JSON serialization."""
+        field_map = asdict(self)
+        return field_map
+    @classmethod
+    def from_dict(cls, data: dict) -> VMConfig:
+        """Build a config from a dict keyed by field name.
+        Args:
+            data: Mapping of field names to values, e.g. the output of ``to_dict``.
+        Returns:
+            A new ``VMConfig``; a key that is not a field name raises ``TypeError``.
+        """
+        config = cls(**data)
+        return config
+@dataclass
+class VMStatus:
+    """Snapshot of one WAA VM's health at a single moment.
+    A fresh instance starts with every check marked as failed/unknown;
+    ``VMMonitor.check_status`` fills the fields in as each check completes.
+    """
+    config: VMConfig
+    # Local (naive) ISO-8601 time at which this snapshot object was created
+    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+    ssh_reachable: bool = False
+    vnc_reachable: bool = False
+    waa_ready: bool = False
+    waa_probe_response: str | None = None
+    container_running: bool = False
+    container_logs: str | None = None
+    disk_usage_gb: float | None = None
+    error: str | None = None
+    def to_dict(self) -> dict:
+        """Return the snapshot as a JSON-serializable dict.
+        ``config`` is expanded through its own ``to_dict``; all other fields
+        are copied as-is, in declaration order.
+        """
+        plain_fields = (
+            "timestamp",
+            "ssh_reachable",
+            "vnc_reachable",
+            "waa_ready",
+            "waa_probe_response",
+            "container_running",
+            "container_logs",
+            "disk_usage_gb",
+            "error",
+        )
+        payload = {"config": self.config.to_dict()}
+        for field_name in plain_fields:
+            payload[field_name] = getattr(self, field_name)
+        return payload
+class VMMonitor:
+    """Monitor a single WAA VM.
+    The VNC check is a direct HTTP request to ``ssh_host:vnc_port``; every
+    other check runs a shell command on the host through the ``ssh`` client.
+    The individual check methods report failure through their return value
+    rather than raising.
+    """
+    def __init__(self, config: VMConfig, timeout: int = 5):
+        """Initialize monitor.
+        Args:
+            config: VM configuration.
+            timeout: Timeout in seconds for network operations.
+        """
+        self.config = config
+        self.timeout = timeout
+    def _run_ssh(self, remote_cmd: str, grace: int = 5) -> subprocess.CompletedProcess:
+        """Run ``remote_cmd`` on the VM host via non-interactive SSH.
+        Args:
+            remote_cmd: Command line handed to the remote shell.
+            grace: Seconds added to ``self.timeout`` for the local subprocess
+                deadline, leaving room for the command to run after connecting.
+        Returns:
+            The finished ``ssh`` process with stdout/stderr captured as text.
+        Raises:
+            subprocess.TimeoutExpired: If ssh exceeds the local deadline.
+            OSError: If the ``ssh`` executable cannot be started.
+        """
+        # NOTE(review): config values are interpolated into remote_cmd unquoted
+        # by the callers; they are assumed to come from trusted configuration.
+        return subprocess.run(
+            [
+                "ssh",
+                "-o", "StrictHostKeyChecking=no",
+                "-o", f"ConnectTimeout={self.timeout}",
+                # BatchMode makes ssh fail instead of prompting for a password
+                "-o", "BatchMode=yes",
+                f"{self.config.ssh_user}@{self.config.ssh_host}",
+                remote_cmd,
+            ],
+            capture_output=True,
+            text=True,
+            timeout=self.timeout + grace,
+        )
+    def check_vnc(self) -> bool:
+        """Check if VNC port is reachable.
+        Returns:
+            True only if an HTTP HEAD request to ``ssh_host:vnc_port`` completes
+            without raising; HTTP error statuses count as not reachable.
+        """
+        try:
+            url = f"http://{self.config.ssh_host}:{self.config.vnc_port}/"
+            req = urllib.request.Request(url, method="HEAD")
+            with urllib.request.urlopen(req, timeout=self.timeout):
+                return True
+        except Exception:
+            # Best-effort probe: any failure (URLError, timeout, ...) means "no"
+            return False
+    def check_ssh(self) -> bool:
+        """Check if SSH is reachable.
+        Returns:
+            True if a remote ``echo ok`` exits 0 and prints ``ok``.
+        """
+        try:
+            result = self._run_ssh("echo ok")
+            return result.returncode == 0 and "ok" in result.stdout
+        except Exception:
+            return False
+    def check_waa_probe(self) -> tuple[bool, str | None]:
+        """Check if WAA /probe endpoint responds.
+        The request is issued with ``curl`` on the SSH host against
+        ``internal_ip:waa_port``.
+        Returns:
+            Tuple of (ready, response_text). ``ready`` is True when the response
+            is non-empty and does not contain "error" (case-insensitive). On an
+            exception the second element is the exception message.
+        """
+        try:
+            cmd = f"curl -s --connect-timeout {self.timeout} http://{self.config.internal_ip}:{self.config.waa_port}/probe"
+            result = self._run_ssh(cmd, grace=10)
+            response = result.stdout.strip()
+            if response and "error" not in response.lower():
+                return True, response
+            return False, response or None
+        except Exception as e:
+            return False, str(e)
+    def get_container_status(self) -> tuple[bool, str | None]:
+        """Check container status and get recent logs.
+        Returns:
+            Tuple of (running, last_log_lines). The logs are the last 5 lines of
+            ``docker logs`` and are only fetched when the container is running.
+            On an exception the result is ``(False, <exception message>)``.
+        """
+        try:
+            ps_result = self._run_ssh(
+                f"docker ps -q -f name={self.config.docker_container}"
+            )
+            # `docker ps -q` prints container IDs; empty output means no match
+            if not ps_result.stdout.strip():
+                return False, None
+            log_result = self._run_ssh(
+                f"docker logs {self.config.docker_container} 2>&1 | tail -5",
+                grace=10,
+            )
+            return True, log_result.stdout.strip()
+        except Exception as e:
+            return False, str(e)
+    def get_disk_usage(self) -> float | None:
+        """Get disk usage of data.img in GB.
+        Returns:
+            Size in GiB (bytes / 1024**3, rounded to 2 decimals) of the first
+            candidate path that yields a number, or None if none does or an
+            SSH call fails.
+        """
+        candidates = [
+            "/home/azureuser/waa-storage/data.img",
+            "/home/ubuntu/waa-storage/data.img",
+            "/storage/data.img",
+        ]
+        # Also probe the configured user's home so a non-default ssh_user is
+        # covered; the order is unchanged for the default users.
+        user_path = f"/home/{self.config.ssh_user}/waa-storage/data.img"
+        if user_path not in candidates:
+            candidates.insert(0, user_path)
+        try:
+            for path in candidates:
+                result = self._run_ssh(f"du -b {path} 2>/dev/null | cut -f1")
+                size_text = result.stdout.strip()
+                # The pipeline's exit code is cut's, so a missing file shows
+                # up as empty output rather than a non-zero return code.
+                if result.returncode != 0 or not size_text:
+                    continue
+                try:
+                    return round(int(size_text) / (1024 ** 3), 2)
+                except ValueError:
+                    continue
+            return None
+        except Exception:
+            return None
+    def check_status(self) -> VMStatus:
+        """Perform full status check on the VM.
+        Returns:
+            VMStatus with all checks performed. The SSH-based checks are
+            skipped when SSH itself is unreachable; an unexpected exception is
+            recorded in ``status.error``.
+        """
+        status = VMStatus(config=self.config)
+        try:
+            # Check VNC first (fastest, no SSH needed)
+            status.vnc_reachable = self.check_vnc()
+            status.ssh_reachable = self.check_ssh()
+            if status.ssh_reachable:
+                status.container_running, status.container_logs = self.get_container_status()
+                status.waa_ready, status.waa_probe_response = self.check_waa_probe()
+                status.disk_usage_gb = self.get_disk_usage()
+        except Exception as e:
+            status.error = str(e)
+        return status
+    def run_monitor(
+        self,
+        callback: Callable[[VMStatus], None] | None = None,
+        interval: int = 30,
+        stop_on_ready: bool = True,
+        output_file: str | Path | None = None,
+    ) -> VMStatus:
+        """Run continuous monitoring until WAA is ready.
+        Args:
+            callback: Optional callback function called with each status update.
+            interval: Seconds between checks.
+            stop_on_ready: Stop monitoring when WAA is ready. When False the
+                loop only ends through an exception (e.g. KeyboardInterrupt).
+            output_file: Optional file to write status updates (JSON lines).
+        Returns:
+            Final VMStatus (the first one whose ``waa_ready`` is True).
+        """
+        output_path = Path(output_file) if output_file else None
+        if output_path:
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+        while True:
+            status = self.check_status()
+            if callback:
+                callback(status)
+            if output_path:
+                # Append one JSON object per line so the file can be tailed
+                with open(output_path, "a") as f:
+                    f.write(json.dumps(status.to_dict()) + "\n")
+            if stop_on_ready and status.waa_ready:
+                return status
+            time.sleep(interval)
+@dataclass
+class PoolWorker:
+    """A single worker in a VM pool.
+    Plain record; VMPoolRegistry serializes it to JSON and rebuilds it with
+    ``PoolWorker(**dict)``, so field names double as the on-disk keys.
+    """
+    name: str  # Worker identifier; VMPoolRegistry.update_worker matches on it
+    ip: str
+    status: str = "creating"  # creating, ready, running, completed, failed, deleted
+    docker_container: str = "winarena"
+    waa_ready: bool = False
+    assigned_tasks: list[str] = field(default_factory=list)
+    completed_tasks: list[str] = field(default_factory=list)
+    current_task: str | None = None
+    error: str | None = None
+    # Both timestamps are local (naive) ISO-8601 strings set at construction;
+    # VMPoolRegistry.update_worker refreshes updated_at on every update.
+    created_at: str = field(default_factory=lambda: datetime.now().isoformat())
+    updated_at: str = field(default_factory=lambda: datetime.now().isoformat())
+@dataclass
+class VMPool:
+    """A pool of worker VMs for parallel WAA evaluation.
+    Plain record persisted by VMPoolRegistry via ``dataclasses.asdict``.
+    """
+    pool_id: str  # VMPoolRegistry.create_pool uses a %Y%m%d_%H%M%S timestamp
+    created_at: str  # ISO-8601 creation time
+    resource_group: str
+    location: str
+    vm_size: str
+    workers: list[PoolWorker]
+    # Pool-level counters; completed/failed are incremented through
+    # VMPoolRegistry.update_pool_progress.
+    total_tasks: int = 0
+    completed_tasks: int = 0
+    failed_tasks: int = 0
+class VMPoolRegistry:
+    """Manage VM pools for parallel WAA evaluation.
+    Holds at most one ``VMPool`` in memory and mirrors it to a JSON file;
+    every mutating method writes the file immediately.
+    """
+    REGISTRY_FILE = "benchmark_results/vm_pool_registry.json"
+    def __init__(self, registry_file: str | Path | None = None):
+        """Initialize pool registry.
+        Args:
+            registry_file: Path to JSON registry file. Defaults to
+                ``REGISTRY_FILE`` (relative to the current working directory).
+        """
+        self.registry_file = Path(registry_file or self.REGISTRY_FILE)
+        self._pool: VMPool | None = None
+        self.load()
+    def load(self) -> None:
+        """Load pool from registry file.
+        A missing file leaves the current pool untouched. A file that cannot
+        be turned into a pool (invalid JSON, missing keys, unexpected worker
+        fields, or a non-object top-level value) prints a warning and leaves
+        the registry without a pool instead of raising.
+        """
+        if not self.registry_file.exists():
+            return
+        try:
+            with open(self.registry_file) as f:
+                data = json.load(f)
+            workers = [PoolWorker(**w) for w in data.get("workers", [])]
+            self._pool = VMPool(
+                pool_id=data["pool_id"],
+                created_at=data["created_at"],
+                resource_group=data["resource_group"],
+                location=data["location"],
+                vm_size=data["vm_size"],
+                workers=workers,
+                total_tasks=data.get("total_tasks", 0),
+                completed_tasks=data.get("completed_tasks", 0),
+                failed_tasks=data.get("failed_tasks", 0),
+            )
+        except (json.JSONDecodeError, KeyError, TypeError, AttributeError) as e:
+            # TypeError: worker entry with unknown/missing fields or wrong shape.
+            # AttributeError: top-level JSON value is not an object (no .get).
+            print(f"Warning: Could not load pool registry: {e}")
+            self._pool = None
+    def save(self) -> None:
+        """Save pool to registry file (no-op when there is no pool)."""
+        if self._pool is None:
+            return
+        self.registry_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.registry_file, "w") as f:
+            json.dump(asdict(self._pool), f, indent=2)
+    def create_pool(
+        self,
+        workers: list[tuple[str, str]],  # [(name, ip), ...]
+        resource_group: str,
+        location: str,
+        vm_size: str = "Standard_D4ds_v5",
+    ) -> VMPool:
+        """Create a new pool from created VMs, replacing any existing pool.
+        Args:
+            workers: List of (name, ip) tuples.
+            resource_group: Azure resource group.
+            location: Azure region.
+            vm_size: VM size used.
+        Returns:
+            Created VMPool; every worker starts with status "ready".
+        """
+        pool_id = datetime.now().strftime("%Y%m%d_%H%M%S")
+        self._pool = VMPool(
+            pool_id=pool_id,
+            created_at=datetime.now().isoformat(),
+            resource_group=resource_group,
+            location=location,
+            vm_size=vm_size,
+            workers=[PoolWorker(name=name, ip=ip, status="ready") for name, ip in workers],
+        )
+        self.save()
+        return self._pool
+    def get_pool(self) -> VMPool | None:
+        """Get current pool, or None if no pool is loaded."""
+        return self._pool
+    def update_worker(self, name: str, **kwargs) -> None:
+        """Update a worker's status.
+        Only the first worker with a matching name is updated, and keyword
+        arguments that are not existing attributes of the worker are ignored.
+        Args:
+            name: Worker name.
+            **kwargs: Fields to update.
+        """
+        if self._pool is None:
+            return
+        for worker in self._pool.workers:
+            if worker.name == name:
+                for key, value in kwargs.items():
+                    if hasattr(worker, key):
+                        setattr(worker, key, value)
+                worker.updated_at = datetime.now().isoformat()
+                break
+        self.save()
+    def update_pool_progress(self, completed: int = 0, failed: int = 0) -> None:
+        """Update pool-level progress.
+        Args:
+            completed: Increment completed count by this amount.
+            failed: Increment failed count by this amount.
+        """
+        if self._pool is None:
+            return
+        self._pool.completed_tasks += completed
+        self._pool.failed_tasks += failed
+        self.save()
+    def delete_pool(self) -> bool:
+        """Delete the pool registry (VMs must be deleted separately).
+        Returns:
+            True if the registry file existed and was removed.
+        """
+        if self.registry_file.exists():
+            self.registry_file.unlink()
+            self._pool = None
+            return True
+        return False
+class VMRegistry:
+    """Manage a registry of VMs and their status.
+    The registry is a JSON list of ``VMConfig`` dicts; ``add`` and ``remove``
+    rewrite the file immediately.
+    """
+    def __init__(self, registry_file: str | Path = "benchmark_results/vm_registry.json"):
+        """Initialize registry.
+        Args:
+            registry_file: Path to JSON registry file.
+        """
+        self.registry_file = Path(registry_file)
+        self._vms: list[VMConfig] = []
+        self.load()
+    def load(self) -> None:
+        """Load VMs from registry file.
+        A missing file leaves the registry as it is. A file that is not valid
+        JSON, or whose entries cannot be turned into ``VMConfig`` objects,
+        prints a warning and leaves the registry empty instead of raising,
+        mirroring ``VMPoolRegistry.load``.
+        """
+        if not self.registry_file.exists():
+            return
+        try:
+            with open(self.registry_file) as f:
+                data = json.load(f)
+            self._vms = [VMConfig.from_dict(vm) for vm in data]
+        except (json.JSONDecodeError, TypeError) as e:
+            # TypeError covers entries with unknown/missing fields and a
+            # top-level value that is not a list of objects.
+            # NOTE: the next add()/remove() will overwrite the unreadable file.
+            print(f"Warning: Could not load VM registry: {e}")
+            self._vms = []
+    def save(self) -> None:
+        """Save VMs to registry file, creating parent directories as needed."""
+        self.registry_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.registry_file, "w") as f:
+            json.dump([vm.to_dict() for vm in self._vms], f, indent=2)
+    def add(self, config: VMConfig) -> None:
+        """Add a VM to the registry, replacing any entry with the same name."""
+        self._vms = [vm for vm in self._vms if vm.name != config.name]
+        self._vms.append(config)
+        self.save()
+    def remove(self, name: str) -> bool:
+        """Remove a VM from the registry.
+        Returns:
+            True if VM was found and removed.
+        """
+        original_len = len(self._vms)
+        self._vms = [vm for vm in self._vms if vm.name != name]
+        if len(self._vms) < original_len:
+            self.save()
+            return True
+        return False
+    def get(self, name: str) -> VMConfig | None:
+        """Get a VM by name, or None if no entry has that name."""
+        for vm in self._vms:
+            if vm.name == name:
+                return vm
+        return None
+    def list(self) -> list[VMConfig]:
+        """List all VMs (a copy, so callers cannot mutate the registry)."""
+        return list(self._vms)
+    def check_all(self, timeout: int = 5) -> list[VMStatus]:
+        """Check status of all VMs, one after another.
+        Args:
+            timeout: Timeout per VM check.
+        Returns:
+            List of VMStatus for each registered VM.
+        """
+        statuses = []
+        for config in self._vms:
+            monitor = VMMonitor(config, timeout=timeout)
+            statuses.append(monitor.check_status())
+        return statuses
+def main():
+    """CLI entry point for VM monitoring.
+    Modes, checked in this order:
+    - ``--list``: print every VM in the default registry and exit.
+    - ``--check-all``: run one status check per registered VM and exit.
+    - otherwise: monitor ``--host`` until WAA is ready or Ctrl+C is pressed.
+    """
+    import argparse
+    parser = argparse.ArgumentParser(description="Monitor WAA VMs")
+    parser.add_argument("--host", help="SSH host")
+    parser.add_argument("--user", default="azureuser", help="SSH user")
+    parser.add_argument("--container", default="winarena", help="Docker container name")
+    parser.add_argument("--interval", type=int, default=30, help="Check interval in seconds")
+    parser.add_argument("--output", help="Output file for status updates (JSON lines)")
+    parser.add_argument("--list", action="store_true", help="List all registered VMs")
+    parser.add_argument("--check-all", action="store_true", help="Check all registered VMs")
+    args = parser.parse_args()
+    if args.list:
+        registry = VMRegistry()
+        for vm in registry.list():
+            print(f"  {vm.name}: {vm.ssh_user}@{vm.ssh_host} (container: {vm.docker_container})")
+        return
+    if args.check_all:
+        registry = VMRegistry()
+        for status in registry.check_all():
+            print(f"\n{status.config.name}:")
+            print(f"  SSH: {'✓' if status.ssh_reachable else '✗'}")
+            print(f"  VNC: {'✓' if status.vnc_reachable else '✗'}")
+            print(f"  WAA: {'✓ READY' if status.waa_ready else '✗ Not ready'}")
+            # Compare against None so a measured size of 0.0 GB is still shown
+            if status.disk_usage_gb is not None:
+                print(f"  Disk: {status.disk_usage_gb} GB")
+        return
+    if not args.host:
+        parser.error("--host is required for monitoring")
+    config = VMConfig(
+        name="cli-vm",
+        ssh_host=args.host,
+        ssh_user=args.user,
+        docker_container=args.container,
+    )
+    monitor = VMMonitor(config)
+    def print_status(status: VMStatus):
+        """Print a one-line summary plus the last container log line."""
+        ts = datetime.now().strftime("%H:%M:%S")
+        waa_str = "READY!" if status.waa_ready else "not ready"
+        # "?" only when the size is unknown; 0.0 is a real measurement
+        disk_str = f"{status.disk_usage_gb}GB" if status.disk_usage_gb is not None else "?"
+        print(f"[{ts}] SSH: {'✓' if status.ssh_reachable else '✗'} | "
+              f"VNC: {'✓' if status.vnc_reachable else '✗'} | "
+              f"WAA: {waa_str} | Disk: {disk_str}")
+        if status.container_logs:
+            # Show last log line, truncated to 80 characters
+            last_line = status.container_logs.split('\n')[-1][:80]
+            print(f"         Log: {last_line}")
+    print(f"Monitoring {args.host}... (Ctrl+C to stop)")
+    try:
+        final_status = monitor.run_monitor(
+            callback=print_status,
+            interval=args.interval,
+            output_file=args.output,
+        )
+        print(f"\n✓ WAA is ready! Probe response: {final_status.waa_probe_response}")
+    except KeyboardInterrupt:
+        print("\nMonitoring stopped.")
+if __name__ == "__main__":
+    main()

openadapt_ml/benchmarks/waa.py CHANGED Viewed

@@ -565,6 +565,8 @@ class WAAMockAdapter(BenchmarkAdapter):
         self._current_task: BenchmarkTask | None = None
         self._step_count = 0
         self._temp_dir: Path | None = None
+        self._actions: list[BenchmarkAction] = []  # Track actions for evaluation
+        self._text_entered: str | None = None  # Track typed text
         self._generate_mock_tasks()
     @property
@@ -608,24 +610,79 @@ class WAAMockAdapter(BenchmarkAdapter):
     def reset(self, task: BenchmarkTask) -> BenchmarkObservation:
+        """Start a new episode for ``task``: clear the step counter and the per-episode action tracking, then return a mock observation."""
         self._current_task = task
         self._step_count = 0
+        self._actions = []  # Clear action history
+        self._text_entered = None
         return self._mock_observation()
     def step(
         self, action: BenchmarkAction
     ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]:
+        """Record ``action`` and advance the mock episode by one step.
+        Returns:
+            Tuple of (observation, done, info). ``done`` is True when the action
+            type is "done" or 15 steps have been taken; ``info`` is
+            ``{"step": <step count>}``.
+        """
         self._step_count += 1
+        self._actions.append(action)  # Track action for evaluation
+        # Track typed text
+        if action.type == "type" and action.text:
+            self._text_entered = action.text
         done = action.type == "done" or self._step_count >= 15
         return self._mock_observation(), done, {"step": self._step_count}
     def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
-        # Random success for testing
-        import random
-        success = random.random() < 0.2  # ~20% success rate like WAA SOTA
+        """Evaluate task based on actions taken.
+        Success criteria for mock tasks:
+        - Agent clicked the Submit button (ID 4) OR
+        - Agent typed text AND clicked OK (ID 1) OR
+        - Agent completed with DONE action after meaningful interaction
+          (at least 2 recorded actions in total; the DONE action itself counts)
+        This provides deterministic evaluation based on actual agent behavior,
+        not random chance. The mock UI has:
+        - ID 1: OK button
+        - ID 2: Text input field
+        - ID 3: Cancel button
+        - ID 4: Submit button
+        Returns:
+            BenchmarkResult whose score is 1.0 on success, a partial score of
+            at most 0.8 when the agent clicked or typed without succeeding,
+            and 0.0 otherwise.
+        """
+        # Check what actions were taken
+        clicked_ids = set()
+        typed_text = False
+        called_done = False
+        for action in self._actions:
+            if action.type == "click":
+                # Extract target node ID from action
+                target_id = getattr(action, "target_node_id", None)
+                # Truthiness test: clicks without a target ID (or with a falsy
+                # ID such as 0) are not recorded
+                if target_id:
+                    clicked_ids.add(str(target_id))
+            elif action.type == "type" and action.text:
+                typed_text = True
+            elif action.type == "done":
+                called_done = True
+        # Success criteria:
+        # 1. Clicked Submit (ID 4) - primary success path
+        # 2. Typed something AND clicked OK (ID 1) - form submission path
+        # 3. Called DONE after at least 2 actions - reasonable completion
+        clicked_submit = "4" in clicked_ids
+        clicked_ok = "1" in clicked_ids
+        form_submitted = typed_text and clicked_ok
+        reasonable_completion = called_done and len(self._actions) >= 2
+        success = clicked_submit or form_submitted or reasonable_completion
+        # Calculate partial credit score
+        score = 0.0
+        if success:
+            score = 1.0
+        elif typed_text or clicked_ids:
+            # Partial credit for taking meaningful actions
+            # (0.3 base + 0.1 per distinct clicked ID, capped at 3, + 0.2 for typing)
+            score = 0.3 + (0.1 * min(len(clicked_ids), 3)) + (0.2 if typed_text else 0.0)
         return BenchmarkResult(
             task_id=task.task_id,
             success=success,
-            score=1.0 if success else 0.0,
+            score=score,
             num_steps=self._step_count,
+            reason=f"clicked={list(clicked_ids)}, typed={typed_text}, done={called_done}",
         )
     def _mock_observation(self) -> BenchmarkObservation:

openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl