openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
Files changed (63)
  1. openadapt_ml/benchmarks/__init__.py +8 -0
  2. openadapt_ml/benchmarks/agent.py +90 -11
  3. openadapt_ml/benchmarks/azure.py +35 -6
  4. openadapt_ml/benchmarks/cli.py +4449 -201
  5. openadapt_ml/benchmarks/live_tracker.py +180 -0
  6. openadapt_ml/benchmarks/runner.py +41 -4
  7. openadapt_ml/benchmarks/viewer.py +1219 -0
  8. openadapt_ml/benchmarks/vm_monitor.py +610 -0
  9. openadapt_ml/benchmarks/waa.py +61 -4
  10. openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  11. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  12. openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  13. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  14. openadapt_ml/benchmarks/waa_live.py +619 -0
  15. openadapt_ml/cloud/local.py +1555 -1
  16. openadapt_ml/cloud/ssh_tunnel.py +553 -0
  17. openadapt_ml/datasets/next_action.py +87 -68
  18. openadapt_ml/evals/grounding.py +26 -8
  19. openadapt_ml/evals/trajectory_matching.py +84 -36
  20. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  21. openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  22. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  23. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  24. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  25. openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  26. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  27. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  28. openadapt_ml/experiments/waa_demo/runner.py +717 -0
  29. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  30. openadapt_ml/export/__init__.py +9 -0
  31. openadapt_ml/export/__main__.py +6 -0
  32. openadapt_ml/export/cli.py +89 -0
  33. openadapt_ml/export/parquet.py +265 -0
  34. openadapt_ml/ingest/__init__.py +3 -4
  35. openadapt_ml/ingest/capture.py +89 -81
  36. openadapt_ml/ingest/loader.py +116 -68
  37. openadapt_ml/ingest/synthetic.py +221 -159
  38. openadapt_ml/retrieval/README.md +226 -0
  39. openadapt_ml/retrieval/USAGE.md +391 -0
  40. openadapt_ml/retrieval/__init__.py +91 -0
  41. openadapt_ml/retrieval/demo_retriever.py +817 -0
  42. openadapt_ml/retrieval/embeddings.py +629 -0
  43. openadapt_ml/retrieval/index.py +194 -0
  44. openadapt_ml/retrieval/retriever.py +160 -0
  45. openadapt_ml/runtime/policy.py +10 -10
  46. openadapt_ml/schema/__init__.py +104 -0
  47. openadapt_ml/schema/converters.py +541 -0
  48. openadapt_ml/schema/episode.py +457 -0
  49. openadapt_ml/scripts/compare.py +26 -16
  50. openadapt_ml/scripts/eval_policy.py +4 -5
  51. openadapt_ml/scripts/prepare_synthetic.py +14 -17
  52. openadapt_ml/scripts/train.py +81 -70
  53. openadapt_ml/training/benchmark_viewer.py +3225 -0
  54. openadapt_ml/training/trainer.py +120 -363
  55. openadapt_ml/training/trl_trainer.py +354 -0
  56. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
  57. openadapt_ml-0.2.0.dist-info/RECORD +86 -0
  58. openadapt_ml/schemas/__init__.py +0 -53
  59. openadapt_ml/schemas/sessions.py +0 -122
  60. openadapt_ml/schemas/validation.py +0 -252
  61. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  62. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
  63. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/__init__.py
@@ -54,6 +54,7 @@ from openadapt_ml.benchmarks.agent import (
     PolicyAgent,
     RandomAgent,
     ScriptedAgent,
+    SmartMockAgent,
 )
 from openadapt_ml.benchmarks.base import (
     BenchmarkAction,
@@ -71,6 +72,8 @@ from openadapt_ml.benchmarks.runner import (
     evaluate_agent_on_benchmark,
 )
 from openadapt_ml.benchmarks.waa import WAAAdapter, WAAConfig, WAAMockAdapter
+from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
+from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer
 
 # Azure orchestration (lazy import to avoid requiring azure-ai-ml)
 def _get_azure_classes():
@@ -97,6 +100,7 @@ __all__ = [
     "APIBenchmarkAgent",
     "ScriptedAgent",
     "RandomAgent",
+    "SmartMockAgent",
     # Evaluation
     "EvaluationConfig",
     "evaluate_agent_on_benchmark",
@@ -106,6 +110,10 @@ __all__ = [
     "WAAAdapter",
     "WAAConfig",
     "WAAMockAdapter",
+    "WAALiveAdapter",
+    "WAALiveConfig",
+    # Viewer
+    "generate_benchmark_viewer",
     # Azure (lazy-loaded)
     "AzureConfig",
     "AzureWAAOrchestrator",
openadapt_ml/benchmarks/agent.py
@@ -36,7 +36,7 @@ from openadapt_ml.benchmarks.base import (
 if TYPE_CHECKING:
     from openadapt_ml.models.api_adapter import ApiVLMAdapter
     from openadapt_ml.runtime.policy import AgentPolicy
-    from openadapt_ml.schemas.sessions import Action
+    from openadapt_ml.schema import Action, ActionType
 
 
 class BenchmarkAgent(ABC):
@@ -259,22 +259,51 @@ class PolicyAgent(BenchmarkAgent):
         Returns:
             BenchmarkAction.
         """
+        # Extract normalized coordinates
+        x, y = None, None
+        if action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+
+        # Extract end coordinates for drag
+        end_x, end_y = None, None
+        if action.normalized_end is not None:
+            end_x, end_y = action.normalized_end
+
+        # Extract action type value (enum -> string)
+        action_type = action.type.value if hasattr(action.type, 'value') else action.type
+
+        # Extract element info if available
+        target_node_id = None
+        target_role = None
+        target_name = None
+        target_bbox = None
+        if action.element is not None:
+            target_node_id = action.element.element_id
+            target_role = action.element.role
+            target_name = action.element.name
+            if action.element.bounds is not None:
+                target_bbox = (
+                    action.element.bounds.x,
+                    action.element.bounds.y,
+                    action.element.bounds.x + action.element.bounds.width,
+                    action.element.bounds.y + action.element.bounds.height,
+                )
+
         return BenchmarkAction(
-            type=action.type,
-            x=action.x,
-            y=action.y,
+            type=action_type,
+            x=x,
+            y=y,
             text=action.text,
-            target_bbox=action.bbox,
-            # Map additional fields if present
-            target_node_id=getattr(action, "target_node_id", None),
-            target_role=getattr(action, "target_role", None),
-            target_name=getattr(action, "target_name", None),
+            target_bbox=target_bbox,
+            target_node_id=target_node_id,
+            target_role=target_role,
+            target_name=target_name,
             key=getattr(action, "key", None),
             modifiers=getattr(action, "modifiers", None),
             scroll_direction=getattr(action, "scroll_direction", None),
             scroll_amount=getattr(action, "scroll_amount", None),
-            end_x=getattr(action, "end_x", None),
-            end_y=getattr(action, "end_y", None),
+            end_x=end_x,
+            end_y=end_y,
             answer=getattr(action, "answer", None),
             raw_action={"thought": thought} if thought else None,
         )
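The element mapping above converts a bounds rectangle stored as top-left corner plus size into the `(x1, y1, x2, y2)` corner form that `target_bbox` expects. The same arithmetic in isolation (the free-standing function is illustrative, not part of the package):

```python
def bounds_to_bbox(x: float, y: float, width: float, height: float) -> tuple[float, float, float, float]:
    """Convert a (x, y, width, height) rectangle to corner form (x1, y1, x2, y2)."""
    return (x, y, x + width, y + height)

# A 100x50 element whose top-left corner sits at (10, 20):
assert bounds_to_bbox(10, 20, 100, 50) == (10, 20, 110, 70)
```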
@@ -391,6 +420,56 @@ class RandomAgent(BenchmarkAgent):
         pass
 
 
+class SmartMockAgent(BenchmarkAgent):
+    """Agent designed to pass WAAMockAdapter evaluation.
+
+    Performs a fixed sequence of actions that satisfy the mock adapter's
+    success criteria. Use for validating the benchmark pipeline locally.
+
+    The mock adapter evaluates success based on:
+    - Clicking Submit (ID 4) - primary success path
+    - Typing something AND clicking OK (ID 1) - form submission path
+    - Calling DONE after at least 2 actions - reasonable completion
+
+    This agent clicks Submit (ID 4) which is the simplest success path.
+    """
+
+    def __init__(self):
+        """Initialize the agent."""
+        self._step = 0
+        # Simple action sequence: click Submit button (ID 4), then done
+        self._actions = [
+            BenchmarkAction(type="click", target_node_id="4"),  # Click Submit
+            BenchmarkAction(type="done"),
+        ]
+
+    def act(
+        self,
+        observation: BenchmarkObservation,
+        task: BenchmarkTask,
+        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
+    ) -> BenchmarkAction:
+        """Return the next scripted action.
+
+        Args:
+            observation: Ignored.
+            task: Ignored.
+            history: Ignored.
+
+        Returns:
+            Next action from script, or DONE if script exhausted.
+        """
+        if self._step < len(self._actions):
+            action = self._actions[self._step]
+            self._step += 1
+            return action
+        return BenchmarkAction(type="done")
+
+    def reset(self) -> None:
+        """Reset step counter."""
+        self._step = 0
+
+
 class APIBenchmarkAgent(BenchmarkAgent):
     """Agent that uses hosted VLM APIs (Claude, GPT-5.1) for benchmark evaluation.
 
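Because `act()` ignores its inputs and simply replays the two-step script, the agent's behavior is easy to trace. A hedged dry run (the `observation` and `task` objects are placeholders whose construction is not shown in this diff):

```python
agent = SmartMockAgent()

a1 = agent.act(observation, task)  # BenchmarkAction(type="click", target_node_id="4")
a2 = agent.act(observation, task)  # BenchmarkAction(type="done")
a3 = agent.act(observation, task)  # script exhausted -> another "done"

agent.reset()  # step counter back to 0; the script replays from the start
```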
openadapt_ml/benchmarks/azure.py
@@ -355,6 +355,7 @@ class AzureMLClient:
         command: str,
         environment_variables: dict[str, str] | None = None,
         display_name: str | None = None,
+        timeout_hours: float = 4.0,
     ) -> str:
         """Submit a job to a compute instance.
 
@@ -363,6 +364,8 @@
             command: Command to run.
             environment_variables: Environment variables.
             display_name: Job display name.
+            timeout_hours: Maximum job duration in hours (default: 4). The job
+                will be automatically canceled after this duration.
 
         Returns:
             Job name/ID.
@@ -376,16 +379,27 @@
             name="waa-agent-env",
         )
 
+        import time
+        import uuid
+        timestamp = int(time.time())
+        unique_id = str(uuid.uuid4())[:8]
+        job_name = f"waa-{compute_name}-{timestamp}-{unique_id}"
+
+        # Convert hours to seconds for Azure ML timeout
+        timeout_seconds = int(timeout_hours * 3600)
+
         job = ml_command(
             command=command,
             environment=env,
             compute=compute_name,
+            name=job_name,  # Unique job name for Azure ML
             display_name=display_name or f"waa-job-{compute_name}",
             environment_variables=environment_variables or {},
+            limits={"timeout": timeout_seconds},
         )
 
         submitted = self.client.jobs.create_or_update(job)
-        logger.info(f"Job submitted: {submitted.name}")
+        logger.info(f"Job submitted: {submitted.name} (timeout: {timeout_hours}h)")
         return submitted.name
 
     def wait_for_job(self, job_name: str, timeout_seconds: int = 3600) -> dict:
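The job-name scheme and the hour-to-second conversion introduced here are easy to verify in isolation (the compute name below is hypothetical):

```python
import time
import uuid

compute_name = "waa-worker-0"  # hypothetical compute instance name
timeout_hours = 4.0

timestamp = int(time.time())
unique_id = str(uuid.uuid4())[:8]
job_name = f"waa-{compute_name}-{timestamp}-{unique_id}"
timeout_seconds = int(timeout_hours * 3600)

print(job_name)         # e.g. waa-waa-worker-0-1735689600-1a2b3c4d
print(timeout_seconds)  # 14400
```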
@@ -458,6 +472,7 @@ class AzureWAAOrchestrator:
         max_steps_per_task: int = 15,
         on_worker_complete: Callable[[WorkerState], None] | None = None,
         cleanup_on_complete: bool = True,
+        timeout_hours: float = 4.0,
     ) -> list[BenchmarkResult]:
         """Run evaluation across multiple Azure VMs.
 
@@ -468,6 +483,8 @@
             max_steps_per_task: Maximum steps per task.
             on_worker_complete: Callback when a worker finishes.
             cleanup_on_complete: Whether to delete VMs after completion.
+            timeout_hours: Maximum job duration in hours (default: 4). Jobs are
+                auto-canceled after this duration to prevent runaway costs.
 
         Returns:
             List of BenchmarkResult for all tasks.
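From the caller's side, the new parameter just threads through the orchestrator entry point. A sketch under assumptions (the method name `run` and the construction of `orchestrator`, `tasks`, and `agent` are not shown in this diff; only the keyword parameters appear above):

```python
# Everything except the keyword names documented above is assumed here.
results = orchestrator.run(
    tasks,
    agent,
    max_steps_per_task=15,
    cleanup_on_complete=True,
    timeout_hours=2.0,  # cancel each worker job after 2 hours
)
```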
@@ -519,7 +536,7 @@
 
         # Submit jobs to workers
         print(f"[3/4] Submitting evaluation jobs...")
-        self._submit_worker_jobs(workers, task_batches, agent, max_steps_per_task)
+        self._submit_worker_jobs(workers, task_batches, agent, max_steps_per_task, timeout_hours)
         print(f" Jobs submitted")
 
         # Wait for completion and collect results
@@ -577,8 +594,17 @@
         task_batches: list[list[BenchmarkTask]],
         agent: BenchmarkAgent,
         max_steps: int,
+        timeout_hours: float = 4.0,
     ) -> None:
-        """Submit evaluation jobs to workers."""
+        """Submit evaluation jobs to workers.
+
+        Args:
+            workers: List of worker states.
+            task_batches: Task batches for each worker.
+            agent: Agent to run.
+            max_steps: Maximum steps per task.
+            timeout_hours: Maximum job duration in hours.
+        """
         for worker, tasks in zip(workers, task_batches):
             if worker.status == "failed":
                 continue
@@ -591,7 +617,7 @@
             # Build command
             command = self._build_worker_command(task_ids_json, max_steps, agent)
 
-            # Submit job
+            # Submit job with timeout
             self.ml_client.submit_job(
                 compute_name=worker.compute_name,
                 command=command,
@@ -600,6 +626,7 @@
                     "WAA_MAX_STEPS": str(max_steps),
                 },
                 display_name=f"waa-worker-{worker.worker_id}",
+                timeout_hours=timeout_hours,
             )
             worker.status = "running"
             worker.start_time = time.time()
@@ -625,9 +652,11 @@
         # TODO: Serialize agent config and pass to remote worker
         # For now, workers use a default agent configuration
         _ = agent  # Reserved for agent serialization
+        # WAA Docker image has client at /client (see Dockerfile-WinArena)
+        # The run.py script is at /client/run.py (not a module, so use python run.py)
         return f"""
-cd /workspace/WindowsAgentArena && \
-python -m client.run \
+cd /client && \
+python run.py \
     --task_ids '{task_ids_json}' \
     --max_steps {max_steps} \
     --output_dir /outputs
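Since backslash-newline inside a non-raw Python string is a line continuation, the f-string above renders as a single shell command. A small sketch with illustrative values (the task IDs are made up):

```python
task_ids_json = '["notepad_1", "browser_2"]'  # illustrative task IDs
max_steps = 15

command = (
    f"cd /client && python run.py "
    f"--task_ids '{task_ids_json}' "
    f"--max_steps {max_steps} "
    f"--output_dir /outputs"
)
print(command)
# cd /client && python run.py --task_ids '["notepad_1", "browser_2"]' --max_steps 15 --output_dir /outputs
```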