openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -30,16 +30,13 @@ from __future__ import annotations
30
30
 
31
31
  import json
32
32
  import logging
33
- import os
34
- import tempfile
35
33
  import time
36
34
  from concurrent.futures import ThreadPoolExecutor, as_completed
37
35
  from dataclasses import dataclass, field
38
36
  from pathlib import Path
39
- from typing import Any, Callable
37
+ from typing import Callable
40
38
 
41
- from openadapt_ml.benchmarks.agent import BenchmarkAgent
42
- from openadapt_ml.benchmarks.base import BenchmarkResult, BenchmarkTask
39
+ from openadapt_evals import BenchmarkAgent, BenchmarkResult, BenchmarkTask
43
40
 
44
41
  logger = logging.getLogger(__name__)
45
42
 
@@ -233,7 +230,9 @@ class AzureMLClient:
233
230
  resource_group_name=self.config.resource_group,
234
231
  workspace_name=self.config.workspace_name,
235
232
  )
236
- logger.info(f"Connected to Azure ML workspace: {self.config.workspace_name}")
233
+ logger.info(
234
+ f"Connected to Azure ML workspace: {self.config.workspace_name}"
235
+ )
237
236
  return self._client
238
237
 
239
238
  def _get_credential(self):
@@ -241,11 +240,13 @@ class AzureMLClient:
241
240
  from openadapt_ml.config import settings
242
241
 
243
242
  # Use service principal if credentials are configured
244
- if all([
245
- settings.azure_client_id,
246
- settings.azure_client_secret,
247
- settings.azure_tenant_id,
248
- ]):
243
+ if all(
244
+ [
245
+ settings.azure_client_id,
246
+ settings.azure_client_secret,
247
+ settings.azure_tenant_id,
248
+ ]
249
+ ):
249
250
  logger.info("Using service principal authentication")
250
251
  return self._ClientSecretCredential(
251
252
  tenant_id=settings.azure_tenant_id,
@@ -301,7 +302,10 @@ class AzureMLClient:
301
302
  f"/providers/Microsoft.ManagedIdentity"
302
303
  f"/userAssignedIdentities/{self.config.managed_identity_name}"
303
304
  )
304
- compute.identity = {"type": "UserAssigned", "user_assigned_identities": [identity_id]}
305
+ compute.identity = {
306
+ "type": "UserAssigned",
307
+ "user_assigned_identities": [identity_id],
308
+ }
305
309
 
306
310
  print(f" Creating VM: {name}...", end="", flush=True)
307
311
  self.client.compute.begin_create_or_update(compute).result()
@@ -355,6 +359,7 @@ class AzureMLClient:
355
359
  command: str,
356
360
  environment_variables: dict[str, str] | None = None,
357
361
  display_name: str | None = None,
362
+ timeout_hours: float = 4.0,
358
363
  ) -> str:
359
364
  """Submit a job to a compute instance.
360
365
 
@@ -363,6 +368,8 @@ class AzureMLClient:
363
368
  command: Command to run.
364
369
  environment_variables: Environment variables.
365
370
  display_name: Job display name.
371
+ timeout_hours: Maximum job duration in hours (default: 4). The job
372
+ will be automatically canceled after this duration.
366
373
 
367
374
  Returns:
368
375
  Job name/ID.
@@ -376,16 +383,28 @@ class AzureMLClient:
376
383
  name="waa-agent-env",
377
384
  )
378
385
 
386
+ import time
387
+ import uuid
388
+
389
+ timestamp = int(time.time())
390
+ unique_id = str(uuid.uuid4())[:8]
391
+ job_name = f"waa-{compute_name}-{timestamp}-{unique_id}"
392
+
393
+ # Convert hours to seconds for Azure ML timeout
394
+ timeout_seconds = int(timeout_hours * 3600)
395
+
379
396
  job = ml_command(
380
397
  command=command,
381
398
  environment=env,
382
399
  compute=compute_name,
400
+ name=job_name, # Unique job name for Azure ML
383
401
  display_name=display_name or f"waa-job-{compute_name}",
384
402
  environment_variables=environment_variables or {},
403
+ limits={"timeout": timeout_seconds},
385
404
  )
386
405
 
387
406
  submitted = self.client.jobs.create_or_update(job)
388
- logger.info(f"Job submitted: {submitted.name}")
407
+ logger.info(f"Job submitted: {submitted.name} (timeout: {timeout_hours}h)")
389
408
  return submitted.name
390
409
 
391
410
  def wait_for_job(self, job_name: str, timeout_seconds: int = 3600) -> dict:
@@ -458,6 +477,7 @@ class AzureWAAOrchestrator:
458
477
  max_steps_per_task: int = 15,
459
478
  on_worker_complete: Callable[[WorkerState], None] | None = None,
460
479
  cleanup_on_complete: bool = True,
480
+ timeout_hours: float = 4.0,
461
481
  ) -> list[BenchmarkResult]:
462
482
  """Run evaluation across multiple Azure VMs.
463
483
 
@@ -468,12 +488,14 @@ class AzureWAAOrchestrator:
468
488
  max_steps_per_task: Maximum steps per task.
469
489
  on_worker_complete: Callback when a worker finishes.
470
490
  cleanup_on_complete: Whether to delete VMs after completion.
491
+ timeout_hours: Maximum job duration in hours (default: 4). Jobs are
492
+ auto-canceled after this duration to prevent runaway costs.
471
493
 
472
494
  Returns:
473
495
  List of BenchmarkResult for all tasks.
474
496
  """
475
497
  # Load tasks
476
- from openadapt_ml.benchmarks.waa import WAAAdapter
498
+ from openadapt_evals import WAAMockAdapter as WAAAdapter
477
499
 
478
500
  adapter = WAAAdapter(waa_repo_path=self.waa_repo_path)
479
501
  if task_ids:
@@ -513,17 +535,21 @@ class AzureWAAOrchestrator:
513
535
 
514
536
  try:
515
537
  # Provision VMs in parallel
516
- print(f"[2/4] Provisioning {num_workers} Azure VM(s)... (this takes 3-5 minutes)")
538
+ print(
539
+ f"[2/4] Provisioning {num_workers} Azure VM(s)... (this takes 3-5 minutes)"
540
+ )
517
541
  self._provision_workers(workers)
518
- print(f" VM(s) ready")
542
+ print(" VM(s) ready")
519
543
 
520
544
  # Submit jobs to workers
521
- print(f"[3/4] Submitting evaluation jobs...")
522
- self._submit_worker_jobs(workers, task_batches, agent, max_steps_per_task)
523
- print(f" Jobs submitted")
545
+ print("[3/4] Submitting evaluation jobs...")
546
+ self._submit_worker_jobs(
547
+ workers, task_batches, agent, max_steps_per_task, timeout_hours
548
+ )
549
+ print(" Jobs submitted")
524
550
 
525
551
  # Wait for completion and collect results
526
- print(f"[4/4] Waiting for workers to complete...")
552
+ print("[4/4] Waiting for workers to complete...")
527
553
  results = self._wait_and_collect_results(workers, on_worker_complete)
528
554
 
529
555
  self._current_run.status = "completed"
@@ -577,8 +603,17 @@ class AzureWAAOrchestrator:
577
603
  task_batches: list[list[BenchmarkTask]],
578
604
  agent: BenchmarkAgent,
579
605
  max_steps: int,
606
+ timeout_hours: float = 4.0,
580
607
  ) -> None:
581
- """Submit evaluation jobs to workers."""
608
+ """Submit evaluation jobs to workers.
609
+
610
+ Args:
611
+ workers: List of worker states.
612
+ task_batches: Task batches for each worker.
613
+ agent: Agent to run.
614
+ max_steps: Maximum steps per task.
615
+ timeout_hours: Maximum job duration in hours.
616
+ """
582
617
  for worker, tasks in zip(workers, task_batches):
583
618
  if worker.status == "failed":
584
619
  continue
@@ -591,7 +626,7 @@ class AzureWAAOrchestrator:
591
626
  # Build command
592
627
  command = self._build_worker_command(task_ids_json, max_steps, agent)
593
628
 
594
- # Submit job
629
+ # Submit job with timeout
595
630
  self.ml_client.submit_job(
596
631
  compute_name=worker.compute_name,
597
632
  command=command,
@@ -600,6 +635,7 @@ class AzureWAAOrchestrator:
600
635
  "WAA_MAX_STEPS": str(max_steps),
601
636
  },
602
637
  display_name=f"waa-worker-{worker.worker_id}",
638
+ timeout_hours=timeout_hours,
603
639
  )
604
640
  worker.status = "running"
605
641
  worker.start_time = time.time()
@@ -625,9 +661,11 @@ class AzureWAAOrchestrator:
625
661
  # TODO: Serialize agent config and pass to remote worker
626
662
  # For now, workers use a default agent configuration
627
663
  _ = agent # Reserved for agent serialization
664
+ # WAA Docker image has client at /client (see Dockerfile-WinArena)
665
+ # The run.py script is at /client/run.py (not a module, so use python run.py)
628
666
  return f"""
629
- cd /workspace/WindowsAgentArena && \
630
- python -m client.run \
667
+ cd /client && \
668
+ python run.py \
631
669
  --task_ids '{task_ids_json}' \
632
670
  --max_steps {max_steps} \
633
671
  --output_dir /outputs