openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: openadapt-ml
- Version: 0.1.0
+ Version: 0.2.1
  Summary: Model-agnostic, domain-agnostic ML engine for GUI automation agents
  Project-URL: Homepage, https://github.com/OpenAdaptAI/openadapt-ml
  Project-URL: Repository, https://github.com/OpenAdaptAI/openadapt-ml
@@ -13,18 +13,22 @@ Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
- Requires-Python: >=3.12
+ Requires-Python: >=3.10
  Requires-Dist: azure-ai-ml>=1.30.0
  Requires-Dist: azure-identity>=1.25.1
  Requires-Dist: bitsandbytes>=0.41.0
+ Requires-Dist: click>=8.1.0
  Requires-Dist: google-generativeai>=0.8.5
  Requires-Dist: matplotlib>=3.10.7
  Requires-Dist: openadapt-capture>=0.1.0
  Requires-Dist: peft>=0.18.0
  Requires-Dist: pillow>=12.0.0
+ Requires-Dist: pyautogui>=0.9.54
  Requires-Dist: pydantic-settings>=2.0.0
  Requires-Dist: pytest>=9.0.2
  Requires-Dist: pyyaml>=6.0.3
@@ -38,20 +42,38 @@ Requires-Dist: pydantic-settings>=2.0.0; extra == 'api'
  Provides-Extra: azure
  Requires-Dist: azure-ai-ml>=1.0.0; extra == 'azure'
  Requires-Dist: azure-identity>=1.0.0; extra == 'azure'
+ Provides-Extra: benchmarks
+ Requires-Dist: openadapt-evals>=0.1.1; extra == 'benchmarks'
  Provides-Extra: dev
  Requires-Dist: pytest>=9.0.0; extra == 'dev'
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
  Provides-Extra: lambda-labs
  Requires-Dist: requests>=2.28.0; extra == 'lambda-labs'
+ Provides-Extra: parquet
+ Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
+ Provides-Extra: training
+ Requires-Dist: datasets>=2.18.0; extra == 'training'
+ Requires-Dist: trl>=0.12.0; extra == 'training'
  Description-Content-Type: text/markdown

  # OpenAdapt-ML

+ [![Build Status](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml/badge.svg)](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml)
+ [![PyPI version](https://img.shields.io/pypi/v/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
+ [![Downloads](https://img.shields.io/pypi/dm/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
- [![Python Version](https://img.shields.io/badge/python-3.12-blue)](https://www.python.org/)
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12%2B-blue)](https://www.python.org/downloads/)

  OpenAdapt-ML is a **model-agnostic, domain-agnostic ML engine** for GUI
- automation agents.
+ automation agents. It sits above **TRL + Unsloth** (which we use directly for training performance) and provides the GUI-specific layer:
+
+ - **Episode semantics**: Step/action/observation alignment, screenshot-action coupling, termination handling
+ - **Demo-conditioned inference**: Retrieval-augmented prompting (in early experiments: 46.7% -> 100% first-action accuracy on a controlled macOS benchmark where all 45 tasks share the same navigation entry point - see [publication roadmap](https://github.com/OpenAdaptAI/OpenAdapt/blob/main/docs/publication-roadmap.md) for methodology and limitations)
+ - **Benchmark adapters**: WAA today, OSWorld/WebArena planned
+ - **VLM adapters**: Supports open-source GUI-agent models (Qwen3-VL, Qwen2.5-VL)
+ - **Training pipeline**: TRL + Unsloth integration for 2x faster training with 50% less VRAM
+
+ OpenAdapt-ML is **not** a training framework, optimizer, hardware orchestrator, or experiment manager. We use TRL/Unsloth, Lambda Labs/Azure, and W&B/MLflow for those.

  It provides:

@@ -59,24 +81,41 @@ It provides:
  - **Synthetic semantic UI generation** for bootstrapping datasets.
  - **Dataset builders** that turn episodes into next-action SFT samples.
  - **VLM adapters** (Qwen3-VL, Qwen2.5-VL) using Hugging Face + PEFT.
- - A minimal **supervised training loop** for fine-tuning.
+ - **SFT training via TRL** with Unsloth optimizations for efficient fine-tuning.
  - A simple **runtime policy** API that predicts the next GUI action.

  The design is described in detail in [`docs/design.md`](docs/design.md).

  ---

- ## 1. Quickstart
+ ## 1. Installation

- ### 1.1 Install dependencies
+ ### 1.1 From PyPI (recommended)

- From the repository root:
+ ```bash
+ # Install the package
+ uv add openadapt-ml
+
+ # For training with TRL (recommended for fine-tuning)
+ uv add openadapt-ml[training]
+
+ # For API-backed VLMs (Claude, GPT)
+ uv add openadapt-ml[api]
+ ```
+
+ ### 1.2 From source (development)

  ```bash
+ git clone https://github.com/OpenAdaptAI/openadapt-ml.git
+ cd openadapt-ml
  uv sync
  ```

- ### 1.2 Run a small demo policy
+ ---
+
+ ## 2. Quickstart
+
+ ### 2.1 Run a small demo policy

  Run a fast, model-free smoke test:

@@ -84,7 +123,7 @@ Run a fast, model-free smoke test:
  uv run python -m openadapt_ml.scripts.demo_policy --backend dummy
  ```

- ### 1.3 Run the synthetic login benchmark (end-to-end)
+ ### 2.2 Run the synthetic login benchmark (end-to-end)

  On a machine with a suitable GPU, you can reproduce the Qwen3-VL synthetic
  login benchmark (train → eval base/FT → plot) with a single command:
@@ -138,7 +177,7 @@ For complete documentation including training setup, evaluation metrics, SoM mod

  ---

- ## 2. Repository Structure
+ ## 3. Repository Structure

  Key modules:

@@ -159,9 +198,9 @@ Key modules:
  - `openadapt_ml/models/dummy_adapter.py`
  - Tiny fake adapter used to validate training and runtime flows without
  loading a real VLM.
- - `openadapt_ml/training/trainer.py`
- - Minimal supervised training loop (`train_supervised`) with gradient
- accumulation and logging.
+ - `openadapt_ml/training/trl_trainer.py`
+ - TRL-based SFT training (`train_with_trl`) with Unsloth optimizations
+ for 2x faster training and 50% less VRAM.
  - `openadapt_ml/runtime/policy.py`
  - `AgentPolicy` that formats inputs for a VLM and parses textual actions
  like `CLICK(x=..., y=...)` and `DONE()` into structured `Action`s.
@@ -184,12 +223,12 @@ Configs and docs:

  ---

- ## 3. Environment Setup
+ ## 4. Environment Setup

  OpenAdapt-ML targets **Python 3.12** and uses [`uv`](https://github.com/astral-sh/uv)
  for dependency management.

- ### 2.1 Install and sync
+ ### 4.1 Install and sync

  From the repository root:

@@ -202,7 +241,7 @@ uv sync
  This will create a virtual environment (e.g. `.venv/`) and install all
  packages declared in `pyproject.toml`.

- ### 2.2 Working inside the environment
+ ### 4.2 Working inside the environment

  Use `uv run` to execute Python modules and scripts with the synced
  environment:
@@ -215,15 +254,17 @@ You can also run `pytest` or other tools via `uv run`.

  ---

- ## 4. Synthetic Data & Datasets
+ ## 5. Synthetic Data & Datasets

  The v1 pipeline is validated on **synthetic, semantic UIs**, starting with a
  simple login flow.

- ### 3.1 Synthetic scenarios
+ ### 5.1 Synthetic scenarios

  OpenAdapt-ML includes synthetic UI generators for structured GUI automation benchmarks.
- Currently two scenarios are supported:
+ Currently two scenarios are supported.
+
+ > **Note:** These are **synthetic, controlled benchmarks** designed for rapid iteration and debugging, not real-world evaluation. The 100% accuracy results below demonstrate that fine-tuning works on simple scenarios with known ground truth - they do not represent performance on production UIs or standard benchmarks like WAA. See section 14 (Limitations) for details.

  #### Login Scenario (6 steps, 3 elements)

@@ -255,7 +296,7 @@ A more complex registration form with first name, last name, email, password, co
  | Episode Success Rate | **100%** |
  | Episodes / Steps | 32 / 384 |

- ### 3.2 Generating synthetic data
+ ### 5.2 Generating synthetic data

  Synthetic data is generated on the fly by `generate_synthetic_sessions` in
  `openadapt_ml/ingest/synthetic.py` and used internally by the training
@@ -286,7 +327,7 @@ Each session contains episodes with:
  - An observation (screenshot path).
  - An action (e.g. `CLICK`, `TYPE`, `DONE`).

- ### 3.3 Next-action SFT samples
+ ### 5.3 Next-action SFT samples

  Episodes are converted into SFT-style samples by
  `build_next_action_sft_samples` in `openadapt_ml/datasets/next_action.py`.
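As a rough sketch of how these two pieces might be chained, the following uses the module paths and function names cited in this README; the argument names and return shapes are assumptions, not the exact signatures:

```python
# Illustrative sketch only: the real signatures live in
# openadapt_ml/ingest/synthetic.py and openadapt_ml/datasets/next_action.py.
from openadapt_ml.ingest.synthetic import generate_synthetic_sessions
from openadapt_ml.datasets.next_action import build_next_action_sft_samples

# Generate a handful of synthetic login sessions (argument names assumed).
sessions = generate_synthetic_sessions(scenario="login", num_sessions=4)

# Flatten sessions into episodes, then into next-action SFT samples:
# each sample pairs a screenshot and goal with the next action string.
episodes = [episode for session in sessions for episode in session.episodes]
samples = build_next_action_sft_samples(episodes)

print(f"{len(samples)} SFT samples from {len(episodes)} episodes")
```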
@@ -312,21 +353,20 @@ and its invariants, see `docs/design.md` §7.4.

  ---

- ## 5. Training
+ ## 6. Training

- Training is driven by `openadapt_ml/scripts/train.py` and YAML configs under
- `configs/`.
+ Training uses **TRL (Transformer Reinforcement Learning)** with **Unsloth** optimizations
+ for efficient VLM fine-tuning. This provides 2x faster training with 50% less VRAM compared
+ to standard approaches.

- The training script:
+ The training pipeline:

- 1. Loads a config file (YAML).
- 2. Generates synthetic sessions.
- 3. Flattens to episodes and builds SFT samples.
- 4. Wraps them in a `NextActionDataset`.
- 5. Instantiates a VLM adapter (e.g. `QwenVLAdapter`).
- 6. Runs `train_supervised` over the dataset.
+ 1. Loads episodes from synthetic data or real recordings.
+ 2. Converts to TRL-compatible SFT format with images and chat messages.
+ 3. Fine-tunes using SFTTrainer with LoRA adapters.
+ 4. Generates checkpoints and training logs for visualization.

- ### 4.1 Qwen3-VL synthetic training
+ ### 6.1 Qwen3-VL synthetic training

  Config: `configs/qwen3vl_synthetic.yaml`

@@ -353,7 +393,7 @@ This will:
  - Run a single-epoch supervised fine-tuning loop.
  - Print loss values as training progresses.

- ### 4.2 Qwen2.5-VL synthetic training
+ ### 6.2 Qwen2.5-VL synthetic training

  Config: `configs/qwen2_5vl_synthetic.yaml`

@@ -378,7 +418,7 @@ format expected by the Qwen2.5-VL processor.
  > Note: Both configs are sized for **small synthetic smoke runs**, not
  > large-scale production training.

- ### 4.3 Qwen3-VL synthetic login benchmark (hero example)
+ ### 6.3 Qwen3-VL synthetic login benchmark (hero example)

  OpenAdapt-ML ships a **synthetic login** benchmark backed by Qwen3-VL,
  used to compare **base vs LoRA-fine-tuned** models on a hardened synthetic
@@ -407,15 +447,18 @@ It exposes step-level performance metrics, which let us visually answer the ques
  | Claude Sonnet 4.5 | API | 0.121 | 0.757 | 0.000 |
  | GPT-5.1 | API | 0.183 | 0.057 | 0.600 |

- **Key findings:**
- 1. **Fine-tuning delivers massive gains**: Both 2B and 8B models show 2-3x improvement in action accuracy after fine-tuning
- 2. **Small fine-tuned models beat large APIs**: Qwen3-VL-2B FT (469% base) outperforms both Claude Sonnet 4.5 (121%) and GPT-5.1 (183%)
- 3. **Precision matters**: Fine-tuned models have excellent click precision (85-100% hit rate, <0.05 coord error) while API models struggle with the action format
- 4. **Size vs specialization**: The fine-tuned 2B model outperforms the general-purpose Claude Sonnet 4.5, showing that domain-specific fine-tuning trumps raw model size
+ **Observations on synthetic login benchmark:**

- ### 4.4 Set-of-Marks (SoM) Mode: 100% Accuracy
+ > **Important:** These findings are from a synthetic benchmark with ~3 UI elements and a fixed action sequence. They demonstrate the training pipeline works, but should not be extrapolated to real-world GUI automation performance. Evaluation on standard benchmarks (WAA, WebArena) is ongoing.

- With **Set-of-Marks** visual prompting, fine-tuned Qwen3-VL-2B achieves **100% accuracy** on both login (6-step) and registration (12-step) scenarios:
+ 1. **Fine-tuning improves synthetic task performance**: Both 2B and 8B models show 2-3x improvement in action accuracy after fine-tuning on this specific task
+ 2. **On this synthetic benchmark, fine-tuned models outperform zero-shot API calls**: This is expected since the task is simple and the models are trained on it directly
+ 3. **Coordinate precision is learnable**: Fine-tuned models achieve low coordinate error on training distribution
+ 4. **API models struggle with custom action format**: Without fine-tuning on the specific DSL (CLICK/TYPE/DONE), API models have high format-error rates
+
+ ### 6.4 Set-of-Marks (SoM) Mode: 100% Accuracy on Synthetic Benchmarks
+
+ With **Set-of-Marks** visual prompting, fine-tuned Qwen3-VL-2B achieves **100% accuracy** on both login (6-step) and registration (12-step) synthetic scenarios. Note that these are controlled, toy benchmarks with a small number of UI elements:

  | Scenario | Steps | Elements | Action Acc | Element Acc | Episode Success |
  |----------|-------|----------|------------|-------------|-----------------|
@@ -452,11 +495,11 @@ For the full SoM investigation report, see [`experiments/qwen_login/SOM_INVESTIG

  ---

- ## 6. Grounding Module
+ ## 7. Grounding Module

  OpenAdapt-ML includes a **grounding module** for locating UI elements on screenshots using natural language descriptions. This enables policy/grounding separation where the policy decides *what* to do and the grounder finds *where* to do it.

- ### 6.1 GeminiGrounder Demo
+ ### 7.1 GeminiGrounder Demo

  The `GeminiGrounder` uses Google's Gemini vision API to locate UI elements:

@@ -475,7 +518,7 @@ if candidates:
  print(f"Found at {best.centroid} with {best.confidence:.0%} confidence")
  ```

- ### 6.2 Set-of-Marks (SoM) Support
+ ### 7.2 Set-of-Marks (SoM) Support

  The grounding module includes functions for extracting all UI elements and overlaying numbered labels (Set-of-Marks):

@@ -497,7 +540,7 @@ This enables element-based actions using indices instead of coordinates:

  See `docs/gemini_grounding.md` for full documentation and `examples/test_gemini_grounding.py` for a complete example.

- ### 6.3 Available Grounders
+ ### 7.3 Available Grounders

  | Grounder | Description | Latency | Use Case |
  |----------|-------------|---------|----------|
@@ -505,7 +548,7 @@ See `docs/gemini_grounding.md` for full documentation and `examples/test_gemini_
  | `OracleGrounder` | Ground-truth bboxes | ~0ms | Evaluation |
  | `DetectorGrounder` | Generic wrapper with backend selection | varies | Flexible |

- ### 6.4 Grounding Evaluation
+ ### 7.4 Grounding Evaluation

  The `openadapt_ml.evals.grounding` module provides metrics for evaluating grounding accuracy:

@@ -523,7 +566,7 @@ print(metrics)

  ---

- ## 7. VLM Adapters
+ ## 8. VLM Adapters

  All VLM backends implement the shared `BaseVLMAdapter` interface in
  `openadapt_ml/models/base_adapter.py` (prepare inputs, compute loss, generate
@@ -542,7 +585,7 @@ Current adapters include:
  For full adapter internals and training-time vs runtime behavior, see
  `docs/design.md` §8.

- ### 7.1 API-backed adapters
+ ### 8.1 API-backed adapters

  To use the API-backed adapter from Python, you can configure API keys via `.env`
  file, environment variables, or pass them explicitly:
@@ -565,12 +608,12 @@ The existing CLI scripts `scripts/demo_policy.py` and

  ---

- ## 8. Runtime Policy & Demos
+ ## 9. Runtime Policy & Demos

  The runtime policy is implemented in `openadapt_ml/runtime/policy.py` as
  `AgentPolicy`.

- ### 8.1 AgentPolicy
+ ### 9.1 AgentPolicy

  `AgentPolicy` is initialized with a VLM adapter (dummy or real). Given an
  SFT-style sample, it:
@@ -581,7 +624,7 @@ SFT-style sample, it:
  - `DONE()`
  3. Returns a structured `Action` plus an optional free-form `thought`.

- ### 8.2 Demo script
+ ### 9.2 Demo script

  `openadapt_ml/scripts/demo_policy.py` demonstrates how to use
  `AgentPolicy` with different backends.
@@ -613,7 +656,7 @@ Each invocation will:

  ---

- ## 9. Testing
+ ## 10. Testing

  Basic tests are provided under `tests/`.

@@ -623,26 +666,26 @@ Run the test suite with:
  uv run pytest
  ```

- In particular:
+ Key test files:

- - `tests/test_training_dummy.py` runs a smoke test over the training loop
- using `DummyAdapter`.
+ - `tests/test_training_dummy.py` - Tests TRL training configuration and sample conversion
+ - `tests/test_local_cli.py` - Tests local training CLI commands (status, check, viewer)

  ---

- ## 10. Training on Real Data
+ ## 11. Training on Real Data

  OpenAdapt-ML supports training on real GUI recordings from two sources:
  1. **openadapt-capture** - New lightweight recording format
  2. **OpenAdapt database** - Original OpenAdapt recordings (legacy)

- ### 10.1 Training on openadapt-capture recordings
+ ### 11.1 Training on openadapt-capture recordings

  [openadapt-capture](https://github.com/OpenAdaptAI/openadapt-capture) is a lightweight GUI recording tool.

  ```bash
  # Install openadapt-capture
- uv pip install openadapt-capture
+ uv add openadapt-capture

  # Record a workflow (e.g., turning off Night Shift)
  openadapt-capture record --output ~/captures/turn-off-nightshift
@@ -656,7 +699,7 @@ uv run python -m openadapt_ml.scripts.train \

  The goal is automatically derived from the directory name (e.g., `"Turn off nightshift"`).

- ### 10.2 Compare human vs AI predictions
+ ### 11.2 Compare human vs AI predictions

  ```bash
  uv run python -m openadapt_ml.scripts.compare \
@@ -673,11 +716,11 @@ The comparison viewer shows:

  ---

- ## 11. Local Training (CUDA / Apple Silicon)
+ ## 12. Local Training (CUDA / Apple Silicon)

  Train locally on your own GPU. Auto-detects CUDA or Apple Silicon (MPS).

- ### 11.1 Quick start
+ ### 12.1 Quick start

  ```bash
  # Train on a capture (auto-detects device and config)
@@ -686,7 +729,7 @@ uv run python -m openadapt_ml.cloud.local train \
  --open # Opens dashboard in browser
  ```

- ### 11.2 Training workflow
+ ### 12.2 Training workflow

  ```bash
  # Check device and training status
@@ -713,11 +756,11 @@ uv run python -m openadapt_ml.cloud.local compare \

  ---

- ## 12. Cloud GPU Training (Lambda Labs)
+ ## 13. Cloud GPU Training (Lambda Labs)

  For faster training on powerful GPUs, use Lambda Labs. Full documentation: [`docs/cloud_gpu_training.md`](docs/cloud_gpu_training.md).

- ### 12.1 Quick start
+ ### 13.1 Quick start

  ```bash
  # Set API key
@@ -729,7 +772,7 @@ uv run python -m openadapt_ml.cloud.lambda_labs train \
  --goal "Turn off Night Shift in System Settings"
  ```

- ### 12.2 Manual workflow
+ ### 13.2 Manual workflow

  ```bash
  # List available instances and pricing
@@ -751,7 +794,7 @@ uv run python -m openadapt_ml.cloud.lambda_labs download <instance_id>
  uv run python -m openadapt_ml.cloud.lambda_labs terminate <instance_id>
  ```

- ### 12.3 Training visualization
+ ### 13.3 Training visualization

  The training process generates:
  - **`training_output/dashboard.html`** - Real-time training dashboard with loss curves
@@ -790,9 +833,203 @@ uv run python -m openadapt_ml.cloud.local serve --port 8080 --open
  - `Home` / `End` - First/last frame
  - `O` - Toggle click overlay

+ **Benchmark Viewer:**
+
+ ![Benchmark Viewer](docs/images/benchmark_viewer.png)
+
+ *View benchmark evaluation results with task-level filtering, success/failure status, and run comparison. Shows Claude achieving 30% on mock evaluation tasks (simulated environment for testing the pipeline - real WAA evaluation requires Windows VMs).*
+
+ ### 13.4 VM Monitoring Dashboard
+
+ For managing Azure VMs used in benchmark evaluations, the `vm monitor` command provides a comprehensive dashboard:
+
+ ```bash
+ # Start VM monitoring dashboard (auto-opens browser)
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor
+
+ # Show detailed information (evaluation history, daily/weekly costs)
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor --details
+ ```
+
+ **VM Monitor Dashboard (Full View):**
+
+ ![VM Monitor Dashboard](docs/screenshots/vm_monitor_dashboard_full.png)
+
+ *The VM monitor dashboard shows: (1) VM status (name, IP, size, state), (2) Current activity (idle/benchmark running), (3) Cost tracking (uptime, hourly rate, total cost), (4) Recent Azure ML jobs from last 7 days, and (6) Dashboard & access URLs.*
+
+ **VM Monitor Dashboard (With --details Flag):**
+
+ ![VM Monitor Dashboard Details](docs/screenshots/vm_monitor_details.png)
+
+ *The --details flag adds: (5) Evaluation history with success rates and agent types, plus extended cost information (daily/weekly projections).*
+
+ **Features:**
+ - **Real-time VM status** - Shows VM size, power state, and IP address
+ - **Activity detection** - Identifies if VM is idle, running benchmarks, or in setup
+ - **Cost tracking** - Displays uptime hours, hourly rate, and total cost for current session
+ - **Azure ML jobs** - Lists recent jobs from last 7 days with status indicators
+ - **Evaluation history** - Shows past benchmark runs with success rates (with --details flag)
+ - **Dashboard & tunnels** - Auto-starts web dashboard and SSH/VNC tunnels for accessing Windows VM
+
+ **Mock mode for testing:**
+ ```bash
+ # Generate screenshots or test dashboard without a VM running
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor --mock
+ ```
+
+ **Auto-shutdown option:**
+ ```bash
+ # Automatically deallocate VM after 2 hours to prevent runaway costs
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor --auto-shutdown-hours 2
+ ```
+
+ ### 13.5 Benchmark Execution Logs
+
+ View benchmark execution progress and logs:
+
+ ```bash
+ # View WAA container status and Docker logs
+ uv run python -m openadapt_ml.benchmarks.cli logs
+
+ # View WAA benchmark execution logs (task progress, agent actions)
+ uv run python -m openadapt_ml.benchmarks.cli logs --run
+
+ # Stream execution logs live
+ uv run python -m openadapt_ml.benchmarks.cli logs --run -f
+
+ # Show last N lines of execution logs
+ uv run python -m openadapt_ml.benchmarks.cli logs --run --tail 100
+
+ # Show benchmark progress and ETA
+ uv run python -m openadapt_ml.benchmarks.cli logs --progress
+ ```
+
+ **Example: Container status (`logs`)**
+ ```
+ WAA Status (20.12.180.208)
+ ============================================================
+
+ [Docker Images]
+ REPOSITORY TAG SIZE
+ waa-auto latest 25.4GB
+ windowsarena/winarena latest 25.8GB
+
+ [Container]
+ Status: Up 49 minutes
+
+ [Storage]
+ Total: 21G
+ Disk image: 64G
+
+ [QEMU VM]
+ Status: Running (PID 1471)
+ CPU: 176%, MEM: 51.6%, Uptime: 47:28
+
+ [WAA Server]
+ "status": "Probe successful"
+ (READY)
+ ```
+
+ **Example: Benchmark execution logs (`logs --run -f`)**
+ ```
+ Run log: /home/azureuser/cli_logs/run_20260128_175507.log
+ ------------------------------------------------------------
+ Streaming log (Ctrl+C to stop)...
+
+ [2026-01-28 23:05:10,303 INFO agent/401-MainProcess] Thinking...
+ [2026-01-28 23:05:17,318 INFO python/62-MainProcess] Updated computer successfully
+ [2026-01-28 23:05:17,318 INFO lib_run_single/56-MainProcess] Step 9: computer.window_manager.switch_to_application("Summer Trip - File Explorer")
+ ```
+
+ **Example: Benchmark progress (`logs --progress`)**
+ ```
+ === WAA Benchmark Progress ===
+
+ Log: /home/azureuser/cli_logs/run_20260128_175507.log
+ Started: 2026-01-28 22:55:14
+ Latest: 2026-01-28 23:28:37
+
+ Tasks completed: 1 / 154
+ Elapsed: 33 minutes
+
+ Avg time per task: ~33 min
+ Remaining tasks: 153
+ Estimated remaining: ~84h 9m
+
+ Progress: 0% [1/154]
+ ```
+
+ **Other useful commands:**
+ ```bash
+ # Check WAA server status (probe endpoint)
+ uv run python -m openadapt_ml.benchmarks.cli probe
+
+ # Check VM/Azure status
+ uv run python -m openadapt_ml.benchmarks.cli status
+
+ # Download benchmark results from VM
+ uv run python -m openadapt_ml.benchmarks.cli download
+
+ # Analyze downloaded results
+ uv run python -m openadapt_ml.benchmarks.cli analyze
+ ```
+
+ **Running benchmarks:**
+ ```bash
+ # Run full benchmark (154 tasks)
+ uv run python -m openadapt_ml.benchmarks.cli run --num-tasks 154
+
+ # Run specific domain
+ uv run python -m openadapt_ml.benchmarks.cli run --domain notepad --num-tasks 5
+
+ # Run single task
+ uv run python -m openadapt_ml.benchmarks.cli run --task notepad_1
+ ```
+
+ For complete VM management commands and Azure setup instructions, see [`CLAUDE.md`](CLAUDE.md) and [`docs/azure_waa_setup.md`](docs/azure_waa_setup.md).
+
+ ### 13.6 Screenshot Capture Tool
+
+ Capture screenshots of dashboards and VMs for documentation and PR purposes:
+
+ ```bash
+ # Capture all available targets
+ uv run python -m openadapt_ml.benchmarks.cli screenshot
+
+ # List available targets
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --list
+
+ # Capture specific targets
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --target terminal
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --target azure-ops --target vnc
+
+ # Custom output directory
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --output /path/to/screenshots
+
+ # Without timestamp in filename
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --target terminal --no-timestamp
+ ```
+
+ **Available targets:**
+
+ | Target | Description |
+ |--------|-------------|
+ | `azure-ops` | Azure ops dashboard (localhost:8765) |
+ | `vnc` | VNC viewer (localhost:8006) - Windows VM |
+ | `terminal` | VM monitor terminal output (mock mode) |
+ | `terminal-live` | VM monitor terminal output (live, requires running VM) |
+ | `training` | Training dashboard (localhost:8080) |
+ | `vm-screen` | Windows VM screen capture via QEMU |
+
+ **Notes:**
+ - Terminal screenshots use PIL to render terminal output as PNG images
+ - Web page screenshots work best with playwright installed (`uv add playwright && playwright install chromium`)
+ - On macOS, interactive capture using `screencapture` is available as a fallback
+ - Screenshots are saved to `docs/screenshots/` by default with timestamps
+

  ---

- ## 13. Limitations & Notes
+ ## 14. Limitations & Notes

  - **Apple Silicon / bitsandbytes**:
  - Example configs are sized for CPU / Apple Silicon development runs; see
@@ -805,12 +1042,18 @@ uv run python -m openadapt_ml.cloud.local serve --port 8080 --open
  - **Evaluation**:
  - v1 focuses on smoke tests and qualitative behavior on synthetic data.
  More formal evaluation scripts and metrics are planned.
+ - **Windows Agent Arena (WAA) on Azure**:
+ - WAA requires nested virtualization (Windows VM inside Docker via QEMU)
+ - Azure ML managed compute does not support nested virtualization
+ - For real WAA evaluation, use dedicated VMs with Dv3/Ev3 series or run locally
+ - Mock evaluation (`test-mock`) validates the pipeline without Windows VMs
+ - See `CLAUDE.md` for detailed workarounds and infrastructure setup

  For deeper architectural details, see [`docs/design.md`](docs/design.md).

  ---

- ## 14. Roadmap
+ ## 15. Roadmap

  For the up-to-date, prioritized roadmap (including concrete implementation
  targets and agent-executable acceptance criteria), see