openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: openadapt-ml
- Version: 0.1.0
+ Version: 0.2.1
  Summary: Model-agnostic, domain-agnostic ML engine for GUI automation agents
  Project-URL: Homepage, https://github.com/OpenAdaptAI/openadapt-ml
  Project-URL: Repository, https://github.com/OpenAdaptAI/openadapt-ml
@@ -13,18 +13,22 @@ Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
- Requires-Python: >=3.12
+ Requires-Python: >=3.10
  Requires-Dist: azure-ai-ml>=1.30.0
  Requires-Dist: azure-identity>=1.25.1
  Requires-Dist: bitsandbytes>=0.41.0
+ Requires-Dist: click>=8.1.0
  Requires-Dist: google-generativeai>=0.8.5
  Requires-Dist: matplotlib>=3.10.7
  Requires-Dist: openadapt-capture>=0.1.0
  Requires-Dist: peft>=0.18.0
  Requires-Dist: pillow>=12.0.0
+ Requires-Dist: pyautogui>=0.9.54
  Requires-Dist: pydantic-settings>=2.0.0
  Requires-Dist: pytest>=9.0.2
  Requires-Dist: pyyaml>=6.0.3
@@ -38,20 +42,38 @@ Requires-Dist: pydantic-settings>=2.0.0; extra == 'api'
  Provides-Extra: azure
  Requires-Dist: azure-ai-ml>=1.0.0; extra == 'azure'
  Requires-Dist: azure-identity>=1.0.0; extra == 'azure'
+ Provides-Extra: benchmarks
+ Requires-Dist: openadapt-evals>=0.1.1; extra == 'benchmarks'
  Provides-Extra: dev
  Requires-Dist: pytest>=9.0.0; extra == 'dev'
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
  Provides-Extra: lambda-labs
  Requires-Dist: requests>=2.28.0; extra == 'lambda-labs'
+ Provides-Extra: parquet
+ Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
+ Provides-Extra: training
+ Requires-Dist: datasets>=2.18.0; extra == 'training'
+ Requires-Dist: trl>=0.12.0; extra == 'training'
  Description-Content-Type: text/markdown

  # OpenAdapt-ML

+ [![Build Status](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml/badge.svg)](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml)
+ [![PyPI version](https://img.shields.io/pypi/v/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
+ [![Downloads](https://img.shields.io/pypi/dm/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
- [![Python Version](https://img.shields.io/badge/python-3.12-blue)](https://www.python.org/)
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12%2B-blue)](https://www.python.org/downloads/)

  OpenAdapt-ML is a **model-agnostic, domain-agnostic ML engine** for GUI
- automation agents.
+ automation agents. It sits above **TRL + Unsloth** (which we use directly for training performance) and provides the GUI-specific layer:
+
+ - **Episode semantics**: Step/action/observation alignment, screenshot-action coupling, termination handling
+ - **Demo-conditioned inference**: Retrieval-augmented prompting (in early experiments: 46.7% -> 100% first-action accuracy on a controlled macOS benchmark where all 45 tasks share the same navigation entry point - see [publication roadmap](https://github.com/OpenAdaptAI/OpenAdapt/blob/main/docs/publication-roadmap.md) for methodology and limitations)
+ - **Benchmark adapters**: WAA today, OSWorld/WebArena planned
+ - **VLM adapters**: Supports open-source GUI-agent models (Qwen3-VL, Qwen2.5-VL)
+ - **Training pipeline**: TRL + Unsloth integration for 2x faster training with 50% less VRAM
+
+ OpenAdapt-ML is **not** a training framework, optimizer, hardware orchestrator, or experiment manager. We use TRL/Unsloth, Lambda Labs/Azure, and W&B/MLflow for those.

  It provides:

@@ -59,24 +81,41 @@ It provides:
  - **Synthetic semantic UI generation** for bootstrapping datasets.
  - **Dataset builders** that turn episodes into next-action SFT samples.
  - **VLM adapters** (Qwen3-VL, Qwen2.5-VL) using Hugging Face + PEFT.
- - A minimal **supervised training loop** for fine-tuning.
+ - **SFT training via TRL** with Unsloth optimizations for efficient fine-tuning.
  - A simple **runtime policy** API that predicts the next GUI action.

  The design is described in detail in [`docs/design.md`](docs/design.md).

  ---

- ## 1. Quickstart
+ ## 1. Installation

- ### 1.1 Install dependencies
+ ### 1.1 From PyPI (recommended)

- From the repository root:
+ ```bash
+ # Install the package
+ uv add openadapt-ml
+
+ # For training with TRL (recommended for fine-tuning)
+ uv add openadapt-ml[training]
+
+ # For API-backed VLMs (Claude, GPT)
+ uv add openadapt-ml[api]
+ ```
+
+ ### 1.2 From source (development)

  ```bash
+ git clone https://github.com/OpenAdaptAI/openadapt-ml.git
+ cd openadapt-ml
  uv sync
  ```

- ### 1.2 Run a small demo policy
+ ---
+
+ ## 2. Quickstart
+
+ ### 2.1 Run a small demo policy

  Run a fast, model-free smoke test:

@@ -84,7 +123,7 @@ Run a fast, model-free smoke test:
  uv run python -m openadapt_ml.scripts.demo_policy --backend dummy
  ```

- ### 1.3 Run the synthetic login benchmark (end-to-end)
+ ### 2.2 Run the synthetic login benchmark (end-to-end)

  On a machine with a suitable GPU, you can reproduce the Qwen3-VL synthetic
  login benchmark (train → eval base/FT → plot) with a single command:
@@ -138,7 +177,7 @@ For complete documentation including training setup, evaluation metrics, SoM mod

  ---

- ## 2. Repository Structure
+ ## 3. Repository Structure

  Key modules:

@@ -159,9 +198,9 @@ Key modules:
  - `openadapt_ml/models/dummy_adapter.py`
  - Tiny fake adapter used to validate training and runtime flows without
  loading a real VLM.
- - `openadapt_ml/training/trainer.py`
- - Minimal supervised training loop (`train_supervised`) with gradient
- accumulation and logging.
+ - `openadapt_ml/training/trl_trainer.py`
+ - TRL-based SFT training (`train_with_trl`) with Unsloth optimizations
+ for 2x faster training and 50% less VRAM.
  - `openadapt_ml/runtime/policy.py`
  - `AgentPolicy` that formats inputs for a VLM and parses textual actions
  like `CLICK(x=..., y=...)` and `DONE()` into structured `Action`s.
@@ -184,12 +223,12 @@ Configs and docs:

  ---

- ## 3. Environment Setup
+ ## 4. Environment Setup

  OpenAdapt-ML targets **Python 3.12** and uses [`uv`](https://github.com/astral-sh/uv)
  for dependency management.

- ### 2.1 Install and sync
+ ### 4.1 Install and sync

  From the repository root:

@@ -202,7 +241,7 @@ uv sync
  This will create a virtual environment (e.g. `.venv/`) and install all
  packages declared in `pyproject.toml`.

- ### 2.2 Working inside the environment
+ ### 4.2 Working inside the environment

  Use `uv run` to execute Python modules and scripts with the synced
  environment:
@@ -215,15 +254,17 @@ You can also run `pytest` or other tools via `uv run`.

  ---

- ## 4. Synthetic Data & Datasets
+ ## 5. Synthetic Data & Datasets

  The v1 pipeline is validated on **synthetic, semantic UIs**, starting with a
  simple login flow.

- ### 3.1 Synthetic scenarios
+ ### 5.1 Synthetic scenarios

  OpenAdapt-ML includes synthetic UI generators for structured GUI automation benchmarks.
- Currently two scenarios are supported:
+ Currently two scenarios are supported.
+
+ > **Note:** These are **synthetic, controlled benchmarks** designed for rapid iteration and debugging, not real-world evaluation. The 100% accuracy results below demonstrate that fine-tuning works on simple scenarios with known ground truth - they do not represent performance on production UIs or standard benchmarks like WAA. See section 14 (Limitations) for details.

  #### Login Scenario (6 steps, 3 elements)

@@ -255,7 +296,7 @@ A more complex registration form with first name, last name, email, password, co
  | Episode Success Rate | **100%** |
  | Episodes / Steps | 32 / 384 |

- ### 3.2 Generating synthetic data
+ ### 5.2 Generating synthetic data

  Synthetic data is generated on the fly by `generate_synthetic_sessions` in
  `openadapt_ml/ingest/synthetic.py` and used internally by the training
@@ -286,7 +327,7 @@ Each session contains episodes with:
  - An observation (screenshot path).
  - An action (e.g. `CLICK`, `TYPE`, `DONE`).

- ### 3.3 Next-action SFT samples
+ ### 5.3 Next-action SFT samples

  Episodes are converted into SFT-style samples by
  `build_next_action_sft_samples` in `openadapt_ml/datasets/next_action.py`.
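As a rough sketch of how these two pieces might be chained, the following uses the module paths and function names cited in this README; the argument names and return shapes are assumptions, not the exact signatures:

```python
# Illustrative sketch only: the real signatures live in
# openadapt_ml/ingest/synthetic.py and openadapt_ml/datasets/next_action.py.
from openadapt_ml.ingest.synthetic import generate_synthetic_sessions
from openadapt_ml.datasets.next_action import build_next_action_sft_samples

# Generate a handful of synthetic login sessions (argument names assumed).
sessions = generate_synthetic_sessions(scenario="login", num_sessions=4)

# Flatten sessions into episodes, then into next-action SFT samples:
# each sample pairs a screenshot and goal with the next action string.
episodes = [episode for session in sessions for episode in session.episodes]
samples = build_next_action_sft_samples(episodes)

print(f"{len(samples)} SFT samples from {len(episodes)} episodes")
```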
@@ -312,21 +353,20 @@ and its invariants, see `docs/design.md` §7.4.

  ---

- ## 5. Training
+ ## 6. Training

- Training is driven by `openadapt_ml/scripts/train.py` and YAML configs under
- `configs/`.
+ Training uses **TRL (Transformer Reinforcement Learning)** with **Unsloth** optimizations
+ for efficient VLM fine-tuning. This provides 2x faster training with 50% less VRAM compared
+ to standard approaches.

- The training script:
+ The training pipeline:

- 1. Loads a config file (YAML).
- 2. Generates synthetic sessions.
- 3. Flattens to episodes and builds SFT samples.
- 4. Wraps them in a `NextActionDataset`.
- 5. Instantiates a VLM adapter (e.g. `QwenVLAdapter`).
- 6. Runs `train_supervised` over the dataset.
+ 1. Loads episodes from synthetic data or real recordings.
+ 2. Converts to TRL-compatible SFT format with images and chat messages.
+ 3. Fine-tunes using SFTTrainer with LoRA adapters.
+ 4. Generates checkpoints and training logs for visualization.

- ### 4.1 Qwen3-VL synthetic training
+ ### 6.1 Qwen3-VL synthetic training

  Config: `configs/qwen3vl_synthetic.yaml`

@@ -353,7 +393,7 @@ This will:
  - Run a single-epoch supervised fine-tuning loop.
  - Print loss values as training progresses.

- ### 4.2 Qwen2.5-VL synthetic training
+ ### 6.2 Qwen2.5-VL synthetic training

  Config: `configs/qwen2_5vl_synthetic.yaml`

@@ -378,7 +418,7 @@ format expected by the Qwen2.5-VL processor.
  > Note: Both configs are sized for **small synthetic smoke runs**, not
  > large-scale production training.

- ### 4.3 Qwen3-VL synthetic login benchmark (hero example)
+ ### 6.3 Qwen3-VL synthetic login benchmark (hero example)

  OpenAdapt-ML ships a **synthetic login** benchmark backed by Qwen3-VL,
  used to compare **base vs LoRA-fine-tuned** models on a hardened synthetic
@@ -407,15 +447,18 @@ It exposes step-level performance metrics, which let us visually answer the ques
  | Claude Sonnet 4.5 | API | 0.121 | 0.757 | 0.000 |
  | GPT-5.1 | API | 0.183 | 0.057 | 0.600 |

- **Key findings:**
- 1. **Fine-tuning delivers massive gains**: Both 2B and 8B models show 2-3x improvement in action accuracy after fine-tuning
- 2. **Small fine-tuned models beat large APIs**: Qwen3-VL-2B FT (469% base) outperforms both Claude Sonnet 4.5 (121%) and GPT-5.1 (183%)
- 3. **Precision matters**: Fine-tuned models have excellent click precision (85-100% hit rate, <0.05 coord error) while API models struggle with the action format
- 4. **Size vs specialization**: The fine-tuned 2B model outperforms the general-purpose Claude Sonnet 4.5, showing that domain-specific fine-tuning trumps raw model size
+ **Observations on synthetic login benchmark:**

- ### 4.4 Set-of-Marks (SoM) Mode: 100% Accuracy
+ > **Important:** These findings are from a synthetic benchmark with ~3 UI elements and a fixed action sequence. They demonstrate the training pipeline works, but should not be extrapolated to real-world GUI automation performance. Evaluation on standard benchmarks (WAA, WebArena) is ongoing.

- With **Set-of-Marks** visual prompting, fine-tuned Qwen3-VL-2B achieves **100% accuracy** on both login (6-step) and registration (12-step) scenarios:
+ 1. **Fine-tuning improves synthetic task performance**: Both 2B and 8B models show 2-3x improvement in action accuracy after fine-tuning on this specific task
+ 2. **On this synthetic benchmark, fine-tuned models outperform zero-shot API calls**: This is expected since the task is simple and the models are trained on it directly
+ 3. **Coordinate precision is learnable**: Fine-tuned models achieve low coordinate error on training distribution
+ 4. **API models struggle with custom action format**: Without fine-tuning on the specific DSL (CLICK/TYPE/DONE), API models have high format-error rates
+
+ ### 6.4 Set-of-Marks (SoM) Mode: 100% Accuracy on Synthetic Benchmarks
+
+ With **Set-of-Marks** visual prompting, fine-tuned Qwen3-VL-2B achieves **100% accuracy** on both login (6-step) and registration (12-step) synthetic scenarios. Note that these are controlled, toy benchmarks with a small number of UI elements:

  | Scenario | Steps | Elements | Action Acc | Element Acc | Episode Success |
  |----------|-------|----------|------------|-------------|-----------------|
@@ -452,11 +495,11 @@ For the full SoM investigation report, see [`experiments/qwen_login/SOM_INVESTIG

  ---

- ## 6. Grounding Module
+ ## 7. Grounding Module

  OpenAdapt-ML includes a **grounding module** for locating UI elements on screenshots using natural language descriptions. This enables policy/grounding separation where the policy decides *what* to do and the grounder finds *where* to do it.

- ### 6.1 GeminiGrounder Demo
+ ### 7.1 GeminiGrounder Demo

  The `GeminiGrounder` uses Google's Gemini vision API to locate UI elements:

@@ -475,7 +518,7 @@ if candidates:
  print(f"Found at {best.centroid} with {best.confidence:.0%} confidence")
  ```

- ### 6.2 Set-of-Marks (SoM) Support
+ ### 7.2 Set-of-Marks (SoM) Support

  The grounding module includes functions for extracting all UI elements and overlaying numbered labels (Set-of-Marks):

@@ -497,7 +540,7 @@ This enables element-based actions using indices instead of coordinates:

  See `docs/gemini_grounding.md` for full documentation and `examples/test_gemini_grounding.py` for a complete example.

- ### 6.3 Available Grounders
+ ### 7.3 Available Grounders

  | Grounder | Description | Latency | Use Case |
  |----------|-------------|---------|----------|
@@ -505,7 +548,7 @@ See `docs/gemini_grounding.md` for full documentation and `examples/test_gemini_
  | `OracleGrounder` | Ground-truth bboxes | ~0ms | Evaluation |
  | `DetectorGrounder` | Generic wrapper with backend selection | varies | Flexible |

- ### 6.4 Grounding Evaluation
+ ### 7.4 Grounding Evaluation

  The `openadapt_ml.evals.grounding` module provides metrics for evaluating grounding accuracy:

@@ -523,7 +566,7 @@ print(metrics)

  ---

- ## 7. VLM Adapters
+ ## 8. VLM Adapters

  All VLM backends implement the shared `BaseVLMAdapter` interface in
  `openadapt_ml/models/base_adapter.py` (prepare inputs, compute loss, generate
@@ -542,7 +585,7 @@ Current adapters include:
  For full adapter internals and training-time vs runtime behavior, see
  `docs/design.md` §8.

- ### 7.1 API-backed adapters
+ ### 8.1 API-backed adapters

  To use the API-backed adapter from Python, you can configure API keys via `.env`
  file, environment variables, or pass them explicitly:
@@ -565,12 +608,12 @@ The existing CLI scripts `scripts/demo_policy.py` and

  ---

- ## 8. Runtime Policy & Demos
+ ## 9. Runtime Policy & Demos

  The runtime policy is implemented in `openadapt_ml/runtime/policy.py` as
  `AgentPolicy`.

- ### 8.1 AgentPolicy
+ ### 9.1 AgentPolicy

  `AgentPolicy` is initialized with a VLM adapter (dummy or real). Given an
  SFT-style sample, it:
@@ -581,7 +624,7 @@ SFT-style sample, it:
  - `DONE()`
  3. Returns a structured `Action` plus an optional free-form `thought`.

- ### 8.2 Demo script
+ ### 9.2 Demo script

  `openadapt_ml/scripts/demo_policy.py` demonstrates how to use
  `AgentPolicy` with different backends.
@@ -613,7 +656,7 @@ Each invocation will:

  ---

- ## 9. Testing
+ ## 10. Testing

  Basic tests are provided under `tests/`.

@@ -623,26 +666,26 @@ Run the test suite with:
  uv run pytest
  ```

- In particular:
+ Key test files:

- - `tests/test_training_dummy.py` runs a smoke test over the training loop
- using `DummyAdapter`.
+ - `tests/test_training_dummy.py` - Tests TRL training configuration and sample conversion
+ - `tests/test_local_cli.py` - Tests local training CLI commands (status, check, viewer)

  ---

- ## 10. Training on Real Data
+ ## 11. Training on Real Data

  OpenAdapt-ML supports training on real GUI recordings from two sources:
  1. **openadapt-capture** - New lightweight recording format
  2. **OpenAdapt database** - Original OpenAdapt recordings (legacy)

- ### 10.1 Training on openadapt-capture recordings
+ ### 11.1 Training on openadapt-capture recordings

  [openadapt-capture](https://github.com/OpenAdaptAI/openadapt-capture) is a lightweight GUI recording tool.

  ```bash
  # Install openadapt-capture
- uv pip install openadapt-capture
+ uv add openadapt-capture

  # Record a workflow (e.g., turning off Night Shift)
  openadapt-capture record --output ~/captures/turn-off-nightshift
@@ -656,7 +699,7 @@ uv run python -m openadapt_ml.scripts.train \

  The goal is automatically derived from the directory name (e.g., `"Turn off nightshift"`).

- ### 10.2 Compare human vs AI predictions
+ ### 11.2 Compare human vs AI predictions

  ```bash
  uv run python -m openadapt_ml.scripts.compare \
@@ -673,11 +716,11 @@ The comparison viewer shows:

  ---

- ## 11. Local Training (CUDA / Apple Silicon)
+ ## 12. Local Training (CUDA / Apple Silicon)

  Train locally on your own GPU. Auto-detects CUDA or Apple Silicon (MPS).

- ### 11.1 Quick start
+ ### 12.1 Quick start

  ```bash
  # Train on a capture (auto-detects device and config)
@@ -686,7 +729,7 @@ uv run python -m openadapt_ml.cloud.local train \
  --open # Opens dashboard in browser
  ```

- ### 11.2 Training workflow
+ ### 12.2 Training workflow

  ```bash
  # Check device and training status
@@ -713,11 +756,11 @@ uv run python -m openadapt_ml.cloud.local compare \

  ---

- ## 12. Cloud GPU Training (Lambda Labs)
+ ## 13. Cloud GPU Training (Lambda Labs)

  For faster training on powerful GPUs, use Lambda Labs. Full documentation: [`docs/cloud_gpu_training.md`](docs/cloud_gpu_training.md).

- ### 12.1 Quick start
+ ### 13.1 Quick start

  ```bash
  # Set API key
@@ -729,7 +772,7 @@ uv run python -m openadapt_ml.cloud.lambda_labs train \
  --goal "Turn off Night Shift in System Settings"
  ```

- ### 12.2 Manual workflow
+ ### 13.2 Manual workflow

  ```bash
  # List available instances and pricing
@@ -751,7 +794,7 @@ uv run python -m openadapt_ml.cloud.lambda_labs download <instance_id>
  uv run python -m openadapt_ml.cloud.lambda_labs terminate <instance_id>
  ```

- ### 12.3 Training visualization
+ ### 13.3 Training visualization

  The training process generates:
  - **`training_output/dashboard.html`** - Real-time training dashboard with loss curves
@@ -790,9 +833,203 @@ uv run python -m openadapt_ml.cloud.local serve --port 8080 --open
  - `Home` / `End` - First/last frame
  - `O` - Toggle click overlay

+ **Benchmark Viewer:**
+
+ ![Benchmark Viewer](docs/images/benchmark_viewer.png)
+
+ *View benchmark evaluation results with task-level filtering, success/failure status, and run comparison. Shows Claude achieving 30% on mock evaluation tasks (simulated environment for testing the pipeline - real WAA evaluation requires Windows VMs).*
+
+ ### 13.4 VM Monitoring Dashboard
+
+ For managing Azure VMs used in benchmark evaluations, the `vm monitor` command provides a comprehensive dashboard:
+
+ ```bash
+ # Start VM monitoring dashboard (auto-opens browser)
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor
+
+ # Show detailed information (evaluation history, daily/weekly costs)
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor --details
+ ```
+
+ **VM Monitor Dashboard (Full View):**
+
+ ![VM Monitor Dashboard](docs/screenshots/vm_monitor_dashboard_full.png)
+
+ *The VM monitor dashboard shows: (1) VM status (name, IP, size, state), (2) Current activity (idle/benchmark running), (3) Cost tracking (uptime, hourly rate, total cost), (4) Recent Azure ML jobs from last 7 days, and (6) Dashboard & access URLs.*
+
+ **VM Monitor Dashboard (With --details Flag):**
+
+ ![VM Monitor Dashboard Details](docs/screenshots/vm_monitor_details.png)
+
+ *The --details flag adds: (5) Evaluation history with success rates and agent types, plus extended cost information (daily/weekly projections).*
+
+ **Features:**
+ - **Real-time VM status** - Shows VM size, power state, and IP address
+ - **Activity detection** - Identifies if VM is idle, running benchmarks, or in setup
+ - **Cost tracking** - Displays uptime hours, hourly rate, and total cost for current session
+ - **Azure ML jobs** - Lists recent jobs from last 7 days with status indicators
+ - **Evaluation history** - Shows past benchmark runs with success rates (with --details flag)
+ - **Dashboard & tunnels** - Auto-starts web dashboard and SSH/VNC tunnels for accessing Windows VM
+
+ **Mock mode for testing:**
+ ```bash
+ # Generate screenshots or test dashboard without a VM running
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor --mock
+ ```
+
+ **Auto-shutdown option:**
+ ```bash
+ # Automatically deallocate VM after 2 hours to prevent runaway costs
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor --auto-shutdown-hours 2
+ ```
+
+ ### 13.5 Benchmark Execution Logs
+
+ View benchmark execution progress and logs:
+
+ ```bash
+ # View WAA container status and Docker logs
+ uv run python -m openadapt_ml.benchmarks.cli logs
+
+ # View WAA benchmark execution logs (task progress, agent actions)
+ uv run python -m openadapt_ml.benchmarks.cli logs --run
+
+ # Stream execution logs live
+ uv run python -m openadapt_ml.benchmarks.cli logs --run -f
+
+ # Show last N lines of execution logs
+ uv run python -m openadapt_ml.benchmarks.cli logs --run --tail 100
+
+ # Show benchmark progress and ETA
+ uv run python -m openadapt_ml.benchmarks.cli logs --progress
+ ```
+
+ **Example: Container status (`logs`)**
+ ```
+ WAA Status (20.12.180.208)
+ ============================================================
+
+ [Docker Images]
+ REPOSITORY TAG SIZE
+ waa-auto latest 25.4GB
+ windowsarena/winarena latest 25.8GB
+
+ [Container]
+ Status: Up 49 minutes
+
+ [Storage]
+ Total: 21G
+ Disk image: 64G
+
+ [QEMU VM]
+ Status: Running (PID 1471)
+ CPU: 176%, MEM: 51.6%, Uptime: 47:28
+
+ [WAA Server]
+ "status": "Probe successful"
+ (READY)
+ ```
+
+ **Example: Benchmark execution logs (`logs --run -f`)**
+ ```
+ Run log: /home/azureuser/cli_logs/run_20260128_175507.log
+ ------------------------------------------------------------
+ Streaming log (Ctrl+C to stop)...
+
+ [2026-01-28 23:05:10,303 INFO agent/401-MainProcess] Thinking...
+ [2026-01-28 23:05:17,318 INFO python/62-MainProcess] Updated computer successfully
+ [2026-01-28 23:05:17,318 INFO lib_run_single/56-MainProcess] Step 9: computer.window_manager.switch_to_application("Summer Trip - File Explorer")
+ ```
+
+ **Example: Benchmark progress (`logs --progress`)**
+ ```
+ === WAA Benchmark Progress ===
+
+ Log: /home/azureuser/cli_logs/run_20260128_175507.log
+ Started: 2026-01-28 22:55:14
+ Latest: 2026-01-28 23:28:37
+
+ Tasks completed: 1 / 154
+ Elapsed: 33 minutes
+
+ Avg time per task: ~33 min
+ Remaining tasks: 153
+ Estimated remaining: ~84h 9m
+
+ Progress: 0% [1/154]
+ ```
+
+ **Other useful commands:**
+ ```bash
+ # Check WAA server status (probe endpoint)
+ uv run python -m openadapt_ml.benchmarks.cli probe
+
+ # Check VM/Azure status
+ uv run python -m openadapt_ml.benchmarks.cli status
+
+ # Download benchmark results from VM
+ uv run python -m openadapt_ml.benchmarks.cli download
+
+ # Analyze downloaded results
+ uv run python -m openadapt_ml.benchmarks.cli analyze
+ ```
+
+ **Running benchmarks:**
+ ```bash
+ # Run full benchmark (154 tasks)
+ uv run python -m openadapt_ml.benchmarks.cli run --num-tasks 154
+
+ # Run specific domain
+ uv run python -m openadapt_ml.benchmarks.cli run --domain notepad --num-tasks 5
+
+ # Run single task
+ uv run python -m openadapt_ml.benchmarks.cli run --task notepad_1
+ ```
+
+ For complete VM management commands and Azure setup instructions, see [`CLAUDE.md`](CLAUDE.md) and [`docs/azure_waa_setup.md`](docs/azure_waa_setup.md).
+
+ ### 13.6 Screenshot Capture Tool
+
+ Capture screenshots of dashboards and VMs for documentation and PR purposes:
+
+ ```bash
+ # Capture all available targets
+ uv run python -m openadapt_ml.benchmarks.cli screenshot
+
+ # List available targets
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --list
+
+ # Capture specific targets
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --target terminal
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --target azure-ops --target vnc
+
+ # Custom output directory
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --output /path/to/screenshots
+
+ # Without timestamp in filename
+ uv run python -m openadapt_ml.benchmarks.cli screenshot --target terminal --no-timestamp
+ ```
+
+ **Available targets:**
+
+ | Target | Description |
+ |--------|-------------|
+ | `azure-ops` | Azure ops dashboard (localhost:8765) |
+ | `vnc` | VNC viewer (localhost:8006) - Windows VM |
+ | `terminal` | VM monitor terminal output (mock mode) |
+ | `terminal-live` | VM monitor terminal output (live, requires running VM) |
+ | `training` | Training dashboard (localhost:8080) |
+ | `vm-screen` | Windows VM screen capture via QEMU |
+
+ **Notes:**
+ - Terminal screenshots use PIL to render terminal output as PNG images
+ - Web page screenshots work best with playwright installed (`uv add playwright && playwright install chromium`)
+ - On macOS, interactive capture using `screencapture` is available as a fallback
+ - Screenshots are saved to `docs/screenshots/` by default with timestamps
+

  ---

- ## 13. Limitations & Notes
+ ## 14. Limitations & Notes

  - **Apple Silicon / bitsandbytes**:
  - Example configs are sized for CPU / Apple Silicon development runs; see
@@ -805,12 +1042,18 @@ uv run python -m openadapt_ml.cloud.local serve --port 8080 --open
  - **Evaluation**:
  - v1 focuses on smoke tests and qualitative behavior on synthetic data.
  More formal evaluation scripts and metrics are planned.
+ - **Windows Agent Arena (WAA) on Azure**:
+ - WAA requires nested virtualization (Windows VM inside Docker via QEMU)
+ - Azure ML managed compute does not support nested virtualization
+ - For real WAA evaluation, use dedicated VMs with Dv3/Ev3 series or run locally
+ - Mock evaluation (`test-mock`) validates the pipeline without Windows VMs
+ - See `CLAUDE.md` for detailed workarounds and infrastructure setup

  For deeper architectural details, see [`docs/design.md`](docs/design.md).

  ---

- ## 14. Roadmap
+ ## 15. Roadmap

  For the up-to-date, prioritized roadmap (including concrete implementation
  targets and agent-executable acceptance criteria), see