openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. openadapt_ml/benchmarks/__init__.py +8 -0
  2. openadapt_ml/benchmarks/agent.py +90 -11
  3. openadapt_ml/benchmarks/azure.py +35 -6
  4. openadapt_ml/benchmarks/cli.py +4449 -201
  5. openadapt_ml/benchmarks/live_tracker.py +180 -0
  6. openadapt_ml/benchmarks/runner.py +41 -4
  7. openadapt_ml/benchmarks/viewer.py +1219 -0
  8. openadapt_ml/benchmarks/vm_monitor.py +610 -0
  9. openadapt_ml/benchmarks/waa.py +61 -4
  10. openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  11. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  12. openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  13. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  14. openadapt_ml/benchmarks/waa_live.py +619 -0
  15. openadapt_ml/cloud/local.py +1555 -1
  16. openadapt_ml/cloud/ssh_tunnel.py +553 -0
  17. openadapt_ml/datasets/next_action.py +87 -68
  18. openadapt_ml/evals/grounding.py +26 -8
  19. openadapt_ml/evals/trajectory_matching.py +84 -36
  20. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  21. openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  22. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  23. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  24. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  25. openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  26. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  27. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  28. openadapt_ml/experiments/waa_demo/runner.py +717 -0
  29. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  30. openadapt_ml/export/__init__.py +9 -0
  31. openadapt_ml/export/__main__.py +6 -0
  32. openadapt_ml/export/cli.py +89 -0
  33. openadapt_ml/export/parquet.py +265 -0
  34. openadapt_ml/ingest/__init__.py +3 -4
  35. openadapt_ml/ingest/capture.py +89 -81
  36. openadapt_ml/ingest/loader.py +116 -68
  37. openadapt_ml/ingest/synthetic.py +221 -159
  38. openadapt_ml/retrieval/README.md +226 -0
  39. openadapt_ml/retrieval/USAGE.md +391 -0
  40. openadapt_ml/retrieval/__init__.py +91 -0
  41. openadapt_ml/retrieval/demo_retriever.py +817 -0
  42. openadapt_ml/retrieval/embeddings.py +629 -0
  43. openadapt_ml/retrieval/index.py +194 -0
  44. openadapt_ml/retrieval/retriever.py +160 -0
  45. openadapt_ml/runtime/policy.py +10 -10
  46. openadapt_ml/schema/__init__.py +104 -0
  47. openadapt_ml/schema/converters.py +541 -0
  48. openadapt_ml/schema/episode.py +457 -0
  49. openadapt_ml/scripts/compare.py +26 -16
  50. openadapt_ml/scripts/eval_policy.py +4 -5
  51. openadapt_ml/scripts/prepare_synthetic.py +14 -17
  52. openadapt_ml/scripts/train.py +81 -70
  53. openadapt_ml/training/benchmark_viewer.py +3225 -0
  54. openadapt_ml/training/trainer.py +120 -363
  55. openadapt_ml/training/trl_trainer.py +354 -0
  56. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
  57. openadapt_ml-0.2.0.dist-info/RECORD +86 -0
  58. openadapt_ml/schemas/__init__.py +0 -53
  59. openadapt_ml/schemas/sessions.py +0 -122
  60. openadapt_ml/schemas/validation.py +0 -252
  61. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  62. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
  63. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
{openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: openadapt-ml
- Version: 0.1.0
+ Version: 0.2.0
  Summary: Model-agnostic, domain-agnostic ML engine for GUI automation agents
  Project-URL: Homepage, https://github.com/OpenAdaptAI/openadapt-ml
  Project-URL: Repository, https://github.com/OpenAdaptAI/openadapt-ml
@@ -43,15 +43,29 @@ Requires-Dist: pytest>=9.0.0; extra == 'dev'
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
  Provides-Extra: lambda-labs
  Requires-Dist: requests>=2.28.0; extra == 'lambda-labs'
+ Provides-Extra: parquet
+ Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
+ Provides-Extra: training
+ Requires-Dist: datasets>=2.18.0; extra == 'training'
+ Requires-Dist: trl>=0.12.0; extra == 'training'
  Description-Content-Type: text/markdown

  # OpenAdapt-ML

+ [![PyPI version](https://badge.fury.io/py/openadapt-ml.svg)](https://badge.fury.io/py/openadapt-ml)
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
  [![Python Version](https://img.shields.io/badge/python-3.12-blue)](https://www.python.org/)

  OpenAdapt-ML is a **model-agnostic, domain-agnostic ML engine** for GUI
- automation agents.
+ automation agents. It sits above **TRL + Unsloth** (which we use directly for training performance) and provides the GUI-specific layer:
+
+ - **Episode semantics**: Step/action/observation alignment, screenshot-action coupling, termination handling
+ - **Demo-conditioned inference**: Retrieval-augmented prompting (validated: 33% → 100% first-action accuracy)
+ - **Benchmark adapters**: WAA today, OSWorld/WebArena planned
+ - **VLM adapters**: Updated with leading GUI-agent SOTA open-source models
+ - **Training pipeline**: TRL + Unsloth integration for 2x faster training with 50% less VRAM
+
+ OpenAdapt-ML is **not** a training framework, optimizer, hardware orchestrator, or experiment manager. We use TRL/Unsloth, Lambda Labs/Azure, and W&B/MLflow for those.

  It provides:

@@ -59,24 +73,41 @@ It provides:
  - **Synthetic semantic UI generation** for bootstrapping datasets.
  - **Dataset builders** that turn episodes into next-action SFT samples.
  - **VLM adapters** (Qwen3-VL, Qwen2.5-VL) using Hugging Face + PEFT.
- - A minimal **supervised training loop** for fine-tuning.
+ - **SFT training via TRL** with Unsloth optimizations for efficient fine-tuning.
  - A simple **runtime policy** API that predicts the next GUI action.

  The design is described in detail in [`docs/design.md`](docs/design.md).

  ---

- ## 1. Quickstart
+ ## 1. Installation

- ### 1.1 Install dependencies
+ ### 1.1 From PyPI (recommended)

- From the repository root:
+ ```bash
+ # Install the package
+ uv add openadapt-ml
+
+ # For training with TRL (recommended for fine-tuning)
+ uv add openadapt-ml[training]
+
+ # For API-backed VLMs (Claude, GPT)
+ uv add openadapt-ml[api]
+ ```
+
+ ### 1.2 From source (development)

  ```bash
+ git clone https://github.com/OpenAdaptAI/openadapt-ml.git
+ cd openadapt-ml
  uv sync
  ```

- ### 1.2 Run a small demo policy
+ ---
+
+ ## 2. Quickstart
+
+ ### 2.1 Run a small demo policy

  Run a fast, model-free smoke test:

@@ -84,7 +115,7 @@ Run a fast, model-free smoke test:
  uv run python -m openadapt_ml.scripts.demo_policy --backend dummy
  ```

- ### 1.3 Run the synthetic login benchmark (end-to-end)
+ ### 2.2 Run the synthetic login benchmark (end-to-end)

  On a machine with a suitable GPU, you can reproduce the Qwen3-VL synthetic
  login benchmark (train → eval base/FT → plot) with a single command:
@@ -138,7 +169,7 @@ For complete documentation including training setup, evaluation metrics, SoM mod

  ---

- ## 2. Repository Structure
+ ## 3. Repository Structure

  Key modules:

@@ -159,9 +190,9 @@ Key modules:
  - `openadapt_ml/models/dummy_adapter.py`
  - Tiny fake adapter used to validate training and runtime flows without
  loading a real VLM.
- - `openadapt_ml/training/trainer.py`
- - Minimal supervised training loop (`train_supervised`) with gradient
- accumulation and logging.
+ - `openadapt_ml/training/trl_trainer.py`
+ - TRL-based SFT training (`train_with_trl`) with Unsloth optimizations
+ for 2x faster training and 50% less VRAM.
  - `openadapt_ml/runtime/policy.py`
  - `AgentPolicy` that formats inputs for a VLM and parses textual actions
  like `CLICK(x=..., y=...)` and `DONE()` into structured `Action`s.
@@ -184,12 +215,12 @@

  ---

- ## 3. Environment Setup
+ ## 4. Environment Setup

  OpenAdapt-ML targets **Python 3.12** and uses [`uv`](https://github.com/astral-sh/uv)
  for dependency management.

- ### 2.1 Install and sync
+ ### 4.1 Install and sync

  From the repository root:

@@ -202,7 +233,7 @@ uv sync
  This will create a virtual environment (e.g. `.venv/`) and install all
  packages declared in `pyproject.toml`.

- ### 2.2 Working inside the environment
+ ### 4.2 Working inside the environment

  Use `uv run` to execute Python modules and scripts with the synced
  environment:
@@ -215,12 +246,12 @@ You can also run `pytest` or other tools via `uv run`.

  ---

- ## 4. Synthetic Data & Datasets
+ ## 5. Synthetic Data & Datasets

  The v1 pipeline is validated on **synthetic, semantic UIs**, starting with a
  simple login flow.

- ### 3.1 Synthetic scenarios
+ ### 5.1 Synthetic scenarios

  OpenAdapt-ML includes synthetic UI generators for structured GUI automation benchmarks.
  Currently two scenarios are supported:
@@ -255,7 +286,7 @@ A more complex registration form with first name, last name, email, password, co
  | Episode Success Rate | **100%** |
  | Episodes / Steps | 32 / 384 |

- ### 3.2 Generating synthetic data
+ ### 5.2 Generating synthetic data

  Synthetic data is generated on the fly by `generate_synthetic_sessions` in
  `openadapt_ml/ingest/synthetic.py` and used internally by the training
@@ -286,7 +317,7 @@ Each session contains episodes with:
  - An observation (screenshot path).
  - An action (e.g. `CLICK`, `TYPE`, `DONE`).

- ### 3.3 Next-action SFT samples
+ ### 5.3 Next-action SFT samples

  Episodes are converted into SFT-style samples by
  `build_next_action_sft_samples` in `openadapt_ml/datasets/next_action.py`.
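For orientation, the two helpers named in the hunks above can be chained to go from synthetic sessions to training samples. A minimal sketch, assuming `generate_synthetic_sessions` takes no required arguments and that sessions expose an `episodes` list (neither signature is shown in this diff):

```python
# Sketch only: the function names come from the README above; the no-arg call and
# the .episodes attribute are assumptions, not the confirmed 0.2.0 API.
from openadapt_ml.ingest.synthetic import generate_synthetic_sessions
from openadapt_ml.datasets.next_action import build_next_action_sft_samples

sessions = generate_synthetic_sessions()                  # synthetic login/registration sessions
episodes = [ep for s in sessions for ep in s.episodes]    # assumes Session exposes .episodes
samples = build_next_action_sft_samples(episodes)         # one next-action SFT sample per step
print(f"built {len(samples)} samples from {len(episodes)} episodes")
```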
@@ -312,21 +343,20 @@ and its invariants, see `docs/design.md` §7.4.

  ---

- ## 5. Training
+ ## 6. Training

- Training is driven by `openadapt_ml/scripts/train.py` and YAML configs under
- `configs/`.
+ Training uses **TRL (Transformer Reinforcement Learning)** with **Unsloth** optimizations
+ for efficient VLM fine-tuning. This provides 2x faster training with 50% less VRAM compared
+ to standard approaches.

- The training script:
+ The training pipeline:

- 1. Loads a config file (YAML).
- 2. Generates synthetic sessions.
- 3. Flattens to episodes and builds SFT samples.
- 4. Wraps them in a `NextActionDataset`.
- 5. Instantiates a VLM adapter (e.g. `QwenVLAdapter`).
- 6. Runs `train_supervised` over the dataset.
+ 1. Loads episodes from synthetic data or real recordings.
+ 2. Converts to TRL-compatible SFT format with images and chat messages.
+ 3. Fine-tunes using SFTTrainer with LoRA adapters.
+ 4. Generates checkpoints and training logs for visualization.

- ### 4.1 Qwen3-VL synthetic training
+ ### 6.1 Qwen3-VL synthetic training

  Config: `configs/qwen3vl_synthetic.yaml`

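The rewritten training section above replaces the config-driven `train_supervised` loop with a TRL-based pipeline. A rough illustration of step 2 (converting one step into a TRL-compatible chat sample) and of calling the `train_with_trl` entry point that 0.2.0 adds; the exact argument names are assumptions:

```python
# Illustrative sample layout in the image + chat-messages style that TRL's SFTTrainer
# accepts for VLM fine-tuning. train_with_trl exists in openadapt_ml/training/trl_trainer.py
# per this diff, but the keyword arguments shown here are assumptions.
from openadapt_ml.training.trl_trainer import train_with_trl

sample = {
    "images": ["episode_0/step_003.png"],          # screenshot observed at this step
    "messages": [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Goal: log in. What is the next action?"},
        ]},
        {"role": "assistant", "content": [
            {"type": "text", "text": "CLICK(x=0.42, y=0.31)"},
        ]},
    ],
}

train_with_trl(samples=[sample], output_dir="training_output")  # hypothetical keyword names
```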
@@ -353,7 +383,7 @@ This will:
  - Run a single-epoch supervised fine-tuning loop.
  - Print loss values as training progresses.

- ### 4.2 Qwen2.5-VL synthetic training
+ ### 6.2 Qwen2.5-VL synthetic training

  Config: `configs/qwen2_5vl_synthetic.yaml`

@@ -378,7 +408,7 @@ format expected by the Qwen2.5-VL processor.
  > Note: Both configs are sized for **small synthetic smoke runs**, not
  > large-scale production training.

- ### 4.3 Qwen3-VL synthetic login benchmark (hero example)
+ ### 6.3 Qwen3-VL synthetic login benchmark (hero example)

  OpenAdapt-ML ships a **synthetic login** benchmark backed by Qwen3-VL,
  used to compare **base vs LoRA-fine-tuned** models on a hardened synthetic
@@ -413,7 +443,7 @@ It exposes step-level performance metrics, which let us visually answer the ques
  3. **Precision matters**: Fine-tuned models have excellent click precision (85-100% hit rate, <0.05 coord error) while API models struggle with the action format
  4. **Size vs specialization**: The fine-tuned 2B model outperforms the general-purpose Claude Sonnet 4.5, showing that domain-specific fine-tuning trumps raw model size

- ### 4.4 Set-of-Marks (SoM) Mode: 100% Accuracy
+ ### 6.4 Set-of-Marks (SoM) Mode: 100% Accuracy

  With **Set-of-Marks** visual prompting, fine-tuned Qwen3-VL-2B achieves **100% accuracy** on both login (6-step) and registration (12-step) scenarios:

@@ -452,11 +482,11 @@ For the full SoM investigation report, see [`experiments/qwen_login/SOM_INVESTIG

  ---

- ## 6. Grounding Module
+ ## 7. Grounding Module

  OpenAdapt-ML includes a **grounding module** for locating UI elements on screenshots using natural language descriptions. This enables policy/grounding separation where the policy decides *what* to do and the grounder finds *where* to do it.

- ### 6.1 GeminiGrounder Demo
+ ### 7.1 GeminiGrounder Demo

  The `GeminiGrounder` uses Google's Gemini vision API to locate UI elements:

@@ -475,7 +505,7 @@ if candidates:
  print(f"Found at {best.centroid} with {best.confidence:.0%} confidence")
  ```

- ### 6.2 Set-of-Marks (SoM) Support
+ ### 7.2 Set-of-Marks (SoM) Support

  The grounding module includes functions for extracting all UI elements and overlaying numbered labels (Set-of-Marks):

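The grounding hunks above only show the tail of the `GeminiGrounder` example (the `if candidates:` block), since the rest falls outside the diff context. A hedged reconstruction of how that fragment is typically reached; the import path, the `locate` method, and its arguments are guesses, not the documented API:

```python
# Hedged sketch: GeminiGrounder and the candidate attributes (.centroid, .confidence)
# appear in the README above; the import path and the locate(...) call are assumptions.
from openadapt_ml.grounding import GeminiGrounder  # assumed import path

grounder = GeminiGrounder()
candidates = grounder.locate("the Submit button", "screenshot.png")  # hypothetical method

if candidates:
    best = max(candidates, key=lambda c: c.confidence)
    print(f"Found at {best.centroid} with {best.confidence:.0%} confidence")
```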
@@ -497,7 +527,7 @@ This enables element-based actions using indices instead of coordinates:

  See `docs/gemini_grounding.md` for full documentation and `examples/test_gemini_grounding.py` for a complete example.

- ### 6.3 Available Grounders
+ ### 7.3 Available Grounders

  | Grounder | Description | Latency | Use Case |
  |----------|-------------|---------|----------|
@@ -505,7 +535,7 @@ See `docs/gemini_grounding.md` for full documentation and `examples/test_gemini_
  | `OracleGrounder` | Ground-truth bboxes | ~0ms | Evaluation |
  | `DetectorGrounder` | Generic wrapper with backend selection | varies | Flexible |

- ### 6.4 Grounding Evaluation
+ ### 7.4 Grounding Evaluation

  The `openadapt_ml.evals.grounding` module provides metrics for evaluating grounding accuracy:

@@ -523,7 +553,7 @@ print(metrics)

  ---

- ## 7. VLM Adapters
+ ## 8. VLM Adapters

  All VLM backends implement the shared `BaseVLMAdapter` interface in
  `openadapt_ml/models/base_adapter.py` (prepare inputs, compute loss, generate
@@ -542,7 +572,7 @@ Current adapters include:
  For full adapter internals and training-time vs runtime behavior, see
  `docs/design.md` §8.

- ### 7.1 API-backed adapters
+ ### 8.1 API-backed adapters

  To use the API-backed adapter from Python, you can configure API keys via `.env`
  file, environment variables, or pass them explicitly:
@@ -565,12 +595,12 @@ The existing CLI scripts `scripts/demo_policy.py` and

  ---

- ## 8. Runtime Policy & Demos
+ ## 9. Runtime Policy & Demos

  The runtime policy is implemented in `openadapt_ml/runtime/policy.py` as
  `AgentPolicy`.

- ### 8.1 AgentPolicy
+ ### 9.1 AgentPolicy

  `AgentPolicy` is initialized with a VLM adapter (dummy or real). Given an
  SFT-style sample, it:
@@ -581,7 +611,7 @@ SFT-style sample, it:
  - `DONE()`
  3. Returns a structured `Action` plus an optional free-form `thought`.

- ### 8.2 Demo script
+ ### 9.2 Demo script

  `openadapt_ml/scripts/demo_policy.py` demonstrates how to use
  `AgentPolicy` with different backends.
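The `AgentPolicy` description above centers on parsing free-form model output such as `CLICK(x=..., y=...)` or `DONE()` into structured actions. The following is not the library's implementation, only a self-contained sketch of that kind of parsing using the action grammar documented in the README:

```python
# Standalone sketch of the action grammar described above; AgentPolicy's real parser
# lives in openadapt_ml/runtime/policy.py and may differ in detail.
import re

def parse_action(text: str) -> dict:
    """Turn 'CLICK(x=..., y=...)' / 'DONE()' style model output into a structured dict."""
    click = re.search(r"CLICK\(x=([0-9.]+),\s*y=([0-9.]+)\)", text)
    if click:
        return {"type": "click", "x": float(click.group(1)), "y": float(click.group(2))}
    if "DONE()" in text:
        return {"type": "done"}
    raise ValueError(f"unrecognized action: {text!r}")

print(parse_action("I will press the login button. CLICK(x=0.42, y=0.31)"))
# -> {'type': 'click', 'x': 0.42, 'y': 0.31}
```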
@@ -613,7 +643,7 @@ Each invocation will:

  ---

- ## 9. Testing
+ ## 10. Testing

  Basic tests are provided under `tests/`.

@@ -623,26 +653,26 @@ Run the test suite with:
  uv run pytest
  ```

- In particular:
+ Key test files:

- - `tests/test_training_dummy.py` runs a smoke test over the training loop
- using `DummyAdapter`.
+ - `tests/test_training_dummy.py` - Tests TRL training configuration and sample conversion
+ - `tests/test_local_cli.py` - Tests local training CLI commands (status, check, viewer)

  ---

- ## 10. Training on Real Data
+ ## 11. Training on Real Data

  OpenAdapt-ML supports training on real GUI recordings from two sources:
  1. **openadapt-capture** - New lightweight recording format
  2. **OpenAdapt database** - Original OpenAdapt recordings (legacy)

- ### 10.1 Training on openadapt-capture recordings
+ ### 11.1 Training on openadapt-capture recordings

  [openadapt-capture](https://github.com/OpenAdaptAI/openadapt-capture) is a lightweight GUI recording tool.

  ```bash
  # Install openadapt-capture
- uv pip install openadapt-capture
+ uv add openadapt-capture

  # Record a workflow (e.g., turning off Night Shift)
  openadapt-capture record --output ~/captures/turn-off-nightshift
@@ -656,7 +686,7 @@ uv run python -m openadapt_ml.scripts.train \

  The goal is automatically derived from the directory name (e.g., `"Turn off nightshift"`).

- ### 10.2 Compare human vs AI predictions
+ ### 11.2 Compare human vs AI predictions

  ```bash
  uv run python -m openadapt_ml.scripts.compare \
@@ -673,11 +703,11 @@ The comparison viewer shows:

  ---

- ## 11. Local Training (CUDA / Apple Silicon)
+ ## 12. Local Training (CUDA / Apple Silicon)

  Train locally on your own GPU. Auto-detects CUDA or Apple Silicon (MPS).

- ### 11.1 Quick start
+ ### 12.1 Quick start

  ```bash
  # Train on a capture (auto-detects device and config)
@@ -686,7 +716,7 @@ uv run python -m openadapt_ml.cloud.local train \
  --open # Opens dashboard in browser
  ```

- ### 11.2 Training workflow
+ ### 12.2 Training workflow

  ```bash
  # Check device and training status
@@ -713,11 +743,11 @@ uv run python -m openadapt_ml.cloud.local compare \

  ---

- ## 12. Cloud GPU Training (Lambda Labs)
+ ## 13. Cloud GPU Training (Lambda Labs)

  For faster training on powerful GPUs, use Lambda Labs. Full documentation: [`docs/cloud_gpu_training.md`](docs/cloud_gpu_training.md).

- ### 12.1 Quick start
+ ### 13.1 Quick start

  ```bash
  # Set API key
@@ -729,7 +759,7 @@
  uv run python -m openadapt_ml.cloud.lambda_labs train \
  --goal "Turn off Night Shift in System Settings"
  ```

- ### 12.2 Manual workflow
+ ### 13.2 Manual workflow

  ```bash
  # List available instances and pricing
@@ -751,7 +781,7 @@ uv run python -m openadapt_ml.cloud.lambda_labs download <instance_id>
  uv run python -m openadapt_ml.cloud.lambda_labs terminate <instance_id>
  ```

- ### 12.3 Training visualization
+ ### 13.3 Training visualization

  The training process generates:
  - **`training_output/dashboard.html`** - Real-time training dashboard with loss curves
@@ -790,9 +820,15 @@ uv run python -m openadapt_ml.cloud.local serve --port 8080 --open
  - `Home` / `End` - First/last frame
  - `O` - Toggle click overlay

+ **Benchmark Viewer:**
+
+ ![Benchmark Viewer](docs/images/benchmark_viewer.png)
+
+ *View benchmark evaluation results with task-level filtering, success/failure status, and run comparison. Shows Claude achieving 30% on mock evaluation tasks (simulated environment for testing the pipeline - real WAA evaluation requires Windows VMs).*
+
  ---

- ## 13. Limitations & Notes
+ ## 14. Limitations & Notes

  - **Apple Silicon / bitsandbytes**:
  - Example configs are sized for CPU / Apple Silicon development runs; see
@@ -805,12 +841,18 @@ uv run python -m openadapt_ml.cloud.local serve --port 8080 --open
  - **Evaluation**:
  - v1 focuses on smoke tests and qualitative behavior on synthetic data.
  More formal evaluation scripts and metrics are planned.
+ - **Windows Agent Arena (WAA) on Azure**:
+ - WAA requires nested virtualization (Windows VM inside Docker via QEMU)
+ - Azure ML managed compute does not support nested virtualization
+ - For real WAA evaluation, use dedicated VMs with Dv3/Ev3 series or run locally
+ - Mock evaluation (`test-mock`) validates the pipeline without Windows VMs
+ - See `CLAUDE.md` for detailed workarounds and infrastructure setup

  For deeper architectural details, see [`docs/design.md`](docs/design.md).

  ---

- ## 14. Roadmap
+ ## 15. Roadmap

  For the up-to-date, prioritized roadmap (including concrete implementation
  targets and agent-executable acceptance criteria), see
openadapt_ml-0.2.0.dist-info/RECORD

@@ -0,0 +1,86 @@
1
+ openadapt_ml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ openadapt_ml/config.py,sha256=eH5WTKRPkkidjkNb25Wn_dUAizUQTsVPMYLDq_ekPJQ,1865
3
+ openadapt_ml/benchmarks/__init__.py,sha256=RRAoXm_al-DgStCKCycnC60_iZo52mLmUu5nN5IPfxY,3855
4
+ openadapt_ml/benchmarks/agent.py,sha256=kuq-dWqWJH8ogjklAFdnbG_6zVzHtd6Ab2rPxRjvHtU,29724
5
+ openadapt_ml/benchmarks/azure.py,sha256=AI5ZdsNxmQ197ymOj68lTLAtl-eByuguhACqv7UZhIg,28145
6
+ openadapt_ml/benchmarks/base.py,sha256=d7T_zMlMPlN0beDWkpzOvOHYQO6QnsePLQ45iKbi66Y,11667
7
+ openadapt_ml/benchmarks/cli.py,sha256=ElBZDcmDOpA0uAoHYKRx3fLAJ5v8dH4R9SxfhmwTZWw,204898
8
+ openadapt_ml/benchmarks/data_collection.py,sha256=EYOsYnFQifF3MXD0TZxznd-HbODiovnGDtxGjSMpO-Y,14652
9
+ openadapt_ml/benchmarks/live_tracker.py,sha256=1SukwgRYbAzSMzHOhPZOSgZ58L44CYtX-KOfAzyAJZw,5130
10
+ openadapt_ml/benchmarks/runner.py,sha256=a52GasPKEnNgevxilQAI1z8FnzwWddLiDY60NYWxpZk,13616
11
+ openadapt_ml/benchmarks/viewer.py,sha256=Mjbt67gfnSw29rNhU4JYL0cdAc-UZ3QpxmG6GhKnD68,41111
12
+ openadapt_ml/benchmarks/vm_monitor.py,sha256=NKhLF66hcBqxIDHk-iPKn31AfzpXyE-9IkBTYX2k70I,20330
13
+ openadapt_ml/benchmarks/waa.py,sha256=V_RtDfOLW2jD8MN-M9YJYvt67NQCXrUkVxOtcNYVLHo,27008
14
+ openadapt_ml/benchmarks/waa_live.py,sha256=U92KaBWCpJmwqSURJeG5biEolTyBCvyCikWRz05IV-A,22982
15
+ openadapt_ml/benchmarks/waa_deploy/Dockerfile,sha256=m9Nsei1v_m0jF1HWc7yCMz-_wD3rHHv9H9HBC2lFVSE,10969
16
+ openadapt_ml/benchmarks/waa_deploy/__init__.py,sha256=KV71HrrgETytfY0i4vFSi-yM0KjoQP2hd9Bl03cZ9yc,320
17
+ openadapt_ml/benchmarks/waa_deploy/api_agent.py,sha256=dbazhRxc554901LFiVuj6sMmOgoHtTKl8XIAiIJrFWU,20024
18
+ openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat,sha256=YxgrSWh76zLijlpxEpulWf9To2JtJ-yR42lr2WyTXiY,1496
19
+ openadapt_ml/cloud/__init__.py,sha256=XYrvxivJeZ8qYnuGod5kodMlm3iT2OK2GAApO3CNB0c,133
20
+ openadapt_ml/cloud/azure_inference.py,sha256=lqkG86Dn6x2Rp7Y-X9tK_8mn0QfRvaDQNRHg-kJkQyc,15736
21
+ openadapt_ml/cloud/lambda_labs.py,sha256=jZ700gNR8zxO2qGa7yqByKXgb4-18zN30YQdDZ3Fbuk,102206
22
+ openadapt_ml/cloud/local.py,sha256=AX7gxgkwGwOJ1ogp-rQefKDyK3MDNMWA9W2qkwaV66g,104782
23
+ openadapt_ml/cloud/ssh_tunnel.py,sha256=q4VEKT4cEU0-hAP1t9fL2jRur0FMRxg8ZV6j1IQV45k,20824
24
+ openadapt_ml/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
+ openadapt_ml/datasets/next_action.py,sha256=eNNwwdRE-pJa1DCgEIr9eTipZ7pDAEDVxkc9UPObJLs,21219
26
+ openadapt_ml/evals/__init__.py,sha256=Kx7bSvPHwmoGVI3q1wS_lC17W2S32YHj0459JRqu6Ow,573
27
+ openadapt_ml/evals/grounding.py,sha256=un4bWrhqwrJ4O9QqF40NfnhKeOMntGdCJcwXz6ZzNJ4,8447
28
+ openadapt_ml/evals/plot_eval_metrics.py,sha256=gsikQ3MSUY7Pw61D8lB286q0MPpBL9E05UFHPrawViw,5237
29
+ openadapt_ml/evals/trajectory_matching.py,sha256=p40wDDlD0AyiY6vsgkcp6FBDISyKKQKycsrEz7uGF8Y,20616
30
+ openadapt_ml/experiments/demo_prompt/__init__.py,sha256=dwS0bI53jXMzHE-DPhb_mhmPdoqSZRIcNbV79wt8KPM,454
31
+ openadapt_ml/experiments/demo_prompt/format_demo.py,sha256=rIXcjiYhI2YuLDUjY4iTxPp4ZmtvKibQQYTCfEn4lZs,6553
32
+ openadapt_ml/experiments/demo_prompt/run_experiment.py,sha256=uchhadnqxauBXxlalTh7wXPLOOXk7NBY1mwXiN7rpHI,16309
33
+ openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json,sha256=08oryOF126toTQDN9xciodavvfsaWNnXuBs0aULwpfI,5326
34
+ openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json,sha256=u03VgYTQia_HzilzNjxdGLpUSdbo4SzmHqI-GXlvurg,26915
35
+ openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json,sha256=FA1JgXXLor6on3lHlfJdNSuKzBca30ggH8IWSJEmmfA,11517
36
+ openadapt_ml/experiments/waa_demo/__init__.py,sha256=9M8iLxO9GWAw-FIB-0tzsqaweLcO5EVP1Sc5BoK16iU,363
37
+ openadapt_ml/experiments/waa_demo/demos.py,sha256=UwO0EYy8wUEggaBaI_cXuYe_jwSB1hx3ZtPf-z9bhjc,13796
38
+ openadapt_ml/experiments/waa_demo/runner.py,sha256=OxgQhZIqhYeGDYmAcQLEsFh3B053rnuPL0ZEIoXz0bI,24327
39
+ openadapt_ml/experiments/waa_demo/tasks.py,sha256=jw1QwbOt8xmWBW2lmBWcJzKBXssjv_e0j49MlC2rVJY,5425
40
+ openadapt_ml/export/__init__.py,sha256=mKehKHOio4jGcK-3r0-pb446GdKMPs0O9hAu4S0_R7s,266
41
+ openadapt_ml/export/__main__.py,sha256=0ObtWcdzf6p7gPwhNlCKpNm2FIhmusdYNkuk8tyt77U,149
42
+ openadapt_ml/export/cli.py,sha256=goTKNq9cOO9wsdNluLMH_-f9kdWShH3FPP8sCZ6KaPI,2331
43
+ openadapt_ml/export/parquet.py,sha256=1BGHJKJc302trn64mUbuhxYMBPNvH80QW5f8GM63njk,9664
44
+ openadapt_ml/grounding/__init__.py,sha256=uMvcALFRXmKD6PHhqLZ24Y6zhRUs46_PnWYqiqJP5cM,1412
45
+ openadapt_ml/grounding/base.py,sha256=mnjT25nxltZCD0VBzgIgj2kuCcB4sgXBN97MBaW5P6c,7688
46
+ openadapt_ml/grounding/detector.py,sha256=z-6Y_jnUNnhviUjKv6okjJ0r13DmBiNZMzNJo0rTlBY,19786
47
+ openadapt_ml/ingest/__init__.py,sha256=7YASU-pOXtjalcRJ3WCbGuEWne0aVZtypsfcEmp-yFE,1437
48
+ openadapt_ml/ingest/capture.py,sha256=SR22U6M7hU_QAsCYAG4JaxGFHpLZg96k_SJ-tkXAgbs,10218
49
+ openadapt_ml/ingest/loader.py,sha256=PPBFMA9d7oc2bjXgrdPCZQBnv3MvoDFQNJvXeI90_j8,9865
50
+ openadapt_ml/ingest/synthetic.py,sha256=DSUyqbLxHtJjELitTP2C_3bv3-E0UW7P-RFAO9W8XFs,39302
51
+ openadapt_ml/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
+ openadapt_ml/models/api_adapter.py,sha256=9EfQrXuFYIT-ea-wmGzJlM4thgVVDIRZnqqS_gL-PbU,6541
53
+ openadapt_ml/models/base_adapter.py,sha256=AG40BhdDORwUNYYg3DM1wsErX57aEJrkK0tyn0tEKhI,2050
54
+ openadapt_ml/models/dummy_adapter.py,sha256=h4Zu-rjWgtG1r8jRtcsrX-FZm8iImrhrTQ7TsLfjE8A,1581
55
+ openadapt_ml/models/qwen_vl.py,sha256=sIJUtDRXAcz9zh3uRWOMYVOxdWIXlcM3vazdNOAsY_U,17239
56
+ openadapt_ml/retrieval/README.md,sha256=j4gXhTo6yH-5cuw4ER4174V-U6TQakOVT6Hj4kj7B0I,5696
57
+ openadapt_ml/retrieval/USAGE.md,sha256=XDIrX-94Z5nC-wvnBY5yF5gTqUYixxCC3wwUFvQx5YM,9278
58
+ openadapt_ml/retrieval/__init__.py,sha256=xocb84riKLUCezUioKssFRhAQsnvexh4W932o368_qg,2726
59
+ openadapt_ml/retrieval/demo_retriever.py,sha256=fYPLKzlG7yGHfV-F-TECBbxkMyP1fqdqPzTJ7G0oXYU,29184
60
+ openadapt_ml/retrieval/embeddings.py,sha256=W4Bqo48Ds4BI9zZg1awkSi9p5kplRRALEjEgVh-jbsY,19239
61
+ openadapt_ml/retrieval/index.py,sha256=UBFnSxp5T5eKt2txFcd0FytKCw1qxONZfxnFJVrduRQ,5710
62
+ openadapt_ml/retrieval/retriever.py,sha256=idJcz4pUHgPHuZvX3VIUmO8Vs-iw4_1w6UUypynRfVY,4579
63
+ openadapt_ml/runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
+ openadapt_ml/runtime/policy.py,sha256=M-OyhaE5gWh7e3KJ0Ip_YavlfRaEO7S4UKNjvqH_qsg,6724
65
+ openadapt_ml/schema/__init__.py,sha256=W1Rx58WjFpUE2D1hdujD6tkxr5m8U3nE4JBMwdw7kLc,2787
66
+ openadapt_ml/schema/converters.py,sha256=3qK1z8EATFaDi1M0w9T7PLiRtJu6OtQM7JG7qbE4EKU,18392
67
+ openadapt_ml/schema/episode.py,sha256=_QQ34V39DYLaOx5GnH4mKHoXteekqRSq1C2aJe_Y_5Y,15399
68
+ openadapt_ml/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
+ openadapt_ml/scripts/compare.py,sha256=IUi3lQfOV8qm1JMvPDAnk3UqT7vPYNjLGeN8qda1wXc,56921
70
+ openadapt_ml/scripts/demo_policy.py,sha256=NqB1akCKT5dlLER5ToBpF15QYaWnbXtRgWBvrOb6HJc,2105
71
+ openadapt_ml/scripts/eval_policy.py,sha256=Tk1QMNlkfcyHToAqdTgdLkJyEGyVLXMcE5EVmJb89Ng,10381
72
+ openadapt_ml/scripts/make_gif.py,sha256=H9fevBZFH31_7vs-OROfg9A2U6eboWXUNjyU2-XLMqw,4439
73
+ openadapt_ml/scripts/prepare_synthetic.py,sha256=RbnY3QiH_RFk2du3awXFn90jxJQNUkhWlei2NSQ1aUs,1109
74
+ openadapt_ml/scripts/run_qwen_login_benchmark.py,sha256=NWIhCAFSX5pYKFRCec7RkrYtzvz2LNMqhDfXcKxlagM,5655
75
+ openadapt_ml/scripts/train.py,sha256=BrDJxiZx1S8igpNi6hC287kh51qU5RtPcdVq1yxEJkQ,6685
76
+ openadapt_ml/training/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
+ openadapt_ml/training/benchmark_viewer.py,sha256=iD56K467n0qS-nrcxxGZtABQs6qswB2je5Uj4xEacNI,174238
78
+ openadapt_ml/training/shared_ui.py,sha256=7ZdBDo-__pHUMiuvSot-waYvpinAullRXQTsiPoruBw,4823
79
+ openadapt_ml/training/stub_provider.py,sha256=fw2b0EzsCDDFhTZQIKR49BEFmSEN1d4g9rhFihzErWY,10508
80
+ openadapt_ml/training/trainer.py,sha256=K4LMDe-GxnGZB9gt0IGMU67DcqtcggaaQIbJp4GYcYc,90713
81
+ openadapt_ml/training/trl_trainer.py,sha256=9NQnNt2MDLUUUA_V8f2PwsDLeHnb2fOnmxol24-PyjU,11355
82
+ openadapt_ml/training/viewer.py,sha256=7uA2SHW1Uh7v65s8sOSaYuAVUlyp6GsCOGy6YA0hnBQ,121132
83
+ openadapt_ml-0.2.0.dist-info/METADATA,sha256=-pGXAMvRsWMHCdjsj3GzALIcAr0fPaMvEaUv8jkGYfA,28839
84
+ openadapt_ml-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
85
+ openadapt_ml-0.2.0.dist-info/licenses/LICENSE,sha256=2E5UY67RVLedJuNnwGudkAMtfM3LZNUcHgmaL89TAfw,1068
86
+ openadapt_ml-0.2.0.dist-info/RECORD,,
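The new `RECORD` hunk above follows the standard wheel manifest layout: one `path,sha256=<digest>,<size>` row per installed file, with the `RECORD` row itself left blank. A small sketch of reading such a manifest (the path below assumes an installed 0.2.0 wheel):

```python
# Reads a wheel RECORD manifest like the one added in this release.
import csv

with open("openadapt_ml-0.2.0.dist-info/RECORD", newline="") as f:
    for path, digest, size in csv.reader(f):
        if digest:  # the RECORD row itself carries no hash or size
            print(f"{path}: {int(size)} bytes, {digest}")
```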
openadapt_ml/schemas/__init__.py

@@ -1,53 +0,0 @@
- """Schema definitions and validation for openadapt-ml.
-
- Core data structures:
- - Action: A single GUI action (click, type, scroll, etc.)
- - Observation: GUI state observation (screenshot, accessibility tree, etc.)
- - Step: One timestep containing observation + action
- - Episode: A single task attempt / workflow instance
- - Session: Container for multiple episodes
-
- Validation:
- - validate_episode(): Validate an Episode object
- - validate_session(): Validate a Session object
- - validate_episodes(): Validate a list of Episodes
- - ValidationError: Raised on schema violations
- """
-
- from openadapt_ml.schemas.sessions import (
-     Action,
-     ActionType,
-     Episode,
-     Observation,
-     Session,
-     Step,
- )
- from openadapt_ml.schemas.validation import (
-     ValidationError,
-     summarize_episodes,
-     validate_action,
-     validate_episode,
-     validate_episodes,
-     validate_observation,
-     validate_session,
-     validate_step,
- )
-
- __all__ = [
-     # Core types
-     "Action",
-     "ActionType",
-     "Episode",
-     "Observation",
-     "Session",
-     "Step",
-     # Validation
-     "ValidationError",
-     "validate_action",
-     "validate_episode",
-     "validate_episodes",
-     "validate_observation",
-     "validate_session",
-     "validate_step",
-     "summarize_episodes",
- ]
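The deletion above removes the `openadapt_ml.schemas` package (and, per the file list, `sessions.py` and `validation.py` with it), while 0.2.0 adds `openadapt_ml/schema/` with `episode.py` and `converters.py`. A hedged migration note, assuming the new package re-exports the same core types; the diff does not show its actual exports:

```python
# Migration sketch only. The 0.1.0 import is taken from the deleted module above;
# the 0.2.0 line assumes openadapt_ml.schema exposes equivalent names, which this
# diff does not confirm.

# 0.1.0 (removed):
# from openadapt_ml.schemas import Action, Episode, Session, validate_episode

# 0.2.0 (assumed):
from openadapt_ml.schema import Action, Episode, Session
```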