openadapt-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. openadapt_ml/__init__.py +0 -0
  2. openadapt_ml/benchmarks/__init__.py +125 -0
  3. openadapt_ml/benchmarks/agent.py +825 -0
  4. openadapt_ml/benchmarks/azure.py +761 -0
  5. openadapt_ml/benchmarks/base.py +366 -0
  6. openadapt_ml/benchmarks/cli.py +884 -0
  7. openadapt_ml/benchmarks/data_collection.py +432 -0
  8. openadapt_ml/benchmarks/runner.py +381 -0
  9. openadapt_ml/benchmarks/waa.py +704 -0
  10. openadapt_ml/cloud/__init__.py +5 -0
  11. openadapt_ml/cloud/azure_inference.py +441 -0
  12. openadapt_ml/cloud/lambda_labs.py +2445 -0
  13. openadapt_ml/cloud/local.py +790 -0
  14. openadapt_ml/config.py +56 -0
  15. openadapt_ml/datasets/__init__.py +0 -0
  16. openadapt_ml/datasets/next_action.py +507 -0
  17. openadapt_ml/evals/__init__.py +23 -0
  18. openadapt_ml/evals/grounding.py +241 -0
  19. openadapt_ml/evals/plot_eval_metrics.py +174 -0
  20. openadapt_ml/evals/trajectory_matching.py +486 -0
  21. openadapt_ml/grounding/__init__.py +45 -0
  22. openadapt_ml/grounding/base.py +236 -0
  23. openadapt_ml/grounding/detector.py +570 -0
  24. openadapt_ml/ingest/__init__.py +43 -0
  25. openadapt_ml/ingest/capture.py +312 -0
  26. openadapt_ml/ingest/loader.py +232 -0
  27. openadapt_ml/ingest/synthetic.py +1102 -0
  28. openadapt_ml/models/__init__.py +0 -0
  29. openadapt_ml/models/api_adapter.py +171 -0
  30. openadapt_ml/models/base_adapter.py +59 -0
  31. openadapt_ml/models/dummy_adapter.py +42 -0
  32. openadapt_ml/models/qwen_vl.py +426 -0
  33. openadapt_ml/runtime/__init__.py +0 -0
  34. openadapt_ml/runtime/policy.py +182 -0
  35. openadapt_ml/schemas/__init__.py +53 -0
  36. openadapt_ml/schemas/sessions.py +122 -0
  37. openadapt_ml/schemas/validation.py +252 -0
  38. openadapt_ml/scripts/__init__.py +0 -0
  39. openadapt_ml/scripts/compare.py +1490 -0
  40. openadapt_ml/scripts/demo_policy.py +62 -0
  41. openadapt_ml/scripts/eval_policy.py +287 -0
  42. openadapt_ml/scripts/make_gif.py +153 -0
  43. openadapt_ml/scripts/prepare_synthetic.py +43 -0
  44. openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
  45. openadapt_ml/scripts/train.py +174 -0
  46. openadapt_ml/training/__init__.py +0 -0
  47. openadapt_ml/training/benchmark_viewer.py +1538 -0
  48. openadapt_ml/training/shared_ui.py +157 -0
  49. openadapt_ml/training/stub_provider.py +276 -0
  50. openadapt_ml/training/trainer.py +2446 -0
  51. openadapt_ml/training/viewer.py +2970 -0
  52. openadapt_ml-0.1.0.dist-info/METADATA +818 -0
  53. openadapt_ml-0.1.0.dist-info/RECORD +55 -0
  54. openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
  55. openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,818 @@
1
+ Metadata-Version: 2.4
2
+ Name: openadapt-ml
3
+ Version: 0.1.0
4
+ Summary: Model-agnostic, domain-agnostic ML engine for GUI automation agents
5
+ Project-URL: Homepage, https://github.com/OpenAdaptAI/openadapt-ml
6
+ Project-URL: Repository, https://github.com/OpenAdaptAI/openadapt-ml
7
+ Project-URL: Documentation, https://github.com/OpenAdaptAI/openadapt-ml/tree/main/docs
8
+ Author-email: "MLDSAI Inc." <richard@mldsai.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: agents,automation,fine-tuning,gui,ml,vision-language-models,vlm
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: azure-ai-ml>=1.30.0
21
+ Requires-Dist: azure-identity>=1.25.1
22
+ Requires-Dist: bitsandbytes>=0.41.0
23
+ Requires-Dist: google-generativeai>=0.8.5
24
+ Requires-Dist: matplotlib>=3.10.7
25
+ Requires-Dist: openadapt-capture>=0.1.0
26
+ Requires-Dist: peft>=0.18.0
27
+ Requires-Dist: pillow>=12.0.0
28
+ Requires-Dist: pydantic-settings>=2.0.0
29
+ Requires-Dist: pytest>=9.0.2
30
+ Requires-Dist: pyyaml>=6.0.3
31
+ Requires-Dist: torch>=2.9.1
32
+ Requires-Dist: torchvision>=0.24.1
33
+ Requires-Dist: transformers>=4.57.3
34
+ Provides-Extra: api
35
+ Requires-Dist: anthropic>=0.40.0; extra == 'api'
36
+ Requires-Dist: openai>=1.0.0; extra == 'api'
37
+ Requires-Dist: pydantic-settings>=2.0.0; extra == 'api'
38
+ Provides-Extra: azure
39
+ Requires-Dist: azure-ai-ml>=1.0.0; extra == 'azure'
40
+ Requires-Dist: azure-identity>=1.0.0; extra == 'azure'
41
+ Provides-Extra: dev
42
+ Requires-Dist: pytest>=9.0.0; extra == 'dev'
43
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
44
+ Provides-Extra: lambda-labs
45
+ Requires-Dist: requests>=2.28.0; extra == 'lambda-labs'
46
+ Description-Content-Type: text/markdown
47
+
48
+ # OpenAdapt-ML
49
+
50
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
51
+ [![Python Version](https://img.shields.io/badge/python-3.12-blue)](https://www.python.org/)
52
+
53
+ OpenAdapt-ML is a **model-agnostic, domain-agnostic ML engine** for GUI
54
+ automation agents.
55
+
56
+ It provides:
57
+
58
+ - **Schemas** for GUI interaction trajectories (screens + actions + goals).
59
+ - **Synthetic semantic UI generation** for bootstrapping datasets.
60
+ - **Dataset builders** that turn episodes into next-action SFT samples.
61
+ - **VLM adapters** (Qwen3-VL, Qwen2.5-VL) using Hugging Face + PEFT.
62
+ - A minimal **supervised training loop** for fine-tuning.
63
+ - A simple **runtime policy** API that predicts the next GUI action.
64
+
65
+ The design is described in detail in [`docs/design.md`](docs/design.md).
66
+
67
+ ---
68
+
69
+ ## 1. Quickstart
70
+
71
+ ### 1.1 Install dependencies
72
+
73
+ From the repository root:
74
+
75
+ ```bash
76
+ uv sync
77
+ ```
78
+
79
+ ### 1.2 Run a small demo policy
80
+
81
+ Run a fast, model-free smoke test:
82
+
83
+ ```bash
84
+ uv run python -m openadapt_ml.scripts.demo_policy --backend dummy
85
+ ```
86
+
87
+ ### 1.3 Run the synthetic login benchmark (end-to-end)
88
+
89
+ On a machine with a suitable GPU, you can reproduce the Qwen3-VL synthetic
90
+ login benchmark (train → eval base/FT → plot) with a single command:
91
+
92
+ ```bash
93
+ uv run python -m openadapt_ml.scripts.run_qwen_login_benchmark \
94
+ --config configs/qwen3vl_synthetic_dev.yaml \
95
+ --out-dir experiments/qwen_login/2b_dev
96
+ ```
97
+
98
+ This default invocation will:
99
+
100
+ - Train a LoRA adapter on the hardened synthetic login scenario.
101
+ - Evaluate both the **base** and **fine-tuned** Qwen3-VL models on fresh
102
+ synthetic episodes.
103
+ - Write eval JSONs and a comparison plot under
104
+ `experiments/qwen_login/2b_dev/`.
105
+
106
+ The `qwen3vl_synthetic_dev` config is sized for small development runs on Apple
107
+ Silicon / CPU, but will also run on CUDA GPUs.
108
+
109
+ To additionally compare against hosted API backends (Claude Sonnet 4.5 and
110
+ OpenAI GPT-5.1), first install the optional `api` extra and configure your API
111
+ keys:
112
+
113
+ ```bash
114
+ uv sync --extra api
115
+
116
+ # Option 1: Use .env file (recommended)
117
+ cp .env.example .env
118
+ # Edit .env with your API keys
119
+
120
+ # Option 2: Export environment variables (for CI/containers)
121
+ export ANTHROPIC_API_KEY=... # for Claude Sonnet 4.5
122
+ export OPENAI_API_KEY=... # for GPT-5.1
123
+ ```
124
+
125
+ Then run:
126
+
127
+ ```bash
128
+ uv run python -m openadapt_ml.scripts.run_qwen_login_benchmark \
129
+ --config configs/qwen3vl_synthetic_dev.yaml \
130
+ --out-dir experiments/qwen_login/2b_dev \
131
+ --include-all-apis
132
+ ```
133
+
134
+ This will evaluate and plot **Qwen3 base**, **Qwen3 FT**, **Claude Sonnet 4.5**,
135
+ and **GPT-5.1** on the same synthetic login benchmark.
136
+
137
+ For complete documentation including training setup, evaluation metrics, SoM mode results, and reproduction instructions, see **[`docs/qwen_login_experiment.md`](docs/qwen_login_experiment.md)**. For implementation details and technical notes, see `docs/state_and_next_steps_qwen_login.md`.
138
+
139
+ ---
140
+
141
+ ## 2. Repository Structure
142
+
143
+ Key modules:
144
+
145
+ - `openadapt_ml/schemas/`
146
+ - Canonical dataclasses for `Session`, `Episode`, `Step`, `Observation`,
147
+ `Action`.
148
+ - `openadapt_ml/ingest/synthetic.py`
149
+ - Synthetic semantic UI generator (e.g. login screen) that produces PNG
150
+ screenshots and scripted episodes.
151
+ - `openadapt_ml/datasets/next_action.py`
152
+ - Converts episodes into goal-conditioned, chat-style next-action SFT
153
+ samples suitable for VLM fine-tuning.
154
+ - `openadapt_ml/models/base_adapter.py`
155
+ - `BaseVLMAdapter` abstraction shared by all VLM backends.
156
+ - `openadapt_ml/models/qwen_vl.py`
157
+ - `QwenVLAdapter` implementing support for **Qwen3-VL** and
158
+ **Qwen2.5-VL**.
159
+ - `openadapt_ml/models/dummy_adapter.py`
160
+ - Tiny fake adapter used to validate training and runtime flows without
161
+ loading a real VLM.
162
+ - `openadapt_ml/training/trainer.py`
163
+ - Minimal supervised training loop (`train_supervised`) with gradient
164
+ accumulation and logging.
165
+ - `openadapt_ml/runtime/policy.py`
166
+ - `AgentPolicy` that formats inputs for a VLM and parses textual actions
167
+ like `CLICK(x=..., y=...)` and `DONE()` into structured `Action`s.
168
+ - `openadapt_ml/scripts/train.py`
169
+ - CLI entry point for running synthetic-data training with a chosen
170
+ model/config.
171
+ - `openadapt_ml/scripts/demo_policy.py`
172
+ - CLI demo showing how to use `AgentPolicy` with different backends
173
+ (dummy, Qwen3-VL, Qwen2.5-VL).
174
+
175
+ Configs and docs:
176
+
177
+ - `configs/qwen3vl_synthetic.yaml`
178
+ - Synthetic training config for **Qwen3-VL-8B-Instruct**.
179
+ - `configs/qwen2_5vl_synthetic.yaml`
180
+ - Synthetic training config for **Qwen2.5-VL-7B-Instruct**.
181
+ - `docs/design.md`
182
+ - High-level design document (scope, architecture, schemas, adapters,
183
+ training, runtime, and evaluation strategy).
184
+
185
+ ---
186
+
187
+ ## 3. Environment Setup
188
+
189
+ OpenAdapt-ML targets **Python 3.12** and uses [`uv`](https://github.com/astral-sh/uv)
190
+ for dependency management.
191
+
192
+ ### 3.1 Install and sync
193
+
194
+ From the repository root:
195
+
196
+ ```bash
197
+ # Ensure uv is installed (see uv docs for platform-specific install)
198
+ # Then:
199
+ uv sync
200
+ ```
201
+
202
+ This will create a virtual environment (e.g. `.venv/`) and install all
203
+ packages declared in `pyproject.toml`.
204
+
205
+ ### 3.2 Working inside the environment
206
+
207
+ Use `uv run` to execute Python modules and scripts with the synced
208
+ environment:
209
+
210
+ ```bash
211
+ uv run python -m openadapt_ml.scripts.train --help
212
+ ```
213
+
214
+ You can also run `pytest` or other tools via `uv run`.
215
+
216
+ ---
217
+
218
+ ## 4. Synthetic Data & Datasets
219
+
220
+ The v1 pipeline is validated on **synthetic, semantic UIs**, starting with a
221
+ simple login flow.
222
+
223
+ ### 4.1 Synthetic scenarios
224
+
225
+ OpenAdapt-ML includes synthetic UI generators for structured GUI automation benchmarks.
226
+ Currently two scenarios are supported:
227
+
228
+ #### Login Scenario (6 steps, 3 elements)
229
+
230
+ A simple login form with username, password, and login button.
231
+
232
+ ![Login Demo](experiments/qwen_login/login_demo.gif)
233
+
234
+ **Login SoM Evaluation Results:**
235
+
236
+ | Metric | Qwen3-VL-2B FT |
237
+ |--------|----------------|
238
+ | Action Type Accuracy | **100%** |
239
+ | Element Accuracy | **100%** |
240
+ | Episode Success Rate | **100%** |
241
+ | Episodes / Steps | 32 / 192 |
242
+
243
+ #### Registration Scenario (12 steps, 6 elements)
244
+
245
+ A more complex registration form with first name, last name, email, password, confirm password, and register button.
246
+
247
+ ![Registration Demo](experiments/qwen_login/registration_demo.gif)
248
+
249
+ **Registration SoM Evaluation Results:**
250
+
251
+ | Metric | Qwen3-VL-2B FT |
252
+ |--------|----------------|
253
+ | Action Type Accuracy | **100%** |
254
+ | Element Accuracy | **100%** |
255
+ | Episode Success Rate | **100%** |
256
+ | Episodes / Steps | 32 / 384 |
257
+
258
+ ### 4.2 Generating synthetic data
259
+
260
+ Synthetic data is generated on the fly by `generate_synthetic_sessions` in
261
+ `openadapt_ml/ingest/synthetic.py` and used internally by the training
262
+ scripts.
263
+
264
+ You can also call it directly from Python:
265
+
266
+ ```python
267
+ from openadapt_ml.ingest.synthetic import generate_synthetic_sessions
268
+
269
+ # Login scenario (default)
270
+ sessions = generate_synthetic_sessions(num_sessions=2, seed=123, output_dir="synthetic_login")
271
+
272
+ # Registration scenario
273
+ sessions = generate_synthetic_sessions(
274
+ num_sessions=2,
275
+ seed=123,
276
+ output_dir="synthetic_registration",
277
+ scenario="registration", # "login" or "registration"
278
+ use_som=True, # Enable Set-of-Marks visual overlays
279
+ )
280
+ ```
281
+
282
+ Each session contains episodes with:
283
+
284
+ - A **goal** (e.g. "Log in as demo user").
285
+ - A sequence of **steps**, each with:
286
+ - An observation (screenshot path).
287
+ - An action (e.g. `CLICK`, `TYPE`, `DONE`).
288
+
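+ A quick way to inspect this structure from Python is sketched below; the
+ attribute names are assumptions, and the canonical fields live in
+ `openadapt_ml/schemas/sessions.py`:
+
+ ```python
+ # `sessions` comes from the generate_synthetic_sessions(...) call above;
+ # attribute names are illustrative and may differ from the real schema.
+ for session in sessions:
+     for episode in session.episodes:
+         print("goal:", episode.goal)
+         for step in episode.steps:
+             print("  observation:", step.observation)  # screenshot path
+             print("  action:", step.action)            # e.g. CLICK / TYPE / DONE
+ ```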
289
+ ### 4.3 Next-action SFT samples
290
+
291
+ Episodes are converted into SFT-style samples by
292
+ `build_next_action_sft_samples` in `openadapt_ml/datasets/next_action.py`.
293
+
294
+ Each sample has the form:
295
+
296
+ ```python
297
+ {
298
+ "images": ["/path/to/screenshot.png"],
299
+ "messages": [
300
+ {"role": "system", "content": ...},
301
+ {"role": "user", "content": "Goal: ...\nCurrent screen: ..."},
302
+ {"role": "assistant", "content": "CLICK(x=..., y=...)"},
303
+ ],
304
+ }
305
+ ```
306
+
307
+ These samples are wrapped in a simple `NextActionDataset` for use with the
308
+ training loop.
309
+
310
+ For the full, canonical definition of the action DSL (CLICK/TYPE/WAIT/DONE)
311
+ and its invariants, see `docs/design.md` §7.4.
312
+
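+ As a concrete sketch of the flow from synthetic sessions to a
+ training-ready dataset (the signatures of `build_next_action_sft_samples`
+ and `NextActionDataset` below are assumptions; `scripts/train.py` shows
+ the canonical path):
+
+ ```python
+ from openadapt_ml.datasets.next_action import (
+     NextActionDataset,
+     build_next_action_sft_samples,
+ )
+ from openadapt_ml.ingest.synthetic import generate_synthetic_sessions
+
+ sessions = generate_synthetic_sessions(
+     num_sessions=2, seed=123, output_dir="synthetic_login"
+ )
+
+ # Flatten sessions to episodes, build chat-style samples, wrap in a dataset.
+ episodes = [episode for session in sessions for episode in session.episodes]
+ samples = build_next_action_sft_samples(episodes)   # assumed signature
+ dataset = NextActionDataset(samples)                # assumed signature
+
+ print(len(dataset))
+ print(samples[0]["messages"][-1]["content"])  # e.g. "CLICK(x=..., y=...)"
+ ```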
313
+ ---
314
+
315
+ ## 5. Training
316
+
317
+ Training is driven by `openadapt_ml/scripts/train.py` and YAML configs under
318
+ `configs/`.
319
+
320
+ The training script:
321
+
322
+ 1. Loads a config file (YAML).
323
+ 2. Generates synthetic sessions.
324
+ 3. Flattens to episodes and builds SFT samples.
325
+ 4. Wraps them in a `NextActionDataset`.
326
+ 5. Instantiates a VLM adapter (e.g. `QwenVLAdapter`).
327
+ 6. Runs `train_supervised` over the dataset.
328
+
329
+ ### 5.1 Qwen3-VL synthetic training
330
+
331
+ Config: `configs/qwen3vl_synthetic.yaml`
332
+
333
+ Key fields:
334
+
335
+ ```yaml
336
+ model:
337
+ name: Qwen/Qwen3-VL-8B-Instruct
338
+ load_in_4bit: false # 4-bit quantization is disabled on macOS / Apple Silicon
339
+
340
+ # LoRA config and training hyperparameters are also defined in the YAML.
341
+ ```
342
+
343
+ Run:
344
+
345
+ ```bash
346
+ uv run python -m openadapt_ml.scripts.train --config configs/qwen3vl_synthetic.yaml
347
+ ```
348
+
349
+ This will:
350
+
351
+ - Download and load `Qwen/Qwen3-VL-8B-Instruct`.
352
+ - Generate a small synthetic dataset.
353
+ - Run a single-epoch supervised fine-tuning loop.
354
+ - Print loss values as training progresses.
355
+
356
+ ### 5.2 Qwen2.5-VL synthetic training
357
+
358
+ Config: `configs/qwen2_5vl_synthetic.yaml`
359
+
360
+ Key fields:
361
+
362
+ ```yaml
363
+ model:
364
+ name: Qwen/Qwen2.5-VL-7B-Instruct
365
+ load_in_4bit: false
366
+ ```
367
+
368
+ Run:
369
+
370
+ ```bash
371
+ uv run python -m openadapt_ml.scripts.train --config configs/qwen2_5vl_synthetic.yaml
372
+ ```
373
+
374
+ This exercises the **Qwen2.5-VL** path in `QwenVLAdapter`, using a
375
+ `process_vision_info`-style helper internally to pack image inputs in the
376
+ format expected by the Qwen2.5-VL processor.
377
+
378
+ > Note: Both configs are sized for **small synthetic smoke runs**, not
379
+ > large-scale production training.
380
+
381
+ ### 5.3 Qwen3-VL synthetic login benchmark (hero example)
382
+
383
+ OpenAdapt-ML ships a **synthetic login** benchmark backed by Qwen3-VL,
384
+ used to compare **base vs LoRA-fine-tuned** models on a hardened synthetic
385
+ environment (layout jitter + a decoy "Help" button).
386
+
387
+ FT = **LoRA fine-tuned Qwen3-VL** on synthetic login.
388
+ Base = **frozen pretrained Qwen3-VL**.
389
+
390
+ **Comprehensive Model Comparison (Login - 6 steps):**
391
+
392
+ ![Comprehensive VLM Comparison](experiments/qwen_login/comprehensive_comparison.png)
393
+
394
+ The plot compares all six evaluated models across four key metrics (action type accuracy,
395
+ coordinate error, click hit rate, and episode success rate). The legend shows color coding
396
+ for model types (Qwen 2B/8B vs API models) and hatching patterns for fine-tuned vs base models.
397
+ It exposes step-level performance metrics, letting us visually answer the question: "Does a fine-tuned small local model outperform large API models?"
398
+
399
+ **Comprehensive Results** (all models on hardened synthetic login):
400
+
401
+ | Model | Type | Action Accuracy | Coord Error | Click Hit Rate |
402
+ |---------------------|--------------|-----------------|-------------|----------------|
403
+ | Qwen3-VL-2B base | Offline | 0.143 | N/A | N/A |
404
+ | **Qwen3-VL-2B FT** | **Offline** | **0.469** | **0.051** | **0.850** |
405
+ | Qwen3-VL-8B base | Offline | 0.143 | N/A | N/A |
406
+ | **Qwen3-VL-8B FT** | **Offline** | **0.286** | **0.004** | **1.000** |
407
+ | Claude Sonnet 4.5 | API | 0.121 | 0.757 | 0.000 |
408
+ | GPT-5.1 | API | 0.183 | 0.057 | 0.600 |
409
+
410
+ **Key findings:**
411
+ 1. **Fine-tuning delivers massive gains**: Both 2B and 8B models show 2-3x improvement in action accuracy after fine-tuning
412
+ 2. **Small fine-tuned models beat large APIs**: Qwen3-VL-2B FT (0.469 action accuracy) outperforms both Claude Sonnet 4.5 (0.121) and GPT-5.1 (0.183)
413
+ 3. **Precision matters**: Fine-tuned models have excellent click precision (85-100% hit rate, coord error of ~0.05 or lower) while API models struggle with the action format
414
+ 4. **Size vs specialization**: The fine-tuned 2B model outperforms the general-purpose Claude Sonnet 4.5, showing that domain-specific fine-tuning trumps raw model size
415
+
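+ The step-level metrics above can be approximated as follows. These are
+ illustrative definitions only, not the implementations in
+ `openadapt_ml/evals/`:
+
+ ```python
+ import math
+
+ def coord_error(pred_xy, gold_xy):
+     """Euclidean distance between predicted and ground-truth clicks,
+     in normalized [0, 1] screen coordinates."""
+     return math.dist(pred_xy, gold_xy)
+
+ def click_hit(pred_xy, gold_bbox):
+     """True if the predicted click lands inside the target's bounding box."""
+     x, y = pred_xy
+     x1, y1, x2, y2 = gold_bbox
+     return x1 <= x <= x2 and y1 <= y <= y2
+
+ print(coord_error((0.45, 0.71), (0.47, 0.69)))            # ~0.028
+ print(click_hit((0.45, 0.71), (0.40, 0.65, 0.55, 0.75)))  # True
+ ```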
416
+ ### 5.4 Set-of-Marks (SoM) Mode: 100% Accuracy
417
+
418
+ With **Set-of-Marks** visual prompting, fine-tuned Qwen3-VL-2B achieves **100% accuracy** on both login (6-step) and registration (12-step) scenarios:
419
+
420
+ | Scenario | Steps | Elements | Action Acc | Element Acc | Episode Success |
421
+ |----------|-------|----------|------------|-------------|-----------------|
422
+ | Login | 6 | 3 | **100%** | **100%** | **100%** |
423
+ | Registration | 12 | 6 | **100%** | **100%** | **100%** |
424
+
425
+ **Cost/Latency Comparison (SoM mode):**
426
+
427
+ | Approach | Login Accuracy | Registration Accuracy | Cost | Latency |
428
+ |----------|----------------|----------------------|------|---------|
429
+ | Claude API + SoM | 100% | 100%* | ~$0.01/step | ~500ms |
430
+ | GPT-4.1 API + SoM | 100% | 100%* | ~$0.01/step | ~500ms |
431
+ | **Qwen 2B + SoM** | **100%** | **100%** | **Free (local)** | **~50ms** |
432
+
433
+ *API results on registration pending evaluation
434
+
435
+ **How SoM works:** Instead of predicting precise coordinates (`CLICK(x=0.42, y=0.31)`), the model selects numbered UI elements (`CLICK([1])`). This reduces spatial reasoning to element selection, which small models handle well.
436
+
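+ A minimal illustration of the difference (the regex is a sketch, not the
+ parser used inside openadapt_ml):
+
+ ```python
+ import re
+
+ coordinate_action = "CLICK(x=0.42, y=0.31)"  # coordinate DSL
+ som_action = "CLICK([1])"                    # Set-of-Marks DSL
+
+ # With SoM, parsing reduces to reading a single element index.
+ match = re.fullmatch(r"CLICK\(\[(\d+)\]\)", som_action)
+ element_id = int(match.group(1))
+ print(element_id)  # -> 1, the numbered element to click
+ ```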
437
+ To use SoM mode:
438
+
439
+ ```bash
440
+ # Training with SoM
441
+ uv run python -m openadapt_ml.scripts.train --config configs/qwen3vl_synthetic_som.yaml
442
+
443
+ # Evaluation with SoM
444
+ uv run python -m openadapt_ml.scripts.eval_policy \
445
+ --config configs/qwen3vl_synthetic_som.yaml \
446
+ --backend qwen3 \
447
+ --dsl-mode som \
448
+ --overfit # Check memorization
449
+ ```
450
+
451
+ For the full SoM investigation report, see [`experiments/qwen_login/SOM_INVESTIGATION_REPORT.md`](experiments/qwen_login/SOM_INVESTIGATION_REPORT.md).
452
+
453
+ ---
454
+
455
+ ## 6. Grounding Module
456
+
457
+ OpenAdapt-ML includes a **grounding module** for locating UI elements on screenshots using natural language descriptions. This enables policy/grounding separation where the policy decides *what* to do and the grounder finds *where* to do it.
458
+
459
+ ### 6.1 GeminiGrounder Demo
460
+
461
+ The `GeminiGrounder` uses Google's Gemini vision API to locate UI elements:
462
+
463
+ ![Grounding Demo](docs/images/grounding_demo.png)
464
+
465
+ *Calculator button "2" located by GeminiGrounder with 99% confidence*
466
+
467
+ ```python
468
+ from openadapt_ml.grounding import GeminiGrounder
469
+
470
+ grounder = GeminiGrounder() # Uses GOOGLE_API_KEY from .env
471
+ candidates = grounder.ground(screenshot, "the login button", k=3)
472
+
473
+ if candidates:
474
+ best = candidates[0]
475
+ print(f"Found at {best.centroid} with {best.confidence:.0%} confidence")
476
+ ```
477
+
478
+ ### 6.2 Set-of-Marks (SoM) Support
479
+
480
+ The grounding module includes functions for extracting all UI elements and overlaying numbered labels (Set-of-Marks):
481
+
482
+ ```python
483
+ from openadapt_ml.grounding import extract_ui_elements, overlay_element_marks
484
+
485
+ # Extract all interactive elements
486
+ elements = extract_ui_elements(screenshot)
487
+ # Returns: [{"id": 1, "label": "Login button", "bbox": [x1,y1,x2,y2], ...}, ...]
488
+
489
+ # Overlay numbered labels on screenshot
490
+ marked_screenshot = overlay_element_marks(screenshot, elements, style="compact")
491
+ marked_screenshot.save("screenshot_with_marks.png")
492
+ ```
493
+
494
+ This enables element-based actions using indices instead of coordinates:
495
+ - Old: `CLICK(x=0.487, y=0.328)` - coordinate-based, brittle
496
+ - New: `CLICK([1])` - element-based, robust
497
+
498
+ See `docs/gemini_grounding.md` for full documentation and `examples/test_gemini_grounding.py` for a complete example.
499
+
500
+ ### 6.3 Available Grounders
501
+
502
+ | Grounder | Description | Latency | Use Case |
503
+ |----------|-------------|---------|----------|
504
+ | `GeminiGrounder` | Google Gemini vision API | ~3s | Real UIs, zero-shot |
505
+ | `OracleGrounder` | Ground-truth bboxes | ~0ms | Evaluation |
506
+ | `DetectorGrounder` | Generic wrapper with backend selection | varies | Flexible |
507
+
508
+ ### 6.4 Grounding Evaluation
509
+
510
+ The `openadapt_ml.evals.grounding` module provides metrics for evaluating grounding accuracy:
511
+
512
+ ```python
513
+ from openadapt_ml.evals import GroundingMetrics, evaluate_grounder
514
+
515
+ metrics = evaluate_grounder(grounder, test_cases, k=5)
516
+ print(metrics)
517
+ # Grounding Metrics (n=10):
518
+ # Mean IoU: 0.720
519
+ # Centroid Hit Rate: 0.900
520
+ # Oracle Hit @1: 0.800
521
+ # Mean Latency: 3150ms
522
+ ```
523
+
524
+ ---
525
+
526
+ ## 7. VLM Adapters
527
+
528
+ All VLM backends implement the shared `BaseVLMAdapter` interface in
529
+ `openadapt_ml/models/base_adapter.py` (prepare inputs, compute loss, generate
530
+ text from a sample).
531
+
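+ As a rough illustration of the adapter contract, a toy adapter might look
+ like the sketch below. Only `generate(sample)` is shown; the base class
+ may require additional methods (input preparation, loss computation) that
+ a real adapter must also implement:
+
+ ```python
+ from openadapt_ml.models.base_adapter import BaseVLMAdapter
+
+ class CannedAdapter(BaseVLMAdapter):
+     """Hypothetical adapter that always answers DONE(); not part of the package."""
+
+     def generate(self, sample: dict) -> str:
+         # A real adapter would run a VLM over sample["images"] and
+         # sample["messages"]; this stub just ends the episode.
+         return "DONE()"
+ ```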
532
+ Current adapters include:
533
+
534
+ - `QwenVLAdapter` (`openadapt_ml/models/qwen_vl.py`) for Qwen3-VL and
535
+ Qwen2.5-VL.
536
+ - `DummyAdapter` (`openadapt_ml/models/dummy_adapter.py`) for fast smoke
537
+ tests without loading a real VLM.
538
+ - `ApiVLMAdapter` (`openadapt_ml/models/api_adapter.py`) for hosted VLM
539
+ APIs (Anthropic Claude Sonnet 4.5 and OpenAI GPT-5.1). This adapter is
540
+ inference-only and implements `generate` using the respective SDKs.
541
+
542
+ For full adapter internals and training-time vs runtime behavior, see
543
+ `docs/design.md` §8.
544
+
545
+ ### 7.1 API-backed adapters
546
+
547
+ To use the API-backed adapter from Python, configure API keys via a `.env`
548
+ file or environment variables, or pass them explicitly:
549
+
550
+ ```python
551
+ from openadapt_ml.models.api_adapter import ApiVLMAdapter
552
+
553
+ # Use .env file or environment variables (ANTHROPIC_API_KEY / OPENAI_API_KEY)
554
+ claude_adapter = ApiVLMAdapter(provider="anthropic")
555
+ gpt_adapter = ApiVLMAdapter(provider="openai")
556
+
557
+ # Or pass API keys explicitly from your application's config
558
+ claude_adapter = ApiVLMAdapter(provider="anthropic", api_key="...")
559
+ gpt_adapter = ApiVLMAdapter(provider="openai", api_key="...")
560
+ ```
561
+
562
+ The existing CLI scripts `scripts/demo_policy.py` and
563
+ `scripts/eval_policy.py` expose these backends via the `--backend` flag
564
+ (`claude` / `openai`).
565
+
566
+ ---
567
+
568
+ ## 8. Runtime Policy & Demos
569
+
570
+ The runtime policy is implemented in `openadapt_ml/runtime/policy.py` as
571
+ `AgentPolicy`.
572
+
573
+ ### 8.1 AgentPolicy
574
+
575
+ `AgentPolicy` is initialized with a VLM adapter (dummy or real). Given an
576
+ SFT-style sample, it:
577
+
578
+ 1. Calls `adapter.generate(sample)` to obtain assistant text.
579
+ 2. Parses actions from strings like:
580
+ - `CLICK(x=0.45, y=0.71)`
581
+ - `DONE()`
582
+ 3. Returns a structured `Action` plus an optional free-form `thought`.
583
+
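+ Putting these steps together, a minimal usage sketch with the dummy
+ backend (the constructor and the `predict_action` method name are
+ assumptions; see `scripts/demo_policy.py` for the supported flow):
+
+ ```python
+ from openadapt_ml.models.dummy_adapter import DummyAdapter
+ from openadapt_ml.runtime.policy import AgentPolicy
+
+ policy = AgentPolicy(DummyAdapter())  # any adapter, dummy or real
+
+ # An SFT-style sample, shaped like the dataset builders' output but
+ # without the assistant turn (that is what we want predicted).
+ sample = {
+     "images": ["screenshot.png"],
+     "messages": [
+         {"role": "system", "content": "You are a GUI automation agent."},
+         {"role": "user", "content": "Goal: Log in as demo user\nCurrent screen: login form"},
+     ],
+ }
+
+ action, thought = policy.predict_action(sample)
+ print(action, thought)
+ ```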
584
+ ### 8.2 Demo script
585
+
586
+ `openadapt_ml/scripts/demo_policy.py` demonstrates how to use
587
+ `AgentPolicy` with different backends.
588
+
589
+ Run with a **dummy** backend (fast, no model load):
590
+
591
+ ```bash
592
+ uv run python -m openadapt_ml.scripts.demo_policy --backend dummy
593
+ ```
594
+
595
+ Run with **Qwen3-VL** backend:
596
+
597
+ ```bash
598
+ uv run python -m openadapt_ml.scripts.demo_policy --backend qwen3
599
+ ```
600
+
601
+ Run with **Qwen2.5-VL** backend:
602
+
603
+ ```bash
604
+ uv run python -m openadapt_ml.scripts.demo_policy --backend qwen2_5
605
+ ```
606
+
607
+ Each invocation will:
608
+
609
+ - Generate a synthetic login episode and select one step.
610
+ - Build an SFT-style sample from that step.
611
+ - Use `AgentPolicy` to predict the next action.
612
+ - Print the raw messages and the parsed action/thought.
613
+
614
+ ---
615
+
616
+ ## 9. Testing
617
+
618
+ Basic tests are provided under `tests/`.
619
+
620
+ Run the test suite with:
621
+
622
+ ```bash
623
+ uv run pytest
624
+ ```
625
+
626
+ In particular:
627
+
628
+ - `tests/test_training_dummy.py` runs a smoke test over the training loop
629
+ using `DummyAdapter`.
630
+
631
+ ---
632
+
633
+ ## 10. Training on Real Data
634
+
635
+ OpenAdapt-ML supports training on real GUI recordings from two sources:
636
+ 1. **openadapt-capture** - New lightweight recording format
637
+ 2. **OpenAdapt database** - Original OpenAdapt recordings (legacy)
638
+
639
+ ### 10.1 Training on openadapt-capture recordings
640
+
641
+ [openadapt-capture](https://github.com/OpenAdaptAI/openadapt-capture) is a lightweight GUI recording tool.
642
+
643
+ ```bash
644
+ # Install openadapt-capture
645
+ uv pip install openadapt-capture
646
+
647
+ # Record a workflow (e.g., turning off Night Shift)
648
+ openadapt-capture record --output ~/captures/turn-off-nightshift
649
+
650
+ # Train on the capture
651
+ uv run python -m openadapt_ml.scripts.train \
652
+ --config configs/qwen3vl_capture.yaml \
653
+ --capture ~/captures/turn-off-nightshift \
654
+ --open # Opens training dashboard in browser
655
+ ```
656
+
657
+ The goal is automatically derived from the directory name (e.g., `"Turn off nightshift"`).
658
+
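+ The derivation is roughly a directory-name-to-sentence transformation
+ along these lines (an illustration, not the exact code in the training
+ script):
+
+ ```python
+ from pathlib import Path
+
+ capture_dir = Path("~/captures/turn-off-nightshift").expanduser()
+ goal = capture_dir.name.replace("-", " ").capitalize()
+ print(goal)  # "Turn off nightshift"
+ ```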
659
+ ### 10.2 Compare human vs AI predictions
660
+
661
+ ```bash
662
+ uv run python -m openadapt_ml.scripts.compare \
663
+ --capture ~/captures/turn-off-nightshift \
664
+ --checkpoint checkpoints/qwen3vl2b_capture_lora \
665
+ --open # Opens comparison viewer
666
+ ```
667
+
668
+ The comparison viewer shows:
669
+ - Side-by-side human actions vs model predictions
670
+ - Click position overlays on screenshots
671
+ - Accuracy metrics and distance calculations
672
+ - Navigation between training dashboard and comparison viewer
673
+
674
+ ---
675
+
676
+ ## 11. Local Training (CUDA / Apple Silicon)
677
+
678
+ Train locally on your own GPU; CUDA and Apple Silicon (MPS) devices are auto-detected.
679
+
680
+ ### 11.1 Quick start
681
+
682
+ ```bash
683
+ # Train on a capture (auto-detects device and config)
684
+ uv run python -m openadapt_ml.cloud.local train \
685
+ --capture ~/captures/turn-off-nightshift \
686
+ --open # Opens dashboard in browser
687
+ ```
688
+
689
+ ### 11.2 Training workflow
690
+
691
+ ```bash
692
+ # Check device and training status
693
+ uv run python -m openadapt_ml.cloud.local status
694
+
695
+ # Train on a capture
696
+ uv run python -m openadapt_ml.cloud.local train --capture ~/captures/my-workflow --open
697
+
698
+ # Check training health (loss progression, convergence)
699
+ uv run python -m openadapt_ml.cloud.local check
700
+
701
+ # Start dashboard server
702
+ uv run python -m openadapt_ml.cloud.local serve --open
703
+
704
+ # Regenerate viewer
705
+ uv run python -m openadapt_ml.cloud.local viewer --open
706
+
707
+ # Run human vs AI comparison
708
+ uv run python -m openadapt_ml.cloud.local compare \
709
+ --capture ~/captures/my-workflow \
710
+ --checkpoint checkpoints/qwen3vl2b_capture_lora \
711
+ --open
712
+ ```
713
+
714
+ ---
715
+
716
+ ## 12. Cloud GPU Training (Lambda Labs)
717
+
718
+ For faster training on powerful GPUs, use Lambda Labs. Full documentation: [`docs/cloud_gpu_training.md`](docs/cloud_gpu_training.md).
719
+
720
+ ### 12.1 Quick start
721
+
722
+ ```bash
723
+ # Set API key
724
+ export LAMBDA_API_KEY=your_key_here
725
+
726
+ # Launch, train, download, and terminate in one command
727
+ uv run python -m openadapt_ml.cloud.lambda_labs train \
728
+ --capture ~/captures/turn-off-nightshift \
729
+ --goal "Turn off Night Shift in System Settings"
730
+ ```
731
+
732
+ ### 12.2 Manual workflow
733
+
734
+ ```bash
735
+ # List available instances and pricing
736
+ uv run python -m openadapt_ml.cloud.lambda_labs list
737
+
738
+ # Launch an A10 instance (~$0.75/hr)
739
+ uv run python -m openadapt_ml.cloud.lambda_labs launch --type gpu_1x_a10
740
+
741
+ # Check training status
742
+ uv run python -m openadapt_ml.cloud.lambda_labs train-status
743
+
744
+ # Check training health (loss progression, early stopping analysis)
745
+ uv run python -m openadapt_ml.cloud.lambda_labs check <instance_id>
746
+
747
+ # Download checkpoints and comparison results
748
+ uv run python -m openadapt_ml.cloud.lambda_labs download <instance_id>
749
+
750
+ # IMPORTANT: Terminate when done (billed by the hour!)
751
+ uv run python -m openadapt_ml.cloud.lambda_labs terminate <instance_id>
752
+ ```
753
+
754
+ ### 12.3 Training visualization
755
+
756
+ The training process generates:
757
+ - **`training_output/dashboard.html`** - Real-time training dashboard with loss curves
758
+ - **`training_output/viewer.html`** - Unified viewer for comparing human vs model predictions
759
+
760
+ Use the navigation tabs to switch between Training and Viewer.
761
+
762
+ **To serve the dashboard:**
763
+ ```bash
764
+ uv run python -m openadapt_ml.cloud.local serve --port 8080 --open
765
+ ```
766
+
767
+ **Training Dashboard:**
768
+
769
+ ![Training Dashboard - Top](docs/images/dashboard/training_top.png)
770
+
771
+ *Shows training progress, loss curves, stats (current loss, min loss, avg step time), and ETA.*
772
+
773
+ ![Training Dashboard - Bottom](docs/images/dashboard/training_bottom.png)
774
+
775
+ *Training configuration and evaluation samples with visual overlays showing human (green) vs predicted (purple) click positions.*
776
+
777
+ **Comparison Viewer:**
778
+
779
+ ![Viewer - Top](docs/images/dashboard/viewer_top.png)
780
+
781
+ *Compare human actions vs model predictions frame-by-frame. Shows action type, model reasoning output, and match/mismatch status.*
782
+
783
+ ![Viewer - Bottom](docs/images/dashboard/viewer_bottom.png)
784
+
785
+ *Event timeline, event details, transcript, and video playback controls.*
786
+
787
+ **Keyboard shortcuts (Viewer):**
788
+ - `Space` - Play/pause
789
+ - `←` / `→` - Previous/next frame
790
+ - `Home` / `End` - First/last frame
791
+ - `O` - Toggle click overlay
792
+
793
+ ---
794
+
795
+ ## 13. Limitations & Notes
796
+
797
+ - **Apple Silicon / bitsandbytes**:
798
+ - Example configs are sized for CPU / Apple Silicon development runs; see
799
+ `docs/design.md` §9.4 for details on QLoRA and platform-specific
800
+ considerations.
801
+ - **Batching**:
802
+ - For v1, `QwenVLAdapter` is implemented assuming `batch_size=1` for
803
+ simplicity when handling multimodal inputs. The training configs are
804
+ sized accordingly.
805
+ - **Evaluation**:
806
+ - v1 focuses on smoke tests and qualitative behavior on synthetic data.
807
+ More formal evaluation scripts and metrics are planned.
808
+
809
+ For deeper architectural details, see [`docs/design.md`](docs/design.md).
810
+
811
+ ---
812
+
813
+ ## 14. Roadmap
814
+
815
+ For the up-to-date, prioritized roadmap (including concrete implementation
816
+ targets and agent-executable acceptance criteria), see
817
+ [`docs/roadmap.md`](docs/roadmap.md).
818
+