openadapt-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/__init__.py +0 -0
- openadapt_ml/benchmarks/__init__.py +125 -0
- openadapt_ml/benchmarks/agent.py +825 -0
- openadapt_ml/benchmarks/azure.py +761 -0
- openadapt_ml/benchmarks/base.py +366 -0
- openadapt_ml/benchmarks/cli.py +884 -0
- openadapt_ml/benchmarks/data_collection.py +432 -0
- openadapt_ml/benchmarks/runner.py +381 -0
- openadapt_ml/benchmarks/waa.py +704 -0
- openadapt_ml/cloud/__init__.py +5 -0
- openadapt_ml/cloud/azure_inference.py +441 -0
- openadapt_ml/cloud/lambda_labs.py +2445 -0
- openadapt_ml/cloud/local.py +790 -0
- openadapt_ml/config.py +56 -0
- openadapt_ml/datasets/__init__.py +0 -0
- openadapt_ml/datasets/next_action.py +507 -0
- openadapt_ml/evals/__init__.py +23 -0
- openadapt_ml/evals/grounding.py +241 -0
- openadapt_ml/evals/plot_eval_metrics.py +174 -0
- openadapt_ml/evals/trajectory_matching.py +486 -0
- openadapt_ml/grounding/__init__.py +45 -0
- openadapt_ml/grounding/base.py +236 -0
- openadapt_ml/grounding/detector.py +570 -0
- openadapt_ml/ingest/__init__.py +43 -0
- openadapt_ml/ingest/capture.py +312 -0
- openadapt_ml/ingest/loader.py +232 -0
- openadapt_ml/ingest/synthetic.py +1102 -0
- openadapt_ml/models/__init__.py +0 -0
- openadapt_ml/models/api_adapter.py +171 -0
- openadapt_ml/models/base_adapter.py +59 -0
- openadapt_ml/models/dummy_adapter.py +42 -0
- openadapt_ml/models/qwen_vl.py +426 -0
- openadapt_ml/runtime/__init__.py +0 -0
- openadapt_ml/runtime/policy.py +182 -0
- openadapt_ml/schemas/__init__.py +53 -0
- openadapt_ml/schemas/sessions.py +122 -0
- openadapt_ml/schemas/validation.py +252 -0
- openadapt_ml/scripts/__init__.py +0 -0
- openadapt_ml/scripts/compare.py +1490 -0
- openadapt_ml/scripts/demo_policy.py +62 -0
- openadapt_ml/scripts/eval_policy.py +287 -0
- openadapt_ml/scripts/make_gif.py +153 -0
- openadapt_ml/scripts/prepare_synthetic.py +43 -0
- openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
- openadapt_ml/scripts/train.py +174 -0
- openadapt_ml/training/__init__.py +0 -0
- openadapt_ml/training/benchmark_viewer.py +1538 -0
- openadapt_ml/training/shared_ui.py +157 -0
- openadapt_ml/training/stub_provider.py +276 -0
- openadapt_ml/training/trainer.py +2446 -0
- openadapt_ml/training/viewer.py +2970 -0
- openadapt_ml-0.1.0.dist-info/METADATA +818 -0
- openadapt_ml-0.1.0.dist-info/RECORD +55 -0
- openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
- openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,818 @@
Metadata-Version: 2.4
Name: openadapt-ml
Version: 0.1.0
Summary: Model-agnostic, domain-agnostic ML engine for GUI automation agents
Project-URL: Homepage, https://github.com/OpenAdaptAI/openadapt-ml
Project-URL: Repository, https://github.com/OpenAdaptAI/openadapt-ml
Project-URL: Documentation, https://github.com/OpenAdaptAI/openadapt-ml/tree/main/docs
Author-email: "MLDSAI Inc." <richard@mldsai.com>
License-Expression: MIT
License-File: LICENSE
Keywords: agents,automation,fine-tuning,gui,ml,vision-language-models,vlm
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.12
Requires-Dist: azure-ai-ml>=1.30.0
Requires-Dist: azure-identity>=1.25.1
Requires-Dist: bitsandbytes>=0.41.0
Requires-Dist: google-generativeai>=0.8.5
Requires-Dist: matplotlib>=3.10.7
Requires-Dist: openadapt-capture>=0.1.0
Requires-Dist: peft>=0.18.0
Requires-Dist: pillow>=12.0.0
Requires-Dist: pydantic-settings>=2.0.0
Requires-Dist: pytest>=9.0.2
Requires-Dist: pyyaml>=6.0.3
Requires-Dist: torch>=2.9.1
Requires-Dist: torchvision>=0.24.1
Requires-Dist: transformers>=4.57.3
Provides-Extra: api
Requires-Dist: anthropic>=0.40.0; extra == 'api'
Requires-Dist: openai>=1.0.0; extra == 'api'
Requires-Dist: pydantic-settings>=2.0.0; extra == 'api'
Provides-Extra: azure
Requires-Dist: azure-ai-ml>=1.0.0; extra == 'azure'
Requires-Dist: azure-identity>=1.0.0; extra == 'azure'
Provides-Extra: dev
Requires-Dist: pytest>=9.0.0; extra == 'dev'
Requires-Dist: ruff>=0.1.0; extra == 'dev'
Provides-Extra: lambda-labs
Requires-Dist: requests>=2.28.0; extra == 'lambda-labs'
Description-Content-Type: text/markdown

# OpenAdapt-ML

[License: MIT](https://opensource.org/licenses/MIT)
[Python 3.12+](https://www.python.org/)

OpenAdapt-ML is a **model-agnostic, domain-agnostic ML engine** for GUI
automation agents.

It provides:

- **Schemas** for GUI interaction trajectories (screens + actions + goals).
- **Synthetic semantic UI generation** for bootstrapping datasets.
- **Dataset builders** that turn episodes into next-action SFT samples.
- **VLM adapters** (Qwen3-VL, Qwen2.5-VL) using Hugging Face + PEFT.
- A minimal **supervised training loop** for fine-tuning.
- A simple **runtime policy** API that predicts the next GUI action.

The design is described in detail in [`docs/design.md`](docs/design.md).

---

## 1. Quickstart

### 1.1 Install dependencies

From the repository root:

```bash
uv sync
```

### 1.2 Run a small demo policy

Run a fast, model-free smoke test:

```bash
uv run python -m openadapt_ml.scripts.demo_policy --backend dummy
```

### 1.3 Run the synthetic login benchmark (end-to-end)

On a machine with a suitable GPU, you can reproduce the Qwen3-VL synthetic
login benchmark (train → eval base/FT → plot) with a single command:

```bash
uv run python -m openadapt_ml.scripts.run_qwen_login_benchmark \
    --config configs/qwen3vl_synthetic_dev.yaml \
    --out-dir experiments/qwen_login/2b_dev
```

This default invocation will:

- Train a LoRA adapter on the hardened synthetic login scenario.
- Evaluate both the **base** and **fine-tuned** Qwen3-VL models on fresh
  synthetic episodes.
- Write eval JSONs and a comparison plot under
  `experiments/qwen_login/2b_dev/`.

The `qwen3vl_synthetic_dev` config is sized for small development runs on Apple
Silicon / CPU, but will also run on CUDA GPUs.

To additionally compare against hosted API backends (Claude Sonnet 4.5 and
OpenAI GPT-5.1), first install the optional `api` extra and configure your API
keys:

```bash
uv sync --extra api

# Option 1: Use .env file (recommended)
cp .env.example .env
# Edit .env with your API keys

# Option 2: Export environment variables (for CI/containers)
export ANTHROPIC_API_KEY=...  # for Claude Sonnet 4.5
export OPENAI_API_KEY=...     # for GPT-5.1
```

Then run:

```bash
uv run python -m openadapt_ml.scripts.run_qwen_login_benchmark \
    --config configs/qwen3vl_synthetic_dev.yaml \
    --out-dir experiments/qwen_login/2b_dev \
    --include-all-apis
```

This will evaluate and plot **Qwen3 base**, **Qwen3 FT**, **Claude Sonnet 4.5**,
and **GPT-5.1** on the same synthetic login benchmark.

For complete documentation including training setup, evaluation metrics, SoM mode results, and reproduction instructions, see **[`docs/qwen_login_experiment.md`](docs/qwen_login_experiment.md)**. For implementation details and technical notes, see `docs/state_and_next_steps_qwen_login.md`.

---

## 2. Repository Structure

Key modules:

- `openadapt_ml/schemas/`
  - Canonical dataclasses for `Session`, `Episode`, `Step`, `Observation`,
    `Action`.
- `openadapt_ml/ingest/synthetic.py`
  - Synthetic semantic UI generator (e.g. login screen) that produces PNG
    screenshots and scripted episodes.
- `openadapt_ml/datasets/next_action.py`
  - Converts episodes into goal-conditioned, chat-style next-action SFT
    samples suitable for VLM fine-tuning.
- `openadapt_ml/models/base_adapter.py`
  - `BaseVLMAdapter` abstraction shared by all VLM backends.
- `openadapt_ml/models/qwen_vl.py`
  - `QwenVLAdapter` implementing support for **Qwen3-VL** and
    **Qwen2.5-VL**.
- `openadapt_ml/models/dummy_adapter.py`
  - Tiny fake adapter used to validate training and runtime flows without
    loading a real VLM.
- `openadapt_ml/training/trainer.py`
  - Minimal supervised training loop (`train_supervised`) with gradient
    accumulation and logging.
- `openadapt_ml/runtime/policy.py`
  - `AgentPolicy` that formats inputs for a VLM and parses textual actions
    like `CLICK(x=..., y=...)` and `DONE()` into structured `Action`s.
- `openadapt_ml/scripts/train.py`
  - CLI entry point for running synthetic-data training with a chosen
    model/config.
- `openadapt_ml/scripts/demo_policy.py`
  - CLI demo showing how to use `AgentPolicy` with different backends
    (dummy, Qwen3-VL, Qwen2.5-VL).

Configs and docs:

- `configs/qwen3vl_synthetic.yaml`
  - Synthetic training config for **Qwen3-VL-8B-Instruct**.
- `configs/qwen2_5vl_synthetic.yaml`
  - Synthetic training config for **Qwen2.5-VL-7B-Instruct**.
- `docs/design.md`
  - High-level design document (scope, architecture, schemas, adapters,
    training, runtime, and evaluation strategy).

---

## 3. Environment Setup

OpenAdapt-ML targets **Python 3.12** and uses [`uv`](https://github.com/astral-sh/uv)
for dependency management.

### 3.1 Install and sync

From the repository root:

```bash
# Ensure uv is installed (see uv docs for platform-specific install)
# Then:
uv sync
```

This will create a virtual environment (e.g. `.venv/`) and install all
packages declared in `pyproject.toml`.

### 3.2 Working inside the environment

Use `uv run` to execute Python modules and scripts with the synced
environment:

```bash
uv run python -m openadapt_ml.scripts.train --help
```

You can also run `pytest` or other tools via `uv run`.

---

## 4. Synthetic Data & Datasets

The v1 pipeline is validated on **synthetic, semantic UIs**, starting with a
simple login flow.

### 4.1 Synthetic scenarios

OpenAdapt-ML includes synthetic UI generators for structured GUI automation benchmarks.
Currently two scenarios are supported:

#### Login Scenario (6 steps, 3 elements)

A simple login form with username, password, and login button.

*[Login scenario screenshot]*

**Login SoM Evaluation Results:**

| Metric | Qwen3-VL-2B FT |
|--------|----------------|
| Action Type Accuracy | **100%** |
| Element Accuracy | **100%** |
| Episode Success Rate | **100%** |
| Episodes / Steps | 32 / 192 |

#### Registration Scenario (12 steps, 6 elements)

A more complex registration form with first name, last name, email, password, confirm password, and register button.

*[Registration scenario screenshot]*

**Registration SoM Evaluation Results:**

| Metric | Qwen3-VL-2B FT |
|--------|----------------|
| Action Type Accuracy | **100%** |
| Element Accuracy | **100%** |
| Episode Success Rate | **100%** |
| Episodes / Steps | 32 / 384 |

### 4.2 Generating synthetic data

Synthetic data is generated on the fly by `generate_synthetic_sessions` in
`openadapt_ml/ingest/synthetic.py` and used internally by the training
scripts.

You can also call it directly from Python:

```python
from openadapt_ml.ingest.synthetic import generate_synthetic_sessions

# Login scenario (default)
sessions = generate_synthetic_sessions(num_sessions=2, seed=123, output_dir="synthetic_login")

# Registration scenario
sessions = generate_synthetic_sessions(
    num_sessions=2,
    seed=123,
    output_dir="synthetic_registration",
    scenario="registration",  # "login" or "registration"
    use_som=True,             # Enable Set-of-Marks visual overlays
)
```

Each session contains episodes with:

- A **goal** (e.g. "Log in as demo user").
- A sequence of **steps**, each with:
  - An observation (screenshot path).
  - An action (e.g. `CLICK`, `TYPE`, `DONE`).

### 4.3 Next-action SFT samples

Episodes are converted into SFT-style samples by
`build_next_action_sft_samples` in `openadapt_ml/datasets/next_action.py`.

Each sample has the form:

```python
{
    "images": ["/path/to/screenshot.png"],
    "messages": [
        {"role": "system", "content": ...},
        {"role": "user", "content": "Goal: ...\nCurrent screen: ..."},
        {"role": "assistant", "content": "CLICK(x=..., y=...)"},
    ],
}
```

These samples are wrapped in a simple `NextActionDataset` for use with the
training loop.

For the full, canonical definition of the action DSL (CLICK/TYPE/WAIT/DONE)
and its invariants, see `docs/design.md` §7.4.

---

## 5. Training

Training is driven by `openadapt_ml/scripts/train.py` and YAML configs under
`configs/`.

The training script:

1. Loads a config file (YAML).
2. Generates synthetic sessions.
3. Flattens to episodes and builds SFT samples.
4. Wraps them in a `NextActionDataset`.
5. Instantiates a VLM adapter (e.g. `QwenVLAdapter`).
6. Runs `train_supervised` over the dataset.
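
In code, the same pipeline looks roughly like the sketch below. This is a
minimal smoke-run variant using `DummyAdapter`; the `.episodes` attribute,
the helper signatures, and the `train_supervised` keyword arguments are
assumptions here, and the real script reads its values from the YAML config.

```python
from openadapt_ml.datasets.next_action import (
    NextActionDataset,
    build_next_action_sft_samples,
)
from openadapt_ml.ingest.synthetic import generate_synthetic_sessions
from openadapt_ml.models.dummy_adapter import DummyAdapter
from openadapt_ml.training.trainer import train_supervised

# Steps 2-3: generate synthetic sessions and flatten them into episodes.
sessions = generate_synthetic_sessions(num_sessions=2, seed=0, output_dir="synthetic_login")
episodes = [episode for session in sessions for episode in session.episodes]

# Steps 3-4: build next-action SFT samples and wrap them in a dataset.
samples = build_next_action_sft_samples(episodes)
dataset = NextActionDataset(samples)

# Steps 5-6: instantiate an adapter and run the supervised loop.
# Swap in QwenVLAdapter for real fine-tuning; the kwargs are illustrative.
adapter = DummyAdapter()
train_supervised(adapter, dataset, num_epochs=1)
```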

### 5.1 Qwen3-VL synthetic training

Config: `configs/qwen3vl_synthetic.yaml`

Key fields:

```yaml
model:
  name: Qwen/Qwen3-VL-8B-Instruct
  load_in_4bit: false  # 4-bit quantization is disabled on macOS / Apple Silicon

# LoRA config and training hyperparameters are also defined in the YAML.
```

Run:

```bash
uv run python -m openadapt_ml.scripts.train --config configs/qwen3vl_synthetic.yaml
```

This will:

- Download and load `Qwen/Qwen3-VL-8B-Instruct`.
- Generate a small synthetic dataset.
- Run a single-epoch supervised fine-tuning loop.
- Print loss values as training progresses.

### 5.2 Qwen2.5-VL synthetic training

Config: `configs/qwen2_5vl_synthetic.yaml`

Key fields:

```yaml
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  load_in_4bit: false
```

Run:

```bash
uv run python -m openadapt_ml.scripts.train --config configs/qwen2_5vl_synthetic.yaml
```

This exercises the **Qwen2.5-VL** path in `QwenVLAdapter`, using a
`process_vision_info`-style helper internally to pack image inputs in the
format expected by the Qwen2.5-VL processor.

> Note: Both configs are sized for **small synthetic smoke runs**, not
> large-scale production training.

### 5.3 Qwen3-VL synthetic login benchmark (hero example)

OpenAdapt-ML ships a **synthetic login** benchmark backed by Qwen3-VL,
used to compare **base vs LoRA-fine-tuned** models on a hardened synthetic
environment (layout jitter + a decoy "Help" button).

FT = **LoRA fine-tuned Qwen3-VL** on synthetic login.
Base = **frozen pretrained Qwen3-VL**.

**Comprehensive Model Comparison (Login - 6 steps):**

*[Model comparison plot]*

The plot compares all six evaluated models across four key metrics (action type accuracy,
coordinate error, click hit rate, and episode success rate). The legend shows color coding
for model types (Qwen 2B/8B vs API models) and hatching patterns for fine-tuned vs base models.
It exposes step-level performance metrics, letting us visually answer the question: "Can a fine-tuned small local model outperform large API models?"

**Comprehensive Results** (all models on hardened synthetic login):

| Model | Type | Action Accuracy | Coord Error | Click Hit Rate |
|---------------------|--------------|-----------------|-------------|----------------|
| Qwen3-VL-2B base | Offline | 0.143 | N/A | N/A |
| **Qwen3-VL-2B FT** | **Offline** | **0.469** | **0.051** | **0.850** |
| Qwen3-VL-8B base | Offline | 0.143 | N/A | N/A |
| **Qwen3-VL-8B FT** | **Offline** | **0.286** | **0.004** | **1.000** |
| Claude Sonnet 4.5 | API | 0.121 | 0.757 | 0.000 |
| GPT-5.1 | API | 0.183 | 0.057 | 0.600 |

**Key findings:**
1. **Fine-tuning delivers massive gains**: Both 2B and 8B models show 2-3x improvement in action accuracy after fine-tuning
2. **Small fine-tuned models beat large APIs**: Qwen3-VL-2B FT (0.469 action accuracy) outperforms both Claude Sonnet 4.5 (0.121) and GPT-5.1 (0.183)
3. **Precision matters**: Fine-tuned models have excellent click precision (85-100% hit rate, <0.05 coord error) while API models struggle with the action format
4. **Size vs specialization**: The fine-tuned 2B model outperforms the general-purpose Claude Sonnet 4.5, showing that domain-specific fine-tuning trumps raw model size

### 5.4 Set-of-Marks (SoM) Mode: 100% Accuracy

With **Set-of-Marks** visual prompting, fine-tuned Qwen3-VL-2B achieves **100% accuracy** on both login (6-step) and registration (12-step) scenarios:

| Scenario | Steps | Elements | Action Acc | Element Acc | Episode Success |
|----------|-------|----------|------------|-------------|-----------------|
| Login | 6 | 3 | **100%** | **100%** | **100%** |
| Registration | 12 | 6 | **100%** | **100%** | **100%** |

**Cost/Latency Comparison (SoM mode):**

| Approach | Login Accuracy | Registration Accuracy | Cost | Latency |
|----------|----------------|----------------------|------|---------|
| Claude API + SoM | 100% | 100%* | ~$0.01/step | ~500ms |
| GPT-4.1 API + SoM | 100% | 100%* | ~$0.01/step | ~500ms |
| **Qwen 2B + SoM** | **100%** | **100%** | **Free (local)** | **~50ms** |

\*API results on registration pending evaluation

**How SoM works:** Instead of predicting precise coordinates (`CLICK(x=0.42, y=0.31)`), the model selects numbered UI elements (`CLICK([1])`). This reduces spatial reasoning to element selection, which small models handle well.

To use SoM mode:

```bash
# Training with SoM
uv run python -m openadapt_ml.scripts.train --config configs/qwen3vl_synthetic_som.yaml

# Evaluation with SoM
uv run python -m openadapt_ml.scripts.eval_policy \
    --config configs/qwen3vl_synthetic_som.yaml \
    --backend qwen3 \
    --dsl-mode som \
    --overfit  # Check memorization
```

For the full SoM investigation report, see [`experiments/qwen_login/SOM_INVESTIGATION_REPORT.md`](experiments/qwen_login/SOM_INVESTIGATION_REPORT.md).

---

## 6. Grounding Module

OpenAdapt-ML includes a **grounding module** for locating UI elements on screenshots using natural language descriptions. This enables policy/grounding separation where the policy decides *what* to do and the grounder finds *where* to do it.

### 6.1 GeminiGrounder Demo

The `GeminiGrounder` uses Google's Gemini vision API to locate UI elements:

*[GeminiGrounder demo screenshot]*

*Calculator button "2" located by GeminiGrounder with 99% confidence*

```python
from openadapt_ml.grounding import GeminiGrounder

grounder = GeminiGrounder()  # Uses GOOGLE_API_KEY from .env
candidates = grounder.ground(screenshot, "the login button", k=3)

if candidates:
    best = candidates[0]
    print(f"Found at {best.centroid} with {best.confidence:.0%} confidence")
```

### 6.2 Set-of-Marks (SoM) Support

The grounding module includes functions for extracting all UI elements and overlaying numbered labels (Set-of-Marks):

```python
from openadapt_ml.grounding import extract_ui_elements, overlay_element_marks

# Extract all interactive elements
elements = extract_ui_elements(screenshot)
# Returns: [{"id": 1, "label": "Login button", "bbox": [x1,y1,x2,y2], ...}, ...]

# Overlay numbered labels on screenshot
marked_screenshot = overlay_element_marks(screenshot, elements, style="compact")
marked_screenshot.save("screenshot_with_marks.png")
```

This enables element-based actions using indices instead of coordinates:
- Old: `CLICK(x=0.487, y=0.328)` - coordinate-based, brittle
- New: `CLICK([1])` - element-based, robust
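
As an illustration of how an element-indexed action can be resolved back to a
click point, here is a small helper. It is not an openadapt-ml API; it only
assumes the element dicts follow the `extract_ui_elements` shape shown above.

```python
import re


def resolve_som_click(action_text: str, elements: list[dict]) -> tuple[float, float]:
    """Map an element-indexed action like 'CLICK([1])' to the centre of that element's bbox."""
    match = re.fullmatch(r"CLICK\(\[(\d+)\]\)", action_text.strip())
    if not match:
        raise ValueError(f"Not an element-indexed click: {action_text!r}")
    element_id = int(match.group(1))
    element = next(e for e in elements if e["id"] == element_id)
    x1, y1, x2, y2 = element["bbox"]
    return (x1 + x2) / 2, (y1 + y2) / 2


elements = [{"id": 1, "label": "Login button", "bbox": [400, 300, 520, 340]}]
print(resolve_som_click("CLICK([1])", elements))  # (460.0, 320.0)
```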

See `docs/gemini_grounding.md` for full documentation and `examples/test_gemini_grounding.py` for a complete example.

### 6.3 Available Grounders

| Grounder | Description | Latency | Use Case |
|----------|-------------|---------|----------|
| `GeminiGrounder` | Google Gemini vision API | ~3s | Real UIs, zero-shot |
| `OracleGrounder` | Ground-truth bboxes | ~0ms | Evaluation |
| `DetectorGrounder` | Generic wrapper with backend selection | varies | Flexible |

### 6.4 Grounding Evaluation

The `openadapt_ml.evals.grounding` module provides metrics for evaluating grounding accuracy:

```python
from openadapt_ml.evals import GroundingMetrics, evaluate_grounder

metrics = evaluate_grounder(grounder, test_cases, k=5)
print(metrics)
# Grounding Metrics (n=10):
# Mean IoU: 0.720
# Centroid Hit Rate: 0.900
# Oracle Hit @1: 0.800
# Mean Latency: 3150ms
```

---

## 7. VLM Adapters

All VLM backends implement the shared `BaseVLMAdapter` interface in
`openadapt_ml/models/base_adapter.py` (prepare inputs, compute loss, generate
text from a sample).
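
At runtime the contract is simply: an SFT-style sample goes in, assistant text
in the action DSL comes out. A minimal sketch using the dummy backend (the
no-argument constructor is an assumption; see `dummy_adapter.py`):

```python
from openadapt_ml.models.dummy_adapter import DummyAdapter

adapter = DummyAdapter()

sample = {
    "images": ["screenshot.png"],
    "messages": [
        {"role": "system", "content": "You are a GUI automation agent."},
        {"role": "user", "content": "Goal: Log in as demo user\nCurrent screen: login form"},
    ],
}

# Every adapter implements `generate(sample)`; training-capable adapters also
# prepare model inputs and compute the SFT loss.
print(adapter.generate(sample))  # e.g. "CLICK(x=0.45, y=0.71)" or "DONE()"
```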

Current adapters include:

- `QwenVLAdapter` (`openadapt_ml/models/qwen_vl.py`) for Qwen3-VL and
  Qwen2.5-VL.
- `DummyAdapter` (`openadapt_ml/models/dummy_adapter.py`) for fast smoke
  tests without loading a real VLM.
- `ApiVLMAdapter` (`openadapt_ml/models/api_adapter.py`) for hosted VLM
  APIs (Anthropic Claude Sonnet 4.5 and OpenAI GPT-5.1). This adapter is
  inference-only and implements `generate` using the respective SDKs.

For full adapter internals and training-time vs runtime behavior, see
`docs/design.md` §8.

### 7.1 API-backed adapters

To use the API-backed adapter from Python, you can configure API keys via a
`.env` file or environment variables, or pass them explicitly:

```python
from openadapt_ml.models.api_adapter import ApiVLMAdapter

# Use .env file or environment variables (ANTHROPIC_API_KEY / OPENAI_API_KEY)
claude_adapter = ApiVLMAdapter(provider="anthropic")
gpt_adapter = ApiVLMAdapter(provider="openai")

# Or pass API keys explicitly from your application's config
claude_adapter = ApiVLMAdapter(provider="anthropic", api_key="...")
gpt_adapter = ApiVLMAdapter(provider="openai", api_key="...")
```

The existing CLI scripts `scripts/demo_policy.py` and
`scripts/eval_policy.py` expose these backends via the `--backend` flag
(`claude` / `openai`).

---

## 8. Runtime Policy & Demos

The runtime policy is implemented in `openadapt_ml/runtime/policy.py` as
`AgentPolicy`.

### 8.1 AgentPolicy

`AgentPolicy` is initialized with a VLM adapter (dummy or real). Given an
SFT-style sample, it:

1. Calls `adapter.generate(sample)` to obtain assistant text.
2. Parses actions from strings like:
   - `CLICK(x=0.45, y=0.71)`
   - `DONE()`
3. Returns a structured `Action` plus an optional free-form `thought`.
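
A minimal usage sketch, mirroring what the demo script does. The `.episodes`
attribute, the exact helper signatures, and the `predict` method name and
return shape are assumptions; see `demo_policy.py` for the actual flow.

```python
from openadapt_ml.datasets.next_action import build_next_action_sft_samples
from openadapt_ml.ingest.synthetic import generate_synthetic_sessions
from openadapt_ml.models.dummy_adapter import DummyAdapter
from openadapt_ml.runtime.policy import AgentPolicy

# Build one SFT-style sample from a synthetic login episode.
sessions = generate_synthetic_sessions(num_sessions=1, seed=0, output_dir="demo_policy_data")
episodes = [episode for session in sessions for episode in session.episodes]
sample = build_next_action_sft_samples(episodes)[0]

# Wrap an adapter (dummy here; Qwen3-VL / Qwen2.5-VL in practice) in the policy.
policy = AgentPolicy(DummyAdapter())

# The policy generates text via the adapter and parses it into a structured Action.
action, thought = policy.predict(sample)  # method name / return shape assumed
print(action, thought)
```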

### 8.2 Demo script

`openadapt_ml/scripts/demo_policy.py` demonstrates how to use
`AgentPolicy` with different backends.

Run with a **dummy** backend (fast, no model load):

```bash
uv run python -m openadapt_ml.scripts.demo_policy --backend dummy
```

Run with the **Qwen3-VL** backend:

```bash
uv run python -m openadapt_ml.scripts.demo_policy --backend qwen3
```

Run with the **Qwen2.5-VL** backend:

```bash
uv run python -m openadapt_ml.scripts.demo_policy --backend qwen2_5
```

Each invocation will:

- Generate a synthetic login episode and select one step.
- Build an SFT-style sample from that step.
- Use `AgentPolicy` to predict the next action.
- Print the raw messages and the parsed action/thought.

---

## 9. Testing

Basic tests are provided under `tests/`.

Run the test suite with:

```bash
uv run pytest
```

In particular:

- `tests/test_training_dummy.py` runs a smoke test over the training loop
  using `DummyAdapter`.

---

## 10. Training on Real Data

OpenAdapt-ML supports training on real GUI recordings from two sources:
1. **openadapt-capture** - New lightweight recording format
2. **OpenAdapt database** - Original OpenAdapt recordings (legacy)

### 10.1 Training on openadapt-capture recordings

[openadapt-capture](https://github.com/OpenAdaptAI/openadapt-capture) is a lightweight GUI recording tool.

```bash
# Install openadapt-capture
uv pip install openadapt-capture

# Record a workflow (e.g., turning off Night Shift)
openadapt-capture record --output ~/captures/turn-off-nightshift

# Train on the capture
uv run python -m openadapt_ml.scripts.train \
    --config configs/qwen3vl_capture.yaml \
    --capture ~/captures/turn-off-nightshift \
    --open  # Opens training dashboard in browser
```

The goal is automatically derived from the directory name (e.g., `"Turn off nightshift"`).
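
A sketch of that derivation (illustrative only; the real implementation lives
in the training tooling and may differ):

```python
from pathlib import Path


def derive_goal(capture_dir: str) -> str:
    # "~/captures/turn-off-nightshift" -> "Turn off nightshift"
    name = Path(capture_dir).expanduser().name
    return name.replace("-", " ").capitalize()


print(derive_goal("~/captures/turn-off-nightshift"))  # Turn off nightshift
```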

### 10.2 Compare human vs AI predictions

```bash
uv run python -m openadapt_ml.scripts.compare \
    --capture ~/captures/turn-off-nightshift \
    --checkpoint checkpoints/qwen3vl2b_capture_lora \
    --open  # Opens comparison viewer
```

The comparison viewer shows:
- Side-by-side human actions vs model predictions
- Click position overlays on screenshots
- Accuracy metrics and distance calculations
- Navigation between training dashboard and comparison viewer

---

## 11. Local Training (CUDA / Apple Silicon)

Train locally on your own GPU. Auto-detects CUDA or Apple Silicon (MPS).

### 11.1 Quick start

```bash
# Train on a capture (auto-detects device and config)
uv run python -m openadapt_ml.cloud.local train \
    --capture ~/captures/turn-off-nightshift \
    --open  # Opens dashboard in browser
```

### 11.2 Training workflow

```bash
# Check device and training status
uv run python -m openadapt_ml.cloud.local status

# Train on a capture
uv run python -m openadapt_ml.cloud.local train --capture ~/captures/my-workflow --open

# Check training health (loss progression, convergence)
uv run python -m openadapt_ml.cloud.local check

# Start dashboard server
uv run python -m openadapt_ml.cloud.local serve --open

# Regenerate viewer
uv run python -m openadapt_ml.cloud.local viewer --open

# Run human vs AI comparison
uv run python -m openadapt_ml.cloud.local compare \
    --capture ~/captures/my-workflow \
    --checkpoint checkpoints/qwen3vl2b_capture_lora \
    --open
```

---

## 12. Cloud GPU Training (Lambda Labs)

For faster training on powerful GPUs, use Lambda Labs. Full documentation: [`docs/cloud_gpu_training.md`](docs/cloud_gpu_training.md).

### 12.1 Quick start

```bash
# Set API key
export LAMBDA_API_KEY=your_key_here

# Launch, train, download, and terminate in one command
uv run python -m openadapt_ml.cloud.lambda_labs train \
    --capture ~/captures/turn-off-nightshift \
    --goal "Turn off Night Shift in System Settings"
```

### 12.2 Manual workflow

```bash
# List available instances and pricing
uv run python -m openadapt_ml.cloud.lambda_labs list

# Launch an A10 instance (~$0.75/hr)
uv run python -m openadapt_ml.cloud.lambda_labs launch --type gpu_1x_a10

# Check training status
uv run python -m openadapt_ml.cloud.lambda_labs train-status

# Check training health (loss progression, early stopping analysis)
uv run python -m openadapt_ml.cloud.lambda_labs check <instance_id>

# Download checkpoints and comparison results
uv run python -m openadapt_ml.cloud.lambda_labs download <instance_id>

# IMPORTANT: Terminate when done (billed by the hour!)
uv run python -m openadapt_ml.cloud.lambda_labs terminate <instance_id>
```

### 12.3 Training visualization

The training process generates:
- **`training_output/dashboard.html`** - Real-time training dashboard with loss curves
- **`training_output/viewer.html`** - Unified viewer for comparing human vs model predictions

Use the navigation tabs to switch between Training and Viewer.

**To serve the dashboard:**
```bash
uv run python -m openadapt_ml.cloud.local serve --port 8080 --open
```

**Training Dashboard:**

*[Training dashboard screenshot: progress and loss curves]*

*Shows training progress, loss curves, stats (current loss, min loss, avg step time), and ETA.*

*[Training dashboard screenshot: configuration and evaluation samples]*

*Training configuration and evaluation samples with visual overlays showing human (green) vs predicted (purple) click positions.*

**Comparison Viewer:**

*[Comparison viewer screenshot: frame-by-frame comparison]*

*Compare human actions vs model predictions frame-by-frame. Shows action type, model reasoning output, and match/mismatch status.*

*[Comparison viewer screenshot: timeline and playback]*

*Event timeline, event details, transcript, and video playback controls.*

**Keyboard shortcuts (Viewer):**
- `Space` - Play/pause
- `←` / `→` - Previous/next frame
- `Home` / `End` - First/last frame
- `O` - Toggle click overlay

---

## 13. Limitations & Notes

- **Apple Silicon / bitsandbytes**:
  - Example configs are sized for CPU / Apple Silicon development runs; see
    `docs/design.md` §9.4 for details on QLoRA and platform-specific
    considerations.
- **Batching**:
  - For v1, `QwenVLAdapter` is implemented assuming `batch_size=1` for
    simplicity when handling multimodal inputs. The training configs are
    sized accordingly.
- **Evaluation**:
  - v1 focuses on smoke tests and qualitative behavior on synthetic data.
    More formal evaluation scripts and metrics are planned.

For deeper architectural details, see [`docs/design.md`](docs/design.md).

---

## 14. Roadmap

For the up-to-date, prioritized roadmap (including concrete implementation
targets and agent-executable acceptance criteria), see
[`docs/roadmap.md`](docs/roadmap.md).