openadapt-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/__init__.py +0 -0
- openadapt_ml/benchmarks/__init__.py +125 -0
- openadapt_ml/benchmarks/agent.py +825 -0
- openadapt_ml/benchmarks/azure.py +761 -0
- openadapt_ml/benchmarks/base.py +366 -0
- openadapt_ml/benchmarks/cli.py +884 -0
- openadapt_ml/benchmarks/data_collection.py +432 -0
- openadapt_ml/benchmarks/runner.py +381 -0
- openadapt_ml/benchmarks/waa.py +704 -0
- openadapt_ml/cloud/__init__.py +5 -0
- openadapt_ml/cloud/azure_inference.py +441 -0
- openadapt_ml/cloud/lambda_labs.py +2445 -0
- openadapt_ml/cloud/local.py +790 -0
- openadapt_ml/config.py +56 -0
- openadapt_ml/datasets/__init__.py +0 -0
- openadapt_ml/datasets/next_action.py +507 -0
- openadapt_ml/evals/__init__.py +23 -0
- openadapt_ml/evals/grounding.py +241 -0
- openadapt_ml/evals/plot_eval_metrics.py +174 -0
- openadapt_ml/evals/trajectory_matching.py +486 -0
- openadapt_ml/grounding/__init__.py +45 -0
- openadapt_ml/grounding/base.py +236 -0
- openadapt_ml/grounding/detector.py +570 -0
- openadapt_ml/ingest/__init__.py +43 -0
- openadapt_ml/ingest/capture.py +312 -0
- openadapt_ml/ingest/loader.py +232 -0
- openadapt_ml/ingest/synthetic.py +1102 -0
- openadapt_ml/models/__init__.py +0 -0
- openadapt_ml/models/api_adapter.py +171 -0
- openadapt_ml/models/base_adapter.py +59 -0
- openadapt_ml/models/dummy_adapter.py +42 -0
- openadapt_ml/models/qwen_vl.py +426 -0
- openadapt_ml/runtime/__init__.py +0 -0
- openadapt_ml/runtime/policy.py +182 -0
- openadapt_ml/schemas/__init__.py +53 -0
- openadapt_ml/schemas/sessions.py +122 -0
- openadapt_ml/schemas/validation.py +252 -0
- openadapt_ml/scripts/__init__.py +0 -0
- openadapt_ml/scripts/compare.py +1490 -0
- openadapt_ml/scripts/demo_policy.py +62 -0
- openadapt_ml/scripts/eval_policy.py +287 -0
- openadapt_ml/scripts/make_gif.py +153 -0
- openadapt_ml/scripts/prepare_synthetic.py +43 -0
- openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
- openadapt_ml/scripts/train.py +174 -0
- openadapt_ml/training/__init__.py +0 -0
- openadapt_ml/training/benchmark_viewer.py +1538 -0
- openadapt_ml/training/shared_ui.py +157 -0
- openadapt_ml/training/stub_provider.py +276 -0
- openadapt_ml/training/trainer.py +2446 -0
- openadapt_ml/training/viewer.py +2970 -0
- openadapt_ml-0.1.0.dist-info/METADATA +818 -0
- openadapt_ml-0.1.0.dist-info/RECORD +55 -0
- openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
- openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import argparse
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from openadapt_ml.config import settings
|
|
8
|
+
from openadapt_ml.scripts.train import main as train_main
|
|
9
|
+
from openadapt_ml.scripts.eval_policy import main as eval_main
|
|
10
|
+
from openadapt_ml.evals.plot_eval_metrics import plot_eval_metrics
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _require_env(var_name: str) -> None:
    """Fail fast when a required API key is missing.

    For the two known providers the key is looked up on ``settings`` first,
    falling back to the process environment; any other name is read from the
    environment only.

    Raises:
        RuntimeError: if no value is found for *var_name*.
    """
    # Map known key names to their settings attribute; lazily resolved so we
    # never touch a settings attribute for an unrelated key name.
    settings_attr = {
        "ANTHROPIC_API_KEY": "anthropic_api_key",
        "OPENAI_API_KEY": "openai_api_key",
    }.get(var_name)
    key = getattr(settings, settings_attr) if settings_attr else None
    key = key or os.getenv(var_name)
    if key:
        return
    raise RuntimeError(
        f"API key {var_name} is required for this benchmark but is not set. "
        "Please set it in .env file, as an environment variable, or configure "
        "it before including the corresponding API backend."
    )
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def run_qwen_login_benchmark(
    config_path: str,
    out_dir: str,
    include_claude: bool = False,
    include_openai: bool = False,
    skip_train: bool = False,
) -> None:
    """Run end-to-end synthetic login benchmark (train → eval base/FT → plot).

    This is a thin orchestrator over existing train/eval/plot utilities. It:
    - trains a LoRA adapter using the given config
    - evaluates the base (no LoRA) and fine-tuned models on fresh synthetic data
    - writes eval JSONs and a comparison plot under the given output directory

    Args:
        config_path: Path to the YAML training/eval config file.
        out_dir: Root output directory; ``eval/`` and ``plots/`` are created
            beneath it.
        include_claude: Also evaluate the Claude API backend.
        include_openai: Also evaluate the OpenAI (GPT-5.1) API backend.
        skip_train: Skip LoRA training and only run evaluations.
    """
    config = Path(config_path)
    out_root = Path(out_dir)

    eval_dir = out_root / "eval"
    plots_dir = out_root / "plots"
    eval_dir.mkdir(parents=True, exist_ok=True)
    plots_dir.mkdir(parents=True, exist_ok=True)

    # Validate API keys up front so we fail before any expensive training.
    if include_claude:
        _require_env("ANTHROPIC_API_KEY")
    if include_openai:
        _require_env("OPENAI_API_KEY")

    # 1) Train LoRA adapter according to config unless explicitly skipped.
    if not skip_train:
        train_main(str(config))

    metric_files: list[Path] = []
    labels: list[str] = []

    def _evaluate(backend: str, json_name: str, label: str, *, ignore_lora: bool = False) -> None:
        # One evaluation run; records its output JSON and plot label. Extracted
        # because the four eval invocations below were identical copy-paste.
        out_json = eval_dir / json_name
        eval_main(
            config_path=str(config),
            backend=backend,
            output_json=str(out_json),
            ignore_lora=ignore_lora,
            log_samples=None,
            log_limit=None,
        )
        metric_files.append(out_json)
        labels.append(label)

    # 2) Evaluate Qwen base model (ignoring any LoRA config).
    _evaluate("qwen3", "eval_qwen_base.json", "Qwen3-2B base", ignore_lora=True)

    # 3) Evaluate fine-tuned Qwen model (LoRA-enabled).
    _evaluate("qwen3", "eval_qwen_ft.json", "Qwen3-2B FT")

    # 4) Optionally evaluate API backends.
    if include_claude:
        _evaluate("claude", "eval_claude.json", "Claude Sonnet 4.5")
    if include_openai:
        _evaluate("openai", "eval_gpt51.json", "GPT-5.1")

    # 5) Plot metrics for whichever backends were evaluated.
    plot_name = "qwen_vs_apis.png" if (include_claude or include_openai) else "qwen_base_vs_ft.png"
    plot_eval_metrics(
        metric_files=metric_files,
        labels=labels,
        output_path=plots_dir / plot_name,
    )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def main() -> None:
    """CLI entry point: parse arguments and launch the benchmark orchestrator."""
    parser = argparse.ArgumentParser(
        description=(
            "Run the synthetic login benchmark end-to-end (train → eval base/FT → plot)."
        )
    )
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="Path to YAML config file (e.g. configs/qwen3vl_synthetic_dev.yaml)",
    )
    parser.add_argument(
        "--out-dir",
        type=str,
        required=True,
        help=(
            "Output directory for eval JSONs and plots "
            "(e.g. experiments/qwen_login/2b_dev)"
        ),
    )
    # All boolean switches share the same shape, so define them from a table.
    boolean_flags = (
        ("--include-claude", "Include Claude Sonnet 4.5 API backend in the evaluation."),
        ("--include-openai", "Include GPT-5.1 API backend in the evaluation."),
        ("--include-all-apis", "Shorthand to include both Claude and GPT-5.1 backends."),
        ("--skip-train", "Skip LoRA training and only run evaluations."),
    )
    for flag, flag_help in boolean_flags:
        parser.add_argument(flag, action="store_true", help=flag_help)

    args = parser.parse_args()

    # --include-all-apis is sugar for enabling both API backends.
    want_claude = args.include_claude or args.include_all_apis
    want_openai = args.include_openai or args.include_all_apis

    run_qwen_login_benchmark(
        config_path=args.config,
        out_dir=args.out_dir,
        include_claude=want_claude,
        include_openai=want_openai,
        skip_train=args.skip_train,
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List, Optional, Dict, Any
|
|
5
|
+
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
from openadapt_ml.datasets.next_action import NextActionDataset, build_next_action_sft_samples
|
|
9
|
+
from openadapt_ml.ingest.synthetic import generate_synthetic_sessions
|
|
10
|
+
from openadapt_ml.models.qwen_vl import QwenVLAdapter
|
|
11
|
+
from openadapt_ml.training.trainer import TrainingConfig, TrainingLogger, train_supervised
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _load_config(path: str | Path) -> dict:
    """Read the YAML file at *path* and return its parsed contents."""
    raw_text = Path(path).read_text(encoding="utf-8")
    return yaml.safe_load(raw_text)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load_capture_episodes(capture_path: str | Path, goal: str | None = None) -> list:
    """Load episodes from an openadapt-capture recording.

    A single recording currently maps to exactly one episode, returned as a
    one-element list for uniformity with the synthetic pipeline.
    """
    # Imported lazily so the capture dependency is only needed on this path.
    from openadapt_ml.ingest.capture import capture_to_episode

    return [capture_to_episode(Path(capture_path), goal=goal)]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def main(
    config_path: str,
    capture_path: str | None = None,
    goal: str | None = None,
    output_dir: str | None = None,
    open_dashboard: bool = False,
) -> None:
    """Train a Qwen-VL LoRA adapter on captured or synthetic episodes.

    Args:
        config_path: Path to the YAML training config file.
        capture_path: Optional openadapt-capture recording directory; when
            omitted, synthetic sessions are generated from the config's
            ``synthetic_data`` section instead.
        goal: Optional task goal overriding the recording's own description.
        output_dir: Directory for training logs and the dashboard. Falls back
            to the config's ``training.output_dir`` (default "training_output").
        open_dashboard: When True, open the generated dashboard in a browser.
    """
    cfg = _load_config(config_path)

    model_name = cfg["model"]["name"]
    load_in_4bit = cfg["model"].get("load_in_4bit", False)
    max_pixels = cfg["model"].get("max_pixels")  # For faster training with smaller images
    min_pixels = cfg["model"].get("min_pixels")

    # LoRA config may include an optional weights_path where the trained
    # adapter should be saved. We pass a cleaned config (without
    # weights_path) to the adapter loader.
    raw_lora_cfg = cfg.get("lora")
    lora_weights_path: Optional[str] = None
    lora_cfg: Optional[Dict[str, Any]] = None
    if isinstance(raw_lora_cfg, dict):
        lora_weights_path = raw_lora_cfg.get("weights_path")
        lora_cfg = {k: v for k, v in raw_lora_cfg.items() if k != "weights_path"}
    else:
        lora_cfg = raw_lora_cfg

    # Load data - either from capture or synthetic.
    use_som = cfg.get("synthetic_data", {}).get("use_som", False)

    if capture_path:
        # Load from real openadapt-capture recording.
        print(f"Loading capture from: {capture_path}")
        episodes = _load_capture_episodes(capture_path, goal=goal)
        data_source = f"capture '{Path(capture_path).name}'"
    else:
        # Generate synthetic data.
        synth_cfg = cfg.get("synthetic_data", {})
        num_sessions = synth_cfg.get("num_sessions", 10)
        seed = synth_cfg.get("seed")
        # FIX: keep the synthetic-data directory in its own local. Previously
        # this assignment reused the `output_dir` parameter name, clobbering
        # the caller's training output directory — the `if output_dir is None`
        # fallback below then never fired and training logs/dashboard were
        # silently written into the synthetic data directory.
        synth_output_dir = synth_cfg.get("output_dir", str(Path("synthetic") / "train"))
        use_som = synth_cfg.get("use_som", False)
        scenario = synth_cfg.get("scenario", "login")

        sessions = generate_synthetic_sessions(
            num_sessions=num_sessions,
            seed=seed,
            output_dir=synth_output_dir,
            use_som=use_som,
            scenario=scenario,
        )
        episodes = [ep for sess in sessions for ep in sess.episodes]
        data_source = f"synthetic '{scenario}'"

    samples = build_next_action_sft_samples(episodes, use_som=use_som)
    dataset = NextActionDataset(samples)

    # Adapter + model.
    adapter = QwenVLAdapter.from_pretrained(
        model_name=model_name,
        lora_config=lora_cfg,
        load_in_4bit=load_in_4bit,
        max_pixels=max_pixels,
        min_pixels=min_pixels,
    )

    # Training config. Explicit output_dir argument wins; otherwise fall back
    # to the config file, then to "training_output".
    train_cfg_raw = cfg.get("training", {})
    if output_dir is None:
        output_dir = train_cfg_raw.get("output_dir", "training_output")
    train_cfg = TrainingConfig(
        num_train_epochs=train_cfg_raw.get("num_train_epochs", 1),
        per_device_train_batch_size=train_cfg_raw.get("per_device_train_batch_size", 1),
        gradient_accumulation_steps=train_cfg_raw.get("gradient_accumulation_steps", 1),
        learning_rate=train_cfg_raw.get("learning_rate", 2e-4),
        warmup_ratio=train_cfg_raw.get("warmup_ratio", 0.03),
        weight_decay=train_cfg_raw.get("weight_decay", 0.0),
        max_grad_norm=train_cfg_raw.get("max_grad_norm", 1.0),
        logging_steps=train_cfg_raw.get("logging_steps", 10),
        lr_scheduler_type=train_cfg_raw.get("lr_scheduler_type", "linear"),
        early_stop_loss=train_cfg_raw.get("early_stop_loss", 1e-4),
        early_stop_patience=train_cfg_raw.get("early_stop_patience", 10),
        output_dir=output_dir,
        # Evaluation settings
        eval_every_epoch=train_cfg_raw.get("eval_every_epoch", True),
        eval_samples=train_cfg_raw.get("eval_samples", 3),
    )

    som_label = " (SoM mode)" if use_som else " (coordinate mode)"
    print(f"Loaded {len(episodes)} episodes and {len(samples)} SFT samples{som_label} from {data_source}.")
    print("Starting training...")

    # Get goal from episodes (for logging/viewer).
    episode_goal = episodes[0].goal if episodes else ""

    # Create logger with metadata for the dashboard.
    logger = TrainingLogger(
        output_dir=train_cfg.output_dir,
        config=train_cfg,
        capture_path=str(capture_path) if capture_path else "",
        config_path=str(config_path),
        goal=goal or episode_goal,  # Use explicit goal or episode goal
    )

    # Pass the first episode for periodic evaluation (if available).
    eval_episode = episodes[0] if episodes else None
    training_success = train_supervised(adapter, dataset, train_cfg, logger=logger, episode=eval_episode)

    # Persist the trained adapter if a weights_path was provided and training succeeded.
    if lora_weights_path:
        if training_success:
            save_path = Path(lora_weights_path)
            save_path.mkdir(parents=True, exist_ok=True)
            adapter.model.save_pretrained(save_path)  # type: ignore[arg-type]
            print(f"Saved LoRA adapter to {save_path}")
        else:
            print("Training aborted due to invalid loss. Skipping checkpoint save to avoid corrupted weights.")

    # Open dashboard in browser if requested.
    if open_dashboard:
        import webbrowser
        dashboard_path = Path(output_dir) / "dashboard.html"
        if dashboard_path.exists():
            webbrowser.open(f"file://{dashboard_path.absolute()}")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Script entry point: thin argparse wrapper around main(). Kept inside the
# __main__ guard so importing this module has no side effects.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Train Qwen-VL adapter on synthetic data or openadapt-capture recordings."
    )
    parser.add_argument("--config", type=str, required=True, help="Path to YAML config file.")
    parser.add_argument("--capture", type=str, help="Path to openadapt-capture recording directory.")
    parser.add_argument("--goal", type=str, help="Task goal/description (overrides recording's task description).")
    parser.add_argument("--output-dir", type=str, help="Output directory for logs and dashboard.")
    parser.add_argument("--open", action="store_true", help="Open training dashboard in browser.")
    args = parser.parse_args()

    main(
        args.config,
        capture_path=args.capture,
        goal=args.goal,
        output_dir=args.output_dir,
        open_dashboard=args.open,
    )
|
|
File without changes
|