aquin 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aquin/__init__.py +3 -0
- aquin/cli.py +197 -0
- aquin/run.py +197 -0
- aquin-0.0.1.dist-info/METADATA +136 -0
- aquin-0.0.1.dist-info/RECORD +8 -0
- aquin-0.0.1.dist-info/WHEEL +5 -0
- aquin-0.0.1.dist-info/entry_points.txt +2 -0
- aquin-0.0.1.dist-info/top_level.txt +1 -0
aquin/__init__.py
ADDED
aquin/cli.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import tarfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Per-user JSON config file (holds the saved "api_key"); read and written by the helpers below.
_CONFIG_PATH = Path.home() / ".aquin" / "config.json"
|
|
10
|
+
# Default API origin; the commands below let the AQUIN_BASE_URL environment variable override it.
_BASE_URL = "https://api.aquin.app"
|
|
11
|
+
# Archive path resolved once, at import time, from the working directory.
# NOTE(review): nothing in the visible cli.py code reads this constant; the
# commands call _package_path() instead, which resolves the path at call time.
_PACKAGE_PATH = Path.cwd() / "aquin_run.tar.gz"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _load_config() -> dict:
    """Return the saved CLI configuration, or an empty dict if unavailable.

    Reads the JSON file at ``_CONFIG_PATH``. A missing file yields ``{}``.
    A file that cannot be decoded or parsed, or whose top-level value is not
    a JSON object, also yields ``{}`` after a warning on stderr, so that
    ``aquin login`` can overwrite a damaged file instead of crashing.
    """
    try:
        # Open directly instead of exists()+open(): the file can disappear
        # between the two calls.
        with open(_CONFIG_PATH, encoding="utf-8") as f:
            data = json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return {}
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        print(f"[aquin] Ignoring unreadable config {_CONFIG_PATH}: {exc}", file=sys.stderr)
        return {}

    if not isinstance(data, dict):
        # Callers use .get() and item assignment, so anything else is unusable.
        print(f"[aquin] Ignoring config {_CONFIG_PATH}: expected a JSON object", file=sys.stderr)
        return {}
    return data
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _save_config(data: dict) -> None:
    """Write *data* as indented JSON to ``_CONFIG_PATH``.

    The parent directory is created if needed. Because the file stores the
    API key, it is created with owner-only permissions (0o600) rather than
    the process default.
    """
    _CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)

    # os.open lets us pass the mode at creation time, so the key is never
    # briefly readable by other users.
    fd = os.open(_CONFIG_PATH, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    try:
        # The creation mode is ignored when the file already existed, so
        # tighten it explicitly as well.
        os.chmod(_CONFIG_PATH, 0o600)
    except OSError:
        # Best effort: some filesystems/platforms do not support POSIX modes.
        pass
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _get_api_key(explicit: str | None = None) -> str:
    """Resolve the API key to send with a request.

    Precedence: the *explicit* argument, then the ``api_key`` entry of the
    saved config, then the ``AQUIN_API_KEY`` environment variable. When none
    of them provides a non-empty value, a hint is printed and the process
    exits with status 1.
    """
    if explicit:
        return explicit

    saved = _load_config().get("api_key")
    resolved = saved or os.environ.get("AQUIN_API_KEY") or ""
    if resolved:
        return resolved

    print("No API key found. Run: aquin login")
    sys.exit(1)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _package_path() -> Path:
    """Return the archive location: ``aquin_run.tar.gz`` in the current working directory."""
    return Path.cwd().joinpath("aquin_run.tar.gz")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _read_meta() -> dict:
    """Load ``aquin_run/meta.json`` from the current working directory.

    Prints a hint and exits with status 1 when the run directory or the
    metadata file inside it does not exist.
    """
    meta_file = Path.cwd() / "aquin_run" / "meta.json"

    if not meta_file.parent.exists():
        print("No aquin_run/ directory found. Run your training script first.")
        sys.exit(1)
    if not meta_file.exists():
        print("aquin_run/meta.json not found. Run your training script first.")
        sys.exit(1)

    with meta_file.open() as handle:
        return json.load(handle)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def cmd_login(args: list[str]) -> None:
    """aquin login — prompt for an API key and store it in the config file.

    *args* is accepted for a uniform command signature but is not used.
    The key is read with ``input()``, so it is echoed to the terminal.
    Exits with status 1 when nothing (or only whitespace) is entered.
    """
    entered = input("Paste your Aquin API key: ").strip()
    if entered == "":
        print("No key entered.")
        sys.exit(1)

    # Keep any other saved settings; only the key entry is replaced.
    updated = {**_load_config(), "api_key": entered}
    _save_config(updated)
    print(f"API key saved to {_CONFIG_PATH}")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def cmd_package(args: list[str]) -> None:
    """aquin package — bundle aquin_run/ into aquin_run.tar.gz

    Reads the run metadata for the display name, then writes a gzip tarball
    of the ``aquin_run`` directory (stored under the archive name
    ``aquin_run``) next to it in the working directory. *args* is unused.
    """
    meta = _read_meta()
    fallback_name = meta["run_id"]
    label = meta.get("run_name", fallback_name)
    archive = _package_path()

    print(f"[aquin] Packaging run '{label}'...")
    source_dir = Path.cwd() / "aquin_run"
    with tarfile.open(archive, "w:gz") as bundle:
        bundle.add(source_dir, arcname="aquin_run")

    # Decimal megabytes, one digit after the point.
    megabytes = round(archive.stat().st_size / 1_000_000, 1)
    print(f"[aquin] Package ready: {archive.name} ({megabytes} MB)")
    print("[aquin] Run aquin push to send it to Aquin.")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def cmd_push(args: list[str]) -> None:
    """aquin push — push aquin_run.tar.gz to Aquin

    Recognised arguments:
        --key <API_KEY>   use this key instead of the saved/environment one

    The push takes two requests: a GET to ``<base>/api/sdk/push`` that
    authenticates and returns the upload target (``url`` plus optional
    ``headers``), then a POST that streams the archive to that target.
    The ``AQUIN_BASE_URL`` environment variable overrides the default base.

    Exits with status 1 when the archive or run metadata is missing, no API
    key is available, or any step of the push fails.
    """
    import requests  # imported lazily, only when this command runs

    explicit_key = None
    for i, a in enumerate(args):
        # A trailing "--key" with no value is ignored.
        if a == "--key" and i + 1 < len(args):
            explicit_key = args[i + 1]

    pkg = _package_path()
    if not pkg.exists():
        print("No aquin_run.tar.gz found. Run aquin package first.")
        sys.exit(1)

    meta = _read_meta()
    run_id = meta["run_id"]
    run_name = meta.get("run_name", run_id)
    api_key = _get_api_key(explicit_key)
    base_url = os.environ.get("AQUIN_BASE_URL", _BASE_URL).rstrip("/")

    file_size = pkg.stat().st_size
    size_mb = round(file_size / 1_000_000, 1)
    print(f"[aquin] Pushing '{run_name}' ({size_mb} MB)...")

    try:
        # Step 1: get the VM upload URL from Next.js (auth only, no body)
        auth_resp = requests.get(
            f"{base_url}/api/sdk/push",
            params={
                "run_id": run_id,
                "run_name": run_name,
                "base_model": meta.get("base_model", ""),
            },
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=15,
        )
        auth_resp.raise_for_status()
        upload = auth_resp.json()

        # Validate the response shape here so a malformed reply produces a
        # readable message instead of a bare KeyError such as "'url'".
        if not isinstance(upload, dict) or not upload.get("url"):
            raise ValueError("server response did not include an upload URL")
        upload_headers = dict(upload.get("headers") or {})

        # Step 2: push tar.gz directly to the VM (bypasses Vercel body limit)
        with open(pkg, "rb") as f:
            resp = requests.post(
                upload["url"],
                data=f,
                headers={**upload_headers, "Content-Length": str(file_size)},
                timeout=600,
            )
        resp.raise_for_status()
        print("[aquin] Run pushed successfully.")
        print("[aquin] Open Aquin, your run will appear as a notification.")
    except Exception as exc:
        # Top-level CLI boundary: report any failure and exit non-zero.
        print(f"[aquin] Push failed: {exc}")
        sys.exit(1)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def cmd_whoami(args: list[str]) -> None:
    """aquin whoami — show the currently logged in user

    Calls ``<base>/api/sdk/whoami`` with the resolved API key and prints the
    ``email`` field of the response, falling back to ``user_id`` and then
    ``?``. The ``AQUIN_BASE_URL`` environment variable overrides the default
    base. *args* is unused.

    On any failure the error is printed and the process exits with status 1,
    as the other commands do, so scripts can detect a bad key or an
    unreachable server.
    """
    import requests  # imported lazily, only when this command runs

    key = _get_api_key()
    base_url = os.environ.get("AQUIN_BASE_URL", _BASE_URL).rstrip("/")
    try:
        resp = requests.get(
            f"{base_url}/api/sdk/whoami",
            headers={"Authorization": f"Bearer {key}"},
            timeout=10,
        )
        resp.raise_for_status()
        data = resp.json()
        print(f"Logged in as: {data.get('email', data.get('user_id', '?'))}")
    except Exception as exc:
        # Top-level CLI boundary: report the failure and signal it to the shell.
        print(f"Error: {exc}")
        sys.exit(1)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def cmd_help(args: list[str]) -> None:
    """aquin help — print the command overview and the typical workflow.

    *args* is accepted for a uniform command signature but is not used.
    """
    help_lines = (
        "Aquin CLI",
        "",
        "Usage: aquin <command>",
        "",
        "Commands:",
        " login Save your Aquin API key",
        " package Bundle aquin_run/ into aquin_run.tar.gz",
        " push Push aquin_run.tar.gz to Aquin",
        " whoami Show the currently logged in account",
        " help Show this help message",
        "",
        "Typical workflow:",
        " 1. Add aquin.init() / run.log() / run.finish() to your training script",
        " 2. Run your training script",
        " 3. aquin package",
        " 4. aquin push",
    )
    print("\n".join(help_lines))
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def main() -> None:
    """CLI entry point: dispatch ``sys.argv[1]`` to the matching command.

    With no arguments the help text is shown. An unrecognised command prints
    a hint and exits with status 1. Each handler receives the arguments that
    follow the command name.
    """
    argv = sys.argv[1:]
    if len(argv) == 0:
        cmd_help([])
        return

    name, remaining = argv[0], argv[1:]
    handlers = {
        "login": cmd_login,
        "package": cmd_package,
        "push": cmd_push,
        "whoami": cmd_whoami,
        "help": cmd_help,
        "--help": cmd_help,
        "-h": cmd_help,
    }

    handler = handlers.get(name)
    if handler is None:
        print(f"Unknown command: {name}")
        print("Run aquin help to see available commands.")
        sys.exit(1)
    handler(remaining)
|
aquin/run.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
import uuid
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def init(
    base_model: str | None = None,
    run_name: str | None = None,
    config: dict[str, Any] | None = None,
) -> "Run":
    """Begin recording a training run and return its :class:`Run` handle.

    Call this before the training loop. All three arguments are optional and
    are forwarded unchanged to ``Run``.
    """
    run = Run(base_model, run_name, config)
    return run
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Run:
    """
    Records metrics and checkpoints locally during training.
    Call aquin.init() to create one.

    Everything is written to ``aquin_run/`` in the working directory that is
    current when the run is created: ``meta.json``, ``metrics.json``,
    ``config.json`` (only when a config is supplied) and
    ``checkpoints/checkpoint.pt``.

    After training, use the CLI:
        aquin package   # then: aquin push

    Example
    -------
        run = aquin.init(
            base_model="meta-llama/Llama-3.2-1B-Instruct",
            run_name="my-run",
            config={"lr": 2e-4, "epochs": 3, "rank": 16, "lora_alpha": 32,
                    "method": "qlora", "per_device_train_batch_size": 2,
                    "gradient_accumulation_steps": 8, "dataset": "data.jsonl"},
        )
        for step, batch in enumerate(dataloader):
            loss = train_step(batch)
            run.log(step, loss=loss.item(), learning_rate=scheduler.get_last_lr()[0],
                    grad_norm=grad_norm, epoch=epoch)
        run.checkpoint(model, step=step)
        run.finish()
    """

    def __init__(
        self,
        base_model: str | None = None,
        run_name: str | None = None,
        config: dict[str, Any] | None = None,
    ) -> None:
        """Create the run directory and write ``meta.json`` (and ``config.json``).

        Args
        ----
        base_model: model identifier; falls back to the ``AQUIN_BASE_MODEL``
                    environment variable, then to an empty string
        run_name  : display name; defaults to ``run-<first 8 chars of the id>``
        config    : training hyperparameters, written to ``config.json`` if given
        """
        self._run_id = str(uuid.uuid4())
        self._run_name = run_name or f"run-{self._run_id[:8]}"
        self._base_model = base_model or os.environ.get("AQUIN_BASE_MODEL") or ""
        self._started_at = time.time()

        # exist_ok=True: an existing aquin_run/ is reused, and files left in
        # it by an earlier run are not removed here.
        self._run_dir = Path.cwd() / "aquin_run"
        self._run_dir.mkdir(exist_ok=True)
        (self._run_dir / "checkpoints").mkdir(exist_ok=True)

        # In-memory metric entries; the lock guards appends against snapshots.
        self._metrics: list[dict[str, Any]] = []
        self._metrics_lock = threading.Lock()

        self._write_meta()
        if config:
            self._write_config(config)

        print(f"[aquin] Recording run '{self._run_name}' (id: {self._run_id})")
        print(f"[aquin] Run data will be saved to {self._run_dir}")

    # ── Public API ──────────────────────────────────────────────────────────────

    def log(
        self,
        step: int,
        *,
        loss: float,
        learning_rate: float | None = None,
        grad_norm: float | None = None,
        momentum_norm: float | None = None,
        epoch: int | None = None,
        batch: int | None = None,
        total_batches: int | None = None,
        step_ms: float | None = None,
        **extra: float,
    ) -> None:
        """
        Record metrics for one training step. Call this every step inside your loop.

        Values are kept in memory and written to ``metrics.json`` by
        checkpoint() and finish(). Number-like objects that are not plain
        Python scalars (for example single-element tensors or numpy scalars)
        are converted to ``int``/``float`` here, so that writing the metrics
        later does not fail on a value the JSON encoder cannot serialise.

        Required
        --------
        step : global training step index (0-based or 1-based, be consistent)
        loss : scalar training loss for this step

        Recommended
        -----------
        learning_rate : current LR from your scheduler
        grad_norm     : gradient norm — compute with torch.nn.utils.clip_grad_norm_()
        momentum_norm : optimizer momentum norm — enables Momentum chart in the UI
                        for AdamW: sum(p.exp_avg.norm() for p in optimizer.state.values())
        epoch         : current epoch number — enables epoch summary table in the UI
        batch         : batch index within the epoch
        total_batches : total batches per epoch
        step_ms       : wall-clock time for this step in milliseconds

        Any additional keyword arguments are stored alongside the standard metrics.
        """
        scalar = self._to_scalar

        metrics: dict[str, Any] = {"loss": scalar(loss)}
        optional_metrics = (
            ("learning_rate", learning_rate),
            ("grad_norm", grad_norm),
            ("momentum_norm", momentum_norm),
            ("step_ms", step_ms),
        )
        for key, value in optional_metrics:
            if value is not None:
                metrics[key] = scalar(value)
        # Extras go last, so they can override a standard metric of the same name.
        metrics.update((key, scalar(value)) for key, value in extra.items())

        entry: dict[str, Any] = {"step": scalar(step), "metrics": metrics}
        # Position fields live on the entry itself, not inside "metrics".
        position_fields = (
            ("epoch", epoch),
            ("batch", batch),
            ("total_batches", total_batches),
        )
        for key, value in position_fields:
            if value is not None:
                entry[key] = scalar(value)

        with self._metrics_lock:
            self._metrics.append(entry)

    def checkpoint(self, model: Any, step: int) -> None:
        """
        Save the final model checkpoint. Call once at the end of training.
        One checkpoint per run — always replaces the previous save.

        Also writes the metrics recorded so far to ``metrics.json``.

        Args
        ----
        model: a PyTorch nn.Module
        step : the training step this checkpoint corresponds to

        Raises
        ------
        ImportError: if torch is not installed
        """
        try:
            import torch
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError("torch is required to save checkpoints.") from exc

        ckpt_path = self._run_dir / "checkpoints" / "checkpoint.pt"
        print(f"[aquin] Saving checkpoint (step {step})...")
        torch.save({"step": step, "state_dict": model.state_dict()}, ckpt_path)
        self._write_metrics()
        print(f"[aquin] Checkpoint saved (step {step}).")

    def finish(self, config: dict[str, Any] | None = None) -> None:
        """
        Finalise the run. Flushes all metrics and optional config to disk.

        If you passed config to aquin.init() you don't need to pass it again here.
        After this, run: aquin package   # then: aquin push

        Args
        ----
        config: training hyperparameters to store alongside the run.
                Recognised keys (all optional):
                  lr, epochs, rank, lora_alpha, method,
                  per_device_train_batch_size, gradient_accumulation_steps,
                  dataset, max_seq_len, optimizer, scheduler,
                  dropout, weight_decay, grad_clip
        """
        self._write_metrics()
        if config:
            self._write_config(config)
        elapsed = round(time.time() - self._started_at)
        with self._metrics_lock:
            recorded = len(self._metrics)
        print(f"[aquin] Run finished. {recorded} steps recorded in {elapsed}s.")
        print("[aquin] Push to Aquin with: aquin package # then: aquin push")

    @property
    def run_id(self) -> str:
        """The UUID generated for this run."""
        return self._run_id

    # ── Internal ────────────────────────────────────────────────────────────────

    @staticmethod
    def _to_scalar(value: Any) -> Any:
        """Return *value* as a plain scalar the JSON encoder accepts, when possible.

        Plain ``None``/``bool``/``int``/``float``/``str`` values pass through
        untouched. Other objects are converted with ``__index__`` (integer-like)
        or ``float()``; if neither works the value is returned unchanged.
        """
        if value is None or isinstance(value, (bool, int, float, str)):
            return value
        as_index = getattr(value, "__index__", None)
        if as_index is not None:
            try:
                return int(as_index())
            except (TypeError, ValueError, RuntimeError):
                pass  # not integer-valued after all; try float below
        try:
            return float(value)
        except (TypeError, ValueError, RuntimeError):
            return value

    def _write_meta(self) -> None:
        """Write the run id, name, base model and start time to ``meta.json``."""
        meta = {
            "run_id": self._run_id,
            "run_name": self._run_name,
            "base_model": self._base_model,
            "started_at": self._started_at,
        }
        with open(self._run_dir / "meta.json", "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

    def _write_metrics(self) -> None:
        """Write every metric entry recorded so far to ``metrics.json``."""
        # Copy under the lock, write outside it, so log() is not blocked on disk I/O.
        with self._metrics_lock:
            data = list(self._metrics)
        with open(self._run_dir / "metrics.json", "w", encoding="utf-8") as f:
            json.dump(data, f)

    def _write_config(self, config: dict[str, Any]) -> None:
        """Write *config* to ``config.json``, replacing any existing file."""
        with open(self._run_dir / "config.json", "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aquin
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Record training runs locally and push to Aquin for post-hoc inspection — loss curves, SAE diffs, model diffs, and more.
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://aquin.app
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: requests>=2.28
|
|
10
|
+
|
|
11
|
+
# Aquin SDK
|
|
12
|
+
|
|
13
|
+
Record your training runs locally and push them to [Aquin](https://aquin.app) for post-hoc inspection — loss curves, learning rate, grad norm, epoch summaries, SAE feature diffs, model behaviour diffs, and more.
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install aquin
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quickstart
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
import aquin
|
|
25
|
+
|
|
26
|
+
run = aquin.init(
|
|
27
|
+
base_model="meta-llama/Llama-3.2-1B-Instruct",
|
|
28
|
+
run_name="my-lora-run",
|
|
29
|
+
config={
|
|
30
|
+
"lr": 2e-4, "epochs": 3, "rank": 16, "lora_alpha": 32,
|
|
31
|
+
"method": "qlora", "per_device_train_batch_size": 2,
|
|
32
|
+
"gradient_accumulation_steps": 8, "dataset": "data.jsonl",
|
|
33
|
+
},
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
for epoch in range(3):
|
|
37
|
+
for step, batch in enumerate(dataloader):
|
|
38
|
+
loss = train_step(batch)
|
|
39
|
+
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0).item()
|
|
40
|
+
run.log(
|
|
41
|
+
step,
|
|
42
|
+
loss=loss.item(),
|
|
43
|
+
learning_rate=scheduler.get_last_lr()[0],
|
|
44
|
+
grad_norm=grad_norm,
|
|
45
|
+
epoch=epoch,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
run.checkpoint(model, step=step)
|
|
49
|
+
run.finish()
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Then push:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
aquin package
|
|
56
|
+
aquin push
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Your run appears in the Aquin dashboard under **CLI runs** with the full inspection suite.
|
|
60
|
+
|
|
61
|
+
## API
|
|
62
|
+
|
|
63
|
+
### `aquin.init(base_model, run_name, config)`
|
|
64
|
+
|
|
65
|
+
Starts a new run. Creates `aquin_run/` in the current directory.
|
|
66
|
+
|
|
67
|
+
| Param | Description |
|
|
68
|
+
|---|---|
|
|
69
|
+
| `base_model` | HuggingFace model ID, e.g. `"meta-llama/Llama-3.2-1B-Instruct"` |
|
|
70
|
+
| `run_name` | Display name for the run |
|
|
71
|
+
| `config` | Dict of training hyperparameters (optional, can also pass to `finish()`) |
|
|
72
|
+
|
|
73
|
+
### `run.log(step, *, loss, ...)`
|
|
74
|
+
|
|
75
|
+
Record metrics for one training step. Call every step inside your loop.
|
|
76
|
+
|
|
77
|
+
| Param | Description |
|
|
78
|
+
|---|---|
|
|
79
|
+
| `step` | Global training step (required) |
|
|
80
|
+
| `loss` | Scalar training loss (required) |
|
|
81
|
+
| `learning_rate` | Current LR — enables LR chart |
|
|
82
|
+
| `grad_norm` | Gradient norm — enables grad norm chart |
|
|
83
|
+
| `epoch` | Current epoch — enables epoch summary table |
|
|
84
|
+
| `momentum_norm` | Optimizer momentum norm — enables momentum chart |
|
|
85
|
+
| `step_ms` | Wall-clock time for this step in ms |
|
|
86
|
+
|
|
87
|
+
### `run.checkpoint(model, step)`
|
|
88
|
+
|
|
89
|
+
Saves the model checkpoint locally. One checkpoint per run — always replaces the previous save. Call once at the end of training. The checkpoint is included in the push and used for SAE diff and model diff analysis.
|
|
90
|
+
|
|
91
|
+
### `run.finish(config)`
|
|
92
|
+
|
|
93
|
+
Flushes all metrics to disk. Pass `config` here if you didn't pass it to `aquin.init()`.
|
|
94
|
+
|
|
95
|
+
## CLI
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
aquin login # save your API key
|
|
99
|
+
aquin package # bundle aquin_run/ into aquin_run.tar.gz
|
|
100
|
+
aquin push # push to Aquin
|
|
101
|
+
aquin whoami # check which account you're logged in as
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Using with HuggingFace Trainer / TRL
|
|
105
|
+
|
|
106
|
+
Use a `TrainerCallback` to hook into the training loop:
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
import time
|
|
110
|
+
from transformers import TrainerCallback
|
|
111
|
+
|
|
112
|
+
class AquinCallback(TrainerCallback):
|
|
113
|
+
def __init__(self, run):
|
|
114
|
+
self.run = run
|
|
115
|
+
self._step_start = 0.0
|
|
116
|
+
|
|
117
|
+
def on_step_begin(self, args, state, control, **kwargs):
|
|
118
|
+
self._step_start = time.time()
|
|
119
|
+
|
|
120
|
+
def on_log(self, args, state, control, logs=None, **kwargs):
|
|
121
|
+
if not logs or "loss" not in logs:
|
|
122
|
+
return
|
|
123
|
+
self.run.log(
|
|
124
|
+
step=state.global_step,
|
|
125
|
+
loss=float(logs["loss"]),
|
|
126
|
+
learning_rate=float(logs["learning_rate"]) if "learning_rate" in logs else None,
|
|
127
|
+
grad_norm=float(logs["grad_norm"]) if "grad_norm" in logs else None,
|
|
128
|
+
epoch=int(state.epoch) if state.epoch is not None else None,
|
|
129
|
+
step_ms=round((time.time() - self._step_start) * 1000),
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
def on_train_end(self, args, state, control, **kwargs):
|
|
133
|
+
model = kwargs.get("model")
|
|
134
|
+
if model:
|
|
135
|
+
self.run.checkpoint(model, step=state.global_step)
|
|
136
|
+
```
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
aquin/__init__.py,sha256=FraX6aNKjZXE6nlP7kcpYnIsgZ7ybKmVgaBpdF_8_y4,54
|
|
2
|
+
aquin/cli.py,sha256=kW2sWBLyFF9yZQJIg3rEeN5JSVQwPczBHUyao60mWis,5901
|
|
3
|
+
aquin/run.py,sha256=2TtOIWKTe1yT8TG3BZ-MQRbwnBRiG_d3xrPvZtt8_vI,7317
|
|
4
|
+
aquin-0.0.1.dist-info/METADATA,sha256=tWZG32rvIfchvkWYxjPGO2RiTqpUmQwreDBy3nRvU2M,4251
|
|
5
|
+
aquin-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
aquin-0.0.1.dist-info/entry_points.txt,sha256=G2OIiZIvNTHNXeuMnkTooZLLDDIEdYsABFP3eQEfHQ0,41
|
|
7
|
+
aquin-0.0.1.dist-info/top_level.txt,sha256=_z0uFSWkloDciYFBw0MXB_sIID7Q1EsVGU234HDrZps,6
|
|
8
|
+
aquin-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
aquin
|