aquin 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aquin-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: aquin
3
+ Version: 0.0.1
4
+ Summary: Record training runs locally and push to Aquin for post-hoc inspection — loss curves, SAE diffs, model diffs, and more.
5
+ License: MIT
6
+ Project-URL: Homepage, https://aquin.app
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: requests>=2.28
10
+
11
+ # Aquin SDK
12
+
13
+ Record your training runs locally and push them to [Aquin](https://aquin.app) for post-hoc inspection — loss curves, learning rate, grad norm, epoch summaries, SAE feature diffs, model behaviour diffs, and more.
14
+
15
+ ## Install
16
+
17
+ ```bash
18
+ pip install aquin
19
+ ```
20
+
21
+ ## Quickstart
22
+
23
+ ```python
24
+ import aquin
25
+
26
+ run = aquin.init(
27
+ base_model="meta-llama/Llama-3.2-1B-Instruct",
28
+ run_name="my-lora-run",
29
+ config={
30
+ "lr": 2e-4, "epochs": 3, "rank": 16, "lora_alpha": 32,
31
+ "method": "qlora", "per_device_train_batch_size": 2,
32
+ "gradient_accumulation_steps": 8, "dataset": "data.jsonl",
33
+ },
34
+ )
35
+
36
+ for epoch in range(3):
37
+ for step, batch in enumerate(dataloader):
38
+ loss = train_step(batch)
39
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0).item()
40
+ run.log(
41
+ step,
42
+ loss=loss.item(),
43
+ learning_rate=scheduler.get_last_lr()[0],
44
+ grad_norm=grad_norm,
45
+ epoch=epoch,
46
+ )
47
+
48
+ run.checkpoint(model, step=step)
49
+ run.finish()
50
+ ```
51
+
52
+ Then push:
53
+
54
+ ```bash
55
+ aquin package
56
+ aquin push
57
+ ```
58
+
59
+ Your run appears in the Aquin dashboard under **CLI runs** with the full inspection suite.
60
+
61
+ ## API
62
+
63
+ ### `aquin.init(base_model, run_name, config)`
64
+
65
+ Starts a new run. Creates `aquin_run/` in the current directory.
66
+
67
+ | Param | Description |
68
+ |---|---|
69
+ | `base_model` | HuggingFace model ID, e.g. `"meta-llama/Llama-3.2-1B-Instruct"` |
70
+ | `run_name` | Display name for the run |
71
+ | `config` | Dict of training hyperparameters (optional, can also pass to `finish()`) |
72
+
73
+ ### `run.log(step, *, loss, ...)`
74
+
75
+ Record metrics for one training step. Call every step inside your loop.
76
+
77
+ | Param | Description |
78
+ |---|---|
79
+ | `step` | Global training step (required) |
80
+ | `loss` | Scalar training loss (required) |
81
+ | `learning_rate` | Current LR — enables LR chart |
82
+ | `grad_norm` | Gradient norm — enables grad norm chart |
83
+ | `epoch` | Current epoch — enables epoch summary table |
84
+ | `momentum_norm` | Optimizer momentum norm — enables momentum chart |
85
+ | `step_ms` | Wall-clock time for this step in ms |
86
+
87
+ ### `run.checkpoint(model, step)`
88
+
89
+ Saves the model checkpoint locally. One checkpoint per run — always replaces the previous save. Call once at the end of training. The checkpoint is included in the push and used for SAE diff and model diff analysis.
90
+
91
+ ### `run.finish(config)`
92
+
93
+ Flushes all metrics to disk. Pass `config` here if you didn't pass it to `aquin.init()`.
94
+
95
+ ## CLI
96
+
97
+ ```bash
98
+ aquin login # save your API key
99
+ aquin package # bundle aquin_run/ into aquin_run.tar.gz
100
+ aquin push # push to Aquin
101
+ aquin whoami # check which account you're logged in as
102
+ ```
103
+
104
+ ## Using with HuggingFace Trainer / TRL
105
+
106
+ Use a `TrainerCallback` to hook into the training loop:
107
+
108
+ ```python
109
+ import time
110
+ from transformers import TrainerCallback
111
+
112
+ class AquinCallback(TrainerCallback):
113
+ def __init__(self, run):
114
+ self.run = run
115
+ self._step_start = 0.0
116
+
117
+ def on_step_begin(self, args, state, control, **kwargs):
118
+ self._step_start = time.time()
119
+
120
+ def on_log(self, args, state, control, logs=None, **kwargs):
121
+ if not logs or "loss" not in logs:
122
+ return
123
+ self.run.log(
124
+ step=state.global_step,
125
+ loss=float(logs["loss"]),
126
+ learning_rate=float(logs["learning_rate"]) if "learning_rate" in logs else None,
127
+ grad_norm=float(logs["grad_norm"]) if "grad_norm" in logs else None,
128
+ epoch=int(state.epoch) if state.epoch is not None else None,
129
+ step_ms=round((time.time() - self._step_start) * 1000),
130
+ )
131
+
132
+ def on_train_end(self, args, state, control, **kwargs):
133
+ model = kwargs.get("model")
134
+ if model:
135
+ self.run.checkpoint(model, step=state.global_step)
136
+ ```
aquin-0.0.1/README.md ADDED
@@ -0,0 +1,126 @@
1
+ # Aquin SDK
2
+
3
+ Record your training runs locally and push them to [Aquin](https://aquin.app) for post-hoc inspection — loss curves, learning rate, grad norm, epoch summaries, SAE feature diffs, model behaviour diffs, and more.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install aquin
9
+ ```
10
+
11
+ ## Quickstart
12
+
13
+ ```python
14
+ import aquin
15
+
16
+ run = aquin.init(
17
+ base_model="meta-llama/Llama-3.2-1B-Instruct",
18
+ run_name="my-lora-run",
19
+ config={
20
+ "lr": 2e-4, "epochs": 3, "rank": 16, "lora_alpha": 32,
21
+ "method": "qlora", "per_device_train_batch_size": 2,
22
+ "gradient_accumulation_steps": 8, "dataset": "data.jsonl",
23
+ },
24
+ )
25
+
26
+ for epoch in range(3):
27
+ for step, batch in enumerate(dataloader):
28
+ loss = train_step(batch)
29
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0).item()
30
+ run.log(
31
+ step,
32
+ loss=loss.item(),
33
+ learning_rate=scheduler.get_last_lr()[0],
34
+ grad_norm=grad_norm,
35
+ epoch=epoch,
36
+ )
37
+
38
+ run.checkpoint(model, step=step)
39
+ run.finish()
40
+ ```
41
+
42
+ Then push:
43
+
44
+ ```bash
45
+ aquin package
46
+ aquin push
47
+ ```
48
+
49
+ Your run appears in the Aquin dashboard under **CLI runs** with the full inspection suite.
50
+
51
+ ## API
52
+
53
+ ### `aquin.init(base_model, run_name, config)`
54
+
55
+ Starts a new run. Creates `aquin_run/` in the current directory.
56
+
57
+ | Param | Description |
58
+ |---|---|
59
+ | `base_model` | HuggingFace model ID, e.g. `"meta-llama/Llama-3.2-1B-Instruct"` |
60
+ | `run_name` | Display name for the run |
61
+ | `config` | Dict of training hyperparameters (optional, can also pass to `finish()`) |
62
+
63
+ ### `run.log(step, *, loss, ...)`
64
+
65
+ Record metrics for one training step. Call every step inside your loop.
66
+
67
+ | Param | Description |
68
+ |---|---|
69
+ | `step` | Global training step (required) |
70
+ | `loss` | Scalar training loss (required) |
71
+ | `learning_rate` | Current LR — enables LR chart |
72
+ | `grad_norm` | Gradient norm — enables grad norm chart |
73
+ | `epoch` | Current epoch — enables epoch summary table |
74
+ | `momentum_norm` | Optimizer momentum norm — enables momentum chart |
75
+ | `step_ms` | Wall-clock time for this step in ms |
76
+
77
+ ### `run.checkpoint(model, step)`
78
+
79
+ Saves the model checkpoint locally. One checkpoint per run — always replaces the previous save. Call once at the end of training. The checkpoint is included in the push and used for SAE diff and model diff analysis.
80
+
81
+ ### `run.finish(config)`
82
+
83
+ Flushes all metrics to disk. Pass `config` here if you didn't pass it to `aquin.init()`.
84
+
85
+ ## CLI
86
+
87
+ ```bash
88
+ aquin login # save your API key
89
+ aquin package # bundle aquin_run/ into aquin_run.tar.gz
90
+ aquin push # push to Aquin
91
+ aquin whoami # check which account you're logged in as
92
+ ```
93
+
94
+ ## Using with HuggingFace Trainer / TRL
95
+
96
+ Use a `TrainerCallback` to hook into the training loop:
97
+
98
+ ```python
99
+ import time
100
+ from transformers import TrainerCallback
101
+
102
+ class AquinCallback(TrainerCallback):
103
+ def __init__(self, run):
104
+ self.run = run
105
+ self._step_start = 0.0
106
+
107
+ def on_step_begin(self, args, state, control, **kwargs):
108
+ self._step_start = time.time()
109
+
110
+ def on_log(self, args, state, control, logs=None, **kwargs):
111
+ if not logs or "loss" not in logs:
112
+ return
113
+ self.run.log(
114
+ step=state.global_step,
115
+ loss=float(logs["loss"]),
116
+ learning_rate=float(logs["learning_rate"]) if "learning_rate" in logs else None,
117
+ grad_norm=float(logs["grad_norm"]) if "grad_norm" in logs else None,
118
+ epoch=int(state.epoch) if state.epoch is not None else None,
119
+ step_ms=round((time.time() - self._step_start) * 1000),
120
+ )
121
+
122
+ def on_train_end(self, args, state, control, **kwargs):
123
+ model = kwargs.get("model")
124
+ if model:
125
+ self.run.checkpoint(model, step=state.global_step)
126
+ ```
@@ -0,0 +1,3 @@
1
+ from .run import Run, init
2
+
3
+ __all__ = ["Run", "init"]
@@ -0,0 +1,197 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ import tarfile
7
+ from pathlib import Path
8
+
9
+ _CONFIG_PATH = Path.home() / ".aquin" / "config.json"
10
+ _BASE_URL = "https://api.aquin.app"
11
+ _PACKAGE_PATH = Path.cwd() / "aquin_run.tar.gz"
12
+
13
+
14
def _load_config() -> dict:
    """Return the saved CLI configuration.

    The configuration is parsed as JSON from ``_CONFIG_PATH``.  When that
    file does not exist an empty dict is returned instead.
    """
    if not _CONFIG_PATH.exists():
        return {}
    with open(_CONFIG_PATH) as handle:
        return json.load(handle)
19
+
20
+
21
def _save_config(data: dict) -> None:
    """Write *data* as indented JSON to ``_CONFIG_PATH``.

    The parent directory is created on demand.  The file stores the API key
    saved by ``aquin login``, so it is created with owner-only permissions
    (0o600) instead of whatever the process umask would allow.
    """
    _CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Create (or truncate) the file with a restrictive mode from the start so
    # the key is never briefly readable by other users.
    fd = os.open(_CONFIG_PATH, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        json.dump(data, f, indent=2)
    try:
        # The mode passed to os.open() only applies to newly created files;
        # tighten a pre-existing config file as well.
        os.chmod(_CONFIG_PATH, 0o600)
    except OSError:
        # Best effort: some filesystems do not support changing permissions.
        pass
25
+
26
+
27
+ def _get_api_key(explicit: str | None = None) -> str:
28
+ if explicit:
29
+ return explicit
30
+ cfg = _load_config()
31
+ key = cfg.get("api_key") or os.environ.get("AQUIN_API_KEY") or ""
32
+ if not key:
33
+ print("No API key found. Run: aquin login")
34
+ sys.exit(1)
35
+ return key
36
+
37
+
38
+ def _package_path() -> Path:
39
+ return Path.cwd() / "aquin_run.tar.gz"
40
+
41
+
42
+ def _read_meta() -> dict:
43
+ run_dir = Path.cwd() / "aquin_run"
44
+ if not run_dir.exists():
45
+ print("No aquin_run/ directory found. Run your training script first.")
46
+ sys.exit(1)
47
+ meta_path = run_dir / "meta.json"
48
+ if not meta_path.exists():
49
+ print("aquin_run/meta.json not found. Run your training script first.")
50
+ sys.exit(1)
51
+ with open(meta_path) as f:
52
+ return json.load(f)
53
+
54
+
55
def cmd_login(args: list[str]) -> None:
    """Prompt for an API key and store it in the saved configuration.

    *args* is accepted for a uniform command signature and is not used.
    Exits with status 1 when the entered key is empty after stripping
    whitespace.
    """
    entered = input("Paste your Aquin API key: ").strip()
    if not entered:
        print("No key entered.")
        sys.exit(1)
    # Keep any other saved settings and only set/replace the key.
    settings = _load_config()
    settings["api_key"] = entered
    _save_config(settings)
    print(f"API key saved to {_CONFIG_PATH}")
65
+
66
+
67
def cmd_package(args: list[str]) -> None:
    """Bundle ``aquin_run/`` into ``aquin_run.tar.gz`` in the current directory.

    *args* is accepted for a uniform command signature and is not used.
    The run's metadata is read first, so the command exits early when no
    run has been recorded yet.
    """
    meta = _read_meta()
    label = meta.get("run_name", meta["run_id"])
    archive = _package_path()

    print(f"[aquin] Packaging run '{label}'...")
    source_dir = Path.cwd() / "aquin_run"
    with tarfile.open(archive, "w:gz") as bundle:
        # Store entries under a fixed top-level "aquin_run" folder.
        bundle.add(source_dir, arcname="aquin_run")

    megabytes = round(archive.stat().st_size / 1_000_000, 1)
    print(f"[aquin] Package ready: {archive.name} ({megabytes} MB)")
    print("[aquin] Run aquin push to send it to Aquin.")
81
+
82
+
83
def cmd_push(args: list[str]) -> None:
    """Upload ``aquin_run.tar.gz`` to Aquin.

    Recognised arguments: ``--key <value>`` overrides the saved API key
    (the last occurrence wins; a trailing ``--key`` with no value is
    ignored).  The upload is a two-step exchange: an authenticated GET
    returns the upload URL and headers, then the archive is POSTed to that
    URL.  Any failure during the exchange is reported and the process exits
    with status 1.
    """
    import requests

    explicit_key = None
    for flag, value in zip(args, args[1:]):
        if flag == "--key":
            explicit_key = value

    archive = _package_path()
    if not archive.exists():
        print("No aquin_run.tar.gz found. Run aquin package first.")
        sys.exit(1)

    meta = _read_meta()
    run_id = meta["run_id"]
    run_name = meta.get("run_name", run_id)
    api_key = _get_api_key(explicit_key)
    # AQUIN_BASE_URL lets the API host be overridden via the environment.
    base_url = os.environ.get("AQUIN_BASE_URL", _BASE_URL).rstrip("/")

    megabytes = round(archive.stat().st_size / 1_000_000, 1)
    print(f"[aquin] Pushing '{run_name}' ({megabytes} MB)...")

    try:
        # Step 1: get the VM upload URL from Next.js (auth only, no body)
        query = {
            "run_id": run_id,
            "run_name": run_name,
            "base_model": meta.get("base_model", ""),
        }
        auth_resp = requests.get(
            f"{base_url}/api/sdk/push",
            params=query,
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=15,
        )
        auth_resp.raise_for_status()
        upload = auth_resp.json()

        # Step 2: push tar.gz directly to the VM (bypasses Vercel body limit)
        byte_count = archive.stat().st_size
        with open(archive, "rb") as payload:
            # The open file object is passed as the request body.
            resp = requests.post(
                upload["url"],
                data=payload,
                headers={**upload["headers"], "Content-Length": str(byte_count)},
                timeout=600,
            )
        resp.raise_for_status()
        print("[aquin] Run pushed successfully.")
        print("[aquin] Open Aquin, your run will appear as a notification.")
    except Exception as exc:
        print(f"[aquin] Push failed: {exc}")
        sys.exit(1)
136
+
137
+
138
def cmd_whoami(args: list[str]) -> None:
    """Print the account that the saved API key belongs to.

    *args* is accepted for a uniform command signature and is not used.
    The key is resolved via ``_get_api_key()`` and sent to the
    ``/api/sdk/whoami`` endpoint.  The response's ``email`` is shown,
    falling back to ``user_id`` and finally ``?``.

    On any failure the error is printed and the process exits with status 1,
    matching ``cmd_push``, so scripts can detect a failed lookup.
    """
    import requests

    key = _get_api_key()
    # AQUIN_BASE_URL lets the API host be overridden via the environment.
    base_url = os.environ.get("AQUIN_BASE_URL", _BASE_URL).rstrip("/")
    try:
        resp = requests.get(
            f"{base_url}/api/sdk/whoami",
            headers={"Authorization": f"Bearer {key}"},
            timeout=10,
        )
        resp.raise_for_status()
        data = resp.json()
        print(f"Logged in as: {data.get('email', data.get('user_id', '?'))}")
    except Exception as exc:
        print(f"Error: {exc}")
        # Previously the command reported the error but still exited 0.
        sys.exit(1)
154
+
155
+
156
def cmd_help(args: list[str]) -> None:
    """Print the CLI usage summary to standard output.

    *args* is accepted for a uniform command signature and is not used.
    """
    lines = (
        "Aquin CLI",
        "",
        "Usage: aquin <command>",
        "",
        "Commands:",
        " login Save your Aquin API key",
        " package Bundle aquin_run/ into aquin_run.tar.gz",
        " push Push aquin_run.tar.gz to Aquin",
        " whoami Show the currently logged in account",
        " help Show this help message",
        "",
        "Typical workflow:",
        " 1. Add aquin.init() / run.log() / run.finish() to your training script",
        " 2. Run your training script",
        " 3. aquin package",
        " 4. aquin push",
    )
    print("\n".join(lines))
173
+
174
+
175
def main() -> None:
    """Entry point for the ``aquin`` console script.

    Dispatches on the first command-line argument and forwards the remaining
    arguments to the matching ``cmd_*`` function.  With no arguments the help
    text is shown; an unknown command prints a hint and exits with status 1.
    """
    argv = sys.argv[1:]
    if not argv:
        cmd_help([])
        return

    name, rest = argv[0], argv[1:]
    handlers = {
        "login": cmd_login,
        "package": cmd_package,
        "push": cmd_push,
        "whoami": cmd_whoami,
        "help": cmd_help,
        "--help": cmd_help,
        "-h": cmd_help,
    }
    handler = handlers.get(name)
    if handler is None:
        print(f"Unknown command: {name}")
        print("Run aquin help to see available commands.")
        sys.exit(1)
    handler(rest)
@@ -0,0 +1,197 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import threading
6
+ import time
7
+ import uuid
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+
12
def init(
    base_model: str | None = None,
    run_name: str | None = None,
    config: dict[str, Any] | None = None,
) -> "Run":
    """Create and return a ``Run`` that records a training run.

    Call this before the training loop.  All three arguments are forwarded
    unchanged to the ``Run`` constructor.
    """
    recorder = Run(base_model=base_model, run_name=run_name, config=config)
    return recorder
19
+
20
+
21
class Run:
    """
    Records metrics and checkpoints locally during training.
    Call aquin.init() to create one.

    All data is written below ``aquin_run/`` in the current working
    directory: ``meta.json`` (written on construction), ``config.json``
    (when a config is supplied), ``metrics.json`` (written by
    ``checkpoint()`` and ``finish()``) and ``checkpoints/checkpoint.pt``.

    After training, use the CLI:
        aquin package   # then: aquin push

    Example
    -------
        run = aquin.init(
            base_model="meta-llama/Llama-3.2-1B-Instruct",
            run_name="my-run",
            config={"lr": 2e-4, "epochs": 3, "rank": 16, "lora_alpha": 32,
                    "method": "qlora", "per_device_train_batch_size": 2,
                    "gradient_accumulation_steps": 8, "dataset": "data.jsonl"},
        )
        for step, batch in enumerate(dataloader):
            loss = train_step(batch)
            run.log(step, loss=loss.item(), learning_rate=scheduler.get_last_lr()[0],
                    grad_norm=grad_norm, epoch=epoch)
        run.checkpoint(model, step=step)
        run.finish()
    """

    def __init__(
        self,
        base_model: str | None = None,
        run_name: str | None = None,
        config: dict[str, Any] | None = None,
    ) -> None:
        """
        Create the run directory and write the initial metadata.

        Args
        ----
        base_model: model identifier; falls back to the AQUIN_BASE_MODEL
                    environment variable, then to an empty string
        run_name  : display name; defaults to "run-" plus the first eight
                    characters of the generated run id
        config    : training hyperparameters; written to config.json only
                    when non-empty
        """
        self._run_id = str(uuid.uuid4())
        self._run_name = run_name or f"run-{self._run_id[:8]}"
        self._base_model = base_model or os.environ.get("AQUIN_BASE_MODEL") or ""
        self._started_at = time.time()

        self._run_dir = Path.cwd() / "aquin_run"
        self._run_dir.mkdir(exist_ok=True)
        (self._run_dir / "checkpoints").mkdir(exist_ok=True)

        # Metrics are buffered in memory and only written by _write_metrics();
        # the lock guards the list against concurrent log() calls.
        self._metrics: list[dict[str, Any]] = []
        self._metrics_lock = threading.Lock()

        self._write_meta()
        if config:
            self._write_config(config)

        print(f"[aquin] Recording run '{self._run_name}' (id: {self._run_id})")
        print(f"[aquin] Run data will be saved to {self._run_dir}")

    # ── Public API ──────────────────────────────────────────────────────────────

    def log(
        self,
        step: int,
        *,
        loss: float,
        learning_rate: float | None = None,
        grad_norm: float | None = None,
        momentum_norm: float | None = None,
        epoch: int | None = None,
        batch: int | None = None,
        total_batches: int | None = None,
        step_ms: float | None = None,
        **extra: float,
    ) -> None:
        """
        Record metrics for one training step. Call this every step inside your loop.

        The entry is only appended to the in-memory buffer; nothing is written
        to disk until checkpoint() or finish() is called.

        Required
        --------
        step : global training step index (0-based or 1-based, be consistent)
        loss : scalar training loss for this step

        Recommended
        -----------
        learning_rate : current LR from your scheduler
        grad_norm     : gradient norm — compute with torch.nn.utils.clip_grad_norm_()
        momentum_norm : optimizer momentum norm — enables Momentum chart in the UI
                        for AdamW: sum(p.exp_avg.norm() for p in optimizer.state.values())
        epoch         : current epoch number — enables epoch summary table in the UI
        batch         : batch index within the epoch
        total_batches : total batches per epoch
        step_ms       : wall-clock time for this step in milliseconds

        Any additional keyword arguments are stored alongside the standard metrics.
        Arguments left as None are omitted from the stored entry.
        """
        entry: dict[str, Any] = {"step": step, "metrics": {"loss": loss}}
        if learning_rate is not None:
            entry["metrics"]["learning_rate"] = learning_rate
        if grad_norm is not None:
            entry["metrics"]["grad_norm"] = grad_norm
        if momentum_norm is not None:
            entry["metrics"]["momentum_norm"] = momentum_norm
        if step_ms is not None:
            entry["metrics"]["step_ms"] = step_ms
        if extra:
            entry["metrics"].update(extra)
        # epoch / batch / total_batches are stored next to "metrics", not inside it.
        if epoch is not None:
            entry["epoch"] = epoch
        if batch is not None:
            entry["batch"] = batch
        if total_batches is not None:
            entry["total_batches"] = total_batches

        with self._metrics_lock:
            self._metrics.append(entry)

    def checkpoint(self, model: Any, step: int) -> None:
        """
        Save the final model checkpoint. Call once at the end of training.
        One checkpoint per run — always replaces the previous save.

        The metrics buffered so far are written to disk as well.

        Args
        ----
        model: a PyTorch nn.Module
        step : the training step this checkpoint corresponds to

        Raises
        ------
        ImportError: if torch cannot be imported (the original import error
                     is attached as the cause)
        """
        try:
            import torch
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError("torch is required to save checkpoints.") from exc

        ckpt_path = self._run_dir / "checkpoints" / "checkpoint.pt"
        print(f"[aquin] Saving checkpoint (step {step})...")
        torch.save({"step": step, "state_dict": model.state_dict()}, ckpt_path)
        self._write_metrics()
        print(f"[aquin] Checkpoint saved (step {step}).")

    def finish(self, config: dict[str, Any] | None = None) -> None:
        """
        Finalise the run. Flushes all metrics and optional config to disk.

        If you passed config to aquin.init() you don't need to pass it again here.
        After this, run: aquin package   # then: aquin push

        Args
        ----
        config: training hyperparameters to store alongside the run.
                Recognised keys (all optional):
                  lr, epochs, rank, lora_alpha, method,
                  per_device_train_batch_size, gradient_accumulation_steps,
                  dataset, max_seq_len, optimizer, scheduler,
                  dropout, weight_decay, grad_clip
        """
        self._write_metrics()
        if config:
            self._write_config(config)
        elapsed = round(time.time() - self._started_at)
        print(f"[aquin] Run finished. {len(self._metrics)} steps recorded in {elapsed}s.")
        print("[aquin] Push to Aquin with: aquin package   # then: aquin push")

    @property
    def run_id(self) -> str:
        """The UUID string generated for this run."""
        return self._run_id

    # ── Internal ────────────────────────────────────────────────────────────────

    def _write_meta(self) -> None:
        """Write the run's identifying metadata to meta.json."""
        meta = {
            "run_id": self._run_id,
            "run_name": self._run_name,
            "base_model": self._base_model,
            "started_at": self._started_at,
        }
        with open(self._run_dir / "meta.json", "w") as f:
            json.dump(meta, f, indent=2)

    def _write_metrics(self) -> None:
        """Write every buffered metric entry to metrics.json, replacing the file."""
        # Copy under the lock, serialise outside it so log() is not blocked by I/O.
        with self._metrics_lock:
            data = list(self._metrics)
        with open(self._run_dir / "metrics.json", "w") as f:
            json.dump(data, f)

    def _write_config(self, config: dict[str, Any]) -> None:
        """Write *config* to config.json, replacing any previous content."""
        with open(self._run_dir / "config.json", "w") as f:
            json.dump(config, f, indent=2)
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: aquin
3
+ Version: 0.0.1
4
+ Summary: Record training runs locally and push to Aquin for post-hoc inspection — loss curves, SAE diffs, model diffs, and more.
5
+ License: MIT
6
+ Project-URL: Homepage, https://aquin.app
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: requests>=2.28
10
+
11
+ # Aquin SDK
12
+
13
+ Record your training runs locally and push them to [Aquin](https://aquin.app) for post-hoc inspection — loss curves, learning rate, grad norm, epoch summaries, SAE feature diffs, model behaviour diffs, and more.
14
+
15
+ ## Install
16
+
17
+ ```bash
18
+ pip install aquin
19
+ ```
20
+
21
+ ## Quickstart
22
+
23
+ ```python
24
+ import aquin
25
+
26
+ run = aquin.init(
27
+ base_model="meta-llama/Llama-3.2-1B-Instruct",
28
+ run_name="my-lora-run",
29
+ config={
30
+ "lr": 2e-4, "epochs": 3, "rank": 16, "lora_alpha": 32,
31
+ "method": "qlora", "per_device_train_batch_size": 2,
32
+ "gradient_accumulation_steps": 8, "dataset": "data.jsonl",
33
+ },
34
+ )
35
+
36
+ for epoch in range(3):
37
+ for step, batch in enumerate(dataloader):
38
+ loss = train_step(batch)
39
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0).item()
40
+ run.log(
41
+ step,
42
+ loss=loss.item(),
43
+ learning_rate=scheduler.get_last_lr()[0],
44
+ grad_norm=grad_norm,
45
+ epoch=epoch,
46
+ )
47
+
48
+ run.checkpoint(model, step=step)
49
+ run.finish()
50
+ ```
51
+
52
+ Then push:
53
+
54
+ ```bash
55
+ aquin package
56
+ aquin push
57
+ ```
58
+
59
+ Your run appears in the Aquin dashboard under **CLI runs** with the full inspection suite.
60
+
61
+ ## API
62
+
63
+ ### `aquin.init(base_model, run_name, config)`
64
+
65
+ Starts a new run. Creates `aquin_run/` in the current directory.
66
+
67
+ | Param | Description |
68
+ |---|---|
69
+ | `base_model` | HuggingFace model ID, e.g. `"meta-llama/Llama-3.2-1B-Instruct"` |
70
+ | `run_name` | Display name for the run |
71
+ | `config` | Dict of training hyperparameters (optional, can also pass to `finish()`) |
72
+
73
+ ### `run.log(step, *, loss, ...)`
74
+
75
+ Record metrics for one training step. Call every step inside your loop.
76
+
77
+ | Param | Description |
78
+ |---|---|
79
+ | `step` | Global training step (required) |
80
+ | `loss` | Scalar training loss (required) |
81
+ | `learning_rate` | Current LR — enables LR chart |
82
+ | `grad_norm` | Gradient norm — enables grad norm chart |
83
+ | `epoch` | Current epoch — enables epoch summary table |
84
+ | `momentum_norm` | Optimizer momentum norm — enables momentum chart |
85
+ | `step_ms` | Wall-clock time for this step in ms |
86
+
87
+ ### `run.checkpoint(model, step)`
88
+
89
+ Saves the model checkpoint locally. One checkpoint per run — always replaces the previous save. Call once at the end of training. The checkpoint is included in the push and used for SAE diff and model diff analysis.
90
+
91
+ ### `run.finish(config)`
92
+
93
+ Flushes all metrics to disk. Pass `config` here if you didn't pass it to `aquin.init()`.
94
+
95
+ ## CLI
96
+
97
+ ```bash
98
+ aquin login # save your API key
99
+ aquin package # bundle aquin_run/ into aquin_run.tar.gz
100
+ aquin push # push to Aquin
101
+ aquin whoami # check which account you're logged in as
102
+ ```
103
+
104
+ ## Using with HuggingFace Trainer / TRL
105
+
106
+ Use a `TrainerCallback` to hook into the training loop:
107
+
108
+ ```python
109
+ import time
110
+ from transformers import TrainerCallback
111
+
112
+ class AquinCallback(TrainerCallback):
113
+ def __init__(self, run):
114
+ self.run = run
115
+ self._step_start = 0.0
116
+
117
+ def on_step_begin(self, args, state, control, **kwargs):
118
+ self._step_start = time.time()
119
+
120
+ def on_log(self, args, state, control, logs=None, **kwargs):
121
+ if not logs or "loss" not in logs:
122
+ return
123
+ self.run.log(
124
+ step=state.global_step,
125
+ loss=float(logs["loss"]),
126
+ learning_rate=float(logs["learning_rate"]) if "learning_rate" in logs else None,
127
+ grad_norm=float(logs["grad_norm"]) if "grad_norm" in logs else None,
128
+ epoch=int(state.epoch) if state.epoch is not None else None,
129
+ step_ms=round((time.time() - self._step_start) * 1000),
130
+ )
131
+
132
+ def on_train_end(self, args, state, control, **kwargs):
133
+ model = kwargs.get("model")
134
+ if model:
135
+ self.run.checkpoint(model, step=state.global_step)
136
+ ```
@@ -0,0 +1,11 @@
1
+ README.md
2
+ pyproject.toml
3
+ aquin/__init__.py
4
+ aquin/cli.py
5
+ aquin/run.py
6
+ aquin.egg-info/PKG-INFO
7
+ aquin.egg-info/SOURCES.txt
8
+ aquin.egg-info/dependency_links.txt
9
+ aquin.egg-info/entry_points.txt
10
+ aquin.egg-info/requires.txt
11
+ aquin.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ aquin = aquin.cli:main
@@ -0,0 +1 @@
1
+ requests>=2.28
@@ -0,0 +1 @@
1
+ aquin
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "aquin"
7
+ version = "0.0.1"
8
+ description = "Record training runs locally and push to Aquin for post-hoc inspection — loss curves, SAE diffs, model diffs, and more."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ dependencies = [
13
+ "requests>=2.28",
14
+ ]
15
+
16
+ [project.scripts]
17
+ aquin = "aquin.cli:main"
18
+
19
+ [project.urls]
20
+ Homepage = "https://aquin.app"
21
+
22
+ [tool.setuptools.packages.find]
23
+ where = ["."]
24
+ include = ["aquin*"]
aquin-0.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+