tlog-ml 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tlog_ml-0.1.0/LICENSE +21 -0
- tlog_ml-0.1.0/PKG-INFO +266 -0
- tlog_ml-0.1.0/README.md +239 -0
- tlog_ml-0.1.0/pyproject.toml +43 -0
- tlog_ml-0.1.0/setup.cfg +4 -0
- tlog_ml-0.1.0/tests/test_media_export.py +76 -0
- tlog_ml-0.1.0/tests/test_run.py +135 -0
- tlog_ml-0.1.0/tests/test_store.py +97 -0
- tlog_ml-0.1.0/tests/test_tui_keys.py +72 -0
- tlog_ml-0.1.0/tests/test_tui_layout.py +65 -0
- tlog_ml-0.1.0/tests/test_writer.py +57 -0
- tlog_ml-0.1.0/tlog/__init__.py +89 -0
- tlog_ml-0.1.0/tlog/cli.py +183 -0
- tlog_ml-0.1.0/tlog/console.py +71 -0
- tlog_ml-0.1.0/tlog/export.py +93 -0
- tlog_ml-0.1.0/tlog/frontend/app.js +494 -0
- tlog_ml-0.1.0/tlog/frontend/index.html +29 -0
- tlog_ml-0.1.0/tlog/frontend/style.css +181 -0
- tlog_ml-0.1.0/tlog/frontend/vendor/uplot.min.css +1 -0
- tlog_ml-0.1.0/tlog/frontend/vendor/uplot.min.js +2 -0
- tlog_ml-0.1.0/tlog/media.py +113 -0
- tlog_ml-0.1.0/tlog/meta.py +152 -0
- tlog_ml-0.1.0/tlog/payload.py +72 -0
- tlog_ml-0.1.0/tlog/run.py +282 -0
- tlog_ml-0.1.0/tlog/server.py +133 -0
- tlog_ml-0.1.0/tlog/store.py +354 -0
- tlog_ml-0.1.0/tlog/system.py +132 -0
- tlog_ml-0.1.0/tlog/tui.py +446 -0
- tlog_ml-0.1.0/tlog/writer.py +95 -0
- tlog_ml-0.1.0/tlog_ml.egg-info/PKG-INFO +266 -0
- tlog_ml-0.1.0/tlog_ml.egg-info/SOURCES.txt +33 -0
- tlog_ml-0.1.0/tlog_ml.egg-info/dependency_links.txt +1 -0
- tlog_ml-0.1.0/tlog_ml.egg-info/entry_points.txt +2 -0
- tlog_ml-0.1.0/tlog_ml.egg-info/requires.txt +3 -0
- tlog_ml-0.1.0/tlog_ml.egg-info/top_level.txt +1 -0
tlog_ml-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Philippe Hansen-Estruch
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
tlog_ml-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tlog-ml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight, local-first experiment logger for neural network training — wandb-shaped API, zero deps, terminal + HTML + web viewers
|
|
5
|
+
Author: Philippe Hansen-Estruch
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/philippe-eecs/tlog
|
|
8
|
+
Project-URL: Issues, https://github.com/philippe-eecs/tlog/issues
|
|
9
|
+
Keywords: experiment-tracking,logging,machine-learning,wandb,slurm,tui
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: System :: Logging
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# tlog
|
|
29
|
+
|
|
30
|
+
A lightweight, local-first experiment logger for neural network training.
|
|
31
|
+
wandb-shaped API, **zero dependencies** in your training environment, and three
|
|
32
|
+
clean ways to look at your runs from a SLURM cluster with nothing but a
|
|
33
|
+
terminal:
|
|
34
|
+
|
|
35
|
+
| viewer | command | when |
|
|
36
|
+
|---|---|---|
|
|
37
|
+
| **terminal dashboard** | `tlog watch` | live charts in a tmux pane — the default |
|
|
38
|
+
| **live web dashboard** | `tlog serve` | wandb-like browser UI through an SSH/VS Code port-forward |
|
|
39
|
+
| **self-contained HTML** | `tlog export -o report.html` | one file with charts + images; preview in VS Code, scp it, share it |
|
|
40
|
+
|
|
41
|
+
Everything is plain append-only JSONL in a run directory: grep-able,
|
|
42
|
+
rsync-able, crash-safe, no daemon, no cloud, no account.
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
● demo/baseline (da064b) · step 1500 · finished
|
|
46
|
+
loss eval training timing memory console
|
|
47
|
+
|
|
48
|
+
loss/charb 0.3158 loss/dino 0.07182
|
|
49
|
+
1.552 ┤⡧⣼ 0.3882 ┤⡧⣼
|
|
50
|
+
│⠇⢹⢿⣠⢀ │⡇⢹⣶⣀⣀
|
|
51
|
+
│ ⠹⢹⠢⣧⣄⣀ │ ⠛⢹⠢⡦⣆⢀
|
|
52
|
+
│ ⠁⠋⠋⠳⢶⢤⡀ │ ⠋⠉⠢⣴⣀⣀
|
|
53
|
+
│ ⠘⠙⠦⠦⢴⢄⣀⡀ │ ⠘⠙⢢⡧⢦⣀⢀
|
|
54
|
+
│ ⠉⠙⠛⠓⠶⠤⣤⣠⣠⣀⡀ │ ⠉⠙⠋⠳⠶⢤⣤⣴⣠⡄⡀
|
|
55
|
+
0.2771 ┤ ⠉⠙⠉⠛⠋⠓⠲⠚⠴⠖⠤⠤⠦⡦ 0.06998 ┤ ⠁⠙⠉⠋⠋⠑⠳⠒⠲⠶⠤⡶⣦⣦
|
|
56
|
+
10 1490 10 1490
|
|
57
|
+
|
|
58
|
+
loss/ssim 0.131 loss/total 0.5004
|
|
59
|
+
0.6492 ┤⡧⣼ 2.605 ┤⡧⣼
|
|
60
|
+
│⠇⢹⢶⣀⢀ │⠇⢹⢶⣀
|
|
61
|
+
│ ⠹⠹⠦⣶⣄⣀ │ ⠹⢹⠢⣦⣄⣀
|
|
62
|
+
│ ⠛⠋⠣⣴⣠⡀ │ ⠋⠋⠲⣴⢤⣀
|
|
63
|
+
│ ⠈⠘⠙⠲⠦⢤⣀⢀ │ ⠘⠙⠦⠦⢤⣀⣀
|
|
64
|
+
│ ⠉⠙⠋⠓⢴⠤⢤⣤⣠⡀⡀ │ ⠈⠙⠋⠲⠴⠤⣤⣠⢠⡀⡀
|
|
65
|
+
0.1174 ┤ ⠁⠉⠉⠛⠊⠛⠲⠖⠴⠒⠤⡴⠦⣤ 0.4672 ┤ ⠉⠉⠉⠛⠊⠛⠲⠖⠴⠖⠤⠤⠦⣦
|
|
66
|
+
10 1490 10 1490
|
|
67
|
+
|
|
68
|
+
←/→ pages · ↑/↓ scroll · 1-9 cols (auto) · s smooth (0) · l log (off) · q quit
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
*An actual `tlog watch` frame — braille-canvas charts in a plain tmux pane.*
|
|
72
|
+
|
|
73
|
+
## Install
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install tlog-ml # distribution is tlog-ml; you still `import tlog`
|
|
77
|
+
# or for development:
|
|
78
|
+
git clone https://github.com/philippe-eecs/tlog && cd tlog
|
|
79
|
+
pip install -e ".[dev]"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
The core has **zero dependencies** — nothing to conflict with your torch/jax
|
|
83
|
+
pins. PIL is used opportunistically if present (image encoding, report
|
|
84
|
+
downscaling); otherwise a pure-stdlib PNG encoder takes over.
|
|
85
|
+
|
|
86
|
+
## Quickstart
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
import tlog
|
|
90
|
+
|
|
91
|
+
run = tlog.init(project="vitok", name="vae-L16", config=vars(args))
|
|
92
|
+
|
|
93
|
+
for step in range(steps):
|
|
94
|
+
...
|
|
95
|
+
if step % log_freq == 0:
|
|
96
|
+
tlog.log({"loss/total": loss, "training/lr": lr,
|
|
97
|
+
"timing/mfu_percent": mfu}, step=step)
|
|
98
|
+
if step % eval_freq == 0:
|
|
99
|
+
tlog.log({f"eval/{k}": v for k, v in eval_stats.items()}, step=step)
|
|
100
|
+
tlog.log_images("eval/recon", [orig, recon], step=step) # torch/np/PIL
|
|
101
|
+
|
|
102
|
+
tlog.finish()
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Then, in another tmux pane:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
tlog # == tlog watch: live dashboard of the latest run
|
|
109
|
+
tlog ls # table of runs: step, last loss, slurm job, status
|
|
110
|
+
tlog tail # live captured console output of the latest run
|
|
111
|
+
tlog serve # web UI on :8585 (VS Code auto-forwards the port)
|
|
112
|
+
tlog export run-a run-b -o compare.html # side-by-side report
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Key namespaces (`loss/`, `eval/`, `timing/`, ...) become chart groups / TUI
|
|
116
|
+
pages automatically.
|
|
117
|
+
|
|
118
|
+
## What gets captured
|
|
119
|
+
|
|
120
|
+
`tlog.init()` records, without being asked:
|
|
121
|
+
|
|
122
|
+
- **SLURM**: job id, job name, partition, nodelist, array task id, and the
|
|
123
|
+
actual `sbatch` script that launched the job (saved as `launch.sh`)
|
|
124
|
+
- **git**: commit, branch, dirty flag, and a `diff.patch` of uncommitted changes
|
|
125
|
+
- **environment**: argv, entrypoint, hostname, user, python/torch/CUDA
|
|
126
|
+
versions, GPU models, world size
|
|
127
|
+
- **system metrics** (background thread, 10s interval): GPU util/mem/temp/power
|
|
128
|
+
per device via nvidia-smi, CPU%, RAM — shown as their own chart groups
|
|
129
|
+
- **console**: stdout/stderr teed to `console.log` (tqdm-safe; viewers resolve
|
|
130
|
+
`\r` overwrites)
|
|
131
|
+
|
|
132
|
+
## How it works
|
|
133
|
+
|
|
134
|
+
tlog is two decoupled halves that only meet at the filesystem: a **write
|
|
135
|
+
path** that lives inside your training process, and a **read path** (the
|
|
136
|
+
viewers) that runs anywhere that can see the same disk. There is no daemon,
|
|
137
|
+
no database, no socket between them — a run *is* a directory:
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
runs/<project>/<name>__<timestamp>__<id>/
|
|
141
|
+
├── meta.json # identity + environment snapshot + restart history
|
|
142
|
+
├── config.json # your hyperparameters (vars(args))
|
|
143
|
+
├── metrics.jsonl # one JSON object per log() call, append-only
|
|
144
|
+
├── system.jsonl # sampled GPU/CPU/RAM
|
|
145
|
+
├── console.log # teed stdout/stderr
|
|
146
|
+
├── launch.sh # captured sbatch script (under SLURM)
|
|
147
|
+
├── diff.patch # uncommitted git changes
|
|
148
|
+
└── media/ # PNGs + index.jsonl mapping them to (key, step)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### The write path never blocks training
|
|
152
|
+
|
|
153
|
+
`log()` serializes one JSON line and appends it. Lines are written whole and
|
|
154
|
+
flushed, so a crash loses at most the line in flight and can never corrupt
|
|
155
|
+
history; `fsync` runs on a 30s timer to bound hard-failure data loss without
|
|
156
|
+
paying sync cost per step. Everything slow happens off the hot path: git
|
|
157
|
+
diff / nvidia-smi / `scontrol` captures run in a background thread after
|
|
158
|
+
init, system sampling and the liveness heartbeat are daemon threads, and
|
|
159
|
+
framework versions are read from `sys.modules` instead of importing anything.
|
|
160
|
+
|
|
161
|
+
### Preemption-safe by construction
|
|
162
|
+
|
|
163
|
+
SLURM requeues a preempted job with the same job id and bumps
|
|
164
|
+
`SLURM_RESTART_COUNT`. `init(resume="auto")` (the default) detects that,
|
|
165
|
+
finds the run directory it created before the preemption, and keeps
|
|
166
|
+
appending — recording a restart event in `meta.json`. Restarting from an
|
|
167
|
+
older checkpoint re-logs some steps; instead of rewriting files (dangerous),
|
|
168
|
+
**readers keep the last value logged per (metric, step)**, so charts come out
|
|
169
|
+
continuous and the storage stays strictly append-only. Explicit resume:
|
|
170
|
+
`tlog.init(id="a1b2c3", resume="must")`.
|
|
171
|
+
|
|
172
|
+
### The read path is one engine with three faces
|
|
173
|
+
|
|
174
|
+
`store.py` discovers runs, tails JSONL incrementally (remembering byte
|
|
175
|
+
offsets, parsing only complete new lines), applies keep-last dedup, and
|
|
176
|
+
downsamples with **min/max/mean buckets** — a one-step loss spike survives
|
|
177
|
+
being squeezed into a 200-px chart instead of being averaged away. Debiased
|
|
178
|
+
EMA smoothing (same formula as wandb) sits on top. The three viewers are just
|
|
179
|
+
renderers over this engine:
|
|
180
|
+
|
|
181
|
+
- **TUI**: each terminal cell is a 2×4 braille dot grid, so a tmux pane
|
|
182
|
+
becomes a pixel canvas; charts are drawn with Bresenham lines and repainted
|
|
183
|
+
on the alternate screen buffer. Pure ANSI — no curses, works over any SSH.
|
|
184
|
+
- **Web**: a stdlib `ThreadingHTTPServer` with JSON endpoints; the browser
|
|
185
|
+
polls every 3s and refetches only runs whose files changed (mtime-keyed).
|
|
186
|
+
- **Export**: the *same* frontend with data, images (base64), and uPlot
|
|
187
|
+
inlined into one HTML file. One codebase, a mode flag, two surfaces.
|
|
188
|
+
|
|
189
|
+
### Liveness without IPC
|
|
190
|
+
|
|
191
|
+
A daemon thread touches `heartbeat` every 15s. Viewers call a run *running*
|
|
192
|
+
if the heartbeat is fresh, *finished* if `finish()` marked it, and *dead* if
|
|
193
|
+
neither — which is how a SIGKILLed job shows up correctly with no process
|
|
194
|
+
ever being asked.
|
|
195
|
+
|
|
196
|
+
## Distributed training
|
|
197
|
+
|
|
198
|
+
`tlog.init()` is a no-op on non-zero ranks (it checks the `RANK` env var set
|
|
199
|
+
by torchrun/SLURM), so you can call it unguarded — or keep your existing
|
|
200
|
+
`if rank == 0:` guard; both are fine.
|
|
201
|
+
|
|
202
|
+
## Migrating from wandb
|
|
203
|
+
|
|
204
|
+
```diff
|
|
205
|
+
-import wandb
|
|
206
|
+
+import tlog
|
|
207
|
+
|
|
208
|
+
-wandb.init(project=args.project, name=args.name, config=vars(args))
|
|
209
|
+
+tlog.init(project=args.project, name=args.name, config=vars(args))
|
|
210
|
+
|
|
211
|
+
-wandb.log(avg, step=step)
|
|
212
|
+
+tlog.log(avg, step=step)
|
|
213
|
+
|
|
214
|
+
-wandb.finish()
|
|
215
|
+
+tlog.finish()
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
Runs land in `./runs` by default; set `TLOG_DIR=/scratch/$USER/runs` (or pass
|
|
219
|
+
`dir=`) to keep them on scratch.
|
|
220
|
+
|
|
221
|
+
## The viewers in detail
|
|
222
|
+
|
|
223
|
+
**`tlog watch [run]`** — braille line charts with min/max bands, one page per
|
|
224
|
+
metric group plus a console page; the grid auto-sizes to the pane and scrolls
|
|
225
|
+
when a group has more charts than fit. Keys: `←/→` pages · `↑/↓` (or `j/k`)
|
|
226
|
+
scroll charts / console history · `1`–`9` force column count, `0` auto (or
|
|
227
|
+
`--cols N`) · `s` smoothing (EMA 0 → 0.6 → 0.9 → 0.99) · `l` log scale ·
|
|
228
|
+
`q` quit.
|
|
229
|
+
|
|
230
|
+
**`tlog serve [root]`** — open `http://localhost:8585` through VS Code Remote
|
|
231
|
+
(auto port-forward) or `ssh -L 8585:localhost:8585 cluster`. Multi-run
|
|
232
|
+
overlay charts with synced cursors, smoothing slider, log scale, a media tab
|
|
233
|
+
laid out **runs-as-columns × steps-as-rows** for side-by-side recon/eval
|
|
234
|
+
comparison, a config tab that highlights differing hyperparameters, and live
|
|
235
|
+
console.
|
|
236
|
+
|
|
237
|
+
**`tlog export <runs...> -o report.html`** — the same UI frozen into a single
|
|
238
|
+
file (images downscaled to ≤512px by default; `--max-image-px 0` keeps
|
|
239
|
+
originals). No server, no internet — works in VS Code's HTML preview.
|
|
240
|
+
|
|
241
|
+
## Demo without a GPU
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
python examples/fake_train.py --steps 2000 &   # run from a git checkout; examples/ is not shipped in the sdist
|
|
245
|
+
tlog watch
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Prior art
|
|
249
|
+
|
|
250
|
+
[trackio](https://github.com/gradio-app/trackio), [aim](https://github.com/aimhubio/aim),
|
|
251
|
+
TensorBoard, and MLflow all live in adjacent spaces. tlog's niche is the
|
|
252
|
+
combination: a zero-dependency stdlib-only core safe to drop into any
|
|
253
|
+
training env, files you can grep as the source of truth, SLURM-native
|
|
254
|
+
metadata + preemption semantics, a terminal dashboard designed for a tmux
|
|
255
|
+
pane on a GPU cluster, and single-file HTML reports — in ~2,700 lines of
|
|
256
|
+
Python you can read in an afternoon.
|
|
257
|
+
|
|
258
|
+
## Tests
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
python -m pytest tests/
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
## License
|
|
265
|
+
|
|
266
|
+
MIT
|
tlog_ml-0.1.0/README.md
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
# tlog
|
|
2
|
+
|
|
3
|
+
A lightweight, local-first experiment logger for neural network training.
|
|
4
|
+
wandb-shaped API, **zero dependencies** in your training environment, and three
|
|
5
|
+
clean ways to look at your runs from a SLURM cluster with nothing but a
|
|
6
|
+
terminal:
|
|
7
|
+
|
|
8
|
+
| viewer | command | when |
|
|
9
|
+
|---|---|---|
|
|
10
|
+
| **terminal dashboard** | `tlog watch` | live charts in a tmux pane — the default |
|
|
11
|
+
| **live web dashboard** | `tlog serve` | wandb-like browser UI through an SSH/VS Code port-forward |
|
|
12
|
+
| **self-contained HTML** | `tlog export -o report.html` | one file with charts + images; preview in VS Code, scp it, share it |
|
|
13
|
+
|
|
14
|
+
Everything is plain append-only JSONL in a run directory: grep-able,
|
|
15
|
+
rsync-able, crash-safe, no daemon, no cloud, no account.
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
● demo/baseline (da064b) · step 1500 · finished
|
|
19
|
+
loss eval training timing memory console
|
|
20
|
+
|
|
21
|
+
loss/charb 0.3158 loss/dino 0.07182
|
|
22
|
+
1.552 ┤⡧⣼ 0.3882 ┤⡧⣼
|
|
23
|
+
│⠇⢹⢿⣠⢀ │⡇⢹⣶⣀⣀
|
|
24
|
+
│ ⠹⢹⠢⣧⣄⣀ │ ⠛⢹⠢⡦⣆⢀
|
|
25
|
+
│ ⠁⠋⠋⠳⢶⢤⡀ │ ⠋⠉⠢⣴⣀⣀
|
|
26
|
+
│ ⠘⠙⠦⠦⢴⢄⣀⡀ │ ⠘⠙⢢⡧⢦⣀⢀
|
|
27
|
+
│ ⠉⠙⠛⠓⠶⠤⣤⣠⣠⣀⡀ │ ⠉⠙⠋⠳⠶⢤⣤⣴⣠⡄⡀
|
|
28
|
+
0.2771 ┤ ⠉⠙⠉⠛⠋⠓⠲⠚⠴⠖⠤⠤⠦⡦ 0.06998 ┤ ⠁⠙⠉⠋⠋⠑⠳⠒⠲⠶⠤⡶⣦⣦
|
|
29
|
+
10 1490 10 1490
|
|
30
|
+
|
|
31
|
+
loss/ssim 0.131 loss/total 0.5004
|
|
32
|
+
0.6492 ┤⡧⣼ 2.605 ┤⡧⣼
|
|
33
|
+
│⠇⢹⢶⣀⢀ │⠇⢹⢶⣀
|
|
34
|
+
│ ⠹⠹⠦⣶⣄⣀ │ ⠹⢹⠢⣦⣄⣀
|
|
35
|
+
│ ⠛⠋⠣⣴⣠⡀ │ ⠋⠋⠲⣴⢤⣀
|
|
36
|
+
│ ⠈⠘⠙⠲⠦⢤⣀⢀ │ ⠘⠙⠦⠦⢤⣀⣀
|
|
37
|
+
│ ⠉⠙⠋⠓⢴⠤⢤⣤⣠⡀⡀ │ ⠈⠙⠋⠲⠴⠤⣤⣠⢠⡀⡀
|
|
38
|
+
0.1174 ┤ ⠁⠉⠉⠛⠊⠛⠲⠖⠴⠒⠤⡴⠦⣤ 0.4672 ┤ ⠉⠉⠉⠛⠊⠛⠲⠖⠴⠖⠤⠤⠦⣦
|
|
39
|
+
10 1490 10 1490
|
|
40
|
+
|
|
41
|
+
←/→ pages · ↑/↓ scroll · 1-9 cols (auto) · s smooth (0) · l log (off) · q quit
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
*An actual `tlog watch` frame — braille-canvas charts in a plain tmux pane.*
|
|
45
|
+
|
|
46
|
+
## Install
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install tlog-ml # distribution is tlog-ml; you still `import tlog`
|
|
50
|
+
# or for development:
|
|
51
|
+
git clone https://github.com/philippe-eecs/tlog && cd tlog
|
|
52
|
+
pip install -e ".[dev]"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
The core has **zero dependencies** — nothing to conflict with your torch/jax
|
|
56
|
+
pins. PIL is used opportunistically if present (image encoding, report
|
|
57
|
+
downscaling); otherwise a pure-stdlib PNG encoder takes over.
|
|
58
|
+
|
|
59
|
+
## Quickstart
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
import tlog
|
|
63
|
+
|
|
64
|
+
run = tlog.init(project="vitok", name="vae-L16", config=vars(args))
|
|
65
|
+
|
|
66
|
+
for step in range(steps):
|
|
67
|
+
...
|
|
68
|
+
if step % log_freq == 0:
|
|
69
|
+
tlog.log({"loss/total": loss, "training/lr": lr,
|
|
70
|
+
"timing/mfu_percent": mfu}, step=step)
|
|
71
|
+
if step % eval_freq == 0:
|
|
72
|
+
tlog.log({f"eval/{k}": v for k, v in eval_stats.items()}, step=step)
|
|
73
|
+
tlog.log_images("eval/recon", [orig, recon], step=step) # torch/np/PIL
|
|
74
|
+
|
|
75
|
+
tlog.finish()
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Then, in another tmux pane:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
tlog # == tlog watch: live dashboard of the latest run
|
|
82
|
+
tlog ls # table of runs: step, last loss, slurm job, status
|
|
83
|
+
tlog tail # live captured console output of the latest run
|
|
84
|
+
tlog serve # web UI on :8585 (VS Code auto-forwards the port)
|
|
85
|
+
tlog export run-a run-b -o compare.html # side-by-side report
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Key namespaces (`loss/`, `eval/`, `timing/`, ...) become chart groups / TUI
|
|
89
|
+
pages automatically.
|
|
90
|
+
|
|
91
|
+
## What gets captured
|
|
92
|
+
|
|
93
|
+
`tlog.init()` records, without being asked:
|
|
94
|
+
|
|
95
|
+
- **SLURM**: job id, job name, partition, nodelist, array task id, and the
|
|
96
|
+
actual `sbatch` script that launched the job (saved as `launch.sh`)
|
|
97
|
+
- **git**: commit, branch, dirty flag, and a `diff.patch` of uncommitted changes
|
|
98
|
+
- **environment**: argv, entrypoint, hostname, user, python/torch/CUDA
|
|
99
|
+
versions, GPU models, world size
|
|
100
|
+
- **system metrics** (background thread, 10s interval): GPU util/mem/temp/power
|
|
101
|
+
per device via nvidia-smi, CPU%, RAM — shown as their own chart groups
|
|
102
|
+
- **console**: stdout/stderr teed to `console.log` (tqdm-safe; viewers resolve
|
|
103
|
+
`\r` overwrites)
|
|
104
|
+
|
|
105
|
+
## How it works
|
|
106
|
+
|
|
107
|
+
tlog is two decoupled halves that only meet at the filesystem: a **write
|
|
108
|
+
path** that lives inside your training process, and a **read path** (the
|
|
109
|
+
viewers) that runs anywhere that can see the same disk. There is no daemon,
|
|
110
|
+
no database, no socket between them — a run *is* a directory:
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
runs/<project>/<name>__<timestamp>__<id>/
|
|
114
|
+
├── meta.json # identity + environment snapshot + restart history
|
|
115
|
+
├── config.json # your hyperparameters (vars(args))
|
|
116
|
+
├── metrics.jsonl # one JSON object per log() call, append-only
|
|
117
|
+
├── system.jsonl # sampled GPU/CPU/RAM
|
|
118
|
+
├── console.log # teed stdout/stderr
|
|
119
|
+
├── launch.sh # captured sbatch script (under SLURM)
|
|
120
|
+
├── diff.patch # uncommitted git changes
|
|
121
|
+
└── media/ # PNGs + index.jsonl mapping them to (key, step)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### The write path never blocks training
|
|
125
|
+
|
|
126
|
+
`log()` serializes one JSON line and appends it. Lines are written whole and
|
|
127
|
+
flushed, so a crash loses at most the line in flight and can never corrupt
|
|
128
|
+
history; `fsync` runs on a 30s timer to bound hard-failure data loss without
|
|
129
|
+
paying sync cost per step. Everything slow happens off the hot path: git
|
|
130
|
+
diff / nvidia-smi / `scontrol` captures run in a background thread after
|
|
131
|
+
init, system sampling and the liveness heartbeat are daemon threads, and
|
|
132
|
+
framework versions are read from `sys.modules` instead of importing anything.
|
|
133
|
+
|
|
134
|
+
### Preemption-safe by construction
|
|
135
|
+
|
|
136
|
+
SLURM requeues a preempted job with the same job id and bumps
|
|
137
|
+
`SLURM_RESTART_COUNT`. `init(resume="auto")` (the default) detects that,
|
|
138
|
+
finds the run directory it created before the preemption, and keeps
|
|
139
|
+
appending — recording a restart event in `meta.json`. Restarting from an
|
|
140
|
+
older checkpoint re-logs some steps; instead of rewriting files (dangerous),
|
|
141
|
+
**readers keep the last value logged per (metric, step)**, so charts come out
|
|
142
|
+
continuous and the storage stays strictly append-only. Explicit resume:
|
|
143
|
+
`tlog.init(id="a1b2c3", resume="must")`.
|
|
144
|
+
|
|
145
|
+
### The read path is one engine with three faces
|
|
146
|
+
|
|
147
|
+
`store.py` discovers runs, tails JSONL incrementally (remembering byte
|
|
148
|
+
offsets, parsing only complete new lines), applies keep-last dedup, and
|
|
149
|
+
downsamples with **min/max/mean buckets** — a one-step loss spike survives
|
|
150
|
+
being squeezed into a 200-px chart instead of being averaged away. Debiased
|
|
151
|
+
EMA smoothing (same formula as wandb) sits on top. The three viewers are just
|
|
152
|
+
renderers over this engine:
|
|
153
|
+
|
|
154
|
+
- **TUI**: each terminal cell is a 2×4 braille dot grid, so a tmux pane
|
|
155
|
+
becomes a pixel canvas; charts are drawn with Bresenham lines and repainted
|
|
156
|
+
on the alternate screen buffer. Pure ANSI — no curses, works over any SSH.
|
|
157
|
+
- **Web**: a stdlib `ThreadingHTTPServer` with JSON endpoints; the browser
|
|
158
|
+
polls every 3s and refetches only runs whose files changed (mtime-keyed).
|
|
159
|
+
- **Export**: the *same* frontend with data, images (base64), and uPlot
|
|
160
|
+
inlined into one HTML file. One codebase, a mode flag, two surfaces.
|
|
161
|
+
|
|
162
|
+
### Liveness without IPC
|
|
163
|
+
|
|
164
|
+
A daemon thread touches `heartbeat` every 15s. Viewers call a run *running*
|
|
165
|
+
if the heartbeat is fresh, *finished* if `finish()` marked it, and *dead* if
|
|
166
|
+
neither — which is how a SIGKILLed job shows up correctly with no process
|
|
167
|
+
ever being asked.
|
|
168
|
+
|
|
169
|
+
## Distributed training
|
|
170
|
+
|
|
171
|
+
`tlog.init()` is a no-op on non-zero ranks (it checks the `RANK` env var set
|
|
172
|
+
by torchrun/SLURM), so you can call it unguarded — or keep your existing
|
|
173
|
+
`if rank == 0:` guard; both are fine.
|
|
174
|
+
|
|
175
|
+
## Migrating from wandb
|
|
176
|
+
|
|
177
|
+
```diff
|
|
178
|
+
-import wandb
|
|
179
|
+
+import tlog
|
|
180
|
+
|
|
181
|
+
-wandb.init(project=args.project, name=args.name, config=vars(args))
|
|
182
|
+
+tlog.init(project=args.project, name=args.name, config=vars(args))
|
|
183
|
+
|
|
184
|
+
-wandb.log(avg, step=step)
|
|
185
|
+
+tlog.log(avg, step=step)
|
|
186
|
+
|
|
187
|
+
-wandb.finish()
|
|
188
|
+
+tlog.finish()
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Runs land in `./runs` by default; set `TLOG_DIR=/scratch/$USER/runs` (or pass
|
|
192
|
+
`dir=`) to keep them on scratch.
|
|
193
|
+
|
|
194
|
+
## The viewers in detail
|
|
195
|
+
|
|
196
|
+
**`tlog watch [run]`** — braille line charts with min/max bands, one page per
|
|
197
|
+
metric group plus a console page; the grid auto-sizes to the pane and scrolls
|
|
198
|
+
when a group has more charts than fit. Keys: `←/→` pages · `↑/↓` (or `j/k`)
|
|
199
|
+
scroll charts / console history · `1`–`9` force column count, `0` auto (or
|
|
200
|
+
`--cols N`) · `s` smoothing (EMA 0 → 0.6 → 0.9 → 0.99) · `l` log scale ·
|
|
201
|
+
`q` quit.
|
|
202
|
+
|
|
203
|
+
**`tlog serve [root]`** — open `http://localhost:8585` through VS Code Remote
|
|
204
|
+
(auto port-forward) or `ssh -L 8585:localhost:8585 cluster`. Multi-run
|
|
205
|
+
overlay charts with synced cursors, smoothing slider, log scale, a media tab
|
|
206
|
+
laid out **runs-as-columns × steps-as-rows** for side-by-side recon/eval
|
|
207
|
+
comparison, a config tab that highlights differing hyperparameters, and live
|
|
208
|
+
console.
|
|
209
|
+
|
|
210
|
+
**`tlog export <runs...> -o report.html`** — the same UI frozen into a single
|
|
211
|
+
file (images downscaled to ≤512px by default; `--max-image-px 0` keeps
|
|
212
|
+
originals). No server, no internet — works in VS Code's HTML preview.
|
|
213
|
+
|
|
214
|
+
## Demo without a GPU
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
python examples/fake_train.py --steps 2000 &   # run from a git checkout; examples/ is not shipped in the sdist
|
|
218
|
+
tlog watch
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Prior art
|
|
222
|
+
|
|
223
|
+
[trackio](https://github.com/gradio-app/trackio), [aim](https://github.com/aimhubio/aim),
|
|
224
|
+
TensorBoard, and MLflow all live in adjacent spaces. tlog's niche is the
|
|
225
|
+
combination: a zero-dependency stdlib-only core safe to drop into any
|
|
226
|
+
training env, files you can grep as the source of truth, SLURM-native
|
|
227
|
+
metadata + preemption semantics, a terminal dashboard designed for a tmux
|
|
228
|
+
pane on a GPU cluster, and single-file HTML reports — in ~2,700 lines of
|
|
229
|
+
Python you can read in an afternoon.
|
|
230
|
+
|
|
231
|
+
## Tests
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
python -m pytest tests/
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## License
|
|
238
|
+
|
|
239
|
+
MIT
|
tlog_ml-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tlog-ml"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Lightweight, local-first experiment logger for neural network training — wandb-shaped API, zero deps, terminal + HTML + web viewers"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Philippe Hansen-Estruch" }]
|
|
13
|
+
keywords = ["experiment-tracking", "logging", "machine-learning", "wandb", "slurm", "tui"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Environment :: Console",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
25
|
+
"Topic :: System :: Logging",
|
|
26
|
+
]
|
|
27
|
+
dependencies = []
|
|
28
|
+
|
|
29
|
+
[project.optional-dependencies]
|
|
30
|
+
dev = ["pytest>=7.0"]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Repository = "https://github.com/philippe-eecs/tlog"
|
|
34
|
+
Issues = "https://github.com/philippe-eecs/tlog/issues"
|
|
35
|
+
|
|
36
|
+
[project.scripts]
|
|
37
|
+
tlog = "tlog.cli:main"
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
include = ["tlog*"]
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.package-data]
|
|
43
|
+
tlog = ["frontend/*", "frontend/vendor/*"]
|
tlog_ml-0.1.0/setup.cfg
ADDED
tlog_ml-0.1.0/tests/test_media_export.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import struct
|
|
3
|
+
import zlib
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from tlog.media import encode_png, save_image
|
|
8
|
+
from tlog.run import Run
|
|
9
|
+
from tlog.store import find_runs
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def parse_png(data: bytes):
    """Split a PNG byte string into its chunks, verifying every CRC.

    Args:
        data: Complete PNG file contents, starting with the 8-byte signature.

    Returns:
        A dict mapping each 4-byte chunk tag (e.g. ``b"IHDR"``) to its
        payload. When a tag occurs more than once (image data may be split
        across several ``IDAT`` chunks), the payloads are concatenated in
        file order so the result can be passed to ``zlib.decompress`` as-is.

    Raises:
        AssertionError: If the signature is wrong, a chunk is truncated, or
            a chunk's stored CRC does not match its contents.
    """
    assert data[:8] == b"\x89PNG\r\n\x1a\n"
    chunks = {}
    pos = 8
    while pos < len(data):
        # Chunk layout: 4-byte big-endian length, 4-byte tag, body, 4-byte CRC.
        assert pos + 8 <= len(data), "truncated chunk header"
        (length,) = struct.unpack_from(">I", data, pos)
        tag = data[pos + 4 : pos + 8]
        end = pos + 8 + length
        assert end + 4 <= len(data), f"truncated chunk {tag}"
        body = data[pos + 8 : end]
        (crc,) = struct.unpack_from(">I", data, end)
        # The CRC covers the tag and the body, but not the length field.
        assert crc == zlib.crc32(tag + body), f"bad crc for {tag}"
        # Append rather than overwrite so repeated tags are not lost.
        chunks[tag] = chunks.get(tag, b"") + body
        pos = end + 4
    return chunks
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_encode_png_roundtrip():
    """Encode a tiny 3x2 image and check the PNG header and first scanline."""
    width_px, height_px = 3, 2
    row_bytes = width_px * 3
    source = bytes(range(height_px * row_bytes))

    parsed = parse_png(encode_png(source, width_px, height_px, 3))

    # IHDR starts with width, height, bit depth and colour type.
    header = struct.unpack(">IIBB", parsed[b"IHDR"][:10])
    assert header == (3, 2, 8, 2)

    inflated = zlib.decompress(parsed[b"IDAT"])
    # Every row contributes one leading filter byte plus its scanline.
    assert len(inflated) == height_px * (1 + row_bytes)
    # Skipping the first row's leading byte must give back the input pixels.
    assert inflated[1 : 1 + row_bytes] == source[:row_bytes]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_save_image_numpy(tmp_path):
    """save_image writes PNG files for float HWC and uint8 CHW numpy arrays."""
    np = pytest.importorskip("numpy")
    png_magic = b"\x89PNG\r\n\x1a\n"

    # Float array laid out height x width x channels, values in [0, 1].
    hwc = np.linspace(0, 1, 4 * 5 * 3).reshape(4, 5, 3)
    float_path = tmp_path / "a.png"
    save_image(hwc, float_path)
    assert float_path.read_bytes()[:8] == png_magic

    # uint8 array laid out channels-first: the written header must still
    # report width 5 and height 4.
    chw = (hwc.transpose(2, 0, 1) * 255).astype("uint8")
    uint8_path = tmp_path / "b.png"
    save_image(chw, uint8_path)
    header = parse_png(uint8_path.read_bytes())[b"IHDR"]
    assert struct.unpack(">II", header[:8]) == (5, 4)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_export_html_smoke(tmp_path):
    """Export one small finished run to HTML and inspect the embedded data."""
    from tlog.export import export_html

    run = Run(
        project="p",
        name="r1",
        dir=tmp_path,
        config={"lr": 1},
        capture_console=False,
        system_metrics=False,
    )
    # Ten log calls at steps 0, 10, ..., 90.
    for step in range(0, 100, 10):
        run.log({"loss/total": 1.0 / (step + 1), "eval/fid": 50 - step / 4}, step=step)
    # 12 raw bytes; presumably a (pixels, width, height, channels) tuple
    # for a 2x2 RGB image -- confirm against log_images.
    pixel_bytes = bytes([0, 128, 255] * 4)
    run.log_images("eval/recon", [(pixel_bytes, 2, 2, 3)], step=50)
    run.finish()

    first_run_info = find_runs(tmp_path)[0]
    report = export_html([first_run_info], tmp_path / "report.html")
    html = report.read_text()

    assert "loss/total" in html
    assert "data:image/png;base64," in html
    # sanity: template filled
    before_mode = html.partition("TLOG_MODE")[0]
    assert '"mode"' not in before_mode
    # no leftover placeholders
    assert "{{" not in html.replace("{{}}", "")

    # The payload sits between the assignment and the next ";\n"; "</" is
    # escaped as "<\/" in the page, so undo that before parsing.
    after_marker = html.split("window.TLOG_DATA = ", 1)[1]
    payload = after_marker.split(";\n", 1)[0]
    data = json.loads(payload.replace("<\\/", "</"))

    exported = data["runs"][0]
    assert exported["name"] == "r1"
    assert len(exported["metrics"]["loss/total"]["steps"]) == 10
    assert exported["media"][0]["step"] == 50
|