tlog-ml 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tlog_ml-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Philippe Hansen-Estruch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
tlog_ml-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,266 @@
1
+ Metadata-Version: 2.4
2
+ Name: tlog-ml
3
+ Version: 0.1.0
4
+ Summary: Lightweight, local-first experiment logger for neural network training — wandb-shaped API, zero deps, terminal + HTML + web viewers
5
+ Author: Philippe Hansen-Estruch
6
+ License: MIT
7
+ Project-URL: Repository, https://github.com/philippe-eecs/tlog
8
+ Project-URL: Issues, https://github.com/philippe-eecs/tlog/issues
9
+ Keywords: experiment-tracking,logging,machine-learning,wandb,slurm,tui
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: System :: Logging
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=7.0; extra == "dev"
26
+ Dynamic: license-file
27
+
28
+ # tlog
29
+
30
+ A lightweight, local-first experiment logger for neural network training.
31
+ wandb-shaped API, **zero dependencies** in your training environment, and three
32
+ clean ways to look at your runs from a SLURM cluster with nothing but a
33
+ terminal:
34
+
35
+ | viewer | command | when |
36
+ |---|---|---|
37
+ | **terminal dashboard** | `tlog watch` | live charts in a tmux pane — the default |
38
+ | **live web dashboard** | `tlog serve` | wandb-like browser UI through an SSH/VS Code port-forward |
39
+ | **self-contained HTML** | `tlog export -o report.html` | one file with charts + images; preview in VS Code, scp it, share it |
40
+
41
+ Everything is plain append-only JSONL in a run directory: grep-able,
42
+ rsync-able, crash-safe, no daemon, no cloud, no account.
43
+
44
+ ```
45
+ ● demo/baseline (da064b) · step 1500 · finished
46
+ loss eval training timing memory console
47
+
48
+ loss/charb 0.3158 loss/dino 0.07182
49
+ 1.552 ┤⡧⣼ 0.3882 ┤⡧⣼
50
+ │⠇⢹⢿⣠⢀ │⡇⢹⣶⣀⣀
51
+ │ ⠹⢹⠢⣧⣄⣀ │ ⠛⢹⠢⡦⣆⢀
52
+ │ ⠁⠋⠋⠳⢶⢤⡀ │ ⠋⠉⠢⣴⣀⣀
53
+ │ ⠘⠙⠦⠦⢴⢄⣀⡀ │ ⠘⠙⢢⡧⢦⣀⢀
54
+ │ ⠉⠙⠛⠓⠶⠤⣤⣠⣠⣀⡀ │ ⠉⠙⠋⠳⠶⢤⣤⣴⣠⡄⡀
55
+ 0.2771 ┤ ⠉⠙⠉⠛⠋⠓⠲⠚⠴⠖⠤⠤⠦⡦ 0.06998 ┤ ⠁⠙⠉⠋⠋⠑⠳⠒⠲⠶⠤⡶⣦⣦
56
+ 10 1490 10 1490
57
+
58
+ loss/ssim 0.131 loss/total 0.5004
59
+ 0.6492 ┤⡧⣼ 2.605 ┤⡧⣼
60
+ │⠇⢹⢶⣀⢀ │⠇⢹⢶⣀
61
+ │ ⠹⠹⠦⣶⣄⣀ │ ⠹⢹⠢⣦⣄⣀
62
+ │ ⠛⠋⠣⣴⣠⡀ │ ⠋⠋⠲⣴⢤⣀
63
+ │ ⠈⠘⠙⠲⠦⢤⣀⢀ │ ⠘⠙⠦⠦⢤⣀⣀
64
+ │ ⠉⠙⠋⠓⢴⠤⢤⣤⣠⡀⡀ │ ⠈⠙⠋⠲⠴⠤⣤⣠⢠⡀⡀
65
+ 0.1174 ┤ ⠁⠉⠉⠛⠊⠛⠲⠖⠴⠒⠤⡴⠦⣤ 0.4672 ┤ ⠉⠉⠉⠛⠊⠛⠲⠖⠴⠖⠤⠤⠦⣦
66
+ 10 1490 10 1490
67
+
68
+ ←/→ pages · ↑/↓ scroll · 1-9 cols (auto) · s smooth (0) · l log (off) · q quit
69
+ ```
70
+
71
+ *An actual `tlog watch` frame — braille-canvas charts in a plain tmux pane.*
72
+
73
+ ## Install
74
+
75
+ ```bash
76
+ pip install tlog-ml # distribution is tlog-ml; you still `import tlog`
77
+ # or for development:
78
+ git clone https://github.com/philippe-eecs/tlog && cd tlog
79
+ pip install -e ".[dev]"
80
+ ```
81
+
82
+ The core has **zero dependencies** — nothing to conflict with your torch/jax
83
+ pins. PIL is used opportunistically if present (image encoding, report
84
+ downscaling); otherwise a pure-stdlib PNG encoder takes over.
85
+
86
+ ## Quickstart
87
+
88
+ ```python
89
+ import tlog
90
+
91
+ run = tlog.init(project="vitok", name="vae-L16", config=vars(args))
92
+
93
+ for step in range(steps):
94
+ ...
95
+ if step % log_freq == 0:
96
+ tlog.log({"loss/total": loss, "training/lr": lr,
97
+ "timing/mfu_percent": mfu}, step=step)
98
+ if step % eval_freq == 0:
99
+ tlog.log({f"eval/{k}": v for k, v in eval_stats.items()}, step=step)
100
+ tlog.log_images("eval/recon", [orig, recon], step=step) # torch/np/PIL
101
+
102
+ tlog.finish()
103
+ ```
104
+
105
+ Then, in another tmux pane:
106
+
107
+ ```bash
108
+ tlog # == tlog watch: live dashboard of the latest run
109
+ tlog ls # table of runs: step, last loss, slurm job, status
110
+ tlog tail # live captured console output of the latest run
111
+ tlog serve # web UI on :8585 (VS Code auto-forwards the port)
112
+ tlog export run-a run-b -o compare.html # side-by-side report
113
+ ```
114
+
115
+ Key namespaces (`loss/`, `eval/`, `timing/`, ...) become chart groups / TUI
116
+ pages automatically.
117
+
118
+ ## What gets captured
119
+
120
+ `tlog.init()` records, without being asked:
121
+
122
+ - **SLURM**: job id, job name, partition, nodelist, array task id, and the
123
+ actual `sbatch` script that launched the job (saved as `launch.sh`)
124
+ - **git**: commit, branch, dirty flag, and a `diff.patch` of uncommitted changes
125
+ - **environment**: argv, entrypoint, hostname, user, python/torch/CUDA
126
+ versions, GPU models, world size
127
+ - **system metrics** (background thread, 10s interval): GPU util/mem/temp/power
128
+ per device via nvidia-smi, CPU%, RAM — shown as their own chart groups
129
+ - **console**: stdout/stderr teed to `console.log` (tqdm-safe; viewers resolve
130
+ `\r` overwrites)
131
+
132
+ ## How it works
133
+
134
+ tlog is two decoupled halves that only meet at the filesystem: a **write
135
+ path** that lives inside your training process, and a **read path** (the
136
+ viewers) that runs anywhere that can see the same disk. There is no daemon,
137
+ no database, no socket between them — a run *is* a directory:
138
+
139
+ ```
140
+ runs/<project>/<name>__<timestamp>__<id>/
141
+ ├── meta.json # identity + environment snapshot + restart history
142
+ ├── config.json # your hyperparameters (vars(args))
143
+ ├── metrics.jsonl # one JSON object per log() call, append-only
144
+ ├── system.jsonl # sampled GPU/CPU/RAM
145
+ ├── console.log # teed stdout/stderr
146
+ ├── launch.sh # captured sbatch script (under SLURM)
147
+ ├── diff.patch # uncommitted git changes
148
+ └── media/ # PNGs + index.jsonl mapping them to (key, step)
149
+ ```
150
+
151
+ ### The write path never blocks training
152
+
153
+ `log()` serializes one JSON line and appends it. Lines are written whole and
154
+ flushed, so a crash loses at most the line in flight and can never corrupt
155
+ history; `fsync` runs on a 30s timer to bound hard-failure data loss without
156
+ paying sync cost per step. Everything slow happens off the hot path: git
157
+ diff / nvidia-smi / `scontrol` captures run in a background thread after
158
+ init, system sampling and the liveness heartbeat are daemon threads, and
159
+ framework versions are read from `sys.modules` instead of importing anything.
160
+
161
+ ### Preemption-safe by construction
162
+
163
+ SLURM requeues a preempted job with the same job id and bumps
164
+ `SLURM_RESTART_COUNT`. `init(resume="auto")` (the default) detects that,
165
+ finds the run directory it created before the preemption, and keeps
166
+ appending — recording a restart event in `meta.json`. Restarting from an
167
+ older checkpoint re-logs some steps; instead of rewriting files (dangerous),
168
+ **readers keep the last value logged per (metric, step)**, so charts come out
169
+ continuous and the storage stays strictly append-only. Explicit resume:
170
+ `tlog.init(id="a1b2c3", resume="must")`.
171
+
172
+ ### The read path is one engine with three faces
173
+
174
+ `store.py` discovers runs, tails JSONL incrementally (remembering byte
175
+ offsets, parsing only complete new lines), applies keep-last dedup, and
176
+ downsamples with **min/max/mean buckets** — a one-step loss spike survives
177
+ being squeezed into a 200-px chart instead of being averaged away. Debiased
178
+ EMA smoothing (same formula as wandb) sits on top. The three viewers are just
179
+ renderers over this engine:
180
+
181
+ - **TUI**: each terminal cell is a 2×4 braille dot grid, so a tmux pane
182
+ becomes a pixel canvas; charts are drawn with Bresenham lines and repainted
183
+ on the alternate screen buffer. Pure ANSI — no curses, works over any SSH.
184
+ - **Web**: a stdlib `ThreadingHTTPServer` with JSON endpoints; the browser
185
+ polls every 3s and refetches only runs whose files changed (mtime-keyed).
186
+ - **Export**: the *same* frontend with data, images (base64), and uPlot
187
+ inlined into one HTML file. One codebase, a mode flag, two surfaces.
188
+
189
+ ### Liveness without IPC
190
+
191
+ A daemon thread touches `heartbeat` every 15s. Viewers call a run *running*
192
+ if the heartbeat is fresh, *finished* if `finish()` marked it, and *dead* if
193
+ neither — which is how a SIGKILLed job shows up correctly with no process
194
+ ever being asked.
195
+
196
+ ## Distributed training
197
+
198
+ `tlog.init()` is a no-op on non-zero ranks (it checks the `RANK` env var set
199
+ by torchrun/SLURM), so you can call it unguarded — or keep your existing
200
+ `if rank == 0:` guard; both are fine.
201
+
202
+ ## Migrating from wandb
203
+
204
+ ```diff
205
+ -import wandb
206
+ +import tlog
207
+
208
+ -wandb.init(project=args.project, name=args.name, config=vars(args))
209
+ +tlog.init(project=args.project, name=args.name, config=vars(args))
210
+
211
+ -wandb.log(avg, step=step)
212
+ +tlog.log(avg, step=step)
213
+
214
+ -wandb.finish()
215
+ +tlog.finish()
216
+ ```
217
+
218
+ Runs land in `./runs` by default; set `TLOG_DIR=/scratch/$USER/runs` (or pass
219
+ `dir=`) to keep them on scratch.
220
+
221
+ ## The viewers in detail
222
+
223
+ **`tlog watch [run]`** — braille line charts with min/max bands, one page per
224
+ metric group plus a console page; the grid auto-sizes to the pane and scrolls
225
+ when a group has more charts than fit. Keys: `←/→` pages · `↑/↓` (or `j/k`)
226
+ scroll charts / console history · `1`–`9` force column count, `0` auto (or
227
+ `--cols N`) · `s` smoothing (EMA 0 → 0.6 → 0.9 → 0.99) · `l` log scale ·
228
+ `q` quit.
229
+
230
+ **`tlog serve [root]`** — open `http://localhost:8585` through VS Code Remote
231
+ (auto port-forward) or `ssh -L 8585:localhost:8585 cluster`. Multi-run
232
+ overlay charts with synced cursors, smoothing slider, log scale, a media tab
233
+ laid out **runs-as-columns × steps-as-rows** for side-by-side recon/eval
234
+ comparison, a config tab that highlights differing hyperparameters, and live
235
+ console.
236
+
237
+ **`tlog export <runs...> -o report.html`** — the same UI frozen into a single
238
+ file (images downscaled to ≤512px by default; `--max-image-px 0` keeps
239
+ originals). No server, no internet — works in VS Code's HTML preview.
240
+
241
+ ## Demo without a GPU
242
+
243
+ ```bash
244
+ python examples/fake_train.py --steps 2000 &
245
+ tlog watch
246
+ ```
247
+
248
+ ## Prior art
249
+
250
+ [trackio](https://github.com/gradio-app/trackio), [aim](https://github.com/aimhubio/aim),
251
+ TensorBoard, and MLflow all live in an adjacent space. tlog's niche is the
252
+ combination: a zero-dependency stdlib-only core safe to drop into any
253
+ training env, files you can grep as the source of truth, SLURM-native
254
+ metadata + preemption semantics, a terminal dashboard designed for a tmux
255
+ pane on a GPU cluster, and single-file HTML reports — in ~2,700 lines of
256
+ Python you can read in an afternoon.
257
+
258
+ ## Tests
259
+
260
+ ```bash
261
+ python -m pytest tests/
262
+ ```
263
+
264
+ ## License
265
+
266
+ MIT
@@ -0,0 +1,239 @@
1
+ # tlog
2
+
3
+ A lightweight, local-first experiment logger for neural network training.
4
+ wandb-shaped API, **zero dependencies** in your training environment, and three
5
+ clean ways to look at your runs from a SLURM cluster with nothing but a
6
+ terminal:
7
+
8
+ | viewer | command | when |
9
+ |---|---|---|
10
+ | **terminal dashboard** | `tlog watch` | live charts in a tmux pane — the default |
11
+ | **live web dashboard** | `tlog serve` | wandb-like browser UI through an SSH/VS Code port-forward |
12
+ | **self-contained HTML** | `tlog export -o report.html` | one file with charts + images; preview in VS Code, scp it, share it |
13
+
14
+ Everything is plain append-only JSONL in a run directory: grep-able,
15
+ rsync-able, crash-safe, no daemon, no cloud, no account.
16
+
17
+ ```
18
+ ● demo/baseline (da064b) · step 1500 · finished
19
+ loss eval training timing memory console
20
+
21
+ loss/charb 0.3158 loss/dino 0.07182
22
+ 1.552 ┤⡧⣼ 0.3882 ┤⡧⣼
23
+ │⠇⢹⢿⣠⢀ │⡇⢹⣶⣀⣀
24
+ │ ⠹⢹⠢⣧⣄⣀ │ ⠛⢹⠢⡦⣆⢀
25
+ │ ⠁⠋⠋⠳⢶⢤⡀ │ ⠋⠉⠢⣴⣀⣀
26
+ │ ⠘⠙⠦⠦⢴⢄⣀⡀ │ ⠘⠙⢢⡧⢦⣀⢀
27
+ │ ⠉⠙⠛⠓⠶⠤⣤⣠⣠⣀⡀ │ ⠉⠙⠋⠳⠶⢤⣤⣴⣠⡄⡀
28
+ 0.2771 ┤ ⠉⠙⠉⠛⠋⠓⠲⠚⠴⠖⠤⠤⠦⡦ 0.06998 ┤ ⠁⠙⠉⠋⠋⠑⠳⠒⠲⠶⠤⡶⣦⣦
29
+ 10 1490 10 1490
30
+
31
+ loss/ssim 0.131 loss/total 0.5004
32
+ 0.6492 ┤⡧⣼ 2.605 ┤⡧⣼
33
+ │⠇⢹⢶⣀⢀ │⠇⢹⢶⣀
34
+ │ ⠹⠹⠦⣶⣄⣀ │ ⠹⢹⠢⣦⣄⣀
35
+ │ ⠛⠋⠣⣴⣠⡀ │ ⠋⠋⠲⣴⢤⣀
36
+ │ ⠈⠘⠙⠲⠦⢤⣀⢀ │ ⠘⠙⠦⠦⢤⣀⣀
37
+ │ ⠉⠙⠋⠓⢴⠤⢤⣤⣠⡀⡀ │ ⠈⠙⠋⠲⠴⠤⣤⣠⢠⡀⡀
38
+ 0.1174 ┤ ⠁⠉⠉⠛⠊⠛⠲⠖⠴⠒⠤⡴⠦⣤ 0.4672 ┤ ⠉⠉⠉⠛⠊⠛⠲⠖⠴⠖⠤⠤⠦⣦
39
+ 10 1490 10 1490
40
+
41
+ ←/→ pages · ↑/↓ scroll · 1-9 cols (auto) · s smooth (0) · l log (off) · q quit
42
+ ```
43
+
44
+ *An actual `tlog watch` frame — braille-canvas charts in a plain tmux pane.*
45
+
46
+ ## Install
47
+
48
+ ```bash
49
+ pip install tlog-ml # distribution is tlog-ml; you still `import tlog`
50
+ # or for development:
51
+ git clone https://github.com/philippe-eecs/tlog && cd tlog
52
+ pip install -e ".[dev]"
53
+ ```
54
+
55
+ The core has **zero dependencies** — nothing to conflict with your torch/jax
56
+ pins. PIL is used opportunistically if present (image encoding, report
57
+ downscaling); otherwise a pure-stdlib PNG encoder takes over.
58
+
59
+ ## Quickstart
60
+
61
+ ```python
62
+ import tlog
63
+
64
+ run = tlog.init(project="vitok", name="vae-L16", config=vars(args))
65
+
66
+ for step in range(steps):
67
+ ...
68
+ if step % log_freq == 0:
69
+ tlog.log({"loss/total": loss, "training/lr": lr,
70
+ "timing/mfu_percent": mfu}, step=step)
71
+ if step % eval_freq == 0:
72
+ tlog.log({f"eval/{k}": v for k, v in eval_stats.items()}, step=step)
73
+ tlog.log_images("eval/recon", [orig, recon], step=step) # torch/np/PIL
74
+
75
+ tlog.finish()
76
+ ```
77
+
78
+ Then, in another tmux pane:
79
+
80
+ ```bash
81
+ tlog # == tlog watch: live dashboard of the latest run
82
+ tlog ls # table of runs: step, last loss, slurm job, status
83
+ tlog tail # live captured console output of the latest run
84
+ tlog serve # web UI on :8585 (VS Code auto-forwards the port)
85
+ tlog export run-a run-b -o compare.html # side-by-side report
86
+ ```
87
+
88
+ Key namespaces (`loss/`, `eval/`, `timing/`, ...) become chart groups / TUI
89
+ pages automatically.
90
+
91
+ ## What gets captured
92
+
93
+ `tlog.init()` records, without being asked:
94
+
95
+ - **SLURM**: job id, job name, partition, nodelist, array task id, and the
96
+ actual `sbatch` script that launched the job (saved as `launch.sh`)
97
+ - **git**: commit, branch, dirty flag, and a `diff.patch` of uncommitted changes
98
+ - **environment**: argv, entrypoint, hostname, user, python/torch/CUDA
99
+ versions, GPU models, world size
100
+ - **system metrics** (background thread, 10s interval): GPU util/mem/temp/power
101
+ per device via nvidia-smi, CPU%, RAM — shown as their own chart groups
102
+ - **console**: stdout/stderr teed to `console.log` (tqdm-safe; viewers resolve
103
+ `\r` overwrites)
104
+
105
+ ## How it works
106
+
107
+ tlog is two decoupled halves that only meet at the filesystem: a **write
108
+ path** that lives inside your training process, and a **read path** (the
109
+ viewers) that runs anywhere that can see the same disk. There is no daemon,
110
+ no database, no socket between them — a run *is* a directory:
111
+
112
+ ```
113
+ runs/<project>/<name>__<timestamp>__<id>/
114
+ ├── meta.json # identity + environment snapshot + restart history
115
+ ├── config.json # your hyperparameters (vars(args))
116
+ ├── metrics.jsonl # one JSON object per log() call, append-only
117
+ ├── system.jsonl # sampled GPU/CPU/RAM
118
+ ├── console.log # teed stdout/stderr
119
+ ├── launch.sh # captured sbatch script (under SLURM)
120
+ ├── diff.patch # uncommitted git changes
121
+ └── media/ # PNGs + index.jsonl mapping them to (key, step)
122
+ ```
123
+
124
+ ### The write path never blocks training
125
+
126
+ `log()` serializes one JSON line and appends it. Lines are written whole and
127
+ flushed, so a crash loses at most the line in flight and can never corrupt
128
+ history; `fsync` runs on a 30s timer to bound hard-failure data loss without
129
+ paying sync cost per step. Everything slow happens off the hot path: git
130
+ diff / nvidia-smi / `scontrol` captures run in a background thread after
131
+ init, system sampling and the liveness heartbeat are daemon threads, and
132
+ framework versions are read from `sys.modules` instead of importing anything.
133
+
134
+ ### Preemption-safe by construction
135
+
136
+ SLURM requeues a preempted job with the same job id and bumps
137
+ `SLURM_RESTART_COUNT`. `init(resume="auto")` (the default) detects that,
138
+ finds the run directory it created before the preemption, and keeps
139
+ appending — recording a restart event in `meta.json`. Restarting from an
140
+ older checkpoint re-logs some steps; instead of rewriting files (dangerous),
141
+ **readers keep the last value logged per (metric, step)**, so charts come out
142
+ continuous and the storage stays strictly append-only. Explicit resume:
143
+ `tlog.init(id="a1b2c3", resume="must")`.
144
+
145
+ ### The read path is one engine with three faces
146
+
147
+ `store.py` discovers runs, tails JSONL incrementally (remembering byte
148
+ offsets, parsing only complete new lines), applies keep-last dedup, and
149
+ downsamples with **min/max/mean buckets** — a one-step loss spike survives
150
+ being squeezed into a 200-px chart instead of being averaged away. Debiased
151
+ EMA smoothing (same formula as wandb) sits on top. The three viewers are just
152
+ renderers over this engine:
153
+
154
+ - **TUI**: each terminal cell is a 2×4 braille dot grid, so a tmux pane
155
+ becomes a pixel canvas; charts are drawn with Bresenham lines and repainted
156
+ on the alternate screen buffer. Pure ANSI — no curses, works over any SSH.
157
+ - **Web**: a stdlib `ThreadingHTTPServer` with JSON endpoints; the browser
158
+ polls every 3s and refetches only runs whose files changed (mtime-keyed).
159
+ - **Export**: the *same* frontend with data, images (base64), and uPlot
160
+ inlined into one HTML file. One codebase, a mode flag, two surfaces.
161
+
162
+ ### Liveness without IPC
163
+
164
+ A daemon thread touches `heartbeat` every 15s. Viewers call a run *running*
165
+ if the heartbeat is fresh, *finished* if `finish()` marked it, and *dead* if
166
+ neither — which is how a SIGKILLed job shows up correctly with no process
167
+ ever being asked.
168
+
169
+ ## Distributed training
170
+
171
+ `tlog.init()` is a no-op on non-zero ranks (it checks the `RANK` env var set
172
+ by torchrun/SLURM), so you can call it unguarded — or keep your existing
173
+ `if rank == 0:` guard; both are fine.
174
+
175
+ ## Migrating from wandb
176
+
177
+ ```diff
178
+ -import wandb
179
+ +import tlog
180
+
181
+ -wandb.init(project=args.project, name=args.name, config=vars(args))
182
+ +tlog.init(project=args.project, name=args.name, config=vars(args))
183
+
184
+ -wandb.log(avg, step=step)
185
+ +tlog.log(avg, step=step)
186
+
187
+ -wandb.finish()
188
+ +tlog.finish()
189
+ ```
190
+
191
+ Runs land in `./runs` by default; set `TLOG_DIR=/scratch/$USER/runs` (or pass
192
+ `dir=`) to keep them on scratch.
193
+
194
+ ## The viewers in detail
195
+
196
+ **`tlog watch [run]`** — braille line charts with min/max bands, one page per
197
+ metric group plus a console page; the grid auto-sizes to the pane and scrolls
198
+ when a group has more charts than fit. Keys: `←/→` pages · `↑/↓` (or `j/k`)
199
+ scroll charts / console history · `1`–`9` force column count, `0` auto (or
200
+ `--cols N`) · `s` smoothing (EMA 0 → 0.6 → 0.9 → 0.99) · `l` log scale ·
201
+ `q` quit.
202
+
203
+ **`tlog serve [root]`** — open `http://localhost:8585` through VS Code Remote
204
+ (auto port-forward) or `ssh -L 8585:localhost:8585 cluster`. Multi-run
205
+ overlay charts with synced cursors, smoothing slider, log scale, a media tab
206
+ laid out **runs-as-columns × steps-as-rows** for side-by-side recon/eval
207
+ comparison, a config tab that highlights differing hyperparameters, and live
208
+ console.
209
+
210
+ **`tlog export <runs...> -o report.html`** — the same UI frozen into a single
211
+ file (images downscaled to ≤512px by default; `--max-image-px 0` keeps
212
+ originals). No server, no internet — works in VS Code's HTML preview.
213
+
214
+ ## Demo without a GPU
215
+
216
+ ```bash
217
+ python examples/fake_train.py --steps 2000 &
218
+ tlog watch
219
+ ```
220
+
221
+ ## Prior art
222
+
223
+ [trackio](https://github.com/gradio-app/trackio), [aim](https://github.com/aimhubio/aim),
224
+ TensorBoard, and MLflow all live in an adjacent space. tlog's niche is the
225
+ combination: a zero-dependency stdlib-only core safe to drop into any
226
+ training env, files you can grep as the source of truth, SLURM-native
227
+ metadata + preemption semantics, a terminal dashboard designed for a tmux
228
+ pane on a GPU cluster, and single-file HTML reports — in ~2,700 lines of
229
+ Python you can read in an afternoon.
230
+
231
+ ## Tests
232
+
233
+ ```bash
234
+ python -m pytest tests/
235
+ ```
236
+
237
+ ## License
238
+
239
+ MIT
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tlog-ml"
7
+ version = "0.1.0"
8
+ description = "Lightweight, local-first experiment logger for neural network training — wandb-shaped API, zero deps, terminal + HTML + web viewers"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Philippe Hansen-Estruch" }]
13
+ keywords = ["experiment-tracking", "logging", "machine-learning", "wandb", "slurm", "tui"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Environment :: Console",
17
+ "Intended Audience :: Science/Research",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ "Topic :: System :: Logging",
26
+ ]
27
+ dependencies = []
28
+
29
+ [project.optional-dependencies]
30
+ dev = ["pytest>=7.0"]
31
+
32
+ [project.urls]
33
+ Repository = "https://github.com/philippe-eecs/tlog"
34
+ Issues = "https://github.com/philippe-eecs/tlog/issues"
35
+
36
+ [project.scripts]
37
+ tlog = "tlog.cli:main"
38
+
39
+ [tool.setuptools.packages.find]
40
+ include = ["tlog*"]
41
+
42
+ [tool.setuptools.package-data]
43
+ tlog = ["frontend/*", "frontend/vendor/*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,76 @@
1
+ import json
2
+ import struct
3
+ import zlib
4
+
5
+ import pytest
6
+
7
+ from tlog.media import encode_png, save_image
8
+ from tlog.run import Run
9
+ from tlog.store import find_runs
10
+
11
+
12
def parse_png(data: bytes):
    """Split a PNG byte string into its chunks, verifying every CRC.

    Args:
        data: Complete PNG contents, starting with the 8-byte signature.

    Returns:
        Dict mapping each 4-byte chunk tag (e.g. ``b"IHDR"``) to its body.
        Bodies of repeated ``IDAT`` chunks are concatenated in file order,
        because together they form one zlib stream; for any other repeated
        tag the last body wins.

    Raises:
        AssertionError: If the signature is wrong, a chunk is truncated, or
            a stored CRC does not match the CRC-32 of tag + body.
    """
    assert data[:8] == b"\x89PNG\r\n\x1a\n"
    chunks = {}
    pos = 8
    while pos < len(data):
        # Chunk layout: 4-byte big-endian length, 4-byte tag, body, 4-byte CRC.
        assert pos + 8 <= len(data), f"truncated chunk header at offset {pos}"
        length, tag = struct.unpack_from(">I4s", data, pos)
        body_start = pos + 8
        body_end = body_start + length
        assert body_end + 4 <= len(data), f"truncated chunk {tag}"
        body = data[body_start:body_end]
        (crc,) = struct.unpack_from(">I", data, body_end)
        assert crc == zlib.crc32(tag + body), f"bad crc for {tag}"
        if tag == b"IDAT" and tag in chunks:
            # Encoders may split the compressed stream across several IDAT
            # chunks; keeping only the last one would make it undecodable.
            chunks[tag] += body
        else:
            chunks[tag] = body
        pos = body_end + 4
    return chunks
25
+
26
+
27
def test_encode_png_roundtrip():
    """Encode a tiny 3x2 RGB image and check its header and first scanline."""
    cols, rows, channels = 3, 2, 3
    row_bytes = cols * channels
    pixels = bytes(range(rows * row_bytes))

    parsed = parse_png(encode_png(pixels, cols, rows, channels))

    # IHDR opens with width, height, bit depth and colour type.
    header = struct.unpack(">IIBB", parsed[b"IHDR"][:10])
    assert header == (3, 2, 8, 2)

    # Every decompressed row is one filter byte followed by the scanline.
    scanlines = zlib.decompress(parsed[b"IDAT"])
    assert len(scanlines) == rows * (row_bytes + 1)
    assert scanlines[1 : row_bytes + 1] == pixels[:row_bytes]
39
+
40
def test_save_image_numpy(tmp_path):
    """save_image writes PNG files for float HWC and uint8 CHW numpy arrays."""
    np = pytest.importorskip("numpy")
    png_magic = b"\x89PNG\r\n\x1a\n"

    # Float image in [0, 1], laid out height x width x channels.
    hwc = np.linspace(0, 1, 4 * 5 * 3).reshape(4, 5, 3)
    float_path = tmp_path / "a.png"
    save_image(hwc, float_path)
    assert float_path.read_bytes()[:8] == png_magic

    # Channels-first uint8 input: the header must still report 5 wide, 4 tall.
    chw = (hwc.transpose(2, 0, 1) * 255).astype("uint8")
    uint8_path = tmp_path / "b.png"
    save_image(chw, uint8_path)
    ihdr = parse_png(uint8_path.read_bytes())[b"IHDR"]
    assert struct.unpack(">II", ihdr[:8]) == (5, 4)
52
+
53
+
54
def test_export_html_smoke(tmp_path):
    """Log a short run, export it to HTML, and inspect the embedded payload."""
    from tlog.export import export_html

    run = Run(
        project="p",
        name="r1",
        dir=tmp_path,
        config={"lr": 1},
        capture_console=False,
        system_metrics=False,
    )
    for step in range(0, 100, 10):
        scalars = {"loss/total": 1.0 / (step + 1), "eval/fid": 50 - step / 4}
        run.log(scalars, step=step)
    # One raw image: a 12-byte buffer plus its (2, 2, 3) dimensions.
    raw_image = (bytes([0, 128, 255] * 4), 2, 2, 3)
    run.log_images("eval/recon", [raw_image], step=50)
    run.finish()

    first_run = find_runs(tmp_path)[0]
    report = export_html([first_run], tmp_path / "report.html")
    html = report.read_text()
    assert "loss/total" in html
    assert "data:image/png;base64," in html
    before_mode = html.partition("TLOG_MODE")[0]
    assert '"mode"' not in before_mode  # sanity: template filled
    assert "{{" not in html.replace("{{}}", "")  # no leftover placeholders

    # Cut out the JSON assigned to window.TLOG_DATA and undo the `<\/` escaping.
    after_marker = html.split("window.TLOG_DATA = ", 1)[1]
    payload = after_marker.partition(";\n")[0]
    data = json.loads(payload.replace("<\\/", "</"))
    exported = data["runs"][0]
    assert exported["name"] == "r1"
    assert len(exported["metrics"]["loss/total"]["steps"]) == 10
    assert exported["media"][0]["step"] == 50