tlog-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tlog/__init__.py ADDED
@@ -0,0 +1,89 @@
1
+ """tlog — lightweight, local-first experiment logger for neural net training.
2
+
3
+ Drop-in wandb-shaped API:
4
+
5
+ import tlog
6
+
7
+ run = tlog.init(project="vitok", name="vae-L", config=vars(args))
8
+ tlog.log({"loss/total": 0.41, "training/lr": 3e-4}, step=step)
9
+ tlog.log_images("eval/recon", [orig, recon], step=step)
10
+ tlog.finish()
11
+
12
+ View runs with `tlog watch` (terminal), `tlog serve` (browser via port
13
+ forward), or `tlog export -o report.html` (single shareable file).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ from typing import Any
20
+
21
+ from .run import NoopRun, Run
22
+
23
# Package version string.
__version__ = "0.1.0"
# Public names of the package (what `from tlog import *` exports).
__all__ = ["init", "log", "log_images", "finish", "run", "Run", "NoopRun"]

# Module-level handle shared by init()/log()/log_images()/finish().
run: Run | NoopRun | None = None  # the active run, set by init()
27
+
28
+
29
def init(
    project: str = "default",
    name: str | None = None,
    config: dict | None = None,
    dir: str | None = None,
    id: str | None = None,
    resume: str = "auto",
    capture_console: bool = True,
    system_metrics: bool = True,
    rank_zero_only: bool = True,
) -> Run | NoopRun:
    """Create (or resume) the module-level active run and return it.

    When ``rank_zero_only`` is true and the ``RANK`` environment variable
    (set by torchrun/SLURM) parses to a non-zero integer, a ``NoopRun`` is
    installed and returned instead of a real run.

    Otherwise any previously active real run is finished first, a new ``Run``
    is built from the given arguments, and its directory is printed.

    resume: "auto"  — resume iff an explicit `id` is given or this process is
                      a SLURM requeue (SLURM_RESTART_COUNT > 0) of a job that
                      already created a run; otherwise start fresh.
            "must"  — resume an existing run or raise.
            "never" — always start a fresh run.
    """
    global run

    if rank_zero_only:
        # An unset or empty RANK counts as rank 0.
        rank = int(os.environ.get("RANK", "0") or 0)
        if rank != 0:
            run = NoopRun()
            return run

    previous = run
    if previous is not None and not isinstance(previous, NoopRun):
        # Close the earlier real run before replacing it.
        previous.finish()

    run = Run(
        project=project,
        name=name,
        config=config,
        dir=dir,
        id=id,
        resume=resume,
        capture_console=capture_console,
        system_metrics=system_metrics,
    )
    suffix = " (resumed)" if run.resumed else ""
    print(f"tlog: logging to {run.dir}" + suffix)
    return run
67
+
68
+
69
def _require_run() -> Run | NoopRun:
    """Return the active run, raising RuntimeError if init() has not set one."""
    active = run
    if active is not None:
        return active
    raise RuntimeError("tlog.init() must be called before logging")
73
+
74
+
75
def log(metrics: dict[str, Any], step: int | None = None) -> None:
    """Forward a dict of scalar metrics for one training step to the active run.

    Raises RuntimeError (via _require_run) if no run is active.
    """
    active = _require_run()
    active.log(metrics, step=step)
78
+
79
+
80
def log_images(
    key: str,
    images: Any,
    step: int | None = None,
    caption: str | None = None,
) -> None:
    """Forward one image or a list of images (PIL / torch tensor / numpy array)
    to the active run under ``key``.

    Raises RuntimeError (via _require_run) if no run is active.
    """
    active = _require_run()
    active.log_images(key, images, step=step, caption=caption)
83
+
84
+
85
def finish() -> None:
    """Call ``finish()`` on the active run, if there is one.

    Does nothing when no run has been initialised. The module-level ``run``
    is left in place afterwards.
    """
    active = run
    if active is None:
        return
    active.finish()
tlog/cli.py ADDED
@@ -0,0 +1,183 @@
1
+ """`tlog` command line: watch (default), ls, tail, export, serve, rm."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import datetime
7
+ import os
8
+ import shutil
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ from . import store
13
+
14
# ANSI SGR escape sequences for terminal styling.
DIM = "\x1b[2m"
BOLD = "\x1b[1m"
RESET = "\x1b[0m"
# Run status -> foreground colour escape, used for the status dot printed by cmd_ls.
_STATUS_COLOR = {"running": "\x1b[38;5;114m", "finished": "\x1b[38;5;75m", "dead": "\x1b[38;5;203m"}
18
+
19
+
20
def default_root() -> str:
    """Return the runs root: the TLOG_DIR environment variable, else "./runs"."""
    return os.getenv("TLOG_DIR", "./runs")
22
+
23
+
24
def _resolve_or_die(spec: str | None, root: str) -> store.RunInfo:
    """Resolve a run under ``root`` or exit the process with a message.

    With ``spec`` set, the run is looked up via ``store.resolve_run``; with
    ``spec`` of None, the result of ``store.latest_run`` is used. In either
    case a missing run ends the program through ``sys.exit`` with an
    explanatory string.
    """
    if spec is not None:
        match = store.resolve_run(spec, root)
        if match is None:
            sys.exit(f"tlog: no run matching {spec!r} under {root!r}")
        return match

    newest = store.latest_run(root)
    if newest is None:
        sys.exit(f"tlog: no runs found under {root!r} (set --dir or TLOG_DIR)")
    return newest
34
+
35
+
36
def cmd_ls(args: argparse.Namespace) -> None:
    """Print a table of every run found under the runs root (`tlog ls`).

    The root is ``args.root`` when given, otherwise ``default_root()``. Each
    row shows a status dot, project/name, id, the ``_step`` of the last
    metrics record, the latest loss value, start time, SLURM job id and
    status. The dot is coloured only when stdout is a TTY.

    Column widths are computed from the *visible* text of every cell, so the
    colour escape codes around the dot never affect padding and the header
    row lines up with the data rows whether or not colour is enabled.
    """
    root = args.root or default_root()
    runs = store.find_runs(root)
    if not runs:
        print(f"no runs under {root!r}")
        return

    color = sys.stdout.isatty()
    header = ("", "PROJECT/NAME", "ID", "STEP", "LAST LOSS", "STARTED", "SLURM", "STATUS")
    rows = [header]   # what gets printed (may contain escape codes)
    plain = [header]  # what is visible on screen (used for width math)
    for r in runs:
        metrics_path = r.path / "metrics.jsonl"
        last = store.last_record(metrics_path) or {}
        step = last.get("_step")
        loss_rec = store.last_record(
            metrics_path,
            predicate=lambda rec: any(k.startswith("loss") for k in rec),
        ) or {}
        # Prefer the newest "loss*" value; otherwise fall back to the first
        # non-internal (not "_"-prefixed) value of the last record.
        loss = next(
            (v for k, v in loss_rec.items() if k.startswith("loss")),
            next((v for k, v in last.items() if not k.startswith("_")), None),
        )
        started = datetime.datetime.fromtimestamp(r.created_at).strftime("%m-%d %H:%M")
        slurm = r.meta.get("env", {}).get("slurm", {}).get("SLURM_JOB_ID", "")
        status = r.status
        cells = (
            "●",
            f"{r.project}/{r.name}",
            r.id,
            f"{step:,}" if step is not None else "-",
            f"{loss:.4g}" if isinstance(loss, (int, float)) else "-",
            started,
            slurm,
            status,
        )
        plain.append(cells)
        if color:
            dot = _STATUS_COLOR.get(status, "") + "●" + RESET
            rows.append((dot,) + cells[1:])
        else:
            rows.append(cells)

    widths = [max(len(str(p[i])) for p in plain) for i in range(len(header))]
    for row, p in zip(rows, plain):
        line = " ".join(
            str(c) + " " * (widths[i] - len(str(p[i]))) for i, c in enumerate(row)
        )
        print(line.rstrip())
79
+
80
+
81
def cmd_watch(args: argparse.Namespace) -> None:
    """Handle `tlog watch`: resolve the requested run and start the terminal
    dashboard with the refresh interval and column count from ``args``.

    The TUI module is imported here rather than at module import time.
    """
    from .tui import watch as start_dashboard

    target = _resolve_or_die(args.run, args.dir or default_root())
    start_dashboard(target, interval=args.interval, ncols=args.cols)
87
+
88
+
89
def cmd_tail(args: argparse.Namespace) -> None:
    """Handle `tlog tail`: print the resolved run's captured console lines,
    limited to ``args.lines`` via ``store.read_console``."""
    root = args.dir or default_root()
    target = _resolve_or_die(args.run, root)
    for text in store.read_console(target, max_lines=args.lines):
        print(text)
93
+
94
+
95
def cmd_export(args: argparse.Namespace) -> None:
    """Handle `tlog export`: write the selected runs into one HTML report.

    Runs named on the command line are resolved individually (exiting if any
    is missing); with none named, every run under the root is exported, and
    the command exits if the root has no runs. Prints the output path, its
    size in KB and the run count when done.
    """
    from .export import export_html

    root = args.dir or default_root()
    if args.runs:
        selected = [_resolve_or_die(spec, root) for spec in args.runs]
    else:
        selected = store.find_runs(root)
        if not selected:
            sys.exit(f"tlog: no runs under {root!r}")

    out = export_html(selected, Path(args.output), max_image_px=args.max_image_px)
    size_kb = out.stat().st_size / 1024
    plural = "" if len(selected) == 1 else "s"
    print(f"wrote {out} ({size_kb:,.0f} KB, {len(selected)} run{plural})")
107
+
108
+
109
def cmd_serve(args: argparse.Namespace) -> None:
    """Handle `tlog serve`: start the web dashboard for the runs root on the
    host and port given in ``args``.

    The server module is imported here rather than at module import time.
    """
    from .server import serve as start_server

    root = args.root or default_root()
    start_server(root, host=args.host, port=args.port)
113
+
114
+
115
def cmd_rm(args: argparse.Namespace) -> None:
    """Handle `tlog rm`: delete the resolved run's directory tree.

    Unless ``args.yes`` is set, the user is prompted and anything other than
    "y"/"yes" (case-insensitive, surrounding whitespace ignored) aborts
    without deleting.
    """
    target = _resolve_or_die(args.run, args.dir or default_root())
    confirmed = args.yes or input(f"delete {target.path}? [y/N] ").strip().lower() in ("y", "yes")
    if not confirmed:
        print("aborted")
        return
    shutil.rmtree(target.path)
    print(f"deleted {target.path}")
124
+
125
+
126
def main(argv: list[str] | None = None) -> None:
    """Entry point of the `tlog` command line.

    Builds the argument parser with the ``watch``, ``ls``, ``tail``,
    ``export``, ``serve`` and ``rm`` sub-commands, parses the arguments and
    dispatches to the matching ``cmd_*`` handler. When no sub-command is
    given, the arguments are re-parsed as a ``watch`` invocation.

    Args:
        argv: Arguments to parse, without the program name. ``None`` means
            "use ``sys.argv[1:]``". An explicit empty list is honoured as
            "no arguments" and never falls back to the process's own
            command line.
    """
    parser = argparse.ArgumentParser(
        prog="tlog",
        description="lightweight local experiment logger — view training runs in "
        "the terminal, a browser, or a self-contained HTML file",
    )
    sub = parser.add_subparsers(dest="command")

    p_watch = sub.add_parser("watch", help="live terminal dashboard (default command)")
    p_watch.add_argument("run", nargs="?", help="run dir, id, or name (default: latest run)")
    p_watch.add_argument("--dir", help="runs root (default: $TLOG_DIR or ./runs)")
    p_watch.add_argument("--interval", type=float, default=2.0, help="refresh seconds")
    p_watch.add_argument(
        "--cols", type=int, default=None,
        help="chart columns (default: auto from pane width; keys 1-9/0 at runtime)",
    )
    p_watch.set_defaults(func=cmd_watch)

    p_ls = sub.add_parser("ls", help="list runs")
    p_ls.add_argument("root", nargs="?", help="runs root (default: $TLOG_DIR or ./runs)")
    p_ls.set_defaults(func=cmd_ls)

    p_tail = sub.add_parser("tail", help="show a run's captured console log")
    p_tail.add_argument("run", nargs="?", help="run dir, id, or name (default: latest)")
    p_tail.add_argument("-n", "--lines", type=int, default=50)
    p_tail.add_argument("--dir", help="runs root")
    p_tail.set_defaults(func=cmd_tail)

    p_export = sub.add_parser("export", help="write a self-contained HTML report")
    p_export.add_argument("runs", nargs="*", help="runs to include (default: all)")
    p_export.add_argument("-o", "--output", default="tlog_report.html")
    p_export.add_argument("--dir", help="runs root")
    p_export.add_argument(
        "--max-image-px", type=int, default=512,
        help="downscale embedded images to this max side (0 = keep original)",
    )
    p_export.set_defaults(func=cmd_export)

    p_serve = sub.add_parser("serve", help="live web dashboard (port-forward friendly)")
    p_serve.add_argument("root", nargs="?", help="runs root (default: $TLOG_DIR or ./runs)")
    p_serve.add_argument("-p", "--port", type=int, default=8585)
    p_serve.add_argument("--host", default="127.0.0.1")
    p_serve.set_defaults(func=cmd_serve)

    p_rm = sub.add_parser("rm", help="delete a run directory")
    p_rm.add_argument("run", help="run dir, id, or name")
    p_rm.add_argument("-y", "--yes", action="store_true", help="skip confirmation")
    p_rm.add_argument("--dir", help="runs root")
    p_rm.set_defaults(func=cmd_rm)

    # Resolve the argument list once. Testing `is None` (not truthiness) keeps
    # an explicit empty list from being replaced by the real sys.argv.
    cli_args = sys.argv[1:] if argv is None else list(argv)
    args = parser.parse_args(cli_args)
    if args.command is None:  # bare `tlog` -> watch latest
        args = parser.parse_args(["watch", *cli_args])
    args.func(args)
180
+
181
+
182
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()
tlog/console.py ADDED
@@ -0,0 +1,71 @@
1
+ """Tee stdout/stderr of the training process into the run directory."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ import threading
7
+ from pathlib import Path
8
+ from typing import TextIO
9
+
10
+
11
class _Tee:
    """File-like wrapper that mirrors writes to the original stream and a log
    file. Exposes enough of the TextIO surface for print/tqdm/logging.

    Writes always go to the wrapped stream first; the copy to the log file is
    best-effort (skipped once the log file is closed, and OSError/ValueError
    from it are ignored) so logging problems never break the program's own
    output. Access to the log file is serialised with the shared lock.
    """

    def __init__(self, stream: TextIO, logfile: TextIO, lock: threading.Lock):
        self._stream = stream
        self._logfile = logfile
        self._lock = lock

    def write(self, data: str) -> int:
        """Write ``data`` to the stream and mirror it to the log file.

        Returns whatever the wrapped stream's ``write`` returned.
        """
        n = self._stream.write(data)
        with self._lock:
            if not self._logfile.closed:
                try:
                    self._logfile.write(data)
                except (OSError, ValueError):
                    pass  # best-effort mirror; never fail the caller's write
        return n

    def flush(self) -> None:
        """Flush the stream, then the log file if it is still open."""
        self._stream.flush()
        with self._lock:
            if not self._logfile.closed:
                try:
                    self._logfile.flush()
                except (OSError, ValueError):
                    pass  # best-effort, same as write()

    def isatty(self) -> bool:
        """Report whether the wrapped stream is a TTY."""
        return self._stream.isatty()

    def fileno(self) -> int:
        """Return the wrapped stream's file descriptor."""
        return self._stream.fileno()

    @property
    def encoding(self):
        """Encoding of the wrapped stream, or "utf-8" if it has none."""
        return getattr(self._stream, "encoding", "utf-8")

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails. If `_stream` itself
        # is missing (instance created without __init__, e.g. by copy/pickle),
        # delegating would re-enter this method without end, so report the
        # attribute as absent instead.
        if name == "_stream":
            raise AttributeError(name)
        return getattr(self._stream, name)
51
+
52
+
53
class ConsoleCapture:
    """Redirect ``sys.stdout`` and ``sys.stderr`` through ``_Tee`` wrappers
    that also append everything to the log file at ``path``.

    Capture starts on construction; ``stop()`` restores the saved streams and
    closes the log file.
    """

    def __init__(self, path: Path):
        # line-buffered so `tlog tail`/viewers see output promptly
        self._logfile = open(path, "a", buffering=1, encoding="utf-8", errors="replace")
        self._lock = threading.Lock()
        self._orig_stdout, self._orig_stderr = sys.stdout, sys.stderr
        sys.stdout = _Tee(self._orig_stdout, self._logfile, self._lock)
        sys.stderr = _Tee(self._orig_stderr, self._logfile, self._lock)

    def stop(self) -> None:
        """Put back the saved streams and close the log file.

        A stream is restored only if it is currently a ``_Tee``. Closing is
        skipped when the log file is already closed.
        """
        saved = (("stdout", self._orig_stdout), ("stderr", self._orig_stderr))
        for attr, original in saved:
            if isinstance(getattr(sys, attr), _Tee):
                setattr(sys, attr, original)
        with self._lock:
            if self._logfile.closed:
                return
            self._logfile.flush()
            self._logfile.close()
tlog/export.py ADDED
@@ -0,0 +1,93 @@
1
+ """`tlog export` — render runs into one self-contained HTML file.
2
+
3
+ Everything (frontend, uPlot, metric data, images as base64) is inlined, so the
4
+ file can be opened in VS Code's preview, scp'd to a laptop, or attached to a
5
+ message with no server and no internet access.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import base64
11
+ import datetime
12
+ import io
13
+ import json
14
+ from pathlib import Path
15
+
16
+ from .payload import run_media, run_metrics, run_summary
17
+ from .store import RunInfo, read_console
18
+
19
# Directory named "frontend" next to this module; render_template() reads its assets from here.
FRONTEND = Path(__file__).parent / "frontend"
20
+
21
+
22
def _data_uri(png_path: Path, max_px: int) -> str | None:
    """Return the file at ``png_path`` as a ``data:image/png;base64,...`` URI.

    Returns None when the file cannot be read. When ``max_px`` is positive
    and Pillow is importable, an image whose longer side exceeds ``max_px``
    is shrunk to fit and re-encoded as PNG; any failure in that step leaves
    the original bytes in use.
    """
    try:
        payload = png_path.read_bytes()
    except OSError:
        return None

    if max_px > 0:
        # Best-effort downscale to keep the report small.
        try:
            from PIL import Image

            picture = Image.open(io.BytesIO(payload))
            if max(picture.size) > max_px:
                picture.thumbnail((max_px, max_px))
                shrunk = io.BytesIO()
                picture.save(shrunk, format="PNG")
                payload = shrunk.getvalue()
        except Exception:  # also covers ImportError when Pillow is absent
            pass

    encoded = base64.b64encode(payload).decode("ascii")
    return "data:image/png;base64," + encoded
42
+
43
+
44
def build_data(runs: list[RunInfo], max_image_px: int = 512) -> dict:
    """Assemble the data payload embedded in an exported report.

    For each run this collects its summary, metrics, media records (with
    every image file replaced by an inline data URI) and up to 300 console
    lines joined with newlines. Image files that cannot be loaded are
    dropped, and a media record is kept only if at least one image loaded.

    Returns a dict with ``generated_at`` (local time, ISO format, second
    precision) and ``runs`` (one summary dict per run).
    """
    payload_runs = []
    for info in runs:
        summary = run_summary(info)
        summary["metrics"] = run_metrics(info)

        media = []
        for rec in run_media(info):
            uris = (
                _data_uri(info.path / "media" / rel, max_image_px)
                for rel in rec["files"]
            )
            files = [uri for uri in uris if uri]
            if files:
                # Copy the record so the source record is not mutated.
                media.append(dict(rec, files=files))
        summary["media"] = media

        summary["console"] = "\n".join(read_console(info, max_lines=300))
        payload_runs.append(summary)

    generated_at = datetime.datetime.now().isoformat(timespec="seconds")
    return {"generated_at": generated_at, "runs": payload_runs}
66
+
67
+
68
def render_template(mode: str, data: dict | None, title: str = "tlog") -> str:
    """Fill ``frontend/index.html`` with the inlined assets and run data.

    The template's ``{{TITLE}}``, ``{{UPLOT_CSS}}``, ``{{CSS}}``,
    ``{{UPLOT_JS}}``, ``{{MODE}}``, ``{{DATA}}`` and ``{{APP_JS}}``
    placeholders are substituted in a single pass, so text that was just
    inserted (for example run data containing the literal ``{{APP_JS}}``) is
    never scanned for further placeholders.

    Args:
        mode: Value substituted for ``{{MODE}}``.
        data: Payload serialised to compact JSON for ``{{DATA}}``; ``None``
            becomes the literal ``null``.
        title: Value substituted for ``{{TITLE}}``.

    Returns:
        The complete HTML document as a string.

    Raises:
        OSError: If a frontend asset cannot be read.
    """
    import re

    def asset(*parts: str) -> str:
        # Read as UTF-8 explicitly: the report is written as UTF-8, so the
        # assets must not be decoded with a platform-dependent default.
        return FRONTEND.joinpath(*parts).read_text(encoding="utf-8")

    if data is None:
        data_json = "null"
    else:
        # Escape "</" so the JSON cannot terminate the enclosing <script>.
        data_json = json.dumps(data, separators=(",", ":")).replace("</", "<\\/")

    values = {
        "TITLE": title,
        "UPLOT_CSS": asset("vendor", "uplot.min.css"),
        "CSS": asset("style.css"),
        "UPLOT_JS": asset("vendor", "uplot.min.js"),
        "MODE": mode,
        "DATA": data_json,
        "APP_JS": asset("app.js"),
    }
    placeholder = re.compile(r"\{\{(" + "|".join(map(re.escape, values)) + r")\}\}")
    # A callable replacement inserts each value verbatim (no backslash or
    # group-reference processing).
    return placeholder.sub(lambda m: values[m.group(1)], asset("index.html"))
82
+
83
+
84
def export_html(
    runs: list[RunInfo], output: Path, max_image_px: int = 512
) -> Path:
    """Render ``runs`` into a single HTML report at ``output``.

    The page title lists the names of the first three runs, followed by
    `` +N`` when more than three runs are included. The file is written as
    UTF-8.

    Returns:
        ``output`` as a ``Path``.
    """
    data = build_data(runs, max_image_px=max_image_px)

    shown = ", ".join(r.name for r in runs[:3])
    extra = len(runs) - 3
    overflow = f" +{extra}" if extra > 0 else ""
    title = "tlog — " + shown + overflow

    destination = Path(output)
    destination.write_text(render_template("export", data, title), encoding="utf-8")
    return destination