tinytrainlog 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tinytrainlog
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Tiny train logger
5
5
  Keywords: pytorch,metrics,logging,training,machine-learning
6
6
  Author: jdh
@@ -10,4 +10,9 @@ Requires-Python: >=3.12
10
10
  Project-URL: Homepage, https://github.com/jdhouseholder/tinytrainlog
11
11
  Description-Content-Type: text/markdown
12
12
 
13
- # 🚅🚅🚅 Tiny Train Log 🚅🚅🚅
13
+ # 🚅🪵 Tiny Train Log 🚅🪵
14
+
15
+ The *structured log* for multi-server, research-scale ML project metrics.
16
+ Stores runs in a queryable and mergeable database for easy multi-server data collection!
17
+
18
+ ## Finally, SQL-based *post-hoc* analysis for my research projects!
@@ -0,0 +1,6 @@
1
+ # 🚅🪵 Tiny Train Log 🚅🪵
2
+
3
+ The *structured log* for multi-server, research-scale ML project metrics.
4
+ Stores runs in a queryable and mergeable database for easy multi-server data collection!
5
+
6
+ ## Finally, SQL-based *post-hoc* analysis for my research projects!
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "tinytrainlog"
3
- version = "0.1.1"
3
+ version = "0.1.2"
4
4
  description = "Tiny train logger"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -1,5 +1,5 @@
1
1
  import random
2
- from pathlib import Path
2
+ import sqlite3
3
3
 
4
4
  ADJECTIVES = [
5
5
  "bold",
@@ -108,8 +108,8 @@ NOUNS = [
108
108
  ]
109
109
 
110
110
 
111
- def generate_run_name(root_dir: Path, max_attempts: int = 100) -> str:
112
- existing = {p.name for p in root_dir.iterdir()} if root_dir.exists() else set()
111
+ def generate_run_name(conn: sqlite3.Connection, max_attempts: int = 100) -> str:
112
+ existing = {row[0] for row in conn.execute("SELECT name FROM runs").fetchall()}
113
113
  for _ in range(max_attempts):
114
114
  name = f"{random.choice(ADJECTIVES)}-{random.choice(NOUNS)}"
115
115
  if name not in existing:
@@ -0,0 +1,234 @@
1
+ import json
2
+ import shutil
3
+ import socket
4
+ import sqlite3
5
+ from pathlib import Path
6
+
7
+ from ._names import generate_run_name
8
+
9
+ _SCHEMA = """\
10
+ CREATE TABLE IF NOT EXISTS runs (
11
+ name TEXT PRIMARY KEY,
12
+ machine_id TEXT,
13
+ created_at TEXT NOT NULL DEFAULT (datetime('now'))
14
+ );
15
+ CREATE TABLE IF NOT EXISTS config (
16
+ run_name TEXT NOT NULL REFERENCES runs(name),
17
+ key TEXT NOT NULL,
18
+ value TEXT NOT NULL,
19
+ updated_at TEXT NOT NULL DEFAULT (datetime('now')),
20
+ UNIQUE(run_name, key)
21
+ );
22
+ CREATE TABLE IF NOT EXISTS tags (
23
+ run_name TEXT NOT NULL REFERENCES runs(name),
24
+ tag TEXT NOT NULL,
25
+ created_at TEXT NOT NULL DEFAULT (datetime('now')),
26
+ UNIQUE(run_name, tag)
27
+ );
28
+ CREATE TABLE IF NOT EXISTS steps (
29
+ run_name TEXT NOT NULL REFERENCES runs(name),
30
+ step INTEGER NOT NULL,
31
+ key TEXT NOT NULL,
32
+ value REAL NOT NULL,
33
+ created_at TEXT NOT NULL DEFAULT (datetime('now'))
34
+ );
35
+ CREATE TABLE IF NOT EXISTS epochs (
36
+ run_name TEXT NOT NULL REFERENCES runs(name),
37
+ epoch INTEGER NOT NULL,
38
+ key TEXT NOT NULL,
39
+ value REAL NOT NULL,
40
+ created_at TEXT NOT NULL DEFAULT (datetime('now'))
41
+ );
42
+ CREATE TABLE IF NOT EXISTS eval (
43
+ run_name TEXT NOT NULL REFERENCES runs(name),
44
+ step INTEGER,
45
+ epoch INTEGER,
46
+ key TEXT NOT NULL,
47
+ value REAL NOT NULL,
48
+ created_at TEXT NOT NULL DEFAULT (datetime('now'))
49
+ );
50
+ CREATE TABLE IF NOT EXISTS test (
51
+ run_name TEXT NOT NULL REFERENCES runs(name),
52
+ key TEXT NOT NULL,
53
+ value REAL NOT NULL,
54
+ updated_at TEXT NOT NULL DEFAULT (datetime('now')),
55
+ UNIQUE(run_name, key)
56
+ );
57
+ """
58
+
59
+ _DATA_TABLES = ("config", "tags", "steps", "epochs", "eval", "test")
60
+
61
+
62
+ class MetricsLogger:
63
+ def __init__(
64
+ self,
65
+ root_dir: str | Path,
66
+ run_name: str | None = None,
67
+ machine_id: str | None = None,
68
+ ):
69
+ self.root_dir = Path(root_dir)
70
+ self.root_dir.mkdir(parents=True, exist_ok=True)
71
+ self.machine_id = machine_id or socket.gethostname()
72
+
73
+ self._db_path = self.root_dir / "runs.db"
74
+ self._conn = sqlite3.connect(self._db_path)
75
+ self._conn.executescript(_SCHEMA)
76
+
77
+ if run_name is None:
78
+ run_name = generate_run_name(self._conn)
79
+ self.run_name = run_name
80
+
81
+ self._conn.execute(
82
+ "INSERT INTO runs (name, machine_id) VALUES (?, ?)"
83
+ " ON CONFLICT(name) DO UPDATE SET machine_id = excluded.machine_id",
84
+ (self.run_name, self.machine_id),
85
+ )
86
+ self._conn.commit()
87
+
88
+ self._checkpoint_dir = self.root_dir / self.run_name / "checkpoints"
89
+ self._checkpoint_dir.mkdir(parents=True, exist_ok=True)
90
+
91
+ @property
92
+ def run_dir(self) -> Path:
93
+ return self.root_dir / self.run_name
94
+
95
+ def set_config(self, config: dict) -> None:
96
+ self._conn.executemany(
97
+ "INSERT INTO config (run_name, key, value) VALUES (?, ?, ?)"
98
+ " ON CONFLICT(run_name, key) DO UPDATE SET value = excluded.value,"
99
+ " updated_at = datetime('now')",
100
+ [(self.run_name, k, json.dumps(v)) for k, v in config.items()],
101
+ )
102
+ self._conn.commit()
103
+
104
+ def add_tags(self, tags: list[str]) -> None:
105
+ self._conn.executemany(
106
+ "INSERT OR IGNORE INTO tags (run_name, tag) VALUES (?, ?)",
107
+ [(self.run_name, tag) for tag in tags],
108
+ )
109
+ self._conn.commit()
110
+
111
+ def log_step(self, step: int, **metrics) -> None:
112
+ self._conn.executemany(
113
+ "INSERT INTO steps (run_name, step, key, value) VALUES (?, ?, ?, ?)",
114
+ [(self.run_name, step, k, v) for k, v in metrics.items()],
115
+ )
116
+ self._conn.commit()
117
+
118
+ def log_epoch(self, epoch: int, **metrics) -> None:
119
+ self._conn.executemany(
120
+ "INSERT INTO epochs (run_name, epoch, key, value) VALUES (?, ?, ?, ?)",
121
+ [(self.run_name, epoch, k, v) for k, v in metrics.items()],
122
+ )
123
+ self._conn.commit()
124
+
125
+ def log_eval(
126
+ self, *, step: int | None = None, epoch: int | None = None, **metrics
127
+ ) -> None:
128
+ if step is None and epoch is None:
129
+ raise ValueError("At least one of 'step' or 'epoch' must be provided.")
130
+ self._conn.executemany(
131
+ "INSERT INTO eval (run_name, step, epoch, key, value) VALUES (?, ?, ?, ?, ?)",
132
+ [(self.run_name, step, epoch, k, v) for k, v in metrics.items()],
133
+ )
134
+ self._conn.commit()
135
+
136
+ def log_test(self, **metrics) -> None:
137
+ self._conn.executemany(
138
+ "INSERT INTO test (run_name, key, value) VALUES (?, ?, ?)"
139
+ " ON CONFLICT(run_name, key) DO UPDATE SET value = excluded.value,"
140
+ " updated_at = datetime('now')",
141
+ [(self.run_name, k, v) for k, v in metrics.items()],
142
+ )
143
+ self._conn.commit()
144
+
145
+ def checkpoint_path(
146
+ self, step: int | None = None, epoch: int | None = None
147
+ ) -> Path:
148
+ if (step is None) == (epoch is None):
149
+ raise ValueError("Exactly one of 'step' or 'epoch' must be provided.")
150
+ if step is not None:
151
+ return self._checkpoint_dir / f"step_{step}.pt"
152
+ return self._checkpoint_dir / f"epoch_{epoch}.pt"
153
+
154
+ @property
155
+ def checkpoint_dir(self) -> Path:
156
+ return self._checkpoint_dir
157
+
158
+ def delete_run(self, run_name: str) -> None:
159
+ self._conn.execute("BEGIN")
160
+ try:
161
+ for table in _DATA_TABLES:
162
+ self._conn.execute(
163
+ f"DELETE FROM {table} WHERE run_name = ?", (run_name,)
164
+ )
165
+ self._conn.execute("DELETE FROM runs WHERE name = ?", (run_name,))
166
+ self._conn.execute("COMMIT")
167
+ except Exception:
168
+ self._conn.execute("ROLLBACK")
169
+ raise
170
+
171
+ run_dir = self.root_dir / run_name
172
+ if run_dir.exists():
173
+ shutil.rmtree(run_dir)
174
+
175
+ @staticmethod
176
+ def merge(target_dir: str | Path, source_dir: str | Path) -> None:
177
+ target_dir = Path(target_dir)
178
+ source_dir = Path(source_dir)
179
+ target_db = target_dir / "runs.db"
180
+ source_db = source_dir / "runs.db"
181
+
182
+ if not source_db.exists():
183
+ raise FileNotFoundError(f"No runs.db found in {source_dir}")
184
+
185
+ target_dir.mkdir(parents=True, exist_ok=True)
186
+ conn = sqlite3.connect(target_db)
187
+ conn.executescript(_SCHEMA)
188
+ conn.execute("ATTACH DATABASE ? AS other", (str(source_db),))
189
+
190
+ # Check for name conflicts
191
+ conflicts = conn.execute(
192
+ "SELECT o.name, o.machine_id, m.machine_id"
193
+ " FROM other.runs o INNER JOIN main.runs m ON o.name = m.name"
194
+ ).fetchall()
195
+ if conflicts:
196
+ conn.execute("DETACH DATABASE other")
197
+ conn.close()
198
+ details = ", ".join(
199
+ f"'{name}' (source: {src or '?'}, target: {tgt or '?'})"
200
+ for name, src, tgt in conflicts
201
+ )
202
+ raise ValueError(
203
+ f"Run name conflicts: {details}. "
204
+ f"Rename the conflicting runs before merging."
205
+ )
206
+
207
+ try:
208
+ conn.execute("BEGIN")
209
+ conn.execute("INSERT INTO main.runs SELECT * FROM other.runs")
210
+ for table in _DATA_TABLES:
211
+ conn.execute(f"INSERT INTO main.{table} SELECT * FROM other.{table}")
212
+ conn.execute("COMMIT")
213
+ except Exception:
214
+ conn.execute("ROLLBACK")
215
+ raise
216
+ finally:
217
+ conn.execute("DETACH DATABASE other")
218
+ conn.close()
219
+
220
+ # Copy checkpoint directories
221
+ for run_dir in source_dir.iterdir():
222
+ if run_dir.is_dir():
223
+ target_run_dir = target_dir / run_dir.name
224
+ if not target_run_dir.exists():
225
+ shutil.copytree(run_dir, target_run_dir)
226
+
227
+ def close(self) -> None:
228
+ self._conn.close()
229
+
230
+ def __enter__(self):
231
+ return self
232
+
233
+ def __exit__(self, *exc):
234
+ self.close()
@@ -1 +0,0 @@
1
- # 🚅🚅🚅 Tiny Train Log 🚅🚅🚅
@@ -1,53 +0,0 @@
1
- import json
2
- from pathlib import Path
3
-
4
- from ._names import generate_run_name
5
-
6
-
7
- class MetricsLogger:
8
- def __init__(self, root_dir: str | Path, run_name: str | None = None):
9
- self.root_dir = Path(root_dir)
10
- if run_name is None:
11
- run_name = generate_run_name(self.root_dir)
12
- self.run_name = run_name
13
- self.run_dir = self.root_dir / self.run_name
14
- self.run_dir.mkdir(parents=True, exist_ok=True)
15
- self._checkpoint_dir = self.run_dir / "checkpoints"
16
- self._checkpoint_dir.mkdir(exist_ok=True)
17
-
18
- def set_config(self, config: dict) -> None:
19
- (self.run_dir / "config.json").write_text(json.dumps(config, indent=2) + "\n")
20
-
21
- def add_tags(self, tags: list[str]) -> None:
22
- tags_path = self.run_dir / "tags.json"
23
- if tags_path.exists():
24
- existing = json.loads(tags_path.read_text())
25
- else:
26
- existing = []
27
- seen = set(existing)
28
- for tag in tags:
29
- if tag not in seen:
30
- existing.append(tag)
31
- seen.add(tag)
32
- tags_path.write_text(json.dumps(existing, indent=2) + "\n")
33
-
34
- def log_step(self, step: int, **metrics) -> None:
35
- with open(self.run_dir / "steps.jsonl", "a") as f:
36
- f.write(json.dumps({"step": step, **metrics}) + "\n")
37
-
38
- def log_epoch(self, epoch: int, **metrics) -> None:
39
- with open(self.run_dir / "epochs.jsonl", "a") as f:
40
- f.write(json.dumps({"epoch": epoch, **metrics}) + "\n")
41
-
42
- def checkpoint_path(
43
- self, step: int | None = None, epoch: int | None = None
44
- ) -> Path:
45
- if (step is None) == (epoch is None):
46
- raise ValueError("Exactly one of 'step' or 'epoch' must be provided.")
47
- if step is not None:
48
- return self._checkpoint_dir / f"step_{step}.pt"
49
- return self._checkpoint_dir / f"epoch_{epoch}.pt"
50
-
51
- @property
52
- def checkpoint_dir(self) -> Path:
53
- return self._checkpoint_dir