tinytrainlog 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tinytrainlog-0.1.1 → tinytrainlog-0.1.2}/PKG-INFO +7 -2
- tinytrainlog-0.1.2/README.md +6 -0
- {tinytrainlog-0.1.1 → tinytrainlog-0.1.2}/pyproject.toml +1 -1
- tinytrainlog-0.1.2/src/tinytrainlog/.ruff_cache/0.15.8/17101086425162019420 +0 -0
- {tinytrainlog-0.1.1 → tinytrainlog-0.1.2}/src/tinytrainlog/_names.py +3 -3
- tinytrainlog-0.1.2/src/tinytrainlog/metrics_logger.py +234 -0
- tinytrainlog-0.1.1/README.md +0 -1
- tinytrainlog-0.1.1/src/tinytrainlog/metrics_logger.py +0 -53
- {tinytrainlog-0.1.1 → tinytrainlog-0.1.2}/src/tinytrainlog/.ruff_cache/.gitignore +0 -0
- {tinytrainlog-0.1.1 → tinytrainlog-0.1.2}/src/tinytrainlog/.ruff_cache/0.15.8/13173811450099868753 +0 -0
- {tinytrainlog-0.1.1 → tinytrainlog-0.1.2}/src/tinytrainlog/.ruff_cache/CACHEDIR.TAG +0 -0
- {tinytrainlog-0.1.1 → tinytrainlog-0.1.2}/src/tinytrainlog/__init__.py +0 -0
- {tinytrainlog-0.1.1 → tinytrainlog-0.1.2}/src/tinytrainlog/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tinytrainlog
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Tiny train logger
|
|
5
5
|
Keywords: pytorch,metrics,logging,training,machine-learning
|
|
6
6
|
Author: jdh
|
|
@@ -10,4 +10,9 @@ Requires-Python: >=3.12
|
|
|
10
10
|
Project-URL: Homepage, https://github.com/jdhouseholder/tinytrainlog
|
|
11
11
|
Description-Content-Type: text/markdown
|
|
12
12
|
|
|
13
|
-
# 🚅🚅🚅 Tiny Train Log 🚅🚅🚅
|
|
13
|
+
# 🚅🪵 Tiny Train Log 🚅🪵
|
|
14
|
+
|
|
15
|
+
The *structured log* for multi-server, research-scale ML project metrics.
|
|
16
|
+
Stores runs in a queryable and mergeable database for easy multi-server data collection!
|
|
17
|
+
|
|
18
|
+
## Finally SQL based *post-hoc* analysis for my research projects!
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import random
|
|
2
|
-
|
|
2
|
+
import sqlite3
|
|
3
3
|
|
|
4
4
|
ADJECTIVES = [
|
|
5
5
|
"bold",
|
|
@@ -108,8 +108,8 @@ NOUNS = [
|
|
|
108
108
|
]
|
|
109
109
|
|
|
110
110
|
|
|
111
|
-
def generate_run_name(
|
|
112
|
-
existing = {
|
|
111
|
+
def generate_run_name(conn: sqlite3.Connection, max_attempts: int = 100) -> str:
|
|
112
|
+
existing = {row[0] for row in conn.execute("SELECT name FROM runs").fetchall()}
|
|
113
113
|
for _ in range(max_attempts):
|
|
114
114
|
name = f"{random.choice(ADJECTIVES)}-{random.choice(NOUNS)}"
|
|
115
115
|
if name not in existing:
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import shutil
|
|
3
|
+
import socket
|
|
4
|
+
import sqlite3
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from ._names import generate_run_name
|
|
8
|
+
|
|
9
|
+
_SCHEMA = """\
|
|
10
|
+
CREATE TABLE IF NOT EXISTS runs (
|
|
11
|
+
name TEXT PRIMARY KEY,
|
|
12
|
+
machine_id TEXT,
|
|
13
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
14
|
+
);
|
|
15
|
+
CREATE TABLE IF NOT EXISTS config (
|
|
16
|
+
run_name TEXT NOT NULL REFERENCES runs(name),
|
|
17
|
+
key TEXT NOT NULL,
|
|
18
|
+
value TEXT NOT NULL,
|
|
19
|
+
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
20
|
+
UNIQUE(run_name, key)
|
|
21
|
+
);
|
|
22
|
+
CREATE TABLE IF NOT EXISTS tags (
|
|
23
|
+
run_name TEXT NOT NULL REFERENCES runs(name),
|
|
24
|
+
tag TEXT NOT NULL,
|
|
25
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
26
|
+
UNIQUE(run_name, tag)
|
|
27
|
+
);
|
|
28
|
+
CREATE TABLE IF NOT EXISTS steps (
|
|
29
|
+
run_name TEXT NOT NULL REFERENCES runs(name),
|
|
30
|
+
step INTEGER NOT NULL,
|
|
31
|
+
key TEXT NOT NULL,
|
|
32
|
+
value REAL NOT NULL,
|
|
33
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
34
|
+
);
|
|
35
|
+
CREATE TABLE IF NOT EXISTS epochs (
|
|
36
|
+
run_name TEXT NOT NULL REFERENCES runs(name),
|
|
37
|
+
epoch INTEGER NOT NULL,
|
|
38
|
+
key TEXT NOT NULL,
|
|
39
|
+
value REAL NOT NULL,
|
|
40
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
41
|
+
);
|
|
42
|
+
CREATE TABLE IF NOT EXISTS eval (
|
|
43
|
+
run_name TEXT NOT NULL REFERENCES runs(name),
|
|
44
|
+
step INTEGER,
|
|
45
|
+
epoch INTEGER,
|
|
46
|
+
key TEXT NOT NULL,
|
|
47
|
+
value REAL NOT NULL,
|
|
48
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
49
|
+
);
|
|
50
|
+
CREATE TABLE IF NOT EXISTS test (
|
|
51
|
+
run_name TEXT NOT NULL REFERENCES runs(name),
|
|
52
|
+
key TEXT NOT NULL,
|
|
53
|
+
value REAL NOT NULL,
|
|
54
|
+
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
55
|
+
UNIQUE(run_name, key)
|
|
56
|
+
);
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
_DATA_TABLES = ("config", "tags", "steps", "epochs", "eval", "test")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class MetricsLogger:
|
|
63
|
+
def __init__(
|
|
64
|
+
self,
|
|
65
|
+
root_dir: str | Path,
|
|
66
|
+
run_name: str | None = None,
|
|
67
|
+
machine_id: str | None = None,
|
|
68
|
+
):
|
|
69
|
+
self.root_dir = Path(root_dir)
|
|
70
|
+
self.root_dir.mkdir(parents=True, exist_ok=True)
|
|
71
|
+
self.machine_id = machine_id or socket.gethostname()
|
|
72
|
+
|
|
73
|
+
self._db_path = self.root_dir / "runs.db"
|
|
74
|
+
self._conn = sqlite3.connect(self._db_path)
|
|
75
|
+
self._conn.executescript(_SCHEMA)
|
|
76
|
+
|
|
77
|
+
if run_name is None:
|
|
78
|
+
run_name = generate_run_name(self._conn)
|
|
79
|
+
self.run_name = run_name
|
|
80
|
+
|
|
81
|
+
self._conn.execute(
|
|
82
|
+
"INSERT INTO runs (name, machine_id) VALUES (?, ?)"
|
|
83
|
+
" ON CONFLICT(name) DO UPDATE SET machine_id = excluded.machine_id",
|
|
84
|
+
(self.run_name, self.machine_id),
|
|
85
|
+
)
|
|
86
|
+
self._conn.commit()
|
|
87
|
+
|
|
88
|
+
self._checkpoint_dir = self.root_dir / self.run_name / "checkpoints"
|
|
89
|
+
self._checkpoint_dir.mkdir(parents=True, exist_ok=True)
|
|
90
|
+
|
|
91
|
+
@property
|
|
92
|
+
def run_dir(self) -> Path:
|
|
93
|
+
return self.root_dir / self.run_name
|
|
94
|
+
|
|
95
|
+
def set_config(self, config: dict) -> None:
|
|
96
|
+
self._conn.executemany(
|
|
97
|
+
"INSERT INTO config (run_name, key, value) VALUES (?, ?, ?)"
|
|
98
|
+
" ON CONFLICT(run_name, key) DO UPDATE SET value = excluded.value,"
|
|
99
|
+
" updated_at = datetime('now')",
|
|
100
|
+
[(self.run_name, k, json.dumps(v)) for k, v in config.items()],
|
|
101
|
+
)
|
|
102
|
+
self._conn.commit()
|
|
103
|
+
|
|
104
|
+
def add_tags(self, tags: list[str]) -> None:
|
|
105
|
+
self._conn.executemany(
|
|
106
|
+
"INSERT OR IGNORE INTO tags (run_name, tag) VALUES (?, ?)",
|
|
107
|
+
[(self.run_name, tag) for tag in tags],
|
|
108
|
+
)
|
|
109
|
+
self._conn.commit()
|
|
110
|
+
|
|
111
|
+
def log_step(self, step: int, **metrics) -> None:
|
|
112
|
+
self._conn.executemany(
|
|
113
|
+
"INSERT INTO steps (run_name, step, key, value) VALUES (?, ?, ?, ?)",
|
|
114
|
+
[(self.run_name, step, k, v) for k, v in metrics.items()],
|
|
115
|
+
)
|
|
116
|
+
self._conn.commit()
|
|
117
|
+
|
|
118
|
+
def log_epoch(self, epoch: int, **metrics) -> None:
|
|
119
|
+
self._conn.executemany(
|
|
120
|
+
"INSERT INTO epochs (run_name, epoch, key, value) VALUES (?, ?, ?, ?)",
|
|
121
|
+
[(self.run_name, epoch, k, v) for k, v in metrics.items()],
|
|
122
|
+
)
|
|
123
|
+
self._conn.commit()
|
|
124
|
+
|
|
125
|
+
def log_eval(
|
|
126
|
+
self, *, step: int | None = None, epoch: int | None = None, **metrics
|
|
127
|
+
) -> None:
|
|
128
|
+
if step is None and epoch is None:
|
|
129
|
+
raise ValueError("At least one of 'step' or 'epoch' must be provided.")
|
|
130
|
+
self._conn.executemany(
|
|
131
|
+
"INSERT INTO eval (run_name, step, epoch, key, value) VALUES (?, ?, ?, ?, ?)",
|
|
132
|
+
[(self.run_name, step, epoch, k, v) for k, v in metrics.items()],
|
|
133
|
+
)
|
|
134
|
+
self._conn.commit()
|
|
135
|
+
|
|
136
|
+
def log_test(self, **metrics) -> None:
|
|
137
|
+
self._conn.executemany(
|
|
138
|
+
"INSERT INTO test (run_name, key, value) VALUES (?, ?, ?)"
|
|
139
|
+
" ON CONFLICT(run_name, key) DO UPDATE SET value = excluded.value,"
|
|
140
|
+
" updated_at = datetime('now')",
|
|
141
|
+
[(self.run_name, k, v) for k, v in metrics.items()],
|
|
142
|
+
)
|
|
143
|
+
self._conn.commit()
|
|
144
|
+
|
|
145
|
+
def checkpoint_path(
|
|
146
|
+
self, step: int | None = None, epoch: int | None = None
|
|
147
|
+
) -> Path:
|
|
148
|
+
if (step is None) == (epoch is None):
|
|
149
|
+
raise ValueError("Exactly one of 'step' or 'epoch' must be provided.")
|
|
150
|
+
if step is not None:
|
|
151
|
+
return self._checkpoint_dir / f"step_{step}.pt"
|
|
152
|
+
return self._checkpoint_dir / f"epoch_{epoch}.pt"
|
|
153
|
+
|
|
154
|
+
@property
|
|
155
|
+
def checkpoint_dir(self) -> Path:
|
|
156
|
+
return self._checkpoint_dir
|
|
157
|
+
|
|
158
|
+
def delete_run(self, run_name: str) -> None:
|
|
159
|
+
self._conn.execute("BEGIN")
|
|
160
|
+
try:
|
|
161
|
+
for table in _DATA_TABLES:
|
|
162
|
+
self._conn.execute(
|
|
163
|
+
f"DELETE FROM {table} WHERE run_name = ?", (run_name,)
|
|
164
|
+
)
|
|
165
|
+
self._conn.execute("DELETE FROM runs WHERE name = ?", (run_name,))
|
|
166
|
+
self._conn.execute("COMMIT")
|
|
167
|
+
except Exception:
|
|
168
|
+
self._conn.execute("ROLLBACK")
|
|
169
|
+
raise
|
|
170
|
+
|
|
171
|
+
run_dir = self.root_dir / run_name
|
|
172
|
+
if run_dir.exists():
|
|
173
|
+
shutil.rmtree(run_dir)
|
|
174
|
+
|
|
175
|
+
@staticmethod
|
|
176
|
+
def merge(target_dir: str | Path, source_dir: str | Path) -> None:
|
|
177
|
+
target_dir = Path(target_dir)
|
|
178
|
+
source_dir = Path(source_dir)
|
|
179
|
+
target_db = target_dir / "runs.db"
|
|
180
|
+
source_db = source_dir / "runs.db"
|
|
181
|
+
|
|
182
|
+
if not source_db.exists():
|
|
183
|
+
raise FileNotFoundError(f"No runs.db found in {source_dir}")
|
|
184
|
+
|
|
185
|
+
target_dir.mkdir(parents=True, exist_ok=True)
|
|
186
|
+
conn = sqlite3.connect(target_db)
|
|
187
|
+
conn.executescript(_SCHEMA)
|
|
188
|
+
conn.execute("ATTACH DATABASE ? AS other", (str(source_db),))
|
|
189
|
+
|
|
190
|
+
# Check for name conflicts
|
|
191
|
+
conflicts = conn.execute(
|
|
192
|
+
"SELECT o.name, o.machine_id, m.machine_id"
|
|
193
|
+
" FROM other.runs o INNER JOIN main.runs m ON o.name = m.name"
|
|
194
|
+
).fetchall()
|
|
195
|
+
if conflicts:
|
|
196
|
+
conn.execute("DETACH DATABASE other")
|
|
197
|
+
conn.close()
|
|
198
|
+
details = ", ".join(
|
|
199
|
+
f"'{name}' (source: {src or '?'}, target: {tgt or '?'})"
|
|
200
|
+
for name, src, tgt in conflicts
|
|
201
|
+
)
|
|
202
|
+
raise ValueError(
|
|
203
|
+
f"Run name conflicts: {details}. "
|
|
204
|
+
f"Rename the conflicting runs before merging."
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
try:
|
|
208
|
+
conn.execute("BEGIN")
|
|
209
|
+
conn.execute("INSERT INTO main.runs SELECT * FROM other.runs")
|
|
210
|
+
for table in _DATA_TABLES:
|
|
211
|
+
conn.execute(f"INSERT INTO main.{table} SELECT * FROM other.{table}")
|
|
212
|
+
conn.execute("COMMIT")
|
|
213
|
+
except Exception:
|
|
214
|
+
conn.execute("ROLLBACK")
|
|
215
|
+
raise
|
|
216
|
+
finally:
|
|
217
|
+
conn.execute("DETACH DATABASE other")
|
|
218
|
+
conn.close()
|
|
219
|
+
|
|
220
|
+
# Copy checkpoint directories
|
|
221
|
+
for run_dir in source_dir.iterdir():
|
|
222
|
+
if run_dir.is_dir():
|
|
223
|
+
target_run_dir = target_dir / run_dir.name
|
|
224
|
+
if not target_run_dir.exists():
|
|
225
|
+
shutil.copytree(run_dir, target_run_dir)
|
|
226
|
+
|
|
227
|
+
def close(self) -> None:
|
|
228
|
+
self._conn.close()
|
|
229
|
+
|
|
230
|
+
def __enter__(self):
|
|
231
|
+
return self
|
|
232
|
+
|
|
233
|
+
def __exit__(self, *exc):
|
|
234
|
+
self.close()
|
tinytrainlog-0.1.1/README.md
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
# 🚅🚅🚅 Tiny Train Log 🚅🚅🚅
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
from ._names import generate_run_name
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class MetricsLogger:
|
|
8
|
-
def __init__(self, root_dir: str | Path, run_name: str | None = None):
|
|
9
|
-
self.root_dir = Path(root_dir)
|
|
10
|
-
if run_name is None:
|
|
11
|
-
run_name = generate_run_name(self.root_dir)
|
|
12
|
-
self.run_name = run_name
|
|
13
|
-
self.run_dir = self.root_dir / self.run_name
|
|
14
|
-
self.run_dir.mkdir(parents=True, exist_ok=True)
|
|
15
|
-
self._checkpoint_dir = self.run_dir / "checkpoints"
|
|
16
|
-
self._checkpoint_dir.mkdir(exist_ok=True)
|
|
17
|
-
|
|
18
|
-
def set_config(self, config: dict) -> None:
|
|
19
|
-
(self.run_dir / "config.json").write_text(json.dumps(config, indent=2) + "\n")
|
|
20
|
-
|
|
21
|
-
def add_tags(self, tags: list[str]) -> None:
|
|
22
|
-
tags_path = self.run_dir / "tags.json"
|
|
23
|
-
if tags_path.exists():
|
|
24
|
-
existing = json.loads(tags_path.read_text())
|
|
25
|
-
else:
|
|
26
|
-
existing = []
|
|
27
|
-
seen = set(existing)
|
|
28
|
-
for tag in tags:
|
|
29
|
-
if tag not in seen:
|
|
30
|
-
existing.append(tag)
|
|
31
|
-
seen.add(tag)
|
|
32
|
-
tags_path.write_text(json.dumps(existing, indent=2) + "\n")
|
|
33
|
-
|
|
34
|
-
def log_step(self, step: int, **metrics) -> None:
|
|
35
|
-
with open(self.run_dir / "steps.jsonl", "a") as f:
|
|
36
|
-
f.write(json.dumps({"step": step, **metrics}) + "\n")
|
|
37
|
-
|
|
38
|
-
def log_epoch(self, epoch: int, **metrics) -> None:
|
|
39
|
-
with open(self.run_dir / "epochs.jsonl", "a") as f:
|
|
40
|
-
f.write(json.dumps({"epoch": epoch, **metrics}) + "\n")
|
|
41
|
-
|
|
42
|
-
def checkpoint_path(
|
|
43
|
-
self, step: int | None = None, epoch: int | None = None
|
|
44
|
-
) -> Path:
|
|
45
|
-
if (step is None) == (epoch is None):
|
|
46
|
-
raise ValueError("Exactly one of 'step' or 'epoch' must be provided.")
|
|
47
|
-
if step is not None:
|
|
48
|
-
return self._checkpoint_dir / f"step_{step}.pt"
|
|
49
|
-
return self._checkpoint_dir / f"epoch_{epoch}.pt"
|
|
50
|
-
|
|
51
|
-
@property
|
|
52
|
-
def checkpoint_dir(self) -> Path:
|
|
53
|
-
return self._checkpoint_dir
|
|
File without changes
|
{tinytrainlog-0.1.1 → tinytrainlog-0.1.2}/src/tinytrainlog/.ruff_cache/0.15.8/13173811450099868753
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|