nshtrainer 0.10.10__py3-none-any.whl → 0.10.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nshtrainer/_checkpoint/metadata.py +73 -0
- nshtrainer/_checkpoint/saver.py +52 -0
- nshtrainer/callbacks/latest_epoch_checkpoint.py +60 -20
- nshtrainer/callbacks/model_checkpoint.py +18 -0
- {nshtrainer-0.10.10.dist-info → nshtrainer-0.10.11.dist-info}/METADATA +1 -1
- {nshtrainer-0.10.10.dist-info → nshtrainer-0.10.11.dist-info}/RECORD +7 -6
- {nshtrainer-0.10.10.dist-info → nshtrainer-0.10.11.dist-info}/WHEEL +0 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import copy
|
|
2
2
|
import datetime
|
|
3
3
|
import logging
|
|
4
|
+
import shutil
|
|
5
|
+
from collections.abc import Callable
|
|
4
6
|
from pathlib import Path
|
|
5
7
|
from typing import TYPE_CHECKING, Any, cast
|
|
6
8
|
|
|
@@ -100,3 +102,74 @@ def _write_checkpoint_metadata(
|
|
|
100
102
|
log.warning(f"Failed to write hparams to {checkpoint_path}: {e}")
|
|
101
103
|
else:
|
|
102
104
|
log.info(f"Checkpoint metadata written to {checkpoint_path}")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _remove_checkpoint_metadata(checkpoint_path: Path):
    """Delete the metadata and hparams sidecar files of a checkpoint.

    The sidecar paths are derived from ``checkpoint_path`` by swapping its
    suffix for ``METADATA_PATH_SUFFIX`` and ``HPARAMS_PATH_SUFFIX``.
    Removal is best-effort: a failure on one file is logged as a warning and
    the remaining file is still processed. Missing files are not an error.
    """
    sidecar_paths = tuple(
        checkpoint_path.with_suffix(suffix)
        for suffix in (METADATA_PATH_SUFFIX, HPARAMS_PATH_SUFFIX)
    )
    for path in sidecar_paths:
        try:
            path.unlink(missing_ok=True)
        except Exception as e:
            log.warning(f"Failed to remove {path}: {e}")
            continue
        log.info(f"Removed {path}")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _link_checkpoint_metadata(checkpoint_path: Path, linked_checkpoint_path: Path):
    """Expose a checkpoint's sidecar files next to a linked checkpoint.

    For each of the metadata and hparams sidecars of ``checkpoint_path``, a
    symlink with the matching suffix is created beside
    ``linked_checkpoint_path``. If the symlink cannot be created (``OSError``)
    the file is copied instead. Any failure is logged as a warning and does
    not stop the other sidecar from being processed.

    Args:
        checkpoint_path: The real checkpoint whose sidecar files are linked.
        linked_checkpoint_path: The link (or copy) of the checkpoint that the
            new sidecar links are named after.
    """
    # First, remove any existing metadata files
    _remove_checkpoint_metadata(linked_checkpoint_path)

    # Link the metadata files to the new checkpoint
    for suffix in (METADATA_PATH_SUFFIX, HPARAMS_PATH_SUFFIX):
        path = checkpoint_path.with_suffix(suffix)
        # Name the link with the same suffix constant that
        # `_remove_checkpoint_metadata` uses. `path.suffix` would only return
        # the last dotted component, so a multi-part suffix would be truncated
        # and both sidecars could end up sharing one link name.
        linked_path = linked_checkpoint_path.with_suffix(suffix)
        try:
            try:
                linked_path.symlink_to(path)
            except OSError:
                # on Windows, special permissions are required to create symbolic links as a regular user
                # fall back to copying the file
                shutil.copy(path, linked_path)
        except Exception as e:
            log.warning(f"Failed to link {path} to {linked_path}: {e}")
        else:
            log.info(f"Linked {path} to {linked_path}")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _checkpoint_sort_key_fn(key: Callable[[CheckpointMetadata, Path], Any]):
    """Adapt a metadata-based key into a key function over checkpoint paths.

    The returned function loads the metadata file that sits next to the given
    checkpoint and forwards it to ``key``. Note that the path handed to
    ``key`` is the path of the metadata file, not of the checkpoint itself.

    Raises (from the returned function):
        FileNotFoundError: If the checkpoint has no metadata file.
    """

    def sort_key_fn(checkpoint_path: Path):
        metadata_path = checkpoint_path.with_suffix(METADATA_PATH_SUFFIX)
        if not metadata_path.exists():
            raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

        loaded = CheckpointMetadata.from_file(metadata_path)
        return key(loaded, metadata_path)

    return sort_key_fn
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _sort_ckpts_by_metadata(
    checkpoint_paths: list[Path],
    key: Callable[[CheckpointMetadata, Path], Any],
    fallback_key: Callable[[Path], Any],
):
    """Return ``checkpoint_paths`` sorted by their metadata.

    If every checkpoint has a metadata file, the paths are ordered with
    ``key`` (wrapped by ``_checkpoint_sort_key_fn``). If at least one
    checkpoint lacks a metadata file, a warning is logged and all paths are
    ordered with ``fallback_key`` instead.

    Returns:
        A new sorted list; the input list is not modified.
    """
    # Collect every checkpoint that has no metadata file next to it.
    no_metadata_paths: list[Path] = [
        ckpt
        for ckpt in checkpoint_paths
        if not ckpt.with_suffix(METADATA_PATH_SUFFIX).exists()
    ]

    if not no_metadata_paths:
        return sorted(checkpoint_paths, key=_checkpoint_sort_key_fn(key))

    log.warning(
        f"Metadata file not found on {len(no_metadata_paths)} checkpoints: {no_metadata_paths}\n"
        "Falling back to sorting by last modified time."
    )
    return sorted(checkpoint_paths, key=fallback_key)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from lightning.pytorch import Trainer
|
|
6
|
+
|
|
7
|
+
from .metadata import _link_checkpoint_metadata, _remove_checkpoint_metadata
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _link_checkpoint(
    trainer: Trainer,
    filepath: str | Path | os.PathLike,
    linkpath: str | Path | os.PathLike,
    *,
    barrier: bool,
    metadata: bool,
):
    """Point ``linkpath`` at the checkpoint stored at ``filepath``.

    Only the global-zero rank touches the filesystem. Anything already at
    ``linkpath`` (file, symlink or directory) is removed together with its
    metadata sidecars, then a relative symlink to ``filepath`` is created.
    If the symlink cannot be created, the checkpoint file is copied instead.

    Args:
        trainer: Trainer used for the rank check and the optional barrier.
        filepath: The checkpoint to link to.
        linkpath: Where the link (or fallback copy) is created.
        barrier: If true, every rank waits on ``trainer.strategy.barrier()``
            after the link has been handled.
        metadata: If true, the checkpoint's metadata sidecars are linked next
            to ``linkpath`` as well.
    """
    filepath = Path(filepath)
    linkpath = Path(linkpath)

    if trainer.is_global_zero:
        # `exists()` follows symlinks and is False for a dangling link (e.g.
        # one whose target checkpoint was already pruned), so the symlink
        # check has to be done separately or the stale link is never removed.
        if linkpath.is_symlink() or linkpath.exists():
            if linkpath.is_symlink() or linkpath.is_file():
                linkpath.unlink()
            elif linkpath.is_dir():
                shutil.rmtree(linkpath)
            _remove_checkpoint_metadata(linkpath)

        try:
            # `os.path.relpath` also handles targets outside the link's
            # directory, where `Path.relative_to` raises ValueError.
            target_path = os.path.relpath(filepath, linkpath.parent)
            linkpath.symlink_to(target_path)
        except (OSError, ValueError):
            # on Windows, special permissions are required to create symbolic links as a regular user
            # (and no relative path exists across drives)
            # fall back to copying the file
            shutil.copy(filepath, linkpath)

        if metadata:
            _link_checkpoint_metadata(filepath, linkpath)

    # Reached by every rank, not just global zero.
    if barrier:
        trainer.strategy.barrier()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _remove_checkpoint(
    trainer: Trainer,
    filepath: str | Path | os.PathLike,
    remove_metadata: bool = True,
):
    """Remove a checkpoint through the trainer's strategy.

    Args:
        trainer: Trainer whose strategy performs the checkpoint removal.
        filepath: The checkpoint to remove.
        remove_metadata: If true (the default), the checkpoint's metadata
            sidecar files are removed as well.
    """
    filepath = Path(filepath)
    trainer.strategy.remove_checkpoint(filepath)
    # Honour the flag; previously the sidecars were removed unconditionally.
    if remove_metadata:
        _remove_checkpoint_metadata(filepath)
|
|
@@ -6,6 +6,8 @@ from lightning.pytorch import LightningModule, Trainer
|
|
|
6
6
|
from lightning.pytorch.callbacks import Checkpoint
|
|
7
7
|
from typing_extensions import override
|
|
8
8
|
|
|
9
|
+
from .._checkpoint.metadata import _sort_ckpts_by_metadata
|
|
10
|
+
from .._checkpoint.saver import _link_checkpoint, _remove_checkpoint
|
|
9
11
|
from .base import CallbackConfigBase
|
|
10
12
|
|
|
11
13
|
log = logging.getLogger(__name__)
|
|
@@ -17,15 +19,18 @@ class LatestEpochCheckpointCallbackConfig(CallbackConfigBase):
|
|
|
17
19
|
dirpath: str | Path | None = None
|
|
18
20
|
"""Directory path to save the checkpoint file."""
|
|
19
21
|
|
|
20
|
-
filename: str = "
|
|
22
|
+
filename: str = "epoch{epoch:02d}_step{step:04d}"
|
|
21
23
|
"""Checkpoint filename. This must not include the extension."""
|
|
22
24
|
|
|
23
25
|
save_weights_only: bool = False
|
|
24
26
|
"""Whether to save only the model's weights or the entire model object."""
|
|
25
27
|
|
|
26
|
-
latest_symlink_filename: str | None = "latest
|
|
28
|
+
latest_symlink_filename: str | None = "latest"
|
|
27
29
|
"""Filename for the latest symlink. If None, no symlink will be created."""
|
|
28
30
|
|
|
31
|
+
latest_k: int | Literal["all"] = 1
|
|
32
|
+
"""Number of latest checkpoints to keep. If "all", all checkpoints are kept."""
|
|
33
|
+
|
|
29
34
|
@override
|
|
30
35
|
def create_callbacks(self, root_config):
|
|
31
36
|
dirpath = self.dirpath or root_config.directory.resolve_subdirectory(
|
|
@@ -37,38 +42,73 @@ class LatestEpochCheckpointCallbackConfig(CallbackConfigBase):
|
|
|
37
42
|
|
|
38
43
|
|
|
39
44
|
class LatestEpochCheckpoint(Checkpoint):
    """Save a checkpoint at the end of every training epoch.

    Checkpoints are written to ``dirpath`` as
    ``{PREFIX}{config.filename}{EXTENSION}``. After each save, all but the
    newest ``config.latest_k`` checkpoints are removed (unless it is
    ``"all"``), and an optional "latest" link is pointed at the checkpoint
    that was just written.
    """

    PREFIX = "latest_"
    EXTENSION = ".ckpt"

    def __init__(self, config: LatestEpochCheckpointCallbackConfig, dirpath: Path):
        super().__init__()

        self.config = config
        self.dirpath = dirpath

    @override
    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        self._save_new_checkpoint(trainer)

    def _latest_symlink_filename(self):
        """Return the file name of the latest link, or None if it is disabled."""
        if (filename := self.config.latest_symlink_filename) is None:
            return None
        return f"{filename}{self.EXTENSION}"

    def _ckpt_path(self, trainer: Trainer):
        """Return the path for a checkpoint of the current epoch and step."""
        filename = self.config.filename.format(
            epoch=trainer.current_epoch, step=trainer.global_step
        )
        # EXTENSION already starts with a dot; adding another one here
        # produced file names ending in "..ckpt".
        filename = f"{self.PREFIX}{filename}{self.EXTENSION}"
        return self.dirpath / filename

    def _remove_checkpoints(self, trainer: Trainer, ckpt_paths: list[Path]):
        """Remove the given checkpoints together with their metadata files."""
        for ckpt_path in ckpt_paths:
            _remove_checkpoint(trainer, ckpt_path, remove_metadata=True)

    def _remove_old_checkpoints(self, trainer: Trainer):
        """Remove all but the newest ``latest_k`` checkpoints in ``dirpath``."""
        if (latest_k := self.config.latest_k) == "all":
            return

        # Get all checkpoints written by this callback
        ckpt_paths = list(self.dirpath.glob(f"{self.PREFIX}*{self.EXTENSION}"))
        # Ignore the latest symlink
        if (latest_symlink_filename := self._latest_symlink_filename()) is not None:
            ckpt_paths = [p for p in ckpt_paths if p.name != latest_symlink_filename]

        # Sort by epoch, then step, then last modified
        ckpt_paths = _sort_ckpts_by_metadata(
            ckpt_paths,
            key=lambda meta, p: (meta.epoch, meta.global_step, p.stat().st_mtime),
            fallback_key=lambda p: p.stat().st_mtime,
            # ^ Called if metadata is not found on all checkpoints
        )

        # Remove all but the latest k checkpoints. A non-positive value is
        # clamped to 1: `[:-0]` would be an empty slice (nothing removed),
        # and the newest checkpoint must survive because the latest link
        # points to it.
        keep = max(latest_k, 1)
        self._remove_checkpoints(trainer, ckpt_paths[:-keep])

    def _save_new_checkpoint(self, trainer: Trainer):
        """Save a checkpoint, prune old ones and refresh the latest link."""
        # Save the new checkpoint
        filepath = self._ckpt_path(trainer)
        trainer.save_checkpoint(filepath, self.config.save_weights_only)

        # Remove old checkpoints. This runs after the save so that exactly
        # `latest_k` checkpoints remain (pruning first left `latest_k + 1`)
        # and nothing is deleted before the new checkpoint has been written.
        self._remove_old_checkpoints(trainer)

        # Create the latest symlink
        if (symlink_filename := self._latest_symlink_filename()) is not None:
            symlink_path = self.dirpath / symlink_filename
            _link_checkpoint(
                trainer,
                filepath,
                symlink_path,
                barrier=True,
                metadata=True,
            )
            log.info(f"Created latest symlink: {symlink_path}")
|
|
66
|
-
|
|
67
|
-
def latest_checkpoint(self):
|
|
68
|
-
if (symlink_filename := self.config.latest_symlink_filename) is None:
|
|
69
|
-
return None
|
|
70
|
-
|
|
71
|
-
if not (symlink_path := self.dirpath / symlink_filename).exists():
|
|
72
|
-
return None
|
|
73
|
-
|
|
74
|
-
return symlink_path
|
|
@@ -4,11 +4,13 @@ from datetime import timedelta
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import TYPE_CHECKING, Literal
|
|
6
6
|
|
|
7
|
+
from lightning.pytorch import Trainer
|
|
7
8
|
from lightning.pytorch.callbacks.model_checkpoint import (
|
|
8
9
|
ModelCheckpoint as _ModelCheckpoint,
|
|
9
10
|
)
|
|
10
11
|
from typing_extensions import override
|
|
11
12
|
|
|
13
|
+
from .._checkpoint.saver import _link_checkpoint, _remove_checkpoint
|
|
12
14
|
from ..metrics import MetricConfig
|
|
13
15
|
from .base import CallbackConfigBase
|
|
14
16
|
|
|
@@ -158,6 +160,8 @@ class ModelCheckpointCallbackConfig(CallbackConfigBase):
|
|
|
158
160
|
|
|
159
161
|
|
|
160
162
|
class ModelCheckpoint(_ModelCheckpoint):
|
|
163
|
+
CHECKPOINT_NAME_LAST = "best"
|
|
164
|
+
|
|
161
165
|
@override
|
|
162
166
|
def __init__(
|
|
163
167
|
self,
|
|
@@ -185,3 +189,17 @@ class ModelCheckpoint(_ModelCheckpoint):
|
|
|
185
189
|
save_on_train_epoch_end=self.config.save_on_train_epoch_end,
|
|
186
190
|
enable_version_counter=self.config.enable_version_counter,
|
|
187
191
|
)
|
|
192
|
+
|
|
193
|
+
@override
def _link_checkpoint(self, trainer: Trainer, filepath: str, linkpath: str):  # pyright: ignore[reportIncompatibleMethodOverride]
    """Delegate linking to the shared saver helper.

    The helper is always called with ``barrier=True`` and ``metadata=True``.
    """
    link_options = {"barrier": True, "metadata": True}
    return _link_checkpoint(trainer, filepath, linkpath, **link_options)
|
|
202
|
+
|
|
203
|
+
@override
def _remove_checkpoint(self, trainer: Trainer, filepath: str):
    """Delegate removal to the shared saver helper, including metadata files."""
    result = _remove_checkpoint(trainer, filepath, remove_metadata=True)
    return result
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
nshtrainer/__init__.py,sha256=39loiLLXbaGiozEsAn8mPHopxaPsek8JsgR9DD2gxtY,583
|
|
2
2
|
nshtrainer/_checkpoint/loader.py,sha256=48flPr1XgQHOgIPaCrRqOEvRuG0SZuV3cQ1vgHLqFqI,11025
|
|
3
|
-
nshtrainer/_checkpoint/metadata.py,sha256=
|
|
3
|
+
nshtrainer/_checkpoint/metadata.py,sha256=B6kPmWsq2TQh0gTzBx-1pLIwTVEs_Qw5v0nHEeTBdO4,5636
|
|
4
|
+
nshtrainer/_checkpoint/saver.py,sha256=KZp9ITUVHwj2Ttu81zXKdlS_h-fKkHearspwuAijDpM,1501
|
|
4
5
|
nshtrainer/_experimental/__init__.py,sha256=2tQIcrWT8U8no_AeBTYnozaTmxN40kuAJdGQ4b-PoWM,120
|
|
5
6
|
nshtrainer/_experimental/flops/__init__.py,sha256=edo9Ez3LlrnxkNRX9W6YBhPkRPKYGLpkpnl5gx7sEX8,1550
|
|
6
7
|
nshtrainer/_experimental/flops/flop_counter.py,sha256=-sL0Fy6poXa__hyzUMdZScjPULp4coQELQpPU6p6dXU,25736
|
|
@@ -14,9 +15,9 @@ nshtrainer/callbacks/ema.py,sha256=8-WHmKFP3VfnzMviJaIFmVD9xHPqIPmq9NRF5xdu3c8,1
|
|
|
14
15
|
nshtrainer/callbacks/finite_checks.py,sha256=gJC_RUr3ais3FJI0uB6wUZnDdE3WRwCix3ppA3PwQXA,2077
|
|
15
16
|
nshtrainer/callbacks/gradient_skipping.py,sha256=pqu5AELx4ctJxR2Y7YSSiGd5oGauVCTZFCEIIS6s88w,3665
|
|
16
17
|
nshtrainer/callbacks/interval.py,sha256=smz5Zl8cN6X6yHKVsMRS2e3SEkzRCP3LvwE1ONvLfaw,8080
|
|
17
|
-
nshtrainer/callbacks/latest_epoch_checkpoint.py,sha256=
|
|
18
|
+
nshtrainer/callbacks/latest_epoch_checkpoint.py,sha256=t4vWa4PvJDO3rKXKZbuegm7iLl7xCEd17wNif0Bp-BA,4138
|
|
18
19
|
nshtrainer/callbacks/log_epoch.py,sha256=fTa_K_Y8A7g09630cG4YkDE6AzSMPkjb9bpPm4gtqos,1120
|
|
19
|
-
nshtrainer/callbacks/model_checkpoint.py,sha256=
|
|
20
|
+
nshtrainer/callbacks/model_checkpoint.py,sha256=MaDkD8Ismcj8u6l2flCFlqJR3-k1Tc4xzhxNWNux4n0,6556
|
|
20
21
|
nshtrainer/callbacks/norm_logging.py,sha256=T2psu8mYsw9iahPKT6aUPjkGrZ4TIzm6_UUUmE09GJs,6274
|
|
21
22
|
nshtrainer/callbacks/on_exception_checkpoint.py,sha256=x42BYZ2ejf2rhqPLCmT5nyWKhA9qBEosiV8ZNhhZ6lI,3355
|
|
22
23
|
nshtrainer/callbacks/print_table.py,sha256=_FdAHhqylWGk4Z0c2FrLFeiMA4jhfA_beZRK_BHpzmE,2837
|
|
@@ -78,6 +79,6 @@ nshtrainer/util/seed.py,sha256=Or2wMPsnQxfnZ2xfBiyMcHFIUt3tGTNeMMyOEanCkqs,280
|
|
|
78
79
|
nshtrainer/util/slurm.py,sha256=rofIU26z3SdL79SF45tNez6juou1cyDLz07oXEZb9Hg,1566
|
|
79
80
|
nshtrainer/util/typed.py,sha256=NGuDkDzFlc1fAoaXjOFZVbmj0mRFjsQi1E_hPa7Bn5U,128
|
|
80
81
|
nshtrainer/util/typing_utils.py,sha256=8ptjSSLZxlmy4FY6lzzkoGoF5fGNClo8-B_c0XHQaNU,385
|
|
81
|
-
nshtrainer-0.10.
|
|
82
|
-
nshtrainer-0.10.
|
|
83
|
-
nshtrainer-0.10.
|
|
82
|
+
nshtrainer-0.10.11.dist-info/METADATA,sha256=9WAsp25_csjDcchr5X22g7ocQpQ-d-ewB3gS9EAZSE8,696
|
|
83
|
+
nshtrainer-0.10.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
84
|
+
nshtrainer-0.10.11.dist-info/RECORD,,
|
|
File without changes
|