nshtrainer 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,10 +43,6 @@ class LatestEpochCheckpoint(Checkpoint):
43
43
  self.config = config
44
44
  self.dirpath = dirpath
45
45
 
46
- # Also, we hold a reference to the last checkpoint path
47
- # to be able to remove it when a new checkpoint is saved.
48
- self._last_ckpt_path: Path | None = None
49
-
50
46
  def _ckpt_path(self, trainer: Trainer):
51
47
  return self.dirpath / self.config.filename.format(
52
48
  epoch=trainer.current_epoch, step=trainer.global_step
@@ -54,20 +50,17 @@ class LatestEpochCheckpoint(Checkpoint):
54
50
 
55
51
  @override
56
52
  def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
57
- # Remove the last checkpoint if it exists
58
- if self._last_ckpt_path is not None:
59
- trainer.strategy.remove_checkpoint(self._last_ckpt_path)
60
-
61
53
  # Save the new checkpoint
62
54
  filepath = self._ckpt_path(trainer)
63
55
  trainer.save_checkpoint(filepath, self.config.save_weights_only)
64
- self._last_ckpt_path = filepath
65
56
 
66
57
  # Create the latest symlink
67
- if (symlink_filename := self.config.latest_symlink_filename) is not None:
58
+ if (
59
+ trainer.is_global_zero
60
+ and (symlink_filename := self.config.latest_symlink_filename) is not None
61
+ ):
68
62
  symlink_path = self.dirpath / symlink_filename
69
- if symlink_path.exists():
70
- symlink_path.unlink()
63
+ symlink_path.unlink(missing_ok=True)
71
64
  symlink_path.symlink_to(filepath.name)
72
65
  log.info(f"Created latest symlink: {symlink_path}")
73
66
 
@@ -1121,6 +1121,9 @@ class SanityCheckingConfig(C.Config):
1121
1121
 
1122
1122
 
1123
1123
  class TrainerConfig(C.Config):
1124
+ ckpt_path: str | Path | None = None
1125
+ """Path to a checkpoint to load and resume training from."""
1126
+
1124
1127
  checkpoint_loading: CheckpointLoadingConfig | Literal["auto"] = "auto"
1125
1128
  """Checkpoint loading configuration options."""
1126
1129
 
@@ -304,6 +304,10 @@ class Trainer(LightningTrainer):
304
304
  log_dir = str(Path(log_dir).resolve())
305
305
  log.critical(f"LightningTrainer log directory: {self.log_dir}.")
306
306
 
307
+ # Set the checkpoint
308
+ if (ckpt_path := config.trainer.ckpt_path) is not None:
309
+ self.ckpt_path = str(Path(ckpt_path).resolve().absolute())
310
+
307
311
  def __runtime_tracker(self):
308
312
  return next(
309
313
  (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nshtrainer
3
- Version: 0.10.5
3
+ Version: 0.10.7
4
4
  Summary:
5
5
  Author: Nima Shoghi
6
6
  Author-email: nimashoghi@gmail.com
@@ -14,7 +14,7 @@ nshtrainer/callbacks/ema.py,sha256=8-WHmKFP3VfnzMviJaIFmVD9xHPqIPmq9NRF5xdu3c8,1
14
14
  nshtrainer/callbacks/finite_checks.py,sha256=AO5fa51uANAjAkeJfTquOjK6W_4RSU5Kky3f5jmAPlQ,2084
15
15
  nshtrainer/callbacks/gradient_skipping.py,sha256=fSJpjgHbztFKz7w3qFuCHZpmbEt9BCLAy-sU0B4xJQI,3474
16
16
  nshtrainer/callbacks/interval.py,sha256=smz5Zl8cN6X6yHKVsMRS2e3SEkzRCP3LvwE1ONvLfaw,8080
17
- nshtrainer/callbacks/latest_epoch_checkpoint.py,sha256=UnwgGIc2reD7cTnUeIlDHo1LeAkgLEZFNvy2NGvUfRQ,2838
17
+ nshtrainer/callbacks/latest_epoch_checkpoint.py,sha256=zCRAUsqW-2PaoIwVKlXOqdh2uF_B_YUUTmQO1wSomR8,2489
18
18
  nshtrainer/callbacks/log_epoch.py,sha256=fTa_K_Y8A7g09630cG4YkDE6AzSMPkjb9bpPm4gtqos,1120
19
19
  nshtrainer/callbacks/model_checkpoint.py,sha256=N0raLsHlCVSbO3QU5eNFUXUDqxxW3C73oQwceMnFE_k,5955
20
20
  nshtrainer/callbacks/norm_logging.py,sha256=EWyrfkp8iHjQi9iAAXHxb0xStw2RwkdpKG2_gLarQRA,6281
@@ -52,7 +52,7 @@ nshtrainer/metrics/_config.py,sha256=hWWS4IXENRyH3RmJ7z1Wx1n3Lt1sNMlGOrcU6PW15o0
52
52
  nshtrainer/model/__init__.py,sha256=TbexTxiE20WHYg5q3L88Hysk4LlHeKk_isv33aSBREA,1918
53
53
  nshtrainer/model/_environment.py,sha256=JCFxxwMhkviiMDkqIXJmiuepqiSYIlcoSQM7Y2H2KX4,23036
54
54
  nshtrainer/model/base.py,sha256=Bmw-t70TydDbE9P0ee-lTibGoUhrCx5Qke-upa7FGVM,17512
55
- nshtrainer/model/config.py,sha256=OsVba02cmEYVf6V-A6ljV7VMAW5XZO6GWNRk8ktUw2o,53177
55
+ nshtrainer/model/config.py,sha256=6pAqDUk1eBloR3vZmtsWVdMrKeT2V3UvOn5UZ7YhZ_Q,53283
56
56
  nshtrainer/model/modules/callback.py,sha256=JF59U9-CjJsAIspEhTJbVaGN0wGctZG7UquE3IS7R8A,6408
57
57
  nshtrainer/model/modules/debug.py,sha256=DTVty8cKnzj1GCULRyGx_sWTTsq9NLi30dzqjRTnuCU,1127
58
58
  nshtrainer/model/modules/distributed.py,sha256=ABpR9d-3uBS_fivfy_WYW-dExW6vp5BPaoPQnOudHng,1725
@@ -73,12 +73,12 @@ nshtrainer/trainer/__init__.py,sha256=P2rmr8oBVTHk-HJHYPcUwWqDEArMbPR4_rPpATbWK3
73
73
  nshtrainer/trainer/_runtime_callback.py,sha256=sd2cUdRJG-UCdQr9ruZvEYpNGNF1t2W2fuxwwVlQD9E,4164
74
74
  nshtrainer/trainer/checkpoint_connector.py,sha256=xoqI2dcPnlNFPPLVIU6dBOvRPC9PtfX5qu__xV1lx0Y,2124
75
75
  nshtrainer/trainer/signal_connector.py,sha256=llwc8pdKAWxREFpjdi14Bpy8rGVMEJsmJx_s2p4gI8E,10689
76
- nshtrainer/trainer/trainer.py,sha256=n3T9Iz3eaDostxEdjapWImAsVMxyU9WBdhlPl0THX-g,16785
76
+ nshtrainer/trainer/trainer.py,sha256=tFyzIsF8c-FABTH6wwDOR9y8kydVJqeVO7PDNFMvhSU,16950
77
77
  nshtrainer/util/environment.py,sha256=_SEtiQ_s5bL5pllUlf96AOUv15kNvCPvocVC13S7mIk,4166
78
78
  nshtrainer/util/seed.py,sha256=HEXgVs-wldByahOysKwq7506OHxdYTEgmP-tDQVAEkQ,287
79
79
  nshtrainer/util/slurm.py,sha256=rofIU26z3SdL79SF45tNez6juou1cyDLz07oXEZb9Hg,1566
80
80
  nshtrainer/util/typed.py,sha256=NGuDkDzFlc1fAoaXjOFZVbmj0mRFjsQi1E_hPa7Bn5U,128
81
81
  nshtrainer/util/typing_utils.py,sha256=8ptjSSLZxlmy4FY6lzzkoGoF5fGNClo8-B_c0XHQaNU,385
82
- nshtrainer-0.10.5.dist-info/METADATA,sha256=nBa8n5rSpkY6MWYI-2JjmJzWKWa7gY0NOs737jsXXsU,695
83
- nshtrainer-0.10.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
84
- nshtrainer-0.10.5.dist-info/RECORD,,
82
+ nshtrainer-0.10.7.dist-info/METADATA,sha256=IQ6IEecsAvygnoV5P6_mkG9RjRGnb_cFuOf2Ic2HLIY,695
83
+ nshtrainer-0.10.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
84
+ nshtrainer-0.10.7.dist-info/RECORD,,