nshtrainer 0.10.14__py3-none-any.whl → 0.10.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nshtrainer/_checkpoint/metadata.py +5 -9
- nshtrainer/callbacks/latest_epoch_checkpoint.py +4 -2
- nshtrainer/model/config.py +4 -3
- {nshtrainer-0.10.14.dist-info → nshtrainer-0.10.16.dist-info}/METADATA +1 -1
- {nshtrainer-0.10.14.dist-info → nshtrainer-0.10.16.dist-info}/RECORD +6 -6
- {nshtrainer-0.10.14.dist-info → nshtrainer-0.10.16.dist-info}/WHEEL +0 -0
|
@@ -105,10 +105,8 @@ def _write_checkpoint_metadata(
|
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
def _remove_checkpoint_metadata(checkpoint_path: Path):
|
|
108
|
-
for path in (
|
|
109
|
-
checkpoint_path.with_suffix(METADATA_PATH_SUFFIX),
|
|
110
|
-
checkpoint_path.with_suffix(HPARAMS_PATH_SUFFIX),
|
|
111
|
-
):
|
|
108
|
+
for suffix in (METADATA_PATH_SUFFIX, HPARAMS_PATH_SUFFIX):
|
|
109
|
+
path = checkpoint_path.with_suffix(suffix)
|
|
112
110
|
try:
|
|
113
111
|
path.unlink(missing_ok=True)
|
|
114
112
|
except Exception as e:
|
|
@@ -122,11 +120,9 @@ def _link_checkpoint_metadata(checkpoint_path: Path, linked_checkpoint_path: Pat
|
|
|
122
120
|
_remove_checkpoint_metadata(linked_checkpoint_path)
|
|
123
121
|
|
|
124
122
|
# Link the metadata files to the new checkpoint
|
|
125
|
-
for path in (
|
|
126
|
-
checkpoint_path.with_suffix(METADATA_PATH_SUFFIX),
|
|
127
|
-
checkpoint_path.with_suffix(HPARAMS_PATH_SUFFIX),
|
|
128
|
-
):
|
|
129
|
-
linked_path = linked_checkpoint_path.with_suffix(path.suffix)
|
|
123
|
+
for suffix in (METADATA_PATH_SUFFIX, HPARAMS_PATH_SUFFIX):
|
|
124
|
+
path = checkpoint_path.with_suffix(suffix)
|
|
125
|
+
linked_path = linked_checkpoint_path.with_suffix(suffix)
|
|
130
126
|
try:
|
|
131
127
|
try:
|
|
132
128
|
linked_path.symlink_to(path)
|
|
@@ -64,7 +64,7 @@ class LatestEpochCheckpoint(Checkpoint):
|
|
|
64
64
|
filename = self.config.filename.format(
|
|
65
65
|
epoch=trainer.current_epoch, step=trainer.global_step
|
|
66
66
|
)
|
|
67
|
-
filename = f"{self.PREFIX}{filename}
|
|
67
|
+
filename = f"{self.PREFIX}{filename}{self.EXTENSION}"
|
|
68
68
|
return self.dirpath / filename
|
|
69
69
|
|
|
70
70
|
def _remove_checkpoints(self, trainer: Trainer, ckpt_paths: list[Path]):
|
|
@@ -95,7 +95,9 @@ class LatestEpochCheckpoint(Checkpoint):
|
|
|
95
95
|
|
|
96
96
|
def _save_new_checkpoint(self, trainer: Trainer):
|
|
97
97
|
# Remove old checkpoints
|
|
98
|
-
|
|
98
|
+
if trainer.is_global_zero:
|
|
99
|
+
self._remove_old_checkpoints(trainer)
|
|
100
|
+
trainer.strategy.barrier()
|
|
99
101
|
|
|
100
102
|
# Save the new checkpoint
|
|
101
103
|
filepath = self._ckpt_path(trainer)
|
nshtrainer/model/config.py
CHANGED
|
@@ -315,9 +315,10 @@ class WandbLoggerConfig(CallbackConfigBase, BaseLoggerConfig):
|
|
|
315
315
|
if pkg_resources.parse_version(
|
|
316
316
|
wandb.__version__
|
|
317
317
|
) < pkg_resources.parse_version("0.17.5"):
|
|
318
|
-
|
|
319
|
-
"The version of WandB installed does not support the `wandb-core` backend
|
|
320
|
-
"
|
|
318
|
+
raise ValueError(
|
|
319
|
+
"The version of WandB installed does not support the `wandb-core` backend "
|
|
320
|
+
f"(expected version >= 0.17.5, found version {wandb.__version__}). "
|
|
321
|
+
"Please either upgrade to a newer version of WandB or disable the `use_wandb_core` option."
|
|
321
322
|
)
|
|
322
323
|
else:
|
|
323
324
|
wandb.require("core")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
nshtrainer/__init__.py,sha256=39loiLLXbaGiozEsAn8mPHopxaPsek8JsgR9DD2gxtY,583
|
|
2
2
|
nshtrainer/_checkpoint/loader.py,sha256=48flPr1XgQHOgIPaCrRqOEvRuG0SZuV3cQ1vgHLqFqI,11025
|
|
3
|
-
nshtrainer/_checkpoint/metadata.py,sha256=
|
|
3
|
+
nshtrainer/_checkpoint/metadata.py,sha256=GlhlAyJh5gcp3R8l2Y3eAUQtQzBnitFlB0xdx-khEUQ,5579
|
|
4
4
|
nshtrainer/_checkpoint/saver.py,sha256=z_c7a91O4Bh4lZZjqJgxT3w25qFlJsOopV3cpJtkHk8,1655
|
|
5
5
|
nshtrainer/_experimental/__init__.py,sha256=2tQIcrWT8U8no_AeBTYnozaTmxN40kuAJdGQ4b-PoWM,120
|
|
6
6
|
nshtrainer/_experimental/flops/__init__.py,sha256=edo9Ez3LlrnxkNRX9W6YBhPkRPKYGLpkpnl5gx7sEX8,1550
|
|
@@ -15,7 +15,7 @@ nshtrainer/callbacks/ema.py,sha256=8-WHmKFP3VfnzMviJaIFmVD9xHPqIPmq9NRF5xdu3c8,1
|
|
|
15
15
|
nshtrainer/callbacks/finite_checks.py,sha256=gJC_RUr3ais3FJI0uB6wUZnDdE3WRwCix3ppA3PwQXA,2077
|
|
16
16
|
nshtrainer/callbacks/gradient_skipping.py,sha256=pqu5AELx4ctJxR2Y7YSSiGd5oGauVCTZFCEIIS6s88w,3665
|
|
17
17
|
nshtrainer/callbacks/interval.py,sha256=smz5Zl8cN6X6yHKVsMRS2e3SEkzRCP3LvwE1ONvLfaw,8080
|
|
18
|
-
nshtrainer/callbacks/latest_epoch_checkpoint.py,sha256=
|
|
18
|
+
nshtrainer/callbacks/latest_epoch_checkpoint.py,sha256=5JC-JCdgWNnunl0jv4Q9LhkEspLAn0x8VpCMJZi7-ow,4219
|
|
19
19
|
nshtrainer/callbacks/log_epoch.py,sha256=fTa_K_Y8A7g09630cG4YkDE6AzSMPkjb9bpPm4gtqos,1120
|
|
20
20
|
nshtrainer/callbacks/model_checkpoint.py,sha256=8D0wWLhr_KiksAA1fjfIuby42Mq6XokCvAnVUhjADd8,6564
|
|
21
21
|
nshtrainer/callbacks/norm_logging.py,sha256=T2psu8mYsw9iahPKT6aUPjkGrZ4TIzm6_UUUmE09GJs,6274
|
|
@@ -52,7 +52,7 @@ nshtrainer/metrics/__init__.py,sha256=ObLIELGguIEcUpRsUkqh1ltrvZii6vglTpJGrPvoy0
|
|
|
52
52
|
nshtrainer/metrics/_config.py,sha256=hWWS4IXENRyH3RmJ7z1Wx1n3Lt1sNMlGOrcU6PW15o0,1104
|
|
53
53
|
nshtrainer/model/__init__.py,sha256=NpvyQHmGaHB8xdraHmm8l7kDHLmvJSgBNQKkfYqtgyI,1454
|
|
54
54
|
nshtrainer/model/base.py,sha256=AXRfEsFAT0Ln7zjYVPU5NgtHS_c8FZM-M4pyLamO7OA,17516
|
|
55
|
-
nshtrainer/model/config.py,sha256=
|
|
55
|
+
nshtrainer/model/config.py,sha256=npR8undYPqjIGlAZpm4suRP77qE9R42G_9Y-2Am9Wh4,54780
|
|
56
56
|
nshtrainer/model/modules/callback.py,sha256=K0-cyEtBcQhI7Q2e-AGTE8T-GghUPY9DYmneU6ULV6g,6401
|
|
57
57
|
nshtrainer/model/modules/debug.py,sha256=Yy7XEdPou9BkCsD5hJchwJGmCVGrfUru5g9VjPM4uAw,1120
|
|
58
58
|
nshtrainer/model/modules/distributed.py,sha256=ABpR9d-3uBS_fivfy_WYW-dExW6vp5BPaoPQnOudHng,1725
|
|
@@ -79,6 +79,6 @@ nshtrainer/util/seed.py,sha256=Or2wMPsnQxfnZ2xfBiyMcHFIUt3tGTNeMMyOEanCkqs,280
|
|
|
79
79
|
nshtrainer/util/slurm.py,sha256=rofIU26z3SdL79SF45tNez6juou1cyDLz07oXEZb9Hg,1566
|
|
80
80
|
nshtrainer/util/typed.py,sha256=NGuDkDzFlc1fAoaXjOFZVbmj0mRFjsQi1E_hPa7Bn5U,128
|
|
81
81
|
nshtrainer/util/typing_utils.py,sha256=8ptjSSLZxlmy4FY6lzzkoGoF5fGNClo8-B_c0XHQaNU,385
|
|
82
|
-
nshtrainer-0.10.
|
|
83
|
-
nshtrainer-0.10.
|
|
84
|
-
nshtrainer-0.10.
|
|
82
|
+
nshtrainer-0.10.16.dist-info/METADATA,sha256=8jgjZDL82cNf_ys1xKUuqfKXAol8m2dWYB909W239fk,696
|
|
83
|
+
nshtrainer-0.10.16.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
84
|
+
nshtrainer-0.10.16.dist-info/RECORD,,
|
|
File without changes
|