nshtrainer 0.26.2__tar.gz → 0.27.0__tar.gz
This diff compares the contents of two publicly available package versions, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/PKG-INFO +1 -1
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/pyproject.toml +1 -1
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/checkpoint/_base.py +1 -4
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/config.py +1 -1
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/trainer/trainer.py +13 -9
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/README.md +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/_callback.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/_checkpoint/loader.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/_checkpoint/metadata.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/_checkpoint/saver.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/_experimental/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/_hf_hub.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/_throughput_monitor_callback.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/actsave.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/base.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/checkpoint/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/early_stopping.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/ema.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/finite_checks.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/gradient_skipping.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/interval.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/log_epoch.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/norm_logging.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/print_table.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/throughput_monitor.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/timer.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/wandb_watch.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/data/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/data/balanced_batch_sampler.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/data/transform.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/_experimental.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/actsave.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/callbacks.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/config.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/data.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/log.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/lr_scheduler.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/model.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/nn.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/optimizer.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/runner.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/snapshot.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/snoop.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/trainer.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/typecheck.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/ll/util.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/loggers/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/loggers/_base.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/loggers/csv.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/loggers/tensorboard.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/loggers/wandb.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/lr_scheduler/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/lr_scheduler/_base.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/lr_scheduler/linear_warmup_cosine.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/metrics/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/metrics/_config.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/base.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/modules/callback.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/modules/debug.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/modules/distributed.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/modules/logger.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/modules/profiler.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/modules/rlp_sanity_checks.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/model/modules/shared_parameters.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/nn/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/nn/mlp.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/nn/module_dict.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/nn/module_list.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/nn/nonlinearity.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/optimizer.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/runner.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/scripts/find_packages.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/trainer/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/trainer/_runtime_callback.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/trainer/checkpoint_connector.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/trainer/signal_connector.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/util/_environment_info.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/util/_useful_types.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/util/environment.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/util/path.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/util/seed.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/util/slurm.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/util/typed.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/util/typing_utils.py +0 -0
|
@@ -153,10 +153,7 @@ class CheckpointBase(Checkpoint, ABC, Generic[TConfig]):
|
|
|
153
153
|
# Save the new checkpoint
|
|
154
154
|
filepath = self.resolve_checkpoint_path(self.current_metrics(trainer))
|
|
155
155
|
trainer._nshtrainer_save_checkpoint(
|
|
156
|
-
filepath,
|
|
157
|
-
self.config.save_weights_only,
|
|
158
|
-
use_checkpoint_cache=None,
|
|
159
|
-
ckpt_cache_use_symlink=False,
|
|
156
|
+
filepath, self.config.save_weights_only, use_checkpoint_cache=None
|
|
160
157
|
)
|
|
161
158
|
|
|
162
159
|
if trainer.is_global_zero:
|
|
@@ -1012,7 +1012,7 @@ class TrainerConfig(C.Config):
|
|
|
1012
1012
|
"""If enabled, the model supports scaling the gradients of shared parameters that are registered using `LightningModuleBase.register_shared_parameters(...)`"""
|
|
1013
1013
|
save_checkpoint_metadata: bool = True
|
|
1014
1014
|
"""If enabled, will save additional metadata whenever a checkpoint is saved."""
|
|
1015
|
-
use_checkpoint_cache: bool =
|
|
1015
|
+
use_checkpoint_cache: bool = False
|
|
1016
1016
|
"""If enabled, will optimize the saving of duplicate checkpoints by creating symlinks instead of copying the file."""
|
|
1017
1017
|
|
|
1018
1018
|
lightning_kwargs: LightningTrainerKwargs = LightningTrainerKwargs()
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import os
|
|
3
3
|
import shutil
|
|
4
|
+
from collections import defaultdict
|
|
4
5
|
from collections.abc import Sequence
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from typing import TYPE_CHECKING, Any, cast
|
|
@@ -18,7 +19,6 @@ from lightning.pytorch.utilities.types import _EVALUATE_OUTPUT, _PREDICT_OUTPUT
|
|
|
18
19
|
from typing_extensions import Unpack, assert_never, override
|
|
19
20
|
|
|
20
21
|
from .._checkpoint.metadata import _write_checkpoint_metadata
|
|
21
|
-
from .._checkpoint.saver import _link_checkpoint
|
|
22
22
|
from ..callbacks.base import resolve_all_callbacks
|
|
23
23
|
from ..model.config import (
|
|
24
24
|
AcceleratorConfigProtocol,
|
|
@@ -280,6 +280,12 @@ class Trainer(LightningTrainer):
|
|
|
280
280
|
if TYPE_CHECKING:
|
|
281
281
|
callbacks: list[Callback]
|
|
282
282
|
|
|
283
|
+
def _nshtrainer_checkpoint_cache_get(self, key: tuple[int, int]):
|
|
284
|
+
return next(
|
|
285
|
+
(ckpt for ckpt in self._nshtrainer_checkpoint_cache[key] if ckpt.exists()),
|
|
286
|
+
None,
|
|
287
|
+
)
|
|
288
|
+
|
|
283
289
|
@override
|
|
284
290
|
def __init__(
|
|
285
291
|
self,
|
|
@@ -287,7 +293,9 @@ class Trainer(LightningTrainer):
|
|
|
287
293
|
/,
|
|
288
294
|
**kwargs: Unpack[LightningTrainerKwargs],
|
|
289
295
|
):
|
|
290
|
-
self._nshtrainer_checkpoint_cache
|
|
296
|
+
self._nshtrainer_checkpoint_cache = defaultdict[tuple[int, int], list[Path]](
|
|
297
|
+
lambda: []
|
|
298
|
+
)
|
|
291
299
|
|
|
292
300
|
self._pre_init(config)
|
|
293
301
|
|
|
@@ -417,7 +425,6 @@ class Trainer(LightningTrainer):
|
|
|
417
425
|
weights_only: bool = False,
|
|
418
426
|
storage_options: Any | None = None,
|
|
419
427
|
use_checkpoint_cache: bool | None = None,
|
|
420
|
-
ckpt_cache_use_symlink: bool = False,
|
|
421
428
|
):
|
|
422
429
|
lm = self._base_module
|
|
423
430
|
root_config = cast(BaseConfig, lm.hparams)
|
|
@@ -433,7 +440,7 @@ class Trainer(LightningTrainer):
|
|
|
433
440
|
if (
|
|
434
441
|
use_checkpoint_cache
|
|
435
442
|
and (
|
|
436
|
-
cached_path := self.
|
|
443
|
+
cached_path := self._nshtrainer_checkpoint_cache_get(
|
|
437
444
|
(self.current_epoch, self.global_step)
|
|
438
445
|
)
|
|
439
446
|
)
|
|
@@ -442,10 +449,7 @@ class Trainer(LightningTrainer):
|
|
|
442
449
|
# If we have a cached path, then we symlink it to the new path.
|
|
443
450
|
log.info(f"Re-using cached checkpoint {cached_path} for {filepath}.")
|
|
444
451
|
if self.is_global_zero:
|
|
445
|
-
|
|
446
|
-
_link_checkpoint(cached_path, filepath, metadata=False)
|
|
447
|
-
else:
|
|
448
|
-
shutil.copy(cached_path, filepath)
|
|
452
|
+
shutil.copy(cached_path, filepath)
|
|
449
453
|
self.strategy.barrier("Trainer.save_checkpoint")
|
|
450
454
|
else:
|
|
451
455
|
super().save_checkpoint(filepath, weights_only, storage_options)
|
|
@@ -454,7 +458,7 @@ class Trainer(LightningTrainer):
|
|
|
454
458
|
if use_checkpoint_cache and cached_path is None:
|
|
455
459
|
self._nshtrainer_checkpoint_cache[
|
|
456
460
|
(self.current_epoch, self.global_step)
|
|
457
|
-
]
|
|
461
|
+
].append(filepath)
|
|
458
462
|
log.debug(f"Checkpoint saved to cache: {filepath}")
|
|
459
463
|
|
|
460
464
|
# Save the checkpoint metadata
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/_throughput_monitor_callback.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py
RENAMED
|
File without changes
|
{nshtrainer-0.26.2 → nshtrainer-0.27.0}/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|