nshtrainer 0.26.2.tar.gz → 0.28.0.tar.gz
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/PKG-INFO +1 -1
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/pyproject.toml +1 -1
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/_hf_hub.py +0 -16
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/checkpoint/_base.py +1 -6
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/config.py +0 -2
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/trainer/trainer.py +5 -53
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/README.md +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/_callback.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/_checkpoint/loader.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/_checkpoint/metadata.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/_checkpoint/saver.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/_experimental/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/_throughput_monitor_callback.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/actsave.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/base.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/checkpoint/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/early_stopping.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/ema.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/finite_checks.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/gradient_skipping.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/interval.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/log_epoch.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/norm_logging.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/print_table.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/throughput_monitor.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/timer.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/callbacks/wandb_watch.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/data/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/data/balanced_batch_sampler.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/data/transform.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/_experimental.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/actsave.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/callbacks.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/config.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/data.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/log.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/lr_scheduler.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/model.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/nn.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/optimizer.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/runner.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/snapshot.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/snoop.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/trainer.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/typecheck.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/ll/util.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/loggers/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/loggers/_base.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/loggers/csv.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/loggers/tensorboard.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/loggers/wandb.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/lr_scheduler/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/lr_scheduler/_base.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/lr_scheduler/linear_warmup_cosine.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/metrics/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/metrics/_config.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/base.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/modules/callback.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/modules/debug.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/modules/distributed.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/modules/logger.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/modules/profiler.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/modules/rlp_sanity_checks.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/modules/shared_parameters.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/nn/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/nn/mlp.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/nn/module_dict.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/nn/module_list.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/nn/nonlinearity.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/optimizer.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/runner.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/scripts/find_packages.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/trainer/__init__.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/trainer/_runtime_callback.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/trainer/checkpoint_connector.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/trainer/signal_connector.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/util/_environment_info.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/util/_useful_types.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/util/environment.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/util/path.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/util/seed.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/util/slurm.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/util/typed.py +0 -0
- {nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/util/typing_utils.py +0 -0
{nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/_hf_hub.py

```diff
@@ -359,19 +359,3 @@ class HFHubCallback(NTCallbackBase):
         # NOTE: This file is fairly small, so we can just upload it directly.
         # No need to copy.
         self._save_file(metadata_path)
-
-    @override
-    def state_dict(self):
-        return {
-            "repo_id": self._repo_id,
-            "checksum_to_path_in_repo": {
-                k: str(v) for k, v in self._checksum_to_path_in_repo.items()
-            },
-        }
-
-    @override
-    def load_state_dict(self, state_dict):
-        self._repo_id = state_dict["repo_id"]
-        self._checksum_to_path_in_repo = {
-            k: Path(v) for k, v in state_dict["checksum_to_path_in_repo"].items()
-        }
```
@@ -152,12 +152,7 @@ class CheckpointBase(Checkpoint, ABC, Generic[TConfig]):
|
|
|
152
152
|
|
|
153
153
|
# Save the new checkpoint
|
|
154
154
|
filepath = self.resolve_checkpoint_path(self.current_metrics(trainer))
|
|
155
|
-
trainer.
|
|
156
|
-
filepath,
|
|
157
|
-
self.config.save_weights_only,
|
|
158
|
-
use_checkpoint_cache=None,
|
|
159
|
-
ckpt_cache_use_symlink=False,
|
|
160
|
-
)
|
|
155
|
+
trainer.save_checkpoint(filepath, self.config.save_weights_only)
|
|
161
156
|
|
|
162
157
|
if trainer.is_global_zero:
|
|
163
158
|
# Create the latest symlink
|
|
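The call site now targets the stock `lightning.pytorch.Trainer.save_checkpoint` signature rather than the private helper. For reference, a hedged usage sketch (the trainer setup and path are illustrative):

```python
import lightning.pytorch as pl

trainer = pl.Trainer(max_epochs=1)  # illustrative trainer
# ... after trainer.fit(...) has attached a model ...
trainer.save_checkpoint(
    "checkpoints/last.ckpt",   # illustrative path
    weights_only=False,        # False also persists optimizer/callback state
    storage_options=None,      # forwarded to the configured CheckpointIO plugin
)
```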
{nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/model/config.py

```diff
@@ -1012,8 +1012,6 @@ class TrainerConfig(C.Config):
     """If enabled, the model supports scaling the gradients of shared parameters that are registered using `LightningModuleBase.register_shared_parameters(...)`"""
     save_checkpoint_metadata: bool = True
     """If enabled, will save additional metadata whenever a checkpoint is saved."""
-    use_checkpoint_cache: bool = True
-    """If enabled, will optimize the saving of duplicate checkpoints by creating symlinks instead of copying the file."""
 
     lightning_kwargs: LightningTrainerKwargs = LightningTrainerKwargs()
     """
```
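Downstream configs that still set this field need a one-line migration. A hedged sketch, assuming `TrainerConfig` remains constructible with defaults (this diff does not show its other fields):

```python
from nshtrainer.model.config import TrainerConfig

cfg = TrainerConfig()                 # assuming defaults suffice to construct
cfg.save_checkpoint_metadata = True   # still present in 0.28.0
# cfg.use_checkpoint_cache = True     # removed in 0.28.0: delete this line
```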
{nshtrainer-0.26.2 → nshtrainer-0.28.0}/src/nshtrainer/trainer/trainer.py

```diff
@@ -1,6 +1,5 @@
 import logging
 import os
-import shutil
 from collections.abc import Sequence
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, cast
@@ -18,7 +17,6 @@ from lightning.pytorch.utilities.types import _EVALUATE_OUTPUT, _PREDICT_OUTPUT
 from typing_extensions import Unpack, assert_never, override
 
 from .._checkpoint.metadata import _write_checkpoint_metadata
-from .._checkpoint.saver import _link_checkpoint
 from ..callbacks.base import resolve_all_callbacks
 from ..model.config import (
     AcceleratorConfigProtocol,
@@ -287,8 +285,6 @@ class Trainer(LightningTrainer):
         /,
         **kwargs: Unpack[LightningTrainerKwargs],
     ):
-        self._nshtrainer_checkpoint_cache: dict[tuple[int, int], Path] = {}
-
         self._pre_init(config)
 
         kwargs = self._update_kwargs(config, kwargs)
@@ -411,54 +407,24 @@ class Trainer(LightningTrainer):
 
         return super()._run(model, ckpt_path)
 
-    def _nshtrainer_save_checkpoint(
+    @override
+    def save_checkpoint(
         self,
         filepath: str | Path,
         weights_only: bool = False,
         storage_options: Any | None = None,
-        use_checkpoint_cache: bool | None = None,
-        ckpt_cache_use_symlink: bool = False,
     ):
-        lm = self._base_module
-        root_config = cast(BaseConfig, lm.hparams)
-        if use_checkpoint_cache is None:
-            use_checkpoint_cache = root_config.trainer.use_checkpoint_cache
-
         filepath = Path(filepath)
 
         # List of files that we should upload to HF
        written_files: list[Path] = [filepath]
 
-
-        if (
-            use_checkpoint_cache
-            and (
-                cached_path := self._nshtrainer_checkpoint_cache.get(
-                    (self.current_epoch, self.global_step)
-                )
-            )
-            is not None
-        ):
-            # If we have a cached path, then we symlink it to the new path.
-            log.info(f"Re-using cached checkpoint {cached_path} for {filepath}.")
-            if self.is_global_zero:
-                if ckpt_cache_use_symlink:
-                    _link_checkpoint(cached_path, filepath, metadata=False)
-                else:
-                    shutil.copy(cached_path, filepath)
-            self.strategy.barrier("Trainer.save_checkpoint")
-        else:
-            super().save_checkpoint(filepath, weights_only, storage_options)
-
-        # If we are using the cache but we don't have a cached path, then we save the checkpoint to the cache.
-        if use_checkpoint_cache and cached_path is None:
-            self._nshtrainer_checkpoint_cache[
-                (self.current_epoch, self.global_step)
-            ] = filepath
-            log.debug(f"Checkpoint saved to cache: {filepath}")
+        super().save_checkpoint(filepath, weights_only, storage_options)
 
         # Save the checkpoint metadata
         metadata_path = None
+        lm = self._base_module
+        root_config = cast(BaseConfig, lm.hparams)
         if root_config.trainer.save_checkpoint_metadata and self.is_global_zero:
             # Generate the metadata and write to disk
             if (
```
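What was deleted here is a per-(epoch, step) deduplication cache: the first save at a given step wrote the file normally, and any later save at the same step was satisfied by copying (or symlinking) the cached file instead of re-serializing the model. 0.28.0 drops the mechanism and always defers to Lightning. A standalone sketch of the removed pattern, decoupled from nshtrainer (all names are illustrative):

```python
import shutil
from collections.abc import Callable
from pathlib import Path


class CheckpointDedupCache:
    """Remember the first file written for each (epoch, step) key."""

    def __init__(self) -> None:
        self._cache: dict[tuple[int, int], Path] = {}

    def save(
        self,
        key: tuple[int, int],
        filepath: Path,
        write_fn: Callable[[Path], None],
    ) -> None:
        if (cached := self._cache.get(key)) is not None:
            # A checkpoint for this step already exists on disk:
            # copy it rather than serializing the model again.
            shutil.copy(cached, filepath)
        else:
            write_fn(filepath)
            self._cache[key] = filepath
```

Note the implicit assumption the removed code made: model state must be identical for every save that shares an (epoch, step) key, and the cached paths must stay valid for the whole run.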
```diff
@@ -470,17 +436,3 @@ class Trainer(LightningTrainer):
         from .. import _callback
 
         _callback._call_on_checkpoint_saved(self, filepath, metadata_path)
-
-    @override
-    def save_checkpoint(
-        self,
-        filepath: str | Path,
-        weights_only: bool = False,
-        storage_options: Any | None = None,
-    ):
-        return self._nshtrainer_save_checkpoint(
-            filepath=filepath,
-            weights_only=weights_only,
-            storage_options=storage_options,
-            use_checkpoint_cache=False,
-        )
```