nshtrainer 0.8.6__tar.gz → 0.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/PKG-INFO +1 -1
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/pyproject.toml +1 -1
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/config.py +47 -10
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/trainer/signal_connector.py +18 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/README.md +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/_experimental/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/_experimental/flops/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/_experimental/flops/flop_counter.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/_experimental/flops/module_tracker.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/actsave/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/actsave/_callback.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/_throughput_monitor_callback.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/base.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/early_stopping.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/ema.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/finite_checks.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/gradient_skipping.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/interval.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/latest_epoch_checkpoint.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/log_epoch.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/norm_logging.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/on_exception_checkpoint.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/print_table.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/throughput_monitor.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/timer.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/wandb_watch.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/data/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/data/balanced_batch_sampler.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/data/transform.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/_experimental.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/actsave.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/callbacks.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/config.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/data.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/log.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/lr_scheduler.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/model.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/nn.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/optimizer.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/runner.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/snapshot.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/snoop.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/trainer.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/typecheck.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/ll/util.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/lr_scheduler/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/lr_scheduler/_base.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/lr_scheduler/linear_warmup_cosine.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/base.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/modules/callback.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/modules/debug.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/modules/distributed.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/modules/logger.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/modules/profiler.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/modules/rlp_sanity_checks.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/model/modules/shared_parameters.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/nn/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/nn/mlp.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/nn/module_dict.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/nn/module_list.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/nn/nonlinearity.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/optimizer.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/runner.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/scripts/check_env.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/scripts/find_packages.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/trainer/__init__.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/trainer/trainer.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/util/environment.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/util/seed.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/util/slurm.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/util/typed.py +0 -0
- {nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/util/typing_utils.py +0 -0
|
@@ -805,6 +805,36 @@ class CheckpointLoadingConfig(C.Config):
|
|
|
805
805
|
"""
|
|
806
806
|
|
|
807
807
|
|
|
808
|
+
def _create_symlink_to_nshrunner(base_dir: Path):
    """Link ``base_dir / "nshrunner"`` to the current nshrunner session directory.

    The session directory is read from the ``NSHRUNNER_SESSION_DIR`` environment
    variable. The function is best-effort: if the variable is unset or empty, or
    does not name an existing directory, a warning is logged and nothing is
    created.

    If the path already resolves to the session directory, nothing is changed.
    A symlink that points anywhere else, including a dangling one whose target
    no longer exists, is replaced after logging a warning. A regular file or
    directory occupying the path is never deleted; it is reported and left
    alone.

    Args:
        base_dir: Directory in which the ``nshrunner`` link is created.
    """
    # Resolve the current nshrunner session directory
    if not (session_dir := os.environ.get("NSHRUNNER_SESSION_DIR")):
        log.warning("NSHRUNNER_SESSION_DIR is not set. Skipping symlink creation.")
        return
    session_dir = Path(session_dir)
    # is_dir() is already False for a missing path, so no separate exists() check.
    if not session_dir.is_dir():
        log.warning(
            f"NSHRUNNER_SESSION_DIR is not a valid directory: {session_dir}. "
            "Skipping symlink creation."
        )
        return

    # Create the symlink
    symlink_path = base_dir / "nshrunner"

    # exists() follows links and is False for a dangling one, which would let
    # symlink_to() below fail with FileExistsError. is_symlink() looks at the
    # link itself, so stale links are detected and replaced.
    is_link = symlink_path.is_symlink()
    if is_link or symlink_path.exists():
        # If it already points to the correct directory, we're done
        if symlink_path.resolve() == session_dir.resolve():
            return

        if not is_link:
            # A real file or directory lives here; deleting it could lose data.
            log.warning(
                f"{symlink_path} already exists and is not a symlink. "
                "Skipping symlink creation."
            )
            return

        # Otherwise, we should log a warning and remove the existing symlink
        log.warning(
            f"A symlink pointing to {symlink_path.resolve()} already exists at {symlink_path}. "
            "Removing the existing symlink."
        )
        symlink_path.unlink()

    symlink_path.symlink_to(session_dir)
|
|
836
|
+
|
|
837
|
+
|
|
808
838
|
class DirectoryConfig(C.Config):
|
|
809
839
|
project_root: Path | None = None
|
|
810
840
|
"""
|
|
@@ -813,30 +843,33 @@ class DirectoryConfig(C.Config):
|
|
|
813
843
|
This isn't specific to the run; it is the parent directory of all runs.
|
|
814
844
|
"""
|
|
815
845
|
|
|
846
|
+
create_symlink_to_nshrunner_root: bool = True
|
|
847
|
+
"""Should we create a symlink to the root folder for the Runner (if we're in one)?"""
|
|
848
|
+
|
|
816
849
|
log: Path | None = None
|
|
817
|
-
"""Base directory for all experiment tracking (e.g., WandB, Tensorboard, etc.) files. If None, will use
|
|
850
|
+
"""Base directory for all experiment tracking (e.g., WandB, Tensorboard, etc.) files. If None, will use nshtrainer/{id}/log/."""
|
|
818
851
|
|
|
819
852
|
stdio: Path | None = None
|
|
820
|
-
"""stdout/stderr log directory to use for the trainer. If None, will use
|
|
853
|
+
"""stdout/stderr log directory to use for the trainer. If None, will use nshtrainer/{id}/stdio/."""
|
|
821
854
|
|
|
822
855
|
checkpoint: Path | None = None
|
|
823
|
-
"""Checkpoint directory to use for the trainer. If None, will use
|
|
856
|
+
"""Checkpoint directory to use for the trainer. If None, will use nshtrainer/{id}/checkpoint/."""
|
|
824
857
|
|
|
825
858
|
activation: Path | None = None
|
|
826
|
-
"""Activation directory to use for the trainer. If None, will use
|
|
859
|
+
"""Activation directory to use for the trainer. If None, will use nshtrainer/{id}/activation/."""
|
|
827
860
|
|
|
828
861
|
profile: Path | None = None
|
|
829
|
-
"""Directory to save profiling information to. If None, will use
|
|
862
|
+
"""Directory to save profiling information to. If None, will use nshtrainer/{id}/profile/."""
|
|
830
863
|
|
|
831
864
|
def resolve_run_root_directory(self, run_id: str) -> Path:
|
|
832
865
|
if (project_root_dir := self.project_root) is None:
|
|
833
866
|
project_root_dir = Path.cwd()
|
|
834
867
|
|
|
835
|
-
# The default base dir is $CWD/
|
|
836
|
-
base_dir = project_root_dir / "
|
|
868
|
+
# The default base dir is $CWD/nshtrainer/{id}/
|
|
869
|
+
base_dir = project_root_dir / "nshtrainer"
|
|
837
870
|
base_dir.mkdir(exist_ok=True)
|
|
838
871
|
|
|
839
|
-
# Add a .gitignore file to the
|
|
872
|
+
# Add a .gitignore file to the nshtrainer directory
|
|
840
873
|
# which will ignore all files except for the .gitignore file itself
|
|
841
874
|
gitignore_path = base_dir / ".gitignore"
|
|
842
875
|
if not gitignore_path.exists():
|
|
@@ -846,6 +879,10 @@ class DirectoryConfig(C.Config):
|
|
|
846
879
|
base_dir = base_dir / run_id
|
|
847
880
|
base_dir.mkdir(exist_ok=True)
|
|
848
881
|
|
|
882
|
+
# Create a symlink to the root folder for the Runner
|
|
883
|
+
if self.create_symlink_to_nshrunner_root:
|
|
884
|
+
_create_symlink_to_nshrunner(base_dir)
|
|
885
|
+
|
|
849
886
|
return base_dir
|
|
850
887
|
|
|
851
888
|
def resolve_subdirectory(
|
|
@@ -854,7 +891,7 @@ class DirectoryConfig(C.Config):
|
|
|
854
891
|
# subdirectory: Literal["log", "stdio", "checkpoint", "activation", "profile"],
|
|
855
892
|
subdirectory: str,
|
|
856
893
|
) -> Path:
|
|
857
|
-
# The subdir will be $CWD/
|
|
894
|
+
# The subdir will be $CWD/nshtrainer/{id}/{log, stdio, checkpoint, activation}/
|
|
858
895
|
if (subdir := getattr(self, subdirectory, None)) is not None:
|
|
859
896
|
assert isinstance(
|
|
860
897
|
subdir, Path
|
|
@@ -874,7 +911,7 @@ class DirectoryConfig(C.Config):
|
|
|
874
911
|
if (log_dir := logger.log_dir) is not None:
|
|
875
912
|
return log_dir
|
|
876
913
|
|
|
877
|
-
# Save to
|
|
914
|
+
# Save to nshtrainer/{id}/log/{logger kind}/{id}/
|
|
878
915
|
log_dir = self.resolve_subdirectory(run_id, "log")
|
|
879
916
|
log_dir = log_dir / logger.kind
|
|
880
917
|
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import re
|
|
4
4
|
import signal
|
|
5
5
|
import subprocess
|
|
6
|
+
import sys
|
|
6
7
|
import threading
|
|
7
8
|
from collections import defaultdict
|
|
8
9
|
from collections.abc import Callable
|
|
@@ -245,3 +246,20 @@ class _SignalConnector(_LightningSignalConnector):
|
|
|
245
246
|
os.chmod(exit_script_path, 0o755)
|
|
246
247
|
|
|
247
248
|
log.info(f"Requeue script written to {exit_script_path}")
|
|
249
|
+
|
|
250
|
+
# Kill the current session to trigger the exit script
|
|
251
|
+
log.info("Killing current session to trigger exit script")
|
|
252
|
+
self._kill_current_session()
|
|
253
|
+
|
|
254
|
+
def _kill_current_session(self):
    """Interrupt the trainer, tear it down, and exit the process with status 1.

    Steps, in order: call Lightning's ``_interrupt`` with a
    ``KeyboardInterrupt``, call ``self.trainer._teardown()``, ask the
    strategy's launcher (if there is one) to kill with the signal from
    ``_get_sigkill_signal()``, then raise ``SystemExit(1)``.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    # Imported lazily, as in the original; _interrupt is a private Lightning helper.
    from lightning.pytorch.trainer.call import _interrupt

    _interrupt(self.trainer, KeyboardInterrupt())
    self.trainer._teardown()
    if (launcher := self.trainer.strategy.launcher) is not None:
        launcher.kill(_get_sigkill_signal())
    # Use sys.exit rather than the bare exit() helper: exit() is injected by the
    # site module for interactive use and is absent under `python -S` or in
    # frozen/embedded interpreters.
    sys.exit(1)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _get_sigkill_signal() -> _SIGNUM:
    """Return the kill signal for the current platform.

    Returns:
        ``signal.SIGTERM`` when ``sys.platform`` is ``"win32"``, otherwise
        ``signal.SIGKILL``.
    """
    if sys.platform == "win32":
        return signal.SIGTERM
    return signal.SIGKILL
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nshtrainer-0.8.6 → nshtrainer-0.9.1}/src/nshtrainer/callbacks/_throughput_monitor_callback.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|