nshtrainer 0.8.5__tar.gz → 0.8.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/PKG-INFO +1 -1
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/pyproject.toml +1 -2
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/trainer/signal_connector.py +60 -14
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/README.md +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/_experimental/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/_experimental/flops/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/_experimental/flops/flop_counter.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/_experimental/flops/module_tracker.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/actsave/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/actsave/_callback.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/_throughput_monitor_callback.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/base.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/early_stopping.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/ema.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/finite_checks.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/gradient_skipping.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/interval.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/latest_epoch_checkpoint.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/log_epoch.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/norm_logging.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/on_exception_checkpoint.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/print_table.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/throughput_monitor.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/timer.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/wandb_watch.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/data/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/data/balanced_batch_sampler.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/data/transform.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/_experimental.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/actsave.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/callbacks.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/config.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/data.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/log.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/lr_scheduler.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/model.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/nn.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/optimizer.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/runner.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/snapshot.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/snoop.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/trainer.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/typecheck.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/ll/util.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/lr_scheduler/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/lr_scheduler/_base.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/lr_scheduler/linear_warmup_cosine.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/base.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/config.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/modules/callback.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/modules/debug.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/modules/distributed.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/modules/logger.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/modules/profiler.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/modules/rlp_sanity_checks.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/model/modules/shared_parameters.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/nn/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/nn/mlp.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/nn/module_dict.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/nn/module_list.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/nn/nonlinearity.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/optimizer.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/runner.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/scripts/check_env.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/scripts/find_packages.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/trainer/__init__.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/trainer/trainer.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/util/environment.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/util/seed.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/util/slurm.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/util/typed.py +0 -0
- {nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/util/typing_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "nshtrainer"
|
|
3
|
-
version = "0.8.5"
|
|
3
|
+
version = "0.8.7"
|
|
4
4
|
description = ""
|
|
5
5
|
authors = ["Nima Shoghi <nimashoghi@gmail.com>"]
|
|
6
6
|
readme = "README.md"
|
|
@@ -36,7 +36,6 @@ strictListInference = true
|
|
|
36
36
|
strictDictionaryInference = true
|
|
37
37
|
strictSetInference = true
|
|
38
38
|
reportPrivateImportUsage = false
|
|
39
|
-
ignore = ["./build/"]
|
|
40
39
|
|
|
41
40
|
[tool.ruff.lint]
|
|
42
41
|
ignore = ["F722", "F821", "E731", "E741"]
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import re
|
|
4
4
|
import signal
|
|
5
5
|
import subprocess
|
|
6
|
+
import sys
|
|
6
7
|
import threading
|
|
7
8
|
from collections import defaultdict
|
|
8
9
|
from collections.abc import Callable
|
|
@@ -150,7 +151,7 @@ class _SignalConnector(_LightningSignalConnector):
|
|
|
150
151
|
cmd = ["scontrol", "requeue", job_id]
|
|
151
152
|
|
|
152
153
|
# requeue job
|
|
153
|
-
log.info(f"
|
|
154
|
+
log.info(f"Requeuing job {job_id}...")
|
|
154
155
|
try:
|
|
155
156
|
result = subprocess.call(cmd)
|
|
156
157
|
except FileNotFoundError:
|
|
@@ -202,18 +203,63 @@ class _SignalConnector(_LightningSignalConnector):
|
|
|
202
203
|
exe = str((Path(bin_dir) / exe).resolve().absolute())
|
|
203
204
|
|
|
204
205
|
log.info(f"Using LSF requeue executable: {exe}")
|
|
205
|
-
cmd = [exe, job_id]
|
|
206
|
-
|
|
207
|
-
# Requeue job
|
|
208
|
-
log.info(f"Requeuing job {job_id}...")
|
|
209
|
-
try:
|
|
210
|
-
result = subprocess.call(cmd)
|
|
211
|
-
except FileNotFoundError:
|
|
212
|
-
# Retry with shell context if subprocess call fails
|
|
213
|
-
result = subprocess.call(" ".join(cmd), shell=True)
|
|
214
206
|
|
|
215
|
-
#
|
|
216
|
-
|
|
217
|
-
|
|
207
|
+
# If NSHRUNNER_LSF_EXIT_SCRIPT_DIR exists, we should emit a bash script in that directory
|
|
208
|
+
# rather than calling the requeue command directly. This is because the requeue command
|
|
209
|
+
# is only available outside of the `jsrun` context, and the exit script is called within
|
|
210
|
+
# the `jsrun` context.
|
|
211
|
+
if not (exit_script_dir := os.getenv("NSHRUNNER_LSF_EXIT_SCRIPT_DIR")):
|
|
212
|
+
cmd = [exe, job_id]
|
|
213
|
+
|
|
214
|
+
# Requeue job
|
|
215
|
+
log.info(f"Requeuing job {job_id}...")
|
|
216
|
+
try:
|
|
217
|
+
result = subprocess.call(cmd)
|
|
218
|
+
except FileNotFoundError:
|
|
219
|
+
# Retry with shell context if subprocess call fails
|
|
220
|
+
result = subprocess.call(" ".join(cmd), shell=True)
|
|
221
|
+
|
|
222
|
+
# Print result text
|
|
223
|
+
if result == 0:
|
|
224
|
+
log.info(f"Requeued LSF job: {job_id}")
|
|
225
|
+
else:
|
|
226
|
+
log.warning(
|
|
227
|
+
f"Requeuing LSF job {job_id} failed with error code {result}"
|
|
228
|
+
)
|
|
218
229
|
else:
|
|
219
|
-
log.
|
|
230
|
+
log.critical(
|
|
231
|
+
"Environment variable NSHRUNNER_LSF_EXIT_SCRIPT_DIR found.\n"
|
|
232
|
+
"Writing requeue script to exit script directory."
|
|
233
|
+
)
|
|
234
|
+
exit_script_dir = Path(exit_script_dir)
|
|
235
|
+
assert (
|
|
236
|
+
exit_script_dir.is_dir()
|
|
237
|
+
), f"Exit script directory {exit_script_dir} does not exist"
|
|
238
|
+
|
|
239
|
+
exit_script_path = exit_script_dir / f"requeue_{job_id}.sh"
|
|
240
|
+
log.info(f"Writing requeue script to {exit_script_path}")
|
|
241
|
+
|
|
242
|
+
with exit_script_path.open("w") as f:
|
|
243
|
+
f.write(f"#!/bin/bash\n{exe} {job_id}\n")
|
|
244
|
+
|
|
245
|
+
# Make the script executable
|
|
246
|
+
os.chmod(exit_script_path, 0o755)
|
|
247
|
+
|
|
248
|
+
log.info(f"Requeue script written to {exit_script_path}")
|
|
249
|
+
|
|
250
|
+
# Kill the current session to trigger the exit script
|
|
251
|
+
log.info("Killing current session to trigger exit script")
|
|
252
|
+
self._kill_current_session()
|
|
253
|
+
|
|
254
|
+
def _kill_current_session(self):
|
|
255
|
+
from lightning.pytorch.trainer.call import _interrupt
|
|
256
|
+
|
|
257
|
+
_interrupt(self.trainer, KeyboardInterrupt())
|
|
258
|
+
self.trainer._teardown()
|
|
259
|
+
if (launcher := self.trainer.strategy.launcher) is not None:
|
|
260
|
+
launcher.kill(_get_sigkill_signal())
|
|
261
|
+
exit(1)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _get_sigkill_signal() -> _SIGNUM:
|
|
265
|
+
return signal.SIGTERM if sys.platform == "win32" else signal.SIGKILL
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nshtrainer-0.8.5 → nshtrainer-0.8.7}/src/nshtrainer/callbacks/_throughput_monitor_callback.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|