nshtrainer 0.8.5__tar.gz → 0.8.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/PKG-INFO +1 -1
  2. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/pyproject.toml +1 -2
  3. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/trainer/signal_connector.py +42 -14
  4. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/README.md +0 -0
  5. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/__init__.py +0 -0
  6. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/_experimental/__init__.py +0 -0
  7. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/_experimental/flops/__init__.py +0 -0
  8. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/_experimental/flops/flop_counter.py +0 -0
  9. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/_experimental/flops/module_tracker.py +0 -0
  10. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/actsave/__init__.py +0 -0
  11. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/actsave/_callback.py +0 -0
  12. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/__init__.py +0 -0
  13. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/_throughput_monitor_callback.py +0 -0
  14. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/base.py +0 -0
  15. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/early_stopping.py +0 -0
  16. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/ema.py +0 -0
  17. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/finite_checks.py +0 -0
  18. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/gradient_skipping.py +0 -0
  19. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/interval.py +0 -0
  20. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/latest_epoch_checkpoint.py +0 -0
  21. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/log_epoch.py +0 -0
  22. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/norm_logging.py +0 -0
  23. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/on_exception_checkpoint.py +0 -0
  24. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/print_table.py +0 -0
  25. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/throughput_monitor.py +0 -0
  26. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/timer.py +0 -0
  27. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/callbacks/wandb_watch.py +0 -0
  28. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/data/__init__.py +0 -0
  29. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/data/balanced_batch_sampler.py +0 -0
  30. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/data/transform.py +0 -0
  31. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/__init__.py +0 -0
  32. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/_experimental.py +0 -0
  33. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/actsave.py +0 -0
  34. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/callbacks.py +0 -0
  35. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/config.py +0 -0
  36. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/data.py +0 -0
  37. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/log.py +0 -0
  38. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/lr_scheduler.py +0 -0
  39. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/model.py +0 -0
  40. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/nn.py +0 -0
  41. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/optimizer.py +0 -0
  42. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/runner.py +0 -0
  43. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/snapshot.py +0 -0
  44. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/snoop.py +0 -0
  45. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/trainer.py +0 -0
  46. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/typecheck.py +0 -0
  47. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/ll/util.py +0 -0
  48. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/lr_scheduler/__init__.py +0 -0
  49. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/lr_scheduler/_base.py +0 -0
  50. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/lr_scheduler/linear_warmup_cosine.py +0 -0
  51. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +0 -0
  52. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/__init__.py +0 -0
  53. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/base.py +0 -0
  54. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/config.py +0 -0
  55. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/modules/callback.py +0 -0
  56. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/modules/debug.py +0 -0
  57. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/modules/distributed.py +0 -0
  58. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/modules/logger.py +0 -0
  59. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/modules/profiler.py +0 -0
  60. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/modules/rlp_sanity_checks.py +0 -0
  61. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/model/modules/shared_parameters.py +0 -0
  62. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/nn/__init__.py +0 -0
  63. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/nn/mlp.py +0 -0
  64. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/nn/module_dict.py +0 -0
  65. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/nn/module_list.py +0 -0
  66. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/nn/nonlinearity.py +0 -0
  67. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/optimizer.py +0 -0
  68. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/runner.py +0 -0
  69. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/scripts/check_env.py +0 -0
  70. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/scripts/find_packages.py +0 -0
  71. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/trainer/__init__.py +0 -0
  72. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/trainer/trainer.py +0 -0
  73. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/util/environment.py +0 -0
  74. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/util/seed.py +0 -0
  75. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/util/slurm.py +0 -0
  76. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/util/typed.py +0 -0
  77. {nshtrainer-0.8.5 → nshtrainer-0.8.6}/src/nshtrainer/util/typing_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nshtrainer
3
- Version: 0.8.5
3
+ Version: 0.8.6
4
4
  Summary:
5
5
  Author: Nima Shoghi
6
6
  Author-email: nimashoghi@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "nshtrainer"
3
- version = "0.8.5"
3
+ version = "0.8.6"
4
4
  description = ""
5
5
  authors = ["Nima Shoghi <nimashoghi@gmail.com>"]
6
6
  readme = "README.md"
@@ -36,7 +36,6 @@ strictListInference = true
36
36
  strictDictionaryInference = true
37
37
  strictSetInference = true
38
38
  reportPrivateImportUsage = false
39
- ignore = ["./build/"]
40
39
 
41
40
  [tool.ruff.lint]
42
41
  ignore = ["F722", "F821", "E731", "E741"]
@@ -150,7 +150,7 @@ class _SignalConnector(_LightningSignalConnector):
150
150
  cmd = ["scontrol", "requeue", job_id]
151
151
 
152
152
  # requeue job
153
- log.info(f"requeing job {job_id}...")
153
+ log.info(f"Requeuing job {job_id}...")
154
154
  try:
155
155
  result = subprocess.call(cmd)
156
156
  except FileNotFoundError:
@@ -202,18 +202,46 @@ class _SignalConnector(_LightningSignalConnector):
202
202
  exe = str((Path(bin_dir) / exe).resolve().absolute())
203
203
 
204
204
  log.info(f"Using LSF requeue executable: {exe}")
205
- cmd = [exe, job_id]
206
-
207
- # Requeue job
208
- log.info(f"Requeuing job {job_id}...")
209
- try:
210
- result = subprocess.call(cmd)
211
- except FileNotFoundError:
212
- # Retry with shell context if subprocess call fails
213
- result = subprocess.call(" ".join(cmd), shell=True)
214
205
 
215
- # Print result text
216
- if result == 0:
217
- log.info(f"Requeued LSF job: {job_id}")
206
+ # If NSHRUNNER_LSF_EXIT_SCRIPT_DIR exists, we should emit a bash script in that directory
207
+ # rather than calling the requeue command directly. This is because the requeue command
208
+ # is only available outside of the `jsrun` context, and the exit script is called within
209
+ # the `jsrun` context.
210
+ if not (exit_script_dir := os.getenv("NSHRUNNER_LSF_EXIT_SCRIPT_DIR")):
211
+ cmd = [exe, job_id]
212
+
213
+ # Requeue job
214
+ log.info(f"Requeuing job {job_id}...")
215
+ try:
216
+ result = subprocess.call(cmd)
217
+ except FileNotFoundError:
218
+ # Retry with shell context if subprocess call fails
219
+ result = subprocess.call(" ".join(cmd), shell=True)
220
+
221
+ # Print result text
222
+ if result == 0:
223
+ log.info(f"Requeued LSF job: {job_id}")
224
+ else:
225
+ log.warning(
226
+ f"Requeuing LSF job {job_id} failed with error code {result}"
227
+ )
218
228
  else:
219
- log.warning(f"Requeuing LSF job {job_id} failed with error code {result}")
229
+ log.critical(
230
+ "Environment variable NSHRUNNER_LSF_EXIT_SCRIPT_DIR found.\n"
231
+ "Writing requeue script to exit script directory."
232
+ )
233
+ exit_script_dir = Path(exit_script_dir)
234
+ assert (
235
+ exit_script_dir.is_dir()
236
+ ), f"Exit script directory {exit_script_dir} does not exist"
237
+
238
+ exit_script_path = exit_script_dir / f"requeue_{job_id}.sh"
239
+ log.info(f"Writing requeue script to {exit_script_path}")
240
+
241
+ with exit_script_path.open("w") as f:
242
+ f.write(f"#!/bin/bash\n{exe} {job_id}\n")
243
+
244
+ # Make the script executable
245
+ os.chmod(exit_script_path, 0o755)
246
+
247
+ log.info(f"Requeue script written to {exit_script_path}")
File without changes