nshtrainer 0.8.5__py3-none-any.whl → 0.8.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,7 @@ import os
3
3
  import re
4
4
  import signal
5
5
  import subprocess
6
+ import sys
6
7
  import threading
7
8
  from collections import defaultdict
8
9
  from collections.abc import Callable
@@ -150,7 +151,7 @@ class _SignalConnector(_LightningSignalConnector):
150
151
  cmd = ["scontrol", "requeue", job_id]
151
152
 
152
153
  # requeue job
153
- log.info(f"requeing job {job_id}...")
154
+ log.info(f"Requeuing job {job_id}...")
154
155
  try:
155
156
  result = subprocess.call(cmd)
156
157
  except FileNotFoundError:
@@ -202,18 +203,63 @@ class _SignalConnector(_LightningSignalConnector):
202
203
  exe = str((Path(bin_dir) / exe).resolve().absolute())
203
204
 
204
205
  log.info(f"Using LSF requeue executable: {exe}")
205
- cmd = [exe, job_id]
206
-
207
- # Requeue job
208
- log.info(f"Requeuing job {job_id}...")
209
- try:
210
- result = subprocess.call(cmd)
211
- except FileNotFoundError:
212
- # Retry with shell context if subprocess call fails
213
- result = subprocess.call(" ".join(cmd), shell=True)
214
206
 
215
- # Print result text
216
- if result == 0:
217
- log.info(f"Requeued LSF job: {job_id}")
207
+ # If NSHRUNNER_LSF_EXIT_SCRIPT_DIR exists, we should emit a bash script in that directory
208
+ # rather than calling the requeue command directly. This is because the requeue command
209
+ # is only available outside of the `jsrun` context, and the exit script is called within
210
+ # the `jsrun` context.
211
+ if not (exit_script_dir := os.getenv("NSHRUNNER_LSF_EXIT_SCRIPT_DIR")):
212
+ cmd = [exe, job_id]
213
+
214
+ # Requeue job
215
+ log.info(f"Requeuing job {job_id}...")
216
+ try:
217
+ result = subprocess.call(cmd)
218
+ except FileNotFoundError:
219
+ # Retry with shell context if subprocess call fails
220
+ result = subprocess.call(" ".join(cmd), shell=True)
221
+
222
+ # Print result text
223
+ if result == 0:
224
+ log.info(f"Requeued LSF job: {job_id}")
225
+ else:
226
+ log.warning(
227
+ f"Requeuing LSF job {job_id} failed with error code {result}"
228
+ )
218
229
  else:
219
- log.warning(f"Requeuing LSF job {job_id} failed with error code {result}")
230
+ log.critical(
231
+ "Environment variable NSHRUNNER_LSF_EXIT_SCRIPT_DIR found.\n"
232
+ "Writing requeue script to exit script directory."
233
+ )
234
+ exit_script_dir = Path(exit_script_dir)
235
+ assert (
236
+ exit_script_dir.is_dir()
237
+ ), f"Exit script directory {exit_script_dir} does not exist"
238
+
239
+ exit_script_path = exit_script_dir / f"requeue_{job_id}.sh"
240
+ log.info(f"Writing requeue script to {exit_script_path}")
241
+
242
+ with exit_script_path.open("w") as f:
243
+ f.write(f"#!/bin/bash\n{exe} {job_id}\n")
244
+
245
+ # Make the script executable
246
+ os.chmod(exit_script_path, 0o755)
247
+
248
+ log.info(f"Requeue script written to {exit_script_path}")
249
+
250
+ # Kill the current session to trigger the exit script
251
+ log.info("Killing current session to trigger exit script")
252
+ self._kill_current_session()
253
+
254
+ def _kill_current_session(self):
255
+ from lightning.pytorch.trainer.call import _interrupt
256
+
257
+ _interrupt(self.trainer, KeyboardInterrupt())
258
+ self.trainer._teardown()
259
+ if (launcher := self.trainer.strategy.launcher) is not None:
260
+ launcher.kill(_get_sigkill_signal())
261
+ exit(1)
262
+
263
+
264
+ def _get_sigkill_signal() -> _SIGNUM:
265
+ return signal.SIGTERM if sys.platform == "win32" else signal.SIGKILL
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nshtrainer
3
- Version: 0.8.5
3
+ Version: 0.8.7
4
4
  Summary:
5
5
  Author: Nima Shoghi
6
6
  Author-email: nimashoghi@gmail.com
@@ -65,13 +65,13 @@ nshtrainer/runner.py,sha256=7EumpnBkdNWjSNT9Gm-pkxAJ3W6-iMC-yae-WNeZcLw,3771
65
65
  nshtrainer/scripts/check_env.py,sha256=IMl6dSqsLYppI0XuCsVq8lK4bYqXwY9KHJkzsShz4Kg,806
66
66
  nshtrainer/scripts/find_packages.py,sha256=FbdlfmAefttFSMfaT0A46a-oHLP_ioaQKihwBfBeWeA,1467
67
67
  nshtrainer/trainer/__init__.py,sha256=P2rmr8oBVTHk-HJHYPcUwWqDEArMbPR4_rPpATbWK3E,40
68
- nshtrainer/trainer/signal_connector.py,sha256=QAoPM_C5JJOVQebcrJOimUUD3GHyoeZUqCEAvzZlT4U,8710
68
+ nshtrainer/trainer/signal_connector.py,sha256=JSP8W2PSdzwO3iWX1WOL1l8dufh2dKgUWeJ2gEWCppg,10626
69
69
  nshtrainer/trainer/trainer.py,sha256=eYEYfY9v70MuorHcSf8nqM7f2CkmUHhpPcjCk4FJD7k,14034
70
70
  nshtrainer/util/environment.py,sha256=_SEtiQ_s5bL5pllUlf96AOUv15kNvCPvocVC13S7mIk,4166
71
71
  nshtrainer/util/seed.py,sha256=HEXgVs-wldByahOysKwq7506OHxdYTEgmP-tDQVAEkQ,287
72
72
  nshtrainer/util/slurm.py,sha256=rofIU26z3SdL79SF45tNez6juou1cyDLz07oXEZb9Hg,1566
73
73
  nshtrainer/util/typed.py,sha256=NGuDkDzFlc1fAoaXjOFZVbmj0mRFjsQi1E_hPa7Bn5U,128
74
74
  nshtrainer/util/typing_utils.py,sha256=8ptjSSLZxlmy4FY6lzzkoGoF5fGNClo8-B_c0XHQaNU,385
75
- nshtrainer-0.8.5.dist-info/METADATA,sha256=swroihBvsK71kkd51ekyULjRaBGr3ujP9fZ4HDQqhN8,647
76
- nshtrainer-0.8.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
77
- nshtrainer-0.8.5.dist-info/RECORD,,
75
+ nshtrainer-0.8.7.dist-info/METADATA,sha256=O1kFYWXIuVK1EU0TpwbpbADX1lJQmPM1-9xLTuNaNB8,647
76
+ nshtrainer-0.8.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
77
+ nshtrainer-0.8.7.dist-info/RECORD,,