toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +41 -17
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +4 -5
  7. toil/batchSystems/gridengine.py +1 -1
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +25 -11
  10. toil/batchSystems/local_support.py +3 -3
  11. toil/batchSystems/lsf.py +9 -9
  12. toil/batchSystems/mesos/batchSystem.py +4 -4
  13. toil/batchSystems/mesos/executor.py +3 -2
  14. toil/batchSystems/options.py +9 -0
  15. toil/batchSystems/singleMachine.py +11 -10
  16. toil/batchSystems/slurm.py +129 -16
  17. toil/batchSystems/torque.py +1 -1
  18. toil/bus.py +45 -3
  19. toil/common.py +56 -31
  20. toil/cwl/cwltoil.py +442 -371
  21. toil/deferred.py +1 -1
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/abstractFileStore.py +69 -20
  24. toil/fileStores/cachingFileStore.py +6 -22
  25. toil/fileStores/nonCachingFileStore.py +6 -15
  26. toil/job.py +270 -86
  27. toil/jobStores/abstractJobStore.py +37 -31
  28. toil/jobStores/aws/jobStore.py +280 -218
  29. toil/jobStores/aws/utils.py +60 -31
  30. toil/jobStores/conftest.py +2 -2
  31. toil/jobStores/fileJobStore.py +3 -3
  32. toil/jobStores/googleJobStore.py +3 -4
  33. toil/leader.py +89 -38
  34. toil/lib/aws/__init__.py +26 -10
  35. toil/lib/aws/iam.py +2 -2
  36. toil/lib/aws/session.py +62 -22
  37. toil/lib/aws/utils.py +73 -37
  38. toil/lib/conversions.py +24 -1
  39. toil/lib/ec2.py +118 -69
  40. toil/lib/expando.py +1 -1
  41. toil/lib/generatedEC2Lists.py +8 -8
  42. toil/lib/io.py +42 -4
  43. toil/lib/misc.py +1 -3
  44. toil/lib/resources.py +57 -16
  45. toil/lib/retry.py +12 -5
  46. toil/lib/threading.py +29 -14
  47. toil/lib/throttle.py +1 -1
  48. toil/options/common.py +31 -30
  49. toil/options/wdl.py +5 -0
  50. toil/provisioners/__init__.py +9 -3
  51. toil/provisioners/abstractProvisioner.py +12 -2
  52. toil/provisioners/aws/__init__.py +20 -15
  53. toil/provisioners/aws/awsProvisioner.py +406 -329
  54. toil/provisioners/gceProvisioner.py +2 -2
  55. toil/provisioners/node.py +13 -5
  56. toil/server/app.py +1 -1
  57. toil/statsAndLogging.py +93 -23
  58. toil/test/__init__.py +27 -12
  59. toil/test/batchSystems/batchSystemTest.py +40 -33
  60. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  61. toil/test/batchSystems/test_slurm.py +22 -7
  62. toil/test/cactus/__init__.py +0 -0
  63. toil/test/cactus/test_cactus_integration.py +58 -0
  64. toil/test/cwl/cwlTest.py +245 -236
  65. toil/test/cwl/seqtk_seq.cwl +1 -1
  66. toil/test/docs/scriptsTest.py +11 -14
  67. toil/test/jobStores/jobStoreTest.py +40 -54
  68. toil/test/lib/aws/test_iam.py +2 -2
  69. toil/test/lib/test_ec2.py +1 -1
  70. toil/test/options/__init__.py +13 -0
  71. toil/test/options/options.py +37 -0
  72. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  73. toil/test/provisioners/clusterTest.py +99 -16
  74. toil/test/server/serverTest.py +2 -2
  75. toil/test/src/autoDeploymentTest.py +1 -1
  76. toil/test/src/dockerCheckTest.py +2 -1
  77. toil/test/src/environmentTest.py +125 -0
  78. toil/test/src/fileStoreTest.py +1 -1
  79. toil/test/src/jobDescriptionTest.py +18 -8
  80. toil/test/src/jobTest.py +1 -1
  81. toil/test/src/realtimeLoggerTest.py +4 -0
  82. toil/test/src/workerTest.py +52 -19
  83. toil/test/utils/toilDebugTest.py +62 -4
  84. toil/test/utils/utilsTest.py +23 -21
  85. toil/test/wdl/wdltoil_test.py +49 -21
  86. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  87. toil/toilState.py +68 -9
  88. toil/utils/toilDebugFile.py +1 -1
  89. toil/utils/toilDebugJob.py +153 -26
  90. toil/utils/toilLaunchCluster.py +12 -2
  91. toil/utils/toilRsyncCluster.py +7 -2
  92. toil/utils/toilSshCluster.py +7 -3
  93. toil/utils/toilStats.py +310 -266
  94. toil/utils/toilStatus.py +98 -52
  95. toil/version.py +11 -11
  96. toil/wdl/wdltoil.py +644 -225
  97. toil/worker.py +125 -83
  98. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  99. toil-7.0.0.dist-info/METADATA +158 -0
  100. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
  101. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  102. toil-6.1.0a1.dist-info/METADATA +0 -125
  103. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  104. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/mesos/batchSystem.py CHANGED
@@ -174,13 +174,13 @@ class MesosBatchSystem(BatchSystemLocalSupport,
     def unignoreNode(self, nodeAddress):
         self.ignoredNodes.remove(nodeAddress)

-    def issueBatchJob(self, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
+    def issueBatchJob(self, command: str, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
         """
         Issues the following command returning a unique jobID. Command is the string to run, memory
         is an int giving the number of bytes the job needs to run in and cores is the number of cpus
         needed for the job and error-file is the path of the file to place any std-err/std-out in.
         """
-        localID = self.handleLocalJob(jobNode)
+        localID = self.handleLocalJob(command, jobNode)
         if localID is not None:
             return localID

@@ -200,12 +200,12 @@ class MesosBatchSystem(BatchSystemLocalSupport,
         job = ToilJob(jobID=jobID,
                       name=str(jobNode),
                       resources=MesosShape(wallTime=0, **mesos_resources),
-                      command=jobNode.command,
+                      command=command,
                       userScript=self.userScript,
                       environment=environment,
                       workerCleanupInfo=self.workerCleanupInfo)
         jobType = job.resources
-        log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))
+        log.debug("Queueing the job %s with job id: %s ...", jobNode, str(jobID))

         # TODO: round all elements of resources
toil/batchSystems/mesos/executor.py CHANGED
@@ -196,12 +196,13 @@ class MesosExecutor(Executor):
         """
         if job.userScript:
             job.userScript.register()
-        log.debug("Invoking command: '%s'", job.command)
+        command = job.command
+        log.debug("Invoking command: '%s'", command)
         # Construct the job's environment
         jobEnv = dict(os.environ, **job.environment)
         log.debug('Using environment variables: %s', jobEnv.keys())
         with self.popenLock:
-            return subprocess.Popen(job.command,
+            return subprocess.Popen(command,
                                     preexec_fn=lambda: os.setpgrp(),
                                     shell=True, env=jobEnv)
toil/batchSystems/options.py CHANGED
@@ -76,6 +76,7 @@ def set_batchsystem_options(batch_system: Optional[str], set_option: OptionSetter
     set_option("manualMemArgs")
     set_option("run_local_jobs_on_workers")
     set_option("statePollingWait")
+    set_option("state_polling_timeout")
     set_option("batch_logs_dir")


@@ -164,6 +165,14 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
         "Return cached results if within the waiting period. Only works for grid "
         "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
     )
+    parser.add_argument(
+        "--statePollingTimeout",
+        dest="state_polling_timeout",
+        type=int,
+        default=1200,
+        help="Time, in seconds, to retry against a broken scheduler. Only works for grid "
+             "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
+    )
     parser.add_argument(
         "--batchLogsDir",
         dest="batch_logs_dir",
toil/batchSystems/singleMachine.py CHANGED
@@ -475,17 +475,17 @@ class SingleMachineBatchSystem(BatchSystemSupport):
                 # We can actually run in this thread
                 jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:4]  # Parse command
                 jobStore = Toil.resumeJobStore(jobStoreLocator)
-                toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
-                                         redirectOutputToLogFile=not self.debugWorker)  # Call the worker
+                statusCode = toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
+                                                      redirect_output_to_log_file=not self.debugWorker)  # Call the worker
             else:
                 # Run synchronously. If starting or running the command fails, let the exception stop us.
-                subprocess.check_call(jobCommand,
+                statusCode = subprocess.check_call(jobCommand,
                                       shell=True,
                                       env=dict(os.environ, **environment))

         self.runningJobs.pop(jobID)
         if not info.killIntended:
-            self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=0, wallTime=time.time() - info.time, exitReason=None))
+            self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=statusCode, wallTime=time.time() - info.time, exitReason=None))

     def getSchedulingStatusMessage(self):
         # Implement the abstractBatchSystem's scheduling status message API
@@ -655,6 +655,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
             # and all its children together. We assume that the
             # process group ID will equal the PID of the process we
             # are starting.
+            logger.debug("Attempting to run job command: %s", jobCommand)
             popen = subprocess.Popen(jobCommand,
                                      shell=True,
                                      env=child_environment,
@@ -743,24 +744,24 @@ class SingleMachineBatchSystem(BatchSystemSupport):

         logger.debug('Child %d for job %s succeeded', pid, jobID)

-    def issueBatchJob(self, jobDesc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+    def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
         """Adds the command and resources to a queue to be run."""

         self._checkOnDaddy()

         # Apply scale in cores
-        scaled_desc = jobDesc.scale('cores', self.scale)
+        scaled_desc = job_desc.scale('cores', self.scale)
         # Round cores up to multiples of minCores
         scaled_desc.cores = max(math.ceil(scaled_desc.cores / self.minCores) * self.minCores, self.minCores)

         # Don't do our own assertions about job size vs. our configured size.
         # The abstract batch system can handle it.
         self.check_resource_request(scaled_desc)
-        logger.debug(f"Issuing the command: {jobDesc.command} with {scaled_desc.requirements_string()}")
+        logger.debug(f"Issuing the command: {command} with {scaled_desc.requirements_string()}")
         with self.jobIndexLock:
             jobID = self.jobIndex
             self.jobIndex += 1
-        self.jobs[jobID] = jobDesc.command
+        self.jobs[jobID] = command

         environment = self.environment.copy()
         if job_environment:
@@ -769,10 +770,10 @@ class SingleMachineBatchSystem(BatchSystemSupport):
         if self.debugWorker:
             # Run immediately, blocking for return.
             # Ignore resource requirements; we run one job at a time
-            self._runDebugJob(jobDesc.command, jobID, environment)
+            self._runDebugJob(command, jobID, environment)
         else:
             # Queue the job for later
-            self.inputQueue.put((jobDesc.command, jobID, scaled_desc.cores, scaled_desc.memory,
+            self.inputQueue.put((command, jobID, scaled_desc.cores, scaled_desc.memory,
                                  scaled_desc.disk, scaled_desc.accelerators, environment))

         return jobID
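
This and the Mesos change above are the same 7.0.0 API shift: the job's command string is now an explicit first argument to issueBatchJob instead of living on the JobDescription. A minimal sketch of the new calling convention, assuming a hypothetical plugin (MiniBatchSystem is illustrative, not part of Toil):

    from typing import Dict, Optional

    class MiniBatchSystem:
        """Hypothetical plugin showing the 7.0.0 issueBatchJob shape."""

        def __init__(self) -> None:
            self.job_index = 0
            self.jobs: Dict[int, str] = {}

        def issueBatchJob(self, command: str, job_desc: object,
                          job_environment: Optional[Dict[str, str]] = None) -> int:
            # The command arrives as its own argument rather than being
            # read from job_desc.command as in 6.x.
            job_id = self.job_index
            self.job_index += 1
            self.jobs[job_id] = command
            return job_id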
toil/batchSystems/slurm.py CHANGED
@@ -16,8 +16,9 @@ import math
 import os
 from argparse import ArgumentParser, _ArgumentGroup
 from shlex import quote
-from typing import Dict, List, Optional, TypeVar, Union
+from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union

+from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE, InsufficientSystemResources
 from toil.batchSystems.abstractGridEngineBatchSystem import \
     AbstractGridEngineBatchSystem
 from toil.batchSystems.options import OptionSetter
@@ -26,10 +27,50 @@ from toil.lib.misc import CalledProcessErrorStderr, call_command

 logger = logging.getLogger(__name__)

+# We have a complete list of Slurm states. States not in one of these aren't
+# allowed. See <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>
+
+# If a job is in one of these states, Slurm can't run it anymore.
+# We don't include states where the job is held or paused here;
+# those mean it could run and needs to wait for someone to un-hold
+# it, so Toil should wait for it.
+#
+# We map from each terminal state to the Toil-ontology exit reason.
+TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
+    "BOOT_FAIL": BatchJobExitReason.LOST,
+    "CANCELLED": BatchJobExitReason.KILLED,
+    "COMPLETED": BatchJobExitReason.FINISHED,
+    "DEADLINE": BatchJobExitReason.KILLED,
+    "FAILED": BatchJobExitReason.FAILED,
+    "NODE_FAIL": BatchJobExitReason.LOST,
+    "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
+    "PREEMPTED": BatchJobExitReason.KILLED,
+    "REVOKED": BatchJobExitReason.KILLED,
+    "SPECIAL_EXIT": BatchJobExitReason.FAILED,
+    "TIMEOUT": BatchJobExitReason.KILLED
+}
+
+# If a job is in one of these states, it might eventually move to a different
+# state.
+NONTERMINAL_STATES: Set[str] = {
+    "CONFIGURING",
+    "COMPLETING",
+    "PENDING",
+    "RUNNING",
+    "RESV_DEL_HOLD",
+    "REQUEUE_FED",
+    "REQUEUE_HOLD",
+    "REQUEUED",
+    "RESIZING",
+    "SIGNALING",
+    "STAGE_OUT",
+    "STOPPED",
+    "SUSPENDED"
+}

 class SlurmBatchSystem(AbstractGridEngineBatchSystem):

-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):

         def getRunningJobIDs(self):
             # Should return a dictionary of Job IDs and number of seconds
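
The two tables are module-level, so callers can classify a Slurm state without instantiating the batch system. A brief illustration grounded in the definitions above:

    from toil.batchSystems.abstractBatchSystem import BatchJobExitReason
    from toil.batchSystems.slurm import NONTERMINAL_STATES, TERMINAL_STATES

    assert TERMINAL_STATES["TIMEOUT"] == BatchJobExitReason.KILLED  # done, treated as killed
    assert "PENDING" in NONTERMINAL_STATES                          # still worth polling
    # A state in neither table is an error; see _canonicalize_state below.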
@@ -64,7 +105,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                               jobName: str,
                               job_environment: Optional[Dict[str, str]] = None,
                               gpus: Optional[int] = None) -> List[str]:
-            return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap={command}']
+            # Make sure to use exec so we can get Slurm's signals in the Toil
+            # worker instead of having an intervening Bash
+            return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap=exec {command}']

         def submitJob(self, subLine):
             try:
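
Why exec matters: sbatch --wrap runs the given string under a shell, so without it Slurm's signals would be delivered to that shell rather than to the Toil worker. A hedged illustration of the submission line this now builds (the worker command shown is a placeholder):

    prefix = ["sbatch", "-J", "toil_job_42_ExampleJob"]     # from prepareSbatch
    command = "_toil_worker ExampleJob file:/tmp/store 42"  # placeholder command
    submission = prefix + [f"--wrap=exec {command}"]
    # exec replaces the wrapping shell with the worker process, so the
    # --signal option added to prepareSbatch later in this diff reaches
    # the worker directly.
    print(" ".join(submission))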
@@ -92,15 +135,15 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 logger.debug("sbatch submitted job %d", result)
                 return result
             except OSError as e:
-                logger.error("sbatch command failed")
+                logger.error(f"sbatch command failed with error: {e}")
                 raise e

-        def coalesce_job_exit_codes(self, batch_job_id_list: list) -> list:
+        def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
             """
             Collect all job exit codes in a single call.
             :param batch_job_id_list: list of Job ID strings, where each string has the form
             "<job>[.<task>]".
-            :return: list of job exit codes, associated with the list of job IDs.
+            :return: list of job exit codes or exit code, exit reason pairs associated with the list of job IDs.
             """
             logger.debug("Getting exit codes for slurm jobs: %s", batch_job_id_list)
             # Convert batch_job_id_list to list of integer job IDs.
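
The widened annotation means each element of the returned list can now take one of three shapes. A short illustration (a string stands in for the BatchJobExitReason enum):

    from typing import List, Optional, Tuple, Union

    JobExit = Union[int, Tuple[int, Optional[str]], None]

    codes: List[JobExit] = [
        0,                # plain exit code
        (137, "KILLED"),  # exit code paired with a Toil exit reason
        None,             # job not yet in a terminal state
    ]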
@@ -111,7 +154,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 exit_codes.append(self._get_job_return_code(status))
             return exit_codes

-        def getJobExitCode(self, batchJobID: str) -> int:
+        def getJobExitCode(self, batchJobID: str) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
             """
             Get job exit code for given batch job ID.
             :param batchJobID: string of the form "<job>[.<task>]".
@@ -138,18 +181,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             status_dict = self._getJobDetailsFromScontrol(job_id_list)
             return status_dict

-        def _get_job_return_code(self, status: tuple) -> list:
+        def _get_job_return_code(self, status: tuple) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
             """
+            Given a Slurm return code, status pair, summarize them into a Toil return code, exit reason pair.
+
+            The return code may have already been OR'd with the 128-offset
+            Slurm-reported signal.
+
+            Slurm will report return codes of 0 even if jobs time out instead
+            of succeeding:
+
+                2093597|TIMEOUT|0:0
+                2093597.batch|CANCELLED|0:15
+
+            So we guarantee here that, if the Slurm status string is not a
+            successful one as defined in
+            <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>, we
+            will not return a successful return code.
+
             Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
-            :param status: tuple containing the job's state and it's return code.
-            :return: the job's return code if it's completed, otherwise None.
+            :param status: tuple containing the job's state and it's return code from Slurm.
+            :return: the job's return code for Toil if it's completed, otherwise None.
             """
             state, rc = status
-            # If job is in a running state, set return code to None to indicate we don't have
-            # an update.
-            if state in ('PENDING', 'RUNNING', 'CONFIGURING', 'COMPLETING', 'RESIZING', 'SUSPENDED'):
-                rc = None
-            return rc
+
+            if state not in TERMINAL_STATES:
+                # Don't treat the job as exited yet
+                return None
+
+            exit_reason = TERMINAL_STATES[state]
+
+            if exit_reason == BatchJobExitReason.FINISHED:
+                # The only state that should produce a 0 ever is COMPLETED. So
+                # if the job is COMPLETED and the exit reason is thus FINISHED,
+                # pass along the code it has.
+                return (rc, exit_reason)
+
+            if rc == 0:
+                # The job claims to be in a state other than COMPLETED, but
+                # also to have not encountered a problem. Say the exit status
+                # is unavailable.
+                return (EXIT_STATUS_UNAVAILABLE_VALUE, exit_reason)
+
+            # If the code is nonzero, pass it along.
+            return (rc, exit_reason)
+
+        def _canonicalize_state(self, state: str) -> str:
+            """
+            Turn a state string form SLURM into just the state token like "CANCELED".
+            """
+
+            # Slurm will sometimes send something like "CANCELED by 30065" in
+            # the state column for some reason.
+
+            state_token = state
+
+            if " " in state_token:
+                state_token = state.split(" ", 1)[0]
+
+            if state_token not in TERMINAL_STATES and state_token not in NONTERMINAL_STATES:
+                raise RuntimeError("Toil job in unimplemented Slurm state " + state)
+
+            return state_token

         def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
             """
@@ -178,6 +271,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 if len(values) < 3:
                     continue
                 job_id_raw, state, exitcode = values
+                state = self._canonicalize_state(state)
                 logger.debug("%s state of job %s is %s", args[0], job_id_raw, state)
                 # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
                 job_id_parts = job_id_raw.split(".")
@@ -252,6 +346,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 if job_id not in job_id_list:
                     continue
                 state = job['JobState']
+                state = self._canonicalize_state(state)
                 logger.debug("%s state of job %s is %s", args[0], job_id, state)
                 try:
                     exitcode = job['ExitCode']
@@ -283,8 +378,26 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                           job_environment: Optional[Dict[str, str]],
                           gpus: Optional[int]) -> List[str]:

-            # Returns the sbatch command line before the script to run
+            """
+            Returns the sbatch command line to run to queue the job.
+            """
+
+            # Start by naming the job
             sbatch_line = ['sbatch', '-J', f'toil_job_{jobID}_{jobName}']
+
+            # Make sure the job gets a signal before it disappears so that e.g.
+            # container cleanup finally blocks can run. Ask for SIGINT so we
+            # can get the default Python KeyboardInterrupt which third-party
+            # code is likely to plan for. Make sure to send it to the batch
+            # shell process with "B:", not to all the srun steps it launches
+            # (because there shouldn't be any). We cunningly replaced the batch
+            # shell process with the Toil worker process, so Toil should be
+            # able to get the signal.
+            #
+            # TODO: Add a way to detect when the job failed because it
+            # responded to this signal and use the right exit reason for it.
+            sbatch_line.append("--signal=B:INT@30")
+
             if gpus:
                 sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:]
             environment = {}
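
On the worker side, SIGINT surfaces in Python as KeyboardInterrupt, which is why that signal was chosen. A hedged sketch of what a job body can rely on under --signal=B:INT@30 (the job body here is a placeholder):

    import sys
    import time

    def run_job() -> None:
        try:
            time.sleep(60)  # placeholder for the real job body
        except KeyboardInterrupt:
            # Slurm delivers SIGINT 30 seconds before the job's end time;
            # Python raises it as KeyboardInterrupt, so finally blocks and
            # context managers (e.g. container cleanup) unwind before the
            # job is killed outright.
            sys.exit(1)

    if __name__ == "__main__":
        run_job()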
toil/batchSystems/torque.py CHANGED
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
 class TorqueBatchSystem(AbstractGridEngineBatchSystem):

     # class-specific Worker
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
         def __init__(
             self, newJobsQueue, updatedJobsQueue, killQueue, killedJobsQueue, boss
         ):
toil/bus.py CHANGED
@@ -20,7 +20,7 @@ functions to "handle" different things happening. Over time, it has become very
 brittle: exactly the right handling functions need to be called in exactly the
 right order, or it gets confused and does the wrong thing.

-The MessageBus is meant to let the leader avoid this by more losely coupling
+The MessageBus is meant to let the leader avoid this by more loosely coupling
 its components together, by having them communicate by sending messages instead
 of by calling functions.
@@ -87,6 +87,43 @@ from pubsub.core.topicutils import ALL_TOPICS

 logger = logging.getLogger( __name__ )

+# We define some ways to talk about jobs.
+
+class Names(NamedTuple):
+    """
+    Stores all the kinds of name a job can have.
+    """
+    # Name of the kind of job this is
+    job_name: str
+    # Name of this particular work unit
+    unit_name: str
+    # Human-readable name for the job
+    display_name: str
+    # What the job prints as, used for stats-and-logging log management
+    stats_name: str
+    # Job store ID of the job for the work unit
+    job_store_id: str
+
+def get_job_kind(names: Names) -> str:
+    """
+    Return an identifying string for the job.
+
+    The result may contain spaces.
+
+    Returns: Either the unit name, job name, or display name, which identifies
+             the kind of job it is to toil.
+             Otherwise "Unknown Job" in case no identifier is available
+    """
+    if names.unit_name:
+        return names.unit_name
+    elif names.job_name:
+        return names.job_name
+    elif names.display_name:
+        return names.display_name
+    else:
+        return "Unknown Job"
+
+
 # We define a bunch of named tuple message types.
 # These all need to be plain data: only hold ints, strings, etc.
@@ -648,6 +685,7 @@ class JobStatus:

     def __repr__(self) -> str:
         return json.dumps(self, default= lambda o: o.__dict__, indent=4)
+
 def replay_message_bus(path: str) -> Dict[str, JobStatus]:
     """
     Replay all the messages and work out what they mean for jobs.
@@ -703,12 +741,16 @@ def replay_message_bus(path: str) -> Dict[str, JobStatus]:

     return job_statuses

-def gen_message_bus_path() -> str:
+def gen_message_bus_path(tmpdir: Optional[str] = None) -> str:
     """
     Return a file path in tmp to store the message bus at.
     Calling function is responsible for cleaning the generated file.
+
+    The tmpdir argument will override the directory that the
+    message bus will be made in. If not provided, the standard tempfile
+    order will be used.
     """
-    fd, path = tempfile.mkstemp()
+    fd, path = tempfile.mkstemp(dir=tmpdir)
     os.close(fd)
     return path
 #TODO Might want to clean up the tmpfile at some point after running the workflow
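
A brief usage sketch of the new tmpdir parameter; as the docstring says, the caller owns and must delete the file:

    import os
    import tempfile

    from toil.bus import gen_message_bus_path

    scratch = tempfile.mkdtemp()          # any existing directory works
    path = gen_message_bus_path(scratch)  # bus file is created inside scratch
    assert os.path.dirname(path) == scratch
    os.unlink(path)                       # caller cleans up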