toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104):
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +41 -17
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +4 -5
  7. toil/batchSystems/gridengine.py +1 -1
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +25 -11
  10. toil/batchSystems/local_support.py +3 -3
  11. toil/batchSystems/lsf.py +9 -9
  12. toil/batchSystems/mesos/batchSystem.py +4 -4
  13. toil/batchSystems/mesos/executor.py +3 -2
  14. toil/batchSystems/options.py +9 -0
  15. toil/batchSystems/singleMachine.py +11 -10
  16. toil/batchSystems/slurm.py +129 -16
  17. toil/batchSystems/torque.py +1 -1
  18. toil/bus.py +45 -3
  19. toil/common.py +56 -31
  20. toil/cwl/cwltoil.py +442 -371
  21. toil/deferred.py +1 -1
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/abstractFileStore.py +69 -20
  24. toil/fileStores/cachingFileStore.py +6 -22
  25. toil/fileStores/nonCachingFileStore.py +6 -15
  26. toil/job.py +270 -86
  27. toil/jobStores/abstractJobStore.py +37 -31
  28. toil/jobStores/aws/jobStore.py +280 -218
  29. toil/jobStores/aws/utils.py +60 -31
  30. toil/jobStores/conftest.py +2 -2
  31. toil/jobStores/fileJobStore.py +3 -3
  32. toil/jobStores/googleJobStore.py +3 -4
  33. toil/leader.py +89 -38
  34. toil/lib/aws/__init__.py +26 -10
  35. toil/lib/aws/iam.py +2 -2
  36. toil/lib/aws/session.py +62 -22
  37. toil/lib/aws/utils.py +73 -37
  38. toil/lib/conversions.py +24 -1
  39. toil/lib/ec2.py +118 -69
  40. toil/lib/expando.py +1 -1
  41. toil/lib/generatedEC2Lists.py +8 -8
  42. toil/lib/io.py +42 -4
  43. toil/lib/misc.py +1 -3
  44. toil/lib/resources.py +57 -16
  45. toil/lib/retry.py +12 -5
  46. toil/lib/threading.py +29 -14
  47. toil/lib/throttle.py +1 -1
  48. toil/options/common.py +31 -30
  49. toil/options/wdl.py +5 -0
  50. toil/provisioners/__init__.py +9 -3
  51. toil/provisioners/abstractProvisioner.py +12 -2
  52. toil/provisioners/aws/__init__.py +20 -15
  53. toil/provisioners/aws/awsProvisioner.py +406 -329
  54. toil/provisioners/gceProvisioner.py +2 -2
  55. toil/provisioners/node.py +13 -5
  56. toil/server/app.py +1 -1
  57. toil/statsAndLogging.py +93 -23
  58. toil/test/__init__.py +27 -12
  59. toil/test/batchSystems/batchSystemTest.py +40 -33
  60. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  61. toil/test/batchSystems/test_slurm.py +22 -7
  62. toil/test/cactus/__init__.py +0 -0
  63. toil/test/cactus/test_cactus_integration.py +58 -0
  64. toil/test/cwl/cwlTest.py +245 -236
  65. toil/test/cwl/seqtk_seq.cwl +1 -1
  66. toil/test/docs/scriptsTest.py +11 -14
  67. toil/test/jobStores/jobStoreTest.py +40 -54
  68. toil/test/lib/aws/test_iam.py +2 -2
  69. toil/test/lib/test_ec2.py +1 -1
  70. toil/test/options/__init__.py +13 -0
  71. toil/test/options/options.py +37 -0
  72. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  73. toil/test/provisioners/clusterTest.py +99 -16
  74. toil/test/server/serverTest.py +2 -2
  75. toil/test/src/autoDeploymentTest.py +1 -1
  76. toil/test/src/dockerCheckTest.py +2 -1
  77. toil/test/src/environmentTest.py +125 -0
  78. toil/test/src/fileStoreTest.py +1 -1
  79. toil/test/src/jobDescriptionTest.py +18 -8
  80. toil/test/src/jobTest.py +1 -1
  81. toil/test/src/realtimeLoggerTest.py +4 -0
  82. toil/test/src/workerTest.py +52 -19
  83. toil/test/utils/toilDebugTest.py +62 -4
  84. toil/test/utils/utilsTest.py +23 -21
  85. toil/test/wdl/wdltoil_test.py +49 -21
  86. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  87. toil/toilState.py +68 -9
  88. toil/utils/toilDebugFile.py +1 -1
  89. toil/utils/toilDebugJob.py +153 -26
  90. toil/utils/toilLaunchCluster.py +12 -2
  91. toil/utils/toilRsyncCluster.py +7 -2
  92. toil/utils/toilSshCluster.py +7 -3
  93. toil/utils/toilStats.py +310 -266
  94. toil/utils/toilStatus.py +98 -52
  95. toil/version.py +11 -11
  96. toil/wdl/wdltoil.py +644 -225
  97. toil/worker.py +125 -83
  98. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  99. toil-7.0.0.dist-info/METADATA +158 -0
  100. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
  101. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  102. toil-6.1.0a1.dist-info/METADATA +0 -125
  103. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  104. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
@@ -22,9 +22,10 @@ from typing import Dict, List, Optional, Tuple, Union
22
22
  from toil.batchSystems.abstractBatchSystem import (BatchJobExitReason,
23
23
  UpdatedBatchJobInfo)
24
24
  from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
25
- from toil.bus import ExternalBatchIdMessage
25
+ from toil.bus import ExternalBatchIdMessage, get_job_kind
26
26
  from toil.job import AcceleratorRequirement
27
27
  from toil.lib.misc import CalledProcessErrorStderr
28
+ from toil.lib.retry import old_retry, DEFAULT_DELAYS
28
29
 
29
30
  logger = logging.getLogger(__name__)
30
31
 
@@ -44,26 +45,29 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
44
45
  A partial implementation of BatchSystemSupport for batch systems run on a
45
46
  standard HPC cluster. By default auto-deployment is not implemented.
46
47
  """
48
+ class GridEngineThreadException(Exception):
49
+ pass
47
50
 
48
- class Worker(Thread, metaclass=ABCMeta):
49
-
51
+ class GridEngineThread(Thread, metaclass=ABCMeta):
50
52
  def __init__(self, newJobsQueue: Queue, updatedJobsQueue: Queue, killQueue: Queue, killedJobsQueue: Queue, boss: 'AbstractGridEngineBatchSystem') -> None:
51
53
  """
52
- Abstract worker interface class. All instances are created with five
54
+ Abstract thread interface class. All instances are created with five
53
55
  initial arguments (below). Note the Queue instances passed are empty.
54
56
 
55
57
  :param newJobsQueue: a Queue of new (unsubmitted) jobs
56
58
  :param updatedJobsQueue: a Queue of jobs that have been updated
57
59
  :param killQueue: a Queue of active jobs that need to be killed
58
- :param killedJobsQueue: Queue of killed jobs for this worker
60
+ :param killedJobsQueue: Queue of killed jobs for this thread
59
61
  :param boss: the AbstractGridEngineBatchSystem instance that
60
- controls this AbstractGridEngineWorker
62
+ controls this GridEngineThread
61
63
 
62
64
  """
63
65
  Thread.__init__(self)
64
66
  self.boss = boss
65
67
  self.boss.config.statePollingWait = \
66
68
  self.boss.config.statePollingWait or self.boss.getWaitDuration()
69
+ self.boss.config.state_polling_timeout = \
70
+ self.boss.config.state_polling_timeout or self.boss.config.statePollingWait * 10
67
71
  self.newJobsQueue = newJobsQueue
68
72
  self.updatedJobsQueue = updatedJobsQueue
69
73
  self.killQueue = killQueue
@@ -74,6 +78,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
74
78
  self.batchJobIDs: Dict[int, str] = dict()
75
79
  self._checkOnJobsCache = None
76
80
  self._checkOnJobsTimestamp = None
81
+ self.exception = None
77
82
 
78
83
  def getBatchSystemID(self, jobID: int) -> str:
79
84
  """
@@ -107,7 +112,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
107
112
  """
108
113
  Create a new job with the given attributes.
109
114
 
110
- Implementation-specific; called by AbstractGridEngineWorker.run()
115
+ Implementation-specific; called by GridEngineThread.run()
111
116
  """
112
117
  activity = False
113
118
  # Load new job id if present:
@@ -143,7 +148,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
143
148
 
144
149
  def killJobs(self):
145
150
  """
146
- Kill any running jobs within worker
151
+ Kill any running jobs within thread
147
152
  """
148
153
  killList = list()
149
154
  while True:
@@ -175,7 +180,8 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
175
180
  while killList:
176
181
  for jobID in list(killList):
177
182
  batchJobID = self.getBatchSystemID(jobID)
178
- if self.boss.with_retries(self.getJobExitCode, batchJobID) is not None:
183
+ exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID)
184
+ if exit_code is not None:
179
185
  logger.debug('Adding jobID %s to killedJobsQueue', jobID)
180
186
  self.killedJobsQueue.put(jobID)
181
187
  killList.remove(jobID)
@@ -225,23 +231,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
225
231
  return activity
226
232
 
227
233
  def _handle_job_status(
228
- self, job_id: int, status: Union[int, None], activity: bool
234
+ self, job_id: int, status: Union[int, Tuple[int, Optional[BatchJobExitReason]], None], activity: bool
229
235
  ) -> bool:
230
236
  """
231
237
  Helper method for checkOnJobs to handle job statuses
232
238
  """
233
239
  if status is not None:
240
+ if isinstance(status, int):
241
+ code = status
242
+ reason = None
243
+ else:
244
+ code, reason = status
234
245
  self.updatedJobsQueue.put(
235
246
  UpdatedBatchJobInfo(
236
- jobID=job_id, exitStatus=status, exitReason=None, wallTime=None
237
- )
238
- )
239
- self.forgetJob(job_id)
240
- return True
241
- if status is not None and isinstance(status, BatchJobExitReason):
242
- self.updatedJobsQueue.put(
243
- UpdatedBatchJobInfo(
244
- jobID=job_id, exitStatus=1, exitReason=status, wallTime=None
247
+ jobID=job_id, exitStatus=code, exitReason=reason, wallTime=None
245
248
  )
246
249
  )
247
250
  self.forgetJob(job_id)
@@ -276,14 +279,17 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
276
279
  while self._runStep():
277
280
  pass
278
281
  except Exception as ex:
279
- logger.error("GridEngine like batch system failure", exc_info=ex)
280
- raise
282
+ self.exception = ex
283
+ logger.error("GridEngine like batch system failure: %s", ex)
284
+ # don't raise exception as is_alive will still be set to false,
285
+ # signalling exception in the thread as we expect the thread to
286
+ # always be running for the duration of the workflow
281
287
 
282
- def coalesce_job_exit_codes(self, batch_job_id_list: list) -> list:
288
+ def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
283
289
  """
284
- Returns exit codes for a list of jobs.
290
+ Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
285
291
 
286
- Called by AbstractGridEngineWorker.checkOnJobs().
292
+ Called by GridEngineThread.checkOnJobs().
287
293
 
288
294
  This is an optional part of the interface. It should raise
289
295
  NotImplementedError if not actually implemented for a particular
@@ -344,23 +350,26 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
344
350
  def killJob(self, jobID):
345
351
  """
346
352
  Kill specific job with the Toil job ID. Implementation-specific; called
347
- by AbstractGridEngineWorker.killJobs()
353
+ by GridEngineThread.killJobs()
348
354
 
349
355
  :param string jobID: Toil job ID
350
356
  """
351
357
  raise NotImplementedError()
352
358
 
353
359
  @abstractmethod
354
- def getJobExitCode(self, batchJobID):
360
+ def getJobExitCode(self, batchJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
355
361
  """
356
- Returns job exit code or an instance of abstractBatchSystem.BatchJobExitReason.
357
- if something else happened other than the job exiting.
358
- Implementation-specific; called by AbstractGridEngineWorker.checkOnJobs()
362
+ Returns job exit code and possibly an instance of abstractBatchSystem.BatchJobExitReason.
359
363
 
360
- :param string batchjobID: batch system job ID
364
+ Returns None if the job is still running.
361
365
 
362
- :rtype: int|toil.batchSystems.abstractBatchSystem.BatchJobExitReason: exit code int
363
- or BatchJobExitReason if something else happened other than job exiting.
366
+ If the job is not running but the exit code is not available, it
367
+ will be EXIT_STATUS_UNAVAILABLE_VALUE. Implementation-specific;
368
+ called by GridEngineThread.checkOnJobs().
369
+
370
+ The exit code will only be 0 if the job affirmatively succeeded.
371
+
372
+ :param string batchjobID: batch system job ID
364
373
  """
365
374
  raise NotImplementedError()
366
375
 
@@ -375,24 +384,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
375
384
  self.updatedJobsQueue = Queue()
376
385
  self.killQueue = Queue()
377
386
  self.killedJobsQueue = Queue()
378
- # get the associated worker class here
379
- self.worker = self.Worker(self.newJobsQueue, self.updatedJobsQueue,
380
- self.killQueue, self.killedJobsQueue, self)
381
- self.worker.start()
387
+ # get the associated thread class here
388
+ self.background_thread = self.GridEngineThread(self.newJobsQueue, self.updatedJobsQueue,
389
+ self.killQueue, self.killedJobsQueue, self)
390
+ self.background_thread.start()
382
391
  self._getRunningBatchJobIDsTimestamp = None
383
392
  self._getRunningBatchJobIDsCache = {}
384
393
 
385
- @classmethod
386
- def supportsWorkerCleanup(cls):
387
- return False
388
-
389
394
  @classmethod
390
395
  def supportsAutoDeployment(cls):
391
396
  return False
392
397
 
393
- def issueBatchJob(self, jobDesc, job_environment: Optional[Dict[str, str]] = None):
398
+ def issueBatchJob(self, command: str, jobDesc, job_environment: Optional[Dict[str, str]] = None):
394
399
  # Avoid submitting internal jobs to the batch queue, handle locally
395
- localID = self.handleLocalJob(jobDesc)
400
+ localID = self.handleLocalJob(command, jobDesc)
396
401
  if localID is not None:
397
402
  return localID
398
403
  else:
@@ -406,11 +411,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
406
411
  gpus = accelerator['count']
407
412
  else:
408
413
  gpus = jobDesc.accelerators
409
-
410
- self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, jobDesc.command, jobDesc.get_job_kind(),
414
+
415
+ self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, command, get_job_kind(jobDesc.get_names()),
411
416
  job_environment, gpus))
412
- logger.debug("Issued the job command: %s with job id: %s and job name %s", jobDesc.command, str(jobID),
413
- jobDesc.get_job_kind())
417
+ logger.debug("Issued the job command: %s with job id: %s and job name %s", command, str(jobID),
418
+ get_job_kind(jobDesc.get_names()))
414
419
  return jobID
415
420
 
416
421
  def killBatchJobs(self, jobIDs):
@@ -424,7 +429,12 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
424
429
  for jobID in jobIDs:
425
430
  self.killQueue.put(jobID)
426
431
  while jobIDs:
427
- killedJobId = self.killedJobsQueue.get()
432
+ try:
433
+ killedJobId = self.killedJobsQueue.get(timeout=10)
434
+ except Empty:
435
+ if not self.background_thread.is_alive():
436
+ raise self.GridEngineThreadException("Grid engine thread failed unexpectedly") from self.background_thread.exception
437
+ continue
428
438
  if killedJobId is None:
429
439
  break
430
440
  jobIDs.remove(killedJobId)
@@ -456,7 +466,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
456
466
  self.config.statePollingWait):
457
467
  batchIds = self._getRunningBatchJobIDsCache
458
468
  else:
459
- batchIds = self.with_retries(self.worker.getRunningJobIDs)
469
+ batchIds = self.with_retries(self.background_thread.getRunningJobIDs)
460
470
  self._getRunningBatchJobIDsCache = batchIds
461
471
  self._getRunningBatchJobIDsTimestamp = datetime.now()
462
472
  batchIds.update(self.getRunningLocalJobIDs())
@@ -464,6 +474,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
464
474
 
465
475
  def getUpdatedBatchJob(self, maxWait):
466
476
  local_tuple = self.getUpdatedLocalJob(0)
477
+
478
+ if not self.background_thread.is_alive():
479
+ # kill remaining jobs on the thread
480
+ self.background_thread.killJobs()
481
+ raise self.GridEngineThreadException("Unexpected GridEngineThread failure") from self.background_thread.exception
467
482
  if local_tuple:
468
483
  return local_tuple
469
484
  else:
@@ -477,14 +492,14 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
477
492
 
478
493
  def shutdown(self) -> None:
479
494
  """
480
- Signals worker to shutdown (via sentinel) then cleanly joins the thread
495
+ Signals thread to shutdown (via sentinel) then cleanly joins the thread
481
496
  """
482
497
  self.shutdownLocal()
483
498
  newJobsQueue = self.newJobsQueue
484
499
  self.newJobsQueue = None
485
500
 
486
501
  newJobsQueue.put(None)
487
- self.worker.join()
502
+ self.background_thread.join()
488
503
 
489
504
  def setEnv(self, name, value=None):
490
505
  if value and ',' in value:
@@ -503,21 +518,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
503
518
 
504
519
  def with_retries(self, operation, *args, **kwargs):
505
520
  """
506
- Call operation with args and kwargs. If one of the calls to an SGE
507
- command fails, sleep and try again for a set number of times.
521
+ Call operation with args and kwargs. If one of the calls to a
522
+ command fails, sleep and try again.
508
523
  """
509
- maxTries = 3
510
- tries = 0
511
- while True:
512
- tries += 1
513
- try:
514
- return operation(*args, **kwargs)
515
- except CalledProcessErrorStderr as err:
516
- if tries < maxTries:
517
- logger.error("Will retry errored operation %s, code %d: %s",
518
- operation.__name__, err.returncode, err.stderr)
519
- time.sleep(self.config.statePollingWait)
520
- else:
521
- logger.error("Failed operation %s, code %d: %s",
524
+ for attempt in old_retry(
525
+ # Don't retry more often than the state polling wait.
526
+ delays=[max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS],
527
+ timeout=self.config.state_polling_timeout,
528
+ predicate=lambda e: isinstance(e, CalledProcessErrorStderr)
529
+ ):
530
+ with attempt:
531
+ try:
532
+ return operation(*args, **kwargs)
533
+ except CalledProcessErrorStderr as err:
534
+ logger.error("Errored operation %s, code %d: %s",
522
535
  operation.__name__, err.returncode, err.stderr)
536
+ # Raise up to the retry logic, which will retry until timeout
523
537
  raise err
@@ -36,7 +36,7 @@ import uuid
36
36
  from argparse import ArgumentParser, _ArgumentGroup
37
37
  from typing import Any, Dict, Iterator, List, Optional, Set, Union
38
38
 
39
- from boto.exception import BotoServerError
39
+ from botocore.exceptions import ClientError
40
40
 
41
41
  from toil import applianceSelf
42
42
  from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
@@ -156,9 +156,9 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
156
156
  'AWS Batch can only provide nvidia gpu accelerators.'
157
157
  ])
158
158
 
159
- def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
159
+ def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
160
160
  # Try the job as local
161
- local_id = self.handleLocalJob(job_desc)
161
+ local_id = self.handleLocalJob(command, job_desc)
162
162
  if local_id is not None:
163
163
  # It is a local job
164
164
  return local_id
@@ -184,7 +184,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
184
184
  environment.update(job_environment)
185
185
 
186
186
  # Make a command to run it in the executor
187
- command_list = pack_job(job_desc, self.user_script)
187
+ command_list = pack_job(command, self.user_script)
188
188
 
189
189
  # Compose a job spec to submit
190
190
  job_spec = {
@@ -376,7 +376,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
376
376
  # Get rid of the job definition we are using if we can.
377
377
  self._destroy_job_definition()
378
378
 
379
- @retry(errors=[BotoServerError])
379
+ @retry(errors=[ClientError])
380
380
  def _try_terminate(self, aws_id: str) -> None:
381
381
  """
382
382
  Internal function. Should not be called outside this class.
@@ -392,7 +392,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
392
392
  # Kill the AWS Batch job
393
393
  self.client.terminate_job(jobId=aws_id, reason='Killed by Toil')
394
394
 
395
- @retry(errors=[BotoServerError])
395
+ @retry(errors=[ClientError])
396
396
  def _wait_until_stopped(self, aws_id: str) -> None:
397
397
  """
398
398
  Internal function. Should not be called outside this class.
@@ -418,7 +418,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
418
418
  logger.info('Waiting for killed job %s to stop', self.aws_id_to_bs_id.get(aws_id, aws_id))
419
419
  time.sleep(2)
420
420
 
421
- @retry(errors=[BotoServerError])
421
+ @retry(errors=[ClientError])
422
422
  def _get_or_create_job_definition(self) -> str:
423
423
  """
424
424
  Internal function. Should not be called outside this class.
@@ -482,7 +482,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
482
482
 
483
483
  return self.job_definition
484
484
 
485
- @retry(errors=[BotoServerError])
485
+ @retry(errors=[ClientError])
486
486
  def _destroy_job_definition(self) -> None:
487
487
  """
488
488
  Internal function. Should not be called outside this class.
@@ -69,8 +69,13 @@ class WorkerCleanupContext:
69
69
 
70
70
  def __enter__(self) -> None:
71
71
  # Set up an arena so we know who is the last worker to leave
72
- self.arena = LastProcessStandingArena(Toil.get_toil_coordination_dir(self.workerCleanupInfo.work_dir, self.workerCleanupInfo.coordination_dir),
73
- self.workerCleanupInfo.workflow_id + '-cleanup')
72
+ self.arena = LastProcessStandingArena(
73
+ Toil.get_toil_coordination_dir(
74
+ self.workerCleanupInfo.work_dir,
75
+ self.workerCleanupInfo.coordination_dir
76
+ ),
77
+ Toil.get_workflow_path_component(self.workerCleanupInfo.workflow_id) + "-cleanup"
78
+ )
74
79
  logger.debug('Entering cleanup arena')
75
80
  self.arena.enter()
76
81
  logger.debug('Cleanup arena entered')
@@ -90,4 +95,3 @@ class WorkerCleanupContext:
90
95
  # Now the coordination_dir is allowed to no longer exist on the node.
91
96
  logger.debug('Cleanup arena left')
92
97
 
93
-
@@ -25,18 +25,17 @@ import sys
25
25
  from typing import Any, Dict, List, Optional
26
26
 
27
27
  from toil.batchSystems.abstractBatchSystem import EXIT_STATUS_UNAVAILABLE_VALUE
28
- from toil.job import JobDescription
29
28
  from toil.resource import Resource
30
29
  from toil.statsAndLogging import configure_root_logger, set_log_level
31
30
 
32
31
  logger = logging.getLogger(__name__)
33
32
 
34
33
 
35
- def pack_job(job_desc: JobDescription, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]:
34
+ def pack_job(command: str, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]:
36
35
  """
37
- Create a command that, when run, will execute the given job.
36
+ Create a command that runs the given command in an environment.
38
37
 
39
- :param job_desc: Job description for the job to run.
38
+ :param command: Worker command to run to run the job.
40
39
  :param user_script: User script that will be loaded before the job is run.
41
40
  :param environment: Environment variable dict that will be applied before
42
41
  the job is run.
@@ -46,7 +45,7 @@ def pack_job(job_desc: JobDescription, user_script: Optional[Resource] = None, e
46
45
  """
47
46
  # Make a job dict to send to the executor.
48
47
  # TODO: Factor out executor setup from here and Kubernetes and TES
49
- job: Dict[str, Any] = {"command": job_desc.command}
48
+ job: Dict[str, Any] = {"command": command}
50
49
  if user_script is not None:
51
50
  # If there's a user script resource be sure to send it along
52
51
  job['userScript'] = user_script
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
28
28
 
29
29
  class GridEngineBatchSystem(AbstractGridEngineBatchSystem):
30
30
 
31
- class Worker(AbstractGridEngineBatchSystem.Worker):
31
+ class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
32
32
  """
33
33
  Grid Engine-specific AbstractGridEngineWorker methods
34
34
  """
@@ -48,7 +48,7 @@ schedd_lock = Lock()
48
48
  class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
49
49
  # When using HTCondor, the Schedd handles scheduling
50
50
 
51
- class Worker(AbstractGridEngineBatchSystem.Worker):
51
+ class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
52
52
 
53
53
  # Override the createJobs method so that we can use htcondor.Submit objects
54
54
  # and so that we can get disk allocation requests and ceil the CPU request.
@@ -387,9 +387,9 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
387
387
  return '"' + ' '.join(env_items) + '"'
388
388
 
389
389
  # Override the issueBatchJob method so HTCondor can be given the disk request
390
- def issueBatchJob(self, jobNode, job_environment: Optional[Dict[str, str]] = None):
390
+ def issueBatchJob(self, command: str, jobNode, job_environment: Optional[Dict[str, str]] = None):
391
391
  # Avoid submitting internal jobs to the batch queue, handle locally
392
- localID = self.handleLocalJob(jobNode)
392
+ localID = self.handleLocalJob(command, jobNode)
393
393
  if localID is not None:
394
394
  return localID
395
395
  else:
@@ -398,7 +398,7 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
398
398
  self.currentJobs.add(jobID)
399
399
 
400
400
  # Construct our style of job tuple
401
- self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, jobNode.command,
401
+ self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, command,
402
402
  job_environment or {}, jobNode.accelerators))
403
- logger.debug("Issued the job command: %s with job id: %s ", jobNode.command, str(jobID))
403
+ logger.debug("Issued the job command: %s with job id: %s ", command, str(jobID))
404
404
  return jobID
@@ -47,6 +47,8 @@ from typing import (Any,
47
47
  cast,
48
48
  overload)
49
49
 
50
+ from toil.lib.conversions import opt_strtobool
51
+
50
52
  if sys.version_info < (3, 10):
51
53
  from typing_extensions import ParamSpec
52
54
  else:
@@ -83,7 +85,7 @@ from kubernetes.client import (BatchV1Api,
83
85
  V1SecretVolumeSource,
84
86
  V1Toleration,
85
87
  V1Volume,
86
- V1VolumeMount)
88
+ V1VolumeMount, V1SecurityContext)
87
89
  from kubernetes.client.api_client import ApiClient
88
90
  from kubernetes.client.exceptions import ApiException
89
91
  from kubernetes.config.config_exception import ConfigException
@@ -758,6 +760,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
758
760
 
759
761
  def _create_pod_spec(
760
762
  self,
763
+ command: str,
761
764
  job_desc: JobDescription,
762
765
  job_environment: Optional[Dict[str, str]] = None
763
766
  ) -> V1PodSpec:
@@ -770,7 +773,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
770
773
  environment.update(job_environment)
771
774
 
772
775
  # Make a command to run it in the executor
773
- command_list = pack_job(job_desc, self.user_script, environment=environment)
776
+ command_list = pack_job(command, self.user_script, environment=environment)
774
777
 
775
778
  # The Kubernetes API makes sense only in terms of the YAML format. Objects
776
779
  # represent sections of the YAML files. Except from our point of view, all
@@ -877,14 +880,20 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
877
880
 
878
881
  # Make a container definition
879
882
  container = V1Container(command=command_list,
880
- image=self.docker_image,
881
- name="runner-container",
882
- resources=resources,
883
- volume_mounts=mounts)
883
+ image=self.docker_image,
884
+ name="runner-container",
885
+ resources=resources,
886
+ volume_mounts=mounts)
887
+
888
+ # In case security context rules are not allowed to be set, we only apply
889
+ # a security context at all if we need to turn on privileged mode.
890
+ if self.config.kubernetes_privileged:
891
+ container.security_context = V1SecurityContext(privileged=self.config.kubernetes_privileged)
892
+
884
893
  # Wrap the container in a spec
885
894
  pod_spec = V1PodSpec(containers=[container],
886
- volumes=volumes,
887
- restart_policy="Never")
895
+ volumes=volumes,
896
+ restart_policy="Never")
888
897
  # Tell the spec where to land
889
898
  placement.apply(pod_spec)
890
899
 
@@ -1005,9 +1014,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1005
1014
  self._release_acquired_resources(resources, notify=resource_notify)
1006
1015
  del self._acquired_resources[job_name]
1007
1016
 
1008
- def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
1017
+ def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
1009
1018
  # Try the job as local
1010
- localID = self.handleLocalJob(job_desc)
1019
+ localID = self.handleLocalJob(command, job_desc)
1011
1020
  if localID is not None:
1012
1021
  # It is a local job
1013
1022
  return localID
@@ -1018,7 +1027,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1018
1027
  self.check_resource_request(job_desc)
1019
1028
 
1020
1029
  # Make a pod that describes running the job
1021
- pod_spec = self._create_pod_spec(job_desc, job_environment=job_environment)
1030
+ pod_spec = self._create_pod_spec(command, job_desc, job_environment=job_environment)
1022
1031
 
1023
1032
  # Make a batch system scope job ID
1024
1033
  job_id = self.getNextJobID()
@@ -1879,6 +1888,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1879
1888
  parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float,
1880
1889
  help="Seconds to wait for a scheduled Kubernetes pod to start running. "
1881
1890
  "(default: %(default)s)")
1891
+ parser.add_argument("--kubernetesPrivileged", dest="kubernetes_privileged", default=False, env_var="TOIL_KUBERNETES_PRIVILEGED", type=opt_strtobool,
1892
+ help="Whether to ask worker pods to run in privileged mode. This should be used to access "
1893
+ "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
1894
+ "this is set to True. (default: %(default)s)")
1882
1895
 
1883
1896
  OptionType = TypeVar('OptionType')
1884
1897
  @classmethod
@@ -1887,4 +1900,5 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1887
1900
  setOption("kubernetes_owner")
1888
1901
  setOption("kubernetes_service_account",)
1889
1902
  setOption("kubernetes_pod_timeout")
1903
+ setOption("kubernetes_privileged")
1890
1904
 
@@ -34,9 +34,9 @@ class BatchSystemLocalSupport(BatchSystemSupport):
34
34
  config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
35
35
  )
36
36
 
37
- def handleLocalJob(self, jobDesc: JobDescription) -> Optional[int]:
37
+ def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int]:
38
38
  """
39
- To be called by issueBatchJobs.
39
+ To be called by issueBatchJob.
40
40
 
41
41
  Returns the jobID if the jobDesc has been submitted to the local queue,
42
42
  otherwise returns None
@@ -50,7 +50,7 @@ class BatchSystemLocalSupport(BatchSystemSupport):
50
50
  # somehow doesn't error whereas just returning the value complains
51
51
  # we're returning an Any. TODO: When singleMachine.py typechecks,
52
52
  # remove all these extra variables.
53
- local_id: int = self.localBatch.issueBatchJob(jobDesc)
53
+ local_id: int = self.localBatch.issueBatchJob(command, jobDesc)
54
54
  return local_id
55
55
  else:
56
56
  return None
toil/batchSystems/lsf.py CHANGED
@@ -25,12 +25,12 @@ import re
25
25
  import subprocess
26
26
  from datetime import datetime
27
27
  from random import randint
28
- from typing import Dict, List, Optional, Union
28
+ from typing import Dict, List, Optional, Tuple, Union
29
29
 
30
30
  from dateutil.parser import parse
31
31
  from dateutil.tz import tzlocal
32
32
 
33
- from toil.batchSystems.abstractBatchSystem import BatchJobExitReason
33
+ from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE
34
34
  from toil.batchSystems.abstractGridEngineBatchSystem import \
35
35
  AbstractGridEngineBatchSystem
36
36
  from toil.batchSystems.lsfHelper import (check_lsf_json_output_supported,
@@ -44,8 +44,8 @@ logger = logging.getLogger(__name__)
44
44
 
45
45
  class LSFBatchSystem(AbstractGridEngineBatchSystem):
46
46
 
47
- class Worker(AbstractGridEngineBatchSystem.Worker):
48
- """LSF specific AbstractGridEngineWorker methods."""
47
+ class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
48
+ """LSF specific GridEngineThread methods."""
49
49
 
50
50
  def getRunningJobIDs(self):
51
51
  times = {}
@@ -161,7 +161,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
161
161
  status_resonse.append(None)
162
162
  return status_resonse
163
163
 
164
- def getJobExitCode(self, lsfJobID):
164
+ def getJobExitCode(self, lsfJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
165
165
  # the task is set as part of the job ID if using getBatchSystemID()
166
166
  if "NOT_SUBMITTED" in lsfJobID:
167
167
  logger.error("bjobs detected job failed to submit")
@@ -186,7 +186,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
186
186
 
187
187
  return self.fallbackGetJobExitCode(job)
188
188
 
189
- def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, None]:
189
+ def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
190
190
  """
191
191
  Helper functions for getJobExitCode and to parse the bjobs status record
192
192
  """
@@ -224,7 +224,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
224
224
  exit_info, job
225
225
  )
226
226
  if "TERM_MEMLIMIT" in exit_reason:
227
- return BatchJobExitReason.MEMLIMIT
227
+ return (exit_code if exit_code != 0 else EXIT_STATUS_UNAVAILABLE_VALUE, BatchJobExitReason.MEMLIMIT)
228
228
  return exit_code
229
229
  if process_status == "RUN":
230
230
  logger.debug(
@@ -237,7 +237,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
237
237
 
238
238
  return self.getJobExitCodeBACCT(job)
239
239
 
240
- def getJobExitCodeBACCT(self,job):
240
+ def getJobExitCodeBACCT(self,job) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
241
241
  # if not found in bjobs, then try bacct (slower than bjobs)
242
242
  logger.debug("bjobs failed to detect job - trying bacct: "
243
243
  "{}".format(job))
@@ -258,7 +258,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
258
258
  "running: {}".format(job))
259
259
  return None
260
260
 
261
- def fallbackGetJobExitCode(self, job):
261
+ def fallbackGetJobExitCode(self, job) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
262
262
  args = ["bjobs", "-l", str(job)]
263
263
  logger.debug(f"Checking job exit code for job via bjobs (fallback): {job}")
264
264
  stdout = call_command(args)