toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +22 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/contained_executor.py +4 -5
  6. toil/batchSystems/gridengine.py +1 -1
  7. toil/batchSystems/htcondor.py +5 -5
  8. toil/batchSystems/kubernetes.py +25 -11
  9. toil/batchSystems/local_support.py +3 -3
  10. toil/batchSystems/lsf.py +2 -2
  11. toil/batchSystems/mesos/batchSystem.py +4 -4
  12. toil/batchSystems/mesos/executor.py +3 -2
  13. toil/batchSystems/options.py +9 -0
  14. toil/batchSystems/singleMachine.py +11 -10
  15. toil/batchSystems/slurm.py +64 -22
  16. toil/batchSystems/torque.py +1 -1
  17. toil/bus.py +7 -3
  18. toil/common.py +36 -13
  19. toil/cwl/cwltoil.py +365 -312
  20. toil/deferred.py +1 -1
  21. toil/fileStores/abstractFileStore.py +17 -17
  22. toil/fileStores/cachingFileStore.py +2 -2
  23. toil/fileStores/nonCachingFileStore.py +1 -1
  24. toil/job.py +228 -60
  25. toil/jobStores/abstractJobStore.py +18 -10
  26. toil/jobStores/aws/jobStore.py +280 -218
  27. toil/jobStores/aws/utils.py +57 -29
  28. toil/jobStores/conftest.py +2 -2
  29. toil/jobStores/fileJobStore.py +2 -2
  30. toil/jobStores/googleJobStore.py +3 -4
  31. toil/leader.py +72 -24
  32. toil/lib/aws/__init__.py +26 -10
  33. toil/lib/aws/iam.py +2 -2
  34. toil/lib/aws/session.py +62 -22
  35. toil/lib/aws/utils.py +73 -37
  36. toil/lib/conversions.py +5 -1
  37. toil/lib/ec2.py +118 -69
  38. toil/lib/expando.py +1 -1
  39. toil/lib/io.py +14 -2
  40. toil/lib/misc.py +1 -3
  41. toil/lib/resources.py +55 -21
  42. toil/lib/retry.py +12 -5
  43. toil/lib/threading.py +2 -2
  44. toil/lib/throttle.py +1 -1
  45. toil/options/common.py +27 -24
  46. toil/provisioners/__init__.py +9 -3
  47. toil/provisioners/abstractProvisioner.py +9 -7
  48. toil/provisioners/aws/__init__.py +20 -15
  49. toil/provisioners/aws/awsProvisioner.py +406 -329
  50. toil/provisioners/gceProvisioner.py +2 -2
  51. toil/provisioners/node.py +13 -5
  52. toil/server/app.py +1 -1
  53. toil/statsAndLogging.py +58 -16
  54. toil/test/__init__.py +27 -12
  55. toil/test/batchSystems/batchSystemTest.py +40 -33
  56. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  57. toil/test/batchSystems/test_slurm.py +1 -1
  58. toil/test/cwl/cwlTest.py +8 -91
  59. toil/test/cwl/seqtk_seq.cwl +1 -1
  60. toil/test/docs/scriptsTest.py +10 -13
  61. toil/test/jobStores/jobStoreTest.py +33 -49
  62. toil/test/lib/aws/test_iam.py +2 -2
  63. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  64. toil/test/provisioners/clusterTest.py +90 -8
  65. toil/test/server/serverTest.py +2 -2
  66. toil/test/src/autoDeploymentTest.py +1 -1
  67. toil/test/src/dockerCheckTest.py +2 -1
  68. toil/test/src/environmentTest.py +125 -0
  69. toil/test/src/fileStoreTest.py +1 -1
  70. toil/test/src/jobDescriptionTest.py +18 -8
  71. toil/test/src/jobTest.py +1 -1
  72. toil/test/src/realtimeLoggerTest.py +4 -0
  73. toil/test/src/workerTest.py +52 -19
  74. toil/test/utils/toilDebugTest.py +61 -3
  75. toil/test/utils/utilsTest.py +20 -18
  76. toil/test/wdl/wdltoil_test.py +24 -71
  77. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  78. toil/toilState.py +68 -9
  79. toil/utils/toilDebugJob.py +153 -26
  80. toil/utils/toilLaunchCluster.py +12 -2
  81. toil/utils/toilRsyncCluster.py +7 -2
  82. toil/utils/toilSshCluster.py +7 -3
  83. toil/utils/toilStats.py +2 -1
  84. toil/utils/toilStatus.py +97 -51
  85. toil/version.py +10 -10
  86. toil/wdl/wdltoil.py +318 -51
  87. toil/worker.py +96 -69
  88. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  89. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
  90. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
  91. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  92. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  93. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/__init__.py CHANGED
@@ -22,7 +22,6 @@ from datetime import datetime
  from typing import TYPE_CHECKING, Optional, Tuple
  
  import requests
- from pytz import timezone
  
  from docker.errors import ImageNotFound
  from toil.lib.memoize import memoize
@@ -210,7 +209,7 @@ def customDockerInitCmd() -> str:
      private docker registries). Any single quotes are escaped and the command cannot contain a
      set of blacklisted chars (newline or tab).
  
-     :returns: The custom commmand, or an empty string is returned if the environment variable is not set.
+     :returns: The custom command, or an empty string is returned if the environment variable is not set.
      """
      command = lookupEnvVar(name='user-defined custom docker init command',
                             envName='TOIL_CUSTOM_DOCKER_INIT_COMMAND',
@@ -440,7 +439,6 @@ def logProcessContext(config: "Config") -> None:
  
  
  try:
-     from boto import provider
      from botocore.credentials import (JSONFileCache,
                                        RefreshableCredentials,
                                        create_credential_resolver)
@@ -477,234 +475,5 @@ try:
          """
          return datetime.strptime(s, datetime_format)
  
- 
-     class BotoCredentialAdapter(provider.Provider):
-         """
-         Boto 2 Adapter to use AWS credentials obtained via Boto 3's credential finding logic.
- 
-         This allows for automatic role assumption
-         respecting the Boto 3 config files, even when parts of the app still use
-         Boto 2.
- 
-         This class also handles caching credentials in multi-process environments
-         to avoid loads of processes swamping the EC2 metadata service.
-         """
- 
-         # TODO: We take kwargs because new boto2 versions have an 'anon'
-         # argument and we want to be future proof
- 
-         def __init__(self, name, access_key=None, secret_key=None,
-                      security_token=None, profile_name=None, **kwargs):
-             """Create a new BotoCredentialAdapter."""
-             # TODO: We take kwargs because new boto2 versions have an 'anon'
-             # argument and we want to be future proof
- 
-             if (name == 'aws' or name is None) and access_key is None and not kwargs.get('anon', False):
-                 # We are on AWS and we don't have credentials passed along and we aren't anonymous.
-                 # We will backend into a boto3 resolver for getting credentials.
-                 # Make sure to enable boto3's own caching, so we can share that
-                 # cache with pure boto3 code elsewhere in Toil.
-                 # Keep synced with toil.lib.aws.session.establish_boto3_session
-                 self._boto3_resolver = create_credential_resolver(Session(profile=profile_name), cache=JSONFileCache())
-             else:
-                 # We will use the normal flow
-                 self._boto3_resolver = None
- 
-             # Pass along all the arguments
-             super().__init__(name, access_key=access_key,
-                              secret_key=secret_key, security_token=security_token,
-                              profile_name=profile_name, **kwargs)
- 
-         def get_credentials(self, access_key=None, secret_key=None, security_token=None, profile_name=None):
-             """
-             Make sure our credential fields are populated.
- 
-             Called by the base class constructor.
-             """
-             if self._boto3_resolver is not None:
-                 # Go get the credentials from the cache, or from boto3 if not cached.
-                 # We need to be eager here; having the default None
-                 # _credential_expiry_time makes the accessors never try to refresh.
-                 self._obtain_credentials_from_cache_or_boto3()
-             else:
-                 # We're not on AWS, or they passed a key, or we're anonymous.
-                 # Use the normal route; our credentials shouldn't expire.
-                 super().get_credentials(access_key=access_key,
-                                         secret_key=secret_key, security_token=security_token,
-                                         profile_name=profile_name)
- 
-         def _populate_keys_from_metadata_server(self):
-             """
-             Hack to catch _credential_expiry_time being too soon and refresh the credentials.
- 
-             This override is misnamed; it's actually the only hook we have to catch
-             _credential_expiry_time being too soon and refresh the credentials. We
-             actually just go back and poke the cache to see if it feels like
-             getting us new credentials.
- 
-             Boto 2 hardcodes a refresh within 5 minutes of expiry:
-             https://github.com/boto/boto/blob/591911db1029f2fbb8ba1842bfcc514159b37b32/boto/provider.py#L247
- 
-             Boto 3 wants to refresh 15 or 10 minutes before expiry:
-             https://github.com/boto/botocore/blob/8d3ea0e61473fba43774eb3c74e1b22995ee7370/botocore/credentials.py#L279
- 
-             So if we ever want to refresh, Boto 3 wants to refresh too.
-             """
-             # This should only happen if we have expiring credentials, which we should only get from boto3
-             if self._boto3_resolver is None:
-                 raise RuntimeError("The Boto3 resolver should not be None.")
- 
-             self._obtain_credentials_from_cache_or_boto3()
- 
-         @retry()
-         def _obtain_credentials_from_boto3(self):
-             """
-             Fill our credential fields from Boto 3.
- 
-             We know the current cached credentials are not good, and that we
-             need to get them from Boto 3. Fill in our credential fields
-             (_access_key, _secret_key, _security_token,
-             _credential_expiry_time) from Boto 3.
-             """
-             # We get a Credentials object
-             # <https://github.com/boto/botocore/blob/8d3ea0e61473fba43774eb3c74e1b22995ee7370/botocore/credentials.py#L227>
-             # or a RefreshableCredentials, or None on failure.
-             creds = self._boto3_resolver.load_credentials()
- 
-             if creds is None:
-                 try:
-                     resolvers = str(self._boto3_resolver.providers)
-                 except:
-                     resolvers = "(Resolvers unavailable)"
-                 raise RuntimeError("Could not obtain AWS credentials from Boto3. Resolvers tried: " + resolvers)
- 
-             # Make sure the credentials actually has some credentials if it is lazy
-             creds.get_frozen_credentials()
- 
-             # Get when the credentials will expire, if ever
-             if isinstance(creds, RefreshableCredentials):
-                 # Credentials may expire.
-                 # Get a naive UTC datetime like boto 2 uses from the boto 3 time.
-                 self._credential_expiry_time = creds._expiry_time.astimezone(timezone('UTC')).replace(tzinfo=None)
-             else:
-                 # Credentials never expire
-                 self._credential_expiry_time = None
- 
-             # Then, atomically get all the credentials bits. They may be newer than we think they are, but never older.
-             frozen = creds.get_frozen_credentials()
- 
-             # Copy them into us
-             self._access_key = frozen.access_key
-             self._secret_key = frozen.secret_key
-             self._security_token = frozen.token
- 
-         def _obtain_credentials_from_cache_or_boto3(self):
-             """
-             Get the cached credentials.
- 
-             Or retrieve them from Boto 3 and cache them
-             (or wait for another cooperating process to do so) if they are missing
-             or not fresh enough.
-             """
-             cache_path = '~/.cache/aws/cached_temporary_credentials'
-             path = os.path.expanduser(cache_path)
-             tmp_path = path + '.tmp'
-             while True:
-                 log.debug('Attempting to read cached credentials from %s.', path)
-                 try:
-                     with open(path) as f:
-                         content = f.read()
-                         if content:
-                             record = content.split('\n')
-                             if len(record) != 4:
-                                 raise RuntimeError("Number of cached credentials is not 4.")
-                             self._access_key = record[0]
-                             self._secret_key = record[1]
-                             self._security_token = record[2]
-                             self._credential_expiry_time = str_to_datetime(record[3])
-                         else:
-                             log.debug('%s is empty. Credentials are not temporary.', path)
-                             self._obtain_credentials_from_boto3()
-                             return
-                 except OSError as e:
-                     if e.errno == errno.ENOENT:
-                         log.debug('Cached credentials are missing.')
-                         dir_path = os.path.dirname(path)
-                         if not os.path.exists(dir_path):
-                             log.debug('Creating parent directory %s', dir_path)
-                             try:
-                                 # A race would be ok at this point
-                                 os.makedirs(dir_path, exist_ok=True)
-                             except OSError as e2:
-                                 if e2.errno == errno.EROFS:
-                                     # Sometimes we don't actually have write access to ~.
-                                     # We may be running in a non-writable Toil container.
-                                     # We should just go get our own credentials
-                                     log.debug('Cannot use the credentials cache because we are working on a read-only filesystem.')
-                                     self._obtain_credentials_from_boto3()
-                                 else:
-                                     raise
-                     else:
-                         raise
-                 else:
-                     if self._credentials_need_refresh():
-                         log.debug('Cached credentials are expired.')
-                     else:
-                         log.debug('Cached credentials exist and are still fresh.')
-                         return
-                 # We get here if credentials are missing or expired
-                 log.debug('Racing to create %s.', tmp_path)
-                 # Only one process, the winner, will succeed
-                 try:
-                     fd = os.open(tmp_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
-                 except OSError as e:
-                     if e.errno == errno.EEXIST:
-                         log.debug('Lost the race to create %s. Waiting on winner to remove it.', tmp_path)
-                         while os.path.exists(tmp_path):
-                             time.sleep(0.1)
-                         log.debug('Winner removed %s. Trying from the top.', tmp_path)
-                     else:
-                         raise
-                 else:
-                     try:
-                         log.debug('Won the race to create %s. Requesting credentials from backend.', tmp_path)
-                         self._obtain_credentials_from_boto3()
-                     except:
-                         os.close(fd)
-                         fd = None
-                         log.debug('Failed to obtain credentials, removing %s.', tmp_path)
-                         # This unblocks the losers.
-                         os.unlink(tmp_path)
-                         # Bail out. It's too likely to happen repeatedly
-                         raise
-                     else:
-                         if self._credential_expiry_time is None:
-                             os.close(fd)
-                             fd = None
-                             log.debug('Credentials are not temporary. Leaving %s empty and renaming it to %s.',
-                                       tmp_path, path)
-                             # No need to actually cache permanent credentials,
-                             # because we know we aren't getting them from the
-                             # metadata server or by assuming a role. Those both
-                             # give temporary credentials.
-                         else:
-                             log.debug('Writing credentials to %s.', tmp_path)
-                             with os.fdopen(fd, 'w') as fh:
-                                 fd = None
-                                 fh.write('\n'.join([
-                                     self._access_key,
-                                     self._secret_key,
-                                     self._security_token,
-                                     datetime_to_str(self._credential_expiry_time)]))
-                         log.debug('Wrote credentials to %s. Renaming to %s.', tmp_path, path)
-                         os.rename(tmp_path, path)
-                         return
-                 finally:
-                     if fd is not None:
-                         os.close(fd)
- 
- 
-     provider.Provider = BotoCredentialAdapter
- 
  except ImportError:
      pass
toil/batchSystems/abstractBatchSystem.py CHANGED
@@ -58,6 +58,13 @@ class BatchJobExitReason(enum.IntEnum):
      """Internal error."""
      MEMLIMIT: int = 6
      """Job hit batch system imposed memory limit."""
+     MISSING: int = 7
+     """Job disappeared from the scheduler without actually stopping, so Toil killed it."""
+     MAXJOBDURATION: int = 8
+     """Job ran longer than --maxJobDuration, so Toil killed it."""
+     PARTITION: int = 9
+     """Job was not able to talk to the leader via the job store, so Toil declared it failed."""
+ 
  
      @classmethod
      def to_string(cls, value: int) -> str:
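Note on the three new exit reasons: they are ordinary members of the BatchJobExitReason IntEnum, so calling code can branch on them directly. A minimal sketch (the describe_exit() helper is hypothetical, not part of Toil):

    from toil.batchSystems.abstractBatchSystem import BatchJobExitReason

    def describe_exit(exit_code: int, reason: int) -> str:
        # Hypothetical helper: render a job's fate for a log line.
        label = BatchJobExitReason.to_string(reason)
        if reason == BatchJobExitReason.MISSING:
            return f"job vanished from the scheduler ({label})"
        if reason == BatchJobExitReason.MAXJOBDURATION:
            return f"job exceeded --maxJobDuration ({label})"
        if reason == BatchJobExitReason.PARTITION:
            return f"job could not reach the job store ({label})"
        return f"job exited with status {exit_code} ({label})"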
@@ -156,17 +163,19 @@ class AbstractBatchSystem(ABC):
      """
  
      @abstractmethod
-     def issueBatchJob(self, jobDesc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+     def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
          """
          Issues a job with the specified command to the batch system and returns
-         a unique jobID.
+         a unique job ID number.
  
-         :param jobDesc: a toil.job.JobDescription
+         :param command: the command to execute somewhere to run the Toil
+             worker process
+         :param job_desc: the JobDescription for the job being run
          :param job_environment: a collection of job-specific environment
-                                 variables to be set on the worker.
+             variables to be set on the worker.
  
-         :return: a unique jobID that can be used to reference the newly issued
-             job
+         :return: a unique job ID number that can be used to reference the newly
+             issued job
          """
          raise NotImplementedError()
  
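This signature change is the heart of the 7.0.0 batch-system API break: the worker command is now passed explicitly instead of being read off the JobDescription. A rough sketch of what the new interface asks of an implementation (MyBatchSystem and its bookkeeping are hypothetical; a real plugin subclasses AbstractBatchSystem and implements the rest of the ABC):

    import os
    import subprocess
    from typing import Dict, Optional

    from toil.job import JobDescription

    class MyBatchSystem:  # hypothetical; a real plugin subclasses AbstractBatchSystem
        def __init__(self) -> None:
            self._next_id = 0
            self._processes: Dict[int, subprocess.Popen] = {}

        def issueBatchJob(self, command: str, job_desc: JobDescription,
                          job_environment: Optional[Dict[str, str]] = None) -> int:
            # In 7.0.0 the worker command arrives as its own argument instead
            # of being read off the JobDescription as in 6.1.0.
            self._next_id += 1
            env = dict(os.environ)
            if job_environment:
                env.update(job_environment)
            self._processes[self._next_id] = subprocess.Popen(command, shell=True, env=env)
            return self._next_id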
@@ -188,20 +197,20 @@ class AbstractBatchSystem(ABC):
          """
          Gets all currently issued jobs
  
-         :return: A list of jobs (as jobIDs) currently issued (may be running, or may be
-             waiting to be run). Despite the result being a list, the ordering should not
-             be depended upon.
+         :return: A list of jobs (as job ID numbers) currently issued (may be
+             running, or may be waiting to be run). Despite the result being a
+             list, the ordering should not be depended upon.
          """
          raise NotImplementedError()
  
      @abstractmethod
      def getRunningBatchJobIDs(self) -> Dict[int, float]:
          """
-         Gets a map of jobs as jobIDs that are currently running (not just waiting)
-         and how long they have been running, in seconds.
+         Gets a map of jobs as job ID numbers that are currently running (not
+         just waiting) and how long they have been running, in seconds.
  
-         :return: dictionary with currently running jobID keys and how many seconds they have
-             been running as the value
+         :return: dictionary with currently running job ID number keys and how
+             many seconds they have been running as the value
          """
          raise NotImplementedError()
  
toil/batchSystems/abstractGridEngineBatchSystem.py CHANGED
@@ -25,6 +25,7 @@ from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
  from toil.bus import ExternalBatchIdMessage, get_job_kind
  from toil.job import AcceleratorRequirement
  from toil.lib.misc import CalledProcessErrorStderr
+ from toil.lib.retry import old_retry, DEFAULT_DELAYS
  
  logger = logging.getLogger(__name__)
  
@@ -44,26 +45,29 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
      A partial implementation of BatchSystemSupport for batch systems run on a
      standard HPC cluster. By default auto-deployment is not implemented.
      """
+     class GridEngineThreadException(Exception):
+         pass
  
-     class Worker(Thread, metaclass=ABCMeta):
- 
+     class GridEngineThread(Thread, metaclass=ABCMeta):
          def __init__(self, newJobsQueue: Queue, updatedJobsQueue: Queue, killQueue: Queue, killedJobsQueue: Queue, boss: 'AbstractGridEngineBatchSystem') -> None:
              """
-             Abstract worker interface class. All instances are created with five
+             Abstract thread interface class. All instances are created with five
              initial arguments (below). Note the Queue instances passed are empty.
  
              :param newJobsQueue: a Queue of new (unsubmitted) jobs
              :param updatedJobsQueue: a Queue of jobs that have been updated
              :param killQueue: a Queue of active jobs that need to be killed
-             :param killedJobsQueue: Queue of killed jobs for this worker
+             :param killedJobsQueue: Queue of killed jobs for this thread
              :param boss: the AbstractGridEngineBatchSystem instance that
-                 controls this AbstractGridEngineWorker
+                 controls this GridEngineThread
  
              """
              Thread.__init__(self)
              self.boss = boss
              self.boss.config.statePollingWait = \
                  self.boss.config.statePollingWait or self.boss.getWaitDuration()
+             self.boss.config.state_polling_timeout = \
+                 self.boss.config.state_polling_timeout or self.boss.config.statePollingWait * 10
              self.newJobsQueue = newJobsQueue
              self.updatedJobsQueue = updatedJobsQueue
              self.killQueue = killQueue
@@ -74,6 +78,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
              self.batchJobIDs: Dict[int, str] = dict()
              self._checkOnJobsCache = None
              self._checkOnJobsTimestamp = None
+             self.exception = None
  
          def getBatchSystemID(self, jobID: int) -> str:
              """
@@ -107,7 +112,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
              """
              Create a new job with the given attributes.
  
-             Implementation-specific; called by AbstractGridEngineWorker.run()
+             Implementation-specific; called by GridEngineThread.run()
              """
              activity = False
              # Load new job id if present:
@@ -143,7 +148,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
  
          def killJobs(self):
              """
-             Kill any running jobs within worker
+             Kill any running jobs within thread
              """
              killList = list()
              while True:
@@ -175,7 +180,8 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
              while killList:
                  for jobID in list(killList):
                      batchJobID = self.getBatchSystemID(jobID)
-                     if self.boss.with_retries(self.getJobExitCode, batchJobID) is not None:
+                     exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID)
+                     if exit_code is not None:
                          logger.debug('Adding jobID %s to killedJobsQueue', jobID)
                          self.killedJobsQueue.put(jobID)
                          killList.remove(jobID)
@@ -273,14 +279,17 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                  while self._runStep():
                      pass
              except Exception as ex:
-                 logger.error("GridEngine like batch system failure", exc_info=ex)
-                 raise
+                 self.exception = ex
+                 logger.error("GridEngine like batch system failure: %s", ex)
+                 # don't raise exception as is_alive will still be set to false,
+                 # signalling exception in the thread as we expect the thread to
+                 # always be running for the duration of the workflow
  
  
          def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
              """
              Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
  
-             Called by AbstractGridEngineWorker.checkOnJobs().
+             Called by GridEngineThread.checkOnJobs().
  
              This is an optional part of the interface. It should raise
              NotImplementedError if not actually implemented for a particular
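The change above swaps "let the thread crash" for "record the exception and let is_alive() go false"; the consumer methods later in this file then re-raise the stored exception as a GridEngineThreadException. The pattern in isolation (a minimal standalone sketch, not Toil's actual classes):

    import threading
    from typing import Optional

    class PollingThread(threading.Thread):
        """Record the failure instead of letting the thread die loudly."""

        def __init__(self) -> None:
            super().__init__()
            self.exception: Optional[Exception] = None

        def run(self) -> None:
            try:
                self.do_work()  # stand-in for the grid engine polling loop
            except Exception as ex:
                # Don't re-raise: is_alive() going false is the signal to the leader.
                self.exception = ex

        def do_work(self) -> None:
            raise RuntimeError("scheduler went away")

    thread = PollingThread()
    thread.start()
    thread.join()
    # Consumer side: before blocking on the thread's queues, check liveness
    # and surface the stored exception with its original traceback chained.
    if not thread.is_alive() and thread.exception is not None:
        raise RuntimeError("background thread failed unexpectedly") from thread.exception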
@@ -341,7 +350,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
          def killJob(self, jobID):
              """
              Kill specific job with the Toil job ID. Implementation-specific; called
-             by AbstractGridEngineWorker.killJobs()
+             by GridEngineThread.killJobs()
  
              :param string jobID: Toil job ID
              """
@@ -356,7 +365,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
  
              If the job is not running but the exit code is not available, it
              will be EXIT_STATUS_UNAVAILABLE_VALUE. Implementation-specific;
-             called by AbstractGridEngineWorker.checkOnJobs().
+             called by GridEngineThread.checkOnJobs().
  
              The exit code will only be 0 if the job affirmatively succeeded.
  
@@ -375,24 +384,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
          self.updatedJobsQueue = Queue()
          self.killQueue = Queue()
          self.killedJobsQueue = Queue()
-         # get the associated worker class here
-         self.worker = self.Worker(self.newJobsQueue, self.updatedJobsQueue,
-                                   self.killQueue, self.killedJobsQueue, self)
-         self.worker.start()
+         # get the associated thread class here
+         self.background_thread = self.GridEngineThread(self.newJobsQueue, self.updatedJobsQueue,
+                                                        self.killQueue, self.killedJobsQueue, self)
+         self.background_thread.start()
          self._getRunningBatchJobIDsTimestamp = None
          self._getRunningBatchJobIDsCache = {}
  
-     @classmethod
-     def supportsWorkerCleanup(cls):
-         return False
- 
      @classmethod
      def supportsAutoDeployment(cls):
          return False
  
-     def issueBatchJob(self, jobDesc, job_environment: Optional[Dict[str, str]] = None):
+     def issueBatchJob(self, command: str, jobDesc, job_environment: Optional[Dict[str, str]] = None):
          # Avoid submitting internal jobs to the batch queue, handle locally
-         localID = self.handleLocalJob(jobDesc)
+         localID = self.handleLocalJob(command, jobDesc)
          if localID is not None:
              return localID
          else:
@@ -406,10 +411,10 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                      gpus = accelerator['count']
          else:
              gpus = jobDesc.accelerators
- 
-         self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, jobDesc.command, get_job_kind(jobDesc.get_names()),
+ 
+         self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, command, get_job_kind(jobDesc.get_names()),
                                 job_environment, gpus))
-         logger.debug("Issued the job command: %s with job id: %s and job name %s", jobDesc.command, str(jobID),
+         logger.debug("Issued the job command: %s with job id: %s and job name %s", command, str(jobID),
                       get_job_kind(jobDesc.get_names()))
          return jobID
  
@@ -424,7 +429,12 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
          for jobID in jobIDs:
              self.killQueue.put(jobID)
          while jobIDs:
-             killedJobId = self.killedJobsQueue.get()
+             try:
+                 killedJobId = self.killedJobsQueue.get(timeout=10)
+             except Empty:
+                 if not self.background_thread.is_alive():
+                     raise self.GridEngineThreadException("Grid engine thread failed unexpectedly") from self.background_thread.exception
+                 continue
              if killedJobId is None:
                  break
              jobIDs.remove(killedJobId)
@@ -456,7 +466,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                                     self.config.statePollingWait):
              batchIds = self._getRunningBatchJobIDsCache
          else:
-             batchIds = self.with_retries(self.worker.getRunningJobIDs)
+             batchIds = self.with_retries(self.background_thread.getRunningJobIDs)
              self._getRunningBatchJobIDsCache = batchIds
              self._getRunningBatchJobIDsTimestamp = datetime.now()
          batchIds.update(self.getRunningLocalJobIDs())
@@ -464,6 +474,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
  
      def getUpdatedBatchJob(self, maxWait):
          local_tuple = self.getUpdatedLocalJob(0)
+ 
+         if not self.background_thread.is_alive():
+             # kill remaining jobs on the thread
+             self.background_thread.killJobs()
+             raise self.GridEngineThreadException("Unexpected GridEngineThread failure") from self.background_thread.exception
          if local_tuple:
              return local_tuple
          else:
@@ -477,14 +492,14 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
  
      def shutdown(self) -> None:
          """
-         Signals worker to shutdown (via sentinel) then cleanly joins the thread
+         Signals thread to shutdown (via sentinel) then cleanly joins the thread
          """
          self.shutdownLocal()
          newJobsQueue = self.newJobsQueue
          self.newJobsQueue = None
  
          newJobsQueue.put(None)
-         self.worker.join()
+         self.background_thread.join()
  
      def setEnv(self, name, value=None):
          if value and ',' in value:
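The shutdown above relies on the classic sentinel idiom: push None onto the work queue, and the thread exits once it drains it. A standalone sketch:

    import threading
    from queue import Queue
    from typing import Optional

    def consume(q: "Queue[Optional[str]]") -> None:
        while True:
            item = q.get()
            if item is None:  # sentinel: no more work will ever arrive
                return
            print("processing", item)

    q: "Queue[Optional[str]]" = Queue()
    t = threading.Thread(target=consume, args=(q,))
    t.start()
    q.put("job-1")
    q.put(None)  # signal shutdown
    t.join()     # returns once the consumer drains the sentinel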
@@ -503,21 +518,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
  
      def with_retries(self, operation, *args, **kwargs):
          """
-         Call operation with args and kwargs. If one of the calls to an SGE
-         command fails, sleep and try again for a set number of times.
+         Call operation with args and kwargs. If one of the calls to a
+         command fails, sleep and try again.
          """
-         maxTries = 3
-         tries = 0
-         while True:
-             tries += 1
-             try:
-                 return operation(*args, **kwargs)
-             except CalledProcessErrorStderr as err:
-                 if tries < maxTries:
-                     logger.error("Will retry errored operation %s, code %d: %s",
-                                  operation.__name__, err.returncode, err.stderr)
-                     time.sleep(self.config.statePollingWait)
-                 else:
-                     logger.error("Failed operation %s, code %d: %s",
+         for attempt in old_retry(
+             # Don't retry more often than the state polling wait.
+             delays=[max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS],
+             timeout=self.config.state_polling_timeout,
+             predicate=lambda e: isinstance(e, CalledProcessErrorStderr)
+         ):
+             with attempt:
+                 try:
+                     return operation(*args, **kwargs)
+                 except CalledProcessErrorStderr as err:
+                     logger.error("Errored operation %s, code %d: %s",
                                   operation.__name__, err.returncode, err.stderr)
+                     # Raise up to the retry logic, which will retry until timeout
                      raise err
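old_retry (from toil.lib.retry) yields context managers: an exception raised inside the with block is swallowed and retried while the predicate matches and the timeout has not elapsed. A simplified re-implementation of that control flow, for readers unfamiliar with the helper (illustrative only, not Toil's actual code; DEFAULT_DELAYS is assumed to be a list of delay values):

    import time
    from typing import Callable, Iterator, List

    class Attempt:
        """Context manager recording whether its block should be retried."""

        def __init__(self, predicate: Callable[[Exception], bool], deadline: float) -> None:
            self.predicate = predicate
            self.deadline = deadline
            self.should_retry = False

        def __enter__(self) -> "Attempt":
            self.should_retry = False
            return self

        def __exit__(self, exc_type, exc, tb) -> bool:
            if isinstance(exc, Exception) and self.predicate(exc) and time.time() < self.deadline:
                self.should_retry = True
                return True   # swallow the exception; the loop will try again
            return False      # success, or a non-retryable / timed-out failure

    def simple_retry(delays: List[float], timeout: float,
                     predicate: Callable[[Exception], bool]) -> Iterator[Attempt]:
        deadline = time.time() + timeout
        attempt = Attempt(predicate, deadline)
        i = 0
        while True:
            yield attempt
            if not attempt.should_retry:
                return
            time.sleep(delays[min(i, len(delays) - 1)])
            i += 1

    # Usage mirrors the diff: return from inside the block on success, and
    # re-raise retryable errors so the attempt can swallow them.
    for attempt in simple_retry(delays=[1, 2, 4], timeout=60,
                                predicate=lambda e: isinstance(e, OSError)):
        with attempt:
            pass  # call the flaky operation here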
toil/batchSystems/awsBatch.py CHANGED
@@ -36,7 +36,7 @@ import uuid
  from argparse import ArgumentParser, _ArgumentGroup
  from typing import Any, Dict, Iterator, List, Optional, Set, Union
  
- from boto.exception import BotoServerError
+ from botocore.exceptions import ClientError
  
  from toil import applianceSelf
  from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
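With boto2 gone, failures from AWS calls are inspected through botocore's ClientError rather than BotoServerError. A hedged sketch of the usual pattern (the terminate_job call mirrors the one later in this file):

    import boto3
    from botocore.exceptions import ClientError

    client = boto3.client("batch")
    try:
        client.terminate_job(jobId="example-id", reason="Killed by Toil")
    except ClientError as e:
        # botocore packs the service's error code and HTTP status into e.response
        code = e.response["Error"]["Code"]
        status = e.response["ResponseMetadata"]["HTTPStatusCode"]
        print(f"AWS rejected the call: {code} (HTTP {status})")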
@@ -156,9 +156,9 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
              'AWS Batch can only provide nvidia gpu accelerators.'
          ])
  
-     def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+     def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
          # Try the job as local
-         local_id = self.handleLocalJob(job_desc)
+         local_id = self.handleLocalJob(command, job_desc)
          if local_id is not None:
              # It is a local job
              return local_id
@@ -184,7 +184,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
              environment.update(job_environment)
  
          # Make a command to run it in the executor
-         command_list = pack_job(job_desc, self.user_script)
+         command_list = pack_job(command, self.user_script)
  
          # Compose a job spec to submit
          job_spec = {
@@ -376,7 +376,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
          # Get rid of the job definition we are using if we can.
          self._destroy_job_definition()
  
-     @retry(errors=[BotoServerError])
+     @retry(errors=[ClientError])
      def _try_terminate(self, aws_id: str) -> None:
          """
          Internal function. Should not be called outside this class.
@@ -392,7 +392,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
          # Kill the AWS Batch job
          self.client.terminate_job(jobId=aws_id, reason='Killed by Toil')
  
-     @retry(errors=[BotoServerError])
+     @retry(errors=[ClientError])
      def _wait_until_stopped(self, aws_id: str) -> None:
          """
          Internal function. Should not be called outside this class.
@@ -418,7 +418,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
              logger.info('Waiting for killed job %s to stop', self.aws_id_to_bs_id.get(aws_id, aws_id))
              time.sleep(2)
  
-     @retry(errors=[BotoServerError])
+     @retry(errors=[ClientError])
      def _get_or_create_job_definition(self) -> str:
          """
          Internal function. Should not be called outside this class.
@@ -482,7 +482,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
  
          return self.job_definition
  
-     @retry(errors=[BotoServerError])
+     @retry(errors=[ClientError])
      def _destroy_job_definition(self) -> None:
          """
          Internal function. Should not be called outside this class.
toil/batchSystems/contained_executor.py CHANGED
@@ -25,18 +25,17 @@ import sys
  from typing import Any, Dict, List, Optional
  
  from toil.batchSystems.abstractBatchSystem import EXIT_STATUS_UNAVAILABLE_VALUE
- from toil.job import JobDescription
  from toil.resource import Resource
  from toil.statsAndLogging import configure_root_logger, set_log_level
  
  logger = logging.getLogger(__name__)
  
  
- def pack_job(job_desc: JobDescription, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]:
+ def pack_job(command: str, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]:
      """
-     Create a command that, when run, will execute the given job.
+     Create a command that runs the given command in an environment.
  
-     :param job_desc: Job description for the job to run.
+     :param command: Worker command to run to run the job.
      :param user_script: User script that will be loaded before the job is run.
      :param environment: Environment variable dict that will be applied before
          the job is run.
@@ -46,7 +45,7 @@ def pack_job(command: str, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]:
      """
      # Make a job dict to send to the executor.
      # TODO: Factor out executor setup from here and Kubernetes and TES
-     job: Dict[str, Any] = {"command": job_desc.command}
+     job: Dict[str, Any] = {"command": command}
      if user_script is not None:
          # If there's a user script resource be sure to send it along
          job['userScript'] = user_script
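The job dict built here ultimately gets serialized into the argv of the executor command that pack_job returns. A sketch of such a round trip (the pickle-plus-base64 encoding and the entry-point name are assumptions for illustration, not necessarily Toil's exact wire format):

    import base64
    import pickle
    from typing import Any, Dict, List

    def encode_job(job: Dict[str, Any]) -> List[str]:
        # Assumed encoding: pickle the dict and base64 it so it can travel
        # as a single argv element to the executor entry point.
        blob = base64.b64encode(pickle.dumps(job)).decode("utf-8")
        return ["_toil_contained_executor", blob]  # entry-point name is an assumption

    def decode_job(blob: str) -> Dict[str, Any]:
        # Executor side: recover the dict, then run job["command"].
        return pickle.loads(base64.b64decode(blob))

    command_list = encode_job({"command": "_toil_worker ExampleJob file:jobstore jobkey"})
    assert decode_job(command_list[1])["command"].startswith("_toil_worker")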