toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +41 -17
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +4 -5
  7. toil/batchSystems/gridengine.py +1 -1
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +25 -11
  10. toil/batchSystems/local_support.py +3 -3
  11. toil/batchSystems/lsf.py +9 -9
  12. toil/batchSystems/mesos/batchSystem.py +4 -4
  13. toil/batchSystems/mesos/executor.py +3 -2
  14. toil/batchSystems/options.py +9 -0
  15. toil/batchSystems/singleMachine.py +11 -10
  16. toil/batchSystems/slurm.py +129 -16
  17. toil/batchSystems/torque.py +1 -1
  18. toil/bus.py +45 -3
  19. toil/common.py +56 -31
  20. toil/cwl/cwltoil.py +442 -371
  21. toil/deferred.py +1 -1
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/abstractFileStore.py +69 -20
  24. toil/fileStores/cachingFileStore.py +6 -22
  25. toil/fileStores/nonCachingFileStore.py +6 -15
  26. toil/job.py +270 -86
  27. toil/jobStores/abstractJobStore.py +37 -31
  28. toil/jobStores/aws/jobStore.py +280 -218
  29. toil/jobStores/aws/utils.py +60 -31
  30. toil/jobStores/conftest.py +2 -2
  31. toil/jobStores/fileJobStore.py +3 -3
  32. toil/jobStores/googleJobStore.py +3 -4
  33. toil/leader.py +89 -38
  34. toil/lib/aws/__init__.py +26 -10
  35. toil/lib/aws/iam.py +2 -2
  36. toil/lib/aws/session.py +62 -22
  37. toil/lib/aws/utils.py +73 -37
  38. toil/lib/conversions.py +24 -1
  39. toil/lib/ec2.py +118 -69
  40. toil/lib/expando.py +1 -1
  41. toil/lib/generatedEC2Lists.py +8 -8
  42. toil/lib/io.py +42 -4
  43. toil/lib/misc.py +1 -3
  44. toil/lib/resources.py +57 -16
  45. toil/lib/retry.py +12 -5
  46. toil/lib/threading.py +29 -14
  47. toil/lib/throttle.py +1 -1
  48. toil/options/common.py +31 -30
  49. toil/options/wdl.py +5 -0
  50. toil/provisioners/__init__.py +9 -3
  51. toil/provisioners/abstractProvisioner.py +12 -2
  52. toil/provisioners/aws/__init__.py +20 -15
  53. toil/provisioners/aws/awsProvisioner.py +406 -329
  54. toil/provisioners/gceProvisioner.py +2 -2
  55. toil/provisioners/node.py +13 -5
  56. toil/server/app.py +1 -1
  57. toil/statsAndLogging.py +93 -23
  58. toil/test/__init__.py +27 -12
  59. toil/test/batchSystems/batchSystemTest.py +40 -33
  60. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  61. toil/test/batchSystems/test_slurm.py +22 -7
  62. toil/test/cactus/__init__.py +0 -0
  63. toil/test/cactus/test_cactus_integration.py +58 -0
  64. toil/test/cwl/cwlTest.py +245 -236
  65. toil/test/cwl/seqtk_seq.cwl +1 -1
  66. toil/test/docs/scriptsTest.py +11 -14
  67. toil/test/jobStores/jobStoreTest.py +40 -54
  68. toil/test/lib/aws/test_iam.py +2 -2
  69. toil/test/lib/test_ec2.py +1 -1
  70. toil/test/options/__init__.py +13 -0
  71. toil/test/options/options.py +37 -0
  72. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  73. toil/test/provisioners/clusterTest.py +99 -16
  74. toil/test/server/serverTest.py +2 -2
  75. toil/test/src/autoDeploymentTest.py +1 -1
  76. toil/test/src/dockerCheckTest.py +2 -1
  77. toil/test/src/environmentTest.py +125 -0
  78. toil/test/src/fileStoreTest.py +1 -1
  79. toil/test/src/jobDescriptionTest.py +18 -8
  80. toil/test/src/jobTest.py +1 -1
  81. toil/test/src/realtimeLoggerTest.py +4 -0
  82. toil/test/src/workerTest.py +52 -19
  83. toil/test/utils/toilDebugTest.py +62 -4
  84. toil/test/utils/utilsTest.py +23 -21
  85. toil/test/wdl/wdltoil_test.py +49 -21
  86. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  87. toil/toilState.py +68 -9
  88. toil/utils/toilDebugFile.py +1 -1
  89. toil/utils/toilDebugJob.py +153 -26
  90. toil/utils/toilLaunchCluster.py +12 -2
  91. toil/utils/toilRsyncCluster.py +7 -2
  92. toil/utils/toilSshCluster.py +7 -3
  93. toil/utils/toilStats.py +310 -266
  94. toil/utils/toilStatus.py +98 -52
  95. toil/version.py +11 -11
  96. toil/wdl/wdltoil.py +644 -225
  97. toil/worker.py +125 -83
  98. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  99. toil-7.0.0.dist-info/METADATA +158 -0
  100. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
  101. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  102. toil-6.1.0a1.dist-info/METADATA +0 -125
  103. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  104. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
@@ -17,25 +17,25 @@ import logging
  import os
  import types
  from ssl import SSLError
- from typing import Optional, cast
+ from typing import Optional, cast, TYPE_CHECKING, Dict, List, Tuple
 
  from boto3.s3.transfer import TransferConfig
- from boto.exception import SDBResponseError
  from botocore.client import Config
  from botocore.exceptions import ClientError
- from mypy_boto3_s3 import S3Client, S3ServiceResource
+ from mypy_boto3_sdb.type_defs import ItemTypeDef, AttributeTypeDef
 
- from toil.lib.aws import session
- from toil.lib.aws.utils import connection_reset, get_bucket_region
+ from toil.lib.aws import session, AWSServerErrors
+ from toil.lib.aws.utils import connection_error, get_bucket_region
  from toil.lib.compatibility import compat_bytes
  from toil.lib.retry import (DEFAULT_DELAYS,
                              DEFAULT_TIMEOUT,
-                             ErrorCondition,
                              get_error_code,
                              get_error_message,
                              get_error_status,
                              old_retry,
                              retry)
+ if TYPE_CHECKING:
+     from mypy_boto3_s3 import S3ServiceResource
 
  logger = logging.getLogger(__name__)
 
@@ -125,11 +125,11 @@ class SDBHelper:
          return cls._maxChunks() * cls.maxValueSize
 
      @classmethod
-     def binaryToAttributes(cls, binary):
+     def binaryToAttributes(cls, binary) -> Dict[str, str]:
          """
          Turn a bytestring, or None, into SimpleDB attributes.
          """
-         if binary is None: return {'numChunks': 0}
+         if binary is None: return {'numChunks': '0'}
          assert isinstance(binary, bytes)
          assert len(binary) <= cls.maxBinarySize()
          # The use of compression is just an optimization. We can't include it in the maxValueSize
@@ -143,10 +143,41 @@ class SDBHelper:
          assert len(encoded) <= cls._maxEncodedSize()
          n = cls.maxValueSize
          chunks = (encoded[i:i + n] for i in range(0, len(encoded), n))
-         attributes = {cls._chunkName(i): chunk for i, chunk in enumerate(chunks)}
-         attributes.update({'numChunks': len(attributes)})
+         attributes = {cls._chunkName(i): chunk.decode("utf-8") for i, chunk in enumerate(chunks)}
+         attributes.update({'numChunks': str(len(attributes))})
          return attributes
 
+     @classmethod
+     def attributeDictToList(cls, attributes: Dict[str, str]) -> List[AttributeTypeDef]:
+         """
+         Convert the attribute dict (ex: from binaryToAttributes) into a list of attribute typed dicts
+         to be compatible with boto3 argument syntax
+         :param attributes: Dict[str, str], attribute in object form
+         :return: List[AttributeTypeDef], list of attributes in typed dict form
+         """
+         return [{"Name": name, "Value": value} for name, value in attributes.items()]
+
+     @classmethod
+     def attributeListToDict(cls, attributes: List[AttributeTypeDef]) -> Dict[str, str]:
+         """
+         Convert the attribute boto3 representation of list of attribute typed dicts
+         back to a dictionary with name, value pairs
+         :param attribute: List[AttributeTypeDef, attribute in typed dict form
+         :return: Dict[str, str], attribute in dict form
+         """
+         return {attribute["Name"]: attribute["Value"] for attribute in attributes}
+
+     @classmethod
+     def get_attributes_from_item(cls, item: ItemTypeDef, keys: List[str]) -> List[Optional[str]]:
+         return_values: List[Optional[str]] = [None for _ in keys]
+         mapped_indices: Dict[str, int] = {name: index for index, name in enumerate(keys)}
+         for attribute in item["Attributes"]:
+             name = attribute["Name"]
+             value = attribute["Value"]
+             if name in mapped_indices:
+                 return_values[mapped_indices[name]] = value
+         return return_values
+
      @classmethod
      def _chunkName(cls, i):
          return str(i).zfill(3)
@@ -165,14 +196,21 @@ class SDBHelper:
          return 'numChunks'
 
      @classmethod
-     def attributesToBinary(cls, attributes):
+     def attributesToBinary(cls, attributes: List[AttributeTypeDef]) -> Tuple[bytes, int]:
          """
          :rtype: (str|None,int)
          :return: the binary data and the number of chunks it was composed from
          """
-         chunks = [(int(k), v) for k, v in attributes.items() if cls._isValidChunkName(k)]
+         chunks = []
+         numChunks: int = 0
+         for attribute in attributes:
+             name = attribute["Name"]
+             value = attribute["Value"]
+             if cls._isValidChunkName(name):
+                 chunks.append((int(name), value))
+             if name == "numChunks":
+                 numChunks = int(value)
          chunks.sort()
-         numChunks = int(attributes['numChunks'])
          if numChunks:
              serializedJob = b''.join(v.encode() for k, v in chunks)
              compressed = base64.b64decode(serializedJob)
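The helpers added above translate between the plain dict form used inside SDBHelper and the list-of-typed-dicts form that boto3's SimpleDB calls expect. A minimal round-trip sketch, assuming SDBHelper is still importable from toil.jobStores.aws.utils as in prior releases (the values in the comments are illustrative):

    from toil.jobStores.aws.utils import SDBHelper

    attrs = SDBHelper.binaryToAttributes(b"payload")    # e.g. {'000': '...', 'numChunks': '1'}
    as_list = SDBHelper.attributeDictToList(attrs)      # [{'Name': '000', 'Value': '...'}, ...]
    assert SDBHelper.attributeListToDict(as_list) == attrs
    data, num_chunks = SDBHelper.attributesToBinary(as_list)
    assert data == b"payload" and num_chunks == 1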
@@ -192,10 +230,7 @@ def fileSizeAndTime(localFilePath):
      return file_stat.st_size, file_stat.st_mtime
 
 
- @retry(errors=[ErrorCondition(
-     error=ClientError,
-     error_codes=[404, 500, 502, 503, 504]
- )])
+ @retry(errors=[AWSServerErrors])
  def uploadFromPath(localFilePath: str,
                     resource,
                     bucketName: str,
@@ -231,10 +266,7 @@ def uploadFromPath(localFilePath: str,
      return version
 
 
- @retry(errors=[ErrorCondition(
-     error=ClientError,
-     error_codes=[404, 500, 502, 503, 504]
- )])
+ @retry(errors=[AWSServerErrors])
  def uploadFile(readable,
                 resource,
                 bucketName: str,
@@ -286,11 +318,8 @@ class ServerSideCopyProhibitedError(RuntimeError):
      insists that you pay to download and upload the data yourself instead.
      """
 
- @retry(errors=[ErrorCondition(
-     error=ClientError,
-     error_codes=[404, 500, 502, 503, 504]
- )])
- def copyKeyMultipart(resource: S3ServiceResource,
+ @retry(errors=[AWSServerErrors])
+ def copyKeyMultipart(resource: "S3ServiceResource",
                       srcBucketName: str,
                       srcKeyName: str,
                       srcKeyVersion: str,
@@ -346,7 +375,7 @@ def copyKeyMultipart(resource: S3ServiceResource,
      # not wherever the bucket virtual hostnames go.
      source_region = get_bucket_region(srcBucketName)
      source_client = cast(
-         S3Client,
+         "S3Client",
          session.client(
              's3',
              region_name=source_region,
@@ -438,9 +467,9 @@ def sdb_unavailable(e):
 
 
  def no_such_sdb_domain(e):
-     return (isinstance(e, SDBResponseError)
-             and e.error_code
-             and e.error_code.endswith('NoSuchDomain'))
+     return (isinstance(e, ClientError)
+             and get_error_code(e)
+             and get_error_code(e).endswith('NoSuchDomain'))
 
 
  def retryable_ssl_error(e):
@@ -451,7 +480,7 @@ def retryable_ssl_error(e):
  def retryable_sdb_errors(e):
      return (sdb_unavailable(e)
              or no_such_sdb_domain(e)
-             or connection_reset(e)
+             or connection_error(e)
              or retryable_ssl_error(e))
 
 
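These predicate functions are meant to be fed to toil's old_retry generator, which is imported at the top of this file. A hedged sketch of that usage pattern; the wrapper function, SimpleDB client, domain, and item names are illustrative placeholders:

    from toil.lib.retry import DEFAULT_DELAYS, DEFAULT_TIMEOUT, old_retry

    def read_item(sdb_client, domain_name: str, item_name: str):
        # Retry SimpleDB reads on anything classified as retryable above.
        for attempt in old_retry(delays=DEFAULT_DELAYS, timeout=DEFAULT_TIMEOUT,
                                 predicate=retryable_sdb_errors):
            with attempt:
                return sdb_client.get_attributes(DomainName=domain_name,
                                                 ItemName=item_name,
                                                 ConsistentRead=True)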
@@ -17,7 +17,7 @@
  collect_ignore = []
 
  try:
-     import boto
-     print(boto.__file__) # prevent this import from being removed
+     import boto3
+     print(boto3.__file__) # prevent this import from being removed
  except ImportError:
      collect_ignore.append("aws")
@@ -113,7 +113,7 @@ class FileJobStore(AbstractJobStore):
              os.mkdir(self.jobStoreDir)
          except OSError as e:
              if e.errno == errno.EEXIST:
-                 raise JobStoreExistsException(self.jobStoreDir)
+                 raise JobStoreExistsException(self.jobStoreDir, "file")
              else:
                  raise
          os.makedirs(self.jobsDir, exist_ok=True)
@@ -127,7 +127,7 @@ class FileJobStore(AbstractJobStore):
 
      def resume(self):
          if not os.path.isdir(self.jobStoreDir):
-             raise NoSuchJobStoreException(self.jobStoreDir)
+             raise NoSuchJobStoreException(self.jobStoreDir, "file")
          super().resume()
 
      def destroy(self):
@@ -920,7 +920,7 @@ class FileJobStore(AbstractJobStore):
          :raise NoSuchFileException: if the file with ID jobStoreFileID does
          not exist or is not a file
          """
-         if not self.file_exists(unquote(jobStoreFileID)):
+         if not self.file_exists(jobStoreFileID):
              raise NoSuchFileException(jobStoreFileID)
 
      def _get_arbitrary_jobs_dir_for_name(self, jobNameSlug):
@@ -164,7 +164,7 @@ class GoogleJobStore(AbstractJobStore):
          try:
              self.bucket = self.storageClient.create_bucket(self.bucketName)
          except exceptions.Conflict:
-             raise JobStoreExistsException(self.locator)
+             raise JobStoreExistsException(self.locator, "google")
          super().initialize(config)
 
          # set up sever side encryption after we set up config in super
@@ -178,7 +178,7 @@ class GoogleJobStore(AbstractJobStore):
          try:
              self.bucket = self.storageClient.get_bucket(self.bucketName)
          except exceptions.NotFound:
-             raise NoSuchJobStoreException(self.locator)
+             raise NoSuchJobStoreException(self.locator, "google")
          super().resume()
 
      @google_retry
@@ -209,8 +209,7 @@ class GoogleJobStore(AbstractJobStore):
 
      def assign_job_id(self, job_description):
          jobStoreID = self._new_job_id()
-         log.debug("Assigning ID to job %s for '%s'",
-                   jobStoreID, '<no command>' if job_description.command is None else job_description.command)
+         log.debug("Assigning ID to job %s", jobStoreID)
          job_description.jobStoreID = jobStoreID
 
      @contextmanager
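In both job store backends the exceptions now take the kind of job store as a second argument, so the error message can name the backend involved. A small illustrative snippet, assuming the exception classes still live in toil.jobStores.abstractJobStore and that the second argument is the store type string as shown in this diff:

    from toil.jobStores.abstractJobStore import NoSuchJobStoreException

    try:
        raise NoSuchJobStoreException("/data/my-jobstore", "file")
    except NoSuchJobStoreException as e:
        print(e)  # the message can now identify this as a 'file' job store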
toil/leader.py CHANGED
@@ -28,14 +28,16 @@ import enlighten
  from toil import resolveEntryPoint
  from toil.batchSystems import DeadlockException
  from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
-                                                    BatchJobExitReason)
+                                                    BatchJobExitReason,
+                                                    EXIT_STATUS_UNAVAILABLE_VALUE)
  from toil.bus import (JobCompletedMessage,
                        JobFailedMessage,
                        JobIssuedMessage,
                        JobMissingMessage,
                        JobUpdatedMessage,
                        QueueSizeMessage,
-                       gen_message_bus_path)
+                       gen_message_bus_path,
+                       get_job_kind)
  from toil.common import Config, ToilMetrics
  from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
  from toil.exceptions import FailedJobsException
@@ -117,7 +119,12 @@ class Leader:
          if self.config.write_messages is None:
              # The user hasn't specified a place for the message bus so we
              # should make one.
-             self.config.write_messages = gen_message_bus_path()
+             # pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
+             # from cwltool and we change the coordination_dir when detected. we don't want
+             # to make another config attribute so put the message bus in the already prefixed dir
+             # if a coordination_dir is provided normally, we can still put the bus in there
+             # as the coordination dir should serve a similar purpose to the tmp directory
+             self.config.write_messages = gen_message_bus_path(config.coordination_dir)
 
          # Message bus messages need to go to the given file.
          # Keep a reference to the return value so the listener stays alive.
@@ -287,7 +294,11 @@ class Leader:
              for job_id in self.toilState.totalFailedJobs:
                  # Refresh all the failed jobs to get e.g. the log file IDs that the workers wrote
                  self.toilState.reset_job(job_id)
-                 failed_jobs.append(self.toilState.get_job(job_id))
+                 try:
+                     failed_jobs.append(self.toilState.get_job(job_id))
+                 except NoSuchJobException:
+                     # Job actually finished and was removed
+                     pass
 
              logger.info("Failed jobs at end of the run: %s", ' '.join(str(j) for j in failed_jobs))
              raise FailedJobsException(self.jobStore, failed_jobs, exit_code=self.recommended_fail_exit_code)
@@ -520,10 +531,10 @@ class Leader:
                             "manager: %s", readyJob.jobStoreID)
          elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors:
              self._processFailedSuccessors(job_id)
-         elif readyJob.command is not None or result_status != 0:
-             # The job has a command it must be run before any successors.
+         elif readyJob.has_body() or result_status != 0:
+             # The job has a body it must be run before any successors.
              # Similarly, if the job previously failed we rerun it, even if it doesn't have a
-             # command to run, to eliminate any parts of the stack now completed.
+             # body to run, to eliminate any parts of the stack now completed.
              isServiceJob = readyJob.jobStoreID in self.toilState.service_to_client
 
              # We want to run the job, and expend one of its "tries" (possibly
@@ -549,6 +560,7 @@ class Leader:
              for serviceID in serviceJobList:
                  if serviceID in self.toilState.service_to_client:
                      raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
+                 # TODO: Why do we refresh here?
                  self.toilState.reset_job(serviceID)
                  serviceHost = self.toilState.get_job(serviceID)
                  self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
@@ -705,8 +717,9 @@ class Leader:
          if exitStatus == 0:
              logger.debug('Job ended: %s', updatedJob)
          else:
-             logger.warning(f'Job failed with exit value {exitStatus}: {updatedJob}\n'
-                            f'Exit reason: {exitReason}')
+             status_string = str(exitStatus) if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE else "<UNAVAILABLE>"
+             logger.warning(f'Job failed with exit value {status_string}: {updatedJob}\n'
+                            f'Exit reason: {BatchJobExitReason.to_string(exitReason)}')
          if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
              # This is a CWL job informing us that the workflow is
              # asking things of us that Toil can't do. When we raise an
@@ -715,7 +728,7 @@ class Leader:
              logger.warning("This indicates an unsupported CWL requirement!")
              self.recommended_fail_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
          # Tell everyone it stopped running.
-         self._messages.publish(JobCompletedMessage(updatedJob.get_job_kind(), updatedJob.jobStoreID, exitStatus))
+         self._messages.publish(JobCompletedMessage(get_job_kind(updatedJob.get_names()), updatedJob.jobStoreID, exitStatus))
          self.process_finished_job(bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason)
 
      def _processLostJobs(self):
@@ -893,11 +906,6 @@ class Leader:
          workerCommand.append('--context')
          workerCommand.append(base64.b64encode(pickle.dumps(context)).decode('utf-8'))
 
-         # We locally override the command. This shouldn't get persisted back to
-         # the job store, or we will detach the job body from the job
-         # description. TODO: Don't do it this way! It's weird!
-         jobNode.command = ' '.join(workerCommand)
-
          omp_threads = os.environ.get('OMP_NUM_THREADS') \
                        or str(max(1, int(jobNode.cores))) # make sure OMP_NUM_THREADS is a positive integer
@@ -907,7 +915,7 @@ class Leader:
          }
 
          # jobBatchSystemID is an int for each job
-         jobBatchSystemID = self.batchSystem.issueBatchJob(jobNode, job_environment=job_environment)
+         jobBatchSystemID = self.batchSystem.issueBatchJob(' '.join(workerCommand), jobNode, job_environment=job_environment)
          # Record the job by the ID the batch system will use to talk about it with us
          self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID
          # Record that this job is issued right now and shouldn't e.g. be issued again.
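With this change the leader no longer stashes the worker invocation on jobNode.command; it hands the command string to the batch system directly. A hedged, self-contained sketch of what an issueBatchJob implementation receives under the new calling convention; real batch systems subclass toil's AbstractBatchSystem, and this standalone class and its bookkeeping are illustrative only:

    from typing import Dict, Optional

    class SketchBatchSystem:
        """Illustrative only: mirrors the new issueBatchJob argument order."""

        def __init__(self) -> None:
            self._next_id = 0
            self._commands: Dict[int, str] = {}

        def issueBatchJob(self, command: str, job_desc,
                          job_environment: Optional[Dict[str, str]] = None) -> int:
            # 'command' is the worker command line the leader assembled;
            # 'job_desc' still carries the resource requirements (cores, memory, disk).
            self._next_id += 1
            self._commands[self._next_id] = command
            return self._next_id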
@@ -921,7 +929,7 @@ class Leader:
                       "%s and %s",
                       jobNode, str(jobBatchSystemID), jobNode.requirements_string())
          # Tell everyone it is issued and the queue size changed
-         self._messages.publish(JobIssuedMessage(jobNode.get_job_kind(), jobNode.jobStoreID, jobBatchSystemID))
+         self._messages.publish(JobIssuedMessage(get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID))
          self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
          # Tell the user there's another job to do
          self.progress_overall.total += 1
@@ -1045,7 +1053,7 @@ class Leader:
          jobs = [job for job in jobs if job.preemptible == preemptible]
          return jobs
 
-     def killJobs(self, jobsToKill):
+     def killJobs(self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED):
          """
          Kills the given set of jobs and then sends them for processing.
 
@@ -1059,7 +1067,7 @@ class Leader:
          self.batchSystem.killBatchJobs(jobsToKill)
          for jobBatchSystemID in jobsToKill:
              # Reissue immediately, noting that we killed the job
-             willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=BatchJobExitReason.KILLED)
+             willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=exit_reason)
 
              if willRerun:
                  # Compose a list of all the jobs that will run again
@@ -1089,7 +1097,7 @@ class Leader:
                                 str(runningJobs[jobBatchSystemID]),
                                 str(maxJobDuration))
                  jobsToKill.append(jobBatchSystemID)
-         reissued = self.killJobs(jobsToKill)
+         reissued = self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION)
          if len(jobsToKill) > 0:
              # Summarize our actions
              logger.info("Killed %d over long jobs and reissued %d of them", len(jobsToKill), len(reissued))
@@ -1127,7 +1135,7 @@ class Leader:
              if timesMissing == killAfterNTimesMissing:
                  self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
                  jobsToKill.append(jobBatchSystemID)
-         self.killJobs(jobsToKill)
+         self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING)
          return len( self.reissueMissingJobs_missingHash ) == 0 #We use this to inform
          #if there are missing jobs
@@ -1157,7 +1165,7 @@ class Leader:
          self.progress_overall.update(incr=-1)
          self.progress_failed.update(incr=1)
 
-         # Delegate to the vers
+         # Delegate to the version that uses a JobDescription
          return self.process_finished_job_description(issued_job, result_status, wall_time, exit_reason, batch_system_id)
 
      def process_finished_job_description(self, finished_job: JobDescription, result_status: int,
@@ -1165,7 +1173,7 @@ class Leader:
                                           exit_reason: Optional[BatchJobExitReason] = None,
                                           batch_system_id: Optional[int] = None) -> bool:
          """
-         Process a finished JobDescription based upon its succees or failure.
+         Process a finished JobDescription based upon its success or failure.
 
          If wall-clock time is available, informs the cluster scaler about the
          job finishing.
@@ -1188,19 +1196,62 @@ class Leader:
          logger.debug("Job %s continues to exist (i.e. has more to do)", finished_job)
          try:
              # Reload the job as modified by the worker
-             self.toilState.reset_job(job_store_id)
-             replacement_job = self.toilState.get_job(job_store_id)
+             if finished_job.has_body():
+                 # The worker was expected to do some work. We expect the
+                 # worker to have updated the job description.
+
+                 # If the job succeeded, we wait around to see the update
+                 # and fail the job if we don't see it.
+                 if result_status == 0:
+                     timeout = self.config.job_store_timeout
+                     complaint = (
+                         f"has no new version available after {timeout} "
+                         "seconds. Either worker updates to "
+                         "the job store are delayed longer than your "
+                         "--jobStoreTimeout, or the worker trying to run the "
+                         "job was killed (or never started)."
+                     )
+                 else:
+                     timeout = 0
+                     complaint = (
+                         "has no new version available immediately. The "
+                         "batch system may have killed (or never started) "
+                         "the Toil worker."
+                     )
+                 change_detected = self.toilState.reset_job_expecting_change(job_store_id, timeout)
+                 replacement_job = self.toilState.get_job(job_store_id)
+
+                 if not change_detected:
+                     logger.warning(
+                         'Job %s %s',
+                         replacement_job,
+                         complaint
+                     )
+                     if result_status == 0:
+                         # Make the job fail because we ran it and it finished
+                         # and we never heard back.
+                         logger.error(
+                             'Marking ostensibly successful job %s that did '
+                             'not report in to the job store before '
+                             '--jobStoreTimeout as having been partitioned '
+                             'from us.',
+                             replacement_job
+                         )
+                         result_status = EXIT_STATUS_UNAVAILABLE_VALUE
+                         exit_reason = BatchJobExitReason.PARTITION
+             else:
+                 # If there was no body sent, the worker won't commit any
+                 # changes to the job description. So don't wait around for
+                 # any and don't complain if we don't see them.
+                 self.toilState.reset_job(job_store_id)
+                 replacement_job = self.toilState.get_job(job_store_id)
+
          except NoSuchJobException:
              # We have a ghost job - the job has been deleted but a stale
              # read from e.g. a non-POSIX-compliant filesystem gave us a
              # false positive when we checked for its existence. Process the
              # job from here as any other job removed from the job store.
-             # This is a hack until we can figure out how to actually always
-             # have a strongly-consistent communications channel. See
-             # https://github.com/BD2KGenomics/toil/issues/1091
-             logger.warning('Got a stale read for job %s; caught its '
-                            'completion in time, but other jobs may try to run twice! Fix '
-                            'the consistency of your job store storage!', finished_job)
+             logger.debug("Job %s is actually complete upon closer inspection", finished_job)
              self.processRemovedJob(finished_job, result_status)
              return False
          if replacement_job.logJobStoreFileID is not None:
@@ -1208,11 +1259,12 @@ class Leader:
              # more memory efficient than read().striplines() while leaving off the
              # trailing \n left when using readlines()
              # http://stackoverflow.com/a/15233739
-             StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
+             StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
                                                message='The job seems to have left a log file, indicating failure: %s' % replacement_job)
          if self.config.writeLogs or self.config.writeLogsGzip:
              with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
-                 StatsAndLogging.writeLogFiles(replacement_job.chainedJobs, log_stream, self.config, failed=True)
+                 # Send log data from the job store to each per-job log file involved.
+                 StatsAndLogging.writeLogFiles([names.stats_name for names in replacement_job.get_chain()], log_stream, self.config, failed=True)
          if result_status != 0:
              # If the batch system returned a non-zero exit code then the worker
              # is assumed not to have captured the failure of the job, so we
@@ -1236,13 +1288,12 @@ class Leader:
          else:
              with log_stream:
                  if os.path.getsize(log_file) > 0:
-                     StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
+                     StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
                                                        message='The batch system left a non-empty file %s:' % log_file)
                      if self.config.writeLogs or self.config.writeLogsGzip:
                          file_root, _ = os.path.splitext(os.path.basename(log_file))
-                         job_names = replacement_job.chainedJobs
-                         if job_names is None: # For jobs that fail this way, replacement_job.chainedJobs is not guaranteed to be set
-                             job_names = [str(replacement_job)]
+                         job_names = [names.stats_name for names in replacement_job.get_chain()]
+                         # Tack the batch system log file name onto each job's name
                          job_names = [j + '_' + file_root for j in job_names]
                          log_stream.seek(0)
                          StatsAndLogging.writeLogFiles(job_names, log_stream, self.config, failed=True)
@@ -1309,7 +1360,7 @@ class Leader:
 
          # Tell everyone it failed
 
-         self._messages.publish(JobFailedMessage(job_desc.get_job_kind(), job_id))
+         self._messages.publish(JobFailedMessage(get_job_kind(job_desc.get_names()), job_id))
 
          if job_id in self.toilState.service_to_client:
              # Is a service job
toil/lib/aws/__init__.py CHANGED
@@ -16,11 +16,25 @@ import logging
  import os
  import re
  import socket
+ import toil.lib.retry
  from http.client import HTTPException
- from typing import Dict, MutableMapping, Optional
+ from typing import Dict, MutableMapping, Optional, Union, Literal
  from urllib.error import URLError
  from urllib.request import urlopen
 
+ from botocore.exceptions import ClientError
+
+ from mypy_boto3_s3.literals import BucketLocationConstraintType
+
+ AWSRegionName = Union[BucketLocationConstraintType, Literal["us-east-1"]]
+
+ # These are errors where we think something randomly
+ # went wrong on the AWS side and we ought to retry.
+ AWSServerErrors = toil.lib.retry.ErrorCondition(
+     error=ClientError,
+     error_codes=[404, 500, 502, 503, 504]
+ )
+
  logger = logging.getLogger(__name__)
 
  # This file isn't allowed to import anything that depends on Boto or Boto3,
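AWSServerErrors packages up the ClientError/status-code condition that the retry decorators in toil/jobStores/aws/utils.py previously spelled out inline. A short usage sketch; the decorator and condition come from this diff, while read_object and its S3 call are illustrative:

    from toil.lib.aws import AWSServerErrors
    from toil.lib.retry import retry

    @retry(errors=[AWSServerErrors])
    def read_object(s3_client, bucket: str, key: str) -> bytes:
        # Retried on ClientError with HTTP status 404, 500, 502, 503, or 504.
        return s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()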
@@ -67,11 +81,10 @@ def get_aws_zone_from_metadata() -> Optional[str]:
      # metadata.
      try:
          # Use the EC2 metadata service
-         import boto
-         str(boto) # to prevent removal of the import
-         from boto.utils import get_instance_metadata
+         from ec2_metadata import ec2_metadata
+
          logger.debug("Fetch AZ from EC2 metadata")
-         return get_instance_metadata()['placement']['availability-zone']
+         return ec2_metadata.availability_zone
      except ImportError:
          # This is expected to happen a lot
          logger.debug("No boto to fetch ECS metadata")
@@ -82,12 +95,15 @@ def get_aws_zone_from_metadata() -> Optional[str]:
 
  def get_aws_zone_from_boto() -> Optional[str]:
      """
-     Get the AWS zone from the Boto config file, if it is configured and the
-     boto module is available.
+     Get the AWS zone from the Boto3 config file or from AWS_DEFAULT_REGION, if it is configured and the
+     boto3 module is available.
      """
      try:
-         import boto
-         zone = boto.config.get('Boto', 'ec2_region_name')
+         import boto3
+         from session import client
+         boto3_session = boto3.session.Session()
+         # this should check AWS_DEFAULT_REGION and ~/.aws/config
+         zone = boto3_session.region_name
          if zone is not None:
              zone += 'a' # derive an availability zone in the region
              return zone
@@ -128,7 +144,7 @@ def get_current_aws_zone() -> Optional[str]:
          get_aws_zone_from_environment_region() or \
          get_aws_zone_from_boto()
 
- def zone_to_region(zone: str) -> str:
+ def zone_to_region(zone: str) -> AWSRegionName:
      """Get a region (e.g. us-west-2) from a zone (e.g. us-west-1c)."""
      # re.compile() caches the regex internally so we don't have to
      availability_zone = re.compile(r'^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$')
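Per the regex in the surrounding context, the region is the zone minus its trailing letter, and the return value is now typed as AWSRegionName. Illustrative calls, not part of the diff:

    from toil.lib.aws import zone_to_region

    assert zone_to_region("us-west-2a") == "us-west-2"
    assert zone_to_region("eu-central-1b") == "eu-central-1"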
toil/lib/aws/iam.py CHANGED
@@ -257,8 +257,8 @@ def get_policy_permissions(region: str) -> AllowedActionCollection:
      :param zone: AWS zone to connect to
      """
 
-     iam: IAMClient = cast(IAMClient, get_client('iam', region))
-     sts: STSClient = cast(STSClient, get_client('sts', region))
+     iam: IAMClient = get_client('iam', region)
+     sts: STSClient = get_client('sts', region)
      #TODO Condider effect: deny at some point
      allowed_actions: AllowedActionCollection = defaultdict(lambda: {'Action': [], 'NotAction': []})
      try: