toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +22 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/contained_executor.py +4 -5
  6. toil/batchSystems/gridengine.py +1 -1
  7. toil/batchSystems/htcondor.py +5 -5
  8. toil/batchSystems/kubernetes.py +25 -11
  9. toil/batchSystems/local_support.py +3 -3
  10. toil/batchSystems/lsf.py +2 -2
  11. toil/batchSystems/mesos/batchSystem.py +4 -4
  12. toil/batchSystems/mesos/executor.py +3 -2
  13. toil/batchSystems/options.py +9 -0
  14. toil/batchSystems/singleMachine.py +11 -10
  15. toil/batchSystems/slurm.py +64 -22
  16. toil/batchSystems/torque.py +1 -1
  17. toil/bus.py +7 -3
  18. toil/common.py +36 -13
  19. toil/cwl/cwltoil.py +365 -312
  20. toil/deferred.py +1 -1
  21. toil/fileStores/abstractFileStore.py +17 -17
  22. toil/fileStores/cachingFileStore.py +2 -2
  23. toil/fileStores/nonCachingFileStore.py +1 -1
  24. toil/job.py +228 -60
  25. toil/jobStores/abstractJobStore.py +18 -10
  26. toil/jobStores/aws/jobStore.py +280 -218
  27. toil/jobStores/aws/utils.py +57 -29
  28. toil/jobStores/conftest.py +2 -2
  29. toil/jobStores/fileJobStore.py +2 -2
  30. toil/jobStores/googleJobStore.py +3 -4
  31. toil/leader.py +72 -24
  32. toil/lib/aws/__init__.py +26 -10
  33. toil/lib/aws/iam.py +2 -2
  34. toil/lib/aws/session.py +62 -22
  35. toil/lib/aws/utils.py +73 -37
  36. toil/lib/conversions.py +5 -1
  37. toil/lib/ec2.py +118 -69
  38. toil/lib/expando.py +1 -1
  39. toil/lib/io.py +14 -2
  40. toil/lib/misc.py +1 -3
  41. toil/lib/resources.py +55 -21
  42. toil/lib/retry.py +12 -5
  43. toil/lib/threading.py +2 -2
  44. toil/lib/throttle.py +1 -1
  45. toil/options/common.py +27 -24
  46. toil/provisioners/__init__.py +9 -3
  47. toil/provisioners/abstractProvisioner.py +9 -7
  48. toil/provisioners/aws/__init__.py +20 -15
  49. toil/provisioners/aws/awsProvisioner.py +406 -329
  50. toil/provisioners/gceProvisioner.py +2 -2
  51. toil/provisioners/node.py +13 -5
  52. toil/server/app.py +1 -1
  53. toil/statsAndLogging.py +58 -16
  54. toil/test/__init__.py +27 -12
  55. toil/test/batchSystems/batchSystemTest.py +40 -33
  56. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  57. toil/test/batchSystems/test_slurm.py +1 -1
  58. toil/test/cwl/cwlTest.py +8 -91
  59. toil/test/cwl/seqtk_seq.cwl +1 -1
  60. toil/test/docs/scriptsTest.py +10 -13
  61. toil/test/jobStores/jobStoreTest.py +33 -49
  62. toil/test/lib/aws/test_iam.py +2 -2
  63. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  64. toil/test/provisioners/clusterTest.py +90 -8
  65. toil/test/server/serverTest.py +2 -2
  66. toil/test/src/autoDeploymentTest.py +1 -1
  67. toil/test/src/dockerCheckTest.py +2 -1
  68. toil/test/src/environmentTest.py +125 -0
  69. toil/test/src/fileStoreTest.py +1 -1
  70. toil/test/src/jobDescriptionTest.py +18 -8
  71. toil/test/src/jobTest.py +1 -1
  72. toil/test/src/realtimeLoggerTest.py +4 -0
  73. toil/test/src/workerTest.py +52 -19
  74. toil/test/utils/toilDebugTest.py +61 -3
  75. toil/test/utils/utilsTest.py +20 -18
  76. toil/test/wdl/wdltoil_test.py +24 -71
  77. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  78. toil/toilState.py +68 -9
  79. toil/utils/toilDebugJob.py +153 -26
  80. toil/utils/toilLaunchCluster.py +12 -2
  81. toil/utils/toilRsyncCluster.py +7 -2
  82. toil/utils/toilSshCluster.py +7 -3
  83. toil/utils/toilStats.py +2 -1
  84. toil/utils/toilStatus.py +97 -51
  85. toil/version.py +10 -10
  86. toil/wdl/wdltoil.py +318 -51
  87. toil/worker.py +96 -69
  88. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  89. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
  90. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
  91. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  92. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  93. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/jobStores/aws/utils.py CHANGED
@@ -17,26 +17,25 @@ import logging
  import os
  import types
  from ssl import SSLError
- from typing import Optional, cast, TYPE_CHECKING
+ from typing import Optional, cast, TYPE_CHECKING, Dict, List, Tuple

  from boto3.s3.transfer import TransferConfig
- from boto.exception import SDBResponseError
  from botocore.client import Config
  from botocore.exceptions import ClientError
+ from mypy_boto3_sdb.type_defs import ItemTypeDef, AttributeTypeDef

- from toil.lib.aws import session
- from toil.lib.aws.utils import connection_reset, get_bucket_region
+ from toil.lib.aws import session, AWSServerErrors
+ from toil.lib.aws.utils import connection_error, get_bucket_region
  from toil.lib.compatibility import compat_bytes
  from toil.lib.retry import (DEFAULT_DELAYS,
                              DEFAULT_TIMEOUT,
-                             ErrorCondition,
                              get_error_code,
                              get_error_message,
                              get_error_status,
                              old_retry,
                              retry)
  if TYPE_CHECKING:
-     from mypy_boto3_s3 import S3Client, S3ServiceResource
+     from mypy_boto3_s3 import S3ServiceResource

  logger = logging.getLogger(__name__)

@@ -126,11 +125,11 @@ class SDBHelper:
          return cls._maxChunks() * cls.maxValueSize

      @classmethod
-     def binaryToAttributes(cls, binary):
+     def binaryToAttributes(cls, binary) -> Dict[str, str]:
          """
          Turn a bytestring, or None, into SimpleDB attributes.
          """
-         if binary is None: return {'numChunks': 0}
+         if binary is None: return {'numChunks': '0'}
          assert isinstance(binary, bytes)
          assert len(binary) <= cls.maxBinarySize()
          # The use of compression is just an optimization. We can't include it in the maxValueSize
@@ -144,10 +143,41 @@ class SDBHelper:
          assert len(encoded) <= cls._maxEncodedSize()
          n = cls.maxValueSize
          chunks = (encoded[i:i + n] for i in range(0, len(encoded), n))
-         attributes = {cls._chunkName(i): chunk for i, chunk in enumerate(chunks)}
-         attributes.update({'numChunks': len(attributes)})
+         attributes = {cls._chunkName(i): chunk.decode("utf-8") for i, chunk in enumerate(chunks)}
+         attributes.update({'numChunks': str(len(attributes))})
          return attributes

+     @classmethod
+     def attributeDictToList(cls, attributes: Dict[str, str]) -> List[AttributeTypeDef]:
+         """
+         Convert the attribute dict (ex: from binaryToAttributes) into a list of attribute typed dicts
+         to be compatible with boto3 argument syntax
+         :param attributes: Dict[str, str], attribute in object form
+         :return: List[AttributeTypeDef], list of attributes in typed dict form
+         """
+         return [{"Name": name, "Value": value} for name, value in attributes.items()]
+
+     @classmethod
+     def attributeListToDict(cls, attributes: List[AttributeTypeDef]) -> Dict[str, str]:
+         """
+         Convert the attribute boto3 representation of list of attribute typed dicts
+         back to a dictionary with name, value pairs
+         :param attribute: List[AttributeTypeDef, attribute in typed dict form
+         :return: Dict[str, str], attribute in dict form
+         """
+         return {attribute["Name"]: attribute["Value"] for attribute in attributes}
+
+     @classmethod
+     def get_attributes_from_item(cls, item: ItemTypeDef, keys: List[str]) -> List[Optional[str]]:
+         return_values: List[Optional[str]] = [None for _ in keys]
+         mapped_indices: Dict[str, int] = {name: index for index, name in enumerate(keys)}
+         for attribute in item["Attributes"]:
+             name = attribute["Name"]
+             value = attribute["Value"]
+             if name in mapped_indices:
+                 return_values[mapped_indices[name]] = value
+         return return_values
+
      @classmethod
      def _chunkName(cls, i):
          return str(i).zfill(3)
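
boto3's SimpleDB API exchanges attributes as lists of {"Name": ..., "Value": ...} typed dicts rather than the plain mappings boto 2 used, which is what the three new helpers above bridge. A minimal sketch of how they compose (assuming a toil 7.0.0 install with the aws extra; the sample item is hypothetical):

    from toil.jobStores.aws.utils import SDBHelper

    # A hypothetical SimpleDB item in the shape boto3 returns.
    item = {"Name": "job-0", "Attributes": [{"Name": "owner", "Value": "leader"},
                                            {"Name": "numChunks", "Value": "0"}]}

    # Positionally extract selected attribute values by key.
    owner, num_chunks = SDBHelper.get_attributes_from_item(item, ["owner", "numChunks"])
    assert (owner, num_chunks) == ("leader", "0")

    # Round-trip between the dict and boto3 list representations.
    as_dict = SDBHelper.attributeListToDict(item["Attributes"])
    assert SDBHelper.attributeDictToList(as_dict) == item["Attributes"]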
@@ -166,14 +196,21 @@ class SDBHelper:
          return 'numChunks'

      @classmethod
-     def attributesToBinary(cls, attributes):
+     def attributesToBinary(cls, attributes: List[AttributeTypeDef]) -> Tuple[bytes, int]:
          """
          :rtype: (str|None,int)
          :return: the binary data and the number of chunks it was composed from
          """
-         chunks = [(int(k), v) for k, v in attributes.items() if cls._isValidChunkName(k)]
+         chunks = []
+         numChunks: int = 0
+         for attribute in attributes:
+             name = attribute["Name"]
+             value = attribute["Value"]
+             if cls._isValidChunkName(name):
+                 chunks.append((int(name), value))
+             if name == "numChunks":
+                 numChunks = int(value)
          chunks.sort()
-         numChunks = int(attributes['numChunks'])
          if numChunks:
              serializedJob = b''.join(v.encode() for k, v in chunks)
              compressed = base64.b64decode(serializedJob)
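
Together with attributeDictToList, the reworked attributesToBinary gives a full round trip from bytes to boto3-style attributes and back. A sketch (again assuming toil[aws] is installed; the payload is arbitrary):

    from toil.jobStores.aws.utils import SDBHelper

    payload = b"serialized job state"

    # bytes -> chunked, string-valued attributes -> boto3 typed-dict list
    attrs = SDBHelper.binaryToAttributes(payload)     # e.g. {'000': '...', 'numChunks': '1'}
    attr_list = SDBHelper.attributeDictToList(attrs)

    # boto3 typed-dict list -> original bytes, plus the chunk count
    recovered, num_chunks = SDBHelper.attributesToBinary(attr_list)
    assert recovered == payload and num_chunks == 1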
@@ -193,10 +230,7 @@ def fileSizeAndTime(localFilePath):
      return file_stat.st_size, file_stat.st_mtime


- @retry(errors=[ErrorCondition(
-     error=ClientError,
-     error_codes=[404, 500, 502, 503, 504]
- )])
+ @retry(errors=[AWSServerErrors])
  def uploadFromPath(localFilePath: str,
                     resource,
                     bucketName: str,
@@ -232,10 +266,7 @@ def uploadFromPath(localFilePath: str,
      return version


- @retry(errors=[ErrorCondition(
-     error=ClientError,
-     error_codes=[404, 500, 502, 503, 504]
- )])
+ @retry(errors=[AWSServerErrors])
  def uploadFile(readable,
                 resource,
                 bucketName: str,
@@ -287,10 +318,7 @@ class ServerSideCopyProhibitedError(RuntimeError):
      insists that you pay to download and upload the data yourself instead.
      """

- @retry(errors=[ErrorCondition(
-     error=ClientError,
-     error_codes=[404, 500, 502, 503, 504]
- )])
+ @retry(errors=[AWSServerErrors])
  def copyKeyMultipart(resource: "S3ServiceResource",
                       srcBucketName: str,
                       srcKeyName: str,
@@ -439,9 +467,9 @@ def sdb_unavailable(e):


  def no_such_sdb_domain(e):
-     return (isinstance(e, SDBResponseError)
-             and e.error_code
-             and e.error_code.endswith('NoSuchDomain'))
+     return (isinstance(e, ClientError)
+             and get_error_code(e)
+             and get_error_code(e).endswith('NoSuchDomain'))


  def retryable_ssl_error(e):
@@ -452,7 +480,7 @@ def retryable_ssl_error(e):
  def retryable_sdb_errors(e):
      return (sdb_unavailable(e)
              or no_such_sdb_domain(e)
-             or connection_reset(e)
+             or connection_error(e)
              or retryable_ssl_error(e))

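
These predicates feed toil's old_retry loop, which yields context managers until one body completes without a retryable error. A sketch of the typical call pattern (the item_count function and its boto3 SimpleDB client argument are hypothetical):

    from toil.jobStores.aws.utils import retryable_sdb_errors
    from toil.lib.retry import old_retry

    def item_count(sdb_client, domain: str) -> int:
        # Retry transient SimpleDB trouble: unavailability, NoSuchDomain
        # races, connection errors, and retryable SSL errors.
        for attempt in old_retry(predicate=retryable_sdb_errors):
            with attempt:
                return sdb_client.domain_metadata(DomainName=domain)["ItemCount"]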
toil/jobStores/conftest.py CHANGED
@@ -17,7 +17,7 @@
  collect_ignore = []

  try:
-     import boto
-     print(boto.__file__) # prevent this import from being removed
+     import boto3
+     print(boto3.__file__) # prevent this import from being removed
  except ImportError:
      collect_ignore.append("aws")
toil/jobStores/fileJobStore.py CHANGED
@@ -113,7 +113,7 @@ class FileJobStore(AbstractJobStore):
              os.mkdir(self.jobStoreDir)
          except OSError as e:
              if e.errno == errno.EEXIST:
-                 raise JobStoreExistsException(self.jobStoreDir)
+                 raise JobStoreExistsException(self.jobStoreDir, "file")
              else:
                  raise
          os.makedirs(self.jobsDir, exist_ok=True)
@@ -127,7 +127,7 @@ class FileJobStore(AbstractJobStore):

      def resume(self):
          if not os.path.isdir(self.jobStoreDir):
-             raise NoSuchJobStoreException(self.jobStoreDir)
+             raise NoSuchJobStoreException(self.jobStoreDir, "file")
          super().resume()

      def destroy(self):
toil/jobStores/googleJobStore.py CHANGED
@@ -164,7 +164,7 @@ class GoogleJobStore(AbstractJobStore):
          try:
              self.bucket = self.storageClient.create_bucket(self.bucketName)
          except exceptions.Conflict:
-             raise JobStoreExistsException(self.locator)
+             raise JobStoreExistsException(self.locator, "google")
          super().initialize(config)

          # set up sever side encryption after we set up config in super
@@ -178,7 +178,7 @@ class GoogleJobStore(AbstractJobStore):
          try:
              self.bucket = self.storageClient.get_bucket(self.bucketName)
          except exceptions.NotFound:
-             raise NoSuchJobStoreException(self.locator)
+             raise NoSuchJobStoreException(self.locator, "google")
          super().resume()

      @google_retry
@@ -209,8 +209,7 @@ class GoogleJobStore(AbstractJobStore):

      def assign_job_id(self, job_description):
          jobStoreID = self._new_job_id()
-         log.debug("Assigning ID to job %s for '%s'",
-                   jobStoreID, '<no command>' if job_description.command is None else job_description.command)
+         log.debug("Assigning ID to job %s", jobStoreID)
          job_description.jobStoreID = jobStoreID

      @contextmanager
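
In both job stores the exceptions now take the job-store kind alongside the locator (presumably so the error message can name a full locator such as file:/tmp/my-jobstore). A minimal sketch of the two-argument form, with a placeholder path; toil/jobStores/abstractJobStore.py in this wheel defines the constructors:

    from toil.jobStores.abstractJobStore import NoSuchJobStoreException

    # 7.0.0: locator plus the kind of store ("file", "google", "aws", ...)
    raise NoSuchJobStoreException("/tmp/my-jobstore", "file")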
toil/leader.py CHANGED
@@ -119,7 +119,12 @@ class Leader:
          if self.config.write_messages is None:
              # The user hasn't specified a place for the message bus so we
              # should make one.
-             self.config.write_messages = gen_message_bus_path()
+             # pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
+             # from cwltool and we change the coordination_dir when detected. we don't want
+             # to make another config attribute so put the message bus in the already prefixed dir
+             # if a coordination_dir is provided normally, we can still put the bus in there
+             # as the coordination dir should serve a similar purpose to the tmp directory
+             self.config.write_messages = gen_message_bus_path(config.coordination_dir)

          # Message bus messages need to go to the given file.
          # Keep a reference to the return value so the listener stays alive.
@@ -289,7 +294,11 @@ class Leader:
          for job_id in self.toilState.totalFailedJobs:
              # Refresh all the failed jobs to get e.g. the log file IDs that the workers wrote
              self.toilState.reset_job(job_id)
-             failed_jobs.append(self.toilState.get_job(job_id))
+             try:
+                 failed_jobs.append(self.toilState.get_job(job_id))
+             except NoSuchJobException:
+                 # Job actually finished and was removed
+                 pass

          logger.info("Failed jobs at end of the run: %s", ' '.join(str(j) for j in failed_jobs))
          raise FailedJobsException(self.jobStore, failed_jobs, exit_code=self.recommended_fail_exit_code)
@@ -522,10 +531,10 @@ class Leader:
                            "manager: %s", readyJob.jobStoreID)
          elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors:
              self._processFailedSuccessors(job_id)
-         elif readyJob.command is not None or result_status != 0:
-             # The job has a command it must be run before any successors.
+         elif readyJob.has_body() or result_status != 0:
+             # The job has a body it must be run before any successors.
              # Similarly, if the job previously failed we rerun it, even if it doesn't have a
-             # command to run, to eliminate any parts of the stack now completed.
+             # body to run, to eliminate any parts of the stack now completed.
              isServiceJob = readyJob.jobStoreID in self.toilState.service_to_client

              # We want to run the job, and expend one of its "tries" (possibly
@@ -551,6 +560,7 @@ class Leader:
          for serviceID in serviceJobList:
              if serviceID in self.toilState.service_to_client:
                  raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
+             # TODO: Why do we refresh here?
              self.toilState.reset_job(serviceID)
              serviceHost = self.toilState.get_job(serviceID)
              self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
@@ -896,11 +906,6 @@ class Leader:
          workerCommand.append('--context')
          workerCommand.append(base64.b64encode(pickle.dumps(context)).decode('utf-8'))

-         # We locally override the command. This shouldn't get persisted back to
-         # the job store, or we will detach the job body from the job
-         # description. TODO: Don't do it this way! It's weird!
-         jobNode.command = ' '.join(workerCommand)
-
          omp_threads = os.environ.get('OMP_NUM_THREADS') \
                        or str(max(1, int(jobNode.cores))) # make sure OMP_NUM_THREADS is a positive integer
@@ -910,7 +915,7 @@ class Leader:
          }

          # jobBatchSystemID is an int for each job
-         jobBatchSystemID = self.batchSystem.issueBatchJob(jobNode, job_environment=job_environment)
+         jobBatchSystemID = self.batchSystem.issueBatchJob(' '.join(workerCommand), jobNode, job_environment=job_environment)
          # Record the job by the ID the batch system will use to talk about it with us
          self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID
          # Record that this job is issued right now and shouldn't e.g. be issued again.
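
Note the batch system API change here: issueBatchJob now receives the rendered worker command as an explicit first argument, replacing the jobNode.command override deleted in the previous hunk. For third-party batch systems (7.0.0 also adds toil/test/batchSystems/batch_system_plugin_test.py), the method shape is roughly the sketch below; parameter names are illustrative and _next_id/_launch are hypothetical stand-ins, so check toil/batchSystems/abstractBatchSystem.py in the wheel for the authoritative signature:

    from typing import Dict, Optional

    from toil.batchSystems.abstractBatchSystem import BatchSystemSupport
    from toil.job import JobDescription

    class SketchBatchSystem(BatchSystemSupport):
        # Illustrative only: the 7.0.0 issueBatchJob contract.
        def issueBatchJob(self,
                          command: str,
                          job_desc: JobDescription,
                          job_environment: Optional[Dict[str, str]] = None) -> int:
            # 'command' is the worker invocation the leader built above; it is
            # no longer smuggled through the job description's command field.
            batch_id = self._next_id()                        # hypothetical ID allocator
            self._launch(batch_id, command, job_environment)  # hypothetical submit hook
            return batch_id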
@@ -1048,7 +1053,7 @@ class Leader:
          jobs = [job for job in jobs if job.preemptible == preemptible]
          return jobs

-     def killJobs(self, jobsToKill):
+     def killJobs(self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED):
          """
          Kills the given set of jobs and then sends them for processing.

@@ -1062,7 +1067,7 @@ class Leader:
          self.batchSystem.killBatchJobs(jobsToKill)
          for jobBatchSystemID in jobsToKill:
              # Reissue immediately, noting that we killed the job
-             willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=BatchJobExitReason.KILLED)
+             willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=exit_reason)

              if willRerun:
                  # Compose a list of all the jobs that will run again
@@ -1092,7 +1097,7 @@ class Leader:
                             str(runningJobs[jobBatchSystemID]),
                             str(maxJobDuration))
              jobsToKill.append(jobBatchSystemID)
-         reissued = self.killJobs(jobsToKill)
+         reissued = self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION)
          if len(jobsToKill) > 0:
              # Summarize our actions
              logger.info("Killed %d over long jobs and reissued %d of them", len(jobsToKill), len(reissued))
@@ -1130,7 +1135,7 @@ class Leader:
              if timesMissing == killAfterNTimesMissing:
                  self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
                  jobsToKill.append(jobBatchSystemID)
-         self.killJobs(jobsToKill)
+         self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING)
          return len( self.reissueMissingJobs_missingHash ) == 0 #We use this to inform
          #if there are missing jobs

@@ -1168,7 +1173,7 @@ class Leader:
                                exit_reason: Optional[BatchJobExitReason] = None,
                                batch_system_id: Optional[int] = None) -> bool:
          """
-         Process a finished JobDescription based upon its succees or failure.
+         Process a finished JobDescription based upon its success or failure.

          If wall-clock time is available, informs the cluster scaler about the
          job finishing.
@@ -1191,19 +1196,62 @@ class Leader:
              logger.debug("Job %s continues to exist (i.e. has more to do)", finished_job)
              try:
                  # Reload the job as modified by the worker
-                 self.toilState.reset_job(job_store_id)
-                 replacement_job = self.toilState.get_job(job_store_id)
+                 if finished_job.has_body():
+                     # The worker was expected to do some work. We expect the
+                     # worker to have updated the job description.
+
+                     # If the job succeeded, we wait around to see the update
+                     # and fail the job if we don't see it.
+                     if result_status == 0:
+                         timeout = self.config.job_store_timeout
+                         complaint = (
+                             f"has no new version available after {timeout} "
+                             "seconds. Either worker updates to "
+                             "the job store are delayed longer than your "
+                             "--jobStoreTimeout, or the worker trying to run the "
+                             "job was killed (or never started)."
+                         )
+                     else:
+                         timeout = 0
+                         complaint = (
+                             "has no new version available immediately. The "
+                             "batch system may have killed (or never started) "
+                             "the Toil worker."
+                         )
+                     change_detected = self.toilState.reset_job_expecting_change(job_store_id, timeout)
+                     replacement_job = self.toilState.get_job(job_store_id)
+
+                     if not change_detected:
+                         logger.warning(
+                             'Job %s %s',
+                             replacement_job,
+                             complaint
+                         )
+                         if result_status == 0:
+                             # Make the job fail because we ran it and it finished
+                             # and we never heard back.
+                             logger.error(
+                                 'Marking ostensibly successful job %s that did '
+                                 'not report in to the job store before '
+                                 '--jobStoreTimeout as having been partitioned '
+                                 'from us.',
+                                 replacement_job
+                             )
+                             result_status = EXIT_STATUS_UNAVAILABLE_VALUE
+                             exit_reason = BatchJobExitReason.PARTITION
+                 else:
+                     # If there was no body sent, the worker won't commit any
+                     # changes to the job description. So don't wait around for
+                     # any and don't complain if we don't see them.
+                     self.toilState.reset_job(job_store_id)
+                     replacement_job = self.toilState.get_job(job_store_id)
+
              except NoSuchJobException:
                  # We have a ghost job - the job has been deleted but a stale
                  # read from e.g. a non-POSIX-compliant filesystem gave us a
                  # false positive when we checked for its existence. Process the
                  # job from here as any other job removed from the job store.
-                 # This is a hack until we can figure out how to actually always
-                 # have a strongly-consistent communications channel. See
-                 # https://github.com/BD2KGenomics/toil/issues/1091
-                 logger.warning('Got a stale read for job %s; caught its '
-                                'completion in time, but other jobs may try to run twice! Fix '
-                                'the consistency of your job store storage!', finished_job)
+                 logger.debug("Job %s is actually complete upon closer inspection", finished_job)
                  self.processRemovedJob(finished_job, result_status)
                  return False
          if replacement_job.logJobStoreFileID is not None:
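
The success path above now waits for the worker's committed update before trusting a zero exit status: a job that had a body must show a new version in the job store within --jobStoreTimeout, or the leader treats it as partitioned. Conceptually, reset_job_expecting_change (added to toil/toilState.py in this release, +68 -9 above) is a poll-until-changed-or-timeout primitive. A generic sketch of that shape, with hypothetical load_version and old_version stand-ins rather than toil's actual implementation:

    import time

    def wait_for_new_version(load_version, old_version, timeout: float, poll: float = 1.0) -> bool:
        # Return True iff load_version() reports something newer than
        # old_version within 'timeout' seconds. A timeout of 0 checks
        # exactly once, matching the failure path above.
        deadline = time.time() + timeout
        while True:
            if load_version() != old_version:
                return True
            if time.time() >= deadline:
                return False
            time.sleep(poll)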
toil/lib/aws/__init__.py CHANGED
@@ -16,11 +16,25 @@ import logging
  import os
  import re
  import socket
+ import toil.lib.retry
  from http.client import HTTPException
- from typing import Dict, MutableMapping, Optional
+ from typing import Dict, MutableMapping, Optional, Union, Literal
  from urllib.error import URLError
  from urllib.request import urlopen

+ from botocore.exceptions import ClientError
+
+ from mypy_boto3_s3.literals import BucketLocationConstraintType
+
+ AWSRegionName = Union[BucketLocationConstraintType, Literal["us-east-1"]]
+
+ # These are errors where we think something randomly
+ # went wrong on the AWS side and we ought to retry.
+ AWSServerErrors = toil.lib.retry.ErrorCondition(
+     error=ClientError,
+     error_codes=[404, 500, 502, 503, 504]
+ )
+
  logger = logging.getLogger(__name__)

  # This file isn't allowed to import anything that depends on Boto or Boto3,
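
AWSServerErrors centralizes the retry policy that 6.1.0 copy-pasted as inline ErrorCondition blocks (see the uploadFromPath, uploadFile, and copyKeyMultipart hunks above). Any AWS-touching helper can now opt in with a single decorator; a minimal sketch, where delete_key and its arguments are hypothetical:

    from toil.lib.aws import AWSServerErrors
    from toil.lib.retry import retry

    @retry(errors=[AWSServerErrors])
    def delete_key(s3_client, bucket: str, key: str) -> None:
        # Retried automatically on botocore ClientError with HTTP status
        # 404, 500, 502, 503, or 504, per the condition defined above.
        s3_client.delete_object(Bucket=bucket, Key=key)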
@@ -67,11 +81,10 @@ def get_aws_zone_from_metadata() -> Optional[str]:
      # metadata.
      try:
          # Use the EC2 metadata service
-         import boto
-         str(boto) # to prevent removal of the import
-         from boto.utils import get_instance_metadata
+         from ec2_metadata import ec2_metadata
+
          logger.debug("Fetch AZ from EC2 metadata")
-         return get_instance_metadata()['placement']['availability-zone']
+         return ec2_metadata.availability_zone
      except ImportError:
          # This is expected to happen a lot
          logger.debug("No boto to fetch ECS metadata")
@@ -82,12 +95,15 @@ def get_aws_zone_from_metadata() -> Optional[str]:
  def get_aws_zone_from_boto() -> Optional[str]:
      """
-     Get the AWS zone from the Boto config file, if it is configured and the
-     boto module is available.
+     Get the AWS zone from the Boto3 config file or from AWS_DEFAULT_REGION, if it is configured and the
+     boto3 module is available.
      """
      try:
-         import boto
-         zone = boto.config.get('Boto', 'ec2_region_name')
+         import boto3
+         from session import client
+         boto3_session = boto3.session.Session()
+         # this should check AWS_DEFAULT_REGION and ~/.aws/config
+         zone = boto3_session.region_name
          if zone is not None:
              zone += 'a' # derive an availability zone in the region
          return zone
@@ -128,7 +144,7 @@ def get_current_aws_zone() -> Optional[str]:
             get_aws_zone_from_environment_region() or \
             get_aws_zone_from_boto()

- def zone_to_region(zone: str) -> str:
+ def zone_to_region(zone: str) -> AWSRegionName:
      """Get a region (e.g. us-west-2) from a zone (e.g. us-west-1c)."""
      # re.compile() caches the regex internally so we don't have to
      availability_zone = re.compile(r'^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$')
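
Per the regex, a zone is a region plus one trailing letter, and the new AWSRegionName return type lets mypy check results against boto3's known region literals. For example (assuming toil 7.0.0):

    from toil.lib.aws import zone_to_region

    assert zone_to_region("us-west-2a") == "us-west-2"
    assert zone_to_region("eu-central-1b") == "eu-central-1"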
toil/lib/aws/iam.py CHANGED
@@ -257,8 +257,8 @@ def get_policy_permissions(region: str) -> AllowedActionCollection:
      :param zone: AWS zone to connect to
      """

-     iam: IAMClient = cast(IAMClient, get_client('iam', region))
-     sts: STSClient = cast(STSClient, get_client('sts', region))
+     iam: IAMClient = get_client('iam', region)
+     sts: STSClient = get_client('sts', region)
      #TODO Condider effect: deny at some point
      allowed_actions: AllowedActionCollection = defaultdict(lambda: {'Action': [], 'NotAction': []})
      try:
toil/lib/aws/session.py CHANGED
@@ -15,16 +15,21 @@ import collections
  import logging
  import os
  import threading
- from typing import Dict, Optional, Tuple, cast
+ from typing import Dict, Optional, Tuple, cast, Union, Literal, overload, TypeVar

  import boto3
  import boto3.resources.base
- import boto.connection
  import botocore
  from boto3 import Session
  from botocore.client import Config
  from botocore.session import get_session
  from botocore.utils import JSONFileCache
+ from mypy_boto3_autoscaling import AutoScalingClient
+ from mypy_boto3_ec2 import EC2Client, EC2ServiceResource
+ from mypy_boto3_iam import IAMClient, IAMServiceResource
+ from mypy_boto3_s3 import S3Client, S3ServiceResource
+ from mypy_boto3_sdb import SimpleDBClient
+ from mypy_boto3_sts import STSClient

  logger = logging.getLogger(__name__)

@@ -120,6 +125,13 @@ class AWSConnectionManager:
              storage.item = _new_boto3_session(region_name=region)
          return cast(boto3.session.Session, storage.item)

+     @overload
+     def resource(self, region: Optional[str], service_name: Literal["s3"], endpoint_url: Optional[str] = None) -> S3ServiceResource: ...
+     @overload
+     def resource(self, region: Optional[str], service_name: Literal["iam"], endpoint_url: Optional[str] = None) -> IAMServiceResource: ...
+     @overload
+     def resource(self, region: Optional[str], service_name: Literal["ec2"], endpoint_url: Optional[str] = None) -> EC2ServiceResource: ...
+
      def resource(self, region: Optional[str], service_name: str, endpoint_url: Optional[str] = None) -> boto3.resources.base.ServiceResource:
          """
          Get the Boto3 Resource to use with the given service (like 'ec2') in the given region.
@@ -146,7 +158,28 @@ class AWSConnectionManager:

          return cast(boto3.resources.base.ServiceResource, storage.item)

-     def client(self, region: Optional[str], service_name: str, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> botocore.client.BaseClient:
+     @overload
+     def client(self, region: Optional[str], service_name: Literal["ec2"], endpoint_url: Optional[str] = None,
+                config: Optional[Config] = None) -> EC2Client: ...
+     @overload
+     def client(self, region: Optional[str], service_name: Literal["iam"], endpoint_url: Optional[str] = None,
+                config: Optional[Config] = None) -> IAMClient: ...
+     @overload
+     def client(self, region: Optional[str], service_name: Literal["s3"], endpoint_url: Optional[str] = None,
+                config: Optional[Config] = None) -> S3Client: ...
+     @overload
+     def client(self, region: Optional[str], service_name: Literal["sts"], endpoint_url: Optional[str] = None,
+                config: Optional[Config] = None) -> STSClient: ...
+     @overload
+     def client(self, region: Optional[str], service_name: Literal["sdb"], endpoint_url: Optional[str] = None,
+                config: Optional[Config] = None) -> SimpleDBClient: ...
+     @overload
+     def client(self, region: Optional[str], service_name: Literal["autoscaling"], endpoint_url: Optional[str] = None,
+                config: Optional[Config] = None) -> AutoScalingClient: ...
+
+
+     def client(self, region: Optional[str], service_name: Literal["ec2", "iam", "s3", "sts", "sdb", "autoscaling"], endpoint_url: Optional[str] = None,
+                config: Optional[Config] = None) -> botocore.client.BaseClient:
          """
          Get the Boto3 Client to use with the given service (like 'ec2') in the given region.

@@ -159,9 +192,9 @@ class AWSConnectionManager:
          # Don't try and memoize if a custom config is used
          with _init_lock:
              if endpoint_url is not None:
-                 return self.session(region).client(service_name, endpoint_url=endpoint_url, config=config) # type: ignore
+                 return self.session(region).client(service_name, endpoint_url=endpoint_url, config=config)
              else:
-                 return self.session(region).client(service_name, config=config) # type: ignore
+                 return self.session(region).client(service_name, config=config)

          key = (region, service_name, endpoint_url)
          storage = self.client_cache[key]
@@ -172,25 +205,12 @@ class AWSConnectionManager:
              if endpoint_url is not None:
                  # The Boto3 stubs are probably missing an overload here too. See:
                  # <https://github.com/vemel/mypy_boto3_builder/issues/121#issuecomment-1011322636>
-                 storage.item = self.session(region).client(service_name, endpoint_url=endpoint_url) # type: ignore
+                 storage.item = self.session(region).client(service_name, endpoint_url=endpoint_url)
              else:
                  # We might not be able to pass None to Boto3 and have it be the same as no argument.
-                 storage.item = self.session(region).client(service_name) # type: ignore
+                 storage.item = self.session(region).client(service_name)
          return cast(botocore.client.BaseClient , storage.item)

-     def boto2(self, region: Optional[str], service_name: str) -> boto.connection.AWSAuthConnection:
-         """
-         Get the connected boto2 connection for the given region and service.
-         """
-         if service_name == 'iam':
-             # IAM connections are regionless
-             region = 'universal'
-         key = (region, service_name)
-         storage = self.boto2_cache[key]
-         if not hasattr(storage, 'item'):
-             with _init_lock:
-                 storage.item = getattr(boto, service_name).connect_to_region(region, profile_name=os.environ.get("TOIL_AWS_PROFILE", None))
-         return cast(boto.connection.AWSAuthConnection, storage.item)

  # If you don't want your own AWSConnectionManager, we have a global one and some global functions
  _global_manager = AWSConnectionManager()
@@ -205,7 +225,20 @@ def establish_boto3_session(region_name: Optional[str] = None) -> Session:
      # Just use a global version of the manager. Note that we change the argument order!
      return _global_manager.session(region_name)

- def client(service_name: str, region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> botocore.client.BaseClient:
+ @overload
+ def client(service_name: Literal["ec2"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> EC2Client: ...
+ @overload
+ def client(service_name: Literal["iam"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> IAMClient: ...
+ @overload
+ def client(service_name: Literal["s3"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> S3Client: ...
+ @overload
+ def client(service_name: Literal["sts"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> STSClient: ...
+ @overload
+ def client(service_name: Literal["sdb"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> SimpleDBClient: ...
+ @overload
+ def client(service_name: Literal["autoscaling"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> AutoScalingClient: ...
+
+ def client(service_name: Literal["ec2", "iam", "s3", "sts", "sdb", "autoscaling"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> botocore.client.BaseClient:
      """
      Get a Boto 3 client for a particular AWS service, usable by the current thread.
@@ -215,7 +248,14 @@ def client(service_name: str, region_name: Optional[str] = None, endpoint_url: O
      # Just use a global version of the manager. Note that we change the argument order!
      return _global_manager.client(region_name, service_name, endpoint_url=endpoint_url, config=config)

- def resource(service_name: str, region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> boto3.resources.base.ServiceResource:
+ @overload
+ def resource(service_name: Literal["s3"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> S3ServiceResource: ...
+ @overload
+ def resource(service_name: Literal["iam"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> IAMServiceResource: ...
+ @overload
+ def resource(service_name: Literal["ec2"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> EC2ServiceResource: ...
+
+ def resource(service_name: Literal["s3", "iam", "ec2"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> boto3.resources.base.ServiceResource:
      """
      Get a Boto 3 resource for a particular AWS service, usable by the current thread.
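
With these Literal overloads in place, callers of the module-level helpers get precise mypy_boto3 stub types without the casts that 6.1.0 needed (compare the toil/lib/aws/iam.py hunk above, which drops them). A sketch of the payoff, assuming AWS credentials are configured and using a placeholder bucket name:

    from toil.lib.aws.session import client, resource

    s3 = client("s3")      # mypy infers mypy_boto3_s3.S3Client
    ec2 = resource("ec2")  # mypy infers mypy_boto3_ec2.EC2ServiceResource

    s3.head_bucket(Bucket="example-bucket")  # service-specific calls now type-check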