toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +22 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/contained_executor.py +4 -5
  6. toil/batchSystems/gridengine.py +1 -1
  7. toil/batchSystems/htcondor.py +5 -5
  8. toil/batchSystems/kubernetes.py +25 -11
  9. toil/batchSystems/local_support.py +3 -3
  10. toil/batchSystems/lsf.py +2 -2
  11. toil/batchSystems/mesos/batchSystem.py +4 -4
  12. toil/batchSystems/mesos/executor.py +3 -2
  13. toil/batchSystems/options.py +9 -0
  14. toil/batchSystems/singleMachine.py +11 -10
  15. toil/batchSystems/slurm.py +64 -22
  16. toil/batchSystems/torque.py +1 -1
  17. toil/bus.py +7 -3
  18. toil/common.py +36 -13
  19. toil/cwl/cwltoil.py +365 -312
  20. toil/deferred.py +1 -1
  21. toil/fileStores/abstractFileStore.py +17 -17
  22. toil/fileStores/cachingFileStore.py +2 -2
  23. toil/fileStores/nonCachingFileStore.py +1 -1
  24. toil/job.py +228 -60
  25. toil/jobStores/abstractJobStore.py +18 -10
  26. toil/jobStores/aws/jobStore.py +280 -218
  27. toil/jobStores/aws/utils.py +57 -29
  28. toil/jobStores/conftest.py +2 -2
  29. toil/jobStores/fileJobStore.py +2 -2
  30. toil/jobStores/googleJobStore.py +3 -4
  31. toil/leader.py +72 -24
  32. toil/lib/aws/__init__.py +26 -10
  33. toil/lib/aws/iam.py +2 -2
  34. toil/lib/aws/session.py +62 -22
  35. toil/lib/aws/utils.py +73 -37
  36. toil/lib/conversions.py +5 -1
  37. toil/lib/ec2.py +118 -69
  38. toil/lib/expando.py +1 -1
  39. toil/lib/io.py +14 -2
  40. toil/lib/misc.py +1 -3
  41. toil/lib/resources.py +55 -21
  42. toil/lib/retry.py +12 -5
  43. toil/lib/threading.py +2 -2
  44. toil/lib/throttle.py +1 -1
  45. toil/options/common.py +27 -24
  46. toil/provisioners/__init__.py +9 -3
  47. toil/provisioners/abstractProvisioner.py +9 -7
  48. toil/provisioners/aws/__init__.py +20 -15
  49. toil/provisioners/aws/awsProvisioner.py +406 -329
  50. toil/provisioners/gceProvisioner.py +2 -2
  51. toil/provisioners/node.py +13 -5
  52. toil/server/app.py +1 -1
  53. toil/statsAndLogging.py +58 -16
  54. toil/test/__init__.py +27 -12
  55. toil/test/batchSystems/batchSystemTest.py +40 -33
  56. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  57. toil/test/batchSystems/test_slurm.py +1 -1
  58. toil/test/cwl/cwlTest.py +8 -91
  59. toil/test/cwl/seqtk_seq.cwl +1 -1
  60. toil/test/docs/scriptsTest.py +10 -13
  61. toil/test/jobStores/jobStoreTest.py +33 -49
  62. toil/test/lib/aws/test_iam.py +2 -2
  63. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  64. toil/test/provisioners/clusterTest.py +90 -8
  65. toil/test/server/serverTest.py +2 -2
  66. toil/test/src/autoDeploymentTest.py +1 -1
  67. toil/test/src/dockerCheckTest.py +2 -1
  68. toil/test/src/environmentTest.py +125 -0
  69. toil/test/src/fileStoreTest.py +1 -1
  70. toil/test/src/jobDescriptionTest.py +18 -8
  71. toil/test/src/jobTest.py +1 -1
  72. toil/test/src/realtimeLoggerTest.py +4 -0
  73. toil/test/src/workerTest.py +52 -19
  74. toil/test/utils/toilDebugTest.py +61 -3
  75. toil/test/utils/utilsTest.py +20 -18
  76. toil/test/wdl/wdltoil_test.py +24 -71
  77. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  78. toil/toilState.py +68 -9
  79. toil/utils/toilDebugJob.py +153 -26
  80. toil/utils/toilLaunchCluster.py +12 -2
  81. toil/utils/toilRsyncCluster.py +7 -2
  82. toil/utils/toilSshCluster.py +7 -3
  83. toil/utils/toilStats.py +2 -1
  84. toil/utils/toilStatus.py +97 -51
  85. toil/version.py +10 -10
  86. toil/wdl/wdltoil.py +318 -51
  87. toil/worker.py +96 -69
  88. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  89. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
  90. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
  91. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  92. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  93. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/lib/ec2.py CHANGED
@@ -1,13 +1,11 @@
1
1
  import logging
2
2
  import time
3
3
  from base64 import b64encode
4
- from operator import attrgetter
5
- from typing import Dict, Iterable, List, Optional, Union
4
+ from operator import itemgetter
5
+ from typing import Dict, Iterable, List, Optional, Union, TYPE_CHECKING, Generator, Callable, Mapping, Any
6
6
 
7
+ import botocore.client
7
8
  from boto3.resources.base import ServiceResource
8
- from boto.ec2.instance import Instance as Boto2Instance
9
- from boto.ec2.spotinstancerequest import SpotInstanceRequest
10
- from botocore.client import BaseClient
11
9
 
12
10
  from toil.lib.aws.session import establish_boto3_session
13
11
  from toil.lib.aws.utils import flatten_tags
@@ -18,6 +16,11 @@ from toil.lib.retry import (ErrorCondition,
18
16
  old_retry,
19
17
  retry)
20
18
 
19
+ from mypy_boto3_ec2.client import EC2Client
20
+ from mypy_boto3_autoscaling.client import AutoScalingClient
21
+ from mypy_boto3_ec2.type_defs import SpotInstanceRequestTypeDef, DescribeInstancesResultTypeDef, InstanceTypeDef
22
+ from mypy_boto3_ec2.service_resource import EC2ServiceResource, Instance
23
+
21
24
  a_short_time = 5
22
25
  a_long_time = 60 * 60
23
26
  logger = logging.getLogger(__name__)
@@ -38,6 +41,7 @@ def not_found(e):
38
41
  # Not the right kind of error
39
42
  return False
40
43
 
44
+
41
45
  def inconsistencies_detected(e):
42
46
  if get_error_code(e) == 'InvalidGroup.NotFound':
43
47
  return True
@@ -45,6 +49,7 @@ def inconsistencies_detected(e):
45
49
  matches = ('invalid iam instance profile' in m) or ('no associated iam roles' in m)
46
50
  return matches
47
51
 
52
+
48
53
  # We also define these error categories for the new retry decorator
49
54
  INCONSISTENCY_ERRORS = [ErrorCondition(boto_error_codes=['InvalidGroup.NotFound']),
50
55
  ErrorCondition(error_message_must_include='Invalid IAM Instance Profile'),
@@ -62,9 +67,10 @@ class UnexpectedResourceState(Exception):
62
67
  super().__init__(
63
68
  "Expected state of %s to be '%s' but got '%s'" %
64
69
  (resource, to_state, state))
65
-
66
- def wait_transition(resource, from_states, to_state,
67
- state_getter=attrgetter('state')):
70
+
71
+
72
+ def wait_transition(boto3_ec2: EC2Client, resource: InstanceTypeDef, from_states: Iterable[str], to_state: str,
73
+ state_getter: Callable[[InstanceTypeDef], str]=lambda x: x.get('State').get('Name')):
68
74
  """
69
75
  Wait until the specified EC2 resource (instance, image, volume, ...) transitions from any
70
76
  of the given 'from' states to the specified 'to' state. If the instance is found in a state
@@ -76,41 +82,44 @@ def wait_transition(resource, from_states, to_state,
76
82
  :param to_state: the state of the resource when this method returns
77
83
  """
78
84
  state = state_getter(resource)
85
+ instance_id = resource["InstanceId"]
79
86
  while state in from_states:
80
87
  time.sleep(a_short_time)
81
88
  for attempt in retry_ec2():
82
89
  with attempt:
83
- resource.update(validate=True)
90
+ described = boto3_ec2.describe_instances(InstanceIds=[instance_id])
91
+ resource = described["Reservations"][0]["Instances"][0] # there should only be one requested
84
92
  state = state_getter(resource)
85
93
  if state != to_state:
86
94
  raise UnexpectedResourceState(resource, to_state, state)
87
95
 
88
96
 
89
- def wait_instances_running(ec2, instances: Iterable[Boto2Instance]) -> Iterable[Boto2Instance]:
97
+ def wait_instances_running(boto3_ec2: EC2Client, instances: Iterable[InstanceTypeDef]) -> Generator[InstanceTypeDef, None, None]:
90
98
  """
91
99
  Wait until no instance in the given iterable is 'pending'. Yield every instance that
92
100
  entered the running state as soon as it does.
93
101
 
94
- :param boto.ec2.connection.EC2Connection ec2: the EC2 connection to use for making requests
95
- :param Iterable[Boto2Instance] instances: the instances to wait on
96
- :rtype: Iterable[Boto2Instance]
102
+ :param EC2Client boto3_ec2: the EC2 connection to use for making requests
103
+ :param Iterable[InstanceTypeDef] instances: the instances to wait on
104
+ :rtype: Iterable[InstanceTypeDef]
97
105
  """
98
106
  running_ids = set()
99
107
  other_ids = set()
100
108
  while True:
101
109
  pending_ids = set()
102
110
  for i in instances:
103
- if i.state == 'pending':
104
- pending_ids.add(i.id)
105
- elif i.state == 'running':
106
- if i.id in running_ids:
111
+ i: InstanceTypeDef
112
+ if i['State']['Name'] == 'pending':
113
+ pending_ids.add(i['InstanceId'])
114
+ elif i['State']['Name'] == 'running':
115
+ if i['InstanceId'] in running_ids:
107
116
  raise RuntimeError("An instance was already added to the list of running instance IDs. Maybe there is a duplicate.")
108
- running_ids.add(i.id)
117
+ running_ids.add(i['InstanceId'])
109
118
  yield i
110
119
  else:
111
- if i.id in other_ids:
120
+ if i['InstanceId'] in other_ids:
112
121
  raise RuntimeError("An instance was already added to the list of other instances. Maybe there is a duplicate.")
113
- other_ids.add(i.id)
122
+ other_ids.add(i['InstanceId'])
114
123
  yield i
115
124
  logger.info('%i instance(s) pending, %i running, %i other.',
116
125
  *list(map(len, (pending_ids, running_ids, other_ids))))
@@ -121,14 +130,16 @@ def wait_instances_running(ec2, instances: Iterable[Boto2Instance]) -> Iterable[
121
130
  time.sleep(seconds)
122
131
  for attempt in retry_ec2():
123
132
  with attempt:
124
- instances = ec2.get_only_instances(list(pending_ids))
133
+ described_instances = boto3_ec2.describe_instances(InstanceIds=list(pending_ids))
134
+ instances = [instance for reservation in described_instances["Reservations"] for instance in reservation["Instances"]]
125
135
 
126
136
 
127
- def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], timeout: float = None, tentative: bool = False) -> Iterable[List[SpotInstanceRequest]]:
137
+ def wait_spot_requests_active(boto3_ec2: EC2Client, requests: Iterable[SpotInstanceRequestTypeDef], timeout: float = None, tentative: bool = False) -> Iterable[List[SpotInstanceRequestTypeDef]]:
128
138
  """
129
139
  Wait until no spot request in the given iterator is in the 'open' state or, optionally,
130
140
  a timeout occurs. Yield spot requests as soon as they leave the 'open' state.
131
141
 
142
+ :param boto3_ec2: ec2 client
132
143
  :param requests: The requests to wait on.
133
144
 
134
145
  :param timeout: Maximum time in seconds to spend waiting or None to wait forever. If a
@@ -145,11 +156,11 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
145
156
  other_ids = set()
146
157
  open_ids = None
147
158
 
148
- def cancel():
159
+ def cancel() -> None:
149
160
  logger.warning('Cancelling remaining %i spot requests.', len(open_ids))
150
- ec2.cancel_spot_instance_requests(list(open_ids))
161
+ boto3_ec2.cancel_spot_instance_requests(SpotInstanceRequestIds=list(open_ids))
151
162
 
152
- def spot_request_not_found(e):
163
+ def spot_request_not_found(e: Exception) -> bool:
153
164
  return get_error_code(e) == 'InvalidSpotInstanceRequestID.NotFound'
154
165
 
155
166
  try:
@@ -157,30 +168,31 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
157
168
  open_ids, eval_ids, fulfill_ids = set(), set(), set()
158
169
  batch = []
159
170
  for r in requests:
160
- if r.state == 'open':
161
- open_ids.add(r.id)
162
- if r.status.code == 'pending-evaluation':
163
- eval_ids.add(r.id)
164
- elif r.status.code == 'pending-fulfillment':
165
- fulfill_ids.add(r.id)
171
+ r: SpotInstanceRequestTypeDef # pycharm thinks it is a string
172
+ if r['State'] == 'open':
173
+ open_ids.add(r['InstanceId'])
174
+ if r['Status'] == 'pending-evaluation':
175
+ eval_ids.add(r['InstanceId'])
176
+ elif r['Status'] == 'pending-fulfillment':
177
+ fulfill_ids.add(r['InstanceId'])
166
178
  else:
167
179
  logger.info(
168
180
  'Request %s entered status %s indicating that it will not be '
169
- 'fulfilled anytime soon.', r.id, r.status.code)
170
- elif r.state == 'active':
171
- if r.id in active_ids:
181
+ 'fulfilled anytime soon.', r['InstanceId'], r['Status'])
182
+ elif r['State'] == 'active':
183
+ if r['InstanceId'] in active_ids:
172
184
  raise RuntimeError("A request was already added to the list of active requests. Maybe there are duplicate requests.")
173
- active_ids.add(r.id)
185
+ active_ids.add(r['InstanceId'])
174
186
  batch.append(r)
175
187
  else:
176
- if r.id in other_ids:
188
+ if r['InstanceId'] in other_ids:
177
189
  raise RuntimeError("A request was already added to the list of other IDs. Maybe there are duplicate requests.")
178
- other_ids.add(r.id)
190
+ other_ids.add(r['InstanceId'])
179
191
  batch.append(r)
180
192
  if batch:
181
193
  yield batch
182
194
  logger.info('%i spot requests(s) are open (%i of which are pending evaluation and %i '
183
- 'are pending fulfillment), %i are active and %i are in another state.',
195
+ 'are pending fulfillment), %i are active and %i are in another state.',
184
196
  *list(map(len, (open_ids, eval_ids, fulfill_ids, active_ids, other_ids))))
185
197
  if not open_ids or tentative and not eval_ids and not fulfill_ids:
186
198
  break
@@ -192,8 +204,7 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
192
204
  time.sleep(sleep_time)
193
205
  for attempt in retry_ec2(retry_while=spot_request_not_found):
194
206
  with attempt:
195
- requests = ec2.get_all_spot_instance_requests(
196
- list(open_ids))
207
+ requests = boto3_ec2.describe_spot_instance_requests(SpotInstanceRequestIds=list(open_ids))
197
208
  except BaseException:
198
209
  if open_ids:
199
210
  with panic(logger):
@@ -204,47 +215,56 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
204
215
  cancel()
205
216
 
206
217
 
207
- def create_spot_instances(ec2, price, image_id, spec, num_instances=1, timeout=None, tentative=False, tags=None) -> Iterable[List[Boto2Instance]]:
218
+ def create_spot_instances(boto3_ec2: EC2Client, price, image_id, spec, num_instances=1, timeout=None, tentative=False, tags=None) -> Generator[DescribeInstancesResultTypeDef, None, None]:
208
219
  """
209
220
  Create instances on the spot market.
210
221
  """
222
+
211
223
  def spotRequestNotFound(e):
212
224
  return getattr(e, 'error_code', None) == "InvalidSpotInstanceRequestID.NotFound"
213
225
 
226
+ spec['LaunchSpecification'].update({'ImageId': image_id}) # boto3 image id is in the launch specification
214
227
  for attempt in retry_ec2(retry_for=a_long_time,
215
228
  retry_while=inconsistencies_detected):
216
229
  with attempt:
217
- requests = ec2.request_spot_instances(
218
- price, image_id, count=num_instances, **spec)
230
+ requests_dict = boto3_ec2.request_spot_instances(
231
+ SpotPrice=price, InstanceCount=num_instances, **spec)
232
+ requests = requests_dict['SpotInstanceRequests']
219
233
 
220
234
  if tags is not None:
221
- for requestID in (request.id for request in requests):
235
+ for requestID in (request['SpotInstanceRequestId'] for request in requests):
222
236
  for attempt in retry_ec2(retry_while=spotRequestNotFound):
223
237
  with attempt:
224
- ec2.create_tags([requestID], tags)
238
+ boto3_ec2.create_tags(Resources=[requestID], Tags=tags)
225
239
 
226
240
  num_active, num_other = 0, 0
227
241
  # noinspection PyUnboundLocalVariable,PyTypeChecker
228
242
  # request_spot_instances's type annotation is wrong
229
- for batch in wait_spot_requests_active(ec2,
243
+ for batch in wait_spot_requests_active(boto3_ec2,
230
244
  requests,
231
245
  timeout=timeout,
232
246
  tentative=tentative):
233
247
  instance_ids = []
234
248
  for request in batch:
235
- if request.state == 'active':
236
- instance_ids.append(request.instance_id)
249
+ request: SpotInstanceRequestTypeDef
250
+ if request["State"] == 'active':
251
+ instance_ids.append(request["InstanceId"])
237
252
  num_active += 1
238
253
  else:
239
254
  logger.info(
240
255
  'Request %s in unexpected state %s.',
241
- request.id,
242
- request.state)
256
+ request["InstanceId"],
257
+ request["State"])
243
258
  num_other += 1
244
259
  if instance_ids:
245
260
  # This next line is the reason we batch. It's so we can get multiple instances in
246
261
  # a single request.
247
- yield ec2.get_only_instances(instance_ids)
262
+ for instance_id in instance_ids:
263
+ for attempt in retry_ec2():
264
+ with attempt:
265
+ # Increase hop limit from 1 to use Instance Metadata V2
266
+ boto3_ec2.modify_instance_metadata_options(InstanceId=instance_id, HttpPutResponseHopLimit=3)
267
+ yield boto3_ec2.describe_instances(InstanceIds=instance_ids)
248
268
  if not num_active:
249
269
  message = 'None of the spot requests entered the active state'
250
270
  if tentative:
@@ -255,22 +275,43 @@ def create_spot_instances(ec2, price, image_id, spec, num_instances=1, timeout=N
255
275
  logger.warning('%i request(s) entered a state other than active.', num_other)
256
276
 
257
277
 
258
- def create_ondemand_instances(ec2, image_id, spec, num_instances=1) -> List[Boto2Instance]:
278
+ def create_ondemand_instances(boto3_ec2: EC2Client, image_id: str, spec: Mapping[str, Any], num_instances: int=1) -> List[InstanceTypeDef]:
259
279
  """
260
280
  Requests the RunInstances EC2 API call but accounts for the race between recently created
261
281
  instance profiles, IAM roles and an instance creation that refers to them.
262
282
 
263
- :rtype: List[Boto2Instance]
283
+ :rtype: List[InstanceTypeDef]
264
284
  """
265
- instance_type = spec['instance_type']
285
+ instance_type = spec['InstanceType']
266
286
  logger.info('Creating %s instance(s) ... ', instance_type)
287
+ boto_instance_list = []
267
288
  for attempt in retry_ec2(retry_for=a_long_time,
268
289
  retry_while=inconsistencies_detected):
269
290
  with attempt:
270
- return ec2.run_instances(image_id,
271
- min_count=num_instances,
272
- max_count=num_instances,
273
- **spec).instances
291
+ boto_instance_list: List[InstanceTypeDef] = boto3_ec2.run_instances(ImageId=image_id,
292
+ MinCount=num_instances,
293
+ MaxCount=num_instances,
294
+ **spec)['Instances']
295
+
296
+ return boto_instance_list
297
+
298
+
299
+ def increase_instance_hop_limit(boto3_ec2: EC2Client, boto_instance_list: List[InstanceTypeDef]) -> None:
300
+ """
301
+ Increase the default HTTP hop limit, as we are running Toil and Kubernetes inside a Docker container, so the default
302
+ hop limit of 1 will not be enough when grabbing metadata information with ec2_metadata
303
+
304
+ Must be called after the instances are guaranteed to be running.
305
+
306
+ :param boto_instance_list: List of boto instances to modify
307
+ :return:
308
+ """
309
+ for boto_instance in boto_instance_list:
310
+ instance_id = boto_instance['InstanceId']
311
+ for attempt in retry_ec2():
312
+ with attempt:
313
+ # Increase hop limit from 1 to use Instance Metadata V2
314
+ boto3_ec2.modify_instance_metadata_options(InstanceId=instance_id, HttpPutResponseHopLimit=3)
274
315
 
275
316
 
276
317
  def prune(bushy: dict) -> dict:
@@ -289,6 +330,7 @@ def prune(bushy: dict) -> dict:
289
330
  # catch, and to wait on IAM items.
290
331
  iam_client = establish_boto3_session().client('iam')
291
332
 
333
+
292
334
  # exception is generated by a factory so we weirdly need a client instance to reference it
293
335
  @retry(errors=[iam_client.exceptions.NoSuchEntityException],
294
336
  intervals=[1, 1, 2, 4, 8, 16, 32, 64])
@@ -301,7 +343,7 @@ def wait_until_instance_profile_arn_exists(instance_profile_arn: str):
301
343
 
302
344
 
303
345
  @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS)
304
- def create_instances(ec2_resource: ServiceResource,
346
+ def create_instances(ec2_resource: EC2ServiceResource,
305
347
  image_id: str,
306
348
  key_name: str,
307
349
  instance_type: str,
@@ -312,7 +354,7 @@ def create_instances(ec2_resource: ServiceResource,
312
354
  instance_profile_arn: Optional[str] = None,
313
355
  placement_az: Optional[str] = None,
314
356
  subnet_id: str = None,
315
- tags: Optional[Dict[str, str]] = None) -> List[dict]:
357
+ tags: Optional[Dict[str, str]] = None) -> List[Instance]:
316
358
  """
317
359
  Replaces create_ondemand_instances. Uses boto3 and returns a list of Boto3 instance dicts.
318
360
 
@@ -336,7 +378,10 @@ def create_instances(ec2_resource: ServiceResource,
336
378
  'InstanceType': instance_type,
337
379
  'UserData': user_data,
338
380
  'BlockDeviceMappings': block_device_map,
339
- 'SubnetId': subnet_id}
381
+ 'SubnetId': subnet_id,
382
+ # Metadata V2 defaults hops to 1, which is an issue when running inside a docker container
383
+ # https://github.com/adamchainz/ec2-metadata?tab=readme-ov-file#instance-metadata-service-version-2
384
+ 'MetadataOptions': {'HttpPutResponseHopLimit': 3}}
340
385
 
341
386
  if instance_profile_arn:
342
387
  # We could just retry when we get an error because the ARN doesn't
@@ -357,8 +402,9 @@ def create_instances(ec2_resource: ServiceResource,
357
402
 
358
403
  return ec2_resource.create_instances(**prune(request))
359
404
 
405
+
360
406
  @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS)
361
- def create_launch_template(ec2_client: BaseClient,
407
+ def create_launch_template(ec2_client: EC2Client,
362
408
  template_name: str,
363
409
  image_id: str,
364
410
  key_name: str,
@@ -400,7 +446,10 @@ def create_launch_template(ec2_client: BaseClient,
400
446
  'InstanceType': instance_type,
401
447
  'UserData': user_data,
402
448
  'BlockDeviceMappings': block_device_map,
403
- 'SubnetId': subnet_id}
449
+ 'SubnetId': subnet_id,
450
+ # Increase hop limit from 1 to use Instance Metadata V2
451
+ 'MetadataOptions': {'HttpPutResponseHopLimit': 3}
452
+ }
404
453
 
405
454
  if instance_profile_arn:
406
455
  # We could just retry when we get an error because the ARN doesn't
@@ -413,6 +462,7 @@ def create_launch_template(ec2_client: BaseClient,
413
462
  if placement_az:
414
463
  template['Placement'] = {'AvailabilityZone': placement_az}
415
464
 
465
+ flat_tags = []
416
466
  if tags:
417
467
  # Tag everything when we make it.
418
468
  flat_tags = flatten_tags(tags)
@@ -429,17 +479,16 @@ def create_launch_template(ec2_client: BaseClient,
429
479
 
430
480
 
431
481
  @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS)
432
- def create_auto_scaling_group(autoscaling_client: BaseClient,
482
+ def create_auto_scaling_group(autoscaling_client: AutoScalingClient,
433
483
  asg_name: str,
434
484
  launch_template_ids: Dict[str, str],
435
485
  vpc_subnets: List[str],
436
486
  min_size: int,
437
487
  max_size: int,
438
- instance_types: Optional[List[str]] = None,
488
+ instance_types: Optional[Iterable[str]] = None,
439
489
  spot_bid: Optional[float] = None,
440
490
  spot_cheapest: bool = False,
441
491
  tags: Optional[Dict[str, str]] = None) -> None:
442
-
443
492
  """
444
493
  Create a new Auto Scaling Group with the given name (which is also its
445
494
  unique identifier).
@@ -472,7 +521,7 @@ def create_auto_scaling_group(autoscaling_client: BaseClient,
472
521
  """
473
522
 
474
523
  if instance_types is None:
475
- instance_types = []
524
+ instance_types: List[str] = []
476
525
 
477
526
  if instance_types is not None and len(instance_types) > 20:
478
527
  raise RuntimeError(f"Too many instance types ({len(instance_types)}) in group; AWS supports only 20.")
@@ -493,8 +542,8 @@ def create_auto_scaling_group(autoscaling_client: BaseClient,
493
542
  # We need to use a launch template per instance type so that different
494
543
  # instance types with specified EBS storage size overrides will get their
495
544
  # storage.
496
- mip = {'LaunchTemplate': {'LaunchTemplateSpecification': get_launch_template_spec(next(iter(instance_types))),
497
- 'Overrides': [{'InstanceType': t, 'LaunchTemplateSpecification': get_launch_template_spec(t)} for t in instance_types]}}
545
+ mip = {'LaunchTemplate': {'LaunchTemplateSpecification': get_launch_template_spec(next(iter(instance_types))), # noqa
546
+ 'Overrides': [{'InstanceType': t, 'LaunchTemplateSpecification': get_launch_template_spec(t)} for t in instance_types]}} # noqa
498
547
 
499
548
  if spot_bid is not None:
500
549
  # Ask for spot instances by saying everything above base capacity of 0 should be spot.
toil/lib/expando.py CHANGED
@@ -16,7 +16,7 @@
16
16
 
17
17
  class Expando(dict):
18
18
  """
19
- Pass inital attributes to the constructor:
19
+ Pass initial attributes to the constructor:
20
20
 
21
21
  >>> o = Expando(foo=42)
22
22
  >>> o.foo
toil/lib/io.py CHANGED
@@ -16,7 +16,7 @@ def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Opt
16
16
 
17
17
  The permissions on the directory will be 711 instead of 700, allowing the
18
18
  group and all other users to traverse the directory. This is necessary if
19
- the direcotry is on NFS and the Docker daemon would like to mount it or a
19
+ the directory is on NFS and the Docker daemon would like to mount it or a
20
20
  file inside it into a container, because on NFS even the Docker daemon
21
21
  appears bound by the file permissions.
22
22
 
@@ -159,14 +159,26 @@ def atomic_copyobj(src_fh: BytesIO, dest_path: str, length: int = 16384, executa
159
159
  os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR)
160
160
 
161
161
 
162
- def make_public_dir(in_directory: Optional[str] = None) -> str:
162
+ def make_public_dir(in_directory: str, suggested_name: Optional[str] = None) -> str:
163
163
  """
164
+ Make a publicly-accessible directory in the given directory.
165
+
166
+ :param suggested_name: Use this directory name first if possible.
167
+
164
168
  Try to make a random directory name with length 4 that doesn't exist, with the given prefix.
165
169
  Otherwise, try length 5, length 6, etc, up to a max of 32 (len of uuid4 with dashes replaced).
166
170
  This function's purpose is mostly to avoid having long file names when generating directories.
167
171
  If somehow this fails, which should be incredibly unlikely, default to a normal uuid4, which was
168
172
  our old default.
169
173
  """
174
+ if suggested_name is not None:
175
+ generated_dir_path: str = os.path.join(in_directory, suggested_name)
176
+ try:
177
+ os.mkdir(generated_dir_path)
178
+ os.chmod(generated_dir_path, 0o777)
179
+ return generated_dir_path
180
+ except FileExistsError:
181
+ pass
170
182
  for i in range(4, 32 + 1): # make random uuids and truncate to lengths starting at 4 and working up to max 32
171
183
  for _ in range(10): # make 10 attempts for each length
172
184
  truncated_uuid: str = str(uuid.uuid4()).replace('-', '')[:i]
toil/lib/misc.py CHANGED
@@ -11,8 +11,6 @@ import typing
11
11
  from contextlib import closing
12
12
  from typing import Iterator, List, Optional
13
13
 
14
- import pytz
15
-
16
14
  logger = logging.getLogger(__name__)
17
15
 
18
16
 
@@ -56,7 +54,7 @@ def get_user_name() -> str:
56
54
 
57
55
  def utc_now() -> datetime.datetime:
58
56
  """Return a datetime in the UTC timezone corresponding to right now."""
59
- return datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
57
+ return datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
60
58
 
61
59
  def unix_now_ms() -> float:
62
60
  """Return the current time in milliseconds since the Unix epoch."""
toil/lib/resources.py CHANGED
@@ -18,29 +18,63 @@ import sys
18
18
  import resource
19
19
  from typing import List, Tuple
20
20
 
21
-
22
- def get_total_cpu_time_and_memory_usage() -> Tuple[float, int]:
21
+ class ResourceMonitor:
23
22
  """
24
- Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
25
- itself and its single largest child (in kibibytes).
23
+ Global resource monitoring widget.
24
+
25
+ Presents class methods to get the resource usage of this process and child
26
+ processes, and other class methods to adjust the statistics so they can
27
+ account for e.g. resources used inside containers, or other resource usage
28
+ that *should* be billable to the current process.
26
29
  """
27
- me = resource.getrusage(resource.RUSAGE_SELF)
28
- children = resource.getrusage(resource.RUSAGE_CHILDREN)
29
- total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime
30
- total_memory_usage = me.ru_maxrss + children.ru_maxrss
31
- if sys.platform == "darwin":
32
- # On Linux, getrusage works in "kilobytes" (really kibibytes), but on
33
- # Mac it works in bytes. See
34
- # <https://github.com/python/cpython/issues/74698>
35
- total_memory_usage = int(math.ceil(total_memory_usage / 1024))
36
- return total_cpu_time, total_memory_usage
37
-
38
-
39
- def get_total_cpu_time() -> float:
40
- """Gives the total cpu time, including the children."""
41
- me = resource.getrusage(resource.RUSAGE_SELF)
42
- childs = resource.getrusage(resource.RUSAGE_CHILDREN)
43
- return me.ru_utime + me.ru_stime + childs.ru_utime + childs.ru_stime
30
+
31
+ # Store some extra usage to tack onto the stats as module-level globals
32
+ _extra_cpu_seconds: float = 0
33
+ _extra_memory_ki: int = 0
34
+
35
+ @classmethod
36
+ def record_extra_memory(cls, peak_ki: int) -> None:
37
+ """
38
+ Become responsible for the given peak memory usage, in kibibytes.
39
+
40
+ The memory will be treated as if it was used by a child process at the time
41
+ our real child processes were also using their peak memory.
42
+ """
43
+ cls._extra_memory_ki = max(cls._extra_memory_ki, peak_ki)
44
+
45
+ @classmethod
46
+ def record_extra_cpu(cls, seconds: float) -> None:
47
+ """
48
+ Become responsible for the given CPU time.
49
+
50
+ The CPU time will be treated as if it had been used by a child process.
51
+ """
52
+ cls._extra_cpu_seconds += seconds
53
+
54
+ @classmethod
55
+ def get_total_cpu_time_and_memory_usage(cls) -> Tuple[float, int]:
56
+ """
57
+ Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
58
+ itself and its single largest child (in kibibytes).
59
+ """
60
+ me = resource.getrusage(resource.RUSAGE_SELF)
61
+ children = resource.getrusage(resource.RUSAGE_CHILDREN)
62
+ total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime + cls._extra_cpu_seconds
63
+ total_memory_usage = me.ru_maxrss + children.ru_maxrss
64
+ if sys.platform == "darwin":
65
+ # On Linux, getrusage works in "kilobytes" (really kibibytes), but on
66
+ # Mac it works in bytes. See
67
+ # <https://github.com/python/cpython/issues/74698>
68
+ total_memory_usage = int(math.ceil(total_memory_usage / 1024))
69
+ total_memory_usage += cls._extra_memory_ki
70
+ return total_cpu_time, total_memory_usage
71
+
72
+ @classmethod
73
+ def get_total_cpu_time(cls) -> float:
74
+ """Gives the total cpu time, including the children."""
75
+ me = resource.getrusage(resource.RUSAGE_SELF)
76
+ childs = resource.getrusage(resource.RUSAGE_CHILDREN)
77
+ return me.ru_utime + me.ru_stime + childs.ru_utime + childs.ru_stime + cls._extra_cpu_seconds
44
78
 
45
79
 
46
80
  def glob(glob_pattern: str, directoryname: str) -> List[str]:
toil/lib/retry.py CHANGED
@@ -142,7 +142,7 @@ from typing import (Any,
142
142
  Sequence,
143
143
  Tuple,
144
144
  Type,
145
- Union)
145
+ Union, TypeVar)
146
146
 
147
147
  import requests.exceptions
148
148
  import urllib3.exceptions
@@ -224,13 +224,16 @@ class ErrorCondition:
224
224
  )
225
225
 
226
226
 
227
+ # There is a better way to type hint this with python 3.10
228
+ # https://stackoverflow.com/a/68290080
229
+ RT = TypeVar("RT")
227
230
  def retry(
228
231
  intervals: Optional[List] = None,
229
232
  infinite_retries: bool = False,
230
233
  errors: Optional[Sequence[Union[ErrorCondition, Type[Exception]]]] = None,
231
234
  log_message: Optional[Tuple[Callable, str]] = None,
232
235
  prepare: Optional[List[Callable]] = None,
233
- ) -> Callable[[Any], Any]:
236
+ ) -> Callable[[Callable[..., RT]], Callable[..., RT]]:
234
237
  """
235
238
  Retry a function if it fails with any Exception defined in "errors".
236
239
 
@@ -281,9 +284,9 @@ def retry(
281
284
  if error_condition.retry_on_this_condition:
282
285
  retriable_errors.add(error_condition.error)
283
286
 
284
- def decorate(func):
287
+ def decorate(func: Callable[..., RT]) -> Callable[..., RT]:
285
288
  @functools.wraps(func)
286
- def call(*args, **kwargs):
289
+ def call(*args, **kwargs) -> RT:
287
290
  intervals_remaining = copy.deepcopy(intervals)
288
291
  while True:
289
292
  try:
@@ -488,13 +491,15 @@ def error_meets_conditions(e, error_conditions):
488
491
  DEFAULT_DELAYS = (0, 1, 1, 4, 16, 64)
489
492
  DEFAULT_TIMEOUT = 300
490
493
 
494
+ E = TypeVar("E", bound=Exception) # so mypy understands passed through types
495
+
491
496
  # TODO: Replace the use of this with retry()
492
497
  # The aws provisioner and jobstore need a large refactoring to be boto3 compliant, so this is
493
498
  # still used there to avoid the duplication of future work
494
499
  def old_retry(
495
500
  delays: Iterable[float] = DEFAULT_DELAYS,
496
501
  timeout: float = DEFAULT_TIMEOUT,
497
- predicate: Callable[[Exception], bool] = lambda e: False,
502
+ predicate: Callable[[E], bool] = lambda e: False,
498
503
  ) -> Generator[ContextManager, None, None]:
499
504
  """
500
505
  Deprecated.
@@ -567,6 +572,8 @@ def old_retry(
567
572
  >>> i
568
573
  1
569
574
  """
575
+ if timeout is None:
576
+ timeout = DEFAULT_TIMEOUT
570
577
  if timeout > 0:
571
578
  go = [ None ]
572
579
 
toil/lib/threading.py CHANGED
@@ -28,7 +28,7 @@ import traceback
28
28
  from contextlib import contextmanager
29
29
  from typing import Dict, Iterator, Optional, Union, cast
30
30
 
31
- import psutil # type: ignore
31
+ import psutil
32
32
 
33
33
  from toil.lib.exceptions import raise_
34
34
  from toil.lib.io import robust_rmtree
@@ -41,7 +41,7 @@ class ExceptionalThread(threading.Thread):
41
41
  A thread whose join() method re-raises exceptions raised during run(). While join() is
42
42
  idempotent, the exception is only during the first invocation of join() that successfully
43
43
  joined the thread. If join() times out, no exception will be re reraised even though an
44
- exception might already have occured in run().
44
+ exception might already have occurred in run().
45
45
 
46
46
  When subclassing this thread, override tryRun() instead of run().
47
47