toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff shows the differences between the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +22 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +2 -2
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +64 -22
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +7 -3
- toil/common.py +36 -13
- toil/cwl/cwltoil.py +365 -312
- toil/deferred.py +1 -1
- toil/fileStores/abstractFileStore.py +17 -17
- toil/fileStores/cachingFileStore.py +2 -2
- toil/fileStores/nonCachingFileStore.py +1 -1
- toil/job.py +228 -60
- toil/jobStores/abstractJobStore.py +18 -10
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +57 -29
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +2 -2
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +72 -24
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +5 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/io.py +14 -2
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +55 -21
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +2 -2
- toil/lib/throttle.py +1 -1
- toil/options/common.py +27 -24
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +9 -7
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +58 -16
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +1 -1
- toil/test/cwl/cwlTest.py +8 -91
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +10 -13
- toil/test/jobStores/jobStoreTest.py +33 -49
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +90 -8
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +61 -3
- toil/test/utils/utilsTest.py +20 -18
- toil/test/wdl/wdltoil_test.py +24 -71
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +2 -1
- toil/utils/toilStatus.py +97 -51
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +318 -51
- toil/worker.py +96 -69
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/lib/ec2.py
CHANGED
@@ -1,13 +1,11 @@
 import logging
 import time
 from base64 import b64encode
-from operator import
-from typing import Dict, Iterable, List, Optional, Union
+from operator import itemgetter
+from typing import Dict, Iterable, List, Optional, Union, TYPE_CHECKING, Generator, Callable, Mapping, Any

+import botocore.client
 from boto3.resources.base import ServiceResource
-from boto.ec2.instance import Instance as Boto2Instance
-from boto.ec2.spotinstancerequest import SpotInstanceRequest
-from botocore.client import BaseClient

 from toil.lib.aws.session import establish_boto3_session
 from toil.lib.aws.utils import flatten_tags

@@ -18,6 +16,11 @@ from toil.lib.retry import (ErrorCondition,
                             old_retry,
                             retry)

+from mypy_boto3_ec2.client import EC2Client
+from mypy_boto3_autoscaling.client import AutoScalingClient
+from mypy_boto3_ec2.type_defs import SpotInstanceRequestTypeDef, DescribeInstancesResultTypeDef, InstanceTypeDef
+from mypy_boto3_ec2.service_resource import EC2ServiceResource, Instance
+
 a_short_time = 5
 a_long_time = 60 * 60
 logger = logging.getLogger(__name__)

@@ -38,6 +41,7 @@ def not_found(e):
     # Not the right kind of error
     return False

+
 def inconsistencies_detected(e):
     if get_error_code(e) == 'InvalidGroup.NotFound':
         return True

@@ -45,6 +49,7 @@ def inconsistencies_detected(e):
     matches = ('invalid iam instance profile' in m) or ('no associated iam roles' in m)
     return matches

+
 # We also define these error categories for the new retry decorator
 INCONSISTENCY_ERRORS = [ErrorCondition(boto_error_codes=['InvalidGroup.NotFound']),
                         ErrorCondition(error_message_must_include='Invalid IAM Instance Profile'),

@@ -62,9 +67,10 @@ class UnexpectedResourceState(Exception):
         super().__init__(
             "Expected state of %s to be '%s' but got '%s'" %
             (resource, to_state, state))
-
-
-
+
+
+def wait_transition(boto3_ec2: EC2Client, resource: InstanceTypeDef, from_states: Iterable[str], to_state: str,
+                    state_getter: Callable[[InstanceTypeDef], str]=lambda x: x.get('State').get('Name')):
     """
     Wait until the specified EC2 resource (instance, image, volume, ...) transitions from any
     of the given 'from' states to the specified 'to' state. If the instance is found in a state

@@ -76,41 +82,44 @@ def wait_transition(resource, from_states, to_state,
     :param to_state: the state of the resource when this method returns
     """
     state = state_getter(resource)
+    instance_id = resource["InstanceId"]
     while state in from_states:
         time.sleep(a_short_time)
         for attempt in retry_ec2():
             with attempt:
-
+                described = boto3_ec2.describe_instances(InstanceIds=[instance_id])
+                resource = described["Reservations"][0]["Instances"][0]  # there should only be one requested
         state = state_getter(resource)
     if state != to_state:
         raise UnexpectedResourceState(resource, to_state, state)


-def wait_instances_running(
+def wait_instances_running(boto3_ec2: EC2Client, instances: Iterable[InstanceTypeDef]) -> Generator[InstanceTypeDef, None, None]:
     """
     Wait until no instance in the given iterable is 'pending'. Yield every instance that
     entered the running state as soon as it does.

-    :param
-    :param Iterable[
-    :rtype: Iterable[
+    :param EC2Client boto3_ec2: the EC2 connection to use for making requests
+    :param Iterable[InstanceTypeDef] instances: the instances to wait on
+    :rtype: Iterable[InstanceTypeDef]
     """
     running_ids = set()
     other_ids = set()
     while True:
         pending_ids = set()
         for i in instances:
-
-
-
-
+            i: InstanceTypeDef
+            if i['State']['Name'] == 'pending':
+                pending_ids.add(i['InstanceId'])
+            elif i['State']['Name'] == 'running':
+                if i['InstanceId'] in running_ids:
                     raise RuntimeError("An instance was already added to the list of running instance IDs. Maybe there is a duplicate.")
-                running_ids.add(i
+                running_ids.add(i['InstanceId'])
                 yield i
             else:
-                if i
+                if i['InstanceId'] in other_ids:
                     raise RuntimeError("An instance was already added to the list of other instances. Maybe there is a duplicate.")
-                other_ids.add(i
+                other_ids.add(i['InstanceId'])
                 yield i
         logger.info('%i instance(s) pending, %i running, %i other.',
                     *list(map(len, (pending_ids, running_ids, other_ids))))

@@ -121,14 +130,16 @@ def wait_instances_running(ec2, instances: Iterable[Boto2Instance]) -> Iterable[
             time.sleep(seconds)
             for attempt in retry_ec2():
                 with attempt:
-
+                    described_instances = boto3_ec2.describe_instances(InstanceIds=list(pending_ids))
+                    instances = [instance for reservation in described_instances["Reservations"] for instance in reservation["Instances"]]


-def wait_spot_requests_active(
+def wait_spot_requests_active(boto3_ec2: EC2Client, requests: Iterable[SpotInstanceRequestTypeDef], timeout: float = None, tentative: bool = False) -> Iterable[List[SpotInstanceRequestTypeDef]]:
     """
     Wait until no spot request in the given iterator is in the 'open' state or, optionally,
     a timeout occurs. Yield spot requests as soon as they leave the 'open' state.

+    :param boto3_ec2: ec2 client
     :param requests: The requests to wait on.

     :param timeout: Maximum time in seconds to spend waiting or None to wait forever. If a

@@ -145,11 +156,11 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
     other_ids = set()
     open_ids = None

-    def cancel():
+    def cancel() -> None:
         logger.warning('Cancelling remaining %i spot requests.', len(open_ids))
-
+        boto3_ec2.cancel_spot_instance_requests(SpotInstanceRequestIds=list(open_ids))

-    def spot_request_not_found(e):
+    def spot_request_not_found(e: Exception) -> bool:
         return get_error_code(e) == 'InvalidSpotInstanceRequestID.NotFound'

     try:

@@ -157,30 +168,31 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
             open_ids, eval_ids, fulfill_ids = set(), set(), set()
             batch = []
             for r in requests:
-
-
-
-
-
-
+                r: SpotInstanceRequestTypeDef  # pycharm thinks it is a string
+                if r['State'] == 'open':
+                    open_ids.add(r['InstanceId'])
+                    if r['Status'] == 'pending-evaluation':
+                        eval_ids.add(r['InstanceId'])
+                    elif r['Status'] == 'pending-fulfillment':
+                        fulfill_ids.add(r['InstanceId'])
                 else:
                     logger.info(
                         'Request %s entered status %s indicating that it will not be '
-                        'fulfilled anytime soon.', r
-            elif r
-                if r
+                        'fulfilled anytime soon.', r['InstanceId'], r['Status'])
+                elif r['State'] == 'active':
+                    if r['InstanceId'] in active_ids:
                         raise RuntimeError("A request was already added to the list of active requests. Maybe there are duplicate requests.")
-                    active_ids.add(r
+                    active_ids.add(r['InstanceId'])
                     batch.append(r)
                 else:
-                    if r
+                    if r['InstanceId'] in other_ids:
                         raise RuntimeError("A request was already added to the list of other IDs. Maybe there are duplicate requests.")
-                    other_ids.add(r
+                    other_ids.add(r['InstanceId'])
                     batch.append(r)
             if batch:
                 yield batch
             logger.info('%i spot requests(s) are open (%i of which are pending evaluation and %i '
-
+                        'are pending fulfillment), %i are active and %i are in another state.',
                         *list(map(len, (open_ids, eval_ids, fulfill_ids, active_ids, other_ids))))
             if not open_ids or tentative and not eval_ids and not fulfill_ids:
                 break

@@ -192,8 +204,7 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
             time.sleep(sleep_time)
             for attempt in retry_ec2(retry_while=spot_request_not_found):
                 with attempt:
-                    requests =
-                        list(open_ids))
+                    requests = boto3_ec2.describe_spot_instance_requests(SpotInstanceRequestIds=list(open_ids))
     except BaseException:
         if open_ids:
             with panic(logger):
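For orientation, a minimal sketch of how a caller might drive the reworked wait helpers after this boto2-to-boto3 migration. The helper names and signatures come from the hunks above; the tag filter and the no-argument session call are illustrative assumptions, not values taken from the diff.

# Sketch only: instances are now plain boto3 dicts (InstanceTypeDef), keyed like
# i['InstanceId'] and i['State']['Name'], rather than boto2 Instance objects.
from toil.lib.aws.session import establish_boto3_session
from toil.lib.ec2 import wait_instances_running

boto3_ec2 = establish_boto3_session().client("ec2")

# The filter is a placeholder; any describe_instances() result works here.
described = boto3_ec2.describe_instances(
    Filters=[{"Name": "tag:Owner", "Values": ["toil"]}]
)
instances = [i for r in described["Reservations"] for i in r["Instances"]]

for instance in wait_instances_running(boto3_ec2, instances):
    # Each yielded instance is a dict; fields are read by key, not attribute.
    print(instance["InstanceId"], instance["State"]["Name"])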
@@ -204,47 +215,56 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
                 cancel()


-def create_spot_instances(
+def create_spot_instances(boto3_ec2: EC2Client, price, image_id, spec, num_instances=1, timeout=None, tentative=False, tags=None) -> Generator[DescribeInstancesResultTypeDef, None, None]:
     """
     Create instances on the spot market.
     """
+
     def spotRequestNotFound(e):
         return getattr(e, 'error_code', None) == "InvalidSpotInstanceRequestID.NotFound"

+    spec['LaunchSpecification'].update({'ImageId': image_id})  # boto3 image id is in the launch specification
     for attempt in retry_ec2(retry_for=a_long_time,
                              retry_while=inconsistencies_detected):
         with attempt:
-
-            price,
+            requests_dict = boto3_ec2.request_spot_instances(
+                SpotPrice=price, InstanceCount=num_instances, **spec)
+            requests = requests_dict['SpotInstanceRequests']

     if tags is not None:
-        for requestID in (request
+        for requestID in (request['SpotInstanceRequestId'] for request in requests):
             for attempt in retry_ec2(retry_while=spotRequestNotFound):
                 with attempt:
-
+                    boto3_ec2.create_tags(Resources=[requestID], Tags=tags)

     num_active, num_other = 0, 0
     # noinspection PyUnboundLocalVariable,PyTypeChecker
     # request_spot_instances's type annotation is wrong
-    for batch in wait_spot_requests_active(
+    for batch in wait_spot_requests_active(boto3_ec2,
                                            requests,
                                            timeout=timeout,
                                            tentative=tentative):
         instance_ids = []
         for request in batch:
-
-
+            request: SpotInstanceRequestTypeDef
+            if request["State"] == 'active':
+                instance_ids.append(request["InstanceId"])
                 num_active += 1
             else:
                 logger.info(
                     'Request %s in unexpected state %s.',
-                    request
-                    request
+                    request["InstanceId"],
+                    request["State"])
                 num_other += 1
         if instance_ids:
             # This next line is the reason we batch. It's so we can get multiple instances in
             # a single request.
-
+            for instance_id in instance_ids:
+                for attempt in retry_ec2():
+                    with attempt:
+                        # Increase hop limit from 1 to use Instance Metadata V2
+                        boto3_ec2.modify_instance_metadata_options(InstanceId=instance_id, HttpPutResponseHopLimit=3)
+            yield boto3_ec2.describe_instances(InstanceIds=instance_ids)
     if not num_active:
         message = 'None of the spot requests entered the active state'
         if tentative:

@@ -255,22 +275,43 @@ def create_spot_instances(ec2, price, image_id, spec, num_instances=1, timeout=N
         logger.warning('%i request(s) entered a state other than active.', num_other)


-def create_ondemand_instances(
+def create_ondemand_instances(boto3_ec2: EC2Client, image_id: str, spec: Mapping[str, Any], num_instances: int=1) -> List[InstanceTypeDef]:
     """
     Requests the RunInstances EC2 API call but accounts for the race between recently created
     instance profiles, IAM roles and an instance creation that refers to them.

-    :rtype: List[
+    :rtype: List[InstanceTypeDef]
     """
-    instance_type = spec['
+    instance_type = spec['InstanceType']
     logger.info('Creating %s instance(s) ... ', instance_type)
+    boto_instance_list = []
     for attempt in retry_ec2(retry_for=a_long_time,
                              retry_while=inconsistencies_detected):
         with attempt:
-
-
-
-
+            boto_instance_list: List[InstanceTypeDef] = boto3_ec2.run_instances(ImageId=image_id,
+                                                                                MinCount=num_instances,
+                                                                                MaxCount=num_instances,
+                                                                                **spec)['Instances']
+
+    return boto_instance_list
+
+
+def increase_instance_hop_limit(boto3_ec2: EC2Client, boto_instance_list: List[InstanceTypeDef]) -> None:
+    """
+    Increase the default HTTP hop limit, as we are running Toil and Kubernetes inside a Docker container, so the default
+    hop limit of 1 will not be enough when grabbing metadata information with ec2_metadata
+
+    Must be called after the instances are guaranteed to be running.
+
+    :param boto_instance_list: List of boto instances to modify
+    :return:
+    """
+    for boto_instance in boto_instance_list:
+        instance_id = boto_instance['InstanceId']
+        for attempt in retry_ec2():
+            with attempt:
+                # Increase hop limit from 1 to use Instance Metadata V2
+                boto3_ec2.modify_instance_metadata_options(InstanceId=instance_id, HttpPutResponseHopLimit=3)


 def prune(bushy: dict) -> dict:

@@ -289,6 +330,7 @@ def prune(bushy: dict) -> dict:
     # catch, and to wait on IAM items.
     iam_client = establish_boto3_session().client('iam')

+
     # exception is generated by a factory so we weirdly need a client instance to reference it
     @retry(errors=[iam_client.exceptions.NoSuchEntityException],
            intervals=[1, 1, 2, 4, 8, 16, 32, 64])

@@ -301,7 +343,7 @@ def wait_until_instance_profile_arn_exists(instance_profile_arn: str):


 @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS)
-def create_instances(ec2_resource:
+def create_instances(ec2_resource: EC2ServiceResource,
                      image_id: str,
                      key_name: str,
                      instance_type: str,

@@ -312,7 +354,7 @@ def create_instances(ec2_resource: ServiceResource,
                     instance_profile_arn: Optional[str] = None,
                     placement_az: Optional[str] = None,
                     subnet_id: str = None,
-                    tags: Optional[Dict[str, str]] = None) -> List[
+                    tags: Optional[Dict[str, str]] = None) -> List[Instance]:
     """
     Replaces create_ondemand_instances. Uses boto3 and returns a list of Boto3 instance dicts.

@@ -336,7 +378,10 @@ def create_instances(ec2_resource: ServiceResource,
                'InstanceType': instance_type,
                'UserData': user_data,
                'BlockDeviceMappings': block_device_map,
-               'SubnetId': subnet_id
+               'SubnetId': subnet_id,
+               # Metadata V2 defaults hops to 1, which is an issue when running inside a docker container
+               # https://github.com/adamchainz/ec2-metadata?tab=readme-ov-file#instance-metadata-service-version-2
+               'MetadataOptions': {'HttpPutResponseHopLimit': 3}}

     if instance_profile_arn:
         # We could just retry when we get an error because the ARN doesn't

@@ -357,8 +402,9 @@ def create_instances(ec2_resource: ServiceResource,

     return ec2_resource.create_instances(**prune(request))

+
 @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS)
-def create_launch_template(ec2_client:
+def create_launch_template(ec2_client: EC2Client,
                            template_name: str,
                            image_id: str,
                            key_name: str,

@@ -400,7 +446,10 @@ def create_launch_template(ec2_client: BaseClient,
                'InstanceType': instance_type,
                'UserData': user_data,
                'BlockDeviceMappings': block_device_map,
-               'SubnetId': subnet_id
+               'SubnetId': subnet_id,
+               # Increase hop limit from 1 to use Instance Metadata V2
+               'MetadataOptions': {'HttpPutResponseHopLimit': 3}
+               }

     if instance_profile_arn:
         # We could just retry when we get an error because the ARN doesn't

@@ -413,6 +462,7 @@ def create_launch_template(ec2_client: BaseClient,
     if placement_az:
         template['Placement'] = {'AvailabilityZone': placement_az}

+    flat_tags = []
     if tags:
         # Tag everything when we make it.
         flat_tags = flatten_tags(tags)

@@ -429,17 +479,16 @@


 @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS)
-def create_auto_scaling_group(autoscaling_client:
+def create_auto_scaling_group(autoscaling_client: AutoScalingClient,
                               asg_name: str,
                               launch_template_ids: Dict[str, str],
                               vpc_subnets: List[str],
                               min_size: int,
                               max_size: int,
-                              instance_types: Optional[
+                              instance_types: Optional[Iterable[str]] = None,
                               spot_bid: Optional[float] = None,
                               spot_cheapest: bool = False,
                               tags: Optional[Dict[str, str]] = None) -> None:
-
     """
     Create a new Auto Scaling Group with the given name (which is also its
     unique identifier).

@@ -472,7 +521,7 @@ def create_auto_scaling_group(autoscaling_client: BaseClient,
     """

     if instance_types is None:
-        instance_types = []
+        instance_types: List[str] = []

     if instance_types is not None and len(instance_types) > 20:
         raise RuntimeError(f"Too many instance types ({len(instance_types)}) in group; AWS supports only 20.")

@@ -493,8 +542,8 @@ def create_auto_scaling_group(autoscaling_client: BaseClient,
     # We need to use a launch template per instance type so that different
     # instance types with specified EBS storage size overrides will get their
     # storage.
-    mip = {'LaunchTemplate': {'LaunchTemplateSpecification': get_launch_template_spec(next(iter(instance_types))),
-           'Overrides': [{'InstanceType': t, 'LaunchTemplateSpecification': get_launch_template_spec(t)} for t in instance_types]}}
+    mip = {'LaunchTemplate': {'LaunchTemplateSpecification': get_launch_template_spec(next(iter(instance_types))),  # noqa
+           'Overrides': [{'InstanceType': t, 'LaunchTemplateSpecification': get_launch_template_spec(t)} for t in instance_types]}}  # noqa

     if spot_bid is not None:
         # Ask for spot instances by saying everything above base capacity of 0 should be spot.
toil/lib/expando.py
CHANGED
toil/lib/io.py
CHANGED
@@ -16,7 +16,7 @@ def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Opt

     The permissions on the directory will be 711 instead of 700, allowing the
     group and all other users to traverse the directory. This is necessary if
-    the
+    the directory is on NFS and the Docker daemon would like to mount it or a
     file inside it into a container, because on NFS even the Docker daemon
     appears bound by the file permissions.

@@ -159,14 +159,26 @@ def atomic_copyobj(src_fh: BytesIO, dest_path: str, length: int = 16384, executa
         os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR)


-def make_public_dir(in_directory: Optional[str] = None) -> str:
+def make_public_dir(in_directory: str, suggested_name: Optional[str] = None) -> str:
     """
+    Make a publicly-accessible directory in the given directory.
+
+    :param suggested_name: Use this directory name first if possible.
+
     Try to make a random directory name with length 4 that doesn't exist, with the given prefix.
     Otherwise, try length 5, length 6, etc, up to a max of 32 (len of uuid4 with dashes replaced).
     This function's purpose is mostly to avoid having long file names when generating directories.
     If somehow this fails, which should be incredibly unlikely, default to a normal uuid4, which was
     our old default.
     """
+    if suggested_name is not None:
+        generated_dir_path: str = os.path.join(in_directory, suggested_name)
+        try:
+            os.mkdir(generated_dir_path)
+            os.chmod(generated_dir_path, 0o777)
+            return generated_dir_path
+        except FileExistsError:
+            pass
     for i in range(4, 32 + 1):  # make random uuids and truncate to lengths starting at 4 and working up to max 32
         for _ in range(10):  # make 10 attempts for each length
             truncated_uuid: str = str(uuid.uuid4()).replace('-', '')[:i]
toil/lib/misc.py
CHANGED
@@ -11,8 +11,6 @@ import typing
 from contextlib import closing
 from typing import Iterator, List, Optional

-import pytz
-
 logger = logging.getLogger(__name__)


@@ -56,7 +54,7 @@ def get_user_name() -> str:

 def utc_now() -> datetime.datetime:
     """Return a datetime in the UTC timezone corresponding to right now."""
-    return datetime.datetime.utcnow().replace(tzinfo=
+    return datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)

 def unix_now_ms() -> float:
     """Return the current time in milliseconds since the Unix epoch."""
toil/lib/resources.py
CHANGED
@@ -18,29 +18,63 @@ import sys
 import resource
 from typing import List, Tuple

-
-def get_total_cpu_time_and_memory_usage() -> Tuple[float, int]:
+class ResourceMonitor:
     """
-
-
+    Global resource monitoring widget.
+
+    Presents class methods to get the resource usage of this process and child
+    processes, and other class methods to adjust the statistics so they can
+    account for e.g. resources used inside containers, or other resource usage
+    that *should* be billable to the current process.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # Store some extra usage to tack onto the stats as module-level globals
+    _extra_cpu_seconds: float = 0
+    _extra_memory_ki: int = 0
+
+    @classmethod
+    def record_extra_memory(cls, peak_ki: int) -> None:
+        """
+        Become responsible for the given peak memory usage, in kibibytes.
+
+        The memory will be treated as if it was used by a child process at the time
+        our real child processes were also using their peak memory.
+        """
+        cls._extra_memory_ki = max(cls._extra_memory_ki, peak_ki)
+
+    @classmethod
+    def record_extra_cpu(cls, seconds: float) -> None:
+        """
+        Become responsible for the given CPU time.
+
+        The CPU time will be treated as if it had been used by a child process.
+        """
+        cls._extra_cpu_seconds += seconds
+
+    @classmethod
+    def get_total_cpu_time_and_memory_usage(cls) -> Tuple[float, int]:
+        """
+        Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
+        itself and its single largest child (in kibibytes).
+        """
+        me = resource.getrusage(resource.RUSAGE_SELF)
+        children = resource.getrusage(resource.RUSAGE_CHILDREN)
+        total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime + cls._extra_cpu_seconds
+        total_memory_usage = me.ru_maxrss + children.ru_maxrss
+        if sys.platform == "darwin":
+            # On Linux, getrusage works in "kilobytes" (really kibibytes), but on
+            # Mac it works in bytes. See
+            # <https://github.com/python/cpython/issues/74698>
+            total_memory_usage = int(math.ceil(total_memory_usage / 1024))
+        total_memory_usage += cls._extra_memory_ki
+        return total_cpu_time, total_memory_usage
+
+    @classmethod
+    def get_total_cpu_time(cls) -> float:
+        """Gives the total cpu time, including the children."""
+        me = resource.getrusage(resource.RUSAGE_SELF)
+        childs = resource.getrusage(resource.RUSAGE_CHILDREN)
+        return me.ru_utime + me.ru_stime + childs.ru_utime + childs.ru_stime + cls._extra_cpu_seconds


 def glob(glob_pattern: str, directoryname: str) -> List[str]:
toil/lib/retry.py
CHANGED
@@ -142,7 +142,7 @@ from typing import (Any,
                     Sequence,
                     Tuple,
                     Type,
-                    Union)
+                    Union, TypeVar)

 import requests.exceptions
 import urllib3.exceptions

@@ -224,13 +224,16 @@ class ErrorCondition:
     )


+# There is a better way to type hint this with python 3.10
+# https://stackoverflow.com/a/68290080
+RT = TypeVar("RT")
 def retry(
     intervals: Optional[List] = None,
     infinite_retries: bool = False,
     errors: Optional[Sequence[Union[ErrorCondition, Type[Exception]]]] = None,
     log_message: Optional[Tuple[Callable, str]] = None,
     prepare: Optional[List[Callable]] = None,
-) -> Callable[[
+) -> Callable[[Callable[..., RT]], Callable[..., RT]]:
     """
     Retry a function if it fails with any Exception defined in "errors".

@@ -281,9 +284,9 @@ def retry(
         if error_condition.retry_on_this_condition:
             retriable_errors.add(error_condition.error)

-    def decorate(func):
+    def decorate(func: Callable[..., RT]) -> Callable[..., RT]:
         @functools.wraps(func)
-        def call(*args, **kwargs):
+        def call(*args, **kwargs) -> RT:
             intervals_remaining = copy.deepcopy(intervals)
             while True:
                 try:

@@ -488,13 +491,15 @@ def error_meets_conditions(e, error_conditions):
 DEFAULT_DELAYS = (0, 1, 1, 4, 16, 64)
 DEFAULT_TIMEOUT = 300

+E = TypeVar("E", bound=Exception)  # so mypy understands passed through types
+
 # TODO: Replace the use of this with retry()
 # The aws provisioner and jobstore need a large refactoring to be boto3 compliant, so this is
 # still used there to avoid the duplication of future work
 def old_retry(
     delays: Iterable[float] = DEFAULT_DELAYS,
     timeout: float = DEFAULT_TIMEOUT,
-    predicate: Callable[[
+    predicate: Callable[[E], bool] = lambda e: False,
 ) -> Generator[ContextManager, None, None]:
     """
     Deprecated.

@@ -567,6 +572,8 @@ def old_retry(
     >>> i
     1
     """
+    if timeout is None:
+        timeout = DEFAULT_TIMEOUT
     if timeout > 0:
         go = [ None ]
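A sketch of what the new RT TypeVar buys callers of retry(): the decorator now advertises Callable[..., RT] -> Callable[..., RT], so type checkers keep the wrapped function's return type instead of widening it to Any. The retried error and intervals below are illustrative.

from toil.lib.retry import retry

@retry(intervals=[1, 2, 4], errors=[ConnectionError])
def fetch_count() -> int:
    # a flaky operation that may raise ConnectionError
    return 42

n: int = fetch_count()  # inferred as int rather than Any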
toil/lib/threading.py
CHANGED
@@ -28,7 +28,7 @@ import traceback
 from contextlib import contextmanager
 from typing import Dict, Iterator, Optional, Union, cast

-import psutil
+import psutil

 from toil.lib.exceptions import raise_
 from toil.lib.io import robust_rmtree

@@ -41,7 +41,7 @@ class ExceptionalThread(threading.Thread):
     A thread whose join() method re-raises exceptions raised during run(). While join() is
     idempotent, the exception is only during the first invocation of join() that successfully
     joined the thread. If join() times out, no exception will be re reraised even though an
-    exception might already have
+    exception might already have occurred in run().

     When subclassing this thread, override tryRun() instead of run().