toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +41 -17
- toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +9 -9
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +129 -16
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +45 -3
- toil/common.py +56 -31
- toil/cwl/cwltoil.py +442 -371
- toil/deferred.py +1 -1
- toil/exceptions.py +1 -1
- toil/fileStores/abstractFileStore.py +69 -20
- toil/fileStores/cachingFileStore.py +6 -22
- toil/fileStores/nonCachingFileStore.py +6 -15
- toil/job.py +270 -86
- toil/jobStores/abstractJobStore.py +37 -31
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +60 -31
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +3 -3
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +89 -38
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +24 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/generatedEC2Lists.py +8 -8
- toil/lib/io.py +42 -4
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +57 -16
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +29 -14
- toil/lib/throttle.py +1 -1
- toil/options/common.py +31 -30
- toil/options/wdl.py +5 -0
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +12 -2
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +93 -23
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +22 -7
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +245 -236
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +11 -14
- toil/test/jobStores/jobStoreTest.py +40 -54
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/lib/test_ec2.py +1 -1
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +99 -16
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +62 -4
- toil/test/utils/utilsTest.py +23 -21
- toil/test/wdl/wdltoil_test.py +49 -21
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugFile.py +1 -1
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +310 -266
- toil/utils/toilStatus.py +98 -52
- toil/version.py +11 -11
- toil/wdl/wdltoil.py +644 -225
- toil/worker.py +125 -83
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- toil-7.0.0.dist-info/METADATA +158 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
|
@@ -11,6 +11,8 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
14
16
|
import json
|
|
15
17
|
import logging
|
|
16
18
|
import os
|
|
@@ -29,34 +31,34 @@ from typing import (Any,
|
|
|
29
31
|
Iterable,
|
|
30
32
|
List,
|
|
31
33
|
Optional,
|
|
32
|
-
Set
|
|
34
|
+
Set,
|
|
35
|
+
Union,
|
|
36
|
+
Literal,
|
|
37
|
+
cast,
|
|
38
|
+
TypeVar)
|
|
33
39
|
from urllib.parse import unquote
|
|
34
40
|
|
|
35
41
|
# We need these to exist as attributes we can get off of the boto object
|
|
36
|
-
import boto.ec2
|
|
37
|
-
import boto.iam
|
|
38
|
-
import boto.vpc
|
|
39
|
-
from boto.ec2.blockdevicemapping import \
|
|
40
|
-
BlockDeviceMapping as Boto2BlockDeviceMapping
|
|
41
|
-
from boto.ec2.blockdevicemapping import BlockDeviceType as Boto2BlockDeviceType
|
|
42
|
-
from boto.ec2.instance import Instance as Boto2Instance
|
|
43
|
-
from boto.exception import BotoServerError, EC2ResponseError
|
|
44
|
-
from boto.utils import get_instance_metadata
|
|
45
42
|
from botocore.exceptions import ClientError
|
|
43
|
+
from mypy_boto3_autoscaling.client import AutoScalingClient
|
|
44
|
+
from mypy_boto3_ec2.service_resource import Instance
|
|
45
|
+
from mypy_boto3_iam.type_defs import InstanceProfileTypeDef, RoleTypeDef, ListRolePoliciesResponseTypeDef
|
|
46
|
+
from mypy_extensions import VarArg, KwArg
|
|
46
47
|
|
|
47
|
-
from toil.lib.aws import zone_to_region
|
|
48
|
+
from toil.lib.aws import zone_to_region, AWSRegionName, AWSServerErrors
|
|
48
49
|
from toil.lib.aws.ami import get_flatcar_ami
|
|
49
50
|
from toil.lib.aws.iam import (CLUSTER_LAUNCHING_PERMISSIONS,
|
|
50
51
|
get_policy_permissions,
|
|
51
52
|
policy_permissions_allow)
|
|
52
53
|
from toil.lib.aws.session import AWSConnectionManager
|
|
53
|
-
from toil.lib.aws.utils import create_s3_bucket
|
|
54
|
+
from toil.lib.aws.utils import create_s3_bucket, flatten_tags, boto3_pager
|
|
54
55
|
from toil.lib.conversions import human2bytes
|
|
55
56
|
from toil.lib.ec2 import (a_short_time,
|
|
56
57
|
create_auto_scaling_group,
|
|
57
58
|
create_instances,
|
|
58
59
|
create_launch_template,
|
|
59
60
|
create_ondemand_instances,
|
|
61
|
+
increase_instance_hop_limit,
|
|
60
62
|
create_spot_instances,
|
|
61
63
|
wait_instances_running,
|
|
62
64
|
wait_transition,
|
|
@@ -73,12 +75,20 @@ from toil.lib.retry import (ErrorCondition,
|
|
|
73
75
|
old_retry,
|
|
74
76
|
retry)
|
|
75
77
|
from toil.provisioners import (ClusterCombinationNotSupportedException,
|
|
76
|
-
NoSuchClusterException
|
|
78
|
+
NoSuchClusterException,
|
|
79
|
+
NoSuchZoneException)
|
|
77
80
|
from toil.provisioners.abstractProvisioner import (AbstractProvisioner,
|
|
78
81
|
ManagedNodesNotSupportedException,
|
|
79
82
|
Shape)
|
|
80
83
|
from toil.provisioners.aws import get_best_aws_zone
|
|
81
84
|
from toil.provisioners.node import Node
|
|
85
|
+
from toil.lib.aws.session import client as get_client
|
|
86
|
+
|
|
87
|
+
from mypy_boto3_ec2.client import EC2Client
|
|
88
|
+
from mypy_boto3_iam.client import IAMClient
|
|
89
|
+
from mypy_boto3_ec2.type_defs import DescribeInstancesResultTypeDef, InstanceTypeDef, TagTypeDef, BlockDeviceMappingTypeDef, EbsBlockDeviceTypeDef, FilterTypeDef, SpotInstanceRequestTypeDef, TagDescriptionTypeDef, SecurityGroupTypeDef, \
|
|
90
|
+
CreateSecurityGroupResultTypeDef, IpPermissionTypeDef, ReservationTypeDef
|
|
91
|
+
from mypy_boto3_s3.literals import BucketLocationConstraintType
|
|
82
92
|
|
|
83
93
|
logger = logging.getLogger(__name__)
|
|
84
94
|
logging.getLogger("boto").setLevel(logging.CRITICAL)
|
|
@@ -98,14 +108,8 @@ _S3_BUCKET_MAX_NAME_LEN = 63
|
|
|
98
108
|
# The suffix of the S3 bucket associated with the cluster
|
|
99
109
|
_S3_BUCKET_INTERNAL_SUFFIX = '--internal'
|
|
100
110
|
|
|
101
|
-
# prevent removal of these imports
|
|
102
|
-
str(boto.ec2)
|
|
103
|
-
str(boto.iam)
|
|
104
|
-
str(boto.vpc)
|
|
105
|
-
|
|
106
111
|
|
|
107
|
-
|
|
108
|
-
def awsRetryPredicate(e):
|
|
112
|
+
def awsRetryPredicate(e: Exception) -> bool:
|
|
109
113
|
if isinstance(e, socket.gaierror):
|
|
110
114
|
# Could be a DNS outage:
|
|
111
115
|
# socket.gaierror: [Errno -2] Name or service not known
|
|
@@ -135,33 +139,38 @@ def expectedShutdownErrors(e: Exception) -> bool:
|
|
|
135
139
|
return get_error_status(e) == 400 and 'dependent object' in get_error_body(e)
|
|
136
140
|
|
|
137
141
|
|
|
138
|
-
|
|
142
|
+
F = TypeVar("F") # so mypy understands passed through types
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def awsRetry(f: Callable[..., F]) -> Callable[..., F]:
|
|
139
146
|
"""
|
|
140
|
-
This decorator retries the wrapped function if aws throws unexpected errors
|
|
141
|
-
|
|
147
|
+
This decorator retries the wrapped function if aws throws unexpected errors.
|
|
148
|
+
|
|
142
149
|
It should wrap any function that makes use of boto
|
|
143
150
|
"""
|
|
151
|
+
|
|
144
152
|
@wraps(f)
|
|
145
|
-
def wrapper(*args, **kwargs):
|
|
153
|
+
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
146
154
|
for attempt in old_retry(delays=truncExpBackoff(),
|
|
147
155
|
timeout=300,
|
|
148
156
|
predicate=awsRetryPredicate):
|
|
149
157
|
with attempt:
|
|
150
158
|
return f(*args, **kwargs)
|
|
159
|
+
|
|
151
160
|
return wrapper
|
|
152
161
|
|
|
153
162
|
|
|
154
|
-
def awsFilterImpairedNodes(nodes,
|
|
163
|
+
def awsFilterImpairedNodes(nodes: List[InstanceTypeDef], boto3_ec2: EC2Client) -> List[InstanceTypeDef]:
|
|
155
164
|
# if TOIL_AWS_NODE_DEBUG is set don't terminate nodes with
|
|
156
165
|
# failing status checks so they can be debugged
|
|
157
166
|
nodeDebug = os.environ.get('TOIL_AWS_NODE_DEBUG') in ('True', 'TRUE', 'true', True)
|
|
158
167
|
if not nodeDebug:
|
|
159
168
|
return nodes
|
|
160
|
-
nodeIDs = [node
|
|
161
|
-
statuses =
|
|
162
|
-
statusMap = {status
|
|
163
|
-
healthyNodes = [node for node in nodes if statusMap.get(node
|
|
164
|
-
impairedNodes = [node
|
|
169
|
+
nodeIDs = [node["InstanceId"] for node in nodes]
|
|
170
|
+
statuses = boto3_ec2.describe_instance_status(InstanceIds=nodeIDs)
|
|
171
|
+
statusMap = {status["InstanceId"]: status["InstanceStatus"]["Status"] for status in statuses["InstanceStatuses"]}
|
|
172
|
+
healthyNodes = [node for node in nodes if statusMap.get(node["InstanceId"], None) != 'impaired']
|
|
173
|
+
impairedNodes = [node["InstanceId"] for node in nodes if statusMap.get(node["InstanceId"], None) == 'impaired']
|
|
165
174
|
logger.warning('TOIL_AWS_NODE_DEBUG is set and nodes %s have failed EC2 status checks so '
|
|
166
175
|
'will not be terminated.', ' '.join(impairedNodes))
|
|
167
176
|
return healthyNodes
|
|
@@ -170,8 +179,24 @@ def awsFilterImpairedNodes(nodes, ec2):
|
|
|
170
179
|
class InvalidClusterStateException(Exception):
|
|
171
180
|
pass
|
|
172
181
|
|
|
182
|
+
|
|
183
|
+
def collapse_tags(instance_tags: List[TagTypeDef]) -> Dict[str, str]:
|
|
184
|
+
"""
|
|
185
|
+
Collapse tags from boto3 format to node format
|
|
186
|
+
:param instance_tags: tags as list of TagTypeDef
|
|
187
|
+
:return: Dict of tags
|
|
188
|
+
"""
|
|
189
|
+
collapsed_tags: Dict[str, str] = dict()
|
|
190
|
+
for tag in instance_tags:
|
|
191
|
+
if tag.get("Key") is not None:
|
|
192
|
+
collapsed_tags[tag["Key"]] = tag["Value"]
|
|
193
|
+
return collapsed_tags
|
|
194
|
+
|
|
195
|
+
|
|
173
196
|
class AWSProvisioner(AbstractProvisioner):
|
|
174
|
-
def __init__(self, clusterName, clusterType, zone
|
|
197
|
+
def __init__(self, clusterName: Optional[str], clusterType: Optional[str], zone: Optional[str],
|
|
198
|
+
nodeStorage: int, nodeStorageOverrides: Optional[List[str]], sseKey: Optional[str],
|
|
199
|
+
enable_fuse: bool):
|
|
175
200
|
self.cloud = 'aws'
|
|
176
201
|
self._sseKey = sseKey
|
|
177
202
|
# self._zone will be filled in by base class constructor
|
|
@@ -186,7 +211,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
186
211
|
|
|
187
212
|
# Determine our region to work in, before readClusterSettings() which
|
|
188
213
|
# might need it. TODO: support multiple regions in one cluster
|
|
189
|
-
self._region = zone_to_region(zone)
|
|
214
|
+
self._region: AWSRegionName = zone_to_region(zone)
|
|
190
215
|
|
|
191
216
|
# Set up our connections to AWS
|
|
192
217
|
self.aws = AWSConnectionManager()
|
|
@@ -197,16 +222,22 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
197
222
|
|
|
198
223
|
# Call base class constructor, which will call createClusterSettings()
|
|
199
224
|
# or readClusterSettings()
|
|
200
|
-
super().__init__(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides)
|
|
225
|
+
super().__init__(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, enable_fuse)
|
|
226
|
+
|
|
227
|
+
if self._zone is None:
|
|
228
|
+
logger.warning("Leader zone was never initialized before creating AWS provisioner. Defaulting to cluster zone.")
|
|
229
|
+
|
|
230
|
+
self._leader_subnet: str = self._get_default_subnet(self._zone or zone)
|
|
231
|
+
self._tags: Dict[str, Any] = {}
|
|
201
232
|
|
|
202
233
|
# After self.clusterName is set, generate a valid name for the S3 bucket associated with this cluster
|
|
203
234
|
suffix = _S3_BUCKET_INTERNAL_SUFFIX
|
|
204
235
|
self.s3_bucket_name = self.clusterName[:_S3_BUCKET_MAX_NAME_LEN - len(suffix)] + suffix
|
|
205
236
|
|
|
206
|
-
def supportedClusterTypes(self):
|
|
237
|
+
def supportedClusterTypes(self) -> Set[str]:
|
|
207
238
|
return {'mesos', 'kubernetes'}
|
|
208
239
|
|
|
209
|
-
def createClusterSettings(self):
|
|
240
|
+
def createClusterSettings(self) -> None:
|
|
210
241
|
"""
|
|
211
242
|
Create a new set of cluster settings for a cluster to be deployed into
|
|
212
243
|
AWS.
|
|
@@ -216,41 +247,51 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
216
247
|
# constructor.
|
|
217
248
|
assert self._zone is not None
|
|
218
249
|
|
|
219
|
-
def readClusterSettings(self):
|
|
250
|
+
def readClusterSettings(self) -> None:
|
|
220
251
|
"""
|
|
221
252
|
Reads the cluster settings from the instance metadata, which assumes
|
|
222
253
|
the instance is the leader.
|
|
223
254
|
"""
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
instance =
|
|
255
|
+
from ec2_metadata import ec2_metadata
|
|
256
|
+
boto3_ec2 = self.aws.client(self._region, 'ec2')
|
|
257
|
+
instance: InstanceTypeDef = boto3_ec2.describe_instances(InstanceIds=[ec2_metadata.instance_id])["Reservations"][0]["Instances"][0]
|
|
227
258
|
# The cluster name is the same as the name of the leader.
|
|
228
|
-
self.clusterName =
|
|
259
|
+
self.clusterName: str = "default-toil-cluster-name"
|
|
260
|
+
for tag in instance["Tags"]:
|
|
261
|
+
if tag.get("Key") == "Name":
|
|
262
|
+
self.clusterName = tag["Value"]
|
|
229
263
|
# Determine what subnet we, the leader, are in
|
|
230
|
-
self._leader_subnet = instance
|
|
264
|
+
self._leader_subnet = instance["SubnetId"]
|
|
231
265
|
# Determine where to deploy workers.
|
|
232
266
|
self._worker_subnets_by_zone = self._get_good_subnets_like(self._leader_subnet)
|
|
233
267
|
|
|
234
|
-
self._leaderPrivateIP =
|
|
235
|
-
self.
|
|
236
|
-
self._tags = {k: v for k, v in self.getLeader().tags.items() if k != _TAG_KEY_TOIL_NODE_TYPE}
|
|
268
|
+
self._leaderPrivateIP = ec2_metadata.private_ipv4 # this is PRIVATE IP
|
|
269
|
+
self._tags = {k: v for k, v in (self.getLeader().tags or {}).items() if k != _TAG_KEY_TOIL_NODE_TYPE}
|
|
237
270
|
# Grab the ARN name of the instance profile (a str) to apply to workers
|
|
238
|
-
|
|
271
|
+
leader_info = None
|
|
272
|
+
for attempt in old_retry(timeout=300, predicate=lambda e: True):
|
|
273
|
+
with attempt:
|
|
274
|
+
leader_info = ec2_metadata.iam_info
|
|
275
|
+
if leader_info is None:
|
|
276
|
+
raise RuntimeError("Could not get EC2 metadata IAM info")
|
|
277
|
+
if leader_info is None:
|
|
278
|
+
# This is more for mypy as it is unable to see that the retry will guarantee this is not None
|
|
279
|
+
# and that this is not reachable
|
|
280
|
+
raise RuntimeError(f"Leader IAM metadata is unreachable.")
|
|
281
|
+
self._leaderProfileArn = leader_info["InstanceProfileArn"]
|
|
282
|
+
|
|
239
283
|
# The existing metadata API returns a single string if there is one security group, but
|
|
240
284
|
# a list when there are multiple: change the format to always be a list.
|
|
241
|
-
rawSecurityGroups =
|
|
242
|
-
self._leaderSecurityGroupNames =
|
|
285
|
+
rawSecurityGroups = ec2_metadata.security_groups
|
|
286
|
+
self._leaderSecurityGroupNames: Set[str] = set(rawSecurityGroups)
|
|
243
287
|
# Since we have access to the names, we don't also need to use any IDs
|
|
244
|
-
self._leaderSecurityGroupIDs = set()
|
|
288
|
+
self._leaderSecurityGroupIDs: Set[str] = set()
|
|
245
289
|
|
|
246
290
|
# Let the base provisioner work out how to deploy duly authorized
|
|
247
291
|
# workers for this leader.
|
|
248
292
|
self._setLeaderWorkerAuthentication()
|
|
249
293
|
|
|
250
|
-
@retry(errors=[
|
|
251
|
-
error=ClientError,
|
|
252
|
-
error_codes=[404, 500, 502, 503, 504]
|
|
253
|
-
)])
|
|
294
|
+
@retry(errors=[AWSServerErrors])
|
|
254
295
|
def _write_file_to_cloud(self, key: str, contents: bytes) -> str:
|
|
255
296
|
bucket_name = self.s3_bucket_name
|
|
256
297
|
|
|
@@ -289,7 +330,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
289
330
|
obj = self.aws.resource(self._region, 's3').Object(bucket_name, key)
|
|
290
331
|
|
|
291
332
|
try:
|
|
292
|
-
return obj.get()
|
|
333
|
+
return obj.get()['Body'].read()
|
|
293
334
|
except ClientError as e:
|
|
294
335
|
if get_error_status(e) == 404:
|
|
295
336
|
logger.warning(f'Trying to read non-existent file "{key}" from {bucket_name}.')
|
|
@@ -305,11 +346,11 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
305
346
|
owner: str,
|
|
306
347
|
keyName: str,
|
|
307
348
|
botoPath: str,
|
|
308
|
-
userTags: Optional[
|
|
349
|
+
userTags: Optional[Dict[str, str]],
|
|
309
350
|
vpcSubnet: Optional[str],
|
|
310
351
|
awsEc2ProfileArn: Optional[str],
|
|
311
|
-
awsEc2ExtraSecurityGroupIds: Optional[
|
|
312
|
-
**kwargs):
|
|
352
|
+
awsEc2ExtraSecurityGroupIds: Optional[List[str]],
|
|
353
|
+
**kwargs: Dict[str, Any]) -> None:
|
|
313
354
|
"""
|
|
314
355
|
Starts a single leader node and populates this class with the leader's metadata.
|
|
315
356
|
|
|
@@ -352,9 +393,6 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
352
393
|
if vpcSubnet:
|
|
353
394
|
# This is where we put the leader
|
|
354
395
|
self._leader_subnet = vpcSubnet
|
|
355
|
-
else:
|
|
356
|
-
# Find the default subnet for the zone
|
|
357
|
-
self._leader_subnet = self._get_default_subnet(self._zone)
|
|
358
396
|
|
|
359
397
|
profileArn = awsEc2ProfileArn or self._createProfileArn()
|
|
360
398
|
|
|
@@ -370,7 +408,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
370
408
|
if userTags is not None:
|
|
371
409
|
self._tags.update(userTags)
|
|
372
410
|
|
|
373
|
-
#All user specified tags have been set
|
|
411
|
+
# All user specified tags have been set
|
|
374
412
|
userData = self._getIgnitionUserData('leader', architecture=self._architecture)
|
|
375
413
|
|
|
376
414
|
if self.clusterType == 'kubernetes':
|
|
@@ -383,18 +421,18 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
383
421
|
leader_tags[_TAG_KEY_TOIL_NODE_TYPE] = 'leader'
|
|
384
422
|
logger.debug('Launching leader with tags: %s', leader_tags)
|
|
385
423
|
|
|
386
|
-
instances = create_instances(self.aws.resource(self._region, 'ec2'),
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
424
|
+
instances: List[Instance] = create_instances(self.aws.resource(self._region, 'ec2'),
|
|
425
|
+
image_id=self._discoverAMI(),
|
|
426
|
+
num_instances=1,
|
|
427
|
+
key_name=self._keyName,
|
|
428
|
+
security_group_ids=createdSGs + (awsEc2ExtraSecurityGroupIds or []),
|
|
429
|
+
instance_type=leader_type.name,
|
|
430
|
+
user_data=userData,
|
|
431
|
+
block_device_map=bdms,
|
|
432
|
+
instance_profile_arn=profileArn,
|
|
433
|
+
placement_az=self._zone,
|
|
434
|
+
subnet_id=self._leader_subnet,
|
|
435
|
+
tags=leader_tags)
|
|
398
436
|
|
|
399
437
|
# wait for the leader to exist at all
|
|
400
438
|
leader = instances[0]
|
|
@@ -425,7 +463,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
425
463
|
leaderNode = Node(publicIP=leader.public_ip_address, privateIP=leader.private_ip_address,
|
|
426
464
|
name=leader.id, launchTime=leader.launch_time,
|
|
427
465
|
nodeType=leader_type.name, preemptible=False,
|
|
428
|
-
tags=leader.tags)
|
|
466
|
+
tags=collapse_tags(leader.tags))
|
|
429
467
|
leaderNode.waitForNode('toil_leader')
|
|
430
468
|
|
|
431
469
|
# Download credentials
|
|
@@ -483,7 +521,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
483
521
|
acls = set(self._get_subnet_acls(base_subnet_id))
|
|
484
522
|
|
|
485
523
|
# Compose a filter that selects the subnets we might want
|
|
486
|
-
filters = [{
|
|
524
|
+
filters: List[FilterTypeDef] = [{
|
|
487
525
|
'Name': 'vpc-id',
|
|
488
526
|
'Values': [vpc_id]
|
|
489
527
|
}, {
|
|
@@ -495,7 +533,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
495
533
|
}]
|
|
496
534
|
|
|
497
535
|
# Fill in this collection
|
|
498
|
-
by_az = {}
|
|
536
|
+
by_az: Dict[str, List[str]] = {}
|
|
499
537
|
|
|
500
538
|
# Go get all the subnets. There's no way to page manually here so it
|
|
501
539
|
# must page automatically.
|
|
@@ -534,7 +572,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
534
572
|
}]
|
|
535
573
|
|
|
536
574
|
# TODO: Can't we use the resource's network_acls.filter(Filters=)?
|
|
537
|
-
return [item['NetworkAclId'] for item in
|
|
575
|
+
return [item['NetworkAclId'] for item in boto3_pager(ec2.describe_network_acls,
|
|
538
576
|
'NetworkAcls',
|
|
539
577
|
Filters=filters)]
|
|
540
578
|
|
|
@@ -546,7 +584,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
546
584
|
"""
|
|
547
585
|
|
|
548
586
|
# Compose a filter that selects the default subnet in the AZ
|
|
549
|
-
filters = [{
|
|
587
|
+
filters: List[FilterTypeDef] = [{
|
|
550
588
|
'Name': 'default-for-az',
|
|
551
589
|
'Values': ['true']
|
|
552
590
|
}, {
|
|
@@ -586,7 +624,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
586
624
|
|
|
587
625
|
return 'aws'
|
|
588
626
|
|
|
589
|
-
def getNodeShape(self, instance_type: str, preemptible=False) -> Shape:
|
|
627
|
+
def getNodeShape(self, instance_type: str, preemptible: bool=False) -> Shape:
|
|
590
628
|
"""
|
|
591
629
|
Get the Shape for the given instance type (e.g. 't2.medium').
|
|
592
630
|
"""
|
|
@@ -603,13 +641,13 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
603
641
|
# mesos about whether a job can run on a particular node type
|
|
604
642
|
memory = (type_info.memory - 0.1) * 2 ** 30
|
|
605
643
|
return Shape(wallTime=60 * 60,
|
|
606
|
-
memory=memory,
|
|
644
|
+
memory=int(memory),
|
|
607
645
|
cores=type_info.cores,
|
|
608
|
-
disk=disk,
|
|
646
|
+
disk=int(disk),
|
|
609
647
|
preemptible=preemptible)
|
|
610
648
|
|
|
611
649
|
@staticmethod
|
|
612
|
-
def retryPredicate(e):
|
|
650
|
+
def retryPredicate(e: Exception) -> bool:
|
|
613
651
|
return awsRetryPredicate(e)
|
|
614
652
|
|
|
615
653
|
def destroyCluster(self) -> None:
|
|
@@ -619,8 +657,8 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
619
657
|
# The leader may create more instances while we're terminating the workers.
|
|
620
658
|
vpcId = None
|
|
621
659
|
try:
|
|
622
|
-
leader = self.
|
|
623
|
-
vpcId = leader.
|
|
660
|
+
leader = self._getLeaderInstanceBoto3()
|
|
661
|
+
vpcId = leader.get("VpcId")
|
|
624
662
|
logger.info('Terminating the leader first ...')
|
|
625
663
|
self._terminateInstances([leader])
|
|
626
664
|
except (NoSuchClusterException, InvalidClusterStateException):
|
|
@@ -651,14 +689,16 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
651
689
|
# Do the workers after the ASGs because some may belong to ASGs
|
|
652
690
|
logger.info('Terminating any remaining workers ...')
|
|
653
691
|
removed = False
|
|
654
|
-
instances = self.
|
|
692
|
+
instances = self._get_nodes_in_cluster_boto3(include_stopped_nodes=True)
|
|
655
693
|
spotIDs = self._getSpotRequestIDs()
|
|
694
|
+
boto3_ec2: EC2Client = self.aws.client(region=self._region, service_name="ec2")
|
|
656
695
|
if spotIDs:
|
|
657
|
-
|
|
696
|
+
boto3_ec2.cancel_spot_instance_requests(SpotInstanceRequestIds=spotIDs)
|
|
697
|
+
# self.aws.boto2(self._region, 'ec2').cancel_spot_instance_requests(request_ids=spotIDs)
|
|
658
698
|
removed = True
|
|
659
|
-
instancesToTerminate = awsFilterImpairedNodes(instances, self.aws.
|
|
699
|
+
instancesToTerminate = awsFilterImpairedNodes(instances, self.aws.client(self._region, 'ec2'))
|
|
660
700
|
if instancesToTerminate:
|
|
661
|
-
vpcId = vpcId or instancesToTerminate[0].
|
|
701
|
+
vpcId = vpcId or instancesToTerminate[0].get("VpcId")
|
|
662
702
|
self._terminateInstances(instancesToTerminate)
|
|
663
703
|
removed = True
|
|
664
704
|
if removed:
|
|
@@ -672,7 +712,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
672
712
|
# for some LuanchTemplate.
|
|
673
713
|
mistake = False
|
|
674
714
|
for ltID in self._get_launch_template_ids():
|
|
675
|
-
response =
|
|
715
|
+
response = boto3_ec2.delete_launch_template(LaunchTemplateId=ltID)
|
|
676
716
|
if 'LaunchTemplate' not in response:
|
|
677
717
|
mistake = True
|
|
678
718
|
else:
|
|
@@ -694,16 +734,17 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
694
734
|
removed = False
|
|
695
735
|
for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors):
|
|
696
736
|
with attempt:
|
|
697
|
-
|
|
737
|
+
security_groups: List[SecurityGroupTypeDef] = boto3_ec2.describe_security_groups()["SecurityGroups"]
|
|
738
|
+
for security_group in security_groups:
|
|
698
739
|
# TODO: If we terminate the leader and the workers but
|
|
699
740
|
# miss the security group, we won't find it now because
|
|
700
741
|
# we won't have vpcId set.
|
|
701
|
-
if
|
|
742
|
+
if security_group.get("GroupName") == self.clusterName and vpcId and security_group.get("VpcId") == vpcId:
|
|
702
743
|
try:
|
|
703
|
-
|
|
744
|
+
boto3_ec2.delete_security_group(GroupId=security_group["GroupId"])
|
|
704
745
|
removed = True
|
|
705
|
-
except
|
|
706
|
-
if e
|
|
746
|
+
except ClientError as e:
|
|
747
|
+
if get_error_code(e) == 'InvalidGroup.NotFound':
|
|
707
748
|
pass
|
|
708
749
|
else:
|
|
709
750
|
raise
|
|
@@ -777,10 +818,9 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
777
818
|
|
|
778
819
|
return spot_bid
|
|
779
820
|
|
|
780
|
-
def addNodes(self, nodeTypes: Set[str], numNodes, preemptible, spotBid=None) -> int:
|
|
821
|
+
def addNodes(self, nodeTypes: Set[str], numNodes: int, preemptible: bool, spotBid: Optional[float]=None) -> int:
|
|
781
822
|
# Grab the AWS connection we need
|
|
782
|
-
|
|
783
|
-
|
|
823
|
+
boto3_ec2 = get_client(service_name='ec2', region_name=self._region)
|
|
784
824
|
assert self._leaderPrivateIP
|
|
785
825
|
|
|
786
826
|
if preemptible:
|
|
@@ -792,7 +832,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
792
832
|
node_type = next(iter(nodeTypes))
|
|
793
833
|
type_info = E2Instances[node_type]
|
|
794
834
|
root_vol_size = self._nodeStorageOverrides.get(node_type, self._nodeStorage)
|
|
795
|
-
bdm = self.
|
|
835
|
+
bdm = self._getBoto3BlockDeviceMapping(type_info,
|
|
796
836
|
rootVolSize=root_vol_size)
|
|
797
837
|
|
|
798
838
|
# Pick a zone and subnet_id to launch into
|
|
@@ -803,7 +843,7 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
803
843
|
# We're allowed to pick from any of these zones.
|
|
804
844
|
zone_options = list(self._worker_subnets_by_zone.keys())
|
|
805
845
|
|
|
806
|
-
zone = get_best_aws_zone(spotBid, type_info.name,
|
|
846
|
+
zone = get_best_aws_zone(spotBid, type_info.name, boto3_ec2, zone_options)
|
|
807
847
|
else:
|
|
808
848
|
# We don't need to ever do any balancing across zones for on-demand
|
|
809
849
|
# instances. Just pick a zone.
|
|
@@ -814,6 +854,9 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
814
854
|
# The workers aren't allowed in the leader's zone.
|
|
815
855
|
# Pick an arbitrary zone we can use.
|
|
816
856
|
zone = next(iter(self._worker_subnets_by_zone.keys()))
|
|
857
|
+
if zone is None:
|
|
858
|
+
logger.exception("Could not find a valid zone. Make sure TOIL_AWS_ZONE is set or spot bids are not too low.")
|
|
859
|
+
raise NoSuchZoneException()
|
|
817
860
|
if self._leader_subnet in self._worker_subnets_by_zone.get(zone, []):
|
|
818
861
|
# The leader's subnet is an option for this zone, so use it.
|
|
819
862
|
subnet_id = self._leader_subnet
|
|
@@ -822,21 +865,40 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
822
865
|
subnet_id = next(iter(self._worker_subnets_by_zone[zone]))
|
|
823
866
|
|
|
824
867
|
keyPath = self._sseKey if self._sseKey else None
|
|
825
|
-
userData = self._getIgnitionUserData('worker', keyPath, preemptible, self._architecture)
|
|
868
|
+
userData: str = self._getIgnitionUserData('worker', keyPath, preemptible, self._architecture)
|
|
869
|
+
userDataBytes: bytes = b""
|
|
826
870
|
if isinstance(userData, str):
|
|
827
871
|
# Spot-market provisioning requires bytes for user data.
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
872
|
+
userDataBytes = userData.encode('utf-8')
|
|
873
|
+
|
|
874
|
+
spot_kwargs = {'KeyName': self._keyName,
|
|
875
|
+
'LaunchSpecification': {
|
|
876
|
+
'SecurityGroupIds': self._getSecurityGroupIDs(),
|
|
877
|
+
'InstanceType': type_info.name,
|
|
878
|
+
'UserData': userDataBytes,
|
|
879
|
+
'BlockDeviceMappings': bdm,
|
|
880
|
+
'IamInstanceProfile': {
|
|
881
|
+
'Arn': self._leaderProfileArn
|
|
882
|
+
},
|
|
883
|
+
'Placement': {
|
|
884
|
+
'AvailabilityZone': zone
|
|
885
|
+
},
|
|
886
|
+
'SubnetId': subnet_id}
|
|
887
|
+
}
|
|
888
|
+
on_demand_kwargs = {'KeyName': self._keyName,
|
|
889
|
+
'SecurityGroupIds': self._getSecurityGroupIDs(),
|
|
890
|
+
'InstanceType': type_info.name,
|
|
891
|
+
'UserData': userDataBytes,
|
|
892
|
+
'BlockDeviceMappings': bdm,
|
|
893
|
+
'IamInstanceProfile': {
|
|
894
|
+
'Arn': self._leaderProfileArn
|
|
895
|
+
},
|
|
896
|
+
'Placement': {
|
|
897
|
+
'AvailabilityZone': zone
|
|
898
|
+
},
|
|
899
|
+
'SubnetId': subnet_id}
|
|
900
|
+
|
|
901
|
+
instancesLaunched: List[InstanceTypeDef] = []
|
|
840
902
|
|
|
841
903
|
for attempt in old_retry(predicate=awsRetryPredicate):
|
|
842
904
|
with attempt:
|
|
@@ -845,41 +907,45 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
845
907
|
# every request in this method
|
|
846
908
|
if not preemptible:
|
|
847
909
|
logger.debug('Launching %s non-preemptible nodes', numNodes)
|
|
848
|
-
instancesLaunched = create_ondemand_instances(
|
|
910
|
+
instancesLaunched = create_ondemand_instances(boto3_ec2=boto3_ec2,
|
|
849
911
|
image_id=self._discoverAMI(),
|
|
850
|
-
spec=
|
|
912
|
+
spec=on_demand_kwargs, num_instances=numNodes)
|
|
851
913
|
else:
|
|
852
914
|
logger.debug('Launching %s preemptible nodes', numNodes)
|
|
853
915
|
# force generator to evaluate
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
916
|
+
generatedInstancesLaunched: List[DescribeInstancesResultTypeDef] = list(create_spot_instances(boto3_ec2=boto3_ec2,
|
|
917
|
+
price=spotBid,
|
|
918
|
+
image_id=self._discoverAMI(),
|
|
919
|
+
tags={_TAG_KEY_TOIL_CLUSTER_NAME: self.clusterName},
|
|
920
|
+
spec=spot_kwargs,
|
|
921
|
+
num_instances=numNodes,
|
|
922
|
+
tentative=True)
|
|
923
|
+
)
|
|
862
924
|
# flatten the list
|
|
863
|
-
|
|
925
|
+
flatten_reservations: List[ReservationTypeDef] = [reservation for subdict in generatedInstancesLaunched for reservation in subdict["Reservations"] for key, value in subdict.items()]
|
|
926
|
+
# get a flattened list of all requested instances, as before instancesLaunched is a dict of reservations which is a dict of instance requests
|
|
927
|
+
instancesLaunched = [instance for instances in flatten_reservations for instance in instances['Instances']]
|
|
864
928
|
|
|
865
929
|
for attempt in old_retry(predicate=awsRetryPredicate):
|
|
866
930
|
with attempt:
|
|
867
|
-
wait_instances_running(
|
|
931
|
+
list(wait_instances_running(boto3_ec2, instancesLaunched)) # ensure all instances are running
|
|
932
|
+
|
|
933
|
+
increase_instance_hop_limit(boto3_ec2, instancesLaunched)
|
|
868
934
|
|
|
869
935
|
self._tags[_TAG_KEY_TOIL_NODE_TYPE] = 'worker'
|
|
870
|
-
AWSProvisioner._addTags(instancesLaunched, self._tags)
|
|
936
|
+
AWSProvisioner._addTags(boto3_ec2, instancesLaunched, self._tags)
|
|
871
937
|
if self._sseKey:
|
|
872
938
|
for i in instancesLaunched:
|
|
873
939
|
self._waitForIP(i)
|
|
874
|
-
node = Node(publicIP=i
|
|
875
|
-
launchTime=i
|
|
876
|
-
tags=i
|
|
940
|
+
node = Node(publicIP=i['PublicIpAddress'], privateIP=i['PrivateIpAddress'], name=i['InstanceId'],
|
|
941
|
+
launchTime=i['LaunchTime'], nodeType=i['InstanceType'], preemptible=preemptible,
|
|
942
|
+
tags=collapse_tags(i['Tags']))
|
|
877
943
|
node.waitForNode('toil_worker')
|
|
878
944
|
node.coreRsync([self._sseKey, ':' + self._sseKey], applianceName='toil_worker')
|
|
879
945
|
logger.debug('Launched %s new instance(s)', numNodes)
|
|
880
946
|
return len(instancesLaunched)
|
|
881
947
|
|
|
882
|
-
def addManagedNodes(self, nodeTypes: Set[str], minNodes, maxNodes, preemptible, spotBid=None) -> None:
|
|
948
|
+
def addManagedNodes(self, nodeTypes: Set[str], minNodes: int, maxNodes: int, preemptible: bool, spotBid: Optional[float] = None) -> None:
|
|
883
949
|
|
|
884
950
|
if self.clusterType != 'kubernetes':
|
|
885
951
|
raise ManagedNodesNotSupportedException("Managed nodes only supported for Kubernetes clusters")
|
|
@@ -901,17 +967,17 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
901
967
|
|
|
902
968
|
def getProvisionedWorkers(self, instance_type: Optional[str] = None, preemptible: Optional[bool] = None) -> List[Node]:
|
|
903
969
|
assert self._leaderPrivateIP
|
|
904
|
-
entireCluster = self.
|
|
970
|
+
entireCluster = self._get_nodes_in_cluster_boto3(instance_type=instance_type)
|
|
905
971
|
logger.debug('All nodes in cluster: %s', entireCluster)
|
|
906
|
-
workerInstances = [i for i in entireCluster if i
|
|
972
|
+
workerInstances: List[InstanceTypeDef] = [i for i in entireCluster if i["PrivateIpAddress"] != self._leaderPrivateIP]
|
|
907
973
|
logger.debug('All workers found in cluster: %s', workerInstances)
|
|
908
974
|
if preemptible is not None:
|
|
909
|
-
workerInstances = [i for i in workerInstances if preemptible == (i
|
|
975
|
+
workerInstances = [i for i in workerInstances if preemptible == (i["SpotInstanceRequestId"] is not None)]
|
|
910
976
|
logger.debug('%spreemptible workers found in cluster: %s', 'non-' if not preemptible else '', workerInstances)
|
|
911
|
-
workerInstances = awsFilterImpairedNodes(workerInstances, self.aws.
|
|
912
|
-
return [Node(publicIP=i
|
|
913
|
-
name=i
|
|
914
|
-
preemptible=i
|
|
977
|
+
workerInstances = awsFilterImpairedNodes(workerInstances, self.aws.client(self._region, 'ec2'))
|
|
978
|
+
return [Node(publicIP=i["PublicIpAddress"], privateIP=i["PrivateIpAddress"],
|
|
979
|
+
name=i["InstanceId"], launchTime=i["LaunchTime"], nodeType=i["InstanceType"],
|
|
980
|
+
preemptible=i["SpotInstanceRequestId"] is not None, tags=collapse_tags(i["Tags"]))
|
|
915
981
|
for i in workerInstances]
|
|
916
982
|
|
|
917
983
|
@memoize
|
|
@@ -952,37 +1018,65 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
952
1018
|
denamespaced = '/' + '_'.join(s.replace('_', '/') for s in namespaced_name.split('__'))
|
|
953
1019
|
return denamespaced.startswith(self._toNameSpace())
|
|
954
1020
|
|
|
1021
|
+
def _getLeaderInstanceBoto3(self) -> InstanceTypeDef:
|
|
1022
|
+
"""
|
|
1023
|
+
Get the Boto 3 instance for the cluster's leader.
|
|
1024
|
+
:return: InstanceTypeDef
|
|
1025
|
+
"""
|
|
1026
|
+
# Tags are stored differently in Boto 3
|
|
1027
|
+
instances: List[InstanceTypeDef] = self._get_nodes_in_cluster_boto3(include_stopped_nodes=True)
|
|
1028
|
+
instances.sort(key=lambda x: x["LaunchTime"])
|
|
1029
|
+
try:
|
|
1030
|
+
leader = instances[0] # assume leader was launched first
|
|
1031
|
+
except IndexError:
|
|
1032
|
+
raise NoSuchClusterException(self.clusterName)
|
|
1033
|
+
if leader.get("Tags") is not None:
|
|
1034
|
+
tag_value = next(item["Value"] for item in leader["Tags"] if item["Key"] == _TAG_KEY_TOIL_NODE_TYPE)
|
|
1035
|
+
else:
|
|
1036
|
+
tag_value = None
|
|
1037
|
+
if (tag_value or 'leader') != 'leader':
|
|
1038
|
+
raise InvalidClusterStateException(
|
|
1039
|
+
'Invalid cluster state! The first launched instance appears not to be the leader '
|
|
1040
|
+
'as it is missing the "leader" tag. The safest recovery is to destroy the cluster '
|
|
1041
|
+
'and restart the job. Incorrect Leader ID: %s' % leader["InstanceId"]
|
|
1042
|
+
)
|
|
1043
|
+
return leader
|
|
955
1044
|
|
|
956
|
-
def _getLeaderInstance(self) ->
|
|
1045
|
+
def _getLeaderInstance(self) -> InstanceTypeDef:
|
|
957
1046
|
"""
|
|
958
1047
|
Get the Boto 2 instance for the cluster's leader.
|
|
959
1048
|
"""
|
|
960
|
-
instances = self.
|
|
961
|
-
instances.sort(key=lambda x: x
|
|
1049
|
+
instances = self._get_nodes_in_cluster_boto3(include_stopped_nodes=True)
|
|
1050
|
+
instances.sort(key=lambda x: x["LaunchTime"])
|
|
962
1051
|
try:
|
|
963
|
-
leader = instances[0] # assume leader was launched first
|
|
1052
|
+
leader: InstanceTypeDef = instances[0] # assume leader was launched first
|
|
964
1053
|
except IndexError:
|
|
965
1054
|
raise NoSuchClusterException(self.clusterName)
|
|
966
|
-
|
|
1055
|
+
tagged_node_type: str = 'leader'
|
|
1056
|
+
for tag in leader["Tags"]:
|
|
1057
|
+
# If a tag specifying node type exists,
|
|
1058
|
+
if tag.get("Key") is not None and tag["Key"] == _TAG_KEY_TOIL_NODE_TYPE:
|
|
1059
|
+
tagged_node_type = tag["Value"]
|
|
1060
|
+
if tagged_node_type != 'leader':
|
|
967
1061
|
raise InvalidClusterStateException(
|
|
968
1062
|
'Invalid cluster state! The first launched instance appears not to be the leader '
|
|
969
1063
|
'as it is missing the "leader" tag. The safest recovery is to destroy the cluster '
|
|
970
|
-
'and restart the job. Incorrect Leader ID: %s' % leader
|
|
1064
|
+
'and restart the job. Incorrect Leader ID: %s' % leader["InstanceId"]
|
|
971
1065
|
)
|
|
972
1066
|
return leader
|
|
973
1067
|
|
|
974
|
-
def getLeader(self, wait=False) -> Node:
|
|
1068
|
+
def getLeader(self, wait: bool = False) -> Node:
|
|
975
1069
|
"""
|
|
976
1070
|
Get the leader for the cluster as a Toil Node object.
|
|
977
1071
|
"""
|
|
978
|
-
leader = self.
|
|
1072
|
+
leader: InstanceTypeDef = self._getLeaderInstanceBoto3()
|
|
979
1073
|
|
|
980
|
-
leaderNode = Node(publicIP=leader
|
|
981
|
-
name=leader
|
|
982
|
-
preemptible=False, tags=leader
|
|
1074
|
+
leaderNode = Node(publicIP=leader["PublicIpAddress"], privateIP=leader["PrivateIpAddress"],
|
|
1075
|
+
name=leader["InstanceId"], launchTime=leader["LaunchTime"], nodeType=None,
|
|
1076
|
+
preemptible=False, tags=collapse_tags(leader["Tags"]))
|
|
983
1077
|
if wait:
|
|
984
1078
|
logger.debug("Waiting for toil_leader to enter 'running' state...")
|
|
985
|
-
wait_instances_running(self.aws.
|
|
1079
|
+
wait_instances_running(self.aws.client(self._region, 'ec2'), [leader])
|
|
986
1080
|
logger.debug('... toil_leader is running')
|
|
987
1081
|
self._waitForIP(leader)
|
|
988
1082
|
leaderNode.waitForNode('toil_leader')
|
|
@@ -991,17 +1085,20 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
991
1085
|
|
|
992
1086
|
@classmethod
|
|
993
1087
|
@awsRetry
|
|
994
|
-
def _addTag(cls, instance:
|
|
995
|
-
instance.
|
|
1088
|
+
def _addTag(cls, boto3_ec2: EC2Client, instance: InstanceTypeDef, key: str, value: str) -> None:
|
|
1089
|
+
if instance.get('Tags') is None:
|
|
1090
|
+
instance['Tags'] = []
|
|
1091
|
+
new_tag: TagTypeDef = {"Key": key, "Value": value}
|
|
1092
|
+
boto3_ec2.create_tags(Resources=[instance["InstanceId"]], Tags=[new_tag])
|
|
996
1093
|
|
|
997
1094
|
@classmethod
|
|
998
|
-
def _addTags(cls, instances: List[
|
|
1095
|
+
def _addTags(cls, boto3_ec2: EC2Client, instances: List[InstanceTypeDef], tags: Dict[str, str]) -> None:
|
|
999
1096
|
for instance in instances:
|
|
1000
1097
|
for key, value in tags.items():
|
|
1001
|
-
cls._addTag(instance, key, value)
|
|
1098
|
+
cls._addTag(boto3_ec2, instance, key, value)
|
|
1002
1099
|
|
|
1003
1100
|
@classmethod
|
|
1004
|
-
def _waitForIP(cls, instance:
|
|
1101
|
+
def _waitForIP(cls, instance: InstanceTypeDef) -> None:
|
|
1005
1102
|
"""
|
|
1006
1103
|
Wait until the instances has a public IP address assigned to it.
|
|
1007
1104
|
|
|
@@ -1010,32 +1107,32 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
1010
1107
|
logger.debug('Waiting for ip...')
|
|
1011
1108
|
while True:
|
|
1012
1109
|
time.sleep(a_short_time)
|
|
1013
|
-
instance.
|
|
1014
|
-
if instance.ip_address or instance.public_dns_name or instance.private_ip_address:
|
|
1110
|
+
if instance.get("PublicIpAddress") or instance.get("PublicDnsName") or instance.get("PrivateIpAddress"):
|
|
1015
1111
|
logger.debug('...got ip')
|
|
1016
1112
|
break
|
|
1017
1113
|
|
|
1018
|
-
def _terminateInstances(self, instances: List[
|
|
1019
|
-
instanceIDs = [x
|
|
1114
|
+
def _terminateInstances(self, instances: List[InstanceTypeDef]) -> None:
|
|
1115
|
+
instanceIDs = [x["InstanceId"] for x in instances]
|
|
1020
1116
|
self._terminateIDs(instanceIDs)
|
|
1021
1117
|
logger.info('... Waiting for instance(s) to shut down...')
|
|
1022
1118
|
for instance in instances:
|
|
1023
|
-
wait_transition(instance, {'pending', 'running', 'shutting-down', 'stopping', 'stopped'}, 'terminated')
|
|
1119
|
+
wait_transition(self.aws.client(region=self._region, service_name="ec2"), instance, {'pending', 'running', 'shutting-down', 'stopping', 'stopped'}, 'terminated')
|
|
1024
1120
|
logger.info('Instance(s) terminated.')
|
|
1025
1121
|
|
|
1026
1122
|
@awsRetry
|
|
1027
|
-
def _terminateIDs(self, instanceIDs: List[str]):
|
|
1123
|
+
def _terminateIDs(self, instanceIDs: List[str]) -> None:
|
|
1028
1124
|
logger.info('Terminating instance(s): %s', instanceIDs)
|
|
1029
|
-
self.aws.
|
|
1125
|
+
boto3_ec2 = self.aws.client(region=self._region, service_name="ec2")
|
|
1126
|
+
boto3_ec2.terminate_instances(InstanceIds=instanceIDs)
|
|
1030
1127
|
logger.info('Instance(s) terminated.')
|
|
1031
1128
|
|
|
1032
1129
|
@awsRetry
|
|
1033
|
-
def _deleteRoles(self, names: List[str]):
|
|
1130
|
+
def _deleteRoles(self, names: List[str]) -> None:
|
|
1034
1131
|
"""
|
|
1035
1132
|
Delete all the given named IAM roles.
|
|
1036
1133
|
Detatches but does not delete associated instance profiles.
|
|
1037
1134
|
"""
|
|
1038
|
-
|
|
1135
|
+
boto3_iam = self.aws.client(region=self._region, service_name="iam")
|
|
1039
1136
|
for role_name in names:
|
|
1040
1137
|
for profile_name in self._getRoleInstanceProfileNames(role_name):
|
|
1041
1138
|
# We can't delete either the role or the profile while they
|
|
@@ -1043,60 +1140,64 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
1043
1140
|
|
|
1044
1141
|
for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors):
|
|
1045
1142
|
with attempt:
|
|
1046
|
-
|
|
1047
|
-
|
|
1143
|
+
boto3_iam.remove_role_from_instance_profile(InstanceProfileName=profile_name,
|
|
1144
|
+
RoleName=role_name)
|
|
1048
1145
|
# We also need to drop all inline policies
|
|
1049
1146
|
for policy_name in self._getRoleInlinePolicyNames(role_name):
|
|
1050
1147
|
for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors):
|
|
1051
1148
|
with attempt:
|
|
1052
|
-
|
|
1053
|
-
|
|
1149
|
+
boto3_iam.delete_role_policy(PolicyName=policy_name,
|
|
1150
|
+
RoleName=role_name)
|
|
1054
1151
|
|
|
1055
1152
|
for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors):
|
|
1056
1153
|
with attempt:
|
|
1057
|
-
|
|
1154
|
+
boto3_iam.delete_role(RoleName=role_name)
|
|
1058
1155
|
logger.debug('... Successfully deleted IAM role %s', role_name)
|
|
1059
1156
|
|
|
1060
|
-
|
|
1061
1157
|
@awsRetry
|
|
1062
|
-
def _deleteInstanceProfiles(self, names: List[str]):
|
|
1158
|
+
def _deleteInstanceProfiles(self, names: List[str]) -> None:
|
|
1063
1159
|
"""
|
|
1064
1160
|
Delete all the given named IAM instance profiles.
|
|
1065
1161
|
All roles must already be detached.
|
|
1066
1162
|
"""
|
|
1067
|
-
|
|
1163
|
+
boto3_iam = self.aws.client(region=self._region, service_name="iam")
|
|
1068
1164
|
for profile_name in names:
|
|
1069
1165
|
for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors):
|
|
1070
1166
|
with attempt:
|
|
1071
|
-
|
|
1167
|
+
boto3_iam.delete_instance_profile(InstanceProfileName=profile_name)
|
|
1072
1168
|
logger.debug('... Succesfully deleted instance profile %s', profile_name)
|
|
1073
1169
|
|
|
1074
1170
|
@classmethod
|
|
1075
|
-
def
|
|
1171
|
+
def _getBoto3BlockDeviceMapping(cls, type_info: InstanceType, rootVolSize: int = 50) -> List[BlockDeviceMappingTypeDef]:
|
|
1076
1172
|
# determine number of ephemeral drives via cgcloud-lib (actually this is moved into toil's lib
|
|
1077
1173
|
bdtKeys = [''] + [f'/dev/xvd{c}' for c in string.ascii_lowercase[1:]]
|
|
1078
|
-
|
|
1174
|
+
bdm_list: List[BlockDeviceMappingTypeDef] = []
|
|
1079
1175
|
# Change root volume size to allow for bigger Docker instances
|
|
1080
|
-
root_vol =
|
|
1081
|
-
|
|
1082
|
-
bdm
|
|
1176
|
+
root_vol: EbsBlockDeviceTypeDef = {"DeleteOnTermination": True,
|
|
1177
|
+
"VolumeSize": rootVolSize}
|
|
1178
|
+
bdm: BlockDeviceMappingTypeDef = {"DeviceName": "/dev/xvda", "Ebs": root_vol}
|
|
1179
|
+
bdm_list.append(bdm)
|
|
1083
1180
|
# The first disk is already attached for us so start with 2nd.
|
|
1084
1181
|
# Disk count is weirdly a float in our instance database, so make it an int here.
|
|
1085
1182
|
for disk in range(1, int(type_info.disks) + 1):
|
|
1086
|
-
bdm
|
|
1087
|
-
|
|
1183
|
+
bdm = {}
|
|
1184
|
+
bdm["DeviceName"] = bdtKeys[disk]
|
|
1185
|
+
bdm["VirtualName"] = f"ephemeral{disk - 1}" # ephemeral counts start at 0
|
|
1186
|
+
bdm["Ebs"] = root_vol # default
|
|
1187
|
+
# bdm["Ebs"] = root_vol.update({"VirtualName": f"ephemeral{disk - 1}"})
|
|
1188
|
+
bdm_list.append(bdm)
|
|
1088
1189
|
|
|
1089
|
-
logger.debug('Device mapping: %s',
|
|
1090
|
-
return
|
|
1190
|
+
logger.debug('Device mapping: %s', bdm_list)
|
|
1191
|
+
return bdm_list
|
|
1091
1192
|
|
|
1092
1193
|
@classmethod
|
|
1093
|
-
def _getBoto3BlockDeviceMappings(cls, type_info: InstanceType, rootVolSize: int = 50) -> List[
|
|
1194
|
+
def _getBoto3BlockDeviceMappings(cls, type_info: InstanceType, rootVolSize: int = 50) -> List[BlockDeviceMappingTypeDef]:
|
|
1094
1195
|
"""
|
|
1095
1196
|
Get block device mappings for the root volume for a worker.
|
|
1096
1197
|
"""
|
|
1097
1198
|
|
|
1098
1199
|
# Start with the root
|
|
1099
|
-
bdms = [{
|
|
1200
|
+
bdms: List[BlockDeviceMappingTypeDef] = [{
|
|
1100
1201
|
'DeviceName': '/dev/xvda',
|
|
1101
1202
|
'Ebs': {
|
|
1102
1203
|
'DeleteOnTermination': True,
|
|
@@ -1121,96 +1222,98 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
1121
1222
|
return bdms
|
|
1122
1223
|
|
|
1123
1224
|
@awsRetry
|
|
1124
|
-
def
|
|
1225
|
+
def _get_nodes_in_cluster_boto3(self, instance_type: Optional[str] = None, include_stopped_nodes: bool = False) -> List[InstanceTypeDef]:
|
|
1125
1226
|
"""
|
|
1126
|
-
Get
|
|
1227
|
+
Get Boto3 instance objects for all nodes in the cluster.
|
|
1127
1228
|
"""
|
|
1229
|
+
boto3_ec2: EC2Client = self.aws.client(region=self._region, service_name='ec2')
|
|
1230
|
+
instance_filter: FilterTypeDef = {'Name': 'instance.group-name', 'Values': [self.clusterName]}
|
|
1231
|
+
describe_response: DescribeInstancesResultTypeDef = boto3_ec2.describe_instances(Filters=[instance_filter])
|
|
1232
|
+
all_instances: List[InstanceTypeDef] = []
|
|
1233
|
+
for reservation in describe_response['Reservations']:
|
|
1234
|
+
instances = reservation['Instances']
|
|
1235
|
+
all_instances.extend(instances)
|
|
1128
1236
|
|
|
1129
|
-
all_instances = self.aws.boto2(self._region, 'ec2').get_only_instances(filters={'instance.group-name': self.clusterName})
|
|
1237
|
+
# all_instances = self.aws.boto2(self._region, 'ec2').get_only_instances(filters={'instance.group-name': self.clusterName})
|
|
1130
1238
|
|
|
1131
|
-
def instanceFilter(i):
|
|
1239
|
+
def instanceFilter(i: InstanceTypeDef) -> bool:
|
|
1132
1240
|
# filter by type only if nodeType is true
|
|
1133
|
-
rightType = not instance_type or i
|
|
1134
|
-
rightState = i
|
|
1241
|
+
rightType = not instance_type or i['InstanceType'] == instance_type
|
|
1242
|
+
rightState = i['State']['Name'] == 'running' or i['State']['Name'] == 'pending'
|
|
1135
1243
|
if include_stopped_nodes:
|
|
1136
|
-
rightState = rightState or i
|
|
1244
|
+
rightState = rightState or i['State']['Name'] == 'stopping' or i['State']['Name'] == 'stopped'
|
|
1137
1245
|
return rightType and rightState
|
|
1138
1246
|
|
|
1139
1247
|
return [i for i in all_instances if instanceFilter(i)]
|
|
1140
1248
|
|
|
1141
|
-
def _filter_nodes_in_cluster(self, instance_type: Optional[str] = None, preemptible: bool = False) -> List[Boto2Instance]:
|
|
1142
|
-
"""
|
|
1143
|
-
Get Boto2 instance objects for the nodes in the cluster filtered by preemptability.
|
|
1144
|
-
"""
|
|
1145
|
-
|
|
1146
|
-
instances = self._get_nodes_in_cluster(instance_type, include_stopped_nodes=False)
|
|
1147
|
-
|
|
1148
|
-
if preemptible:
|
|
1149
|
-
return [i for i in instances if i.spot_instance_request_id is not None]
|
|
1150
|
-
|
|
1151
|
-
return [i for i in instances if i.spot_instance_request_id is None]
|
|
1152
|
-
|
|
1153
1249
|
def _getSpotRequestIDs(self) -> List[str]:
|
|
1154
1250
|
"""
|
|
1155
1251
|
Get the IDs of all spot requests associated with the cluster.
|
|
1156
1252
|
"""
|
|
1157
1253
|
|
|
1158
1254
|
# Grab the connection we need to use for this operation.
|
|
1159
|
-
ec2 = self.aws.
|
|
1255
|
+
ec2: EC2Client = self.aws.client(self._region, 'ec2')
|
|
1160
1256
|
|
|
1161
|
-
requests = ec2.
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1257
|
+
requests: List[SpotInstanceRequestTypeDef] = ec2.describe_spot_instance_requests()["SpotInstanceRequests"]
|
|
1258
|
+
tag_filter: FilterTypeDef = {"Name": "tag:" + _TAG_KEY_TOIL_CLUSTER_NAME, "Values": [self.clusterName]}
|
|
1259
|
+
tags: List[TagDescriptionTypeDef] = ec2.describe_tags(Filters=[tag_filter])["Tags"]
|
|
1260
|
+
idsToCancel = [tag["ResourceId"] for tag in tags]
|
|
1261
|
+
return [request["SpotInstanceRequestId"] for request in requests if request["InstanceId"] in idsToCancel]
|
|
1165
1262
|
|
|
1166
1263
|
def _createSecurityGroups(self) -> List[str]:
|
|
1167
1264
|
"""
|
|
1168
1265
|
Create security groups for the cluster. Returns a list of their IDs.
|
|
1169
1266
|
"""
|
|
1267
|
+
def group_not_found(e: ClientError) -> bool:
|
|
1268
|
+
retry = (get_error_status(e) == 400 and 'does not exist in default VPC' in get_error_body(e))
|
|
1269
|
+
return retry
|
|
1170
1270
|
|
|
1171
1271
|
# Grab the connection we need to use for this operation.
|
|
1172
1272
|
# The VPC connection can do anything the EC2 one can do, but also look at subnets.
|
|
1173
|
-
|
|
1273
|
+
boto3_ec2: EC2Client = self.aws.client(region=self._region, service_name="ec2")
|
|
1174
1274
|
|
|
1175
|
-
|
|
1176
|
-
retry = (e.status == 400 and 'does not exist in default VPC' in e.body)
|
|
1177
|
-
return retry
|
|
1178
|
-
# Security groups need to belong to the same VPC as the leader. If we
|
|
1179
|
-
# put the leader in a particular non-default subnet, it may be in a
|
|
1180
|
-
# particular non-default VPC, which we need to know about.
|
|
1181
|
-
vpcId = None
|
|
1275
|
+
vpc_id = None
|
|
1182
1276
|
if self._leader_subnet:
|
|
1183
|
-
subnets =
|
|
1277
|
+
subnets = boto3_ec2.describe_subnets(SubnetIds=[self._leader_subnet])["Subnets"]
|
|
1184
1278
|
if len(subnets) > 0:
|
|
1185
|
-
|
|
1186
|
-
# security group create/get. ssh + all ports open within the group
|
|
1279
|
+
vpc_id = subnets[0]["VpcId"]
|
|
1187
1280
|
try:
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1281
|
+
# Security groups need to belong to the same VPC as the leader. If we
|
|
1282
|
+
# put the leader in a particular non-default subnet, it may be in a
|
|
1283
|
+
# particular non-default VPC, which we need to know about.
|
|
1284
|
+
other = {"GroupName": self.clusterName, "Description": "Toil appliance security group"}
|
|
1285
|
+
if vpc_id is not None:
|
|
1286
|
+
other["VpcId"] = vpc_id
|
|
1287
|
+
# mypy stubs don't explicitly state kwargs even though documentation allows it, and mypy gets confused
|
|
1288
|
+
web_response: CreateSecurityGroupResultTypeDef = boto3_ec2.create_security_group(**other) # type: ignore[arg-type]
|
|
1289
|
+
except ClientError as e:
|
|
1290
|
+
if get_error_status(e) == 400 and 'already exists' in get_error_body(e):
|
|
1291
|
+
pass
|
|
1193
1292
|
else:
|
|
1194
1293
|
raise
|
|
1195
1294
|
else:
|
|
1196
|
-
for attempt in old_retry(predicate=
|
|
1197
|
-
with attempt:
|
|
1198
|
-
# open port 22 for ssh-ing
|
|
1199
|
-
web.authorize(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0')
|
|
1200
|
-
# TODO: boto2 doesn't support IPv6 here but we need to.
|
|
1201
|
-
for attempt in old_retry(predicate=groupNotFound, timeout=300):
|
|
1202
|
-
with attempt:
|
|
1203
|
-
# the following authorizes all TCP access within the web security group
|
|
1204
|
-
web.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web)
|
|
1205
|
-
for attempt in old_retry(predicate=groupNotFound, timeout=300):
|
|
1295
|
+
for attempt in old_retry(predicate=group_not_found, timeout=300):
|
|
1206
1296
|
with attempt:
|
|
1207
|
-
|
|
1208
|
-
|
|
1297
|
+
ip_permissions: List[IpPermissionTypeDef] = [{"IpProtocol": "tcp",
|
|
1298
|
+
"FromPort": 22,
|
|
1299
|
+
"ToPort": 22,
|
|
1300
|
+
"IpRanges": [
|
|
1301
|
+
{"CidrIp": "0.0.0.0/0"}
|
|
1302
|
+
],
|
|
1303
|
+
"Ipv6Ranges": [{"CidrIpv6": "::/0"}]}]
|
|
1304
|
+
for protocol in ("tcp", "udp"):
|
|
1305
|
+
ip_permissions.append({"IpProtocol": protocol,
|
|
1306
|
+
"FromPort": 0,
|
|
1307
|
+
"ToPort": 65535,
|
|
1308
|
+
"UserIdGroupPairs":
|
|
1309
|
+
[{"GroupId": web_response["GroupId"],
|
|
1310
|
+
"GroupName": self.clusterName}]})
|
|
1311
|
+
boto3_ec2.authorize_security_group_ingress(IpPermissions=ip_permissions, GroupName=self.clusterName, GroupId=web_response["GroupId"])
|
|
1209
1312
|
out = []
|
|
1210
|
-
for sg in
|
|
1211
|
-
if sg
|
|
1212
|
-
out.append(sg)
|
|
1213
|
-
return
|
|
1313
|
+
for sg in boto3_ec2.describe_security_groups()["SecurityGroups"]:
|
|
1314
|
+
if sg["GroupName"] == self.clusterName and (vpc_id is None or sg["VpcId"] == vpc_id):
|
|
1315
|
+
out.append(sg["GroupId"])
|
|
1316
|
+
return out
|
|
1214
1317
|
|
|
1215
1318
|
@awsRetry
|
|
1216
1319
|
def _getSecurityGroupIDs(self) -> List[str]:
|
|
@@ -1222,13 +1325,13 @@ class AWSProvisioner(AbstractProvisioner):
|
|
|
1222
1325
|
|
|
1223
1326
|
# Depending on if we enumerated them on the leader or locally, we might
|
|
1224
1327
|
# know the required security groups by name, ID, or both.
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1328
|
+
boto3_ec2 = self.aws.client(region=self._region, service_name='ec2')
|
|
1329
|
+
return [sg["GroupId"] for sg in boto3_ec2.describe_security_groups()["SecurityGroups"]
|
|
1330
|
+
if (sg["GroupName"] in self._leaderSecurityGroupNames or
|
|
1331
|
+
sg["GroupId"] in self._leaderSecurityGroupIDs)]

    @awsRetry
-    def _get_launch_template_ids(self, filters: Optional[List[
+    def _get_launch_template_ids(self, filters: Optional[List[FilterTypeDef]] = None) -> List[str]:
        """
        Find all launch templates associated with the cluster.

@@ -1236,10 +1339,10 @@ class AWSProvisioner(AbstractProvisioner):
        """

        # Grab the connection we need to use for this operation.
-        ec2 = self.aws.client(self._region, 'ec2')
+        ec2: EC2Client = self.aws.client(self._region, 'ec2')

        # How do we match the right templates?
-        combined_filters = [{'Name': 'tag:' + _TAG_KEY_TOIL_CLUSTER_NAME, 'Values': [self.clusterName]}]
+        combined_filters: List[FilterTypeDef] = [{'Name': 'tag:' + _TAG_KEY_TOIL_CLUSTER_NAME, 'Values': [self.clusterName]}]

        if filters:
            # Add any user-specified filters
@@ -1254,7 +1357,7 @@ class AWSProvisioner(AbstractProvisioner):
            allTemplateIDs += [item['LaunchTemplateId'] for item in response.get('LaunchTemplates', [])]
            if 'NextToken' in response:
                # There are more pages. Get the next one, supplying the token.
-                response = ec2.describe_launch_templates(Filters=filters,
+                response = ec2.describe_launch_templates(Filters=filters or [],
                                                          NextToken=response['NextToken'],
                                                          MaxResults=200)
            else:
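The NextToken check above is boto3's manual pagination style; the same listing can also be expressed with the client's built-in paginator, which hides the token handling. A sketch of that alternative, where the region, tag key, and values stand in for the cluster-name tag used here:

    import boto3

    ec2 = boto3.client("ec2", region_name="us-west-2")  # placeholder region
    template_ids = []
    paginator = ec2.get_paginator("describe_launch_templates")
    for page in paginator.paginate(
            Filters=[{"Name": "tag:clusterName", "Values": ["my-cluster"]}]):  # placeholder tag key/value
        template_ids += [lt["LaunchTemplateId"] for lt in page.get("LaunchTemplates", [])]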
@@ -1286,10 +1389,10 @@ class AWSProvisioner(AbstractProvisioner):
        lt_name = self._name_worker_launch_template(instance_type, preemptible=preemptible)

        # How do we match the right templates?
-        filters = [{'Name': 'launch-template-name', 'Values': [lt_name]}]
+        filters: List[FilterTypeDef] = [{'Name': 'launch-template-name', 'Values': [lt_name]}]

        # Get the templates
-        templates = self._get_launch_template_ids(filters=filters)
+        templates: List[str] = self._get_launch_template_ids(filters=filters)

        if len(templates) > 1:
            # There shouldn't ever be multiple templates with our reserved name
@@ -1305,7 +1408,7 @@ class AWSProvisioner(AbstractProvisioner):
                # writes). Recurse to try again, because now it exists.
                logger.info('Waiting %f seconds for template %s to be available', backoff, lt_name)
                time.sleep(backoff)
-                return self._get_worker_launch_template(instance_type, preemptible=preemptible, backoff=backoff*2)
+                return self._get_worker_launch_template(instance_type, preemptible=preemptible, backoff=backoff * 2)
            else:
                raise
        else:
@@ -1345,7 +1448,7 @@ class AWSProvisioner(AbstractProvisioner):

        assert self._leaderPrivateIP
        type_info = E2Instances[instance_type]
-        rootVolSize=self._nodeStorageOverrides.get(instance_type, self._nodeStorage)
+        rootVolSize = self._nodeStorageOverrides.get(instance_type, self._nodeStorage)
        bdms = self._getBoto3BlockDeviceMappings(type_info, rootVolSize=rootVolSize)

        keyPath = self._sseKey if self._sseKey else None
@@ -1377,16 +1480,16 @@ class AWSProvisioner(AbstractProvisioner):
        """

        # Grab the connection we need to use for this operation.
-        autoscaling = self.aws.client(self._region, 'autoscaling')
+        autoscaling: AutoScalingClient = self.aws.client(self._region, 'autoscaling')

        # AWS won't filter ASGs server-side for us in describe_auto_scaling_groups.
        # So we search instances of applied tags for the ASGs they are on.
        # The ASGs tagged with our cluster are our ASGs.
        # The filtering is on different fields of the tag object itself.
-        filters = [{'Name': 'key',
-
-
-
+        filters: List[FilterTypeDef] = [{'Name': 'key',
+                                         'Values': [_TAG_KEY_TOIL_CLUSTER_NAME]},
+                                        {'Name': 'value',
+                                         'Values': [self.clusterName]}]

        matchedASGs = []
        # Get the first page with no NextToken
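The key/value tag filters above are used to find the cluster's Auto Scaling groups by tag, since describe_auto_scaling_groups has no tag filter of its own. One plain-boto3 way to do that is the Auto Scaling describe_tags API; this is only a sketch of the approach (the exact call Toil makes is not shown in this hunk, and the region, tag key, and value here are placeholders):

    import boto3

    autoscaling = boto3.client("autoscaling", region_name="us-west-2")  # placeholder region
    asg_names = set()
    paginator = autoscaling.get_paginator("describe_tags")
    for page in paginator.paginate(Filters=[{"Name": "key", "Values": ["clusterName"]},
                                            {"Name": "value", "Values": ["my-cluster"]}]):
        for tag in page.get("Tags", []):
            if tag.get("ResourceType") == "auto-scaling-group":
                asg_names.add(tag["ResourceId"])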
@@ -1461,7 +1564,7 @@ class AWSProvisioner(AbstractProvisioner):
        if self.clusterType == 'kubernetes':
            # We also need to tag it with Kubernetes autoscaler info (empty tags)
            tags['k8s.io/cluster-autoscaler/' + self.clusterName] = ''
-            assert(self.clusterName != 'enabled')
+            assert (self.clusterName != 'enabled')
            tags['k8s.io/cluster-autoscaler/enabled'] = ''
            tags['k8s.io/cluster-autoscaler/node-template/resources/ephemeral-storage'] = f'{min_gigs}G'

@@ -1481,7 +1584,7 @@ class AWSProvisioner(AbstractProvisioner):

        return asg_name

-    def _boto2_pager(self, requestor_callable: Callable, result_attribute_name: str) -> Iterable[Dict[str, Any]]:
+    def _boto2_pager(self, requestor_callable: Callable[[...], Any], result_attribute_name: str) -> Iterable[Dict[str, Any]]:  # type: ignore[misc]
        """
        Yield all the results from calling the given Boto 2 method and paging
        through all the results using the "marker" field. Results are to be
@@ -1489,33 +1592,13 @@ class AWSProvisioner(AbstractProvisioner):
        """
        marker = None
        while True:
-            result = requestor_callable(marker=marker)
+            result = requestor_callable(marker=marker)  # type: ignore[call-arg]
            yield from getattr(result, result_attribute_name)
            if result.is_truncated == 'true':
                marker = result.marker
            else:
                break

-    def _pager(self, requestor_callable: Callable, result_attribute_name: str, **kwargs) -> Iterable[Dict[str, Any]]:
-        """
-        Yield all the results from calling the given Boto 3 method with the
-        given keyword arguments, paging through the results using the Marker or
-        NextToken, and fetching out and looping over the list in the response
-        with the given attribute name.
-        """
-
-        # Recover the Boto3 client, and the name of the operation
-        client = requestor_callable.__self__
-        op_name = requestor_callable.__name__
-
-        # grab a Boto 3 built-in paginator. See
-        # <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/paginators.html>
-        paginator = client.get_paginator(op_name)
-
-        for page in paginator.paginate(**kwargs):
-            # Invoke it and go through the pages, yielding from them
-            yield from page.get(result_attribute_name, [])
-
    @awsRetry
    def _getRoleNames(self) -> List[str]:
        """
@@ -1523,10 +1606,12 @@ class AWSProvisioner(AbstractProvisioner):
        """

        results = []
-
+        boto3_iam = self.aws.client(self._region, 'iam')
+        for result in boto3_pager(boto3_iam.list_roles, 'Roles'):
            # For each Boto2 role object
            # Grab out the name
-
+            cast(RoleTypeDef, result)
+            name = result['RoleName']
            if self._is_our_namespaced_name(name):
                # If it looks like ours, it is ours.
                results.append(name)
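The removed _pager helper wrapped boto3's built-in paginators; the boto3_pager calls that replace it in this diff appear to serve the same purpose. For reference, the underlying mechanism looks roughly like this (a sketch only; the name page_results and the example client are illustrative, and the real helper's location is not shown in this hunk):

    from typing import Any, Callable, Iterable

    def page_results(requestor_callable: Callable[..., Any],
                     result_attribute_name: str, **kwargs: Any) -> Iterable[Any]:
        # Recover the boto3 client and operation name from the bound method,
        # then let the built-in paginator walk NextToken/Marker for us.
        client = requestor_callable.__self__  # type: ignore[attr-defined]
        paginator = client.get_paginator(requestor_callable.__name__)
        for page in paginator.paginate(**kwargs):
            yield from page.get(result_attribute_name, [])

    # Usage, mirroring the call above:
    #     for role in page_results(iam_client.list_roles, "Roles"): ...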
@@ -1539,11 +1624,12 @@ class AWSProvisioner(AbstractProvisioner):
        """

        results = []
-
-
-
+        boto3_iam = self.aws.client(self._region, 'iam')
+        for result in boto3_pager(boto3_iam.list_instance_profiles,
+                                  'InstanceProfiles'):
            # Grab out the name
-
+            cast(InstanceProfileTypeDef, result)
+            name = result['InstanceProfileName']
            if self._is_our_namespaced_name(name):
                # If it looks like ours, it is ours.
                results.append(name)
@@ -1558,9 +1644,9 @@ class AWSProvisioner(AbstractProvisioner):
        """

        # Grab the connection we need to use for this operation.
-
+        boto3_iam: IAMClient = self.aws.client(self._region, 'iam')

-        return [item['InstanceProfileName'] for item in
+        return [item['InstanceProfileName'] for item in boto3_pager(boto3_iam.list_instance_profiles_for_role,
                                                                     'InstanceProfiles',
                                                                     RoleName=role_name)]

@@ -1575,11 +1661,11 @@ class AWSProvisioner(AbstractProvisioner):
        """

        # Grab the connection we need to use for this operation.
-
+        boto3_iam: IAMClient = self.aws.client(self._region, 'iam')

        # TODO: we don't currently use attached policies.

-        return [item['PolicyArn'] for item in
+        return [item['PolicyArn'] for item in boto3_pager(boto3_iam.list_attached_role_policies,
                                                           'AttachedPolicies',
                                                           RoleName=role_name)]

@@ -1591,20 +1677,18 @@ class AWSProvisioner(AbstractProvisioner):
        """

        # Grab the connection we need to use for this operation.
-
+        boto3_iam: IAMClient = self.aws.client(self._region, 'iam')

-        return list(
-            'PolicyNames',
-            RoleName=role_name))
+        return list(boto3_pager(boto3_iam.list_role_policies, 'PolicyNames', RoleName=role_name))

-    def full_policy(self, resource: str) ->
+    def full_policy(self, resource: str) -> Dict[str, Any]:
        """
        Produce a dict describing the JSON form of a full-access-granting AWS
        IAM policy for the service with the given name (e.g. 's3').
        """
        return dict(Version="2012-10-17", Statement=[dict(Effect="Allow", Resource="*", Action=f"{resource}:*")])

-    def kubernetes_policy(self) ->
+    def kubernetes_policy(self) -> Dict[str, Any]:
        """
        Get the Kubernetes policy grants not provided by the full grants on EC2
        and IAM. See
@@ -1671,45 +1755,42 @@ class AWSProvisioner(AbstractProvisioner):
        """

        # Grab the connection we need to use for this operation.
-
+        boto3_iam: IAMClient = self.aws.client(self._region, 'iam')

        # Make sure we can tell our roles apart from roles for other clusters
        aws_role_name = self._namespace_name(local_role_name)
        try:
            # Make the role
            logger.debug('Creating IAM role %s...', aws_role_name)
-
+            assume_role_policy_document = json.dumps({
                "Version": "2012-10-17",
                "Statement": [{
                    "Effect": "Allow",
                    "Principal": {"Service": ["ec2.amazonaws.com"]},
                    "Action": ["sts:AssumeRole"]}
-            ]})
+                ]})
+            boto3_iam.create_role(RoleName=aws_role_name, AssumeRolePolicyDocument=assume_role_policy_document)
            logger.debug('Created new IAM role')
-        except
-            if e
+        except ClientError as e:
+            if get_error_status(e) == 409 and get_error_code(e) == 'EntityAlreadyExists':
                logger.debug('IAM role already exists. Reusing.')
            else:
                raise

        # Delete superfluous policies
-        policy_names = set(
+        policy_names = set(boto3_iam.list_role_policies(RoleName=aws_role_name)["PolicyNames"])
        for policy_name in policy_names.difference(set(list(policies.keys()))):
-
+            boto3_iam.delete_role_policy(RoleName=aws_role_name, PolicyName=policy_name)

        # Create expected policies
        for policy_name, policy in policies.items():
            current_policy = None
            try:
-                current_policy =
-
-
-                if e.status == 404 and e.error_code == 'NoSuchEntity':
-                    pass
-                else:
-                    raise
+                current_policy = boto3_iam.get_role_policy(RoleName=aws_role_name, PolicyName=policy_name)["PolicyDocument"]
+            except boto3_iam.exceptions.NoSuchEntityException:
+                pass
            if current_policy != policy:
-
+                boto3_iam.put_role_policy(RoleName=aws_role_name, PolicyName=policy_name, PolicyDocument=json.dumps(policy))

        # Now the role has the right policies so it is ready.
        return aws_role_name
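The role setup above is an idempotent create-then-reconcile pattern: create the role while tolerating "already exists", delete inline policies that are no longer wanted, and (re)put the ones that are. A minimal sketch of the same pattern with plain boto3; the role name and policy contents are placeholders, not values from this diff:

    import json
    import boto3

    iam = boto3.client("iam")
    role_name = "example-toil-role"  # placeholder
    assume_role_policy = json.dumps({
        "Version": "2012-10-17",
        "Statement": [{"Effect": "Allow",
                       "Principal": {"Service": ["ec2.amazonaws.com"]},
                       "Action": ["sts:AssumeRole"]}],
    })
    wanted = {"s3_full": {"Version": "2012-10-17",
                          "Statement": [{"Effect": "Allow", "Resource": "*", "Action": "s3:*"}]}}

    try:
        iam.create_role(RoleName=role_name, AssumeRolePolicyDocument=assume_role_policy)
    except iam.exceptions.EntityAlreadyExistsException:
        pass  # reuse the existing role

    # Drop inline policies we no longer want, then write the ones we do.
    existing = set(iam.list_role_policies(RoleName=role_name)["PolicyNames"])
    for name in existing - set(wanted):
        iam.delete_role_policy(RoleName=role_name, PolicyName=name)
    for name, doc in wanted.items():
        iam.put_role_policy(RoleName=role_name, PolicyName=name, PolicyDocument=json.dumps(doc))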
@@ -1724,7 +1805,7 @@ class AWSProvisioner(AbstractProvisioner):
        """

        # Grab the connection we need to use for this operation.
-
+        boto3_iam: IAMClient = self.aws.client(self._region, 'iam')

        policy = dict(iam_full=self.full_policy('iam'), ec2_full=self.full_policy('ec2'),
                      s3_full=self.full_policy('s3'), sbd_full=self.full_policy('sdb'))
@@ -1735,45 +1816,41 @@ class AWSProvisioner(AbstractProvisioner):
        iamRoleName = self._setup_iam_ec2_role(_INSTANCE_PROFILE_ROLE_NAME, policy)

        try:
-
-
-
-
-
-
-
-            else:
-                raise
+            profile_result = boto3_iam.get_instance_profile(InstanceProfileName=iamRoleName)
+            profile: InstanceProfileTypeDef = profile_result["InstanceProfile"]
+            logger.debug("Have preexisting instance profile: %s", profile)
+        except boto3_iam.exceptions.NoSuchEntityException:
+            profile_result = boto3_iam.create_instance_profile(InstanceProfileName=iamRoleName)
+            profile = profile_result["InstanceProfile"]
+            logger.debug("Created new instance profile: %s", profile)
        else:
-            profile =
-            profile = profile.instance_profile
+            profile = profile_result["InstanceProfile"]

-        profile_arn = profile
+        profile_arn: str = profile["Arn"]

        # Now we have the profile ARN, but we want to make sure it really is
        # visible by name in a different session.
        wait_until_instance_profile_arn_exists(profile_arn)

-        if len(profile
+        if len(profile["Roles"]) > 1:
            # This is too many roles. We probably grabbed something we should
            # not have by mistake, and this is some important profile for
            # something else.
            raise RuntimeError(f'Did not expect instance profile {profile_arn} to contain '
                               f'more than one role; is it really a Toil-managed profile?')
-        elif len(profile
-
-            if profile.roles.member.role_name == iamRoleName:
+        elif len(profile["Roles"]) == 1:
+            if profile["Roles"][0]["RoleName"] == iamRoleName:
                return profile_arn
            else:
                # Drop this wrong role and use the fallback code for 0 roles
-
-
+                boto3_iam.remove_role_from_instance_profile(InstanceProfileName=iamRoleName,
+                                                            RoleName=profile["Roles"][0]["RoleName"])

        # If we get here, we had 0 roles on the profile, or we had 1 but we removed it.
-        for attempt in old_retry(predicate=lambda err: err
+        for attempt in old_retry(predicate=lambda err: get_error_status(err) == 404):
            with attempt:
                # Put the IAM role on the profile
-
+                boto3_iam.add_role_to_instance_profile(InstanceProfileName=profile["InstanceProfileName"], RoleName=iamRoleName)
                logger.debug("Associated role %s with profile", iamRoleName)

        return profile_arn
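The instance-profile code above follows a get-or-create shape: fetch the profile by name, create it if IAM reports NoSuchEntity, and make sure exactly the expected role is attached before using the profile's ARN. A sketch of that shape with plain boto3; the profile and role names are placeholders:

    import boto3

    iam = boto3.client("iam")
    profile_name = role_name = "example-toil-role"  # placeholders

    try:
        profile = iam.get_instance_profile(InstanceProfileName=profile_name)["InstanceProfile"]
    except iam.exceptions.NoSuchEntityException:
        profile = iam.create_instance_profile(InstanceProfileName=profile_name)["InstanceProfile"]

    roles = profile.get("Roles", [])
    if roles and roles[0]["RoleName"] != role_name:
        # Detach an unexpected role so the intended one can be attached.
        iam.remove_role_from_instance_profile(InstanceProfileName=profile_name,
                                              RoleName=roles[0]["RoleName"])
        roles = []
    if not roles:
        iam.add_role_to_instance_profile(InstanceProfileName=profile_name, RoleName=role_name)

    profile_arn = profile["Arn"]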