toil-6.1.0a1-py3-none-any.whl → toil-8.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
```diff
--- a/toil/provisioners/gceProvisioner.py
+++ b/toil/provisioners/gceProvisioner.py
@@ -17,7 +17,7 @@ import os
 import threading
 import time
 import uuid
-from typing import Optional
+from typing import Optional

 import requests
 from libcloud.compute.drivers.gce import GCEFailedNode
@@ -25,8 +25,8 @@ from libcloud.compute.providers import get_driver
 from libcloud.compute.types import Provider

 from toil.jobStores.googleJobStore import GoogleJobStore
-from toil.lib.conversions import human2bytes
 from toil.lib.compatibility import compat_bytes_recursive
+from toil.lib.conversions import human2bytes
 from toil.provisioners import NoSuchClusterException
 from toil.provisioners.abstractProvisioner import AbstractProvisioner, Shape
 from toil.provisioners.node import Node
@@ -34,24 +34,41 @@ from toil.provisioners.node import Node
 logger = logging.getLogger(__name__)
 logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)

+
 class GCEProvisioner(AbstractProvisioner):
     """
     Implements a Google Compute Engine Provisioner using libcloud.
     """

     NODE_BOTO_PATH = "/root/.boto"  # boto file path on instances
-    SOURCE_IMAGE = b
-
-    def __init__(
-        self
+    SOURCE_IMAGE = b"projects/kinvolk-public/global/images/family/flatcar-stable"
+
+    def __init__(
+        self,
+        clusterName,
+        clusterType,
+        zone,
+        nodeStorage,
+        nodeStorageOverrides,
+        sseKey,
+        enable_fuse,
+    ):
+        self.cloud = "gce"
         self._sseKey = sseKey

         # Call base class constructor, which will call createClusterSettings()
         # or readClusterSettings()
-        super().__init__(
+        super().__init__(
+            clusterName,
+            clusterType,
+            zone,
+            nodeStorage,
+            nodeStorageOverrides,
+            enable_fuse,
+        )

     def supportedClusterTypes(self):
-        return {
+        return {"mesos"}

     def createClusterSettings(self):
         # All we need to do is read the Google credentials we need to provision
@@ -65,30 +82,38 @@ class GCEProvisioner(AbstractProvisioner):
         reading the metadata.
         """
         metadata_server = "http://metadata/computeMetadata/v1/instance/"
-        metadata_flavor = {
-        zone = requests.get(metadata_server +
-        self._zone = zone.split(
+        metadata_flavor = {"Metadata-Flavor": "Google"}
+        zone = requests.get(metadata_server + "zone", headers=metadata_flavor).text
+        self._zone = zone.split("/")[-1]

         project_metadata_server = "http://metadata/computeMetadata/v1/project/"
-        self._projectId = requests.get(
+        self._projectId = requests.get(
+            project_metadata_server + "project-id", headers=metadata_flavor
+        ).text

         # From a GCE instance, these values can be blank. Only the projectId is needed
-        self._googleJson =
-        self._clientEmail =
+        self._googleJson = ""
+        self._clientEmail = ""

-        self._tags = requests.get(
+        self._tags = requests.get(
+            metadata_server + "description", headers=metadata_flavor
+        ).text
         tags = json.loads(self._tags)
-        self.clusterName = tags[
+        self.clusterName = tags["clusterName"]
         self._gceDriver = self._getDriver()
-        self._instanceGroup = self._gceDriver.ex_get_instancegroup(
+        self._instanceGroup = self._gceDriver.ex_get_instancegroup(
+            self.clusterName, zone=self._zone
+        )

         leader = self.getLeader()
         self._leaderPrivateIP = leader.privateIP

         # The location of the Google credentials file on instances.
         self._credentialsPath = GoogleJobStore.nodeServiceAccountJson
-        self._keyName =
-        self._botoPath =
+        self._keyName = "core"  # key name leader users to communicate with works
+        self._botoPath = (
+            self.NODE_BOTO_PATH
+        )  # boto credentials (used if reading an AWS bucket)

         # Let the base provisioner work out how to deploy duly authorized
         # workers for this leader.
@@ -98,28 +123,32 @@ class GCEProvisioner(AbstractProvisioner):
         """
         Get the credentials from the file specified by GOOGLE_APPLICATION_CREDENTIALS.
         """
-        self._googleJson = os.getenv(
+        self._googleJson = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
         if not self._googleJson:
-            raise RuntimeError(
+            raise RuntimeError("GOOGLE_APPLICATION_CREDENTIALS not set.")
         try:
             with open(self._googleJson) as jsonFile:
                 self.googleConnectionParams = json.loads(jsonFile.read())
         except:
-
-
+            raise RuntimeError(
+                "GCEProvisioner: Could not parse the Google service account json file %s"
+                % self._googleJson
+            )

-        self._projectId = self.googleConnectionParams[
-        self._clientEmail = self.googleConnectionParams[
+        self._projectId = self.googleConnectionParams["project_id"]
+        self._clientEmail = self.googleConnectionParams["client_email"]
         self._credentialsPath = self._googleJson
         self._clearLeaderWorkerAuthentication()  # TODO: Why are we doing this?
         self._gceDriver = self._getDriver()

     def _write_file_to_cloud(self, key: str, contents: bytes) -> str:
-        raise NotImplementedError(
+        raise NotImplementedError(
+            "The gceProvisioner doesn't support _write_file_to_cloud()."
+        )

     def _get_user_data_limit(self) -> int:
         # See: https://cloud.google.com/compute/docs/metadata/setting-custom-metadata#limitations
-        return human2bytes(
+        return human2bytes("256KB")

     def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
         """
@@ -131,39 +160,42 @@ class GCEProvisioner(AbstractProvisioner):
         vpcSubnet: A subnet (optional).
         use_private_ip: even though a public ip exists, ignore it (optional)
         """
-        if
+        if "keyName" not in kwargs:
             raise RuntimeError("A keyPairName is required for the GCE provisioner.")
-        self._keyName = kwargs[
-        if
-            self._botoPath = kwargs[
-        self._vpcSubnet = kwargs.get(
-        self._network = kwargs.get(
-        self._use_private_ip = kwargs.get(
+        self._keyName = kwargs["keyName"]
+        if "botoPath" in kwargs:
+            self._botoPath = kwargs["botoPath"]
+        self._vpcSubnet = kwargs.get("vpcSubnet", None)
+        self._network = kwargs.get("network", None)
+        self._use_private_ip = kwargs.get("use_private_ip", None)

         # Throws an error if cluster exists
-        self._instanceGroup = self._gceDriver.ex_create_instancegroup(
-
+        self._instanceGroup = self._gceDriver.ex_create_instancegroup(
+            self.clusterName, self._zone
+        )
+        logger.debug("Launching leader")

         # GCE doesn't have a dictionary tags field. The tags field is just a string list.
         # Therefore, dumping tags into the description.
-        tags = {
-        if
-            tags.update(kwargs[
+        tags = {"Owner": self._keyName, "clusterName": self.clusterName}
+        if "userTags" in kwargs:
+            tags.update(kwargs["userTags"])
         self._tags = json.dumps(tags)

-        metadata = {
-
-
+        metadata = {
+            "items": [
+                {"key": "user-data", "value": self._getIgnitionUserData("leader")}
+            ]
+        }
+        imageType = "flatcar-stable"
+        sa_scopes = [{"scopes": ["compute", "storage-full"]}]
         disk = {}
-        disk[
-
-
+        disk["initializeParams"] = {
+            "sourceImage": self.SOURCE_IMAGE,
+            "diskSizeGb": leaderStorage,
         }
-        disk.update({
-
-            'autoDelete': True
-        })
-        name = 'l' + str(uuid.uuid4())
+        disk.update({"boot": True, "autoDelete": True})
+        name = "l" + str(uuid.uuid4())

         leader = self._gceDriver.create_node(
             name,
@@ -174,9 +206,9 @@ class GCEProvisioner(AbstractProvisioner):
             ex_metadata=compat_bytes_recursive(metadata),
             ex_network=self._network,
             ex_subnetwork=self._vpcSubnet,
-            ex_disks_gce_struct
+            ex_disks_gce_struct=[compat_bytes_recursive(disk)],
             description=self._tags,
-            ex_preemptible=False
+            ex_preemptible=False,
         )

         self._instanceGroup.add_instances([leader])
@@ -184,18 +216,27 @@ class GCEProvisioner(AbstractProvisioner):
         # self.subnetID = leader.subnet_id # TODO: get subnetID

         # Wait for the appliance to start and inject credentials.
-        leaderNode = Node(
-
-
-
+        leaderNode = Node(
+            publicIP=leader.public_ips[0],
+            privateIP=leader.private_ips[0],
+            name=leader.name,
+            launchTime=leader.created_at,
+            nodeType=leader.size,
+            preemptible=False,
+            tags=self._tags,
+            use_private_ip=self._use_private_ip,
+        )
+        leaderNode.waitForNode("toil_leader", keyName=self._keyName)
         leaderNode.copySshKeys(self._keyName)
-        leaderNode.injectFile(
+        leaderNode.injectFile(
+            self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, "toil_leader"
+        )
         if self._botoPath:
-            leaderNode.injectFile(self._botoPath, self.NODE_BOTO_PATH,
+            leaderNode.injectFile(self._botoPath, self.NODE_BOTO_PATH, "toil_leader")
         # Download credentials
         self._setLeaderWorkerAuthentication(leaderNode)

-        logger.debug(
+        logger.debug("Launched leader")

     def getNodeShape(self, instance_type: str, preemptible=False) -> Shape:
         # TODO: read this value only once
@@ -208,21 +249,25 @@ class GCEProvisioner(AbstractProvisioner):
         if disk == 0:
             # This is an EBS-backed instance. We will use the root
             # volume, so add the amount of EBS storage requested forhe root volume
-            disk =
+            disk = (
+                self._nodeStorageOverrides.get(instance_type, self._nodeStorage) * 2**30
+            )

         # Ram is in M.
         # Underestimate memory by 100M to prevent autoscaler from disagreeing with
         # mesos about whether a job can run on a particular node type
-        memory = (instanceType.ram/1000 - 0.1) * 2
-        return Shape(
-
-
-
-
+        memory = (instanceType.ram / 1000 - 0.1) * 2**30
+        return Shape(
+            wallTime=60 * 60,
+            memory=memory,
+            cores=instanceType.extra["guestCpus"],
+            disk=disk,
+            preemptible=preemptible,
+        )

     @staticmethod
     def retryPredicate(e):
-        """
+        """Not used by GCE"""
         return False

     def destroyCluster(self) -> None:
@@ -238,7 +283,9 @@ class GCEProvisioner(AbstractProvisioner):
             attempts += 1

         # remove group
-        instanceGroup = self._gceDriver.ex_get_instancegroup(
+        instanceGroup = self._gceDriver.ex_get_instancegroup(
+            self.clusterName, zone=self._zone
+        )
         instanceGroup.destroy()

     def terminateNodes(self, nodes):
@@ -248,7 +295,7 @@ class GCEProvisioner(AbstractProvisioner):
         instancesToKill = [i for i in instances if i.name in nodeNames]
         self._terminateInstances(instancesToKill)

-    def addNodes(self, nodeTypes:
+    def addNodes(self, nodeTypes: set[str], numNodes, preemptible, spotBid=None) -> int:
         assert self._leaderPrivateIP

         # We don't support any balancing here so just pick one of the
@@ -268,23 +315,21 @@ class GCEProvisioner(AbstractProvisioner):
             keyPath = self._sseKey

         if not preemptible:
-            logger.debug(
+            logger.debug("Launching %s non-preemptible nodes", numNodes)
         else:
-            logger.debug(
+            logger.debug("Launching %s preemptible nodes", numNodes)

         # kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id
-        userData = self._getIgnitionUserData(
-        metadata = {
-        imageType =
-        sa_scopes = [{
+        userData = self._getIgnitionUserData("worker", keyPath, preemptible)
+        metadata = {"items": [{"key": "user-data", "value": userData}]}
+        imageType = "flatcar-stable"
+        sa_scopes = [{"scopes": ["compute", "storage-full"]}]
         disk = {}
-        disk[
-
-
-
-
-            'autoDelete': True
-        })
+        disk["initializeParams"] = {
+            "sourceImage": self.SOURCE_IMAGE,
+            "diskSizeGb": self._nodeStorageOverrides.get(node_type, self._nodeStorage),
+        }
+        disk.update({"boot": True, "autoDelete": True})

         # TODO:
         # - bug in gce.py for ex_create_multiple_nodes (erroneously, doesn't allow image and disk to specified)
@@ -294,26 +339,38 @@ class GCEProvisioner(AbstractProvisioner):
         retries = 0
         workersCreated = 0
         # Try a few times to create the requested number of workers
-        while numNodes-workersCreated > 0 and retries < 3:
+        while numNodes - workersCreated > 0 and retries < 3:
             instancesLaunched = self.ex_create_multiple_nodes(
-
-
-
-
-
-
-
-
+                "",
+                node_type,
+                imageType,
+                numNodes - workersCreated,
+                location=self._zone,
+                ex_service_accounts=sa_scopes,
+                ex_metadata=metadata,
+                ex_disks_gce_struct=[disk],
+                description=self._tags,
+                ex_preemptible=preemptible,
+            )
             failedWorkers = []
             for instance in instancesLaunched:
                 if isinstance(instance, GCEFailedNode):
-                    logger.error(
-
+                    logger.error(
+                        "Worker failed to launch with code %s. Error message: %s"
+                        % (instance.code, instance.error)
+                    )
                     continue

-                node = Node(
-
-
+                node = Node(
+                    publicIP=instance.public_ips[0],
+                    privateIP=instance.private_ips[0],
+                    name=instance.name,
+                    launchTime=instance.created_at,
+                    nodeType=instance.size,
+                    preemptible=False,
+                    tags=self._tags,
+                    use_private_ip=self._use_private_ip,
+                )  # FIXME: what should tags be set to?

                 try:
                     self._injectWorkerFiles(node, botoExists)
@@ -321,43 +378,55 @@ class GCEProvisioner(AbstractProvisioner):
                     self._instanceGroup.add_instances([instance])
                     workersCreated += 1
                 except Exception as e:
-                    logger.error(
+                    logger.error(
+                        f"Failed to configure worker {node.name}. Error message: {e}"
+                    )
                     failedWorkers.append(instance)
             if failedWorkers:
                 logger.error("Terminating %d failed workers" % len(failedWorkers))
                 self._terminateInstances(failedWorkers)
             retries += 1

-        logger.debug(
+        logger.debug("Launched %d new instance(s)", numNodes)
         if numNodes != workersCreated:
-            logger.error("Failed to launch %d worker(s)", numNodes-workersCreated)
+            logger.error("Failed to launch %d worker(s)", numNodes - workersCreated)
         return workersCreated

-    def getProvisionedWorkers(
+    def getProvisionedWorkers(
+        self, instance_type: Optional[str] = None, preemptible: Optional[bool] = None
+    ):
         assert self._leaderPrivateIP
         entireCluster = self._getNodesInCluster(instance_type=instance_type)
-        logger.debug(
+        logger.debug("All nodes in cluster: %s", entireCluster)
         workerInstances = []
         for instance in entireCluster:
             if preemptible is not None:
-                scheduling = instance.extra.get(
+                scheduling = instance.extra.get("scheduling")
                 # If this field is not found in the extra meta-data, assume the node is not preemptible.
-                if scheduling and scheduling.get(
+                if scheduling and scheduling.get("preemptible", False) != preemptible:
                     continue
             isWorker = True
             for ip in instance.private_ips:
                 if ip == self._leaderPrivateIP:
                     isWorker = False
                     break  # don't include the leader
-            if isWorker and instance.state ==
+            if isWorker and instance.state == "running":
                 workerInstances.append(instance)

-        logger.debug(
-        return [
-
-
-
-
+        logger.debug("All workers found in cluster: %s", workerInstances)
+        return [
+            Node(
+                publicIP=i.public_ips[0],
+                privateIP=i.private_ips[0],
+                name=i.name,
+                launchTime=i.created_at,
+                nodeType=i.size,
+                preemptible=i.extra.get("scheduling", {}).get("preemptible", False),
+                tags=None,
+                use_private_ip=self._use_private_ip,
+            )
+            for i in workerInstances
+        ]

     def getLeader(self):
         instances = self._getNodesInCluster()
@@ -366,49 +435,64 @@ class GCEProvisioner(AbstractProvisioner):
             leader = instances[0]  # assume leader was launched first
         except IndexError:
             raise NoSuchClusterException(self.clusterName)
-        return Node(
-
-
+        return Node(
+            publicIP=leader.public_ips[0],
+            privateIP=leader.private_ips[0],
+            name=leader.name,
+            launchTime=leader.created_at,
+            nodeType=leader.size,
+            preemptible=False,
+            tags=None,
+            use_private_ip=self._use_private_ip,
+        )

     def _injectWorkerFiles(self, node, botoExists):
         """
         Set up the credentials on the worker.
         """
-        node.waitForNode(
+        node.waitForNode("toil_worker", keyName=self._keyName)
         node.copySshKeys(self._keyName)
-        node.injectFile(
+        node.injectFile(
+            self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, "toil_worker"
+        )
         if self._sseKey:
-            node.injectFile(self._sseKey, self._sseKey,
+            node.injectFile(self._sseKey, self._sseKey, "toil_worker")
         if botoExists:
-            node.injectFile(self._botoPath, self.NODE_BOTO_PATH,
+            node.injectFile(self._botoPath, self.NODE_BOTO_PATH, "toil_worker")

     def _getNodesInCluster(self, instance_type: Optional[str] = None):
-        instanceGroup = self._gceDriver.ex_get_instancegroup(
+        instanceGroup = self._gceDriver.ex_get_instancegroup(
+            self.clusterName, zone=self._zone
+        )
         instances = instanceGroup.list_instances()
         if instance_type:
-            instances = [
+            instances = [
+                instance for instance in instances if instance.size == instance_type
+            ]
         return instances

     def _getDriver(self):
-        """
+        """Connect to GCE"""
         driverCls = get_driver(Provider.GCE)
-        return driverCls(
-
-
-
+        return driverCls(
+            self._clientEmail,
+            self._googleJson,
+            project=self._projectId,
+            datacenter=self._zone,
+        )

     def _terminateInstances(self, instances):
         def worker(driver, instance):
-            logger.debug(
+            logger.debug("Terminating instance: %s", instance.name)
             driver.destroy_node(instance)

         threads = []
         for instance in instances:
-            t = threading.Thread(target=worker, args=(self._gceDriver,instance))
+            t = threading.Thread(target=worker, args=(self._gceDriver, instance))
             threads.append(t)
             t.start()

-        logger.debug(
+        logger.debug("... Waiting for instance(s) to shut down...")
         for t in threads:
             t.join()

@@ -416,20 +500,37 @@ class GCEProvisioner(AbstractProvisioner):
     DEFAULT_TASK_COMPLETION_TIMEOUT = 180

     def ex_create_multiple_nodes(
-
-
-
-
-
-
-
-
-
-
+        self,
+        base_name,
+        size,
+        image,
+        number,
+        location=None,
+        ex_network="default",
+        ex_subnetwork=None,
+        ex_tags=None,
+        ex_metadata=None,
+        ignore_errors=True,
+        use_existing_disk=True,
+        poll_interval=2,
+        external_ip="ephemeral",
+        ex_disk_type="pd-standard",
+        ex_disk_auto_delete=True,
+        ex_service_accounts=None,
+        timeout=DEFAULT_TASK_COMPLETION_TIMEOUT,
+        description=None,
+        ex_can_ip_forward=None,
+        ex_disks_gce_struct=None,
+        ex_nic_gce_struct=None,
+        ex_on_host_maintenance=None,
+        ex_automatic_restart=None,
+        ex_image_family=None,
+        ex_preemptible=None,
+    ):
         """
-
-
-
+        Monkey patch to gce.py in libcloud to allow disk and images to be specified.
+        Also changed name to a uuid below.
+        The prefix 'wp' identifies preemptible nodes and 'wn' non-preemptible nodes.
         """
         # if image and ex_disks_gce_struct:
         #     raise ValueError("Cannot specify both 'image' and "
@@ -437,78 +538,80 @@ class GCEProvisioner(AbstractProvisioner):

         driver = self._getDriver()
         if image and ex_image_family:
-            raise ValueError("Cannot specify both 'image' and "
-                             "'ex_image_family'")
+            raise ValueError("Cannot specify both 'image' and " "'ex_image_family'")

         location = location or driver.zone
-        if not hasattr(location,
+        if not hasattr(location, "name"):
             location = driver.ex_get_zone(location)
-        if not hasattr(size,
+        if not hasattr(size, "name"):
             size = driver.ex_get_size(size, location)
-        if not hasattr(ex_network,
+        if not hasattr(ex_network, "name"):
             ex_network = driver.ex_get_network(ex_network)
-        if ex_subnetwork and not hasattr(ex_subnetwork,
-            ex_subnetwork =
-                driver.
-
+        if ex_subnetwork and not hasattr(ex_subnetwork, "name"):
+            ex_subnetwork = driver.ex_get_subnetwork(
+                ex_subnetwork, region=driver._get_region_from_zone(location)
+            )
         if ex_image_family:
             image = driver.ex_get_image_from_family(ex_image_family)
-        if image and not hasattr(image,
+        if image and not hasattr(image, "name"):
             image = driver.ex_get_image(image)
-        if not hasattr(ex_disk_type,
+        if not hasattr(ex_disk_type, "name"):
             ex_disk_type = driver.ex_get_disktype(ex_disk_type, zone=location)

-        node_attrs = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        node_attrs = {
+            "size": size,
+            "image": image,
+            "location": location,
+            "network": ex_network,
+            "subnetwork": ex_subnetwork,
+            "tags": ex_tags,
+            "metadata": ex_metadata,
+            "ignore_errors": ignore_errors,
+            "use_existing_disk": use_existing_disk,
+            "external_ip": external_ip,
+            "ex_disk_type": ex_disk_type,
+            "ex_disk_auto_delete": ex_disk_auto_delete,
+            "ex_service_accounts": ex_service_accounts,
+            "description": description,
+            "ex_can_ip_forward": ex_can_ip_forward,
+            "ex_disks_gce_struct": ex_disks_gce_struct,
+            "ex_nic_gce_struct": ex_nic_gce_struct,
+            "ex_on_host_maintenance": ex_on_host_maintenance,
+            "ex_automatic_restart": ex_automatic_restart,
+            "ex_preemptible": ex_preemptible,
+        }
         # List for holding the status information for disk/node creation.
         status_list = []

         for i in range(number):
-            name =
+            name = "wp" if ex_preemptible else "wn"
             name += str(uuid.uuid4())  # '%s-%03d' % (base_name, i)
-            status = {
+            status = {"name": name, "node_response": None, "node": None}
             status_list.append(status)

         start_time = time.time()
         complete = False
         while not complete:
             if time.time() - start_time >= timeout:
-                raise Exception(
-
+                raise Exception(
+                    "Timeout (%s sec) while waiting for multiple " "instances"
+                )
             complete = True
             time.sleep(poll_interval)
             for status in status_list:
                 # Create the node or check status if already in progress.
-                if not status[
-                    if not status[
+                if not status["node"]:
+                    if not status["node_response"]:
                         driver._multi_create_node(status, node_attrs)
                     else:
                         driver._multi_check_node(status, node_attrs)
                 # If any of the nodes have not been created (or failed) we are
                 # not done yet.
-                if not status[
+                if not status["node"]:
                     complete = False

         # Return list of nodes
         node_list = []
         for status in status_list:
-            node_list.append(status[
+            node_list.append(status["node"])
         return node_list
```