toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/provisioners/node.py
CHANGED
|
@@ -13,11 +13,12 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import datetime
|
|
15
15
|
import logging
|
|
16
|
-
import pipes
|
|
17
16
|
import socket
|
|
18
17
|
import subprocess
|
|
19
18
|
import time
|
|
20
19
|
from itertools import count
|
|
20
|
+
from shlex import quote
|
|
21
|
+
from typing import Any, Optional, Union
|
|
21
22
|
|
|
22
23
|
from toil.lib.memoize import parse_iso_utc
|
|
23
24
|
|
|
@@ -29,15 +30,31 @@ logger = logging.getLogger(__name__)
|
|
|
29
30
|
class Node:
|
|
30
31
|
maxWaitTime = 7 * 60
|
|
31
32
|
|
|
32
|
-
def __init__(
|
|
33
|
+
def __init__(
|
|
34
|
+
self,
|
|
35
|
+
publicIP: str,
|
|
36
|
+
privateIP: str,
|
|
37
|
+
name: str,
|
|
38
|
+
launchTime: Union[datetime.datetime, str],
|
|
39
|
+
nodeType: Optional[str],
|
|
40
|
+
preemptible: bool,
|
|
41
|
+
tags: Optional[dict[str, str]] = None,
|
|
42
|
+
use_private_ip: Optional[bool] = None,
|
|
43
|
+
) -> None:
|
|
33
44
|
self.publicIP = publicIP
|
|
34
45
|
self.privateIP = privateIP
|
|
35
46
|
if use_private_ip:
|
|
36
|
-
self.effectiveIP = self.privateIP
|
|
47
|
+
self.effectiveIP = self.privateIP # or self.publicIP?
|
|
37
48
|
else:
|
|
38
49
|
self.effectiveIP = self.publicIP or self.privateIP
|
|
39
50
|
self.name = name
|
|
40
|
-
|
|
51
|
+
if isinstance(launchTime, datetime.datetime):
|
|
52
|
+
self.launchTime = launchTime
|
|
53
|
+
else:
|
|
54
|
+
try:
|
|
55
|
+
self.launchTime = parse_iso_utc(launchTime)
|
|
56
|
+
except ValueError:
|
|
57
|
+
self.launchTime = datetime.datetime.fromisoformat(launchTime)
|
|
41
58
|
self.nodeType = nodeType
|
|
42
59
|
self.preemptible = preemptible
|
|
43
60
|
self.tags = tags
|
|
@@ -65,12 +82,12 @@ class Node:
|
|
|
65
82
|
"""
|
|
66
83
|
if self.launchTime:
|
|
67
84
|
now = datetime.datetime.utcnow()
|
|
68
|
-
delta = now -
|
|
85
|
+
delta = now - self.launchTime
|
|
69
86
|
return 1 - delta.total_seconds() / 3600.0 % 1.0
|
|
70
87
|
else:
|
|
71
88
|
return 1
|
|
72
89
|
|
|
73
|
-
def waitForNode(self, role, keyName=
|
|
90
|
+
def waitForNode(self, role: str, keyName: str = "core") -> None:
|
|
74
91
|
self._waitForSSHPort()
|
|
75
92
|
# wait here so docker commands can be used reliably afterwards
|
|
76
93
|
self._waitForSSHKeys(keyName=keyName)
|
|
@@ -78,8 +95,8 @@ class Node:
|
|
|
78
95
|
self._waitForAppliance(role=role, keyName=keyName)
|
|
79
96
|
|
|
80
97
|
def copySshKeys(self, keyName):
|
|
81
|
-
"""
|
|
82
|
-
if keyName ==
|
|
98
|
+
"""Copy authorized_keys file to the core user from the keyName user."""
|
|
99
|
+
if keyName == "core":
|
|
83
100
|
return # No point.
|
|
84
101
|
|
|
85
102
|
# Make sure that keys are there.
|
|
@@ -88,9 +105,17 @@ class Node:
|
|
|
88
105
|
# copy keys to core user so that the ssh calls will work
|
|
89
106
|
# - normal mechanism failed unless public key was in the google-ssh format
|
|
90
107
|
# - even so, the key wasn't copied correctly to the core account
|
|
91
|
-
keyFile =
|
|
92
|
-
self.sshInstance(
|
|
93
|
-
|
|
108
|
+
keyFile = "/home/%s/.ssh/authorized_keys" % keyName
|
|
109
|
+
self.sshInstance(
|
|
110
|
+
"/usr/bin/sudo", "/usr/bin/cp", keyFile, "/home/core/.ssh", user=keyName
|
|
111
|
+
)
|
|
112
|
+
self.sshInstance(
|
|
113
|
+
"/usr/bin/sudo",
|
|
114
|
+
"/usr/bin/chown",
|
|
115
|
+
"core",
|
|
116
|
+
"/home/core/.ssh/authorized_keys",
|
|
117
|
+
user=keyName,
|
|
118
|
+
)
|
|
94
119
|
|
|
95
120
|
def injectFile(self, fromFile, toFile, role):
|
|
96
121
|
"""
|
|
@@ -102,9 +127,13 @@ class Node:
|
|
|
102
127
|
self.coreRsync([fromFile, ":" + toFile], applianceName=role)
|
|
103
128
|
return True
|
|
104
129
|
except Exception as e:
|
|
105
|
-
logger.debug(
|
|
130
|
+
logger.debug(
|
|
131
|
+
"Rsync to new node failed, trying again. Error message: %s" % e
|
|
132
|
+
)
|
|
106
133
|
time.sleep(10 * retry)
|
|
107
|
-
raise RuntimeError(
|
|
134
|
+
raise RuntimeError(
|
|
135
|
+
f"Failed to inject file {fromFile} to {role} with ip {self.effectiveIP}"
|
|
136
|
+
)
|
|
108
137
|
|
|
109
138
|
def extractFile(self, fromFile, toFile, role):
|
|
110
139
|
"""
|
|
@@ -116,74 +145,111 @@ class Node:
|
|
|
116
145
|
self.coreRsync([":" + fromFile, toFile], applianceName=role)
|
|
117
146
|
return True
|
|
118
147
|
except Exception as e:
|
|
119
|
-
logger.debug(
|
|
148
|
+
logger.debug(
|
|
149
|
+
"Rsync from new node failed, trying again. Error message: %s" % e
|
|
150
|
+
)
|
|
120
151
|
time.sleep(10 * retry)
|
|
121
|
-
raise RuntimeError(
|
|
152
|
+
raise RuntimeError(
|
|
153
|
+
f"Failed to extract file {fromFile} from {role} with ip {self.effectiveIP}"
|
|
154
|
+
)
|
|
122
155
|
|
|
123
|
-
def _waitForSSHKeys(self, keyName=
|
|
156
|
+
def _waitForSSHKeys(self, keyName="core"):
|
|
124
157
|
# the propagation of public ssh keys vs. opening the SSH port is racey, so this method blocks until
|
|
125
158
|
# the keys are propagated and the instance can be SSH into
|
|
126
159
|
start_time = time.time()
|
|
127
160
|
last_error = None
|
|
128
161
|
while True:
|
|
129
162
|
if time.time() - start_time > self.maxWaitTime:
|
|
130
|
-
raise RuntimeError(
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
163
|
+
raise RuntimeError(
|
|
164
|
+
f"Key propagation failed on machine with ip {self.effectiveIP}."
|
|
165
|
+
+ (
|
|
166
|
+
"\n\nMake sure that your public key is attached to your account and you are using "
|
|
167
|
+
"the correct private key. If you are using a key with a passphrase, be sure to "
|
|
168
|
+
"set up ssh-agent. For details, refer to "
|
|
169
|
+
"https://toil.readthedocs.io/en/latest/running/cloud/cloud.html."
|
|
170
|
+
if last_error and "Permission denied" in last_error
|
|
171
|
+
else ""
|
|
172
|
+
)
|
|
173
|
+
)
|
|
136
174
|
try:
|
|
137
|
-
logger.info(
|
|
138
|
-
|
|
175
|
+
logger.info(
|
|
176
|
+
"Attempting to establish SSH connection to %s@%s...",
|
|
177
|
+
keyName,
|
|
178
|
+
self.effectiveIP,
|
|
179
|
+
)
|
|
180
|
+
self.sshInstance("ps", sshOptions=["-oBatchMode=yes"], user=keyName)
|
|
139
181
|
except RuntimeError as err:
|
|
140
182
|
last_error = str(err)
|
|
141
|
-
logger.info(
|
|
183
|
+
logger.info(
|
|
184
|
+
"Connection rejected, waiting for public SSH key to be propagated. Trying again in 10s."
|
|
185
|
+
)
|
|
142
186
|
time.sleep(10)
|
|
143
187
|
else:
|
|
144
|
-
logger.info(
|
|
188
|
+
logger.info("...SSH connection established.")
|
|
145
189
|
return
|
|
146
190
|
|
|
147
|
-
def _waitForDockerDaemon(self, keyName=
|
|
148
|
-
logger.info(
|
|
191
|
+
def _waitForDockerDaemon(self, keyName="core"):
|
|
192
|
+
logger.info("Waiting for docker on %s to start...", self.effectiveIP)
|
|
149
193
|
sleepTime = 10
|
|
150
194
|
startTime = time.time()
|
|
151
195
|
while True:
|
|
152
196
|
if time.time() - startTime > self.maxWaitTime:
|
|
153
|
-
raise RuntimeError(
|
|
197
|
+
raise RuntimeError(
|
|
198
|
+
"Docker daemon failed to start on machine with ip %s"
|
|
199
|
+
% self.effectiveIP
|
|
200
|
+
)
|
|
154
201
|
try:
|
|
155
|
-
output = self.sshInstance(
|
|
156
|
-
|
|
202
|
+
output = self.sshInstance(
|
|
203
|
+
"/usr/bin/ps", "auxww", sshOptions=["-oBatchMode=yes"], user=keyName
|
|
204
|
+
)
|
|
205
|
+
if b"dockerd" in output:
|
|
157
206
|
# docker daemon has started
|
|
158
|
-
logger.info(
|
|
207
|
+
logger.info("Docker daemon running")
|
|
159
208
|
break
|
|
160
209
|
else:
|
|
161
|
-
logger.info(
|
|
210
|
+
logger.info(
|
|
211
|
+
"... Still waiting for docker daemon, trying in %s sec..."
|
|
212
|
+
% sleepTime
|
|
213
|
+
)
|
|
162
214
|
time.sleep(sleepTime)
|
|
163
215
|
except RuntimeError:
|
|
164
216
|
logger.info("Wait for docker daemon failed ssh, trying again.")
|
|
165
217
|
sleepTime += 20
|
|
166
218
|
|
|
167
|
-
def _waitForAppliance(self, role, keyName=
|
|
168
|
-
logger.info(
|
|
219
|
+
def _waitForAppliance(self, role, keyName="core"):
|
|
220
|
+
logger.info("Waiting for %s Toil appliance to start...", role)
|
|
169
221
|
sleepTime = 20
|
|
170
222
|
startTime = time.time()
|
|
171
223
|
while True:
|
|
172
224
|
if time.time() - startTime > self.maxWaitTime:
|
|
173
|
-
raise RuntimeError(
|
|
174
|
-
|
|
225
|
+
raise RuntimeError(
|
|
226
|
+
"Appliance failed to start on machine with IP: "
|
|
227
|
+
+ self.effectiveIP
|
|
228
|
+
+ "\nCheck if TOIL_APPLIANCE_SELF is set correctly and the container exists."
|
|
229
|
+
)
|
|
175
230
|
try:
|
|
176
|
-
output = self.sshInstance(
|
|
177
|
-
|
|
178
|
-
|
|
231
|
+
output = self.sshInstance(
|
|
232
|
+
"/usr/bin/docker",
|
|
233
|
+
"ps",
|
|
234
|
+
sshOptions=["-oBatchMode=yes"],
|
|
235
|
+
user=keyName,
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
role = (
|
|
239
|
+
bytes(role, encoding="utf-8")
|
|
240
|
+
if type(role) != type(output)
|
|
241
|
+
else role
|
|
242
|
+
)
|
|
179
243
|
|
|
180
244
|
if role in output:
|
|
181
|
-
logger.info(
|
|
245
|
+
logger.info("...Toil appliance started")
|
|
182
246
|
break
|
|
183
247
|
else:
|
|
184
|
-
logger.info(
|
|
185
|
-
|
|
186
|
-
|
|
248
|
+
logger.info(
|
|
249
|
+
"...Still waiting for appliance, trying again in %s sec..."
|
|
250
|
+
% sleepTime
|
|
251
|
+
)
|
|
252
|
+
logger.debug(f"Role: {role}\n" f"Output: {output}\n\n")
|
|
187
253
|
time.sleep(sleepTime)
|
|
188
254
|
except RuntimeError:
|
|
189
255
|
# ignore exceptions, keep trying
|
|
@@ -197,13 +263,13 @@ class Node:
|
|
|
197
263
|
:return: the number of unsuccessful attempts to connect to the port before a the first
|
|
198
264
|
success
|
|
199
265
|
"""
|
|
200
|
-
logger.debug(
|
|
266
|
+
logger.debug("Waiting for ssh port on %s to open...", self.effectiveIP)
|
|
201
267
|
for i in count():
|
|
202
268
|
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
203
269
|
try:
|
|
204
270
|
s.settimeout(a_short_time)
|
|
205
271
|
s.connect((self.effectiveIP, 22))
|
|
206
|
-
logger.debug(
|
|
272
|
+
logger.debug("...ssh port open")
|
|
207
273
|
return i
|
|
208
274
|
except OSError:
|
|
209
275
|
pass
|
|
@@ -217,7 +283,7 @@ class Node:
|
|
|
217
283
|
interactive SSHing. The default value is False. Input=string is passed as
|
|
218
284
|
input to the Popen call.
|
|
219
285
|
"""
|
|
220
|
-
kwargs[
|
|
286
|
+
kwargs["appliance"] = True
|
|
221
287
|
return self.coreSSH(*args, **kwargs)
|
|
222
288
|
|
|
223
289
|
def sshInstance(self, *args, **kwargs):
|
|
@@ -225,7 +291,7 @@ class Node:
|
|
|
225
291
|
Run a command on the instance.
|
|
226
292
|
Returns the binary output of the command.
|
|
227
293
|
"""
|
|
228
|
-
kwargs[
|
|
294
|
+
kwargs["collectStdout"] = True
|
|
229
295
|
return self.coreSSH(*args, **kwargs)
|
|
230
296
|
|
|
231
297
|
def coreSSH(self, *args, **kwargs):
|
|
@@ -241,64 +307,74 @@ class Node:
|
|
|
241
307
|
:param bytes input: UTF-8 encoded input bytes to send to the command
|
|
242
308
|
|
|
243
309
|
"""
|
|
244
|
-
commandTokens = [
|
|
245
|
-
if not kwargs.pop(
|
|
246
|
-
kwargs[
|
|
247
|
-
|
|
248
|
-
|
|
310
|
+
commandTokens = ["ssh", "-tt"]
|
|
311
|
+
if not kwargs.pop("strict", False):
|
|
312
|
+
kwargs["sshOptions"] = [
|
|
313
|
+
"-oUserKnownHostsFile=/dev/null",
|
|
314
|
+
"-oStrictHostKeyChecking=no",
|
|
315
|
+
] + kwargs.get("sshOptions", [])
|
|
316
|
+
sshOptions = kwargs.pop("sshOptions", None)
|
|
249
317
|
# Forward ports:
|
|
250
318
|
# 5050 for Mesos dashboard (although to talk to agents you will need a proxy)
|
|
251
|
-
commandTokens.extend([
|
|
319
|
+
commandTokens.extend(["-L", "5050:localhost:5050"])
|
|
252
320
|
if sshOptions:
|
|
253
321
|
# add specified options to ssh command
|
|
254
322
|
assert isinstance(sshOptions, list)
|
|
255
323
|
commandTokens.extend(sshOptions)
|
|
256
324
|
# specify host
|
|
257
|
-
user = kwargs.pop(
|
|
258
|
-
commandTokens.append(f
|
|
325
|
+
user = kwargs.pop("user", "core") # CHANGED: Is this needed?
|
|
326
|
+
commandTokens.append(f"{user}@{str(self.effectiveIP)}")
|
|
259
327
|
|
|
260
|
-
inputString = kwargs.pop(
|
|
328
|
+
inputString = kwargs.pop("input", None)
|
|
261
329
|
if inputString is not None:
|
|
262
|
-
kwargs[
|
|
330
|
+
kwargs["stdin"] = subprocess.PIPE
|
|
263
331
|
|
|
264
|
-
if kwargs.pop(
|
|
265
|
-
kwargs[
|
|
266
|
-
kwargs[
|
|
332
|
+
if kwargs.pop("collectStdout", None):
|
|
333
|
+
kwargs["stdout"] = subprocess.PIPE
|
|
334
|
+
kwargs["stderr"] = subprocess.PIPE
|
|
267
335
|
|
|
268
|
-
tty = kwargs.pop(
|
|
269
|
-
if kwargs.pop(
|
|
270
|
-
ttyFlag =
|
|
271
|
-
commandTokens += [
|
|
336
|
+
tty = kwargs.pop("tty", None)
|
|
337
|
+
if kwargs.pop("appliance", None):
|
|
338
|
+
ttyFlag = "-t" if tty else ""
|
|
339
|
+
commandTokens += ["docker", "exec", "-i", ttyFlag, "toil_leader"]
|
|
272
340
|
|
|
273
|
-
logger.debug(
|
|
274
|
-
args = list(map(
|
|
341
|
+
logger.debug("Node %s: %s", self.effectiveIP, " ".join(args))
|
|
342
|
+
args = list(map(quote, args))
|
|
275
343
|
commandTokens += args
|
|
276
|
-
logger.debug(
|
|
344
|
+
logger.debug("Full command %s", " ".join(commandTokens))
|
|
277
345
|
process = subprocess.Popen(commandTokens, **kwargs)
|
|
278
346
|
stdout, stderr = process.communicate(input=inputString)
|
|
279
347
|
# at this point the process has already exited, no need for a timeout
|
|
280
348
|
exit_code = process.returncode
|
|
281
349
|
# ssh has been throwing random 255 errors - why?
|
|
282
350
|
if exit_code != 0:
|
|
283
|
-
logger.info(
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
351
|
+
logger.info(
|
|
352
|
+
'Executing the command "%s" on the appliance returned a non-zero '
|
|
353
|
+
"exit code %s with stdout %s and stderr %s"
|
|
354
|
+
% (" ".join(args), exit_code, stdout, stderr)
|
|
355
|
+
)
|
|
356
|
+
raise RuntimeError(
|
|
357
|
+
'Executing the command "%s" on the appliance returned a non-zero '
|
|
358
|
+
"exit code %s with stdout %s and stderr %s"
|
|
359
|
+
% (" ".join(args), exit_code, stdout, stderr)
|
|
360
|
+
)
|
|
289
361
|
return stdout
|
|
290
362
|
|
|
291
|
-
def coreRsync(
|
|
292
|
-
|
|
363
|
+
def coreRsync(
|
|
364
|
+
self, args: list[str], applianceName: str = "toil_leader", **kwargs: Any
|
|
365
|
+
) -> int:
|
|
366
|
+
remoteRsync = (
|
|
367
|
+
"docker exec -i %s rsync -v" % applianceName
|
|
368
|
+
) # Access rsync inside appliance
|
|
293
369
|
parsedArgs = []
|
|
294
370
|
sshCommand = "ssh"
|
|
295
|
-
if not kwargs.pop(
|
|
371
|
+
if not kwargs.pop("strict", False):
|
|
296
372
|
sshCommand = "ssh -oUserKnownHostsFile=/dev/null -oStrictHostKeyChecking=no"
|
|
297
373
|
hostInserted = False
|
|
298
374
|
# Insert remote host address
|
|
299
375
|
for i in args:
|
|
300
376
|
if i.startswith(":") and not hostInserted:
|
|
301
|
-
user = kwargs.pop(
|
|
377
|
+
user = kwargs.pop("user", "core") # CHANGED: Is this needed?
|
|
302
378
|
i = (f"{user}@{self.effectiveIP}") + i
|
|
303
379
|
hostInserted = True
|
|
304
380
|
elif i.startswith(":") and hostInserted:
|
|
@@ -306,7 +382,7 @@ class Node:
|
|
|
306
382
|
parsedArgs.append(i)
|
|
307
383
|
if not hostInserted:
|
|
308
384
|
raise ValueError("No remote host found in argument list")
|
|
309
|
-
command = [
|
|
385
|
+
command = ["rsync", "-e", sshCommand, "--rsync-path", remoteRsync]
|
|
310
386
|
logger.debug("Running %r.", command + parsedArgs)
|
|
311
387
|
|
|
312
388
|
return subprocess.check_call(command + parsedArgs)
|
toil/realtimeLogger.py
CHANGED
|
@@ -20,7 +20,7 @@ import os.path
|
|
|
20
20
|
import socketserver as SocketServer
|
|
21
21
|
import threading
|
|
22
22
|
from types import TracebackType
|
|
23
|
-
from typing import TYPE_CHECKING, Any, Optional
|
|
23
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
24
24
|
|
|
25
25
|
from toil.lib.misc import get_public_ip
|
|
26
26
|
from toil.statsAndLogging import set_log_level
|
|
@@ -49,7 +49,7 @@ class LoggingDatagramHandler(SocketServer.BaseRequestHandler):
|
|
|
49
49
|
|
|
50
50
|
try:
|
|
51
51
|
# Parse it as JSON
|
|
52
|
-
message_attrs = json.loads(data.decode(
|
|
52
|
+
message_attrs = json.loads(data.decode("utf-8"))
|
|
53
53
|
# Fluff it up into a proper logging record
|
|
54
54
|
record = logging.makeLogRecord(message_attrs)
|
|
55
55
|
if isinstance(record.args, list):
|
|
@@ -81,7 +81,7 @@ class JSONDatagramHandler(logging.handlers.DatagramHandler):
|
|
|
81
81
|
|
|
82
82
|
def makePickle(self, record: logging.LogRecord) -> bytes:
|
|
83
83
|
"""Actually, encode the record as bare JSON instead."""
|
|
84
|
-
return json.dumps(record.__dict__).encode(
|
|
84
|
+
return json.dumps(record.__dict__).encode("utf-8")
|
|
85
85
|
|
|
86
86
|
|
|
87
87
|
class RealtimeLoggerMetaclass(type):
|
|
@@ -113,7 +113,7 @@ class RealtimeLogger(metaclass=RealtimeLoggerMetaclass):
|
|
|
113
113
|
envPrefix = "TOIL_RT_LOGGING_"
|
|
114
114
|
|
|
115
115
|
# Avoid duplicating the default level everywhere
|
|
116
|
-
defaultLevel =
|
|
116
|
+
defaultLevel = "INFO"
|
|
117
117
|
|
|
118
118
|
# State maintained on server and client
|
|
119
119
|
|
|
@@ -131,19 +131,24 @@ class RealtimeLogger(metaclass=RealtimeLoggerMetaclass):
|
|
|
131
131
|
logger = None
|
|
132
132
|
|
|
133
133
|
@classmethod
|
|
134
|
-
def _startLeader(
|
|
134
|
+
def _startLeader(
|
|
135
|
+
cls, batchSystem: "AbstractBatchSystem", level: str = defaultLevel
|
|
136
|
+
) -> None:
|
|
135
137
|
with cls.lock:
|
|
136
138
|
if cls.initialized == 0:
|
|
137
139
|
cls.initialized += 1
|
|
138
140
|
if level:
|
|
139
|
-
logger.info(
|
|
141
|
+
logger.info("Starting real-time logging.")
|
|
140
142
|
# Start up the logging server
|
|
141
143
|
cls.loggingServer = SocketServer.ThreadingUDPServer(
|
|
142
|
-
|
|
143
|
-
|
|
144
|
+
server_address=("0.0.0.0", 0),
|
|
145
|
+
RequestHandlerClass=LoggingDatagramHandler,
|
|
146
|
+
)
|
|
144
147
|
|
|
145
148
|
# Set up a thread to do all the serving in the background and exit when we do
|
|
146
|
-
cls.serverThread = threading.Thread(
|
|
149
|
+
cls.serverThread = threading.Thread(
|
|
150
|
+
target=cls.loggingServer.serve_forever
|
|
151
|
+
)
|
|
147
152
|
cls.serverThread.daemon = True
|
|
148
153
|
cls.serverThread.start()
|
|
149
154
|
|
|
@@ -156,28 +161,30 @@ class RealtimeLogger(metaclass=RealtimeLoggerMetaclass):
|
|
|
156
161
|
os.environ[name] = value
|
|
157
162
|
batchSystem.setEnv(name)
|
|
158
163
|
|
|
159
|
-
_setEnv(
|
|
160
|
-
_setEnv(
|
|
164
|
+
_setEnv("ADDRESS", "%s:%i" % (ip, port))
|
|
165
|
+
_setEnv("LEVEL", level)
|
|
161
166
|
else:
|
|
162
|
-
logger.debug(
|
|
167
|
+
logger.debug("Real-time logging disabled")
|
|
163
168
|
else:
|
|
164
169
|
if level:
|
|
165
|
-
logger.warning(
|
|
170
|
+
logger.warning("Ignoring nested request to start real-time logging")
|
|
166
171
|
|
|
167
172
|
@classmethod
|
|
168
173
|
def _stopLeader(cls) -> None:
|
|
169
174
|
"""Stop the server on the leader."""
|
|
170
175
|
with cls.lock:
|
|
171
176
|
if cls.initialized == 0:
|
|
172
|
-
raise RuntimeError(
|
|
177
|
+
raise RuntimeError(
|
|
178
|
+
"Can't stop the server on the leader as the leader was never initialized."
|
|
179
|
+
)
|
|
173
180
|
cls.initialized -= 1
|
|
174
181
|
if cls.initialized == 0:
|
|
175
182
|
if cls.loggingServer:
|
|
176
|
-
logger.info(
|
|
183
|
+
logger.info("Stopping real-time logging server.")
|
|
177
184
|
cls.loggingServer.shutdown()
|
|
178
185
|
cls.loggingServer = None
|
|
179
186
|
if cls.serverThread:
|
|
180
|
-
logger.info(
|
|
187
|
+
logger.info("Joining real-time logging server thread.")
|
|
181
188
|
cls.serverThread.join()
|
|
182
189
|
cls.serverThread = None
|
|
183
190
|
for k in list(os.environ.keys()):
|
|
@@ -198,9 +205,9 @@ class RealtimeLogger(metaclass=RealtimeLoggerMetaclass):
|
|
|
198
205
|
if cls.logger is None:
|
|
199
206
|
with cls.lock:
|
|
200
207
|
if cls.logger is None:
|
|
201
|
-
cls.logger = logging.getLogger(
|
|
208
|
+
cls.logger = logging.getLogger("toil-rt")
|
|
202
209
|
try:
|
|
203
|
-
level = os.environ[cls.envPrefix +
|
|
210
|
+
level = os.environ[cls.envPrefix + "LEVEL"]
|
|
204
211
|
except KeyError:
|
|
205
212
|
# There is no server running on the leader, so suppress most log messages
|
|
206
213
|
# and skip the UDP stuff.
|
|
@@ -209,16 +216,16 @@ class RealtimeLogger(metaclass=RealtimeLoggerMetaclass):
|
|
|
209
216
|
# Adopt the logging level set on the leader.
|
|
210
217
|
set_log_level(level, cls.logger)
|
|
211
218
|
try:
|
|
212
|
-
address = os.environ[cls.envPrefix +
|
|
219
|
+
address = os.environ[cls.envPrefix + "ADDRESS"]
|
|
213
220
|
except KeyError:
|
|
214
221
|
pass
|
|
215
222
|
else:
|
|
216
223
|
# We know where to send messages to, so send them.
|
|
217
|
-
host, port = address.split(
|
|
224
|
+
host, port = address.split(":")
|
|
218
225
|
cls.logger.addHandler(JSONDatagramHandler(host, int(port)))
|
|
219
226
|
return cls.logger
|
|
220
227
|
|
|
221
|
-
def __init__(self, batchSystem:
|
|
228
|
+
def __init__(self, batchSystem: "AbstractBatchSystem", level: str = defaultLevel):
|
|
222
229
|
"""
|
|
223
230
|
Create a context manager that starts up the UDP server.
|
|
224
231
|
|
|
@@ -237,5 +244,10 @@ class RealtimeLogger(metaclass=RealtimeLoggerMetaclass):
|
|
|
237
244
|
RealtimeLogger._startLeader(self.__batchSystem, level=self.__level)
|
|
238
245
|
|
|
239
246
|
# noinspection PyUnusedLocal
|
|
240
|
-
def __exit__(
|
|
247
|
+
def __exit__(
|
|
248
|
+
self,
|
|
249
|
+
exc_type: Optional[type[BaseException]],
|
|
250
|
+
exc_val: Optional[BaseException],
|
|
251
|
+
exc_tb: Optional[TracebackType],
|
|
252
|
+
) -> None:
|
|
241
253
|
RealtimeLogger._stopLeader()
|