patme 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of patme might be problematic. Click here for more details.
- patme/__init__.py +52 -0
- patme/buildtools/__init__.py +7 -0
- patme/buildtools/rce_releasecreator.py +336 -0
- patme/buildtools/release.py +26 -0
- patme/femtools/__init__.py +5 -0
- patme/femtools/abqmsgfilechecker.py +137 -0
- patme/femtools/fecall.py +1092 -0
- patme/geometry/__init__.py +0 -0
- patme/geometry/area.py +124 -0
- patme/geometry/coordinatesystem.py +635 -0
- patme/geometry/intersect.py +284 -0
- patme/geometry/line.py +183 -0
- patme/geometry/misc.py +420 -0
- patme/geometry/plane.py +464 -0
- patme/geometry/rotate.py +244 -0
- patme/geometry/scale.py +152 -0
- patme/geometry/shape2d.py +50 -0
- patme/geometry/transformations.py +1831 -0
- patme/geometry/translate.py +139 -0
- patme/mechanics/__init__.py +4 -0
- patme/mechanics/loads.py +435 -0
- patme/mechanics/material.py +1260 -0
- patme/service/__init__.py +7 -0
- patme/service/decorators.py +85 -0
- patme/service/duration.py +96 -0
- patme/service/exceptionhook.py +104 -0
- patme/service/exceptions.py +36 -0
- patme/service/io/__init__.py +3 -0
- patme/service/io/basewriter.py +122 -0
- patme/service/logger.py +375 -0
- patme/service/mathutils.py +108 -0
- patme/service/misc.py +71 -0
- patme/service/moveimports.py +217 -0
- patme/service/stringutils.py +419 -0
- patme/service/systemutils.py +290 -0
- patme/sshtools/__init__.py +3 -0
- patme/sshtools/cara.py +435 -0
- patme/sshtools/clustercaller.py +420 -0
- patme/sshtools/facluster.py +350 -0
- patme/sshtools/sshcall.py +168 -0
- patme-0.4.4.dist-info/LICENSE +21 -0
- patme-0.4.4.dist-info/LICENSES/MIT.txt +9 -0
- patme-0.4.4.dist-info/METADATA +168 -0
- patme-0.4.4.dist-info/RECORD +46 -0
- patme-0.4.4.dist-info/WHEEL +4 -0
- patme-0.4.4.dist-info/entry_points.txt +3 -0
patme/sshtools/cara.py
ADDED
|
@@ -0,0 +1,435 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2022 German Aerospace Center (DLR)
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Connect via python to the FA cluster using ssh, submit commands, move inputs/outputs to and from the cluster.
|
|
7
|
+
|
|
8
|
+
**Scenario**
|
|
9
|
+
|
|
10
|
+
To connect to the institute cluster and submit a job, two things need to be done.
|
|
11
|
+
First the input files must be copied to the cluster.
|
|
12
|
+
|
|
13
|
+
``\\\\cluster.fa.bs.dlr.de\\<username>\\``
|
|
14
|
+
|
|
15
|
+
Secondly, when all required files are available on the cluster,
|
|
16
|
+
the cluster command (see cluster documentation) needs to be sent using a secure connection.
|
|
17
|
+
This is done via ssh using the RSA public/private key algorithm.
|
|
18
|
+
|
|
19
|
+
**Connect for the first time**
|
|
20
|
+
|
|
21
|
+
- Create a public+private key using ssh-keygen
|
|
22
|
+
- Put the private key to the location specified in patme.sshtools.sshcall.privateKeyFileConfig
|
|
23
|
+
or adapt this variable at runtime.
|
|
24
|
+
- Append the public key to "~/.ssh/authorized_keys" on the remote computer
|
|
25
|
+
|
|
26
|
+
**How to connect to a host**
|
|
27
|
+
|
|
28
|
+
>> sshCluster('echo hello world')
|
|
29
|
+
'hello world\\n'
|
|
30
|
+
|
|
31
|
+
"""
|
|
32
|
+
import os
|
|
33
|
+
import re
|
|
34
|
+
from time import sleep, time
|
|
35
|
+
|
|
36
|
+
from patme.service.exceptions import DelisSshError
|
|
37
|
+
from patme.service.logger import log
|
|
38
|
+
from patme.sshtools import sshcall
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_default_slurm_args():
    """Return the default SLURM batch arguments for a CARA cluster job.

    The partition can be overridden through the ``CARA_PARTITION`` environment
    variable; every other default is fixed.
    """
    partition = os.environ.get("CARA_PARTITION", "ppp")
    defaultArgs = {}
    defaultArgs["nodes"] = "1"
    defaultArgs["hint"] = "nomultithread"
    defaultArgs["time"] = "02:00:00"
    defaultArgs["account"] = "2263032"
    defaultArgs["no-kill"] = ""
    defaultArgs["output"] = "cluster.r%j.log"
    defaultArgs["partition"] = partition
    return defaultArgs
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _getClusterAuthentication():
    """Return the hostname, host key string and private key file information for a cluster call.

    A description of these objects can be found in service.sshremotecall.callSSH.

    :return: tuple (hostname, hostKeyString, privateKey); ``privateKey`` is the
        path of the private key file, or None if no key file could be found.
    """
    hostname = "cara.dlr.de"
    # this is the key from the file "~/.ssh/known_hosts" used by openssh on a linux machine
    hostKeyString = "AAAAB3NzaC1yc2EAAAADAQABAAACAQDL9y9u3D+refVuZnJJNdVeMK53EG0hfGUwuA+JyT2zOs6xOnhXhbTB0hOpORv4sd9V3mHJDf1yyIlZ/bgJCT4Znazz3amqzD7SmqGeNR8r7Z4whQY0drMpL67fthFNsqoUdjsOn+FZfWsZhy2ntMLIi4KRZ9Kaoe8Kqo3j1gej0iwq6W2+LYB69zhP1SHtT+603Qw97kAgrQeA2R71BFwUXSRzgDbPlucX8he9S4WjWZ3OTpfXksQtIN/8jGAsTw6x/4iu1ia8bjW5jc4q5qrF4UPdsRlbuByn2/QBU4XHZUcq6rZqv6KGyNqja2sZHsT7weDHo5JtYMNUzVB75SfmMigIxy3hcD6xicc5gSLQuw7e1BZsC8ld9Ku5hkL9OdXl/jkble55dO9lEKgze+y0QscBAYJKgi0FpQSMxw9SNt1IdImosIWfTT3jY3halybgWKvx85LVM86q45bk0RSSjgh1Oup87UO3GqF72zA+PX36v32WqMKoQ6ssqKjXOwSsXC1Ytf4GU7utoUXsqqFZOM/6CZp/09yPTTkkZGGsy2iUOw/1bS3uQcZi+lIpWqtEbsHYjrEOIPxofz4gl2Fo8yfQoUhKmED4XwWMnw0jwxNHy2uwBQz0ysIT4tz1ekBUh4fgO+2xhX/g6O24sLsfAGzc/I1gIUpmMaGJOQiwuw=="
    privateKey = os.path.join(os.path.expanduser("~"), ".ssh", "id_rsa")

    try:
        from sshconf import read_ssh_config

        cfgfile = os.path.join(os.path.expanduser("~"), ".ssh", "config")
        if os.path.exists(cfgfile):
            conf = read_ssh_config(cfgfile)
            for host in conf.hosts():
                cara_host = conf.host(host)
                # Bugfix: entries without a "hostname"/"identityfile" key (e.g.
                # wildcard host entries) must not abort the lookup with a KeyError
                if "cara.dlr.de" in cara_host.get("hostname", ""):
                    privateKey = cara_host.get("identityfile", privateKey)
                    break

    except ImportError:
        # sshconf is optional - fall back to the default key location
        pass

    if not os.path.exists(privateKey):
        msg = f"Private key file '{privateKey}' not found! "
        msg += "Ensure that the ssh-agent has a valid private key stored"
        log.warn(msg)
        privateKey = None

    return hostname, hostKeyString, privateKey
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def sshClusterJob(
    remoteCommand, printOutput=True, checkInterval=5, time2WaitForJob=30, monitorNodeUsage=False, **kwargs
):
    """Submit a job to the institute cluster via ssh and return when terminated.

    After job submission, a connection to the cluster is established every 'checkInterval' seconds
    to check if the job's status is already set to 'COMPLETED'.

    :param remoteCommand: String with command for cluster. The arguments for the queuing
                          system must not contain the option '-i' to wait for job
                          completion.
    :param printOutput: True (default) will print output created by the ssh call.
    :param checkInterval: Time period in seconds between job completion checks
    :param time2WaitForJob: After job submission it might take some time for the cluster to
                            add the job to the queue. Enter max seconds [int] to wait.
    :param monitorNodeUsage: if True, check the utilization of the executing node once

    :return: int, job id
    :raise DelisSshError: if the submission failed or no job id could be parsed
    """
    with log.switchLevelTemp(log.WARN):
        retVal = sshCluster(remoteCommand, printOutput=printOutput, **kwargs)

    # "not retVal or (retVal and ...)" simplified - equivalent by short-circuiting
    if not retVal or "Submitted batch job" not in retVal:
        msg = "Job submission to cluster failed or maybe the arguments "
        msg += "for the cluster contained the option -i\n"
        msg += f"remote command: {remoteCommand}\nreturn value: {retVal}"
        raise DelisSshError(msg)

    try:
        # sbatch output ends with the numeric job id
        jobId = int(retVal.split()[-1])
    except (ValueError, IndexError) as exc:
        # Bugfix: was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; only parsing failures are expected here
        msg = "Could not extract job id for cluster job submission.\n"
        msg += f"remote command: {remoteCommand}\nreturn value: {retVal}"
        raise DelisSshError(msg) from exc

    log.info(f"Job enqueued. JobId: {jobId}")
    usageWarningDone = False
    jobStatus = "ENQUEUED"
    while True:
        with log.switchLevelTemp(log.WARN):
            (jobEnded, retStatus) = clusterJobEnded(jobId, time2WaitForJob, **kwargs)

        if jobEnded:
            break

        # only log status transitions, not every poll
        if retStatus != jobStatus:
            log.info(f"Job status: {retStatus}")
            jobStatus = retStatus

        if monitorNodeUsage and not usageWarningDone:
            nodeName = getNodeOfJob(jobId)
            usageWarningDone = printNodeUtilization(nodeName, usageWarningDone)

        sleep(checkInterval)

    return jobId
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def sshCluster(remoteCommand, printOutput=True, **kwargs):
    """Submit a job to the institute cluster via ssh.

    This call returns immediately and does not wait for the completion of the
    cluster call. Please use sshClusterJob instead to block until completion.

    :param remoteCommand: String with command for cluster
    :param printOutput: True (default) will print output created by the ssh call.
    """
    clusterHost, clusterHostKey, keyFile = _getClusterAuthentication()
    return sshcall.callSSH(
        clusterHost,
        remoteCommand,
        keyFile,
        username=kwargs.get("username", None),
        hostKeyString=clusterHostKey,
        printOutput=printOutput,
    )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def clusterJobEnded(jobId, time2WaitForJob=30, printOutput=False, **kwargs):
    """Checks if jobId is still listed in the cluster queue.

    :param jobId: Id of job running on the cluster <int>
    :param time2WaitForJob: After job submission it might take some time for the cluster to
                            add the job to the queue. Enter max seconds [int] to wait.
    :param printOutput: Flag if the ssh output should be printed. Defaults to False
    :return: tuple (jobEnded, status): jobEnded is True if the job terminated
    :raise DelisSshError: in case job is neither running nor completed successfully
    """
    status = clusterJobStatus(jobId, printOutput=printOutput, **kwargs)
    if not status and time2WaitForJob:
        startTime = time()
        while not status:
            # Bugfix: the original loop re-polled immediately, hammering the
            # cluster with back-to-back ssh calls; throttle with a short sleep
            sleep(1)
            status = clusterJobStatus(jobId, printOutput=printOutput, **kwargs)
            if time() - startTime > time2WaitForJob:
                raise DelisSshError(f"Could not obtain status of cluster job with id {jobId}")

    if not status:
        raise DelisSshError(f"Job with id {jobId} not found in cluster job history.")

    elif "PENDING" == status:
        log.debug("Job execution on cluster is waiting for resources.")
        return (False, status)

    elif status in ["RESIZING", "RUNNING", "REQUEUED"]:
        return (False, status)

    elif "FAILED" == status:
        log.debug(f"Job with id {jobId} failed")
        return (True, status)

    elif "CANCELLED" == status:
        log.debug(f"Job with id {jobId} was cancelled.")
        return (True, status)

    elif "COMPLETED" == status:
        return (True, status)

    elif "OUT_OF_ME" in status:
        # sacct truncates "OUT_OF_MEMORY", hence the substring match
        log.debug(f"Job with id {jobId} failed due to too less memory")
        return (True, status)

    else:
        raise DelisSshError(f'Unknown cluster status: "{status}"')
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def clusterJobStatus(jobId, printOutput=False, **kwargs):
    """Query the scheduler for the status of the cluster job "jobId".

    :param jobId: id of cluster process (int)
    :param printOutput: Flag if the ssh output should be printed. Defaults to False
    :return: status string as reported by sacct (e.g. "PENDING", "RUNNING").
        If the job already disappeared from squeue while sacct still reports
        RUNNING/PENDING, "COMPLETED" is returned instead.
    """
    queueListing = sshCluster("squeue", printOutput=printOutput, **kwargs)
    stillQueued = re.search(rf"\s+{jobId} ", queueListing) is not None
    accountingOutput = sshCluster(f"sacct -o state -n -j {jobId}", printOutput=printOutput, **kwargs)
    # first sacct line carries the state; strip the "+" suffix of truncated states
    status = accountingOutput.split("\n")[0].replace("+", "").strip()
    if not stillQueued and status in ("RUNNING", "PENDING"):
        # squeue no longer lists the job but sacct has not caught up yet
        return "COMPLETED"
    return status
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def copyClusterFilesSCP(files, srcBaseDir=".", destBaseDir=".", mode="put", keytype="ssh-rsa", port=None, **kwargs):
    """Copy files to or from the cluster via SCP, using the cluster authentication.

    :param files: files to transfer
    :param srcBaseDir: base directory of the source files (default ".")
    :param destBaseDir: base directory at the destination (default ".")
    :param mode: transfer direction; presumably "put" uploads and "get" downloads
        -- TODO confirm against sshcall.copyFilesSCP
    :param keytype: ssh host key type (default "ssh-rsa")
    :param port: ssh port to use; None for the default
    :param kwargs: may contain "username" for the remote login
    """
    hostname, bsfalxclusterKeyString, privateKeyFile = _getClusterAuthentication()
    username = kwargs.pop("username", None)
    # suppress info-level log output during the transfer
    with log.switchLevelTemp(log.WARN):
        sshcall.copyFilesSCP(
            files,
            hostname,
            privateKeyFile,
            username,
            srcBaseDir,
            destBaseDir,
            bsfalxclusterKeyString,
            mode,
            keytype,
            port,
        )
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _wrapSshCluster(*args, **kwargs):
    """Call sshCluster, retrying up to three times on failure.

    Transient problems (e.g. a temporarily unreachable cluster head node) are
    retried after a 60 second pause; the last error is re-raised if all
    retries fail.

    :return: output of the successful sshCluster call
    """
    retries = 3
    for retry in range(retries):
        try:
            result = sshCluster(*args, **kwargs)
            break
        except Exception as e:
            # Bugfix 1: "time" is the function imported via "from time import
            # sleep, time", so the original "time.sleep(60)" raised an
            # AttributeError; use the imported "sleep" directly.
            # Bugfix 2: "retry < retries" was always true inside
            # range(retries), so the final failure never re-raised and left
            # "result" unbound (NameError on return).
            if retry < retries - 1:
                log.error(f"Got an error while calling the cluster (retry in 60s): {e}")
                sleep(60)
            else:
                raise
    return result
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def numberOfClusterJobsAvailable(exclusiveNode=False):
    """Checks and returns the number of available jobs on the FA cluster.

    :param exclusiveNode: if True, number of cluster jobs is given, that can allocate
                          a complete node. The default is False

    :returns: Returns the number of jobs that can be executed on the cluster.
    """
    # first command lists node states, second lists the nodes of active jobs
    clusterCommand = 'sinfo -h -o "%t %N";'
    clusterCommand += 'squeue -h -t RUNNING,COMPLETING -o "%N"'
    clusterOutput = _wrapSshCluster(clusterCommand, printOutput=False).split("\n")

    # STATE NODELIST <- this line does not appear in clusterOutput
    # mix node[1,3] <- these nodes have one or more active jobs
    # alloc node5 <- these nodes are exclusively used
    # idle node[2,4,6] <- these nodes are awaiting jobs (up to 2)
    # NODELIST <- this line does not appear in clusterOutput
    # node5 <- job on node5
    # node1
    # node3
    # node3

    # "[4:]" strips the literal "node" prefix; assumes all node names match the
    # pattern node<number> as shown above -- TODO confirm for the current cluster
    mixNodes = _splitNodes([line.split()[1][4:].strip("[]") for line in clusterOutput if "mix" in line])
    idleNodes = _splitNodes([line.split()[1][4:].strip("[]") for line in clusterOutput if "idle" in line])
    nodeNumbersOfActiveJobs = [int(line[4:].strip()) for line in clusterOutput if line.startswith("node")]

    numberOfPosssibleJobs = 0
    if exclusiveNode:
        # only fully idle nodes can host an exclusive job
        numberOfPosssibleJobs = len(idleNodes)
    else:
        # a partially used ("mix") node can take one more job while it runs fewer than 2
        for mixNode in mixNodes:
            if nodeNumbersOfActiveJobs.count(mixNode) < 2:
                numberOfPosssibleJobs += 1
        # each idle node can take up to 2 jobs
        numberOfPosssibleJobs += len(idleNodes) * 2
    return numberOfPosssibleJobs
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _splitNodes(nodes):
|
|
303
|
+
"""parses the nodes string and returns a list of node numbers
|
|
304
|
+
|
|
305
|
+
Example:
|
|
306
|
+
|
|
307
|
+
>>> inputString = ['1,4', '2-3,5-6']
|
|
308
|
+
>>> _splitNodes(inputString)
|
|
309
|
+
[1, 2, 3, 4, 5, 6]
|
|
310
|
+
"""
|
|
311
|
+
outputNodes = []
|
|
312
|
+
for nodesString in nodes:
|
|
313
|
+
groups = nodesString.split(",")
|
|
314
|
+
for group in groups:
|
|
315
|
+
groupMembers = group.split("-")
|
|
316
|
+
if len(groupMembers) > 1:
|
|
317
|
+
outputNodes.extend(range(int(groupMembers[0]), int(groupMembers[1]) + 1))
|
|
318
|
+
else:
|
|
319
|
+
outputNodes.append(int(groupMembers[0]))
|
|
320
|
+
return list(set(outputNodes))
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def numberOfIdleClusterNodes():
    """returns the number of idle cluster nodes

    cluster call returns: "3/3" which is Allocated/Idle

    Attention: This is not the number of possible cluster jobs, since 2 jobs can be run
    at each node. If zero nodes are idle, there may be still the opportunity to start
    a job right away.
    """
    allocatedSlashIdle = _wrapSshCluster('sinfo -h -e -o "%A"', printOutput=False)
    # the part after the "/" is the idle node count
    idleCount = allocatedSlashIdle.split("/")[-1]
    return int(idleCount)
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def getNodeUtilization(nodeName="head"):
    """Returns the utilization of the cluster head (default) or of one of its nodes.
    The information is retrieved using the commands vmstat and df. The keys of the
    returned dictionary are described in the following.

    Processes
        r: The number of processes waiting for run time.
        b: The number of processes in uninterruptible sleep.
    RAM Memory
        swpd: The amount of virtual memory used. (in MB)
        free: The amount of idle memory. (in MB)
        buff: The amount of memory used as buffers. (in MB)
        cache: The amount of memory used as cache. (in MB)
    Swap Memory
        si: Amount of memory swapped in from disk (in MB/s).
        so: Amount of memory swapped to disk (in MB/s).
    IO
        bi: Blocks received from a block device (blocks/s).
        bo: Blocks sent to a block device (blocks/s).
    System
        in: The number of interrupts per second, including the clock.
        cs: The number of context switches per second.
    CPU
        These are percentages of total CPU time.
        us: Time spent running non-kernel code. (user time, including nice time)
        sy: Time spent running kernel code. (system time)
        id: Time spent idle. Prior to Linux 2.5.41, this includes IO-wait time.
        wa: Time spent waiting for IO. Prior to Linux 2.5.41, shown as zero.
    HDD Memory
        1K-blocks: Total size of storage memory (in KB)
        Used: Total size of used storage memory (in KB)
        Available: Total size of available storage memory (in KB)
        Use%: Relative usage of storage memory (in %)

    :param nodeName: Name of the node (node1, ...) of which the information is to
                     be retrieved. The default is "head".
    :return: Dictionary with utilization information.
    """
    nodeCmdString = ""
    filesystem = "/home"
    if nodeName != "head":
        # commands for a compute node are tunneled through a nested ssh call
        nodeCmdString = f"ssh {nodeName} "
        filesystem = "/dev/sda3"
    remoteCmd = f"{nodeCmdString}vmstat -S M;"
    remoteCmd += f"{nodeCmdString}df -l -k"
    remoteCmdOutput = _wrapSshCluster(remoteCmd, printOutput=False).split("\n")
    # vmstat output: line index 1 holds the column headers, index 2 the values
    vmstatDict = dict(zip(remoteCmdOutput[1].split(), [float(item) for item in remoteCmdOutput[2].split()]))
    # df output: pick the row of the relevant filesystem; raises IndexError if absent
    dfData = [row for row in remoteCmdOutput if row.startswith(filesystem)][0]
    # line index 3 is assumed to be the df header row; "[1:-1]" drops the
    # Filesystem and Mounted-on columns -- TODO confirm df output layout
    dfDict = dict(zip(remoteCmdOutput[3].split()[1:-1], [float(item.strip("%")) for item in dfData.split()[1:-1]]))
    vmstatDict.update(dfDict)
    return vmstatDict
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def printNodeUtilization(nodeName, printOnCriticalUtilization=False):
    """Prints the utilization of a cluster node.

    Bugfix: the original signature carried a spurious leading "self" parameter
    although this is a module-level function; the in-module caller
    (sshClusterJob) passes only (nodeName, printOnCriticalUtilization), so the
    extra parameter mis-bound the arguments and was removed.

    :param nodeName: name of the cluster node to inspect
    :param printOnCriticalUtilization: Flag if only on a critical utilization, the routine should print anything
    :return: Flag if a usage warning was emit
    :raise DelisSshError: if nodeName could not be found
    """
    if not nodeName:
        raise DelisSshError(f'Utilization of the used cluster node {nodeName} cannot be performed: Node "" not found.')

    # warnings when only critical events should be shown, plain info otherwise
    logMethod = log.warn if printOnCriticalUtilization else log.info
    usageWarningDone = False
    utilizationInfo = getNodeUtilization(nodeName=nodeName)
    # free RAM in GB: idle memory plus reclaimable cache (vmstat reports MB)
    freeRam = (utilizationInfo["free"] + utilizationInfo["cache"]) / 1024
    # available disk in GB (df reports KB)
    freeHdd = utilizationInfo["Available"] / 1024 / 1024
    if freeHdd < 2 or not printOnCriticalUtilization:
        logMethod(f"HDD memory utilization of node {nodeName} critical. This may cause problems.")
        usageWarningDone = True
    if freeRam < 2 or not printOnCriticalUtilization:
        logMethod(f"RAM memory utilization of node {nodeName} critical. This may cause problems.")
        usageWarningDone = True
    return usageWarningDone
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def getNodeOfJob(jobId):
    """Returns the name of the node on which the job with id "jobId" is being executed on.

    :param jobId: Id of cluster process (int)
    :return: Name of node ("node1", "node2", ...) or None if jobId is not found.
    """
    node = None
    try:
        # token index 7 of the job's squeue line is expected to hold the node
        # name -- TODO confirm against the cluster's squeue output format
        node = _wrapSshCluster(f"squeue | grep {jobId}", printOutput=False).split()[7]
    except Exception:
        # Bugfix: was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; any ordinary failure just means the job is not
        # (or no longer) in the queue.
        log.warning("Node not found, because jobID not found in cluster queue")
    return node
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
if __name__ == "__main__":
    # manual smoke test: disable the configured private key file and run a
    # trivial remote command. The redundant re-import of sshcall was removed -
    # the module is already imported at the top of the file.
    sshcall.privateKeyFileConfig = None
    sshCluster("echo foobar")
|