toil 5.12.0-py3-none-any.whl → 6.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- toil/__init__.py +18 -13
- toil/batchSystems/abstractBatchSystem.py +39 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
- toil/batchSystems/awsBatch.py +14 -14
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +3 -3
- toil/batchSystems/htcondor.py +0 -1
- toil/batchSystems/kubernetes.py +34 -31
- toil/batchSystems/local_support.py +3 -1
- toil/batchSystems/lsf.py +7 -7
- toil/batchSystems/mesos/batchSystem.py +7 -7
- toil/batchSystems/options.py +32 -83
- toil/batchSystems/registry.py +104 -23
- toil/batchSystems/singleMachine.py +16 -13
- toil/batchSystems/slurm.py +87 -16
- toil/batchSystems/torque.py +0 -1
- toil/bus.py +44 -8
- toil/common.py +544 -753
- toil/cwl/__init__.py +28 -32
- toil/cwl/cwltoil.py +595 -574
- toil/cwl/utils.py +55 -10
- toil/exceptions.py +1 -1
- toil/fileStores/__init__.py +2 -2
- toil/fileStores/abstractFileStore.py +88 -14
- toil/fileStores/cachingFileStore.py +610 -549
- toil/fileStores/nonCachingFileStore.py +46 -22
- toil/job.py +182 -101
- toil/jobStores/abstractJobStore.py +161 -95
- toil/jobStores/aws/jobStore.py +23 -9
- toil/jobStores/aws/utils.py +6 -6
- toil/jobStores/fileJobStore.py +116 -18
- toil/jobStores/googleJobStore.py +16 -7
- toil/jobStores/utils.py +5 -6
- toil/leader.py +87 -56
- toil/lib/accelerators.py +10 -5
- toil/lib/aws/__init__.py +3 -14
- toil/lib/aws/ami.py +22 -9
- toil/lib/aws/iam.py +21 -13
- toil/lib/aws/session.py +2 -16
- toil/lib/aws/utils.py +4 -5
- toil/lib/compatibility.py +1 -1
- toil/lib/conversions.py +26 -3
- toil/lib/docker.py +22 -23
- toil/lib/ec2.py +10 -6
- toil/lib/ec2nodes.py +106 -100
- toil/lib/encryption/_nacl.py +2 -1
- toil/lib/generatedEC2Lists.py +325 -18
- toil/lib/io.py +49 -2
- toil/lib/misc.py +1 -1
- toil/lib/resources.py +9 -2
- toil/lib/threading.py +101 -38
- toil/options/common.py +736 -0
- toil/options/cwl.py +336 -0
- toil/options/wdl.py +37 -0
- toil/provisioners/abstractProvisioner.py +9 -4
- toil/provisioners/aws/__init__.py +3 -6
- toil/provisioners/aws/awsProvisioner.py +6 -0
- toil/provisioners/clusterScaler.py +3 -2
- toil/provisioners/gceProvisioner.py +2 -2
- toil/realtimeLogger.py +2 -1
- toil/resource.py +24 -18
- toil/server/app.py +2 -3
- toil/server/cli/wes_cwl_runner.py +4 -4
- toil/server/utils.py +1 -1
- toil/server/wes/abstract_backend.py +3 -2
- toil/server/wes/amazon_wes_utils.py +5 -4
- toil/server/wes/tasks.py +2 -3
- toil/server/wes/toil_backend.py +2 -10
- toil/server/wsgi_app.py +2 -0
- toil/serviceManager.py +12 -10
- toil/statsAndLogging.py +41 -9
- toil/test/__init__.py +29 -54
- toil/test/batchSystems/batchSystemTest.py +11 -111
- toil/test/batchSystems/test_slurm.py +24 -8
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +438 -223
- toil/test/cwl/glob_dir.cwl +15 -0
- toil/test/cwl/preemptible.cwl +21 -0
- toil/test/cwl/preemptible_expression.cwl +28 -0
- toil/test/cwl/revsort.cwl +1 -1
- toil/test/cwl/revsort2.cwl +1 -1
- toil/test/docs/scriptsTest.py +2 -3
- toil/test/jobStores/jobStoreTest.py +34 -21
- toil/test/lib/aws/test_iam.py +4 -14
- toil/test/lib/aws/test_utils.py +0 -3
- toil/test/lib/dockerTest.py +4 -4
- toil/test/lib/test_ec2.py +12 -17
- toil/test/mesos/helloWorld.py +4 -5
- toil/test/mesos/stress.py +1 -1
- toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
- toil/test/provisioners/clusterScalerTest.py +6 -4
- toil/test/provisioners/clusterTest.py +23 -11
- toil/test/provisioners/gceProvisionerTest.py +0 -6
- toil/test/provisioners/restartScript.py +3 -2
- toil/test/server/serverTest.py +1 -1
- toil/test/sort/restart_sort.py +2 -1
- toil/test/sort/sort.py +2 -1
- toil/test/sort/sortTest.py +2 -13
- toil/test/src/autoDeploymentTest.py +45 -45
- toil/test/src/busTest.py +5 -5
- toil/test/src/checkpointTest.py +2 -2
- toil/test/src/deferredFunctionTest.py +1 -1
- toil/test/src/fileStoreTest.py +32 -16
- toil/test/src/helloWorldTest.py +1 -1
- toil/test/src/importExportFileTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +2 -1
- toil/test/src/jobServiceTest.py +1 -1
- toil/test/src/jobTest.py +18 -18
- toil/test/src/miscTests.py +5 -3
- toil/test/src/promisedRequirementTest.py +3 -3
- toil/test/src/realtimeLoggerTest.py +1 -1
- toil/test/src/resourceTest.py +2 -2
- toil/test/src/restartDAGTest.py +1 -1
- toil/test/src/resumabilityTest.py +36 -2
- toil/test/src/retainTempDirTest.py +1 -1
- toil/test/src/systemTest.py +2 -2
- toil/test/src/toilContextManagerTest.py +2 -2
- toil/test/src/userDefinedJobArgTypeTest.py +1 -1
- toil/test/utils/toilDebugTest.py +98 -32
- toil/test/utils/toilKillTest.py +2 -2
- toil/test/utils/utilsTest.py +23 -3
- toil/test/wdl/wdltoil_test.py +223 -45
- toil/toilState.py +7 -6
- toil/utils/toilClean.py +1 -1
- toil/utils/toilConfig.py +36 -0
- toil/utils/toilDebugFile.py +60 -33
- toil/utils/toilDebugJob.py +39 -12
- toil/utils/toilDestroyCluster.py +1 -1
- toil/utils/toilKill.py +1 -1
- toil/utils/toilLaunchCluster.py +13 -2
- toil/utils/toilMain.py +3 -2
- toil/utils/toilRsyncCluster.py +1 -1
- toil/utils/toilSshCluster.py +1 -1
- toil/utils/toilStats.py +445 -305
- toil/utils/toilStatus.py +2 -5
- toil/version.py +10 -10
- toil/wdl/utils.py +2 -122
- toil/wdl/wdltoil.py +1257 -492
- toil/worker.py +55 -46
- toil-6.1.0.dist-info/METADATA +124 -0
- toil-6.1.0.dist-info/RECORD +241 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
- toil/batchSystems/parasol.py +0 -379
- toil/batchSystems/tes.py +0 -459
- toil/test/batchSystems/parasolTestSupport.py +0 -117
- toil/test/wdl/builtinTest.py +0 -506
- toil/test/wdl/toilwdlTest.py +0 -522
- toil/wdl/toilwdl.py +0 -141
- toil/wdl/versions/dev.py +0 -107
- toil/wdl/versions/draft2.py +0 -980
- toil/wdl/versions/v1.py +0 -794
- toil/wdl/wdl_analysis.py +0 -116
- toil/wdl/wdl_functions.py +0 -997
- toil/wdl/wdl_synthesis.py +0 -1011
- toil/wdl/wdl_types.py +0 -243
- toil-5.12.0.dist-info/METADATA +0 -118
- toil-5.12.0.dist-info/RECORD +0 -244
- /toil/{wdl/versions → options}/__init__.py +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0

{toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt
@@ -6,5 +6,4 @@ cwltoil = toil.cwl.cwltoil:cwltoil_was_removed [cwl]
 toil = toil.utils.toilMain:main
 toil-cwl-runner = toil.cwl.cwltoil:main [cwl]
 toil-wdl-runner = toil.wdl.wdltoil:main [wdl]
-toil-wdl-runner-old = toil.wdl.toilwdl:main [wdl]
 toil-wes-cwl-runner = toil.server.cli.wes_cwl_runner:main [server]
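
The only entry-point change is the removal of the toil-wdl-runner-old console script, matching the deletion of the legacy WDL compiler (toil/wdl/toilwdl.py and the toil/wdl/versions modules) in this release. A minimal sketch for confirming the change against an installed wheel, using only the standard library; the assertions are illustrative, not part of the package:

# Minimal sketch: inspect the console scripts registered by the installed toil wheel.
from importlib.metadata import entry_points

eps = entry_points()
# entry_points() returns a selectable object on Python 3.10+ and a dict before that.
if hasattr(eps, "select"):
    names = {ep.name for ep in eps.select(group="console_scripts")}
else:
    names = {ep.name for ep in eps.get("console_scripts", [])}

assert "toil-wdl-runner" in names          # still provided by toil.wdl.wdltoil:main
assert "toil-wdl-runner-old" not in names  # removed along with toil.wdl.toilwdl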

toil/batchSystems/parasol.py DELETED
@@ -1,379 +0,0 @@
-# Copyright (C) 2015-2021 Regents of the University of California
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import re
-import subprocess
-import tempfile
-import time
-from argparse import ArgumentParser, _ArgumentGroup
-from queue import Empty, Queue
-from shutil import which
-from threading import Thread
-from typing import Dict, Optional, Union
-
-from toil.batchSystems.abstractBatchSystem import (BatchSystemSupport,
-                                                   UpdatedBatchJobInfo)
-from toil.batchSystems.options import OptionSetter
-from toil.common import SYS_MAX_SIZE, Toil
-from toil.lib.iterables import concat
-from toil.test import get_temp_file
-
-logger = logging.getLogger(__name__)
-
-
-class ParasolBatchSystem(BatchSystemSupport):
-    """
-    The interface for Parasol.
-    """
-
-    @classmethod
-    def supportsWorkerCleanup(cls):
-        return False
-
-    @classmethod
-    def supportsAutoDeployment(cls):
-        return False
-
-    def __init__(self, config, maxCores, maxMemory, maxDisk):
-        super().__init__(config, maxCores, maxMemory, maxDisk)
-        if maxMemory != SYS_MAX_SIZE:
-            logger.warning('The Parasol batch system does not support maxMemory.')
-        # Keep the name of the results file for the pstat2 command..
-        command = config.parasolCommand
-        if os.path.sep not in command:
-            try:
-                command = which(command)
-            except StopIteration:
-                raise RuntimeError("Can't find %s on PATH." % command)
-        logger.debug('Using Parasol at %s', command)
-        self.parasolCommand = command
-        jobStoreType, path = Toil.parseLocator(config.jobStore)
-        if jobStoreType != 'file':
-            raise RuntimeError("The parasol batch system doesn't currently work with any "
-                               "jobStore type except file jobStores.")
-        self.parasolResultsDir = tempfile.mkdtemp(dir=os.path.abspath(path))
-        logger.debug("Using parasol results dir: %s", self.parasolResultsDir)
-
-        # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch
-        # have the same cpu and memory requirements. The keys to this dictionary are the (cpu,
-        # memory) tuples for each batch. A new batch is created whenever a job has a new unique
-        # combination of cpu and memory requirements.
-        self.resultsFiles = dict()
-        self.maxBatches = config.parasolMaxBatches
-
-        # Allows the worker process to send back the IDs of jobs that have finished, so the batch
-        # system can decrease its used cpus counter
-        self.cpuUsageQueue = Queue()
-
-        # Also stores finished job IDs, but is read by getUpdatedJobIDs().
-        self.updatedJobsQueue = Queue()
-
-        # Use this to stop the worker when shutting down
-        self.running = True
-
-        self.worker = Thread(target=self.updatedJobWorker, args=())
-        self.worker.start()
-        self.usedCpus = 0
-        self.jobIDsToCpu = {}
-
-        # Set of jobs that have been issued but aren't known to have finished or been killed yet.
-        # Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are
-        # removed in killBatchJobs.
-        self.runningJobs = set()
-
-    def _runParasol(self, command, autoRetry=True):
-        """
-        Issue a parasol command using popen to capture the output.
-
-        If the command fails then it will try pinging parasol until it gets a response.
-        When it gets a response it will recursively call the issue parasol command,
-        repeating this pattern for a maximum of N times. The final exit value will reflect this.
-        """
-        command = list(concat(self.parasolCommand, command))
-        while True:
-            logger.debug('Running %r', command)
-            process = subprocess.Popen(command,
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       bufsize=-1)
-            stdout, stderr = process.communicate()
-            status = process.wait()
-            for line in stderr.decode('utf-8').split('\n'):
-                if line: logger.warning(line)
-            if status == 0:
-                return 0, stdout.decode('utf-8').split('\n')
-            message = 'Command %r failed with exit status %i' % (command, status)
-            if autoRetry:
-                logger.warning(message)
-            else:
-                logger.error(message)
-                return status, None
-            logger.warning('Waiting for a 10s, before trying again')
-            time.sleep(10)
-
-    parasolOutputPattern = re.compile("your job ([0-9]+).*")
-
-    def issueBatchJob(self, jobDesc, job_environment: Optional[Dict[str, str]] = None):
-        """Issue parasol with job commands."""
-        self.check_resource_request(jobDesc)
-
-        MiB = 1 << 20
-        truncatedMemory = jobDesc.memory // MiB * MiB
-        # Look for a batch for jobs with these resource requirements, with
-        # the memory rounded down to the nearest megabyte. Rounding down
-        # meams the new job can't ever decrease the memory requirements
-        # of jobs already in the batch.
-        if len(self.resultsFiles) >= self.maxBatches:
-            raise RuntimeError('Number of batches reached limit of %i' % self.maxBatches)
-        try:
-            results = self.resultsFiles[(truncatedMemory, jobDesc.cores)]
-        except KeyError:
-            results = get_temp_file(rootDir=self.parasolResultsDir)
-            self.resultsFiles[(truncatedMemory, jobDesc.cores)] = results
-
-        # Prefix the command with environment overrides, optionally looking them up from the
-        # current environment if the value is None
-        command = ' '.join(concat('env', self.__environment(job_environment), jobDesc.command))
-        parasolCommand = ['-verbose',
-                          '-ram=%i' % jobDesc.memory,
-                          '-cpu=%i' % jobDesc.cores,
-                          '-results=' + results,
-                          'add', 'job', command]
-        # Deal with the cpus
-        self.usedCpus += jobDesc.cores
-        while True:  # Process finished results with no wait
-            try:
-                jobID = self.cpuUsageQueue.get_nowait()
-            except Empty:
-                break
-            if jobID in list(self.jobIDsToCpu.keys()):
-                self.usedCpus -= self.jobIDsToCpu.pop(jobID)
-            assert self.usedCpus >= 0
-        while self.usedCpus > self.maxCores:  # If we are still waiting
-            jobID = self.cpuUsageQueue.get()
-            if jobID in list(self.jobIDsToCpu.keys()):
-                self.usedCpus -= self.jobIDsToCpu.pop(jobID)
-            assert self.usedCpus >= 0
-        # Now keep going
-        while True:
-            line = self._runParasol(parasolCommand)[1][0]
-            match = self.parasolOutputPattern.match(line)
-            if match is None:
-                # This is because parasol add job will return success, even if the job was not
-                # properly issued!
-                logger.debug('We failed to properly add the job, we will try again after a 5s.')
-                time.sleep(5)
-            else:
-                jobID = int(match.group(1))
-                self.jobIDsToCpu[jobID] = jobDesc.cores
-                self.runningJobs.add(jobID)
-                logger.debug(f"Got the parasol job id: {jobID} from line: {line}")
-                return jobID
-
-    def setEnv(self, name, value=None):
-        if value and ' ' in value:
-            raise ValueError('Parasol does not support spaces in environment variable values.')
-        return super().setEnv(name, value)
-
-    def __environment(self, job_environment: Optional[Dict[str, str]] = None):
-        environment = self.environment.copy()
-        if job_environment:
-            environment.update(job_environment)
-
-        return (k + '=' + (os.environ[k] if v is None else v) for k, v in list(environment.items()))
-
-    def killBatchJobs(self, jobIDs):
-        """Kills the given jobs, represented as Job ids, then checks they are dead by checking
-        they are not in the list of issued jobs.
-        """
-        while True:
-            for jobID in jobIDs:
-                if jobID in self.runningJobs:
-                    self.runningJobs.remove(jobID)
-                exitValue = self._runParasol(['remove', 'job', str(jobID)],
-                                             autoRetry=False)[0]
-                logger.debug("Tried to remove jobID: %i, with exit value: %i" % (jobID, exitValue))
-            runningJobs = self.getIssuedBatchJobIDs()
-            if set(jobIDs).difference(set(runningJobs)) == set(jobIDs):
-                break
-            logger.warning('Tried to kill some jobs, but something happened and they are still '
-                           'going, will try again in 5s.')
-            time.sleep(5)
-        # Update the CPU usage, because killed jobs aren't written to the results file.
-        for jobID in jobIDs:
-            if jobID in list(self.jobIDsToCpu.keys()):
-                self.usedCpus -= self.jobIDsToCpu.pop(jobID)
-
-    runningPattern = re.compile(r'r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+')
-
-    def getJobIDsForResultsFile(self, resultsFile):
-        """Get all queued and running jobs for a results file."""
-        jobIDs = []
-        for line in self._runParasol(['-extended', 'list', 'jobs'])[1]:
-            fields = line.strip().split()
-            if len(fields) == 0 or fields[-1] != resultsFile:
-                continue
-            jobID = fields[0]
-            jobIDs.append(int(jobID))
-        return set(jobIDs)
-
-    def getIssuedBatchJobIDs(self):
-        """
-        Gets the list of jobs issued to parasol in all results files, but not including jobs
-        created by other users.
-        """
-        issuedJobs = set()
-        for resultsFile in self.resultsFiles.values():
-            issuedJobs.update(self.getJobIDsForResultsFile(resultsFile))
-
-        return list(issuedJobs)
-
-    def getRunningBatchJobIDs(self):
-        """Returns map of running jobIDs and the time they have been running."""
-        # Example lines..
-        # r 5410186 benedictpaten worker 1247029663 localhost
-        # r 5410324 benedictpaten worker 1247030076 localhost
-        runningJobs = {}
-        issuedJobs = self.getIssuedBatchJobIDs()
-        for line in self._runParasol(['pstat2'])[1]:
-            if line != '':
-                match = self.runningPattern.match(line)
-                if match is not None:
-                    jobID = int(match.group(1))
-                    startTime = int(match.group(2))
-                    if jobID in issuedJobs:  # It's one of our jobs
-                        runningJobs[jobID] = time.time() - startTime
-        return runningJobs
-
-    def getUpdatedBatchJob(self, maxWait):
-        while True:
-            try:
-                item = self.updatedJobsQueue.get(timeout=maxWait)
-            except Empty:
-                return None
-            try:
-                self.runningJobs.remove(item.jobID)
-            except KeyError:
-                # We tried to kill this job, but it ended by itself instead, so skip it.
-                pass
-            else:
-                return item
-
-    def updatedJobWorker(self):
-        """
-        We use the parasol results to update the status of jobs, adding them
-        to the list of updated jobs.
-
-        Results have the following structure.. (thanks Mark D!)
-
-        ==================== =============================================
-        int status;          Job status - wait() return format. 0 is good.
-        char host;           Machine job ran on.
-        char jobId;          Job queuing system job ID
-        char exe;            Job executable file (no path)
-        int usrTicks;        'User' CPU time in ticks.
-        int sysTicks;        'System' CPU time in ticks.
-        unsigned submitTime; Job submission time in seconds since 1/1/1970
-        unsigned startTime;  Job start time in seconds since 1/1/1970
-        unsigned endTime;    Job end time in seconds since 1/1/1970
-        char user;           User who ran job
-        char errFile;        Location of stderr file on host
-        ==================== =============================================
-
-        Plus you finally have the command name.
-        """
-        resultsFiles = set()
-        resultsFileHandles = []
-        try:
-            while self.running:
-                # Look for any new results files that have been created, and open them
-                newResultsFiles = set(os.listdir(self.parasolResultsDir)).difference(resultsFiles)
-                for newFile in newResultsFiles:
-                    newFilePath = os.path.join(self.parasolResultsDir, newFile)
-                    resultsFileHandles.append(open(newFilePath))
-                    resultsFiles.add(newFile)
-                for fileHandle in resultsFileHandles:
-                    while self.running:
-                        line = fileHandle.readline()
-                        if not line:
-                            break
-                        assert line[-1] == '\n'
-                        (status, host, jobId, exe, usrTicks, sysTicks, submitTime, startTime,
-                         endTime, user, errFile, command) = line[:-1].split(None, 11)
-                        status = int(status)
-                        jobId = int(jobId)
-                        if os.WIFEXITED(status):
-                            status = os.WEXITSTATUS(status)
-                        else:
-                            status = -status
-                        self.cpuUsageQueue.put(jobId)
-                        startTime = int(startTime)
-                        endTime = int(endTime)
-                        if endTime == startTime:
-                            # Both, start and end time is an integer so to get sub-second
-                            # accuracy we use the ticks reported by Parasol as an approximation.
-                            # This isn't documented but what Parasol calls "ticks" is actually a
-                            # hundredth of a second. Parasol does the unit conversion early on
-                            # after a job finished. Search paraNode.c for ticksToHundreths. We
-                            # also cheat a little by always reporting at least one hundredth of a
-                            # second.
-                            usrTicks = int(usrTicks)
-                            sysTicks = int(sysTicks)
-                            wallTime = float(max(1, usrTicks + sysTicks)) * 0.01
-                        else:
-                            wallTime = float(endTime - startTime)
-                        self.updatedJobsQueue.put(UpdatedBatchJobInfo(jobID=jobId, exitStatus=status, wallTime=wallTime, exitReason=None))
-                time.sleep(1)
-        except:
-            logger.warning("Error occurred while parsing parasol results files.")
-            raise
-        finally:
-            for fileHandle in resultsFileHandles:
-                fileHandle.close()
-
-    def shutdown(self) -> None:
-        self.killBatchJobs(self.getIssuedBatchJobIDs())  # cleanup jobs
-        for results in self.resultsFiles.values():
-            exitValue = self._runParasol(['-results=' + results, 'clear', 'sick'],
-                                         autoRetry=False)[0]
-            if exitValue is not None:
-                logger.warning("Could not clear sick status of the parasol batch %s" % results)
-            exitValue = self._runParasol(['-results=' + results, 'flushResults'],
-                                         autoRetry=False)[0]
-            if exitValue is not None:
-                logger.warning("Could not flush the parasol batch %s" % results)
-        self.running = False
-        logger.debug('Joining worker thread...')
-        self.worker.join()
-        logger.debug('... joined worker thread.')
-        for results in list(self.resultsFiles.values()):
-            os.remove(results)
-        os.rmdir(self.parasolResultsDir)
-
-    @classmethod
-    def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
-        parser.add_argument("--parasolCommand", dest="parasolCommand", default='parasol',
-                            help="The name or path of the parasol program. Will be looked up on PATH "
-                                 "unless it starts with a slash. (default: %(default)s).")
-        parser.add_argument("--parasolMaxBatches", dest="parasolMaxBatches", default=1000,
-                            help="Maximum number of job batches the Parasol batch is allowed to create. One batch is "
-                                 "created for jobs with a a unique set of resource requirements. (default: %(default)s).")
-
-    @classmethod
-    def setOptions(cls, setOption: OptionSetter):
-        from toil.common import iC
-        setOption("parasolCommand", None, None, 'parasol')
-        setOption("parasolMaxBatches", int, iC(1), 10000)