toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -12,10 +12,12 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import logging
|
|
15
|
-
from typing import
|
|
15
|
+
from typing import Optional
|
|
16
16
|
|
|
17
|
-
from toil.batchSystems.abstractBatchSystem import (
|
|
18
|
-
|
|
17
|
+
from toil.batchSystems.abstractBatchSystem import (
|
|
18
|
+
BatchSystemSupport,
|
|
19
|
+
UpdatedBatchJobInfo,
|
|
20
|
+
)
|
|
19
21
|
from toil.batchSystems.singleMachine import SingleMachineBatchSystem
|
|
20
22
|
from toil.common import Config
|
|
21
23
|
from toil.job import JobDescription
|
|
@@ -27,22 +29,25 @@ logger = logging.getLogger(__name__)
|
|
|
27
29
|
class BatchSystemLocalSupport(BatchSystemSupport):
|
|
28
30
|
"""Adds a local queue for helper jobs, useful for CWL & others."""
|
|
29
31
|
|
|
30
|
-
def __init__(
|
|
32
|
+
def __init__(
|
|
33
|
+
self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
|
|
34
|
+
) -> None:
|
|
31
35
|
super().__init__(config, maxCores, maxMemory, maxDisk)
|
|
32
|
-
max_local_jobs =
|
|
36
|
+
max_local_jobs = (
|
|
37
|
+
config.max_local_jobs if config.max_local_jobs is not None else cpu_count()
|
|
38
|
+
)
|
|
33
39
|
self.localBatch: SingleMachineBatchSystem = SingleMachineBatchSystem(
|
|
34
40
|
config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
|
|
35
41
|
)
|
|
36
42
|
|
|
37
|
-
def handleLocalJob(self, jobDesc: JobDescription) -> Optional[int]:
|
|
43
|
+
def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int]:
|
|
38
44
|
"""
|
|
39
|
-
To be called by
|
|
45
|
+
To be called by issueBatchJob.
|
|
40
46
|
|
|
41
47
|
Returns the jobID if the jobDesc has been submitted to the local queue,
|
|
42
48
|
otherwise returns None
|
|
43
49
|
"""
|
|
44
|
-
if
|
|
45
|
-
and jobDesc.local):
|
|
50
|
+
if not self.config.run_local_jobs_on_workers and jobDesc.local:
|
|
46
51
|
# Since singleMachine.py doesn't typecheck yet and MyPy is ignoring
|
|
47
52
|
# it, it will raise errors here unless we add type annotations to
|
|
48
53
|
# everything we get back from it. The easiest way to do that seems
|
|
@@ -50,12 +55,12 @@ class BatchSystemLocalSupport(BatchSystemSupport):
|
|
|
50
55
|
# somehow doesn't error whereas just returning the value complains
|
|
51
56
|
# we're returning an Any. TODO: When singleMachine.py typechecks,
|
|
52
57
|
# remove all these extra variables.
|
|
53
|
-
local_id: int = self.localBatch.issueBatchJob(jobDesc)
|
|
58
|
+
local_id: int = self.localBatch.issueBatchJob(command, jobDesc)
|
|
54
59
|
return local_id
|
|
55
60
|
else:
|
|
56
61
|
return None
|
|
57
62
|
|
|
58
|
-
def killLocalJobs(self, jobIDs:
|
|
63
|
+
def killLocalJobs(self, jobIDs: list[int]) -> None:
|
|
59
64
|
"""
|
|
60
65
|
Will kill all local jobs that match the provided jobIDs.
|
|
61
66
|
|
|
@@ -63,14 +68,14 @@ class BatchSystemLocalSupport(BatchSystemSupport):
|
|
|
63
68
|
"""
|
|
64
69
|
self.localBatch.killBatchJobs(jobIDs)
|
|
65
70
|
|
|
66
|
-
def getIssuedLocalJobIDs(self) ->
|
|
71
|
+
def getIssuedLocalJobIDs(self) -> list[int]:
|
|
67
72
|
"""To be called by getIssuedBatchJobIDs."""
|
|
68
|
-
local_ids:
|
|
73
|
+
local_ids: list[int] = self.localBatch.getIssuedBatchJobIDs()
|
|
69
74
|
return local_ids
|
|
70
75
|
|
|
71
|
-
def getRunningLocalJobIDs(self) ->
|
|
76
|
+
def getRunningLocalJobIDs(self) -> dict[int, float]:
|
|
72
77
|
"""To be called by getRunningBatchJobIDs()."""
|
|
73
|
-
local_running:
|
|
78
|
+
local_running: dict[int, float] = self.localBatch.getRunningBatchJobIDs()
|
|
74
79
|
return local_running
|
|
75
80
|
|
|
76
81
|
def getUpdatedLocalJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]:
|
toil/batchSystems/lsf.py
CHANGED
|
@@ -25,18 +25,24 @@ import re
|
|
|
25
25
|
import subprocess
|
|
26
26
|
from datetime import datetime
|
|
27
27
|
from random import randint
|
|
28
|
-
from typing import
|
|
28
|
+
from typing import Optional, Union
|
|
29
29
|
|
|
30
30
|
from dateutil.parser import parse
|
|
31
31
|
from dateutil.tz import tzlocal
|
|
32
32
|
|
|
33
|
-
from toil.batchSystems.abstractBatchSystem import
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
33
|
+
from toil.batchSystems.abstractBatchSystem import (
|
|
34
|
+
EXIT_STATUS_UNAVAILABLE_VALUE,
|
|
35
|
+
BatchJobExitReason,
|
|
36
|
+
)
|
|
37
|
+
from toil.batchSystems.abstractGridEngineBatchSystem import (
|
|
38
|
+
AbstractGridEngineBatchSystem,
|
|
39
|
+
)
|
|
40
|
+
from toil.batchSystems.lsfHelper import (
|
|
41
|
+
check_lsf_json_output_supported,
|
|
42
|
+
parse_mem_and_cmd_from_output,
|
|
43
|
+
parse_memory,
|
|
44
|
+
per_core_reservation,
|
|
45
|
+
)
|
|
40
46
|
from toil.lib.misc import call_command
|
|
41
47
|
|
|
42
48
|
logger = logging.getLogger(__name__)
|
|
@@ -44,53 +50,64 @@ logger = logging.getLogger(__name__)
|
|
|
44
50
|
|
|
45
51
|
class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
46
52
|
|
|
47
|
-
class
|
|
48
|
-
"""LSF specific
|
|
53
|
+
class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
|
|
54
|
+
"""LSF specific GridEngineThread methods."""
|
|
49
55
|
|
|
50
56
|
def getRunningJobIDs(self):
|
|
51
57
|
times = {}
|
|
52
58
|
with self.runningJobsLock:
|
|
53
|
-
currentjobs = {str(self.batchJobIDs[x][0]): x for x in
|
|
54
|
-
self.runningJobs}
|
|
59
|
+
currentjobs = {str(self.batchJobIDs[x][0]): x for x in self.runningJobs}
|
|
55
60
|
|
|
56
61
|
if check_lsf_json_output_supported:
|
|
57
|
-
stdout = call_command(["bjobs","-json","-o", "jobid stat start_time"])
|
|
62
|
+
stdout = call_command(["bjobs", "-json", "-o", "jobid stat start_time"])
|
|
58
63
|
|
|
59
64
|
bjobs_records = self.parseBjobs(stdout)
|
|
60
65
|
if bjobs_records:
|
|
61
66
|
for single_item in bjobs_records:
|
|
62
|
-
if
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
67
|
+
if (
|
|
68
|
+
single_item["STAT"] == "RUN"
|
|
69
|
+
and single_item["JOBID"] in currentjobs
|
|
70
|
+
):
|
|
71
|
+
jobstart = parse(
|
|
72
|
+
single_item["START_TIME"],
|
|
73
|
+
default=datetime.now(tzlocal()),
|
|
74
|
+
)
|
|
75
|
+
times[currentjobs[single_item["JOBID"]]] = (
|
|
76
|
+
datetime.now(tzlocal()) - jobstart
|
|
77
|
+
)
|
|
66
78
|
else:
|
|
67
79
|
times = self.fallbackRunningJobIDs(currentjobs)
|
|
68
80
|
return times
|
|
69
81
|
|
|
70
82
|
def fallbackRunningJobIDs(self, currentjobs):
|
|
71
83
|
times = {}
|
|
72
|
-
stdout = call_command(
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
84
|
+
stdout = call_command(
|
|
85
|
+
["bjobs", "-o", "jobid stat start_time delimiter='|'"]
|
|
86
|
+
)
|
|
87
|
+
for curline in stdout.split("\n"):
|
|
88
|
+
items = curline.strip().split("|")
|
|
89
|
+
if items[0] in currentjobs and items[1] == "RUN":
|
|
76
90
|
jobstart = parse(items[2], default=datetime.now(tzlocal()))
|
|
77
|
-
times[currentjobs[items[0]]] = datetime.now(tzlocal())
|
|
78
|
-
- jobstart
|
|
91
|
+
times[currentjobs[items[0]]] = datetime.now(tzlocal()) - jobstart
|
|
79
92
|
return times
|
|
80
93
|
|
|
81
94
|
def killJob(self, jobID):
|
|
82
|
-
call_command([
|
|
83
|
-
|
|
84
|
-
def prepareSubmission(
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
95
|
+
call_command(["bkill", self.getBatchSystemID(jobID)])
|
|
96
|
+
|
|
97
|
+
def prepareSubmission(
|
|
98
|
+
self,
|
|
99
|
+
cpu: int,
|
|
100
|
+
memory: int,
|
|
101
|
+
jobID: int,
|
|
102
|
+
command: str,
|
|
103
|
+
jobName: str,
|
|
104
|
+
job_environment: Optional[dict[str, str]] = None,
|
|
105
|
+
gpus: Optional[int] = None,
|
|
106
|
+
):
|
|
107
|
+
return (
|
|
108
|
+
self.prepareBsub(cpu, memory, jobID) + [command],
|
|
109
|
+
job_environment,
|
|
110
|
+
) # pass job_environment to .submitJob()
|
|
94
111
|
|
|
95
112
|
def submitJob(self, subLine):
|
|
96
113
|
subLine, job_environment = subLine
|
|
@@ -102,7 +119,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
102
119
|
stdout = call_command(subLine, env=combinedEnv)
|
|
103
120
|
# Example success: Job <39605914> is submitted to default queue <general>.
|
|
104
121
|
# Example fail: Service class does not exist. Job not submitted.
|
|
105
|
-
result_search = re.search(
|
|
122
|
+
result_search = re.search("Job <(.*)> is submitted", stdout)
|
|
106
123
|
|
|
107
124
|
if result_search:
|
|
108
125
|
result = int(result_search.group(1))
|
|
@@ -138,7 +155,11 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
138
155
|
logger.debug("Getting coalesced job exit codes via bjobs")
|
|
139
156
|
bjobs_records = self.parseBjobs(
|
|
140
157
|
subprocess.run(
|
|
141
|
-
args,
|
|
158
|
+
args,
|
|
159
|
+
check=False,
|
|
160
|
+
stdout=subprocess.PIPE,
|
|
161
|
+
stderr=subprocess.STDOUT,
|
|
162
|
+
encoding="utf-8",
|
|
142
163
|
).stdout
|
|
143
164
|
)
|
|
144
165
|
if bjobs_records:
|
|
@@ -161,23 +182,31 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
161
182
|
status_resonse.append(None)
|
|
162
183
|
return status_resonse
|
|
163
184
|
|
|
164
|
-
def getJobExitCode(
|
|
185
|
+
def getJobExitCode(
|
|
186
|
+
self, lsfJobID
|
|
187
|
+
) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
|
|
165
188
|
# the task is set as part of the job ID if using getBatchSystemID()
|
|
166
189
|
if "NOT_SUBMITTED" in lsfJobID:
|
|
167
190
|
logger.error("bjobs detected job failed to submit")
|
|
168
191
|
return 1
|
|
169
192
|
|
|
170
193
|
job, task = (lsfJobID, None)
|
|
171
|
-
if
|
|
172
|
-
job, task = lsfJobID.split(
|
|
194
|
+
if "." in lsfJobID:
|
|
195
|
+
job, task = lsfJobID.split(".", 1)
|
|
173
196
|
|
|
174
197
|
self.parseMaxMem(job)
|
|
175
198
|
# first try bjobs to find out job state
|
|
176
199
|
if check_lsf_json_output_supported:
|
|
177
|
-
args = [
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
200
|
+
args = [
|
|
201
|
+
"bjobs",
|
|
202
|
+
"-json",
|
|
203
|
+
"-o",
|
|
204
|
+
"user exit_code stat exit_reason pend_reason",
|
|
205
|
+
str(job),
|
|
206
|
+
]
|
|
207
|
+
logger.debug(
|
|
208
|
+
"Checking job exit code for job via bjobs: " "{}".format(job)
|
|
209
|
+
)
|
|
181
210
|
stdout = call_command(args)
|
|
182
211
|
bjobs_records = self.parseBjobs(stdout)
|
|
183
212
|
if bjobs_records:
|
|
@@ -186,7 +215,9 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
186
215
|
|
|
187
216
|
return self.fallbackGetJobExitCode(job)
|
|
188
217
|
|
|
189
|
-
def parse_bjobs_record(
|
|
218
|
+
def parse_bjobs_record(
|
|
219
|
+
self, bjobs_record: dict, job: int
|
|
220
|
+
) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
|
|
190
221
|
"""
|
|
191
222
|
Helper functions for getJobExitCode and to parse the bjobs status record
|
|
192
223
|
"""
|
|
@@ -202,7 +233,8 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
202
233
|
pending_info = "\n" + bjobs_record["PEND_REASON"]
|
|
203
234
|
logger.debug(
|
|
204
235
|
"bjobs detected job pending with: %s\nfor job: %s",
|
|
205
|
-
pending_info,
|
|
236
|
+
pending_info,
|
|
237
|
+
job,
|
|
206
238
|
)
|
|
207
239
|
return None
|
|
208
240
|
if process_status == "EXIT":
|
|
@@ -221,10 +253,18 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
221
253
|
exit_info += f"\nexit reason: {exit_reason}"
|
|
222
254
|
logger.error(
|
|
223
255
|
"bjobs detected job failed with: %s\nfor job: %s",
|
|
224
|
-
exit_info,
|
|
256
|
+
exit_info,
|
|
257
|
+
job,
|
|
225
258
|
)
|
|
226
259
|
if "TERM_MEMLIMIT" in exit_reason:
|
|
227
|
-
return
|
|
260
|
+
return (
|
|
261
|
+
(
|
|
262
|
+
exit_code
|
|
263
|
+
if exit_code != 0
|
|
264
|
+
else EXIT_STATUS_UNAVAILABLE_VALUE
|
|
265
|
+
),
|
|
266
|
+
BatchJobExitReason.MEMLIMIT,
|
|
267
|
+
)
|
|
228
268
|
return exit_code
|
|
229
269
|
if process_status == "RUN":
|
|
230
270
|
logger.debug(
|
|
@@ -237,46 +277,53 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
237
277
|
|
|
238
278
|
return self.getJobExitCodeBACCT(job)
|
|
239
279
|
|
|
240
|
-
def getJobExitCodeBACCT(
|
|
280
|
+
def getJobExitCodeBACCT(
|
|
281
|
+
self, job
|
|
282
|
+
) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
|
|
241
283
|
# if not found in bjobs, then try bacct (slower than bjobs)
|
|
242
|
-
logger.debug("bjobs failed to detect job - trying bacct: "
|
|
243
|
-
"{}".format(job))
|
|
284
|
+
logger.debug("bjobs failed to detect job - trying bacct: " "{}".format(job))
|
|
244
285
|
|
|
245
286
|
args = ["bacct", "-l", str(job)]
|
|
246
287
|
stdout = call_command(args)
|
|
247
|
-
process_output = stdout.split(
|
|
288
|
+
process_output = stdout.split("\n")
|
|
248
289
|
for line in process_output:
|
|
249
290
|
if line.find("Completed <done>") > -1 or line.find("<DONE>") > -1:
|
|
250
|
-
logger.debug("Detected job completed for job: "
|
|
251
|
-
"{}".format(job))
|
|
291
|
+
logger.debug("Detected job completed for job: " "{}".format(job))
|
|
252
292
|
return 0
|
|
253
293
|
elif line.find("Completed <exit>") > -1 or line.find("<EXIT>") > -1:
|
|
254
|
-
logger.error("Detected job failed for job: "
|
|
255
|
-
"{}".format(job))
|
|
294
|
+
logger.error("Detected job failed for job: " "{}".format(job))
|
|
256
295
|
return 1
|
|
257
|
-
logger.debug(
|
|
258
|
-
|
|
296
|
+
logger.debug(
|
|
297
|
+
"Can't determine exit code for job or job still "
|
|
298
|
+
"running: {}".format(job)
|
|
299
|
+
)
|
|
259
300
|
return None
|
|
260
301
|
|
|
261
|
-
def fallbackGetJobExitCode(
|
|
302
|
+
def fallbackGetJobExitCode(
|
|
303
|
+
self, job
|
|
304
|
+
) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
|
|
262
305
|
args = ["bjobs", "-l", str(job)]
|
|
263
306
|
logger.debug(f"Checking job exit code for job via bjobs (fallback): {job}")
|
|
264
307
|
stdout = call_command(args)
|
|
265
308
|
output = stdout.replace("\n ", "")
|
|
266
|
-
process_output = output.split(
|
|
309
|
+
process_output = output.split("\n")
|
|
267
310
|
started = 0
|
|
268
311
|
for line in process_output:
|
|
269
312
|
if "Done successfully" in line or "Status <DONE>" in line:
|
|
270
313
|
logger.debug(f"bjobs detected job completed for job: {job}")
|
|
271
314
|
return 0
|
|
272
315
|
elif "New job is waiting for scheduling" in line:
|
|
273
|
-
logger.debug(
|
|
316
|
+
logger.debug(
|
|
317
|
+
f"bjobs detected job pending scheduling for job: {job}"
|
|
318
|
+
)
|
|
274
319
|
return None
|
|
275
320
|
elif "PENDING REASONS" in line or "Status <PEND>" in line:
|
|
276
321
|
logger.debug(f"bjobs detected job pending for job: {job}")
|
|
277
322
|
return None
|
|
278
323
|
elif "Exited with exit code" in line:
|
|
279
|
-
exit = int(
|
|
324
|
+
exit = int(
|
|
325
|
+
line[line.find("Exited with exit code ") + 22 :].split(".")[0]
|
|
326
|
+
)
|
|
280
327
|
logger.error(f"bjobs detected job exit code {exit} for job {job}")
|
|
281
328
|
return exit
|
|
282
329
|
elif "Completed <exit>" in line:
|
|
@@ -293,7 +340,8 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
293
340
|
"""
|
|
294
341
|
Implementation-specific helper methods
|
|
295
342
|
"""
|
|
296
|
-
|
|
343
|
+
|
|
344
|
+
def prepareBsub(self, cpu: int, mem: int, jobID: int) -> list[str]:
|
|
297
345
|
"""
|
|
298
346
|
Make a bsub commandline to execute.
|
|
299
347
|
|
|
@@ -308,18 +356,15 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
308
356
|
if per_core_reservation() and cpu:
|
|
309
357
|
mem = mem / math.ceil(cpu)
|
|
310
358
|
mem = parse_memory(mem)
|
|
311
|
-
bsubMem = [
|
|
312
|
-
|
|
313
|
-
f'rusage[mem={mem}]',
|
|
314
|
-
'-M', mem]
|
|
315
|
-
bsubCpu = [] if cpu is None else ['-n', str(math.ceil(cpu))]
|
|
359
|
+
bsubMem = ["-R", f"select[mem>{mem}] " f"rusage[mem={mem}]", "-M", mem]
|
|
360
|
+
bsubCpu = [] if cpu is None else ["-n", str(math.ceil(cpu))]
|
|
316
361
|
bsubline = ["bsub", "-cwd", ".", "-J", f"toil_job_{jobID}"]
|
|
317
362
|
bsubline.extend(bsubMem)
|
|
318
363
|
bsubline.extend(bsubCpu)
|
|
319
|
-
stdoutfile: str = self.boss.format_std_out_err_path(jobID,
|
|
320
|
-
stderrfile: str = self.boss.format_std_out_err_path(jobID,
|
|
321
|
-
bsubline.extend([
|
|
322
|
-
lsfArgs = os.getenv(
|
|
364
|
+
stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%J", "out")
|
|
365
|
+
stderrfile: str = self.boss.format_std_out_err_path(jobID, "%J", "err")
|
|
366
|
+
bsubline.extend(["-o", stdoutfile, "-e", stderrfile])
|
|
367
|
+
lsfArgs = os.getenv("TOIL_LSF_ARGS")
|
|
323
368
|
if lsfArgs:
|
|
324
369
|
bsubline.extend(lsfArgs.split())
|
|
325
370
|
return bsubline
|
|
@@ -333,16 +378,16 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
333
378
|
bjobs_dict = None
|
|
334
379
|
bjobs_records = None
|
|
335
380
|
# Handle Cannot connect to LSF. Please wait ... type messages
|
|
336
|
-
dict_start = bjobs_output_str.find(
|
|
337
|
-
dict_end = bjobs_output_str.rfind(
|
|
381
|
+
dict_start = bjobs_output_str.find("{")
|
|
382
|
+
dict_end = bjobs_output_str.rfind("}")
|
|
338
383
|
if dict_start != -1 and dict_end != -1:
|
|
339
|
-
bjobs_output = bjobs_output_str[dict_start:(dict_end+1)]
|
|
384
|
+
bjobs_output = bjobs_output_str[dict_start : (dict_end + 1)]
|
|
340
385
|
try:
|
|
341
386
|
bjobs_dict = json.loads(bjobs_output)
|
|
342
387
|
except json.decoder.JSONDecodeError:
|
|
343
388
|
logger.error(f"Could not parse bjobs output: {bjobs_output_str}")
|
|
344
|
-
if
|
|
345
|
-
bjobs_records = bjobs_dict[
|
|
389
|
+
if "RECORDS" in bjobs_dict:
|
|
390
|
+
bjobs_records = bjobs_dict["RECORDS"]
|
|
346
391
|
if bjobs_records is None:
|
|
347
392
|
logger.error(f"Could not find bjobs output json in: {bjobs_output_str}")
|
|
348
393
|
|
|
@@ -358,16 +403,24 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
358
403
|
output = subprocess.check_output(["bjobs", "-l", str(jobID)], text=True)
|
|
359
404
|
max_mem, command = parse_mem_and_cmd_from_output(output=output)
|
|
360
405
|
if not max_mem:
|
|
361
|
-
logger.warning(
|
|
406
|
+
logger.warning(
|
|
407
|
+
f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {output}"
|
|
408
|
+
)
|
|
362
409
|
return
|
|
363
410
|
|
|
364
411
|
if not command:
|
|
365
|
-
logger.warning(
|
|
412
|
+
logger.warning(
|
|
413
|
+
f"[job ID {jobID}] Cannot Parse Max Memory Due to Missing Command String: {output}"
|
|
414
|
+
)
|
|
366
415
|
else:
|
|
367
|
-
logger.info(
|
|
416
|
+
logger.info(
|
|
417
|
+
f"[job ID {jobID}, Command {command.group(1)}] Max Memory Used: {max_mem.group(1)}"
|
|
418
|
+
)
|
|
368
419
|
return max_mem
|
|
369
420
|
except subprocess.CalledProcessError as e:
|
|
370
|
-
logger.warning(
|
|
421
|
+
logger.warning(
|
|
422
|
+
f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {e}"
|
|
423
|
+
)
|
|
371
424
|
|
|
372
425
|
def getWaitDuration(self):
|
|
373
426
|
"""We give LSF a second to catch its breath (in seconds)"""
|
toil/batchSystems/lsfHelper.py
CHANGED
|
@@ -72,7 +72,7 @@ def apply_conf_file(fn, conf_filename):
|
|
|
72
72
|
for env in LSF_CONF_ENV:
|
|
73
73
|
conf_file = get_conf_file(conf_filename, env)
|
|
74
74
|
if conf_file:
|
|
75
|
-
with open(conf_file, encoding=
|
|
75
|
+
with open(conf_file, encoding="utf-8") as conf_handle:
|
|
76
76
|
value = fn(conf_handle)
|
|
77
77
|
if value:
|
|
78
78
|
return value
|
|
@@ -112,9 +112,9 @@ def apply_bparams(fn):
|
|
|
112
112
|
"""
|
|
113
113
|
cmd = ["bparams", "-a"]
|
|
114
114
|
try:
|
|
115
|
-
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode(
|
|
115
|
+
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8")
|
|
116
116
|
except subprocess.CalledProcessError as exc:
|
|
117
|
-
logger.debug(exc.output.decode(
|
|
117
|
+
logger.debug(exc.output.decode("utf-8"))
|
|
118
118
|
return None
|
|
119
119
|
return fn(output.split("\n"))
|
|
120
120
|
|
|
@@ -125,9 +125,9 @@ def apply_lsadmin(fn):
|
|
|
125
125
|
"""
|
|
126
126
|
cmd = ["lsadmin", "showconf", "lim"]
|
|
127
127
|
try:
|
|
128
|
-
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode(
|
|
128
|
+
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8")
|
|
129
129
|
except subprocess.CalledProcessError as exc:
|
|
130
|
-
logger.debug(exc.output.decode(
|
|
130
|
+
logger.debug(exc.output.decode("utf-8"))
|
|
131
131
|
return None
|
|
132
132
|
return fn(output.split("\n"))
|
|
133
133
|
|
|
@@ -161,7 +161,7 @@ def parse_mem_and_cmd_from_output(output: str):
|
|
|
161
161
|
# Handle hard wrapping in the middle of words and arbitrary
|
|
162
162
|
# indents. May drop spaces at the starts of lines that aren't
|
|
163
163
|
# meant to be part of the indent.
|
|
164
|
-
cleaned_up_output =
|
|
164
|
+
cleaned_up_output = " ".join(re.sub(r"\n\s*", "", output).split(","))
|
|
165
165
|
max_mem = re.search(r"MAX ?MEM: ?(.*?);", cleaned_up_output)
|
|
166
166
|
command = re.search(r"Command ?<(.*?)>", cleaned_up_output)
|
|
167
167
|
return max_mem, command
|
|
@@ -173,10 +173,10 @@ def get_lsf_version():
|
|
|
173
173
|
"""
|
|
174
174
|
cmd = ["lsid"]
|
|
175
175
|
try:
|
|
176
|
-
output = subprocess.check_output(cmd).decode(
|
|
176
|
+
output = subprocess.check_output(cmd).decode("utf-8")
|
|
177
177
|
except:
|
|
178
178
|
return None
|
|
179
|
-
bjobs_search = re.search(
|
|
179
|
+
bjobs_search = re.search("IBM Spectrum LSF Standard (.*),", output)
|
|
180
180
|
if bjobs_search:
|
|
181
181
|
lsf_version = bjobs_search.group(1)
|
|
182
182
|
return lsf_version
|
|
@@ -188,7 +188,9 @@ def check_lsf_json_output_supported():
|
|
|
188
188
|
"""Check if the current LSF system supports bjobs json output."""
|
|
189
189
|
try:
|
|
190
190
|
lsf_version = get_lsf_version()
|
|
191
|
-
if lsf_version and (
|
|
191
|
+
if lsf_version and (
|
|
192
|
+
version.parse(lsf_version) >= version.parse(LSF_JSON_OUTPUT_MIN_VERSION)
|
|
193
|
+
):
|
|
192
194
|
return True
|
|
193
195
|
except:
|
|
194
196
|
return False
|
|
@@ -197,11 +199,11 @@ def check_lsf_json_output_supported():
|
|
|
197
199
|
|
|
198
200
|
def parse_memory(mem: float) -> str:
|
|
199
201
|
"""Parse memory parameter."""
|
|
200
|
-
megabytes_of_mem = convert_units(float(mem), src_unit=
|
|
202
|
+
megabytes_of_mem = convert_units(float(mem), src_unit="B", dst_unit="MB")
|
|
201
203
|
if megabytes_of_mem < 1:
|
|
202
204
|
megabytes_of_mem = 1.0
|
|
203
205
|
# round as a string here to avoid returning something like 1.231e+12
|
|
204
|
-
return f
|
|
206
|
+
return f"{megabytes_of_mem:.0f}MB"
|
|
205
207
|
|
|
206
208
|
|
|
207
209
|
def per_core_reservation():
|
|
@@ -19,19 +19,23 @@ from threading import Lock
|
|
|
19
19
|
|
|
20
20
|
from toil.provisioners.abstractProvisioner import Shape
|
|
21
21
|
|
|
22
|
-
TaskData = namedtuple(
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
22
|
+
TaskData = namedtuple(
|
|
23
|
+
"TaskData",
|
|
24
|
+
(
|
|
25
|
+
# Time when the task was started
|
|
26
|
+
"startTime",
|
|
27
|
+
# Mesos' ID of the agent where task is being run
|
|
28
|
+
"agentID",
|
|
29
|
+
# IP of agent where task is being run
|
|
30
|
+
"agentIP",
|
|
31
|
+
# Mesos' ID of the executor running the task
|
|
32
|
+
"executorID",
|
|
33
|
+
# Memory requirement of the task
|
|
34
|
+
"memory",
|
|
35
|
+
# CPU requirement of the task
|
|
36
|
+
"cores",
|
|
37
|
+
),
|
|
38
|
+
)
|
|
35
39
|
|
|
36
40
|
|
|
37
41
|
class JobQueue:
|
|
@@ -52,7 +56,11 @@ class JobQueue:
|
|
|
52
56
|
|
|
53
57
|
def jobIDs(self):
|
|
54
58
|
with self.jobLock:
|
|
55
|
-
return [
|
|
59
|
+
return [
|
|
60
|
+
job.jobID
|
|
61
|
+
for queue in list(self.queues.values())
|
|
62
|
+
for job in list(queue.queue)
|
|
63
|
+
]
|
|
56
64
|
|
|
57
65
|
def nextJobOfType(self, jobType):
|
|
58
66
|
with self.jobLock:
|
|
@@ -80,18 +88,22 @@ class MesosShape(Shape):
|
|
|
80
88
|
return not self.greater_than(other)
|
|
81
89
|
|
|
82
90
|
|
|
83
|
-
ToilJob = namedtuple(
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
91
|
+
ToilJob = namedtuple(
|
|
92
|
+
"ToilJob",
|
|
93
|
+
(
|
|
94
|
+
# A job ID specific to this batch system implementation
|
|
95
|
+
"jobID",
|
|
96
|
+
# What string to display in the mesos UI
|
|
97
|
+
"name",
|
|
98
|
+
# A ResourceRequirement tuple describing the resources needed by this job
|
|
99
|
+
"resources",
|
|
100
|
+
# The command to be run on the worker node
|
|
101
|
+
"command",
|
|
102
|
+
# The resource object representing the user script
|
|
103
|
+
"userScript",
|
|
104
|
+
# A dictionary with additional environment variables to be set on the worker process
|
|
105
|
+
"environment",
|
|
106
|
+
# A named tuple containing all the required info for cleaning up the worker node
|
|
107
|
+
"workerCleanupInfo",
|
|
108
|
+
),
|
|
109
|
+
)
|