toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +18 -13
- toil/batchSystems/abstractBatchSystem.py +39 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
- toil/batchSystems/awsBatch.py +14 -14
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +3 -3
- toil/batchSystems/htcondor.py +0 -1
- toil/batchSystems/kubernetes.py +34 -31
- toil/batchSystems/local_support.py +3 -1
- toil/batchSystems/lsf.py +7 -7
- toil/batchSystems/mesos/batchSystem.py +7 -7
- toil/batchSystems/options.py +32 -83
- toil/batchSystems/registry.py +104 -23
- toil/batchSystems/singleMachine.py +16 -13
- toil/batchSystems/slurm.py +87 -16
- toil/batchSystems/torque.py +0 -1
- toil/bus.py +44 -8
- toil/common.py +544 -753
- toil/cwl/__init__.py +28 -32
- toil/cwl/cwltoil.py +595 -574
- toil/cwl/utils.py +55 -10
- toil/exceptions.py +1 -1
- toil/fileStores/__init__.py +2 -2
- toil/fileStores/abstractFileStore.py +88 -14
- toil/fileStores/cachingFileStore.py +610 -549
- toil/fileStores/nonCachingFileStore.py +46 -22
- toil/job.py +182 -101
- toil/jobStores/abstractJobStore.py +161 -95
- toil/jobStores/aws/jobStore.py +23 -9
- toil/jobStores/aws/utils.py +6 -6
- toil/jobStores/fileJobStore.py +116 -18
- toil/jobStores/googleJobStore.py +16 -7
- toil/jobStores/utils.py +5 -6
- toil/leader.py +87 -56
- toil/lib/accelerators.py +10 -5
- toil/lib/aws/__init__.py +3 -14
- toil/lib/aws/ami.py +22 -9
- toil/lib/aws/iam.py +21 -13
- toil/lib/aws/session.py +2 -16
- toil/lib/aws/utils.py +4 -5
- toil/lib/compatibility.py +1 -1
- toil/lib/conversions.py +26 -3
- toil/lib/docker.py +22 -23
- toil/lib/ec2.py +10 -6
- toil/lib/ec2nodes.py +106 -100
- toil/lib/encryption/_nacl.py +2 -1
- toil/lib/generatedEC2Lists.py +325 -18
- toil/lib/io.py +49 -2
- toil/lib/misc.py +1 -1
- toil/lib/resources.py +9 -2
- toil/lib/threading.py +101 -38
- toil/options/common.py +736 -0
- toil/options/cwl.py +336 -0
- toil/options/wdl.py +37 -0
- toil/provisioners/abstractProvisioner.py +9 -4
- toil/provisioners/aws/__init__.py +3 -6
- toil/provisioners/aws/awsProvisioner.py +6 -0
- toil/provisioners/clusterScaler.py +3 -2
- toil/provisioners/gceProvisioner.py +2 -2
- toil/realtimeLogger.py +2 -1
- toil/resource.py +24 -18
- toil/server/app.py +2 -3
- toil/server/cli/wes_cwl_runner.py +4 -4
- toil/server/utils.py +1 -1
- toil/server/wes/abstract_backend.py +3 -2
- toil/server/wes/amazon_wes_utils.py +5 -4
- toil/server/wes/tasks.py +2 -3
- toil/server/wes/toil_backend.py +2 -10
- toil/server/wsgi_app.py +2 -0
- toil/serviceManager.py +12 -10
- toil/statsAndLogging.py +41 -9
- toil/test/__init__.py +29 -54
- toil/test/batchSystems/batchSystemTest.py +11 -111
- toil/test/batchSystems/test_slurm.py +24 -8
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +438 -223
- toil/test/cwl/glob_dir.cwl +15 -0
- toil/test/cwl/preemptible.cwl +21 -0
- toil/test/cwl/preemptible_expression.cwl +28 -0
- toil/test/cwl/revsort.cwl +1 -1
- toil/test/cwl/revsort2.cwl +1 -1
- toil/test/docs/scriptsTest.py +2 -3
- toil/test/jobStores/jobStoreTest.py +34 -21
- toil/test/lib/aws/test_iam.py +4 -14
- toil/test/lib/aws/test_utils.py +0 -3
- toil/test/lib/dockerTest.py +4 -4
- toil/test/lib/test_ec2.py +12 -17
- toil/test/mesos/helloWorld.py +4 -5
- toil/test/mesos/stress.py +1 -1
- toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
- toil/test/provisioners/clusterScalerTest.py +6 -4
- toil/test/provisioners/clusterTest.py +23 -11
- toil/test/provisioners/gceProvisionerTest.py +0 -6
- toil/test/provisioners/restartScript.py +3 -2
- toil/test/server/serverTest.py +1 -1
- toil/test/sort/restart_sort.py +2 -1
- toil/test/sort/sort.py +2 -1
- toil/test/sort/sortTest.py +2 -13
- toil/test/src/autoDeploymentTest.py +45 -45
- toil/test/src/busTest.py +5 -5
- toil/test/src/checkpointTest.py +2 -2
- toil/test/src/deferredFunctionTest.py +1 -1
- toil/test/src/fileStoreTest.py +32 -16
- toil/test/src/helloWorldTest.py +1 -1
- toil/test/src/importExportFileTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +2 -1
- toil/test/src/jobServiceTest.py +1 -1
- toil/test/src/jobTest.py +18 -18
- toil/test/src/miscTests.py +5 -3
- toil/test/src/promisedRequirementTest.py +3 -3
- toil/test/src/realtimeLoggerTest.py +1 -1
- toil/test/src/resourceTest.py +2 -2
- toil/test/src/restartDAGTest.py +1 -1
- toil/test/src/resumabilityTest.py +36 -2
- toil/test/src/retainTempDirTest.py +1 -1
- toil/test/src/systemTest.py +2 -2
- toil/test/src/toilContextManagerTest.py +2 -2
- toil/test/src/userDefinedJobArgTypeTest.py +1 -1
- toil/test/utils/toilDebugTest.py +98 -32
- toil/test/utils/toilKillTest.py +2 -2
- toil/test/utils/utilsTest.py +23 -3
- toil/test/wdl/wdltoil_test.py +223 -45
- toil/toilState.py +7 -6
- toil/utils/toilClean.py +1 -1
- toil/utils/toilConfig.py +36 -0
- toil/utils/toilDebugFile.py +60 -33
- toil/utils/toilDebugJob.py +39 -12
- toil/utils/toilDestroyCluster.py +1 -1
- toil/utils/toilKill.py +1 -1
- toil/utils/toilLaunchCluster.py +13 -2
- toil/utils/toilMain.py +3 -2
- toil/utils/toilRsyncCluster.py +1 -1
- toil/utils/toilSshCluster.py +1 -1
- toil/utils/toilStats.py +445 -305
- toil/utils/toilStatus.py +2 -5
- toil/version.py +10 -10
- toil/wdl/utils.py +2 -122
- toil/wdl/wdltoil.py +1257 -492
- toil/worker.py +55 -46
- toil-6.1.0.dist-info/METADATA +124 -0
- toil-6.1.0.dist-info/RECORD +241 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
- toil/batchSystems/parasol.py +0 -379
- toil/batchSystems/tes.py +0 -459
- toil/test/batchSystems/parasolTestSupport.py +0 -117
- toil/test/wdl/builtinTest.py +0 -506
- toil/test/wdl/toilwdlTest.py +0 -522
- toil/wdl/toilwdl.py +0 -141
- toil/wdl/versions/dev.py +0 -107
- toil/wdl/versions/draft2.py +0 -980
- toil/wdl/versions/v1.py +0 -794
- toil/wdl/wdl_analysis.py +0 -116
- toil/wdl/wdl_functions.py +0 -997
- toil/wdl/wdl_synthesis.py +0 -1011
- toil/wdl/wdl_types.py +0 -243
- toil-5.12.0.dist-info/METADATA +0 -118
- toil-5.12.0.dist-info/RECORD +0 -244
- /toil/{wdl/versions → options}/__init__.py +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/slurm.py
CHANGED
|
@@ -16,13 +16,14 @@ import math
|
|
|
16
16
|
import os
|
|
17
17
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
18
18
|
from shlex import quote
|
|
19
|
-
from typing import
|
|
19
|
+
from typing import Dict, List, Optional, Tuple, TypeVar, Union
|
|
20
20
|
|
|
21
|
+
from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE
|
|
21
22
|
from toil.batchSystems.abstractGridEngineBatchSystem import \
|
|
22
23
|
AbstractGridEngineBatchSystem
|
|
23
24
|
from toil.batchSystems.options import OptionSetter
|
|
24
|
-
from toil.lib.misc import CalledProcessErrorStderr, call_command
|
|
25
25
|
from toil.job import Requirer
|
|
26
|
+
from toil.lib.misc import CalledProcessErrorStderr, call_command
|
|
26
27
|
|
|
27
28
|
logger = logging.getLogger(__name__)
|
|
28
29
|
|
|
@@ -64,7 +65,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
64
65
|
jobName: str,
|
|
65
66
|
job_environment: Optional[Dict[str, str]] = None,
|
|
66
67
|
gpus: Optional[int] = None) -> List[str]:
|
|
67
|
-
|
|
68
|
+
# Make sure to use exec so we can get Slurm's signals in the Toil
|
|
69
|
+
# worker instead of having an intervening Bash
|
|
70
|
+
return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap=exec {command}']
|
|
68
71
|
|
|
69
72
|
def submitJob(self, subLine):
|
|
70
73
|
try:
|
|
@@ -95,12 +98,12 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
95
98
|
logger.error("sbatch command failed")
|
|
96
99
|
raise e
|
|
97
100
|
|
|
98
|
-
def coalesce_job_exit_codes(self, batch_job_id_list: list) ->
|
|
101
|
+
def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
|
|
99
102
|
"""
|
|
100
103
|
Collect all job exit codes in a single call.
|
|
101
104
|
:param batch_job_id_list: list of Job ID strings, where each string has the form
|
|
102
105
|
"<job>[.<task>]".
|
|
103
|
-
:return: list of job exit codes, associated with the list of job IDs.
|
|
106
|
+
:return: list of job exit codes, or (exit code, exit reason) pairs, associated with the list of job IDs.
|
|
104
107
|
"""
|
|
105
108
|
logger.debug("Getting exit codes for slurm jobs: %s", batch_job_id_list)
|
|
106
109
|
# Convert batch_job_id_list to list of integer job IDs.
|
|
@@ -111,7 +114,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
111
114
|
exit_codes.append(self._get_job_return_code(status))
|
|
112
115
|
return exit_codes
|
|
113
116
|
|
|
114
|
-
def getJobExitCode(self, batchJobID: str) -> int:
|
|
117
|
+
def getJobExitCode(self, batchJobID: str) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
|
|
115
118
|
"""
|
|
116
119
|
Get job exit code for given batch job ID.
|
|
117
120
|
:param batchJobID: string of the form "<job>[.<task>]".
|
|
@@ -138,18 +141,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
138
141
|
status_dict = self._getJobDetailsFromScontrol(job_id_list)
|
|
139
142
|
return status_dict
|
|
140
143
|
|
|
141
|
-
def _get_job_return_code(self, status: tuple) ->
|
|
144
|
+
def _get_job_return_code(self, status: tuple) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
|
|
142
145
|
"""
|
|
146
|
+
Given a Slurm (state, return code) pair, summarize it into a Toil (return code, exit reason) pair.
|
|
147
|
+
|
|
148
|
+
The return code may have already been OR'd with the 128-offset
|
|
149
|
+
Slurm-reported signal.
|
|
150
|
+
|
|
151
|
+
Slurm will report return codes of 0 even if jobs time out instead
|
|
152
|
+
of succeeding:
|
|
153
|
+
|
|
154
|
+
2093597|TIMEOUT|0:0
|
|
155
|
+
2093597.batch|CANCELLED|0:15
|
|
156
|
+
|
|
157
|
+
So we guarantee here that, if the Slurm status string is not a
|
|
158
|
+
successful one as defined in
|
|
159
|
+
<https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>, we
|
|
160
|
+
will not return a successful return code.
|
|
161
|
+
|
|
143
162
|
Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
|
|
144
|
-
:param status: tuple containing the job's state and it's return code.
|
|
145
|
-
:return: the job's return code if it's completed, otherwise None.
|
|
163
|
+
:param status: tuple containing the job's state and its return code from Slurm.
|
|
164
|
+
:return: the job's return code for Toil if it's completed, otherwise None.
|
|
146
165
|
"""
|
|
147
166
|
state, rc = status
|
|
148
|
-
|
|
149
|
-
#
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
167
|
+
|
|
168
|
+
# If a job is in one of these states, Slurm can't run it anymore.
|
|
169
|
+
# We don't include states where the job is held or paused here;
|
|
170
|
+
# those mean it could run and needs to wait for someone to un-hold
|
|
171
|
+
# it, so Toil should wait for it.
|
|
172
|
+
#
|
|
173
|
+
# We map from each terminal state to the Toil-ontology exit reason.
|
|
174
|
+
TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
|
|
175
|
+
"BOOT_FAIL": BatchJobExitReason.LOST,
|
|
176
|
+
"CANCELLED": BatchJobExitReason.KILLED,
|
|
177
|
+
"COMPLETED": BatchJobExitReason.FINISHED,
|
|
178
|
+
"DEADLINE": BatchJobExitReason.KILLED,
|
|
179
|
+
"FAILED": BatchJobExitReason.FAILED,
|
|
180
|
+
"NODE_FAIL": BatchJobExitReason.LOST,
|
|
181
|
+
"OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
|
|
182
|
+
"PREEMPTED": BatchJobExitReason.KILLED,
|
|
183
|
+
"TIMEOUT": BatchJobExitReason.KILLED
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
if state not in TERMINAL_STATES:
|
|
187
|
+
# Don't treat the job as exited yet
|
|
188
|
+
return None
|
|
189
|
+
|
|
190
|
+
exit_reason = TERMINAL_STATES[state]
|
|
191
|
+
|
|
192
|
+
if exit_reason == BatchJobExitReason.FINISHED:
|
|
193
|
+
# The only state that should produce a 0 ever is COMPLETED. So
|
|
194
|
+
# if the job is COMPLETED and the exit reason is thus FINISHED,
|
|
195
|
+
# pass along the code it has.
|
|
196
|
+
return (rc, exit_reason)
|
|
197
|
+
|
|
198
|
+
if rc == 0:
|
|
199
|
+
# The job claims to be in a state other than COMPLETED, but
|
|
200
|
+
# also to have not encountered a problem. Say the exit status
|
|
201
|
+
# is unavailable.
|
|
202
|
+
return (EXIT_STATUS_UNAVAILABLE_VALUE, exit_reason)
|
|
203
|
+
|
|
204
|
+
# If the code is nonzero, pass it along.
|
|
205
|
+
return (rc, exit_reason)
|
|
153
206
|
|
|
154
207
|
def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
|
|
155
208
|
"""
|
|
@@ -283,8 +336,26 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
283
336
|
job_environment: Optional[Dict[str, str]],
|
|
284
337
|
gpus: Optional[int]) -> List[str]:
|
|
285
338
|
|
|
286
|
-
|
|
339
|
+
"""
|
|
340
|
+
Returns the sbatch command line to run to queue the job.
|
|
341
|
+
"""
|
|
342
|
+
|
|
343
|
+
# Start by naming the job
|
|
287
344
|
sbatch_line = ['sbatch', '-J', f'toil_job_{jobID}_{jobName}']
|
|
345
|
+
|
|
346
|
+
# Make sure the job gets a signal before it disappears so that e.g.
|
|
347
|
+
# container cleanup finally blocks can run. Ask for SIGINT so we
|
|
348
|
+
# can get the default Python KeyboardInterrupt which third-party
|
|
349
|
+
# code is likely to plan for. Make sure to send it to the batch
|
|
350
|
+
# shell process with "B:", not to all the srun steps it launches
|
|
351
|
+
# (because there shouldn't be any). We cunningly replaced the batch
|
|
352
|
+
# shell process with the Toil worker process, so Toil should be
|
|
353
|
+
# able to get the signal.
|
|
354
|
+
#
|
|
355
|
+
# TODO: Add a way to detect when the job failed because it
|
|
356
|
+
# responded to this signal and use the right exit reason for it.
|
|
357
|
+
sbatch_line.append("--signal=B:INT@30")
|
|
358
|
+
|
|
288
359
|
if gpus:
|
|
289
360
|
sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:]
|
|
290
361
|
environment = {}
|
|
@@ -387,5 +458,5 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
387
458
|
OptionType = TypeVar('OptionType')
|
|
388
459
|
@classmethod
|
|
389
460
|
def setOptions(cls, setOption: OptionSetter) -> None:
|
|
390
|
-
setOption("allocate_mem"
|
|
461
|
+
setOption("allocate_mem")
|
|
391
462
|
|
toil/batchSystems/torque.py
CHANGED
toil/bus.py
CHANGED
|
@@ -61,15 +61,13 @@ MessageBus.connect_output_file() and MessageBus.scan_bus_messages().
|
|
|
61
61
|
"""
|
|
62
62
|
|
|
63
63
|
import collections
|
|
64
|
-
|
|
65
|
-
import inspect
|
|
64
|
+
import json
|
|
66
65
|
import logging
|
|
67
|
-
|
|
68
66
|
import os
|
|
69
67
|
import queue
|
|
70
|
-
import json
|
|
71
68
|
import tempfile
|
|
72
69
|
import threading
|
|
70
|
+
from dataclasses import dataclass
|
|
73
71
|
from typing import (IO,
|
|
74
72
|
Any,
|
|
75
73
|
Callable,
|
|
@@ -80,7 +78,6 @@ from typing import (IO,
|
|
|
80
78
|
Optional,
|
|
81
79
|
Type,
|
|
82
80
|
TypeVar,
|
|
83
|
-
Union,
|
|
84
81
|
cast)
|
|
85
82
|
|
|
86
83
|
from pubsub.core import Publisher
|
|
@@ -90,6 +87,43 @@ from pubsub.core.topicutils import ALL_TOPICS
|
|
|
90
87
|
|
|
91
88
|
logger = logging.getLogger( __name__ )
|
|
92
89
|
|
|
90
|
+
# We define some ways to talk about jobs.
|
|
91
|
+
|
|
92
|
+
class Names(NamedTuple):
|
|
93
|
+
"""
|
|
94
|
+
Stores all the kinds of name a job can have.
|
|
95
|
+
"""
|
|
96
|
+
# Name of the kind of job this is
|
|
97
|
+
job_name: str
|
|
98
|
+
# Name of this particular work unit
|
|
99
|
+
unit_name: str
|
|
100
|
+
# Human-readable name for the job
|
|
101
|
+
display_name: str
|
|
102
|
+
# What the job prints as, used for stats-and-logging log management
|
|
103
|
+
stats_name: str
|
|
104
|
+
# Job store ID of the job for the work unit
|
|
105
|
+
job_store_id: str
|
|
106
|
+
|
|
107
|
+
def get_job_kind(names: Names) -> str:
|
|
108
|
+
"""
|
|
109
|
+
Return an identifying string for the job.
|
|
110
|
+
|
|
111
|
+
The result may contain spaces.
|
|
112
|
+
|
|
113
|
+
Returns: Either the unit name, job name, or display name, which identifies
|
|
114
|
+
the kind of job it is to toil.
|
|
115
|
+
Otherwise "Unknown Job" in case no identifier is available
|
|
116
|
+
"""
|
|
117
|
+
if names.unit_name:
|
|
118
|
+
return names.unit_name
|
|
119
|
+
elif names.job_name:
|
|
120
|
+
return names.job_name
|
|
121
|
+
elif names.display_name:
|
|
122
|
+
return names.display_name
|
|
123
|
+
else:
|
|
124
|
+
return "Unknown Job"
|
|
125
|
+
|
|
126
|
+
|
|
93
127
|
# We define a bunch of named tuple message types.
|
|
94
128
|
# These all need to be plain data: only hold ints, strings, etc.
|
|
95
129
|
|
|
@@ -396,8 +430,8 @@ class MessageBus:
|
|
|
396
430
|
given topic.
|
|
397
431
|
"""
|
|
398
432
|
# There should always be a "message"
|
|
399
|
-
|
|
400
|
-
|
|
433
|
+
if len(message_data) != 1 or 'message' not in message_data:
|
|
434
|
+
raise RuntimeError("Cannot log the bus message. The message is either empty/malformed or there are too many messages provided.")
|
|
401
435
|
message = message_data['message']
|
|
402
436
|
topic = topic_object.getName()
|
|
403
437
|
stream.write(topic.encode('utf-8'))
|
|
@@ -572,7 +606,8 @@ class MessageInbox(MessageBusClient):
|
|
|
572
606
|
handled = False
|
|
573
607
|
try:
|
|
574
608
|
# Emit the message
|
|
575
|
-
|
|
609
|
+
if not isinstance(message, message_type):
|
|
610
|
+
raise RuntimeError(f"Unacceptable message type {type(message)} in list for type {message_type}")
|
|
576
611
|
yield message
|
|
577
612
|
# If we get here it was handled without error.
|
|
578
613
|
handled = True
|
|
@@ -650,6 +685,7 @@ class JobStatus:
|
|
|
650
685
|
|
|
651
686
|
def __repr__(self) -> str:
|
|
652
687
|
return json.dumps(self, default= lambda o: o.__dict__, indent=4)
|
|
688
|
+
|
|
653
689
|
def replay_message_bus(path: str) -> Dict[str, JobStatus]:
|
|
654
690
|
"""
|
|
655
691
|
Replay all the messages and work out what they mean for jobs.
|