toil-6.1.0-py3-none-any.whl → toil-7.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +22 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +2 -2
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +64 -22
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +7 -3
- toil/common.py +36 -13
- toil/cwl/cwltoil.py +365 -312
- toil/deferred.py +1 -1
- toil/fileStores/abstractFileStore.py +17 -17
- toil/fileStores/cachingFileStore.py +2 -2
- toil/fileStores/nonCachingFileStore.py +1 -1
- toil/job.py +228 -60
- toil/jobStores/abstractJobStore.py +18 -10
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +57 -29
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +2 -2
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +72 -24
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +5 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/io.py +14 -2
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +55 -21
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +2 -2
- toil/lib/throttle.py +1 -1
- toil/options/common.py +27 -24
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +9 -7
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +58 -16
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +1 -1
- toil/test/cwl/cwlTest.py +8 -91
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +10 -13
- toil/test/jobStores/jobStoreTest.py +33 -49
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +90 -8
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +61 -3
- toil/test/utils/utilsTest.py +20 -18
- toil/test/wdl/wdltoil_test.py +24 -71
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +2 -1
- toil/utils/toilStatus.py +97 -51
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +318 -51
- toil/worker.py +96 -69
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/utils/toilDebugJob.py
CHANGED
@@ -12,66 +12,193 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Debug tool for running a toil job locally."""
+import gc
 import logging
-
+import os
 import pprint
 import sys
 
+from pathlib import Path
+from typing import Optional, List, Tuple
+
 from toil.common import Config, Toil, parser_with_common_options
+from toil.job import FilesDownloadedStoppingPointReached
 from toil.jobStores.fileJobStore import FileJobStore
 from toil.statsAndLogging import set_logging_from_options
 from toil.utils.toilDebugFile import printContentsOfJobStore
+from toil.utils.toilStatus import ToilStatus
 from toil.worker import workerScript
 
 logger = logging.getLogger(__name__)
 
 
 def main() -> None:
-    parser = parser_with_common_options(jobstore_option=True, prog="toil debug-job")
-    parser.add_argument("
-                        help="The job store id of a job within the provided jobstore
-    parser.add_argument("--printJobInfo",
-                        help="Dump debugging info about
+    parser = parser_with_common_options(jobstore_option=True, prog="toil debug-job", default_log_level=logging.DEBUG)
+    parser.add_argument("job", type=str,
+                        help="The job store id or job name of a job within the provided jobstore")
+    parser.add_argument("--printJobInfo", action="store_true",
+                        help="Dump debugging info about the job instead of running it")
+    parser.add_argument("--retrieveTaskDirectory", dest="retrieve_task_directory", type=str, default=None,
+                        help="Download CWL or WDL task inputs to the given directory and stop.")
 
     options = parser.parse_args()
     set_logging_from_options(options)
 
+    if options.retrieve_task_directory is not None and os.path.exists(options.retrieve_task_directory):
+        # The logic to duplicate container mounts depends on stuff not already existing.
+        logger.error(
+            "The directory %s given for --retrieveTaskDirectory already exists. "
+            "Stopping to avoid clobbering existing files.",
+            options.retrieve_task_directory
+        )
+        sys.exit(1)
+
     jobStore = Toil.resumeJobStore(options.jobStore)
     # Get the config with the workflow ID from the job store
     config = jobStore.config
     # But override its options
     config.setOptions(options)
 
-
+    # Find the job
+
+    if jobStore.job_exists(options.job):
+        # The user asked for a particular job and it exists
+        job_id = options.job
+    else:
+        # Go search by name and fill in job_id
+
+        # TODO: break out job store scan logic so it doesn't need to re-connect
+        # to the job store.
+        status = ToilStatus(options.jobStore)
+        hits = []
+        suggestion = None
+        for job in status.jobsToReport:
+            if options.job in (job.jobName, job.unitName, job.displayName):
+                # Find all the jobs that sort of match
+                hits.append(job)
+                if suggestion is None and job.remainingTryCount == 0:
+                    # How about this nice failing job instead?
+                    suggestion = job
+        if len(hits) == 0:
+            # No hits
+            if suggestion is None:
+                logger.critical("No job found with ID or name \"%s\". No jobs are completely failed.", options.job)
+            else:
+                logger.critical("No job found with ID or name \"%s\". How about the failed job %s instead?", options.job, suggestion)
+            sys.exit(1)
+        elif len(hits) > 1:
+            # Several hits, maybe only one has failed
+            completely_failed_hits = [job for job in hits if job.remainingTryCount == 0]
+            if len(completely_failed_hits) == 0:
+                logger.critical("Multiple jobs match \"%s\" but none are completely failed: %s", options.job, hits)
+                sys.exit(1)
+            elif len(completely_failed_hits) > 0:
+                logger.critical("Multiple jobs matching \"%s\" are completely failed: %s", options.job, completely_failed_hits)
+                sys.exit(1)
+            else:
+                # We found one completely failed job, they probably mean that one.
+                logger.info("There are %s jobs matching \"%s\"; assuming you mean the failed one: %s", options.job, completely_failed_hits[0])
+                job_id = completely_failed_hits[0].jobStoreID
+        else:
+            # We found one job with this name, so they must mean that one
+            logger.info("Looked up job named \"%s\": %s", options.job, hits[0])
+            job_id = hits[0].jobStoreID
 
     if options.printJobInfo:
+        # Report on the job
+
         if isinstance(jobStore, FileJobStore):
             # List all its files if we can
-            printContentsOfJobStore(job_store=jobStore, job_id=
+            printContentsOfJobStore(job_store=jobStore, job_id=job_id)
         # Print the job description itself
-        job_desc = jobStore.load_job(
+        job_desc = jobStore.load_job(job_id)
         print(f"Job: {job_desc}")
         pprint.pprint(job_desc.__dict__)
+    else:
+        # Run the job
 
-
+        debug_flags = set()
+        local_worker_temp_dir = None
+        if options.retrieve_task_directory is not None:
+            # Pick a directory in it (which may be removed by the worker) as the worker's temp dir.
+            local_worker_temp_dir = os.path.join(options.retrieve_task_directory, "worker")
+            # Make sure it exists
+            os.makedirs(local_worker_temp_dir, exist_ok=True)
+            # And tell the job to just download files
+            debug_flags.add("download_only")
+        # We might need to reconstruct a container environment.
+        host_and_job_paths: Optional[List[Tuple[str, str]]] = None
+        # Track if the run succeeded without error
+        run_succeeded = False
 
-
-
-
+        logger.info(f"Running the following job locally: {job_id}")
+        try:
+            workerScript(jobStore, config, job_id, job_id, redirect_output_to_log_file=False, local_worker_temp_dir=local_worker_temp_dir, debug_flags=debug_flags)
+        except FilesDownloadedStoppingPointReached as e:
+            # We asked for the files to be downloaded and now they are.
+            assert options.retrieve_task_directory is not None
+            if e.host_and_job_paths is not None:
+                # Capture the container mapping so we can reconstruct the container environment after we unwind the worker stack.
+                host_and_job_paths = e.host_and_job_paths
+        else:
+            # No error!
+            run_succeeded = True
 
-
-    #
+        # Make sure the deferred function manager cleans up and logs its
+        # shutdown before we start writing any reports.
+        gc.collect()
 
-
-
-    workerScript(jobStore, config, jobID, jobID, redirectOutputToLogFile=False)
-    logger.debug(f"Finished running: {jobID}")
-    # Even if the job fails, the worker script succeeds unless something goes wrong with it internally.
+        if run_succeeded:
+            logger.info(f"Successfully ran: {job_id}")
 
-
+        if host_and_job_paths is not None:
+            # We need to make a place that looks like the job paths half of these.
 
-
-
-
-
-
+            # Sort by job-side path so we do children before parents, to
+            # stop us from accidentally making children inside moutned
+            # parents.
+            sorted_mounts = sorted(host_and_job_paths, key=lambda t: t[1], reverse=True)
+
+            fake_job_root = os.path.join(options.retrieve_task_directory, "inside")
+            os.makedirs(fake_job_root, exist_ok=True)
+
+            for host_path, job_path in sorted_mounts:
+                if not os.path.exists(host_path):
+                    logger.error("Job intended to mount %s as %s but it does not exist!", host_path, job_path)
+                    continue
+                if not job_path.startswith("/"):
+                    logger.error("Job intended to mount %s as %s but destination is a relative path!", host_path, job_path)
+                    continue
+                # Drop the slash because we are building a chroot-ish mini filesystem.
+                job_relative_path = job_path[1:]
+                if job_relative_path.startswith("/"):
+                    # We are having trouble understanding what the job
+                    # intended to do. Stop working on this mount.
+                    logger.error("Job intended to mount %s as %s but destination starts with multiple slashes for some reason!", host_path, job_path)
+                    continue
+                fake_job_path = os.path.join(fake_job_root, job_relative_path)
+                if os.path.exists(fake_job_path):
+                    logger.error("Job intended to mount %s as %s but that location is already mounted!", host_path, job_path)
+                    continue
+
+                logger.info("Job mounted %s as %s", host_path, job_path)
+
+                # Make sure the directory to contain the mount exists.
+                fake_job_containing_path = os.path.dirname(fake_job_path)
+                os.makedirs(fake_job_containing_path, exist_ok=True)
+
+                top_pathobj = Path(os.path.abspath(options.retrieve_task_directory))
+                source_pathobj = Path(host_path)
+                if top_pathobj in source_pathobj.parents:
+                    # We're linking to a file we already downloaded (probably).
+                    # Make a relative symlink so the whole assemblage can move.
+                    host_path = os.path.relpath(host_path, fake_job_containing_path)
+
+                # Make a symlink to simulate the mount
+                os.symlink(host_path, fake_job_path)
+
+            logger.info("Reconstructed job container filesystem at %s", fake_job_root)
+
+    # TODO: Option to print list of successor jobs
+    # TODO: Option to run job within python debugger, allowing step through of arguments
+    # idea would be to have option to import pdb and set breakpoint at the start of the user's code
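The subtle part of the new --retrieveTaskDirectory mode is the final loop, which rebuilds a chroot-style view of the task's in-container filesystem out of symlinks, handling child mounts before parent mounts. A condensed standalone sketch of that idea (a hypothetical helper for illustration, not Toil's API):

```python
import os
from typing import List, Tuple

def reconstruct_mounts(task_dir: str, mounts: List[Tuple[str, str]]) -> str:
    """Simulate container mounts under task_dir/inside using symlinks.

    mounts holds (host_path, job_path) pairs like those delivered by
    FilesDownloadedStoppingPointReached in the diff above.
    """
    fake_root = os.path.join(task_dir, "inside")
    os.makedirs(fake_root, exist_ok=True)
    # Children before parents: linking a child after its parent is already a
    # symlink would drop the child inside the link target, not the fake tree.
    for host_path, job_path in sorted(mounts, key=lambda t: t[1], reverse=True):
        if not job_path.startswith("/"):
            continue  # only absolute in-container destinations make sense
        fake_path = os.path.join(fake_root, job_path.lstrip("/"))
        if os.path.exists(fake_path):
            continue  # already covered by an earlier (deeper) mount
        os.makedirs(os.path.dirname(fake_path), exist_ok=True)
        os.symlink(host_path, fake_path)
    return fake_root

# Hypothetical example: host /data appears to the task as /mnt/data.
# reconstruct_mounts("/tmp/task", [("/data", "/mnt/data")])
```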
toil/utils/toilLaunchCluster.py
CHANGED
@@ -20,6 +20,7 @@ from typing import Dict, List, Tuple, Union
 from toil import applianceSelf
 from toil.common import parser_with_common_options
 from toil.lib.aws import build_tag_dict_from_env
+from toil.lib.conversions import opt_strtobool
 from toil.provisioners import (check_valid_node_types,
                                cluster_factory,
                                parse_node_types)
@@ -31,7 +32,11 @@ logger = logging.getLogger(__name__)
 def create_tags_dict(tags: List[str]) -> Dict[str, str]:
     tags_dict = dict()
     for tag in tags:
-        key, value = tag.split('=')
+        try:
+            key, value = tag.split('=')
+        except ValueError:
+            logger.error("Tag specification '%s' must contain '='", tag)
+            raise
         tags_dict[key] = value
     return tags_dict
 
@@ -114,6 +119,10 @@ def main() -> None:
                         help="Any additional security groups to attach to EC2 instances. Note that a security group "
                              "with its name equal to the cluster name will always be created, thus ensure that "
                              "the extra security groups do not have the same name as the cluster name.")
+    parser.add_argument("--allowFuse", type=opt_strtobool, default=True,
+                        help="Enable both the leader and worker nodes to be able to run Singularity with FUSE. For "
+                             "Kubernetes, this will make the leader privileged and ask workers to run as privileged. "
+                             "(default: %(default)s)")
     #TODO Set Aws Profile in CLI options
     options = parser.parse_args()
     set_logging_from_options(options)
@@ -178,7 +187,8 @@ def main() -> None:
                               clusterName=options.clusterName,
                               clusterType=options.clusterType,
                               zone=options.zone,
-                              nodeStorage=options.nodeStorage)
+                              nodeStorage=options.nodeStorage,
+                              enable_fuse=options.allowFuse)
 
     cluster.launchCluster(leaderNodeType=options.leaderNodeType,
                           leaderStorage=options.leaderStorage,
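The create_tags_dict change turns an anonymous unpacking ValueError into an error message naming the offending tag before re-raising. A self-contained sketch of the resulting behavior; note that because split('=') is unbounded, a tag like "k=v=w" also trips the error, not only a tag with no '=' at all:

```python
import logging
from typing import Dict, List

logger = logging.getLogger(__name__)

def create_tags_dict(tags: List[str]) -> Dict[str, str]:
    # Mirrors the diff above: say *which* tag was malformed, then re-raise.
    tags_dict: Dict[str, str] = {}
    for tag in tags:
        try:
            key, value = tag.split('=')
        except ValueError:
            # Raised for "noequals" and also for "k=v=extra".
            logger.error("Tag specification '%s' must contain '='", tag)
            raise
        tags_dict[key] = value
    return tags_dict

assert create_tags_dict(["owner=me", "project=demo"]) == {"owner": "me", "project": "demo"}
```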
toil/utils/toilRsyncCluster.py
CHANGED
@@ -14,9 +14,10 @@
 """Rsyncs into the toil appliance container running on the leader of the cluster."""
 import argparse
 import logging
+import sys
 
 from toil.common import parser_with_common_options
-from toil.provisioners import cluster_factory
+from toil.provisioners import cluster_factory, NoSuchClusterException
 from toil.statsAndLogging import set_logging_from_options
 
 logger = logging.getLogger(__name__)
@@ -37,4 +38,8 @@ def main() -> None:
     cluster = cluster_factory(provisioner=options.provisioner,
                               clusterName=options.clusterName,
                               zone=options.zone)
-    cluster.getLeader().coreRsync(args=options.args, strict=not options.insecure)
+    try:
+        cluster.getLeader().coreRsync(args=options.args, strict=not options.insecure)
+    except NoSuchClusterException as e:
+        logger.error(e)
+        sys.exit(1)
toil/utils/toilSshCluster.py
CHANGED
@@ -18,7 +18,7 @@ import sys
 from typing import List
 
 from toil.common import parser_with_common_options
-from toil.provisioners import cluster_factory
+from toil.provisioners import cluster_factory, NoSuchClusterException
 from toil.statsAndLogging import set_logging_from_options
 
 logger = logging.getLogger(__name__)
@@ -54,5 +54,9 @@ def main() -> None:
     sshOptions.extend(['-L', f'{options.grafana_port}:localhost:3000',
                        '-L', '9090:localhost:9090'])
 
-    cluster.getLeader().sshAppliance(*command, strict=not options.insecure, tty=sys.stdin.isatty(),
-                                     sshOptions=sshOptions)
+    try:
+        cluster.getLeader().sshAppliance(*command, strict=not options.insecure, tty=sys.stdin.isatty(),
+                                         sshOptions=sshOptions)
+    except NoSuchClusterException as e:
+        logger.error(e)
+        sys.exit(1)
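toil rsync-cluster and toil ssh-cluster now share the same failure handling: a cluster that cannot be found surfaces as a logged error and exit status 1 instead of an unhandled traceback. The common shape, sketched with a stand-in exception class:

```python
import logging
import sys
from typing import Callable

logger = logging.getLogger(__name__)

class NoSuchClusterException(Exception):
    """Stand-in for toil.provisioners.NoSuchClusterException."""

def run_cluster_command(action: Callable[[], None]) -> None:
    # Turn a missing cluster into a one-line error and a nonzero exit code,
    # as both utilities above now do around their leader calls.
    try:
        action()
    except NoSuchClusterException as e:
        logger.error(e)
        sys.exit(1)
```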
toil/utils/toilStats.py
CHANGED
@@ -451,7 +451,7 @@ def update_column_widths(tag: Expando, cw: ColumnWidths, options: Namespace) ->
     cw.set_width(category, field, len(s) + 1)
 
 
-def build_element(element: Expando, items: List[Job], item_name: str, defaults:
+def build_element(element: Expando, items: List[Job], item_name: str, defaults: Dict[str, float]) -> Expando:
     """Create an element for output."""
 
     def assertNonnegative(i: float, name: str) -> float:
@@ -696,6 +696,7 @@ def main() -> None:
     except NoSuchJobStoreException:
         logger.critical("The job store %s does not exist", config.jobStore)
         sys.exit(1)
+    logger.info('Gathering stats from jobstore... depending on the number of jobs, this may take a while (e.g. 10 jobs ~= 3 seconds; 100,000 jobs ~= 3,000 seconds or 50 minutes).')
     stats = get_stats(jobStore)
     collatedStatsTag = process_data(jobStore.config, stats)
     report_data(collatedStatsTag, options)
toil/utils/toilStatus.py
CHANGED
@@ -94,65 +94,99 @@ class ToilStatus:
             children += "\t(CHILD_JOB:%s,PRECEDENCE:%i)" % (childJob, level)
         print(children)
 
-    def printAggregateJobStats(self, properties: List[str], childNumber: int) -> None:
-        """
-
+    def printAggregateJobStats(self, properties: List[Set[str]], childNumber: List[int]) -> None:
+        """
+        Prints each job's ID, log file, remaining tries, and other properties.
+
+        :param properties: A set of string flag names for each job in self.jobsToReport.
+        :param childNumber: A list of child counts for each job in self.jobsToReport.
+        """
+        for job, job_properties, job_child_number in zip(self.jobsToReport, properties, childNumber):
 
             def lf(x: str) -> str:
-                return f"{x}:{str(x in
-
-
-
-
-
+                return f"{x}:{str(x in job_properties)}"
+            # We use a sort of not-really-machine-readable key:value TSV format here.
+            # But we only include important keys to help the humans, and flags
+            # don't have a value, just a key.
+            parts = [f"JOB:{job}"]
+            for flag in ["COMPLETELY_FAILED", "READY_TO_RUN", "IS_ZOMBIE", "HAS_SERVICES", "IS_SERVICE"]:
+                if flag in job_properties:
+                    parts.append(flag)
+            if job.logJobStoreFileID:
+                parts.append(f"LOG_FILE:{job.logJobStoreFileID}")
+            if job.remainingTryCount > 0:
+                parts.append(f"TRYS_REMAINING:{job.remainingTryCount}")
+            if job_child_number > 0:
+                parts.append(f"CHILD_NUMBER:{job_child_number}")
+
+            print("\t".join(parts))
 
     def report_on_jobs(self) -> Dict[str, Any]:
         """
         Gathers information about jobs such as its child jobs and status.
 
-        :returns jobStats:
-
+        :returns jobStats: Dict containing some lists of jobs by category, and
+                 some lists of job properties for each job in self.jobsToReport.
         """
+        # These are lists of the matching jobs
         hasChildren = []
         readyToRun = []
         zombies = []
        hasLogFile: List[JobDescription] = []
         hasServices = []
         services: List[ServiceJobDescription] = []
-
+        completely_failed = []
+
+        # These are stats for jobs in self.jobsToReport
+        child_number: List[int] = []
+        properties: List[Set[str]] = []
+
+        # TODO: This mix of semantics is confusing and made per-job status be
+        # wrong for multiple years because it was not understood. Redesign it!
 
         for job in self.jobsToReport:
+            job_properties: Set[str] = set()
             if job.logJobStoreFileID is not None:
                 hasLogFile.append(job)
 
-
-
+            job_child_number = len(list(job.allSuccessors()))
+            child_number.append(job_child_number)
+            if job_child_number > 0:  # Total number of successors > 0
                 hasChildren.append(job)
-
-            elif job.
-                # Job has no children and a
+                job_properties.add("HAS_CHILDREN")
+            elif job.has_body():
+                # Job has no children and a body to run. Indicates job could be run.
                 readyToRun.append(job)
-
+                job_properties.add("READY_TO_RUN")
             else:
                 # Job has no successors and no command, so is a zombie job.
                 zombies.append(job)
-
+                job_properties.add("IS_ZOMBIE")
             if job.services:
                 hasServices.append(job)
-
+                job_properties.add("HAS_SERVICES")
             if isinstance(job, ServiceJobDescription):
                 services.append(job)
-
-
-
-
-
-
-
-
-
-
+                job_properties.add("IS_SERVICE")
+            if job.remainingTryCount == 0:
+                # Job is out of tries (and thus completely failed)
+                job_properties.add("COMPLETELY_FAILED")
+                completely_failed.append(job)
+            properties.append(job_properties)
+
+        jobStats = {
+            # These are lists of the mathcing jobs
+            'hasChildren': hasChildren,
+            'readyToRun': readyToRun,
+            'zombies': zombies,
+            'hasServices': hasServices,
+            'services': services,
+            'hasLogFile': hasLogFile,
+            'completelyFailed': completely_failed,
+            # These are stats for jobs in self.jobsToReport
+            'properties': properties,
+            'childNumber': child_number
+        }
         return jobStats
 
     @staticmethod
@@ -251,8 +285,9 @@ class ToilStatus:
         """
         try:
             return self.jobStore.load_root_job()
-        except JobException:
-
+        except JobException as e:
+            logger.info(e)
+            print('Root job is absent. The workflow has may have completed successfully.')
             raise
 
     def fetchUserJobs(self, jobs: List[str]) -> List[JobDescription]:
@@ -326,7 +361,7 @@ def main() -> None:
                         help="Do not print overall, aggregate status of workflow.",
                         default=True)
 
-    parser.add_argument("--printDot", action="store_true",
+    parser.add_argument("--dot", "--printDot", dest="print_dot", action="store_true",
                         help="Print dot formatted description of the graph. If using --jobs will "
                              "restrict to subgraph including only those jobs. default=%(default)s",
                         default=False)
@@ -335,20 +370,24 @@ def main() -> None:
                         help="Restrict reporting to the following jobs (allows subsetting of the report).",
                         default=None)
 
-    parser.add_argument("--printPerJobStats", action="store_true",
+    parser.add_argument("--perJob", "--printPerJobStats", dest="print_per_job_stats", action="store_true",
                         help="Print info about each job. default=%(default)s",
                         default=False)
 
-    parser.add_argument("--printLogs", action="store_true",
+    parser.add_argument("--logs", "--printLogs", dest="print_logs", action="store_true",
                         help="Print the log files of jobs (if they exist). default=%(default)s",
                         default=False)
 
-    parser.add_argument("--printChildren", action="store_true",
+    parser.add_argument("--children", "--printChildren", dest="print_children", action="store_true",
                         help="Print children of each job. default=%(default)s",
                         default=False)
 
-    parser.add_argument("--printStatus", action="store_true",
+    parser.add_argument("--status", "--printStatus", dest="print_status", action="store_true",
                         help="Determine which jobs are currently running and the associated batch system ID")
+
+    parser.add_argument("--failed", "--printFailed", dest="print_failed", action="store_true",
+                        help="List jobs which seem to have failed to run")
+
     options = parser.parse_args()
     set_logging_from_options(options)
 
@@ -356,13 +395,10 @@ def main() -> None:
     parser.print_help()
     sys.exit(0)
 
-    config = Config()
-    config.setOptions(options)
-
     try:
-        status = ToilStatus(
+        status = ToilStatus(options.jobStore, options.jobs)
     except NoSuchJobStoreException:
-        print('
+        print(f'The job store {options.jobStore} was not found.')
         return
     except JobException:  # Workflow likely complete, user informed in ToilStatus()
         return
@@ -370,34 +406,44 @@ def main() -> None:
     jobStats = status.report_on_jobs()
 
     # Info to be reported.
+    # These are lists of matching jobs.
     hasChildren = jobStats['hasChildren']
     readyToRun = jobStats['readyToRun']
     zombies = jobStats['zombies']
     hasServices = jobStats['hasServices']
     services = jobStats['services']
     hasLogFile = jobStats['hasLogFile']
+    completely_failed = jobStats['completelyFailed']
+    # These are results for corresponding jobs in status.jobsToReport
    properties = jobStats['properties']
     childNumber = jobStats['childNumber']
 
-    if options.
+    if options.print_per_job_stats:
         status.printAggregateJobStats(properties, childNumber)
-    if options.
+    if options.print_logs:
         status.printJobLog()
-    if options.
+    if options.print_children:
         status.printJobChildren()
-    if options.
+    if options.print_dot:
         status.print_dot_chart()
+    if options.print_failed:
+        print("Failed jobs:")
+        for job in completely_failed:
+            print(job)
     if options.stats:
         print('Of the %i jobs considered, '
-          'there are
+              'there are '
+              '%i completely failed jobs, '
+              '%i jobs with children, '
              '%i jobs ready to run, '
              '%i zombie jobs, '
              '%i jobs with services, '
              '%i services, '
              'and %i jobs with log files currently in %s.' %
-          (len(status.jobsToReport), len(
-          len(
-    if options.printStatus:
+              (len(status.jobsToReport), len(completely_failed), len(hasChildren),
+               len(readyToRun), len(zombies), len(hasServices), len(services),
+               len(hasLogFile), status.jobStore))
+    if options.print_status:
         status.print_bus_messages()
     if len(status.jobsToReport) > 0 and options.failIfNotComplete:
         # Upon workflow completion, all jobs will have been removed from job store
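For reference, printAggregateJobStats above now prints one tab-separated record per job: bare flag names plus a few KEY:value fields, with zero-valued fields omitted (so a completely failed job shows no TRYS_REMAINING). A hypothetical record, with invented IDs:

```python
# Built the same way as the parts list in printAggregateJobStats above.
parts = [
    "JOB:kind-WDLTaskJob/instance-abc123",  # invented job store ID
    "COMPLETELY_FAILED",                    # flag: remainingTryCount == 0
    "LOG_FILE:files/logs/log-xyz789",       # invented log file ID
    "CHILD_NUMBER:2",
]
print("\t".join(parts))
```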
toil/version.py
CHANGED
@@ -1,14 +1,14 @@
-baseVersion = '6.1.0'
+baseVersion = '7.0.0'
 cgcloudVersion = '1.6.0a1.dev393'
-version = '
-cacheTag = 'cache-local-py3.
-mainCacheTag = 'cache-master-py3.
-distVersion = '
-exactPython = 'python3.
-python = 'python3.
-dockerTag = '
-currentCommit = '
+version = '7.0.0-d569ea5711eb310ffd5703803f7250ebf7c19576'
+cacheTag = 'cache-local-py3.9'
+mainCacheTag = 'cache-master-py3.9'
+distVersion = '7.0.0'
+exactPython = 'python3.9'
+python = 'python3.9'
+dockerTag = '7.0.0-d569ea5711eb310ffd5703803f7250ebf7c19576-py3.9'
+currentCommit = 'd569ea5711eb310ffd5703803f7250ebf7c19576'
 dockerRegistry = 'quay.io/ucsc_cgl'
 dockerName = 'toil'
 dirty = False
-cwltool_version = '3.1.
+cwltool_version = '3.1.20240508115724'