toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +22 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/contained_executor.py +4 -5
  6. toil/batchSystems/gridengine.py +1 -1
  7. toil/batchSystems/htcondor.py +5 -5
  8. toil/batchSystems/kubernetes.py +25 -11
  9. toil/batchSystems/local_support.py +3 -3
  10. toil/batchSystems/lsf.py +2 -2
  11. toil/batchSystems/mesos/batchSystem.py +4 -4
  12. toil/batchSystems/mesos/executor.py +3 -2
  13. toil/batchSystems/options.py +9 -0
  14. toil/batchSystems/singleMachine.py +11 -10
  15. toil/batchSystems/slurm.py +64 -22
  16. toil/batchSystems/torque.py +1 -1
  17. toil/bus.py +7 -3
  18. toil/common.py +36 -13
  19. toil/cwl/cwltoil.py +365 -312
  20. toil/deferred.py +1 -1
  21. toil/fileStores/abstractFileStore.py +17 -17
  22. toil/fileStores/cachingFileStore.py +2 -2
  23. toil/fileStores/nonCachingFileStore.py +1 -1
  24. toil/job.py +228 -60
  25. toil/jobStores/abstractJobStore.py +18 -10
  26. toil/jobStores/aws/jobStore.py +280 -218
  27. toil/jobStores/aws/utils.py +57 -29
  28. toil/jobStores/conftest.py +2 -2
  29. toil/jobStores/fileJobStore.py +2 -2
  30. toil/jobStores/googleJobStore.py +3 -4
  31. toil/leader.py +72 -24
  32. toil/lib/aws/__init__.py +26 -10
  33. toil/lib/aws/iam.py +2 -2
  34. toil/lib/aws/session.py +62 -22
  35. toil/lib/aws/utils.py +73 -37
  36. toil/lib/conversions.py +5 -1
  37. toil/lib/ec2.py +118 -69
  38. toil/lib/expando.py +1 -1
  39. toil/lib/io.py +14 -2
  40. toil/lib/misc.py +1 -3
  41. toil/lib/resources.py +55 -21
  42. toil/lib/retry.py +12 -5
  43. toil/lib/threading.py +2 -2
  44. toil/lib/throttle.py +1 -1
  45. toil/options/common.py +27 -24
  46. toil/provisioners/__init__.py +9 -3
  47. toil/provisioners/abstractProvisioner.py +9 -7
  48. toil/provisioners/aws/__init__.py +20 -15
  49. toil/provisioners/aws/awsProvisioner.py +406 -329
  50. toil/provisioners/gceProvisioner.py +2 -2
  51. toil/provisioners/node.py +13 -5
  52. toil/server/app.py +1 -1
  53. toil/statsAndLogging.py +58 -16
  54. toil/test/__init__.py +27 -12
  55. toil/test/batchSystems/batchSystemTest.py +40 -33
  56. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  57. toil/test/batchSystems/test_slurm.py +1 -1
  58. toil/test/cwl/cwlTest.py +8 -91
  59. toil/test/cwl/seqtk_seq.cwl +1 -1
  60. toil/test/docs/scriptsTest.py +10 -13
  61. toil/test/jobStores/jobStoreTest.py +33 -49
  62. toil/test/lib/aws/test_iam.py +2 -2
  63. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  64. toil/test/provisioners/clusterTest.py +90 -8
  65. toil/test/server/serverTest.py +2 -2
  66. toil/test/src/autoDeploymentTest.py +1 -1
  67. toil/test/src/dockerCheckTest.py +2 -1
  68. toil/test/src/environmentTest.py +125 -0
  69. toil/test/src/fileStoreTest.py +1 -1
  70. toil/test/src/jobDescriptionTest.py +18 -8
  71. toil/test/src/jobTest.py +1 -1
  72. toil/test/src/realtimeLoggerTest.py +4 -0
  73. toil/test/src/workerTest.py +52 -19
  74. toil/test/utils/toilDebugTest.py +61 -3
  75. toil/test/utils/utilsTest.py +20 -18
  76. toil/test/wdl/wdltoil_test.py +24 -71
  77. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  78. toil/toilState.py +68 -9
  79. toil/utils/toilDebugJob.py +153 -26
  80. toil/utils/toilLaunchCluster.py +12 -2
  81. toil/utils/toilRsyncCluster.py +7 -2
  82. toil/utils/toilSshCluster.py +7 -3
  83. toil/utils/toilStats.py +2 -1
  84. toil/utils/toilStatus.py +97 -51
  85. toil/version.py +10 -10
  86. toil/wdl/wdltoil.py +318 -51
  87. toil/worker.py +96 -69
  88. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  89. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
  90. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
  91. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  92. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  93. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/utils/toilDebugJob.py CHANGED
@@ -12,66 +12,193 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  """Debug tool for running a toil job locally."""
15
+ import gc
15
16
  import logging
16
-
17
+ import os
17
18
  import pprint
18
19
  import sys
19
20
 
21
+ from pathlib import Path
22
+ from typing import Optional, List, Tuple
23
+
20
24
  from toil.common import Config, Toil, parser_with_common_options
25
+ from toil.job import FilesDownloadedStoppingPointReached
21
26
  from toil.jobStores.fileJobStore import FileJobStore
22
27
  from toil.statsAndLogging import set_logging_from_options
23
28
  from toil.utils.toilDebugFile import printContentsOfJobStore
29
+ from toil.utils.toilStatus import ToilStatus
24
30
  from toil.worker import workerScript
25
31
 
26
32
  logger = logging.getLogger(__name__)
27
33
 
28
34
 
29
35
  def main() -> None:
30
- parser = parser_with_common_options(jobstore_option=True, prog="toil debug-job")
31
- parser.add_argument("jobID", type=str, nargs='?', default=None,
32
- help="The job store id of a job within the provided jobstore to run by itself.")
33
- parser.add_argument("--printJobInfo", type=str,
34
- help="Dump debugging info about this job ID")
36
+ parser = parser_with_common_options(jobstore_option=True, prog="toil debug-job", default_log_level=logging.DEBUG)
37
+ parser.add_argument("job", type=str,
38
+ help="The job store id or job name of a job within the provided jobstore")
39
+ parser.add_argument("--printJobInfo", action="store_true",
40
+ help="Dump debugging info about the job instead of running it")
41
+ parser.add_argument("--retrieveTaskDirectory", dest="retrieve_task_directory", type=str, default=None,
42
+ help="Download CWL or WDL task inputs to the given directory and stop.")
35
43
 
36
44
  options = parser.parse_args()
37
45
  set_logging_from_options(options)
38
46
 
47
+ if options.retrieve_task_directory is not None and os.path.exists(options.retrieve_task_directory):
48
+ # The logic to duplicate container mounts depends on stuff not already existing.
49
+ logger.error(
50
+ "The directory %s given for --retrieveTaskDirectory already exists. "
51
+ "Stopping to avoid clobbering existing files.",
52
+ options.retrieve_task_directory
53
+ )
54
+ sys.exit(1)
55
+
39
56
  jobStore = Toil.resumeJobStore(options.jobStore)
40
57
  # Get the config with the workflow ID from the job store
41
58
  config = jobStore.config
42
59
  # But override its options
43
60
  config.setOptions(options)
44
61
 
45
- did_something = False
62
+ # Find the job
63
+
64
+ if jobStore.job_exists(options.job):
65
+ # The user asked for a particular job and it exists
66
+ job_id = options.job
67
+ else:
68
+ # Go search by name and fill in job_id
69
+
70
+ # TODO: break out job store scan logic so it doesn't need to re-connect
71
+ # to the job store.
72
+ status = ToilStatus(options.jobStore)
73
+ hits = []
74
+ suggestion = None
75
+ for job in status.jobsToReport:
76
+ if options.job in (job.jobName, job.unitName, job.displayName):
77
+ # Find all the jobs that sort of match
78
+ hits.append(job)
79
+ if suggestion is None and job.remainingTryCount == 0:
80
+ # How about this nice failing job instead?
81
+ suggestion = job
82
+ if len(hits) == 0:
83
+ # No hits
84
+ if suggestion is None:
85
+ logger.critical("No job found with ID or name \"%s\". No jobs are completely failed.", options.job)
86
+ else:
87
+ logger.critical("No job found with ID or name \"%s\". How about the failed job %s instead?", options.job, suggestion)
88
+ sys.exit(1)
89
+ elif len(hits) > 1:
90
+ # Several hits, maybe only one has failed
91
+ completely_failed_hits = [job for job in hits if job.remainingTryCount == 0]
92
+ if len(completely_failed_hits) == 0:
93
+ logger.critical("Multiple jobs match \"%s\" but none are completely failed: %s", options.job, hits)
94
+ sys.exit(1)
95
+ elif len(completely_failed_hits) > 0:
96
+ logger.critical("Multiple jobs matching \"%s\" are completely failed: %s", options.job, completely_failed_hits)
97
+ sys.exit(1)
98
+ else:
99
+ # We found one completely failed job, they probably mean that one.
100
+ logger.info("There are %s jobs matching \"%s\"; assuming you mean the failed one: %s", options.job, completely_failed_hits[0])
101
+ job_id = completely_failed_hits[0].jobStoreID
102
+ else:
103
+ # We found one job with this name, so they must mean that one
104
+ logger.info("Looked up job named \"%s\": %s", options.job, hits[0])
105
+ job_id = hits[0].jobStoreID
46
106
 
47
107
  if options.printJobInfo:
108
+ # Report on the job
109
+
48
110
  if isinstance(jobStore, FileJobStore):
49
111
  # List all its files if we can
50
- printContentsOfJobStore(job_store=jobStore, job_id=options.printJobInfo)
112
+ printContentsOfJobStore(job_store=jobStore, job_id=job_id)
51
113
  # Print the job description itself
52
- job_desc = jobStore.load_job(options.printJobInfo)
114
+ job_desc = jobStore.load_job(job_id)
53
115
  print(f"Job: {job_desc}")
54
116
  pprint.pprint(job_desc.__dict__)
117
+ else:
118
+ # Run the job
55
119
 
56
- did_something = True
120
+ debug_flags = set()
121
+ local_worker_temp_dir = None
122
+ if options.retrieve_task_directory is not None:
123
+ # Pick a directory in it (which may be removed by the worker) as the worker's temp dir.
124
+ local_worker_temp_dir = os.path.join(options.retrieve_task_directory, "worker")
125
+ # Make sure it exists
126
+ os.makedirs(local_worker_temp_dir, exist_ok=True)
127
+ # And tell the job to just download files
128
+ debug_flags.add("download_only")
129
+ # We might need to reconstruct a container environment.
130
+ host_and_job_paths: Optional[List[Tuple[str, str]]] = None
131
+ # Track if the run succeeded without error
132
+ run_succeeded = False
57
133
 
58
- # TODO: Option to print list of successor jobs
59
- # TODO: Option to run job within python debugger, allowing step through of arguments
60
- # idea would be to have option to import pdb and set breakpoint at the start of the user's code
134
+ logger.info(f"Running the following job locally: {job_id}")
135
+ try:
136
+ workerScript(jobStore, config, job_id, job_id, redirect_output_to_log_file=False, local_worker_temp_dir=local_worker_temp_dir, debug_flags=debug_flags)
137
+ except FilesDownloadedStoppingPointReached as e:
138
+ # We asked for the files to be downloaded and now they are.
139
+ assert options.retrieve_task_directory is not None
140
+ if e.host_and_job_paths is not None:
141
+ # Capture the container mapping so we can reconstruct the container environment after we unwind the worker stack.
142
+ host_and_job_paths = e.host_and_job_paths
143
+ else:
144
+ # No error!
145
+ run_succeeded = True
61
146
 
62
- if options.jobID is not None:
63
- # We actually want to run a job.
147
+ # Make sure the deferred function manager cleans up and logs its
148
+ # shutdown before we start writing any reports.
149
+ gc.collect()
64
150
 
65
- jobID = options.jobID
66
- logger.debug(f"Running the following job locally: {jobID}")
67
- workerScript(jobStore, config, jobID, jobID, redirectOutputToLogFile=False)
68
- logger.debug(f"Finished running: {jobID}")
69
- # Even if the job fails, the worker script succeeds unless something goes wrong with it internally.
151
+ if run_succeeded:
152
+ logger.info(f"Successfully ran: {job_id}")
70
153
 
71
- did_something = True
154
+ if host_and_job_paths is not None:
155
+ # We need to make a place that looks like the job paths half of these.
72
156
 
73
- if not did_something:
74
- # Somebody forgot to tell us to do anything.
75
- # Show the usage instructions.
76
- parser.print_help()
77
- sys.exit(1)
157
+ # Sort by job-side path so we do children before parents, to
158
+ # stop us from accidentally making children inside mounted
159
+ # parents.
160
+ sorted_mounts = sorted(host_and_job_paths, key=lambda t: t[1], reverse=True)
161
+
162
+ fake_job_root = os.path.join(options.retrieve_task_directory, "inside")
163
+ os.makedirs(fake_job_root, exist_ok=True)
164
+
165
+ for host_path, job_path in sorted_mounts:
166
+ if not os.path.exists(host_path):
167
+ logger.error("Job intended to mount %s as %s but it does not exist!", host_path, job_path)
168
+ continue
169
+ if not job_path.startswith("/"):
170
+ logger.error("Job intended to mount %s as %s but destination is a relative path!", host_path, job_path)
171
+ continue
172
+ # Drop the slash because we are building a chroot-ish mini filesystem.
173
+ job_relative_path = job_path[1:]
174
+ if job_relative_path.startswith("/"):
175
+ # We are having trouble understanding what the job
176
+ # intended to do. Stop working on this mount.
177
+ logger.error("Job intended to mount %s as %s but destination starts with multiple slashes for some reason!", host_path, job_path)
178
+ continue
179
+ fake_job_path = os.path.join(fake_job_root, job_relative_path)
180
+ if os.path.exists(fake_job_path):
181
+ logger.error("Job intended to mount %s as %s but that location is already mounted!", host_path, job_path)
182
+ continue
183
+
184
+ logger.info("Job mounted %s as %s", host_path, job_path)
185
+
186
+ # Make sure the directory to contain the mount exists.
187
+ fake_job_containing_path = os.path.dirname(fake_job_path)
188
+ os.makedirs(fake_job_containing_path, exist_ok=True)
189
+
190
+ top_pathobj = Path(os.path.abspath(options.retrieve_task_directory))
191
+ source_pathobj = Path(host_path)
192
+ if top_pathobj in source_pathobj.parents:
193
+ # We're linking to a file we already downloaded (probably).
194
+ # Make a relative symlink so the whole assemblage can move.
195
+ host_path = os.path.relpath(host_path, fake_job_containing_path)
196
+
197
+ # Make a symlink to simulate the mount
198
+ os.symlink(host_path, fake_job_path)
199
+
200
+ logger.info("Reconstructed job container filesystem at %s", fake_job_root)
201
+
202
+ # TODO: Option to print list of successor jobs
203
+ # TODO: Option to run job within python debugger, allowing step through of arguments
204
+ # idea would be to have option to import pdb and set breakpoint at the start of the user's code
toil/utils/toilLaunchCluster.py CHANGED
@@ -20,6 +20,7 @@ from typing import Dict, List, Tuple, Union
20
20
  from toil import applianceSelf
21
21
  from toil.common import parser_with_common_options
22
22
  from toil.lib.aws import build_tag_dict_from_env
23
+ from toil.lib.conversions import opt_strtobool
23
24
  from toil.provisioners import (check_valid_node_types,
24
25
  cluster_factory,
25
26
  parse_node_types)
@@ -31,7 +32,11 @@ logger = logging.getLogger(__name__)
31
32
  def create_tags_dict(tags: List[str]) -> Dict[str, str]:
32
33
  tags_dict = dict()
33
34
  for tag in tags:
34
- key, value = tag.split('=')
35
+ try:
36
+ key, value = tag.split('=')
37
+ except ValueError:
38
+ logger.error("Tag specification '%s' must contain '='", tag)
39
+ raise
35
40
  tags_dict[key] = value
36
41
  return tags_dict
37
42
 
@@ -114,6 +119,10 @@ def main() -> None:
114
119
  help="Any additional security groups to attach to EC2 instances. Note that a security group "
115
120
  "with its name equal to the cluster name will always be created, thus ensure that "
116
121
  "the extra security groups do not have the same name as the cluster name.")
122
+ parser.add_argument("--allowFuse", type=opt_strtobool, default=True,
123
+ help="Enable both the leader and worker nodes to be able to run Singularity with FUSE. For "
124
+ "Kubernetes, this will make the leader privileged and ask workers to run as privileged. "
125
+ "(default: %(default)s)")
117
126
  #TODO Set Aws Profile in CLI options
118
127
  options = parser.parse_args()
119
128
  set_logging_from_options(options)
@@ -178,7 +187,8 @@ def main() -> None:
178
187
  clusterName=options.clusterName,
179
188
  clusterType=options.clusterType,
180
189
  zone=options.zone,
181
- nodeStorage=options.nodeStorage)
190
+ nodeStorage=options.nodeStorage,
191
+ enable_fuse=options.allowFuse)
182
192
 
183
193
  cluster.launchCluster(leaderNodeType=options.leaderNodeType,
184
194
  leaderStorage=options.leaderStorage,
toil/utils/toilRsyncCluster.py CHANGED
@@ -14,9 +14,10 @@
14
14
  """Rsyncs into the toil appliance container running on the leader of the cluster."""
15
15
  import argparse
16
16
  import logging
17
+ import sys
17
18
 
18
19
  from toil.common import parser_with_common_options
19
- from toil.provisioners import cluster_factory
20
+ from toil.provisioners import cluster_factory, NoSuchClusterException
20
21
  from toil.statsAndLogging import set_logging_from_options
21
22
 
22
23
  logger = logging.getLogger(__name__)
@@ -37,4 +38,8 @@ def main() -> None:
37
38
  cluster = cluster_factory(provisioner=options.provisioner,
38
39
  clusterName=options.clusterName,
39
40
  zone=options.zone)
40
- cluster.getLeader().coreRsync(args=options.args, strict=not options.insecure)
41
+ try:
42
+ cluster.getLeader().coreRsync(args=options.args, strict=not options.insecure)
43
+ except NoSuchClusterException as e:
44
+ logger.error(e)
45
+ sys.exit(1)
toil/utils/toilSshCluster.py CHANGED
@@ -18,7 +18,7 @@ import sys
18
18
  from typing import List
19
19
 
20
20
  from toil.common import parser_with_common_options
21
- from toil.provisioners import cluster_factory
21
+ from toil.provisioners import cluster_factory, NoSuchClusterException
22
22
  from toil.statsAndLogging import set_logging_from_options
23
23
 
24
24
  logger = logging.getLogger(__name__)
@@ -54,5 +54,9 @@ def main() -> None:
54
54
  sshOptions.extend(['-L', f'{options.grafana_port}:localhost:3000',
55
55
  '-L', '9090:localhost:9090'])
56
56
 
57
- cluster.getLeader().sshAppliance(*command, strict=not options.insecure, tty=sys.stdin.isatty(),
58
- sshOptions=sshOptions)
57
+ try:
58
+ cluster.getLeader().sshAppliance(*command, strict=not options.insecure, tty=sys.stdin.isatty(),
59
+ sshOptions=sshOptions)
60
+ except NoSuchClusterException as e:
61
+ logger.error(e)
62
+ sys.exit(1)
toil/utils/toilStats.py CHANGED
@@ -451,7 +451,7 @@ def update_column_widths(tag: Expando, cw: ColumnWidths, options: Namespace) ->
451
451
  cw.set_width(category, field, len(s) + 1)
452
452
 
453
453
 
454
- def build_element(element: Expando, items: List[Job], item_name: str, defaults: dict[str, float]) -> Expando:
454
+ def build_element(element: Expando, items: List[Job], item_name: str, defaults: Dict[str, float]) -> Expando:
455
455
  """Create an element for output."""
456
456
 
457
457
  def assertNonnegative(i: float, name: str) -> float:
@@ -696,6 +696,7 @@ def main() -> None:
696
696
  except NoSuchJobStoreException:
697
697
  logger.critical("The job store %s does not exist", config.jobStore)
698
698
  sys.exit(1)
699
+ logger.info('Gathering stats from jobstore... depending on the number of jobs, this may take a while (e.g. 10 jobs ~= 3 seconds; 100,000 jobs ~= 3,000 seconds or 50 minutes).')
699
700
  stats = get_stats(jobStore)
700
701
  collatedStatsTag = process_data(jobStore.config, stats)
701
702
  report_data(collatedStatsTag, options)
toil/utils/toilStatus.py CHANGED
@@ -94,65 +94,99 @@ class ToilStatus:
94
94
  children += "\t(CHILD_JOB:%s,PRECEDENCE:%i)" % (childJob, level)
95
95
  print(children)
96
96
 
97
- def printAggregateJobStats(self, properties: List[str], childNumber: int) -> None:
98
- """Prints a job's ID, log file, remaining tries, and other properties."""
99
- for job in self.jobsToReport:
97
+ def printAggregateJobStats(self, properties: List[Set[str]], childNumber: List[int]) -> None:
98
+ """
99
+ Prints each job's ID, log file, remaining tries, and other properties.
100
+
101
+ :param properties: A set of string flag names for each job in self.jobsToReport.
102
+ :param childNumber: A list of child counts for each job in self.jobsToReport.
103
+ """
104
+ for job, job_properties, job_child_number in zip(self.jobsToReport, properties, childNumber):
100
105
 
101
106
  def lf(x: str) -> str:
102
- return f"{x}:{str(x in properties)}"
103
- print("\t".join(("JOB:%s" % job,
104
- "LOG_FILE:%s" % job.logJobStoreFileID,
105
- "TRYS_REMAINING:%i" % job.remainingTryCount,
106
- "CHILD_NUMBER:%s" % childNumber,
107
- lf("READY_TO_RUN"), lf("IS_ZOMBIE"),
108
- lf("HAS_SERVICES"), lf("IS_SERVICE"))))
107
+ return f"{x}:{str(x in job_properties)}"
108
+ # We use a sort of not-really-machine-readable key:value TSV format here.
109
+ # But we only include important keys to help the humans, and flags
110
+ # don't have a value, just a key.
111
+ parts = [f"JOB:{job}"]
112
+ for flag in ["COMPLETELY_FAILED", "READY_TO_RUN", "IS_ZOMBIE", "HAS_SERVICES", "IS_SERVICE"]:
113
+ if flag in job_properties:
114
+ parts.append(flag)
115
+ if job.logJobStoreFileID:
116
+ parts.append(f"LOG_FILE:{job.logJobStoreFileID}")
117
+ if job.remainingTryCount > 0:
118
+ parts.append(f"TRYS_REMAINING:{job.remainingTryCount}")
119
+ if job_child_number > 0:
120
+ parts.append(f"CHILD_NUMBER:{job_child_number}")
121
+
122
+ print("\t".join(parts))
109
123
 
110
124
  def report_on_jobs(self) -> Dict[str, Any]:
111
125
  """
112
126
  Gathers information about jobs such as its child jobs and status.
113
127
 
114
- :returns jobStats: Pairings of a useful category and a list of jobs which fall into it.
115
- :rtype dict:
128
+ :returns jobStats: Dict containing some lists of jobs by category, and
129
+ some lists of job properties for each job in self.jobsToReport.
116
130
  """
131
+ # These are lists of the matching jobs
117
132
  hasChildren = []
118
133
  readyToRun = []
119
134
  zombies = []
120
135
  hasLogFile: List[JobDescription] = []
121
136
  hasServices = []
122
137
  services: List[ServiceJobDescription] = []
123
- properties = set()
138
+ completely_failed = []
139
+
140
+ # These are stats for jobs in self.jobsToReport
141
+ child_number: List[int] = []
142
+ properties: List[Set[str]] = []
143
+
144
+ # TODO: This mix of semantics is confusing and made per-job status be
145
+ # wrong for multiple years because it was not understood. Redesign it!
124
146
 
125
147
  for job in self.jobsToReport:
148
+ job_properties: Set[str] = set()
126
149
  if job.logJobStoreFileID is not None:
127
150
  hasLogFile.append(job)
128
151
 
129
- childNumber = len(list(job.allSuccessors()))
130
- if childNumber > 0: # Total number of successors > 0
152
+ job_child_number = len(list(job.allSuccessors()))
153
+ child_number.append(job_child_number)
154
+ if job_child_number > 0: # Total number of successors > 0
131
155
  hasChildren.append(job)
132
- properties.add("HAS_CHILDREN")
133
- elif job.command is not None:
134
- # Job has no children and a command to run. Indicates job could be run.
156
+ job_properties.add("HAS_CHILDREN")
157
+ elif job.has_body():
158
+ # Job has no children and a body to run. Indicates job could be run.
135
159
  readyToRun.append(job)
136
- properties.add("READY_TO_RUN")
160
+ job_properties.add("READY_TO_RUN")
137
161
  else:
138
162
  # Job has no successors and no command, so is a zombie job.
139
163
  zombies.append(job)
140
- properties.add("IS_ZOMBIE")
164
+ job_properties.add("IS_ZOMBIE")
141
165
  if job.services:
142
166
  hasServices.append(job)
143
- properties.add("HAS_SERVICES")
167
+ job_properties.add("HAS_SERVICES")
144
168
  if isinstance(job, ServiceJobDescription):
145
169
  services.append(job)
146
- properties.add("IS_SERVICE")
147
-
148
- jobStats = {'hasChildren': hasChildren,
149
- 'readyToRun': readyToRun,
150
- 'zombies': zombies,
151
- 'hasServices': hasServices,
152
- 'services': services,
153
- 'hasLogFile': hasLogFile,
154
- 'properties': properties,
155
- 'childNumber': childNumber}
170
+ job_properties.add("IS_SERVICE")
171
+ if job.remainingTryCount == 0:
172
+ # Job is out of tries (and thus completely failed)
173
+ job_properties.add("COMPLETELY_FAILED")
174
+ completely_failed.append(job)
175
+ properties.append(job_properties)
176
+
177
+ jobStats = {
178
+ # These are lists of the matching jobs
179
+ 'hasChildren': hasChildren,
180
+ 'readyToRun': readyToRun,
181
+ 'zombies': zombies,
182
+ 'hasServices': hasServices,
183
+ 'services': services,
184
+ 'hasLogFile': hasLogFile,
185
+ 'completelyFailed': completely_failed,
186
+ # These are stats for jobs in self.jobsToReport
187
+ 'properties': properties,
188
+ 'childNumber': child_number
189
+ }
156
190
  return jobStats
157
191
 
158
192
  @staticmethod
@@ -251,8 +285,9 @@ class ToilStatus:
251
285
  """
252
286
  try:
253
287
  return self.jobStore.load_root_job()
254
- except JobException:
255
- print('Root job is absent. The workflow has may have completed successfully.', file=sys.stderr)
288
+ except JobException as e:
289
+ logger.info(e)
290
+ print('Root job is absent. The workflow has may have completed successfully.')
256
291
  raise
257
292
 
258
293
  def fetchUserJobs(self, jobs: List[str]) -> List[JobDescription]:
@@ -326,7 +361,7 @@ def main() -> None:
326
361
  help="Do not print overall, aggregate status of workflow.",
327
362
  default=True)
328
363
 
329
- parser.add_argument("--printDot", action="store_true",
364
+ parser.add_argument("--dot", "--printDot", dest="print_dot", action="store_true",
330
365
  help="Print dot formatted description of the graph. If using --jobs will "
331
366
  "restrict to subgraph including only those jobs. default=%(default)s",
332
367
  default=False)
@@ -335,20 +370,24 @@ def main() -> None:
335
370
  help="Restrict reporting to the following jobs (allows subsetting of the report).",
336
371
  default=None)
337
372
 
338
- parser.add_argument("--printPerJobStats", action="store_true",
373
+ parser.add_argument("--perJob", "--printPerJobStats", dest="print_per_job_stats", action="store_true",
339
374
  help="Print info about each job. default=%(default)s",
340
375
  default=False)
341
376
 
342
- parser.add_argument("--printLogs", action="store_true",
377
+ parser.add_argument("--logs", "--printLogs", dest="print_logs", action="store_true",
343
378
  help="Print the log files of jobs (if they exist). default=%(default)s",
344
379
  default=False)
345
380
 
346
- parser.add_argument("--printChildren", action="store_true",
381
+ parser.add_argument("--children", "--printChildren", dest="print_children", action="store_true",
347
382
  help="Print children of each job. default=%(default)s",
348
383
  default=False)
349
384
 
350
- parser.add_argument("--printStatus", action="store_true",
385
+ parser.add_argument("--status", "--printStatus", dest="print_status", action="store_true",
351
386
  help="Determine which jobs are currently running and the associated batch system ID")
387
+
388
+ parser.add_argument("--failed", "--printFailed", dest="print_failed", action="store_true",
389
+ help="List jobs which seem to have failed to run")
390
+
352
391
  options = parser.parse_args()
353
392
  set_logging_from_options(options)
354
393
 
@@ -356,13 +395,10 @@ def main() -> None:
356
395
  parser.print_help()
357
396
  sys.exit(0)
358
397
 
359
- config = Config()
360
- config.setOptions(options)
361
-
362
398
  try:
363
- status = ToilStatus(config.jobStore, options.jobs)
399
+ status = ToilStatus(options.jobStore, options.jobs)
364
400
  except NoSuchJobStoreException:
365
- print('No job store found.')
401
+ print(f'The job store {options.jobStore} was not found.')
366
402
  return
367
403
  except JobException: # Workflow likely complete, user informed in ToilStatus()
368
404
  return
@@ -370,34 +406,44 @@ def main() -> None:
370
406
  jobStats = status.report_on_jobs()
371
407
 
372
408
  # Info to be reported.
409
+ # These are lists of matching jobs.
373
410
  hasChildren = jobStats['hasChildren']
374
411
  readyToRun = jobStats['readyToRun']
375
412
  zombies = jobStats['zombies']
376
413
  hasServices = jobStats['hasServices']
377
414
  services = jobStats['services']
378
415
  hasLogFile = jobStats['hasLogFile']
416
+ completely_failed = jobStats['completelyFailed']
417
+ # These are results for corresponding jobs in status.jobsToReport
379
418
  properties = jobStats['properties']
380
419
  childNumber = jobStats['childNumber']
381
420
 
382
- if options.printPerJobStats:
421
+ if options.print_per_job_stats:
383
422
  status.printAggregateJobStats(properties, childNumber)
384
- if options.printLogs:
423
+ if options.print_logs:
385
424
  status.printJobLog()
386
- if options.printChildren:
425
+ if options.print_children:
387
426
  status.printJobChildren()
388
- if options.printDot:
427
+ if options.print_dot:
389
428
  status.print_dot_chart()
429
+ if options.print_failed:
430
+ print("Failed jobs:")
431
+ for job in completely_failed:
432
+ print(job)
390
433
  if options.stats:
391
434
  print('Of the %i jobs considered, '
392
- 'there are %i jobs with children, '
435
+ 'there are '
436
+ '%i completely failed jobs, '
437
+ '%i jobs with children, '
393
438
  '%i jobs ready to run, '
394
439
  '%i zombie jobs, '
395
440
  '%i jobs with services, '
396
441
  '%i services, '
397
442
  'and %i jobs with log files currently in %s.' %
398
- (len(status.jobsToReport), len(hasChildren), len(readyToRun), len(zombies),
399
- len(hasServices), len(services), len(hasLogFile), status.jobStore))
400
- if options.printStatus:
443
+ (len(status.jobsToReport), len(completely_failed), len(hasChildren),
444
+ len(readyToRun), len(zombies), len(hasServices), len(services),
445
+ len(hasLogFile), status.jobStore))
446
+ if options.print_status:
401
447
  status.print_bus_messages()
402
448
  if len(status.jobsToReport) > 0 and options.failIfNotComplete:
403
449
  # Upon workflow completion, all jobs will have been removed from job store
toil/version.py CHANGED
@@ -1,14 +1,14 @@
1
- baseVersion = '6.1.0'
1
+ baseVersion = '7.0.0'
2
2
  cgcloudVersion = '1.6.0a1.dev393'
3
- version = '6.1.0-3f9cba3766e52866ea80d0934498f8c8f3129c3f'
4
- cacheTag = 'cache-local-py3.10'
5
- mainCacheTag = 'cache-master-py3.10'
6
- distVersion = '6.1.0'
7
- exactPython = 'python3.10'
8
- python = 'python3.10'
9
- dockerTag = '6.1.0-3f9cba3766e52866ea80d0934498f8c8f3129c3f-py3.10'
10
- currentCommit = '3f9cba3766e52866ea80d0934498f8c8f3129c3f'
3
+ version = '7.0.0-d569ea5711eb310ffd5703803f7250ebf7c19576'
4
+ cacheTag = 'cache-local-py3.9'
5
+ mainCacheTag = 'cache-master-py3.9'
6
+ distVersion = '7.0.0'
7
+ exactPython = 'python3.9'
8
+ python = 'python3.9'
9
+ dockerTag = '7.0.0-d569ea5711eb310ffd5703803f7250ebf7c19576-py3.9'
10
+ currentCommit = 'd569ea5711eb310ffd5703803f7250ebf7c19576'
11
11
  dockerRegistry = 'quay.io/ucsc_cgl'
12
12
  dockerName = 'toil'
13
13
  dirty = False
14
- cwltool_version = '3.1.20240112164112'
14
+ cwltool_version = '3.1.20240508115724'