toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +41 -17
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +4 -5
  7. toil/batchSystems/gridengine.py +1 -1
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +25 -11
  10. toil/batchSystems/local_support.py +3 -3
  11. toil/batchSystems/lsf.py +9 -9
  12. toil/batchSystems/mesos/batchSystem.py +4 -4
  13. toil/batchSystems/mesos/executor.py +3 -2
  14. toil/batchSystems/options.py +9 -0
  15. toil/batchSystems/singleMachine.py +11 -10
  16. toil/batchSystems/slurm.py +129 -16
  17. toil/batchSystems/torque.py +1 -1
  18. toil/bus.py +45 -3
  19. toil/common.py +56 -31
  20. toil/cwl/cwltoil.py +442 -371
  21. toil/deferred.py +1 -1
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/abstractFileStore.py +69 -20
  24. toil/fileStores/cachingFileStore.py +6 -22
  25. toil/fileStores/nonCachingFileStore.py +6 -15
  26. toil/job.py +270 -86
  27. toil/jobStores/abstractJobStore.py +37 -31
  28. toil/jobStores/aws/jobStore.py +280 -218
  29. toil/jobStores/aws/utils.py +60 -31
  30. toil/jobStores/conftest.py +2 -2
  31. toil/jobStores/fileJobStore.py +3 -3
  32. toil/jobStores/googleJobStore.py +3 -4
  33. toil/leader.py +89 -38
  34. toil/lib/aws/__init__.py +26 -10
  35. toil/lib/aws/iam.py +2 -2
  36. toil/lib/aws/session.py +62 -22
  37. toil/lib/aws/utils.py +73 -37
  38. toil/lib/conversions.py +24 -1
  39. toil/lib/ec2.py +118 -69
  40. toil/lib/expando.py +1 -1
  41. toil/lib/generatedEC2Lists.py +8 -8
  42. toil/lib/io.py +42 -4
  43. toil/lib/misc.py +1 -3
  44. toil/lib/resources.py +57 -16
  45. toil/lib/retry.py +12 -5
  46. toil/lib/threading.py +29 -14
  47. toil/lib/throttle.py +1 -1
  48. toil/options/common.py +31 -30
  49. toil/options/wdl.py +5 -0
  50. toil/provisioners/__init__.py +9 -3
  51. toil/provisioners/abstractProvisioner.py +12 -2
  52. toil/provisioners/aws/__init__.py +20 -15
  53. toil/provisioners/aws/awsProvisioner.py +406 -329
  54. toil/provisioners/gceProvisioner.py +2 -2
  55. toil/provisioners/node.py +13 -5
  56. toil/server/app.py +1 -1
  57. toil/statsAndLogging.py +93 -23
  58. toil/test/__init__.py +27 -12
  59. toil/test/batchSystems/batchSystemTest.py +40 -33
  60. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  61. toil/test/batchSystems/test_slurm.py +22 -7
  62. toil/test/cactus/__init__.py +0 -0
  63. toil/test/cactus/test_cactus_integration.py +58 -0
  64. toil/test/cwl/cwlTest.py +245 -236
  65. toil/test/cwl/seqtk_seq.cwl +1 -1
  66. toil/test/docs/scriptsTest.py +11 -14
  67. toil/test/jobStores/jobStoreTest.py +40 -54
  68. toil/test/lib/aws/test_iam.py +2 -2
  69. toil/test/lib/test_ec2.py +1 -1
  70. toil/test/options/__init__.py +13 -0
  71. toil/test/options/options.py +37 -0
  72. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  73. toil/test/provisioners/clusterTest.py +99 -16
  74. toil/test/server/serverTest.py +2 -2
  75. toil/test/src/autoDeploymentTest.py +1 -1
  76. toil/test/src/dockerCheckTest.py +2 -1
  77. toil/test/src/environmentTest.py +125 -0
  78. toil/test/src/fileStoreTest.py +1 -1
  79. toil/test/src/jobDescriptionTest.py +18 -8
  80. toil/test/src/jobTest.py +1 -1
  81. toil/test/src/realtimeLoggerTest.py +4 -0
  82. toil/test/src/workerTest.py +52 -19
  83. toil/test/utils/toilDebugTest.py +62 -4
  84. toil/test/utils/utilsTest.py +23 -21
  85. toil/test/wdl/wdltoil_test.py +49 -21
  86. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  87. toil/toilState.py +68 -9
  88. toil/utils/toilDebugFile.py +1 -1
  89. toil/utils/toilDebugJob.py +153 -26
  90. toil/utils/toilLaunchCluster.py +12 -2
  91. toil/utils/toilRsyncCluster.py +7 -2
  92. toil/utils/toilSshCluster.py +7 -3
  93. toil/utils/toilStats.py +310 -266
  94. toil/utils/toilStatus.py +98 -52
  95. toil/version.py +11 -11
  96. toil/wdl/wdltoil.py +644 -225
  97. toil/worker.py +125 -83
  98. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  99. toil-7.0.0.dist-info/METADATA +158 -0
  100. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
  101. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  102. toil-6.1.0a1.dist-info/METADATA +0 -125
  103. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  104. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
@@ -12,66 +12,193 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  """Debug tool for running a toil job locally."""
15
+ import gc
15
16
  import logging
16
-
17
+ import os
17
18
  import pprint
18
19
  import sys
19
20
 
21
+ from pathlib import Path
22
+ from typing import Optional, List, Tuple
23
+
20
24
  from toil.common import Config, Toil, parser_with_common_options
25
+ from toil.job import FilesDownloadedStoppingPointReached
21
26
  from toil.jobStores.fileJobStore import FileJobStore
22
27
  from toil.statsAndLogging import set_logging_from_options
23
28
  from toil.utils.toilDebugFile import printContentsOfJobStore
29
+ from toil.utils.toilStatus import ToilStatus
24
30
  from toil.worker import workerScript
25
31
 
26
32
  logger = logging.getLogger(__name__)
27
33
 
28
34
 
29
35
  def main() -> None:
30
- parser = parser_with_common_options(jobstore_option=True, prog="toil debug-job")
31
- parser.add_argument("jobID", type=str, nargs='?', default=None,
32
- help="The job store id of a job within the provided jobstore to run by itself.")
33
- parser.add_argument("--printJobInfo", type=str,
34
- help="Dump debugging info about this job ID")
36
+ parser = parser_with_common_options(jobstore_option=True, prog="toil debug-job", default_log_level=logging.DEBUG)
37
+ parser.add_argument("job", type=str,
38
+ help="The job store id or job name of a job within the provided jobstore")
39
+ parser.add_argument("--printJobInfo", action="store_true",
40
+ help="Dump debugging info about the job instead of running it")
41
+ parser.add_argument("--retrieveTaskDirectory", dest="retrieve_task_directory", type=str, default=None,
42
+ help="Download CWL or WDL task inputs to the given directory and stop.")
35
43
 
36
44
  options = parser.parse_args()
37
45
  set_logging_from_options(options)
38
46
 
47
+ if options.retrieve_task_directory is not None and os.path.exists(options.retrieve_task_directory):
48
+ # The logic to duplicate container mounts depends on stuff not already existing.
49
+ logger.error(
50
+ "The directory %s given for --retrieveTaskDirectory already exists. "
51
+ "Stopping to avoid clobbering existing files.",
52
+ options.retrieve_task_directory
53
+ )
54
+ sys.exit(1)
55
+
39
56
  jobStore = Toil.resumeJobStore(options.jobStore)
40
57
  # Get the config with the workflow ID from the job store
41
58
  config = jobStore.config
42
59
  # But override its options
43
60
  config.setOptions(options)
44
61
 
45
- did_something = False
62
+ # Find the job
63
+
64
+ if jobStore.job_exists(options.job):
65
+ # The user asked for a particular job and it exists
66
+ job_id = options.job
67
+ else:
68
+ # Go search by name and fill in job_id
69
+
70
+ # TODO: break out job store scan logic so it doesn't need to re-connect
71
+ # to the job store.
72
+ status = ToilStatus(options.jobStore)
73
+ hits = []
74
+ suggestion = None
75
+ for job in status.jobsToReport:
76
+ if options.job in (job.jobName, job.unitName, job.displayName):
77
+ # Find all the jobs that sort of match
78
+ hits.append(job)
79
+ if suggestion is None and job.remainingTryCount == 0:
80
+ # How about this nice failing job instead?
81
+ suggestion = job
82
+ if len(hits) == 0:
83
+ # No hits
84
+ if suggestion is None:
85
+ logger.critical("No job found with ID or name \"%s\". No jobs are completely failed.", options.job)
86
+ else:
87
+ logger.critical("No job found with ID or name \"%s\". How about the failed job %s instead?", options.job, suggestion)
88
+ sys.exit(1)
89
+ elif len(hits) > 1:
90
+ # Several hits, maybe only one has failed
91
+ completely_failed_hits = [job for job in hits if job.remainingTryCount == 0]
92
+ if len(completely_failed_hits) == 0:
93
+ logger.critical("Multiple jobs match \"%s\" but none are completely failed: %s", options.job, hits)
94
+ sys.exit(1)
95
+ elif len(completely_failed_hits) > 0:
96
+ logger.critical("Multiple jobs matching \"%s\" are completely failed: %s", options.job, completely_failed_hits)
97
+ sys.exit(1)
98
+ else:
99
+ # We found one completely failed job, they probably mean that one.
100
+ logger.info("There are %s jobs matching \"%s\"; assuming you mean the failed one: %s", options.job, completely_failed_hits[0])
101
+ job_id = completely_failed_hits[0].jobStoreID
102
+ else:
103
+ # We found one job with this name, so they must mean that one
104
+ logger.info("Looked up job named \"%s\": %s", options.job, hits[0])
105
+ job_id = hits[0].jobStoreID
46
106
 
47
107
  if options.printJobInfo:
108
+ # Report on the job
109
+
48
110
  if isinstance(jobStore, FileJobStore):
49
111
  # List all its files if we can
50
- printContentsOfJobStore(job_store=jobStore, job_id=options.printJobInfo)
112
+ printContentsOfJobStore(job_store=jobStore, job_id=job_id)
51
113
  # Print the job description itself
52
- job_desc = jobStore.load_job(options.printJobInfo)
114
+ job_desc = jobStore.load_job(job_id)
53
115
  print(f"Job: {job_desc}")
54
116
  pprint.pprint(job_desc.__dict__)
117
+ else:
118
+ # Run the job
55
119
 
56
- did_something = True
120
+ debug_flags = set()
121
+ local_worker_temp_dir = None
122
+ if options.retrieve_task_directory is not None:
123
+ # Pick a directory in it (which may be removed by the worker) as the worker's temp dir.
124
+ local_worker_temp_dir = os.path.join(options.retrieve_task_directory, "worker")
125
+ # Make sure it exists
126
+ os.makedirs(local_worker_temp_dir, exist_ok=True)
127
+ # And tell the job to just download files
128
+ debug_flags.add("download_only")
129
+ # We might need to reconstruct a container environment.
130
+ host_and_job_paths: Optional[List[Tuple[str, str]]] = None
131
+ # Track if the run succeeded without error
132
+ run_succeeded = False
57
133
 
58
- # TODO: Option to print list of successor jobs
59
- # TODO: Option to run job within python debugger, allowing step through of arguments
60
- # idea would be to have option to import pdb and set breakpoint at the start of the user's code
134
+ logger.info(f"Running the following job locally: {job_id}")
135
+ try:
136
+ workerScript(jobStore, config, job_id, job_id, redirect_output_to_log_file=False, local_worker_temp_dir=local_worker_temp_dir, debug_flags=debug_flags)
137
+ except FilesDownloadedStoppingPointReached as e:
138
+ # We asked for the files to be downloaded and now they are.
139
+ assert options.retrieve_task_directory is not None
140
+ if e.host_and_job_paths is not None:
141
+ # Capture the container mapping so we can reconstruct the container environment after we unwind the worker stack.
142
+ host_and_job_paths = e.host_and_job_paths
143
+ else:
144
+ # No error!
145
+ run_succeeded = True
61
146
 
62
- if options.jobID is not None:
63
- # We actually want to run a job.
147
+ # Make sure the deferred function manager cleans up and logs its
148
+ # shutdown before we start writing any reports.
149
+ gc.collect()
64
150
 
65
- jobID = options.jobID
66
- logger.debug(f"Running the following job locally: {jobID}")
67
- workerScript(jobStore, config, jobID, jobID, redirectOutputToLogFile=False)
68
- logger.debug(f"Finished running: {jobID}")
69
- # Even if the job fails, the worker script succeeds unless something goes wrong with it internally.
151
+ if run_succeeded:
152
+ logger.info(f"Successfully ran: {job_id}")
70
153
 
71
- did_something = True
154
+ if host_and_job_paths is not None:
155
+ # We need to make a place that looks like the job paths half of these.
72
156
 
73
- if not did_something:
74
- # Somebody forgot to tell us to do anything.
75
- # Show the usage instructions.
76
- parser.print_help()
77
- sys.exit(1)
157
+ # Sort by job-side path so we do children before parents, to
158
+ # stop us from accidentally making children inside mounted
159
+ # parents.
160
+ sorted_mounts = sorted(host_and_job_paths, key=lambda t: t[1], reverse=True)
161
+
162
+ fake_job_root = os.path.join(options.retrieve_task_directory, "inside")
163
+ os.makedirs(fake_job_root, exist_ok=True)
164
+
165
+ for host_path, job_path in sorted_mounts:
166
+ if not os.path.exists(host_path):
167
+ logger.error("Job intended to mount %s as %s but it does not exist!", host_path, job_path)
168
+ continue
169
+ if not job_path.startswith("/"):
170
+ logger.error("Job intended to mount %s as %s but destination is a relative path!", host_path, job_path)
171
+ continue
172
+ # Drop the slash because we are building a chroot-ish mini filesystem.
173
+ job_relative_path = job_path[1:]
174
+ if job_relative_path.startswith("/"):
175
+ # We are having trouble understanding what the job
176
+ # intended to do. Stop working on this mount.
177
+ logger.error("Job intended to mount %s as %s but destination starts with multiple slashes for some reason!", host_path, job_path)
178
+ continue
179
+ fake_job_path = os.path.join(fake_job_root, job_relative_path)
180
+ if os.path.exists(fake_job_path):
181
+ logger.error("Job intended to mount %s as %s but that location is already mounted!", host_path, job_path)
182
+ continue
183
+
184
+ logger.info("Job mounted %s as %s", host_path, job_path)
185
+
186
+ # Make sure the directory to contain the mount exists.
187
+ fake_job_containing_path = os.path.dirname(fake_job_path)
188
+ os.makedirs(fake_job_containing_path, exist_ok=True)
189
+
190
+ top_pathobj = Path(os.path.abspath(options.retrieve_task_directory))
191
+ source_pathobj = Path(host_path)
192
+ if top_pathobj in source_pathobj.parents:
193
+ # We're linking to a file we already downloaded (probably).
194
+ # Make a relative symlink so the whole assemblage can move.
195
+ host_path = os.path.relpath(host_path, fake_job_containing_path)
196
+
197
+ # Make a symlink to simulate the mount
198
+ os.symlink(host_path, fake_job_path)
199
+
200
+ logger.info("Reconstructed job container filesystem at %s", fake_job_root)
201
+
202
+ # TODO: Option to print list of successor jobs
203
+ # TODO: Option to run job within python debugger, allowing step through of arguments
204
+ # idea would be to have option to import pdb and set breakpoint at the start of the user's code
@@ -20,6 +20,7 @@ from typing import Dict, List, Tuple, Union
20
20
  from toil import applianceSelf
21
21
  from toil.common import parser_with_common_options
22
22
  from toil.lib.aws import build_tag_dict_from_env
23
+ from toil.lib.conversions import opt_strtobool
23
24
  from toil.provisioners import (check_valid_node_types,
24
25
  cluster_factory,
25
26
  parse_node_types)
@@ -31,7 +32,11 @@ logger = logging.getLogger(__name__)
31
32
  def create_tags_dict(tags: List[str]) -> Dict[str, str]:
32
33
  tags_dict = dict()
33
34
  for tag in tags:
34
- key, value = tag.split('=')
35
+ try:
36
+ key, value = tag.split('=')
37
+ except ValueError:
38
+ logger.error("Tag specification '%s' must contain '='", tag)
39
+ raise
35
40
  tags_dict[key] = value
36
41
  return tags_dict
37
42
 
@@ -114,6 +119,10 @@ def main() -> None:
114
119
  help="Any additional security groups to attach to EC2 instances. Note that a security group "
115
120
  "with its name equal to the cluster name will always be created, thus ensure that "
116
121
  "the extra security groups do not have the same name as the cluster name.")
122
+ parser.add_argument("--allowFuse", type=opt_strtobool, default=True,
123
+ help="Enable both the leader and worker nodes to be able to run Singularity with FUSE. For "
124
+ "Kubernetes, this will make the leader privileged and ask workers to run as privileged. "
125
+ "(default: %(default)s)")
117
126
  #TODO Set Aws Profile in CLI options
118
127
  options = parser.parse_args()
119
128
  set_logging_from_options(options)
@@ -178,7 +187,8 @@ def main() -> None:
178
187
  clusterName=options.clusterName,
179
188
  clusterType=options.clusterType,
180
189
  zone=options.zone,
181
- nodeStorage=options.nodeStorage)
190
+ nodeStorage=options.nodeStorage,
191
+ enable_fuse=options.allowFuse)
182
192
 
183
193
  cluster.launchCluster(leaderNodeType=options.leaderNodeType,
184
194
  leaderStorage=options.leaderStorage,
@@ -14,9 +14,10 @@
14
14
  """Rsyncs into the toil appliance container running on the leader of the cluster."""
15
15
  import argparse
16
16
  import logging
17
+ import sys
17
18
 
18
19
  from toil.common import parser_with_common_options
19
- from toil.provisioners import cluster_factory
20
+ from toil.provisioners import cluster_factory, NoSuchClusterException
20
21
  from toil.statsAndLogging import set_logging_from_options
21
22
 
22
23
  logger = logging.getLogger(__name__)
@@ -37,4 +38,8 @@ def main() -> None:
37
38
  cluster = cluster_factory(provisioner=options.provisioner,
38
39
  clusterName=options.clusterName,
39
40
  zone=options.zone)
40
- cluster.getLeader().coreRsync(args=options.args, strict=not options.insecure)
41
+ try:
42
+ cluster.getLeader().coreRsync(args=options.args, strict=not options.insecure)
43
+ except NoSuchClusterException as e:
44
+ logger.error(e)
45
+ sys.exit(1)
@@ -18,7 +18,7 @@ import sys
18
18
  from typing import List
19
19
 
20
20
  from toil.common import parser_with_common_options
21
- from toil.provisioners import cluster_factory
21
+ from toil.provisioners import cluster_factory, NoSuchClusterException
22
22
  from toil.statsAndLogging import set_logging_from_options
23
23
 
24
24
  logger = logging.getLogger(__name__)
@@ -54,5 +54,9 @@ def main() -> None:
54
54
  sshOptions.extend(['-L', f'{options.grafana_port}:localhost:3000',
55
55
  '-L', '9090:localhost:9090'])
56
56
 
57
- cluster.getLeader().sshAppliance(*command, strict=not options.insecure, tty=sys.stdin.isatty(),
58
- sshOptions=sshOptions)
57
+ try:
58
+ cluster.getLeader().sshAppliance(*command, strict=not options.insecure, tty=sys.stdin.isatty(),
59
+ sshOptions=sshOptions)
60
+ except NoSuchClusterException as e:
61
+ logger.error(e)
62
+ sys.exit(1)