toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. toil/__init__.py +121 -83
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +137 -77
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
  5. toil/batchSystems/awsBatch.py +237 -128
  6. toil/batchSystems/cleanup_support.py +22 -16
  7. toil/batchSystems/contained_executor.py +30 -26
  8. toil/batchSystems/gridengine.py +85 -49
  9. toil/batchSystems/htcondor.py +164 -87
  10. toil/batchSystems/kubernetes.py +622 -386
  11. toil/batchSystems/local_support.py +17 -12
  12. toil/batchSystems/lsf.py +132 -79
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +288 -149
  16. toil/batchSystems/mesos/executor.py +77 -49
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +38 -29
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +293 -123
  21. toil/batchSystems/slurm.py +489 -137
  22. toil/batchSystems/torque.py +46 -32
  23. toil/bus.py +141 -73
  24. toil/common.py +630 -359
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1114 -532
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +62 -41
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +88 -57
  32. toil/fileStores/cachingFileStore.py +711 -247
  33. toil/fileStores/nonCachingFileStore.py +113 -75
  34. toil/job.py +988 -315
  35. toil/jobStores/abstractJobStore.py +387 -243
  36. toil/jobStores/aws/jobStore.py +727 -403
  37. toil/jobStores/aws/utils.py +161 -109
  38. toil/jobStores/conftest.py +1 -0
  39. toil/jobStores/fileJobStore.py +289 -151
  40. toil/jobStores/googleJobStore.py +137 -70
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +614 -269
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +55 -28
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +193 -58
  49. toil/lib/aws/utils.py +238 -218
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +83 -49
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +322 -209
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +4 -2
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +99 -11
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +65 -18
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +19 -7
  71. toil/lib/retry.py +115 -77
  72. toil/lib/threading.py +282 -80
  73. toil/lib/throttle.py +15 -14
  74. toil/options/common.py +834 -401
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +70 -19
  78. toil/provisioners/__init__.py +111 -46
  79. toil/provisioners/abstractProvisioner.py +322 -157
  80. toil/provisioners/aws/__init__.py +62 -30
  81. toil/provisioners/aws/awsProvisioner.py +980 -627
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +147 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +127 -61
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +148 -64
  98. toil/test/__init__.py +263 -179
  99. toil/test/batchSystems/batchSystemTest.py +438 -195
  100. toil/test/batchSystems/batch_system_plugin_test.py +18 -7
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +93 -47
  104. toil/test/cactus/test_cactus_integration.py +20 -22
  105. toil/test/cwl/cwlTest.py +271 -71
  106. toil/test/cwl/measure_default_memory.cwl +12 -0
  107. toil/test/cwl/not_run_required_input.cwl +29 -0
  108. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  109. toil/test/docs/scriptsTest.py +60 -34
  110. toil/test/jobStores/jobStoreTest.py +412 -235
  111. toil/test/lib/aws/test_iam.py +116 -48
  112. toil/test/lib/aws/test_s3.py +16 -9
  113. toil/test/lib/aws/test_utils.py +5 -6
  114. toil/test/lib/dockerTest.py +118 -141
  115. toil/test/lib/test_conversions.py +113 -115
  116. toil/test/lib/test_ec2.py +57 -49
  117. toil/test/lib/test_integration.py +104 -0
  118. toil/test/lib/test_misc.py +12 -5
  119. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  120. toil/test/mesos/helloWorld.py +7 -6
  121. toil/test/mesos/stress.py +25 -20
  122. toil/test/options/options.py +7 -2
  123. toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
  124. toil/test/provisioners/clusterScalerTest.py +440 -250
  125. toil/test/provisioners/clusterTest.py +81 -42
  126. toil/test/provisioners/gceProvisionerTest.py +174 -100
  127. toil/test/provisioners/provisionerTest.py +25 -13
  128. toil/test/provisioners/restartScript.py +5 -4
  129. toil/test/server/serverTest.py +188 -141
  130. toil/test/sort/restart_sort.py +137 -68
  131. toil/test/sort/sort.py +134 -66
  132. toil/test/sort/sortTest.py +91 -49
  133. toil/test/src/autoDeploymentTest.py +140 -100
  134. toil/test/src/busTest.py +20 -18
  135. toil/test/src/checkpointTest.py +8 -2
  136. toil/test/src/deferredFunctionTest.py +49 -35
  137. toil/test/src/dockerCheckTest.py +33 -26
  138. toil/test/src/environmentTest.py +20 -10
  139. toil/test/src/fileStoreTest.py +538 -271
  140. toil/test/src/helloWorldTest.py +7 -4
  141. toil/test/src/importExportFileTest.py +61 -31
  142. toil/test/src/jobDescriptionTest.py +32 -17
  143. toil/test/src/jobEncapsulationTest.py +2 -0
  144. toil/test/src/jobFileStoreTest.py +74 -50
  145. toil/test/src/jobServiceTest.py +187 -73
  146. toil/test/src/jobTest.py +120 -70
  147. toil/test/src/miscTests.py +19 -18
  148. toil/test/src/promisedRequirementTest.py +82 -36
  149. toil/test/src/promisesTest.py +7 -6
  150. toil/test/src/realtimeLoggerTest.py +6 -6
  151. toil/test/src/regularLogTest.py +71 -37
  152. toil/test/src/resourceTest.py +80 -49
  153. toil/test/src/restartDAGTest.py +36 -22
  154. toil/test/src/resumabilityTest.py +9 -2
  155. toil/test/src/retainTempDirTest.py +45 -14
  156. toil/test/src/systemTest.py +12 -8
  157. toil/test/src/threadingTest.py +44 -25
  158. toil/test/src/toilContextManagerTest.py +10 -7
  159. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  160. toil/test/src/workerTest.py +33 -16
  161. toil/test/utils/toilDebugTest.py +70 -58
  162. toil/test/utils/toilKillTest.py +4 -5
  163. toil/test/utils/utilsTest.py +239 -102
  164. toil/test/wdl/wdltoil_test.py +789 -148
  165. toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
  166. toil/toilState.py +52 -26
  167. toil/utils/toilConfig.py +13 -4
  168. toil/utils/toilDebugFile.py +44 -27
  169. toil/utils/toilDebugJob.py +85 -25
  170. toil/utils/toilDestroyCluster.py +11 -6
  171. toil/utils/toilKill.py +8 -3
  172. toil/utils/toilLaunchCluster.py +251 -145
  173. toil/utils/toilMain.py +37 -16
  174. toil/utils/toilRsyncCluster.py +27 -14
  175. toil/utils/toilSshCluster.py +45 -22
  176. toil/utils/toilStats.py +75 -36
  177. toil/utils/toilStatus.py +226 -119
  178. toil/utils/toilUpdateEC2Instances.py +3 -1
  179. toil/version.py +11 -11
  180. toil/wdl/utils.py +5 -5
  181. toil/wdl/wdltoil.py +3513 -1052
  182. toil/worker.py +269 -128
  183. toil-8.0.0.dist-info/METADATA +173 -0
  184. toil-8.0.0.dist-info/RECORD +253 -0
  185. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  186. toil-7.0.0.dist-info/METADATA +0 -158
  187. toil-7.0.0.dist-info/RECORD +0 -244
  188. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
  189. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  190. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/worker.py CHANGED
@@ -25,23 +25,31 @@ import stat
25
25
  import sys
26
26
  import time
27
27
  import traceback
28
+ from collections.abc import Iterator
28
29
  from contextlib import contextmanager
29
- from typing import Any, Callable, Iterator, List, Set, Optional
30
+ from typing import Any, Callable, Optional
30
31
 
31
32
  from configargparse import ArgParser
32
33
 
33
34
  from toil import logProcessContext
34
35
  from toil.common import Config, Toil, safeUnpickleFromStream
35
- from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
36
- CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE)
36
+ from toil.cwl.utils import (
37
+ CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
38
+ CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
39
+ )
37
40
  from toil.deferred import DeferredFunctionManager
38
41
  from toil.fileStores.abstractFileStore import AbstractFileStore
39
- from toil.job import CheckpointJobDescription, Job, JobDescription, DebugStoppingPointReached
42
+ from toil.job import (
43
+ CheckpointJobDescription,
44
+ DebugStoppingPointReached,
45
+ Job,
46
+ JobDescription,
47
+ )
40
48
  from toil.jobStores.abstractJobStore import AbstractJobStore
41
49
  from toil.lib.expando import MagicExpando
42
50
  from toil.lib.io import make_public_dir
43
51
  from toil.lib.resources import ResourceMonitor
44
- from toil.statsAndLogging import configure_root_logger, set_log_level, install_log_color
52
+ from toil.statsAndLogging import configure_root_logger, install_log_color, set_log_level
45
53
 
46
54
  logger = logging.getLogger(__name__)
47
55
 
@@ -49,10 +57,12 @@ logger = logging.getLogger(__name__)
49
57
  class StatsDict(MagicExpando):
50
58
  """Subclass of MagicExpando for type-checking purposes."""
51
59
 
52
- jobs: List[MagicExpando]
60
+ jobs: list[MagicExpando]
53
61
 
54
62
 
55
- def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, config: Config) -> Optional[JobDescription]:
63
+ def nextChainable(
64
+ predecessor: JobDescription, job_store: AbstractJobStore, config: Config
65
+ ) -> Optional[JobDescription]:
56
66
  """
57
67
  Returns the next chainable job's JobDescription after the given predecessor
58
68
  JobDescription, if one exists, or None if the chain must terminate.
@@ -61,24 +71,41 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
61
71
  :param job_store: The JobStore to fetch JobDescriptions from.
62
72
  :param config: The configuration for the current run.
63
73
  """
64
- #If no more jobs to run or services not finished, quit
65
- if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None):
66
- logger.debug("Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
67
- predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None))
74
+ # If no more jobs to run or services not finished, quit
75
+ if (
76
+ predecessor.nextSuccessors() is None
77
+ or len(predecessor.services) > 0
78
+ or (
79
+ isinstance(predecessor, CheckpointJobDescription)
80
+ and predecessor.checkpoint is not None
81
+ )
82
+ ):
83
+ logger.debug(
84
+ "Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
85
+ predecessor.nextSuccessors() is None,
86
+ len(predecessor.services),
87
+ (
88
+ isinstance(predecessor, CheckpointJobDescription)
89
+ and predecessor.checkpoint is not None
90
+ ),
91
+ )
68
92
  return None
69
93
 
70
-
71
- #Get the next set of jobs to run
94
+ # Get the next set of jobs to run
72
95
  jobs = list(predecessor.nextSuccessors() or set())
73
96
  if len(jobs) == 0:
74
97
  # If there are no jobs, we might just not have any children.
75
- logger.debug("Stopping running chain of jobs because job has no ready children or follow-ons")
98
+ logger.debug(
99
+ "Stopping running chain of jobs because job has no ready children or follow-ons"
100
+ )
76
101
  return None
77
102
 
78
- #If there are 2 or more jobs to run in parallel we quit
103
+ # If there are 2 or more jobs to run in parallel we quit
79
104
  if len(jobs) >= 2:
80
- logger.debug("No more jobs can run in series by this worker,"
81
- " it's got %i successors", len(jobs))
105
+ logger.debug(
106
+ "No more jobs can run in series by this worker," " it's got %i successors",
107
+ len(jobs),
108
+ )
82
109
  logger.debug("Two distinct successors are %s and %s", jobs[0], jobs[1])
83
110
  return None
84
111
 
@@ -90,8 +117,8 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
90
117
  # Load the successor JobDescription
91
118
  successor = job_store.load_job(successorID)
92
119
 
93
- #We check the requirements of the successor to see if we can run it
94
- #within the current worker
120
+ # We check the requirements of the successor to see if we can run it
121
+ # within the current worker
95
122
  if successor.memory > predecessor.memory:
96
123
  logger.debug("We need more memory for the next job, so finishing")
97
124
  return None
@@ -102,14 +129,20 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
102
129
  logger.debug("We need more disk for the next job, so finishing")
103
130
  return None
104
131
  if successor.preemptible != predecessor.preemptible:
105
- logger.debug("Preemptibility is different for the next job, returning to the leader")
132
+ logger.debug(
133
+ "Preemptibility is different for the next job, returning to the leader"
134
+ )
106
135
  return None
107
136
  if successor.predecessorNumber > 1:
108
- logger.debug("The next job has multiple predecessors; we must return to the leader.")
137
+ logger.debug(
138
+ "The next job has multiple predecessors; we must return to the leader."
139
+ )
109
140
  return None
110
141
 
111
142
  if len(successor.services) > 0:
112
- logger.debug("The next job requires services that will not yet be started; we must return to the leader.")
143
+ logger.debug(
144
+ "The next job requires services that will not yet be started; we must return to the leader."
145
+ )
113
146
  return None
114
147
 
115
148
  if isinstance(successor, CheckpointJobDescription):
@@ -117,7 +150,11 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
117
150
  logger.debug("Next job is checkpoint, so finishing")
118
151
  return None
119
152
 
120
- if not config.run_local_jobs_on_workers and predecessor.local and not successor.local:
153
+ if (
154
+ not config.run_local_jobs_on_workers
155
+ and predecessor.local
156
+ and not successor.local
157
+ ):
121
158
  # This job might be running on the leader, but the next job may not.
122
159
  #
123
160
  # TODO: Optimize by detecting whether we actually are on the leader,
@@ -128,6 +165,7 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
128
165
  # Made it through! This job is chainable.
129
166
  return successor
130
167
 
168
+
131
169
  def workerScript(
132
170
  job_store: AbstractJobStore,
133
171
  config: Config,
@@ -135,7 +173,7 @@ def workerScript(
135
173
  job_store_id: str,
136
174
  redirect_output_to_log_file: bool = True,
137
175
  local_worker_temp_dir: Optional[str] = None,
138
- debug_flags: Optional[Set[str]] = None
176
+ debug_flags: Optional[set[str]] = None,
139
177
  ) -> int:
140
178
  """
141
179
  Worker process script, runs a job.
@@ -162,7 +200,7 @@ def workerScript(
162
200
  logger.debug("Worker started for job %s...", job_name)
163
201
 
164
202
  ##########################################
165
- #Create the worker killer, if requested
203
+ # Create the worker killer, if requested
166
204
  ##########################################
167
205
 
168
206
  logFileByteReportLimit = config.maxLogFileSize
@@ -203,10 +241,10 @@ def workerScript(
203
241
  # before it does. Either way, init will have to clean it up for us.
204
242
 
205
243
  ##########################################
206
- #Load the environment for the job
244
+ # Load the environment for the job
207
245
  ##########################################
208
246
 
209
- #First load the environment for the job.
247
+ # First load the environment for the job.
210
248
  with job_store.read_shared_file_stream("environment.pickle") as fileHandle:
211
249
  environment = safeUnpickleFromStream(fileHandle)
212
250
  env_reject = {
@@ -224,15 +262,15 @@ def workerScript(
224
262
  "XDG_SESSION_ID",
225
263
  "XDG_RUNTIME_DIR",
226
264
  "XDG_DATA_DIRS",
227
- "DBUS_SESSION_BUS_ADDRESS"
265
+ "DBUS_SESSION_BUS_ADDRESS",
228
266
  }
229
267
  for i in environment:
230
268
  if i == "PATH":
231
269
  # Handle path specially. Sometimes e.g. leader may not include
232
270
  # /bin, but the Toil appliance needs it.
233
- if i in os.environ and os.environ[i] != '':
271
+ if i in os.environ and os.environ[i] != "":
234
272
  # Use the provided PATH and then the local system's PATH
235
- os.environ[i] = environment[i] + ':' + os.environ[i]
273
+ os.environ[i] = environment[i] + ":" + os.environ[i]
236
274
  else:
237
275
  # Use the provided PATH only
238
276
  os.environ[i] = environment[i]
@@ -240,41 +278,45 @@ def workerScript(
240
278
  os.environ[i] = environment[i]
241
279
  # sys.path is used by __import__ to find modules
242
280
  if "PYTHONPATH" in environment:
243
- for e in environment["PYTHONPATH"].split(':'):
244
- if e != '':
281
+ for e in environment["PYTHONPATH"].split(":"):
282
+ if e != "":
245
283
  sys.path.append(e)
246
284
 
247
285
  ##########################################
248
- #Setup the temporary directories.
286
+ # Setup the temporary directories.
249
287
  ##########################################
250
288
  # Dir to put all this worker's temp files in.
251
289
  if config.workflowID is None:
252
290
  raise RuntimeError("The worker workflow ID was never set.")
253
291
  toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
254
292
  # Dir to put lock files in, ideally not on NFS.
255
- toil_coordination_dir = Toil.get_local_workflow_coordination_dir(config.workflowID, config.workDir, config.coordination_dir)
293
+ toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
294
+ config.workflowID, config.workDir, config.coordination_dir
295
+ )
256
296
  if local_worker_temp_dir is None:
257
297
  # Invent a temp directory to work in
258
298
  local_worker_temp_dir = make_public_dir(toilWorkflowDir)
259
299
  os.chmod(local_worker_temp_dir, 0o755)
260
300
 
261
301
  ##########################################
262
- #Setup the logging
302
+ # Setup the logging
263
303
  ##########################################
264
304
 
265
- #This is mildly tricky because we don't just want to
266
- #redirect stdout and stderr for this Python process; we want to redirect it
267
- #for this process and all children. Consequently, we can't just replace
268
- #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
269
- #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
305
+ # This is mildly tricky because we don't just want to
306
+ # redirect stdout and stderr for this Python process; we want to redirect it
307
+ # for this process and all children. Consequently, we can't just replace
308
+ # sys.stdout and sys.stderr; we need to mess with the underlying OS-level
309
+ # file descriptors. See <http://stackoverflow.com/a/11632982/402891>
270
310
 
271
- #When we start, standard input is file descriptor 0, standard output is
272
- #file descriptor 1, and standard error is file descriptor 2.
311
+ # When we start, standard input is file descriptor 0, standard output is
312
+ # file descriptor 1, and standard error is file descriptor 2.
273
313
 
274
314
  # Do we even want to redirect output? Let the config make us not do it.
275
- redirect_output_to_log_file = redirect_output_to_log_file and not config.disableWorkerOutputCapture
315
+ redirect_output_to_log_file = (
316
+ redirect_output_to_log_file and not config.disableWorkerOutputCapture
317
+ )
276
318
 
277
- #What file do we want to point FDs 1 and 2 to?
319
+ # What file do we want to point FDs 1 and 2 to?
278
320
  tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt")
279
321
 
280
322
  if redirect_output_to_log_file:
@@ -322,6 +364,7 @@ def workerScript(
322
364
 
323
365
  def blockFn() -> bool:
324
366
  return True
367
+
325
368
  job = None
326
369
  try:
327
370
 
@@ -365,7 +408,10 @@ def workerScript(
365
408
  # If a checkpoint exists, restart from the checkpoint
366
409
  ##########################################
367
410
 
368
- if isinstance(jobDesc, CheckpointJobDescription) and jobDesc.checkpoint is not None:
411
+ if (
412
+ isinstance(jobDesc, CheckpointJobDescription)
413
+ and jobDesc.checkpoint is not None
414
+ ):
369
415
  # The job is a checkpoint, and is being restarted after previously completing
370
416
  logger.debug("Job is a checkpoint")
371
417
  # If the checkpoint still has extant successors or services, its
@@ -381,12 +427,23 @@ def workerScript(
381
427
  # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
382
428
  # because of the job being a checkpoint
383
429
  else:
384
- logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
385
- #Delete any remnant files
386
- list(map(job_store.delete_file, list(filter(job_store.file_exists, jobDesc.checkpointFilesToDelete))))
430
+ logger.debug(
431
+ "The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete."
432
+ )
433
+ # Delete any remnant files
434
+ list(
435
+ map(
436
+ job_store.delete_file,
437
+ list(
438
+ filter(
439
+ job_store.file_exists, jobDesc.checkpointFilesToDelete
440
+ )
441
+ ),
442
+ )
443
+ )
387
444
 
388
445
  ##########################################
389
- #Setup the stats, if requested
446
+ # Setup the stats, if requested
390
447
  ##########################################
391
448
 
392
449
  if config.stats:
@@ -397,7 +454,7 @@ def workerScript(
397
454
  startTime = time.time()
398
455
  while True:
399
456
  ##########################################
400
- #Run the job body, if there is one
457
+ # Run the job body, if there is one
401
458
  ##########################################
402
459
 
403
460
  logger.info("Working on job %s", jobDesc)
@@ -417,33 +474,48 @@ def workerScript(
417
474
  job.set_debug_flag(flag)
418
475
 
419
476
  # Create a fileStore object for the job
420
- fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn,
421
- caching=config.caching)
422
- with job._executor(stats=statsDict if config.stats else None,
423
- fileStore=fileStore):
424
- with deferredFunctionManager.open() as defer:
425
- with fileStore.open(job):
426
- # Get the next block function to wait on committing this job
427
- blockFn = fileStore.waitForCommit
428
-
429
- # Run the job, save new successors, and set up
430
- # locally (but don't commit) successor
431
- # relationships and job completion.
432
- # Pass everything as name=value because Cactus
433
- # likes to override _runner when it shouldn't and
434
- # it needs some hope of finding the arguments it
435
- # wants across multiple Toil versions. We also
436
- # still pass a jobGraph argument to placate old
437
- # versions of Cactus.
438
- job._runner(jobGraph=None, jobStore=job_store, fileStore=fileStore, defer=defer)
439
-
440
- # When the executor for the job finishes it will
441
- # kick off a commit with the link to the job body
442
- # cut.
443
-
444
- # Accumulate messages from this job & any subsequent chained jobs
445
- statsDict.workers.logs_to_leader += fileStore.logging_messages
446
- statsDict.workers.logging_user_streams += fileStore.logging_user_streams
477
+ fileStore = AbstractFileStore.createFileStore(
478
+ job_store,
479
+ jobDesc,
480
+ local_worker_temp_dir,
481
+ blockFn,
482
+ caching=config.caching,
483
+ )
484
+ try:
485
+ with job._executor(
486
+ stats=statsDict if config.stats else None, fileStore=fileStore
487
+ ):
488
+ with deferredFunctionManager.open() as defer:
489
+ with fileStore.open(job):
490
+ # Get the next block function to wait on committing this job
491
+ blockFn = fileStore.waitForCommit
492
+
493
+ # Run the job, save new successors, and set up
494
+ # locally (but don't commit) successor
495
+ # relationships and job completion.
496
+ # Pass everything as name=value because Cactus
497
+ # likes to override _runner when it shouldn't and
498
+ # it needs some hope of finding the arguments it
499
+ # wants across multiple Toil versions. We also
500
+ # still pass a jobGraph argument to placate old
501
+ # versions of Cactus.
502
+ job._runner(
503
+ jobGraph=None,
504
+ jobStore=job_store,
505
+ fileStore=fileStore,
506
+ defer=defer,
507
+ )
508
+
509
+ # When the executor for the job finishes it will
510
+ # kick off a commit with the link to the job body
511
+ # cut.
512
+ finally:
513
+ # Accumulate messages from this job & any subsequent chained jobs.
514
+ # Keep the messages even if the job fails.
515
+ statsDict.workers.logs_to_leader += fileStore.logging_messages
516
+ statsDict.workers.logging_user_streams += (
517
+ fileStore.logging_user_streams
518
+ )
447
519
 
448
520
  logger.info("Completed body for %s", jobDesc)
449
521
 
@@ -458,7 +530,7 @@ def workerScript(
458
530
  raise RuntimeError("The termination flag is set")
459
531
 
460
532
  ##########################################
461
- #Establish if we can run another job within the worker
533
+ # Establish if we can run another job within the worker
462
534
  ##########################################
463
535
  successor = nextChainable(jobDesc, job_store, config)
464
536
  if successor is None or config.disableChaining:
@@ -481,9 +553,13 @@ def workerScript(
481
553
 
482
554
  # Make sure nothing has gone wrong and we can really chain
483
555
  if jobDesc.memory < successor.memory:
484
- raise RuntimeError("Cannot chain jobs. A job's memory cannot be less than it's successor.")
556
+ raise RuntimeError(
557
+ "Cannot chain jobs. A job's memory cannot be less than it's successor."
558
+ )
485
559
  if jobDesc.cores < successor.cores:
486
- raise RuntimeError("Cannot chain jobs. A job's cores cannot be less than it's successor.")
560
+ raise RuntimeError(
561
+ "Cannot chain jobs. A job's cores cannot be less than it's successor."
562
+ )
487
563
 
488
564
  # Save the successor's original ID, so we can clean it (and its
489
565
  # body) up after we finish executing it.
@@ -499,8 +575,13 @@ def workerScript(
499
575
 
500
576
  # Build a fileStore to update the job and commit the replacement.
501
577
  # TODO: can we have a commit operation without an entire FileStore???
502
- fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn,
503
- caching=config.caching)
578
+ fileStore = AbstractFileStore.createFileStore(
579
+ job_store,
580
+ jobDesc,
581
+ local_worker_temp_dir,
582
+ blockFn,
583
+ caching=config.caching,
584
+ )
504
585
 
505
586
  # Update blockFn to wait for that commit operation.
506
587
  blockFn = fileStore.waitForCommit
@@ -511,10 +592,12 @@ def workerScript(
511
592
  logger.debug("Starting the next job")
512
593
 
513
594
  ##########################################
514
- #Finish up the stats
595
+ # Finish up the stats
515
596
  ##########################################
516
597
  if config.stats:
517
- totalCPUTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage()
598
+ totalCPUTime, totalMemoryUsage = (
599
+ ResourceMonitor.get_total_cpu_time_and_memory_usage()
600
+ )
518
601
  statsDict.workers.time = str(time.time() - startTime)
519
602
  statsDict.workers.clock = str(totalCPUTime - startClock)
520
603
  statsDict.workers.memory = str(totalMemoryUsage)
@@ -526,25 +609,37 @@ def workerScript(
526
609
  statsDict.workers.disk = str(max_bytes)
527
610
  # Count the jobs executed.
528
611
  # TODO: toil stats could compute this but its parser is too general to hook into simply.
529
- statsDict.workers.jobs_run = len(statsDict.jobs)
530
-
612
+ statsDict.workers.jobs_run = len(statsDict.jobs)
531
613
 
532
614
  # log the worker log path here so that if the file is truncated the path can still be found
533
615
  if redirect_output_to_log_file:
534
- logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", local_worker_temp_dir)
535
-
536
- logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
616
+ logger.info(
617
+ "Worker log can be found at %s. Set --cleanWorkDir to retain this log",
618
+ local_worker_temp_dir,
619
+ )
620
+
621
+ logger.info(
622
+ "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
623
+ time.time() - startTime,
624
+ )
537
625
 
538
626
  ##########################################
539
- #Trapping where worker goes wrong
627
+ # Trapping where worker goes wrong
540
628
  ##########################################
541
629
  except DebugStoppingPointReached:
542
630
  # Job wants the worker to stop for debugging
543
631
  raise
544
- except BaseException as e: #Case that something goes wrong in worker, or we are asked to stop
632
+ except (
633
+ BaseException
634
+ ) as e: # Case that something goes wrong in worker, or we are asked to stop
545
635
  if not isinstance(e, SystemExit):
546
- logger.critical("Worker crashed with traceback:\n%s", traceback.format_exc())
547
- logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
636
+ logger.critical(
637
+ "Worker crashed with traceback:\n%s", traceback.format_exc()
638
+ )
639
+ logger.error(
640
+ "Exiting the worker because of a failed job on host %s",
641
+ socket.gethostname(),
642
+ )
548
643
  if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
549
644
  # We need to inform the leader that this is a CWL workflow problem
550
645
  # and it needs to inform its caller.
@@ -552,6 +647,15 @@ def workerScript(
552
647
  elif isinstance(e, SystemExit) and isinstance(e.code, int) and e.code != 0:
553
648
  # We're meant to be exiting with a particular code.
554
649
  failure_exit_code = e.code
650
+ else:
651
+ try:
652
+ from WDL.runtime.error import CommandFailed
653
+
654
+ if isinstance(e, CommandFailed):
655
+ failure_exit_code = e.exit_status
656
+ except ImportError:
657
+ # WDL dependency not available
658
+ pass
555
659
  AbstractFileStore._terminateEvent.set()
556
660
  finally:
557
661
  # Get rid of our deferred function manager now so we can't mistake it
@@ -567,16 +671,15 @@ def workerScript(
567
671
  logger.debug("cwltool.main._terminate_processess exception: %s", (e))
568
672
  raise e
569
673
 
570
-
571
674
  ##########################################
572
- #Wait for the asynchronous chain of writes/updates to finish
675
+ # Wait for the asynchronous chain of writes/updates to finish
573
676
  ##########################################
574
677
 
575
678
  blockFn()
576
679
 
577
680
  ##########################################
578
- #All the asynchronous worker/update threads must be finished now,
579
- #so safe to test if they completed okay
681
+ # All the asynchronous worker/update threads must be finished now,
682
+ # so safe to test if they completed okay
580
683
  ##########################################
581
684
 
582
685
  if AbstractFileStore._terminateEvent.is_set():
@@ -589,7 +692,7 @@ def workerScript(
589
692
  jobAttemptFailed = True
590
693
 
591
694
  ##########################################
592
- #Cleanup
695
+ # Cleanup
593
696
  ##########################################
594
697
 
595
698
  # Close the worker logging
@@ -628,32 +731,48 @@ def workerScript(
628
731
  jobDesc.jobStoreID, cleanup=True
629
732
  )
630
733
  with job_store.update_file_stream(logJobStoreFileID) as w:
631
- with open(tempWorkerLogPath, 'rb') as f:
632
- if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
734
+ with open(tempWorkerLogPath, "rb") as f:
735
+ if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
633
736
  if logFileByteReportLimit > 0:
634
- f.seek(-logFileByteReportLimit, 2) # seek to last tooBig bytes of file
737
+ f.seek(
738
+ -logFileByteReportLimit, 2
739
+ ) # seek to last tooBig bytes of file
635
740
  elif logFileByteReportLimit < 0:
636
- f.seek(logFileByteReportLimit, 0) # seek to first tooBig bytes of file
741
+ f.seek(
742
+ logFileByteReportLimit, 0
743
+ ) # seek to first tooBig bytes of file
637
744
  # Dump the possibly-invalid-Unicode bytes into the log file
638
- w.write(f.read()) # TODO load file using a buffer
745
+ w.write(f.read()) # TODO load file using a buffer
639
746
  # Commit log file reference back to JobStore
640
747
  job_store.update_job(jobDesc)
641
748
 
642
- elif ((debugging or (config.writeLogsFromAllJobs and not jobDesc.local))
643
- and redirect_output_to_log_file): # write log messages
644
- with open(tempWorkerLogPath, 'rb') as logFile:
749
+ elif (
750
+ debugging or (config.writeLogsFromAllJobs and not jobDesc.local)
751
+ ) and redirect_output_to_log_file: # write log messages
752
+ with open(tempWorkerLogPath, "rb") as logFile:
645
753
  if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
646
754
  if logFileByteReportLimit > 0:
647
- logFile.seek(-logFileByteReportLimit, 2) # seek to last tooBig bytes of file
755
+ logFile.seek(
756
+ -logFileByteReportLimit, 2
757
+ ) # seek to last tooBig bytes of file
648
758
  elif logFileByteReportLimit < 0:
649
- logFile.seek(logFileByteReportLimit, 0) # seek to first tooBig bytes of file
759
+ logFile.seek(
760
+ logFileByteReportLimit, 0
761
+ ) # seek to first tooBig bytes of file
650
762
  # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
651
763
  # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
652
- logMessages = [line.decode('utf-8', 'skip') for line in logFile.read().splitlines()]
764
+ logMessages = [
765
+ line.decode("utf-8", "skip") for line in logFile.read().splitlines()
766
+ ]
653
767
  statsDict.logs.names = [names.stats_name for names in jobDesc.get_chain()]
654
768
  statsDict.logs.messages = logMessages
655
769
 
656
- if debugging or config.stats or statsDict.workers.logs_to_leader or statsDict.workers.logging_user_streams:
770
+ if (
771
+ debugging
772
+ or config.stats
773
+ or statsDict.workers.logs_to_leader
774
+ or statsDict.workers.logging_user_streams
775
+ ):
657
776
  # We have stats/logging to report back.
658
777
  # We report even if the job attempt failed.
659
778
  # TODO: Will that upset analysis of the stats?
@@ -661,7 +780,12 @@ def workerScript(
661
780
 
662
781
  # Remove the temp dir
663
782
  cleanUp = config.cleanWorkDir
664
- if cleanUp == 'always' or (cleanUp == 'onSuccess' and not jobAttemptFailed) or (cleanUp == 'onError' and jobAttemptFailed):
783
+ if (
784
+ cleanUp == "always"
785
+ or (cleanUp == "onSuccess" and not jobAttemptFailed)
786
+ or (cleanUp == "onError" and jobAttemptFailed)
787
+ ):
788
+
665
789
  def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None:
666
790
  """
667
791
  When encountering an error removing a file or directory, make sure
@@ -672,9 +796,17 @@ def workerScript(
672
796
  """
673
797
  # Just chmod it for rwx for user. This can't work anyway if it isn't ours.
674
798
  try:
675
- os.chmod(os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
799
+ os.chmod(
800
+ os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
801
+ )
676
802
  except PermissionError as e:
677
- logger.error('Could not set permissions on %s to allow cleanup of %s: %s', os.path.dirname(path), path, e)
803
+ logger.error(
804
+ "Could not set permissions on %s to allow cleanup of %s: %s",
805
+ os.path.dirname(path),
806
+ path,
807
+ e,
808
+ )
809
+
678
810
  shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable)
679
811
 
680
812
  # This must happen after the log file is done with, else there is no place to put the log
@@ -683,13 +815,13 @@ def workerScript(
683
815
  # We can now safely get rid of the JobDescription, and all jobs it chained up
684
816
  job_store.delete_job(merged_in.job_store_id)
685
817
 
686
-
687
818
  if jobAttemptFailed:
688
819
  return failure_exit_code
689
820
  else:
690
821
  return 0
691
822
 
692
- def parse_args(args: List[str]) -> Any:
823
+
824
+ def parse_args(args: list[str]) -> Any:
693
825
  """
694
826
  Parse command-line arguments to the worker.
695
827
  """
@@ -703,26 +835,33 @@ def parse_args(args: List[str]) -> Any:
703
835
  # Now add all the options to it
704
836
 
705
837
  # Base required job information
706
- parser.add_argument("jobName", type=str,
707
- help="Text name of the job being run")
708
- parser.add_argument("jobStoreLocator", type=str,
709
- help="Information required to connect to the job store")
710
- parser.add_argument("jobStoreID", type=str,
711
- help="ID of the job within the job store")
838
+ parser.add_argument("jobName", type=str, help="Text name of the job being run")
839
+ parser.add_argument(
840
+ "jobStoreLocator",
841
+ type=str,
842
+ help="Information required to connect to the job store",
843
+ )
844
+ parser.add_argument(
845
+ "jobStoreID", type=str, help="ID of the job within the job store"
846
+ )
712
847
 
713
848
  # Additional worker abilities
714
- parser.add_argument("--context", default=[], action="append",
849
+ parser.add_argument(
850
+ "--context",
851
+ default=[],
852
+ action="append",
715
853
  help="""Pickled, base64-encoded context manager(s) to run job inside of.
716
854
  Allows the Toil leader to pass setup and cleanup work provided by the
717
855
  batch system, in the form of pickled Python context manager objects,
718
856
  that the worker can then run before/after the job on the batch
719
- system's behalf.""")
857
+ system's behalf.""",
858
+ )
720
859
 
721
860
  return parser.parse_args(args)
722
861
 
723
862
 
724
863
  @contextmanager
725
- def in_contexts(contexts: List[str]) -> Iterator[None]:
864
+ def in_contexts(contexts: list[str]) -> Iterator[None]:
726
865
  """
727
866
  Unpickle and enter all the pickled, base64-encoded context managers in the
728
867
  given list. Then do the body, then leave them all.
@@ -736,10 +875,12 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
736
875
  rest = contexts[1:]
737
876
 
738
877
  try:
739
- manager = pickle.loads(base64.b64decode(first.encode('utf-8')))
878
+ manager = pickle.loads(base64.b64decode(first.encode("utf-8")))
740
879
  except:
741
880
  exc_info = sys.exc_info()
742
- logger.error('Exception while unpickling context manager: ', exc_info=exc_info)
881
+ logger.error(
882
+ "Exception while unpickling context manager: ", exc_info=exc_info
883
+ )
743
884
  raise
744
885
 
745
886
  with manager:
@@ -749,14 +890,14 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
749
890
  yield
750
891
 
751
892
 
752
- def main(argv: Optional[List[str]] = None) -> None:
893
+ def main(argv: Optional[list[str]] = None) -> None:
753
894
  if argv is None:
754
895
  argv = sys.argv
755
896
  # Parse our command line
756
897
  options = parse_args(argv)
757
898
 
758
899
  ##########################################
759
- #Load the jobStore/config file
900
+ # Load the jobStore/config file
760
901
  ##########################################
761
902
 
762
903
  job_store = Toil.resumeJobStore(options.jobStoreLocator)