toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/worker.py CHANGED
@@ -25,24 +25,31 @@ import stat
 import sys
 import time
 import traceback
+from collections.abc import Iterator
 from contextlib import contextmanager
-from typing import Any, Callable, Iterator, List, Optional
+from typing import Any, Callable, Optional

 from configargparse import ArgParser

 from toil import logProcessContext
 from toil.common import Config, Toil, safeUnpickleFromStream
-from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
-                            CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE)
+from toil.cwl.utils import (
+    CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
+    CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
+)
 from toil.deferred import DeferredFunctionManager
 from toil.fileStores.abstractFileStore import AbstractFileStore
-from toil.job import CheckpointJobDescription, Job, JobDescription
+from toil.job import (
+    CheckpointJobDescription,
+    DebugStoppingPointReached,
+    Job,
+    JobDescription,
+)
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.expando import MagicExpando
 from toil.lib.io import make_public_dir
-from toil.lib.resources import (get_total_cpu_time,
-                                get_total_cpu_time_and_memory_usage)
-from toil.statsAndLogging import configure_root_logger, set_log_level
+from toil.lib.resources import ResourceMonitor
+from toil.statsAndLogging import configure_root_logger, install_log_color, set_log_level

 logger = logging.getLogger(__name__)

@@ -50,36 +57,55 @@ logger = logging.getLogger(__name__)
 class StatsDict(MagicExpando):
     """Subclass of MagicExpando for type-checking purposes."""

-    jobs: List[str]
+    jobs: list[MagicExpando]


-def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, config: Config) -> Optional[JobDescription]:
+def nextChainable(
+    predecessor: JobDescription, job_store: AbstractJobStore, config: Config
+) -> Optional[JobDescription]:
     """
     Returns the next chainable job's JobDescription after the given predecessor
     JobDescription, if one exists, or None if the chain must terminate.

     :param predecessor: The job to chain from
-    :param jobStore: The JobStore to fetch JobDescriptions from.
+    :param job_store: The JobStore to fetch JobDescriptions from.
     :param config: The configuration for the current run.
     """
-    #If no more jobs to run or services not finished, quit
-    if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint != None):
-        logger.debug("Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
-                     predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint != None))
+    # If no more jobs to run or services not finished, quit
+    if (
+        predecessor.nextSuccessors() is None
+        or len(predecessor.services) > 0
+        or (
+            isinstance(predecessor, CheckpointJobDescription)
+            and predecessor.checkpoint is not None
+        )
+    ):
+        logger.debug(
+            "Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
+            predecessor.nextSuccessors() is None,
+            len(predecessor.services),
+            (
+                isinstance(predecessor, CheckpointJobDescription)
+                and predecessor.checkpoint is not None
+            ),
+        )
         return None

-
-    #Get the next set of jobs to run
-    jobs = list(predecessor.nextSuccessors())
+    # Get the next set of jobs to run
+    jobs = list(predecessor.nextSuccessors() or set())
     if len(jobs) == 0:
         # If there are no jobs, we might just not have any children.
-        logger.debug("Stopping running chain of jobs because job has no ready children or follow-ons")
+        logger.debug(
+            "Stopping running chain of jobs because job has no ready children or follow-ons"
+        )
         return None

-    #If there are 2 or more jobs to run in parallel we quit
+    # If there are 2 or more jobs to run in parallel we quit
     if len(jobs) >= 2:
-        logger.debug("No more jobs can run in series by this worker,"
-                     " it's got %i successors", len(jobs))
+        logger.debug(
+            "No more jobs can run in series by this worker," " it's got %i successors",
+            len(jobs),
+        )
         logger.debug("Two distinct successors are %s and %s", jobs[0], jobs[1])
         return None

@@ -89,10 +115,10 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
     logger.debug("%s would chain to ID %s", predecessor, successorID)

     # Load the successor JobDescription
-    successor = jobStore.load_job(successorID)
+    successor = job_store.load_job(successorID)

-    #We check the requirements of the successor to see if we can run it
-    #within the current worker
+    # We check the requirements of the successor to see if we can run it
+    # within the current worker
     if successor.memory > predecessor.memory:
         logger.debug("We need more memory for the next job, so finishing")
         return None
@@ -103,14 +129,20 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
         logger.debug("We need more disk for the next job, so finishing")
         return None
     if successor.preemptible != predecessor.preemptible:
-        logger.debug("Preemptibility is different for the next job, returning to the leader")
+        logger.debug(
+            "Preemptibility is different for the next job, returning to the leader"
+        )
         return None
     if successor.predecessorNumber > 1:
-        logger.debug("The next job has multiple predecessors; we must return to the leader.")
+        logger.debug(
+            "The next job has multiple predecessors; we must return to the leader."
+        )
         return None

     if len(successor.services) > 0:
-        logger.debug("The next job requires services that will not yet be started; we must return to the leader.")
+        logger.debug(
+            "The next job requires services that will not yet be started; we must return to the leader."
+        )
         return None

     if isinstance(successor, CheckpointJobDescription):
@@ -118,17 +150,43 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
         logger.debug("Next job is checkpoint, so finishing")
         return None

+    if (
+        not config.run_local_jobs_on_workers
+        and predecessor.local
+        and not successor.local
+    ):
+        # This job might be running on the leader, but the next job may not.
+        #
+        # TODO: Optimize by detecting whether we actually are on the leader,
+        # somehow.
+        logger.debug("Next job is not allowed to run on the leader, so finishing")
+        return None
+
     # Made it through! This job is chainable.
     return successor

-def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobStoreID: str, redirectOutputToLogFile: bool = True) -> int:
+
+def workerScript(
+    job_store: AbstractJobStore,
+    config: Config,
+    job_name: str,
+    job_store_id: str,
+    redirect_output_to_log_file: bool = True,
+    local_worker_temp_dir: Optional[str] = None,
+    debug_flags: Optional[set[str]] = None,
+) -> int:
     """
     Worker process script, runs a job.

-    :param jobStore: The JobStore to fetch JobDescriptions from.
+    :param job_store: The JobStore to fetch JobDescriptions from.
     :param config: The configuration for the current run.
-    :param jobName: The "job name" (a user friendly name) of the job to be run
-    :param jobStoreID: The job store ID of the job to be run
+    :param job_name: The "job name" (a user friendly name) of the job to be run
+    :param job_store_id: The job store ID of the job to be run
+    :param redirect_output_to_log_file: If False, log directly to the console
+        instead of capturing job output.
+    :param local_worker_temp_dir: The directory for the worker to work in. May
+        be recursively removed after the job runs.
+    :param debug_flags: Flags to set on each job before running it.

     :return int: 1 if a job failed, or 0 if all jobs succeeded
     """
@@ -136,8 +194,13 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
     configure_root_logger()
     set_log_level(config.logLevel)

+    if config.colored_logs:
+        install_log_color()
+
+    logger.debug("Worker started for job %s...", job_name)
+
     ##########################################
-    #Create the worker killer, if requested
+    # Create the worker killer, if requested
     ##########################################

     logFileByteReportLimit = config.maxLogFileSize
@@ -178,11 +241,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
     # before it does. Either way, init will have to clean it up for us.

     ##########################################
-    #Load the environment for the job
+    # Load the environment for the job
     ##########################################

-    #First load the environment for the job.
-    with jobStore.read_shared_file_stream("environment.pickle") as fileHandle:
+    # First load the environment for the job.
+    with job_store.read_shared_file_stream("environment.pickle") as fileHandle:
         environment = safeUnpickleFromStream(fileHandle)
     env_reject = {
         "TMPDIR",
@@ -199,15 +262,15 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
         "XDG_SESSION_ID",
         "XDG_RUNTIME_DIR",
         "XDG_DATA_DIRS",
-        "DBUS_SESSION_BUS_ADDRESS"
+        "DBUS_SESSION_BUS_ADDRESS",
     }
     for i in environment:
         if i == "PATH":
             # Handle path specially. Sometimes e.g. leader may not include
             # /bin, but the Toil appliance needs it.
-            if i in os.environ and os.environ[i] != '':
+            if i in os.environ and os.environ[i] != "":
                 # Use the provided PATH and then the local system's PATH
-                os.environ[i] = environment[i] + ':' + os.environ[i]
+                os.environ[i] = environment[i] + ":" + os.environ[i]
             else:
                 # Use the provided PATH only
                 os.environ[i] = environment[i]
@@ -215,42 +278,48 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             os.environ[i] = environment[i]
     # sys.path is used by __import__ to find modules
     if "PYTHONPATH" in environment:
-        for e in environment["PYTHONPATH"].split(':'):
-            if e != '':
+        for e in environment["PYTHONPATH"].split(":"):
+            if e != "":
                 sys.path.append(e)

     ##########################################
-    #Setup the temporary directories.
+    # Setup the temporary directories.
     ##########################################
     # Dir to put all this worker's temp files in.
     if config.workflowID is None:
         raise RuntimeError("The worker workflow ID was never set.")
     toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
     # Dir to put lock files in, ideally not on NFS.
-    toil_coordination_dir = Toil.get_local_workflow_coordination_dir(config.workflowID, config.workDir, config.coordination_dir)
-    localWorkerTempDir = make_public_dir(in_directory=toilWorkflowDir)
-    os.chmod(localWorkerTempDir, 0o755)
+    toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
+        config.workflowID, config.workDir, config.coordination_dir
+    )
+    if local_worker_temp_dir is None:
+        # Invent a temp directory to work in
+        local_worker_temp_dir = make_public_dir(toilWorkflowDir)
+    os.chmod(local_worker_temp_dir, 0o755)

     ##########################################
-    #Setup the logging
+    # Setup the logging
     ##########################################

-    #This is mildly tricky because we don't just want to
-    #redirect stdout and stderr for this Python process; we want to redirect it
-    #for this process and all children. Consequently, we can't just replace
-    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
-    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
+    # This is mildly tricky because we don't just want to
+    # redirect stdout and stderr for this Python process; we want to redirect it
+    # for this process and all children. Consequently, we can't just replace
+    # sys.stdout and sys.stderr; we need to mess with the underlying OS-level
+    # file descriptors. See <http://stackoverflow.com/a/11632982/402891>

-    #When we start, standard input is file descriptor 0, standard output is
-    #file descriptor 1, and standard error is file descriptor 2.
+    # When we start, standard input is file descriptor 0, standard output is
+    # file descriptor 1, and standard error is file descriptor 2.

     # Do we even want to redirect output? Let the config make us not do it.
-    redirectOutputToLogFile = redirectOutputToLogFile and not config.disableWorkerOutputCapture
+    redirect_output_to_log_file = (
+        redirect_output_to_log_file and not config.disableWorkerOutputCapture
+    )

-    #What file do we want to point FDs 1 and 2 to?
-    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
+    # What file do we want to point FDs 1 and 2 to?
+    tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt")

-    if redirectOutputToLogFile:
+    if redirect_output_to_log_file:
         # Announce that we are redirecting logging, and where it will now go.
         # This is only important if we are trying to manually trace a faulty worker invocation.
         logger.debug("Redirecting logging to %s", tempWorkerLogPath)
@@ -287,13 +356,15 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt

     jobAttemptFailed = False
     failure_exit_code = 1
+    first_job_cores = None
     statsDict = StatsDict()  # type: ignore[no-untyped-call]
     statsDict.jobs = []
-    statsDict.workers.logsToMaster = []
+    statsDict.workers.logs_to_leader = []
+    statsDict.workers.logging_user_streams = []

     def blockFn() -> bool:
         return True
-    listOfJobs = [jobName]
+
     job = None
     try:

@@ -312,18 +383,17 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
         # Load the JobDescription
         ##########################################

-        jobDesc = jobStore.load_job(jobStoreID)
-        listOfJobs[0] = str(jobDesc)
+        jobDesc = job_store.load_job(job_store_id)
         logger.debug("Parsed job description")

         ##########################################
         # Cleanup from any earlier invocation of the job
         ##########################################

-        if jobDesc.command is None:
+        if not jobDesc.has_body():
             logger.debug("Job description has no body to run.")
             # Cleanup jobs already finished
-            jobDesc.clear_nonexistent_dependents(jobStore)
+            jobDesc.clear_nonexistent_dependents(job_store)
             logger.debug("Cleaned up any references to completed successor jobs")

         # This cleans the old log file which may
@@ -331,14 +401,17 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
         oldLogFile = jobDesc.logJobStoreFileID
         if oldLogFile is not None:
             jobDesc.logJobStoreFileID = None
-            jobStore.update_job(jobDesc)  # Update first, before deleting any files
-            jobStore.delete_file(oldLogFile)
+            job_store.update_job(jobDesc)  # Update first, before deleting any files
+            job_store.delete_file(oldLogFile)

         ##########################################
         # If a checkpoint exists, restart from the checkpoint
         ##########################################

-        if isinstance(jobDesc, CheckpointJobDescription) and jobDesc.checkpoint is not None:
+        if (
+            isinstance(jobDesc, CheckpointJobDescription)
+            and jobDesc.checkpoint is not None
+        ):
             # The job is a checkpoint, and is being restarted after previously completing
             logger.debug("Job is a checkpoint")
             # If the checkpoint still has extant successors or services, its
@@ -350,75 +423,106 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
                 if jobDesc.remainingTryCount < 0:
                     raise RuntimeError("The try count of the job cannot be negative.")
                 jobDesc.remainingTryCount = max(0, jobDesc.remainingTryCount - 1)
-                jobDesc.restartCheckpoint(jobStore)
+                jobDesc.restartCheckpoint(job_store)
             # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
             # because of the job being a checkpoint
             else:
-                logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
-                #Delete any remnant files
-                list(map(jobStore.delete_file, list(filter(jobStore.file_exists, jobDesc.checkpointFilesToDelete))))
+                logger.debug(
+                    "The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete."
+                )
+                # Delete any remnant files
+                list(
+                    map(
+                        job_store.delete_file,
+                        list(
+                            filter(
+                                job_store.file_exists, jobDesc.checkpointFilesToDelete
+                            )
+                        ),
+                    )
+                )

         ##########################################
-        #Setup the stats, if requested
+        # Setup the stats, if requested
        ##########################################

         if config.stats:
-            startClock = get_total_cpu_time()
+            # Remember the cores from the first job, which is how many we have reserved for us.
+            statsDict.workers.requested_cores = jobDesc.cores
+            startClock = ResourceMonitor.get_total_cpu_time()

         startTime = time.time()
         while True:
             ##########################################
-            #Run the job body, if there is one
+            # Run the job body, if there is one
             ##########################################

             logger.info("Working on job %s", jobDesc)

-            if jobDesc.command is not None:
-                if not jobDesc.command.startswith("_toil "):
-                    raise RuntimeError("Job command must start with '_toil' before being converted to an executable command.")
-                logger.debug("Got a command to run: %s" % jobDesc.command)
+            if jobDesc.has_body():
                 # Load the job. It will use the same JobDescription we have been using.
-                job = Job.loadJob(jobStore, jobDesc)
+                job = Job.loadJob(job_store, jobDesc)
                 if isinstance(jobDesc, CheckpointJobDescription):
-                    # If it is a checkpoint job, save the command
-                    jobDesc.checkpoint = jobDesc.command
+                    # If it is a checkpoint job, set the checkpoint
+                    jobDesc.set_checkpoint()

                 logger.info("Loaded body %s from description %s", job, jobDesc)

+                if debug_flags:
+                    for flag in debug_flags:
+                        logger.debug("Turning on debug flag %s on job", flag)
+                        job.set_debug_flag(flag)
+
                 # Create a fileStore object for the job
-                fileStore = AbstractFileStore.createFileStore(jobStore, jobDesc, localWorkerTempDir, blockFn,
-                                                              caching=config.caching)
-                with job._executor(stats=statsDict if config.stats else None,
-                                   fileStore=fileStore):
-                    with deferredFunctionManager.open() as defer:
-                        with fileStore.open(job):
-                            # Get the next block function to wait on committing this job
-                            blockFn = fileStore.waitForCommit
-
-                            # Run the job, save new successors, and set up
-                            # locally (but don't commit) successor
-                            # relationships and job completion.
-                            # Pass everything as name=value because Cactus
-                            # likes to override _runner when it shouldn't and
-                            # it needs some hope of finding the arguments it
-                            # wants across multiple Toil versions. We also
-                            # still pass a jobGraph argument to placate old
-                            # versions of Cactus.
-                            job._runner(jobGraph=None, jobStore=jobStore, fileStore=fileStore, defer=defer)
-
-                        # When the executor for the job finishes it will
-                        # kick off a commit with the command link to the
-                        # job body cut.
-
-                # Accumulate messages from this job & any subsequent chained jobs
-                statsDict.workers.logsToMaster += fileStore.loggingMessages
+                fileStore = AbstractFileStore.createFileStore(
+                    job_store,
+                    jobDesc,
+                    local_worker_temp_dir,
+                    blockFn,
+                    caching=config.caching,
+                )
+                try:
+                    with job._executor(
+                        stats=statsDict if config.stats else None, fileStore=fileStore
+                    ):
+                        with deferredFunctionManager.open() as defer:
+                            with fileStore.open(job):
+                                # Get the next block function to wait on committing this job
+                                blockFn = fileStore.waitForCommit
+
+                                # Run the job, save new successors, and set up
+                                # locally (but don't commit) successor
+                                # relationships and job completion.
+                                # Pass everything as name=value because Cactus
+                                # likes to override _runner when it shouldn't and
+                                # it needs some hope of finding the arguments it
+                                # wants across multiple Toil versions. We also
+                                # still pass a jobGraph argument to placate old
+                                # versions of Cactus.
+                                job._runner(
+                                    jobGraph=None,
+                                    jobStore=job_store,
+                                    fileStore=fileStore,
+                                    defer=defer,
+                                )
+
+                        # When the executor for the job finishes it will
+                        # kick off a commit with the link to the job body
+                        # cut.
+                finally:
+                    # Accumulate messages from this job & any subsequent chained jobs.
+                    # Keep the messages even if the job fails.
+                    statsDict.workers.logs_to_leader += fileStore.logging_messages
+                    statsDict.workers.logging_user_streams += (
+                        fileStore.logging_user_streams
+                    )

                 logger.info("Completed body for %s", jobDesc)

             else:
-                #The command may be none, in which case
-                #the JobDescription is either a shell ready to be deleted or has
-                #been scheduled after a failure to cleanup
+                # The body may not be attached, in which case the
+                # JobDescription is either a shell ready to be deleted or has
+                # been scheduled after a failure to cleanup
                 logger.debug("No user job to run, so finishing")
                 break

@@ -426,9 +530,9 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
                 raise RuntimeError("The termination flag is set")

             ##########################################
-            #Establish if we can run another job within the worker
+            # Establish if we can run another job within the worker
             ##########################################
-            successor = nextChainable(jobDesc, jobStore, config)
+            successor = nextChainable(jobDesc, job_store, config)
             if successor is None or config.disableChaining:
                 # Can't chain any more jobs. We are going to stop.

@@ -449,17 +553,18 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt

             # Make sure nothing has gone wrong and we can really chain
             if jobDesc.memory < successor.memory:
-                raise RuntimeError("Cannot chain jobs. A job's memory cannot be less than it's successor.")
+                raise RuntimeError(
+                    "Cannot chain jobs. A job's memory cannot be less than it's successor."
+                )
             if jobDesc.cores < successor.cores:
-                raise RuntimeError("Cannot chain jobs. A job's cores cannot be less than it's successor.")
+                raise RuntimeError(
+                    "Cannot chain jobs. A job's cores cannot be less than it's successor."
+                )

             # Save the successor's original ID, so we can clean it (and its
             # body) up after we finish executing it.
             successorID = successor.jobStoreID

-            # add the successor to the list of jobs run
-            listOfJobs.append(str(successor))
-
             # Now we need to become that successor, under the original ID.
             successor.replace(jobDesc)
             jobDesc = successor
@@ -470,8 +575,13 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt

             # Build a fileStore to update the job and commit the replacement.
             # TODO: can we have a commit operation without an entire FileStore???
-            fileStore = AbstractFileStore.createFileStore(jobStore, jobDesc, localWorkerTempDir, blockFn,
-                                                          caching=config.caching)
+            fileStore = AbstractFileStore.createFileStore(
+                job_store,
+                jobDesc,
+                local_worker_temp_dir,
+                blockFn,
+                caching=config.caching,
+            )

             # Update blockFn to wait for that commit operation.
             blockFn = fileStore.waitForCommit
@@ -482,30 +592,70 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             logger.debug("Starting the next job")

         ##########################################
-        #Finish up the stats
+        # Finish up the stats
         ##########################################
         if config.stats:
-            totalCPUTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
+            totalCPUTime, totalMemoryUsage = (
+                ResourceMonitor.get_total_cpu_time_and_memory_usage()
+            )
             statsDict.workers.time = str(time.time() - startTime)
             statsDict.workers.clock = str(totalCPUTime - startClock)
             statsDict.workers.memory = str(totalMemoryUsage)
+            # Say the worker used the max disk we saw from any job
+            max_bytes = 0
+            for job_stats in statsDict.jobs:
+                if "disk" in job_stats:
+                    max_bytes = max(max_bytes, int(job_stats.disk))
+            statsDict.workers.disk = str(max_bytes)
+            # Count the jobs executed.
+            # TODO: toil stats could compute this but its parser is too general to hook into simply.
+            statsDict.workers.jobs_run = len(statsDict.jobs)

         # log the worker log path here so that if the file is truncated the path can still be found
-        if redirectOutputToLogFile:
-            logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)
-
-        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
+        if redirect_output_to_log_file:
+            logger.info(
+                "Worker log can be found at %s. Set --cleanWorkDir to retain this log",
+                local_worker_temp_dir,
+            )
+
+        logger.info(
+            "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
+            time.time() - startTime,
+        )

     ##########################################
-    #Trapping where worker goes wrong
+    # Trapping where worker goes wrong
     ##########################################
-    except Exception as e: #Case that something goes wrong in worker
-        traceback.print_exc()
-        logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
+    except DebugStoppingPointReached:
+        # Job wants the worker to stop for debugging
+        raise
+    except (
+        BaseException
+    ) as e:  # Case that something goes wrong in worker, or we are asked to stop
+        if not isinstance(e, SystemExit):
+            logger.critical(
+                "Worker crashed with traceback:\n%s", traceback.format_exc()
+            )
+        logger.error(
+            "Exiting the worker because of a failed job on host %s",
+            socket.gethostname(),
+        )
         if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
             # We need to inform the leader that this is a CWL workflow problem
             # and it needs to inform its caller.
             failure_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
+        elif isinstance(e, SystemExit) and isinstance(e.code, int) and e.code != 0:
+            # We're meant to be exiting with a particular code.
+            failure_exit_code = e.code
+        else:
+            try:
+                from WDL.runtime.error import CommandFailed
+
+                if isinstance(e, CommandFailed):
+                    failure_exit_code = e.exit_status
+            except ImportError:
+                # WDL dependency not available
+                pass
         AbstractFileStore._terminateEvent.set()
     finally:
         # Get rid of our deferred function manager now so we can't mistake it
@@ -521,16 +671,15 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             logger.debug("cwltool.main._terminate_processess exception: %s", (e))
             raise e

-
     ##########################################
-    #Wait for the asynchronous chain of writes/updates to finish
+    # Wait for the asynchronous chain of writes/updates to finish
     ##########################################

     blockFn()

     ##########################################
-    #All the asynchronous worker/update threads must be finished now,
-    #so safe to test if they completed okay
+    # All the asynchronous worker/update threads must be finished now,
+    # so safe to test if they completed okay
     ##########################################

     if AbstractFileStore._terminateEvent.is_set():
@@ -538,19 +687,19 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt

         # Clobber any garbage state we have for this job from failing with
         # whatever good state is still stored in the JobStore
-        jobDesc = jobStore.load_job(jobStoreID)
+        jobDesc = job_store.load_job(job_store_id)
         # Remember that we failed
         jobAttemptFailed = True

     ##########################################
-    #Cleanup
+    # Cleanup
     ##########################################

     # Close the worker logging
     # Flush at the Python level
     sys.stdout.flush()
     sys.stderr.flush()
-    if redirectOutputToLogFile:
+    if redirect_output_to_log_file:
         # Flush at the OS level
         os.fsync(1)
         os.fsync(2)
@@ -577,43 +726,66 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
     # relative to the end (since Python won't decode Unicode backward, or even
     # interpret seek offsets in characters for us). TODO: We may get invalid or
     # just different Unicode by breaking up a character at the boundary!
-    if jobAttemptFailed and redirectOutputToLogFile:
-        jobDesc.logJobStoreFileID = logJobStoreFileID = jobStore.getEmptyFileStoreID(
+    if jobAttemptFailed and redirect_output_to_log_file:
+        jobDesc.logJobStoreFileID = logJobStoreFileID = job_store.getEmptyFileStoreID(
             jobDesc.jobStoreID, cleanup=True
         )
-        jobDesc.chainedJobs = listOfJobs
-        with jobStore.update_file_stream(logJobStoreFileID) as w:
-            with open(tempWorkerLogPath, 'rb') as f:
-                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
+        with job_store.update_file_stream(logJobStoreFileID) as w:
+            with open(tempWorkerLogPath, "rb") as f:
+                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                     if logFileByteReportLimit > 0:
-                        f.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
+                        f.seek(
+                            -logFileByteReportLimit, 2
+                        )  # seek to last tooBig bytes of file
                     elif logFileByteReportLimit < 0:
-                        f.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
+                        f.seek(
+                            logFileByteReportLimit, 0
+                        )  # seek to first tooBig bytes of file
                 # Dump the possibly-invalid-Unicode bytes into the log file
-                w.write(f.read()) # TODO load file using a buffer
+                w.write(f.read())  # TODO load file using a buffer
         # Commit log file reference back to JobStore
-        jobStore.update_job(jobDesc)
+        job_store.update_job(jobDesc)

-    elif ((debugging or (config.writeLogsFromAllJobs and not jobDesc.local))
-          and redirectOutputToLogFile):  # write log messages
-        with open(tempWorkerLogPath, 'rb') as logFile:
+    elif (
+        debugging or (config.writeLogsFromAllJobs and not jobDesc.local)
+    ) and redirect_output_to_log_file:  # write log messages
+        with open(tempWorkerLogPath, "rb") as logFile:
             if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                 if logFileByteReportLimit > 0:
-                    logFile.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
+                    logFile.seek(
+                        -logFileByteReportLimit, 2
+                    )  # seek to last tooBig bytes of file
                 elif logFileByteReportLimit < 0:
-                    logFile.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
+                    logFile.seek(
+                        logFileByteReportLimit, 0
+                    )  # seek to first tooBig bytes of file
             # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
             # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
-            logMessages = [line.decode('utf-8', 'skip') for line in logFile.read().splitlines()]
-            statsDict.logs.names = listOfJobs
+            logMessages = [
+                line.decode("utf-8", "skip") for line in logFile.read().splitlines()
+            ]
+            statsDict.logs.names = [names.stats_name for names in jobDesc.get_chain()]
             statsDict.logs.messages = logMessages

-    if (debugging or config.stats or statsDict.workers.logsToMaster) and not jobAttemptFailed:  # We have stats/logging to report back
-        jobStore.write_logs(json.dumps(statsDict, ensure_ascii=True))
+    if (
+        debugging
+        or config.stats
+        or statsDict.workers.logs_to_leader
+        or statsDict.workers.logging_user_streams
+    ):
+        # We have stats/logging to report back.
+        # We report even if the job attempt failed.
+        # TODO: Will that upset analysis of the stats?
+        job_store.write_logs(json.dumps(statsDict, ensure_ascii=True))

     # Remove the temp dir
     cleanUp = config.cleanWorkDir
-    if cleanUp == 'always' or (cleanUp == 'onSuccess' and not jobAttemptFailed) or (cleanUp == 'onError' and jobAttemptFailed):
+    if (
+        cleanUp == "always"
+        or (cleanUp == "onSuccess" and not jobAttemptFailed)
+        or (cleanUp == "onError" and jobAttemptFailed)
+    ):
+
         def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None:
             """
             When encountering an error removing a file or directory, make sure
@@ -624,24 +796,32 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             """
             # Just chmod it for rwx for user. This can't work anyway if it isn't ours.
             try:
-                os.chmod(os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
+                os.chmod(
+                    os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
+                )
             except PermissionError as e:
-                logger.error('Could not set permissions on %s to allow cleanup of %s: %s', os.path.dirname(path), path, e)
-        shutil.rmtree(localWorkerTempDir, onerror=make_parent_writable)
+                logger.error(
+                    "Could not set permissions on %s to allow cleanup of %s: %s",
+                    os.path.dirname(path),
+                    path,
+                    e,
+                )
+
+        shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable)

     # This must happen after the log file is done with, else there is no place to put the log
     if (not jobAttemptFailed) and jobDesc.is_subtree_done():
-        # We can now safely get rid of the JobDescription, and all jobs it chained up
-        for otherID in jobDesc.merged_jobs:
-            jobStore.delete_job(otherID)
-        jobStore.delete_job(str(jobDesc.jobStoreID))
+        for merged_in in jobDesc.get_chain():
+            # We can now safely get rid of the JobDescription, and all jobs it chained up
+            job_store.delete_job(merged_in.job_store_id)

     if jobAttemptFailed:
         return failure_exit_code
     else:
         return 0

-def parse_args(args: List[str]) -> Any:
+
+def parse_args(args: list[str]) -> Any:
     """
     Parse command-line arguments to the worker.
     """
@@ -655,26 +835,33 @@ def parse_args(args: List[str]) -> Any:
     # Now add all the options to it

     # Base required job information
-    parser.add_argument("jobName", type=str,
-                        help="Text name of the job being run")
-    parser.add_argument("jobStoreLocator", type=str,
-                        help="Information required to connect to the job store")
-    parser.add_argument("jobStoreID", type=str,
-                        help="ID of the job within the job store")
+    parser.add_argument("jobName", type=str, help="Text name of the job being run")
+    parser.add_argument(
+        "jobStoreLocator",
+        type=str,
+        help="Information required to connect to the job store",
+    )
+    parser.add_argument(
+        "jobStoreID", type=str, help="ID of the job within the job store"
+    )

     # Additional worker abilities
-    parser.add_argument("--context", default=[], action="append",
+    parser.add_argument(
+        "--context",
+        default=[],
+        action="append",
         help="""Pickled, base64-encoded context manager(s) to run job inside of.
         Allows the Toil leader to pass setup and cleanup work provided by the
         batch system, in the form of pickled Python context manager objects,
         that the worker can then run before/after the job on the batch
-        system's behalf.""")
+        system's behalf.""",
+    )

     return parser.parse_args(args)


 @contextmanager
-def in_contexts(contexts: List[str]) -> Iterator[None]:
+def in_contexts(contexts: list[str]) -> Iterator[None]:
     """
     Unpickle and enter all the pickled, base64-encoded context managers in the
     given list. Then do the body, then leave them all.
@@ -688,10 +875,12 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
     rest = contexts[1:]

     try:
-        manager = pickle.loads(base64.b64decode(first.encode('utf-8')))
+        manager = pickle.loads(base64.b64decode(first.encode("utf-8")))
     except:
         exc_info = sys.exc_info()
-        logger.error('Exception while unpickling context manager: ', exc_info=exc_info)
+        logger.error(
+            "Exception while unpickling context manager: ", exc_info=exc_info
+        )
         raise

     with manager:
@@ -701,28 +890,22 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
         yield


-def main(argv: Optional[List[str]] = None) -> None:
+def main(argv: Optional[list[str]] = None) -> None:
     if argv is None:
         argv = sys.argv
-
     # Parse our command line
     options = parse_args(argv)

-    # Parse input args
-    jobName = argv[1]
-    jobStoreLocator = argv[2]
-    jobStoreID = argv[3]
-
     ##########################################
-    #Load the jobStore/config file
+    # Load the jobStore/config file
     ##########################################

-    jobStore = Toil.resumeJobStore(options.jobStoreLocator)
-    config = jobStore.config
+    job_store = Toil.resumeJobStore(options.jobStoreLocator)
+    config = job_store.config

     with in_contexts(options.context):
         # Call the worker
-        exit_code = workerScript(jobStore, config, options.jobName, options.jobStoreID)
+        exit_code = workerScript(job_store, config, options.jobName, options.jobStoreID)

     # Exit with its return value
     sys.exit(exit_code)
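
Note on the --context option shown above: in_contexts() decodes each value with pickle.loads(base64.b64decode(value.encode("utf-8"))), so whatever launches the worker must produce the inverse encoding. A minimal sketch of building such a value follows; the WorkerContext class is a hypothetical example (not part of Toil), and any picklable context manager whose class is importable on the worker would work the same way.

import base64
import pickle


class WorkerContext:
    """Hypothetical context manager a batch system might wrap around a job.

    Must live at module top level in a module importable on the worker,
    or unpickling there will fail.
    """

    def __enter__(self) -> None:
        print("batch system setup")

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        print("batch system cleanup")


# Mirror in_contexts(): pickle the instance, then base64-encode it into
# command-line-safe text.
encoded = base64.b64encode(pickle.dumps(WorkerContext())).decode("utf-8")

# The worker entry point could then be invoked with something like:
#   _toil_worker <jobName> <jobStoreLocator> <jobStoreID> --context <encoded>
print(encoded)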