toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. toil/__init__.py +124 -86
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +137 -77
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
  5. toil/batchSystems/awsBatch.py +237 -128
  6. toil/batchSystems/cleanup_support.py +22 -16
  7. toil/batchSystems/contained_executor.py +30 -26
  8. toil/batchSystems/gridengine.py +85 -49
  9. toil/batchSystems/htcondor.py +164 -87
  10. toil/batchSystems/kubernetes.py +622 -386
  11. toil/batchSystems/local_support.py +17 -12
  12. toil/batchSystems/lsf.py +132 -79
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +288 -149
  16. toil/batchSystems/mesos/executor.py +77 -49
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +39 -29
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +293 -123
  21. toil/batchSystems/slurm.py +651 -155
  22. toil/batchSystems/torque.py +46 -32
  23. toil/bus.py +141 -73
  24. toil/common.py +784 -397
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1137 -534
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +62 -41
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +88 -57
  32. toil/fileStores/cachingFileStore.py +711 -247
  33. toil/fileStores/nonCachingFileStore.py +113 -75
  34. toil/job.py +1031 -349
  35. toil/jobStores/abstractJobStore.py +387 -243
  36. toil/jobStores/aws/jobStore.py +772 -412
  37. toil/jobStores/aws/utils.py +161 -109
  38. toil/jobStores/conftest.py +1 -0
  39. toil/jobStores/fileJobStore.py +289 -151
  40. toil/jobStores/googleJobStore.py +137 -70
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +614 -269
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +55 -28
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +204 -58
  49. toil/lib/aws/utils.py +290 -213
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +83 -49
  53. toil/lib/docker.py +131 -103
  54. toil/lib/dockstore.py +379 -0
  55. toil/lib/ec2.py +322 -209
  56. toil/lib/ec2nodes.py +174 -105
  57. toil/lib/encryption/_dummy.py +5 -3
  58. toil/lib/encryption/_nacl.py +10 -6
  59. toil/lib/encryption/conftest.py +1 -0
  60. toil/lib/exceptions.py +26 -7
  61. toil/lib/expando.py +4 -2
  62. toil/lib/ftp_utils.py +217 -0
  63. toil/lib/generatedEC2Lists.py +127 -19
  64. toil/lib/history.py +1271 -0
  65. toil/lib/history_submission.py +681 -0
  66. toil/lib/humanize.py +6 -2
  67. toil/lib/io.py +121 -12
  68. toil/lib/iterables.py +4 -2
  69. toil/lib/memoize.py +12 -8
  70. toil/lib/misc.py +83 -18
  71. toil/lib/objects.py +2 -2
  72. toil/lib/resources.py +19 -7
  73. toil/lib/retry.py +125 -87
  74. toil/lib/threading.py +282 -80
  75. toil/lib/throttle.py +15 -14
  76. toil/lib/trs.py +390 -0
  77. toil/lib/web.py +38 -0
  78. toil/options/common.py +850 -402
  79. toil/options/cwl.py +185 -90
  80. toil/options/runner.py +50 -0
  81. toil/options/wdl.py +70 -19
  82. toil/provisioners/__init__.py +111 -46
  83. toil/provisioners/abstractProvisioner.py +322 -157
  84. toil/provisioners/aws/__init__.py +62 -30
  85. toil/provisioners/aws/awsProvisioner.py +980 -627
  86. toil/provisioners/clusterScaler.py +541 -279
  87. toil/provisioners/gceProvisioner.py +283 -180
  88. toil/provisioners/node.py +147 -79
  89. toil/realtimeLogger.py +34 -22
  90. toil/resource.py +137 -75
  91. toil/server/app.py +127 -61
  92. toil/server/celery_app.py +3 -1
  93. toil/server/cli/wes_cwl_runner.py +84 -55
  94. toil/server/utils.py +56 -31
  95. toil/server/wes/abstract_backend.py +64 -26
  96. toil/server/wes/amazon_wes_utils.py +21 -15
  97. toil/server/wes/tasks.py +121 -63
  98. toil/server/wes/toil_backend.py +142 -107
  99. toil/server/wsgi_app.py +4 -3
  100. toil/serviceManager.py +58 -22
  101. toil/statsAndLogging.py +183 -65
  102. toil/test/__init__.py +263 -179
  103. toil/test/batchSystems/batchSystemTest.py +438 -195
  104. toil/test/batchSystems/batch_system_plugin_test.py +18 -7
  105. toil/test/batchSystems/test_gridengine.py +173 -0
  106. toil/test/batchSystems/test_lsf_helper.py +67 -58
  107. toil/test/batchSystems/test_slurm.py +265 -49
  108. toil/test/cactus/test_cactus_integration.py +20 -22
  109. toil/test/cwl/conftest.py +39 -0
  110. toil/test/cwl/cwlTest.py +375 -72
  111. toil/test/cwl/measure_default_memory.cwl +12 -0
  112. toil/test/cwl/not_run_required_input.cwl +29 -0
  113. toil/test/cwl/optional-file.cwl +18 -0
  114. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  115. toil/test/docs/scriptsTest.py +60 -34
  116. toil/test/jobStores/jobStoreTest.py +412 -235
  117. toil/test/lib/aws/test_iam.py +116 -48
  118. toil/test/lib/aws/test_s3.py +16 -9
  119. toil/test/lib/aws/test_utils.py +5 -6
  120. toil/test/lib/dockerTest.py +118 -141
  121. toil/test/lib/test_conversions.py +113 -115
  122. toil/test/lib/test_ec2.py +57 -49
  123. toil/test/lib/test_history.py +212 -0
  124. toil/test/lib/test_misc.py +12 -5
  125. toil/test/lib/test_trs.py +161 -0
  126. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  127. toil/test/mesos/helloWorld.py +7 -6
  128. toil/test/mesos/stress.py +25 -20
  129. toil/test/options/options.py +7 -2
  130. toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
  131. toil/test/provisioners/clusterScalerTest.py +440 -250
  132. toil/test/provisioners/clusterTest.py +81 -42
  133. toil/test/provisioners/gceProvisionerTest.py +174 -100
  134. toil/test/provisioners/provisionerTest.py +25 -13
  135. toil/test/provisioners/restartScript.py +5 -4
  136. toil/test/server/serverTest.py +188 -141
  137. toil/test/sort/restart_sort.py +137 -68
  138. toil/test/sort/sort.py +134 -66
  139. toil/test/sort/sortTest.py +91 -49
  140. toil/test/src/autoDeploymentTest.py +140 -100
  141. toil/test/src/busTest.py +20 -18
  142. toil/test/src/checkpointTest.py +8 -2
  143. toil/test/src/deferredFunctionTest.py +49 -35
  144. toil/test/src/dockerCheckTest.py +33 -26
  145. toil/test/src/environmentTest.py +20 -10
  146. toil/test/src/fileStoreTest.py +538 -271
  147. toil/test/src/helloWorldTest.py +7 -4
  148. toil/test/src/importExportFileTest.py +61 -31
  149. toil/test/src/jobDescriptionTest.py +32 -17
  150. toil/test/src/jobEncapsulationTest.py +2 -0
  151. toil/test/src/jobFileStoreTest.py +74 -50
  152. toil/test/src/jobServiceTest.py +187 -73
  153. toil/test/src/jobTest.py +120 -70
  154. toil/test/src/miscTests.py +19 -18
  155. toil/test/src/promisedRequirementTest.py +82 -36
  156. toil/test/src/promisesTest.py +7 -6
  157. toil/test/src/realtimeLoggerTest.py +6 -6
  158. toil/test/src/regularLogTest.py +71 -37
  159. toil/test/src/resourceTest.py +80 -49
  160. toil/test/src/restartDAGTest.py +36 -22
  161. toil/test/src/resumabilityTest.py +9 -2
  162. toil/test/src/retainTempDirTest.py +45 -14
  163. toil/test/src/systemTest.py +12 -8
  164. toil/test/src/threadingTest.py +44 -25
  165. toil/test/src/toilContextManagerTest.py +10 -7
  166. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  167. toil/test/src/workerTest.py +33 -16
  168. toil/test/utils/toilDebugTest.py +70 -58
  169. toil/test/utils/toilKillTest.py +4 -5
  170. toil/test/utils/utilsTest.py +239 -102
  171. toil/test/wdl/wdltoil_test.py +789 -148
  172. toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
  173. toil/toilState.py +52 -26
  174. toil/utils/toilConfig.py +13 -4
  175. toil/utils/toilDebugFile.py +44 -27
  176. toil/utils/toilDebugJob.py +85 -25
  177. toil/utils/toilDestroyCluster.py +11 -6
  178. toil/utils/toilKill.py +8 -3
  179. toil/utils/toilLaunchCluster.py +251 -145
  180. toil/utils/toilMain.py +37 -16
  181. toil/utils/toilRsyncCluster.py +27 -14
  182. toil/utils/toilSshCluster.py +45 -22
  183. toil/utils/toilStats.py +75 -36
  184. toil/utils/toilStatus.py +226 -119
  185. toil/utils/toilUpdateEC2Instances.py +3 -1
  186. toil/version.py +6 -6
  187. toil/wdl/utils.py +5 -5
  188. toil/wdl/wdltoil.py +3528 -1053
  189. toil/worker.py +370 -149
  190. toil-8.1.0b1.dist-info/METADATA +178 -0
  191. toil-8.1.0b1.dist-info/RECORD +259 -0
  192. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
  193. toil-7.0.0.dist-info/METADATA +0 -158
  194. toil-7.0.0.dist-info/RECORD +0 -244
  195. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
  196. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
  197. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/lib/threading.py CHANGED
@@ -21,12 +21,16 @@ import fcntl
21
21
  import logging
22
22
  import math
23
23
  import os
24
+ import platform
25
+ import subprocess
24
26
  import sys
25
27
  import tempfile
26
28
  import threading
29
+ import time
27
30
  import traceback
31
+ from collections.abc import Iterator
28
32
  from contextlib import contextmanager
29
- from typing import Dict, Iterator, Optional, Union, cast
33
+ from typing import Optional, Union, cast
30
34
 
31
35
  import psutil
32
36
 
@@ -36,6 +40,144 @@ from toil.lib.io import robust_rmtree
36
40
  logger = logging.getLogger(__name__)
37
41
 
38
42
 
43
+ def ensure_filesystem_lockable(
44
+ path: str, timeout: float = 30, hint: Optional[str] = None
45
+ ) -> None:
46
+ """
47
+ Make sure that the filesystem used at the given path is one where locks are safe to use.
48
+
49
+ File locks are not safe to use on Ceph. See
50
+ <https://github.com/DataBiosphere/toil/issues/4972>.
51
+
52
+ Raises an exception if the filesystem is detected as one where using locks
53
+ is known to trigger bugs in the filesystem implementation. Also raises an
54
+ exception if the given path does not exist, or if attempting to determine
55
+ the filesystem type takes more than the timeout in seconds.
56
+
57
+ If the filesystem type cannot be determined, does nothing.
58
+
59
+ :param hint: Extra text to include in an error, if raised, telling the user
60
+ how to change the offending path.
61
+ """
62
+
63
+ if not os.path.exists(path):
64
+ # Raise a normal-looking FileNotFoundError. See <https://stackoverflow.com/a/36077407>
65
+ raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
66
+
67
+ if platform.system() == "Linux":
68
+ # We know how to find the filesystem here.
69
+
70
+ try:
71
+ # Start a child process to stat the path. See <https://unix.stackexchange.com/a/402236>.
72
+ # We really should call statfs but no bindings for it are in PyPI.
73
+ completed = subprocess.run(
74
+ ["stat", "-f", "-c", "%T", path],
75
+ check=True,
76
+ capture_output=True,
77
+ timeout=timeout,
78
+ )
79
+ except subprocess.TimeoutExpired as e:
80
+ # The subprocess itself is Too Slow
81
+ raise RuntimeError(
82
+ f"Polling filesystem type at {path} took more than {timeout} seconds; is your filesystem working?"
83
+ ) from e
84
+ except subprocess.CalledProcessError as e:
85
+ # Stat didn't work. Maybe we don't have the right version of stat installed?
86
+ logger.warning(
87
+ "Could not determine filesystem type at %s because of: %s",
88
+ path,
89
+ e.stderr.decode("utf-8", errors="replace").strip(),
90
+ )
91
+ # If we don't know the filesystem type, keep going anyway.
92
+ return
93
+
94
+ filesystem_type = completed.stdout.decode("utf-8", errors="replace").strip()
95
+
96
+ if filesystem_type == "ceph":
97
+ # Ceph is known to deadlock the MDS and break the parent directory when locking.
98
+ message = [
99
+ f"Refusing to use {path} because file locks are known to break {filesystem_type} filesystems."
100
+ ]
101
+ if hint:
102
+ # Hint the user how to fix this.
103
+ message.append(hint)
104
+ raise RuntimeError(" ".join(message))
105
+ else:
106
+ # Other filesystem types are fine (even though NFS is sometimes
107
+ # flaky with regard to locks actually locking anything).
108
+ logger.debug(
109
+ "Detected that %s has lockable filesystem type: %s",
110
+ path,
111
+ filesystem_type,
112
+ )
113
+
114
+ # Other platforms (Mac) probably aren't mounting Ceph and also don't
115
+ # usually use the same stat binary implementation.
116
+
117
+
118
+ def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None:
119
+ """
120
+ Get an fcntl lock, while retrying on IO errors.
121
+
122
+ Raises OSError with EACCES or EAGAIN when a nonblocking lock is not
123
+ immediately available.
124
+ """
125
+
126
+ # Set up retry logic. TODO: Use @retry instead.
127
+ error_backoff = 1
128
+ MAX_ERROR_TRIES = 10
129
+ error_tries = 0
130
+
131
+ while True:
132
+ try:
133
+ # Wait until we can exclusively lock it.
134
+ lock_mode = (fcntl.LOCK_SH if shared else fcntl.LOCK_EX) | (
135
+ fcntl.LOCK_NB if not block else 0
136
+ )
137
+ fcntl.flock(fd, lock_mode)
138
+ return
139
+ except OSError as e:
140
+ if e.errno in (errno.EACCES, errno.EAGAIN):
141
+ # Nonblocking lock not available.
142
+ raise
143
+ elif e.errno == errno.EIO:
144
+ # Sometimes Ceph produces IO errors when talking to lock files.
145
+ # Back off and try again.
146
+ # TODO: Should we eventually give up if the disk really is
147
+ # broken? If so we should use the retry system.
148
+ if error_tries < MAX_ERROR_TRIES:
149
+ logger.error(
150
+ "IO error talking to lock file. Retrying after %s seconds.",
151
+ error_backoff,
152
+ )
153
+ time.sleep(error_backoff)
154
+ error_backoff = min(60, error_backoff * 2)
155
+ error_tries += 1
156
+ continue
157
+ else:
158
+ logger.critical(
159
+ "Too many IO errors talking to lock file. If using Ceph, check for MDS deadlocks. See <https://tracker.ceph.com/issues/62123>."
160
+ )
161
+ raise
162
+ else:
163
+ raise
164
+
165
+
166
+ def safe_unlock_and_close(fd: int) -> None:
167
+ """
168
+ Release an fcntl lock and close the file descriptor, while handling fcntl IO errors.
169
+ """
170
+ try:
171
+ fcntl.flock(fd, fcntl.LOCK_UN)
172
+ except OSError as e:
173
+ if e.errno != errno.EIO:
174
+ raise
175
+ # Sometimes Ceph produces EIO. We don't need to retry then because
176
+ # we're going to close the FD and after that the file can't remain
177
+ # locked by us.
178
+ os.close(fd)
179
+
180
+
39
181
  class ExceptionalThread(threading.Thread):
40
182
  """
41
183
  A thread whose join() method re-raises exceptions raised during run(). While join() is
@@ -65,6 +207,7 @@ class ExceptionalThread(threading.Thread):
65
207
  AssertionError
66
208
 
67
209
  """
210
+
68
211
  exc_info = None
69
212
 
70
213
  def run(self) -> None:
@@ -103,21 +246,23 @@ def cpu_count() -> int:
103
246
  :rtype: int
104
247
  """
105
248
 
106
- cached = getattr(cpu_count, 'result', None)
249
+ cached = getattr(cpu_count, "result", None)
107
250
  if cached is not None:
108
251
  # We already got a CPU count.
109
252
  return cast(int, cached)
110
253
 
111
254
  # Get the fallback answer of all the CPUs on the machine
112
- psutil_cpu_count = cast(Optional[int], psutil.cpu_count(logical=True))
255
+ psutil_cpu_count = psutil.cpu_count(logical=True)
113
256
  if psutil_cpu_count is None:
114
- logger.debug('Could not retrieve the logical CPU count.')
257
+ logger.debug("Could not retrieve the logical CPU count.")
115
258
 
116
- total_machine_size: Union[float, int] = psutil_cpu_count if psutil_cpu_count is not None else float('inf')
117
- logger.debug('Total machine size: %s core(s)', total_machine_size)
259
+ total_machine_size: Union[float, int] = (
260
+ psutil_cpu_count if psutil_cpu_count is not None else float("inf")
261
+ )
262
+ logger.debug("Total machine size: %s core(s)", total_machine_size)
118
263
 
119
264
  # cgroups may limit the size
120
- cgroup_size: Union[float, int] = float('inf')
265
+ cgroup_size: Union[float, int] = float("inf")
121
266
 
122
267
  try:
123
268
  # See if we can fetch these and use them
@@ -125,13 +270,13 @@ def cpu_count() -> int:
125
270
  period: Optional[int] = None
126
271
 
127
272
  # CGroups v1 keeps quota and period separate
128
- CGROUP1_QUOTA_FILE = '/sys/fs/cgroup/cpu/cpu.cfs_quota_us'
129
- CGROUP1_PERIOD_FILE = '/sys/fs/cgroup/cpu/cpu.cfs_period_us'
273
+ CGROUP1_QUOTA_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
274
+ CGROUP1_PERIOD_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
130
275
  # CGroups v2 keeps both in one file, space-separated, quota first
131
- CGROUP2_COMBINED_FILE = '/sys/fs/cgroup/cpu.max'
276
+ CGROUP2_COMBINED_FILE = "/sys/fs/cgroup/cpu.max"
132
277
 
133
278
  if os.path.exists(CGROUP1_QUOTA_FILE) and os.path.exists(CGROUP1_PERIOD_FILE):
134
- logger.debug('CPU quota and period available from cgroups v1')
279
+ logger.debug("CPU quota and period available from cgroups v1")
135
280
  with open(CGROUP1_QUOTA_FILE) as stream:
136
281
  # Read the quota
137
282
  quota = int(stream.read())
@@ -140,56 +285,58 @@ def cpu_count() -> int:
140
285
  # Read the period in which we are allowed to burn the quota
141
286
  period = int(stream.read())
142
287
  elif os.path.exists(CGROUP2_COMBINED_FILE):
143
- logger.debug('CPU quota and period available from cgroups v2')
288
+ logger.debug("CPU quota and period available from cgroups v2")
144
289
  with open(CGROUP2_COMBINED_FILE) as stream:
145
290
  # Read the quota and the period together
146
- quota, period = (int(part) for part in stream.read().split(' '))
291
+ quota, period = (int(part) for part in stream.read().split(" "))
147
292
  else:
148
- logger.debug('CPU quota/period not available from cgroups v1 or cgroups v2')
293
+ logger.debug("CPU quota/period not available from cgroups v1 or cgroups v2")
149
294
 
150
295
  if quota is not None and period is not None:
151
296
  # We got a quota and a period.
152
- logger.debug('CPU quota: %d period: %d', quota, period)
297
+ logger.debug("CPU quota: %d period: %d", quota, period)
153
298
 
154
299
  if quota == -1:
155
300
  # But the quota can be -1 for unset.
156
301
  # Assume we can use the whole machine.
157
- cgroup_size = float('inf')
302
+ cgroup_size = float("inf")
158
303
  else:
159
304
  # The thread count is how many multiples of a wall clock period we
160
305
  # can burn in that period.
161
- cgroup_size = int(math.ceil(float(quota)/float(period)))
306
+ cgroup_size = int(math.ceil(float(quota) / float(period)))
162
307
 
163
- logger.debug('Control group size in cores: %s', cgroup_size)
308
+ logger.debug("Control group size in cores: %s", cgroup_size)
164
309
  except:
165
310
  # We can't actually read these cgroup fields. Maybe we are a mac or something.
166
- logger.debug('Could not inspect cgroup: %s', traceback.format_exc())
311
+ logger.debug("Could not inspect cgroup: %s", traceback.format_exc())
167
312
 
168
313
  # CPU affinity may limit the size
169
- affinity_size: Union[float, int] = float('inf')
170
- if hasattr(os, 'sched_getaffinity'):
314
+ affinity_size: Union[float, int] = float("inf")
315
+ if hasattr(os, "sched_getaffinity"):
171
316
  try:
172
- logger.debug('CPU affinity available')
317
+ logger.debug("CPU affinity available")
173
318
  affinity_size = len(os.sched_getaffinity(0))
174
- logger.debug('CPU affinity is restricted to %d cores', affinity_size)
319
+ logger.debug("CPU affinity is restricted to %d cores", affinity_size)
175
320
  except:
176
- # We can't actually read this even though it exists.
177
- logger.debug('Could not inspect scheduling affinity: %s', traceback.format_exc())
321
+ # We can't actually read this even though it exists.
322
+ logger.debug(
323
+ "Could not inspect scheduling affinity: %s", traceback.format_exc()
324
+ )
178
325
  else:
179
- logger.debug('CPU affinity not available')
326
+ logger.debug("CPU affinity not available")
180
327
 
181
- limit: Union[float, int] = float('inf')
328
+ limit: Union[float, int] = float("inf")
182
329
  # Apply all the limits to take the smallest
183
330
  limit = min(limit, total_machine_size)
184
331
  limit = min(limit, cgroup_size)
185
332
  limit = min(limit, affinity_size)
186
- if limit < 1 or limit == float('inf'):
333
+ if limit < 1 or limit == float("inf"):
187
334
  # Fall back to 1 if we can't get a size
188
335
  limit = 1
189
336
  result = int(limit)
190
- logger.debug('cpu_count: %s', result)
337
+ logger.debug("cpu_count: %s", result)
191
338
  # Make sure to remember it for the next call
192
- setattr(cpu_count, 'result', result)
339
+ setattr(cpu_count, "result", result)
193
340
  return result
194
341
 
195
342
 
@@ -211,7 +358,8 @@ def cpu_count() -> int:
211
358
  current_process_name_lock = threading.Lock()
212
359
  # And a global dict from work directory to name in that work directory.
213
360
  # We also have a file descriptor per work directory but it is just leaked.
214
- current_process_name_for: Dict[str, str] = {}
361
+ current_process_name_for: dict[str, str] = {}
362
+
215
363
 
216
364
  def collect_process_name_garbage() -> None:
217
365
  """
@@ -235,6 +383,7 @@ def collect_process_name_garbage() -> None:
235
383
  for base_dir in missing:
236
384
  del current_process_name_for[base_dir]
237
385
 
386
+
238
387
  def destroy_all_process_names() -> None:
239
388
  """
240
389
  Delete all our process name files because our process is going away.
@@ -249,9 +398,11 @@ def destroy_all_process_names() -> None:
249
398
  for base_dir, name in current_process_name_for.items():
250
399
  robust_rmtree(os.path.join(base_dir, name))
251
400
 
401
+
252
402
  # Run the cleanup at exit
253
403
  atexit.register(destroy_all_process_names)
254
404
 
405
+
255
406
  def get_process_name(base_dir: str) -> str:
256
407
  """
257
408
  Return the name of the current process. Like a PID but visible between
@@ -280,10 +431,16 @@ def get_process_name(base_dir: str) -> str:
280
431
 
281
432
  # Lock the file. The lock will automatically go away if our process does.
282
433
  try:
283
- fcntl.lockf(nameFD, fcntl.LOCK_EX | fcntl.LOCK_NB)
434
+ safe_lock(nameFD, block=False)
284
435
  except OSError as e:
285
- # Someone else might have locked it even though they should not have.
286
- raise RuntimeError(f"Could not lock process name file {nameFileName}: {str(e)}")
436
+ if e.errno in (errno.EACCES, errno.EAGAIN):
437
+ # Someone else locked it even though they should not have.
438
+ raise RuntimeError(
439
+ f"Could not lock process name file {nameFileName}"
440
+ ) from e
441
+ else:
442
+ # Something else is wrong
443
+ raise
287
444
 
288
445
  # Save the basename
289
446
  current_process_name_for[base_dir] = os.path.basename(nameFileName)
@@ -321,20 +478,24 @@ def process_name_exists(base_dir: str, name: str) -> bool:
321
478
  # If the file is gone, the process can't exist.
322
479
  return False
323
480
 
324
-
325
481
  nameFD = None
326
482
  try:
327
483
  try:
328
484
  # Otherwise see if we can lock it shared, for which we need an FD, but
329
485
  # only for reading.
330
486
  nameFD = os.open(nameFileName, os.O_RDONLY)
331
- fcntl.lockf(nameFD, fcntl.LOCK_SH | fcntl.LOCK_NB)
332
487
  except FileNotFoundError as e:
333
488
  # File has vanished
334
489
  return False
490
+ try:
491
+ safe_lock(nameFD, block=False, shared=True)
335
492
  except OSError as e:
336
- # Could not lock. Process is alive.
337
- return True
493
+ if e.errno in (errno.EACCES, errno.EAGAIN):
494
+ # Could not lock. Process is alive.
495
+ return True
496
+ else:
497
+ # Something else went wrong
498
+ raise
338
499
  else:
339
500
  # Could lock. Process is dead.
340
501
  # Remove the file. We race to be the first to do so.
@@ -342,8 +503,8 @@ def process_name_exists(base_dir: str, name: str) -> bool:
342
503
  os.remove(nameFileName)
343
504
  except:
344
505
  pass
345
- # Unlock
346
- fcntl.lockf(nameFD, fcntl.LOCK_UN)
506
+ safe_unlock_and_close(nameFD)
507
+ nameFD = None
347
508
  # Report process death
348
509
  return False
349
510
  finally:
@@ -353,6 +514,7 @@ def process_name_exists(base_dir: str, name: str) -> bool:
353
514
  except:
354
515
  pass
355
516
 
517
+
356
518
  # Similar to the process naming system above, we define a global mutex system
357
519
  # for critical sections, based just around file locks.
358
520
  @contextmanager
@@ -372,21 +534,34 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
372
534
  if not os.path.isdir(base_dir):
373
535
  raise RuntimeError(f"Directory {base_dir} for mutex does not exist")
374
536
 
537
+ # TODO: We don't know what CLI option controls where to put this mutex, so
538
+ # we aren't very helpful if the location is bad.
539
+ ensure_filesystem_lockable(
540
+ base_dir, hint=f"Specify a different place to put the {mutex} mutex."
541
+ )
542
+
375
543
  # Define a filename
376
- lock_filename = os.path.join(base_dir, 'toil-mutex-' + mutex)
544
+ lock_filename = os.path.join(base_dir, "toil-mutex-" + mutex)
377
545
 
378
- logger.debug('PID %d acquiring mutex %s', os.getpid(), lock_filename)
546
+ logger.debug("PID %d acquiring mutex %s", os.getpid(), lock_filename)
379
547
 
380
548
  # We can't just create/open and lock a file, because when we clean up
381
549
  # there's a race where someone can open the file before we unlink it and
382
550
  # get a lock on the deleted file.
383
551
 
552
+ error_backoff = 1
553
+
384
554
  while True:
385
555
  # Try to create the file, ignoring if it exists or not.
386
556
  fd = os.open(lock_filename, os.O_CREAT | os.O_WRONLY)
387
557
 
388
- # Wait until we can exclusively lock it.
389
- fcntl.lockf(fd, fcntl.LOCK_EX)
558
+ try:
559
+ # Wait until we can exclusively lock it, handling error retry.
560
+ safe_lock(fd)
561
+ except:
562
+ # Something went wrong
563
+ os.close(fd)
564
+ raise
390
565
 
391
566
  # Holding the lock, make sure we are looking at the same file on disk still.
392
567
  try:
@@ -394,16 +569,14 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
394
569
  fd_stats = os.fstat(fd)
395
570
  except OSError as e:
396
571
  if e.errno == errno.ESTALE:
397
- # The file handle has gone stale, because somebody removed the file.
572
+ # The file handle has gone stale, because somebody removed the
573
+ # file.
398
574
  # Try again.
399
- try:
400
- fcntl.lockf(fd, fcntl.LOCK_UN)
401
- except OSError:
402
- pass
403
- os.close(fd)
575
+ safe_unlock_and_close(fd)
404
576
  continue
405
577
  else:
406
578
  # Something else broke
579
+ os.close(fd)
407
580
  raise
408
581
 
409
582
  try:
@@ -412,13 +585,16 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
412
585
  except FileNotFoundError:
413
586
  path_stats = None
414
587
 
415
- if path_stats is None or fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino:
588
+ if (
589
+ path_stats is None
590
+ or fd_stats.st_dev != path_stats.st_dev
591
+ or fd_stats.st_ino != path_stats.st_ino
592
+ ):
416
593
  # The file we have a lock on is not the file linked to the name (if
417
594
  # any). This usually happens, because before someone releases a
418
595
  # lock, they delete the file. Go back and contend again. TODO: This
419
596
  # allows a lot of queue jumping on our mutex.
420
- fcntl.lockf(fd, fcntl.LOCK_UN)
421
- os.close(fd)
597
+ safe_unlock_and_close(fd)
422
598
  continue
423
599
  else:
424
600
  # We have a lock on the file that the name points to. Since we
@@ -428,12 +604,12 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
428
604
 
429
605
  try:
430
606
  # When we have it, do the thing we are protecting.
431
- logger.debug('PID %d now holds mutex %s', os.getpid(), lock_filename)
607
+ logger.debug("PID %d now holds mutex %s", os.getpid(), lock_filename)
432
608
  yield
433
609
  finally:
434
610
  # Delete it while we still own it, so we can't delete it from out from
435
611
  # under someone else who thinks they are holding it.
436
- logger.debug('PID %d releasing mutex %s', os.getpid(), lock_filename)
612
+ logger.debug("PID %d releasing mutex %s", os.getpid(), lock_filename)
437
613
 
438
614
  # We have had observations in the wild of the lock file not existing
439
615
  # when we go to unlink it, causing a crash on mutex release. See
@@ -451,23 +627,36 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
451
627
 
452
628
  # Check to make sure it still looks locked before we unlink.
453
629
  if path_stats is None:
454
- logger.error('PID %d had mutex %s disappear while locked! Mutex system is not working!', os.getpid(), lock_filename)
455
- elif fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino:
456
- logger.error('PID %d had mutex %s get replaced while locked! Mutex system is not working!', os.getpid(), lock_filename)
630
+ logger.error(
631
+ "PID %d had mutex %s disappear while locked! Mutex system is not working!",
632
+ os.getpid(),
633
+ lock_filename,
634
+ )
635
+ elif (
636
+ fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino
637
+ ):
638
+ logger.error(
639
+ "PID %d had mutex %s get replaced while locked! Mutex system is not working!",
640
+ os.getpid(),
641
+ lock_filename,
642
+ )
457
643
 
458
644
  if path_stats is not None:
459
645
  try:
460
646
  # Unlink the file
461
647
  os.unlink(lock_filename)
462
648
  except FileNotFoundError:
463
- logger.error('PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!', os.getpid(), lock_filename)
649
+ logger.error(
650
+ "PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!",
651
+ os.getpid(),
652
+ lock_filename,
653
+ )
464
654
 
465
655
  # Note that we are unlinking it and then unlocking it; a lot of people
466
656
  # might have opened it before we unlinked it and will wake up when they
467
657
  # get the worthless lock on the now-unlinked file. We have to do some
468
658
  # stat gymnastics above to work around this.
469
- fcntl.lockf(fd, fcntl.LOCK_UN)
470
- os.close(fd)
659
+ safe_unlock_and_close(fd)
471
660
 
472
661
 
473
662
  class LastProcessStandingArena:
@@ -503,13 +692,13 @@ class LastProcessStandingArena:
503
692
 
504
693
  # We need a mutex name to allow only one process to be entering or
505
694
  # leaving at a time.
506
- self.mutex = name + '-arena-lock'
695
+ self.mutex = name + "-arena-lock"
507
696
 
508
697
  # We need a way to track who is actually in, and who was in but died.
509
698
  # So everybody gets a locked file (again).
510
699
  # TODO: deduplicate with the similar logic for process names, and also
511
700
  # deferred functions.
512
- self.lockfileDir = os.path.join(base_dir, name + '-arena-members')
701
+ self.lockfileDir = os.path.join(base_dir, name + "-arena-members")
513
702
 
514
703
  # When we enter the arena, we fill this in with the FD of the locked
515
704
  # file that represents our presence.
@@ -525,7 +714,7 @@ class LastProcessStandingArena:
525
714
  You may not enter the arena again before leaving it.
526
715
  """
527
716
 
528
- logger.debug('Joining arena %s', self.lockfileDir)
717
+ logger.debug("Joining arena %s", self.lockfileDir)
529
718
 
530
719
  # Make sure we're not in it already.
531
720
  if self.lockfileName is not None or self.lockfileFD is not None:
@@ -540,19 +729,23 @@ class LastProcessStandingArena:
540
729
  except FileExistsError:
541
730
  pass
542
731
  except Exception as e:
543
- raise RuntimeError("Could not make lock file directory " + self.lockfileDir) from e
732
+ raise RuntimeError(
733
+ "Could not make lock file directory " + self.lockfileDir
734
+ ) from e
544
735
 
545
736
  # Make ourselves a file in it and lock it to prove we are alive.
546
737
  try:
547
- self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir) # type: ignore
738
+ self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir) # type: ignore
548
739
  except Exception as e:
549
- raise RuntimeError("Could not make lock file in " + self.lockfileDir) from e
740
+ raise RuntimeError(
741
+ "Could not make lock file in " + self.lockfileDir
742
+ ) from e
550
743
  # Nobody can see it yet, so lock it right away
551
- fcntl.lockf(self.lockfileFD, fcntl.LOCK_EX) # type: ignore
744
+ safe_lock(self.lockfileFD) # type: ignore
552
745
 
553
746
  # Now we're properly in, so release the global mutex
554
747
 
555
- logger.debug('Now in arena %s', self.lockfileDir)
748
+ logger.debug("Now in arena %s", self.lockfileDir)
556
749
 
557
750
  def leave(self) -> Iterator[bool]:
558
751
  """
@@ -572,7 +765,7 @@ class LastProcessStandingArena:
572
765
  if self.lockfileName is None or self.lockfileFD is None:
573
766
  raise RuntimeError("This process is not in the arena.")
574
767
 
575
- logger.debug('Leaving arena %s', self.lockfileDir)
768
+ logger.debug("Leaving arena %s", self.lockfileDir)
576
769
 
577
770
  with global_mutex(self.base_dir, self.mutex):
578
771
  # Now nobody else should also be trying to join or leave.
@@ -583,8 +776,7 @@ class LastProcessStandingArena:
583
776
  except:
584
777
  pass
585
778
  self.lockfileName = None
586
- fcntl.lockf(self.lockfileFD, fcntl.LOCK_UN)
587
- os.close(self.lockfileFD)
779
+ safe_unlock_and_close(self.lockfileFD)
588
780
  self.lockfileFD = None
589
781
 
590
782
  for item in os.listdir(self.lockfileDir):
@@ -598,32 +790,42 @@ class LastProcessStandingArena:
598
790
  continue
599
791
 
600
792
  try:
601
- fcntl.lockf(fd, fcntl.LOCK_SH | fcntl.LOCK_NB)
793
+ safe_lock(fd, block=False, shared=True)
602
794
  except OSError as e:
603
- # Could not lock. It's alive!
604
- break
795
+ if e.errno in (errno.EACCES, errno.EAGAIN):
796
+ # Could not lock. It's alive!
797
+ break
798
+ else:
799
+ # Something else is wrong
800
+ os.close(fd)
801
+ raise
605
802
  else:
606
803
  # Could lock. Process is dead.
607
804
  try:
608
805
  os.remove(full_path)
609
806
  except:
610
807
  pass
611
- fcntl.lockf(fd, fcntl.LOCK_UN)
808
+ safe_unlock_and_close(fd)
612
809
  # Continue with the loop normally.
613
810
  else:
614
811
  # Nothing alive was found. Nobody will come in while we hold
615
812
  # the global mutex, so we are the Last Process Standing.
616
- logger.debug('We are the Last Process Standing in arena %s', self.lockfileDir)
813
+ logger.debug(
814
+ "We are the Last Process Standing in arena %s", self.lockfileDir
815
+ )
617
816
  yield True
618
817
 
619
818
  try:
620
819
  # Delete the arena directory so as to leave nothing behind.
621
820
  os.rmdir(self.lockfileDir)
622
821
  except:
623
- logger.warning('Could not clean up arena %s completely: %s',
624
- self.lockfileDir, traceback.format_exc())
822
+ logger.warning(
823
+ "Could not clean up arena %s completely: %s",
824
+ self.lockfileDir,
825
+ traceback.format_exc(),
826
+ )
625
827
 
626
828
  # Now we're done, whether we were the last one or not, and can
627
829
  # release the mutex.
628
830
 
629
- logger.debug('Now out of arena %s', self.lockfileDir)
831
+ logger.debug("Now out of arena %s", self.lockfileDir)