toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (193)
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/lib/threading.py CHANGED
@@ -21,14 +21,18 @@ import fcntl
  import logging
  import math
  import os
+ import platform
+ import subprocess
  import sys
  import tempfile
  import threading
+ import time
  import traceback
+ from collections.abc import Iterator
  from contextlib import contextmanager
- from typing import Dict, Iterator, Optional, Union, cast
+ from typing import Optional, Union, cast

- import psutil  # type: ignore
+ import psutil

  from toil.lib.exceptions import raise_
  from toil.lib.io import robust_rmtree
@@ -36,12 +40,150 @@ from toil.lib.io import robust_rmtree
  logger = logging.getLogger(__name__)


+ def ensure_filesystem_lockable(
+     path: str, timeout: float = 30, hint: Optional[str] = None
+ ) -> None:
+     """
+     Make sure that the filesystem used at the given path is one where locks are safe to use.
+
+     File locks are not safe to use on Ceph. See
+     <https://github.com/DataBiosphere/toil/issues/4972>.
+
+     Raises an exception if the filesystem is detected as one where using locks
+     is known to trigger bugs in the filesystem implementation. Also raises an
+     exception if the given path does not exist, or if attempting to determine
+     the filesystem type takes more than the timeout in seconds.
+
+     If the filesystem type cannot be determined, does nothing.
+
+     :param hint: Extra text to include in an error, if raised, telling the user
+         how to change the offending path.
+     """
+
+     if not os.path.exists(path):
+         # Raise a normal-looking FileNotFoundError. See <https://stackoverflow.com/a/36077407>
+         raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
+
+     if platform.system() == "Linux":
+         # We know how to find the filesystem here.
+
+         try:
+             # Start a child process to stat the path. See <https://unix.stackexchange.com/a/402236>.
+             # We really should call statfs but no bindings for it are in PyPI.
+             completed = subprocess.run(
+                 ["stat", "-f", "-c", "%T", path],
+                 check=True,
+                 capture_output=True,
+                 timeout=timeout,
+             )
+         except subprocess.TimeoutExpired as e:
+             # The subprocess itself is Too Slow
+             raise RuntimeError(
+                 f"Polling filesystem type at {path} took more than {timeout} seconds; is your filesystem working?"
+             ) from e
+         except subprocess.CalledProcessError as e:
+             # Stat didn't work. Maybe we don't have the right version of stat installed?
+             logger.warning(
+                 "Could not determine filesystem type at %s because of: %s",
+                 path,
+                 e.stderr.decode("utf-8", errors="replace").strip(),
+             )
+             # If we don't know the filesystem type, keep going anyway.
+             return
+
+         filesystem_type = completed.stdout.decode("utf-8", errors="replace").strip()
+
+         if filesystem_type == "ceph":
+             # Ceph is known to deadlock the MDS and break the parent directory when locking.
+             message = [
+                 f"Refusing to use {path} because file locks are known to break {filesystem_type} filesystems."
+             ]
+             if hint:
+                 # Hint the user how to fix this.
+                 message.append(hint)
+             raise RuntimeError(" ".join(message))
+         else:
+             # Other filesystem types are fine (even though NFS is sometimes
+             # flaky with regard to locks actually locking anything).
+             logger.debug(
+                 "Detected that %s has lockable filesystem type: %s",
+                 path,
+                 filesystem_type,
+             )
+
+     # Other platforms (Mac) probably aren't mounting Ceph and also don't
+     # usually use the same stat binary implementation.
+
+
+ def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None:
+     """
+     Get an fcntl lock, while retrying on IO errors.
+
+     Raises OSError with EACCES or EAGAIN when a nonblocking lock is not
+     immediately available.
+     """
+
+     # Set up retry logic. TODO: Use @retry instead.
+     error_backoff = 1
+     MAX_ERROR_TRIES = 10
+     error_tries = 0
+
+     while True:
+         try:
+             # Wait until we can exclusively lock it.
+             lock_mode = (fcntl.LOCK_SH if shared else fcntl.LOCK_EX) | (
+                 fcntl.LOCK_NB if not block else 0
+             )
+             fcntl.flock(fd, lock_mode)
+             return
+         except OSError as e:
+             if e.errno in (errno.EACCES, errno.EAGAIN):
+                 # Nonblocking lock not available.
+                 raise
+             elif e.errno == errno.EIO:
+                 # Sometimes Ceph produces IO errors when talking to lock files.
+                 # Back off and try again.
+                 # TODO: Should we eventually give up if the disk really is
+                 # broken? If so we should use the retry system.
+                 if error_tries < MAX_ERROR_TRIES:
+                     logger.error(
+                         "IO error talking to lock file. Retrying after %s seconds.",
+                         error_backoff,
+                     )
+                     time.sleep(error_backoff)
+                     error_backoff = min(60, error_backoff * 2)
+                     error_tries += 1
+                     continue
+                 else:
+                     logger.critical(
+                         "Too many IO errors talking to lock file. If using Ceph, check for MDS deadlocks. See <https://tracker.ceph.com/issues/62123>."
+                     )
+                     raise
+             else:
+                 raise
+
+
+ def safe_unlock_and_close(fd: int) -> None:
+     """
+     Release an fcntl lock and close the file descriptor, while handling fcntl IO errors.
+     """
+     try:
+         fcntl.flock(fd, fcntl.LOCK_UN)
+     except OSError as e:
+         if e.errno != errno.EIO:
+             raise
+         # Sometimes Ceph produces EIO. We don't need to retry then because
+         # we're going to close the FD and after that the file can't remain
+         # locked by us.
+     os.close(fd)
+
+
  class ExceptionalThread(threading.Thread):
      """
      A thread whose join() method re-raises exceptions raised during run(). While join() is
      idempotent, the exception is only raised during the first invocation of join() that successfully
      joined the thread. If join() times out, no exception will be re-raised even though an
-     exception might already have occured in run().
+     exception might already have occurred in run().

      When subclassing this thread, override tryRun() instead of run().

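For orientation, here is a minimal sketch of how the three new helpers introduced above compose, assuming toil 8.0.0 on a POSIX system; the lock directory and file name are hypothetical, not taken from this diff:

    import errno
    import os
    import tempfile

    from toil.lib.threading import (
        ensure_filesystem_lockable,
        safe_lock,
        safe_unlock_and_close,
    )

    lock_dir = tempfile.mkdtemp()  # hypothetical place to keep a lock file

    # Bail out early on filesystems (notably Ceph) where flock() is known to
    # misbehave; this is a no-op when the filesystem type cannot be determined.
    ensure_filesystem_lockable(lock_dir, hint="Pick a different lock directory.")

    fd = os.open(os.path.join(lock_dir, "example.lock"), os.O_CREAT | os.O_WRONLY)
    try:
        # Nonblocking exclusive lock; transient EIO is retried with backoff internally.
        safe_lock(fd, block=False)
    except OSError as e:
        if e.errno in (errno.EACCES, errno.EAGAIN):
            print("lock is held by another process")
            os.close(fd)
        else:
            raise
    else:
        # ... critical section ...
        safe_unlock_and_close(fd)  # tolerates EIO on unlock, then closes the FD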
@@ -65,6 +207,7 @@ class ExceptionalThread(threading.Thread):
      AssertionError

      """
+
      exc_info = None

      def run(self) -> None:
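As the docstring above says, subclasses override tryRun() rather than run(), and whatever tryRun() raises resurfaces from the first successful join(). A small usage sketch:

    from toil.lib.threading import ExceptionalThread

    class Worker(ExceptionalThread):
        def tryRun(self) -> None:
            raise ValueError("boom")  # captured by run(), not printed and lost

    t = Worker()
    t.start()
    try:
        t.join()  # re-raises the ValueError in the joining thread
    except ValueError as e:
        print("caught:", e)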
@@ -103,18 +246,23 @@ def cpu_count() -> int:
      :rtype: int
      """

-     cached = getattr(cpu_count, 'result', None)
+     cached = getattr(cpu_count, "result", None)
      if cached is not None:
          # We already got a CPU count.
          return cast(int, cached)

      # Get the fallback answer of all the CPUs on the machine
-     total_machine_size = cast(int, psutil.cpu_count(logical=True))
+     psutil_cpu_count = psutil.cpu_count(logical=True)
+     if psutil_cpu_count is None:
+         logger.debug("Could not retrieve the logical CPU count.")

-     logger.debug('Total machine size: %d cores', total_machine_size)
+     total_machine_size: Union[float, int] = (
+         psutil_cpu_count if psutil_cpu_count is not None else float("inf")
+     )
+     logger.debug("Total machine size: %s core(s)", total_machine_size)

      # cgroups may limit the size
-     cgroup_size: Union[float, int] = float('inf')
+     cgroup_size: Union[float, int] = float("inf")

      try:
          # See if we can fetch these and use them
@@ -122,13 +270,13 @@ def cpu_count() -> int:
          period: Optional[int] = None

          # CGroups v1 keeps quota and period separate
-         CGROUP1_QUOTA_FILE = '/sys/fs/cgroup/cpu/cpu.cfs_quota_us'
-         CGROUP1_PERIOD_FILE = '/sys/fs/cgroup/cpu/cpu.cfs_period_us'
+         CGROUP1_QUOTA_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
+         CGROUP1_PERIOD_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
          # CGroups v2 keeps both in one file, space-separated, quota first
-         CGROUP2_COMBINED_FILE = '/sys/fs/cgroup/cpu.max'
+         CGROUP2_COMBINED_FILE = "/sys/fs/cgroup/cpu.max"

          if os.path.exists(CGROUP1_QUOTA_FILE) and os.path.exists(CGROUP1_PERIOD_FILE):
-             logger.debug('CPU quota and period available from cgroups v1')
+             logger.debug("CPU quota and period available from cgroups v1")
              with open(CGROUP1_QUOTA_FILE) as stream:
                  # Read the quota
                  quota = int(stream.read())
@@ -137,49 +285,58 @@ def cpu_count() -> int:
                  # Read the period in which we are allowed to burn the quota
                  period = int(stream.read())
          elif os.path.exists(CGROUP2_COMBINED_FILE):
-             logger.debug('CPU quota and period available from cgroups v2')
+             logger.debug("CPU quota and period available from cgroups v2")
              with open(CGROUP2_COMBINED_FILE) as stream:
                  # Read the quota and the period together
-                 quota, period = (int(part) for part in stream.read().split(' '))
+                 quota, period = (int(part) for part in stream.read().split(" "))
          else:
-             logger.debug('CPU quota/period not available from cgroups v1 or cgroups v2')
+             logger.debug("CPU quota/period not available from cgroups v1 or cgroups v2")

          if quota is not None and period is not None:
              # We got a quota and a period.
-             logger.debug('CPU quota: %d period: %d', quota, period)
+             logger.debug("CPU quota: %d period: %d", quota, period)

              if quota == -1:
                  # But the quota can be -1 for unset.
                  # Assume we can use the whole machine.
-                 return total_machine_size
-
-             # The thread count is how many multiples of a wall clock period we
-             # can burn in that period.
-             cgroup_size = int(math.ceil(float(quota)/float(period)))
+                 cgroup_size = float("inf")
+             else:
+                 # The thread count is how many multiples of a wall clock period we
+                 # can burn in that period.
+                 cgroup_size = int(math.ceil(float(quota) / float(period)))

-             logger.debug('Control group size in cores: %d', cgroup_size)
+             logger.debug("Control group size in cores: %s", cgroup_size)
      except:
          # We can't actually read these cgroup fields. Maybe we are a mac or something.
-         logger.debug('Could not inspect cgroup: %s', traceback.format_exc())
+         logger.debug("Could not inspect cgroup: %s", traceback.format_exc())

      # CPU affinity may limit the size
-     affinity_size: Union[float, int] = float('inf')
-     if hasattr(os, 'sched_getaffinity'):
+     affinity_size: Union[float, int] = float("inf")
+     if hasattr(os, "sched_getaffinity"):
          try:
-             logger.debug('CPU affinity available')
+             logger.debug("CPU affinity available")
              affinity_size = len(os.sched_getaffinity(0))
-             logger.debug('CPU affinity is restricted to %d cores', affinity_size)
+             logger.debug("CPU affinity is restricted to %d cores", affinity_size)
          except:
-             # We can't actually read this even though it exists.
-             logger.debug('Could not inspect scheduling affinity: %s', traceback.format_exc())
+             # We can't actually read this even though it exists.
+             logger.debug(
+                 "Could not inspect scheduling affinity: %s", traceback.format_exc()
+             )
      else:
-         logger.debug('CPU affinity not available')
-
-     # Return the smaller of the actual thread count and the cgroup's limit, minimum 1.
-     result = cast(int, max(1, min(min(affinity_size, cgroup_size), total_machine_size)))
-     logger.debug('cpu_count: %s', str(result))
+         logger.debug("CPU affinity not available")
+
+     limit: Union[float, int] = float("inf")
+     # Apply all the limits to take the smallest
+     limit = min(limit, total_machine_size)
+     limit = min(limit, cgroup_size)
+     limit = min(limit, affinity_size)
+     if limit < 1 or limit == float("inf"):
+         # Fall back to 1 if we can't get a size
+         limit = 1
+     result = int(limit)
+     logger.debug("cpu_count: %s", result)
      # Make sure to remember it for the next call
-     setattr(cpu_count, 'result', result)
+     setattr(cpu_count, "result", result)
      return result


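To make the new limit arithmetic concrete: a cgroup quota is converted to whole cores as ceil(quota / period), and the final count is the smallest of the machine size, the cgroup bound, and the affinity bound, floored at one. A standalone sketch (effective_cpu_limit is a hypothetical helper for illustration, not part of the module):

    import math
    from typing import Union

    def effective_cpu_limit(
        total: Union[float, int], quota: int, period: int, affinity: Union[float, int]
    ) -> int:
        # A quota of -1 means "no cgroup limit", as in the code above.
        cgroup = float("inf") if quota == -1 else math.ceil(quota / period)
        limit = min(total, cgroup, affinity)
        # Fall back to 1 when no bound was measurable or the quota rounds below 1.
        return 1 if limit < 1 or limit == float("inf") else int(limit)

    # A 16-core machine allowed 250ms of CPU time per 100ms period: ceil(2.5) = 3.
    assert effective_cpu_limit(16, 250_000, 100_000, float("inf")) == 3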
@@ -201,7 +358,8 @@ def cpu_count() -> int:
  current_process_name_lock = threading.Lock()
  # And a global dict from work directory to name in that work directory.
  # We also have a file descriptor per work directory but it is just leaked.
- current_process_name_for: Dict[str, str] = {}
+ current_process_name_for: dict[str, str] = {}
+

  def collect_process_name_garbage() -> None:
      """
@@ -225,6 +383,7 @@ def collect_process_name_garbage() -> None:
      for base_dir in missing:
          del current_process_name_for[base_dir]

+
  def destroy_all_process_names() -> None:
      """
      Delete all our process name files because our process is going away.
@@ -239,9 +398,11 @@ def destroy_all_process_names() -> None:
      for base_dir, name in current_process_name_for.items():
          robust_rmtree(os.path.join(base_dir, name))

+
  # Run the cleanup at exit
  atexit.register(destroy_all_process_names)

+
  def get_process_name(base_dir: str) -> str:
      """
      Return the name of the current process. Like a PID but visible between
@@ -270,10 +431,16 @@ def get_process_name(base_dir: str) -> str:

      # Lock the file. The lock will automatically go away if our process does.
      try:
-         fcntl.lockf(nameFD, fcntl.LOCK_EX | fcntl.LOCK_NB)
+         safe_lock(nameFD, block=False)
      except OSError as e:
-         # Someone else might have locked it even though they should not have.
-         raise RuntimeError(f"Could not lock process name file {nameFileName}: {str(e)}")
+         if e.errno in (errno.EACCES, errno.EAGAIN):
+             # Someone else locked it even though they should not have.
+             raise RuntimeError(
+                 f"Could not lock process name file {nameFileName}"
+             ) from e
+         else:
+             # Something else is wrong
+             raise

      # Save the basename
      current_process_name_for[base_dir] = os.path.basename(nameFileName)
@@ -311,20 +478,24 @@ def process_name_exists(base_dir: str, name: str) -> bool:
          # If the file is gone, the process can't exist.
          return False

-
      nameFD = None
      try:
          try:
              # Otherwise see if we can lock it shared, for which we need an FD, but
              # only for reading.
              nameFD = os.open(nameFileName, os.O_RDONLY)
-             fcntl.lockf(nameFD, fcntl.LOCK_SH | fcntl.LOCK_NB)
          except FileNotFoundError as e:
              # File has vanished
              return False
+         try:
+             safe_lock(nameFD, block=False, shared=True)
          except OSError as e:
-             # Could not lock. Process is alive.
-             return True
+             if e.errno in (errno.EACCES, errno.EAGAIN):
+                 # Could not lock. Process is alive.
+                 return True
+             else:
+                 # Something else went wrong
+                 raise
          else:
              # Could lock. Process is dead.
              # Remove the file. We race to be the first to do so.
@@ -332,8 +503,8 @@ def process_name_exists(base_dir: str, name: str) -> bool:
                  os.remove(nameFileName)
              except:
                  pass
-             # Unlock
-             fcntl.lockf(nameFD, fcntl.LOCK_UN)
+             safe_unlock_and_close(nameFD)
+             nameFD = None
              # Report process death
              return False
      finally:
@@ -343,6 +514,7 @@ def process_name_exists(base_dir: str, name: str) -> bool:
          except:
              pass

+
  # Similar to the process naming system above, we define a global mutex system
  # for critical sections, based just around file locks.
  @contextmanager
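A hedged sketch of the process-name protocol implemented by get_process_name() and process_name_exists() above, assuming a scratch directory shared by the cooperating processes (the directory here is hypothetical):

    import tempfile

    from toil.lib.threading import get_process_name, process_name_exists

    base_dir = tempfile.mkdtemp()  # hypothetical shared coordination directory

    # Registers this process: creates a file and holds an exclusive flock on it.
    name = get_process_name(base_dir)

    # True while the owner lives; once the owner dies its lock evaporates, and
    # the next check removes the stale file and reports False.
    print(process_name_exists(base_dir, name))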
@@ -362,21 +534,34 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
      if not os.path.isdir(base_dir):
          raise RuntimeError(f"Directory {base_dir} for mutex does not exist")

+     # TODO: We don't know what CLI option controls where to put this mutex, so
+     # we aren't very helpful if the location is bad.
+     ensure_filesystem_lockable(
+         base_dir, hint=f"Specify a different place to put the {mutex} mutex."
+     )
+
      # Define a filename
-     lock_filename = os.path.join(base_dir, 'toil-mutex-' + mutex)
+     lock_filename = os.path.join(base_dir, "toil-mutex-" + mutex)

-     logger.debug('PID %d acquiring mutex %s', os.getpid(), lock_filename)
+     logger.debug("PID %d acquiring mutex %s", os.getpid(), lock_filename)

      # We can't just create/open and lock a file, because when we clean up
      # there's a race where someone can open the file before we unlink it and
      # get a lock on the deleted file.

+     error_backoff = 1
+
      while True:
          # Try to create the file, ignoring if it exists or not.
          fd = os.open(lock_filename, os.O_CREAT | os.O_WRONLY)

-         # Wait until we can exclusively lock it.
-         fcntl.lockf(fd, fcntl.LOCK_EX)
+         try:
+             # Wait until we can exclusively lock it, handling error retry.
+             safe_lock(fd)
+         except:
+             # Something went wrong
+             os.close(fd)
+             raise

          # Holding the lock, make sure we are looking at the same file on disk still.
          try:
@@ -384,16 +569,14 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
              fd_stats = os.fstat(fd)
          except OSError as e:
              if e.errno == errno.ESTALE:
-                 # The file handle has gone stale, because somebody removed the file.
+                 # The file handle has gone stale, because somebody removed the
+                 # file.
                  # Try again.
-                 try:
-                     fcntl.lockf(fd, fcntl.LOCK_UN)
-                 except OSError:
-                     pass
-                 os.close(fd)
+                 safe_unlock_and_close(fd)
                  continue
              else:
                  # Something else broke
+                 os.close(fd)
                  raise

          try:
@@ -402,13 +585,16 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
          except FileNotFoundError:
              path_stats = None

-         if path_stats is None or fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino:
+         if (
+             path_stats is None
+             or fd_stats.st_dev != path_stats.st_dev
+             or fd_stats.st_ino != path_stats.st_ino
+         ):
              # The file we have a lock on is not the file linked to the name (if
              # any). This usually happens, because before someone releases a
              # lock, they delete the file. Go back and contend again. TODO: This
              # allows a lot of queue jumping on our mutex.
-             fcntl.lockf(fd, fcntl.LOCK_UN)
-             os.close(fd)
+             safe_unlock_and_close(fd)
              continue
          else:
              # We have a lock on the file that the name points to. Since we
@@ -418,12 +604,12 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:

      try:
          # When we have it, do the thing we are protecting.
-         logger.debug('PID %d now holds mutex %s', os.getpid(), lock_filename)
+         logger.debug("PID %d now holds mutex %s", os.getpid(), lock_filename)
          yield
      finally:
          # Delete it while we still own it, so we can't delete it out from
          # under someone else who thinks they are holding it.
-         logger.debug('PID %d releasing mutex %s', os.getpid(), lock_filename)
+         logger.debug("PID %d releasing mutex %s", os.getpid(), lock_filename)

          # We have had observations in the wild of the lock file not existing
          # when we go to unlink it, causing a crash on mutex release. See
@@ -441,23 +627,36 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:

          # Check to make sure it still looks locked before we unlink.
          if path_stats is None:
-             logger.error('PID %d had mutex %s disappear while locked! Mutex system is not working!', os.getpid(), lock_filename)
-         elif fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino:
-             logger.error('PID %d had mutex %s get replaced while locked! Mutex system is not working!', os.getpid(), lock_filename)
+             logger.error(
+                 "PID %d had mutex %s disappear while locked! Mutex system is not working!",
+                 os.getpid(),
+                 lock_filename,
+             )
+         elif (
+             fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino
+         ):
+             logger.error(
+                 "PID %d had mutex %s get replaced while locked! Mutex system is not working!",
+                 os.getpid(),
+                 lock_filename,
+             )

          if path_stats is not None:
              try:
                  # Unlink the file
                  os.unlink(lock_filename)
              except FileNotFoundError:
-                 logger.error('PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!', os.getpid(), lock_filename)
+                 logger.error(
+                     "PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!",
+                     os.getpid(),
+                     lock_filename,
+                 )

          # Note that we are unlinking it and then unlocking it; a lot of people
          # might have opened it before we unlinked it and will wake up when they
          # get the worthless lock on the now-unlinked file. We have to do some
          # stat gymnastics above to work around this.
-         fcntl.lockf(fd, fcntl.LOCK_UN)
-         os.close(fd)
+         safe_unlock_and_close(fd)


  class LastProcessStandingArena:
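Since global_mutex() above is a context manager, cooperating processes that share a directory serialize on the named lock file. A minimal sketch (the directory is hypothetical):

    import tempfile

    from toil.lib.threading import global_mutex

    base_dir = tempfile.mkdtemp()  # hypothetical directory shared by the processes

    with global_mutex(base_dir, "example"):
        # At most one cooperating process runs this block at a time. The file
        # toil-mutex-example is created on entry and unlinked before unlocking.
        pass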
@@ -493,13 +692,13 @@ class LastProcessStandingArena:

          # We need a mutex name to allow only one process to be entering or
          # leaving at a time.
-         self.mutex = name + '-arena-lock'
+         self.mutex = name + "-arena-lock"

          # We need a way to track who is actually in, and who was in but died.
          # So everybody gets a locked file (again).
          # TODO: deduplicate with the similar logic for process names, and also
          # deferred functions.
-         self.lockfileDir = os.path.join(base_dir, name + '-arena-members')
+         self.lockfileDir = os.path.join(base_dir, name + "-arena-members")

          # When we enter the arena, we fill this in with the FD of the locked
          # file that represents our presence.
@@ -515,7 +714,7 @@ class LastProcessStandingArena:
          You may not enter the arena again before leaving it.
          """

-         logger.debug('Joining arena %s', self.lockfileDir)
+         logger.debug("Joining arena %s", self.lockfileDir)

          # Make sure we're not in it already.
          if self.lockfileName is not None or self.lockfileFD is not None:
@@ -529,15 +728,24 @@ class LastProcessStandingArena:
                  os.mkdir(self.lockfileDir)
              except FileExistsError:
                  pass
+             except Exception as e:
+                 raise RuntimeError(
+                     "Could not make lock file directory " + self.lockfileDir
+                 ) from e

              # Make ourselves a file in it and lock it to prove we are alive.
-             self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
+             try:
+                 self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
+             except Exception as e:
+                 raise RuntimeError(
+                     "Could not make lock file in " + self.lockfileDir
+                 ) from e
              # Nobody can see it yet, so lock it right away
-             fcntl.lockf(self.lockfileFD, fcntl.LOCK_EX)  # type: ignore
+             safe_lock(self.lockfileFD)  # type: ignore

              # Now we're properly in, so release the global mutex

-         logger.debug('Now in arena %s', self.lockfileDir)
+         logger.debug("Now in arena %s", self.lockfileDir)

      def leave(self) -> Iterator[bool]:
          """
@@ -557,7 +765,7 @@ class LastProcessStandingArena:
          if self.lockfileName is None or self.lockfileFD is None:
              raise RuntimeError("This process is not in the arena.")

-         logger.debug('Leaving arena %s', self.lockfileDir)
+         logger.debug("Leaving arena %s", self.lockfileDir)

          with global_mutex(self.base_dir, self.mutex):
              # Now nobody else should also be trying to join or leave.
@@ -568,8 +776,7 @@ class LastProcessStandingArena:
              except:
                  pass
              self.lockfileName = None
-             fcntl.lockf(self.lockfileFD, fcntl.LOCK_UN)
-             os.close(self.lockfileFD)
+             safe_unlock_and_close(self.lockfileFD)
              self.lockfileFD = None

              for item in os.listdir(self.lockfileDir):
@@ -583,32 +790,42 @@ class LastProcessStandingArena:
                      continue

                  try:
-                     fcntl.lockf(fd, fcntl.LOCK_SH | fcntl.LOCK_NB)
+                     safe_lock(fd, block=False, shared=True)
                  except OSError as e:
-                     # Could not lock. It's alive!
-                     break
+                     if e.errno in (errno.EACCES, errno.EAGAIN):
+                         # Could not lock. It's alive!
+                         break
+                     else:
+                         # Something else is wrong
+                         os.close(fd)
+                         raise
                  else:
                      # Could lock. Process is dead.
                      try:
                          os.remove(full_path)
                      except:
                          pass
-                     fcntl.lockf(fd, fcntl.LOCK_UN)
+                     safe_unlock_and_close(fd)
                      # Continue with the loop normally.
              else:
                  # Nothing alive was found. Nobody will come in while we hold
                  # the global mutex, so we are the Last Process Standing.
-                 logger.debug('We are the Last Process Standing in arena %s', self.lockfileDir)
+                 logger.debug(
+                     "We are the Last Process Standing in arena %s", self.lockfileDir
+                 )
                  yield True

                  try:
                      # Delete the arena directory so as to leave nothing behind.
                      os.rmdir(self.lockfileDir)
                  except:
-                     logger.warning('Could not clean up arena %s completely: %s',
-                                    self.lockfileDir, traceback.format_exc())
+                     logger.warning(
+                         "Could not clean up arena %s completely: %s",
+                         self.lockfileDir,
+                         traceback.format_exc(),
+                     )

          # Now we're done, whether we were the last one or not, and can
          # release the mutex.

-         logger.debug('Now out of arena %s', self.lockfileDir)
+         logger.debug("Now out of arena %s", self.lockfileDir)
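Finally, a sketch of the arena protocol under the same assumptions: enter() takes the global mutex and drops a locked member file, and leave() is a generator that yields True at most once, when no live members remain after our departure:

    import tempfile

    from toil.lib.threading import LastProcessStandingArena

    base_dir = tempfile.mkdtemp()  # hypothetical shared directory

    arena = LastProcessStandingArena(base_dir, "example")
    arena.enter()
    try:
        pass  # ... work that shares a resource with the other arena members ...
    finally:
        for last in arena.leave():
            if last:
                # Only the last process standing performs the shared cleanup.
                print("cleaning up the shared resource")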