toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. toil/__init__.py +18 -13
  2. toil/batchSystems/abstractBatchSystem.py +39 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
  4. toil/batchSystems/awsBatch.py +14 -14
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +3 -3
  7. toil/batchSystems/htcondor.py +0 -1
  8. toil/batchSystems/kubernetes.py +34 -31
  9. toil/batchSystems/local_support.py +3 -1
  10. toil/batchSystems/lsf.py +7 -7
  11. toil/batchSystems/mesos/batchSystem.py +7 -7
  12. toil/batchSystems/options.py +32 -83
  13. toil/batchSystems/registry.py +104 -23
  14. toil/batchSystems/singleMachine.py +16 -13
  15. toil/batchSystems/slurm.py +87 -16
  16. toil/batchSystems/torque.py +0 -1
  17. toil/bus.py +44 -8
  18. toil/common.py +544 -753
  19. toil/cwl/__init__.py +28 -32
  20. toil/cwl/cwltoil.py +595 -574
  21. toil/cwl/utils.py +55 -10
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/__init__.py +2 -2
  24. toil/fileStores/abstractFileStore.py +88 -14
  25. toil/fileStores/cachingFileStore.py +610 -549
  26. toil/fileStores/nonCachingFileStore.py +46 -22
  27. toil/job.py +182 -101
  28. toil/jobStores/abstractJobStore.py +161 -95
  29. toil/jobStores/aws/jobStore.py +23 -9
  30. toil/jobStores/aws/utils.py +6 -6
  31. toil/jobStores/fileJobStore.py +116 -18
  32. toil/jobStores/googleJobStore.py +16 -7
  33. toil/jobStores/utils.py +5 -6
  34. toil/leader.py +87 -56
  35. toil/lib/accelerators.py +10 -5
  36. toil/lib/aws/__init__.py +3 -14
  37. toil/lib/aws/ami.py +22 -9
  38. toil/lib/aws/iam.py +21 -13
  39. toil/lib/aws/session.py +2 -16
  40. toil/lib/aws/utils.py +4 -5
  41. toil/lib/compatibility.py +1 -1
  42. toil/lib/conversions.py +26 -3
  43. toil/lib/docker.py +22 -23
  44. toil/lib/ec2.py +10 -6
  45. toil/lib/ec2nodes.py +106 -100
  46. toil/lib/encryption/_nacl.py +2 -1
  47. toil/lib/generatedEC2Lists.py +325 -18
  48. toil/lib/io.py +49 -2
  49. toil/lib/misc.py +1 -1
  50. toil/lib/resources.py +9 -2
  51. toil/lib/threading.py +101 -38
  52. toil/options/common.py +736 -0
  53. toil/options/cwl.py +336 -0
  54. toil/options/wdl.py +37 -0
  55. toil/provisioners/abstractProvisioner.py +9 -4
  56. toil/provisioners/aws/__init__.py +3 -6
  57. toil/provisioners/aws/awsProvisioner.py +6 -0
  58. toil/provisioners/clusterScaler.py +3 -2
  59. toil/provisioners/gceProvisioner.py +2 -2
  60. toil/realtimeLogger.py +2 -1
  61. toil/resource.py +24 -18
  62. toil/server/app.py +2 -3
  63. toil/server/cli/wes_cwl_runner.py +4 -4
  64. toil/server/utils.py +1 -1
  65. toil/server/wes/abstract_backend.py +3 -2
  66. toil/server/wes/amazon_wes_utils.py +5 -4
  67. toil/server/wes/tasks.py +2 -3
  68. toil/server/wes/toil_backend.py +2 -10
  69. toil/server/wsgi_app.py +2 -0
  70. toil/serviceManager.py +12 -10
  71. toil/statsAndLogging.py +41 -9
  72. toil/test/__init__.py +29 -54
  73. toil/test/batchSystems/batchSystemTest.py +11 -111
  74. toil/test/batchSystems/test_slurm.py +24 -8
  75. toil/test/cactus/__init__.py +0 -0
  76. toil/test/cactus/test_cactus_integration.py +58 -0
  77. toil/test/cwl/cwlTest.py +438 -223
  78. toil/test/cwl/glob_dir.cwl +15 -0
  79. toil/test/cwl/preemptible.cwl +21 -0
  80. toil/test/cwl/preemptible_expression.cwl +28 -0
  81. toil/test/cwl/revsort.cwl +1 -1
  82. toil/test/cwl/revsort2.cwl +1 -1
  83. toil/test/docs/scriptsTest.py +2 -3
  84. toil/test/jobStores/jobStoreTest.py +34 -21
  85. toil/test/lib/aws/test_iam.py +4 -14
  86. toil/test/lib/aws/test_utils.py +0 -3
  87. toil/test/lib/dockerTest.py +4 -4
  88. toil/test/lib/test_ec2.py +12 -17
  89. toil/test/mesos/helloWorld.py +4 -5
  90. toil/test/mesos/stress.py +1 -1
  91. toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
  92. toil/test/options/options.py +37 -0
  93. toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
  94. toil/test/provisioners/clusterScalerTest.py +6 -4
  95. toil/test/provisioners/clusterTest.py +23 -11
  96. toil/test/provisioners/gceProvisionerTest.py +0 -6
  97. toil/test/provisioners/restartScript.py +3 -2
  98. toil/test/server/serverTest.py +1 -1
  99. toil/test/sort/restart_sort.py +2 -1
  100. toil/test/sort/sort.py +2 -1
  101. toil/test/sort/sortTest.py +2 -13
  102. toil/test/src/autoDeploymentTest.py +45 -45
  103. toil/test/src/busTest.py +5 -5
  104. toil/test/src/checkpointTest.py +2 -2
  105. toil/test/src/deferredFunctionTest.py +1 -1
  106. toil/test/src/fileStoreTest.py +32 -16
  107. toil/test/src/helloWorldTest.py +1 -1
  108. toil/test/src/importExportFileTest.py +1 -1
  109. toil/test/src/jobDescriptionTest.py +2 -1
  110. toil/test/src/jobServiceTest.py +1 -1
  111. toil/test/src/jobTest.py +18 -18
  112. toil/test/src/miscTests.py +5 -3
  113. toil/test/src/promisedRequirementTest.py +3 -3
  114. toil/test/src/realtimeLoggerTest.py +1 -1
  115. toil/test/src/resourceTest.py +2 -2
  116. toil/test/src/restartDAGTest.py +1 -1
  117. toil/test/src/resumabilityTest.py +36 -2
  118. toil/test/src/retainTempDirTest.py +1 -1
  119. toil/test/src/systemTest.py +2 -2
  120. toil/test/src/toilContextManagerTest.py +2 -2
  121. toil/test/src/userDefinedJobArgTypeTest.py +1 -1
  122. toil/test/utils/toilDebugTest.py +98 -32
  123. toil/test/utils/toilKillTest.py +2 -2
  124. toil/test/utils/utilsTest.py +23 -3
  125. toil/test/wdl/wdltoil_test.py +223 -45
  126. toil/toilState.py +7 -6
  127. toil/utils/toilClean.py +1 -1
  128. toil/utils/toilConfig.py +36 -0
  129. toil/utils/toilDebugFile.py +60 -33
  130. toil/utils/toilDebugJob.py +39 -12
  131. toil/utils/toilDestroyCluster.py +1 -1
  132. toil/utils/toilKill.py +1 -1
  133. toil/utils/toilLaunchCluster.py +13 -2
  134. toil/utils/toilMain.py +3 -2
  135. toil/utils/toilRsyncCluster.py +1 -1
  136. toil/utils/toilSshCluster.py +1 -1
  137. toil/utils/toilStats.py +445 -305
  138. toil/utils/toilStatus.py +2 -5
  139. toil/version.py +10 -10
  140. toil/wdl/utils.py +2 -122
  141. toil/wdl/wdltoil.py +1257 -492
  142. toil/worker.py +55 -46
  143. toil-6.1.0.dist-info/METADATA +124 -0
  144. toil-6.1.0.dist-info/RECORD +241 -0
  145. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
  146. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
  147. toil/batchSystems/parasol.py +0 -379
  148. toil/batchSystems/tes.py +0 -459
  149. toil/test/batchSystems/parasolTestSupport.py +0 -117
  150. toil/test/wdl/builtinTest.py +0 -506
  151. toil/test/wdl/toilwdlTest.py +0 -522
  152. toil/wdl/toilwdl.py +0 -141
  153. toil/wdl/versions/dev.py +0 -107
  154. toil/wdl/versions/draft2.py +0 -980
  155. toil/wdl/versions/v1.py +0 -794
  156. toil/wdl/wdl_analysis.py +0 -116
  157. toil/wdl/wdl_functions.py +0 -997
  158. toil/wdl/wdl_synthesis.py +0 -1011
  159. toil/wdl/wdl_types.py +0 -243
  160. toil-5.12.0.dist-info/METADATA +0 -118
  161. toil-5.12.0.dist-info/RECORD +0 -244
  162. /toil/{wdl/versions → options}/__init__.py +0 -0
  163. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
  164. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/lib/io.py CHANGED
@@ -2,6 +2,7 @@ import logging
2
2
  import os
3
3
  import shutil
4
4
  import stat
5
+ import tempfile
5
6
  import uuid
6
7
  from contextlib import contextmanager
7
8
  from io import BytesIO
@@ -9,6 +10,26 @@ from typing import IO, Any, Callable, Iterator, Optional, Union
9
10
 
10
11
  logger = logging.getLogger(__name__)
11
12
 
13
+ def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Optional[str] = None) -> str:
14
+ """
15
+ Make a temporary directory like tempfile.mkdtemp, but with relaxed permissions.
16
+
17
+ The permissions on the directory will be 711 instead of 700, allowing the
18
+ group and all other users to traverse the directory. This is necessary if
19
+ the directory is on NFS and the Docker daemon would like to mount it or a
20
+ file inside it into a container, because on NFS even the Docker daemon
21
+ appears bound by the file permissions.
22
+
23
+ See <https://github.com/DataBiosphere/toil/issues/4644>, and
24
+ <https://stackoverflow.com/a/67928880> which talks about a similar problem
25
+ but in the context of user namespaces.
26
+ """
27
+ # Make the directory
28
+ result = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir)
29
+ # Grant all the permissions: full control for user, and execute for group and other
30
+ os.chmod(result, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
31
+ # Return the path created
32
+ return result
12
33
 
13
34
  def robust_rmtree(path: Union[str, bytes]) -> None:
14
35
  """
@@ -161,17 +182,43 @@ def make_public_dir(in_directory: Optional[str] = None) -> str:
161
182
  os.chmod(this_should_never_happen, 0o777)
162
183
  return this_should_never_happen
163
184
 
164
- def try_path(path: str) -> Optional[str]:
185
+ def try_path(path: str, min_size: int = 100 * 1024 * 1024) -> Optional[str]:
165
186
  """
166
187
  Try to use the given path. Return it if it exists or can be made,
167
188
  and we can make things within it, or None otherwise.
189
+
190
+ :param min_size: Reject paths on filesystems smaller than this many bytes.
168
191
  """
192
+
169
193
  try:
170
194
  os.makedirs(path, exist_ok=True)
171
195
  except OSError:
172
196
  # Maybe we lack permissions
173
197
  return None
174
- return path if os.path.exists(path) and os.access(path, os.W_OK) else None
198
+
199
+ if not os.path.exists(path):
200
+ # We didn't manage to make it
201
+ return None
202
+
203
+ if not os.access(path, os.W_OK):
204
+ # It doesn't look writable
205
+ return None
206
+
207
+ try:
208
+ stats = os.statvfs(path)
209
+ except OSError:
210
+ # Maybe we lack permissions
211
+ return None
212
+
213
+ # Is the filesystem big enough?
214
+ # We need to look at the FS size and not the free space so we don't change
215
+ # over to a different filesystem when this one fills up.
216
+ fs_size = stats.f_frsize * stats.f_blocks
217
+ if fs_size < min_size:
218
+ # Too small
219
+ return None
220
+
221
+ return path
175
222
 
176
223
 
177
224
  class WriteWatchingStream:
toil/lib/misc.py CHANGED
@@ -9,7 +9,7 @@ import sys
9
9
  import time
10
10
  import typing
11
11
  from contextlib import closing
12
- from typing import Iterator, List, Optional, Union
12
+ from typing import Iterator, List, Optional
13
13
 
14
14
  import pytz
15
15
 
toil/lib/resources.py CHANGED
@@ -13,6 +13,8 @@
13
13
  # limitations under the License.
14
14
  import fnmatch
15
15
  import os
16
+ import math
17
+ import sys
16
18
  import resource
17
19
  from typing import List, Tuple
18
20
 
@@ -20,12 +22,17 @@ from typing import List, Tuple
20
22
  def get_total_cpu_time_and_memory_usage() -> Tuple[float, int]:
21
23
  """
22
24
  Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
23
- itself and its single largest child.
25
+ itself and its single largest child (in kibibytes).
24
26
  """
25
27
  me = resource.getrusage(resource.RUSAGE_SELF)
26
28
  children = resource.getrusage(resource.RUSAGE_CHILDREN)
27
29
  total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime
28
30
  total_memory_usage = me.ru_maxrss + children.ru_maxrss
31
+ if sys.platform == "darwin":
32
+ # On Linux, getrusage works in "kilobytes" (really kibibytes), but on
33
+ # Mac it works in bytes. See
34
+ # <https://github.com/python/cpython/issues/74698>
35
+ total_memory_usage = int(math.ceil(total_memory_usage / 1024))
29
36
  return total_cpu_time, total_memory_usage
30
37
 
31
38
 
@@ -42,7 +49,7 @@ def glob(glob_pattern: str, directoryname: str) -> List[str]:
42
49
  the glob_pattern and returns a list=[].
43
50
 
44
51
  :param directoryname: Any accessible folder name on the filesystem.
45
- :param glob_pattern: A string like "*.txt", which would find all text files.
52
+ :param glob_pattern: A string like ``*.txt``, which would find all text files.
46
53
  :return: A list=[] of absolute filepaths matching the glob pattern.
47
54
  """
48
55
  matches = []
toil/lib/threading.py CHANGED
@@ -16,6 +16,7 @@
16
16
  # Note: renamed from "threading.py" to "threading.py" to avoid conflicting imports
17
17
  # from the built-in "threading" from psutil in python3.9
18
18
  import atexit
19
+ import errno
19
20
  import fcntl
20
21
  import logging
21
22
  import math
@@ -25,7 +26,7 @@ import tempfile
25
26
  import threading
26
27
  import traceback
27
28
  from contextlib import contextmanager
28
- from typing import Any, Dict, Iterator, Optional, Union, cast
29
+ from typing import Dict, Iterator, Optional, Union, cast
29
30
 
30
31
  import psutil # type: ignore
31
32
 
@@ -108,9 +109,12 @@ def cpu_count() -> int:
108
109
  return cast(int, cached)
109
110
 
110
111
  # Get the fallback answer of all the CPUs on the machine
111
- total_machine_size = cast(int, psutil.cpu_count(logical=True))
112
+ psutil_cpu_count = cast(Optional[int], psutil.cpu_count(logical=True))
113
+ if psutil_cpu_count is None:
114
+ logger.debug('Could not retrieve the logical CPU count.')
112
115
 
113
- logger.debug('Total machine size: %d cores', total_machine_size)
116
+ total_machine_size: Union[float, int] = psutil_cpu_count if psutil_cpu_count is not None else float('inf')
117
+ logger.debug('Total machine size: %s core(s)', total_machine_size)
114
118
 
115
119
  # cgroups may limit the size
116
120
  cgroup_size: Union[float, int] = float('inf')
@@ -150,13 +154,13 @@ def cpu_count() -> int:
150
154
  if quota == -1:
151
155
  # But the quota can be -1 for unset.
152
156
  # Assume we can use the whole machine.
153
- return total_machine_size
154
-
155
- # The thread count is how many multiples of a wall clock period we
156
- # can burn in that period.
157
- cgroup_size = int(math.ceil(float(quota)/float(period)))
157
+ cgroup_size = float('inf')
158
+ else:
159
+ # The thread count is how many multiples of a wall clock period we
160
+ # can burn in that period.
161
+ cgroup_size = int(math.ceil(float(quota)/float(period)))
158
162
 
159
- logger.debug('Control group size in cores: %d', cgroup_size)
163
+ logger.debug('Control group size in cores: %s', cgroup_size)
160
164
  except:
161
165
  # We can't actually read these cgroup fields. Maybe we are a mac or something.
162
166
  logger.debug('Could not inspect cgroup: %s', traceback.format_exc())
@@ -174,9 +178,16 @@ def cpu_count() -> int:
174
178
  else:
175
179
  logger.debug('CPU affinity not available')
176
180
 
177
- # Return the smaller of the actual thread count and the cgroup's limit, minimum 1.
178
- result = cast(int, max(1, min(min(affinity_size, cgroup_size), total_machine_size)))
179
- logger.debug('cpu_count: %s', str(result))
181
+ limit: Union[float, int] = float('inf')
182
+ # Apply all the limits to take the smallest
183
+ limit = min(limit, total_machine_size)
184
+ limit = min(limit, cgroup_size)
185
+ limit = min(limit, affinity_size)
186
+ if limit < 1 or limit == float('inf'):
187
+ # Fall back to 1 if we can't get a size
188
+ limit = 1
189
+ result = int(limit)
190
+ logger.debug('cpu_count: %s', result)
180
191
  # Make sure to remember it for the next call
181
192
  setattr(cpu_count, 'result', result)
182
193
  return result
@@ -358,6 +369,9 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
358
369
  :param str mutex: Mutex to lock. Must be a permissible path component.
359
370
  """
360
371
 
372
+ if not os.path.isdir(base_dir):
373
+ raise RuntimeError(f"Directory {base_dir} for mutex does not exist")
374
+
361
375
  # Define a filename
362
376
  lock_filename = os.path.join(base_dir, 'toil-mutex-' + mutex)
363
377
 
@@ -368,18 +382,32 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
368
382
  # get a lock on the deleted file.
369
383
 
370
384
  while True:
371
- fd = -1
372
-
373
- try:
374
- # Try to create the file, ignoring if it exists or not.
375
- fd = os.open(lock_filename, os.O_CREAT | os.O_WRONLY)
385
+ # Try to create the file, ignoring if it exists or not.
386
+ fd = os.open(lock_filename, os.O_CREAT | os.O_WRONLY)
376
387
 
377
- # Wait until we can exclusively lock it.
378
- fcntl.lockf(fd, fcntl.LOCK_EX)
388
+ # Wait until we can exclusively lock it.
389
+ fcntl.lockf(fd, fcntl.LOCK_EX)
379
390
 
380
- # Holding the lock, make sure we are looking at the same file on disk still.
391
+ # Holding the lock, make sure we are looking at the same file on disk still.
392
+ try:
393
+ # So get the stats from the open file
381
394
  fd_stats = os.fstat(fd)
395
+ except OSError as e:
396
+ if e.errno == errno.ESTALE:
397
+ # The file handle has gone stale, because somebody removed the file.
398
+ # Try again.
399
+ try:
400
+ fcntl.lockf(fd, fcntl.LOCK_UN)
401
+ except OSError:
402
+ pass
403
+ os.close(fd)
404
+ continue
405
+ else:
406
+ # Something else broke
407
+ raise
382
408
 
409
+ try:
410
+ # And get the stats for the name in the directory
383
411
  path_stats: Optional[os.stat_result] = os.stat(lock_filename)
384
412
  except FileNotFoundError:
385
413
  path_stats = None
@@ -389,10 +417,9 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
389
417
  # any). This usually happens, because before someone releases a
390
418
  # lock, they delete the file. Go back and contend again. TODO: This
391
419
  # allows a lot of queue jumping on our mutex.
392
- if fd != -1:
393
- fcntl.lockf(fd, fcntl.LOCK_UN)
394
- os.close(fd)
395
- continue
420
+ fcntl.lockf(fd, fcntl.LOCK_UN)
421
+ os.close(fd)
422
+ continue
396
423
  else:
397
424
  # We have a lock on the file that the name points to. Since we
398
425
  # hold the lock, nobody will be deleting it or can be in the
@@ -407,14 +434,40 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
407
434
  # Delete it while we still own it, so we can't delete it from out from
408
435
  # under someone else who thinks they are holding it.
409
436
  logger.debug('PID %d releasing mutex %s', os.getpid(), lock_filename)
410
- os.unlink(lock_filename)
411
- if fd != -1:
412
- fcntl.lockf(fd, fcntl.LOCK_UN)
413
- # Note that we are unlinking it and then unlocking it; a lot of people
414
- # might have opened it before we unlinked it and will wake up when they
415
- # get the worthless lock on the now-unlinked file. We have to do some
416
- # stat gymnastics above to work around this.
417
- os.close(fd)
437
+
438
+ # We have had observations in the wild of the lock file not existing
439
+ # when we go to unlink it, causing a crash on mutex release. See
440
+ # <https://github.com/DataBiosphere/toil/issues/4654>.
441
+ #
442
+ # We want to tolerate this; maybe unlink() interacts with fcntl() locks
443
+ # on NFS in a way that is actually fine, somehow? But we also want to
444
+ # complain loudly if something is tampering with our locks or not
445
+ # really enforcing locks on the filesystem, so we will notice if it is
446
+ # the cause of further problems.
447
+ try:
448
+ path_stats = os.stat(lock_filename)
449
+ except FileNotFoundError:
450
+ path_stats = None
451
+
452
+ # Check to make sure it still looks locked before we unlink.
453
+ if path_stats is None:
454
+ logger.error('PID %d had mutex %s disappear while locked! Mutex system is not working!', os.getpid(), lock_filename)
455
+ elif fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino:
456
+ logger.error('PID %d had mutex %s get replaced while locked! Mutex system is not working!', os.getpid(), lock_filename)
457
+
458
+ if path_stats is not None:
459
+ try:
460
+ # Unlink the file
461
+ os.unlink(lock_filename)
462
+ except FileNotFoundError:
463
+ logger.error('PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!', os.getpid(), lock_filename)
464
+
465
+ # Note that we are unlinking it and then unlocking it; a lot of people
466
+ # might have opened it before we unlinked it and will wake up when they
467
+ # get the worthless lock on the now-unlinked file. We have to do some
468
+ # stat gymnastics above to work around this.
469
+ fcntl.lockf(fd, fcntl.LOCK_UN)
470
+ os.close(fd)
418
471
 
419
472
 
420
473
  class LastProcessStandingArena:
@@ -475,8 +528,8 @@ class LastProcessStandingArena:
475
528
  logger.debug('Joining arena %s', self.lockfileDir)
476
529
 
477
530
  # Make sure we're not in it already.
478
- assert self.lockfileName is None
479
- assert self.lockfileFD is None
531
+ if self.lockfileName is not None or self.lockfileFD is not None:
532
+ raise RuntimeError("A process is already in the arena")
480
533
 
481
534
  with global_mutex(self.base_dir, self.mutex):
482
535
  # Now nobody else should also be trying to join or leave.
@@ -486,9 +539,14 @@ class LastProcessStandingArena:
486
539
  os.mkdir(self.lockfileDir)
487
540
  except FileExistsError:
488
541
  pass
542
+ except Exception as e:
543
+ raise RuntimeError("Could not make lock file directory " + self.lockfileDir) from e
489
544
 
490
545
  # Make ourselves a file in it and lock it to prove we are alive.
491
- self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir) # type: ignore
546
+ try:
547
+ self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir) # type: ignore
548
+ except Exception as e:
549
+ raise RuntimeError("Could not make lock file in " + self.lockfileDir) from e
492
550
  # Nobody can see it yet, so lock it right away
493
551
  fcntl.lockf(self.lockfileFD, fcntl.LOCK_EX) # type: ignore
494
552
 
@@ -511,8 +569,8 @@ class LastProcessStandingArena:
511
569
  """
512
570
 
513
571
  # Make sure we're in it to start.
514
- assert self.lockfileName is not None
515
- assert self.lockfileFD is not None
572
+ if self.lockfileName is None or self.lockfileFD is None:
573
+ raise RuntimeError("This process is not in the arena.")
516
574
 
517
575
  logger.debug('Leaving arena %s', self.lockfileDir)
518
576
 
@@ -533,7 +591,12 @@ class LastProcessStandingArena:
533
591
  # There is someone claiming to be here. Are they alive?
534
592
  full_path = os.path.join(self.lockfileDir, item)
535
593
 
536
- fd = os.open(full_path, os.O_RDONLY)
594
+ try:
595
+ fd = os.open(full_path, os.O_RDONLY)
596
+ except OSError as e:
597
+ # The file may suddenly not exist, e.g. on a network file system.
598
+ continue
599
+
537
600
  try:
538
601
  fcntl.lockf(fd, fcntl.LOCK_SH | fcntl.LOCK_NB)
539
602
  except OSError as e: