toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/lib/threading.py
CHANGED
@@ -21,14 +21,18 @@ import fcntl
 import logging
 import math
 import os
+import platform
+import subprocess
 import sys
 import tempfile
 import threading
+import time
 import traceback
+from collections.abc import Iterator
 from contextlib import contextmanager
-from typing import
+from typing import Optional, Union, cast
 
-import psutil
+import psutil
 
 from toil.lib.exceptions import raise_
 from toil.lib.io import robust_rmtree
@@ -36,12 +40,150 @@ from toil.lib.io import robust_rmtree
 logger = logging.getLogger(__name__)
 
 
+def ensure_filesystem_lockable(
+    path: str, timeout: float = 30, hint: Optional[str] = None
+) -> None:
+    """
+    Make sure that the filesystem used at the given path is one where locks are safe to use.
+
+    File locks are not safe to use on Ceph. See
+    <https://github.com/DataBiosphere/toil/issues/4972>.
+
+    Raises an exception if the filesystem is detected as one where using locks
+    is known to trigger bugs in the filesystem implementation. Also raises an
+    exception if the given path does not exist, or if attempting to determine
+    the filesystem type takes more than the timeout in seconds.
+
+    If the filesystem type cannot be determined, does nothing.
+
+    :param hint: Extra text to include in an error, if raised, telling the user
+        how to change the offending path.
+    """
+
+    if not os.path.exists(path):
+        # Raise a normal-looking FileNotFoundError. See <https://stackoverflow.com/a/36077407>
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
+
+    if platform.system() == "Linux":
+        # We know how to find the filesystem here.
+
+        try:
+            # Start a child process to stat the path. See <https://unix.stackexchange.com/a/402236>.
+            # We really should call statfs but no bindings for it are in PyPI.
+            completed = subprocess.run(
+                ["stat", "-f", "-c", "%T", path],
+                check=True,
+                capture_output=True,
+                timeout=timeout,
+            )
+        except subprocess.TimeoutExpired as e:
+            # The subprocess itself is Too Slow
+            raise RuntimeError(
+                f"Polling filesystem type at {path} took more than {timeout} seconds; is your filesystem working?"
+            ) from e
+        except subprocess.CalledProcessError as e:
+            # Stat didn't work. Maybe we don't have the right version of stat installed?
+            logger.warning(
+                "Could not determine filesystem type at %s because of: %s",
+                path,
+                e.stderr.decode("utf-8", errors="replace").strip(),
+            )
+            # If we don't know the filesystem type, keep going anyway.
+            return
+
+        filesystem_type = completed.stdout.decode("utf-8", errors="replace").strip()
+
+        if filesystem_type == "ceph":
+            # Ceph is known to deadlock the MDS and break the parent directory when locking.
+            message = [
+                f"Refusing to use {path} because file locks are known to break {filesystem_type} filesystems."
+            ]
+            if hint:
+                # Hint the user how to fix this.
+                message.append(hint)
+            raise RuntimeError(" ".join(message))
+        else:
+            # Other filesystem types are fine (even though NFS is sometimes
+            # flaky with regard to locks actually locking anything).
+            logger.debug(
+                "Detected that %s has lockable filesystem type: %s",
+                path,
+                filesystem_type,
+            )
+
+    # Other platforms (Mac) probably aren't mounting Ceph and also don't
+    # usually use the same stat binary implementation.
+
+
+def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None:
+    """
+    Get an fcntl lock, while retrying on IO errors.
+
+    Raises OSError with EACCES or EAGAIN when a nonblocking lock is not
+    immediately available.
+    """
+
+    # Set up retry logic. TODO: Use @retry instead.
+    error_backoff = 1
+    MAX_ERROR_TRIES = 10
+    error_tries = 0
+
+    while True:
+        try:
+            # Wait until we can exclusively lock it.
+            lock_mode = (fcntl.LOCK_SH if shared else fcntl.LOCK_EX) | (
+                fcntl.LOCK_NB if not block else 0
+            )
+            fcntl.flock(fd, lock_mode)
+            return
+        except OSError as e:
+            if e.errno in (errno.EACCES, errno.EAGAIN):
+                # Nonblocking lock not available.
+                raise
+            elif e.errno == errno.EIO:
+                # Sometimes Ceph produces IO errors when talking to lock files.
+                # Back off and try again.
+                # TODO: Should we eventually give up if the disk really is
+                # broken? If so we should use the retry system.
+                if error_tries < MAX_ERROR_TRIES:
+                    logger.error(
+                        "IO error talking to lock file. Retrying after %s seconds.",
+                        error_backoff,
+                    )
+                    time.sleep(error_backoff)
+                    error_backoff = min(60, error_backoff * 2)
+                    error_tries += 1
+                    continue
+                else:
+                    logger.critical(
+                        "Too many IO errors talking to lock file. If using Ceph, check for MDS deadlocks. See <https://tracker.ceph.com/issues/62123>."
+                    )
+                    raise
+            else:
+                raise
+
+
+def safe_unlock_and_close(fd: int) -> None:
+    """
+    Release an fcntl lock and close the file descriptor, while handling fcntl IO errors.
+    """
+    try:
+        fcntl.flock(fd, fcntl.LOCK_UN)
+    except OSError as e:
+        if e.errno != errno.EIO:
+            raise
+        # Sometimes Ceph produces EIO. We don't need to retry then because
+        # we're going to close the FD and after that the file can't remain
+        # locked by us.
+    os.close(fd)
+
+
 class ExceptionalThread(threading.Thread):
     """
     A thread whose join() method re-raises exceptions raised during run(). While join() is
     idempotent, the exception is only during the first invocation of join() that successfully
     joined the thread. If join() times out, no exception will be re reraised even though an
-    exception might already have
+    exception might already have occurred in run().
 
     When subclassing this thread, override tryRun() instead of run().
 
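Note: the three helpers added above are meant to be used together: vet the directory once with ensure_filesystem_lockable, then take and release fcntl locks through the EIO-retrying wrappers. A minimal sketch, assuming toil.lib.threading from toil 8.0.0; the lock directory and file name are hypothetical:

```python
# Sketch only: exercising the new helpers added in this diff.
import errno
import os

from toil.lib.threading import (
    ensure_filesystem_lockable,
    safe_lock,
    safe_unlock_and_close,
)

lock_dir = "/tmp/toil-locks"  # hypothetical; must not be on Ceph
os.makedirs(lock_dir, exist_ok=True)

# Raises RuntimeError if lock_dir is on a filesystem (Ceph) where fcntl
# locks are known to misbehave; does nothing if the type can't be determined.
ensure_filesystem_lockable(lock_dir, hint="Use a different lock directory.")

fd = os.open(os.path.join(lock_dir, "example.lock"), os.O_CREAT | os.O_WRONLY)
try:
    # Nonblocking attempt: raises OSError(EACCES/EAGAIN) if already held.
    safe_lock(fd, block=False)
except OSError as e:
    if e.errno in (errno.EACCES, errno.EAGAIN):
        print("lock is busy")
        os.close(fd)
    else:
        raise
else:
    print("lock acquired")  # ... critical section ...
    safe_unlock_and_close(fd)  # unlocks (tolerating Ceph EIO) and closes
```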
@@ -65,6 +207,7 @@ class ExceptionalThread(threading.Thread):
     AssertionError
 
     """
+
    exc_info = None
 
    def run(self) -> None:
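Note: the hunk above is whitespace-only, but the surrounding class docstring describes the behavior most Toil threads rely on. A sketch of the documented join() behavior, assuming this module is importable; the failing function is hypothetical:

```python
# Sketch only: join() re-raises an exception that happened in run().
from toil.lib.threading import ExceptionalThread

def boom() -> None:
    raise ValueError("failure inside the thread")

t = ExceptionalThread(target=boom)
t.start()
try:
    t.join()  # re-raises the ValueError here, unlike threading.Thread.join()
except ValueError as e:
    print(f"caught in parent: {e}")
```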
@@ -103,18 +246,23 @@ def cpu_count() -> int:
     :rtype: int
     """
 
-    cached = getattr(cpu_count,
+    cached = getattr(cpu_count, "result", None)
     if cached is not None:
         # We already got a CPU count.
         return cast(int, cached)
 
     # Get the fallback answer of all the CPUs on the machine
-
+    psutil_cpu_count = psutil.cpu_count(logical=True)
+    if psutil_cpu_count is None:
+        logger.debug("Could not retrieve the logical CPU count.")
 
-
+    total_machine_size: Union[float, int] = (
+        psutil_cpu_count if psutil_cpu_count is not None else float("inf")
+    )
+    logger.debug("Total machine size: %s core(s)", total_machine_size)
 
     # cgroups may limit the size
-    cgroup_size: Union[float, int] = float(
+    cgroup_size: Union[float, int] = float("inf")
 
     try:
         # See if we can fetch these and use them
@@ -122,13 +270,13 @@ def cpu_count() -> int:
         period: Optional[int] = None
 
         # CGroups v1 keeps quota and period separate
-        CGROUP1_QUOTA_FILE =
-        CGROUP1_PERIOD_FILE =
+        CGROUP1_QUOTA_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
+        CGROUP1_PERIOD_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
         # CGroups v2 keeps both in one file, space-separated, quota first
-        CGROUP2_COMBINED_FILE =
+        CGROUP2_COMBINED_FILE = "/sys/fs/cgroup/cpu.max"
 
         if os.path.exists(CGROUP1_QUOTA_FILE) and os.path.exists(CGROUP1_PERIOD_FILE):
-            logger.debug(
+            logger.debug("CPU quota and period available from cgroups v1")
             with open(CGROUP1_QUOTA_FILE) as stream:
                 # Read the quota
                 quota = int(stream.read())
@@ -137,49 +285,58 @@ def cpu_count() -> int:
             # Read the period in which we are allowed to burn the quota
             period = int(stream.read())
         elif os.path.exists(CGROUP2_COMBINED_FILE):
-            logger.debug(
+            logger.debug("CPU quota and period available from cgroups v2")
             with open(CGROUP2_COMBINED_FILE) as stream:
                 # Read the quota and the period together
-                quota, period = (int(part) for part in stream.read().split(
+                quota, period = (int(part) for part in stream.read().split(" "))
         else:
-            logger.debug(
+            logger.debug("CPU quota/period not available from cgroups v1 or cgroups v2")
 
         if quota is not None and period is not None:
             # We got a quota and a period.
-            logger.debug(
+            logger.debug("CPU quota: %d period: %d", quota, period)
 
             if quota == -1:
                 # But the quota can be -1 for unset.
                 # Assume we can use the whole machine.
-
-
-
-
-
+                cgroup_size = float("inf")
+            else:
+                # The thread count is how many multiples of a wall clock period we
+                # can burn in that period.
+                cgroup_size = int(math.ceil(float(quota) / float(period)))
 
-            logger.debug(
+            logger.debug("Control group size in cores: %s", cgroup_size)
     except:
         # We can't actually read these cgroup fields. Maybe we are a mac or something.
-        logger.debug(
+        logger.debug("Could not inspect cgroup: %s", traceback.format_exc())
 
     # CPU affinity may limit the size
-    affinity_size: Union[float, int] = float(
-    if hasattr(os,
+    affinity_size: Union[float, int] = float("inf")
+    if hasattr(os, "sched_getaffinity"):
         try:
-            logger.debug(
+            logger.debug("CPU affinity available")
             affinity_size = len(os.sched_getaffinity(0))
-            logger.debug(
+            logger.debug("CPU affinity is restricted to %d cores", affinity_size)
         except:
-
-            logger.debug(
+            # We can't actually read this even though it exists.
+            logger.debug(
+                "Could not inspect scheduling affinity: %s", traceback.format_exc()
+            )
     else:
-        logger.debug(
-
-
-
-
+        logger.debug("CPU affinity not available")
+
+    limit: Union[float, int] = float("inf")
+    # Apply all the limits to take the smallest
+    limit = min(limit, total_machine_size)
+    limit = min(limit, cgroup_size)
+    limit = min(limit, affinity_size)
+    if limit < 1 or limit == float("inf"):
+        # Fall back to 1 if we can't get a size
+        limit = 1
+    result = int(limit)
+    logger.debug("cpu_count: %s", result)
     # Make sure to remember it for the next call
-    setattr(cpu_count,
+    setattr(cpu_count, "result", result)
     return result
 
 
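Note: after this change cpu_count() takes the smallest of the logical CPU total from psutil, the cgroup v1/v2 CPU quota rounded up as ceil(quota / period), and the sched_getaffinity mask, falls back to 1, and caches the result on the function object. A worked sketch of that arithmetic, with hypothetical container numbers:

```python
# Sketch only: the limit logic above, for a container with
# "cpu.max: 150000 100000" (150000us quota per 100000us period)
# on an 8-core machine with an unrestricted affinity mask.
import math

total_machine_size = 8                                # psutil.cpu_count(logical=True)
cgroup_size = int(math.ceil(float(150000) / 100000))  # -> 2
affinity_size = 8                                     # len(os.sched_getaffinity(0))

limit = min(float("inf"), total_machine_size, cgroup_size, affinity_size)
print(int(limit))  # -> 2, matching what cpu_count() would return and cache
```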
@@ -201,7 +358,8 @@ def cpu_count() -> int:
 current_process_name_lock = threading.Lock()
 # And a global dict from work directory to name in that work directory.
 # We also have a file descriptor per work directory but it is just leaked.
-current_process_name_for:
+current_process_name_for: dict[str, str] = {}
+
 
 def collect_process_name_garbage() -> None:
     """
@@ -225,6 +383,7 @@ def collect_process_name_garbage() -> None:
     for base_dir in missing:
         del current_process_name_for[base_dir]
 
+
 def destroy_all_process_names() -> None:
     """
     Delete all our process name files because our process is going away.
@@ -239,9 +398,11 @@ def destroy_all_process_names() -> None:
     for base_dir, name in current_process_name_for.items():
         robust_rmtree(os.path.join(base_dir, name))
 
+
 # Run the cleanup at exit
 atexit.register(destroy_all_process_names)
 
+
 def get_process_name(base_dir: str) -> str:
     """
     Return the name of the current process. Like a PID but visible between
@@ -270,10 +431,16 @@ def get_process_name(base_dir: str) -> str:
 
     # Lock the file. The lock will automatically go away if our process does.
     try:
-
+        safe_lock(nameFD, block=False)
     except OSError as e:
-
-
+        if e.errno in (errno.EACCES, errno.EAGAIN):
+            # Someone else locked it even though they should not have.
+            raise RuntimeError(
+                f"Could not lock process name file {nameFileName}"
+            ) from e
+        else:
+            # Something else is wrong
+            raise
 
     # Save the basename
     current_process_name_for[base_dir] = os.path.basename(nameFileName)
@@ -311,20 +478,24 @@ def process_name_exists(base_dir: str, name: str) -> bool:
         # If the file is gone, the process can't exist.
         return False
 
-
     nameFD = None
     try:
         try:
             # Otherwise see if we can lock it shared, for which we need an FD, but
             # only for reading.
             nameFD = os.open(nameFileName, os.O_RDONLY)
-            fcntl.lockf(nameFD, fcntl.LOCK_SH | fcntl.LOCK_NB)
         except FileNotFoundError as e:
             # File has vanished
            return False
+        try:
+            safe_lock(nameFD, block=False, shared=True)
         except OSError as e:
-
-
+            if e.errno in (errno.EACCES, errno.EAGAIN):
+                # Could not lock. Process is alive.
+                return True
+            else:
+                # Something else went wrong
+                raise
         else:
             # Could lock. Process is dead.
             # Remove the file. We race to be the first to do so.
@@ -332,8 +503,8 @@ def process_name_exists(base_dir: str, name: str) -> bool:
                 os.remove(nameFileName)
             except:
                 pass
-
-
+            safe_unlock_and_close(nameFD)
+            nameFD = None
             # Report process death
             return False
     finally:
@@ -343,6 +514,7 @@ def process_name_exists(base_dir: str, name: str) -> bool:
         except:
             pass
 
+
 # Similar to the process naming system above, we define a global mutex system
 # for critical sections, based just around file locks.
 @contextmanager
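Note: both signatures here appear in the hunk headers above; each process gets a name file held under an exclusive lock while it lives, and liveness checks now go through safe_lock. A sketch, with a hypothetical base directory:

```python
# Sketch only: per-process name files, exclusively locked while the owner
# lives. The base directory is hypothetical.
import os

from toil.lib.threading import get_process_name, process_name_exists

base_dir = "/tmp/toil-coordination"
os.makedirs(base_dir, exist_ok=True)

me = get_process_name(base_dir)  # makes a name file and holds an exclusive lock
# A separate open of the file cannot take a shared nonblocking lock while we
# are alive, so this reports True:
print(process_name_exists(base_dir, me))
```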
@@ -362,21 +534,34 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
     if not os.path.isdir(base_dir):
         raise RuntimeError(f"Directory {base_dir} for mutex does not exist")
 
+    # TODO: We don't know what CLI option controls where to put this mutex, so
+    # we aren't very helpful if the location is bad.
+    ensure_filesystem_lockable(
+        base_dir, hint=f"Specify a different place to put the {mutex} mutex."
+    )
+
     # Define a filename
-    lock_filename = os.path.join(base_dir,
+    lock_filename = os.path.join(base_dir, "toil-mutex-" + mutex)
 
-    logger.debug(
+    logger.debug("PID %d acquiring mutex %s", os.getpid(), lock_filename)
 
     # We can't just create/open and lock a file, because when we clean up
     # there's a race where someone can open the file before we unlink it and
     # get a lock on the deleted file.
 
+    error_backoff = 1
+
     while True:
         # Try to create the file, ignoring if it exists or not.
         fd = os.open(lock_filename, os.O_CREAT | os.O_WRONLY)
 
-
-
+        try:
+            # Wait until we can exclusively lock it, handling error retry.
+            safe_lock(fd)
+        except:
+            # Something went wrong
+            os.close(fd)
+            raise
 
         # Holding the lock, make sure we are looking at the same file on disk still.
         try:
@@ -384,16 +569,14 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
             fd_stats = os.fstat(fd)
         except OSError as e:
             if e.errno == errno.ESTALE:
-                # The file handle has gone stale, because somebody removed the
+                # The file handle has gone stale, because somebody removed the
+                # file.
                 # Try again.
-
-                    fcntl.lockf(fd, fcntl.LOCK_UN)
-                except OSError:
-                    pass
-                os.close(fd)
+                safe_unlock_and_close(fd)
                 continue
             else:
                 # Something else broke
+                os.close(fd)
                 raise
 
         try:
@@ -402,13 +585,16 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
         except FileNotFoundError:
             path_stats = None
 
-        if
+        if (
+            path_stats is None
+            or fd_stats.st_dev != path_stats.st_dev
+            or fd_stats.st_ino != path_stats.st_ino
+        ):
             # The file we have a lock on is not the file linked to the name (if
             # any). This usually happens, because before someone releases a
             # lock, they delete the file. Go back and contend again. TODO: This
             # allows a lot of queue jumping on our mutex.
-
-            os.close(fd)
+            safe_unlock_and_close(fd)
             continue
         else:
             # We have a lock on the file that the name points to. Since we
@@ -418,12 +604,12 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
 
     try:
         # When we have it, do the thing we are protecting.
-        logger.debug(
+        logger.debug("PID %d now holds mutex %s", os.getpid(), lock_filename)
         yield
     finally:
         # Delete it while we still own it, so we can't delete it from out from
         # under someone else who thinks they are holding it.
-        logger.debug(
+        logger.debug("PID %d releasing mutex %s", os.getpid(), lock_filename)
 
         # We have had observations in the wild of the lock file not exisiting
         # when we go to unlink it, causing a crash on mutex release. See
@@ -441,23 +627,36 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
 
         # Check to make sure it still looks locked before we unlink.
         if path_stats is None:
-            logger.error(
-
-
+            logger.error(
+                "PID %d had mutex %s disappear while locked! Mutex system is not working!",
+                os.getpid(),
+                lock_filename,
+            )
+        elif (
+            fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino
+        ):
+            logger.error(
+                "PID %d had mutex %s get replaced while locked! Mutex system is not working!",
+                os.getpid(),
+                lock_filename,
+            )
 
         if path_stats is not None:
             try:
                 # Unlink the file
                 os.unlink(lock_filename)
             except FileNotFoundError:
-                logger.error(
+                logger.error(
+                    "PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!",
+                    os.getpid(),
+                    lock_filename,
+                )
 
         # Note that we are unlinking it and then unlocking it; a lot of people
         # might have opened it before we unlinked it and will wake up when they
         # get the worthless lock on the now-unlinked file. We have to do some
         # stat gymnastics above to work around this.
-
-        os.close(fd)
+        safe_unlock_and_close(fd)
 
 
 class LastProcessStandingArena:
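Note: global_mutex remains a context manager over a lock file named "toil-mutex-" plus the mutex name; what changed is that it now refuses Ceph directories up front and funnels locking through the retrying helpers. A sketch with hypothetical paths:

```python
# Sketch only: a cross-process critical section. Paths are hypothetical.
import os

from toil.lib.threading import global_mutex

base_dir = "/tmp/toil-coordination"  # must exist and not be on Ceph
os.makedirs(base_dir, exist_ok=True)

with global_mutex(base_dir, "example-section"):
    # At most one process on this host is inside this block at a time;
    # the lock file is /tmp/toil-coordination/toil-mutex-example-section.
    print(f"PID {os.getpid()} holds the mutex")
```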
@@ -493,13 +692,13 @@ class LastProcessStandingArena:
 
         # We need a mutex name to allow only one process to be entering or
         # leaving at a time.
-        self.mutex = name +
+        self.mutex = name + "-arena-lock"
 
         # We need a way to track who is actually in, and who was in but died.
         # So everybody gets a locked file (again).
         # TODO: deduplicate with the similar logic for process names, and also
         # deferred functions.
-        self.lockfileDir = os.path.join(base_dir, name +
+        self.lockfileDir = os.path.join(base_dir, name + "-arena-members")
 
         # When we enter the arena, we fill this in with the FD of the locked
         # file that represents our presence.
@@ -515,7 +714,7 @@ class LastProcessStandingArena:
         You may not enter the arena again before leaving it.
         """
 
-        logger.debug(
+        logger.debug("Joining arena %s", self.lockfileDir)
 
         # Make sure we're not in it already.
         if self.lockfileName is not None or self.lockfileFD is not None:
@@ -529,15 +728,24 @@ class LastProcessStandingArena:
             os.mkdir(self.lockfileDir)
         except FileExistsError:
             pass
+        except Exception as e:
+            raise RuntimeError(
+                "Could not make lock file directory " + self.lockfileDir
+            ) from e
 
         # Make ourselves a file in it and lock it to prove we are alive.
-
+        try:
+            self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
+        except Exception as e:
+            raise RuntimeError(
+                "Could not make lock file in " + self.lockfileDir
+            ) from e
         # Nobody can see it yet, so lock it right away
-
+        safe_lock(self.lockfileFD)  # type: ignore
 
         # Now we're properly in, so release the global mutex
 
-        logger.debug(
+        logger.debug("Now in arena %s", self.lockfileDir)
 
     def leave(self) -> Iterator[bool]:
         """
@@ -557,7 +765,7 @@ class LastProcessStandingArena:
         if self.lockfileName is None or self.lockfileFD is None:
             raise RuntimeError("This process is not in the arena.")
 
-        logger.debug(
+        logger.debug("Leaving arena %s", self.lockfileDir)
 
         with global_mutex(self.base_dir, self.mutex):
             # Now nobody else should also be trying to join or leave.
@@ -568,8 +776,7 @@ class LastProcessStandingArena:
             except:
                 pass
             self.lockfileName = None
-
-            os.close(self.lockfileFD)
+            safe_unlock_and_close(self.lockfileFD)
             self.lockfileFD = None
 
             for item in os.listdir(self.lockfileDir):
@@ -583,32 +790,42 @@ class LastProcessStandingArena:
                 continue
 
             try:
-
+                safe_lock(fd, block=False, shared=True)
             except OSError as e:
-
-
+                if e.errno in (errno.EACCES, errno.EAGAIN):
+                    # Could not lock. It's alive!
+                    break
+                else:
+                    # Something else is wrong
+                    os.close(fd)
+                    raise
             else:
                 # Could lock. Process is dead.
                 try:
                     os.remove(full_path)
                 except:
                     pass
-
+                safe_unlock_and_close(fd)
             # Continue with the loop normally.
         else:
             # Nothing alive was found. Nobody will come in while we hold
             # the global mutex, so we are the Last Process Standing.
-            logger.debug(
+            logger.debug(
+                "We are the Last Process Standing in arena %s", self.lockfileDir
+            )
             yield True
 
             try:
                 # Delete the arena directory so as to leave nothing behind.
                 os.rmdir(self.lockfileDir)
             except:
-                logger.warning(
-
+                logger.warning(
+                    "Could not clean up arena %s completely: %s",
+                    self.lockfileDir,
+                    traceback.format_exc(),
+                )
 
         # Now we're done, whether we were the last one or not, and can
         # release the mutex.
 
-        logger.debug(
+        logger.debug("Now out of arena %s", self.lockfileDir)
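Note: leave() is a generator (see the Iterator[bool] signature above) that yields True for exactly one departing process, the last one whose member lock file is still held by a live process. A sketch, assuming the arena is joined with an enter() method as in released Toil (that method name does not appear in this diff) and hypothetical paths:

```python
# Sketch only: last-process-standing cleanup. enter() is assumed from the
# released Toil API; it is not shown in this diff. Paths are hypothetical.
import os

from toil.lib.threading import LastProcessStandingArena

base_dir = "/tmp/toil-coordination"
os.makedirs(base_dir, exist_ok=True)

arena = LastProcessStandingArena(base_dir, "example")
arena.enter()  # joins under the "example-arena-lock" global mutex
try:
    pass  # ... work that multiple processes share ...
finally:
    for last_standing in arena.leave():
        if last_standing:
            print("No other members alive; safe to clean shared state")
```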