toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/lib/threading.py
CHANGED
@@ -21,12 +21,16 @@ import fcntl
 import logging
 import math
 import os
+import platform
+import subprocess
 import sys
 import tempfile
 import threading
+import time
 import traceback
+from collections.abc import Iterator
 from contextlib import contextmanager
-from typing import
+from typing import Optional, Union, cast
 
 import psutil
 
@@ -36,6 +40,144 @@ from toil.lib.io import robust_rmtree
 logger = logging.getLogger(__name__)
 
 
+def ensure_filesystem_lockable(
+    path: str, timeout: float = 30, hint: Optional[str] = None
+) -> None:
+    """
+    Make sure that the filesystem used at the given path is one where locks are safe to use.
+
+    File locks are not safe to use on Ceph. See
+    <https://github.com/DataBiosphere/toil/issues/4972>.
+
+    Raises an exception if the filesystem is detected as one where using locks
+    is known to trigger bugs in the filesystem implementation. Also raises an
+    exception if the given path does not exist, or if attempting to determine
+    the filesystem type takes more than the timeout in seconds.
+
+    If the filesystem type cannot be determined, does nothing.
+
+    :param hint: Extra text to include in an error, if raised, telling the user
+        how to change the offending path.
+    """
+
+    if not os.path.exists(path):
+        # Raise a normal-looking FileNotFoundError. See <https://stackoverflow.com/a/36077407>
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
+
+    if platform.system() == "Linux":
+        # We know how to find the filesystem here.
+
+        try:
+            # Start a child process to stat the path. See <https://unix.stackexchange.com/a/402236>.
+            # We really should call statfs but no bindings for it are in PyPI.
+            completed = subprocess.run(
+                ["stat", "-f", "-c", "%T", path],
+                check=True,
+                capture_output=True,
+                timeout=timeout,
+            )
+        except subprocess.TimeoutExpired as e:
+            # The subprocess itself is Too Slow
+            raise RuntimeError(
+                f"Polling filesystem type at {path} took more than {timeout} seconds; is your filesystem working?"
+            ) from e
+        except subprocess.CalledProcessError as e:
+            # Stat didn't work. Maybe we don't have the right version of stat installed?
+            logger.warning(
+                "Could not determine filesystem type at %s because of: %s",
+                path,
+                e.stderr.decode("utf-8", errors="replace").strip(),
+            )
+            # If we don't know the filesystem type, keep going anyway.
+            return
+
+        filesystem_type = completed.stdout.decode("utf-8", errors="replace").strip()
+
+        if filesystem_type == "ceph":
+            # Ceph is known to deadlock the MDS and break the parent directory when locking.
+            message = [
+                f"Refusing to use {path} because file locks are known to break {filesystem_type} filesystems."
+            ]
+            if hint:
+                # Hint the user how to fix this.
+                message.append(hint)
+            raise RuntimeError(" ".join(message))
+        else:
+            # Other filesystem types are fine (even though NFS is sometimes
+            # flaky with regard to locks actually locking anything).
+            logger.debug(
+                "Detected that %s has lockable filesystem type: %s",
+                path,
+                filesystem_type,
+            )
+
+    # Other platforms (Mac) probably aren't mounting Ceph and also don't
+    # usually use the same stat binary implementation.
+
+
+def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None:
+    """
+    Get an fcntl lock, while retrying on IO errors.
+
+    Raises OSError with EACCES or EAGAIN when a nonblocking lock is not
+    immediately available.
+    """
+
+    # Set up retry logic. TODO: Use @retry instead.
+    error_backoff = 1
+    MAX_ERROR_TRIES = 10
+    error_tries = 0
+
+    while True:
+        try:
+            # Wait until we can exclusively lock it.
+            lock_mode = (fcntl.LOCK_SH if shared else fcntl.LOCK_EX) | (
+                fcntl.LOCK_NB if not block else 0
+            )
+            fcntl.flock(fd, lock_mode)
+            return
+        except OSError as e:
+            if e.errno in (errno.EACCES, errno.EAGAIN):
+                # Nonblocking lock not available.
+                raise
+            elif e.errno == errno.EIO:
+                # Sometimes Ceph produces IO errors when talking to lock files.
+                # Back off and try again.
+                # TODO: Should we eventually give up if the disk really is
+                # broken? If so we should use the retry system.
+                if error_tries < MAX_ERROR_TRIES:
+                    logger.error(
+                        "IO error talking to lock file. Retrying after %s seconds.",
+                        error_backoff,
+                    )
+                    time.sleep(error_backoff)
+                    error_backoff = min(60, error_backoff * 2)
+                    error_tries += 1
+                    continue
+                else:
+                    logger.critical(
+                        "Too many IO errors talking to lock file. If using Ceph, check for MDS deadlocks. See <https://tracker.ceph.com/issues/62123>."
+                    )
+                    raise
+            else:
+                raise
+
+
+def safe_unlock_and_close(fd: int) -> None:
+    """
+    Release an fcntl lock and close the file descriptor, while handling fcntl IO errors.
+    """
+    try:
+        fcntl.flock(fd, fcntl.LOCK_UN)
+    except OSError as e:
+        if e.errno != errno.EIO:
+            raise
+        # Sometimes Ceph produces EIO. We don't need to retry then because
+        # we're going to close the FD and after that the file can't remain
+        # locked by us.
+    os.close(fd)
+
+
 class ExceptionalThread(threading.Thread):
     """
     A thread whose join() method re-raises exceptions raised during run(). While join() is
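For orientation, a minimal usage sketch (not part of the diff) showing how the three new helpers compose; the lock file path is hypothetical:

    import errno
    import os

    from toil.lib.threading import (
        ensure_filesystem_lockable,
        safe_lock,
        safe_unlock_and_close,
    )

    lock_path = "/tmp/example.lock"  # hypothetical path

    # Refuse early if the directory is on a filesystem (e.g. Ceph) where
    # fcntl locks are known to misbehave.
    ensure_filesystem_lockable(os.path.dirname(lock_path))

    fd = os.open(lock_path, os.O_CREAT | os.O_WRONLY)
    try:
        # Exclusive and nonblocking: raises OSError(EACCES/EAGAIN) if held.
        safe_lock(fd, block=False)
    except OSError as e:
        os.close(fd)
        if e.errno in (errno.EACCES, errno.EAGAIN):
            print("lock is already held by another process")
        else:
            raise
    else:
        # ... critical section ...
        safe_unlock_and_close(fd)  # unlock and close in one EIO-tolerant step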
@@ -65,6 +207,7 @@ class ExceptionalThread(threading.Thread):
     AssertionError
 
     """
+
     exc_info = None
 
     def run(self) -> None:
@@ -103,21 +246,23 @@ def cpu_count() -> int:
     :rtype: int
     """
 
-    cached = getattr(cpu_count,
+    cached = getattr(cpu_count, "result", None)
     if cached is not None:
         # We already got a CPU count.
         return cast(int, cached)
 
     # Get the fallback answer of all the CPUs on the machine
-    psutil_cpu_count =
+    psutil_cpu_count = psutil.cpu_count(logical=True)
     if psutil_cpu_count is None:
-        logger.debug(
+        logger.debug("Could not retrieve the logical CPU count.")
 
-    total_machine_size: Union[float, int] =
-
+    total_machine_size: Union[float, int] = (
+        psutil_cpu_count if psutil_cpu_count is not None else float("inf")
+    )
+    logger.debug("Total machine size: %s core(s)", total_machine_size)
 
     # cgroups may limit the size
-    cgroup_size: Union[float, int] = float(
+    cgroup_size: Union[float, int] = float("inf")
 
     try:
         # See if we can fetch these and use them
@@ -125,13 +270,13 @@ def cpu_count() -> int:
         period: Optional[int] = None
 
         # CGroups v1 keeps quota and period separate
-        CGROUP1_QUOTA_FILE =
-        CGROUP1_PERIOD_FILE =
+        CGROUP1_QUOTA_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
+        CGROUP1_PERIOD_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
         # CGroups v2 keeps both in one file, space-separated, quota first
-        CGROUP2_COMBINED_FILE =
+        CGROUP2_COMBINED_FILE = "/sys/fs/cgroup/cpu.max"
 
         if os.path.exists(CGROUP1_QUOTA_FILE) and os.path.exists(CGROUP1_PERIOD_FILE):
-            logger.debug(
+            logger.debug("CPU quota and period available from cgroups v1")
             with open(CGROUP1_QUOTA_FILE) as stream:
                 # Read the quota
                 quota = int(stream.read())
@@ -140,56 +285,58 @@ def cpu_count() -> int:
                 # Read the period in which we are allowed to burn the quota
                 period = int(stream.read())
         elif os.path.exists(CGROUP2_COMBINED_FILE):
-            logger.debug(
+            logger.debug("CPU quota and period available from cgroups v2")
             with open(CGROUP2_COMBINED_FILE) as stream:
                 # Read the quota and the period together
-                quota, period = (int(part) for part in stream.read().split(
+                quota, period = (int(part) for part in stream.read().split(" "))
         else:
-            logger.debug(
+            logger.debug("CPU quota/period not available from cgroups v1 or cgroups v2")
 
         if quota is not None and period is not None:
             # We got a quota and a period.
-            logger.debug(
+            logger.debug("CPU quota: %d period: %d", quota, period)
 
             if quota == -1:
                 # But the quota can be -1 for unset.
                 # Assume we can use the whole machine.
-                cgroup_size = float(
+                cgroup_size = float("inf")
             else:
                 # The thread count is how many multiples of a wall clock period we
                 # can burn in that period.
-                cgroup_size = int(math.ceil(float(quota)/float(period)))
+                cgroup_size = int(math.ceil(float(quota) / float(period)))
 
-        logger.debug(
+        logger.debug("Control group size in cores: %s", cgroup_size)
     except:
         # We can't actually read these cgroup fields. Maybe we are a mac or something.
-        logger.debug(
+        logger.debug("Could not inspect cgroup: %s", traceback.format_exc())
 
     # CPU affinity may limit the size
-    affinity_size: Union[float, int] = float(
-    if hasattr(os,
+    affinity_size: Union[float, int] = float("inf")
+    if hasattr(os, "sched_getaffinity"):
         try:
-            logger.debug(
+            logger.debug("CPU affinity available")
             affinity_size = len(os.sched_getaffinity(0))
-            logger.debug(
+            logger.debug("CPU affinity is restricted to %d cores", affinity_size)
         except:
-
-            logger.debug(
+            # We can't actually read this even though it exists.
+            logger.debug(
+                "Could not inspect scheduling affinity: %s", traceback.format_exc()
+            )
     else:
-        logger.debug(
+        logger.debug("CPU affinity not available")
 
-    limit: Union[float, int] = float(
+    limit: Union[float, int] = float("inf")
     # Apply all the limits to take the smallest
    limit = min(limit, total_machine_size)
     limit = min(limit, cgroup_size)
     limit = min(limit, affinity_size)
-    if limit < 1 or limit == float(
+    if limit < 1 or limit == float("inf"):
         # Fall back to 1 if we can't get a size
         limit = 1
     result = int(limit)
-    logger.debug(
+    logger.debug("cpu_count: %s", result)
     # Make sure to remember it for the next call
-    setattr(cpu_count,
+    setattr(cpu_count, "result", result)
     return result
 
 
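The quota/period arithmetic above rounds up: a container allowed 150000 microseconds of CPU per 100000 microsecond period gets ceil(1.5) = 2 cores. A worked sketch of the cgroups v2 parse, with the file contents inlined as an assumed example:

    import math

    cpu_max = "150000 100000"  # assumed example contents of /sys/fs/cgroup/cpu.max
    quota, period = (int(part) for part in cpu_max.split(" "))
    cores = int(math.ceil(float(quota) / float(period)))
    assert cores == 2  # 1.5 periods' worth of quota rounds up to 2 cores

Note that the real file holds the literal string "max" when unlimited; int() then raises, and the broad try/except above logs the failure and falls back to no cgroup limit.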
@@ -211,7 +358,8 @@ def cpu_count() -> int:
 current_process_name_lock = threading.Lock()
 # And a global dict from work directory to name in that work directory.
 # We also have a file descriptor per work directory but it is just leaked.
-current_process_name_for:
+current_process_name_for: dict[str, str] = {}
+
 
 def collect_process_name_garbage() -> None:
     """
@@ -235,6 +383,7 @@ def collect_process_name_garbage() -> None:
     for base_dir in missing:
         del current_process_name_for[base_dir]
 
+
 def destroy_all_process_names() -> None:
     """
     Delete all our process name files because our process is going away.
@@ -249,9 +398,11 @@ def destroy_all_process_names() -> None:
     for base_dir, name in current_process_name_for.items():
         robust_rmtree(os.path.join(base_dir, name))
 
+
 # Run the cleanup at exit
 atexit.register(destroy_all_process_names)
 
+
 def get_process_name(base_dir: str) -> str:
     """
     Return the name of the current process. Like a PID but visible between
@@ -280,10 +431,16 @@ def get_process_name(base_dir: str) -> str:
 
     # Lock the file. The lock will automatically go away if our process does.
     try:
-
+        safe_lock(nameFD, block=False)
     except OSError as e:
-
-
+        if e.errno in (errno.EACCES, errno.EAGAIN):
+            # Someone else locked it even though they should not have.
+            raise RuntimeError(
+                f"Could not lock process name file {nameFileName}"
+            ) from e
+        else:
+            # Something else is wrong
+            raise
 
     # Save the basename
     current_process_name_for[base_dir] = os.path.basename(nameFileName)
@@ -321,20 +478,24 @@ def process_name_exists(base_dir: str, name: str) -> bool:
         # If the file is gone, the process can't exist.
         return False
 
-
     nameFD = None
     try:
         try:
             # Otherwise see if we can lock it shared, for which we need an FD, but
             # only for reading.
             nameFD = os.open(nameFileName, os.O_RDONLY)
-            fcntl.lockf(nameFD, fcntl.LOCK_SH | fcntl.LOCK_NB)
         except FileNotFoundError as e:
             # File has vanished
             return False
+        try:
+            safe_lock(nameFD, block=False, shared=True)
         except OSError as e:
-
-
+            if e.errno in (errno.EACCES, errno.EAGAIN):
+                # Could not lock. Process is alive.
+                return True
+            else:
+                # Something else went wrong
+                raise
         else:
             # Could lock. Process is dead.
             # Remove the file. We race to be the first to do so.
@@ -342,8 +503,8 @@ def process_name_exists(base_dir: str, name: str) -> bool:
                 os.remove(nameFileName)
             except:
                 pass
-
-
+            safe_unlock_and_close(nameFD)
+            nameFD = None
             # Report process death
             return False
     finally:
@@ -353,6 +514,7 @@ def process_name_exists(base_dir: str, name: str) -> bool:
         except:
             pass
 
+
 # Similar to the process naming system above, we define a global mutex system
 # for critical sections, based just around file locks.
 @contextmanager
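The liveness test in the preceding hunks relies on a standard flock property: a shared lock can only be taken when no process holds the file exclusively, so a failed nonblocking shared lock means the owner is still alive. A standalone sketch of the pattern (the name file path is hypothetical and is assumed to be held exclusively by its owning process while that process lives):

    import errno
    import fcntl
    import os

    def holder_is_alive(name_file: str) -> bool:
        fd = os.open(name_file, os.O_RDONLY)
        try:
            try:
                # Nonblocking shared lock: only succeeds with no exclusive holder.
                fcntl.flock(fd, fcntl.LOCK_SH | fcntl.LOCK_NB)
            except OSError as e:
                if e.errno in (errno.EACCES, errno.EAGAIN):
                    return True  # the exclusive holder is still alive
                raise
            return False  # the lock was free, so the owner died
        finally:
            os.close(fd)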
@@ -372,21 +534,34 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
     if not os.path.isdir(base_dir):
         raise RuntimeError(f"Directory {base_dir} for mutex does not exist")
 
+    # TODO: We don't know what CLI option controls where to put this mutex, so
+    # we aren't very helpful if the location is bad.
+    ensure_filesystem_lockable(
+        base_dir, hint=f"Specify a different place to put the {mutex} mutex."
+    )
+
     # Define a filename
-    lock_filename = os.path.join(base_dir,
+    lock_filename = os.path.join(base_dir, "toil-mutex-" + mutex)
 
-    logger.debug(
+    logger.debug("PID %d acquiring mutex %s", os.getpid(), lock_filename)
 
     # We can't just create/open and lock a file, because when we clean up
     # there's a race where someone can open the file before we unlink it and
     # get a lock on the deleted file.
 
+    error_backoff = 1
+
     while True:
         # Try to create the file, ignoring if it exists or not.
         fd = os.open(lock_filename, os.O_CREAT | os.O_WRONLY)
 
-
-
+        try:
+            # Wait until we can exclusively lock it, handling error retry.
+            safe_lock(fd)
+        except:
+            # Something went wrong
+            os.close(fd)
+            raise
 
         # Holding the lock, make sure we are looking at the same file on disk still.
         try:
@@ -394,16 +569,14 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
             fd_stats = os.fstat(fd)
         except OSError as e:
             if e.errno == errno.ESTALE:
-                # The file handle has gone stale, because somebody removed the
+                # The file handle has gone stale, because somebody removed the
+                # file.
                 # Try again.
-
-                    fcntl.lockf(fd, fcntl.LOCK_UN)
-                except OSError:
-                    pass
-                os.close(fd)
+                safe_unlock_and_close(fd)
                 continue
             else:
                 # Something else broke
+                os.close(fd)
                 raise
 
         try:
@@ -412,13 +585,16 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
         except FileNotFoundError:
             path_stats = None
 
-        if
+        if (
+            path_stats is None
+            or fd_stats.st_dev != path_stats.st_dev
+            or fd_stats.st_ino != path_stats.st_ino
+        ):
             # The file we have a lock on is not the file linked to the name (if
             # any). This usually happens, because before someone releases a
             # lock, they delete the file. Go back and contend again. TODO: This
             # allows a lot of queue jumping on our mutex.
-
-            os.close(fd)
+            safe_unlock_and_close(fd)
             continue
         else:
             # We have a lock on the file that the name points to. Since we
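The stat comparison above is the heart of the unlink-safe locking scheme: after acquiring the lock, the holder checks that the inode it locked is still the inode the path names. A condensed sketch of that check (the helper name is illustrative, not from the diff):

    import os

    def lock_is_current(fd: int, path: str) -> bool:
        fd_stats = os.fstat(fd)
        try:
            path_stats = os.stat(path)
        except FileNotFoundError:
            return False  # the file was unlinked: our lock is on a dead inode
        return (
            fd_stats.st_dev == path_stats.st_dev
            and fd_stats.st_ino == path_stats.st_ino
        )

A False result means another process finished its critical section and deleted the file while we waited, so the caller must close up and contend again on a fresh file.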
@@ -428,12 +604,12 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
 
     try:
         # When we have it, do the thing we are protecting.
-        logger.debug(
+        logger.debug("PID %d now holds mutex %s", os.getpid(), lock_filename)
         yield
     finally:
         # Delete it while we still own it, so we can't delete it from out from
         # under someone else who thinks they are holding it.
-        logger.debug(
+        logger.debug("PID %d releasing mutex %s", os.getpid(), lock_filename)
 
         # We have had observations in the wild of the lock file not exisiting
         # when we go to unlink it, causing a crash on mutex release. See
@@ -451,23 +627,36 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
 
         # Check to make sure it still looks locked before we unlink.
         if path_stats is None:
-            logger.error(
-
-
+            logger.error(
+                "PID %d had mutex %s disappear while locked! Mutex system is not working!",
+                os.getpid(),
+                lock_filename,
+            )
+        elif (
+            fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino
+        ):
+            logger.error(
+                "PID %d had mutex %s get replaced while locked! Mutex system is not working!",
+                os.getpid(),
+                lock_filename,
+            )
 
         if path_stats is not None:
             try:
                 # Unlink the file
                 os.unlink(lock_filename)
             except FileNotFoundError:
-                logger.error(
+                logger.error(
+                    "PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!",
+                    os.getpid(),
+                    lock_filename,
+                )
 
         # Note that we are unlinking it and then unlocking it; a lot of people
         # might have opened it before we unlinked it and will wake up when they
         # get the worthless lock on the now-unlinked file. We have to do some
         # stat gymnastics above to work around this.
-
-        os.close(fd)
+        safe_unlock_and_close(fd)
 
 
 class LastProcessStandingArena:
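Since global_mutex is a context manager, callers serialize a cross-process critical section like this (the coordination directory and mutex name are hypothetical):

    import os

    from toil.lib.threading import global_mutex

    base_dir = "/tmp/toil-coordination"  # hypothetical coordination directory
    os.makedirs(base_dir, exist_ok=True)

    with global_mutex(base_dir, "cache-setup"):
        # Only one process contending on this (base_dir, name) pair runs this
        # block at a time; the lock file is unlinked before the lock is released.
        ...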
@@ -503,13 +692,13 @@ class LastProcessStandingArena:
 
         # We need a mutex name to allow only one process to be entering or
         # leaving at a time.
-        self.mutex = name +
+        self.mutex = name + "-arena-lock"
 
         # We need a way to track who is actually in, and who was in but died.
         # So everybody gets a locked file (again).
         # TODO: deduplicate with the similar logic for process names, and also
         # deferred functions.
-        self.lockfileDir = os.path.join(base_dir, name +
+        self.lockfileDir = os.path.join(base_dir, name + "-arena-members")
 
         # When we enter the arena, we fill this in with the FD of the locked
         # file that represents our presence.
@@ -525,7 +714,7 @@ class LastProcessStandingArena:
         You may not enter the arena again before leaving it.
         """
 
-        logger.debug(
+        logger.debug("Joining arena %s", self.lockfileDir)
 
         # Make sure we're not in it already.
         if self.lockfileName is not None or self.lockfileFD is not None:
@@ -540,19 +729,23 @@ class LastProcessStandingArena:
         except FileExistsError:
             pass
         except Exception as e:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Could not make lock file directory " + self.lockfileDir
+            ) from e
 
         # Make ourselves a file in it and lock it to prove we are alive.
         try:
-            self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)
+            self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
         except Exception as e:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Could not make lock file in " + self.lockfileDir
+            ) from e
         # Nobody can see it yet, so lock it right away
-
+        safe_lock(self.lockfileFD)  # type: ignore
 
         # Now we're properly in, so release the global mutex
 
-        logger.debug(
+        logger.debug("Now in arena %s", self.lockfileDir)
 
     def leave(self) -> Iterator[bool]:
         """
@@ -572,7 +765,7 @@ class LastProcessStandingArena:
         if self.lockfileName is None or self.lockfileFD is None:
             raise RuntimeError("This process is not in the arena.")
 
-        logger.debug(
+        logger.debug("Leaving arena %s", self.lockfileDir)
 
         with global_mutex(self.base_dir, self.mutex):
             # Now nobody else should also be trying to join or leave.
@@ -583,8 +776,7 @@ class LastProcessStandingArena:
             except:
                 pass
             self.lockfileName = None
-
-            os.close(self.lockfileFD)
+            safe_unlock_and_close(self.lockfileFD)
             self.lockfileFD = None
 
             for item in os.listdir(self.lockfileDir):
@@ -598,32 +790,42 @@ class LastProcessStandingArena:
                     continue
 
                 try:
-
+                    safe_lock(fd, block=False, shared=True)
                 except OSError as e:
-
-
+                    if e.errno in (errno.EACCES, errno.EAGAIN):
+                        # Could not lock. It's alive!
+                        break
+                    else:
+                        # Something else is wrong
+                        os.close(fd)
+                        raise
                 else:
                     # Could lock. Process is dead.
                     try:
                         os.remove(full_path)
                     except:
                         pass
-
+                    safe_unlock_and_close(fd)
                 # Continue with the loop normally.
             else:
                 # Nothing alive was found. Nobody will come in while we hold
                 # the global mutex, so we are the Last Process Standing.
-                logger.debug(
+                logger.debug(
+                    "We are the Last Process Standing in arena %s", self.lockfileDir
+                )
                 yield True
 
                 try:
                     # Delete the arena directory so as to leave nothing behind.
                     os.rmdir(self.lockfileDir)
                 except:
-                    logger.warning(
-
+                    logger.warning(
+                        "Could not clean up arena %s completely: %s",
+                        self.lockfileDir,
+                        traceback.format_exc(),
+                    )
 
         # Now we're done, whether we were the last one or not, and can
         # release the mutex.
 
-        logger.debug(
+        logger.debug("Now out of arena %s", self.lockfileDir)