toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/lib/threading.py
CHANGED
@@ -21,14 +21,18 @@ import fcntl
 import logging
 import math
 import os
+import platform
+import subprocess
 import sys
 import tempfile
 import threading
+import time
 import traceback
+from collections.abc import Iterator
 from contextlib import contextmanager
-from typing import
+from typing import Optional, Union, cast
 
-import psutil
+import psutil
 
 from toil.lib.exceptions import raise_
 from toil.lib.io import robust_rmtree
@@ -36,12 +40,150 @@ from toil.lib.io import robust_rmtree
 logger = logging.getLogger(__name__)
 
 
+def ensure_filesystem_lockable(
+    path: str, timeout: float = 30, hint: Optional[str] = None
+) -> None:
+    """
+    Make sure that the filesystem used at the given path is one where locks are safe to use.
+
+    File locks are not safe to use on Ceph. See
+    <https://github.com/DataBiosphere/toil/issues/4972>.
+
+    Raises an exception if the filesystem is detected as one where using locks
+    is known to trigger bugs in the filesystem implementation. Also raises an
+    exception if the given path does not exist, or if attempting to determine
+    the filesystem type takes more than the timeout in seconds.
+
+    If the filesystem type cannot be determined, does nothing.
+
+    :param hint: Extra text to include in an error, if raised, telling the user
+        how to change the offending path.
+    """
+
+    if not os.path.exists(path):
+        # Raise a normal-looking FileNotFoundError. See <https://stackoverflow.com/a/36077407>
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
+
+    if platform.system() == "Linux":
+        # We know how to find the filesystem here.
+
+        try:
+            # Start a child process to stat the path. See <https://unix.stackexchange.com/a/402236>.
+            # We really should call statfs but no bindings for it are in PyPI.
+            completed = subprocess.run(
+                ["stat", "-f", "-c", "%T", path],
+                check=True,
+                capture_output=True,
+                timeout=timeout,
+            )
+        except subprocess.TimeoutExpired as e:
+            # The subprocess itself is Too Slow
+            raise RuntimeError(
+                f"Polling filesystem type at {path} took more than {timeout} seconds; is your filesystem working?"
+            ) from e
+        except subprocess.CalledProcessError as e:
+            # Stat didn't work. Maybe we don't have the right version of stat installed?
+            logger.warning(
+                "Could not determine filesystem type at %s because of: %s",
+                path,
+                e.stderr.decode("utf-8", errors="replace").strip(),
+            )
+            # If we don't know the filesystem type, keep going anyway.
+            return
+
+        filesystem_type = completed.stdout.decode("utf-8", errors="replace").strip()
+
+        if filesystem_type == "ceph":
+            # Ceph is known to deadlock the MDS and break the parent directory when locking.
+            message = [
+                f"Refusing to use {path} because file locks are known to break {filesystem_type} filesystems."
+            ]
+            if hint:
+                # Hint the user how to fix this.
+                message.append(hint)
+            raise RuntimeError(" ".join(message))
+        else:
+            # Other filesystem types are fine (even though NFS is sometimes
+            # flaky with regard to locks actually locking anything).
+            logger.debug(
+                "Detected that %s has lockable filesystem type: %s",
+                path,
+                filesystem_type,
+            )
+
+    # Other platforms (Mac) probably aren't mounting Ceph and also don't
+    # usually use the same stat binary implementation.
+
+
+def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None:
+    """
+    Get an fcntl lock, while retrying on IO errors.
+
+    Raises OSError with EACCES or EAGAIN when a nonblocking lock is not
+    immediately available.
+    """
+
+    # Set up retry logic. TODO: Use @retry instead.
+    error_backoff = 1
+    MAX_ERROR_TRIES = 10
+    error_tries = 0
+
+    while True:
+        try:
+            # Wait until we can exclusively lock it.
+            lock_mode = (fcntl.LOCK_SH if shared else fcntl.LOCK_EX) | (
+                fcntl.LOCK_NB if not block else 0
+            )
+            fcntl.flock(fd, lock_mode)
+            return
+        except OSError as e:
+            if e.errno in (errno.EACCES, errno.EAGAIN):
+                # Nonblocking lock not available.
+                raise
+            elif e.errno == errno.EIO:
+                # Sometimes Ceph produces IO errors when talking to lock files.
+                # Back off and try again.
+                # TODO: Should we eventually give up if the disk really is
+                # broken? If so we should use the retry system.
+                if error_tries < MAX_ERROR_TRIES:
+                    logger.error(
+                        "IO error talking to lock file. Retrying after %s seconds.",
+                        error_backoff,
+                    )
+                    time.sleep(error_backoff)
+                    error_backoff = min(60, error_backoff * 2)
+                    error_tries += 1
+                    continue
+                else:
+                    logger.critical(
+                        "Too many IO errors talking to lock file. If using Ceph, check for MDS deadlocks. See <https://tracker.ceph.com/issues/62123>."
+                    )
+                    raise
+            else:
+                raise
+
+
+def safe_unlock_and_close(fd: int) -> None:
+    """
+    Release an fcntl lock and close the file descriptor, while handling fcntl IO errors.
+    """
+    try:
+        fcntl.flock(fd, fcntl.LOCK_UN)
+    except OSError as e:
+        if e.errno != errno.EIO:
+            raise
+        # Sometimes Ceph produces EIO. We don't need to retry then because
+        # we're going to close the FD and after that the file can't remain
+        # locked by us.
+    os.close(fd)
+
+
 class ExceptionalThread(threading.Thread):
     """
     A thread whose join() method re-raises exceptions raised during run(). While join() is
     idempotent, the exception is only during the first invocation of join() that successfully
     joined the thread. If join() times out, no exception will be re reraised even though an
-    exception might already have
+    exception might already have occurred in run().
 
     When subclassing this thread, override tryRun() instead of run().
 
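Note: the three helpers added above are meant to be used together: vet the directory once with ensure_filesystem_lockable, then take and release fcntl locks through the EIO-retrying wrappers. A minimal sketch, assuming toil.lib.threading from toil 8.0.0; the lock directory and file name are hypothetical:

```python
# Sketch only: exercising the new helpers added in this diff.
import errno
import os

from toil.lib.threading import (
    ensure_filesystem_lockable,
    safe_lock,
    safe_unlock_and_close,
)

lock_dir = "/tmp/toil-locks"  # hypothetical; must not be on Ceph
os.makedirs(lock_dir, exist_ok=True)

# Raises RuntimeError if lock_dir is on a filesystem (Ceph) where fcntl
# locks are known to misbehave; does nothing if the type can't be determined.
ensure_filesystem_lockable(lock_dir, hint="Use a different lock directory.")

fd = os.open(os.path.join(lock_dir, "example.lock"), os.O_CREAT | os.O_WRONLY)
try:
    # Nonblocking attempt: raises OSError(EACCES/EAGAIN) if already held.
    safe_lock(fd, block=False)
except OSError as e:
    if e.errno in (errno.EACCES, errno.EAGAIN):
        print("lock is busy")
        os.close(fd)
    else:
        raise
else:
    print("lock acquired")  # ... critical section ...
    safe_unlock_and_close(fd)  # unlocks (tolerating Ceph EIO) and closes
```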
@@ -65,6 +207,7 @@ class ExceptionalThread(threading.Thread):
     AssertionError
 
     """
+
    exc_info = None
 
    def run(self) -> None:
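Note: the hunk above is whitespace-only, but the surrounding class docstring describes the behavior most Toil threads rely on. A sketch of the documented join() behavior, assuming this module is importable; the failing function is hypothetical:

```python
# Sketch only: join() re-raises an exception that happened in run().
from toil.lib.threading import ExceptionalThread

def boom() -> None:
    raise ValueError("failure inside the thread")

t = ExceptionalThread(target=boom)
t.start()
try:
    t.join()  # re-raises the ValueError here, unlike threading.Thread.join()
except ValueError as e:
    print(f"caught in parent: {e}")
```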
@@ -103,18 +246,23 @@ def cpu_count() -> int:
     :rtype: int
     """
 
-    cached = getattr(cpu_count,
+    cached = getattr(cpu_count, "result", None)
     if cached is not None:
         # We already got a CPU count.
         return cast(int, cached)
 
     # Get the fallback answer of all the CPUs on the machine
-
+    psutil_cpu_count = psutil.cpu_count(logical=True)
+    if psutil_cpu_count is None:
+        logger.debug("Could not retrieve the logical CPU count.")
 
-
+    total_machine_size: Union[float, int] = (
+        psutil_cpu_count if psutil_cpu_count is not None else float("inf")
+    )
+    logger.debug("Total machine size: %s core(s)", total_machine_size)
 
     # cgroups may limit the size
-    cgroup_size: Union[float, int] = float(
+    cgroup_size: Union[float, int] = float("inf")
 
     try:
         # See if we can fetch these and use them
@@ -122,13 +270,13 @@ def cpu_count() -> int:
         period: Optional[int] = None
 
         # CGroups v1 keeps quota and period separate
-        CGROUP1_QUOTA_FILE =
-        CGROUP1_PERIOD_FILE =
+        CGROUP1_QUOTA_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
+        CGROUP1_PERIOD_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
         # CGroups v2 keeps both in one file, space-separated, quota first
-        CGROUP2_COMBINED_FILE =
+        CGROUP2_COMBINED_FILE = "/sys/fs/cgroup/cpu.max"
 
         if os.path.exists(CGROUP1_QUOTA_FILE) and os.path.exists(CGROUP1_PERIOD_FILE):
-            logger.debug(
+            logger.debug("CPU quota and period available from cgroups v1")
             with open(CGROUP1_QUOTA_FILE) as stream:
                 # Read the quota
                 quota = int(stream.read())
@@ -137,49 +285,58 @@ def cpu_count() -> int:
             # Read the period in which we are allowed to burn the quota
             period = int(stream.read())
         elif os.path.exists(CGROUP2_COMBINED_FILE):
-            logger.debug(
+            logger.debug("CPU quota and period available from cgroups v2")
             with open(CGROUP2_COMBINED_FILE) as stream:
                 # Read the quota and the period together
-                quota, period = (int(part) for part in stream.read().split(
+                quota, period = (int(part) for part in stream.read().split(" "))
         else:
-            logger.debug(
+            logger.debug("CPU quota/period not available from cgroups v1 or cgroups v2")
 
         if quota is not None and period is not None:
             # We got a quota and a period.
-            logger.debug(
+            logger.debug("CPU quota: %d period: %d", quota, period)
 
             if quota == -1:
                 # But the quota can be -1 for unset.
                 # Assume we can use the whole machine.
-
-
-
-
-
+                cgroup_size = float("inf")
+            else:
+                # The thread count is how many multiples of a wall clock period we
+                # can burn in that period.
+                cgroup_size = int(math.ceil(float(quota) / float(period)))
 
-            logger.debug(
+            logger.debug("Control group size in cores: %s", cgroup_size)
     except:
         # We can't actually read these cgroup fields. Maybe we are a mac or something.
-        logger.debug(
+        logger.debug("Could not inspect cgroup: %s", traceback.format_exc())
 
     # CPU affinity may limit the size
-    affinity_size: Union[float, int] = float(
-    if hasattr(os,
+    affinity_size: Union[float, int] = float("inf")
+    if hasattr(os, "sched_getaffinity"):
         try:
-            logger.debug(
+            logger.debug("CPU affinity available")
             affinity_size = len(os.sched_getaffinity(0))
-            logger.debug(
+            logger.debug("CPU affinity is restricted to %d cores", affinity_size)
         except:
-
-            logger.debug(
+            # We can't actually read this even though it exists.
+            logger.debug(
+                "Could not inspect scheduling affinity: %s", traceback.format_exc()
+            )
     else:
-        logger.debug(
-
-
-
-
+        logger.debug("CPU affinity not available")
+
+    limit: Union[float, int] = float("inf")
+    # Apply all the limits to take the smallest
+    limit = min(limit, total_machine_size)
+    limit = min(limit, cgroup_size)
+    limit = min(limit, affinity_size)
+    if limit < 1 or limit == float("inf"):
+        # Fall back to 1 if we can't get a size
+        limit = 1
+    result = int(limit)
+    logger.debug("cpu_count: %s", result)
     # Make sure to remember it for the next call
-    setattr(cpu_count,
+    setattr(cpu_count, "result", result)
     return result
 
 
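Note: after this change cpu_count() takes the smallest of the logical CPU total from psutil, the cgroup v1/v2 CPU quota rounded up as ceil(quota / period), and the sched_getaffinity mask, falls back to 1, and caches the result on the function object. A worked sketch of that arithmetic, with hypothetical container numbers:

```python
# Sketch only: the limit logic above, for a container with
# "cpu.max: 150000 100000" (150000us quota per 100000us period)
# on an 8-core machine with an unrestricted affinity mask.
import math

total_machine_size = 8                                # psutil.cpu_count(logical=True)
cgroup_size = int(math.ceil(float(150000) / 100000))  # -> 2
affinity_size = 8                                     # len(os.sched_getaffinity(0))

limit = min(float("inf"), total_machine_size, cgroup_size, affinity_size)
print(int(limit))  # -> 2, matching what cpu_count() would return and cache
```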
@@ -201,7 +358,8 @@ def cpu_count() -> int:
 current_process_name_lock = threading.Lock()
 # And a global dict from work directory to name in that work directory.
 # We also have a file descriptor per work directory but it is just leaked.
-current_process_name_for:
+current_process_name_for: dict[str, str] = {}
+
 
 def collect_process_name_garbage() -> None:
     """
@@ -225,6 +383,7 @@ def collect_process_name_garbage() -> None:
     for base_dir in missing:
         del current_process_name_for[base_dir]
 
+
 def destroy_all_process_names() -> None:
     """
     Delete all our process name files because our process is going away.
@@ -239,9 +398,11 @@ def destroy_all_process_names() -> None:
     for base_dir, name in current_process_name_for.items():
         robust_rmtree(os.path.join(base_dir, name))
 
+
 # Run the cleanup at exit
 atexit.register(destroy_all_process_names)
 
+
 def get_process_name(base_dir: str) -> str:
     """
     Return the name of the current process. Like a PID but visible between
@@ -270,10 +431,16 @@ def get_process_name(base_dir: str) -> str:
 
     # Lock the file. The lock will automatically go away if our process does.
     try:
-
+        safe_lock(nameFD, block=False)
     except OSError as e:
-
-
+        if e.errno in (errno.EACCES, errno.EAGAIN):
+            # Someone else locked it even though they should not have.
+            raise RuntimeError(
+                f"Could not lock process name file {nameFileName}"
+            ) from e
+        else:
+            # Something else is wrong
+            raise
 
     # Save the basename
     current_process_name_for[base_dir] = os.path.basename(nameFileName)
@@ -311,20 +478,24 @@ def process_name_exists(base_dir: str, name: str) -> bool:
         # If the file is gone, the process can't exist.
         return False
 
-
     nameFD = None
     try:
         try:
             # Otherwise see if we can lock it shared, for which we need an FD, but
             # only for reading.
             nameFD = os.open(nameFileName, os.O_RDONLY)
-            fcntl.lockf(nameFD, fcntl.LOCK_SH | fcntl.LOCK_NB)
         except FileNotFoundError as e:
             # File has vanished
            return False
+        try:
+            safe_lock(nameFD, block=False, shared=True)
         except OSError as e:
-
-
+            if e.errno in (errno.EACCES, errno.EAGAIN):
+                # Could not lock. Process is alive.
+                return True
+            else:
+                # Something else went wrong
+                raise
         else:
             # Could lock. Process is dead.
             # Remove the file. We race to be the first to do so.
@@ -332,8 +503,8 @@ def process_name_exists(base_dir: str, name: str) -> bool:
                 os.remove(nameFileName)
             except:
                 pass
-
-
+            safe_unlock_and_close(nameFD)
+            nameFD = None
             # Report process death
             return False
     finally:
@@ -343,6 +514,7 @@ def process_name_exists(base_dir: str, name: str) -> bool:
         except:
             pass
 
+
 # Similar to the process naming system above, we define a global mutex system
 # for critical sections, based just around file locks.
 @contextmanager
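Note: both signatures here appear in the hunk headers above; each process gets a name file held under an exclusive lock while it lives, and liveness checks now go through safe_lock. A sketch, with a hypothetical base directory:

```python
# Sketch only: per-process name files, exclusively locked while the owner
# lives. The base directory is hypothetical.
import os

from toil.lib.threading import get_process_name, process_name_exists

base_dir = "/tmp/toil-coordination"
os.makedirs(base_dir, exist_ok=True)

me = get_process_name(base_dir)  # makes a name file and holds an exclusive lock
# A separate open of the file cannot take a shared nonblocking lock while we
# are alive, so this reports True:
print(process_name_exists(base_dir, me))
```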
@@ -362,21 +534,34 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
     if not os.path.isdir(base_dir):
         raise RuntimeError(f"Directory {base_dir} for mutex does not exist")
 
+    # TODO: We don't know what CLI option controls where to put this mutex, so
+    # we aren't very helpful if the location is bad.
+    ensure_filesystem_lockable(
+        base_dir, hint=f"Specify a different place to put the {mutex} mutex."
+    )
+
     # Define a filename
-    lock_filename = os.path.join(base_dir,
+    lock_filename = os.path.join(base_dir, "toil-mutex-" + mutex)
 
-    logger.debug(
+    logger.debug("PID %d acquiring mutex %s", os.getpid(), lock_filename)
 
     # We can't just create/open and lock a file, because when we clean up
     # there's a race where someone can open the file before we unlink it and
     # get a lock on the deleted file.
 
+    error_backoff = 1
+
     while True:
         # Try to create the file, ignoring if it exists or not.
         fd = os.open(lock_filename, os.O_CREAT | os.O_WRONLY)
 
-
-
+        try:
+            # Wait until we can exclusively lock it, handling error retry.
+            safe_lock(fd)
+        except:
+            # Something went wrong
+            os.close(fd)
+            raise
 
         # Holding the lock, make sure we are looking at the same file on disk still.
         try:
@@ -384,16 +569,14 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
             fd_stats = os.fstat(fd)
         except OSError as e:
             if e.errno == errno.ESTALE:
-                # The file handle has gone stale, because somebody removed the
+                # The file handle has gone stale, because somebody removed the
+                # file.
                 # Try again.
-
-                    fcntl.lockf(fd, fcntl.LOCK_UN)
-                except OSError:
-                    pass
-                os.close(fd)
+                safe_unlock_and_close(fd)
                 continue
             else:
                 # Something else broke
+                os.close(fd)
                 raise
 
         try:
@@ -402,13 +585,16 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
         except FileNotFoundError:
             path_stats = None
 
-        if
+        if (
+            path_stats is None
+            or fd_stats.st_dev != path_stats.st_dev
+            or fd_stats.st_ino != path_stats.st_ino
+        ):
             # The file we have a lock on is not the file linked to the name (if
             # any). This usually happens, because before someone releases a
             # lock, they delete the file. Go back and contend again. TODO: This
             # allows a lot of queue jumping on our mutex.
-
-            os.close(fd)
+            safe_unlock_and_close(fd)
             continue
         else:
             # We have a lock on the file that the name points to. Since we
@@ -418,12 +604,12 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
 
     try:
         # When we have it, do the thing we are protecting.
-        logger.debug(
+        logger.debug("PID %d now holds mutex %s", os.getpid(), lock_filename)
         yield
     finally:
         # Delete it while we still own it, so we can't delete it from out from
         # under someone else who thinks they are holding it.
-        logger.debug(
+        logger.debug("PID %d releasing mutex %s", os.getpid(), lock_filename)
 
         # We have had observations in the wild of the lock file not exisiting
         # when we go to unlink it, causing a crash on mutex release. See
@@ -441,23 +627,36 @@ def global_mutex(base_dir: str, mutex: str) -> Iterator[None]:
 
         # Check to make sure it still looks locked before we unlink.
         if path_stats is None:
-            logger.error(
-
-
+            logger.error(
+                "PID %d had mutex %s disappear while locked! Mutex system is not working!",
+                os.getpid(),
+                lock_filename,
+            )
+        elif (
+            fd_stats.st_dev != path_stats.st_dev or fd_stats.st_ino != path_stats.st_ino
+        ):
+            logger.error(
+                "PID %d had mutex %s get replaced while locked! Mutex system is not working!",
+                os.getpid(),
+                lock_filename,
+            )
 
         if path_stats is not None:
             try:
                 # Unlink the file
                 os.unlink(lock_filename)
             except FileNotFoundError:
-                logger.error(
+                logger.error(
+                    "PID %d had mutex %s disappear between stat and unlink while unlocking! Mutex system is not working!",
+                    os.getpid(),
+                    lock_filename,
+                )
 
         # Note that we are unlinking it and then unlocking it; a lot of people
         # might have opened it before we unlinked it and will wake up when they
         # get the worthless lock on the now-unlinked file. We have to do some
         # stat gymnastics above to work around this.
-
-        os.close(fd)
+        safe_unlock_and_close(fd)
 
 
 class LastProcessStandingArena:
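Note: global_mutex remains a context manager over a lock file named "toil-mutex-" plus the mutex name; what changed is that it now refuses Ceph directories up front and funnels locking through the retrying helpers. A sketch with hypothetical paths:

```python
# Sketch only: a cross-process critical section. Paths are hypothetical.
import os

from toil.lib.threading import global_mutex

base_dir = "/tmp/toil-coordination"  # must exist and not be on Ceph
os.makedirs(base_dir, exist_ok=True)

with global_mutex(base_dir, "example-section"):
    # At most one process on this host is inside this block at a time;
    # the lock file is /tmp/toil-coordination/toil-mutex-example-section.
    print(f"PID {os.getpid()} holds the mutex")
```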
@@ -493,13 +692,13 @@ class LastProcessStandingArena:
 
         # We need a mutex name to allow only one process to be entering or
         # leaving at a time.
-        self.mutex = name +
+        self.mutex = name + "-arena-lock"
 
         # We need a way to track who is actually in, and who was in but died.
         # So everybody gets a locked file (again).
         # TODO: deduplicate with the similar logic for process names, and also
         # deferred functions.
-        self.lockfileDir = os.path.join(base_dir, name +
+        self.lockfileDir = os.path.join(base_dir, name + "-arena-members")
 
         # When we enter the arena, we fill this in with the FD of the locked
         # file that represents our presence.
@@ -515,7 +714,7 @@ class LastProcessStandingArena:
         You may not enter the arena again before leaving it.
         """
 
-        logger.debug(
+        logger.debug("Joining arena %s", self.lockfileDir)
 
         # Make sure we're not in it already.
         if self.lockfileName is not None or self.lockfileFD is not None:
@@ -529,15 +728,24 @@ class LastProcessStandingArena:
             os.mkdir(self.lockfileDir)
         except FileExistsError:
             pass
+        except Exception as e:
+            raise RuntimeError(
+                "Could not make lock file directory " + self.lockfileDir
+            ) from e
 
         # Make ourselves a file in it and lock it to prove we are alive.
-
+        try:
+            self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
+        except Exception as e:
+            raise RuntimeError(
+                "Could not make lock file in " + self.lockfileDir
+            ) from e
         # Nobody can see it yet, so lock it right away
-
+        safe_lock(self.lockfileFD)  # type: ignore
 
         # Now we're properly in, so release the global mutex
 
-        logger.debug(
+        logger.debug("Now in arena %s", self.lockfileDir)
 
     def leave(self) -> Iterator[bool]:
         """
@@ -557,7 +765,7 @@ class LastProcessStandingArena:
         if self.lockfileName is None or self.lockfileFD is None:
             raise RuntimeError("This process is not in the arena.")
 
-        logger.debug(
+        logger.debug("Leaving arena %s", self.lockfileDir)
 
         with global_mutex(self.base_dir, self.mutex):
             # Now nobody else should also be trying to join or leave.
@@ -568,8 +776,7 @@ class LastProcessStandingArena:
             except:
                 pass
             self.lockfileName = None
-
-            os.close(self.lockfileFD)
+            safe_unlock_and_close(self.lockfileFD)
             self.lockfileFD = None
 
             for item in os.listdir(self.lockfileDir):
@@ -583,32 +790,42 @@ class LastProcessStandingArena:
                 continue
 
             try:
-
+                safe_lock(fd, block=False, shared=True)
             except OSError as e:
-
-
+                if e.errno in (errno.EACCES, errno.EAGAIN):
+                    # Could not lock. It's alive!
+                    break
+                else:
+                    # Something else is wrong
+                    os.close(fd)
+                    raise
             else:
                 # Could lock. Process is dead.
                 try:
                     os.remove(full_path)
                 except:
                     pass
-
+                safe_unlock_and_close(fd)
             # Continue with the loop normally.
         else:
             # Nothing alive was found. Nobody will come in while we hold
             # the global mutex, so we are the Last Process Standing.
-            logger.debug(
+            logger.debug(
+                "We are the Last Process Standing in arena %s", self.lockfileDir
+            )
             yield True
 
             try:
                 # Delete the arena directory so as to leave nothing behind.
                 os.rmdir(self.lockfileDir)
             except:
-                logger.warning(
-
+                logger.warning(
+                    "Could not clean up arena %s completely: %s",
+                    self.lockfileDir,
+                    traceback.format_exc(),
+                )
 
         # Now we're done, whether we were the last one or not, and can
         # release the mutex.
 
-        logger.debug(
+        logger.debug("Now out of arena %s", self.lockfileDir)
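Note: leave() is a generator (see the Iterator[bool] signature above) that yields True for exactly one departing process, the last one whose member lock file is still held by a live process. A sketch, assuming the arena is joined with an enter() method as in released Toil (that method name does not appear in this diff) and hypothetical paths:

```python
# Sketch only: last-process-standing cleanup. enter() is assumed from the
# released Toil API; it is not shown in this diff. Paths are hypothetical.
import os

from toil.lib.threading import LastProcessStandingArena

base_dir = "/tmp/toil-coordination"
os.makedirs(base_dir, exist_ok=True)

arena = LastProcessStandingArena(base_dir, "example")
arena.enter()  # joins under the "example-arena-lock" global mutex
try:
    pass  # ... work that multiple processes share ...
finally:
    for last_standing in arena.leave():
        if last_standing:
            print("No other members alive; safe to clean shared state")
```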