toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -12,39 +12,41 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import errno
|
|
15
|
-
import fcntl
|
|
16
15
|
import logging
|
|
17
16
|
import os
|
|
18
17
|
import tempfile
|
|
19
18
|
from collections import defaultdict
|
|
19
|
+
from collections.abc import Generator, Iterator
|
|
20
20
|
from contextlib import contextmanager
|
|
21
|
-
from typing import (
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
cast,
|
|
34
|
-
overload)
|
|
21
|
+
from typing import (
|
|
22
|
+
IO,
|
|
23
|
+
Any,
|
|
24
|
+
Callable,
|
|
25
|
+
ContextManager,
|
|
26
|
+
DefaultDict,
|
|
27
|
+
Literal,
|
|
28
|
+
Optional,
|
|
29
|
+
Union,
|
|
30
|
+
cast,
|
|
31
|
+
overload,
|
|
32
|
+
)
|
|
35
33
|
|
|
36
34
|
import dill
|
|
37
35
|
|
|
38
|
-
from toil.common import
|
|
36
|
+
from toil.common import getFileSystemSize
|
|
39
37
|
from toil.fileStores import FileID
|
|
40
38
|
from toil.fileStores.abstractFileStore import AbstractFileStore
|
|
41
39
|
from toil.job import Job, JobDescription
|
|
42
40
|
from toil.jobStores.abstractJobStore import AbstractJobStore
|
|
43
41
|
from toil.lib.compatibility import deprecated
|
|
44
|
-
from toil.lib.conversions import bytes2human
|
|
45
42
|
from toil.lib.io import make_public_dir, robust_rmtree
|
|
46
43
|
from toil.lib.retry import ErrorCondition, retry
|
|
47
|
-
from toil.lib.threading import
|
|
44
|
+
from toil.lib.threading import (
|
|
45
|
+
get_process_name,
|
|
46
|
+
process_name_exists,
|
|
47
|
+
safe_lock,
|
|
48
|
+
safe_unlock_and_close,
|
|
49
|
+
)
|
|
48
50
|
|
|
49
51
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
50
52
|
|
|
@@ -60,7 +62,7 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
60
62
|
super().__init__(jobStore, jobDesc, file_store_dir, waitForPreviousCommit)
|
|
61
63
|
# This will be defined in the `open` method.
|
|
62
64
|
self.jobStateFile: Optional[str] = None
|
|
63
|
-
self.localFileMap: DefaultDict[str,
|
|
65
|
+
self.localFileMap: DefaultDict[str, list[str]] = defaultdict(list)
|
|
64
66
|
|
|
65
67
|
self.check_for_state_corruption()
|
|
66
68
|
|
|
@@ -79,10 +81,10 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
79
81
|
|
|
80
82
|
if coordination_dir and not os.path.exists(coordination_dir):
|
|
81
83
|
raise RuntimeError(
|
|
82
|
-
f
|
|
83
|
-
f
|
|
84
|
-
f
|
|
85
|
-
f
|
|
84
|
+
f"The Toil coordination directory at {coordination_dir} "
|
|
85
|
+
f"was removed while the workflow was running! Please provide a "
|
|
86
|
+
f"TOIL_COORDINATION_DIR or --coordinationDir at a location that "
|
|
87
|
+
f"is safe from automated cleanup during the workflow run."
|
|
86
88
|
)
|
|
87
89
|
|
|
88
90
|
def check_for_state_corruption(self) -> None:
|
|
@@ -94,48 +96,43 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
94
96
|
|
|
95
97
|
if self.jobStateFile and not os.path.exists(self.jobStateFile):
|
|
96
98
|
raise RuntimeError(
|
|
97
|
-
f
|
|
98
|
-
f
|
|
99
|
-
f
|
|
100
|
-
f
|
|
99
|
+
f"The job state file {self.jobStateFile} "
|
|
100
|
+
f"was removed while the workflow was running! Please provide a "
|
|
101
|
+
f"TOIL_COORDINATION_DIR or --coordinationDir at a location that "
|
|
102
|
+
f"is safe from automated cleanup during the workflow run."
|
|
101
103
|
)
|
|
102
104
|
|
|
103
105
|
@contextmanager
|
|
104
106
|
def open(self, job: Job) -> Generator[None, None, None]:
|
|
105
|
-
jobReqs = job.disk
|
|
106
107
|
startingDir = os.getcwd()
|
|
107
|
-
self.localTempDir: str = make_public_dir(
|
|
108
|
+
self.localTempDir: str = make_public_dir(
|
|
109
|
+
self.localTempDir, suggested_name="job"
|
|
110
|
+
)
|
|
108
111
|
self._removeDeadJobs(self.coordination_dir)
|
|
109
112
|
self.jobStateFile = self._createJobStateFile()
|
|
110
113
|
self.check_for_state_corruption()
|
|
111
114
|
freeSpace, diskSize = getFileSystemSize(self.localTempDir)
|
|
112
115
|
if freeSpace <= 0.1 * diskSize:
|
|
113
|
-
logger.warning(
|
|
116
|
+
logger.warning(
|
|
117
|
+
f"Starting job {self.jobName} with less than 10%% of disk space remaining."
|
|
118
|
+
)
|
|
114
119
|
try:
|
|
115
120
|
os.chdir(self.localTempDir)
|
|
116
121
|
with super().open(job):
|
|
117
122
|
yield
|
|
118
123
|
finally:
|
|
119
|
-
disk = getDirSizeRecursively(self.localTempDir)
|
|
120
|
-
percent = float(disk) / jobReqs * 100 if jobReqs > 0 else 0.0
|
|
121
|
-
disk_usage = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
|
|
122
|
-
f"{bytes2human(jobReqs)}B [{jobReqs}B] requested).")
|
|
123
|
-
if disk > jobReqs:
|
|
124
|
-
self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
|
|
125
|
-
f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
|
|
126
|
-
level=logging.WARNING)
|
|
127
|
-
else:
|
|
128
|
-
self.log_to_leader(disk_usage, level=logging.DEBUG)
|
|
129
124
|
os.chdir(startingDir)
|
|
130
125
|
# Finally delete the job from the worker
|
|
131
126
|
self.check_for_state_corruption()
|
|
132
127
|
try:
|
|
133
128
|
os.remove(self.jobStateFile)
|
|
134
129
|
except FileNotFoundError:
|
|
135
|
-
logger.exception(
|
|
136
|
-
|
|
130
|
+
logger.exception(
|
|
131
|
+
"Job state file %s has gone missing unexpectedly; some cleanup for failed jobs may be getting skipped!",
|
|
132
|
+
self.jobStateFile,
|
|
133
|
+
)
|
|
137
134
|
|
|
138
|
-
def writeGlobalFile(self, localFileName: str, cleanup: bool=False) -> FileID:
|
|
135
|
+
def writeGlobalFile(self, localFileName: str, cleanup: bool = False) -> FileID:
|
|
139
136
|
absLocalFileName = self._resolveAbsoluteLocalPath(localFileName)
|
|
140
137
|
creatorID = str(self.jobDesc.jobStoreID)
|
|
141
138
|
fileStoreID = self.jobStore.write_file(absLocalFileName, creatorID, cleanup)
|
|
@@ -145,12 +142,20 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
145
142
|
self.localFileMap[fileStoreID].append(absLocalFileName)
|
|
146
143
|
return FileID.forPath(fileStoreID, absLocalFileName)
|
|
147
144
|
|
|
148
|
-
def readGlobalFile(
|
|
149
|
-
|
|
145
|
+
def readGlobalFile(
|
|
146
|
+
self,
|
|
147
|
+
fileStoreID: str,
|
|
148
|
+
userPath: Optional[str] = None,
|
|
149
|
+
cache: bool = True,
|
|
150
|
+
mutable: bool = False,
|
|
151
|
+
symlink: bool = False,
|
|
152
|
+
) -> str:
|
|
150
153
|
if userPath is not None:
|
|
151
154
|
localFilePath = self._resolveAbsoluteLocalPath(userPath)
|
|
152
155
|
if os.path.exists(localFilePath):
|
|
153
|
-
raise RuntimeError(
|
|
156
|
+
raise RuntimeError(
|
|
157
|
+
" File %s " % localFilePath + " exists. Cannot Overwrite."
|
|
158
|
+
)
|
|
154
159
|
else:
|
|
155
160
|
localFilePath = self.getLocalTempFileName()
|
|
156
161
|
|
|
@@ -165,25 +170,30 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
165
170
|
fileStoreID: str,
|
|
166
171
|
encoding: Literal[None] = None,
|
|
167
172
|
errors: Optional[str] = None,
|
|
168
|
-
) -> ContextManager[IO[bytes]]:
|
|
169
|
-
...
|
|
173
|
+
) -> ContextManager[IO[bytes]]: ...
|
|
170
174
|
|
|
171
175
|
@overload
|
|
172
176
|
def readGlobalFileStream(
|
|
173
177
|
self, fileStoreID: str, encoding: str, errors: Optional[str] = None
|
|
174
|
-
) -> ContextManager[IO[str]]:
|
|
175
|
-
...
|
|
178
|
+
) -> ContextManager[IO[str]]: ...
|
|
176
179
|
|
|
177
180
|
# TODO: This seems to hit https://github.com/python/mypy/issues/11373
|
|
178
181
|
# But that is supposedly fixed.
|
|
179
182
|
|
|
180
|
-
@contextmanager
|
|
181
|
-
def readGlobalFileStream(
|
|
182
|
-
|
|
183
|
+
@contextmanager # type: ignore
|
|
184
|
+
def readGlobalFileStream(
|
|
185
|
+
self,
|
|
186
|
+
fileStoreID: str,
|
|
187
|
+
encoding: Optional[str] = None,
|
|
188
|
+
errors: Optional[str] = None,
|
|
189
|
+
) -> Iterator[Union[IO[bytes], IO[str]]]:
|
|
190
|
+
with self.jobStore.read_file_stream(
|
|
191
|
+
fileStoreID, encoding=encoding, errors=errors
|
|
192
|
+
) as f:
|
|
183
193
|
self.logAccess(fileStoreID)
|
|
184
194
|
yield f
|
|
185
195
|
|
|
186
|
-
@deprecated(new_function_name=
|
|
196
|
+
@deprecated(new_function_name="export_file")
|
|
187
197
|
def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None:
|
|
188
198
|
return self.export_file(jobStoreFileID, dstUrl)
|
|
189
199
|
|
|
@@ -194,7 +204,9 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
194
204
|
try:
|
|
195
205
|
localFilePaths = self.localFileMap.pop(fileStoreID)
|
|
196
206
|
except KeyError:
|
|
197
|
-
raise OSError(
|
|
207
|
+
raise OSError(
|
|
208
|
+
errno.ENOENT, "Attempting to delete local copies of a file with none"
|
|
209
|
+
)
|
|
198
210
|
else:
|
|
199
211
|
for localFilePath in localFilePaths:
|
|
200
212
|
os.remove(localFilePath)
|
|
@@ -245,7 +257,6 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
245
257
|
self._terminateEvent.set()
|
|
246
258
|
raise
|
|
247
259
|
|
|
248
|
-
|
|
249
260
|
def __del__(self) -> None:
|
|
250
261
|
"""
|
|
251
262
|
Cleanup function that is run when destroying the class instance. Nothing to do since there
|
|
@@ -253,7 +264,9 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
253
264
|
"""
|
|
254
265
|
|
|
255
266
|
@classmethod
|
|
256
|
-
def _removeDeadJobs(
|
|
267
|
+
def _removeDeadJobs(
|
|
268
|
+
cls, coordination_dir: str, batchSystemShutdown: bool = False
|
|
269
|
+
) -> None:
|
|
257
270
|
"""
|
|
258
271
|
Look at the state of all jobs registered in the individual job state files, and handle them
|
|
259
272
|
(clean up the disk)
|
|
@@ -266,26 +279,34 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
266
279
|
cls.check_for_coordination_corruption(coordination_dir)
|
|
267
280
|
|
|
268
281
|
for jobState in cls._getAllJobStates(coordination_dir):
|
|
269
|
-
if not process_name_exists(coordination_dir, jobState[
|
|
282
|
+
if not process_name_exists(coordination_dir, jobState["jobProcessName"]):
|
|
270
283
|
# We need to have a race to pick someone to clean up.
|
|
271
284
|
|
|
272
285
|
try:
|
|
273
|
-
# Open the directory
|
|
274
|
-
|
|
286
|
+
# Open the directory.
|
|
287
|
+
# We can't open a directory for write, only for read.
|
|
288
|
+
dirFD = os.open(jobState["jobDir"], os.O_RDONLY)
|
|
275
289
|
except FileNotFoundError:
|
|
276
290
|
# The cleanup has happened and we can't contest for it
|
|
277
291
|
continue
|
|
278
292
|
|
|
279
293
|
try:
|
|
280
|
-
# Try and lock it
|
|
281
|
-
|
|
294
|
+
# Try and lock it non-blocking
|
|
295
|
+
safe_lock(dirFD, block=False)
|
|
282
296
|
except OSError as e:
|
|
283
|
-
# We lost the race. Someone else is alive and has it locked.
|
|
284
297
|
os.close(dirFD)
|
|
298
|
+
if e.errno not in (errno.EACCES, errno.EAGAIN):
|
|
299
|
+
# Something went wrong
|
|
300
|
+
raise
|
|
301
|
+
# Otherwise, we lost the race. Someone else is alive and
|
|
302
|
+
# has it locked. So loop around again.
|
|
285
303
|
else:
|
|
286
304
|
# We got it
|
|
287
|
-
logger.warning(
|
|
288
|
-
|
|
305
|
+
logger.warning(
|
|
306
|
+
"Detected that job (%s) prematurely terminated. Fixing the "
|
|
307
|
+
"state of the job on disk.",
|
|
308
|
+
jobState["jobName"],
|
|
309
|
+
)
|
|
289
310
|
|
|
290
311
|
try:
|
|
291
312
|
if not batchSystemShutdown:
|
|
@@ -293,13 +314,12 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
293
314
|
# Delete the old work directory if it still exists. Do this only during
|
|
294
315
|
# the life of the program and dont' do it during the batch system
|
|
295
316
|
# cleanup. Leave that to the batch system cleanup code.
|
|
296
|
-
robust_rmtree(jobState[
|
|
317
|
+
robust_rmtree(jobState["jobDir"])
|
|
297
318
|
finally:
|
|
298
|
-
|
|
299
|
-
os.close(dirFD)
|
|
319
|
+
safe_unlock_and_close(dirFD)
|
|
300
320
|
|
|
301
321
|
@classmethod
|
|
302
|
-
def _getAllJobStates(cls, coordination_dir: str) -> Iterator[
|
|
322
|
+
def _getAllJobStates(cls, coordination_dir: str) -> Iterator[dict[str, str]]:
|
|
303
323
|
"""
|
|
304
324
|
Generator function that deserializes and yields the job state for every job on the node,
|
|
305
325
|
one at a time.
|
|
@@ -316,7 +336,7 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
316
336
|
# So we need to work in bytes.
|
|
317
337
|
for entry in os.scandir(os.fsencode(coordination_dir)):
|
|
318
338
|
# For each job state file in the coordination directory
|
|
319
|
-
if entry.name.endswith(b
|
|
339
|
+
if entry.name.endswith(b".jobState"):
|
|
320
340
|
# This is the state of a job
|
|
321
341
|
jobStateFiles.append(os.fsdecode(entry.path))
|
|
322
342
|
|
|
@@ -329,7 +349,7 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
329
349
|
# job finished & deleted its jobState file since the jobState files were discovered
|
|
330
350
|
continue
|
|
331
351
|
elif e.errno == 5:
|
|
332
|
-
# This is a OSError: [Errno 5] Input/output error (jobStatefile seems to disappear
|
|
352
|
+
# This is a OSError: [Errno 5] Input/output error (jobStatefile seems to disappear
|
|
333
353
|
# on network file system sometimes)
|
|
334
354
|
continue
|
|
335
355
|
else:
|
|
@@ -337,16 +357,16 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
337
357
|
|
|
338
358
|
@staticmethod
|
|
339
359
|
# Retry on any OSError except FileNotFoundError, which we throw immediately
|
|
340
|
-
@retry(
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
error=FileNotFoundError,
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
def _readJobState(jobStateFileName: str) ->
|
|
347
|
-
with open(jobStateFileName,
|
|
360
|
+
@retry(
|
|
361
|
+
errors=[
|
|
362
|
+
OSError,
|
|
363
|
+
ErrorCondition(error=FileNotFoundError, retry_on_this_condition=False),
|
|
364
|
+
]
|
|
365
|
+
)
|
|
366
|
+
def _readJobState(jobStateFileName: str) -> dict[str, str]:
|
|
367
|
+
with open(jobStateFileName, "rb") as fH:
|
|
348
368
|
state = dill.load(fH)
|
|
349
|
-
return cast(
|
|
369
|
+
return cast(dict[str, str], state)
|
|
350
370
|
|
|
351
371
|
def _createJobStateFile(self) -> str:
|
|
352
372
|
"""
|
|
@@ -359,17 +379,26 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
359
379
|
:rtype: str
|
|
360
380
|
"""
|
|
361
381
|
self.check_for_state_corruption()
|
|
362
|
-
jobState = {
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
382
|
+
jobState = {
|
|
383
|
+
"jobProcessName": get_process_name(self.coordination_dir),
|
|
384
|
+
"jobName": self.jobName,
|
|
385
|
+
"jobDir": self.localTempDir,
|
|
386
|
+
}
|
|
387
|
+
try:
|
|
388
|
+
(fd, jobStateFile) = tempfile.mkstemp(
|
|
389
|
+
suffix=".jobState.tmp", dir=self.coordination_dir
|
|
390
|
+
)
|
|
391
|
+
except Exception as e:
|
|
392
|
+
raise RuntimeError(
|
|
393
|
+
"Could not make state file in " + self.coordination_dir
|
|
394
|
+
) from e
|
|
395
|
+
with open(fd, "wb") as fH:
|
|
367
396
|
# Write data
|
|
368
397
|
dill.dump(jobState, fH)
|
|
369
398
|
# Drop suffix
|
|
370
|
-
jobStateFile = jobStateFile[
|
|
399
|
+
jobStateFile = jobStateFile[: -len(".tmp")]
|
|
371
400
|
# Put in place
|
|
372
|
-
os.rename(jobStateFile +
|
|
401
|
+
os.rename(jobStateFile + ".tmp", jobStateFile)
|
|
373
402
|
return jobStateFile
|
|
374
403
|
|
|
375
404
|
@classmethod
|