toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -12,26 +12,24 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import errno
|
|
15
|
-
import fcntl
|
|
16
15
|
import logging
|
|
17
16
|
import os
|
|
18
17
|
import tempfile
|
|
19
18
|
from collections import defaultdict
|
|
19
|
+
from collections.abc import Generator, Iterator
|
|
20
20
|
from contextlib import contextmanager
|
|
21
|
-
from typing import (
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
cast,
|
|
34
|
-
overload)
|
|
21
|
+
from typing import (
|
|
22
|
+
IO,
|
|
23
|
+
Any,
|
|
24
|
+
Callable,
|
|
25
|
+
ContextManager,
|
|
26
|
+
DefaultDict,
|
|
27
|
+
Literal,
|
|
28
|
+
Optional,
|
|
29
|
+
Union,
|
|
30
|
+
cast,
|
|
31
|
+
overload,
|
|
32
|
+
)
|
|
35
33
|
|
|
36
34
|
import dill
|
|
37
35
|
|
|
@@ -43,7 +41,12 @@ from toil.jobStores.abstractJobStore import AbstractJobStore
|
|
|
43
41
|
from toil.lib.compatibility import deprecated
|
|
44
42
|
from toil.lib.io import make_public_dir, robust_rmtree
|
|
45
43
|
from toil.lib.retry import ErrorCondition, retry
|
|
46
|
-
from toil.lib.threading import
|
|
44
|
+
from toil.lib.threading import (
|
|
45
|
+
get_process_name,
|
|
46
|
+
process_name_exists,
|
|
47
|
+
safe_lock,
|
|
48
|
+
safe_unlock_and_close,
|
|
49
|
+
)
|
|
47
50
|
|
|
48
51
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
49
52
|
|
|
@@ -59,7 +62,7 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
59
62
|
super().__init__(jobStore, jobDesc, file_store_dir, waitForPreviousCommit)
|
|
60
63
|
# This will be defined in the `open` method.
|
|
61
64
|
self.jobStateFile: Optional[str] = None
|
|
62
|
-
self.localFileMap: DefaultDict[str,
|
|
65
|
+
self.localFileMap: DefaultDict[str, list[str]] = defaultdict(list)
|
|
63
66
|
|
|
64
67
|
self.check_for_state_corruption()
|
|
65
68
|
|
|
@@ -78,10 +81,10 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
78
81
|
|
|
79
82
|
if coordination_dir and not os.path.exists(coordination_dir):
|
|
80
83
|
raise RuntimeError(
|
|
81
|
-
f
|
|
82
|
-
f
|
|
83
|
-
f
|
|
84
|
-
f
|
|
84
|
+
f"The Toil coordination directory at {coordination_dir} "
|
|
85
|
+
f"was removed while the workflow was running! Please provide a "
|
|
86
|
+
f"TOIL_COORDINATION_DIR or --coordinationDir at a location that "
|
|
87
|
+
f"is safe from automated cleanup during the workflow run."
|
|
85
88
|
)
|
|
86
89
|
|
|
87
90
|
def check_for_state_corruption(self) -> None:
|
|
@@ -93,22 +96,26 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
93
96
|
|
|
94
97
|
if self.jobStateFile and not os.path.exists(self.jobStateFile):
|
|
95
98
|
raise RuntimeError(
|
|
96
|
-
f
|
|
97
|
-
f
|
|
98
|
-
f
|
|
99
|
-
f
|
|
99
|
+
f"The job state file {self.jobStateFile} "
|
|
100
|
+
f"was removed while the workflow was running! Please provide a "
|
|
101
|
+
f"TOIL_COORDINATION_DIR or --coordinationDir at a location that "
|
|
102
|
+
f"is safe from automated cleanup during the workflow run."
|
|
100
103
|
)
|
|
101
104
|
|
|
102
105
|
@contextmanager
|
|
103
106
|
def open(self, job: Job) -> Generator[None, None, None]:
|
|
104
107
|
startingDir = os.getcwd()
|
|
105
|
-
self.localTempDir: str = make_public_dir(
|
|
108
|
+
self.localTempDir: str = make_public_dir(
|
|
109
|
+
self.localTempDir, suggested_name="job"
|
|
110
|
+
)
|
|
106
111
|
self._removeDeadJobs(self.coordination_dir)
|
|
107
112
|
self.jobStateFile = self._createJobStateFile()
|
|
108
113
|
self.check_for_state_corruption()
|
|
109
114
|
freeSpace, diskSize = getFileSystemSize(self.localTempDir)
|
|
110
115
|
if freeSpace <= 0.1 * diskSize:
|
|
111
|
-
logger.warning(
|
|
116
|
+
logger.warning(
|
|
117
|
+
f"Starting job {self.jobName} with less than 10%% of disk space remaining."
|
|
118
|
+
)
|
|
112
119
|
try:
|
|
113
120
|
os.chdir(self.localTempDir)
|
|
114
121
|
with super().open(job):
|
|
@@ -120,10 +127,12 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
120
127
|
try:
|
|
121
128
|
os.remove(self.jobStateFile)
|
|
122
129
|
except FileNotFoundError:
|
|
123
|
-
logger.exception(
|
|
124
|
-
|
|
130
|
+
logger.exception(
|
|
131
|
+
"Job state file %s has gone missing unexpectedly; some cleanup for failed jobs may be getting skipped!",
|
|
132
|
+
self.jobStateFile,
|
|
133
|
+
)
|
|
125
134
|
|
|
126
|
-
def writeGlobalFile(self, localFileName: str, cleanup: bool=False) -> FileID:
|
|
135
|
+
def writeGlobalFile(self, localFileName: str, cleanup: bool = False) -> FileID:
|
|
127
136
|
absLocalFileName = self._resolveAbsoluteLocalPath(localFileName)
|
|
128
137
|
creatorID = str(self.jobDesc.jobStoreID)
|
|
129
138
|
fileStoreID = self.jobStore.write_file(absLocalFileName, creatorID, cleanup)
|
|
@@ -133,12 +142,20 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
133
142
|
self.localFileMap[fileStoreID].append(absLocalFileName)
|
|
134
143
|
return FileID.forPath(fileStoreID, absLocalFileName)
|
|
135
144
|
|
|
136
|
-
def readGlobalFile(
|
|
137
|
-
|
|
145
|
+
def readGlobalFile(
|
|
146
|
+
self,
|
|
147
|
+
fileStoreID: str,
|
|
148
|
+
userPath: Optional[str] = None,
|
|
149
|
+
cache: bool = True,
|
|
150
|
+
mutable: bool = False,
|
|
151
|
+
symlink: bool = False,
|
|
152
|
+
) -> str:
|
|
138
153
|
if userPath is not None:
|
|
139
154
|
localFilePath = self._resolveAbsoluteLocalPath(userPath)
|
|
140
155
|
if os.path.exists(localFilePath):
|
|
141
|
-
raise RuntimeError(
|
|
156
|
+
raise RuntimeError(
|
|
157
|
+
" File %s " % localFilePath + " exists. Cannot Overwrite."
|
|
158
|
+
)
|
|
142
159
|
else:
|
|
143
160
|
localFilePath = self.getLocalTempFileName()
|
|
144
161
|
|
|
@@ -153,25 +170,30 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
153
170
|
fileStoreID: str,
|
|
154
171
|
encoding: Literal[None] = None,
|
|
155
172
|
errors: Optional[str] = None,
|
|
156
|
-
) -> ContextManager[IO[bytes]]:
|
|
157
|
-
...
|
|
173
|
+
) -> ContextManager[IO[bytes]]: ...
|
|
158
174
|
|
|
159
175
|
@overload
|
|
160
176
|
def readGlobalFileStream(
|
|
161
177
|
self, fileStoreID: str, encoding: str, errors: Optional[str] = None
|
|
162
|
-
) -> ContextManager[IO[str]]:
|
|
163
|
-
...
|
|
178
|
+
) -> ContextManager[IO[str]]: ...
|
|
164
179
|
|
|
165
180
|
# TODO: This seems to hit https://github.com/python/mypy/issues/11373
|
|
166
181
|
# But that is supposedly fixed.
|
|
167
182
|
|
|
168
|
-
@contextmanager
|
|
169
|
-
def readGlobalFileStream(
|
|
170
|
-
|
|
183
|
+
@contextmanager # type: ignore
|
|
184
|
+
def readGlobalFileStream(
|
|
185
|
+
self,
|
|
186
|
+
fileStoreID: str,
|
|
187
|
+
encoding: Optional[str] = None,
|
|
188
|
+
errors: Optional[str] = None,
|
|
189
|
+
) -> Iterator[Union[IO[bytes], IO[str]]]:
|
|
190
|
+
with self.jobStore.read_file_stream(
|
|
191
|
+
fileStoreID, encoding=encoding, errors=errors
|
|
192
|
+
) as f:
|
|
171
193
|
self.logAccess(fileStoreID)
|
|
172
194
|
yield f
|
|
173
195
|
|
|
174
|
-
@deprecated(new_function_name=
|
|
196
|
+
@deprecated(new_function_name="export_file")
|
|
175
197
|
def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None:
|
|
176
198
|
return self.export_file(jobStoreFileID, dstUrl)
|
|
177
199
|
|
|
@@ -182,7 +204,9 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
182
204
|
try:
|
|
183
205
|
localFilePaths = self.localFileMap.pop(fileStoreID)
|
|
184
206
|
except KeyError:
|
|
185
|
-
raise OSError(
|
|
207
|
+
raise OSError(
|
|
208
|
+
errno.ENOENT, "Attempting to delete local copies of a file with none"
|
|
209
|
+
)
|
|
186
210
|
else:
|
|
187
211
|
for localFilePath in localFilePaths:
|
|
188
212
|
os.remove(localFilePath)
|
|
@@ -233,7 +257,6 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
233
257
|
self._terminateEvent.set()
|
|
234
258
|
raise
|
|
235
259
|
|
|
236
|
-
|
|
237
260
|
def __del__(self) -> None:
|
|
238
261
|
"""
|
|
239
262
|
Cleanup function that is run when destroying the class instance. Nothing to do since there
|
|
@@ -241,7 +264,9 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
241
264
|
"""
|
|
242
265
|
|
|
243
266
|
@classmethod
|
|
244
|
-
def _removeDeadJobs(
|
|
267
|
+
def _removeDeadJobs(
|
|
268
|
+
cls, coordination_dir: str, batchSystemShutdown: bool = False
|
|
269
|
+
) -> None:
|
|
245
270
|
"""
|
|
246
271
|
Look at the state of all jobs registered in the individual job state files, and handle them
|
|
247
272
|
(clean up the disk)
|
|
@@ -254,26 +279,34 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
254
279
|
cls.check_for_coordination_corruption(coordination_dir)
|
|
255
280
|
|
|
256
281
|
for jobState in cls._getAllJobStates(coordination_dir):
|
|
257
|
-
if not process_name_exists(coordination_dir, jobState[
|
|
282
|
+
if not process_name_exists(coordination_dir, jobState["jobProcessName"]):
|
|
258
283
|
# We need to have a race to pick someone to clean up.
|
|
259
284
|
|
|
260
285
|
try:
|
|
261
|
-
# Open the directory
|
|
262
|
-
|
|
286
|
+
# Open the directory.
|
|
287
|
+
# We can't open a directory for write, only for read.
|
|
288
|
+
dirFD = os.open(jobState["jobDir"], os.O_RDONLY)
|
|
263
289
|
except FileNotFoundError:
|
|
264
290
|
# The cleanup has happened and we can't contest for it
|
|
265
291
|
continue
|
|
266
292
|
|
|
267
293
|
try:
|
|
268
|
-
# Try and lock it
|
|
269
|
-
|
|
294
|
+
# Try and lock it non-blocking
|
|
295
|
+
safe_lock(dirFD, block=False)
|
|
270
296
|
except OSError as e:
|
|
271
|
-
# We lost the race. Someone else is alive and has it locked.
|
|
272
297
|
os.close(dirFD)
|
|
298
|
+
if e.errno not in (errno.EACCES, errno.EAGAIN):
|
|
299
|
+
# Something went wrong
|
|
300
|
+
raise
|
|
301
|
+
# Otherwise, we lost the race. Someone else is alive and
|
|
302
|
+
# has it locked. So loop around again.
|
|
273
303
|
else:
|
|
274
304
|
# We got it
|
|
275
|
-
logger.warning(
|
|
276
|
-
|
|
305
|
+
logger.warning(
|
|
306
|
+
"Detected that job (%s) prematurely terminated. Fixing the "
|
|
307
|
+
"state of the job on disk.",
|
|
308
|
+
jobState["jobName"],
|
|
309
|
+
)
|
|
277
310
|
|
|
278
311
|
try:
|
|
279
312
|
if not batchSystemShutdown:
|
|
@@ -281,13 +314,12 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
281
314
|
# Delete the old work directory if it still exists. Do this only during
|
|
282
315
|
# the life of the program and dont' do it during the batch system
|
|
283
316
|
# cleanup. Leave that to the batch system cleanup code.
|
|
284
|
-
robust_rmtree(jobState[
|
|
317
|
+
robust_rmtree(jobState["jobDir"])
|
|
285
318
|
finally:
|
|
286
|
-
|
|
287
|
-
os.close(dirFD)
|
|
319
|
+
safe_unlock_and_close(dirFD)
|
|
288
320
|
|
|
289
321
|
@classmethod
|
|
290
|
-
def _getAllJobStates(cls, coordination_dir: str) -> Iterator[
|
|
322
|
+
def _getAllJobStates(cls, coordination_dir: str) -> Iterator[dict[str, str]]:
|
|
291
323
|
"""
|
|
292
324
|
Generator function that deserializes and yields the job state for every job on the node,
|
|
293
325
|
one at a time.
|
|
@@ -304,7 +336,7 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
304
336
|
# So we need to work in bytes.
|
|
305
337
|
for entry in os.scandir(os.fsencode(coordination_dir)):
|
|
306
338
|
# For each job state file in the coordination directory
|
|
307
|
-
if entry.name.endswith(b
|
|
339
|
+
if entry.name.endswith(b".jobState"):
|
|
308
340
|
# This is the state of a job
|
|
309
341
|
jobStateFiles.append(os.fsdecode(entry.path))
|
|
310
342
|
|
|
@@ -317,7 +349,7 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
317
349
|
# job finished & deleted its jobState file since the jobState files were discovered
|
|
318
350
|
continue
|
|
319
351
|
elif e.errno == 5:
|
|
320
|
-
# This is a OSError: [Errno 5] Input/output error (jobStatefile seems to disappear
|
|
352
|
+
# This is a OSError: [Errno 5] Input/output error (jobStatefile seems to disappear
|
|
321
353
|
# on network file system sometimes)
|
|
322
354
|
continue
|
|
323
355
|
else:
|
|
@@ -325,16 +357,16 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
325
357
|
|
|
326
358
|
@staticmethod
|
|
327
359
|
# Retry on any OSError except FileNotFoundError, which we throw immediately
|
|
328
|
-
@retry(
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
error=FileNotFoundError,
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
def _readJobState(jobStateFileName: str) ->
|
|
335
|
-
with open(jobStateFileName,
|
|
360
|
+
@retry(
|
|
361
|
+
errors=[
|
|
362
|
+
OSError,
|
|
363
|
+
ErrorCondition(error=FileNotFoundError, retry_on_this_condition=False),
|
|
364
|
+
]
|
|
365
|
+
)
|
|
366
|
+
def _readJobState(jobStateFileName: str) -> dict[str, str]:
|
|
367
|
+
with open(jobStateFileName, "rb") as fH:
|
|
336
368
|
state = dill.load(fH)
|
|
337
|
-
return cast(
|
|
369
|
+
return cast(dict[str, str], state)
|
|
338
370
|
|
|
339
371
|
def _createJobStateFile(self) -> str:
|
|
340
372
|
"""
|
|
@@ -347,20 +379,26 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
347
379
|
:rtype: str
|
|
348
380
|
"""
|
|
349
381
|
self.check_for_state_corruption()
|
|
350
|
-
jobState = {
|
|
351
|
-
|
|
352
|
-
|
|
382
|
+
jobState = {
|
|
383
|
+
"jobProcessName": get_process_name(self.coordination_dir),
|
|
384
|
+
"jobName": self.jobName,
|
|
385
|
+
"jobDir": self.localTempDir,
|
|
386
|
+
}
|
|
353
387
|
try:
|
|
354
|
-
(fd, jobStateFile) = tempfile.mkstemp(
|
|
388
|
+
(fd, jobStateFile) = tempfile.mkstemp(
|
|
389
|
+
suffix=".jobState.tmp", dir=self.coordination_dir
|
|
390
|
+
)
|
|
355
391
|
except Exception as e:
|
|
356
|
-
raise RuntimeError(
|
|
357
|
-
|
|
392
|
+
raise RuntimeError(
|
|
393
|
+
"Could not make state file in " + self.coordination_dir
|
|
394
|
+
) from e
|
|
395
|
+
with open(fd, "wb") as fH:
|
|
358
396
|
# Write data
|
|
359
397
|
dill.dump(jobState, fH)
|
|
360
398
|
# Drop suffix
|
|
361
|
-
jobStateFile = jobStateFile[
|
|
399
|
+
jobStateFile = jobStateFile[: -len(".tmp")]
|
|
362
400
|
# Put in place
|
|
363
|
-
os.rename(jobStateFile +
|
|
401
|
+
os.rename(jobStateFile + ".tmp", jobStateFile)
|
|
364
402
|
return jobStateFile
|
|
365
403
|
|
|
366
404
|
@classmethod
|