toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -22,28 +22,24 @@ import sqlite3
|
|
|
22
22
|
import stat
|
|
23
23
|
import threading
|
|
24
24
|
import time
|
|
25
|
+
from collections.abc import Generator, Iterator, Sequence
|
|
25
26
|
from contextlib import contextmanager
|
|
26
27
|
from tempfile import mkstemp
|
|
27
|
-
from typing import
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
Iterator,
|
|
31
|
-
Optional,
|
|
32
|
-
Sequence,
|
|
33
|
-
Tuple)
|
|
34
|
-
|
|
35
|
-
from toil.common import cacheDirName, getDirSizeRecursively, getFileSystemSize
|
|
28
|
+
from typing import Any, Callable, Optional
|
|
29
|
+
|
|
30
|
+
from toil.common import cacheDirName, getFileSystemSize
|
|
36
31
|
from toil.fileStores import FileID
|
|
37
32
|
from toil.fileStores.abstractFileStore import AbstractFileStore
|
|
38
33
|
from toil.job import Job, JobDescription
|
|
39
34
|
from toil.jobStores.abstractJobStore import AbstractJobStore
|
|
40
35
|
from toil.lib.compatibility import deprecated
|
|
41
|
-
from toil.lib.
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
36
|
+
from toil.lib.io import (
|
|
37
|
+
atomic_copy,
|
|
38
|
+
atomic_copyobj,
|
|
39
|
+
make_public_dir,
|
|
40
|
+
mkdtemp,
|
|
41
|
+
robust_rmtree,
|
|
42
|
+
)
|
|
47
43
|
from toil.lib.retry import ErrorCondition, retry
|
|
48
44
|
from toil.lib.threading import get_process_name, process_name_exists
|
|
49
45
|
|
|
@@ -67,9 +63,12 @@ class CacheUnbalancedError(CacheError):
|
|
|
67
63
|
"""
|
|
68
64
|
Raised if file store can't free enough space for caching
|
|
69
65
|
"""
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
66
|
+
|
|
67
|
+
message = (
|
|
68
|
+
"Unable unable to free enough space for caching. This error frequently arises due "
|
|
69
|
+
"to jobs using more disk than they have requested. Turn on debug logging to see "
|
|
70
|
+
"more information leading up to this error through cache usage logs."
|
|
71
|
+
)
|
|
73
72
|
|
|
74
73
|
def __init__(self):
|
|
75
74
|
super().__init__(self.message)
|
|
@@ -88,9 +87,11 @@ class IllegalDeletionCacheError(CacheError):
|
|
|
88
87
|
"""
|
|
89
88
|
|
|
90
89
|
def __init__(self, deletedFile):
|
|
91
|
-
message =
|
|
92
|
-
|
|
93
|
-
|
|
90
|
+
message = (
|
|
91
|
+
"Cache tracked file (%s) has been deleted or moved by user "
|
|
92
|
+
" without updating cache database. Use deleteLocalFile to "
|
|
93
|
+
"delete such files." % deletedFile
|
|
94
|
+
)
|
|
94
95
|
super().__init__(message)
|
|
95
96
|
|
|
96
97
|
|
|
@@ -209,13 +210,15 @@ class CachingFileStore(AbstractFileStore):
|
|
|
209
210
|
# Variables related to caching
|
|
210
211
|
# Decide where the cache directory will be. We put it in the local
|
|
211
212
|
# workflow directory.
|
|
212
|
-
self.localCacheDir = os.path.join(
|
|
213
|
+
self.localCacheDir = os.path.join(
|
|
214
|
+
self.workflow_dir, cacheDirName(self.jobStore.config.workflowID)
|
|
215
|
+
)
|
|
213
216
|
|
|
214
217
|
# Since each worker has it's own unique CachingFileStore instance, and only one Job can run
|
|
215
218
|
# at a time on a worker, we can track some stuff about the running job in ourselves.
|
|
216
219
|
self.jobName: str = str(self.jobDesc)
|
|
217
220
|
self.jobID = self.jobDesc.jobStoreID
|
|
218
|
-
logger.debug(
|
|
221
|
+
logger.debug("Starting job (%s) with ID (%s).", self.jobName, self.jobID)
|
|
219
222
|
|
|
220
223
|
# When the job actually starts, we will fill this in with the job's disk requirement.
|
|
221
224
|
self.jobDiskBytes: Optional[float] = None
|
|
@@ -231,7 +234,9 @@ class CachingFileStore(AbstractFileStore):
|
|
|
231
234
|
# the workflow left one behind without cleaning up properly; we need to
|
|
232
235
|
# be able to tell that from showing up on a machine where a cache has
|
|
233
236
|
# already been created.
|
|
234
|
-
self.dbPath = os.path.join(
|
|
237
|
+
self.dbPath = os.path.join(
|
|
238
|
+
self.coordination_dir, f"cache-{self.workflowAttemptNumber}.db"
|
|
239
|
+
)
|
|
235
240
|
|
|
236
241
|
# Database connections are provided by magic properties self.con and
|
|
237
242
|
# self.cur that always have the right object for the current thread to
|
|
@@ -255,7 +260,14 @@ class CachingFileStore(AbstractFileStore):
|
|
|
255
260
|
|
|
256
261
|
# Initialize the space accounting properties
|
|
257
262
|
freeSpace, _ = getFileSystemSize(self.localCacheDir)
|
|
258
|
-
self._write(
|
|
263
|
+
self._write(
|
|
264
|
+
[
|
|
265
|
+
(
|
|
266
|
+
"INSERT OR IGNORE INTO properties VALUES (?, ?)",
|
|
267
|
+
("maxSpace", freeSpace),
|
|
268
|
+
)
|
|
269
|
+
]
|
|
270
|
+
)
|
|
259
271
|
|
|
260
272
|
# Space used by caching and by jobs is accounted with queries
|
|
261
273
|
|
|
@@ -285,10 +297,12 @@ class CachingFileStore(AbstractFileStore):
|
|
|
285
297
|
"""
|
|
286
298
|
Get the database connection to be used for the current thread.
|
|
287
299
|
"""
|
|
288
|
-
if not hasattr(self._thread_local,
|
|
300
|
+
if not hasattr(self._thread_local, "con"):
|
|
289
301
|
# Connect to the database for this thread.
|
|
290
302
|
# TODO: We assume the connection closes when the thread goes away and can no longer use it.
|
|
291
|
-
self._thread_local.con = sqlite3.connect(
|
|
303
|
+
self._thread_local.con = sqlite3.connect(
|
|
304
|
+
self.dbPath, timeout=SQLITE_TIMEOUT_SECS
|
|
305
|
+
)
|
|
292
306
|
return self._thread_local.con
|
|
293
307
|
|
|
294
308
|
@property
|
|
@@ -296,18 +310,20 @@ class CachingFileStore(AbstractFileStore):
|
|
|
296
310
|
"""
|
|
297
311
|
Get the main cursor to be used for the current thread.
|
|
298
312
|
"""
|
|
299
|
-
if not hasattr(self._thread_local,
|
|
313
|
+
if not hasattr(self._thread_local, "cur"):
|
|
300
314
|
# If we don't already have a main cursor for the thread, make one.
|
|
301
315
|
self._thread_local.cur = self.con.cursor()
|
|
302
316
|
return self._thread_local.cur
|
|
303
317
|
|
|
304
318
|
@staticmethod
|
|
305
|
-
@retry(
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
319
|
+
@retry(
|
|
320
|
+
infinite_retries=True,
|
|
321
|
+
errors=[
|
|
322
|
+
ErrorCondition(
|
|
323
|
+
error=sqlite3.OperationalError, error_message_must_include="is locked"
|
|
324
|
+
)
|
|
325
|
+
],
|
|
326
|
+
)
|
|
311
327
|
def _static_write(con, cur, operations):
|
|
312
328
|
"""
|
|
313
329
|
Write to the caching database, using the given connection.
|
|
@@ -341,7 +357,7 @@ class CachingFileStore(AbstractFileStore):
|
|
|
341
357
|
# Do it
|
|
342
358
|
cur.execute(command, args)
|
|
343
359
|
except Exception as e:
|
|
344
|
-
logging.error(
|
|
360
|
+
logging.error("Error talking to caching database: %s", str(e))
|
|
345
361
|
|
|
346
362
|
# Try to make sure we don't somehow leave anything part-done if a
|
|
347
363
|
# middle operation somehow fails.
|
|
@@ -361,13 +377,17 @@ class CachingFileStore(AbstractFileStore):
|
|
|
361
377
|
return cur.rowcount
|
|
362
378
|
|
|
363
379
|
@staticmethod
|
|
364
|
-
@retry(
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
380
|
+
@retry(
|
|
381
|
+
infinite_retries=True,
|
|
382
|
+
errors=[
|
|
383
|
+
ErrorCondition(
|
|
384
|
+
error=sqlite3.OperationalError, error_message_must_include="is locked"
|
|
385
|
+
)
|
|
386
|
+
],
|
|
387
|
+
)
|
|
388
|
+
def _static_read(
|
|
389
|
+
cur: sqlite3.Cursor, query: str, args: Optional[Sequence[Any]] = ()
|
|
390
|
+
) -> Iterator[Any]:
|
|
371
391
|
"""
|
|
372
392
|
Read from the database.
|
|
373
393
|
|
|
@@ -420,7 +440,11 @@ class CachingFileStore(AbstractFileStore):
|
|
|
420
440
|
# Get a cursor
|
|
421
441
|
cur = con.cursor()
|
|
422
442
|
|
|
423
|
-
cls._static_write(
|
|
443
|
+
cls._static_write(
|
|
444
|
+
con,
|
|
445
|
+
cur,
|
|
446
|
+
[
|
|
447
|
+
"""
|
|
424
448
|
CREATE TABLE IF NOT EXISTS files (
|
|
425
449
|
id TEXT NOT NULL PRIMARY KEY,
|
|
426
450
|
path TEXT UNIQUE NOT NULL,
|
|
@@ -428,7 +452,8 @@ class CachingFileStore(AbstractFileStore):
|
|
|
428
452
|
state TEXT NOT NULL,
|
|
429
453
|
owner TEXT
|
|
430
454
|
)
|
|
431
|
-
""",
|
|
455
|
+
""",
|
|
456
|
+
"""
|
|
432
457
|
CREATE TABLE IF NOT EXISTS refs (
|
|
433
458
|
path TEXT NOT NULL,
|
|
434
459
|
file_id TEXT NOT NULL,
|
|
@@ -436,19 +461,23 @@ class CachingFileStore(AbstractFileStore):
|
|
|
436
461
|
state TEXT NOT NULL,
|
|
437
462
|
PRIMARY KEY (path, file_id)
|
|
438
463
|
)
|
|
439
|
-
""",
|
|
464
|
+
""",
|
|
465
|
+
"""
|
|
440
466
|
CREATE TABLE IF NOT EXISTS jobs (
|
|
441
467
|
id TEXT NOT NULL PRIMARY KEY,
|
|
442
468
|
tempdir TEXT NOT NULL,
|
|
443
469
|
disk INT NOT NULL,
|
|
444
470
|
worker TEXT
|
|
445
471
|
)
|
|
446
|
-
""",
|
|
472
|
+
""",
|
|
473
|
+
"""
|
|
447
474
|
CREATE TABLE IF NOT EXISTS properties (
|
|
448
475
|
name TEXT NOT NULL PRIMARY KEY,
|
|
449
476
|
value INT NOT NULL
|
|
450
477
|
)
|
|
451
|
-
"""
|
|
478
|
+
""",
|
|
479
|
+
],
|
|
480
|
+
)
|
|
452
481
|
|
|
453
482
|
# Caching-specific API
|
|
454
483
|
|
|
@@ -459,10 +488,12 @@ class CachingFileStore(AbstractFileStore):
|
|
|
459
488
|
If no limit is available, raises an error.
|
|
460
489
|
"""
|
|
461
490
|
|
|
462
|
-
for row in self.cur.execute(
|
|
491
|
+
for row in self.cur.execute(
|
|
492
|
+
"SELECT value FROM properties WHERE name = ?", ("maxSpace",)
|
|
493
|
+
):
|
|
463
494
|
return row[0]
|
|
464
495
|
|
|
465
|
-
raise RuntimeError(
|
|
496
|
+
raise RuntimeError("Unable to retrieve cache limit")
|
|
466
497
|
|
|
467
498
|
def getCacheUsed(self):
|
|
468
499
|
"""
|
|
@@ -475,10 +506,10 @@ class CachingFileStore(AbstractFileStore):
|
|
|
475
506
|
if self.cachingIsFree():
|
|
476
507
|
return 0
|
|
477
508
|
|
|
478
|
-
for row in self._read(
|
|
509
|
+
for row in self._read("SELECT TOTAL(size) FROM files"):
|
|
479
510
|
return row[0]
|
|
480
511
|
|
|
481
|
-
raise RuntimeError(
|
|
512
|
+
raise RuntimeError("Unable to retrieve cache usage")
|
|
482
513
|
|
|
483
514
|
def getCacheExtraJobSpace(self):
|
|
484
515
|
"""
|
|
@@ -493,15 +524,17 @@ class CachingFileStore(AbstractFileStore):
|
|
|
493
524
|
"""
|
|
494
525
|
|
|
495
526
|
# Total up the sizes of all the reads of files and subtract it from the total disk reservation of all jobs
|
|
496
|
-
for row in self._read(
|
|
527
|
+
for row in self._read(
|
|
528
|
+
"""
|
|
497
529
|
SELECT (
|
|
498
530
|
(SELECT TOTAL(disk) FROM jobs) -
|
|
499
531
|
(SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state == 'immutable')
|
|
500
532
|
) as result
|
|
501
|
-
"""
|
|
533
|
+
"""
|
|
534
|
+
):
|
|
502
535
|
return row[0]
|
|
503
536
|
|
|
504
|
-
raise RuntimeError(
|
|
537
|
+
raise RuntimeError("Unable to retrieve extra job space")
|
|
505
538
|
|
|
506
539
|
def getCacheAvailable(self):
|
|
507
540
|
"""
|
|
@@ -520,33 +553,38 @@ class CachingFileStore(AbstractFileStore):
|
|
|
520
553
|
|
|
521
554
|
# Do a little report first
|
|
522
555
|
for row in self._read("SELECT value FROM properties WHERE name = 'maxSpace'"):
|
|
523
|
-
logger.debug(
|
|
556
|
+
logger.debug("Max space: %d", row[0])
|
|
524
557
|
for row in self._read("SELECT TOTAL(size) FROM files"):
|
|
525
|
-
logger.debug(
|
|
558
|
+
logger.debug("Total file size: %d", row[0])
|
|
526
559
|
for row in self._read("SELECT TOTAL(disk) FROM jobs"):
|
|
527
|
-
logger.debug(
|
|
528
|
-
for row in self._read(
|
|
529
|
-
|
|
560
|
+
logger.debug("Total job disk requirement size: %d", row[0])
|
|
561
|
+
for row in self._read(
|
|
562
|
+
"SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable'"
|
|
563
|
+
):
|
|
564
|
+
logger.debug("Total immutable reference size: %d", row[0])
|
|
530
565
|
|
|
531
566
|
if self.cachingIsFree():
|
|
532
567
|
# If caching is free, we just say that all the space is always available.
|
|
533
|
-
for row in self._read(
|
|
568
|
+
for row in self._read(
|
|
569
|
+
"SELECT value FROM properties WHERE name = 'maxSpace'"
|
|
570
|
+
):
|
|
534
571
|
return row[0]
|
|
535
572
|
|
|
536
|
-
raise RuntimeError(
|
|
537
|
-
|
|
573
|
+
raise RuntimeError("Unable to retrieve available cache space")
|
|
538
574
|
|
|
539
|
-
for row in self._read(
|
|
575
|
+
for row in self._read(
|
|
576
|
+
"""
|
|
540
577
|
SELECT (
|
|
541
578
|
(SELECT value FROM properties WHERE name = 'maxSpace') -
|
|
542
579
|
(SELECT TOTAL(size) FROM files) -
|
|
543
580
|
((SELECT TOTAL(disk) FROM jobs) -
|
|
544
581
|
(SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable'))
|
|
545
582
|
) as result
|
|
546
|
-
"""
|
|
583
|
+
"""
|
|
584
|
+
):
|
|
547
585
|
return row[0]
|
|
548
586
|
|
|
549
|
-
raise RuntimeError(
|
|
587
|
+
raise RuntimeError("Unable to retrieve available cache space")
|
|
550
588
|
|
|
551
589
|
def getSpaceUsableForJobs(self):
|
|
552
590
|
"""
|
|
@@ -556,15 +594,17 @@ class CachingFileStore(AbstractFileStore):
|
|
|
556
594
|
If not retrievable, raises an error.
|
|
557
595
|
"""
|
|
558
596
|
|
|
559
|
-
for row in self._read(
|
|
597
|
+
for row in self._read(
|
|
598
|
+
"""
|
|
560
599
|
SELECT (
|
|
561
600
|
(SELECT value FROM properties WHERE name = 'maxSpace') -
|
|
562
601
|
(SELECT TOTAL(disk) FROM jobs)
|
|
563
602
|
) as result
|
|
564
|
-
"""
|
|
603
|
+
"""
|
|
604
|
+
):
|
|
565
605
|
return row[0]
|
|
566
606
|
|
|
567
|
-
raise RuntimeError(
|
|
607
|
+
raise RuntimeError("Unable to retrieve usabel space for jobs")
|
|
568
608
|
|
|
569
609
|
def getCacheUnusedJobRequirement(self):
|
|
570
610
|
"""
|
|
@@ -576,28 +616,36 @@ class CachingFileStore(AbstractFileStore):
|
|
|
576
616
|
If no value is available, raises an error.
|
|
577
617
|
"""
|
|
578
618
|
|
|
579
|
-
logger.debug(
|
|
580
|
-
|
|
581
|
-
for row in self._read('SELECT * FROM files'):
|
|
582
|
-
logger.debug('File record: %s', str(row))
|
|
619
|
+
logger.debug("Get unused space for job %s", self.jobID)
|
|
583
620
|
|
|
584
|
-
for row in self._read(
|
|
585
|
-
logger.debug(
|
|
621
|
+
for row in self._read("SELECT * FROM files"):
|
|
622
|
+
logger.debug("File record: %s", str(row))
|
|
586
623
|
|
|
624
|
+
for row in self._read("SELECT * FROM refs"):
|
|
625
|
+
logger.debug("Ref record: %s", str(row))
|
|
587
626
|
|
|
588
|
-
for row in self._read(
|
|
589
|
-
(
|
|
627
|
+
for row in self._read(
|
|
628
|
+
"SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.job_id = ? AND refs.state != ?",
|
|
629
|
+
(self.jobID, "mutable"),
|
|
630
|
+
):
|
|
590
631
|
# Sum up all the sizes of our referenced files, then subtract that from how much we came in with
|
|
591
632
|
return self.jobDiskBytes - row[0]
|
|
592
633
|
|
|
593
|
-
raise RuntimeError(
|
|
634
|
+
raise RuntimeError("Unable to retrieve unused job requirement space")
|
|
594
635
|
|
|
595
636
|
def adjustCacheLimit(self, newTotalBytes):
|
|
596
637
|
"""
|
|
597
638
|
Adjust the total cache size limit to the given number of bytes.
|
|
598
639
|
"""
|
|
599
640
|
|
|
600
|
-
self._write(
|
|
641
|
+
self._write(
|
|
642
|
+
[
|
|
643
|
+
(
|
|
644
|
+
"UPDATE properties SET value = ? WHERE name = ?",
|
|
645
|
+
(newTotalBytes, "maxSpace"),
|
|
646
|
+
)
|
|
647
|
+
]
|
|
648
|
+
)
|
|
601
649
|
|
|
602
650
|
def fileIsCached(self, fileID):
|
|
603
651
|
"""
|
|
@@ -608,8 +656,10 @@ class CachingFileStore(AbstractFileStore):
|
|
|
608
656
|
file you need to do it in a transaction.
|
|
609
657
|
"""
|
|
610
658
|
|
|
611
|
-
for row in self._read(
|
|
612
|
-
(
|
|
659
|
+
for row in self._read(
|
|
660
|
+
"SELECT COUNT(*) FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)",
|
|
661
|
+
(fileID, "cached", "uploadable", "uploading"),
|
|
662
|
+
):
|
|
613
663
|
|
|
614
664
|
return row[0] > 0
|
|
615
665
|
return False
|
|
@@ -621,7 +671,7 @@ class CachingFileStore(AbstractFileStore):
|
|
|
621
671
|
Counts mutable references too.
|
|
622
672
|
"""
|
|
623
673
|
|
|
624
|
-
for row in self._read(
|
|
674
|
+
for row in self._read("SELECT COUNT(*) FROM refs WHERE file_id = ?", (fileID,)):
|
|
625
675
|
return row[0]
|
|
626
676
|
return 0
|
|
627
677
|
|
|
@@ -634,11 +684,14 @@ class CachingFileStore(AbstractFileStore):
|
|
|
634
684
|
configurations, most notably the FileJobStore.
|
|
635
685
|
"""
|
|
636
686
|
|
|
637
|
-
for row in self._read(
|
|
687
|
+
for row in self._read(
|
|
688
|
+
"SELECT value FROM properties WHERE name = ?", ("freeCaching",)
|
|
689
|
+
):
|
|
638
690
|
return row[0] == 1
|
|
639
691
|
|
|
640
692
|
# Otherwise we need to set it
|
|
641
693
|
from toil.jobStores.fileJobStore import FileJobStore
|
|
694
|
+
|
|
642
695
|
if isinstance(self.jobStore, FileJobStore) and not self.forceNonFreeCaching:
|
|
643
696
|
# Caching may be free since we are using a file job store.
|
|
644
697
|
|
|
@@ -647,7 +700,7 @@ class CachingFileStore(AbstractFileStore):
|
|
|
647
700
|
|
|
648
701
|
# Read it out to a generated name.
|
|
649
702
|
destDir = mkdtemp(dir=self.localCacheDir)
|
|
650
|
-
cachedFile = os.path.join(destDir,
|
|
703
|
+
cachedFile = os.path.join(destDir, "sniffLinkCount")
|
|
651
704
|
self.jobStore.read_file(emptyID, cachedFile, symlink=False)
|
|
652
705
|
|
|
653
706
|
# Check the link count
|
|
@@ -667,7 +720,9 @@ class CachingFileStore(AbstractFileStore):
|
|
|
667
720
|
free = 0
|
|
668
721
|
|
|
669
722
|
# Save to the database if we're the first to work this out
|
|
670
|
-
self._write(
|
|
723
|
+
self._write(
|
|
724
|
+
[("INSERT OR IGNORE INTO properties VALUES (?, ?)", ("freeCaching", free))]
|
|
725
|
+
)
|
|
671
726
|
|
|
672
727
|
# Return true if we said caching was free
|
|
673
728
|
return free == 1
|
|
@@ -684,7 +739,7 @@ class CachingFileStore(AbstractFileStore):
|
|
|
684
739
|
|
|
685
740
|
# Hash the file ID
|
|
686
741
|
hasher = hashlib.sha1()
|
|
687
|
-
hasher.update(fileStoreID.encode(
|
|
742
|
+
hasher.update(fileStoreID.encode("utf-8"))
|
|
688
743
|
|
|
689
744
|
# Get a unique temp file name, including the file ID's hash to make
|
|
690
745
|
# sure we can never collide even though we are going to remove the
|
|
@@ -708,17 +763,19 @@ class CachingFileStore(AbstractFileStore):
|
|
|
708
763
|
# Get a list of all file owner processes on this node.
|
|
709
764
|
# Exclude NULL because it comes out as 0 and we can't look for PID 0.
|
|
710
765
|
owners = []
|
|
711
|
-
for row in self._read(
|
|
766
|
+
for row in self._read(
|
|
767
|
+
"SELECT DISTINCT owner FROM files WHERE owner IS NOT NULL"
|
|
768
|
+
):
|
|
712
769
|
owners.append(row[0])
|
|
713
770
|
|
|
714
771
|
# Work out which of them have died.
|
|
715
772
|
deadOwners = []
|
|
716
773
|
for owner in owners:
|
|
717
774
|
if not process_name_exists(self.coordination_dir, owner):
|
|
718
|
-
logger.debug(
|
|
775
|
+
logger.debug("Owner %s is dead", owner)
|
|
719
776
|
deadOwners.append(owner)
|
|
720
777
|
else:
|
|
721
|
-
logger.debug(
|
|
778
|
+
logger.debug("Owner %s is alive", owner)
|
|
722
779
|
|
|
723
780
|
for owner in deadOwners:
|
|
724
781
|
# Try and adopt all the files that any dead owner had
|
|
@@ -737,14 +794,28 @@ class CachingFileStore(AbstractFileStore):
|
|
|
737
794
|
#
|
|
738
795
|
# TODO: if we ever let other PIDs be responsible for writing our
|
|
739
796
|
# files asynchronously, this will need to change.
|
|
740
|
-
self._write(
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
797
|
+
self._write(
|
|
798
|
+
[
|
|
799
|
+
(
|
|
800
|
+
"UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?",
|
|
801
|
+
(me, "deleting", owner, "deleting"),
|
|
802
|
+
),
|
|
803
|
+
(
|
|
804
|
+
"UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?",
|
|
805
|
+
(me, "deleting", owner, "downloading"),
|
|
806
|
+
),
|
|
807
|
+
(
|
|
808
|
+
"UPDATE files SET owner = NULL, state = ? WHERE owner = ? AND (state = ? OR state = ?)",
|
|
809
|
+
("cached", owner, "uploadable", "uploading"),
|
|
810
|
+
),
|
|
811
|
+
]
|
|
812
|
+
)
|
|
746
813
|
|
|
747
|
-
logger.debug(
|
|
814
|
+
logger.debug(
|
|
815
|
+
"Tried to adopt file operations from dead worker %s to ourselves as %s",
|
|
816
|
+
owner,
|
|
817
|
+
me,
|
|
818
|
+
)
|
|
748
819
|
|
|
749
820
|
def _executePendingDeletions(self):
|
|
750
821
|
"""
|
|
@@ -758,16 +829,19 @@ class CachingFileStore(AbstractFileStore):
|
|
|
758
829
|
|
|
759
830
|
# Remember the file IDs we are deleting
|
|
760
831
|
deletedFiles = []
|
|
761
|
-
for row in self._read(
|
|
832
|
+
for row in self._read(
|
|
833
|
+
"SELECT id, path FROM files WHERE owner = ? AND state = ?",
|
|
834
|
+
(me, "deleting"),
|
|
835
|
+
):
|
|
762
836
|
# Grab everything we are supposed to delete and delete it
|
|
763
837
|
fileID = row[0]
|
|
764
838
|
filePath = row[1]
|
|
765
839
|
try:
|
|
766
840
|
os.unlink(filePath)
|
|
767
|
-
logger.debug(
|
|
841
|
+
logger.debug("Successfully deleted: %s", filePath)
|
|
768
842
|
except OSError:
|
|
769
843
|
# Probably already deleted
|
|
770
|
-
logger.debug(
|
|
844
|
+
logger.debug("File already gone: %s", filePath)
|
|
771
845
|
# Still need to mark it as deleted
|
|
772
846
|
|
|
773
847
|
# Whether we deleted the file or just found out that it is gone, we
|
|
@@ -778,8 +852,15 @@ class CachingFileStore(AbstractFileStore):
|
|
|
778
852
|
for fileID in deletedFiles:
|
|
779
853
|
# Drop all the files. They should have stayed in deleting state. We move them from there to not present at all.
|
|
780
854
|
# Also drop their references, if they had any from dead downloaders.
|
|
781
|
-
self._write(
|
|
782
|
-
|
|
855
|
+
self._write(
|
|
856
|
+
[
|
|
857
|
+
(
|
|
858
|
+
"DELETE FROM files WHERE id = ? AND state = ?",
|
|
859
|
+
(fileID, "deleting"),
|
|
860
|
+
),
|
|
861
|
+
("DELETE FROM refs WHERE file_id = ?", (fileID,)),
|
|
862
|
+
]
|
|
863
|
+
)
|
|
783
864
|
|
|
784
865
|
return len(deletedFiles)
|
|
785
866
|
|
|
@@ -799,7 +880,11 @@ class CachingFileStore(AbstractFileStore):
|
|
|
799
880
|
# Try and find a file we might want to upload
|
|
800
881
|
fileID = None
|
|
801
882
|
filePath = None
|
|
802
|
-
for row in self._static_read(
|
|
883
|
+
for row in self._static_read(
|
|
884
|
+
self.cur,
|
|
885
|
+
"SELECT id, path FROM files WHERE state = ? AND owner = ? LIMIT 1",
|
|
886
|
+
("uploadable", me),
|
|
887
|
+
):
|
|
803
888
|
fileID = row[0]
|
|
804
889
|
filePath = row[1]
|
|
805
890
|
|
|
@@ -808,30 +893,57 @@ class CachingFileStore(AbstractFileStore):
|
|
|
808
893
|
break
|
|
809
894
|
|
|
810
895
|
# We need to set it to uploading in a way that we can detect that *we* won the update race instead of anyone else.
|
|
811
|
-
rowCount = self._static_write(
|
|
896
|
+
rowCount = self._static_write(
|
|
897
|
+
self.con,
|
|
898
|
+
self.cur,
|
|
899
|
+
[
|
|
900
|
+
(
|
|
901
|
+
"UPDATE files SET state = ? WHERE id = ? AND state = ?",
|
|
902
|
+
("uploading", fileID, "uploadable"),
|
|
903
|
+
)
|
|
904
|
+
],
|
|
905
|
+
)
|
|
812
906
|
if rowCount != 1:
|
|
813
907
|
# We didn't manage to update it. Someone else (a running job if
|
|
814
908
|
# we are a committing thread, or visa versa) must have grabbed
|
|
815
909
|
# it.
|
|
816
|
-
logger.debug(
|
|
910
|
+
logger.debug("Lost race to upload %s", fileID)
|
|
817
911
|
# Try again to see if there is something else to grab.
|
|
818
912
|
continue
|
|
819
913
|
|
|
820
914
|
# Upload the file
|
|
821
|
-
logger.debug(
|
|
915
|
+
logger.debug("Actually executing upload for file %s", fileID)
|
|
822
916
|
try:
|
|
823
917
|
self.jobStore.update_file(fileID, filePath)
|
|
824
918
|
except:
|
|
825
919
|
# We need to set the state back to 'uploadable' in case of any failures to ensure
|
|
826
920
|
# we can retry properly.
|
|
827
|
-
self._static_write(
|
|
921
|
+
self._static_write(
|
|
922
|
+
self.con,
|
|
923
|
+
self.cur,
|
|
924
|
+
[
|
|
925
|
+
(
|
|
926
|
+
"UPDATE files SET state = ? WHERE id = ? AND state = ?",
|
|
927
|
+
("uploadable", fileID, "uploading"),
|
|
928
|
+
)
|
|
929
|
+
],
|
|
930
|
+
)
|
|
828
931
|
raise
|
|
829
932
|
|
|
830
933
|
# Count it for the total uploaded files value we need to return
|
|
831
934
|
uploadedCount += 1
|
|
832
935
|
|
|
833
936
|
# Remember that we uploaded it in the database
|
|
834
|
-
self._static_write(
|
|
937
|
+
self._static_write(
|
|
938
|
+
self.con,
|
|
939
|
+
self.cur,
|
|
940
|
+
[
|
|
941
|
+
(
|
|
942
|
+
"UPDATE files SET state = ?, owner = NULL WHERE id = ?",
|
|
943
|
+
("cached", fileID),
|
|
944
|
+
)
|
|
945
|
+
],
|
|
946
|
+
)
|
|
835
947
|
|
|
836
948
|
return uploadedCount
|
|
837
949
|
|
|
@@ -855,7 +967,14 @@ class CachingFileStore(AbstractFileStore):
|
|
|
855
967
|
# But we won't actually let the job run and use any of this space until
|
|
856
968
|
# the cache has been successfully cleared out.
|
|
857
969
|
with self.as_process() as me:
|
|
858
|
-
self._write(
|
|
970
|
+
self._write(
|
|
971
|
+
[
|
|
972
|
+
(
|
|
973
|
+
"INSERT INTO jobs VALUES (?, ?, ?, ?)",
|
|
974
|
+
(self.jobID, self.localTempDir, newJobReqs, me),
|
|
975
|
+
)
|
|
976
|
+
]
|
|
977
|
+
)
|
|
859
978
|
|
|
860
979
|
# Now we need to make sure that we can fit all currently cached files,
|
|
861
980
|
# and the parts of the total job requirements not currently spent on
|
|
@@ -863,7 +982,7 @@ class CachingFileStore(AbstractFileStore):
|
|
|
863
982
|
|
|
864
983
|
available = self.getCacheAvailable()
|
|
865
984
|
|
|
866
|
-
logger.debug(
|
|
985
|
+
logger.debug("Available space with job: %d bytes", available)
|
|
867
986
|
|
|
868
987
|
if available >= 0:
|
|
869
988
|
# We're fine on disk space
|
|
@@ -887,10 +1006,14 @@ class CachingFileStore(AbstractFileStore):
|
|
|
887
1006
|
"""
|
|
888
1007
|
|
|
889
1008
|
# Get the job's temp dir
|
|
890
|
-
for row in cls._static_read(
|
|
1009
|
+
for row in cls._static_read(
|
|
1010
|
+
cur, "SELECT tempdir FROM jobs WHERE id = ?", (jobID,)
|
|
1011
|
+
):
|
|
891
1012
|
jobTemp = row[0]
|
|
892
1013
|
|
|
893
|
-
for row in cls._static_read(
|
|
1014
|
+
for row in cls._static_read(
|
|
1015
|
+
cur, "SELECT path FROM refs WHERE job_id = ?", (jobID,)
|
|
1016
|
+
):
|
|
894
1017
|
try:
|
|
895
1018
|
# Delete all the reference files.
|
|
896
1019
|
os.unlink(row[0])
|
|
@@ -898,7 +1021,7 @@ class CachingFileStore(AbstractFileStore):
|
|
|
898
1021
|
# May not exist
|
|
899
1022
|
pass
|
|
900
1023
|
# And their database entries
|
|
901
|
-
cls._static_write(con, cur, [(
|
|
1024
|
+
cls._static_write(con, cur, [("DELETE FROM refs WHERE job_id = ?", (jobID,))])
|
|
902
1025
|
|
|
903
1026
|
try:
|
|
904
1027
|
# Delete the job's temp directory to the extent that we can.
|
|
@@ -907,7 +1030,7 @@ class CachingFileStore(AbstractFileStore):
|
|
|
907
1030
|
pass
|
|
908
1031
|
|
|
909
1032
|
# Strike the job from the database
|
|
910
|
-
cls._static_write(con, cur, [(
|
|
1033
|
+
cls._static_write(con, cur, [("DELETE FROM jobs WHERE id = ?", (jobID,))])
|
|
911
1034
|
|
|
912
1035
|
def _deallocateSpaceForJob(self):
|
|
913
1036
|
"""
|
|
@@ -938,12 +1061,12 @@ class CachingFileStore(AbstractFileStore):
|
|
|
938
1061
|
if self._executePendingDeletions() > 0:
|
|
939
1062
|
# We actually had something to delete, which we deleted.
|
|
940
1063
|
# Maybe there is space now
|
|
941
|
-
logger.debug(
|
|
1064
|
+
logger.debug("Successfully executed pending deletions to free space")
|
|
942
1065
|
return True
|
|
943
1066
|
|
|
944
1067
|
if self._executePendingUploads() > 0:
|
|
945
1068
|
# We had something to upload. Maybe it can be evicted now.
|
|
946
|
-
logger.debug(
|
|
1069
|
+
logger.debug("Successfully executed pending uploads to free space")
|
|
947
1070
|
return True
|
|
948
1071
|
|
|
949
1072
|
# Otherwise, not enough files could be found in deleting state to solve our problem.
|
|
@@ -953,37 +1076,45 @@ class CachingFileStore(AbstractFileStore):
|
|
|
953
1076
|
# soon as we hit the cache limit.
|
|
954
1077
|
|
|
955
1078
|
# Find something that has no non-mutable references and is not already being deleted.
|
|
956
|
-
self._read(
|
|
1079
|
+
self._read(
|
|
1080
|
+
"""
|
|
957
1081
|
SELECT files.id FROM files WHERE files.state = 'cached' AND NOT EXISTS (
|
|
958
1082
|
SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
|
|
959
1083
|
) LIMIT 1
|
|
960
|
-
"""
|
|
1084
|
+
"""
|
|
1085
|
+
)
|
|
961
1086
|
row = self.cur.fetchone()
|
|
962
1087
|
if row is None:
|
|
963
1088
|
# Nothing can be evicted by us.
|
|
964
1089
|
# Someone else might be in the process of evicting something that will free up space for us too.
|
|
965
1090
|
# Or someone mught be uploading something and we have to wait for them to finish before it can be deleted.
|
|
966
|
-
logger.debug(
|
|
1091
|
+
logger.debug("Could not find anything to evict! Cannot free up space!")
|
|
967
1092
|
return False
|
|
968
1093
|
|
|
969
1094
|
# Otherwise we found an eviction candidate.
|
|
970
1095
|
fileID = row[0]
|
|
971
1096
|
|
|
972
1097
|
# Try and grab it for deletion, subject to the condition that nothing has started reading it
|
|
973
|
-
self._write(
|
|
1098
|
+
self._write(
|
|
1099
|
+
[
|
|
1100
|
+
(
|
|
1101
|
+
"""
|
|
974
1102
|
UPDATE files SET owner = ?, state = ? WHERE id = ? AND state = ?
|
|
975
1103
|
AND owner IS NULL AND NOT EXISTS (
|
|
976
1104
|
SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
|
|
977
1105
|
)
|
|
978
1106
|
""",
|
|
979
|
-
|
|
1107
|
+
(me, "deleting", fileID, "cached"),
|
|
1108
|
+
)
|
|
1109
|
+
]
|
|
1110
|
+
)
|
|
980
1111
|
|
|
981
|
-
logger.debug(
|
|
1112
|
+
logger.debug("Evicting file %s", fileID)
|
|
982
1113
|
|
|
983
1114
|
# Whether we actually got it or not, try deleting everything we have to delete
|
|
984
1115
|
if self._executePendingDeletions() > 0:
|
|
985
1116
|
# We deleted something
|
|
986
|
-
logger.debug(
|
|
1117
|
+
logger.debug("Successfully executed pending deletions to free space")
|
|
987
1118
|
return True
|
|
988
1119
|
|
|
989
1120
|
def _freeUpSpace(self):
|
|
@@ -1000,7 +1131,10 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1000
1131
|
|
|
1001
1132
|
while availableSpace < 0:
|
|
1002
1133
|
# While there isn't enough space for the thing we want
|
|
1003
|
-
logger.debug(
|
|
1134
|
+
logger.debug(
|
|
1135
|
+
"Cache is full (%d bytes free). Trying to free up space!",
|
|
1136
|
+
availableSpace,
|
|
1137
|
+
)
|
|
1004
1138
|
# Free up space. See if we made any progress
|
|
1005
1139
|
progress = self._tryToFreeUpSpace()
|
|
1006
1140
|
availableSpace = self.getCacheAvailable()
|
|
@@ -1012,19 +1146,23 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1012
1146
|
# See if we've been oversubscribed.
|
|
1013
1147
|
jobSpace = self.getSpaceUsableForJobs()
|
|
1014
1148
|
if jobSpace < 0:
|
|
1015
|
-
logger.critical(
|
|
1149
|
+
logger.critical(
|
|
1150
|
+
"Jobs on this machine have oversubscribed our total available space (%d bytes)!",
|
|
1151
|
+
jobSpace,
|
|
1152
|
+
)
|
|
1016
1153
|
raise CacheUnbalancedError
|
|
1017
1154
|
else:
|
|
1018
1155
|
patience -= 1
|
|
1019
1156
|
if patience <= 0:
|
|
1020
|
-
logger.critical(
|
|
1157
|
+
logger.critical(
|
|
1158
|
+
"Waited implausibly long for active uploads and deletes."
|
|
1159
|
+
)
|
|
1021
1160
|
raise CacheUnbalancedError
|
|
1022
1161
|
else:
|
|
1023
1162
|
# Wait a bit and come back
|
|
1024
1163
|
time.sleep(2)
|
|
1025
1164
|
|
|
1026
|
-
logger.debug(
|
|
1027
|
-
|
|
1165
|
+
logger.debug("Cache has %d bytes free.", availableSpace)
|
|
1028
1166
|
|
|
1029
1167
|
# Normal AbstractFileStore API
|
|
1030
1168
|
|
|
@@ -1037,15 +1175,21 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1037
1175
|
# Create a working directory for the job
|
|
1038
1176
|
startingDir = os.getcwd()
|
|
1039
1177
|
# Move self.localTempDir from the worker directory set up in __init__ to a per-job directory.
|
|
1040
|
-
self.localTempDir = make_public_dir(
|
|
1178
|
+
self.localTempDir = make_public_dir(self.localTempDir, suggested_name="job")
|
|
1041
1179
|
# Check the status of all jobs on this node. If there are jobs that started and died before
|
|
1042
1180
|
# cleaning up their presence from the database, clean them up ourselves.
|
|
1043
1181
|
self._removeDeadJobs(self.coordination_dir, self.con)
|
|
1044
|
-
# Get the
|
|
1182
|
+
# Get the disk requirement for the job, which we will use to know if we
|
|
1183
|
+
# have filled the cache or not.
|
|
1045
1184
|
self.jobDiskBytes = job.disk
|
|
1046
1185
|
|
|
1047
|
-
logger.debug(
|
|
1048
|
-
|
|
1186
|
+
logger.debug(
|
|
1187
|
+
"Actually running job (%s) with ID (%s) which wants %d of our %d bytes.",
|
|
1188
|
+
self.jobName,
|
|
1189
|
+
self.jobID,
|
|
1190
|
+
self.jobDiskBytes,
|
|
1191
|
+
self.getCacheLimit(),
|
|
1192
|
+
)
|
|
1049
1193
|
|
|
1050
1194
|
# Register the current job as taking this much space, and evict files
|
|
1051
1195
|
# from the cache to make room before letting the job run.
|
|
@@ -1055,22 +1199,6 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1055
1199
|
with super().open(job):
|
|
1056
1200
|
yield
|
|
1057
1201
|
finally:
|
|
1058
|
-
# See how much disk space is used at the end of the job.
|
|
1059
|
-
# Not a real peak disk usage, but close enough to be useful for warning the user.
|
|
1060
|
-
# TODO: Push this logic into the abstract file store
|
|
1061
|
-
disk: int = getDirSizeRecursively(self.localTempDir)
|
|
1062
|
-
percent: float = 0.0
|
|
1063
|
-
if self.jobDiskBytes and self.jobDiskBytes > 0:
|
|
1064
|
-
percent = float(disk) / self.jobDiskBytes * 100
|
|
1065
|
-
disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
|
|
1066
|
-
f"{bytes2human(self.jobDiskBytes)}B [{self.jobDiskBytes}B] requested).")
|
|
1067
|
-
if disk > self.jobDiskBytes:
|
|
1068
|
-
self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
|
|
1069
|
-
f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
|
|
1070
|
-
level=logging.WARNING)
|
|
1071
|
-
else:
|
|
1072
|
-
self.log_to_leader(disk_usage, level=logging.DEBUG)
|
|
1073
|
-
|
|
1074
1202
|
# Go back up to the per-worker local temp directory.
|
|
1075
1203
|
os.chdir(startingDir)
|
|
1076
1204
|
self.cleanupInProgress = True
|
|
@@ -1095,7 +1223,9 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1095
1223
|
# Create an empty file to get an ID.
|
|
1096
1224
|
# Make sure to pass along the file basename.
|
|
1097
1225
|
# TODO: this empty file could leak if we die now...
|
|
1098
|
-
fileID = self.jobStore.
|
|
1226
|
+
fileID = self.jobStore.get_empty_file_store_id(
|
|
1227
|
+
creatorID, cleanup, os.path.basename(localFileName)
|
|
1228
|
+
)
|
|
1099
1229
|
# Work out who we are
|
|
1100
1230
|
with self.as_process() as me:
|
|
1101
1231
|
|
|
@@ -1104,10 +1234,22 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1104
1234
|
|
|
1105
1235
|
# Create a file in uploadable state and a reference, in the same transaction.
|
|
1106
1236
|
# Say the reference is an immutable reference
|
|
1107
|
-
self._write(
|
|
1108
|
-
|
|
1237
|
+
self._write(
|
|
1238
|
+
[
|
|
1239
|
+
(
|
|
1240
|
+
"INSERT INTO files VALUES (?, ?, ?, ?, ?)",
|
|
1241
|
+
(fileID, cachePath, fileSize, "uploadable", me),
|
|
1242
|
+
),
|
|
1243
|
+
(
|
|
1244
|
+
"INSERT INTO refs VALUES (?, ?, ?, ?)",
|
|
1245
|
+
(absLocalFileName, fileID, creatorID, "immutable"),
|
|
1246
|
+
),
|
|
1247
|
+
]
|
|
1248
|
+
)
|
|
1109
1249
|
|
|
1110
|
-
if absLocalFileName.startswith(self.localTempDir) and not os.path.islink(
|
|
1250
|
+
if absLocalFileName.startswith(self.localTempDir) and not os.path.islink(
|
|
1251
|
+
absLocalFileName
|
|
1252
|
+
):
|
|
1111
1253
|
# We should link into the cache, because the upload is coming from our local temp dir (and not via a symlink in there)
|
|
1112
1254
|
try:
|
|
1113
1255
|
# Try and hardlink the file into the cache.
|
|
@@ -1118,8 +1260,14 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1118
1260
|
|
|
1119
1261
|
linkedToCache = True
|
|
1120
1262
|
|
|
1121
|
-
logger.debug(
|
|
1122
|
-
|
|
1263
|
+
logger.debug(
|
|
1264
|
+
"Hardlinked file %s into cache at %s; deferring write to job store",
|
|
1265
|
+
localFileName,
|
|
1266
|
+
cachePath,
|
|
1267
|
+
)
|
|
1268
|
+
assert not os.path.islink(cachePath), (
|
|
1269
|
+
"Symlink %s has invaded cache!" % cachePath
|
|
1270
|
+
)
|
|
1123
1271
|
|
|
1124
1272
|
# Don't do the upload now. Let it be deferred until later (when the job is committing).
|
|
1125
1273
|
except OSError:
|
|
@@ -1133,7 +1281,6 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1133
1281
|
# files to vanish from our cache.
|
|
1134
1282
|
linkedToCache = False
|
|
1135
1283
|
|
|
1136
|
-
|
|
1137
1284
|
if not linkedToCache:
|
|
1138
1285
|
# If we can't do the link into the cache and upload from there, we
|
|
1139
1286
|
# have to just upload right away. We can't guarantee sufficient
|
|
@@ -1142,27 +1289,40 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1142
1289
|
|
|
1143
1290
|
# Change the reference to 'mutable', which it will be.
|
|
1144
1291
|
# And drop the file altogether.
|
|
1145
|
-
self._write(
|
|
1146
|
-
|
|
1292
|
+
self._write(
|
|
1293
|
+
[
|
|
1294
|
+
(
|
|
1295
|
+
"UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
|
|
1296
|
+
("mutable", absLocalFileName, fileID),
|
|
1297
|
+
),
|
|
1298
|
+
("DELETE FROM files WHERE id = ?", (fileID,)),
|
|
1299
|
+
]
|
|
1300
|
+
)
|
|
1147
1301
|
|
|
1148
1302
|
# Save the file to the job store right now
|
|
1149
|
-
logger.debug(
|
|
1303
|
+
logger.debug(
|
|
1304
|
+
"Actually executing upload immediately for file %s", fileID
|
|
1305
|
+
)
|
|
1150
1306
|
self.jobStore.update_file(fileID, absLocalFileName)
|
|
1151
1307
|
|
|
1152
1308
|
# Ship out the completed FileID object with its real size.
|
|
1153
1309
|
return FileID.forPath(fileID, absLocalFileName)
|
|
1154
1310
|
|
|
1155
|
-
def readGlobalFile(
|
|
1311
|
+
def readGlobalFile(
|
|
1312
|
+
self, fileStoreID, userPath=None, cache=True, mutable=False, symlink=False
|
|
1313
|
+
):
|
|
1156
1314
|
|
|
1157
1315
|
if str(fileStoreID) in self.filesToDelete:
|
|
1158
1316
|
# File has already been deleted
|
|
1159
|
-
raise FileNotFoundError(f
|
|
1317
|
+
raise FileNotFoundError(f"Attempted to read deleted file: {fileStoreID}")
|
|
1160
1318
|
|
|
1161
1319
|
if userPath is not None:
|
|
1162
1320
|
# Validate the destination we got
|
|
1163
1321
|
localFilePath = self._resolveAbsoluteLocalPath(userPath)
|
|
1164
1322
|
if os.path.exists(localFilePath):
|
|
1165
|
-
raise RuntimeError(
|
|
1323
|
+
raise RuntimeError(
|
|
1324
|
+
" File %s " % localFilePath + " exists. Cannot Overwrite."
|
|
1325
|
+
)
|
|
1166
1326
|
else:
|
|
1167
1327
|
# Make our own destination
|
|
1168
1328
|
localFilePath = self.getLocalTempFileName()
|
|
@@ -1174,22 +1334,29 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1174
1334
|
# We want to use the cache
|
|
1175
1335
|
|
|
1176
1336
|
if mutable:
|
|
1177
|
-
finalPath = self._readGlobalFileMutablyWithCache(
|
|
1337
|
+
finalPath = self._readGlobalFileMutablyWithCache(
|
|
1338
|
+
fileStoreID, localFilePath, readerID
|
|
1339
|
+
)
|
|
1178
1340
|
else:
|
|
1179
|
-
finalPath = self._readGlobalFileWithCache(
|
|
1341
|
+
finalPath = self._readGlobalFileWithCache(
|
|
1342
|
+
fileStoreID, localFilePath, symlink, readerID
|
|
1343
|
+
)
|
|
1180
1344
|
else:
|
|
1181
1345
|
# We do not want to use the cache
|
|
1182
|
-
finalPath = self._readGlobalFileWithoutCache(
|
|
1346
|
+
finalPath = self._readGlobalFileWithoutCache(
|
|
1347
|
+
fileStoreID, localFilePath, mutable, symlink, readerID
|
|
1348
|
+
)
|
|
1183
1349
|
|
|
1184
|
-
if getattr(fileStoreID,
|
|
1350
|
+
if getattr(fileStoreID, "executable", False):
|
|
1185
1351
|
os.chmod(finalPath, os.stat(finalPath).st_mode | stat.S_IXUSR)
|
|
1186
1352
|
|
|
1187
1353
|
# Record access in case the job crashes and we have to log it
|
|
1188
1354
|
self.logAccess(fileStoreID, finalPath)
|
|
1189
1355
|
return finalPath
|
|
1190
1356
|
|
|
1191
|
-
|
|
1192
|
-
|
|
1357
|
+
def _readGlobalFileWithoutCache(
|
|
1358
|
+
self, fileStoreID, localFilePath, mutable, symlink, readerID
|
|
1359
|
+
):
|
|
1193
1360
|
"""
|
|
1194
1361
|
Read a file without putting it into the cache.
|
|
1195
1362
|
|
|
@@ -1207,7 +1374,9 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1207
1374
|
# read a file that is 'uploadable' or 'uploading' and hasn't hit
|
|
1208
1375
|
# the backing job store yet.
|
|
1209
1376
|
|
|
1210
|
-
with self._with_copying_reference_to_upload(
|
|
1377
|
+
with self._with_copying_reference_to_upload(
|
|
1378
|
+
fileStoreID, readerID, localFilePath
|
|
1379
|
+
) as ref_path:
|
|
1211
1380
|
if ref_path is not None:
|
|
1212
1381
|
# We got a copying reference, so the file is being uploaded and
|
|
1213
1382
|
# must be read from the cache for consistency. And it will
|
|
@@ -1221,11 +1390,16 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1221
1390
|
|
|
1222
1391
|
# Find where the file is cached
|
|
1223
1392
|
cachedPath = None
|
|
1224
|
-
for row in self._read(
|
|
1393
|
+
for row in self._read(
|
|
1394
|
+
"SELECT path FROM files WHERE id = ?", (fileStoreID,)
|
|
1395
|
+
):
|
|
1225
1396
|
cachedPath = row[0]
|
|
1226
1397
|
|
|
1227
1398
|
if cachedPath is None:
|
|
1228
|
-
raise RuntimeError(
|
|
1399
|
+
raise RuntimeError(
|
|
1400
|
+
"File %s went away while we had a reference to it!"
|
|
1401
|
+
% fileStoreID
|
|
1402
|
+
)
|
|
1229
1403
|
|
|
1230
1404
|
if self.forceDownloadDelay is not None:
|
|
1231
1405
|
# Wait around to simulate a big file for testing
|
|
@@ -1234,8 +1408,14 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1234
1408
|
atomic_copy(cachedPath, ref_path)
|
|
1235
1409
|
|
|
1236
1410
|
# Change the reference to mutable so it sticks around
|
|
1237
|
-
self._write(
|
|
1238
|
-
|
|
1411
|
+
self._write(
|
|
1412
|
+
[
|
|
1413
|
+
(
|
|
1414
|
+
"UPDATE refs SET state = ? WHERE path = ? and file_id = ?",
|
|
1415
|
+
("mutable", ref_path, fileStoreID),
|
|
1416
|
+
)
|
|
1417
|
+
]
|
|
1418
|
+
)
|
|
1239
1419
|
else:
|
|
1240
1420
|
# File is not being uploaded currently.
|
|
1241
1421
|
|
|
@@ -1245,8 +1425,14 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1245
1425
|
|
|
1246
1426
|
# Create a 'mutable' reference (even if we end up with a link)
|
|
1247
1427
|
# so we can see this file in deleteLocalFile.
|
|
1248
|
-
self._write(
|
|
1249
|
-
|
|
1428
|
+
self._write(
|
|
1429
|
+
[
|
|
1430
|
+
(
|
|
1431
|
+
"INSERT INTO refs VALUES (?, ?, ?, ?)",
|
|
1432
|
+
(localFilePath, fileStoreID, readerID, "mutable"),
|
|
1433
|
+
)
|
|
1434
|
+
]
|
|
1435
|
+
)
|
|
1250
1436
|
|
|
1251
1437
|
if self.forceDownloadDelay is not None:
|
|
1252
1438
|
# Wait around to simulate a big file for testing
|
|
@@ -1306,15 +1492,32 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1306
1492
|
# Start a loop until we can do one of these
|
|
1307
1493
|
while True:
|
|
1308
1494
|
# Try and create a downloading entry if no entry exists
|
|
1309
|
-
logger.debug(
|
|
1310
|
-
self._write(
|
|
1311
|
-
|
|
1495
|
+
logger.debug("Trying to make file record for id %s", fileStoreID)
|
|
1496
|
+
self._write(
|
|
1497
|
+
[
|
|
1498
|
+
(
|
|
1499
|
+
"INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)",
|
|
1500
|
+
(
|
|
1501
|
+
fileStoreID,
|
|
1502
|
+
cachedPath,
|
|
1503
|
+
self.getGlobalFileSize(fileStoreID),
|
|
1504
|
+
"downloading",
|
|
1505
|
+
me,
|
|
1506
|
+
),
|
|
1507
|
+
)
|
|
1508
|
+
]
|
|
1509
|
+
)
|
|
1312
1510
|
|
|
1313
1511
|
# See if we won the race
|
|
1314
|
-
self._read(
|
|
1512
|
+
self._read(
|
|
1513
|
+
"SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?",
|
|
1514
|
+
(fileStoreID, "downloading", me),
|
|
1515
|
+
)
|
|
1315
1516
|
if self.cur.fetchone()[0] > 0:
|
|
1316
1517
|
# We are responsible for downloading the file
|
|
1317
|
-
logger.debug(
|
|
1518
|
+
logger.debug(
|
|
1519
|
+
"We are now responsible for downloading file %s", fileStoreID
|
|
1520
|
+
)
|
|
1318
1521
|
|
|
1319
1522
|
# Make sure we have space for this download.
|
|
1320
1523
|
self._freeUpSpace()
|
|
@@ -1329,37 +1532,65 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1329
1532
|
# two readers, one cached copy, and space for two copies total.
|
|
1330
1533
|
|
|
1331
1534
|
# Make the copying reference
|
|
1332
|
-
self._write(
|
|
1333
|
-
|
|
1535
|
+
self._write(
|
|
1536
|
+
[
|
|
1537
|
+
(
|
|
1538
|
+
"INSERT INTO refs VALUES (?, ?, ?, ?)",
|
|
1539
|
+
(localFilePath, fileStoreID, readerID, "copying"),
|
|
1540
|
+
)
|
|
1541
|
+
]
|
|
1542
|
+
)
|
|
1334
1543
|
|
|
1335
1544
|
# Fulfill it with a full copy or by giving away the cached copy
|
|
1336
|
-
self._fulfillCopyingReference(
|
|
1545
|
+
self._fulfillCopyingReference(
|
|
1546
|
+
fileStoreID, cachedPath, localFilePath
|
|
1547
|
+
)
|
|
1337
1548
|
|
|
1338
1549
|
# Now we're done
|
|
1339
1550
|
return localFilePath
|
|
1340
1551
|
|
|
1341
1552
|
else:
|
|
1342
|
-
logger.debug(
|
|
1553
|
+
logger.debug(
|
|
1554
|
+
"Someone else is already responsible for file %s", fileStoreID
|
|
1555
|
+
)
|
|
1343
1556
|
|
|
1344
1557
|
# A record already existed for this file.
|
|
1345
1558
|
# Try and create an immutable or copying reference to an entry that
|
|
1346
1559
|
# is in 'cached' or 'uploadable' or 'uploading' state.
|
|
1347
1560
|
# It might be uploading because *we* are supposed to be uploading it.
|
|
1348
|
-
logger.debug(
|
|
1349
|
-
self._write(
|
|
1350
|
-
|
|
1561
|
+
logger.debug("Trying to make reference to file %s", fileStoreID)
|
|
1562
|
+
self._write(
|
|
1563
|
+
[
|
|
1564
|
+
(
|
|
1565
|
+
"INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)",
|
|
1566
|
+
(
|
|
1567
|
+
localFilePath,
|
|
1568
|
+
readerID,
|
|
1569
|
+
"copying",
|
|
1570
|
+
fileStoreID,
|
|
1571
|
+
"cached",
|
|
1572
|
+
"uploadable",
|
|
1573
|
+
"uploading",
|
|
1574
|
+
),
|
|
1575
|
+
)
|
|
1576
|
+
]
|
|
1577
|
+
)
|
|
1351
1578
|
|
|
1352
1579
|
# See if we got it
|
|
1353
|
-
self._read(
|
|
1580
|
+
self._read(
|
|
1581
|
+
"SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?",
|
|
1582
|
+
(localFilePath, fileStoreID),
|
|
1583
|
+
)
|
|
1354
1584
|
if self.cur.fetchone()[0] > 0:
|
|
1355
1585
|
# The file is cached and we can copy or link it
|
|
1356
|
-
logger.debug(
|
|
1586
|
+
logger.debug("Obtained reference to file %s", fileStoreID)
|
|
1357
1587
|
|
|
1358
1588
|
# Get the path it is actually at in the cache, instead of where we wanted to put it
|
|
1359
|
-
for row in self._read(
|
|
1589
|
+
for row in self._read(
|
|
1590
|
+
"SELECT path FROM files WHERE id = ?", (fileStoreID,)
|
|
1591
|
+
):
|
|
1360
1592
|
cachedPath = row[0]
|
|
1361
1593
|
|
|
1362
|
-
|
|
1363
1594
|
while self.getCacheAvailable() < 0:
|
|
1364
1595
|
# Since we now have a copying reference, see if we have used too much space.
|
|
1365
1596
|
# If so, try to free up some space by deleting or uploading, but
|
|
@@ -1372,15 +1603,23 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1372
1603
|
|
|
1373
1604
|
# See if we have no other references and we can give away the file.
|
|
1374
1605
|
# Change it to downloading owned by us if we can grab it.
|
|
1375
|
-
self._write(
|
|
1606
|
+
self._write(
|
|
1607
|
+
[
|
|
1608
|
+
(
|
|
1609
|
+
"""
|
|
1376
1610
|
UPDATE files SET files.owner = ?, files.state = ? WHERE files.id = ? AND files.state = ?
|
|
1377
1611
|
AND files.owner IS NULL AND NOT EXISTS (
|
|
1378
1612
|
SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
|
|
1379
1613
|
)
|
|
1380
1614
|
""",
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1615
|
+
(me, "downloading", fileStoreID, "cached"),
|
|
1616
|
+
)
|
|
1617
|
+
]
|
|
1618
|
+
)
|
|
1619
|
+
|
|
1620
|
+
if self._giveAwayDownloadingFile(
|
|
1621
|
+
fileStoreID, cachedPath, localFilePath
|
|
1622
|
+
):
|
|
1384
1623
|
# We got ownership of the file and managed to give it away.
|
|
1385
1624
|
return localFilePath
|
|
1386
1625
|
|
|
@@ -1401,14 +1640,23 @@ class CachingFileStore(AbstractFileStore):
  atomic_copy(cachedPath, localFilePath)

  # Change the reference to mutable
- self._write(
+ self._write(
+ [
+ (
+ "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
+ ("mutable", localFilePath, fileStoreID),
+ )
+ ]
+ )

  # Now we're done
  return localFilePath

  else:
  # We didn't get a reference. Maybe it is still downloading.
- logger.debug(
+ logger.debug(
+ "Could not obtain reference to file %s", fileStoreID
+ )

  # Loop around again and see if either we can download it or we can get a reference to it.

@@ -1448,8 +1696,14 @@ class CachingFileStore(AbstractFileStore):
  # Expose this file as cached so other people can copy off of it too.

  # Change state from downloading to cached
- self._write(
-
+ self._write(
+ [
+ (
+ "UPDATE files SET state = ?, owner = NULL WHERE id = ?",
+ ("cached", fileStoreID),
+ )
+ ]
+ )

  if self.forceDownloadDelay is not None:
  # Wait around to simulate a big file for testing
@@ -1459,12 +1713,18 @@ class CachingFileStore(AbstractFileStore):
  atomic_copy(cachedPath, localFilePath)

  # Change our reference to mutable
- self._write(
+ self._write(
+ [
+ (
+ "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
+ ("mutable", localFilePath, fileStoreID),
+ )
+ ]
+ )

  # Now we're done
  return

-
  def _giveAwayDownloadingFile(self, fileStoreID, cachedPath, localFilePath):
  """
  Move a downloaded file in 'downloading' state, owned by us, from the cache to a user-specified destination path.
@@ -1484,8 +1744,10 @@ class CachingFileStore(AbstractFileStore):
  with self.as_process() as me:

  # See if we actually own this file and can giove it away
- self._read(
- (
+ self._read(
+ "SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?",
+ (fileStoreID, "downloading", me),
+ )
  if self.cur.fetchone()[0] > 0:
  # Now we have exclusive control of the cached copy of the file, so we can give it away.

@@ -1494,8 +1756,15 @@ class CachingFileStore(AbstractFileStore):
  # We are giving it away
  shutil.move(cachedPath, localFilePath)
  # Record that.
- self._write(
-
+ self._write(
+ [
+ (
+ "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
+ ("mutable", localFilePath, fileStoreID),
+ ),
+ ("DELETE FROM files WHERE id = ?", (fileStoreID,)),
+ ]
+ )

  # Now we're done
  return True
@@ -1520,7 +1789,9 @@ class CachingFileStore(AbstractFileStore):
  :rtype: bool
  """

- assert os.path.exists(cachedPath),
+ assert os.path.exists(cachedPath), (
+ "Cannot create link to missing cache file %s" % cachedPath
+ )

  try:
  # Try and make the hard link.
@@ -1562,17 +1833,46 @@ class CachingFileStore(AbstractFileStore):
  # Try and create a downloading entry if no entry exists.
  # Make sure to create a reference at the same time if it succeeds, to bill it against our job's space.
  # Don't create the mutable reference yet because we might not necessarily be able to clear that space.
- logger.debug(
-
-
-
-
+ logger.debug(
+ "Trying to make file downloading file record and reference for id %s",
+ fileStoreID,
+ )
+ self._write(
+ [
+ (
+ "INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)",
+ (
+ fileStoreID,
+ cachedPath,
+ self.getGlobalFileSize(fileStoreID),
+ "downloading",
+ me,
+ ),
+ ),
+ (
+ "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND state = ? AND owner = ?",
+ (
+ localFilePath,
+ readerID,
+ "immutable",
+ fileStoreID,
+ "downloading",
+ me,
+ ),
+ ),
+ ]
+ )

  # See if we won the race
- self._read(
+ self._read(
+ "SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?",
+ (fileStoreID, "downloading", me),
+ )
  if self.cur.fetchone()[0] > 0:
  # We are responsible for downloading the file (and we have the reference)
- logger.debug(
+ logger.debug(
+ "We are now responsible for downloading file %s", fileStoreID
+ )

  # Make sure we have space for this download.
  self._freeUpSpace()
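The `INSERT OR IGNORE` plus follow-up `SELECT` above is how exactly one worker wins the right to download a file. A small sketch of the same race, again with an invented schema rather than the module's actual one:

```python
# Sketch only: the files table and its columns are simplified assumptions.
import sqlite3

con = sqlite3.connect(":memory:")
cur = con.cursor()
cur.execute(
    "CREATE TABLE files (id TEXT PRIMARY KEY, path TEXT, size INTEGER, state TEXT, owner TEXT)"
)
con.commit()

def try_to_own_download(file_id: str, cache_path: str, size: int, me: str) -> bool:
    # Only one INSERT can take effect because id is the primary key;
    # everyone else's INSERT OR IGNORE is silently dropped.
    cur.execute(
        "INSERT OR IGNORE INTO files VALUES (?, ?, ?, 'downloading', ?)",
        (file_id, cache_path, size, me),
    )
    con.commit()
    # Check whether the surviving row is owned by us.
    cur.execute(
        "SELECT COUNT(*) FROM files WHERE id = ? AND state = 'downloading' AND owner = ?",
        (file_id, me),
    )
    return cur.fetchone()[0] > 0

print(try_to_own_download("file1", "/cache/file1", 1024, "workerA"))  # True: we won
print(try_to_own_download("file1", "/cache/file1", 1024, "workerB"))  # False: someone else owns it
```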
@@ -1586,8 +1886,14 @@ class CachingFileStore(AbstractFileStore):
  # We made the link!

  # Change file state from downloading to cached so other people can use it
- self._write(
-
+ self._write(
+ [
+ (
+ "UPDATE files SET state = ?, owner = NULL WHERE id = ?",
+ ("cached", fileStoreID),
+ )
+ ]
+ )

  # Now we're done!
  return localFilePath
@@ -1595,36 +1901,69 @@ class CachingFileStore(AbstractFileStore):
  # We could not make a link. We need to make a copy.

  # Change the reference to copying.
- self._write(
+ self._write(
+ [
+ (
+ "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
+ ("copying", localFilePath, fileStoreID),
+ )
+ ]
+ )

  # Fulfill it with a full copy or by giving away the cached copy
- self._fulfillCopyingReference(
+ self._fulfillCopyingReference(
+ fileStoreID, cachedPath, localFilePath
+ )

  # Now we're done
  return localFilePath

  else:
- logger.debug(
+ logger.debug(
+ "We already have an entry in the cache database for file %s",
+ fileStoreID,
+ )

  # A record already existed for this file.
  # Try and create an immutable reference to an entry that
  # is in 'cached' or 'uploadable' or 'uploading' state.
  # It might be uploading because *we* are supposed to be uploading it.
- logger.debug(
- self._write(
-
+ logger.debug("Trying to make reference to file %s", fileStoreID)
+ self._write(
+ [
+ (
+ "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)",
+ (
+ localFilePath,
+ readerID,
+ "immutable",
+ fileStoreID,
+ "cached",
+ "uploadable",
+ "uploading",
+ ),
+ )
+ ]
+ )

  # See if we got it
- self._read(
+ self._read(
+ "SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?",
+ (localFilePath, fileStoreID),
+ )
  if self.cur.fetchone()[0] > 0:
  # The file is cached and we can copy or link it
- logger.debug(
+ logger.debug("Obtained reference to file %s", fileStoreID)

  # Get the path it is actually at in the cache, instead of where we wanted to put it
- for row in self._read(
+ for row in self._read(
+ "SELECT path FROM files WHERE id = ?", (fileStoreID,)
+ ):
  cachedPath = row[0]

- if self._createLinkFromCache(
+ if self._createLinkFromCache(
+ cachedPath, localFilePath, symlink
+ ):
  # We managed to make the link
  return localFilePath
  else:
@@ -1636,11 +1975,22 @@ class CachingFileStore(AbstractFileStore):
  # we already have code for that for mutable downloads,
  # so just clear the reference and download mutably.

- self._write(
-
-
+ self._write(
+ [
+ (
+ "DELETE FROM refs WHERE path = ? AND file_id = ?",
+ (localFilePath, fileStoreID),
+ )
+ ]
+ )
+
+ return self._readGlobalFileMutablyWithCache(
+ fileStoreID, localFilePath, readerID
+ )
  else:
- logger.debug(
+ logger.debug(
+ "Could not obtain reference to file %s", fileStoreID
+ )

  # If we didn't get a download or a reference, adopt and do work from dead workers and loop again.
  # We may have to wait for someone else's download or delete to
@@ -1656,7 +2006,12 @@ class CachingFileStore(AbstractFileStore):
  time.sleep(self.contentionBackoff)

  @contextmanager
- def _with_copying_reference_to_upload(
+ def _with_copying_reference_to_upload(
+ self,
+ file_store_id: FileID,
+ reader_id: str,
+ local_file_path: Optional[str] = None,
+ ) -> Generator:
  """
  Get a context manager that gives you either the local file path for a
  copyuing reference to the given file, or None if that file is not in an
@@ -1678,12 +2033,28 @@ class CachingFileStore(AbstractFileStore):
  local_file_path = self.getLocalTempFileName()

  # Try and make a 'copying' reference to such a file
- self._write(
-
+ self._write(
+ [
+ (
+ "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ?)",
+ (
+ local_file_path,
+ reader_id,
+ "copying",
+ file_store_id,
+ "uploadable",
+ "uploading",
+ ),
+ )
+ ]
+ )

  # See if we got it
  have_reference = False
- for row in self._read(
+ for row in self._read(
+ "SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?",
+ (local_file_path, file_store_id),
+ ):
  have_reference = row[0] > 0

  if have_reference:
@@ -1692,8 +2063,14 @@ class CachingFileStore(AbstractFileStore):
  yield local_file_path
  finally:
  # Clean up the reference if it is unmodified
- self._write(
-
+ self._write(
+ [
+ (
+ "DELETE FROM refs WHERE path = ? AND file_id = ? AND state = ?",
+ (local_file_path, file_store_id, "copying"),
+ )
+ ]
+ )
  else:
  # No reference was obtained.
  yield None
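`_with_copying_reference_to_upload` is a generator-based context manager: it tries to create a short-lived reference, yields the path (or `None` when the reference could not be made), and always removes the unmodified reference in a `finally` block. A toy sketch of that shape, using a plain dict in place of the `refs` table (the dict and the function name are assumptions for illustration only):

```python
# Sketch of a claim-yield-cleanup context manager, not Toil's actual code.
from contextlib import contextmanager
from typing import Dict, Iterator, Optional

claims: Dict[str, str] = {}  # path -> file id, standing in for the refs table

@contextmanager
def with_claim(file_id: str, path: str, available: bool) -> Iterator[Optional[str]]:
    if available:
        claims[path] = file_id  # record the claim
        try:
            yield path
        finally:
            # Always drop the claim, even if the caller's block raised.
            claims.pop(path, None)
    else:
        # Nothing to claim; the caller checks for None and falls back.
        yield None

with with_claim("file1", "/tmp/ref1", available=True) as ref_path:
    print(ref_path)  # /tmp/ref1
print(claims)        # {} -- cleaned up on exit
```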
@@ -1702,11 +2079,13 @@ class CachingFileStore(AbstractFileStore):
  def readGlobalFileStream(self, fileStoreID, encoding=None, errors=None):
  if str(fileStoreID) in self.filesToDelete:
  # File has already been deleted
- raise FileNotFoundError(f
+ raise FileNotFoundError(f"Attempted to read deleted file: {fileStoreID}")

  self.logAccess(fileStoreID)

- with self._with_copying_reference_to_upload(
+ with self._with_copying_reference_to_upload(
+ fileStoreID, self.jobDesc.jobStoreID
+ ) as ref_path:
  # Try and grab a reference to the file if it is being uploaded.
  if ref_path is not None:
  # We have an update in the cache that isn't written back yet.
@@ -1715,11 +2094,16 @@ class CachingFileStore(AbstractFileStore):
  # The ref file is not actually copied to; find the actual file
  # in the cache
  cached_path = None
- for row in self._read(
+ for row in self._read(
+ "SELECT path FROM files WHERE id = ?", (fileStoreID,)
+ ):
  cached_path = row[0]

  if cached_path is None:
- raise RuntimeError(
+ raise RuntimeError(
+ "File %s went away while we had a reference to it!"
+ % fileStoreID
+ )

  with open(cached_path, encoding=encoding, errors=errors) as result:
  # Pass along the results of the open context manager on the
@@ -1730,7 +2114,9 @@ class CachingFileStore(AbstractFileStore):
  else:
  # No local update, so we can stream from the job store
  # TODO: Maybe stream from cache even when not required for consistency?
- with self.jobStore.read_file_stream(
+ with self.jobStore.read_file_stream(
+ fileStoreID, encoding=encoding, errors=errors
+ ) as result:
  yield result

  def deleteLocalFile(self, fileStoreID):
@@ -1743,7 +2129,10 @@ class CachingFileStore(AbstractFileStore):
  # missing ref file, we will raise an error about it and stop deleting
  # things.
  missingFile = None
- for row in self._read(
+ for row in self._read(
+ "SELECT path FROM refs WHERE file_id = ? AND job_id = ?",
+ (fileStoreID, jobID),
+ ):
  # Delete all the files that are references to this cached file (even mutable copies)
  path = row[0]

@@ -1764,12 +2153,22 @@ class CachingFileStore(AbstractFileStore):
  if len(deleted) == 0 and not missingFile:
  # We have to tell the user if they tried to delete 0 local copies.
  # But if we found a missing local copy, go on to report that instead.
- raise OSError(
+ raise OSError(
+ errno.ENOENT,
+ f"Attempting to delete local copies of a file with none: {fileStoreID}",
+ )

  for path in deleted:
  # Drop the references
- self._write(
-
+ self._write(
+ [
+ (
+ "DELETE FROM refs WHERE file_id = ? AND job_id = ? AND path = ?",
+ (fileStoreID, jobID, path),
+ )
+ ]
+ )
+ logger.debug("Deleted local file %s for global file %s", path, fileStoreID)

  # Now space has been revoked from the cache because that job needs its space back.
  # That might result in stuff having to be evicted.
@@ -1797,13 +2196,25 @@ class CachingFileStore(AbstractFileStore):
  with self.as_process() as me:

  # Make sure nobody else has references to it
- for row in self._read(
-
+ for row in self._read(
+ "SELECT job_id FROM refs WHERE file_id = ? AND state != ?",
+ (fileStoreID, "mutable"),
+ ):
+ raise RuntimeError(
+ f"Deleted file ID {fileStoreID} which is still in use by job {row[0]}"
+ )
  # TODO: should we just let other jobs and the cache keep the file until
  # it gets evicted, and only delete at the back end?

  # Pop the file into deleting state owned by us if it exists
- self._write(
+ self._write(
+ [
+ (
+ "UPDATE files SET state = ?, owner = ? WHERE id = ?",
+ ("deleting", me, fileStoreID),
+ )
+ ]
+ )

  # Finish the delete if the file is present
  self._executePendingDeletions()
@@ -1811,10 +2222,13 @@ class CachingFileStore(AbstractFileStore):
  # Add the file to the list of files to be deleted from the job store
  # once the run method completes.
  self.filesToDelete.add(str(fileStoreID))
- self.log_to_leader(
-
+ self.log_to_leader(
+ "Added file with ID '%s' to the list of files to be" % fileStoreID
+ + " globally deleted.",
+ level=logging.DEBUG,
+ )

- @deprecated(new_function_name=
+ @deprecated(new_function_name="export_file")
  def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None:
  return self.export_file(jobStoreFileID, dstUrl)

@@ -1845,7 +2259,10 @@ class CachingFileStore(AbstractFileStore):
  # thread. It can do some destructor work after it finishes its real
  # work.

- if
+ if (
+ self.commitThread is not None
+ and self.commitThread is not threading.current_thread()
+ ):
  self.commitThread.join()

  return True
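The parenthesized `if` added above guards the join so the commit thread never tries to join itself (joining the current thread raises `RuntimeError`). A standalone sketch of that guard, with hypothetical class and method names:

```python
# Sketch only: Committer and its methods are invented for illustration.
import threading
import time
from typing import Optional

class Committer:
    def __init__(self) -> None:
        self.commitThread: Optional[threading.Thread] = None

    def start_commit(self) -> None:
        # Run the commit work in the background.
        self.commitThread = threading.Thread(target=self._commit)
        self.commitThread.start()

    def _commit(self) -> None:
        time.sleep(0.1)  # pretend to flush uploads / job state
        self.wait_for_commit()  # safe: the guard skips joining ourselves

    def wait_for_commit(self) -> bool:
        # Only join when a thread exists and we are not that thread.
        if (
            self.commitThread is not None
            and self.commitThread is not threading.current_thread()
        ):
            self.commitThread.join()
        return True

c = Committer()
c.start_commit()
print(c.wait_for_commit())  # True, after the background commit finishes
```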
@@ -1872,17 +2289,23 @@ class CachingFileStore(AbstractFileStore):
  # might be necessary for later jobs to see earlier jobs' deleted
  # before they are committed?

- logger.debug(
+ logger.debug(
+ "Starting commit of %s forked from %s", state_to_commit, self.jobDesc
+ )
  # Make sure the deep copy isn't summoning ghosts of old job
  # versions. It must be as new or newer at this point.
- self.jobDesc.
+ self.jobDesc.assert_is_not_newer_than(state_to_commit)

  # Bump the original's version since saving will do that too and we
  # don't want duplicate versions.
- self.jobDesc.reserve_versions(
+ self.jobDesc.reserve_versions(
+ 1 if len(state_to_commit.filesToDelete) == 0 else 2
+ )

  # Start the commit thread
- self.commitThread = threading.Thread(
+ self.commitThread = threading.Thread(
+ target=self.startCommitThread, args=(state_to_commit,)
+ )
  self.commitThread.start()

  def startCommitThread(self, state_to_commit: Optional[JobDescription]):
@@ -1895,7 +2318,7 @@ class CachingFileStore(AbstractFileStore):
  self.waitForPreviousCommit()

  try:
- logger.debug(
+ logger.debug("Committing file uploads asynchronously")

  # Finish all uploads
  self._executePendingUploads()
@@ -1905,7 +2328,10 @@ class CachingFileStore(AbstractFileStore):
  if state_to_commit is not None:
  # Do all the things that make this job not redoable

- logger.debug(
+ logger.debug(
+ "Committing file deletes and job state changes asynchronously from %s",
+ state_to_commit,
+ )

  # Complete the job
  self.jobStore.update_job(state_to_commit)
@@ -1921,10 +2347,8 @@ class CachingFileStore(AbstractFileStore):
  self._terminateEvent.set()
  raise

-
-
  @classmethod
- def shutdown(cls, shutdown_info:
+ def shutdown(cls, shutdown_info: tuple[str, str]) -> None:
  """
  :param shutdown_info: Tuple of the coordination directory (where the
  cache database is) and the cache directory (where the cached data is).
@@ -1951,7 +2375,7 @@ class CachingFileStore(AbstractFileStore):
  # So we just go and find the cache-n.db with the largest n value,
  # and use that.
  dbFilename = None
- dbAttempt = float(
+ dbAttempt = float("-inf")

  # We also need to remember all the plausible database files and
  # journals
@@ -1959,12 +2383,15 @@ class CachingFileStore(AbstractFileStore):

  for dbCandidate in os.listdir(coordination_dir):
  # For each thing in the coordination directory, see if it starts like a database file.
- match = re.match(
+ match = re.match("^cache-([0-9]+).db.*", dbCandidate)
  if match:
  # This is caching-related.
  all_db_files.append(dbCandidate)
  attempt_number = int(match.group(1))
- if
+ if (
+ attempt_number > dbAttempt
+ and dbCandidate == f"cache-{attempt_number}.db"
+ ):
  # This is a main database, and the newest we have seen.
  dbFilename = dbCandidate
  dbAttempt = attempt_number
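The scan above picks the newest main database by matching `cache-<attempt>.db` names and skipping journal files whose names merely start that way. A sketch of the same selection logic over a made-up directory listing:

```python
# Sketch only: the listing and function name are invented for illustration.
import re
from typing import Optional

def newest_cache_db(candidates: list[str]) -> Optional[str]:
    db_filename = None
    db_attempt = float("-inf")
    for candidate in candidates:
        match = re.match(r"^cache-([0-9]+)\.db.*", candidate)
        if match:
            attempt_number = int(match.group(1))
            # Only a bare cache-N.db is a main database; names like
            # cache-N.db-wal are journals and are skipped here.
            if attempt_number > db_attempt and candidate == f"cache-{attempt_number}.db":
                db_filename = candidate
                db_attempt = attempt_number
    return db_filename

listing = ["cache-1.db", "cache-2.db", "cache-2.db-wal", "notes.txt"]
print(newest_cache_db(listing))  # cache-2.db
```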
@@ -1972,7 +2399,9 @@ class CachingFileStore(AbstractFileStore):
  if dbFilename is not None:
  # We found a caching database

- logger.debug(
+ logger.debug(
+ "Connecting to latest caching database %s for cleanup", dbFilename
+ )

  dbPath = os.path.join(coordination_dir, dbFilename)

@@ -1996,7 +2425,7 @@ class CachingFileStore(AbstractFileStore):

  con.close()
  else:
- logger.debug(
+ logger.debug("No caching database found in %s", dir_)

  # Whether or not we found a database, we need to clean up the cache
  # directory. Delete everything cached.
@@ -2033,7 +2462,9 @@ class CachingFileStore(AbstractFileStore):

  # Get all the dead worker PIDs
  workers = []
- for row in cls._static_read(
+ for row in cls._static_read(
+ cur, "SELECT DISTINCT worker FROM jobs WHERE worker IS NOT NULL"
+ ):
  workers.append(row[0])

  # Work out which of them are not currently running.
@@ -2046,14 +2477,18 @@ class CachingFileStore(AbstractFileStore):
  # Now we know which workers are dead.
  # Clear them off of the jobs they had.
  for deadWorker in deadWorkers:
- cls._static_write(
+ cls._static_write(
+ con,
+ cur,
+ [("UPDATE jobs SET worker = NULL WHERE worker = ?", (deadWorker,))],
+ )
  if len(deadWorkers) > 0:
- logger.debug(
+ logger.debug("Reaped %d dead workers", len(deadWorkers))

  while True:
  # Find an unowned job.
  # Don't take all of them; other people could come along and want to help us with the other jobs.
- cls._static_read(cur,
+ cls._static_read(cur, "SELECT id FROM jobs WHERE worker IS NULL LIMIT 1")
  row = cur.fetchone()
  if row is None:
  # We cleaned up all the jobs
@@ -2062,10 +2497,23 @@ class CachingFileStore(AbstractFileStore):
  jobID = row[0]

  # Try to own this job
- cls._static_write(
+ cls._static_write(
+ con,
+ cur,
+ [
+ (
+ "UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL",
+ (me, jobID),
+ )
+ ],
+ )

  # See if we won the race
- cls._static_read(
+ cls._static_read(
+ cur,
+ "SELECT id, tempdir FROM jobs WHERE id = ? AND worker = ?",
+ (jobID, me),
+ )
  row = cur.fetchone()
  if row is None:
  # We didn't win the race. Try another one.
@@ -2074,6 +2522,6 @@ class CachingFileStore(AbstractFileStore):
  # If we did win, delete the job and its files and temp dir
  cls._removeJob(con, cur, jobID)

- logger.debug(
+ logger.debug("Cleaned up orphaned job %s", jobID)

  # Now we have cleaned up all the jobs that belonged to dead workers that were dead when we entered this function.