toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/fileStores/cachingFileStore.py

@@ -22,15 +22,10 @@ import sqlite3
 import stat
 import threading
 import time
+from collections.abc import Generator, Iterator, Sequence
 from contextlib import contextmanager
 from tempfile import mkstemp
-from typing import (
-    Callable,
-    Generator,
-    Iterator,
-    Optional,
-    Sequence,
-    Tuple)
+from typing import Any, Callable, Optional
 
 from toil.common import cacheDirName, getFileSystemSize
 from toil.fileStores import FileID
@@ -38,11 +33,13 @@ from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
-from toil.lib.io import (atomic_copy,
-                         atomic_copyobj,
-                         make_public_dir,
-                         mkdtemp,
-                         robust_rmtree)
+from toil.lib.io import (
+    atomic_copy,
+    atomic_copyobj,
+    make_public_dir,
+    mkdtemp,
+    robust_rmtree,
+)
 from toil.lib.retry import ErrorCondition, retry
 from toil.lib.threading import get_process_name, process_name_exists
 
@@ -66,9 +63,12 @@ class CacheUnbalancedError(CacheError):
     """
     Raised if file store can't free enough space for caching
     """
-
-
-
+
+    message = (
+        "Unable unable to free enough space for caching. This error frequently arises due "
+        "to jobs using more disk than they have requested. Turn on debug logging to see "
+        "more information leading up to this error through cache usage logs."
+    )
 
     def __init__(self):
         super().__init__(self.message)
@@ -87,9 +87,11 @@ class IllegalDeletionCacheError(CacheError):
     """
 
     def __init__(self, deletedFile):
-        message =
-
-
+        message = (
+            "Cache tracked file (%s) has been deleted or moved by user "
+            " without updating cache database. Use deleteLocalFile to "
+            "delete such files." % deletedFile
+        )
         super().__init__(message)
 
 
@@ -208,13 +210,15 @@ class CachingFileStore(AbstractFileStore):
         # Variables related to caching
         # Decide where the cache directory will be. We put it in the local
         # workflow directory.
-        self.localCacheDir = os.path.join(
+        self.localCacheDir = os.path.join(
+            self.workflow_dir, cacheDirName(self.jobStore.config.workflowID)
+        )
 
         # Since each worker has it's own unique CachingFileStore instance, and only one Job can run
         # at a time on a worker, we can track some stuff about the running job in ourselves.
         self.jobName: str = str(self.jobDesc)
         self.jobID = self.jobDesc.jobStoreID
-        logger.debug(
+        logger.debug("Starting job (%s) with ID (%s).", self.jobName, self.jobID)
 
         # When the job actually starts, we will fill this in with the job's disk requirement.
         self.jobDiskBytes: Optional[float] = None
@@ -230,7 +234,9 @@ class CachingFileStore(AbstractFileStore):
         # the workflow left one behind without cleaning up properly; we need to
         # be able to tell that from showing up on a machine where a cache has
         # already been created.
-        self.dbPath = os.path.join(
+        self.dbPath = os.path.join(
+            self.coordination_dir, f"cache-{self.workflowAttemptNumber}.db"
+        )
 
         # Database connections are provided by magic properties self.con and
         # self.cur that always have the right object for the current thread to
@@ -254,7 +260,14 @@ class CachingFileStore(AbstractFileStore):
 
         # Initialize the space accounting properties
         freeSpace, _ = getFileSystemSize(self.localCacheDir)
-        self._write(
+        self._write(
+            [
+                (
+                    "INSERT OR IGNORE INTO properties VALUES (?, ?)",
+                    ("maxSpace", freeSpace),
+                )
+            ]
+        )
 
         # Space used by caching and by jobs is accounted with queries
 
@@ -284,10 +297,12 @@ class CachingFileStore(AbstractFileStore):
         """
         Get the database connection to be used for the current thread.
         """
-        if not hasattr(self._thread_local,
+        if not hasattr(self._thread_local, "con"):
             # Connect to the database for this thread.
             # TODO: We assume the connection closes when the thread goes away and can no longer use it.
-            self._thread_local.con = sqlite3.connect(
+            self._thread_local.con = sqlite3.connect(
+                self.dbPath, timeout=SQLITE_TIMEOUT_SECS
+            )
         return self._thread_local.con
 
     @property
@@ -295,18 +310,20 @@ class CachingFileStore(AbstractFileStore):
         """
         Get the main cursor to be used for the current thread.
         """
-        if not hasattr(self._thread_local,
+        if not hasattr(self._thread_local, "cur"):
             # If we don't already have a main cursor for the thread, make one.
             self._thread_local.cur = self.con.cursor()
         return self._thread_local.cur
 
     @staticmethod
-    @retry(
-
-
-
-
-
+    @retry(
+        infinite_retries=True,
+        errors=[
+            ErrorCondition(
+                error=sqlite3.OperationalError, error_message_must_include="is locked"
+            )
+        ],
+    )
     def _static_write(con, cur, operations):
         """
         Write to the caching database, using the given connection.
@@ -340,7 +357,7 @@ class CachingFileStore(AbstractFileStore):
                 # Do it
                 cur.execute(command, args)
             except Exception as e:
-                logging.error(
+                logging.error("Error talking to caching database: %s", str(e))
 
                 # Try to make sure we don't somehow leave anything part-done if a
                 # middle operation somehow fails.
@@ -360,13 +377,17 @@ class CachingFileStore(AbstractFileStore):
         return cur.rowcount
 
     @staticmethod
-    @retry(
-
-
-
-
-
-
+    @retry(
+        infinite_retries=True,
+        errors=[
+            ErrorCondition(
+                error=sqlite3.OperationalError, error_message_must_include="is locked"
+            )
+        ],
+    )
+    def _static_read(
+        cur: sqlite3.Cursor, query: str, args: Optional[Sequence[Any]] = ()
+    ) -> Iterator[Any]:
         """
         Read from the database.
 
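The reformatted decorators above make the database strategy easy to see: each thread lazily opens its own SQLite connection, and reads and writes are retried for as long as another process holds the lock. A minimal standalone sketch of that pattern follows; the `DB_PATH` constant, timeout value, and backoff loop are illustrative assumptions, not Toil's actual `@retry` helper or `SQLITE_TIMEOUT_SECS`:

```python
import sqlite3
import threading
import time

DB_PATH = "cache.db"  # hypothetical; Toil derives this from its coordination dir
_local = threading.local()

def get_connection() -> sqlite3.Connection:
    # Lazily create one connection per thread, as the con/cur properties above do.
    if not hasattr(_local, "con"):
        _local.con = sqlite3.connect(DB_PATH, timeout=60)
    return _local.con

def write_with_retry(operations, max_tries=100):
    # Keep retrying while another worker holds the database lock, mirroring
    # @retry(..., error_message_must_include="is locked") in the diff above.
    for attempt in range(max_tries):
        try:
            con = get_connection()
            cur = con.cursor()
            for command, args in operations:
                cur.execute(command, args)
            con.commit()
            return cur.rowcount
        except sqlite3.OperationalError as e:
            if "is locked" not in str(e):
                raise
            con.rollback()  # undo any partially applied operations
            time.sleep(0.1 * attempt)
    raise RuntimeError("database stayed locked")
```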
@@ -419,7 +440,11 @@ class CachingFileStore(AbstractFileStore):
         # Get a cursor
         cur = con.cursor()
 
-        cls._static_write(
+        cls._static_write(
+            con,
+            cur,
+            [
+                """
                 CREATE TABLE IF NOT EXISTS files (
                     id TEXT NOT NULL PRIMARY KEY,
                     path TEXT UNIQUE NOT NULL,
@@ -427,7 +452,8 @@ class CachingFileStore(AbstractFileStore):
                     state TEXT NOT NULL,
                     owner TEXT
                 )
-            """,
+                """,
+                """
                 CREATE TABLE IF NOT EXISTS refs (
                     path TEXT NOT NULL,
                     file_id TEXT NOT NULL,
@@ -435,19 +461,23 @@ class CachingFileStore(AbstractFileStore):
                     state TEXT NOT NULL,
                     PRIMARY KEY (path, file_id)
                 )
-            """,
+                """,
+                """
                 CREATE TABLE IF NOT EXISTS jobs (
                     id TEXT NOT NULL PRIMARY KEY,
                     tempdir TEXT NOT NULL,
                     disk INT NOT NULL,
                     worker TEXT
                 )
-            """,
+                """,
+                """
                 CREATE TABLE IF NOT EXISTS properties (
                     name TEXT NOT NULL PRIMARY KEY,
                     value INT NOT NULL
                 )
-            """
+                """,
+            ],
+        )
 
     # Caching-specific API
 
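For reference, the four tables above can be recreated in isolation. The sketch below assumes the two columns elided between hunks (`files.size` and `refs.job_id`), which the `INSERT INTO files VALUES (?, ?, ?, ?, ?)` and `INSERT INTO refs VALUES (?, ?, ?, ?)` statements elsewhere in this diff imply:

```python
import sqlite3

def open_cache_db(db_path: str) -> sqlite3.Connection:
    """Idempotently create the caching file store's four tables."""
    con = sqlite3.connect(db_path)
    cur = con.cursor()
    # files: one row per cached copy; refs: per-job references to those copies;
    # jobs: per-job disk reservations; properties: shared scalars like maxSpace.
    cur.execute(
        """CREATE TABLE IF NOT EXISTS files (
               id TEXT NOT NULL PRIMARY KEY,
               path TEXT UNIQUE NOT NULL,
               size INT NOT NULL,  -- column inferred from the INSERTs in this diff
               state TEXT NOT NULL,
               owner TEXT
           )"""
    )
    cur.execute(
        """CREATE TABLE IF NOT EXISTS refs (
               path TEXT NOT NULL,
               file_id TEXT NOT NULL,
               job_id TEXT NOT NULL,  -- column inferred from the queries in this diff
               state TEXT NOT NULL,
               PRIMARY KEY (path, file_id)
           )"""
    )
    cur.execute(
        """CREATE TABLE IF NOT EXISTS jobs (
               id TEXT NOT NULL PRIMARY KEY,
               tempdir TEXT NOT NULL,
               disk INT NOT NULL,
               worker TEXT
           )"""
    )
    cur.execute(
        """CREATE TABLE IF NOT EXISTS properties (
               name TEXT NOT NULL PRIMARY KEY,
               value INT NOT NULL
           )"""
    )
    con.commit()
    return con
```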
@@ -458,10 +488,12 @@ class CachingFileStore(AbstractFileStore):
         If no limit is available, raises an error.
         """
 
-        for row in self.cur.execute(
+        for row in self.cur.execute(
+            "SELECT value FROM properties WHERE name = ?", ("maxSpace",)
+        ):
             return row[0]
 
-        raise RuntimeError(
+        raise RuntimeError("Unable to retrieve cache limit")
 
     def getCacheUsed(self):
         """
@@ -474,10 +506,10 @@ class CachingFileStore(AbstractFileStore):
         if self.cachingIsFree():
             return 0
 
-        for row in self._read(
+        for row in self._read("SELECT TOTAL(size) FROM files"):
             return row[0]
 
-        raise RuntimeError(
+        raise RuntimeError("Unable to retrieve cache usage")
 
     def getCacheExtraJobSpace(self):
         """
@@ -492,15 +524,17 @@ class CachingFileStore(AbstractFileStore):
         """
 
         # Total up the sizes of all the reads of files and subtract it from the total disk reservation of all jobs
-        for row in self._read(
+        for row in self._read(
+            """
             SELECT (
                 (SELECT TOTAL(disk) FROM jobs) -
                 (SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state == 'immutable')
             ) as result
-            """
+            """
+        ):
             return row[0]
 
-        raise RuntimeError(
+        raise RuntimeError("Unable to retrieve extra job space")
 
     def getCacheAvailable(self):
         """
@@ -519,33 +553,38 @@ class CachingFileStore(AbstractFileStore):
 
         # Do a little report first
         for row in self._read("SELECT value FROM properties WHERE name = 'maxSpace'"):
-            logger.debug(
+            logger.debug("Max space: %d", row[0])
         for row in self._read("SELECT TOTAL(size) FROM files"):
-            logger.debug(
+            logger.debug("Total file size: %d", row[0])
         for row in self._read("SELECT TOTAL(disk) FROM jobs"):
-            logger.debug(
-        for row in self._read(
-
+            logger.debug("Total job disk requirement size: %d", row[0])
+        for row in self._read(
+            "SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable'"
+        ):
+            logger.debug("Total immutable reference size: %d", row[0])
 
         if self.cachingIsFree():
             # If caching is free, we just say that all the space is always available.
-            for row in self._read(
+            for row in self._read(
+                "SELECT value FROM properties WHERE name = 'maxSpace'"
+            ):
                 return row[0]
 
-            raise RuntimeError(
+            raise RuntimeError("Unable to retrieve available cache space")
 
-
-
+        for row in self._read(
+            """
             SELECT (
                 (SELECT value FROM properties WHERE name = 'maxSpace') -
                 (SELECT TOTAL(size) FROM files) -
                 ((SELECT TOTAL(disk) FROM jobs) -
                 (SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable'))
             ) as result
-            """
+            """
+        ):
             return row[0]
 
-        raise RuntimeError(
+        raise RuntimeError("Unable to retrieve available cache space")
 
     def getSpaceUsableForJobs(self):
         """
@@ -555,15 +594,17 @@ class CachingFileStore(AbstractFileStore):
         If not retrievable, raises an error.
         """
 
-        for row in self._read(
+        for row in self._read(
+            """
             SELECT (
                 (SELECT value FROM properties WHERE name = 'maxSpace') -
                 (SELECT TOTAL(disk) FROM jobs)
             ) as result
-            """
+            """
+        ):
             return row[0]
 
-        raise RuntimeError(
+        raise RuntimeError("Unable to retrieve usabel space for jobs")
 
     def getCacheUnusedJobRequirement(self):
         """
@@ -575,28 +616,36 @@ class CachingFileStore(AbstractFileStore):
         If no value is available, raises an error.
         """
 
-        logger.debug(
-
-        for row in self._read('SELECT * FROM files'):
-            logger.debug('File record: %s', str(row))
+        logger.debug("Get unused space for job %s", self.jobID)
 
-        for row in self._read(
-            logger.debug(
+        for row in self._read("SELECT * FROM files"):
+            logger.debug("File record: %s", str(row))
 
+        for row in self._read("SELECT * FROM refs"):
+            logger.debug("Ref record: %s", str(row))
 
-        for row in self._read(
-            (
+        for row in self._read(
+            "SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.job_id = ? AND refs.state != ?",
+            (self.jobID, "mutable"),
+        ):
             # Sum up all the sizes of our referenced files, then subtract that from how much we came in with
             return self.jobDiskBytes - row[0]
 
-        raise RuntimeError(
+        raise RuntimeError("Unable to retrieve unused job requirement space")
 
     def adjustCacheLimit(self, newTotalBytes):
         """
         Adjust the total cache size limit to the given number of bytes.
         """
 
-        self._write(
+        self._write(
+            [
+                (
+                    "UPDATE properties SET value = ? WHERE name = ?",
+                    (newTotalBytes, "maxSpace"),
+                )
+            ]
+        )
 
     def fileIsCached(self, fileID):
         """
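The getters above all derive from one accounting identity: available space = maxSpace − bytes held by cached files − (job disk reservations − bytes already counted via immutable references). A condensed sketch of the core query in `getCacheAvailable`, run against a cursor from the schema sketch earlier:

```python
AVAILABLE_SQL = """
    SELECT (
        (SELECT value FROM properties WHERE name = 'maxSpace') -
        (SELECT TOTAL(size) FROM files) -
        ((SELECT TOTAL(disk) FROM jobs) -
         (SELECT TOTAL(files.size) FROM refs
          INNER JOIN files ON refs.file_id = files.id
          WHERE refs.state = 'immutable'))
    ) AS result
"""

def cache_available(cur) -> int:
    # Immutable references are links into the cache, so their bytes are already
    # counted once under files.size; subtracting them from the job reservations
    # avoids double-counting the same data.
    row = cur.execute(AVAILABLE_SQL).fetchone()
    if row is None:
        raise RuntimeError("Unable to retrieve available cache space")
    return row[0]
```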
@@ -607,8 +656,10 @@ class CachingFileStore(AbstractFileStore):
         file you need to do it in a transaction.
         """
 
-        for row in self._read(
-            (
+        for row in self._read(
+            "SELECT COUNT(*) FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)",
+            (fileID, "cached", "uploadable", "uploading"),
+        ):
 
             return row[0] > 0
         return False
@@ -620,7 +671,7 @@ class CachingFileStore(AbstractFileStore):
         Counts mutable references too.
         """
 
-        for row in self._read(
+        for row in self._read("SELECT COUNT(*) FROM refs WHERE file_id = ?", (fileID,)):
             return row[0]
         return 0
 
@@ -633,11 +684,14 @@ class CachingFileStore(AbstractFileStore):
         configurations, most notably the FileJobStore.
         """
 
-        for row in self._read(
+        for row in self._read(
+            "SELECT value FROM properties WHERE name = ?", ("freeCaching",)
+        ):
             return row[0] == 1
 
         # Otherwise we need to set it
         from toil.jobStores.fileJobStore import FileJobStore
+
         if isinstance(self.jobStore, FileJobStore) and not self.forceNonFreeCaching:
             # Caching may be free since we are using a file job store.
 
@@ -646,7 +700,7 @@ class CachingFileStore(AbstractFileStore):
 
             # Read it out to a generated name.
             destDir = mkdtemp(dir=self.localCacheDir)
-            cachedFile = os.path.join(destDir,
+            cachedFile = os.path.join(destDir, "sniffLinkCount")
             self.jobStore.read_file(emptyID, cachedFile, symlink=False)
 
             # Check the link count
@@ -666,7 +720,9 @@ class CachingFileStore(AbstractFileStore):
                 free = 0
 
             # Save to the database if we're the first to work this out
-            self._write(
+            self._write(
+                [("INSERT OR IGNORE INTO properties VALUES (?, ?)", ("freeCaching", free))]
+            )
 
         # Return true if we said caching was free
         return free == 1
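`cachingIsFree` above decides whether cached copies cost real disk by reading an empty probe file out of the job store with `symlink=False` and checking its hard-link count: if the job store could hard-link it (as a FileJobStore on the same filesystem can), a cache copy consumes no extra space. A generic sketch of that link-count sniff, using a plain source path rather than Toil's job-store API:

```python
import os
import shutil
import tempfile

def caching_is_free(source_path: str) -> bool:
    """Probe whether a 'read' from source_path can share an inode with it."""
    dest_dir = tempfile.mkdtemp()
    probe = os.path.join(dest_dir, "sniffLinkCount")
    try:
        os.link(source_path, probe)  # what a same-filesystem read can amount to
    except OSError:
        shutil.copy(source_path, probe)  # cross-device: a real copy
    # A link count >= 2 means no extra bytes were spent on the cached copy.
    free = os.stat(probe).st_nlink >= 2
    shutil.rmtree(dest_dir)
    return free
```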
@@ -683,7 +739,7 @@ class CachingFileStore(AbstractFileStore):
 
         # Hash the file ID
         hasher = hashlib.sha1()
-        hasher.update(fileStoreID.encode(
+        hasher.update(fileStoreID.encode("utf-8"))
 
         # Get a unique temp file name, including the file ID's hash to make
         # sure we can never collide even though we are going to remove the
@@ -707,17 +763,19 @@ class CachingFileStore(AbstractFileStore):
         # Get a list of all file owner processes on this node.
         # Exclude NULL because it comes out as 0 and we can't look for PID 0.
         owners = []
-        for row in self._read(
+        for row in self._read(
+            "SELECT DISTINCT owner FROM files WHERE owner IS NOT NULL"
+        ):
             owners.append(row[0])
 
         # Work out which of them have died.
         deadOwners = []
         for owner in owners:
             if not process_name_exists(self.coordination_dir, owner):
-                logger.debug(
+                logger.debug("Owner %s is dead", owner)
                 deadOwners.append(owner)
             else:
-                logger.debug(
+                logger.debug("Owner %s is alive", owner)
 
         for owner in deadOwners:
             # Try and adopt all the files that any dead owner had
@@ -736,14 +794,28 @@ class CachingFileStore(AbstractFileStore):
             #
             # TODO: if we ever let other PIDs be responsible for writing our
             # files asynchronously, this will need to change.
-            self._write(
-
-
-
-
-
+            self._write(
+                [
+                    (
+                        "UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?",
+                        (me, "deleting", owner, "deleting"),
+                    ),
+                    (
+                        "UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?",
+                        (me, "deleting", owner, "downloading"),
+                    ),
+                    (
+                        "UPDATE files SET owner = NULL, state = ? WHERE owner = ? AND (state = ? OR state = ?)",
+                        ("cached", owner, "uploadable", "uploading"),
+                    ),
+                ]
+            )
 
-            logger.debug(
+            logger.debug(
+                "Tried to adopt file operations from dead worker %s to ourselves as %s",
+                owner,
+                me,
+            )
 
     def _executePendingDeletions(self):
         """
@@ -757,16 +829,19 @@ class CachingFileStore(AbstractFileStore):
 
         # Remember the file IDs we are deleting
         deletedFiles = []
-        for row in self._read(
+        for row in self._read(
+            "SELECT id, path FROM files WHERE owner = ? AND state = ?",
+            (me, "deleting"),
+        ):
             # Grab everything we are supposed to delete and delete it
             fileID = row[0]
             filePath = row[1]
             try:
                 os.unlink(filePath)
-                logger.debug(
+                logger.debug("Successfully deleted: %s", filePath)
             except OSError:
                 # Probably already deleted
-                logger.debug(
+                logger.debug("File already gone: %s", filePath)
                 # Still need to mark it as deleted
 
             # Whether we deleted the file or just found out that it is gone, we
@@ -777,8 +852,15 @@ class CachingFileStore(AbstractFileStore):
         for fileID in deletedFiles:
             # Drop all the files. They should have stayed in deleting state. We move them from there to not present at all.
             # Also drop their references, if they had any from dead downloaders.
-            self._write(
-
+            self._write(
+                [
+                    (
+                        "DELETE FROM files WHERE id = ? AND state = ?",
+                        (fileID, "deleting"),
+                    ),
+                    ("DELETE FROM refs WHERE file_id = ?", (fileID,)),
+                ]
+            )
 
         return len(deletedFiles)
 
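The adoption step above re-owns a dead worker's in-flight files with three conditional UPDATEs. A sketch of the same state machine, assuming the schema sketched earlier:

```python
import sqlite3

def adopt_orphaned_files(con: sqlite3.Connection, dead_owner: str, me: str) -> None:
    # The dead worker's pending deletions stay deletions, but now ours.
    con.execute(
        "UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?",
        (me, "deleting", dead_owner, "deleting"),
    )
    # Its half-finished downloads can't be trusted, so they become deletions too.
    con.execute(
        "UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?",
        (me, "deleting", dead_owner, "downloading"),
    )
    # Its pending uploads go back to being plain, unowned cached copies.
    con.execute(
        "UPDATE files SET owner = NULL, state = ? WHERE owner = ? AND (state = ? OR state = ?)",
        ("cached", dead_owner, "uploadable", "uploading"),
    )
    con.commit()
```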
@@ -798,7 +880,11 @@ class CachingFileStore(AbstractFileStore):
             # Try and find a file we might want to upload
             fileID = None
             filePath = None
-            for row in self._static_read(
+            for row in self._static_read(
+                self.cur,
+                "SELECT id, path FROM files WHERE state = ? AND owner = ? LIMIT 1",
+                ("uploadable", me),
+            ):
                 fileID = row[0]
                 filePath = row[1]
 
@@ -807,30 +893,57 @@ class CachingFileStore(AbstractFileStore):
                 break
 
             # We need to set it to uploading in a way that we can detect that *we* won the update race instead of anyone else.
-            rowCount = self._static_write(
+            rowCount = self._static_write(
+                self.con,
+                self.cur,
+                [
+                    (
+                        "UPDATE files SET state = ? WHERE id = ? AND state = ?",
+                        ("uploading", fileID, "uploadable"),
+                    )
+                ],
+            )
             if rowCount != 1:
                 # We didn't manage to update it. Someone else (a running job if
                 # we are a committing thread, or visa versa) must have grabbed
                 # it.
-                logger.debug(
+                logger.debug("Lost race to upload %s", fileID)
                 # Try again to see if there is something else to grab.
                 continue
 
             # Upload the file
-            logger.debug(
+            logger.debug("Actually executing upload for file %s", fileID)
             try:
                 self.jobStore.update_file(fileID, filePath)
             except:
                 # We need to set the state back to 'uploadable' in case of any failures to ensure
                 # we can retry properly.
-                self._static_write(
+                self._static_write(
+                    self.con,
+                    self.cur,
+                    [
+                        (
+                            "UPDATE files SET state = ? WHERE id = ? AND state = ?",
+                            ("uploadable", fileID, "uploading"),
+                        )
+                    ],
+                )
                 raise
 
             # Count it for the total uploaded files value we need to return
             uploadedCount += 1
 
             # Remember that we uploaded it in the database
-            self._static_write(
+            self._static_write(
+                self.con,
+                self.cur,
+                [
+                    (
+                        "UPDATE files SET state = ?, owner = NULL WHERE id = ?",
+                        ("cached", fileID),
+                    )
+                ],
+            )
 
         return uploadedCount
 
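The upload path above resolves its race with a compare-and-swap style UPDATE: whoever flips the row from 'uploadable' to 'uploading' sees a rowcount of 1 and owns the upload, and the failure path flips it back. A condensed sketch of that claim step:

```python
import sqlite3

def try_claim_upload(con: sqlite3.Connection, file_id: str) -> bool:
    # Flip state uploadable -> uploading only if nobody beat us to it.
    cur = con.execute(
        "UPDATE files SET state = ? WHERE id = ? AND state = ?",
        ("uploading", file_id, "uploadable"),
    )
    con.commit()
    # rowcount == 1 exactly when our UPDATE matched, i.e. we won the race.
    return cur.rowcount == 1
```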
@@ -862,7 +982,7 @@ class CachingFileStore(AbstractFileStore):
 
         available = self.getCacheAvailable()
 
-        logger.debug(
+        logger.debug("Available space with job: %d bytes", available)
 
         if available >= 0:
             # We're fine on disk space
@@ -854,7 +967,14 @@ class CachingFileStore(AbstractFileStore):
         # But we won't actually let the job run and use any of this space until
         # the cache has been successfully cleared out.
         with self.as_process() as me:
-            self._write(
+            self._write(
+                [
+                    (
+                        "INSERT INTO jobs VALUES (?, ?, ?, ?)",
+                        (self.jobID, self.localTempDir, newJobReqs, me),
+                    )
+                ]
+            )
 
         # Now we need to make sure that we can fit all currently cached files,
         # and the parts of the total job requirements not currently spent on
@@ -886,10 +1006,14 @@ class CachingFileStore(AbstractFileStore):
         """
 
         # Get the job's temp dir
-        for row in cls._static_read(
+        for row in cls._static_read(
+            cur, "SELECT tempdir FROM jobs WHERE id = ?", (jobID,)
+        ):
             jobTemp = row[0]
 
-        for row in cls._static_read(
+        for row in cls._static_read(
+            cur, "SELECT path FROM refs WHERE job_id = ?", (jobID,)
+        ):
             try:
                 # Delete all the reference files.
                 os.unlink(row[0])
@@ -897,7 +1021,7 @@ class CachingFileStore(AbstractFileStore):
                 # May not exist
                 pass
         # And their database entries
-        cls._static_write(con, cur, [(
+        cls._static_write(con, cur, [("DELETE FROM refs WHERE job_id = ?", (jobID,))])
 
         try:
             # Delete the job's temp directory to the extent that we can.
@@ -906,7 +1030,7 @@ class CachingFileStore(AbstractFileStore):
             pass
 
         # Strike the job from the database
-        cls._static_write(con, cur, [(
+        cls._static_write(con, cur, [("DELETE FROM jobs WHERE id = ?", (jobID,))])
 
     def _deallocateSpaceForJob(self):
         """
@@ -937,12 +1061,12 @@ class CachingFileStore(AbstractFileStore):
         if self._executePendingDeletions() > 0:
             # We actually had something to delete, which we deleted.
             # Maybe there is space now
-            logger.debug(
+            logger.debug("Successfully executed pending deletions to free space")
             return True
 
         if self._executePendingUploads() > 0:
             # We had something to upload. Maybe it can be evicted now.
-            logger.debug(
+            logger.debug("Successfully executed pending uploads to free space")
             return True
 
         # Otherwise, not enough files could be found in deleting state to solve our problem.
@@ -952,37 +1076,45 @@ class CachingFileStore(AbstractFileStore):
         # soon as we hit the cache limit.
 
         # Find something that has no non-mutable references and is not already being deleted.
-        self._read(
+        self._read(
+            """
             SELECT files.id FROM files WHERE files.state = 'cached' AND NOT EXISTS (
                 SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
             ) LIMIT 1
-            """
+            """
+        )
         row = self.cur.fetchone()
         if row is None:
             # Nothing can be evicted by us.
             # Someone else might be in the process of evicting something that will free up space for us too.
             # Or someone mught be uploading something and we have to wait for them to finish before it can be deleted.
-            logger.debug(
+            logger.debug("Could not find anything to evict! Cannot free up space!")
            return False
 
         # Otherwise we found an eviction candidate.
         fileID = row[0]
 
         # Try and grab it for deletion, subject to the condition that nothing has started reading it
-        self._write(
+        self._write(
+            [
+                (
+                    """
             UPDATE files SET owner = ?, state = ? WHERE id = ? AND state = ?
             AND owner IS NULL AND NOT EXISTS (
                 SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
             )
             """,
-
+                    (me, "deleting", fileID, "cached"),
+                )
+            ]
+        )
 
-        logger.debug(
+        logger.debug("Evicting file %s", fileID)
 
         # Whether we actually got it or not, try deleting everything we have to delete
         if self._executePendingDeletions() > 0:
             # We deleted something
-            logger.debug(
+            logger.debug("Successfully executed pending deletions to free space")
             return True
 
     def _freeUpSpace(self):
@@ -999,7 +1131,10 @@ class CachingFileStore(AbstractFileStore):
 
         while availableSpace < 0:
             # While there isn't enough space for the thing we want
-            logger.debug(
+            logger.debug(
+                "Cache is full (%d bytes free). Trying to free up space!",
+                availableSpace,
+            )
             # Free up space. See if we made any progress
             progress = self._tryToFreeUpSpace()
             availableSpace = self.getCacheAvailable()
@@ -1011,19 +1146,23 @@ class CachingFileStore(AbstractFileStore):
                 # See if we've been oversubscribed.
                 jobSpace = self.getSpaceUsableForJobs()
                 if jobSpace < 0:
-                    logger.critical(
+                    logger.critical(
+                        "Jobs on this machine have oversubscribed our total available space (%d bytes)!",
+                        jobSpace,
+                    )
                     raise CacheUnbalancedError
                 else:
                     patience -= 1
                     if patience <= 0:
-                        logger.critical(
+                        logger.critical(
+                            "Waited implausibly long for active uploads and deletes."
+                        )
                         raise CacheUnbalancedError
                     else:
                         # Wait a bit and come back
                         time.sleep(2)
 
-        logger.debug(
-
+        logger.debug("Cache has %d bytes free.", availableSpace)
 
     # Normal AbstractFileStore API
 
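Eviction above is optimistic in the same way: pick one 'cached' file with no non-mutable references, then claim it for deletion with an UPDATE that re-checks those conditions, so a racing reader can void the grab. A sketch under the same assumed schema:

```python
import sqlite3

def try_evict_one(con: sqlite3.Connection, me: str) -> bool:
    """Grab one fully-unreferenced cached file for deletion; a sketch,
    not Toil's exact control flow."""
    row = con.execute(
        """SELECT files.id FROM files WHERE files.state = 'cached' AND NOT EXISTS (
               SELECT NULL FROM refs
               WHERE refs.file_id = files.id AND refs.state != 'mutable'
           ) LIMIT 1"""
    ).fetchone()
    if row is None:
        return False  # nothing evictable; maybe pending uploads must finish first
    file_id = row[0]
    # Claim it, re-checking the conditions inside the UPDATE itself.
    con.execute(
        """UPDATE files SET owner = ?, state = ? WHERE id = ? AND state = ?
           AND owner IS NULL AND NOT EXISTS (
               SELECT NULL FROM refs
               WHERE refs.file_id = files.id AND refs.state != 'mutable'
           )""",
        (me, "deleting", file_id, "cached"),
    )
    con.commit()
    return True
```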
@@ -1044,8 +1183,13 @@ class CachingFileStore(AbstractFileStore):
         # have filled the cache or not.
         self.jobDiskBytes = job.disk
 
-        logger.debug(
-
+        logger.debug(
+            "Actually running job (%s) with ID (%s) which wants %d of our %d bytes.",
+            self.jobName,
+            self.jobID,
+            self.jobDiskBytes,
+            self.getCacheLimit(),
+        )
 
         # Register the current job as taking this much space, and evict files
         # from the cache to make room before letting the job run.
@@ -1079,7 +1223,9 @@ class CachingFileStore(AbstractFileStore):
         # Create an empty file to get an ID.
         # Make sure to pass along the file basename.
         # TODO: this empty file could leak if we die now...
-        fileID = self.jobStore.get_empty_file_store_id(
+        fileID = self.jobStore.get_empty_file_store_id(
+            creatorID, cleanup, os.path.basename(localFileName)
+        )
         # Work out who we are
         with self.as_process() as me:
 
@@ -1088,10 +1234,22 @@ class CachingFileStore(AbstractFileStore):
 
             # Create a file in uploadable state and a reference, in the same transaction.
             # Say the reference is an immutable reference
-            self._write(
-
+            self._write(
+                [
+                    (
+                        "INSERT INTO files VALUES (?, ?, ?, ?, ?)",
+                        (fileID, cachePath, fileSize, "uploadable", me),
+                    ),
+                    (
+                        "INSERT INTO refs VALUES (?, ?, ?, ?)",
+                        (absLocalFileName, fileID, creatorID, "immutable"),
+                    ),
+                ]
+            )
 
-            if absLocalFileName.startswith(self.localTempDir) and not os.path.islink(
+            if absLocalFileName.startswith(self.localTempDir) and not os.path.islink(
+                absLocalFileName
+            ):
                 # We should link into the cache, because the upload is coming from our local temp dir (and not via a symlink in there)
                 try:
                     # Try and hardlink the file into the cache.
@@ -1102,8 +1260,14 @@ class CachingFileStore(AbstractFileStore):
 
                     linkedToCache = True
 
-                    logger.debug(
-
+                    logger.debug(
+                        "Hardlinked file %s into cache at %s; deferring write to job store",
+                        localFileName,
+                        cachePath,
+                    )
+                    assert not os.path.islink(cachePath), (
+                        "Symlink %s has invaded cache!" % cachePath
+                    )
 
                     # Don't do the upload now. Let it be deferred until later (when the job is committing).
                 except OSError:
@@ -1117,7 +1281,6 @@ class CachingFileStore(AbstractFileStore):
                     # files to vanish from our cache.
                     linkedToCache = False
 
-
             if not linkedToCache:
                 # If we can't do the link into the cache and upload from there, we
                 # have to just upload right away. We can't guarantee sufficient
@@ -1126,27 +1289,40 @@ class CachingFileStore(AbstractFileStore):
 
                 # Change the reference to 'mutable', which it will be.
                 # And drop the file altogether.
-                self._write(
-
+                self._write(
+                    [
+                        (
+                            "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
+                            ("mutable", absLocalFileName, fileID),
+                        ),
+                        ("DELETE FROM files WHERE id = ?", (fileID,)),
+                    ]
+                )
 
                 # Save the file to the job store right now
-                logger.debug(
+                logger.debug(
+                    "Actually executing upload immediately for file %s", fileID
+                )
                 self.jobStore.update_file(fileID, absLocalFileName)
 
         # Ship out the completed FileID object with its real size.
         return FileID.forPath(fileID, absLocalFileName)
 
-    def readGlobalFile(
+    def readGlobalFile(
+        self, fileStoreID, userPath=None, cache=True, mutable=False, symlink=False
+    ):
 
         if str(fileStoreID) in self.filesToDelete:
             # File has already been deleted
-            raise FileNotFoundError(f
+            raise FileNotFoundError(f"Attempted to read deleted file: {fileStoreID}")
 
         if userPath is not None:
             # Validate the destination we got
             localFilePath = self._resolveAbsoluteLocalPath(userPath)
             if os.path.exists(localFilePath):
-                raise RuntimeError(
+                raise RuntimeError(
+                    " File %s " % localFilePath + " exists. Cannot Overwrite."
+                )
             else:
                 # Make our own destination
                 localFilePath = self.getLocalTempFileName()
@@ -1158,22 +1334,29 @@ class CachingFileStore(AbstractFileStore):
             # We want to use the cache
 
             if mutable:
-                finalPath = self._readGlobalFileMutablyWithCache(
+                finalPath = self._readGlobalFileMutablyWithCache(
+                    fileStoreID, localFilePath, readerID
+                )
             else:
-                finalPath = self._readGlobalFileWithCache(
+                finalPath = self._readGlobalFileWithCache(
+                    fileStoreID, localFilePath, symlink, readerID
+                )
         else:
             # We do not want to use the cache
-            finalPath = self._readGlobalFileWithoutCache(
+            finalPath = self._readGlobalFileWithoutCache(
+                fileStoreID, localFilePath, mutable, symlink, readerID
+            )
 
-        if getattr(fileStoreID,
+        if getattr(fileStoreID, "executable", False):
             os.chmod(finalPath, os.stat(finalPath).st_mode | stat.S_IXUSR)
 
         # Record access in case the job crashes and we have to log it
         self.logAccess(fileStoreID, finalPath)
         return finalPath
 
-
-
+    def _readGlobalFileWithoutCache(
+        self, fileStoreID, localFilePath, mutable, symlink, readerID
+    ):
         """
         Read a file without putting it into the cache.
 
@@ -1191,7 +1374,9 @@ class CachingFileStore(AbstractFileStore):
         # read a file that is 'uploadable' or 'uploading' and hasn't hit
         # the backing job store yet.
 
-        with self._with_copying_reference_to_upload(
+        with self._with_copying_reference_to_upload(
+            fileStoreID, readerID, localFilePath
+        ) as ref_path:
             if ref_path is not None:
                 # We got a copying reference, so the file is being uploaded and
                 # must be read from the cache for consistency. And it will
@@ -1205,11 +1390,16 @@ class CachingFileStore(AbstractFileStore):
 
                 # Find where the file is cached
                 cachedPath = None
-                for row in self._read(
+                for row in self._read(
+                    "SELECT path FROM files WHERE id = ?", (fileStoreID,)
+                ):
                     cachedPath = row[0]
 
                 if cachedPath is None:
-                    raise RuntimeError(
+                    raise RuntimeError(
+                        "File %s went away while we had a reference to it!"
+                        % fileStoreID
+                    )
 
                 if self.forceDownloadDelay is not None:
                     # Wait around to simulate a big file for testing
@@ -1218,8 +1408,14 @@ class CachingFileStore(AbstractFileStore):
                 atomic_copy(cachedPath, ref_path)
 
                 # Change the reference to mutable so it sticks around
-                self._write(
-
+                self._write(
+                    [
+                        (
+                            "UPDATE refs SET state = ? WHERE path = ? and file_id = ?",
+                            ("mutable", ref_path, fileStoreID),
+                        )
+                    ]
+                )
             else:
                 # File is not being uploaded currently.
 
@@ -1229,8 +1425,14 @@ class CachingFileStore(AbstractFileStore):
 
                 # Create a 'mutable' reference (even if we end up with a link)
                 # so we can see this file in deleteLocalFile.
-                self._write(
-
+                self._write(
+                    [
+                        (
+                            "INSERT INTO refs VALUES (?, ?, ?, ?)",
+                            (localFilePath, fileStoreID, readerID, "mutable"),
+                        )
+                    ]
+                )
 
                 if self.forceDownloadDelay is not None:
                     # Wait around to simulate a big file for testing
@@ -1290,15 +1492,32 @@ class CachingFileStore(AbstractFileStore):
         # Start a loop until we can do one of these
         while True:
             # Try and create a downloading entry if no entry exists
-            logger.debug(
-            self._write(
-
+            logger.debug("Trying to make file record for id %s", fileStoreID)
+            self._write(
+                [
+                    (
+                        "INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)",
+                        (
+                            fileStoreID,
+                            cachedPath,
+                            self.getGlobalFileSize(fileStoreID),
+                            "downloading",
+                            me,
+                        ),
+                    )
+                ]
+            )
 
             # See if we won the race
-            self._read(
+            self._read(
+                "SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?",
+                (fileStoreID, "downloading", me),
+            )
             if self.cur.fetchone()[0] > 0:
                 # We are responsible for downloading the file
-                logger.debug(
+                logger.debug(
+                    "We are now responsible for downloading file %s", fileStoreID
+                )
 
                 # Make sure we have space for this download.
                 self._freeUpSpace()
@@ -1313,37 +1532,65 @@ class CachingFileStore(AbstractFileStore):
            # two readers, one cached copy, and space for two copies total.
 
             # Make the copying reference
-            self._write(
-
+            self._write(
+                [
+                    (
+                        "INSERT INTO refs VALUES (?, ?, ?, ?)",
+                        (localFilePath, fileStoreID, readerID, "copying"),
+                    )
+                ]
+            )
 
             # Fulfill it with a full copy or by giving away the cached copy
-            self._fulfillCopyingReference(
+            self._fulfillCopyingReference(
+                fileStoreID, cachedPath, localFilePath
+            )
 
             # Now we're done
             return localFilePath
 
         else:
-            logger.debug(
+            logger.debug(
+                "Someone else is already responsible for file %s", fileStoreID
+            )
 
             # A record already existed for this file.
             # Try and create an immutable or copying reference to an entry that
             # is in 'cached' or 'uploadable' or 'uploading' state.
             # It might be uploading because *we* are supposed to be uploading it.
-            logger.debug(
-            self._write(
-
+            logger.debug("Trying to make reference to file %s", fileStoreID)
+            self._write(
+                [
+                    (
+                        "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)",
+                        (
+                            localFilePath,
+                            readerID,
+                            "copying",
+                            fileStoreID,
+                            "cached",
+                            "uploadable",
+                            "uploading",
+                        ),
+                    )
+                ]
+            )
 
             # See if we got it
-            self._read(
+            self._read(
+                "SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?",
+                (localFilePath, fileStoreID),
+            )
             if self.cur.fetchone()[0] > 0:
                 # The file is cached and we can copy or link it
-                logger.debug(
+                logger.debug("Obtained reference to file %s", fileStoreID)
 
                 # Get the path it is actually at in the cache, instead of where we wanted to put it
-                for row in self._read(
+                for row in self._read(
+                    "SELECT path FROM files WHERE id = ?", (fileStoreID,)
+                ):
                     cachedPath = row[0]
 
-
                 while self.getCacheAvailable() < 0:
                     # Since we now have a copying reference, see if we have used too much space.
                     # If so, try to free up some space by deleting or uploading, but
@@ -1356,15 +1603,23 @@ class CachingFileStore(AbstractFileStore):
 
                 # See if we have no other references and we can give away the file.
                 # Change it to downloading owned by us if we can grab it.
-                self._write(
+                self._write(
+                    [
+                        (
+                            """
                     UPDATE files SET files.owner = ?, files.state = ? WHERE files.id = ? AND files.state = ?
                     AND files.owner IS NULL AND NOT EXISTS (
                         SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
                     )
                     """,
-
-
-
+                            (me, "downloading", fileStoreID, "cached"),
+                        )
+                    ]
+                )
+
+                if self._giveAwayDownloadingFile(
+                    fileStoreID, cachedPath, localFilePath
+                ):
                     # We got ownership of the file and managed to give it away.
                     return localFilePath
 
@@ -1385,14 +1640,23 @@ class CachingFileStore(AbstractFileStore):
                     atomic_copy(cachedPath, localFilePath)
 
                     # Change the reference to mutable
-                    self._write(
+                    self._write(
+                        [
+                            (
+                                "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
+                                ("mutable", localFilePath, fileStoreID),
+                            )
+                        ]
+                    )
 
                     # Now we're done
                     return localFilePath
 
             else:
                 # We didn't get a reference. Maybe it is still downloading.
-                logger.debug(
+                logger.debug(
+                    "Could not obtain reference to file %s", fileStoreID
+                )
 
             # Loop around again and see if either we can download it or we can get a reference to it.
 
@@ -1432,8 +1696,14 @@ class CachingFileStore(AbstractFileStore):
             # Expose this file as cached so other people can copy off of it too.
 
             # Change state from downloading to cached
-            self._write(
-
+            self._write(
+                [
+                    (
+                        "UPDATE files SET state = ?, owner = NULL WHERE id = ?",
+                        ("cached", fileStoreID),
+                    )
+                ]
+            )
 
             if self.forceDownloadDelay is not None:
                 # Wait around to simulate a big file for testing
@@ -1443,12 +1713,18 @@ class CachingFileStore(AbstractFileStore):
             atomic_copy(cachedPath, localFilePath)
 
             # Change our reference to mutable
-            self._write(
+            self._write(
+                [
+                    (
+                        "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
+                        ("mutable", localFilePath, fileStoreID),
+                    )
+                ]
+            )
 
             # Now we're done
             return
 
-
     def _giveAwayDownloadingFile(self, fileStoreID, cachedPath, localFilePath):
         """
         Move a downloaded file in 'downloading' state, owned by us, from the cache to a user-specified destination path.
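Both download paths above use the same idiom to elect a downloader: `INSERT OR IGNORE` creates the 'downloading' record only if no record exists yet, and a follow-up SELECT tells each process whether it won. A sketch of that election:

```python
import sqlite3

def try_claim_download(
    con: sqlite3.Connection, file_id: str, cached_path: str, size: int, me: str
) -> bool:
    # INSERT OR IGNORE is a no-op when a record already exists, so whoever
    # inserts first becomes the downloader; everyone else changes nothing.
    con.execute(
        "INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)",
        (file_id, cached_path, size, "downloading", me),
    )
    con.commit()
    # We won exactly when the downloading record carries our name as owner.
    row = con.execute(
        "SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?",
        (file_id, "downloading", me),
    ).fetchone()
    return row[0] > 0
```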
@@ -1468,8 +1744,10 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1468
1744
|
with self.as_process() as me:
|
|
1469
1745
|
|
|
1470
1746
|
# See if we actually own this file and can giove it away
|
|
1471
|
-
self._read(
|
|
1472
|
-
(
|
|
1747
|
+
self._read(
|
|
1748
|
+
"SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?",
|
|
1749
|
+
(fileStoreID, "downloading", me),
|
|
1750
|
+
)
|
|
1473
1751
|
if self.cur.fetchone()[0] > 0:
|
|
1474
1752
|
# Now we have exclusive control of the cached copy of the file, so we can give it away.
|
|
1475
1753
|
|
|
@@ -1478,8 +1756,15 @@ class CachingFileStore(AbstractFileStore):
                 # We are giving it away
                 shutil.move(cachedPath, localFilePath)
                 # Record that.
-                self._write(
-
+                self._write(
+                    [
+                        (
+                            "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
+                            ("mutable", localFilePath, fileStoreID),
+                        ),
+                        ("DELETE FROM files WHERE id = ?", (fileStoreID,)),
+                    ]
+                )

                 # Now we're done
                 return True
@@ -1504,7 +1789,9 @@ class CachingFileStore(AbstractFileStore):
         :rtype: bool
         """

-        assert os.path.exists(cachedPath),
+        assert os.path.exists(cachedPath), (
+            "Cannot create link to missing cache file %s" % cachedPath
+        )

         try:
             # Try and make the hard link.
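The `assert` fix above deserves a note: wrapping only the *message* in parentheses, as the new code does, is safe, but parenthesizing the whole `condition, message` pair would turn the assert into a check on a non-empty tuple, which is always true. A quick illustration (the path is a hypothetical example):

```python
import os

cached_path = "/definitely/missing/cache/file"  # hypothetical example path

try:
    # Safe form, matching the hunk: parentheses wrap only the message.
    assert os.path.exists(cached_path), (
        "Cannot create link to missing cache file %s" % cached_path
    )
except AssertionError as e:
    print(e)  # fires, as intended

# Broken near-miss: parentheses around condition AND message build a
# 2-tuple, which is truthy, so this assert can never fail (CPython
# warns: "assertion is always true, perhaps remove parentheses?").
assert (os.path.exists(cached_path), "this message can never be shown")
```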
@@ -1546,17 +1833,46 @@ class CachingFileStore(AbstractFileStore):
                 # Try and create a downloading entry if no entry exists.
                 # Make sure to create a reference at the same time if it succeeds, to bill it against our job's space.
                 # Don't create the mutable reference yet because we might not necessarily be able to clear that space.
-                logger.debug(
-
-
-
-
+                logger.debug(
+                    "Trying to make file downloading file record and reference for id %s",
+                    fileStoreID,
+                )
+                self._write(
+                    [
+                        (
+                            "INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)",
+                            (
+                                fileStoreID,
+                                cachedPath,
+                                self.getGlobalFileSize(fileStoreID),
+                                "downloading",
+                                me,
+                            ),
+                        ),
+                        (
+                            "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND state = ? AND owner = ?",
+                            (
+                                localFilePath,
+                                readerID,
+                                "immutable",
+                                fileStoreID,
+                                "downloading",
+                                me,
+                            ),
+                        ),
+                    ]
+                )

                 # See if we won the race
-                self._read(
+                self._read(
+                    "SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?",
+                    (fileStoreID, "downloading", me),
+                )
                 if self.cur.fetchone()[0] > 0:
                     # We are responsible for downloading the file (and we have the reference)
-                    logger.debug(
+                    logger.debug(
+                        "We are now responsible for downloading file %s", fileStoreID
+                    )

                     # Make sure we have space for this download.
                     self._freeUpSpace()
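The hunk above is the heart of the cache's concurrency story: `INSERT OR IGNORE` silently loses to whichever worker inserted the row first, so ownership is confirmed by reading the row back and checking that our own process identifier stuck. A standalone sketch of that claim-then-verify pattern (table layout and helper name simplified; this is not Toil's actual `_read`/`_write`):

```python
import sqlite3

def try_to_own_download(con: sqlite3.Connection, file_id: str, path: str,
                        size: int, me: str) -> bool:
    cur = con.cursor()
    # Either creates the row owned by us, or silently does nothing
    # because another worker created it first.
    cur.execute(
        "INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)",
        (file_id, path, size, "downloading", me),
    )
    con.commit()
    # See if we won the race.
    cur.execute(
        "SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?",
        (file_id, "downloading", me),
    )
    return cur.fetchone()[0] > 0

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE files (id PRIMARY KEY, path, size, state, owner)")
print(try_to_own_download(con, "file-1", "/cache/file-1", 1024, "worker-A"))  # True
print(try_to_own_download(con, "file-1", "/cache/file-1", 1024, "worker-B"))  # False
```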
@@ -1570,8 +1886,14 @@ class CachingFileStore(AbstractFileStore):
                         # We made the link!

                         # Change file state from downloading to cached so other people can use it
-                        self._write(
-
+                        self._write(
+                            [
+                                (
+                                    "UPDATE files SET state = ?, owner = NULL WHERE id = ?",
+                                    ("cached", fileStoreID),
+                                )
+                            ]
+                        )

                         # Now we're done!
                         return localFilePath
@@ -1579,36 +1901,69 @@ class CachingFileStore(AbstractFileStore):
                         # We could not make a link. We need to make a copy.

                         # Change the reference to copying.
-                        self._write(
+                        self._write(
+                            [
+                                (
+                                    "UPDATE refs SET state = ? WHERE path = ? AND file_id = ?",
+                                    ("copying", localFilePath, fileStoreID),
+                                )
+                            ]
+                        )

                         # Fulfill it with a full copy or by giving away the cached copy
-                        self._fulfillCopyingReference(
+                        self._fulfillCopyingReference(
+                            fileStoreID, cachedPath, localFilePath
+                        )

                         # Now we're done
                         return localFilePath

                 else:
-                    logger.debug(
+                    logger.debug(
+                        "We already have an entry in the cache database for file %s",
+                        fileStoreID,
+                    )

                     # A record already existed for this file.
                     # Try and create an immutable reference to an entry that
                     # is in 'cached' or 'uploadable' or 'uploading' state.
                     # It might be uploading because *we* are supposed to be uploading it.
-                    logger.debug(
-                    self._write(
-
+                    logger.debug("Trying to make reference to file %s", fileStoreID)
+                    self._write(
+                        [
+                            (
+                                "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)",
+                                (
+                                    localFilePath,
+                                    readerID,
+                                    "immutable",
+                                    fileStoreID,
+                                    "cached",
+                                    "uploadable",
+                                    "uploading",
+                                ),
+                            )
+                        ]
+                    )

                     # See if we got it
-                    self._read(
+                    self._read(
+                        "SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?",
+                        (localFilePath, fileStoreID),
+                    )
                     if self.cur.fetchone()[0] > 0:
                         # The file is cached and we can copy or link it
-                        logger.debug(
+                        logger.debug("Obtained reference to file %s", fileStoreID)

                         # Get the path it is actually at in the cache, instead of where we wanted to put it
-                        for row in self._read(
+                        for row in self._read(
+                            "SELECT path FROM files WHERE id = ?", (fileStoreID,)
+                        ):
                             cachedPath = row[0]

-                        if self._createLinkFromCache(
+                        if self._createLinkFromCache(
+                            cachedPath, localFilePath, symlink
+                        ):
                             # We managed to make the link
                             return localFilePath
                         else:
@@ -1620,11 +1975,22 @@ class CachingFileStore(AbstractFileStore):
                             # we already have code for that for mutable downloads,
                             # so just clear the reference and download mutably.

-                            self._write(
-
-
+                            self._write(
+                                [
+                                    (
+                                        "DELETE FROM refs WHERE path = ? AND file_id = ?",
+                                        (localFilePath, fileStoreID),
+                                    )
+                                ]
+                            )
+
+                            return self._readGlobalFileMutablyWithCache(
+                                fileStoreID, localFilePath, readerID
+                            )
                     else:
-                        logger.debug(
+                        logger.debug(
+                            "Could not obtain reference to file %s", fileStoreID
+                        )

                         # If we didn't get a download or a reference, adopt and do work from dead workers and loop again.
                         # We may have to wait for someone else's download or delete to
@@ -1640,7 +2006,12 @@ class CachingFileStore(AbstractFileStore):
            time.sleep(self.contentionBackoff)

     @contextmanager
-    def _with_copying_reference_to_upload(
+    def _with_copying_reference_to_upload(
+        self,
+        file_store_id: FileID,
+        reader_id: str,
+        local_file_path: Optional[str] = None,
+    ) -> Generator:
         """
         Get a context manager that gives you either the local file path for a
         copyuing reference to the given file, or None if that file is not in an
@@ -1662,12 +2033,28 @@ class CachingFileStore(AbstractFileStore):
             local_file_path = self.getLocalTempFileName()

         # Try and make a 'copying' reference to such a file
-        self._write(
-
+        self._write(
+            [
+                (
+                    "INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ?)",
+                    (
+                        local_file_path,
+                        reader_id,
+                        "copying",
+                        file_store_id,
+                        "uploadable",
+                        "uploading",
+                    ),
+                )
+            ]
+        )

         # See if we got it
         have_reference = False
-        for row in self._read(
+        for row in self._read(
+            "SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?",
+            (local_file_path, file_store_id),
+        ):
             have_reference = row[0] > 0

         if have_reference:
@@ -1676,8 +2063,14 @@ class CachingFileStore(AbstractFileStore):
                 yield local_file_path
             finally:
                 # Clean up the reference if it is unmodified
-                self._write(
-
+                self._write(
+                    [
+                        (
+                            "DELETE FROM refs WHERE path = ? AND file_id = ? AND state = ?",
+                            (local_file_path, file_store_id, "copying"),
+                        )
+                    ]
+                )
         else:
             # No reference was obtained.
             yield None
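The three hunks above together define the shape of `_with_copying_reference_to_upload`: a generator-based context manager that yields a usable path when a 'copying' reference could be claimed, yields None otherwise, and on exit drops the reference only if it is still unmodified. A minimal sketch of that yield-path-or-None shape, where the `acquire`/`release` callables are hypothetical stand-ins for the SQL shown above:

```python
from contextlib import contextmanager
from typing import Callable, Generator, Optional

@contextmanager
def copying_reference(
    acquire: Callable[[], Optional[str]],
    release: Callable[[str], None],
) -> Generator[Optional[str], None, None]:
    # acquire() stands in for the INSERT INTO refs ... race above and
    # returns a local path on success, or None if no reference was won.
    path = acquire()
    if path is None:
        yield None
    else:
        try:
            yield path
        finally:
            # Stands in for deleting the ref iff it is still 'copying'.
            release(path)

# Callers can treat both outcomes uniformly:
with copying_reference(lambda: "/tmp/ref", print) as ref_path:
    if ref_path is not None:
        pass  # read the cached, not-yet-uploaded copy
    else:
        pass  # fall back to streaming from the job store
```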
@@ -1686,11 +2079,13 @@ class CachingFileStore(AbstractFileStore):
     def readGlobalFileStream(self, fileStoreID, encoding=None, errors=None):
         if str(fileStoreID) in self.filesToDelete:
             # File has already been deleted
-            raise FileNotFoundError(f
+            raise FileNotFoundError(f"Attempted to read deleted file: {fileStoreID}")

         self.logAccess(fileStoreID)

-        with self._with_copying_reference_to_upload(
+        with self._with_copying_reference_to_upload(
+            fileStoreID, self.jobDesc.jobStoreID
+        ) as ref_path:
             # Try and grab a reference to the file if it is being uploaded.
             if ref_path is not None:
                 # We have an update in the cache that isn't written back yet.
@@ -1699,11 +2094,16 @@ class CachingFileStore(AbstractFileStore):
                 # The ref file is not actually copied to; find the actual file
                 # in the cache
                 cached_path = None
-                for row in self._read(
+                for row in self._read(
+                    "SELECT path FROM files WHERE id = ?", (fileStoreID,)
+                ):
                     cached_path = row[0]

                 if cached_path is None:
-                    raise RuntimeError(
+                    raise RuntimeError(
+                        "File %s went away while we had a reference to it!"
+                        % fileStoreID
+                    )

                 with open(cached_path, encoding=encoding, errors=errors) as result:
                     # Pass along the results of the open context manager on the
@@ -1714,7 +2114,9 @@ class CachingFileStore(AbstractFileStore):
             else:
                 # No local update, so we can stream from the job store
                 # TODO: Maybe stream from cache even when not required for consistency?
-                with self.jobStore.read_file_stream(
+                with self.jobStore.read_file_stream(
+                    fileStoreID, encoding=encoding, errors=errors
+                ) as result:
                     yield result

     def deleteLocalFile(self, fileStoreID):
@@ -1727,7 +2129,10 @@ class CachingFileStore(AbstractFileStore):
         # missing ref file, we will raise an error about it and stop deleting
         # things.
         missingFile = None
-        for row in self._read(
+        for row in self._read(
+            "SELECT path FROM refs WHERE file_id = ? AND job_id = ?",
+            (fileStoreID, jobID),
+        ):
             # Delete all the files that are references to this cached file (even mutable copies)
             path = row[0]

@@ -1748,12 +2153,22 @@ class CachingFileStore(AbstractFileStore):
         if len(deleted) == 0 and not missingFile:
             # We have to tell the user if they tried to delete 0 local copies.
             # But if we found a missing local copy, go on to report that instead.
-            raise OSError(
+            raise OSError(
+                errno.ENOENT,
+                f"Attempting to delete local copies of a file with none: {fileStoreID}",
+            )

         for path in deleted:
             # Drop the references
-            self._write(
-
+            self._write(
+                [
+                    (
+                        "DELETE FROM refs WHERE file_id = ? AND job_id = ? AND path = ?",
+                        (fileStoreID, jobID, path),
+                    )
+                ]
+            )
+            logger.debug("Deleted local file %s for global file %s", path, fileStoreID)

         # Now space has been revoked from the cache because that job needs its space back.
         # That might result in stuff having to be evicted.
@@ -1781,13 +2196,25 @@ class CachingFileStore(AbstractFileStore):
         with self.as_process() as me:

             # Make sure nobody else has references to it
-            for row in self._read(
-
+            for row in self._read(
+                "SELECT job_id FROM refs WHERE file_id = ? AND state != ?",
+                (fileStoreID, "mutable"),
+            ):
+                raise RuntimeError(
+                    f"Deleted file ID {fileStoreID} which is still in use by job {row[0]}"
+                )
             # TODO: should we just let other jobs and the cache keep the file until
             # it gets evicted, and only delete at the back end?

             # Pop the file into deleting state owned by us if it exists
-            self._write(
+            self._write(
+                [
+                    (
+                        "UPDATE files SET state = ?, owner = ? WHERE id = ?",
+                        ("deleting", me, fileStoreID),
+                    )
+                ]
+            )

             # Finish the delete if the file is present
             self._executePendingDeletions()
@@ -1795,10 +2222,13 @@ class CachingFileStore(AbstractFileStore):
         # Add the file to the list of files to be deleted from the job store
         # once the run method completes.
         self.filesToDelete.add(str(fileStoreID))
-        self.log_to_leader(
-
+        self.log_to_leader(
+            "Added file with ID '%s' to the list of files to be" % fileStoreID
+            + " globally deleted.",
+            level=logging.DEBUG,
+        )

-    @deprecated(new_function_name=
+    @deprecated(new_function_name="export_file")
     def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None:
         return self.export_file(jobStoreFileID, dstUrl)

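`deprecated` here is Toil's own compatibility helper, and this hunk only restores its truncated argument. For readers unfamiliar with the pattern, a generic sketch of what a `new_function_name`-style deprecation decorator typically looks like (an illustration, not Toil's actual implementation):

```python
import functools
import warnings

def deprecated(new_function_name: str):
    """Mark a function as deprecated in favor of a renamed replacement."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(
                f"{func.__name__}() is deprecated; use {new_function_name}() instead",
                DeprecationWarning,
                stacklevel=2,
            )
            return func(*args, **kwargs)
        return wrapper
    return decorator

# Usage matching the hunk above:
@deprecated(new_function_name="export_file")
def exportFile() -> None:
    pass
```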
@@ -1829,7 +2259,10 @@ class CachingFileStore(AbstractFileStore):
         # thread. It can do some destructor work after it finishes its real
         # work.

-        if
+        if (
+            self.commitThread is not None
+            and self.commitThread is not threading.current_thread()
+        ):
             self.commitThread.join()

         return True
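The expanded condition above guards a real failure mode: the commit thread itself can run this cleanup, and `Thread.join()` raises `RuntimeError` when a thread tries to join itself. A tiny self-contained demonstration of the guard (function and variable names are illustrative only):

```python
import threading

def cleanup(commit_thread: threading.Thread) -> None:
    # Mirrors the hunk: only join if a commit thread exists and we are
    # not that thread ourselves.
    if (
        commit_thread is not None
        and commit_thread is not threading.current_thread()
    ):
        commit_thread.join()
    else:
        print("skipped self-join (joining here would raise RuntimeError)")

t = threading.Thread(target=lambda: cleanup(t))
t.start()
t.join()
```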
@@ -1856,17 +2289,23 @@ class CachingFileStore(AbstractFileStore):
         # might be necessary for later jobs to see earlier jobs' deleted
         # before they are committed?

-        logger.debug(
+        logger.debug(
+            "Starting commit of %s forked from %s", state_to_commit, self.jobDesc
+        )
         # Make sure the deep copy isn't summoning ghosts of old job
         # versions. It must be as new or newer at this point.
         self.jobDesc.assert_is_not_newer_than(state_to_commit)

         # Bump the original's version since saving will do that too and we
         # don't want duplicate versions.
-        self.jobDesc.reserve_versions(
+        self.jobDesc.reserve_versions(
+            1 if len(state_to_commit.filesToDelete) == 0 else 2
+        )

         # Start the commit thread
-        self.commitThread = threading.Thread(
+        self.commitThread = threading.Thread(
+            target=self.startCommitThread, args=(state_to_commit,)
+        )
         self.commitThread.start()

     def startCommitThread(self, state_to_commit: Optional[JobDescription]):
@@ -1879,7 +2318,7 @@ class CachingFileStore(AbstractFileStore):
         self.waitForPreviousCommit()

         try:
-            logger.debug(
+            logger.debug("Committing file uploads asynchronously")

             # Finish all uploads
             self._executePendingUploads()
@@ -1889,7 +2328,10 @@ class CachingFileStore(AbstractFileStore):
             if state_to_commit is not None:
                 # Do all the things that make this job not redoable

-                logger.debug(
+                logger.debug(
+                    "Committing file deletes and job state changes asynchronously from %s",
+                    state_to_commit,
+                )

                 # Complete the job
                 self.jobStore.update_job(state_to_commit)
@@ -1905,10 +2347,8 @@ class CachingFileStore(AbstractFileStore):
             self._terminateEvent.set()
             raise

-
-
     @classmethod
-    def shutdown(cls, shutdown_info:
+    def shutdown(cls, shutdown_info: tuple[str, str]) -> None:
         """
         :param shutdown_info: Tuple of the coordination directory (where the
         cache database is) and the cache directory (where the cached data is).
@@ -1935,7 +2375,7 @@ class CachingFileStore(AbstractFileStore):
         # So we just go and find the cache-n.db with the largest n value,
         # and use that.
         dbFilename = None
-        dbAttempt = float(
+        dbAttempt = float("-inf")

         # We also need to remember all the plausible database files and
         # journals
@@ -1943,12 +2383,15 @@ class CachingFileStore(AbstractFileStore):

         for dbCandidate in os.listdir(coordination_dir):
             # For each thing in the coordination directory, see if it starts like a database file.
-            match = re.match(
+            match = re.match("^cache-([0-9]+).db.*", dbCandidate)
             if match:
                 # This is caching-related.
                 all_db_files.append(dbCandidate)
                 attempt_number = int(match.group(1))
-                if
+                if (
+                    attempt_number > dbAttempt
+                    and dbCandidate == f"cache-{attempt_number}.db"
+                ):
                     # This is a main database, and the newest we have seen.
                     dbFilename = dbCandidate
                     dbAttempt = attempt_number
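The reconstructed condition shows why the exact-name check matters: the regex deliberately also matches journal files (`cache-N.db-journal` and the like) so they can be collected into `all_db_files`, but only an exact `cache-N.db` name may be chosen as the main database. The same selection logic in miniature, over a hypothetical directory listing:

```python
import re

# Hypothetical coordination-directory contents for illustration.
candidates = ["cache-1.db", "cache-2.db", "cache-2.db-journal", "unrelated.txt"]

db_filename = None
db_attempt = float("-inf")
for name in candidates:
    match = re.match("^cache-([0-9]+).db.*", name)
    if match:
        attempt_number = int(match.group(1))
        # Exact-name check keeps journals from being picked as the main db.
        if attempt_number > db_attempt and name == f"cache-{attempt_number}.db":
            db_filename = name
            db_attempt = attempt_number

print(db_filename)  # -> cache-2.db
```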
@@ -1956,7 +2399,9 @@ class CachingFileStore(AbstractFileStore):
         if dbFilename is not None:
             # We found a caching database

-            logger.debug(
+            logger.debug(
+                "Connecting to latest caching database %s for cleanup", dbFilename
+            )

             dbPath = os.path.join(coordination_dir, dbFilename)

@@ -1980,7 +2425,7 @@ class CachingFileStore(AbstractFileStore):

             con.close()
         else:
-            logger.debug(
+            logger.debug("No caching database found in %s", dir_)

         # Whether or not we found a database, we need to clean up the cache
         # directory. Delete everything cached.
@@ -2017,7 +2462,9 @@ class CachingFileStore(AbstractFileStore):

         # Get all the dead worker PIDs
         workers = []
-        for row in cls._static_read(
+        for row in cls._static_read(
+            cur, "SELECT DISTINCT worker FROM jobs WHERE worker IS NOT NULL"
+        ):
             workers.append(row[0])

         # Work out which of them are not currently running.
@@ -2030,14 +2477,18 @@ class CachingFileStore(AbstractFileStore):
         # Now we know which workers are dead.
         # Clear them off of the jobs they had.
         for deadWorker in deadWorkers:
-            cls._static_write(
+            cls._static_write(
+                con,
+                cur,
+                [("UPDATE jobs SET worker = NULL WHERE worker = ?", (deadWorker,))],
+            )
         if len(deadWorkers) > 0:
-            logger.debug(
+            logger.debug("Reaped %d dead workers", len(deadWorkers))

         while True:
             # Find an unowned job.
             # Don't take all of them; other people could come along and want to help us with the other jobs.
-            cls._static_read(cur,
+            cls._static_read(cur, "SELECT id FROM jobs WHERE worker IS NULL LIMIT 1")
             row = cur.fetchone()
             if row is None:
                 # We cleaned up all the jobs
@@ -2046,10 +2497,23 @@ class CachingFileStore(AbstractFileStore):
             jobID = row[0]

             # Try to own this job
-            cls._static_write(
+            cls._static_write(
+                con,
+                cur,
+                [
+                    (
+                        "UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL",
+                        (me, jobID),
+                    )
+                ],
+            )

             # See if we won the race
-            cls._static_read(
+            cls._static_read(
+                cur,
+                "SELECT id, tempdir FROM jobs WHERE id = ? AND worker = ?",
+                (jobID, me),
+            )
             row = cur.fetchone()
             if row is None:
                 # We didn't win the race. Try another one.
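Job adoption uses the same claim-then-verify idiom as the file download earlier in the diff: a guarded `UPDATE ... WHERE worker IS NULL` that only one cleaner can win, followed by a read-back to confirm the claim stuck. A self-contained sketch of that idiom (simplified table; this stands in for, and is not, the real `_static_read`/`_static_write`):

```python
import sqlite3

def try_to_adopt_job(con: sqlite3.Connection, job_id: str, me: str) -> bool:
    cur = con.cursor()
    # Claim: only succeeds if the job is still unowned.
    cur.execute(
        "UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL",
        (me, job_id),
    )
    con.commit()
    # Verify: did our claim stick?
    cur.execute(
        "SELECT COUNT(*) FROM jobs WHERE id = ? AND worker = ?",
        (job_id, me),
    )
    return cur.fetchone()[0] > 0

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE jobs (id PRIMARY KEY, tempdir, worker)")
con.execute("INSERT INTO jobs VALUES ('job-1', '/tmp/job-1', NULL)")
print(try_to_adopt_job(con, "job-1", "cleaner-A"))  # True
print(try_to_adopt_job(con, "job-1", "cleaner-B"))  # False
```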
@@ -2058,6 +2522,6 @@ class CachingFileStore(AbstractFileStore):
             # If we did win, delete the job and its files and temp dir
             cls._removeJob(con, cur, jobID)

-            logger.debug(
+            logger.debug("Cleaned up orphaned job %s", jobID)

             # Now we have cleaned up all the jobs that belonged to dead workers that were dead when we entered this function.