toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +18 -13
- toil/batchSystems/abstractBatchSystem.py +39 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
- toil/batchSystems/awsBatch.py +14 -14
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +3 -3
- toil/batchSystems/htcondor.py +0 -1
- toil/batchSystems/kubernetes.py +34 -31
- toil/batchSystems/local_support.py +3 -1
- toil/batchSystems/lsf.py +7 -7
- toil/batchSystems/mesos/batchSystem.py +7 -7
- toil/batchSystems/options.py +32 -83
- toil/batchSystems/registry.py +104 -23
- toil/batchSystems/singleMachine.py +16 -13
- toil/batchSystems/slurm.py +87 -16
- toil/batchSystems/torque.py +0 -1
- toil/bus.py +44 -8
- toil/common.py +544 -753
- toil/cwl/__init__.py +28 -32
- toil/cwl/cwltoil.py +595 -574
- toil/cwl/utils.py +55 -10
- toil/exceptions.py +1 -1
- toil/fileStores/__init__.py +2 -2
- toil/fileStores/abstractFileStore.py +88 -14
- toil/fileStores/cachingFileStore.py +610 -549
- toil/fileStores/nonCachingFileStore.py +46 -22
- toil/job.py +182 -101
- toil/jobStores/abstractJobStore.py +161 -95
- toil/jobStores/aws/jobStore.py +23 -9
- toil/jobStores/aws/utils.py +6 -6
- toil/jobStores/fileJobStore.py +116 -18
- toil/jobStores/googleJobStore.py +16 -7
- toil/jobStores/utils.py +5 -6
- toil/leader.py +87 -56
- toil/lib/accelerators.py +10 -5
- toil/lib/aws/__init__.py +3 -14
- toil/lib/aws/ami.py +22 -9
- toil/lib/aws/iam.py +21 -13
- toil/lib/aws/session.py +2 -16
- toil/lib/aws/utils.py +4 -5
- toil/lib/compatibility.py +1 -1
- toil/lib/conversions.py +26 -3
- toil/lib/docker.py +22 -23
- toil/lib/ec2.py +10 -6
- toil/lib/ec2nodes.py +106 -100
- toil/lib/encryption/_nacl.py +2 -1
- toil/lib/generatedEC2Lists.py +325 -18
- toil/lib/io.py +49 -2
- toil/lib/misc.py +1 -1
- toil/lib/resources.py +9 -2
- toil/lib/threading.py +101 -38
- toil/options/common.py +736 -0
- toil/options/cwl.py +336 -0
- toil/options/wdl.py +37 -0
- toil/provisioners/abstractProvisioner.py +9 -4
- toil/provisioners/aws/__init__.py +3 -6
- toil/provisioners/aws/awsProvisioner.py +6 -0
- toil/provisioners/clusterScaler.py +3 -2
- toil/provisioners/gceProvisioner.py +2 -2
- toil/realtimeLogger.py +2 -1
- toil/resource.py +24 -18
- toil/server/app.py +2 -3
- toil/server/cli/wes_cwl_runner.py +4 -4
- toil/server/utils.py +1 -1
- toil/server/wes/abstract_backend.py +3 -2
- toil/server/wes/amazon_wes_utils.py +5 -4
- toil/server/wes/tasks.py +2 -3
- toil/server/wes/toil_backend.py +2 -10
- toil/server/wsgi_app.py +2 -0
- toil/serviceManager.py +12 -10
- toil/statsAndLogging.py +41 -9
- toil/test/__init__.py +29 -54
- toil/test/batchSystems/batchSystemTest.py +11 -111
- toil/test/batchSystems/test_slurm.py +24 -8
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +438 -223
- toil/test/cwl/glob_dir.cwl +15 -0
- toil/test/cwl/preemptible.cwl +21 -0
- toil/test/cwl/preemptible_expression.cwl +28 -0
- toil/test/cwl/revsort.cwl +1 -1
- toil/test/cwl/revsort2.cwl +1 -1
- toil/test/docs/scriptsTest.py +2 -3
- toil/test/jobStores/jobStoreTest.py +34 -21
- toil/test/lib/aws/test_iam.py +4 -14
- toil/test/lib/aws/test_utils.py +0 -3
- toil/test/lib/dockerTest.py +4 -4
- toil/test/lib/test_ec2.py +12 -17
- toil/test/mesos/helloWorld.py +4 -5
- toil/test/mesos/stress.py +1 -1
- toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
- toil/test/provisioners/clusterScalerTest.py +6 -4
- toil/test/provisioners/clusterTest.py +23 -11
- toil/test/provisioners/gceProvisionerTest.py +0 -6
- toil/test/provisioners/restartScript.py +3 -2
- toil/test/server/serverTest.py +1 -1
- toil/test/sort/restart_sort.py +2 -1
- toil/test/sort/sort.py +2 -1
- toil/test/sort/sortTest.py +2 -13
- toil/test/src/autoDeploymentTest.py +45 -45
- toil/test/src/busTest.py +5 -5
- toil/test/src/checkpointTest.py +2 -2
- toil/test/src/deferredFunctionTest.py +1 -1
- toil/test/src/fileStoreTest.py +32 -16
- toil/test/src/helloWorldTest.py +1 -1
- toil/test/src/importExportFileTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +2 -1
- toil/test/src/jobServiceTest.py +1 -1
- toil/test/src/jobTest.py +18 -18
- toil/test/src/miscTests.py +5 -3
- toil/test/src/promisedRequirementTest.py +3 -3
- toil/test/src/realtimeLoggerTest.py +1 -1
- toil/test/src/resourceTest.py +2 -2
- toil/test/src/restartDAGTest.py +1 -1
- toil/test/src/resumabilityTest.py +36 -2
- toil/test/src/retainTempDirTest.py +1 -1
- toil/test/src/systemTest.py +2 -2
- toil/test/src/toilContextManagerTest.py +2 -2
- toil/test/src/userDefinedJobArgTypeTest.py +1 -1
- toil/test/utils/toilDebugTest.py +98 -32
- toil/test/utils/toilKillTest.py +2 -2
- toil/test/utils/utilsTest.py +23 -3
- toil/test/wdl/wdltoil_test.py +223 -45
- toil/toilState.py +7 -6
- toil/utils/toilClean.py +1 -1
- toil/utils/toilConfig.py +36 -0
- toil/utils/toilDebugFile.py +60 -33
- toil/utils/toilDebugJob.py +39 -12
- toil/utils/toilDestroyCluster.py +1 -1
- toil/utils/toilKill.py +1 -1
- toil/utils/toilLaunchCluster.py +13 -2
- toil/utils/toilMain.py +3 -2
- toil/utils/toilRsyncCluster.py +1 -1
- toil/utils/toilSshCluster.py +1 -1
- toil/utils/toilStats.py +445 -305
- toil/utils/toilStatus.py +2 -5
- toil/version.py +10 -10
- toil/wdl/utils.py +2 -122
- toil/wdl/wdltoil.py +1257 -492
- toil/worker.py +55 -46
- toil-6.1.0.dist-info/METADATA +124 -0
- toil-6.1.0.dist-info/RECORD +241 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
- toil/batchSystems/parasol.py +0 -379
- toil/batchSystems/tes.py +0 -459
- toil/test/batchSystems/parasolTestSupport.py +0 -117
- toil/test/wdl/builtinTest.py +0 -506
- toil/test/wdl/toilwdlTest.py +0 -522
- toil/wdl/toilwdl.py +0 -141
- toil/wdl/versions/dev.py +0 -107
- toil/wdl/versions/draft2.py +0 -980
- toil/wdl/versions/v1.py +0 -794
- toil/wdl/wdl_analysis.py +0 -116
- toil/wdl/wdl_functions.py +0 -997
- toil/wdl/wdl_synthesis.py +0 -1011
- toil/wdl/wdl_types.py +0 -243
- toil-5.12.0.dist-info/METADATA +0 -118
- toil-5.12.0.dist-info/RECORD +0 -244
- /toil/{wdl/versions → options}/__init__.py +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/fileStores/cachingFileStore.py

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import errno
 import hashlib
 import logging
@@ -19,22 +20,28 @@ import re
 import shutil
 import sqlite3
 import stat
-import tempfile
 import threading
 import time
 from contextlib import contextmanager
-from
+from tempfile import mkstemp
+from typing import (Any,
+                    Callable,
+                    Generator,
+                    Iterator,
+                    Optional,
+                    Sequence,
+                    Tuple)
+
+from toil.common import cacheDirName, getFileSystemSize
 from toil.fileStores import FileID
 from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
-from toil.lib.conversions import bytes2human
 from toil.lib.io import (atomic_copy,
                          atomic_copyobj,
                          make_public_dir,
+                         mkdtemp,
                          robust_rmtree)
 from toil.lib.retry import ErrorCondition, retry
 from toil.lib.threading import get_process_name, process_name_exists
@@ -224,9 +231,11 @@ class CachingFileStore(AbstractFileStore):
         # be able to tell that from showing up on a machine where a cache has
         # already been created.
         self.dbPath = os.path.join(self.coordination_dir, f'cache-{self.workflowAttemptNumber}.db')
-        self.cur
+
+        # Database connections are provided by magic properties self.con and
+        # self.cur that always have the right object for the current thread to
+        # use. They store stuff in this thread-local storage.
+        self._thread_local = threading.local()
 
         # Note that sqlite3 automatically starts a transaction when we go to
         # modify the database.
@@ -234,6 +243,12 @@ class CachingFileStore(AbstractFileStore):
         # write themselves), we need to COMMIT after every coherent set of
         # writes.
 
+        # Because we support multi-threaded access to files, but we talk to the
+        # database as one process with one identity for owning file references,
+        # we need to make sure only one thread of our process is trying to e.g.
+        # free up space in the cache for a file at a time.
+        self.process_identity_lock = threading.RLock()
+
         # Set up the tables
         self._ensureTables(self.con)
 
@@ -253,6 +268,37 @@ class CachingFileStore(AbstractFileStore):
         # time.
         self.commitThread = None
 
+    @contextmanager
+    def as_process(self) -> Generator[str, None, None]:
+        """
+        Assume the process's identity to act on the caching database.
+
+        Yields the process's name in the caching database, and holds onto a
+        lock while your thread has it.
+        """
+        with self.process_identity_lock:
+            yield get_process_name(self.coordination_dir)
+
+    @property
+    def con(self) -> sqlite3.Connection:
+        """
+        Get the database connection to be used for the current thread.
+        """
+        if not hasattr(self._thread_local, 'con'):
+            # Connect to the database for this thread.
+            # TODO: We assume the connection closes when the thread goes away and can no longer use it.
+            self._thread_local.con = sqlite3.connect(self.dbPath, timeout=SQLITE_TIMEOUT_SECS)
+        return self._thread_local.con
+
+    @property
+    def cur(self) -> sqlite3.Cursor:
+        """
+        Get the main cursor to be used for the current thread.
+        """
+        if not hasattr(self._thread_local, 'cur'):
+            # If we don't already have a main cursor for the thread, make one.
+            self._thread_local.cur = self.con.cursor()
+        return self._thread_local.cur
 
     @staticmethod
     @retry(infinite_retries=True,
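The new con and cur properties above give every thread its own SQLite connection and cursor, because sqlite3 connections cannot safely be shared across threads. A minimal standalone sketch of that pattern (class and attribute names here are illustrative, not Toil's API):

import sqlite3
import threading

class PerThreadDB:
    """Sketch: lazily create one sqlite3 connection and cursor per thread."""

    def __init__(self, db_path: str, timeout: float = 60.0):
        self.db_path = db_path
        self.timeout = timeout
        self._local = threading.local()  # each thread sees its own attribute set

    @property
    def con(self) -> sqlite3.Connection:
        if not hasattr(self._local, 'con'):
            # First use on this thread: open a connection owned by this thread only.
            self._local.con = sqlite3.connect(self.db_path, timeout=self.timeout)
        return self._local.con

    @property
    def cur(self) -> sqlite3.Cursor:
        if not hasattr(self._local, 'cur'):
            self._local.cur = self.con.cursor()
        return self._local.cur

Instances of threading.local() look like ordinary objects, but attribute reads and writes are isolated per thread, which is what lets the properties hand back "the right object for the current thread".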
@@ -261,7 +307,7 @@ class CachingFileStore(AbstractFileStore):
                    error=sqlite3.OperationalError,
                    error_message_must_include='is locked')
            ])
-    def
+    def _static_write(con, cur, operations):
        """
        Write to the caching database, using the given connection.

@@ -313,6 +359,35 @@ class CachingFileStore(AbstractFileStore):

        return cur.rowcount

+    @staticmethod
+    @retry(infinite_retries=True,
+           errors=[
+               ErrorCondition(
+                   error=sqlite3.OperationalError,
+                   error_message_must_include='is locked')
+           ])
+    def _static_read(cur: sqlite3.Cursor, query: str, args: Optional[Sequence[Any]] = ()) -> Iterator[Any]:
+        """
+        Read from the database.
+
+        Run the given select query with the given arguments. Yield each result.
+        If the query cannot be run because someone else has a write lock on the
+        database, retry.
+        """
+        # All the real work is the decorators
+        return cur.execute(query, args)
+
+    def _read(self, query: str, args: Optional[Sequence[Any]] = ()) -> Iterator[Any]:
+        """
+        Read from the database using the instance's connection.
+
+        Run the given select query with the given arguments. Yield each result.
+        If the query cannot be run because someone else has a write lock on the
+        database, retry.
+        """
+
+        return self._static_read(self.cur, query, args)
+
    def _write(self, operations):
        """
        Write to the caching database, using the instance's connection
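_static_read and _read above get their robustness from toil's @retry decorator, which keeps re-issuing the query while another worker holds the SQLite write lock. The same behavior can be approximated without the decorator as a plain loop; this is only a sketch of the idea, and the delay value is made up rather than taken from Toil:

import sqlite3
import time

def read_with_retry(cur: sqlite3.Cursor, query: str, args=(), delay: float = 0.5):
    # Keep retrying while another process holds the database write lock.
    while True:
        try:
            return cur.execute(query, args)  # the cursor is iterable, like _read()'s result
        except sqlite3.OperationalError as e:
            if 'is locked' not in str(e):
                raise  # only lock contention is retried; real errors propagate
            time.sleep(delay)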
@@ -331,7 +406,7 @@ class CachingFileStore(AbstractFileStore):
        :rtype: int
        """

-        return self.
+        return self._static_write(self.con, self.cur, operations)

    @classmethod
    def _ensureTables(cls, con):
@@ -344,7 +419,7 @@ class CachingFileStore(AbstractFileStore):
        # Get a cursor
        cur = con.cursor()

-        cls.
+        cls._static_write(con, cur, ["""
            CREATE TABLE IF NOT EXISTS files (
                id TEXT NOT NULL PRIMARY KEY,
                path TEXT UNIQUE NOT NULL,
@@ -399,7 +474,7 @@ class CachingFileStore(AbstractFileStore):
        if self.cachingIsFree():
            return 0

-        for row in self.
+        for row in self._read('SELECT TOTAL(size) FROM files'):
            return row[0]

        raise RuntimeError('Unable to retrieve cache usage')
@@ -417,7 +492,7 @@ class CachingFileStore(AbstractFileStore):
        """

        # Total up the sizes of all the reads of files and subtract it from the total disk reservation of all jobs
-        for row in self.
+        for row in self._read("""
            SELECT (
                (SELECT TOTAL(disk) FROM jobs) -
                (SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state == 'immutable')
@@ -443,24 +518,24 @@ class CachingFileStore(AbstractFileStore):
        # content.

        # Do a little report first
-        for row in self.
+        for row in self._read("SELECT value FROM properties WHERE name = 'maxSpace'"):
            logger.debug('Max space: %d', row[0])
-        for row in self.
+        for row in self._read("SELECT TOTAL(size) FROM files"):
            logger.debug('Total file size: %d', row[0])
-        for row in self.
+        for row in self._read("SELECT TOTAL(disk) FROM jobs"):
            logger.debug('Total job disk requirement size: %d', row[0])
-        for row in self.
+        for row in self._read("SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable'"):
            logger.debug('Total immutable reference size: %d', row[0])

        if self.cachingIsFree():
            # If caching is free, we just say that all the space is always available.
-            for row in self.
+            for row in self._read("SELECT value FROM properties WHERE name = 'maxSpace'"):
                return row[0]

            raise RuntimeError('Unable to retrieve available cache space')


-        for row in self.
+        for row in self._read("""
            SELECT (
                (SELECT value FROM properties WHERE name = 'maxSpace') -
                (SELECT TOTAL(size) FROM files) -
@@ -480,7 +555,7 @@ class CachingFileStore(AbstractFileStore):
        If not retrievable, raises an error.
        """

-        for row in self.
+        for row in self._read("""
            SELECT (
                (SELECT value FROM properties WHERE name = 'maxSpace') -
                (SELECT TOTAL(disk) FROM jobs)
@@ -502,14 +577,14 @@ class CachingFileStore(AbstractFileStore):

        logger.debug('Get unused space for job %s', self.jobID)

-        for row in self.
+        for row in self._read('SELECT * FROM files'):
            logger.debug('File record: %s', str(row))

-        for row in self.
+        for row in self._read('SELECT * FROM refs'):
            logger.debug('Ref record: %s', str(row))


-        for row in self.
+        for row in self._read('SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.job_id = ? AND refs.state != ?',
                              (self.jobID, 'mutable')):
            # Sum up all the sizes of our referenced files, then subtract that from how much we came in with
            return self.jobDiskBytes - row[0]
@@ -532,7 +607,7 @@ class CachingFileStore(AbstractFileStore):
        file you need to do it in a transaction.
        """

-        for row in self.
+        for row in self._read('SELECT COUNT(*) FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
                              (fileID, 'cached', 'uploadable', 'uploading')):

            return row[0] > 0
@@ -545,7 +620,7 @@ class CachingFileStore(AbstractFileStore):
        Counts mutable references too.
        """

-        for row in self.
+        for row in self._read('SELECT COUNT(*) FROM refs WHERE file_id = ?', (fileID,)):
            return row[0]
        return 0

@@ -558,7 +633,7 @@ class CachingFileStore(AbstractFileStore):
        configurations, most notably the FileJobStore.
        """

-        for row in self.
+        for row in self._read('SELECT value FROM properties WHERE name = ?', ('freeCaching',)):
            return row[0] == 1

        # Otherwise we need to set it
@@ -570,7 +645,7 @@ class CachingFileStore(AbstractFileStore):
        emptyID = self.jobStore.getEmptyFileStoreID()

        # Read it out to a generated name.
-        destDir =
+        destDir = mkdtemp(dir=self.localCacheDir)
        cachedFile = os.path.join(destDir, 'sniffLinkCount')
        self.jobStore.read_file(emptyID, cachedFile, symlink=False)

@@ -614,7 +689,7 @@ class CachingFileStore(AbstractFileStore):
        # sure we can never collide even though we are going to remove the
        # file.
        # TODO: use a de-slashed version of the ID instead?
-        handle, path =
+        handle, path = mkstemp(dir=self.localCacheDir, suffix=hasher.hexdigest())
        os.close(handle)
        os.unlink(path)

@@ -627,153 +702,137 @@ class CachingFileStore(AbstractFileStore):
        We don't actually process them here. We take action based on the states of files we own later.
        """

+        with self.as_process() as me:

+            # Get a list of all file owner processes on this node.
+            # Exclude NULL because it comes out as 0 and we can't look for PID 0.
+            owners = []
+            for row in self._read('SELECT DISTINCT owner FROM files WHERE owner IS NOT NULL'):
+                owners.append(row[0])

+            # Work out which of them have died.
+            deadOwners = []
+            for owner in owners:
+                if not process_name_exists(self.coordination_dir, owner):
+                    logger.debug('Owner %s is dead', owner)
+                    deadOwners.append(owner)
+                else:
+                    logger.debug('Owner %s is alive', owner)
+
+            for owner in deadOwners:
+                # Try and adopt all the files that any dead owner had
+
+                # If they were deleting, we delete.
+                # If they were downloading, we delete. Any outstanding references
+                # can't be in use since they are from the dead downloader.
+                # If they were uploading or uploadable, we mark as cached even
+                # though it never made it to the job store (and leave it unowned).
+                #
+                # Once the dead job that it was being uploaded from is cleaned up,
+                # and there are no longer any immutable references, it will be
+                # evicted as normal. Since the dead job can't have been marked
+                # successfully completed (since the file is still not uploaded),
+                # nobody is allowed to actually try and use the file.
+                #
+                # TODO: if we ever let other PIDs be responsible for writing our
+                # files asynchronously, this will need to change.
+                self._write([('UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?',
+                              (me, 'deleting', owner, 'deleting')),
+                             ('UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?',
+                              (me, 'deleting', owner, 'downloading')),
+                             ('UPDATE files SET owner = NULL, state = ? WHERE owner = ? AND (state = ? OR state = ?)',
+                              ('cached', owner, 'uploadable', 'uploading'))])
+
+                logger.debug('Tried to adopt file operations from dead worker %s to ourselves as %s', owner, me)
+
-    def _executePendingDeletions(cls, coordination_dir, con, cur):
+    def _executePendingDeletions(self):
        """
        Delete all the files that are registered in the database as in the
        process of being deleted from the cache by us.

        Returns the number of files that were deleted.
-
-        Implemented as a class method so it can use the database connection
-        appropriate to its thread without any chance of getting at the main
-        thread's connection and cursor in self.
-
-        :param str coordination_dir: The coordination directory.
-        :param sqlite3.Connection con: Connection to the cache database.
-        :param sqlite3.Cursor cur: Cursor in the cache database.
        """

+        with self.as_process() as me:

+            # Remember the file IDs we are deleting
+            deletedFiles = []
+            for row in self._read('SELECT id, path FROM files WHERE owner = ? AND state = ?', (me, 'deleting')):
+                # Grab everything we are supposed to delete and delete it
+                fileID = row[0]
+                filePath = row[1]
+                try:
+                    os.unlink(filePath)
+                    logger.debug('Successfully deleted: %s', filePath)
+                except OSError:
+                    # Probably already deleted
+                    logger.debug('File already gone: %s', filePath)
+                    # Still need to mark it as deleted

+                # Whether we deleted the file or just found out that it is gone, we
+                # need to take credit for deleting it so that we remove it from the
+                # database.
+                deletedFiles.append(fileID)

+            for fileID in deletedFiles:
+                # Drop all the files. They should have stayed in deleting state. We move them from there to not present at all.
+                # Also drop their references, if they had any from dead downloaders.
+                self._write([('DELETE FROM files WHERE id = ? AND state = ?', (fileID, 'deleting')),
+                             ('DELETE FROM refs WHERE file_id = ?', (fileID,))])

+            return len(deletedFiles)

-    def _executePendingUploads(self
+    def _executePendingUploads(self):
        """
        Uploads all files in uploadable state that we own.

        Returns the number of files that were uploaded.
-
-        Needs access to self to get at the job store for uploading files, but
-        still needs to take con and cur so it can run in a thread with the
-        thread's database connection.
-
-        :param sqlite3.Connection con: Connection to the cache database.
-        :param sqlite3.Cursor cur: Cursor in the cache database.
        """

        # Work out who we are
-                # We need to set it to uploading in a way that we can detect that *we* won the update race instead of anyone else.
-                rowCount = self._staticWrite(con, cur, [('UPDATE files SET state = ? WHERE id = ? AND state = ?', ('uploading', fileID, 'uploadable'))])
-                if rowCount != 1:
-                    # We didn't manage to update it. Someone else (a running job if
-                    # we are a committing thread, or visa versa) must have grabbed
-                    # it.
-                    logger.debug('Lost race to upload %s', fileID)
-                    # Try again to see if there is something else to grab.
-                    continue
+        with self.as_process() as me:
+
+            # Record how many files we upload
+            uploadedCount = 0
+            while True:
+                # Try and find a file we might want to upload
+                fileID = None
+                filePath = None
+                for row in self._static_read(self.cur, 'SELECT id, path FROM files WHERE state = ? AND owner = ? LIMIT 1', ('uploadable', me)):
+                    fileID = row[0]
+                    filePath = row[1]
+
+                if fileID is None:
+                    # Nothing else exists to upload
+                    break

+                # We need to set it to uploading in a way that we can detect that *we* won the update race instead of anyone else.
+                rowCount = self._static_write(self.con, self.cur, [('UPDATE files SET state = ? WHERE id = ? AND state = ?', ('uploading', fileID, 'uploadable'))])
+                if rowCount != 1:
+                    # We didn't manage to update it. Someone else (a running job if
+                    # we are a committing thread, or visa versa) must have grabbed
+                    # it.
+                    logger.debug('Lost race to upload %s', fileID)
+                    # Try again to see if there is something else to grab.
+                    continue
+
+                # Upload the file
+                logger.debug('Actually executing upload for file %s', fileID)
+                try:
+                    self.jobStore.update_file(fileID, filePath)
+                except:
+                    # We need to set the state back to 'uploadable' in case of any failures to ensure
+                    # we can retry properly.
+                    self._static_write(self.con, self.cur, [('UPDATE files SET state = ? WHERE id = ? AND state = ?', ('uploadable', fileID, 'uploading'))])
+                    raise

+                # Count it for the total uploaded files value we need to return
+                uploadedCount += 1

+                # Remember that we uploaded it in the database
+                self._static_write(self.con, self.cur, [('UPDATE files SET state = ?, owner = NULL WHERE id = ?', ('cached', fileID))])

+            return uploadedCount

    def _allocateSpaceForJob(self, newJobReqs):
        """
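Throughout these methods, ownership of a file record is claimed with a conditional UPDATE and then confirmed via the statement's rowcount, so exactly one thread or process wins each state transition. A self-contained sketch of that idiom, with the table reduced to just what the check needs (this is an illustration, not Toil's API):

import sqlite3

def try_claim_upload(con: sqlite3.Connection, file_id: str) -> bool:
    # Flip 'uploadable' -> 'uploading' only if the row is still 'uploadable'.
    cur = con.cursor()
    cur.execute("UPDATE files SET state = 'uploading' WHERE id = ? AND state = 'uploadable'",
                (file_id,))
    con.commit()
    # rowcount is 1 only if this statement performed the transition;
    # a competing worker that got there first leaves us with 0.
    return cur.rowcount == 1

Because the condition and the update happen in one SQL statement, no separate lock is needed to decide the race; the loser simply sees rowcount 0 and moves on, which is exactly how the "Lost race to upload" branch behaves.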
@@ -794,23 +853,23 @@ class CachingFileStore(AbstractFileStore):
        # This will take up space for us and potentially make the cache over-full.
        # But we won't actually let the job run and use any of this space until
        # the cache has been successfully cleared out.
+        with self.as_process() as me:
+            self._write([('INSERT INTO jobs VALUES (?, ?, ?, ?)', (self.jobID, self.localTempDir, newJobReqs, me))])

+            # Now we need to make sure that we can fit all currently cached files,
+            # and the parts of the total job requirements not currently spent on
+            # cached files, in under the total disk space limit.

+            available = self.getCacheAvailable()

+            logger.debug('Available space with job: %d bytes', available)

+            if available >= 0:
+                # We're fine on disk space
+                return

+            # Otherwise we need to clear stuff.
+            self._freeUpSpace()

    @classmethod
    def _removeJob(cls, con, cur, jobID):
@@ -827,10 +886,10 @@ class CachingFileStore(AbstractFileStore):
        """

        # Get the job's temp dir
-        for row in
+        for row in cls._static_read(cur, 'SELECT tempdir FROM jobs WHERE id = ?', (jobID,)):
            jobTemp = row[0]

-        for row in
+        for row in cls._static_read(cur, 'SELECT path FROM refs WHERE job_id = ?', (jobID,)):
            try:
                # Delete all the reference files.
                os.unlink(row[0])
@@ -838,7 +897,7 @@ class CachingFileStore(AbstractFileStore):
                # May not exist
                pass
        # And their database entries
-        cls.
+        cls._static_write(con, cur, [('DELETE FROM refs WHERE job_id = ?', (jobID,))])

        try:
            # Delete the job's temp directory to the extent that we can.
@@ -847,7 +906,7 @@ class CachingFileStore(AbstractFileStore):
            pass

        # Strike the job from the database
-        cls.
+        cls._static_write(con, cur, [('DELETE FROM jobs WHERE id = ?', (jobID,))])

    def _deallocateSpaceForJob(self):
        """
@@ -866,66 +925,65 @@ class CachingFileStore(AbstractFileStore):
        Return whether we manage to get any space freed or not.
        """

-        # references to files and keeping them from looking unused.
-        self._removeDeadJobs(self.coordination_dir, self.con)
-        # Adopt work from any dead workers
-        self._stealWorkFromTheDead()
+        with self.as_process() as me:

-            #
-            logger.debug('Successfully executed pending deletions to free space')
-            return True
+            # First we want to make sure that dead jobs aren't holding
+            # references to files and keeping them from looking unused.
+            self._removeDeadJobs(self.coordination_dir, self.con)

-            logger.debug('Successfully executed pending uploads to free space')
-            return True
+            # Adopt work from any dead workers
+            self._stealWorkFromTheDead()

-            #
-            #
+            if self._executePendingDeletions() > 0:
+                # We actually had something to delete, which we deleted.
+                # Maybe there is space now
+                logger.debug('Successfully executed pending deletions to free space')
+                return True
+
+            if self._executePendingUploads() > 0:
+                # We had something to upload. Maybe it can be evicted now.
+                logger.debug('Successfully executed pending uploads to free space')
+                return True
+
+            # Otherwise, not enough files could be found in deleting state to solve our problem.
+            # We need to put something into the deleting state.
+            # TODO: give other people time to finish their in-progress
+            # evictions before starting more, or we might evict everything as
+            # soon as we hit the cache limit.
+
+            # Find something that has no non-mutable references and is not already being deleted.
+            self._read("""
+                SELECT files.id FROM files WHERE files.state = 'cached' AND NOT EXISTS (
+                    SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
+                ) LIMIT 1
+            """)
+            row = self.cur.fetchone()
+            if row is None:
+                # Nothing can be evicted by us.
+                # Someone else might be in the process of evicting something that will free up space for us too.
+                # Or someone mught be uploading something and we have to wait for them to finish before it can be deleted.
+                logger.debug('Could not find anything to evict! Cannot free up space!')
+                return False

+            # Otherwise we found an eviction candidate.
+            fileID = row[0]

+            # Try and grab it for deletion, subject to the condition that nothing has started reading it
+            self._write([("""
+                UPDATE files SET owner = ?, state = ? WHERE id = ? AND state = ?
+                AND owner IS NULL AND NOT EXISTS (
+                    SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
+                )
+                """,
+                (me, 'deleting', fileID, 'cached'))])

+            logger.debug('Evicting file %s', fileID)

+            # Whether we actually got it or not, try deleting everything we have to delete
+            if self._executePendingDeletions() > 0:
+                # We deleted something
+                logger.debug('Successfully executed pending deletions to free space')
+                return True

    def _freeUpSpace(self):
        """
@@ -982,7 +1040,8 @@ class CachingFileStore(AbstractFileStore):
        # Check the status of all jobs on this node. If there are jobs that started and died before
        # cleaning up their presence from the database, clean them up ourselves.
        self._removeDeadJobs(self.coordination_dir, self.con)
-        # Get the
+        # Get the disk requirement for the job, which we will use to know if we
+        # have filled the cache or not.
        self.jobDiskBytes = job.disk

        logger.debug('Actually running job (%s) with ID (%s) which wants %d of our %d bytes.',
@@ -996,22 +1055,6 @@ class CachingFileStore(AbstractFileStore):
            with super().open(job):
                yield
        finally:
-            # See how much disk space is used at the end of the job.
-            # Not a real peak disk usage, but close enough to be useful for warning the user.
-            # TODO: Push this logic into the abstract file store
-            disk: int = getDirSizeRecursively(self.localTempDir)
-            percent: float = 0.0
-            if self.jobDiskBytes and self.jobDiskBytes > 0:
-                percent = float(disk) / self.jobDiskBytes * 100
-            disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
-                               f"{bytes2human(self.jobDiskBytes)}B [{self.jobDiskBytes}B] requested).")
-            if disk > self.jobDiskBytes:
-                self.logToMaster("Job used more disk than requested. For CWL, consider increasing the outdirMin "
-                                 f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
-                                 level=logging.WARNING)
-            else:
-                self.logToMaster(disk_usage, level=logging.DEBUG)
-
            # Go back up to the per-worker local temp directory.
            os.chdir(startingDir)
            self.cleanupInProgress = True
@@ -1036,62 +1079,62 @@ class CachingFileStore(AbstractFileStore):
        # Create an empty file to get an ID.
        # Make sure to pass along the file basename.
        # TODO: this empty file could leak if we die now...
-        fileID = self.jobStore.
+        fileID = self.jobStore.get_empty_file_store_id(creatorID, cleanup, os.path.basename(localFileName))
        # Work out who we are
+        with self.as_process() as me:

+            # Work out where the file ought to go in the cache
+            cachePath = self._getNewCachingPath(fileID)

+            # Create a file in uploadable state and a reference, in the same transaction.
+            # Say the reference is an immutable reference
+            self._write([('INSERT INTO files VALUES (?, ?, ?, ?, ?)', (fileID, cachePath, fileSize, 'uploadable', me)),
+                         ('INSERT INTO refs VALUES (?, ?, ?, ?)', (absLocalFileName, fileID, creatorID, 'immutable'))])

+            if absLocalFileName.startswith(self.localTempDir) and not os.path.islink(absLocalFileName):
+                # We should link into the cache, because the upload is coming from our local temp dir (and not via a symlink in there)
+                try:
+                    # Try and hardlink the file into the cache.
+                    # This can only fail if the system doesn't have hardlinks, or the
+                    # file we're trying to link to has too many hardlinks to it
+                    # already, or something.
+                    os.link(absLocalFileName, cachePath)

+                    linkedToCache = True

+                    logger.debug('Hardlinked file %s into cache at %s; deferring write to job store', localFileName, cachePath)
+                    assert not os.path.islink(cachePath), "Symlink %s has invaded cache!" % cachePath

+                    # Don't do the upload now. Let it be deferred until later (when the job is committing).
+                except OSError:
+                    # We couldn't make the link for some reason
+                    linkedToCache = False
+            else:
+                # If you are uploading a file that physically exists outside the
+                # local temp dir, it should not be linked into the cache. On
+                # systems that support it, we could end up with a
+                # hardlink-to-symlink in the cache if we break this rule, allowing
+                # files to vanish from our cache.
                linkedToCache = False
-        else:
-            # If you are uploading a file that physically exists outside the
-            # local temp dir, it should not be linked into the cache. On
-            # systems that support it, we could end up with a
-            # hardlink-to-symlink in the cache if we break this rule, allowing
-            # files to vanish from our cache.
-            linkedToCache = False


+            if not linkedToCache:
+                # If we can't do the link into the cache and upload from there, we
+                # have to just upload right away. We can't guarantee sufficient
+                # space to make a full copy in the cache, if we aren't allowed to
+                # take this copy away from the writer.

+                # Change the reference to 'mutable', which it will be.
+                # And drop the file altogether.
+                self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', absLocalFileName, fileID)),
+                             ('DELETE FROM files WHERE id = ?', (fileID,))])

+                # Save the file to the job store right now
+                logger.debug('Actually executing upload immediately for file %s', fileID)
+                self.jobStore.update_file(fileID, absLocalFileName)

+            # Ship out the completed FileID object with its real size.
+            return FileID.forPath(fileID, absLocalFileName)

    def readGlobalFile(self, fileStoreID, userPath=None, cache=True, mutable=False, symlink=False):

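writeGlobalFile above now prefers to hardlink the freshly written file into the cache and defer the job-store upload until commit time, falling back to an immediate upload when linking is impossible. A reduced sketch of just that decision; the helper name and return convention here are illustrative, not part of Toil:

import os

def link_into_cache(src_path: str, cache_path: str) -> bool:
    # Hardlinking consumes no extra space: both names point at the same inode.
    try:
        os.link(src_path, cache_path)
        return True          # caller can defer the upload until commit
    except OSError:
        return False         # no hardlink support, too many links, etc.: upload right away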
@@ -1162,7 +1205,7 @@ class CachingFileStore(AbstractFileStore):

        # Find where the file is cached
        cachedPath = None
-        for row in self.
+        for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
            cachedPath = row[0]

        if cachedPath is None:
@@ -1239,130 +1282,130 @@ class CachingFileStore(AbstractFileStore):
        """

        # Work out who we are
+        with self.as_process() as me:

+            # Work out where to cache the file if it isn't cached already
+            cachedPath = self._getNewCachingPath(fileStoreID)

-        # See if we won the race
-        self.cur.execute('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', (fileStoreID, 'downloading', me))
-        if self.cur.fetchone()[0] > 0:
-            # We are responsible for downloading the file
-            logger.debug('We are now responsible for downloading file %s', fileStoreID)
-            # Make sure we have space for this download.
-            self._freeUpSpace()
-            # Do the download into the cache.
-            self._downloadToCache(fileStoreID, cachedPath)
-            # Now, we may have to immediately give away this file, because
-            # we don't have space for two copies.
-            # If so, we can't let it go to cached state, because someone
-            # else might make a reference to it, and we may get stuck with
-            # two readers, one cached copy, and space for two copies total.
-            # Make the copying reference
-            self._write([('INSERT INTO refs VALUES (?, ?, ?, ?)',
-                          (localFilePath, fileStoreID, readerID, 'copying'))])
-            # Fulfill it with a full copy or by giving away the cached copy
-            self._fulfillCopyingReference(fileStoreID, cachedPath, localFilePath)
-            # Now we're done
-            return localFilePath
+            # Start a loop until we can do one of these
+            while True:
+                # Try and create a downloading entry if no entry exists
+                logger.debug('Trying to make file record for id %s', fileStoreID)
+                self._write([('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)',
+                              (fileStoreID, cachedPath, self.getGlobalFileSize(fileStoreID), 'downloading', me))])

-        # A record already existed for this file.
-        # Try and create an immutable or copying reference to an entry that
-        # is in 'cached' or 'uploadable' or 'uploading' state.
-        # It might be uploading because *we* are supposed to be uploading it.
-        logger.debug('Trying to make reference to file %s', fileStoreID)
-        self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
-                      (localFilePath, readerID, 'copying', fileStoreID, 'cached', 'uploadable', 'uploading'))])
-        # See if we got it
-        self.cur.execute('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID))
+                # See if we won the race
+                self._read('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', (fileStoreID, 'downloading', me))
                if self.cur.fetchone()[0] > 0:
-            #
-            logger.debug('
-            # Get the path it is actually at in the cache, instead of where we wanted to put it
-            for row in self.cur.execute('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
-                cachedPath = row[0]
-            while self.getCacheAvailable() < 0:
-                # Since we now have a copying reference, see if we have used too much space.
-                # If so, try to free up some space by deleting or uploading, but
-                # don't loop forever if we can't get enough.
-                self._tryToFreeUpSpace()
-                if self.getCacheAvailable() >= 0:
-                    # We made room
-                    break
-                # See if we have no other references and we can give away the file.
-                # Change it to downloading owned by us if we can grab it.
-                self._write([("""
-                    UPDATE files SET files.owner = ?, files.state = ? WHERE files.id = ? AND files.state = ?
-                    AND files.owner IS NULL AND NOT EXISTS (
-                        SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
-                    )
-                    """,
-                    (me, 'downloading', fileStoreID, 'cached'))])
-                if self._giveAwayDownloadingFile(fileStoreID, cachedPath, localFilePath):
-                    # We got ownership of the file and managed to give it away.
-                    return localFilePath
+                    # We are responsible for downloading the file
+                    logger.debug('We are now responsible for downloading file %s', fileStoreID)

-                # need to wait for one of those people with references to the file
-                # to finish and give it up.
-                # TODO: work out if that will never happen somehow.
-                time.sleep(self.contentionBackoff)
+                    # Make sure we have space for this download.
+                    self._freeUpSpace()

-            #
+                    # Do the download into the cache.
+                    self._downloadToCache(fileStoreID, cachedPath)

+                    # Now, we may have to immediately give away this file, because
+                    # we don't have space for two copies.
+                    # If so, we can't let it go to cached state, because someone
+                    # else might make a reference to it, and we may get stuck with
+                    # two readers, one cached copy, and space for two copies total.

-            # Make the
+                    # Make the copying reference
+                    self._write([('INSERT INTO refs VALUES (?, ?, ?, ?)',
+                                  (localFilePath, fileStoreID, readerID, 'copying'))])

-            #
-            self.
+                    # Fulfill it with a full copy or by giving away the cached copy
+                    self._fulfillCopyingReference(fileStoreID, cachedPath, localFilePath)

                    # Now we're done
                    return localFilePath

                else:
+                    logger.debug('Someone else is already responsible for file %s', fileStoreID)
+
+                    # A record already existed for this file.
+                    # Try and create an immutable or copying reference to an entry that
+                    # is in 'cached' or 'uploadable' or 'uploading' state.
+                    # It might be uploading because *we* are supposed to be uploading it.
+                    logger.debug('Trying to make reference to file %s', fileStoreID)
+                    self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
+                                  (localFilePath, readerID, 'copying', fileStoreID, 'cached', 'uploadable', 'uploading'))])
+
+                    # See if we got it
+                    self._read('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID))
+                    if self.cur.fetchone()[0] > 0:
+                        # The file is cached and we can copy or link it
+                        logger.debug('Obtained reference to file %s', fileStoreID)
+
+                        # Get the path it is actually at in the cache, instead of where we wanted to put it
+                        for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
+                            cachedPath = row[0]
+
+
+                        while self.getCacheAvailable() < 0:
+                            # Since we now have a copying reference, see if we have used too much space.
+                            # If so, try to free up some space by deleting or uploading, but
+                            # don't loop forever if we can't get enough.
+                            self._tryToFreeUpSpace()
+
+                            if self.getCacheAvailable() >= 0:
+                                # We made room
+                                break
+
+                            # See if we have no other references and we can give away the file.
+                            # Change it to downloading owned by us if we can grab it.
+                            self._write([("""
+                                UPDATE files SET files.owner = ?, files.state = ? WHERE files.id = ? AND files.state = ?
+                                AND files.owner IS NULL AND NOT EXISTS (
+                                    SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
+                                )
+                                """,
+                                (me, 'downloading', fileStoreID, 'cached'))])
+
+                            if self._giveAwayDownloadingFile(fileStoreID, cachedPath, localFilePath):
+                                # We got ownership of the file and managed to give it away.
+                                return localFilePath
+
+                            # If we don't have space, and we couldn't make space, and we
+                            # couldn't get exclusive control of the file to give it away, we
+                            # need to wait for one of those people with references to the file
+                            # to finish and give it up.
+                            # TODO: work out if that will never happen somehow.
+                            time.sleep(self.contentionBackoff)
+
+                        # OK, now we have space to make a copy.
+
+                        if self.forceDownloadDelay is not None:
+                            # Wait around to simulate a big file for testing
+                            time.sleep(self.forceDownloadDelay)
+
+                        # Make the copy
+                        atomic_copy(cachedPath, localFilePath)
+
+                        # Change the reference to mutable
+                        self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', localFilePath, fileStoreID))])
+
+                        # Now we're done
+                        return localFilePath

+                    else:
+                        # We didn't get a reference. Maybe it is still downloading.
+                        logger.debug('Could not obtain reference to file %s', fileStoreID)

+                        # Loop around again and see if either we can download it or we can get a reference to it.
+
+                        # If we didn't get a download or a reference, adopt and do work
+                        # from dead workers and loop again.
+                        # We may have to wait for someone else's download or delete to
+                        # finish. If they die, we will notice.
+                        self._removeDeadJobs(self.coordination_dir, self.con)
+                        self._stealWorkFromTheDead()
+                        self._executePendingDeletions()

+                        # Wait for other people's downloads to progress before re-polling.
+                        time.sleep(self.contentionBackoff)

    def _fulfillCopyingReference(self, fileStoreID, cachedPath, localFilePath):
        """
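Both readGlobalFile paths above win the right to download a file by issuing INSERT OR IGNORE, which only creates the 'downloading' record if no record exists, and then checking whether the surviving record belongs to them. A self-contained sketch of that handshake against a reduced files table (an illustration under those assumptions, not Toil's API):

import sqlite3

def try_become_downloader(con: sqlite3.Connection, file_id: str, cache_path: str,
                          size: int, me: str) -> bool:
    cur = con.cursor()
    # Only creates the record if no row with this id exists yet.
    cur.execute('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)',
                (file_id, cache_path, size, 'downloading', me))
    con.commit()
    # If the surviving record is ours and still 'downloading', we won the race
    # and are responsible for the download; otherwise someone else owns it.
    cur.execute('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?',
                (file_id, 'downloading', me))
    return cur.fetchone()[0] > 0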
@@ -1422,27 +1465,27 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1422
1465
|
"""
|
|
1423
1466
|
|
|
1424
1467
|
# Work out who we are
|
|
1425
|
-
|
|
1468
|
+
with self.as_process() as me:
|
|
1426
1469
|
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1470
|
+
# See if we actually own this file and can giove it away
|
|
1471
|
+
self._read('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?',
|
|
1472
|
+
(fileStoreID, 'downloading', me))
|
|
1473
|
+
if self.cur.fetchone()[0] > 0:
|
|
1474
|
+
# Now we have exclusive control of the cached copy of the file, so we can give it away.
|
|
1432
1475
|
|
|
1433
|
-
|
|
1476
|
+
# Don't fake a delay here; this should be a rename always.
|
|
1434
1477
|
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1478
|
+
# We are giving it away
|
|
1479
|
+
shutil.move(cachedPath, localFilePath)
|
|
1480
|
+
# Record that.
|
|
1481
|
+
self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', localFilePath, fileStoreID)),
|
|
1482
|
+
('DELETE FROM files WHERE id = ?', (fileStoreID,))])
|
|
1440
1483
|
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1484
|
+
# Now we're done
|
|
1485
|
+
return True
|
|
1486
|
+
else:
|
|
1487
|
+
# We don't own this file in 'downloading' state
|
|
1488
|
+
return False
|
|
1446
1489
|
|
|
1447
1490
|
def _createLinkFromCache(self, cachedPath, localFilePath, symlink=True):
|
|
1448
1491
|
"""
|
|
@@ -1493,108 +1536,108 @@ class CachingFileStore(AbstractFileStore):
         # Now we know to use the cache, and that we don't require a mutable copy.

         # Work out who we are
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                # Make sure we have space for this download.
-                self._freeUpSpace()
-
-                # Do the download into the cache.
-                self._downloadToCache(fileStoreID, cachedPath)
-
-                # Try and make the link before we let the file go to cached state.
-                # If we fail we may end up having to give away the file we just downloaded.
-                if self._createLinkFromCache(cachedPath, localFilePath, symlink):
-                    # We made the link!
-
-                    # Change file state from downloading to cached so other people can use it
-                    self._write([('UPDATE files SET state = ?, owner = NULL WHERE id = ?',
-                                  ('cached', fileStoreID))])
-
-                    # Now we're done!
-                    return localFilePath
-                else:
-                    # We could not make a link. We need to make a copy.
+        with self.as_process() as me:
+
+            # Work out where to cache the file if it isn't cached already
+            cachedPath = self._getNewCachingPath(fileStoreID)
+
+            # Start a loop until we can do one of these
+            while True:
+                # Try and create a downloading entry if no entry exists.
+                # Make sure to create a reference at the same time if it succeeds, to bill it against our job's space.
+                # Don't create the mutable reference yet because we might not necessarily be able to clear that space.
+                logger.debug('Trying to make file downloading file record and reference for id %s', fileStoreID)
+                self._write([('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)',
+                              (fileStoreID, cachedPath, self.getGlobalFileSize(fileStoreID), 'downloading', me)),
+                             ('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND state = ? AND owner = ?',
+                              (localFilePath, readerID, 'immutable', fileStoreID, 'downloading', me))])
+
+                # See if we won the race
+                self._read('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', (fileStoreID, 'downloading', me))
+                if self.cur.fetchone()[0] > 0:
+                    # We are responsible for downloading the file (and we have the reference)
+                    logger.debug('We are now responsible for downloading file %s', fileStoreID)

-                #
-                self.
+                    # Make sure we have space for this download.
+                    self._freeUpSpace()

-                #
-                self.
+                    # Do the download into the cache.
+                    self._downloadToCache(fileStoreID, cachedPath)

-                #
-
-
-
-                logger.debug('We already have an entry in the cache database for file %s', fileStoreID)
-
-                # A record already existed for this file.
-                # Try and create an immutable reference to an entry that
-                # is in 'cached' or 'uploadable' or 'uploading' state.
-                # It might be uploading because *we* are supposed to be uploading it.
-                logger.debug('Trying to make reference to file %s', fileStoreID)
-                self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
-                              (localFilePath, readerID, 'immutable', fileStoreID, 'cached', 'uploadable', 'uploading'))])
-
-                # See if we got it
-                self.cur.execute('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID))
-                if self.cur.fetchone()[0] > 0:
-                    # The file is cached and we can copy or link it
-                    logger.debug('Obtained reference to file %s', fileStoreID)
+                    # Try and make the link before we let the file go to cached state.
+                    # If we fail we may end up having to give away the file we just downloaded.
+                    if self._createLinkFromCache(cachedPath, localFilePath, symlink):
+                        # We made the link!

-
-
-
+                        # Change file state from downloading to cached so other people can use it
+                        self._write([('UPDATE files SET state = ?, owner = NULL WHERE id = ?',
+                                      ('cached', fileStoreID))])

-
-                    # We managed to make the link
+                        # Now we're done!
                         return localFilePath
                     else:
-                    # We
+                        # We could not make a link. We need to make a copy.
+
+                        # Change the reference to copying.
+                        self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('copying', localFilePath, fileStoreID))])

-                    #
-
-                    # the file if there isn't space, and give it away, but
-                    # we already have code for that for mutable downloads,
-                    # so just clear the reference and download mutably.
+                        # Fulfill it with a full copy or by giving away the cached copy
+                        self._fulfillCopyingReference(fileStoreID, cachedPath, localFilePath)

-
+                        # Now we're done
+                        return localFilePath

-                    return self._readGlobalFileMutablyWithCache(fileStoreID, localFilePath, readerID)
                 else:
-                    logger.debug('
+                    logger.debug('We already have an entry in the cache database for file %s', fileStoreID)
+
+                    # A record already existed for this file.
+                    # Try and create an immutable reference to an entry that
+                    # is in 'cached' or 'uploadable' or 'uploading' state.
+                    # It might be uploading because *we* are supposed to be uploading it.
+                    logger.debug('Trying to make reference to file %s', fileStoreID)
+                    self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
+                                  (localFilePath, readerID, 'immutable', fileStoreID, 'cached', 'uploadable', 'uploading'))])
+
+                    # See if we got it
+                    self._read('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID))
+                    if self.cur.fetchone()[0] > 0:
+                        # The file is cached and we can copy or link it
+                        logger.debug('Obtained reference to file %s', fileStoreID)
+
+                        # Get the path it is actually at in the cache, instead of where we wanted to put it
+                        for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
+                            cachedPath = row[0]
+
+                        if self._createLinkFromCache(cachedPath, localFilePath, symlink):
+                            # We managed to make the link
+                            return localFilePath
+                        else:
+                            # We can't make the link. We need a copy instead.
+
+                            # We could change the reference to copying, see if
+                            # there's space, make the copy, try and get ahold of
+                            # the file if there isn't space, and give it away, but
+                            # we already have code for that for mutable downloads,
+                            # so just clear the reference and download mutably.

-
-                    # We may have to wait for someone else's download or delete to
-                    # finish. If they die, we will notice.
-                    self._removeDeadJobs(self.coordination_dir, self.con)
-                    self._stealWorkFromTheDead()
-                    # We may have acquired ownership of partially-downloaded
-                    # files, now in deleting state, that we need to delete
-                    # before we can download them.
-                    self._executePendingDeletions(self.coordination_dir, self.con, self.cur)
+                            self._write([('DELETE FROM refs WHERE path = ? AND file_id = ?', (localFilePath, fileStoreID))])

-
-
+                            return self._readGlobalFileMutablyWithCache(fileStoreID, localFilePath, readerID)
+                    else:
+                        logger.debug('Could not obtain reference to file %s', fileStoreID)
+
+                        # If we didn't get a download or a reference, adopt and do work from dead workers and loop again.
+                        # We may have to wait for someone else's download or delete to
+                        # finish. If they die, we will notice.
+                        self._removeDeadJobs(self.coordination_dir, self.con)
+                        self._stealWorkFromTheDead()
+                        # We may have acquired ownership of partially-downloaded
+                        # files, now in deleting state, that we need to delete
+                        # before we can download them.
+                        self._executePendingDeletions()
+
+                        # Wait for other people's downloads to progress.
+                        time.sleep(self.contentionBackoff)

     @contextmanager
     def _with_copying_reference_to_upload(self, file_store_id: FileID, reader_id: str, local_file_path: Optional[str] = None) -> Generator:
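The new download loop claims responsibility for a file by issuing `INSERT OR IGNORE` for a `downloading` record and then checking whether its own name came back as the owner; only the winner downloads, while everyone else takes a reference or retries. A minimal illustration of that claim-then-verify race, again over a hypothetical schema rather than Toil's real one:

```python
import sqlite3

def try_claim_download(con, me, file_id, cached_path, size):
    """Return True if this process won the race to create the 'downloading' record."""
    # Illustrative only: simplified stand-in for the cache database shown in the hunk above.
    cur = con.cursor()
    # Only one INSERT can succeed for a given id; losers are silently ignored.
    cur.execute('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)',
                (file_id, cached_path, size, 'downloading', me))
    con.commit()
    # Whoever reads back their own name as the owner is the downloader.
    cur.execute('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?',
                (file_id, 'downloading', me))
    return cur.fetchone()[0] > 0

if __name__ == '__main__':
    con = sqlite3.connect(':memory:')
    con.execute('CREATE TABLE files (id TEXT PRIMARY KEY, path TEXT, size INT, state TEXT, owner TEXT)')
    print(try_claim_download(con, 'worker-1', 'f1', '/tmp/cache/f1', 4))  # True: we won the race
    print(try_claim_download(con, 'worker-2', 'f1', '/tmp/cache/f1', 4))  # False: worker-1 already owns it
```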
@@ -1624,7 +1667,7 @@ class CachingFileStore(AbstractFileStore):

         # See if we got it
         have_reference = False
-        for row in self.
+        for row in self._read('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (local_file_path, file_store_id)):
             have_reference = row[0] > 0

         if have_reference:
@@ -1651,12 +1694,12 @@ class CachingFileStore(AbstractFileStore):
         # Try and grab a reference to the file if it is being uploaded.
         if ref_path is not None:
             # We have an update in the cache that isn't written back yet.
-            # So we must stream from the
+            # So we must stream from the cache for consistency.

             # The ref file is not actually copied to; find the actual file
             # in the cache
             cached_path = None
-            for row in self.
+            for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
                 cached_path = row[0]

             if cached_path is None:
@@ -1666,7 +1709,7 @@ class CachingFileStore(AbstractFileStore):
                 # Pass along the results of the open context manager on the
                 # file in the cache.
                 yield result
-                # When we exit the with the copying reference will go away and
+                # When we exit the with, the copying reference will go away and
                 # the file will be allowed to leave the cache again.
         else:
             # No local update, so we can stream from the job store
@@ -1684,7 +1727,7 @@ class CachingFileStore(AbstractFileStore):
         # missing ref file, we will raise an error about it and stop deleting
         # things.
         missingFile = None
-        for row in self.
+        for row in self._read('SELECT path FROM refs WHERE file_id = ? AND job_id = ?', (fileStoreID, jobID)):
             # Delete all the files that are references to this cached file (even mutable copies)
             path = row[0]

@@ -1735,25 +1778,25 @@ class CachingFileStore(AbstractFileStore):
             raise

         # Work out who we are
-
+        with self.as_process() as me:

-
-
-
-
-
+            # Make sure nobody else has references to it
+            for row in self._read('SELECT job_id FROM refs WHERE file_id = ? AND state != ?', (fileStoreID, 'mutable')):
+                raise RuntimeError(f'Deleted file ID {fileStoreID} which is still in use by job {row[0]}')
+            # TODO: should we just let other jobs and the cache keep the file until
+            # it gets evicted, and only delete at the back end?

-
-
+            # Pop the file into deleting state owned by us if it exists
+            self._write([('UPDATE files SET state = ?, owner = ? WHERE id = ?', ('deleting', me, fileStoreID))])

-
-
+            # Finish the delete if the file is present
+            self._executePendingDeletions()

-
-
-
-
-
+            # Add the file to the list of files to be deleted from the job store
+            # once the run method completes.
+            self.filesToDelete.add(str(fileStoreID))
+            self.log_to_leader('Added file with ID \'%s\' to the list of files to be' % fileStoreID +
+                               ' globally deleted.', level=logging.DEBUG)

     @deprecated(new_function_name='export_file')
     def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None:
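`deleteGlobalFile` now refuses to delete a file that any job still holds a non-mutable reference to, and then claims the cache entry by flipping it to `deleting` under this process's ownership before the actual unlink. A simplified sketch of that guard, with a hypothetical `files`/`refs` layout standing in for the real cache schema:

```python
import sqlite3

def mark_for_deletion(con, me, file_id):
    """Move a cached file into 'deleting' state under our ownership, refusing if it is still referenced."""
    # Illustrative only: not the actual CachingFileStore schema.
    cur = con.cursor()
    for row in cur.execute('SELECT job_id FROM refs WHERE file_id = ? AND state != ?',
                           (file_id, 'mutable')):
        # Another job still holds a live reference; deleting now would break it.
        raise RuntimeError(f'Deleted file ID {file_id} which is still in use by job {row[0]}')
    # Claim the entry so only this process will unlink the cached data.
    cur.execute('UPDATE files SET state = ?, owner = ? WHERE id = ?', ('deleting', me, file_id))
    con.commit()

if __name__ == '__main__':
    con = sqlite3.connect(':memory:')
    con.execute('CREATE TABLE files (id TEXT PRIMARY KEY, path TEXT, size INT, state TEXT, owner TEXT)')
    con.execute('CREATE TABLE refs (path TEXT, file_id TEXT, job_id TEXT, state TEXT)')
    con.execute('INSERT INTO files VALUES (?, ?, ?, ?, ?)', ('f1', '/tmp/cache/f1', 4, 'cached', None))
    mark_for_deletion(con, 'worker-1', 'f1')
    print(con.execute('SELECT state, owner FROM files').fetchone())  # ('deleting', 'worker-1')
```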
@@ -1768,7 +1811,7 @@ class CachingFileStore(AbstractFileStore):
         # until they are done.

         # For safety and simplicity, we just execute all pending uploads now.
-        self._executePendingUploads(
+        self._executePendingUploads()

         # Then we let the job store export. TODO: let the export come from the
         # cache? How would we write the URL?
@@ -1796,11 +1839,37 @@ class CachingFileStore(AbstractFileStore):
         # value?) wait on it, so we can't forget to join it later.
         self.waitForCommit()

+        if len(self.jobDesc.filesToDelete) > 0:
+            raise RuntimeError("Job is already in the process of being committed!")
+
+        state_to_commit: Optional[JobDescription] = None
+
+        if jobState:
+            # Clone the current job description, so that further updates to it
+            # (such as new successors being added when it runs) occur after the
+            # commit process, and aren't committed early or partially.
+            state_to_commit = copy.deepcopy(self.jobDesc)
+            # Also snapshot the files that should be seen as deleted once the
+            # update of the job description is visible.
+            state_to_commit.filesToDelete = list(self.filesToDelete)
+            # TODO: We never clear this out on the file store itself. This
+            # might be necessary for later jobs to see earlier jobs' deleted
+            # before they are committed?
+
+            logger.debug('Starting commit of %s forked from %s', state_to_commit, self.jobDesc)
+            # Make sure the deep copy isn't summoning ghosts of old job
+            # versions. It must be as new or newer at this point.
+            self.jobDesc.check_new_version(state_to_commit)
+
+            # Bump the original's version since saving will do that too and we
+            # don't want duplicate versions.
+            self.jobDesc.reserve_versions(1 if len(state_to_commit.filesToDelete) == 0 else 2)
+
         # Start the commit thread
-        self.commitThread = threading.Thread(target=self.startCommitThread, args=(
+        self.commitThread = threading.Thread(target=self.startCommitThread, args=(state_to_commit,))
         self.commitThread.start()

-    def startCommitThread(self,
+    def startCommitThread(self, state_to_commit: Optional[JobDescription]):
         """
         Run in a thread to actually commit the current job.
         """
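The commit path now deep-copies the job description and snapshots `filesToDelete` before handing the result to the commit thread, so mutations made after the commit starts cannot leak into the asynchronous write. A generic sketch of that snapshot-then-commit idiom; `JobState` and `save` below are stand-ins, not Toil classes:

```python
import copy
import threading
from dataclasses import dataclass, field
from typing import List

@dataclass
class JobState:
    # Illustrative stand-in for a job description object.
    name: str
    files_to_delete: List[str] = field(default_factory=list)

def start_commit(state: JobState, pending_deletes: List[str], save) -> threading.Thread:
    """Snapshot the state, then commit the snapshot on a background thread."""
    snapshot = copy.deepcopy(state)                   # later edits to `state` are not seen
    snapshot.files_to_delete = list(pending_deletes)  # freeze the delete list with the snapshot
    thread = threading.Thread(target=save, args=(snapshot,))
    thread.start()
    return thread

if __name__ == '__main__':
    committed = []
    job = JobState('job-1')
    thread = start_commit(job, ['file-a'], committed.append)
    job.name = 'job-1-mutated-after-commit'           # does not affect the committed snapshot
    thread.join()
    print(committed[0].name, committed[0].files_to_delete)  # job-1 ['file-a']
```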
@@ -1810,38 +1879,28 @@ class CachingFileStore(AbstractFileStore):
         self.waitForPreviousCommit()

         try:
-            # Reconnect to the database from this thread. The main thread can
-            # keep using self.con and self.cur. We need to do this because
-            # SQLite objects are tied to a thread.
-            con = sqlite3.connect(self.dbPath, timeout=SQLITE_TIMEOUT_SECS)
-            cur = con.cursor()
-
             logger.debug('Committing file uploads asynchronously')

             # Finish all uploads
-            self._executePendingUploads(
+            self._executePendingUploads()
             # Finish all deletions out of the cache (not from the job store)
-            self._executePendingDeletions(
+            self._executePendingDeletions()

-            if
+            if state_to_commit is not None:
                 # Do all the things that make this job not redoable

-                logger.debug('Committing file deletes and job state changes asynchronously')
+                logger.debug('Committing file deletes and job state changes asynchronously from %s', state_to_commit)

-                # Indicate any files that should be deleted once the update of
-                # the job wrapper is completed.
-                self.jobDesc.filesToDelete = list(self.filesToDelete)
                 # Complete the job
-                self.jobStore.update_job(
-                # Delete
-                list(map(self.jobStore.
-                # Delete any remnant files
-                list(map(self.jobStore.delete_file, self.filesToDelete))
+                self.jobStore.update_job(state_to_commit)
+                # Delete the files
+                list(map(self.jobStore.delete_file, state_to_commit.filesToDelete))
                 # Remove the files to delete list, having successfully removed the files
-                if len(
-
+                if len(state_to_commit.filesToDelete) > 0:
+                    state_to_commit.filesToDelete = []
                     # Update, removing emptying files to delete
-                    self.jobStore.update_job(
+                    self.jobStore.update_job(state_to_commit)
+
         except:
             self._terminateEvent.set()
             raise
@@ -1852,14 +1911,14 @@ class CachingFileStore(AbstractFileStore):
     def shutdown(cls, shutdown_info: Tuple[str, str]) -> None:
         """
         :param shutdown_info: Tuple of the coordination directory (where the
-            cache database is) and the cache directory (where the cached data is).
-
+            cache database is) and the cache directory (where the cached data is).
+
         Job local temp directories will be removed due to their appearance in
         the database.
         """
-
+
         coordination_dir, cache_dir = shutdown_info
-
+
         if os.path.isdir(cache_dir):
             # There is a directory to clean up

@@ -1877,7 +1936,7 @@ class CachingFileStore(AbstractFileStore):
             # and use that.
             dbFilename = None
             dbAttempt = float('-inf')
-
+
             # We also need to remember all the plausible database files and
             # journals
             all_db_files = []
@@ -1929,7 +1988,7 @@ class CachingFileStore(AbstractFileStore):
             for filename in all_db_files:
                 # And delete everything related to the caching database
                 robust_rmtree(filename)
-
+
     def __del__(self):
         """
         Cleanup function that is run when destroying the class instance that ensures that all the
@@ -1951,12 +2010,14 @@ class CachingFileStore(AbstractFileStore):
         # Get a cursor
         cur = con.cursor()

-        #
+        # We're allowed to assign jobs to us without acquiring the process
+        # identity lock; we know it won't interfere with any of the other logic
+        # happening under our process's identity in the database.
         me = get_process_name(coordination_dir)

         # Get all the dead worker PIDs
         workers = []
-        for row in
+        for row in cls._static_read(cur, 'SELECT DISTINCT worker FROM jobs WHERE worker IS NOT NULL'):
             workers.append(row[0])

         # Work out which of them are not currently running.
@@ -1969,14 +2030,14 @@ class CachingFileStore(AbstractFileStore):
         # Now we know which workers are dead.
         # Clear them off of the jobs they had.
         for deadWorker in deadWorkers:
-            cls.
+            cls._static_write(con, cur, [('UPDATE jobs SET worker = NULL WHERE worker = ?', (deadWorker,))])
         if len(deadWorkers) > 0:
             logger.debug('Reaped %d dead workers', len(deadWorkers))

         while True:
             # Find an unowned job.
             # Don't take all of them; other people could come along and want to help us with the other jobs.
-
+            cls._static_read(cur, 'SELECT id FROM jobs WHERE worker IS NULL LIMIT 1')
             row = cur.fetchone()
             if row is None:
                 # We cleaned up all the jobs
@@ -1985,10 +2046,10 @@ class CachingFileStore(AbstractFileStore):
             jobID = row[0]

             # Try to own this job
-            cls.
+            cls._static_write(con, cur, [('UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL', (me, jobID))])

             # See if we won the race
-
+            cls._static_read(cur, 'SELECT id, tempdir FROM jobs WHERE id = ? AND worker = ?', (jobID, me))
             row = cur.fetchone()
             if row is None:
                 # We didn't win the race. Try another one.
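Cleanup claims orphaned jobs optimistically: an `UPDATE ... WHERE worker IS NULL` that only fires if the job is still unowned, followed by a re-read to confirm this process actually became the owner. A standalone sketch of that loop over a hypothetical `jobs` table:

```python
import sqlite3

def claim_next_job(con, me):
    """Claim one unowned job; return (id, tempdir) or None if nothing is left or we lost the race."""
    # Illustrative only: simplified stand-in for the jobs table used above.
    cur = con.cursor()
    cur.execute('SELECT id FROM jobs WHERE worker IS NULL LIMIT 1')
    row = cur.fetchone()
    if row is None:
        return None                                   # nothing left to clean up
    job_id = row[0]
    # Only succeeds if the job is still unowned when the UPDATE runs.
    cur.execute('UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL', (me, job_id))
    con.commit()
    # Re-read to see whether we actually won against other cleaners.
    cur.execute('SELECT id, tempdir FROM jobs WHERE id = ? AND worker = ?', (job_id, me))
    return cur.fetchone()

if __name__ == '__main__':
    con = sqlite3.connect(':memory:')
    con.execute('CREATE TABLE jobs (id TEXT PRIMARY KEY, tempdir TEXT, worker TEXT)')
    con.execute("INSERT INTO jobs VALUES ('j1', '/tmp/j1', NULL)")
    print(claim_next_job(con, 'cleaner-1'))  # ('j1', '/tmp/j1')
    print(claim_next_job(con, 'cleaner-2'))  # None: nothing unowned remains
```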
|