toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. toil/__init__.py +18 -13
  2. toil/batchSystems/abstractBatchSystem.py +39 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
  4. toil/batchSystems/awsBatch.py +14 -14
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +3 -3
  7. toil/batchSystems/htcondor.py +0 -1
  8. toil/batchSystems/kubernetes.py +34 -31
  9. toil/batchSystems/local_support.py +3 -1
  10. toil/batchSystems/lsf.py +7 -7
  11. toil/batchSystems/mesos/batchSystem.py +7 -7
  12. toil/batchSystems/options.py +32 -83
  13. toil/batchSystems/registry.py +104 -23
  14. toil/batchSystems/singleMachine.py +16 -13
  15. toil/batchSystems/slurm.py +87 -16
  16. toil/batchSystems/torque.py +0 -1
  17. toil/bus.py +44 -8
  18. toil/common.py +544 -753
  19. toil/cwl/__init__.py +28 -32
  20. toil/cwl/cwltoil.py +595 -574
  21. toil/cwl/utils.py +55 -10
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/__init__.py +2 -2
  24. toil/fileStores/abstractFileStore.py +88 -14
  25. toil/fileStores/cachingFileStore.py +610 -549
  26. toil/fileStores/nonCachingFileStore.py +46 -22
  27. toil/job.py +182 -101
  28. toil/jobStores/abstractJobStore.py +161 -95
  29. toil/jobStores/aws/jobStore.py +23 -9
  30. toil/jobStores/aws/utils.py +6 -6
  31. toil/jobStores/fileJobStore.py +116 -18
  32. toil/jobStores/googleJobStore.py +16 -7
  33. toil/jobStores/utils.py +5 -6
  34. toil/leader.py +87 -56
  35. toil/lib/accelerators.py +10 -5
  36. toil/lib/aws/__init__.py +3 -14
  37. toil/lib/aws/ami.py +22 -9
  38. toil/lib/aws/iam.py +21 -13
  39. toil/lib/aws/session.py +2 -16
  40. toil/lib/aws/utils.py +4 -5
  41. toil/lib/compatibility.py +1 -1
  42. toil/lib/conversions.py +26 -3
  43. toil/lib/docker.py +22 -23
  44. toil/lib/ec2.py +10 -6
  45. toil/lib/ec2nodes.py +106 -100
  46. toil/lib/encryption/_nacl.py +2 -1
  47. toil/lib/generatedEC2Lists.py +325 -18
  48. toil/lib/io.py +49 -2
  49. toil/lib/misc.py +1 -1
  50. toil/lib/resources.py +9 -2
  51. toil/lib/threading.py +101 -38
  52. toil/options/common.py +736 -0
  53. toil/options/cwl.py +336 -0
  54. toil/options/wdl.py +37 -0
  55. toil/provisioners/abstractProvisioner.py +9 -4
  56. toil/provisioners/aws/__init__.py +3 -6
  57. toil/provisioners/aws/awsProvisioner.py +6 -0
  58. toil/provisioners/clusterScaler.py +3 -2
  59. toil/provisioners/gceProvisioner.py +2 -2
  60. toil/realtimeLogger.py +2 -1
  61. toil/resource.py +24 -18
  62. toil/server/app.py +2 -3
  63. toil/server/cli/wes_cwl_runner.py +4 -4
  64. toil/server/utils.py +1 -1
  65. toil/server/wes/abstract_backend.py +3 -2
  66. toil/server/wes/amazon_wes_utils.py +5 -4
  67. toil/server/wes/tasks.py +2 -3
  68. toil/server/wes/toil_backend.py +2 -10
  69. toil/server/wsgi_app.py +2 -0
  70. toil/serviceManager.py +12 -10
  71. toil/statsAndLogging.py +41 -9
  72. toil/test/__init__.py +29 -54
  73. toil/test/batchSystems/batchSystemTest.py +11 -111
  74. toil/test/batchSystems/test_slurm.py +24 -8
  75. toil/test/cactus/__init__.py +0 -0
  76. toil/test/cactus/test_cactus_integration.py +58 -0
  77. toil/test/cwl/cwlTest.py +438 -223
  78. toil/test/cwl/glob_dir.cwl +15 -0
  79. toil/test/cwl/preemptible.cwl +21 -0
  80. toil/test/cwl/preemptible_expression.cwl +28 -0
  81. toil/test/cwl/revsort.cwl +1 -1
  82. toil/test/cwl/revsort2.cwl +1 -1
  83. toil/test/docs/scriptsTest.py +2 -3
  84. toil/test/jobStores/jobStoreTest.py +34 -21
  85. toil/test/lib/aws/test_iam.py +4 -14
  86. toil/test/lib/aws/test_utils.py +0 -3
  87. toil/test/lib/dockerTest.py +4 -4
  88. toil/test/lib/test_ec2.py +12 -17
  89. toil/test/mesos/helloWorld.py +4 -5
  90. toil/test/mesos/stress.py +1 -1
  91. toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
  92. toil/test/options/options.py +37 -0
  93. toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
  94. toil/test/provisioners/clusterScalerTest.py +6 -4
  95. toil/test/provisioners/clusterTest.py +23 -11
  96. toil/test/provisioners/gceProvisionerTest.py +0 -6
  97. toil/test/provisioners/restartScript.py +3 -2
  98. toil/test/server/serverTest.py +1 -1
  99. toil/test/sort/restart_sort.py +2 -1
  100. toil/test/sort/sort.py +2 -1
  101. toil/test/sort/sortTest.py +2 -13
  102. toil/test/src/autoDeploymentTest.py +45 -45
  103. toil/test/src/busTest.py +5 -5
  104. toil/test/src/checkpointTest.py +2 -2
  105. toil/test/src/deferredFunctionTest.py +1 -1
  106. toil/test/src/fileStoreTest.py +32 -16
  107. toil/test/src/helloWorldTest.py +1 -1
  108. toil/test/src/importExportFileTest.py +1 -1
  109. toil/test/src/jobDescriptionTest.py +2 -1
  110. toil/test/src/jobServiceTest.py +1 -1
  111. toil/test/src/jobTest.py +18 -18
  112. toil/test/src/miscTests.py +5 -3
  113. toil/test/src/promisedRequirementTest.py +3 -3
  114. toil/test/src/realtimeLoggerTest.py +1 -1
  115. toil/test/src/resourceTest.py +2 -2
  116. toil/test/src/restartDAGTest.py +1 -1
  117. toil/test/src/resumabilityTest.py +36 -2
  118. toil/test/src/retainTempDirTest.py +1 -1
  119. toil/test/src/systemTest.py +2 -2
  120. toil/test/src/toilContextManagerTest.py +2 -2
  121. toil/test/src/userDefinedJobArgTypeTest.py +1 -1
  122. toil/test/utils/toilDebugTest.py +98 -32
  123. toil/test/utils/toilKillTest.py +2 -2
  124. toil/test/utils/utilsTest.py +23 -3
  125. toil/test/wdl/wdltoil_test.py +223 -45
  126. toil/toilState.py +7 -6
  127. toil/utils/toilClean.py +1 -1
  128. toil/utils/toilConfig.py +36 -0
  129. toil/utils/toilDebugFile.py +60 -33
  130. toil/utils/toilDebugJob.py +39 -12
  131. toil/utils/toilDestroyCluster.py +1 -1
  132. toil/utils/toilKill.py +1 -1
  133. toil/utils/toilLaunchCluster.py +13 -2
  134. toil/utils/toilMain.py +3 -2
  135. toil/utils/toilRsyncCluster.py +1 -1
  136. toil/utils/toilSshCluster.py +1 -1
  137. toil/utils/toilStats.py +445 -305
  138. toil/utils/toilStatus.py +2 -5
  139. toil/version.py +10 -10
  140. toil/wdl/utils.py +2 -122
  141. toil/wdl/wdltoil.py +1257 -492
  142. toil/worker.py +55 -46
  143. toil-6.1.0.dist-info/METADATA +124 -0
  144. toil-6.1.0.dist-info/RECORD +241 -0
  145. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
  146. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
  147. toil/batchSystems/parasol.py +0 -379
  148. toil/batchSystems/tes.py +0 -459
  149. toil/test/batchSystems/parasolTestSupport.py +0 -117
  150. toil/test/wdl/builtinTest.py +0 -506
  151. toil/test/wdl/toilwdlTest.py +0 -522
  152. toil/wdl/toilwdl.py +0 -141
  153. toil/wdl/versions/dev.py +0 -107
  154. toil/wdl/versions/draft2.py +0 -980
  155. toil/wdl/versions/v1.py +0 -794
  156. toil/wdl/wdl_analysis.py +0 -116
  157. toil/wdl/wdl_functions.py +0 -997
  158. toil/wdl/wdl_synthesis.py +0 -1011
  159. toil/wdl/wdl_types.py +0 -243
  160. toil-5.12.0.dist-info/METADATA +0 -118
  161. toil-5.12.0.dist-info/RECORD +0 -244
  162. /toil/{wdl/versions → options}/__init__.py +0 -0
  163. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
  164. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+ import copy
14
15
  import errno
15
16
  import hashlib
16
17
  import logging
@@ -19,22 +20,28 @@ import re
19
20
  import shutil
20
21
  import sqlite3
21
22
  import stat
22
- import tempfile
23
23
  import threading
24
24
  import time
25
25
  from contextlib import contextmanager
26
- from typing import Any, Callable, Generator, Optional, Tuple
27
-
28
- from toil.common import cacheDirName, getDirSizeRecursively, getFileSystemSize
26
+ from tempfile import mkstemp
27
+ from typing import (Any,
28
+ Callable,
29
+ Generator,
30
+ Iterator,
31
+ Optional,
32
+ Sequence,
33
+ Tuple)
34
+
35
+ from toil.common import cacheDirName, getFileSystemSize
29
36
  from toil.fileStores import FileID
30
37
  from toil.fileStores.abstractFileStore import AbstractFileStore
31
38
  from toil.job import Job, JobDescription
32
39
  from toil.jobStores.abstractJobStore import AbstractJobStore
33
40
  from toil.lib.compatibility import deprecated
34
- from toil.lib.conversions import bytes2human
35
41
  from toil.lib.io import (atomic_copy,
36
42
  atomic_copyobj,
37
43
  make_public_dir,
44
+ mkdtemp,
38
45
  robust_rmtree)
39
46
  from toil.lib.retry import ErrorCondition, retry
40
47
  from toil.lib.threading import get_process_name, process_name_exists
@@ -224,9 +231,11 @@ class CachingFileStore(AbstractFileStore):
224
231
  # be able to tell that from showing up on a machine where a cache has
225
232
  # already been created.
226
233
  self.dbPath = os.path.join(self.coordination_dir, f'cache-{self.workflowAttemptNumber}.db')
227
- # We need to hold onto both a connection (to commit) and a cursor (to actually use the database)
228
- self.con = sqlite3.connect(self.dbPath, timeout=SQLITE_TIMEOUT_SECS)
229
- self.cur = self.con.cursor()
234
+
235
+ # Database connections are provided by magic properties self.con and
236
+ # self.cur that always have the right object for the current thread to
237
+ # use. They store stuff in this thread-local storage.
238
+ self._thread_local = threading.local()
230
239
 
231
240
  # Note that sqlite3 automatically starts a transaction when we go to
232
241
  # modify the database.
@@ -234,6 +243,12 @@ class CachingFileStore(AbstractFileStore):
234
243
  # write themselves), we need to COMMIT after every coherent set of
235
244
  # writes.
236
245
 
246
+ # Because we support multi-threaded access to files, but we talk to the
247
+ # database as one process with one identity for owning file references,
248
+ # we need to make sure only one thread of our process is trying to e.g.
249
+ # free up space in the cache for a file at a time.
250
+ self.process_identity_lock = threading.RLock()
251
+
237
252
  # Set up the tables
238
253
  self._ensureTables(self.con)
239
254
 
@@ -253,6 +268,37 @@ class CachingFileStore(AbstractFileStore):
253
268
  # time.
254
269
  self.commitThread = None
255
270
 
271
+ @contextmanager
272
+ def as_process(self) -> Generator[str, None, None]:
273
+ """
274
+ Assume the process's identity to act on the caching database.
275
+
276
+ Yields the process's name in the caching database, and holds onto a
277
+ lock while your thread has it.
278
+ """
279
+ with self.process_identity_lock:
280
+ yield get_process_name(self.coordination_dir)
281
+
282
+ @property
283
+ def con(self) -> sqlite3.Connection:
284
+ """
285
+ Get the database connection to be used for the current thread.
286
+ """
287
+ if not hasattr(self._thread_local, 'con'):
288
+ # Connect to the database for this thread.
289
+ # TODO: We assume the connection closes when the thread goes away and can no longer use it.
290
+ self._thread_local.con = sqlite3.connect(self.dbPath, timeout=SQLITE_TIMEOUT_SECS)
291
+ return self._thread_local.con
292
+
293
+ @property
294
+ def cur(self) -> sqlite3.Cursor:
295
+ """
296
+ Get the main cursor to be used for the current thread.
297
+ """
298
+ if not hasattr(self._thread_local, 'cur'):
299
+ # If we don't already have a main cursor for the thread, make one.
300
+ self._thread_local.cur = self.con.cursor()
301
+ return self._thread_local.cur
256
302
 
257
303
  @staticmethod
258
304
  @retry(infinite_retries=True,
@@ -261,7 +307,7 @@ class CachingFileStore(AbstractFileStore):
261
307
  error=sqlite3.OperationalError,
262
308
  error_message_must_include='is locked')
263
309
  ])
264
- def _staticWrite(con, cur, operations):
310
+ def _static_write(con, cur, operations):
265
311
  """
266
312
  Write to the caching database, using the given connection.
267
313
 
@@ -313,6 +359,35 @@ class CachingFileStore(AbstractFileStore):
313
359
 
314
360
  return cur.rowcount
315
361
 
362
+ @staticmethod
363
+ @retry(infinite_retries=True,
364
+ errors=[
365
+ ErrorCondition(
366
+ error=sqlite3.OperationalError,
367
+ error_message_must_include='is locked')
368
+ ])
369
+ def _static_read(cur: sqlite3.Cursor, query: str, args: Optional[Sequence[Any]] = ()) -> Iterator[Any]:
370
+ """
371
+ Read from the database.
372
+
373
+ Run the given select query with the given arguments. Yield each result.
374
+ If the query cannot be run because someone else has a write lock on the
375
+ database, retry.
376
+ """
377
+ # All the real work is the decorators
378
+ return cur.execute(query, args)
379
+
380
+ def _read(self, query: str, args: Optional[Sequence[Any]] = ()) -> Iterator[Any]:
381
+ """
382
+ Read from the database using the instance's connection.
383
+
384
+ Run the given select query with the given arguments. Yield each result.
385
+ If the query cannot be run because someone else has a write lock on the
386
+ database, retry.
387
+ """
388
+
389
+ return self._static_read(self.cur, query, args)
390
+
316
391
  def _write(self, operations):
317
392
  """
318
393
  Write to the caching database, using the instance's connection
@@ -331,7 +406,7 @@ class CachingFileStore(AbstractFileStore):
331
406
  :rtype: int
332
407
  """
333
408
 
334
- return self._staticWrite(self.con, self.cur, operations)
409
+ return self._static_write(self.con, self.cur, operations)
335
410
 
336
411
  @classmethod
337
412
  def _ensureTables(cls, con):
@@ -344,7 +419,7 @@ class CachingFileStore(AbstractFileStore):
344
419
  # Get a cursor
345
420
  cur = con.cursor()
346
421
 
347
- cls._staticWrite(con, cur, ["""
422
+ cls._static_write(con, cur, ["""
348
423
  CREATE TABLE IF NOT EXISTS files (
349
424
  id TEXT NOT NULL PRIMARY KEY,
350
425
  path TEXT UNIQUE NOT NULL,
@@ -399,7 +474,7 @@ class CachingFileStore(AbstractFileStore):
399
474
  if self.cachingIsFree():
400
475
  return 0
401
476
 
402
- for row in self.cur.execute('SELECT TOTAL(size) FROM files'):
477
+ for row in self._read('SELECT TOTAL(size) FROM files'):
403
478
  return row[0]
404
479
 
405
480
  raise RuntimeError('Unable to retrieve cache usage')
@@ -417,7 +492,7 @@ class CachingFileStore(AbstractFileStore):
417
492
  """
418
493
 
419
494
  # Total up the sizes of all the reads of files and subtract it from the total disk reservation of all jobs
420
- for row in self.cur.execute("""
495
+ for row in self._read("""
421
496
  SELECT (
422
497
  (SELECT TOTAL(disk) FROM jobs) -
423
498
  (SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state == 'immutable')
@@ -443,24 +518,24 @@ class CachingFileStore(AbstractFileStore):
443
518
  # content.
444
519
 
445
520
  # Do a little report first
446
- for row in self.cur.execute("SELECT value FROM properties WHERE name = 'maxSpace'"):
521
+ for row in self._read("SELECT value FROM properties WHERE name = 'maxSpace'"):
447
522
  logger.debug('Max space: %d', row[0])
448
- for row in self.cur.execute("SELECT TOTAL(size) FROM files"):
523
+ for row in self._read("SELECT TOTAL(size) FROM files"):
449
524
  logger.debug('Total file size: %d', row[0])
450
- for row in self.cur.execute("SELECT TOTAL(disk) FROM jobs"):
525
+ for row in self._read("SELECT TOTAL(disk) FROM jobs"):
451
526
  logger.debug('Total job disk requirement size: %d', row[0])
452
- for row in self.cur.execute("SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable'"):
527
+ for row in self._read("SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.state = 'immutable'"):
453
528
  logger.debug('Total immutable reference size: %d', row[0])
454
529
 
455
530
  if self.cachingIsFree():
456
531
  # If caching is free, we just say that all the space is always available.
457
- for row in self.cur.execute("SELECT value FROM properties WHERE name = 'maxSpace'"):
532
+ for row in self._read("SELECT value FROM properties WHERE name = 'maxSpace'"):
458
533
  return row[0]
459
534
 
460
535
  raise RuntimeError('Unable to retrieve available cache space')
461
536
 
462
537
 
463
- for row in self.cur.execute("""
538
+ for row in self._read("""
464
539
  SELECT (
465
540
  (SELECT value FROM properties WHERE name = 'maxSpace') -
466
541
  (SELECT TOTAL(size) FROM files) -
@@ -480,7 +555,7 @@ class CachingFileStore(AbstractFileStore):
480
555
  If not retrievable, raises an error.
481
556
  """
482
557
 
483
- for row in self.cur.execute("""
558
+ for row in self._read("""
484
559
  SELECT (
485
560
  (SELECT value FROM properties WHERE name = 'maxSpace') -
486
561
  (SELECT TOTAL(disk) FROM jobs)
@@ -502,14 +577,14 @@ class CachingFileStore(AbstractFileStore):
502
577
 
503
578
  logger.debug('Get unused space for job %s', self.jobID)
504
579
 
505
- for row in self.cur.execute('SELECT * FROM files'):
580
+ for row in self._read('SELECT * FROM files'):
506
581
  logger.debug('File record: %s', str(row))
507
582
 
508
- for row in self.cur.execute('SELECT * FROM refs'):
583
+ for row in self._read('SELECT * FROM refs'):
509
584
  logger.debug('Ref record: %s', str(row))
510
585
 
511
586
 
512
- for row in self.cur.execute('SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.job_id = ? AND refs.state != ?',
587
+ for row in self._read('SELECT TOTAL(files.size) FROM refs INNER JOIN files ON refs.file_id = files.id WHERE refs.job_id = ? AND refs.state != ?',
513
588
  (self.jobID, 'mutable')):
514
589
  # Sum up all the sizes of our referenced files, then subtract that from how much we came in with
515
590
  return self.jobDiskBytes - row[0]
@@ -532,7 +607,7 @@ class CachingFileStore(AbstractFileStore):
532
607
  file you need to do it in a transaction.
533
608
  """
534
609
 
535
- for row in self.cur.execute('SELECT COUNT(*) FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
610
+ for row in self._read('SELECT COUNT(*) FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
536
611
  (fileID, 'cached', 'uploadable', 'uploading')):
537
612
 
538
613
  return row[0] > 0
@@ -545,7 +620,7 @@ class CachingFileStore(AbstractFileStore):
545
620
  Counts mutable references too.
546
621
  """
547
622
 
548
- for row in self.cur.execute('SELECT COUNT(*) FROM refs WHERE file_id = ?', (fileID,)):
623
+ for row in self._read('SELECT COUNT(*) FROM refs WHERE file_id = ?', (fileID,)):
549
624
  return row[0]
550
625
  return 0
551
626
 
@@ -558,7 +633,7 @@ class CachingFileStore(AbstractFileStore):
558
633
  configurations, most notably the FileJobStore.
559
634
  """
560
635
 
561
- for row in self.cur.execute('SELECT value FROM properties WHERE name = ?', ('freeCaching',)):
636
+ for row in self._read('SELECT value FROM properties WHERE name = ?', ('freeCaching',)):
562
637
  return row[0] == 1
563
638
 
564
639
  # Otherwise we need to set it
@@ -570,7 +645,7 @@ class CachingFileStore(AbstractFileStore):
570
645
  emptyID = self.jobStore.getEmptyFileStoreID()
571
646
 
572
647
  # Read it out to a generated name.
573
- destDir = tempfile.mkdtemp(dir=self.localCacheDir)
648
+ destDir = mkdtemp(dir=self.localCacheDir)
574
649
  cachedFile = os.path.join(destDir, 'sniffLinkCount')
575
650
  self.jobStore.read_file(emptyID, cachedFile, symlink=False)
576
651
 
@@ -614,7 +689,7 @@ class CachingFileStore(AbstractFileStore):
614
689
  # sure we can never collide even though we are going to remove the
615
690
  # file.
616
691
  # TODO: use a de-slashed version of the ID instead?
617
- handle, path = tempfile.mkstemp(dir=self.localCacheDir, suffix=hasher.hexdigest())
692
+ handle, path = mkstemp(dir=self.localCacheDir, suffix=hasher.hexdigest())
618
693
  os.close(handle)
619
694
  os.unlink(path)
620
695
 
@@ -627,153 +702,137 @@ class CachingFileStore(AbstractFileStore):
627
702
  We don't actually process them here. We take action based on the states of files we own later.
628
703
  """
629
704
 
630
- me = get_process_name(self.coordination_dir)
705
+ with self.as_process() as me:
631
706
 
632
- # Get a list of all file owner processes on this node.
633
- # Exclude NULL because it comes out as 0 and we can't look for PID 0.
634
- owners = []
635
- for row in self.cur.execute('SELECT DISTINCT owner FROM files WHERE owner IS NOT NULL'):
636
- owners.append(row[0])
707
+ # Get a list of all file owner processes on this node.
708
+ # Exclude NULL because it comes out as 0 and we can't look for PID 0.
709
+ owners = []
710
+ for row in self._read('SELECT DISTINCT owner FROM files WHERE owner IS NOT NULL'):
711
+ owners.append(row[0])
637
712
 
638
- # Work out which of them have died.
639
- deadOwners = []
640
- for owner in owners:
641
- if not process_name_exists(self.coordination_dir, owner):
642
- logger.debug('Owner %s is dead', owner)
643
- deadOwners.append(owner)
644
- else:
645
- logger.debug('Owner %s is alive', owner)
646
-
647
- for owner in deadOwners:
648
- # Try and adopt all the files that any dead owner had
649
-
650
- # If they were deleting, we delete.
651
- # If they were downloading, we delete. Any outstanding references
652
- # can't be in use since they are from the dead downloader.
653
- # If they were uploading or uploadable, we mark as cached even
654
- # though it never made it to the job store (and leave it unowned).
655
- #
656
- # Once the dead job that it was being uploaded from is cleaned up,
657
- # and there are no longer any immutable references, it will be
658
- # evicted as normal. Since the dead job can't have been marked
659
- # successfully completed (since the file is still not uploaded),
660
- # nobody is allowed to actually try and use the file.
661
- #
662
- # TODO: if we ever let other PIDs be responsible for writing our
663
- # files asynchronously, this will need to change.
664
- self._write([('UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?',
665
- (me, 'deleting', owner, 'deleting')),
666
- ('UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?',
667
- (me, 'deleting', owner, 'downloading')),
668
- ('UPDATE files SET owner = NULL, state = ? WHERE owner = ? AND (state = ? OR state = ?)',
669
- ('cached', owner, 'uploadable', 'uploading'))])
670
-
671
- logger.debug('Tried to adopt file operations from dead worker %s to ourselves as %s', owner, me)
672
-
673
- @classmethod
674
- def _executePendingDeletions(cls, coordination_dir, con, cur):
713
+ # Work out which of them have died.
714
+ deadOwners = []
715
+ for owner in owners:
716
+ if not process_name_exists(self.coordination_dir, owner):
717
+ logger.debug('Owner %s is dead', owner)
718
+ deadOwners.append(owner)
719
+ else:
720
+ logger.debug('Owner %s is alive', owner)
721
+
722
+ for owner in deadOwners:
723
+ # Try and adopt all the files that any dead owner had
724
+
725
+ # If they were deleting, we delete.
726
+ # If they were downloading, we delete. Any outstanding references
727
+ # can't be in use since they are from the dead downloader.
728
+ # If they were uploading or uploadable, we mark as cached even
729
+ # though it never made it to the job store (and leave it unowned).
730
+ #
731
+ # Once the dead job that it was being uploaded from is cleaned up,
732
+ # and there are no longer any immutable references, it will be
733
+ # evicted as normal. Since the dead job can't have been marked
734
+ # successfully completed (since the file is still not uploaded),
735
+ # nobody is allowed to actually try and use the file.
736
+ #
737
+ # TODO: if we ever let other PIDs be responsible for writing our
738
+ # files asynchronously, this will need to change.
739
+ self._write([('UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?',
740
+ (me, 'deleting', owner, 'deleting')),
741
+ ('UPDATE files SET owner = ?, state = ? WHERE owner = ? AND state = ?',
742
+ (me, 'deleting', owner, 'downloading')),
743
+ ('UPDATE files SET owner = NULL, state = ? WHERE owner = ? AND (state = ? OR state = ?)',
744
+ ('cached', owner, 'uploadable', 'uploading'))])
745
+
746
+ logger.debug('Tried to adopt file operations from dead worker %s to ourselves as %s', owner, me)
747
+
748
+ def _executePendingDeletions(self):
675
749
  """
676
750
  Delete all the files that are registered in the database as in the
677
751
  process of being deleted from the cache by us.
678
752
 
679
753
  Returns the number of files that were deleted.
680
-
681
- Implemented as a class method so it can use the database connection
682
- appropriate to its thread without any chance of getting at the main
683
- thread's connection and cursor in self.
684
-
685
- :param str coordination_dir: The coordination directory.
686
- :param sqlite3.Connection con: Connection to the cache database.
687
- :param sqlite3.Cursor cur: Cursor in the cache database.
688
754
  """
689
755
 
690
- me = get_process_name(coordination_dir)
756
+ with self.as_process() as me:
691
757
 
692
- # Remember the file IDs we are deleting
693
- deletedFiles = []
694
- for row in cur.execute('SELECT id, path FROM files WHERE owner = ? AND state = ?', (me, 'deleting')):
695
- # Grab everything we are supposed to delete and delete it
696
- fileID = row[0]
697
- filePath = row[1]
698
- try:
699
- os.unlink(filePath)
700
- logger.debug('Successfully deleted: %s', filePath)
701
- except OSError:
702
- # Probably already deleted
703
- logger.debug('File already gone: %s', filePath)
704
- # Still need to mark it as deleted
758
+ # Remember the file IDs we are deleting
759
+ deletedFiles = []
760
+ for row in self._read('SELECT id, path FROM files WHERE owner = ? AND state = ?', (me, 'deleting')):
761
+ # Grab everything we are supposed to delete and delete it
762
+ fileID = row[0]
763
+ filePath = row[1]
764
+ try:
765
+ os.unlink(filePath)
766
+ logger.debug('Successfully deleted: %s', filePath)
767
+ except OSError:
768
+ # Probably already deleted
769
+ logger.debug('File already gone: %s', filePath)
770
+ # Still need to mark it as deleted
705
771
 
706
- # Whether we deleted the file or just found out that it is gone, we
707
- # need to take credit for deleting it so that we remove it from the
708
- # database.
709
- deletedFiles.append(fileID)
772
+ # Whether we deleted the file or just found out that it is gone, we
773
+ # need to take credit for deleting it so that we remove it from the
774
+ # database.
775
+ deletedFiles.append(fileID)
710
776
 
711
- for fileID in deletedFiles:
712
- # Drop all the files. They should have stayed in deleting state. We move them from there to not present at all.
713
- # Also drop their references, if they had any from dead downloaders.
714
- cls._staticWrite(con, cur, [('DELETE FROM files WHERE id = ? AND state = ?', (fileID, 'deleting')),
715
- ('DELETE FROM refs WHERE file_id = ?', (fileID,))])
777
+ for fileID in deletedFiles:
778
+ # Drop all the files. They should have stayed in deleting state. We move them from there to not present at all.
779
+ # Also drop their references, if they had any from dead downloaders.
780
+ self._write([('DELETE FROM files WHERE id = ? AND state = ?', (fileID, 'deleting')),
781
+ ('DELETE FROM refs WHERE file_id = ?', (fileID,))])
716
782
 
717
- return len(deletedFiles)
783
+ return len(deletedFiles)
718
784
 
719
- def _executePendingUploads(self, con, cur):
785
+ def _executePendingUploads(self):
720
786
  """
721
787
  Uploads all files in uploadable state that we own.
722
788
 
723
789
  Returns the number of files that were uploaded.
724
-
725
- Needs access to self to get at the job store for uploading files, but
726
- still needs to take con and cur so it can run in a thread with the
727
- thread's database connection.
728
-
729
- :param sqlite3.Connection con: Connection to the cache database.
730
- :param sqlite3.Cursor cur: Cursor in the cache database.
731
790
  """
732
791
 
733
792
  # Work out who we are
734
- me = get_process_name(self.coordination_dir)
735
-
736
- # Record how many files we upload
737
- uploadedCount = 0
738
- while True:
739
- # Try and find a file we might want to upload
740
- fileID = None
741
- filePath = None
742
- for row in cur.execute('SELECT id, path FROM files WHERE state = ? AND owner = ? LIMIT 1', ('uploadable', me)):
743
- fileID = row[0]
744
- filePath = row[1]
745
-
746
- if fileID is None:
747
- # Nothing else exists to upload
748
- break
749
-
750
- # We need to set it to uploading in a way that we can detect that *we* won the update race instead of anyone else.
751
- rowCount = self._staticWrite(con, cur, [('UPDATE files SET state = ? WHERE id = ? AND state = ?', ('uploading', fileID, 'uploadable'))])
752
- if rowCount != 1:
753
- # We didn't manage to update it. Someone else (a running job if
754
- # we are a committing thread, or visa versa) must have grabbed
755
- # it.
756
- logger.debug('Lost race to upload %s', fileID)
757
- # Try again to see if there is something else to grab.
758
- continue
793
+ with self.as_process() as me:
794
+
795
+ # Record how many files we upload
796
+ uploadedCount = 0
797
+ while True:
798
+ # Try and find a file we might want to upload
799
+ fileID = None
800
+ filePath = None
801
+ for row in self._static_read(self.cur, 'SELECT id, path FROM files WHERE state = ? AND owner = ? LIMIT 1', ('uploadable', me)):
802
+ fileID = row[0]
803
+ filePath = row[1]
804
+
805
+ if fileID is None:
806
+ # Nothing else exists to upload
807
+ break
759
808
 
760
- # Upload the file
761
- logger.debug('Actually executing upload for file %s', fileID)
762
- try:
763
- self.jobStore.update_file(fileID, filePath)
764
- except:
765
- # We need to set the state back to 'uploadable' in case of any failures to ensure
766
- # we can retry properly.
767
- self._staticWrite(con, cur, [('UPDATE files SET state = ? WHERE id = ? AND state = ?', ('uploadable', fileID, 'uploading'))])
768
- raise
809
+ # We need to set it to uploading in a way that we can detect that *we* won the update race instead of anyone else.
810
+ rowCount = self._static_write(self.con, self.cur, [('UPDATE files SET state = ? WHERE id = ? AND state = ?', ('uploading', fileID, 'uploadable'))])
811
+ if rowCount != 1:
812
+ # We didn't manage to update it. Someone else (a running job if
813
+ # we are a committing thread, or visa versa) must have grabbed
814
+ # it.
815
+ logger.debug('Lost race to upload %s', fileID)
816
+ # Try again to see if there is something else to grab.
817
+ continue
818
+
819
+ # Upload the file
820
+ logger.debug('Actually executing upload for file %s', fileID)
821
+ try:
822
+ self.jobStore.update_file(fileID, filePath)
823
+ except:
824
+ # We need to set the state back to 'uploadable' in case of any failures to ensure
825
+ # we can retry properly.
826
+ self._static_write(self.con, self.cur, [('UPDATE files SET state = ? WHERE id = ? AND state = ?', ('uploadable', fileID, 'uploading'))])
827
+ raise
769
828
 
770
- # Count it for the total uploaded files value we need to return
771
- uploadedCount += 1
829
+ # Count it for the total uploaded files value we need to return
830
+ uploadedCount += 1
772
831
 
773
- # Remember that we uploaded it in the database
774
- self._staticWrite(con, cur, [('UPDATE files SET state = ?, owner = NULL WHERE id = ?', ('cached', fileID))])
832
+ # Remember that we uploaded it in the database
833
+ self._static_write(self.con, self.cur, [('UPDATE files SET state = ?, owner = NULL WHERE id = ?', ('cached', fileID))])
775
834
 
776
- return uploadedCount
835
+ return uploadedCount
777
836
 
778
837
  def _allocateSpaceForJob(self, newJobReqs):
779
838
  """
@@ -794,23 +853,23 @@ class CachingFileStore(AbstractFileStore):
794
853
  # This will take up space for us and potentially make the cache over-full.
795
854
  # But we won't actually let the job run and use any of this space until
796
855
  # the cache has been successfully cleared out.
797
- me = get_process_name(self.coordination_dir)
798
- self._write([('INSERT INTO jobs VALUES (?, ?, ?, ?)', (self.jobID, self.localTempDir, newJobReqs, me))])
856
+ with self.as_process() as me:
857
+ self._write([('INSERT INTO jobs VALUES (?, ?, ?, ?)', (self.jobID, self.localTempDir, newJobReqs, me))])
799
858
 
800
- # Now we need to make sure that we can fit all currently cached files,
801
- # and the parts of the total job requirements not currently spent on
802
- # cached files, in under the total disk space limit.
859
+ # Now we need to make sure that we can fit all currently cached files,
860
+ # and the parts of the total job requirements not currently spent on
861
+ # cached files, in under the total disk space limit.
803
862
 
804
- available = self.getCacheAvailable()
863
+ available = self.getCacheAvailable()
805
864
 
806
- logger.debug('Available space with job: %d bytes', available)
865
+ logger.debug('Available space with job: %d bytes', available)
807
866
 
808
- if available >= 0:
809
- # We're fine on disk space
810
- return
867
+ if available >= 0:
868
+ # We're fine on disk space
869
+ return
811
870
 
812
- # Otherwise we need to clear stuff.
813
- self._freeUpSpace()
871
+ # Otherwise we need to clear stuff.
872
+ self._freeUpSpace()
814
873
 
815
874
  @classmethod
816
875
  def _removeJob(cls, con, cur, jobID):
@@ -827,10 +886,10 @@ class CachingFileStore(AbstractFileStore):
827
886
  """
828
887
 
829
888
  # Get the job's temp dir
830
- for row in cur.execute('SELECT tempdir FROM jobs WHERE id = ?', (jobID,)):
889
+ for row in cls._static_read(cur, 'SELECT tempdir FROM jobs WHERE id = ?', (jobID,)):
831
890
  jobTemp = row[0]
832
891
 
833
- for row in cur.execute('SELECT path FROM refs WHERE job_id = ?', (jobID,)):
892
+ for row in cls._static_read(cur, 'SELECT path FROM refs WHERE job_id = ?', (jobID,)):
834
893
  try:
835
894
  # Delete all the reference files.
836
895
  os.unlink(row[0])
@@ -838,7 +897,7 @@ class CachingFileStore(AbstractFileStore):
838
897
  # May not exist
839
898
  pass
840
899
  # And their database entries
841
- cls._staticWrite(con, cur, [('DELETE FROM refs WHERE job_id = ?', (jobID,))])
900
+ cls._static_write(con, cur, [('DELETE FROM refs WHERE job_id = ?', (jobID,))])
842
901
 
843
902
  try:
844
903
  # Delete the job's temp directory to the extent that we can.
@@ -847,7 +906,7 @@ class CachingFileStore(AbstractFileStore):
847
906
  pass
848
907
 
849
908
  # Strike the job from the database
850
- cls._staticWrite(con, cur, [('DELETE FROM jobs WHERE id = ?', (jobID,))])
909
+ cls._static_write(con, cur, [('DELETE FROM jobs WHERE id = ?', (jobID,))])
851
910
 
852
911
  def _deallocateSpaceForJob(self):
853
912
  """
@@ -866,66 +925,65 @@ class CachingFileStore(AbstractFileStore):
866
925
  Return whether we manage to get any space freed or not.
867
926
  """
868
927
 
869
- # First we want to make sure that dead jobs aren't holding
870
- # references to files and keeping them from looking unused.
871
- self._removeDeadJobs(self.coordination_dir, self.con)
872
-
873
- # Adopt work from any dead workers
874
- self._stealWorkFromTheDead()
928
+ with self.as_process() as me:
875
929
 
876
- if self._executePendingDeletions(self.coordination_dir, self.con, self.cur) > 0:
877
- # We actually had something to delete, which we deleted.
878
- # Maybe there is space now
879
- logger.debug('Successfully executed pending deletions to free space')
880
- return True
930
+ # First we want to make sure that dead jobs aren't holding
931
+ # references to files and keeping them from looking unused.
932
+ self._removeDeadJobs(self.coordination_dir, self.con)
881
933
 
882
- if self._executePendingUploads(self.con, self.cur) > 0:
883
- # We had something to upload. Maybe it can be evicted now.
884
- logger.debug('Successfully executed pending uploads to free space')
885
- return True
934
+ # Adopt work from any dead workers
935
+ self._stealWorkFromTheDead()
886
936
 
887
- # Otherwise, not enough files could be found in deleting state to solve our problem.
888
- # We need to put something into the deleting state.
889
- # TODO: give other people time to finish their in-progress
890
- # evictions before starting more, or we might evict everything as
891
- # soon as we hit the cache limit.
892
-
893
- # Find something that has no non-mutable references and is not already being deleted.
894
- self.cur.execute("""
895
- SELECT files.id FROM files WHERE files.state = 'cached' AND NOT EXISTS (
896
- SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
897
- ) LIMIT 1
898
- """)
899
- row = self.cur.fetchone()
900
- if row is None:
901
- # Nothing can be evicted by us.
902
- # Someone else might be in the process of evicting something that will free up space for us too.
903
- # Or someone mught be uploading something and we have to wait for them to finish before it can be deleted.
904
- logger.debug('Could not find anything to evict! Cannot free up space!')
905
- return False
906
-
907
- # Otherwise we found an eviction candidate.
908
- fileID = row[0]
937
+ if self._executePendingDeletions() > 0:
938
+ # We actually had something to delete, which we deleted.
939
+ # Maybe there is space now
940
+ logger.debug('Successfully executed pending deletions to free space')
941
+ return True
942
+
943
+ if self._executePendingUploads() > 0:
944
+ # We had something to upload. Maybe it can be evicted now.
945
+ logger.debug('Successfully executed pending uploads to free space')
946
+ return True
947
+
948
+ # Otherwise, not enough files could be found in deleting state to solve our problem.
949
+ # We need to put something into the deleting state.
950
+ # TODO: give other people time to finish their in-progress
951
+ # evictions before starting more, or we might evict everything as
952
+ # soon as we hit the cache limit.
953
+
954
+ # Find something that has no non-mutable references and is not already being deleted.
955
+ self._read("""
956
+ SELECT files.id FROM files WHERE files.state = 'cached' AND NOT EXISTS (
957
+ SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
958
+ ) LIMIT 1
959
+ """)
960
+ row = self.cur.fetchone()
961
+ if row is None:
962
+ # Nothing can be evicted by us.
963
+ # Someone else might be in the process of evicting something that will free up space for us too.
964
+ # Or someone mught be uploading something and we have to wait for them to finish before it can be deleted.
965
+ logger.debug('Could not find anything to evict! Cannot free up space!')
966
+ return False
909
967
 
910
- # Work out who we are
911
- me = get_process_name(self.coordination_dir)
968
+ # Otherwise we found an eviction candidate.
969
+ fileID = row[0]
912
970
 
913
- # Try and grab it for deletion, subject to the condition that nothing has started reading it
914
- self._write([("""
915
- UPDATE files SET owner = ?, state = ? WHERE id = ? AND state = ?
916
- AND owner IS NULL AND NOT EXISTS (
917
- SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
918
- )
919
- """,
920
- (me, 'deleting', fileID, 'cached'))])
971
+ # Try and grab it for deletion, subject to the condition that nothing has started reading it
972
+ self._write([("""
973
+ UPDATE files SET owner = ?, state = ? WHERE id = ? AND state = ?
974
+ AND owner IS NULL AND NOT EXISTS (
975
+ SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
976
+ )
977
+ """,
978
+ (me, 'deleting', fileID, 'cached'))])
921
979
 
922
- logger.debug('Evicting file %s', fileID)
980
+ logger.debug('Evicting file %s', fileID)
923
981
 
924
- # Whether we actually got it or not, try deleting everything we have to delete
925
- if self._executePendingDeletions(self.coordination_dir, self.con, self.cur) > 0:
926
- # We deleted something
927
- logger.debug('Successfully executed pending deletions to free space')
928
- return True
982
+ # Whether we actually got it or not, try deleting everything we have to delete
983
+ if self._executePendingDeletions() > 0:
984
+ # We deleted something
985
+ logger.debug('Successfully executed pending deletions to free space')
986
+ return True
929
987
 
930
988
  def _freeUpSpace(self):
931
989
  """
@@ -982,7 +1040,8 @@ class CachingFileStore(AbstractFileStore):
982
1040
  # Check the status of all jobs on this node. If there are jobs that started and died before
983
1041
  # cleaning up their presence from the database, clean them up ourselves.
984
1042
  self._removeDeadJobs(self.coordination_dir, self.con)
985
- # Get the requirements for the job.
1043
+ # Get the disk requirement for the job, which we will use to know if we
1044
+ # have filled the cache or not.
986
1045
  self.jobDiskBytes = job.disk
987
1046
 
988
1047
  logger.debug('Actually running job (%s) with ID (%s) which wants %d of our %d bytes.',
@@ -996,22 +1055,6 @@ class CachingFileStore(AbstractFileStore):
996
1055
  with super().open(job):
997
1056
  yield
998
1057
  finally:
999
- # See how much disk space is used at the end of the job.
1000
- # Not a real peak disk usage, but close enough to be useful for warning the user.
1001
- # TODO: Push this logic into the abstract file store
1002
- disk: int = getDirSizeRecursively(self.localTempDir)
1003
- percent: float = 0.0
1004
- if self.jobDiskBytes and self.jobDiskBytes > 0:
1005
- percent = float(disk) / self.jobDiskBytes * 100
1006
- disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
1007
- f"{bytes2human(self.jobDiskBytes)}B [{self.jobDiskBytes}B] requested).")
1008
- if disk > self.jobDiskBytes:
1009
- self.logToMaster("Job used more disk than requested. For CWL, consider increasing the outdirMin "
1010
- f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
1011
- level=logging.WARNING)
1012
- else:
1013
- self.logToMaster(disk_usage, level=logging.DEBUG)
1014
-
1015
1058
  # Go back up to the per-worker local temp directory.
1016
1059
  os.chdir(startingDir)
1017
1060
  self.cleanupInProgress = True
@@ -1036,62 +1079,62 @@ class CachingFileStore(AbstractFileStore):
1036
1079
  # Create an empty file to get an ID.
1037
1080
  # Make sure to pass along the file basename.
1038
1081
  # TODO: this empty file could leak if we die now...
1039
- fileID = self.jobStore.getEmptyFileStoreID(creatorID, cleanup, os.path.basename(localFileName))
1082
+ fileID = self.jobStore.get_empty_file_store_id(creatorID, cleanup, os.path.basename(localFileName))
1040
1083
  # Work out who we are
1041
- me = get_process_name(self.coordination_dir)
1084
+ with self.as_process() as me:
1042
1085
 
1043
- # Work out where the file ought to go in the cache
1044
- cachePath = self._getNewCachingPath(fileID)
1086
+ # Work out where the file ought to go in the cache
1087
+ cachePath = self._getNewCachingPath(fileID)
1045
1088
 
1046
- # Create a file in uploadable state and a reference, in the same transaction.
1047
- # Say the reference is an immutable reference
1048
- self._write([('INSERT INTO files VALUES (?, ?, ?, ?, ?)', (fileID, cachePath, fileSize, 'uploadable', me)),
1049
- ('INSERT INTO refs VALUES (?, ?, ?, ?)', (absLocalFileName, fileID, creatorID, 'immutable'))])
1089
+ # Create a file in uploadable state and a reference, in the same transaction.
1090
+ # Say the reference is an immutable reference
1091
+ self._write([('INSERT INTO files VALUES (?, ?, ?, ?, ?)', (fileID, cachePath, fileSize, 'uploadable', me)),
1092
+ ('INSERT INTO refs VALUES (?, ?, ?, ?)', (absLocalFileName, fileID, creatorID, 'immutable'))])
1050
1093
 
1051
- if absLocalFileName.startswith(self.localTempDir) and not os.path.islink(absLocalFileName):
1052
- # We should link into the cache, because the upload is coming from our local temp dir (and not via a symlink in there)
1053
- try:
1054
- # Try and hardlink the file into the cache.
1055
- # This can only fail if the system doesn't have hardlinks, or the
1056
- # file we're trying to link to has too many hardlinks to it
1057
- # already, or something.
1058
- os.link(absLocalFileName, cachePath)
1094
+ if absLocalFileName.startswith(self.localTempDir) and not os.path.islink(absLocalFileName):
1095
+ # We should link into the cache, because the upload is coming from our local temp dir (and not via a symlink in there)
1096
+ try:
1097
+ # Try and hardlink the file into the cache.
1098
+ # This can only fail if the system doesn't have hardlinks, or the
1099
+ # file we're trying to link to has too many hardlinks to it
1100
+ # already, or something.
1101
+ os.link(absLocalFileName, cachePath)
1059
1102
 
1060
- linkedToCache = True
1103
+ linkedToCache = True
1061
1104
 
1062
- logger.debug('Hardlinked file %s into cache at %s; deferring write to job store', localFileName, cachePath)
1063
- assert not os.path.islink(cachePath), "Symlink %s has invaded cache!" % cachePath
1105
+ logger.debug('Hardlinked file %s into cache at %s; deferring write to job store', localFileName, cachePath)
1106
+ assert not os.path.islink(cachePath), "Symlink %s has invaded cache!" % cachePath
1064
1107
 
1065
- # Don't do the upload now. Let it be deferred until later (when the job is committing).
1066
- except OSError:
1067
- # We couldn't make the link for some reason
1108
+ # Don't do the upload now. Let it be deferred until later (when the job is committing).
1109
+ except OSError:
1110
+ # We couldn't make the link for some reason
1111
+ linkedToCache = False
1112
+ else:
1113
+ # If you are uploading a file that physically exists outside the
1114
+ # local temp dir, it should not be linked into the cache. On
1115
+ # systems that support it, we could end up with a
1116
+ # hardlink-to-symlink in the cache if we break this rule, allowing
1117
+ # files to vanish from our cache.
1068
1118
  linkedToCache = False
1069
- else:
1070
- # If you are uploading a file that physically exists outside the
1071
- # local temp dir, it should not be linked into the cache. On
1072
- # systems that support it, we could end up with a
1073
- # hardlink-to-symlink in the cache if we break this rule, allowing
1074
- # files to vanish from our cache.
1075
- linkedToCache = False
1076
1119
 
1077
1120
 
1078
- if not linkedToCache:
1079
- # If we can't do the link into the cache and upload from there, we
1080
- # have to just upload right away. We can't guarantee sufficient
1081
- # space to make a full copy in the cache, if we aren't allowed to
1082
- # take this copy away from the writer.
1121
+ if not linkedToCache:
1122
+ # If we can't do the link into the cache and upload from there, we
1123
+ # have to just upload right away. We can't guarantee sufficient
1124
+ # space to make a full copy in the cache, if we aren't allowed to
1125
+ # take this copy away from the writer.
1083
1126
 
1084
- # Change the reference to 'mutable', which it will be.
1085
- # And drop the file altogether.
1086
- self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', absLocalFileName, fileID)),
1087
- ('DELETE FROM files WHERE id = ?', (fileID,))])
1127
+ # Change the reference to 'mutable', which it will be.
1128
+ # And drop the file altogether.
1129
+ self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', absLocalFileName, fileID)),
1130
+ ('DELETE FROM files WHERE id = ?', (fileID,))])
1088
1131
 
1089
- # Save the file to the job store right now
1090
- logger.debug('Actually executing upload immediately for file %s', fileID)
1091
- self.jobStore.update_file(fileID, absLocalFileName)
1132
+ # Save the file to the job store right now
1133
+ logger.debug('Actually executing upload immediately for file %s', fileID)
1134
+ self.jobStore.update_file(fileID, absLocalFileName)
1092
1135
 
1093
- # Ship out the completed FileID object with its real size.
1094
- return FileID.forPath(fileID, absLocalFileName)
1136
+ # Ship out the completed FileID object with its real size.
1137
+ return FileID.forPath(fileID, absLocalFileName)
1095
1138
 
1096
1139
  def readGlobalFile(self, fileStoreID, userPath=None, cache=True, mutable=False, symlink=False):
1097
1140
 
@@ -1162,7 +1205,7 @@ class CachingFileStore(AbstractFileStore):
1162
1205
 
1163
1206
  # Find where the file is cached
1164
1207
  cachedPath = None
1165
- for row in self.cur.execute('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
1208
+ for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
1166
1209
  cachedPath = row[0]
1167
1210
 
1168
1211
  if cachedPath is None:
@@ -1239,130 +1282,130 @@ class CachingFileStore(AbstractFileStore):
1239
1282
  """
1240
1283
 
1241
1284
  # Work out who we are
1242
- me = get_process_name(self.coordination_dir)
1285
+ with self.as_process() as me:
1243
1286
 
1244
- # Work out where to cache the file if it isn't cached already
1245
- cachedPath = self._getNewCachingPath(fileStoreID)
1287
+ # Work out where to cache the file if it isn't cached already
1288
+ cachedPath = self._getNewCachingPath(fileStoreID)
1246
1289
 
1247
- # Start a loop until we can do one of these
1248
- while True:
1249
- # Try and create a downloading entry if no entry exists
1250
- logger.debug('Trying to make file record for id %s', fileStoreID)
1251
- self._write([('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)',
1252
- (fileStoreID, cachedPath, self.getGlobalFileSize(fileStoreID), 'downloading', me))])
1253
-
1254
- # See if we won the race
1255
- self.cur.execute('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', (fileStoreID, 'downloading', me))
1256
- if self.cur.fetchone()[0] > 0:
1257
- # We are responsible for downloading the file
1258
- logger.debug('We are now responsible for downloading file %s', fileStoreID)
1259
-
1260
- # Make sure we have space for this download.
1261
- self._freeUpSpace()
1262
-
1263
- # Do the download into the cache.
1264
- self._downloadToCache(fileStoreID, cachedPath)
1265
-
1266
- # Now, we may have to immediately give away this file, because
1267
- # we don't have space for two copies.
1268
- # If so, we can't let it go to cached state, because someone
1269
- # else might make a reference to it, and we may get stuck with
1270
- # two readers, one cached copy, and space for two copies total.
1271
-
1272
- # Make the copying reference
1273
- self._write([('INSERT INTO refs VALUES (?, ?, ?, ?)',
1274
- (localFilePath, fileStoreID, readerID, 'copying'))])
1275
-
1276
- # Fulfill it with a full copy or by giving away the cached copy
1277
- self._fulfillCopyingReference(fileStoreID, cachedPath, localFilePath)
1278
-
1279
- # Now we're done
1280
- return localFilePath
1290
+ # Start a loop until we can do one of these
1291
+ while True:
1292
+ # Try and create a downloading entry if no entry exists
1293
+ logger.debug('Trying to make file record for id %s', fileStoreID)
1294
+ self._write([('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)',
1295
+ (fileStoreID, cachedPath, self.getGlobalFileSize(fileStoreID), 'downloading', me))])
1281
1296
 
1282
- else:
1283
- logger.debug('Someone else is already responsible for file %s', fileStoreID)
1284
-
1285
- # A record already existed for this file.
1286
- # Try and create an immutable or copying reference to an entry that
1287
- # is in 'cached' or 'uploadable' or 'uploading' state.
1288
- # It might be uploading because *we* are supposed to be uploading it.
1289
- logger.debug('Trying to make reference to file %s', fileStoreID)
1290
- self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
1291
- (localFilePath, readerID, 'copying', fileStoreID, 'cached', 'uploadable', 'uploading'))])
1292
-
1293
- # See if we got it
1294
- self.cur.execute('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID))
1297
+ # See if we won the race
1298
+ self._read('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', (fileStoreID, 'downloading', me))
1295
1299
  if self.cur.fetchone()[0] > 0:
1296
- # The file is cached and we can copy or link it
1297
- logger.debug('Obtained reference to file %s', fileStoreID)
1298
-
1299
- # Get the path it is actually at in the cache, instead of where we wanted to put it
1300
- for row in self.cur.execute('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
1301
- cachedPath = row[0]
1302
-
1303
-
1304
- while self.getCacheAvailable() < 0:
1305
- # Since we now have a copying reference, see if we have used too much space.
1306
- # If so, try to free up some space by deleting or uploading, but
1307
- # don't loop forever if we can't get enough.
1308
- self._tryToFreeUpSpace()
1309
-
1310
- if self.getCacheAvailable() >= 0:
1311
- # We made room
1312
- break
1313
-
1314
- # See if we have no other references and we can give away the file.
1315
- # Change it to downloading owned by us if we can grab it.
1316
- self._write([("""
1317
- UPDATE files SET files.owner = ?, files.state = ? WHERE files.id = ? AND files.state = ?
1318
- AND files.owner IS NULL AND NOT EXISTS (
1319
- SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
1320
- )
1321
- """,
1322
- (me, 'downloading', fileStoreID, 'cached'))])
1323
-
1324
- if self._giveAwayDownloadingFile(fileStoreID, cachedPath, localFilePath):
1325
- # We got ownership of the file and managed to give it away.
1326
- return localFilePath
1300
+ # We are responsible for downloading the file
1301
+ logger.debug('We are now responsible for downloading file %s', fileStoreID)
1327
1302
 
1328
- # If we don't have space, and we couldn't make space, and we
1329
- # couldn't get exclusive control of the file to give it away, we
1330
- # need to wait for one of those people with references to the file
1331
- # to finish and give it up.
1332
- # TODO: work out if that will never happen somehow.
1333
- time.sleep(self.contentionBackoff)
1303
+ # Make sure we have space for this download.
1304
+ self._freeUpSpace()
1334
1305
 
1335
- # OK, now we have space to make a copy.
1306
+ # Do the download into the cache.
1307
+ self._downloadToCache(fileStoreID, cachedPath)
1336
1308
 
1337
- if self.forceDownloadDelay is not None:
1338
- # Wait around to simulate a big file for testing
1339
- time.sleep(self.forceDownloadDelay)
1309
+ # Now, we may have to immediately give away this file, because
1310
+ # we don't have space for two copies.
1311
+ # If so, we can't let it go to cached state, because someone
1312
+ # else might make a reference to it, and we may get stuck with
1313
+ # two readers, one cached copy, and space for two copies total.
1340
1314
 
1341
- # Make the copy
1342
- atomic_copy(cachedPath, localFilePath)
1315
+ # Make the copying reference
1316
+ self._write([('INSERT INTO refs VALUES (?, ?, ?, ?)',
1317
+ (localFilePath, fileStoreID, readerID, 'copying'))])
1343
1318
 
1344
- # Change the reference to mutable
1345
- self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', localFilePath, fileStoreID))])
1319
+ # Fulfill it with a full copy or by giving away the cached copy
1320
+ self._fulfillCopyingReference(fileStoreID, cachedPath, localFilePath)
1346
1321
 
1347
1322
  # Now we're done
1348
1323
  return localFilePath
1349
1324
 
1350
1325
  else:
1351
- # We didn't get a reference. Maybe it is still downloading.
1352
- logger.debug('Could not obtain reference to file %s', fileStoreID)
1326
+ logger.debug('Someone else is already responsible for file %s', fileStoreID)
1327
+
1328
+ # A record already existed for this file.
1329
+ # Try and create an immutable or copying reference to an entry that
1330
+ # is in 'cached' or 'uploadable' or 'uploading' state.
1331
+ # It might be uploading because *we* are supposed to be uploading it.
1332
+ logger.debug('Trying to make reference to file %s', fileStoreID)
1333
+ self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
1334
+ (localFilePath, readerID, 'copying', fileStoreID, 'cached', 'uploadable', 'uploading'))])
1335
+
1336
+ # See if we got it
1337
+ self._read('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID))
1338
+ if self.cur.fetchone()[0] > 0:
1339
+ # The file is cached and we can copy or link it
1340
+ logger.debug('Obtained reference to file %s', fileStoreID)
1341
+
1342
+ # Get the path it is actually at in the cache, instead of where we wanted to put it
1343
+ for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
1344
+ cachedPath = row[0]
1345
+
1346
+
1347
+ while self.getCacheAvailable() < 0:
1348
+ # Since we now have a copying reference, see if we have used too much space.
1349
+ # If so, try to free up some space by deleting or uploading, but
1350
+ # don't loop forever if we can't get enough.
1351
+ self._tryToFreeUpSpace()
1352
+
1353
+ if self.getCacheAvailable() >= 0:
1354
+ # We made room
1355
+ break
1356
+
1357
+ # See if we have no other references and we can give away the file.
1358
+ # Change it to downloading owned by us if we can grab it.
1359
+ self._write([("""
1360
+ UPDATE files SET files.owner = ?, files.state = ? WHERE files.id = ? AND files.state = ?
1361
+ AND files.owner IS NULL AND NOT EXISTS (
1362
+ SELECT NULL FROM refs WHERE refs.file_id = files.id AND refs.state != 'mutable'
1363
+ )
1364
+ """,
1365
+ (me, 'downloading', fileStoreID, 'cached'))])
1366
+
1367
+ if self._giveAwayDownloadingFile(fileStoreID, cachedPath, localFilePath):
1368
+ # We got ownership of the file and managed to give it away.
1369
+ return localFilePath
1370
+
1371
+ # If we don't have space, and we couldn't make space, and we
1372
+ # couldn't get exclusive control of the file to give it away, we
1373
+ # need to wait for one of those people with references to the file
1374
+ # to finish and give it up.
1375
+ # TODO: work out if that will never happen somehow.
1376
+ time.sleep(self.contentionBackoff)
1377
+
1378
+ # OK, now we have space to make a copy.
1379
+
1380
+ if self.forceDownloadDelay is not None:
1381
+ # Wait around to simulate a big file for testing
1382
+ time.sleep(self.forceDownloadDelay)
1383
+
1384
+ # Make the copy
1385
+ atomic_copy(cachedPath, localFilePath)
1386
+
1387
+ # Change the reference to mutable
1388
+ self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', localFilePath, fileStoreID))])
1389
+
1390
+ # Now we're done
1391
+ return localFilePath
1353
1392
 
1354
- # Loop around again and see if either we can download it or we can get a reference to it.
1393
+ else:
1394
+ # We didn't get a reference. Maybe it is still downloading.
1395
+ logger.debug('Could not obtain reference to file %s', fileStoreID)
1355
1396
 
1356
- # If we didn't get a download or a reference, adopt and do work
1357
- # from dead workers and loop again.
1358
- # We may have to wait for someone else's download or delete to
1359
- # finish. If they die, we will notice.
1360
- self._removeDeadJobs(self.coordination_dir, self.con)
1361
- self._stealWorkFromTheDead()
1362
- self._executePendingDeletions(self.coordination_dir, self.con, self.cur)
1397
+ # Loop around again and see if either we can download it or we can get a reference to it.
1398
+
1399
+ # If we didn't get a download or a reference, adopt and do work
1400
+ # from dead workers and loop again.
1401
+ # We may have to wait for someone else's download or delete to
1402
+ # finish. If they die, we will notice.
1403
+ self._removeDeadJobs(self.coordination_dir, self.con)
1404
+ self._stealWorkFromTheDead()
1405
+ self._executePendingDeletions()
1363
1406
 
1364
- # Wait for other people's downloads to progress before re-polling.
1365
- time.sleep(self.contentionBackoff)
1407
+ # Wait for other people's downloads to progress before re-polling.
1408
+ time.sleep(self.contentionBackoff)
1366
1409
 
1367
1410
  def _fulfillCopyingReference(self, fileStoreID, cachedPath, localFilePath):
1368
1411
  """
@@ -1422,27 +1465,27 @@ class CachingFileStore(AbstractFileStore):
1422
1465
  """
1423
1466
 
1424
1467
  # Work out who we are
1425
- me = get_process_name(self.coordination_dir)
1468
+ with self.as_process() as me:
1426
1469
 
1427
- # See if we actually own this file and can give it away
1428
- self.cur.execute('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?',
1429
- (fileStoreID, 'downloading', me))
1430
- if self.cur.fetchone()[0] > 0:
1431
- # Now we have exclusive control of the cached copy of the file, so we can give it away.
1470
+ # See if we actually own this file and can give it away
1471
+ self._read('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?',
1472
+ (fileStoreID, 'downloading', me))
1473
+ if self.cur.fetchone()[0] > 0:
1474
+ # Now we have exclusive control of the cached copy of the file, so we can give it away.
1432
1475
 
1433
- # Don't fake a delay here; this should be a rename always.
1476
+ # Don't fake a delay here; this should be a rename always.
1434
1477
 
1435
- # We are giving it away
1436
- shutil.move(cachedPath, localFilePath)
1437
- # Record that.
1438
- self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', localFilePath, fileStoreID)),
1439
- ('DELETE FROM files WHERE id = ?', (fileStoreID,))])
1478
+ # We are giving it away
1479
+ shutil.move(cachedPath, localFilePath)
1480
+ # Record that.
1481
+ self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('mutable', localFilePath, fileStoreID)),
1482
+ ('DELETE FROM files WHERE id = ?', (fileStoreID,))])
1440
1483
 
1441
- # Now we're done
1442
- return True
1443
- else:
1444
- # We don't own this file in 'downloading' state
1445
- return False
1484
+ # Now we're done
1485
+ return True
1486
+ else:
1487
+ # We don't own this file in 'downloading' state
1488
+ return False
1446
1489
 
1447
1490
  def _createLinkFromCache(self, cachedPath, localFilePath, symlink=True):
1448
1491
  """
@@ -1493,108 +1536,108 @@ class CachingFileStore(AbstractFileStore):
1493
1536
  # Now we know to use the cache, and that we don't require a mutable copy.
1494
1537
 
1495
1538
  # Work out who we are
1496
- me = get_process_name(self.coordination_dir)
1497
-
1498
- # Work out where to cache the file if it isn't cached already
1499
- cachedPath = self._getNewCachingPath(fileStoreID)
1500
-
1501
- # Start a loop until we can do one of these
1502
- while True:
1503
- # Try and create a downloading entry if no entry exists.
1504
- # Make sure to create a reference at the same time if it succeeds, to bill it against our job's space.
1505
- # Don't create the mutable reference yet because we might not necessarily be able to clear that space.
1506
- logger.debug('Trying to make file downloading file record and reference for id %s', fileStoreID)
1507
- self._write([('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)',
1508
- (fileStoreID, cachedPath, self.getGlobalFileSize(fileStoreID), 'downloading', me)),
1509
- ('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND state = ? AND owner = ?',
1510
- (localFilePath, readerID, 'immutable', fileStoreID, 'downloading', me))])
1511
-
1512
- # See if we won the race
1513
- self.cur.execute('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', (fileStoreID, 'downloading', me))
1514
- if self.cur.fetchone()[0] > 0:
1515
- # We are responsible for downloading the file (and we have the reference)
1516
- logger.debug('We are now responsible for downloading file %s', fileStoreID)
1517
-
1518
- # Make sure we have space for this download.
1519
- self._freeUpSpace()
1520
-
1521
- # Do the download into the cache.
1522
- self._downloadToCache(fileStoreID, cachedPath)
1523
-
1524
- # Try and make the link before we let the file go to cached state.
1525
- # If we fail we may end up having to give away the file we just downloaded.
1526
- if self._createLinkFromCache(cachedPath, localFilePath, symlink):
1527
- # We made the link!
1528
-
1529
- # Change file state from downloading to cached so other people can use it
1530
- self._write([('UPDATE files SET state = ?, owner = NULL WHERE id = ?',
1531
- ('cached', fileStoreID))])
1532
-
1533
- # Now we're done!
1534
- return localFilePath
1535
- else:
1536
- # We could not make a link. We need to make a copy.
1539
+ with self.as_process() as me:
1540
+
1541
+ # Work out where to cache the file if it isn't cached already
1542
+ cachedPath = self._getNewCachingPath(fileStoreID)
1543
+
1544
+ # Start a loop until we can do one of these
1545
+ while True:
1546
+ # Try and create a downloading entry if no entry exists.
1547
+ # Make sure to create a reference at the same time if it succeeds, to bill it against our job's space.
1548
+ # Don't create the mutable reference yet because we might not necessarily be able to clear that space.
1549
+ logger.debug('Trying to make file downloading file record and reference for id %s', fileStoreID)
1550
+ self._write([('INSERT OR IGNORE INTO files VALUES (?, ?, ?, ?, ?)',
1551
+ (fileStoreID, cachedPath, self.getGlobalFileSize(fileStoreID), 'downloading', me)),
1552
+ ('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND state = ? AND owner = ?',
1553
+ (localFilePath, readerID, 'immutable', fileStoreID, 'downloading', me))])
1554
+
1555
+ # See if we won the race
1556
+ self._read('SELECT COUNT(*) FROM files WHERE id = ? AND state = ? AND owner = ?', (fileStoreID, 'downloading', me))
1557
+ if self.cur.fetchone()[0] > 0:
1558
+ # We are responsible for downloading the file (and we have the reference)
1559
+ logger.debug('We are now responsible for downloading file %s', fileStoreID)
1537
1560
 
1538
- # Change the reference to copying.
1539
- self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('copying', localFilePath, fileStoreID))])
1561
+ # Make sure we have space for this download.
1562
+ self._freeUpSpace()
1540
1563
 
1541
- # Fulfill it with a full copy or by giving away the cached copy
1542
- self._fulfillCopyingReference(fileStoreID, cachedPath, localFilePath)
1564
+ # Do the download into the cache.
1565
+ self._downloadToCache(fileStoreID, cachedPath)
1543
1566
 
1544
- # Now we're done
1545
- return localFilePath
1546
-
1547
- else:
1548
- logger.debug('We already have an entry in the cache database for file %s', fileStoreID)
1549
-
1550
- # A record already existed for this file.
1551
- # Try and create an immutable reference to an entry that
1552
- # is in 'cached' or 'uploadable' or 'uploading' state.
1553
- # It might be uploading because *we* are supposed to be uploading it.
1554
- logger.debug('Trying to make reference to file %s', fileStoreID)
1555
- self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
1556
- (localFilePath, readerID, 'immutable', fileStoreID, 'cached', 'uploadable', 'uploading'))])
1557
-
1558
- # See if we got it
1559
- self.cur.execute('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID))
1560
- if self.cur.fetchone()[0] > 0:
1561
- # The file is cached and we can copy or link it
1562
- logger.debug('Obtained reference to file %s', fileStoreID)
1567
+ # Try and make the link before we let the file go to cached state.
1568
+ # If we fail we may end up having to give away the file we just downloaded.
1569
+ if self._createLinkFromCache(cachedPath, localFilePath, symlink):
1570
+ # We made the link!
1563
1571
 
1564
- # Get the path it is actually at in the cache, instead of where we wanted to put it
1565
- for row in self.cur.execute('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
1566
- cachedPath = row[0]
1572
+ # Change file state from downloading to cached so other people can use it
1573
+ self._write([('UPDATE files SET state = ?, owner = NULL WHERE id = ?',
1574
+ ('cached', fileStoreID))])
1567
1575
 
1568
- if self._createLinkFromCache(cachedPath, localFilePath, symlink):
1569
- # We managed to make the link
1576
+ # Now we're done!
1570
1577
  return localFilePath
1571
1578
  else:
1572
- # We can't make the link. We need a copy instead.
1579
+ # We could not make a link. We need to make a copy.
1580
+
1581
+ # Change the reference to copying.
1582
+ self._write([('UPDATE refs SET state = ? WHERE path = ? AND file_id = ?', ('copying', localFilePath, fileStoreID))])
1573
1583
 
1574
- # We could change the reference to copying, see if
1575
- # there's space, make the copy, try and get ahold of
1576
- # the file if there isn't space, and give it away, but
1577
- # we already have code for that for mutable downloads,
1578
- # so just clear the reference and download mutably.
1584
+ # Fulfill it with a full copy or by giving away the cached copy
1585
+ self._fulfillCopyingReference(fileStoreID, cachedPath, localFilePath)
1579
1586
 
1580
- self._write([('DELETE FROM refs WHERE path = ? AND file_id = ?', (localFilePath, fileStoreID))])
1587
+ # Now we're done
1588
+ return localFilePath
1581
1589
 
1582
- return self._readGlobalFileMutablyWithCache(fileStoreID, localFilePath, readerID)
1583
1590
  else:
1584
- logger.debug('Could not obtain reference to file %s', fileStoreID)
1591
+ logger.debug('We already have an entry in the cache database for file %s', fileStoreID)
1592
+
1593
+ # A record already existed for this file.
1594
+ # Try and create an immutable reference to an entry that
1595
+ # is in 'cached' or 'uploadable' or 'uploading' state.
1596
+ # It might be uploading because *we* are supposed to be uploading it.
1597
+ logger.debug('Trying to make reference to file %s', fileStoreID)
1598
+ self._write([('INSERT INTO refs SELECT ?, id, ?, ? FROM files WHERE id = ? AND (state = ? OR state = ? OR state = ?)',
1599
+ (localFilePath, readerID, 'immutable', fileStoreID, 'cached', 'uploadable', 'uploading'))])
1600
+
1601
+ # See if we got it
1602
+ self._read('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (localFilePath, fileStoreID))
1603
+ if self.cur.fetchone()[0] > 0:
1604
+ # The file is cached and we can copy or link it
1605
+ logger.debug('Obtained reference to file %s', fileStoreID)
1606
+
1607
+ # Get the path it is actually at in the cache, instead of where we wanted to put it
1608
+ for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
1609
+ cachedPath = row[0]
1610
+
1611
+ if self._createLinkFromCache(cachedPath, localFilePath, symlink):
1612
+ # We managed to make the link
1613
+ return localFilePath
1614
+ else:
1615
+ # We can't make the link. We need a copy instead.
1616
+
1617
+ # We could change the reference to copying, see if
1618
+ # there's space, make the copy, try and get ahold of
1619
+ # the file if there isn't space, and give it away, but
1620
+ # we already have code for that for mutable downloads,
1621
+ # so just clear the reference and download mutably.
1585
1622
 
1586
- # If we didn't get a download or a reference, adopt and do work from dead workers and loop again.
1587
- # We may have to wait for someone else's download or delete to
1588
- # finish. If they die, we will notice.
1589
- self._removeDeadJobs(self.coordination_dir, self.con)
1590
- self._stealWorkFromTheDead()
1591
- # We may have acquired ownership of partially-downloaded
1592
- # files, now in deleting state, that we need to delete
1593
- # before we can download them.
1594
- self._executePendingDeletions(self.coordination_dir, self.con, self.cur)
1623
+ self._write([('DELETE FROM refs WHERE path = ? AND file_id = ?', (localFilePath, fileStoreID))])
1595
1624
 
1596
- # Wait for other people's downloads to progress.
1597
- time.sleep(self.contentionBackoff)
1625
+ return self._readGlobalFileMutablyWithCache(fileStoreID, localFilePath, readerID)
1626
+ else:
1627
+ logger.debug('Could not obtain reference to file %s', fileStoreID)
1628
+
1629
+ # If we didn't get a download or a reference, adopt and do work from dead workers and loop again.
1630
+ # We may have to wait for someone else's download or delete to
1631
+ # finish. If they die, we will notice.
1632
+ self._removeDeadJobs(self.coordination_dir, self.con)
1633
+ self._stealWorkFromTheDead()
1634
+ # We may have acquired ownership of partially-downloaded
1635
+ # files, now in deleting state, that we need to delete
1636
+ # before we can download them.
1637
+ self._executePendingDeletions()
1638
+
1639
+ # Wait for other people's downloads to progress.
1640
+ time.sleep(self.contentionBackoff)
1598
1641
 
1599
1642
  @contextmanager
1600
1643
  def _with_copying_reference_to_upload(self, file_store_id: FileID, reader_id: str, local_file_path: Optional[str] = None) -> Generator:
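For illustration, the immutable read path above tries _createLinkFromCache before letting the file settle into 'cached' state, and only falls back to a copy (or to the mutable path) when linking fails. The sketch below shows the link-then-fall-back idea under an assumed helper name.

    import os

    def link_from_cache(cached_path: str, local_path: str, symlink: bool = True) -> bool:
        """Expose the cached copy at local_path without spending extra cache space.
        Prefer a hard link; optionally fall back to a symlink. Returns False when
        neither works (for example across filesystems), so the caller can copy."""
        try:
            os.link(cached_path, local_path)
            return True
        except OSError:
            pass
        if symlink:
            try:
                os.symlink(cached_path, local_path)
                return True
            except OSError:
                pass
        return False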
@@ -1624,7 +1667,7 @@ class CachingFileStore(AbstractFileStore):
1624
1667
 
1625
1668
  # See if we got it
1626
1669
  have_reference = False
1627
- for row in self.cur.execute('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (local_file_path, file_store_id)):
1670
+ for row in self._read('SELECT COUNT(*) FROM refs WHERE path = ? and file_id = ?', (local_file_path, file_store_id)):
1628
1671
  have_reference = row[0] > 0
1629
1672
 
1630
1673
  if have_reference:
@@ -1651,12 +1694,12 @@ class CachingFileStore(AbstractFileStore):
1651
1694
  # Try and grab a reference to the file if it is being uploaded.
1652
1695
  if ref_path is not None:
1653
1696
  # We have an update in the cache that isn't written back yet.
1654
- # So we must stream from the ceche for consistency.
1697
+ # So we must stream from the cache for consistency.
1655
1698
 
1656
1699
  # The ref file is not actually copied to; find the actual file
1657
1700
  # in the cache
1658
1701
  cached_path = None
1659
- for row in self.cur.execute('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
1702
+ for row in self._read('SELECT path FROM files WHERE id = ?', (fileStoreID,)):
1660
1703
  cached_path = row[0]
1661
1704
 
1662
1705
  if cached_path is None:
@@ -1666,7 +1709,7 @@ class CachingFileStore(AbstractFileStore):
1666
1709
  # Pass along the results of the open context manager on the
1667
1710
  # file in the cache.
1668
1711
  yield result
1669
- # When we exit the with the copying reference will go away and
1712
+ # When we exit the with, the copying reference will go away and
1670
1713
  # the file will be allowed to leave the cache again.
1671
1714
  else:
1672
1715
  # No local update, so we can stream from the job store
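For illustration, the streaming hunks above choose a source based on whether the cache holds an update that has not yet reached the job store: read from the cached path for consistency, otherwise stream straight from the job store. A rough sketch of that choice as a context manager; the parameter names and the idea of passing in the job store's own streaming context manager are assumptions.

    from contextlib import contextmanager

    @contextmanager
    def open_for_streaming(cached_path, job_store_stream_ctx, has_unflushed_update):
        """Yield a readable file object from the most consistent source."""
        if has_unflushed_update and cached_path is not None:
            # A local write has not been uploaded yet, so the cache is authoritative.
            with open(cached_path, 'rb') as stream:
                yield stream
        else:
            # No pending local update: stream from whatever context manager the
            # job store provides for reading the file.
            with job_store_stream_ctx as stream:
                yield stream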
@@ -1684,7 +1727,7 @@ class CachingFileStore(AbstractFileStore):
1684
1727
  # missing ref file, we will raise an error about it and stop deleting
1685
1728
  # things.
1686
1729
  missingFile = None
1687
- for row in self.cur.execute('SELECT path FROM refs WHERE file_id = ? AND job_id = ?', (fileStoreID, jobID)):
1730
+ for row in self._read('SELECT path FROM refs WHERE file_id = ? AND job_id = ?', (fileStoreID, jobID)):
1688
1731
  # Delete all the files that are references to this cached file (even mutable copies)
1689
1732
  path = row[0]
1690
1733
 
@@ -1735,25 +1778,25 @@ class CachingFileStore(AbstractFileStore):
1735
1778
  raise
1736
1779
 
1737
1780
  # Work out who we are
1738
- me = get_process_name(self.coordination_dir)
1781
+ with self.as_process() as me:
1739
1782
 
1740
- # Make sure nobody else has references to it
1741
- for row in self.cur.execute('SELECT job_id FROM refs WHERE file_id = ? AND state != ?', (fileStoreID, 'mutable')):
1742
- raise RuntimeError(f'Deleted file ID {fileStoreID} which is still in use by job {row[0]}')
1743
- # TODO: should we just let other jobs and the cache keep the file until
1744
- # it gets evicted, and only delete at the back end?
1783
+ # Make sure nobody else has references to it
1784
+ for row in self._read('SELECT job_id FROM refs WHERE file_id = ? AND state != ?', (fileStoreID, 'mutable')):
1785
+ raise RuntimeError(f'Deleted file ID {fileStoreID} which is still in use by job {row[0]}')
1786
+ # TODO: should we just let other jobs and the cache keep the file until
1787
+ # it gets evicted, and only delete at the back end?
1745
1788
 
1746
- # Pop the file into deleting state owned by us if it exists
1747
- self._write([('UPDATE files SET state = ?, owner = ? WHERE id = ?', ('deleting', me, fileStoreID))])
1789
+ # Pop the file into deleting state owned by us if it exists
1790
+ self._write([('UPDATE files SET state = ?, owner = ? WHERE id = ?', ('deleting', me, fileStoreID))])
1748
1791
 
1749
- # Finish the delete if the file is present
1750
- self._executePendingDeletions(self.coordination_dir, self.con, self.cur)
1792
+ # Finish the delete if the file is present
1793
+ self._executePendingDeletions()
1751
1794
 
1752
- # Add the file to the list of files to be deleted from the job store
1753
- # once the run method completes.
1754
- self.filesToDelete.add(str(fileStoreID))
1755
- self.logToMaster('Added file with ID \'%s\' to the list of files to be' % fileStoreID +
1756
- ' globally deleted.', level=logging.DEBUG)
1795
+ # Add the file to the list of files to be deleted from the job store
1796
+ # once the run method completes.
1797
+ self.filesToDelete.add(str(fileStoreID))
1798
+ self.log_to_leader('Added file with ID \'%s\' to the list of files to be' % fileStoreID +
1799
+ ' globally deleted.', level=logging.DEBUG)
1757
1800
 
1758
1801
  @deprecated(new_function_name='export_file')
1759
1802
  def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None:
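For illustration, deleteGlobalFile, as restructured above, first refuses to proceed while any non-mutable reference exists, then claims the cache entry into 'deleting' state under its own process identity and defers the job-store deletion to commit time. A hedged sketch of that claim-then-queue shape; the function name and schema are illustrative.

    import sqlite3

    def delete_global_file(con: sqlite3.Connection, file_id: str, me: str,
                           files_to_delete: set) -> None:
        """Illustrative schema: files(id, state, owner), refs(file_id, job_id, state)."""
        for (job_id,) in con.execute(
                'SELECT job_id FROM refs WHERE file_id = ? AND state != ?',
                (file_id, 'mutable')):
            raise RuntimeError(f'Deleted file {file_id} is still in use by job {job_id}')
        with con:
            # Claim the cache entry so a later pending-deletions pass removes it.
            con.execute('UPDATE files SET state = ?, owner = ? WHERE id = ?',
                        ('deleting', me, file_id))
        # The job-store copy is only removed when the job commits successfully.
        files_to_delete.add(str(file_id))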
@@ -1768,7 +1811,7 @@ class CachingFileStore(AbstractFileStore):
1768
1811
  # until they are done.
1769
1812
 
1770
1813
  # For safety and simplicity, we just execute all pending uploads now.
1771
- self._executePendingUploads(self.con, self.cur)
1814
+ self._executePendingUploads()
1772
1815
 
1773
1816
  # Then we let the job store export. TODO: let the export come from the
1774
1817
  # cache? How would we write the URL?
@@ -1796,11 +1839,37 @@ class CachingFileStore(AbstractFileStore):
1796
1839
  # value?) wait on it, so we can't forget to join it later.
1797
1840
  self.waitForCommit()
1798
1841
 
1842
+ if len(self.jobDesc.filesToDelete) > 0:
1843
+ raise RuntimeError("Job is already in the process of being committed!")
1844
+
1845
+ state_to_commit: Optional[JobDescription] = None
1846
+
1847
+ if jobState:
1848
+ # Clone the current job description, so that further updates to it
1849
+ # (such as new successors being added when it runs) occur after the
1850
+ # commit process, and aren't committed early or partially.
1851
+ state_to_commit = copy.deepcopy(self.jobDesc)
1852
+ # Also snapshot the files that should be seen as deleted once the
1853
+ # update of the job description is visible.
1854
+ state_to_commit.filesToDelete = list(self.filesToDelete)
1855
+ # TODO: We never clear this out on the file store itself. This
1856
+ # might be necessary for later jobs to see earlier jobs' deleted
1857
+ # before they are committed?
1858
+
1859
+ logger.debug('Starting commit of %s forked from %s', state_to_commit, self.jobDesc)
1860
+ # Make sure the deep copy isn't summoning ghosts of old job
1861
+ # versions. It must be as new or newer at this point.
1862
+ self.jobDesc.check_new_version(state_to_commit)
1863
+
1864
+ # Bump the original's version since saving will do that too and we
1865
+ # don't want duplicate versions.
1866
+ self.jobDesc.reserve_versions(1 if len(state_to_commit.filesToDelete) == 0 else 2)
1867
+
1799
1868
  # Start the commit thread
1800
- self.commitThread = threading.Thread(target=self.startCommitThread, args=(jobState,))
1869
+ self.commitThread = threading.Thread(target=self.startCommitThread, args=(state_to_commit,))
1801
1870
  self.commitThread.start()
1802
1871
 
1803
- def startCommitThread(self, jobState):
1872
+ def startCommitThread(self, state_to_commit: Optional[JobDescription]):
1804
1873
  """
1805
1874
  Run in a thread to actually commit the current job.
1806
1875
  """
@@ -1810,38 +1879,28 @@ class CachingFileStore(AbstractFileStore):
1810
1879
  self.waitForPreviousCommit()
1811
1880
 
1812
1881
  try:
1813
- # Reconnect to the database from this thread. The main thread can
1814
- # keep using self.con and self.cur. We need to do this because
1815
- # SQLite objects are tied to a thread.
1816
- con = sqlite3.connect(self.dbPath, timeout=SQLITE_TIMEOUT_SECS)
1817
- cur = con.cursor()
1818
-
1819
1882
  logger.debug('Committing file uploads asynchronously')
1820
1883
 
1821
1884
  # Finish all uploads
1822
- self._executePendingUploads(con, cur)
1885
+ self._executePendingUploads()
1823
1886
  # Finish all deletions out of the cache (not from the job store)
1824
- self._executePendingDeletions(self.coordination_dir, con, cur)
1887
+ self._executePendingDeletions()
1825
1888
 
1826
- if jobState:
1889
+ if state_to_commit is not None:
1827
1890
  # Do all the things that make this job not redoable
1828
1891
 
1829
- logger.debug('Committing file deletes and job state changes asynchronously')
1892
+ logger.debug('Committing file deletes and job state changes asynchronously from %s', state_to_commit)
1830
1893
 
1831
- # Indicate any files that should be deleted once the update of
1832
- # the job wrapper is completed.
1833
- self.jobDesc.filesToDelete = list(self.filesToDelete)
1834
1894
  # Complete the job
1835
- self.jobStore.update_job(self.jobDesc)
1836
- # Delete any remnant jobs
1837
- list(map(self.jobStore.delete_job, self.jobsToDelete))
1838
- # Delete any remnant files
1839
- list(map(self.jobStore.delete_file, self.filesToDelete))
1895
+ self.jobStore.update_job(state_to_commit)
1896
+ # Delete the files
1897
+ list(map(self.jobStore.delete_file, state_to_commit.filesToDelete))
1840
1898
  # Remove the files to delete list, having successfully removed the files
1841
- if len(self.filesToDelete) > 0:
1842
- self.jobDesc.filesToDelete = []
1899
+ if len(state_to_commit.filesToDelete) > 0:
1900
+ state_to_commit.filesToDelete = []
1843
1901
  # Update, removing emptying files to delete
1844
- self.jobStore.update_job(self.jobDesc)
1902
+ self.jobStore.update_job(state_to_commit)
1903
+
1845
1904
  except:
1846
1905
  self._terminateEvent.set()
1847
1906
  raise
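For illustration, the thread body above commits in phases so that a crash leaves a recoverable record: persist the job description while it still names the files to delete, delete them, then persist again with the list cleared. A compact sketch of that ordering; commit_job_state is an assumed name, while update_job and delete_file match the calls in the hunk.

    def commit_job_state(job_store, state_to_commit) -> None:
        """Two-phase-style commit: record intent, act on it, then clear the record."""
        job_store.update_job(state_to_commit)            # 1: intent is now durable
        for file_id in state_to_commit.filesToDelete:    # 2: perform the deletions
            job_store.delete_file(file_id)
        if state_to_commit.filesToDelete:
            state_to_commit.filesToDelete = []
            job_store.update_job(state_to_commit)        # 3: nothing left to redo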
@@ -1852,14 +1911,14 @@ class CachingFileStore(AbstractFileStore):
1852
1911
  def shutdown(cls, shutdown_info: Tuple[str, str]) -> None:
1853
1912
  """
1854
1913
  :param shutdown_info: Tuple of the coordination directory (where the
1855
- cache database is) and the cache directory (where the cached data is).
1856
-
1914
+ cache database is) and the cache directory (where the cached data is).
1915
+
1857
1916
  Job local temp directories will be removed due to their appearance in
1858
1917
  the database.
1859
1918
  """
1860
-
1919
+
1861
1920
  coordination_dir, cache_dir = shutdown_info
1862
-
1921
+
1863
1922
  if os.path.isdir(cache_dir):
1864
1923
  # There is a directory to clean up
1865
1924
 
@@ -1877,7 +1936,7 @@ class CachingFileStore(AbstractFileStore):
1877
1936
  # and use that.
1878
1937
  dbFilename = None
1879
1938
  dbAttempt = float('-inf')
1880
-
1939
+
1881
1940
  # We also need to remember all the plausible database files and
1882
1941
  # journals
1883
1942
  all_db_files = []
@@ -1929,7 +1988,7 @@ class CachingFileStore(AbstractFileStore):
1929
1988
  for filename in all_db_files:
1930
1989
  # And delete everything related to the caching database
1931
1990
  robust_rmtree(filename)
1932
-
1991
+
1933
1992
  def __del__(self):
1934
1993
  """
1935
1994
  Cleanup function that is run when destroying the class instance that ensures that all the
@@ -1951,12 +2010,14 @@ class CachingFileStore(AbstractFileStore):
1951
2010
  # Get a cursor
1952
2011
  cur = con.cursor()
1953
2012
 
1954
- # Work out our process name for taking ownership of jobs
2013
+ # We're allowed to assign jobs to us without acquiring the process
2014
+ # identity lock; we know it won't interfere with any of the other logic
2015
+ # happening under our process's identity in the database.
1955
2016
  me = get_process_name(coordination_dir)
1956
2017
 
1957
2018
  # Get all the dead worker PIDs
1958
2019
  workers = []
1959
- for row in cur.execute('SELECT DISTINCT worker FROM jobs WHERE worker IS NOT NULL'):
2020
+ for row in cls._static_read(cur, 'SELECT DISTINCT worker FROM jobs WHERE worker IS NOT NULL'):
1960
2021
  workers.append(row[0])
1961
2022
 
1962
2023
  # Work out which of them are not currently running.
@@ -1969,14 +2030,14 @@ class CachingFileStore(AbstractFileStore):
1969
2030
  # Now we know which workers are dead.
1970
2031
  # Clear them off of the jobs they had.
1971
2032
  for deadWorker in deadWorkers:
1972
- cls._staticWrite(con, cur, [('UPDATE jobs SET worker = NULL WHERE worker = ?', (deadWorker,))])
2033
+ cls._static_write(con, cur, [('UPDATE jobs SET worker = NULL WHERE worker = ?', (deadWorker,))])
1973
2034
  if len(deadWorkers) > 0:
1974
2035
  logger.debug('Reaped %d dead workers', len(deadWorkers))
1975
2036
 
1976
2037
  while True:
1977
2038
  # Find an unowned job.
1978
2039
  # Don't take all of them; other people could come along and want to help us with the other jobs.
1979
- cur.execute('SELECT id FROM jobs WHERE worker IS NULL LIMIT 1')
2040
+ cls._static_read(cur, 'SELECT id FROM jobs WHERE worker IS NULL LIMIT 1')
1980
2041
  row = cur.fetchone()
1981
2042
  if row is None:
1982
2043
  # We cleaned up all the jobs
@@ -1985,10 +2046,10 @@ class CachingFileStore(AbstractFileStore):
1985
2046
  jobID = row[0]
1986
2047
 
1987
2048
  # Try to own this job
1988
- cls._staticWrite(con, cur, [('UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL', (me, jobID))])
2049
+ cls._static_write(con, cur, [('UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL', (me, jobID))])
1989
2050
 
1990
2051
  # See if we won the race
1991
- cur.execute('SELECT id, tempdir FROM jobs WHERE id = ? AND worker = ?', (jobID, me))
2052
+ cls._static_read(cur, 'SELECT id, tempdir FROM jobs WHERE id = ? AND worker = ?', (jobID, me))
1992
2053
  row = cur.fetchone()
1993
2054
  if row is None:
1994
2055
  # We didn't win the race. Try another one.
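For illustration, _removeDeadJobs, shown above moving to _static_read and _static_write, follows a reap-then-adopt pattern: clear dead workers off their jobs, then claim orphaned jobs one at a time with a conditional UPDATE and a read-back to confirm the claim stuck. A self-contained sketch of that pattern; the liveness callback and the three-column jobs schema are assumptions, since Toil's real check goes through the coordination directory.

    import sqlite3
    from typing import Callable, Iterator, Tuple

    def adopt_orphaned_jobs(con: sqlite3.Connection, me: str,
                            is_alive: Callable[[str], bool]) -> Iterator[Tuple[str, str]]:
        """Yield (job id, tempdir) for every orphaned job this process wins.
        Illustrative schema: jobs(id, tempdir, worker)."""
        workers = [w for (w,) in con.execute(
            'SELECT DISTINCT worker FROM jobs WHERE worker IS NOT NULL')]
        for worker in workers:
            if not is_alive(worker):
                with con:  # free the dead worker's jobs for adoption
                    con.execute('UPDATE jobs SET worker = NULL WHERE worker = ?', (worker,))
        while True:
            row = con.execute('SELECT id FROM jobs WHERE worker IS NULL LIMIT 1').fetchone()
            if row is None:
                break                                     # nothing left to adopt
            job_id = row[0]
            with con:  # conditional claim: only succeeds if still unowned
                con.execute('UPDATE jobs SET worker = ? WHERE id = ? AND worker IS NULL',
                            (me, job_id))
            won = con.execute('SELECT id, tempdir FROM jobs WHERE id = ? AND worker = ?',
                              (job_id, me)).fetchone()
            if won is not None:
                yield won                                 # we own it; clean it up
            # Otherwise another process won the race; try the next job.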