vocker 0.2.0-py3-none-any.whl → 0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vocker/dedup.py CHANGED
@@ -1,6 +1,8 @@
  from __future__ import annotations
 
  import abc
+ import datetime
+ from collections import defaultdict
  import contextlib
  import filelock
  import io
@@ -18,11 +20,10 @@ import structlog
  import concurrent.futures as cf
 
  import sqlalchemy as sa
- from sqlalchemy import orm as sao
+ from sqlalchemy import orm as sao, literal_column as _lit
  from sqlalchemy_boltons import sqlite as sq
  from sqlalchemy_boltons.orm import RelationshipComparator as Rel, IdKey
- from sqlalchemy_boltons.temporary import temporary_table
- from sqlalchemy_boltons.core import bytes_startswith
+ from sqlalchemy_boltons.core import bytes_startswith, count
  from boltons.iterutils import chunked_iter
  from cached_property import cached_property
 
@@ -39,15 +40,22 @@ logger = structlog.get_logger(__name__)
  class Corrupted:
  path: Path | None = attr.ib()
  file_id: int = attr.ib()
- exception: str = attr.ib()
- link_paths: frozenset[str] = attr.ib()
- raw_link_paths: frozenset[str] = attr.ib()
+ exception_name: str = attr.ib()
+ exception_string: str = attr.ib()
+ link_paths: tuple[Path, ...] = attr.ib()
+ raw_link_paths: tuple[bytes, ...] = attr.ib()
 
  def to_json(self):
  d = attr.asdict(self)
  d["path"] = p if (p := d["path"]) is None else str(p)
- for k in ("link_paths", "raw_link_paths"):
- d[k] = sorted(d[k])
+
+ d["link_paths"] = [str(x) for x in d["link_paths"]]
+ d["link_paths"].sort()
+
+ # JSON cannot handle raw bytes
+ d["raw_link_paths"] = [x.decode("iso-8859-1") for x in d["raw_link_paths"]]
+ d["raw_link_paths"].sort()
+
  return d
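The `iso-8859-1` decode above is the usual trick for smuggling raw bytes through JSON: Latin-1 maps every byte value 0-255 to exactly one code point, so the round trip is lossless. A minimal standalone illustration (editorial sketch, plain standard-library Python, not part of the package):

    import json

    raw = bytes(range(256))                 # arbitrary binary path, as stored in raw_link_paths
    as_text = raw.decode("iso-8859-1")      # every byte becomes exactly one code point
    restored = json.loads(json.dumps(as_text)).encode("iso-8859-1")
    assert restored == raw                  # lossless round trip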
 
 
@@ -132,10 +140,11 @@ class DedupLinkRequest(DedupRequest):
  """
 
  hash_function: mh.HashFunction = attr.ib()
- link_path: Path = attr.ib()
+ link_path: Path | None = attr.ib()
  file_metadata: DedupFileMetadata = attr.ib()
  file_contents_hash: mh.Digest | None = attr.ib()
  open_file_once: ty.Callable[[], ty.BinaryIO] | None = attr.ib()
+ adopt_existing: bool = attr.ib(default=False)
  file_not_needed: ty.Callable[[], None] | None = attr.ib(default=None)
  tags: ty.Set[bytes] = attr.ib(factory=frozenset)
 
@@ -163,15 +172,9 @@ class _ImplDedupRequestCommon:
  @attr.s(eq=False, hash=False, kw_only=True)
  class _ImplDedupLinkRequest(_ImplDedupRequestCommon):
  req: DedupLinkRequest = attr.ib(default=None)
- lookup_key = attr.ib(default=None)
- dedup_file_path: Path = attr.ib(default=None)
+ obj: _Obj | None = attr.ib(default=None)
  link_path_str: bytes | None = attr.ib(default=None)
- file: IdKey[mo.DedupFile] | None = attr.ib(default=None)
  metadata_bytes: bytes | None = attr.ib(default=None)
- file_size: int = attr.ib(default=None)
- file_mtime: int = attr.ib(default=None)
- fast_path: bool = attr.ib(default=False) # can we use the fast-path without db transaction?
- is_new: bool = attr.ib(default=False) # is it a brand new FileDedup?
  hashes_promised: dict[mh.HashFunction, mh.Digest] = attr.ib(default=None)
  hashes_computed: dict[mh.HashFunction, mh.Digest] | None = attr.ib(default=None)
  called_file: bool = attr.ib(default=False)
@@ -222,19 +225,21 @@ class AdoptRequest:
  path: Path = attr.ib()
  tags: ty.Set[bytes] = attr.ib(factory=frozenset)
 
- out_size: int | None = attr.ib(init=False, default=None)
- out_digest: mh.Digest | None = attr.ib(init=False, default=None)
 
+ @attr.s(eq=False, slots=True)
+ class _Obj:
+ id: int = attr.ib(factory=None)
+ pending_file_ids = attr.ib(factory=list)
+ completed_file_ids = attr.ib(factory=list)
+ file_size: int | None = attr.ib(default=None)
+ adopted_file_path: Path | None = attr.ib(default=None)
 
- @attr.s(eq=False, hash=False)
- class _ImplAdoptRequest:
- req: AdoptRequest = attr.ib()
- link_path: bytes = attr.ib(default=None)
- file_metadata: DedupFileMetadata = attr.ib(default=None)
- file_metadata_bytes: bytes = attr.ib(default=None)
- done: bool = attr.ib(default=False)
- dedup_file_path: Path = attr.ib(default=None)
- delete: bool = attr.ib(default=False)
+
+ @attr.s(eq=False, slots=True)
+ class _Updates:
+ obj_updates = attr.ib(factory=list)
+ file_updates = attr.ib(factory=list)
+ link_updates = attr.ib(factory=list)
 
 
  """
@@ -284,7 +289,7 @@ class _PendingUpdater:
  raise ValueError(f"invalid update_interval={u!r}")
 
  def _update(self):
- with self.sessionmaker_w() as s:
+ with self.sessionmaker_w.begin() as s:
  pending: mo.Pending = self.pending.get_one(s)
  pending.expire_at = mo.now() + self.seconds_in_the_future
 
@@ -335,6 +340,10 @@ def make_sqlite_options(synchronous):
  )
 
 
+ def _ns(stmt):
+ return stmt.execution_options(synchronize_session=False)
+
+
  @attr.s(eq=False, hash=False)
  class Dedup(abc.ABC):
  base_path: Path = attr.ib()
@@ -345,10 +354,14 @@ class Dedup(abc.ABC):
  _path_db: Path | None = attr.ib(default=None, kw_only=True)
  path_temporary: Path | None = attr.ib(default=None, kw_only=True)
  path_deleted: Path | None = attr.ib(default=None, kw_only=True)
- path_corrupted: Path | None = attr.ib(default=None, kw_only=True)
  _integer_to_path = attr.ib(factory=IntegerToPath, kw_only=True)
  _sqlite_synchronous = attr.ib(default="NORMAL", kw_only=True)
  _batch_size = 1000
+ max_link_count: int = ...
+ _clean_dedup_mtime = (
+ round(datetime.datetime(2000, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc).timestamp()) & ~1
+ )
+ _corrupted_pending_id = -1 # reserved ID
 
  def __attrs_post_init__(self):
  if self._path_dedup is None:
@@ -363,15 +376,12 @@ class Dedup(abc.ABC):
  if self.path_temporary is None:
  self.path_temporary = self.base_path / "tmp"
 
- if self.path_corrupted is None:
- self.path_corrupted = self.base_path / "corrupted"
-
  self._path_dedup.mkdir(exist_ok=True, parents=True)
  self._path_db.parent.mkdir(exist_ok=True, parents=True)
- self.path_corrupted.mkdir(exist_ok=True, parents=True)
  self.path_deleted.mkdir(exist_ok=True, parents=True)
  self._path_temporary_dirs.mkdir(exist_ok=True, parents=True)
  self._path_temporary_lock.mkdir(exist_ok=True, parents=True)
+ self._path_temporary_simple_dir.mkdir(exist_ok=True, parents=True)
  engine = sq.create_engine_sqlite(self._path_db, create_engine_args=dict(echo=False))
  engine = make_sqlite_options(synchronous=self._sqlite_synchronous).apply(engine)
  self._engine_r = engine
@@ -380,10 +390,6 @@ class Dedup(abc.ABC):
  self._SessionR = sao.sessionmaker(self._engine_r)
  self._SessionW = sao.sessionmaker(self._engine_w)
 
- # FIXME: use proper session management
- # self.session = Session(self.engine_rw) # HACK
- # self.engine = self.engine_rw # HACK
-
  self._initialize_db()
 
  def _initialize_db(self):
@@ -392,6 +398,10 @@ class Dedup(abc.ABC):
  mo.BaseDedup.metadata.create_all(conn)
  conn.commit()
 
+ with self._beginw() as s:
+ if s.get(mo.Pending, self._corrupted_pending_id) is None:
+ s.add(mo.Pending(id=self._corrupted_pending_id, expire_at=1))
+
  @contextlib.contextmanager
  def _beginw(self):
  with self._SessionW.begin() as s:
@@ -408,6 +418,10 @@ class Dedup(abc.ABC):
  new_mode |= mask
  os.chmod(str(path), new_mode, follow_symlinks=False)
 
+ def _set_clean_file_mtime(self, path: Path) -> None:
+ t = self._clean_dedup_mtime
+ os.utime(path, (t, t))
+
  def get_metadata_from_file(self, path: Path) -> DedupFileMetadata:
  if supports_executable():
  mode = path.stat().st_mode
@@ -470,42 +484,269 @@ class Dedup(abc.ABC):
470
484
  hashes=[f(h) for h in link.hashes_promised.values()],
471
485
  )
472
486
 
473
- def _add_tags_to_file(self, session: sao.Session, file: mo.DedupFile, tags: ty.Set[bytes]):
474
- if not tags:
475
- return
487
+ def _tmp_sqlite(self, tmp):
488
+ with self._SessionR() as s:
489
+ return tmp.get(s, "").value
476
490
 
477
- Tag = sao.aliased(mo.Tag)
478
- current_tags = frozenset(
479
- session.execute(sa.select(Tag.name).where(Tag.file == file)).scalars().all()
491
+ @cached_property
492
+ def _tmp_files(self):
493
+ return self._tmp_sqlite(mo.tmp_new_files)
494
+
495
+ @cached_property
496
+ def _tmp_files2(self):
497
+ return self._tmp_sqlite(mo.tmp_new_files2)
498
+
499
+ @cached_property
500
+ def _tmp_check_links(self):
501
+ return self._tmp_sqlite(mo.tmp_check_links)
502
+
503
+ @cached_property
504
+ def _tmp_delete_extra(self):
505
+ return self._tmp_sqlite(mo.tmp_delete_extra)
506
+
507
+ @cached_property
508
+ def _sql_prebatch_check_link(self):
509
+ L = sao.aliased(mo.Link)
510
+ return count(L).where(L.path == sa.bindparam("p_path"))
511
+
512
+ @cached_property
513
+ def _sql_prebatch_update_with_existing_dedup_files(self):
514
+ tmp = self._tmp_files
515
+ ONE = _lit("1")
516
+
517
+ def _eq(x, y, attributes):
518
+ return sa.and_(getattr(x, a) == getattr(y, a) for a in attributes)
519
+
520
+ O = sao.aliased(mo.Obj, name="obj")
521
+ tmp_f = sao.aliased(tmp.files, name="t_file")
522
+ tmp_tag = sao.aliased(tmp.tags, name="t_tag")
523
+ tmp_hash = sao.aliased(tmp.hashes, name="t_hash")
524
+ Tag = sao.aliased(mo.Tag, name="tag")
525
+ Hash = sao.aliased(mo.Hash, name="hash")
526
+ cond_obj = O.q_is_complete() | (O.pending_id == sa.bindparam("p_pending_id"))
527
+ cond_obj &= _eq(O, tmp_f.c, ["metadata_bytes"])
528
+ sqt = (
529
+ sa.select(O.id)
530
+ .join(Tag, O.tags)
531
+ .join(tmp_tag, _eq(tmp_tag.c, Tag, ["name"]))
532
+ .where(cond_obj, tmp_tag.c.id == tmp_f.c.id)
533
+ .limit(ONE)
534
+ )
535
+ q = sa.select(tmp_f.c.id, sqt.scalar_subquery().label("obj_id"))
536
+ q = q.where(tmp_f.c.obj_id == None).subquery()
537
+
538
+ sqh = (
539
+ sa.select(O.id)
540
+ .join(Hash, O.hashes)
541
+ .join(tmp_hash, _eq(tmp_hash.c, Hash, ["hash_function", "hash"]))
542
+ .where(cond_obj, tmp_hash.c.id == q.c.id)
543
+ .limit(ONE)
544
+ )
545
+ q = sa.select(
546
+ q.c.id,
547
+ sa.case((q.c.obj_id == None, sqh.scalar_subquery()), else_=q.c.obj_id).label("obj_id"),
480
548
  )
481
- for name in tags - current_tags:
482
- session.add(mo.Tag(name=name, file=file))
483
549
 
484
- def _prepare_dedup_file_for_linking(
485
- self, session: sao.Session, file: mo.DedupFile, link: _ImplDedupLinkRequest
486
- ):
487
- if link.is_new:
488
- # We need to flush so that the DedupFile gets assigned an ID. The merge below needs it.
489
- session.flush()
490
-
491
- # We add our tags.
492
- self._add_tags_to_file(session, file, link.req.tags)
493
-
494
- # Delete any existing link.
495
- session.connection().execute(
496
- sa.delete(mo.Link)
497
- .where(mo.Link.link_path == link.link_path_str)
498
- .execution_options(synchronize_session=False)
550
+ tmp_f_up = sao.aliased(tmp.files, name="t_file")
551
+
552
+ # Finally, create the UPDATE statement that uses `qu` to update `tmp_files`.
553
+ qu = q.cte(name="t_file_changes")
554
+ stmt = sa.update(tmp_f_up)
555
+ stmt = stmt.values(obj_id=qu.c.obj_id).where(tmp_f_up.c.id == qu.c.id)
556
+ return _ns(stmt)
557
+
558
+ @cached_property
559
+ def _sql_prebatch_insert_missing_objs(self):
560
+ """
561
+ Create Obj records where missing.
562
+ """
563
+ tmp = self._tmp_files
564
+ fake_created_at = sa.bindparam("p_fake_created_at")
565
+
566
+ tmp_f = sa.alias(tmp.files, name="t_file")
567
+ Obj = sao.aliased(mo.Obj, name="obj")
568
+ q = sa.select(
569
+ tmp_f.c.metadata_bytes.label("metadata"),
570
+ tmp_f.c.id.label("size"), # smuggle ID through this field
571
+ fake_created_at.label("created_at"),
572
+ sa.null().label("orphaned_at"),
573
+ sa.bindparam("p_pending_id").label("pending_id"),
574
+ )
575
+ q = q.select_from(tmp_f).where(
576
+ tmp_f.c.obj_id == None, tmp_f.c.insert_obj_if_missing == True
577
+ )
578
+ qi = sa.insert(mo.Obj)
579
+ qi = qi.from_select(["metadata", "size", "created_at", "orphaned_at", "pending_id"], q)
580
+ del q, Obj, tmp_f
581
+
582
+ Obj = sao.aliased(mo.Obj, name="obj")
583
+ q = sa.select(Obj.id.label("obj_id"), Obj.size.label("id")).where(
584
+ Obj.created_at < _lit("0"), Obj.created_at == fake_created_at
585
+ )
586
+ q = q.cte(name="t_file_changes")
587
+
588
+ tmp_f = sa.alias(tmp.files, name="t_file")
589
+ qu = sa.update(tmp_f).add_cte(q)
590
+ qu = qu.values(new_obj_id=q.c.obj_id).where(tmp_f.c.id == q.c.id)
591
+
592
+ return _ns(qi), _ns(qu)
593
+
594
+ @cached_property
595
+ def _sql_prebatch_fix_and_delete_objs(self):
596
+ tmp = self._tmp_files
597
+ tmp_f = sao.aliased(tmp.files, name="t_files")
598
+ pending_id = sa.bindparam("p_pending_id")
599
+ created_at = sa.bindparam("p_created_at")
600
+ Obj = sao.aliased(mo.Obj, name="obj")
601
+
602
+ # Set a proper created_at for the new Objs that are actually in use.
603
+ q = sa.select(tmp_f.c.obj_id).where(tmp_f.c.obj_id != None)
604
+ r1 = sa.update(Obj).values(created_at=created_at)
605
+ r1 = r1.where(Obj.id.in_(q), Obj.pending_id == pending_id)
606
+
607
+ # Set updated_at to the current time.
608
+ r2 = sa.update(Obj).values(updated_at=created_at)
609
+ r2 = r2.where(Obj.id.in_(q))
610
+
611
+ # Delete remaining Objs.
612
+ r3 = sa.delete(Obj).where(
613
+ Obj.id.in_(sa.select(tmp_f.c.new_obj_id)), Obj.created_at < _lit("0")
499
614
  )
500
615
 
501
- # Create link object.
502
- session.add(mo.Link(link_path=link.link_path_str, file=file))
616
+ return tuple(_ns(x) for x in (r1, r2, r3))
503
617
 
504
- # Since we created a link, the file is definitely not orphaned.
505
- file.orphaned_at = None
618
+ @cached_property
619
+ def _sql_prebatch_insert_hashes(self):
620
+ """
621
+ Create Hash records.
622
+ """
623
+ tmp = self._tmp_files
624
+ tmp_f = sao.aliased(tmp.files, name="t_files")
625
+ tmp_h = sao.aliased(tmp.hashes, name="t_hash")
626
+ Obj = sao.aliased(mo.Obj, name="obj")
627
+ Hash = sao.aliased(mo.Hash, name="h")
628
+
629
+ q = sa.select(tmp_f.c.new_obj_id, tmp_h.c.hash_function, tmp_h.c.hash)
630
+ q = q.select_from(tmp_h).join(tmp_f, tmp_f.c.id == tmp_h.c.id)
631
+ exists = sa.exists().select_from(Hash)
632
+ exists = exists.where(
633
+ Hash.hash_function == tmp_h.c.hash_function, Hash.obj_id == tmp_f.c.new_obj_id
634
+ )
635
+ q = q.where(~exists, tmp_f.c.new_obj_id != None)
636
+ stmt = sa.insert(mo.Hash).from_select(["obj_id", "hash_function", "hash"], q)
637
+ return _ns(stmt)
506
638
 
507
- # This also relies on the flush above.
508
- link.dedup_file_path = self._make_dedup_file_path(file.id)
639
+ @cached_property
640
+ def _sql_prebatch_insert_tags(self):
641
+ """
642
+ Create Tag records.
643
+ """
644
+ # Sadly this has a lot in common with `_sql_insert_hashes`. The urge to refactor is intense.
645
+ tmp = self._tmp_files
646
+ tmp_f = sao.aliased(tmp.files, name="t_files")
647
+ tmp_t = sao.aliased(tmp.tags, name="t_tag")
648
+ Obj = sao.aliased(mo.Obj, name="obj")
649
+ Tag = sao.aliased(mo.Tag, name="tag")
650
+
651
+ q = sa.select(tmp_f.c.new_obj_id, tmp_t.c.name)
652
+ q = q.select_from(tmp_t).join(tmp_f, tmp_f.c.id == tmp_t.c.id)
653
+ exists = sa.exists().select_from(Tag)
654
+ exists = exists.where(Tag.name == tmp_t.c.name, Tag.obj_id == tmp_f.c.new_obj_id)
655
+ q = q.where(~exists, tmp_f.c.new_obj_id != None)
656
+ stmt = sa.insert(mo.Tag).from_select(["obj_id", "name"], q)
657
+ return _ns(stmt)
658
+
659
+ @cached_property
660
+ def _sql_prebatch_insert_files(self):
661
+ tmp = self._tmp_files
662
+ tmp_f = sao.aliased(tmp.files, name="t_files")
663
+ q = sa.select(
664
+ tmp_f.c.obj_id,
665
+ sa.bindparam("p_pending_id").label("pending_id"),
666
+ sa.bindparam("p_created_at").label("created_at"),
667
+ )
668
+ q = q.where(tmp_f.c.obj_id != None)
669
+ stmt = sa.insert(mo.File).from_select(["obj_id", "pending_id", "created_at"], q)
670
+ return _ns(stmt)
671
+
672
+ @cached_property
673
+ def _sql_prebatch_delete_and_insert_links(self):
674
+ tmp = self._tmp_files
675
+ F = sao.aliased(mo.File, name="file")
676
+ L = sao.aliased(mo.Link, name="link")
677
+ tmp_f = sao.aliased(tmp.files, name="t_file")
678
+ null_id = sa.bindparam("p_null_file_id")
679
+
680
+ cond_link = (tmp_f.c.obj_id != None) & (tmp_f.c.link_path != None)
681
+
682
+ # Invalidate file link counts for the links we are about to delete.
683
+ q = sa.select(L.file_id).where(L.path.in_(sa.select(tmp_f.c.link_path).where(cond_link)))
684
+ r0 = sa.update(F).values(link_count=-1)
685
+ r0 = r0.where(F.id.in_(q))
686
+
687
+ # Delete the old links.
688
+ r1 = sa.delete(L).where(L.path.in_(sa.select(tmp_f.c.link_path).where(cond_link)))
689
+
690
+ # Insert the new links.
691
+ q = sa.select(tmp_f.c.link_path.label("path"), null_id.label("file_id")).where(cond_link)
692
+ r2 = sa.insert(mo.Link).from_select(["path", "file_id"], q)
693
+
694
+ # Set in-use Objs as not orphaned as they now have links.
695
+ O = sao.aliased(mo.Obj, name="obj")
696
+ q = sa.select(tmp_f.c.obj_id).where(cond_link)
697
+ r3 = sa.update(O).values(orphaned_at=None).where(O.id.in_(q))
698
+
699
+ return tuple(_ns(r) for r in (r0, r1, r2, r3))
700
+
701
+ @cached_property
702
+ def _sql_postbatch_update_objs(self):
703
+ t = sao.aliased(self._tmp_files2.objs, name="tu_obj")
704
+ q = sa.select(t.c.obj_id, t.c.size).subquery()
705
+ O = sao.aliased(mo.Obj, name="obj")
706
+ stmt = sa.update(O).where(O.id == q.c.obj_id)
707
+ stmt = stmt.values(size=q.c.size, pending_id=None)
708
+ return _ns(stmt)
709
+
710
+ @cached_property
711
+ def _sql_postbatch_update_files(self):
712
+ t = sao.aliased(self._tmp_files2.files, name="tu_files")
713
+ q = sa.select(t.c.file_id, t.c.obj_id).subquery()
714
+ F = sao.aliased(mo.File, name="file")
715
+ stmt = sa.update(F).where(F.id == q.c.file_id)
716
+ stmt = stmt.values(obj_id=q.c.obj_id, pending_id=None)
717
+ return _ns(stmt)
718
+
719
+ @cached_property
720
+ def _sql_postbatch_update_links(self):
721
+ t = sao.aliased(self._tmp_files2.links, name="tu_links")
722
+ q = sa.select(t.c.link_path, t.c.file_id, t.c.link_count).subquery()
723
+ F = sao.aliased(mo.File, name="file")
724
+ L = sao.aliased(mo.Link, name="link")
725
+ stmt1 = sa.update(F).where(F.id == q.c.file_id)
726
+ stmt1 = stmt1.values(link_count=q.c.link_count)
727
+ stmt2 = sa.update(L).where(L.path == q.c.link_path)
728
+ stmt2 = stmt2.values(file_id=q.c.file_id)
729
+ return _ns(stmt1), _ns(stmt2)
730
+
731
+ @cached_property
732
+ def _sql_prebatch_select_req_obj(self):
733
+ tmp = self._tmp_files
734
+ t_files = sao.aliased(tmp.files, name="t_files")
735
+ q = sa.select(t_files.c.id, t_files.c.obj_id).where(t_files.c.obj_id != None)
736
+ return _ns(q)
737
+
738
+ @cached_property
739
+ def _sql_prebatch_select_obj_file(self):
740
+ tmp = self._tmp_files
741
+ p_id = sa.bindparam("p_pending_id")
742
+ O = sao.aliased(mo.Obj, name="obj")
743
+ F = sao.aliased(mo.File, name="file")
744
+ t_files = sao.aliased(tmp.files, name="t_files")
745
+ qo = sa.select(t_files.c.obj_id).where(t_files.c.obj_id != None)
746
+ q = sa.select(F.obj_id, F.id, F.pending_id)
747
+ q = q.where(F.link_count < sa.bindparam("p_max_link_count"))
748
+ q = q.where(F.obj_id.in_(qo), (F.pending_id == None) | (F.pending_id == p_id))
749
+ return _ns(q)
509
750
 
510
751
  def run_batch(self, requests: ty.Iterable[DedupRequest]) -> None:
511
752
  """
@@ -514,24 +755,42 @@ class Dedup(abc.ABC):
 
  The requests will be addressed in the order that they appear in the iterable.
 
- Notes
- -----
+ BUG: If the same file (same hash or same tag) appears multiple times in the *requests* then
+ multiple files will be created. You are welcome to fix this without breaking the tests and
+ without incurring a significant performance penalty.
+
+ We create more Objs and Files than we need, then clean them up later no biggie.
+ They get automatically deleted when the Pending record is deleted.
+
+ Pre-batch:
 
- The implementation tries to spend as little time as possible inside database transactions.
+ 1. Insert temporary files, hashes, tags
+ 2. Update tmp.files.obj_id by matching existing Objs by hash or tag.
+ 3. Insert a new Obj for each tmp_file where obj_id is NULL.
+ 4. Insert Objs and update t.files.obj_id to point to the right Obj.
+ 5. Insert Hash and Tag rows corresponding to stuff in t.files.
+ 6. Update tmp.files.obj_id by matching existing Objs by hash or tag.
+ 7. Insert a new File for each tmp_file. We might not need it.
+ 8. Insert a new Link for each tmp_file.
+ 9. Select (Obj.id, File.id) for each of tmp.files. These are all the possible usable files
+ that we will attempt to create a link to.
 
- 1. Search database for existing deduplicated files that can be reused. These are files
- that match either the hash or one of the tags.
- 2. Create a record for each new deduplicated file. Create a Pending
- 3.
+ Batch:
 
- NEW IDEA FIXME
- --------------
+ 1. For each request:
+ 2. If no content is present, then use the existing pending File id to write the content.
+ 3. If content is already present:
+ 3. For each related file_id:
+ 4. Check the link count and make a reminder to update the link count in the DB.
+ 5. Attempt to create a link pointing to that file_id. If it succeeds, continue
+ to the next request.
+ 6. None of the file IDs succeeded. Make a copy of an existing file using the spare
+ pending File id.
 
- Split into fast path and slow path. If it's a brand new file OR it's an existing file that
- is done being written (not pending), then that's the fast path. Otherwise it's the slow
- path.
+ Post-batch:
 
- On the *fast path* we don't need to check for what other threads are doing.
+ 1. Update the link counts.
+ 2. Match each
 
  """
 
@@ -553,236 +812,189 @@ class Dedup(abc.ABC):
553
812
  "doing both links and copies in the same batch is not supported for now"
554
813
  )
555
814
 
815
+ # cases to consider:
816
+ # adopt_existing==True, link_path is an existing dedup link: NOT IMPLEMENTED
817
+ # adopt_existing==True, link_path is a regular file, hash matches content inside dedup db
818
+ # adopt_existing==True, link_path is a regular file, content is novel
819
+
556
820
  # Preliminaries to do before we start writing to the database.
557
- all_tags: set[bytes] = set()
558
- hashes_to_search: list[dict] = []
821
+ tmp_files = []
822
+ tmp_tags = []
823
+ tmp_hashes = []
559
824
  with self._SessionR() as s:
560
825
  for link in links:
561
- with self._ignore_skip(), self._catch_req_exc(link):
562
- req = link.req
563
- link.link_path_str = self._link_path_to_string(req.link_path)
826
+ req = link.req
827
+
828
+ if req.link_path is not None:
829
+ link.link_path_str = ps = self._link_path_to_string(req.link_path)
564
830
  # Remove existing file if present. This may raise if the path is actually a
565
831
  # directory.
566
- req.link_path.unlink(missing_ok=True)
567
-
568
- all_tags |= req.tags
569
-
570
- link.metadata_bytes = self.convert_file_metadata_to_bytes(req.file_metadata)
571
-
572
- if (h := req.file_contents_hash) is not None:
573
- link.lookup_key = h, link.metadata_bytes
574
- d = {
575
- "id": link.index,
576
- "hash_function": h.function.function_code,
577
- "digest": h.digest,
578
- "metadata_bytes": link.metadata_bytes,
579
- }
580
- hashes_to_search.append(d)
581
- link.hashes_promised = {h.function: h}
832
+ if req.adopt_existing:
833
+ pass # Assertion is too expensive.
834
+ # assert not s.execute(
835
+ # self._sql_prebatch_check_link, {"p_path": ps}
836
+ # ).scalar(), "adopting an existing link is not supported yet"
582
837
  else:
583
- link.hashes_promised = {}
584
-
585
- for copy in copies:
586
- with self._ignore_skip(), self._catch_req_exc(copy):
587
- req = copy.req
588
- copy.src_str = self._link_path_to_string(req.src)
589
- copy.dst_str = self._link_path_to_string(req.dst)
590
-
591
- def _q_gather_file_related(s, cls, attribute, values_set):
592
- """
593
- Query DedupFile-related information.
594
- """
595
- if not values_set:
596
- return () # short-cut to avoid doing the query at all
597
- Related = sao.aliased(cls)
598
- q = sa.select(Related).where(getattr(Related, attribute).in_(values_set))
599
- q = q.options(sao.joinedload(Related.file))
600
- return s.execute(q).scalars()
838
+ req.link_path.unlink(missing_ok=True)
839
+ else:
840
+ # The user is requesting insert of content but doesn't want an actual link
841
+ # to be created.
842
+ link.link_path_str = ps = None
843
+ assert not req.adopt_existing
844
+
845
+ if req.adopt_existing:
846
+ req.file_metadata = m = self.get_metadata_from_file(req.link_path)
847
+ with req.link_path.open("rb") as f:
848
+ r = self._compute_file_hash(req.hash_function, f)
849
+ link.file_size, req.file_contents_hash = r
850
+ else:
851
+ m = req.file_metadata
601
852
 
602
- # Now we check the database and add file hash records where we can.
603
- with self._beginw() as s:
604
- s.add(pending := mo.Pending(expire_at=mo.now() + 30.0))
605
- s.flush()
606
- pending_key = IdKey.from_instance(pending)
853
+ link.metadata_bytes = m = self.convert_file_metadata_to_bytes(m)
607
854
 
608
- # Load relevant tags.
609
- q = _q_gather_file_related(s, mo.Tag, "name", all_tags)
610
- tag_to_file: dict[bytes, mo.DedupFile] = {x.name: x.file for x in q}
611
-
612
- # Load relevant hashes.
613
- if hashes_to_search:
614
- with temporary_table(s, mo.tmp_hash_lookup) as tmp:
615
- s.connection().execute(sa.insert(tmp), hashes_to_search).close()
616
- H = sao.aliased(mo.Hash)
617
- F = sao.aliased(mo.DedupFile)
618
- q = (
619
- sa.select(H, F)
620
- .join(F, H.file)
621
- .join(
622
- tmp,
623
- (tmp.c.digest == H.hash)
624
- & (tmp.c.hash_function == H.hash_function)
625
- & (tmp.c.metadata_bytes == F.file_metadata),
626
- )
627
- )
628
- hash_to_file = {
629
- (h.to_digest(), f.file_metadata): f for h, f in s.execute(q).all()
855
+ tmp_files.append(
856
+ {
857
+ "id": link.index,
858
+ "link_path": ps,
859
+ "metadata_bytes": m,
860
+ "insert_obj_if_missing": req.open_file_once is not None
861
+ or req.adopt_existing,
630
862
  }
631
- else:
632
- hash_to_file = {}
863
+ )
633
864
 
634
- # Construct a set so that we can check for intersection quickly.
635
- tag_to_file_set = set(tag_to_file)
865
+ tmp_tags += ({"id": link.index, "name": tag} for tag in req.tags)
636
866
 
637
- for link in links:
638
- if link.failed:
639
- continue
640
-
641
- req = link.req
642
-
643
- if overlap := req.tags & tag_to_file_set:
644
- # We found a deduped file with a common alternate key! We use it!
645
- file = tag_to_file[next(iter(overlap))]
646
- elif (key := link.lookup_key) is not None:
647
- # Check for a deduped file with the same hash.
648
- file = hash_to_file.get(key, None)
867
+ if (h := req.file_contents_hash) is not None:
868
+ d = {
869
+ "id": link.index,
870
+ "hash_function": h.function.function_code,
871
+ "hash": h.digest,
872
+ }
873
+ tmp_hashes.append(d)
874
+ link.hashes_promised = {h.function: h}
649
875
  else:
650
- file = None
876
+ link.hashes_promised = {}
651
877
 
652
- if file is None:
653
- # We did not find a matching file. We create a new one if we can.
654
- link.is_new = True
655
- link.fast_path = True
878
+ if (req.file_contents_hash is None) and not req.tags:
879
+ raise AssertionError("must provide hash and/or tags")
656
880
 
657
- if req.open_file_once is None:
658
- # The user does not actually have the contents of the file. We skip over
659
- # it.
660
- link.set_failed(MissingContentError())
661
- continue
881
+ updates = _Updates()
882
+ objs: dict[int, _Obj] = {}
662
883
 
663
- # We must create a file.
664
- s.add(file := self._make_dedup_file(link, pending))
665
- elif file.pending_id is None:
666
- # We found a matching file and it is not pending. We can use it directly.
667
- link.fast_path = True
884
+ # Now we check the database and add file hash records where we can.
885
+ with self._beginw() as s, mo.tmp_new_files(s, "") as t:
886
+ c = s.connection()
887
+ s.add(pending := mo.Pending(expire_at=mo.now() + 30.0))
888
+ s.flush()
889
+ pending_key = IdKey.from_instance(pending)
890
+ pending_id = pending.id
891
+ if tmp_files:
892
+ c.execute(sa.insert(t.files), tmp_files).close()
893
+ if tmp_hashes:
894
+ c.execute(sa.insert(t.hashes), tmp_hashes).close()
895
+ if tmp_tags:
896
+ c.execute(sa.insert(t.tags), tmp_tags).close()
897
+
898
+ s.add(temp_file := mo.File(pending_id=pending_id))
899
+ s.flush()
900
+ temp_file_id = temp_file.id
901
+
902
+ # Set t.files.obj_id using existing Obj and File records.
903
+ d = {
904
+ "p_fake_created_at": -1,
905
+ "p_created_at": mo.now(),
906
+ "p_null_file_id": temp_file_id,
907
+ "p_pending_id": pending_id,
908
+ "p_max_link_count": self.max_link_count,
909
+ }
910
+ c.execute(self._sql_prebatch_update_with_existing_dedup_files, d).close()
911
+
912
+ # We insert Obj records for the requests where t.files.obj_id is NULL.
913
+ for stmt in self._sql_prebatch_insert_missing_objs:
914
+ c.execute(stmt, d).close()
915
+
916
+ # Now all requests have an Obj. We add tag or hash records where necessary.
917
+ c.execute(self._sql_prebatch_insert_tags).close()
918
+ c.execute(self._sql_prebatch_insert_hashes).close()
919
+
920
+ # Coalesce overlapping new Objs.
921
+ c.execute(self._sql_prebatch_update_with_existing_dedup_files, d).close()
922
+
923
+ # Delete unused Objs. Set `Obj.created_at` for remaining ones.
924
+ for stmt in self._sql_prebatch_fix_and_delete_objs:
925
+ c.execute(stmt, d).close()
926
+
927
+ # Speculatively insert as many files as there are members in the batch.
928
+ c.execute(self._sql_prebatch_insert_files, d).close()
929
+
930
+ for r in self._sql_prebatch_delete_and_insert_links:
931
+ c.execute(r, d).close()
932
+
933
+ if 0:
934
+ tmp = self._tmp_files
935
+ print("**************** files, hashes, tags")
936
+ for tab in (tmp.files, tmp.hashes, tmp.tags):
937
+ print(s.execute(sa.select(tab)).all())
938
+
939
+ for req_id, obj_id in c.execute(self._sql_prebatch_select_req_obj):
940
+ if (obj := objs.get(obj_id)) is None:
941
+ objs[obj_id] = obj = _Obj(id=obj_id)
942
+ (link := links[req_id]).obj = obj
943
+ if link.req.adopt_existing:
944
+ obj.adopted_file_path = link.req.link_path
945
+
946
+ for obj_id, file_id, pending_id in c.execute(self._sql_prebatch_select_obj_file, d):
947
+ o = objs[obj_id]
948
+ if pending_id is None:
949
+ o.completed_file_ids.append(file_id)
668
950
  else:
669
- # If the file is still in a pending state, the hashes and tags are unreliable.
670
- # The file might fail to be written, the hashes might be invalid, etc. We must
671
- # use the slow path and wait for the file to become ready.
672
- link.fast_path = False
673
- file = None
674
-
675
- if link.fast_path:
676
- self._prepare_dedup_file_for_linking(s, file, link)
677
- if link.is_new:
678
- # If the same file shows up later in the batch, ensure that it is used.
679
- for v in link.hashes_promised.values():
680
- hash_to_file[v, file.file_metadata] = file
681
-
682
- # the _prepare_dedup_file_for_linking caused a flush, so our primary key is ready
683
- if file is not None:
684
- link.file = IdKey.from_instance(file)
685
-
686
- L = sao.aliased(mo.Link)
687
- q = sa.select(L).where(
688
- (L.link_path == sa.bindparam("x_src")) | (L.link_path == sa.bindparam("x_dst"))
689
- )
690
- for copy in copies:
691
- with self._ignore_skip(), self._catch_req_exc(copy):
692
- link_objs = {
693
- x.link_path: x
694
- for x in s.execute(q, {"x_src": copy.src_str, "x_dst": copy.dst_str})
695
- .scalars()
696
- .all()
697
- }
698
-
699
- if (src_link := link_objs.get(copy.src_str)) is None:
700
- raise NotADedupLinkError
701
-
702
- if (dst_link := link_objs.get(copy.dst_str)) is not None:
703
- s.delete(dst_link)
704
-
705
- copy.dedup_file_path = self._make_dedup_file_path(src_link.file_id)
706
- s.add(mo.Link(file_id=src_link.file_id, link_path=copy.dst_str))
707
- s.flush()
708
- del q, L
951
+ o.pending_file_ids.append(file_id)
709
952
 
710
953
  pending.expire_at = mo.now() + 30.0
954
+ del pending
955
+
956
+ failed_link_paths = []
957
+ with self._PendingUpdater(
958
+ pending=pending_key,
959
+ sessionmaker_r=self._SessionR,
960
+ sessionmaker_w=self._SessionW,
961
+ seconds_in_the_future=20,
962
+ ) as pu, self.temporary_directory(check_links=False) as tmp_path:
963
+ for link in links:
964
+ with self._ignore_skip(), self._catch_req_exc(link):
965
+ if (obj := link.obj) is None:
966
+ # nothing to be done here
967
+ link.call_file_not_needed()
968
+ link.set_failed(MissingContentError(f"no obj {link}"))
969
+ continue
711
970
 
712
- del hash_to_file, tag_to_file, tag_to_file_set, pending
713
-
714
- to_be_flushed = []
715
- failed_requests = []
716
-
717
- def _flush_now(s: sao.Session):
718
- for link in to_be_flushed:
719
- file: mo.DedupFile | None = None if (f := link.file) is None else f.get(s)
971
+ self._write_dedup_file_contents(link, tmp_path, updates)
720
972
 
721
- if link.failed or file is None:
722
- failed_requests.append(link.req)
723
- if file is not None:
724
- s.delete(file)
725
- continue
973
+ with self._beginw() as s, mo.tmp_new_files2(s, "") as t:
974
+ c = s.connection()
726
975
 
727
- if (size := link.file_size) is not None:
728
- file.size = size
729
- if (mtime := link.file_mtime) is not None:
730
- file.mtime = mtime
731
-
732
- # We need to add whatever extra hashes were computed.
733
- if d := link.hashes_computed:
734
- already_in_db = link.hashes_promised
735
- for k, v in d.items():
736
- if k not in already_in_db:
737
- s.add(mo.Hash.from_digest(v, file=file))
738
-
739
- # We checked the hashes (if any), the file contents are written, and the link
740
- # (if any) has been created. We are therefore ready to set the "file.pending"
741
- # column to NULL, thus marking the dedup file as finalized.
742
- file.pending = None
743
-
744
- to_be_flushed.clear()
745
-
746
- for copy in copies:
747
- with self._ignore_skip(), self._catch_req_exc(copy):
748
- self._delete_file(copy.req.dst)
749
- self._create_actual_link(copy.dedup_file_path, copy.req.dst)
750
-
751
- if links:
752
- # Now we write the file data without holding the database transaction open. The
753
- # "_PendingUpdater" ensures that other threads know that we're working.
754
- with self._PendingUpdater(
755
- pending=pending_key,
756
- sessionmaker_r=self._SessionR,
757
- sessionmaker_w=self._SessionW,
758
- seconds_in_the_future=20,
759
- ) as pu:
760
- for link in links:
761
- with self._ignore_skip(), self._catch_req_exc(link):
762
- if not link.fast_path:
763
- with self._beginw() as s:
764
- _flush_now(s)
765
- self._slow_path_wait_for_dedup_file(link=link, pending=pending_key)
766
-
767
- self._write_dedup_file_contents(link=link)
768
- to_be_flushed.append(link)
769
- pu.update_on_exit = True
976
+ if u := updates.link_updates:
977
+ c.execute(sa.insert(self._tmp_files2.links), u).close()
978
+ for stmt in self._sql_postbatch_update_links:
979
+ c.execute(stmt).close()
770
980
 
771
- with self._beginw() as s:
772
- _flush_now(s)
981
+ if u := updates.file_updates:
982
+ c.execute(sa.insert(self._tmp_files2.files), u).close()
983
+ c.execute(self._sql_postbatch_update_files)
773
984
 
774
- # Delete Pending object along with any DedupFile objects that had errors in them
775
- # using the "ON DELETE CASCADE".
776
- s.delete(pending_key.get_one(s))
985
+ if u := updates.obj_updates:
986
+ c.execute(sa.insert(self._tmp_files2.objs), u).close()
987
+ c.execute(self._sql_postbatch_update_objs).close()
777
988
 
778
- for link in links:
779
- link.req.success = not link.failed
989
+ # Delete the pending object.
990
+ s.delete(pending_key.get_one(s))
991
+ s.flush()
780
992
 
781
- if copies:
782
- for copy in copies:
783
- copy.req.success = not copy.failed
784
- if not copy.req.success:
785
- failed_requests.append(copy.req)
993
+ failed_requests = []
994
+ for link in links:
995
+ ok = link.req.success = not link.failed
996
+ if not ok:
997
+ failed_requests.append(link.req)
786
998
 
787
999
  if failed_requests:
788
1000
  first_exc = failed_requests[0].exc
@@ -794,7 +1006,6 @@ class Dedup(abc.ABC):
  def _write_file_computing_hashes(
  self, target: Path, open1, hashes: ty.Iterable[mh.HashFunction]
  ) -> tuple[int, dict[mh.HashFunction, mh.Digest]]:
- target.parent.mkdir(exist_ok=True, parents=True)
  m = mh.MultiHasher({f: f() for f in hashes})
  with target.open("wb") as f_w, open1() as f_r:
  while block := f_r.read(65536):
@@ -802,114 +1013,126 @@ class Dedup(abc.ABC):
802
1013
  f_w.write(block)
803
1014
  return m.size, m.digest()
804
1015
 
805
- def _write_dedup_file_contents(self, link: _ImplDedupLinkRequest) -> None:
806
- if link.is_new:
807
- if link.req.open_file_once is None:
808
- link.call_file_not_needed()
809
- return
810
-
811
- p = link.dedup_file_path
812
- (fs := set(link.hashes_promised)).update(self.extra_hashes)
813
- link.file_size, d = self._write_file_computing_hashes(p, link.call_open_file_once, fs)
814
- self.apply_metadata_to_file(p, link.req.file_metadata)
815
- link.file_mtime = int(p.stat().st_mtime)
816
- link.hashes_computed = d
817
-
818
- # Check that the hashes match what was claimed inside the link request.
819
- computed = {k: d[k] for k in link.hashes_promised}
820
- if link.hashes_promised != computed:
821
- p.unlink(missing_ok=True)
822
- raise InvalidContentsError(
823
- link_request=link.req,
824
- hashes_expected=link.hashes_promised,
825
- hashes_observed=computed,
826
- )
827
- else:
828
- # existing file - we don't need to do anything
829
- link.call_file_not_needed()
830
-
831
- # TODO: quickly check whether the file mtime matches and check the content hash if not
832
-
833
- self._create_actual_link(link.dedup_file_path, link.req.link_path)
834
-
835
- def _slow_path_wait_for_dedup_file(
836
- self, link: _ImplDedupLinkRequest, pending: IdKey[mo.Pending]
1016
+ def _write_dedup_file_contents(
1017
+ self, link: _ImplDedupLinkRequest, tmp_path: Path, updates: _Updates
837
1018
  ) -> None:
838
- """
839
- The file we are interested in is actively being written to by another thread. We need to
840
- wait for it to be finished or for the other thread to fail.
841
-
842
- Either way, we add the required data to the database such that we can continue with the
843
- fast path procedure after this method returns.
844
- """
1019
+ obj = link.obj
1020
+ target = link.req.link_path
1021
+ skip_link_for_file_id = None
1022
+ adopting = obj.adopted_file_path
1023
+ adopted = False
1024
+
1025
+ def _mkdirp(path):
1026
+ if not path.exists():
1027
+ path.mkdir(exist_ok=True, parents=True)
1028
+
1029
+ # Do we have any completed File IDs at all?
1030
+ if obj.completed_file_ids:
1031
+ link.call_file_not_needed()
1032
+ if adopting is not None:
1033
+ # We don't need the file there.
1034
+ self._delete_file(target)
1035
+ else:
1036
+ # No completed IDs, we need to make one. Try to adopt if possible.
1037
+ tmp_p = tmp_path / "f.bin"
1038
+ tmp_p.unlink(missing_ok=True)
845
1039
 
846
- # Construct query which looks for a DedupFile matching hashes or overlapping tags.
847
- F = sao.aliased(mo.DedupFile)
848
- H = sao.aliased(mo.Hash)
849
- T = sao.aliased(mo.Tag)
850
-
851
- def _exists(Alias):
852
- return sa.exists().select_from(Alias).where(Rel(Alias.file) == F)
853
-
854
- q = sa.select(F)
855
- for v in link.hashes_promised.values():
856
- q = q.where(_exists(H).where(H.compare_digest() == v))
857
- if link.req.tags:
858
- q = q.where(_exists(T).where(T.name.in_(link.req.tags)))
859
- q = q.options(sao.joinedload(F.pending))
860
-
861
- def _check(s: sao.Session) -> mo.DedupFile | bool:
862
- for x in s.execute(q).scalars():
863
- x: mo.DedupFile
864
- if x.pending is None:
865
- # We found a finished DedupFile we can use directly.
866
- return x
867
- elif x.pending_id == pending.key[0]:
868
- # It's already our dedupfile!!!
869
- raise AssertionError("deadlock")
870
- elif x.pending.expire_at >= mo.now():
871
- # We found an in-progress DedupFile, so we stand down and continue polling.
872
- return False
1040
+ if adopting is not None:
1041
+ link.call_file_not_needed()
1042
+ self._adopt_file_and_link(adopting, tmp_p)
1043
+ adopted = True
1044
+ size = tmp_p.stat().st_size
1045
+ apply_metadata = False
1046
+ elif (open1 := link.call_open_file_once) is not None:
1047
+ (fs := set(link.hashes_promised)).update(self.extra_hashes)
1048
+ size, d = self._write_file_computing_hashes(tmp_p, open1, fs)
1049
+ link.hashes_computed = d
1050
+
1051
+ # Check that the hashes match what was claimed inside the link request.
1052
+ computed = {k: d[k] for k in link.hashes_promised}
1053
+ if link.hashes_promised != computed:
1054
+ raise InvalidContentsError(
1055
+ link_request=link.req,
1056
+ hashes_expected=link.hashes_promised,
1057
+ hashes_observed=computed,
1058
+ )
1059
+ apply_metadata = True
1060
+ else:
1061
+ link.set_failed(MissingContentError("content not provided"))
1062
+ return
873
1063
 
874
- # There are no matching DedupFile objects, so we can create a new one ourselves.
875
- return True
1064
+ if apply_metadata:
1065
+ self.apply_metadata_to_file(tmp_p, link.req.file_metadata)
1066
+ self._set_clean_file_mtime(tmp_p)
876
1067
 
877
- def _wait_first_time():
878
- nonlocal _wait
879
- _wait = _wait_normal
1068
+ file_id = obj.pending_file_ids.pop()
1069
+ p = self._make_dedup_file_path(file_id)
1070
+ _mkdirp(p.parent)
1071
+ tmp_p.rename(p)
1072
+ obj.completed_file_ids.append(file_id)
1073
+ updates.file_updates.append({"file_id": file_id, "obj_id": obj.id})
1074
+ updates.obj_updates.append({"obj_id": obj.id, "size": size})
1075
+ # Now the file has the right contents. Let's also make a link now.
880
1076
 
881
- def _wait_normal():
882
- time.sleep(2)
1077
+ if adopting is not None:
1078
+ skip_link_for_file_id = file_id
883
1079
 
884
- _wait = _wait_first_time
1080
+ endgame = False
1081
+ completed = obj.completed_file_ids
885
1082
  while True:
886
- _wait()
1083
+ file_id = completed[-1]
1084
+ p = self._make_dedup_file_path(file_id)
1085
+ if adopted and adopting == target:
1086
+ ok = True
1087
+ else:
1088
+ ok = False
1089
+ try:
1090
+ self._create_actual_link(p, target)
1091
+ ok = True
1092
+ except Exception:
1093
+ pass
887
1094
 
888
- with self._SessionR() as s: # check using a read-only transaction
889
- result = _check(s)
890
- if result is False:
891
- continue
1095
+ if not ok and target.exists():
1096
+ self._delete_file(target)
1097
+ try:
1098
+ self._create_actual_link(p, target)
1099
+ ok = True
1100
+ except Exception:
1101
+ pass
892
1102
 
893
- with self._beginw() as s: # use a write transaction
894
- result = _check(s)
895
- if result is False:
896
- continue
1103
+ link_count = p.stat().st_nlink - 1
1104
+ updates.link_updates.append(
1105
+ {
1106
+ "link_path": link.link_path_str if ok else None,
1107
+ "file_id": file_id,
1108
+ "link_count": link_count,
1109
+ }
1110
+ )
1111
+ if ok:
1112
+ # We're done! Bye!
1113
+ return
897
1114
 
898
- if result is True:
899
- # We need to create a new DedupFile
900
- s.add(file := self._make_dedup_file(link, pending.get_one(s)))
901
- link.is_new = True
902
- else:
903
- file = result
904
- link.is_new = False
1115
+ if len(completed) > 1:
1116
+ completed.pop()
1117
+ continue
905
1118
 
906
- link.fast_path = True
907
- self._prepare_dedup_file_for_linking(s, file, link)
1119
+ if endgame:
1120
+ raise AssertionError
1121
+
1122
+ endgame = True
908
1123
 
909
- # we can only do this after the flush
910
- link.file = IdKey.from_instance(file)
1124
+ # This is our last one file, we must make a copy.
1125
+ tmp_p = tmp_path / "f.bin"
1126
+ tmp_p.unlink(missing_ok=True)
1127
+ shutil.copyfile(str(self._make_dedup_file_path(file_id)), str(tmp_p))
911
1128
 
912
- break
1129
+ file_id = obj.pending_file_ids.pop()
1130
+ p = self._make_dedup_file_path(file_id)
1131
+ _mkdirp(p.parent)
1132
+ tmp_p.rename(p)
1133
+ obj.completed_file_ids[0] = file_id
1134
+ updates.file_updates.append({"file_id": file_id, "obj_id": obj.id})
1135
+ # We made a copy. Hope it works now.
913
1136
 
914
1137
  @property
915
1138
  def _PendingUpdater(self):
@@ -940,8 +1163,24 @@ class Dedup(abc.ABC):
  """
  self._check_links(path, False)
 
+ @cached_property
+ def _sql_checklinks(self):
+ tmp = self._tmp_check_links
+ t_links = sao.aliased(tmp.links)
+ F = sao.aliased(mo.File)
+ L = sao.aliased(mo.Link)
+
+ # Invalidate link_count for affected dedup Files.
+ q = sa.select(L.file_id).join(t_links, L.path == t_links.c.path)
+ r0 = sa.update(F).values(link_count=-1).where(F.id.in_(q))
+
+ # Delete Link records.
+ r1 = sa.delete(L).where(L.path.in_(sa.select(t_links.c.path)))
+
+ return tuple(_ns(x) for x in (r0, r1))
+
  def _check_links(self, path: Path | None, pre_delete: bool) -> None:
- F = sao.aliased(mo.DedupFile)
+ F = sao.aliased(mo.File)
  L = sao.aliased(mo.Link)
 
  _verify_link = self._verify_link
@@ -956,10 +1195,10 @@ class Dedup(abc.ABC):
  # do any checking.
  _verify_link = lambda link: False
 
- q = sa.select(L).order_by(L.link_path).options(sao.joinedload(L.file))
+ q = sa.select(L).order_by(L.path).options(sao.joinedload(L.file))
  q = q.limit(self._batch_size)
  if prefix is not None:
- q = q.where((L.link_path == exact_path) | bytes_startswith(L.link_path, prefix))
+ q = q.where((L.path == exact_path) | bytes_startswith(L.path, prefix))
 
  with self._SessionR() as s:
  last_link_path: str | None = None
@@ -967,7 +1206,7 @@ class Dedup(abc.ABC):
  if last_link_path is None:
  q2 = q
  else:
- q2 = q.where(L.link_path > last_link_path)
+ q2 = q.where(L.path > last_link_path)
 
  results: list[mo.Link] = s.execute(q2).scalars().all()
  if not results:
@@ -976,74 +1215,115 @@ class Dedup(abc.ABC):
976
1215
  to_delete = []
977
1216
  for link in results:
978
1217
  if not _verify_link(link):
979
- to_delete.append(link.link_path)
1218
+ # TODO: Instead of just deleting them from the DB, maybe we should keep
1219
+ # track of invalid links or even repair them?
1220
+ to_delete.append(link.path)
980
1221
 
981
1222
  if to_delete:
982
- with self._beginw() as s2, temporary_table(
983
- s2, mo.tmp_bytes
984
- ) as t_links, temporary_table(s2, mo.tmp_ints) as t_files:
1223
+ with self._beginw() as s2, mo.tmp_check_links(s2, "") as tmp:
1224
+ # 1. Insert Link paths into a temporary table.
985
1225
  s2.connection().execute(
986
- sa.insert(t_links), [{"id": x} for x in to_delete]
1226
+ sa.insert(tmp.links), [{"path": x} for x in to_delete]
987
1227
  ).close()
988
1228
 
989
- # There are the DedupFile entries that may end up orphaned.
990
- s2.connection().execute(
991
- sa.insert(t_files).from_select(
992
- [t_files.c.id],
993
- sa.select(F.id)
994
- .distinct()
995
- .select_from(L)
996
- .join(F, L.file)
997
- .join(t_links, t_links.c.id == L.link_path),
998
- )
999
- ).close()
1229
+ # 2. Invalidate link_count inside parent Files.
1230
+ # 3. Delete Links.
1231
+ # 4. Recompute link_count for affected Files.
1232
+ for stmt in self._sql_checklinks:
1233
+ s2.execute(stmt).close()
1000
1234
 
1001
- # Remove the links that have been deleted.
1002
- s2.connection().execute(
1003
- sa.delete(L).where(L.link_path.in_(sa.select(t_links.c.id))),
1004
- ).close()
1235
+ last_link_path = results[-1].path
1005
1236
 
1006
- # Detect newly-orphaned files.
1007
- s2.connection().execute(
1008
- F.make_update_orphaned().where(F.id.in_(sa.select(t_files.c.id)))
1009
- ).close()
1010
-
1011
- last_link_path = results[-1].link_path
1012
-
1013
- def update_all_orphaned(self):
1237
+ @cached_property
1238
+ def _sql_orph_update(self):
1239
+ now = sa.bindparam("p_now")
1240
+ updated_since = sa.bindparam("p_updated_since")
1241
+ Obj = sao.aliased(mo.Obj)
1242
+ return Obj.make_sql_update_orphaned(now).where(Obj.updated_at >= updated_since)
1243
+
1244
+ def detect_orphaned(self):
1245
+ # Update link count for files where it was invalidated.
1246
+ self.integrity_check(skip_same_mtime=True, only_invalid_link_count=True)
1247
+
1248
+ C = sao.aliased(mo.DedupConfig)
1249
+ KEY = "last_detect_orphaned"
1014
1250
  with self._beginw() as s:
1015
- F = sao.aliased(mo.DedupFile)
1016
- s.connection().execute(F.make_update_orphaned()).close()
1251
+ last_check = s.execute(sa.select(C).where(C.key == KEY)).scalar()
1252
+ if last_check is None:
1253
+ s.add(last_check := mo.DedupConfig(key=KEY, value="0"))
1254
+ since = int(last_check.value)
1255
+ now = mo.now()
1256
+ s.execute(self._sql_orph_update, {"p_now": now, "p_updated_since": since}).close()
1257
+ last_check.value = str(now)
1017
1258
 
1018
- def garbage_collect_dedup_files(self, min_age_seconds: int) -> None:
1259
+ @cached_property
1260
+ def _sql_gc_orphaned_to_pending(self):
1261
+ O = sao.aliased(mo.Obj)
1262
+ F = sao.aliased(mo.File)
1263
+ F2 = sao.aliased(mo.File)
1264
+ q = sa.select(F.id).join(O, F.obj).where(O.orphaned_at < sa.bindparam("p_cutoff"))
1265
+ return _ns(
1266
+ sa.update(F2).values(pending_id=sa.bindparam("p_pending_id")).where(F2.id.in_(q))
1267
+ )
1268
+
1269
+ @cached_property
1270
+ def _sql_gc_select_pending(self):
1271
+ F = sao.aliased(mo.File)
1272
+ P = sao.aliased(mo.Pending)
1273
+ L = sao.aliased(mo.Link)
1274
+ cond = P.expire_at < sa.bindparam("p_cutoff")
1275
+ cond &= P.id != _lit(str(self._corrupted_pending_id))
1276
+ q0 = sa.select(P.id).where(cond)
1277
+ q1 = sa.select(F.id).join(P).where(cond)
1278
+ q2 = sa.select(L.path).where(L.file_id.in_(q1))
1279
+ return q0, q1, q2
1280
+
1281
+ def garbage_collect_dedup_files(
1282
+ self, min_age_orphan_seconds: int, min_age_pending_seconds: int = None
1283
+ ) -> None:
1019
1284
  """
1020
1285
  Remove dedup files that have no links to them as well as dedup files that were left behind
1021
1286
  by a failed batch of content insertion.
1022
1287
  """
1023
- cutoff = mo.now() - min_age_seconds
1024
- pending_cutoff = 7200
1025
- F = sao.aliased(mo.DedupFile)
1026
- P = sao.aliased(mo.Pending)
1027
- q = sa.select(F).options(sao.selectinload(F.links)).limit(self._batch_size).order_by(F.id)
1028
- q1 = q.where(F.orphaned_at != None, F.orphaned_at <= cutoff)
1029
- q2 = q.join(P, F.pending).where(P.expire_at <= pending_cutoff)
1030
- self._garbage_collect_using_query(q1, F)
1031
- self._garbage_collect_using_query(q2, F)
1032
-
1033
- def _garbage_collect_using_query(self, q, F):
1034
- F1 = sao.aliased(mo.DedupFile)
1035
- while True:
1288
+
1289
+ self.detect_orphaned()
1290
+ now = mo.now()
1291
+ orphan_cutoff = now - min_age_orphan_seconds
1292
+ pending_cutoff = now - (min_age_pending_seconds or 7200)
1293
+
1294
+ self._garbage_collect_dedup_files(orphan_cutoff, pending_cutoff)
1295
+
1296
+ def _garbage_collect_dedup_files(self, orphan_cutoff: int | None, pending_cutoff: int):
1297
+ if orphan_cutoff is not None:
1036
1298
  with self._beginw() as s:
1037
- files: list[mo.DedupFile] = s.scalars(q).all()
1038
- if not files:
1039
- break
1040
- s.expunge_all() # remove DedupFile objects from session
1041
- s.connection().execute(sa.delete(F1).where(F1.id.in_(q.with_only_columns(F.id))))
1299
+ s.add(pending := mo.Pending(expire_at=1)) # expiration time far into the past
1300
+ s.flush()
1301
+
1302
+ # Convert orphaned files to pending. We will collect the pending afterwards.
1303
+ params = {"p_pending_id": pending.id, "p_cutoff": orphan_cutoff}
1304
+ s.execute(self._sql_gc_orphaned_to_pending, params).close()
1305
+
1306
+ with self._SessionR.begin() as s:
1307
+ # Iterate through the expired pending File IDs and delete the files and links. Gather the list
1308
+ # of pending objects that are finished.
1309
+ params = {"p_cutoff": pending_cutoff}
1310
+ q0, q1, q2 = self._sql_gc_select_pending
1311
+ pending_ids = s.execute(q0, params).scalars().all()
1312
+
1313
+ for link_path in s.execute(q2, params).scalars():
1314
+ p = self._link_path_from_string(link_path)
1315
+ if p.exists():
1316
+ self._delete_file(p)
1317
+
1318
+ for file_id in s.execute(q1, params).scalars():
1319
+ p = self._make_dedup_file_path(file_id)
1320
+ if p.exists():
1321
+ self._delete_file(p)
1042
1322
 
1043
- for file in files:
1044
- for link in file.links:
1045
- self._delete_file(link._link_path_from_string(link.link_path))
1046
- self._delete_file(self._make_dedup_file_path(file.id))
1323
+ # We only update the database after successfully deleting all the files and links.
1324
+ P = sao.aliased(mo.Pending)
1325
+ with self._beginw() as s:
1326
+ s.execute(sa.delete(P).where(P.id.in_(pending_ids))).close()
1047
1327
 
1048
1328
  def garbage_collect_deleted(self):
1049
1329
  """
@@ -1102,70 +1382,73 @@ class Dedup(abc.ABC):
1102
1382
 
1103
1383
  This recursively lists every file in the dedup store, so it takes a long time.
1104
1384
  """
1105
- F = sao.aliased(mo.DedupFile)
1385
+ F = sao.aliased(mo.File)
1106
1386
  i2p = self._integer_to_path
1107
1387
  cutoff = mo.now() - 3600
1108
1388
 
1389
+ t_f = sao.aliased(self._tmp_delete_extra.files)
1390
+ q = sa.select(t_f.c.id).where(~sa.exists().select_from(F).where(F.id == t_f.c.id))
1391
+
1109
1392
  base = self._path_dedup
1110
- for chunk in chunked_iter(base.rglob("*"), self._batch_size):
1111
- to_be_unlinked = []
1112
- file_ids = {}
1113
- for p in chunk:
1114
- if not p.is_file():
1115
- continue
1393
+ with self._SessionR.begin() as s, mo.tmp_delete_extra(s, "") as tmp:
1394
+ for chunk in chunked_iter(base.rglob("*"), self._batch_size):
1395
+ file_ids = {}
1396
+ for p in chunk:
1397
+ if not p.is_file():
1398
+ continue
1116
1399
 
1117
- try:
1118
- file_id = i2p.invert("/".join(p.relative_to(base).parts))
1119
- except InvalidPathError:
1120
- if p.stat().st_mtime < cutoff:
1121
- to_be_unlinked.append(p)
1122
- continue
1400
+ try:
1401
+ file_id = i2p.invert("/".join(p.relative_to(base).parts))
1402
+ except InvalidPathError:
1403
+ if p.stat().st_mtime < cutoff:
1404
+ self._delete_file(p)
1405
+ continue
1123
1406
 
1124
- file_ids[file_id] = p
1407
+ file_ids[file_id] = p
1408
+
1409
+ if file_ids:
1410
+ s.execute(sa.insert(tmp.files), tuple({"id": x} for x in file_ids)).close()
1411
+ bad_file_ids = s.execute(q).scalars().all()
1412
+ s.execute(sa.delete(tmp.files)).close()
1125
1413
 
1126
- if file_ids:
1127
- # We use a write transaction to avoid a race condition between checking that a path
1128
- # does not contain a valid file ID and then later deleting that file outside the
1129
- # transaction.
1130
- with self._SessionW() as s, temporary_table(s, mo.tmp_ints) as tmp:
1131
- s.execute(sa.insert(tmp), [{"id": x} for x in file_ids]).close()
1132
- tmp_ = sa.alias(tmp)
1133
- bad_file_ids = (
1134
- s.execute(
1135
- sa.select(tmp_.c.id).where(
1136
- ~sa.exists().select_from(F).where(F.id == tmp_.c.id)
1137
- )
1138
- )
1139
- .scalars()
1140
- .all()
1141
- )
1142
1414
  for file_id in bad_file_ids:
1143
1415
  self._delete_file(file_ids[file_id])
1144
1416
 
1145
- for p in to_be_unlinked:
1146
- self._delete_file(p)
1147
-
1148
1417
  def corrupted_list(self) -> ty.Generator[Corrupted]:
1149
1418
  """
1150
1419
  Get the list of corrupted files found using :meth:`integrity_check`.
1151
1420
  """
1152
- for p in self.path_corrupted.glob("*.json"):
1153
- d = json.loads(p.read_bytes())
1154
- yield Corrupted(
1155
- path=bin_path if (bin_path := p.with_suffix(".bin")).exists() else None,
1156
- file_id=d["file_id"],
1157
- exception=d["exception"],
1158
- link_paths=frozenset(d["link_paths"]),
1159
- raw_link_paths=frozenset(d["raw_link_paths"]),
1160
- )
1421
+
1422
+ L = sao.aliased(mo.Link)
1423
+
1424
+ with self._SessionR() as s:
1425
+ for fc in s.execute(sa.select(mo.FileCorruption)).scalars():
1426
+ fc.id
1427
+ links_bytes = s.execute(sa.select(L.path).where(L.file_id == fc.id)).scalars().all()
1428
+ links_paths = tuple((self._link_path_from_string(x)) for x in links_bytes)
1429
+ yield Corrupted(
1430
+ path=self._make_dedup_file_path(fc.id),
1431
+ file_id=fc.id,
1432
+ exception_name=fc.exception_name,
1433
+ exception_string=fc.exception_string,
1434
+ link_paths=links_paths,
1435
+ raw_link_paths=tuple(links_bytes),
1436
+ )
1161
1437
 
1162
1438
  def corrupted_clear(self):
1163
1439
  """
1164
1440
  Delete all corrupted files.
1165
1441
  """
1166
- for glob in ["*.bin", "*.json"]:
1167
- for p in self.path_corrupted.glob(glob):
1168
- self._delete_file(p)
1442
+ F = sao.aliased(mo.File)
1443
+ with self._beginw() as s:
1444
+ s.add(p := mo.Pending(expire_at=1))
1445
+ s.flush()
1446
+ s.execute(
1447
+ sa.update(F)
1448
+ .values(pending_id=p.id)
1449
+ .where(F.pending_id == self._corrupted_pending_id)
1450
+ ).close()
1451
+ self._garbage_collect_dedup_files(orphan_cutoff=None, pending_cutoff=2)
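
corrupted_clear no longer unlinks anything itself: it parks the corrupted rows on a freshly created, already-expired Pending row and lets the normal garbage-collection pass delete rows and files together. A toy model of that "mark now, sweep later" idea, with invented names and no filesystem side effects:

import time

class QuarantineSweeper:
    """Toy model only; the real code stores expiry on Pending rows in SQL."""

    def __init__(self) -> None:
        self.expire_at: dict[int, float | None] = {}  # file id -> expiry, None = keep

    def quarantine(self, ids: list[int]) -> None:
        now = time.time()
        for file_id in ids:
            self.expire_at[file_id] = now  # already expired: the next sweep removes it

    def sweep(self) -> list[int]:
        now = time.time()
        doomed = [i for i, exp in self.expire_at.items() if exp is not None and exp <= now]
        for file_id in doomed:
            del self.expire_at[file_id]  # the real code also unlinks the stored file
        return doomed
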
1169
1452
 
1170
1453
  @staticmethod
1171
1454
  def _copy_tree_default_fallback(src: Path, dst: Path):
@@ -1203,7 +1486,7 @@ class Dedup(abc.ABC):
1203
1486
  if to_copy:
1204
1487
  _run()
1205
1488
 
1206
- def delete_tree(self, p: Path) -> None:
1489
+ def delete_tree(self, p: Path, check_links: bool = True) -> None:
1207
1490
  def f(func, path, exc_info):
1208
1491
  if (p := Path(path)).exists():
1209
1492
  self._move_to_deleted(p)
@@ -1211,7 +1494,8 @@ class Dedup(abc.ABC):
1211
1494
  shutil.rmtree(str(p.absolute()), onerror=f)
1212
1495
  if p.exists():
1213
1496
  self._move_to_deleted(p)
1214
- self.check_links(p)
1497
+ if check_links:
1498
+ self.check_links(p)
1215
1499
 
1216
1500
  def delete_file(self, p: Path) -> None:
1217
1501
  self._delete_file(p)
@@ -1247,20 +1531,24 @@ class Dedup(abc.ABC):
1247
1531
  def _filelock(self, path: Path, blocking: bool):
1248
1532
  return filelock.FileLock(path, blocking=blocking)
1249
1533
 
1250
- @property
1534
+ @cached_property
1535
+ def _path_temporary_simple_dir(self):
1536
+ return self.path_temporary / "simple"
1537
+
1538
+ @cached_property
1251
1539
  def _path_temporary_dirs(self):
1252
1540
  return self.path_temporary / "dirs"
1253
1541
 
1254
- @property
1542
+ @cached_property
1255
1543
  def _path_temporary_lock(self):
1256
1544
  return self.path_temporary / "lock"
1257
1545
 
1258
- @property
1546
+ @cached_property
1259
1547
  def _path_temporary_master_lock(self):
1260
1548
  return self.path_temporary / "master.lock"
1261
1549
 
1262
1550
  @contextlib.contextmanager
1263
- def temporary_directory(self, prefix="tmp_", suffix=""):
1551
+ def temporary_directory(self, prefix="tmp_", suffix="", check_links: bool = True):
1264
1552
  exc = None
1265
1553
  for name in random_names(prefix=prefix, suffix=suffix):
1266
1554
  p = self._path_temporary_dirs / name
@@ -1287,7 +1575,7 @@ class Dedup(abc.ABC):
1287
1575
  yield p
1288
1576
  break
1289
1577
  finally:
1290
- self.delete_tree(p)
1578
+ self.delete_tree(p, check_links=check_links)
1291
1579
 
1292
1580
  # Release the lock file. We will attempt to delete it next.
1293
1581
  ex.close()
@@ -1304,18 +1592,62 @@ class Dedup(abc.ABC):
1304
1592
  else:
1305
1593
  raise AssertionError("retry count exceeded, unknown cause") if exc is None else exc
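
temporary_directory keeps drawing random names until one can be created and gives up with the AssertionError above once the retry budget is spent. A stripped-down sketch of just that loop (the lock-file handling and the deferred delete_tree cleanup are intentionally left out; make_unique_dir and the attempt count are assumptions for illustration):

import secrets
from pathlib import Path

def make_unique_dir(parent: Path, prefix: str = "tmp_", suffix: str = "",
                    attempts: int = 100) -> Path:
    # Keep drawing random names until mkdir succeeds; FileExistsError only
    # means a collision with a concurrent caller, so try a fresh name.
    parent.mkdir(parents=True, exist_ok=True)
    for _ in range(attempts):
        p = parent / f"{prefix}{secrets.token_hex(8)}{suffix}"
        try:
            p.mkdir()
        except FileExistsError:
            continue
        return p
    raise AssertionError("retry count exceeded")
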
1306
1594
 
1595
+ @cached_property
1596
+ def _sql_obh_select_file(self):
1597
+ F = sao.aliased(mo.File)
1598
+ O = sao.aliased(mo.Obj)
1599
+ H = sao.aliased(mo.Hash)
1600
+ b = sa.bindparam
1601
+ q = (
1602
+ sa.select(
1603
+ F.id.label("file_id"),
1604
+ sa.case((O.orphaned_at != None, O.id), else_=None).label("obj_id"),
1605
+ )
1606
+ .join(O, F.obj)
1607
+ .join(H, O.hashes)
1608
+ .where(F.pending_id == None, H.hash_function == b("p_hf"), H.hash == b("p_h"))
1609
+ .limit(_lit("1"))
1610
+ )
1611
+ return q
1612
+
1613
+ @cached_property
1614
+ def _sql_obh_update_obj(self):
1615
+ O = sao.aliased(mo.Obj)
1616
+ b = sa.bindparam
1617
+ return (
1618
+ sa.update(O)
1619
+ .values(orphaned_at=sa.case((O.orphaned_at != None, b("p_now")), else_=None))
1620
+ .where(O.id == b("p_obj_id"))
1621
+ )
1622
+
1623
+ def open_by_hash(self, digest: mh.Digest) -> ty.BinaryIO | None:
1624
+ d = {"p_hf": digest.function.function_code, "p_h": digest.digest}
1625
+ with self._beginw() as s:
1626
+ c = s.connection()
1627
+ if (r := c.execute(self._sql_obh_select_file, d).one_or_none()) is None:
1628
+ return None
1629
+ file_id, obj_id = r
1630
+
1631
+ if obj_id is not None:
1632
+ d = {"p_now": mo.now(), "p_obj_id": obj_id}
1633
+ c.execute(self._sql_obh_update_obj, d).close()
1634
+
1635
+ return self._make_dedup_file_path(file_id).open("rb")
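
open_by_hash is essentially a content-addressed lookup: resolve a digest to a stored file, clear any orphaned marker so the collector keeps it alive, and hand back a binary handle or None. The sketch below flattens that to a digest-named file on disk purely to show the lookup contract (None for an unknown digest, an open handle otherwise); the orphan bookkeeping and the integer file IDs of the real store are deliberately left out:

import hashlib
from pathlib import Path
from typing import BinaryIO

class ContentStore:
    """Digest-addressed toy store; vocker keys stored files by integer id instead."""

    def __init__(self, root: Path) -> None:
        self.root = root
        self.root.mkdir(parents=True, exist_ok=True)

    def add(self, data: bytes) -> bytes:
        digest = hashlib.sha256(data).digest()
        target = self.root / digest.hex()
        if not target.exists():
            tmp = target.with_suffix(".tmp")
            tmp.write_bytes(data)
            tmp.replace(target)  # publish atomically
        return digest

    def open_by_hash(self, digest: bytes) -> BinaryIO | None:
        target = self.root / digest.hex()
        return target.open("rb") if target.exists() else None
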
1636
+
1307
1637
  @cached_property
1308
1638
  def _q_get_hash(self):
1309
1639
  L = sao.aliased(mo.Link)
1310
- F = sao.aliased(mo.DedupFile)
1640
+ F = sao.aliased(mo.File)
1641
+ O = sao.aliased(mo.Obj)
1311
1642
  H = sao.aliased(mo.Hash)
1312
1643
  return (
1313
- sa.select(L, H, F.size)
1644
+ sa.select(L, H, O.size)
1314
1645
  .select_from(L)
1315
1646
  .join(F, L.file)
1316
- .outerjoin(H, (Rel(H.file) == F) & (H.hash_function == sa.bindparam("x_hf")))
1647
+ .join(O, F.obj)
1648
+ .outerjoin(H, (Rel(H.obj) == O) & (H.hash_function == sa.bindparam("x_hf")))
1317
1649
  .options(sao.contains_eager(L.file.of_type(F)))
1318
- .where(L.link_path == sa.bindparam("x_link_path"), F.pending == None)
1650
+ .where(L.path == sa.bindparam("x_link_path"), F.pending == None)
1319
1651
  )
1320
1652
 
1321
1653
  def _query_by_link_path(
@@ -1357,19 +1689,24 @@ class Dedup(abc.ABC):
1357
1689
  ) -> tuple[int, mh.Digest] | None:
1358
1690
  r = self.get_file_hash(hash_function, path, **kw)
1359
1691
  if r is None:
1360
- hasher = hash_function()
1361
- size = 0
1362
1692
  with path.open("rb") as f:
1363
- while block := f.read(65536):
1364
- size += len(block)
1365
- hasher.update(block)
1366
- r = size, hasher.digest()
1693
+ r = self._compute_file_hash(hash_function, f)
1367
1694
  return r
1368
1695
 
1696
+ def _compute_file_hash(self, hash_function, file):
1697
+ size = 0
1698
+ hasher = hash_function()
1699
+ while block := file.read(65536):
1700
+ size += len(block)
1701
+ hasher.update(block)
1702
+ return size, hasher.digest()
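
_compute_file_hash is the standard streaming-hash loop. The same thing written against hashlib, assuming only an update()/digest() interface like hashlib's:

import hashlib
import typing as ty

def compute_file_hash(file: ty.BinaryIO, algorithm: str = "sha256") -> tuple[int, bytes]:
    """Return (size, digest) without loading the whole file into memory."""
    hasher = hashlib.new(algorithm)
    size = 0
    while block := file.read(65536):  # same 64 KiB block size as above
        size += len(block)
        hasher.update(block)
    return size, hasher.digest()

# Usage:
# with open("big.bin", "rb") as f:
#     size, digest = compute_file_hash(f)
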
1703
+
1369
1704
  def adopt_files(
1370
1705
  self, hash_function: mh.HashFunction, requests: ty.Iterable[AdoptRequest]
1371
1706
  ) -> None:
1372
1707
  """
1708
+ HACK: DO NOT RUN THIS ON EXISTING DEDUP LINKS
1709
+
1373
1710
  Adopt each file given in *paths*. If the path is already a dedup link, then leave it
1374
1711
  alone. If the path is not a dedup link, then compute its hash and move the file to the
1375
1712
  dedup store and create a link to it. If the path is already a dedup link but does not
@@ -1378,125 +1715,25 @@ class Dedup(abc.ABC):
1378
1715
 
1379
1716
  This method is implemented in a somewhat inefficient way.
1380
1717
  """
1381
- reqs = [_ImplAdoptRequest(req) for req in requests]
1382
-
1383
- # first use a read-only session while we compute file hashes
1384
- with self._SessionR() as s:
1385
- for x in reqs:
1386
- x.link_path = self._link_path_to_string(x.req.path)
1387
- existing = self._query_by_link_path(s, x.link_path, hash_function)
1388
- if existing:
1389
- l, h, sz = existing[0]
1390
- if h is not None:
1391
- x.req.out_digest = h.to_digest()
1392
- x.req.out_size = sz
1393
- x.done = True
1394
-
1395
- if not x.done:
1396
- with open(x.req.path, "rb") as f:
1397
- h = hash_function()
1398
- size = 0
1399
- while block := f.read(65536):
1400
- h.update(block)
1401
- size += len(block)
1402
- x.req.out_digest = h.digest()
1403
- x.file_metadata = DedupFileMetadata(executable=False) # TODO
1404
- x.req.out_size = size
1405
- x.file_metadata_bytes = self.convert_file_metadata_to_bytes(x.file_metadata)
1406
-
1407
- F = sao.aliased(mo.DedupFile)
1408
- H = sao.aliased(mo.Hash)
1409
- q = (
1410
- sa.select(F)
1411
- .join(H, F.hashes)
1412
- .where(
1413
- H.hash_function == sa.bindparam("x_hf"),
1414
- H.hash == sa.bindparam("x_h"),
1415
- F.pending == None,
1416
- F.file_metadata == sa.bindparam("x_f_meta"),
1718
+ self.run_batch(
1719
+ DedupLinkRequest(
1720
+ hash_function=hash_function,
1721
+ link_path=req.path,
1722
+ tags=req.tags,
1723
+ file_metadata=self.get_metadata_from_file(req.path),
1724
+ open_file_once=None,
1725
+ adopt_existing=True,
1726
+ file_contents_hash=None,
1417
1727
  )
1728
+ for req in requests
1418
1729
  )
1419
1730
 
1420
- # then we use a RW session to update the database
1421
- with self._beginw() as s:
1422
- for x in reqs:
1423
- if x.done:
1424
- continue
1425
-
1426
- # re-check for an existing link
1427
- existing = self._query_by_link_path(s, x.link_path, hash_function)
1428
- if existing:
1429
- l, h, sz = existing[0]
1430
- file = l.file
1431
- if h is None:
1432
- s.add(mo.Hash.from_digest(x.req.out_digest, file=file))
1433
- else:
1434
- # never mind, nothing to do here
1435
- x.req.out_size = sz
1436
- x.req.out_digest = h.to_digest()
1437
- x.done = True
1438
- continue
1439
- else:
1440
- # try to lookup by digest first
1441
- # TODO: also look up by tag
1442
- files = (
1443
- s.execute(
1444
- q,
1445
- dict(
1446
- x_hf=hash_function.function_code,
1447
- x_h=x.req.out_digest.digest,
1448
- x_f_meta=x.file_metadata_bytes,
1449
- ),
1450
- )
1451
- .scalars()
1452
- .all()
1453
- )
1454
- if files:
1455
- file = files[0]
1456
- else:
1457
- file = None
1458
- if file is not None:
1459
- file.orphaned_at = None
1460
- x.delete = True
1461
- else:
1462
- # no existing file found, need to create one
1463
- file = mo.DedupFile(
1464
- file_metadata=x.file_metadata_bytes,
1465
- size=x.req.out_size,
1466
- mtime=int(x.req.path.stat().st_mtime),
1467
- orphaned_at=None,
1468
- pending=None,
1469
- hashes=[mo.Hash.from_digest(x.req.out_digest)],
1470
- )
1471
- s.add(file)
1472
- s.flush() # we need to make sure the file has an ID
1473
-
1474
- s.add(mo.Link(link_path=x.link_path, file=file))
1475
-
1476
- x.dedup_file_path = self._make_dedup_file_path(file.id)
1477
-
1478
- # We add our tags.
1479
- self._add_tags_to_file(s, file, x.req.tags)
1480
-
1481
- s.flush()
1482
-
1483
- # and finally we make filesystem changes
1484
- for x in reqs:
1485
- if (dst := x.dedup_file_path) is not None:
1486
- if x.delete:
1487
- # We already have a DedupFile with the required contents, so we replace the
1488
- # link_path file with a link to that existing DedupFile.
1489
- self._delete_file(x.req.path)
1490
- self._create_actual_link(dst, x.req.path)
1491
- else:
1492
- dst.parent.mkdir(exist_ok=True, parents=True)
1493
- self._adopt_file_and_link(x.req.path, dst)
1494
-
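
With run_batch now handling the database side, the filesystem half of adoption amounts to: hash the file, move it into the store under its content address, and leave a hardlink at the original path. A self-contained sketch of just that half; adopt_into, store_root, and the digest-named layout are illustrative only, since the real store uses integer file IDs and records size, metadata, and hashes in the database:

import hashlib
import os
from pathlib import Path

def adopt_into(store_root: Path, path: Path) -> bytes:
    # Stream the hash so large files never have to fit in memory.
    hasher = hashlib.sha256()
    with path.open("rb") as f:
        while block := f.read(65536):
            hasher.update(block)
    digest = hasher.digest()

    store_root.mkdir(parents=True, exist_ok=True)
    stored = store_root / digest.hex()
    if stored.exists():
        # Contents already present: drop the duplicate and link to the stored copy.
        path.unlink()
    else:
        # First copy of these contents: move it into the store...
        os.replace(path, stored)
    # ...and leave a hardlink behind at the original location either way.
    os.link(stored, path)
    return digest
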
1495
1731
  def integrity_check(
1496
1732
  self,
1497
1733
  skip_same_mtime: bool,
1498
1734
  threads: int | None = None,
1499
- keep_corrupted: bool = True,
1735
+ *,
1736
+ only_invalid_link_count: bool = False,
1500
1737
  ):
1501
1738
  """
1502
1739
  Verify all deduplicated files match their stored hashes. Use modification time to skip
@@ -1504,17 +1741,28 @@ class Dedup(abc.ABC):
1504
1741
  :attr:`path_corrupted`.
1505
1742
  """
1506
1743
 
1507
- F = sao.aliased(mo.DedupFile)
1744
+ F = sao.aliased(mo.File)
1745
+ O = sao.aliased(mo.Obj)
1508
1746
  batch_size = 1000
1509
- q = sa.select(F).options(sao.selectinload(F.hashes)).order_by(F.id).limit(batch_size)
1747
+ q = sa.select(F).options(sao.selectinload(F.obj.of_type(O)).selectinload(O.hashes))
1748
+ if only_invalid_link_count:
1749
+ q = q.where(F.link_count < _lit("0"))
1750
+ q = q.where(F.pending_id == None).order_by(F.id).limit(batch_size)
1510
1751
 
1511
- def _hash_check(file: mo.DedupFile) -> None:
1752
+ def _hash_check(file: mo.File) -> None:
1512
1753
  p = self._make_dedup_file_path(file.id)
1513
- st_mtime = int(p.stat().st_mtime)
1514
- if skip_same_mtime and file.mtime == st_mtime:
1515
- return
1754
+ st = p.stat()
1755
+
1756
+ # FIXME: specific to hardlink backend
1757
+ if (n := st.st_nlink - 1) != file.link_count:
1758
+ link_count_updates.append({"id": file.id, "link_count": n})
1759
+ changed_obj_ids.add(file.obj_id)
1760
+
1761
+ if skip_same_mtime:
1762
+ if (st_mtime := int(st.st_mtime)) == self._clean_dedup_mtime:
1763
+ return
1516
1764
 
1517
- d = file.hashes_dict
1765
+ d = file.obj.hashes_dict
1518
1766
  m = mh.MultiHasher({hf: hf() for hf in d})
1519
1767
  with p.open("rb") as fh:
1520
1768
  while block := fh.read(65536):
@@ -1524,22 +1772,21 @@ class Dedup(abc.ABC):
1524
1772
 
1525
1773
  # TODO: also check file metadata matches, such as the executable bit
1526
1774
 
1527
- # The digest was the same, so update the mtime in the DB.
1528
- with self._SessionW() as s:
1529
- IdKey.from_instance(file).get_one(s).mtime = st_mtime
1530
-
1531
- id_min = -1
1775
+ id_min = None
1532
1776
  with cf.ThreadPoolExecutor(max_workers=threads) as exe:
1533
1777
  while True:
1778
+ changed_obj_ids = set()
1779
+ link_count_updates = []
1534
1780
  invalid_file_ids = []
1535
1781
 
1536
1782
  with self._SessionR() as s:
1537
- q2 = q.where(F.id > id_min, F.pending == None)
1538
- dedup_files: list[mo.DedupFile] = s.execute(q2).scalars().all()
1783
+ q2 = q if id_min is None else q.where(F.id > id_min)
1784
+ dedup_files: list[mo.File] = s.execute(q2).scalars().all()
1539
1785
 
1540
1786
  if not dedup_files:
1541
1787
  break
1542
1788
 
1789
+ s.expunge_all()
1543
1790
  id_min = dedup_files[-1].id
1544
1791
  futures = {exe.submit(_hash_check, f): f for f in dedup_files}
1545
1792
  for future in cf.as_completed(futures):
@@ -1549,53 +1796,37 @@ class Dedup(abc.ABC):
1549
1796
  raise exc
1550
1797
 
1551
1798
  file = futures[future]
1552
- self._integrity_check_process_corrupt_one(s, file, exc, keep_corrupted)
1553
- invalid_file_ids.append(file.id)
1799
+ invalid_file_ids.append((file.id, exc))
1554
1800
 
1555
- if invalid_file_ids:
1556
- with self._SessionW() as s:
1801
+ if link_count_updates:
1802
+ with self._beginw() as s:
1803
+ s.execute(sa.update(F), link_count_updates).close()
1804
+ now = mo.now()
1557
1805
  s.connection().execute(
1558
- sa.delete(F).where(F.id == sa.bindparam("_id")),
1559
- [{"_id": x} for x in invalid_file_ids],
1560
- )
1561
-
1562
- def _integrity_check_process_corrupt_one(
1563
- self, s: sao.Session, file: mo.DedupFile, exc: Exception, keep_corrupted: bool
1564
- ):
1565
- """
1566
- Process one file that has been found to be corrupted.
1567
- """
1568
-
1569
- path_file = self._make_dedup_file_path(file.id)
1570
-
1571
- # Load the links as we will need them
1572
- s.refresh(file, ["links"])
1573
-
1574
- link_paths = [self._link_path_from_string(link.link_path) for link in file.links]
1575
- json_data = {
1576
- "file_id": file.id,
1577
- "link_paths": [str(x) for x in link_paths],
1578
- "raw_link_paths": [
1579
- link.link_path.decode("utf-8", errors="replace") for link in file.links
1580
- ],
1581
- "exception": repr(exc),
1582
- }
1583
-
1584
- with create_file_random(self.path_corrupted, "f_", ".json") as f:
1585
- path_json = Path(f.name)
1586
- f.write(json.dumps(json_data, indent=2, sort_keys=True).encode("utf-8"))
1587
-
1588
- if keep_corrupted:
1589
- try:
1590
- path_file.rename(path_json.with_suffix(".bin"))
1591
- except Exception:
1592
- if path_file.exists():
1593
- logger.warning(
1594
- "failed to rename corrupt file", exc_info=True, data=str(path_file)
1595
- )
1806
+ sa.update(O)
1807
+ .where(O.id == sa.bindparam("p_id"))
1808
+ .values(updated_at=sa.bindparam("p_now")),
1809
+ [{"p_id": x, "p_now": now} for x in changed_obj_ids],
1810
+ ).close()
1596
1811
 
1597
- for x in link_paths:
1598
- self._delete_file(x)
1812
+ if invalid_file_ids:
1813
+ with self._beginw() as s:
1814
+ s.execute(
1815
+ sa.insert(mo.FileCorruption),
1816
+ [
1817
+ {
1818
+ "id": file_id,
1819
+ "exception_name": type(exc).__name__,
1820
+ "exception_string": str(exc),
1821
+ }
1822
+ for file_id, exc in invalid_file_ids
1823
+ ],
1824
+ ).close()
1825
+ s.connection().execute(
1826
+ sa.update(F)
1827
+ .values(pending_id=self._corrupted_pending_id)
1828
+ .where(F.id.in_(x[0] for x in invalid_file_ids))
1829
+ ).close()
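
For the hardlink backend, every recorded dedup link is one extra st_nlink on the stored file, so st_nlink - 1 is what link_count should be; the loop above queues an update whenever the two disagree and records hash mismatches as FileCorruption rows instead of moving files aside. The link-count comparison in isolation, with expected_links standing in for whatever the caller's bookkeeping says:

import os
from pathlib import Path

def check_link_count(stored_file: Path, expected_links: int) -> int | None:
    """Return the on-disk link count if it disagrees with the expectation, else None."""
    st = os.stat(stored_file)
    # One hardlink is the store's own copy; the rest are user-visible dedup links.
    actual = st.st_nlink - 1
    return actual if actual != expected_links else None

# Usage sketch:
# if (n := check_link_count(Path("store/000/123"), expected_links=3)) is not None:
#     print("link_count drifted, recording", n)
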
1599
1830
 
1600
1831
  class _compute_stats_ZeroRow:
1601
1832
  orphaned = None
@@ -1603,18 +1834,22 @@ class Dedup(abc.ABC):
1603
1834
  size = 0
1604
1835
 
1605
1836
  def compute_stats(self) -> DedupStats:
1837
+ self.detect_orphaned()
1838
+
1606
1839
  with self._SessionR() as s:
1607
- F = sao.aliased(mo.DedupFile)
1840
+ O = sao.aliased(mo.Obj)
1841
+ F = sao.aliased(mo.File)
1608
1842
  L = sao.aliased(mo.Link)
1609
- orph = F.orphaned_at != None
1843
+ orph = O.orphaned_at != None
1610
1844
 
1611
1845
  q = (
1612
1846
  sa.select(
1613
1847
  orph.label("orphaned"),
1614
1848
  sa.func.count().label("count"),
1615
- sa.func.sum(F.size).label("size"),
1849
+ sa.func.sum(O.size).label("size"),
1616
1850
  )
1617
1851
  .select_from(F)
1852
+ .join(O, F.obj)
1618
1853
  .where(F.pending == None)
1619
1854
  .group_by(orph)
1620
1855
  )
@@ -1622,9 +1857,10 @@ class Dedup(abc.ABC):
1622
1857
  file_stats |= {row.orphaned: row for row in s.execute(q).all()}
1623
1858
 
1624
1859
  q = (
1625
- sa.select(sa.func.count().label("count"), sa.func.sum(F.size).label("size"))
1860
+ sa.select(sa.func.count().label("count"), sa.func.sum(O.size).label("size"))
1626
1861
  .select_from(L)
1627
1862
  .join(F, L.file)
1863
+ .join(O, F.obj)
1628
1864
  ).where(F.pending == None)
1629
1865
  link_stats = s.execute(q).one()
1630
1866
 
@@ -1639,6 +1875,8 @@ class Dedup(abc.ABC):
1639
1875
 
1640
1876
 
1641
1877
  class DedupBackendHardlink(Dedup):
1878
+ max_link_count = 1000 # Windows limits it to 1023
1879
+
1642
1880
  def _create_actual_link(self, existing: Path, new: Path):
1643
1881
  # Path.link_to was removed and replaced by Path.hardlink_to, but I want this to work across
1644
1882
  # Python 3.9 to 3.13
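
The comment above refers to the pathlib API churn: Path.link_to (argument order reversed, removed in Python 3.12) versus Path.hardlink_to (added in 3.10). One way such a shim can be written, shown only as a sketch of that concern; os.link sidesteps the rename entirely:

import os
from pathlib import Path

def make_hardlink(existing: Path, new: Path) -> None:
    # os.link(src, dst) exists on every Python version in the 3.9-3.13 range.
    os.link(existing, new)

# Or, preferring the pathlib API when it is available:
def make_hardlink_pathlib(existing: Path, new: Path) -> None:
    if hasattr(new, "hardlink_to"):   # Python 3.10+
        new.hardlink_to(existing)
    else:                             # Python 3.9: note the reversed direction
        existing.link_to(new)
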
@@ -1649,14 +1887,13 @@ class DedupBackendHardlink(Dedup):
1649
1887
  self._create_actual_link(existing_path, dedup_file_path)
1650
1888
 
1651
1889
  def _verify_link(self, link: mo.Link) -> bool:
1652
- p = Path(link.link_path.decode("utf-8"))
1653
-
1890
+ p = self._link_path_from_string(link.path)
1654
1891
  try:
1655
1892
  a = p.lstat()
1656
1893
  except Exception:
1657
1894
  return False
1658
1895
 
1659
- if link.file.mtime != int(a.st_mtime):
1896
+ if int(a.st_mtime) != self._clean_dedup_mtime:
1660
1897
  return False
1661
1898
 
1662
1899
  # st_ino is 0 on unsupported filesystems on Windows.