vocker 0.2.0-py3-none-any.whl → 0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vocker/dedup.py CHANGED
@@ -1,6 +1,8 @@
  from __future__ import annotations
 
  import abc
+ import datetime
+ from collections import defaultdict
  import contextlib
  import filelock
  import io
@@ -18,11 +20,10 @@ import structlog
  import concurrent.futures as cf
 
  import sqlalchemy as sa
- from sqlalchemy import orm as sao
+ from sqlalchemy import orm as sao, literal_column as _lit
  from sqlalchemy_boltons import sqlite as sq
  from sqlalchemy_boltons.orm import RelationshipComparator as Rel, IdKey
- from sqlalchemy_boltons.temporary import temporary_table
- from sqlalchemy_boltons.core import bytes_startswith
+ from sqlalchemy_boltons.core import bytes_startswith, count
  from boltons.iterutils import chunked_iter
  from cached_property import cached_property
 
@@ -39,15 +40,22 @@ logger = structlog.get_logger(__name__)
  class Corrupted:
  path: Path | None = attr.ib()
  file_id: int = attr.ib()
- exception: str = attr.ib()
- link_paths: frozenset[str] = attr.ib()
- raw_link_paths: frozenset[str] = attr.ib()
+ exception_name: str = attr.ib()
+ exception_string: str = attr.ib()
+ link_paths: tuple[Path, ...] = attr.ib()
+ raw_link_paths: tuple[bytes, ...] = attr.ib()
 
  def to_json(self):
  d = attr.asdict(self)
  d["path"] = p if (p := d["path"]) is None else str(p)
- for k in ("link_paths", "raw_link_paths"):
- d[k] = sorted(d[k])
+
+ d["link_paths"] = [str(x) for x in d["link_paths"]]
+ d["link_paths"].sort()
+
+ # JSON cannot handle raw bytes
+ d["raw_link_paths"] = [x.decode("iso-8859-1") for x in d["raw_link_paths"]]
+ d["raw_link_paths"].sort()
+
  return d
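The `iso-8859-1` decode above is the usual trick for smuggling raw bytes through JSON: Latin-1 maps every byte value 0-255 to exactly one code point, so the round trip is lossless. A minimal standalone illustration (editorial sketch, plain standard-library Python, not part of the package):

    import json

    raw = bytes(range(256))                 # arbitrary binary path, as stored in raw_link_paths
    as_text = raw.decode("iso-8859-1")      # every byte becomes exactly one code point
    restored = json.loads(json.dumps(as_text)).encode("iso-8859-1")
    assert restored == raw                  # lossless round trip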
 
 
@@ -132,10 +140,11 @@ class DedupLinkRequest(DedupRequest):
  """
 
  hash_function: mh.HashFunction = attr.ib()
- link_path: Path = attr.ib()
+ link_path: Path | None = attr.ib()
  file_metadata: DedupFileMetadata = attr.ib()
  file_contents_hash: mh.Digest | None = attr.ib()
  open_file_once: ty.Callable[[], ty.BinaryIO] | None = attr.ib()
+ adopt_existing: bool = attr.ib(default=False)
  file_not_needed: ty.Callable[[], None] | None = attr.ib(default=None)
  tags: ty.Set[bytes] = attr.ib(factory=frozenset)
 
@@ -163,15 +172,9 @@ class _ImplDedupRequestCommon:
  @attr.s(eq=False, hash=False, kw_only=True)
  class _ImplDedupLinkRequest(_ImplDedupRequestCommon):
  req: DedupLinkRequest = attr.ib(default=None)
- lookup_key = attr.ib(default=None)
- dedup_file_path: Path = attr.ib(default=None)
+ obj: _Obj | None = attr.ib(default=None)
  link_path_str: bytes | None = attr.ib(default=None)
- file: IdKey[mo.DedupFile] | None = attr.ib(default=None)
  metadata_bytes: bytes | None = attr.ib(default=None)
- file_size: int = attr.ib(default=None)
- file_mtime: int = attr.ib(default=None)
- fast_path: bool = attr.ib(default=False) # can we use the fast-path without db transaction?
- is_new: bool = attr.ib(default=False) # is it a brand new FileDedup?
  hashes_promised: dict[mh.HashFunction, mh.Digest] = attr.ib(default=None)
  hashes_computed: dict[mh.HashFunction, mh.Digest] | None = attr.ib(default=None)
  called_file: bool = attr.ib(default=False)
@@ -222,19 +225,21 @@ class AdoptRequest:
  path: Path = attr.ib()
  tags: ty.Set[bytes] = attr.ib(factory=frozenset)
 
- out_size: int | None = attr.ib(init=False, default=None)
- out_digest: mh.Digest | None = attr.ib(init=False, default=None)
 
+ @attr.s(eq=False, slots=True)
+ class _Obj:
+ id: int = attr.ib(factory=None)
+ pending_file_ids = attr.ib(factory=list)
+ completed_file_ids = attr.ib(factory=list)
+ file_size: int | None = attr.ib(default=None)
+ adopted_file_path: Path | None = attr.ib(default=None)
 
- @attr.s(eq=False, hash=False)
- class _ImplAdoptRequest:
- req: AdoptRequest = attr.ib()
- link_path: bytes = attr.ib(default=None)
- file_metadata: DedupFileMetadata = attr.ib(default=None)
- file_metadata_bytes: bytes = attr.ib(default=None)
- done: bool = attr.ib(default=False)
- dedup_file_path: Path = attr.ib(default=None)
- delete: bool = attr.ib(default=False)
+
+ @attr.s(eq=False, slots=True)
+ class _Updates:
+ obj_updates = attr.ib(factory=list)
+ file_updates = attr.ib(factory=list)
+ link_updates = attr.ib(factory=list)
 
 
  """
@@ -284,7 +289,7 @@ class _PendingUpdater:
  raise ValueError(f"invalid update_interval={u!r}")
 
  def _update(self):
- with self.sessionmaker_w() as s:
+ with self.sessionmaker_w.begin() as s:
  pending: mo.Pending = self.pending.get_one(s)
  pending.expire_at = mo.now() + self.seconds_in_the_future
 
@@ -335,6 +340,10 @@ def make_sqlite_options(synchronous):
  )
 
 
+ def _ns(stmt):
+ return stmt.execution_options(synchronize_session=False)
+
+
  @attr.s(eq=False, hash=False)
  class Dedup(abc.ABC):
  base_path: Path = attr.ib()
@@ -345,10 +354,14 @@ class Dedup(abc.ABC):
  _path_db: Path | None = attr.ib(default=None, kw_only=True)
  path_temporary: Path | None = attr.ib(default=None, kw_only=True)
  path_deleted: Path | None = attr.ib(default=None, kw_only=True)
- path_corrupted: Path | None = attr.ib(default=None, kw_only=True)
  _integer_to_path = attr.ib(factory=IntegerToPath, kw_only=True)
  _sqlite_synchronous = attr.ib(default="NORMAL", kw_only=True)
  _batch_size = 1000
+ max_link_count: int = ...
+ _clean_dedup_mtime = (
+ round(datetime.datetime(2000, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc).timestamp()) & ~1
+ )
+ _corrupted_pending_id = -1 # reserved ID
 
  def __attrs_post_init__(self):
  if self._path_dedup is None:
@@ -363,15 +376,12 @@ class Dedup(abc.ABC):
  if self.path_temporary is None:
  self.path_temporary = self.base_path / "tmp"
 
- if self.path_corrupted is None:
- self.path_corrupted = self.base_path / "corrupted"
-
  self._path_dedup.mkdir(exist_ok=True, parents=True)
  self._path_db.parent.mkdir(exist_ok=True, parents=True)
- self.path_corrupted.mkdir(exist_ok=True, parents=True)
  self.path_deleted.mkdir(exist_ok=True, parents=True)
  self._path_temporary_dirs.mkdir(exist_ok=True, parents=True)
  self._path_temporary_lock.mkdir(exist_ok=True, parents=True)
+ self._path_temporary_simple_dir.mkdir(exist_ok=True, parents=True)
  engine = sq.create_engine_sqlite(self._path_db, create_engine_args=dict(echo=False))
  engine = make_sqlite_options(synchronous=self._sqlite_synchronous).apply(engine)
  self._engine_r = engine
@@ -380,10 +390,6 @@ class Dedup(abc.ABC):
  self._SessionR = sao.sessionmaker(self._engine_r)
  self._SessionW = sao.sessionmaker(self._engine_w)
 
- # FIXME: use proper session management
- # self.session = Session(self.engine_rw) # HACK
- # self.engine = self.engine_rw # HACK
-
  self._initialize_db()
 
  def _initialize_db(self):
@@ -392,6 +398,10 @@ class Dedup(abc.ABC):
  mo.BaseDedup.metadata.create_all(conn)
  conn.commit()
 
+ with self._beginw() as s:
+ if s.get(mo.Pending, self._corrupted_pending_id) is None:
+ s.add(mo.Pending(id=self._corrupted_pending_id, expire_at=1))
+
  @contextlib.contextmanager
  def _beginw(self):
  with self._SessionW.begin() as s:
@@ -408,6 +418,10 @@ class Dedup(abc.ABC):
  new_mode |= mask
  os.chmod(str(path), new_mode, follow_symlinks=False)
 
+ def _set_clean_file_mtime(self, path: Path) -> None:
+ t = self._clean_dedup_mtime
+ os.utime(path, (t, t))
+
  def get_metadata_from_file(self, path: Path) -> DedupFileMetadata:
  if supports_executable():
  mode = path.stat().st_mode
@@ -470,42 +484,269 @@ class Dedup(abc.ABC):
470
484
  hashes=[f(h) for h in link.hashes_promised.values()],
471
485
  )
472
486
 
473
- def _add_tags_to_file(self, session: sao.Session, file: mo.DedupFile, tags: ty.Set[bytes]):
474
- if not tags:
475
- return
487
+ def _tmp_sqlite(self, tmp):
488
+ with self._SessionR() as s:
489
+ return tmp.get(s, "").value
476
490
 
477
- Tag = sao.aliased(mo.Tag)
478
- current_tags = frozenset(
479
- session.execute(sa.select(Tag.name).where(Tag.file == file)).scalars().all()
491
+ @cached_property
492
+ def _tmp_files(self):
493
+ return self._tmp_sqlite(mo.tmp_new_files)
494
+
495
+ @cached_property
496
+ def _tmp_files2(self):
497
+ return self._tmp_sqlite(mo.tmp_new_files2)
498
+
499
+ @cached_property
500
+ def _tmp_check_links(self):
501
+ return self._tmp_sqlite(mo.tmp_check_links)
502
+
503
+ @cached_property
504
+ def _tmp_delete_extra(self):
505
+ return self._tmp_sqlite(mo.tmp_delete_extra)
506
+
507
+ @cached_property
508
+ def _sql_prebatch_check_link(self):
509
+ L = sao.aliased(mo.Link)
510
+ return count(L).where(L.path == sa.bindparam("p_path"))
511
+
512
+ @cached_property
513
+ def _sql_prebatch_update_with_existing_dedup_files(self):
514
+ tmp = self._tmp_files
515
+ ONE = _lit("1")
516
+
517
+ def _eq(x, y, attributes):
518
+ return sa.and_(getattr(x, a) == getattr(y, a) for a in attributes)
519
+
520
+ O = sao.aliased(mo.Obj, name="obj")
521
+ tmp_f = sao.aliased(tmp.files, name="t_file")
522
+ tmp_tag = sao.aliased(tmp.tags, name="t_tag")
523
+ tmp_hash = sao.aliased(tmp.hashes, name="t_hash")
524
+ Tag = sao.aliased(mo.Tag, name="tag")
525
+ Hash = sao.aliased(mo.Hash, name="hash")
526
+ cond_obj = O.q_is_complete() | (O.pending_id == sa.bindparam("p_pending_id"))
527
+ cond_obj &= _eq(O, tmp_f.c, ["metadata_bytes"])
528
+ sqt = (
529
+ sa.select(O.id)
530
+ .join(Tag, O.tags)
531
+ .join(tmp_tag, _eq(tmp_tag.c, Tag, ["name"]))
532
+ .where(cond_obj, tmp_tag.c.id == tmp_f.c.id)
533
+ .limit(ONE)
534
+ )
535
+ q = sa.select(tmp_f.c.id, sqt.scalar_subquery().label("obj_id"))
536
+ q = q.where(tmp_f.c.obj_id == None).subquery()
537
+
538
+ sqh = (
539
+ sa.select(O.id)
540
+ .join(Hash, O.hashes)
541
+ .join(tmp_hash, _eq(tmp_hash.c, Hash, ["hash_function", "hash"]))
542
+ .where(cond_obj, tmp_hash.c.id == q.c.id)
543
+ .limit(ONE)
544
+ )
545
+ q = sa.select(
546
+ q.c.id,
547
+ sa.case((q.c.obj_id == None, sqh.scalar_subquery()), else_=q.c.obj_id).label("obj_id"),
480
548
  )
481
- for name in tags - current_tags:
482
- session.add(mo.Tag(name=name, file=file))
483
549
 
484
- def _prepare_dedup_file_for_linking(
485
- self, session: sao.Session, file: mo.DedupFile, link: _ImplDedupLinkRequest
486
- ):
487
- if link.is_new:
488
- # We need to flush so that the DedupFile gets assigned an ID. The merge below needs it.
489
- session.flush()
490
-
491
- # We add our tags.
492
- self._add_tags_to_file(session, file, link.req.tags)
493
-
494
- # Delete any existing link.
495
- session.connection().execute(
496
- sa.delete(mo.Link)
497
- .where(mo.Link.link_path == link.link_path_str)
498
- .execution_options(synchronize_session=False)
550
+ tmp_f_up = sao.aliased(tmp.files, name="t_file")
551
+
552
+ # Finally, create the UPDATE statement that uses `qu` to update `tmp_files`.
553
+ qu = q.cte(name="t_file_changes")
554
+ stmt = sa.update(tmp_f_up)
555
+ stmt = stmt.values(obj_id=qu.c.obj_id).where(tmp_f_up.c.id == qu.c.id)
556
+ return _ns(stmt)
557
+
558
+ @cached_property
559
+ def _sql_prebatch_insert_missing_objs(self):
560
+ """
561
+ Create Obj records where missing.
562
+ """
563
+ tmp = self._tmp_files
564
+ fake_created_at = sa.bindparam("p_fake_created_at")
565
+
566
+ tmp_f = sa.alias(tmp.files, name="t_file")
567
+ Obj = sao.aliased(mo.Obj, name="obj")
568
+ q = sa.select(
569
+ tmp_f.c.metadata_bytes.label("metadata"),
570
+ tmp_f.c.id.label("size"), # smuggle ID through this field
571
+ fake_created_at.label("created_at"),
572
+ sa.null().label("orphaned_at"),
573
+ sa.bindparam("p_pending_id").label("pending_id"),
574
+ )
575
+ q = q.select_from(tmp_f).where(
576
+ tmp_f.c.obj_id == None, tmp_f.c.insert_obj_if_missing == True
577
+ )
578
+ qi = sa.insert(mo.Obj)
579
+ qi = qi.from_select(["metadata", "size", "created_at", "orphaned_at", "pending_id"], q)
580
+ del q, Obj, tmp_f
581
+
582
+ Obj = sao.aliased(mo.Obj, name="obj")
583
+ q = sa.select(Obj.id.label("obj_id"), Obj.size.label("id")).where(
584
+ Obj.created_at < _lit("0"), Obj.created_at == fake_created_at
585
+ )
586
+ q = q.cte(name="t_file_changes")
587
+
588
+ tmp_f = sa.alias(tmp.files, name="t_file")
589
+ qu = sa.update(tmp_f).add_cte(q)
590
+ qu = qu.values(new_obj_id=q.c.obj_id).where(tmp_f.c.id == q.c.id)
591
+
592
+ return _ns(qi), _ns(qu)
593
+
594
+ @cached_property
595
+ def _sql_prebatch_fix_and_delete_objs(self):
596
+ tmp = self._tmp_files
597
+ tmp_f = sao.aliased(tmp.files, name="t_files")
598
+ pending_id = sa.bindparam("p_pending_id")
599
+ created_at = sa.bindparam("p_created_at")
600
+ Obj = sao.aliased(mo.Obj, name="obj")
601
+
602
+ # Set a proper created_at for the new Objs that are actually in use.
603
+ q = sa.select(tmp_f.c.obj_id).where(tmp_f.c.obj_id != None)
604
+ r1 = sa.update(Obj).values(created_at=created_at)
605
+ r1 = r1.where(Obj.id.in_(q), Obj.pending_id == pending_id)
606
+
607
+ # Set updated_at to the current time.
608
+ r2 = sa.update(Obj).values(updated_at=created_at)
609
+ r2 = r2.where(Obj.id.in_(q))
610
+
611
+ # Delete remaining Objs.
612
+ r3 = sa.delete(Obj).where(
613
+ Obj.id.in_(sa.select(tmp_f.c.new_obj_id)), Obj.created_at < _lit("0")
499
614
  )
500
615
 
501
- # Create link object.
502
- session.add(mo.Link(link_path=link.link_path_str, file=file))
616
+ return tuple(_ns(x) for x in (r1, r2, r3))
503
617
 
504
- # Since we created a link, the file is definitely not orphaned.
505
- file.orphaned_at = None
618
+ @cached_property
619
+ def _sql_prebatch_insert_hashes(self):
620
+ """
621
+ Create Hash records.
622
+ """
623
+ tmp = self._tmp_files
624
+ tmp_f = sao.aliased(tmp.files, name="t_files")
625
+ tmp_h = sao.aliased(tmp.hashes, name="t_hash")
626
+ Obj = sao.aliased(mo.Obj, name="obj")
627
+ Hash = sao.aliased(mo.Hash, name="h")
628
+
629
+ q = sa.select(tmp_f.c.new_obj_id, tmp_h.c.hash_function, tmp_h.c.hash)
630
+ q = q.select_from(tmp_h).join(tmp_f, tmp_f.c.id == tmp_h.c.id)
631
+ exists = sa.exists().select_from(Hash)
632
+ exists = exists.where(
633
+ Hash.hash_function == tmp_h.c.hash_function, Hash.obj_id == tmp_f.c.new_obj_id
634
+ )
635
+ q = q.where(~exists, tmp_f.c.new_obj_id != None)
636
+ stmt = sa.insert(mo.Hash).from_select(["obj_id", "hash_function", "hash"], q)
637
+ return _ns(stmt)
506
638
 
507
- # This also relies on the flush above.
508
- link.dedup_file_path = self._make_dedup_file_path(file.id)
639
+ @cached_property
640
+ def _sql_prebatch_insert_tags(self):
641
+ """
642
+ Create Tag records.
643
+ """
644
+ # Sadly this has a lot in common with `_sql_insert_hashes`. The urge to refactor is intense.
645
+ tmp = self._tmp_files
646
+ tmp_f = sao.aliased(tmp.files, name="t_files")
647
+ tmp_t = sao.aliased(tmp.tags, name="t_tag")
648
+ Obj = sao.aliased(mo.Obj, name="obj")
649
+ Tag = sao.aliased(mo.Tag, name="tag")
650
+
651
+ q = sa.select(tmp_f.c.new_obj_id, tmp_t.c.name)
652
+ q = q.select_from(tmp_t).join(tmp_f, tmp_f.c.id == tmp_t.c.id)
653
+ exists = sa.exists().select_from(Tag)
654
+ exists = exists.where(Tag.name == tmp_t.c.name, Tag.obj_id == tmp_f.c.new_obj_id)
655
+ q = q.where(~exists, tmp_f.c.new_obj_id != None)
656
+ stmt = sa.insert(mo.Tag).from_select(["obj_id", "name"], q)
657
+ return _ns(stmt)
658
+
659
+ @cached_property
660
+ def _sql_prebatch_insert_files(self):
661
+ tmp = self._tmp_files
662
+ tmp_f = sao.aliased(tmp.files, name="t_files")
663
+ q = sa.select(
664
+ tmp_f.c.obj_id,
665
+ sa.bindparam("p_pending_id").label("pending_id"),
666
+ sa.bindparam("p_created_at").label("created_at"),
667
+ )
668
+ q = q.where(tmp_f.c.obj_id != None)
669
+ stmt = sa.insert(mo.File).from_select(["obj_id", "pending_id", "created_at"], q)
670
+ return _ns(stmt)
671
+
672
+ @cached_property
673
+ def _sql_prebatch_delete_and_insert_links(self):
674
+ tmp = self._tmp_files
675
+ F = sao.aliased(mo.File, name="file")
676
+ L = sao.aliased(mo.Link, name="link")
677
+ tmp_f = sao.aliased(tmp.files, name="t_file")
678
+ null_id = sa.bindparam("p_null_file_id")
679
+
680
+ cond_link = (tmp_f.c.obj_id != None) & (tmp_f.c.link_path != None)
681
+
682
+ # Invalidate file link counts for the links we are about to delete.
683
+ q = sa.select(L.file_id).where(L.path.in_(sa.select(tmp_f.c.link_path).where(cond_link)))
684
+ r0 = sa.update(F).values(link_count=-1)
685
+ r0 = r0.where(F.id.in_(q))
686
+
687
+ # Delete the old links.
688
+ r1 = sa.delete(L).where(L.path.in_(sa.select(tmp_f.c.link_path).where(cond_link)))
689
+
690
+ # Insert the new links.
691
+ q = sa.select(tmp_f.c.link_path.label("path"), null_id.label("file_id")).where(cond_link)
692
+ r2 = sa.insert(mo.Link).from_select(["path", "file_id"], q)
693
+
694
+ # Set in-use Objs as not orphaned as they now have links.
695
+ O = sao.aliased(mo.Obj, name="obj")
696
+ q = sa.select(tmp_f.c.obj_id).where(cond_link)
697
+ r3 = sa.update(O).values(orphaned_at=None).where(O.id.in_(q))
698
+
699
+ return tuple(_ns(r) for r in (r0, r1, r2, r3))
700
+
701
+ @cached_property
702
+ def _sql_postbatch_update_objs(self):
703
+ t = sao.aliased(self._tmp_files2.objs, name="tu_obj")
704
+ q = sa.select(t.c.obj_id, t.c.size).subquery()
705
+ O = sao.aliased(mo.Obj, name="obj")
706
+ stmt = sa.update(O).where(O.id == q.c.obj_id)
707
+ stmt = stmt.values(size=q.c.size, pending_id=None)
708
+ return _ns(stmt)
709
+
710
+ @cached_property
711
+ def _sql_postbatch_update_files(self):
712
+ t = sao.aliased(self._tmp_files2.files, name="tu_files")
713
+ q = sa.select(t.c.file_id, t.c.obj_id).subquery()
714
+ F = sao.aliased(mo.File, name="file")
715
+ stmt = sa.update(F).where(F.id == q.c.file_id)
716
+ stmt = stmt.values(obj_id=q.c.obj_id, pending_id=None)
717
+ return _ns(stmt)
718
+
719
+ @cached_property
720
+ def _sql_postbatch_update_links(self):
721
+ t = sao.aliased(self._tmp_files2.links, name="tu_links")
722
+ q = sa.select(t.c.link_path, t.c.file_id, t.c.link_count).subquery()
723
+ F = sao.aliased(mo.File, name="file")
724
+ L = sao.aliased(mo.Link, name="link")
725
+ stmt1 = sa.update(F).where(F.id == q.c.file_id)
726
+ stmt1 = stmt1.values(link_count=q.c.link_count)
727
+ stmt2 = sa.update(L).where(L.path == q.c.link_path)
728
+ stmt2 = stmt2.values(file_id=q.c.file_id)
729
+ return _ns(stmt1), _ns(stmt2)
730
+
731
+ @cached_property
732
+ def _sql_prebatch_select_req_obj(self):
733
+ tmp = self._tmp_files
734
+ t_files = sao.aliased(tmp.files, name="t_files")
735
+ q = sa.select(t_files.c.id, t_files.c.obj_id).where(t_files.c.obj_id != None)
736
+ return _ns(q)
737
+
738
+ @cached_property
739
+ def _sql_prebatch_select_obj_file(self):
740
+ tmp = self._tmp_files
741
+ p_id = sa.bindparam("p_pending_id")
742
+ O = sao.aliased(mo.Obj, name="obj")
743
+ F = sao.aliased(mo.File, name="file")
744
+ t_files = sao.aliased(tmp.files, name="t_files")
745
+ qo = sa.select(t_files.c.obj_id).where(t_files.c.obj_id != None)
746
+ q = sa.select(F.obj_id, F.id, F.pending_id)
747
+ q = q.where(F.link_count < sa.bindparam("p_max_link_count"))
748
+ q = q.where(F.obj_id.in_(qo), (F.pending_id == None) | (F.pending_id == p_id))
749
+ return _ns(q)
509
750
 
510
751
  def run_batch(self, requests: ty.Iterable[DedupRequest]) -> None:
511
752
  """
@@ -514,24 +755,42 @@ class Dedup(abc.ABC):
 
  The requests will be addressed in the order that they appear in the iterable.
 
- Notes
- -----
+ BUG: If the same file (same hash or same tag) appears multiple times in the *requests* then
+ multiple files will be created. You are welcome to fix this without breaking the tests and
+ without incurring a significant performance penalty.
+
+ We create more Objs and Files than we need, then clean them up later no biggie.
+ They get automatically deleted when the Pending record is deleted.
+
+ Pre-batch:
 
- The implementation tries to spend as little time as possible inside database transactions.
+ 1. Insert temporary files, hashes, tags
+ 2. Update tmp.files.obj_id by matching existing Objs by hash or tag.
+ 3. Insert a new Obj for each tmp_file where obj_id is NULL.
+ 4. Insert Objs and update t.files.obj_id to point to the right Obj.
+ 5. Insert Hash and Tag rows corresponding to stuff in t.files.
+ 6. Update tmp.files.obj_id by matching existing Objs by hash or tag.
+ 7. Insert a new File for each tmp_file. We might not need it.
+ 8. Insert a new Link for each tmp_file.
+ 9. Select (Obj.id, File.id) for each of tmp.files. These are all the possible usable files
+ that we will attempt to create a link to.
 
- 1. Search database for existing deduplicated files that can be reused. These are files
- that match either the hash or one of the tags.
- 2. Create a record for each new deduplicated file. Create a Pending
- 3.
+ Batch:
 
- NEW IDEA FIXME
- --------------
+ 1. For each request:
+ 2. If no content is present, then use the existing pending File id to write the content.
+ 3. If content is already present:
+ 3. For each related file_id:
+ 4. Check the link count and make a reminder to update the link count in the DB.
+ 5. Attempt to create a link pointing to that file_id. If it succeeds, continue
+ to the next request.
+ 6. None of the file IDs succeeded. Make a copy of an existing file using the spare
+ pending File id.
 
- Split into fast path and slow path. If it's a brand new file OR it's an existing file that
- is done being written (not pending), then that's the fast path. Otherwise it's the slow
- path.
+ Post-batch:
 
- On the *fast path* we don't need to check for what other threads are doing.
+ 1. Update the link counts.
+ 2. Match each
 
  """
 
@@ -553,236 +812,189 @@ class Dedup(abc.ABC):
553
812
  "doing both links and copies in the same batch is not supported for now"
554
813
  )
555
814
 
815
+ # cases to consider:
816
+ # adopt_existing==True, link_path is an existing dedup link: NOT IMPLEMENTED
817
+ # adopt_existing==True, link_path is a regular file, hash matches content inside dedup db
818
+ # adopt_existing==True, link_path is a regular file, content is novel
819
+
556
820
  # Preliminaries to do before we start writing to the database.
557
- all_tags: set[bytes] = set()
558
- hashes_to_search: list[dict] = []
821
+ tmp_files = []
822
+ tmp_tags = []
823
+ tmp_hashes = []
559
824
  with self._SessionR() as s:
560
825
  for link in links:
561
- with self._ignore_skip(), self._catch_req_exc(link):
562
- req = link.req
563
- link.link_path_str = self._link_path_to_string(req.link_path)
826
+ req = link.req
827
+
828
+ if req.link_path is not None:
829
+ link.link_path_str = ps = self._link_path_to_string(req.link_path)
564
830
  # Remove existing file if present. This may raise if the path is actually a
565
831
  # directory.
566
- req.link_path.unlink(missing_ok=True)
567
-
568
- all_tags |= req.tags
569
-
570
- link.metadata_bytes = self.convert_file_metadata_to_bytes(req.file_metadata)
571
-
572
- if (h := req.file_contents_hash) is not None:
573
- link.lookup_key = h, link.metadata_bytes
574
- d = {
575
- "id": link.index,
576
- "hash_function": h.function.function_code,
577
- "digest": h.digest,
578
- "metadata_bytes": link.metadata_bytes,
579
- }
580
- hashes_to_search.append(d)
581
- link.hashes_promised = {h.function: h}
832
+ if req.adopt_existing:
833
+ pass # Assertion is too expensive.
834
+ # assert not s.execute(
835
+ # self._sql_prebatch_check_link, {"p_path": ps}
836
+ # ).scalar(), "adopting an existing link is not supported yet"
582
837
  else:
583
- link.hashes_promised = {}
584
-
585
- for copy in copies:
586
- with self._ignore_skip(), self._catch_req_exc(copy):
587
- req = copy.req
588
- copy.src_str = self._link_path_to_string(req.src)
589
- copy.dst_str = self._link_path_to_string(req.dst)
590
-
591
- def _q_gather_file_related(s, cls, attribute, values_set):
592
- """
593
- Query DedupFile-related information.
594
- """
595
- if not values_set:
596
- return () # short-cut to avoid doing the query at all
597
- Related = sao.aliased(cls)
598
- q = sa.select(Related).where(getattr(Related, attribute).in_(values_set))
599
- q = q.options(sao.joinedload(Related.file))
600
- return s.execute(q).scalars()
838
+ req.link_path.unlink(missing_ok=True)
839
+ else:
840
+ # The user is requesting insert of content but doesn't want an actual link
841
+ # to be created.
842
+ link.link_path_str = ps = None
843
+ assert not req.adopt_existing
844
+
845
+ if req.adopt_existing:
846
+ req.file_metadata = m = self.get_metadata_from_file(req.link_path)
847
+ with req.link_path.open("rb") as f:
848
+ r = self._compute_file_hash(req.hash_function, f)
849
+ link.file_size, req.file_contents_hash = r
850
+ else:
851
+ m = req.file_metadata
601
852
 
602
- # Now we check the database and add file hash records where we can.
603
- with self._beginw() as s:
604
- s.add(pending := mo.Pending(expire_at=mo.now() + 30.0))
605
- s.flush()
606
- pending_key = IdKey.from_instance(pending)
853
+ link.metadata_bytes = m = self.convert_file_metadata_to_bytes(m)
607
854
 
608
- # Load relevant tags.
609
- q = _q_gather_file_related(s, mo.Tag, "name", all_tags)
610
- tag_to_file: dict[bytes, mo.DedupFile] = {x.name: x.file for x in q}
611
-
612
- # Load relevant hashes.
613
- if hashes_to_search:
614
- with temporary_table(s, mo.tmp_hash_lookup) as tmp:
615
- s.connection().execute(sa.insert(tmp), hashes_to_search).close()
616
- H = sao.aliased(mo.Hash)
617
- F = sao.aliased(mo.DedupFile)
618
- q = (
619
- sa.select(H, F)
620
- .join(F, H.file)
621
- .join(
622
- tmp,
623
- (tmp.c.digest == H.hash)
624
- & (tmp.c.hash_function == H.hash_function)
625
- & (tmp.c.metadata_bytes == F.file_metadata),
626
- )
627
- )
628
- hash_to_file = {
629
- (h.to_digest(), f.file_metadata): f for h, f in s.execute(q).all()
855
+ tmp_files.append(
856
+ {
857
+ "id": link.index,
858
+ "link_path": ps,
859
+ "metadata_bytes": m,
860
+ "insert_obj_if_missing": req.open_file_once is not None
861
+ or req.adopt_existing,
630
862
  }
631
- else:
632
- hash_to_file = {}
863
+ )
633
864
 
634
- # Construct a set so that we can check for intersection quickly.
635
- tag_to_file_set = set(tag_to_file)
865
+ tmp_tags += ({"id": link.index, "name": tag} for tag in req.tags)
636
866
 
637
- for link in links:
638
- if link.failed:
639
- continue
640
-
641
- req = link.req
642
-
643
- if overlap := req.tags & tag_to_file_set:
644
- # We found a deduped file with a common alternate key! We use it!
645
- file = tag_to_file[next(iter(overlap))]
646
- elif (key := link.lookup_key) is not None:
647
- # Check for a deduped file with the same hash.
648
- file = hash_to_file.get(key, None)
867
+ if (h := req.file_contents_hash) is not None:
868
+ d = {
869
+ "id": link.index,
870
+ "hash_function": h.function.function_code,
871
+ "hash": h.digest,
872
+ }
873
+ tmp_hashes.append(d)
874
+ link.hashes_promised = {h.function: h}
649
875
  else:
650
- file = None
876
+ link.hashes_promised = {}
651
877
 
652
- if file is None:
653
- # We did not find a matching file. We create a new one if we can.
654
- link.is_new = True
655
- link.fast_path = True
878
+ if (req.file_contents_hash is None) and not req.tags:
879
+ raise AssertionError("must provide hash and/or tags")
656
880
 
657
- if req.open_file_once is None:
658
- # The user does not actually have the contents of the file. We skip over
659
- # it.
660
- link.set_failed(MissingContentError())
661
- continue
881
+ updates = _Updates()
882
+ objs: dict[int, _Obj] = {}
662
883
 
663
- # We must create a file.
664
- s.add(file := self._make_dedup_file(link, pending))
665
- elif file.pending_id is None:
666
- # We found a matching file and it is not pending. We can use it directly.
667
- link.fast_path = True
884
+ # Now we check the database and add file hash records where we can.
885
+ with self._beginw() as s, mo.tmp_new_files(s, "") as t:
886
+ c = s.connection()
887
+ s.add(pending := mo.Pending(expire_at=mo.now() + 30.0))
888
+ s.flush()
889
+ pending_key = IdKey.from_instance(pending)
890
+ pending_id = pending.id
891
+ if tmp_files:
892
+ c.execute(sa.insert(t.files), tmp_files).close()
893
+ if tmp_hashes:
894
+ c.execute(sa.insert(t.hashes), tmp_hashes).close()
895
+ if tmp_tags:
896
+ c.execute(sa.insert(t.tags), tmp_tags).close()
897
+
898
+ s.add(temp_file := mo.File(pending_id=pending_id))
899
+ s.flush()
900
+ temp_file_id = temp_file.id
901
+
902
+ # Set t.files.obj_id using existing Obj and File records.
903
+ d = {
904
+ "p_fake_created_at": -1,
905
+ "p_created_at": mo.now(),
906
+ "p_null_file_id": temp_file_id,
907
+ "p_pending_id": pending_id,
908
+ "p_max_link_count": self.max_link_count,
909
+ }
910
+ c.execute(self._sql_prebatch_update_with_existing_dedup_files, d).close()
911
+
912
+ # We insert Obj records for the requests where t.files.obj_id is NULL.
913
+ for stmt in self._sql_prebatch_insert_missing_objs:
914
+ c.execute(stmt, d).close()
915
+
916
+ # Now all requests have an Obj. We add tag or hash records where necessary.
917
+ c.execute(self._sql_prebatch_insert_tags).close()
918
+ c.execute(self._sql_prebatch_insert_hashes).close()
919
+
920
+ # Coalesce overlapping new Objs.
921
+ c.execute(self._sql_prebatch_update_with_existing_dedup_files, d).close()
922
+
923
+ # Delete unused Objs. Set `Obj.created_at` for remaining ones.
924
+ for stmt in self._sql_prebatch_fix_and_delete_objs:
925
+ c.execute(stmt, d).close()
926
+
927
+ # Speculatively insert as many files as there are members in the batch.
928
+ c.execute(self._sql_prebatch_insert_files, d).close()
929
+
930
+ for r in self._sql_prebatch_delete_and_insert_links:
931
+ c.execute(r, d).close()
932
+
933
+ if 0:
934
+ tmp = self._tmp_files
935
+ print("**************** files, hashes, tags")
936
+ for tab in (tmp.files, tmp.hashes, tmp.tags):
937
+ print(s.execute(sa.select(tab)).all())
938
+
939
+ for req_id, obj_id in c.execute(self._sql_prebatch_select_req_obj):
940
+ if (obj := objs.get(obj_id)) is None:
941
+ objs[obj_id] = obj = _Obj(id=obj_id)
942
+ (link := links[req_id]).obj = obj
943
+ if link.req.adopt_existing:
944
+ obj.adopted_file_path = link.req.link_path
945
+
946
+ for obj_id, file_id, pending_id in c.execute(self._sql_prebatch_select_obj_file, d):
947
+ o = objs[obj_id]
948
+ if pending_id is None:
949
+ o.completed_file_ids.append(file_id)
668
950
  else:
669
- # If the file is still in a pending state, the hashes and tags are unreliable.
670
- # The file might fail to be written, the hashes might be invalid, etc. We must
671
- # use the slow path and wait for the file to become ready.
672
- link.fast_path = False
673
- file = None
674
-
675
- if link.fast_path:
676
- self._prepare_dedup_file_for_linking(s, file, link)
677
- if link.is_new:
678
- # If the same file shows up later in the batch, ensure that it is used.
679
- for v in link.hashes_promised.values():
680
- hash_to_file[v, file.file_metadata] = file
681
-
682
- # the _prepare_dedup_file_for_linking caused a flush, so our primary key is ready
683
- if file is not None:
684
- link.file = IdKey.from_instance(file)
685
-
686
- L = sao.aliased(mo.Link)
687
- q = sa.select(L).where(
688
- (L.link_path == sa.bindparam("x_src")) | (L.link_path == sa.bindparam("x_dst"))
689
- )
690
- for copy in copies:
691
- with self._ignore_skip(), self._catch_req_exc(copy):
692
- link_objs = {
693
- x.link_path: x
694
- for x in s.execute(q, {"x_src": copy.src_str, "x_dst": copy.dst_str})
695
- .scalars()
696
- .all()
697
- }
698
-
699
- if (src_link := link_objs.get(copy.src_str)) is None:
700
- raise NotADedupLinkError
701
-
702
- if (dst_link := link_objs.get(copy.dst_str)) is not None:
703
- s.delete(dst_link)
704
-
705
- copy.dedup_file_path = self._make_dedup_file_path(src_link.file_id)
706
- s.add(mo.Link(file_id=src_link.file_id, link_path=copy.dst_str))
707
- s.flush()
708
- del q, L
951
+ o.pending_file_ids.append(file_id)
709
952
 
710
953
  pending.expire_at = mo.now() + 30.0
954
+ del pending
955
+
956
+ failed_link_paths = []
957
+ with self._PendingUpdater(
958
+ pending=pending_key,
959
+ sessionmaker_r=self._SessionR,
960
+ sessionmaker_w=self._SessionW,
961
+ seconds_in_the_future=20,
962
+ ) as pu, self.temporary_directory(check_links=False) as tmp_path:
963
+ for link in links:
964
+ with self._ignore_skip(), self._catch_req_exc(link):
965
+ if (obj := link.obj) is None:
966
+ # nothing to be done here
967
+ link.call_file_not_needed()
968
+ link.set_failed(MissingContentError(f"no obj {link}"))
969
+ continue
711
970
 
712
- del hash_to_file, tag_to_file, tag_to_file_set, pending
713
-
714
- to_be_flushed = []
715
- failed_requests = []
716
-
717
- def _flush_now(s: sao.Session):
718
- for link in to_be_flushed:
719
- file: mo.DedupFile | None = None if (f := link.file) is None else f.get(s)
971
+ self._write_dedup_file_contents(link, tmp_path, updates)
720
972
 
721
- if link.failed or file is None:
722
- failed_requests.append(link.req)
723
- if file is not None:
724
- s.delete(file)
725
- continue
973
+ with self._beginw() as s, mo.tmp_new_files2(s, "") as t:
974
+ c = s.connection()
726
975
 
727
- if (size := link.file_size) is not None:
728
- file.size = size
729
- if (mtime := link.file_mtime) is not None:
730
- file.mtime = mtime
731
-
732
- # We need to add whatever extra hashes were computed.
733
- if d := link.hashes_computed:
734
- already_in_db = link.hashes_promised
735
- for k, v in d.items():
736
- if k not in already_in_db:
737
- s.add(mo.Hash.from_digest(v, file=file))
738
-
739
- # We checked the hashes (if any), the file contents are written, and the link
740
- # (if any) has been created. We are therefore ready to set the "file.pending"
741
- # column to NULL, thus marking the dedup file as finalized.
742
- file.pending = None
743
-
744
- to_be_flushed.clear()
745
-
746
- for copy in copies:
747
- with self._ignore_skip(), self._catch_req_exc(copy):
748
- self._delete_file(copy.req.dst)
749
- self._create_actual_link(copy.dedup_file_path, copy.req.dst)
750
-
751
- if links:
752
- # Now we write the file data without holding the database transaction open. The
753
- # "_PendingUpdater" ensures that other threads know that we're working.
754
- with self._PendingUpdater(
755
- pending=pending_key,
756
- sessionmaker_r=self._SessionR,
757
- sessionmaker_w=self._SessionW,
758
- seconds_in_the_future=20,
759
- ) as pu:
760
- for link in links:
761
- with self._ignore_skip(), self._catch_req_exc(link):
762
- if not link.fast_path:
763
- with self._beginw() as s:
764
- _flush_now(s)
765
- self._slow_path_wait_for_dedup_file(link=link, pending=pending_key)
766
-
767
- self._write_dedup_file_contents(link=link)
768
- to_be_flushed.append(link)
769
- pu.update_on_exit = True
976
+ if u := updates.link_updates:
977
+ c.execute(sa.insert(self._tmp_files2.links), u).close()
978
+ for stmt in self._sql_postbatch_update_links:
979
+ c.execute(stmt).close()
770
980
 
771
- with self._beginw() as s:
772
- _flush_now(s)
981
+ if u := updates.file_updates:
982
+ c.execute(sa.insert(self._tmp_files2.files), u).close()
983
+ c.execute(self._sql_postbatch_update_files)
773
984
 
774
- # Delete Pending object along with any DedupFile objects that had errors in them
775
- # using the "ON DELETE CASCADE".
776
- s.delete(pending_key.get_one(s))
985
+ if u := updates.obj_updates:
986
+ c.execute(sa.insert(self._tmp_files2.objs), u).close()
987
+ c.execute(self._sql_postbatch_update_objs).close()
777
988
 
778
- for link in links:
779
- link.req.success = not link.failed
989
+ # Delete the pending object.
990
+ s.delete(pending_key.get_one(s))
991
+ s.flush()
780
992
 
781
- if copies:
782
- for copy in copies:
783
- copy.req.success = not copy.failed
784
- if not copy.req.success:
785
- failed_requests.append(copy.req)
993
+ failed_requests = []
994
+ for link in links:
995
+ ok = link.req.success = not link.failed
996
+ if not ok:
997
+ failed_requests.append(link.req)
786
998
 
787
999
  if failed_requests:
788
1000
  first_exc = failed_requests[0].exc
@@ -794,7 +1006,6 @@ class Dedup(abc.ABC):
  def _write_file_computing_hashes(
  self, target: Path, open1, hashes: ty.Iterable[mh.HashFunction]
  ) -> tuple[int, dict[mh.HashFunction, mh.Digest]]:
- target.parent.mkdir(exist_ok=True, parents=True)
  m = mh.MultiHasher({f: f() for f in hashes})
  with target.open("wb") as f_w, open1() as f_r:
  while block := f_r.read(65536):
@@ -802,114 +1013,126 @@ class Dedup(abc.ABC):
802
1013
  f_w.write(block)
803
1014
  return m.size, m.digest()
804
1015
 
805
- def _write_dedup_file_contents(self, link: _ImplDedupLinkRequest) -> None:
806
- if link.is_new:
807
- if link.req.open_file_once is None:
808
- link.call_file_not_needed()
809
- return
810
-
811
- p = link.dedup_file_path
812
- (fs := set(link.hashes_promised)).update(self.extra_hashes)
813
- link.file_size, d = self._write_file_computing_hashes(p, link.call_open_file_once, fs)
814
- self.apply_metadata_to_file(p, link.req.file_metadata)
815
- link.file_mtime = int(p.stat().st_mtime)
816
- link.hashes_computed = d
817
-
818
- # Check that the hashes match what was claimed inside the link request.
819
- computed = {k: d[k] for k in link.hashes_promised}
820
- if link.hashes_promised != computed:
821
- p.unlink(missing_ok=True)
822
- raise InvalidContentsError(
823
- link_request=link.req,
824
- hashes_expected=link.hashes_promised,
825
- hashes_observed=computed,
826
- )
827
- else:
828
- # existing file - we don't need to do anything
829
- link.call_file_not_needed()
830
-
831
- # TODO: quickly check whether the file mtime matches and check the content hash if not
832
-
833
- self._create_actual_link(link.dedup_file_path, link.req.link_path)
834
-
835
- def _slow_path_wait_for_dedup_file(
836
- self, link: _ImplDedupLinkRequest, pending: IdKey[mo.Pending]
1016
+ def _write_dedup_file_contents(
1017
+ self, link: _ImplDedupLinkRequest, tmp_path: Path, updates: _Updates
837
1018
  ) -> None:
838
- """
839
- The file we are interested in is actively being written to by another thread. We need to
840
- wait for it to be finished or for the other thread to fail.
841
-
842
- Either way, we add the required data to the database such that we can continue with the
843
- fast path procedure after this method returns.
844
- """
1019
+ obj = link.obj
1020
+ target = link.req.link_path
1021
+ skip_link_for_file_id = None
1022
+ adopting = obj.adopted_file_path
1023
+ adopted = False
1024
+
1025
+ def _mkdirp(path):
1026
+ if not path.exists():
1027
+ path.mkdir(exist_ok=True, parents=True)
1028
+
1029
+ # Do we have any completed File IDs at all?
1030
+ if obj.completed_file_ids:
1031
+ link.call_file_not_needed()
1032
+ if adopting is not None:
1033
+ # We don't need the file there.
1034
+ self._delete_file(target)
1035
+ else:
1036
+ # No completed IDs, we need to make one. Try to adopt if possible.
1037
+ tmp_p = tmp_path / "f.bin"
1038
+ tmp_p.unlink(missing_ok=True)
845
1039
 
846
- # Construct query which looks for a DedupFile matching hashes or overlapping tags.
847
- F = sao.aliased(mo.DedupFile)
848
- H = sao.aliased(mo.Hash)
849
- T = sao.aliased(mo.Tag)
850
-
851
- def _exists(Alias):
852
- return sa.exists().select_from(Alias).where(Rel(Alias.file) == F)
853
-
854
- q = sa.select(F)
855
- for v in link.hashes_promised.values():
856
- q = q.where(_exists(H).where(H.compare_digest() == v))
857
- if link.req.tags:
858
- q = q.where(_exists(T).where(T.name.in_(link.req.tags)))
859
- q = q.options(sao.joinedload(F.pending))
860
-
861
- def _check(s: sao.Session) -> mo.DedupFile | bool:
862
- for x in s.execute(q).scalars():
863
- x: mo.DedupFile
864
- if x.pending is None:
865
- # We found a finished DedupFile we can use directly.
866
- return x
867
- elif x.pending_id == pending.key[0]:
868
- # It's already our dedupfile!!!
869
- raise AssertionError("deadlock")
870
- elif x.pending.expire_at >= mo.now():
871
- # We found an in-progress DedupFile, so we stand down and continue polling.
872
- return False
1040
+ if adopting is not None:
1041
+ link.call_file_not_needed()
1042
+ self._adopt_file_and_link(adopting, tmp_p)
1043
+ adopted = True
1044
+ size = tmp_p.stat().st_size
1045
+ apply_metadata = False
1046
+ elif (open1 := link.call_open_file_once) is not None:
1047
+ (fs := set(link.hashes_promised)).update(self.extra_hashes)
1048
+ size, d = self._write_file_computing_hashes(tmp_p, open1, fs)
1049
+ link.hashes_computed = d
1050
+
1051
+ # Check that the hashes match what was claimed inside the link request.
1052
+ computed = {k: d[k] for k in link.hashes_promised}
1053
+ if link.hashes_promised != computed:
1054
+ raise InvalidContentsError(
1055
+ link_request=link.req,
1056
+ hashes_expected=link.hashes_promised,
1057
+ hashes_observed=computed,
1058
+ )
1059
+ apply_metadata = True
1060
+ else:
1061
+ link.set_failed(MissingContentError("content not provided"))
1062
+ return
873
1063
 
874
- # There are no matching DedupFile objects, so we can create a new one ourselves.
875
- return True
1064
+ if apply_metadata:
1065
+ self.apply_metadata_to_file(tmp_p, link.req.file_metadata)
1066
+ self._set_clean_file_mtime(tmp_p)
876
1067
 
877
- def _wait_first_time():
878
- nonlocal _wait
879
- _wait = _wait_normal
1068
+ file_id = obj.pending_file_ids.pop()
1069
+ p = self._make_dedup_file_path(file_id)
1070
+ _mkdirp(p.parent)
1071
+ tmp_p.rename(p)
1072
+ obj.completed_file_ids.append(file_id)
1073
+ updates.file_updates.append({"file_id": file_id, "obj_id": obj.id})
1074
+ updates.obj_updates.append({"obj_id": obj.id, "size": size})
1075
+ # Now the file has the right contents. Let's also make a link now.
880
1076
 
881
- def _wait_normal():
882
- time.sleep(2)
1077
+ if adopting is not None:
1078
+ skip_link_for_file_id = file_id
883
1079
 
884
- _wait = _wait_first_time
1080
+ endgame = False
1081
+ completed = obj.completed_file_ids
885
1082
  while True:
886
- _wait()
1083
+ file_id = completed[-1]
1084
+ p = self._make_dedup_file_path(file_id)
1085
+ if adopted and adopting == target:
1086
+ ok = True
1087
+ else:
1088
+ ok = False
1089
+ try:
1090
+ self._create_actual_link(p, target)
1091
+ ok = True
1092
+ except Exception:
1093
+ pass
887
1094
 
888
- with self._SessionR() as s: # check using a read-only transaction
889
- result = _check(s)
890
- if result is False:
891
- continue
1095
+ if not ok and target.exists():
1096
+ self._delete_file(target)
1097
+ try:
1098
+ self._create_actual_link(p, target)
1099
+ ok = True
1100
+ except Exception:
1101
+ pass
892
1102
 
893
- with self._beginw() as s: # use a write transaction
894
- result = _check(s)
895
- if result is False:
896
- continue
1103
+ link_count = p.stat().st_nlink - 1
1104
+ updates.link_updates.append(
1105
+ {
1106
+ "link_path": link.link_path_str if ok else None,
1107
+ "file_id": file_id,
1108
+ "link_count": link_count,
1109
+ }
1110
+ )
1111
+ if ok:
1112
+ # We're done! Bye!
1113
+ return
897
1114
 
898
- if result is True:
899
- # We need to create a new DedupFile
900
- s.add(file := self._make_dedup_file(link, pending.get_one(s)))
901
- link.is_new = True
902
- else:
903
- file = result
904
- link.is_new = False
1115
+ if len(completed) > 1:
1116
+ completed.pop()
1117
+ continue
905
1118
 
906
- link.fast_path = True
907
- self._prepare_dedup_file_for_linking(s, file, link)
1119
+ if endgame:
1120
+ raise AssertionError
1121
+
1122
+ endgame = True
908
1123
 
909
- # we can only do this after the flush
910
- link.file = IdKey.from_instance(file)
1124
+ # This is our last one file, we must make a copy.
1125
+ tmp_p = tmp_path / "f.bin"
1126
+ tmp_p.unlink(missing_ok=True)
1127
+ shutil.copyfile(str(self._make_dedup_file_path(file_id)), str(tmp_p))
911
1128
 
912
- break
1129
+ file_id = obj.pending_file_ids.pop()
1130
+ p = self._make_dedup_file_path(file_id)
1131
+ _mkdirp(p.parent)
1132
+ tmp_p.rename(p)
1133
+ obj.completed_file_ids[0] = file_id
1134
+ updates.file_updates.append({"file_id": file_id, "obj_id": obj.id})
1135
+ # We made a copy. Hope it works now.
913
1136
 
914
1137
  @property
915
1138
  def _PendingUpdater(self):
@@ -940,8 +1163,24 @@ class Dedup(abc.ABC):
  """
  self._check_links(path, False)
 
+ @cached_property
+ def _sql_checklinks(self):
+ tmp = self._tmp_check_links
+ t_links = sao.aliased(tmp.links)
+ F = sao.aliased(mo.File)
+ L = sao.aliased(mo.Link)
+
+ # Invalidate link_count for affected dedup Files.
+ q = sa.select(L.file_id).join(t_links, L.path == t_links.c.path)
+ r0 = sa.update(F).values(link_count=-1).where(F.id.in_(q))
+
+ # Delete Link records.
+ r1 = sa.delete(L).where(L.path.in_(sa.select(t_links.c.path)))
+
+ return tuple(_ns(x) for x in (r0, r1))
+
  def _check_links(self, path: Path | None, pre_delete: bool) -> None:
- F = sao.aliased(mo.DedupFile)
+ F = sao.aliased(mo.File)
  L = sao.aliased(mo.Link)
 
  _verify_link = self._verify_link
@@ -956,10 +1195,10 @@ class Dedup(abc.ABC):
  # do any checking.
  _verify_link = lambda link: False
 
- q = sa.select(L).order_by(L.link_path).options(sao.joinedload(L.file))
+ q = sa.select(L).order_by(L.path).options(sao.joinedload(L.file))
  q = q.limit(self._batch_size)
  if prefix is not None:
- q = q.where((L.link_path == exact_path) | bytes_startswith(L.link_path, prefix))
+ q = q.where((L.path == exact_path) | bytes_startswith(L.path, prefix))
 
  with self._SessionR() as s:
  last_link_path: str | None = None
@@ -967,7 +1206,7 @@ class Dedup(abc.ABC):
  if last_link_path is None:
  q2 = q
  else:
- q2 = q.where(L.link_path > last_link_path)
+ q2 = q.where(L.path > last_link_path)
 
  results: list[mo.Link] = s.execute(q2).scalars().all()
  if not results:
@@ -976,74 +1215,115 @@ class Dedup(abc.ABC):
976
1215
  to_delete = []
977
1216
  for link in results:
978
1217
  if not _verify_link(link):
979
- to_delete.append(link.link_path)
1218
+ # TODO: Instead of just deleting them from the DB, maybe we should keep
1219
+ # track of invalid links or even repair them?
1220
+ to_delete.append(link.path)
980
1221
 
981
1222
  if to_delete:
982
- with self._beginw() as s2, temporary_table(
983
- s2, mo.tmp_bytes
984
- ) as t_links, temporary_table(s2, mo.tmp_ints) as t_files:
1223
+ with self._beginw() as s2, mo.tmp_check_links(s2, "") as tmp:
1224
+ # 1. Insert Link paths into a temporary table.
985
1225
  s2.connection().execute(
986
- sa.insert(t_links), [{"id": x} for x in to_delete]
1226
+ sa.insert(tmp.links), [{"path": x} for x in to_delete]
987
1227
  ).close()
988
1228
 
989
- # There are the DedupFile entries that may end up orphaned.
990
- s2.connection().execute(
991
- sa.insert(t_files).from_select(
992
- [t_files.c.id],
993
- sa.select(F.id)
994
- .distinct()
995
- .select_from(L)
996
- .join(F, L.file)
997
- .join(t_links, t_links.c.id == L.link_path),
998
- )
999
- ).close()
1229
+ # 2. Invalidate link_count inside parent Files.
1230
+ # 3. Delete Links.
1231
+ # 4. Recompute link_count for affected Files.
1232
+ for stmt in self._sql_checklinks:
1233
+ s2.execute(stmt).close()
1000
1234
 
1001
- # Remove the links that have been deleted.
1002
- s2.connection().execute(
1003
- sa.delete(L).where(L.link_path.in_(sa.select(t_links.c.id))),
1004
- ).close()
1235
+ last_link_path = results[-1].path
1005
1236
 
1006
- # Detect newly-orphaned files.
1007
- s2.connection().execute(
1008
- F.make_update_orphaned().where(F.id.in_(sa.select(t_files.c.id)))
1009
- ).close()
1010
-
1011
- last_link_path = results[-1].link_path
1012
-
1013
- def update_all_orphaned(self):
1237
+ @cached_property
1238
+ def _sql_orph_update(self):
1239
+ now = sa.bindparam("p_now")
1240
+ updated_since = sa.bindparam("p_updated_since")
1241
+ Obj = sao.aliased(mo.Obj)
1242
+ return Obj.make_sql_update_orphaned(now).where(Obj.updated_at >= updated_since)
1243
+
1244
+ def detect_orphaned(self):
1245
+ # Update link count for files where it was invalidated.
1246
+ self.integrity_check(skip_same_mtime=True, only_invalid_link_count=True)
1247
+
1248
+ C = sao.aliased(mo.DedupConfig)
1249
+ KEY = "last_detect_orphaned"
1014
1250
  with self._beginw() as s:
1015
- F = sao.aliased(mo.DedupFile)
1016
- s.connection().execute(F.make_update_orphaned()).close()
1251
+ last_check = s.execute(sa.select(C).where(C.key == KEY)).scalar()
1252
+ if last_check is None:
1253
+ s.add(last_check := mo.DedupConfig(key=KEY, value="0"))
1254
+ since = int(last_check.value)
1255
+ now = mo.now()
1256
+ s.execute(self._sql_orph_update, {"p_now": now, "p_updated_since": since}).close()
1257
+ last_check.value = str(now)
1017
1258
 
1018
- def garbage_collect_dedup_files(self, min_age_seconds: int) -> None:
1259
+ @cached_property
1260
+ def _sql_gc_orphaned_to_pending(self):
1261
+ O = sao.aliased(mo.Obj)
1262
+ F = sao.aliased(mo.File)
1263
+ F2 = sao.aliased(mo.File)
1264
+ q = sa.select(F.id).join(O, F.obj).where(O.orphaned_at < sa.bindparam("p_cutoff"))
1265
+ return _ns(
1266
+ sa.update(F2).values(pending_id=sa.bindparam("p_pending_id")).where(F2.id.in_(q))
1267
+ )
1268
+
1269
+ @cached_property
1270
+ def _sql_gc_select_pending(self):
1271
+ F = sao.aliased(mo.File)
1272
+ P = sao.aliased(mo.Pending)
1273
+ L = sao.aliased(mo.Link)
1274
+ cond = P.expire_at < sa.bindparam("p_cutoff")
1275
+ cond &= P.id != _lit(str(self._corrupted_pending_id))
1276
+ q0 = sa.select(P.id).where(cond)
1277
+ q1 = sa.select(F.id).join(P).where(cond)
1278
+ q2 = sa.select(L.path).where(L.file_id.in_(q1))
1279
+ return q0, q1, q2
1280
+
1281
+ def garbage_collect_dedup_files(
1282
+ self, min_age_orphan_seconds: int, min_age_pending_seconds: int = None
1283
+ ) -> None:
1019
1284
  """
1020
1285
  Remove dedup files that have no links to them as well as dedup files that were left behind
1021
1286
  by a failed batch of content insertion.
1022
1287
  """
1023
- cutoff = mo.now() - min_age_seconds
1024
- pending_cutoff = 7200
1025
- F = sao.aliased(mo.DedupFile)
1026
- P = sao.aliased(mo.Pending)
1027
- q = sa.select(F).options(sao.selectinload(F.links)).limit(self._batch_size).order_by(F.id)
1028
- q1 = q.where(F.orphaned_at != None, F.orphaned_at <= cutoff)
1029
- q2 = q.join(P, F.pending).where(P.expire_at <= pending_cutoff)
1030
- self._garbage_collect_using_query(q1, F)
1031
- self._garbage_collect_using_query(q2, F)
1032
-
1033
- def _garbage_collect_using_query(self, q, F):
1034
- F1 = sao.aliased(mo.DedupFile)
1035
- while True:
1288
+
1289
+ self.detect_orphaned()
1290
+ now = mo.now()
1291
+ orphan_cutoff = now - min_age_orphan_seconds
1292
+ pending_cutoff = now - (min_age_pending_seconds or 7200)
1293
+
1294
+ self._garbage_collect_dedup_files(orphan_cutoff, pending_cutoff)
1295
+
1296
+ def _garbage_collect_dedup_files(self, orphan_cutoff: int | None, pending_cutoff: int):
1297
+ if orphan_cutoff is not None:
1036
1298
  with self._beginw() as s:
1037
- files: list[mo.DedupFile] = s.scalars(q).all()
1038
- if not files:
1039
- break
1040
- s.expunge_all() # remove DedupFile objects from session
1041
- s.connection().execute(sa.delete(F1).where(F1.id.in_(q.with_only_columns(F.id))))
1299
+ s.add(pending := mo.Pending(expire_at=1)) # expiration time far into the past
1300
+ s.flush()
1301
+
1302
+ # Convert orphaned files to pending. We will collect the pending afterwards.
1303
+ params = {"p_pending_id": pending.id, "p_cutoff": orphan_cutoff}
1304
+ s.execute(self._sql_gc_orphaned_to_pending, params).close()
1305
+
1306
+ with self._SessionR.begin() as s:
1307
+ # Iterate through the expired pending File IDs and delete the files and links. Gather the list
1308
+ # of pending objects that are finished.
1309
+ params = {"p_cutoff": pending_cutoff}
1310
+ q0, q1, q2 = self._sql_gc_select_pending
1311
+ pending_ids = s.execute(q0, params).scalars().all()
1312
+
1313
+ for link_path in s.execute(q2, params).scalars():
1314
+ p = self._link_path_from_string(link_path)
1315
+ if p.exists():
1316
+ self._delete_file(p)
1317
+
1318
+ for file_id in s.execute(q1, params).scalars():
1319
+ p = self._make_dedup_file_path(file_id)
1320
+ if p.exists():
1321
+ self._delete_file(p)
1042
1322
 
1043
- for file in files:
1044
- for link in file.links:
1045
- self._delete_file(link._link_path_from_string(link.link_path))
1046
- self._delete_file(self._make_dedup_file_path(file.id))
1323
+ # We only update the database after successfully deleting all the files and links.
1324
+ P = sao.aliased(mo.Pending)
1325
+ with self._beginw() as s:
1326
+ s.execute(sa.delete(P).where(P.id.in_(pending_ids))).close()
1047
1327
 
1048
1328
  def garbage_collect_deleted(self):
1049
1329
  """
@@ -1102,70 +1382,73 @@ class Dedup(abc.ABC):
1102
1382
 
1103
1383
  This recursively lists every file in the dedup store, so it takes a long time.
1104
1384
  """
1105
- F = sao.aliased(mo.DedupFile)
1385
+ F = sao.aliased(mo.File)
1106
1386
  i2p = self._integer_to_path
1107
1387
  cutoff = mo.now() - 3600
1108
1388
 
1389
+ t_f = sao.aliased(self._tmp_delete_extra.files)
1390
+ q = sa.select(t_f.c.id).where(~sa.exists().select_from(F).where(F.id == t_f.c.id))
1391
+
1109
1392
  base = self._path_dedup
1110
- for chunk in chunked_iter(base.rglob("*"), self._batch_size):
1111
- to_be_unlinked = []
1112
- file_ids = {}
1113
- for p in chunk:
1114
- if not p.is_file():
1115
- continue
1393
+ with self._SessionR.begin() as s, mo.tmp_delete_extra(s, "") as tmp:
1394
+ for chunk in chunked_iter(base.rglob("*"), self._batch_size):
1395
+ file_ids = {}
1396
+ for p in chunk:
1397
+ if not p.is_file():
1398
+ continue
1116
1399
 
1117
- try:
1118
- file_id = i2p.invert("/".join(p.relative_to(base).parts))
1119
- except InvalidPathError:
1120
- if p.stat().st_mtime < cutoff:
1121
- to_be_unlinked.append(p)
1122
- continue
1400
+ try:
1401
+ file_id = i2p.invert("/".join(p.relative_to(base).parts))
1402
+ except InvalidPathError:
1403
+ if p.stat().st_mtime < cutoff:
1404
+ self._delete_file(p)
1405
+ continue
1123
1406
 
1124
- file_ids[file_id] = p
1407
+ file_ids[file_id] = p
1408
+
1409
+ if file_ids:
1410
+ s.execute(sa.insert(tmp.files), tuple({"id": x} for x in file_ids)).close()
1411
+ bad_file_ids = s.execute(q).scalars().all()
1412
+ s.execute(sa.delete(tmp.files)).close()
1125
1413
 
1126
- if file_ids:
1127
- # We use a write transaction to avoid a race condition between checking that a path
1128
- # does not contain a valid file ID and then later deleting that file outside the
1129
- # transaction.
1130
- with self._SessionW() as s, temporary_table(s, mo.tmp_ints) as tmp:
1131
- s.execute(sa.insert(tmp), [{"id": x} for x in file_ids]).close()
1132
- tmp_ = sa.alias(tmp)
1133
- bad_file_ids = (
1134
- s.execute(
1135
- sa.select(tmp_.c.id).where(
1136
- ~sa.exists().select_from(F).where(F.id == tmp_.c.id)
1137
- )
1138
- )
1139
- .scalars()
1140
- .all()
1141
- )
1142
1414
  for file_id in bad_file_ids:
1143
1415
  self._delete_file(file_ids[file_id])
1144
1416
 
1145
- for p in to_be_unlinked:
1146
- self._delete_file(p)
1147
-
1148
1417
  def corrupted_list(self) -> ty.Generator[Corrupted]:
1149
1418
  """
1150
1419
  Get the list of corrupted files found using :meth:`integrity_check`.
1151
1420
  """
1152
- for p in self.path_corrupted.glob("*.json"):
1153
- d = json.loads(p.read_bytes())
1154
- yield Corrupted(
1155
- path=bin_path if (bin_path := p.with_suffix(".bin")).exists() else None,
1156
- file_id=d["file_id"],
1157
- exception=d["exception"],
1158
- link_paths=frozenset(d["link_paths"]),
1159
- raw_link_paths=frozenset(d["raw_link_paths"]),
1160
- )
1421
+
1422
+ L = sao.aliased(mo.Link)
1423
+
1424
+ with self._SessionR() as s:
1425
+ for fc in s.execute(sa.select(mo.FileCorruption)).scalars():
1426
+ fc.id
1427
+ links_bytes = s.execute(sa.select(L.path).where(L.file_id == fc.id)).scalars().all()
1428
+ links_paths = tuple((self._link_path_from_string(x)) for x in links_bytes)
1429
+ yield Corrupted(
1430
+ path=self._make_dedup_file_path(fc.id),
1431
+ file_id=fc.id,
1432
+ exception_name=fc.exception_name,
1433
+ exception_string=fc.exception_string,
1434
+ link_paths=links_paths,
1435
+ raw_link_paths=tuple(links_bytes),
1436
+ )
1161
1437
 
1162
1438
  def corrupted_clear(self):
1163
1439
  """
1164
1440
  Delete all corrupted files.
1165
1441
  """
1166
- for glob in ["*.bin", "*.json"]:
1167
- for p in self.path_corrupted.glob(glob):
1168
- self._delete_file(p)
1442
+ F = sao.aliased(mo.File)
1443
+ with self._beginw() as s:
1444
+ s.add(p := mo.Pending(expire_at=1))
1445
+ s.flush()
1446
+ s.execute(
1447
+ sa.update(F)
1448
+ .values(pending_id=p.id)
1449
+ .where(F.pending_id == self._corrupted_pending_id)
1450
+ ).close()
1451
+ self._garbage_collect_dedup_files(orphan_cutoff=None, pending_cutoff=2)
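
corrupted_clear no longer unlinks anything itself: it parks the corrupted rows on a freshly created, already-expired Pending row and lets the normal garbage-collection pass delete rows and files together. A toy model of that "mark now, sweep later" idea, with invented names and no filesystem side effects:

import time

class QuarantineSweeper:
    """Toy model only; the real code stores expiry on Pending rows in SQL."""

    def __init__(self) -> None:
        self.expire_at: dict[int, float | None] = {}  # file id -> expiry, None = keep

    def quarantine(self, ids: list[int]) -> None:
        now = time.time()
        for file_id in ids:
            self.expire_at[file_id] = now  # already expired: the next sweep removes it

    def sweep(self) -> list[int]:
        now = time.time()
        doomed = [i for i, exp in self.expire_at.items() if exp is not None and exp <= now]
        for file_id in doomed:
            del self.expire_at[file_id]  # the real code also unlinks the stored file
        return doomed
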
1169
1452
 
1170
1453
  @staticmethod
1171
1454
  def _copy_tree_default_fallback(src: Path, dst: Path):
@@ -1203,7 +1486,7 @@ class Dedup(abc.ABC):
1203
1486
  if to_copy:
1204
1487
  _run()
1205
1488
 
1206
- def delete_tree(self, p: Path) -> None:
1489
+ def delete_tree(self, p: Path, check_links: bool = True) -> None:
1207
1490
  def f(func, path, exc_info):
1208
1491
  if (p := Path(path)).exists():
1209
1492
  self._move_to_deleted(p)
@@ -1211,7 +1494,8 @@ class Dedup(abc.ABC):
1211
1494
  shutil.rmtree(str(p.absolute()), onerror=f)
1212
1495
  if p.exists():
1213
1496
  self._move_to_deleted(p)
1214
- self.check_links(p)
1497
+ if check_links:
1498
+ self.check_links(p)
1215
1499
 
1216
1500
  def delete_file(self, p: Path) -> None:
1217
1501
  self._delete_file(p)
@@ -1247,20 +1531,24 @@ class Dedup(abc.ABC):
1247
1531
  def _filelock(self, path: Path, blocking: bool):
1248
1532
  return filelock.FileLock(path, blocking=blocking)
1249
1533
 
1250
- @property
1534
+ @cached_property
1535
+ def _path_temporary_simple_dir(self):
1536
+ return self.path_temporary / "simple"
1537
+
1538
+ @cached_property
1251
1539
  def _path_temporary_dirs(self):
1252
1540
  return self.path_temporary / "dirs"
1253
1541
 
1254
- @property
1542
+ @cached_property
1255
1543
  def _path_temporary_lock(self):
1256
1544
  return self.path_temporary / "lock"
1257
1545
 
1258
- @property
1546
+ @cached_property
1259
1547
  def _path_temporary_master_lock(self):
1260
1548
  return self.path_temporary / "master.lock"
1261
1549
 
1262
1550
  @contextlib.contextmanager
1263
- def temporary_directory(self, prefix="tmp_", suffix=""):
1551
+ def temporary_directory(self, prefix="tmp_", suffix="", check_links: bool = True):
1264
1552
  exc = None
1265
1553
  for name in random_names(prefix=prefix, suffix=suffix):
1266
1554
  p = self._path_temporary_dirs / name
@@ -1287,7 +1575,7 @@ class Dedup(abc.ABC):
1287
1575
  yield p
1288
1576
  break
1289
1577
  finally:
1290
- self.delete_tree(p)
1578
+ self.delete_tree(p, check_links=check_links)
1291
1579
 
1292
1580
  # Release the lock file. We will attempt to delete it next.
1293
1581
  ex.close()
@@ -1304,18 +1592,62 @@ class Dedup(abc.ABC):
1304
1592
  else:
1305
1593
  raise AssertionError("retry count exceeded, unknown cause") if exc is None else exc
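
temporary_directory keeps drawing random names until one can be created and gives up with the AssertionError above once the retry budget is spent. A stripped-down sketch of just that loop (the lock-file handling and the deferred delete_tree cleanup are intentionally left out; make_unique_dir and the attempt count are assumptions for illustration):

import secrets
from pathlib import Path

def make_unique_dir(parent: Path, prefix: str = "tmp_", suffix: str = "",
                    attempts: int = 100) -> Path:
    # Keep drawing random names until mkdir succeeds; FileExistsError only
    # means a collision with a concurrent caller, so try a fresh name.
    parent.mkdir(parents=True, exist_ok=True)
    for _ in range(attempts):
        p = parent / f"{prefix}{secrets.token_hex(8)}{suffix}"
        try:
            p.mkdir()
        except FileExistsError:
            continue
        return p
    raise AssertionError("retry count exceeded")
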
1306
1594
 
1595
+ @cached_property
1596
+ def _sql_obh_select_file(self):
1597
+ F = sao.aliased(mo.File)
1598
+ O = sao.aliased(mo.Obj)
1599
+ H = sao.aliased(mo.Hash)
1600
+ b = sa.bindparam
1601
+ q = (
1602
+ sa.select(
1603
+ F.id.label("file_id"),
1604
+ sa.case((O.orphaned_at != None, O.id), else_=None).label("obj_id"),
1605
+ )
1606
+ .join(O, F.obj)
1607
+ .join(H, O.hashes)
1608
+ .where(F.pending_id == None, H.hash_function == b("p_hf"), H.hash == b("p_h"))
1609
+ .limit(_lit("1"))
1610
+ )
1611
+ return q
1612
+
1613
+ @cached_property
1614
+ def _sql_obh_update_obj(self):
1615
+ O = sao.aliased(mo.Obj)
1616
+ b = sa.bindparam
1617
+ return (
1618
+ sa.update(O)
1619
+ .values(orphaned_at=sa.case((O.orphaned_at != None, b("p_now")), else_=None))
1620
+ .where(O.id == b("p_obj_id"))
1621
+ )
1622
+
1623
+ def open_by_hash(self, digest: mh.Digest) -> ty.BinaryIO | None:
1624
+ d = {"p_hf": digest.function.function_code, "p_h": digest.digest}
1625
+ with self._beginw() as s:
1626
+ c = s.connection()
1627
+ if (r := c.execute(self._sql_obh_select_file, d).one_or_none()) is None:
1628
+ return None
1629
+ file_id, obj_id = r
1630
+
1631
+ if obj_id is not None:
1632
+ d = {"p_now": mo.now(), "p_obj_id": obj_id}
1633
+ c.execute(self._sql_obh_update_obj, d).close()
1634
+
1635
+ return self._make_dedup_file_path(file_id).open("rb")
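
open_by_hash is essentially a content-addressed lookup: resolve a digest to a stored file, clear any orphaned marker so the collector keeps it alive, and hand back a binary handle or None. The sketch below flattens that to a digest-named file on disk purely to show the lookup contract (None for an unknown digest, an open handle otherwise); the orphan bookkeeping and the integer file IDs of the real store are deliberately left out:

import hashlib
from pathlib import Path
from typing import BinaryIO

class ContentStore:
    """Digest-addressed toy store; vocker keys stored files by integer id instead."""

    def __init__(self, root: Path) -> None:
        self.root = root
        self.root.mkdir(parents=True, exist_ok=True)

    def add(self, data: bytes) -> bytes:
        digest = hashlib.sha256(data).digest()
        target = self.root / digest.hex()
        if not target.exists():
            tmp = target.with_suffix(".tmp")
            tmp.write_bytes(data)
            tmp.replace(target)  # publish atomically
        return digest

    def open_by_hash(self, digest: bytes) -> BinaryIO | None:
        target = self.root / digest.hex()
        return target.open("rb") if target.exists() else None
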
1636
+
1307
1637
  @cached_property
1308
1638
  def _q_get_hash(self):
1309
1639
  L = sao.aliased(mo.Link)
1310
- F = sao.aliased(mo.DedupFile)
1640
+ F = sao.aliased(mo.File)
1641
+ O = sao.aliased(mo.Obj)
1311
1642
  H = sao.aliased(mo.Hash)
1312
1643
  return (
1313
- sa.select(L, H, F.size)
1644
+ sa.select(L, H, O.size)
1314
1645
  .select_from(L)
1315
1646
  .join(F, L.file)
1316
- .outerjoin(H, (Rel(H.file) == F) & (H.hash_function == sa.bindparam("x_hf")))
1647
+ .join(O, F.obj)
1648
+ .outerjoin(H, (Rel(H.obj) == O) & (H.hash_function == sa.bindparam("x_hf")))
1317
1649
  .options(sao.contains_eager(L.file.of_type(F)))
1318
- .where(L.link_path == sa.bindparam("x_link_path"), F.pending == None)
1650
+ .where(L.path == sa.bindparam("x_link_path"), F.pending == None)
1319
1651
  )
1320
1652
 
1321
1653
  def _query_by_link_path(
@@ -1357,19 +1689,24 @@ class Dedup(abc.ABC):
1357
1689
  ) -> tuple[int, mh.Digest] | None:
1358
1690
  r = self.get_file_hash(hash_function, path, **kw)
1359
1691
  if r is None:
1360
- hasher = hash_function()
1361
- size = 0
1362
1692
  with path.open("rb") as f:
1363
- while block := f.read(65536):
1364
- size += len(block)
1365
- hasher.update(block)
1366
- r = size, hasher.digest()
1693
+ r = self._compute_file_hash(hash_function, f)
1367
1694
  return r
1368
1695
 
1696
+ def _compute_file_hash(self, hash_function, file):
1697
+ size = 0
1698
+ hasher = hash_function()
1699
+ while block := file.read(65536):
1700
+ size += len(block)
1701
+ hasher.update(block)
1702
+ return size, hasher.digest()
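
_compute_file_hash is the standard streaming-hash loop. The same thing written against hashlib, assuming only an update()/digest() interface like hashlib's:

import hashlib
import typing as ty

def compute_file_hash(file: ty.BinaryIO, algorithm: str = "sha256") -> tuple[int, bytes]:
    """Return (size, digest) without loading the whole file into memory."""
    hasher = hashlib.new(algorithm)
    size = 0
    while block := file.read(65536):  # same 64 KiB block size as above
        size += len(block)
        hasher.update(block)
    return size, hasher.digest()

# Usage:
# with open("big.bin", "rb") as f:
#     size, digest = compute_file_hash(f)
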
1703
+
1369
1704
  def adopt_files(
1370
1705
  self, hash_function: mh.HashFunction, requests: ty.Iterable[AdoptRequest]
1371
1706
  ) -> None:
1372
1707
  """
1708
+ HACK: DO NOT RUN THIS ON EXISTING DEDUP LINKS
1709
+
1373
1710
  Adopt each file given in *paths*. If the path is already a dedup link, then leave it
1374
1711
  alone. If the path is not a dedup link, then compute its hash and move the file to the
1375
1712
  dedup store and create a link to it. If the path is already a dedup link but does not
@@ -1378,125 +1715,25 @@ class Dedup(abc.ABC):
1378
1715
 
1379
1716
  This method is implemented in a somewhat inefficient way.
1380
1717
  """
1381
- reqs = [_ImplAdoptRequest(req) for req in requests]
1382
-
1383
- # first use a read-only session while we compute file hashes
1384
- with self._SessionR() as s:
1385
- for x in reqs:
1386
- x.link_path = self._link_path_to_string(x.req.path)
1387
- existing = self._query_by_link_path(s, x.link_path, hash_function)
1388
- if existing:
1389
- l, h, sz = existing[0]
1390
- if h is not None:
1391
- x.req.out_digest = h.to_digest()
1392
- x.req.out_size = sz
1393
- x.done = True
1394
-
1395
- if not x.done:
1396
- with open(x.req.path, "rb") as f:
1397
- h = hash_function()
1398
- size = 0
1399
- while block := f.read(65536):
1400
- h.update(block)
1401
- size += len(block)
1402
- x.req.out_digest = h.digest()
1403
- x.file_metadata = DedupFileMetadata(executable=False) # TODO
1404
- x.req.out_size = size
1405
- x.file_metadata_bytes = self.convert_file_metadata_to_bytes(x.file_metadata)
1406
-
1407
- F = sao.aliased(mo.DedupFile)
1408
- H = sao.aliased(mo.Hash)
1409
- q = (
1410
- sa.select(F)
1411
- .join(H, F.hashes)
1412
- .where(
1413
- H.hash_function == sa.bindparam("x_hf"),
1414
- H.hash == sa.bindparam("x_h"),
1415
- F.pending == None,
1416
- F.file_metadata == sa.bindparam("x_f_meta"),
1718
+ self.run_batch(
1719
+ DedupLinkRequest(
1720
+ hash_function=hash_function,
1721
+ link_path=req.path,
1722
+ tags=req.tags,
1723
+ file_metadata=self.get_metadata_from_file(req.path),
1724
+ open_file_once=None,
1725
+ adopt_existing=True,
1726
+ file_contents_hash=None,
1417
1727
  )
1728
+ for req in requests
1418
1729
  )
1419
1730
 
1420
- # then we use a RW session to update the database
1421
- with self._beginw() as s:
1422
- for x in reqs:
1423
- if x.done:
1424
- continue
1425
-
1426
- # re-check for an existing link
1427
- existing = self._query_by_link_path(s, x.link_path, hash_function)
1428
- if existing:
1429
- l, h, sz = existing[0]
1430
- file = l.file
1431
- if h is None:
1432
- s.add(mo.Hash.from_digest(x.req.out_digest, file=file))
1433
- else:
1434
- # never mind, nothing to do here
1435
- x.req.out_size = sz
1436
- x.req.out_digest = h.to_digest()
1437
- x.done = True
1438
- continue
1439
- else:
1440
- # try to lookup by digest first
1441
- # TODO: also look up by tag
1442
- files = (
1443
- s.execute(
1444
- q,
1445
- dict(
1446
- x_hf=hash_function.function_code,
1447
- x_h=x.req.out_digest.digest,
1448
- x_f_meta=x.file_metadata_bytes,
1449
- ),
1450
- )
1451
- .scalars()
1452
- .all()
1453
- )
1454
- if files:
1455
- file = files[0]
1456
- else:
1457
- file = None
1458
- if file is not None:
1459
- file.orphaned_at = None
1460
- x.delete = True
1461
- else:
1462
- # no existing file found, need to create one
1463
- file = mo.DedupFile(
1464
- file_metadata=x.file_metadata_bytes,
1465
- size=x.req.out_size,
1466
- mtime=int(x.req.path.stat().st_mtime),
1467
- orphaned_at=None,
1468
- pending=None,
1469
- hashes=[mo.Hash.from_digest(x.req.out_digest)],
1470
- )
1471
- s.add(file)
1472
- s.flush() # we need to make sure the file has an ID
1473
-
1474
- s.add(mo.Link(link_path=x.link_path, file=file))
1475
-
1476
- x.dedup_file_path = self._make_dedup_file_path(file.id)
1477
-
1478
- # We add our tags.
1479
- self._add_tags_to_file(s, file, x.req.tags)
1480
-
1481
- s.flush()
1482
-
1483
- # and finally we make filesystem changes
1484
- for x in reqs:
1485
- if (dst := x.dedup_file_path) is not None:
1486
- if x.delete:
1487
- # We already have a DedupFile with the required contents, so we replace the
1488
- # link_path file with a link to that existing DedupFile.
1489
- self._delete_file(x.req.path)
1490
- self._create_actual_link(dst, x.req.path)
1491
- else:
1492
- dst.parent.mkdir(exist_ok=True, parents=True)
1493
- self._adopt_file_and_link(x.req.path, dst)
1494
-
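
With run_batch now handling the database side, the filesystem half of adoption amounts to: hash the file, move it into the store under its content address, and leave a hardlink at the original path. A self-contained sketch of just that half; adopt_into, store_root, and the digest-named layout are illustrative only, since the real store uses integer file IDs and records size, metadata, and hashes in the database:

import hashlib
import os
from pathlib import Path

def adopt_into(store_root: Path, path: Path) -> bytes:
    # Stream the hash so large files never have to fit in memory.
    hasher = hashlib.sha256()
    with path.open("rb") as f:
        while block := f.read(65536):
            hasher.update(block)
    digest = hasher.digest()

    store_root.mkdir(parents=True, exist_ok=True)
    stored = store_root / digest.hex()
    if stored.exists():
        # Contents already present: drop the duplicate and link to the stored copy.
        path.unlink()
    else:
        # First copy of these contents: move it into the store...
        os.replace(path, stored)
    # ...and leave a hardlink behind at the original location either way.
    os.link(stored, path)
    return digest
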
1495
1731
  def integrity_check(
1496
1732
  self,
1497
1733
  skip_same_mtime: bool,
1498
1734
  threads: int | None = None,
1499
- keep_corrupted: bool = True,
1735
+ *,
1736
+ only_invalid_link_count: bool = False,
1500
1737
  ):
1501
1738
  """
1502
1739
  Verify all deduplicated files match their stored hashes. Use modification time to skip
@@ -1504,17 +1741,28 @@ class Dedup(abc.ABC):
1504
1741
  :attr:`path_corrupted`.
1505
1742
  """
1506
1743
 
1507
- F = sao.aliased(mo.DedupFile)
1744
+ F = sao.aliased(mo.File)
1745
+ O = sao.aliased(mo.Obj)
1508
1746
  batch_size = 1000
1509
- q = sa.select(F).options(sao.selectinload(F.hashes)).order_by(F.id).limit(batch_size)
1747
+ q = sa.select(F).options(sao.selectinload(F.obj.of_type(O)).selectinload(O.hashes))
1748
+ if only_invalid_link_count:
1749
+ q = q.where(F.link_count < _lit("0"))
1750
+ q = q.where(F.pending_id == None).order_by(F.id).limit(batch_size)
1510
1751
 
1511
- def _hash_check(file: mo.DedupFile) -> None:
1752
+ def _hash_check(file: mo.File) -> None:
1512
1753
  p = self._make_dedup_file_path(file.id)
1513
- st_mtime = int(p.stat().st_mtime)
1514
- if skip_same_mtime and file.mtime == st_mtime:
1515
- return
1754
+ st = p.stat()
1755
+
1756
+ # FIXME: specific to hardlink backend
1757
+ if (n := st.st_nlink - 1) != file.link_count:
1758
+ link_count_updates.append({"id": file.id, "link_count": n})
1759
+ changed_obj_ids.add(file.obj_id)
1760
+
1761
+ if skip_same_mtime:
1762
+ if (st_mtime := int(st.st_mtime)) == self._clean_dedup_mtime:
1763
+ return
1516
1764
 
1517
- d = file.hashes_dict
1765
+ d = file.obj.hashes_dict
1518
1766
  m = mh.MultiHasher({hf: hf() for hf in d})
1519
1767
  with p.open("rb") as fh:
1520
1768
  while block := fh.read(65536):
@@ -1524,22 +1772,21 @@ class Dedup(abc.ABC):
1524
1772
 
1525
1773
  # TODO: also check file metadata matches, such as the executable bit
1526
1774
 
1527
- # The digest was the same, so update the mtime in the DB.
1528
- with self._SessionW() as s:
1529
- IdKey.from_instance(file).get_one(s).mtime = st_mtime
1530
-
1531
- id_min = -1
1775
+ id_min = None
1532
1776
  with cf.ThreadPoolExecutor(max_workers=threads) as exe:
1533
1777
  while True:
1778
+ changed_obj_ids = set()
1779
+ link_count_updates = []
1534
1780
  invalid_file_ids = []
1535
1781
 
1536
1782
  with self._SessionR() as s:
1537
- q2 = q.where(F.id > id_min, F.pending == None)
1538
- dedup_files: list[mo.DedupFile] = s.execute(q2).scalars().all()
1783
+ q2 = q if id_min is None else q.where(F.id > id_min)
1784
+ dedup_files: list[mo.File] = s.execute(q2).scalars().all()
1539
1785
 
1540
1786
  if not dedup_files:
1541
1787
  break
1542
1788
 
1789
+ s.expunge_all()
1543
1790
  id_min = dedup_files[-1].id
1544
1791
  futures = {exe.submit(_hash_check, f): f for f in dedup_files}
1545
1792
  for future in cf.as_completed(futures):
@@ -1549,53 +1796,37 @@ class Dedup(abc.ABC):
1549
1796
  raise exc
1550
1797
 
1551
1798
  file = futures[future]
1552
- self._integrity_check_process_corrupt_one(s, file, exc, keep_corrupted)
1553
- invalid_file_ids.append(file.id)
1799
+ invalid_file_ids.append((file.id, exc))
1554
1800
 
1555
- if invalid_file_ids:
1556
- with self._SessionW() as s:
1801
+ if link_count_updates:
1802
+ with self._beginw() as s:
1803
+ s.execute(sa.update(F), link_count_updates).close()
1804
+ now = mo.now()
1557
1805
  s.connection().execute(
1558
- sa.delete(F).where(F.id == sa.bindparam("_id")),
1559
- [{"_id": x} for x in invalid_file_ids],
1560
- )
1561
-
1562
- def _integrity_check_process_corrupt_one(
1563
- self, s: sao.Session, file: mo.DedupFile, exc: Exception, keep_corrupted: bool
1564
- ):
1565
- """
1566
- Process one file that has been found to be corrupted.
1567
- """
1568
-
1569
- path_file = self._make_dedup_file_path(file.id)
1570
-
1571
- # Load the links as we will need them
1572
- s.refresh(file, ["links"])
1573
-
1574
- link_paths = [self._link_path_from_string(link.link_path) for link in file.links]
1575
- json_data = {
1576
- "file_id": file.id,
1577
- "link_paths": [str(x) for x in link_paths],
1578
- "raw_link_paths": [
1579
- link.link_path.decode("utf-8", errors="replace") for link in file.links
1580
- ],
1581
- "exception": repr(exc),
1582
- }
1583
-
1584
- with create_file_random(self.path_corrupted, "f_", ".json") as f:
1585
- path_json = Path(f.name)
1586
- f.write(json.dumps(json_data, indent=2, sort_keys=True).encode("utf-8"))
1587
-
1588
- if keep_corrupted:
1589
- try:
1590
- path_file.rename(path_json.with_suffix(".bin"))
1591
- except Exception:
1592
- if path_file.exists():
1593
- logger.warning(
1594
- "failed to rename corrupt file", exc_info=True, data=str(path_file)
1595
- )
1806
+ sa.update(O)
1807
+ .where(O.id == sa.bindparam("p_id"))
1808
+ .values(updated_at=sa.bindparam("p_now")),
1809
+ [{"p_id": x, "p_now": now} for x in changed_obj_ids],
1810
+ ).close()
1596
1811
 
1597
- for x in link_paths:
1598
- self._delete_file(x)
1812
+ if invalid_file_ids:
1813
+ with self._beginw() as s:
1814
+ s.execute(
1815
+ sa.insert(mo.FileCorruption),
1816
+ [
1817
+ {
1818
+ "id": file_id,
1819
+ "exception_name": type(exc).__name__,
1820
+ "exception_string": str(exc),
1821
+ }
1822
+ for file_id, exc in invalid_file_ids
1823
+ ],
1824
+ ).close()
1825
+ s.connection().execute(
1826
+ sa.update(F)
1827
+ .values(pending_id=self._corrupted_pending_id)
1828
+ .where(F.id.in_(x[0] for x in invalid_file_ids))
1829
+ ).close()
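
For the hardlink backend, every recorded dedup link is one extra st_nlink on the stored file, so st_nlink - 1 is what link_count should be; the loop above queues an update whenever the two disagree and records hash mismatches as FileCorruption rows instead of moving files aside. The link-count comparison in isolation, with expected_links standing in for whatever the caller's bookkeeping says:

import os
from pathlib import Path

def check_link_count(stored_file: Path, expected_links: int) -> int | None:
    """Return the on-disk link count if it disagrees with the expectation, else None."""
    st = os.stat(stored_file)
    # One hardlink is the store's own copy; the rest are user-visible dedup links.
    actual = st.st_nlink - 1
    return actual if actual != expected_links else None

# Usage sketch:
# if (n := check_link_count(Path("store/000/123"), expected_links=3)) is not None:
#     print("link_count drifted, recording", n)
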
1599
1830
 
1600
1831
  class _compute_stats_ZeroRow:
1601
1832
  orphaned = None
@@ -1603,18 +1834,22 @@ class Dedup(abc.ABC):
1603
1834
  size = 0
1604
1835
 
1605
1836
  def compute_stats(self) -> DedupStats:
1837
+ self.detect_orphaned()
1838
+
1606
1839
  with self._SessionR() as s:
1607
- F = sao.aliased(mo.DedupFile)
1840
+ O = sao.aliased(mo.Obj)
1841
+ F = sao.aliased(mo.File)
1608
1842
  L = sao.aliased(mo.Link)
1609
- orph = F.orphaned_at != None
1843
+ orph = O.orphaned_at != None
1610
1844
 
1611
1845
  q = (
1612
1846
  sa.select(
1613
1847
  orph.label("orphaned"),
1614
1848
  sa.func.count().label("count"),
1615
- sa.func.sum(F.size).label("size"),
1849
+ sa.func.sum(O.size).label("size"),
1616
1850
  )
1617
1851
  .select_from(F)
1852
+ .join(O, F.obj)
1618
1853
  .where(F.pending == None)
1619
1854
  .group_by(orph)
1620
1855
  )
@@ -1622,9 +1857,10 @@ class Dedup(abc.ABC):
1622
1857
  file_stats |= {row.orphaned: row for row in s.execute(q).all()}
1623
1858
 
1624
1859
  q = (
1625
- sa.select(sa.func.count().label("count"), sa.func.sum(F.size).label("size"))
1860
+ sa.select(sa.func.count().label("count"), sa.func.sum(O.size).label("size"))
1626
1861
  .select_from(L)
1627
1862
  .join(F, L.file)
1863
+ .join(O, F.obj)
1628
1864
  ).where(F.pending == None)
1629
1865
  link_stats = s.execute(q).one()
1630
1866
 
@@ -1639,6 +1875,8 @@ class Dedup(abc.ABC):
1639
1875
 
1640
1876
 
1641
1877
  class DedupBackendHardlink(Dedup):
1878
+ max_link_count = 1000 # Windows limits it to 1023
1879
+
1642
1880
  def _create_actual_link(self, existing: Path, new: Path):
1643
1881
  # Path.link_to was removed and replaced by Path.hardlink_to, but I want this to work across
1644
1882
  # Python 3.9 to 3.13
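
The comment above refers to the pathlib API churn: Path.link_to (argument order reversed, removed in Python 3.12) versus Path.hardlink_to (added in 3.10). One way such a shim can be written, shown only as a sketch of that concern; os.link sidesteps the rename entirely:

import os
from pathlib import Path

def make_hardlink(existing: Path, new: Path) -> None:
    # os.link(src, dst) exists on every Python version in the 3.9-3.13 range.
    os.link(existing, new)

# Or, preferring the pathlib API when it is available:
def make_hardlink_pathlib(existing: Path, new: Path) -> None:
    if hasattr(new, "hardlink_to"):   # Python 3.10+
        new.hardlink_to(existing)
    else:                             # Python 3.9: note the reversed direction
        existing.link_to(new)
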
@@ -1649,14 +1887,13 @@ class DedupBackendHardlink(Dedup):
1649
1887
  self._create_actual_link(existing_path, dedup_file_path)
1650
1888
 
1651
1889
  def _verify_link(self, link: mo.Link) -> bool:
1652
- p = Path(link.link_path.decode("utf-8"))
1653
-
1890
+ p = self._link_path_from_string(link.path)
1654
1891
  try:
1655
1892
  a = p.lstat()
1656
1893
  except Exception:
1657
1894
  return False
1658
1895
 
1659
- if link.file.mtime != int(a.st_mtime):
1896
+ if int(a.st_mtime) != self._clean_dedup_mtime:
1660
1897
  return False
1661
1898
 
1662
1899
  # st_ino is 0 on unsupported filesystems on Windows.