vocker-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vocker/dedup.py ADDED
@@ -0,0 +1,1676 @@
1
+ from __future__ import annotations
2
+
3
+ import abc
4
+ import contextlib
5
+ import filelock
6
+ import io
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ import shutil
11
+ import stat
12
+ import threading
13
+ import time
14
+
15
+ import typing as ty
16
+ import attr
17
+ import structlog
18
+ import concurrent.futures as cf
19
+
20
+ import sqlalchemy as sa
21
+ from sqlalchemy import orm as sao
22
+ from sqlalchemy_boltons import sqlite as sq
23
+ from sqlalchemy_boltons.orm import RelationshipComparator as Rel, IdKey
24
+ from sqlalchemy_boltons.temporary import temporary_table
25
+ from sqlalchemy_boltons.core import bytes_startswith
26
+ from boltons.iterutils import chunked_iter
27
+ from cached_property import cached_property
28
+
29
+ from .integer_to_path import IntegerToPath, InvalidPathError
30
+ from .util import pathwalk, random_names, create_file_random, supports_executable
31
+ from . import dedup_models as mo
32
+ from . import multihash as mh
33
+
34
+
35
+ logger = structlog.get_logger(__name__)
36
+
37
+
38
+ @attr.s(eq=False, hash=False)
39
+ class Corrupted:
40
+ path: Path | None = attr.ib()
41
+ file_id: int = attr.ib()
42
+ exception: str = attr.ib()
43
+ link_paths: frozenset[str] = attr.ib()
44
+ raw_link_paths: frozenset[str] = attr.ib()
45
+
46
+ def to_json(self):
47
+ d = attr.asdict(self)
48
+ d["path"] = p if (p := d["path"]) is None else str(p)
49
+ for k in ("link_paths", "raw_link_paths"):
50
+ d[k] = sorted(d[k])
51
+ return d
52
+
53
+
54
+ @attr.s(eq=False, hash=False, kw_only=True)
55
+ class DedupFileMetadata:
56
+ executable: bool = attr.ib(default=False)
57
+
58
+ @classmethod
59
+ def make_plain(cls):
60
+ return cls(executable=False)
61
+
62
+
63
+ @attr.s(eq=False, hash=False, auto_exc=True)
64
+ class InvalidContentsError(Exception):
65
+ message = attr.ib(default="file contents do not match hash")
66
+ link_request: DedupLinkRequest | None = attr.ib(default=None)
67
+ hashes_expected: dict[mh.HashFunction, mh.Digest] | None = attr.ib(default=None)
68
+ hashes_observed: dict[mh.HashFunction, mh.Digest] | None = attr.ib(default=None)
69
+
70
+
71
+ @attr.s(eq=False, hash=False, auto_exc=True)
72
+ class BatchError(Exception):
73
+ message = attr.ib(default="at least one of the DedupLinkRequests failed")
74
+ requests: list[DedupRequest] | None = attr.ib(default=None)
75
+
76
+
77
+ class NotADedupLinkError(Exception):
78
+ pass
79
+
80
+
81
+ class MissingContentError(Exception):
82
+ pass
83
+
84
+
85
+ @attr.s(eq=False, hash=False, kw_only=True)
86
+ class DedupRequest:
87
+ success: bool = attr.ib(init=False, default=False)
88
+ exc: Exception | None = attr.ib(init=False, default=None)
89
+
90
+ def result(self):
91
+ if self.exc is not None:
92
+ raise self.exc
93
+ return self.success
94
+
95
+
96
+ @attr.s(eq=False, hash=False, kw_only=True)
97
+ class DedupLinkRequest(DedupRequest):
98
+ """
99
+ Represents a single request to link a deduped file at a filesystem location :attr:`link_path`.
100
+ If the file is already in the dedup folder, then link it. Otherwise add it to the dedup folder
101
+ by first getting its contents from :attr:`open_file_once`. These requests are batched and
102
+ executed together.
103
+
104
+ If a file already exists at :attr:`link_path`, then it will be removed before linking. If it is
105
+ a directory, then an exception will be raised.
106
+
107
+ The :attr:`open_file_once` function will be called *at most* once. If a deduplicated file
108
+ already exists in the dedup folder with the same :attr:`file_contents_hash` and equal or
109
+ equivalent :attr:`file_metadata`, then it will be reused and the :attr:`open_file_once` function
110
+ will not be called at all.
111
+
112
+ The :attr:`open_file_once` function should return an open file handle from which the file contents can
113
+ be read. If :attr:`open_file_once` is None and no matching deduplicated file is found, then the
114
+ request fails with :class:`MissingContentError`.
115
+
116
+ Each :attr:`open_file_once` function will be called in the order it appears in a batch of
117
+ requests. This guarantee supports the use case of directly decompressing a
118
+ [solid archive](https://en.wikipedia.org/wiki/Solid_archive), in which case file contents
119
+ become available in a sequential manner as the archive is decompressed and it is impossible
120
+ to efficiently access files in a random order.
121
+
122
+ The file contents hash will be (over)written to :attr:`file_contents_hash`.
123
+
124
+ The :attr:`tags` argument provides labels that can be used to refer to a deduplicated
125
+ file. If there exists another deduplicated file that shares at least one tag with :attr:`tags`,
126
+ then that deduplicated file will be used. That existing deduplicated file will be used
127
+ regardless of the :attr:`file_contents_hash`.
128
+
129
+ If :attr:`file_contents_hash` is None and none of the :attr:`tags` matched,
130
+ then :attr:`open_file_once` will always be called. Without the content hash, we have no way
131
+ of checking whether a deduplicated file with the same hash exists.
132
+ """
133
+
134
+ hash_function: mh.HashFunction = attr.ib()
135
+ link_path: Path = attr.ib()
136
+ file_metadata: DedupFileMetadata = attr.ib()
137
+ file_contents_hash: mh.Digest | None = attr.ib()
138
+ open_file_once: ty.Callable[[], ty.BinaryIO] | None = attr.ib()
139
+ file_not_needed: ty.Callable[[], None] | None = attr.ib(default=None)
140
+ tags: ty.Set[bytes] = attr.ib(factory=frozenset)
141
+
142
+ @classmethod
143
+ def from_content(cls, content: bytes, **kwargs):
144
+ kwargs.setdefault("open_file_once", None)
145
+ kwargs.setdefault("file_contents_hash", None)
146
+ return cls(**kwargs).set_content(content)
147
+
148
+ def set_content(self, content: bytes):
149
+ self.file_contents_hash = self.hash_function().update(content).digest()
150
+ self.open_file_once = lambda: io.BytesIO(content)
151
+ return self
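Illustrative usage (editor's sketch, not part of the package): the snippet below shows how the link requests described above might be built and executed as a batch. The store location, link paths, staging file, and tag value are hypothetical; the classes and the "sha2-256" registry name come from this module.

from pathlib import Path
from vocker.dedup import DedupBackendHardlink, DedupFileMetadata, DedupLinkRequest
from vocker import multihash as mh

dedup = DedupBackendHardlink(Path("/srv/dedup-store"))  # hypothetical base directory
sha256 = mh.registry.name_to_hash["sha2-256"]

requests = [
    # Contents supplied up front; from_content() computes file_contents_hash.
    DedupLinkRequest.from_content(
        b"hello world\n",
        hash_function=sha256,
        link_path=Path("/srv/tree/hello.txt"),  # hypothetical link target
        file_metadata=DedupFileMetadata.make_plain(),
    ),
    # Contents supplied lazily; open_file_once is called at most once, in batch order.
    DedupLinkRequest(
        hash_function=sha256,
        link_path=Path("/srv/tree/data.bin"),
        file_metadata=DedupFileMetadata(executable=False),
        file_contents_hash=None,
        open_file_once=lambda: open("/srv/staging/data.bin", "rb"),
        tags=frozenset({b"example:data.bin"}),
    ),
]
dedup.run_batch(requests)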
152
+
153
+
154
+ @attr.s(eq=False, hash=False, kw_only=True)
155
+ class _ImplDedupRequestCommon:
156
+ index: int = attr.ib()
157
+ failed: bool = attr.ib(default=False)
158
+
159
+ @abc.abstractmethod
160
+ def set_failed(self, exc): ...
161
+
162
+
163
+ @attr.s(eq=False, hash=False, kw_only=True)
164
+ class _ImplDedupLinkRequest(_ImplDedupRequestCommon):
165
+ req: DedupLinkRequest = attr.ib(default=None)
166
+ lookup_key = attr.ib(default=None)
167
+ dedup_file_path: Path = attr.ib(default=None)
168
+ link_path_str: bytes | None = attr.ib(default=None)
169
+ file: IdKey[mo.DedupFile] | None = attr.ib(default=None)
170
+ metadata_bytes: bytes | None = attr.ib(default=None)
171
+ file_size: int = attr.ib(default=None)
172
+ file_mtime: int = attr.ib(default=None)
173
+ fast_path: bool = attr.ib(default=False) # can we use the fast-path without db transaction?
174
+ is_new: bool = attr.ib(default=False) # is it a brand new FileDedup?
175
+ hashes_promised: dict[mh.HashFunction, mh.Digest] = attr.ib(default=None)
176
+ hashes_computed: dict[mh.HashFunction, mh.Digest] | None = attr.ib(default=None)
177
+ called_file: bool = attr.ib(default=False)
178
+
179
+ def set_failed(self, exc):
180
+ self.req.exc = exc
181
+ self.failed = True
182
+ self.call_file_not_needed()
183
+
184
+ def call_file_not_needed(self) -> None:
185
+ if not self.called_file:
186
+ if (f := self.req.file_not_needed) is not None:
187
+ try:
188
+ f()
189
+ except Exception:
190
+ logger.warning("uncaught exception", exc_info=True)
191
+ self.called_file = True
192
+
193
+ def call_open_file_once(self):
194
+ if self.called_file:
195
+ raise AssertionError
196
+ try:
197
+ return self.req.open_file_once()
198
+ finally:
199
+ self.called_file = True
200
+
201
+
202
+ @attr.s(eq=False, hash=False)
203
+ class DedupCopyLinkRequest(DedupRequest):
204
+ src: Path = attr.ib()
205
+ dst: Path = attr.ib()
206
+
207
+
208
+ @attr.s(eq=False, hash=False, kw_only=True)
209
+ class _ImplDedupCopyLinkRequest(_ImplDedupRequestCommon):
210
+ req: DedupCopyLinkRequest = attr.ib()
211
+ src_str: str = attr.ib(default=None)
212
+ dst_str: str = attr.ib(default=None)
213
+ dedup_file_path: Path = attr.ib(default=None)
214
+
215
+ def set_failed(self, exc):
216
+ self.req.exc = exc
217
+ self.failed = True
218
+
219
+
220
+ @attr.s(eq=False, hash=False)
221
+ class AdoptRequest:
222
+ path: Path = attr.ib()
223
+ tags: ty.Set[bytes] = attr.ib(factory=frozenset)
224
+
225
+ out_size: int | None = attr.ib(init=False, default=None)
226
+ out_digest: mh.Digest | None = attr.ib(init=False, default=None)
227
+
228
+
229
+ @attr.s(eq=False, hash=False)
230
+ class _ImplAdoptRequest:
231
+ req: AdoptRequest = attr.ib()
232
+ link_path: bytes = attr.ib(default=None)
233
+ file_metadata: DedupFileMetadata = attr.ib(default=None)
234
+ file_metadata_bytes: bytes = attr.ib(default=None)
235
+ done: bool = attr.ib(default=False)
236
+ dedup_file_path: Path = attr.ib(default=None)
237
+ delete: bool = attr.ib(default=False)
238
+
239
+
240
+ """
241
+ @attr.s(eq=False, hash=False)
242
+ class DedupUnlinkRequest(DedupRequest):
243
+ link_path: Path = attr.ib()
244
+ """
245
+
246
+
247
+ class DedupError:
248
+ pass
249
+
250
+
251
+ @attr.s(frozen=True)
252
+ class DedupStats:
253
+ dedup_count: int = attr.ib()
254
+ orphaned_count: int = attr.ib()
255
+ link_count: int = attr.ib()
256
+ dedup_total_bytes: int = attr.ib()
257
+ orphaned_total_bytes: int = attr.ib()
258
+ link_total_bytes: int = attr.ib()
259
+
260
+ def to_json(self):
261
+ return attr.asdict(self)
262
+
263
+
264
+ @attr.s(frozen=True)
265
+ class DedupFile:
266
+ pass
267
+
268
+
269
+ @attr.s(eq=False, hash=False)
270
+ class _PendingUpdater:
271
+ sessionmaker_r: sao.sessionmaker = attr.ib()
272
+ sessionmaker_w: sao.sessionmaker = attr.ib()
273
+ pending: IdKey[mo.Pending] = attr.ib()
274
+ seconds_in_the_future: int = attr.ib()
275
+ update_interval: float = attr.ib(default=None)
276
+ _should_exit = False
277
+ update_on_exit: bool = attr.ib(default=False)
278
+
279
+ def __attrs_post_init__(self):
280
+ if self.update_interval is None:
281
+ self.update_interval = (self.seconds_in_the_future - 3) / 2
282
+
283
+ if (u := self.update_interval) < 1:
284
+ raise ValueError(f"invalid update_interval={u!r}")
285
+
286
+ def _update(self):
287
+ with self.sessionmaker_w() as s:
288
+ pending: mo.Pending = self.pending.get_one(s)
289
+ pending.expire_at = mo.now() + self.seconds_in_the_future
290
+
291
+ def _thread_target(self):
292
+ while not self._should_exit:
293
+ t = self.update_interval
294
+ try:
295
+ self._update()
296
+ except Exception:
297
+ logger.warning("failed to update pending", exc_info=True)
298
+ t = 1 # try again soon
299
+ self._event.wait(t)
300
+ self._event.clear()
301
+ if self.update_on_exit:
302
+ self._update()
303
+
304
+ def start(self):
305
+ self._should_exit = False
306
+ self._event = threading.Event()
307
+ self._thread = t = threading.Thread(target=self._thread_target)
308
+ t.start()
309
+
310
+ def stop(self):
311
+ self._should_exit = True
312
+ self._event.set()
313
+ self._thread.join()
314
+
315
+ def __enter__(self):
316
+ self.start()
317
+ return self
318
+
319
+ def __exit__(self, exc_type, exc_value, traceback):
320
+ self.stop()
321
+
322
+
323
+ class SkippedReqException(Exception):
324
+ pass
325
+
326
+
327
+ def make_sqlite_options(synchronous):
328
+ return sq.Options.new(
329
+ timeout=60.0,
330
+ begin="DEFERRED",
331
+ foreign_keys="DEFERRED",
332
+ recursive_triggers=True,
333
+ trusted_schema=True,
334
+ schemas={"main": sq.SchemaOptions.new(journal="WAL", synchronous=synchronous)},
335
+ )
336
+
337
+
338
+ @attr.s(eq=False, hash=False)
339
+ class Dedup(abc.ABC):
340
+ base_path: Path = attr.ib()
341
+ extra_hashes: ty.Set[mh.HashFunction] = attr.ib(
342
+ factory=lambda: {mh.registry.name_to_hash["sha2-256"]}
343
+ )
344
+ _path_dedup: Path | None = attr.ib(default=None, kw_only=True)
345
+ _path_db: Path | None = attr.ib(default=None, kw_only=True)
346
+ path_temporary: Path | None = attr.ib(default=None, kw_only=True)
347
+ path_deleted: Path | None = attr.ib(default=None, kw_only=True)
348
+ path_corrupted: Path | None = attr.ib(default=None, kw_only=True)
349
+ _integer_to_path = attr.ib(factory=IntegerToPath, kw_only=True)
350
+ _sqlite_synchronous = attr.ib(default="NORMAL", kw_only=True)
351
+ _batch_size = 1000
352
+
353
+ def __attrs_post_init__(self):
354
+ if self._path_dedup is None:
355
+ self._path_dedup = self.base_path / "f"
356
+
357
+ if self._path_db is None:
358
+ self._path_db = self.base_path / "dedup.db"
359
+
360
+ if self.path_deleted is None:
361
+ self.path_deleted = self.base_path / "deleted"
362
+
363
+ if self.path_temporary is None:
364
+ self.path_temporary = self.base_path / "tmp"
365
+
366
+ if self.path_corrupted is None:
367
+ self.path_corrupted = self.base_path / "corrupted"
368
+
369
+ self._path_dedup.mkdir(exist_ok=True, parents=True)
370
+ self._path_db.parent.mkdir(exist_ok=True, parents=True)
371
+ self.path_corrupted.mkdir(exist_ok=True, parents=True)
372
+ self.path_deleted.mkdir(exist_ok=True, parents=True)
373
+ self._path_temporary_dirs.mkdir(exist_ok=True, parents=True)
374
+ self._path_temporary_lock.mkdir(exist_ok=True, parents=True)
375
+ engine = sq.create_engine_sqlite(self._path_db, create_engine_args=dict(echo=False))
376
+ engine = make_sqlite_options(synchronous=self._sqlite_synchronous).apply(engine)
377
+ self._engine_r = engine
378
+ self._engine_w = sq.Options.apply_lambda(engine, lambda x: x.evolve(begin="IMMEDIATE"))
379
+
380
+ self._SessionR = sao.sessionmaker(self._engine_r)
381
+ self._SessionW = sao.sessionmaker(self._engine_w)
382
+
383
+ # FIXME: use proper session management
384
+ # self.session = Session(self.engine_rw) # HACK
385
+ # self.engine = self.engine_rw # HACK
386
+
387
+ self._initialize_db()
388
+
389
+ def _initialize_db(self):
390
+ """Initialize the database schema."""
391
+ with self._engine_w.connect() as conn:
392
+ mo.BaseDedup.metadata.create_all(conn)
393
+ conn.commit()
394
+
395
+ @contextlib.contextmanager
396
+ def _beginw(self):
397
+ with self._SessionW.begin() as s:
398
+ s.connection() # ensure the transaction is started
399
+ yield s
400
+
401
+ def apply_metadata_to_file(self, path: Path, metadata: DedupFileMetadata) -> None:
402
+ if supports_executable():
403
+ mode = path.lstat().st_mode
404
+ if not stat.S_ISDIR(mode) and bool(stat.S_IXUSR & mode) != metadata.executable:
405
+ mask = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
406
+ new_mode = mode & ~mask
407
+ if metadata.executable:
408
+ new_mode |= mask
409
+ os.chmod(str(path), new_mode, follow_symlinks=False)
410
+
411
+ def get_metadata_from_file(self, path: Path) -> DedupFileMetadata:
412
+ if supports_executable():
413
+ mode = path.stat().st_mode
414
+ if not stat.S_ISREG(mode):
415
+ raise AssertionError
416
+ return DedupFileMetadata(executable=bool(stat.S_IXUSR & mode))
417
+ else:
418
+ return DedupFileMetadata(executable=False)
419
+
420
+ def convert_file_metadata_to_bytes(self, metadata: DedupFileMetadata) -> bytes:
421
+ # TODO: make it platform-dependent whether we care about the executable bit
422
+ return b"x=" + str(int(metadata.executable)).encode("ascii")
423
+
424
+ def _link_path_to_string(self, p: Path) -> bytes:
425
+ return str(p).encode("utf-8")
426
+
427
+ def _link_path_from_string(self, data: bytes) -> Path:
428
+ return Path(data.decode("utf-8"))
429
+
430
+ @contextlib.contextmanager
431
+ def _ignore_skip(self):
432
+ try:
433
+ yield
434
+ except SkippedReqException:
435
+ pass
436
+
437
+ @contextlib.contextmanager
438
+ def _catch_req_exc(self, r: _ImplDedupLinkRequest | _ImplDedupCopyLinkRequest):
439
+ if r.failed:
440
+ raise SkippedReqException from None
441
+ try:
442
+ yield
443
+ except Exception as exc:
444
+ r.set_failed(exc)
445
+ raise SkippedReqException from None
446
+
447
+ def _cfg_hash_functions_get(self, s: sao.Session):
448
+ # TODO: not used yet
449
+ if (cfg := s.get(mo.DedupConfig, "hashes")) is None:
450
+ h = self._DEFAULT_HASHES
451
+ else:
452
+ h = json.loads(cfg.value)
453
+
454
+ return [mh.registry.name_to_hash[name] for name in h]
455
+
456
+ def _cfg_hash_functions_set(self, s: sao.Session, hashes: list[mh.HashFunction]):
457
+ # TODO: not used yet
458
+ if (cfg := s.get(mo.DedupConfig, "hashes")) is None:
459
+ cfg = mo.DedupConfig(key="hashes", value="")
460
+ cfg.value = json.dumps([h.name for h in hashes])
461
+
462
+ def _make_dedup_file(self, link: _ImplDedupLinkRequest, pending=None):
463
+ f = mo.Hash.from_digest
464
+ return mo.DedupFile(
465
+ file_metadata=link.metadata_bytes,
466
+ size=0,
467
+ mtime=0,
468
+ orphaned_at=None,
469
+ pending=pending,
470
+ hashes=[f(h) for h in link.hashes_promised.values()],
471
+ )
472
+
473
+ def _add_tags_to_file(self, session: sao.Session, file: mo.DedupFile, tags: ty.Set[bytes]):
474
+ if not tags:
475
+ return
476
+
477
+ Tag = sao.aliased(mo.Tag)
478
+ current_tags = frozenset(
479
+ session.execute(sa.select(Tag.name).where(Tag.file == file)).scalars().all()
480
+ )
481
+ for name in tags - current_tags:
482
+ session.add(mo.Tag(name=name, file=file))
483
+
484
+ def _prepare_dedup_file_for_linking(
485
+ self, session: sao.Session, file: mo.DedupFile, link: _ImplDedupLinkRequest
486
+ ):
487
+ if link.is_new:
488
+ # We need to flush so that the DedupFile gets assigned an ID. The merge below needs it.
489
+ session.flush()
490
+
491
+ # We add our tags.
492
+ self._add_tags_to_file(session, file, link.req.tags)
493
+
494
+ # Delete any existing link.
495
+ session.connection().execute(
496
+ sa.delete(mo.Link)
497
+ .where(mo.Link.link_path == link.link_path_str)
498
+ .execution_options(synchronize_session=False)
499
+ )
500
+
501
+ # Create link object.
502
+ session.add(mo.Link(link_path=link.link_path_str, file=file))
503
+
504
+ # Since we created a link, the file is definitely not orphaned.
505
+ file.orphaned_at = None
506
+
507
+ # This also relies on the flush above.
508
+ link.dedup_file_path = self._make_dedup_file_path(file.id)
509
+
510
+ def run_batch(self, requests: ty.Iterable[DedupRequest]) -> None:
511
+ """
512
+ Link and/or copy-link many files using batching for efficiency. If the
513
+ :attr:`DedupLinkRequest.file_contents_hash` attribute is ``None``, then the hash is computed while the contents are written.
514
+
515
+ The requests will be addressed in the order that they appear in the iterable.
516
+
517
+ Notes
518
+ -----
519
+
520
+ The implementation tries to spend as little time as possible inside database transactions.
521
+
522
+ 1. Search database for existing deduplicated files that can be reused. These are files
523
+ that match either the hash or one of the tags.
524
+ 2. Create a record for each new deduplicated file, attached to a ``Pending`` row that marks it as in progress.
525
+ 3. Write the file contents and create the links outside of the database transaction, then finalize the records and delete the ``Pending`` row.
526
+
527
+ NEW IDEA FIXME
528
+ --------------
529
+
530
+ Split into fast path and slow path. If it's a brand new file OR it's an existing file that
531
+ is done being written (not pending), then that's the fast path. Otherwise it's the slow
532
+ path.
533
+
534
+ On the *fast path* we don't need to check what other threads are doing.
535
+
536
+ """
537
+
538
+ links = []
539
+ copies = []
540
+ # unlinks = []
541
+ for i, req in enumerate(requests):
542
+ if isinstance(req, DedupLinkRequest):
543
+ links.append(_ImplDedupLinkRequest(req=req, index=i))
544
+ elif isinstance(req, DedupCopyLinkRequest):
545
+ copies.append(_ImplDedupCopyLinkRequest(req=req, index=i))
546
+ else:
547
+ raise TypeError(f"{type(req)!r}")
548
+
549
+ if links and copies:
550
+ # We don't do this yet because a copy request could interfere with a link request
551
+ # by having the same source or destination link.
552
+ raise AssertionError(
553
+ "doing both links and copies in the same batch is not supported for now"
554
+ )
555
+
556
+ # Preliminaries to do before we start writing to the database.
557
+ all_tags: set[bytes] = set()
558
+ hashes_to_search: list[dict] = []
559
+ with self._SessionR() as s:
560
+ for link in links:
561
+ with self._ignore_skip(), self._catch_req_exc(link):
562
+ req = link.req
563
+ link.link_path_str = self._link_path_to_string(req.link_path)
564
+ # Remove existing file if present. This may raise if the path is actually a
565
+ # directory.
566
+ req.link_path.unlink(missing_ok=True)
567
+
568
+ all_tags |= req.tags
569
+
570
+ link.metadata_bytes = self.convert_file_metadata_to_bytes(req.file_metadata)
571
+
572
+ if (h := req.file_contents_hash) is not None:
573
+ link.lookup_key = h, link.metadata_bytes
574
+ d = {
575
+ "id": link.index,
576
+ "hash_function": h.function.function_code,
577
+ "digest": h.digest,
578
+ "metadata_bytes": link.metadata_bytes,
579
+ }
580
+ hashes_to_search.append(d)
581
+ link.hashes_promised = {h.function: h}
582
+ else:
583
+ link.hashes_promised = {}
584
+
585
+ for copy in copies:
586
+ with self._ignore_skip(), self._catch_req_exc(copy):
587
+ req = copy.req
588
+ copy.src_str = self._link_path_to_string(req.src)
589
+ copy.dst_str = self._link_path_to_string(req.dst)
590
+
591
+ def _q_gather_file_related(s, cls, attribute, values_set):
592
+ """
593
+ Query DedupFile-related information.
594
+ """
595
+ if not values_set:
596
+ return () # short-cut to avoid doing the query at all
597
+ Related = sao.aliased(cls)
598
+ q = sa.select(Related).where(getattr(Related, attribute).in_(values_set))
599
+ q = q.options(sao.joinedload(Related.file))
600
+ return s.execute(q).scalars()
601
+
602
+ # Now we check the database and add file hash records where we can.
603
+ with self._beginw() as s:
604
+ s.add(pending := mo.Pending(expire_at=mo.now() + 30.0))
605
+ s.flush()
606
+ pending_key = IdKey.from_instance(pending)
607
+
608
+ # Load relevant tags.
609
+ q = _q_gather_file_related(s, mo.Tag, "name", all_tags)
610
+ tag_to_file: dict[bytes, mo.DedupFile] = {x.name: x.file for x in q}
611
+
612
+ # Load relevant hashes.
613
+ if hashes_to_search:
614
+ with temporary_table(s, mo.tmp_hash_lookup) as tmp:
615
+ s.connection().execute(sa.insert(tmp), hashes_to_search).close()
616
+ H = sao.aliased(mo.Hash)
617
+ F = sao.aliased(mo.DedupFile)
618
+ q = (
619
+ sa.select(H, F)
620
+ .join(F, H.file)
621
+ .join(
622
+ tmp,
623
+ (tmp.c.digest == H.hash)
624
+ & (tmp.c.hash_function == H.hash_function)
625
+ & (tmp.c.metadata_bytes == F.file_metadata),
626
+ )
627
+ )
628
+ hash_to_file = {
629
+ (h.to_digest(), f.file_metadata): f for h, f in s.execute(q).all()
630
+ }
631
+ else:
632
+ hash_to_file = {}
633
+
634
+ # Construct a set so that we can check for intersection quickly.
635
+ tag_to_file_set = set(tag_to_file)
636
+
637
+ for link in links:
638
+ if link.failed:
639
+ continue
640
+
641
+ req = link.req
642
+
643
+ if overlap := req.tags & tag_to_file_set:
644
+ # We found a deduped file with a common alternate key! We use it!
645
+ file = tag_to_file[next(iter(overlap))]
646
+ elif (key := link.lookup_key) is not None:
647
+ # Check for a deduped file with the same hash.
648
+ file = hash_to_file.get(key, None)
649
+ else:
650
+ file = None
651
+
652
+ if file is None:
653
+ # We did not find a matching file. We create a new one if we can.
654
+ link.is_new = True
655
+ link.fast_path = True
656
+
657
+ if req.open_file_once is None:
658
+ # The user does not actually have the contents of the file. We skip over
659
+ # it.
660
+ link.set_failed(MissingContentError())
661
+ continue
662
+
663
+ # We must create a file.
664
+ s.add(file := self._make_dedup_file(link, pending))
665
+ elif file.pending_id is None:
666
+ # We found a matching file and it is not pending. We can use it directly.
667
+ link.fast_path = True
668
+ else:
669
+ # If the file is still in a pending state, the hashes and tags are unreliable.
670
+ # The file might fail to be written, the hashes might be invalid, etc. We must
671
+ # use the slow path and wait for the file to become ready.
672
+ link.fast_path = False
673
+ file = None
674
+
675
+ if link.fast_path:
676
+ self._prepare_dedup_file_for_linking(s, file, link)
677
+ if link.is_new:
678
+ # If the same file shows up later in the batch, ensure that it is used.
679
+ for v in link.hashes_promised.values():
680
+ hash_to_file[v, file.file_metadata] = file
681
+
682
+ # the _prepare_dedup_file_for_linking caused a flush, so our primary key is ready
683
+ if file is not None:
684
+ link.file = IdKey.from_instance(file)
685
+
686
+ L = sao.aliased(mo.Link)
687
+ q = sa.select(L).where(
688
+ (L.link_path == sa.bindparam("x_src")) | (L.link_path == sa.bindparam("x_dst"))
689
+ )
690
+ for copy in copies:
691
+ with self._ignore_skip(), self._catch_req_exc(copy):
692
+ link_objs = {
693
+ x.link_path: x
694
+ for x in s.execute(q, {"x_src": copy.src_str, "x_dst": copy.dst_str})
695
+ .scalars()
696
+ .all()
697
+ }
698
+
699
+ if (src_link := link_objs.get(copy.src_str)) is None:
700
+ raise NotADedupLinkError
701
+
702
+ if (dst_link := link_objs.get(copy.dst_str)) is not None:
703
+ s.delete(dst_link)
704
+
705
+ copy.dedup_file_path = self._make_dedup_file_path(src_link.file_id)
706
+ s.add(mo.Link(file_id=src_link.file_id, link_path=copy.dst_str))
707
+ s.flush()
708
+ del q, L
709
+
710
+ pending.expire_at = mo.now() + 30.0
711
+
712
+ del hash_to_file, tag_to_file, tag_to_file_set, pending
713
+
714
+ to_be_flushed = []
715
+ failed_requests = []
716
+
717
+ def _flush_now(s: sao.Session):
718
+ for link in to_be_flushed:
719
+ file: mo.DedupFile | None = None if (f := link.file) is None else f.get(s)
720
+
721
+ if link.failed or file is None:
722
+ failed_requests.append(link.req)
723
+ if file is not None:
724
+ s.delete(file)
725
+ continue
726
+
727
+ if (size := link.file_size) is not None:
728
+ file.size = size
729
+ if (mtime := link.file_mtime) is not None:
730
+ file.mtime = mtime
731
+
732
+ # We need to add whatever extra hashes were computed.
733
+ if d := link.hashes_computed:
734
+ already_in_db = link.hashes_promised
735
+ for k, v in d.items():
736
+ if k not in already_in_db:
737
+ s.add(mo.Hash.from_digest(v, file=file))
738
+
739
+ # We checked the hashes (if any), the file contents are written, and the link
740
+ # (if any) has been created. We are therefore ready to set the "file.pending"
741
+ # column to NULL, thus marking the dedup file as finalized.
742
+ file.pending = None
743
+
744
+ to_be_flushed.clear()
745
+
746
+ for copy in copies:
747
+ with self._ignore_skip(), self._catch_req_exc(copy):
748
+ self._delete_file(copy.req.dst)
749
+ self._create_actual_link(copy.dedup_file_path, copy.req.dst)
750
+
751
+ if links:
752
+ # Now we write the file data without holding the database transaction open. The
753
+ # "_PendingUpdater" ensures that other threads know that we're working.
754
+ with self._PendingUpdater(
755
+ pending=pending_key,
756
+ sessionmaker_r=self._SessionR,
757
+ sessionmaker_w=self._SessionW,
758
+ seconds_in_the_future=20,
759
+ ) as pu:
760
+ for link in links:
761
+ with self._ignore_skip(), self._catch_req_exc(link):
762
+ if not link.fast_path:
763
+ with self._beginw() as s:
764
+ _flush_now(s)
765
+ self._slow_path_wait_for_dedup_file(link=link, pending=pending_key)
766
+
767
+ self._write_dedup_file_contents(link=link)
768
+ to_be_flushed.append(link)
769
+ pu.update_on_exit = True
770
+
771
+ with self._beginw() as s:
772
+ _flush_now(s)
773
+
774
+ # Delete Pending object along with any DedupFile objects that had errors in them
775
+ # using the "ON DELETE CASCADE".
776
+ s.delete(pending_key.get_one(s))
777
+
778
+ for link in links:
779
+ link.req.success = not link.failed
780
+
781
+ if copies:
782
+ for copy in copies:
783
+ copy.req.success = not copy.failed
784
+ if not copy.req.success:
785
+ failed_requests.append(copy.req)
786
+
787
+ if failed_requests:
788
+ first_exc = failed_requests[0].exc
789
+ raise BatchError(requests=failed_requests) from first_exc
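Editor's sketch of handling per-request failures after a batch, based on the BatchError/result() behaviour implemented above; dedup and requests are assumed to exist as in the earlier sketch.

from vocker.dedup import BatchError, InvalidContentsError, MissingContentError

try:
    dedup.run_batch(requests)
except BatchError as exc:
    # Each failed request keeps its own exception; result() re-raises it.
    for req in exc.requests:
        try:
            req.result()
        except MissingContentError:
            print("no content available for", req.link_path)
        except InvalidContentsError as bad:
            print("hash mismatch:", bad.hashes_expected, "!=", bad.hashes_observed)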
790
+
791
+ def _make_dedup_file_path(self, file_id: int) -> Path:
792
+ return self._path_dedup / self._integer_to_path(file_id)
793
+
794
+ def _write_file_computing_hashes(
795
+ self, target: Path, open1, hashes: ty.Iterable[mh.HashFunction]
796
+ ) -> tuple[int, dict[mh.HashFunction, mh.Digest]]:
797
+ target.parent.mkdir(exist_ok=True, parents=True)
798
+ m = mh.MultiHasher({f: f() for f in hashes})
799
+ with target.open("wb") as f_w, open1() as f_r:
800
+ while block := f_r.read(65536):
801
+ m.update(block)
802
+ f_w.write(block)
803
+ return m.size, m.digest()
804
+
805
+ def _write_dedup_file_contents(self, link: _ImplDedupLinkRequest) -> None:
806
+ if link.is_new:
807
+ if link.req.open_file_once is None:
808
+ link.call_file_not_needed()
809
+ return
810
+
811
+ p = link.dedup_file_path
812
+ (fs := set(link.hashes_promised)).update(self.extra_hashes)
813
+ link.file_size, d = self._write_file_computing_hashes(p, link.call_open_file_once, fs)
814
+ self.apply_metadata_to_file(p, link.req.file_metadata)
815
+ link.file_mtime = int(p.stat().st_mtime)
816
+ link.hashes_computed = d
817
+
818
+ # Check that the hashes match what was claimed inside the link request.
819
+ computed = {k: d[k] for k in link.hashes_promised}
820
+ if link.hashes_promised != computed:
821
+ p.unlink(missing_ok=True)
822
+ raise InvalidContentsError(
823
+ link_request=link.req,
824
+ hashes_expected=link.hashes_promised,
825
+ hashes_observed=computed,
826
+ )
827
+ else:
828
+ # existing file - we don't need to do anything
829
+ link.call_file_not_needed()
830
+
831
+ # TODO: quickly check whether the file mtime matches and check the content hash if not
832
+
833
+ self._create_actual_link(link.dedup_file_path, link.req.link_path)
834
+
835
+ def _slow_path_wait_for_dedup_file(
836
+ self, link: _ImplDedupLinkRequest, pending: IdKey[mo.Pending]
837
+ ) -> None:
838
+ """
839
+ The file we are interested in is actively being written to by another thread. We need to
840
+ wait for it to be finished or for the other thread to fail.
841
+
842
+ Either way, we add the required data to the database such that we can continue with the
843
+ fast path procedure after this method returns.
844
+ """
845
+
846
+ # Construct query which looks for a DedupFile matching hashes or overlapping tags.
847
+ F = sao.aliased(mo.DedupFile)
848
+ H = sao.aliased(mo.Hash)
849
+ T = sao.aliased(mo.Tag)
850
+
851
+ def _exists(Alias):
852
+ return sa.exists().select_from(Alias).where(Rel(Alias.file) == F)
853
+
854
+ q = sa.select(F)
855
+ for v in link.hashes_promised.values():
856
+ q = q.where(_exists(H).where(H.compare_digest() == v))
857
+ if link.req.tags:
858
+ q = q.where(_exists(T).where(T.name.in_(link.req.tags)))
859
+ q = q.options(sao.joinedload(F.pending))
860
+
861
+ def _check(s: sao.Session) -> mo.DedupFile | bool:
862
+ for x in s.execute(q).scalars():
863
+ x: mo.DedupFile
864
+ if x.pending is None:
865
+ # We found a finished DedupFile we can use directly.
866
+ return x
867
+ elif x.pending_id == pending.key[0]:
868
+ # It's already our dedupfile!!!
869
+ raise AssertionError("deadlock")
870
+ elif x.pending.expire_at >= mo.now():
871
+ # We found an in-progress DedupFile, so we stand down and continue polling.
872
+ return False
873
+
874
+ # There are no matching DedupFile objects, so we can create a new one ourselves.
875
+ return True
876
+
877
+ def _wait_first_time():
878
+ nonlocal _wait
879
+ _wait = _wait_normal
880
+
881
+ def _wait_normal():
882
+ time.sleep(2)
883
+
884
+ _wait = _wait_first_time
885
+ while True:
886
+ _wait()
887
+
888
+ with self._SessionR() as s: # check using a read-only transaction
889
+ result = _check(s)
890
+ if result is False:
891
+ continue
892
+
893
+ with self._beginw() as s: # use a write transaction
894
+ result = _check(s)
895
+ if result is False:
896
+ continue
897
+
898
+ if result is True:
899
+ # We need to create a new DedupFile
900
+ s.add(file := self._make_dedup_file(link, pending.get_one(s)))
901
+ link.is_new = True
902
+ else:
903
+ file = result
904
+ link.is_new = False
905
+
906
+ link.fast_path = True
907
+ self._prepare_dedup_file_for_linking(s, file, link)
908
+
909
+ # we can only do this after the flush
910
+ link.file = IdKey.from_instance(file)
911
+
912
+ break
913
+
914
+ @property
915
+ def _PendingUpdater(self):
916
+ return _PendingUpdater
917
+
918
+ @abc.abstractmethod
919
+ def _create_actual_link(self, existing: Path, new: Path): ...
920
+
921
+ @abc.abstractmethod
922
+ def _adopt_file_and_link(self, existing_path: Path, dedup_file_path: Path): ...
923
+
924
+ @abc.abstractmethod
925
+ def _verify_link(self, link: mo.Link) -> bool: ...
926
+
927
+ def _pre_delete_links(self, path: Path):
928
+ """
929
+ Delete link records for all paths under *path*. Note that you must still delete the actual
930
+ files, for example using rmtree.
931
+ """
932
+ self._check_links(path, True)
933
+
934
+ def check_links(self, path: Path | None = None) -> None:
935
+ """
936
+ Detect links that were removed from the filesystem.
937
+
938
+ If *path* is provided, then only traverse files under *path*. If the *path* does not exist,
939
+ that means that everything under that *path* is gone.
940
+ """
941
+ self._check_links(path, False)
942
+
943
+ def _check_links(self, path: Path | None, pre_delete: bool) -> None:
944
+ F = sao.aliased(mo.DedupFile)
945
+ L = sao.aliased(mo.Link)
946
+
947
+ _verify_link = self._verify_link
948
+
949
+ prefix = None
950
+ if path is not None:
951
+ exact_path = self._link_path_to_string(path)
952
+ prefix = self._link_path_to_string(path / "x")[:-1]
953
+
954
+ if pre_delete or not path.exists():
955
+ # FAST PATH: Entire directory is gone, so all of its contents are gone. No need to
956
+ # do any checking.
957
+ _verify_link = lambda link: False
958
+
959
+ q = sa.select(L).order_by(L.link_path).options(sao.joinedload(L.file))
960
+ q = q.limit(self._batch_size)
961
+ if prefix is not None:
962
+ q = q.where((L.link_path == exact_path) | bytes_startswith(L.link_path, prefix))
963
+
964
+ with self._SessionR() as s:
965
+ last_link_path: bytes | None = None
966
+ while True:
967
+ if last_link_path is None:
968
+ q2 = q
969
+ else:
970
+ q2 = q.where(L.link_path > last_link_path)
971
+
972
+ results: list[mo.Link] = s.execute(q2).scalars().all()
973
+ if not results:
974
+ break
975
+
976
+ to_delete = []
977
+ for link in results:
978
+ if not _verify_link(link):
979
+ to_delete.append(link.link_path)
980
+
981
+ if to_delete:
982
+ with self._beginw() as s2, temporary_table(
983
+ s2, mo.tmp_bytes
984
+ ) as t_links, temporary_table(s2, mo.tmp_ints) as t_files:
985
+ s2.connection().execute(
986
+ sa.insert(t_links), [{"id": x} for x in to_delete]
987
+ ).close()
988
+
989
+ # These are the DedupFile entries that may end up orphaned.
990
+ s2.connection().execute(
991
+ sa.insert(t_files).from_select(
992
+ [t_files.c.id],
993
+ sa.select(F.id)
994
+ .distinct()
995
+ .select_from(L)
996
+ .join(F, L.file)
997
+ .join(t_links, t_links.c.id == L.link_path),
998
+ )
999
+ ).close()
1000
+
1001
+ # Remove the links that have been deleted.
1002
+ s2.connection().execute(
1003
+ sa.delete(L).where(L.link_path.in_(sa.select(t_links.c.id))),
1004
+ ).close()
1005
+
1006
+ # Detect newly-orphaned files.
1007
+ s2.connection().execute(
1008
+ F.make_update_orphaned().where(F.id.in_(sa.select(t_files.c.id)))
1009
+ ).close()
1010
+
1011
+ last_link_path = results[-1].link_path
1012
+
1013
+ def update_all_orphaned(self):
1014
+ with self._beginw() as s:
1015
+ F = sao.aliased(mo.DedupFile)
1016
+ s.connection().execute(F.make_update_orphaned()).close()
1017
+
1018
+ def garbage_collect_dedup_files(self, min_age_seconds: int) -> None:
1019
+ """
1020
+ Remove dedup files that have no links to them as well as dedup files that were left behind
1021
+ by a failed batch of content insertion.
1022
+ """
1023
+ cutoff = mo.now() - min_age_seconds
1024
+ pending_cutoff = mo.now() - 7200  # pending entries that expired more than two hours ago
1025
+ F = sao.aliased(mo.DedupFile)
1026
+ P = sao.aliased(mo.Pending)
1027
+ q = sa.select(F).options(sao.selectinload(F.links)).limit(self._batch_size).order_by(F.id)
1028
+ q1 = q.where(F.orphaned_at != None, F.orphaned_at <= cutoff)
1029
+ q2 = q.join(P, F.pending).where(P.expire_at <= pending_cutoff)
1030
+ self._garbage_collect_using_query(q1, F)
1031
+ self._garbage_collect_using_query(q2, F)
1032
+
1033
+ def _garbage_collect_using_query(self, q, F):
1034
+ F1 = sao.aliased(mo.DedupFile)
1035
+ while True:
1036
+ with self._beginw() as s:
1037
+ files: list[mo.DedupFile] = s.scalars(q).all()
1038
+ if not files:
1039
+ break
1040
+ s.expunge_all() # remove DedupFile objects from session
1041
+ s.connection().execute(sa.delete(F1).where(F1.id.in_(q.with_only_columns(F.id))))
1042
+
1043
+ for file in files:
1044
+ for link in file.links:
1045
+ self._delete_file(self._link_path_from_string(link.link_path))
1046
+ self._delete_file(self._make_dedup_file_path(file.id))
1047
+
1048
+ def garbage_collect_deleted(self):
1049
+ """
1050
+ Delete unused temporary directories created with :meth:`.temporary_directory` as well as
1051
+ files that could not be deleted previously (due to locking on Windows, for example).
1052
+ """
1053
+
1054
+ # We must ALWAYS lock self._path_temporary_master_lock before attempting to create or delete
1055
+ # a child lock file inside self._path_temporary_lock.
1056
+ for q in self._path_temporary_lock.iterdir():
1057
+ with contextlib.ExitStack() as ex:
1058
+ # Holding the master lock, we check the timestamp of the child lock and, if it's old
1059
+ # enough, we lock it.
1060
+ with self._filelock(self._path_temporary_master_lock, blocking=True):
1061
+ if q.lstat().st_mtime >= mo.now() - 3600:
1062
+ continue
1063
+
1064
+ try:
1065
+ ex.enter_context(self._filelock(q, blocking=False))
1066
+ except filelock.Timeout:
1067
+ continue # it's still locked, leave it alone
1068
+
1069
+ # We release the master lock as we don't need it anymore.
1070
+
1071
+ # Still holding the child lock, delete the corresponding temporary dir.
1072
+ self.delete_tree(self._path_temporary_dirs / q.name)
1073
+
1074
+ # Holding the master lock, finally delete the child lock.
1075
+ with self._filelock(self._path_temporary_master_lock, blocking=True):
1076
+ try:
1077
+ with self._filelock(q, blocking=False):
1078
+ pass
1079
+ except filelock.Timeout as exc_:
1080
+ pass # another thread chose the same name and locked it, leave it alone
1081
+ else:
1082
+ self._remove_file_or_dir(q, ignore_errors=True)
1083
+
1084
+ for p in self.path_deleted.iterdir():
1085
+ self._remove_file_or_dir(p, ignore_errors=True)
1086
+
1087
+ def _remove_file_or_dir(self, p: Path, ignore_errors: bool):
1088
+ try:
1089
+ p.unlink()
1090
+ except Exception:
1091
+ if not p.exists():
1092
+ pass # mission (already) accomplished
1093
+ elif stat.S_ISDIR(p.lstat().st_mode):
1094
+ shutil.rmtree(str(p), ignore_errors=ignore_errors)
1095
+ elif not ignore_errors:
1096
+ raise
1097
+
1098
+ def garbage_collect_extra_files(self):
1099
+ """
1100
+ Look for files in the dedup directory that were left behind due to errors or unexpected
1101
+ shutdown. Delete such files.
1102
+
1103
+ This recursively lists every file in the dedup store, so it takes a long time.
1104
+ """
1105
+ F = sao.aliased(mo.DedupFile)
1106
+ i2p = self._integer_to_path
1107
+ cutoff = mo.now() - 3600
1108
+
1109
+ base = self._path_dedup
1110
+ for chunk in chunked_iter(base.rglob("*"), self._batch_size):
1111
+ to_be_unlinked = []
1112
+ file_ids = {}
1113
+ for p in chunk:
1114
+ if not p.is_file():
1115
+ continue
1116
+
1117
+ try:
1118
+ file_id = i2p.invert("/".join(p.relative_to(base).parts))
1119
+ except InvalidPathError:
1120
+ if p.stat().st_mtime < cutoff:
1121
+ to_be_unlinked.append(p)
1122
+ continue
1123
+
1124
+ file_ids[file_id] = p
1125
+
1126
+ if file_ids:
1127
+ # We use a write transaction to avoid a race condition between checking that a path
1128
+ # does not contain a valid file ID and then later deleting that file outside the
1129
+ # transaction.
1130
+ with self._SessionW() as s, temporary_table(s, mo.tmp_ints) as tmp:
1131
+ s.execute(sa.insert(tmp), [{"id": x} for x in file_ids]).close()
1132
+ tmp_ = sa.alias(tmp)
1133
+ bad_file_ids = (
1134
+ s.execute(
1135
+ sa.select(tmp_.c.id).where(
1136
+ ~sa.exists().select_from(F).where(F.id == tmp_.c.id)
1137
+ )
1138
+ )
1139
+ .scalars()
1140
+ .all()
1141
+ )
1142
+ for file_id in bad_file_ids:
1143
+ self._delete_file(file_ids[file_id])
1144
+
1145
+ for p in to_be_unlinked:
1146
+ self._delete_file(p)
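Editor's sketch of a periodic maintenance pass that combines the collectors defined above; the ordering and the one-day threshold are illustrative choices, not requirements of the package.

def run_maintenance(dedup):
    # Drop link records whose filesystem entries have disappeared.
    dedup.check_links()
    # Mark files without links as orphaned, then collect orphans older than a day.
    dedup.update_all_orphaned()
    dedup.garbage_collect_dedup_files(min_age_seconds=86400)
    # Clean up stale temporary directories and files queued for deletion.
    dedup.garbage_collect_deleted()
    # Expensive: walks the whole store looking for stray files.
    dedup.garbage_collect_extra_files()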
1147
+
1148
+ def corrupted_list(self) -> ty.Generator[Corrupted]:
1149
+ """
1150
+ Get the list of corrupted files found using :meth:`integrity_check`.
1151
+ """
1152
+ for p in self.path_corrupted.glob("*.json"):
1153
+ d = json.loads(p.read_bytes())
1154
+ yield Corrupted(
1155
+ path=bin_path if (bin_path := p.with_suffix(".bin")).exists() else None,
1156
+ file_id=d["file_id"],
1157
+ exception=d["exception"],
1158
+ link_paths=frozenset(d["link_paths"]),
1159
+ raw_link_paths=frozenset(d["raw_link_paths"]),
1160
+ )
1161
+
1162
+ def corrupted_clear(self):
1163
+ """
1164
+ Delete all corrupted files.
1165
+ """
1166
+ for glob in ["*.bin", "*.json"]:
1167
+ for p in self.path_corrupted.glob(glob):
1168
+ self._delete_file(p)
1169
+
1170
+ @staticmethod
1171
+ def _copy_tree_default_fallback(src: Path, dst: Path):
1172
+ shutil.copy2(str(src), str(dst), follow_symlinks=False)
1173
+
1174
+ def copy_tree(self, src: Path, dst: Path, fallback_copy=None) -> None:
1175
+ if fallback_copy is None:
1176
+ fallback_copy = self._copy_tree_default_fallback
1177
+ if dst.exists():
1178
+ raise AssertionError("dst must not exist")
1179
+ self.check_links(dst)
1180
+
1181
+ def _run():
1182
+ self.run_batch(to_copy)
1183
+ for req in to_copy:
1184
+ try:
1185
+ req.result()
1186
+ except NotADedupLinkError:
1187
+ fallback_copy(req.src, req.dst)
1188
+ to_copy.clear()
1189
+
1190
+ if src.is_dir():
1191
+ to_copy = []
1192
+ for root, dirs, files in pathwalk(src):
1193
+ root_dst = dst / root.relative_to(src)
1194
+ root_dst.mkdir(exist_ok=True, parents=True)
1195
+ for f in files:
1196
+ to_copy.append(DedupCopyLinkRequest(src=root / f, dst=root_dst / f))
1197
+ if len(to_copy) > 1000:
1198
+ _run()
1199
+ else:
1200
+ # must be a file
1201
+ to_copy = [DedupCopyLinkRequest(src=src, dst=dst)]
1202
+
1203
+ if to_copy:
1204
+ _run()
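Editor's sketch of copying a tree of dedup links; the paths are hypothetical and dedup is assumed to exist as in the earlier sketch. Files that are not dedup links fall back to a plain copy (shutil.copy2 by default).

from pathlib import Path

src = Path("/srv/tree/image-a")  # hypothetical source tree containing dedup links
dst = Path("/srv/tree/image-b")  # must not exist yet
dedup.copy_tree(src, dst)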
1205
+
1206
+ def delete_tree(self, p: Path) -> None:
1207
+ def f(func, path, exc_info):
1208
+ if (p := Path(path)).exists():
1209
+ self._move_to_deleted(p)
1210
+
1211
+ shutil.rmtree(str(p.absolute()), onerror=f)
1212
+ if p.exists():
1213
+ self._move_to_deleted(p)
1214
+ self.check_links(p)
1215
+
1216
+ def delete_file(self, p: Path) -> None:
1217
+ self._delete_file(p)
1218
+ self.check_links(p)
1219
+
1220
+ def _delete_file(self, p: Path) -> None:
1221
+ """
1222
+ On Windows, a locked file cannot be deleted. So instead we move it out of the way to a
1223
+ different directory in the hopes of deleting it later when it's not locked.
1224
+ """
1225
+ try:
1226
+ p.unlink(missing_ok=True)
1227
+ except OSError:
1228
+ if not p.exists() or p.is_dir():
1229
+ raise
1230
+ else:
1231
+ return
1232
+
1233
+ self._move_to_deleted(p)
1234
+
1235
+ def _move_to_deleted(self, p: Path) -> None:
1236
+ base = self.path_deleted
1237
+ for name in random_names("", ".bin"):
1238
+ try:
1239
+ p.rename(base / name)
1240
+ except OSError as exc:
1241
+ exc_ = exc
1242
+ else:
1243
+ return
1244
+
1245
+ raise exc_
1246
+
1247
+ def _filelock(self, path: Path, blocking: bool):
1248
+ return filelock.FileLock(path, blocking=blocking)
1249
+
1250
+ @property
1251
+ def _path_temporary_dirs(self):
1252
+ return self.path_temporary / "dirs"
1253
+
1254
+ @property
1255
+ def _path_temporary_lock(self):
1256
+ return self.path_temporary / "lock"
1257
+
1258
+ @property
1259
+ def _path_temporary_master_lock(self):
1260
+ return self.path_temporary / "master.lock"
1261
+
1262
+ @contextlib.contextmanager
1263
+ def temporary_directory(self, prefix="tmp_", suffix=""):
1264
+ exc = None
1265
+ for name in random_names(prefix=prefix, suffix=suffix):
1266
+ p = self._path_temporary_dirs / name
1267
+ q = self._path_temporary_lock / name
1268
+
1269
+ # We must always acquire the master lock before acquiring a child lock. The order must
1270
+ # be consistent in order to prevent deadlocks.
1271
+ with contextlib.ExitStack() as ex:
1272
+ with self._filelock(self._path_temporary_master_lock, blocking=True):
1273
+ try:
1274
+ ex.enter_context(self._filelock(q, blocking=False))
1275
+ except filelock.Timeout as exc_:
1276
+ continue # try a different name
1277
+
1278
+ # We now release the master lock because we don't need it any more.
1279
+
1280
+ try:
1281
+ p.mkdir(parents=True)
1282
+ except OSError as exc_:
1283
+ exc = exc_
1284
+ continue
1285
+
1286
+ try:
1287
+ yield p
1288
+ break
1289
+ finally:
1290
+ self.delete_tree(p)
1291
+
1292
+ # Release the lock file. We will attempt to delete it next.
1293
+ ex.close()
1294
+
1295
+ # Attempt to delete the lock file.
1296
+ with self._filelock(self._path_temporary_master_lock, blocking=True):
1297
+ try:
1298
+ with self._filelock(q, blocking=False):
1299
+ pass
1300
+ except filelock.Timeout as exc_:
1301
+ pass # another thread chose the same name and locked it, leave it alone
1302
+ else:
1303
+ self._remove_file_or_dir(q, ignore_errors=True)
1304
+ else:
1305
+ raise AssertionError("retry count exceeded, unknown cause") if exc is None else exc
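Editor's sketch of the temporary-directory helper above; dedup is assumed to exist as in the earlier sketch. The directory lives under the store's temporary area, is protected by a lock file, and is deleted when the block exits; leftovers are reclaimed later by garbage_collect_deleted().

with dedup.temporary_directory(prefix="unpack_") as tmp_dir:
    # tmp_dir is a Path that is private to this process while its lock is held.
    (tmp_dir / "scratch.txt").write_bytes(b"intermediate data")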
1306
+
1307
+ @cached_property
1308
+ def _q_get_hash(self):
1309
+ L = sao.aliased(mo.Link)
1310
+ F = sao.aliased(mo.DedupFile)
1311
+ H = sao.aliased(mo.Hash)
1312
+ return (
1313
+ sa.select(L, H, F.size)
1314
+ .select_from(L)
1315
+ .join(F, L.file)
1316
+ .outerjoin(H, (Rel(H.file) == F) & (H.hash_function == sa.bindparam("x_hf")))
1317
+ .options(sao.contains_eager(L.file.of_type(F)))
1318
+ .where(L.link_path == sa.bindparam("x_link_path"), F.pending == None)
1319
+ )
1320
+
1321
+ def _query_by_link_path(
1322
+ self, s: sao.Session, link_path: bytes, hash_function: mh.HashFunction
1323
+ ) -> list[tuple[mo.Link, mo.Hash, int]]:
1324
+ return s.execute(
1325
+ self._q_get_hash,
1326
+ {"x_link_path": link_path, "x_hf": hash_function.function_code},
1327
+ ).all()
1328
+
1329
+ def get_file_hash(
1330
+ self, hash_function: mh.HashFunction, path: Path, check_link: bool
1331
+ ) -> tuple[int, mh.Digest] | None:
1332
+ """
1333
+ Query the database to obtain the file contents hash of the file at *path*. Return None if the
1334
+ file is not in the dedup database. If *check_link* is True, then check that the link is
1335
+ intact before returning the hash. If the link is damaged or removed, then call
1336
+ :meth:`check_links` to unregister the link then return None.
1337
+ """
1338
+ with self._SessionR() as s:
1339
+ link_path: bytes = self._link_path_to_string(path)
1340
+ links = self._query_by_link_path(s, link_path, hash_function)
1341
+
1342
+ if not links:
1343
+ return None
1344
+
1345
+ link, h, size = links[0]
1346
+ if h is None:
1347
+ return None
1348
+
1349
+ if not (check_link and not self._verify_link(link)):
1350
+ return size, h.to_digest()
1351
+
1352
+ self.check_links(path)
1353
+ return None
1354
+
1355
+ def get_or_compute_file_hash(
1356
+ self, hash_function: mh.HashFunction, path: Path, **kw
1357
+ ) -> tuple[int, mh.Digest] | None:
1358
+ r = self.get_file_hash(hash_function, path, **kw)
1359
+ if r is None:
1360
+ hasher = hash_function()
1361
+ size = 0
1362
+ with path.open("rb") as f:
1363
+ while block := f.read(65536):
1364
+ size += len(block)
1365
+ hasher.update(block)
1366
+ r = size, hasher.digest()
1367
+ return r
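Editor's sketch of looking up a file hash; dedup is assumed to exist as in the earlier sketch. With check_link=True the stored digest is only trusted if the link is still intact; otherwise the hash is recomputed from the file contents.

from pathlib import Path
from vocker import multihash as mh

size, digest = dedup.get_or_compute_file_hash(
    mh.registry.name_to_hash["sha2-256"],
    Path("/srv/tree/hello.txt"),  # hypothetical path
    check_link=True,
)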
1368
+
1369
+ def adopt_files(
1370
+ self, hash_function: mh.HashFunction, requests: ty.Iterable[AdoptRequest]
1371
+ ) -> None:
1372
+ """
1373
+ Adopt each file given in *requests*. If the path is already a dedup link, then leave it
1374
+ alone. If the path is not a dedup link, then compute its hash and move the file to the
1375
+ dedup store and create a link to it. If the path is already a dedup link but does not
1376
+ have the right kind of hash digest, then compute the hash digest and store it in the
1377
+ database.
1378
+
1379
+ This method is implemented in a somewhat inefficient way.
1380
+ """
1381
+ reqs = [_ImplAdoptRequest(req) for req in requests]
1382
+
1383
+ # first use a read-only session while we compute file hashes
1384
+ with self._SessionR() as s:
1385
+ for x in reqs:
1386
+ x.link_path = self._link_path_to_string(x.req.path)
1387
+ existing = self._query_by_link_path(s, x.link_path, hash_function)
1388
+ if existing:
1389
+ l, h, sz = existing[0]
1390
+ if h is not None:
1391
+ x.req.out_digest = h.to_digest()
1392
+ x.req.out_size = sz
1393
+ x.done = True
1394
+
1395
+ if not x.done:
1396
+ with open(x.req.path, "rb") as f:
1397
+ h = hash_function()
1398
+ size = 0
1399
+ while block := f.read(65536):
1400
+ h.update(block)
1401
+ size += len(block)
1402
+ x.req.out_digest = h.digest()
1403
+ x.file_metadata = DedupFileMetadata(executable=False) # TODO
1404
+ x.req.out_size = size
1405
+ x.file_metadata_bytes = self.convert_file_metadata_to_bytes(x.file_metadata)
1406
+
1407
+ F = sao.aliased(mo.DedupFile)
1408
+ H = sao.aliased(mo.Hash)
1409
+ q = (
1410
+ sa.select(F)
1411
+ .join(H, F.hashes)
1412
+ .where(
1413
+ H.hash_function == sa.bindparam("x_hf"),
1414
+ H.hash == sa.bindparam("x_h"),
1415
+ F.pending == None,
1416
+ F.file_metadata == sa.bindparam("x_f_meta"),
1417
+ )
1418
+ )
1419
+
1420
+ # then we use a RW session to update the database
1421
+ with self._beginw() as s:
1422
+ for x in reqs:
1423
+ if x.done:
1424
+ continue
1425
+
1426
+ # re-check for an existing link
1427
+ existing = self._query_by_link_path(s, x.link_path, hash_function)
1428
+ if existing:
1429
+ l, h, sz = existing[0]
1430
+ file = l.file
1431
+ if h is None:
1432
+ s.add(mo.Hash.from_digest(x.req.out_digest, file=file))
1433
+ else:
1434
+ # never mind, nothing to do here
1435
+ x.req.out_size = sz
1436
+ x.req.out_digest = h.to_digest()
1437
+ x.done = True
1438
+ continue
1439
+ else:
1440
+ # try to lookup by digest first
1441
+ # TODO: also look up by tag
1442
+ files = (
1443
+ s.execute(
1444
+ q,
1445
+ dict(
1446
+ x_hf=hash_function.function_code,
1447
+ x_h=x.req.out_digest.digest,
1448
+ x_f_meta=x.file_metadata_bytes,
1449
+ ),
1450
+ )
1451
+ .scalars()
1452
+ .all()
1453
+ )
1454
+ if files:
1455
+ file = files[0]
1456
+ else:
1457
+ file = None
1458
+ if file is not None:
1459
+ file.orphaned_at = None
1460
+ x.delete = True
1461
+ else:
1462
+ # no existing file found, need to create one
1463
+ file = mo.DedupFile(
1464
+ file_metadata=x.file_metadata_bytes,
1465
+ size=x.req.out_size,
1466
+ mtime=int(x.req.path.stat().st_mtime),
1467
+ orphaned_at=None,
1468
+ pending=None,
1469
+ hashes=[mo.Hash.from_digest(x.req.out_digest)],
1470
+ )
1471
+ s.add(file)
1472
+ s.flush() # we need to make sure the file has an ID
1473
+
1474
+ s.add(mo.Link(link_path=x.link_path, file=file))
1475
+
1476
+ x.dedup_file_path = self._make_dedup_file_path(file.id)
1477
+
1478
+ # We add our tags.
1479
+ self._add_tags_to_file(s, file, x.req.tags)
1480
+
1481
+ s.flush()
1482
+
1483
+ # and finally we make filesystem changes
1484
+ for x in reqs:
1485
+ if (dst := x.dedup_file_path) is not None:
1486
+ if x.delete:
1487
+ # We already have a DedupFile with the required contents, so we replace the
1488
+ # link_path file with a link to that existing DedupFile.
1489
+ self._delete_file(x.req.path)
1490
+ self._create_actual_link(dst, x.req.path)
1491
+ else:
1492
+ dst.parent.mkdir(exist_ok=True, parents=True)
1493
+ self._adopt_file_and_link(x.req.path, dst)
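Editor's sketch of adopting an existing file into the store; the path and tag are hypothetical and dedup is assumed to exist as in the earlier sketch. adopt_files() fills in out_size and out_digest on each request.

from pathlib import Path
from vocker.dedup import AdoptRequest
from vocker import multihash as mh

adopt = AdoptRequest(Path("/srv/tree/existing.bin"), tags=frozenset({b"adopted"}))
dedup.adopt_files(mh.registry.name_to_hash["sha2-256"], [adopt])
print(adopt.out_size, adopt.out_digest)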
1494
+
1495
+ def integrity_check(
1496
+ self,
1497
+ skip_same_mtime: bool,
1498
+ threads: int | None = None,
1499
+ keep_corrupted: bool = True,
1500
+ ):
1501
+ """
1502
+ Verify that all deduplicated files match their stored hashes. Use the modification time to skip
1503
+ unchanged files if *skip_same_mtime* is True. Move the corrupted files to
1504
+ :attr:`path_corrupted`.
1505
+ """
1506
+
1507
+ F = sao.aliased(mo.DedupFile)
1508
+ batch_size = 1000
1509
+ q = sa.select(F).options(sao.selectinload(F.hashes)).order_by(F.id).limit(batch_size)
1510
+
1511
+ def _hash_check(file: mo.DedupFile) -> None:
1512
+ p = self._make_dedup_file_path(file.id)
1513
+ st_mtime = int(p.stat().st_mtime)
1514
+ if skip_same_mtime and file.mtime == st_mtime:
1515
+ return
1516
+
1517
+ d = file.hashes_dict
1518
+ m = mh.MultiHasher({hf: hf() for hf in d})
1519
+ with p.open("rb") as fh:
1520
+ while block := fh.read(65536):
1521
+ m.update(block)
1522
+ if d != (observed := m.digest()):
1523
+ raise InvalidContentsError(hashes_expected=d, hashes_observed=observed)
1524
+
1525
+ # TODO: also check file metadata matches, such as the executable bit
1526
+
1527
+ # The digest was the same, so update the mtime in the DB.
1528
+ with self._beginw() as s:
1529
+ IdKey.from_instance(file).get_one(s).mtime = st_mtime
1530
+
1531
+ id_min = -1
1532
+ with cf.ThreadPoolExecutor(max_workers=threads) as exe:
1533
+ while True:
1534
+ invalid_file_ids = []
1535
+
1536
+ with self._SessionR() as s:
1537
+ q2 = q.where(F.id > id_min, F.pending == None)
1538
+ dedup_files: list[mo.DedupFile] = s.execute(q2).scalars().all()
1539
+
1540
+ if not dedup_files:
1541
+ break
1542
+
1543
+ id_min = dedup_files[-1].id
1544
+ futures = {exe.submit(_hash_check, f): f for f in dedup_files}
1545
+ for future in cf.as_completed(futures):
1546
+ if (exc := future.exception()) is not None:
1547
+ if not isinstance(exc, Exception):
1548
+ # Some other type of exception
1549
+ raise exc
1550
+
1551
+ file = futures[future]
1552
+ self._integrity_check_process_corrupt_one(s, file, exc, keep_corrupted)
1553
+ invalid_file_ids.append(file.id)
1554
+
1555
+ if invalid_file_ids:
1556
+ with self._beginw() as s:
1557
+ s.connection().execute(
1558
+ sa.delete(F).where(F.id == sa.bindparam("_id")),
1559
+ [{"_id": x} for x in invalid_file_ids],
1560
+ )
1561
+
1562
+ def _integrity_check_process_corrupt_one(
1563
+ self, s: sao.Session, file: mo.DedupFile, exc: Exception, keep_corrupted: bool
1564
+ ):
1565
+ """
1566
+ Process one file that has been found to be corrupted.
1567
+ """
1568
+
1569
+ path_file = self._make_dedup_file_path(file.id)
1570
+
1571
+ # Load the links as we will need them
1572
+ s.refresh(file, ["links"])
1573
+
1574
+ link_paths = [self._link_path_from_string(link.link_path) for link in file.links]
1575
+ json_data = {
1576
+ "file_id": file.id,
1577
+ "link_paths": [str(x) for x in link_paths],
1578
+ "raw_link_paths": [
1579
+ link.link_path.decode("utf-8", errors="replace") for link in file.links
1580
+ ],
1581
+ "exception": repr(exc),
1582
+ }
1583
+
1584
+ with create_file_random(self.path_corrupted, "f_", ".json") as f:
1585
+ path_json = Path(f.name)
1586
+ f.write(json.dumps(json_data, indent=2, sort_keys=True).encode("utf-8"))
1587
+
1588
+ if keep_corrupted:
1589
+ try:
1590
+ path_file.rename(path_json.with_suffix(".bin"))
1591
+ except Exception:
1592
+ if path_file.exists():
1593
+ logger.warning(
1594
+ "failed to rename corrupt file", exc_info=True, data=str(path_file)
1595
+ )
1596
+
1597
+ for x in link_paths:
1598
+ self._delete_file(x)
1599
+
1600
+ class _compute_stats_ZeroRow:
1601
+ orphaned = None
1602
+ count = 0
1603
+ size = 0
1604
+
1605
+ def compute_stats(self) -> DedupStats:
1606
+ with self._SessionR() as s:
1607
+ F = sao.aliased(mo.DedupFile)
1608
+ L = sao.aliased(mo.Link)
1609
+ orph = F.orphaned_at != None
1610
+
1611
+ q = (
1612
+ sa.select(
1613
+ orph.label("orphaned"),
1614
+ sa.func.count().label("count"),
1615
+ sa.func.sum(F.size).label("size"),
1616
+ )
1617
+ .select_from(F)
1618
+ .where(F.pending == None)
1619
+ .group_by(orph)
1620
+ )
1621
+ file_stats = {k: self._compute_stats_ZeroRow() for k in (False, True)}
1622
+ file_stats |= {row.orphaned: row for row in s.execute(q).all()}
1623
+
1624
+ q = (
1625
+ sa.select(sa.func.count().label("count"), sa.func.sum(F.size).label("size"))
1626
+ .select_from(L)
1627
+ .join(F, L.file)
1628
+ ).where(F.pending == None)
1629
+ link_stats = s.execute(q).one()
1630
+
1631
+ return DedupStats(
1632
+ dedup_count=file_stats[False].count,
1633
+ dedup_total_bytes=file_stats[False].size,
1634
+ orphaned_count=file_stats[True].count,
1635
+ orphaned_total_bytes=file_stats[True].size,
1636
+ link_count=link_stats.count,
1637
+ link_total_bytes=link_stats.size or 0,
1638
+ )
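Editor's sketch of a health check combining the statistics and integrity helpers above; dedup is assumed to exist as in the earlier sketch and the thread count is an arbitrary choice.

print(dedup.compute_stats().to_json())
dedup.integrity_check(skip_same_mtime=True, threads=4, keep_corrupted=True)
for corrupted in dedup.corrupted_list():
    print("corrupted:", corrupted.to_json())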
1639
+
1640
+
1641
+ class DedupBackendHardlink(Dedup):
1642
+ def _create_actual_link(self, existing: Path, new: Path):
1643
+ # Path.link_to was removed and replaced by Path.hardlink_to, but I want this to work across
1644
+ # Python 3.9 to 3.13
1645
+ os.link(str(existing), str(new))
1646
+
1647
+ def _adopt_file_and_link(self, existing_path: Path, dedup_file_path: Path):
1648
+ # hard links are indistinguishable from each other
1649
+ self._create_actual_link(existing_path, dedup_file_path)
1650
+
1651
+ def _verify_link(self, link: mo.Link) -> bool:
1652
+ p = Path(link.link_path.decode("utf-8"))
1653
+
1654
+ try:
1655
+ a = p.lstat()
1656
+ except Exception:
1657
+ return False
1658
+
1659
+ if link.file.mtime != int(a.st_mtime):
1660
+ return False
1661
+
1662
+ # st_ino is 0 on unsupported filesystems on Windows.
1663
+
1664
+ # TODO: should we even allow st_ino=0?
1665
+ if a.st_ino != 0:
1666
+ if (file_stat := getattr(link.file, "_cached_file_stat", None)) is None:
1667
+ try:
1668
+ file_stat = self._make_dedup_file_path(link.file.id).stat()
1669
+ except Exception:
1670
+ return False
1671
+ link.file._cached_file_stat = file_stat
1672
+
1673
+ if a.st_ino != file_stat.st_ino:
1674
+ return False
1675
+
1676
+ return True