vocker 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vocker/__init__.py +0 -0
- vocker/__main__.py +3 -0
- vocker/cli.py +384 -0
- vocker/dedup.py +1676 -0
- vocker/dedup_models.py +174 -0
- vocker/image.py +870 -0
- vocker/integer_to_path.py +51 -0
- vocker/multihash.py +302 -0
- vocker/py.typed +0 -0
- vocker/repo/__init__.py +0 -0
- vocker/repo/compression.py +239 -0
- vocker/repo/io.py +711 -0
- vocker/system.py +681 -0
- vocker/util.py +120 -0
- vocker/util_models.py +13 -0
- vocker-0.1.0.dist-info/METADATA +56 -0
- vocker-0.1.0.dist-info/RECORD +19 -0
- vocker-0.1.0.dist-info/WHEEL +5 -0
- vocker-0.1.0.dist-info/top_level.txt +1 -0
vocker/dedup.py
ADDED
@@ -0,0 +1,1676 @@
from __future__ import annotations

import abc
import contextlib
import filelock
import io
import json
import os
from pathlib import Path
import shutil
import stat
import threading
import time

import typing as ty
import attr
import structlog
import concurrent.futures as cf

import sqlalchemy as sa
from sqlalchemy import orm as sao
from sqlalchemy_boltons import sqlite as sq
from sqlalchemy_boltons.orm import RelationshipComparator as Rel, IdKey
from sqlalchemy_boltons.temporary import temporary_table
from sqlalchemy_boltons.core import bytes_startswith
from boltons.iterutils import chunked_iter
from cached_property import cached_property

from .integer_to_path import IntegerToPath, InvalidPathError
from .util import pathwalk, random_names, create_file_random, supports_executable
from . import dedup_models as mo
from . import multihash as mh


logger = structlog.get_logger(__name__)


@attr.s(eq=False, hash=False)
class Corrupted:
    path: Path | None = attr.ib()
    file_id: int = attr.ib()
    exception: str = attr.ib()
    link_paths: frozenset[str] = attr.ib()
    raw_link_paths: frozenset[str] = attr.ib()

    def to_json(self):
        d = attr.asdict(self)
        d["path"] = p if (p := d["path"]) is None else str(p)
        for k in ("link_paths", "raw_link_paths"):
            d[k] = sorted(d[k])
        return d


@attr.s(eq=False, hash=False, kw_only=True)
class DedupFileMetadata:
    executable: bool = attr.ib(default=False)

    @classmethod
    def make_plain(cls):
        return cls(executable=False)


@attr.s(eq=False, hash=False, auto_exc=True)
class InvalidContentsError(Exception):
    message = attr.ib(default="file contents do not match hash")
    link_request: DedupLinkRequest | None = attr.ib(default=None)
    hashes_expected: dict[mh.HashFunction, mh.Digest] | None = attr.ib(default=None)
    hashes_observed: dict[mh.HashFunction, mh.Digest] | None = attr.ib(default=None)


@attr.s(eq=False, hash=False, auto_exc=True)
class BatchError(Exception):
    message = attr.ib(default="at least one of the DedupLinkRequests failed")
    requests: list[DedupRequest] | None = attr.ib(default=None)


class NotADedupLinkError(Exception):
    pass


class MissingContentError(Exception):
    pass


@attr.s(eq=False, hash=False, kw_only=True)
class DedupRequest:
    success: bool = attr.ib(init=False, default=False)
    exc: Exception | None = attr.ib(init=False, default=None)

    def result(self):
        if self.exc is not None:
            raise self.exc
        return self.success


@attr.s(eq=False, hash=False, kw_only=True)
class DedupLinkRequest(DedupRequest):
    """
    Represents a single request to link a deduped file at a filesystem location :attr:`link_path`.
    If the file is already in the dedup folder, then link it. Otherwise add it to the dedup folder
    by first getting its contents from :attr:`open_file_once`. These requests are batched and
    executed together.

    If a file already exists at :attr:`link_path`, then it will be removed before linking. If it is
    a directory, then an exception will be raised.

    The :attr:`open_file_once` function will be called *at most* once. If a deduplicated file
    already exists in the dedup folder with the same :attr:`file_contents_hash` and equal or
    equivalent :attr:`file_metadata`, then it will be reused and the :attr:`open_file_once` function
    will not be called at all.

    The :attr:`open_file_once` function should return an open file handle from which the file contents can
    be read. If :attr:`open_file_once` is None, then the link request will be silently
    discarded.

    Each :attr:`open_file_once` function will be called in the order it appears in a batch of
    requests. This guarantee supports the use case of directly decompressing a
    [solid archive](https://en.wikipedia.org/wiki/Solid_archive), in which case file contents
    become available in a sequential manner as the archive is decompressed and it is impossible
    to efficiently access files in a random order.

    The file contents hash will be (over)written to :attr:`file_contents_hash`.

    The :attr:`tags` argument is used as a sort of label that can be used to refer to a deduplicated
    file. If there exists another deduplicated file that shares at least one tag with :attr:`tags`,
    then that deduplicated file will be used. That existing deduplicated file will be used
    regardless of the :attr:`file_contents_hash`.

    If :attr:`file_contents_hash` is None and no matching :attr:`tags` was found,
    then :attr:`open_file_once` will always be called. Without the content hash, we have no way
    of checking whether a deduplicated file with the same hash exists.
    """

    hash_function: mh.HashFunction = attr.ib()
    link_path: Path = attr.ib()
    file_metadata: DedupFileMetadata = attr.ib()
    file_contents_hash: mh.Digest | None = attr.ib()
    open_file_once: ty.Callable[[], ty.BinaryIO] | None = attr.ib()
    file_not_needed: ty.Callable[[], None] | None = attr.ib(default=None)
    tags: ty.Set[bytes] = attr.ib(factory=frozenset)

    @classmethod
    def from_content(cls, content: bytes, **kwargs):
        kwargs.setdefault("open_file_once", None)
        kwargs.setdefault("file_contents_hash", None)
        return cls(**kwargs).set_content(content)

    def set_content(self, content: bytes):
        self.file_contents_hash = self.hash_function().update(content).digest()
        self.open_file_once = lambda: io.BytesIO(content)
        return self

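# Usage sketch (illustrative only): building link requests for a batch. The names `target`
# and `payload` below are hypothetical placeholders, not part of this module.
#
#     sha256 = mh.registry.name_to_hash["sha2-256"]
#
#     # Small in-memory content: set_content() fills in file_contents_hash and open_file_once.
#     req1 = DedupLinkRequest.from_content(
#         b"hello world",
#         hash_function=sha256,
#         link_path=target / "hello.txt",
#         file_metadata=DedupFileMetadata.make_plain(),
#     )
#
#     # Streamed content: the callable is invoked at most once, in batch order, which suits
#     # members of a solid archive that can only be read sequentially.
#     req2 = DedupLinkRequest(
#         hash_function=sha256,
#         link_path=target / "payload.bin",
#         file_metadata=DedupFileMetadata(executable=True),
#         file_contents_hash=None,  # unknown up front
#         open_file_once=lambda: payload.open("rb"),
#         tags={b"payload-v1"},  # lets later requests reuse this file by tag alone
#     )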
@attr.s(eq=False, hash=False, kw_only=True)
class _ImplDedupRequestCommon:
    index: int = attr.ib()
    failed: bool = attr.ib(default=False)

    @abc.abstractmethod
    def set_failed(self, exc): ...


@attr.s(eq=False, hash=False, kw_only=True)
class _ImplDedupLinkRequest(_ImplDedupRequestCommon):
    req: DedupLinkRequest = attr.ib(default=None)
    lookup_key = attr.ib(default=None)
    dedup_file_path: Path = attr.ib(default=None)
    link_path_str: bytes | None = attr.ib(default=None)
    file: IdKey[mo.DedupFile] | None = attr.ib(default=None)
    metadata_bytes: bytes | None = attr.ib(default=None)
    file_size: int = attr.ib(default=None)
    file_mtime: int = attr.ib(default=None)
    fast_path: bool = attr.ib(default=False)  # can we use the fast-path without db transaction?
    is_new: bool = attr.ib(default=False)  # is it a brand new FileDedup?
    hashes_promised: dict[mh.HashFunction, mh.Digest] = attr.ib(default=None)
    hashes_computed: dict[mh.HashFunction, mh.Digest] | None = attr.ib(default=None)
    called_file: bool = attr.ib(default=False)

    def set_failed(self, exc):
        self.req.exc = exc
        self.failed = True
        self.call_file_not_needed()

    def call_file_not_needed(self) -> None:
        if not self.called_file:
            if (f := self.req.file_not_needed) is not None:
                try:
                    f()
                except Exception:
                    logger.warning("uncaught exception", exc_info=True)
            self.called_file = True

    def call_open_file_once(self):
        if self.called_file:
            raise AssertionError
        try:
            return self.req.open_file_once()
        finally:
            self.called_file = True


@attr.s(eq=False, hash=False)
class DedupCopyLinkRequest(DedupRequest):
    src: Path = attr.ib()
    dst: Path = attr.ib()


@attr.s(eq=False, hash=False, kw_only=True)
class _ImplDedupCopyLinkRequest(_ImplDedupRequestCommon):
    req: DedupCopyLinkRequest = attr.ib()
    src_str: str = attr.ib(default=None)
    dst_str: str = attr.ib(default=None)
    dedup_file_path: Path = attr.ib(default=None)

    def set_failed(self, exc):
        self.req.exc = exc
        self.failed = True


@attr.s(eq=False, hash=False)
class AdoptRequest:
    path: Path = attr.ib()
    tags: ty.Set[bytes] = attr.ib(factory=frozenset)

    out_size: int | None = attr.ib(init=False, default=None)
    out_digest: mh.Digest | None = attr.ib(init=False, default=None)


@attr.s(eq=False, hash=False)
class _ImplAdoptRequest:
    req: AdoptRequest = attr.ib()
    link_path: bytes = attr.ib(default=None)
    file_metadata: DedupFileMetadata = attr.ib(default=None)
    file_metadata_bytes: bytes = attr.ib(default=None)
    done: bool = attr.ib(default=False)
    dedup_file_path: Path = attr.ib(default=None)
    delete: bool = attr.ib(default=False)


"""
@attr.s(eq=False, hash=False)
class DedupUnlinkRequest(DedupRequest):
    link_path: Path = attr.ib()
"""


class DedupError:
    pass


@attr.s(frozen=True)
class DedupStats:
    dedup_count: int = attr.ib()
    orphaned_count: int = attr.ib()
    link_count: int = attr.ib()
    dedup_total_bytes: int = attr.ib()
    orphaned_total_bytes: int = attr.ib()
    link_total_bytes: int = attr.ib()

    def to_json(self):
        return attr.asdict(self)


@attr.s(frozen=True)
class DedupFile:
    pass


@attr.s(eq=False, hash=False)
class _PendingUpdater:
    sessionmaker_r: sao.sessionmaker = attr.ib()
    sessionmaker_w: sao.sessionmaker = attr.ib()
    pending: IdKey[mo.Pending] = attr.ib()
    seconds_in_the_future: int = attr.ib()
    update_interval: float = attr.ib(default=None)
    _should_exit = False
    update_on_exit: bool = attr.ib(default=False)

    def __attrs_post_init__(self):
        if self.update_interval is None:
            self.update_interval = (self.seconds_in_the_future - 3) / 2

        if (u := self.update_interval) < 1:
            raise ValueError(f"invalid update_interval={u!r}")

    def _update(self):
        with self.sessionmaker_w() as s:
            pending: mo.Pending = self.pending.get_one(s)
            pending.expire_at = mo.now() + self.seconds_in_the_future

    def _thread_target(self):
        while not self._should_exit:
            t = self.update_interval
            try:
                self._update()
            except Exception:
                logger.warning("failed to update pending", exc_info=True)
                t = 1  # try again soon
            self._event.wait(t)
            self._event.clear()
        if self.update_on_exit:
            self._update()

    def start(self):
        self._should_exit = False
        self._event = threading.Event()
        self._thread = t = threading.Thread(target=self._thread_target)
        t.start()

    def stop(self):
        self._should_exit = True
        self._event.set()
        self._thread.join()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()

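# Usage sketch (illustrative): _PendingUpdater is a heartbeat that keeps a Pending row's
# `expire_at` pushed into the future while long-running work happens outside a transaction,
# so other threads treat the associated DedupFile rows as still in progress. The names
# `session_r`, `session_w`, `pending_key` and `do_long_running_work` are hypothetical.
#
#     with _PendingUpdater(
#         sessionmaker_r=session_r,
#         sessionmaker_w=session_w,
#         pending=pending_key,
#         seconds_in_the_future=20,
#     ) as updater:
#         do_long_running_work()
#         updater.update_on_exit = True  # refresh expire_at one last time on exit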
class SkippedReqException(Exception):
    pass


def make_sqlite_options(synchronous):
    return sq.Options.new(
        timeout=60.0,
        begin="DEFERRED",
        foreign_keys="DEFERRED",
        recursive_triggers=True,
        trusted_schema=True,
        schemas={"main": sq.SchemaOptions.new(journal="WAL", synchronous=synchronous)},
    )


@attr.s(eq=False, hash=False)
class Dedup(abc.ABC):
    base_path: Path = attr.ib()
    extra_hashes: ty.Set[mh.HashFunction] = attr.ib(
        factory=lambda: {mh.registry.name_to_hash["sha2-256"]}
    )
    _path_dedup: Path | None = attr.ib(default=None, kw_only=True)
    _path_db: Path | None = attr.ib(default=None, kw_only=True)
    path_temporary: Path | None = attr.ib(default=None, kw_only=True)
    path_deleted: Path | None = attr.ib(default=None, kw_only=True)
    path_corrupted: Path | None = attr.ib(default=None, kw_only=True)
    _integer_to_path = attr.ib(factory=IntegerToPath, kw_only=True)
    _sqlite_synchronous = attr.ib(default="NORMAL", kw_only=True)
    _batch_size = 1000

    def __attrs_post_init__(self):
        if self._path_dedup is None:
            self._path_dedup = self.base_path / "f"

        if self._path_db is None:
            self._path_db = self.base_path / "dedup.db"

        if self.path_deleted is None:
            self.path_deleted = self.base_path / "deleted"

        if self.path_temporary is None:
            self.path_temporary = self.base_path / "tmp"

        if self.path_corrupted is None:
            self.path_corrupted = self.base_path / "corrupted"

        self._path_dedup.mkdir(exist_ok=True, parents=True)
        self._path_db.parent.mkdir(exist_ok=True, parents=True)
        self.path_corrupted.mkdir(exist_ok=True, parents=True)
        self.path_deleted.mkdir(exist_ok=True, parents=True)
        self._path_temporary_dirs.mkdir(exist_ok=True, parents=True)
        self._path_temporary_lock.mkdir(exist_ok=True, parents=True)
        engine = sq.create_engine_sqlite(self._path_db, create_engine_args=dict(echo=False))
        engine = make_sqlite_options(synchronous=self._sqlite_synchronous).apply(engine)
        self._engine_r = engine
        self._engine_w = sq.Options.apply_lambda(engine, lambda x: x.evolve(begin="IMMEDIATE"))

        self._SessionR = sao.sessionmaker(self._engine_r)
        self._SessionW = sao.sessionmaker(self._engine_w)

        # FIXME: use proper session management
        # self.session = Session(self.engine_rw)  # HACK
        # self.engine = self.engine_rw  # HACK

        self._initialize_db()

    def _initialize_db(self):
        """Initialize the database schema."""
        with self._engine_w.connect() as conn:
            mo.BaseDedup.metadata.create_all(conn)
            conn.commit()

    @contextlib.contextmanager
    def _beginw(self):
        with self._SessionW.begin() as s:
            s.connection()  # ensure the transaction is started
            yield s

    def apply_metadata_to_file(self, path: Path, metadata: DedupFileMetadata) -> None:
        if supports_executable():
            mode = path.lstat().st_mode
            if not stat.S_ISDIR(mode) and bool(stat.S_IXUSR & mode) != metadata.executable:
                mask = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
                new_mode = mode & ~mask
                if metadata.executable:
                    new_mode |= mask
                os.chmod(str(path), new_mode, follow_symlinks=False)

    def get_metadata_from_file(self, path: Path) -> DedupFileMetadata:
        if supports_executable():
            mode = path.stat().st_mode
            if not stat.S_ISREG(mode):
                raise AssertionError
            return DedupFileMetadata(executable=bool(stat.S_IXUSR & mode))
        else:
            return DedupFileMetadata(executable=False)

    def convert_file_metadata_to_bytes(self, metadata: DedupFileMetadata) -> bytes:
        # TODO: make it platform-dependent whether we care about the executable bit
        return b"x=" + str(int(metadata.executable)).encode("ascii")

    def _link_path_to_string(self, p: Path) -> bytes:
        return str(p).encode("utf-8")

    def _link_path_from_string(self, data: bytes) -> Path:
        return Path(data.decode("utf-8"))

    @contextlib.contextmanager
    def _ignore_skip(self):
        try:
            yield
        except SkippedReqException:
            pass

    @contextlib.contextmanager
    def _catch_req_exc(self, r: _ImplDedupLinkRequest | _ImplDedupCopyLinkRequest):
        if r.failed:
            raise SkippedReqException from None
        try:
            yield
        except Exception as exc:
            r.set_failed(exc)
            raise SkippedReqException from None

    def _cfg_hash_functions_get(self, s: sao.Session):
        # TODO: not used yet
        if (cfg := s.get(mo.DedupConfig, "hashes")) is None:
            h = self._DEFAULT_HASHES
        else:
            h = json.loads(cfg.value)

        return [mh.registry.name_to_hash[name] for name in h]

    def _cfg_hash_functions_set(self, s: sao.Session, hashes: list[mh.HashFunction]):
        # TODO: not used yet
        if (cfg := s.get(mo.DedupConfig, "hashes")) is None:
            cfg = mo.DedupConfig(key="hashes", value="")
        cfg.value = json.dumps([h.name for h in hashes])

    def _make_dedup_file(self, link: _ImplDedupLinkRequest, pending=None):
        f = mo.Hash.from_digest
        return mo.DedupFile(
            file_metadata=link.metadata_bytes,
            size=0,
            mtime=0,
            orphaned_at=None,
            pending=pending,
            hashes=[f(h) for h in link.hashes_promised.values()],
        )

    def _add_tags_to_file(self, session: sao.Session, file: mo.DedupFile, tags: ty.Set[bytes]):
        if not tags:
            return

        Tag = sao.aliased(mo.Tag)
        current_tags = frozenset(
            session.execute(sa.select(Tag.name).where(Tag.file == file)).scalars().all()
        )
        for name in tags - current_tags:
            session.add(mo.Tag(name=name, file=file))

    def _prepare_dedup_file_for_linking(
        self, session: sao.Session, file: mo.DedupFile, link: _ImplDedupLinkRequest
    ):
        if link.is_new:
            # We need to flush so that the DedupFile gets assigned an ID. The merge below needs it.
            session.flush()

        # We add our tags.
        self._add_tags_to_file(session, file, link.req.tags)

        # Delete any existing link.
        session.connection().execute(
            sa.delete(mo.Link)
            .where(mo.Link.link_path == link.link_path_str)
            .execution_options(synchronize_session=False)
        )

        # Create link object.
        session.add(mo.Link(link_path=link.link_path_str, file=file))

        # Since we created a link, the file is definitely not orphaned.
        file.orphaned_at = None

        # This also relies on the flush above.
        link.dedup_file_path = self._make_dedup_file_path(file.id)

    def run_batch(self, requests: ty.Iterable[DedupRequest]) -> None:
        """
        Link and/or delete many files using batching for efficiency. If the
        :attr:`DedupLinkRequest.file_contents_hash` attribute is ``None``, then write the file hash to it.

        The requests will be addressed in the order that they appear in the iterable.

        Notes
        -----

        The implementation tries to spend as little time as possible inside database transactions.

        1. Search database for existing deduplicated files that can be reused. These are files
           that match either the hash or one of the tags.
        2. Create a record for each new deduplicated file. Create a Pending
        3.

        NEW IDEA FIXME
        --------------

        Split into fast path and slow path. If it's a brand new file OR it's an existing file that
        is done being written (not pending), then that's the fast path. Otherwise it's the slow
        path.

        On the *fast path* we don't need to check for what other threads are doing.

        """

        links = []
        copies = []
        # unlinks = []
        for i, req in enumerate(requests):
            if isinstance(req, DedupLinkRequest):
                links.append(_ImplDedupLinkRequest(req=req, index=i))
            elif isinstance(req, DedupCopyLinkRequest):
                copies.append(_ImplDedupCopyLinkRequest(req=req, index=i))
            else:
                raise TypeError(f"{type(req)!r}")

        if links and copies:
            # We don't do this yet because a copy request could be interfering with a link request
            # by having the same source or destination link.
            raise AssertionError(
                "doing both links and copies in the same batch is not supported for now"
            )

        # Preliminaries to do before we start writing to the database.
        all_tags: set[bytes] = set()
        hashes_to_search: list[dict] = []
        with self._SessionR() as s:
            for link in links:
                with self._ignore_skip(), self._catch_req_exc(link):
                    req = link.req
                    link.link_path_str = self._link_path_to_string(req.link_path)
                    # Remove existing file if present. This may raise if the path is actually a
                    # directory.
                    req.link_path.unlink(missing_ok=True)

                    all_tags |= req.tags

                    link.metadata_bytes = self.convert_file_metadata_to_bytes(req.file_metadata)

                    if (h := req.file_contents_hash) is not None:
                        link.lookup_key = h, link.metadata_bytes
                        d = {
                            "id": link.index,
                            "hash_function": h.function.function_code,
                            "digest": h.digest,
                            "metadata_bytes": link.metadata_bytes,
                        }
                        hashes_to_search.append(d)
                        link.hashes_promised = {h.function: h}
                    else:
                        link.hashes_promised = {}

            for copy in copies:
                with self._ignore_skip(), self._catch_req_exc(copy):
                    req = copy.req
                    copy.src_str = self._link_path_to_string(req.src)
                    copy.dst_str = self._link_path_to_string(req.dst)

        def _q_gather_file_related(s, cls, attribute, values_set):
            """
            Query DedupFile-related information.
            """
            if not values_set:
                return ()  # short-cut to avoid doing the query at all
            Related = sao.aliased(cls)
            q = sa.select(Related).where(getattr(Related, attribute).in_(values_set))
            q = q.options(sao.joinedload(Related.file))
            return s.execute(q).scalars()

        # Now we check the database and add file hash records where we can.
        with self._beginw() as s:
            s.add(pending := mo.Pending(expire_at=mo.now() + 30.0))
            s.flush()
            pending_key = IdKey.from_instance(pending)

            # Load relevant tags.
            q = _q_gather_file_related(s, mo.Tag, "name", all_tags)
            tag_to_file: dict[bytes, mo.DedupFile] = {x.name: x.file for x in q}

            # Load relevant hashes.
            if hashes_to_search:
                with temporary_table(s, mo.tmp_hash_lookup) as tmp:
                    s.connection().execute(sa.insert(tmp), hashes_to_search).close()
                    H = sao.aliased(mo.Hash)
                    F = sao.aliased(mo.DedupFile)
                    q = (
                        sa.select(H, F)
                        .join(F, H.file)
                        .join(
                            tmp,
                            (tmp.c.digest == H.hash)
                            & (tmp.c.hash_function == H.hash_function)
                            & (tmp.c.metadata_bytes == F.file_metadata),
                        )
                    )
                    hash_to_file = {
                        (h.to_digest(), f.file_metadata): f for h, f in s.execute(q).all()
                    }
            else:
                hash_to_file = {}

            # Construct a set so that we can check for intersection quickly.
            tag_to_file_set = set(tag_to_file)

            for link in links:
                if link.failed:
                    continue

                req = link.req

                if overlap := req.tags & tag_to_file_set:
                    # We found a deduped file with a common alternate key! We use it!
                    file = tag_to_file[next(iter(overlap))]
                elif (key := link.lookup_key) is not None:
                    # Check for a deduped file with the same hash.
                    file = hash_to_file.get(key, None)
                else:
                    file = None

                if file is None:
                    # We did not find a matching file. We create a new one if we can.
                    link.is_new = True
                    link.fast_path = True

                    if req.open_file_once is None:
                        # The user does not actually have the contents of the file. We skip over
                        # it.
                        link.set_failed(MissingContentError())
                        continue

                    # We must create a file.
                    s.add(file := self._make_dedup_file(link, pending))
                elif file.pending_id is None:
                    # We found a matching file and it is not pending. We can use it directly.
                    link.fast_path = True
                else:
                    # If the file is still in a pending state, the hashes and tags are unreliable.
                    # The file might fail to be written, the hashes might be invalid, etc. We must
                    # use the slow path and wait for the file to become ready.
                    link.fast_path = False
                    file = None

                if link.fast_path:
                    self._prepare_dedup_file_for_linking(s, file, link)
                    if link.is_new:
                        # If the same file shows up later in the batch, ensure that it is used.
                        for v in link.hashes_promised.values():
                            hash_to_file[v, file.file_metadata] = file

                # the _prepare_dedup_file_for_linking caused a flush, so our primary key is ready
                if file is not None:
                    link.file = IdKey.from_instance(file)

            L = sao.aliased(mo.Link)
            q = sa.select(L).where(
                (L.link_path == sa.bindparam("x_src")) | (L.link_path == sa.bindparam("x_dst"))
            )
            for copy in copies:
                with self._ignore_skip(), self._catch_req_exc(copy):
                    link_objs = {
                        x.link_path: x
                        for x in s.execute(q, {"x_src": copy.src_str, "x_dst": copy.dst_str})
                        .scalars()
                        .all()
                    }

                    if (src_link := link_objs.get(copy.src_str)) is None:
                        raise NotADedupLinkError

                    if (dst_link := link_objs.get(copy.dst_str)) is not None:
                        s.delete(dst_link)

                    copy.dedup_file_path = self._make_dedup_file_path(src_link.file_id)
                    s.add(mo.Link(file_id=src_link.file_id, link_path=copy.dst_str))
                    s.flush()
            del q, L

            pending.expire_at = mo.now() + 30.0

        del hash_to_file, tag_to_file, tag_to_file_set, pending

        to_be_flushed = []
        failed_requests = []

        def _flush_now(s: sao.Session):
            for link in to_be_flushed:
                file: mo.DedupFile | None = None if (f := link.file) is None else f.get(s)

                if link.failed or file is None:
                    failed_requests.append(link.req)
                    if file is not None:
                        s.delete(file)
                    continue

                if (size := link.file_size) is not None:
                    file.size = size
                if (mtime := link.file_mtime) is not None:
                    file.mtime = mtime

                # We need to add whatever extra hashes were computed.
                if d := link.hashes_computed:
                    already_in_db = link.hashes_promised
                    for k, v in d.items():
                        if k not in already_in_db:
                            s.add(mo.Hash.from_digest(v, file=file))

                # We checked the hashes (if any), the file contents are written, and the link
                # (if any) has been created. We are therefore ready to set the "file.pending"
                # column to NULL, thus marking the dedup file as finalized.
                file.pending = None

            to_be_flushed.clear()

        for copy in copies:
            with self._ignore_skip(), self._catch_req_exc(copy):
                self._delete_file(copy.req.dst)
                self._create_actual_link(copy.dedup_file_path, copy.req.dst)

        if links:
            # Now we write the file data without holding the database transaction open. The
            # "_PendingUpdater" ensures that other threads know that we're working.
            with self._PendingUpdater(
                pending=pending_key,
                sessionmaker_r=self._SessionR,
                sessionmaker_w=self._SessionW,
                seconds_in_the_future=20,
            ) as pu:
                for link in links:
                    with self._ignore_skip(), self._catch_req_exc(link):
                        if not link.fast_path:
                            with self._beginw() as s:
                                _flush_now(s)
                            self._slow_path_wait_for_dedup_file(link=link, pending=pending_key)

                        self._write_dedup_file_contents(link=link)
                        to_be_flushed.append(link)
                pu.update_on_exit = True

        with self._beginw() as s:
            _flush_now(s)

            # Delete Pending object along with any DedupFile objects that had errors in them
            # using the "ON DELETE CASCADE".
            s.delete(pending_key.get_one(s))

        for link in links:
            link.req.success = not link.failed

        if copies:
            for copy in copies:
                copy.req.success = not copy.failed
                if not copy.req.success:
                    failed_requests.append(copy.req)

        if failed_requests:
            first_exc = failed_requests[0].exc
            raise BatchError(requests=failed_requests) from first_exc

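    # Usage sketch (illustrative): running a batch and inspecting per-request outcomes.
    # `dedup` and `requests` are hypothetical placeholders; a batch holds either
    # DedupLinkRequest objects or DedupCopyLinkRequest objects, not both.
    #
    #     try:
    #         dedup.run_batch(requests)
    #     except BatchError as exc:
    #         logger.warning("some requests failed", failed=len(exc.requests or []))
    #     for req in requests:
    #         try:
    #             req.result()  # returns the success flag, re-raises the per-request exception
    #         except MissingContentError:
    #             pass  # open_file_once was None and no existing dedup file matched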
    def _make_dedup_file_path(self, file_id: int) -> Path:
        return self._path_dedup / self._integer_to_path(file_id)

    def _write_file_computing_hashes(
        self, target: Path, open1, hashes: ty.Iterable[mh.HashFunction]
    ) -> tuple[int, dict[mh.HashFunction, mh.Digest]]:
        target.parent.mkdir(exist_ok=True, parents=True)
        m = mh.MultiHasher({f: f() for f in hashes})
        with target.open("wb") as f_w, open1() as f_r:
            while block := f_r.read(65536):
                m.update(block)
                f_w.write(block)
        return m.size, m.digest()

    def _write_dedup_file_contents(self, link: _ImplDedupLinkRequest) -> None:
        if link.is_new:
            if link.req.open_file_once is None:
                link.call_file_not_needed()
                return

            p = link.dedup_file_path
            (fs := set(link.hashes_promised)).update(self.extra_hashes)
            link.file_size, d = self._write_file_computing_hashes(p, link.call_open_file_once, fs)
            self.apply_metadata_to_file(p, link.req.file_metadata)
            link.file_mtime = int(p.stat().st_mtime)
            link.hashes_computed = d

            # Check that the hashes match what was claimed inside the link request.
            computed = {k: d[k] for k in link.hashes_promised}
            if link.hashes_promised != computed:
                p.unlink(missing_ok=True)
                raise InvalidContentsError(
                    link_request=link.req,
                    hashes_expected=link.hashes_promised,
                    hashes_observed=computed,
                )
        else:
            # existing file - we don't need to do anything
            link.call_file_not_needed()

            # TODO: quickly check whether the file mtime matches and check the content hash if not

        self._create_actual_link(link.dedup_file_path, link.req.link_path)

    def _slow_path_wait_for_dedup_file(
        self, link: _ImplDedupLinkRequest, pending: IdKey[mo.Pending]
    ) -> None:
        """
        The file we are interested in is actively being written to by another thread. We need to
        wait for it to be finished or for the other thread to fail.

        Either way, we add the required data to the database such that we can continue with the
        fast path procedure after this method returns.
        """

        # Construct query which looks for a DedupFile matching hashes or overlapping tags.
        F = sao.aliased(mo.DedupFile)
        H = sao.aliased(mo.Hash)
        T = sao.aliased(mo.Tag)

        def _exists(Alias):
            return sa.exists().select_from(Alias).where(Rel(Alias.file) == F)

        q = sa.select(F)
        for v in link.hashes_promised.values():
            q = q.where(_exists(H).where(H.compare_digest() == v))
        if link.req.tags:
            q = q.where(_exists(T).where(T.name.in_(link.req.tags)))
        q = q.options(sao.joinedload(F.pending))

        def _check(s: sao.Session) -> mo.DedupFile | bool:
            for x in s.execute(q).scalars():
                x: mo.DedupFile
                if x.pending is None:
                    # We found a finished DedupFile we can use directly.
                    return x
                elif x.pending_id == pending.key[0]:
                    # It's already our dedupfile!!!
                    raise AssertionError("deadlock")
                elif x.pending.expire_at >= mo.now():
                    # We found an in-progress DedupFile, so we stand down and continue polling.
                    return False

            # There are no matching DedupFile objects, so we can create a new one ourselves.
            return True

        def _wait_first_time():
            nonlocal _wait
            _wait = _wait_normal

        def _wait_normal():
            time.sleep(2)

        _wait = _wait_first_time
        while True:
            _wait()

            with self._SessionR() as s:  # check using a read-only transaction
                result = _check(s)
            if result is False:
                continue

            with self._beginw() as s:  # use a write transaction
                result = _check(s)
                if result is False:
                    continue

                if result is True:
                    # We need to create a new DedupFile
                    s.add(file := self._make_dedup_file(link, pending.get_one(s)))
                    link.is_new = True
                else:
                    file = result
                    link.is_new = False

                link.fast_path = True
                self._prepare_dedup_file_for_linking(s, file, link)

                # we can only do this after the flush
                link.file = IdKey.from_instance(file)

            break

    @property
    def _PendingUpdater(self):
        return _PendingUpdater

    @abc.abstractmethod
    def _create_actual_link(self, existing: Path, new: Path): ...

    @abc.abstractmethod
    def _adopt_file_and_link(self, existing_path: Path, dedup_file_path: Path): ...

    @abc.abstractmethod
    def _verify_link(self, link: mo.Link) -> bool: ...

    def _pre_delete_links(self, path: Path):
        """
        Delete link records for all paths under *path*. Note that you must still delete the actual
        files, for example using rmtree.
        """
        self._check_links(path, True)

    def check_links(self, path: Path | None = None) -> None:
        """
        Detect links that were removed from the filesystem.

        If *path* is provided, then only traverse files under *path*. If the *path* does not exist,
        that means that everything under that *path* is gone.
        """
        self._check_links(path, False)

    def _check_links(self, path: Path | None, pre_delete: bool) -> None:
        F = sao.aliased(mo.DedupFile)
        L = sao.aliased(mo.Link)

        _verify_link = self._verify_link

        prefix = None
        if path is not None:
            exact_path = self._link_path_to_string(path)
            prefix = self._link_path_to_string(path / "x")[:-1]

            if pre_delete or not path.exists():
                # FAST PATH: Entire directory is gone, so all of its contents are gone. No need to
                # do any checking.
                _verify_link = lambda link: False

        q = sa.select(L).order_by(L.link_path).options(sao.joinedload(L.file))
        q = q.limit(self._batch_size)
        if prefix is not None:
            q = q.where((L.link_path == exact_path) | bytes_startswith(L.link_path, prefix))

        with self._SessionR() as s:
            last_link_path: str | None = None
            while True:
                if last_link_path is None:
                    q2 = q
                else:
                    q2 = q.where(L.link_path > last_link_path)

                results: list[mo.Link] = s.execute(q2).scalars().all()
                if not results:
                    break

                to_delete = []
                for link in results:
                    if not _verify_link(link):
                        to_delete.append(link.link_path)

                if to_delete:
                    with self._beginw() as s2, temporary_table(
                        s2, mo.tmp_bytes
                    ) as t_links, temporary_table(s2, mo.tmp_ints) as t_files:
                        s2.connection().execute(
                            sa.insert(t_links), [{"id": x} for x in to_delete]
                        ).close()

                        # These are the DedupFile entries that may end up orphaned.
                        s2.connection().execute(
                            sa.insert(t_files).from_select(
                                [t_files.c.id],
                                sa.select(F.id)
                                .distinct()
                                .select_from(L)
                                .join(F, L.file)
                                .join(t_links, t_links.c.id == L.link_path),
                            )
                        ).close()

                        # Remove the links that have been deleted.
                        s2.connection().execute(
                            sa.delete(L).where(L.link_path.in_(sa.select(t_links.c.id))),
                        ).close()

                        # Detect newly-orphaned files.
                        s2.connection().execute(
                            F.make_update_orphaned().where(F.id.in_(sa.select(t_files.c.id)))
                        ).close()

                last_link_path = results[-1].link_path

    def update_all_orphaned(self):
        with self._beginw() as s:
            F = sao.aliased(mo.DedupFile)
            s.connection().execute(F.make_update_orphaned()).close()

    def garbage_collect_dedup_files(self, min_age_seconds: int) -> None:
        """
        Remove dedup files that have no links to them as well as dedup files that were left behind
        by a failed batch of content insertion.
        """
        cutoff = mo.now() - min_age_seconds
        pending_cutoff = 7200
        F = sao.aliased(mo.DedupFile)
        P = sao.aliased(mo.Pending)
        q = sa.select(F).options(sao.selectinload(F.links)).limit(self._batch_size).order_by(F.id)
        q1 = q.where(F.orphaned_at != None, F.orphaned_at <= cutoff)
        q2 = q.join(P, F.pending).where(P.expire_at <= pending_cutoff)
        self._garbage_collect_using_query(q1, F)
        self._garbage_collect_using_query(q2, F)

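    # Maintenance sketch (illustrative): a periodic cleanup pass might chain the public
    # helpers defined in this class. `dedup` is a hypothetical placeholder.
    #
    #     dedup.check_links()                            # drop link rows whose files vanished
    #     dedup.garbage_collect_dedup_files(min_age_seconds=24 * 3600)
    #     dedup.garbage_collect_deleted()                # empty the "deleted" holding area
    #     dedup.garbage_collect_extra_files()            # slow: walks the whole dedup store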
    def _garbage_collect_using_query(self, q, F):
        F1 = sao.aliased(mo.DedupFile)
        while True:
            with self._beginw() as s:
                files: list[mo.DedupFile] = s.scalars(q).all()
                if not files:
                    break
                s.expunge_all()  # remove DedupFile objects from session
                s.connection().execute(sa.delete(F1).where(F1.id.in_(q.with_only_columns(F.id))))

            for file in files:
                for link in file.links:
                    self._delete_file(link._link_path_from_string(link.link_path))
                self._delete_file(self._make_dedup_file_path(file.id))

    def garbage_collect_deleted(self):
        """
        Delete unused temporary directories created with :meth:`.temporary_directory` as well as
        files that could not be deleted previously (due to locking on Windows, for example).
        """

        # We must ALWAYS lock self._path_temporary_master_lock before attempting to create or delete
        # a child lock file inside self._path_temporary_lock.
        for q in self._path_temporary_lock.iterdir():
            with contextlib.ExitStack() as ex:
                # Holding the master lock, we check the timestamp of the child lock and, if it's old
                # enough, we lock it.
                with self._filelock(self._path_temporary_master_lock, blocking=True):
                    if q.lstat().st_mtime >= mo.now() - 3600:
                        continue

                    try:
                        ex.enter_context(self._filelock(q, blocking=False))
                    except filelock.Timeout:
                        continue  # it's still locked, leave it alone

                # We release the master lock as we don't need it anymore.

                # Still holding the child lock, delete the corresponding temporary dir.
                self.delete_tree(self._path_temporary_dirs / q.name)

            # Holding the master lock, finally delete the child lock.
            with self._filelock(self._path_temporary_master_lock, blocking=True):
                try:
                    with self._filelock(q, blocking=False):
                        pass
                except filelock.Timeout as exc_:
                    pass  # another thread chose the same name and locked it, leave it alone
                else:
                    self._remove_file_or_dir(q, ignore_errors=True)

        for p in self.path_deleted.iterdir():
            self._remove_file_or_dir(p, ignore_errors=True)

    def _remove_file_or_dir(self, p: Path, ignore_errors: bool):
        try:
            p.unlink()
        except Exception:
            if not p.exists():
                pass  # mission (already) accomplished
            elif stat.S_ISDIR(p.lstat().st_mode):
                shutil.rmtree(str(p), ignore_errors=ignore_errors)
            elif not ignore_errors:
                raise

    def garbage_collect_extra_files(self):
        """
        Look for files in the dedup directory that were left behind due to errors or unexpected
        shutdown. Delete such files.

        This recursively lists every file in the dedup store, so it takes a long time.
        """
        F = sao.aliased(mo.DedupFile)
        i2p = self._integer_to_path
        cutoff = mo.now() - 3600

        base = self._path_dedup
        for chunk in chunked_iter(base.rglob("*"), self._batch_size):
            to_be_unlinked = []
            file_ids = {}
            for p in chunk:
                if not p.is_file():
                    continue

                try:
                    file_id = i2p.invert("/".join(p.relative_to(base).parts))
                except InvalidPathError:
                    if p.stat().st_mtime < cutoff:
                        to_be_unlinked.append(p)
                    continue

                file_ids[file_id] = p

            if file_ids:
                # We use a write transaction to avoid a race condition between checking that a path
                # does not contain a valid file ID and then later deleting that file outside the
                # transaction.
                with self._SessionW() as s, temporary_table(s, mo.tmp_ints) as tmp:
                    s.execute(sa.insert(tmp), [{"id": x} for x in file_ids]).close()
                    tmp_ = sa.alias(tmp)
                    bad_file_ids = (
                        s.execute(
                            sa.select(tmp_.c.id).where(
                                ~sa.exists().select_from(F).where(F.id == tmp_.c.id)
                            )
                        )
                        .scalars()
                        .all()
                    )
                    for file_id in bad_file_ids:
                        self._delete_file(file_ids[file_id])

            for p in to_be_unlinked:
                self._delete_file(p)

    def corrupted_list(self) -> ty.Generator[Corrupted]:
        """
        Get the list of corrupted files found using :meth:`integrity_check`.
        """
        for p in self.path_corrupted.glob("*.json"):
            d = json.loads(p.read_bytes())
            yield Corrupted(
                path=bin_path if (bin_path := p.with_suffix(".bin")).exists() else None,
                file_id=d["file_id"],
                exception=d["exception"],
                link_paths=frozenset(d["link_paths"]),
                raw_link_paths=frozenset(d["raw_link_paths"]),
            )

    def corrupted_clear(self):
        """
        Delete all corrupted files.
        """
        for glob in ["*.bin", "*.json"]:
            for p in self.path_corrupted.glob(glob):
                self._delete_file(p)

    @staticmethod
    def _copy_tree_default_fallback(src: Path, dst: Path):
        shutil.copy2(str(src), str(dst), follow_symlinks=False)

    def copy_tree(self, src: Path, dst: Path, fallback_copy=None) -> None:
        if fallback_copy is None:
            fallback_copy = self._copy_tree_default_fallback
        if dst.exists():
            raise AssertionError("dst must not exist")
        self.check_links(dst)

        def _run():
            self.run_batch(to_copy)
            for req in to_copy:
                try:
                    req.result()
                except NotADedupLinkError:
                    fallback_copy(req.src, req.dst)
            to_copy.clear()

        if src.is_dir():
            to_copy = []
            for root, dirs, files in pathwalk(src):
                root_dst = dst / root.relative_to(src)
                root_dst.mkdir(exist_ok=True, parents=True)
                for f in files:
                    to_copy.append(DedupCopyLinkRequest(src=root / f, dst=root_dst / f))
                if len(to_copy) > 1000:
                    _run()
        else:
            # must be a file
            to_copy = [DedupCopyLinkRequest(src=src, dst=dst)]

        if to_copy:
            _run()

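    # Usage sketch (illustrative): copy_tree() re-links every deduplicated file found under
    # `src` and falls back to a plain copy for paths that are not dedup links. `dedup`, `src`
    # and `dst` are hypothetical placeholders; `dst` must not exist yet.
    #
    #     dedup.copy_tree(src, dst)
    #     # or, with a custom fallback for non-dedup files:
    #     dedup.copy_tree(src, dst, fallback_copy=lambda a, b: shutil.copyfile(a, b))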
    def delete_tree(self, p: Path) -> None:
        def f(func, path, exc_info):
            if (p := Path(path)).exists():
                self._move_to_deleted(p)

        shutil.rmtree(str(p.absolute()), onerror=f)
        if p.exists():
            self._move_to_deleted(p)
        self.check_links(p)

    def delete_file(self, p: Path) -> None:
        self._delete_file(p)
        self.check_links(p)

    def _delete_file(self, p: Path) -> None:
        """
        On Windows, a locked file cannot be deleted. So instead we move it out of the way to a
        different directory in the hopes of deleting it later when it's not locked.
        """
        try:
            p.unlink(missing_ok=True)
        except OSError:
            if not p.exists() or p.is_dir():
                raise
        else:
            return

        self._move_to_deleted(p)

    def _move_to_deleted(self, p: Path) -> None:
        base = self.path_deleted
        for name in random_names("", ".bin"):
            try:
                p.rename(base / name)
            except OSError as exc:
                exc_ = exc
            else:
                return

        raise exc_

    def _filelock(self, path: Path, blocking: bool):
        return filelock.FileLock(path, blocking=blocking)

    @property
    def _path_temporary_dirs(self):
        return self.path_temporary / "dirs"

    @property
    def _path_temporary_lock(self):
        return self.path_temporary / "lock"

    @property
    def _path_temporary_master_lock(self):
        return self.path_temporary / "master.lock"

    @contextlib.contextmanager
    def temporary_directory(self, prefix="tmp_", suffix=""):
        exc = None
        for name in random_names(prefix=prefix, suffix=suffix):
            p = self._path_temporary_dirs / name
            q = self._path_temporary_lock / name

            # We must always acquire the master lock before acquiring a child lock. The order must
            # be consistent in order to prevent deadlocks.
            with contextlib.ExitStack() as ex:
                with self._filelock(self._path_temporary_master_lock, blocking=True):
                    try:
                        ex.enter_context(self._filelock(q, blocking=False))
                    except filelock.Timeout as exc_:
                        continue  # try a different name

                # We now release the master lock because we don't need it any more.

                try:
                    p.mkdir(parents=True)
                except OSError as exc_:
                    exc = exc_
                    continue

                try:
                    yield p
                    break
                finally:
                    self.delete_tree(p)

                # Release the lock file. We will attempt to delete it next.
                ex.close()

            # Attempt to delete the lock file.
            with self._filelock(self._path_temporary_master_lock, blocking=True):
                try:
                    with self._filelock(q, blocking=False):
                        pass
                except filelock.Timeout as exc_:
                    pass  # another thread chose the same name and locked it, leave it alone
                else:
                    self._remove_file_or_dir(q, ignore_errors=True)
        else:
            raise AssertionError("retry count exceeded, unknown cause") if exc is None else exc

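    # Usage sketch (illustrative): temporary_directory() hands out a lock-protected scratch
    # directory under path_temporary and deletes it (via delete_tree) when the block exits,
    # even on error. `dedup` is a hypothetical placeholder.
    #
    #     with dedup.temporary_directory(prefix="unpack_") as tmp_dir:
    #         (tmp_dir / "scratch.bin").write_bytes(b"...")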
    @cached_property
    def _q_get_hash(self):
        L = sao.aliased(mo.Link)
        F = sao.aliased(mo.DedupFile)
        H = sao.aliased(mo.Hash)
        return (
            sa.select(L, H, F.size)
            .select_from(L)
            .join(F, L.file)
            .outerjoin(H, (Rel(H.file) == F) & (H.hash_function == sa.bindparam("x_hf")))
            .options(sao.contains_eager(L.file.of_type(F)))
            .where(L.link_path == sa.bindparam("x_link_path"), F.pending == None)
        )

    def _query_by_link_path(
        self, s: sao.Session, link_path: bytes, hash_function: mh.HashFunction
    ) -> list[tuple[mo.Link, mo.Hash, int]]:
        return s.execute(
            self._q_get_hash,
            {"x_link_path": link_path, "x_hf": hash_function.function_code},
        ).all()

    def get_file_hash(
        self, hash_function: mh.HashFunction, path: Path, check_link: bool
    ) -> tuple[int, mh.Digest] | None:
        """
        Query the database to obtain the file contents hash of file at *path*. Return None if the
        file is not in the dedup database. If *check_link* is True, then check that the link is
        intact before returning the hash. If the link is damaged or removed, then call
        :meth:`check_links` to unregister the link then return None.
        """
        with self._SessionR() as s:
            link_path: bytes = self._link_path_to_string(path)
            links = self._query_by_link_path(s, link_path, hash_function)

            if not links:
                return None

            link, h, size = links[0]
            if h is None:
                return None

            if not (check_link and not self._verify_link(link)):
                return size, h.to_digest()

        self.check_links(path)
        return None

    def get_or_compute_file_hash(
        self, hash_function: mh.HashFunction, path: Path, **kw
    ) -> tuple[int, mh.Digest] | None:
        r = self.get_file_hash(hash_function, path, **kw)
        if r is None:
            hasher = hash_function()
            size = 0
            with path.open("rb") as f:
                while block := f.read(65536):
                    size += len(block)
                    hasher.update(block)
            r = size, hasher.digest()
        return r

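    # Usage sketch (illustrative): querying a content hash through the dedup database first,
    # falling back to hashing the file on disk. `dedup` and `some_path` are hypothetical.
    #
    #     sha256 = mh.registry.name_to_hash["sha2-256"]
    #     cached = dedup.get_file_hash(sha256, some_path, check_link=True)  # None if unknown
    #     size, digest = dedup.get_or_compute_file_hash(sha256, some_path, check_link=True)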
    def adopt_files(
        self, hash_function: mh.HashFunction, requests: ty.Iterable[AdoptRequest]
    ) -> None:
        """
|
|
1373
|
+
Adopt each file given in *paths*. If the path is already a dedup link, then leave it
|
|
1374
|
+
alone. If the path is not a dedup link, then compute its hash and move the file to the
|
|
1375
|
+
dedup store and create a link to it. If the path is already a dedup link but does not
|
|
1376
|
+
have the right kind of hash digest, then compute the hash digest and store it in the
|
|
1377
|
+
database.
|
|
1378
|
+
|
|
1379
|
+
This method is implemented in a somewhat inefficient way.
|
|
1380
|
+
"""
|
|
1381
|
+
reqs = [_ImplAdoptRequest(req) for req in requests]
|
|
1382
|
+
|
|
1383
|
+
# first use a read-only session while we compute file hashes
|
|
1384
|
+
with self._SessionR() as s:
|
|
1385
|
+
for x in reqs:
|
|
1386
|
+
x.link_path = self._link_path_to_string(x.req.path)
|
|
1387
|
+
existing = self._query_by_link_path(s, x.link_path, hash_function)
|
|
1388
|
+
if existing:
|
|
1389
|
+
l, h, sz = existing[0]
|
|
1390
|
+
if h is not None:
|
|
1391
|
+
x.req.out_digest = h.to_digest()
|
|
1392
|
+
x.req.out_size = sz
|
|
1393
|
+
x.done = True
|
|
1394
|
+
|
|
1395
|
+
if not x.done:
|
|
1396
|
+
with open(x.req.path, "rb") as f:
|
|
1397
|
+
h = hash_function()
|
|
1398
|
+
size = 0
|
|
1399
|
+
while block := f.read(65536):
|
|
1400
|
+
h.update(block)
|
|
1401
|
+
size += len(block)
|
|
1402
|
+
x.req.out_digest = h.digest()
|
|
1403
|
+
x.file_metadata = DedupFileMetadata(executable=False) # TODO
|
|
1404
|
+
x.req.out_size = size
|
|
1405
|
+
x.file_metadata_bytes = self.convert_file_metadata_to_bytes(x.file_metadata)
|
|
1406
|
+
|
|
1407
|
+
F = sao.aliased(mo.DedupFile)
|
|
1408
|
+
H = sao.aliased(mo.Hash)
|
|
1409
|
+
q = (
|
|
1410
|
+
sa.select(F)
|
|
1411
|
+
.join(H, F.hashes)
|
|
1412
|
+
.where(
|
|
1413
|
+
H.hash_function == sa.bindparam("x_hf"),
|
|
1414
|
+
H.hash == sa.bindparam("x_h"),
|
|
1415
|
+
F.pending == None,
|
|
1416
|
+
F.file_metadata == sa.bindparam("x_f_meta"),
|
|
1417
|
+
)
|
|
1418
|
+
)
|
|
1419
|
+
|
|
1420
|
+
# then we use a RW session to update the database
|
|
1421
|
+
with self._beginw() as s:
|
|
1422
|
+
for x in reqs:
|
|
1423
|
+
if x.done:
|
|
1424
|
+
continue
|
|
1425
|
+
|
|
1426
|
+
# re-check for an existing link
|
|
1427
|
+
existing = self._query_by_link_path(s, x.link_path, hash_function)
|
|
1428
|
+
if existing:
|
|
1429
|
+
l, h, sz = existing[0]
|
|
1430
|
+
file = l.file
|
|
1431
|
+
if h is None:
|
|
1432
|
+
s.add(mo.Hash.from_digest(x.req.out_digest, file=file))
|
|
1433
|
+
else:
|
|
1434
|
+
# never mind, nothing to do here
|
|
1435
|
+
x.req.out_size = sz
|
|
1436
|
+
x.req.out_digest = h.to_digest()
|
|
1437
|
+
x.done = True
|
|
1438
|
+
continue
|
|
1439
|
+
else:
|
|
1440
|
+
# try to lookup by digest first
|
|
1441
|
+
# TODO: also look up by tag
|
|
1442
|
+
files = (
|
|
1443
|
+
s.execute(
|
|
1444
|
+
q,
|
|
1445
|
+
dict(
|
|
1446
|
+
x_hf=hash_function.function_code,
|
|
1447
|
+
x_h=x.req.out_digest.digest,
|
|
1448
|
+
x_f_meta=x.file_metadata_bytes,
|
|
1449
|
+
),
|
|
1450
|
+
)
|
|
1451
|
+
.scalars()
|
|
1452
|
+
.all()
|
|
1453
|
+
)
|
|
1454
|
+
if files:
|
|
1455
|
+
file = files[0]
|
|
1456
|
+
else:
|
|
1457
|
+
file = None
|
|
1458
|
+
if file is not None:
|
|
1459
|
+
file.orphaned_at = None
|
|
1460
|
+
x.delete = True
|
|
1461
|
+
else:
|
|
1462
|
+
# no existing file found, need to create one
|
|
1463
|
+
file = mo.DedupFile(
|
|
1464
|
+
file_metadata=x.file_metadata_bytes,
|
|
1465
|
+
size=x.req.out_size,
|
|
1466
|
+
mtime=int(x.req.path.stat().st_mtime),
|
|
1467
|
+
orphaned_at=None,
|
|
1468
|
+
pending=None,
|
|
1469
|
+
hashes=[mo.Hash.from_digest(x.req.out_digest)],
|
|
1470
|
+
)
|
|
1471
|
+
s.add(file)
|
|
1472
|
+
s.flush() # we need to make sure the file has an ID
|
|
1473
|
+
|
|
1474
|
+
s.add(mo.Link(link_path=x.link_path, file=file))
|
|
1475
|
+
|
|
1476
|
+
x.dedup_file_path = self._make_dedup_file_path(file.id)
|
|
1477
|
+
|
|
1478
|
+
# We add our tags.
|
|
1479
|
+
self._add_tags_to_file(s, file, x.req.tags)
|
|
1480
|
+
|
|
1481
|
+
s.flush()
|
|
1482
|
+
|
|
1483
|
+
# and finally we make filesystem changes
|
|
1484
|
+
for x in reqs:
|
|
1485
|
+
if (dst := x.dedup_file_path) is not None:
|
|
1486
|
+
if x.delete:
|
|
1487
|
+
# We already have a DedupFile with the required contents, so we replace the
|
|
1488
|
+
# link_path file with a link to that existing DedupFile.
|
|
1489
|
+
self._delete_file(x.req.path)
|
|
1490
|
+
self._create_actual_link(dst, x.req.path)
|
|
1491
|
+
else:
|
|
1492
|
+
dst.parent.mkdir(exist_ok=True, parents=True)
|
|
1493
|
+
self._adopt_file_and_link(x.req.path, dst)
|
|
1494
|
+
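At its core, adopt_files is content addressing: hash the file, reuse a stored copy with the same digest and metadata if one exists, otherwise move the file into the store, and in either case leave a hard link at the original path. A self-contained sketch of that idea using a digest-named directory in place of vocker's database and integer-to-path layout (illustrative only; it ignores file metadata and tags):

import hashlib
import os
import shutil
from pathlib import Path


def adopt(path: Path, store: Path) -> Path:
    """Move *path* into *store* keyed by its SHA-256 and hard-link it back."""
    hasher = hashlib.sha256()
    with path.open("rb") as f:
        while block := f.read(65536):
            hasher.update(block)
    dst = store / hasher.hexdigest()

    if dst.exists():
        # Identical contents already stored: drop the original, link to the store copy.
        path.unlink()
    else:
        # First time these contents are seen: move the file into the store.
        store.mkdir(parents=True, exist_ok=True)
        shutil.move(os.fspath(path), os.fspath(dst))

    os.link(dst, path)  # recreate *path* as a hard link to the stored copy
    return dst
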
    def integrity_check(
        self,
        skip_same_mtime: bool,
        threads: int | None = None,
        keep_corrupted: bool = True,
    ):
        """
        Verify all deduplicated files match their stored hashes. Use modification time to skip
        unchanged files if *skip_same_mtime* is True. Move the corrupted files to
        :attr:`path_corrupted`.
        """

        F = sao.aliased(mo.DedupFile)
        batch_size = 1000
        q = sa.select(F).options(sao.selectinload(F.hashes)).order_by(F.id).limit(batch_size)

        def _hash_check(file: mo.DedupFile) -> None:
            p = self._make_dedup_file_path(file.id)
            st_mtime = int(p.stat().st_mtime)
            if skip_same_mtime and file.mtime == st_mtime:
                return

            d = file.hashes_dict
            m = mh.MultiHasher({hf: hf() for hf in d})
            with p.open("rb") as fh:
                while block := fh.read(65536):
                    m.update(block)
            if d != (observed := m.digest()):
                raise InvalidContentsError(hashes_expected=d, hashes_observed=observed)

            # TODO: also check file metadata matches, such as the executable bit

            # The digest was the same, so update the mtime in the DB.
            with self._SessionW() as s:
                IdKey.from_instance(file).get_one(s).mtime = st_mtime

        id_min = -1
        with cf.ThreadPoolExecutor(max_workers=threads) as exe:
            while True:
                invalid_file_ids = []

                with self._SessionR() as s:
                    q2 = q.where(F.id > id_min, F.pending == None)
                    dedup_files: list[mo.DedupFile] = s.execute(q2).scalars().all()

                    if not dedup_files:
                        break

                    id_min = dedup_files[-1].id
                    futures = {exe.submit(_hash_check, f): f for f in dedup_files}
                    for future in cf.as_completed(futures):
                        if (exc := future.exception()) is not None:
                            if not isinstance(exc, Exception):
                                # Some other type of exception
                                raise exc

                            file = futures[future]
                            self._integrity_check_process_corrupt_one(s, file, exc, keep_corrupted)
                            invalid_file_ids.append(file.id)

                if invalid_file_ids:
                    with self._SessionW() as s:
                        s.connection().execute(
                            sa.delete(F).where(F.id == sa.bindparam("_id")),
                            [{"_id": x} for x in invalid_file_ids],
                        )

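integrity_check pages through the table in id order, batch_size rows at a time, and fans each batch out to a thread pool, treating any Exception raised by a worker as a corruption report. The thread-pool half of that pattern, reduced to hashlib and plain paths (a sketch of the technique, not the vocker implementation):

import concurrent.futures as cf
import hashlib
from pathlib import Path


def find_corrupted(expected: dict[Path, bytes], threads: int | None = None) -> list[Path]:
    """Return the paths whose SHA-256 digest no longer matches *expected*."""

    def check(path: Path) -> None:
        hasher = hashlib.sha256()
        with path.open("rb") as f:
            while block := f.read(65536):
                hasher.update(block)
        if hasher.digest() != expected[path]:
            raise ValueError(f"digest mismatch for {path}")

    corrupted = []
    with cf.ThreadPoolExecutor(max_workers=threads) as exe:
        futures = {exe.submit(check, p): p for p in expected}
        for future in cf.as_completed(futures):
            if future.exception() is not None:
                corrupted.append(futures[future])
    return corrupted
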
    def _integrity_check_process_corrupt_one(
        self, s: sao.Session, file: mo.DedupFile, exc: Exception, keep_corrupted: bool
    ):
        """
        Process one file that has been found to be corrupted.
        """

        path_file = self._make_dedup_file_path(file.id)

        # Load the links as we will need them
        s.refresh(file, ["links"])

        link_paths = [self._link_path_from_string(link.link_path) for link in file.links]
        json_data = {
            "file_id": file.id,
            "link_paths": [str(x) for x in link_paths],
            "raw_link_paths": [
                link.link_path.decode("utf-8", errors="replace") for link in file.links
            ],
            "exception": repr(exc),
        }

        with create_file_random(self.path_corrupted, "f_", ".json") as f:
            path_json = Path(f.name)
            f.write(json.dumps(json_data, indent=2, sort_keys=True).encode("utf-8"))

        if keep_corrupted:
            try:
                path_file.rename(path_json.with_suffix(".bin"))
            except Exception:
                if path_file.exists():
                    logger.warning(
                        "failed to rename corrupt file", exc_info=True, data=str(path_file)
                    )

        for x in link_paths:
            self._delete_file(x)

    class _compute_stats_ZeroRow:
        orphaned = None
        count = 0
        size = 0

    def compute_stats(self) -> DedupStats:
        with self._SessionR() as s:
            F = sao.aliased(mo.DedupFile)
            L = sao.aliased(mo.Link)
            orph = F.orphaned_at != None

            q = (
                sa.select(
                    orph.label("orphaned"),
                    sa.func.count().label("count"),
                    sa.func.sum(F.size).label("size"),
                )
                .select_from(F)
                .where(F.pending == None)
                .group_by(orph)
            )
            file_stats = {k: self._compute_stats_ZeroRow() for k in (False, True)}
            file_stats |= {row.orphaned: row for row in s.execute(q).all()}

            q = (
                sa.select(sa.func.count().label("count"), sa.func.sum(F.size).label("size"))
                .select_from(L)
                .join(F, L.file)
            ).where(F.pending == None)
            link_stats = s.execute(q).one()

            return DedupStats(
                dedup_count=file_stats[False].count,
                dedup_total_bytes=file_stats[False].size,
                orphaned_count=file_stats[True].count,
                orphaned_total_bytes=file_stats[True].size,
                link_count=link_stats.count,
                link_total_bytes=link_stats.size or 0,
            )


class DedupBackendHardlink(Dedup):
    def _create_actual_link(self, existing: Path, new: Path):
        # Path.link_to was removed and replaced by Path.hardlink_to, but I want this to work across
        # Python 3.9 to 3.13
        os.link(str(existing), str(new))

    def _adopt_file_and_link(self, existing_path: Path, dedup_file_path: Path):
        # hard links are indistinguishable from each other
        self._create_actual_link(existing_path, dedup_file_path)

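The comment in _create_actual_link refers to the pathlib churn around hard links: Path.link_to (where the receiver is the existing file) was removed in Python 3.12, and its replacement Path.hardlink_to (where the receiver is the new link) only appeared in 3.10, so os.link is the spelling that works everywhere. A small sketch of the two portable options, with illustrative paths:

import os
import sys
from pathlib import Path

existing = Path("store/0001.bin")  # illustrative: the stored dedup file
new = Path("workdir/data.bin")     # illustrative: the user-visible link

if sys.version_info >= (3, 10):
    new.hardlink_to(existing)  # receiver is the *new* link, argument is the existing file
else:
    os.link(existing, new)     # works on every version dedup.py targets
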
    def _verify_link(self, link: mo.Link) -> bool:
        p = Path(link.link_path.decode("utf-8"))

        try:
            a = p.lstat()
        except Exception:
            return False

        if link.file.mtime != int(a.st_mtime):
            return False

        # st_ino is 0 on unsupported filesystems on Windows.

        # TODO: should we even allow st_ino=0?
        if a.st_ino != 0:
            if (file_stat := getattr(link.file, "_cached_file_stat", None)) is None:
                try:
                    file_stat = self._make_dedup_file_path(link.file.id).stat()
                except Exception:
                    return False
                link.file._cached_file_stat = file_stat

            if a.st_ino != file_stat.st_ino:
                return False

        return True
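_verify_link treats a link as intact only if the mtime recorded in the database matches the path and, where the platform reports real inode numbers, the path's st_ino matches that of the stored dedup file. The same inode comparison for two arbitrary paths, using only os.stat (a standalone sketch; vocker's version also caches the store-side stat on the ORM object):

import os


def is_same_hardlink(a: str, b: str) -> bool:
    """Best-effort check that *a* and *b* are hard links to the same file."""
    try:
        st_a = os.stat(a, follow_symlinks=False)
        st_b = os.stat(b, follow_symlinks=False)
    except OSError:
        return False
    if st_a.st_ino == 0 or st_b.st_ino == 0:
        # Some Windows filesystems report st_ino == 0; fall back to weaker evidence.
        return st_a.st_size == st_b.st_size and int(st_a.st_mtime) == int(st_b.st_mtime)
    return (st_a.st_dev, st_a.st_ino) == (st_b.st_dev, st_b.st_ino)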