vocker 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vocker/cli.py +1 -2
- vocker/dedup.py +913 -676
- vocker/dedup_models.py +204 -49
- vocker/repo/io.py +7 -0
- vocker/system.py +64 -34
- vocker/util.py +6 -4
- {vocker-0.1.0.dist-info → vocker-0.3.0.dist-info}/METADATA +5 -6
- {vocker-0.1.0.dist-info → vocker-0.3.0.dist-info}/RECORD +10 -10
- {vocker-0.1.0.dist-info → vocker-0.3.0.dist-info}/WHEEL +0 -0
- {vocker-0.1.0.dist-info → vocker-0.3.0.dist-info}/top_level.txt +0 -0
vocker/dedup.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import abc
+import datetime
+from collections import defaultdict
 import contextlib
 import filelock
 import io
@@ -18,11 +20,10 @@ import structlog
 import concurrent.futures as cf
 
 import sqlalchemy as sa
-from sqlalchemy import orm as sao
+from sqlalchemy import orm as sao, literal_column as _lit
 from sqlalchemy_boltons import sqlite as sq
 from sqlalchemy_boltons.orm import RelationshipComparator as Rel, IdKey
-from sqlalchemy_boltons.
-from sqlalchemy_boltons.core import bytes_startswith
+from sqlalchemy_boltons.core import bytes_startswith, count
 from boltons.iterutils import chunked_iter
 from cached_property import cached_property
 
@@ -39,15 +40,22 @@ logger = structlog.get_logger(__name__)
 class Corrupted:
     path: Path | None = attr.ib()
     file_id: int = attr.ib()
-
-
-
+    exception_name: str = attr.ib()
+    exception_string: str = attr.ib()
+    link_paths: tuple[Path, ...] = attr.ib()
+    raw_link_paths: tuple[bytes, ...] = attr.ib()
 
     def to_json(self):
         d = attr.asdict(self)
         d["path"] = p if (p := d["path"]) is None else str(p)
-
-
+
+        d["link_paths"] = [str(x) for x in d["link_paths"]]
+        d["link_paths"].sort()
+
+        # JSON cannot handle raw bytes
+        d["raw_link_paths"] = [x.decode("iso-8859-1") for x in d["raw_link_paths"]]
+        d["raw_link_paths"].sort()
+
         return d
 
 
@@ -132,10 +140,11 @@ class DedupLinkRequest(DedupRequest):
     """
 
     hash_function: mh.HashFunction = attr.ib()
-    link_path: Path = attr.ib()
+    link_path: Path | None = attr.ib()
     file_metadata: DedupFileMetadata = attr.ib()
     file_contents_hash: mh.Digest | None = attr.ib()
     open_file_once: ty.Callable[[], ty.BinaryIO] | None = attr.ib()
+    adopt_existing: bool = attr.ib(default=False)
     file_not_needed: ty.Callable[[], None] | None = attr.ib(default=None)
     tags: ty.Set[bytes] = attr.ib(factory=frozenset)
 
@@ -163,15 +172,9 @@ class _ImplDedupRequestCommon:
 @attr.s(eq=False, hash=False, kw_only=True)
 class _ImplDedupLinkRequest(_ImplDedupRequestCommon):
     req: DedupLinkRequest = attr.ib(default=None)
-
-    dedup_file_path: Path = attr.ib(default=None)
+    obj: _Obj | None = attr.ib(default=None)
     link_path_str: bytes | None = attr.ib(default=None)
-    file: IdKey[mo.DedupFile] | None = attr.ib(default=None)
     metadata_bytes: bytes | None = attr.ib(default=None)
-    file_size: int = attr.ib(default=None)
-    file_mtime: int = attr.ib(default=None)
-    fast_path: bool = attr.ib(default=False)  # can we use the fast-path without db transaction?
-    is_new: bool = attr.ib(default=False)  # is it a brand new FileDedup?
     hashes_promised: dict[mh.HashFunction, mh.Digest] = attr.ib(default=None)
     hashes_computed: dict[mh.HashFunction, mh.Digest] | None = attr.ib(default=None)
     called_file: bool = attr.ib(default=False)
@@ -222,19 +225,21 @@ class AdoptRequest:
     path: Path = attr.ib()
     tags: ty.Set[bytes] = attr.ib(factory=frozenset)
 
-    out_size: int | None = attr.ib(init=False, default=None)
-    out_digest: mh.Digest | None = attr.ib(init=False, default=None)
 
+@attr.s(eq=False, slots=True)
+class _Obj:
+    id: int = attr.ib(factory=None)
+    pending_file_ids = attr.ib(factory=list)
+    completed_file_ids = attr.ib(factory=list)
+    file_size: int | None = attr.ib(default=None)
+    adopted_file_path: Path | None = attr.ib(default=None)
 
-
-
-
-
-
-
-    done: bool = attr.ib(default=False)
-    dedup_file_path: Path = attr.ib(default=None)
-    delete: bool = attr.ib(default=False)
+
+@attr.s(eq=False, slots=True)
+class _Updates:
+    obj_updates = attr.ib(factory=list)
+    file_updates = attr.ib(factory=list)
+    link_updates = attr.ib(factory=list)
 
 
 """
@@ -284,7 +289,7 @@ class _PendingUpdater:
             raise ValueError(f"invalid update_interval={u!r}")
 
     def _update(self):
-        with self.sessionmaker_w() as s:
+        with self.sessionmaker_w.begin() as s:
             pending: mo.Pending = self.pending.get_one(s)
             pending.expire_at = mo.now() + self.seconds_in_the_future
 
@@ -335,6 +340,10 @@ def make_sqlite_options(synchronous):
     )
 
 
+def _ns(stmt):
+    return stmt.execution_options(synchronize_session=False)
+
+
 @attr.s(eq=False, hash=False)
 class Dedup(abc.ABC):
     base_path: Path = attr.ib()
@@ -345,10 +354,14 @@ class Dedup(abc.ABC):
     _path_db: Path | None = attr.ib(default=None, kw_only=True)
     path_temporary: Path | None = attr.ib(default=None, kw_only=True)
     path_deleted: Path | None = attr.ib(default=None, kw_only=True)
-    path_corrupted: Path | None = attr.ib(default=None, kw_only=True)
     _integer_to_path = attr.ib(factory=IntegerToPath, kw_only=True)
     _sqlite_synchronous = attr.ib(default="NORMAL", kw_only=True)
     _batch_size = 1000
+    max_link_count: int = ...
+    _clean_dedup_mtime = (
+        round(datetime.datetime(2000, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc).timestamp()) & ~1
+    )
+    _corrupted_pending_id = -1  # reserved ID
 
     def __attrs_post_init__(self):
         if self._path_dedup is None:
@@ -363,15 +376,12 @@ class Dedup(abc.ABC):
         if self.path_temporary is None:
             self.path_temporary = self.base_path / "tmp"
 
-        if self.path_corrupted is None:
-            self.path_corrupted = self.base_path / "corrupted"
-
         self._path_dedup.mkdir(exist_ok=True, parents=True)
         self._path_db.parent.mkdir(exist_ok=True, parents=True)
-        self.path_corrupted.mkdir(exist_ok=True, parents=True)
         self.path_deleted.mkdir(exist_ok=True, parents=True)
         self._path_temporary_dirs.mkdir(exist_ok=True, parents=True)
         self._path_temporary_lock.mkdir(exist_ok=True, parents=True)
+        self._path_temporary_simple_dir.mkdir(exist_ok=True, parents=True)
         engine = sq.create_engine_sqlite(self._path_db, create_engine_args=dict(echo=False))
         engine = make_sqlite_options(synchronous=self._sqlite_synchronous).apply(engine)
         self._engine_r = engine
@@ -380,10 +390,6 @@ class Dedup(abc.ABC):
         self._SessionR = sao.sessionmaker(self._engine_r)
         self._SessionW = sao.sessionmaker(self._engine_w)
 
-        # FIXME: use proper session management
-        # self.session = Session(self.engine_rw)  # HACK
-        # self.engine = self.engine_rw  # HACK
-
         self._initialize_db()
 
     def _initialize_db(self):
@@ -392,6 +398,10 @@ class Dedup(abc.ABC):
             mo.BaseDedup.metadata.create_all(conn)
             conn.commit()
 
+        with self._beginw() as s:
+            if s.get(mo.Pending, self._corrupted_pending_id) is None:
+                s.add(mo.Pending(id=self._corrupted_pending_id, expire_at=1))
+
     @contextlib.contextmanager
     def _beginw(self):
         with self._SessionW.begin() as s:
@@ -408,6 +418,10 @@ class Dedup(abc.ABC):
             new_mode |= mask
         os.chmod(str(path), new_mode, follow_symlinks=False)
 
+    def _set_clean_file_mtime(self, path: Path) -> None:
+        t = self._clean_dedup_mtime
+        os.utime(path, (t, t))
+
     def get_metadata_from_file(self, path: Path) -> DedupFileMetadata:
         if supports_executable():
             mode = path.stat().st_mode
@@ -470,42 +484,269 @@ class Dedup(abc.ABC):
             hashes=[f(h) for h in link.hashes_promised.values()],
         )
 
-    def
-
-        return
+    def _tmp_sqlite(self, tmp):
+        with self._SessionR() as s:
+            return tmp.get(s, "").value
 
-
-
-
+    @cached_property
+    def _tmp_files(self):
+        return self._tmp_sqlite(mo.tmp_new_files)
+
+    @cached_property
+    def _tmp_files2(self):
+        return self._tmp_sqlite(mo.tmp_new_files2)
+
+    @cached_property
+    def _tmp_check_links(self):
+        return self._tmp_sqlite(mo.tmp_check_links)
+
+    @cached_property
+    def _tmp_delete_extra(self):
+        return self._tmp_sqlite(mo.tmp_delete_extra)
+
+    @cached_property
+    def _sql_prebatch_check_link(self):
+        L = sao.aliased(mo.Link)
+        return count(L).where(L.path == sa.bindparam("p_path"))
+
+    @cached_property
+    def _sql_prebatch_update_with_existing_dedup_files(self):
+        tmp = self._tmp_files
+        ONE = _lit("1")
+
+        def _eq(x, y, attributes):
+            return sa.and_(getattr(x, a) == getattr(y, a) for a in attributes)
+
+        O = sao.aliased(mo.Obj, name="obj")
+        tmp_f = sao.aliased(tmp.files, name="t_file")
+        tmp_tag = sao.aliased(tmp.tags, name="t_tag")
+        tmp_hash = sao.aliased(tmp.hashes, name="t_hash")
+        Tag = sao.aliased(mo.Tag, name="tag")
+        Hash = sao.aliased(mo.Hash, name="hash")
+        cond_obj = O.q_is_complete() | (O.pending_id == sa.bindparam("p_pending_id"))
+        cond_obj &= _eq(O, tmp_f.c, ["metadata_bytes"])
+        sqt = (
+            sa.select(O.id)
+            .join(Tag, O.tags)
+            .join(tmp_tag, _eq(tmp_tag.c, Tag, ["name"]))
+            .where(cond_obj, tmp_tag.c.id == tmp_f.c.id)
+            .limit(ONE)
+        )
+        q = sa.select(tmp_f.c.id, sqt.scalar_subquery().label("obj_id"))
+        q = q.where(tmp_f.c.obj_id == None).subquery()
+
+        sqh = (
+            sa.select(O.id)
+            .join(Hash, O.hashes)
+            .join(tmp_hash, _eq(tmp_hash.c, Hash, ["hash_function", "hash"]))
+            .where(cond_obj, tmp_hash.c.id == q.c.id)
+            .limit(ONE)
+        )
+        q = sa.select(
+            q.c.id,
+            sa.case((q.c.obj_id == None, sqh.scalar_subquery()), else_=q.c.obj_id).label("obj_id"),
         )
-        for name in tags - current_tags:
-            session.add(mo.Tag(name=name, file=file))
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        tmp_f_up = sao.aliased(tmp.files, name="t_file")
+
+        # Finally, create the UPDATE statement that uses `qu` to update `tmp_files`.
+        qu = q.cte(name="t_file_changes")
+        stmt = sa.update(tmp_f_up)
+        stmt = stmt.values(obj_id=qu.c.obj_id).where(tmp_f_up.c.id == qu.c.id)
+        return _ns(stmt)
+
+    @cached_property
+    def _sql_prebatch_insert_missing_objs(self):
+        """
+        Create Obj records where missing.
+        """
+        tmp = self._tmp_files
+        fake_created_at = sa.bindparam("p_fake_created_at")
+
+        tmp_f = sa.alias(tmp.files, name="t_file")
+        Obj = sao.aliased(mo.Obj, name="obj")
+        q = sa.select(
+            tmp_f.c.metadata_bytes.label("metadata"),
+            tmp_f.c.id.label("size"),  # smuggle ID through this field
+            fake_created_at.label("created_at"),
+            sa.null().label("orphaned_at"),
+            sa.bindparam("p_pending_id").label("pending_id"),
+        )
+        q = q.select_from(tmp_f).where(
+            tmp_f.c.obj_id == None, tmp_f.c.insert_obj_if_missing == True
+        )
+        qi = sa.insert(mo.Obj)
+        qi = qi.from_select(["metadata", "size", "created_at", "orphaned_at", "pending_id"], q)
+        del q, Obj, tmp_f
+
+        Obj = sao.aliased(mo.Obj, name="obj")
+        q = sa.select(Obj.id.label("obj_id"), Obj.size.label("id")).where(
+            Obj.created_at < _lit("0"), Obj.created_at == fake_created_at
+        )
+        q = q.cte(name="t_file_changes")
+
+        tmp_f = sa.alias(tmp.files, name="t_file")
+        qu = sa.update(tmp_f).add_cte(q)
+        qu = qu.values(new_obj_id=q.c.obj_id).where(tmp_f.c.id == q.c.id)
+
+        return _ns(qi), _ns(qu)
+
+    @cached_property
+    def _sql_prebatch_fix_and_delete_objs(self):
+        tmp = self._tmp_files
+        tmp_f = sao.aliased(tmp.files, name="t_files")
+        pending_id = sa.bindparam("p_pending_id")
+        created_at = sa.bindparam("p_created_at")
+        Obj = sao.aliased(mo.Obj, name="obj")
+
+        # Set a proper created_at for the new Objs that are actually in use.
+        q = sa.select(tmp_f.c.obj_id).where(tmp_f.c.obj_id != None)
+        r1 = sa.update(Obj).values(created_at=created_at)
+        r1 = r1.where(Obj.id.in_(q), Obj.pending_id == pending_id)
+
+        # Set updated_at to the current time.
+        r2 = sa.update(Obj).values(updated_at=created_at)
+        r2 = r2.where(Obj.id.in_(q))
+
+        # Delete remaining Objs.
+        r3 = sa.delete(Obj).where(
+            Obj.id.in_(sa.select(tmp_f.c.new_obj_id)), Obj.created_at < _lit("0")
         )
 
-
-            session.add(mo.Link(link_path=link.link_path_str, file=file))
+        return tuple(_ns(x) for x in (r1, r2, r3))
 
-
-
+    @cached_property
+    def _sql_prebatch_insert_hashes(self):
+        """
+        Create Hash records.
+        """
+        tmp = self._tmp_files
+        tmp_f = sao.aliased(tmp.files, name="t_files")
+        tmp_h = sao.aliased(tmp.hashes, name="t_hash")
+        Obj = sao.aliased(mo.Obj, name="obj")
+        Hash = sao.aliased(mo.Hash, name="h")
+
+        q = sa.select(tmp_f.c.new_obj_id, tmp_h.c.hash_function, tmp_h.c.hash)
+        q = q.select_from(tmp_h).join(tmp_f, tmp_f.c.id == tmp_h.c.id)
+        exists = sa.exists().select_from(Hash)
+        exists = exists.where(
+            Hash.hash_function == tmp_h.c.hash_function, Hash.obj_id == tmp_f.c.new_obj_id
+        )
+        q = q.where(~exists, tmp_f.c.new_obj_id != None)
+        stmt = sa.insert(mo.Hash).from_select(["obj_id", "hash_function", "hash"], q)
+        return _ns(stmt)
 
-
-
+    @cached_property
+    def _sql_prebatch_insert_tags(self):
+        """
+        Create Tag records.
+        """
+        # Sadly this has a lot in common with `_sql_insert_hashes`. The urge to refactor is intense.
+        tmp = self._tmp_files
+        tmp_f = sao.aliased(tmp.files, name="t_files")
+        tmp_t = sao.aliased(tmp.tags, name="t_tag")
+        Obj = sao.aliased(mo.Obj, name="obj")
+        Tag = sao.aliased(mo.Tag, name="tag")
+
+        q = sa.select(tmp_f.c.new_obj_id, tmp_t.c.name)
+        q = q.select_from(tmp_t).join(tmp_f, tmp_f.c.id == tmp_t.c.id)
+        exists = sa.exists().select_from(Tag)
+        exists = exists.where(Tag.name == tmp_t.c.name, Tag.obj_id == tmp_f.c.new_obj_id)
+        q = q.where(~exists, tmp_f.c.new_obj_id != None)
+        stmt = sa.insert(mo.Tag).from_select(["obj_id", "name"], q)
+        return _ns(stmt)
+
+    @cached_property
+    def _sql_prebatch_insert_files(self):
+        tmp = self._tmp_files
+        tmp_f = sao.aliased(tmp.files, name="t_files")
+        q = sa.select(
+            tmp_f.c.obj_id,
+            sa.bindparam("p_pending_id").label("pending_id"),
+            sa.bindparam("p_created_at").label("created_at"),
+        )
+        q = q.where(tmp_f.c.obj_id != None)
+        stmt = sa.insert(mo.File).from_select(["obj_id", "pending_id", "created_at"], q)
+        return _ns(stmt)
+
+    @cached_property
+    def _sql_prebatch_delete_and_insert_links(self):
+        tmp = self._tmp_files
+        F = sao.aliased(mo.File, name="file")
+        L = sao.aliased(mo.Link, name="link")
+        tmp_f = sao.aliased(tmp.files, name="t_file")
+        null_id = sa.bindparam("p_null_file_id")
+
+        cond_link = (tmp_f.c.obj_id != None) & (tmp_f.c.link_path != None)
+
+        # Invalidate file link counts for the links we are about to delete.
+        q = sa.select(L.file_id).where(L.path.in_(sa.select(tmp_f.c.link_path).where(cond_link)))
+        r0 = sa.update(F).values(link_count=-1)
+        r0 = r0.where(F.id.in_(q))
+
+        # Delete the old links.
+        r1 = sa.delete(L).where(L.path.in_(sa.select(tmp_f.c.link_path).where(cond_link)))
+
+        # Insert the new links.
+        q = sa.select(tmp_f.c.link_path.label("path"), null_id.label("file_id")).where(cond_link)
+        r2 = sa.insert(mo.Link).from_select(["path", "file_id"], q)
+
+        # Set in-use Objs as not orphaned as they now have links.
+        O = sao.aliased(mo.Obj, name="obj")
+        q = sa.select(tmp_f.c.obj_id).where(cond_link)
+        r3 = sa.update(O).values(orphaned_at=None).where(O.id.in_(q))
+
+        return tuple(_ns(r) for r in (r0, r1, r2, r3))
+
+    @cached_property
+    def _sql_postbatch_update_objs(self):
+        t = sao.aliased(self._tmp_files2.objs, name="tu_obj")
+        q = sa.select(t.c.obj_id, t.c.size).subquery()
+        O = sao.aliased(mo.Obj, name="obj")
+        stmt = sa.update(O).where(O.id == q.c.obj_id)
+        stmt = stmt.values(size=q.c.size, pending_id=None)
+        return _ns(stmt)
+
+    @cached_property
+    def _sql_postbatch_update_files(self):
+        t = sao.aliased(self._tmp_files2.files, name="tu_files")
+        q = sa.select(t.c.file_id, t.c.obj_id).subquery()
+        F = sao.aliased(mo.File, name="file")
+        stmt = sa.update(F).where(F.id == q.c.file_id)
+        stmt = stmt.values(obj_id=q.c.obj_id, pending_id=None)
+        return _ns(stmt)
+
+    @cached_property
+    def _sql_postbatch_update_links(self):
+        t = sao.aliased(self._tmp_files2.links, name="tu_links")
+        q = sa.select(t.c.link_path, t.c.file_id, t.c.link_count).subquery()
+        F = sao.aliased(mo.File, name="file")
+        L = sao.aliased(mo.Link, name="link")
+        stmt1 = sa.update(F).where(F.id == q.c.file_id)
+        stmt1 = stmt1.values(link_count=q.c.link_count)
+        stmt2 = sa.update(L).where(L.path == q.c.link_path)
+        stmt2 = stmt2.values(file_id=q.c.file_id)
+        return _ns(stmt1), _ns(stmt2)
+
+    @cached_property
+    def _sql_prebatch_select_req_obj(self):
+        tmp = self._tmp_files
+        t_files = sao.aliased(tmp.files, name="t_files")
+        q = sa.select(t_files.c.id, t_files.c.obj_id).where(t_files.c.obj_id != None)
+        return _ns(q)
+
+    @cached_property
+    def _sql_prebatch_select_obj_file(self):
+        tmp = self._tmp_files
+        p_id = sa.bindparam("p_pending_id")
+        O = sao.aliased(mo.Obj, name="obj")
+        F = sao.aliased(mo.File, name="file")
+        t_files = sao.aliased(tmp.files, name="t_files")
+        qo = sa.select(t_files.c.obj_id).where(t_files.c.obj_id != None)
+        q = sa.select(F.obj_id, F.id, F.pending_id)
+        q = q.where(F.link_count < sa.bindparam("p_max_link_count"))
+        q = q.where(F.obj_id.in_(qo), (F.pending_id == None) | (F.pending_id == p_id))
+        return _ns(q)
 
     def run_batch(self, requests: ty.Iterable[DedupRequest]) -> None:
         """
@@ -514,24 +755,42 @@ class Dedup(abc.ABC):
 
         The requests will be addressed in the order that they appear in the iterable.
 
-
-
+        BUG: If the same file (same hash or same tag) appears multiple times in the *requests* then
+        multiple files will be created. You are welcome to fix this without breaking the tests and
+        without incurring a significant performance penalty.
+
+        We create more Objs and Files than we need, then clean them up later no biggie.
+        They get automatically deleted when the Pending record is deleted.
+
+        Pre-batch:
 
-
+        1. Insert temporary files, hashes, tags
+        2. Update tmp.files.obj_id by matching existing Objs by hash or tag.
+        3. Insert a new Obj for each tmp_file where obj_id is NULL.
+        4. Insert Objs and update t.files.obj_id to point to the right Obj.
+        5. Insert Hash and Tag rows corresponding to stuff in t.files.
+        6. Update tmp.files.obj_id by matching existing Objs by hash or tag.
+        7. Insert a new File for each tmp_file. We might not need it.
+        8. Insert a new Link for each tmp_file.
+        9. Select (Obj.id, File.id) for each of tmp.files. These are all the possible usable files
+           that we will attempt to create a link to.
 
-
-           that match either the hash or one of the tags.
-        2. Create a record for each new deduplicated file. Create a Pending
-        3.
+        Batch:
 
-
-
+        1. For each request:
+        2. If no content is present, then use the existing pending File id to write the content.
+        3. If content is already present:
+        3. For each related file_id:
+        4. Check the link count and make a reminder to update the link count in the DB.
+        5. Attempt to create a link pointing to that file_id. If it succeeds, continue
+           to the next request.
+        6. None of the file IDs succeeded. Make a copy of an existing file using the spare
+           pending File id.
 
-
-           is done being written (not pending), then that's the fast path. Otherwise it's the slow
-           path.
+        Post-batch:
 
-
+        1. Update the link counts.
+        2. Match each
 
         """
 
@@ -553,236 +812,189 @@ class Dedup(abc.ABC):
             "doing both links and copies in the same batch is not supported for now"
         )
 
+        # cases to consider:
+        # adopt_existing==True, link_path is an existing dedup link: NOT IMPLEMENTED
+        # adopt_existing==True, link_path is a regular file, hash matches content inside dedup db
+        # adopt_existing==True, link_path is a regular file, content is novel
+
         # Preliminaries to do before we start writing to the database.
-
-
+        tmp_files = []
+        tmp_tags = []
+        tmp_hashes = []
         with self._SessionR() as s:
             for link in links:
-
-
-
+                req = link.req
+
+                if req.link_path is not None:
+                    link.link_path_str = ps = self._link_path_to_string(req.link_path)
                     # Remove existing file if present. This may raise if the path is actually a
                     # directory.
-                    req.
-
-
-
-
-
-                if (h := req.file_contents_hash) is not None:
-                    link.lookup_key = h, link.metadata_bytes
-                    d = {
-                        "id": link.index,
-                        "hash_function": h.function.function_code,
-                        "digest": h.digest,
-                        "metadata_bytes": link.metadata_bytes,
-                    }
-                    hashes_to_search.append(d)
-                    link.hashes_promised = {h.function: h}
+                    if req.adopt_existing:
+                        pass  # Assertion is too expensive.
+                        # assert not s.execute(
+                        #     self._sql_prebatch_check_link, {"p_path": ps}
+                        # ).scalar(), "adopting an existing link is not supported yet"
                     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    Related = sao.aliased(cls)
-                    q = sa.select(Related).where(getattr(Related, attribute).in_(values_set))
-                    q = q.options(sao.joinedload(Related.file))
-                    return s.execute(q).scalars()
+                        req.link_path.unlink(missing_ok=True)
+                else:
+                    # The user is requesting insert of content but doesn't want an actual link
+                    # to be created.
+                    link.link_path_str = ps = None
+                    assert not req.adopt_existing
+
+                if req.adopt_existing:
+                    req.file_metadata = m = self.get_metadata_from_file(req.link_path)
+                    with req.link_path.open("rb") as f:
+                        r = self._compute_file_hash(req.hash_function, f)
+                    link.file_size, req.file_contents_hash = r
+                else:
+                    m = req.file_metadata
 
-
-        with self._beginw() as s:
-            s.add(pending := mo.Pending(expire_at=mo.now() + 30.0))
-            s.flush()
-            pending_key = IdKey.from_instance(pending)
+                link.metadata_bytes = m = self.convert_file_metadata_to_bytes(m)
 
-
-
-
-
-
-
-
-            s.connection().execute(sa.insert(tmp), hashes_to_search).close()
-            H = sao.aliased(mo.Hash)
-            F = sao.aliased(mo.DedupFile)
-            q = (
-                sa.select(H, F)
-                .join(F, H.file)
-                .join(
-                    tmp,
-                    (tmp.c.digest == H.hash)
-                    & (tmp.c.hash_function == H.hash_function)
-                    & (tmp.c.metadata_bytes == F.file_metadata),
-                )
-            )
-                hash_to_file = {
-                    (h.to_digest(), f.file_metadata): f for h, f in s.execute(q).all()
+                tmp_files.append(
+                    {
+                        "id": link.index,
+                        "link_path": ps,
+                        "metadata_bytes": m,
+                        "insert_obj_if_missing": req.open_file_once is not None
+                        or req.adopt_existing,
                 }
-
-            hash_to_file = {}
+                )
 
-
-            tag_to_file_set = set(tag_to_file)
+                tmp_tags += ({"id": link.index, "name": tag} for tag in req.tags)
 
-
-
-
-
-
-
-
-
-                    file = tag_to_file[next(iter(overlap))]
-                elif (key := link.lookup_key) is not None:
-                    # Check for a deduped file with the same hash.
-                    file = hash_to_file.get(key, None)
+                if (h := req.file_contents_hash) is not None:
+                    d = {
+                        "id": link.index,
+                        "hash_function": h.function.function_code,
+                        "hash": h.digest,
+                    }
+                    tmp_hashes.append(d)
+                    link.hashes_promised = {h.function: h}
                 else:
-
+                    link.hashes_promised = {}
 
-                if
-
-                    link.is_new = True
-                    link.fast_path = True
+            if (req.file_contents_hash is None) and not req.tags:
+                raise AssertionError("must provide hash and/or tags")
 
-
-
-                # it.
-                link.set_failed(MissingContentError())
-                continue
+        updates = _Updates()
+        objs: dict[int, _Obj] = {}
 
-
-
-
-
-
+        # Now we check the database and add file hash records where we can.
+        with self._beginw() as s, mo.tmp_new_files(s, "") as t:
+            c = s.connection()
+            s.add(pending := mo.Pending(expire_at=mo.now() + 30.0))
+            s.flush()
+            pending_key = IdKey.from_instance(pending)
+            pending_id = pending.id
+            if tmp_files:
+                c.execute(sa.insert(t.files), tmp_files).close()
+            if tmp_hashes:
+                c.execute(sa.insert(t.hashes), tmp_hashes).close()
+            if tmp_tags:
+                c.execute(sa.insert(t.tags), tmp_tags).close()
+
+            s.add(temp_file := mo.File(pending_id=pending_id))
+            s.flush()
+            temp_file_id = temp_file.id
+
+            # Set t.files.obj_id using existing Obj and File records.
+            d = {
+                "p_fake_created_at": -1,
+                "p_created_at": mo.now(),
+                "p_null_file_id": temp_file_id,
+                "p_pending_id": pending_id,
+                "p_max_link_count": self.max_link_count,
+            }
+            c.execute(self._sql_prebatch_update_with_existing_dedup_files, d).close()
+
+            # We insert Obj records for the requests where t.files.obj_id is NULL.
+            for stmt in self._sql_prebatch_insert_missing_objs:
+                c.execute(stmt, d).close()
+
+            # Now all requests have an Obj. We add tag or hash records where necessary.
+            c.execute(self._sql_prebatch_insert_tags).close()
+            c.execute(self._sql_prebatch_insert_hashes).close()
+
+            # Coalesce overlapping new Objs.
+            c.execute(self._sql_prebatch_update_with_existing_dedup_files, d).close()
+
+            # Delete unused Objs. Set `Obj.created_at` for remaining ones.
+            for stmt in self._sql_prebatch_fix_and_delete_objs:
+                c.execute(stmt, d).close()
+
+            # Speculatively insert as many files as there are members in the batch.
+            c.execute(self._sql_prebatch_insert_files, d).close()
+
+            for r in self._sql_prebatch_delete_and_insert_links:
+                c.execute(r, d).close()
+
+            if 0:
+                tmp = self._tmp_files
+                print("**************** files, hashes, tags")
+                for tab in (tmp.files, tmp.hashes, tmp.tags):
+                    print(s.execute(sa.select(tab)).all())
+
+            for req_id, obj_id in c.execute(self._sql_prebatch_select_req_obj):
+                if (obj := objs.get(obj_id)) is None:
+                    objs[obj_id] = obj = _Obj(id=obj_id)
+                (link := links[req_id]).obj = obj
+                if link.req.adopt_existing:
+                    obj.adopted_file_path = link.req.link_path
+
+            for obj_id, file_id, pending_id in c.execute(self._sql_prebatch_select_obj_file, d):
+                o = objs[obj_id]
+                if pending_id is None:
+                    o.completed_file_ids.append(file_id)
                 else:
-
-                    # The file might fail to be written, the hashes might be invalid, etc. We must
-                    # use the slow path and wait for the file to become ready.
-                    link.fast_path = False
-                    file = None
-
-                if link.fast_path:
-                    self._prepare_dedup_file_for_linking(s, file, link)
-                    if link.is_new:
-                        # If the same file shows up later in the batch, ensure that it is used.
-                        for v in link.hashes_promised.values():
-                            hash_to_file[v, file.file_metadata] = file
-
-                # the _prepare_dedup_file_for_linking caused a flush, so our primary key is ready
-                if file is not None:
-                    link.file = IdKey.from_instance(file)
-
-            L = sao.aliased(mo.Link)
-            q = sa.select(L).where(
-                (L.link_path == sa.bindparam("x_src")) | (L.link_path == sa.bindparam("x_dst"))
-            )
-            for copy in copies:
-                with self._ignore_skip(), self._catch_req_exc(copy):
-                    link_objs = {
-                        x.link_path: x
-                        for x in s.execute(q, {"x_src": copy.src_str, "x_dst": copy.dst_str})
-                        .scalars()
-                        .all()
-                    }
-
-                    if (src_link := link_objs.get(copy.src_str)) is None:
-                        raise NotADedupLinkError
-
-                    if (dst_link := link_objs.get(copy.dst_str)) is not None:
-                        s.delete(dst_link)
-
-                    copy.dedup_file_path = self._make_dedup_file_path(src_link.file_id)
-                    s.add(mo.Link(file_id=src_link.file_id, link_path=copy.dst_str))
-                    s.flush()
-            del q, L
+                    o.pending_file_ids.append(file_id)
 
             pending.expire_at = mo.now() + 30.0
+            del pending
+
+        failed_link_paths = []
+        with self._PendingUpdater(
+            pending=pending_key,
+            sessionmaker_r=self._SessionR,
+            sessionmaker_w=self._SessionW,
+            seconds_in_the_future=20,
+        ) as pu, self.temporary_directory(check_links=False) as tmp_path:
+            for link in links:
+                with self._ignore_skip(), self._catch_req_exc(link):
+                    if (obj := link.obj) is None:
+                        # nothing to be done here
+                        link.call_file_not_needed()
+                        link.set_failed(MissingContentError(f"no obj {link}"))
+                        continue
 
-
-
-        to_be_flushed = []
-        failed_requests = []
-
-        def _flush_now(s: sao.Session):
-            for link in to_be_flushed:
-                file: mo.DedupFile | None = None if (f := link.file) is None else f.get(s)
+                    self._write_dedup_file_contents(link, tmp_path, updates)
 
-
-
-                if file is not None:
-                    s.delete(file)
-                    continue
+        with self._beginw() as s, mo.tmp_new_files2(s, "") as t:
+            c = s.connection()
 
-
-
-
-
-
-                # We need to add whatever extra hashes were computed.
-                if d := link.hashes_computed:
-                    already_in_db = link.hashes_promised
-                    for k, v in d.items():
-                        if k not in already_in_db:
-                            s.add(mo.Hash.from_digest(v, file=file))
-
-                # We checked the hashes (if any), the file contents are written, and the link
-                # (if any) has been created. We are therefore ready to set the "file.pending"
-                # column to NULL, thus marking the dedup file as finalized.
-                file.pending = None
-
-            to_be_flushed.clear()
-
-        for copy in copies:
-            with self._ignore_skip(), self._catch_req_exc(copy):
-                self._delete_file(copy.req.dst)
-                self._create_actual_link(copy.dedup_file_path, copy.req.dst)
-
-        if links:
-            # Now we write the file data without holding the database transaction open. The
-            # "_PendingUpdater" ensures that other threads know that we're working.
-            with self._PendingUpdater(
-                pending=pending_key,
-                sessionmaker_r=self._SessionR,
-                sessionmaker_w=self._SessionW,
-                seconds_in_the_future=20,
-            ) as pu:
-                for link in links:
-                    with self._ignore_skip(), self._catch_req_exc(link):
-                        if not link.fast_path:
-                            with self._beginw() as s:
-                                _flush_now(s)
-                            self._slow_path_wait_for_dedup_file(link=link, pending=pending_key)
-
-                        self._write_dedup_file_contents(link=link)
-                        to_be_flushed.append(link)
-                        pu.update_on_exit = True
+            if u := updates.link_updates:
+                c.execute(sa.insert(self._tmp_files2.links), u).close()
+                for stmt in self._sql_postbatch_update_links:
+                    c.execute(stmt).close()
 
-
-
+            if u := updates.file_updates:
+                c.execute(sa.insert(self._tmp_files2.files), u).close()
+                c.execute(self._sql_postbatch_update_files)
 
-
-
-
+            if u := updates.obj_updates:
+                c.execute(sa.insert(self._tmp_files2.objs), u).close()
+                c.execute(self._sql_postbatch_update_objs).close()
 
-
-
+            # Delete the pending object.
+            s.delete(pending_key.get_one(s))
+            s.flush()
 
-
-
-
-
-
+        failed_requests = []
+        for link in links:
+            ok = link.req.success = not link.failed
+            if not ok:
+                failed_requests.append(link.req)
 
         if failed_requests:
             first_exc = failed_requests[0].exc
@@ -794,7 +1006,6 @@ class Dedup(abc.ABC):
     def _write_file_computing_hashes(
         self, target: Path, open1, hashes: ty.Iterable[mh.HashFunction]
     ) -> tuple[int, dict[mh.HashFunction, mh.Digest]]:
-        target.parent.mkdir(exist_ok=True, parents=True)
         m = mh.MultiHasher({f: f() for f in hashes})
         with target.open("wb") as f_w, open1() as f_r:
             while block := f_r.read(65536):
@@ -802,114 +1013,126 @@ class Dedup(abc.ABC):
                 f_w.write(block)
         return m.size, m.digest()
 
-    def _write_dedup_file_contents(
-
-        if link.req.open_file_once is None:
-            link.call_file_not_needed()
-            return
-
-        p = link.dedup_file_path
-        (fs := set(link.hashes_promised)).update(self.extra_hashes)
-        link.file_size, d = self._write_file_computing_hashes(p, link.call_open_file_once, fs)
-        self.apply_metadata_to_file(p, link.req.file_metadata)
-        link.file_mtime = int(p.stat().st_mtime)
-        link.hashes_computed = d
-
-        # Check that the hashes match what was claimed inside the link request.
-        computed = {k: d[k] for k in link.hashes_promised}
-        if link.hashes_promised != computed:
-            p.unlink(missing_ok=True)
-            raise InvalidContentsError(
-                link_request=link.req,
-                hashes_expected=link.hashes_promised,
-                hashes_observed=computed,
-            )
-        else:
-            # existing file - we don't need to do anything
-            link.call_file_not_needed()
-
-        # TODO: quickly check whether the file mtime matches and check the content hash if not
-
-        self._create_actual_link(link.dedup_file_path, link.req.link_path)
-
-    def _slow_path_wait_for_dedup_file(
-        self, link: _ImplDedupLinkRequest, pending: IdKey[mo.Pending]
+    def _write_dedup_file_contents(
+        self, link: _ImplDedupLinkRequest, tmp_path: Path, updates: _Updates
     ) -> None:
-
-
-
-
-
-
-
+        obj = link.obj
+        target = link.req.link_path
+        skip_link_for_file_id = None
+        adopting = obj.adopted_file_path
+        adopted = False
+
+        def _mkdirp(path):
+            if not path.exists():
+                path.mkdir(exist_ok=True, parents=True)
+
+        # Do we have any completed File IDs at all?
+        if obj.completed_file_ids:
+            link.call_file_not_needed()
+            if adopting is not None:
+                # We don't need the file there.
+                self._delete_file(target)
+        else:
+            # No completed IDs, we need to make one. Try to adopt if possible.
+            tmp_p = tmp_path / "f.bin"
+            tmp_p.unlink(missing_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                raise AssertionError("deadlock")
-            elif x.pending.expire_at >= mo.now():
-                # We found an in-progress DedupFile, so we stand down and continue polling.
-                return False
+            if adopting is not None:
+                link.call_file_not_needed()
+                self._adopt_file_and_link(adopting, tmp_p)
+                adopted = True
+                size = tmp_p.stat().st_size
+                apply_metadata = False
+            elif (open1 := link.call_open_file_once) is not None:
+                (fs := set(link.hashes_promised)).update(self.extra_hashes)
+                size, d = self._write_file_computing_hashes(tmp_p, open1, fs)
+                link.hashes_computed = d
+
+                # Check that the hashes match what was claimed inside the link request.
+                computed = {k: d[k] for k in link.hashes_promised}
+                if link.hashes_promised != computed:
+                    raise InvalidContentsError(
+                        link_request=link.req,
+                        hashes_expected=link.hashes_promised,
+                        hashes_observed=computed,
+                    )
+                apply_metadata = True
+            else:
+                link.set_failed(MissingContentError("content not provided"))
+                return
 
-
+            if apply_metadata:
+                self.apply_metadata_to_file(tmp_p, link.req.file_metadata)
+            self._set_clean_file_mtime(tmp_p)
 
-
-
-
+            file_id = obj.pending_file_ids.pop()
+            p = self._make_dedup_file_path(file_id)
+            _mkdirp(p.parent)
+            tmp_p.rename(p)
+            obj.completed_file_ids.append(file_id)
+            updates.file_updates.append({"file_id": file_id, "obj_id": obj.id})
+            updates.obj_updates.append({"obj_id": obj.id, "size": size})
+            # Now the file has the right contents. Let's also make a link now.
 
-
-
+            if adopting is not None:
+                skip_link_for_file_id = file_id
 
-
+        endgame = False
+        completed = obj.completed_file_ids
         while True:
-
+            file_id = completed[-1]
+            p = self._make_dedup_file_path(file_id)
+            if adopted and adopting == target:
+                ok = True
+            else:
+                ok = False
+                try:
+                    self._create_actual_link(p, target)
+                    ok = True
+                except Exception:
+                    pass
 
-
-
-
-
+            if not ok and target.exists():
+                self._delete_file(target)
+                try:
+                    self._create_actual_link(p, target)
+                    ok = True
+                except Exception:
+                    pass
 
-
-
-
-
+            link_count = p.stat().st_nlink - 1
+            updates.link_updates.append(
+                {
+                    "link_path": link.link_path_str if ok else None,
+                    "file_id": file_id,
+                    "link_count": link_count,
+                }
+            )
+            if ok:
+                # We're done! Bye!
+                return
 
-
-
-
-                link.is_new = True
-            else:
-                file = result
-                link.is_new = False
+            if len(completed) > 1:
+                completed.pop()
+                continue
 
-
-
+            if endgame:
+                raise AssertionError
+
+            endgame = True
 
-
-
+            # This is our last one file, we must make a copy.
+            tmp_p = tmp_path / "f.bin"
+            tmp_p.unlink(missing_ok=True)
+            shutil.copyfile(str(self._make_dedup_file_path(file_id)), str(tmp_p))
 
-
+            file_id = obj.pending_file_ids.pop()
+            p = self._make_dedup_file_path(file_id)
+            _mkdirp(p.parent)
+            tmp_p.rename(p)
+            obj.completed_file_ids[0] = file_id
+            updates.file_updates.append({"file_id": file_id, "obj_id": obj.id})
+            # We made a copy. Hope it works now.
 
     @property
     def _PendingUpdater(self):
@@ -940,8 +1163,24 @@ class Dedup(abc.ABC):
         """
         self._check_links(path, False)
 
+    @cached_property
+    def _sql_checklinks(self):
+        tmp = self._tmp_check_links
+        t_links = sao.aliased(tmp.links)
+        F = sao.aliased(mo.File)
+        L = sao.aliased(mo.Link)
+
+        # Invalidate link_count for affected dedup Files.
+        q = sa.select(L.file_id).join(t_links, L.path == t_links.c.path)
+        r0 = sa.update(F).values(link_count=-1).where(F.id.in_(q))
+
+        # Delete Link records.
+        r1 = sa.delete(L).where(L.path.in_(sa.select(t_links.c.path)))
+
+        return tuple(_ns(x) for x in (r0, r1))
+
     def _check_links(self, path: Path | None, pre_delete: bool) -> None:
-        F = sao.aliased(mo.
+        F = sao.aliased(mo.File)
         L = sao.aliased(mo.Link)
 
         _verify_link = self._verify_link
@@ -956,10 +1195,10 @@ class Dedup(abc.ABC):
             # do any checking.
             _verify_link = lambda link: False
 
-        q = sa.select(L).order_by(L.
+        q = sa.select(L).order_by(L.path).options(sao.joinedload(L.file))
         q = q.limit(self._batch_size)
         if prefix is not None:
-            q = q.where((L.
+            q = q.where((L.path == exact_path) | bytes_startswith(L.path, prefix))
 
         with self._SessionR() as s:
             last_link_path: str | None = None
@@ -967,7 +1206,7 @@ class Dedup(abc.ABC):
                 if last_link_path is None:
                     q2 = q
                 else:
-                    q2 = q.where(L.
+                    q2 = q.where(L.path > last_link_path)
 
                 results: list[mo.Link] = s.execute(q2).scalars().all()
                 if not results:
@@ -976,74 +1215,115 @@ class Dedup(abc.ABC):
                 to_delete = []
                 for link in results:
                     if not _verify_link(link):
-
+                        # TODO: Instead of just deleting them from the DB, maybe we should keep
+                        # track of invalid links or even repair them?
+                        to_delete.append(link.path)
 
                 if to_delete:
-                    with self._beginw() as s2,
-
-                    ) as t_links, temporary_table(s2, mo.tmp_ints) as t_files:
+                    with self._beginw() as s2, mo.tmp_check_links(s2, "") as tmp:
+                        # 1. Insert Link paths into a temporary table.
                         s2.connection().execute(
-                            sa.insert(
+                            sa.insert(tmp.links), [{"path": x} for x in to_delete]
                         ).close()
 
-                        #
-
-
-
-
-                            .distinct()
-                            .select_from(L)
-                            .join(F, L.file)
-                            .join(t_links, t_links.c.id == L.link_path),
-                            )
-                        ).close()
+                        # 2. Invalidate link_count inside parent Files.
+                        # 3. Delete Links.
+                        # 4. Recompute link_count for affected Files.
+                        for stmt in self._sql_checklinks:
+                            s2.execute(stmt).close()
 
-
-                        s2.connection().execute(
-                            sa.delete(L).where(L.link_path.in_(sa.select(t_links.c.id))),
-                        ).close()
+                last_link_path = results[-1].path
 
-
-
-
-
-
-
-
-    def
+    @cached_property
+    def _sql_orph_update(self):
+        now = sa.bindparam("p_now")
+        updated_since = sa.bindparam("p_updated_since")
+        Obj = sao.aliased(mo.Obj)
+        return Obj.make_sql_update_orphaned(now).where(Obj.updated_at >= updated_since)
+
+    def detect_orphaned(self):
+        # Update link count for files where it was invalidated.
+        self.integrity_check(skip_same_mtime=True, only_invalid_link_count=True)
+
+        C = sao.aliased(mo.DedupConfig)
+        KEY = "last_detect_orphaned"
         with self._beginw() as s:
-
-
+            last_check = s.execute(sa.select(C).where(C.key == KEY)).scalar()
+            if last_check is None:
+                s.add(last_check := mo.DedupConfig(key=KEY, value="0"))
+            since = int(last_check.value)
+            now = mo.now()
+            s.execute(self._sql_orph_update, {"p_now": now, "p_updated_since": since}).close()
+            last_check.value = str(now)
 
-
+    @cached_property
+    def _sql_gc_orphaned_to_pending(self):
+        O = sao.aliased(mo.Obj)
+        F = sao.aliased(mo.File)
+        F2 = sao.aliased(mo.File)
+        q = sa.select(F.id).join(O, F.obj).where(O.orphaned_at < sa.bindparam("p_cutoff"))
+        return _ns(
+            sa.update(F2).values(pending_id=sa.bindparam("p_pending_id")).where(F2.id.in_(q))
+        )
+
+    @cached_property
+    def _sql_gc_select_pending(self):
+        F = sao.aliased(mo.File)
+        P = sao.aliased(mo.Pending)
+        L = sao.aliased(mo.Link)
+        cond = P.expire_at < sa.bindparam("p_cutoff")
+        cond &= P.id != _lit(str(self._corrupted_pending_id))
+        q0 = sa.select(P.id).where(cond)
+        q1 = sa.select(F.id).join(P).where(cond)
+        q2 = sa.select(L.path).where(L.file_id.in_(q1))
+        return q0, q1, q2
+
+    def garbage_collect_dedup_files(
+        self, min_age_orphan_seconds: int, min_age_pending_seconds: int = None
+    ) -> None:
         """
         Remove dedup files that have no links to them as well as dedup files that were left behind
         by a failed batch of content insertion.
         """
-
-
-
-
-
-
-
-
-
-
-    def _garbage_collect_using_query(self, q, F):
-        F1 = sao.aliased(mo.DedupFile)
-        while True:
+
+        self.detect_orphaned()
+        now = mo.now()
+        orphan_cutoff = now - min_age_orphan_seconds
+        pending_cutoff = now - (min_age_pending_seconds or 7200)
+
+        self._garbage_collect_dedup_files(orphan_cutoff, pending_cutoff)
+
+    def _garbage_collect_dedup_files(self, orphan_cutoff: int | None, pending_cutoff: int):
+        if orphan_cutoff is not None:
             with self._beginw() as s:
-
-
-
-
-
+                s.add(pending := mo.Pending(expire_at=1))  # expiration time far into the past
+                s.flush()
+
+                # Convert orphaned files to pending. We will collect the pending afterwards.
+                params = {"p_pending_id": pending.id, "p_cutoff": orphan_cutoff}
+                s.execute(self._sql_gc_orphaned_to_pending, params).close()
+
+        with self._SessionR.begin() as s:
+            # Iterate through the expired pending File IDs and delete the files and links. Gather the list
+            # of pending objects that are finished.
+            params = {"p_cutoff": pending_cutoff}
+            q0, q1, q2 = self._sql_gc_select_pending
+            pending_ids = s.execute(q0, params).scalars().all()
+
+            for link_path in s.execute(q2, params).scalars():
+                p = self._link_path_from_string(link_path)
+                if p.exists():
+                    self._delete_file(p)
+
+            for file_id in s.execute(q1, params).scalars():
+                p = self._make_dedup_file_path(file_id)
+                if p.exists():
+                    self._delete_file(p)
 
-
-
-
-
+        # We only update the database after successfully deleting all the files and links.
+        P = sao.aliased(mo.Pending)
+        with self._beginw() as s:
+            s.execute(sa.delete(P).where(P.id.in_(pending_ids))).close()
 
     def garbage_collect_deleted(self):
         """
@@ -1102,70 +1382,73 @@ class Dedup(abc.ABC):

 This recursively lists every file in the dedup store, so it takes a long time.
 """
- F = sao.aliased(mo.
+ F = sao.aliased(mo.File)
 i2p = self._integer_to_path
 cutoff = mo.now() - 3600

+ t_f = sao.aliased(self._tmp_delete_extra.files)
+ q = sa.select(t_f.c.id).where(~sa.exists().select_from(F).where(F.id == t_f.c.id))
+
 base = self._path_dedup
-
-
-
-
-
-
+ with self._SessionR.begin() as s, mo.tmp_delete_extra(s, "") as tmp:
+ for chunk in chunked_iter(base.rglob("*"), self._batch_size):
+ file_ids = {}
+ for p in chunk:
+ if not p.is_file():
+ continue

-
-
-
-
-
-
+ try:
+ file_id = i2p.invert("/".join(p.relative_to(base).parts))
+ except InvalidPathError:
+ if p.stat().st_mtime < cutoff:
+ self._delete_file(p)
+ continue

-
+ file_ids[file_id] = p
+
+ if file_ids:
+ s.execute(sa.insert(tmp.files), tuple({"id": x} for x in file_ids)).close()
+ bad_file_ids = s.execute(q).scalars().all()
+ s.execute(sa.delete(tmp.files)).close()

- if file_ids:
- # We use a write transaction to avoid a race condition between checking that a path
- # does not contain a valid file ID and then later deleting that file outside the
- # transaction.
- with self._SessionW() as s, temporary_table(s, mo.tmp_ints) as tmp:
- s.execute(sa.insert(tmp), [{"id": x} for x in file_ids]).close()
- tmp_ = sa.alias(tmp)
- bad_file_ids = (
- s.execute(
- sa.select(tmp_.c.id).where(
- ~sa.exists().select_from(F).where(F.id == tmp_.c.id)
- )
- )
- .scalars()
- .all()
- )
 for file_id in bad_file_ids:
 self._delete_file(file_ids[file_id])

- for p in to_be_unlinked:
- self._delete_file(p)
-
 def corrupted_list(self) -> ty.Generator[Corrupted]:
 """
 Get the list of corrupted files found using :meth:`integrity_check`.
 """
-
-
-
-
-
-
-
-
-
+
+ L = sao.aliased(mo.Link)
+
+ with self._SessionR() as s:
+ for fc in s.execute(sa.select(mo.FileCorruption)).scalars():
+ fc.id
+ links_bytes = s.execute(sa.select(L.path).where(L.file_id == fc.id)).scalars().all()
+ links_paths = tuple((self._link_path_from_string(x)) for x in links_bytes)
+ yield Corrupted(
+ path=self._make_dedup_file_path(fc.id),
+ file_id=fc.id,
+ exception_name=fc.exception_name,
+ exception_string=fc.exception_string,
+ link_paths=links_paths,
+ raw_link_paths=tuple(links_bytes),
+ )

 def corrupted_clear(self):
 """
 Delete all corrupted files.
 """
-
-
-
+ F = sao.aliased(mo.File)
+ with self._beginw() as s:
+ s.add(p := mo.Pending(expire_at=1))
+ s.flush()
+ s.execute(
+ sa.update(F)
+ .values(pending_id=p.id)
+ .where(F.pending_id == self._corrupted_pending_id)
+ ).close()
+ self._garbage_collect_dedup_files(orphan_cutoff=None, pending_cutoff=2)

 @staticmethod
 def _copy_tree_default_fallback(src: Path, dst: Path):
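Taken together, `corrupted_list` and `corrupted_clear` form a small maintenance workflow: enumerate what `integrity_check` flagged, then discard it. A possible report loop over the generator above; the `dedup` instance is assumed to be an already-constructed backend:

def report_corruption(dedup) -> int:
    # Count and print the corrupted entries yielded by corrupted_list().
    n = 0
    for c in dedup.corrupted_list():
        n += 1
        print(f"{c.file_id}: {c.exception_name}: {c.exception_string}")
        for link in c.link_paths:
            print(f"  linked from {link}")
    return n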
@@ -1203,7 +1486,7 @@ class Dedup(abc.ABC):
 if to_copy:
 _run()

- def delete_tree(self, p: Path) -> None:
+ def delete_tree(self, p: Path, check_links: bool = True) -> None:
 def f(func, path, exc_info):
 if (p := Path(path)).exists():
 self._move_to_deleted(p)
@@ -1211,7 +1494,8 @@ class Dedup(abc.ABC):
 shutil.rmtree(str(p.absolute()), onerror=f)
 if p.exists():
 self._move_to_deleted(p)
-
+ if check_links:
+ self.check_links(p)

 def delete_file(self, p: Path) -> None:
 self._delete_file(p)
@@ -1247,20 +1531,24 @@ class Dedup(abc.ABC):
 def _filelock(self, path: Path, blocking: bool):
 return filelock.FileLock(path, blocking=blocking)

- @
+ @cached_property
+ def _path_temporary_simple_dir(self):
+ return self.path_temporary / "simple"
+
+ @cached_property
 def _path_temporary_dirs(self):
 return self.path_temporary / "dirs"

- @
+ @cached_property
 def _path_temporary_lock(self):
 return self.path_temporary / "lock"

- @
+ @cached_property
 def _path_temporary_master_lock(self):
 return self.path_temporary / "master.lock"

 @contextlib.contextmanager
- def temporary_directory(self, prefix="tmp_", suffix=""):
+ def temporary_directory(self, prefix="tmp_", suffix="", check_links: bool = True):
 exc = None
 for name in random_names(prefix=prefix, suffix=suffix):
 p = self._path_temporary_dirs / name
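The new `check_links` keyword threads from `temporary_directory` into `delete_tree`, so callers whose scratch trees never contain dedup links can skip the link check on cleanup. A hypothetical usage sketch; `dedup` is an already-constructed backend and the file name is arbitrary:

def build_in_scratch(dedup, payload: bytes) -> None:
    # check_links=False: nothing in this tree is a dedup link, so skip the check.
    with dedup.temporary_directory(prefix="build_", check_links=False) as tmp:
        (tmp / "artifact.bin").write_bytes(payload)
        # ... work with the files; the whole tree is removed on exit ...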
@@ -1287,7 +1575,7 @@ class Dedup(abc.ABC):
 yield p
 break
 finally:
- self.delete_tree(p)
+ self.delete_tree(p, check_links=check_links)

 # Release the lock file. We will attempt to delete it next.
 ex.close()
@@ -1304,18 +1592,62 @@ class Dedup(abc.ABC):
 else:
 raise AssertionError("retry count exceeded, unknown cause") if exc is None else exc

+ @cached_property
+ def _sql_obh_select_file(self):
+ F = sao.aliased(mo.File)
+ O = sao.aliased(mo.Obj)
+ H = sao.aliased(mo.Hash)
+ b = sa.bindparam
+ q = (
+ sa.select(
+ F.id.label("file_id"),
+ sa.case((O.orphaned_at != None, O.id), else_=None).label("obj_id"),
+ )
+ .join(O, F.obj)
+ .join(H, O.hashes)
+ .where(F.pending_id == None, H.hash_function == b("p_hf"), H.hash == b("p_h"))
+ .limit(_lit("1"))
+ )
+ return q
+
+ @cached_property
+ def _sql_obh_update_obj(self):
+ O = sao.aliased(mo.Obj)
+ b = sa.bindparam
+ return (
+ sa.update(O)
+ .values(orphaned_at=sa.case((O.orphaned_at != None, b("p_now")), else_=None))
+ .where(O.id == b("p_obj_id"))
+ )
+
+ def open_by_hash(self, digest: mh.Digest) -> ty.BinaryIO | None:
+ d = {"p_hf": digest.function.function_code, "p_h": digest.digest}
+ with self._beginw() as s:
+ c = s.connection()
+ if (r := c.execute(self._sql_obh_select_file, d).one_or_none()) is None:
+ return None
+ file_id, obj_id = r
+
+ if obj_id is not None:
+ d = {"p_now": mo.now(), "p_obj_id": obj_id}
+ c.execute(self._sql_obh_update_obj, d).close()
+
+ return self._make_dedup_file_path(file_id).open("rb")
+
 @cached_property
 def _q_get_hash(self):
 L = sao.aliased(mo.Link)
- F = sao.aliased(mo.
+ F = sao.aliased(mo.File)
+ O = sao.aliased(mo.Obj)
 H = sao.aliased(mo.Hash)
 return (
- sa.select(L, H,
+ sa.select(L, H, O.size)
 .select_from(L)
 .join(F, L.file)
- .
+ .join(O, F.obj)
+ .outerjoin(H, (Rel(H.obj) == O) & (H.hash_function == sa.bindparam("x_hf")))
 .options(sao.contains_eager(L.file.of_type(F)))
- .where(L.
+ .where(L.path == sa.bindparam("x_link_path"), F.pending == None)
 )

 def _query_by_link_path(
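The new `open_by_hash` resolves a content digest to a non-pending stored file, clears the owning object's orphan timestamp when needed, and returns an open binary handle (or `None` when the hash is unknown). A sketch of the caller side; `dedup` and `hash_function` are assumptions, and the hasher API mirrors the one used elsewhere in this module:

def read_by_content(dedup, hash_function, data: bytes) -> bytes | None:
    # Hash the payload the same way the store does, then look it up.
    hasher = hash_function()
    hasher.update(data)
    f = dedup.open_by_hash(hasher.digest())
    if f is None:
        return None  # no stored, non-pending file carries this digest
    with f:
        return f.read()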
@@ -1357,19 +1689,24 @@ class Dedup(abc.ABC):
 ) -> tuple[int, mh.Digest] | None:
 r = self.get_file_hash(hash_function, path, **kw)
 if r is None:
- hasher = hash_function()
- size = 0
 with path.open("rb") as f:
-
- size += len(block)
- hasher.update(block)
- r = size, hasher.digest()
+ r = self._compute_file_hash(hash_function, f)
 return r

+ def _compute_file_hash(self, hash_function, file):
+ size = 0
+ hasher = hash_function()
+ while block := file.read(65536):
+ size += len(block)
+ hasher.update(block)
+ return size, hasher.digest()
+
 def adopt_files(
 self, hash_function: mh.HashFunction, requests: ty.Iterable[AdoptRequest]
 ) -> None:
 """
+ HACK: DO NOT RUN THIS ON EXISTING DEDUP LINKS
+
 Adopt each file given in *paths*. If the path is already a dedup link, then leave it
 alone. If the path is not a dedup link, then compute its hash and move the file to the
 dedup store and create a link to it. If the path is already a dedup link but does not
@@ -1378,125 +1715,25 @@ class Dedup(abc.ABC):

 This method is implemented in a somewhat inefficient way.
 """
-
-
-
-
-
-
-
-
-
- if h is not None:
- x.req.out_digest = h.to_digest()
- x.req.out_size = sz
- x.done = True
-
- if not x.done:
- with open(x.req.path, "rb") as f:
- h = hash_function()
- size = 0
- while block := f.read(65536):
- h.update(block)
- size += len(block)
- x.req.out_digest = h.digest()
- x.file_metadata = DedupFileMetadata(executable=False) # TODO
- x.req.out_size = size
- x.file_metadata_bytes = self.convert_file_metadata_to_bytes(x.file_metadata)
-
- F = sao.aliased(mo.DedupFile)
- H = sao.aliased(mo.Hash)
- q = (
- sa.select(F)
- .join(H, F.hashes)
- .where(
- H.hash_function == sa.bindparam("x_hf"),
- H.hash == sa.bindparam("x_h"),
- F.pending == None,
- F.file_metadata == sa.bindparam("x_f_meta"),
+ self.run_batch(
+ DedupLinkRequest(
+ hash_function=hash_function,
+ link_path=req.path,
+ tags=req.tags,
+ file_metadata=self.get_metadata_from_file(req.path),
+ open_file_once=None,
+ adopt_existing=True,
+ file_contents_hash=None,
 )
+ for req in requests
 )

- # then we use a RW session to update the database
- with self._beginw() as s:
- for x in reqs:
- if x.done:
- continue
-
- # re-check for an existing link
- existing = self._query_by_link_path(s, x.link_path, hash_function)
- if existing:
- l, h, sz = existing[0]
- file = l.file
- if h is None:
- s.add(mo.Hash.from_digest(x.req.out_digest, file=file))
- else:
- # never mind, nothing to do here
- x.req.out_size = sz
- x.req.out_digest = h.to_digest()
- x.done = True
- continue
- else:
- # try to lookup by digest first
- # TODO: also look up by tag
- files = (
- s.execute(
- q,
- dict(
- x_hf=hash_function.function_code,
- x_h=x.req.out_digest.digest,
- x_f_meta=x.file_metadata_bytes,
- ),
- )
- .scalars()
- .all()
- )
- if files:
- file = files[0]
- else:
- file = None
- if file is not None:
- file.orphaned_at = None
- x.delete = True
- else:
- # no existing file found, need to create one
- file = mo.DedupFile(
- file_metadata=x.file_metadata_bytes,
- size=x.req.out_size,
- mtime=int(x.req.path.stat().st_mtime),
- orphaned_at=None,
- pending=None,
- hashes=[mo.Hash.from_digest(x.req.out_digest)],
- )
- s.add(file)
- s.flush() # we need to make sure the file has an ID
-
- s.add(mo.Link(link_path=x.link_path, file=file))
-
- x.dedup_file_path = self._make_dedup_file_path(file.id)
-
- # We add our tags.
- self._add_tags_to_file(s, file, x.req.tags)
-
- s.flush()
-
- # and finally we make filesystem changes
- for x in reqs:
- if (dst := x.dedup_file_path) is not None:
- if x.delete:
- # We already have a DedupFile with the required contents, so we replace the
- # link_path file with a link to that existing DedupFile.
- self._delete_file(x.req.path)
- self._create_actual_link(dst, x.req.path)
- else:
- dst.parent.mkdir(exist_ok=True, parents=True)
- self._adopt_file_and_link(x.req.path, dst)
-
 def integrity_check(
 self,
 skip_same_mtime: bool,
 threads: int | None = None,
-
+ *,
+ only_invalid_link_count: bool = False,
 ):
 """
 Verify all deduplicated files match their stored hashes. Use modification time to skip
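`adopt_files` is now a thin wrapper that turns each request into a `DedupLinkRequest` with `adopt_existing=True` and hands the whole batch to `run_batch`. A hedged caller-side sketch; `AdoptRequest` is only known here to expose `.path` and `.tags`, so the constructor arguments below are assumptions:

from pathlib import Path

def adopt_directory(dedup, hash_function, root: Path) -> None:
    # Adopt every regular file under root into the dedup store.
    requests = [
        AdoptRequest(path=p, tags=frozenset())  # field names assumed
        for p in root.rglob("*")
        if p.is_file()
    ]
    dedup.adopt_files(hash_function, requests)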
@@ -1504,17 +1741,28 @@ class Dedup(abc.ABC):
 :attr:`path_corrupted`.
 """

- F = sao.aliased(mo.
+ F = sao.aliased(mo.File)
+ O = sao.aliased(mo.Obj)
 batch_size = 1000
- q = sa.select(F).options(sao.selectinload(F.
+ q = sa.select(F).options(sao.selectinload(F.obj.of_type(O)).selectinload(O.hashes))
+ if only_invalid_link_count:
+ q = q.where(F.link_count < _lit("0"))
+ q = q.where(F.pending_id == None).order_by(F.id).limit(batch_size)

- def _hash_check(file: mo.
+ def _hash_check(file: mo.File) -> None:
 p = self._make_dedup_file_path(file.id)
-
-
-
+ st = p.stat()
+
+ # FIXME: specific to hardlink backend
+ if (n := st.st_nlink - 1) != file.link_count:
+ link_count_updates.append({"id": file.id, "link_count": n})
+ changed_obj_ids.add(file.obj_id)
+
+ if skip_same_mtime:
+ if (st_mtime := int(st.st_mtime)) == self._clean_dedup_mtime:
+ return

- d = file.hashes_dict
+ d = file.obj.hashes_dict
 m = mh.MultiHasher({hf: hf() for hf in d})
 with p.open("rb") as fh:
 while block := fh.read(65536):
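For the hardlink backend, the worker above derives the number of user-visible links from the filesystem itself: the store's own copy accounts for one hard link, so everything beyond that is an external link. A small sketch of that accounting (path handling assumed):

import os

def external_link_count(dedup_file_path) -> int:
    # st_nlink counts the store's own copy too, hence the minus one.
    return os.stat(dedup_file_path).st_nlink - 1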
@@ -1524,22 +1772,21 @@ class Dedup(abc.ABC):

 # TODO: also check file metadata matches, such as the executable bit

-
- with self._SessionW() as s:
- IdKey.from_instance(file).get_one(s).mtime = st_mtime
-
- id_min = -1
+ id_min = None
 with cf.ThreadPoolExecutor(max_workers=threads) as exe:
 while True:
+ changed_obj_ids = set()
+ link_count_updates = []
 invalid_file_ids = []

 with self._SessionR() as s:
- q2 = q.where(F.id > id_min
- dedup_files: list[mo.
+ q2 = q if id_min is None else q.where(F.id > id_min)
+ dedup_files: list[mo.File] = s.execute(q2).scalars().all()

 if not dedup_files:
 break

+ s.expunge_all()
 id_min = dedup_files[-1].id
 futures = {exe.submit(_hash_check, f): f for f in dedup_files}
 for future in cf.as_completed(futures):
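The loop above pages through the `File` table by primary key rather than by offset: each batch is ordered by `id`, limited to `batch_size`, and the next batch starts strictly after the last `id` seen. A generic sketch of that keyset-pagination pattern, where `fetch_batch` stands in for the ordered, limited query:

from __future__ import annotations
import typing as ty

def paginate(fetch_batch: ty.Callable[[int | None], list], process: ty.Callable) -> None:
    id_min = None
    while True:
        rows = fetch_batch(id_min)  # ordered by id, at most one batch
        if not rows:
            break
        id_min = rows[-1].id        # resume strictly after the last row seen
        for row in rows:
            process(row)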
@@ -1549,53 +1796,37 @@ class Dedup(abc.ABC):
 raise exc

 file = futures[future]
-
- invalid_file_ids.append(file.id)
+ invalid_file_ids.append((file.id, exc))

- if
- with self.
+ if link_count_updates:
+ with self._beginw() as s:
+ s.execute(sa.update(F), link_count_updates).close()
+ now = mo.now()
 s.connection().execute(
- sa.
-
-
-
-
- self, s: sao.Session, file: mo.DedupFile, exc: Exception, keep_corrupted: bool
- ):
- """
- Process one file that has been found to be corrupted.
- """
-
- path_file = self._make_dedup_file_path(file.id)
-
- # Load the links as we will need them
- s.refresh(file, ["links"])
-
- link_paths = [self._link_path_from_string(link.link_path) for link in file.links]
- json_data = {
- "file_id": file.id,
- "link_paths": [str(x) for x in link_paths],
- "raw_link_paths": [
- link.link_path.decode("utf-8", errors="replace") for link in file.links
- ],
- "exception": repr(exc),
- }
-
- with create_file_random(self.path_corrupted, "f_", ".json") as f:
- path_json = Path(f.name)
- f.write(json.dumps(json_data, indent=2, sort_keys=True).encode("utf-8"))
-
- if keep_corrupted:
- try:
- path_file.rename(path_json.with_suffix(".bin"))
- except Exception:
- if path_file.exists():
- logger.warning(
- "failed to rename corrupt file", exc_info=True, data=str(path_file)
- )
+ sa.update(O)
+ .where(O.id == sa.bindparam("p_id"))
+ .values(updated_at=sa.bindparam("p_now")),
+ [{"p_id": x, "p_now": now} for x in changed_obj_ids],
+ ).close()

-
-
+ if invalid_file_ids:
+ with self._beginw() as s:
+ s.execute(
+ sa.insert(mo.FileCorruption),
+ [
+ {
+ "id": file_id,
+ "exception_name": type(exc).__name__,
+ "exception_string": str(exc),
+ }
+ for file_id, exc in invalid_file_ids
+ ],
+ ).close()
+ s.connection().execute(
+ sa.update(F)
+ .values(pending_id=self._corrupted_pending_id)
+ .where(F.id.in_(x[0] for x in invalid_file_ids))
+ ).close()

 class _compute_stats_ZeroRow:
 orphaned = None
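When a hash mismatch is found, the file's ID is now recorded in `FileCorruption` together with the exception, and the file itself is parked under the reserved corrupted pending ID, which is what `corrupted_list` and `corrupted_clear` later consume. An assumed end-to-end maintenance flow built only from the methods shown in this diff:

def maintenance_pass(dedup) -> int:
    # Flag corrupted payloads, count them, then discard them.
    dedup.integrity_check(skip_same_mtime=True)
    flagged = sum(1 for _ in dedup.corrupted_list())
    dedup.corrupted_clear()
    return flagged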
@@ -1603,18 +1834,22 @@ class Dedup(abc.ABC):
 size = 0

 def compute_stats(self) -> DedupStats:
+ self.detect_orphaned()
+
 with self._SessionR() as s:
-
+ O = sao.aliased(mo.Obj)
+ F = sao.aliased(mo.File)
 L = sao.aliased(mo.Link)
- orph =
+ orph = O.orphaned_at != None

 q = (
 sa.select(
 orph.label("orphaned"),
 sa.func.count().label("count"),
- sa.func.sum(
+ sa.func.sum(O.size).label("size"),
 )
 .select_from(F)
+ .join(O, F.obj)
 .where(F.pending == None)
 .group_by(orph)
 )
@@ -1622,9 +1857,10 @@ class Dedup(abc.ABC):
 file_stats |= {row.orphaned: row for row in s.execute(q).all()}

 q = (
- sa.select(sa.func.count().label("count"), sa.func.sum(
+ sa.select(sa.func.count().label("count"), sa.func.sum(O.size).label("size"))
 .select_from(L)
 .join(F, L.file)
+ .join(O, F.obj)
 ).where(F.pending == None)
 link_stats = s.execute(q).one()

@@ -1639,6 +1875,8 @@ class Dedup(abc.ABC):


 class DedupBackendHardlink(Dedup):
+ max_link_count = 1000 # Windows limits it to 1023
+
 def _create_actual_link(self, existing: Path, new: Path):
 # Path.link_to was removed and replaced by Path.hardlink_to, but I want this to work across
 # Python 3.9 to 3.13
@@ -1649,14 +1887,13 @@ class DedupBackendHardlink(Dedup):
 self._create_actual_link(existing_path, dedup_file_path)

 def _verify_link(self, link: mo.Link) -> bool:
- p =
-
+ p = self._link_path_from_string(link.path)
 try:
 a = p.lstat()
 except Exception:
 return False

- if
+ if int(a.st_mtime) != self._clean_dedup_mtime:
 return False

 # st_ino is 0 on unsupported filesystems on Windows.