PyHardLinkBackup 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyHardLinkBackup/__init__.py +7 -0
- PyHardLinkBackup/__main__.py +10 -0
- PyHardLinkBackup/backup.py +203 -0
- PyHardLinkBackup/cli_app/__init__.py +41 -0
- PyHardLinkBackup/cli_app/phlb.py +50 -0
- PyHardLinkBackup/cli_dev/__init__.py +70 -0
- PyHardLinkBackup/cli_dev/benchmark.py +119 -0
- PyHardLinkBackup/cli_dev/code_style.py +12 -0
- PyHardLinkBackup/cli_dev/packaging.py +65 -0
- PyHardLinkBackup/cli_dev/shell_completion.py +23 -0
- PyHardLinkBackup/cli_dev/testing.py +52 -0
- PyHardLinkBackup/cli_dev/update_readme_history.py +33 -0
- PyHardLinkBackup/constants.py +16 -0
- PyHardLinkBackup/tests/__init__.py +36 -0
- PyHardLinkBackup/tests/test_backup.py +399 -0
- PyHardLinkBackup/tests/test_doc_write.py +25 -0
- PyHardLinkBackup/tests/test_doctests.py +10 -0
- PyHardLinkBackup/tests/test_project_setup.py +46 -0
- PyHardLinkBackup/tests/test_readme.py +75 -0
- PyHardLinkBackup/tests/test_readme_history.py +8 -0
- PyHardLinkBackup/utilities/__init__.py +0 -0
- PyHardLinkBackup/utilities/file_hash_database.py +58 -0
- PyHardLinkBackup/utilities/file_size_database.py +46 -0
- PyHardLinkBackup/utilities/filesystem.py +133 -0
- PyHardLinkBackup/utilities/humanize.py +22 -0
- PyHardLinkBackup/utilities/rich_utils.py +98 -0
- PyHardLinkBackup/utilities/tests/__init__.py +0 -0
- PyHardLinkBackup/utilities/tests/test_file_hash_database.py +134 -0
- PyHardLinkBackup/utilities/tests/test_file_size_database.py +131 -0
- PyHardLinkBackup/utilities/tests/test_filesystem.py +94 -0
- pyhardlinkbackup-1.0.0.dist-info/METADATA +547 -0
- pyhardlinkbackup-1.0.0.dist-info/RECORD +34 -0
- pyhardlinkbackup-1.0.0.dist-info/WHEEL +4 -0
- pyhardlinkbackup-1.0.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class HashAlreadyExistsError(ValueError):
    """Raised when a hash entry already exists and an overwrite is denied."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FileHashDatabase:
|
|
10
|
+
"""DocWrite: README.md ## FileHashDatabase
|
|
11
|
+
A simple "database" to store file content hash <-> relative path mappings.
|
|
12
|
+
Uses a directory structure to avoid too many files in a single directory.
|
|
13
|
+
Path structure:
|
|
14
|
+
{base_dst}/.phlb/hash-lookup/{XX}/{YY}/{hash}
|
|
15
|
+
e.g.:
|
|
16
|
+
hash '12ab000a1b2c3...' results in: {base_dst}/.phlb/hash-lookup/12/ab/12ab000a1b2c3...
|
|
17
|
+
|
|
18
|
+
Notes:
|
|
19
|
+
* Hash length will be not validated, so it can be used with any hash algorithm.
|
|
20
|
+
* The "relative path" that will be stored is not validated, so it can be any string.
|
|
21
|
+
* We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
def __init__(self, backup_root: Path, phlb_conf_dir: Path):
|
|
25
|
+
self.backup_root = backup_root
|
|
26
|
+
self.base_path = phlb_conf_dir / 'hash-lookup'
|
|
27
|
+
self.base_path.mkdir(parents=False, exist_ok=True)
|
|
28
|
+
|
|
29
|
+
def _get_hash_path(self, hash: str) -> Path:
|
|
30
|
+
first_dir_name = hash[:2]
|
|
31
|
+
second_dir_name = hash[2:4]
|
|
32
|
+
hash_path = self.base_path / first_dir_name / second_dir_name / hash
|
|
33
|
+
return hash_path
|
|
34
|
+
|
|
35
|
+
def get(self, hash: str) -> Path | None:
|
|
36
|
+
hash_path = self._get_hash_path(hash)
|
|
37
|
+
try:
|
|
38
|
+
rel_file_path = hash_path.read_text()
|
|
39
|
+
except FileNotFoundError:
|
|
40
|
+
return None
|
|
41
|
+
else:
|
|
42
|
+
abs_file_path = self.backup_root / rel_file_path
|
|
43
|
+
if not abs_file_path.is_file():
|
|
44
|
+
logging.warning('Hash database entry found, but file does not exist: %s', abs_file_path)
|
|
45
|
+
hash_path.unlink()
|
|
46
|
+
return None
|
|
47
|
+
return abs_file_path
|
|
48
|
+
|
|
49
|
+
def __setitem__(self, hash: str, abs_file_path: Path):
|
|
50
|
+
hash_path = self._get_hash_path(hash)
|
|
51
|
+
hash_path.parent.mkdir(parents=True, exist_ok=True)
|
|
52
|
+
|
|
53
|
+
# File should be found before and results in hardlink creation!
|
|
54
|
+
# So deny change of existing hashes:
|
|
55
|
+
if hash_path.exists():
|
|
56
|
+
raise HashAlreadyExistsError(f'Hash {hash} already exists in the database!')
|
|
57
|
+
|
|
58
|
+
hash_path.write_text(str(abs_file_path.relative_to(self.backup_root)))
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class FileSizeDatabase:
    """DocWrite: README.md ## FileSizeDatabase
    A simple "database" to track which file sizes have been seen.

    Uses a directory structure to avoid too many files in a single directory.
    We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.
    """

    MIN_SIZE = 1000  # no padding is made, so the min size is 1000 bytes!

    def __init__(self, phlb_conf_dir: Path):
        self.base_path = phlb_conf_dir / 'size-lookup'
        # phlb_conf_dir must already exist -> parents=False
        self.base_path.mkdir(parents=False, exist_ok=True)

    def _get_size_path(self, size: int) -> Path:
        """Map *size* to its lookup file path. Raises AssertionError for too-small sizes."""
        # Bug fix: a bare `assert` is stripped when Python runs with -O, which would
        # silently disable this validation. Raise AssertionError explicitly instead
        # (same exception type/message, so existing callers/tests are unaffected).
        if size < self.MIN_SIZE:
            raise AssertionError(f'Size must be at least {self.MIN_SIZE} bytes')
        size_str = str(size)

        """DocWrite: README.md ## FileSizeDatabase
        Path structure:
        * `{base_dst}/.phlb/size-lookup/{XX}/{YY}/{size}`

        e.g.:

        * `1234567890` results in: `{base_dst}/.phlb/size-lookup/12/34/1234567890`
        """
        first_dir_name = size_str[:2]
        second_dir_name = size_str[2:4]
        size_path = self.base_path / first_dir_name / second_dir_name / size_str
        return size_path

    def __contains__(self, size: int) -> bool:
        """True if *size* has been recorded before."""
        size_path = self._get_size_path(size)
        return size_path.exists()

    def add(self, size: int):
        """Record *size* as seen (no-op if already recorded)."""
        size_path = self._get_size_path(size)
        if not size_path.exists():
            size_path.parent.mkdir(parents=True, exist_ok=True)

            """DocWrite: README.md ## FileSizeDatabase
            All files are created empty, as we only care about their existence."""
            size_path.touch(exist_ok=False)
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Iterable
|
|
8
|
+
|
|
9
|
+
from rich.progress import (
|
|
10
|
+
Progress,
|
|
11
|
+
SpinnerColumn,
|
|
12
|
+
TextColumn,
|
|
13
|
+
TimeElapsedColumn,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
from PyHardLinkBackup.constants import CHUNK_SIZE, HASH_ALGO
|
|
17
|
+
from PyHardLinkBackup.utilities.rich_utils import HumanFileSizeColumn
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def hash_file(path: Path) -> str:
    """Return the hexadecimal content digest of the file at *path* (algorithm: HASH_ALGO)."""
    logger.debug('Hash file %s using %s', path, HASH_ALGO)
    with path.open('rb') as file_obj:
        # hashlib.file_digest() reads the file chunk-wise in C, keeping memory usage low.
        file_hash = hashlib.file_digest(file_obj, HASH_ALGO).hexdigest()
    logger.info('%s %s hash: %s', path, HASH_ALGO, file_hash)
    return file_hash
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def copy_and_hash(src: Path, dst: Path) -> str:
    """Copy *src* to *dst* chunk-wise and return the HASH_ALGO hex digest of the content.

    The file is read only once: each chunk is hashed while it is written.
    """
    logger.debug('Copy and hash file %s to %s using %s', src, dst, HASH_ALGO)
    hasher = hashlib.new(HASH_ALGO)
    with src.open('rb') as in_file, dst.open('wb') as out_file:
        # iter(callable, sentinel): read CHUNK_SIZE bytes until EOF (b'').
        for chunk in iter(lambda: in_file.read(CHUNK_SIZE), b''):
            out_file.write(chunk)
            hasher.update(chunk)

    # Keep original file metadata (permission bits, last access time, last modification time, and flags)
    shutil.copystat(src, dst)

    file_hash = hasher.hexdigest()
    logger.info('%s backup to %s with %s hash: %s', src, dst, HASH_ALGO, file_hash)
    return file_hash
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def read_and_hash_file(path: Path) -> tuple[bytes, str]:
    """Load the complete file content into memory and return ``(content, hex_digest)``.

    Unlike hash_file(), this keeps the whole file in RAM - intended for small files.
    """
    logger.debug('Read and hash file %s using %s into RAM', path, HASH_ALGO)
    content = path.read_bytes()
    file_hash = hashlib.new(HASH_ALGO, content).hexdigest()
    logger.info('%s %s hash: %s', path, HASH_ALGO, file_hash)
    return content, file_hash
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def iter_scandir_files(path: Path, excludes: set[str]) -> Iterable[os.DirEntry]:
    """
    Recursively yield all files+symlinks in the given directory.

    Directories whose name is in *excludes* are skipped (not descended into).
    """
    logger.debug('Scanning directory %s', path)
    with os.scandir(path) as scandir_iterator:
        for entry in scandir_iterator:
            if not entry.is_dir(follow_symlinks=True):
                # It's a file or symlink or broken symlink
                yield entry
            elif entry.name in excludes:
                logger.debug('Excluding directory %s', entry.path)
            else:
                # NOTE(review): follow_symlinks=True also descends into symlinked
                # directories - presumably intended, but could loop forever on a
                # symlink cycle. TODO confirm.
                yield from iter_scandir_files(Path(entry.path), excludes=excludes)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def humanized_fs_scan(path: Path, excludes: set[str]) -> tuple[int, int]:
    """Walk *path* recursively with a live progress display; return ``(file_count, total_size)``.

    Directory names listed in *excludes* are skipped entirely.
    Entries whose stat() fails with FileNotFoundError (e.g. broken symlinks) are
    counted in file_count but contribute no size.
    """
    print(f'\nScanning filesystem at: {path}...')

    # Indeterminate progress line: the total is unknown up-front (total=None below).
    progress = Progress(
        TimeElapsedColumn(),
        '{task.description}',
        SpinnerColumn('simpleDots'),
        TextColumn('[green]{task.fields[file_count]} Files'),
        HumanFileSizeColumn(field_name='total_size'),
        TextColumn('| [cyan]{task.fields[files_per_sec]} Files/sec'),
    )

    file_count = 0
    total_size = 0
    start_time = time.time()
    scan_task_id = progress.add_task(
        description='Scanning',
        file_count=file_count,
        total_size=total_size,
        files_per_sec=0.0,
        total=None,  # unknown total -> indeterminate/spinner mode
    )
    next_update = 0  # 0 forces a display update on the first iteration
    with progress:
        for entry in iter_scandir_files(path, excludes=excludes):
            file_count += 1
            try:
                total_size += entry.stat().st_size
            except FileNotFoundError:
                # e.g.: broken symlink
                continue

            now = time.time()
            # Throttle UI updates to roughly once per second.
            if now >= next_update:
                elapsed = max(now - start_time, 1e-6)  # guard against division by zero
                files_per_sec = int(file_count / elapsed)
                progress.update(
                    scan_task_id,
                    file_count=file_count,
                    total_size=total_size,
                    files_per_sec=files_per_sec,
                )
                next_update = now + 1

        now = time.time()

        # Final update so the last counts are shown before the context closes.
        elapsed = max(now - start_time, 1e-6)
        files_per_sec = int(file_count / elapsed)
        progress.stop_task(scan_task_id)
        progress.update(
            scan_task_id,
            description='Completed',
            completed=True,
            file_count=file_count,
            total_size=total_size,
            files_per_sec=files_per_sec,
        )

    return file_count, total_size
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
def human_filesize(size: int | float) -> str:
|
|
4
|
+
"""
|
|
5
|
+
>>> human_filesize(1024)
|
|
6
|
+
'1.00 KiB'
|
|
7
|
+
>>> human_filesize(2.2*1024)
|
|
8
|
+
'2.20 KiB'
|
|
9
|
+
>>> human_filesize(3.33*1024*1024)
|
|
10
|
+
'3.33 MiB'
|
|
11
|
+
>>> human_filesize(4.44*1024*1024*1024)
|
|
12
|
+
'4.44 GiB'
|
|
13
|
+
>>> human_filesize(5.55*1024*1024*1024*1024)
|
|
14
|
+
'5.55 TiB'
|
|
15
|
+
>>> human_filesize(6.66*1024*1024*1024*1024*1024)
|
|
16
|
+
'6.66 PiB'
|
|
17
|
+
"""
|
|
18
|
+
for unit in ['Bytes', 'KiB', 'MiB', 'GiB', 'TiB']:
|
|
19
|
+
if size < 1024.0:
|
|
20
|
+
return f'{size:.2f} {unit}'
|
|
21
|
+
size /= 1024.0
|
|
22
|
+
return f'{size:.2f} PiB'
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
from rich.live import Live
|
|
2
|
+
from rich.panel import Panel
|
|
3
|
+
from rich.progress import (
|
|
4
|
+
BarColumn,
|
|
5
|
+
Progress,
|
|
6
|
+
ProgressColumn,
|
|
7
|
+
TaskProgressColumn,
|
|
8
|
+
TextColumn,
|
|
9
|
+
TimeElapsedColumn,
|
|
10
|
+
TimeRemainingColumn,
|
|
11
|
+
TransferSpeedColumn,
|
|
12
|
+
)
|
|
13
|
+
from rich.table import Table
|
|
14
|
+
from rich.text import Text
|
|
15
|
+
|
|
16
|
+
from PyHardLinkBackup.utilities.humanize import human_filesize
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class HumanFileSizeColumn(ProgressColumn):
    """Rich progress column that renders a byte count as a human readable size.

    If *field_name* is given, the value is read from ``task.fields[field_name]``,
    otherwise ``task.completed`` is used.
    """

    def __init__(self, field_name: str | None = None, **kwargs) -> None:
        super().__init__(**kwargs)
        self.field_name = field_name

    def render(self, task):
        if self.field_name is None:
            file_size = task.completed
        elif self.field_name in task.fields:
            file_size = task.fields[self.field_name]
        else:
            raise KeyError(f'Field {self.field_name=} not found in: {task.fields.keys()=}')
        return Text(f'| {human_filesize(file_size)}')
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class BackupProgress:
    """Composite live progress display for a running backup.

    Three stacked bars (overall / file count / bytes processed) rendered in one
    ``rich.live.Live`` table. Use as a context manager and call ``update()``
    after each processed file.
    """

    def __init__(self, src_file_count: int, src_total_size: int):
        # Overall bar: completed is the mean of the two detail bars' percentages.
        self.overall_progress = Progress(
            TaskProgressColumn(),
            BarColumn(bar_width=50),
            TextColumn('Elapsed:'),
            TimeElapsedColumn(),
            TextColumn('Remaining:'),
            TimeRemainingColumn(),
        )
        self.overall_progress_task_id = self.overall_progress.add_task(description='', total=100)

        # Detail bar: number of files processed out of src_file_count.
        self.file_count_progress = Progress(
            TaskProgressColumn(),
            BarColumn(bar_width=50),
            TextColumn('{task.completed} Files'),
        )
        self.file_count_progress_task_id = self.file_count_progress.add_task(description='', total=src_file_count)
        # Keep the Task object to read its .percentage in update().
        self.file_count_progress_task = self.file_count_progress.tasks[0]

        # Detail bar: bytes processed out of src_total_size.
        self.file_size_progress = Progress(
            TaskProgressColumn(),
            BarColumn(bar_width=50),
            HumanFileSizeColumn(),
            '|',
            TransferSpeedColumn(),
        )
        self.file_size_progress_task_id = self.file_size_progress.add_task(description='', total=src_total_size)
        self.file_size_progress_task = self.file_size_progress.tasks[0]

        progress_table = Table.grid()
        progress_table.add_row(Panel(self.overall_progress, title='[b]Overall Backup Progress', border_style='green'))
        progress_table.add_row(Panel(self.file_count_progress, title='Total files saved'))
        progress_table.add_row(Panel(self.file_size_progress, title='Total file size processed'))

        # auto_refresh=False: redraws happen only via update() -> live.refresh().
        self.live = Live(progress_table, auto_refresh=False)

    def __enter__(self):
        self.live.__enter__()
        return self

    def update(self, backup_count: int, backup_size: int):
        """Push current totals into all three bars and redraw once."""
        # The two detail bars are updated first, so their .percentage values
        # are current when the overall completion is computed below.
        self.file_count_progress.update(
            task_id=self.file_count_progress_task_id,
            completed=backup_count,
            refresh=True,
        )
        self.file_size_progress.update(
            task_id=self.file_size_progress_task_id,
            completed=backup_size,
            refresh=True,
        )
        self.overall_progress.update(
            task_id=self.overall_progress_task_id,
            completed=(self.file_count_progress_task.percentage + self.file_size_progress_task.percentage) / 2,
            refresh=True,
        )
        self.live.refresh()

    def __exit__(self, exc_type, exc_value, traceback):
        # Stop all progress clocks before tearing down the live display.
        self.overall_progress.stop()
        self.file_count_progress.stop()
        self.file_size_progress.stop()
        self.live.__exit__(exc_type, exc_value, traceback)
|
|
File without changes
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import tempfile
|
|
3
|
+
import textwrap
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from unittest import TestCase
|
|
6
|
+
|
|
7
|
+
from bx_py_utils.path import assert_is_dir
|
|
8
|
+
from bx_py_utils.test_utils.assertion import assert_text_equal
|
|
9
|
+
|
|
10
|
+
from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase, HashAlreadyExistsError
|
|
11
|
+
from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TemporaryFileHashDatabase(tempfile.TemporaryDirectory):
    """Context manager: temporary backup root with a fresh FileHashDatabase inside.

    Yields the FileHashDatabase instance instead of the directory path.
    """

    def __enter__(self) -> FileHashDatabase:
        backup_root = Path(super().__enter__())

        phlb_conf_dir = backup_root / '.phlb'
        phlb_conf_dir.mkdir()

        return FileHashDatabase(backup_root=backup_root, phlb_conf_dir=phlb_conf_dir)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_hash_db_filenames(hash_db: FileHashDatabase) -> list[str]:
    """All stored hash entry files as sorted paths, relative to the database base dir."""
    base = hash_db.base_path
    rel_names = [
        str(Path(file_entry.path).relative_to(base))
        for file_entry in iter_scandir_files(base, excludes=set())
    ]
    rel_names.sort()
    return rel_names
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_hash_db_info(backup_root: Path) -> str:
    """Build a sorted, human readable dump of all hash database entries."""
    db_base_path = backup_root / '.phlb' / 'hash-lookup'
    assert_is_dir(db_base_path)

    lines = []
    for entry in iter_scandir_files(db_base_path, excludes=set()):
        hash_path = Path(entry.path)
        # Truncate the relative entry path so long hashes stay readable.
        truncated = str(hash_path.relative_to(db_base_path))[:20]
        lines.append(f'{truncated}… -> {hash_path.read_text()}')
    return '\n'.join(sorted(lines))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def assert_hash_db_info(backup_root: Path, expected: str):
    """Compare the current hash database dump against a dedented *expected* block."""
    actual = get_hash_db_info(backup_root)
    assert_text_equal(
        actual,
        textwrap.dedent(expected).strip(),
        msg=f'FileHashDatabase info does not match as expected.\n\n{actual}\n\n',
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class FileHashDatabaseTestCase(TestCase):
    """End-to-end test of FileHashDatabase against a real temporary directory."""

    def test_happy_path(self):
        with TemporaryFileHashDatabase() as hash_db:
            self.assertIsInstance(hash_db, FileHashDatabase)

            backup_root_path = hash_db.backup_root
            assert_is_dir(backup_root_path)

            # Hashes fan out into two directory levels (first four characters).
            test_path = hash_db._get_hash_path('12345678abcdef')
            self.assertEqual(test_path, hash_db.base_path / '12' / '34' / '12345678abcdef')

            file_a_path = backup_root_path / 'rel/path/to/file-A'
            file_a_path.parent.mkdir(parents=True, exist_ok=True)
            file_a_path.touch()

            # Store and retrieve one entry:
            self.assertIs(hash_db.get('12345678abcdef'), None)
            hash_db['12345678abcdef'] = file_a_path
            self.assertEqual(hash_db.get('12345678abcdef'), file_a_path)
            self.assertEqual(
                get_hash_db_filenames(hash_db),
                ['12/34/12345678abcdef'],
            )

            ########################################################################################
            # Another instance using the same directory:

            another_hash_db = FileHashDatabase(
                backup_root=hash_db.backup_root,
                phlb_conf_dir=hash_db.base_path.parent,
            )
            # The second instance sees entries written by the first one:
            self.assertEqual(another_hash_db.get('12345678abcdef'), file_a_path)
            self.assertIs(another_hash_db.get('12abcd345678abcdef'), None)

            file_b_path = backup_root_path / 'rel/path/to/file-B'
            file_b_path.parent.mkdir(parents=True, exist_ok=True)
            file_b_path.touch()

            another_hash_db['12abcd345678abcdef'] = file_b_path
            self.assertEqual(another_hash_db.get('12abcd345678abcdef'), file_b_path)
            self.assertEqual(
                get_hash_db_filenames(another_hash_db),
                [
                    '12/34/12345678abcdef',
                    '12/ab/12abcd345678abcdef',
                ],
            )

            assert_hash_db_info(
                backup_root=hash_db.backup_root,
                expected="""
                    12/34/12345678abcdef… -> rel/path/to/file-A
                    12/ab/12abcd345678ab… -> rel/path/to/file-B
                """,
            )

            ########################################################################################
            # Deny "overwrite" of existing hash:

            # The value is never written, so a plain str is fine here:
            with self.assertRaises(HashAlreadyExistsError):
                hash_db['12abcd345678abcdef'] = 'foo/bar/baz'  # already exists!

            ########################################################################################
            # Don't use stale entries pointing to missing files:

            self.assertEqual(hash_db.get('12345678abcdef'), file_a_path)
            file_a_path.unlink()

            """DocWrite: README.md ## FileHashDatabase - Missing hardlink target file
            We check if the hardlink source file still exists. If not, we remove the hash entry from the database.
            A warning is logged in this case."""
            with self.assertLogs(level=logging.WARNING) as logs:
                self.assertIs(hash_db.get('12345678abcdef'), None)
            self.assertIn('Hash database entry found, but file does not exist', ''.join(logs.output))
            # The stale entry was removed from the database:
            assert_hash_db_info(
                backup_root=hash_db.backup_root,
                expected="""
                    12/ab/12abcd345678ab… -> rel/path/to/file-B
                """,
            )
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from unittest import TestCase
|
|
5
|
+
|
|
6
|
+
from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
|
|
7
|
+
from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TemporaryFileSizeDatabase(tempfile.TemporaryDirectory):
    """Context manager: temporary directory containing a fresh FileSizeDatabase.

    Yields the FileSizeDatabase instance instead of the directory path.
    """

    def __enter__(self) -> FileSizeDatabase:
        backup_root = Path(super().__enter__())

        phlb_conf_dir = backup_root / '.phlb'
        phlb_conf_dir.mkdir()

        return FileSizeDatabase(phlb_conf_dir=phlb_conf_dir)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_size_db_filenames(size_db: FileSizeDatabase) -> Iterable[str]:
    """All size entry files as sorted paths, relative to the database base dir."""
    base = size_db.base_path
    rel_names = [
        str(Path(file_entry.path).relative_to(base))
        for file_entry in iter_scandir_files(base, excludes=set())
    ]
    rel_names.sort()
    return rel_names
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_sizes(size_db: FileSizeDatabase) -> Iterable[int]:
    """All stored sizes (file names parsed back to int), in ascending order."""
    sizes = [int(entry.name) for entry in iter_scandir_files(size_db.base_path, excludes=set())]
    return sorted(sizes)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class FileSizeDatabaseTestCase(TestCase):
    """End-to-end test of FileSizeDatabase against a real temporary directory."""

    def test_happy_path(self):
        with TemporaryFileSizeDatabase() as size_db:
            self.assertIsInstance(size_db, FileSizeDatabase)

            # Sizes fan out into two directory levels (first four digits).
            test_path1 = size_db._get_size_path(1234)
            self.assertEqual(test_path1, size_db.base_path / '12' / '34' / '1234')

            test_path2 = size_db._get_size_path(567890)
            self.assertEqual(test_path2, size_db.base_path / '56' / '78' / '567890')

            self.assertNotIn(1234, size_db)
            self.assertNotIn(567890, size_db)

            size_db.add(1234)
            self.assertIn(1234, size_db)
            self.assertNotIn(567890, size_db)

            size_db.add(567890)
            self.assertIn(1234, size_db)
            self.assertIn(567890, size_db)

            self.assertEqual(get_sizes(size_db), [1234, 567890])
            self.assertEqual(
                get_size_db_filenames(size_db),
                [
                    '12/34/1234',
                    '56/78/567890',
                ],
            )

            ########################################################################################
            # Another instance using the same directory:

            another_size_db = FileSizeDatabase(phlb_conf_dir=size_db.base_path.parent)
            # The second instance sees entries written by the first one:
            self.assertEqual(get_sizes(another_size_db), [1234, 567890])
            self.assertEqual(
                get_size_db_filenames(another_size_db),
                [
                    '12/34/1234',
                    '56/78/567890',
                ],
            )

            ########################################################################################
            # "Share" directories:

            # Sizes with the same leading four digits land in the same sub-directory.
            for size in (123400001111, 123400002222, 128800003333, 129900004444):
                self.assertNotIn(size, size_db)
                size_db.add(size)
                self.assertIn(size, size_db)

            ########################################################################################
            # Min size is 1000 bytes:

            """DocWrite: README.md ## FileSizeDatabase - minimum file size
            The minimum file size that can be stored in the FileSizeDatabase is 1000 bytes.
            This is because no padding is made for sizes below 1000 bytes, which would
            break the directory structure.
            """
            self.assertEqual(FileSizeDatabase.MIN_SIZE, 1000)
            """DocWrite: README.md ## FileSizeDatabase - minimum file size
            The idea is, that it's more efficient to backup small files directly, instead of
            checking for duplicates via hardlinks. Therefore, small files below this size
            are not tracked in the FileSizeDatabase.
            """

            with self.assertRaises(AssertionError):
                size_db._get_size_path(999)
            with self.assertRaises(AssertionError):
                size_db.add(999)
            with self.assertRaises(AssertionError):
                # Membership test triggers validation; the result is intentionally discarded.
                999 in size_db

            ########################################################################################
            # Check final state:

            self.assertEqual(
                get_size_db_filenames(size_db),
                [
                    '12/34/1234',
                    '12/34/123400001111',
                    '12/34/123400002222',
                    '12/88/128800003333',
                    '12/99/129900004444',
                    '56/78/567890',
                ],
            )
            self.assertEqual(
                get_sizes(size_db),
                [
                    1234,
                    567890,
                    123400001111,
                    123400002222,
                    128800003333,
                    129900004444,
                ],
            )
|