PyHardLinkBackup 1.5.0 (pyhardlinkbackup-1.5.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyHardLinkBackup/__init__.py +7 -0
- PyHardLinkBackup/__main__.py +10 -0
- PyHardLinkBackup/backup.py +261 -0
- PyHardLinkBackup/cli_app/__init__.py +41 -0
- PyHardLinkBackup/cli_app/phlb.py +123 -0
- PyHardLinkBackup/cli_dev/__init__.py +70 -0
- PyHardLinkBackup/cli_dev/benchmark.py +138 -0
- PyHardLinkBackup/cli_dev/code_style.py +12 -0
- PyHardLinkBackup/cli_dev/packaging.py +65 -0
- PyHardLinkBackup/cli_dev/shell_completion.py +23 -0
- PyHardLinkBackup/cli_dev/testing.py +52 -0
- PyHardLinkBackup/cli_dev/update_readme_history.py +33 -0
- PyHardLinkBackup/compare_backup.py +212 -0
- PyHardLinkBackup/constants.py +16 -0
- PyHardLinkBackup/logging_setup.py +124 -0
- PyHardLinkBackup/rebuild_databases.py +176 -0
- PyHardLinkBackup/tests/__init__.py +36 -0
- PyHardLinkBackup/tests/test_backup.py +628 -0
- PyHardLinkBackup/tests/test_compare_backup.py +86 -0
- PyHardLinkBackup/tests/test_doc_write.py +26 -0
- PyHardLinkBackup/tests/test_doctests.py +10 -0
- PyHardLinkBackup/tests/test_project_setup.py +46 -0
- PyHardLinkBackup/tests/test_readme.py +75 -0
- PyHardLinkBackup/tests/test_readme_history.py +9 -0
- PyHardLinkBackup/tests/test_rebuild_database.py +224 -0
- PyHardLinkBackup/utilities/__init__.py +0 -0
- PyHardLinkBackup/utilities/file_hash_database.py +62 -0
- PyHardLinkBackup/utilities/file_size_database.py +46 -0
- PyHardLinkBackup/utilities/filesystem.py +158 -0
- PyHardLinkBackup/utilities/humanize.py +39 -0
- PyHardLinkBackup/utilities/rich_utils.py +99 -0
- PyHardLinkBackup/utilities/sha256sums.py +61 -0
- PyHardLinkBackup/utilities/tee.py +40 -0
- PyHardLinkBackup/utilities/tests/__init__.py +0 -0
- PyHardLinkBackup/utilities/tests/test_file_hash_database.py +143 -0
- PyHardLinkBackup/utilities/tests/test_file_size_database.py +138 -0
- PyHardLinkBackup/utilities/tests/test_filesystem.py +126 -0
- PyHardLinkBackup/utilities/tyro_cli_shared_args.py +12 -0
- pyhardlinkbackup-1.5.0.dist-info/METADATA +600 -0
- pyhardlinkbackup-1.5.0.dist-info/RECORD +42 -0
- pyhardlinkbackup-1.5.0.dist-info/WHEEL +4 -0
- pyhardlinkbackup-1.5.0.dist-info/entry_points.txt +3 -0

PyHardLinkBackup/utilities/filesystem.py
@@ -0,0 +1,158 @@
+import hashlib
+import logging
+import os
+import shutil
+import time
+from pathlib import Path
+from typing import Iterable
+
+from bx_py_utils.path import assert_is_dir
+from rich.progress import (
+    Progress,
+    SpinnerColumn,
+    TextColumn,
+    TimeElapsedColumn,
+)
+
+from PyHardLinkBackup.constants import CHUNK_SIZE, HASH_ALGO
+from PyHardLinkBackup.utilities.rich_utils import HumanFileSizeColumn
+
+
+logger = logging.getLogger(__name__)
+
+
+def hash_file(path: Path) -> str:
+    logger.debug('Hash file %s using %s', path, HASH_ALGO)
+    with path.open('rb') as f:
+        digest = hashlib.file_digest(f, HASH_ALGO)
+
+    file_hash = digest.hexdigest()
+    logger.info('%s %s hash: %s', path, HASH_ALGO, file_hash)
+    return file_hash
+
+
+def copy_and_hash(src: Path, dst: Path) -> str:
+    logger.debug('Copy and hash file %s to %s using %s', src, dst, HASH_ALGO)
+    hasher = hashlib.new(HASH_ALGO)
+    with src.open('rb') as source_file, dst.open('wb') as dst_file:
+        while chunk := source_file.read(CHUNK_SIZE):
+            dst_file.write(chunk)
+            hasher.update(chunk)
+
+    # Keep original file metadata (permission bits, last access time, last modification time, and flags)
+    shutil.copystat(src, dst)
+
+    file_hash = hasher.hexdigest()
+    logger.info('%s backup to %s with %s hash: %s', src, dst, HASH_ALGO, file_hash)
+    return file_hash
+
+
+def read_and_hash_file(path: Path) -> tuple[bytes, str]:
+    logger.debug('Read and hash file %s using %s into RAM', path, HASH_ALGO)
+    content = path.read_bytes()
+    hasher = hashlib.new(HASH_ALGO, content)
+    file_hash = hasher.hexdigest()
+    logger.info('%s %s hash: %s', path, HASH_ALGO, file_hash)
+    return content, file_hash
+
+
+def iter_scandir_files(path: Path, excludes: set[str]) -> Iterable[os.DirEntry]:
+    """
+    Recursively yield all files+symlinks in the given directory.
+    """
+    logger.debug('Scanning directory %s', path)
+    with os.scandir(path) as scandir_iterator:
+        for entry in scandir_iterator:
+            if entry.is_dir(follow_symlinks=True):
+                if entry.name in excludes:
+                    logger.debug('Excluding directory %s', entry.path)
+                    continue
+                yield from iter_scandir_files(Path(entry.path), excludes=excludes)
+            else:
+                # It's a file or symlink or broken symlink
+                yield entry
+
+
+def humanized_fs_scan(path: Path, excludes: set[str]) -> tuple[int, int]:
+    print(f'\nScanning filesystem at: {path}...')
+
+    progress = Progress(
+        TimeElapsedColumn(),
+        '{task.description}',
+        SpinnerColumn('simpleDots'),
+        TextColumn('[green]{task.fields[file_count]} Files'),
+        '|',
+        HumanFileSizeColumn(field_name='total_size'),
+        '|',
+        TextColumn('[cyan]{task.fields[files_per_sec]} Files/sec'),
+    )
+
+    file_count = 0
+    total_size = 0
+    start_time = time.time()
+    scan_task_id = progress.add_task(
+        description='Scanning',
+        file_count=file_count,
+        total_size=total_size,
+        files_per_sec=0.0,
+        total=None,
+    )
+    next_update = 0
+    with progress:
+        for entry in iter_scandir_files(path, excludes=excludes):
+            file_count += 1
+            try:
+                total_size += entry.stat().st_size
+            except FileNotFoundError:
+                # e.g.: broken symlink
+                continue
+
+            now = time.time()
+            if now >= next_update:
+                elapsed = max(now - start_time, 1e-6)
+                files_per_sec = int(file_count / elapsed)
+                progress.update(
+                    scan_task_id,
+                    file_count=file_count,
+                    total_size=total_size,
+                    files_per_sec=files_per_sec,
+                )
+                next_update = now + 1
+
+        now = time.time()
+
+        elapsed = max(now - start_time, 1e-6)
+        files_per_sec = int(file_count / elapsed)
+        progress.stop_task(scan_task_id)
+        progress.update(
+            scan_task_id,
+            description='Completed',
+            completed=True,
+            file_count=file_count,
+            total_size=total_size,
+            files_per_sec=files_per_sec,
+        )
+
+    return file_count, total_size
+
+
+def supports_hardlinks(directory: Path) -> bool:
+    logger.debug('Checking hardlink support in %s', directory)
+    assert_is_dir(directory)
+    test_src_file = directory / '.phlb_test'
+    test_dst_file = directory / '.phlb_test_link'
+    hardlinks_supported = False
+    try:
+        test_src_file.write_text('test')
+        os.link(test_src_file, test_dst_file)
+        assert test_dst_file.read_text() == 'test'
+        hardlinks_supported = True
+    except OSError as err:
+        # e.g.: FAT/exFAT filesystems ;)
+        logger.exception('Hardlink test failed in %s: %s', directory, err)
+    finally:
+        test_src_file.unlink(missing_ok=True)
+        test_dst_file.unlink(missing_ok=True)
+
+    logger.info('Hardlink support in %s: %s', directory, hardlinks_supported)
+    return hardlinks_supported

PyHardLinkBackup/utilities/humanize.py
@@ -0,0 +1,39 @@
+import time
+
+from bx_py_utils.humanize.time import human_timedelta
+
+
+def human_filesize(size: int | float) -> str:
+    """
+    >>> human_filesize(1024)
+    '1.00 KiB'
+    >>> human_filesize(2.2*1024)
+    '2.20 KiB'
+    >>> human_filesize(3.33*1024*1024)
+    '3.33 MiB'
+    >>> human_filesize(4.44*1024*1024*1024)
+    '4.44 GiB'
+    >>> human_filesize(5.55*1024*1024*1024*1024)
+    '5.55 TiB'
+    >>> human_filesize(6.66*1024*1024*1024*1024*1024)
+    '6.66 PiB'
+    """
+    for unit in ['Bytes', 'KiB', 'MiB', 'GiB', 'TiB']:
+        if size < 1024.0:
+            return f'{size:.2f} {unit}'
+        size /= 1024.0
+    return f'{size:.2f} PiB'
+
+
+class PrintTimingContextManager:
+    def __init__(self, description: str):
+        self.description = description
+
+    def __enter__(self) -> None:
+        self.start_time = time.perf_counter()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        duration = time.perf_counter() - self.start_time
+        print(f'{self.description}: {human_timedelta(duration)}\n')
+        if exc_type:
+            return False  # Do not suppress exceptions
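
`PrintTimingContextManager` carries no doctest of its own, so a usage sketch may help; the exact output wording comes from bx_py_utils' `human_timedelta()`:

```python
import time

from PyHardLinkBackup.utilities.humanize import PrintTimingContextManager

# On exit the block prints something like: 'Copy phase: 1.0 seconds'
with PrintTimingContextManager('Copy phase'):
    time.sleep(1)
```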

PyHardLinkBackup/utilities/rich_utils.py
@@ -0,0 +1,99 @@
+from rich.live import Live
+from rich.panel import Panel
+from rich.progress import (
+    BarColumn,
+    Progress,
+    ProgressColumn,
+    TaskProgressColumn,
+    TextColumn,
+    TimeElapsedColumn,
+    TimeRemainingColumn,
+    TransferSpeedColumn,
+)
+from rich.table import Table
+from rich.text import Text
+
+from PyHardLinkBackup.utilities.humanize import human_filesize
+
+
+class HumanFileSizeColumn(ProgressColumn):
+    def __init__(self, field_name: str | None = None, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.field_name = field_name
+
+    def render(self, task):
+        if self.field_name is None:
+            file_size = task.completed
+        else:
+            try:
+                file_size = task.fields[self.field_name]
+            except KeyError:
+                raise KeyError(f'Field {self.field_name=} not found in: {task.fields.keys()=}') from None
+        return Text(human_filesize(file_size))
+
+
+class DisplayFileTreeProgress:
+    def __init__(self, total_file_count: int, total_size: int):
+        percentage_format = '[progress.percentage]{task.percentage:>3.1f}%'
+        self.overall_progress = Progress(
+            TaskProgressColumn(text_format=percentage_format),
+            BarColumn(bar_width=50),
+            TextColumn('Elapsed:'),
+            TimeElapsedColumn(),
+            TextColumn('Remaining:'),
+            TimeRemainingColumn(),
+        )
+        self.overall_progress_task_id = self.overall_progress.add_task(description='', total=100)
+
+        self.file_count_progress = Progress(
+            TaskProgressColumn(text_format=percentage_format),
+            BarColumn(bar_width=50),
+            TextColumn('{task.completed}/{task.total} Files'),
+        )
+        self.file_count_progress_task_id = self.file_count_progress.add_task(description='', total=total_file_count)
+        self.file_count_progress_task = self.file_count_progress.tasks[0]
+
+        self.file_size_progress = Progress(
+            TaskProgressColumn(text_format=percentage_format),
+            BarColumn(bar_width=50),
+            HumanFileSizeColumn(),
+            '|',
+            TransferSpeedColumn(),
+        )
+        self.file_size_progress_task_id = self.file_size_progress.add_task(description='', total=total_size)
+        self.file_size_progress_task = self.file_size_progress.tasks[0]
+
+        progress_table = Table.grid()
+        progress_table.add_row(Panel(self.overall_progress, title='[b]Overall Progress', border_style='green'))
+        progress_table.add_row(Panel(self.file_count_progress, title='Total files saved'))
+        progress_table.add_row(Panel(self.file_size_progress, title='Total file size processed'))
+
+        self.live = Live(progress_table, auto_refresh=False)
+
+    def __enter__(self):
+        self.live.__enter__()
+        return self
+
+    def update(self, completed_file_count: int, completed_size: int):
+        self.file_count_progress.update(
+            task_id=self.file_count_progress_task_id,
+            completed=completed_file_count,
+            refresh=True,
+        )
+        self.file_size_progress.update(
+            task_id=self.file_size_progress_task_id,
+            completed=completed_size,
+            refresh=True,
+        )
+        self.overall_progress.update(
+            task_id=self.overall_progress_task_id,
+            completed=(self.file_count_progress_task.percentage + self.file_size_progress_task.percentage) / 2,
+            refresh=True,
+        )
+        self.live.refresh()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.overall_progress.stop()
+        self.file_count_progress.stop()
+        self.file_size_progress.stop()
+        self.live.__exit__(exc_type, exc_value, traceback)
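
A sketch of how `DisplayFileTreeProgress` could be driven from a backup loop; the totals and per-file sizes below are made up, and in the real flow they would come from a filesystem scan such as `humanized_fs_scan()`:

```python
from PyHardLinkBackup.utilities.rich_utils import DisplayFileTreeProgress

total_file_count = 100           # illustrative totals, e.g. from humanized_fs_scan()
total_size = 100 * 1024 * 1024

with DisplayFileTreeProgress(total_file_count, total_size) as progress:
    done_files = done_size = 0
    for _ in range(total_file_count):
        done_files += 1
        done_size += 1024 * 1024  # pretend each "file" is 1 MiB
        # The overall bar is the mean of the file-count and file-size percentages:
        progress.update(completed_file_count=done_files, completed_size=done_size)
```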

PyHardLinkBackup/utilities/sha256sums.py
@@ -0,0 +1,61 @@
+import logging
+from pathlib import Path
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_sha256sums_path(file_path: Path):
+    """
+    >>> get_sha256sums_path(Path('foo/bar/baz.txt'))
+    PosixPath('foo/bar/SHA256SUMS')
+    """
+    hash_file_path = file_path.parent / 'SHA256SUMS'
+    return hash_file_path
+
+
+def store_hash(file_path: Path, file_hash: str):
+    """DocWrite: README.md ## SHA256SUMS
+    A `SHA256SUMS` file is stored in each backup directory containing the SHA256 hashes of all files in that directory.
+    It's the same format that e.g. the `sha256sum * > SHA256SUMS` command produces.
+    So it's possible to verify the integrity of the backup files later.
+    e.g.:
+    ```bash
+    cd .../your/backup/foobar/20240101_120000/
+    sha256sum -c SHA256SUMS
+    ```
+    """
+    hash_file_path = get_sha256sums_path(file_path)
+    with hash_file_path.open('a') as f:
+        f.write(f'{file_hash} {file_path.name}\n')
+
+
+def check_sha256sums(
+    *,
+    file_path: Path,
+    file_hash: str,
+) -> bool | None:
+    hash_file_path = get_sha256sums_path(file_path=file_path)
+    if not hash_file_path.is_file():
+        return None  # Nothing to verify against
+
+    with hash_file_path.open('r') as f:
+        for line in f:
+            try:
+                expected_hash, filename = line.split(' ', maxsplit=1)
+            except ValueError:
+                logger.exception(f'Invalid line in "{hash_file_path}": {line!r}')
+            else:
+                filename = filename.strip()
+                if filename == file_path.name:
+                    if not expected_hash == file_hash:
+                        logger.error(
+                            f'Hash {file_hash} from file {file_path} does not match hash in {hash_file_path} !'
+                        )
+                        return False
+                    else:
+                        logger.debug(f'{file_path} hash verified successfully from {hash_file_path}.')
+                        return True
+
+    logger.info('No SHA256SUMS entry found for file: %s', file_path)
+    return None
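
A round-trip sketch for these helpers (paths are illustrative): `store_hash()` appends one `<hash> <name>` line to the directory's `SHA256SUMS` file, and `check_sha256sums()` returns `True`/`False` for a match/mismatch, or `None` when there is nothing to verify against:

```python
import hashlib
import tempfile
from pathlib import Path

from PyHardLinkBackup.utilities.sha256sums import check_sha256sums, store_hash

with tempfile.TemporaryDirectory() as temp_dir:
    file_path = Path(temp_dir) / 'example.txt'
    file_path.write_bytes(b'hello')

    file_hash = hashlib.sha256(b'hello').hexdigest()
    store_hash(file_path, file_hash)  # appends to <temp_dir>/SHA256SUMS

    assert check_sha256sums(file_path=file_path, file_hash=file_hash) is True
    assert check_sha256sums(file_path=file_path, file_hash='0' * 64) is False
```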

PyHardLinkBackup/utilities/tee.py
@@ -0,0 +1,40 @@
+import re
+import sys
+from contextlib import redirect_stdout
+
+
+# Borrowed from click:
+_ansi_re = re.compile(r'\033\[[;?0-9]*[a-zA-Z]')
+
+
+def strip_ansi_codes(value: str) -> str:
+    return _ansi_re.sub('', value)
+
+
+class TeeStdout:
+    def __init__(self, file):
+        self.file = file
+        self.stdout = sys.stdout
+
+    def write(self, data):
+        self.stdout.write(data)
+        self.file.write(strip_ansi_codes(data))
+
+    def flush(self):
+        self.stdout.flush()
+        self.file.flush()
+
+
+class TeeStdoutContext:
+    def __init__(self, file_path):
+        self.file_path = file_path
+
+    def __enter__(self):
+        self.file = open(self.file_path, 'w')
+        self.redirect = redirect_stdout(TeeStdout(self.file))
+        self.redirect.__enter__()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.redirect.__exit__(exc_type, exc_val, exc_tb)
+        self.file.close()
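
Usage sketch for the tee helpers: inside the context, `print()` output still reaches the real stdout unchanged, while a copy with ANSI escape codes stripped goes to the given file:

```python
from PyHardLinkBackup.utilities.tee import TeeStdoutContext

with TeeStdoutContext('backup.log'):  # illustrative log file path
    print('\x1b[32mgreen on the terminal\x1b[0m')
# backup.log now contains the plain line: green on the terminal
```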

PyHardLinkBackup/utilities/tests/__init__.py
File without changes

PyHardLinkBackup/utilities/tests/test_file_hash_database.py
@@ -0,0 +1,143 @@
+import logging
+import tempfile
+import textwrap
+from pathlib import Path
+
+from bx_py_utils.path import assert_is_dir
+from bx_py_utils.test_utils.assertion import assert_text_equal
+from bx_py_utils.test_utils.log_utils import NoLogs
+from cli_base.cli_tools.test_utils.base_testcases import BaseTestCase
+
+from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase, HashAlreadyExistsError
+from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
+
+
+class TemporaryFileHashDatabase(tempfile.TemporaryDirectory):
+    def __enter__(self) -> FileHashDatabase:
+        temp_dir = super().__enter__()
+        backup_root = Path(temp_dir).resolve()
+
+        phlb_conf_dir = backup_root / '.phlb'
+        phlb_conf_dir.mkdir()
+
+        hash_db = FileHashDatabase(backup_root=backup_root, phlb_conf_dir=phlb_conf_dir)
+        return hash_db
+
+
+def get_hash_db_filenames(hash_db: FileHashDatabase) -> list[str]:
+    # with NoLogs('PyHardLinkBackup.utilities.filesystem'):
+    return sorted(
+        str(Path(entry.path).relative_to(hash_db.base_path))
+        for entry in iter_scandir_files(hash_db.base_path, excludes=set())
+    )
+
+
+def get_hash_db_info(backup_root: Path) -> str:
+    db_base_path = backup_root / '.phlb' / 'hash-lookup'
+    assert_is_dir(db_base_path)
+
+    with NoLogs(logger_name='XY'):
+        lines = []
+        for entry in iter_scandir_files(db_base_path, excludes=set()):
+            hash_path = Path(entry.path)
+            rel_path = hash_path.relative_to(db_base_path)
+            rel_file_path = hash_path.read_text()
+            lines.append(f'{str(rel_path)[:20]}… -> {rel_file_path}')
+    return '\n'.join(sorted(lines))
+
+
+def assert_hash_db_info(backup_root: Path, expected: str):
+    expected = textwrap.dedent(expected).strip()
+    actual = get_hash_db_info(backup_root)
+    assert_text_equal(
+        actual,
+        expected,
+        msg=f'FileHashDatabase info does not match as expected.\n\n{actual}\n\n',
+    )
+
+
+class FileHashDatabaseTestCase(BaseTestCase):
+    def test_happy_path(self):
+        with TemporaryFileHashDatabase() as hash_db:
+            self.assertIsInstance(hash_db, FileHashDatabase)
+
+            backup_root_path = hash_db.backup_root
+            assert_is_dir(backup_root_path)
+
+            test_path = hash_db._get_hash_path('12345678abcdef')
+            self.assertEqual(test_path, hash_db.base_path / '12' / '34' / '12345678abcdef')
+
+            file_a_path = backup_root_path / 'rel/path/to/file-A'
+            file_a_path.parent.mkdir(parents=True, exist_ok=True)
+            file_a_path.touch()
+
+            self.assertIs(hash_db.get('12345678abcdef'), None)
+            self.assertIs('12345678abcdef' in hash_db, False)
+            hash_db['12345678abcdef'] = file_a_path
+            self.assertEqual(hash_db.get('12345678abcdef'), file_a_path)
+            self.assertIs('12345678abcdef' in hash_db, True)
+            with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                self.assertEqual(
+                    get_hash_db_filenames(hash_db),
+                    ['12/34/12345678abcdef'],
+                )
+
+            ########################################################################################
+            # Another instance using the same directory:
+
+            another_hash_db = FileHashDatabase(
+                backup_root=hash_db.backup_root,
+                phlb_conf_dir=hash_db.base_path.parent,
+            )
+            self.assertEqual(another_hash_db.get('12345678abcdef'), file_a_path)
+            self.assertIs(another_hash_db.get('12abcd345678abcdef'), None)
+
+            file_b_path = backup_root_path / 'rel/path/to/file-B'
+            file_b_path.parent.mkdir(parents=True, exist_ok=True)
+            file_b_path.touch()
+
+            another_hash_db['12abcd345678abcdef'] = file_b_path
+            self.assertEqual(another_hash_db.get('12abcd345678abcdef'), file_b_path)
+            with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                self.assertEqual(
+                    get_hash_db_filenames(another_hash_db),
+                    [
+                        '12/34/12345678abcdef',
+                        '12/ab/12abcd345678abcdef',
+                    ],
+                )
+
+            with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                assert_hash_db_info(
+                    backup_root=hash_db.backup_root,
+                    expected="""
+                        12/34/12345678abcdef… -> rel/path/to/file-A
+                        12/ab/12abcd345678ab… -> rel/path/to/file-B
+                    """,
+                )
+
+            ########################################################################################
+            # Deny "overwrite" of existing hash:
+
+            with self.assertRaises(HashAlreadyExistsError):
+                hash_db['12abcd345678abcdef'] = 'foo/bar/baz'  # already exists!
+
+            ########################################################################################
+            # Don't use stale entries pointing to missing files:
+
+            self.assertEqual(hash_db.get('12345678abcdef'), file_a_path)
+            file_a_path.unlink()
+
+            """DocWrite: README.md ## FileHashDatabase - Missing hardlink target file
+            We check if the hardlink source file still exists. If not, we remove the hash entry from the database.
+            A warning is logged in this case."""
+            with self.assertLogs(level=logging.WARNING) as logs:
+                self.assertIs(hash_db.get('12345678abcdef'), None)
+            self.assertIn('Hash database entry found, but file does not exist', ''.join(logs.output))
+            with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                assert_hash_db_info(
+                    backup_root=hash_db.backup_root,
+                    expected="""
+                        12/ab/12abcd345678ab… -> rel/path/to/file-B
+                    """,
+                )
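
These tests pin down the hash database layout: an entry for hash `12345678abcdef` lives at `.phlb/hash-lookup/12/34/12345678abcdef`, and the file's content is the backup-relative path of the hardlink source. A sketch of that fan-out, reconstructed from the assertions above (not the package's actual `_get_hash_path()` implementation):

```python
from pathlib import Path


def hash_path_sketch(base_path: Path, file_hash: str) -> Path:
    # Two 2-character levels fan entries out over up to 256 * 256
    # directories, so no single directory grows unmanageably large:
    return base_path / file_hash[0:2] / file_hash[2:4] / file_hash


assert hash_path_sketch(Path('x'), '12345678abcdef') == Path('x/12/34/12345678abcdef')
```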

PyHardLinkBackup/utilities/tests/test_file_size_database.py
@@ -0,0 +1,138 @@
+import logging
+import tempfile
+from collections.abc import Iterable
+from pathlib import Path
+
+from bx_py_utils.test_utils.log_utils import NoLogs
+from cli_base.cli_tools.test_utils.base_testcases import BaseTestCase
+
+from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
+from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
+
+
+class TemporaryFileSizeDatabase(tempfile.TemporaryDirectory):
+    def __enter__(self) -> FileSizeDatabase:
+        temp_dir = super().__enter__()
+        backup_root = Path(temp_dir).resolve()
+
+        phlb_conf_dir = backup_root / '.phlb'
+        phlb_conf_dir.mkdir()
+
+        size_db = FileSizeDatabase(phlb_conf_dir=phlb_conf_dir)
+        return size_db
+
+
+def get_size_db_filenames(size_db: FileSizeDatabase) -> Iterable[str]:
+    return sorted(
+        str(Path(entry.path).relative_to(size_db.base_path))
+        for entry in iter_scandir_files(size_db.base_path, excludes=set())
+    )
+
+
+def get_sizes(size_db: FileSizeDatabase) -> Iterable[int]:
+    with NoLogs('PyHardLinkBackup.utilities.filesystem'):
+        return sorted(int(entry.name) for entry in iter_scandir_files(size_db.base_path, excludes=set()))
+
+
+class FileSizeDatabaseTestCase(BaseTestCase):
+    def test_happy_path(self):
+        with TemporaryFileSizeDatabase() as size_db:
+            self.assertIsInstance(size_db, FileSizeDatabase)
+
+            test_path1 = size_db._get_size_path(1234)
+            self.assertEqual(test_path1, size_db.base_path / '12' / '34' / '1234')
+
+            test_path2 = size_db._get_size_path(567890)
+            self.assertEqual(test_path2, size_db.base_path / '56' / '78' / '567890')
+
+            self.assertNotIn(1234, size_db)
+            self.assertNotIn(567890, size_db)
+
+            size_db.add(1234)
+            self.assertIn(1234, size_db)
+            self.assertNotIn(567890, size_db)
+
+            size_db.add(567890)
+            self.assertIn(1234, size_db)
+            self.assertIn(567890, size_db)
+
+            with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                self.assertEqual(get_sizes(size_db), [1234, 567890])
+                self.assertEqual(
+                    get_size_db_filenames(size_db),
+                    [
+                        '12/34/1234',
+                        '56/78/567890',
+                    ],
+                )
+
+            ########################################################################################
+            # Another instance using the same directory:
+
+            another_size_db = FileSizeDatabase(phlb_conf_dir=size_db.base_path.parent)
+            with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                self.assertEqual(get_sizes(another_size_db), [1234, 567890])
+                self.assertEqual(
+                    get_size_db_filenames(another_size_db),
+                    [
+                        '12/34/1234',
+                        '56/78/567890',
+                    ],
+                )
+
+            ########################################################################################
+            # "Share" directories:
+
+            for size in (123400001111, 123400002222, 128800003333, 129900004444):
+                self.assertNotIn(size, size_db)
+                size_db.add(size)
+                self.assertIn(size, size_db)
+
+            ########################################################################################
+            # Min size is 1000 bytes:
+
+            """DocWrite: README.md ## FileSizeDatabase - minimum file size
+            The minimum file size that can be stored in the FileSizeDatabase is 1000 bytes.
+            This is because no padding is made for sizes below 1000 bytes, which would
+            break the directory structure.
+            """
+            self.assertEqual(FileSizeDatabase.MIN_SIZE, 1000)
+            """DocWrite: README.md ## FileSizeDatabase - minimum file size
+            The idea is that it's more efficient to back up small files directly, instead of
+            checking for duplicates via hardlinks. Therefore, small files below this size
+            are not tracked in the FileSizeDatabase.
+            """
+
+            with self.assertRaises(AssertionError):
+                size_db._get_size_path(999)
+            with self.assertRaises(AssertionError):
+                size_db.add(999)
+            with self.assertRaises(AssertionError):
+                999 in size_db
+
+            ########################################################################################
+            # Check final state:
+
+            with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                self.assertEqual(
+                    get_size_db_filenames(size_db),
+                    [
+                        '12/34/1234',
+                        '12/34/123400001111',
+                        '12/34/123400002222',
+                        '12/88/128800003333',
+                        '12/99/129900004444',
+                        '56/78/567890',
+                    ],
+                )
+                self.assertEqual(
+                    get_sizes(size_db),
+                    [
+                        1234,
+                        567890,
+                        123400001111,
+                        123400002222,
+                        128800003333,
+                        129900004444,
+                    ],
+                )
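
The size database uses the same two-level fan-out, keyed on the first four decimal digits of the file size; sizes under `MIN_SIZE = 1000` have fewer than four digits, which is why the tests expect an `AssertionError` for them. A sketch reconstructed from the assertions above (not the package's actual `_get_size_path()`):

```python
from pathlib import Path

MIN_SIZE = 1000  # pinned by the test: FileSizeDatabase.MIN_SIZE


def size_path_sketch(base_path: Path, size: int) -> Path:
    # Sizes below 1000 would not fill the fixed 2+2 digit directory scheme:
    assert size >= MIN_SIZE, f'{size=} is below the minimum of {MIN_SIZE} bytes'
    digits = str(size)
    return base_path / digits[0:2] / digits[2:4] / digits


assert size_path_sketch(Path('x'), 1234) == Path('x/12/34/1234')
assert size_path_sketch(Path('x'), 567890) == Path('x/56/78/567890')
assert size_path_sketch(Path('x'), 123400001111) == Path('x/12/34/123400001111')
```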