pyhardlinkbackup-1.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. PyHardLinkBackup/__init__.py +7 -0
  2. PyHardLinkBackup/__main__.py +10 -0
  3. PyHardLinkBackup/backup.py +261 -0
  4. PyHardLinkBackup/cli_app/__init__.py +41 -0
  5. PyHardLinkBackup/cli_app/phlb.py +123 -0
  6. PyHardLinkBackup/cli_dev/__init__.py +70 -0
  7. PyHardLinkBackup/cli_dev/benchmark.py +138 -0
  8. PyHardLinkBackup/cli_dev/code_style.py +12 -0
  9. PyHardLinkBackup/cli_dev/packaging.py +65 -0
  10. PyHardLinkBackup/cli_dev/shell_completion.py +23 -0
  11. PyHardLinkBackup/cli_dev/testing.py +52 -0
  12. PyHardLinkBackup/cli_dev/update_readme_history.py +33 -0
  13. PyHardLinkBackup/compare_backup.py +212 -0
  14. PyHardLinkBackup/constants.py +16 -0
  15. PyHardLinkBackup/logging_setup.py +124 -0
  16. PyHardLinkBackup/rebuild_databases.py +176 -0
  17. PyHardLinkBackup/tests/__init__.py +36 -0
  18. PyHardLinkBackup/tests/test_backup.py +628 -0
  19. PyHardLinkBackup/tests/test_compare_backup.py +86 -0
  20. PyHardLinkBackup/tests/test_doc_write.py +26 -0
  21. PyHardLinkBackup/tests/test_doctests.py +10 -0
  22. PyHardLinkBackup/tests/test_project_setup.py +46 -0
  23. PyHardLinkBackup/tests/test_readme.py +75 -0
  24. PyHardLinkBackup/tests/test_readme_history.py +9 -0
  25. PyHardLinkBackup/tests/test_rebuild_database.py +224 -0
  26. PyHardLinkBackup/utilities/__init__.py +0 -0
  27. PyHardLinkBackup/utilities/file_hash_database.py +62 -0
  28. PyHardLinkBackup/utilities/file_size_database.py +46 -0
  29. PyHardLinkBackup/utilities/filesystem.py +158 -0
  30. PyHardLinkBackup/utilities/humanize.py +39 -0
  31. PyHardLinkBackup/utilities/rich_utils.py +99 -0
  32. PyHardLinkBackup/utilities/sha256sums.py +61 -0
  33. PyHardLinkBackup/utilities/tee.py +40 -0
  34. PyHardLinkBackup/utilities/tests/__init__.py +0 -0
  35. PyHardLinkBackup/utilities/tests/test_file_hash_database.py +143 -0
  36. PyHardLinkBackup/utilities/tests/test_file_size_database.py +138 -0
  37. PyHardLinkBackup/utilities/tests/test_filesystem.py +126 -0
  38. PyHardLinkBackup/utilities/tyro_cli_shared_args.py +12 -0
  39. pyhardlinkbackup-1.5.0.dist-info/METADATA +600 -0
  40. pyhardlinkbackup-1.5.0.dist-info/RECORD +42 -0
  41. pyhardlinkbackup-1.5.0.dist-info/WHEEL +4 -0
  42. pyhardlinkbackup-1.5.0.dist-info/entry_points.txt +3 -0

PyHardLinkBackup/utilities/filesystem.py
@@ -0,0 +1,158 @@
+ import hashlib
+ import logging
+ import os
+ import shutil
+ import time
+ from pathlib import Path
+ from typing import Iterable
+
+ from bx_py_utils.path import assert_is_dir
+ from rich.progress import (
+     Progress,
+     SpinnerColumn,
+     TextColumn,
+     TimeElapsedColumn,
+ )
+
+ from PyHardLinkBackup.constants import CHUNK_SIZE, HASH_ALGO
+ from PyHardLinkBackup.utilities.rich_utils import HumanFileSizeColumn
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def hash_file(path: Path) -> str:
+     logger.debug('Hash file %s using %s', path, HASH_ALGO)
+     with path.open('rb') as f:
+         digest = hashlib.file_digest(f, HASH_ALGO)
+
+     file_hash = digest.hexdigest()
+     logger.info('%s %s hash: %s', path, HASH_ALGO, file_hash)
+     return file_hash
+
+
+ def copy_and_hash(src: Path, dst: Path) -> str:
+     logger.debug('Copy and hash file %s to %s using %s', src, dst, HASH_ALGO)
+     hasher = hashlib.new(HASH_ALGO)
+     with src.open('rb') as source_file, dst.open('wb') as dst_file:
+         while chunk := source_file.read(CHUNK_SIZE):
+             dst_file.write(chunk)
+             hasher.update(chunk)
+
+     # Keep original file metadata (permission bits, last access time, last modification time, and flags)
+     shutil.copystat(src, dst)
+
+     file_hash = hasher.hexdigest()
+     logger.info('%s backup to %s with %s hash: %s', src, dst, HASH_ALGO, file_hash)
+     return file_hash
+
+
+ def read_and_hash_file(path: Path) -> tuple[bytes, str]:
+     logger.debug('Read and hash file %s using %s into RAM', path, HASH_ALGO)
+     content = path.read_bytes()
+     hasher = hashlib.new(HASH_ALGO, content)
+     file_hash = hasher.hexdigest()
+     logger.info('%s %s hash: %s', path, HASH_ALGO, file_hash)
+     return content, file_hash
+
+
+ def iter_scandir_files(path: Path, excludes: set[str]) -> Iterable[os.DirEntry]:
+     """
+     Recursively yield all files+symlinks in the given directory.
+     """
+     logger.debug('Scanning directory %s', path)
+     with os.scandir(path) as scandir_iterator:
+         for entry in scandir_iterator:
+             if entry.is_dir(follow_symlinks=True):
+                 if entry.name in excludes:
+                     logger.debug('Excluding directory %s', entry.path)
+                     continue
+                 yield from iter_scandir_files(Path(entry.path), excludes=excludes)
+             else:
+                 # It's a file or symlink or broken symlink
+                 yield entry
+
+
+ def humanized_fs_scan(path: Path, excludes: set[str]) -> tuple[int, int]:
+     print(f'\nScanning filesystem at: {path}...')
+
+     progress = Progress(
+         TimeElapsedColumn(),
+         '{task.description}',
+         SpinnerColumn('simpleDots'),
+         TextColumn('[green]{task.fields[file_count]} Files'),
+         '|',
+         HumanFileSizeColumn(field_name='total_size'),
+         '|',
+         TextColumn('[cyan]{task.fields[files_per_sec]} Files/sec'),
+     )
+
+     file_count = 0
+     total_size = 0
+     start_time = time.time()
+     scan_task_id = progress.add_task(
+         description='Scanning',
+         file_count=file_count,
+         total_size=total_size,
+         files_per_sec=0.0,
+         total=None,
+     )
+     next_update = 0
+     with progress:
+         for entry in iter_scandir_files(path, excludes=excludes):
+             file_count += 1
+             try:
+                 total_size += entry.stat().st_size
+             except FileNotFoundError:
+                 # e.g.: broken symlink
+                 continue
+
+             now = time.time()
+             if now >= next_update:
+                 elapsed = max(now - start_time, 1e-6)
+                 files_per_sec = int(file_count / elapsed)
+                 progress.update(
+                     scan_task_id,
+                     file_count=file_count,
+                     total_size=total_size,
+                     files_per_sec=files_per_sec,
+                 )
+                 next_update = now + 1
+
+         now = time.time()
+
+         elapsed = max(now - start_time, 1e-6)
+         files_per_sec = int(file_count / elapsed)
+         progress.stop_task(scan_task_id)
+         progress.update(
+             scan_task_id,
+             description='Completed',
+             completed=True,
+             file_count=file_count,
+             total_size=total_size,
+             files_per_sec=files_per_sec,
+         )
+
+     return file_count, total_size
+
+
+ def supports_hardlinks(directory: Path) -> bool:
+     logger.debug('Checking hardlink support in %s', directory)
+     assert_is_dir(directory)
+     test_src_file = directory / '.phlb_test'
+     test_dst_file = directory / '.phlb_test_link'
+     hardlinks_supported = False
+     try:
+         test_src_file.write_text('test')
+         os.link(test_src_file, test_dst_file)
+         assert test_dst_file.read_text() == 'test'
+         hardlinks_supported = True
+     except OSError as err:
+         # e.g.: FAT/exFAT filesystems ;)
+         logger.exception('Hardlink test failed in %s: %s', directory, err)
+     finally:
+         test_src_file.unlink(missing_ok=True)
+         test_dst_file.unlink(missing_ok=True)
+
+     logger.info('Hardlink support in %s: %s', directory, hardlinks_supported)
+     return hardlinks_supported
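
Taken together these helpers form the backup hot path: supports_hardlinks() probes the target filesystem, humanized_fs_scan() pre-counts files and bytes, and copy_and_hash() streams each file while hashing it in a single pass. A minimal usage sketch, assuming an existing source tree and backup target (all paths here are illustrative, not part of the package):

    from pathlib import Path

    from PyHardLinkBackup.utilities.filesystem import (
        copy_and_hash,
        humanized_fs_scan,
        supports_hardlinks,
    )

    backup_root = Path('/mnt/backup')  # illustrative target directory
    if not supports_hardlinks(backup_root):
        raise SystemExit('Backup target (e.g. a FAT/exFAT volume) cannot hold hardlinks')

    # Pre-scan so that later progress displays know the totals:
    file_count, total_size = humanized_fs_scan(Path('/home/user/data'), excludes={'.cache'})

    # Stream-copy one file into the backup, hashing it in the same pass:
    file_hash = copy_and_hash(
        src=Path('/home/user/data/photo.jpg'),
        dst=backup_root / 'photo.jpg',
    )
    print(f'Backed up with {file_hash=}')
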
PyHardLinkBackup/utilities/humanize.py
@@ -0,0 +1,39 @@
+ import time
+
+ from bx_py_utils.humanize.time import human_timedelta
+
+
+ def human_filesize(size: int | float) -> str:
+     """
+     >>> human_filesize(1024)
+     '1.00 KiB'
+     >>> human_filesize(2.2*1024)
+     '2.20 KiB'
+     >>> human_filesize(3.33*1024*1024)
+     '3.33 MiB'
+     >>> human_filesize(4.44*1024*1024*1024)
+     '4.44 GiB'
+     >>> human_filesize(5.55*1024*1024*1024*1024)
+     '5.55 TiB'
+     >>> human_filesize(6.66*1024*1024*1024*1024*1024)
+     '6.66 PiB'
+     """
+     for unit in ['Bytes', 'KiB', 'MiB', 'GiB', 'TiB']:
+         if size < 1024.0:
+             return f'{size:.2f} {unit}'
+         size /= 1024.0
+     return f'{size:.2f} PiB'
+
+
+ class PrintTimingContextManager:
+     def __init__(self, description: str):
+         self.description = description
+
+     def __enter__(self) -> None:
+         self.start_time = time.perf_counter()
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         duration = time.perf_counter() - self.start_time
+         print(f'{self.description}: {human_timedelta(duration)}\n')
+         if exc_type:
+             return False  # Do not suppress exceptions
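
PrintTimingContextManager is a small convenience wrapper around time.perf_counter(); a short sketch of both helpers (the workload is a placeholder):

    from PyHardLinkBackup.utilities.humanize import PrintTimingContextManager, human_filesize

    # Everything inside the block is timed; the duration is printed on exit:
    with PrintTimingContextManager('Example phase'):
        total = sum(range(10_000_000))  # placeholder workload

    print(human_filesize(123_456_789))  # -> '117.74 MiB'
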
PyHardLinkBackup/utilities/rich_utils.py
@@ -0,0 +1,99 @@
+ from rich.live import Live
+ from rich.panel import Panel
+ from rich.progress import (
+     BarColumn,
+     Progress,
+     ProgressColumn,
+     TaskProgressColumn,
+     TextColumn,
+     TimeElapsedColumn,
+     TimeRemainingColumn,
+     TransferSpeedColumn,
+ )
+ from rich.table import Table
+ from rich.text import Text
+
+ from PyHardLinkBackup.utilities.humanize import human_filesize
+
+
+ class HumanFileSizeColumn(ProgressColumn):
+     def __init__(self, field_name: str | None = None, **kwargs) -> None:
+         super().__init__(**kwargs)
+         self.field_name = field_name
+
+     def render(self, task):
+         if self.field_name is None:
+             file_size = task.completed
+         else:
+             try:
+                 file_size = task.fields[self.field_name]
+             except KeyError:
+                 raise KeyError(f'Field {self.field_name=} not found in: {task.fields.keys()=}') from None
+         return Text(human_filesize(file_size))
+
+
+ class DisplayFileTreeProgress:
+     def __init__(self, total_file_count: int, total_size: int):
+         percentage_format = '[progress.percentage]{task.percentage:>3.1f}%'
+         self.overall_progress = Progress(
+             TaskProgressColumn(text_format=percentage_format),
+             BarColumn(bar_width=50),
+             TextColumn('Elapsed:'),
+             TimeElapsedColumn(),
+             TextColumn('Remaining:'),
+             TimeRemainingColumn(),
+         )
+         self.overall_progress_task_id = self.overall_progress.add_task(description='', total=100)
+
+         self.file_count_progress = Progress(
+             TaskProgressColumn(text_format=percentage_format),
+             BarColumn(bar_width=50),
+             TextColumn('{task.completed}/{task.total} Files'),
+         )
+         self.file_count_progress_task_id = self.file_count_progress.add_task(description='', total=total_file_count)
+         self.file_count_progress_task = self.file_count_progress.tasks[0]
+
+         self.file_size_progress = Progress(
+             TaskProgressColumn(text_format=percentage_format),
+             BarColumn(bar_width=50),
+             HumanFileSizeColumn(),
+             '|',
+             TransferSpeedColumn(),
+         )
+         self.file_size_progress_task_id = self.file_size_progress.add_task(description='', total=total_size)
+         self.file_size_progress_task = self.file_size_progress.tasks[0]
+
+         progress_table = Table.grid()
+         progress_table.add_row(Panel(self.overall_progress, title='[b]Overall Progress', border_style='green'))
+         progress_table.add_row(Panel(self.file_count_progress, title='Total files saved'))
+         progress_table.add_row(Panel(self.file_size_progress, title='Total file size processed'))
+
+         self.live = Live(progress_table, auto_refresh=False)
+
+     def __enter__(self):
+         self.live.__enter__()
+         return self
+
+     def update(self, completed_file_count: int, completed_size: int):
+         self.file_count_progress.update(
+             task_id=self.file_count_progress_task_id,
+             completed=completed_file_count,
+             refresh=True,
+         )
+         self.file_size_progress.update(
+             task_id=self.file_size_progress_task_id,
+             completed=completed_size,
+             refresh=True,
+         )
+         self.overall_progress.update(
+             task_id=self.overall_progress_task_id,
+             completed=(self.file_count_progress_task.percentage + self.file_size_progress_task.percentage) / 2,
+             refresh=True,
+         )
+         self.live.refresh()
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         self.overall_progress.stop()
+         self.file_count_progress.stop()
+         self.file_size_progress.stop()
+         self.live.__exit__(exc_type, exc_value, traceback)
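
DisplayFileTreeProgress bundles three stacked rich progress bars (overall percentage, file count, byte count) behind one context manager; the caller only feeds it running totals. A usage sketch with made-up file sizes:

    import time

    from PyHardLinkBackup.utilities.rich_utils import DisplayFileTreeProgress

    sizes = [1_000, 5_000, 2_500]  # e.g. collected by a pre-scan

    completed_count = 0
    completed_size = 0
    with DisplayFileTreeProgress(total_file_count=len(sizes), total_size=sum(sizes)) as progress:
        for size in sizes:
            time.sleep(0.1)  # stands in for backing up one file
            completed_count += 1
            completed_size += size
            progress.update(completed_file_count=completed_count, completed_size=completed_size)

Note that the overall bar is simply the mean of the file-count and file-size percentages, so it advances steadily even when file sizes are heavily skewed.
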
PyHardLinkBackup/utilities/sha256sums.py
@@ -0,0 +1,61 @@
+ import logging
+ from pathlib import Path
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_sha256sums_path(file_path: Path):
+     """
+     >>> get_sha256sums_path(Path('foo/bar/baz.txt'))
+     PosixPath('foo/bar/SHA256SUMS')
+     """
+     hash_file_path = file_path.parent / 'SHA256SUMS'
+     return hash_file_path
+
+
+ def store_hash(file_path: Path, file_hash: str):
+     """DocWrite: README.md ## SHA256SUMS
+     A `SHA256SUMS` file is stored in each backup directory containing the SHA256 hashes of all files in that directory.
+     It's the same format as e.g. the `sha256sum * > SHA256SUMS` command produces,
+     so it's possible to verify the integrity of the backup files later,
+     e.g.:
+     ```bash
+     cd .../your/backup/foobar/20240101_120000/
+     sha256sum -c SHA256SUMS
+     ```
+     """
+     hash_file_path = get_sha256sums_path(file_path)
+     with hash_file_path.open('a') as f:
+         f.write(f'{file_hash}  {file_path.name}\n')
+
+
+ def check_sha256sums(
+     *,
+     file_path: Path,
+     file_hash: str,
+ ) -> bool | None:
+     hash_file_path = get_sha256sums_path(file_path=file_path)
+     if not hash_file_path.is_file():
+         return None  # Nothing to verify against
+
+     with hash_file_path.open('r') as f:
+         for line in f:
+             try:
+                 expected_hash, filename = line.split(' ', maxsplit=1)
+             except ValueError:
+                 logger.exception(f'Invalid line in "{hash_file_path}": {line!r}')
+             else:
+                 filename = filename.strip()
+                 if filename == file_path.name:
+                     if not expected_hash == file_hash:
+                         logger.error(
+                             f'Hash {file_hash} from file {file_path} does not match hash in {hash_file_path} !'
+                         )
+                         return False
+                     else:
+                         logger.debug(f'{file_path} hash verified successfully from {hash_file_path}.')
+                         return True
+
+     logger.info('No SHA256SUMS entry found for file: %s', file_path)
+     return None
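
A round-trip sketch of the two entry points, using a temporary directory (paths and file content are illustrative):

    import hashlib
    from pathlib import Path
    from tempfile import TemporaryDirectory

    from PyHardLinkBackup.utilities.sha256sums import check_sha256sums, store_hash

    with TemporaryDirectory() as tmp:
        file_path = Path(tmp) / 'example.txt'
        file_path.write_bytes(b'hello')
        file_hash = hashlib.sha256(b'hello').hexdigest()

        store_hash(file_path, file_hash)  # appends one line to .../SHA256SUMS
        assert check_sha256sums(file_path=file_path, file_hash=file_hash) is True
        assert check_sha256sums(file_path=file_path, file_hash='0' * 64) is False  # mismatch is logged
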
PyHardLinkBackup/utilities/tee.py
@@ -0,0 +1,40 @@
+ import re
+ import sys
+ from contextlib import redirect_stdout
+
+
+ # Borrowed from click:
+ _ansi_re = re.compile(r'\033\[[;?0-9]*[a-zA-Z]')
+
+
+ def strip_ansi_codes(value: str) -> str:
+     return _ansi_re.sub('', value)
+
+
+ class TeeStdout:
+     def __init__(self, file):
+         self.file = file
+         self.stdout = sys.stdout
+
+     def write(self, data):
+         self.stdout.write(data)
+         self.file.write(strip_ansi_codes(data))
+
+     def flush(self):
+         self.stdout.flush()
+         self.file.flush()
+
+
+ class TeeStdoutContext:
+     def __init__(self, file_path):
+         self.file_path = file_path
+
+     def __enter__(self):
+         self.file = open(self.file_path, 'w')
+         self.redirect = redirect_stdout(TeeStdout(self.file))
+         self.redirect.__enter__()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.redirect.__exit__(exc_type, exc_val, exc_tb)
+         self.file.close()
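
Everything printed inside a TeeStdoutContext block reaches the terminal unchanged and is mirrored into the file with ANSI escape codes stripped. A short sketch (the log file name is illustrative):

    from PyHardLinkBackup.utilities.tee import TeeStdoutContext

    with TeeStdoutContext('backup.log'):
        print('\x1b[32mOK\x1b[0m: 42 files backed up')
    # The terminal shows a green 'OK'; backup.log contains only the plain text.
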
PyHardLinkBackup/utilities/tests/__init__.py: file without changes (new empty file)

PyHardLinkBackup/utilities/tests/test_file_hash_database.py
@@ -0,0 +1,143 @@
+ import logging
+ import tempfile
+ import textwrap
+ from pathlib import Path
+
+ from bx_py_utils.path import assert_is_dir
+ from bx_py_utils.test_utils.assertion import assert_text_equal
+ from bx_py_utils.test_utils.log_utils import NoLogs
+ from cli_base.cli_tools.test_utils.base_testcases import BaseTestCase
+
+ from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase, HashAlreadyExistsError
+ from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
+
+
+ class TemporaryFileHashDatabase(tempfile.TemporaryDirectory):
+     def __enter__(self) -> FileHashDatabase:
+         temp_dir = super().__enter__()
+         backup_root = Path(temp_dir).resolve()
+
+         phlb_conf_dir = backup_root / '.phlb'
+         phlb_conf_dir.mkdir()
+
+         hash_db = FileHashDatabase(backup_root=backup_root, phlb_conf_dir=phlb_conf_dir)
+         return hash_db
+
+
+ def get_hash_db_filenames(hash_db: FileHashDatabase) -> list[str]:
+     # with NoLogs('PyHardLinkBackup.utilities.filesystem'):
+     return sorted(
+         str(Path(entry.path).relative_to(hash_db.base_path))
+         for entry in iter_scandir_files(hash_db.base_path, excludes=set())
+     )
+
+
+ def get_hash_db_info(backup_root: Path) -> str:
+     db_base_path = backup_root / '.phlb' / 'hash-lookup'
+     assert_is_dir(db_base_path)
+
+     with NoLogs(logger_name='XY'):
+         lines = []
+         for entry in iter_scandir_files(db_base_path, excludes=set()):
+             hash_path = Path(entry.path)
+             rel_path = hash_path.relative_to(db_base_path)
+             rel_file_path = hash_path.read_text()
+             lines.append(f'{str(rel_path)[:20]}… -> {rel_file_path}')
+     return '\n'.join(sorted(lines))
+
+
+ def assert_hash_db_info(backup_root: Path, expected: str):
+     expected = textwrap.dedent(expected).strip()
+     actual = get_hash_db_info(backup_root)
+     assert_text_equal(
+         actual,
+         expected,
+         msg=f'FileHashDatabase info does not match as expected.\n\n{actual}\n\n',
+     )
+
+
+ class FileHashDatabaseTestCase(BaseTestCase):
+     def test_happy_path(self):
+         with TemporaryFileHashDatabase() as hash_db:
+             self.assertIsInstance(hash_db, FileHashDatabase)
+
+             backup_root_path = hash_db.backup_root
+             assert_is_dir(backup_root_path)
+
+             test_path = hash_db._get_hash_path('12345678abcdef')
+             self.assertEqual(test_path, hash_db.base_path / '12' / '34' / '12345678abcdef')
+
+             file_a_path = backup_root_path / 'rel/path/to/file-A'
+             file_a_path.parent.mkdir(parents=True, exist_ok=True)
+             file_a_path.touch()
+
+             self.assertIs(hash_db.get('12345678abcdef'), None)
+             self.assertIs('12345678abcdef' in hash_db, False)
+             hash_db['12345678abcdef'] = file_a_path
+             self.assertEqual(hash_db.get('12345678abcdef'), file_a_path)
+             self.assertIs('12345678abcdef' in hash_db, True)
+             with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                 self.assertEqual(
+                     get_hash_db_filenames(hash_db),
+                     ['12/34/12345678abcdef'],
+                 )
+
+             ########################################################################################
+             # Another instance using the same directory:
+
+             another_hash_db = FileHashDatabase(
+                 backup_root=hash_db.backup_root,
+                 phlb_conf_dir=hash_db.base_path.parent,
+             )
+             self.assertEqual(another_hash_db.get('12345678abcdef'), file_a_path)
+             self.assertIs(another_hash_db.get('12abcd345678abcdef'), None)
+
+             file_b_path = backup_root_path / 'rel/path/to/file-B'
+             file_b_path.parent.mkdir(parents=True, exist_ok=True)
+             file_b_path.touch()
+
+             another_hash_db['12abcd345678abcdef'] = file_b_path
+             self.assertEqual(another_hash_db.get('12abcd345678abcdef'), file_b_path)
+             with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                 self.assertEqual(
+                     get_hash_db_filenames(another_hash_db),
+                     [
+                         '12/34/12345678abcdef',
+                         '12/ab/12abcd345678abcdef',
+                     ],
+                 )
+
+             with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                 assert_hash_db_info(
+                     backup_root=hash_db.backup_root,
+                     expected="""
+                         12/34/12345678abcdef… -> rel/path/to/file-A
+                         12/ab/12abcd345678ab… -> rel/path/to/file-B
+                     """,
+                 )
+
+             ########################################################################################
+             # Deny "overwrite" of existing hash:
+
+             with self.assertRaises(HashAlreadyExistsError):
+                 hash_db['12abcd345678abcdef'] = 'foo/bar/baz'  # already exists!
+
+             ########################################################################################
+             # Don't use stale entries pointing to missing files:
+
+             self.assertEqual(hash_db.get('12345678abcdef'), file_a_path)
+             file_a_path.unlink()
+
+             """DocWrite: README.md ## FileHashDatabase - Missing hardlink target file
+             We check if the hardlink source file still exists. If not, we remove the hash entry from the database.
+             A warning is logged in this case."""
+             with self.assertLogs(level=logging.WARNING) as logs:
+                 self.assertIs(hash_db.get('12345678abcdef'), None)
+             self.assertIn('Hash database entry found, but file does not exist', ''.join(logs.output))
+             with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                 assert_hash_db_info(
+                     backup_root=hash_db.backup_root,
+                     expected="""
+                         12/ab/12abcd345678ab… -> rel/path/to/file-B
+                     """,
+                 )
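
The test above pins down the on-disk layout: the first four hex characters of a hash select two directory levels, which keeps any single directory from accumulating an unbounded number of entries. A hypothetical re-implementation of just that mapping (not the package's code):

    from pathlib import Path

    def hash_to_path(base_path: Path, file_hash: str) -> Path:
        # '12345678abcdef' -> base_path / '12' / '34' / '12345678abcdef'
        return base_path / file_hash[0:2] / file_hash[2:4] / file_hash

    assert hash_to_path(Path('db'), '12345678abcdef') == Path('db/12/34/12345678abcdef')
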
PyHardLinkBackup/utilities/tests/test_file_size_database.py
@@ -0,0 +1,138 @@
+ import logging
+ import tempfile
+ from collections.abc import Iterable
+ from pathlib import Path
+
+ from bx_py_utils.test_utils.log_utils import NoLogs
+ from cli_base.cli_tools.test_utils.base_testcases import BaseTestCase
+
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
+ from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
+
+
+ class TemporaryFileSizeDatabase(tempfile.TemporaryDirectory):
+     def __enter__(self) -> FileSizeDatabase:
+         temp_dir = super().__enter__()
+         backup_root = Path(temp_dir).resolve()
+
+         phlb_conf_dir = backup_root / '.phlb'
+         phlb_conf_dir.mkdir()
+
+         size_db = FileSizeDatabase(phlb_conf_dir=phlb_conf_dir)
+         return size_db
+
+
+ def get_size_db_filenames(size_db: FileSizeDatabase) -> Iterable[str]:
+     return sorted(
+         str(Path(entry.path).relative_to(size_db.base_path))
+         for entry in iter_scandir_files(size_db.base_path, excludes=set())
+     )
+
+
+ def get_sizes(size_db: FileSizeDatabase) -> Iterable[int]:
+     with NoLogs('PyHardLinkBackup.utilities.filesystem'):
+         return sorted(int(entry.name) for entry in iter_scandir_files(size_db.base_path, excludes=set()))
+
+
+ class FileSizeDatabaseTestCase(BaseTestCase):
+     def test_happy_path(self):
+         with TemporaryFileSizeDatabase() as size_db:
+             self.assertIsInstance(size_db, FileSizeDatabase)
+
+             test_path1 = size_db._get_size_path(1234)
+             self.assertEqual(test_path1, size_db.base_path / '12' / '34' / '1234')
+
+             test_path2 = size_db._get_size_path(567890)
+             self.assertEqual(test_path2, size_db.base_path / '56' / '78' / '567890')
+
+             self.assertNotIn(1234, size_db)
+             self.assertNotIn(567890, size_db)
+
+             size_db.add(1234)
+             self.assertIn(1234, size_db)
+             self.assertNotIn(567890, size_db)
+
+             size_db.add(567890)
+             self.assertIn(1234, size_db)
+             self.assertIn(567890, size_db)
+
+             with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                 self.assertEqual(get_sizes(size_db), [1234, 567890])
+                 self.assertEqual(
+                     get_size_db_filenames(size_db),
+                     [
+                         '12/34/1234',
+                         '56/78/567890',
+                     ],
+                 )
+
+             ########################################################################################
+             # Another instance using the same directory:
+
+             another_size_db = FileSizeDatabase(phlb_conf_dir=size_db.base_path.parent)
+             with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                 self.assertEqual(get_sizes(another_size_db), [1234, 567890])
+                 self.assertEqual(
+                     get_size_db_filenames(another_size_db),
+                     [
+                         '12/34/1234',
+                         '56/78/567890',
+                     ],
+                 )
+
+             ########################################################################################
+             # "Share" directories:
+
+             for size in (123400001111, 123400002222, 128800003333, 129900004444):
+                 self.assertNotIn(size, size_db)
+                 size_db.add(size)
+                 self.assertIn(size, size_db)
+
+             ########################################################################################
+             # Min size is 1000 bytes:
+
+             """DocWrite: README.md ## FileSizeDatabase - minimum file size
+             The minimum file size that can be stored in the FileSizeDatabase is 1000 bytes.
+             This is because no padding is made for sizes below 1000 bytes, which would
+             break the directory structure.
+             """
+             self.assertEqual(FileSizeDatabase.MIN_SIZE, 1000)
+             """DocWrite: README.md ## FileSizeDatabase - minimum file size
+             The idea is that it's more efficient to backup small files directly, instead of
+             checking for duplicates via hardlinks. Therefore, small files below this size
+             are not tracked in the FileSizeDatabase.
+             """
+
+             with self.assertRaises(AssertionError):
+                 size_db._get_size_path(999)
+             with self.assertRaises(AssertionError):
+                 size_db.add(999)
+             with self.assertRaises(AssertionError):
+                 999 in size_db
+
+             ########################################################################################
+             # Check final state:
+
+             with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
+                 self.assertEqual(
+                     get_size_db_filenames(size_db),
+                     [
+                         '12/34/1234',
+                         '12/34/123400001111',
+                         '12/34/123400002222',
+                         '12/88/128800003333',
+                         '12/99/129900004444',
+                         '56/78/567890',
+                     ],
+                 )
+                 self.assertEqual(
+                     get_sizes(size_db),
+                     [
+                         1234,
+                         567890,
+                         123400001111,
+                         123400002222,
+                         128800003333,
+                         129900004444,
+                     ],
+                 )
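
The size database uses the same two-level fan-out, driven by the leading decimal digits of the file size; the 1000-byte minimum guarantees at least four digits, so both directory levels always exist. A hypothetical sketch of that mapping (not the package's code):

    from pathlib import Path

    MIN_SIZE = 1000  # matches FileSizeDatabase.MIN_SIZE asserted above

    def size_to_path(base_path: Path, size: int) -> Path:
        # 567890 -> base_path / '56' / '78' / '567890'
        assert size >= MIN_SIZE, f'{size=} is below the minimum of {MIN_SIZE} bytes'
        digits = str(size)
        return base_path / digits[0:2] / digits[2:4] / digits

    assert size_to_path(Path('db'), 567890) == Path('db/56/78/567890')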