PyHardLinkBackup 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. PyHardLinkBackup/__init__.py +7 -0
  2. PyHardLinkBackup/__main__.py +10 -0
  3. PyHardLinkBackup/backup.py +297 -0
  4. PyHardLinkBackup/cli_app/__init__.py +41 -0
  5. PyHardLinkBackup/cli_app/phlb.py +136 -0
  6. PyHardLinkBackup/cli_dev/__init__.py +70 -0
  7. PyHardLinkBackup/cli_dev/__main__.py +10 -0
  8. PyHardLinkBackup/cli_dev/benchmark.py +138 -0
  9. PyHardLinkBackup/cli_dev/code_style.py +12 -0
  10. PyHardLinkBackup/cli_dev/debugging.py +47 -0
  11. PyHardLinkBackup/cli_dev/packaging.py +62 -0
  12. PyHardLinkBackup/cli_dev/shell_completion.py +23 -0
  13. PyHardLinkBackup/cli_dev/testing.py +52 -0
  14. PyHardLinkBackup/cli_dev/update_readme_history.py +33 -0
  15. PyHardLinkBackup/compare_backup.py +259 -0
  16. PyHardLinkBackup/constants.py +18 -0
  17. PyHardLinkBackup/logging_setup.py +124 -0
  18. PyHardLinkBackup/rebuild_databases.py +217 -0
  19. PyHardLinkBackup/tests/__init__.py +36 -0
  20. PyHardLinkBackup/tests/test_backup.py +1167 -0
  21. PyHardLinkBackup/tests/test_compare_backup.py +167 -0
  22. PyHardLinkBackup/tests/test_doc_write.py +26 -0
  23. PyHardLinkBackup/tests/test_doctests.py +10 -0
  24. PyHardLinkBackup/tests/test_project_setup.py +46 -0
  25. PyHardLinkBackup/tests/test_readme.py +75 -0
  26. PyHardLinkBackup/tests/test_readme_history.py +9 -0
  27. PyHardLinkBackup/tests/test_rebuild_database.py +266 -0
  28. PyHardLinkBackup/utilities/__init__.py +0 -0
  29. PyHardLinkBackup/utilities/file_hash_database.py +62 -0
  30. PyHardLinkBackup/utilities/file_size_database.py +46 -0
  31. PyHardLinkBackup/utilities/filesystem.py +257 -0
  32. PyHardLinkBackup/utilities/humanize.py +39 -0
  33. PyHardLinkBackup/utilities/rich_utils.py +237 -0
  34. PyHardLinkBackup/utilities/sha256sums.py +61 -0
  35. PyHardLinkBackup/utilities/tee.py +40 -0
  36. PyHardLinkBackup/utilities/tests/__init__.py +0 -0
  37. PyHardLinkBackup/utilities/tests/test_file_hash_database.py +153 -0
  38. PyHardLinkBackup/utilities/tests/test_file_size_database.py +151 -0
  39. PyHardLinkBackup/utilities/tests/test_filesystem.py +167 -0
  40. PyHardLinkBackup/utilities/tests/unittest_utilities.py +78 -0
  41. PyHardLinkBackup/utilities/tyro_cli_shared_args.py +29 -0
  42. pyhardlinkbackup-1.8.1.dist-info/METADATA +700 -0
  43. pyhardlinkbackup-1.8.1.dist-info/RECORD +45 -0
  44. pyhardlinkbackup-1.8.1.dist-info/WHEEL +4 -0
  45. pyhardlinkbackup-1.8.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,167 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from unittest import TestCase
4
+
5
+ from bx_py_utils.test_utils.redirect import RedirectOut
6
+ from cli_base.cli_tools.test_utils.assertion import assert_in
7
+ from cli_base.cli_tools.test_utils.rich_test_utils import NoColorEnvRich
8
+ from freezegun import freeze_time
9
+
10
+ from PyHardLinkBackup.compare_backup import CompareResult, LoggingManager, compare_tree
11
+ from PyHardLinkBackup.logging_setup import DEFAULT_LOG_FILE_LEVEL
12
+ from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
13
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
14
+ from PyHardLinkBackup.utilities.filesystem import hash_file
15
+ from PyHardLinkBackup.utilities.rich_utils import NoopProgress
16
+ from PyHardLinkBackup.utilities.tests.unittest_utilities import (
17
+ CollectOpenFiles,
18
+ PyHardLinkBackupTestCaseMixin,
19
+ )
20
+
21
+
22
def assert_compare_backup(
    test_case: TestCase,
    src_root: Path,
    backup_root: Path,
    excpected_last_timestamp: str,
    excpected_total_file_count: int,
    excpected_successful_file_count: int,
    std_out_parts: tuple[str, ...] = ('Compare completed.',),
    excludes: tuple[str, ...] = (),
    excpected_error_count: int = 0,
) -> None:
    """
    Test helper: run compare_tree() on src_root/backup_root and assert the key
    result counters against the given expected values.

    NOTE(review): the "excpected_*" parameter names misspell "expected" -- kept
    as-is here for caller compatibility; a rename must touch all call sites.
    """
    with (
        NoColorEnvRich(
            width=200,  # Wide width to avoid line breaks in test output that failed assert_in()
        ),
        RedirectOut() as redirected_out,
    ):
        result = compare_tree(
            src_root=src_root,
            backup_root=backup_root,
            one_file_system=True,
            excludes=excludes,
            log_manager=LoggingManager(
                console_level='info',
                file_level=DEFAULT_LOG_FILE_LEVEL,
            ),
        )
    stdout = redirected_out.stdout
    # stdout is passed as the assertion "msg" so failures show the captured output:
    test_case.assertEqual(redirected_out.stderr, '', stdout)

    assert_in(content=stdout, parts=std_out_parts)

    test_case.assertEqual(result.last_timestamp, excpected_last_timestamp, stdout)
    test_case.assertEqual(result.total_file_count, excpected_total_file_count, stdout)
    test_case.assertEqual(result.successful_file_count, excpected_successful_file_count, stdout)
    test_case.assertEqual(result.error_count, excpected_error_count, stdout)
58
+
59
+
60
class CompareBackupTestCase(PyHardLinkBackupTestCaseMixin, TestCase):
    """Integration test for compare_tree(): build a fake backup tree and compare it with its source."""

    def test_happy_path(self):
        """One full compare run: 3 source files, one of them already known in the size/hash databases."""
        # Setup backup structure
        phlb_conf_dir = self.backup_root / '.phlb'
        phlb_conf_dir.mkdir()

        # Create some "older" compare dirs
        (self.backup_root / '2025-12-31-235959').mkdir()
        (self.backup_root / '2026-01-10-235959').mkdir()

        # Create "last" backup dir:
        timestamp = '2026-01-17-120000'
        last_backup_dir = self.backup_root / self.src_root.name / timestamp
        last_backup_dir.mkdir(parents=True)

        # Create source files
        (self.src_root / 'small_file.txt').write_text('hello world')
        (self.src_root / 'large_file_missing.txt').write_bytes(b'X' * FileSizeDatabase.MIN_SIZE)
        large_file_in_dbs = self.src_root / 'large_file_in_dbs.txt'
        large_file_in_dbs.write_bytes(b'Y' * (FileSizeDatabase.MIN_SIZE + 1))

        # Copy files to backup
        total_size = 0
        total_file_count = 0
        for file_path in self.src_root.iterdir():
            shutil.copy2(file_path, last_backup_dir / file_path.name)
            total_size += file_path.stat().st_size
            total_file_count += 1
        # 11 + MIN_SIZE + (MIN_SIZE + 1) bytes in total (assumes MIN_SIZE == 1000 -- see assertion):
        self.assertEqual(total_file_count, 3)
        self.assertEqual(total_size, 2012)

        # Create databases and add values from 'large_file_in_dbs.txt'
        size_db = FileSizeDatabase(phlb_conf_dir)
        size_db.add(FileSizeDatabase.MIN_SIZE + 1)
        hash_db = FileHashDatabase(self.backup_root, phlb_conf_dir)
        src_hash = hash_file(large_file_in_dbs, progress=NoopProgress(), total_size=1234)
        hash_db[src_hash] = last_backup_dir / 'large_file_in_dbs.txt'

        #######################################################################################
        # Run compare_tree

        with (
            CollectOpenFiles(self.temp_path) as collector,  # Track which files get opened
            freeze_time('2026-01-18T22:12:34+0000', auto_tick_seconds=0),
            RedirectOut() as redirected_out,
        ):
            result = compare_tree(
                src_root=self.src_root,
                backup_root=self.backup_root,
                one_file_system=True,
                excludes=(),
                log_manager=LoggingManager(
                    console_level='info',
                    file_level=DEFAULT_LOG_FILE_LEVEL,
                ),
            )
        self.assertEqual(redirected_out.stderr, '')
        self.assertIn('Compare completed.', redirected_out.stdout)
        # Every file must be read once on each side (source + backup):
        self.assertEqual(
            sorted(collector.opened_for_read),
            [
                'rb backups/source/2026-01-17-120000/large_file_in_dbs.txt',
                'rb backups/source/2026-01-17-120000/large_file_missing.txt',
                'rb backups/source/2026-01-17-120000/small_file.txt',
                'rb source/large_file_in_dbs.txt',
                'rb source/large_file_missing.txt',
                'rb source/small_file.txt',
            ],
        )
        # Only the log file and the summary should be written:
        self.assertEqual(
            collector.opened_for_write,
            [
                'a backups/source/2026-01-18-221234-compare.log',
                'w backups/source/2026-01-18-221234-summary.txt',
            ],
        )
        self.assertEqual(
            result,
            CompareResult(
                last_timestamp='2026-01-17-120000',
                compare_dir=last_backup_dir,
                log_file=result.log_file,  # self-compare: effectively excludes this field
                total_file_count=total_file_count,
                total_size=total_size,
                src_file_new_count=0,
                file_size_missmatch=0,
                file_hash_missmatch=0,
                small_file_count=1,
                size_db_missing_count=1,
                hash_db_missing_count=1,
                successful_file_count=total_file_count,
                error_count=0,
            ),
            redirected_out.stdout,
        )

        #######################################################################################
        # Check again with our test helper:

        assert_compare_backup(
            test_case=self,
            src_root=self.src_root,
            backup_root=self.backup_root,
            excpected_last_timestamp='2026-01-17-120000',
            excpected_total_file_count=total_file_count,
            excpected_successful_file_count=total_file_count,
            excpected_error_count=0,
        )
@@ -0,0 +1,26 @@
1
+ from unittest import TestCase
2
+
3
+ from bx_py_utils.doc_write.api import GeneratedInfo, generate
4
+ from bx_py_utils.path import assert_is_file
5
+
6
+ from PyHardLinkBackup.cli_dev import PACKAGE_ROOT
7
+
8
+
9
class DocuWriteApiTestCase(TestCase):
    """Verify that all Doc-Write generated documentation files are up to date."""

    def test_up2date_docs(self):
        # NOTE(review): this docstring is Doc-Write *payload* (it is extracted into
        # about-docs.md by the generator below) -- do not edit it casually:
        """DocWrite: about-docs.md # generate Doc-Write

        These documentation files are generated automatically with the "Doc-Write" tool.
        They updated automatically by unittests.

        More information about Doc-Write can be found here:

        https://github.com/boxine/bx_py_utils/tree/master/bx_py_utils/doc_write
        """
        # Sanity check that PACKAGE_ROOT really points at the project root:
        assert_is_file(PACKAGE_ROOT / 'pyproject.toml')

        with self.assertLogs():
            info: GeneratedInfo = generate(base_path=PACKAGE_ROOT)
        self.assertGreaterEqual(len(info.paths), 2)
        # If either count is non-zero, generate() changed files on disk -> commit them:
        self.assertEqual(info.update_count, 0, 'No files should be updated, commit the changes')
        self.assertEqual(info.remove_count, 0, 'No files should be removed, commit the changes')
@@ -0,0 +1,10 @@
1
+ from bx_py_utils.test_utils.unittest_utils import BaseDocTests
2
+
3
+ import PyHardLinkBackup
4
+
5
+
6
class DocTests(BaseDocTests):
    """Run all doctests found in the PyHardLinkBackup package."""

    def test_doctests(self):
        # BaseDocTests.run_doctests() recursively collects doctests from the given modules:
        self.run_doctests(
            modules=(PyHardLinkBackup,),
        )
@@ -0,0 +1,46 @@
1
+ import subprocess
2
+ from unittest import TestCase
3
+
4
+ from bx_py_utils.path import assert_is_file
5
+ from cli_base.cli_tools.code_style import assert_code_style
6
+ from cli_base.cli_tools.subprocess_utils import ToolsExecutor
7
+ from manageprojects.test_utils.project_setup import check_editor_config, get_py_max_line_length
8
+ from packaging.version import Version
9
+
10
+ from PyHardLinkBackup import __version__
11
+ from PyHardLinkBackup.cli_dev import PACKAGE_ROOT
12
+
13
+
14
class ProjectSetupTestCase(TestCase):
    """Meta tests: version consistency, code style, editor config and pre-commit setup."""

    def test_version(self):
        """__version__ must be a valid PEP 440 version and both CLIs must report it."""
        self.assertIsNotNone(__version__)

        version = Version(__version__)  # Will raise InvalidVersion() if wrong formatted
        self.assertEqual(str(version), __version__)

        cli_bin = PACKAGE_ROOT / 'cli.py'
        assert_is_file(cli_bin)

        output = subprocess.check_output([cli_bin, 'version'], text=True)
        self.assertIn(f'PyHardLinkBackup v{__version__}', output)

        dev_cli_bin = PACKAGE_ROOT / 'dev-cli.py'
        assert_is_file(dev_cli_bin)

        output = subprocess.check_output([dev_cli_bin, 'version'], text=True)
        self.assertIn(f'PyHardLinkBackup v{__version__}', output)

    def test_code_style(self):
        """Run the project's linter/formatter checks; non-zero return code means violations."""
        return_code = assert_code_style(package_root=PACKAGE_ROOT)
        self.assertEqual(return_code, 0, 'Code style error, see output above!')

    def test_check_editor_config(self):
        """.editorconfig must exist, be valid and declare the expected max line length."""
        check_editor_config(package_root=PACKAGE_ROOT)

        max_line_length = get_py_max_line_length(package_root=PACKAGE_ROOT)
        self.assertEqual(max_line_length, 119)

    def test_pre_commit_hooks(self):
        """pre-commit config and manifest must be valid and migrated to the current format."""
        executor = ToolsExecutor(cwd=PACKAGE_ROOT)
        for command in ('migrate-config', 'validate-config', 'validate-manifest'):
            executor.verbose_check_call('pre-commit', command, exit_on_error=True)
@@ -0,0 +1,75 @@
1
+ from bx_py_utils.auto_doc import assert_readme_block
2
+ from bx_py_utils.path import assert_is_file
3
+ from cli_base.cli_tools.test_utils.rich_test_utils import NoColorEnvRich, invoke
4
+ from manageprojects.tests.base import BaseTestCase
5
+
6
+ from PyHardLinkBackup import constants
7
+ from PyHardLinkBackup.cli_dev import PACKAGE_ROOT
8
+
9
+
10
def assert_cli_help_in_readme(text_block: str, marker: str):
    """
    Assert that README.md contains *text_block* (a captured CLI --help output)
    between the auto-generated start/end marker comments for *marker*.

    The CLI epilog is stripped first, because it is not part of the README blocks.
    """
    README_PATH = PACKAGE_ROOT / 'README.md'
    assert_is_file(README_PATH)

    text_block = text_block.replace(constants.CLI_EPILOG, '')
    # The README embeds the help output inside a fenced code block:
    text_block = f'```\n{text_block.strip()}\n```'
    assert_readme_block(
        readme_path=README_PATH,
        text_block=text_block,
        start_marker_line=f'[comment]: <> (✂✂✂ auto generated {marker} start ✂✂✂)',
        end_marker_line=f'[comment]: <> (✂✂✂ auto generated {marker} end ✂✂✂)',
    )
22
+
23
+
24
class ReadmeTestCase(BaseTestCase):
    """Keep the auto-generated CLI help sections in README.md up to date."""

    def test_main_help(self):
        """Capture `./cli.py --help` and compare against the 'main help' README block."""
        with NoColorEnvRich():
            stdout = invoke(cli_bin=PACKAGE_ROOT / 'cli.py', args=['--help'], strip_line_prefix='usage: ')

        self.assert_in_content(
            got=stdout,
            parts=(
                'usage: ./cli.py [-h]',
                ' version ',
                'Print version and exit',
                constants.CLI_EPILOG,
            ),
        )

        # Installed via pipx is called 'phlb', not 'cli.py':
        stdout = stdout.replace('./cli.py', 'phlb')

        assert_cli_help_in_readme(text_block=stdout, marker='main help')

    def test_backup_help(self):
        """Capture `./cli.py backup --help` and compare against the 'backup help' README block."""
        with NoColorEnvRich():
            stdout = invoke(cli_bin=PACKAGE_ROOT / 'cli.py', args=['backup', '--help'], strip_line_prefix='usage: ')
        self.assert_in_content(
            got=stdout,
            parts=(
                'usage: ./cli.py backup [-h] ',
                'Backup the source directory to the destination',
            ),
        )

        # Installed via pipx is called 'phlb', not 'cli.py':
        stdout = stdout.replace('./cli.py', 'phlb')

        assert_cli_help_in_readme(text_block=stdout, marker='backup help')

    def test_dev_help(self):
        """Capture `./dev-cli.py --help` and compare against the 'dev help' README block."""
        with NoColorEnvRich():
            stdout = invoke(cli_bin=PACKAGE_ROOT / 'dev-cli.py', args=['--help'], strip_line_prefix='usage: ')
        self.assert_in_content(
            got=stdout,
            parts=(
                'usage: ./dev-cli.py [-h]',
                ' lint ',
                ' coverage ',
                ' update-readme-history ',
                ' publish ',
                constants.CLI_EPILOG,
            ),
        )
        assert_cli_help_in_readme(text_block=stdout, marker='dev help')
@@ -0,0 +1,9 @@
1
+ from unittest import TestCase
2
+
3
+ from cli_base.cli_tools.git_history import update_readme_history
4
+
5
+
6
class ReadmeHistoryTestCase(TestCase):
    """Ensure the git-history section of README.md is regenerated and committed."""

    def test_readme_history(self):
        # raise_update_error=True -> fail the test if the history block changed on disk:
        with self.assertLogs():
            update_readme_history(raise_update_error=True)
@@ -0,0 +1,266 @@
1
+ import logging
2
+ import os
3
+ import textwrap
4
+ from pathlib import Path
5
+ from unittest.mock import patch
6
+
7
+ from bx_py_utils.test_utils.redirect import RedirectOut
8
+ from cli_base.cli_tools.test_utils.base_testcases import BaseTestCase
9
+ from freezegun import freeze_time
10
+
11
+ from PyHardLinkBackup import rebuild_databases
12
+ from PyHardLinkBackup.logging_setup import NoopLoggingManager
13
+ from PyHardLinkBackup.rebuild_databases import RebuildResult, rebuild, rebuild_one_file
14
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
15
+ from PyHardLinkBackup.utilities.tests.unittest_utilities import TemporaryDirectoryPath
16
+
17
+
18
def sorted_rglob_paths(path: Path):
    """Return every entry (files *and* directories) below *path* as sorted, relative path strings."""
    relative_names = (str(entry.relative_to(path)) for entry in path.rglob('*'))
    return sorted(relative_names)
20
+
21
+
22
def sorted_rglob_files(path: Path):
    """Return only the *files* below *path* as sorted, relative path strings (directories are skipped)."""
    return sorted(
        str(entry.relative_to(path))
        for entry in path.rglob('*')
        if entry.is_file()
    )
24
+
25
+
26
class RebuildDatabaseTestCase(BaseTestCase):
    """
    Integration tests for rebuild(): recreate the size/hash lookup databases
    and SHA256SUMS files from an existing backup tree.
    """

    maxDiff = None  # Always show full diffs for the large RebuildResult comparisons

    def test_happy_path(self):
        """Step through: missing dir -> missing .phlb -> empty backup -> one file -> duplicate content -> error."""
        with TemporaryDirectoryPath() as temp_path:
            backup_root = temp_path / 'backup'

            # Non-existing backup directory must abort with an error message:
            with self.assertRaises(SystemExit), RedirectOut() as redirected_out:
                rebuild(backup_root, skip_same_inode=True, log_manager=NoopLoggingManager())

            self.assertEqual(redirected_out.stderr, '')
            self.assertEqual(redirected_out.stdout, f'Error: Backup directory "{backup_root}" does not exist!\n')

            backup_root.mkdir()

            # Existing directory without the '.phlb' config dir must also abort:
            with self.assertRaises(SystemExit), RedirectOut() as redirected_out:
                rebuild(backup_root, skip_same_inode=True, log_manager=NoopLoggingManager())

            self.assertEqual(redirected_out.stderr, '')
            self.assertIn('hidden ".phlb" configuration directory is missing', redirected_out.stdout)

            phlb_conf_dir = backup_root / '.phlb'
            phlb_conf_dir.mkdir()

            #######################################################################################
            # Run on empty backup directory:

            self.assertEqual(sorted_rglob_paths(backup_root), ['.phlb'])

            with (
                self.assertLogs('PyHardLinkBackup', level=logging.DEBUG),
                RedirectOut() as redirected_out,
                freeze_time('2026-01-16T12:34:56Z', auto_tick_seconds=0),
            ):
                rebuild_result = rebuild(backup_root, skip_same_inode=True, log_manager=NoopLoggingManager())
            self.assertEqual(redirected_out.stderr, '')
            self.assertEqual(
                rebuild_result,
                RebuildResult(
                    process_count=0,
                    process_size=0,
                    added_size_count=0,
                    added_hash_count=0,
                    error_count=0,
                ),
            )
            # Empty lookup dirs + summary file were created:
            self.assertEqual(
                sorted_rglob_paths(backup_root),
                [
                    '.phlb',
                    '.phlb/hash-lookup',
                    '.phlb/size-lookup',
                    '2026-01-16-123456-rebuild-summary.txt',
                ],
            )
            self.assertEqual(redirected_out.stderr, '')

            #######################################################################################
            # Add one backuped file and run again:

            snapshot_path = backup_root / 'source-name' / '2026-01-15-181709'
            snapshot_path.mkdir(parents=True)

            minimum_file_content = 'X' * FileSizeDatabase.MIN_SIZE
            (snapshot_path / 'file1.txt').write_text(minimum_file_content)

            with (
                self.assertLogs('PyHardLinkBackup', level=logging.DEBUG),
                RedirectOut() as redirected_out,
                freeze_time('2026-01-16T12:34:56Z', auto_tick_seconds=0),
            ):
                rebuild_result = rebuild(backup_root, skip_same_inode=True, log_manager=NoopLoggingManager())
            self.assertEqual(redirected_out.stderr, '')
            # Size + hash entries are nested into two directory levels (prefix split):
            self.assertEqual(
                sorted_rglob_paths(backup_root),
                [
                    '.phlb',
                    '.phlb/hash-lookup',
                    '.phlb/hash-lookup/bb',
                    '.phlb/hash-lookup/bb/c4',
                    '.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
                    '.phlb/size-lookup',
                    '.phlb/size-lookup/10',
                    '.phlb/size-lookup/10/00',
                    '.phlb/size-lookup/10/00/1000',
                    '2026-01-16-123456-rebuild-summary.txt',
                    'source-name',
                    'source-name/2026-01-15-181709',
                    'source-name/2026-01-15-181709/SHA256SUMS',
                    'source-name/2026-01-15-181709/file1.txt',
                ], redirected_out.stdout
            )
            self.assertEqual(
                sorted_rglob_files(backup_root),
                [
                    '.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
                    '.phlb/size-lookup/10/00/1000',
                    '2026-01-16-123456-rebuild-summary.txt',
                    'source-name/2026-01-15-181709/SHA256SUMS',
                    'source-name/2026-01-15-181709/file1.txt',
                ],
            )
            self.assertEqual(redirected_out.stderr, '')
            self.assertEqual(
                rebuild_result,
                RebuildResult(
                    process_count=1,
                    process_size=1000,
                    added_size_count=1,
                    added_hash_count=1,
                    error_count=0,
                    hash_verified_count=0,
                    hash_mismatch_count=0,
                    hash_not_found_count=1,
                    unique_inode_count=2,
                    skip_by_inode_count=0,
                ),
            )
            self.assertEqual(
                (snapshot_path / 'SHA256SUMS').read_text(),
                'bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8 file1.txt\n',
            )

            #######################################################################################
            # Add a file with same content and run again:

            minimum_file_content = 'X' * FileSizeDatabase.MIN_SIZE
            (snapshot_path / 'same_content.txt').write_text(minimum_file_content)

            with (
                self.assertLogs('PyHardLinkBackup', level=logging.DEBUG) as logs,
                RedirectOut() as redirected_out,
                freeze_time('2026-01-16T12:34:56Z', auto_tick_seconds=0),
            ):
                rebuild_result = rebuild(backup_root, skip_same_inode=True, log_manager=NoopLoggingManager())
            # No new hash of size entries, just the new file:
            self.assertEqual(
                sorted_rglob_files(backup_root),
                [
                    '.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
                    '.phlb/size-lookup/10/00/1000',
                    '2026-01-16-123456-rebuild-summary.txt',
                    'source-name/2026-01-15-181709/SHA256SUMS',
                    'source-name/2026-01-15-181709/file1.txt',
                    'source-name/2026-01-15-181709/same_content.txt',
                ],
            )
            self.assertEqual(redirected_out.stderr, '')
            self.assertEqual(
                rebuild_result,
                RebuildResult(
                    process_count=3,
                    process_size=2000,
                    added_size_count=0,
                    added_hash_count=0,
                    error_count=0,
                    hash_verified_count=1,  # Existing file verified successfully
                    hash_mismatch_count=0,
                    hash_not_found_count=1,  # One file added
                    unique_inode_count=4,
                    skip_by_inode_count=0,
                ),
                '\n'.join(logs.output) + redirected_out.stdout,
            )
            self.assertEqual(
                (snapshot_path / 'SHA256SUMS').read_text(),
                textwrap.dedent("""\
                    bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8 file1.txt
                    bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8 same_content.txt
                """),
            )

            #######################################################################################
            # Test error handling

            def rebuild_one_file_mock(*, entry, **kwargs):
                # Fail only for 'file1.txt', delegate everything else to the real implementation:
                if entry.name == 'file1.txt':
                    raise IOError('Bam!')
                return rebuild_one_file(entry=entry, **kwargs)

            with (
                self.assertLogs('PyHardLinkBackup', level=logging.ERROR) as logs,
                RedirectOut() as redirected_out,
                patch.object(rebuild_databases, 'rebuild_one_file', rebuild_one_file_mock),
            ):
                rebuild_result = rebuild(backup_root, skip_same_inode=True, log_manager=NoopLoggingManager())
            logs = ''.join(logs.output)
            # IOError is an alias of OSError, hence the 'OSError' in the log output:
            self.assertIn(f'Backup {snapshot_path}/file1.txt OSError: Bam!\n', logs)
            self.assertIn('\nTraceback (most recent call last):\n', logs)
            self.assertEqual(redirected_out.stderr, '')

            self.assertEqual(
                rebuild_result,
                RebuildResult(
                    process_count=2,
                    process_size=1000,
                    added_size_count=0,
                    added_hash_count=0,
                    error_count=1,  # <<< one file caused error
                    hash_verified_count=1,
                    hash_mismatch_count=0,
                    hash_not_found_count=0,
                    unique_inode_count=3,
                    skip_by_inode_count=0,
                ),
            )

    def test_skip_same_inode(self):
        """Hardlinked duplicates must be skipped (counted via inode), symlinks must not be processed as files."""
        with TemporaryDirectoryPath() as temp_path:
            backup_root = temp_path / 'backup'
            backup_root.mkdir()
            (backup_root / '.phlb').mkdir()

            # NOTE(review): variable is named "file1_path" but the file is 'file2.txt' -- confirm intent.
            file1_path = backup_root / 'file2.txt'
            file1_path.write_text('123456')

            (backup_root / 'symlink2file1.txt').symlink_to(file1_path)
            os.link(file1_path, backup_root / 'hardlink2file1')

            with (
                self.assertLogs('PyHardLinkBackup', level=logging.DEBUG),
                RedirectOut() as redirected_out,
            ):
                rebuild_result = rebuild(backup_root, skip_same_inode=True, log_manager=NoopLoggingManager())
            self.assertEqual(redirected_out.stderr, '')
            self.assertEqual(
                rebuild_result,
                RebuildResult(
                    process_count=1,
                    process_size=6,
                    added_size_count=0,
                    added_hash_count=0,
                    error_count=0,
                    hash_verified_count=0,
                    hash_mismatch_count=0,
                    hash_not_found_count=0,
                    unique_inode_count=2,
                    skip_by_inode_count=1,
                ),
                redirected_out.stdout,
            )
File without changes
@@ -0,0 +1,62 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+
5
class HashAlreadyExistsError(ValueError):
    """Raised by FileHashDatabase.__setitem__() when the hash entry already exists (overwriting is denied)."""

    pass
7
+
8
+
9
+ class FileHashDatabase:
10
+ """DocWrite: README.md ## FileHashDatabase
11
+ A simple "database" to store file content hash <-> relative path mappings.
12
+ Uses a directory structure to avoid too many files in a single directory.
13
+ Path structure:
14
+ {base_dst}/.phlb/hash-lookup/{XX}/{YY}/{hash}
15
+ e.g.:
16
+ hash '12ab000a1b2c3...' results in: {base_dst}/.phlb/hash-lookup/12/ab/12ab000a1b2c3...
17
+
18
+ Notes:
19
+ * Hash length will be not validated, so it can be used with any hash algorithm.
20
+ * The "relative path" that will be stored is not validated, so it can be any string.
21
+ * We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.
22
+ """
23
+
24
+ def __init__(self, backup_root: Path, phlb_conf_dir: Path):
25
+ self.backup_root = backup_root
26
+ self.base_path = phlb_conf_dir / 'hash-lookup'
27
+ self.base_path.mkdir(parents=False, exist_ok=True)
28
+
29
+ def _get_hash_path(self, hash: str) -> Path:
30
+ first_dir_name = hash[:2]
31
+ second_dir_name = hash[2:4]
32
+ hash_path = self.base_path / first_dir_name / second_dir_name / hash
33
+ return hash_path
34
+
35
+ def __contains__(self, hash: str) -> bool:
36
+ hash_path = self._get_hash_path(hash)
37
+ return hash_path.exists()
38
+
39
+ def get(self, hash: str) -> Path | None:
40
+ hash_path = self._get_hash_path(hash)
41
+ try:
42
+ rel_file_path = hash_path.read_text()
43
+ except FileNotFoundError:
44
+ return None
45
+ else:
46
+ abs_file_path = self.backup_root / rel_file_path
47
+ if not abs_file_path.is_file():
48
+ logging.warning('Hash database entry found, but file does not exist: %s', abs_file_path)
49
+ hash_path.unlink()
50
+ return None
51
+ return abs_file_path
52
+
53
+ def __setitem__(self, hash: str, abs_file_path: Path):
54
+ hash_path = self._get_hash_path(hash)
55
+ hash_path.parent.mkdir(parents=True, exist_ok=True)
56
+
57
+ # File should be found before and results in hardlink creation!
58
+ # So deny change of existing hashes:
59
+ if hash_path.exists():
60
+ raise HashAlreadyExistsError(f'Hash {hash} already exists in the database!')
61
+
62
+ hash_path.write_text(str(abs_file_path.relative_to(self.backup_root)))