pyhardlinkbackup-1.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. PyHardLinkBackup/__init__.py +7 -0
  2. PyHardLinkBackup/__main__.py +10 -0
  3. PyHardLinkBackup/backup.py +261 -0
  4. PyHardLinkBackup/cli_app/__init__.py +41 -0
  5. PyHardLinkBackup/cli_app/phlb.py +123 -0
  6. PyHardLinkBackup/cli_dev/__init__.py +70 -0
  7. PyHardLinkBackup/cli_dev/benchmark.py +138 -0
  8. PyHardLinkBackup/cli_dev/code_style.py +12 -0
  9. PyHardLinkBackup/cli_dev/packaging.py +65 -0
  10. PyHardLinkBackup/cli_dev/shell_completion.py +23 -0
  11. PyHardLinkBackup/cli_dev/testing.py +52 -0
  12. PyHardLinkBackup/cli_dev/update_readme_history.py +33 -0
  13. PyHardLinkBackup/compare_backup.py +212 -0
  14. PyHardLinkBackup/constants.py +16 -0
  15. PyHardLinkBackup/logging_setup.py +124 -0
  16. PyHardLinkBackup/rebuild_databases.py +176 -0
  17. PyHardLinkBackup/tests/__init__.py +36 -0
  18. PyHardLinkBackup/tests/test_backup.py +628 -0
  19. PyHardLinkBackup/tests/test_compare_backup.py +86 -0
  20. PyHardLinkBackup/tests/test_doc_write.py +26 -0
  21. PyHardLinkBackup/tests/test_doctests.py +10 -0
  22. PyHardLinkBackup/tests/test_project_setup.py +46 -0
  23. PyHardLinkBackup/tests/test_readme.py +75 -0
  24. PyHardLinkBackup/tests/test_readme_history.py +9 -0
  25. PyHardLinkBackup/tests/test_rebuild_database.py +224 -0
  26. PyHardLinkBackup/utilities/__init__.py +0 -0
  27. PyHardLinkBackup/utilities/file_hash_database.py +62 -0
  28. PyHardLinkBackup/utilities/file_size_database.py +46 -0
  29. PyHardLinkBackup/utilities/filesystem.py +158 -0
  30. PyHardLinkBackup/utilities/humanize.py +39 -0
  31. PyHardLinkBackup/utilities/rich_utils.py +99 -0
  32. PyHardLinkBackup/utilities/sha256sums.py +61 -0
  33. PyHardLinkBackup/utilities/tee.py +40 -0
  34. PyHardLinkBackup/utilities/tests/__init__.py +0 -0
  35. PyHardLinkBackup/utilities/tests/test_file_hash_database.py +143 -0
  36. PyHardLinkBackup/utilities/tests/test_file_size_database.py +138 -0
  37. PyHardLinkBackup/utilities/tests/test_filesystem.py +126 -0
  38. PyHardLinkBackup/utilities/tyro_cli_shared_args.py +12 -0
  39. pyhardlinkbackup-1.5.0.dist-info/METADATA +600 -0
  40. pyhardlinkbackup-1.5.0.dist-info/RECORD +42 -0
  41. pyhardlinkbackup-1.5.0.dist-info/WHEEL +4 -0
  42. pyhardlinkbackup-1.5.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,86 @@
+ import shutil
+ import tempfile
+ from pathlib import Path
+ from unittest import TestCase
+
+ from bx_py_utils.test_utils.redirect import RedirectOut
+ from cli_base.cli_tools.test_utils.base_testcases import OutputMustCapturedTestCaseMixin
+
+ from PyHardLinkBackup.compare_backup import CompareResult, LoggingManager, compare_tree
+ from PyHardLinkBackup.logging_setup import DEFAULT_LOG_FILE_LEVEL
+ from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
+ from PyHardLinkBackup.utilities.filesystem import hash_file
+
+
+ class CompareBackupTestCase(OutputMustCapturedTestCaseMixin, TestCase):
+     def test_happy_path(self):
+         with tempfile.TemporaryDirectory() as src_dir, tempfile.TemporaryDirectory() as backup_dir:
+             src_root = Path(src_dir).resolve()
+             backup_root = Path(backup_dir).resolve()
+
+             # Setup backup structure
+             phlb_conf_dir = backup_root / '.phlb'
+             phlb_conf_dir.mkdir()
+
+             compare_main_dir = backup_root / src_root.name
+             compare_main_dir.mkdir()
+
+             timestamp = '2026-01-17-120000'
+             compare_dir = compare_main_dir / timestamp
+             compare_dir.mkdir()
+
+             # Create source files
+             (src_root / 'small_file.txt').write_text('hello world')
+             (src_root / 'large_file_missing.txt').write_bytes(b'X' * FileSizeDatabase.MIN_SIZE)
+             large_file_in_dbs = src_root / 'large_file_in_dbs.txt'
+             large_file_in_dbs.write_bytes(b'Y' * (FileSizeDatabase.MIN_SIZE + 1))
+
+             # Copy files to backup
+             total_size = 0
+             total_file_count = 0
+             for file_path in src_root.iterdir():
+                 shutil.copy2(file_path, compare_dir / file_path.name)
+                 total_size += file_path.stat().st_size
+                 total_file_count += 1
+             self.assertEqual(total_file_count, 3)
+             self.assertEqual(total_size, 2012)
+
+             # Create databases and add values from 'large_file_in_dbs.txt'
+             size_db = FileSizeDatabase(phlb_conf_dir)
+             size_db.add(FileSizeDatabase.MIN_SIZE + 1)
+             hash_db = FileHashDatabase(backup_root, phlb_conf_dir)
+             src_hash = hash_file(large_file_in_dbs)
+             hash_db[src_hash] = compare_dir / 'large_file_in_dbs.txt'
+
+             # Run compare_tree
+             with RedirectOut() as redirected_out:
+                 result = compare_tree(
+                     src_root=src_root,
+                     backup_root=backup_root,
+                     excludes=(),
+                     log_manager=LoggingManager(
+                         console_level='info',
+                         file_level=DEFAULT_LOG_FILE_LEVEL,
+                     ),
+                 )
+             self.assertEqual(redirected_out.stderr, '')
+             self.assertIn('Compare completed.', redirected_out.stdout)
+             self.assertEqual(
+                 result,
+                 CompareResult(
+                     compare_dir=compare_dir,
+                     log_file=result.log_file,
+                     total_file_count=total_file_count,
+                     total_size=total_size,
+                     src_file_new_count=0,
+                     file_size_missmatch=0,
+                     file_hash_missmatch=0,
+                     small_file_count=1,
+                     size_db_missing_count=1,
+                     hash_db_missing_count=1,
+                     successful_file_count=total_file_count,
+                     error_count=0,
+                 ),
+                 redirected_out.stdout,
+             )
@@ -0,0 +1,26 @@
+ from unittest import TestCase
+
+ from bx_py_utils.doc_write.api import GeneratedInfo, generate
+ from bx_py_utils.path import assert_is_file
+
+ from PyHardLinkBackup.cli_dev import PACKAGE_ROOT
+
+
+ class DocuWriteApiTestCase(TestCase):
+     def test_up2date_docs(self):
+         """DocWrite: about-docs.md # generate Doc-Write
+
+         These documentation files are generated automatically with the "Doc-Write" tool.
+         They are updated automatically by unittests.
+
+         More information about Doc-Write can be found here:
+
+         https://github.com/boxine/bx_py_utils/tree/master/bx_py_utils/doc_write
+         """
+         assert_is_file(PACKAGE_ROOT / 'pyproject.toml')
+
+         with self.assertLogs():
+             info: GeneratedInfo = generate(base_path=PACKAGE_ROOT)
+         self.assertGreaterEqual(len(info.paths), 1)
+         self.assertEqual(info.update_count, 0, 'No files should be updated, commit the changes')
+         self.assertEqual(info.remove_count, 0, 'No files should be removed, commit the changes')
@@ -0,0 +1,10 @@
+ from bx_py_utils.test_utils.unittest_utils import BaseDocTests
+
+ import PyHardLinkBackup
+
+
+ class DocTests(BaseDocTests):
+     def test_doctests(self):
+         self.run_doctests(
+             modules=(PyHardLinkBackup,),
+         )
@@ -0,0 +1,46 @@
+ import subprocess
+ from unittest import TestCase
+
+ from bx_py_utils.path import assert_is_file
+ from cli_base.cli_tools.code_style import assert_code_style
+ from cli_base.cli_tools.subprocess_utils import ToolsExecutor
+ from manageprojects.test_utils.project_setup import check_editor_config, get_py_max_line_length
+ from packaging.version import Version
+
+ from PyHardLinkBackup import __version__
+ from PyHardLinkBackup.cli_dev import PACKAGE_ROOT
+
+
+ class ProjectSetupTestCase(TestCase):
+     def test_version(self):
+         self.assertIsNotNone(__version__)
+
+         version = Version(__version__)  # Will raise InvalidVersion() if wrongly formatted
+         self.assertEqual(str(version), __version__)
+
+         cli_bin = PACKAGE_ROOT / 'cli.py'
+         assert_is_file(cli_bin)
+
+         output = subprocess.check_output([cli_bin, 'version'], text=True)
+         self.assertIn(f'PyHardLinkBackup v{__version__}', output)
+
+         dev_cli_bin = PACKAGE_ROOT / 'dev-cli.py'
+         assert_is_file(dev_cli_bin)
+
+         output = subprocess.check_output([dev_cli_bin, 'version'], text=True)
+         self.assertIn(f'PyHardLinkBackup v{__version__}', output)
+
+     def test_code_style(self):
+         return_code = assert_code_style(package_root=PACKAGE_ROOT)
+         self.assertEqual(return_code, 0, 'Code style error, see output above!')
+
+     def test_check_editor_config(self):
+         check_editor_config(package_root=PACKAGE_ROOT)
+
+         max_line_length = get_py_max_line_length(package_root=PACKAGE_ROOT)
+         self.assertEqual(max_line_length, 119)
+
+     def test_pre_commit_hooks(self):
+         executor = ToolsExecutor(cwd=PACKAGE_ROOT)
+         for command in ('migrate-config', 'validate-config', 'validate-manifest'):
+             executor.verbose_check_call('pre-commit', command, exit_on_error=True)
@@ -0,0 +1,75 @@
+ from bx_py_utils.auto_doc import assert_readme_block
+ from bx_py_utils.path import assert_is_file
+ from cli_base.cli_tools.test_utils.rich_test_utils import NoColorEnvRich, invoke
+ from manageprojects.tests.base import BaseTestCase
+
+ from PyHardLinkBackup import constants
+ from PyHardLinkBackup.cli_dev import PACKAGE_ROOT
+
+
+ def assert_cli_help_in_readme(text_block: str, marker: str):
+     README_PATH = PACKAGE_ROOT / 'README.md'
+     assert_is_file(README_PATH)
+
+     text_block = text_block.replace(constants.CLI_EPILOG, '')
+     text_block = f'```\n{text_block.strip()}\n```'
+     assert_readme_block(
+         readme_path=README_PATH,
+         text_block=text_block,
+         start_marker_line=f'[comment]: <> (✂✂✂ auto generated {marker} start ✂✂✂)',
+         end_marker_line=f'[comment]: <> (✂✂✂ auto generated {marker} end ✂✂✂)',
+     )
+
+
+ class ReadmeTestCase(BaseTestCase):
+
+     def test_main_help(self):
+         with NoColorEnvRich():
+             stdout = invoke(cli_bin=PACKAGE_ROOT / 'cli.py', args=['--help'], strip_line_prefix='usage: ')
+
+         self.assert_in_content(
+             got=stdout,
+             parts=(
+                 'usage: ./cli.py [-h]',
+                 ' version ',
+                 'Print version and exit',
+                 constants.CLI_EPILOG,
+             ),
+         )
+
+         # Installed via pipx is called 'phlb', not 'cli.py':
+         stdout = stdout.replace('./cli.py', 'phlb')
+
+         assert_cli_help_in_readme(text_block=stdout, marker='main help')
+
+     def test_backup_help(self):
+         with NoColorEnvRich():
+             stdout = invoke(cli_bin=PACKAGE_ROOT / 'cli.py', args=['backup', '--help'], strip_line_prefix='usage: ')
+         self.assert_in_content(
+             got=stdout,
+             parts=(
+                 'usage: ./cli.py backup [-h] ',
+                 'Backup the source directory to the destination',
+             ),
+         )
+
+         # Installed via pipx is called 'phlb', not 'cli.py':
+         stdout = stdout.replace('./cli.py', 'phlb')
+
+         assert_cli_help_in_readme(text_block=stdout, marker='backup help')
+
+     def test_dev_help(self):
+         with NoColorEnvRich():
+             stdout = invoke(cli_bin=PACKAGE_ROOT / 'dev-cli.py', args=['--help'], strip_line_prefix='usage: ')
+         self.assert_in_content(
+             got=stdout,
+             parts=(
+                 'usage: ./dev-cli.py [-h]',
+                 ' lint ',
+                 ' coverage ',
+                 ' update-readme-history ',
+                 ' publish ',
+                 constants.CLI_EPILOG,
+             ),
+         )
+         assert_cli_help_in_readme(text_block=stdout, marker='dev help')
@@ -0,0 +1,9 @@
+ from unittest import TestCase
+
+ from cli_base.cli_tools.git_history import update_readme_history
+
+
+ class ReadmeHistoryTestCase(TestCase):
+     def test_readme_history(self):
+         with self.assertLogs():
+             update_readme_history(raise_update_error=True)
@@ -0,0 +1,224 @@
+ import logging
+ import tempfile
+ import textwrap
+ from pathlib import Path
+ from unittest.mock import patch
+
+ from bx_py_utils.test_utils.redirect import RedirectOut
+ from cli_base.cli_tools.test_utils.base_testcases import BaseTestCase
+ from freezegun import freeze_time
+
+ from PyHardLinkBackup import rebuild_databases
+ from PyHardLinkBackup.logging_setup import NoopLoggingManager
+ from PyHardLinkBackup.rebuild_databases import RebuildResult, rebuild, rebuild_one_file
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
+
+
+ def sorted_rglob_paths(path: Path):
+     return sorted([str(p.relative_to(path)) for p in path.rglob('*')])
+
+
+ def sorted_rglob_files(path: Path):
+     return sorted([str(p.relative_to(path)) for p in path.rglob('*') if p.is_file()])
+
+
+ class RebuildDatabaseTestCase(BaseTestCase):
+     maxDiff = None
+
+     def test_happy_path(self):
+         with tempfile.TemporaryDirectory() as temp_dir:
+             temp_path = Path(temp_dir).resolve()
+
+             backup_root = temp_path / 'backup'
+
+             with self.assertRaises(SystemExit), RedirectOut() as redirected_out:
+                 rebuild(backup_root, log_manager=NoopLoggingManager())
+
+             self.assertEqual(redirected_out.stderr, '')
+             self.assertEqual(redirected_out.stdout, f'Error: Backup directory "{backup_root}" does not exist!\n')
+
+             backup_root.mkdir()
+
+             with self.assertRaises(SystemExit), RedirectOut() as redirected_out:
+                 rebuild(backup_root, log_manager=NoopLoggingManager())
+
+             self.assertEqual(redirected_out.stderr, '')
+             self.assertIn('hidden ".phlb" configuration directory is missing', redirected_out.stdout)
+
+             phlb_conf_dir = backup_root / '.phlb'
+             phlb_conf_dir.mkdir()
+
+             #######################################################################################
+             # Run on empty backup directory:
+
+             self.assertEqual(sorted_rglob_paths(backup_root), ['.phlb'])
+
+             with (
+                 self.assertLogs('PyHardLinkBackup', level=logging.DEBUG),
+                 RedirectOut() as redirected_out,
+                 freeze_time('2026-01-16T12:34:56Z', auto_tick_seconds=0),
+             ):
+                 rebuild_result = rebuild(backup_root, log_manager=NoopLoggingManager())
+             self.assertEqual(
+                 rebuild_result,
+                 RebuildResult(
+                     process_count=0,
+                     process_size=0,
+                     added_size_count=0,
+                     added_hash_count=0,
+                     error_count=0,
+                 ),
+             )
+             self.assertEqual(
+                 sorted_rglob_paths(backup_root),
+                 [
+                     '.phlb',
+                     '.phlb/hash-lookup',
+                     '.phlb/size-lookup',
+                     '2026-01-16-123456-rebuild-summary.txt',
+                 ],
+             )
+             self.assertEqual(redirected_out.stderr, '')
+
+             #######################################################################################
+             # Add one backed-up file and run again:
+
+             snapshot_path = backup_root / 'source-name' / '2026-01-15-181709'
+             snapshot_path.mkdir(parents=True)
+
+             minimum_file_content = 'X' * FileSizeDatabase.MIN_SIZE
+             (snapshot_path / 'file1.txt').write_text(minimum_file_content)
+
+             with (
+                 self.assertLogs('PyHardLinkBackup', level=logging.DEBUG),
+                 RedirectOut() as redirected_out,
+                 freeze_time('2026-01-16T12:34:56Z', auto_tick_seconds=0),
+             ):
+                 rebuild_result = rebuild(backup_root, log_manager=NoopLoggingManager())
+             self.assertEqual(
+                 sorted_rglob_paths(backup_root),
+                 [
+                     '.phlb',
+                     '.phlb/hash-lookup',
+                     '.phlb/hash-lookup/bb',
+                     '.phlb/hash-lookup/bb/c4',
+                     '.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
+                     '.phlb/size-lookup',
+                     '.phlb/size-lookup/10',
+                     '.phlb/size-lookup/10/00',
+                     '.phlb/size-lookup/10/00/1000',
+                     '2026-01-16-123456-rebuild-summary.txt',
+                     'source-name',
+                     'source-name/2026-01-15-181709',
+                     'source-name/2026-01-15-181709/SHA256SUMS',
+                     'source-name/2026-01-15-181709/file1.txt',
+                 ],
+             )
+             self.assertEqual(
+                 sorted_rglob_files(backup_root),
+                 [
+                     '.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
+                     '.phlb/size-lookup/10/00/1000',
+                     '2026-01-16-123456-rebuild-summary.txt',
+                     'source-name/2026-01-15-181709/SHA256SUMS',
+                     'source-name/2026-01-15-181709/file1.txt',
+                 ],
+             )
+             self.assertEqual(redirected_out.stderr, '')
+             self.assertEqual(
+                 rebuild_result,
+                 RebuildResult(
+                     process_count=1,
+                     process_size=1000,
+                     added_size_count=1,
+                     added_hash_count=1,
+                     error_count=0,
+                     hash_verified_count=0,
+                     hash_mismatch_count=0,
+                     hash_not_found_count=1,
+                 ),
+             )
+             self.assertEqual(
+                 (snapshot_path / 'SHA256SUMS').read_text(),
+                 'bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8 file1.txt\n',
+             )
+
+             #######################################################################################
+             # Add a file with the same content and run again:
+
+             minimum_file_content = 'X' * FileSizeDatabase.MIN_SIZE
+             (snapshot_path / 'same_content.txt').write_text(minimum_file_content)
+
+             with (
+                 self.assertLogs('PyHardLinkBackup', level=logging.DEBUG) as logs,
+                 RedirectOut() as redirected_out,
+                 freeze_time('2026-01-16T12:34:56Z', auto_tick_seconds=0),
+             ):
+                 rebuild_result = rebuild(backup_root, log_manager=NoopLoggingManager())
+             # No new hash or size entries, just the new file:
+             self.assertEqual(
+                 sorted_rglob_files(backup_root),
+                 [
+                     '.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
+                     '.phlb/size-lookup/10/00/1000',
+                     '2026-01-16-123456-rebuild-summary.txt',
+                     'source-name/2026-01-15-181709/SHA256SUMS',
+                     'source-name/2026-01-15-181709/file1.txt',
+                     'source-name/2026-01-15-181709/same_content.txt',
+                 ],
+             )
+             self.assertEqual(redirected_out.stderr, '')
+             self.assertEqual(
+                 rebuild_result,
+                 RebuildResult(
+                     process_count=3,
+                     process_size=2000,
+                     added_size_count=0,
+                     added_hash_count=0,
+                     error_count=0,
+                     hash_verified_count=1,  # Existing file verified successfully
+                     hash_mismatch_count=0,
+                     hash_not_found_count=1,  # One file added
+                 ),
+                 '\n'.join(logs.output) + redirected_out.stdout,
+             )
+             self.assertEqual(
+                 (snapshot_path / 'SHA256SUMS').read_text(),
+                 textwrap.dedent("""\
+                     bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8 file1.txt
+                     bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8 same_content.txt
+                 """),
+             )
+
+             #######################################################################################
+             # Test error handling
+
+             def rebuild_one_file_mock(*, entry, **kwargs):
+                 if entry.name == 'file1.txt':
+                     raise IOError('Bam!')
+                 return rebuild_one_file(entry=entry, **kwargs)
+
+             with (
+                 self.assertLogs('PyHardLinkBackup', level=logging.ERROR) as logs,
+                 RedirectOut() as redirected_out,
+                 patch.object(rebuild_databases, 'rebuild_one_file', rebuild_one_file_mock),
+             ):
+                 rebuild_result = rebuild(backup_root, log_manager=NoopLoggingManager())
+             logs = ''.join(logs.output)
+             self.assertIn(f'Backup {snapshot_path}/file1.txt OSError: Bam!\n', logs)
+             self.assertIn('\nTraceback (most recent call last):\n', logs)
+             self.assertEqual(redirected_out.stderr, '')
+
+             self.assertEqual(
+                 rebuild_result,
+                 RebuildResult(
+                     process_count=2,
+                     process_size=1000,
+                     added_size_count=0,
+                     added_hash_count=0,
+                     error_count=1,  # <<< one file caused an error
+                     hash_verified_count=1,
+                     hash_mismatch_count=0,
+                     hash_not_found_count=0,
+                 ),
+             )
File without changes
@@ -0,0 +1,62 @@
+ import logging
+ from pathlib import Path
+
+
+ class HashAlreadyExistsError(ValueError):
+     pass
+
+
+ class FileHashDatabase:
+     """DocWrite: README.md ## FileHashDatabase
+     A simple "database" to store file content hash <-> relative path mappings.
+     Uses a directory structure to avoid too many files in a single directory.
+     Path structure:
+     {base_dst}/.phlb/hash-lookup/{XX}/{YY}/{hash}
+     e.g.:
+     hash '12ab000a1b2c3...' results in: {base_dst}/.phlb/hash-lookup/12/ab/12ab000a1b2c3...
+
+     Notes:
+     * The hash length is not validated, so any hash algorithm can be used.
+     * The stored "relative path" is not validated, so it can be any string.
+     * We don't "cache" anything in memory, to avoid high memory consumption for large datasets.
+     """
+
+     def __init__(self, backup_root: Path, phlb_conf_dir: Path):
+         self.backup_root = backup_root
+         self.base_path = phlb_conf_dir / 'hash-lookup'
+         self.base_path.mkdir(parents=False, exist_ok=True)
+
+     def _get_hash_path(self, hash: str) -> Path:
+         first_dir_name = hash[:2]
+         second_dir_name = hash[2:4]
+         hash_path = self.base_path / first_dir_name / second_dir_name / hash
+         return hash_path
+
+     def __contains__(self, hash: str) -> bool:
+         hash_path = self._get_hash_path(hash)
+         return hash_path.exists()
+
+     def get(self, hash: str) -> Path | None:
+         hash_path = self._get_hash_path(hash)
+         try:
+             rel_file_path = hash_path.read_text()
+         except FileNotFoundError:
+             return None
+         else:
+             abs_file_path = self.backup_root / rel_file_path
+             if not abs_file_path.is_file():
+                 logging.warning('Hash database entry found, but file does not exist: %s', abs_file_path)
+                 hash_path.unlink()
+                 return None
+             return abs_file_path
+
+     def __setitem__(self, hash: str, abs_file_path: Path):
+         hash_path = self._get_hash_path(hash)
+         hash_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # The file should have been found before and resulted in hardlink creation!
+         # So deny changing existing hashes:
+         if hash_path.exists():
+             raise HashAlreadyExistsError(f'Hash {hash} already exists in the database!')
+
+         hash_path.write_text(str(abs_file_path.relative_to(self.backup_root)))
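
Note: the FileHashDatabase docstring above describes the hash-lookup layout. As a usage sketch only (not part of the package; the snapshot directory and file name are made up for illustration, and hashlib is used directly to keep it self-contained), a round trip with the class looks roughly like this:

    import hashlib
    import tempfile
    from pathlib import Path

    from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase

    with tempfile.TemporaryDirectory() as temp_dir:
        backup_root = Path(temp_dir)
        phlb_conf_dir = backup_root / '.phlb'
        phlb_conf_dir.mkdir()

        # A (hypothetical) backed-up file whose content hash should be registered:
        backed_up_file = backup_root / 'source-name' / '2026-01-17-120000' / 'example.txt'
        backed_up_file.parent.mkdir(parents=True)
        backed_up_file.write_text('hello world')
        content_hash = hashlib.sha256(backed_up_file.read_bytes()).hexdigest()

        hash_db = FileHashDatabase(backup_root, phlb_conf_dir)
        hash_db[content_hash] = backed_up_file  # stored as a path relative to backup_root

        # The entry lands in .phlb/hash-lookup/<hash[:2]>/<hash[2:4]>/<hash>
        assert content_hash in hash_db
        assert hash_db.get(content_hash) == backed_up_file

Because only the path relative to backup_root is stored, the lookup entries stay valid if the backup tree is later mounted at a different location.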
@@ -0,0 +1,46 @@
+ from pathlib import Path
+
+
+ class FileSizeDatabase:
+     """DocWrite: README.md ## FileSizeDatabase
+     A simple "database" to track which file sizes have been seen.
+
+     Uses a directory structure to avoid too many files in a single directory.
+     We don't "cache" anything in memory, to avoid high memory consumption for large datasets.
+     """
+
+     MIN_SIZE = 1000  # No padding is done, so the minimum size is 1000 bytes!
+
+     def __init__(self, phlb_conf_dir: Path):
+         self.base_path = phlb_conf_dir / 'size-lookup'
+         self.base_path.mkdir(parents=False, exist_ok=True)
+
+     def _get_size_path(self, size: int) -> Path:
+         assert size >= self.MIN_SIZE, f'Size must be at least {self.MIN_SIZE} bytes'
+         size_str = str(size)
+
+         """DocWrite: README.md ## FileSizeDatabase
+         Path structure:
+         * `{base_dst}/.phlb/size-lookup/{XX}/{YY}/{size}`
+
+         e.g.:
+
+         * `1234567890` results in: `{base_dst}/.phlb/size-lookup/12/34/1234567890`
+         """
+         first_dir_name = size_str[:2]
+         second_dir_name = size_str[2:4]
+         size_path = self.base_path / first_dir_name / second_dir_name / size_str
+         return size_path
+
+     def __contains__(self, size: int) -> bool:
+         size_path = self._get_size_path(size)
+         return size_path.exists()
+
+     def add(self, size: int):
+         size_path = self._get_size_path(size)
+         if not size_path.exists():
+             size_path.parent.mkdir(parents=True, exist_ok=True)
+
+             """DocWrite: README.md ## FileSizeDatabase
+             All files are created empty, as we only care about their existence."""
+             size_path.touch(exist_ok=False)
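
Note: as a usage sketch only (not part of the package; the temporary directory stands in for the real '.phlb' configuration directory), the size sharding described in the docstrings above plays out like this:

    import tempfile
    from pathlib import Path

    from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase

    with tempfile.TemporaryDirectory() as temp_dir:
        phlb_conf_dir = Path(temp_dir)  # stands in for the real '.phlb' directory

        size_db = FileSizeDatabase(phlb_conf_dir)

        size_db.add(1234567890)
        # Creates the empty marker file: {phlb_conf_dir}/size-lookup/12/34/1234567890

        assert 1234567890 in size_db  # this size has been seen
        assert 4321 not in size_db  # no marker file exists for this size

        size_db.add(1234567890)  # adding a known size again is a no-op

The MIN_SIZE assert matches the sharding scheme: the first four digits of the size string are used as directory names, so sizes below 1000 bytes are rejected.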