PyHardLinkBackup 1.1.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.pre-commit-config.yaml +1 -1
  2. pyhardlinkbackup-1.1.0/README.md → pyhardlinkbackup-1.3.0/PKG-INFO +55 -12
  3. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/__init__.py +1 -1
  4. pyhardlinkbackup-1.3.0/PyHardLinkBackup/backup.py +239 -0
  5. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_app/phlb.py +21 -0
  6. pyhardlinkbackup-1.3.0/PyHardLinkBackup/rebuild_databases.py +147 -0
  7. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_backup.py +68 -2
  8. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_doc_write.py +2 -1
  9. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_readme_history.py +2 -1
  10. pyhardlinkbackup-1.3.0/PyHardLinkBackup/tests/test_rebuild_database.py +204 -0
  11. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/file_hash_database.py +4 -0
  12. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/filesystem.py +26 -1
  13. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/rich_utils.py +14 -13
  14. pyhardlinkbackup-1.3.0/PyHardLinkBackup/utilities/sha256sums.py +61 -0
  15. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/tests/test_file_hash_database.py +3 -1
  16. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/tests/test_file_size_database.py +1 -1
  17. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/tests/test_filesystem.py +34 -2
  18. pyhardlinkbackup-1.1.0/PKG-INFO → pyhardlinkbackup-1.3.0/README.md +40 -27
  19. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/pyproject.toml +1 -1
  20. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/uv.lock +80 -80
  21. pyhardlinkbackup-1.1.0/PyHardLinkBackup/backup.py +0 -229
  22. pyhardlinkbackup-1.1.0/PyHardLinkBackup/utilities/tests/base_testcases.py +0 -88
  23. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.editorconfig +0 -0
  24. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.github/workflows/tests.yml +0 -0
  25. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.gitignore +0 -0
  26. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.idea/.gitignore +0 -0
  27. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.pre-commit-hooks.yaml +0 -0
  28. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/Template Python tests.run.xml +0 -0
  29. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/Unittests - __all__.run.xml +0 -0
  30. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/cli.py --help.run.xml +0 -0
  31. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/dev-cli update.run.xml +0 -0
  32. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/only DocTests.run.xml +0 -0
  33. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/only DocWrite.run.xml +0 -0
  34. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.venv-app/lib/python3.12/site-packages/cli_base/tests/shell_complete_snapshots/.gitignore +0 -0
  35. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/__main__.py +0 -0
  36. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_app/__init__.py +0 -0
  37. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/__init__.py +0 -0
  38. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/benchmark.py +0 -0
  39. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/code_style.py +0 -0
  40. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/packaging.py +0 -0
  41. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/shell_completion.py +0 -0
  42. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/testing.py +0 -0
  43. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/update_readme_history.py +0 -0
  44. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/constants.py +0 -0
  45. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/__init__.py +0 -0
  46. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_doctests.py +0 -0
  47. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_project_setup.py +0 -0
  48. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_readme.py +0 -0
  49. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/__init__.py +0 -0
  50. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/file_size_database.py +0 -0
  51. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/humanize.py +0 -0
  52. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/tests/__init__.py +0 -0
  53. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/cli.py +0 -0
  54. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/dev-cli.py +0 -0
  55. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/dist/.gitignore +0 -0
  56. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/docs/README.md +0 -0
  57. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/docs/about-docs.md +0 -0
  58. {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/noxfile.py +0 -0
@@ -2,6 +2,6 @@
2
2
  # See https://pre-commit.com for more information
3
3
  repos:
4
4
  - repo: https://github.com/jedie/cli-base-utilities
5
- rev: v0.26.0
5
+ rev: v0.27.0
6
6
  hooks:
7
7
  - id: update-readme-history
@@ -1,3 +1,18 @@
1
+ Metadata-Version: 2.4
2
+ Name: PyHardLinkBackup
3
+ Version: 1.3.0
4
+ Summary: HardLink/Deduplication Backups with Python
5
+ Project-URL: Documentation, https://github.com/jedie/PyHardLinkBackup
6
+ Project-URL: Source, https://github.com/jedie/PyHardLinkBackup
7
+ Author-email: Jens Diemer <PyHardLinkBackup@jensdiemer.de>
8
+ License: GPL-3.0-or-later
9
+ Requires-Python: >=3.12
10
+ Requires-Dist: bx-py-utils
11
+ Requires-Dist: cli-base-utilities>=0.27.0
12
+ Requires-Dist: rich
13
+ Requires-Dist: tyro
14
+ Description-Content-Type: text/markdown
15
+
1
16
  # PyHardLinkBackup
2
17
 
3
18
  [![tests](https://github.com/jedie/PyHardLinkBackup/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/jedie/PyHardLinkBackup/actions/workflows/tests.yml)
@@ -10,6 +25,8 @@ HardLink/Deduplication Backups with Python
10
25
 
11
26
  **WIP:** v1.0.0 is a complete rewrite of PyHardLinkBackup.
12
27
 
28
+ It's similar to `rsync --link-dest` but the deduplication is done globally for all backups and all paths.
29
+
13
30
  ## installation
14
31
 
15
32
  You can use [pipx](https://pipx.pypa.io/stable/installation/) to install and use PyHardLinkBackup, e.g.:
@@ -46,22 +63,36 @@ Backup the source directory to the destination directory using hard links for de
46
63
 
47
64
 
48
65
 
66
+ Running a backup looks like:
67
+
68
+ ![2026-01-15-phlb1.png](https://raw.githubusercontent.com/jedie/jedie.github.io/main/screenshots/PyHardLinkBackup/2026-01-15-phlb1.png "2026-01-15-phlb1.png")
69
+
70
+
71
+
72
+ If it's finished it display a summary:
73
+
74
+ ![2026-01-15-phlb2.png](https://raw.githubusercontent.com/jedie/jedie.github.io/main/screenshots/PyHardLinkBackup/2026-01-15-phlb2.png "2026-01-15-phlb2.png")
75
+
76
+
77
+
49
78
  complete help for main CLI app:
50
79
 
51
80
  [comment]: <> (✂✂✂ auto generated main help start ✂✂✂)
52
81
  ```
53
- usage: phlb [-h] {backup,version}
82
+ usage: phlb [-h] {backup,rebuild,version}
54
83
 
55
84
 
56
85
 
57
- ╭─ options ─────────────────────────────────────────────────────────────────────────────────────────────────╮
58
- │ -h, --help show this help message and exit
59
- ╰───────────────────────────────────────────────────────────────────────────────────────────────────────────╯
60
- ╭─ subcommands ─────────────────────────────────────────────────────────────────────────────────────────────╮
61
- │ (required)
62
- │ • backup Backup the source directory to the destination directory using hard links for deduplication.
63
- │ • version Print version and exit
64
- ╰───────────────────────────────────────────────────────────────────────────────────────────────────────────╯
86
+ ╭─ options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
87
+ │ -h, --help show this help message and exit
88
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
89
+ ╭─ subcommands ────────────────────────────────────────────────────────────────────────────────────────────────────────╮
90
+ │ (required)
91
+ │ • backup Backup the source directory to the destination directory using hard links for deduplication.
92
+ │ • rebuild Rebuild the file hash and size database by scanning all backup files. And also verify SHA256SUMS and/or
93
+ │ store missing hashes in SHA256SUMS files. │
94
+ │ • version Print version and exit │
95
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
65
96
  ```
66
97
  [comment]: <> (✂✂✂ auto generated main help end ✂✂✂)
67
98
 
@@ -197,11 +228,26 @@ Overview of main changes:
197
228
 
198
229
  [comment]: <> (✂✂✂ auto generated history start ✂✂✂)
199
230
 
231
+ * [v1.3.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.2.0...v1.3.0)
232
+ * 2026-01-15 - Verify SHA256SUMS files in "rebuild" command, too.
233
+ * 2026-01-15 - Code cleanup: use more generic names for and in BackupProgress
234
+ * 2026-01-15 - Add tests for rebuild
235
+ * 2026-01-15 - Add command to "rebuld" the size and hash filesystem database
236
+ * 2026-01-15 - Add screenshots in the README
237
+ * [v1.2.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.1.0...v1.2.0)
238
+ * 2026-01-15 - Add error handling: Log exception but continue with the backup
239
+ * 2026-01-15 - Check permission and hadlink support on destination path
240
+ * 2026-01-14 - Enhance progress bars
241
+ * 2026-01-14 - A a note to rsync --link-dest
242
+ * 2026-01-14 - Use cli_base.cli_tools.test_utils.base_testcases
200
243
  * [v1.1.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.0.1...v1.1.0)
201
244
  * 2026-01-14 - Change backup timestamp directory to old schema: '%Y-%m-%d-%H%M%S'
202
245
  * 2026-01-14 - Add "Overview of main changes" to README
203
246
  * [v1.0.1](https://github.com/jedie/PyHardLinkBackup/compare/v1.0.0...v1.0.1)
204
247
  * 2026-01-13 - Store SHA256SUMS files in backup directories
248
+
249
+ <details><summary>Expand older history entries ...</summary>
250
+
205
251
  * [v1.0.0](https://github.com/jedie/PyHardLinkBackup/compare/v0.13.0...v1.0.0)
206
252
  * 2026-01-13 - Change "./cli.py" to "phlb" (because it's the name installed via pipx)
207
253
  * 2026-01-13 - Update README
@@ -230,9 +276,6 @@ Overview of main changes:
230
276
  * 2020-03-17 - dynamic chunk size
231
277
  * 2020-03-17 - ignore *.sha512 by default
232
278
  * 2020-03-17 - Update boot_pyhardlinkbackup.sh
233
-
234
- <details><summary>Expand older history entries ...</summary>
235
-
236
279
  * [v0.12.3](https://github.com/jedie/PyHardLinkBackup/compare/v0.12.2...v0.12.3)
237
280
  * 2020-03-17 - update README.rst
238
281
  * 2020-03-17 - don't publish if tests fail
@@ -3,5 +3,5 @@
3
3
  """
4
4
 
5
5
  # See https://packaging.python.org/en/latest/specifications/version-specifiers/
6
- __version__ = '1.1.0'
6
+ __version__ = '1.3.0'
7
7
  __author__ = 'Jens Diemer <PyHardLinkBackup@jensdiemer.de>'
@@ -0,0 +1,239 @@
1
+ import dataclasses
2
+ import logging
3
+ import os
4
+ import shutil
5
+ import sys
6
+ import time
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+
10
+ from rich import print # noqa
11
+
12
+ from PyHardLinkBackup.constants import CHUNK_SIZE
13
+ from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
14
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
15
+ from PyHardLinkBackup.utilities.filesystem import (
16
+ copy_and_hash,
17
+ hash_file,
18
+ humanized_fs_scan,
19
+ iter_scandir_files,
20
+ read_and_hash_file,
21
+ supports_hardlinks,
22
+ )
23
+ from PyHardLinkBackup.utilities.humanize import human_filesize
24
+ from PyHardLinkBackup.utilities.rich_utils import DisplayFileTreeProgress
25
+ from PyHardLinkBackup.utilities.sha256sums import store_hash
26
+
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ @dataclasses.dataclass
32
+ class BackupResult:
33
+ backup_dir: Path
34
+ #
35
+ backup_count: int = 0
36
+ backup_size: int = 0
37
+ #
38
+ symlink_files: int = 0
39
+ hardlinked_files: int = 0
40
+ hardlinked_size: int = 0
41
+ #
42
+ copied_files: int = 0
43
+ copied_size: int = 0
44
+ #
45
+ copied_small_files: int = 0
46
+ copied_small_size: int = 0
47
+ #
48
+ error_count: int = 0
49
+
50
+
51
+ def backup_one_file(
52
+ *,
53
+ src_root: Path,
54
+ entry: os.DirEntry,
55
+ size_db: FileSizeDatabase,
56
+ hash_db: FileHashDatabase,
57
+ backup_dir: Path,
58
+ backup_result: BackupResult,
59
+ ) -> None:
60
+ backup_result.backup_count += 1
61
+ src_path = Path(entry.path)
62
+
63
+ dst_path = backup_dir / src_path.relative_to(src_root)
64
+ dst_dir_path = dst_path.parent
65
+ if not dst_dir_path.exists():
66
+ dst_dir_path.mkdir(parents=True, exist_ok=False)
67
+
68
+ try:
69
+ size = entry.stat().st_size
70
+ except FileNotFoundError:
71
+ # e.g.: Handle broken symlink
72
+ target = os.readlink(src_path)
73
+ dst_path.symlink_to(target)
74
+ backup_result.symlink_files += 1
75
+ return
76
+
77
+ backup_result.backup_size += size
78
+
79
+ if entry.name == 'SHA256SUMS':
80
+ # Skip existing SHA256SUMS files in source tree,
81
+ # because we create our own SHA256SUMS files.
82
+ logger.debug('Skip existing SHA256SUMS file: %s', src_path)
83
+ return
84
+
85
+ if entry.is_symlink():
86
+ logger.debug('Copy symlink: %s to %s', src_path, dst_path)
87
+ target = os.readlink(src_path)
88
+ dst_path.symlink_to(target)
89
+ backup_result.symlink_files += 1
90
+ return
91
+
92
+ # Process regular files
93
+ assert entry.is_file(follow_symlinks=False), f'Unexpected non-file: {src_path}'
94
+
95
+ # Deduplication logic
96
+
97
+ if size < size_db.MIN_SIZE:
98
+ # Small file -> always copy without deduplication
99
+ logger.info('Copy small file: %s to %s', src_path, dst_path)
100
+ file_hash = copy_and_hash(src_path, dst_path)
101
+ backup_result.copied_files += 1
102
+ backup_result.copied_size += size
103
+ backup_result.copied_small_files += 1
104
+ backup_result.copied_small_size += size
105
+ store_hash(dst_path, file_hash)
106
+ return
107
+
108
+ if size in size_db:
109
+ logger.debug('File with size %iBytes found before -> hash: %s', size, src_path)
110
+
111
+ if size <= CHUNK_SIZE:
112
+ # File can be read complete into memory
113
+ logger.debug('File size %iBytes <= CHUNK_SIZE (%iBytes) -> read complete into memory', size, CHUNK_SIZE)
114
+ file_content, file_hash = read_and_hash_file(src_path)
115
+ if existing_path := hash_db.get(file_hash):
116
+ logger.info('Hardlink duplicate file: %s to %s', dst_path, existing_path)
117
+ os.link(existing_path, dst_path)
118
+ backup_result.hardlinked_files += 1
119
+ backup_result.hardlinked_size += size
120
+ else:
121
+ logger.info('Store unique file: %s to %s', src_path, dst_path)
122
+ dst_path.write_bytes(file_content)
123
+ hash_db[file_hash] = dst_path
124
+ backup_result.copied_files += 1
125
+ backup_result.copied_size += size
126
+
127
+ else:
128
+ # Large file
129
+ file_hash = hash_file(src_path) # Calculate hash without copying
130
+
131
+ if existing_path := hash_db.get(file_hash):
132
+ logger.info('Hardlink duplicate file: %s to %s', dst_path, existing_path)
133
+ os.link(existing_path, dst_path)
134
+ backup_result.hardlinked_files += 1
135
+ backup_result.hardlinked_size += size
136
+ else:
137
+ logger.info('Copy unique file: %s to %s', src_path, dst_path)
138
+ hash_db[file_hash] = dst_path
139
+ backup_result.copied_files += 1
140
+ backup_result.copied_size += size
141
+
142
+ # Keep original file metadata (permission bits, time stamps, and flags)
143
+ shutil.copy2(src_path, dst_path)
144
+ else:
145
+ # A file with this size not backuped before -> Can't be duplicate -> copy and hash
146
+ file_hash = copy_and_hash(src_path, dst_path)
147
+ size_db.add(size)
148
+ hash_db[file_hash] = dst_path
149
+ backup_result.copied_files += 1
150
+ backup_result.copied_size += size
151
+
152
+ store_hash(dst_path, file_hash)
153
+
154
+
155
+ def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> BackupResult:
156
+ src_root = src_root.resolve()
157
+ if not src_root.is_dir():
158
+ print('Error: Source directory does not exist!')
159
+ print(f'Please check source directory: "{src_root}"\n')
160
+ sys.exit(1)
161
+
162
+ backup_root = backup_root.resolve()
163
+ if not backup_root.is_dir():
164
+ print('Error: Backup directory does not exist!')
165
+ print(f'Please create "{backup_root}" directory first and start again!\n')
166
+ sys.exit(1)
167
+
168
+ if not os.access(backup_root, os.W_OK):
169
+ print('Error: No write access to backup directory!')
170
+ print(f'Please check permissions for backup directory: "{backup_root}"\n')
171
+ sys.exit(1)
172
+
173
+ if not supports_hardlinks(backup_root):
174
+ print('Error: Filesystem for backup directory does not support hardlinks!')
175
+ print(f'Please check backup directory: "{backup_root}"\n')
176
+ sys.exit(1)
177
+
178
+ # Step 1: Scan source directory:
179
+ src_file_count, src_total_size = humanized_fs_scan(src_root, excludes)
180
+
181
+ phlb_conf_dir = backup_root / '.phlb'
182
+ phlb_conf_dir.mkdir(parents=False, exist_ok=True)
183
+
184
+ backup_dir = backup_root / src_root.name / datetime.now().strftime('%Y-%m-%d-%H%M%S')
185
+ logger.info('Backup %s to %s', src_root, backup_dir)
186
+ backup_dir.mkdir(parents=True, exist_ok=False)
187
+
188
+ print(f'\nBackup to {backup_dir}...\n')
189
+
190
+ with DisplayFileTreeProgress(src_file_count, src_total_size) as progress:
191
+ # "Databases" for deduplication
192
+ size_db = FileSizeDatabase(phlb_conf_dir)
193
+ hash_db = FileHashDatabase(backup_root, phlb_conf_dir)
194
+
195
+ backup_result = BackupResult(backup_dir=backup_dir)
196
+
197
+ next_update = 0
198
+ for entry in iter_scandir_files(src_root, excludes=excludes):
199
+ try:
200
+ backup_one_file(
201
+ src_root=src_root,
202
+ entry=entry,
203
+ size_db=size_db,
204
+ hash_db=hash_db,
205
+ backup_dir=backup_dir,
206
+ backup_result=backup_result,
207
+ )
208
+ except Exception as err:
209
+ logger.exception(f'Backup {entry.path} {err.__class__.__name__}: {err}')
210
+ backup_result.error_count += 1
211
+ else:
212
+ now = time.monotonic()
213
+ if now >= next_update:
214
+ progress.update(
215
+ completed_file_count=backup_result.backup_count, completed_size=backup_result.backup_size
216
+ )
217
+ next_update = now + 0.5
218
+
219
+ # Finalize progress indicator values:
220
+ progress.update(completed_file_count=backup_result.backup_count, completed_size=backup_result.backup_size)
221
+
222
+ print(f'\nBackup complete: {backup_dir} (total size {human_filesize(backup_result.backup_size)})\n')
223
+ print(f' Total files processed: {backup_result.backup_count}')
224
+ print(f' * Symlinked files: {backup_result.symlink_files}')
225
+ print(
226
+ f' * Hardlinked files: {backup_result.hardlinked_files}'
227
+ f' (saved {human_filesize(backup_result.hardlinked_size)})'
228
+ )
229
+ print(f' * Copied files: {backup_result.copied_files} (total {human_filesize(backup_result.copied_size)})')
230
+ print(
231
+ f' of which small (<{size_db.MIN_SIZE} Bytes)'
232
+ f' files: {backup_result.copied_small_files}'
233
+ f' (total {human_filesize(backup_result.copied_small_size)})'
234
+ )
235
+ if backup_result.error_count > 0:
236
+ print(f' Errors during backup: {backup_result.error_count} (see log for details)')
237
+ print()
238
+
239
+ return backup_result
@@ -7,6 +7,7 @@ from cli_base.cli_tools.verbosity import setup_logging
7
7
  from cli_base.tyro_commands import TyroVerbosityArgType
8
8
  from rich import print # noqa
9
9
 
10
+ from PyHardLinkBackup import rebuild_databases
10
11
  from PyHardLinkBackup.backup import backup_tree
11
12
  from PyHardLinkBackup.cli_app import app
12
13
 
@@ -48,3 +49,23 @@ def backup(
48
49
  backup_root=dst,
49
50
  excludes=set(excludes),
50
51
  )
52
+
53
+
54
+ @app.command
55
+ def rebuild(
56
+ backup_root: Annotated[
57
+ Path,
58
+ tyro.conf.arg(
59
+ metavar='backup-directory',
60
+ help='Root directory of the the backups.',
61
+ ),
62
+ ],
63
+ /,
64
+ verbosity: TyroVerbosityArgType = 2,
65
+ ) -> None:
66
+ """
67
+ Rebuild the file hash and size database by scanning all backup files. And also verify SHA256SUMS
68
+ and/or store missing hashes in SHA256SUMS files.
69
+ """
70
+ setup_logging(verbosity=verbosity)
71
+ rebuild_databases.rebuild(backup_root=backup_root)
@@ -0,0 +1,147 @@
1
+ import dataclasses
2
+ import logging
3
+ import os
4
+ import sys
5
+ import time
6
+ from pathlib import Path
7
+
8
+ from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
9
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
10
+ from PyHardLinkBackup.utilities.filesystem import hash_file, humanized_fs_scan, iter_scandir_files
11
+ from PyHardLinkBackup.utilities.humanize import human_filesize
12
+ from PyHardLinkBackup.utilities.rich_utils import DisplayFileTreeProgress
13
+ from PyHardLinkBackup.utilities.sha256sums import check_sha256sums, store_hash
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ @dataclasses.dataclass
20
+ class RebuildResult:
21
+ process_count: int = 0
22
+ process_size: int = 0
23
+ #
24
+ added_size_count: int = 0
25
+ added_hash_count: int = 0
26
+ #
27
+ error_count: int = 0
28
+ #
29
+ hash_verified_count: int = 0
30
+ hash_mismatch_count: int = 0
31
+ hash_not_found_count: int = 0
32
+
33
+
34
+ def rebuild_one_file(
35
+ *,
36
+ entry: os.DirEntry,
37
+ size_db: FileSizeDatabase,
38
+ hash_db: FileHashDatabase,
39
+ rebuild_result: RebuildResult,
40
+ ):
41
+ rebuild_result.process_count += 1
42
+
43
+ if entry.name == 'SHA256SUMS':
44
+ # Skip existing SHA256SUMS files
45
+ return
46
+
47
+ size = entry.stat().st_size
48
+ rebuild_result.process_size += size
49
+
50
+ if size < size_db.MIN_SIZE:
51
+ # Small files will never deduplicate, skip them
52
+ return
53
+
54
+ file_path = Path(entry.path)
55
+ file_hash = hash_file(file_path)
56
+
57
+ if size not in size_db:
58
+ size_db.add(size)
59
+ rebuild_result.added_size_count += 1
60
+
61
+ if file_hash not in hash_db:
62
+ hash_db[file_hash] = file_path
63
+ rebuild_result.added_hash_count += 1
64
+
65
+ # We have calculated the current hash of the file,
66
+ # Let's check if we can verify it, too:
67
+ file_path = Path(entry.path)
68
+ compare_result = check_sha256sums(
69
+ file_path=file_path,
70
+ file_hash=file_hash,
71
+ )
72
+ if compare_result is True:
73
+ rebuild_result.hash_verified_count += 1
74
+ elif compare_result is False:
75
+ rebuild_result.hash_mismatch_count += 1
76
+ elif compare_result is None:
77
+ rebuild_result.hash_not_found_count += 1
78
+ store_hash(
79
+ file_path=file_path,
80
+ file_hash=file_hash,
81
+ )
82
+
83
+
84
+ def rebuild(backup_root: Path) -> RebuildResult:
85
+ backup_root = backup_root.resolve()
86
+ if not backup_root.is_dir():
87
+ print(f'Error: Backup directory "{backup_root}" does not exist!')
88
+ sys.exit(1)
89
+
90
+ phlb_conf_dir = backup_root / '.phlb'
91
+ if not phlb_conf_dir.is_dir():
92
+ print(
93
+ f'Error: Backup directory "{backup_root}" seems to be wrong:'
94
+ f' Our hidden ".phlb" configuration directory is missing!'
95
+ )
96
+ sys.exit(1)
97
+
98
+ file_count, total_size = humanized_fs_scan(backup_root, excludes={'.phlb'})
99
+
100
+ with DisplayFileTreeProgress(file_count, total_size) as progress:
101
+ # "Databases" for deduplication
102
+ size_db = FileSizeDatabase(phlb_conf_dir)
103
+ hash_db = FileHashDatabase(backup_root, phlb_conf_dir)
104
+
105
+ rebuild_result = RebuildResult()
106
+
107
+ next_update = 0
108
+ for entry in iter_scandir_files(backup_root, excludes={'.phlb'}):
109
+ try:
110
+ rebuild_one_file(
111
+ entry=entry,
112
+ size_db=size_db,
113
+ hash_db=hash_db,
114
+ rebuild_result=rebuild_result,
115
+ )
116
+ except Exception as err:
117
+ logger.exception(f'Backup {entry.path} {err.__class__.__name__}: {err}')
118
+ rebuild_result.error_count += 1
119
+ else:
120
+ now = time.monotonic()
121
+ if now >= next_update:
122
+ progress.update(
123
+ completed_file_count=rebuild_result.process_count, completed_size=rebuild_result.process_size
124
+ )
125
+ next_update = now + 0.5
126
+
127
+ # Finalize progress indicator values:
128
+ progress.update(completed_file_count=rebuild_result.process_count, completed_size=rebuild_result.process_size)
129
+
130
+ print(f'\nRebuild "{backup_root}" completed:')
131
+ print(f' Total files processed: {rebuild_result.process_count}')
132
+ print(f' Total size processed: {human_filesize(rebuild_result.process_size)}')
133
+
134
+ print(f' Added file size information entries: {rebuild_result.added_size_count}')
135
+ print(f' Added file hash entries: {rebuild_result.added_hash_count}')
136
+
137
+ if rebuild_result.error_count > 0:
138
+ print(f' Errors during rebuild: {rebuild_result.error_count} (see log for details)')
139
+
140
+ print('\nSHA256SUMS verification results:')
141
+ print(f' Successfully verified files: {rebuild_result.hash_verified_count}')
142
+ print(f' File hash mismatches: {rebuild_result.hash_mismatch_count}')
143
+ print(f' File hashes not found, newly stored: {rebuild_result.hash_not_found_count}')
144
+
145
+ print()
146
+
147
+ return rebuild_result
@@ -13,14 +13,14 @@ from bx_py_utils.test_utils.assertion import assert_text_equal
13
13
  from bx_py_utils.test_utils.datetime import parse_dt
14
14
  from bx_py_utils.test_utils.log_utils import NoLogs
15
15
  from bx_py_utils.test_utils.redirect import RedirectOut
16
+ from cli_base.cli_tools.test_utils.base_testcases import BaseTestCase
16
17
  from freezegun import freeze_time
17
18
  from tabulate import tabulate
18
19
 
19
20
  from PyHardLinkBackup.backup import BackupResult, backup_tree
20
21
  from PyHardLinkBackup.constants import CHUNK_SIZE
21
22
  from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
22
- from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
23
- from PyHardLinkBackup.utilities.tests.base_testcases import BaseTestCase
23
+ from PyHardLinkBackup.utilities.filesystem import copy_and_hash, iter_scandir_files
24
24
  from PyHardLinkBackup.utilities.tests.test_file_hash_database import assert_hash_db_info
25
25
 
26
26
 
@@ -184,6 +184,7 @@ class BackupTreeTestCase(BaseTestCase):
184
184
  copied_size=67109915,
185
185
  copied_small_files=3,
186
186
  copied_small_size=50,
187
+ error_count=0,
187
188
  ),
188
189
  )
189
190
 
@@ -267,6 +268,7 @@ class BackupTreeTestCase(BaseTestCase):
267
268
  copied_size=50,
268
269
  copied_small_files=3,
269
270
  copied_small_size=50,
271
+ error_count=0,
270
272
  ),
271
273
  )
272
274
  # The second backup:
@@ -360,6 +362,7 @@ class BackupTreeTestCase(BaseTestCase):
360
362
  copied_size=1050,
361
363
  copied_small_files=3,
362
364
  copied_small_size=50,
365
+ error_count=0,
363
366
  ),
364
367
  )
365
368
 
@@ -454,6 +457,7 @@ class BackupTreeTestCase(BaseTestCase):
454
457
  copied_size=31,
455
458
  copied_small_files=1,
456
459
  copied_small_size=31,
460
+ error_count=0,
457
461
  ),
458
462
  )
459
463
 
@@ -474,3 +478,65 @@ class BackupTreeTestCase(BaseTestCase):
474
478
  Symlinks are not stored in our FileHashDatabase, because they are not considered for hardlinking."""
475
479
  with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
476
480
  assert_hash_db_info(backup_root=backup_root, expected='')
481
+
482
+ def test_error_handling(self):
483
+ with tempfile.TemporaryDirectory() as temp_dir:
484
+ temp_path = Path(temp_dir)
485
+
486
+ src_root = temp_path / 'source'
487
+ backup_root = temp_path / 'backup'
488
+
489
+ src_root.mkdir()
490
+ backup_root.mkdir()
491
+
492
+ (src_root / 'file1.txt').write_text('File 1')
493
+ (src_root / 'file2.txt').write_text('File 2')
494
+ (src_root / 'file3.txt').write_text('File 3')
495
+
496
+ # Set modification times to a fixed time for easier testing:
497
+ set_file_times(src_root, dt=parse_dt('2026-01-01T12:00:00+0000'))
498
+
499
+ def mocked_copy_and_hash(src: Path, dst: Path):
500
+ if src.name == 'file2.txt':
501
+ raise PermissionError('Bam!')
502
+ else:
503
+ return copy_and_hash(src, dst)
504
+
505
+ with (
506
+ self.assertLogs(level=logging.ERROR) as logs,
507
+ patch('PyHardLinkBackup.backup.iter_scandir_files', SortedIterScandirFiles),
508
+ patch('PyHardLinkBackup.backup.copy_and_hash', mocked_copy_and_hash),
509
+ freeze_time('2026-01-01T12:34:56Z', auto_tick_seconds=0),
510
+ RedirectOut() as redirected_out,
511
+ ):
512
+ result = backup_tree(
513
+ src_root=src_root,
514
+ backup_root=backup_root,
515
+ excludes={'.cache'},
516
+ )
517
+ self.assertEqual(redirected_out.stderr, '')
518
+ self.assertIn('Backup complete', redirected_out.stdout)
519
+ self.assertIn('Errors during backup:', redirected_out.stdout)
520
+
521
+ logs = ''.join(logs.output)
522
+ self.assertIn(
523
+ f'ERROR:PyHardLinkBackup.backup:Backup {src_root / "file2.txt"} PermissionError: Bam!\n',
524
+ logs,
525
+ )
526
+ self.assertIn('\nTraceback (most recent call last):\n', logs)
527
+ self.assertEqual(
528
+ result,
529
+ BackupResult(
530
+ backup_dir=result.backup_dir,
531
+ backup_count=3,
532
+ backup_size=18,
533
+ symlink_files=0,
534
+ hardlinked_files=0,
535
+ hardlinked_size=0,
536
+ copied_files=2,
537
+ copied_size=12,
538
+ copied_small_files=2,
539
+ copied_small_size=12,
540
+ error_count=1,
541
+ ),
542
+ )
@@ -19,7 +19,8 @@ class DocuWriteApiTestCase(TestCase):
19
19
  """
20
20
  assert_is_file(PACKAGE_ROOT / 'pyproject.toml')
21
21
 
22
- info: GeneratedInfo = generate(base_path=PACKAGE_ROOT)
22
+ with self.assertLogs():
23
+ info: GeneratedInfo = generate(base_path=PACKAGE_ROOT)
23
24
  self.assertGreaterEqual(len(info.paths), 1)
24
25
  self.assertEqual(info.update_count, 0, 'No files should be updated, commit the changes')
25
26
  self.assertEqual(info.remove_count, 0, 'No files should be removed, commit the changes')
@@ -5,4 +5,5 @@ from cli_base.cli_tools.git_history import update_readme_history
5
5
 
6
6
  class ReadmeHistoryTestCase(TestCase):
7
7
  def test_readme_history(self):
8
- update_readme_history(raise_update_error=True)
8
+ with self.assertLogs():
9
+ update_readme_history(raise_update_error=True)