PyHardLinkBackup 1.0.1__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.pre-commit-config.yaml +1 -1
  2. pyhardlinkbackup-1.0.1/README.md → pyhardlinkbackup-1.2.0/PKG-INFO +37 -3
  3. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/__init__.py +1 -1
  4. pyhardlinkbackup-1.2.0/PyHardLinkBackup/backup.py +252 -0
  5. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/tests/test_backup.py +201 -110
  6. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/tests/test_doc_write.py +2 -1
  7. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/tests/test_readme_history.py +2 -1
  8. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/filesystem.py +26 -1
  9. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/rich_utils.py +6 -5
  10. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/tests/test_file_hash_database.py +40 -35
  11. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/tests/test_file_size_database.py +48 -41
  12. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/tests/test_filesystem.py +35 -3
  13. pyhardlinkbackup-1.0.1/PKG-INFO → pyhardlinkbackup-1.2.0/README.md +22 -18
  14. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/pyproject.toml +1 -1
  15. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/uv.lock +80 -80
  16. pyhardlinkbackup-1.0.1/PyHardLinkBackup/backup.py +0 -229
  17. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.editorconfig +0 -0
  18. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.github/workflows/tests.yml +0 -0
  19. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.gitignore +0 -0
  20. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.idea/.gitignore +0 -0
  21. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.pre-commit-hooks.yaml +0 -0
  22. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.run/Template Python tests.run.xml +0 -0
  23. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.run/Unittests - __all__.run.xml +0 -0
  24. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.run/cli.py --help.run.xml +0 -0
  25. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.run/dev-cli update.run.xml +0 -0
  26. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.run/only DocTests.run.xml +0 -0
  27. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.run/only DocWrite.run.xml +0 -0
  28. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/.venv-app/lib/python3.12/site-packages/cli_base/tests/shell_complete_snapshots/.gitignore +0 -0
  29. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/__main__.py +0 -0
  30. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/cli_app/__init__.py +0 -0
  31. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/cli_app/phlb.py +0 -0
  32. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/cli_dev/__init__.py +0 -0
  33. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/cli_dev/benchmark.py +0 -0
  34. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/cli_dev/code_style.py +0 -0
  35. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/cli_dev/packaging.py +0 -0
  36. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/cli_dev/shell_completion.py +0 -0
  37. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/cli_dev/testing.py +0 -0
  38. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/cli_dev/update_readme_history.py +0 -0
  39. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/constants.py +0 -0
  40. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/tests/__init__.py +0 -0
  41. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/tests/test_doctests.py +0 -0
  42. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/tests/test_project_setup.py +0 -0
  43. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/tests/test_readme.py +0 -0
  44. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/__init__.py +0 -0
  45. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/file_hash_database.py +0 -0
  46. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/file_size_database.py +0 -0
  47. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/humanize.py +0 -0
  48. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/PyHardLinkBackup/utilities/tests/__init__.py +0 -0
  49. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/cli.py +0 -0
  50. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/dev-cli.py +0 -0
  51. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/dist/.gitignore +0 -0
  52. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/docs/README.md +0 -0
  53. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/docs/about-docs.md +0 -0
  54. {pyhardlinkbackup-1.0.1 → pyhardlinkbackup-1.2.0}/noxfile.py +0 -0
@@ -2,6 +2,6 @@
2
2
  # See https://pre-commit.com for more information
3
3
  repos:
4
4
  - repo: https://github.com/jedie/cli-base-utilities
5
- rev: v0.26.0
5
+ rev: v0.27.0
6
6
  hooks:
7
7
  - id: update-readme-history
@@ -1,3 +1,18 @@
1
+ Metadata-Version: 2.4
2
+ Name: PyHardLinkBackup
3
+ Version: 1.2.0
4
+ Summary: HardLink/Deduplication Backups with Python
5
+ Project-URL: Documentation, https://github.com/jedie/PyHardLinkBackup
6
+ Project-URL: Source, https://github.com/jedie/PyHardLinkBackup
7
+ Author-email: Jens Diemer <PyHardLinkBackup@jensdiemer.de>
8
+ License: GPL-3.0-or-later
9
+ Requires-Python: >=3.12
10
+ Requires-Dist: bx-py-utils
11
+ Requires-Dist: cli-base-utilities>=0.27.0
12
+ Requires-Dist: rich
13
+ Requires-Dist: tyro
14
+ Description-Content-Type: text/markdown
15
+
1
16
  # PyHardLinkBackup
2
17
 
3
18
  [![tests](https://github.com/jedie/PyHardLinkBackup/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/jedie/PyHardLinkBackup/actions/workflows/tests.yml)
@@ -10,6 +25,8 @@ HardLink/Deduplication Backups with Python
10
25
 
11
26
  **WIP:** v1.0.0 is a complete rewrite of PyHardLinkBackup.
12
27
 
28
+ It's similar to `rsync --link-dest` but the deduplication is done globally for all backups and all paths.
29
+
13
30
  ## installation
14
31
 
15
32
  You can use [pipx](https://pipx.pypa.io/stable/installation/) to install and use PyHardLinkBackup, e.g.:
@@ -185,10 +202,27 @@ usage: ./dev-cli.py [-h] {benchmark-hashes,coverage,install,lint,mypy,nox,pip-au
185
202
 
186
203
  v1 is a complete rewrite of PyHardLinkBackup.
187
204
 
205
+ Overview of main changes:
206
+
207
+ * Remove Django dependency:
208
+ * No SQLite database anymore -> Data for deduplication stored in filesystem only
209
+ * No Django Admin, because we have no database anymore ;)
210
+ * Change hash algorithm from SHA512 to SHA256, because it's faster and still secure enough
211
+ * Don't store `*.sha512` for every file anymore -> We store one `SHA256SUMS` file in every backup directory
212
+
188
213
  ## History
189
214
 
190
215
  [comment]: <> (✂✂✂ auto generated history start ✂✂✂)
191
216
 
217
+ * [v1.2.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.1.0...v1.2.0)
218
+ * 2026-01-15 - Add error handling: Log exception but continue with the backup
219
+ * 2026-01-15 - Check permission and hardlink support on destination path
220
+ * 2026-01-14 - Enhance progress bars
221
+ * 2026-01-14 - Add a note about rsync --link-dest
222
+ * 2026-01-14 - Use cli_base.cli_tools.test_utils.base_testcases
223
+ * [v1.1.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.0.1...v1.1.0)
224
+ * 2026-01-14 - Change backup timestamp directory to old schema: '%Y-%m-%d-%H%M%S'
225
+ * 2026-01-14 - Add "Overview of main changes" to README
192
226
  * [v1.0.1](https://github.com/jedie/PyHardLinkBackup/compare/v1.0.0...v1.0.1)
193
227
  * 2026-01-13 - Store SHA256SUMS files in backup directories
194
228
  * [v1.0.0](https://github.com/jedie/PyHardLinkBackup/compare/v0.13.0...v1.0.0)
@@ -201,6 +235,9 @@ v1 is a complete rewrite of PyHardLinkBackup.
201
235
  * 2026-01-13 - Add DocWrite, handle broken symlinks, keep file meta, handle missing hardlink sources
202
236
  * 2026-01-12 - First working iteration with rich progress bar
203
237
  * 2026-01-08 - Rewrite everything
238
+
239
+ <details><summary>Expand older history entries ...</summary>
240
+
204
241
  * [v0.13.0](https://github.com/jedie/PyHardLinkBackup/compare/v0.12.3...v0.13.0)
205
242
  * 2020-03-18 - release v0.13.0
206
243
  * 2020-03-17 - deactivate pypy tests in travis, because of SQLite errors, like:
@@ -228,9 +265,6 @@ v1 is a complete rewrite of PyHardLinkBackup.
228
265
  * 2020-03-16 - just warn if used directly (needfull for devlopment to call this directly ;)
229
266
  * 2020-03-16 - update requirements
230
267
  * 2020-03-16 - +pytest-randomly
231
-
232
- <details><summary>Expand older history entries ...</summary>
233
-
234
268
  * [v0.12.2](https://github.com/jedie/PyHardLinkBackup/compare/v0.12.1...v0.12.2)
235
269
  * 2020-03-06 - repare v0.12.2 release
236
270
  * 2020-03-06 - enhance log file content
@@ -3,5 +3,5 @@
3
3
  """
4
4
 
5
5
  # See https://packaging.python.org/en/latest/specifications/version-specifiers/
6
- __version__ = '1.0.1'
6
+ __version__ = '1.2.0'
7
7
  __author__ = 'Jens Diemer <PyHardLinkBackup@jensdiemer.de>'
@@ -0,0 +1,252 @@
1
+ import dataclasses
2
+ import logging
3
+ import os
4
+ import shutil
5
+ import sys
6
+ import time
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+
10
+ from rich import print # noqa
11
+
12
+ from PyHardLinkBackup.constants import CHUNK_SIZE
13
+ from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
14
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
15
+ from PyHardLinkBackup.utilities.filesystem import (
16
+ copy_and_hash,
17
+ hash_file,
18
+ humanized_fs_scan,
19
+ iter_scandir_files,
20
+ read_and_hash_file,
21
+ supports_hardlinks,
22
+ )
23
+ from PyHardLinkBackup.utilities.humanize import human_filesize
24
+ from PyHardLinkBackup.utilities.rich_utils import BackupProgress
25
+
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
@dataclasses.dataclass
class BackupResult:
    """Counters collected while one backup run processes the source tree.

    All counters start at zero; only ``backup_dir`` has to be passed in.
    Sizes are the ``st_size`` values of the processed source files.
    """

    # Directory that receives the files of this backup run.
    backup_dir: Path

    # Every processed directory entry / sum of the sizes of all entries that could be stat'ed.
    backup_count: int = 0
    backup_size: int = 0

    # Entries re-created as symlinks, and files hardlinked to an already stored file.
    symlink_files: int = 0
    hardlinked_files: int = 0
    hardlinked_size: int = 0

    # Files whose content was written into the backup (includes the small files below).
    copied_files: int = 0
    copied_size: int = 0

    # Subset of the copied files that were below the deduplication size limit.
    copied_small_files: int = 0
    copied_small_size: int = 0

    # Entries whose backup raised an exception.
    error_count: int = 0
48
+
49
+
50
def backup_one_file(
    *,
    src_root: Path,
    entry: os.DirEntry,
    size_db: FileSizeDatabase,
    hash_db: FileHashDatabase,
    backup_dir: Path,
    backup_result: BackupResult,
) -> None:
    """Back up one directory entry into ``backup_dir`` and update the counters in ``backup_result``.

    The destination path is ``backup_dir`` plus the path of the entry relative to ``src_root``.

    * Symlinks (including broken ones) are re-created with the same link target.
    * Files named ``SHA256SUMS`` are skipped.
    * Files smaller than ``size_db.MIN_SIZE`` are always copied.
    * Other files are hardlinked to an already stored file with the same hash (looked up in ``hash_db``),
      or copied and registered in ``size_db`` / ``hash_db`` if no such file is known.

    For every copied or hardlinked regular file a line is appended to the ``SHA256SUMS`` file
    of the destination directory.

    Exceptions are not handled here; they propagate to the caller.
    """
    backup_result.backup_count += 1
    src_path = Path(entry.path)

    dst_path = backup_dir / src_path.relative_to(src_root)
    # exist_ok=True instead of a separate exists() check: no window between check and creation.
    dst_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        size = entry.stat().st_size
    except FileNotFoundError:
        # e.g.: Handle broken symlink
        target = os.readlink(src_path)
        dst_path.symlink_to(target)
        backup_result.symlink_files += 1
        return

    backup_result.backup_size += size

    if entry.name == 'SHA256SUMS':
        # Skip existing SHA256SUMS files in source tree,
        # because we create our own SHA256SUMS files.
        logger.debug('Skip existing SHA256SUMS file: %s', src_path)
        return

    if entry.is_symlink():
        logger.debug('Copy symlink: %s to %s', src_path, dst_path)
        target = os.readlink(src_path)
        dst_path.symlink_to(target)
        backup_result.symlink_files += 1
        return

    # Process regular files only.
    # Raised explicitly (not via ``assert``) so the guard is still active under ``python -O``.
    if not entry.is_file(follow_symlinks=False):
        raise AssertionError(f'Unexpected non-file: {src_path}')

    # Deduplication logic

    if size < size_db.MIN_SIZE:
        # Small file -> always copy without deduplication
        logger.info('Copy small file: %s to %s', src_path, dst_path)
        file_hash = copy_and_hash(src_path, dst_path)
        backup_result.copied_files += 1
        backup_result.copied_size += size
        backup_result.copied_small_files += 1
        backup_result.copied_small_size += size
        store_hash(dst_path, file_hash)
        return

    if size in size_db:
        logger.debug('File with size %iBytes found before -> hash: %s', size, src_path)

        if size <= CHUNK_SIZE:
            # File can be read complete into memory
            logger.debug('File size %iBytes <= CHUNK_SIZE (%iBytes) -> read complete into memory', size, CHUNK_SIZE)
            file_content, file_hash = read_and_hash_file(src_path)
            if existing_path := hash_db.get(file_hash):
                logger.info('Hardlink duplicate file: %s to %s', dst_path, existing_path)
                os.link(existing_path, dst_path)
                backup_result.hardlinked_files += 1
                backup_result.hardlinked_size += size
            else:
                logger.info('Store unique file: %s to %s', src_path, dst_path)
                dst_path.write_bytes(file_content)
                hash_db[file_hash] = dst_path
                backup_result.copied_files += 1
                backup_result.copied_size += size

        else:
            # Large file
            file_hash = hash_file(src_path)  # Calculate hash without copying

            if existing_path := hash_db.get(file_hash):
                logger.info('Hardlink duplicate file: %s to %s', dst_path, existing_path)
                os.link(existing_path, dst_path)
                backup_result.hardlinked_files += 1
                backup_result.hardlinked_size += size
            else:
                logger.info('Copy unique file: %s to %s', src_path, dst_path)
                hash_db[file_hash] = dst_path
                backup_result.copied_files += 1
                backup_result.copied_size += size

        # Keep original file metadata (permission bits, time stamps, and flags).
        # For a unique large file this call is also what copies the content.
        # NOTE(review): the nesting was reconstructed from a flattened diff. At this level the call
        # also runs after the hardlink branches and writes through dst_path into the linked file
        # -- confirm against upstream that this is intended.
        shutil.copy2(src_path, dst_path)
    else:
        # A file with this size not backuped before -> Can't be duplicate -> copy and hash
        file_hash = copy_and_hash(src_path, dst_path)
        size_db.add(size)
        hash_db[file_hash] = dst_path
        backup_result.copied_files += 1
        backup_result.copied_size += size

    store_hash(dst_path, file_hash)
152
+
153
+
154
def store_hash(file_path: Path, file_hash: str) -> None:
    """DocWrite: README.md ## SHA256SUMS
    A `SHA256SUMS` file is stored in each backup directory containing the SHA256 hashes of all files in that directory.
    It's the same format as e.g.: `sha256sum * > SHA256SUMS` command produces.
    So it's possible to verify the integrity of the backup files later.
    e.g.:
    ```bash
    cd .../your/backup/foobar/2024-01-01-120000/
    sha256sum -c SHA256SUMS
    ```
    """
    hash_file_path = file_path.parent / 'SHA256SUMS'
    # Explicit UTF-8 keeps the file independent of the current locale;
    # "surrogateescape" writes undecodable file names back as their original bytes instead of raising.
    with hash_file_path.open('a', encoding='utf-8', errors='surrogateescape') as f:
        # sha256sum format: "<hash><space><mode><name>", the mode character is a space for text mode.
        f.write(f'{file_hash}  {file_path.name}\n')
168
+
169
+
170
def _exit_with_error(message: str, hint: str) -> None:
    """Print an error message plus a hint for the user and terminate with exit code 1."""
    print(message)
    print(f'{hint}\n')
    sys.exit(1)


def _print_backup_summary(backup_result: BackupResult, small_file_limit: int) -> None:
    """Print the counters of a finished backup run."""
    print(
        f'\nBackup complete: {backup_result.backup_dir}'
        f' (total size {human_filesize(backup_result.backup_size)})\n'
    )
    print(f' Total files processed: {backup_result.backup_count}')
    print(f' * Symlinked files: {backup_result.symlink_files}')
    print(
        f' * Hardlinked files: {backup_result.hardlinked_files}'
        f' (saved {human_filesize(backup_result.hardlinked_size)})'
    )
    print(f' * Copied files: {backup_result.copied_files} (total {human_filesize(backup_result.copied_size)})')
    print(
        f' of which small (<{small_file_limit} Bytes)'
        f' files: {backup_result.copied_small_files}'
        f' (total {human_filesize(backup_result.copied_small_size)})'
    )
    if backup_result.error_count > 0:
        print(f' Errors during backup: {backup_result.error_count} (see log for details)')
    print()


def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> BackupResult:
    """Back up the directory tree ``src_root`` into a new time stamped directory below ``backup_root``.

    The backup is stored in ``<backup_root>/<src_root.name>/<%Y-%m-%d-%H%M%S>``,
    the deduplication data lives in ``<backup_root>/.phlb``.

    The process is terminated with exit code 1 (``SystemExit``) if ``src_root`` or ``backup_root``
    is not a directory, ``backup_root`` is not writable or does not support hardlinks.

    An exception raised while backing up a single file is logged and counted in
    ``BackupResult.error_count``; the backup continues with the next file.

    :param src_root: directory to back up
    :param backup_root: existing directory that receives all backups
    :param excludes: passed through to the source tree scan
    :return: the counters of this backup run
    """
    src_root = src_root.resolve()
    if not src_root.is_dir():
        _exit_with_error(
            'Error: Source directory does not exist!',
            f'Please check source directory: "{src_root}"',
        )

    backup_root = backup_root.resolve()
    if not backup_root.is_dir():
        _exit_with_error(
            'Error: Backup directory does not exist!',
            f'Please create "{backup_root}" directory first and start again!',
        )

    if not os.access(backup_root, os.W_OK):
        _exit_with_error(
            'Error: No write access to backup directory!',
            f'Please check permissions for backup directory: "{backup_root}"',
        )

    if not supports_hardlinks(backup_root):
        _exit_with_error(
            'Error: Filesystem for backup directory does not support hardlinks!',
            f'Please check backup directory: "{backup_root}"',
        )

    # Step 1: Scan source directory:
    src_file_count, src_total_size = humanized_fs_scan(src_root, excludes)

    phlb_conf_dir = backup_root / '.phlb'
    phlb_conf_dir.mkdir(parents=False, exist_ok=True)

    backup_dir = backup_root / src_root.name / datetime.now().strftime('%Y-%m-%d-%H%M%S')
    logger.info('Backup %s to %s', src_root, backup_dir)
    # exist_ok=False: never write into an already existing backup directory.
    backup_dir.mkdir(parents=True, exist_ok=False)

    print(f'\nBackup to {backup_dir}...\n')

    with BackupProgress(src_file_count, src_total_size) as progress:
        # "Databases" for deduplication
        size_db = FileSizeDatabase(phlb_conf_dir)
        hash_db = FileHashDatabase(backup_root, phlb_conf_dir)

        backup_result = BackupResult(backup_dir=backup_dir)

        next_update = 0
        for entry in iter_scandir_files(src_root, excludes=excludes):
            try:
                backup_one_file(
                    src_root=src_root,
                    entry=entry,
                    size_db=size_db,
                    hash_db=hash_db,
                    backup_dir=backup_dir,
                    backup_result=backup_result,
                )
            except Exception as err:
                # Log the traceback, but continue with the next file.
                logger.exception('Backup %s %s: %s', entry.path, err.__class__.__name__, err)
                backup_result.error_count += 1
            else:
                # Refresh the progress display at most every 0.5 seconds.
                now = time.monotonic()
                if now >= next_update:
                    progress.update(backup_count=backup_result.backup_count, backup_size=backup_result.backup_size)
                    next_update = now + 0.5

        # Finalize progress indicator values:
        progress.update(backup_count=backup_result.backup_count, backup_size=backup_result.backup_size)

    _print_backup_summary(backup_result, size_db.MIN_SIZE)

    return backup_result