PyHardLinkBackup 1.1.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.pre-commit-config.yaml +1 -1
- pyhardlinkbackup-1.1.0/README.md → pyhardlinkbackup-1.3.0/PKG-INFO +55 -12
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/__init__.py +1 -1
- pyhardlinkbackup-1.3.0/PyHardLinkBackup/backup.py +239 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_app/phlb.py +21 -0
- pyhardlinkbackup-1.3.0/PyHardLinkBackup/rebuild_databases.py +147 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_backup.py +68 -2
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_doc_write.py +2 -1
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_readme_history.py +2 -1
- pyhardlinkbackup-1.3.0/PyHardLinkBackup/tests/test_rebuild_database.py +204 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/file_hash_database.py +4 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/filesystem.py +26 -1
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/rich_utils.py +14 -13
- pyhardlinkbackup-1.3.0/PyHardLinkBackup/utilities/sha256sums.py +61 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/tests/test_file_hash_database.py +3 -1
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/tests/test_file_size_database.py +1 -1
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/tests/test_filesystem.py +34 -2
- pyhardlinkbackup-1.1.0/PKG-INFO → pyhardlinkbackup-1.3.0/README.md +40 -27
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/pyproject.toml +1 -1
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/uv.lock +80 -80
- pyhardlinkbackup-1.1.0/PyHardLinkBackup/backup.py +0 -229
- pyhardlinkbackup-1.1.0/PyHardLinkBackup/utilities/tests/base_testcases.py +0 -88
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.editorconfig +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.github/workflows/tests.yml +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.gitignore +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.idea/.gitignore +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.pre-commit-hooks.yaml +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/Template Python tests.run.xml +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/Unittests - __all__.run.xml +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/cli.py --help.run.xml +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/dev-cli update.run.xml +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/only DocTests.run.xml +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.run/only DocWrite.run.xml +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/.venv-app/lib/python3.12/site-packages/cli_base/tests/shell_complete_snapshots/.gitignore +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/__main__.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_app/__init__.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/__init__.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/benchmark.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/code_style.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/packaging.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/shell_completion.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/testing.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/cli_dev/update_readme_history.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/constants.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/__init__.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_doctests.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_project_setup.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/tests/test_readme.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/__init__.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/file_size_database.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/humanize.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/PyHardLinkBackup/utilities/tests/__init__.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/cli.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/dev-cli.py +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/dist/.gitignore +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/docs/README.md +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/docs/about-docs.md +0 -0
- {pyhardlinkbackup-1.1.0 → pyhardlinkbackup-1.3.0}/noxfile.py +0 -0
|
@@ -1,3 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: PyHardLinkBackup
|
|
3
|
+
Version: 1.3.0
|
|
4
|
+
Summary: HardLink/Deduplication Backups with Python
|
|
5
|
+
Project-URL: Documentation, https://github.com/jedie/PyHardLinkBackup
|
|
6
|
+
Project-URL: Source, https://github.com/jedie/PyHardLinkBackup
|
|
7
|
+
Author-email: Jens Diemer <PyHardLinkBackup@jensdiemer.de>
|
|
8
|
+
License: GPL-3.0-or-later
|
|
9
|
+
Requires-Python: >=3.12
|
|
10
|
+
Requires-Dist: bx-py-utils
|
|
11
|
+
Requires-Dist: cli-base-utilities>=0.27.0
|
|
12
|
+
Requires-Dist: rich
|
|
13
|
+
Requires-Dist: tyro
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
1
16
|
# PyHardLinkBackup
|
|
2
17
|
|
|
3
18
|
[](https://github.com/jedie/PyHardLinkBackup/actions/workflows/tests.yml)
|
|
@@ -10,6 +25,8 @@ HardLink/Deduplication Backups with Python
|
|
|
10
25
|
|
|
11
26
|
**WIP:** v1.0.0 is a complete rewrite of PyHardLinkBackup.
|
|
12
27
|
|
|
28
|
+
It's similar to `rsync --link-dest` but the deduplication is done globally for all backups and all paths.
|
|
29
|
+
|
|
13
30
|
## installation
|
|
14
31
|
|
|
15
32
|
You can use [pipx](https://pipx.pypa.io/stable/installation/) to install and use PyHardLinkBackup, e.g.:
|
|
@@ -46,22 +63,36 @@ Backup the source directory to the destination directory using hard links for de
|
|
|
46
63
|
|
|
47
64
|
|
|
48
65
|
|
|
66
|
+
Running a backup looks like:
|
|
67
|
+
|
|
68
|
+

|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
When the backup has finished, it displays a summary:
|
|
73
|
+
|
|
74
|
+

|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
|
|
49
78
|
complete help for main CLI app:
|
|
50
79
|
|
|
51
80
|
[comment]: <> (✂✂✂ auto generated main help start ✂✂✂)
|
|
52
81
|
```
|
|
53
|
-
usage: phlb [-h] {backup,version}
|
|
82
|
+
usage: phlb [-h] {backup,rebuild,version}
|
|
54
83
|
|
|
55
84
|
|
|
56
85
|
|
|
57
|
-
╭─ options
|
|
58
|
-
│ -h, --help show this help message and exit
|
|
59
|
-
|
|
60
|
-
╭─ subcommands
|
|
61
|
-
│ (required)
|
|
62
|
-
│ • backup Backup the source directory to the destination directory using hard links for deduplication.
|
|
63
|
-
│ •
|
|
64
|
-
|
|
86
|
+
╭─ options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
|
87
|
+
│ -h, --help show this help message and exit │
|
|
88
|
+
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
|
89
|
+
╭─ subcommands ────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
|
90
|
+
│ (required) │
|
|
91
|
+
│ • backup Backup the source directory to the destination directory using hard links for deduplication. │
|
|
92
|
+
│ • rebuild Rebuild the file hash and size database by scanning all backup files. And also verify SHA256SUMS and/or │
|
|
93
|
+
│ store missing hashes in SHA256SUMS files. │
|
|
94
|
+
│ • version Print version and exit │
|
|
95
|
+
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
|
65
96
|
```
|
|
66
97
|
[comment]: <> (✂✂✂ auto generated main help end ✂✂✂)
|
|
67
98
|
|
|
@@ -197,11 +228,26 @@ Overview of main changes:
|
|
|
197
228
|
|
|
198
229
|
[comment]: <> (✂✂✂ auto generated history start ✂✂✂)
|
|
199
230
|
|
|
231
|
+
* [v1.3.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.2.0...v1.3.0)
|
|
232
|
+
* 2026-01-15 - Verify SHA256SUMS files in "rebuild" command, too.
|
|
233
|
+
* 2026-01-15 - Code cleanup: use more generic names for and in BackupProgress
|
|
234
|
+
* 2026-01-15 - Add tests for rebuild
|
|
235
|
+
* 2026-01-15 - Add command to "rebuild" the size and hash filesystem database
|
|
236
|
+
* 2026-01-15 - Add screenshots in the README
|
|
237
|
+
* [v1.2.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.1.0...v1.2.0)
|
|
238
|
+
* 2026-01-15 - Add error handling: Log exception but continue with the backup
|
|
239
|
+
* 2026-01-15 - Check permission and hardlink support on destination path
|
|
240
|
+
* 2026-01-14 - Enhance progress bars
|
|
241
|
+
* 2026-01-14 - Add a note about rsync --link-dest
|
|
242
|
+
* 2026-01-14 - Use cli_base.cli_tools.test_utils.base_testcases
|
|
200
243
|
* [v1.1.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.0.1...v1.1.0)
|
|
201
244
|
* 2026-01-14 - Change backup timestamp directory to old schema: '%Y-%m-%d-%H%M%S'
|
|
202
245
|
* 2026-01-14 - Add "Overview of main changes" to README
|
|
203
246
|
* [v1.0.1](https://github.com/jedie/PyHardLinkBackup/compare/v1.0.0...v1.0.1)
|
|
204
247
|
* 2026-01-13 - Store SHA256SUMS files in backup directories
|
|
248
|
+
|
|
249
|
+
<details><summary>Expand older history entries ...</summary>
|
|
250
|
+
|
|
205
251
|
* [v1.0.0](https://github.com/jedie/PyHardLinkBackup/compare/v0.13.0...v1.0.0)
|
|
206
252
|
* 2026-01-13 - Change "./cli.py" to "phlb" (because it's the name installed via pipx)
|
|
207
253
|
* 2026-01-13 - Update README
|
|
@@ -230,9 +276,6 @@ Overview of main changes:
|
|
|
230
276
|
* 2020-03-17 - dynamic chunk size
|
|
231
277
|
* 2020-03-17 - ignore *.sha512 by default
|
|
232
278
|
* 2020-03-17 - Update boot_pyhardlinkbackup.sh
|
|
233
|
-
|
|
234
|
-
<details><summary>Expand older history entries ...</summary>
|
|
235
|
-
|
|
236
279
|
* [v0.12.3](https://github.com/jedie/PyHardLinkBackup/compare/v0.12.2...v0.12.3)
|
|
237
280
|
* 2020-03-17 - update README.rst
|
|
238
281
|
* 2020-03-17 - don't publish if tests fail
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from rich import print # noqa
|
|
11
|
+
|
|
12
|
+
from PyHardLinkBackup.constants import CHUNK_SIZE
|
|
13
|
+
from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
|
|
14
|
+
from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
|
|
15
|
+
from PyHardLinkBackup.utilities.filesystem import (
|
|
16
|
+
copy_and_hash,
|
|
17
|
+
hash_file,
|
|
18
|
+
humanized_fs_scan,
|
|
19
|
+
iter_scandir_files,
|
|
20
|
+
read_and_hash_file,
|
|
21
|
+
supports_hardlinks,
|
|
22
|
+
)
|
|
23
|
+
from PyHardLinkBackup.utilities.humanize import human_filesize
|
|
24
|
+
from PyHardLinkBackup.utilities.rich_utils import DisplayFileTreeProgress
|
|
25
|
+
from PyHardLinkBackup.utilities.sha256sums import store_hash
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclasses.dataclass
class BackupResult:
    """
    Counters collected during one backup run.

    Only ``backup_dir`` has to be passed, every counter starts at zero
    and is incremented while the source entries are processed.
    """

    # Directory the current backup run is written to:
    backup_dir: Path

    # Number of processed source entries and the sum of their stat() sizes:
    backup_count: int = 0
    backup_size: int = 0

    # Recreated symlinks and files that were hard linked to an existing file:
    symlink_files: int = 0
    hardlinked_files: int = 0
    hardlinked_size: int = 0

    # Files (and their size) that had to be copied into the backup:
    copied_files: int = 0
    copied_size: int = 0

    # The part of the copied files that is below the minimum size for deduplication:
    copied_small_files: int = 0
    copied_small_size: int = 0

    # Number of entries whose backup raised an exception:
    error_count: int = 0
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def backup_one_file(
    *,
    src_root: Path,
    entry: os.DirEntry,
    size_db: FileSizeDatabase,
    hash_db: FileHashDatabase,
    backup_dir: Path,
    backup_result: BackupResult,
) -> None:
    """
    Back up one source entry below ``backup_dir`` and account for it in ``backup_result``.

    Depending on the entry this recreates a symlink, skips a ``SHA256SUMS`` file
    of the source tree, copies the file, or hard links it to an already stored
    file with the same hash.

    Exceptions of the filesystem operations are not handled here (besides the
    ``FileNotFoundError`` of a broken symlink), the caller has to deal with them.

    :param src_root: root of the source tree, ``entry`` has to be located below it
    :param entry: the source entry to back up
    :param size_db: sizes of the files stored so far (cheap pre-check for duplicates)
    :param hash_db: maps a file hash to the path of the stored file
    :param backup_dir: destination directory of the current backup run
    :param backup_result: counters of the current backup run, updated in place
    """
    backup_result.backup_count += 1
    src_path = Path(entry.path)

    dst_path = backup_dir / src_path.relative_to(src_root)
    # exist_ok=True instead of "exists() and then mkdir()": no check-then-create race
    dst_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        size = entry.stat().st_size
    except FileNotFoundError:
        # e.g.: Handle broken symlink
        target = os.readlink(src_path)
        dst_path.symlink_to(target)
        backup_result.symlink_files += 1
        return

    backup_result.backup_size += size

    if entry.name == 'SHA256SUMS':
        # Skip existing SHA256SUMS files in source tree,
        # because we create our own SHA256SUMS files.
        logger.debug('Skip existing SHA256SUMS file: %s', src_path)
        return

    if entry.is_symlink():
        logger.debug('Copy symlink: %s to %s', src_path, dst_path)
        target = os.readlink(src_path)
        dst_path.symlink_to(target)
        backup_result.symlink_files += 1
        return

    # Process regular files
    assert entry.is_file(follow_symlinks=False), f'Unexpected non-file: {src_path}'

    # Deduplication logic

    if size < size_db.MIN_SIZE:
        # Small file -> always copy without deduplication
        logger.info('Copy small file: %s to %s', src_path, dst_path)
        file_hash = copy_and_hash(src_path, dst_path)
        backup_result.copied_files += 1
        backup_result.copied_size += size
        backup_result.copied_small_files += 1
        backup_result.copied_small_size += size
        store_hash(dst_path, file_hash)
        return

    if size in size_db:
        logger.debug('File with size %iBytes found before -> hash: %s', size, src_path)

        if size <= CHUNK_SIZE:
            # File can be read complete into memory
            logger.debug('File size %iBytes <= CHUNK_SIZE (%iBytes) -> read complete into memory', size, CHUNK_SIZE)
            file_content, file_hash = read_and_hash_file(src_path)
        else:
            # Large file
            file_content = None
            file_hash = hash_file(src_path)  # Calculate hash without copying

        if existing_path := hash_db.get(file_hash):
            logger.info('Hardlink duplicate file: %s to %s', dst_path, existing_path)
            os.link(existing_path, dst_path)
            backup_result.hardlinked_files += 1
            backup_result.hardlinked_size += size
        else:
            if file_content is not None:
                logger.info('Store unique file: %s to %s', src_path, dst_path)
                dst_path.write_bytes(file_content)
            else:
                logger.info('Copy unique file: %s to %s', src_path, dst_path)
                shutil.copyfile(src_path, dst_path)
            # Register the hash only after the file exists in the backup,
            # so a failed copy can't leave an entry that points to a missing file.
            hash_db[file_hash] = dst_path
            backup_result.copied_files += 1
            backup_result.copied_size += size

        # Keep original file metadata (permission bits, time stamps, and flags)
        # Only the metadata: the content is already in place at this point.
        # Copying the content again (as shutil.copy2() would do) writes the data twice
        # and truncates + rewrites the inode that is shared with older backups.
        shutil.copystat(src_path, dst_path)
    else:
        # A file with this size not backuped before -> Can't be duplicate -> copy and hash
        file_hash = copy_and_hash(src_path, dst_path)
        size_db.add(size)
        hash_db[file_hash] = dst_path
        backup_result.copied_files += 1
        backup_result.copied_size += size

    store_hash(dst_path, file_hash)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> BackupResult:
    """
    Back up all files of ``src_root`` into a new time stamped directory below ``backup_root``.

    The process is terminated with exit code 1 if the source directory is missing
    or the backup directory is missing, not writable or without hard link support.
    An exception while processing a single entry is logged and counted,
    the backup continues with the next entry.

    :param src_root: the directory to back up
    :param backup_root: existing root directory of all backups
    :param excludes: passed to the source tree scan and to the file iteration
    :return: the counters of this backup run
    """

    def abort(headline: str, hint: str) -> None:
        # Report a fatal setup problem and terminate with exit code 1
        print(headline)
        print(hint)
        sys.exit(1)

    src_root = src_root.resolve()
    if not src_root.is_dir():
        abort(
            'Error: Source directory does not exist!',
            f'Please check source directory: "{src_root}"\n',
        )

    backup_root = backup_root.resolve()
    if not backup_root.is_dir():
        abort(
            'Error: Backup directory does not exist!',
            f'Please create "{backup_root}" directory first and start again!\n',
        )

    if not os.access(backup_root, os.W_OK):
        abort(
            'Error: No write access to backup directory!',
            f'Please check permissions for backup directory: "{backup_root}"\n',
        )

    if not supports_hardlinks(backup_root):
        abort(
            'Error: Filesystem for backup directory does not support hardlinks!',
            f'Please check backup directory: "{backup_root}"\n',
        )

    # The totals of the source tree are needed for the progress display:
    total_file_count, total_size = humanized_fs_scan(src_root, excludes)

    conf_dir = backup_root / '.phlb'
    conf_dir.mkdir(parents=False, exist_ok=True)

    timestamp = datetime.now().strftime('%Y-%m-%d-%H%M%S')
    backup_dir = backup_root / src_root.name / timestamp
    logger.info('Backup %s to %s', src_root, backup_dir)
    backup_dir.mkdir(parents=True, exist_ok=False)

    print(f'\nBackup to {backup_dir}...\n')

    with DisplayFileTreeProgress(total_file_count, total_size) as progress:
        # The "databases" used for the deduplication:
        size_db = FileSizeDatabase(conf_dir)
        hash_db = FileHashDatabase(backup_root, conf_dir)

        backup_result = BackupResult(backup_dir=backup_dir)

        def refresh_progress() -> None:
            progress.update(
                completed_file_count=backup_result.backup_count,
                completed_size=backup_result.backup_size,
            )

        refresh_due = 0
        for entry in iter_scandir_files(src_root, excludes=excludes):
            try:
                backup_one_file(
                    src_root=src_root,
                    entry=entry,
                    size_db=size_db,
                    hash_db=hash_db,
                    backup_dir=backup_dir,
                    backup_result=backup_result,
                )
            except Exception as err:
                # Log the problem, but go on with the next entry:
                logger.exception(f'Backup {entry.path} {err.__class__.__name__}: {err}')
                backup_result.error_count += 1
                continue

            # Refresh the progress display at most every half second:
            now = time.monotonic()
            if now < refresh_due:
                continue
            refresh_progress()
            refresh_due = now + 0.5

        # Display the final values in any case:
        refresh_progress()

    summary = [
        f'\nBackup complete: {backup_dir} (total size {human_filesize(backup_result.backup_size)})\n',
        f' Total files processed: {backup_result.backup_count}',
        f' * Symlinked files: {backup_result.symlink_files}',
        (
            f' * Hardlinked files: {backup_result.hardlinked_files}'
            f' (saved {human_filesize(backup_result.hardlinked_size)})'
        ),
        f' * Copied files: {backup_result.copied_files} (total {human_filesize(backup_result.copied_size)})',
        (
            f' of which small (<{size_db.MIN_SIZE} Bytes)'
            f' files: {backup_result.copied_small_files}'
            f' (total {human_filesize(backup_result.copied_small_size)})'
        ),
    ]
    if backup_result.error_count > 0:
        summary.append(f' Errors during backup: {backup_result.error_count} (see log for details)')

    for line in summary:
        print(line)
    print()

    return backup_result
|
|
@@ -7,6 +7,7 @@ from cli_base.cli_tools.verbosity import setup_logging
|
|
|
7
7
|
from cli_base.tyro_commands import TyroVerbosityArgType
|
|
8
8
|
from rich import print # noqa
|
|
9
9
|
|
|
10
|
+
from PyHardLinkBackup import rebuild_databases
|
|
10
11
|
from PyHardLinkBackup.backup import backup_tree
|
|
11
12
|
from PyHardLinkBackup.cli_app import app
|
|
12
13
|
|
|
@@ -48,3 +49,23 @@ def backup(
|
|
|
48
49
|
backup_root=dst,
|
|
49
50
|
excludes=set(excludes),
|
|
50
51
|
)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@app.command
def rebuild(
    backup_root: Annotated[
        Path,
        tyro.conf.arg(
            metavar='backup-directory',
            help='Root directory of the backups.',
        ),
    ],
    /,
    verbosity: TyroVerbosityArgType = 2,
) -> None:
    """
    Rebuild the file hash and size database by scanning all backup files. And also verify SHA256SUMS
    and/or store missing hashes in SHA256SUMS files.
    """
    # NOTE: The docstring above is the help text of this sub command in the CLI
    # (it's part of the generated main help in the README), so keep the wording in sync.
    setup_logging(verbosity=verbosity)
    rebuild_databases.rebuild(backup_root=backup_root)
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
|
|
9
|
+
from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
|
|
10
|
+
from PyHardLinkBackup.utilities.filesystem import hash_file, humanized_fs_scan, iter_scandir_files
|
|
11
|
+
from PyHardLinkBackup.utilities.humanize import human_filesize
|
|
12
|
+
from PyHardLinkBackup.utilities.rich_utils import DisplayFileTreeProgress
|
|
13
|
+
from PyHardLinkBackup.utilities.sha256sums import check_sha256sums, store_hash
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclasses.dataclass
class RebuildResult:
    """
    Counters collected while the size and hash databases are rebuilt.

    Every counter starts at zero and is incremented while the backup files are processed.
    """

    # Number of processed entries and the sum of their stat() sizes:
    process_count: int = 0
    process_size: int = 0

    # Entries that were missing in the size / hash database and have been added:
    added_size_count: int = 0
    added_hash_count: int = 0

    # Number of entries whose processing raised an exception:
    error_count: int = 0

    # Results of comparing the calculated hashes with the SHA256SUMS files:
    hash_verified_count: int = 0
    hash_mismatch_count: int = 0
    hash_not_found_count: int = 0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def rebuild_one_file(
|
|
35
|
+
*,
|
|
36
|
+
entry: os.DirEntry,
|
|
37
|
+
size_db: FileSizeDatabase,
|
|
38
|
+
hash_db: FileHashDatabase,
|
|
39
|
+
rebuild_result: RebuildResult,
|
|
40
|
+
):
|
|
41
|
+
rebuild_result.process_count += 1
|
|
42
|
+
|
|
43
|
+
if entry.name == 'SHA256SUMS':
|
|
44
|
+
# Skip existing SHA256SUMS files
|
|
45
|
+
return
|
|
46
|
+
|
|
47
|
+
size = entry.stat().st_size
|
|
48
|
+
rebuild_result.process_size += size
|
|
49
|
+
|
|
50
|
+
if size < size_db.MIN_SIZE:
|
|
51
|
+
# Small files will never deduplicate, skip them
|
|
52
|
+
return
|
|
53
|
+
|
|
54
|
+
file_path = Path(entry.path)
|
|
55
|
+
file_hash = hash_file(file_path)
|
|
56
|
+
|
|
57
|
+
if size not in size_db:
|
|
58
|
+
size_db.add(size)
|
|
59
|
+
rebuild_result.added_size_count += 1
|
|
60
|
+
|
|
61
|
+
if file_hash not in hash_db:
|
|
62
|
+
hash_db[file_hash] = file_path
|
|
63
|
+
rebuild_result.added_hash_count += 1
|
|
64
|
+
|
|
65
|
+
# We have calculated the current hash of the file,
|
|
66
|
+
# Let's check if we can verify it, too:
|
|
67
|
+
file_path = Path(entry.path)
|
|
68
|
+
compare_result = check_sha256sums(
|
|
69
|
+
file_path=file_path,
|
|
70
|
+
file_hash=file_hash,
|
|
71
|
+
)
|
|
72
|
+
if compare_result is True:
|
|
73
|
+
rebuild_result.hash_verified_count += 1
|
|
74
|
+
elif compare_result is False:
|
|
75
|
+
rebuild_result.hash_mismatch_count += 1
|
|
76
|
+
elif compare_result is None:
|
|
77
|
+
rebuild_result.hash_not_found_count += 1
|
|
78
|
+
store_hash(
|
|
79
|
+
file_path=file_path,
|
|
80
|
+
file_hash=file_hash,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def rebuild(backup_root: Path) -> RebuildResult:
    """
    Rebuild the size and hash databases by processing all files below ``backup_root``.

    The process is terminated with exit code 1 if ``backup_root`` is not a directory
    or if it doesn't contain the ".phlb" configuration directory.
    An exception while processing a single file is logged and counted,
    the rebuild continues with the next file.

    :param backup_root: root directory of the backups
    :return: the counters of this rebuild run
    """
    backup_root = backup_root.resolve()
    if not backup_root.is_dir():
        print(f'Error: Backup directory "{backup_root}" does not exist!')
        sys.exit(1)

    conf_dir = backup_root / '.phlb'
    if not conf_dir.is_dir():
        print(
            f'Error: Backup directory "{backup_root}" seems to be wrong:'
            f' Our hidden ".phlb" configuration directory is missing!'
        )
        sys.exit(1)

    # The totals are needed for the progress display:
    total_file_count, total_bytes = humanized_fs_scan(backup_root, excludes={'.phlb'})

    with DisplayFileTreeProgress(total_file_count, total_bytes) as progress:
        # The "databases" used for the deduplication:
        size_db = FileSizeDatabase(conf_dir)
        hash_db = FileHashDatabase(backup_root, conf_dir)

        rebuild_result = RebuildResult()

        def refresh_progress() -> None:
            progress.update(
                completed_file_count=rebuild_result.process_count,
                completed_size=rebuild_result.process_size,
            )

        refresh_due = 0
        for entry in iter_scandir_files(backup_root, excludes={'.phlb'}):
            try:
                rebuild_one_file(
                    entry=entry,
                    size_db=size_db,
                    hash_db=hash_db,
                    rebuild_result=rebuild_result,
                )
            except Exception as err:
                # Log the problem, but go on with the next file:
                logger.exception(f'Backup {entry.path} {err.__class__.__name__}: {err}')
                rebuild_result.error_count += 1
                continue

            # Refresh the progress display at most every half second:
            now = time.monotonic()
            if now < refresh_due:
                continue
            refresh_progress()
            refresh_due = now + 0.5

        # Display the final values in any case:
        refresh_progress()

    report = [
        f'\nRebuild "{backup_root}" completed:',
        f' Total files processed: {rebuild_result.process_count}',
        f' Total size processed: {human_filesize(rebuild_result.process_size)}',
        f' Added file size information entries: {rebuild_result.added_size_count}',
        f' Added file hash entries: {rebuild_result.added_hash_count}',
    ]
    if rebuild_result.error_count > 0:
        report.append(f' Errors during rebuild: {rebuild_result.error_count} (see log for details)')

    report += [
        '\nSHA256SUMS verification results:',
        f' Successfully verified files: {rebuild_result.hash_verified_count}',
        f' File hash mismatches: {rebuild_result.hash_mismatch_count}',
        f' File hashes not found, newly stored: {rebuild_result.hash_not_found_count}',
    ]
    for line in report:
        print(line)

    print()

    return rebuild_result
|
|
@@ -13,14 +13,14 @@ from bx_py_utils.test_utils.assertion import assert_text_equal
|
|
|
13
13
|
from bx_py_utils.test_utils.datetime import parse_dt
|
|
14
14
|
from bx_py_utils.test_utils.log_utils import NoLogs
|
|
15
15
|
from bx_py_utils.test_utils.redirect import RedirectOut
|
|
16
|
+
from cli_base.cli_tools.test_utils.base_testcases import BaseTestCase
|
|
16
17
|
from freezegun import freeze_time
|
|
17
18
|
from tabulate import tabulate
|
|
18
19
|
|
|
19
20
|
from PyHardLinkBackup.backup import BackupResult, backup_tree
|
|
20
21
|
from PyHardLinkBackup.constants import CHUNK_SIZE
|
|
21
22
|
from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
|
|
22
|
-
from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
|
|
23
|
-
from PyHardLinkBackup.utilities.tests.base_testcases import BaseTestCase
|
|
23
|
+
from PyHardLinkBackup.utilities.filesystem import copy_and_hash, iter_scandir_files
|
|
24
24
|
from PyHardLinkBackup.utilities.tests.test_file_hash_database import assert_hash_db_info
|
|
25
25
|
|
|
26
26
|
|
|
@@ -184,6 +184,7 @@ class BackupTreeTestCase(BaseTestCase):
|
|
|
184
184
|
copied_size=67109915,
|
|
185
185
|
copied_small_files=3,
|
|
186
186
|
copied_small_size=50,
|
|
187
|
+
error_count=0,
|
|
187
188
|
),
|
|
188
189
|
)
|
|
189
190
|
|
|
@@ -267,6 +268,7 @@ class BackupTreeTestCase(BaseTestCase):
|
|
|
267
268
|
copied_size=50,
|
|
268
269
|
copied_small_files=3,
|
|
269
270
|
copied_small_size=50,
|
|
271
|
+
error_count=0,
|
|
270
272
|
),
|
|
271
273
|
)
|
|
272
274
|
# The second backup:
|
|
@@ -360,6 +362,7 @@ class BackupTreeTestCase(BaseTestCase):
|
|
|
360
362
|
copied_size=1050,
|
|
361
363
|
copied_small_files=3,
|
|
362
364
|
copied_small_size=50,
|
|
365
|
+
error_count=0,
|
|
363
366
|
),
|
|
364
367
|
)
|
|
365
368
|
|
|
@@ -454,6 +457,7 @@ class BackupTreeTestCase(BaseTestCase):
|
|
|
454
457
|
copied_size=31,
|
|
455
458
|
copied_small_files=1,
|
|
456
459
|
copied_small_size=31,
|
|
460
|
+
error_count=0,
|
|
457
461
|
),
|
|
458
462
|
)
|
|
459
463
|
|
|
@@ -474,3 +478,65 @@ class BackupTreeTestCase(BaseTestCase):
|
|
|
474
478
|
Symlinks are not stored in our FileHashDatabase, because they are not considered for hardlinking."""
|
|
475
479
|
with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
|
|
476
480
|
assert_hash_db_info(backup_root=backup_root, expected='')
|
|
481
|
+
|
|
482
|
+
def test_error_handling(self):
    """An exception for one file is logged and counted, the other files are still backed up."""
    with tempfile.TemporaryDirectory() as temp_dir:
        base_path = Path(temp_dir)

        src_root = base_path / 'source'
        src_root.mkdir()
        backup_root = base_path / 'backup'
        backup_root.mkdir()

        for number in (1, 2, 3):
            (src_root / f'file{number}.txt').write_text(f'File {number}')

        # Use a fixed modification time for all source files:
        set_file_times(src_root, dt=parse_dt('2026-01-01T12:00:00+0000'))

        def failing_copy_and_hash(src: Path, dst: Path):
            # Only the second file fails, all other files use the real function
            if src.name != 'file2.txt':
                return copy_and_hash(src, dst)
            raise PermissionError('Bam!')

        with (
            self.assertLogs(level=logging.ERROR) as captured_logs,
            patch('PyHardLinkBackup.backup.iter_scandir_files', SortedIterScandirFiles),
            patch('PyHardLinkBackup.backup.copy_and_hash', failing_copy_and_hash),
            freeze_time('2026-01-01T12:34:56Z', auto_tick_seconds=0),
            RedirectOut() as redirected_out,
        ):
            result = backup_tree(
                src_root=src_root,
                backup_root=backup_root,
                excludes={'.cache'},
            )

        # The backup was finished and the error is mentioned in the summary:
        self.assertEqual(redirected_out.stderr, '')
        for expected_text in ('Backup complete', 'Errors during backup:'):
            self.assertIn(expected_text, redirected_out.stdout)

        # The exception was logged with a traceback:
        log_text = ''.join(captured_logs.output)
        self.assertIn(
            f'ERROR:PyHardLinkBackup.backup:Backup {src_root / "file2.txt"} PermissionError: Bam!\n',
            log_text,
        )
        self.assertIn('\nTraceback (most recent call last):\n', log_text)

        # All three files are counted, but only two of them were copied:
        expected_result = BackupResult(
            backup_dir=result.backup_dir,
            backup_count=3,
            backup_size=18,
            symlink_files=0,
            hardlinked_files=0,
            hardlinked_size=0,
            copied_files=2,
            copied_size=12,
            copied_small_files=2,
            copied_small_size=12,
            error_count=1,
        )
        self.assertEqual(result, expected_result)
|
|
@@ -19,7 +19,8 @@ class DocuWriteApiTestCase(TestCase):
|
|
|
19
19
|
"""
|
|
20
20
|
assert_is_file(PACKAGE_ROOT / 'pyproject.toml')
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
with self.assertLogs():
|
|
23
|
+
info: GeneratedInfo = generate(base_path=PACKAGE_ROOT)
|
|
23
24
|
self.assertGreaterEqual(len(info.paths), 1)
|
|
24
25
|
self.assertEqual(info.update_count, 0, 'No files should be updated, commit the changes')
|
|
25
26
|
self.assertEqual(info.remove_count, 0, 'No files should be removed, commit the changes')
|