PyHardLinkBackup 1.0.0rc0__tar.gz → 1.0.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyhardlinkbackup-1.0.0rc1/.run/only DocWrite.run.xml +24 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PKG-INFO +4 -2
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/__init__.py +2 -3
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/backup.py +20 -8
- pyhardlinkbackup-1.0.0rc1/PyHardLinkBackup/tests/test_backup.py +399 -0
- pyhardlinkbackup-1.0.0rc1/PyHardLinkBackup/tests/test_doc_write.py +25 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/file_hash_database.py +7 -2
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/file_size_database.py +16 -10
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/filesystem.py +20 -9
- pyhardlinkbackup-1.0.0rc1/PyHardLinkBackup/utilities/tests/test_file_hash_database.py +134 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/tests/test_file_size_database.py +12 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/tests/test_filesystem.py +6 -2
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/README.md +3 -1
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/cli.py +1 -1
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/dev-cli.py +1 -1
- pyhardlinkbackup-1.0.0rc1/docs/README.md +57 -0
- pyhardlinkbackup-1.0.0rc1/docs/about-docs.md +8 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/pyproject.toml +11 -2
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/uv.lock +17 -6
- pyhardlinkbackup-1.0.0rc0/PyHardLinkBackup/tests/test_backup.py +0 -188
- pyhardlinkbackup-1.0.0rc0/PyHardLinkBackup/utilities/tests/test_file_hash_database.py +0 -68
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.editorconfig +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.github/workflows/tests.yml +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.gitignore +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.idea/.gitignore +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.pre-commit-config.yaml +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.pre-commit-hooks.yaml +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/Template Python tests.run.xml +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/Unittests - __all__.run.xml +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/cli.py --help.run.xml +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/dev-cli update.run.xml +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/only DocTests.run.xml +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.venv-app/lib/python3.12/site-packages/cli_base/tests/shell_complete_snapshots/.gitignore +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/__main__.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_app/__init__.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_app/benchmark.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_app/phlb.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/__init__.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/code_style.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/packaging.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/shell_completion.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/testing.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/update_readme_history.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/constants.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/__init__.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/test_doctests.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/test_project_setup.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/test_readme.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/test_readme_history.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/__init__.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/humanize.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/rich_utils.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/tests/__init__.py +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/dist/.gitignore +0 -0
- {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/noxfile.py +0 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
<component name="ProjectRunConfigurationManager">
|
|
2
|
+
<configuration default="false" name="only DocWrite" type="tests" factoryName="Unittests">
|
|
3
|
+
<module name="PyHardLinkBackup" />
|
|
4
|
+
<option name="ENV_FILES" value="" />
|
|
5
|
+
<option name="INTERPRETER_OPTIONS" value="" />
|
|
6
|
+
<option name="PARENT_ENVS" value="true" />
|
|
7
|
+
<envs>
|
|
8
|
+
<env name="PYTHONUNBUFFERED" value="1" />
|
|
9
|
+
<env name="PYTHONWARNINGS" value="always" />
|
|
10
|
+
</envs>
|
|
11
|
+
<option name="SDK_HOME" value="" />
|
|
12
|
+
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
|
|
13
|
+
<option name="IS_MODULE_SDK" value="true" />
|
|
14
|
+
<option name="ADD_CONTENT_ROOTS" value="false" />
|
|
15
|
+
<option name="ADD_SOURCE_ROOTS" value="false" />
|
|
16
|
+
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
|
17
|
+
<option name="RUN_TOOL" value="" />
|
|
18
|
+
<option name="_new_pattern" value="""" />
|
|
19
|
+
<option name="_new_additionalArguments" value="""" />
|
|
20
|
+
<option name="_new_target" value=""PyHardLinkBackup.tests.test_doc_write"" />
|
|
21
|
+
<option name="_new_targetType" value=""PYTHON"" />
|
|
22
|
+
<method v="2" />
|
|
23
|
+
</configuration>
|
|
24
|
+
</component>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PyHardLinkBackup
|
|
3
|
-
Version: 1.0.0rc0
|
|
3
|
+
Version: 1.0.0rc1
|
|
4
4
|
Summary: HardLink/Deduplication Backups with Python
|
|
5
5
|
Project-URL: Documentation, https://github.com/jedie/PyHardLinkBackup
|
|
6
6
|
Project-URL: Source, https://github.com/jedie/PyHardLinkBackup
|
|
@@ -190,7 +190,9 @@ v1 is a complete rewrite of PyHardLinkBackup.
|
|
|
190
190
|
|
|
191
191
|
[comment]: <> (✂✂✂ auto generated history start ✂✂✂)
|
|
192
192
|
|
|
193
|
-
* [v1.0.0rc0](https://github.com/jedie/PyHardLinkBackup/compare/v0.13.0...v1.0.0rc0)
|
|
193
|
+
* [v1.0.0rc1](https://github.com/jedie/PyHardLinkBackup/compare/v0.13.0...v1.0.0rc1)
|
|
194
|
+
* 2026-01-13 - Rename [project.scripts] hooks
|
|
195
|
+
* 2026-01-13 - Add DocWrite, handle broken symlinks, keep file meta, handle missing hardlink sources
|
|
194
196
|
* 2026-01-12 - First working iteration with rich progress bar
|
|
195
197
|
* 2026-01-08 - Rewrite everything
|
|
196
198
|
* [v0.13.0](https://github.com/jedie/PyHardLinkBackup/compare/v0.12.3...v0.13.0)
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
"""
|
|
2
|
-
PyHardLinkBackup
|
|
1
|
+
"""DocWrite: README.md # PyHardLinkBackup
|
|
3
2
|
HardLink/Deduplication Backups with Python
|
|
4
3
|
"""
|
|
5
4
|
|
|
6
5
|
# See https://packaging.python.org/en/latest/specifications/version-specifiers/
|
|
7
|
-
__version__ = '1.0.0rc0'
|
|
6
|
+
__version__ = '1.0.0rc1'
|
|
8
7
|
__author__ = 'Jens Diemer <PyHardLinkBackup@jensdiemer.de>'
|
|
@@ -84,7 +84,22 @@ def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> Bac
|
|
|
84
84
|
next_update = 0
|
|
85
85
|
for entry in iter_scandir_files(src_root, excludes=excludes):
|
|
86
86
|
backup_count += 1
|
|
87
|
-
|
|
87
|
+
src_path = Path(entry.path)
|
|
88
|
+
|
|
89
|
+
dst_path = backup_dir / src_path.relative_to(src_root)
|
|
90
|
+
dst_dir_path = dst_path.parent
|
|
91
|
+
if not dst_dir_path.exists():
|
|
92
|
+
dst_dir_path.mkdir(parents=True, exist_ok=False)
|
|
93
|
+
|
|
94
|
+
try:
|
|
95
|
+
size = entry.stat().st_size
|
|
96
|
+
except FileNotFoundError:
|
|
97
|
+
# e.g.: Handle broken symlink
|
|
98
|
+
target = os.readlink(src_path)
|
|
99
|
+
dst_path.symlink_to(target)
|
|
100
|
+
symlink_files += 1
|
|
101
|
+
continue
|
|
102
|
+
|
|
88
103
|
backup_size += size
|
|
89
104
|
|
|
90
105
|
now = time.monotonic()
|
|
@@ -92,14 +107,8 @@ def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> Bac
|
|
|
92
107
|
progress.update(backup_count=backup_count, backup_size=backup_size)
|
|
93
108
|
next_update = now + 0.5
|
|
94
109
|
|
|
95
|
-
src_path = Path(entry.path)
|
|
96
|
-
dst_path = backup_dir / src_path.relative_to(src_root)
|
|
97
|
-
|
|
98
|
-
dst_path.parent.mkdir(parents=True, exist_ok=True)
|
|
99
|
-
|
|
100
110
|
if entry.is_symlink():
|
|
101
111
|
logger.debug('Copy symlink: %s to %s', src_path, dst_path)
|
|
102
|
-
# Copy symlinks as-is
|
|
103
112
|
target = os.readlink(src_path)
|
|
104
113
|
dst_path.symlink_to(target)
|
|
105
114
|
symlink_files += 1
|
|
@@ -152,10 +161,12 @@ def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> Bac
|
|
|
152
161
|
hardlinked_size += size
|
|
153
162
|
else:
|
|
154
163
|
logger.info('Copy unique file: %s to %s', src_path, dst_path)
|
|
155
|
-
shutil.copy2(src_path, dst_path)
|
|
156
164
|
hash_db[file_hash] = dst_path
|
|
157
165
|
copied_files += 1
|
|
158
166
|
copied_size += size
|
|
167
|
+
|
|
168
|
+
# Keep original file metadata (permission bits, time stamps, and flags)
|
|
169
|
+
shutil.copy2(src_path, dst_path)
|
|
159
170
|
else:
|
|
160
171
|
# A file with this size not backuped before -> Can't be duplicate -> copy and hash
|
|
161
172
|
file_hash = copy_and_hash(src_path, dst_path)
|
|
@@ -164,6 +175,7 @@ def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> Bac
|
|
|
164
175
|
copied_files += 1
|
|
165
176
|
copied_size += size
|
|
166
177
|
|
|
178
|
+
# Finalize progress indicator values:
|
|
167
179
|
progress.update(backup_count=backup_count, backup_size=backup_size)
|
|
168
180
|
|
|
169
181
|
print(f'\nBackup complete: {backup_dir} (total size {human_filesize(backup_size)})\n')
|
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import os
|
|
3
|
+
import tempfile
|
|
4
|
+
import textwrap
|
|
5
|
+
import zlib
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from unittest import TestCase
|
|
8
|
+
|
|
9
|
+
from bx_py_utils.test_utils.assertion import assert_text_equal
|
|
10
|
+
from bx_py_utils.test_utils.datetime import parse_dt
|
|
11
|
+
from freezegun import freeze_time
|
|
12
|
+
from tabulate import tabulate
|
|
13
|
+
|
|
14
|
+
from PyHardLinkBackup.backup import BackupResult, backup_tree
|
|
15
|
+
from PyHardLinkBackup.constants import CHUNK_SIZE
|
|
16
|
+
from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
|
|
17
|
+
from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
|
|
18
|
+
from PyHardLinkBackup.utilities.tests.test_file_hash_database import assert_hash_db_info
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def set_file_times(path: Path, dt: datetime.datetime):
|
|
22
|
+
# move dt to UTC if it has timezone info:
|
|
23
|
+
if dt.tzinfo is not None:
|
|
24
|
+
dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
|
|
25
|
+
fixed_time = dt.timestamp()
|
|
26
|
+
for entry in iter_scandir_files(path, excludes=set()):
|
|
27
|
+
os.utime(entry.path, (fixed_time, fixed_time))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def fs_tree_overview(root: Path) -> str:
|
|
31
|
+
lines = []
|
|
32
|
+
for entry in iter_scandir_files(root, excludes=set()):
|
|
33
|
+
file_path = Path(entry.path)
|
|
34
|
+
try:
|
|
35
|
+
file_stat = entry.stat()
|
|
36
|
+
except FileNotFoundError:
|
|
37
|
+
crc32 = '-'
|
|
38
|
+
nlink = '-'
|
|
39
|
+
size = '-'
|
|
40
|
+
birthtime = '-'
|
|
41
|
+
else:
|
|
42
|
+
crc32 = zlib.crc32(file_path.read_bytes())
|
|
43
|
+
crc32 = f'{crc32:08x}'
|
|
44
|
+
nlink = file_stat.st_nlink
|
|
45
|
+
size = file_stat.st_size
|
|
46
|
+
birthtime = getattr(file_stat, 'st_birthtime', file_stat.st_mtime)
|
|
47
|
+
birthtime = datetime.datetime.fromtimestamp(birthtime).strftime('%H:%M:%S')
|
|
48
|
+
|
|
49
|
+
if entry.is_symlink():
|
|
50
|
+
file_type = 'symlink'
|
|
51
|
+
elif nlink > 1:
|
|
52
|
+
file_type = 'hardlink'
|
|
53
|
+
else:
|
|
54
|
+
file_type = 'file'
|
|
55
|
+
|
|
56
|
+
lines.append(
|
|
57
|
+
[
|
|
58
|
+
str(file_path.relative_to(root)),
|
|
59
|
+
birthtime,
|
|
60
|
+
file_type,
|
|
61
|
+
nlink,
|
|
62
|
+
size,
|
|
63
|
+
crc32,
|
|
64
|
+
]
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
result = tabulate(sorted(lines), headers=['path', 'birthtime', 'type', 'nlink', 'size', 'CRC32'], tablefmt='plain')
|
|
68
|
+
return result
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def assert_fs_tree_overview(root: Path, expected_overview: str):
|
|
72
|
+
expected_overview = textwrap.dedent(expected_overview).strip()
|
|
73
|
+
actual_overview = fs_tree_overview(root)
|
|
74
|
+
assert_text_equal(
|
|
75
|
+
actual_overview,
|
|
76
|
+
expected_overview,
|
|
77
|
+
msg=f'Filesystem tree overview does not match expected overview.\n\n{actual_overview}\n\n',
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class BackupTreeTestCase(TestCase):
|
|
82
|
+
def test_happy_path(self):
|
|
83
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
84
|
+
temp_path = Path(temp_dir)
|
|
85
|
+
|
|
86
|
+
src_root = temp_path / 'source'
|
|
87
|
+
backup_root = temp_path / 'backup'
|
|
88
|
+
|
|
89
|
+
src_root.mkdir()
|
|
90
|
+
backup_root.mkdir()
|
|
91
|
+
|
|
92
|
+
file1_path = src_root / 'file2.txt'
|
|
93
|
+
file1_path.write_text('This is file 1')
|
|
94
|
+
|
|
95
|
+
(src_root / 'symlink2file1').symlink_to(file1_path)
|
|
96
|
+
os.link(file1_path, src_root / 'hardlink2file1')
|
|
97
|
+
|
|
98
|
+
sub_dir = src_root / 'subdir'
|
|
99
|
+
sub_dir.mkdir()
|
|
100
|
+
(sub_dir / 'file.txt').write_text('This is file in subdir')
|
|
101
|
+
|
|
102
|
+
# Only files bigger than MIN_SIZE will be considered for hardlinking:
|
|
103
|
+
size_db_min_file = src_root / 'min_sized_file1.bin'
|
|
104
|
+
size_db_min_file.write_bytes(b'X' * FileSizeDatabase.MIN_SIZE)
|
|
105
|
+
|
|
106
|
+
# Same content and big enough to be considered for hardlinking:
|
|
107
|
+
size_db_min_file = src_root / 'min_sized_file2.bin'
|
|
108
|
+
size_db_min_file.write_bytes(b'X' * FileSizeDatabase.MIN_SIZE)
|
|
109
|
+
|
|
110
|
+
# Files larger than CHUNK_SIZE will be handled differently:
|
|
111
|
+
large_file = src_root / 'large_file.bin'
|
|
112
|
+
large_file.write_bytes(b'Y' * (CHUNK_SIZE + 1))
|
|
113
|
+
|
|
114
|
+
excluded_dir = src_root / '.cache'
|
|
115
|
+
excluded_dir.mkdir()
|
|
116
|
+
(excluded_dir / 'tempfile.tmp').write_text('Temporary file that should be excluded')
|
|
117
|
+
|
|
118
|
+
# FIXME: freezegun doesn't handle this, see: https://github.com/spulec/freezegun/issues/392
|
|
119
|
+
# Set modification times to a fixed time for easier testing:
|
|
120
|
+
set_file_times(src_root, dt=parse_dt('2026-01-01T12:00:00+0000'))
|
|
121
|
+
|
|
122
|
+
#######################################################################################
|
|
123
|
+
# Create first backup:
|
|
124
|
+
|
|
125
|
+
with freeze_time('2026-01-01T12:34:56Z', auto_tick_seconds=0):
|
|
126
|
+
result = backup_tree(
|
|
127
|
+
src_root=src_root,
|
|
128
|
+
backup_root=backup_root,
|
|
129
|
+
excludes={'.cache'},
|
|
130
|
+
)
|
|
131
|
+
backup_dir = result.backup_dir
|
|
132
|
+
self.assertEqual(
|
|
133
|
+
str(Path(backup_dir).relative_to(temp_path)),
|
|
134
|
+
'backup/source/20260101_123456',
|
|
135
|
+
)
|
|
136
|
+
self.assertEqual(
|
|
137
|
+
result,
|
|
138
|
+
BackupResult(
|
|
139
|
+
backup_dir=backup_dir,
|
|
140
|
+
backup_count=7,
|
|
141
|
+
backup_size=67110929,
|
|
142
|
+
symlink_files=1,
|
|
143
|
+
hardlinked_files=1,
|
|
144
|
+
hardlinked_size=1000,
|
|
145
|
+
copied_files=5,
|
|
146
|
+
copied_size=67109915,
|
|
147
|
+
copied_small_files=3,
|
|
148
|
+
copied_small_size=50,
|
|
149
|
+
),
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
# The sources:
|
|
153
|
+
assert_fs_tree_overview(
|
|
154
|
+
root=src_root,
|
|
155
|
+
expected_overview="""
|
|
156
|
+
path birthtime type nlink size CRC32
|
|
157
|
+
.cache/tempfile.tmp 12:00:00 file 1 38 41d7a2c9
|
|
158
|
+
file2.txt 12:00:00 hardlink 2 14 8a11514a
|
|
159
|
+
hardlink2file1 12:00:00 hardlink 2 14 8a11514a
|
|
160
|
+
large_file.bin 12:00:00 file 1 67108865 9671eaac
|
|
161
|
+
min_sized_file1.bin 12:00:00 file 1 1000 f0d93de4
|
|
162
|
+
min_sized_file2.bin 12:00:00 file 1 1000 f0d93de4
|
|
163
|
+
subdir/file.txt 12:00:00 file 1 22 c0167e63
|
|
164
|
+
symlink2file1 12:00:00 symlink 2 14 8a11514a
|
|
165
|
+
""",
|
|
166
|
+
)
|
|
167
|
+
# The backup:
|
|
168
|
+
# * /.cache/ -> excluded
|
|
169
|
+
# * min_sized_file1.bin and min_sized_file2.bin -> hardlinked
|
|
170
|
+
assert_fs_tree_overview(
|
|
171
|
+
root=backup_dir,
|
|
172
|
+
expected_overview="""
|
|
173
|
+
path birthtime type nlink size CRC32
|
|
174
|
+
file2.txt 12:00:00 file 1 14 8a11514a
|
|
175
|
+
hardlink2file1 12:00:00 file 1 14 8a11514a
|
|
176
|
+
large_file.bin 12:00:00 file 1 67108865 9671eaac
|
|
177
|
+
min_sized_file1.bin 12:00:00 hardlink 2 1000 f0d93de4
|
|
178
|
+
min_sized_file2.bin 12:00:00 hardlink 2 1000 f0d93de4
|
|
179
|
+
subdir/file.txt 12:00:00 file 1 22 c0167e63
|
|
180
|
+
symlink2file1 12:00:00 symlink 2 14 8a11514a
|
|
181
|
+
""",
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
# Let's check our FileHashDatabase:
|
|
185
|
+
assert_hash_db_info(
|
|
186
|
+
backup_root=backup_root,
|
|
187
|
+
expected="""
|
|
188
|
+
bb/c4/bbc4de2ca238d1… -> source/20260101_123456/min_sized_file1.bin
|
|
189
|
+
e6/37/e6374ac11d9049… -> source/20260101_123456/large_file.bin
|
|
190
|
+
""",
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
#######################################################################################
|
|
194
|
+
# Just backup again:
|
|
195
|
+
|
|
196
|
+
with freeze_time('2026-01-02T12:34:56Z', auto_tick_seconds=0):
|
|
197
|
+
result = backup_tree(
|
|
198
|
+
src_root=src_root,
|
|
199
|
+
backup_root=backup_root,
|
|
200
|
+
excludes={'.cache'},
|
|
201
|
+
)
|
|
202
|
+
backup_dir = result.backup_dir
|
|
203
|
+
self.assertEqual(
|
|
204
|
+
str(Path(backup_dir).relative_to(temp_path)),
|
|
205
|
+
'backup/source/20260102_123456',
|
|
206
|
+
)
|
|
207
|
+
self.assertEqual(
|
|
208
|
+
result,
|
|
209
|
+
BackupResult(
|
|
210
|
+
backup_dir=backup_dir,
|
|
211
|
+
backup_count=7,
|
|
212
|
+
backup_size=67110929,
|
|
213
|
+
symlink_files=1,
|
|
214
|
+
hardlinked_files=3, # <<< More hardlinks this time!
|
|
215
|
+
hardlinked_size=67110865,
|
|
216
|
+
copied_files=3,
|
|
217
|
+
copied_size=50,
|
|
218
|
+
copied_small_files=3,
|
|
219
|
+
copied_small_size=50,
|
|
220
|
+
),
|
|
221
|
+
)
|
|
222
|
+
# The second backup:
|
|
223
|
+
# * /.cache/ -> excluded
|
|
224
|
+
# * min_sized_file1.bin and min_sized_file2.bin -> hardlinked
|
|
225
|
+
assert_fs_tree_overview(
|
|
226
|
+
root=backup_dir,
|
|
227
|
+
expected_overview="""
|
|
228
|
+
path birthtime type nlink size CRC32
|
|
229
|
+
file2.txt 12:00:00 file 1 14 8a11514a
|
|
230
|
+
hardlink2file1 12:00:00 file 1 14 8a11514a
|
|
231
|
+
large_file.bin 12:00:00 hardlink 2 67108865 9671eaac
|
|
232
|
+
min_sized_file1.bin 12:00:00 hardlink 4 1000 f0d93de4
|
|
233
|
+
min_sized_file2.bin 12:00:00 hardlink 4 1000 f0d93de4
|
|
234
|
+
subdir/file.txt 12:00:00 file 1 22 c0167e63
|
|
235
|
+
symlink2file1 12:00:00 symlink 2 14 8a11514a
|
|
236
|
+
""",
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
# The FileHashDatabase remains the same:
|
|
240
|
+
assert_hash_db_info(
|
|
241
|
+
backup_root=backup_root,
|
|
242
|
+
expected="""
|
|
243
|
+
bb/c4/bbc4de2ca238d1… -> source/20260101_123456/min_sized_file1.bin
|
|
244
|
+
e6/37/e6374ac11d9049… -> source/20260101_123456/large_file.bin
|
|
245
|
+
""",
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
#######################################################################################
|
|
249
|
+
# Don't create broken hardlinks!
|
|
250
|
+
|
|
251
|
+
"""DocWrite: README.md ## FileHashDatabase - Missing hardlink target file
|
|
252
|
+
If a hardlink source from an old backup is missing, we cannot create a hardlink to it.
|
|
253
|
+
But it still works to hardlink same files within the current backup.
|
|
254
|
+
"""
|
|
255
|
+
|
|
256
|
+
# Let's remove one of the files used for hardlinking from the first backup:
|
|
257
|
+
(backup_root / 'source/20260101_123456/min_sized_file1.bin').unlink()
|
|
258
|
+
|
|
259
|
+
# Backup again:
|
|
260
|
+
with freeze_time('2026-01-03T12:34:56Z', auto_tick_seconds=0):
|
|
261
|
+
result = backup_tree(
|
|
262
|
+
src_root=src_root,
|
|
263
|
+
backup_root=backup_root,
|
|
264
|
+
excludes={'.cache'},
|
|
265
|
+
)
|
|
266
|
+
backup_dir = result.backup_dir
|
|
267
|
+
self.assertEqual(
|
|
268
|
+
result,
|
|
269
|
+
BackupResult(
|
|
270
|
+
backup_dir=backup_dir,
|
|
271
|
+
backup_count=7,
|
|
272
|
+
backup_size=67110929,
|
|
273
|
+
symlink_files=1,
|
|
274
|
+
hardlinked_files=2, # <<< Less hardlinks this time, because of missing link source!
|
|
275
|
+
hardlinked_size=67109865,
|
|
276
|
+
copied_files=4,
|
|
277
|
+
copied_size=1050,
|
|
278
|
+
copied_small_files=3,
|
|
279
|
+
copied_small_size=50,
|
|
280
|
+
),
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
# Note: min_sized_file1.bin and min_sized_file2.bin are hardlinked,
|
|
284
|
+
# but not with the first backup anymore! So it's only nlink=2 now!
|
|
285
|
+
assert_fs_tree_overview(
|
|
286
|
+
root=backup_dir,
|
|
287
|
+
expected_overview="""
|
|
288
|
+
path birthtime type nlink size CRC32
|
|
289
|
+
file2.txt 12:00:00 file 1 14 8a11514a
|
|
290
|
+
hardlink2file1 12:00:00 file 1 14 8a11514a
|
|
291
|
+
large_file.bin 12:00:00 hardlink 3 67108865 9671eaac
|
|
292
|
+
min_sized_file1.bin 12:00:00 hardlink 2 1000 f0d93de4
|
|
293
|
+
min_sized_file2.bin 12:00:00 hardlink 2 1000 f0d93de4
|
|
294
|
+
subdir/file.txt 12:00:00 file 1 22 c0167e63
|
|
295
|
+
symlink2file1 12:00:00 symlink 2 14 8a11514a
|
|
296
|
+
""",
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
# Note: min_sized_file1.bin is now from the 2026-01-03 backup!
|
|
300
|
+
self.assertEqual(backup_dir.name, '20260103_123456') # Latest backup dir name
|
|
301
|
+
assert_hash_db_info(
|
|
302
|
+
backup_root=backup_root,
|
|
303
|
+
expected="""
|
|
304
|
+
bb/c4/bbc4de2ca238d1… -> source/20260103_123456/min_sized_file1.bin
|
|
305
|
+
e6/37/e6374ac11d9049… -> source/20260101_123456/large_file.bin
|
|
306
|
+
""",
|
|
307
|
+
)
|
|
308
|
+
|
|
309
|
+
def test_symlink(self):
|
|
310
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
311
|
+
temp_path = Path(temp_dir)
|
|
312
|
+
|
|
313
|
+
src_root = temp_path / 'src'
|
|
314
|
+
backup_root = temp_path / 'bak'
|
|
315
|
+
|
|
316
|
+
src_root.mkdir()
|
|
317
|
+
backup_root.mkdir()
|
|
318
|
+
|
|
319
|
+
source_file_path = src_root / 'source_file.txt'
|
|
320
|
+
source_file_path.write_text('File in the "source" directory.')
|
|
321
|
+
|
|
322
|
+
symlink2source_file_path = src_root / 'symlink2source'
|
|
323
|
+
symlink2source_file_path.symlink_to(source_file_path)
|
|
324
|
+
self.assertEqual(symlink2source_file_path.read_text(), 'File in the "source" directory.')
|
|
325
|
+
|
|
326
|
+
outside_file_path = temp_path / 'outside_file.txt'
|
|
327
|
+
outside_file_path.write_text('File outside the "source" directory!')
|
|
328
|
+
|
|
329
|
+
symlink2outside_file_path = src_root / 'symlink2outside'
|
|
330
|
+
symlink2outside_file_path.symlink_to(outside_file_path)
|
|
331
|
+
self.assertEqual(symlink2outside_file_path.read_text(), 'File outside the "source" directory!')
|
|
332
|
+
|
|
333
|
+
# FIXME: freezegun doesn't handle this, see: https://github.com/spulec/freezegun/issues/392
|
|
334
|
+
# Set modification times to a fixed time for easier testing:
|
|
335
|
+
set_file_times(src_root, dt=parse_dt('2026-01-01T12:00:00+0000'))
|
|
336
|
+
|
|
337
|
+
broken_symlink_path = src_root / 'broken_symlink'
|
|
338
|
+
broken_symlink_path.symlink_to(temp_path / 'not/existing/file.txt')
|
|
339
|
+
broken_symlink_path.is_symlink()
|
|
340
|
+
|
|
341
|
+
#######################################################################################
|
|
342
|
+
# Create first backup:
|
|
343
|
+
|
|
344
|
+
with freeze_time('2026-01-01T12:34:56Z', auto_tick_seconds=0):
|
|
345
|
+
result = backup_tree(src_root=src_root, backup_root=backup_root, excludes=set())
|
|
346
|
+
backup_dir1 = result.backup_dir
|
|
347
|
+
self.assertEqual(
|
|
348
|
+
str(Path(backup_dir1).relative_to(temp_path)),
|
|
349
|
+
'bak/src/20260101_123456',
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
assert_fs_tree_overview(
|
|
353
|
+
root=temp_path, # The complete overview of source + backup and outside file
|
|
354
|
+
expected_overview="""
|
|
355
|
+
path birthtime type nlink size CRC32
|
|
356
|
+
bak/src/20260101_123456/broken_symlink - symlink - - -
|
|
357
|
+
bak/src/20260101_123456/source_file.txt 12:00:00 file 1 31 9309a10c
|
|
358
|
+
bak/src/20260101_123456/symlink2outside 12:00:00 symlink 1 36 24b5bf4c
|
|
359
|
+
bak/src/20260101_123456/symlink2source 12:00:00 symlink 1 31 9309a10c
|
|
360
|
+
outside_file.txt 12:00:00 file 1 36 24b5bf4c
|
|
361
|
+
src/broken_symlink - symlink - - -
|
|
362
|
+
src/source_file.txt 12:00:00 file 1 31 9309a10c
|
|
363
|
+
src/symlink2outside 12:00:00 symlink 1 36 24b5bf4c
|
|
364
|
+
src/symlink2source 12:00:00 symlink 1 31 9309a10c
|
|
365
|
+
""",
|
|
366
|
+
)
|
|
367
|
+
|
|
368
|
+
self.assertEqual(
|
|
369
|
+
result,
|
|
370
|
+
BackupResult(
|
|
371
|
+
backup_dir=backup_dir1,
|
|
372
|
+
backup_count=4,
|
|
373
|
+
backup_size=98,
|
|
374
|
+
symlink_files=3,
|
|
375
|
+
hardlinked_files=0,
|
|
376
|
+
hardlinked_size=0,
|
|
377
|
+
copied_files=1,
|
|
378
|
+
copied_size=31,
|
|
379
|
+
copied_small_files=1,
|
|
380
|
+
copied_small_size=31,
|
|
381
|
+
),
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
"""DocWrite: README.md ## backup implementation - Symlinks
|
|
385
|
+
Symlinks are copied as symlinks in the backup."""
|
|
386
|
+
self.assertEqual(
|
|
387
|
+
(backup_dir1 / 'symlink2outside').read_text(),
|
|
388
|
+
'File outside the "source" directory!',
|
|
389
|
+
)
|
|
390
|
+
self.assertEqual(
|
|
391
|
+
(backup_dir1 / 'symlink2source').read_text(),
|
|
392
|
+
'File in the "source" directory.',
|
|
393
|
+
)
|
|
394
|
+
self.assertEqual((backup_dir1 / 'symlink2outside').readlink(), outside_file_path)
|
|
395
|
+
self.assertEqual((backup_dir1 / 'symlink2source').readlink(), source_file_path)
|
|
396
|
+
|
|
397
|
+
"""DocWrite: README.md ## backup implementation - Symlinks
|
|
398
|
+
Symlinks are not stored in our FileHashDatabase, because they are not considered for hardlinking."""
|
|
399
|
+
assert_hash_db_info(backup_root=backup_root, expected='')
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from unittest import TestCase
|
|
2
|
+
|
|
3
|
+
from bx_py_utils.doc_write.api import GeneratedInfo, generate
|
|
4
|
+
from bx_py_utils.path import assert_is_file
|
|
5
|
+
|
|
6
|
+
from PyHardLinkBackup.cli_dev import PACKAGE_ROOT
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DocuWriteApiTestCase(TestCase):
|
|
10
|
+
def test_up2date_docs(self):
|
|
11
|
+
"""DocWrite: about-docs.md # generate Doc-Write
|
|
12
|
+
|
|
13
|
+
These documentation files are generated automatically with the "Doc-Write" tool.
|
|
14
|
+
They are updated automatically by unittests.
|
|
15
|
+
|
|
16
|
+
More information about Doc-Write can be found here:
|
|
17
|
+
|
|
18
|
+
https://github.com/boxine/bx_py_utils/tree/master/bx_py_utils/doc_write
|
|
19
|
+
"""
|
|
20
|
+
assert_is_file(PACKAGE_ROOT / 'pyproject.toml')
|
|
21
|
+
|
|
22
|
+
info: GeneratedInfo = generate(base_path=PACKAGE_ROOT)
|
|
23
|
+
self.assertGreaterEqual(len(info.paths), 1)
|
|
24
|
+
self.assertEqual(info.update_count, 0, 'No files should be updated, commit the changes')
|
|
25
|
+
self.assertEqual(info.remove_count, 0, 'No files should be removed, commit the changes')
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
|
|
3
4
|
|
|
@@ -6,8 +7,8 @@ class HashAlreadyExistsError(ValueError):
|
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class FileHashDatabase:
|
|
9
|
-
"""
|
|
10
|
-
A simple database to store file content hash <-> relative path mappings.
|
|
10
|
+
"""DocWrite: README.md ## FileHashDatabase
|
|
11
|
+
A simple "database" to store file content hash <-> relative path mappings.
|
|
11
12
|
Uses a directory structure to avoid too many files in a single directory.
|
|
12
13
|
Path structure:
|
|
13
14
|
{base_dst}/.phlb/hash-lookup/{XX}/{YY}/{hash}
|
|
@@ -39,6 +40,10 @@ class FileHashDatabase:
|
|
|
39
40
|
return None
|
|
40
41
|
else:
|
|
41
42
|
abs_file_path = self.backup_root / rel_file_path
|
|
43
|
+
if not abs_file_path.is_file():
|
|
44
|
+
logging.warning('Hash database entry found, but file does not exist: %s', abs_file_path)
|
|
45
|
+
hash_path.unlink()
|
|
46
|
+
return None
|
|
42
47
|
return abs_file_path
|
|
43
48
|
|
|
44
49
|
def __setitem__(self, hash: str, abs_file_path: Path):
|
|
@@ -2,17 +2,11 @@ from pathlib import Path
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
class FileSizeDatabase:
|
|
5
|
-
"""
|
|
6
|
-
A simple database to track which file sizes have been seen.
|
|
7
|
-
Uses a directory structure to avoid too many files in a single directory.
|
|
5
|
+
"""DocWrite: README.md ## FileSizeDatabase
|
|
6
|
+
A simple "database" to track which file sizes have been seen.
|
|
8
7
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
e.g.:
|
|
12
|
-
1234567890 results in: {base_dst}/.phlb/size-lookup/12/34/1234567890
|
|
13
|
-
|
|
14
|
-
Notes:
|
|
15
|
-
* We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.
|
|
8
|
+
Uses a directory structure to avoid too many files in a single directory.
|
|
9
|
+
We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.
|
|
16
10
|
"""
|
|
17
11
|
|
|
18
12
|
MIN_SIZE = 1000 # no padding is made, so the min size is 1000 bytes!
|
|
@@ -24,6 +18,15 @@ class FileSizeDatabase:
|
|
|
24
18
|
def _get_size_path(self, size: int) -> Path:
|
|
25
19
|
assert size >= self.MIN_SIZE, f'Size must be at least {self.MIN_SIZE} bytes'
|
|
26
20
|
size_str = str(size)
|
|
21
|
+
|
|
22
|
+
"""DocWrite: README.md ## FileSizeDatabase
|
|
23
|
+
Path structure:
|
|
24
|
+
* `{base_dst}/.phlb/size-lookup/{XX}/{YY}/{size}`
|
|
25
|
+
|
|
26
|
+
e.g.:
|
|
27
|
+
|
|
28
|
+
* `1234567890` results in: `{base_dst}/.phlb/size-lookup/12/34/1234567890`
|
|
29
|
+
"""
|
|
27
30
|
first_dir_name = size_str[:2]
|
|
28
31
|
second_dir_name = size_str[2:4]
|
|
29
32
|
size_path = self.base_path / first_dir_name / second_dir_name / size_str
|
|
@@ -37,4 +40,7 @@ class FileSizeDatabase:
|
|
|
37
40
|
size_path = self._get_size_path(size)
|
|
38
41
|
if not size_path.exists():
|
|
39
42
|
size_path.parent.mkdir(parents=True, exist_ok=True)
|
|
43
|
+
|
|
44
|
+
"""DocWrite: README.md ## FileSizeDatabase
|
|
45
|
+
All files are created empty, as we only care about their existence."""
|
|
40
46
|
size_path.touch(exist_ok=False)
|
{pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/filesystem.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
import logging
|
|
3
3
|
import os
|
|
4
|
+
import shutil
|
|
4
5
|
import time
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from typing import Iterable
|
|
@@ -36,6 +37,10 @@ def copy_and_hash(src: Path, dst: Path) -> str:
|
|
|
36
37
|
while chunk := source_file.read(CHUNK_SIZE):
|
|
37
38
|
dst_file.write(chunk)
|
|
38
39
|
hasher.update(chunk)
|
|
40
|
+
|
|
41
|
+
# Keep original file metadata (permission bits, last access time, last modification time, and flags)
|
|
42
|
+
shutil.copystat(src, dst)
|
|
43
|
+
|
|
39
44
|
file_hash = hasher.hexdigest()
|
|
40
45
|
logger.info('%s backup to %s with %s hash: %s', src, dst, HASH_ALGO, file_hash)
|
|
41
46
|
return file_hash
|
|
@@ -55,14 +60,16 @@ def iter_scandir_files(path: Path, excludes: set[str]) -> Iterable[os.DirEntry]:
|
|
|
55
60
|
Recursively yield all files+symlinks in the given directory.
|
|
56
61
|
"""
|
|
57
62
|
logger.debug('Scanning directory %s', path)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
63
|
+
with os.scandir(path) as scandir_iterator:
|
|
64
|
+
for entry in scandir_iterator:
|
|
65
|
+
if entry.is_dir(follow_symlinks=True):
|
|
66
|
+
if entry.name in excludes:
|
|
67
|
+
logger.debug('Excluding directory %s', entry.path)
|
|
68
|
+
continue
|
|
69
|
+
yield from iter_scandir_files(Path(entry.path), excludes=excludes)
|
|
70
|
+
else:
|
|
71
|
+
# It's a file or symlink or broken symlink
|
|
72
|
+
yield entry
|
|
66
73
|
|
|
67
74
|
|
|
68
75
|
def humanized_fs_scan(path: Path, excludes: set[str]) -> tuple[int, int]:
|
|
@@ -91,7 +98,11 @@ def humanized_fs_scan(path: Path, excludes: set[str]) -> tuple[int, int]:
|
|
|
91
98
|
with progress:
|
|
92
99
|
for entry in iter_scandir_files(path, excludes=excludes):
|
|
93
100
|
file_count += 1
|
|
94
|
-
|
|
101
|
+
try:
|
|
102
|
+
total_size += entry.stat().st_size
|
|
103
|
+
except FileNotFoundError:
|
|
104
|
+
# e.g.: broken symlink
|
|
105
|
+
continue
|
|
95
106
|
|
|
96
107
|
now = time.time()
|
|
97
108
|
if now >= next_update:
|