PyHardLinkBackup 1.0.0rc0__tar.gz → 1.0.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. pyhardlinkbackup-1.0.0rc1/.run/only DocWrite.run.xml +24 -0
  2. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PKG-INFO +4 -2
  3. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/__init__.py +2 -3
  4. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/backup.py +20 -8
  5. pyhardlinkbackup-1.0.0rc1/PyHardLinkBackup/tests/test_backup.py +399 -0
  6. pyhardlinkbackup-1.0.0rc1/PyHardLinkBackup/tests/test_doc_write.py +25 -0
  7. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/file_hash_database.py +7 -2
  8. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/file_size_database.py +16 -10
  9. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/filesystem.py +20 -9
  10. pyhardlinkbackup-1.0.0rc1/PyHardLinkBackup/utilities/tests/test_file_hash_database.py +134 -0
  11. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/tests/test_file_size_database.py +12 -0
  12. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/tests/test_filesystem.py +6 -2
  13. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/README.md +3 -1
  14. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/cli.py +1 -1
  15. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/dev-cli.py +1 -1
  16. pyhardlinkbackup-1.0.0rc1/docs/README.md +57 -0
  17. pyhardlinkbackup-1.0.0rc1/docs/about-docs.md +8 -0
  18. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/pyproject.toml +11 -2
  19. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/uv.lock +17 -6
  20. pyhardlinkbackup-1.0.0rc0/PyHardLinkBackup/tests/test_backup.py +0 -188
  21. pyhardlinkbackup-1.0.0rc0/PyHardLinkBackup/utilities/tests/test_file_hash_database.py +0 -68
  22. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.editorconfig +0 -0
  23. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.github/workflows/tests.yml +0 -0
  24. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.gitignore +0 -0
  25. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.idea/.gitignore +0 -0
  26. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.pre-commit-config.yaml +0 -0
  27. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.pre-commit-hooks.yaml +0 -0
  28. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/Template Python tests.run.xml +0 -0
  29. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/Unittests - __all__.run.xml +0 -0
  30. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/cli.py --help.run.xml +0 -0
  31. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/dev-cli update.run.xml +0 -0
  32. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.run/only DocTests.run.xml +0 -0
  33. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/.venv-app/lib/python3.12/site-packages/cli_base/tests/shell_complete_snapshots/.gitignore +0 -0
  34. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/__main__.py +0 -0
  35. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_app/__init__.py +0 -0
  36. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_app/benchmark.py +0 -0
  37. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_app/phlb.py +0 -0
  38. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/__init__.py +0 -0
  39. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/code_style.py +0 -0
  40. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/packaging.py +0 -0
  41. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/shell_completion.py +0 -0
  42. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/testing.py +0 -0
  43. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/cli_dev/update_readme_history.py +0 -0
  44. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/constants.py +0 -0
  45. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/__init__.py +0 -0
  46. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/test_doctests.py +0 -0
  47. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/test_project_setup.py +0 -0
  48. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/test_readme.py +0 -0
  49. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/tests/test_readme_history.py +0 -0
  50. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/__init__.py +0 -0
  51. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/humanize.py +0 -0
  52. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/rich_utils.py +0 -0
  53. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/PyHardLinkBackup/utilities/tests/__init__.py +0 -0
  54. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/dist/.gitignore +0 -0
  55. {pyhardlinkbackup-1.0.0rc0 → pyhardlinkbackup-1.0.0rc1}/noxfile.py +0 -0
@@ -0,0 +1,24 @@
1
+ <component name="ProjectRunConfigurationManager">
2
+ <configuration default="false" name="only DocWrite" type="tests" factoryName="Unittests">
3
+ <module name="PyHardLinkBackup" />
4
+ <option name="ENV_FILES" value="" />
5
+ <option name="INTERPRETER_OPTIONS" value="" />
6
+ <option name="PARENT_ENVS" value="true" />
7
+ <envs>
8
+ <env name="PYTHONUNBUFFERED" value="1" />
9
+ <env name="PYTHONWARNINGS" value="always" />
10
+ </envs>
11
+ <option name="SDK_HOME" value="" />
12
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
13
+ <option name="IS_MODULE_SDK" value="true" />
14
+ <option name="ADD_CONTENT_ROOTS" value="false" />
15
+ <option name="ADD_SOURCE_ROOTS" value="false" />
16
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
17
+ <option name="RUN_TOOL" value="" />
18
+ <option name="_new_pattern" value="&quot;&quot;" />
19
+ <option name="_new_additionalArguments" value="&quot;&quot;" />
20
+ <option name="_new_target" value="&quot;PyHardLinkBackup.tests.test_doc_write&quot;" />
21
+ <option name="_new_targetType" value="&quot;PYTHON&quot;" />
22
+ <method v="2" />
23
+ </configuration>
24
+ </component>
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: PyHardLinkBackup
3
- Version: 1.0.0rc0
3
+ Version: 1.0.0rc1
4
4
  Summary: HardLink/Deduplication Backups with Python
5
5
  Project-URL: Documentation, https://github.com/jedie/PyHardLinkBackup
6
6
  Project-URL: Source, https://github.com/jedie/PyHardLinkBackup
@@ -190,7 +190,9 @@ v1 is a complete rewrite of PyHardLinkBackup.
190
190
 
191
191
  [comment]: <> (✂✂✂ auto generated history start ✂✂✂)
192
192
 
193
- * [v1.0.0rc0](https://github.com/jedie/PyHardLinkBackup/compare/v0.13.0...v1.0.0rc0)
193
+ * [v1.0.0rc1](https://github.com/jedie/PyHardLinkBackup/compare/v0.13.0...v1.0.0rc1)
194
+ * 2026-01-13 - Rename [project.scripts] hooks
195
+ * 2026-01-13 - Add DocWrite, handle broken symlinks, keep file meta, handle missing hardlink sources
194
196
  * 2026-01-12 - First working iteration with rich progess bar
195
197
  * 2026-01-08 - Rewrite everything
196
198
  * [v0.13.0](https://github.com/jedie/PyHardLinkBackup/compare/v0.12.3...v0.13.0)
@@ -1,8 +1,7 @@
1
- """
2
- PyHardLinkBackup
1
+ """DocWrite: README.md # PyHardLinkBackup
3
2
  HardLink/Deduplication Backups with Python
4
3
  """
5
4
 
6
5
  # See https://packaging.python.org/en/latest/specifications/version-specifiers/
7
- __version__ = '1.0.0rc0'
6
+ __version__ = '1.0.0rc1'
8
7
  __author__ = 'Jens Diemer <PyHardLinkBackup@jensdiemer.de>'
@@ -84,7 +84,22 @@ def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> Bac
84
84
  next_update = 0
85
85
  for entry in iter_scandir_files(src_root, excludes=excludes):
86
86
  backup_count += 1
87
- size = entry.stat().st_size
87
+ src_path = Path(entry.path)
88
+
89
+ dst_path = backup_dir / src_path.relative_to(src_root)
90
+ dst_dir_path = dst_path.parent
91
+ if not dst_dir_path.exists():
92
+ dst_dir_path.mkdir(parents=True, exist_ok=False)
93
+
94
+ try:
95
+ size = entry.stat().st_size
96
+ except FileNotFoundError:
97
+ # e.g.: Handle broken symlink
98
+ target = os.readlink(src_path)
99
+ dst_path.symlink_to(target)
100
+ symlink_files += 1
101
+ continue
102
+
88
103
  backup_size += size
89
104
 
90
105
  now = time.monotonic()
@@ -92,14 +107,8 @@ def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> Bac
92
107
  progress.update(backup_count=backup_count, backup_size=backup_size)
93
108
  next_update = now + 0.5
94
109
 
95
- src_path = Path(entry.path)
96
- dst_path = backup_dir / src_path.relative_to(src_root)
97
-
98
- dst_path.parent.mkdir(parents=True, exist_ok=True)
99
-
100
110
  if entry.is_symlink():
101
111
  logger.debug('Copy symlink: %s to %s', src_path, dst_path)
102
- # Copy symlinks as-is
103
112
  target = os.readlink(src_path)
104
113
  dst_path.symlink_to(target)
105
114
  symlink_files += 1
@@ -152,10 +161,12 @@ def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> Bac
152
161
  hardlinked_size += size
153
162
  else:
154
163
  logger.info('Copy unique file: %s to %s', src_path, dst_path)
155
- shutil.copy2(src_path, dst_path)
156
164
  hash_db[file_hash] = dst_path
157
165
  copied_files += 1
158
166
  copied_size += size
167
+
168
+ # Keep original file metadata (permission bits, time stamps, and flags)
169
+ shutil.copy2(src_path, dst_path)
159
170
  else:
160
171
  # A file with this size has not been backed up before -> Can't be duplicate -> copy and hash
161
172
  file_hash = copy_and_hash(src_path, dst_path)
@@ -164,6 +175,7 @@ def backup_tree(*, src_root: Path, backup_root: Path, excludes: set[str]) -> Bac
164
175
  copied_files += 1
165
176
  copied_size += size
166
177
 
178
+ # Finalize progress indicator values:
167
179
  progress.update(backup_count=backup_count, backup_size=backup_size)
168
180
 
169
181
  print(f'\nBackup complete: {backup_dir} (total size {human_filesize(backup_size)})\n')
@@ -0,0 +1,399 @@
1
+ import datetime
2
+ import os
3
+ import tempfile
4
+ import textwrap
5
+ import zlib
6
+ from pathlib import Path
7
+ from unittest import TestCase
8
+
9
+ from bx_py_utils.test_utils.assertion import assert_text_equal
10
+ from bx_py_utils.test_utils.datetime import parse_dt
11
+ from freezegun import freeze_time
12
+ from tabulate import tabulate
13
+
14
+ from PyHardLinkBackup.backup import BackupResult, backup_tree
15
+ from PyHardLinkBackup.constants import CHUNK_SIZE
16
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
17
+ from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
18
+ from PyHardLinkBackup.utilities.tests.test_file_hash_database import assert_hash_db_info
19
+
20
+
21
+ def set_file_times(path: Path, dt: datetime.datetime):
22
+ # move dt to UTC if it has timezone info:
23
+ if dt.tzinfo is not None:
24
+ dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
25
+ fixed_time = dt.timestamp()
26
+ for entry in iter_scandir_files(path, excludes=set()):
27
+ os.utime(entry.path, (fixed_time, fixed_time))
28
+
29
+
30
+ def fs_tree_overview(root: Path) -> str:
31
+ lines = []
32
+ for entry in iter_scandir_files(root, excludes=set()):
33
+ file_path = Path(entry.path)
34
+ try:
35
+ file_stat = entry.stat()
36
+ except FileNotFoundError:
37
+ crc32 = '-'
38
+ nlink = '-'
39
+ size = '-'
40
+ birthtime = '-'
41
+ else:
42
+ crc32 = zlib.crc32(file_path.read_bytes())
43
+ crc32 = f'{crc32:08x}'
44
+ nlink = file_stat.st_nlink
45
+ size = file_stat.st_size
46
+ birthtime = getattr(file_stat, 'st_birthtime', file_stat.st_mtime)
47
+ birthtime = datetime.datetime.fromtimestamp(birthtime).strftime('%H:%M:%S')
48
+
49
+ if entry.is_symlink():
50
+ file_type = 'symlink'
51
+ elif nlink > 1:
52
+ file_type = 'hardlink'
53
+ else:
54
+ file_type = 'file'
55
+
56
+ lines.append(
57
+ [
58
+ str(file_path.relative_to(root)),
59
+ birthtime,
60
+ file_type,
61
+ nlink,
62
+ size,
63
+ crc32,
64
+ ]
65
+ )
66
+
67
+ result = tabulate(sorted(lines), headers=['path', 'birthtime', 'type', 'nlink', 'size', 'CRC32'], tablefmt='plain')
68
+ return result
69
+
70
+
71
+ def assert_fs_tree_overview(root: Path, expected_overview: str):
72
+ expected_overview = textwrap.dedent(expected_overview).strip()
73
+ actual_overview = fs_tree_overview(root)
74
+ assert_text_equal(
75
+ actual_overview,
76
+ expected_overview,
77
+ msg=f'Filesystem tree overview does not match expected overview.\n\n{actual_overview}\n\n',
78
+ )
79
+
80
+
81
+ class BackupTreeTestCase(TestCase):
82
+ def test_happy_path(self):
83
+ with tempfile.TemporaryDirectory() as temp_dir:
84
+ temp_path = Path(temp_dir)
85
+
86
+ src_root = temp_path / 'source'
87
+ backup_root = temp_path / 'backup'
88
+
89
+ src_root.mkdir()
90
+ backup_root.mkdir()
91
+
92
+ file1_path = src_root / 'file2.txt'
93
+ file1_path.write_text('This is file 1')
94
+
95
+ (src_root / 'symlink2file1').symlink_to(file1_path)
96
+ os.link(file1_path, src_root / 'hardlink2file1')
97
+
98
+ sub_dir = src_root / 'subdir'
99
+ sub_dir.mkdir()
100
+ (sub_dir / 'file.txt').write_text('This is file in subdir')
101
+
102
+ # Only files bigger than MIN_SIZE will be considered for hardlinking:
103
+ size_db_min_file = src_root / 'min_sized_file1.bin'
104
+ size_db_min_file.write_bytes(b'X' * FileSizeDatabase.MIN_SIZE)
105
+
106
+ # Same content and big enough to be considered for hardlinking:
107
+ size_db_min_file = src_root / 'min_sized_file2.bin'
108
+ size_db_min_file.write_bytes(b'X' * FileSizeDatabase.MIN_SIZE)
109
+
110
+ # A file larger than CHUNK_SIZE will be handled differently:
111
+ large_file = src_root / 'large_file.bin'
112
+ large_file.write_bytes(b'Y' * (CHUNK_SIZE + 1))
113
+
114
+ excluded_dir = src_root / '.cache'
115
+ excluded_dir.mkdir()
116
+ (excluded_dir / 'tempfile.tmp').write_text('Temporary file that should be excluded')
117
+
118
+ # FIXME: freezegun doesn't handle this, see: https://github.com/spulec/freezegun/issues/392
119
+ # Set modification times to a fixed time for easier testing:
120
+ set_file_times(src_root, dt=parse_dt('2026-01-01T12:00:00+0000'))
121
+
122
+ #######################################################################################
123
+ # Create first backup:
124
+
125
+ with freeze_time('2026-01-01T12:34:56Z', auto_tick_seconds=0):
126
+ result = backup_tree(
127
+ src_root=src_root,
128
+ backup_root=backup_root,
129
+ excludes={'.cache'},
130
+ )
131
+ backup_dir = result.backup_dir
132
+ self.assertEqual(
133
+ str(Path(backup_dir).relative_to(temp_path)),
134
+ 'backup/source/20260101_123456',
135
+ )
136
+ self.assertEqual(
137
+ result,
138
+ BackupResult(
139
+ backup_dir=backup_dir,
140
+ backup_count=7,
141
+ backup_size=67110929,
142
+ symlink_files=1,
143
+ hardlinked_files=1,
144
+ hardlinked_size=1000,
145
+ copied_files=5,
146
+ copied_size=67109915,
147
+ copied_small_files=3,
148
+ copied_small_size=50,
149
+ ),
150
+ )
151
+
152
+ # The sources:
153
+ assert_fs_tree_overview(
154
+ root=src_root,
155
+ expected_overview="""
156
+ path birthtime type nlink size CRC32
157
+ .cache/tempfile.tmp 12:00:00 file 1 38 41d7a2c9
158
+ file2.txt 12:00:00 hardlink 2 14 8a11514a
159
+ hardlink2file1 12:00:00 hardlink 2 14 8a11514a
160
+ large_file.bin 12:00:00 file 1 67108865 9671eaac
161
+ min_sized_file1.bin 12:00:00 file 1 1000 f0d93de4
162
+ min_sized_file2.bin 12:00:00 file 1 1000 f0d93de4
163
+ subdir/file.txt 12:00:00 file 1 22 c0167e63
164
+ symlink2file1 12:00:00 symlink 2 14 8a11514a
165
+ """,
166
+ )
167
+ # The backup:
168
+ # * /.cache/ -> excluded
169
+ # * min_sized_file1.bin and min_sized_file2.bin -> hardlinked
170
+ assert_fs_tree_overview(
171
+ root=backup_dir,
172
+ expected_overview="""
173
+ path birthtime type nlink size CRC32
174
+ file2.txt 12:00:00 file 1 14 8a11514a
175
+ hardlink2file1 12:00:00 file 1 14 8a11514a
176
+ large_file.bin 12:00:00 file 1 67108865 9671eaac
177
+ min_sized_file1.bin 12:00:00 hardlink 2 1000 f0d93de4
178
+ min_sized_file2.bin 12:00:00 hardlink 2 1000 f0d93de4
179
+ subdir/file.txt 12:00:00 file 1 22 c0167e63
180
+ symlink2file1 12:00:00 symlink 2 14 8a11514a
181
+ """,
182
+ )
183
+
184
+ # Let's check our FileHashDatabase:
185
+ assert_hash_db_info(
186
+ backup_root=backup_root,
187
+ expected="""
188
+ bb/c4/bbc4de2ca238d1… -> source/20260101_123456/min_sized_file1.bin
189
+ e6/37/e6374ac11d9049… -> source/20260101_123456/large_file.bin
190
+ """,
191
+ )
192
+
193
+ #######################################################################################
194
+ # Just backup again:
195
+
196
+ with freeze_time('2026-01-02T12:34:56Z', auto_tick_seconds=0):
197
+ result = backup_tree(
198
+ src_root=src_root,
199
+ backup_root=backup_root,
200
+ excludes={'.cache'},
201
+ )
202
+ backup_dir = result.backup_dir
203
+ self.assertEqual(
204
+ str(Path(backup_dir).relative_to(temp_path)),
205
+ 'backup/source/20260102_123456',
206
+ )
207
+ self.assertEqual(
208
+ result,
209
+ BackupResult(
210
+ backup_dir=backup_dir,
211
+ backup_count=7,
212
+ backup_size=67110929,
213
+ symlink_files=1,
214
+ hardlinked_files=3, # <<< More hardlinks this time!
215
+ hardlinked_size=67110865,
216
+ copied_files=3,
217
+ copied_size=50,
218
+ copied_small_files=3,
219
+ copied_small_size=50,
220
+ ),
221
+ )
222
+ # The second backup:
223
+ # * /.cache/ -> excluded
224
+ # * min_sized_file1.bin and min_sized_file2.bin -> hardlinked
225
+ assert_fs_tree_overview(
226
+ root=backup_dir,
227
+ expected_overview="""
228
+ path birthtime type nlink size CRC32
229
+ file2.txt 12:00:00 file 1 14 8a11514a
230
+ hardlink2file1 12:00:00 file 1 14 8a11514a
231
+ large_file.bin 12:00:00 hardlink 2 67108865 9671eaac
232
+ min_sized_file1.bin 12:00:00 hardlink 4 1000 f0d93de4
233
+ min_sized_file2.bin 12:00:00 hardlink 4 1000 f0d93de4
234
+ subdir/file.txt 12:00:00 file 1 22 c0167e63
235
+ symlink2file1 12:00:00 symlink 2 14 8a11514a
236
+ """,
237
+ )
238
+
239
+ # The FileHashDatabase remains the same:
240
+ assert_hash_db_info(
241
+ backup_root=backup_root,
242
+ expected="""
243
+ bb/c4/bbc4de2ca238d1… -> source/20260101_123456/min_sized_file1.bin
244
+ e6/37/e6374ac11d9049… -> source/20260101_123456/large_file.bin
245
+ """,
246
+ )
247
+
248
+ #######################################################################################
249
+ # Don't create broken hardlinks!
250
+
251
+ """DocWrite: README.md ## FileHashDatabase - Missing hardlink target file
252
+ If a hardlink source from a old backup is missing, we cannot create a hardlink to it.
253
+ But it still works to hardlink same files within the current backup.
254
+ """
255
+
256
+ # Let's remove one of the files used for hardlinking from the first backup:
257
+ (backup_root / 'source/20260101_123456/min_sized_file1.bin').unlink()
258
+
259
+ # Backup again:
260
+ with freeze_time('2026-01-03T12:34:56Z', auto_tick_seconds=0):
261
+ result = backup_tree(
262
+ src_root=src_root,
263
+ backup_root=backup_root,
264
+ excludes={'.cache'},
265
+ )
266
+ backup_dir = result.backup_dir
267
+ self.assertEqual(
268
+ result,
269
+ BackupResult(
270
+ backup_dir=backup_dir,
271
+ backup_count=7,
272
+ backup_size=67110929,
273
+ symlink_files=1,
274
+ hardlinked_files=2, # <<< Fewer hardlinks this time, because of missing link source!
275
+ hardlinked_size=67109865,
276
+ copied_files=4,
277
+ copied_size=1050,
278
+ copied_small_files=3,
279
+ copied_small_size=50,
280
+ ),
281
+ )
282
+
283
+ # Note: min_sized_file1.bin and min_sized_file2.bin are hardlinked,
284
+ # but not with the first backup anymore! So it's only nlink=2 now!
285
+ assert_fs_tree_overview(
286
+ root=backup_dir,
287
+ expected_overview="""
288
+ path birthtime type nlink size CRC32
289
+ file2.txt 12:00:00 file 1 14 8a11514a
290
+ hardlink2file1 12:00:00 file 1 14 8a11514a
291
+ large_file.bin 12:00:00 hardlink 3 67108865 9671eaac
292
+ min_sized_file1.bin 12:00:00 hardlink 2 1000 f0d93de4
293
+ min_sized_file2.bin 12:00:00 hardlink 2 1000 f0d93de4
294
+ subdir/file.txt 12:00:00 file 1 22 c0167e63
295
+ symlink2file1 12:00:00 symlink 2 14 8a11514a
296
+ """,
297
+ )
298
+
299
+ # Note: min_sized_file1.bin is now from the 2026-01-03 backup!
300
+ self.assertEqual(backup_dir.name, '20260103_123456') # Latest backup dir name
301
+ assert_hash_db_info(
302
+ backup_root=backup_root,
303
+ expected="""
304
+ bb/c4/bbc4de2ca238d1… -> source/20260103_123456/min_sized_file1.bin
305
+ e6/37/e6374ac11d9049… -> source/20260101_123456/large_file.bin
306
+ """,
307
+ )
308
+
309
+ def test_symlink(self):
310
+ with tempfile.TemporaryDirectory() as temp_dir:
311
+ temp_path = Path(temp_dir)
312
+
313
+ src_root = temp_path / 'src'
314
+ backup_root = temp_path / 'bak'
315
+
316
+ src_root.mkdir()
317
+ backup_root.mkdir()
318
+
319
+ source_file_path = src_root / 'source_file.txt'
320
+ source_file_path.write_text('File in the "source" directory.')
321
+
322
+ symlink2source_file_path = src_root / 'symlink2source'
323
+ symlink2source_file_path.symlink_to(source_file_path)
324
+ self.assertEqual(symlink2source_file_path.read_text(), 'File in the "source" directory.')
325
+
326
+ outside_file_path = temp_path / 'outside_file.txt'
327
+ outside_file_path.write_text('File outside the "source" directory!')
328
+
329
+ symlink2outside_file_path = src_root / 'symlink2outside'
330
+ symlink2outside_file_path.symlink_to(outside_file_path)
331
+ self.assertEqual(symlink2outside_file_path.read_text(), 'File outside the "source" directory!')
332
+
333
+ # FIXME: freezegun doesn't handle this, see: https://github.com/spulec/freezegun/issues/392
334
+ # Set modification times to a fixed time for easier testing:
335
+ set_file_times(src_root, dt=parse_dt('2026-01-01T12:00:00+0000'))
336
+
337
+ broken_symlink_path = src_root / 'broken_symlink'
338
+ broken_symlink_path.symlink_to(temp_path / 'not/existing/file.txt')
339
+ broken_symlink_path.is_symlink()
340
+
341
+ #######################################################################################
342
+ # Create first backup:
343
+
344
+ with freeze_time('2026-01-01T12:34:56Z', auto_tick_seconds=0):
345
+ result = backup_tree(src_root=src_root, backup_root=backup_root, excludes=set())
346
+ backup_dir1 = result.backup_dir
347
+ self.assertEqual(
348
+ str(Path(backup_dir1).relative_to(temp_path)),
349
+ 'bak/src/20260101_123456',
350
+ )
351
+
352
+ assert_fs_tree_overview(
353
+ root=temp_path, # The complete overview of source + backup and outside file
354
+ expected_overview="""
355
+ path birthtime type nlink size CRC32
356
+ bak/src/20260101_123456/broken_symlink - symlink - - -
357
+ bak/src/20260101_123456/source_file.txt 12:00:00 file 1 31 9309a10c
358
+ bak/src/20260101_123456/symlink2outside 12:00:00 symlink 1 36 24b5bf4c
359
+ bak/src/20260101_123456/symlink2source 12:00:00 symlink 1 31 9309a10c
360
+ outside_file.txt 12:00:00 file 1 36 24b5bf4c
361
+ src/broken_symlink - symlink - - -
362
+ src/source_file.txt 12:00:00 file 1 31 9309a10c
363
+ src/symlink2outside 12:00:00 symlink 1 36 24b5bf4c
364
+ src/symlink2source 12:00:00 symlink 1 31 9309a10c
365
+ """,
366
+ )
367
+
368
+ self.assertEqual(
369
+ result,
370
+ BackupResult(
371
+ backup_dir=backup_dir1,
372
+ backup_count=4,
373
+ backup_size=98,
374
+ symlink_files=3,
375
+ hardlinked_files=0,
376
+ hardlinked_size=0,
377
+ copied_files=1,
378
+ copied_size=31,
379
+ copied_small_files=1,
380
+ copied_small_size=31,
381
+ ),
382
+ )
383
+
384
+ """DocWrite: README.md ## backup implementation - Symlinks
385
+ Symlinks are copied as symlinks in the backup."""
386
+ self.assertEqual(
387
+ (backup_dir1 / 'symlink2outside').read_text(),
388
+ 'File outside the "source" directory!',
389
+ )
390
+ self.assertEqual(
391
+ (backup_dir1 / 'symlink2source').read_text(),
392
+ 'File in the "source" directory.',
393
+ )
394
+ self.assertEqual((backup_dir1 / 'symlink2outside').readlink(), outside_file_path)
395
+ self.assertEqual((backup_dir1 / 'symlink2source').readlink(), source_file_path)
396
+
397
+ """DocWrite: README.md ## backup implementation - Symlinks
398
+ Symlinks are not stored in our FileHashDatabase, because they are not considered for hardlinking."""
399
+ assert_hash_db_info(backup_root=backup_root, expected='')
@@ -0,0 +1,25 @@
1
+ from unittest import TestCase
2
+
3
+ from bx_py_utils.doc_write.api import GeneratedInfo, generate
4
+ from bx_py_utils.path import assert_is_file
5
+
6
+ from PyHardLinkBackup.cli_dev import PACKAGE_ROOT
7
+
8
+
9
+ class DocuWriteApiTestCase(TestCase):
10
+ def test_up2date_docs(self):
11
+ """DocWrite: about-docs.md # generate Doc-Write
12
+
13
+ These documentation files are generated automatically with the "Doc-Write" tool.
14
+ They updated automatically by unittests.
15
+
16
+ More information about Doc-Write can be found here:
17
+
18
+ https://github.com/boxine/bx_py_utils/tree/master/bx_py_utils/doc_write
19
+ """
20
+ assert_is_file(PACKAGE_ROOT / 'pyproject.toml')
21
+
22
+ info: GeneratedInfo = generate(base_path=PACKAGE_ROOT)
23
+ self.assertGreaterEqual(len(info.paths), 1)
24
+ self.assertEqual(info.update_count, 0, 'No files should be updated, commit the changes')
25
+ self.assertEqual(info.remove_count, 0, 'No files should be removed, commit the changes')
@@ -1,3 +1,4 @@
1
+ import logging
1
2
  from pathlib import Path
2
3
 
3
4
 
@@ -6,8 +7,8 @@ class HashAlreadyExistsError(ValueError):
6
7
 
7
8
 
8
9
  class FileHashDatabase:
9
- """
10
- A simple database to store file content hash <-> relative path mappings.
10
+ """DocWrite: README.md ## FileHashDatabase
11
+ A simple "database" to store file content hash <-> relative path mappings.
11
12
  Uses a directory structure to avoid too many files in a single directory.
12
13
  Path structure:
13
14
  {base_dst}/.phlb/hash-lookup/{XX}/{YY}/{hash}
@@ -39,6 +40,10 @@ class FileHashDatabase:
39
40
  return None
40
41
  else:
41
42
  abs_file_path = self.backup_root / rel_file_path
43
+ if not abs_file_path.is_file():
44
+ logging.warning('Hash database entry found, but file does not exist: %s', abs_file_path)
45
+ hash_path.unlink()
46
+ return None
42
47
  return abs_file_path
43
48
 
44
49
  def __setitem__(self, hash: str, abs_file_path: Path):
@@ -2,17 +2,11 @@ from pathlib import Path
2
2
 
3
3
 
4
4
  class FileSizeDatabase:
5
- """
6
- A simple database to track which file sizes have been seen.
7
- Uses a directory structure to avoid too many files in a single directory.
5
+ """DocWrite: README.md ## FileSizeDatabase
6
+ A simple "database" to track which file sizes have been seen.
8
7
 
9
- Path structure:
10
- {base_dst}/.phlb/size-lookup/{XX}/{YY}/{size}
11
- e.g.:
12
- 1234567890 results in: {base_dst}/.phlb/size-lookup/12/34/1234567890
13
-
14
- Notes:
15
- * We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.
8
+ Uses a directory structure to avoid too many files in a single directory.
9
+ We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.
16
10
  """
17
11
 
18
12
  MIN_SIZE = 1000 # no padding is made, so the min size is 1000 bytes!
@@ -24,6 +18,15 @@ class FileSizeDatabase:
24
18
  def _get_size_path(self, size: int) -> Path:
25
19
  assert size >= self.MIN_SIZE, f'Size must be at least {self.MIN_SIZE} bytes'
26
20
  size_str = str(size)
21
+
22
+ """DocWrite: README.md ## FileSizeDatabase
23
+ Path structure:
24
+ * `{base_dst}/.phlb/size-lookup/{XX}/{YY}/{size}`
25
+
26
+ e.g.:
27
+
28
+ * `1234567890` results in: `{base_dst}/.phlb/size-lookup/12/34/1234567890`
29
+ """
27
30
  first_dir_name = size_str[:2]
28
31
  second_dir_name = size_str[2:4]
29
32
  size_path = self.base_path / first_dir_name / second_dir_name / size_str
@@ -37,4 +40,7 @@ class FileSizeDatabase:
37
40
  size_path = self._get_size_path(size)
38
41
  if not size_path.exists():
39
42
  size_path.parent.mkdir(parents=True, exist_ok=True)
43
+
44
+ """DocWrite: README.md ## FileSizeDatabase
45
+ All files are created empty, as we only care about their existence."""
40
46
  size_path.touch(exist_ok=False)
@@ -1,6 +1,7 @@
1
1
  import hashlib
2
2
  import logging
3
3
  import os
4
+ import shutil
4
5
  import time
5
6
  from pathlib import Path
6
7
  from typing import Iterable
@@ -36,6 +37,10 @@ def copy_and_hash(src: Path, dst: Path) -> str:
36
37
  while chunk := source_file.read(CHUNK_SIZE):
37
38
  dst_file.write(chunk)
38
39
  hasher.update(chunk)
40
+
41
+ # Keep original file metadata (permission bits, last access time, last modification time, and flags)
42
+ shutil.copystat(src, dst)
43
+
39
44
  file_hash = hasher.hexdigest()
40
45
  logger.info('%s backup to %s with %s hash: %s', src, dst, HASH_ALGO, file_hash)
41
46
  return file_hash
@@ -55,14 +60,16 @@ def iter_scandir_files(path: Path, excludes: set[str]) -> Iterable[os.DirEntry]:
55
60
  Recursively yield all files+symlinks in the given directory.
56
61
  """
57
62
  logger.debug('Scanning directory %s', path)
58
- for entry in os.scandir(path):
59
- if entry.is_file(follow_symlinks=True):
60
- yield entry
61
- elif entry.is_dir(follow_symlinks=True):
62
- if entry.name in excludes:
63
- logger.debug('Excluding directory %s', entry.path)
64
- continue
65
- yield from iter_scandir_files(Path(entry.path), excludes=excludes)
63
+ with os.scandir(path) as scandir_iterator:
64
+ for entry in scandir_iterator:
65
+ if entry.is_dir(follow_symlinks=True):
66
+ if entry.name in excludes:
67
+ logger.debug('Excluding directory %s', entry.path)
68
+ continue
69
+ yield from iter_scandir_files(Path(entry.path), excludes=excludes)
70
+ else:
71
+ # It's a file or symlink or broken symlink
72
+ yield entry
66
73
 
67
74
 
68
75
  def humanized_fs_scan(path: Path, excludes: set[str]) -> tuple[int, int]:
@@ -91,7 +98,11 @@ def humanized_fs_scan(path: Path, excludes: set[str]) -> tuple[int, int]:
91
98
  with progress:
92
99
  for entry in iter_scandir_files(path, excludes=excludes):
93
100
  file_count += 1
94
- total_size += entry.stat().st_size
101
+ try:
102
+ total_size += entry.stat().st_size
103
+ except FileNotFoundError:
104
+ # e.g.: broken symlink
105
+ continue
95
106
 
96
107
  now = time.time()
97
108
  if now >= next_update: