PyHardLinkBackup 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. PyHardLinkBackup/__init__.py +7 -0
  2. PyHardLinkBackup/__main__.py +10 -0
  3. PyHardLinkBackup/backup.py +261 -0
  4. PyHardLinkBackup/cli_app/__init__.py +41 -0
  5. PyHardLinkBackup/cli_app/phlb.py +123 -0
  6. PyHardLinkBackup/cli_dev/__init__.py +70 -0
  7. PyHardLinkBackup/cli_dev/benchmark.py +138 -0
  8. PyHardLinkBackup/cli_dev/code_style.py +12 -0
  9. PyHardLinkBackup/cli_dev/packaging.py +65 -0
  10. PyHardLinkBackup/cli_dev/shell_completion.py +23 -0
  11. PyHardLinkBackup/cli_dev/testing.py +52 -0
  12. PyHardLinkBackup/cli_dev/update_readme_history.py +33 -0
  13. PyHardLinkBackup/compare_backup.py +212 -0
  14. PyHardLinkBackup/constants.py +16 -0
  15. PyHardLinkBackup/logging_setup.py +124 -0
  16. PyHardLinkBackup/rebuild_databases.py +176 -0
  17. PyHardLinkBackup/tests/__init__.py +36 -0
  18. PyHardLinkBackup/tests/test_backup.py +628 -0
  19. PyHardLinkBackup/tests/test_compare_backup.py +86 -0
  20. PyHardLinkBackup/tests/test_doc_write.py +26 -0
  21. PyHardLinkBackup/tests/test_doctests.py +10 -0
  22. PyHardLinkBackup/tests/test_project_setup.py +46 -0
  23. PyHardLinkBackup/tests/test_readme.py +75 -0
  24. PyHardLinkBackup/tests/test_readme_history.py +9 -0
  25. PyHardLinkBackup/tests/test_rebuild_database.py +224 -0
  26. PyHardLinkBackup/utilities/__init__.py +0 -0
  27. PyHardLinkBackup/utilities/file_hash_database.py +62 -0
  28. PyHardLinkBackup/utilities/file_size_database.py +46 -0
  29. PyHardLinkBackup/utilities/filesystem.py +158 -0
  30. PyHardLinkBackup/utilities/humanize.py +39 -0
  31. PyHardLinkBackup/utilities/rich_utils.py +99 -0
  32. PyHardLinkBackup/utilities/sha256sums.py +61 -0
  33. PyHardLinkBackup/utilities/tee.py +40 -0
  34. PyHardLinkBackup/utilities/tests/__init__.py +0 -0
  35. PyHardLinkBackup/utilities/tests/test_file_hash_database.py +143 -0
  36. PyHardLinkBackup/utilities/tests/test_file_size_database.py +138 -0
  37. PyHardLinkBackup/utilities/tests/test_filesystem.py +126 -0
  38. PyHardLinkBackup/utilities/tyro_cli_shared_args.py +12 -0
  39. pyhardlinkbackup-1.5.0.dist-info/METADATA +600 -0
  40. pyhardlinkbackup-1.5.0.dist-info/RECORD +42 -0
  41. pyhardlinkbackup-1.5.0.dist-info/WHEEL +4 -0
  42. pyhardlinkbackup-1.5.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,7 @@
1
"""DocWrite: README.md # PyHardLinkBackup
HardLink/Deduplication Backups with Python
"""

# See https://packaging.python.org/en/latest/specifications/version-specifiers/
__version__ = '1.5.0'
__author__ = 'Jens Diemer <PyHardLinkBackup@jensdiemer.de>'
@@ -0,0 +1,10 @@
1
"""
Allow PyHardLinkBackup to be executable
through `python -m PyHardLinkBackup`.
"""

from PyHardLinkBackup.cli_app import main


# Delegate directly to the sub-command CLI defined in cli_app:
if __name__ == '__main__':
    main()
@@ -0,0 +1,261 @@
1
+ import dataclasses
2
+ import datetime
3
+ import logging
4
+ import os
5
+ import shutil
6
+ import sys
7
+ import time
8
+ from pathlib import Path
9
+
10
+ from rich import print # noqa
11
+
12
+ from PyHardLinkBackup.constants import CHUNK_SIZE
13
+ from PyHardLinkBackup.logging_setup import LoggingManager
14
+ from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
15
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
16
+ from PyHardLinkBackup.utilities.filesystem import (
17
+ copy_and_hash,
18
+ hash_file,
19
+ humanized_fs_scan,
20
+ iter_scandir_files,
21
+ read_and_hash_file,
22
+ supports_hardlinks,
23
+ )
24
+ from PyHardLinkBackup.utilities.humanize import PrintTimingContextManager, human_filesize
25
+ from PyHardLinkBackup.utilities.rich_utils import DisplayFileTreeProgress
26
+ from PyHardLinkBackup.utilities.sha256sums import store_hash
27
+ from PyHardLinkBackup.utilities.tee import TeeStdoutContext
28
+
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
@dataclasses.dataclass
class BackupResult:
    """Mutable accumulator for the statistics of a single backup run."""

    backup_dir: Path  # timestamped destination directory of this run
    log_file: Path  # log file written for this run
    # Overall totals over all processed entries:
    backup_count: int = 0
    backup_size: int = 0
    # Entries stored without copying regular file content:
    symlink_files: int = 0
    hardlinked_files: int = 0
    hardlinked_size: int = 0  # bytes saved by hard-linking duplicates
    # Files whose content was actually copied:
    copied_files: int = 0
    copied_size: int = 0
    # Subset of copied files below the deduplication size threshold:
    copied_small_files: int = 0
    copied_small_size: int = 0
    # Files that raised an exception during backup (logged, run continues):
    error_count: int = 0
52
+
53
+
54
def backup_one_file(
    *,
    src_root: Path,
    entry: os.DirEntry,
    size_db: FileSizeDatabase,
    hash_db: FileHashDatabase,
    backup_dir: Path,
    backup_result: BackupResult,
) -> None:
    """
    Back up one directory entry into `backup_dir`, deduplicating via hard links.

    Strategy:
      * Broken symlinks and symlinks are recreated as links (never followed).
      * Source `SHA256SUMS` files are skipped (our own are generated).
      * Small files (< size_db.MIN_SIZE) are always copied.
      * If a file of the same size was seen before, its hash is computed and
        looked up in `hash_db`: a hit is hard-linked, a miss is copied.
      * A never-seen file size cannot be a duplicate -> copy, record size+hash.

    Updates counters on `backup_result` as a side effect.
    """
    backup_result.backup_count += 1
    src_path = Path(entry.path)

    # Mirror the source-relative path below the backup directory:
    dst_path = backup_dir / src_path.relative_to(src_root)
    dst_dir_path = dst_path.parent
    if not dst_dir_path.exists():
        dst_dir_path.mkdir(parents=True, exist_ok=False)

    try:
        size = entry.stat().st_size
    except FileNotFoundError:
        # e.g.: Handle broken symlink -> recreate the link itself, not its target.
        target = os.readlink(src_path)
        dst_path.symlink_to(target)
        backup_result.symlink_files += 1
        return

    backup_result.backup_size += size

    if entry.name == 'SHA256SUMS':
        # Skip existing SHA256SUMS files in source tree,
        # because we create our own SHA256SUMS files.
        logger.debug('Skip existing SHA256SUMS file: %s', src_path)
        return

    if entry.is_symlink():
        logger.debug('Copy symlink: %s to %s', src_path, dst_path)
        target = os.readlink(src_path)
        dst_path.symlink_to(target)
        backup_result.symlink_files += 1
        return

    # Process regular files
    assert entry.is_file(follow_symlinks=False), f'Unexpected non-file: {src_path}'

    # Deduplication logic

    if size < size_db.MIN_SIZE:
        # Small file -> always copy without deduplication
        logger.info('Copy small file: %s to %s', src_path, dst_path)
        file_hash = copy_and_hash(src_path, dst_path)
        backup_result.copied_files += 1
        backup_result.copied_size += size
        backup_result.copied_small_files += 1
        backup_result.copied_small_size += size
        store_hash(dst_path, file_hash)
        return

    if size in size_db:
        logger.debug('File with size %iBytes found before -> hash: %s', size, src_path)

        if size <= CHUNK_SIZE:
            # File can be read complete into memory
            logger.debug('File size %iBytes <= CHUNK_SIZE (%iBytes) -> read complete into memory', size, CHUNK_SIZE)
            file_content, file_hash = read_and_hash_file(src_path)
            if existing_path := hash_db.get(file_hash):
                logger.info('Hardlink duplicate file: %s to %s', dst_path, existing_path)
                os.link(existing_path, dst_path)
                backup_result.hardlinked_files += 1
                backup_result.hardlinked_size += size
            else:
                logger.info('Store unique file: %s to %s', src_path, dst_path)
                dst_path.write_bytes(file_content)
                hash_db[file_hash] = dst_path
                backup_result.copied_files += 1
                backup_result.copied_size += size

        else:
            # Large file
            file_hash = hash_file(src_path)  # Calculate hash without copying

            if existing_path := hash_db.get(file_hash):
                logger.info('Hardlink duplicate file: %s to %s', dst_path, existing_path)
                os.link(existing_path, dst_path)
                backup_result.hardlinked_files += 1
                backup_result.hardlinked_size += size
            else:
                logger.info('Copy unique file: %s to %s', src_path, dst_path)
                file_hash = copy_and_hash(src_path, dst_path)
                hash_db[file_hash] = dst_path
                backup_result.copied_files += 1
                backup_result.copied_size += size

        # Keep original file metadata (permission bits, time stamps, and flags)
        # NOTE(review): copystat() runs only in this branch — small files and
        # first-seen sizes (else branch below) skip it; confirm that is intended.
        shutil.copystat(src_path, dst_path)
    else:
        # A file with this size was not backed up before -> can't be a duplicate -> copy and hash
        file_hash = copy_and_hash(src_path, dst_path)
        size_db.add(size)
        hash_db[file_hash] = dst_path
        backup_result.copied_files += 1
        backup_result.copied_size += size

    store_hash(dst_path, file_hash)
157
+
158
+
159
def backup_tree(
    *,
    src_root: Path,
    backup_root: Path,
    excludes: tuple[str, ...],
    log_manager: LoggingManager,
) -> BackupResult:
    """
    Back up the whole `src_root` tree into a new timestamped directory below
    `backup_root` and return the collected `BackupResult` statistics.

    Exits the process (exit code 1) if the source/backup directories are
    missing, the backup directory is not writable, or its filesystem does not
    support hard links.
    """
    src_root = src_root.resolve()
    if not src_root.is_dir():
        print('Error: Source directory does not exist!')
        print(f'Please check source directory: "{src_root}"\n')
        sys.exit(1)

    backup_root = backup_root.resolve()
    if not backup_root.is_dir():
        print('Error: Backup directory does not exist!')
        print(f'Please create "{backup_root}" directory first and start again!\n')
        sys.exit(1)

    if not os.access(backup_root, os.W_OK):
        print('Error: No write access to backup directory!')
        print(f'Please check permissions for backup directory: "{backup_root}"\n')
        sys.exit(1)

    if not supports_hardlinks(backup_root):
        print('Error: Filesystem for backup directory does not support hardlinks!')
        print(f'Please check backup directory: "{backup_root}"\n')
        sys.exit(1)

    # Step 1: Scan source directory (file count + total size feed the progress bar):
    excludes: set = set(excludes)
    with PrintTimingContextManager('Filesystem scan completed in'):
        src_file_count, src_total_size = humanized_fs_scan(src_root, excludes=excludes)

    # Per-backup-root configuration/database directory:
    phlb_conf_dir = backup_root / '.phlb'
    phlb_conf_dir.mkdir(parents=False, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')
    backup_main_dir = backup_root / src_root.name
    backup_dir = backup_main_dir / timestamp
    # exist_ok=False: a second run in the same second must fail loudly, not merge.
    backup_dir.mkdir(parents=True, exist_ok=False)

    log_file = backup_main_dir / f'{timestamp}-backup.log'
    log_manager.start_file_logging(log_file)

    logger.info('Backup %s to %s', src_root, backup_dir)

    print(f'\nBackup to {backup_dir}...\n')

    with DisplayFileTreeProgress(src_file_count, src_total_size) as progress:
        # "Databases" for deduplication
        size_db = FileSizeDatabase(phlb_conf_dir)
        hash_db = FileHashDatabase(backup_root, phlb_conf_dir)

        backup_result = BackupResult(backup_dir=backup_dir, log_file=log_file)

        next_update = 0
        for entry in iter_scandir_files(src_root, excludes=excludes):
            try:
                backup_one_file(
                    src_root=src_root,
                    entry=entry,
                    size_db=size_db,
                    hash_db=hash_db,
                    backup_dir=backup_dir,
                    backup_result=backup_result,
                )
            except Exception as err:
                # Log and continue: one broken file must not abort the whole run.
                logger.exception(f'Backup {entry.path} {err.__class__.__name__}: {err}')
                backup_result.error_count += 1
            else:
                # Throttle progress redraws to at most every 0.5 seconds:
                now = time.monotonic()
                if now >= next_update:
                    progress.update(
                        completed_file_count=backup_result.backup_count, completed_size=backup_result.backup_size
                    )
                    next_update = now + 0.5

        # Finalize progress indicator values:
        progress.update(completed_file_count=backup_result.backup_count, completed_size=backup_result.backup_size)

    summary_file = backup_main_dir / f'{timestamp}-summary.txt'
    with TeeStdoutContext(summary_file):
        # Everything printed here goes to stdout AND into the summary file:
        print(f'\nBackup complete: {backup_dir} (total size {human_filesize(backup_result.backup_size)})\n')
        print(f' Total files processed: {backup_result.backup_count}')
        print(f' * Symlinked files: {backup_result.symlink_files}')
        print(
            f' * Hardlinked files: {backup_result.hardlinked_files}'
            f' (saved {human_filesize(backup_result.hardlinked_size)})'
        )
        print(f' * Copied files: {backup_result.copied_files} (total {human_filesize(backup_result.copied_size)})')
        print(
            f' of which small (<{size_db.MIN_SIZE} Bytes)'
            f' files: {backup_result.copied_small_files}'
            f' (total {human_filesize(backup_result.copied_small_size)})'
        )
        if backup_result.error_count > 0:
            print(f' Errors during backup: {backup_result.error_count} (see log for details)')
        print()

    logger.info('Backup completed. Summary created: %s', summary_file)

    return backup_result
@@ -0,0 +1,41 @@
1
+ """
2
+ CLI for usage
3
+ """
4
+
5
+ import logging
6
+ import sys
7
+ from collections.abc import Sequence
8
+
9
+ from cli_base.autodiscover import import_all_files
10
+ from cli_base.cli_tools.version_info import print_version
11
+ from rich import print # noqa
12
+ from tyro.extras import SubcommandApp
13
+
14
+ import PyHardLinkBackup
15
+ from PyHardLinkBackup import constants
16
+
17
+
18
logger = logging.getLogger(__name__)

app = SubcommandApp()

# Register all CLI commands, just by import all files in this package:
# (each module registers its commands via @app.command at import time)
import_all_files(package=__package__, init_file=__file__)
24
+
25
+
26
@app.command
def version():
    """Print version and exit"""
    # The version banner is already emitted by main() on every CLI call,
    # so this command only has to terminate successfully.
    raise SystemExit(0)
31
+
32
+
33
def main(args: Sequence[str] | None = None):
    """Entry point: print the version banner, then dispatch to the sub-command CLI."""
    print_version(PyHardLinkBackup)

    cli_options = dict(
        prog='./cli.py',
        description=constants.CLI_EPILOG,
        use_underscores=False,  # use hyphens instead of underscores
        sort_subcommands=True,
    )
    app.cli(args=args, **cli_options)
@@ -0,0 +1,123 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Annotated
4
+
5
+ import tyro
6
+ from rich import print # noqa
7
+
8
+ from PyHardLinkBackup import compare_backup, rebuild_databases
9
+ from PyHardLinkBackup.backup import backup_tree
10
+ from PyHardLinkBackup.cli_app import app
11
+ from PyHardLinkBackup.logging_setup import (
12
+ DEFAULT_CONSOLE_LOG_LEVEL,
13
+ DEFAULT_LOG_FILE_LEVEL,
14
+ LoggingManager,
15
+ TyroConsoleLogLevelArgType,
16
+ TyroLogFileLevelArgType,
17
+ )
18
+ from PyHardLinkBackup.utilities.tyro_cli_shared_args import (
19
+ DEFAULT_EXCLUDE_DIRECTORIES,
20
+ TyroExcludeDirectoriesArgType,
21
+ )
22
+
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
@app.command
def backup(
    src: Annotated[
        Path,
        tyro.conf.arg(
            metavar='source',
            help='Source directory to back up.',
        ),
    ],
    dst: Annotated[
        Path,
        tyro.conf.arg(
            metavar='destination',
            help='Destination directory for the backup.',
        ),
    ],
    /,
    excludes: TyroExcludeDirectoriesArgType = DEFAULT_EXCLUDE_DIRECTORIES,
    verbosity: TyroConsoleLogLevelArgType = DEFAULT_CONSOLE_LOG_LEVEL,
    log_file_level: TyroLogFileLevelArgType = DEFAULT_LOG_FILE_LEVEL,
) -> None:
    """
    Backup the source directory to the destination directory using hard links for deduplication.
    """
    # Configure console + file log levels for this run:
    log_manager = LoggingManager(
        console_level=verbosity,
        file_level=log_file_level,
    )
    backup_tree(
        src_root=src,
        backup_root=dst,
        excludes=excludes,
        log_manager=log_manager,
    )
61
+
62
+
63
@app.command
def compare(
    src: Annotated[
        Path,
        tyro.conf.arg(
            metavar='source',
            help='Source directory that should be compared with the last backup.',
        ),
    ],
    dst: Annotated[
        Path,
        tyro.conf.arg(
            metavar='destination',
            help='Destination directory with the backups. Will pick the last backup for comparison.',
        ),
    ],
    /,
    excludes: TyroExcludeDirectoriesArgType = DEFAULT_EXCLUDE_DIRECTORIES,
    verbosity: TyroConsoleLogLevelArgType = DEFAULT_CONSOLE_LOG_LEVEL,
    log_file_level: TyroLogFileLevelArgType = DEFAULT_LOG_FILE_LEVEL,
) -> None:
    """
    Compares a source tree with the last backup and validates all known file hashes.
    """
    # Configure console + file log levels for this run:
    log_manager = LoggingManager(
        console_level=verbosity,
        file_level=log_file_level,
    )
    compare_backup.compare_tree(
        src_root=src,
        backup_root=dst,
        excludes=excludes,
        log_manager=log_manager,
    )
97
+
98
+
99
@app.command
def rebuild(
    backup_root: Annotated[
        Path,
        tyro.conf.arg(
            metavar='backup-directory',
            # Fixed duplicated word ("the the") in the user-facing help text:
            help='Root directory of the backups.',
        ),
    ],
    /,
    verbosity: TyroConsoleLogLevelArgType = DEFAULT_CONSOLE_LOG_LEVEL,
    log_file_level: TyroLogFileLevelArgType = DEFAULT_LOG_FILE_LEVEL,
) -> None:
    """
    Rebuild the file hash and size database by scanning all backup files. And also verify SHA256SUMS
    and/or store missing hashes in SHA256SUMS files.
    """
    # Configure console + file log levels before the long-running scan starts:
    log_manager = LoggingManager(
        console_level=verbosity,
        file_level=log_file_level,
    )
    rebuild_databases.rebuild(
        backup_root=backup_root,
        log_manager=log_manager,
    )
@@ -0,0 +1,70 @@
1
+ """
2
+ CLI for development
3
+ """
4
+
5
+ import importlib
6
+ import logging
7
+ import sys
8
+ from collections.abc import Sequence
9
+
10
+ from bx_py_utils.path import assert_is_file
11
+ from cli_base.autodiscover import import_all_files
12
+ from cli_base.cli_tools.dev_tools import run_coverage, run_nox, run_unittest_cli
13
+ from cli_base.cli_tools.version_info import print_version
14
+ from typeguard import install_import_hook
15
+ from tyro.extras import SubcommandApp
16
+
17
+ import PyHardLinkBackup
18
+ from PyHardLinkBackup import constants
19
+
20
+
21
# Check type annotations via typeguard in all tests.
# Sadly we must activate this here and can't do this in ./tests/__init__.py
install_import_hook(packages=('PyHardLinkBackup',))

# reload the module, after the typeguard import hook is activated,
# so its definitions are re-imported under typeguard instrumentation:
importlib.reload(PyHardLinkBackup)


logger = logging.getLogger(__name__)


PACKAGE_ROOT = constants.BASE_PATH.parent
assert_is_file(PACKAGE_ROOT / 'pyproject.toml')  # Exists only in cloned git repo


app = SubcommandApp()


# Register all CLI commands, just by import all files in this package:
import_all_files(package=__package__, init_file=__file__)
41
+
42
+
43
@app.command
def version():
    """Print version and exit"""
    # main() already prints the version on every invocation, so this
    # command only needs to terminate with a success status.
    raise SystemExit(0)
48
+
49
+
50
def main(args: Sequence[str] | None = None):
    """
    Dev CLI entry point: print the version, optionally short-circuit to the
    test runners, then dispatch to the tyro sub-command app.
    """
    print_version(PyHardLinkBackup)

    if len(sys.argv) >= 2:
        # Check if we can just pass a command call to origin CLI:
        # NOTE(review): this inspects sys.argv even when `args` is supplied,
        # so programmatic calls with `args` never hit this pass-through —
        # confirm that is intended.
        command = sys.argv[1]
        command_map = {
            'test': run_unittest_cli,
            'nox': run_nox,
            'coverage': run_coverage,
        }
        if real_func := command_map.get(command):
            # These helpers exit the process themselves (exit_after_run=True).
            real_func(argv=sys.argv, exit_after_run=True)

    app.cli(
        prog='./dev-cli.py',
        description=constants.CLI_EPILOG,
        use_underscores=False,  # use hyphens instead of underscores
        sort_subcommands=True,
        args=args,
    )
@@ -0,0 +1,138 @@
1
+ import collections
2
+ import hashlib
3
+ import logging
4
+ import time
5
+ from pathlib import Path
6
+
7
+ from bx_py_utils.path import assert_is_dir
8
+ from cli_base.cli_tools.verbosity import setup_logging
9
+ from cli_base.tyro_commands import TyroVerbosityArgType
10
+ from rich import print # noqa
11
+
12
+ from PyHardLinkBackup.cli_dev import app
13
+ from PyHardLinkBackup.utilities.filesystem import humanized_fs_scan, iter_scandir_files
14
+ from PyHardLinkBackup.utilities.humanize import PrintTimingContextManager
15
+ from PyHardLinkBackup.utilities.tyro_cli_shared_args import DEFAULT_EXCLUDE_DIRECTORIES, TyroExcludeDirectoriesArgType
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
@app.command
def scan_benchmark(
    base_path: Path,
    /,
    excludes: TyroExcludeDirectoriesArgType = DEFAULT_EXCLUDE_DIRECTORIES,
    verbosity: TyroVerbosityArgType = 1,
) -> None:
    """
    Benchmark our filesystem scan routine.
    """
    setup_logging(verbosity=verbosity)
    # Time one full scan (file count + total size) below `base_path`:
    with PrintTimingContextManager('Filesystem scan completed in'):
        humanized_fs_scan(path=base_path, excludes=set(excludes))
34
+
35
+
36
@app.command
def benchmark_hashes(
    base_path: Path,
    /,
    excludes: TyroExcludeDirectoriesArgType = DEFAULT_EXCLUDE_DIRECTORIES,
    max_duration: int = 30,  # in seconds
    min_file_size: int = 15 * 1024,  # 15 KiB
    max_file_size: int = 100 * 1024 * 1024,  # 100 MiB
    verbosity: TyroVerbosityArgType = 1,
) -> None:
    """
    Benchmark different file hashing algorithms on the given path.

    Reads every regular file under `base_path` whose size falls within
    [min_file_size, max_file_size], hashes its content with every
    `hashlib.algorithms_guaranteed` algorithm, and prints the total time per
    algorithm relative to the raw file-read time. Stops after `max_duration`
    seconds of wall-clock time.
    """
    # Example output:
    #
    # Total files hashed: 220, total size: 1187.7 MiB
    #
    # Results:
    # Total file content read time: 1.7817s
    #
    # sha1       | Total: 0.6827s | 0.4x hash/read
    # sha256     | Total: 0.7189s | 0.4x hash/read
    # sha224     | Total: 0.7375s | 0.4x hash/read
    # sha384     | Total: 1.6552s | 0.9x hash/read
    # blake2b    | Total: 1.6708s | 0.9x hash/read
    # md5        | Total: 1.6870s | 0.9x hash/read
    # sha512     | Total: 1.7269s | 1.0x hash/read
    # shake_128  | Total: 1.9834s | 1.1x hash/read
    # sha3_224   | Total: 2.3006s | 1.3x hash/read
    # sha3_256   | Total: 2.3856s | 1.3x hash/read
    # shake_256  | Total: 2.4375s | 1.4x hash/read
    # blake2s    | Total: 2.5219s | 1.4x hash/read
    # sha3_384   | Total: 3.2596s | 1.8x hash/read
    # sha3_512   | Total: 4.5328s | 2.5x hash/read
    setup_logging(verbosity=verbosity)
    assert_is_dir(base_path)
    print(f'Benchmarking file hashes under: {base_path}')

    print(f'Min file size: {min_file_size} bytes')
    print(f'Max file size: {max_file_size} bytes')
    print(f'Max duration: {max_duration} seconds')

    algorithms = sorted(hashlib.algorithms_guaranteed)
    print(f'\nUsing {len(algorithms)} guaranteed algorithms: {algorithms}')
    print('-' * 80)

    file_count = 0
    total_size = 0
    total_read_time = 0.0
    # BUGFIX: collect samples in a *list* — the original used defaultdict(set),
    # which silently drops duplicate float durations and skews the summed totals.
    results = collections.defaultdict(list)

    # Keep the wall-clock start separate from the perf_counter timestamps below
    # (the original clobbered `start_time` with perf_counter values).
    wall_start = time.time()
    stop_time = wall_start + max_duration
    next_update = wall_start + 2

    with PrintTimingContextManager('Filesystem scan completed in'):
        for dir_entry in iter_scandir_files(path=base_path, excludes=set(excludes)):
            entry_stat = dir_entry.stat()
            file_size = entry_stat.st_size
            if not (min_file_size <= file_size <= max_file_size):
                continue

            # Measure the raw read time as the baseline:
            t0 = time.perf_counter()
            file_content = Path(dir_entry.path).read_bytes()
            total_read_time += time.perf_counter() - t0

            for algo in algorithms:
                # Actual measurement:
                t0 = time.perf_counter()
                hashlib.new(algo, file_content)
                results[algo].append(time.perf_counter() - t0)

            file_count += 1
            total_size += entry_stat.st_size

            now = time.time()
            if now >= stop_time:
                print('Reached max duration limit, stopping benchmark...')
                break

            if now >= next_update:
                percent = (now - wall_start) / max_duration * 100
                print(
                    f'{int(percent)}% Processed {file_count} files so far,'
                    f' total size: {total_size / 1024 / 1024:.1f} MiB...'
                )
                next_update = now + 2

    print(f'\nTotal files hashed: {file_count}, total size: {total_size / 1024 / 1024:.1f} MiB')

    # Guard against ZeroDivisionError when no file matched the size window:
    if not file_count:
        print('No matching files found - nothing to benchmark.')
        return

    print('\nResults:')
    print(f'Total file content read time: {total_read_time:.4f}s\n')

    sorted_results = sorted(
        ((algo, sum(durations)) for algo, durations in results.items()),
        key=lambda x: x[1],  # Sort by total_duration
    )
    for algo, total_duration in sorted_results:
        ratio = total_duration / total_read_time
        print(f'{algo:10} | Total: {total_duration:.4f}s | {ratio:.1f}x hash/read')
@@ -0,0 +1,12 @@
1
+ from cli_base.cli_tools.code_style import assert_code_style
2
+ from cli_base.tyro_commands import TyroVerbosityArgType
3
+
4
+ from PyHardLinkBackup.cli_dev import PACKAGE_ROOT, app
5
+
6
+
7
@app.command
def lint(verbosity: TyroVerbosityArgType = 1):
    """
    Check/fix code style by run: "ruff check --fix"
    """
    # sys_exit=True -> process exits non-zero if style violations remain:
    assert_code_style(package_root=PACKAGE_ROOT, verbose=bool(verbosity), sys_exit=True)
+ assert_code_style(package_root=PACKAGE_ROOT, verbose=bool(verbosity), sys_exit=True)