pyhardlinkbackup-1.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyHardLinkBackup/__init__.py +7 -0
- PyHardLinkBackup/__main__.py +10 -0
- PyHardLinkBackup/backup.py +261 -0
- PyHardLinkBackup/cli_app/__init__.py +41 -0
- PyHardLinkBackup/cli_app/phlb.py +123 -0
- PyHardLinkBackup/cli_dev/__init__.py +70 -0
- PyHardLinkBackup/cli_dev/benchmark.py +138 -0
- PyHardLinkBackup/cli_dev/code_style.py +12 -0
- PyHardLinkBackup/cli_dev/packaging.py +65 -0
- PyHardLinkBackup/cli_dev/shell_completion.py +23 -0
- PyHardLinkBackup/cli_dev/testing.py +52 -0
- PyHardLinkBackup/cli_dev/update_readme_history.py +33 -0
- PyHardLinkBackup/compare_backup.py +212 -0
- PyHardLinkBackup/constants.py +16 -0
- PyHardLinkBackup/logging_setup.py +124 -0
- PyHardLinkBackup/rebuild_databases.py +176 -0
- PyHardLinkBackup/tests/__init__.py +36 -0
- PyHardLinkBackup/tests/test_backup.py +628 -0
- PyHardLinkBackup/tests/test_compare_backup.py +86 -0
- PyHardLinkBackup/tests/test_doc_write.py +26 -0
- PyHardLinkBackup/tests/test_doctests.py +10 -0
- PyHardLinkBackup/tests/test_project_setup.py +46 -0
- PyHardLinkBackup/tests/test_readme.py +75 -0
- PyHardLinkBackup/tests/test_readme_history.py +9 -0
- PyHardLinkBackup/tests/test_rebuild_database.py +224 -0
- PyHardLinkBackup/utilities/__init__.py +0 -0
- PyHardLinkBackup/utilities/file_hash_database.py +62 -0
- PyHardLinkBackup/utilities/file_size_database.py +46 -0
- PyHardLinkBackup/utilities/filesystem.py +158 -0
- PyHardLinkBackup/utilities/humanize.py +39 -0
- PyHardLinkBackup/utilities/rich_utils.py +99 -0
- PyHardLinkBackup/utilities/sha256sums.py +61 -0
- PyHardLinkBackup/utilities/tee.py +40 -0
- PyHardLinkBackup/utilities/tests/__init__.py +0 -0
- PyHardLinkBackup/utilities/tests/test_file_hash_database.py +143 -0
- PyHardLinkBackup/utilities/tests/test_file_size_database.py +138 -0
- PyHardLinkBackup/utilities/tests/test_filesystem.py +126 -0
- PyHardLinkBackup/utilities/tyro_cli_shared_args.py +12 -0
- pyhardlinkbackup-1.5.0.dist-info/METADATA +600 -0
- pyhardlinkbackup-1.5.0.dist-info/RECORD +42 -0
- pyhardlinkbackup-1.5.0.dist-info/WHEEL +4 -0
- pyhardlinkbackup-1.5.0.dist-info/entry_points.txt +3 -0
PyHardLinkBackup/backup.py
@@ -0,0 +1,261 @@
+import dataclasses
+import datetime
+import logging
+import os
+import shutil
+import sys
+import time
+from pathlib import Path
+
+from rich import print  # noqa
+
+from PyHardLinkBackup.constants import CHUNK_SIZE
+from PyHardLinkBackup.logging_setup import LoggingManager
+from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
+from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
+from PyHardLinkBackup.utilities.filesystem import (
+    copy_and_hash,
+    hash_file,
+    humanized_fs_scan,
+    iter_scandir_files,
+    read_and_hash_file,
+    supports_hardlinks,
+)
+from PyHardLinkBackup.utilities.humanize import PrintTimingContextManager, human_filesize
+from PyHardLinkBackup.utilities.rich_utils import DisplayFileTreeProgress
+from PyHardLinkBackup.utilities.sha256sums import store_hash
+from PyHardLinkBackup.utilities.tee import TeeStdoutContext
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class BackupResult:
+    backup_dir: Path
+    log_file: Path
+    #
+    backup_count: int = 0
+    backup_size: int = 0
+    #
+    symlink_files: int = 0
+    hardlinked_files: int = 0
+    hardlinked_size: int = 0
+    #
+    copied_files: int = 0
+    copied_size: int = 0
+    #
+    copied_small_files: int = 0
+    copied_small_size: int = 0
+    #
+    error_count: int = 0
+
+
+def backup_one_file(
+    *,
+    src_root: Path,
+    entry: os.DirEntry,
+    size_db: FileSizeDatabase,
+    hash_db: FileHashDatabase,
+    backup_dir: Path,
+    backup_result: BackupResult,
+) -> None:
+    backup_result.backup_count += 1
+    src_path = Path(entry.path)
+
+    dst_path = backup_dir / src_path.relative_to(src_root)
+    dst_dir_path = dst_path.parent
+    if not dst_dir_path.exists():
+        dst_dir_path.mkdir(parents=True, exist_ok=False)
+
+    try:
+        size = entry.stat().st_size
+    except FileNotFoundError:
+        # e.g.: handle a broken symlink
+        target = os.readlink(src_path)
+        dst_path.symlink_to(target)
+        backup_result.symlink_files += 1
+        return
+
+    backup_result.backup_size += size
+
+    if entry.name == 'SHA256SUMS':
+        # Skip existing SHA256SUMS files in the source tree,
+        # because we create our own SHA256SUMS files.
+        logger.debug('Skip existing SHA256SUMS file: %s', src_path)
+        return
+
+    if entry.is_symlink():
+        logger.debug('Copy symlink: %s to %s', src_path, dst_path)
+        target = os.readlink(src_path)
+        dst_path.symlink_to(target)
+        backup_result.symlink_files += 1
+        return
+
+    # Process regular files
+    assert entry.is_file(follow_symlinks=False), f'Unexpected non-file: {src_path}'
+
+    # Deduplication logic
+
+    if size < size_db.MIN_SIZE:
+        # Small file -> always copy without deduplication
+        logger.info('Copy small file: %s to %s', src_path, dst_path)
+        file_hash = copy_and_hash(src_path, dst_path)
+        backup_result.copied_files += 1
+        backup_result.copied_size += size
+        backup_result.copied_small_files += 1
+        backup_result.copied_small_size += size
+        store_hash(dst_path, file_hash)
+        return
+
+    if size in size_db:
+        logger.debug('File with size %iBytes found before -> hash: %s', size, src_path)
+
+        if size <= CHUNK_SIZE:
+            # File can be read completely into memory
+            logger.debug('File size %iBytes <= CHUNK_SIZE (%iBytes) -> read complete into memory', size, CHUNK_SIZE)
+            file_content, file_hash = read_and_hash_file(src_path)
+            if existing_path := hash_db.get(file_hash):
+                logger.info('Hardlink duplicate file: %s to %s', dst_path, existing_path)
+                os.link(existing_path, dst_path)
+                backup_result.hardlinked_files += 1
+                backup_result.hardlinked_size += size
+            else:
+                logger.info('Store unique file: %s to %s', src_path, dst_path)
+                dst_path.write_bytes(file_content)
+                hash_db[file_hash] = dst_path
+                backup_result.copied_files += 1
+                backup_result.copied_size += size
+
+        else:
+            # Large file
+            file_hash = hash_file(src_path)  # Calculate hash without copying
+
+            if existing_path := hash_db.get(file_hash):
+                logger.info('Hardlink duplicate file: %s to %s', dst_path, existing_path)
+                os.link(existing_path, dst_path)
+                backup_result.hardlinked_files += 1
+                backup_result.hardlinked_size += size
+            else:
+                logger.info('Copy unique file: %s to %s', src_path, dst_path)
+                file_hash = copy_and_hash(src_path, dst_path)
+                hash_db[file_hash] = dst_path
+                backup_result.copied_files += 1
+                backup_result.copied_size += size
+
+        # Keep original file metadata (permission bits, time stamps, and flags)
+        shutil.copystat(src_path, dst_path)
+    else:
+        # No file with this size was backed up before -> can't be a duplicate -> copy and hash
+        file_hash = copy_and_hash(src_path, dst_path)
+        size_db.add(size)
+        hash_db[file_hash] = dst_path
+        backup_result.copied_files += 1
+        backup_result.copied_size += size
+
+    store_hash(dst_path, file_hash)
+
+
+def backup_tree(
+    *,
+    src_root: Path,
+    backup_root: Path,
+    excludes: tuple[str, ...],
+    log_manager: LoggingManager,
+) -> BackupResult:
+    src_root = src_root.resolve()
+    if not src_root.is_dir():
+        print('Error: Source directory does not exist!')
+        print(f'Please check source directory: "{src_root}"\n')
+        sys.exit(1)
+
+    backup_root = backup_root.resolve()
+    if not backup_root.is_dir():
+        print('Error: Backup directory does not exist!')
+        print(f'Please create "{backup_root}" directory first and start again!\n')
+        sys.exit(1)
+
+    if not os.access(backup_root, os.W_OK):
+        print('Error: No write access to backup directory!')
+        print(f'Please check permissions for backup directory: "{backup_root}"\n')
+        sys.exit(1)
+
+    if not supports_hardlinks(backup_root):
+        print('Error: Filesystem for backup directory does not support hardlinks!')
+        print(f'Please check backup directory: "{backup_root}"\n')
+        sys.exit(1)
+
+    # Step 1: Scan source directory:
+    excludes: set = set(excludes)
+    with PrintTimingContextManager('Filesystem scan completed in'):
+        src_file_count, src_total_size = humanized_fs_scan(src_root, excludes=excludes)
+
+    phlb_conf_dir = backup_root / '.phlb'
+    phlb_conf_dir.mkdir(parents=False, exist_ok=True)
+
+    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')
+    backup_main_dir = backup_root / src_root.name
+    backup_dir = backup_main_dir / timestamp
+    backup_dir.mkdir(parents=True, exist_ok=False)
+
+    log_file = backup_main_dir / f'{timestamp}-backup.log'
+    log_manager.start_file_logging(log_file)
+
+    logger.info('Backup %s to %s', src_root, backup_dir)
+
+    print(f'\nBackup to {backup_dir}...\n')
+
+    with DisplayFileTreeProgress(src_file_count, src_total_size) as progress:
+        # "Databases" for deduplication
+        size_db = FileSizeDatabase(phlb_conf_dir)
+        hash_db = FileHashDatabase(backup_root, phlb_conf_dir)
+
+        backup_result = BackupResult(backup_dir=backup_dir, log_file=log_file)
+
+        next_update = 0
+        for entry in iter_scandir_files(src_root, excludes=excludes):
+            try:
+                backup_one_file(
+                    src_root=src_root,
+                    entry=entry,
+                    size_db=size_db,
+                    hash_db=hash_db,
+                    backup_dir=backup_dir,
+                    backup_result=backup_result,
+                )
+            except Exception as err:
+                logger.exception(f'Backup {entry.path} {err.__class__.__name__}: {err}')
+                backup_result.error_count += 1
+            else:
+                now = time.monotonic()
+                if now >= next_update:
+                    progress.update(
+                        completed_file_count=backup_result.backup_count, completed_size=backup_result.backup_size
+                    )
+                    next_update = now + 0.5
+
+        # Finalize progress indicator values:
+        progress.update(completed_file_count=backup_result.backup_count, completed_size=backup_result.backup_size)
+
+    summary_file = backup_main_dir / f'{timestamp}-summary.txt'
+    with TeeStdoutContext(summary_file):
+        print(f'\nBackup complete: {backup_dir} (total size {human_filesize(backup_result.backup_size)})\n')
+        print(f'  Total files processed: {backup_result.backup_count}')
+        print(f'  * Symlinked files: {backup_result.symlink_files}')
+        print(
+            f'  * Hardlinked files: {backup_result.hardlinked_files}'
+            f' (saved {human_filesize(backup_result.hardlinked_size)})'
+        )
+        print(f'  * Copied files: {backup_result.copied_files} (total {human_filesize(backup_result.copied_size)})')
+        print(
+            f'    of which small (<{size_db.MIN_SIZE} Bytes)'
+            f' files: {backup_result.copied_small_files}'
+            f' (total {human_filesize(backup_result.copied_small_size)})'
+        )
+        if backup_result.error_count > 0:
+            print(f'  Errors during backup: {backup_result.error_count} (see log for details)')
+        print()
+
+    logger.info('Backup completed. Summary created: %s', summary_file)
+
+    return backup_result
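
The core of backup.py is the size-then-hash deduplication above: a file whose size was never seen before cannot be a duplicate, so it is copied and hashed in one pass; a file whose size matches an earlier one is hashed first and hardlinked if that hash is already known. A minimal, self-contained sketch of that decision (the helper below is illustrative only, not part of the package API):

    import hashlib
    import os
    from pathlib import Path


    def dedup_store(src: Path, dst: Path, seen_sizes: set[int], hash_to_path: dict[str, Path]) -> str:
        """Illustrative sketch: store `src` at `dst`, hardlinking identical content."""
        size = src.stat().st_size
        if size not in seen_sizes:
            # First file of this size -> cannot be a duplicate: copy and record the hash.
            seen_sizes.add(size)
            content = src.read_bytes()  # the real code streams in CHUNK_SIZE blocks
            dst.write_bytes(content)
            hash_to_path[hashlib.sha256(content).hexdigest()] = dst
            return 'copied'
        # This size was seen before -> hash to check for an identical, already stored file.
        content = src.read_bytes()
        digest = hashlib.sha256(content).hexdigest()
        if existing := hash_to_path.get(digest):
            os.link(existing, dst)  # duplicate content -> hardlink, no additional disk space
            return 'hardlinked'
        dst.write_bytes(content)
        hash_to_path[digest] = dst
        return 'copied'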
PyHardLinkBackup/cli_app/__init__.py
@@ -0,0 +1,41 @@
+"""
+CLI for usage
+"""
+
+import logging
+import sys
+from collections.abc import Sequence
+
+from cli_base.autodiscover import import_all_files
+from cli_base.cli_tools.version_info import print_version
+from rich import print  # noqa
+from tyro.extras import SubcommandApp
+
+import PyHardLinkBackup
+from PyHardLinkBackup import constants
+
+
+logger = logging.getLogger(__name__)
+
+app = SubcommandApp()
+
+# Register all CLI commands, simply by importing all files in this package:
+import_all_files(package=__package__, init_file=__file__)
+
+
+@app.command
+def version():
+    """Print version and exit"""
+    # Pseudo command, because the version is always printed on every CLI call ;)
+    sys.exit(0)
+
+
+def main(args: Sequence[str] | None = None):
+    print_version(PyHardLinkBackup)
+    app.cli(
+        prog='./cli.py',
+        description=constants.CLI_EPILOG,
+        use_underscores=False,  # use hyphens instead of underscores
+        sort_subcommands=True,
+        args=args,
+    )
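
Note how command registration works here: `import_all_files()` imports every module in the package for its side effects, and each module's `@app.command` decorator registers its function with the shared `SubcommandApp`. A hypothetical extra command module would therefore need nothing more than this (file name and command are made up for illustration):

    # PyHardLinkBackup/cli_app/hello.py -- hypothetical example module
    from rich import print  # noqa

    from PyHardLinkBackup.cli_app import app


    @app.command
    def hello(name: str = 'world') -> None:
        """Appears automatically as the `hello` subcommand."""
        print(f'Hello, {name}!')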
PyHardLinkBackup/cli_app/phlb.py
@@ -0,0 +1,123 @@
+import logging
+from pathlib import Path
+from typing import Annotated
+
+import tyro
+from rich import print  # noqa
+
+from PyHardLinkBackup import compare_backup, rebuild_databases
+from PyHardLinkBackup.backup import backup_tree
+from PyHardLinkBackup.cli_app import app
+from PyHardLinkBackup.logging_setup import (
+    DEFAULT_CONSOLE_LOG_LEVEL,
+    DEFAULT_LOG_FILE_LEVEL,
+    LoggingManager,
+    TyroConsoleLogLevelArgType,
+    TyroLogFileLevelArgType,
+)
+from PyHardLinkBackup.utilities.tyro_cli_shared_args import (
+    DEFAULT_EXCLUDE_DIRECTORIES,
+    TyroExcludeDirectoriesArgType,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@app.command
+def backup(
+    src: Annotated[
+        Path,
+        tyro.conf.arg(
+            metavar='source',
+            help='Source directory to back up.',
+        ),
+    ],
+    dst: Annotated[
+        Path,
+        tyro.conf.arg(
+            metavar='destination',
+            help='Destination directory for the backup.',
+        ),
+    ],
+    /,
+    excludes: TyroExcludeDirectoriesArgType = DEFAULT_EXCLUDE_DIRECTORIES,
+    verbosity: TyroConsoleLogLevelArgType = DEFAULT_CONSOLE_LOG_LEVEL,
+    log_file_level: TyroLogFileLevelArgType = DEFAULT_LOG_FILE_LEVEL,
+) -> None:
+    """
+    Back up the source directory to the destination directory, using hard links for deduplication.
+    """
+    log_manager = LoggingManager(
+        console_level=verbosity,
+        file_level=log_file_level,
+    )
+    backup_tree(
+        src_root=src,
+        backup_root=dst,
+        excludes=excludes,
+        log_manager=log_manager,
+    )
+
+
+@app.command
+def compare(
+    src: Annotated[
+        Path,
+        tyro.conf.arg(
+            metavar='source',
+            help='Source directory that should be compared with the last backup.',
+        ),
+    ],
+    dst: Annotated[
+        Path,
+        tyro.conf.arg(
+            metavar='destination',
+            help='Destination directory with the backups. Will pick the last backup for comparison.',
+        ),
+    ],
+    /,
+    excludes: TyroExcludeDirectoriesArgType = DEFAULT_EXCLUDE_DIRECTORIES,
+    verbosity: TyroConsoleLogLevelArgType = DEFAULT_CONSOLE_LOG_LEVEL,
+    log_file_level: TyroLogFileLevelArgType = DEFAULT_LOG_FILE_LEVEL,
+) -> None:
+    """
+    Compare a source tree with the last backup and validate all known file hashes.
+    """
+    log_manager = LoggingManager(
+        console_level=verbosity,
+        file_level=log_file_level,
+    )
+    compare_backup.compare_tree(
+        src_root=src,
+        backup_root=dst,
+        excludes=excludes,
+        log_manager=log_manager,
+    )
+
+
+@app.command
+def rebuild(
+    backup_root: Annotated[
+        Path,
+        tyro.conf.arg(
+            metavar='backup-directory',
+            help='Root directory of the backups.',
+        ),
+    ],
+    /,
+    verbosity: TyroConsoleLogLevelArgType = DEFAULT_CONSOLE_LOG_LEVEL,
+    log_file_level: TyroLogFileLevelArgType = DEFAULT_LOG_FILE_LEVEL,
+) -> None:
+    """
+    Rebuild the file hash and size databases by scanning all backup files, verifying SHA256SUMS
+    and/or storing missing hashes in SHA256SUMS files.
+    """
+    log_manager = LoggingManager(
+        console_level=verbosity,
+        file_level=log_file_level,
+    )
+    rebuild_databases.rebuild(
+        backup_root=backup_root,
+        log_manager=log_manager,
+    )
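
Since `main()` in cli_app/__init__.py accepts an optional `args` sequence, the subcommands defined above can also be driven programmatically instead of through the installed console script; a sketch with placeholder paths:

    from PyHardLinkBackup.cli_app import main

    # Same as calling the CLI with: backup /home/user/documents /mnt/backups
    main(['backup', '/home/user/documents', '/mnt/backups'])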
PyHardLinkBackup/cli_dev/__init__.py
@@ -0,0 +1,70 @@
+"""
+CLI for development
+"""
+
+import importlib
+import logging
+import sys
+from collections.abc import Sequence
+
+from bx_py_utils.path import assert_is_file
+from cli_base.autodiscover import import_all_files
+from cli_base.cli_tools.dev_tools import run_coverage, run_nox, run_unittest_cli
+from cli_base.cli_tools.version_info import print_version
+from typeguard import install_import_hook
+from tyro.extras import SubcommandApp
+
+import PyHardLinkBackup
+from PyHardLinkBackup import constants
+
+
+# Check type annotations via typeguard in all tests.
+# Sadly we must activate this here and can't do this in ./tests/__init__.py
+install_import_hook(packages=('PyHardLinkBackup',))
+
+# Reload the module after the typeguard import hook is activated:
+importlib.reload(PyHardLinkBackup)
+
+
+logger = logging.getLogger(__name__)
+
+
+PACKAGE_ROOT = constants.BASE_PATH.parent
+assert_is_file(PACKAGE_ROOT / 'pyproject.toml')  # Exists only in a cloned git repo
+
+
+app = SubcommandApp()
+
+
+# Register all CLI commands, simply by importing all files in this package:
+import_all_files(package=__package__, init_file=__file__)
+
+
+@app.command
+def version():
+    """Print version and exit"""
+    # Pseudo command, because the version is always printed on every CLI call ;)
+    sys.exit(0)
+
+
+def main(args: Sequence[str] | None = None):
+    print_version(PyHardLinkBackup)
+
+    if len(sys.argv) >= 2:
+        # Check if we can pass the command call straight through to the original CLI tool:
+        command = sys.argv[1]
+        command_map = {
+            'test': run_unittest_cli,
+            'nox': run_nox,
+            'coverage': run_coverage,
+        }
+        if real_func := command_map.get(command):
+            real_func(argv=sys.argv, exit_after_run=True)
+
+    app.cli(
+        prog='./dev-cli.py',
+        description=constants.CLI_EPILOG,
+        use_underscores=False,  # use hyphens instead of underscores
+        sort_subcommands=True,
+        args=args,
+    )
PyHardLinkBackup/cli_dev/benchmark.py
@@ -0,0 +1,138 @@
+import collections
+import hashlib
+import logging
+import time
+from pathlib import Path
+
+from bx_py_utils.path import assert_is_dir
+from cli_base.cli_tools.verbosity import setup_logging
+from cli_base.tyro_commands import TyroVerbosityArgType
+from rich import print  # noqa
+
+from PyHardLinkBackup.cli_dev import app
+from PyHardLinkBackup.utilities.filesystem import humanized_fs_scan, iter_scandir_files
+from PyHardLinkBackup.utilities.humanize import PrintTimingContextManager
+from PyHardLinkBackup.utilities.tyro_cli_shared_args import DEFAULT_EXCLUDE_DIRECTORIES, TyroExcludeDirectoriesArgType
+
+
+logger = logging.getLogger(__name__)
+
+
+@app.command
+def scan_benchmark(
+    base_path: Path,
+    /,
+    excludes: TyroExcludeDirectoriesArgType = DEFAULT_EXCLUDE_DIRECTORIES,
+    verbosity: TyroVerbosityArgType = 1,
+) -> None:
+    """
+    Benchmark our filesystem scan routine.
+    """
+    setup_logging(verbosity=verbosity)
+    with PrintTimingContextManager('Filesystem scan completed in'):
+        humanized_fs_scan(path=base_path, excludes=set(excludes))
+
+
+@app.command
+def benchmark_hashes(
+    base_path: Path,
+    /,
+    excludes: TyroExcludeDirectoriesArgType = DEFAULT_EXCLUDE_DIRECTORIES,
+    max_duration: int = 30,  # in seconds
+    min_file_size: int = 15 * 1024,  # 15 KiB
+    max_file_size: int = 100 * 1024 * 1024,  # 100 MiB
+    verbosity: TyroVerbosityArgType = 1,
+) -> None:
+    """
+    Benchmark different file hashing algorithms on the given path.
+    """
+    # Example output:
+    #
+    # Total files hashed: 220, total size: 1187.7 MiB
+    #
+    # Results:
+    # Total file content read time: 1.7817s
+    #
+    # sha1       | Total: 0.6827s | 0.4x hash/read
+    # sha256     | Total: 0.7189s | 0.4x hash/read
+    # sha224     | Total: 0.7375s | 0.4x hash/read
+    # sha384     | Total: 1.6552s | 0.9x hash/read
+    # blake2b    | Total: 1.6708s | 0.9x hash/read
+    # md5        | Total: 1.6870s | 0.9x hash/read
+    # sha512     | Total: 1.7269s | 1.0x hash/read
+    # shake_128  | Total: 1.9834s | 1.1x hash/read
+    # sha3_224   | Total: 2.3006s | 1.3x hash/read
+    # sha3_256   | Total: 2.3856s | 1.3x hash/read
+    # shake_256  | Total: 2.4375s | 1.4x hash/read
+    # blake2s    | Total: 2.5219s | 1.4x hash/read
+    # sha3_384   | Total: 3.2596s | 1.8x hash/read
+    # sha3_512   | Total: 4.5328s | 2.5x hash/read
+    setup_logging(verbosity=verbosity)
+    assert_is_dir(base_path)
+    print(f'Benchmarking file hashes under: {base_path}')
+
+    print(f'Min file size: {min_file_size} bytes')
+    print(f'Max file size: {max_file_size} bytes')
+    print(f'Max duration: {max_duration} seconds')
+
+    algorithms = sorted(hashlib.algorithms_guaranteed)
+    print(f'\nUsing {len(algorithms)} guaranteed algorithms: {algorithms}')
+    print('-' * 80)
+
+    file_count = 0
+    total_size = 0
+    total_read_time = 0.0
+    results = collections.defaultdict(set)
+
+    start_time = time.time()
+    stop_time = start_time + max_duration
+    next_update = start_time + 2
+
+    with PrintTimingContextManager('Filesystem scan completed in'):
+        for dir_entry in iter_scandir_files(path=base_path, excludes=set(excludes)):
+            entry_stat = dir_entry.stat()
+            file_size = entry_stat.st_size
+            if not (min_file_size <= file_size <= max_file_size):
+                continue
+
+            start_time = time.perf_counter()
+            file_content = Path(dir_entry.path).read_bytes()
+            duration = time.perf_counter() - start_time
+            total_read_time += duration
+
+            for algo in algorithms:
+                # Actual measurement:
+                start_time = time.perf_counter()
+                hashlib.new(algo, file_content)
+                duration = time.perf_counter() - start_time
+
+                results[algo].add(duration)
+
+            file_count += 1
+            total_size += entry_stat.st_size
+
+            now = time.time()
+            if now >= stop_time:
+                print('Reached max duration limit, stopping benchmark...')
+                break
+
+            if now >= next_update:
+                percent = (now - (stop_time - max_duration)) / max_duration * 100
+                print(
+                    f'{int(percent)}% Processed {file_count} files so far,'
+                    f' total size: {total_size / 1024 / 1024:.1f} MiB...'
+                )
+                next_update = now + 2
+
+    print(f'\nTotal files hashed: {file_count}, total size: {total_size / 1024 / 1024:.1f} MiB')
+
+    print('\nResults:')
+    print(f'Total file content read time: {total_read_time:.4f}s\n')
+
+    sorted_results = sorted(
+        ((algo, sum(durations)) for algo, durations in results.items()),
+        key=lambda x: x[1],  # Sort by total_duration
+    )
+    for algo, total_duration in sorted_results:
+        ratio = total_duration / total_read_time
+        print(f'{algo:10} | Total: {total_duration:.4f}s | {ratio:.1f}x hash/read')
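
The measurement pattern in benchmark_hashes (one `perf_counter()` delta per read and per hash, summed over all sampled files) is easy to reproduce standalone; a minimal sketch using a single in-memory payload instead of files on disk:

    import hashlib
    import time

    payload = b'x' * (16 * 1024 * 1024)  # 16 MiB of dummy data

    totals: dict[str, float] = {}
    for algo in sorted(hashlib.algorithms_guaranteed):
        start = time.perf_counter()
        hashlib.new(algo, payload)  # construct and hash in one call
        totals[algo] = time.perf_counter() - start

    # Fastest algorithm first, same layout as the command's report:
    for algo, duration in sorted(totals.items(), key=lambda item: item[1]):
        print(f'{algo:10} | {duration:.4f}s')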
PyHardLinkBackup/cli_dev/code_style.py
@@ -0,0 +1,12 @@
+from cli_base.cli_tools.code_style import assert_code_style
+from cli_base.tyro_commands import TyroVerbosityArgType
+
+from PyHardLinkBackup.cli_dev import PACKAGE_ROOT, app
+
+
+@app.command
+def lint(verbosity: TyroVerbosityArgType = 1):
+    """
+    Check/fix code style by running: "ruff check --fix"
+    """
+    assert_code_style(package_root=PACKAGE_ROOT, verbose=bool(verbosity), sys_exit=True)