PyHardLinkBackup 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. PyHardLinkBackup/__init__.py +7 -0
  2. PyHardLinkBackup/__main__.py +10 -0
  3. PyHardLinkBackup/backup.py +203 -0
  4. PyHardLinkBackup/cli_app/__init__.py +41 -0
  5. PyHardLinkBackup/cli_app/phlb.py +50 -0
  6. PyHardLinkBackup/cli_dev/__init__.py +70 -0
  7. PyHardLinkBackup/cli_dev/benchmark.py +119 -0
  8. PyHardLinkBackup/cli_dev/code_style.py +12 -0
  9. PyHardLinkBackup/cli_dev/packaging.py +65 -0
  10. PyHardLinkBackup/cli_dev/shell_completion.py +23 -0
  11. PyHardLinkBackup/cli_dev/testing.py +52 -0
  12. PyHardLinkBackup/cli_dev/update_readme_history.py +33 -0
  13. PyHardLinkBackup/constants.py +16 -0
  14. PyHardLinkBackup/tests/__init__.py +36 -0
  15. PyHardLinkBackup/tests/test_backup.py +399 -0
  16. PyHardLinkBackup/tests/test_doc_write.py +25 -0
  17. PyHardLinkBackup/tests/test_doctests.py +10 -0
  18. PyHardLinkBackup/tests/test_project_setup.py +46 -0
  19. PyHardLinkBackup/tests/test_readme.py +75 -0
  20. PyHardLinkBackup/tests/test_readme_history.py +8 -0
  21. PyHardLinkBackup/utilities/__init__.py +0 -0
  22. PyHardLinkBackup/utilities/file_hash_database.py +58 -0
  23. PyHardLinkBackup/utilities/file_size_database.py +46 -0
  24. PyHardLinkBackup/utilities/filesystem.py +133 -0
  25. PyHardLinkBackup/utilities/humanize.py +22 -0
  26. PyHardLinkBackup/utilities/rich_utils.py +98 -0
  27. PyHardLinkBackup/utilities/tests/__init__.py +0 -0
  28. PyHardLinkBackup/utilities/tests/test_file_hash_database.py +134 -0
  29. PyHardLinkBackup/utilities/tests/test_file_size_database.py +131 -0
  30. PyHardLinkBackup/utilities/tests/test_filesystem.py +94 -0
  31. pyhardlinkbackup-1.0.0.dist-info/METADATA +547 -0
  32. pyhardlinkbackup-1.0.0.dist-info/RECORD +34 -0
  33. pyhardlinkbackup-1.0.0.dist-info/WHEEL +4 -0
  34. pyhardlinkbackup-1.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,58 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+
5
class HashAlreadyExistsError(ValueError):
    """Raised when a hash that is already stored in the FileHashDatabase would be overwritten."""
7
+
8
+
9
+ class FileHashDatabase:
10
+ """DocWrite: README.md ## FileHashDatabase
11
+ A simple "database" to store file content hash <-> relative path mappings.
12
+ Uses a directory structure to avoid too many files in a single directory.
13
+ Path structure:
14
+ {base_dst}/.phlb/hash-lookup/{XX}/{YY}/{hash}
15
+ e.g.:
16
+ hash '12ab000a1b2c3...' results in: {base_dst}/.phlb/hash-lookup/12/ab/12ab000a1b2c3...
17
+
18
+ Notes:
19
+ * Hash length will be not validated, so it can be used with any hash algorithm.
20
+ * The "relative path" that will be stored is not validated, so it can be any string.
21
+ * We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.
22
+ """
23
+
24
+ def __init__(self, backup_root: Path, phlb_conf_dir: Path):
25
+ self.backup_root = backup_root
26
+ self.base_path = phlb_conf_dir / 'hash-lookup'
27
+ self.base_path.mkdir(parents=False, exist_ok=True)
28
+
29
+ def _get_hash_path(self, hash: str) -> Path:
30
+ first_dir_name = hash[:2]
31
+ second_dir_name = hash[2:4]
32
+ hash_path = self.base_path / first_dir_name / second_dir_name / hash
33
+ return hash_path
34
+
35
+ def get(self, hash: str) -> Path | None:
36
+ hash_path = self._get_hash_path(hash)
37
+ try:
38
+ rel_file_path = hash_path.read_text()
39
+ except FileNotFoundError:
40
+ return None
41
+ else:
42
+ abs_file_path = self.backup_root / rel_file_path
43
+ if not abs_file_path.is_file():
44
+ logging.warning('Hash database entry found, but file does not exist: %s', abs_file_path)
45
+ hash_path.unlink()
46
+ return None
47
+ return abs_file_path
48
+
49
+ def __setitem__(self, hash: str, abs_file_path: Path):
50
+ hash_path = self._get_hash_path(hash)
51
+ hash_path.parent.mkdir(parents=True, exist_ok=True)
52
+
53
+ # File should be found before and results in hardlink creation!
54
+ # So deny change of existing hashes:
55
+ if hash_path.exists():
56
+ raise HashAlreadyExistsError(f'Hash {hash} already exists in the database!')
57
+
58
+ hash_path.write_text(str(abs_file_path.relative_to(self.backup_root)))
@@ -0,0 +1,46 @@
1
+ from pathlib import Path
2
+
3
+
4
class FileSizeDatabase:
    """DocWrite: README.md ## FileSizeDatabase
    A simple "database" to track which file sizes have been seen.

    Uses a directory structure to avoid too many files in a single directory.
    We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.
    """

    MIN_SIZE = 1000  # no padding is made, so the min size is 1000 bytes!

    def __init__(self, phlb_conf_dir: Path):
        # parents=False: the phlb config directory must already exist.
        self.base_path = phlb_conf_dir / 'size-lookup'
        self.base_path.mkdir(parents=False, exist_ok=True)

    def _get_size_path(self, size: int) -> Path:
        """Map a size to its marker file path, sharded by the first four digits."""
        assert size >= self.MIN_SIZE, f'Size must be at least {self.MIN_SIZE} bytes'

        """DocWrite: README.md ## FileSizeDatabase
        Path structure:
        * `{base_dst}/.phlb/size-lookup/{XX}/{YY}/{size}`

        e.g.:

        * `1234567890` results in: `{base_dst}/.phlb/size-lookup/12/34/1234567890`
        """
        digits = str(size)
        return self.base_path / digits[:2] / digits[2:4] / digits

    def __contains__(self, size: int) -> bool:
        """True if this file size was already added."""
        return self._get_size_path(size).exists()

    def add(self, size: int):
        """Mark *size* as seen (no-op if it was added before)."""
        size_path = self._get_size_path(size)
        if not size_path.exists():
            size_path.parent.mkdir(parents=True, exist_ok=True)

            """DocWrite: README.md ## FileSizeDatabase
            All files are created empty, as we only care about their existence."""
            size_path.touch(exist_ok=False)
@@ -0,0 +1,133 @@
1
+ import hashlib
2
+ import logging
3
+ import os
4
+ import shutil
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Iterable
8
+
9
+ from rich.progress import (
10
+ Progress,
11
+ SpinnerColumn,
12
+ TextColumn,
13
+ TimeElapsedColumn,
14
+ )
15
+
16
+ from PyHardLinkBackup.constants import CHUNK_SIZE, HASH_ALGO
17
+ from PyHardLinkBackup.utilities.rich_utils import HumanFileSizeColumn
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def hash_file(path: Path) -> str:
    """Return the hex digest (HASH_ALGO) of the file content at *path*."""
    logger.debug('Hash file %s using %s', path, HASH_ALGO)
    with path.open('rb') as file_obj:
        file_hash = hashlib.file_digest(file_obj, HASH_ALGO).hexdigest()

    logger.info('%s %s hash: %s', path, HASH_ALGO, file_hash)
    return file_hash
31
+
32
+
33
def copy_and_hash(src: Path, dst: Path) -> str:
    """
    Copy *src* to *dst* chunk-wise while hashing the content in one pass.

    Returns the hex digest (HASH_ALGO) of the copied content.
    """
    logger.debug('Copy and hash file %s to %s using %s', src, dst, HASH_ALGO)
    digest = hashlib.new(HASH_ALGO)
    with src.open('rb') as in_file, dst.open('wb') as out_file:
        for chunk in iter(lambda: in_file.read(CHUNK_SIZE), b''):
            out_file.write(chunk)
            digest.update(chunk)

    # Keep original file metadata (permission bits, last access time, last modification time, and flags)
    shutil.copystat(src, dst)

    file_hash = digest.hexdigest()
    logger.info('%s backup to %s with %s hash: %s', src, dst, HASH_ALGO, file_hash)
    return file_hash
47
+
48
+
49
def read_and_hash_file(path: Path) -> tuple[bytes, str]:
    """Read the whole file into memory and return (content, hex digest)."""
    logger.debug('Read and hash file %s using %s into RAM', path, HASH_ALGO)
    content = path.read_bytes()
    file_hash = hashlib.new(HASH_ALGO, content).hexdigest()
    logger.info('%s %s hash: %s', path, HASH_ALGO, file_hash)
    return content, file_hash
56
+
57
+
58
def iter_scandir_files(path: Path, excludes: set[str]) -> Iterable[os.DirEntry]:
    """
    Recursively yield all files+symlinks in the given directory.

    Directories whose name is in *excludes* are skipped entirely.
    NOTE(review): directory symlinks are followed (follow_symlinks=True),
    so a symlink cycle could recurse forever — confirm this is intended.
    """
    logger.debug('Scanning directory %s', path)
    with os.scandir(path) as dir_entries:
        for dir_entry in dir_entries:
            if not dir_entry.is_dir(follow_symlinks=True):
                # It's a file or symlink or broken symlink
                yield dir_entry
                continue
            if dir_entry.name in excludes:
                logger.debug('Excluding directory %s', dir_entry.path)
                continue
            yield from iter_scandir_files(Path(dir_entry.path), excludes=excludes)
73
+
74
+
75
def humanized_fs_scan(path: Path, excludes: set[str]) -> tuple[int, int]:
    """
    Recursively scan *path* while showing a live rich progress line.

    Returns (file_count, total_size) of all found files/symlinks.
    Note: entries whose stat() fails (e.g. broken symlinks) are still
    counted in file_count, but contribute nothing to total_size.
    """
    print(f'\nScanning filesystem at: {path}...')

    # Single-line spinner progress: elapsed time, counts and throughput via task fields.
    progress = Progress(
        TimeElapsedColumn(),
        '{task.description}',
        SpinnerColumn('simpleDots'),
        TextColumn('[green]{task.fields[file_count]} Files'),
        HumanFileSizeColumn(field_name='total_size'),
        TextColumn('| [cyan]{task.fields[files_per_sec]} Files/sec'),
    )

    file_count = 0
    total_size = 0
    start_time = time.time()
    # total=None -> indeterminate task (we don't know the file count in advance)
    scan_task_id = progress.add_task(
        description='Scanning',
        file_count=file_count,
        total_size=total_size,
        files_per_sec=0.0,
        total=None,
    )
    next_update = 0  # throttle display refresh to ~1x per second
    with progress:
        for entry in iter_scandir_files(path, excludes=excludes):
            file_count += 1
            try:
                total_size += entry.stat().st_size
            except FileNotFoundError:
                # e.g.: broken symlink
                continue

            now = time.time()
            if now >= next_update:
                # Guard against zero elapsed time right after start
                elapsed = max(now - start_time, 1e-6)
                files_per_sec = int(file_count / elapsed)
                progress.update(
                    scan_task_id,
                    file_count=file_count,
                    total_size=total_size,
                    files_per_sec=files_per_sec,
                )
                next_update = now + 1

        now = time.time()

    # Final update with the completed totals (outside the live context).
    elapsed = max(now - start_time, 1e-6)
    files_per_sec = int(file_count / elapsed)
    progress.stop_task(scan_task_id)
    progress.update(
        scan_task_id,
        description='Completed',
        completed=True,
        file_count=file_count,
        total_size=total_size,
        files_per_sec=files_per_sec,
    )

    return file_count, total_size
@@ -0,0 +1,22 @@
1
+
2
+
3
+ def human_filesize(size: int | float) -> str:
4
+ """
5
+ >>> human_filesize(1024)
6
+ '1.00 KiB'
7
+ >>> human_filesize(2.2*1024)
8
+ '2.20 KiB'
9
+ >>> human_filesize(3.33*1024*1024)
10
+ '3.33 MiB'
11
+ >>> human_filesize(4.44*1024*1024*1024)
12
+ '4.44 GiB'
13
+ >>> human_filesize(5.55*1024*1024*1024*1024)
14
+ '5.55 TiB'
15
+ >>> human_filesize(6.66*1024*1024*1024*1024*1024)
16
+ '6.66 PiB'
17
+ """
18
+ for unit in ['Bytes', 'KiB', 'MiB', 'GiB', 'TiB']:
19
+ if size < 1024.0:
20
+ return f'{size:.2f} {unit}'
21
+ size /= 1024.0
22
+ return f'{size:.2f} PiB'
@@ -0,0 +1,98 @@
1
+ from rich.live import Live
2
+ from rich.panel import Panel
3
+ from rich.progress import (
4
+ BarColumn,
5
+ Progress,
6
+ ProgressColumn,
7
+ TaskProgressColumn,
8
+ TextColumn,
9
+ TimeElapsedColumn,
10
+ TimeRemainingColumn,
11
+ TransferSpeedColumn,
12
+ )
13
+ from rich.table import Table
14
+ from rich.text import Text
15
+
16
+ from PyHardLinkBackup.utilities.humanize import human_filesize
17
+
18
+
19
class HumanFileSizeColumn(ProgressColumn):
    """
    Rich progress column that renders a byte count via human_filesize().

    If *field_name* is None, the task's `completed` value is rendered,
    otherwise the custom task field with that name is used.
    """

    def __init__(self, field_name: str | None = None, **kwargs) -> None:
        super().__init__(**kwargs)
        self.field_name = field_name

    def render(self, task):
        if self.field_name is None:
            return Text(f'| {human_filesize(task.completed)}')
        if self.field_name not in task.fields:
            # Misconfiguration: the task was created without this field.
            raise KeyError(f'Field {self.field_name=} not found in: {task.fields.keys()=}')
        return Text(f'| {human_filesize(task.fields[self.field_name])}')
33
+
34
+
35
class BackupProgress:
    """
    Live terminal display for a running backup: three stacked progress panels.

    * overall: average of file-count and file-size percentages (0..100)
    * file count: backed-up files vs. the source file count
    * file size: processed bytes vs. the source total size (+ transfer speed)

    Use as a context manager; call update() after every processed file.
    """

    def __init__(self, src_file_count: int, src_total_size: int):
        # Overall bar is driven manually (0..100), see update().
        self.overall_progress = Progress(
            TaskProgressColumn(),
            BarColumn(bar_width=50),
            TextColumn('Elapsed:'),
            TimeElapsedColumn(),
            TextColumn('Remaining:'),
            TimeRemainingColumn(),
        )
        self.overall_progress_task_id = self.overall_progress.add_task(description='', total=100)

        self.file_count_progress = Progress(
            TaskProgressColumn(),
            BarColumn(bar_width=50),
            TextColumn('{task.completed} Files'),
        )
        self.file_count_progress_task_id = self.file_count_progress.add_task(description='', total=src_file_count)
        # Keep the Task object for direct .percentage access in update()
        self.file_count_progress_task = self.file_count_progress.tasks[0]

        self.file_size_progress = Progress(
            TaskProgressColumn(),
            BarColumn(bar_width=50),
            HumanFileSizeColumn(),
            '|',
            TransferSpeedColumn(),
        )
        self.file_size_progress_task_id = self.file_size_progress.add_task(description='', total=src_total_size)
        self.file_size_progress_task = self.file_size_progress.tasks[0]

        # Stack the three progress renderables into one Live display.
        progress_table = Table.grid()
        progress_table.add_row(Panel(self.overall_progress, title='[b]Overall Backup Progress', border_style='green'))
        progress_table.add_row(Panel(self.file_count_progress, title='Total files saved'))
        progress_table.add_row(Panel(self.file_size_progress, title='Total file size processed'))

        # auto_refresh=False: we refresh explicitly in update() to avoid flicker.
        self.live = Live(progress_table, auto_refresh=False)

    def __enter__(self):
        self.live.__enter__()
        return self

    def update(self, backup_count: int, backup_size: int):
        """Push the current absolute counters; recompute the overall percentage."""
        self.file_count_progress.update(
            task_id=self.file_count_progress_task_id,
            completed=backup_count,
            refresh=True,
        )
        self.file_size_progress.update(
            task_id=self.file_size_progress_task_id,
            completed=backup_size,
            refresh=True,
        )
        self.overall_progress.update(
            task_id=self.overall_progress_task_id,
            completed=(self.file_count_progress_task.percentage + self.file_size_progress_task.percentage) / 2,
            refresh=True,
        )
        self.live.refresh()

    def __exit__(self, exc_type, exc_value, traceback):
        # Stop all sub-progress timers before tearing down the Live display.
        self.overall_progress.stop()
        self.file_count_progress.stop()
        self.file_size_progress.stop()
        self.live.__exit__(exc_type, exc_value, traceback)
File without changes
@@ -0,0 +1,134 @@
1
+ import logging
2
+ import tempfile
3
+ import textwrap
4
+ from pathlib import Path
5
+ from unittest import TestCase
6
+
7
+ from bx_py_utils.path import assert_is_dir
8
+ from bx_py_utils.test_utils.assertion import assert_text_equal
9
+
10
+ from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase, HashAlreadyExistsError
11
+ from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
12
+
13
+
14
class TemporaryFileHashDatabase(tempfile.TemporaryDirectory):
    """Context manager: temporary backup root containing a fresh FileHashDatabase."""

    def __enter__(self) -> FileHashDatabase:
        backup_root = Path(super().__enter__())
        phlb_conf_dir = backup_root / '.phlb'
        phlb_conf_dir.mkdir()
        return FileHashDatabase(backup_root=backup_root, phlb_conf_dir=phlb_conf_dir)
24
+
25
+
26
def get_hash_db_filenames(hash_db: FileHashDatabase) -> list[str]:
    """All stored hash entry paths, relative to the database base path, sorted."""
    rel_paths = [
        Path(entry.path).relative_to(hash_db.base_path)
        for entry in iter_scandir_files(hash_db.base_path, excludes=set())
    ]
    return sorted(str(rel_path) for rel_path in rel_paths)
31
+
32
+
33
def get_hash_db_info(backup_root: Path) -> str:
    """Render the hash database as sorted 'truncated-entry-path… -> stored path' lines."""
    db_base_path = backup_root / '.phlb' / 'hash-lookup'
    assert_is_dir(db_base_path)

    lines = []
    for entry in iter_scandir_files(db_base_path, excludes=set()):
        hash_path = Path(entry.path)
        # Truncate the relative entry path to 20 chars for stable snapshots.
        truncated = str(hash_path.relative_to(db_base_path))[:20]
        lines.append(f'{truncated}… -> {hash_path.read_text()}')
    return '\n'.join(sorted(lines))
44
+
45
+
46
def assert_hash_db_info(backup_root: Path, expected: str):
    """Compare the rendered hash database content against a dedented expectation."""
    actual = get_hash_db_info(backup_root)
    assert_text_equal(
        actual,
        textwrap.dedent(expected).strip(),
        msg=f'FileHashDatabase info does not match as expected.\n\n{actual}\n\n',
    )
54
+
55
+
56
class FileHashDatabaseTestCase(TestCase):
    """End-to-end checks of FileHashDatabase: store/lookup, sharing, overwrite denial, stale entries."""

    def test_happy_path(self):
        with TemporaryFileHashDatabase() as hash_db:
            self.assertIsInstance(hash_db, FileHashDatabase)

            backup_root_path = hash_db.backup_root
            assert_is_dir(backup_root_path)

            # Sharding: first two + next two hash chars become directories.
            test_path = hash_db._get_hash_path('12345678abcdef')
            self.assertEqual(test_path, hash_db.base_path / '12' / '34' / '12345678abcdef')

            file_a_path = backup_root_path / 'rel/path/to/file-A'
            file_a_path.parent.mkdir(parents=True, exist_ok=True)
            file_a_path.touch()

            self.assertIs(hash_db.get('12345678abcdef'), None)
            hash_db['12345678abcdef'] = file_a_path
            self.assertEqual(hash_db.get('12345678abcdef'), file_a_path)
            self.assertEqual(
                get_hash_db_filenames(hash_db),
                ['12/34/12345678abcdef'],
            )

            ########################################################################################
            # Another instance using the same directory:

            another_hash_db = FileHashDatabase(
                backup_root=hash_db.backup_root,
                phlb_conf_dir=hash_db.base_path.parent,
            )
            self.assertEqual(another_hash_db.get('12345678abcdef'), file_a_path)
            self.assertIs(another_hash_db.get('12abcd345678abcdef'), None)

            file_b_path = backup_root_path / 'rel/path/to/file-B'
            file_b_path.parent.mkdir(parents=True, exist_ok=True)
            file_b_path.touch()

            another_hash_db['12abcd345678abcdef'] = file_b_path
            self.assertEqual(another_hash_db.get('12abcd345678abcdef'), file_b_path)
            self.assertEqual(
                get_hash_db_filenames(another_hash_db),
                [
                    '12/34/12345678abcdef',
                    '12/ab/12abcd345678abcdef',
                ],
            )

            assert_hash_db_info(
                backup_root=hash_db.backup_root,
                expected="""
                    12/34/12345678abcdef… -> rel/path/to/file-A
                    12/ab/12abcd345678ab… -> rel/path/to/file-B
                """,
            )

            ########################################################################################
            # Deny "overwrite" of existing hash:

            with self.assertRaises(HashAlreadyExistsError):
                # The str value is never used: the duplicate check raises first.
                hash_db['12abcd345678abcdef'] = 'foo/bar/baz'  # already exists!

            ########################################################################################
            # Don't use stale entries pointing to missing files:

            self.assertEqual(hash_db.get('12345678abcdef'), file_a_path)
            file_a_path.unlink()

            """DocWrite: README.md ## FileHashDatabase - Missing hardlink target file
            We check if the hardlink source file still exists. If not, we remove the hash entry from the database.
            A warning is logged in this case."""
            with self.assertLogs(level=logging.WARNING) as logs:
                self.assertIs(hash_db.get('12345678abcdef'), None)
            self.assertIn('Hash database entry found, but file does not exist', ''.join(logs.output))
            # The stale entry was removed; only file-B remains.
            assert_hash_db_info(
                backup_root=hash_db.backup_root,
                expected="""
                    12/ab/12abcd345678ab… -> rel/path/to/file-B
                """,
            )
@@ -0,0 +1,131 @@
1
+ import tempfile
2
+ from collections.abc import Iterable
3
+ from pathlib import Path
4
+ from unittest import TestCase
5
+
6
+ from PyHardLinkBackup.utilities.file_size_database import FileSizeDatabase
7
+ from PyHardLinkBackup.utilities.filesystem import iter_scandir_files
8
+
9
+
10
class TemporaryFileSizeDatabase(tempfile.TemporaryDirectory):
    """Context manager: temporary directory containing a fresh FileSizeDatabase."""

    def __enter__(self) -> FileSizeDatabase:
        backup_root = Path(super().__enter__())
        phlb_conf_dir = backup_root / '.phlb'
        phlb_conf_dir.mkdir()
        return FileSizeDatabase(phlb_conf_dir=phlb_conf_dir)
20
+
21
+
22
def get_size_db_filenames(size_db: FileSizeDatabase) -> Iterable[str]:
    """All marker file paths, relative to the database base path, sorted."""
    rel_paths = [
        Path(entry.path).relative_to(size_db.base_path)
        for entry in iter_scandir_files(size_db.base_path, excludes=set())
    ]
    return sorted(str(rel_path) for rel_path in rel_paths)
27
+
28
+
29
def get_sizes(size_db: FileSizeDatabase) -> Iterable[int]:
    """All stored sizes (marker file names parsed back to int), sorted ascending."""
    entries = iter_scandir_files(size_db.base_path, excludes=set())
    return sorted(int(entry.name) for entry in entries)
31
+
32
+
33
class FileSizeDatabaseTestCase(TestCase):
    """End-to-end checks of FileSizeDatabase: sharding, sharing, MIN_SIZE validation."""

    def test_happy_path(self):
        with TemporaryFileSizeDatabase() as size_db:
            self.assertIsInstance(size_db, FileSizeDatabase)

            # Sharding: first two + next two decimal digits become directories.
            test_path1 = size_db._get_size_path(1234)
            self.assertEqual(test_path1, size_db.base_path / '12' / '34' / '1234')

            test_path2 = size_db._get_size_path(567890)
            self.assertEqual(test_path2, size_db.base_path / '56' / '78' / '567890')

            self.assertNotIn(1234, size_db)
            self.assertNotIn(567890, size_db)

            size_db.add(1234)
            self.assertIn(1234, size_db)
            self.assertNotIn(567890, size_db)

            size_db.add(567890)
            self.assertIn(1234, size_db)
            self.assertIn(567890, size_db)

            self.assertEqual(get_sizes(size_db), [1234, 567890])
            self.assertEqual(
                get_size_db_filenames(size_db),
                [
                    '12/34/1234',
                    '56/78/567890',
                ],
            )

            ########################################################################################
            # Another instance using the same directory:

            another_size_db = FileSizeDatabase(phlb_conf_dir=size_db.base_path.parent)
            self.assertEqual(get_sizes(another_size_db), [1234, 567890])
            self.assertEqual(
                get_size_db_filenames(another_size_db),
                [
                    '12/34/1234',
                    '56/78/567890',
                ],
            )

            ########################################################################################
            # "Share" directories:

            # Sizes with equal leading digits land in the same shard directories.
            for size in (123400001111, 123400002222, 128800003333, 129900004444):
                self.assertNotIn(size, size_db)
                size_db.add(size)
                self.assertIn(size, size_db)

            ########################################################################################
            # Min size is 1000 bytes:

            """DocWrite: README.md ## FileSizeDatabase - minimum file size
            The minimum file size that can be stored in the FileSizeDatabase is 1000 bytes.
            This is because no padding is made for sizes below 1000 bytes, which would
            break the directory structure.
            """
            self.assertEqual(FileSizeDatabase.MIN_SIZE, 1000)
            """DocWrite: README.md ## FileSizeDatabase - minimum file size
            The idea is, that it's more efficient to backup small files directly, instead of
            checking for duplicates via hardlinks. Therefore, small files below this size
            are not tracked in the FileSizeDatabase.
            """

            with self.assertRaises(AssertionError):
                size_db._get_size_path(999)
            with self.assertRaises(AssertionError):
                size_db.add(999)
            with self.assertRaises(AssertionError):
                # Membership test also validates MIN_SIZE:
                999 in size_db

            ########################################################################################
            # Check final state:

            self.assertEqual(
                get_size_db_filenames(size_db),
                [
                    '12/34/1234',
                    '12/34/123400001111',
                    '12/34/123400002222',
                    '12/88/128800003333',
                    '12/99/129900004444',
                    '56/78/567890',
                ],
            )
            self.assertEqual(
                get_sizes(size_db),
                [
                    1234,
                    567890,
                    123400001111,
                    123400002222,
                    128800003333,
                    129900004444,
                ],
            )