cacheback-snapshot 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 M B
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,59 @@
1
+ Metadata-Version: 2.4
2
+ Name: cacheback-snapshot
3
+ Version: 0.0.1
4
+ Summary: File system snapshotting tool that prioritizes speed and reducing redundant storage.
5
+ Author: MB
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/m-bartlett/cacheback
8
+ Project-URL: Issues, https://github.com/m-bartlett/cacheback/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Requires-Python: >=3.10
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Dynamic: license-file
14
+
15
+ <p align="center">
16
+ <picture>
17
+ <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/51214078-61b3-4afe-8add-7df04a34ae54" width="700">
18
+ <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
19
+ <img alt="Fallback image description" src="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
20
+ </picture>
21
+ </p>
22
+
23
+ <p align="center">
24
+ File system snapshot tool that prioritizes snapshot speed and reducing redundant storage.
25
+ </p>
26
+ <br/>
27
+
28
+ ## How it works
29
+
30
+ `cacheback` achieves its goals of quick snapshots and minimized snapshot storage size by using hardlink features of modern filesystems
31
+ for files whose contents are unchanged between snapshots.
32
+ This is similar to how git tracks objects in a repository by storing a file's data based on its content hash.
33
+ To further improve speed, a cache of the previous snapshot scan is stored which stores each file's last modification timestamp and
34
+ these timestamps are compared before computing the file content hash. If the timestamp is unchanged, it is assumed that the file has
35
+ not changed since the previous snapshot and is linked to the existing content stored on disk.
36
+
37
+ Here is a diagram visualizing this concept of files within snapshots being pointers to stored data based on content hash:
38
+
39
+ <p align="center">
40
+ <picture width="600">
41
+ <source
42
+ media="(prefers-color-scheme: light)"
43
+ srcset="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7"
44
+ >
45
+ <source
46
+ media="(prefers-color-scheme: dark)"
47
+ srcset="https://github.com/user-attachments/assets/05c41fb1-f8a0-4465-8d9b-0f30374317d3"
48
+ >
49
+ <img src="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7">
50
+ </picture>
51
+ </p>
52
+
53
+ If a file is unchanged between multiple snapshots, each file will point to the same hash-named object and therefore the literal file content
54
+ is only stored on disk one time. If snapshots are deleted and a given hashed content is no longer pointed to by any files in any snapshots,
55
+ then the `--garbage-collect-cache` flag will prompt `cacheback` to purge these unused hash-named files to recover storage space.
56
+
57
+ ## Install
58
+ `pip install cacheback-snapshot`
59
+
@@ -0,0 +1,45 @@
1
+ <p align="center">
2
+ <picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/51214078-61b3-4afe-8add-7df04a34ae54" width="700">
4
+ <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
5
+ <img alt="Fallback image description" src="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
6
+ </picture>
7
+ </p>
8
+
9
+ <p align="center">
10
+ File system snapshot tool that prioritizes snapshot speed and reducing redundant storage.
11
+ </p>
12
+ <br/>
13
+
14
+ ## How it works
15
+
16
+ `cacheback` achieves its goals of quick snapshots and minimized snapshot storage size by using hardlink features of modern filesystems
17
+ for files whose contents are unchanged between snapshots.
18
+ This is similar to how git tracks objects in a repository by storing a file's data based on its content hash.
19
+ To further improve speed, a cache of the previous snapshot scan is stored which stores each file's last modification timestamp and
20
+ these timestamps are compared before computing the file content hash. If the timestamp is unchanged, it is assumed that the file has
21
+ not changed since the previous snapshot and is linked to the existing content stored on disk.
22
+
23
+ Here is a diagram visualizing this concept of files within snapshots being pointers to stored data based on content hash:
24
+
25
+ <p align="center">
26
+ <picture width="600">
27
+ <source
28
+ media="(prefers-color-scheme: light)"
29
+ srcset="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7"
30
+ >
31
+ <source
32
+ media="(prefers-color-scheme: dark)"
33
+ srcset="https://github.com/user-attachments/assets/05c41fb1-f8a0-4465-8d9b-0f30374317d3"
34
+ >
35
+ <img src="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7">
36
+ </picture>
37
+ </p>
38
+
39
+ If a file is unchanged between multiple snapshots, each file will point to the same hash-named object and therefore the literal file content
40
+ is only stored on disk one time. If snapshots are deleted and a given hashed content is no longer pointed to by any files in any snapshots,
41
+ then the `--garbage-collect-cache` flag will prompt `cacheback` to purge these unused hash-named files to recover storage space.
42
+
43
+ ## Install
44
+ `pip install cacheback-snapshot`
45
+
@@ -0,0 +1,59 @@
1
+ Metadata-Version: 2.4
2
+ Name: cacheback-snapshot
3
+ Version: 0.0.1
4
+ Summary: File system snapshotting tool that prioritizes speed and reducing redundant storage.
5
+ Author: MB
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/m-bartlett/cacheback
8
+ Project-URL: Issues, https://github.com/m-bartlett/cacheback/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Requires-Python: >=3.10
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Dynamic: license-file
14
+
15
+ <p align="center">
16
+ <picture>
17
+ <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/51214078-61b3-4afe-8add-7df04a34ae54" width="700">
18
+ <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
19
+ <img alt="Fallback image description" src="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
20
+ </picture>
21
+ </p>
22
+
23
+ <p align="center">
24
+ File system snapshot tool that prioritizes snapshot speed and reducing redundant storage.
25
+ </p>
26
+ <br/>
27
+
28
+ ## How it works
29
+
30
+ `cacheback` achieves its goals of quick snapshots and minimized snapshot storage size by using hardlink features of modern filesystems
31
+ for files whose contents are unchanged between snapshots.
32
+ This is similar to how git tracks objects in a repository by storing a file's data based on its content hash.
33
+ To further improve speed, a cache of the previous snapshot scan is stored which stores each file's last modification timestamp and
34
+ these timestamps are compared before computing the file content hash. If the timestamp is unchanged, it is assumed that the file has
35
+ not changed since the previous snapshot and is linked to the existing content stored on disk.
36
+
37
+ Here is a diagram visualizing this concept of files within snapshots being pointers to stored data based on content hash:
38
+
39
+ <p align="center">
40
+ <picture width="600">
41
+ <source
42
+ media="(prefers-color-scheme: light)"
43
+ srcset="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7"
44
+ >
45
+ <source
46
+ media="(prefers-color-scheme: dark)"
47
+ srcset="https://github.com/user-attachments/assets/05c41fb1-f8a0-4465-8d9b-0f30374317d3"
48
+ >
49
+ <img src="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7">
50
+ </picture>
51
+ </p>
52
+
53
+ If a file is unchanged between multiple snapshots, each file will point to the same hash-named object and therefore the literal file content
54
+ is only stored on disk one time. If snapshots are deleted and a given hashed content is no longer pointed to by any files in any snapshots,
55
+ then the `--garbage-collect-cache` flag will prompt `cacheback` to purge these unused hash-named files to recover storage space.
56
+
57
+ ## Install
58
+ `pip install cacheback-snapshot`
59
+
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ cacheback_snapshot.egg-info/PKG-INFO
5
+ cacheback_snapshot.egg-info/SOURCES.txt
6
+ cacheback_snapshot.egg-info/dependency_links.txt
7
+ cacheback_snapshot.egg-info/entry_points.txt
8
+ cacheback_snapshot.egg-info/top_level.txt
9
+ src/__init__.py
10
+ src/__main__.py
11
+ src/file_system_snapshot.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ cacheback = cacheback.__main__:main
@@ -0,0 +1,70 @@
1
+ [project]
2
+ name = "cacheback-snapshot"
3
+ version = "0.0.1"
4
+
5
+ authors = [
6
+ { name="MB" },
7
+ ]
8
+
9
+ description = """
10
+ File system snapshotting tool that prioritizes speed and reducing redundant storage.
11
+ """
12
+
13
+ readme = "README.md"
14
+ requires-python = ">=3.10"
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ ]
18
+ license = "MIT"
19
+
20
+
21
+ [project.urls]
22
+ Homepage = "https://github.com/m-bartlett/cacheback"
23
+ Issues = "https://github.com/m-bartlett/cacheback/issues"
24
+
25
+ [project.scripts]
26
+ cacheback = "cacheback.__main__:main"
27
+
28
+ [build-system]
29
+ requires = ["setuptools"]
30
+ build-backend = "setuptools.build_meta"
31
+
32
+ [tool.setuptools]
33
+ package-dir = {"cacheback" = "src"}
34
+
35
+ [tool.ty.environment]
36
+ root = ["."]
37
+ python-version = "3.13"
38
+
39
+ [tool.ty.rules]
40
+ division-by-zero = "ignore"
41
+
42
+ [tool.ruff]
43
+ line-length = 100
44
+ indent-width = 4
45
+ target-version = "py314"
46
+
47
+ [tool.ruff.format]
48
+ quote-style = "double"
49
+ indent-style = "space"
50
+ line-ending = "auto"
51
+
52
+ [tool.ruff.lint]
53
+ select = [
54
+ "F", # pyflakes
55
+ "E", # pycodestyle
56
+ "I", # isort
57
+ "ANN", # flake8 type annotations
58
+ "RUF", # ruff-specific rules
59
+ ]
60
+ fixable = ["ALL"]
61
+
62
+ [tool.ruff.lint.flake8-annotations]
63
+ allow-star-arg-any = true
64
+ suppress-dummy-args = false
65
+
66
+ [tool.ruff.lint.pydocstyle]
67
+ convention = "google"
68
+
69
+ [tool.autopep8]
70
+ max_line_length = 100
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env python3
2
+ import importlib.metadata
3
+ import sys
4
+ from argparse import ArgumentParser
5
+ from datetime import datetime
6
+
7
+ from .file_system_snapshot import FileSystemSnapshot
8
+
9
+
10
def get_version_string() -> str:
    """Return the installed distribution's version, or a baked-in fallback.

    Looks the root package up in importlib metadata; when the package is not
    installed (e.g. running straight from a source checkout), the hard-coded
    development version is returned instead.
    """
    root_package, _, _ = __name__.partition('.')
    try:
        return importlib.metadata.version(root_package)
    except importlib.metadata.PackageNotFoundError:
        return "0.0.1"
16
+
17
+
18
def main() -> int:
    """Parse CLI arguments, take a snapshot, and optionally garbage collect.

    Returns:
        Process exit status (0 on success).
    """
    parser = ArgumentParser()

    parser.add_argument('--destination', '-o',
                        required=True,
                        help="""Path to store filesystem snapshots in. Reuse this destination in
                        the future to reuse the blob cache and prevent file duplication.""")

    parser.add_argument('--targets', '-i',
                        nargs='+',
                        help="Path(s) to recursively snapshot")

    parser.add_argument('--exclude', '-x',
                        nargs='*',
                        default=[],
                        # BUG FIX: help text previously misspelled "documentation".
                        help="""
                        Path patterns to omit from the snapshot. See
                        https://docs.python.org/3/library/pathlib.html#pathlib-pattern-language
                        for documentation on pattern syntax.
                        """)

    parser.add_argument('--snapshot-name', '-n',
                        # %F gives an ISO date; ':' keeps the name filesystem-safe enough
                        # for the common POSIX targets this tool runs on.
                        default=datetime.now().strftime('%F %H:%M'),
                        help="""Name to use for this snapshot's directory. Defaults to the current
                        timestamp with filesystem-naming compatible delimiters.""")

    parser.add_argument('--hash-algorithm',
                        default='blake2b',
                        help="Which hashlib algorithm to compute file hashes. Default is blake2b.")

    parser.add_argument('--threads',
                        type=int,
                        default=4,
                        help="How many threads to use for processing files.")

    parser.add_argument('--garbage-collect-cache', '--gc',
                        dest='garbage_collect',
                        action="store_true",
                        help="Run garbage collection in the blob cache")

    parser.add_argument('--dry',
                        action="store_true",
                        help="""Only print what file operations would be performed instead of
                        actually performing them. Useful as a sanity check.""")

    parser.add_argument('--verbose', '-v',
                        action="store_true",
                        help="Output extra information during snapshot operations.")

    parser.add_argument('--version',
                        action="version",
                        version=f"%(prog)s {get_version_string()}",
                        help="Print version number and exit.")

    args = parser.parse_args()

    snapshot = FileSystemSnapshot(snapshots_dir=args.destination,
                                  snapshot_name=args.snapshot_name,
                                  target_paths=args.targets,
                                  exclude_patterns=args.exclude,
                                  hash_algorithm=args.hash_algorithm,
                                  threads=args.threads,
                                  dry_mode=args.dry,
                                  verbose=args.verbose)
    snapshot.take_snapshot()

    # Garbage collection runs after the snapshot so blobs freshly linked by
    # this run are never considered unreferenced.
    if args.garbage_collect:
        snapshot.garbage_collect_blob_cache()

    return 0
88
+
89
+
90
if __name__ == '__main__':
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
@@ -0,0 +1,460 @@
1
+ import hashlib
2
+ import json
3
+ import os
4
+ import queue
5
+ import shutil
6
+ import signal
7
+ import sys
8
+ import threading
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ from pathlib import Path
11
+ from typing import Any, BinaryIO, Callable, Generator
12
+
13
# Handler signature shared by the file/dir/symlink handlers:
# (source_path, destination_inside_snapshot) -> None.
type FileHandler = Callable[[Path,Path], None]
14
+
15
+ class FileSystemSnapshot:
16
+ def __init__(self,
17
+ snapshots_dir: str,
18
+ snapshot_name: str,
19
+ target_paths: list[str],
20
+ exclude_patterns: list[str]=[],
21
+ hash_algorithm: str='blake2b',
22
+ threads: int=4,
23
+ dry_mode: bool=True,
24
+ verbose: bool=False) -> None:
25
+
26
+ self.handle_file : FileHandler
27
+ self.handle_dir : FileHandler
28
+ self.handle_symlink : FileHandler
29
+
30
+ self._print_lock = threading.Lock()
31
+ self._hash_tree_write_lock = threading.Lock()
32
+ self._file_write_lock = threading.Lock()
33
+ self.output_path = Path(snapshots_dir).resolve()
34
+ self.cache_path = self.output_path / '.cacheback.d'
35
+ self.snapshot_name = snapshot_name
36
+ self.blob_store_path = self.cache_path / 'blob'
37
+ self.snapshot_path = self.output_path / snapshot_name
38
+ self.target_paths = set(Path(p).expanduser().resolve() for p in target_paths)
39
+ self.exclude_patterns = set(exclude_patterns)
40
+ self.hash_cache_path = self.cache_path / 'hash_tree_cache.json'
41
+ self._hash_fn = getattr(hashlib, hash_algorithm, hashlib.blake2b)
42
+ self.hash_tree = {}
43
+ self.threads = threads
44
+ self._thread_outputs = {}
45
+
46
+
47
+ self.exclude_patterns.add(f"{self.output_path}")
48
+
49
+ if dry_mode:
50
+ self.handle_file = self._handle_file_dry
51
+ self.handle_dir = self._handle_dir_dry
52
+ self.handle_symlink = self._handle_symlink_dry
53
+ else:
54
+ self.handle_file = self._handle_file
55
+ self.handle_dir = self._handle_dir
56
+ self.handle_symlink = self._handle_symlink
57
+
58
+ if verbose:
59
+ self._verbose = True
60
+ self.update_thread_status = self._update_thread_status_verbose
61
+ self.format_file_status = self._format_file_status_verbose
62
+ else:
63
+ self._verbose = False
64
+ self.update_thread_status = self._update_thread_status_ephemeral
65
+ self.format_file_status = self._format_file_status_ephemeral
66
+
67
+ self._read_terminal_width()
68
+ signal.signal(signal.SIGWINCH, self._read_terminal_width)
69
+
70
+
71
+ def _read_terminal_width(self, *_: Any) -> None:
72
+ with self._print_lock:
73
+ self._terminal_width = shutil.get_terminal_size((80,0)).columns
74
+
75
+
76
+ def printerr(self, *args: Any, **kwargs: Any) -> None:
77
+ kwargs['file']=sys.stderr
78
+ print(*args, **kwargs)
79
+
80
+
81
+ def _count_wrapped_lines(self, text: str) -> int:
82
+ return (len(text)-1) // self._terminal_width
83
+
84
+
85
    def print_ephemeral(self, text: str='') -> None:
        """Draw *text* on stderr so the next ephemeral print overwrites it.

        Writes the text, then moves the cursor back up to the first row of
        what was just written so subsequent calls repaint in place rather
        than scrolling.
        """
        lines = text.splitlines()
        # Rows the cursor advanced: newline count plus terminal-wrap overflow.
        rows = (len(lines)-1) + sum(map(self._count_wrapped_lines, lines))
        if rows > 0:
            lines_up = f'\033[{rows}F'  # CSI F: cursor up `rows` lines, column 0
        else:
            lines_up = ''
        # \033[0J clears cursor-to-end-of-screen; \033[0G returns to column 0.
        sys.stderr.write(f'\033[0J{text}\033[0G{lines_up}')
        sys.stderr.flush()
94
+
95
+
96
+ def print_persistent(self, text: str='') -> None:
97
+ with self._print_lock:
98
+ sys.stdout.write(f"\033[0G\033[0J{text}\n")
99
+ sys.stdout.flush()
100
+
101
+
102
+ def target_iter(self) -> Generator[Path]:
103
+ for target in self.target_paths:
104
+ yield from self.path_recursive_iter(target)
105
+
106
+
107
+ def is_path_excluded(self, path: Path) -> bool:
108
+ return any(path.full_match(xpat) for xpat in self.exclude_patterns)
109
+
110
+
111
+ def path_recursive_iter(self, path: Path) -> Generator[Path]:
112
+ if self.is_path_excluded(path):
113
+ self.print_persistent(f"Excluding {path}")
114
+ return
115
+ yield path
116
+ if path.is_dir() and not path.is_symlink():
117
+ for p in path.iterdir():
118
+ yield from self.path_recursive_iter(p)
119
+
120
+
121
+ def compute_file_hash(self, path: Path) -> str:
122
+ return self._hash_fn(path.read_bytes()).digest().hex()
123
+
124
+
125
+ @staticmethod
126
+ def get_file_modified_time(path: Path) -> int:
127
+ return int(path.stat().st_mtime)
128
+
129
+
130
+ @staticmethod
131
+ def get_nested_hash_path(file_hash: str,
132
+ dir_delimiter: str=os.path.sep,
133
+ nesting_levels: int=2,
134
+ nest_length: int=3) -> str:
135
+ _hash = file_hash
136
+ dirs = []
137
+ for i in range(nesting_levels):
138
+ nest_dir = _hash[:nest_length]
139
+ dirs.append(nest_dir)
140
+ _hash = _hash[nest_length:]
141
+ dirs.append(_hash)
142
+ hash_path = dir_delimiter.join(dirs)
143
+ return hash_path
144
+
145
+
146
    def load_hash_tree_cache(self) -> None:
        """Load the previous run's path -> [mtime, hash] cache from disk.

        The on-disk JSON is a nested dict mirroring the directory tree (so
        long path prefixes are not repeated). It is flattened back into
        ``self.hash_tree`` keyed by absolute-path string, and each entry
        gains a trailing ``False`` "visited" flag that is flipped to True as
        paths are seen during the snapshot; unvisited entries are pruned on
        save. Missing cache file means a cold start (hash_tree untouched).
        """
        if not self.hash_cache_path.exists():
            return
        with self.hash_cache_path.open('r') as cache_fd:
            hash_tree_cached = json.load(cache_fd)

        def _recursive_path_reconstruct_iter(parent: Path, tree_node: dict) -> Generator:
            # Inner dicts are directories; leaf lists are [mtime, hash] records.
            for k, v in tree_node.items():
                path = parent / k
                if isinstance(v, dict):
                    yield from _recursive_path_reconstruct_iter(path, v)
                else:
                    yield (path, [*v, False])

        self.hash_tree = {
            str(path): path_props
            for path, path_props in _recursive_path_reconstruct_iter(Path('/'), hash_tree_cached)
        }
164
+
165
+
166
    def store_hash_tree_cache(self) -> None:
        """Persist ``self.hash_tree`` to disk as a nested JSON directory tree.

        Entries whose visited flag is still False were not encountered during
        this snapshot (the file no longer exists) and are pruned. Nesting by
        path component keeps the JSON compact by not repeating directory
        prefixes.
        """
        hash_tree_nested = {}

        for path_str, props in self.hash_tree.items():
            path_mtime, path_hash, path_visited = props
            if not path_visited:
                self.print_persistent(f"Pruning hash cache entry for {path_str}")
                continue
            path = Path(path_str)
            # Walk (creating as needed) one nested dict level per directory
            # component; parts[1:] skips the leading '/' root component.
            nested_node = hash_tree_nested
            for part in path.parent.parts[1:]:
                if part not in nested_node:
                    nested_node[part] = {}
                nested_node = nested_node[part]
            nested_node[path.name] = [path_mtime, path_hash]

        with self.hash_cache_path.open('w') as cache_fd:
            # Compact separators: the cache holds one entry per file scanned.
            json.dump(hash_tree_nested, cache_fd, separators=(',', ':'))
184
+
185
+
186
+ def query_hash_tree_cache(self, file_path: Path) -> str:
187
+ current_mtime = self.get_file_modified_time(file_path)
188
+ file_hash = None
189
+ if (cached_props := self.hash_tree.get(str(file_path))):
190
+ cached_mtime, cached_hash, _visited = cached_props
191
+ if cached_mtime == current_mtime:
192
+ file_hash = cached_hash
193
+ if not file_hash:
194
+ file_hash = self.compute_file_hash(file_path)
195
+ with self._hash_tree_write_lock:
196
+ self.hash_tree[str(file_path)] = [current_mtime, file_hash, True]
197
+ return file_hash
198
+
199
+
200
+ def _handle_file_dry(self, file: Path, destination: Path) -> None:
201
+ file_hash = self.query_hash_tree_cache(file)
202
+ self.update_thread_status(self.format_file_status(file, file_hash))
203
+
204
+
205
    def _handle_file(self, file: Path, destination: Path) -> None:
        """Snapshot one regular file: ensure its blob exists, then hardlink.

        The file's content hash names a blob under the cache; the snapshot
        entry is a hardlink to that blob, so identical content — across files
        and across snapshots — is stored on disk exactly once.
        """
        file_hash = self.query_hash_tree_cache(file)
        nested_hash_path = self.get_nested_hash_path(file_hash)

        blob_path = self.blob_store_path / nested_hash_path
        blob_path.parent.mkdir(parents=True, exist_ok=True)

        if blob_path.exists():
            ...  # Blob already cached: nothing to copy, just link below.
        else:
            self.set_thread_status(f"Waiting to copy {file}...")
            with self._file_write_lock:  # Multiple threads writing makes everything slower :)
                # Large files (>100 MB) get a progress bar; small ones a plain copy.
                if file.stat().st_size > 100_000_000:
                    self.copy_file_with_progress(file, blob_path)
                else:
                    shutil.copy2(file, blob_path)

        self.update_thread_status(self.format_file_status(file, file_hash))

        destination.parent.mkdir(exist_ok=True, parents=True)

        if destination.exists():
            if self._verbose:
                if destination.samefile(blob_path):
                    self.print_persistent(
                        f"{file} is already linked to {file_hash} in this snapshot. The most"
                        " likely cause of this is that a symlink captured in this snapshot points"
                        " to this path.")
                else:
                    # NOTE(review): message text contains typos ("occured",
                    # "and interactive") — fix the string upstream.
                    self.print_persistent(
                        f"{file} already exists in this snapshot but it does not target the blob"
                        f" in the cache that matches its current hash {file_hash}. Something"
                        " unexpected has occured, dropping into and interactive debugger."
                    )
                    breakpoint()
        else:
            destination.hardlink_to(blob_path)
        return
243
+
244
+
245
+ def _handle_dir_dry(self, directory: Path, destination: Path) -> None:
246
+ self.update_thread_status(f"DRY: directory {directory}")
247
+
248
+
249
+ def _handle_dir(self, directory: Path, destination: Path) -> None:
250
+ destination.mkdir(parents=True, exist_ok=True)
251
+ return
252
+
253
+
254
+ def _handle_symlink_dry(self, symlink: Path, destination: Path) -> None:
255
+ self.update_thread_status(f"DRY: symlink {symlink} => {symlink.readlink()}")
256
+
257
+
258
    def _handle_symlink(self, symlink: Path, destination: Path) -> None:
        """Recreate *symlink* inside the snapshot, then snapshot its target.

        Absolute link targets are re-rooted under the snapshot directory so
        the copied link resolves within the snapshot; relative targets are
        duplicated verbatim (they already resolve correctly relative to the
        copied link's location). The real target is then snapshotted too so
        the link is not dangling inside the snapshot.
        """
        if destination.exists():
            return
        symlink_destination = symlink.readlink()
        if symlink_destination.is_absolute():
            # re-root the absolute symlink's target
            snapshot_symlink_destination = Path(f"{self.snapshot_path}/{symlink_destination}")
        else:
            # relative symlink will work as expected under the snapshot dir, just duplicate it
            snapshot_symlink_destination = symlink_destination
            # Resolve to the real target so it can be snapshotted below.
            symlink_destination = (symlink.parent / symlink_destination).resolve()

        self.update_thread_status(f"Symlink: {symlink} => {symlink_destination}")
        destination.parent.mkdir(parents=True, exist_ok=True)
        try:
            destination.symlink_to(snapshot_symlink_destination)
        except FileExistsError:
            # Another thread (or an earlier link chain) already created it.
            self.print_persistent(f"symlink {snapshot_symlink_destination} already exists")

        # Capture the link's target itself so the link resolves in-snapshot.
        if not self.is_path_excluded(symlink_destination):
            self.snapshot_by_type(symlink_destination)
        else:
            self.print_persistent(
                f"Symlink {symlink} points to excluded file {symlink_destination}"
            )
        return
284
+
285
+
286
+ def handle_unexpected(self, path: Path, destination: Path) -> None:
287
+ if path.exists():
288
+ breakpoint(header=f"Unhandled type for {path}")
289
+
290
+
291
+ def snapshot_by_type(self, path: Path) -> None:
292
+ destination = Path(f"{self.snapshot_path}/{path}")
293
+ if path.is_symlink():
294
+ return self.handle_symlink(path, destination)
295
+ if path.is_dir():
296
+ return self.handle_dir(path, destination)
297
+ if path.is_file():
298
+ return self.handle_file(path, destination)
299
+ if any((path.is_fifo(), path.is_socket(), path.is_char_device(), path.is_block_device())):
300
+ return
301
+ else:
302
+ return self.handle_unexpected(path, destination)
303
+
304
+
305
+ def take_snapshot(self) -> None:
306
+ self.blob_store_path.mkdir(parents=True, exist_ok=True)
307
+ self.snapshot_path.mkdir(parents=True, exist_ok=False)
308
+ self.print_persistent(f"Creating snapshot {self.snapshot_name}")
309
+ self.load_hash_tree_cache()
310
+ with ThreadPoolExecutor(max_workers=self.threads) as pool:
311
+ pool.map(self.snapshot_by_type, self.target_iter())
312
+ self.store_hash_tree_cache()
313
+ self.print_persistent()
314
+
315
+
316
+ @staticmethod
317
+ def _copy_bytes(fsrc: BinaryIO,
318
+ fdest: BinaryIO,
319
+ callback: Callable,
320
+ total: int,
321
+ length: int) -> None:
322
+ copied = 0
323
+ while True:
324
+ buf = fsrc.read(length)
325
+ if not buf:
326
+ break
327
+ fdest.write(buf)
328
+ copied += len(buf)
329
+ callback(copied, total)
330
+
331
+
332
+ def _copy_file_with_callback(self,
333
+ source_path: Path,
334
+ destination_path: Path,
335
+ callback: Callable,
336
+ callback_batch_size: int=65536) -> None:
337
+ size = os.stat(source_path).st_size
338
+ with open(source_path, "rb") as fsrc:
339
+ with open(destination_path, "wb") as fdest:
340
+ self._copy_bytes(fsrc,
341
+ fdest,
342
+ callback=callback,
343
+ total=size,
344
+ length=callback_batch_size)
345
+ shutil.copymode(str(source_path), str(destination_path))
346
+
347
+
348
+ def make_progress_printer(self, label: str) -> Callable[[int,int], str]:
349
+ template = f"{label} [{{filled}}{{empty}}]"
350
+ template_size = len(template.format(filled='',empty=''))
351
+ template_ansi = f"\033[0J{template}\033[0G"
352
+
353
+ def _progress_printer(current: int, total: int) -> str:
354
+ bar_width = self._terminal_width - template_size
355
+ progress = (current * bar_width) // total
356
+ filled = '-' * progress
357
+ empty = ' ' * (bar_width - progress)
358
+ return template_ansi.format(filled=filled, empty=empty)
359
+
360
+ return _progress_printer
361
+
362
+
363
+ def copy_file_with_progress(self, source_path: Path, destination_path: Path) -> None:
364
+ _file_progress = self.make_progress_printer(str(source_path))
365
+ if self._verbose:
366
+ def _callback(x: int, y: int) -> None:
367
+ self.printerr(_file_progress(x,y), end='\r')
368
+ else:
369
+ def _callback(x: int, y: int) -> None:
370
+ self.update_thread_status(_file_progress(x,y))
371
+ _callback(0,1)
372
+ self._copy_file_with_callback(source_path, destination_path, _callback)
373
+ self.print_persistent(f"\r\033[0JCopied {source_path}")
374
+
375
+
376
    def garbage_collect_blob_cache(self, threads: int=1) -> None:
        """Delete cached blobs no longer hardlinked from any snapshot.

        Feeds every path under the blob store through a work queue serviced
        by worker threads: files with a link count below 2 (no snapshot
        hardlinks left) are unlinked, and directories found empty are
        removed. Parents of removed entries are re-queued so chains of
        newly-empty ancestor directories collapse as well. Workers exit once
        the queue stays empty for a second after the stop event is set.

        Args:
            threads: Number of worker threads draining the queue.
        """
        blobs = self.blob_store_path.rglob('*')
        thread_stop_event = threading.Event()
        blob_queue = queue.SimpleQueue()
        blobs_checked = 0
        # Start the denominator at 1 so the progress ratio never divides by zero.
        blobs_total = 1
        _gc_progress = self.make_progress_printer("Garbage collecting")
        self.printerr(_gc_progress(0, 1), end='\r')

        def _check_blob_queue_thread_task() -> None:
            # Worker loop: process queue items until stop is signalled AND the
            # queue has been empty for a full timeout interval.
            nonlocal blobs_checked, blobs_total
            while True:
                try:
                    blob = blob_queue.get(timeout=1)
                    if blob.is_file():
                        if blob.stat().st_nlink < 2:
                            # If blob isn't hardlinked to, delete it from storage
                            self.print_persistent(f"Garbage collecting {blob}")
                            blob.unlink()
                            # Re-check the parent: it may now be empty.
                            blob_queue.put(blob.parent)
                            blobs_total += 1
                    elif blob.is_dir():
                        if not any(blob.iterdir()):
                            self.print_persistent(f"Garbage collecting {blob}/")
                            blob.rmdir()
                            blob_queue.put(blob.parent)
                    blobs_checked += 1
                except queue.Empty:
                    if thread_stop_event.is_set():
                        return

        pool = ThreadPoolExecutor(max_workers=threads)
        _futures = [pool.submit(_check_blob_queue_thread_task) for i in range(threads)]
        # Producer: enqueue every blob path while repainting the progress bar.
        for blob in blobs:
            blobs_total += 1
            blob_queue.put(blob)
            self.printerr(_gc_progress(blobs_checked, blobs_total), end='\r')
        thread_stop_event.set()
        pool.shutdown(wait=True)

        self.print_persistent()
417
+
418
+
419
+ def print_thread_statuses_ephemeral(self) -> None:
420
+ with self._print_lock:
421
+ dead_threads = (set(self._thread_outputs.keys())
422
+ .difference(t.ident for t in threading.enumerate()))
423
+ for thread_id in dead_threads:
424
+ self._thread_outputs.pop(thread_id, None)
425
+ body = '\n'.join(self._thread_outputs.values())
426
+ self.print_ephemeral(body)
427
+
428
+
429
+ def set_thread_status(self, text: str) -> None:
430
+ self._thread_outputs[threading.get_ident()] = text
431
+
432
+
433
    def _update_thread_status_ephemeral(self, text: str) -> None:
        """Store this thread's status, then repaint the shared status area."""
        self.set_thread_status(text)
        self.print_thread_statuses_ephemeral()
436
+
437
+
438
    def _update_thread_status_verbose(self, text: str) -> None:
        """Verbose mode: statuses append as ordinary, scrolling stdout lines."""
        # Lock so concurrent worker threads don't interleave output.
        with self._print_lock:
            print(text)
441
+
442
+
443
    # Formatting constants for the two-line ephemeral file status: the hash is
    # drawn on its own line beneath the path, marked with an arrow glyph.
    _hash_prefix = " ↳ "
    _hash_prefix_size = len(_hash_prefix)
    # Glyph used when a path or hash must be truncated to the terminal width.
    _truncate_str = "…"
    _truncate_size = len(_truncate_str)
447
+
448
    def _format_file_status_ephemeral(self, file_path: Path, file_hash: str) -> str:
        """Render "path\\n ↳ hash", truncated to fit the terminal width.

        The hash is tail-truncated (its prefix identifies the blob shard);
        the path is head-truncated, keeping its distinguishing tail.
        """
        file_path_str = str(file_path)
        # How far the " ↳ hash" line would overflow the terminal, if at all.
        hash_len_delta = len(file_hash) + self._hash_prefix_size - self._terminal_width
        if hash_len_delta > 0:
            file_hash = f"{file_hash[:-(hash_len_delta+self._truncate_size)]}{self._truncate_str}"
        if len(file_path_str)> self._terminal_width:
            path_truncate_size = self._terminal_width - self._truncate_size
            file_path_str = f"{self._truncate_str}{file_path_str[-path_truncate_size:]}"
        return f"{file_path_str}\n{self._hash_prefix}{file_hash}"
457
+
458
+
459
+ def _format_file_status_verbose(self, file_path: Path, file_hash: str) -> str:
460
+ return f"{file_path} -> {file_hash}"