cacheback-snapshot 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cacheback_snapshot-0.0.1/LICENSE +21 -0
- cacheback_snapshot-0.0.1/PKG-INFO +59 -0
- cacheback_snapshot-0.0.1/README.md +45 -0
- cacheback_snapshot-0.0.1/cacheback_snapshot.egg-info/PKG-INFO +59 -0
- cacheback_snapshot-0.0.1/cacheback_snapshot.egg-info/SOURCES.txt +11 -0
- cacheback_snapshot-0.0.1/cacheback_snapshot.egg-info/dependency_links.txt +1 -0
- cacheback_snapshot-0.0.1/cacheback_snapshot.egg-info/entry_points.txt +2 -0
- cacheback_snapshot-0.0.1/cacheback_snapshot.egg-info/top_level.txt +1 -0
- cacheback_snapshot-0.0.1/pyproject.toml +70 -0
- cacheback_snapshot-0.0.1/setup.cfg +4 -0
- cacheback_snapshot-0.0.1/src/__init__.py +0 -0
- cacheback_snapshot-0.0.1/src/__main__.py +91 -0
- cacheback_snapshot-0.0.1/src/file_system_snapshot.py +460 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 M B
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cacheback-snapshot
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: File system snapshotting tool that prioritizes speed and reducing redundant storage.
|
|
5
|
+
Author: MB
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/m-bartlett/cacheback
|
|
8
|
+
Project-URL: Issues, https://github.com/m-bartlett/cacheback/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
<p align="center">
|
|
16
|
+
<picture>
|
|
17
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/51214078-61b3-4afe-8add-7df04a34ae54" width="700">
|
|
18
|
+
<source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
|
|
19
|
+
<img alt="Fallback image description" src="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
|
|
20
|
+
</picture>
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
<p align="center">
|
|
24
|
+
File system snapshot tool that prioritizes snapshot speed and reducing redundant storage.
|
|
25
|
+
</p>
|
|
26
|
+
<br/>
|
|
27
|
+
|
|
28
|
+
## How it works
|
|
29
|
+
|
|
30
|
+
`cacheback` achieves its goals of quick snapshots and minimized snapshot storage size by using hardlink features of modern filesystems
|
|
31
|
+
for files whose contents are unchanged between snapshots.
|
|
32
|
+
This is similar to how git tracks objects in a repository by storing a file's data based on its content hash.
|
|
33
|
+
To further improve speed, a cache of the previous snapshot scan is stored which stores each file's last modification timestamp and
|
|
34
|
+
these timestamps are compared before computing the file content hash. If the timestamp is unchanged, it is assumed that the file has
|
|
35
|
+
not changed since the previous snapshot and is linked to the existing content stored on disk.
|
|
36
|
+
|
|
37
|
+
Here is a diagram visualizing this concept of files within snapshots being pointers to stored data based on content hash:
|
|
38
|
+
|
|
39
|
+
<p align="center">
|
|
40
|
+
<picture width="600">
|
|
41
|
+
<source
|
|
42
|
+
media="(prefers-color-scheme: light)"
|
|
43
|
+
srcset="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7"
|
|
44
|
+
>
|
|
45
|
+
<source
|
|
46
|
+
media="(prefers-color-scheme: dark)"
|
|
47
|
+
srcset="https://github.com/user-attachments/assets/05c41fb1-f8a0-4465-8d9b-0f30374317d3"
|
|
48
|
+
>
|
|
49
|
+
<img src="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7">
|
|
50
|
+
</picture>
|
|
51
|
+
</p>
|
|
52
|
+
|
|
53
|
+
If a file is unchanged between multiple snapshots, each file will point to the same hash-named object and therefore the literal file content
|
|
54
|
+
is only stored on disk one time. If snapshots are deleted and a given hashed content is no longer pointed to by any files in any snapshots,
|
|
55
|
+
then the `--garbage-collect-cache` flag will prompt `cacheback` to purge these unused hash-named files to recover storage space.
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
`pip install cacheback`
|
|
59
|
+
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<picture>
|
|
3
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/51214078-61b3-4afe-8add-7df04a34ae54" width="700">
|
|
4
|
+
<source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
|
|
5
|
+
<img alt="Fallback image description" src="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
|
|
6
|
+
</picture>
|
|
7
|
+
</p>
|
|
8
|
+
|
|
9
|
+
<p align="center">
|
|
10
|
+
File system snapshot tool that prioritizes snapshot speed and reducing redundant storage.
|
|
11
|
+
</p>
|
|
12
|
+
<br/>
|
|
13
|
+
|
|
14
|
+
## How it works
|
|
15
|
+
|
|
16
|
+
`cacheback` achieves its goals of quick snapshots and minimized snapshot storage size by using hardlink features of modern filesystems
|
|
17
|
+
for files whose contents are unchanged between snapshots.
|
|
18
|
+
This is similar to how git tracks objects in a repository by storing a file's data based on its content hash.
|
|
19
|
+
To further improve speed, a cache of the previous snapshot scan is stored which stores each file's last modification timestamp and
|
|
20
|
+
these timestamps are compared before computing the file content hash. If the timestamp is unchanged, it is assumed that the file has
|
|
21
|
+
not changed since the previous snapshot and is linked to the existing content stored on disk.
|
|
22
|
+
|
|
23
|
+
Here is a diagram visualizing this concept of files within snapshots being pointers to stored data based on content hash:
|
|
24
|
+
|
|
25
|
+
<p align="center">
|
|
26
|
+
<picture width="600">
|
|
27
|
+
<source
|
|
28
|
+
media="(prefers-color-scheme: light)"
|
|
29
|
+
srcset="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7"
|
|
30
|
+
>
|
|
31
|
+
<source
|
|
32
|
+
media="(prefers-color-scheme: dark)"
|
|
33
|
+
srcset="https://github.com/user-attachments/assets/05c41fb1-f8a0-4465-8d9b-0f30374317d3"
|
|
34
|
+
>
|
|
35
|
+
<img src="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7">
|
|
36
|
+
</picture>
|
|
37
|
+
</p>
|
|
38
|
+
|
|
39
|
+
If a file is unchanged between multiple snapshots, each file will point to the same hash-named object and therefore the literal file content
|
|
40
|
+
is only stored on disk one time. If snapshots are deleted and a given hashed content is no longer pointed to by any files in any snapshots,
|
|
41
|
+
then the `--garbage-collect-cache` flag will prompt `cacheback` to purge these unused hash-named files to recover storage space.
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
`pip install cacheback`
|
|
45
|
+
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cacheback-snapshot
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: File system snapshotting tool that prioritizes speed and reducing redundant storage.
|
|
5
|
+
Author: MB
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/m-bartlett/cacheback
|
|
8
|
+
Project-URL: Issues, https://github.com/m-bartlett/cacheback/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
<p align="center">
|
|
16
|
+
<picture>
|
|
17
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/51214078-61b3-4afe-8add-7df04a34ae54" width="700">
|
|
18
|
+
<source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
|
|
19
|
+
<img alt="Fallback image description" src="https://github.com/user-attachments/assets/379c90ea-03cd-4062-a57d-d5da6fc2689f" width="700">
|
|
20
|
+
</picture>
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
<p align="center">
|
|
24
|
+
File system snapshot tool that prioritizes snapshot speed and reducing redundant storage.
|
|
25
|
+
</p>
|
|
26
|
+
<br/>
|
|
27
|
+
|
|
28
|
+
## How it works
|
|
29
|
+
|
|
30
|
+
`cacheback` achieves its goals of quick snapshots and minimized snapshot storage size by using hardlink features of modern filesystems
|
|
31
|
+
for files whose contents are unchanged between snapshots.
|
|
32
|
+
This is similar to how git tracks objects in a repository by storing a file's data based on its content hash.
|
|
33
|
+
To further improve speed, a cache of the previous snapshot scan is stored which stores each file's last modification timestamp and
|
|
34
|
+
these timestamps are compared before computing the file content hash. If the timestamp is unchanged, it is assumed that the file has
|
|
35
|
+
not changed since the previous snapshot and is linked to the existing content stored on disk.
|
|
36
|
+
|
|
37
|
+
Here is a diagram visualizing this concept of files within snapshots being pointers to stored data based on content hash:
|
|
38
|
+
|
|
39
|
+
<p align="center">
|
|
40
|
+
<picture width="600">
|
|
41
|
+
<source
|
|
42
|
+
media="(prefers-color-scheme: light)"
|
|
43
|
+
srcset="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7"
|
|
44
|
+
>
|
|
45
|
+
<source
|
|
46
|
+
media="(prefers-color-scheme: dark)"
|
|
47
|
+
srcset="https://github.com/user-attachments/assets/05c41fb1-f8a0-4465-8d9b-0f30374317d3"
|
|
48
|
+
>
|
|
49
|
+
<img src="https://github.com/user-attachments/assets/4f99e5f0-1aef-48f4-a3cb-960a469353f7">
|
|
50
|
+
</picture>
|
|
51
|
+
</p>
|
|
52
|
+
|
|
53
|
+
If a file is unchanged between multiple snapshots, each file will point to the same hash-named object and therefore the literal file content
|
|
54
|
+
is only stored on disk one time. If snapshots are deleted and a given hashed content is no longer pointed to by any files in any snapshots,
|
|
55
|
+
then the `--garbage-collect-cache` flag will prompt `cacheback` to purge these unused hash-named files to recover storage space.
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
`pip install cacheback`
|
|
59
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
cacheback_snapshot.egg-info/PKG-INFO
|
|
5
|
+
cacheback_snapshot.egg-info/SOURCES.txt
|
|
6
|
+
cacheback_snapshot.egg-info/dependency_links.txt
|
|
7
|
+
cacheback_snapshot.egg-info/entry_points.txt
|
|
8
|
+
cacheback_snapshot.egg-info/top_level.txt
|
|
9
|
+
src/__init__.py
|
|
10
|
+
src/__main__.py
|
|
11
|
+
src/file_system_snapshot.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
cacheback
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "cacheback-snapshot"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
|
|
5
|
+
authors = [
|
|
6
|
+
{ name="MB" },
|
|
7
|
+
]
|
|
8
|
+
|
|
9
|
+
description = """
|
|
10
|
+
File system snapshotting tool that prioritizes speed and reducing redundant storage.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
requires-python = ">=3.10"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
]
|
|
18
|
+
license = "MIT"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
[project.urls]
|
|
22
|
+
Homepage = "https://github.com/m-bartlett/cacheback"
|
|
23
|
+
Issues = "https://github.com/m-bartlett/cacheback/issues"
|
|
24
|
+
|
|
25
|
+
[project.scripts]
|
|
26
|
+
cacheback = "cacheback.__main__:main"
|
|
27
|
+
|
|
28
|
+
[build-system]
|
|
29
|
+
requires = ["setuptools"]
|
|
30
|
+
build-backend = "setuptools.build_meta"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools]
|
|
33
|
+
package-dir = {"cacheback" = "src"}
|
|
34
|
+
|
|
35
|
+
[tool.ty.environment]
|
|
36
|
+
root = ["."]
|
|
37
|
+
python-version = "3.13"
|
|
38
|
+
|
|
39
|
+
[tool.ty.rules]
|
|
40
|
+
division-by-zero = "ignore"
|
|
41
|
+
|
|
42
|
+
[tool.ruff]
|
|
43
|
+
line-length = 100
|
|
44
|
+
indent-width = 4
|
|
45
|
+
target-version = "py314"
|
|
46
|
+
|
|
47
|
+
[tool.ruff.format]
|
|
48
|
+
quote-style = "double"
|
|
49
|
+
indent-style = "space"
|
|
50
|
+
line-ending = "auto"
|
|
51
|
+
|
|
52
|
+
[tool.ruff.lint]
|
|
53
|
+
select = [
|
|
54
|
+
"F", # pyflakes
|
|
55
|
+
"E", # pycodestyle
|
|
56
|
+
"I", # isort
|
|
57
|
+
"ANN", # flake8 type annotations
|
|
58
|
+
"RUF", # ruff-specific rules
|
|
59
|
+
]
|
|
60
|
+
fixable = ["ALL"]
|
|
61
|
+
|
|
62
|
+
[tool.ruff.lint.flake8-annotations]
|
|
63
|
+
allow-star-arg-any = true
|
|
64
|
+
suppress-dummy-args = false
|
|
65
|
+
|
|
66
|
+
[tool.ruff.lint.pydocstyle]
|
|
67
|
+
convention = "google"
|
|
68
|
+
|
|
69
|
+
[tool.autopep8]
|
|
70
|
+
max_line_length = 100
|
|
File without changes
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import importlib.metadata
|
|
3
|
+
import sys
|
|
4
|
+
from argparse import ArgumentParser
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
|
|
7
|
+
from .file_system_snapshot import FileSystemSnapshot
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_version_string() -> str:
    """Return the installed version of this package, falling back to a baked-in default."""
    top_package = __name__.partition('.')[0]
    try:
        return importlib.metadata.version(top_package)
    except importlib.metadata.PackageNotFoundError:
        # Running from a source checkout rather than an installed distribution.
        return "0.0.1"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def main() -> int:
    """Command-line entry point: parse arguments, take a snapshot, optionally GC the blob cache.

    Returns:
        Process exit code (0 on success).
    """
    parser = ArgumentParser()

    parser.add_argument('--destination', '-o',
                        required=True,
                        help="""Path to store filesystem snapshots in. Reuse this destination in
                        the future to reuse the blob cache and prevent file duplication.""")

    parser.add_argument('--targets', '-i',
                        nargs='+',
                        help="Path(s) to recursively snapshot")

    # Fixed typo in the help text below ("documentaiton" -> "documentation").
    parser.add_argument('--exclude', '-x',
                        nargs='*',
                        default=[],
                        help="""
                        Path patterns to omit from the snapshot. See
                        https://docs.python.org/3/library/pathlib.html#pathlib-pattern-language
                        for documentation on pattern syntax.
                        """)

    parser.add_argument('--snapshot-name', '-n',
                        default=datetime.now().strftime('%F %H:%M'),
                        help="""Name to use for this snapshot's directory. Defaults to the current
                        timestamp with filesystem-naming compatible delimiters.""")

    parser.add_argument('--hash-algorithm',
                        default='blake2b',
                        help="Which hashlib algorithm to compute file hashes. Default is blake2b.")

    parser.add_argument('--threads',
                        type=int,
                        default=4,
                        help="How many threads to use for processing files.")

    parser.add_argument('--garbage-collect-cache', '--gc',
                        dest='garbage_collect',
                        action="store_true",
                        help="Run garbage collection in the blob cache")

    parser.add_argument('--dry',
                        action="store_true",
                        help="""Only print what file operations would be performed instead of
                        actually performing them. Useful as a sanity check.""")

    parser.add_argument('--verbose', '-v',
                        action="store_true",
                        help="Output extra information during snapshot operations.")

    parser.add_argument('--version',
                        action="version",
                        version=f"%(prog)s {get_version_string()}",
                        help="Print version number and exit.")

    args = parser.parse_args()

    snapshot = FileSystemSnapshot(snapshots_dir=args.destination,
                                  snapshot_name=args.snapshot_name,
                                  target_paths=args.targets,
                                  exclude_patterns=args.exclude,
                                  hash_algorithm=args.hash_algorithm,
                                  threads=args.threads,
                                  dry_mode=args.dry,
                                  verbose=args.verbose)
    snapshot.take_snapshot()

    if args.garbage_collect:
        snapshot.garbage_collect_blob_cache()

    return 0
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
if __name__ == '__main__':
|
|
91
|
+
sys.exit(main())
|
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import queue
|
|
5
|
+
import shutil
|
|
6
|
+
import signal
|
|
7
|
+
import sys
|
|
8
|
+
import threading
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, BinaryIO, Callable, Generator
|
|
12
|
+
|
|
13
|
+
type FileHandler = Callable[[Path,Path], None]
|
|
14
|
+
|
|
15
|
+
class FileSystemSnapshot:
|
|
16
|
+
def __init__(self,
             snapshots_dir: str,
             snapshot_name: str,
             target_paths: list[str],
             exclude_patterns: list[str] | None = None,
             hash_algorithm: str = 'blake2b',
             threads: int = 4,
             dry_mode: bool = True,
             verbose: bool = False) -> None:
    """Configure a snapshot run.

    Args:
        snapshots_dir: Directory holding all snapshots plus the blob cache.
        snapshot_name: Directory name for this snapshot inside ``snapshots_dir``.
        target_paths: Paths to recursively capture.
        exclude_patterns: pathlib-style patterns to skip. Changed from a mutable
            ``[]`` default to the ``None`` sentinel idiom; passing an explicit
            list behaves exactly as before.
        hash_algorithm: hashlib algorithm name; unknown names fall back to blake2b.
        threads: Worker-thread count for file processing.
        dry_mode: When true, handlers only report what would be done.
        verbose: When true, emit persistent (non-ephemeral) status output.
    """
    self.handle_file : FileHandler
    self.handle_dir : FileHandler
    self.handle_symlink : FileHandler

    self._print_lock = threading.Lock()
    self._hash_tree_write_lock = threading.Lock()
    self._file_write_lock = threading.Lock()
    self.output_path = Path(snapshots_dir).resolve()
    self.cache_path = self.output_path / '.cacheback.d'
    self.snapshot_name = snapshot_name
    self.blob_store_path = self.cache_path / 'blob'
    self.snapshot_path = self.output_path / snapshot_name
    self.target_paths = set(Path(p).expanduser().resolve() for p in target_paths)
    # Copy into a fresh set so the caller's list is never aliased or mutated.
    self.exclude_patterns = set(exclude_patterns) if exclude_patterns else set()
    self.hash_cache_path = self.cache_path / 'hash_tree_cache.json'
    self._hash_fn = getattr(hashlib, hash_algorithm, hashlib.blake2b)
    self.hash_tree = {}
    self.threads = threads
    self._thread_outputs = {}

    # Never descend into the snapshot destination itself.
    self.exclude_patterns.add(f"{self.output_path}")

    if dry_mode:
        self.handle_file = self._handle_file_dry
        self.handle_dir = self._handle_dir_dry
        self.handle_symlink = self._handle_symlink_dry
    else:
        self.handle_file = self._handle_file
        self.handle_dir = self._handle_dir
        self.handle_symlink = self._handle_symlink

    if verbose:
        self._verbose = True
        self.update_thread_status = self._update_thread_status_verbose
        self.format_file_status = self._format_file_status_verbose
    else:
        self._verbose = False
        self.update_thread_status = self._update_thread_status_ephemeral
        self.format_file_status = self._format_file_status_ephemeral

    self._read_terminal_width()
    # Keep the cached terminal width fresh when the window is resized.
    signal.signal(signal.SIGWINCH, self._read_terminal_width)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _read_terminal_width(self, *_: Any) -> None:
|
|
72
|
+
with self._print_lock:
|
|
73
|
+
self._terminal_width = shutil.get_terminal_size((80,0)).columns
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def printerr(self, *args: Any, **kwargs: Any) -> None:
|
|
77
|
+
kwargs['file']=sys.stderr
|
|
78
|
+
print(*args, **kwargs)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _count_wrapped_lines(self, text: str) -> int:
|
|
82
|
+
return (len(text)-1) // self._terminal_width
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def print_ephemeral(self, text: str='') -> None:
|
|
86
|
+
lines = text.splitlines()
|
|
87
|
+
rows = (len(lines)-1) + sum(map(self._count_wrapped_lines, lines))
|
|
88
|
+
if rows > 0:
|
|
89
|
+
lines_up = f'\033[{rows}F'
|
|
90
|
+
else:
|
|
91
|
+
lines_up = ''
|
|
92
|
+
sys.stderr.write(f'\033[0J{text}\033[0G{lines_up}')
|
|
93
|
+
sys.stderr.flush()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def print_persistent(self, text: str='') -> None:
|
|
97
|
+
with self._print_lock:
|
|
98
|
+
sys.stdout.write(f"\033[0G\033[0J{text}\n")
|
|
99
|
+
sys.stdout.flush()
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def target_iter(self) -> Generator[Path]:
|
|
103
|
+
for target in self.target_paths:
|
|
104
|
+
yield from self.path_recursive_iter(target)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def is_path_excluded(self, path: Path) -> bool:
|
|
108
|
+
return any(path.full_match(xpat) for xpat in self.exclude_patterns)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def path_recursive_iter(self, path: Path) -> Generator[Path]:
|
|
112
|
+
if self.is_path_excluded(path):
|
|
113
|
+
self.print_persistent(f"Excluding {path}")
|
|
114
|
+
return
|
|
115
|
+
yield path
|
|
116
|
+
if path.is_dir() and not path.is_symlink():
|
|
117
|
+
for p in path.iterdir():
|
|
118
|
+
yield from self.path_recursive_iter(p)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def compute_file_hash(self, path: Path) -> str:
|
|
122
|
+
return self._hash_fn(path.read_bytes()).digest().hex()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@staticmethod
|
|
126
|
+
def get_file_modified_time(path: Path) -> int:
|
|
127
|
+
return int(path.stat().st_mtime)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
@staticmethod
|
|
131
|
+
def get_nested_hash_path(file_hash: str,
|
|
132
|
+
dir_delimiter: str=os.path.sep,
|
|
133
|
+
nesting_levels: int=2,
|
|
134
|
+
nest_length: int=3) -> str:
|
|
135
|
+
_hash = file_hash
|
|
136
|
+
dirs = []
|
|
137
|
+
for i in range(nesting_levels):
|
|
138
|
+
nest_dir = _hash[:nest_length]
|
|
139
|
+
dirs.append(nest_dir)
|
|
140
|
+
_hash = _hash[nest_length:]
|
|
141
|
+
dirs.append(_hash)
|
|
142
|
+
hash_path = dir_delimiter.join(dirs)
|
|
143
|
+
return hash_path
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def load_hash_tree_cache(self) -> None:
|
|
147
|
+
if not self.hash_cache_path.exists():
|
|
148
|
+
return
|
|
149
|
+
with self.hash_cache_path.open('r') as cache_fd:
|
|
150
|
+
hash_tree_cached = json.load(cache_fd)
|
|
151
|
+
|
|
152
|
+
def _recursive_path_reconstruct_iter(parent: Path, tree_node: dict) -> Generator:
|
|
153
|
+
for k, v in tree_node.items():
|
|
154
|
+
path = parent / k
|
|
155
|
+
if isinstance(v, dict):
|
|
156
|
+
yield from _recursive_path_reconstruct_iter(path, v)
|
|
157
|
+
else:
|
|
158
|
+
yield (path, [*v, False])
|
|
159
|
+
|
|
160
|
+
self.hash_tree = {
|
|
161
|
+
str(path): path_props
|
|
162
|
+
for path, path_props in _recursive_path_reconstruct_iter(Path('/'), hash_tree_cached)
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def store_hash_tree_cache(self) -> None:
|
|
167
|
+
hash_tree_nested = {}
|
|
168
|
+
|
|
169
|
+
for path_str, props in self.hash_tree.items():
|
|
170
|
+
path_mtime, path_hash, path_visited = props
|
|
171
|
+
if not path_visited:
|
|
172
|
+
self.print_persistent(f"Pruning hash cache entry for {path_str}")
|
|
173
|
+
continue
|
|
174
|
+
path = Path(path_str)
|
|
175
|
+
nested_node = hash_tree_nested
|
|
176
|
+
for part in path.parent.parts[1:]:
|
|
177
|
+
if part not in nested_node:
|
|
178
|
+
nested_node[part] = {}
|
|
179
|
+
nested_node = nested_node[part]
|
|
180
|
+
nested_node[path.name] = [path_mtime, path_hash]
|
|
181
|
+
|
|
182
|
+
with self.hash_cache_path.open('w') as cache_fd:
|
|
183
|
+
json.dump(hash_tree_nested, cache_fd, separators=(',', ':'))
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def query_hash_tree_cache(self, file_path: Path) -> str:
|
|
187
|
+
current_mtime = self.get_file_modified_time(file_path)
|
|
188
|
+
file_hash = None
|
|
189
|
+
if (cached_props := self.hash_tree.get(str(file_path))):
|
|
190
|
+
cached_mtime, cached_hash, _visited = cached_props
|
|
191
|
+
if cached_mtime == current_mtime:
|
|
192
|
+
file_hash = cached_hash
|
|
193
|
+
if not file_hash:
|
|
194
|
+
file_hash = self.compute_file_hash(file_path)
|
|
195
|
+
with self._hash_tree_write_lock:
|
|
196
|
+
self.hash_tree[str(file_path)] = [current_mtime, file_hash, True]
|
|
197
|
+
return file_hash
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _handle_file_dry(self, file: Path, destination: Path) -> None:
|
|
201
|
+
file_hash = self.query_hash_tree_cache(file)
|
|
202
|
+
self.update_thread_status(self.format_file_status(file, file_hash))
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _handle_file(self, file: Path, destination: Path) -> None:
|
|
206
|
+
file_hash = self.query_hash_tree_cache(file)
|
|
207
|
+
nested_hash_path = self.get_nested_hash_path(file_hash)
|
|
208
|
+
|
|
209
|
+
blob_path = self.blob_store_path / nested_hash_path
|
|
210
|
+
blob_path.parent.mkdir(parents=True, exist_ok=True)
|
|
211
|
+
|
|
212
|
+
if blob_path.exists():
|
|
213
|
+
...
|
|
214
|
+
else:
|
|
215
|
+
self.set_thread_status(f"Waiting to copy {file}...")
|
|
216
|
+
with self._file_write_lock: # Multiple threads writing makes everything slower :)
|
|
217
|
+
if file.stat().st_size > 100_000_000:
|
|
218
|
+
self.copy_file_with_progress(file, blob_path)
|
|
219
|
+
else:
|
|
220
|
+
shutil.copy2(file, blob_path)
|
|
221
|
+
|
|
222
|
+
self.update_thread_status(self.format_file_status(file, file_hash))
|
|
223
|
+
|
|
224
|
+
destination.parent.mkdir(exist_ok=True, parents=True)
|
|
225
|
+
|
|
226
|
+
if destination.exists():
|
|
227
|
+
if self._verbose:
|
|
228
|
+
if destination.samefile(blob_path):
|
|
229
|
+
self.print_persistent(
|
|
230
|
+
f"{file} is already linked to {file_hash} in this snapshot. The most"
|
|
231
|
+
" likely cause of this is that a symlink captured in this snapshot points"
|
|
232
|
+
" to this path.")
|
|
233
|
+
else:
|
|
234
|
+
self.print_persistent(
|
|
235
|
+
f"{file} already exists in this snapshot but it does not target the blob"
|
|
236
|
+
f" in the cache that matches its current hash {file_hash}. Something"
|
|
237
|
+
" unexpected has occured, dropping into and interactive debugger."
|
|
238
|
+
)
|
|
239
|
+
breakpoint()
|
|
240
|
+
else:
|
|
241
|
+
destination.hardlink_to(blob_path)
|
|
242
|
+
return
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _handle_dir_dry(self, directory: Path, destination: Path) -> None:
|
|
246
|
+
self.update_thread_status(f"DRY: directory {directory}")
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _handle_dir(self, directory: Path, destination: Path) -> None:
|
|
250
|
+
destination.mkdir(parents=True, exist_ok=True)
|
|
251
|
+
return
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _handle_symlink_dry(self, symlink: Path, destination: Path) -> None:
|
|
255
|
+
self.update_thread_status(f"DRY: symlink {symlink} => {symlink.readlink()}")
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _handle_symlink(self, symlink: Path, destination: Path) -> None:
|
|
259
|
+
if destination.exists():
|
|
260
|
+
return
|
|
261
|
+
symlink_destination = symlink.readlink()
|
|
262
|
+
if symlink_destination.is_absolute():
|
|
263
|
+
# re-root the absolute symlink's target
|
|
264
|
+
snapshot_symlink_destination = Path(f"{self.snapshot_path}/{symlink_destination}")
|
|
265
|
+
else:
|
|
266
|
+
# relative symlink will work as expected under the snapshot dir, just duplicate it
|
|
267
|
+
snapshot_symlink_destination = symlink_destination
|
|
268
|
+
symlink_destination = (symlink.parent / symlink_destination).resolve()
|
|
269
|
+
|
|
270
|
+
self.update_thread_status(f"Symlink: {symlink} => {symlink_destination}")
|
|
271
|
+
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
272
|
+
try:
|
|
273
|
+
destination.symlink_to(snapshot_symlink_destination)
|
|
274
|
+
except FileExistsError:
|
|
275
|
+
self.print_persistent(f"symlink {snapshot_symlink_destination} already exists")
|
|
276
|
+
|
|
277
|
+
if not self.is_path_excluded(symlink_destination):
|
|
278
|
+
self.snapshot_by_type(symlink_destination)
|
|
279
|
+
else:
|
|
280
|
+
self.print_persistent(
|
|
281
|
+
f"Symlink {symlink} points to excluded file {symlink_destination}"
|
|
282
|
+
)
|
|
283
|
+
return
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def handle_unexpected(self, path: Path, destination: Path) -> None:
|
|
287
|
+
if path.exists():
|
|
288
|
+
breakpoint(header=f"Unhandled type for {path}")
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def snapshot_by_type(self, path: Path) -> None:
|
|
292
|
+
destination = Path(f"{self.snapshot_path}/{path}")
|
|
293
|
+
if path.is_symlink():
|
|
294
|
+
return self.handle_symlink(path, destination)
|
|
295
|
+
if path.is_dir():
|
|
296
|
+
return self.handle_dir(path, destination)
|
|
297
|
+
if path.is_file():
|
|
298
|
+
return self.handle_file(path, destination)
|
|
299
|
+
if any((path.is_fifo(), path.is_socket(), path.is_char_device(), path.is_block_device())):
|
|
300
|
+
return
|
|
301
|
+
else:
|
|
302
|
+
return self.handle_unexpected(path, destination)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def take_snapshot(self) -> None:
|
|
306
|
+
self.blob_store_path.mkdir(parents=True, exist_ok=True)
|
|
307
|
+
self.snapshot_path.mkdir(parents=True, exist_ok=False)
|
|
308
|
+
self.print_persistent(f"Creating snapshot {self.snapshot_name}")
|
|
309
|
+
self.load_hash_tree_cache()
|
|
310
|
+
with ThreadPoolExecutor(max_workers=self.threads) as pool:
|
|
311
|
+
pool.map(self.snapshot_by_type, self.target_iter())
|
|
312
|
+
self.store_hash_tree_cache()
|
|
313
|
+
self.print_persistent()
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@staticmethod
|
|
317
|
+
def _copy_bytes(fsrc: BinaryIO,
|
|
318
|
+
fdest: BinaryIO,
|
|
319
|
+
callback: Callable,
|
|
320
|
+
total: int,
|
|
321
|
+
length: int) -> None:
|
|
322
|
+
copied = 0
|
|
323
|
+
while True:
|
|
324
|
+
buf = fsrc.read(length)
|
|
325
|
+
if not buf:
|
|
326
|
+
break
|
|
327
|
+
fdest.write(buf)
|
|
328
|
+
copied += len(buf)
|
|
329
|
+
callback(copied, total)
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def _copy_file_with_callback(self,
|
|
333
|
+
source_path: Path,
|
|
334
|
+
destination_path: Path,
|
|
335
|
+
callback: Callable,
|
|
336
|
+
callback_batch_size: int=65536) -> None:
|
|
337
|
+
size = os.stat(source_path).st_size
|
|
338
|
+
with open(source_path, "rb") as fsrc:
|
|
339
|
+
with open(destination_path, "wb") as fdest:
|
|
340
|
+
self._copy_bytes(fsrc,
|
|
341
|
+
fdest,
|
|
342
|
+
callback=callback,
|
|
343
|
+
total=size,
|
|
344
|
+
length=callback_batch_size)
|
|
345
|
+
shutil.copymode(str(source_path), str(destination_path))
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def make_progress_printer(self, label: str) -> Callable[[int,int], str]:
    """Build a closure rendering ``label`` plus a bar sized to the terminal.

    The returned callable maps ``(current, total)`` to an ANSI string
    that clears to end-of-display and returns the cursor to column 0,
    so successive writes redraw the bar in place.
    """
    template = f"{label} [{{filled}}{{empty}}]"
    # Width of the fixed chrome (label, space, brackets) around the bar.
    template_size = len(template.format(filled='', empty=''))
    template_ansi = f"\033[0J{template}\033[0G"

    def _progress_printer(current: int, total: int) -> str:
        # Clamp: a label wider than the terminal would otherwise give a
        # negative bar width and negative-length string repeats.
        bar_width = max(self._terminal_width - template_size, 0)
        if total <= 0:
            # Nothing to do counts as done; also avoids ZeroDivisionError.
            progress = bar_width
        else:
            # Clamp so current > total cannot overflow the bar.
            progress = min((current * bar_width) // total, bar_width)
        filled = '-' * progress
        empty = ' ' * (bar_width - progress)
        return template_ansi.format(filled=filled, empty=empty)

    return _progress_printer
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def copy_file_with_progress(self, source_path: Path, destination_path: Path) -> None:
    """Copy a file while displaying per-file progress.

    In verbose mode the bar is written straight to stderr with a
    carriage return; otherwise it goes through the per-thread status
    display. A persistent "Copied" line is printed on completion.
    """
    render = self.make_progress_printer(str(source_path))
    if self._verbose:
        emit = lambda line: self.printerr(line, end='\r')
    else:
        emit = self.update_thread_status

    def _callback(done: int, total: int) -> None:
        emit(render(done, total))

    # Seed the display at 0% before any bytes move.
    _callback(0, 1)
    self._copy_file_with_callback(source_path, destination_path, _callback)
    self.print_persistent(f"\r\033[0JCopied {source_path}")
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def garbage_collect_blob_cache(self, threads: int=1) -> None:
    """Delete blobs that no snapshot links to, and prune emptied directories.

    Walks the whole blob store, feeding every entry into a work queue
    consumed by ``threads`` worker threads. A blob file whose hard-link
    count is below 2 is referenced by no snapshot and is unlinked; a
    directory that becomes empty is removed. After deleting an entry,
    its parent is re-queued so directory pruning cascades upward.
    Progress is drawn while the producer enqueues entries.
    """
    blobs = self.blob_store_path.rglob('*')
    # Set once the producer has enqueued everything; workers exit only
    # after the queue has drained (they check it on a get() timeout).
    thread_stop_event = threading.Event()
    blob_queue = queue.SimpleQueue()
    blobs_checked = 0
    # Starts at 1 so the first progress render never divides by zero.
    blobs_total = 1
    _gc_progress = self.make_progress_printer("Garbage collecting")
    self.printerr(_gc_progress(0, 1), end='\r')

    def _check_blob_queue_thread_task() -> None:
        # Worker: pop entries until the queue is empty AND the producer
        # has signalled completion via thread_stop_event.
        # NOTE(review): blobs_checked/blobs_total are mutated from
        # several threads without a lock; `+=` on an int is not atomic,
        # so the counts can drift. They only feed the progress bar, so
        # this is presumably tolerated — confirm that's intentional.
        nonlocal blobs_checked, blobs_total
        while True:
            try:
                blob = blob_queue.get(timeout=1)
                if blob.is_file():
                    if blob.stat().st_nlink < 2:
                        # If blob isn't hardlinked to, delete it from storage
                        self.print_persistent(f"Garbage collecting {blob}")
                        blob.unlink()
                        # Parent may now be empty — re-queue it for pruning.
                        blob_queue.put(blob.parent)
                        blobs_total += 1
                elif blob.is_dir():
                    if not any(blob.iterdir()):
                        self.print_persistent(f"Garbage collecting {blob}/")
                        blob.rmdir()
                        # NOTE(review): parent re-queued here without a
                        # matching blobs_total += 1 (the file branch does
                        # increment) — progress total undercounts; verify.
                        blob_queue.put(blob.parent)
                blobs_checked += 1
            except queue.Empty:
                # Queue idle for a full second: stop only if the producer
                # is done, otherwise keep polling for more work.
                if thread_stop_event.is_set():
                    return

    pool = ThreadPoolExecutor(max_workers=threads)
    # Futures are held only to keep the workers referenced; their results
    # (always None) are never inspected.
    _futures = [pool.submit(_check_blob_queue_thread_task) for i in range(threads)]
    for blob in blobs:
        blobs_total += 1
        blob_queue.put(blob)
        self.printerr(_gc_progress(blobs_checked, blobs_total), end='\r')
    thread_stop_event.set()
    # Wait for workers to drain the queue and observe the stop event.
    pool.shutdown(wait=True)

    self.print_persistent()
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def print_thread_statuses_ephemeral(self) -> None:
    """Drop status lines left by finished threads and redraw the rest.

    Holds the print lock for the whole prune-and-redraw so concurrent
    writers cannot interleave with the ephemeral output.
    """
    with self._print_lock:
        live_idents = {t.ident for t in threading.enumerate()}
        stale = [tid for tid in self._thread_outputs if tid not in live_idents]
        for tid in stale:
            self._thread_outputs.pop(tid, None)
        self.print_ephemeral('\n'.join(self._thread_outputs.values()))
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def set_thread_status(self, text: str) -> None:
    """Record ``text`` as the status line for the calling thread."""
    ident = threading.get_ident()
    self._thread_outputs[ident] = text
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def _update_thread_status_ephemeral(self, text: str) -> None:
|
|
434
|
+
self.set_thread_status(text)
|
|
435
|
+
self.print_thread_statuses_ephemeral()
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _update_thread_status_verbose(self, text: str) -> None:
|
|
439
|
+
with self._print_lock:
|
|
440
|
+
print(text)
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
# Prefix drawn before the hash on the second status line (arrow glyph).
_hash_prefix = " ↳ "
_hash_prefix_size = len(_hash_prefix)
# Marker substituted in when a path or hash is cut to fit the terminal.
_truncate_str = "…"
_truncate_size = len(_truncate_str)
|
|
447
|
+
|
|
448
|
+
def _format_file_status_ephemeral(self, file_path: Path, file_hash: str) -> str:
|
|
449
|
+
file_path_str = str(file_path)
|
|
450
|
+
hash_len_delta = len(file_hash) + self._hash_prefix_size - self._terminal_width
|
|
451
|
+
if hash_len_delta > 0:
|
|
452
|
+
file_hash = f"{file_hash[:-(hash_len_delta+self._truncate_size)]}{self._truncate_str}"
|
|
453
|
+
if len(file_path_str)> self._terminal_width:
|
|
454
|
+
path_truncate_size = self._terminal_width - self._truncate_size
|
|
455
|
+
file_path_str = f"{self._truncate_str}{file_path_str[-path_truncate_size:]}"
|
|
456
|
+
return f"{file_path_str}\n{self._hash_prefix}{file_hash}"
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def _format_file_status_verbose(self, file_path: Path, file_hash: str) -> str:
|
|
460
|
+
return f"{file_path} -> {file_hash}"
|