dirark 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dirark-0.1.0/.github/workflows/test.yml +21 -0
- dirark-0.1.0/.github/workflows/twine_release.yml +17 -0
- dirark-0.1.0/.gitignore +119 -0
- dirark-0.1.0/LICENSE +7 -0
- dirark-0.1.0/PKG-INFO +22 -0
- dirark-0.1.0/README.md +5 -0
- dirark-0.1.0/coverage/report.md +14 -0
- dirark-0.1.0/dirark/__init__.py +17 -0
- dirark-0.1.0/dirark/__main__.py +124 -0
- dirark-0.1.0/dirark/core.py +118 -0
- dirark-0.1.0/dirark/reader.py +67 -0
- dirark-0.1.0/dirark/storage.py +143 -0
- dirark-0.1.0/dirark/sync.py +115 -0
- dirark-0.1.0/dirark/tests/test_core.py +204 -0
- dirark-0.1.0/dirark/tests/test_init.py +5 -0
- dirark-0.1.0/dirark/tests/test_reader.py +69 -0
- dirark-0.1.0/dirark/tests/test_storage.py +124 -0
- dirark-0.1.0/dirark/tests/test_sync.py +142 -0
- dirark-0.1.0/docs/api.md +68 -0
- dirark-0.1.0/docs/release_notes.md +3 -0
- dirark-0.1.0/pyproject.toml +22 -0
- dirark-0.1.0/uv.lock +7 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: Python Test
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
push:
|
|
8
|
+
branches:
|
|
9
|
+
- main
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
test:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: astral-sh/setup-uv@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: '3.12'
|
|
19
|
+
enable-cache: true
|
|
20
|
+
- run: uv tool install branthebuilder
|
|
21
|
+
- run: branb test
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
|
|
2
|
+
name: Python Package Release
|
|
3
|
+
|
|
4
|
+
on:
|
|
5
|
+
release:
|
|
6
|
+
types: [published]
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
release:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- uses: astral-sh/setup-uv@v5
|
|
14
|
+
- run: uv build && uv publish
|
|
15
|
+
env:
|
|
16
|
+
UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN }}
|
|
17
|
+
|
dirark-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
pip-wheel-metadata/
|
|
24
|
+
share/python-wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
MANIFEST
|
|
29
|
+
|
|
30
|
+
# PyInstaller
|
|
31
|
+
# Usually these files are written by a python script from a template
|
|
32
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
33
|
+
*.manifest
|
|
34
|
+
*.spec
|
|
35
|
+
|
|
36
|
+
# Installer logs
|
|
37
|
+
pip-log.txt
|
|
38
|
+
pip-delete-this-directory.txt
|
|
39
|
+
|
|
40
|
+
# Unit test / coverage reports
|
|
41
|
+
htmlcov/
|
|
42
|
+
.tox/
|
|
43
|
+
.nox/
|
|
44
|
+
.coverage
|
|
45
|
+
.coverage.*
|
|
46
|
+
.cache
|
|
47
|
+
nosetests.xml
|
|
48
|
+
coverage.xml
|
|
49
|
+
*.cover
|
|
50
|
+
*.py,cover
|
|
51
|
+
.hypothesis/
|
|
52
|
+
.pytest_cache/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Generated coverage HTML (report.md is committed, HTML is not)
|
|
72
|
+
coverage/html/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
target/
|
|
76
|
+
|
|
77
|
+
# pyenv
|
|
78
|
+
.python-version
|
|
79
|
+
|
|
80
|
+
# pipenv
|
|
81
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
82
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
83
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
84
|
+
# install all needed dependencies.
|
|
85
|
+
#Pipfile.lock
|
|
86
|
+
|
|
87
|
+
# celery beat schedule file
|
|
88
|
+
celerybeat-schedule
|
|
89
|
+
|
|
90
|
+
# SageMath parsed files
|
|
91
|
+
*.sage.py
|
|
92
|
+
|
|
93
|
+
# Environments
|
|
94
|
+
.env
|
|
95
|
+
.venv
|
|
96
|
+
env/
|
|
97
|
+
venv/
|
|
98
|
+
ENV/
|
|
99
|
+
env.bak/
|
|
100
|
+
venv.bak/
|
|
101
|
+
|
|
102
|
+
# Spyder project settings
|
|
103
|
+
.spyderproject
|
|
104
|
+
.spyproject
|
|
105
|
+
|
|
106
|
+
# Rope project settings
|
|
107
|
+
.ropeproject
|
|
108
|
+
|
|
109
|
+
# mypy
|
|
110
|
+
.mypy_cache/
|
|
111
|
+
.dmypy.json
|
|
112
|
+
dmypy.json
|
|
113
|
+
|
|
114
|
+
# Pyre type checker
|
|
115
|
+
.pyre/
|
|
116
|
+
|
|
117
|
+
.idea
|
|
118
|
+
.vscode
|
|
119
|
+
dask-worker-space/
|
dirark-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2026 Endre Márk Borza
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
dirark-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dirark
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Directory archival
|
|
5
|
+
Project-URL: Homepage, https://github.com/endremborza/dirark
|
|
6
|
+
Author-email: Endre Márk Borza <endremborza@gmail.com>
|
|
7
|
+
License: Copyright 2026 Endre Márk Borza
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# dirark
|
|
19
|
+
|
|
20
|
+
[](https://pypi.org/project/dirark/)
|
|
21
|
+
|
|
22
|
+
Directory archival
|
dirark-0.1.0/README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
| Name | Stmts | Miss | Cover |
|
|
2
|
+
|------------------------------ | -------: | -------: | ------: |
|
|
3
|
+
| dirark/\_\_init\_\_.py | 5 | 0 | 100% |
|
|
4
|
+
| dirark/\_\_main\_\_.py | 92 | 92 | 0% |
|
|
5
|
+
| dirark/core.py | 71 | 4 | 94% |
|
|
6
|
+
| dirark/reader.py | 35 | 0 | 100% |
|
|
7
|
+
| dirark/storage.py | 57 | 0 | 100% |
|
|
8
|
+
| dirark/sync.py | 55 | 1 | 98% |
|
|
9
|
+
| dirark/tests/test\_core.py | 164 | 0 | 100% |
|
|
10
|
+
| dirark/tests/test\_init.py | 3 | 0 | 100% |
|
|
11
|
+
| dirark/tests/test\_reader.py | 55 | 0 | 100% |
|
|
12
|
+
| dirark/tests/test\_storage.py | 88 | 0 | 100% |
|
|
13
|
+
| dirark/tests/test\_sync.py | 116 | 0 | 100% |
|
|
14
|
+
| **TOTAL** | **741** | **97** | **87%** |
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""dirark – directory archival and retrieval tool."""
|
|
2
|
+
|
|
3
|
+
from .core import archive_dir, restore_ark
|
|
4
|
+
from .reader import ArkReader
|
|
5
|
+
from .sync import add_dir_to_remote_ark, merge_arks, pull_ark, push_ark
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"archive_dir",
|
|
11
|
+
"restore_ark",
|
|
12
|
+
"ArkReader",
|
|
13
|
+
"push_ark",
|
|
14
|
+
"pull_ark",
|
|
15
|
+
"merge_arks",
|
|
16
|
+
"add_dir_to_remote_ark",
|
|
17
|
+
]
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""CLI entry point for dirark."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .core import archive_dir, restore_ark
|
|
8
|
+
from .reader import ArkReader
|
|
9
|
+
from .sync import add_dir_to_remote_ark, merge_arks, pull_ark, push_ark
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main() -> None:
    """Parse arguments and dispatch to the appropriate subcommand."""
    parser = argparse.ArgumentParser(
        description="Cold-storage directory archival tool."
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # Each entry: (command name, help text, positional args, handler).
    # A positional arg is (name, type); type None means plain string.
    command_specs = [
        (
            "archive",
            "Archive a directory.",
            [("source_dir", Path)],
            _archive,
        ),
        (
            "restore",
            "Restore files from an ark.",
            [("ark_dir", Path), ("dest_dir", Path)],
            _restore,
        ),
        (
            "push",
            "Push local ark to remote via rsync.",
            [("local_ark", Path), ("remote", None)],
            _push,
        ),
        (
            "pull",
            "Pull remote ark to local path via rsync.",
            [("remote", None), ("local_ark", Path)],
            _pull,
        ),
        (
            "merge",
            "Merge src_ark into dst_ark (local).",
            [("src_ark", Path), ("dst_ark", Path)],
            _merge,
        ),
        (
            "add",
            "Archive a directory and add it to a remote ark.",
            [("source_dir", Path), ("remote_ark", None)],
            _add,
        ),
        (
            "read",
            "Print a file from an ark to stdout.",
            [("ark_dir", Path), ("file_path", None)],
            _read,
        ),
    ]

    for name, help_text, positionals, handler in command_specs:
        cmd = sub.add_parser(name, help=help_text)
        for arg_name, arg_type in positionals:
            if arg_type is None:
                cmd.add_argument(arg_name)
            else:
                cmd.add_argument(arg_name, type=arg_type)
        cmd.set_defaults(func=handler)

    args = parser.parse_args()
    args.func(args)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _archive(args: argparse.Namespace) -> None:
    """Handle the ``archive`` subcommand.

    Prints an error to stderr and exits with status 1 if archiving fails.
    """
    try:
        archive_dir(args.source_dir)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        # Success message in `else` so a failure while printing is not
        # misreported as an archiving error (minimal try body).
        print(f"Archived '{args.source_dir}'.")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _restore(args: argparse.Namespace) -> None:
    """Handle the ``restore`` subcommand.

    Prints an error to stderr and exits with status 1 if restoring fails.
    """
    try:
        restore_ark(args.ark_dir, args.dest_dir)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        # Success message in `else` so a failure while printing is not
        # misreported as a restore error (minimal try body).
        print(f"Restored to '{args.dest_dir}'.")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _push(args: argparse.Namespace) -> None:
    """Handle the ``push`` subcommand.

    Prints an error to stderr and exits with status 1 if the push fails.
    """
    try:
        push_ark(args.local_ark, args.remote)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        # Success message in `else` so a failure while printing is not
        # misreported as a push error (minimal try body).
        print(f"Pushed '{args.local_ark}' to '{args.remote}'.")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _pull(args: argparse.Namespace) -> None:
    """Handle the ``pull`` subcommand.

    Prints an error to stderr and exits with status 1 if the pull fails.
    """
    try:
        pull_ark(args.remote, args.local_ark)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        # Success message in `else` so a failure while printing is not
        # misreported as a pull error (minimal try body).
        print(f"Pulled '{args.remote}' to '{args.local_ark}'.")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _merge(args: argparse.Namespace) -> None:
    """Handle the ``merge`` subcommand.

    Prints an error to stderr and exits with status 1 if the merge fails.
    """
    try:
        merge_arks(args.src_ark, args.dst_ark)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        # Success message in `else` so a failure while printing is not
        # misreported as a merge error (minimal try body).
        print(f"Merged '{args.src_ark}' into '{args.dst_ark}'.")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _add(args: argparse.Namespace) -> None:
    """Handle the ``add`` subcommand.

    Prints an error to stderr and exits with status 1 if adding fails.
    """
    try:
        add_dir_to_remote_ark(args.source_dir, args.remote_ark)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        # Success message in `else` so a failure while printing is not
        # misreported as an add error (minimal try body).
        print(f"Added '{args.source_dir}' to '{args.remote_ark}'.")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _read(args: argparse.Namespace) -> None:
    """Handle the ``read`` subcommand: write one archived file to stdout.

    Prints an error to stderr and exits with status 1 on any failure,
    including a missing path (ArkReader.read_file raises KeyError).
    """
    try:
        with ArkReader(args.ark_dir) as reader:
            sys.stdout.buffer.write(reader.read_file(args.file_path))
    except Exception as e:
        # The original separate `except KeyError` branch had a body identical
        # to the general handler, so a single handler is equivalent.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
if __name__ == "__main__":
|
|
124
|
+
main()
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Core archive and restore operations."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import tempfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .storage import (
|
|
8
|
+
ARK_DIR_EXT,
|
|
9
|
+
DB_NAME,
|
|
10
|
+
b2sum,
|
|
11
|
+
ensure_clean_outdir,
|
|
12
|
+
extract_tar_zst,
|
|
13
|
+
open_db,
|
|
14
|
+
write_objects_to_tar,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def archive_dir(src_dir: Path, ark_out: Path | None = None) -> None:
    """Archive src_dir into a content-addressed store.

    By default the archive is created at src_dir + ARK_DIR_EXT. Pass ark_out
    to write into an existing ark directory (useful for merging or remote push).

    Archiving is idempotent: re-running on the same directory is a no-op.
    Files with duplicate content are deduplicated by BLAKE2b checksum.

    Raises RuntimeError (from ensure_clean_outdir) if the output directory
    contains unexpected files.
    """
    if ark_out is None:
        ark_out = Path(f"{src_dir}{ARK_DIR_EXT}")
    # ensure_clean_outdir already creates the directory (parents included),
    # so no separate mkdir is needed here.
    ensure_clean_outdir(ark_out)
    db = open_db(ark_out / DB_NAME)
    try:
        cur = db.cursor()

        new_objects: dict[str, Path] = {}
        new_files: list[tuple[str, str]] = []

        for path in sorted(src_dir.rglob("*")):
            if not path.is_file():
                continue
            rel = path.relative_to(src_dir).as_posix()
            checksum = b2sum(path)

            # A path already recorded is left untouched (idempotent runs).
            cur.execute("SELECT 1 FROM files WHERE path=?", (rel,))
            if cur.fetchone():
                continue

            # Stage only content not yet stored in any tar (dedup by checksum).
            cur.execute("SELECT tar_name FROM objects WHERE checksum=?", (checksum,))
            if cur.fetchone() is None:
                new_objects.setdefault(checksum, path)

            new_files.append((rel, checksum))

        if not new_files:
            return

        if new_objects:
            tar_name = write_objects_to_tar(ark_out, new_objects)
            for checksum in new_objects:
                cur.execute("INSERT INTO objects VALUES (?, ?)", (checksum, tar_name))

        for rel, checksum in new_files:
            cur.execute("INSERT INTO files VALUES (?, ?)", (rel, checksum))

        db.commit()
    finally:
        # Always release the connection, even if hashing or tar writing
        # raises (the original leaked it on any exception).
        db.close()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def restore_ark(ark_dir: Path, dest_dir: Path) -> None:
    """Restore all files from an ark to dest_dir.

    dest_dir is only created if there are files to restore.
    Missing tars or objects produce warnings but do not abort the restore.
    """
    db = open_db(ark_dir / DB_NAME)
    cur = db.cursor()

    cur.execute("SELECT path, checksum FROM files")
    files = cur.fetchall()

    if not files:
        print("No files found to restore in the archive.")
        db.close()
        return

    dest_dir.mkdir(parents=True, exist_ok=True)
    # Map each stored checksum to the tar archive that contains it.
    cur.execute("SELECT checksum, tar_name FROM objects")
    checksum_to_tar = dict(cur.fetchall())
    db.close()

    # Group files by tar so each archive is extracted only once.
    by_tar: dict[Path, list[tuple[str, str]]] = {}
    for rel, checksum in files:
        tar_name = checksum_to_tar.get(checksum)
        if tar_name:
            by_tar.setdefault(ark_dir / tar_name, []).append((rel, checksum))
        else:
            print(f"Warning: checksum {checksum} for {rel} not in objects.")

    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        obj_dir = tmp_path / "objects"
        obj_dir.mkdir()

        for tar_path, tar_files in by_tar.items():
            if not tar_path.exists():
                print(f"Warning: {tar_path.name} not found.")
                continue

            # Tars carry an ./objects/<checksum> layout, so extracting into
            # the shared temp dir accumulates objects under obj_dir.
            extract_tar_zst(tar_path, tmp_path)

            for rel, checksum in tar_files:
                dest_file = dest_dir / rel
                dest_file.parent.mkdir(parents=True, exist_ok=True)
                src_obj = obj_dir / checksum
                if src_obj.exists():
                    shutil.copy2(src_obj, dest_file)
                else:
                    print(f"Warning: object {checksum} missing from {tar_path.name}.")
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""ArkReader: retrieve individual files from a dirark archive."""
|
|
2
|
+
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .storage import DB_NAME, extract_object_from_tar, open_db
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ArkReader:
    """Read-only access to individual files within a dirark archive.

    Supports context manager usage::

        with ArkReader(ark_dir) as reader:
            data = reader.read_file("path/to/file.txt")
    """

    def __init__(self, ark_dir: Path) -> None:
        """Open an ark for reading."""
        self._ark_dir = ark_dir
        self._db = open_db(ark_dir / DB_NAME)

    def __enter__(self) -> "ArkReader":
        return self

    def __exit__(self, *_: object) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying database connection."""
        self._db.close()

    def list_files(self) -> list[str]:
        """Return sorted list of all archived file paths."""
        rows = self._db.execute("SELECT path FROM files ORDER BY path").fetchall()
        return [stored_path for (stored_path,) in rows]

    def get_checksum(self, path: str) -> str:
        """Return the BLAKE2b checksum of an archived file by relative path."""
        row = self._db.execute(
            "SELECT checksum FROM files WHERE path=?", (path,)
        ).fetchone()
        if row is None:
            raise KeyError(f"File not found in ark: {path}")
        (checksum,) = row
        return checksum

    def read_file(self, path: str) -> bytes:
        """Read and return the raw bytes of an archived file.

        Raises KeyError if path is not in the ark.
        """
        query = (
            "SELECT f.checksum, o.tar_name "
            "FROM files f JOIN objects o ON f.checksum = o.checksum "
            "WHERE f.path=?"
        )
        row = self._db.execute(query, (path,)).fetchone()
        if row is None:
            raise KeyError(f"File not found in ark: {path}")
        checksum, tar_name = row
        with tempfile.TemporaryDirectory() as tmp:
            scratch = Path(tmp)
            extract_object_from_tar(self._ark_dir / tar_name, checksum, scratch)
            return (scratch / "objects" / checksum).read_bytes()
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Low-level storage primitives: checksums, tar I/O, and database access."""
|
|
2
|
+
|
|
3
|
+
import hashlib
import shutil
import sqlite3
import subprocess
import tempfile
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Rotate to a new tar archive once the current one reaches this size.
MAX_TAR_MB = 256
# SQLite index stored at the root of every ark directory.
DB_NAME = "index.sqlite"
# Separator between the tar prefix and its numeric index.
SEP = "-"
# Tar archives are named data-NNNNN.tar.zst.
TAR_PREFIX = "data" + SEP
TAR_EXT = ".tar.zst"
# Suffix appended to a source directory name to form its default ark path.
ARK_DIR_EXT = ".ark.d"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def b2sum(path: Path) -> str:
    """Compute the BLAKE2b checksum of a file.

    Uses hashlib.blake2b (512-bit digest), which yields the same hex digest
    as the system ``b2sum`` utility's default mode, but without requiring
    the external binary or spawning a subprocess per file.

    Reads the file in chunks so large files are never loaded fully into
    memory.
    """
    digest = hashlib.blake2b()
    with path.open("rb") as f:
        # 1 MiB chunks: large enough for throughput, small enough for memory.
        while chunk := f.read(1 << 20):
            digest.update(chunk)
    return digest.hexdigest()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def open_db(path: Path) -> sqlite3.Connection:
    """Open or create an ark SQLite database, ensuring the schema exists.

    Tables:
        files(path TEXT PK, checksum TEXT)       -- relative path → checksum
        objects(checksum TEXT PK, tar_name TEXT) -- checksum → archive name
    """
    connection = sqlite3.connect(path)
    ddl_statements = (
        "CREATE TABLE IF NOT EXISTS files("
        "path TEXT PRIMARY KEY, checksum TEXT NOT NULL)",
        "CREATE TABLE IF NOT EXISTS objects("
        "checksum TEXT PRIMARY KEY, tar_name TEXT NOT NULL)",
    )
    for ddl in ddl_statements:
        connection.execute(ddl)
    return connection
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def list_tars(ark_dir: Path) -> list[Path]:
    """Return sorted list of tar archives in an ark directory."""
    matches = ark_dir.glob(f"{TAR_PREFIX}*{TAR_EXT}")
    return sorted(matches)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def tar_size_mb(path: Path) -> float:
    """Return file size in megabytes."""
    size_bytes = path.stat().st_size
    return size_bytes / 2**20
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def next_tar_path(ark_dir: Path, min_idx: int = 0) -> Path:
    """Return path for the next tar archive in ark_dir.

    Index is max(last_existing + 1, min_idx), guaranteeing no name collision.
    """
    existing = list_tars(ark_dir)
    if existing:
        stem = existing[-1].name.removesuffix(TAR_EXT)
        last_idx = int(stem.split(SEP)[-1])
    else:
        last_idx = 0
    idx = max(last_idx + 1, min_idx)
    return ark_dir / f"{TAR_PREFIX}{idx:05d}{TAR_EXT}"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def extract_tar_zst(src: Path, dest: Path) -> None:
    """Extract a zstd-compressed tar archive into dest."""
    cmd = ["tar", "-I", "zstd", "-xf", str(src), "-C", str(dest)]
    subprocess.run(cmd, check=True)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def extract_object_from_tar(tar: Path, checksum: str, dest: Path) -> None:
    """Extract a single object by checksum from a tar archive into dest."""
    member = f"./objects/{checksum}"
    cmd = ["tar", "-I", "zstd", "-xf", str(tar), "-C", str(dest), member]
    subprocess.run(cmd, check=True)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def create_tar_zst(src_dir: Path, out: Path) -> None:
    """Create a zstd-compressed tar of src_dir contents at out."""
    cmd = ["tar", "-C", str(src_dir), "-I", "zstd", "-cf", str(out), "."]
    subprocess.run(cmd, check=True)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def ensure_clean_outdir(ark_dir: Path) -> None:
    """Create ark_dir if needed and raise RuntimeError on unexpected files."""
    ark_dir.mkdir(parents=True, exist_ok=True)
    # Only the index database and existing data tars may be present.
    tar_names = {p.name for p in ark_dir.glob(f"{TAR_PREFIX}*{TAR_EXT}")}
    allowed = tar_names | {DB_NAME}
    for entry in ark_dir.iterdir():
        if entry.name in allowed:
            continue
        raise RuntimeError(f"Unexpected file in archive dir: {entry}")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def write_objects_to_tar(
    ark_dir: Path,
    objects: dict[str, Path],
    min_tar_idx: int = 0,
) -> str:
    """Append objects (checksum → source path) to a tar archive in ark_dir.

    Reuses the last tar if it is under MAX_TAR_MB, otherwise creates a new
    one. min_tar_idx can be used to avoid index collisions when merging arks.

    Returns the tar filename written to.
    """
    tars = list_tars(ark_dir)
    # Keep appending to the newest tar until it crosses the size cap.
    if tars and tar_size_mb(tars[-1]) < MAX_TAR_MB:
        tar_path = tars[-1]
    else:
        tar_path = next_tar_path(ark_dir, min_idx=min_tar_idx)

    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        obj_dir = tmp_dir / "objects"
        obj_dir.mkdir(parents=True)

        # "Appending" means: unpack the existing tar, add the new objects,
        # then repack everything (the zst tar is not appended in place).
        if tar_path.exists():
            extract_tar_zst(tar_path, tmp_dir)

        for checksum, src in objects.items():
            dest = obj_dir / checksum
            # Content-addressed: an object already present is identical,
            # so skip the copy.
            if not dest.exists():
                shutil.copy2(src, dest)

        create_tar_zst(tmp_dir, tar_path)

    return tar_path.name
|