dirark 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dirark/__init__.py +17 -0
- dirark/__main__.py +124 -0
- dirark/core.py +118 -0
- dirark/reader.py +67 -0
- dirark/storage.py +143 -0
- dirark/sync.py +115 -0
- dirark/tests/test_core.py +204 -0
- dirark/tests/test_init.py +5 -0
- dirark/tests/test_reader.py +69 -0
- dirark/tests/test_storage.py +124 -0
- dirark/tests/test_sync.py +142 -0
- dirark-0.1.0.dist-info/METADATA +22 -0
- dirark-0.1.0.dist-info/RECORD +15 -0
- dirark-0.1.0.dist-info/WHEEL +4 -0
- dirark-0.1.0.dist-info/licenses/LICENSE +7 -0
dirark/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""dirark – directory archival and retrieval tool."""
|
|
2
|
+
|
|
3
|
+
from .core import archive_dir, restore_ark
|
|
4
|
+
from .reader import ArkReader
|
|
5
|
+
from .sync import add_dir_to_remote_ark, merge_arks, pull_ark, push_ark
|
|
6
|
+
|
|
7
|
+
# Package version string.
__version__ = "0.1.0"

# Names exported by ``from dirark import *``; each one is imported above
# from the core, reader or sync submodule.
__all__ = [
    "archive_dir",
    "restore_ark",
    "ArkReader",
    "push_ark",
    "pull_ark",
    "merge_arks",
    "add_dir_to_remote_ark",
]
|
dirark/__main__.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""CLI entry point for dirark."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .core import archive_dir, restore_ark
|
|
8
|
+
from .reader import ArkReader
|
|
9
|
+
from .sync import add_dir_to_remote_ark, merge_arks, pull_ark, push_ark
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main() -> None:
    """Build the command-line parser and run the chosen subcommand.

    Every subcommand is declared once in a table: its name, its help text,
    its positional arguments in order (with an optional converter) and the
    handler that receives the parsed namespace.
    """
    # (name, help text, ((argument, converter or None), ...), handler)
    commands = (
        ("archive", "Archive a directory.", (("source_dir", Path),), _archive),
        (
            "restore",
            "Restore files from an ark.",
            (("ark_dir", Path), ("dest_dir", Path)),
            _restore,
        ),
        (
            "push",
            "Push local ark to remote via rsync.",
            (("local_ark", Path), ("remote", None)),
            _push,
        ),
        (
            "pull",
            "Pull remote ark to local path via rsync.",
            (("remote", None), ("local_ark", Path)),
            _pull,
        ),
        (
            "merge",
            "Merge src_ark into dst_ark (local).",
            (("src_ark", Path), ("dst_ark", Path)),
            _merge,
        ),
        (
            "add",
            "Archive a directory and add it to a remote ark.",
            (("source_dir", Path), ("remote_ark", None)),
            _add,
        ),
        (
            "read",
            "Print a file from an ark to stdout.",
            (("ark_dir", Path), ("file_path", None)),
            _read,
        ),
    )

    cli = argparse.ArgumentParser(
        description="Cold-storage directory archival tool."
    )
    subparsers = cli.add_subparsers(dest="command", required=True)

    for name, help_text, positionals, handler in commands:
        command = subparsers.add_parser(name, help=help_text)
        for arg_name, converter in positionals:
            # Arguments without a converter stay plain strings.
            options = {} if converter is None else {"type": converter}
            command.add_argument(arg_name, **options)
        command.set_defaults(func=handler)

    namespace = cli.parse_args()
    namespace.func(namespace)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _archive(args: argparse.Namespace) -> None:
    """Run the ``archive`` subcommand on ``args.source_dir``.

    Any exception is reported on stderr and ends the process with status 1.
    """
    try:
        source = args.source_dir
        archive_dir(source)
        print(f"Archived '{source}'.")
    except Exception as failure:
        print(f"Error: {failure}", file=sys.stderr)
        raise SystemExit(1)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _restore(args: argparse.Namespace) -> None:
    """Run the ``restore`` subcommand: ``args.ark_dir`` into ``args.dest_dir``.

    Any exception is reported on stderr and ends the process with status 1.
    """
    try:
        destination = args.dest_dir
        restore_ark(args.ark_dir, destination)
        print(f"Restored to '{destination}'.")
    except Exception as failure:
        print(f"Error: {failure}", file=sys.stderr)
        raise SystemExit(1)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _push(args: argparse.Namespace) -> None:
    """Run the ``push`` subcommand: ``args.local_ark`` to ``args.remote``.

    Any exception is reported on stderr and ends the process with status 1.
    """
    try:
        local, remote = args.local_ark, args.remote
        push_ark(local, remote)
        print(f"Pushed '{local}' to '{remote}'.")
    except Exception as failure:
        print(f"Error: {failure}", file=sys.stderr)
        raise SystemExit(1)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _pull(args: argparse.Namespace) -> None:
    """Run the ``pull`` subcommand: ``args.remote`` into ``args.local_ark``.

    Any exception is reported on stderr and ends the process with status 1.
    """
    try:
        remote, local = args.remote, args.local_ark
        pull_ark(remote, local)
        print(f"Pulled '{remote}' to '{local}'.")
    except Exception as failure:
        print(f"Error: {failure}", file=sys.stderr)
        raise SystemExit(1)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _merge(args: argparse.Namespace) -> None:
    """Run the ``merge`` subcommand: ``args.src_ark`` into ``args.dst_ark``.

    Any exception is reported on stderr and ends the process with status 1.
    """
    try:
        source, target = args.src_ark, args.dst_ark
        merge_arks(source, target)
        print(f"Merged '{source}' into '{target}'.")
    except Exception as failure:
        print(f"Error: {failure}", file=sys.stderr)
        raise SystemExit(1)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _add(args: argparse.Namespace) -> None:
    """Run the ``add`` subcommand: ``args.source_dir`` into ``args.remote_ark``.

    Any exception is reported on stderr and ends the process with status 1.
    """
    try:
        source, remote = args.source_dir, args.remote_ark
        add_dir_to_remote_ark(source, remote)
        print(f"Added '{source}' to '{remote}'.")
    except Exception as failure:
        print(f"Error: {failure}", file=sys.stderr)
        raise SystemExit(1)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _read(args: argparse.Namespace) -> None:
    """Run the ``read`` subcommand: write one archived file to stdout.

    The raw bytes of ``args.file_path`` inside ``args.ark_dir`` are written
    to the binary stdout stream. Any failure is reported on stderr and ends
    the process with status 1.
    """
    try:
        with ArkReader(args.ark_dir) as reader:
            sys.stdout.buffer.write(reader.read_file(args.file_path))
    except KeyError as e:
        # str(KeyError) is the repr of its argument, which would wrap the
        # message in quotes; print the message itself instead.
        message = e.args[0] if e.args else e
        print(f"Error: {message}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# Run the CLI only when the module is executed as a script
# (e.g. ``python -m dirark``), not when it is imported.
if __name__ == "__main__":
    main()
|
dirark/core.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Core archive and restore operations."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import tempfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .storage import (
|
|
8
|
+
ARK_DIR_EXT,
|
|
9
|
+
DB_NAME,
|
|
10
|
+
b2sum,
|
|
11
|
+
ensure_clean_outdir,
|
|
12
|
+
extract_tar_zst,
|
|
13
|
+
open_db,
|
|
14
|
+
write_objects_to_tar,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def archive_dir(src_dir: Path, ark_out: Path | None = None) -> None:
    """Archive src_dir into a content-addressed store.

    By default the archive is created at src_dir + ARK_DIR_EXT. Pass ark_out
    to write into an existing ark directory (useful for merging or remote push).

    Archiving is idempotent: re-running on the same directory is a no-op.
    A path that is already recorded in the ark is skipped without being
    re-read, so later changes to its content are not picked up.
    Files with duplicate content are deduplicated by BLAKE2b checksum.

    Raises:
        RuntimeError: if the ark directory contains unexpected files
            (raised by ensure_clean_outdir).
    """
    if ark_out is None:
        ark_out = Path(f"{src_dir}{ARK_DIR_EXT}")
    ark_out.mkdir(exist_ok=True, parents=True)
    ensure_clean_outdir(ark_out)

    # If the ark lives inside the source tree (e.g. ``archive .``), its own
    # index and tars must not be archived into themselves.
    try:
        ark_rel = ark_out.resolve().relative_to(src_dir.resolve())
    except ValueError:
        ark_rel = None  # ark is outside src_dir: nothing to exclude

    db = open_db(ark_out / DB_NAME)
    try:
        cur = db.cursor()

        new_objects: dict[str, Path] = {}
        new_files: list[tuple[str, str]] = []

        for path in sorted(src_dir.rglob("*")):
            if not path.is_file():
                continue
            rel_path = path.relative_to(src_dir)
            if ark_rel is not None and (
                rel_path == ark_rel or ark_rel in rel_path.parents
            ):
                continue
            rel = rel_path.as_posix()

            # Check the index first so already-archived files are never
            # re-read just to have their checksum thrown away.
            cur.execute("SELECT 1 FROM files WHERE path=?", (rel,))
            if cur.fetchone():
                continue

            checksum = b2sum(path)
            cur.execute("SELECT tar_name FROM objects WHERE checksum=?", (checksum,))
            if cur.fetchone() is None:
                # First path seen for a given checksum supplies the bytes.
                new_objects.setdefault(checksum, path)

            new_files.append((rel, checksum))

        if not new_files:
            return

        if new_objects:
            tar_name = write_objects_to_tar(ark_out, new_objects)
            for checksum in new_objects:
                cur.execute("INSERT INTO objects VALUES (?, ?)", (checksum, tar_name))

        for rel, checksum in new_files:
            cur.execute("INSERT INTO files VALUES (?, ?)", (rel, checksum))

        db.commit()
    finally:
        # Always release the connection, including on errors.
        db.close()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def restore_ark(ark_dir: Path, dest_dir: Path) -> None:
    """Restore all files from an ark to dest_dir.

    dest_dir is only created if there are files to restore.
    Missing tars or objects produce warnings but do not abort the restore.
    Index entries whose path is absolute or contains ``..`` are skipped with
    a warning so a tampered ark cannot write outside dest_dir.
    """
    db = open_db(ark_dir / DB_NAME)
    try:
        cur = db.cursor()

        cur.execute("SELECT path, checksum FROM files")
        files = cur.fetchall()

        if not files:
            print("No files found to restore in the archive.")
            return

        cur.execute("SELECT checksum, tar_name FROM objects")
        checksum_to_tar = dict(cur.fetchall())
    finally:
        # The index is only needed up to here; close it even on errors.
        db.close()

    dest_dir.mkdir(parents=True, exist_ok=True)

    # Group files by the tar holding their object so each tar is extracted once.
    by_tar: dict[Path, list[tuple[str, str]]] = {}
    for rel, checksum in files:
        rel_path = Path(rel)
        if rel_path.is_absolute() or ".." in rel_path.parts:
            print(f"Warning: skipping unsafe path {rel}.")
            continue
        tar_name = checksum_to_tar.get(checksum)
        if tar_name:
            by_tar.setdefault(ark_dir / tar_name, []).append((rel, checksum))
        else:
            print(f"Warning: checksum {checksum} for {rel} not in objects.")

    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        obj_dir = tmp_path / "objects"
        obj_dir.mkdir()

        for tar_path, tar_files in by_tar.items():
            if not tar_path.exists():
                print(f"Warning: {tar_path.name} not found.")
                continue

            extract_tar_zst(tar_path, tmp_path)

            for rel, checksum in tar_files:
                dest_file = dest_dir / rel
                dest_file.parent.mkdir(parents=True, exist_ok=True)
                src_obj = obj_dir / checksum
                if src_obj.exists():
                    shutil.copy2(src_obj, dest_file)
                else:
                    print(f"Warning: object {checksum} missing from {tar_path.name}.")
|
dirark/reader.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""ArkReader: retrieve individual files from a dirark archive."""
|
|
2
|
+
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .storage import DB_NAME, extract_object_from_tar, open_db
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ArkReader:
    """Read-only access to individual files within a dirark archive.

    Supports context manager usage::

        with ArkReader(ark_dir) as reader:
            data = reader.read_file("path/to/file.txt")
    """

    def __init__(self, ark_dir: Path) -> None:
        """Open an ark for reading.

        Raises:
            FileNotFoundError: if ark_dir has no index database. Without
                this check, opening the database would create an empty
                index in a directory that is not an ark.
        """
        db_path = ark_dir / DB_NAME
        if not db_path.is_file():
            raise FileNotFoundError(
                f"Not a dirark archive (missing {DB_NAME}): {ark_dir}"
            )
        self._ark_dir = ark_dir
        self._db = open_db(db_path)

    def __enter__(self) -> "ArkReader":
        return self

    def __exit__(self, *_: object) -> None:
        self.close()

    def list_files(self) -> list[str]:
        """Return sorted list of all archived file paths."""
        cur = self._db.cursor()
        cur.execute("SELECT path FROM files ORDER BY path")
        return [row[0] for row in cur.fetchall()]

    def get_checksum(self, path: str) -> str:
        """Return the BLAKE2b checksum of an archived file by relative path.

        Raises KeyError if path is not in the ark.
        """
        cur = self._db.cursor()
        cur.execute("SELECT checksum FROM files WHERE path=?", (path,))
        row = cur.fetchone()
        if row is None:
            raise KeyError(f"File not found in ark: {path}")
        return row[0]

    def read_file(self, path: str) -> bytes:
        """Read and return the raw bytes of an archived file.

        Raises KeyError if path is not in the ark, or if its checksum has
        no entry in the objects table (the lookup joins both tables).
        """
        cur = self._db.cursor()
        cur.execute(
            "SELECT f.checksum, o.tar_name "
            "FROM files f JOIN objects o ON f.checksum = o.checksum "
            "WHERE f.path=?",
            (path,),
        )
        row = cur.fetchone()
        if row is None:
            raise KeyError(f"File not found in ark: {path}")
        checksum, tar_name = row
        # Extract only the requested object into a throwaway directory.
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            extract_object_from_tar(self._ark_dir / tar_name, checksum, tmp_path)
            return (tmp_path / "objects" / checksum).read_bytes()

    def close(self) -> None:
        """Close the underlying database connection."""
        self._db.close()
|
dirark/storage.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Low-level storage primitives: checksums, tar I/O, and database access."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import sqlite3
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Size threshold, in MiB, compared against the newest tar's size.
MAX_TAR_MB = 256
# File name of the SQLite index inside an ark directory.
DB_NAME = "index.sqlite"
# Separator between the tar name prefix and its numeric index.
SEP = "-"
TAR_PREFIX = f"data{SEP}"
TAR_EXT = ".tar.zst"
# Suffix appended to a source directory name to form its default ark path.
ARK_DIR_EXT = ".ark.d"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def b2sum(path: Path) -> str:
    """Compute the BLAKE2b-512 checksum of a file as a hex string.

    The digest is computed in-process with hashlib and equals the hash the
    ``b2sum`` utility prints by default. Shelling out to ``b2sum`` was
    fragile: for file names containing a backslash or newline the tool
    prefixes its output line with ``\\``, which ended up inside the
    returned checksum, and a path starting with ``-`` was parsed as an
    option.

    Args:
        path: File to hash; it is read in binary mode.

    Returns:
        The 128-character lowercase hexadecimal digest.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    import hashlib  # local import: not among this module's top-level imports

    digest = hashlib.blake2b()  # default digest size is 64 bytes (512 bits)
    with open(path, "rb") as stream:
        # Hash in 1 MiB chunks so large files are never held in memory.
        while chunk := stream.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def open_db(path: Path) -> sqlite3.Connection:
    """Connect to the ark index at path, creating its tables when absent.

    Schema:
        files(path TEXT PK, checksum TEXT)        -- relative path → checksum
        objects(checksum TEXT PK, tar_name TEXT)  -- checksum → archive name

    Returns the open connection; the caller is responsible for closing it.
    """
    schema = (
        "CREATE TABLE IF NOT EXISTS files("
        "path TEXT PRIMARY KEY, checksum TEXT NOT NULL)",
        "CREATE TABLE IF NOT EXISTS objects("
        "checksum TEXT PRIMARY KEY, tar_name TEXT NOT NULL)",
    )
    connection = sqlite3.connect(path)
    for statement in schema:
        connection.execute(statement)
    return connection
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def list_tars(ark_dir: Path) -> list[Path]:
    """List the tar archives directly inside ark_dir, in sorted path order."""
    pattern = f"{TAR_PREFIX}*{TAR_EXT}"
    tar_paths = list(ark_dir.glob(pattern))
    tar_paths.sort()
    return tar_paths
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def tar_size_mb(path: Path) -> float:
    """Return the size of the file at path in units of 1024 * 1024 bytes."""
    bytes_per_mb = 1024 * 1024
    size_in_bytes = path.stat().st_size
    return size_in_bytes / bytes_per_mb
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def next_tar_path(ark_dir: Path, min_idx: int = 0) -> Path:
    """Return path for the next tar archive in ark_dir.

    Index is max(highest_existing + 1, min_idx), guaranteeing no name
    collision. The highest index is the numeric maximum over all existing
    tars rather than the index of the lexicographically last name, so
    indices wider than five digits are ordered correctly. Tars whose
    suffix is not a number (e.g. ``data-backup.tar.zst``) are ignored
    instead of raising ValueError.

    Args:
        ark_dir: Ark directory to inspect.
        min_idx: Lower bound for the returned index.

    Returns:
        ark_dir / "<TAR_PREFIX><index, zero-padded to 5 digits><TAR_EXT>".
    """
    indices: list[int] = []
    for tar in list_tars(ark_dir):
        suffix = tar.name.removesuffix(TAR_EXT).split(SEP)[-1]
        try:
            indices.append(int(suffix))
        except ValueError:
            # Matches the tar glob but does not follow the numbering scheme.
            continue
    last_idx = max(indices, default=0)
    idx = max(last_idx + 1, min_idx)
    return ark_dir / f"{TAR_PREFIX}{idx:05d}{TAR_EXT}"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def extract_tar_zst(src: Path, dest: Path) -> None:
    """Unpack the zstd-compressed tar at src into the directory dest.

    Runs the system ``tar`` with ``zstd`` as the compression program and
    raises subprocess.CalledProcessError on a non-zero exit status.
    """
    command = ["tar", "-I", "zstd", "-xf", str(src), "-C", str(dest)]
    subprocess.run(command, check=True)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def extract_object_from_tar(tar: Path, checksum: str, dest: Path) -> None:
    """Unpack only the member ``./objects/<checksum>`` of tar into dest.

    Runs the system ``tar`` with ``zstd`` as the compression program and
    raises subprocess.CalledProcessError on a non-zero exit status.
    """
    member = f"./objects/{checksum}"
    command = ["tar", "-I", "zstd", "-xf", str(tar), "-C", str(dest), member]
    subprocess.run(command, check=True)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def create_tar_zst(src_dir: Path, out: Path) -> None:
    """Pack the contents of src_dir into a zstd-compressed tar written to out.

    Members are stored relative to src_dir (the archive root is ``.``).
    Raises subprocess.CalledProcessError on a non-zero exit status.
    """
    command = ["tar", "-C", str(src_dir), "-I", "zstd", "-cf", str(out), "."]
    subprocess.run(command, check=True)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def ensure_clean_outdir(ark_dir: Path) -> None:
    """Make sure ark_dir exists and holds nothing but ark files.

    Only the index database and files matching the tar naming pattern are
    accepted; the first other entry found raises RuntimeError.
    """
    ark_dir.mkdir(parents=True, exist_ok=True)
    tar_names = {tar.name for tar in ark_dir.glob(f"{TAR_PREFIX}*{TAR_EXT}")}
    for entry in ark_dir.iterdir():
        name = entry.name
        if name == DB_NAME or name in tar_names:
            continue
        raise RuntimeError(f"Unexpected file in archive dir: {entry}")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def write_objects_to_tar(
    ark_dir: Path,
    objects: dict[str, Path],
    min_tar_idx: int = 0,
) -> str:
    """Append objects (checksum → source path) to a tar archive in ark_dir.

    Reuses the last tar if it is under MAX_TAR_MB, otherwise creates a new
    one. min_tar_idx can be used to avoid index collisions when merging arks;
    it only applies when a new tar is created.

    The tar is built under a temporary sibling name and then renamed over
    the target, so a failed or interrupted run never damages a tar that
    already holds archived objects.

    Returns the tar filename written to.

    Raises:
        subprocess.CalledProcessError: if tar extraction or creation fails.
    """
    tars = list_tars(ark_dir)
    if tars and tar_size_mb(tars[-1]) < MAX_TAR_MB:
        tar_path = tars[-1]
    else:
        tar_path = next_tar_path(ark_dir, min_idx=min_tar_idx)

    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        obj_dir = tmp_dir / "objects"
        obj_dir.mkdir(parents=True)

        # Appending means: unpack the existing members, add the new ones,
        # then repack everything.
        if tar_path.exists():
            extract_tar_zst(tar_path, tmp_dir)

        for checksum, src in objects.items():
            dest = obj_dir / checksum
            if not dest.exists():
                shutil.copy2(src, dest)

        # Same directory as the target so the rename stays on one
        # filesystem. The leading dot keeps the name out of the tar glob.
        partial = tar_path.with_name(f".{tar_path.name}.partial")
        try:
            create_tar_zst(tmp_dir, partial)
            partial.replace(tar_path)
        finally:
            # No-op after a successful rename; removes debris on failure.
            partial.unlink(missing_ok=True)

    return tar_path.name
|
dirark/sync.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Sync operations between local and remote dirark archives."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
import tempfile
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .core import archive_dir
|
|
9
|
+
from .storage import (
|
|
10
|
+
DB_NAME,
|
|
11
|
+
extract_tar_zst,
|
|
12
|
+
open_db,
|
|
13
|
+
write_objects_to_tar,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def push_ark(local_ark: Path, remote: str) -> None:
    """Copy the contents of a local ark to remote with rsync.

    rsync runs with checksum comparison (``-c``) rather than relying on
    mtime and size alone, so changes made within the same second are still
    transferred. remote may be a local path string or an SSH target
    (user@host:/path). Raises subprocess.CalledProcessError if rsync fails.
    """
    # Trailing slash: send the directory's contents, not the directory itself.
    source = f"{local_ark}/"
    command = ["rsync", "-avzc", source, remote]
    subprocess.run(command, check=True)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def pull_ark(remote: str, local_ark: Path) -> None:
    """Pull a remote ark to a local path via rsync.

    Uses checksum comparison (``-c``), matching push_ark: when local_ark
    already holds an older copy, a file changed on the remote within the
    same second and with the same size would otherwise be skipped.

    remote may be a local path string or an SSH target (user@host:/path).
    local_ark is created if it does not exist.

    Raises:
        subprocess.CalledProcessError: if rsync exits with a non-zero status.
    """
    local_ark.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        # Trailing slash: copy the remote directory's contents into local_ark.
        ["rsync", "-avzc", f"{remote}/", str(local_ark)],
        check=True,
    )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def merge_arks(src_ark: Path, dst_ark: Path) -> None:
    """Merge all objects and file mappings from src_ark into dst_ark.

    Objects already present in dst by checksum are skipped (deduplication).
    File path mappings are added with INSERT OR IGNORE, so existing paths
    in dst take precedence.

    A source tar or object that cannot be found produces a warning and is
    skipped; the merge continues. Both databases are closed even when the
    merge fails, and nothing is committed to dst in that case.
    """
    dst_ark.mkdir(parents=True, exist_ok=True)
    src_db = open_db(src_ark / DB_NAME)
    try:
        dst_db = open_db(dst_ark / DB_NAME)
        try:
            _merge_into(src_ark, dst_ark, src_db, dst_db)
            dst_db.commit()
        finally:
            dst_db.close()
    finally:
        src_db.close()


def _merge_into(src_ark: Path, dst_ark: Path, src_db, dst_db) -> None:
    """Copy missing objects and all file mappings from src to dst (no commit).

    src_db and dst_db are the open index connections of the two arks.
    """
    src_cur, dst_cur = src_db.cursor(), dst_db.cursor()

    src_cur.execute("SELECT checksum, tar_name FROM objects")
    src_objects = dict(src_cur.fetchall())

    dst_cur.execute("SELECT checksum FROM objects")
    dst_known = {row[0] for row in dst_cur.fetchall()}

    missing = {cs: tar for cs, tar in src_objects.items() if cs not in dst_known}

    if missing:
        # Group by source tar so each tar is extracted only once.
        by_tar: dict[str, list[str]] = {}
        for cs, tar_name in missing.items():
            by_tar.setdefault(tar_name, []).append(cs)

        with tempfile.TemporaryDirectory() as tmp:
            staging_obj = Path(tmp) / "objects"
            staging_obj.mkdir()

            for tar_name, checksums in by_tar.items():
                src_tar = src_ark / tar_name
                if not src_tar.exists():
                    print(f"Warning: {tar_name} not found.")
                    continue
                with tempfile.TemporaryDirectory() as xtmp:
                    extract_tar_zst(src_tar, Path(xtmp))
                    for cs in checksums:
                        obj = Path(xtmp) / "objects" / cs
                        if obj.exists():
                            shutil.copy2(obj, staging_obj / cs)
                        else:
                            print(f"Warning: object {cs} missing from {tar_name}.")

            staged = {
                cs: staging_obj / cs for cs in missing if (staging_obj / cs).exists()
            }
            if staged:
                # Must run while the staging directory still exists.
                tar_name = write_objects_to_tar(dst_ark, staged)
                for cs in staged:
                    dst_cur.execute(
                        "INSERT OR IGNORE INTO objects VALUES (?, ?)",
                        (cs, tar_name),
                    )

    src_cur.execute("SELECT path, checksum FROM files")
    for rel, checksum in src_cur.fetchall():
        dst_cur.execute("INSERT OR IGNORE INTO files VALUES (?, ?)", (rel, checksum))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def add_dir_to_remote_ark(src_dir: Path, remote_ark: str) -> None:
    """Add the files of src_dir to an ark that lives at a remote location.

    The remote ark is pulled into a temporary working copy, src_dir is
    archived into that copy, and the result is pushed back. remote_ark may
    be an SSH target (user@host:/path), since both transfers go through
    rsync.

    The remote ark must already exist, at minimum as an empty directory.
    For a first push, use archive_dir followed by push_ark instead.
    """
    with tempfile.TemporaryDirectory() as scratch:
        working_copy = Path(scratch).joinpath("remote_ark")
        pull_ark(remote_ark, working_copy)
        archive_dir(src_dir, ark_out=working_copy)
        push_ark(working_copy, remote_ark)
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Tests for dirark.core: archive_dir and restore_ark."""
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
import sys
|
|
5
|
+
import tempfile
|
|
6
|
+
import unittest
|
|
7
|
+
from contextlib import redirect_stdout
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from dirark.core import archive_dir, restore_ark
|
|
11
|
+
from dirark.storage import ARK_DIR_EXT, DB_NAME, TAR_EXT, TAR_PREFIX, open_db
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TestArchiveDir(unittest.TestCase):
    """archive_dir: ark layout, index contents, incremental and repeated runs."""

    def setUp(self) -> None:
        self.tmpdir = tempfile.TemporaryDirectory()
        self.tmp = Path(self.tmpdir.name)
        self.src = self.tmp / "source"
        self.ark = self.tmp / ("source" + ARK_DIR_EXT)
        self.src.mkdir()
        (self.src / "file1.txt").write_text("content of file1")
        (self.src / "subdir").mkdir()
        (self.src / "subdir" / "file2.txt").write_text("content of file2")
        (self.src / "empty_file.txt").touch()

    def tearDown(self) -> None:
        self.tmpdir.cleanup()

    def _rows(self, sql: str) -> list[tuple]:
        """Run sql against the ark index and return all rows.

        The connection is closed even if the query fails, so a failing
        test never leaves the database open.
        """
        db = open_db(self.ark / DB_NAME)
        try:
            return db.execute(sql).fetchall()
        finally:
            db.close()

    def _count(self, table: str) -> int:
        """Return the number of rows in the given index table."""
        return self._rows(f"SELECT COUNT(*) FROM {table}")[0][0]

    def test_initial_archive_creates_structure(self) -> None:
        archive_dir(self.src)
        self.assertTrue(self.ark.exists())
        self.assertTrue((self.ark / DB_NAME).exists())
        tars = list(self.ark.glob(f"{TAR_PREFIX}*{TAR_EXT}"))
        self.assertGreater(len(tars), 0)

    def test_initial_archive_records_all_files(self) -> None:
        archive_dir(self.src)
        paths = [row[0] for row in self._rows("SELECT path FROM files ORDER BY path")]
        self.assertEqual(paths, ["empty_file.txt", "file1.txt", "subdir/file2.txt"])

    def test_initial_archive_deduplicates_objects(self) -> None:
        archive_dir(self.src)
        self.assertGreaterEqual(self._count("objects"), 2)

    def test_incremental_archive_adds_new_files(self) -> None:
        archive_dir(self.src)
        (self.src / "new.txt").write_text("new content")
        (self.src / "sub2").mkdir()
        (self.src / "sub2" / "more.txt").write_text("more content")
        archive_dir(self.src)

        self.assertEqual(self._count("files"), 5)

    def test_idempotent_archiving(self) -> None:
        archive_dir(self.src)
        archive_dir(self.src)
        self.assertEqual(self._count("files"), 3)

    def test_deduplication_across_paths(self) -> None:
        # Same bytes as file1.txt under a second path: one more file row,
        # but no additional object.
        (self.src / "dup.txt").write_text("content of file1")
        archive_dir(self.src)
        n_files = self._count("files")
        n_objects = self._count("objects")
        self.assertEqual(n_files, 4)
        self.assertLess(n_objects, n_files)

    def test_archive_into_existing_ark(self) -> None:
        self.ark.mkdir()
        archive_dir(self.src, ark_out=self.ark)
        self.assertEqual(self._count("files"), 3)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class TestRestoreArk(unittest.TestCase):
    """restore_ark: files, contents and the empty-archive case."""

    def setUp(self) -> None:
        self.tmpdir = tempfile.TemporaryDirectory()
        self.tmp = Path(self.tmpdir.name)
        self.src = self.tmp / "source"
        self.ark = self.tmp / ("source" + ARK_DIR_EXT)
        self.dest = self.tmp / "restored"
        self.src.mkdir()
        (self.src / "file1.txt").write_text("content of file1")
        (self.src / "subdir").mkdir()
        (self.src / "subdir" / "file2.txt").write_text("content of file2")
        (self.src / "empty_file.txt").touch()

    def tearDown(self) -> None:
        self.tmpdir.cleanup()

    def test_restore_recreates_files(self) -> None:
        archive_dir(self.src)
        restore_ark(self.ark, self.dest)
        self.assertTrue((self.dest / "file1.txt").exists())
        self.assertTrue((self.dest / "subdir" / "file2.txt").exists())
        self.assertTrue((self.dest / "empty_file.txt").exists())

    def test_restore_preserves_content(self) -> None:
        archive_dir(self.src)
        restore_ark(self.ark, self.dest)
        self.assertEqual((self.dest / "file1.txt").read_text(), "content of file1")
        self.assertEqual(
            (self.dest / "subdir" / "file2.txt").read_text(),
            "content of file2",
        )
        self.assertEqual((self.dest / "empty_file.txt").read_text(), "")

    def test_restore_includes_incrementally_added_files(self) -> None:
        archive_dir(self.src)
        (self.src / "new.txt").write_text("new content")
        archive_dir(self.src)
        restore_ark(self.ark, self.dest)
        self.assertTrue((self.dest / "new.txt").exists())
        self.assertEqual((self.dest / "new.txt").read_text(), "new content")

    def test_restore_empty_archive_skips_dest_creation(self) -> None:
        self.ark.mkdir()
        open_db(self.ark / DB_NAME).close()
        # Discard the "no files" notice through a throwaway temp file
        # rather than a hard-coded /dev/null, which not every platform has.
        with tempfile.TemporaryFile(mode="w") as sink, redirect_stdout(sink):
            restore_ark(self.ark, self.dest)
        self.assertFalse(self.dest.exists())
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class TestCLI(unittest.TestCase):
    """End-to-end checks that run ``python -m dirark`` in a subprocess."""

    def setUp(self) -> None:
        self.tmpdir = tempfile.TemporaryDirectory()
        root = Path(self.tmpdir.name)
        self.tmp = root
        self.src = root / "source"
        self.ark = root / ("source" + ARK_DIR_EXT)
        self.dest = root / "restored"
        self.src.mkdir()
        (self.src / "file1.txt").write_text("content of file1")
        nested = self.src / "subdir"
        nested.mkdir()
        (nested / "file2.txt").write_text("content of file2")

    def tearDown(self) -> None:
        self.tmpdir.cleanup()

    def _run(self, *args: str) -> subprocess.CompletedProcess[str]:
        """Invoke the CLI with args and capture its output as text."""
        command = [sys.executable, "-m", "dirark"]
        command.extend(args)
        return subprocess.run(command, capture_output=True, text=True)

    def test_archive_command(self) -> None:
        outcome = self._run("archive", str(self.src))
        self.assertEqual(outcome.returncode, 0)
        self.assertIn("Archived", outcome.stdout)
        self.assertTrue((self.ark / DB_NAME).exists())

    def test_restore_command(self) -> None:
        self._run("archive", str(self.src))
        outcome = self._run("restore", str(self.ark), str(self.dest))
        self.assertEqual(outcome.returncode, 0)
        self.assertIn("Restored", outcome.stdout)
        restored = self.dest / "file1.txt"
        self.assertTrue(restored.exists())
        self.assertEqual(restored.read_text(), "content of file1")

    def test_restore_empty_archive(self) -> None:
        self.ark.mkdir()
        open_db(self.ark / DB_NAME).close()
        outcome = self._run("restore", str(self.ark), str(self.dest))
        self.assertEqual(outcome.returncode, 0)
        self.assertIn("No files found", outcome.stdout)
        self.assertFalse(self.dest.exists())

    def test_read_command(self) -> None:
        self._run("archive", str(self.src))
        # Captured as bytes (no text=True) because ``read`` writes raw bytes.
        command = [sys.executable, "-m", "dirark", "read", str(self.ark), "file1.txt"]
        outcome = subprocess.run(command, capture_output=True)
        self.assertEqual(outcome.returncode, 0)
        self.assertEqual(outcome.stdout, b"content of file1")

    def test_read_missing_file_exits_nonzero(self) -> None:
        self._run("archive", str(self.src))
        outcome = self._run("read", str(self.ark), "nonexistent.txt")
        self.assertNotEqual(outcome.returncode, 0)
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Tests for dirark.reader.ArkReader."""
|
|
2
|
+
|
|
3
|
+
import tempfile
|
|
4
|
+
import unittest
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from dirark.core import archive_dir
|
|
8
|
+
from dirark.reader import ArkReader
|
|
9
|
+
from dirark.storage import ARK_DIR_EXT
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TestArkReader(unittest.TestCase):
    """Exercise ``ArkReader`` against a small archive built in ``setUp``."""

    def setUp(self) -> None:
        """Create a temporary source tree and archive it.

        The tree holds a text file, a binary file in a subdirectory and an
        empty file. The tests open the archive at ``self.ark``.
        """
        self.tmpdir = tempfile.TemporaryDirectory()
        self.tmp = Path(self.tmpdir.name)
        self.src = self.tmp / "source"
        self.ark = self.tmp / f"source{ARK_DIR_EXT}"

        nested = self.src / "sub"
        self.src.mkdir()
        (self.src / "hello.txt").write_text("hello world")
        nested.mkdir()
        (nested / "data.bin").write_bytes(b"\x00\x01\x02\x03")
        (self.src / "empty.txt").touch()

        archive_dir(self.src)

    def tearDown(self) -> None:
        """Remove the temporary directory created in ``setUp``."""
        self.tmpdir.cleanup()

    def test_list_files(self) -> None:
        """``list_files`` returns the archived relative paths in sorted order."""
        expected = ["empty.txt", "hello.txt", "sub/data.bin"]
        with ArkReader(self.ark) as reader:
            listing = reader.list_files()
        self.assertEqual(listing, expected)

    def test_get_checksum_returns_string(self) -> None:
        """``get_checksum`` yields a ``str`` that is 128 characters long."""
        with ArkReader(self.ark) as reader:
            digest = reader.get_checksum("hello.txt")
        self.assertIsInstance(digest, str)
        self.assertEqual(len(digest), 128)

    def test_read_file_text(self) -> None:
        """A text file comes back as the bytes that were written."""
        with ArkReader(self.ark) as reader:
            payload = reader.read_file("hello.txt")
        self.assertEqual(payload, b"hello world")

    def test_read_file_binary(self) -> None:
        """A binary file in a subdirectory round-trips unchanged."""
        with ArkReader(self.ark) as reader:
            payload = reader.read_file("sub/data.bin")
        self.assertEqual(payload, b"\x00\x01\x02\x03")

    def test_read_empty_file(self) -> None:
        """An empty file is returned as an empty ``bytes`` object."""
        with ArkReader(self.ark) as reader:
            payload = reader.read_file("empty.txt")
        self.assertEqual(payload, b"")

    def test_missing_file_raises_key_error(self) -> None:
        """``read_file`` raises ``KeyError`` for a path not in the archive."""
        with ArkReader(self.ark) as reader, self.assertRaises(KeyError):
            reader.read_file("does_not_exist.txt")

    def test_get_checksum_missing_raises_key_error(self) -> None:
        """``get_checksum`` raises ``KeyError`` for a path not in the archive."""
        with ArkReader(self.ark) as reader, self.assertRaises(KeyError):
            reader.get_checksum("does_not_exist.txt")

    def test_context_manager_closes(self) -> None:
        """Using the reader after its ``with`` block has exited raises."""
        reader = ArkReader(self.ark)
        with reader:
            reader.list_files()
        # The exact exception type is not pinned, only that the call fails.
        with self.assertRaises(Exception):
            reader.list_files()
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Tests for dirark.storage primitives."""
|
|
2
|
+
|
|
3
|
+
import tempfile
|
|
4
|
+
import unittest
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from dirark.storage import (
|
|
8
|
+
ARK_DIR_EXT,
|
|
9
|
+
DB_NAME,
|
|
10
|
+
TAR_EXT,
|
|
11
|
+
TAR_PREFIX,
|
|
12
|
+
b2sum,
|
|
13
|
+
create_tar_zst,
|
|
14
|
+
ensure_clean_outdir,
|
|
15
|
+
extract_tar_zst,
|
|
16
|
+
list_tars,
|
|
17
|
+
next_tar_path,
|
|
18
|
+
open_db,
|
|
19
|
+
tar_size_mb,
|
|
20
|
+
write_objects_to_tar,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TestStoragePrimitives(unittest.TestCase):
    """Unit tests for the helper functions imported from ``dirark.storage``."""

    def setUp(self) -> None:
        """Create a temporary directory with a two-file source tree."""
        self.tmpdir = tempfile.TemporaryDirectory()
        self.tmp = Path(self.tmpdir.name)
        self.src = self.tmp / "src"
        self.src.mkdir()
        (self.src / "a.txt").write_text("hello")
        (self.src / "sub").mkdir()
        (self.src / "sub" / "b.txt").write_text("world")

    def tearDown(self) -> None:
        """Remove the temporary directory created in ``setUp``."""
        self.tmpdir.cleanup()

    def test_b2sum_returns_128_hex_chars(self) -> None:
        """``b2sum`` returns a ``str`` of length 128."""
        cs = b2sum(self.src / "a.txt")
        self.assertIsInstance(cs, str)
        self.assertEqual(len(cs), 128)

    def test_b2sum_is_deterministic(self) -> None:
        """Hashing the same file twice gives the same checksum."""
        self.assertEqual(b2sum(self.src / "a.txt"), b2sum(self.src / "a.txt"))

    def test_b2sum_differs_for_different_content(self) -> None:
        """Files with different content get different checksums."""
        self.assertNotEqual(
            b2sum(self.src / "a.txt"), b2sum(self.src / "sub" / "b.txt")
        )

    def test_open_db_creates_schema(self) -> None:
        """``open_db`` on a new path creates ``files`` and ``objects`` tables."""
        db = open_db(self.tmp / DB_NAME)
        # Close the handle even if the query fails, and before asserting, so
        # a failing assertion cannot leave the database file open while
        # tearDown removes the temporary directory.
        try:
            cur = db.cursor()
            cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = {row[0] for row in cur.fetchall()}
        finally:
            db.close()
        self.assertIn("files", tables)
        self.assertIn("objects", tables)

    def test_create_and_extract_tar_zst(self) -> None:
        """A tarball made from the source tree extracts ``a.txt`` intact."""
        tar = self.tmp / f"test{TAR_EXT}"
        create_tar_zst(self.src, tar)
        self.assertTrue(tar.exists())

        out = self.tmp / "extracted"
        out.mkdir()
        extract_tar_zst(tar, out)
        self.assertTrue((out / "a.txt").exists())
        self.assertEqual((out / "a.txt").read_text(), "hello")

    def test_list_tars_sorted(self) -> None:
        """``list_tars`` returns both tar files, ordered by name."""
        ark = self.tmp / "ark.ark.d"
        ark.mkdir()
        # Created out of order on purpose.
        (ark / f"{TAR_PREFIX}00002{TAR_EXT}").touch()
        (ark / f"{TAR_PREFIX}00001{TAR_EXT}").touch()
        tars = list_tars(ark)
        names = [t.name for t in tars]
        # Without the length check an empty result would trivially be sorted.
        self.assertEqual(len(names), 2)
        self.assertEqual(names, sorted(names))

    def test_tar_size_mb(self) -> None:
        """``tar_size_mb`` reports a positive size for a real tarball."""
        tar = self.tmp / f"test{TAR_EXT}"
        create_tar_zst(self.src, tar)
        self.assertGreater(tar_size_mb(tar), 0)

    def test_next_tar_path_first(self) -> None:
        """In an empty archive directory the first tar index is 00001."""
        ark = self.tmp / "ark.ark.d"
        ark.mkdir()
        path = next_tar_path(ark)
        self.assertEqual(path.name, f"{TAR_PREFIX}00001{TAR_EXT}")

    def test_next_tar_path_increments(self) -> None:
        """The next index is one past the existing tar (00003 -> 00004)."""
        ark = self.tmp / "ark.ark.d"
        ark.mkdir()
        (ark / f"{TAR_PREFIX}00003{TAR_EXT}").touch()
        path = next_tar_path(ark)
        self.assertEqual(path.name, f"{TAR_PREFIX}00004{TAR_EXT}")

    def test_next_tar_path_min_idx(self) -> None:
        """With ``min_idx=5`` and no existing tars the index is 00005."""
        ark = self.tmp / "ark.ark.d"
        ark.mkdir()
        path = next_tar_path(ark, min_idx=5)
        self.assertEqual(path.name, f"{TAR_PREFIX}00005{TAR_EXT}")

    def test_ensure_clean_outdir_passes(self) -> None:
        """A directory holding only the DB and a tar file is accepted."""
        ark = self.tmp / f"src{ARK_DIR_EXT}"
        ark.mkdir()
        (ark / DB_NAME).touch()
        (ark / f"{TAR_PREFIX}00001{TAR_EXT}").touch()
        ensure_clean_outdir(ark)  # should not raise

    def test_ensure_clean_outdir_raises_on_unexpected(self) -> None:
        """An unrelated file in the directory triggers ``RuntimeError``."""
        ark = self.tmp / f"src{ARK_DIR_EXT}"
        ark.mkdir()
        (ark / "unexpected.txt").touch()
        with self.assertRaises(RuntimeError):
            ensure_clean_outdir(ark)

    def test_write_objects_to_tar(self) -> None:
        """Writing one object creates the first tar and returns its name."""
        ark = self.tmp / "ark.ark.d"
        ark.mkdir()
        open_db(ark / DB_NAME).close()
        cs = b2sum(self.src / "a.txt")
        tar_name = write_objects_to_tar(ark, {cs: self.src / "a.txt"})
        self.assertTrue((ark / tar_name).exists())
        self.assertEqual(tar_name, f"{TAR_PREFIX}00001{TAR_EXT}")
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Tests for dirark.sync: merge_arks, push_ark, pull_ark, add_dir_to_remote_ark."""
|
|
2
|
+
|
|
3
|
+
import tempfile
|
|
4
|
+
import unittest
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from dirark.core import archive_dir, restore_ark
|
|
8
|
+
from dirark.storage import ARK_DIR_EXT, DB_NAME, open_db
|
|
9
|
+
from dirark.sync import add_dir_to_remote_ark, merge_arks, pull_ark, push_ark
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TestMergeArks(unittest.TestCase):
    """Tests for ``merge_arks`` using two single-file archives, A and B."""

    def setUp(self) -> None:
        """Archive directory ``a`` (file_a.txt) and directory ``b`` (file_b.txt)."""
        self.tmpdir = tempfile.TemporaryDirectory()
        self.tmp = Path(self.tmpdir.name)

        src_a = self.tmp / "a"
        src_a.mkdir()
        (src_a / "file_a.txt").write_text("content a")
        archive_dir(src_a)
        self.ark_a = self.tmp / ("a" + ARK_DIR_EXT)

        src_b = self.tmp / "b"
        src_b.mkdir()
        (src_b / "file_b.txt").write_text("content b")
        archive_dir(src_b)
        self.ark_b = self.tmp / ("b" + ARK_DIR_EXT)

    def tearDown(self) -> None:
        """Remove the temporary directory created in ``setUp``."""
        self.tmpdir.cleanup()

    def _query_ark_a(self, sql: str) -> list:
        """Run *sql* against archive A's database and return all rows.

        The connection is closed in ``finally`` so that neither a failing
        query nor a failing assertion in the caller leaves the database file
        open while ``tearDown`` deletes the temporary directory.
        """
        db = open_db(self.ark_a / DB_NAME)
        try:
            cur = db.cursor()
            cur.execute(sql)
            return cur.fetchall()
        finally:
            db.close()

    def test_merge_adds_missing_files(self) -> None:
        """After merging B into A, A's ``files`` table lists both files."""
        merge_arks(self.ark_b, self.ark_a)
        rows = self._query_ark_a("SELECT path FROM files ORDER BY path")
        paths = [row[0] for row in rows]
        self.assertIn("file_a.txt", paths)
        self.assertIn("file_b.txt", paths)

    def test_merged_ark_restores_all_files(self) -> None:
        """Restoring the merged archive A yields both files, B's content intact."""
        merge_arks(self.ark_b, self.ark_a)
        dest = self.tmp / "restored"
        restore_ark(self.ark_a, dest)
        self.assertTrue((dest / "file_a.txt").exists())
        self.assertTrue((dest / "file_b.txt").exists())
        self.assertEqual((dest / "file_b.txt").read_text(), "content b")

    def test_merge_deduplicates_objects(self) -> None:
        """Merging a file with identical content adds a file row but no object."""
        src_c = self.tmp / "c"
        src_c.mkdir()
        # Same text as file_a.txt, under a different name.
        (src_c / "dup.txt").write_text("content a")
        archive_dir(src_c)
        ark_c = self.tmp / ("c" + ARK_DIR_EXT)

        merge_arks(ark_c, self.ark_a)
        n_files = self._query_ark_a("SELECT COUNT(*) FROM files")[0][0]
        n_objects = self._query_ark_a("SELECT COUNT(*) FROM objects")[0][0]
        self.assertEqual(n_files, 2)
        self.assertEqual(n_objects, 1)

    def test_merge_is_idempotent(self) -> None:
        """Merging B into A twice leaves exactly two file rows."""
        merge_arks(self.ark_b, self.ark_a)
        merge_arks(self.ark_b, self.ark_a)
        n_files = self._query_ark_a("SELECT COUNT(*) FROM files")[0][0]
        self.assertEqual(n_files, 2)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class TestPushPullArk(unittest.TestCase):
    """Tests for ``push_ark`` and ``pull_ark`` with a local directory as remote."""

    def setUp(self) -> None:
        """Archive a one-file source tree and pick a remote path."""
        self.tmpdir = tempfile.TemporaryDirectory()
        self.tmp = Path(self.tmpdir.name)

        source_dir = self.tmp / "source"
        source_dir.mkdir()
        (source_dir / "file1.txt").write_text("pushed content")
        archive_dir(source_dir)

        self.local_ark = self.tmp / f"source{ARK_DIR_EXT}"
        self.remote = self.tmp / "remote_ark"

    def tearDown(self) -> None:
        """Remove the temporary directory created in ``setUp``."""
        self.tmpdir.cleanup()

    def test_push_copies_files(self) -> None:
        """Pushing puts the database and at least one tar file at the remote."""
        remote_target = str(self.remote)
        push_ark(self.local_ark, remote_target)

        db_path = self.remote / DB_NAME
        self.assertTrue(db_path.exists())

        tar_count = len(list(self.remote.glob("*.tar.zst")))
        self.assertGreater(tar_count, 0)

    def test_pull_copies_files(self) -> None:
        """Pulling from a pushed remote brings the database to the destination."""
        remote_target = str(self.remote)
        push_ark(self.local_ark, remote_target)

        pulled = self.tmp / "pulled"
        pull_ark(remote_target, pulled)

        self.assertTrue((pulled / DB_NAME).exists())

    def test_push_then_restore(self) -> None:
        """The pushed copy can be restored and yields the original content."""
        push_ark(self.local_ark, str(self.remote))

        restored_root = self.tmp / "restored"
        restore_ark(self.remote, restored_root)

        restored_file = restored_root / "file1.txt"
        self.assertTrue(restored_file.exists())
        self.assertEqual(restored_file.read_text(), "pushed content")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class TestAddDirToRemoteArk(unittest.TestCase):
    """Tests for ``add_dir_to_remote_ark`` against an already populated archive."""

    def setUp(self) -> None:
        """Build an archive with ``existing.txt`` and a directory with ``new.txt``."""
        self.tmpdir = tempfile.TemporaryDirectory()
        self.tmp = Path(self.tmpdir.name)

        # Archive that plays the role of the remote.
        seed_dir = self.tmp / "remote_src"
        seed_dir.mkdir()
        (seed_dir / "existing.txt").write_text("existing content")
        archive_dir(seed_dir)
        self.remote_ark = self.tmp / f"remote_src{ARK_DIR_EXT}"

        # Directory to add; it is not archived here.
        self.new_src = self.tmp / "new_content"
        self.new_src.mkdir()
        (self.new_src / "new.txt").write_text("new content")

    def tearDown(self) -> None:
        """Remove the temporary directory created in ``setUp``."""
        self.tmpdir.cleanup()

    def test_add_dir_merges_into_remote(self) -> None:
        """After adding, a restore of the remote holds both old and new files."""
        add_dir_to_remote_ark(self.new_src, str(self.remote_ark))

        restored_root = self.tmp / "restored"
        restore_ark(self.remote_ark, restored_root)

        added_file = restored_root / "new.txt"
        self.assertTrue((restored_root / "existing.txt").exists())
        self.assertTrue(added_file.exists())
        self.assertEqual(added_file.read_text(), "new content")

    def test_add_dir_preserves_existing_content(self) -> None:
        """Adding a directory leaves the previously archived file unchanged."""
        add_dir_to_remote_ark(self.new_src, str(self.remote_ark))

        restored_root = self.tmp / "restored"
        restore_ark(self.remote_ark, restored_root)

        kept_file = restored_root / "existing.txt"
        self.assertEqual(kept_file.read_text(), "existing content")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dirark
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Directory archival
|
|
5
|
+
Project-URL: Homepage, https://github.com/endremborza/dirark
|
|
6
|
+
Author-email: Endre Márk Borza <endremborza@gmail.com>
|
|
7
|
+
License: Copyright 2026 Endre Márk Borza
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# dirark
|
|
19
|
+
|
|
20
|
+
[dirark on PyPI](https://pypi.org/project/dirark/)
|
|
21
|
+
|
|
22
|
+
Directory archival
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
dirark/__init__.py,sha256=fzqQo7enYCicLXFvn28YdD32nE5bZwbrBMiOFrwIE7E,374
|
|
2
|
+
dirark/__main__.py,sha256=w6QRjnFJEV7N_-wpBNcZTzVmmP3qiRoUdXZLvN7XKtE,3747
|
|
3
|
+
dirark/core.py,sha256=KkHNCATlFm2xkKS13Py3VYhSI9IHUamFaYDR4TZf4xc,3644
|
|
4
|
+
dirark/reader.py,sha256=Awxk-sysH7w8y4ndhKTlU4zSBNSpDOitf48ULxPRoXg,2199
|
|
5
|
+
dirark/storage.py,sha256=9lXXem7qZ5hBDJYY3C-0R4F8DuR6MMf7yAx4AQ7iGNo,4167
|
|
6
|
+
dirark/sync.py,sha256=2UkxBrI0cxoQzpQ89vvqcvv_rb5N0CayWoBPmtDsKJ0,3942
|
|
7
|
+
dirark/tests/test_core.py,sha256=yE6CPYHZ5zpCumFOCfyRphsNmX_NTC1n571IiyEkRSU,7775
|
|
8
|
+
dirark/tests/test_init.py,sha256=9mLvYz3kQNqcrU8URCnfuGrmq7ZO9wFLwUIklyFhdt8,92
|
|
9
|
+
dirark/tests/test_reader.py,sha256=0dbqb-Kbwg7OvCi2VwaWJ3nvbNJTPfGtwRIy0YG0Ydg,2327
|
|
10
|
+
dirark/tests/test_storage.py,sha256=4_BBwaf1B4yO4UrpO_prah52LK-neM_IOtPLtMavMc0,4082
|
|
11
|
+
dirark/tests/test_sync.py,sha256=p2hH9Yc5AYka-3ZVS9fsE7Zw6tonGVp5W2Jkaj60nV8,5153
|
|
12
|
+
dirark-0.1.0.dist-info/METADATA,sha256=8mPfk_QhmehS4zl_fo9eJ_n4YTHsZkFTX5mpIAJntLw,1512
|
|
13
|
+
dirark-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
14
|
+
dirark-0.1.0.dist-info/licenses/LICENSE,sha256=NrHeFb1pfm6nQHXGcuumvUALSfJjJv9EZTv4rK-jwR0,1056
|
|
15
|
+
dirark-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2026 Endre Márk Borza
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|