dirark 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ name: Python Test
2
+
3
+ on:
4
+ pull_request:
5
+ branches:
6
+ - main
7
+ push:
8
+ branches:
9
+ - main
10
+
11
+ jobs:
12
+ test:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: astral-sh/setup-uv@v5
17
+ with:
18
+ python-version: '3.12'
19
+ enable-cache: true
20
+ - run: uv tool install branthebuilder
21
+ - run: branb test
@@ -0,0 +1,17 @@
1
+
2
+ name: Python Package Release
3
+
4
+ on:
5
+ release:
6
+ types: [published]
7
+
8
+ jobs:
9
+ release:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: astral-sh/setup-uv@v5
14
+ - run: uv build && uv publish
15
+ env:
16
+ UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN }}
17
+
@@ -0,0 +1,119 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Generated coverage HTML (report.md is committed, HTML is not)
72
+ coverage/html/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # pyenv
78
+ .python-version
79
+
80
+ # pipenv
81
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
82
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
83
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
84
+ # install all needed dependencies.
85
+ #Pipfile.lock
86
+
87
+ # celery beat schedule file
88
+ celerybeat-schedule
89
+
90
+ # SageMath parsed files
91
+ *.sage.py
92
+
93
+ # Environments
94
+ .env
95
+ .venv
96
+ env/
97
+ venv/
98
+ ENV/
99
+ env.bak/
100
+ venv.bak/
101
+
102
+ # Spyder project settings
103
+ .spyderproject
104
+ .spyproject
105
+
106
+ # Rope project settings
107
+ .ropeproject
108
+
109
+ # mypy
110
+ .mypy_cache/
111
+ .dmypy.json
112
+ dmypy.json
113
+
114
+ # Pyre type checker
115
+ .pyre/
116
+
117
+ .idea
118
+ .vscode
119
+ dask-worker-space/
dirark-0.1.0/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright 2026 Endre Márk Borza
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
dirark-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,22 @@
1
+ Metadata-Version: 2.4
2
+ Name: dirark
3
+ Version: 0.1.0
4
+ Summary: Directory archival
5
+ Project-URL: Homepage, https://github.com/endremborza/dirark
6
+ Author-email: Endre Márk Borza <endremborza@gmail.com>
7
+ License: Copyright 2026 Endre Márk Borza
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
14
+ License-File: LICENSE
15
+ Requires-Python: >=3.12
16
+ Description-Content-Type: text/markdown
17
+
18
+ # dirark
19
+
20
+ [![pypi](https://img.shields.io/pypi/v/dirark.svg)](https://pypi.org/project/dirark/)
21
+
22
+ Directory archival
dirark-0.1.0/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # dirark
2
+
3
+ [![pypi](https://img.shields.io/pypi/v/dirark.svg)](https://pypi.org/project/dirark/)
4
+
5
+ Directory archival
@@ -0,0 +1,14 @@
1
+ | Name | Stmts | Miss | Cover |
2
+ |------------------------------ | -------: | -------: | ------: |
3
+ | dirark/\_\_init\_\_.py | 5 | 0 | 100% |
4
+ | dirark/\_\_main\_\_.py | 92 | 92 | 0% |
5
+ | dirark/core.py | 71 | 4 | 94% |
6
+ | dirark/reader.py | 35 | 0 | 100% |
7
+ | dirark/storage.py | 57 | 0 | 100% |
8
+ | dirark/sync.py | 55 | 1 | 98% |
9
+ | dirark/tests/test\_core.py | 164 | 0 | 100% |
10
+ | dirark/tests/test\_init.py | 3 | 0 | 100% |
11
+ | dirark/tests/test\_reader.py | 55 | 0 | 100% |
12
+ | dirark/tests/test\_storage.py | 88 | 0 | 100% |
13
+ | dirark/tests/test\_sync.py | 116 | 0 | 100% |
14
+ | **TOTAL** | **741** | **97** | **87%** |
@@ -0,0 +1,17 @@
1
+ """dirark – directory archival and retrieval tool."""
2
+
3
+ from .core import archive_dir, restore_ark
4
+ from .reader import ArkReader
5
+ from .sync import add_dir_to_remote_ark, merge_arks, pull_ark, push_ark
6
+
7
+ __version__ = "0.1.0"
8
+
9
+ __all__ = [
10
+ "archive_dir",
11
+ "restore_ark",
12
+ "ArkReader",
13
+ "push_ark",
14
+ "pull_ark",
15
+ "merge_arks",
16
+ "add_dir_to_remote_ark",
17
+ ]
@@ -0,0 +1,124 @@
1
+ """CLI entry point for dirark."""
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from .core import archive_dir, restore_ark
8
+ from .reader import ArkReader
9
+ from .sync import add_dir_to_remote_ark, merge_arks, pull_ark, push_ark
10
+
11
+
12
def main() -> None:
    """Parse arguments and dispatch to the appropriate subcommand."""
    parser = argparse.ArgumentParser(
        description="Cold-storage directory archival tool."
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # One row per subcommand: (name, help text, positional args, handler).
    # An arg type of None means "plain string" (no type= conversion).
    commands: list[tuple[str, str, list[tuple[str, type | None]], object]] = [
        ("archive", "Archive a directory.",
         [("source_dir", Path)], _archive),
        ("restore", "Restore files from an ark.",
         [("ark_dir", Path), ("dest_dir", Path)], _restore),
        ("push", "Push local ark to remote via rsync.",
         [("local_ark", Path), ("remote", None)], _push),
        ("pull", "Pull remote ark to local path via rsync.",
         [("remote", None), ("local_ark", Path)], _pull),
        ("merge", "Merge src_ark into dst_ark (local).",
         [("src_ark", Path), ("dst_ark", Path)], _merge),
        ("add", "Archive a directory and add it to a remote ark.",
         [("source_dir", Path), ("remote_ark", None)], _add),
        ("read", "Print a file from an ark to stdout.",
         [("ark_dir", Path), ("file_path", None)], _read),
    ]
    for name, help_text, positionals, handler in commands:
        cmd = sub.add_parser(name, help=help_text)
        for arg_name, arg_type in positionals:
            if arg_type is None:
                cmd.add_argument(arg_name)
            else:
                cmd.add_argument(arg_name, type=arg_type)
        cmd.set_defaults(func=handler)

    args = parser.parse_args()
    args.func(args)
55
+
56
+
57
def _archive(args: argparse.Namespace) -> None:
    """Handle the `archive` subcommand: archive args.source_dir in place."""
    try:
        archive_dir(args.source_dir)
        print(f"Archived '{args.source_dir}'.")
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
64
+
65
+
66
def _restore(args: argparse.Namespace) -> None:
    """Handle the `restore` subcommand: extract an ark into args.dest_dir."""
    try:
        restore_ark(args.ark_dir, args.dest_dir)
        print(f"Restored to '{args.dest_dir}'.")
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
73
+
74
+
75
def _push(args: argparse.Namespace) -> None:
    """Handle the `push` subcommand: rsync a local ark to a remote target."""
    try:
        push_ark(args.local_ark, args.remote)
        print(f"Pushed '{args.local_ark}' to '{args.remote}'.")
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
82
+
83
+
84
def _pull(args: argparse.Namespace) -> None:
    """Handle the `pull` subcommand: rsync a remote ark to a local path."""
    try:
        pull_ark(args.remote, args.local_ark)
        print(f"Pulled '{args.remote}' to '{args.local_ark}'.")
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
91
+
92
+
93
def _merge(args: argparse.Namespace) -> None:
    """Handle the `merge` subcommand: fold src_ark into dst_ark locally."""
    try:
        merge_arks(args.src_ark, args.dst_ark)
        print(f"Merged '{args.src_ark}' into '{args.dst_ark}'.")
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
100
+
101
+
102
def _add(args: argparse.Namespace) -> None:
    """Handle the `add` subcommand: archive a directory into a remote ark."""
    try:
        add_dir_to_remote_ark(args.source_dir, args.remote_ark)
        print(f"Added '{args.source_dir}' to '{args.remote_ark}'.")
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
109
+
110
+
111
def _read(args: argparse.Namespace) -> None:
    """Handle the `read` subcommand: stream one archived file to stdout.

    Writes raw bytes via sys.stdout.buffer so binary content survives the
    round trip. Any failure — unknown path in the ark (KeyError) or an
    extraction/storage error — is reported on stderr and exits with status 1.
    """
    try:
        with ArkReader(args.ark_dir) as reader:
            sys.stdout.buffer.write(reader.read_file(args.file_path))
    except Exception as e:
        # The original had separate `except KeyError` and `except Exception`
        # clauses with byte-identical bodies; one clause covers both.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
121
+
122
+
123
+ if __name__ == "__main__":
124
+ main()
@@ -0,0 +1,118 @@
1
+ """Core archive and restore operations."""
2
+
3
+ import shutil
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ from .storage import (
8
+ ARK_DIR_EXT,
9
+ DB_NAME,
10
+ b2sum,
11
+ ensure_clean_outdir,
12
+ extract_tar_zst,
13
+ open_db,
14
+ write_objects_to_tar,
15
+ )
16
+
17
+
18
def archive_dir(src_dir: Path, ark_out: Path | None = None) -> None:
    """Archive src_dir into a content-addressed store.

    By default the archive is created at src_dir + ARK_DIR_EXT. Pass ark_out
    to write into an existing ark directory (useful for merging or remote push).

    Archiving is idempotent: re-running on the same directory is a no-op.
    Files with duplicate content are deduplicated by BLAKE2b checksum.
    Note: a relative path already recorded in the ark is skipped even if its
    on-disk content changed — the ark keeps the first version it saw.
    """
    if ark_out is None:
        ark_out = Path(f"{src_dir}{ARK_DIR_EXT}")
    # ensure_clean_outdir creates ark_out (with parents) and rejects any
    # unexpected files, so no separate mkdir call is needed beforehand.
    ensure_clean_outdir(ark_out)
    db = open_db(ark_out / DB_NAME)
    cur = db.cursor()

    # checksum -> source path, for content not yet stored in any tar
    new_objects: dict[str, Path] = {}
    # (relative path, checksum) rows to insert into the files table
    new_files: list[tuple[str, str]] = []

    for path in sorted(src_dir.rglob("*")):
        if not path.is_file():
            continue
        rel = path.relative_to(src_dir).as_posix()
        checksum = b2sum(path)

        # Skip paths already recorded — this is what makes re-runs no-ops.
        cur.execute("SELECT 1 FROM files WHERE path=?", (rel,))
        if cur.fetchone():
            continue

        # Only stage content that no existing tar already holds (dedup).
        cur.execute("SELECT tar_name FROM objects WHERE checksum=?", (checksum,))
        if cur.fetchone() is None:
            # setdefault: several new files may share one checksum.
            new_objects.setdefault(checksum, path)

        new_files.append((rel, checksum))

    if not new_files:
        db.close()
        return

    if new_objects:
        tar_name = write_objects_to_tar(ark_out, new_objects)
        for checksum in new_objects:
            cur.execute("INSERT INTO objects VALUES (?, ?)", (checksum, tar_name))

    for rel, checksum in new_files:
        cur.execute("INSERT INTO files VALUES (?, ?)", (rel, checksum))

    db.commit()
    db.close()
67
+
68
+
69
def restore_ark(ark_dir: Path, dest_dir: Path) -> None:
    """Restore all files from an ark to dest_dir.

    dest_dir is only created if there are files to restore.
    Missing tars or objects produce warnings but do not abort the restore.
    """
    db = open_db(ark_dir / DB_NAME)
    cur = db.cursor()

    cur.execute("SELECT path, checksum FROM files")
    files = cur.fetchall()

    if not files:
        print("No files found to restore in the archive.")
        db.close()
        return

    dest_dir.mkdir(parents=True, exist_ok=True)
    cur.execute("SELECT checksum, tar_name FROM objects")
    checksum_to_tar = dict(cur.fetchall())
    db.close()

    # Group requested files by the tar holding their content so each
    # tar archive is extracted at most once.
    by_tar: dict[Path, list[tuple[str, str]]] = {}
    for rel, checksum in files:
        tar_name = checksum_to_tar.get(checksum)
        if tar_name:
            by_tar.setdefault(ark_dir / tar_name, []).append((rel, checksum))
        else:
            # Orphaned files row: checksum has no objects entry at all.
            print(f"Warning: checksum {checksum} for {rel} not in objects.")

    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        obj_dir = tmp_path / "objects"
        obj_dir.mkdir()

        for tar_path, tar_files in by_tar.items():
            if not tar_path.exists():
                # Tar missing on disk: warn and move on (best-effort restore).
                print(f"Warning: {tar_path.name} not found.")
                continue

            # Tars contain an ./objects/<checksum> layout; extracting into
            # tmp_path populates obj_dir for this tar's files.
            extract_tar_zst(tar_path, tmp_path)

            for rel, checksum in tar_files:
                dest_file = dest_dir / rel
                dest_file.parent.mkdir(parents=True, exist_ok=True)
                src_obj = obj_dir / checksum
                if src_obj.exists():
                    # copy2 also preserves the stored object's metadata.
                    shutil.copy2(src_obj, dest_file)
                else:
                    print(f"Warning: object {checksum} missing from {tar_path.name}.")
@@ -0,0 +1,67 @@
1
+ """ArkReader: retrieve individual files from a dirark archive."""
2
+
3
+ import tempfile
4
+ from pathlib import Path
5
+
6
+ from .storage import DB_NAME, extract_object_from_tar, open_db
7
+
8
+
9
class ArkReader:
    """Read-only access to individual files within a dirark archive.

    Supports context manager usage::

        with ArkReader(ark_dir) as reader:
            data = reader.read_file("path/to/file.txt")
    """

    def __init__(self, ark_dir: Path) -> None:
        """Open an ark for reading."""
        self._ark_dir = ark_dir
        self._db = open_db(ark_dir / DB_NAME)

    def __enter__(self) -> "ArkReader":
        return self

    def __exit__(self, *_: object) -> None:
        self.close()

    def list_files(self) -> list[str]:
        """Return sorted list of all archived file paths."""
        rows = self._db.cursor().execute("SELECT path FROM files ORDER BY path")
        return [path for (path,) in rows.fetchall()]

    def get_checksum(self, path: str) -> str:
        """Return the BLAKE2b checksum of an archived file by relative path."""
        cursor = self._db.cursor()
        cursor.execute("SELECT checksum FROM files WHERE path=?", (path,))
        row = cursor.fetchone()
        if row is None:
            raise KeyError(f"File not found in ark: {path}")
        (checksum,) = row
        return checksum

    def read_file(self, path: str) -> bytes:
        """Read and return the raw bytes of an archived file.

        Raises KeyError if path is not in the ark.
        """
        query = (
            "SELECT f.checksum, o.tar_name "
            "FROM files f JOIN objects o ON f.checksum = o.checksum "
            "WHERE f.path=?"
        )
        row = self._db.cursor().execute(query, (path,)).fetchone()
        if row is None:
            raise KeyError(f"File not found in ark: {path}")
        checksum, tar_name = row
        with tempfile.TemporaryDirectory() as scratch:
            scratch_dir = Path(scratch)
            # Tars store content under ./objects/<checksum>.
            extract_object_from_tar(self._ark_dir / tar_name, checksum, scratch_dir)
            return (scratch_dir / "objects" / checksum).read_bytes()

    def close(self) -> None:
        """Close the underlying database connection."""
        self._db.close()
@@ -0,0 +1,143 @@
1
+ """Low-level storage primitives: checksums, tar I/O, and database access."""
2
+
3
+ import shutil
4
+ import sqlite3
5
+ import subprocess
6
+ import tempfile
7
+ from pathlib import Path
8
+
9
+ MAX_TAR_MB = 256
10
+ DB_NAME = "index.sqlite"
11
+ SEP = "-"
12
+ TAR_PREFIX = "data" + SEP
13
+ TAR_EXT = ".tar.zst"
14
+ ARK_DIR_EXT = ".ark.d"
15
+
16
+
17
def b2sum(path: Path) -> str:
    """Compute the BLAKE2b-512 checksum of a file, as a hex string.

    Uses hashlib instead of shelling out to the system ``b2sum`` binary:
    ``hashlib.blake2b`` with its default 64-byte digest produces exactly
    the same hex digest as ``b2sum``'s default output, works on systems
    without coreutils, and avoids one subprocess per archived file.
    The file is read in 1 MiB chunks so memory use stays constant.
    """
    import hashlib  # local import: keeps the module's import surface unchanged

    hasher = hashlib.blake2b()
    with path.open("rb") as f:
        while chunk := f.read(1 << 20):
            hasher.update(chunk)
    return hasher.hexdigest()
26
+
27
+
28
def open_db(path: Path) -> sqlite3.Connection:
    """Open or create an ark SQLite database, ensuring the schema exists.

    Tables:
        files(path TEXT PK, checksum TEXT) -- relative path → checksum
        objects(checksum TEXT PK, tar_name TEXT) -- checksum → archive name
    """
    conn = sqlite3.connect(path)
    schema = (
        "CREATE TABLE IF NOT EXISTS files("
        "path TEXT PRIMARY KEY, checksum TEXT NOT NULL)",
        "CREATE TABLE IF NOT EXISTS objects("
        "checksum TEXT PRIMARY KEY, tar_name TEXT NOT NULL)",
    )
    for statement in schema:
        conn.execute(statement)
    return conn
45
+
46
+
47
def list_tars(ark_dir: Path) -> list[Path]:
    """Return sorted list of tar archives in an ark directory."""
    pattern = f"{TAR_PREFIX}*{TAR_EXT}"
    return sorted(ark_dir.glob(pattern))
50
+
51
+
52
def tar_size_mb(path: Path) -> float:
    """Return file size in megabytes."""
    size_bytes = path.stat().st_size
    return size_bytes / (1024 * 1024)
55
+
56
+
57
def next_tar_path(ark_dir: Path, min_idx: int = 0) -> Path:
    """Return path for the next tar archive in ark_dir.

    Index is max(last_existing + 1, min_idx), guaranteeing no name collision.
    """
    existing = list_tars(ark_dir)
    if existing:
        # Tar names look like data-00042.tar.zst; the index is the last
        # SEP-delimited token of the name with the extension stripped.
        stem = existing[-1].name.removesuffix(TAR_EXT)
        last_idx = int(stem.split(SEP)[-1])
    else:
        last_idx = 0
    idx = max(last_idx + 1, min_idx)
    return ark_dir / f"{TAR_PREFIX}{idx:05d}{TAR_EXT}"
66
+
67
+
68
def extract_tar_zst(src: Path, dest: Path) -> None:
    """Extract a zstd-compressed tar archive into dest."""
    # List form (shell=False) keeps paths with spaces safe.
    cmd = ["tar", "-I", "zstd", "-xf", str(src), "-C", str(dest)]
    subprocess.run(cmd, check=True)
74
+
75
+
76
def extract_object_from_tar(tar: Path, checksum: str, dest: Path) -> None:
    """Extract a single object by checksum from a tar archive into dest."""
    # Members are stored as ./objects/<checksum> inside the tar.
    member = f"./objects/{checksum}"
    cmd = ["tar", "-I", "zstd", "-xf", str(tar), "-C", str(dest), member]
    subprocess.run(cmd, check=True)
91
+
92
+
93
def create_tar_zst(src_dir: Path, out: Path) -> None:
    """Create a zstd-compressed tar of src_dir contents at out."""
    # -C first so "." refers to src_dir's contents, not the cwd.
    cmd = ["tar", "-C", str(src_dir), "-I", "zstd", "-cf", str(out), "."]
    subprocess.run(cmd, check=True)
99
+
100
+
101
def ensure_clean_outdir(ark_dir: Path) -> None:
    """Create ark_dir if needed and raise RuntimeError on unexpected files."""
    ark_dir.mkdir(parents=True, exist_ok=True)
    # Only the index database and existing data tars may live in an ark dir.
    tar_names = {p.name for p in ark_dir.glob(f"{TAR_PREFIX}*{TAR_EXT}")}
    allowed = tar_names | {DB_NAME}
    for entry in ark_dir.iterdir():
        if entry.name not in allowed:
            raise RuntimeError(f"Unexpected file in archive dir: {entry}")
108
+
109
+
110
def write_objects_to_tar(
    ark_dir: Path,
    objects: dict[str, Path],
    min_tar_idx: int = 0,
) -> str:
    """Append objects (checksum → source path) to a tar archive in ark_dir.

    Reuses the last tar if it is under MAX_TAR_MB, otherwise creates a new
    one. min_tar_idx can be used to avoid index collisions when merging arks.

    Returns the tar filename written to.
    """
    tars = list_tars(ark_dir)
    # NOTE(review): when the last tar is reused, min_tar_idx is not consulted;
    # confirm callers never need the index floor to apply in that case.
    if tars and tar_size_mb(tars[-1]) < MAX_TAR_MB:
        tar_path = tars[-1]
    else:
        tar_path = next_tar_path(ark_dir, min_idx=min_tar_idx)

    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        obj_dir = tmp_dir / "objects"
        obj_dir.mkdir(parents=True)

        # "Appending" means: unpack the existing tar, add new objects, then
        # repack the whole tree — tar+zstd has no cheap in-place append.
        if tar_path.exists():
            extract_tar_zst(tar_path, tmp_dir)

        for checksum, src in objects.items():
            dest = obj_dir / checksum
            # Existing object wins: content-addressed, so bytes are identical.
            if not dest.exists():
                shutil.copy2(src, dest)

        create_tar_zst(tmp_dir, tar_path)

    return tar_path.name