PyPI - nedb-engine - Versions diffs - 0.1.0__py3-none-any.whl - Mend

nedb-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

nedb/__init__.py +23 -0
nedb/cascade.py +130 -0
nedb/engine.py +253 -0
nedb/index.py +88 -0
nedb/log.py +126 -0
nedb/merkle.py +62 -0
nedb/query.py +235 -0
nedb/relations.py +51 -0
nedb/store.py +53 -0
nedb_engine-0.1.0.dist-info/METADATA +168 -0
nedb_engine-0.1.0.dist-info/RECORD +14 -0
nedb_engine-0.1.0.dist-info/WHEEL +5 -0
nedb_engine-0.1.0.dist-info/licenses/LICENSE +201 -0
nedb_engine-0.1.0.dist-info/top_level.txt +1 -0

nedb/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""
+NEDB — a versioned, self-compressing, time-traveling embedded database.
+  * Replay-protected & idempotent: every write carries a monotonic nonce and an
+    optional idempotency key, enforced by a hash-chained append-only log.
+  * Time-travel: read the database AS OF any past sequence number.
+  * Relational: first-class, time-travel-aware relations with O(1) traversal.
+  * Filterable / sortable / searchable: equality, ordered, and full-text indexes.
+  * Queryable: NQL text queries and a fluent builder that share one plan.
+  * git-style files with Cascade compression: content-defined chunking + dedup +
+    temperature tiers, with a Merkle root per version anchorable on-chain.
+This pure-Python package is the reference implementation. The production speed core
+is Rust (see ../rust), exposed to PyPI via PyO3 and to npm via napi-rs.
+"""
+from __future__ import annotations
+from .engine import NEDB
+from .log import Op, OpLog, ReplayError
+from .query import Query, parse_nql
+# Names re-exported as the package-level public API.
+__all__ = ["NEDB", "OpLog", "Op", "ReplayError", "Query", "parse_nql"]
+# Version string of this package.
+__version__ = "0.1.0"

nedb/cascade.py ADDED Viewed

@@ -0,0 +1,130 @@
+"""
+nedb.cascade — the Cascade compression pipeline + content-addressed blob store.
+This is what makes NEDB double as a git-style file manager with maximum compression
+WITHOUT inventing a new entropy coder. The novelty is the pipeline composition:
+  1. Content-defined chunking (Gear rolling hash) — boundaries follow content, so a
+     one-byte insert only changes the chunk(s) around it, not everything after it.
+  2. Content-addressed dedup (BLAKE) — identical chunks across all files and all
+     versions are stored exactly once.
+  3. Temperature tiers — warm data uses a fast codec (zstd in prod; zlib in this
+     reference), cold/archival history uses a maximum-ratio codec (LZMA).
+The production pipeline adds similarity-picked binary deltas (zstd --patch-from) and
+schema-aware columnar transforms before the entropy stage; both are documented in
+docs/SPEC.md and stubbed for the reference engine.
+"""
+from __future__ import annotations
+import hashlib
+import lzma
+import random
+import zlib
+from typing import Dict, List
+from .merkle import merkle_root
+# --- Gear-hash content-defined chunking -------------------------------------
+# A chunk boundary is declared when the low 13 bits of the rolling hash are all zero.
+_MASK = (1 << 13) - 1            # ~8 KiB average chunk
+# Minimum chunk length before a hash boundary is honoured (2 KiB).
+_MIN = 2 * 1024
+# Hard cap on chunk length (64 KiB); a chunk is cut here even without a hash boundary.
+_MAX = 64 * 1024
+# Mask that keeps the rolling hash within 64 bits.
+_M64 = 0xFFFFFFFFFFFFFFFF
+# One 64-bit value per possible byte, each drawn from its own fixed seed, so the table
+# (and therefore every chunk boundary) is the same on every run.
+_GEAR = [random.Random(0x12345678 + i).getrandbits(64) for i in range(256)]
+def chunk(data: bytes) -> List[bytes]:
+    chunks: List[bytes] = []
+    n = len(data)
+    i = 0
+    while i < n:
+        limit = min(i + _MAX, n)
+        h = 0
+        pos = i
+        cut = limit
+        while pos < limit:
+            h = ((h << 1) + _GEAR[data[pos]]) & _M64
+            pos += 1
+            if (pos - i) >= _MIN and (h & _MASK) == 0:
+                cut = pos
+                break
+        chunks.append(data[i:cut])
+        i = cut
+    return chunks
+def _blake(b: bytes) -> str:
+    return hashlib.blake2b(b, digest_size=32).hexdigest()
+# --- temperature tiers ------------------------------------------------------
+def warm_compress(b: bytes) -> bytes:    # zstd stand-in in the reference
+    return zlib.compress(b, 6)
+def warm_decompress(b: bytes) -> bytes:
+    return zlib.decompress(b)
+def cold_compress(b: bytes) -> bytes:    # real LZMA — the maximum-ratio archival tier
+    return lzma.compress(b, preset=9 | lzma.PRESET_EXTREME)
+def cold_decompress(b: bytes) -> bytes:
+    return lzma.decompress(b)
+class BlobStore:
+    """Content-addressed, deduplicated, tiered blob store with versioned files."""
+    def __init__(self, tier: str = "warm") -> None:
+        self.tier = tier
+        self.chunks: Dict[str, bytes] = {}                      # hash -> compressed bytes
+        self.files: Dict[str, Dict[str, list]] = {}            # name -> {versions, roots}
+        self.logical_bytes = 0
+        self.dedup_hits = 0
+    def _compress(self, b: bytes) -> bytes:
+        return cold_compress(b) if self.tier == "cold" else warm_compress(b)
+    def _decompress(self, b: bytes) -> bytes:
+        return cold_decompress(b) if self.tier == "cold" else warm_decompress(b)
+    def put_file(self, name: str, data: bytes) -> int:
+        recipe: List[str] = []
+        for c in chunk(data):
+            hh = _blake(c)
+            recipe.append(hh)
+            if hh in self.chunks:
+                self.dedup_hits += 1
+            else:
+                self.chunks[hh] = self._compress(c)
+        self.logical_bytes += len(data)
+        f = self.files.setdefault(name, {"versions": [], "roots": []})
+        f["versions"].append(recipe)
+        f["roots"].append(merkle_root(recipe))
+        return len(f["versions"]) - 1
+    def get_file(self, name: str, version: int = -1) -> bytes:
+        recipe = self.files[name]["versions"][version]
+        out = bytearray()
+        for hh in recipe:
+            out += self._decompress(self.chunks[hh])
+        return bytes(out)
+    def root(self, name: str, version: int = -1) -> str:
+        return self.files[name]["roots"][version]
+    def stored_bytes(self) -> int:
+        return sum(len(v) for v in self.chunks.values())
+    def stats(self) -> dict:
+        stored = self.stored_bytes()
+        return {
+            "tier": self.tier,
+            "unique_chunks": len(self.chunks),
+            "dedup_hits": self.dedup_hits,
+            "logical_bytes": self.logical_bytes,
+            "stored_bytes": stored,
+            "ratio": round(self.logical_bytes / stored, 2) if stored else 0.0,
+        }

nedb/engine.py ADDED Viewed

@@ -0,0 +1,253 @@
+"""
+nedb.engine — the NEDB database: log + MVCC store + relations + indexes + Cascade.
+The OpLog is the source of truth. Every mutation appends an Op; `_apply` deterministically
+folds an Op into the materialized state (store / relations / indexes). Because state is a
+pure function of the log, we get crash recovery and determinism (rebuild) for free, and
+"AS OF seq" time-travel because the log carries monotonic seqs.
+"""
+from __future__ import annotations
+from typing import Any, Dict, List, Optional
+from .cascade import BlobStore
+from .index import Indexes, tokenize
+from .log import Op, OpLog, ReplayError  # noqa: F401  (re-exported)
+from .merkle import merkle_proof, merkle_verify
+from .query import Query, cmp, parse_nql
+from .relations import Relations
+from .store import MVCCStore
+def apply_op(store: MVCCStore, relations: Relations, indexes: Indexes, op: Op) -> None:
+    """Deterministically fold one op into materialized state."""
+    p = op.payload
+    if op.op == "put":
+        key, coll, doc = p["key"], p["coll"], p["doc"]
+        old = store.get(key)
+        if old is not None:
+            indexes.remove(coll, key, old)
+        store.put(key, doc, op.seq)
+        indexes.add(coll, key, doc)
+    elif op.op == "delete":
+        key, coll = p["key"], p["coll"]
+        old = store.get(key)
+        if old is not None:
+            indexes.remove(coll, key, old)
+        store.delete(key, op.seq)
+    elif op.op == "link":
+        relations.link(p["frm"], p["rel"], p["to"], op.seq)
+    elif op.op == "unlink":
+        relations.unlink(p["frm"], p["rel"], p["to"], op.seq)
+    elif op.op == "put_file":
+        pass  # bytes live in the content-addressed BlobStore; log records the root only
+class NEDB:
+    def __init__(self) -> None:
+        self.log = OpLog()
+        self.store = MVCCStore()
+        self.relations = Relations()
+        self.indexes = Indexes()
+        self.blobs: Dict[str, BlobStore] = {"warm": BlobStore("warm"), "cold": BlobStore("cold")}
+        self._nonce: Dict[str, int] = {}
+    # --- nonce helper -------------------------------------------------------
+    def _next(self, client: str) -> int:
+        n = self._nonce.get(client, 0) + 1
+        self._nonce[client] = n
+        return n
+    # --- mutations ----------------------------------------------------------
+    def put(self, coll: str, id: str, doc: dict, client: str = "local",
+            nonce: Optional[int] = None, idem: Optional[str] = None) -> dict:
+        key = f"{coll}:{id}"
+        doc = dict(doc)
+        doc.setdefault("_id", id)
+        nonce = self._next(client) if nonce is None else nonce
+        op, created = self.log.append(client, nonce, "put",
+                                      {"key": key, "coll": coll, "id": id, "doc": doc}, idem)
+        if created:
+            apply_op(self.store, self.relations, self.indexes, op)
+        return self.store.get(key)
+    def delete(self, coll: str, id: str, client: str = "local",
+               nonce: Optional[int] = None, idem: Optional[str] = None) -> None:
+        key = f"{coll}:{id}"
+        nonce = self._next(client) if nonce is None else nonce
+        op, created = self.log.append(client, nonce, "delete",
+                                      {"key": key, "coll": coll, "id": id}, idem)
+        if created:
+            apply_op(self.store, self.relations, self.indexes, op)
+    def get(self, coll: str, id: str, as_of: Optional[int] = None) -> Optional[dict]:
+        return self.store.get(f"{coll}:{id}", as_of)
+    # --- relations ----------------------------------------------------------
+    def link(self, frm: str, rel: str, to: str, client: str = "local",
+             nonce: Optional[int] = None) -> None:
+        nonce = self._next(client) if nonce is None else nonce
+        op, created = self.log.append(client, nonce, "link", {"frm": frm, "rel": rel, "to": to})
+        if created:
+            apply_op(self.store, self.relations, self.indexes, op)
+    def unlink(self, frm: str, rel: str, to: str, client: str = "local",
+               nonce: Optional[int] = None) -> None:
+        nonce = self._next(client) if nonce is None else nonce
+        op, created = self.log.append(client, nonce, "unlink", {"frm": frm, "rel": rel, "to": to})
+        if created:
+            apply_op(self.store, self.relations, self.indexes, op)
+    def neighbors(self, frm: str, rel: str, as_of: Optional[int] = None) -> List[str]:
+        return self.relations.neighbors(frm, rel, as_of)
+    def inbound(self, to: str, rel: str, as_of: Optional[int] = None) -> List[str]:
+        return self.relations.inbound(to, rel, as_of)
+    # --- indexes ------------------------------------------------------------
+    def create_index(self, coll: str, field: str, kind: str = "eq") -> None:
+        self.indexes.ensure(coll, field, kind)
+        # backfill existing rows at HEAD
+        for key in self.store.keys(coll + ":"):
+            doc = self.store.get(key)
+            if doc is not None:
+                self.indexes.add(coll, key, doc)
+    # --- queries ------------------------------------------------------------
+    def q(self, coll: str) -> Query:
+        return Query(self, coll)
+    def query(self, nql: str) -> List[dict]:
+        return self.execute(parse_nql(nql))
+    def execute(self, plan: dict) -> List[dict]:
+        coll = plan["from"]
+        as_of = plan.get("as_of")
+        prefix = coll + ":"
+        where = plan.get("where", [])
+        search = plan.get("search")
+        candidates: Optional[set] = None
+        # 1) full-text search is usually most selective
+        if search:
+            sfields = self.indexes.search_fields(coll)
+            if sfields:
+                per_term = []
+                for term in tokenize(search):
+                    s: set = set()
+                    for f in sfields:
+                        s |= self.indexes.search_lookup(coll, f, term)
+                    per_term.append(s)
+                candidates = set.intersection(*per_term) if per_term else set()
+        # 2) equality-index acceleration (HEAD reads only)
+        if candidates is None and as_of is None:
+            for (f, op, v) in where:
+                if op == "=" and self.indexes.has_eq(coll, f):
+                    candidates = self.indexes.eq_lookup(coll, f, v)
+                    break
+        # 3) fallback: scan the collection
+        if candidates is None:
+            candidates = set(self.store.keys(prefix, as_of))
+        # load + final predicate filter (guarantees correctness regardless of index path)
+        rows = []
+        for key in candidates:
+            doc = self.store.get(key, as_of)
+            if doc is None:
+                continue
+            if all(cmp(doc.get(f), op, v) for (f, op, v) in where):
+                if search and not self.indexes.search_fields(coll):
+                    blob = " ".join(str(x) for x in doc.values()).lower()
+                    if not all(t in blob for t in tokenize(search)):
+                        continue
+                rows.append((key, doc))
+        # order
+        ob = plan.get("order_by")
+        if ob:
+            field, direction = ob
+            try:
+                rows.sort(key=lambda kv: (kv[1].get(field) is None, kv[1].get(field)),
+                          reverse=(direction == "DESC"))
+            except TypeError:
+                rows.sort(key=lambda kv: str(kv[1].get(field)), reverse=(direction == "DESC"))
+        # traverse relations
+        if plan.get("traverse"):
+            rel = plan["traverse"]
+            seen, trav = set(), []
+            for key, _ in rows:
+                for nb in self.relations.neighbors(key, rel, as_of):
+                    if nb in seen:
+                        continue
+                    seen.add(nb)
+                    d = self.store.get(nb, as_of)
+                    if d is not None:
+                        trav.append((nb, d))
+            rows = trav
+        if plan.get("limit") is not None:
+            rows = rows[: plan["limit"]]
+        return [d for _, d in rows]
+    # --- files (git-style, Cascade-compressed) ------------------------------
+    def put_file(self, name: str, data: bytes, tier: str = "warm", client: str = "local",
+                 nonce: Optional[int] = None, idem: Optional[str] = None) -> int:
+        """Store a file version (Cascade-compressed, deduplicated). Returns the
+        integer version index; fetch its anchorable hash via file_root(name, version)."""
+        bs = self.blobs[tier]
+        version = bs.put_file(name, data)
+        root = bs.root(name, version)
+        nonce = self._next(client) if nonce is None else nonce
+        self.log.append(client, nonce, "put_file",
+                        {"name": name, "tier": tier, "version": version, "root": root}, idem)
+        return version
+    def get_file(self, name: str, version: int = -1, tier: str = "warm") -> bytes:
+        return self.blobs[tier].get_file(name, version)
+    def file_root(self, name: str, version: int = -1, tier: str = "warm") -> str:
+        return self.blobs[tier].root(name, version)
+    def file_proof(self, name: str, chunk_index: int, version: int = -1, tier: str = "warm"):
+        """Return (leaf, proof, root) proving chunk_index is part of the version."""
+        recipe = self.blobs[tier].files[name]["versions"][version]
+        root = self.blobs[tier].files[name]["roots"][version]
+        leaf = recipe[chunk_index]
+        return leaf, merkle_proof(recipe, chunk_index), root
+    @staticmethod
+    def verify_proof(leaf, proof, root) -> bool:
+        return merkle_verify(leaf, proof, root)
+    def compression_stats(self, tier: str = "warm") -> dict:
+        return self.blobs[tier].stats()
+    # --- integrity / determinism -------------------------------------------
+    def verify(self) -> bool:
+        """Verify the hash-chained op log has not been tampered with."""
+        return self.log.verify()
+    def rebuild(self):
+        """Replay the log into fresh state — proves state is a pure function of the log."""
+        store, relations, indexes = MVCCStore(), Relations(), Indexes()
+        for (c, f, k) in self.indexes.config:
+            indexes.ensure(c, f, k)
+        for op in self.log.ops:
+            apply_op(store, relations, indexes, op)
+        return store, relations, indexes
+    def verify_determinism(self) -> bool:
+        store, _, _ = self.rebuild()
+        return store.snapshot() == self.store.snapshot()
+    @property
+    def head(self) -> str:
+        return self.log.head
+    @property
+    def seq(self) -> int:
+        return len(self.log) - 1

nedb/index.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""
+nedb.index — secondary indexes: equality (hash), ordered (bisect), full-text (inverted).
+Indexes are maintained incrementally on write and reflect HEAD. Equality and full-text
+indexes turn filter and search from O(n) scans into index lookups; the ordered index is
+kept sorted on write, but this module exposes no lookup for it yet. Indexes are keyed by
+"collection.field" so each collection has its own index namespace.
+(Time-travel queries fall back to a version scan in the engine; temporally-indexed
+reads are a documented later optimization.)
+"""
+from __future__ import annotations
+import bisect
+import re
+from typing import Any, Dict, List, Set
+_TOKEN = re.compile(r"[a-z0-9]+")
+def tokenize(text: str) -> Set[str]:
+    return set(_TOKEN.findall(text.lower()))
+class Indexes:
+    def __init__(self) -> None:
+        self.eq: Dict[str, Dict[Any, Set[str]]] = {}        # key -> value -> {ids}
+        self.ordered: Dict[str, List[tuple]] = {}           # key -> sorted [(value,id)]
+        self.inv: Dict[str, Dict[str, Set[str]]] = {}       # key -> token -> {ids}
+        self.config: List[tuple] = []                       # [(coll, field, kind)]
+    def ensure(self, coll: str, field: str, kind: str = "eq") -> None:
+        k = f"{coll}.{field}"
+        if (coll, field, kind) not in self.config:
+            self.config.append((coll, field, kind))
+        if kind == "eq":
+            self.eq.setdefault(k, {})
+        elif kind == "ordered":
+            self.ordered.setdefault(k, [])
+        elif kind == "search":
+            self.inv.setdefault(k, {})
+        else:
+            raise ValueError(f"unknown index kind: {kind}")
+    def add(self, coll: str, key: str, doc: dict) -> None:
+        for field, vmap in self.eq.items():
+            f = field.split(".", 1)[1]
+            if field.startswith(coll + ".") and f in doc:
+                vmap.setdefault(doc[f], set()).add(key)
+        for field, lst in self.ordered.items():
+            f = field.split(".", 1)[1]
+            if field.startswith(coll + ".") and f in doc and isinstance(doc[f], (int, float, str)):
+                bisect.insort(lst, (doc[f], key))
+        for field, inv in self.inv.items():
+            f = field.split(".", 1)[1]
+            if field.startswith(coll + ".") and isinstance(doc.get(f), str):
+                for tok in tokenize(doc[f]):
+                    inv.setdefault(tok, set()).add(key)
+    def remove(self, coll: str, key: str, doc: dict) -> None:
+        for field, vmap in self.eq.items():
+            f = field.split(".", 1)[1]
+            if field.startswith(coll + ".") and f in doc and doc[f] in vmap:
+                vmap[doc[f]].discard(key)
+        for field, lst in self.ordered.items():
+            f = field.split(".", 1)[1]
+            if field.startswith(coll + ".") and f in doc:
+                try:
+                    lst.remove((doc[f], key))
+                except ValueError:
+                    pass
+        for field, inv in self.inv.items():
+            f = field.split(".", 1)[1]
+            if field.startswith(coll + ".") and isinstance(doc.get(f), str):
+                for tok in tokenize(doc[f]):
+                    if tok in inv:
+                        inv[tok].discard(key)
+    def eq_lookup(self, coll: str, field: str, value: Any):
+        return set(self.eq.get(f"{coll}.{field}", {}).get(value, set()))
+    def search_lookup(self, coll: str, field: str, term: str):
+        return set(self.inv.get(f"{coll}.{field}", {}).get(term, set()))
+    def has_eq(self, coll: str, field: str) -> bool:
+        return f"{coll}.{field}" in self.eq
+    def search_fields(self, coll: str) -> List[str]:
+        return [k.split(".", 1)[1] for k in self.inv if k.startswith(coll + ".")]

nedb/log.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""
+nedb.log — the append-only, hash-chained, nonce-enforced, idempotent operation log.
+This is the single source of truth for NEDB. Every mutation in the database is an
+Op appended here. Three guarantees live in this one structure:
+  * Replay protection  — each client has a strictly-monotonic nonce; an op whose
+                         nonce is <= the client's last seen nonce is rejected.
+  * Idempotency        — an op carrying an idempotency key that was already applied
+                         returns the original result and is NOT appended again.
+  * Tamper evidence    — ops are chained by hash (h_n = H(h_{n-1} || op_n)), so the
+                         whole history is a verifiable chain and the head hash is a
+                         commitment to the entire log (anchorable on a blockchain).
+The same log is the substrate for MVCC snapshot isolation, crash recovery, and
+time-travel reads: every Op has a monotonic `seq`, and state "AS OF seq N" is just
+the log truncated at N.
+"""
+from __future__ import annotations
+import hashlib
+import json
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+# Head hash of an empty log, and prev_hash of the first op: 64 zero hex digits.
+GENESIS = "0" * 64
+def canon(obj: Any) -> bytes:
+    """Deterministic canonical encoding for hashing."""
+    return json.dumps(obj, sort_keys=True, separators=(",", ":"), default=str).encode()
+def blake(data: bytes) -> str:
+    # Reference uses BLAKE2b (stdlib). The production Rust core uses BLAKE3
+    # (faster, natively tree-structured for the Merkle history).
+    return hashlib.blake2b(data, digest_size=32).hexdigest()
+class ReplayError(Exception):
+    """Raised when an op is replayed with a stale/duplicate nonce.
+    ``OpLog.append`` raises it when the supplied nonce is not strictly greater than the
+    last nonce accepted for the same client.
+    """
+@dataclass
+class Op:
+    """One record of the operation log, as created by ``OpLog.append``."""
+    # Position of the op in the log (the log length at the time it was appended).
+    seq: int
+    # Identifier of the client that issued the op.
+    client: str
+    # Per-client nonce; must exceed the client's previously accepted nonce.
+    nonce: int
+    op: str  # put | delete | link | unlink | put_file
+    # Operation arguments; the keys depend on ``op``.
+    payload: dict
+    # Timestamp given to append(), or time.time() when none was given.
+    ts: float
+    # Optional idempotency key supplied by the caller.
+    idem: Optional[str]
+    # Head hash of the log just before this op was appended.
+    prev_hash: str
+    # Hash of prev_hash plus the canonical encoding of the fields above.
+    hash: str
+class OpLog:
+    def __init__(self) -> None:
+        self.ops: List[Op] = []
+        self._last_nonce: Dict[str, int] = {}
+        self._idem: Dict[str, int] = {}  # idem key -> seq of original op
+        self._head = GENESIS
+    def append(
+        self,
+        client: str,
+        nonce: int,
+        op: str,
+        payload: dict,
+        idem: Optional[str] = None,
+        ts: Optional[float] = None,
+    ) -> Tuple[Op, bool]:
+        """Append an op. Returns (op, created). `created` is False when the op was
+        deduplicated by its idempotency key (a no-op replay-safe return)."""
+        # Idempotency: a known key returns the original op without re-appending.
+        if idem is not None and idem in self._idem:
+            return self.ops[self._idem[idem]], False
+        # Replay protection: nonce must strictly exceed the client's last nonce.
+        last = self._last_nonce.get(client, 0)
+        if nonce <= last:
+            raise ReplayError(
+                f"replay/stale nonce for client '{client}': {nonce} <= {last}"
+            )
+        seq = len(self.ops)
+        ts = time.time() if ts is None else ts
+        body = {
+            "seq": seq, "client": client, "nonce": nonce,
+            "op": op, "payload": payload, "ts": ts, "idem": idem,
+        }
+        h = blake(self._head.encode() + canon(body))
+        rec = Op(seq, client, nonce, op, payload, ts, idem, self._head, h)
+        self.ops.append(rec)
+        self._last_nonce[client] = nonce
+        if idem is not None:
+            self._idem[idem] = seq
+        self._head = h
+        return rec, True
+    def verify(self) -> bool:
+        """Re-walk the chain and confirm no op has been tampered with."""
+        prev = GENESIS
+        for o in self.ops:
+            body = {
+                "seq": o.seq, "client": o.client, "nonce": o.nonce,
+                "op": o.op, "payload": o.payload, "ts": o.ts, "idem": o.idem,
+            }
+            if o.prev_hash != prev:
+                return False
+            if o.hash != blake(prev.encode() + canon(body)):
+                return False
+            prev = o.hash
+        return True
+    @property
+    def head(self) -> str:
+        return self._head
+    def slice_until(self, as_of: int) -> List[Op]:
+        return [o for o in self.ops if o.seq <= as_of]
+    def __len__(self) -> int:
+        return len(self.ops)

nedb/merkle.py ADDED Viewed

@@ -0,0 +1,62 @@
+"""
+nedb.merkle — Merkle tree over content-addressed chunk hashes.
+Because every file version is a list of BLAKE-addressed chunks, a file version has
+a Merkle root that commits to its exact bytes. Any chunk's membership is provable in
+O(log n), and the root can be anchored on-chain (e.g. ITC) for tamper-evident,
+notarized version history.
+"""
+from __future__ import annotations
+import hashlib
+from typing import List, Tuple
+def _h(b: bytes) -> bytes:
+    return hashlib.blake2b(b, digest_size=32).digest()
+def _to_bytes(x) -> bytes:
+    return bytes.fromhex(x) if isinstance(x, str) else bytes(x)
+def merkle_root(leaves: List[str]) -> str:
+    if not leaves:
+        return "0" * 64
+    level = [_to_bytes(x) for x in leaves]
+    while len(level) > 1:
+        nxt = []
+        for i in range(0, len(level), 2):
+            a = level[i]
+            b = level[i + 1] if i + 1 < len(level) else level[i]
+            nxt.append(_h(a + b))
+        level = nxt
+    return level[0].hex()
+def merkle_proof(leaves: List[str], idx: int) -> List[Tuple[str, str]]:
+    """Return inclusion proof for leaf at idx: list of (sibling_hex, side)."""
+    level = [_to_bytes(x) for x in leaves]
+    path: List[Tuple[str, str]] = []
+    while len(level) > 1:
+        if idx % 2 == 0:
+            sib = level[idx + 1] if idx + 1 < len(level) else level[idx]
+            path.append((sib.hex(), "R"))
+        else:
+            path.append((level[idx - 1].hex(), "L"))
+        nxt = []
+        for i in range(0, len(level), 2):
+            a = level[i]
+            b = level[i + 1] if i + 1 < len(level) else level[i]
+            nxt.append(_h(a + b))
+        level = nxt
+        idx //= 2
+    return path
+def merkle_verify(leaf: str, path: List[Tuple[str, str]], root: str) -> bool:
+    h = _to_bytes(leaf)
+    for sib_hex, side in path:
+        sib = _to_bytes(sib_hex)
+        h = _h(h + sib) if side == "R" else _h(sib + h)
+    return h.hex() == root