nitrodb 2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nedb/__init__.py +92 -0
- nedb/autoindex.py +142 -0
- nedb/backends/__init__.py +0 -0
- nedb/backends/redis_backend.py +115 -0
- nedb/cascade.py +130 -0
- nedb/concurrent.py +218 -0
- nedb/crypto.py +294 -0
- nedb/engine.py +783 -0
- nedb/index.py +98 -0
- nedb/log.py +216 -0
- nedb/merkle.py +62 -0
- nedb/mongo.py +824 -0
- nedb/proof.py +126 -0
- nedb/query.py +305 -0
- nedb/redis_compat.py +516 -0
- nedb/relations.py +51 -0
- nedb/resp2.py +250 -0
- nedb/server.py +1011 -0
- nedb/snapshot.py +216 -0
- nedb/sql.py +430 -0
- nedb/store.py +68 -0
- nedb/wrap_redis.py +725 -0
- nitrodb-2.4.3.dist-info/METADATA +64 -0
- nitrodb-2.4.3.dist-info/RECORD +27 -0
- nitrodb-2.4.3.dist-info/WHEEL +4 -0
- nitrodb-2.4.3.dist-info/entry_points.txt +2 -0
- nitrodb-2.4.3.dist-info/licenses/LICENSE +65 -0
nedb/__init__.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NEDB — a versioned, self-compressing, time-traveling embedded database.
|
|
3
|
+
|
|
4
|
+
* Replay-protected & idempotent: every write carries a monotonic nonce and an
|
|
5
|
+
optional idempotency key, enforced by a hash-chained append-only log.
|
|
6
|
+
* Time-travel: read the database AS OF any past sequence number.
|
|
7
|
+
* Relational: first-class, time-travel-aware relations with O(1) traversal.
|
|
8
|
+
* Filterable / sortable / searchable: equality, ordered, and full-text indexes.
|
|
9
|
+
* Queryable: NQL text queries and a fluent builder that share one plan.
|
|
10
|
+
* git-style files with Cascade compression: content-defined chunking + dedup +
|
|
11
|
+
temperature tiers, with a Merkle root per version anchorable on-chain.
|
|
12
|
+
|
|
13
|
+
The pure-Python package is the reference implementation and the always-works
|
|
14
|
+
fallback. When installed from a platform wheel, the compiled Rust core is available
|
|
15
|
+
as ``nedb._native`` (``nedb.__has_native__`` reports whether it loaded).
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from .engine import NEDB
|
|
20
|
+
from .log import Op, OpLog, ReplayError
|
|
21
|
+
from .query import Query, parse_nql
|
|
22
|
+
from .snapshot import save_snapshot, load_snapshot
|
|
23
|
+
from .crypto import resolve_tmk, rewrap_dek
|
|
24
|
+
from .sql import sql_exec, sql_to_nql, SQLError, SQLUnsupportedError
|
|
25
|
+
from .redis_compat import RedisCompat, RedisError, RedisUnsupportedError
|
|
26
|
+
from .mongo import (
|
|
27
|
+
MongoCompat, MongoClient, MongoError, MongoUnsupportedError, ObjectId,
|
|
28
|
+
)
|
|
29
|
+
from .autoindex import AutoIndexDB
|
|
30
|
+
from .concurrent import Sequencer
|
|
31
|
+
from .wrap_redis import wrap_redis, WrappedRedis
|
|
32
|
+
from .proof import verify_proof, fold_head
|
|
33
|
+
|
|
34
|
+
try:  # compiled Rust core, present in platform wheels (PyO3 via maturin)
    from . import _native  # type: ignore
    __has_native__ = True
except ImportError:  # pure-Python install (sdist / unsupported platform)
    # Provide a stub module so `from nedb._native import NedbCore` raises an
    # informative error instead of a bare ImportError with no guidance.
    import os as _os
    import sys as _sys
    import types as _types

    # sys.executable may be None or "" (e.g. embedded interpreters), so
    # normalise it before lower-casing; otherwise importing the package
    # itself would fail with AttributeError.
    _is_msys2 = bool(_os.environ.get("MSYSTEM")) or "mingw" in (_sys.executable or "").lower()
    del _os

    class _NativeStub(_types.ModuleType):
        """Placeholder for ``nedb._native`` when the compiled core is missing.

        Any regular attribute access raises ``ImportError`` carrying
        installation guidance. Dunder lookups raise ``AttributeError`` so that
        ``hasattr``/``getattr(..., default)`` probes of ``__path__``,
        ``__file__`` and similar keep working.
        """

        # Primary fix: install the Rust crate → get the nedbd server → use HTTP mode.
        # Secondary fix (CPython only): pip reinstall to get the platform wheel with _native embedded.
        _MSG_MSYS2 = (
            "\n\n"
            " nedb._native (embedded v2 DAG core) is not available on MSYS2/MinGW Python.\n\n"
            " To use NEDB v2 features, install the server binary and use HTTP mode:\n\n"
            " cargo install nedb-engine # install nedbd v2 server\n"
            " nedbd --dag ./data # start DAG server\n"
            " NEDB_URL=http://localhost:7070 python3 your_script.py\n\n"
            " Run 'nedbd --doctor' for a full diagnosis.\n"
        )
        _MSG_OTHER = (
            "\n\n"
            " nedb._native (embedded v2 DAG core) is not available.\n"
            " You have the universal wheel — reinstall to get the platform wheel:\n\n"
            " pip install --force-reinstall --no-cache-dir nedb-engine\n\n"
            " Or install the server binary and use HTTP mode (works everywhere):\n\n"
            " cargo install nedb-engine # install nedbd v2 server\n"
            " nedbd --dag ./data # start DAG server\n"
            " NEDB_URL=http://localhost:7070 python3 your_script.py\n\n"
            " Run 'nedbd --doctor' for a full diagnosis.\n"
        )
        # Chosen once, at class-creation time, from the platform probe above.
        _MSG = _MSG_MSYS2 if _is_msys2 else _MSG_OTHER

        def __getattr__(self, name: str):
            """Raise guidance for missing native symbols.

            Raises:
                AttributeError: for dunder names, so protocol/introspection
                    probes see an ordinary "attribute missing" result.
                ImportError: for every other name, with install instructions.
            """
            if name.startswith("__") and name.endswith("__"):
                raise AttributeError(
                    f"module {self.__name__!r} has no attribute {name!r}"
                )
            raise ImportError(f"nedb._native.{name} is not available.{self._MSG}")

    _native_stub = _NativeStub("nedb._native")
    _native_stub.__package__ = "nedb"
    # Register the stub so `import nedb._native` resolves to it.
    _sys.modules["nedb._native"] = _native_stub  # type: ignore
    _native = _native_stub  # type: ignore
    __has_native__ = False
    del _types, _sys, _NativeStub, _native_stub
|
|
80
|
+
|
|
81
|
+
# Names exported by ``from nedb import *``. The list deliberately includes
# ``_native`` and ``__has_native__``, which a star-import would otherwise skip
# because of their leading underscores.
__all__ = [
    "NEDB", "OpLog", "Op", "ReplayError", "Query", "parse_nql",
    "save_snapshot", "load_snapshot",
    "sql_exec", "sql_to_nql", "SQLError", "SQLUnsupportedError",
    "RedisCompat", "RedisError", "RedisUnsupportedError",
    "MongoCompat", "MongoClient", "MongoError", "MongoUnsupportedError", "ObjectId",
    "AutoIndexDB", "Sequencer",
    "wrap_redis", "WrappedRedis",
    "verify_proof", "fold_head",
    "_native", "__has_native__",
]
# Package version string.
__version__ = "2.4.3"
|
nedb/autoindex.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""
|
|
2
|
+
nedb.autoindex — automatic index management.
|
|
3
|
+
|
|
4
|
+
Wraps a NEDB instance and intercepts query() calls. It tracks which fields are
|
|
5
|
+
used in WHERE and ORDER BY clauses per collection. Once a field reaches the
|
|
6
|
+
usage threshold it auto-creates the appropriate index:
|
|
7
|
+
|
|
8
|
+
- Equality conditions (= / !=) → "eq" index
|
|
9
|
+
- Ordered comparisons (< > ≤ ≥) → "ordered" index
|
|
10
|
+
- ORDER BY field → "ordered" index
|
|
11
|
+
- SEARCH clause on a field → deferred (no per-field signal in NQL)
|
|
12
|
+
|
|
13
|
+
Usage::
|
|
14
|
+
|
|
15
|
+
from nedb import NEDB
|
|
16
|
+
from nedb.autoindex import AutoIndexDB
|
|
17
|
+
|
|
18
|
+
db = AutoIndexDB(NEDB("./data"), threshold=3)
|
|
19
|
+
db.query('FROM users WHERE status = "active"') # tallied
|
|
20
|
+
db.query('FROM users WHERE status = "active"')
|
|
21
|
+
db.query('FROM users WHERE status = "active"') # threshold reached → index created
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import re
|
|
26
|
+
from collections import defaultdict
|
|
27
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
_WHERE_RE = re.compile(r"\bWHERE\b([\s\S]*?)(?:\bSEARCH\b|\bORDER\b|\bTRAVERSE\b|\bLIMIT\b|$)", re.IGNORECASE)
|
|
31
|
+
_ORDER_RE = re.compile(r"\bORDER\s+BY\s+(\w+)", re.IGNORECASE)
|
|
32
|
+
_FROM_RE = re.compile(r"\bFROM\s+(\w+)", re.IGNORECASE)
|
|
33
|
+
_COND_RE = re.compile(r"(\w+)\s*(=|!=|<>|<=|>=|<|>)", re.IGNORECASE)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _parse_signals(nql: str) -> List[Tuple[str, str, str]]:
|
|
37
|
+
"""Return [(collection, field, 'eq'|'ordered')] from a NQL query string."""
|
|
38
|
+
signals = []
|
|
39
|
+
fm = _FROM_RE.search(nql)
|
|
40
|
+
if not fm:
|
|
41
|
+
return signals
|
|
42
|
+
coll = fm.group(1)
|
|
43
|
+
|
|
44
|
+
wm = _WHERE_RE.search(nql)
|
|
45
|
+
if wm:
|
|
46
|
+
for m in _COND_RE.finditer(wm.group(1)):
|
|
47
|
+
field, op = m.group(1), m.group(2)
|
|
48
|
+
kind = "eq" if op in ("=", "!=", "<>") else "ordered"
|
|
49
|
+
signals.append((coll, field, kind))
|
|
50
|
+
|
|
51
|
+
om = _ORDER_RE.search(nql)
|
|
52
|
+
if om:
|
|
53
|
+
signals.append((coll, om.group(1), "ordered"))
|
|
54
|
+
|
|
55
|
+
return signals
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class AutoIndexDB:
    """
    NEDB wrapper that creates indexes automatically based on query usage.

    Attributes not defined on the wrapper are forwarded to the wrapped
    database object, so the wrapper can be used wherever the database is.

    Parameters
    ----------
    db : NEDB
        A NEDB database instance (embedded or opened with a path).
    threshold : int
        Number of times a (collection, field, kind) combination must be
        observed before the index is created. Default: 5.
    verbose : bool
        Print a message when an index is auto-created. Default: False.
    """

    def __init__(self, db: Any, threshold: int = 5, verbose: bool = False):
        self._db = db
        self.threshold = threshold
        self.verbose = verbose
        # counts[(coll, field, kind)] = n
        self._counts: Dict[Tuple[str, str, str], int] = defaultdict(int)
        # indexes already created so we don't re-create
        self._created: set = set()
        # Seed from existing index config if available
        # NOTE(review): assumes db.indexes.config iterates as
        # (collection, field, kind) triples — confirm against the engine.
        if hasattr(db, "indexes") and hasattr(db.indexes, "config"):
            for coll, field, kind in db.indexes.config:
                self._created.add((coll, field, kind))

    # ── Proxy every NEDB attribute ────────────────────────────────────────────

    def __getattr__(self, name: str) -> Any:
        """Forward unknown attributes to the wrapped database.

        ``__getattr__`` only runs after normal lookup fails. If ``_db`` itself
        is missing (instance created without ``__init__``, e.g. while being
        copied or unpickled) forwarding would recurse forever, so that case
        raises ``AttributeError`` instead.
        """
        if name == "_db":
            raise AttributeError(name)
        return getattr(self._db, name)

    # ── Instrumented query ────────────────────────────────────────────────────

    def query(self, nql: str) -> List[dict]:
        """Execute a NQL query, tally field usage, and auto-create indexes.

        Tallying and any index creation happen before the query is forwarded
        to the wrapped database; the wrapped database's result is returned
        unchanged.
        """
        for coll, field, kind in _parse_signals(nql):
            key = (coll, field, kind)
            if key in self._created:
                continue
            # "eq" and "ordered" usage are tallied independently: each kind
            # reaches the threshold (and gets its index) on its own count.
            self._counts[key] += 1
            if self._counts[key] >= self.threshold:
                self._auto_create(coll, field, kind)

        return self._db.query(nql)

    def _auto_create(self, coll: str, field: str, kind: str) -> None:
        """Create the index once; skip if it exists or the field is internal."""
        key = (coll, field, kind)
        if key in self._created:
            return
        # Don't index internal NEDB fields
        if field.startswith("_") and field not in ("_id",):
            return
        self._db.create_index(coll, field, kind)
        self._created.add(key)
        if self.verbose:
            print(f"[autoindex] created {kind} index on {coll}.{field} (threshold={self.threshold})")

    # ── Manual analysis ───────────────────────────────────────────────────────

    def analyze(self) -> Dict[str, Any]:
        """Return current tallies and the indexes already created."""
        return {
            "tallies": {f"{c}.{f} ({k})": n for (c, f, k), n in self._counts.items()},
            "indexes_created": [f"{c}.{f} ({k})" for (c, f, k) in sorted(self._created)],
            "threshold": self.threshold,
        }

    def suggest(self) -> List[str]:
        """Return one line per tallied-but-not-yet-created index, highest count first."""
        out = []
        for (coll, field, kind), count in sorted(self._counts.items(), key=lambda x: -x[1]):
            if (coll, field, kind) not in self._created:
                out.append(f"{coll}.{field} ({kind}) — {count}/{self.threshold} queries")
        return out
|
|
File without changes
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
nedb.backends.redis_backend — Redis Streams as the NEDB append-only log.
|
|
3
|
+
|
|
4
|
+
Alice's existing Redis keys are NEVER touched. NEDB operates in a strictly
|
|
5
|
+
isolated namespace:
|
|
6
|
+
|
|
7
|
+
nedb:{db_name}:oplog Redis Stream — hash-chained op log
|
|
8
|
+
nedb:{db_name}:snapshot Redis Hash — checkpoint for fast restart
|
|
9
|
+
nedb:{db_name}:events Pub/Sub chan — live subscriptions (future)
|
|
10
|
+
nedb:{db_name}:meta Redis Hash — version, index config
|
|
11
|
+
|
|
12
|
+
On startup NEDB replays the stream to rebuild its in-memory MVCC store.
|
|
13
|
+
On every write a new entry is XADD'd. One Redis connection, zero impact on
|
|
14
|
+
the user's existing keys.
|
|
15
|
+
|
|
16
|
+
© INTERCHAINED LLC × Claude Sonnet 4.6
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
from typing import Any, Dict, List, Optional
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class RedisBackend:
    """
    Redis-Streams-backed persistence for NEDB.

    Pass an instance to NEDB as the `backend` parameter::

        import redis
        from nedb.backends.redis_backend import RedisBackend
        from nedb import NEDB

        r = redis.Redis("localhost", 6379)
        db = NEDB(backend=RedisBackend(r, "rideshare"))

    Replies are accepted both as ``bytes`` (the default for a Redis client)
    and as ``str`` (a client created with ``decode_responses=True``).
    """

    def __init__(self, r: Any, db_name: str):
        self._r = r
        self.db_name = db_name
        # All keys live under the nedb:{db_name}: prefix.
        self.stream = f"nedb:{db_name}:oplog"
        self.snap_key = f"nedb:{db_name}:snapshot"
        self.meta_key = f"nedb:{db_name}:meta"
        self.events_ch = f"nedb:{db_name}:events"

    # ── Reply decoding helpers ──────────────────────────────────────────────────

    @staticmethod
    def _text(value: Any) -> str:
        """Return ``value`` as ``str``, decoding it when the client sent bytes."""
        if isinstance(value, (bytes, bytearray)):
            return value.decode()
        return value

    @classmethod
    def _op_of(cls, entry: Any) -> str:
        """Extract the ``op`` field of one stream entry as ``str``.

        ``entry[1]`` is the entry's field mapping; its key is ``b"op"`` or
        ``"op"`` depending on whether the client decodes responses.
        """
        fields = entry[1]
        try:
            raw = fields[b"op"]
        except KeyError:
            raw = fields["op"]
        return cls._text(raw)

    # ── Op log ──────────────────────────────────────────────────────────────────

    def append(self, op_json: str) -> None:
        """Append one JSON-serialised op to the stream."""
        self._r.xadd(self.stream, {"op": op_json})

    def append_batch(self, ops: List[str]) -> None:
        """Append multiple ops in a single pipeline (one round-trip)."""
        pipe = self._r.pipeline(transaction=False)
        for op_json in ops:
            pipe.xadd(self.stream, {"op": op_json})
        pipe.execute()

    def read_all(self) -> List[str]:
        """Return all ops from the stream in insertion order."""
        entries = self._r.xrange(self.stream, "-", "+")
        return [self._op_of(e) for e in entries]

    def read_after(self, last_id: str = "0") -> List[str]:
        """Return ops appended after `last_id` (for incremental replay)."""
        # The "(" prefix makes the lower bound exclusive.
        entries = self._r.xrange(self.stream, f"({last_id}", "+")
        return [self._op_of(e) for e in entries]

    # ── Snapshot / checkpoint ────────────────────────────────────────────────────

    def save_snapshot(self, data: Dict[str, Any]) -> None:
        """Persist a checkpoint so restart replay only needs the delta.

        Each value is stored as compact JSON in its own hash field; values
        JSON cannot encode are stringified (``default=str``).
        """
        # NOTE(review): hset only adds/overwrites fields; fields written by an
        # earlier snapshot and absent from `data` stay in the hash — confirm
        # that is acceptable for the snapshot schema.
        self._r.hset(self.snap_key, mapping={
            k: json.dumps(v, separators=(",", ":"), default=str)
            for k, v in data.items()
        })

    def load_snapshot(self) -> Optional[Dict[str, Any]]:
        """Load the last checkpoint, or None if none exists."""
        raw = self._r.hgetall(self.snap_key)
        if not raw:
            return None
        return {self._text(k): json.loads(v) for k, v in raw.items()}

    # ── Pub/sub live events ──────────────────────────────────────────────────────

    def publish_ops(self, ops: List[str]) -> None:
        """Publish committed ops to the events channel for live subscribers."""
        if ops:
            payload = json.dumps(ops, separators=(",", ":"))
            self._r.publish(self.events_ch, payload)

    # ── Meta ─────────────────────────────────────────────────────────────────────

    def save_meta(self, meta: Dict[str, Any]) -> None:
        """Store each meta entry as compact JSON in the meta hash."""
        self._r.hset(self.meta_key, mapping={
            k: json.dumps(v, separators=(",", ":"), default=str)
            for k, v in meta.items()
        })

    def load_meta(self) -> Dict[str, Any]:
        """Return the meta hash with JSON-decoded values, or {} if empty."""
        raw = self._r.hgetall(self.meta_key)
        if not raw:
            return {}
        return {self._text(k): json.loads(v) for k, v in raw.items()}

    # ── Utility ──────────────────────────────────────────────────────────────────

    def stream_len(self) -> int:
        """Return the number of entries in the op-log stream."""
        return self._r.xlen(self.stream)

    def flush(self) -> None:
        """Delete all NEDB shadow keys for this database (non-destructive to user keys)."""
        for key in [self.stream, self.snap_key, self.meta_key]:
            self._r.delete(key)
|
nedb/cascade.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""
|
|
2
|
+
nedb.cascade — the Cascade compression pipeline + content-addressed blob store.
|
|
3
|
+
|
|
4
|
+
This is what makes NEDB double as a git-style file manager with maximum compression
|
|
5
|
+
WITHOUT inventing a new entropy coder. The novelty is the pipeline composition:
|
|
6
|
+
|
|
7
|
+
1. Content-defined chunking (Gear rolling hash) — boundaries follow content, so a
|
|
8
|
+
one-byte insert only changes the chunk(s) around it, not everything after it.
|
|
9
|
+
2. Content-addressed dedup (BLAKE) — identical chunks across all files and all
|
|
10
|
+
versions are stored exactly once.
|
|
11
|
+
3. Temperature tiers — warm data uses a fast codec (zstd in prod; zlib in this
|
|
12
|
+
reference), cold/archival history uses a maximum-ratio codec (LZMA).
|
|
13
|
+
|
|
14
|
+
The production pipeline adds similarity-picked binary deltas (zstd --patch-from) and
|
|
15
|
+
schema-aware columnar transforms before the entropy stage; both are documented in
|
|
16
|
+
docs/SPEC.md and stubbed for the reference engine.
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import hashlib
|
|
21
|
+
import lzma
|
|
22
|
+
import random
|
|
23
|
+
import zlib
|
|
24
|
+
from typing import Dict, List
|
|
25
|
+
|
|
26
|
+
from .merkle import merkle_root
|
|
27
|
+
|
|
28
|
+
# --- Gear-hash content-defined chunking -------------------------------------
|
|
29
|
+
_MASK = (1 << 13) - 1  # ~8 KiB average chunk
_MIN = 2 * 1024
_MAX = 64 * 1024
_M64 = 0xFFFFFFFFFFFFFFFF
# One fixed 64-bit value per byte value, derived from fixed seeds.
_GEAR = [random.Random(0x12345678 + i).getrandbits(64) for i in range(256)]


def chunk(data: bytes) -> List[bytes]:
    """Split ``data`` into content-defined chunks using a Gear rolling hash.

    The hash restarts at every chunk start. A chunk ends at the first
    position, at least ``_MIN`` bytes in, where the low bits selected by
    ``_MASK`` are all zero, or after ``_MAX`` bytes, or at the end of the
    input, whichever comes first. Concatenating the result gives back
    ``data``; empty input gives an empty list.
    """
    pieces: List[bytes] = []
    total = len(data)
    start = 0
    while start < total:
        hard_stop = min(start + _MAX, total)
        end = hard_stop  # used when no hash boundary is found in the window
        rolling = 0
        for offset in range(start, hard_stop):
            rolling = ((rolling << 1) + _GEAR[data[offset]]) & _M64
            consumed = offset + 1 - start
            if consumed >= _MIN and not (rolling & _MASK):
                end = offset + 1
                break
        pieces.append(data[start:end])
        start = end
    return pieces
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _blake(b: bytes) -> str:
|
|
57
|
+
return hashlib.blake2b(b, digest_size=32).hexdigest()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# --- temperature tiers ------------------------------------------------------
|
|
61
|
+
def warm_compress(b: bytes) -> bytes:  # zstd stand-in in the reference
    """Compress ``b`` for the warm tier with zlib at level 6."""
    level = 6
    packed = zlib.compress(b, level)
    return packed
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def warm_decompress(b: bytes) -> bytes:
    """Inflate a zlib stream and return the original bytes."""
    restored = zlib.decompress(b)
    return restored
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def cold_compress(b: bytes) -> bytes:  # real LZMA — the maximum-ratio archival tier
    """Compress ``b`` for the cold tier with LZMA preset 9 plus the EXTREME flag."""
    archival_preset = lzma.PRESET_EXTREME | 9
    return lzma.compress(b, preset=archival_preset)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def cold_decompress(b: bytes) -> bytes:
    """Decompress an LZMA stream and return the original bytes."""
    restored = lzma.decompress(b)
    return restored
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class BlobStore:
    """Content-addressed, deduplicated, tiered blob store with versioned files.

    Each file version is a "recipe": the ordered list of chunk hashes that
    reassemble it. Chunks are stored once, compressed, keyed by their hash.
    ``tier`` selects the codec for newly stored chunks ("cold" → cold codec,
    anything else → warm codec); reading detects the codec per chunk, so
    changing ``tier`` later does not make earlier chunks unreadable.
    """

    # Magic bytes that open every XZ container (the format lzma.compress
    # emits by default). A zlib stream cannot begin with these bytes.
    _XZ_MAGIC = b"\xfd7zXZ\x00"

    def __init__(self, tier: str = "warm") -> None:
        self.tier = tier
        self.chunks: Dict[str, bytes] = {}  # hash -> compressed bytes
        self.files: Dict[str, Dict[str, list]] = {}  # name -> {versions, roots}
        self.logical_bytes = 0  # total size of all data passed to put_file
        self.dedup_hits = 0  # chunks that were already present when stored

    def _compress(self, b: bytes) -> bytes:
        """Compress ``b`` with the codec of the store's current tier."""
        return cold_compress(b) if self.tier == "cold" else warm_compress(b)

    def _decompress(self, b: bytes) -> bytes:
        """Decompress a stored chunk with the codec it was written with.

        The codec is identified from the chunk's own header, not from
        ``self.tier``, because the tier may have changed since the write.
        """
        if b[:len(self._XZ_MAGIC)] == self._XZ_MAGIC:
            return cold_decompress(b)
        return warm_decompress(b)

    def put_file(self, name: str, data: bytes) -> int:
        """Store ``data`` as a new version of ``name``; return its version index."""
        recipe: List[str] = []
        for c in chunk(data):
            hh = _blake(c)
            recipe.append(hh)
            if hh in self.chunks:
                self.dedup_hits += 1
            else:
                self.chunks[hh] = self._compress(c)
        self.logical_bytes += len(data)
        f = self.files.setdefault(name, {"versions": [], "roots": []})
        f["versions"].append(recipe)
        # One Merkle root per version, computed over the recipe's chunk hashes.
        f["roots"].append(merkle_root(recipe))
        return len(f["versions"]) - 1

    def get_file(self, name: str, version: int = -1) -> bytes:
        """Reassemble a stored version of ``name`` (default: the latest).

        Raises ``KeyError`` for an unknown name and ``IndexError`` for a
        version index that does not exist.
        """
        recipe = self.files[name]["versions"][version]
        out = bytearray()
        for hh in recipe:
            out += self._decompress(self.chunks[hh])
        return bytes(out)

    def root(self, name: str, version: int = -1) -> str:
        """Return the Merkle root recorded for a version (default: the latest)."""
        return self.files[name]["roots"][version]

    def stored_bytes(self) -> int:
        """Return the total size of all compressed chunks."""
        return sum(len(v) for v in self.chunks.values())

    def stats(self) -> dict:
        """Return store statistics; ``ratio`` is logical/stored, 0.0 when empty."""
        stored = self.stored_bytes()
        return {
            "tier": self.tier,
            "unique_chunks": len(self.chunks),
            "dedup_hits": self.dedup_hits,
            "logical_bytes": self.logical_bytes,
            "stored_bytes": stored,
            "ratio": round(self.logical_bytes / stored, 2) if stored else 0.0,
        }
|