nitrodb 2.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nedb/engine.py ADDED
@@ -0,0 +1,783 @@
1
+ """
2
+ nedb.engine — the NEDB database: log + MVCC store + relations + indexes + Cascade.
3
+
4
+ The OpLog is the source of truth. Every mutation appends an Op; `_apply` deterministically
5
+ folds an Op into the materialized state (store / relations / indexes). Because state is a
6
+ pure function of the log, we get crash recovery and determinism (rebuild) for free, and
7
+ "AS OF seq" time-travel because the log carries monotonic seqs.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import os
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ from .cascade import BlobStore
16
+ from . import snapshot as _snap
17
+ from . import crypto as _crypto
18
+ from .index import Indexes, tokenize
19
+ from .log import Op, OpLog, ReplayError, GENESIS, blake, canon # noqa: F401 (re-exported)
20
+ from typing import List as _List
21
+ from .merkle import merkle_proof, merkle_verify
22
+ from .query import Query, cmp, parse_nql
23
+ from .relations import Relations
24
+ from .store import MVCCStore
25
+
26
+
27
def apply_op(store: MVCCStore, relations: Relations, indexes: Indexes, op: Op,
             cause_map: Optional[Dict[int, List[int]]] = None) -> None:
    """Fold a single op into the materialized state.

    ``put`` and ``delete`` first drop the index entries of whatever document
    is currently stored under the key. ``put`` then stores and indexes a copy
    of the payload document stamped with ``_seq``; ``delete`` removes the key.
    ``link`` / ``unlink`` are forwarded to ``relations``. ``put_file`` (and
    any other op kind) leaves the state untouched.

    When ``cause_map`` is supplied, ``op.seq`` is appended under every seq
    listed in ``op.caused_by`` (the cause -> dependents reverse index).
    """
    kind = op.op
    payload = op.payload

    if kind in ("put", "delete"):
        key = payload["key"]
        coll = payload["coll"]
        if kind == "put":
            # Copy before stamping so the logged payload is never mutated.
            stored = dict(payload["doc"])
            stored["_seq"] = op.seq
        previous = store.get(key)
        if previous is not None:
            indexes.remove(coll, key, previous)
        if kind == "put":
            store.put(key, stored, op.seq)
            indexes.add(coll, key, stored)
        else:
            store.delete(key, op.seq)
    elif kind == "link":
        relations.link(payload["frm"], payload["rel"], payload["to"], op.seq)
    elif kind == "unlink":
        relations.unlink(payload["frm"], payload["rel"], payload["to"], op.seq)
    # "put_file": the log only records the root, so there is nothing to fold.

    if cause_map is None or not op.caused_by:
        return
    for cause_seq in op.caused_by:
        cause_map.setdefault(cause_seq, []).append(op.seq)
56
+
57
+
58
+ class NEDB:
59
def __init__(self, path: Optional[str] = None,
             tmk: Optional[bytes] = None) -> None:
    """Create a database, in-memory or durable.

    Without ``path`` the database lives purely in memory. With ``path``
    (a directory) every op is also appended to a hash-chained append-only
    log file and the state is rebuilt from that log on open; the log is
    opened in append mode and is not truncated by normal operation.

    ``tmk`` is handed to ``_crypto.resolve_tmk``; a data-encryption key is
    only loaded/created when that resolves to a key AND ``path`` is given.
    """
    # Materialized state and the log it is derived from.
    self.log = OpLog()
    self.store = MVCCStore()
    self.relations = Relations()
    self.indexes = Indexes()
    self.blobs: Dict[str, BlobStore] = {
        tier: BlobStore(tier) for tier in ("warm", "cold")
    }
    # Last nonce handed out per client (see _next).
    self._nonce: Dict[str, int] = {}
    # Reverse causal index: cause_seq -> [dependent_seq, ...]; filled by
    # apply_op whenever an op carries caused_by seqs.
    self.cause_map: Dict[int, List[int]] = {}

    # Persistence handles; _aof stays None for in-memory databases.
    self.path = path
    self._aof = None
    # True => _log_append skips the per-op flush/fsync and the caller is
    # expected to call flush() once per batch (group commit).
    self._defer_sync = False

    # Encryption: no DEK unless both a TMK and a directory are available.
    self._dek: Optional[bytes] = None
    master_key = _crypto.resolve_tmk(tmk)
    durable = path is not None
    if durable and master_key is not None:
        # The directory must exist before the DEK file is created; _open()
        # would create it too, but only runs after this block.
        os.makedirs(path, exist_ok=True)
        self._dek = _crypto.load_or_create_dek(path, master_key)
    if durable:
        self._open(path)
98
+
99
+ # --- persistence (AOF) --------------------------------------------------
100
def _open(self, path: str) -> None:
    """Open (or create) the durable database directory at ``path``.

    Loads existing state, encrypts a legacy plain-text log when a DEK is
    present, repairs a structurally broken chain, and finally opens the
    log file for appending.
    """
    os.makedirs(path, exist_ok=True)
    aof_path = os.path.join(path, "log.aof")
    meta_path = os.path.join(path, "meta.json")
    self._aof_path, self._meta_path = aof_path, meta_path

    if any(os.path.exists(p) for p in (aof_path, meta_path)):
        self._load()

    # With encryption enabled, make sure no plain-text log lines stay on disk.
    if self._dek is not None and os.path.exists(aof_path):
        self._backfill_encrypt_if_needed()

    # Returns immediately when the chain already verifies.
    self._self_heal_if_needed()

    # Mode "a": the existing log is never truncated.
    self._aof = open(aof_path, "a", encoding="utf-8")
117
+
118
def _backfill_encrypt_if_needed(self) -> None:
    """Rewrite a plain-text AOF fully encrypted under the current DEK.

    Only the FIRST non-empty line of the log is inspected:

    * an ``{"enc": 1, ...}`` envelope -> the log is treated as already
      encrypted and nothing happens;
    * any other JSON value -> every line is decoded and re-encoded into a
      temp file which then atomically replaces the log;
    * unparseable text, or an empty file -> nothing happens.

    After a rewrite any existing snapshot file is removed. This method
    deliberately does NOT checkpoint: the AOF handle is not open yet while
    ``_open`` runs, so a checkpoint op would advance the in-memory head
    without being persisted and leave a gap in the on-disk chain.
    """
    first_plain = None
    with open(self._aof_path, encoding="utf-8") as fh:
        for raw in fh:
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                env = json.loads(stripped)
            except ValueError:
                # Not JSON (json.JSONDecodeError is a ValueError): we cannot
                # tell what this is, so leave the log untouched.
                break
            if isinstance(env, dict) and env.get("enc") == 1:
                return  # already encrypted — nothing to do
            first_plain = stripped
            break

    if first_plain is None:
        return  # empty (or unrecognisable) AOF

    print(f" [nedb] backfill-encrypting existing log ({self._aof_path})…")
    tmp_path = self._aof_path + ".enc_tmp"
    try:
        with open(self._aof_path, encoding="utf-8") as fh_in, \
                open(tmp_path, "w", encoding="utf-8") as fh_out:
            for raw in fh_in:
                # Individual lines may already be encrypted (mixed state).
                decoded = _crypto.aof_decode(raw, self._dek)
                if not decoded:
                    continue
                fh_out.write(_crypto.aof_encode(decoded, self._dek) + "\n")
            fh_out.flush()
            os.fsync(fh_out.fileno())
        os.replace(tmp_path, self._aof_path)
    finally:
        # After a successful replace the temp name no longer exists; after a
        # failure this removes the partially written temp file.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
    print(" [nedb] AOF backfill-encrypt complete.")

    # Drop any stale (pre-encryption) snapshot so the next open rebuilds
    # purely from the re-encrypted AOF.
    snap = _snap._snap_path(self.path)
    if os.path.exists(snap):
        os.remove(snap)
174
+
175
+ # ── self-healing ────────────────────────────────────────────────────────
176
@staticmethod
def _op_body(o: Op) -> dict:
    """Return the hashed body of ``o`` as computed by ``OpLog._op_body``."""
    from .log import OpLog as _OL
    body = _OL._op_body(o)
    return body
181
+
182
def _rewrite_aof(self, ops: List[Op]) -> None:
    """Replace the AOF with one line per op in ``ops``.

    Lines are produced by ``_crypto.aof_encode`` with the current DEK. The
    content is written and fsync'd to a ``.heal_tmp`` sibling first and
    then moved over the log with ``os.replace``.
    """
    scratch = self._aof_path + ".heal_tmp"
    with open(scratch, "w", encoding="utf-8") as out:
        for entry in ops:
            serialized = json.dumps(entry.to_dict())
            out.write(_crypto.aof_encode(serialized, self._dek) + "\n")
        out.flush()
        os.fsync(out.fileno())
    os.replace(scratch, self._aof_path)
191
+
192
def _self_heal_if_needed(self) -> None:
    """Re-link a hash chain whose ops are intact but whose linkage is broken.

    Nothing happens when ``self.log.verify()`` passes. Otherwise every op is
    checked against its OWN stored ``prev_hash``
    (``hash == blake(prev_hash || canon(body))``):

    * any mismatch means op content changed — a warning is printed and
      nothing is rewritten, so ``verify()`` keeps reporting the problem;
    * if all ops are self-consistent, the chain is recomputed from
      ``GENESIS``, the in-memory log is replaced, the AOF is rewritten and
      any snapshot file is removed.
    """
    if self.log.verify():
        return  # healthy

    entries = self.log.ops
    for entry in entries:
        expected = blake(entry.prev_hash.encode() + canon(self._op_body(entry)))
        if entry.hash != expected:
            print(" [nedb] WARNING: chain verification failed and op content is "
                  "inconsistent — possible tampering. NOT auto-repairing.")
            return

    print(" [nedb] self-healing chain (structural break, content intact)…")
    head = GENESIS
    relinked: List[Op] = []
    for entry in entries:
        digest = blake(head.encode() + canon(self._op_body(entry)))
        # NOTE(review): only these nine positional fields are carried over.
        # If Op also stores caused_by / evidence / confidence / valid_* they
        # would be dropped here — confirm against Op's definition.
        relinked.append(Op(entry.seq, entry.client, entry.nonce, entry.op,
                           entry.payload, entry.ts, entry.idem, head, digest))
        head = digest

    self.log.ops = relinked
    self.log._head = head
    if self.path is not None and os.path.exists(self._aof_path):
        self._rewrite_aof(relinked)
        snap = _snap._snap_path(self.path)
        if os.path.exists(snap):
            os.remove(snap)  # stale: references the old head/seq
    print(f" [nedb] self-heal complete — verify={self.log.verify()} "
          f"head={self.head[:12]}…")
237
+
238
def _load(self) -> None:
    """Rebuild in-memory state from the snapshot and/or the AOF.

    Snapshot path: when ``_snap.load_snapshot`` returns a seq >= 0, the
    whole AOF is still read into the log, but only ops with a greater seq
    are folded into the state.

    Full-replay path: index configuration is read from ``meta.json``, then
    the AOF is read line by line. The first line that fails to decode or
    parse stops the read; the ops recovered so far are written back over
    the AOF (everything from the bad line on is discarded) and loading
    continues with them.
    """
    checkpoint_seq = _snap.load_snapshot(self)
    if checkpoint_seq >= 0:
        loaded: List[Op] = []
        if os.path.exists(self._aof_path):
            with open(self._aof_path, encoding="utf-8") as fh:
                decoded = (_crypto.aof_decode(raw, self._dek) for raw in fh)
                loaded = [Op.from_dict(json.loads(text)) for text in decoded if text]
        # The full log is kept; only ops newer than the checkpoint are
        # applied so nothing is folded twice.
        self.log.load(loaded)
        for entry in self.log.ops:
            if entry.seq > checkpoint_seq:
                apply_op(self.store, self.relations, self.indexes, entry, self.cause_map)
        self._nonce = dict(self.log._last_nonce)
        return

    # 1) index configuration
    if os.path.exists(self._meta_path):
        with open(self._meta_path, encoding="utf-8") as fh:
            index_specs = json.load(fh).get("indexes", [])
            for coll, field, kind in index_specs:
                self.indexes.ensure(coll, field, kind)

    # 2) the hash-chained op log, tolerating a corrupt tail
    recovered = []
    bad_entries = 0
    if os.path.exists(self._aof_path):
        with open(self._aof_path, encoding="utf-8") as fh:
            for raw in fh:
                try:
                    text = _crypto.aof_decode(raw, self._dek)
                    if text:
                        recovered.append(Op.from_dict(json.loads(text)))
                except Exception as exc:
                    bad_entries += 1
                    print(
                        f" [nedb] AOF self-repair: corrupt entry in"
                        f" {self._aof_path!r}"
                        f" ({type(exc).__name__}: {exc})"
                        f" -- truncating and recovering {len(recovered)} op(s)."
                    )
                    break
        if bad_entries:
            self._rewrite_aof(recovered)
            print(
                f" [nedb] AOF self-repair complete:"
                f" {len(recovered)} op(s) recovered,"
                f" {bad_entries} corrupt entry/entries removed."
            )
    self.log.load(recovered)

    # 3) fold every op into the state
    for entry in self.log.ops:
        apply_op(self.store, self.relations, self.indexes, entry, self.cause_map)

    # 4) resume nonces where the log left off
    self._nonce = dict(self.log._last_nonce)
298
+
299
def _persist_meta(self) -> None:
    """Write the index configuration to ``meta.json`` (durable databases only).

    The JSON is written and fsync'd to a ``.tmp`` sibling and then moved
    into place with ``os.replace``, so an error or crash while writing can
    no longer leave a truncated ``meta.json`` behind — the previous file
    stays intact until the new one is complete.
    """
    if self.path is None:
        return
    tmp_path = self._meta_path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump({"indexes": [list(t) for t in self.indexes.config]}, fh)
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp_path, self._meta_path)
    finally:
        # Gone after a successful replace; removes the partial file otherwise.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
304
+
305
def _log_append(self, client: str, nonce: int, op: str, payload: dict,
                idem: Optional[str] = None,
                caused_by: Optional[List[int]] = None,
                evidence: Optional[str] = None,
                confidence: Optional[float] = None,
                valid_from: Optional[str] = None,
                valid_to: Optional[str] = None):
    """Append an op to the in-memory log and mirror it to the AOF.

    Returns the ``(record, created)`` pair from ``self.log.append``. The
    record is written to the AOF only when ``created`` is true and an AOF
    handle is open; the write is flushed and fsync'd immediately unless
    ``_defer_sync`` is set.
    """
    provenance = {
        "caused_by": caused_by,
        "evidence": evidence,
        "confidence": confidence,
        "valid_from": valid_from,
        "valid_to": valid_to,
    }
    rec, created = self.log.append(client, nonce, op, payload, idem, **provenance)

    aof = self._aof
    if created and aof is not None:
        encoded = _crypto.aof_encode(json.dumps(rec.to_dict()), self._dek)
        aof.write(encoded + "\n")
        if not self._defer_sync:
            aof.flush()
            os.fsync(aof.fileno())
    return rec, created
324
+
325
def flush(self) -> None:
    """Flush and fsync the AOF handle; does nothing when no AOF is open."""
    aof = self._aof
    if aof is None:
        return
    aof.flush()
    os.fsync(aof.fileno())
330
+
331
def close(self) -> None:
    """Flush, fsync and close the append-only log.

    Does nothing when no AOF is open. The handle is closed and ``_aof`` is
    reset to ``None`` even if the flush or fsync raises, so a failed close
    cannot leak the file handle; the error still propagates to the caller.
    """
    aof = self._aof
    if aof is None:
        return
    try:
        aof.flush()
        os.fsync(aof.fileno())
    finally:
        # Clear the attribute first so it is reset even if close() raises.
        self._aof = None
        aof.close()
338
+
339
def __enter__(self) -> "NEDB":
    """Enter a ``with`` block; the database itself is the managed object."""
    return self
341
+
342
def __exit__(self, *exc) -> None:
    """Leave a ``with`` block: close the database; exceptions are not suppressed."""
    self.close()
344
+
345
def rewrap_key(self, old_tmk: bytes, new_tmk: bytes) -> None:
    """Rotate the master key by re-wrapping the DEK under ``new_tmk``.

    Both keys are passed through ``_crypto.resolve_tmk``, the on-disk DEK is
    re-wrapped via ``_crypto.rewrap_dek``, and the in-memory DEK is reloaded
    with the new key so the current session keeps working.

    Raises:
        ValueError: if the database is in-memory (no ``path``).

    Example::

        db.rewrap_key(old_tmk=bytes.fromhex("aa..."), new_tmk=bytes.fromhex("bb..."))
    """
    directory = self.path
    if directory is None:
        raise ValueError("Key rotation requires a durable NEDB(path) database.")
    previous_key = _crypto.resolve_tmk(old_tmk)
    next_key = _crypto.resolve_tmk(new_tmk)
    _crypto.rewrap_dek(directory, previous_key, next_key)
    self._dek = _crypto.load_or_create_dek(directory, next_key)
363
+
364
def checkpoint(self) -> str:
    """Take a snapshot checkpoint via ``_snap.save_snapshot``.

    Per the snapshot module's contract (not visible here), this records a
    checkpoint op in the chain and writes a snapshot next to the AOF so that
    later opens only replay ops newer than the checkpoint.

    Returns whatever ``_snap.save_snapshot(self)`` returns — documented as
    the head hash after the checkpoint op.

    Example::

        db = NEDB("./data")
        db.checkpoint()
        db.close()
        db2 = NEDB("./data")   # loads the snapshot, replays only newer ops
        assert db2.verify()
    """
    head_after = _snap.save_snapshot(self)
    return head_after
387
+
388
+ # --- nonce helper -------------------------------------------------------
389
def _next(self, client: str) -> int:
    """Advance and return the per-client nonce counter (first value is 1)."""
    nonces = self._nonce
    nonces[client] = nonces.get(client, 0) + 1
    return nonces[client]
393
+
394
+ # --- TTL helpers --------------------------------------------------------
395
@staticmethod
def _embed_ttl(doc: dict, ttl_s: Optional[float]) -> dict:
    """Return ``doc`` with an ``_expires_at`` timestamp ``ttl_s`` seconds ahead.

    With ``ttl_s`` of ``None`` the very same dict is returned; otherwise a
    shallow copy is stamped, leaving the input untouched.
    """
    if ttl_s is not None:
        import time
        stamped = dict(doc)
        stamped["_expires_at"] = time.time() + ttl_s
        return stamped
    return doc
403
+
404
def _check_ttl(self, coll: str, id: str, doc: Optional[dict]) -> Optional[dict]:
    """Lazy expiry: return ``doc`` unless its ``_expires_at`` has passed.

    An expired document is deleted by appending a ``delete`` op under the
    internal ``"__ttl__"`` client, and ``None`` is returned. The op returned
    by ``_log_append`` is the one that gets applied, and only when it was
    actually created — previously ``self.log.ops[-1]`` was applied
    unconditionally, which is a different op whenever nothing was appended.
    """
    if doc is None:
        return None
    exp = doc.get("_expires_at")
    if exp is None:
        return doc
    import time
    if time.time() > exp:
        key = f"{coll}:{id}"
        op, created = self._log_append("__ttl__", self._next("__ttl__"), "delete",
                                       {"key": key, "coll": coll, "id": id})
        if created:
            apply_op(self.store, self.relations, self.indexes, op, self.cause_map)
        return None
    return doc
420
+
421
@staticmethod
def _valid_at(doc: dict, date: str) -> bool:
    """Tell whether ``doc`` is valid at ``date`` (valid-time check).

    ``_valid_from`` and ``_valid_to`` are both optional and inclusive; a
    missing bound is open, so a document carrying neither is always valid.
    Values are compared with plain ``<`` / ``>``, which orders same-format
    ISO 8601 strings chronologically.
    """
    start = doc.get("_valid_from")
    end = doc.get("_valid_to")
    outside = (start is not None and date < start) or \
              (end is not None and date > end)
    return not outside
441
+
442
+ # --- mutations ----------------------------------------------------------
443
def put(self, coll: str, id: str, doc: dict, client: str = "local",
        nonce: Optional[int] = None, idem: Optional[str] = None,
        ttl_s: Optional[float] = None,
        caused_by: Optional[List[int]] = None,
        evidence: Optional[str] = None,
        confidence: Optional[float] = None,
        valid_from: Optional[str] = None,
        valid_to: Optional[str] = None) -> dict:
    """Write ``doc`` under ``coll:id`` and return the stored document.

    The caller's dict is copied, given an ``_id`` if it has none, optionally
    stamped with a TTL, and every non-``None`` provenance / valid-time
    argument is mirrored into it as a ``_``-prefixed field. The same values
    are also passed to ``_log_append``. The op is folded into the state only
    when the log reports it as newly created.
    """
    key = f"{coll}:{id}"
    record = dict(doc)
    record.setdefault("_id", id)
    record = self._embed_ttl(record, ttl_s)

    mirrored = (
        ("_caused_by", caused_by),
        ("_evidence", evidence),
        ("_confidence", confidence),
        ("_valid_from", valid_from),
        ("_valid_to", valid_to),
    )
    for field_name, value in mirrored:
        if value is not None:
            record[field_name] = value

    if nonce is None:
        nonce = self._next(client)
    payload = {"key": key, "coll": coll, "id": id, "doc": record}
    op, created = self._log_append(client, nonce, "put", payload, idem,
                                   caused_by=caused_by, evidence=evidence,
                                   confidence=confidence,
                                   valid_from=valid_from, valid_to=valid_to)
    if created:
        apply_op(self.store, self.relations, self.indexes, op, self.cause_map)
    return self.store.get(key)
471
+
472
def delete(self, coll: str, id: str, client: str = "local",
           nonce: Optional[int] = None, idem: Optional[str] = None) -> None:
    """Append a ``delete`` op for ``coll:id`` and fold it in if newly created."""
    if nonce is None:
        nonce = self._next(client)
    payload = {"key": f"{coll}:{id}", "coll": coll, "id": id}
    op, created = self._log_append(client, nonce, "delete", payload, idem)
    if not created:
        return
    apply_op(self.store, self.relations, self.indexes, op, self.cause_map)
480
+
481
def get(self, coll: str, id: str, as_of: Optional[int] = None) -> Optional[dict]:
    """Read ``coll:id``, at HEAD or as of a given seq.

    HEAD reads go through lazy TTL expiry; time-travel reads (``as_of``
    given) return the stored version as-is and never trigger expiry.
    """
    found = self.store.get(f"{coll}:{id}", as_of)
    if as_of is not None:
        return found
    return self._check_ttl(coll, id, found)
486
+
487
def expire(self, coll: str, id: str, ttl_s: float) -> bool:
    """Set or refresh the TTL of an existing document.

    The current HEAD version is re-written through ``put`` with ``ttl_s``.
    Returns ``False`` (and writes nothing) when the document is not found.
    """
    current = self.store.get(f"{coll}:{id}")
    if current is not None:
        self.put(coll, id, current, ttl_s=ttl_s)
        return True
    return False
494
+
495
def sweep(self) -> int:
    """Delete every document whose TTL has expired; return how many were deleted.

    Each expired key gets a ``delete`` op under the internal ``"__ttl__"``
    client. The op returned by ``_log_append`` is the one that is applied,
    and a key is only counted when that op was actually created — previously
    ``self.log.ops[-1]`` was applied and counted unconditionally. Keys are
    split with ``partition`` so a key without ``":"`` no longer raises.
    """
    import time
    now = time.time()
    deleted = 0
    # Snapshot the key list: the store is mutated while we walk it.
    for key in list(self.store.keys()):
        doc = self.store.get(key)
        if not (doc and isinstance(doc, dict)):
            continue
        expires_at = doc.get("_expires_at")
        if not (expires_at and now > expires_at):
            continue
        coll, _, id_ = key.partition(":")
        op, created = self._log_append("__ttl__", self._next("__ttl__"), "delete",
                                       {"key": key, "coll": coll, "id": id_})
        if created:
            apply_op(self.store, self.relations, self.indexes, op, self.cause_map)
            deleted += 1
    # NOTE(review): meta.json holds index configuration, which a sweep does
    # not change — kept for behavioral parity; confirm whether it is needed.
    if deleted and self.path:
        self._persist_meta()
    return deleted
511
+
512
+ # --- relations ----------------------------------------------------------
513
def link(self, frm: str, rel: str, to: str, client: str = "local",
         nonce: Optional[int] = None) -> None:
    """Append a ``link`` op (``frm`` -[``rel``]-> ``to``) and fold it in if new."""
    if nonce is None:
        nonce = self._next(client)
    edge = {"frm": frm, "rel": rel, "to": to}
    op, created = self._log_append(client, nonce, "link", edge)
    if not created:
        return
    apply_op(self.store, self.relations, self.indexes, op, self.cause_map)
519
+
520
def unlink(self, frm: str, rel: str, to: str, client: str = "local",
           nonce: Optional[int] = None) -> None:
    """Append an ``unlink`` op for the edge and fold it in if newly created."""
    if nonce is None:
        nonce = self._next(client)
    edge = {"frm": frm, "rel": rel, "to": to}
    op, created = self._log_append(client, nonce, "unlink", edge)
    if not created:
        return
    apply_op(self.store, self.relations, self.indexes, op, self.cause_map)
526
+
527
def neighbors(self, frm: str, rel: str, as_of: Optional[int] = None) -> List[str]:
    """Return the targets of ``rel`` edges leaving ``frm`` (optionally as of a seq)."""
    targets = self.relations.neighbors(frm, rel, as_of)
    return targets
529
+
530
def inbound(self, to: str, rel: str, as_of: Optional[int] = None) -> List[str]:
    """Return the sources of ``rel`` edges pointing at ``to`` (optionally as of a seq)."""
    sources = self.relations.inbound(to, rel, as_of)
    return sources
532
+
533
+ # --- indexes ------------------------------------------------------------
534
def create_index(self, coll: str, field: str, kind: str = "eq") -> None:
    """Register an index on ``coll.field`` and backfill it from HEAD.

    Every live document of the collection is fed to ``indexes.add``. Index
    configuration is not part of the op log, so it is then written out via
    ``_persist_meta`` for durable reloads.
    """
    self.indexes.ensure(coll, field, kind)
    live = ((key, self.store.get(key)) for key in self.store.keys(coll + ":"))
    for key, doc in live:
        if doc is None:
            continue
        self.indexes.add(coll, key, doc)
    self._persist_meta()
543
+
544
+ # --- queries ------------------------------------------------------------
545
def q(self, coll: str) -> Query:
    """Start a ``Query`` bound to this database and collection ``coll``."""
    builder = Query(self, coll)
    return builder
547
+
548
def query(self, nql: str) -> List[dict]:
    """Parse an NQL string with ``parse_nql`` and execute the resulting plan."""
    plan = parse_nql(nql)
    return self.execute(plan)
550
+
551
def execute(self, plan: dict) -> List[dict]:
    """Run a query plan and return the matching documents (or group rows).

    Plan keys read here: ``from`` (required), ``as_of``, ``where`` (list of
    ``(field, op, value)``), ``search``, ``order_by`` (``(field, dir)``),
    ``traverse``, ``trace`` / ``trace_reverse``, ``limit``, ``valid_as_of``,
    ``group_by`` and ``aggregate`` (``(fn, field)``).

    Stages, in order: candidate selection (search index, then equality
    index at HEAD, then collection scan) -> predicate filter -> ORDER BY ->
    TRAVERSE -> TRACE -> LIMIT -> VALID AS OF -> GROUP BY.

    Changes from the previous implementation:

    * TRACE ignores cause seqs that are not valid non-negative log indexes;
      a negative seq used to index the log from its end.
    * Forward TRACE starts from each row's own store key instead of
      rebuilding it from ``plan["from"]`` and ``_id`` (wrong after TRAVERSE
      or when ``_id`` differs from the id in the key).
    * Forward TRACE finds the starting seqs in a single reverse pass over
      the log and uses a deque for the BFS queue.
    """
    from collections import deque

    coll = plan["from"]
    as_of = plan.get("as_of")
    prefix = coll + ":"
    where = plan.get("where", [])
    search = plan.get("search")

    candidates: Optional[set] = None

    # 1) full-text search is usually most selective
    if search:
        sfields = self.indexes.search_fields(coll)
        if sfields:
            per_term = []
            for term in tokenize(search):
                hits: set = set()
                for f in sfields:
                    hits |= self.indexes.search_lookup(coll, f, term)
                per_term.append(hits)
            candidates = set.intersection(*per_term) if per_term else set()

    # 2) equality-index acceleration (HEAD reads only)
    if candidates is None and as_of is None:
        for (f, op, v) in where:
            if op == "=" and self.indexes.has_eq(coll, f):
                candidates = self.indexes.eq_lookup(coll, f, v)
                break

    # 3) fallback: scan the collection
    if candidates is None:
        candidates = set(self.store.keys(prefix, as_of))

    # Load + final predicate filter: guarantees correctness regardless of
    # which candidate path was taken.
    rows = []
    for key in candidates:
        doc = self.store.get(key, as_of)
        if doc is None:
            continue
        if all(cmp(doc.get(f), op, v) for (f, op, v) in where):
            if search and not self.indexes.search_fields(coll):
                # No search index: substring match over all field values.
                blob = " ".join(str(x) for x in doc.values()).lower()
                if not all(t in blob for t in tokenize(search)):
                    continue
            rows.append((key, doc))

    # ORDER BY — None sorts as its own group; mixed types fall back to str.
    ob = plan.get("order_by")
    if ob:
        field, direction = ob
        try:
            rows.sort(key=lambda kv: (kv[1].get(field) is None, kv[1].get(field)),
                      reverse=(direction == "DESC"))
        except TypeError:
            rows.sort(key=lambda kv: str(kv[1].get(field)), reverse=(direction == "DESC"))

    # TRAVERSE — replace rows with their (deduplicated) relation targets.
    if plan.get("traverse"):
        rel = plan["traverse"]
        seen, trav = set(), []
        for key, _ in rows:
            for nb in self.relations.neighbors(key, rel, as_of):
                if nb in seen:
                    continue
                seen.add(nb)
                d = self.store.get(nb, as_of)
                if d is not None:
                    trav.append((nb, d))
        rows = trav

    # TRACE caused_by — causal provenance traversal
    if plan.get("trace"):
        log_ops = self.log.ops
        if not plan.get("trace_reverse"):
            # Backward: follow _caused_by seqs transitively to the
            # documents written by those ops.
            visited_seqs: set = set()
            frontier = [d for _, d in rows]
            out_docs = []
            while frontier:
                doc = frontier.pop()
                for cause_seq in doc.get("_caused_by") or []:
                    if cause_seq in visited_seqs:
                        continue
                    visited_seqs.add(cause_seq)
                    # Seqs are used as log indexes; reject anything that is
                    # not a valid non-negative index.
                    if not (isinstance(cause_seq, int) and 0 <= cause_seq < len(log_ops)):
                        continue
                    cause_op = log_ops[cause_seq]
                    if cause_op.op != "put":
                        continue
                    cause_key = cause_op.payload.get("key", "")
                    # Read AS OF the cause seq: the version that existed
                    # when the causal link was made, not the current one.
                    cause_doc = self.store.get(cause_key, cause_seq)
                    if cause_doc is not None:
                        out_docs.append((cause_key, cause_doc))
                        frontier.append(cause_doc)
            rows = out_docs
        else:
            # Forward: follow cause_map from the op that last wrote each
            # row to every (transitive) dependent.
            wanted = {key for key, _ in rows}
            last_put: dict = {}
            if wanted:
                for logged in reversed(log_ops):
                    if logged.op != "put":
                        continue
                    logged_key = logged.payload.get("key")
                    if logged_key in wanted and logged_key not in last_put:
                        last_put[logged_key] = logged.seq
                        if len(last_put) == len(wanted):
                            break
            queue = deque(last_put[key] for key, _ in rows if key in last_put)

            visited_seqs_fwd: set = set()
            out_fwd = []
            while queue:
                seq = queue.popleft()
                for dep_seq in self.cause_map.get(seq, []):
                    if dep_seq in visited_seqs_fwd:
                        continue
                    visited_seqs_fwd.add(dep_seq)
                    if not (isinstance(dep_seq, int) and 0 <= dep_seq < len(log_ops)):
                        continue
                    dep_op = log_ops[dep_seq]
                    if dep_op.op != "put":
                        continue
                    dep_key = dep_op.payload.get("key", "")
                    dep_doc = self.store.get(dep_key, as_of)
                    if dep_doc is not None:
                        out_fwd.append((dep_key, dep_doc))
                        queue.append(dep_seq)  # continue BFS
            rows = out_fwd

    if plan.get("limit") is not None:
        rows = rows[: plan["limit"]]

    # VALID AS OF <date> — valid-time filter; docs without
    # _valid_from/_valid_to always pass.
    # NOTE(review): this runs AFTER LIMIT, so a limited query can return
    # fewer rows than the limit even when more valid rows exist — confirm
    # that this ordering is intended.
    valid_date = plan.get("valid_as_of")
    if valid_date:
        rows = [(k, d) for k, d in rows if self._valid_at(d, valid_date)]

    result = [d for _, d in rows]

    # GROUP BY [COUNT | SUM f | AVG f | MIN f | MAX f]
    if plan.get("group_by"):
        gb_field = plan["group_by"]
        agg = plan.get("aggregate")
        groups: dict = {}
        for d in result:
            groups.setdefault(d.get(gb_field), []).append(d)
        grouped = []
        for gval, gdocs in groups.items():
            entry: dict = {gb_field: gval, "count": len(gdocs)}
            if agg:
                fn, af = agg
                if fn != "count":  # count is already in entry["count"]
                    nums = [d[af] for d in gdocs
                            if af in d and isinstance(d[af], (int, float))]
                    if fn == "sum":
                        entry[f"sum_{af}"] = sum(nums)
                    elif fn == "avg":
                        entry[f"avg_{af}"] = sum(nums) / len(nums) if nums else None
                    elif fn == "min":
                        entry[f"min_{af}"] = min(nums) if nums else None
                    elif fn == "max":
                        entry[f"max_{af}"] = max(nums) if nums else None
            grouped.append(entry)
        return grouped

    return result
725
+
726
+ # --- files (git-style, Cascade-compressed) ------------------------------
727
def put_file(self, name: str, data: bytes, tier: str = "warm", client: str = "local",
             nonce: Optional[int] = None, idem: Optional[str] = None) -> int:
    """Store a new version of file ``name`` in the ``tier`` blob store.

    The bytes go to the blob store; the log only records name, tier, version
    and root. Returns the integer version index reported by the blob store —
    use ``file_root(name, version)`` to obtain its root hash.
    """
    blob_store = self.blobs[tier]
    version = blob_store.put_file(name, data)
    entry = {
        "name": name,
        "tier": tier,
        "version": version,
        "root": blob_store.root(name, version),
    }
    if nonce is None:
        nonce = self._next(client)
    self._log_append(client, nonce, "put_file", entry, idem)
    return version
738
+
739
def get_file(self, name: str, version: int = -1, tier: str = "warm") -> bytes:
    """Fetch the bytes of ``name`` at ``version`` from the ``tier`` blob store."""
    blob_store = self.blobs[tier]
    return blob_store.get_file(name, version)
741
+
742
def file_root(self, name: str, version: int = -1, tier: str = "warm") -> str:
    """Return the root hash the ``tier`` blob store reports for ``name`` at ``version``."""
    blob_store = self.blobs[tier]
    return blob_store.root(name, version)
744
+
745
def file_proof(self, name: str, chunk_index: int, version: int = -1, tier: str = "warm"):
    """Return ``(leaf, proof, root)`` showing chunk ``chunk_index`` belongs to the version.

    ``leaf`` is the recipe entry at ``chunk_index``, ``proof`` comes from
    ``merkle_proof`` over the version's recipe, and ``root`` is the stored
    root for that version.
    """
    file_entry = self.blobs[tier].files[name]
    recipe = file_entry["versions"][version]
    root = file_entry["roots"][version]
    leaf = recipe[chunk_index]
    proof = merkle_proof(recipe, chunk_index)
    return leaf, proof, root
751
+
752
@staticmethod
def verify_proof(leaf, proof, root) -> bool:
    """Check a ``(leaf, proof, root)`` triple with ``merkle_verify``."""
    outcome = merkle_verify(leaf, proof, root)
    return outcome
755
+
756
def compression_stats(self, tier: str = "warm") -> dict:
    """Return the ``stats()`` of the blob store for ``tier``."""
    blob_store = self.blobs[tier]
    return blob_store.stats()
758
+
759
+ # --- integrity / determinism -------------------------------------------
760
def verify(self) -> bool:
    """Report whether the hash-chained op log verifies (delegates to the log)."""
    chain_ok = self.log.verify()
    return chain_ok
763
+
764
def rebuild(self):
    """Replay the whole log into brand-new state objects.

    Returns ``(store, relations, indexes)`` built from scratch with the
    current index configuration; the live state is not touched. No
    ``cause_map`` is built during this replay.
    """
    store = MVCCStore()
    relations = Relations()
    indexes = Indexes()
    for coll, field, kind in self.indexes.config:
        indexes.ensure(coll, field, kind)
    for entry in self.log.ops:
        apply_op(store, relations, indexes, entry)
    return store, relations, indexes
772
+
773
def verify_determinism(self) -> bool:
    """Check that a fresh replay of the log yields the same store snapshot as the live store."""
    replayed_store = self.rebuild()[0]
    return replayed_store.snapshot() == self.store.snapshot()
776
+
777
@property
def head(self) -> str:
    """Current head hash of the op log."""
    log = self.log
    return log.head
780
+
781
@property
def seq(self) -> int:
    """Seq of the newest op: ``len(log) - 1`` (so ``-1`` for an empty log)."""
    op_count = len(self.log)
    return op_count - 1