meshlogd 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
meshlog/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """meshlog -- an event-sourced, eventually-consistent dataset that converges over
2
+ a frequently-partitioning, low-bandwidth mesh (Reticulum on real hardware).
3
+
4
+ Public surface:
5
+ Node, Event, FeedStore, HLC, materialize
6
+ Transport; SimNetwork, SimTransport (in-memory transport for demo/tests)
7
+ """
8
+
9
# Package version string.
__version__ = "0.1.0"

# Re-export the core types so callers can import them from the package root.
from .event import Event
from .feedstore import FeedStore
from .hlc import HLC
from .node import Node
from .reducer import materialize
from .transport import SimNetwork, SimTransport, Transport

# Names bound by ``from meshlog import *``; mirrors the imports above.
__all__ = [
    "Node", "Event", "FeedStore", "HLC", "materialize",
    "SimNetwork", "SimTransport", "Transport",
]
meshlog/cli.py ADDED
@@ -0,0 +1,101 @@
1
+ """Run a meshlog node over a real Reticulum mesh.
2
+
3
+ Start two of these on machines that share any Reticulum interface (same LAN via
4
+ AutoInterface to begin with; add an RNode/LoRa interface later in your
5
+ ~/.reticulum/config -- the application code does not change) and watch a shared
6
+ dataset converge.
7
+
8
+ Examples
9
+ --------
10
+ Terminal 1 (base camp):
11
+ python -m meshlog.cli --name base --data ./base.jsonl \\
12
+ --create R-100 site_assessment --set R-100 status OPENED
13
+
14
+ Terminal 2 (field team), on another machine:
15
+ python -m meshlog.cli --name team_a --data ./team_a.jsonl \\
16
+ --create R-200 water_point --set R-200 functional true
17
+
18
+ Each node periodically gossips; type `view` + Enter at any node to print its
19
+ current materialized dataset, or `front` to print its frontier.
20
+
21
+ NOTE: requires `pip install rns`. This script drives the real RNSTransport, which
22
+ is faithful to the RNS API but should be validated on your own mesh.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import threading
29
+ import time
30
+
31
+ from .node import Node
32
+ from .rns_transport import RNSTransport
33
+
34
+
35
def coerce(value: str):
    """Light type coercion so CLI values aren't all strings.

    Conversion is attempted in this order:

    * ``"true"`` / ``"false"`` (any case) -> ``bool``
    * anything ``int()`` accepts -> ``int``
    * anything ``float()`` accepts *and* that is finite -> ``float``
    * everything else -> the original string, unchanged

    Non-finite floats ("nan", "inf", "-Infinity", overflowing literals such
    as "1e999") are deliberately left as strings: ``json.dumps`` would emit
    them as the non-standard tokens ``NaN`` / ``Infinity``, and the value ends
    up in the JSON-encoded event that is hashed, persisted and sent to peers.
    """
    import math  # local import: keeps this fix self-contained in the block

    lowered = value.lower()
    if lowered in ("true", "false"):
        return lowered == "true"

    for convert in (int, float):
        try:
            number = convert(value)
        except ValueError:
            continue
        if isinstance(number, float) and not math.isfinite(number):
            # Parsed, but not representable in strict JSON; keep the text.
            break
        return number
    return value
47
+
48
+
49
def gossip_loop(node: Node, interval: float):
    """Run ``node.gossip_round()`` forever, sleeping ``interval`` seconds between rounds.

    Intended as the target of a background (daemon) thread. A failing round
    is logged and the loop carries on: an uncaught exception here would end
    the thread and the node would stop gossiping without any visible sign.
    ``KeyboardInterrupt`` / ``SystemExit`` are not ``Exception`` subclasses
    and still propagate.
    """
    import logging  # local import: keeps this fix self-contained in the block

    log = logging.getLogger(__name__)
    while True:
        try:
            node.gossip_round()
        except Exception:
            # Top-level boundary of the gossip thread: record and retry.
            log.exception("gossip round failed; retrying in %s s", interval)
        time.sleep(interval)
53
+
54
+
55
def main():
    """Entry point: start a meshlog node on Reticulum and serve a tiny prompt.

    Steps, in order: parse the command line, build the RNS transport and the
    node on top of it, apply any ``--create`` / ``--set`` / ``--note``
    operations, start the background gossip thread, then read commands
    (``view``, ``front``, ``quit``/``exit``) until quit, EOF or Ctrl-C.
    """
    parser = argparse.ArgumentParser(description="meshlog node over Reticulum")
    parser.add_argument("--name", default=None,
                        help="author id; default = identity hash")
    parser.add_argument("--data", default=None,
                        help="append-only log file (jsonl)")
    parser.add_argument("--identity", default=None,
                        help="persistent identity file")
    parser.add_argument("--configdir", default=None,
                        help="Reticulum config dir")
    parser.add_argument("--gossip-interval", type=float, default=10.0)
    # The three record operations share one shape: repeatable, fixed arity.
    repeatable = (
        ("--create", ("RID", "FORM"), "create a record"),
        ("--set", ("RID", "FIELD", "VALUE"), "set a field"),
        ("--note", ("RID", "TEXT"), "add a note"),
    )
    for flag, names, help_text in repeatable:
        parser.add_argument(flag, nargs=len(names), action="append",
                            metavar=names, default=[], help=help_text)
    args = parser.parse_args()

    transport = RNSTransport(configdir=args.configdir,
                             identity_path=args.identity)
    # Fall back to the transport's own id when no --name was given.
    author = args.name or transport.node_id
    node = Node(author, transport, persist_path=args.data)
    transport.set_frontier_provider(node.frontier)

    # Apply the requested operations: creates first, then sets, then notes.
    for rid, form in args.create:
        node.create_record(rid, form)
    for rid, field, value in args.set:
        node.set_field(rid, field, coerce(value))
    for rid, text in args.note:
        node.add_note(rid, text)

    gossiper = threading.Thread(
        target=gossip_loop,
        args=(node, args.gossip_interval),
        daemon=True,
    )
    gossiper.start()

    print(f"meshlog node '{author}' running over Reticulum.")
    print("commands: view | front | quit")
    try:
        while True:
            cmd = input("> ").strip().lower()
            if cmd in ("quit", "exit"):
                break
            if cmd == "view":
                print(json.dumps(node.view(), indent=2, sort_keys=True))
            elif cmd == "front":
                print(json.dumps(node.frontier(), sort_keys=True))
            # Anything else is ignored and the prompt is shown again.
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C ends the session quietly.
        pass


if __name__ == "__main__":
    main()
meshlog/event.py ADDED
@@ -0,0 +1,99 @@
1
+ """The Event: the only thing that ever crosses the wire or hits disk.
2
+
3
+ Design choices that make the distributed-systems math easy:
4
+
5
+ * **Immutable.** An event is never edited. "Edits" are new events that
6
+ reference an earlier record. This is what gives us a free audit trail and,
7
+ more importantly, conflict-free merges: adding events to a set never
8
+ conflicts with adding other events.
9
+
10
+ * **Per-author feeds.** Every event carries (author, seq). For a given author,
11
+ seq is strictly contiguous: 1, 2, 3, ... This is the Secure-Scuttlebutt
12
+ trick. It collapses "what do you have?" from "enumerate a set" down to a
13
+ single integer per author (the high-water mark), which is what makes the
14
+ anti-entropy digest tiny enough to ride inside a Reticulum announce.
15
+
16
+ * **Hash-chained per feed.** Each event names the id of the previous event in
17
+ its own feed (``prev``). That makes a feed tamper-evident and lets a
18
+ receiver verify it is appending contiguous, authentic history rather than a
19
+ forged gap. (Trusted-collaborator model: we still want integrity, just not
20
+ anonymity.)
21
+
22
+ * **Content-addressed id.** ``id`` is a hash of the whole event. Receiving the
23
+ same event twice is trivially detected and ignored -> idempotent ingest.
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import hashlib
28
+ import json
29
+ from dataclasses import dataclass, field
30
+ from typing import Any
31
+
32
+ # Truncated hash length in bytes. 16 bytes = 128 bits is plenty for collision
33
+ # resistance here and keeps ids compact on low-bandwidth links.
34
+ _ID_BYTES = 16
35
+
36
+
37
+ def _canonical(obj: Any) -> bytes:
38
+ """Deterministic byte encoding used for both hashing and the wire."""
39
+ return json.dumps(obj, sort_keys=True, separators=(",", ":")).encode("utf-8")
40
+
41
+
42
@dataclass(frozen=True)
class Event:
    """One immutable entry of an author's feed.

    ``id`` is derived from every other field (see ``_hash_fields``); use
    ``from_parts`` to mint an event with the id filled in, and
    ``is_authentic`` to check that a received event's id matches its content.
    """

    author: str           # stable node id (in RNS: identity hash hex)
    seq: int              # per-author sequence, 1-based, contiguous
    hlc: tuple[int, int]  # hybrid logical clock stamp (wall_ms, counter)
    kind: str             # domain event type, e.g. "field.set"
    payload: dict         # domain data
    prev: str             # id of previous event in this author's feed ("" if seq==1)
    id: str = ""          # content hash; filled by from_parts()

    # ---- construction -----------------------------------------------------
    @staticmethod
    def _hash_fields(author, seq, hlc, kind, payload, prev) -> str:
        """Return the truncated SHA-256 hex digest of the canonical encoding
        of the six content fields (``hlc`` is encoded as a list)."""
        hashed = dict(
            author=author,
            seq=seq,
            hlc=list(hlc),
            kind=kind,
            payload=payload,
            prev=prev,
        )
        digest = hashlib.sha256(_canonical(hashed)).hexdigest()
        # Two hex characters per byte.
        return digest[: 2 * _ID_BYTES]

    @classmethod
    def from_parts(cls, author, seq, hlc, kind, payload, prev) -> "Event":
        """Build an event from its content fields, computing ``id``."""
        content_id = cls._hash_fields(
            author, seq, tuple(hlc), kind, payload, prev
        )
        return cls(
            author=author,
            seq=seq,
            hlc=tuple(hlc),
            kind=kind,
            payload=payload,
            prev=prev,
            id=content_id,
        )

    # ---- integrity --------------------------------------------------------
    def recompute_id(self) -> str:
        """Hash this event's current content fields, ignoring the stored id."""
        content = (
            self.author, self.seq, self.hlc, self.kind, self.payload, self.prev
        )
        return self._hash_fields(*content)

    def is_authentic(self) -> bool:
        """Return True when the stored id matches the content hash.

        (A full implementation would also verify an Ed25519 signature by
        ``author``'s key here -- RNS Identity gives us exactly that primitive;
        omitted in the prototype core to keep it dependency-free.)
        """
        expected = self.recompute_id()
        return self.id == expected

    # ---- wire encoding ----------------------------------------------------
    def to_wire(self) -> dict:
        """Return the compact single-letter-key dict form of this event."""
        return dict(
            a=self.author,
            s=self.seq,
            h=list(self.hlc),
            k=self.kind,
            p=self.payload,
            v=self.prev,
            i=self.id,
        )

    @classmethod
    def from_wire(cls, d: dict) -> "Event":
        """Rebuild an event from ``to_wire`` output. The id is taken from the
        dict as-is, not recomputed."""
        return cls(
            author=d["a"],
            seq=d["s"],
            hlc=tuple(d["h"]),
            kind=d["k"],
            payload=d["p"],
            prev=d["v"],
            id=d["i"],
        )
meshlog/feedstore.py ADDED
@@ -0,0 +1,176 @@
1
+ """FeedStore: the append-only log, organised as one contiguous feed per author.
2
+
3
+ This is the heart of the replication model. It enforces three invariants that
4
+ together make merging safe over a constantly-partitioning network:
5
+
6
+ 1. **Contiguity.** A feed accepts seq N only when it already holds 1..N-1.
7
+ Out-of-order arrivals are buffered until the gap fills. This guarantees
8
+ that a high-water mark (an integer per author) is a *complete* description
9
+ of what we hold for that author -- no holes.
10
+
11
+ 2. **Idempotence.** Re-ingesting an event we already have is a no-op. So we can
12
+ push aggressively and overlap freely; duplicates cost a hash compare.
13
+
14
+ 3. **Chain integrity.** seq N's ``prev`` must equal the id of seq N-1. A feed
15
+ that fails this is rejected rather than silently corrupting the merge.
16
+
17
+ Because of (1)-(3), the global dataset is just the set-union of every node's
18
+ feeds, and union is commutative, associative, and idempotent -> the database is
19
+ a CRDT (a grow-only set of immutable events) and converges regardless of the
20
+ order or grouping in which events arrive.
21
+ """
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ import os
26
+ from typing import Iterable
27
+
28
+ from .event import Event
29
+
30
# Type alias for a high-water-mark vector: author id -> highest contiguous
# seq held for that author.
Frontier = dict[str, int]
31
+
32
+
33
class FeedStore:
    """Append-only event log kept as one contiguous feed per author.

    In-memory state:

    * ``_feeds``   -- author -> list of events, index ``i`` holds seq ``i + 1``
    * ``_ids``     -- ids of every event held in ``_feeds``
    * ``_pending`` -- author -> {seq: event} for events that arrived ahead of
      a gap and are waiting for it to fill

    When ``persist_path`` is given, every committed event is appended to that
    file as one JSON line, and an existing file is replayed on construction.
    A feed entry is only created once an event is actually committed, so
    ``frontier()`` never lists authors for which nothing is held.
    """

    def __init__(self, persist_path: str | None = None):
        self._feeds: dict[str, list[Event]] = {}         # author -> [seq1, seq2, ...]
        self._ids: set[str] = set()                      # all event ids held
        self._pending: dict[str, dict[int, Event]] = {}  # author -> {seq: event} buffer
        self._persist_path = persist_path
        if persist_path and os.path.exists(persist_path):
            self._load()

    # ---- queries ----------------------------------------------------------
    def frontier(self) -> Frontier:
        """Return {author: number of contiguous events held}."""
        return {author: len(feed) for author, feed in self._feeds.items()}

    def has(self, event_id: str) -> bool:
        """Return True if an event with this id has been committed."""
        return event_id in self._ids

    def all_events(self) -> list[Event]:
        """Return every committed event, grouped by feed."""
        out: list[Event] = []
        for feed in self._feeds.values():
            out.extend(feed)
        return out

    def event_count(self) -> int:
        """Return the number of committed events."""
        return len(self._ids)

    def last_id(self, author: str) -> str:
        """Return the id of the newest event in ``author``'s feed, or ""."""
        feed = self._feeds.get(author)
        return feed[-1].id if feed else ""

    def next_seq(self, author: str) -> int:
        """Return the seq the next event in ``author``'s feed must carry."""
        return len(self._feeds.get(author, [])) + 1

    # ---- anti-entropy -----------------------------------------------------
    def delta_for(self, remote: Frontier, limit: int | None = None) -> list[Event]:
        """Events this store holds that ``remote`` is missing, given its frontier.

        For each author, the slice of the feed above the remote's high-water
        mark is selected; this covers every author held, not only events
        minted locally. The result is ordered by (author, seq) and truncated
        to ``limit`` items when ``limit`` is not None.

        A mark that is not a non-negative int is treated as 0 (nothing held):
        the frontier comes from a peer, and a negative value would otherwise
        slice from the *end* of the feed.
        """
        out: list[Event] = []
        for author, feed in self._feeds.items():
            have = remote.get(author, 0)
            if not isinstance(have, int) or have < 0:
                have = 0
            if len(feed) > have:
                out.extend(feed[have:])
        # Deterministic, dependency-friendly order: lower seqs first per author.
        out.sort(key=lambda e: (e.author, e.seq))
        if limit is not None:
            out = out[:limit]
        return out

    # ---- mutation ---------------------------------------------------------
    def append_local(self, event: Event) -> None:
        """Append an event minted by *this* node.

        Assumes seq/prev were set correctly by the caller; only contiguity is
        checked here.

        Raises:
            AssertionError: if ``event.seq`` is not the next seq of the feed.
                Raised explicitly (not via ``assert``) so the check is still
                enforced when Python runs with ``-O``.
        """
        if event.seq != self.next_seq(event.author):
            raise AssertionError("local feed must stay contiguous")
        self._commit(event)

    def ingest(self, event: Event) -> bool:
        """Accept a remote event. Returns True iff it was appended to its feed.

        Duplicates, stale seqs, events whose id does not match their content
        and events whose ``prev`` does not match the feed tip return False.
        An event ahead of the feed is buffered (also returning False) and is
        committed later by ``_drain`` once the gap fills.
        """
        if not event.is_authentic():
            return False
        if event.id in self._ids:
            return False  # idempotent

        # Read-only lookup: do not create a feed for an event we may reject.
        feed = self._feeds.get(event.author, [])
        expected = len(feed) + 1

        if event.seq < expected:
            return False  # stale / already have a different copy of this slot
        if event.seq > expected:
            # Future event: buffer and wait for the gap to fill.
            self._pending.setdefault(event.author, {})[event.seq] = event
            return False

        # event.seq == expected: verify chain link, then append.
        prev_id = feed[-1].id if feed else ""
        if event.prev != prev_id:
            return False  # forged or forked history; reject
        self._commit(event)

        # Try to drain any buffered successors that are now contiguous.
        self._drain(event.author)
        return True

    def ingest_many(self, events: Iterable[Event]) -> int:
        """Ingest a batch (e.g. a received delta) given in any order.

        Passes over the batch are repeated until one makes no progress.

        Returns:
            How many distinct events of the batch were not held before the
            call and are held after it. This includes events that were first
            buffered and then committed by the drain, which the per-call
            result of ``ingest`` does not report.
        """
        events = list(events)
        fresh_ids = {ev.id for ev in events if ev.id not in self._ids}
        progress = True
        while progress:
            progress = False
            for ev in events:
                if ev.id not in self._ids and self.ingest(ev):
                    progress = True
        return sum(1 for event_id in fresh_ids if event_id in self._ids)

    def _commit(self, event: Event) -> None:
        """Append ``event`` to its feed, index its id, and persist it."""
        self._feeds.setdefault(event.author, []).append(event)
        self._ids.add(event.id)
        self._persist(event)

    def _drain(self, author: str) -> None:
        """Commit buffered events of ``author`` that have become contiguous."""
        buf = self._pending.get(author)
        if not buf:
            return
        feed = self._feeds[author]
        while (nxt := buf.pop(len(feed) + 1, None)) is not None:
            expected_prev = feed[-1].id if feed else ""
            if nxt.prev != expected_prev:
                # Forked history at this slot; the popped event is discarded
                # and draining of this feed stops.
                break
            self._commit(nxt)
        if not buf:
            # Do not keep empty per-author buffers around.
            del self._pending[author]

    # ---- persistence (append-only jsonl; the log *is* the database) -------
    def _persist(self, event: Event) -> None:
        """Append ``event`` as one JSON line; no-op without a persist path."""
        if not self._persist_path:
            return
        with open(self._persist_path, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(event.to_wire()) + "\n")

    def _load(self) -> None:
        """Replay the log file into memory.

        Lines that cannot be parsed into an event (for example a final line
        cut short by an interrupted write) are skipped with a warning instead
        of making the whole store unloadable. Parsed events are accepted when
        they extend their feed contiguously; ids and ``prev`` links are not
        re-verified here.
        """
        import logging  # local import: keeps this fix self-contained in the block

        log = logging.getLogger(__name__)
        rows = []
        with open(self._persist_path, encoding="utf-8") as fh:
            for lineno, line in enumerate(fh, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    ev = Event.from_wire(json.loads(line))
                    # The sort below needs comparable (author, seq) keys.
                    if not isinstance(ev.author, str) or not isinstance(ev.seq, int):
                        raise TypeError("author/seq have unexpected types")
                except (ValueError, KeyError, TypeError):
                    log.warning("%s:%d: skipping unreadable log line",
                                self._persist_path, lineno)
                    continue
                rows.append(ev)
        # Sort so feeds load contiguously, then ingest.
        rows.sort(key=lambda e: (e.author, e.seq))
        for ev in rows:
            if ev.seq == self.next_seq(ev.author) and ev.id not in self._ids:
                self._feeds.setdefault(ev.author, []).append(ev)
                self._ids.add(ev.id)
meshlog/hlc.py ADDED
@@ -0,0 +1,64 @@
1
+ """Hybrid Logical Clocks (HLC).
2
+
3
+ Why not wall-clock timestamps? In this network there is no NTP, devices drift,
4
+ and a node may be offline for days. Ordering events by wall time alone would let
5
+ a laggy clock silently win a last-writer-wins race. HLCs fuse physical time with
6
+ a logical counter so that:
7
+
8
+ * timestamps stay close to real wall-clock time (good for humans/display), and
9
+ * causality is never violated: if event B is created after observing event A,
10
+ then hlc(B) > hlc(A), regardless of clock skew.
11
+
12
+ Reference: Kulkarni et al., "Logical Physical Clocks and Consistent Snapshots
13
+ in Globally Distributed Databases" (2014).
14
+
15
+ An HLC stamp here is (wall_ms, counter). For a *total* order across authors we
16
+ break ties with the author id at the call site (see reducer.total_order_key).
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import time
21
+ from dataclasses import dataclass
22
+
23
+
24
+ def _phys_now_ms() -> int:
25
+ return int(time.time() * 1000)
26
+
27
+
28
@dataclass
class HLC:
    """A per-node hybrid logical clock holding a (wall, counter) pair.

    Call ``now()`` to stamp an event this node creates, and ``update()`` with
    a stamp received from elsewhere so that later local stamps sort after it.
    Both methods mutate the clock and return its new (wall, counter).
    """

    wall: int = 0
    counter: int = 0
    # Physical time source; injectable so tests can be deterministic.
    _phys: callable = _phys_now_ms

    def now(self) -> tuple[int, int]:
        """Advance the clock for a locally created event and return the stamp."""
        physical = self._phys()
        if physical <= self.wall:
            # Physical time stood still or ran backwards: only the logical
            # counter moves.
            self.counter += 1
        else:
            self.wall = physical
            self.counter = 0
        return (self.wall, self.counter)

    def update(self, remote: tuple[int, int]) -> tuple[int, int]:
        """Merge a remote stamp into the clock and return the new stamp.

        The new wall is the maximum of the local wall, the remote wall and
        physical time; the counter depends on which of them supplied it.
        """
        remote_wall, remote_counter = remote
        physical = self._phys()
        local_wall = self.wall
        latest = max(local_wall, remote_wall, physical)

        if latest == local_wall:
            if latest == remote_wall:
                # Local and remote share the winning wall: go past both counters.
                counter = max(self.counter, remote_counter) + 1
            else:
                counter = self.counter + 1
        elif latest == remote_wall:
            counter = remote_counter + 1
        else:
            # Physical time is ahead of everything seen so far.
            counter = 0

        self.wall, self.counter = latest, counter
        return (self.wall, self.counter)
meshlog/node.py ADDED
@@ -0,0 +1,114 @@
1
+ """The Node and the anti-entropy protocol.
2
+
3
+ Anti-entropy (a.k.a. gossip reconciliation) is deliberately tiny:
4
+
5
+ When two peers can talk, each sends the other its FRONTIER (the per-author
6
+ high-water-mark vector). On receiving a peer's frontier, a node computes the
7
+ slice of its log the peer is missing and PUSHES it. Ingest is idempotent and
8
+ gap-tolerant, so overlapping pushes and reordered batches are harmless.
9
+
10
+ message := {"t": "frontier", "f": {author: seq}}
11
+ | {"t": "events", "e": [event_wire, ...]}
12
+
13
+ That's the whole protocol. One frontier exchange per contact reconciles the pair
14
+ completely; transitive (multi-hop) convergence happens because each node relays
15
+ every author's events it holds, not just its own. Over a partitioning network,
16
+ repeated pairwise contacts compose into global convergence -- which is exactly
17
+ the store-and-forward property we want for "base camp + teams that never meet."
18
+
19
+ Batching: deltas are chunked (``MAX_EVENTS_PER_MSG``) so a single message stays
20
+ small on a low-bandwidth link. A real radio build would additionally send large
21
+ batches as an RNS Resource rather than inline packets (see rns_transport.py).
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import json
26
+
27
+ from .event import Event
28
+ from .feedstore import FeedStore, Frontier
29
+ from .hlc import HLC
30
+ from .reducer import materialize
31
+ from .transport import Transport
32
+
33
# Upper bound on how many events are packed into a single "events" message
# when a delta is pushed to a peer.
MAX_EVENTS_PER_MSG = 32
34
+
35
+
36
class Node:
    """A replica: one author identity, its FeedStore, its HLC and a transport.

    The node mints local events (``emit`` and the domain helpers), exposes
    the materialized view, and runs the two-message anti-entropy protocol
    (``gossip_round`` sends our frontier; ``_on_message`` answers a peer's
    frontier with the events it lacks and ingests events pushed to us).

    Messages come from the network, so ``_on_message`` drops anything it
    cannot decode or that has the wrong shape instead of raising into the
    transport's receive callback.
    """

    def __init__(self, author: str, transport: Transport,
                 persist_path: str | None = None, hlc: HLC | None = None):
        """Create the node and register ``_on_message`` as the transport's
        receive callback. ``persist_path`` is handed to the FeedStore; a fresh
        ``HLC()`` is used when ``hlc`` is not supplied."""
        self.author = author
        self.store = FeedStore(persist_path)
        self.hlc = hlc or HLC()
        self.transport = transport
        transport.set_receive(self._on_message)

    # ---- producing local events ------------------------------------------
    def emit(self, kind: str, payload: dict) -> Event:
        """Mint the next event in this node's own feed, store it, return it."""
        seq = self.store.next_seq(self.author)
        ev = Event.from_parts(
            author=self.author,
            seq=seq,
            hlc=self.hlc.now(),
            kind=kind,
            payload=payload,
            prev=self.store.last_id(self.author),
        )
        self.store.append_local(ev)
        return ev

    # Convenience domain helpers (KoboToolbox-style records) ----------------
    def create_record(self, record_id: str, form: str) -> Event:
        """Emit a ``record.create`` event attributed to this node."""
        return self.emit("record.create", {"record_id": record_id, "form": form, "by": self.author})

    def set_field(self, record_id: str, field: str, value) -> Event:
        """Emit a ``field.set`` event."""
        return self.emit("field.set", {"record_id": record_id, "field": field, "value": value})

    def add_note(self, record_id: str, text: str) -> Event:
        """Emit a ``note.add`` event."""
        return self.emit("note.add", {"record_id": record_id, "text": text})

    def add_attachment(self, record_id: str, field: str, blob_hash: str, size: int, mime: str) -> Event:
        """Emit an ``attachment.add`` event describing a blob by hash."""
        return self.emit("attachment.add", {
            "record_id": record_id, "field": field,
            "blob_hash": blob_hash, "size": size, "mime": mime,
        })

    # ---- the materialized view -------------------------------------------
    def view(self) -> dict:
        """Return the dataset materialized from every event currently held."""
        return materialize(self.store.all_events())

    def frontier(self) -> Frontier:
        """Return the store's frontier (author -> highest contiguous seq)."""
        return self.store.frontier()

    # ---- anti-entropy: initiate ------------------------------------------
    def gossip_round(self) -> None:
        """Offer our frontier to everyone currently reachable."""
        msg = self._encode({"t": "frontier", "f": self.store.frontier()})
        for peer in self.transport.reachable_peers():
            self.transport.send(peer, msg)

    # ---- anti-entropy: react ---------------------------------------------
    def _on_message(self, src: str, raw: bytes) -> None:
        """Handle one message received from peer ``src``.

        * ``frontier``: push the events the peer lacks, in chunks of at most
          ``MAX_EVENTS_PER_MSG``.
        * ``events``: advance the HLC past the received stamps and ingest.

        Undecodable bytes, non-dict messages, unknown types and fields of the
        wrong shape are ignored.
        """
        try:
            msg = self._decode(raw)
        except ValueError:
            # Covers both invalid UTF-8 (UnicodeDecodeError) and invalid JSON
            # (json.JSONDecodeError); both subclass ValueError.
            return
        if not isinstance(msg, dict):
            return

        msg_type = msg.get("t")
        if msg_type == "frontier":
            claimed = msg.get("f")
            if not isinstance(claimed, dict):
                return
            # Keep only usable high-water marks; anything else counts as
            # "peer holds nothing for this author".
            remote = {
                author: seq for author, seq in claimed.items()
                if isinstance(seq, int) and seq >= 0
            }
            # Peer told us what it has; push what it lacks, in bounded chunks.
            delta = self.store.delta_for(remote)
            for i in range(0, len(delta), MAX_EVENTS_PER_MSG):
                chunk = delta[i:i + MAX_EVENTS_PER_MSG]
                self.transport.send(src, self._encode(
                    {"t": "events", "e": [e.to_wire() for e in chunk]}
                ))
        elif msg_type == "events":
            wires = msg.get("e")
            if not isinstance(wires, list):
                return
            incoming = []
            for wire in wires:
                try:
                    ev = Event.from_wire(wire)
                except (KeyError, TypeError):
                    continue  # missing keys / not a mapping / bad stamp
                if self._well_formed(ev):
                    incoming.append(ev)
            # Advance our HLC past anything we observe (causality). Only
            # events that passed the checks above may move the clock, so a
            # bogus stamp on a rejected event cannot drag it forward.
            for e in incoming:
                self.hlc.update(e.hlc)
            self.store.ingest_many(incoming)

    @staticmethod
    def _well_formed(ev: Event) -> bool:
        """Return True when a received event is safe to feed to the HLC and
        the store: string author, integer seq, a two-integer stamp, and an id
        that matches the content."""
        if not isinstance(ev.author, str) or not isinstance(ev.seq, int):
            return False
        stamp = ev.hlc
        if len(stamp) != 2 or not all(isinstance(part, int) for part in stamp):
            return False
        return ev.is_authentic()

    # ---- wire codec (swap for msgpack/CBOR in production) -----------------
    @staticmethod
    def _encode(obj: dict) -> bytes:
        """Serialize a message dict to compact UTF-8 JSON bytes."""
        return json.dumps(obj, separators=(",", ":")).encode("utf-8")

    @staticmethod
    def _decode(raw: bytes) -> dict:
        """Parse UTF-8 JSON bytes. Raises ValueError (or a subclass) when the
        bytes are not valid UTF-8 or not valid JSON."""
        return json.loads(raw.decode("utf-8"))
meshlog/reducer.py ADDED
@@ -0,0 +1,87 @@
1
+ """Reducer: fold the event set into the materialized view (the "current" data).
2
+
3
+ The view is a pure function of the *set* of events. Two nodes holding the same
4
+ set always compute byte-identical views, no matter what order events arrived in.
5
+ That is the property that turns "everyone eventually holds the same events"
6
+ (guaranteed by FeedStore + anti-entropy) into "everyone eventually shows the
7
+ same data" (what the user actually cares about).
8
+
9
+ Conflict handling:
10
+ * Most events are additive (notes, attachments) and never conflict.
11
+ * Mutable scalar fields use Last-Writer-Wins, but "last" is decided by the
12
+ event's HLC stamp -- NOT arrival order and NOT wall clock -- with the author
13
+ id as a deterministic tiebreaker. So concurrent edits from two partitions
14
+ resolve to the same winner on every node.
15
+
16
+ Domain model here is a stand-in for KoboToolbox-style assessment records:
17
+ record.create {record_id, form, by}
18
+ field.set {record_id, field, value}
19
+ attachment.add {record_id, field, blob_hash, size, mime}
20
+ note.add {record_id, text}
21
+ """
22
+ from __future__ import annotations
23
+
24
+ from dataclasses import dataclass, field as dc_field
25
+
26
+ from .event import Event
27
+
28
+
29
def total_order_key(ev: Event):
    """Return the sort key that orders events for LWW resolution:
    (hlc wall, hlc counter, author, seq)."""
    stamp = ev.hlc
    wall = stamp[0]
    counter = stamp[1]
    return (wall, counter, ev.author, ev.seq)
32
+
33
+
34
@dataclass
class Record:
    """Mutable accumulator for one record while events are being folded."""

    record_id: str
    form: str = ""
    created_by: str = ""
    fields: dict = dc_field(default_factory=dict)        # field -> value (LWW)
    _field_stamp: dict = dc_field(default_factory=dict)  # field -> winning order key
    attachments: list = dc_field(default_factory=list)   # [{field, blob_hash, size, mime}]
    notes: list = dc_field(default_factory=list)         # [text, ...] (append-only)

    def public(self) -> dict:
        """Return the externally visible form of the record.

        Fields are ordered by name and attachments by (field, blob_hash);
        notes keep their order and are copied. ``_field_stamp`` is internal
        bookkeeping and is left out.
        """
        def attachment_order(item):
            return (item["field"], item["blob_hash"])

        fields_by_name = {name: self.fields[name] for name in sorted(self.fields)}
        view = {
            "record_id": self.record_id,
            "form": self.form,
            "created_by": self.created_by,
        }
        view["fields"] = fields_by_name
        view["attachments"] = sorted(self.attachments, key=attachment_order)
        view["notes"] = self.notes[:]
        return view
53
+
54
+
55
def _is_hashable(value) -> bool:
    """Return True when ``value`` can be used as a dict key."""
    try:
        hash(value)
    except TypeError:
        return False
    return True


def materialize(events: list[Event]) -> dict[str, dict]:
    """Fold events -> {record_id: public_record_dict}.

    Events are processed in ``total_order_key`` order, so the result does not
    depend on the order of ``events``.

    Events of a known kind whose payload cannot be applied are skipped: a
    payload that is not a dict, a missing or unhashable ``record_id``, or a
    missing key that the kind requires. Events are replicated and never
    removed, so raising on one bad event would make the view fail on every
    node from then on. Unknown kinds are ignored as well.
    """
    # Payload keys each known kind reads with ``p[...]`` (besides record_id).
    required_keys = {
        "record.create": (),
        "field.set": ("field", "value"),
        "attachment.add": ("field", "blob_hash"),
        "note.add": ("text",),
    }

    records: dict[str, Record] = {}
    # Process in HLC total order so LWW is order-independent of *arrival*.
    for ev in sorted(events, key=total_order_key):
        kind = ev.kind
        needed = required_keys.get(kind) if isinstance(kind, str) else None
        if needed is None:
            # Unknown kinds are ignored: forward-compatible by construction.
            continue

        p = ev.payload
        if not isinstance(p, dict):
            continue
        rid = p.get("record_id")
        if rid is None or not _is_hashable(rid):
            continue
        if any(name not in p for name in needed):
            continue
        if kind == "field.set" and not _is_hashable(p["field"]):
            continue  # the field name is used as a dict key below

        # Validation is done; only now may the record be created/mutated.
        rec = records.setdefault(rid, Record(rid))
        if kind == "record.create":
            rec.form = p.get("form", rec.form)
            rec.created_by = p.get("by", rec.created_by)
        elif kind == "field.set":
            key = total_order_key(ev)
            # LWW: only overwrite if this event is later in the total order.
            if key >= rec._field_stamp.get(p["field"], (-1, -1, "", -1)):
                rec.fields[p["field"]] = p["value"]
                rec._field_stamp[p["field"]] = key
        elif kind == "attachment.add":
            rec.attachments.append(
                {
                    "field": p["field"],
                    "blob_hash": p["blob_hash"],
                    "size": p.get("size", 0),
                    "mime": p.get("mime", ""),
                }
            )
        else:  # "note.add"
            rec.notes.append(p["text"])
    return {rid: rec.public() for rid, rec in records.items()}