brainclaw 1.7.5 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. package/README.md +28 -11
  2. package/dist/brainclaw-vscode.vsix +0 -0
  3. package/dist/cli.js +139 -13
  4. package/dist/commands/add-step.js +1 -1
  5. package/dist/commands/bootstrap.js +2 -26
  6. package/dist/commands/check-security-mcp.js +50 -33
  7. package/dist/commands/check-security.js +86 -43
  8. package/dist/commands/claim.js +22 -21
  9. package/dist/commands/confirm.js +26 -0
  10. package/dist/commands/context-diff.js +1 -1
  11. package/dist/commands/dispatch-watch.js +142 -0
  12. package/dist/commands/doctor.js +113 -2
  13. package/dist/commands/estimation-report.js +115 -16
  14. package/dist/commands/harvest.js +502 -16
  15. package/dist/commands/init.js +123 -21
  16. package/dist/commands/loops-handlers.js +4 -0
  17. package/dist/commands/mcp-read-handlers.js +198 -29
  18. package/dist/commands/mcp.js +615 -92
  19. package/dist/commands/memory.js +21 -17
  20. package/dist/commands/migrate.js +81 -17
  21. package/dist/commands/prune.js +78 -4
  22. package/dist/commands/reflect.js +26 -20
  23. package/dist/commands/register-agent.js +57 -1
  24. package/dist/commands/repair.js +20 -0
  25. package/dist/commands/session-end.js +15 -6
  26. package/dist/commands/session-start.js +18 -1
  27. package/dist/commands/setup-security.js +39 -18
  28. package/dist/commands/setup.js +26 -27
  29. package/dist/commands/stale.js +16 -2
  30. package/dist/commands/uninstall.js +126 -34
  31. package/dist/commands/update-step.js +6 -0
  32. package/dist/commands/worktree.js +60 -0
  33. package/dist/core/actions.js +12 -3
  34. package/dist/core/agent-capability.js +11 -13
  35. package/dist/core/agent-files.js +844 -547
  36. package/dist/core/agent-integrations.js +0 -3
  37. package/dist/core/agent-inventory.js +67 -0
  38. package/dist/core/agent-registry.js +163 -29
  39. package/dist/core/agentrun-reconciler.js +33 -2
  40. package/dist/core/agentruns.js +7 -1
  41. package/dist/core/ai-agent-detection.js +31 -44
  42. package/dist/core/archival.js +15 -9
  43. package/dist/core/assignment-reconciler.js +56 -0
  44. package/dist/core/assignment-sweeper.js +127 -4
  45. package/dist/core/assignments.js +69 -11
  46. package/dist/core/bootstrap.js +233 -67
  47. package/dist/core/brainclaw-version.js +22 -0
  48. package/dist/core/candidates.js +21 -1
  49. package/dist/core/claims.js +313 -150
  50. package/dist/core/config.js +6 -1
  51. package/dist/core/context-diff.js +148 -20
  52. package/dist/core/context.js +129 -8
  53. package/dist/core/coordination.js +22 -3
  54. package/dist/core/dispatch-status.js +109 -5
  55. package/dist/core/dispatcher.js +65 -11
  56. package/dist/core/entity-operations.js +45 -24
  57. package/dist/core/entity-registry.js +31 -5
  58. package/dist/core/event-log.js +138 -21
  59. package/dist/core/events/checkpoint.js +258 -0
  60. package/dist/core/events/genesis.js +220 -0
  61. package/dist/core/events/journal.js +507 -0
  62. package/dist/core/events/materialize.js +126 -0
  63. package/dist/core/events/registry-post-image.js +110 -0
  64. package/dist/core/events/verify.js +109 -0
  65. package/dist/core/execution-adapters.js +23 -0
  66. package/dist/core/execution.js +25 -0
  67. package/dist/core/facade-schema.js +48 -0
  68. package/dist/core/gc-semantic.js +130 -5
  69. package/dist/core/handoff-snapshot.js +68 -0
  70. package/dist/core/ids.js +19 -8
  71. package/dist/core/instruction-templates.js +34 -115
  72. package/dist/core/io.js +39 -3
  73. package/dist/core/json-store.js +10 -1
  74. package/dist/core/lock.js +153 -28
  75. package/dist/core/loops/bootstrap-acquire.js +25 -1
  76. package/dist/core/loops/facade-schema.js +2 -0
  77. package/dist/core/loops/hooks/survey-signals-baseline.js +36 -0
  78. package/dist/core/loops/index.js +1 -0
  79. package/dist/core/loops/presets/bootstrap.js +7 -0
  80. package/dist/core/loops/store.js +17 -0
  81. package/dist/core/loops/verbs.js +24 -1
  82. package/dist/core/markdown.js +8 -76
  83. package/dist/core/mcp-command-resolution.js +245 -0
  84. package/dist/core/memory-compactor.js +5 -3
  85. package/dist/core/memory-lifecycle.js +282 -0
  86. package/dist/core/merge-risk.js +150 -0
  87. package/dist/core/messaging.js +8 -1
  88. package/dist/core/migration.js +11 -1
  89. package/dist/core/observer-mode.js +26 -0
  90. package/dist/core/operations/memory-mutation.js +90 -65
  91. package/dist/core/operations/plan.js +27 -1
  92. package/dist/core/protocol-skills.js +210 -0
  93. package/dist/core/reflection-safety.js +6 -7
  94. package/dist/core/reputation.js +84 -2
  95. package/dist/core/runtime-signals.js +71 -9
  96. package/dist/core/runtime.js +84 -1
  97. package/dist/core/schema.js +125 -0
  98. package/dist/core/security-detectors.js +125 -0
  99. package/dist/core/security-extract.js +189 -0
  100. package/dist/core/security-guard.js +107 -29
  101. package/dist/core/security-packages.js +121 -0
  102. package/dist/core/security-scoring.js +76 -9
  103. package/dist/core/security.js +34 -2
  104. package/dist/core/sequence.js +11 -2
  105. package/dist/core/setup-flow.js +141 -13
  106. package/dist/core/spawn-check.js +110 -4
  107. package/dist/core/staleness.js +109 -1
  108. package/dist/core/state.js +250 -54
  109. package/dist/core/store-resolution.js +19 -5
  110. package/dist/core/worktree.js +169 -7
  111. package/dist/facts.js +8 -8
  112. package/dist/facts.json +7 -7
  113. package/docs/PROTOCOL.md +223 -0
  114. package/docs/cli.md +11 -10
  115. package/docs/concepts/coordinator-runbook.md +129 -0
  116. package/docs/concepts/dispatch-lifecycle.md +17 -0
  117. package/docs/concepts/event-log-store-critique-A.md +333 -0
  118. package/docs/concepts/event-log-store-critique-B.md +353 -0
  119. package/docs/concepts/event-log-store-phase0-measurements.md +58 -0
  120. package/docs/concepts/event-log-store-proposal-A.md +365 -0
  121. package/docs/concepts/event-log-store-proposal-B.md +404 -0
  122. package/docs/concepts/event-log-store.md +928 -0
  123. package/docs/concepts/identity-model-proposal.md +371 -0
  124. package/docs/concepts/memory.md +5 -4
  125. package/docs/concepts/observer-protocol.md +361 -0
  126. package/docs/concepts/parallel-merge-protocol.md +71 -0
  127. package/docs/concepts/plans-and-claims.md +43 -0
  128. package/docs/concepts/skills.md +78 -0
  129. package/docs/concepts/workspace-bootstrapping.md +61 -0
  130. package/docs/integrations/agents.md +4 -4
  131. package/docs/integrations/cline.md +10 -11
  132. package/docs/integrations/codex.md +2 -2
  133. package/docs/integrations/continue.md +5 -5
  134. package/docs/integrations/copilot.md +14 -12
  135. package/docs/integrations/openclaw.md +7 -6
  136. package/docs/integrations/overview.md +7 -7
  137. package/docs/integrations/roo.md +3 -3
  138. package/docs/integrations/windsurf.md +6 -6
  139. package/docs/mcp-schema-changelog.md +51 -20
  140. package/docs/quickstart.md +48 -47
  141. package/docs/security.md +174 -15
  142. package/docs/storage.md +4 -2
  143. package/package.json +8 -6
@@ -0,0 +1,353 @@
1
+ # Event-Log Store — Cross-critique by slot B (round 2)
2
+
3
+ > Loop lop_3bf55b9492e0d96c, pln#543 step 1. Attacks proposal A
4
+ > (`event-log-store-proposal-A.md`), the shared spine, and adjudicates every
5
+ > divergence between A and B. Author: claude-code slot B. Status: CRITIQUE.
6
+
7
+ Convergence note: A and B independently chose the same spine (full snapshots,
8
+ sealed segments + checkpoints, global seq under the existing lock, lazy
9
+ projections, off/dual/primary migration). That raises confidence in the spine
10
+ but proves nothing about the edges — and the edges are where both proposals
11
+ have real bugs. Found below: one design contradiction in A (cursors vs.
12
+ sealing), one in B (seq:null vs. seq cursors), one shared escalation (seq
13
+ counter corruption in the lock-steal window), and one durability hole in A
14
+ (fsync policy silently breaks journal-first).
15
+
16
+ ---
17
+
18
+ ## 1. Attacks on proposal A
19
+
20
+ ### A1. Rename-based sealing breaks A's own cursor format — HIGH, design contradiction
21
+
22
+ A seals by renaming `active.jsonl` → `segments/seg-<first>-<last>.jsonl`
23
+ (A §5) and defines cursors as `{segment_id, offset, last_read}` (A §5,
24
+ "Cursors survive rotation"). These two choices contradict each other:
25
+
26
+ - A cursor pointing into the *active* segment carries `segment_id =
27
+ "active"` (or the active file's identity, whichever — A never says how the
28
+ active segment is identified before it has a seq-range name). After the
29
+ seal-rename, that identity now refers to a **different file** (the fresh
30
+ `active.jsonl`). The cursor's byte offset lands in the middle of unrelated
31
+ content. The reader either reads garbage from a record boundary that
32
+ doesn't exist, or silently re-reads/skips events. This is the v1 rotation
33
+ bug reintroduced with extra steps.
34
+ - Byte offsets are additionally invalidated by A's own torn-tail repair
35
+ (attack A2), which truncates the active file.
36
+ - On Windows, renaming a file held open by a concurrent reader fails
37
+ EPERM/EBUSY. A acknowledges this and says "retry/defer" — but MCP readers
38
+ are worker-per-call processes that come and go constantly; under steady
39
+ read traffic the seal can starve for a long time, during which the active
40
+ segment grows unboundedly past 10MB, eroding the bounded-rebuild claim.
41
+ "Retry until quiet" is not a protocol; it's a hope.
42
+
43
+ B's design (never rename; the active segment is born with its permanent
44
+ first-seq name; cursors are seq watermarks) dissolves all three problems
45
+ structurally. **Verdict: B wins outright; A's §5 layout and cursor format
46
+ should not survive synthesis in any part.**
47
+
48
+ ### A2. Writer-inline torn-tail quarantine is a race against the seatbelt — HIGH
49
+
50
+ A §3: "the next writer, before appending, checks the last byte of the file;
51
+ if it is not `\n`, it moves the torn bytes into quarantine." Problems:
52
+
53
+ 1. **It mutates the append-only file.** "Move bytes out" = read tail +
54
+ truncate. Truncation under a concurrent reader (who is streaming the
55
+ active segment from a byte offset, per A's own cursor format) yields
56
+ short reads or garbage. The one file class whose integrity the whole
57
+ design rests on is now edited in place on the hot path.
58
+ 2. **It races the two-writer window A itself defends against.** In the
59
+ lock-expiry break (O2 residual), writer W2 runs the last-byte check while
60
+ stale-but-alive W1 is mid-`write()`. W2 sees no trailing `\n` — because
61
+ W1's write is *in flight, not torn* — and quarantines live bytes, then
62
+ appends. W1's syscall may complete after the truncate (fd offsets under
63
+ O_APPEND are kernel-managed; truncate + concurrent append behavior is
64
+ exactly the kind of cross-platform UB this spec must not contain). A's
65
+ O_APPEND seatbelt exists *for* this window; the repair path defeats it
66
+ *in* this window.
67
+ 3. The check-then-truncate-then-append sequence is three syscalls where the
68
+ seatbelt's guarantee covers exactly one.
69
+
70
+ B's leading-`\n` framing is passive: a torn fragment is *neutralized* (it
71
+ becomes one malformed line, capped at one event) without anyone touching
72
+ existing bytes. **Verdict: B's framing wins for the hot path. Keep A's
73
+ quarantine as an offline `doctor` repair only (doctor runs under the lock,
74
+ with no concurrent appender by construction, and can safely excise + park).**
75
+
76
+ ### A3. fsync=rotate default silently breaks journal-first — HIGH
77
+
78
+ A §3 defaults to **no fsync per mutation** (only on seal/checkpoint), while
79
+ A §3.1 claims journal-first ordering. Program-order journal-first is **not**
80
+ durable-media journal-first: the OS may persist the (later) projection
81
+ writes before the (earlier) journal append. Crash in that window ⇒ a
82
+ projection **from the future** — state the journal cannot explain. B §2.3
83
+ names this exact hazard as "the single most important invariant" and pays
84
+ one fsync per `mutate()` to keep it. A's design then makes it worse: the
85
+ lazy reconciler cannot rely on spotting `applied_seq > head_seq`, because A's
86
+ `last_applied_seq` lives in `meta.json`, which is also written without
87
+ fsync, so the watermark itself may or may not have survived. Depending on
88
+ which writes hit disk, the reconciler may (a) detect an impossible
89
+ watermark, or (b) see a consistent-looking but regressed journal and **replay
90
+ backwards over newer projection state**, destroying a committed mutation's
91
+ visible effects. (b) is silent data loss — the trp_d5595086 class A invokes
92
+ everywhere else.
93
+
94
+ Two acceptable fixes; synthesis must take at least one, preferably both:
95
+ - **B's policy**: one `fsync` on the journal fd per `mutate()`, after the
96
+ last append, *before* any projection write. At brainclaw mutation rates
97
+ (user-action frequency, not hot-loop) this is affordable even on NTFS.
98
+ - **Never-regress guard**: the reconciler refuses to overwrite a projection
99
+ whose content disagrees with replay *toward an older state*; mismatch in
100
+ the regressing direction ⇒ doctor error, not write. Cheap with
101
+ `entity_rev` in the projection metadata.
102
+
103
+ **Verdict: B's fsync default wins; add the never-regress guard regardless
104
+ (defense in depth — fsync can be configured off).**
105
+
106
+ ### A4. Seq counter corruption escalates beyond the race window — HIGH (applies to B too, but A is blinder to it)
107
+
108
+ Both proposals persist `next_seq` in a small meta file (`meta.json` /
109
+ `HEAD.json`) written under the lock. In the lock-steal window, W1 and W2
110
+ both read `next_seq = N`, both append seq N, both write meta `next_seq =
111
+ N+1`. A treats this as a bounded anomaly: "duplicate seq detected via
112
+ `(seq, writer)`, reducer applies both in file order." But the damage is
113
+ **not bounded to the window**:
114
+
115
+ - The counter itself is now wrong relative to history if either writer had
116
+ appended *more than one* event (A §3 explicitly allows "append event(s)"
117
+ plural per mutation): W1 appends N, N+1; W2 appends N; meta ends at N+1 or
118
+ N+2 depending on write order — future writers can mint seqs that collide
119
+ with already-written ones **after** the race is over.
120
+ - Duplicate seqs break every consumer that treats seq as an address:
121
+ `last_applied_seq` watermarks (replay from N+1 misses the second N),
122
+ notification cursors, federation idempotency keys `(origin_store_id,
123
+ seq)`, and segment-name binary search.
124
+
125
+ A's mitigation (writer id in the envelope) detects but does not contain.
126
+ The containment fix is cheap and must be normative in the synthesis: **on
127
+ lock acquisition, the writer validates `meta.next_seq` against the actual
128
+ tail of the active segment** (read last line — O(1)) **and takes
129
+ `max(meta, tail+1)`**. This re-derives truth from the journal (the meta file
130
+ is a cache, per B's discipline) and caps any collision to the single
131
+ in-flight race write, restoring A's "bounded anomaly" claim, which is
132
+ currently asserted but not earned.
133
+
134
+ Also unaddressed by A: **pid reuse against the lock itself.** A's `writer =
135
+ pid + start-nonce` correctly survives pid reuse *for the envelope*, but if
136
+ `lock.ts` liveness-checks the owner by pid, a recycled pid makes a dead
137
+ owner look alive (lock never breakable — availability bug) or contributes to
138
+ premature breaks. Out of scope to fix here, but the spec must state the
139
+ dependency: the journal's two-writer story is only as rare as lock.ts's
140
+ steal rate, and should cite how lock.ts identifies owners (token, not pid).
141
+
142
+ ### A5. Checkpoint design: the "referencing" variant (Q3) is circular — MEDIUM, kill it
143
+
144
+ A's Q3 offers checkpoints that reference projection files by hash. This
145
+ inverts the truth direction: checkpoints exist to rebuild state when
146
+ projections are suspect; a checkpoint whose validity depends on projection
147
+ integrity is useless in exactly the scenarios it exists for (projection
148
+ corruption, the regression in A3, divergence found by `--verify-journal`).
149
+ **Verdict: kill the referencing option without further study. Checkpoints
150
+ are self-contained.** Remaining real choice: A's external file +
151
+ `checkpoint_ref` event vs. B's in-journal checkpoint event run — see D5.
152
+
153
+ A's external checkpoint also lacks a corruption story: `ckpt-<seq>.json`
154
+ has no checksum, and A's crash matrix covers only the *orphan* case (file
155
+ without meta ref), not the *corrupt referenced* case. Recovery must be:
156
+ checksum in the `checkpoint_ref` event; on mismatch, fall back to the
157
+ previous checkpoint and replay more segments (which requires sealed
158
+ segments older than the latest checkpoint to remain locally readable —
159
+ consistent with park-don't-delete, but it constrains A's `gc` archiving Q5:
160
+ never archive past the *second*-newest verified checkpoint).
161
+
162
+ ### A6. Stale-path reads take the write lock with no contention fallback — MEDIUM
163
+
164
+ A §6: read path "Behind → acquire lock, replay the delta." Two problems:
165
+
166
+ 1. **Reads now contend with writes.** After every external mutation, *every*
167
+ MCP worker that reads is "behind" and queues on the store lock — a
168
+ thundering herd of reconcilers during dispatch storms (20-agent target),
169
+ each wanting to do the same replay. A has no answer for lock-unavailable;
170
+ B serves the stale projection with a `stale: true` annotation. Synthesis:
171
+ B's fallback, plus reconcile-once semantics (whoever gets the lock heals;
172
+ others serve stale or retry briefly).
173
+ 2. A second-order cost A doesn't price: with `fsync` per mutation (A3 fix),
174
+ lock hold time grows; reads queuing on that lock amplify it.
175
+
176
+ B's open question Q3 (are stale reads acceptable for claims?) is the right
177
+ question and is a **Juan call** — claims are correctness-bearing; everything
178
+ else can tolerate annotated staleness.
179
+
180
+ ### A7. Git-diffability of segments is a non-claim — LOW, but scope both proposals honestly
181
+
182
+ A §2 ("the journal is diffable too, just verbose") and B's Q5 both flirt
183
+ with committing segments via memory-git. Attack: any branched/concurrent
184
+ store history (two worktrees, a restored backup, federation pull) appends
185
+ different lines to the **same active segment file** with **colliding seqs**
186
+ minted from divergent `next_seq` — a git merge produces line-level
187
+ conflicts in JSONL, a meta/HEAD conflict, and a semantically unmergeable seq
188
+ space. There is no sane merge driver for this. The journal is a
189
+ single-writer-lineage artifact; git is a DAG. **Verdict: segments and meta
190
+ are gitignored inside the store repo. The git-diffable identity of the store
191
+ is the projections, full stop (plus optionally checkpoints, which are
192
+ single-file snapshots and merge as whole-file conflicts a human can
193
+ adjudicate). This answers B Q5 and deletes A's "journal is diffable too"
194
+ sentence.** Cross-machine transport of segments is federation's job
195
+ (rsync/dumb-bus of immutable files), not git's.
196
+
197
+ ### A8. Smaller holes in A — LOW
198
+
199
+ - **Network filesystems**: A is silent; B correctly scopes correctness to
200
+ local FS and has doctor warn on network mounts. Adopt B's stance. Note
201
+ O_APPEND on NFS/SMB is *not* atomic; a store on a network share gets
202
+ corrupted journals under concurrency, silently.
203
+ - **Short writes**: neither proposal checks `writeSync`'s return value.
204
+ POSIX permits partial writes on regular files (signals, quota, ENOSPC
205
+ mid-write); Windows behaves similarly. A partial write *with* the lock held is a
206
+ torn line that the framing heals, but the mutation must **fail loudly** at
207
+ that point, not proceed to projections. One-line spec fix: `bytesWritten
208
+ !== buffer.length` ⇒ throw inside `mutate()`.
209
+ - **>4KB appends**: the "single write doesn't interleave" folklore is solid
210
+ for small buffers on local FS but has no formal guarantee at arbitrary
211
+ sizes on either platform. With the lock as primary and framing as
212
+ containment, this is acceptable risk — but the spec should cap a single
213
+ event line (say 256KB hard error) so a pathological payload can't turn
214
+ the folklore into a bet. Connects to B Q1 (`payload_ref` deferral): the
215
+ cap is the tripwire that tells us when deferral expires.
216
+ - **`journal_repair` events** (A §3): an event about journal damage written
217
+ *to the damaged journal* by the racy inline path (A2) — fold into doctor.
218
+
219
+ ---
220
+
221
+ ## 2. Attacks on the SHARED spine
222
+
223
+ ### S1. Full-snapshot-per-event is wrong for heartbeat-class churn — must be scoped now
224
+
225
+ Both proposals wave at assignments/agent_runs/claims as future journal
226
+ citizens (A Q4: "highest churn — biggest win, biggest blast radius"). Attack
227
+ with numbers: a claim refresh or run heartbeat every ~30s × 20 agents × 2KB
228
+ snapshot ≈ **>100MB/day of journal for zero information** — segment rolls
229
+ every couple of hours, checkpoint runs constantly, sealed-segment storage
230
+ growing ~3GB/month, all to record "still alive." Snapshot-per-event is the
231
+ *right* call for intentional state transitions and the *wrong* call for
232
+ liveness signals.
233
+
234
+ **Synthesis must rule, not defer:** heartbeat/refresh-class field updates
235
+ (claim `refreshed_at`, run liveness, lock metadata) are **ephemeral, not
236
+ journaled state** — they live in the projection/registry layer only, or in
237
+ their own non-authoritative sidecar. Only lifecycle *transitions* (claimed,
238
+ released, completed, failed) are events. **Falsifier for the spine choice:**
239
+ instrument event volume by action type during one dogfooding sprint in
240
+ `dual` mode; if any non-heartbeat action class exceeds ~50% of journal bytes
241
+ with snapshots, that class needs a delta or ref format — until then,
242
+ snapshots stand.
243
+
244
+ ### S2. Immutable history + park-don't-delete = no redaction path — product risk
245
+
246
+ A snapshot journal embeds every entity's full content in sealed, immutable,
247
+ never-deleted segments. Today, removing a leaked secret or personal datum
248
+ from the store means editing one JSON file (+ git history surgery, which is
249
+ at least *possible*). Under this design the datum persists in N sealed
250
+ segments and M checkpoints with no normative way to excise it — and the
251
+ house rule says never delete. Given the EU/GDPR positioning in the project's
252
+ strategy, "we cannot redact" is not a footnote. **Synthesis needs a
253
+ redaction mechanism**: a `redact` event + doctor-driven rewrite of affected
254
+ sealed segments (breaking immutability under a documented, locked, audited
255
+ procedure — sealed-segment immutability becomes "immutable except via
256
+ `doctor redact`"). Ugly, but the alternative is discovering this during an
257
+ incident. **Juan call on whether this ships in v1 or is a documented gap.**
258
+
259
+ ### S3. Global-seq-under-lock: where it's the wrong call, and the falsifier
260
+
261
+ The choice couples *event ordering* to *mutation serialization*. That is
262
+ correct **iff** the store lock remains a low-contention, store-global
263
+ primitive. Two falsifiers:
264
+
265
+ 1. **Lock contention at 20-agent scale.** With fsync-per-mutation (A3 fix)
266
+ plus reconciling readers (A6), the lock becomes the system's global
267
+ serialization point. If dispatch-storm profiling shows mutation latency
268
+ dominated by lock waits, the fix is per-writer journals merged on read
269
+ (vector ordering) — a real redesign. Decide by measurement, not
270
+ speculation: the Phase 1 `dual` sprint must record lock wait-time
271
+ distribution. If p95 lock wait > ~200ms under normal multi-agent load,
272
+ the spine choice is falsified.
273
+ 2. **Seq is presented as an identity but is only an *almost*-identity.**
274
+ Both proposals admit duplicate seqs in the steal window. Every downstream
275
+ consumer (cursors, federation keys, segment names) must therefore treat
276
+ `(seq)` as an address and `(seq, writer)` as the identity — A says this
277
+ once and then names segments, checkpoints, and federation keys by bare
278
+ seq. The synthesis must make `(seq, writer)` the normative identity
279
+ everywhere, with bare seq legal only where the lock's guarantees hold
280
+ (i.e., after the A4 tail-validation fix bounds collisions).
281
+
282
+ Where global-seq is *definitely* right: it costs zero new coordination
283
+ today, and every alternative (vector clocks, per-writer logs) imports
284
+ merge complexity that brainclaw's actual write rates don't justify. Keep
285
+ it; instrument the falsifier.
286
+
287
+ ---
288
+
289
+ ## 3. Divergence adjudications (A vs B)
290
+
291
+ | # | Divergence | A | B | Verdict |
292
+ |---|---|---|---|---|
293
+ | D1 | Cursor format | `{segment_id, offset}` | `{last_seq}` watermark | **B**, decisively. Offsets die under rename (A1), quarantine truncation (A2), and any future segment surgery (S2 redaction). Watermarks survive all of it and make `readUnseenEvents` rotation-proof. B's `{gap:true}` + checkpoint-summary degradation for archaeology-aged watermarks is the right notification semantics. |
294
+ | D2 | Sealing mechanics | Rename `active.jsonl` → seq-range name | Never rename; segment born with first-seq name; HEAD is a rebuildable cache | **B**, decisively (A1). Bonus: B's first-seq filename gives binary-search addressing with no index. Adopt A's `quarantine/` directory into B's layout for doctor-parked bytes. |
295
+ | D3 | Torn-tail handling | Writer-inline check + truncate-to-quarantine + `journal_repair` event | Leading-`\n` framing; reader skips; doctor reports | **B** on the hot path (A2: race-free, append-only preserved); **A's quarantine demoted to offline doctor repair**. Third option adopted: both — framing for containment, doctor for cleanup. Add the short-write check (A8) to both. |
296
+ | D4 | fsync default | `rotate` (none per mutation) | One per `mutate()`, before projection writes | **B** (A3: program-order journal-first is fiction without a barrier). Plus never-regress guard in the reconciler as defense in depth. Config knob stays; CI uses prod default (B Q2 — fidelity over speed, per test-env-contamination history). |
297
+ | D5 | Checkpoint shape | External `ckpt-<seq>.json` + `checkpoint_ref` event (+ referencing variant Q3) | In-journal checkpoint event run + terminator record | **A's external file, hardened by B's corruption discipline**: self-contained, sha256 recorded in the `checkpoint_ref` event, fall back to previous checkpoint on mismatch. Reasons: keeps segments lean (checkpoint = O(live entities) bytes that would otherwise inflate the seq-addressed stream), rebuild = load 1 file + replay tail without scanning for a terminator, and whole-file checkpoints are the one journal artifact that *can* be git-committed sanely (A7). B's terminator trick is elegant but its partial-checkpoint-run detection is strictly replaced by the checksum. **A's referencing variant is killed** (A5). |
298
+ | D6 | Event emission | Diff synthesis at `persistStateUnlocked`, then "opportunistic migration to explicit verb-level emission" | `diffToEvents` in `mutateState`, permanent | **B's permanence wins; A's migration goal is deleted.** Same mechanism, different end-state. Explicit emission at ~30 verb sites is a drift machine: every new call site is a chance to forget, double-emit, or emit-without-persisting. The diff boundary is a single choke point that is *provably* consistent with what was persisted. Explicit emission is justified only for registries that never pass through `State` (assignments/runs/loops) — and those should reuse the same append+project primitive. |
299
+ | D7 | `entity_rev` (A) vs per-writer `writer_seq` (B) | Per-entity monotonic rev in envelope | Per-writer counter for federation | **A's `entity_rev`** — it powers the never-regress guard (D4), cheap projection dirty-checks, and optimistic concurrency. It is scalar and per-store, so it is *not* sufficient for federation conflict detection (A's own Q2); origin-tagging or a vector component is federation-spec work. B's `writer_seq` adds nothing locally — drop until federation demands it. |
300
+ | D8 | Observability events | All appends under lock, with seq (implied) | Lock-free append, `seq: null`, excluded from rebuild | **A (all under lock, all get seqs)** — B has an internal contradiction: B's cursors are seq watermarks (D1), and `seq:null` records are *unaddressable* by a seq watermark; B's own notification reader cannot deliver them. B's escape ("they can cheaply take the lock") is the actual design. Notification volume is low; lock cost is negligible; uniformity means one reader, one cursor type, one ordering. B Q6 (separate notification stream) resolves to **no — same journal** unless S1 instrumentation shows notification bytes dominating segments. |
301
+ | D9 | `doctor --verify-journal` | Phase 2: rebuild in temp dir, diff vs live projections, CI on both OS + full dogfood sprint | doctor checks + kill-9 convergence tests | **Both, merged**: A's verify-rebuild-diff is the acceptance instrument (it would have caught A3's regression class); B's kill-9 storm tests are the crash-matrix executor. Neither subsumes the other. Add: two-process append stress test on both OS families (A §3) and the A4 tail-validation test. |
302
+ | D10 | Legacy v1 events | Park to `journal/legacy/` | Park to `events/archive/events.v1.jsonl` | Equivalent (both park-don't-delete, both decline to translate payload-less v1). Cosmetic; follow D2's layout. |
303
+ | D11 | Payload elision rule | Q1: payload required iff action mutates a persisted entity | Same rule, stated as design (§1.1) | Agreed by both; **adopt as normative, send the action-union → payload-requirement mapping to Codex** for hole-hunting (A Q1 stands). |
304
+
305
+ ---
306
+
307
+ ## 4. VERDICT
308
+
309
+ ### Five decisions the synthesis MUST take
310
+
311
+ 1. **Segment lifecycle (D1+D2):** never rename; segments named by first seq;
312
+ active segment is just the newest segment; `HEAD`/meta is a rebuildable
313
+ cache; cursors are seq watermarks. A's rename/offset design is dead.
314
+ 2. **Append protocol (D3+A4+A8):** leading-`\n` framing, single-buffer
315
+ write, short-write ⇒ loud mutation failure, single-line size cap;
316
+ `next_seq = max(meta, tail+1)` validated on every lock acquisition;
317
+ torn-tail excision is offline doctor-only; `(seq, writer)` is the
318
+ normative event identity.
319
+ 3. **Durability (D4):** fsync journal fd once per `mutate()` *before* any
320
+ projection write (configurable, prod default everywhere incl. CI), plus
321
+ a never-regress guard in the reconciler keyed on `entity_rev`.
322
+ 4. **Emission & scope (D6+S1):** diff-to-event synthesis at the persist
323
+ choke point is permanent — no migration to explicit call-site emission;
324
+ heartbeat/refresh-class updates are ephemeral and never journaled, only
325
+ lifecycle transitions are; all events (including observability) append
326
+ under the lock and carry seqs (D8).
327
+ 5. **Checkpoints & git (D5+A5+A7):** self-contained external checkpoint
328
+ files with sha256 recorded in an in-journal `checkpoint_ref` event,
329
+ previous-checkpoint fallback chain, gc never archives past the
330
+ second-newest verified checkpoint; segments/meta are **gitignored** —
331
+ the store's git-diffable identity is projections (+ optionally
332
+ checkpoints) only.
333
+
334
+ ### Open questions that genuinely need escalation (severity-ranked)
335
+
336
+ | Rank | Question | Owner | Why it can't be settled here |
337
+ |---|---|---|---|
338
+ | 1 | **Redaction in immutable segments** (S2): ship a `doctor redact` segment-rewrite procedure in v1, or document the gap? | **Juan** (product/compliance) | Trades the immutability invariant against GDPR positioning; pure product risk call. |
339
+ | 2 | **Stale reads for claims** (A6 / B Q3): may a contended reader serve a stale claims projection, or must claims read through the journal? | **Juan** | Correctness-vs-availability for the one entity class where staleness can cause double-work or conflicting edits. |
340
+ | 3 | **Action-union → payload mapping** (D11 / A Q1): exact rule for which `EventAction`s carry payloads, tombstone semantics per type, checkpoint/`checkpoint_ref`/genesis record schemas, dup-seq reducer semantics | **Codex** (schema review) | Needs adversarial enumeration of the full action union against the envelope; exactly Codex's strength per capability mapping. |
341
+ | 4 | **Federation conflict primitive** (A Q2 / B Q4): scalar `entity_rev` + origin tag vs vector component | **Codex + federation spec owner** | Journal design is agnostic (both proposals agree); deciding now would front-run the federation architecture. |
342
+ | 5 | **Registry entities' journal entry phase** (A Q4, post-S1-scoping): assignments/runs/claims lifecycle transitions in Phase 1 or 1.5 | **Juan** (sequencing/risk) | Pure blast-radius sequencing once S1 has excluded heartbeats; depends on sprint appetite, not design. |
343
+ | 6 | **gc archiving policy** (A Q5, now constrained by D5's two-checkpoint floor) | defer | No federation consumer exists yet; constraint recorded, policy can wait. |
344
+
345
+ ### What this critique changes about my own round-1 proposal (B)
346
+
347
+ For symmetry: B loses D5 (in-journal checkpoint run → external+checksum),
348
+ loses D8 (its `seq:null` lock-free observability events contradicted its own
349
+ cursor design), had not spotted the A4 counter-corruption escalation (B's
350
+ "impossible by construction" two-writer row was overconfident — the lock can
351
+ be broken), and adopts the `entity_rev` it lacked from A (D7). B's Q5
352
+ resolves to gitignore (A7). Convergence was not validation in either
353
+ direction.
@@ -0,0 +1,58 @@
1
+ # Event-Log Store — Phase-0 Measurements (C3 falsifier)
2
+
3
+ > Measured 2026-06-10 on the brainclaw dogfood store (`shared_agent_memory_mvp/.brainclaw`),
4
+ > per spec §6 C3: "p95 size × frequency per item_type; if a poison combination exists
5
+ > (record > 64 KB or segment rolls faster than ~weekly), `payload_ref` enters phase 1
6
+ > and the record schema changes — decide before the format ships."
7
+
8
+ ## Entity snapshot sizes (current per-entity JSON files as payload proxy)
9
+
10
+ | item_type | files | p50 B | p95 B | max B |
11
+ |---|---|---|---|---|
12
+ | assignment | 160 | 1,900 | 7,209 | 10,680 |
13
+ | claim | 383 | 692 | 5,535 | 10,385 |
14
+ | constraint | 12 | 654 | 1,542 | 1,542 |
15
+ | decision | 73 | 1,190 | 2,451 | 4,768 |
16
+ | **handoff** | **495** | **109,700** | **225,157** | **296,032** |
17
+ | plan | 193 | 2,195 | 7,481 | 13,388 |
18
+ | trap | 55 | 1,196 | 3,680 | 4,217 |
19
+
20
+ ## Event frequency (events.jsonl, 17,727 events since 2026-04)
21
+
22
+ | item_type | events (all) | events (last 7d) |
23
+ |---|---|---|
24
+ | runtime_note | 5,192 | 244 |
25
+ | session | 4,611 | 223 |
26
+ | state | 3,060 | 275 |
27
+ | agent_run | 1,387 | 424 |
28
+ | assignment | 1,316 | 401 |
29
+ | claim | 717 | 83 |
30
+ | handoff | 454 | 0 |
31
+ | plan | 399 | 41 |
32
+ | trap | 156 | 31 |
33
+ | decision | 147 | 9 |
34
+
35
+ ## Verdict — the falsifier FIRES on handoffs
36
+
37
+ - **Handoffs are ~1.7–4.5× over the 64 KB poison threshold** (p50 → max), i.e. over it at p50 already (the inline
38
+ `snapshot.diff` dominates — same root cause as the 41 MB `handoffs/compacted.jsonl`).
39
+ At historical frequency (454 events), full-snapshot handoff records would roll a
40
+ 10 MB segment in ~90 events — days, not weeks.
41
+ - **Every other entity class is comfortable** (worst p95 = plan at 7.5 KB; even
42
+ the high-churn registry classes are ≤ 7.2 KB p95). Full-snapshot-per-event stands
43
+ for everything except handoff-class payloads.
44
+
45
+ **Consequence for the spec (phase 1, per C3's own rule):** `payload_ref` enters the
46
+ record format in phase 1 for oversized payloads — recommended shape: inline snapshot
47
+ when `payload <= 64 KB`, else `payload_ref` to a content-addressed blob
48
+ (`journal/blobs/<sha256>`), with the envelope carrying the hash either way.
49
+ Alternative worth Codex's review: a handoff "diet" (externalize `snapshot.diff` from
50
+ the handoff entity itself, which would also fix the 41 MB compacted.jsonl class) —
51
+ the two are composable, not exclusive.
52
+
53
+ ~~Open for C3 review~~ — **RESOLVED 2026-06-10** in spec §2.10: blob gc extends the
54
+ two-checkpoint floor verbatim (a blob archives only when unreferenced by non-archived
55
+ segments AND by both newest verified checkpoints' closures); `runtime_note`/`session`
56
+ volume needs **no early retention knob** — both classes are payload-free
57
+ (observability) in v2, so 10k events ≈ 2–3 MB of line overhead, no threat to the
58
+ weekly-roll target (J5 unchanged). Residual product call: J6 (handoff diet).