@lcv-ideas-software/cross-review 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  import crypto from "node:crypto";
2
2
  import fs from "node:fs";
3
3
  import path from "node:path";
4
+ import lockfile from "proper-lockfile";
4
5
  import { redact } from "../security/redact.js";
5
6
  import { mergeCost, mergeUsage } from "./cost.js";
6
7
  import { PEERS } from "./types.js";
@@ -21,7 +22,7 @@ function now() {
21
22
  const ATOMIC_WRITE_RETRY_CODES = new Set(["EPERM", "EACCES", "EBUSY", "EEXIST"]);
22
23
  const ATOMIC_WRITE_MAX_ATTEMPTS = 5;
23
24
  const TMP_NONCE_BYTES = 2;
24
- function writeJson(file, data) {
25
+ async function writeJson(file, data) {
25
26
  fs.mkdirSync(path.dirname(file), { recursive: true });
26
27
  const nonce = crypto.randomBytes(TMP_NONCE_BYTES).toString("hex");
27
28
  const tmp = `${file}.${process.pid}.${Date.now()}.${nonce}.tmp`;
@@ -37,11 +38,17 @@ function writeJson(file, data) {
37
38
  const code = err.code;
38
39
  if (!code || !ATOMIC_WRITE_RETRY_CODES.has(code))
39
40
  break;
41
+ // v4.1.0 hardening: pre-v4.1.0 used `while (Date.now() - start <
42
+ // wait) {}` busy-wait which blocked the single Node.js event loop
43
+ // thread for up to 310 ms (10+20+40+80+160) under repeated
44
+ // Windows-AV-induced EPERM/EBUSY contention. The CPU-burning
45
+ // busy-wait starved SSE streaming + concurrent sessions + MCP
46
+ // stdio reads. Now the backoff awaits a Promise-based timer:
47
+ // event loop remains fully responsive between attempts.
40
48
  const wait = 10 * 2 ** attempt; // 10, 20, 40, 80, 160 ms
41
- const start = Date.now();
42
- while (Date.now() - start < wait) {
43
- /* spin — sync write path, brief by design */
44
- }
49
+ await new Promise((resolve) => {
50
+ setTimeout(resolve, wait);
51
+ });
45
52
  }
46
53
  }
47
54
  // Terminal failure path: best-effort tmp cleanup so callers don't see
@@ -99,6 +106,16 @@ export class SessionStore {
99
106
  // monotonically thereafter. Restart re-initializes from disk, so seq
100
107
  // remains correct across process boundaries.
101
108
  seqCache = new Map();
109
+ // v4.1.0: track in-flight fire-and-forget appendEvent promises so
110
+ // callers that need synchronous read-after-write semantics (smoke
111
+ // tests, post-round aggregation) can call `flushPendingEvents()` to
112
+ // wait for all pending event writes to settle before reading.
113
+ // appendEvent is async because withSessionLock is async (proper-
114
+ // lockfile); the emit pipeline must stay sync, so it uses
115
+ // `void store.appendEvent(event)` and the store remembers the
116
+ // promise here. Promises resolve/reject within appendEvent's own
117
+ // try/catch — flushPendingEvents() therefore always settles, never rejects.
118
+ pendingEventWrites = new Set();
102
119
  constructor(config) {
103
120
  this.config = config;
104
121
  fs.mkdirSync(this.sessionsDir(), { recursive: true });
@@ -140,10 +157,6 @@ export class SessionStore {
140
157
  return false;
141
158
  }
142
159
  }
143
- sleepSync(ms) {
144
- const buffer = new SharedArrayBuffer(4);
145
- Atomics.wait(new Int32Array(buffer), 0, 0, ms);
146
- }
147
160
  totalsFor(meta) {
148
161
  const peerResults = meta.rounds.flatMap((round) => round.peers);
149
162
  const generations = meta.generation_files ?? [];
@@ -158,46 +171,114 @@ export class SessionStore {
158
171
  ]),
159
172
  };
160
173
  }
161
- withSessionLock(sessionId, fn) {
174
+ // v4.1.0 hardening: pre-v4.1.0 acquired the lock via an exclusive
175
+ // file-create syscall followed by a separate JSON metadata write,
176
+ // which had a multi-process TOCTOU race window. Process A's create
177
+ // returned an empty inode + fd; before A's metadata write executed,
178
+ // process B could observe the empty file, fail to JSON-parse it,
179
+ // remove the lock path, create its own valid lock, and enter the
180
+ // critical section. Process A would then write into the now-orphan
181
+ // inode via the still-open fd and ALSO enter the critical section,
182
+ // corrupting meta.json. proper-lockfile uses `fs.mkdir` (atomic
183
+ // across NTFS and POSIX) so the lock comes into existence as a
184
+ // directory in a single syscall — no empty-window race possible.
185
+ // The mkdir-based lock also fixes the lock-holder freshness signal:
186
+ // proper-lockfile's `update` interval touches the lockfile's mtime
187
+ // every 5 s, and any other process treats the lock as stale once the
188
+ // mtime is older than `stale` ms (120 s). This is more robust than
189
+ // the pre-v4.1.0 PID-aliveness check, which had collision risk after
190
+ // process restart.
191
+ async withSessionLock(sessionId, fn) {
162
192
  const dir = this.sessionDir(sessionId);
163
- const lockPath = path.join(dir, ".lock");
164
- const timeoutAt = Date.now() + 30_000;
165
- while (true) {
193
+ const target = this.metaPath(sessionId);
194
+ const lockfilePath = path.join(dir, ".lock");
195
+ fs.mkdirSync(dir, { recursive: true });
196
+ // proper-lockfile needs the target path to exist only when it resolves
197
+ // it via realpath. NOTE(review): lock() below passes `realpath: false`,
198
+ // so confirm this placeholder is still required. Init creates the session
199
+ // dir then immediately calls withSessionLock-protected writes; the empty
200
+ // meta placeholder lets that first lock succeed. Existing meta is preserved (`wx`).
201
+ if (!fs.existsSync(target)) {
166
202
  try {
167
- const fd = fs.openSync(lockPath, "wx");
168
- fs.writeFileSync(fd, JSON.stringify({ pid: process.pid, acquired_at: now() }));
169
- fs.closeSync(fd);
170
- break;
203
+ fs.writeFileSync(target, "{}\n", { flag: "wx" });
171
204
  }
172
- catch (error) {
173
- if (error.code !== "EEXIST")
174
- throw error;
175
- try {
176
- const lock = readJson(lockPath);
177
- const age = lock.acquired_at ? Date.now() - Date.parse(lock.acquired_at) : Infinity;
178
- if (!lock.pid || age > 120_000 || !this.processAlive(lock.pid)) {
179
- fs.rmSync(lockPath, { force: true });
180
- continue;
181
- }
182
- }
183
- catch {
184
- fs.rmSync(lockPath, { force: true });
185
- continue;
186
- }
187
- if (Date.now() >= timeoutAt) {
188
- throw new Error(`timed out waiting for session lock: ${sessionId}`, { cause: error });
189
- }
190
- this.sleepSync(100);
205
+ catch (err) {
206
+ if (err.code !== "EEXIST")
207
+ throw err;
208
+ /* concurrent process created it; fine */
191
209
  }
192
210
  }
211
+ // Pre-v4.1.0 legacy `.lock` regular file detection — FAIL CLOSED.
212
+ //
213
+ // Pre-v4.1.0 created `.lock` as a regular file containing
214
+ // `{pid, acquired_at}` JSON. proper-lockfile claims `.lock` as a DIRECTORY
215
+ // via mkdir, so a leftover regular file blocks every subsequent
216
+ // lockfile.lock() with EEXIST. The original v4.1.0 design tried
217
+ // to auto-clean stale legacy files. Codex (session 059b0093 R1
218
+ // through R4) progressively demonstrated that NO auto-clean is
219
+ // safe under live cross-version operation:
220
+ //
221
+ // • R1: unconditional removal split-brained with a live legacy
222
+ // holder.
223
+ // • R2: removal-when-pid-alive-but-mtime-stale split-brained
224
+ // because legacy locks do not heartbeat (mtime is frozen at
225
+ // acquisition).
226
+ // • R3: per-process atomic decisions still raced two v4.1
227
+ // migrators.
228
+ // • R4: serializing v4.1 migrators via a separate mutex still
229
+ // left the cross-version race: v4.0.x's own stale-removal
230
+ // path does not honor any v4.1 mutex, so a concurrent v4.0.x
231
+ // could remove a stale `.lock` and create its own live one
232
+ // between v4.1's read and v4.1's path-based rmSync —
233
+ // v4.1 then deletes the new live legacy lock → split-brain.
234
+ //
235
+ // Resolution: v4.1.0 NEVER auto-removes a legacy regular `.lock`
236
+ // file. If one is observed, withSessionLock throws a clear
237
+ // remediation error to the caller, instructing the operator to
238
+ // stop all cross-review processes and remove the file manually.
239
+ // This is a ONE-TIME operator step at v4.0.x → v4.1.0 upgrade.
240
+ // After all hosts are on v4.1.0 the locks are mkdir-atomic and
241
+ // the issue cannot recur.
193
242
  try {
194
- return fn();
243
+ const stat = fs.statSync(lockfilePath);
244
+ if (stat.isFile()) {
245
+ throw new Error(`cross-review v4.1.0 detected a pre-v4.1.0 lock file at ${lockfilePath}. ` +
246
+ `Live cross-version migration is not supported (would split-brain with any ` +
247
+ `concurrent v4.0.x process). To migrate safely: (1) stop all cross-review ` +
248
+ `processes / close all MCP hosts that loaded the server, (2) remove the ` +
249
+ `legacy lock file, (3) restart. POSIX one-liner for full cleanup: ` +
250
+ `\`find ${this.config.data_dir}/sessions -name .lock -type f -delete\`. ` +
251
+ `See CHANGELOG v04.01.00 migration notes for the rationale.`);
252
+ }
253
+ }
254
+ catch (err) {
255
+ if (err instanceof Error && err.message.includes("detected a pre-v4.1.0 lock file")) {
256
+ throw err;
257
+ }
258
+ if (err.code !== "ENOENT") {
259
+ /* ignore other stat errors; lockfile.lock will surface them */
260
+ }
261
+ }
262
+ const release = await lockfile.lock(target, {
263
+ stale: 120_000,
264
+ update: 5_000,
265
+ retries: { retries: 30, factor: 1.5, minTimeout: 100, maxTimeout: 1_000 },
266
+ realpath: false,
267
+ lockfilePath,
268
+ });
269
+ try {
270
+ return await fn();
195
271
  }
196
272
  finally {
197
- fs.rmSync(lockPath, { force: true });
273
+ try {
274
+ await release();
275
+ }
276
+ catch {
277
+ /* lock was already released by stale-detection or sibling process */
278
+ }
198
279
  }
199
280
  }
200
- init(task, caller, snapshot, reviewFocus) {
281
+ async init(task, caller, snapshot, reviewFocus) {
201
282
  const session_id = crypto.randomUUID();
202
283
  // v2.22.0 (B.P3): snapshot the cost ceiling at session_init time so
203
284
  // budget pressure analysis is decoupled from later env-var mutation.
@@ -227,7 +308,7 @@ export class SessionStore {
227
308
  budget_warning_emitted: false,
228
309
  };
229
310
  fs.mkdirSync(path.join(this.sessionDir(session_id), "agent-runs"), { recursive: true });
230
- writeJson(this.metaPath(session_id), meta);
311
+ await writeJson(this.metaPath(session_id), meta);
231
312
  fs.writeFileSync(path.join(this.sessionDir(session_id), "task.md"), task, "utf8");
232
313
  if (reviewFocus) {
233
314
  fs.writeFileSync(path.join(this.sessionDir(session_id), "review-focus.md"), reviewFocus, "utf8");
@@ -243,8 +324,8 @@ export class SessionStore {
243
324
  // R5 throws when in_flight is already populated; the boot-time
244
325
  // `clearStaleInFlight` sweep clears any orphan in_flight from a
245
326
  // crashed prior host so legitimate operators are not blocked.
246
- markInFlight(sessionId, params) {
247
- return this.withSessionLock(sessionId, () => {
327
+ async markInFlight(sessionId, params) {
328
+ return this.withSessionLock(sessionId, async () => {
248
329
  const meta = this.read(sessionId);
249
330
  if (meta.in_flight) {
250
331
  throw new Error(`session ${sessionId} already has an in-flight round (round=${meta.in_flight.round}, started_at=${meta.in_flight.started_at}); refusing to start a concurrent round. Wait for the round to complete, cancel it via session_cancel_job, or recover it via session_recover_interrupted.`);
@@ -262,7 +343,7 @@ export class SessionStore {
262
343
  detail: `Round ${params.round} is running.`,
263
344
  };
264
345
  meta.updated_at = now();
265
- writeJson(this.metaPath(sessionId), meta);
346
+ await writeJson(this.metaPath(sessionId), meta);
266
347
  return meta;
267
348
  });
268
349
  }
@@ -304,23 +385,47 @@ export class SessionStore {
304
385
  commitSeq(sessionId, committed) {
305
386
  this.seqCache.set(sessionId, committed);
306
387
  }
307
- appendEvent(event) {
388
+ // v4.1.0: durable event persistence. withSessionLock became async
389
+ // with the proper-lockfile refactor; appendEvent awaits the lock so
390
+ // callers that read events after persisting get the expected
391
+ // synchronous-write semantics (e.g. the session_doctor sweep + smoke
392
+ // fixtures that read events.ndjson immediately after appendEvent).
393
+ // Fire-and-forget callers wrap with `void store.appendEvent(...)`.
394
+ async appendEvent(event) {
308
395
  const sessionId = event.session_id;
309
396
  if (!sessionId)
310
397
  return;
311
- try {
312
- this.withSessionLock(sessionId, () => {
313
- const file = this.eventsPath(sessionId);
314
- const seq = this.peekNextSeq(sessionId, file);
315
- fs.appendFileSync(file, `${JSON.stringify({ ...event, seq, ts: event.ts ?? now() })}\n`, "utf8");
316
- // Only commit the cache AFTER the durable append succeeded.
317
- // If appendFileSync threw above, the cache still reflects the
318
- // last persisted seq and the next call reuses this seq number.
319
- this.commitSeq(sessionId, seq);
320
- });
321
- }
322
- catch {
323
- // Event persistence must never break provider calls or MCP responses.
398
+ const write = (async () => {
399
+ try {
400
+ await this.withSessionLock(sessionId, () => {
401
+ const file = this.eventsPath(sessionId);
402
+ const seq = this.peekNextSeq(sessionId, file);
403
+ fs.appendFileSync(file, `${JSON.stringify({ ...event, seq, ts: event.ts ?? now() })}\n`, "utf8");
404
+ // Only commit the cache AFTER the durable append succeeded.
405
+ // If appendFileSync threw above, the cache still reflects the
406
+ // last persisted seq and the next call reuses this seq number.
407
+ this.commitSeq(sessionId, seq);
408
+ });
409
+ }
410
+ catch {
411
+ // Event persistence must never break provider calls or MCP responses.
412
+ }
413
+ })();
414
+ this.pendingEventWrites.add(write);
415
+ void write.finally(() => {
416
+ this.pendingEventWrites.delete(write);
417
+ });
418
+ return write;
419
+ }
420
+ // v4.1.0: wait for all in-flight fire-and-forget event writes to
421
+ // settle. Used by tests/sweeps that need synchronous read-after-write
422
+ // semantics for events.ndjson when the emit pipeline used
423
+ // `void store.appendEvent(...)`. Always resolves (never rejects);
424
+ // appendEvent swallows its own errors.
425
+ async flushPendingEvents() {
426
+ while (this.pendingEventWrites.size > 0) {
427
+ const snapshot = Array.from(this.pendingEventWrites);
428
+ await Promise.allSettled(snapshot);
324
429
  }
325
430
  }
326
431
  readEvents(sessionId, sinceSeq = 0) {
@@ -415,11 +520,11 @@ export class SessionStore {
415
520
  fs.writeFileSync(file, redact(draft), "utf8");
416
521
  return path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
417
522
  }
418
- saveGeneration(sessionId, round, result, label = "generation") {
523
+ async saveGeneration(sessionId, round, result, label = "generation") {
419
524
  const file = path.join(this.sessionDir(sessionId), "agent-runs", `round-${round}-${result.peer}-${label}.json`);
420
- writeJson(file, { ...result, text: redact(result.text) });
525
+ await writeJson(file, { ...result, text: redact(result.text) });
421
526
  const relativePath = path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
422
- this.withSessionLock(sessionId, () => {
527
+ await this.withSessionLock(sessionId, async () => {
423
528
  const meta = this.read(sessionId);
424
529
  const artifact = {
425
530
  ts: now(),
@@ -434,7 +539,7 @@ export class SessionStore {
434
539
  meta.generation_files = [...(meta.generation_files ?? []), artifact];
435
540
  meta.totals = this.totalsFor(meta);
436
541
  meta.updated_at = now();
437
- writeJson(this.metaPath(sessionId), meta);
542
+ await writeJson(this.metaPath(sessionId), meta);
438
543
  });
439
544
  return relativePath;
440
545
  }
@@ -448,18 +553,18 @@ export class SessionStore {
448
553
  fs.writeFileSync(file, redact(text), "utf8");
449
554
  return path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
450
555
  }
451
- savePeerResult(sessionId, round, result, label = "response") {
556
+ async savePeerResult(sessionId, round, result, label = "response") {
452
557
  const file = path.join(this.sessionDir(sessionId), "agent-runs", `round-${round}-${result.peer}-${label}.json`);
453
- writeJson(file, { ...result, text: redact(result.text) });
558
+ await writeJson(file, { ...result, text: redact(result.text) });
454
559
  return path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
455
560
  }
456
- savePeerFailure(sessionId, round, failure) {
561
+ async savePeerFailure(sessionId, round, failure) {
457
562
  const file = path.join(this.sessionDir(sessionId), "agent-runs", `round-${round}-${failure.peer}-failure.json`);
458
- writeJson(file, { ...failure, message: redact(failure.message) });
563
+ await writeJson(file, { ...failure, message: redact(failure.message) });
459
564
  return path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
460
565
  }
461
- appendRound(sessionId, params) {
462
- return this.withSessionLock(sessionId, () => {
566
+ async appendRound(sessionId, params) {
567
+ return this.withSessionLock(sessionId, async () => {
463
568
  const meta = this.read(sessionId);
464
569
  // v3.2.0 (Codex bug report 2026-05-12): refuse to append a round
465
570
  // to a finalized session. Otherwise the per-round
@@ -507,19 +612,19 @@ export class SessionStore {
507
612
  // diff-based drift if a peer's cost changed in a retry loop.
508
613
  const roundCost = params.peers.reduce((sum, peer) => sum + (peer.cost?.total_cost ?? 0), 0);
509
614
  meta.costs_per_round = [...(meta.costs_per_round ?? []), roundCost];
510
- writeJson(this.metaPath(sessionId), meta);
615
+ await writeJson(this.metaPath(sessionId), meta);
511
616
  return round;
512
617
  });
513
618
  }
514
619
  // v2.22.0 (B.P3): one-shot guard for `session.budget_warning` emit
515
620
  // idempotency. Persisted in meta.json so the warning fires at most
516
621
  // once per session even across host restarts.
517
- markBudgetWarningEmitted(sessionId) {
518
- return this.withSessionLock(sessionId, () => {
622
+ async markBudgetWarningEmitted(sessionId) {
623
+ return this.withSessionLock(sessionId, async () => {
519
624
  const meta = this.read(sessionId);
520
625
  meta.budget_warning_emitted = true;
521
626
  meta.updated_at = now();
522
- writeJson(this.metaPath(sessionId), meta);
627
+ await writeJson(this.metaPath(sessionId), meta);
523
628
  return meta;
524
629
  });
525
630
  }
@@ -527,12 +632,12 @@ export class SessionStore {
527
632
  // orchestrator's circular loop calls this every round so resumed
528
633
  // sessions can pick up the rotation cursor and consecutive-no-change
529
634
  // count from disk without re-deriving them by walking events.
530
- setCircularState(sessionId, state) {
531
- return this.withSessionLock(sessionId, () => {
635
+ async setCircularState(sessionId, state) {
636
+ return this.withSessionLock(sessionId, async () => {
532
637
  const meta = this.read(sessionId);
533
638
  meta.circular_state = state;
534
639
  meta.updated_at = now();
535
- writeJson(this.metaPath(sessionId), meta);
640
+ await writeJson(this.metaPath(sessionId), meta);
536
641
  return meta;
537
642
  });
538
643
  }
@@ -545,8 +650,8 @@ export class SessionStore {
545
650
  // max_rounds the caller actually requested. This fills that gap with
546
651
  // pure-additive metadata; `cost_ceiling_usd` is kept in sync with
547
652
  // `effective_cost_ceiling_usd` for back-compat with v3.4.x readers.
548
- setSessionTraceability(sessionId, traceability) {
549
- return this.withSessionLock(sessionId, () => {
653
+ async setSessionTraceability(sessionId, traceability) {
654
+ return this.withSessionLock(sessionId, async () => {
550
655
  const meta = this.read(sessionId);
551
656
  meta.requested_max_rounds = traceability.requested_max_rounds;
552
657
  meta.effective_max_rounds = traceability.effective_max_rounds;
@@ -557,7 +662,7 @@ export class SessionStore {
557
662
  // only know `cost_ceiling_usd` still see the effective ceiling.
558
663
  meta.cost_ceiling_usd = traceability.effective_cost_ceiling_usd;
559
664
  meta.updated_at = now();
560
- writeJson(this.metaPath(sessionId), meta);
665
+ await writeJson(this.metaPath(sessionId), meta);
561
666
  return meta;
562
667
  });
563
668
  }
@@ -576,8 +681,8 @@ export class SessionStore {
576
681
  throw err;
577
682
  }
578
683
  }
579
- finalize(sessionId, outcome, reason) {
580
- return this.withSessionLock(sessionId, () => {
684
+ async finalize(sessionId, outcome, reason) {
685
+ return this.withSessionLock(sessionId, async () => {
581
686
  const meta = this.read(sessionId);
582
687
  // v3.2.0 (Codex bug report 2026-05-12): when the caller asserts
583
688
  // outcome="converged", the latest round (if any) MUST have
@@ -606,12 +711,12 @@ export class SessionStore {
606
711
  detail: reason ?? outcome,
607
712
  };
608
713
  meta.updated_at = now();
609
- writeJson(this.metaPath(sessionId), meta);
714
+ await writeJson(this.metaPath(sessionId), meta);
610
715
  return meta;
611
716
  });
612
717
  }
613
- requestCancellation(sessionId, reason = "operator_requested", jobId) {
614
- return this.withSessionLock(sessionId, () => {
718
+ async requestCancellation(sessionId, reason = "operator_requested", jobId) {
719
+ return this.withSessionLock(sessionId, async () => {
615
720
  const meta = this.read(sessionId);
616
721
  meta.control = {
617
722
  status: "cancel_requested",
@@ -626,12 +731,12 @@ export class SessionStore {
626
731
  detail: `Cancellation requested: ${reason}`,
627
732
  };
628
733
  meta.updated_at = now();
629
- writeJson(this.metaPath(sessionId), meta);
734
+ await writeJson(this.metaPath(sessionId), meta);
630
735
  return meta;
631
736
  });
632
737
  }
633
- markCancelled(sessionId, reason = "cancelled") {
634
- return this.withSessionLock(sessionId, () => {
738
+ async markCancelled(sessionId, reason = "cancelled") {
739
+ return this.withSessionLock(sessionId, async () => {
635
740
  const meta = this.read(sessionId);
636
741
  meta.outcome = "aborted";
637
742
  meta.outcome_reason = reason;
@@ -649,7 +754,7 @@ export class SessionStore {
649
754
  detail: reason,
650
755
  };
651
756
  meta.updated_at = now();
652
- writeJson(this.metaPath(sessionId), meta);
757
+ await writeJson(this.metaPath(sessionId), meta);
653
758
  return meta;
654
759
  });
655
760
  }
@@ -657,12 +762,12 @@ export class SessionStore {
657
762
  const meta = this.read(sessionId);
658
763
  return meta.control?.status === "cancel_requested";
659
764
  }
660
- appendFallbackEvent(sessionId, event) {
661
- return this.withSessionLock(sessionId, () => {
765
+ async appendFallbackEvent(sessionId, event) {
766
+ return this.withSessionLock(sessionId, async () => {
662
767
  const meta = this.read(sessionId);
663
768
  meta.fallback_events = [...(meta.fallback_events ?? []), event];
664
769
  meta.updated_at = now();
665
- writeJson(this.metaPath(sessionId), meta);
770
+ await writeJson(this.metaPath(sessionId), meta);
666
771
  return meta;
667
772
  });
668
773
  }
@@ -672,10 +777,10 @@ export class SessionStore {
672
777
  // across rounds increments `round_count` instead of producing
673
778
  // duplicate entries. Returns the updated checklist (or empty array
674
779
  // if nothing was added/updated).
675
- appendEvidenceChecklistItems(sessionId, round, incoming) {
780
+ async appendEvidenceChecklistItems(sessionId, round, incoming) {
676
781
  if (!incoming.length)
677
782
  return [];
678
- return this.withSessionLock(sessionId, () => {
783
+ return this.withSessionLock(sessionId, async () => {
679
784
  const meta = this.read(sessionId);
680
785
  const existing = meta.evidence_checklist ?? [];
681
786
  const byId = new Map(existing.map((item) => [item.id, item]));
@@ -724,7 +829,7 @@ export class SessionStore {
724
829
  });
725
830
  meta.evidence_checklist = updated;
726
831
  meta.updated_at = ts;
727
- writeJson(this.metaPath(sessionId), meta);
832
+ await writeJson(this.metaPath(sessionId), meta);
728
833
  return updated;
729
834
  });
730
835
  }
@@ -750,8 +855,8 @@ export class SessionStore {
750
855
  // by the orchestrator via a separate event so operators see when peers
751
856
  // keep asking for items they explicitly closed; the status itself is
752
857
  // operator-owned.
753
- runEvidenceChecklistAddressDetection(sessionId, currentRound) {
754
- return this.withSessionLock(sessionId, () => {
858
+ async runEvidenceChecklistAddressDetection(sessionId, currentRound) {
859
+ return this.withSessionLock(sessionId, async () => {
755
860
  const meta = this.read(sessionId);
756
861
  const checklist = meta.evidence_checklist ?? [];
757
862
  if (!checklist.length) {
@@ -824,7 +929,7 @@ export class SessionStore {
824
929
  if (notResurfaced.length || reopened.length) {
825
930
  meta.evidence_status_history = history;
826
931
  meta.updated_at = ts;
827
- writeJson(this.metaPath(sessionId), meta);
932
+ await writeJson(this.metaPath(sessionId), meta);
828
933
  }
829
934
  return {
830
935
  not_resurfaced: notResurfaced,
@@ -843,8 +948,8 @@ export class SessionStore {
843
948
  // "not_resurfaced" — both are runtime-managed (judge promotion and
844
949
  // resurfacing inference respectively). Returns the mutated item and the
845
950
  // appended history entry.
846
- setEvidenceChecklistItemStatus(sessionId, itemId, status, options = {}) {
847
- return this.withSessionLock(sessionId, () => {
951
+ async setEvidenceChecklistItemStatus(sessionId, itemId, status, options = {}) {
952
+ return this.withSessionLock(sessionId, async () => {
848
953
  const meta = this.read(sessionId);
849
954
  const checklist = meta.evidence_checklist ?? [];
850
955
  const item = checklist.find((entry) => entry.id === itemId);
@@ -878,7 +983,7 @@ export class SessionStore {
878
983
  meta.evidence_status_history = history;
879
984
  meta.evidence_checklist = checklist;
880
985
  meta.updated_at = ts;
881
- writeJson(this.metaPath(sessionId), meta);
986
+ await writeJson(this.metaPath(sessionId), meta);
882
987
  return { item, history_entry: entry };
883
988
  });
884
989
  }
@@ -887,8 +992,8 @@ export class SessionStore {
887
992
  // moves anything other than open. Atomic under the session lock.
888
993
  // Returns null when the item is not currently `open` (already
889
994
  // addressed, terminal, or missing) so the caller can skip emit.
890
- markEvidenceItemAddressedByJudge(sessionId, itemId, params) {
891
- return this.withSessionLock(sessionId, () => {
995
+ async markEvidenceItemAddressedByJudge(sessionId, itemId, params) {
996
+ return this.withSessionLock(sessionId, async () => {
892
997
  const meta = this.read(sessionId);
893
998
  const checklist = meta.evidence_checklist ?? [];
894
999
  const item = checklist.find((entry) => entry.id === itemId);
@@ -920,16 +1025,16 @@ export class SessionStore {
920
1025
  meta.evidence_status_history = history;
921
1026
  meta.evidence_checklist = checklist;
922
1027
  meta.updated_at = ts;
923
- writeJson(this.metaPath(sessionId), meta);
1028
+ await writeJson(this.metaPath(sessionId), meta);
924
1029
  return { item, history_entry: entry };
925
1030
  });
926
1031
  }
927
- recoverInterruptedSessions(activeSessionIds = new Set()) {
1032
+ async recoverInterruptedSessions(activeSessionIds = new Set()) {
928
1033
  const recovered = [];
929
1034
  for (const session of this.list()) {
930
1035
  if (session.outcome || activeSessionIds.has(session.session_id) || !session.in_flight)
931
1036
  continue;
932
- const updated = this.withSessionLock(session.session_id, () => {
1037
+ const updated = await this.withSessionLock(session.session_id, async () => {
933
1038
  const current = this.read(session.session_id);
934
1039
  if (current.outcome || activeSessionIds.has(current.session_id) || !current.in_flight) {
935
1040
  return current;
@@ -947,7 +1052,7 @@ export class SessionStore {
947
1052
  detail: `Recovered interrupted round ${round} after MCP restart. Start a new round to continue from saved session context.`,
948
1053
  };
949
1054
  current.updated_at = now();
950
- writeJson(this.metaPath(current.session_id), current);
1055
+ await writeJson(this.metaPath(current.session_id), current);
951
1056
  return current;
952
1057
  });
953
1058
  recovered.push(updated);
@@ -1162,7 +1267,7 @@ export class SessionStore {
1162
1267
  // `item_types` (open items grouped by surfacing peer) and
1163
1268
  // `chronic_blockers` (item ids with `round_count >= 3`) so operators
1164
1269
  // can see which evidence asks are systemic vs. a noisy long tail.
1165
- sessionDoctor(limit = 20, includeLegacy = false, repair = false) {
1270
+ async sessionDoctor(limit = 20, includeLegacy = false, repair = false) {
1166
1271
  const cappedLimit = Math.max(1, Math.min(100, Math.trunc(limit) || 20));
1167
1272
  // v3.6.0 (C): opt-in repair pass BEFORE the read-only audit. Fixes
1168
1273
  // the contradictory `outcome="converged" + health.state="blocked"`
@@ -1184,7 +1289,7 @@ export class SessionStore {
1184
1289
  // for manual operator inspection rather than guessing.
1185
1290
  if (latestConverged) {
1186
1291
  const fromState = session.convergence_health?.state;
1187
- const fixed = this.withSessionLock(session.session_id, () => {
1292
+ const fixed = await this.withSessionLock(session.session_id, async () => {
1188
1293
  const meta = this.read(session.session_id);
1189
1294
  if (meta.outcome === "converged" &&
1190
1295
  meta.convergence_health?.state === "blocked" &&
@@ -1195,7 +1300,7 @@ export class SessionStore {
1195
1300
  detail: `v3.6.0 doctor repair: recomputed health from latest round (was "blocked" with outcome="converged" — pre-v3.2.0 corruption artifact)`,
1196
1301
  };
1197
1302
  meta.updated_at = now();
1198
- writeJson(this.metaPath(session.session_id), meta);
1303
+ await writeJson(this.metaPath(session.session_id), meta);
1199
1304
  return true;
1200
1305
  }
1201
1306
  return false;
@@ -1576,7 +1681,7 @@ export class SessionStore {
1576
1681
  // original session is preserved (append-only); a new session opens
1577
1682
  // for re-deliberation with a fresh task + initial_draft and a
1578
1683
  // structural reference back to the contested session.
1579
- contestVerdict(params) {
1684
+ async contestVerdict(params) {
1580
1685
  const original = this.read(params.session_id);
1581
1686
  if (!original.outcome) {
1582
1687
  throw new Error(`cannot_contest_in_flight_session: session ${params.session_id} has no outcome yet (still in flight). Wait for it to converge or finalize before contesting.`);
@@ -1585,17 +1690,17 @@ export class SessionStore {
1585
1690
  throw new Error(`session_already_contested: session ${params.session_id} was already contested at ${original.contestation.contested_at} (new_session_id=${original.contestation.new_session_id}).`);
1586
1691
  }
1587
1692
  const newCaller = params.new_caller ?? "operator";
1588
- const newSession = this.init(params.new_task, newCaller, [], undefined);
1693
+ const newSession = await this.init(params.new_task, newCaller, [], undefined);
1589
1694
  // Cross-link new session → original.
1590
- this.withSessionLock(newSession.session_id, () => {
1695
+ await this.withSessionLock(newSession.session_id, async () => {
1591
1696
  const m = this.read(newSession.session_id);
1592
1697
  m.contests_session_id = params.session_id;
1593
1698
  m.updated_at = now();
1594
- writeJson(this.metaPath(newSession.session_id), m);
1699
+ await writeJson(this.metaPath(newSession.session_id), m);
1595
1700
  return m;
1596
1701
  });
1597
1702
  // Stamp original with contestation record.
1598
- const contestedMeta = this.withSessionLock(params.session_id, () => {
1703
+ const contestedMeta = await this.withSessionLock(params.session_id, async () => {
1599
1704
  const m = this.read(params.session_id);
1600
1705
  m.contestation = {
1601
1706
  contested_at: now(),
@@ -1604,19 +1709,19 @@ export class SessionStore {
1604
1709
  new_session_id: newSession.session_id,
1605
1710
  };
1606
1711
  m.updated_at = now();
1607
- writeJson(this.metaPath(params.session_id), m);
1712
+ await writeJson(this.metaPath(params.session_id), m);
1608
1713
  return m;
1609
1714
  });
1610
1715
  return { contested_meta: contestedMeta, new_session_id: newSession.session_id };
1611
1716
  }
1612
- attachEvidence(sessionId, params) {
1717
+ async attachEvidence(sessionId, params) {
1613
1718
  const extension = safeFilePart(params.extension ?? "txt").replace(/\./g, "") || "txt";
1614
1719
  const label = safeFilePart(params.label);
1615
1720
  const relativePath = `evidence/${timestampFilePart()}-${label}.${extension}`;
1616
1721
  const file = path.join(this.sessionDir(sessionId), relativePath);
1617
1722
  fs.mkdirSync(path.dirname(file), { recursive: true });
1618
1723
  fs.writeFileSync(file, redact(params.content), "utf8");
1619
- const meta = this.withSessionLock(sessionId, () => {
1724
+ const meta = await this.withSessionLock(sessionId, async () => {
1620
1725
  const current = this.read(sessionId);
1621
1726
  current.evidence_files = [
1622
1727
  ...(current.evidence_files ?? []),
@@ -1628,13 +1733,13 @@ export class SessionStore {
1628
1733
  },
1629
1734
  ];
1630
1735
  current.updated_at = now();
1631
- writeJson(this.metaPath(sessionId), current);
1736
+ await writeJson(this.metaPath(sessionId), current);
1632
1737
  return current;
1633
1738
  });
1634
1739
  return { path: relativePath.replace(/\\/g, "/"), meta };
1635
1740
  }
1636
- escalateToOperator(sessionId, params) {
1637
- return this.withSessionLock(sessionId, () => {
1741
+ async escalateToOperator(sessionId, params) {
1742
+ return this.withSessionLock(sessionId, async () => {
1638
1743
  const meta = this.read(sessionId);
1639
1744
  meta.operator_escalations = [
1640
1745
  ...(meta.operator_escalations ?? []),
@@ -1646,11 +1751,11 @@ export class SessionStore {
1646
1751
  detail: `Operator escalation requested: ${params.reason}`,
1647
1752
  };
1648
1753
  meta.updated_at = now();
1649
- writeJson(this.metaPath(sessionId), meta);
1754
+ await writeJson(this.metaPath(sessionId), meta);
1650
1755
  return meta;
1651
1756
  });
1652
1757
  }
1653
- sweepIdle(idleMs, outcome = "aborted", reason = "stale") {
1758
+ async sweepIdle(idleMs, outcome = "aborted", reason = "stale") {
1654
1759
  const effectiveIdleMs = Math.max(idleMs, SWEEP_MIN_IDLE_MS);
1655
1760
  const nowMs = Date.now();
1656
1761
  const swept = [];
@@ -1661,7 +1766,7 @@ export class SessionStore {
1661
1766
  const idleFor = Number.isFinite(updatedAt) ? nowMs - updatedAt : Infinity;
1662
1767
  if (idleFor < effectiveIdleMs)
1663
1768
  continue;
1664
- const finalized = this.withSessionLock(session.session_id, () => {
1769
+ const finalized = await this.withSessionLock(session.session_id, async () => {
1665
1770
  const current = this.read(session.session_id);
1666
1771
  current.outcome = outcome;
1667
1772
  current.outcome_reason = reason;
@@ -1673,7 +1778,7 @@ export class SessionStore {
1673
1778
  idle_ms: idleFor,
1674
1779
  };
1675
1780
  current.updated_at = now();
1676
- writeJson(this.metaPath(session.session_id), current);
1781
+ await writeJson(this.metaPath(session.session_id), current);
1677
1782
  return current;
1678
1783
  });
1679
1784
  swept.push(finalized);
@@ -1798,7 +1903,7 @@ export class SessionStore {
1798
1903
  // - in_flight.started_at is older than HEARTBEAT_STALE_AFTER_MS.
1799
1904
  // Sessions still actively running on a live PID are skipped. Idempotent
1800
1905
  // + best-effort. Returns counts for telemetry.
1801
- clearStaleInFlight() {
1906
+ async clearStaleInFlight() {
1802
1907
  const HEARTBEAT_STALE_AFTER_MS = 30 * 60 * 1000; // 30 minutes
1803
1908
  let scanned = 0;
1804
1909
  let cleared = 0;
@@ -1808,34 +1913,39 @@ export class SessionStore {
1808
1913
  scanned += 1;
1809
1914
  const startedIso = session.in_flight.started_at;
1810
1915
  const startedAge = startedIso ? Date.now() - Date.parse(startedIso) : Infinity;
1811
- // Best-effort liveness probe via the active lock holder pid (if any).
1812
- let holderAlive = true;
1813
- const lockPath = path.join(this.sessionDir(session.session_id), ".lock");
1814
- if (fs.existsSync(lockPath)) {
1815
- try {
1816
- const lock = readJson(lockPath);
1817
- if (Number.isInteger(lock.pid)) {
1818
- holderAlive = this.processAlive(lock.pid);
1819
- }
1820
- }
1821
- catch {
1822
- // malformed lock — assume dead so the lock sweep cleans it up.
1823
- holderAlive = false;
1824
- }
1916
+ // v4.1.0: lock-holder freshness is reported by proper-lockfile's
1917
+ // mtime-based stale detection. lockfile.check returns true if the
1918
+ // lock is actively held (mtime within `stale` ms), false otherwise.
1919
+ // This replaces the pre-v4.1.0 PID-aliveness check, which could
1920
+ // mistake an unrelated process for the holder after a PID was recycled.
1921
+ let holderAlive;
1922
+ try {
1923
+ holderAlive = await lockfile.check(this.metaPath(session.session_id), {
1924
+ stale: 120_000,
1925
+ realpath: false,
1926
+ lockfilePath: path.join(this.sessionDir(session.session_id), ".lock"),
1927
+ });
1825
1928
  }
1826
- else {
1827
- // No active lock — heartbeat staleness is the only signal.
1828
- holderAlive = !Number.isFinite(startedAge) ? false : startedAge <= HEARTBEAT_STALE_AFTER_MS;
1929
+ catch {
1930
+ // lockfile.check rejected (lock path could not be inspected): treat as no active holder.
1931
+ holderAlive = false;
1932
+ }
1933
+ // With no live lock holder, fall back to the heartbeat: while
1934
+ // started_at is still recent, the in_flight marker is not yet stale.
1935
+ if (!holderAlive && Number.isFinite(startedAge) && startedAge <= HEARTBEAT_STALE_AFTER_MS) {
1936
+ // No live holder but started_at is recent; do nothing yet (lock
1937
+ // may have been released cleanly; let normal finalize handle it).
1938
+ continue;
1829
1939
  }
1830
1940
  if (!holderAlive || startedAge > HEARTBEAT_STALE_AFTER_MS) {
1831
1941
  try {
1832
- this.withSessionLock(session.session_id, () => {
1942
+ await this.withSessionLock(session.session_id, async () => {
1833
1943
  const current = this.read(session.session_id);
1834
1944
  if (!current.in_flight)
1835
1945
  return;
1836
1946
  delete current.in_flight;
1837
1947
  current.updated_at = now();
1838
- writeJson(this.metaPath(session.session_id), current);
1948
+ await writeJson(this.metaPath(session.session_id), current);
1839
1949
  cleared += 1;
1840
1950
  });
1841
1951
  }
@@ -1871,7 +1981,7 @@ export class SessionStore {
1871
1981
  // threshold (default 24h via CROSS_REVIEW_STALE_HOURS).
1872
1982
  //
1873
1983
  // Idempotent + best-effort. Returns counts for telemetry.
1874
- abortStaleSessions(staleHours) {
1984
+ async abortStaleSessions(staleHours) {
1875
1985
  const envHours = Number.parseFloat(process.env.CROSS_REVIEW_STALE_HOURS ?? "");
1876
1986
  const hours = staleHours != null && staleHours > 0
1877
1987
  ? staleHours
@@ -1891,26 +2001,29 @@ export class SessionStore {
1891
2001
  if (session.in_flight)
1892
2002
  continue;
1893
2003
  scanned += 1;
1894
- // Live lock holder => assume still running, skip.
1895
- const lockPath = path.join(this.sessionDir(session.session_id), ".lock");
1896
- if (fs.existsSync(lockPath)) {
1897
- try {
1898
- const lock = readJson(lockPath);
1899
- if (Number.isInteger(lock.pid) && this.processAlive(lock.pid)) {
1900
- continue;
1901
- }
1902
- }
1903
- catch {
1904
- /* malformed lock — fall through to staleness check */
1905
- }
2004
+ // v4.1.0: lock-holder freshness via proper-lockfile mtime-based
2005
+ // stale detection. lockfile.check returns true if a live holder
2006
+ // is touching the lockfile mtime within `stale` ms.
2007
+ let holderAlive;
2008
+ try {
2009
+ holderAlive = await lockfile.check(this.metaPath(session.session_id), {
2010
+ stale: 120_000,
2011
+ realpath: false,
2012
+ lockfilePath: path.join(this.sessionDir(session.session_id), ".lock"),
2013
+ });
1906
2014
  }
2015
+ catch {
2016
+ holderAlive = false;
2017
+ }
2018
+ if (holderAlive)
2019
+ continue;
1907
2020
  const lastTouched = Date.parse(session.updated_at);
1908
2021
  if (!Number.isFinite(lastTouched))
1909
2022
  continue;
1910
2023
  if (Date.now() - lastTouched < staleThresholdMs)
1911
2024
  continue;
1912
2025
  try {
1913
- this.finalize(session.session_id, "aborted", `stale_no_finalize_${hours}h`);
2026
+ await this.finalize(session.session_id, "aborted", `stale_no_finalize_${hours}h`);
1914
2027
  aborted += 1;
1915
2028
  }
1916
2029
  catch {