@lcv-ideas-software/cross-review 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  import crypto from "node:crypto";
2
2
  import fs from "node:fs";
3
3
  import path from "node:path";
4
+ import lockfile from "proper-lockfile";
4
5
  import { redact } from "../security/redact.js";
5
6
  import { mergeCost, mergeUsage } from "./cost.js";
6
7
  import { PEERS } from "./types.js";
@@ -21,7 +22,7 @@ function now() {
21
22
  const ATOMIC_WRITE_RETRY_CODES = new Set(["EPERM", "EACCES", "EBUSY", "EEXIST"]);
22
23
  const ATOMIC_WRITE_MAX_ATTEMPTS = 5;
23
24
  const TMP_NONCE_BYTES = 2;
24
- function writeJson(file, data) {
25
+ async function writeJson(file, data) {
25
26
  fs.mkdirSync(path.dirname(file), { recursive: true });
26
27
  const nonce = crypto.randomBytes(TMP_NONCE_BYTES).toString("hex");
27
28
  const tmp = `${file}.${process.pid}.${Date.now()}.${nonce}.tmp`;
@@ -37,11 +38,17 @@ function writeJson(file, data) {
37
38
  const code = err.code;
38
39
  if (!code || !ATOMIC_WRITE_RETRY_CODES.has(code))
39
40
  break;
41
+ // v4.1.0 hardening: pre-v4.1.0 used `while (Date.now() - start <
42
+ // wait) {}` busy-wait which blocked the single Node.js event loop
43
+ // thread for up to 310 ms (10+20+40+80+160) under repeated
44
+ // Windows-AV-induced EPERM/EBUSY contention. The CPU-burning
45
+ // busy-wait starved SSE streaming + concurrent sessions + MCP
46
+ // stdio reads. Now the backoff awaits a Promise-based timer:
47
+ // event loop remains fully responsive between attempts.
40
48
  const wait = 10 * 2 ** attempt; // 10, 20, 40, 80, 160 ms
41
- const start = Date.now();
42
- while (Date.now() - start < wait) {
43
- /* spin — sync write path, brief by design */
44
- }
49
+ await new Promise((resolve) => {
50
+ setTimeout(resolve, wait);
51
+ });
45
52
  }
46
53
  }
47
54
  // Terminal failure path: best-effort tmp cleanup so callers don't see
@@ -99,6 +106,16 @@ export class SessionStore {
99
106
  // monotonically thereafter. Restart re-initializes from disk, so seq
100
107
  // remains correct across process boundaries.
101
108
  seqCache = new Map();
109
+ // v4.1.0: track in-flight fire-and-forget appendEvent promises so
110
+ // callers that need synchronous read-after-write semantics (smoke
111
+ // tests, post-round aggregation) can call `flushPendingEvents()` to
112
+ // wait for all pending event writes to settle before reading.
113
+ // appendEvent is async because withSessionLock is async (proper-
114
+ // lockfile); the emit pipeline must stay sync, so it uses
115
+ // `void store.appendEvent(event)` and the store remembers the
116
+ // promise here. Promises resolve/reject within appendEvent's own
117
+ // try/catch — flushPendingEvents() therefore always settles, never rejects.
118
+ pendingEventWrites = new Set();
102
119
  constructor(config) {
103
120
  this.config = config;
104
121
  fs.mkdirSync(this.sessionsDir(), { recursive: true });
@@ -140,10 +157,6 @@ export class SessionStore {
140
157
  return false;
141
158
  }
142
159
  }
143
- sleepSync(ms) {
144
- const buffer = new SharedArrayBuffer(4);
145
- Atomics.wait(new Int32Array(buffer), 0, 0, ms);
146
- }
147
160
  totalsFor(meta) {
148
161
  const peerResults = meta.rounds.flatMap((round) => round.peers);
149
162
  const generations = meta.generation_files ?? [];
@@ -158,46 +171,112 @@ export class SessionStore {
158
171
  ]),
159
172
  };
160
173
  }
161
- withSessionLock(sessionId, fn) {
174
+ // v4.1.0 hardening: pre-v4.1.0 acquired the lock via an exclusive
175
+ // file-create syscall followed by a separate JSON metadata write,
176
+ // which had a multi-process TOCTOU race window. Process A's create
177
+ // returned an empty inode + fd; before A's metadata write executed,
178
+ // process B could observe the empty file, fail to JSON-parse it,
179
+ // remove the lock path, create its own valid lock, and enter the
180
+ // critical section. Process A would then write into the now-orphan
181
+ // inode via the still-open fd and ALSO enter the critical section,
182
+ // corrupting meta.json. proper-lockfile uses `fs.mkdir` (atomic
183
+ // across NTFS and POSIX) so the lock comes into existence as a
184
+ // directory in a single syscall — no empty-window race possible.
185
+ // The mkdir-based lock also fixes the lock-holder freshness signal:
186
+ // proper-lockfile's `update` interval touches the lockfile's mtime
187
+ // every 5 s, and any other process treats the lock as stale once the
188
+ // mtime is older than `stale` ms (120 s). This is more robust than
189
+ // the pre-v4.1.0 PID-aliveness check, which had collision risk after
190
+ // process restart.
191
+ async withSessionLock(sessionId, fn) {
162
192
  const dir = this.sessionDir(sessionId);
163
- const lockPath = path.join(dir, ".lock");
164
- const timeoutAt = Date.now() + 30_000;
165
- while (true) {
166
- try {
167
- const fd = fs.openSync(lockPath, "wx");
168
- fs.writeFileSync(fd, JSON.stringify({ pid: process.pid, acquired_at: now() }));
169
- fs.closeSync(fd);
170
- break;
193
+ const target = this.metaPath(sessionId);
194
+ const lockfilePath = path.join(dir, ".lock");
195
+ fs.mkdirSync(dir, { recursive: true });
196
+ // proper-lockfile needs the target path to exist only when `realpath`
197
+ // is true (NOTE(review): `realpath: false` is passed below — confirm
198
+ // this placeholder is still needed). Init creates the session dir then
199
+ // immediately calls withSessionLock-protected writes; an empty meta is
200
+ // pre-created so the first init() can lock. Existing meta is preserved.
201
+ try {
202
+ fs.writeFileSync(target, "{}\n", { flag: "wx" });
203
+ }
204
+ catch (err) {
205
+ if (err.code !== "EEXIST")
206
+ throw err;
207
+ /* existing or concurrently-created meta; fine */
208
+ }
209
+ // Pre-v4.1.0 legacy `.lock` regular file detection — FAIL CLOSED.
210
+ //
211
+ // Pre-v4.1.0 created `.lock` as a regular file containing
212
+ // `{pid, acquired_at}` JSON. proper-lockfile claims `.lock` as a DIRECTORY
213
+ // via mkdir, so a leftover regular file blocks every subsequent
214
+ // lockfile.lock() with EEXIST. The original v4.1.0 design tried
215
+ // to auto-clean stale legacy files. Codex (session 059b0093 R1
216
+ // through R4) progressively demonstrated that NO auto-clean is
217
+ // safe under live cross-version operation:
218
+ //
219
+ // • R1: unconditional removal split-brained with a live legacy
220
+ // holder.
221
+ // • R2: removal-when-pid-alive-but-mtime-stale split-brained
222
+ // because legacy locks do not heartbeat (mtime is frozen at
223
+ // acquisition).
224
+ // • R3: per-process atomic decisions still raced two v4.1
225
+ // migrators.
226
+ // • R4: serializing v4.1 migrators via a separate mutex still
227
+ // left the cross-version race: v4.0.x's own stale-removal
228
+ // path does not honor any v4.1 mutex, so a concurrent v4.0.x
229
+ // could remove a stale `.lock` and create its own live one
230
+ // between v4.1's read and v4.1's path-based rmSync —
231
+ // v4.1 then deletes the new live legacy lock → split-brain.
232
+ //
233
+ // Resolution: v4.1.0 NEVER auto-removes a legacy regular `.lock`
234
+ // file. If one is observed, withSessionLock throws a clear
235
+ // remediation error to the caller, instructing the operator to
236
+ // stop all cross-review processes and remove the file manually.
237
+ // This is a ONE-TIME operator step at v4.0.x → v4.1.0 upgrade.
238
+ // After all hosts are on v4.1.0 the locks are mkdir-atomic and
239
+ // the issue cannot recur.
240
+ try {
241
+ const stat = fs.statSync(lockfilePath);
242
+ if (stat.isFile()) {
243
+ throw new Error(`cross-review v4.1.0 detected a pre-v4.1.0 lock file at ${lockfilePath}. ` +
244
+ `Live cross-version migration is not supported (would split-brain with any ` +
245
+ `concurrent v4.0.x process). To migrate safely: (1) stop all cross-review ` +
246
+ `processes / close all MCP hosts that loaded the server, (2) remove the ` +
247
+ `legacy lock file, (3) restart. POSIX one-liner for full cleanup: ` +
248
+ `\`find ${this.config.data_dir}/sessions -name .lock -type f -delete\`. ` +
249
+ `See CHANGELOG v04.01.00 migration notes for the rationale.`);
171
250
  }
172
- catch (error) {
173
- if (error.code !== "EEXIST")
174
- throw error;
175
- try {
176
- const lock = readJson(lockPath);
177
- const age = lock.acquired_at ? Date.now() - Date.parse(lock.acquired_at) : Infinity;
178
- if (!lock.pid || age > 120_000 || !this.processAlive(lock.pid)) {
179
- fs.rmSync(lockPath, { force: true });
180
- continue;
181
- }
182
- }
183
- catch {
184
- fs.rmSync(lockPath, { force: true });
185
- continue;
186
- }
187
- if (Date.now() >= timeoutAt) {
188
- throw new Error(`timed out waiting for session lock: ${sessionId}`, { cause: error });
189
- }
190
- this.sleepSync(100);
251
+ }
252
+ catch (err) {
253
+ if (err instanceof Error && err.message.includes("detected a pre-v4.1.0 lock file")) {
254
+ throw err;
255
+ }
256
+ if (err.code !== "ENOENT") {
257
+ /* ignore other stat errors; lockfile.lock will surface them */
191
258
  }
192
259
  }
260
+ const release = await lockfile.lock(target, {
261
+ stale: 120_000,
262
+ update: 5_000,
263
+ retries: { retries: 30, factor: 1.5, minTimeout: 100, maxTimeout: 1_000 },
264
+ realpath: false,
265
+ lockfilePath,
266
+ });
193
267
  try {
194
- return fn();
268
+ return await fn();
195
269
  }
196
270
  finally {
197
- fs.rmSync(lockPath, { force: true });
271
+ try {
272
+ await release();
273
+ }
274
+ catch {
275
+ /* lock was already released by stale-detection or sibling process */
276
+ }
198
277
  }
199
278
  }
200
- init(task, caller, snapshot, reviewFocus) {
279
+ async init(task, caller, snapshot, reviewFocus) {
201
280
  const session_id = crypto.randomUUID();
202
281
  // v2.22.0 (B.P3): snapshot the cost ceiling at session_init time so
203
282
  // budget pressure analysis is decoupled from later env-var mutation.
@@ -227,7 +306,7 @@ export class SessionStore {
227
306
  budget_warning_emitted: false,
228
307
  };
229
308
  fs.mkdirSync(path.join(this.sessionDir(session_id), "agent-runs"), { recursive: true });
230
- writeJson(this.metaPath(session_id), meta);
309
+ await writeJson(this.metaPath(session_id), meta);
231
310
  fs.writeFileSync(path.join(this.sessionDir(session_id), "task.md"), task, "utf8");
232
311
  if (reviewFocus) {
233
312
  fs.writeFileSync(path.join(this.sessionDir(session_id), "review-focus.md"), reviewFocus, "utf8");
@@ -243,8 +322,8 @@ export class SessionStore {
243
322
  // R5 throws when in_flight is already populated; the boot-time
244
323
  // `clearStaleInFlight` sweep clears any orphan in_flight from a
245
324
  // crashed prior host so legitimate operators are not blocked.
246
- markInFlight(sessionId, params) {
247
- return this.withSessionLock(sessionId, () => {
325
+ async markInFlight(sessionId, params) {
326
+ return this.withSessionLock(sessionId, async () => {
248
327
  const meta = this.read(sessionId);
249
328
  if (meta.in_flight) {
250
329
  throw new Error(`session ${sessionId} already has an in-flight round (round=${meta.in_flight.round}, started_at=${meta.in_flight.started_at}); refusing to start a concurrent round. Wait for the round to complete, cancel it via session_cancel_job, or recover it via session_recover_interrupted.`);
@@ -262,7 +341,7 @@ export class SessionStore {
262
341
  detail: `Round ${params.round} is running.`,
263
342
  };
264
343
  meta.updated_at = now();
265
- writeJson(this.metaPath(sessionId), meta);
344
+ await writeJson(this.metaPath(sessionId), meta);
266
345
  return meta;
267
346
  });
268
347
  }
@@ -304,23 +383,47 @@ export class SessionStore {
304
383
  commitSeq(sessionId, committed) {
305
384
  this.seqCache.set(sessionId, committed);
306
385
  }
307
- appendEvent(event) {
386
+ // v4.1.0: durable event persistence. withSessionLock became async
387
+ // with the proper-lockfile refactor; appendEvent awaits the lock so
388
+ // callers that read events after persisting get the expected
389
+ // synchronous-write semantics (e.g. the session_doctor sweep + smoke
390
+ // fixtures that read events.ndjson immediately after appendEvent).
391
+ // Fire-and-forget callers wrap with `void store.appendEvent(...)`.
392
+ async appendEvent(event) {
308
393
  const sessionId = event.session_id;
309
394
  if (!sessionId)
310
395
  return;
311
- try {
312
- this.withSessionLock(sessionId, () => {
313
- const file = this.eventsPath(sessionId);
314
- const seq = this.peekNextSeq(sessionId, file);
315
- fs.appendFileSync(file, `${JSON.stringify({ ...event, seq, ts: event.ts ?? now() })}\n`, "utf8");
316
- // Only commit the cache AFTER the durable append succeeded.
317
- // If appendFileSync threw above, the cache still reflects the
318
- // last persisted seq and the next call reuses this seq number.
319
- this.commitSeq(sessionId, seq);
320
- });
321
- }
322
- catch {
323
- // Event persistence must never break provider calls or MCP responses.
396
+ const write = (async () => {
397
+ try {
398
+ await this.withSessionLock(sessionId, () => {
399
+ const file = this.eventsPath(sessionId);
400
+ const seq = this.peekNextSeq(sessionId, file);
401
+ fs.appendFileSync(file, `${JSON.stringify({ ...event, seq, ts: event.ts ?? now() })}\n`, "utf8");
402
+ // Only commit the cache AFTER the durable append succeeded.
403
+ // If appendFileSync threw above, the cache still reflects the
404
+ // last persisted seq and the next call reuses this seq number.
405
+ this.commitSeq(sessionId, seq);
406
+ });
407
+ }
408
+ catch {
409
+ // Event persistence must never break provider calls or MCP responses.
410
+ }
411
+ })();
412
+ this.pendingEventWrites.add(write);
413
+ void write.finally(() => {
414
+ this.pendingEventWrites.delete(write);
415
+ });
416
+ return write;
417
+ }
418
+ // v4.1.0: wait for all in-flight fire-and-forget event writes to
419
+ // settle. Used by tests/sweeps that need synchronous read-after-write
420
+ // semantics for events.ndjson when the emit pipeline used
421
+ // `void store.appendEvent(...)`. Always resolves (never rejects);
422
+ // appendEvent swallows its own errors.
423
+ async flushPendingEvents() {
424
+ while (this.pendingEventWrites.size > 0) {
425
+ const snapshot = Array.from(this.pendingEventWrites);
426
+ await Promise.allSettled(snapshot);
324
427
  }
325
428
  }
326
429
  readEvents(sessionId, sinceSeq = 0) {
@@ -415,11 +518,11 @@ export class SessionStore {
415
518
  fs.writeFileSync(file, redact(draft), "utf8");
416
519
  return path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
417
520
  }
418
- saveGeneration(sessionId, round, result, label = "generation") {
521
+ async saveGeneration(sessionId, round, result, label = "generation") {
419
522
  const file = path.join(this.sessionDir(sessionId), "agent-runs", `round-${round}-${result.peer}-${label}.json`);
420
- writeJson(file, { ...result, text: redact(result.text) });
523
+ await writeJson(file, { ...result, text: redact(result.text) });
421
524
  const relativePath = path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
422
- this.withSessionLock(sessionId, () => {
525
+ await this.withSessionLock(sessionId, async () => {
423
526
  const meta = this.read(sessionId);
424
527
  const artifact = {
425
528
  ts: now(),
@@ -434,7 +537,7 @@ export class SessionStore {
434
537
  meta.generation_files = [...(meta.generation_files ?? []), artifact];
435
538
  meta.totals = this.totalsFor(meta);
436
539
  meta.updated_at = now();
437
- writeJson(this.metaPath(sessionId), meta);
540
+ await writeJson(this.metaPath(sessionId), meta);
438
541
  });
439
542
  return relativePath;
440
543
  }
@@ -448,18 +551,18 @@ export class SessionStore {
448
551
  fs.writeFileSync(file, redact(text), "utf8");
449
552
  return path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
450
553
  }
451
- savePeerResult(sessionId, round, result, label = "response") {
554
+ async savePeerResult(sessionId, round, result, label = "response") {
452
555
  const file = path.join(this.sessionDir(sessionId), "agent-runs", `round-${round}-${result.peer}-${label}.json`);
453
- writeJson(file, { ...result, text: redact(result.text) });
556
+ await writeJson(file, { ...result, text: redact(result.text) });
454
557
  return path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
455
558
  }
456
- savePeerFailure(sessionId, round, failure) {
559
+ async savePeerFailure(sessionId, round, failure) {
457
560
  const file = path.join(this.sessionDir(sessionId), "agent-runs", `round-${round}-${failure.peer}-failure.json`);
458
- writeJson(file, { ...failure, message: redact(failure.message) });
561
+ await writeJson(file, { ...failure, message: redact(failure.message) });
459
562
  return path.relative(this.sessionDir(sessionId), file).replace(/\\/g, "/");
460
563
  }
461
- appendRound(sessionId, params) {
462
- return this.withSessionLock(sessionId, () => {
564
+ async appendRound(sessionId, params) {
565
+ return this.withSessionLock(sessionId, async () => {
463
566
  const meta = this.read(sessionId);
464
567
  // v3.2.0 (Codex bug report 2026-05-12): refuse to append a round
465
568
  // to a finalized session. Otherwise the per-round
@@ -507,19 +610,19 @@ export class SessionStore {
507
610
  // diff-based drift if a peer's cost changed in a retry loop.
508
611
  const roundCost = params.peers.reduce((sum, peer) => sum + (peer.cost?.total_cost ?? 0), 0);
509
612
  meta.costs_per_round = [...(meta.costs_per_round ?? []), roundCost];
510
- writeJson(this.metaPath(sessionId), meta);
613
+ await writeJson(this.metaPath(sessionId), meta);
511
614
  return round;
512
615
  });
513
616
  }
514
617
  // v2.22.0 (B.P3): one-shot guard for `session.budget_warning` emit
515
618
  // idempotency. Persisted in meta.json so the warning fires at most
516
619
  // once per session even across host restarts.
517
- markBudgetWarningEmitted(sessionId) {
518
- return this.withSessionLock(sessionId, () => {
620
+ async markBudgetWarningEmitted(sessionId) {
621
+ return this.withSessionLock(sessionId, async () => {
519
622
  const meta = this.read(sessionId);
520
623
  meta.budget_warning_emitted = true;
521
624
  meta.updated_at = now();
522
- writeJson(this.metaPath(sessionId), meta);
625
+ await writeJson(this.metaPath(sessionId), meta);
523
626
  return meta;
524
627
  });
525
628
  }
@@ -527,12 +630,12 @@ export class SessionStore {
527
630
  // orchestrator's circular loop calls this every round so resumed
528
631
  // sessions can pick up the rotation cursor and consecutive-no-change
529
632
  // count from disk without re-deriving them by walking events.
530
- setCircularState(sessionId, state) {
531
- return this.withSessionLock(sessionId, () => {
633
+ async setCircularState(sessionId, state) {
634
+ return this.withSessionLock(sessionId, async () => {
532
635
  const meta = this.read(sessionId);
533
636
  meta.circular_state = state;
534
637
  meta.updated_at = now();
535
- writeJson(this.metaPath(sessionId), meta);
638
+ await writeJson(this.metaPath(sessionId), meta);
536
639
  return meta;
537
640
  });
538
641
  }
@@ -545,8 +648,8 @@ export class SessionStore {
545
648
  // max_rounds the caller actually requested. This fills that gap with
546
649
  // pure-additive metadata; `cost_ceiling_usd` is kept in sync with
547
650
  // `effective_cost_ceiling_usd` for back-compat with v3.4.x readers.
548
- setSessionTraceability(sessionId, traceability) {
549
- return this.withSessionLock(sessionId, () => {
651
+ async setSessionTraceability(sessionId, traceability) {
652
+ return this.withSessionLock(sessionId, async () => {
550
653
  const meta = this.read(sessionId);
551
654
  meta.requested_max_rounds = traceability.requested_max_rounds;
552
655
  meta.effective_max_rounds = traceability.effective_max_rounds;
@@ -557,7 +660,7 @@ export class SessionStore {
557
660
  // only know `cost_ceiling_usd` still see the effective ceiling.
558
661
  meta.cost_ceiling_usd = traceability.effective_cost_ceiling_usd;
559
662
  meta.updated_at = now();
560
- writeJson(this.metaPath(sessionId), meta);
663
+ await writeJson(this.metaPath(sessionId), meta);
561
664
  return meta;
562
665
  });
563
666
  }
@@ -576,8 +679,8 @@ export class SessionStore {
576
679
  throw err;
577
680
  }
578
681
  }
579
- finalize(sessionId, outcome, reason) {
580
- return this.withSessionLock(sessionId, () => {
682
+ async finalize(sessionId, outcome, reason) {
683
+ return this.withSessionLock(sessionId, async () => {
581
684
  const meta = this.read(sessionId);
582
685
  // v3.2.0 (Codex bug report 2026-05-12): when the caller asserts
583
686
  // outcome="converged", the latest round (if any) MUST have
@@ -606,12 +709,12 @@ export class SessionStore {
606
709
  detail: reason ?? outcome,
607
710
  };
608
711
  meta.updated_at = now();
609
- writeJson(this.metaPath(sessionId), meta);
712
+ await writeJson(this.metaPath(sessionId), meta);
610
713
  return meta;
611
714
  });
612
715
  }
613
- requestCancellation(sessionId, reason = "operator_requested", jobId) {
614
- return this.withSessionLock(sessionId, () => {
716
+ async requestCancellation(sessionId, reason = "operator_requested", jobId) {
717
+ return this.withSessionLock(sessionId, async () => {
615
718
  const meta = this.read(sessionId);
616
719
  meta.control = {
617
720
  status: "cancel_requested",
@@ -626,12 +729,12 @@ export class SessionStore {
626
729
  detail: `Cancellation requested: ${reason}`,
627
730
  };
628
731
  meta.updated_at = now();
629
- writeJson(this.metaPath(sessionId), meta);
732
+ await writeJson(this.metaPath(sessionId), meta);
630
733
  return meta;
631
734
  });
632
735
  }
633
- markCancelled(sessionId, reason = "cancelled") {
634
- return this.withSessionLock(sessionId, () => {
736
+ async markCancelled(sessionId, reason = "cancelled") {
737
+ return this.withSessionLock(sessionId, async () => {
635
738
  const meta = this.read(sessionId);
636
739
  meta.outcome = "aborted";
637
740
  meta.outcome_reason = reason;
@@ -649,7 +752,7 @@ export class SessionStore {
649
752
  detail: reason,
650
753
  };
651
754
  meta.updated_at = now();
652
- writeJson(this.metaPath(sessionId), meta);
755
+ await writeJson(this.metaPath(sessionId), meta);
653
756
  return meta;
654
757
  });
655
758
  }
@@ -657,12 +760,12 @@ export class SessionStore {
657
760
  const meta = this.read(sessionId);
658
761
  return meta.control?.status === "cancel_requested";
659
762
  }
660
- appendFallbackEvent(sessionId, event) {
661
- return this.withSessionLock(sessionId, () => {
763
+ async appendFallbackEvent(sessionId, event) {
764
+ return this.withSessionLock(sessionId, async () => {
662
765
  const meta = this.read(sessionId);
663
766
  meta.fallback_events = [...(meta.fallback_events ?? []), event];
664
767
  meta.updated_at = now();
665
- writeJson(this.metaPath(sessionId), meta);
768
+ await writeJson(this.metaPath(sessionId), meta);
666
769
  return meta;
667
770
  });
668
771
  }
@@ -672,10 +775,10 @@ export class SessionStore {
672
775
  // across rounds increments `round_count` instead of producing
673
776
  // duplicate entries. Returns the updated checklist (or empty array
674
777
  // if nothing was added/updated).
675
- appendEvidenceChecklistItems(sessionId, round, incoming) {
778
+ async appendEvidenceChecklistItems(sessionId, round, incoming) {
676
779
  if (!incoming.length)
677
780
  return [];
678
- return this.withSessionLock(sessionId, () => {
781
+ return this.withSessionLock(sessionId, async () => {
679
782
  const meta = this.read(sessionId);
680
783
  const existing = meta.evidence_checklist ?? [];
681
784
  const byId = new Map(existing.map((item) => [item.id, item]));
@@ -724,7 +827,7 @@ export class SessionStore {
724
827
  });
725
828
  meta.evidence_checklist = updated;
726
829
  meta.updated_at = ts;
727
- writeJson(this.metaPath(sessionId), meta);
830
+ await writeJson(this.metaPath(sessionId), meta);
728
831
  return updated;
729
832
  });
730
833
  }
@@ -750,8 +853,8 @@ export class SessionStore {
750
853
  // by the orchestrator via a separate event so operators see when peers
751
854
  // keep asking for items they explicitly closed; the status itself is
752
855
  // operator-owned.
753
- runEvidenceChecklistAddressDetection(sessionId, currentRound) {
754
- return this.withSessionLock(sessionId, () => {
856
+ async runEvidenceChecklistAddressDetection(sessionId, currentRound) {
857
+ return this.withSessionLock(sessionId, async () => {
755
858
  const meta = this.read(sessionId);
756
859
  const checklist = meta.evidence_checklist ?? [];
757
860
  if (!checklist.length) {
@@ -824,7 +927,7 @@ export class SessionStore {
824
927
  if (notResurfaced.length || reopened.length) {
825
928
  meta.evidence_status_history = history;
826
929
  meta.updated_at = ts;
827
- writeJson(this.metaPath(sessionId), meta);
930
+ await writeJson(this.metaPath(sessionId), meta);
828
931
  }
829
932
  return {
830
933
  not_resurfaced: notResurfaced,
@@ -843,8 +946,8 @@ export class SessionStore {
843
946
  // "not_resurfaced" — both are runtime-managed (judge promotion and
844
947
  // resurfacing inference respectively). Returns the mutated item and the
845
948
  // appended history entry.
846
- setEvidenceChecklistItemStatus(sessionId, itemId, status, options = {}) {
847
- return this.withSessionLock(sessionId, () => {
949
+ async setEvidenceChecklistItemStatus(sessionId, itemId, status, options = {}) {
950
+ return this.withSessionLock(sessionId, async () => {
848
951
  const meta = this.read(sessionId);
849
952
  const checklist = meta.evidence_checklist ?? [];
850
953
  const item = checklist.find((entry) => entry.id === itemId);
@@ -878,7 +981,7 @@ export class SessionStore {
878
981
  meta.evidence_status_history = history;
879
982
  meta.evidence_checklist = checklist;
880
983
  meta.updated_at = ts;
881
- writeJson(this.metaPath(sessionId), meta);
984
+ await writeJson(this.metaPath(sessionId), meta);
882
985
  return { item, history_entry: entry };
883
986
  });
884
987
  }
@@ -887,8 +990,8 @@ export class SessionStore {
887
990
  // moves anything other than open. Atomic under the session lock.
888
991
  // Returns null when the item is not currently `open` (already
889
992
  // addressed, terminal, or missing) so the caller can skip emit.
890
- markEvidenceItemAddressedByJudge(sessionId, itemId, params) {
891
- return this.withSessionLock(sessionId, () => {
993
+ async markEvidenceItemAddressedByJudge(sessionId, itemId, params) {
994
+ return this.withSessionLock(sessionId, async () => {
892
995
  const meta = this.read(sessionId);
893
996
  const checklist = meta.evidence_checklist ?? [];
894
997
  const item = checklist.find((entry) => entry.id === itemId);
@@ -920,16 +1023,16 @@ export class SessionStore {
920
1023
  meta.evidence_status_history = history;
921
1024
  meta.evidence_checklist = checklist;
922
1025
  meta.updated_at = ts;
923
- writeJson(this.metaPath(sessionId), meta);
1026
+ await writeJson(this.metaPath(sessionId), meta);
924
1027
  return { item, history_entry: entry };
925
1028
  });
926
1029
  }
927
- recoverInterruptedSessions(activeSessionIds = new Set()) {
1030
+ async recoverInterruptedSessions(activeSessionIds = new Set()) {
928
1031
  const recovered = [];
929
1032
  for (const session of this.list()) {
930
1033
  if (session.outcome || activeSessionIds.has(session.session_id) || !session.in_flight)
931
1034
  continue;
932
- const updated = this.withSessionLock(session.session_id, () => {
1035
+ const updated = await this.withSessionLock(session.session_id, async () => {
933
1036
  const current = this.read(session.session_id);
934
1037
  if (current.outcome || activeSessionIds.has(current.session_id) || !current.in_flight) {
935
1038
  return current;
@@ -947,7 +1050,7 @@ export class SessionStore {
947
1050
  detail: `Recovered interrupted round ${round} after MCP restart. Start a new round to continue from saved session context.`,
948
1051
  };
949
1052
  current.updated_at = now();
950
- writeJson(this.metaPath(current.session_id), current);
1053
+ await writeJson(this.metaPath(current.session_id), current);
951
1054
  return current;
952
1055
  });
953
1056
  recovered.push(updated);
@@ -1162,7 +1265,7 @@ export class SessionStore {
1162
1265
  // `item_types` (open items grouped by surfacing peer) and
1163
1266
  // `chronic_blockers` (item ids with `round_count >= 3`) so operators
1164
1267
  // can see which evidence asks are systemic vs. a noisy tail.
1165
- sessionDoctor(limit = 20, includeLegacy = false, repair = false) {
1268
+ async sessionDoctor(limit = 20, includeLegacy = false, repair = false) {
1166
1269
  const cappedLimit = Math.max(1, Math.min(100, Math.trunc(limit) || 20));
1167
1270
  // v3.6.0 (C): opt-in repair pass BEFORE the read-only audit. Fixes
1168
1271
  // the contradictory `outcome="converged" + health.state="blocked"`
@@ -1184,7 +1287,7 @@ export class SessionStore {
1184
1287
  // for manual operator inspection rather than guessing.
1185
1288
  if (latestConverged) {
1186
1289
  const fromState = session.convergence_health?.state;
1187
- const fixed = this.withSessionLock(session.session_id, () => {
1290
+ const fixed = await this.withSessionLock(session.session_id, async () => {
1188
1291
  const meta = this.read(session.session_id);
1189
1292
  if (meta.outcome === "converged" &&
1190
1293
  meta.convergence_health?.state === "blocked" &&
@@ -1195,7 +1298,7 @@ export class SessionStore {
1195
1298
  detail: `v3.6.0 doctor repair: recomputed health from latest round (was "blocked" with outcome="converged" — pre-v3.2.0 corruption artifact)`,
1196
1299
  };
1197
1300
  meta.updated_at = now();
1198
- writeJson(this.metaPath(session.session_id), meta);
1301
+ await writeJson(this.metaPath(session.session_id), meta);
1199
1302
  return true;
1200
1303
  }
1201
1304
  return false;
@@ -1576,7 +1679,7 @@ export class SessionStore {
1576
1679
  // original session is preserved (append-only); a new session opens
1577
1680
  // for re-deliberation with a fresh task + initial_draft and a
1578
1681
  // structural reference back to the contested session.
1579
- contestVerdict(params) {
1682
+ async contestVerdict(params) {
1580
1683
  const original = this.read(params.session_id);
1581
1684
  if (!original.outcome) {
1582
1685
  throw new Error(`cannot_contest_in_flight_session: session ${params.session_id} has no outcome yet (still in flight). Wait for it to converge or finalize before contesting.`);
@@ -1585,17 +1688,17 @@ export class SessionStore {
1585
1688
  throw new Error(`session_already_contested: session ${params.session_id} was already contested at ${original.contestation.contested_at} (new_session_id=${original.contestation.new_session_id}).`);
1586
1689
  }
1587
1690
  const newCaller = params.new_caller ?? "operator";
1588
- const newSession = this.init(params.new_task, newCaller, [], undefined);
1691
+ const newSession = await this.init(params.new_task, newCaller, [], undefined);
1589
1692
  // Cross-link new session → original.
1590
- this.withSessionLock(newSession.session_id, () => {
1693
+ await this.withSessionLock(newSession.session_id, async () => {
1591
1694
  const m = this.read(newSession.session_id);
1592
1695
  m.contests_session_id = params.session_id;
1593
1696
  m.updated_at = now();
1594
- writeJson(this.metaPath(newSession.session_id), m);
1697
+ await writeJson(this.metaPath(newSession.session_id), m);
1595
1698
  return m;
1596
1699
  });
1597
1700
  // Stamp original with contestation record.
1598
- const contestedMeta = this.withSessionLock(params.session_id, () => {
1701
+ const contestedMeta = await this.withSessionLock(params.session_id, async () => {
1599
1702
  const m = this.read(params.session_id);
1600
1703
  m.contestation = {
1601
1704
  contested_at: now(),
@@ -1604,19 +1707,19 @@ export class SessionStore {
1604
1707
  new_session_id: newSession.session_id,
1605
1708
  };
1606
1709
  m.updated_at = now();
1607
- writeJson(this.metaPath(params.session_id), m);
1710
+ await writeJson(this.metaPath(params.session_id), m);
1608
1711
  return m;
1609
1712
  });
1610
1713
  return { contested_meta: contestedMeta, new_session_id: newSession.session_id };
1611
1714
  }
1612
- attachEvidence(sessionId, params) {
1715
+ async attachEvidence(sessionId, params) {
1613
1716
  const extension = safeFilePart(params.extension ?? "txt").replace(/\./g, "") || "txt";
1614
1717
  const label = safeFilePart(params.label);
1615
1718
  const relativePath = `evidence/${timestampFilePart()}-${label}.${extension}`;
1616
1719
  const file = path.join(this.sessionDir(sessionId), relativePath);
1617
1720
  fs.mkdirSync(path.dirname(file), { recursive: true });
1618
1721
  fs.writeFileSync(file, redact(params.content), "utf8");
1619
- const meta = this.withSessionLock(sessionId, () => {
1722
+ const meta = await this.withSessionLock(sessionId, async () => {
1620
1723
  const current = this.read(sessionId);
1621
1724
  current.evidence_files = [
1622
1725
  ...(current.evidence_files ?? []),
@@ -1628,13 +1731,13 @@ export class SessionStore {
1628
1731
  },
1629
1732
  ];
1630
1733
  current.updated_at = now();
1631
- writeJson(this.metaPath(sessionId), current);
1734
+ await writeJson(this.metaPath(sessionId), current);
1632
1735
  return current;
1633
1736
  });
1634
1737
  return { path: relativePath.replace(/\\/g, "/"), meta };
1635
1738
  }
1636
- escalateToOperator(sessionId, params) {
1637
- return this.withSessionLock(sessionId, () => {
1739
+ async escalateToOperator(sessionId, params) {
1740
+ return this.withSessionLock(sessionId, async () => {
1638
1741
  const meta = this.read(sessionId);
1639
1742
  meta.operator_escalations = [
1640
1743
  ...(meta.operator_escalations ?? []),
@@ -1646,11 +1749,11 @@ export class SessionStore {
1646
1749
  detail: `Operator escalation requested: ${params.reason}`,
1647
1750
  };
1648
1751
  meta.updated_at = now();
1649
- writeJson(this.metaPath(sessionId), meta);
1752
+ await writeJson(this.metaPath(sessionId), meta);
1650
1753
  return meta;
1651
1754
  });
1652
1755
  }
1653
- sweepIdle(idleMs, outcome = "aborted", reason = "stale") {
1756
+ async sweepIdle(idleMs, outcome = "aborted", reason = "stale") {
1654
1757
  const effectiveIdleMs = Math.max(idleMs, SWEEP_MIN_IDLE_MS);
1655
1758
  const nowMs = Date.now();
1656
1759
  const swept = [];
@@ -1661,7 +1764,7 @@ export class SessionStore {
1661
1764
  const idleFor = Number.isFinite(updatedAt) ? nowMs - updatedAt : Infinity;
1662
1765
  if (idleFor < effectiveIdleMs)
1663
1766
  continue;
1664
- const finalized = this.withSessionLock(session.session_id, () => {
1767
+ const finalized = await this.withSessionLock(session.session_id, async () => {
1665
1768
  const current = this.read(session.session_id);
1666
1769
  current.outcome = outcome;
1667
1770
  current.outcome_reason = reason;
@@ -1673,7 +1776,7 @@ export class SessionStore {
1673
1776
  idle_ms: idleFor,
1674
1777
  };
1675
1778
  current.updated_at = now();
1676
- writeJson(this.metaPath(session.session_id), current);
1779
+ await writeJson(this.metaPath(session.session_id), current);
1677
1780
  return current;
1678
1781
  });
1679
1782
  swept.push(finalized);
@@ -1798,7 +1901,7 @@ export class SessionStore {
1798
1901
  // - in_flight.started_at is older than HEARTBEAT_STALE_AFTER_MS.
1799
1902
  // Sessions still actively running on a live PID are skipped. Idempotent
1800
1903
  // + best-effort. Returns counts for telemetry.
1801
- clearStaleInFlight() {
1904
+ async clearStaleInFlight() {
1802
1905
  const HEARTBEAT_STALE_AFTER_MS = 30 * 60 * 1000; // 30 minutes
1803
1906
  let scanned = 0;
1804
1907
  let cleared = 0;
@@ -1808,34 +1911,39 @@ export class SessionStore {
1808
1911
  scanned += 1;
1809
1912
  const startedIso = session.in_flight.started_at;
1810
1913
  const startedAge = startedIso ? Date.now() - Date.parse(startedIso) : Infinity;
1811
- // Best-effort liveness probe via the active lock holder pid (if any).
1812
- let holderAlive = true;
1813
- const lockPath = path.join(this.sessionDir(session.session_id), ".lock");
1814
- if (fs.existsSync(lockPath)) {
1815
- try {
1816
- const lock = readJson(lockPath);
1817
- if (Number.isInteger(lock.pid)) {
1818
- holderAlive = this.processAlive(lock.pid);
1819
- }
1820
- }
1821
- catch {
1822
- // malformed lock — assume dead so the lock sweep cleans it up.
1823
- holderAlive = false;
1824
- }
1914
+ // v4.1.0: lock-holder freshness is reported by proper-lockfile's
1915
+ // mtime-based stale detection. lockfile.check returns true if the
1916
+ // lock is actively held (mtime within `stale` ms), false otherwise.
1917
+ // This replaces the pre-v4.1.0 PID-aliveness check, which had
1918
+ // collision risk after PID-recycling restart.
1919
+ let holderAlive;
1920
+ try {
1921
+ holderAlive = await lockfile.check(this.metaPath(session.session_id), {
1922
+ stale: 120_000,
1923
+ realpath: false,
1924
+ lockfilePath: path.join(this.sessionDir(session.session_id), ".lock"),
1925
+ });
1825
1926
  }
1826
- else {
1827
- // No active lock heartbeat staleness is the only signal.
1828
- holderAlive = !Number.isFinite(startedAge) ? false : startedAge <= HEARTBEAT_STALE_AFTER_MS;
1927
+ catch {
1928
+ // metaPath missing or unreadable: treat as no active holder.
1929
+ holderAlive = false;
1930
+ }
1931
+ // Fallback heartbeat staleness signal when no active lock and
1932
+ // started_at indicates the in_flight marker itself is stale.
1933
+ if (!holderAlive && Number.isFinite(startedAge) && startedAge <= HEARTBEAT_STALE_AFTER_MS) {
1934
+ // No live holder but started_at is recent; do nothing yet (lock
1935
+ // may have been released cleanly; let normal finalize handle it).
1936
+ continue;
1829
1937
  }
1830
1938
  if (!holderAlive || startedAge > HEARTBEAT_STALE_AFTER_MS) {
1831
1939
  try {
1832
- this.withSessionLock(session.session_id, () => {
1940
+ await this.withSessionLock(session.session_id, async () => {
1833
1941
  const current = this.read(session.session_id);
1834
1942
  if (!current.in_flight)
1835
1943
  return;
1836
1944
  delete current.in_flight;
1837
1945
  current.updated_at = now();
1838
- writeJson(this.metaPath(session.session_id), current);
1946
+ await writeJson(this.metaPath(session.session_id), current);
1839
1947
  cleared += 1;
1840
1948
  });
1841
1949
  }
@@ -1871,7 +1979,7 @@ export class SessionStore {
1871
1979
  // threshold (default 24h via CROSS_REVIEW_STALE_HOURS).
1872
1980
  //
1873
1981
  // Idempotent + best-effort. Returns counts for telemetry.
1874
- abortStaleSessions(staleHours) {
1982
+ async abortStaleSessions(staleHours) {
1875
1983
  const envHours = Number.parseFloat(process.env.CROSS_REVIEW_STALE_HOURS ?? "");
1876
1984
  const hours = staleHours != null && staleHours > 0
1877
1985
  ? staleHours
@@ -1891,26 +1999,29 @@ export class SessionStore {
1891
1999
  if (session.in_flight)
1892
2000
  continue;
1893
2001
  scanned += 1;
1894
- // Live lock holder => assume still running, skip.
1895
- const lockPath = path.join(this.sessionDir(session.session_id), ".lock");
1896
- if (fs.existsSync(lockPath)) {
1897
- try {
1898
- const lock = readJson(lockPath);
1899
- if (Number.isInteger(lock.pid) && this.processAlive(lock.pid)) {
1900
- continue;
1901
- }
1902
- }
1903
- catch {
1904
- /* malformed lock — fall through to staleness check */
1905
- }
2002
+ // v4.1.0: lock-holder freshness via proper-lockfile mtime-based
2003
+ // stale detection. lockfile.check returns true if a live holder
2004
+ // is touching the lockfile mtime within `stale` ms.
2005
+ let holderAlive;
2006
+ try {
2007
+ holderAlive = await lockfile.check(this.metaPath(session.session_id), {
2008
+ stale: 120_000,
2009
+ realpath: false,
2010
+ lockfilePath: path.join(this.sessionDir(session.session_id), ".lock"),
2011
+ });
1906
2012
  }
2013
+ catch {
2014
+ holderAlive = false;
2015
+ }
2016
+ if (holderAlive)
2017
+ continue;
1907
2018
  const lastTouched = Date.parse(session.updated_at);
1908
2019
  if (!Number.isFinite(lastTouched))
1909
2020
  continue;
1910
2021
  if (Date.now() - lastTouched < staleThresholdMs)
1911
2022
  continue;
1912
2023
  try {
1913
- this.finalize(session.session_id, "aborted", `stale_no_finalize_${hours}h`);
2024
+ await this.finalize(session.session_id, "aborted", `stale_no_finalize_${hours}h`);
1914
2025
  aborted += 1;
1915
2026
  }
1916
2027
  catch {