@remnic/core 9.3.679 → 9.3.680

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/dist/access-cli.js +4 -4
  2. package/dist/access-http.js +7 -7
  3. package/dist/access-mcp.js +6 -6
  4. package/dist/access-schema.js +3 -3
  5. package/dist/access-service.js +4 -4
  6. package/dist/{capsule-crypto-7FJQINUR.js → capsule-crypto-YO5QJ6L3.js} +2 -2
  7. package/dist/{chunk-K2JYO6QV.js → chunk-5TEYIXMP.js} +3 -3
  8. package/dist/{chunk-2NLLXCJG.js → chunk-BXLOS5AJ.js} +2 -2
  9. package/dist/{chunk-ARV3AUOM.js → chunk-DL6H3D7S.js} +2 -2
  10. package/dist/{chunk-X7Y7WX73.js → chunk-DQEMWVMT.js} +1 -1
  11. package/dist/{chunk-UNZLU2MX.js → chunk-DWQPM67F.js} +4 -4
  12. package/dist/{chunk-UDJLF3BO.js → chunk-JI6HWBYL.js} +2 -2
  13. package/dist/{chunk-4PPMUNV5.js → chunk-OBM7EVFU.js} +3 -3
  14. package/dist/{chunk-KQAFEZQX.js → chunk-VDX2J7OX.js} +2 -2
  15. package/dist/{chunk-PCGCQTU6.js → chunk-W67ZZDHO.js} +10 -10
  16. package/dist/cli.js +11 -11
  17. package/dist/contradiction/index.js +4 -4
  18. package/dist/index.js +15 -15
  19. package/dist/transfer/backup.js +2 -2
  20. package/dist/transfer/capsule-export.js +2 -2
  21. package/dist/transfer/capsule-import.js +2 -2
  22. package/dist/transfer/types.d.ts +6 -6
  23. package/dist/utils/serialize-mutations.d.ts +122 -0
  24. package/dist/utils/serialize-mutations.js +287 -0
  25. package/dist/utils/serialize-mutations.js.map +1 -0
  26. package/package.json +12 -2
  27. package/src/utils/serialize-mutations.test.ts +1047 -0
  28. package/src/utils/serialize-mutations.ts +679 -0
  29. /package/dist/{capsule-crypto-7FJQINUR.js.map → capsule-crypto-YO5QJ6L3.js.map} +0 -0
  30. /package/dist/{chunk-K2JYO6QV.js.map → chunk-5TEYIXMP.js.map} +0 -0
  31. /package/dist/{chunk-2NLLXCJG.js.map → chunk-BXLOS5AJ.js.map} +0 -0
  32. /package/dist/{chunk-ARV3AUOM.js.map → chunk-DL6H3D7S.js.map} +0 -0
  33. /package/dist/{chunk-X7Y7WX73.js.map → chunk-DQEMWVMT.js.map} +0 -0
  34. /package/dist/{chunk-UNZLU2MX.js.map → chunk-DWQPM67F.js.map} +0 -0
  35. /package/dist/{chunk-UDJLF3BO.js.map → chunk-JI6HWBYL.js.map} +0 -0
  36. /package/dist/{chunk-4PPMUNV5.js.map → chunk-OBM7EVFU.js.map} +0 -0
  37. /package/dist/{chunk-KQAFEZQX.js.map → chunk-VDX2J7OX.js.map} +0 -0
  38. /package/dist/{chunk-PCGCQTU6.js.map → chunk-W67ZZDHO.js.map} +0 -0
@@ -0,0 +1,679 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Shared serialized-mutation utilities for TOCTOU hotspots (issue #1524).
3
+ //
4
+ // Two complementary primitives that the namespace catalog (`queueCritical` +
5
+ // `withHeldCatalogLock`), the storage router's resolve-hook serialization, and
6
+ // the summary-snapshot writer each re-implement today:
7
+ //
8
+ // 1. `serializeMutations(key, task)` — keyed IN-PROCESS async serialization
9
+ // that recovers after a rejection (CLAUDE.md rule #40). One failed task
10
+ // never poisons the tasks queued behind it; the failed task's error is
11
+ // still surfaced to ITS caller.
12
+ //
13
+ // 2. `withHeldFileLock(lockPath, opts, task)` — a held CROSS-PROCESS file
14
+ // lock with replacement-safe stale breaking (the NG7Bg invariant from
15
+ // #1506 round 28) and ownership-checked release.
16
+ //
17
+ // This is the UTILITY module only. Per-issue PR split: one PR for the utility
18
+ // + tests (this file), then one PR per adoption hotspot (catalog, router
19
+ // provenance, summary snapshot). No adoptions live here.
20
+ // ---------------------------------------------------------------------------
21
+
22
+ import { randomUUID } from "node:crypto";
23
+ import { link, mkdir, open, readFile, rename, stat, unlink, utimes } from "node:fs/promises";
24
+ import path from "node:path";
25
+
26
+ // ─────────────────────────────────────────────────────────────────────────────
27
+ // 1. serializeMutations — keyed async serialization with rejection recovery
28
+ // ─────────────────────────────────────────────────────────────────────────────
29
+
30
/**
 * Record kept for each key that currently has queued work. `tail` is the
 * promise the NEXT task for that key waits on. It always fulfills (both settle
 * arms map to a no-op), so a failing task cannot break the tasks behind it.
 */
interface MutationChainEntry {
  tail: Promise<void>;
}

/**
 * Keyed, in-process async serializer.
 *
 * Tasks submitted under the same key on the same instance run one at a time, in
 * submission order. The key → chain map lives on the instance rather than at
 * module level, so each test (or each adopter wanting isolation) can build its
 * own serializer. The free {@link serializeMutations} function uses one shared
 * instance for process-wide ordering.
 */
export class MutationSerializer {
  private readonly chains = new Map<string, MutationChainEntry>();

  /**
   * Queue `task` behind everything already queued under `key`.
   *
   * - Ordering: `task` starts only after the previous tail for `key` settles.
   * - Rejection recovery: the stored tail is `run.then(noop, noop)`, so a
   *   rejected task does not prevent later tasks from running; the rejection
   *   is still delivered to the caller that submitted the failing task.
   * - Self-cleaning: when the last queued task for a key settles, the key's
   *   entry is removed from the map, so the map does not grow without bound.
   *
   * @param key  Non-empty string identifying the serialization domain.
   * @param task Function producing the promise to run in order.
   * @returns A promise carrying `task`'s own resolution or rejection.
   * @throws TypeError synchronously when `key` is not a non-empty string or
   *         `task` is not a function.
   */
  serialize<T>(key: string, task: () => Promise<T>): Promise<T> {
    if (typeof key !== "string" || key.length === 0) {
      throw new TypeError("MutationSerializer.serialize: key must be a non-empty string");
    }
    if (typeof task !== "function") {
      throw new TypeError("MutationSerializer.serialize: task must be a function returning a promise");
    }

    // Reuse the live chain for this key, or start one from a resolved promise.
    const existing = this.chains.get(key);
    const slot: MutationChainEntry = existing ?? { tail: Promise.resolve() };
    if (existing === undefined) {
      this.chains.set(key, slot);
    }

    // `outcome` is what the caller sees; `settled` is the never-rejecting
    // continuation the next task will chain from.
    const outcome = slot.tail.then(task);
    const settled = outcome.then(settleNoop, settleNoop);
    slot.tail = settled;

    // A later serialize() call replaces `slot.tail` synchronously, i.e. before
    // this callback can run, so the identity comparison tells us whether we
    // are still the newest task for the key.
    const dropIfIdle = (): void => {
      if (slot.tail === settled) {
        this.chains.delete(key);
      }
    };
    // The rejection arm is a defensive no-op: `settled` is built to always
    // fulfill, and cleanup must only happen on fulfillment.
    void settled.then(dropIfIdle, () => undefined);

    return outcome;
  }

  /**
   * Test-only helper: how many keys still have an entry in the chain map.
   * Used to assert the self-cleaning behaviour; not part of the public
   * contract.
   */
  pendingKeysForTest(): number {
    return this.chains.size;
  }
}

/**
 * Shared handler for both settle arms of the recovered tail. Kept as a named
 * function (rather than an inline arrow) so it is identifiable in stack traces
 * and the recovery step is easy to spot at the call site.
 */
function settleNoop(): void {
  /* intentionally empty: the real outcome travels on the caller's promise */
}
133
+
134
/**
 * Shared serializer behind the free {@link serializeMutations} function.
 * Created on first use, so code that only ever constructs its own
 * `MutationSerializer` never allocates it.
 */
let defaultSerializer: MutationSerializer | undefined;

/**
 * Free-function entry point: run `task` after every other task queued under
 * `key` through this function in the current process. Delegates to a single
 * lazily-created {@link MutationSerializer}; construct a `MutationSerializer`
 * directly when isolated (e.g. per-test) ordering is wanted.
 *
 * @param key  Non-empty serialization key.
 * @param task Function producing the promise to run in order.
 * @returns A promise carrying `task`'s own resolution or rejection.
 */
export function serializeMutations<T>(key: string, task: () => Promise<T>): Promise<T> {
  defaultSerializer ??= new MutationSerializer();
  return defaultSerializer.serialize(key, task);
}
151
+
152
+ // ─────────────────────────────────────────────────────────────────────────────
153
+ // 2. withHeldFileLock — cross-process held file lock with stale breaking
154
+ // ─────────────────────────────────────────────────────────────────────────────
155
+
156
/** Tuning knobs and hooks accepted by {@link withHeldFileLock}. */
export interface HeldFileLockOptions {
  /**
   * Age threshold in ms: a lock file whose mtime is older than this is
   * considered abandoned by a crashed holder and may be broken. Mandatory,
   * because the correct value depends on how long the guarded critical
   * section may legitimately run. Must be a positive finite number.
   */
  readonly staleMs: number;
  /**
   * Upper bound (ms) on how long to keep trying to acquire a busy lock. When
   * it elapses, the task is invoked as `task(false)` — best-effort, WITHOUT
   * the lock — instead of blocking indefinitely or failing the primary
   * operation. Defaults to 5000 ms (stated to match the namespace catalog's
   * `REBUILD_LOCK_MAX_WAIT_MS`).
   */
  readonly maxWaitMs?: number;
  /**
   * Delay (ms) between acquisition attempts while the lock is busy.
   * Defaults to 50 ms.
   */
  readonly pollMs?: number;
  /**
   * Cadence (ms) at which the holder refreshes the lock file's mtime, so a
   * long-running but healthy task is not judged stale and broken. Defaults to
   * `floor(staleMs / 3)` with a floor of 100 ms. Must be strictly below
   * `staleMs`.
   */
  readonly heartbeatMs?: number;
  /**
   * Test seam (NG7Bg, #1506 round 28): invoked after a lock has been judged
   * stale and before it is re-verified and broken, so a test can create a
   * replacement lock inside that window. Leave unset in production.
   */
  readonly onBeforeBreakStaleUnlinkForTest?: () => Promise<void> | void;
  /**
   * Test seam (codex P2): invoked after release has renamed the lock to its
   * trash path and before the moved file's ownership is re-verified, so a
   * test can simulate a third contender acquiring the now-vacant lock path.
   * Leave unset in production.
   */
  readonly onAfterReleaseRenameForTest?: () => Promise<void> | void;
  /**
   * Optional sink for non-fatal lock problems (heartbeat refresh failure,
   * release failure). Exceptions thrown by the hook are swallowed. When
   * omitted, such warnings are dropped: the lock is advisory and these
   * failures must never fail the guarded operation.
   */
  readonly onLockWarning?: (message: string, err: unknown) => void;
}
204
+
205
/** Acquisition gives up after this many ms unless `maxWaitMs` is supplied. */
const DEFAULT_MAX_WAIT_MS = 5000;
/** Delay between acquisition attempts unless `pollMs` is supplied. */
const DEFAULT_POLL_MS = 50;
/** Lower bound applied to the heartbeat cadence derived from `staleMs`. */
const MIN_HEARTBEAT_MS = 100;
/**
 * Largest delay Node's setTimeout/setInterval honours: 2^31 − 1 ms (about 24.8
 * days). Larger delays are silently reset to 1 ms by the timer, so timing
 * options are rejected above this value (chatgpt-codex-connector P2).
 */
const MAX_TIMER_DELAY_MS = 2 ** 31 - 1;
216
+
217
/**
 * Handle describing a lock this call created: the lock file's path and the
 * per-call owner id that was written into it.
 */
interface HeldLock {
  readonly path: string;
  readonly ownerId: string;
}
222
+
223
/**
 * Run `task` while holding an exclusive on-disk lock at `lockPath`.
 *
 * How it works:
 * - Acquisition is delegated to `acquireLock`, which creates the lock file
 *   with an exclusive create (`open(lockPath, "wx")`) and writes
 *   `<pid> <owner-uuid> <iso>` into it. A busy lock is polled for up to
 *   `maxWaitMs`; a lock older than `opts.staleMs` is broken by
 *   `breakStaleLock`, which moves it aside with `rename` and verifies the
 *   moved file before deleting it (restoring it if it turns out to be a
 *   replacement or a refreshed lock).
 * - While the lock is held, an unref'd interval refreshes the file's mtime
 *   every `heartbeatMs`, but only after confirming the file still carries this
 *   call's owner id, so a replacement lock is never kept artificially fresh.
 * - Release is ownership-checked (`releaseLock`): only a lock that still
 *   identifies this call is removed.
 *
 * `task` receives `acquired`: `true` when the lock is held, `false` when
 * acquisition failed or timed out and the task is running best-effort without
 * it. A `() => Promise<T>` callback that ignores the flag is still assignable.
 *
 * ADOPTION NOTE: guard only the short final read-merge-write window, not a
 * long scan; a long hold makes concurrent writers time out and run unlocked.
 *
 * @param lockPath Path of the lock file; must be a non-empty string.
 * @param opts     Timing options and hooks; `staleMs` is required.
 * @param task     Work to run; told whether the lock was actually acquired.
 * @returns Whatever `task` resolves to.
 * @throws TypeError when `lockPath` is empty or not a string; when `staleMs`
 *         or a supplied `maxWaitMs` / `pollMs` / `heartbeatMs` is not a
 *         positive finite number or exceeds the timer ceiling; or when the
 *         effective heartbeat is not below `staleMs`. Note that the derived
 *         default heartbeat is floored at 100 ms, so a `staleMs` of 100 ms or
 *         less requires an explicit `heartbeatMs`.
 */
export async function withHeldFileLock<T>(
  lockPath: string,
  opts: HeldFileLockOptions,
  task: (acquired: boolean) => Promise<T>,
): Promise<T> {
  if (typeof lockPath !== "string" || lockPath.length === 0) {
    throw new TypeError("withHeldFileLock: lockPath must be a non-empty string");
  }
  if (typeof opts?.staleMs !== "number" || !Number.isFinite(opts.staleMs) || opts.staleMs <= 0) {
    throw new TypeError(
      `withHeldFileLock: opts.staleMs must be a positive finite number ` +
        `(valid range: > 0 ms, finite; got ${formatInvalidNumber(opts?.staleMs)}).`,
    );
  }

  // Optional timings are validated, not silently defaulted: a NaN/Infinity
  // maxWaitMs would make the deadline comparison always false and the acquire
  // loop would never fall back to best-effort. Omitted options take defaults.
  const maxWaitMs = optionalPositiveMs(opts.maxWaitMs, "maxWaitMs", DEFAULT_MAX_WAIT_MS, MAX_TIMER_DELAY_MS);
  const pollMs = optionalPositiveMs(opts.pollMs, "pollMs", DEFAULT_POLL_MS, MAX_TIMER_DELAY_MS);
  const heartbeatMs = optionalPositiveMs(
    opts.heartbeatMs,
    "heartbeatMs",
    Math.max(MIN_HEARTBEAT_MS, Math.floor(opts.staleMs / 3)),
    MAX_TIMER_DELAY_MS,
  );
  if (heartbeatMs >= opts.staleMs) {
    throw new TypeError(
      `withHeldFileLock: heartbeatMs (${heartbeatMs}) must be below staleMs (${opts.staleMs}) ` +
        `(valid range: > 0 and < staleMs ms) so at least one heartbeat lands per stale window.`,
    );
  }
  // The derived default bypasses optionalPositiveMs' ceiling check (only a
  // supplied value is range-checked there), so check it here.
  if (heartbeatMs > MAX_TIMER_DELAY_MS) {
    throw new TypeError(
      `withHeldFileLock: derived heartbeatMs (${heartbeatMs} = floor(staleMs/3)) exceeds ` +
        `Node's setTimeout ceiling (${MAX_TIMER_DELAY_MS} ms). Use an explicit opts.heartbeatMs ` +
        `at or below ${MAX_TIMER_DELAY_MS} ms.`,
    );
  }

  // Shield the guarded op from a throwing warning hook: warnings are advisory
  // and must never become an unhandled rejection or replace the task's result.
  const rawWarn = opts.onLockWarning;
  const warn = (message: string, err: unknown): void => {
    if (!rawWarn) return;
    try {
      rawWarn(message, err);
    } catch {
      /* swallow — a throwing advisory hook must not crash the guarded op */
    }
  };

  // Fresh owner id per call, so two calls in the same process never mistake
  // each other's lock file for their own.
  const ownerId = randomUUID();
  const lockDir = path.dirname(lockPath);

  const held = await acquireLock(lockPath, lockDir, ownerId, opts, maxWaitMs, pollMs);
  if (!held) {
    // Not acquired (timeout or FS problem): run best-effort and let the caller
    // decide what to do with `acquired === false`.
    return task(false);
  }

  // Set as soon as the task has settled. A heartbeat tick that was already in
  // flight at that moment must not touch the lock path any more: by the time
  // its ownership read resolves, release may have moved our file away and
  // another contender may own `held.path`.
  let releasing = false;

  const heartbeat = setInterval(() => {
    lockHeldBySelf(held)
      .then((ours) => {
        // Skip the refresh when the file is no longer ours (broken/replaced)
        // or when release has begun.
        if (!ours || releasing) return;
        return utimes(held.path, new Date(), new Date());
      })
      .catch((err: unknown) => {
        // A failure observed after release started is usually caused by the
        // release itself (file already moved), so it is not reported.
        if (releasing) return;
        warn("withHeldFileLock heartbeat refresh failed", err);
      });
  }, heartbeatMs);
  // The heartbeat alone must not keep the process alive.
  heartbeat.unref?.();
  try {
    return await task(true);
  } finally {
    releasing = true;
    clearInterval(heartbeat);
    await releaseLock(held, warn, opts.onAfterReleaseRenameForTest);
  }
}
354
+
355
/**
 * Resolve one optional millisecond option.
 *
 * `undefined` selects `fallback` (returned as-is, without range checks). Any
 * supplied value must be a positive finite number no greater than `maxMs`;
 * anything else is rejected instead of being quietly replaced by the default.
 * The ceiling exists because Node resets timer delays above 2^31−1 ms to 1 ms,
 * which would turn a typo into tight polling (codex P2).
 *
 * @param value    The caller-supplied option, or `undefined` if omitted.
 * @param name     Option name used in the error message.
 * @param fallback Value returned when the option is omitted.
 * @param maxMs    Inclusive upper bound for a supplied value.
 * @returns `fallback` when omitted, otherwise the validated `value`.
 * @throws TypeError when `value` is supplied but is not a positive finite
 *         number, or exceeds `maxMs`.
 */
function optionalPositiveMs(
  value: number | undefined,
  name: "maxWaitMs" | "pollMs" | "heartbeatMs",
  fallback: number,
  maxMs: number,
): number {
  if (value === undefined) {
    return fallback;
  }

  const isPositiveFinite = typeof value === "number" && Number.isFinite(value) && value > 0;
  if (!isPositiveFinite) {
    const message = [
      `withHeldFileLock: opts.${name} must be a positive finite number `,
      `(valid range: > 0 ms, finite; got ${formatInvalidNumber(value)}). `,
      `Omit the option to use the default of ${fallback} ms.`,
    ].join("");
    throw new TypeError(message);
  }

  if (value > maxMs) {
    const message = [
      `withHeldFileLock: opts.${name} (${value} ms) exceeds the ${maxMs} ms `,
      `ceiling (Node's setTimeout clamps larger delays to 1ms, turning a `,
      `typo into tight polling). Omit the option to use the default of ${fallback} ms.`,
    ].join("");
    throw new TypeError(message);
  }

  return value;
}
389
+
390
/**
 * Describe a rejected numeric option for use inside an error message.
 *
 * Numbers are printed directly, with NaN and the infinities spelled out with an
 * explicit sign. Any other value is printed as `<typeof> <rendering>`, where
 * the rendering is the JSON form when one exists.
 *
 * This function runs while building a validation error, so it must never throw
 * itself: `JSON.stringify` throws on BigInt values and circular structures, and
 * returns `undefined` for `undefined`, functions and symbols. Both cases fall
 * back to `String(value)` so the caller's intended `TypeError` is the one that
 * surfaces.
 *
 * @param value The rejected input, of any type.
 * @returns A short human-readable label for `value`.
 */
function formatInvalidNumber(value: unknown): string {
  if (typeof value === "number") {
    if (Number.isNaN(value)) return "NaN";
    if (value === Infinity) return "+Infinity";
    if (value === -Infinity) return "-Infinity";
    return String(value);
  }

  let rendered: string | undefined;
  try {
    // Typed as `string`, but yields undefined for undefined/function/symbol.
    rendered = JSON.stringify(value) as string | undefined;
  } catch {
    // BigInt, circular reference, or a throwing toJSON().
    rendered = undefined;
  }
  if (rendered === undefined) {
    try {
      rendered = String(value);
    } catch {
      // e.g. an object without a usable toString/valueOf.
      rendered = Object.prototype.toString.call(value);
    }
  }
  return `${typeof value} ${rendered}`;
}
405
+
406
/**
 * Try to create the lock file, retrying until it is acquired or the wait
 * budget is spent.
 *
 * Each attempt does an exclusive create (`open(lockPath, "wx")`). If the file
 * already exists, the existing lock is handed to `breakStaleLock` and, unless
 * the deadline has passed, the loop sleeps and tries again. Every other
 * failure degrades to "not acquired" rather than rejecting, so the guarded
 * operation can continue best-effort.
 *
 * @param lockPath  Path of the lock file to create.
 * @param lockDir   Directory that must exist before the lock can be created.
 * @param ownerId   Identity written into the lock file for ownership checks.
 * @param opts      Source of `staleMs` and the stale-break test seam.
 * @param maxWaitMs Total time budget for acquisition.
 * @param pollMs    Delay between attempts while the lock is busy.
 * @returns The held-lock handle, or `undefined` when the lock was not
 *          acquired (timeout, directory setup failure, unexpected FS error, or
 *          failure to write/flush the lock metadata).
 */
async function acquireLock(
  lockPath: string,
  lockDir: string,
  ownerId: string,
  opts: HeldFileLockOptions,
  maxWaitMs: number,
  pollMs: number,
): Promise<HeldLock | undefined> {
  try {
    await mkdir(lockDir, { recursive: true });
  } catch {
    // Directory setup failed (e.g. a path component is a file, or permission
    // denied). Report "not acquired" instead of failing the guarded op.
    return undefined;
  }

  const deadline = Date.now() + maxWaitMs;
  for (;;) {
    let handle: Awaited<ReturnType<typeof open>>;
    try {
      handle = await open(lockPath, "wx");
    } catch (err) {
      const code = (err as NodeJS.ErrnoException | undefined)?.code;
      if (code !== "EEXIST") {
        // Unexpected FS error — proceed best-effort without the lock.
        return undefined;
      }
      // Someone holds the lock: break it if it is stale, then either give up
      // (deadline reached) or wait before the next attempt.
      await breakStaleLock(lockPath, opts.staleMs, opts.onBeforeBreakStaleUnlinkForTest);
      if (Date.now() >= deadline) return undefined;
      // Never sleep past the deadline, so a large pollMs cannot stretch the
      // wait far beyond maxWaitMs.
      await sleep(Math.min(pollMs, deadline - Date.now()));
      continue;
    }

    // We created the file. It only counts as a lock once the owner metadata
    // has been written AND the handle closed without error (close can report
    // a deferred write failure).
    let metadataOk: boolean;
    try {
      await handle.writeFile(`${process.pid} ${ownerId} ${new Date().toISOString()}\n`, "utf8");
      metadataOk = true;
    } catch {
      metadataOk = false;
    }
    try {
      await handle.close();
    } catch {
      metadataOk = false;
    }

    if (metadataOk) {
      return { path: lockPath, ownerId };
    }
    // An empty/partial lock file would fail our own ownership check on release
    // and block other callers until it went stale, so remove it and report
    // "not acquired".
    await unlink(lockPath).catch(() => undefined);
    return undefined;
  }
}
475
+
476
/**
 * Break the lock at `lockPath` if — and only if — it is stale, without ever
 * destroying a replacement lock created concurrently (NG7Bg, #1506 round 28).
 *
 * Steps:
 * 1. Judge staleness from the file's mtime and capture its full content as
 *    the stale lock's identity.
 * 2. Re-read and re-stat; stop if the content changed or the mtime was
 *    refreshed.
 * 3. Move the file to a unique trash path with `rename`, then inspect the
 *    MOVED file:
 *    - content differs → a replacement slipped in; put it back;
 *    - content matches but mtime is now fresh → the holder resumed and
 *      heartbeated; put it back;
 *    - content matches and still stale → delete the trash file.
 *    "Put it back" uses `link`, which fails if `lockPath` already exists, so a
 *    lock acquired by yet another contender is never overwritten; in that case
 *    the moved file is left in trash rather than deleted, because it may
 *    belong to a live holder.
 *
 * The trash name includes a random UUID. `rename` silently replaces an
 * existing destination, so a name built only from pid + millisecond timestamp
 * can be shared by two breakers running in the same process within the same
 * millisecond — one would then inspect, restore or delete the file the other
 * moved, which can destroy a live replacement lock.
 *
 * All filesystem errors are swallowed (the lock is advisory). The only thing
 * that can propagate is an exception thrown by the test seam.
 *
 * @param lockPath Path of the lock file to examine.
 * @param staleMs  Age in ms beyond which the lock counts as stale.
 * @param onBeforeBreakStaleUnlinkForTest Test seam invoked between the initial
 *        staleness judgment and the re-validation; unset in production.
 */
async function breakStaleLock(
  lockPath: string,
  staleMs: number,
  onBeforeBreakStaleUnlinkForTest: (() => Promise<void> | void) | undefined,
): Promise<void> {
  let staleIdentity: string;
  try {
    const info = await stat(lockPath);
    if (Date.now() - info.mtimeMs <= staleMs) {
      // Fresh enough: a live holder's heartbeat keeps it that way.
      return;
    }
    staleIdentity = await readFile(lockPath, "utf8");
  } catch {
    // Lock vanished (released by its holder) or could not be inspected.
    return;
  }

  if (onBeforeBreakStaleUnlinkForTest) {
    await onBeforeBreakStaleUnlinkForTest();
  }

  // Put a moved lock back at lockPath without overwriting anything that has
  // appeared there in the meantime. If lockPath is occupied, the moved file is
  // deliberately left in trash.
  const restoreFromTrash = async (trashPath: string): Promise<void> => {
    try {
      await link(trashPath, lockPath);
    } catch {
      return;
    }
    // Restored: drop the now-redundant second hard link.
    await unlink(trashPath).catch(() => undefined);
  };

  try {
    // Re-validate right before breaking: same identity AND still stale.
    const current = await readFile(lockPath, "utf8");
    if (current !== staleIdentity) return; // replaced — leave the fresh lock
    const recheck = await stat(lockPath);
    if (Date.now() - recheck.mtimeMs <= staleMs) return; // heartbeat refreshed it

    // Atomic claim: only one contender's rename of a given file succeeds; the
    // others fail because the source is gone and land in the outer catch.
    const trashPath = `${lockPath}.breaking.${process.pid}.${Date.now()}.${randomUUID()}`;
    await rename(lockPath, trashPath);
    try {
      const moved = await readFile(trashPath, "utf8");
      if (moved !== staleIdentity) {
        // We moved a replacement created after our last check — give it back.
        await restoreFromTrash(trashPath);
        return;
      }
      // Same lock, but its holder may have resumed and heartbeated between
      // our stat and the rename; if so it is live and must be restored.
      const movedStat = await stat(trashPath);
      if (Date.now() - movedStat.mtimeMs <= staleMs) {
        await restoreFromTrash(trashPath);
      } else {
        // Confirmed stale: finish the break.
        await unlink(trashPath).catch(() => undefined);
      }
    } catch {
      // The moved file could not be inspected — clean it up best-effort.
      await unlink(trashPath).catch(() => undefined);
    }
  } catch {
    // The lock changed or vanished between checks — another process handled it.
  }
}
580
+
581
/**
 * Remove the lock file, but only if it still belongs to this acquirer (its
 * content contains `held.ownerId`). Ownership is checked twice:
 *
 * 1. Pre-check on `held.path`. If the file is missing, or its content no
 *    longer contains our owner id (someone broke our lock and created a
 *    replacement), return without touching it. Renaming a replacement away
 *    would vacate the lock path while its holder is still working.
 * 2. Claim by `rename` to a trash path, then re-check the MOVED file. If it is
 *    ours, delete it. If a replacement appeared between the pre-check and the
 *    rename, put it back with `link` (which refuses to overwrite an existing
 *    lock path); when the lock path is already occupied, the moved file is
 *    left in trash rather than deleted.
 *
 * Never rejects because of a filesystem error: failures outside the guarded
 * inner steps are reported through `warn`.
 *
 * @param held  Handle of the lock this call acquired.
 * @param warn  Sink for a non-fatal release failure.
 * @param onAfterReleaseRenameForTest Test seam invoked right after the rename;
 *        unset in production.
 */
async function releaseLock(
  held: HeldLock,
  warn: (message: string, err: unknown) => void,
  onAfterReleaseRenameForTest: (() => Promise<void> | void) | undefined,
): Promise<void> {
  try {
    let precheck: string;
    try {
      precheck = await readFile(held.path, "utf8");
    } catch {
      // Lock file is gone — nothing to release.
      return;
    }
    if (!precheck.includes(held.ownerId)) {
      // A replacement lock sits at the path; it belongs to its own holder.
      return;
    }

    // Ours at the time of reading. Claim it atomically, then verify what we
    // actually moved, since a replacement may have appeared in between.
    const trashPath = `${held.path}.releasing.${process.pid}.${Date.now()}`;
    await rename(held.path, trashPath);
    if (onAfterReleaseRenameForTest) {
      await onAfterReleaseRenameForTest();
    }

    try {
      const moved = await readFile(trashPath, "utf8");
      const movedIsOurs = moved.includes(held.ownerId);
      if (!movedIsOurs) {
        // Give the replacement back without overwriting a newer lock.
        try {
          await link(trashPath, held.path);
        } catch {
          // The lock path is occupied by a newer holder: keep the moved file
          // in trash instead of destroying what may be a live lock.
          return;
        }
      }
      // Either it was our own lock, or it has just been restored via link:
      // in both cases the trash entry is no longer needed.
      await unlink(trashPath).catch(() => undefined);
    } catch {
      // The moved file could not be read — clean it up best-effort.
      await unlink(trashPath).catch(() => undefined);
    }
  } catch (err) {
    // Best-effort release; a leftover lock is broken as stale by a later acquire.
    warn("withHeldFileLock release failed", err);
  }
}
655
+
656
/**
 * Report whether the lock file at `held.path` was written by this acquirer.
 *
 * The file content is split on whitespace and its SECOND token (the owner id
 * in the `<pid> <owner-uuid> <iso>` layout) is compared with `held.ownerId`.
 * The pid and timestamp tokens are not compared.
 *
 * @param held Handle whose path is read and whose owner id is expected.
 * @returns `true` when the second token equals `held.ownerId`; `false` when it
 *          differs, is absent, or the file cannot be read.
 */
async function lockHeldBySelf(held: HeldLock): Promise<boolean> {
  let content: string;
  try {
    content = await readFile(held.path, "utf8");
  } catch {
    return false;
  }
  const [, ownerToken] = content.trim().split(/\s+/);
  return typeof ownerToken === "string" && ownerToken === held.ownerId;
}
671
+
672
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * Built on the plain `Promise` constructor rather than
 * `Promise.withResolvers`, which only exists on newer runtimes; on an older
 * one the lock acquire loop would throw the first time it had to wait.
 *
 * The timer is deliberately NOT unref'd: this is awaited inside the acquire
 * loop, and an unref'd timer would let Node exit mid-poll when nothing else is
 * pending. (The heartbeat interval is unref'd separately.)
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that fulfills with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}