@indigoai-us/hq-cloud 6.7.1 → 6.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3604,7 +3604,7 @@ describe("runRunnerWithLoop — operation lock", () => {
3604
3604
  return p;
3605
3605
  }
3606
3606
 
3607
- it("one-shot sync refuses fast (exit 17) when another op holds the root", async () => {
3607
+ it("one-shot sync refuses immediately (exit 17) with --lock-timeout 0 when another op holds the root", async () => {
3608
3608
  const lp = writeLiveHolder("rescue");
3609
3609
  const errs: string[] = [];
3610
3610
  const spy = vi
@@ -3614,12 +3614,21 @@ describe("runRunnerWithLoop — operation lock", () => {
3614
3614
  return true;
3615
3615
  });
3616
3616
 
3617
- // No --watch → one-shot. Refusal short-circuits BEFORE runRunner (so no
3618
- // network / auth is touched).
3619
- const code = await runRunnerWithLoop(["--companies", "--hq-root", HQ]);
3617
+ // No --watch → one-shot. `--lock-timeout 0` keeps the pre-wait
3618
+ // refuse-immediately behavior (the default is now to WAIT for the holder).
3619
+ // Refusal short-circuits BEFORE runRunner (so no network / auth is touched).
3620
+ const start = Date.now();
3621
+ const code = await runRunnerWithLoop([
3622
+ "--companies",
3623
+ "--hq-root",
3624
+ HQ,
3625
+ "--lock-timeout",
3626
+ "0",
3627
+ ]);
3620
3628
 
3621
3629
  spy.mockRestore();
3622
3630
  expect(code).toBe(OPERATION_LOCKED_EXIT);
3631
+ expect(Date.now() - start).toBeLessThan(1500); // did not wait
3623
3632
  expect(errs.join("")).toContain("rescue"); // names the holder
3624
3633
  // The holder's lock is left intact — we refused, we didn't take it over.
3625
3634
  const held = JSON.parse(fs.readFileSync(lp, "utf8"));
@@ -3627,6 +3636,76 @@ describe("runRunnerWithLoop — operation lock", () => {
3627
3636
  expect(held.command).toBe("rescue");
3628
3637
  });
3629
3638
 
3639
+ it("one-shot sync WAITS for a live holder (default), bounded by --lock-timeout, then refuses with exit 17", async () => {
3640
+ const lp = writeLiveHolder("rescue");
3641
+ const errs: string[] = [];
3642
+ const spy = vi
3643
+ .spyOn(process.stderr, "write")
3644
+ .mockImplementation((chunk: string | Uint8Array) => {
3645
+ errs.push(String(chunk));
3646
+ return true;
3647
+ });
3648
+
3649
+ // The holder never releases here; --lock-timeout 1 bounds the wait so the
3650
+ // runner waits ~1s, emits a single "Waiting for …" status line, then
3651
+ // refuses (exit 17) — proving the runner threads the flag through and
3652
+ // takes the wait path (rather than refusing instantly). It still never
3653
+ // enters runRunner, so no network/auth is touched.
3654
+ const start = Date.now();
3655
+ const code = await runRunnerWithLoop([
3656
+ "--companies",
3657
+ "--hq-root",
3658
+ HQ,
3659
+ "--lock-timeout",
3660
+ "1",
3661
+ ]);
3662
+ const elapsed = Date.now() - start;
3663
+
3664
+ spy.mockRestore();
3665
+ expect(code).toBe(OPERATION_LOCKED_EXIT);
3666
+ expect(elapsed).toBeGreaterThanOrEqual(800); // actually waited ~1s
3667
+ expect(errs.join("")).toContain("Waiting for"); // status line emitted once
3668
+ expect(errs.join("")).toContain("rescue"); // names the holder
3669
+ const held = JSON.parse(fs.readFileSync(lp, "utf8"));
3670
+ expect(held.pid).toBe(1); // holder untouched
3671
+ }, 20_000);
3672
+
3673
+ it("DEV-1772: one-shot waits out a SHORT-LIVED holder (reindex) then PROCEEDS to sync", async () => {
3674
+ // The exact reported scenario (feedback_28a1833f / DEV-1772): a frequent
3675
+ // ~1-min `reindex` briefly holds the lock; the instant-sync one-shot used
3676
+ // to exit 17 and silently die. Now it WAITS (default) and proceeds the
3677
+ // moment the short holder releases. We model the short holder with a
3678
+ // foreign live pid (1) whose lock file is removed shortly after, and inject
3679
+ // runPass so "proceeds → syncs" is observable without the network.
3680
+ writeLiveHolder("reindex");
3681
+ const errs: string[] = [];
3682
+ const spy = vi
3683
+ .spyOn(process.stderr, "write")
3684
+ .mockImplementation((chunk: string | Uint8Array) => {
3685
+ errs.push(String(chunk));
3686
+ return true;
3687
+ });
3688
+
3689
+ const runPass = vi.fn().mockResolvedValue(0);
3690
+ // The short-lived holder releases ~150ms in.
3691
+ setTimeout(() => fs.rmSync(lockPathFor(HQ), { force: true }), 150);
3692
+
3693
+ const code = await runRunnerWithLoop(
3694
+ ["--companies", "--hq-root", HQ],
3695
+ { runPass },
3696
+ );
3697
+
3698
+ spy.mockRestore();
3699
+ // It did NOT refuse/die — it waited then ran the sync pass exactly once.
3700
+ expect(code).toBe(0);
3701
+ expect(code).not.toBe(OPERATION_LOCKED_EXIT);
3702
+ expect(runPass).toHaveBeenCalledTimes(1);
3703
+ // A single "Waiting for …" status line named the short holder.
3704
+ const out = errs.join("");
3705
+ expect(out).toContain("Waiting for");
3706
+ expect(out).toContain("reindex");
3707
+ }, 20_000);
3708
+
3630
3709
  it("the watch runner is EXEMPT — runs despite a held lock and never takes it", async () => {
3631
3710
  const lp = writeLiveHolder("sync");
3632
3711
  const watcher = makeWatcherStub();
@@ -607,6 +607,13 @@ interface ParsedArgs {
607
607
  * mode (single-company runs never visit the personal target).
608
608
  */
609
609
  skipPersonal: boolean;
610
+ /**
611
+ * Bounded wait (seconds) for the per-root operation lock when another op is
612
+ * already running. `0` → refuse immediately (pre-wait behavior); omitted →
613
+ * inherit `HQ_OP_LOCK_TIMEOUT` / infinite wait. Only meaningful on the
614
+ * one-shot path (the `--watch` runner is lock-exempt).
615
+ */
616
+ lockTimeoutSec?: number;
610
617
  }
611
618
 
612
619
  function parseArgs(argv: string[]): ParsedArgs | { error: string } {
@@ -620,6 +627,7 @@ function parseArgs(argv: string[]): ParsedArgs | { error: string } {
620
627
  let pollRemoteMs: number | undefined;
621
628
  let skipPersonal = false;
622
629
  let eventPush = false;
630
+ let lockTimeoutSec: number | undefined;
623
631
 
624
632
  for (let i = 0; i < argv.length; i++) {
625
633
  const arg = argv[i];
@@ -693,6 +701,18 @@ function parseArgs(argv: string[]): ParsedArgs | { error: string } {
693
701
  // @getindigo.ai identities for the first release.
694
702
  eventPush = true;
695
703
  break;
704
+ case "--lock-timeout": {
705
+ const val = argv[++i];
706
+ if (!val) return { error: "--lock-timeout requires a value (seconds)" };
707
+ const n = Number(val);
708
+ if (!Number.isInteger(n) || n < 0) {
709
+ return {
710
+ error: `--lock-timeout must be a non-negative integer (seconds), got: ${val}`,
711
+ };
712
+ }
713
+ lockTimeoutSec = n;
714
+ break;
715
+ }
696
716
  default:
697
717
  return { error: `Unknown argument: ${arg}` };
698
718
  }
@@ -732,6 +752,7 @@ function parseArgs(argv: string[]): ParsedArgs | { error: string } {
732
752
  pollRemoteMs,
733
753
  skipPersonal,
734
754
  eventPush,
755
+ lockTimeoutSec,
735
756
  };
736
757
  }
737
758
 
@@ -1855,10 +1876,27 @@ export async function runRunnerWithLoop(
1855
1876
  // surfaces the parse error rather than us masking it with a lock failure.
1856
1877
  const parsed = parseArgs(argv);
1857
1878
  if ("error" in parsed) return runRunner(argv);
1879
+ // The actual sync pass — same seam the watch loop uses (deps.runPass),
1880
+ // so a test can assert "waits for a short-lived holder, THEN proceeds to
1881
+ // sync" without touching the network. Production passes `argv` to
1882
+ // runRunner exactly as before. Regression guard for DEV-1772
1883
+ // (feedback_28a1833f): instant-sync one-shots used to exit 17 and die on
1884
+ // a lock conflict with the ~1-min reindex hook; they now WAIT (default)
1885
+ // and proceed once the short holder releases.
1886
+ const runOnce = deps.runPass ?? ((passArgv: string[]) => runRunner(passArgv));
1858
1887
  try {
1859
- return await withOperationLock(parsed.hqRoot, "sync", () => runRunner(argv));
1888
+ return await withOperationLock(parsed.hqRoot, "sync", () => runOnce(argv), {
1889
+ timeoutSec: parsed.lockTimeoutSec,
1890
+ });
1860
1891
  } catch (err) {
1861
1892
  if (err instanceof OperationLockedError) {
1893
+ // The lock wait was BOUNDED and tripped (a holder never released
1894
+ // within --lock-timeout / HQ_OP_LOCK_TIMEOUT). Surface it loudly on
1895
+ // stderr and exit with the stable OPERATION_LOCKED_EXIT (17) so the
1896
+ // spawner (menubar) can recognize a lock conflict and SCHEDULE A
1897
+ // RETRY rather than treating it as a hard failure and silently giving
1898
+ // up (DEV-1772). With the default (no bound) we never reach here — the
1899
+ // one-shot waits indefinitely and proceeds.
1862
1900
  process.stderr.write(err.message + "\n");
1863
1901
  return OPERATION_LOCKED_EXIT;
1864
1902
  }
@@ -135,9 +135,10 @@ describe("reindex", () => {
135
135
 
136
136
  // ── operation lock (mutual exclusion with sync/rescue) ──────────────────
137
137
 
138
- it("refuses (OPERATION_LOCKED_EXIT) when another op holds this root's lock", () => {
139
- // A live holder in another process (pid 1) → reindex must refuse fast and
140
- // do no work.
138
+ it("refuses (OPERATION_LOCKED_EXIT) when another op holds this root's lock (lockTimeoutSec:0)", () => {
139
+ // A live holder in another process (pid 1) → with lockTimeoutSec:0 reindex
140
+ // refuses immediately and does no work. (The new default is to WAIT; the
141
+ // wait-then-acquire path is covered in operation-lock.test.ts.)
141
142
  const lp = lockPathFor(root);
142
143
  fs.mkdirSync(path.dirname(lp), { recursive: true });
143
144
  fs.writeFileSync(
@@ -145,7 +146,7 @@ describe("reindex", () => {
145
146
  JSON.stringify({ pid: 1, command: "sync", startedAt: new Date(0).toISOString(), hqRoot: root }),
146
147
  );
147
148
  const before = fs.existsSync(path.join(root, ".claude/skills"));
148
- const { status } = reindex({ repoRoot: root });
149
+ const { status } = reindex({ repoRoot: root, lockTimeoutSec: 0 });
149
150
  expect(status).toBe(OPERATION_LOCKED_EXIT);
150
151
  // It refused before doing any work (didn't create .claude/skills).
151
152
  expect(fs.existsSync(path.join(root, ".claude/skills"))).toBe(before);
@@ -35,6 +35,17 @@ export interface ReindexOptions {
35
35
  * exclusive with a running sync/rescue.
36
36
  */
37
37
  skipLock?: boolean;
38
+ /**
39
+ * Bounded wait (seconds) for the per-root operation lock when a sync/rescue
40
+ * is already running. `0` → refuse immediately; omitted → inherit
41
+ * `HQ_OP_LOCK_TIMEOUT` / infinite wait. Ignored when `skipLock` is set.
42
+ *
43
+ * NB: standalone `hq reindex` waits by default, which is what a human running
44
+ * it wants. The hq-core reindex HOOK (Stop / PostToolUse) should set a small
45
+ * `HQ_OP_LOCK_TIMEOUT` (or `0`) so a hook fired mid-sync never blocks the
46
+ * interactive agent indefinitely.
47
+ */
48
+ lockTimeoutSec?: number;
38
49
  }
39
50
 
40
51
  export interface ReindexResult {
@@ -151,7 +162,7 @@ export function reindex(opts: ReindexOptions = {}): ReindexResult {
151
162
  let opLock: LockHandle | null = null;
152
163
  if (!opts.skipLock) {
153
164
  try {
154
- opLock = acquireOperationLock(root, "reindex");
165
+ opLock = acquireOperationLock(root, "reindex", { timeoutSec: opts.lockTimeoutSec });
155
166
  } catch (err) {
156
167
  if (err instanceof OperationLockedError) {
157
168
  warn(err.message);
@@ -1,5 +1,47 @@
1
1
  import { describe, it, expect } from "vitest";
2
- import { buildRescueArgs } from "./rescue.js";
2
+ import { buildRescueArgs, extractLockTimeout } from "./rescue.js";
3
+
4
+ describe("extractLockTimeout", () => {
5
+ it("returns the explicit option untouched when extraArgs is empty/absent", () => {
6
+ expect(extractLockTimeout({ lockTimeoutSec: 5 })).toEqual({
7
+ lockTimeoutSec: 5,
8
+ cleanedExtraArgs: undefined,
9
+ });
10
+ expect(extractLockTimeout({ lockTimeoutSec: 5, extraArgs: [] })).toEqual({
11
+ lockTimeoutSec: 5,
12
+ cleanedExtraArgs: [],
13
+ });
14
+ });
15
+
16
+ it("pulls --lock-timeout <secs> out of extraArgs and strips it from the forwarded args", () => {
17
+ const r = extractLockTimeout({
18
+ extraArgs: ["--lock-timeout", "30", "--dry-run", "--yes"],
19
+ });
20
+ expect(r.lockTimeoutSec).toBe(30);
21
+ // The flag + its value never reach the rescue script.
22
+ expect(r.cleanedExtraArgs).toEqual(["--dry-run", "--yes"]);
23
+ });
24
+
25
+ it("honors --lock-timeout 0 (refuse immediately)", () => {
26
+ const r = extractLockTimeout({ extraArgs: ["--lock-timeout", "0"] });
27
+ expect(r.lockTimeoutSec).toBe(0);
28
+ expect(r.cleanedExtraArgs).toEqual([]);
29
+ });
30
+
31
+ it("explicit option wins over a flag in extraArgs", () => {
32
+ const r = extractLockTimeout({
33
+ lockTimeoutSec: 0,
34
+ extraArgs: ["--lock-timeout", "60"],
35
+ });
36
+ expect(r.lockTimeoutSec).toBe(0);
37
+ expect(r.cleanedExtraArgs).toEqual([]);
38
+ });
39
+
40
+ it("ignores a non-numeric / negative --lock-timeout value (treated as unset)", () => {
41
+ expect(extractLockTimeout({ extraArgs: ["--lock-timeout", "abc"] }).lockTimeoutSec).toBeUndefined();
42
+ expect(extractLockTimeout({ extraArgs: ["--lock-timeout", "-3"] }).lockTimeoutSec).toBeUndefined();
43
+ });
44
+ });
3
45
 
4
46
  describe("buildRescueArgs", () => {
5
47
  it("emits no args for an empty option set (script defaults apply)", () => {
package/src/cli/rescue.ts CHANGED
@@ -57,10 +57,52 @@ export interface RescueOptions {
57
57
  /** GitHub token forwarded to the script as `GH_TOKEN` (avoids the
58
58
  * anonymous-clone rate limit; required for private sources). */
59
59
  ghToken?: string;
60
+ /**
61
+ * Bounded wait (seconds) for the per-root operation lock when a sync/reindex
62
+ * is already running. `0` → refuse immediately; omitted → inherit
63
+ * `HQ_OP_LOCK_TIMEOUT` / infinite wait. Also accepted as `--lock-timeout
64
+ * <secs>` inside {@link extraArgs} (the machine `hq-rescue` entrypoint
65
+ * forwards raw argv) — it is consumed here and never passed to the rescue
66
+ * script, which doesn't understand it.
67
+ */
68
+ lockTimeoutSec?: number;
60
69
  /** Escape hatch — additional raw args appended verbatim. */
61
70
  extraArgs?: string[];
62
71
  }
63
72
 
73
/**
 * Pull `--lock-timeout <secs>` out of `extraArgs` (the raw argv the machine
 * `hq-rescue` entrypoint forwards) so it controls the operation-lock wait
 * instead of leaking to the rescue script.
 *
 * Rules:
 * - An explicit `opts.lockTimeoutSec` always wins over a flag in `extraArgs`.
 * - The flag's value is accepted only when it is a non-blank, non-negative
 *   integer; anything else (`abc`, `-3`, an empty string) leaves the timeout
 *   unset but is still stripped from the forwarded args.
 * - If the token after `--lock-timeout` is itself a `--flag` (i.e. the value
 *   was omitted), only `--lock-timeout` is dropped; the following flag is kept
 *   and forwarded rather than being swallowed as a bogus value.
 * - When the flag is given more than once, the last valid value wins.
 *
 * @param opts Rescue options; only `lockTimeoutSec` and `extraArgs` are read.
 * @returns The resolved timeout and a copy of `extraArgs` with the flag (and
 *   its value) removed. When `extraArgs` is empty or absent it is returned
 *   as-is.
 */
export function extractLockTimeout(opts: RescueOptions): {
  lockTimeoutSec: number | undefined;
  cleanedExtraArgs: string[] | undefined;
} {
  const raw = opts.extraArgs;
  if (!raw || raw.length === 0) {
    return { lockTimeoutSec: opts.lockTimeoutSec, cleanedExtraArgs: raw };
  }

  const cleaned: string[] = [];
  let fromArgs: number | undefined;
  // True while the previous token was `--lock-timeout` and we still owe it a value.
  let expectingValue = false;

  for (const token of raw) {
    if (expectingValue) {
      expectingValue = false;
      if (!token.startsWith("--")) {
        // Number("") and Number("  ") are 0, so blank values must be rejected
        // explicitly or they would silently mean "refuse immediately".
        const val = token.trim() === "" ? Number.NaN : Number(token);
        if (Number.isInteger(val) && val >= 0) fromArgs = val;
        continue; // the value never reaches the rescue script
      }
      // The value was omitted and `token` is another flag: fall through so it
      // is handled (and forwarded) like any other argument.
    }
    if (token === "--lock-timeout") {
      expectingValue = true;
      continue;
    }
    cleaned.push(token);
  }

  // Explicit option beats the parsed flag.
  return {
    lockTimeoutSec: opts.lockTimeoutSec ?? fromArgs,
    cleanedExtraArgs: cleaned,
  };
}
105
+
64
106
  export interface RescueResult {
65
107
  /** Exit status of the underlying script (0 = success). */
66
108
  status: number;
@@ -100,9 +142,13 @@ export function rescue(opts: RescueOptions = {}): RescueResult {
100
142
  // lock on the same root the rescue script resolves (cwd when --hq-root is
101
143
  // omitted). Rescue is never the exempt push watcher, so it always locks.
102
144
  const lockRoot = opts.hqRoot ?? process.cwd();
145
+ // Consume --lock-timeout from extraArgs (machine entrypoint) before it can
146
+ // reach the rescue script, which doesn't understand it.
147
+ const { lockTimeoutSec, cleanedExtraArgs } = extractLockTimeout(opts);
148
+ const rescueOpts: RescueOptions = { ...opts, extraArgs: cleanedExtraArgs };
103
149
  let handle;
104
150
  try {
105
- handle = acquireOperationLock(lockRoot, "rescue");
151
+ handle = acquireOperationLock(lockRoot, "rescue", { timeoutSec: lockTimeoutSec });
106
152
  } catch (err) {
107
153
  if (err instanceof OperationLockedError) {
108
154
  process.stderr.write(err.message + "\n");
@@ -111,7 +157,7 @@ export function rescue(opts: RescueOptions = {}): RescueResult {
111
157
  throw err;
112
158
  }
113
159
  try {
114
- const args = buildRescueArgs(opts);
160
+ const args = buildRescueArgs(rescueOpts);
115
161
  const env: NodeJS.ProcessEnv = { ...process.env };
116
162
  if (opts.ghToken) env.GH_TOKEN = opts.ghToken;
117
163
  const { status } = runRescue(args, { env });
@@ -9,10 +9,13 @@ import * as os from "os";
9
9
  import * as path from "path";
10
10
  import {
11
11
  acquireOperationLock,
12
+ acquireOperationLockAsync,
13
+ withOperationLock,
12
14
  withOperationLockSync,
13
15
  lockPathFor,
14
16
  OperationLockedError,
15
17
  OPERATION_LOCKED_EXIT,
18
+ DEFAULT_LOCK_POLL_MS,
16
19
  type LockInfo,
17
20
  } from "./operation-lock.js";
18
21
 
@@ -44,6 +47,7 @@ describe("operation-lock", () => {
44
47
  stateDir = fs.mkdtempSync(path.join(os.tmpdir(), "hq-oplock-state-"));
45
48
  process.env.HQ_STATE_DIR = stateDir;
46
49
  delete process.env.HQ_DISABLE_OP_LOCK;
50
+ delete process.env.HQ_OP_LOCK_TIMEOUT;
47
51
  rootA = fs.mkdtempSync(path.join(os.tmpdir(), "hq-rootA-"));
48
52
  rootB = fs.mkdtempSync(path.join(os.tmpdir(), "hq-rootB-"));
49
53
  });
@@ -54,6 +58,7 @@ describe("operation-lock", () => {
54
58
  fs.rmSync(rootB, { recursive: true, force: true });
55
59
  delete process.env.HQ_STATE_DIR;
56
60
  delete process.env.HQ_DISABLE_OP_LOCK;
61
+ delete process.env.HQ_OP_LOCK_TIMEOUT;
57
62
  });
58
63
 
59
64
  it("the lock path is under the state dir, keyed per canonical root", () => {
@@ -75,14 +80,17 @@ describe("operation-lock", () => {
75
80
  expect(fs.existsSync(h.path)).toBe(false);
76
81
  });
77
82
 
78
- it("refuses fast with the holder's command + pid when a LIVE process holds it", () => {
83
+ it("refuses immediately (wait:false) with the holder's command + pid when a LIVE process holds it", () => {
79
84
  // Simulate a DIFFERENT live process holding the lock. PID 1 (init/systemd)
80
85
  // is always alive and is never our own pid, so kill(1,0) reports alive and
81
- // the same-process reclaim path does not apply.
86
+ // the same-process reclaim path does not apply. `wait:false` keeps the old
87
+ // refuse-immediately behavior (the default is now to WAIT).
82
88
  writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
83
- expect(() => acquireOperationLock(rootA, "sync")).toThrowError(OperationLockedError);
89
+ expect(() => acquireOperationLock(rootA, "sync", { wait: false })).toThrowError(
90
+ OperationLockedError,
91
+ );
84
92
  try {
85
- acquireOperationLock(rootA, "sync");
93
+ acquireOperationLock(rootA, "sync", { wait: false });
86
94
  } catch (e) {
87
95
  const err = e as OperationLockedError;
88
96
  expect(err.holder.command).toBe("rescue");
@@ -92,11 +100,124 @@ describe("operation-lock", () => {
92
100
  }
93
101
  });
94
102
 
95
- it("reclaims a stale lock whose holder PID is dead (takeover)", () => {
103
+ it("timeoutSec:0 refuses immediately (no wait) equivalent to wait:false", () => {
104
+ writeLock(lockPathFor(rootA), { pid: 1, command: "sync" });
105
+ const start = Date.now();
106
+ expect(() => acquireOperationLock(rootA, "reindex", { timeoutSec: 0 })).toThrowError(
107
+ OperationLockedError,
108
+ );
109
+ // Did not actually sleep.
110
+ expect(Date.now() - start).toBeLessThan(DEFAULT_LOCK_POLL_MS);
111
+ });
112
+
113
+ it("a bounded timeoutSec waits, then refuses with the old message + exit code", () => {
114
+ writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
115
+ const start = Date.now();
116
+ let thrown: unknown;
117
+ try {
118
+ // 150ms bound, 40ms poll → waits ~150ms then gives up. Suppress the
119
+ // stderr status line with a no-op onWaitStart.
120
+ acquireOperationLock(rootA, "sync", {
121
+ timeoutSec: 0.15,
122
+ pollIntervalMs: 40,
123
+ onWaitStart: () => {},
124
+ });
125
+ } catch (e) {
126
+ thrown = e;
127
+ }
128
+ const elapsed = Date.now() - start;
129
+ expect(thrown).toBeInstanceOf(OperationLockedError);
130
+ expect((thrown as OperationLockedError).message).toContain("rescue");
131
+ expect(OPERATION_LOCKED_EXIT).toBe(17);
132
+ // It actually waited (didn't refuse instantly) but didn't hang forever.
133
+ expect(elapsed).toBeGreaterThanOrEqual(120);
134
+ expect(elapsed).toBeLessThan(3000);
135
+ });
136
+
137
+ it("HQ_OP_LOCK_TIMEOUT env bounds the wait when no explicit option is given", () => {
138
+ process.env.HQ_OP_LOCK_TIMEOUT = "0"; // 0 → refuse immediately
139
+ writeLock(lockPathFor(rootA), { pid: 1, command: "sync" });
140
+ const start = Date.now();
141
+ expect(() =>
142
+ acquireOperationLock(rootA, "rescue", { onWaitStart: () => {} }),
143
+ ).toThrowError(OperationLockedError);
144
+ expect(Date.now() - start).toBeLessThan(DEFAULT_LOCK_POLL_MS);
145
+ });
146
+
147
+ it("an explicit timeoutSec overrides the HQ_OP_LOCK_TIMEOUT env", () => {
148
+ process.env.HQ_OP_LOCK_TIMEOUT = "9999"; // would be a near-infinite wait
149
+ writeLock(lockPathFor(rootA), { pid: 1, command: "sync" });
150
+ // The explicit 0 wins → refuse immediately rather than honoring the env.
151
+ expect(() =>
152
+ acquireOperationLock(rootA, "rescue", { timeoutSec: 0 }),
153
+ ).toThrowError(OperationLockedError);
154
+ });
155
+
156
+ it("onWaitStart fires exactly once, naming the holder, even across many polls", () => {
157
+ writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
158
+ const calls: Array<{ cmd: string; attempted: string }> = [];
159
+ expect(() =>
160
+ acquireOperationLock(rootA, "sync", {
161
+ timeoutSec: 0.16,
162
+ pollIntervalMs: 30, // ~5 polls within the window
163
+ onWaitStart: (holder, attempted) => calls.push({ cmd: holder.command, attempted }),
164
+ }),
165
+ ).toThrowError(OperationLockedError);
166
+ expect(calls).toHaveLength(1);
167
+ expect(calls[0]).toEqual({ cmd: "rescue", attempted: "sync" });
168
+ });
169
+
170
+ it("a waiter acquires the lock the moment the holder releases (async poll path)", async () => {
171
+ const p = lockPathFor(rootA);
172
+ // A foreign LIVE holder (pid 1) initially owns the lock.
173
+ writeLock(p, { pid: 1, command: "rescue" });
174
+ // Simulate the holder finishing ~80ms in by removing its lock file.
175
+ const release = setTimeout(() => fs.rmSync(p, { force: true }), 80);
176
+ const start = Date.now();
177
+ const h = await acquireOperationLockAsync(rootA, "sync", {
178
+ pollIntervalMs: 20,
179
+ onWaitStart: () => {},
180
+ });
181
+ clearTimeout(release);
182
+ const elapsed = Date.now() - start;
183
+ // We waited for the release, then took it over.
184
+ expect(elapsed).toBeGreaterThanOrEqual(60);
185
+ const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
186
+ expect(info.pid).toBe(process.pid);
187
+ expect(info.command).toBe("sync");
188
+ h.release();
189
+ });
190
+
191
+ it("multiple foreign holders in a row: each release lets the next waiter in (no FIFO guarantee)", async () => {
192
+ // The mutex is CROSS-PROCESS: it keys liveness on the holder's PID. Two
193
+ // waiters in the SAME process share a pid, so the same-process reclaim path
194
+ // would let them stomp each other — that scenario is unsupported by design.
195
+ // Here we model the real case: a sequence of FOREIGN holders (pid 1) that
196
+ // each release, with a single waiter acquiring the instant the lock frees.
197
+ // Order among multiple distinct-process waiters is whoever wins the next
198
+ // O_EXCL race after a free — best-effort, NOT FIFO (documented).
199
+ const p = lockPathFor(rootA);
200
+ writeLock(p, { pid: 1, command: "sync" });
201
+ // Free it shortly; the waiter should grab it right after.
202
+ setTimeout(() => fs.rmSync(p, { force: true }), 50);
203
+ const h = await acquireOperationLockAsync(rootA, "reindex", {
204
+ pollIntervalMs: 15,
205
+ onWaitStart: () => {},
206
+ });
207
+ const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
208
+ expect(info.command).toBe("reindex");
209
+ expect(info.pid).toBe(process.pid);
210
+ h.release();
211
+ });
212
+
213
+ it("reclaims a stale lock whose holder PID is dead (takeover, never waits)", () => {
96
214
  const stale = deadPid();
97
215
  writeLock(lockPathFor(rootA), { pid: stale, command: "sync" });
98
- // The dead holder must not block us.
216
+ const start = Date.now();
217
+ // The dead holder must not block us — even with an infinite default wait,
218
+ // takeover is immediate.
99
219
  const h = acquireOperationLock(rootA, "rescue");
220
+ expect(Date.now() - start).toBeLessThan(DEFAULT_LOCK_POLL_MS);
100
221
  const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
101
222
  expect(info.pid).toBe(process.pid); // we took it over
102
223
  expect(info.command).toBe("rescue");
@@ -114,7 +235,7 @@ describe("operation-lock", () => {
114
235
 
115
236
  it("different HQ roots are independent — both may hold concurrently", () => {
116
237
  const a = acquireOperationLock(rootA, "sync");
117
- const b = acquireOperationLock(rootB, "rescue"); // must NOT refuse
238
+ const b = acquireOperationLock(rootB, "rescue"); // must NOT block
118
239
  expect(fs.existsSync(a.path)).toBe(true);
119
240
  expect(fs.existsSync(b.path)).toBe(true);
120
241
  expect(a.path).not.toBe(b.path);
@@ -126,9 +247,14 @@ describe("operation-lock", () => {
126
247
  // A live sync in ANOTHER process holds the root (pid 1 stands in for it).
127
248
  const p = lockPathFor(rootA);
128
249
  writeLock(p, { pid: 1, command: "sync" });
129
- // Neither rescue nor reindex may acquire while that sync holds it.
130
- expect(() => acquireOperationLock(rootA, "rescue")).toThrowError(OperationLockedError);
131
- expect(() => acquireOperationLock(rootA, "reindex")).toThrowError(OperationLockedError);
250
+ // Neither rescue nor reindex may acquire while that sync holds it
251
+ // (wait:false asserts the refusal without hanging on the new wait default).
252
+ expect(() => acquireOperationLock(rootA, "rescue", { wait: false })).toThrowError(
253
+ OperationLockedError,
254
+ );
255
+ expect(() => acquireOperationLock(rootA, "reindex", { wait: false })).toThrowError(
256
+ OperationLockedError,
257
+ );
132
258
  // Once that sync finishes (its lock is gone), the next command acquires.
133
259
  fs.unlinkSync(p);
134
260
  const h2 = acquireOperationLock(rootA, "reindex");
@@ -147,6 +273,17 @@ describe("operation-lock", () => {
147
273
  expect(fs.existsSync(p)).toBe(false); // released on the way out
148
274
  });
149
275
 
276
+ it("withOperationLock (async) releases even when the body throws", async () => {
277
+ const p = lockPathFor(rootA);
278
+ await expect(
279
+ withOperationLock(rootA, "sync", async () => {
280
+ expect(fs.existsSync(p)).toBe(true);
281
+ throw new Error("boom");
282
+ }),
283
+ ).rejects.toThrow("boom");
284
+ expect(fs.existsSync(p)).toBe(false);
285
+ });
286
+
150
287
  it("HQ_DISABLE_OP_LOCK=1 makes acquisition a no-op", () => {
151
288
  process.env.HQ_DISABLE_OP_LOCK = "1";
152
289
  // Even with a live holder on record, the escape hatch acquires without error.