@indigoai-us/hq-cloud 6.7.1 → 6.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/sync-runner.d.ts.map +1 -1
- package/dist/bin/sync-runner.js +33 -1
- package/dist/bin/sync-runner.js.map +1 -1
- package/dist/bin/sync-runner.test.js +73 -4
- package/dist/bin/sync-runner.test.js.map +1 -1
- package/dist/cli/reindex.d.ts +11 -0
- package/dist/cli/reindex.d.ts.map +1 -1
- package/dist/cli/reindex.js +1 -1
- package/dist/cli/reindex.js.map +1 -1
- package/dist/cli/reindex.test.js +5 -4
- package/dist/cli/reindex.test.js.map +1 -1
- package/dist/cli/rescue.d.ts +20 -0
- package/dist/cli/rescue.d.ts.map +1 -1
- package/dist/cli/rescue.js +36 -2
- package/dist/cli/rescue.js.map +1 -1
- package/dist/cli/rescue.test.js +38 -1
- package/dist/cli/rescue.test.js.map +1 -1
- package/dist/operation-lock.d.ts +81 -10
- package/dist/operation-lock.d.ts.map +1 -1
- package/dist/operation-lock.js +177 -27
- package/dist/operation-lock.js.map +1 -1
- package/dist/operation-lock.test.js +122 -11
- package/dist/operation-lock.test.js.map +1 -1
- package/package.json +1 -1
- package/src/bin/sync-runner.test.ts +83 -4
- package/src/bin/sync-runner.ts +39 -1
- package/src/cli/reindex.test.ts +5 -4
- package/src/cli/reindex.ts +12 -1
- package/src/cli/rescue.test.ts +43 -1
- package/src/cli/rescue.ts +48 -2
- package/src/operation-lock.test.ts +147 -10
- package/src/operation-lock.ts +234 -26
|
@@ -3604,7 +3604,7 @@ describe("runRunnerWithLoop — operation lock", () => {
|
|
|
3604
3604
|
return p;
|
|
3605
3605
|
}
|
|
3606
3606
|
|
|
3607
|
-
it("one-shot sync refuses
|
|
3607
|
+
it("one-shot sync refuses immediately (exit 17) with --lock-timeout 0 when another op holds the root", async () => {
|
|
3608
3608
|
const lp = writeLiveHolder("rescue");
|
|
3609
3609
|
const errs: string[] = [];
|
|
3610
3610
|
const spy = vi
|
|
@@ -3614,12 +3614,21 @@ describe("runRunnerWithLoop — operation lock", () => {
|
|
|
3614
3614
|
return true;
|
|
3615
3615
|
});
|
|
3616
3616
|
|
|
3617
|
-
// No --watch → one-shot.
|
|
3618
|
-
//
|
|
3619
|
-
|
|
3617
|
+
// No --watch → one-shot. `--lock-timeout 0` keeps the pre-wait
|
|
3618
|
+
// refuse-immediately behavior (the default is now to WAIT for the holder).
|
|
3619
|
+
// Refusal short-circuits BEFORE runRunner (so no network / auth is touched).
|
|
3620
|
+
const start = Date.now();
|
|
3621
|
+
const code = await runRunnerWithLoop([
|
|
3622
|
+
"--companies",
|
|
3623
|
+
"--hq-root",
|
|
3624
|
+
HQ,
|
|
3625
|
+
"--lock-timeout",
|
|
3626
|
+
"0",
|
|
3627
|
+
]);
|
|
3620
3628
|
|
|
3621
3629
|
spy.mockRestore();
|
|
3622
3630
|
expect(code).toBe(OPERATION_LOCKED_EXIT);
|
|
3631
|
+
expect(Date.now() - start).toBeLessThan(1500); // did not wait
|
|
3623
3632
|
expect(errs.join("")).toContain("rescue"); // names the holder
|
|
3624
3633
|
// The holder's lock is left intact — we refused, we didn't take it over.
|
|
3625
3634
|
const held = JSON.parse(fs.readFileSync(lp, "utf8"));
|
|
@@ -3627,6 +3636,76 @@ describe("runRunnerWithLoop — operation lock", () => {
|
|
|
3627
3636
|
expect(held.command).toBe("rescue");
|
|
3628
3637
|
});
|
|
3629
3638
|
|
|
3639
|
+
it("one-shot sync WAITS for a live holder (default), bounded by --lock-timeout, then refuses with exit 17", async () => {
|
|
3640
|
+
const lp = writeLiveHolder("rescue");
|
|
3641
|
+
const errs: string[] = [];
|
|
3642
|
+
const spy = vi
|
|
3643
|
+
.spyOn(process.stderr, "write")
|
|
3644
|
+
.mockImplementation((chunk: string | Uint8Array) => {
|
|
3645
|
+
errs.push(String(chunk));
|
|
3646
|
+
return true;
|
|
3647
|
+
});
|
|
3648
|
+
|
|
3649
|
+
// The holder never releases here; --lock-timeout 1 bounds the wait so the
|
|
3650
|
+
// runner waits ~1s, emits a single "Waiting for …" status line, then
|
|
3651
|
+
// refuses (exit 17) — proving the runner threads the flag through and
|
|
3652
|
+
// takes the wait path (rather than refusing instantly). It still never
|
|
3653
|
+
// enters runRunner, so no network/auth is touched.
|
|
3654
|
+
const start = Date.now();
|
|
3655
|
+
const code = await runRunnerWithLoop([
|
|
3656
|
+
"--companies",
|
|
3657
|
+
"--hq-root",
|
|
3658
|
+
HQ,
|
|
3659
|
+
"--lock-timeout",
|
|
3660
|
+
"1",
|
|
3661
|
+
]);
|
|
3662
|
+
const elapsed = Date.now() - start;
|
|
3663
|
+
|
|
3664
|
+
spy.mockRestore();
|
|
3665
|
+
expect(code).toBe(OPERATION_LOCKED_EXIT);
|
|
3666
|
+
expect(elapsed).toBeGreaterThanOrEqual(800); // actually waited ~1s
|
|
3667
|
+
expect(errs.join("")).toContain("Waiting for"); // status line emitted once
|
|
3668
|
+
expect(errs.join("")).toContain("rescue"); // names the holder
|
|
3669
|
+
const held = JSON.parse(fs.readFileSync(lp, "utf8"));
|
|
3670
|
+
expect(held.pid).toBe(1); // holder untouched
|
|
3671
|
+
}, 20_000);
|
|
3672
|
+
|
|
3673
|
+
it("DEV-1772: one-shot waits out a SHORT-LIVED holder (reindex) then PROCEEDS to sync", async () => {
|
|
3674
|
+
// The exact reported scenario (feedback_28a1833f / DEV-1772): a frequent
|
|
3675
|
+
// ~1-min `reindex` briefly holds the lock; the instant-sync one-shot used
|
|
3676
|
+
// to exit 17 and silently die. Now it WAITS (default) and proceeds the
|
|
3677
|
+
// moment the short holder releases. We model the short holder with a
|
|
3678
|
+
// foreign live pid (1) whose lock file is removed shortly after, and inject
|
|
3679
|
+
// runPass so "proceeds → syncs" is observable without the network.
|
|
3680
|
+
writeLiveHolder("reindex");
|
|
3681
|
+
const errs: string[] = [];
|
|
3682
|
+
const spy = vi
|
|
3683
|
+
.spyOn(process.stderr, "write")
|
|
3684
|
+
.mockImplementation((chunk: string | Uint8Array) => {
|
|
3685
|
+
errs.push(String(chunk));
|
|
3686
|
+
return true;
|
|
3687
|
+
});
|
|
3688
|
+
|
|
3689
|
+
const runPass = vi.fn().mockResolvedValue(0);
|
|
3690
|
+
// The short-lived holder releases ~150ms in.
|
|
3691
|
+
setTimeout(() => fs.rmSync(lockPathFor(HQ), { force: true }), 150);
|
|
3692
|
+
|
|
3693
|
+
const code = await runRunnerWithLoop(
|
|
3694
|
+
["--companies", "--hq-root", HQ],
|
|
3695
|
+
{ runPass },
|
|
3696
|
+
);
|
|
3697
|
+
|
|
3698
|
+
spy.mockRestore();
|
|
3699
|
+
// It did NOT refuse/die — it waited then ran the sync pass exactly once.
|
|
3700
|
+
expect(code).toBe(0);
|
|
3701
|
+
expect(code).not.toBe(OPERATION_LOCKED_EXIT);
|
|
3702
|
+
expect(runPass).toHaveBeenCalledTimes(1);
|
|
3703
|
+
// A single "Waiting for …" status line named the short holder.
|
|
3704
|
+
const out = errs.join("");
|
|
3705
|
+
expect(out).toContain("Waiting for");
|
|
3706
|
+
expect(out).toContain("reindex");
|
|
3707
|
+
}, 20_000);
|
|
3708
|
+
|
|
3630
3709
|
it("the watch runner is EXEMPT — runs despite a held lock and never takes it", async () => {
|
|
3631
3710
|
const lp = writeLiveHolder("sync");
|
|
3632
3711
|
const watcher = makeWatcherStub();
|
package/src/bin/sync-runner.ts
CHANGED
|
@@ -607,6 +607,13 @@ interface ParsedArgs {
|
|
|
607
607
|
* mode (single-company runs never visit the personal target).
|
|
608
608
|
*/
|
|
609
609
|
skipPersonal: boolean;
|
|
610
|
+
/**
|
|
611
|
+
* Bounded wait (seconds) for the per-root operation lock when another op is
|
|
612
|
+
* already running. `0` → refuse immediately (pre-wait behavior); omitted →
|
|
613
|
+
* inherit `HQ_OP_LOCK_TIMEOUT` / infinite wait. Only meaningful on the
|
|
614
|
+
* one-shot path (the `--watch` runner is lock-exempt).
|
|
615
|
+
*/
|
|
616
|
+
lockTimeoutSec?: number;
|
|
610
617
|
}
|
|
611
618
|
|
|
612
619
|
function parseArgs(argv: string[]): ParsedArgs | { error: string } {
|
|
@@ -620,6 +627,7 @@ function parseArgs(argv: string[]): ParsedArgs | { error: string } {
|
|
|
620
627
|
let pollRemoteMs: number | undefined;
|
|
621
628
|
let skipPersonal = false;
|
|
622
629
|
let eventPush = false;
|
|
630
|
+
let lockTimeoutSec: number | undefined;
|
|
623
631
|
|
|
624
632
|
for (let i = 0; i < argv.length; i++) {
|
|
625
633
|
const arg = argv[i];
|
|
@@ -693,6 +701,18 @@ function parseArgs(argv: string[]): ParsedArgs | { error: string } {
|
|
|
693
701
|
// @getindigo.ai identities for the first release.
|
|
694
702
|
eventPush = true;
|
|
695
703
|
break;
|
|
704
|
+
case "--lock-timeout": {
|
|
705
|
+
const val = argv[++i];
|
|
706
|
+
if (!val) return { error: "--lock-timeout requires a value (seconds)" };
|
|
707
|
+
const n = Number(val);
|
|
708
|
+
if (!Number.isInteger(n) || n < 0) {
|
|
709
|
+
return {
|
|
710
|
+
error: `--lock-timeout must be a non-negative integer (seconds), got: ${val}`,
|
|
711
|
+
};
|
|
712
|
+
}
|
|
713
|
+
lockTimeoutSec = n;
|
|
714
|
+
break;
|
|
715
|
+
}
|
|
696
716
|
default:
|
|
697
717
|
return { error: `Unknown argument: ${arg}` };
|
|
698
718
|
}
|
|
@@ -732,6 +752,7 @@ function parseArgs(argv: string[]): ParsedArgs | { error: string } {
|
|
|
732
752
|
pollRemoteMs,
|
|
733
753
|
skipPersonal,
|
|
734
754
|
eventPush,
|
|
755
|
+
lockTimeoutSec,
|
|
735
756
|
};
|
|
736
757
|
}
|
|
737
758
|
|
|
@@ -1855,10 +1876,27 @@ export async function runRunnerWithLoop(
|
|
|
1855
1876
|
// surfaces the parse error rather than us masking it with a lock failure.
|
|
1856
1877
|
const parsed = parseArgs(argv);
|
|
1857
1878
|
if ("error" in parsed) return runRunner(argv);
|
|
1879
|
+
// The actual sync pass — same seam the watch loop uses (deps.runPass),
|
|
1880
|
+
// so a test can assert "waits for a short-lived holder, THEN proceeds to
|
|
1881
|
+
// sync" without touching the network. Production passes `argv` to
|
|
1882
|
+
// runRunner exactly as before. Regression guard for DEV-1772
|
|
1883
|
+
// (feedback_28a1833f): instant-sync one-shots used to exit 17 and die on
|
|
1884
|
+
// a lock conflict with the ~1-min reindex hook; they now WAIT (default)
|
|
1885
|
+
// and proceed once the short holder releases.
|
|
1886
|
+
const runOnce = deps.runPass ?? ((passArgv: string[]) => runRunner(passArgv));
|
|
1858
1887
|
try {
|
|
1859
|
-
return await withOperationLock(parsed.hqRoot, "sync", () =>
|
|
1888
|
+
return await withOperationLock(parsed.hqRoot, "sync", () => runOnce(argv), {
|
|
1889
|
+
timeoutSec: parsed.lockTimeoutSec,
|
|
1890
|
+
});
|
|
1860
1891
|
} catch (err) {
|
|
1861
1892
|
if (err instanceof OperationLockedError) {
|
|
1893
|
+
// The lock wait was BOUNDED and tripped (a holder never released
|
|
1894
|
+
// within --lock-timeout / HQ_OP_LOCK_TIMEOUT). Surface it loudly on
|
|
1895
|
+
// stderr and exit with the stable OPERATION_LOCKED_EXIT (17) so the
|
|
1896
|
+
// spawner (menubar) can recognize a lock conflict and SCHEDULE A
|
|
1897
|
+
// RETRY rather than treating it as a hard failure and silently giving
|
|
1898
|
+
// up (DEV-1772). With the default (no bound) we never reach here — the
|
|
1899
|
+
// one-shot waits indefinitely and proceeds.
|
|
1862
1900
|
process.stderr.write(err.message + "\n");
|
|
1863
1901
|
return OPERATION_LOCKED_EXIT;
|
|
1864
1902
|
}
|
package/src/cli/reindex.test.ts
CHANGED
|
@@ -135,9 +135,10 @@ describe("reindex", () => {
|
|
|
135
135
|
|
|
136
136
|
// ── operation lock (mutual exclusion with sync/rescue) ──────────────────
|
|
137
137
|
|
|
138
|
-
it("refuses (OPERATION_LOCKED_EXIT) when another op holds this root's lock", () => {
|
|
139
|
-
// A live holder in another process (pid 1) →
|
|
140
|
-
//
|
|
138
|
+
it("refuses (OPERATION_LOCKED_EXIT) when another op holds this root's lock (lockTimeoutSec:0)", () => {
|
|
139
|
+
// A live holder in another process (pid 1) → with lockTimeoutSec:0 reindex
|
|
140
|
+
// refuses immediately and does no work. (The new default is to WAIT; the
|
|
141
|
+
// wait-then-acquire path is covered in operation-lock.test.ts.)
|
|
141
142
|
const lp = lockPathFor(root);
|
|
142
143
|
fs.mkdirSync(path.dirname(lp), { recursive: true });
|
|
143
144
|
fs.writeFileSync(
|
|
@@ -145,7 +146,7 @@ describe("reindex", () => {
|
|
|
145
146
|
JSON.stringify({ pid: 1, command: "sync", startedAt: new Date(0).toISOString(), hqRoot: root }),
|
|
146
147
|
);
|
|
147
148
|
const before = fs.existsSync(path.join(root, ".claude/skills"));
|
|
148
|
-
const { status } = reindex({ repoRoot: root });
|
|
149
|
+
const { status } = reindex({ repoRoot: root, lockTimeoutSec: 0 });
|
|
149
150
|
expect(status).toBe(OPERATION_LOCKED_EXIT);
|
|
150
151
|
// It refused before doing any work (didn't create .claude/skills).
|
|
151
152
|
expect(fs.existsSync(path.join(root, ".claude/skills"))).toBe(before);
|
package/src/cli/reindex.ts
CHANGED
|
@@ -35,6 +35,17 @@ export interface ReindexOptions {
|
|
|
35
35
|
* exclusive with a running sync/rescue.
|
|
36
36
|
*/
|
|
37
37
|
skipLock?: boolean;
|
|
38
|
+
/**
|
|
39
|
+
* Bounded wait (seconds) for the per-root operation lock when a sync/rescue
|
|
40
|
+
* is already running. `0` → refuse immediately; omitted → inherit
|
|
41
|
+
* `HQ_OP_LOCK_TIMEOUT` / infinite wait. Ignored when `skipLock` is set.
|
|
42
|
+
*
|
|
43
|
+
* NB: standalone `hq reindex` waits by default, which is what a human running
|
|
44
|
+
* it wants. The hq-core reindex HOOK (Stop / PostToolUse) should set a small
|
|
45
|
+
* `HQ_OP_LOCK_TIMEOUT` (or `0`) so a hook fired mid-sync never blocks the
|
|
46
|
+
* interactive agent indefinitely.
|
|
47
|
+
*/
|
|
48
|
+
lockTimeoutSec?: number;
|
|
38
49
|
}
|
|
39
50
|
|
|
40
51
|
export interface ReindexResult {
|
|
@@ -151,7 +162,7 @@ export function reindex(opts: ReindexOptions = {}): ReindexResult {
|
|
|
151
162
|
let opLock: LockHandle | null = null;
|
|
152
163
|
if (!opts.skipLock) {
|
|
153
164
|
try {
|
|
154
|
-
opLock = acquireOperationLock(root, "reindex");
|
|
165
|
+
opLock = acquireOperationLock(root, "reindex", { timeoutSec: opts.lockTimeoutSec });
|
|
155
166
|
} catch (err) {
|
|
156
167
|
if (err instanceof OperationLockedError) {
|
|
157
168
|
warn(err.message);
|
package/src/cli/rescue.test.ts
CHANGED
|
@@ -1,5 +1,47 @@
|
|
|
1
1
|
import { describe, it, expect } from "vitest";
|
|
2
|
-
import { buildRescueArgs } from "./rescue.js";
|
|
2
|
+
import { buildRescueArgs, extractLockTimeout } from "./rescue.js";
|
|
3
|
+
|
|
4
|
+
describe("extractLockTimeout", () => {
|
|
5
|
+
it("returns the explicit option untouched when extraArgs is empty/absent", () => {
|
|
6
|
+
expect(extractLockTimeout({ lockTimeoutSec: 5 })).toEqual({
|
|
7
|
+
lockTimeoutSec: 5,
|
|
8
|
+
cleanedExtraArgs: undefined,
|
|
9
|
+
});
|
|
10
|
+
expect(extractLockTimeout({ lockTimeoutSec: 5, extraArgs: [] })).toEqual({
|
|
11
|
+
lockTimeoutSec: 5,
|
|
12
|
+
cleanedExtraArgs: [],
|
|
13
|
+
});
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
it("pulls --lock-timeout <secs> out of extraArgs and strips it from the forwarded args", () => {
|
|
17
|
+
const r = extractLockTimeout({
|
|
18
|
+
extraArgs: ["--lock-timeout", "30", "--dry-run", "--yes"],
|
|
19
|
+
});
|
|
20
|
+
expect(r.lockTimeoutSec).toBe(30);
|
|
21
|
+
// The flag + its value never reach the rescue script.
|
|
22
|
+
expect(r.cleanedExtraArgs).toEqual(["--dry-run", "--yes"]);
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
it("honors --lock-timeout 0 (refuse immediately)", () => {
|
|
26
|
+
const r = extractLockTimeout({ extraArgs: ["--lock-timeout", "0"] });
|
|
27
|
+
expect(r.lockTimeoutSec).toBe(0);
|
|
28
|
+
expect(r.cleanedExtraArgs).toEqual([]);
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
it("explicit option wins over a flag in extraArgs", () => {
|
|
32
|
+
const r = extractLockTimeout({
|
|
33
|
+
lockTimeoutSec: 0,
|
|
34
|
+
extraArgs: ["--lock-timeout", "60"],
|
|
35
|
+
});
|
|
36
|
+
expect(r.lockTimeoutSec).toBe(0);
|
|
37
|
+
expect(r.cleanedExtraArgs).toEqual([]);
|
|
38
|
+
});
|
|
39
|
+
|
|
40
|
+
it("ignores a non-numeric / negative --lock-timeout value (treated as unset)", () => {
|
|
41
|
+
expect(extractLockTimeout({ extraArgs: ["--lock-timeout", "abc"] }).lockTimeoutSec).toBeUndefined();
|
|
42
|
+
expect(extractLockTimeout({ extraArgs: ["--lock-timeout", "-3"] }).lockTimeoutSec).toBeUndefined();
|
|
43
|
+
});
|
|
44
|
+
});
|
|
3
45
|
|
|
4
46
|
describe("buildRescueArgs", () => {
|
|
5
47
|
it("emits no args for an empty option set (script defaults apply)", () => {
|
package/src/cli/rescue.ts
CHANGED
|
@@ -57,10 +57,52 @@ export interface RescueOptions {
|
|
|
57
57
|
/** GitHub token forwarded to the script as `GH_TOKEN` (avoids the
|
|
58
58
|
* anonymous-clone rate limit; required for private sources). */
|
|
59
59
|
ghToken?: string;
|
|
60
|
+
/**
|
|
61
|
+
* Bounded wait (seconds) for the per-root operation lock when a sync/reindex
|
|
62
|
+
* is already running. `0` → refuse immediately; omitted → inherit
|
|
63
|
+
* `HQ_OP_LOCK_TIMEOUT` / infinite wait. Also accepted as `--lock-timeout
|
|
64
|
+
* <secs>` inside {@link extraArgs} (the machine `hq-rescue` entrypoint
|
|
65
|
+
* forwards raw argv) — it is consumed here and never passed to the rescue
|
|
66
|
+
* script, which doesn't understand it.
|
|
67
|
+
*/
|
|
68
|
+
lockTimeoutSec?: number;
|
|
60
69
|
/** Escape hatch — additional raw args appended verbatim. */
|
|
61
70
|
extraArgs?: string[];
|
|
62
71
|
}
|
|
63
72
|
|
|
73
|
+
/**
 * Pull `--lock-timeout <secs>` out of `extraArgs` (the raw argv the machine
 * `hq-rescue` entrypoint forwards) so it controls the operation-lock wait
 * instead of leaking to the rescue script.
 *
 * Parsing rules:
 * - The token following `--lock-timeout` is consumed as its value and removed
 *   from the forwarded args, UNLESS it is missing or is itself another option
 *   (starts with `-` and is not a number). In that case only the flag is
 *   dropped, so an unrelated option such as `--dry-run` is never swallowed.
 * - The value is accepted only when it is a non-negative integer. Empty or
 *   whitespace-only values are rejected explicitly because `Number("")` is
 *   `0`, which would otherwise be read as "refuse immediately".
 * - An invalid value leaves the timeout unset (same as omitting the flag).
 * - If the flag is repeated, the last valid value wins.
 *
 * @param opts Rescue options; only `lockTimeoutSec` and `extraArgs` are read.
 * @returns The resolved timeout (an explicit `opts.lockTimeoutSec` wins over
 *   the parsed flag) and a copy of `extraArgs` with the flag removed. When
 *   `extraArgs` is absent or empty it is returned as-is (not copied).
 */
export function extractLockTimeout(opts: RescueOptions): {
  lockTimeoutSec: number | undefined;
  cleanedExtraArgs: string[] | undefined;
} {
  const raw = opts.extraArgs;
  if (!raw || raw.length === 0) {
    return { lockTimeoutSec: opts.lockTimeoutSec, cleanedExtraArgs: raw };
  }
  const cleaned: string[] = [];
  let fromArgs: number | undefined;
  for (let i = 0; i < raw.length; i++) {
    const arg = raw[i];
    if (arg !== "--lock-timeout") {
      cleaned.push(arg);
      continue;
    }
    const next = raw[i + 1];
    // Flag was the last token: nothing to consume, just drop the flag.
    if (next === undefined) continue;
    // Number("") / Number("  ") are 0 — treat blank values as invalid instead.
    const parsed = next.trim() === "" ? NaN : Number(next);
    // The next token is another option (e.g. `--dry-run`), not a value: leave
    // it in place so it still reaches the rescue script. Negative numbers
    // such as `-3` are still consumed (and then rejected as out of range).
    if (next.startsWith("-") && Number.isNaN(parsed)) continue;
    if (Number.isInteger(parsed) && parsed >= 0) fromArgs = parsed;
    i++; // skip the value too
  }
  // Explicit option beats the parsed flag.
  return {
    lockTimeoutSec: opts.lockTimeoutSec ?? fromArgs,
    cleanedExtraArgs: cleaned,
  };
}
|
|
105
|
+
|
|
64
106
|
export interface RescueResult {
|
|
65
107
|
/** Exit status of the underlying script (0 = success). */
|
|
66
108
|
status: number;
|
|
@@ -100,9 +142,13 @@ export function rescue(opts: RescueOptions = {}): RescueResult {
|
|
|
100
142
|
// lock on the same root the rescue script resolves (cwd when --hq-root is
|
|
101
143
|
// omitted). Rescue is never the exempt push watcher, so it always locks.
|
|
102
144
|
const lockRoot = opts.hqRoot ?? process.cwd();
|
|
145
|
+
// Consume --lock-timeout from extraArgs (machine entrypoint) before it can
|
|
146
|
+
// reach the rescue script, which doesn't understand it.
|
|
147
|
+
const { lockTimeoutSec, cleanedExtraArgs } = extractLockTimeout(opts);
|
|
148
|
+
const rescueOpts: RescueOptions = { ...opts, extraArgs: cleanedExtraArgs };
|
|
103
149
|
let handle;
|
|
104
150
|
try {
|
|
105
|
-
handle = acquireOperationLock(lockRoot, "rescue");
|
|
151
|
+
handle = acquireOperationLock(lockRoot, "rescue", { timeoutSec: lockTimeoutSec });
|
|
106
152
|
} catch (err) {
|
|
107
153
|
if (err instanceof OperationLockedError) {
|
|
108
154
|
process.stderr.write(err.message + "\n");
|
|
@@ -111,7 +157,7 @@ export function rescue(opts: RescueOptions = {}): RescueResult {
|
|
|
111
157
|
throw err;
|
|
112
158
|
}
|
|
113
159
|
try {
|
|
114
|
-
const args = buildRescueArgs(
|
|
160
|
+
const args = buildRescueArgs(rescueOpts);
|
|
115
161
|
const env: NodeJS.ProcessEnv = { ...process.env };
|
|
116
162
|
if (opts.ghToken) env.GH_TOKEN = opts.ghToken;
|
|
117
163
|
const { status } = runRescue(args, { env });
|
|
@@ -9,10 +9,13 @@ import * as os from "os";
|
|
|
9
9
|
import * as path from "path";
|
|
10
10
|
import {
|
|
11
11
|
acquireOperationLock,
|
|
12
|
+
acquireOperationLockAsync,
|
|
13
|
+
withOperationLock,
|
|
12
14
|
withOperationLockSync,
|
|
13
15
|
lockPathFor,
|
|
14
16
|
OperationLockedError,
|
|
15
17
|
OPERATION_LOCKED_EXIT,
|
|
18
|
+
DEFAULT_LOCK_POLL_MS,
|
|
16
19
|
type LockInfo,
|
|
17
20
|
} from "./operation-lock.js";
|
|
18
21
|
|
|
@@ -44,6 +47,7 @@ describe("operation-lock", () => {
|
|
|
44
47
|
stateDir = fs.mkdtempSync(path.join(os.tmpdir(), "hq-oplock-state-"));
|
|
45
48
|
process.env.HQ_STATE_DIR = stateDir;
|
|
46
49
|
delete process.env.HQ_DISABLE_OP_LOCK;
|
|
50
|
+
delete process.env.HQ_OP_LOCK_TIMEOUT;
|
|
47
51
|
rootA = fs.mkdtempSync(path.join(os.tmpdir(), "hq-rootA-"));
|
|
48
52
|
rootB = fs.mkdtempSync(path.join(os.tmpdir(), "hq-rootB-"));
|
|
49
53
|
});
|
|
@@ -54,6 +58,7 @@ describe("operation-lock", () => {
|
|
|
54
58
|
fs.rmSync(rootB, { recursive: true, force: true });
|
|
55
59
|
delete process.env.HQ_STATE_DIR;
|
|
56
60
|
delete process.env.HQ_DISABLE_OP_LOCK;
|
|
61
|
+
delete process.env.HQ_OP_LOCK_TIMEOUT;
|
|
57
62
|
});
|
|
58
63
|
|
|
59
64
|
it("the lock path is under the state dir, keyed per canonical root", () => {
|
|
@@ -75,14 +80,17 @@ describe("operation-lock", () => {
|
|
|
75
80
|
expect(fs.existsSync(h.path)).toBe(false);
|
|
76
81
|
});
|
|
77
82
|
|
|
78
|
-
it("refuses
|
|
83
|
+
it("refuses immediately (wait:false) with the holder's command + pid when a LIVE process holds it", () => {
|
|
79
84
|
// Simulate a DIFFERENT live process holding the lock. PID 1 (init/systemd)
|
|
80
85
|
// is always alive and is never our own pid, so kill(1,0) reports alive and
|
|
81
|
-
// the same-process reclaim path does not apply.
|
|
86
|
+
// the same-process reclaim path does not apply. `wait:false` keeps the old
|
|
87
|
+
// refuse-immediately behavior (the default is now to WAIT).
|
|
82
88
|
writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
|
|
83
|
-
expect(() => acquireOperationLock(rootA, "sync")).toThrowError(
|
|
89
|
+
expect(() => acquireOperationLock(rootA, "sync", { wait: false })).toThrowError(
|
|
90
|
+
OperationLockedError,
|
|
91
|
+
);
|
|
84
92
|
try {
|
|
85
|
-
acquireOperationLock(rootA, "sync");
|
|
93
|
+
acquireOperationLock(rootA, "sync", { wait: false });
|
|
86
94
|
} catch (e) {
|
|
87
95
|
const err = e as OperationLockedError;
|
|
88
96
|
expect(err.holder.command).toBe("rescue");
|
|
@@ -92,11 +100,124 @@ describe("operation-lock", () => {
|
|
|
92
100
|
}
|
|
93
101
|
});
|
|
94
102
|
|
|
95
|
-
it("
|
|
103
|
+
it("timeoutSec:0 refuses immediately (no wait) — equivalent to wait:false", () => {
|
|
104
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "sync" });
|
|
105
|
+
const start = Date.now();
|
|
106
|
+
expect(() => acquireOperationLock(rootA, "reindex", { timeoutSec: 0 })).toThrowError(
|
|
107
|
+
OperationLockedError,
|
|
108
|
+
);
|
|
109
|
+
// Did not actually sleep.
|
|
110
|
+
expect(Date.now() - start).toBeLessThan(DEFAULT_LOCK_POLL_MS);
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
it("a bounded timeoutSec waits, then refuses with the old message + exit code", () => {
|
|
114
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
|
|
115
|
+
const start = Date.now();
|
|
116
|
+
let thrown: unknown;
|
|
117
|
+
try {
|
|
118
|
+
// 150ms bound, 40ms poll → waits ~150ms then gives up. Suppress the
|
|
119
|
+
// stderr status line with a no-op onWaitStart.
|
|
120
|
+
acquireOperationLock(rootA, "sync", {
|
|
121
|
+
timeoutSec: 0.15,
|
|
122
|
+
pollIntervalMs: 40,
|
|
123
|
+
onWaitStart: () => {},
|
|
124
|
+
});
|
|
125
|
+
} catch (e) {
|
|
126
|
+
thrown = e;
|
|
127
|
+
}
|
|
128
|
+
const elapsed = Date.now() - start;
|
|
129
|
+
expect(thrown).toBeInstanceOf(OperationLockedError);
|
|
130
|
+
expect((thrown as OperationLockedError).message).toContain("rescue");
|
|
131
|
+
expect(OPERATION_LOCKED_EXIT).toBe(17);
|
|
132
|
+
// It actually waited (didn't refuse instantly) but didn't hang forever.
|
|
133
|
+
expect(elapsed).toBeGreaterThanOrEqual(120);
|
|
134
|
+
expect(elapsed).toBeLessThan(3000);
|
|
135
|
+
});
|
|
136
|
+
|
|
137
|
+
it("HQ_OP_LOCK_TIMEOUT env bounds the wait when no explicit option is given", () => {
|
|
138
|
+
process.env.HQ_OP_LOCK_TIMEOUT = "0"; // 0 → refuse immediately
|
|
139
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "sync" });
|
|
140
|
+
const start = Date.now();
|
|
141
|
+
expect(() =>
|
|
142
|
+
acquireOperationLock(rootA, "rescue", { onWaitStart: () => {} }),
|
|
143
|
+
).toThrowError(OperationLockedError);
|
|
144
|
+
expect(Date.now() - start).toBeLessThan(DEFAULT_LOCK_POLL_MS);
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
it("an explicit timeoutSec overrides the HQ_OP_LOCK_TIMEOUT env", () => {
|
|
148
|
+
process.env.HQ_OP_LOCK_TIMEOUT = "9999"; // would be a near-infinite wait
|
|
149
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "sync" });
|
|
150
|
+
// The explicit 0 wins → refuse immediately rather than honoring the env.
|
|
151
|
+
expect(() =>
|
|
152
|
+
acquireOperationLock(rootA, "rescue", { timeoutSec: 0 }),
|
|
153
|
+
).toThrowError(OperationLockedError);
|
|
154
|
+
});
|
|
155
|
+
|
|
156
|
+
it("onWaitStart fires exactly once, naming the holder, even across many polls", () => {
|
|
157
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
|
|
158
|
+
const calls: Array<{ cmd: string; attempted: string }> = [];
|
|
159
|
+
expect(() =>
|
|
160
|
+
acquireOperationLock(rootA, "sync", {
|
|
161
|
+
timeoutSec: 0.16,
|
|
162
|
+
pollIntervalMs: 30, // ~5 polls within the window
|
|
163
|
+
onWaitStart: (holder, attempted) => calls.push({ cmd: holder.command, attempted }),
|
|
164
|
+
}),
|
|
165
|
+
).toThrowError(OperationLockedError);
|
|
166
|
+
expect(calls).toHaveLength(1);
|
|
167
|
+
expect(calls[0]).toEqual({ cmd: "rescue", attempted: "sync" });
|
|
168
|
+
});
|
|
169
|
+
|
|
170
|
+
it("a waiter acquires the lock the moment the holder releases (async poll path)", async () => {
|
|
171
|
+
const p = lockPathFor(rootA);
|
|
172
|
+
// A foreign LIVE holder (pid 1) initially owns the lock.
|
|
173
|
+
writeLock(p, { pid: 1, command: "rescue" });
|
|
174
|
+
// Simulate the holder finishing ~80ms in by removing its lock file.
|
|
175
|
+
const release = setTimeout(() => fs.rmSync(p, { force: true }), 80);
|
|
176
|
+
const start = Date.now();
|
|
177
|
+
const h = await acquireOperationLockAsync(rootA, "sync", {
|
|
178
|
+
pollIntervalMs: 20,
|
|
179
|
+
onWaitStart: () => {},
|
|
180
|
+
});
|
|
181
|
+
clearTimeout(release);
|
|
182
|
+
const elapsed = Date.now() - start;
|
|
183
|
+
// We waited for the release, then took it over.
|
|
184
|
+
expect(elapsed).toBeGreaterThanOrEqual(60);
|
|
185
|
+
const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
|
|
186
|
+
expect(info.pid).toBe(process.pid);
|
|
187
|
+
expect(info.command).toBe("sync");
|
|
188
|
+
h.release();
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
it("multiple foreign holders in a row: each release lets the next waiter in (no FIFO guarantee)", async () => {
|
|
192
|
+
// The mutex is CROSS-PROCESS: it keys liveness on the holder's PID. Two
|
|
193
|
+
// waiters in the SAME process share a pid, so the same-process reclaim path
|
|
194
|
+
// would let them stomp each other — that scenario is unsupported by design.
|
|
195
|
+
// Here we model the real case: a sequence of FOREIGN holders (pid 1) that
|
|
196
|
+
// each release, with a single waiter acquiring the instant the lock frees.
|
|
197
|
+
// Order among multiple distinct-process waiters is whoever wins the next
|
|
198
|
+
// O_EXCL race after a free — best-effort, NOT FIFO (documented).
|
|
199
|
+
const p = lockPathFor(rootA);
|
|
200
|
+
writeLock(p, { pid: 1, command: "sync" });
|
|
201
|
+
// Free it shortly; the waiter should grab it right after.
|
|
202
|
+
setTimeout(() => fs.rmSync(p, { force: true }), 50);
|
|
203
|
+
const h = await acquireOperationLockAsync(rootA, "reindex", {
|
|
204
|
+
pollIntervalMs: 15,
|
|
205
|
+
onWaitStart: () => {},
|
|
206
|
+
});
|
|
207
|
+
const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
|
|
208
|
+
expect(info.command).toBe("reindex");
|
|
209
|
+
expect(info.pid).toBe(process.pid);
|
|
210
|
+
h.release();
|
|
211
|
+
});
|
|
212
|
+
|
|
213
|
+
it("reclaims a stale lock whose holder PID is dead (takeover, never waits)", () => {
|
|
96
214
|
const stale = deadPid();
|
|
97
215
|
writeLock(lockPathFor(rootA), { pid: stale, command: "sync" });
|
|
98
|
-
|
|
216
|
+
const start = Date.now();
|
|
217
|
+
// The dead holder must not block us — even with an infinite default wait,
|
|
218
|
+
// takeover is immediate.
|
|
99
219
|
const h = acquireOperationLock(rootA, "rescue");
|
|
220
|
+
expect(Date.now() - start).toBeLessThan(DEFAULT_LOCK_POLL_MS);
|
|
100
221
|
const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
|
|
101
222
|
expect(info.pid).toBe(process.pid); // we took it over
|
|
102
223
|
expect(info.command).toBe("rescue");
|
|
@@ -114,7 +235,7 @@ describe("operation-lock", () => {
|
|
|
114
235
|
|
|
115
236
|
it("different HQ roots are independent — both may hold concurrently", () => {
|
|
116
237
|
const a = acquireOperationLock(rootA, "sync");
|
|
117
|
-
const b = acquireOperationLock(rootB, "rescue"); // must NOT
|
|
238
|
+
const b = acquireOperationLock(rootB, "rescue"); // must NOT block
|
|
118
239
|
expect(fs.existsSync(a.path)).toBe(true);
|
|
119
240
|
expect(fs.existsSync(b.path)).toBe(true);
|
|
120
241
|
expect(a.path).not.toBe(b.path);
|
|
@@ -126,9 +247,14 @@ describe("operation-lock", () => {
|
|
|
126
247
|
// A live sync in ANOTHER process holds the root (pid 1 stands in for it).
|
|
127
248
|
const p = lockPathFor(rootA);
|
|
128
249
|
writeLock(p, { pid: 1, command: "sync" });
|
|
129
|
-
// Neither rescue nor reindex may acquire while that sync holds it
|
|
130
|
-
|
|
131
|
-
expect(() => acquireOperationLock(rootA, "
|
|
250
|
+
// Neither rescue nor reindex may acquire while that sync holds it
|
|
251
|
+
// (wait:false → assert the refusal without hanging on the new wait default).
|
|
252
|
+
expect(() => acquireOperationLock(rootA, "rescue", { wait: false })).toThrowError(
|
|
253
|
+
OperationLockedError,
|
|
254
|
+
);
|
|
255
|
+
expect(() => acquireOperationLock(rootA, "reindex", { wait: false })).toThrowError(
|
|
256
|
+
OperationLockedError,
|
|
257
|
+
);
|
|
132
258
|
// Once that sync finishes (its lock is gone), the next command acquires.
|
|
133
259
|
fs.unlinkSync(p);
|
|
134
260
|
const h2 = acquireOperationLock(rootA, "reindex");
|
|
@@ -147,6 +273,17 @@ describe("operation-lock", () => {
|
|
|
147
273
|
expect(fs.existsSync(p)).toBe(false); // released on the way out
|
|
148
274
|
});
|
|
149
275
|
|
|
276
|
+
it("withOperationLock (async) releases even when the body throws", async () => {
|
|
277
|
+
const p = lockPathFor(rootA);
|
|
278
|
+
await expect(
|
|
279
|
+
withOperationLock(rootA, "sync", async () => {
|
|
280
|
+
expect(fs.existsSync(p)).toBe(true);
|
|
281
|
+
throw new Error("boom");
|
|
282
|
+
}),
|
|
283
|
+
).rejects.toThrow("boom");
|
|
284
|
+
expect(fs.existsSync(p)).toBe(false);
|
|
285
|
+
});
|
|
286
|
+
|
|
150
287
|
it("HQ_DISABLE_OP_LOCK=1 makes acquisition a no-op", () => {
|
|
151
288
|
process.env.HQ_DISABLE_OP_LOCK = "1";
|
|
152
289
|
// Even with a live holder on record, the escape hatch acquires without error.
|