@indigoai-us/hq-cloud 6.7.0 → 6.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/sync-runner.d.ts.map +1 -1
- package/dist/bin/sync-runner.js +33 -1
- package/dist/bin/sync-runner.js.map +1 -1
- package/dist/bin/sync-runner.test.js +73 -4
- package/dist/bin/sync-runner.test.js.map +1 -1
- package/dist/cli/reindex.d.ts +11 -0
- package/dist/cli/reindex.d.ts.map +1 -1
- package/dist/cli/reindex.js +1 -1
- package/dist/cli/reindex.js.map +1 -1
- package/dist/cli/reindex.test.js +5 -4
- package/dist/cli/reindex.test.js.map +1 -1
- package/dist/cli/rescue.d.ts +20 -0
- package/dist/cli/rescue.d.ts.map +1 -1
- package/dist/cli/rescue.js +36 -2
- package/dist/cli/rescue.js.map +1 -1
- package/dist/cli/rescue.test.js +38 -1
- package/dist/cli/rescue.test.js.map +1 -1
- package/dist/cognito-auth.d.ts +6 -1
- package/dist/cognito-auth.d.ts.map +1 -1
- package/dist/cognito-auth.js +22 -5
- package/dist/cognito-auth.js.map +1 -1
- package/dist/machine-auth.test.js +35 -2
- package/dist/machine-auth.test.js.map +1 -1
- package/dist/operation-lock.d.ts +81 -10
- package/dist/operation-lock.d.ts.map +1 -1
- package/dist/operation-lock.js +177 -27
- package/dist/operation-lock.js.map +1 -1
- package/dist/operation-lock.test.js +122 -11
- package/dist/operation-lock.test.js.map +1 -1
- package/package.json +1 -1
- package/src/bin/sync-runner.test.ts +83 -4
- package/src/bin/sync-runner.ts +39 -1
- package/src/cli/reindex.test.ts +5 -4
- package/src/cli/reindex.ts +12 -1
- package/src/cli/rescue.test.ts +43 -1
- package/src/cli/rescue.ts +48 -2
- package/src/cognito-auth.ts +30 -6
- package/src/machine-auth.test.ts +37 -2
- package/src/operation-lock.test.ts +147 -10
- package/src/operation-lock.ts +234 -26
|
@@ -9,10 +9,13 @@ import * as os from "os";
|
|
|
9
9
|
import * as path from "path";
|
|
10
10
|
import {
|
|
11
11
|
acquireOperationLock,
|
|
12
|
+
acquireOperationLockAsync,
|
|
13
|
+
withOperationLock,
|
|
12
14
|
withOperationLockSync,
|
|
13
15
|
lockPathFor,
|
|
14
16
|
OperationLockedError,
|
|
15
17
|
OPERATION_LOCKED_EXIT,
|
|
18
|
+
DEFAULT_LOCK_POLL_MS,
|
|
16
19
|
type LockInfo,
|
|
17
20
|
} from "./operation-lock.js";
|
|
18
21
|
|
|
@@ -44,6 +47,7 @@ describe("operation-lock", () => {
|
|
|
44
47
|
stateDir = fs.mkdtempSync(path.join(os.tmpdir(), "hq-oplock-state-"));
|
|
45
48
|
process.env.HQ_STATE_DIR = stateDir;
|
|
46
49
|
delete process.env.HQ_DISABLE_OP_LOCK;
|
|
50
|
+
delete process.env.HQ_OP_LOCK_TIMEOUT;
|
|
47
51
|
rootA = fs.mkdtempSync(path.join(os.tmpdir(), "hq-rootA-"));
|
|
48
52
|
rootB = fs.mkdtempSync(path.join(os.tmpdir(), "hq-rootB-"));
|
|
49
53
|
});
|
|
@@ -54,6 +58,7 @@ describe("operation-lock", () => {
|
|
|
54
58
|
fs.rmSync(rootB, { recursive: true, force: true });
|
|
55
59
|
delete process.env.HQ_STATE_DIR;
|
|
56
60
|
delete process.env.HQ_DISABLE_OP_LOCK;
|
|
61
|
+
delete process.env.HQ_OP_LOCK_TIMEOUT;
|
|
57
62
|
});
|
|
58
63
|
|
|
59
64
|
it("the lock path is under the state dir, keyed per canonical root", () => {
|
|
@@ -75,14 +80,17 @@ describe("operation-lock", () => {
|
|
|
75
80
|
expect(fs.existsSync(h.path)).toBe(false);
|
|
76
81
|
});
|
|
77
82
|
|
|
78
|
-
it("refuses
|
|
83
|
+
it("refuses immediately (wait:false) with the holder's command + pid when a LIVE process holds it", () => {
|
|
79
84
|
// Simulate a DIFFERENT live process holding the lock. PID 1 (init/systemd)
|
|
80
85
|
// is always alive and is never our own pid, so kill(1,0) reports alive and
|
|
81
|
-
// the same-process reclaim path does not apply.
|
|
86
|
+
// the same-process reclaim path does not apply. `wait:false` keeps the old
|
|
87
|
+
// refuse-immediately behavior (the default is now to WAIT).
|
|
82
88
|
writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
|
|
83
|
-
expect(() => acquireOperationLock(rootA, "sync")).toThrowError(
|
|
89
|
+
expect(() => acquireOperationLock(rootA, "sync", { wait: false })).toThrowError(
|
|
90
|
+
OperationLockedError,
|
|
91
|
+
);
|
|
84
92
|
try {
|
|
85
|
-
acquireOperationLock(rootA, "sync");
|
|
93
|
+
acquireOperationLock(rootA, "sync", { wait: false });
|
|
86
94
|
} catch (e) {
|
|
87
95
|
const err = e as OperationLockedError;
|
|
88
96
|
expect(err.holder.command).toBe("rescue");
|
|
@@ -92,11 +100,124 @@ describe("operation-lock", () => {
|
|
|
92
100
|
}
|
|
93
101
|
});
|
|
94
102
|
|
|
95
|
-
it("
|
|
103
|
+
it("timeoutSec:0 refuses immediately (no wait) — equivalent to wait:false", () => {
|
|
104
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "sync" });
|
|
105
|
+
const start = Date.now();
|
|
106
|
+
expect(() => acquireOperationLock(rootA, "reindex", { timeoutSec: 0 })).toThrowError(
|
|
107
|
+
OperationLockedError,
|
|
108
|
+
);
|
|
109
|
+
// Did not actually sleep.
|
|
110
|
+
expect(Date.now() - start).toBeLessThan(DEFAULT_LOCK_POLL_MS);
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
it("a bounded timeoutSec waits, then refuses with the old message + exit code", () => {
|
|
114
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
|
|
115
|
+
const start = Date.now();
|
|
116
|
+
let thrown: unknown;
|
|
117
|
+
try {
|
|
118
|
+
// 150ms bound, 40ms poll → waits ~150ms then gives up. Suppress the
|
|
119
|
+
// stderr status line with a no-op onWaitStart.
|
|
120
|
+
acquireOperationLock(rootA, "sync", {
|
|
121
|
+
timeoutSec: 0.15,
|
|
122
|
+
pollIntervalMs: 40,
|
|
123
|
+
onWaitStart: () => {},
|
|
124
|
+
});
|
|
125
|
+
} catch (e) {
|
|
126
|
+
thrown = e;
|
|
127
|
+
}
|
|
128
|
+
const elapsed = Date.now() - start;
|
|
129
|
+
expect(thrown).toBeInstanceOf(OperationLockedError);
|
|
130
|
+
expect((thrown as OperationLockedError).message).toContain("rescue");
|
|
131
|
+
expect(OPERATION_LOCKED_EXIT).toBe(17);
|
|
132
|
+
// It actually waited (didn't refuse instantly) but didn't hang forever.
|
|
133
|
+
expect(elapsed).toBeGreaterThanOrEqual(120);
|
|
134
|
+
expect(elapsed).toBeLessThan(3000);
|
|
135
|
+
});
|
|
136
|
+
|
|
137
|
+
it("HQ_OP_LOCK_TIMEOUT env bounds the wait when no explicit option is given", () => {
|
|
138
|
+
process.env.HQ_OP_LOCK_TIMEOUT = "0"; // 0 → refuse immediately
|
|
139
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "sync" });
|
|
140
|
+
const start = Date.now();
|
|
141
|
+
expect(() =>
|
|
142
|
+
acquireOperationLock(rootA, "rescue", { onWaitStart: () => {} }),
|
|
143
|
+
).toThrowError(OperationLockedError);
|
|
144
|
+
expect(Date.now() - start).toBeLessThan(DEFAULT_LOCK_POLL_MS);
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
it("an explicit timeoutSec overrides the HQ_OP_LOCK_TIMEOUT env", () => {
|
|
148
|
+
process.env.HQ_OP_LOCK_TIMEOUT = "9999"; // would be a near-infinite wait
|
|
149
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "sync" });
|
|
150
|
+
// The explicit 0 wins → refuse immediately rather than honoring the env.
|
|
151
|
+
expect(() =>
|
|
152
|
+
acquireOperationLock(rootA, "rescue", { timeoutSec: 0 }),
|
|
153
|
+
).toThrowError(OperationLockedError);
|
|
154
|
+
});
|
|
155
|
+
|
|
156
|
+
it("onWaitStart fires exactly once, naming the holder, even across many polls", () => {
|
|
157
|
+
writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
|
|
158
|
+
const calls: Array<{ cmd: string; attempted: string }> = [];
|
|
159
|
+
expect(() =>
|
|
160
|
+
acquireOperationLock(rootA, "sync", {
|
|
161
|
+
timeoutSec: 0.16,
|
|
162
|
+
pollIntervalMs: 30, // ~5 polls within the window
|
|
163
|
+
onWaitStart: (holder, attempted) => calls.push({ cmd: holder.command, attempted }),
|
|
164
|
+
}),
|
|
165
|
+
).toThrowError(OperationLockedError);
|
|
166
|
+
expect(calls).toHaveLength(1);
|
|
167
|
+
expect(calls[0]).toEqual({ cmd: "rescue", attempted: "sync" });
|
|
168
|
+
});
|
|
169
|
+
|
|
170
|
+
it("a waiter acquires the lock the moment the holder releases (async poll path)", async () => {
|
|
171
|
+
const p = lockPathFor(rootA);
|
|
172
|
+
// A foreign LIVE holder (pid 1) initially owns the lock.
|
|
173
|
+
writeLock(p, { pid: 1, command: "rescue" });
|
|
174
|
+
// Simulate the holder finishing ~80ms in by removing its lock file.
|
|
175
|
+
const release = setTimeout(() => fs.rmSync(p, { force: true }), 80);
|
|
176
|
+
const start = Date.now();
|
|
177
|
+
const h = await acquireOperationLockAsync(rootA, "sync", {
|
|
178
|
+
pollIntervalMs: 20,
|
|
179
|
+
onWaitStart: () => {},
|
|
180
|
+
});
|
|
181
|
+
clearTimeout(release);
|
|
182
|
+
const elapsed = Date.now() - start;
|
|
183
|
+
// We waited for the release, then took it over.
|
|
184
|
+
expect(elapsed).toBeGreaterThanOrEqual(60);
|
|
185
|
+
const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
|
|
186
|
+
expect(info.pid).toBe(process.pid);
|
|
187
|
+
expect(info.command).toBe("sync");
|
|
188
|
+
h.release();
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
it("multiple foreign holders in a row: each release lets the next waiter in (no FIFO guarantee)", async () => {
|
|
192
|
+
// The mutex is CROSS-PROCESS: it keys liveness on the holder's PID. Two
|
|
193
|
+
// waiters in the SAME process share a pid, so the same-process reclaim path
|
|
194
|
+
// would let them stomp each other — that scenario is unsupported by design.
|
|
195
|
+
// Here we model the real case: a sequence of FOREIGN holders (pid 1) that
|
|
196
|
+
// each release, with a single waiter acquiring the instant the lock frees.
|
|
197
|
+
// Order among multiple distinct-process waiters is whoever wins the next
|
|
198
|
+
// O_EXCL race after a free — best-effort, NOT FIFO (documented).
|
|
199
|
+
const p = lockPathFor(rootA);
|
|
200
|
+
writeLock(p, { pid: 1, command: "sync" });
|
|
201
|
+
// Free it shortly; the waiter should grab it right after.
|
|
202
|
+
setTimeout(() => fs.rmSync(p, { force: true }), 50);
|
|
203
|
+
const h = await acquireOperationLockAsync(rootA, "reindex", {
|
|
204
|
+
pollIntervalMs: 15,
|
|
205
|
+
onWaitStart: () => {},
|
|
206
|
+
});
|
|
207
|
+
const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
|
|
208
|
+
expect(info.command).toBe("reindex");
|
|
209
|
+
expect(info.pid).toBe(process.pid);
|
|
210
|
+
h.release();
|
|
211
|
+
});
|
|
212
|
+
|
|
213
|
+
it("reclaims a stale lock whose holder PID is dead (takeover, never waits)", () => {
|
|
96
214
|
const stale = deadPid();
|
|
97
215
|
writeLock(lockPathFor(rootA), { pid: stale, command: "sync" });
|
|
98
|
-
|
|
216
|
+
const start = Date.now();
|
|
217
|
+
// The dead holder must not block us — even with an infinite default wait,
|
|
218
|
+
// takeover is immediate.
|
|
99
219
|
const h = acquireOperationLock(rootA, "rescue");
|
|
220
|
+
expect(Date.now() - start).toBeLessThan(DEFAULT_LOCK_POLL_MS);
|
|
100
221
|
const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
|
|
101
222
|
expect(info.pid).toBe(process.pid); // we took it over
|
|
102
223
|
expect(info.command).toBe("rescue");
|
|
@@ -114,7 +235,7 @@ describe("operation-lock", () => {
|
|
|
114
235
|
|
|
115
236
|
it("different HQ roots are independent — both may hold concurrently", () => {
|
|
116
237
|
const a = acquireOperationLock(rootA, "sync");
|
|
117
|
-
const b = acquireOperationLock(rootB, "rescue"); // must NOT
|
|
238
|
+
const b = acquireOperationLock(rootB, "rescue"); // must NOT block
|
|
118
239
|
expect(fs.existsSync(a.path)).toBe(true);
|
|
119
240
|
expect(fs.existsSync(b.path)).toBe(true);
|
|
120
241
|
expect(a.path).not.toBe(b.path);
|
|
@@ -126,9 +247,14 @@ describe("operation-lock", () => {
|
|
|
126
247
|
// A live sync in ANOTHER process holds the root (pid 1 stands in for it).
|
|
127
248
|
const p = lockPathFor(rootA);
|
|
128
249
|
writeLock(p, { pid: 1, command: "sync" });
|
|
129
|
-
// Neither rescue nor reindex may acquire while that sync holds it
|
|
130
|
-
|
|
131
|
-
expect(() => acquireOperationLock(rootA, "
|
|
250
|
+
// Neither rescue nor reindex may acquire while that sync holds it
|
|
251
|
+
// (wait:false → assert the refusal without hanging on the new wait default).
|
|
252
|
+
expect(() => acquireOperationLock(rootA, "rescue", { wait: false })).toThrowError(
|
|
253
|
+
OperationLockedError,
|
|
254
|
+
);
|
|
255
|
+
expect(() => acquireOperationLock(rootA, "reindex", { wait: false })).toThrowError(
|
|
256
|
+
OperationLockedError,
|
|
257
|
+
);
|
|
132
258
|
// Once that sync finishes (its lock is gone), the next command acquires.
|
|
133
259
|
fs.unlinkSync(p);
|
|
134
260
|
const h2 = acquireOperationLock(rootA, "reindex");
|
|
@@ -147,6 +273,17 @@ describe("operation-lock", () => {
|
|
|
147
273
|
expect(fs.existsSync(p)).toBe(false); // released on the way out
|
|
148
274
|
});
|
|
149
275
|
|
|
276
|
+
it("withOperationLock (async) releases even when the body throws", async () => {
|
|
277
|
+
const p = lockPathFor(rootA);
|
|
278
|
+
await expect(
|
|
279
|
+
withOperationLock(rootA, "sync", async () => {
|
|
280
|
+
expect(fs.existsSync(p)).toBe(true);
|
|
281
|
+
throw new Error("boom");
|
|
282
|
+
}),
|
|
283
|
+
).rejects.toThrow("boom");
|
|
284
|
+
expect(fs.existsSync(p)).toBe(false);
|
|
285
|
+
});
|
|
286
|
+
|
|
150
287
|
it("HQ_DISABLE_OP_LOCK=1 makes acquisition a no-op", () => {
|
|
151
288
|
process.env.HQ_DISABLE_OP_LOCK = "1";
|
|
152
289
|
// Even with a live holder on record, the escape hatch acquires without error.
|
package/src/operation-lock.ts
CHANGED
|
@@ -35,11 +35,44 @@
|
|
|
35
35
|
* - The lock records the holder's `{ pid, command, startedAt, hqRoot }`. On
|
|
36
36
|
* EEXIST we test the recorded PID with `process.kill(pid, 0)`:
|
|
37
37
|
* * ESRCH → the holder is gone (crashed / killed -9 / stale file) →
|
|
38
|
-
* reclaim the lock
|
|
38
|
+
* reclaim the lock IMMEDIATELY (a dead holder never makes us
|
|
39
|
+
* wait).
|
|
39
40
|
* * EPERM → the PID exists but is owned by another user → treat as ALIVE
|
|
40
|
-
* (conservative:
|
|
41
|
-
* * success → alive →
|
|
42
|
-
*
|
|
41
|
+
* (conservative: wait rather than risk two concurrent ops).
|
|
42
|
+
* * success → alive → WAIT for the holder to release, then acquire (see
|
|
43
|
+
* "Waiting" below). The fast-refusal path is still reachable
|
|
44
|
+
* via an explicit timeout / `wait: false`.
|
|
45
|
+
*
|
|
46
|
+
* ## Waiting for a live holder (default behavior)
|
|
47
|
+
*
|
|
48
|
+
* When a LIVE holder owns the lock, acquisition WAITS by default: it polls
|
|
49
|
+
* (~2s) and acquires the instant the holder releases, rather than refusing
|
|
50
|
+
* fast. A single status line is written to stderr the first time we start
|
|
51
|
+
* waiting ("Waiting for <command> (pid N) to finish…"), never per-poll.
|
|
52
|
+
* This is what an interactive `sync` / `rescue` / `reindex` invocation wants —
|
|
53
|
+
* queue behind the running op instead of erroring out.
|
|
54
|
+
*
|
|
55
|
+
* A bounded escape exists for scripts that must not block forever:
|
|
56
|
+
* - `timeoutSec` option, or the `HQ_OP_LOCK_TIMEOUT` env var (seconds).
|
|
57
|
+
* The option wins over the env. After the bound elapses we throw
|
|
58
|
+
* {@link OperationLockedError} (exit 17) with the same clear refusal
|
|
59
|
+
* message as before.
|
|
60
|
+
* - `timeoutSec === 0` (or `HQ_OP_LOCK_TIMEOUT=0`, or `wait: false`) → do
|
|
61
|
+
* not wait at all; refuse immediately. This is the pre-wait behavior.
|
|
62
|
+
* - absent / negative / unparseable → INFINITE wait (the documented
|
|
63
|
+
* default).
|
|
64
|
+
* Stale-PID takeover is unconditional and happens BEFORE any wait — a dead
|
|
65
|
+
* holder is reclaimed at once regardless of the wait config.
|
|
66
|
+
*
|
|
67
|
+
* Ordering / scope caveats:
|
|
68
|
+
* - This is a CROSS-PROCESS mutex keyed on the holder's PID. Two concurrent
|
|
69
|
+
* acquisitions inside the SAME process share a PID, so the same-process
|
|
70
|
+
* reclaim path lets them stomp each other — in-process concurrent acquire
|
|
71
|
+
* is unsupported (the real consumers — sync / rescue / reindex — are
|
|
72
|
+
* separate processes).
|
|
73
|
+
* - When several distinct processes wait on the same lock, the next one to
|
|
74
|
+
* win the O_EXCL race after a free acquires. Order is best-effort, NOT
|
|
75
|
+
* FIFO — do not depend on arrival order.
|
|
43
76
|
* - PID reuse is an inherent, un-eliminable race for any PID-based scheme: if
|
|
44
77
|
* the original holder crashed and the OS later handed its PID to an
|
|
45
78
|
* unrelated process, we conservatively read that as "still held" and
|
|
@@ -104,6 +137,100 @@ export interface LockHandle {
|
|
|
104
137
|
release(): void;
|
|
105
138
|
}
|
|
106
139
|
|
|
140
|
+
/**
 * Pause, in milliseconds, between acquisition attempts while a live holder
 * owns the lock. Used whenever the caller does not supply a usable
 * `pollIntervalMs`.
 */
export const DEFAULT_LOCK_POLL_MS = 2 * 1000;
|
|
142
|
+
|
|
143
|
+
/**
 * Tuning knobs for `acquireOperationLock*` when the lock is already owned by
 * a LIVE holder. Every field is optional.
 */
export interface AcquireOptions {
  /**
   * Whether to queue behind a live holder. Leaving this unset (or `true`)
   * polls until the holder releases and then acquires. `false` forces a
   * zero-length wait, i.e. an immediate {@link OperationLockedError}; this is
   * the same outcome as `timeoutSec: 0`.
   */
  wait?: boolean;
  /**
   * Upper bound on the wait, in seconds; once it elapses acquisition throws
   * {@link OperationLockedError}. When set, this value is used instead of the
   * `HQ_OP_LOCK_TIMEOUT` environment variable. `0` means "do not wait".
   * Negative or non-finite values impose no bound (infinite wait).
   * Fractional seconds are honored and rounded to the nearest millisecond.
   */
  timeoutSec?: number;
  /**
   * Milliseconds between acquisition attempts while waiting. Falls back to
   * {@link DEFAULT_LOCK_POLL_MS} when omitted or not greater than zero.
   */
  pollIntervalMs?: number;
  /**
   * Hook called a single time, right before the first sleep on a live holder.
   * Receives the holder's lock record and the command being attempted. When
   * omitted, one "Waiting for …" line is written to stderr; pass a no-op to
   * silence it.
   */
  onWaitStart?: (holder: LockInfo, attempted: string) => void;
}
|
|
170
|
+
|
|
171
|
+
/** Fully-defaulted wait settings produced from {@link AcquireOptions} and the environment. */
interface ResolvedWaitConfig {
  /**
   * Wait budget in milliseconds. `null` means there is no deadline; `0` means
   * refuse on the first busy result; any positive value is the maximum wait.
   */
  timeoutMs: number | null;
  /** Milliseconds to sleep between acquisition attempts. */
  pollMs: number;
  /** Hook run once when waiting begins (never absent after resolution). */
  onWaitStart: (holder: LockInfo, attempted: string) => void;
}
|
|
177
|
+
|
|
178
|
+
/**
 * Built-in wait notification: emits one line on stderr that names the
 * command and pid currently holding the lock and the command that is queued
 * behind it, plus a hint about `HQ_OP_LOCK_TIMEOUT`.
 *
 * @param holder - Lock record of the process that currently owns the lock.
 * @param attempted - Name of the command that is about to start waiting.
 */
function defaultOnWaitStart(holder: LockInfo, attempted: string): void {
  const statusLine =
    `Waiting for "${holder.command}" (pid ${holder.pid}) to finish before ` +
    `starting "${attempted}"… (set HQ_OP_LOCK_TIMEOUT=<secs> to bound the wait)\n`;
  process.stderr.write(statusLine);
}
|
|
185
|
+
|
|
186
|
+
/**
 * Resolve the effective wait settings from the caller's options and the
 * `HQ_OP_LOCK_TIMEOUT` environment variable.
 *
 * Timeout precedence: `wait: false` (forces 0) > `opts.timeoutSec` >
 * `HQ_OP_LOCK_TIMEOUT` > no bound. A timeout value counts only when it is a
 * finite, non-negative number of seconds; anything else means "no bound".
 *
 * @param opts - Caller-supplied acquisition options (may be empty).
 * @returns `timeoutMs` (`null` = wait forever, `0` = refuse immediately),
 *   the poll interval in ms, and the wait-start hook to invoke.
 */
function resolveWaitConfig(opts: AcquireOptions): ResolvedWaitConfig {
  // Seconds → ms, or null for "no explicit bound" (undefined, NaN, ±Infinity, < 0).
  const toMs = (sec: number | undefined): number | null => {
    if (sec === undefined || !Number.isFinite(sec) || sec < 0) return null;
    return Math.round(sec * 1000);
  };

  let timeoutMs: number | null;
  if (opts.timeoutSec !== undefined) {
    // An explicit option always shadows the environment, even when it
    // resolves to "no bound".
    timeoutMs = toMs(opts.timeoutSec);
  } else {
    // Trim before parsing: Number("   ") is 0, so a blank value would
    // otherwise be read as "refuse immediately" instead of "not set".
    const envRaw = process.env.HQ_OP_LOCK_TIMEOUT?.trim();
    timeoutMs = envRaw ? toMs(Number(envRaw)) : null;
  }

  // `wait: false` is shorthand for a zero-length wait (refuse immediately).
  if (opts.wait === false) timeoutMs = 0;

  // Only a finite, positive interval is usable. An infinite interval would
  // put the waiter to sleep without ever re-checking the lock.
  const requestedPoll = opts.pollIntervalMs;
  const pollMs =
    requestedPoll !== undefined && Number.isFinite(requestedPoll) && requestedPoll > 0
      ? requestedPoll
      : DEFAULT_LOCK_POLL_MS;

  return { timeoutMs, pollMs, onWaitStart: opts.onWaitStart ?? defaultOnWaitStart };
}
|
|
219
|
+
|
|
220
|
+
/**
 * Block the current thread for roughly `ms` milliseconds without
 * busy-spinning (used by the synchronous acquire path).
 *
 * @param ms - Sleep duration in milliseconds. Zero, negative, and `NaN`
 *   return immediately. `+Infinity` is passed through unchanged and blocks
 *   indefinitely, so callers must clamp the value if that is not intended.
 */
function sleepSync(ms: number): void {
  // `!(ms > 0)` rather than `ms <= 0`: the latter lets NaN through, and
  // Atomics.wait treats a NaN timeout as "no timeout", blocking forever.
  if (!(ms > 0)) return;
  // Atomics.wait on a private buffer is a CPU-free sleep: the slot holds 0,
  // nothing else can see the buffer to notify it, so the call returns only
  // when the timeout elapses.
  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
}
|
|
228
|
+
|
|
229
|
+
/**
 * Non-blocking sleep for the async acquire path.
 *
 * The delay is clamped to the range Node's timers accept. A delay above
 * 2^31-1 ms (including `Infinity`) would make `setTimeout` fire after 1 ms,
 * turning a long wait into a hot poll. Waking early at the cap is harmless
 * for a poll loop, which simply re-checks and sleeps again.
 *
 * @param ms - Requested delay in milliseconds; negative and `NaN` become 0.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleepAsync(ms: number): Promise<void> {
  const MAX_TIMER_DELAY_MS = 2147483647; // 2^31 - 1, largest delay setTimeout honors
  const delay = Number.isNaN(ms) ? 0 : Math.min(Math.max(0, ms), MAX_TIMER_DELAY_MS);
  return new Promise((resolve) => setTimeout(resolve, delay));
}
|
|
233
|
+
|
|
107
234
|
/**
 * Directory that holds HQ state. A non-empty `HQ_STATE_DIR` environment
 * variable wins; otherwise it is `.hq` under the current user's home.
 */
function stateDir(): string {
  const override = process.env.HQ_STATE_DIR;
  if (override) return override;
  return path.join(os.homedir(), ".hq");
}
|
|
@@ -200,35 +327,45 @@ function makeHandle(p: string, info: LockInfo): LockHandle {
|
|
|
200
327
|
|
|
201
328
|
const NOOP_HANDLE_BASE = { release() {} };
|
|
202
329
|
|
|
203
|
-
/**
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
startedAt: new Date().toISOString(),
|
|
214
|
-
hqRoot: path.resolve(hqRoot),
|
|
215
|
-
};
|
|
216
|
-
return { ...NOOP_HANDLE_BASE, path: "", info };
|
|
217
|
-
}
|
|
330
|
+
/**
 * Handle returned when locking is switched off via `HQ_DISABLE_OP_LOCK=1`.
 * No file is touched: `path` is empty, `release()` comes from the shared
 * no-op base, and `info` describes the current process as if it held the lock.
 *
 * @param hqRoot - HQ root the caller wanted to lock (stored as an absolute path).
 * @param command - Name of the command that asked for the lock.
 */
function disabledHandle(hqRoot: string, command: string): LockHandle {
  return {
    ...NOOP_HANDLE_BASE,
    path: "",
    info: {
      pid: process.pid,
      command,
      startedAt: new Date().toISOString(),
      hqRoot: path.resolve(hqRoot),
    },
  };
}
|
|
218
340
|
|
|
341
|
+
/**
 * Gather everything an acquisition attempt needs: the lock file path for
 * `hqRoot`, this process's lock record, and that record serialized as
 * pretty-printed JSON. Also creates the lock file's parent directory
 * (recursively) if it is missing; the lock file itself is not created here.
 *
 * @param hqRoot - HQ root to lock; recorded in the lock info as an absolute path.
 * @param command - Name of the command requesting the lock.
 * @returns `p` (lock file path), `info` (lock record), `payload` (JSON text of `info`).
 */
function prepareLock(hqRoot: string, command: string): { p: string; info: LockInfo; payload: string } {
  const lockFile = lockPathFor(hqRoot);
  fs.mkdirSync(path.dirname(lockFile), { recursive: true });

  const info: LockInfo = {
    pid: process.pid,
    command,
    startedAt: new Date().toISOString(),
    hqRoot: path.resolve(hqRoot),
  };
  const payload = JSON.stringify(info, null, 2);

  return { p: lockFile, info, payload };
}
|
|
229
353
|
|
|
354
|
+
/**
|
|
355
|
+
* One acquisition pass. Returns the {@link LockHandle} on success, or
|
|
356
|
+
* `{ busy }` naming the LIVE holder that blocked us (so the caller can decide
|
|
357
|
+
* to wait or refuse). A stale/torn/own-leftover lock is reclaimed in-pass and
|
|
358
|
+
* never reported as busy. Throws only on genuinely pathological churn or a
|
|
359
|
+
* non-EEXIST fs error.
|
|
360
|
+
*/
|
|
361
|
+
function tryAcquireOnce(
|
|
362
|
+
p: string,
|
|
363
|
+
info: LockInfo,
|
|
364
|
+
payload: string,
|
|
365
|
+
): { handle: LockHandle } | { busy: LockInfo } {
|
|
230
366
|
// Bounded retry: each iteration is one atomic create attempt. EEXIST against
|
|
231
|
-
// a stale holder reclaims and retries; EEXIST against a live holder
|
|
367
|
+
// a stale holder reclaims and retries; EEXIST against a live holder reports
|
|
368
|
+
// it as busy.
|
|
232
369
|
const MAX_ATTEMPTS = 5;
|
|
233
370
|
for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
|
|
234
371
|
let fd: number;
|
|
@@ -239,7 +376,7 @@ export function acquireOperationLock(hqRoot: string, command: string): LockHandl
|
|
|
239
376
|
|
|
240
377
|
const holder = readLockInfo(p);
|
|
241
378
|
if (holder && holder.pid !== process.pid && pidAlive(holder.pid)) {
|
|
242
|
-
|
|
379
|
+
return { busy: holder };
|
|
243
380
|
}
|
|
244
381
|
// Stale (dead holder), unreadable/torn, or our own leftover → reclaim.
|
|
245
382
|
try {
|
|
@@ -254,7 +391,7 @@ export function acquireOperationLock(hqRoot: string, command: string): LockHandl
|
|
|
254
391
|
} finally {
|
|
255
392
|
fs.closeSync(fd);
|
|
256
393
|
}
|
|
257
|
-
return makeHandle(p, info);
|
|
394
|
+
return { handle: makeHandle(p, info) };
|
|
258
395
|
}
|
|
259
396
|
|
|
260
397
|
// Pathological churn (another process reclaiming in lockstep). Surface it
|
|
@@ -264,13 +401,83 @@ export function acquireOperationLock(hqRoot: string, command: string): LockHandl
|
|
|
264
401
|
);
|
|
265
402
|
}
|
|
266
403
|
|
|
404
|
+
/**
 * Milliseconds between now and `deadline` (an epoch-ms timestamp). The result
 * is zero or negative once the deadline has passed; a `null` deadline never
 * expires and yields `Infinity`.
 */
function remainingMs(deadline: number | null): number {
  if (deadline === null) return Infinity;
  return deadline - Date.now();
}
|
|
408
|
+
|
|
409
|
+
/**
 * Synchronously take the per-root operation lock on behalf of `command`.
 *
 * With `HQ_DISABLE_OP_LOCK=1` a no-op handle is returned and nothing is
 * locked. Otherwise acquisition attempts are made in a loop: an attempt that
 * yields a handle ends the loop; an attempt that reports a live holder either
 * throws (wait budget exhausted, which is immediate for a zero budget) or
 * blocks this thread for one poll interval and tries again. The wait-start
 * hook runs once, before the first sleep.
 *
 * @param hqRoot - HQ root directory whose lock is wanted.
 * @param command - Name of the command requesting the lock.
 * @param opts - Wait behavior (`wait`, `timeoutSec`, `pollIntervalMs`, `onWaitStart`).
 * @returns The handle for the acquired (or disabled) lock.
 * @throws OperationLockedError when a live holder still owns the lock after
 *   the wait budget is spent.
 */
export function acquireOperationLock(
  hqRoot: string,
  command: string,
  opts: AcquireOptions = {},
): LockHandle {
  if (process.env.HQ_DISABLE_OP_LOCK === "1") {
    return disabledHandle(hqRoot, command);
  }

  const lock = prepareLock(hqRoot, command);
  const wait = resolveWaitConfig(opts);
  // Absolute cut-off in epoch ms; null when the wait is unbounded.
  const deadline = wait.timeoutMs === null ? null : Date.now() + wait.timeoutMs;

  let waitAnnounced = false;
  while (true) {
    const outcome = tryAcquireOnce(lock.p, lock.info, lock.payload);
    if ("handle" in outcome) return outcome.handle;

    // A live holder is in the way: give up if the budget is spent …
    const holder = outcome.busy;
    if (remainingMs(deadline) <= 0) {
      throw new OperationLockedError(holder, command);
    }
    // … otherwise tell the user once, then sleep and retry. The sleep never
    // overshoots the deadline.
    if (!waitAnnounced) {
      waitAnnounced = true;
      wait.onWaitStart(holder, command);
    }
    sleepSync(Math.min(wait.pollMs, remainingMs(deadline)));
  }
}
|
|
442
|
+
|
|
443
|
+
/**
 * Promise-based variant of {@link acquireOperationLock}. The acquisition
 * loop is the same (return on a handle; on a live holder either throw once
 * the wait budget is spent or announce once, pause, and retry), but each
 * pause awaits a timer instead of blocking the thread, so the event loop
 * keeps running while queued.
 *
 * @param hqRoot - HQ root directory whose lock is wanted.
 * @param command - Name of the command requesting the lock.
 * @param opts - Wait behavior (`wait`, `timeoutSec`, `pollIntervalMs`, `onWaitStart`).
 * @returns A promise for the acquired (or disabled) lock handle.
 * @throws OperationLockedError (as a rejection) when a live holder still owns
 *   the lock after the wait budget is spent.
 */
export async function acquireOperationLockAsync(
  hqRoot: string,
  command: string,
  opts: AcquireOptions = {},
): Promise<LockHandle> {
  if (process.env.HQ_DISABLE_OP_LOCK === "1") {
    return disabledHandle(hqRoot, command);
  }

  const lock = prepareLock(hqRoot, command);
  const wait = resolveWaitConfig(opts);
  // Absolute cut-off in epoch ms; null when the wait is unbounded.
  const deadline = wait.timeoutMs === null ? null : Date.now() + wait.timeoutMs;

  let waitAnnounced = false;
  while (true) {
    const outcome = tryAcquireOnce(lock.p, lock.info, lock.payload);
    if ("handle" in outcome) return outcome.handle;

    const holder = outcome.busy;
    if (remainingMs(deadline) <= 0) {
      throw new OperationLockedError(holder, command);
    }
    if (!waitAnnounced) {
      waitAnnounced = true;
      wait.onWaitStart(holder, command);
    }
    // Yield to the event loop for one poll interval, capped at the deadline.
    await sleepAsync(Math.min(wait.pollMs, remainingMs(deadline)));
  }
}
|
|
472
|
+
|
|
267
473
|
/** Run `fn` while holding the per-root lock for `command` (async). */
|
|
268
474
|
export async function withOperationLock<T>(
|
|
269
475
|
hqRoot: string,
|
|
270
476
|
command: string,
|
|
271
477
|
fn: () => Promise<T>,
|
|
478
|
+
opts: AcquireOptions = {},
|
|
272
479
|
): Promise<T> {
|
|
273
|
-
const handle =
|
|
480
|
+
const handle = await acquireOperationLockAsync(hqRoot, command, opts);
|
|
274
481
|
try {
|
|
275
482
|
return await fn();
|
|
276
483
|
} finally {
|
|
@@ -283,8 +490,9 @@ export function withOperationLockSync<T>(
|
|
|
283
490
|
hqRoot: string,
|
|
284
491
|
command: string,
|
|
285
492
|
fn: () => T,
|
|
493
|
+
opts: AcquireOptions = {},
|
|
286
494
|
): T {
|
|
287
|
-
const handle = acquireOperationLock(hqRoot, command);
|
|
495
|
+
const handle = acquireOperationLock(hqRoot, command, opts);
|
|
288
496
|
try {
|
|
289
497
|
return fn();
|
|
290
498
|
} finally {
|