@indigoai-us/hq-cloud 6.7.1 → 6.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/sync-runner.d.ts.map +1 -1
- package/dist/bin/sync-runner.js +33 -1
- package/dist/bin/sync-runner.js.map +1 -1
- package/dist/bin/sync-runner.test.js +73 -4
- package/dist/bin/sync-runner.test.js.map +1 -1
- package/dist/cli/reindex.d.ts +11 -0
- package/dist/cli/reindex.d.ts.map +1 -1
- package/dist/cli/reindex.js +1 -1
- package/dist/cli/reindex.js.map +1 -1
- package/dist/cli/reindex.test.js +5 -4
- package/dist/cli/reindex.test.js.map +1 -1
- package/dist/cli/rescue.d.ts +20 -0
- package/dist/cli/rescue.d.ts.map +1 -1
- package/dist/cli/rescue.js +36 -2
- package/dist/cli/rescue.js.map +1 -1
- package/dist/cli/rescue.test.js +38 -1
- package/dist/cli/rescue.test.js.map +1 -1
- package/dist/operation-lock.d.ts +81 -10
- package/dist/operation-lock.d.ts.map +1 -1
- package/dist/operation-lock.js +177 -27
- package/dist/operation-lock.js.map +1 -1
- package/dist/operation-lock.test.js +122 -11
- package/dist/operation-lock.test.js.map +1 -1
- package/package.json +1 -1
- package/src/bin/sync-runner.test.ts +83 -4
- package/src/bin/sync-runner.ts +39 -1
- package/src/cli/reindex.test.ts +5 -4
- package/src/cli/reindex.ts +12 -1
- package/src/cli/rescue.test.ts +43 -1
- package/src/cli/rescue.ts +48 -2
- package/src/operation-lock.test.ts +147 -10
- package/src/operation-lock.ts +234 -26
package/src/operation-lock.ts
CHANGED
|
@@ -35,11 +35,44 @@
|
|
|
35
35
|
* - The lock records the holder's `{ pid, command, startedAt, hqRoot }`. On
|
|
36
36
|
* EEXIST we test the recorded PID with `process.kill(pid, 0)`:
|
|
37
37
|
* * ESRCH → the holder is gone (crashed / killed -9 / stale file) →
|
|
38
|
-
* reclaim the lock
|
|
38
|
+
* reclaim the lock IMMEDIATELY (a dead holder never makes us
|
|
39
|
+
* wait).
|
|
39
40
|
* * EPERM → the PID exists but is owned by another user → treat as ALIVE
|
|
40
|
-
* (conservative:
|
|
41
|
-
* * success → alive →
|
|
42
|
-
*
|
|
41
|
+
* (conservative: wait rather than risk two concurrent ops).
|
|
42
|
+
* * success → alive → WAIT for the holder to release, then acquire (see
|
|
43
|
+
* "Waiting" below). The fast-refusal path is still reachable
|
|
44
|
+
* via an explicit timeout / `wait: false`.
|
|
45
|
+
*
|
|
46
|
+
* ## Waiting for a live holder (default behavior)
|
|
47
|
+
*
|
|
48
|
+
* When a LIVE holder owns the lock, acquisition WAITS by default: it polls
|
|
49
|
+
* (~2s) and acquires the instant the holder releases, rather than refusing
|
|
50
|
+
* fast. A single status line is written to stderr the first time we start
|
|
51
|
+
* waiting ("Waiting for <command> (pid N) to finish…"), never per-poll.
|
|
52
|
+
* This is what an interactive `sync` / `rescue` / `reindex` invocation wants —
|
|
53
|
+
* queue behind the running op instead of erroring out.
|
|
54
|
+
*
|
|
55
|
+
* A bounded escape exists for scripts that must not block forever:
|
|
56
|
+
* - `timeoutSec` option, or the `HQ_OP_LOCK_TIMEOUT` env var (seconds).
|
|
57
|
+
* The option wins over the env. After the bound elapses we throw
|
|
58
|
+
* {@link OperationLockedError} (exit 17) with the same clear refusal
|
|
59
|
+
* message as before.
|
|
60
|
+
* - `timeoutSec === 0` (or `HQ_OP_LOCK_TIMEOUT=0`, or `wait: false`) → do
|
|
61
|
+
* not wait at all; refuse immediately. This is the pre-wait behavior.
|
|
62
|
+
* - absent / negative / unparseable → INFINITE wait (the documented
|
|
63
|
+
* default).
|
|
64
|
+
* Stale-PID takeover is unconditional and happens BEFORE any wait — a dead
|
|
65
|
+
* holder is reclaimed at once regardless of the wait config.
|
|
66
|
+
*
|
|
67
|
+
* Ordering / scope caveats:
|
|
68
|
+
* - This is a CROSS-PROCESS mutex keyed on the holder's PID. Two concurrent
|
|
69
|
+
* acquisitions inside the SAME process share a PID, so the same-process
|
|
70
|
+
* reclaim path lets them stomp each other — in-process concurrent acquire
|
|
71
|
+
* is unsupported (the real consumers — sync / rescue / reindex — are
|
|
72
|
+
* separate processes).
|
|
73
|
+
* - When several distinct processes wait on the same lock, the next one to
|
|
74
|
+
* win the O_EXCL race after a free acquires. Order is best-effort, NOT
|
|
75
|
+
* FIFO — do not depend on arrival order.
|
|
43
76
|
* - PID reuse is an inherent, un-eliminable race for any PID-based scheme: if
|
|
44
77
|
* the original holder crashed and the OS later handed its PID to an
|
|
45
78
|
* unrelated process, we conservatively read that as "still held" and
|
|
@@ -104,6 +137,100 @@ export interface LockHandle {
|
|
|
104
137
|
release(): void;
|
|
105
138
|
}
|
|
106
139
|
|
|
140
|
+
/** Default poll interval while waiting on a live holder. */
|
|
141
|
+
export const DEFAULT_LOCK_POLL_MS = 2000;
|
|
142
|
+
|
|
143
|
+
/** Options controlling how `acquireOperationLock*` behaves against a LIVE holder. */
|
|
144
|
+
export interface AcquireOptions {
|
|
145
|
+
/**
|
|
146
|
+
* When a LIVE holder owns the lock: `true` (default) → WAIT-poll until it
|
|
147
|
+
* frees, then acquire; `false` → refuse immediately with
|
|
148
|
+
* {@link OperationLockedError}. A `timeoutSec` of 0 is equivalent to
|
|
149
|
+
* `wait: false`.
|
|
150
|
+
*/
|
|
151
|
+
wait?: boolean;
|
|
152
|
+
/**
|
|
153
|
+
* Bounded wait, in seconds, before giving up and throwing
|
|
154
|
+
* {@link OperationLockedError} (exit 17). Precedence: this option > the
|
|
155
|
+
* `HQ_OP_LOCK_TIMEOUT` env var > infinite. `0` → do not wait at all (refuse
|
|
156
|
+
* immediately). Negative / non-finite → treated as absent (infinite wait).
|
|
157
|
+
* Fractional values are honored (used by tests); the CLI flags accept whole
|
|
158
|
+
* seconds.
|
|
159
|
+
*/
|
|
160
|
+
timeoutSec?: number;
|
|
161
|
+
/** Poll interval in ms while waiting. Defaults to {@link DEFAULT_LOCK_POLL_MS}. */
|
|
162
|
+
pollIntervalMs?: number;
|
|
163
|
+
/**
|
|
164
|
+
* Invoked exactly ONCE, the first time we begin waiting on a live holder.
|
|
165
|
+
* Defaults to a single "Waiting for …" line on stderr. Pass a custom hook
|
|
166
|
+
* (or a no-op) to redirect/silence the status line.
|
|
167
|
+
*/
|
|
168
|
+
onWaitStart?: (holder: LockInfo, attempted: string) => void;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
interface ResolvedWaitConfig {
|
|
172
|
+
/** null → wait forever; >= 0 → wait at most this many ms (0 = no wait). */
|
|
173
|
+
timeoutMs: number | null;
|
|
174
|
+
pollMs: number;
|
|
175
|
+
onWaitStart: (holder: LockInfo, attempted: string) => void;
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/** Default status line: a single stderr message naming the holder. */
|
|
179
|
+
function defaultOnWaitStart(holder: LockInfo, attempted: string): void {
|
|
180
|
+
process.stderr.write(
|
|
181
|
+
`Waiting for "${holder.command}" (pid ${holder.pid}) to finish before ` +
|
|
182
|
+
`starting "${attempted}"… (set HQ_OP_LOCK_TIMEOUT=<secs> to bound the wait)\n`,
|
|
183
|
+
);
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
/**
|
|
187
|
+
* Resolve the effective wait config from explicit options + the
|
|
188
|
+
* `HQ_OP_LOCK_TIMEOUT` env var. Option timeout wins over the env; `wait: false`
|
|
189
|
+
* forces a zero (no-wait) timeout.
|
|
190
|
+
*/
|
|
191
|
+
function resolveWaitConfig(opts: AcquireOptions): ResolvedWaitConfig {
|
|
192
|
+
// Parse a seconds value into ms, or null for "absent/infinite". Only a
|
|
193
|
+
// finite, non-negative number counts; everything else (NaN, Infinity, <0)
|
|
194
|
+
// means "no explicit bound".
|
|
195
|
+
const toMs = (sec: number | undefined): number | null => {
|
|
196
|
+
if (sec === undefined) return null;
|
|
197
|
+
if (!Number.isFinite(sec) || sec < 0) return null;
|
|
198
|
+
return Math.round(sec * 1000);
|
|
199
|
+
};
|
|
200
|
+
|
|
201
|
+
let timeoutMs: number | null;
|
|
202
|
+
if (opts.timeoutSec !== undefined) {
|
|
203
|
+
timeoutMs = toMs(opts.timeoutSec);
|
|
204
|
+
} else {
|
|
205
|
+
const envRaw = process.env.HQ_OP_LOCK_TIMEOUT;
|
|
206
|
+
timeoutMs = envRaw !== undefined && envRaw !== "" ? toMs(Number(envRaw)) : null;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
// `wait: false` is shorthand for a zero-length wait (refuse immediately).
|
|
210
|
+
if (opts.wait === false) timeoutMs = 0;
|
|
211
|
+
|
|
212
|
+
const pollMs =
|
|
213
|
+
opts.pollIntervalMs && opts.pollIntervalMs > 0
|
|
214
|
+
? opts.pollIntervalMs
|
|
215
|
+
: DEFAULT_LOCK_POLL_MS;
|
|
216
|
+
|
|
217
|
+
return { timeoutMs, pollMs, onWaitStart: opts.onWaitStart ?? defaultOnWaitStart };
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
/** Block the current thread for `ms` without busy-spinning (sync consumers). */
|
|
221
|
+
function sleepSync(ms: number): void {
|
|
222
|
+
if (ms <= 0) return;
|
|
223
|
+
// Atomics.wait on a private buffer is a clean, CPU-free sleep. The value at
|
|
224
|
+
// index 0 is 0 and nothing ever notifies it, so this always sleeps the full
|
|
225
|
+
// timeout (or less if interrupted) and returns "timed-out".
|
|
226
|
+
Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
/** Non-blocking sleep for the async consumer. */
|
|
230
|
+
function sleepAsync(ms: number): Promise<void> {
|
|
231
|
+
return new Promise((resolve) => setTimeout(resolve, Math.max(0, ms)));
|
|
232
|
+
}
|
|
233
|
+
|
|
107
234
|
function stateDir(): string {
|
|
108
235
|
return process.env.HQ_STATE_DIR || path.join(os.homedir(), ".hq");
|
|
109
236
|
}
|
|
@@ -200,35 +327,45 @@ function makeHandle(p: string, info: LockInfo): LockHandle {
|
|
|
200
327
|
|
|
201
328
|
const NOOP_HANDLE_BASE = { release() {} };
|
|
202
329
|
|
|
203
|
-
/**
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
startedAt: new Date().toISOString(),
|
|
214
|
-
hqRoot: path.resolve(hqRoot),
|
|
215
|
-
};
|
|
216
|
-
return { ...NOOP_HANDLE_BASE, path: "", info };
|
|
217
|
-
}
|
|
330
|
+
/** No-op handle for the `HQ_DISABLE_OP_LOCK=1` escape hatch. */
|
|
331
|
+
function disabledHandle(hqRoot: string, command: string): LockHandle {
|
|
332
|
+
const info: LockInfo = {
|
|
333
|
+
pid: process.pid,
|
|
334
|
+
command,
|
|
335
|
+
startedAt: new Date().toISOString(),
|
|
336
|
+
hqRoot: path.resolve(hqRoot),
|
|
337
|
+
};
|
|
338
|
+
return { ...NOOP_HANDLE_BASE, path: "", info };
|
|
339
|
+
}
|
|
218
340
|
|
|
341
|
+
/** Build the lock payload + ensure the locks dir exists. */
|
|
342
|
+
function prepareLock(hqRoot: string, command: string): { p: string; info: LockInfo; payload: string } {
|
|
219
343
|
const p = lockPathFor(hqRoot);
|
|
220
344
|
fs.mkdirSync(path.dirname(p), { recursive: true });
|
|
221
|
-
|
|
222
345
|
const info: LockInfo = {
|
|
223
346
|
pid: process.pid,
|
|
224
347
|
command,
|
|
225
348
|
startedAt: new Date().toISOString(),
|
|
226
349
|
hqRoot: path.resolve(hqRoot),
|
|
227
350
|
};
|
|
228
|
-
|
|
351
|
+
return { p, info, payload: JSON.stringify(info, null, 2) };
|
|
352
|
+
}
|
|
229
353
|
|
|
354
|
+
/**
|
|
355
|
+
* One acquisition pass. Returns the {@link LockHandle} on success, or
|
|
356
|
+
* `{ busy }` naming the LIVE holder that blocked us (so the caller can decide
|
|
357
|
+
* to wait or refuse). A stale/torn/own-leftover lock is reclaimed in-pass and
|
|
358
|
+
* never reported as busy. Throws only on genuinely pathological churn or a
|
|
359
|
+
* non-EEXIST fs error.
|
|
360
|
+
*/
|
|
361
|
+
function tryAcquireOnce(
|
|
362
|
+
p: string,
|
|
363
|
+
info: LockInfo,
|
|
364
|
+
payload: string,
|
|
365
|
+
): { handle: LockHandle } | { busy: LockInfo } {
|
|
230
366
|
// Bounded retry: each iteration is one atomic create attempt. EEXIST against
|
|
231
|
-
// a stale holder reclaims and retries; EEXIST against a live holder
|
|
367
|
+
// a stale holder reclaims and retries; EEXIST against a live holder reports
|
|
368
|
+
// it as busy.
|
|
232
369
|
const MAX_ATTEMPTS = 5;
|
|
233
370
|
for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
|
|
234
371
|
let fd: number;
|
|
@@ -239,7 +376,7 @@ export function acquireOperationLock(hqRoot: string, command: string): LockHandl
|
|
|
239
376
|
|
|
240
377
|
const holder = readLockInfo(p);
|
|
241
378
|
if (holder && holder.pid !== process.pid && pidAlive(holder.pid)) {
|
|
242
|
-
|
|
379
|
+
return { busy: holder };
|
|
243
380
|
}
|
|
244
381
|
// Stale (dead holder), unreadable/torn, or our own leftover → reclaim.
|
|
245
382
|
try {
|
|
@@ -254,7 +391,7 @@ export function acquireOperationLock(hqRoot: string, command: string): LockHandl
|
|
|
254
391
|
} finally {
|
|
255
392
|
fs.closeSync(fd);
|
|
256
393
|
}
|
|
257
|
-
return makeHandle(p, info);
|
|
394
|
+
return { handle: makeHandle(p, info) };
|
|
258
395
|
}
|
|
259
396
|
|
|
260
397
|
// Pathological churn (another process reclaiming in lockstep). Surface it
|
|
@@ -264,13 +401,83 @@ export function acquireOperationLock(hqRoot: string, command: string): LockHandl
|
|
|
264
401
|
);
|
|
265
402
|
}
|
|
266
403
|
|
|
404
|
+
/** ms left until `deadline` (null deadline → never expires). */
|
|
405
|
+
function remainingMs(deadline: number | null): number {
|
|
406
|
+
return deadline === null ? Infinity : deadline - Date.now();
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
/**
|
|
410
|
+
* Acquire the per-root operation lock for `command` (synchronous). Returns a
|
|
411
|
+
* {@link LockHandle} on success. Against a LIVE holder it WAITS by default
|
|
412
|
+
* (polling, blocking the thread) and acquires the moment the holder releases;
|
|
413
|
+
* pass `timeoutSec`/`wait` (or set `HQ_OP_LOCK_TIMEOUT`) to bound or disable the
|
|
414
|
+
* wait — on expiry it throws {@link OperationLockedError}. A stale lock (dead
|
|
415
|
+
* holder) is reclaimed immediately, never waited on.
|
|
416
|
+
*/
|
|
417
|
+
export function acquireOperationLock(
|
|
418
|
+
hqRoot: string,
|
|
419
|
+
command: string,
|
|
420
|
+
opts: AcquireOptions = {},
|
|
421
|
+
): LockHandle {
|
|
422
|
+
if (process.env.HQ_DISABLE_OP_LOCK === "1") return disabledHandle(hqRoot, command);
|
|
423
|
+
|
|
424
|
+
const { p, info, payload } = prepareLock(hqRoot, command);
|
|
425
|
+
const cfg = resolveWaitConfig(opts);
|
|
426
|
+
const deadline = cfg.timeoutMs === null ? null : Date.now() + cfg.timeoutMs;
|
|
427
|
+
let announced = false;
|
|
428
|
+
|
|
429
|
+
for (;;) {
|
|
430
|
+
const res = tryAcquireOnce(p, info, payload);
|
|
431
|
+
if ("handle" in res) return res.handle;
|
|
432
|
+
|
|
433
|
+
// A live holder blocked us. Decide: refuse now, or wait and retry.
|
|
434
|
+
if (remainingMs(deadline) <= 0) throw new OperationLockedError(res.busy, command);
|
|
435
|
+
if (!announced) {
|
|
436
|
+
announced = true;
|
|
437
|
+
cfg.onWaitStart(res.busy, command);
|
|
438
|
+
}
|
|
439
|
+
sleepSync(Math.min(cfg.pollMs, remainingMs(deadline)));
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
/**
|
|
444
|
+
* Async counterpart to {@link acquireOperationLock}. Identical semantics, but
|
|
445
|
+
* the wait yields the event loop (via `setTimeout`) instead of blocking the
|
|
446
|
+
* thread — required for the async `sync` runner.
|
|
447
|
+
*/
|
|
448
|
+
export async function acquireOperationLockAsync(
|
|
449
|
+
hqRoot: string,
|
|
450
|
+
command: string,
|
|
451
|
+
opts: AcquireOptions = {},
|
|
452
|
+
): Promise<LockHandle> {
|
|
453
|
+
if (process.env.HQ_DISABLE_OP_LOCK === "1") return disabledHandle(hqRoot, command);
|
|
454
|
+
|
|
455
|
+
const { p, info, payload } = prepareLock(hqRoot, command);
|
|
456
|
+
const cfg = resolveWaitConfig(opts);
|
|
457
|
+
const deadline = cfg.timeoutMs === null ? null : Date.now() + cfg.timeoutMs;
|
|
458
|
+
let announced = false;
|
|
459
|
+
|
|
460
|
+
for (;;) {
|
|
461
|
+
const res = tryAcquireOnce(p, info, payload);
|
|
462
|
+
if ("handle" in res) return res.handle;
|
|
463
|
+
|
|
464
|
+
if (remainingMs(deadline) <= 0) throw new OperationLockedError(res.busy, command);
|
|
465
|
+
if (!announced) {
|
|
466
|
+
announced = true;
|
|
467
|
+
cfg.onWaitStart(res.busy, command);
|
|
468
|
+
}
|
|
469
|
+
await sleepAsync(Math.min(cfg.pollMs, remainingMs(deadline)));
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
|
|
267
473
|
/** Run `fn` while holding the per-root lock for `command` (async). */
|
|
268
474
|
export async function withOperationLock<T>(
|
|
269
475
|
hqRoot: string,
|
|
270
476
|
command: string,
|
|
271
477
|
fn: () => Promise<T>,
|
|
478
|
+
opts: AcquireOptions = {},
|
|
272
479
|
): Promise<T> {
|
|
273
|
-
const handle =
|
|
480
|
+
const handle = await acquireOperationLockAsync(hqRoot, command, opts);
|
|
274
481
|
try {
|
|
275
482
|
return await fn();
|
|
276
483
|
} finally {
|
|
@@ -283,8 +490,9 @@ export function withOperationLockSync<T>(
|
|
|
283
490
|
hqRoot: string,
|
|
284
491
|
command: string,
|
|
285
492
|
fn: () => T,
|
|
493
|
+
opts: AcquireOptions = {},
|
|
286
494
|
): T {
|
|
287
|
-
const handle = acquireOperationLock(hqRoot, command);
|
|
495
|
+
const handle = acquireOperationLock(hqRoot, command, opts);
|
|
288
496
|
try {
|
|
289
497
|
return fn();
|
|
290
498
|
} finally {
|