@indigoai-us/hq-cloud 6.7.1 → 6.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -35,11 +35,44 @@
35
35
  * - The lock records the holder's `{ pid, command, startedAt, hqRoot }`. On
36
36
  * EEXIST we test the recorded PID with `process.kill(pid, 0)`:
37
37
  * * ESRCH → the holder is gone (crashed / killed -9 / stale file) →
38
- * reclaim the lock.
38
+ * reclaim the lock IMMEDIATELY (a dead holder never makes us
39
+ * wait).
39
40
  * * EPERM → the PID exists but is owned by another user → treat as ALIVE
40
- * (conservative: refuse rather than risk two concurrent ops).
41
- * * success → alive → refuse fast with {@link OperationLockedError}
42
- * naming the holding command + PID.
41
+ * (conservative: wait rather than risk two concurrent ops).
42
+ * * success → alive → WAIT for the holder to release, then acquire (see
43
+ * "Waiting" below). The fast-refusal path is still reachable
44
+ * via an explicit timeout / `wait: false`.
45
+ *
46
+ * ## Waiting for a live holder (default behavior)
47
+ *
48
+ * When a LIVE holder owns the lock, acquisition WAITS by default: it polls
49
+ * (~2s) and acquires the instant the holder releases, rather than refusing
50
+ * fast. A single status line is written to stderr the first time we start
51
+ * waiting ("Waiting for <command> (pid N) to finish…"), never per-poll.
52
+ * This is what an interactive `sync` / `rescue` / `reindex` invocation wants —
53
+ * queue behind the running op instead of erroring out.
54
+ *
55
+ * A bounded escape exists for scripts that must not block forever:
56
+ * - `timeoutSec` option, or the `HQ_OP_LOCK_TIMEOUT` env var (seconds).
57
+ * The option wins over the env. After the bound elapses we throw
58
+ * {@link OperationLockedError} (exit 17) with the same clear refusal
59
+ * message as before.
60
+ * - `timeoutSec === 0` (or `HQ_OP_LOCK_TIMEOUT=0`, or `wait: false`) → do
61
+ * not wait at all; refuse immediately. This is the pre-wait behavior.
62
+ * - absent / negative / unparseable → INFINITE wait (the documented
63
+ * default).
64
+ * Stale-PID takeover is unconditional and happens BEFORE any wait — a dead
65
+ * holder is reclaimed at once regardless of the wait config.
66
+ *
67
+ * Ordering / scope caveats:
68
+ * - This is a CROSS-PROCESS mutex keyed on the holder's PID. Two concurrent
69
+ * acquisitions inside the SAME process share a PID, so the same-process
70
+ * reclaim path lets them stomp each other — in-process concurrent acquire
71
+ * is unsupported (the real consumers — sync / rescue / reindex — are
72
+ * separate processes).
73
+ * - When several distinct processes wait on the same lock, the next one to
74
+ * win the O_EXCL race after a free acquires. Order is best-effort, NOT
75
+ * FIFO — do not depend on arrival order.
43
76
  * - PID reuse is an inherent, un-eliminable race for any PID-based scheme: if
44
77
  * the original holder crashed and the OS later handed its PID to an
45
78
  * unrelated process, we conservatively read that as "still held" and
@@ -104,6 +137,100 @@ export interface LockHandle {
104
137
  release(): void;
105
138
  }
106
139
 
140
/**
 * Milliseconds a waiter sleeps between acquisition attempts when a live
 * holder owns the lock and the caller supplied no `pollIntervalMs`.
 */
export const DEFAULT_LOCK_POLL_MS = 2_000;
142
+
143
/**
 * Caller-tunable behaviour for `acquireOperationLock*` when the lock is held
 * by a process that is still alive.
 */
export interface AcquireOptions {
  /**
   * Whether to queue behind a live holder. Omitted or `true` → poll until the
   * lock frees, then take it. `false` → throw {@link OperationLockedError}
   * straight away; this forces a zero timeout and therefore overrides any
   * `timeoutSec` / `HQ_OP_LOCK_TIMEOUT` value. Passing `timeoutSec: 0` has the
   * same effect as `wait: false`.
   */
  wait?: boolean;
  /**
   * Upper bound on the wait, in seconds. Once it elapses, acquisition throws
   * {@link OperationLockedError} (exit 17).
   *
   * - Precedence: this option, then the `HQ_OP_LOCK_TIMEOUT` env var, then
   *   an unbounded wait.
   * - `0` → never wait; refuse immediately.
   * - Negative or non-finite → treated as "no bound" (wait forever).
   * - Fractions of a second are honored (used by tests); the CLI flags accept
   *   whole seconds.
   */
  timeoutSec?: number;
  /**
   * Milliseconds between acquisition attempts while waiting. Missing, zero,
   * or negative → {@link DEFAULT_LOCK_POLL_MS}.
   */
  pollIntervalMs?: number;
  /**
   * Called a single time, just before the first sleep on a live holder. When
   * omitted, one "Waiting for …" line is written to stderr. Supply your own
   * callback (a no-op is fine) to redirect or suppress that line.
   */
  onWaitStart?: (holder: LockInfo, attempted: string) => void;
}
170
+
171
/** Wait settings after options and environment have been merged. */
interface ResolvedWaitConfig {
  /**
   * Wait budget in milliseconds: `null` means unbounded, `0` means refuse
   * without waiting, any positive value is the maximum time to wait.
   */
  timeoutMs: number | null;
  /** Milliseconds to sleep between acquisition attempts. */
  pollMs: number;
  /** Callback fired when waiting on a live holder begins. */
  onWaitStart: (holder: LockInfo, attempted: string) => void;
}
177
+
178
/**
 * Built-in wait notice: writes one line to stderr naming the command and PID
 * that currently hold the lock, and the command queued behind them.
 *
 * @param holder    Lock record of the process we are waiting on.
 * @param attempted Name of the command that is trying to acquire the lock.
 */
function defaultOnWaitStart(holder: LockInfo, attempted: string): void {
  const waitingOn = `Waiting for "${holder.command}" (pid ${holder.pid}) to finish before `;
  const queued = `starting "${attempted}"… (set HQ_OP_LOCK_TIMEOUT=<secs> to bound the wait)\n`;
  process.stderr.write(waitingOn + queued);
}
185
+
186
/**
 * Merge explicit options with the `HQ_OP_LOCK_TIMEOUT` env var into the
 * effective wait settings.
 *
 * Timeout precedence: `wait: false` (forces 0) > `opts.timeoutSec` > env var >
 * unbounded. A `timeoutSec` that is present but negative / non-finite resolves
 * to "unbounded"; it does NOT fall through to the env var.
 *
 * @param opts Caller-supplied acquisition options.
 * @returns `timeoutMs` (`null` = wait forever, `0` = do not wait), the poll
 *          interval in ms, and the wait-start callback to use.
 */
function resolveWaitConfig(opts: AcquireOptions): ResolvedWaitConfig {
  // Seconds → ms, or null for "no explicit bound". Only a finite,
  // non-negative number is a bound; NaN, ±Infinity and negatives are not.
  const toMs = (sec: number | undefined): number | null => {
    if (sec === undefined || !Number.isFinite(sec) || sec < 0) return null;
    return Math.round(sec * 1000);
  };

  let timeoutMs: number | null;
  if (opts.timeoutSec !== undefined) {
    timeoutMs = toMs(opts.timeoutSec);
  } else {
    // Trim before converting: Number("   ") is 0, so a whitespace-only value
    // would otherwise silently mean "refuse immediately" instead of being
    // treated as unset/unparseable (infinite wait).
    const envRaw = process.env.HQ_OP_LOCK_TIMEOUT?.trim();
    timeoutMs = envRaw ? toMs(Number(envRaw)) : null;
  }

  // `wait: false` is shorthand for a zero-length wait (refuse immediately).
  if (opts.wait === false) timeoutMs = 0;

  // Only a finite, positive interval is usable. An infinite interval would
  // make the waiter sleep forever and never re-check the lock.
  const requestedPoll = opts.pollIntervalMs;
  const pollMs =
    requestedPoll !== undefined && Number.isFinite(requestedPoll) && requestedPoll > 0
      ? requestedPoll
      : DEFAULT_LOCK_POLL_MS;

  return { timeoutMs, pollMs, onWaitStart: opts.onWaitStart ?? defaultOnWaitStart };
}
219
+
220
/**
 * Block the current thread for `ms` milliseconds without busy-spinning (for
 * the synchronous consumers).
 *
 * @param ms Sleep duration. Zero, negative and NaN values return immediately.
 */
function sleepSync(ms: number): void {
  // `!(ms > 0)` rejects NaN as well as non-positive values. Atomics.wait
  // treats a NaN timeout as Infinity, which would block the thread forever.
  if (!(ms > 0)) return;
  // Atomics.wait on a private buffer is a CPU-free sleep: the value at index
  // 0 is 0 and nothing ever notifies it, so the call runs until the timeout
  // and returns "timed-out".
  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
}
228
+
229
/**
 * Non-blocking sleep for the async consumer.
 *
 * @param ms Sleep duration. Non-positive and NaN values schedule a zero-delay
 *           timer. Values above the largest delay `setTimeout` supports are
 *           clamped to that maximum, so the promise may resolve earlier than
 *           requested; callers that poll in a loop simply sleep again.
 * @returns A promise that resolves once the timer fires.
 */
function sleepAsync(ms: number): Promise<void> {
  // Node coerces delays above 2^31-1 ms (including Infinity) to 1 ms, which
  // would turn a "very long" sleep into a tight poll loop. Clamp instead.
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const delay = ms > 0 ? Math.min(ms, MAX_TIMER_DELAY_MS) : 0;
  return new Promise((resolve) => setTimeout(resolve, delay));
}
233
+
107
234
  function stateDir(): string {
108
235
  return process.env.HQ_STATE_DIR || path.join(os.homedir(), ".hq");
109
236
  }
@@ -200,35 +327,45 @@ function makeHandle(p: string, info: LockInfo): LockHandle {
200
327
 
201
328
  const NOOP_HANDLE_BASE = { release() {} };
202
329
 
203
- /**
204
- * Acquire the per-root operation lock for `command`. Returns a {@link LockHandle}
205
- * on success; throws {@link OperationLockedError} when a live holder owns it.
206
- * Reclaims a stale lock (dead holder) transparently.
207
- */
208
- export function acquireOperationLock(hqRoot: string, command: string): LockHandle {
209
- if (process.env.HQ_DISABLE_OP_LOCK === "1") {
210
- const info: LockInfo = {
211
- pid: process.pid,
212
- command,
213
- startedAt: new Date().toISOString(),
214
- hqRoot: path.resolve(hqRoot),
215
- };
216
- return { ...NOOP_HANDLE_BASE, path: "", info };
217
- }
330
/**
 * Handle returned when locking is switched off via `HQ_DISABLE_OP_LOCK=1`.
 * No lock file is involved: `path` is empty, `release()` comes from
 * `NOOP_HANDLE_BASE`, and `info` describes the current process.
 */
function disabledHandle(hqRoot: string, command: string): LockHandle {
  return {
    ...NOOP_HANDLE_BASE,
    path: "",
    info: {
      pid: process.pid,
      command,
      startedAt: new Date().toISOString(),
      hqRoot: path.resolve(hqRoot),
    },
  };
}
218
340
 
341
/**
 * Set up one acquisition: make sure the directory that will contain the lock
 * file exists, and build the record describing this process.
 *
 * @returns `p` — lock file path; `info` — the holder record for this process;
 *          `payload` — `info` serialized as 2-space-indented JSON.
 */
function prepareLock(hqRoot: string, command: string): { p: string; info: LockInfo; payload: string } {
  const lockFile = lockPathFor(hqRoot);
  fs.mkdirSync(path.dirname(lockFile), { recursive: true });

  const info: LockInfo = {
    pid: process.pid,
    command,
    startedAt: new Date().toISOString(),
    hqRoot: path.resolve(hqRoot),
  };
  const payload = JSON.stringify(info, null, 2);

  return { p: lockFile, info, payload };
}
229
353
 
354
+ /**
355
+ * One acquisition pass. Returns the {@link LockHandle} on success, or
356
+ * `{ busy }` naming the LIVE holder that blocked us (so the caller can decide
357
+ * to wait or refuse). A stale/torn/own-leftover lock is reclaimed in-pass and
358
+ * never reported as busy. Throws only on genuinely pathological churn or a
359
+ * non-EEXIST fs error.
360
+ */
361
+ function tryAcquireOnce(
362
+ p: string,
363
+ info: LockInfo,
364
+ payload: string,
365
+ ): { handle: LockHandle } | { busy: LockInfo } {
230
366
  // Bounded retry: each iteration is one atomic create attempt. EEXIST against
231
- // a stale holder reclaims and retries; EEXIST against a live holder refuses.
367
+ // a stale holder reclaims and retries; EEXIST against a live holder reports
368
+ // it as busy.
232
369
  const MAX_ATTEMPTS = 5;
233
370
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
234
371
  let fd: number;
@@ -239,7 +376,7 @@ export function acquireOperationLock(hqRoot: string, command: string): LockHandl
239
376
 
240
377
  const holder = readLockInfo(p);
241
378
  if (holder && holder.pid !== process.pid && pidAlive(holder.pid)) {
242
- throw new OperationLockedError(holder, command);
379
+ return { busy: holder };
243
380
  }
244
381
  // Stale (dead holder), unreadable/torn, or our own leftover → reclaim.
245
382
  try {
@@ -254,7 +391,7 @@ export function acquireOperationLock(hqRoot: string, command: string): LockHandl
254
391
  } finally {
255
392
  fs.closeSync(fd);
256
393
  }
257
- return makeHandle(p, info);
394
+ return { handle: makeHandle(p, info) };
258
395
  }
259
396
 
260
397
  // Pathological churn (another process reclaiming in lockstep). Surface it
@@ -264,13 +401,83 @@ export function acquireOperationLock(hqRoot: string, command: string): LockHandl
264
401
  );
265
402
  }
266
403
 
404
/**
 * Milliseconds between now and `deadline` (an epoch-ms timestamp). A `null`
 * deadline never expires and yields `Infinity`; a deadline in the past yields
 * a negative number.
 */
function remainingMs(deadline: number | null): number {
  if (deadline === null) return Infinity;
  return deadline - Date.now();
}
408
+
409
/**
 * Synchronously take the per-root operation lock on behalf of `command`.
 *
 * While another live process holds the lock this call blocks the thread,
 * retrying every poll interval, and returns as soon as an attempt succeeds.
 * The wait can be bounded or disabled through `opts.timeoutSec` / `opts.wait`
 * or the `HQ_OP_LOCK_TIMEOUT` env var. With `HQ_DISABLE_OP_LOCK=1` no locking
 * happens and a no-op handle is returned.
 *
 * @param hqRoot  Root directory the lock is scoped to.
 * @param command Name of the operation requesting the lock.
 * @param opts    Wait behaviour; defaults to waiting without a bound.
 * @returns A {@link LockHandle} for the acquired lock.
 * @throws {OperationLockedError} When a live holder still owns the lock and
 *         the wait budget is zero or has run out.
 */
export function acquireOperationLock(
  hqRoot: string,
  command: string,
  opts: AcquireOptions = {},
): LockHandle {
  if (process.env.HQ_DISABLE_OP_LOCK === "1") {
    return disabledHandle(hqRoot, command);
  }

  const { p, info, payload } = prepareLock(hqRoot, command);
  const waitCfg = resolveWaitConfig(opts);
  const deadline = waitCfg.timeoutMs === null ? null : Date.now() + waitCfg.timeoutMs;

  let waitAnnounced = false;
  while (true) {
    const attempt = tryAcquireOnce(p, info, payload);
    if ("handle" in attempt) return attempt.handle;

    // Blocked by a live holder: give up if the budget is spent, else wait.
    const holder = attempt.busy;
    if (remainingMs(deadline) <= 0) {
      throw new OperationLockedError(holder, command);
    }

    // Notify only once, before the first sleep.
    if (!waitAnnounced) {
      waitAnnounced = true;
      waitCfg.onWaitStart(holder, command);
    }

    // Never sleep past the deadline.
    sleepSync(Math.min(waitCfg.pollMs, remainingMs(deadline)));
  }
}
442
+
443
/**
 * Promise-based variant of the operation-lock acquisition. It follows the
 * same steps as the synchronous version, but between attempts it awaits a
 * `setTimeout`-backed sleep, so the event loop keeps running while we wait
 * on a live holder.
 *
 * @param hqRoot  Root directory the lock is scoped to.
 * @param command Name of the operation requesting the lock.
 * @param opts    Wait behaviour; defaults to waiting without a bound.
 * @returns A promise for the {@link LockHandle} of the acquired lock.
 * @throws {OperationLockedError} When a live holder still owns the lock and
 *         the wait budget is zero or has run out.
 */
export async function acquireOperationLockAsync(
  hqRoot: string,
  command: string,
  opts: AcquireOptions = {},
): Promise<LockHandle> {
  if (process.env.HQ_DISABLE_OP_LOCK === "1") {
    return disabledHandle(hqRoot, command);
  }

  const { p, info, payload } = prepareLock(hqRoot, command);
  const waitCfg = resolveWaitConfig(opts);
  const deadline = waitCfg.timeoutMs === null ? null : Date.now() + waitCfg.timeoutMs;

  let waitAnnounced = false;
  while (true) {
    const attempt = tryAcquireOnce(p, info, payload);
    if ("handle" in attempt) return attempt.handle;

    // Blocked by a live holder: give up if the budget is spent, else wait.
    const holder = attempt.busy;
    if (remainingMs(deadline) <= 0) {
      throw new OperationLockedError(holder, command);
    }

    // Notify only once, before the first sleep.
    if (!waitAnnounced) {
      waitAnnounced = true;
      waitCfg.onWaitStart(holder, command);
    }

    // Never sleep past the deadline.
    await sleepAsync(Math.min(waitCfg.pollMs, remainingMs(deadline)));
  }
}
472
+
267
473
  /** Run `fn` while holding the per-root lock for `command` (async). */
268
474
  export async function withOperationLock<T>(
269
475
  hqRoot: string,
270
476
  command: string,
271
477
  fn: () => Promise<T>,
478
+ opts: AcquireOptions = {},
272
479
  ): Promise<T> {
273
- const handle = acquireOperationLock(hqRoot, command);
480
+ const handle = await acquireOperationLockAsync(hqRoot, command, opts);
274
481
  try {
275
482
  return await fn();
276
483
  } finally {
@@ -283,8 +490,9 @@ export function withOperationLockSync<T>(
283
490
  hqRoot: string,
284
491
  command: string,
285
492
  fn: () => T,
493
+ opts: AcquireOptions = {},
286
494
  ): T {
287
- const handle = acquireOperationLock(hqRoot, command);
495
+ const handle = acquireOperationLock(hqRoot, command, opts);
288
496
  try {
289
497
  return fn();
290
498
  } finally {