@indigoai-us/hq-cloud 6.2.7 → 6.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/dist/bin/sync-runner.d.ts.map +1 -1
  2. package/dist/bin/sync-runner.js +20 -1
  3. package/dist/bin/sync-runner.js.map +1 -1
  4. package/dist/bin/sync-runner.test.js +61 -0
  5. package/dist/bin/sync-runner.test.js.map +1 -1
  6. package/dist/cli/reindex.d.ts +8 -0
  7. package/dist/cli/reindex.d.ts.map +1 -1
  8. package/dist/cli/reindex.js +222 -198
  9. package/dist/cli/reindex.js.map +1 -1
  10. package/dist/cli/reindex.test.js +35 -0
  11. package/dist/cli/reindex.test.js.map +1 -1
  12. package/dist/cli/rescue.d.ts.map +1 -1
  13. package/dist/cli/rescue.js +39 -16
  14. package/dist/cli/rescue.js.map +1 -1
  15. package/dist/cli/rescue.reindex.test.js +15 -2
  16. package/dist/cli/rescue.reindex.test.js.map +1 -1
  17. package/dist/cli/sync.d.ts.map +1 -1
  18. package/dist/cli/sync.js +3 -1
  19. package/dist/cli/sync.js.map +1 -1
  20. package/dist/cli/sync.test.js +2 -1
  21. package/dist/cli/sync.test.js.map +1 -1
  22. package/dist/operation-lock.d.ts +100 -0
  23. package/dist/operation-lock.d.ts.map +1 -0
  24. package/dist/operation-lock.js +256 -0
  25. package/dist/operation-lock.js.map +1 -0
  26. package/dist/operation-lock.test.d.ts +5 -0
  27. package/dist/operation-lock.test.d.ts.map +1 -0
  28. package/dist/operation-lock.test.js +140 -0
  29. package/dist/operation-lock.test.js.map +1 -0
  30. package/package.json +1 -1
  31. package/src/bin/sync-runner.test.ts +77 -0
  32. package/src/bin/sync-runner.ts +22 -1
  33. package/src/cli/reindex.test.ts +45 -0
  34. package/src/cli/reindex.ts +36 -0
  35. package/src/cli/rescue.reindex.test.ts +17 -2
  36. package/src/cli/rescue.ts +40 -15
  37. package/src/cli/sync.test.ts +2 -1
  38. package/src/cli/sync.ts +3 -1
  39. package/src/operation-lock.test.ts +162 -0
  40. package/src/operation-lock.ts +293 -0
@@ -17,10 +17,24 @@
17
17
  import { spawnSync } from "child_process";
18
18
  import * as fs from "fs";
19
19
  import * as path from "path";
20
+ import {
21
+ acquireOperationLock,
22
+ OperationLockedError,
23
+ OPERATION_LOCKED_EXIT,
24
+ type LockHandle,
25
+ } from "../operation-lock.js";
20
26
 
21
27
  export interface ReindexOptions {
22
28
  /** HQ root to operate on. Defaults to process.cwd(). */
23
29
  repoRoot?: string;
30
+ /**
31
+ * Skip the per-root operation lock. Internal callers (`sync()` / `rescue()`)
32
+ * already hold the lock for this root and pass `true` so reindex doesn't try
33
+ * to re-acquire it and then release it out from under them. Standalone callers
34
+ * (`hq reindex`, the reindex hook) leave it falsy so reindex is mutually
35
+ * exclusive with a running sync/rescue.
36
+ */
37
+ skipLock?: boolean;
24
38
  }
25
39
 
26
40
  export interface ReindexResult {
@@ -129,6 +143,25 @@ export function reindex(opts: ReindexOptions = {}): ReindexResult {
129
143
  }
130
144
  const root = path.resolve(rawRoot);
131
145
 
146
+ // Acquire the per-root operation lock unless an internal caller (sync/rescue,
147
+ // which already hold it) opted out. A live holder → refuse fast with the
148
+ // holder's command + PID. The whole body runs inside the try so the lock is
149
+ // released on every exit path (the process-level signal/exit hooks are the
150
+ // crash backstop).
151
+ let opLock: LockHandle | null = null;
152
+ if (!opts.skipLock) {
153
+ try {
154
+ opLock = acquireOperationLock(root, "reindex");
155
+ } catch (err) {
156
+ if (err instanceof OperationLockedError) {
157
+ warn(err.message);
158
+ return { status: OPERATION_LOCKED_EXIT };
159
+ }
160
+ throw err;
161
+ }
162
+ }
163
+ try {
164
+
132
165
  fs.mkdirSync(path.join(root, ".claude", "skills"), { recursive: true });
133
166
 
134
167
  // --- Build (namespace, src_rel) pairs -------------------------------------
@@ -363,4 +396,7 @@ export function reindex(opts: ReindexOptions = {}): ReindexResult {
363
396
  }
364
397
 
365
398
  return { status: 0 };
399
+ } finally {
400
+ opLock?.release();
401
+ }
366
402
  }
@@ -7,7 +7,10 @@
7
7
  * runs, and ./reindex.js is mocked to a spy so we assert the call without
8
8
  * touching disk.
9
9
  */
10
- import { describe, it, expect, vi, beforeEach } from "vitest";
10
+ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
11
+ import * as fs from "fs";
12
+ import * as os from "os";
13
+ import * as path from "path";
11
14
 
12
15
  vi.mock("./rescue-core.js", () => ({
13
16
  runRescue: vi.fn(() => ({ status: 0 })),
@@ -21,16 +24,28 @@ import { reindex } from "./reindex.js";
21
24
  import { rescue } from "./rescue.js";
22
25
 
23
26
  describe("rescue → reindex", () => {
27
+ let stateDir: string;
28
+
24
29
  beforeEach(() => {
25
30
  vi.clearAllMocks();
26
31
  (runRescue as unknown as ReturnType<typeof vi.fn>).mockReturnValue({ status: 0 });
32
+ // rescue() now takes the per-root operation lock; redirect it to a tmp dir.
33
+ stateDir = fs.mkdtempSync(path.join(os.tmpdir(), "rescue-reindex-state-"));
34
+ process.env.HQ_STATE_DIR = stateDir;
35
+ });
36
+
37
+ afterEach(() => {
38
+ fs.rmSync(stateDir, { recursive: true, force: true });
39
+ delete process.env.HQ_STATE_DIR;
27
40
  });
28
41
 
29
42
  it("refreshes via reindex after a successful rescue", () => {
30
43
  const r = rescue({ hqRoot: "/tmp/hq", assumeYes: true });
31
44
  expect(r.status).toBe(0);
32
45
  expect(reindex).toHaveBeenCalledTimes(1);
33
- expect(reindex).toHaveBeenCalledWith({ repoRoot: "/tmp/hq" });
46
+ // skipLock: rescue already holds the per-root operation lock, so the
47
+ // internal reindex must not try to re-acquire it.
48
+ expect(reindex).toHaveBeenCalledWith({ repoRoot: "/tmp/hq", skipLock: true });
34
49
  });
35
50
 
36
51
  it("does NOT run reindex on a dry-run", () => {
package/src/cli/rescue.ts CHANGED
@@ -14,6 +14,11 @@
14
14
  */
15
15
  import { reindex } from "./reindex.js";
16
16
  import { runRescue } from "./rescue-core.js";
17
+ import {
18
+ acquireOperationLock,
19
+ OperationLockedError,
20
+ OPERATION_LOCKED_EXIT,
21
+ } from "../operation-lock.js";
17
22
 
18
23
  export interface RescueOptions {
19
24
  /** HQ root to operate on. Passed as `--hq-root`. Defaults to the script's
@@ -91,21 +96,41 @@ export function buildRescueArgs(opts: RescueOptions = {}): string[] {
91
96
  * confirmation prompt (unless `assumeYes` is set).
92
97
  */
93
98
  export function rescue(opts: RescueOptions = {}): RescueResult {
94
- const args = buildRescueArgs(opts);
95
- const env: NodeJS.ProcessEnv = { ...process.env };
96
- if (opts.ghToken) env.GH_TOKEN = opts.ghToken;
97
- const { status } = runRescue(args, { env });
98
- // A successful, non-dry-run rescue re-lays-down core/, so refresh the
99
- // generated skill wrappers, personal-overlay mirrors, and workers registry.
100
- // Best-effort + idempotent — never overrides the rescue's own exit status.
101
- // repoRoot falls back to process.cwd() (reindex's default) when hqRoot is
102
- // omitted, matching the rescue script's own cwd-based default.
103
- if (status === 0 && !opts.dryRun) {
104
- try {
105
- reindex({ repoRoot: opts.hqRoot });
106
- } catch {
107
- // best-effort
99
+ // Rescue is mutually exclusive with sync/reindex on this HQ root. Key the
100
+ // lock on the same root the rescue script resolves (cwd when --hq-root is
101
+ // omitted). Rescue is never the exempt push watcher, so it always locks.
102
+ const lockRoot = opts.hqRoot ?? process.cwd();
103
+ let handle;
104
+ try {
105
+ handle = acquireOperationLock(lockRoot, "rescue");
106
+ } catch (err) {
107
+ if (err instanceof OperationLockedError) {
108
+ process.stderr.write(err.message + "\n");
109
+ return { status: OPERATION_LOCKED_EXIT };
108
110
  }
111
+ throw err;
112
+ }
113
+ try {
114
+ const args = buildRescueArgs(opts);
115
+ const env: NodeJS.ProcessEnv = { ...process.env };
116
+ if (opts.ghToken) env.GH_TOKEN = opts.ghToken;
117
+ const { status } = runRescue(args, { env });
118
+ // A successful, non-dry-run rescue re-lays-down core/, so refresh the
119
+ // generated skill wrappers, personal-overlay mirrors, and workers registry.
120
+ // Best-effort + idempotent — never overrides the rescue's own exit status.
121
+ // repoRoot falls back to process.cwd() (reindex's default) when hqRoot is
122
+ // omitted, matching the rescue script's own cwd-based default. `skipLock`
123
+ // because we already hold the per-root lock — reindex's own acquire would
124
+ // otherwise reclaim the lock as ours and release it while we still run.
125
+ if (status === 0 && !opts.dryRun) {
126
+ try {
127
+ reindex({ repoRoot: opts.hqRoot, skipLock: true });
128
+ } catch {
129
+ // best-effort
130
+ }
131
+ }
132
+ return { status };
133
+ } finally {
134
+ handle.release();
109
135
  }
110
- return { status };
111
136
  }
@@ -123,7 +123,8 @@ describe("sync", () => {
123
123
 
124
124
  it("runs reindex against hqRoot after a sync that downloaded files", async () => {
125
125
  await sync({ company: "acme", vaultConfig: mockConfig, hqRoot: tmpDir });
126
- expect(reindex).toHaveBeenCalledWith({ repoRoot: tmpDir });
126
+ // skipLock: the surrounding sync run already holds the per-root lock.
127
+ expect(reindex).toHaveBeenCalledWith({ repoRoot: tmpDir, skipLock: true });
127
128
  });
128
129
 
129
130
  it("skips reindex when skipReindex is set", async () => {
package/src/cli/sync.ts CHANGED
@@ -1338,7 +1338,9 @@ export async function sync(options: SyncOptions): Promise<SyncResult> {
1338
1338
  shrinkResult.cleanRemoved > 0;
1339
1339
  if (!options.skipReindex && changedOnDisk) {
1340
1340
  try {
1341
- reindex({ repoRoot: hqRoot });
1341
+ // skipLock: the surrounding sync run already holds this root's operation
1342
+ // lock; reindex re-acquiring would reclaim it and release it out from under us.
1343
+ reindex({ repoRoot: hqRoot, skipLock: true });
1342
1344
  } catch {
1343
1345
  // best-effort: a post-sync refresh failure never fails the sync
1344
1346
  }
@@ -0,0 +1,162 @@
1
+ /**
2
+ * Unit tests for the per-HQ-root operation mutex.
3
+ */
4
+
5
+ import { describe, it, expect, beforeEach, afterEach } from "vitest";
6
+ import { spawnSync } from "child_process";
7
+ import * as fs from "fs";
8
+ import * as os from "os";
9
+ import * as path from "path";
10
+ import {
11
+ acquireOperationLock,
12
+ withOperationLockSync,
13
+ lockPathFor,
14
+ OperationLockedError,
15
+ OPERATION_LOCKED_EXIT,
16
+ type LockInfo,
17
+ } from "./operation-lock.js";
18
+
19
/**
 * Obtain a PID whose process has already exited: synchronously run a node
 * child with an empty script and hand back the pid it was given.
 *
 * @throws Error when the child could not be spawned (no pid reported).
 */
function deadPid(): number {
  const { pid } = spawnSync(process.execPath, ["-e", ""], { stdio: "ignore" });
  if (pid) return pid;
  throw new Error("could not spawn to obtain a dead pid");
}
25
+
26
/**
 * Test helper: write a lock file at `p` as compact JSON, creating parent
 * directories as needed. Fields missing from `info` take fixed defaults
 * (pid 1, command "sync", epoch start time, root "/x").
 */
function writeLock(p: string, info: Partial<LockInfo>): void {
  const defaults: LockInfo = {
    pid: 1,
    command: "sync",
    startedAt: new Date(0).toISOString(),
    hqRoot: "/x",
  };
  const record: LockInfo = Object.assign({}, defaults, info);
  fs.mkdirSync(path.dirname(p), { recursive: true });
  fs.writeFileSync(p, JSON.stringify(record));
}
37
+
38
+ describe("operation-lock", () => {
39
+ let stateDir: string;
40
+ let rootA: string;
41
+ let rootB: string;
42
+
43
+ beforeEach(() => {
44
+ stateDir = fs.mkdtempSync(path.join(os.tmpdir(), "hq-oplock-state-"));
45
+ process.env.HQ_STATE_DIR = stateDir;
46
+ delete process.env.HQ_DISABLE_OP_LOCK;
47
+ rootA = fs.mkdtempSync(path.join(os.tmpdir(), "hq-rootA-"));
48
+ rootB = fs.mkdtempSync(path.join(os.tmpdir(), "hq-rootB-"));
49
+ });
50
+
51
+ afterEach(() => {
52
+ fs.rmSync(stateDir, { recursive: true, force: true });
53
+ fs.rmSync(rootA, { recursive: true, force: true });
54
+ fs.rmSync(rootB, { recursive: true, force: true });
55
+ delete process.env.HQ_STATE_DIR;
56
+ delete process.env.HQ_DISABLE_OP_LOCK;
57
+ });
58
+
59
+ it("the lock path is under the state dir, keyed per canonical root", () => {
60
+ const a = lockPathFor(rootA);
61
+ const b = lockPathFor(rootB);
62
+ expect(a.startsWith(path.join(stateDir, "locks"))).toBe(true);
63
+ expect(a).not.toBe(b); // different roots → different lock files
64
+ // canonical: a trailing-slash variant maps to the same lock
65
+ expect(lockPathFor(rootA + path.sep)).toBe(a);
66
+ });
67
+
68
+ it("acquires, writes holder info, and releases (file gone after release)", () => {
69
+ const h = acquireOperationLock(rootA, "sync");
70
+ expect(fs.existsSync(h.path)).toBe(true);
71
+ const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
72
+ expect(info.pid).toBe(process.pid);
73
+ expect(info.command).toBe("sync");
74
+ h.release();
75
+ expect(fs.existsSync(h.path)).toBe(false);
76
+ });
77
+
78
+ it("refuses fast with the holder's command + pid when a LIVE process holds it", () => {
79
+ // Simulate a DIFFERENT live process holding the lock. PID 1 (init/systemd)
80
+ // is always alive and is never our own pid, so kill(1,0) reports alive and
81
+ // the same-process reclaim path does not apply.
82
+ writeLock(lockPathFor(rootA), { pid: 1, command: "rescue" });
83
+ expect(() => acquireOperationLock(rootA, "sync")).toThrowError(OperationLockedError);
84
+ try {
85
+ acquireOperationLock(rootA, "sync");
86
+ } catch (e) {
87
+ const err = e as OperationLockedError;
88
+ expect(err.holder.command).toBe("rescue");
89
+ expect(err.holder.pid).toBe(1);
90
+ expect(err.message).toContain("rescue");
91
+ expect(err.message).toContain("pid 1");
92
+ }
93
+ });
94
+
95
+ it("reclaims a stale lock whose holder PID is dead (takeover)", () => {
96
+ const stale = deadPid();
97
+ writeLock(lockPathFor(rootA), { pid: stale, command: "sync" });
98
+ // The dead holder must not block us.
99
+ const h = acquireOperationLock(rootA, "rescue");
100
+ const info = JSON.parse(fs.readFileSync(h.path, "utf8")) as LockInfo;
101
+ expect(info.pid).toBe(process.pid); // we took it over
102
+ expect(info.command).toBe("rescue");
103
+ h.release();
104
+ });
105
+
106
+ it("reclaims a torn/unreadable lock file", () => {
107
+ const p = lockPathFor(rootA);
108
+ fs.mkdirSync(path.dirname(p), { recursive: true });
109
+ fs.writeFileSync(p, "{ this is not valid json");
110
+ const h = acquireOperationLock(rootA, "reindex");
111
+ expect(fs.existsSync(h.path)).toBe(true);
112
+ h.release();
113
+ });
114
+
115
+ it("different HQ roots are independent — both may hold concurrently", () => {
116
+ const a = acquireOperationLock(rootA, "sync");
117
+ const b = acquireOperationLock(rootB, "rescue"); // must NOT refuse
118
+ expect(fs.existsSync(a.path)).toBe(true);
119
+ expect(fs.existsSync(b.path)).toBe(true);
120
+ expect(a.path).not.toBe(b.path);
121
+ a.release();
122
+ b.release();
123
+ });
124
+
125
+ it("the same root is mutually exclusive across different commands", () => {
126
+ // A live sync in ANOTHER process holds the root (pid 1 stands in for it).
127
+ const p = lockPathFor(rootA);
128
+ writeLock(p, { pid: 1, command: "sync" });
129
+ // Neither rescue nor reindex may acquire while that sync holds it.
130
+ expect(() => acquireOperationLock(rootA, "rescue")).toThrowError(OperationLockedError);
131
+ expect(() => acquireOperationLock(rootA, "reindex")).toThrowError(OperationLockedError);
132
+ // Once that sync finishes (its lock is gone), the next command acquires.
133
+ fs.unlinkSync(p);
134
+ const h2 = acquireOperationLock(rootA, "reindex");
135
+ expect(fs.existsSync(h2.path)).toBe(true);
136
+ h2.release();
137
+ });
138
+
139
+ it("withOperationLockSync releases even when the body throws", () => {
140
+ const p = lockPathFor(rootA);
141
+ expect(() =>
142
+ withOperationLockSync(rootA, "reindex", () => {
143
+ expect(fs.existsSync(p)).toBe(true); // held during the body
144
+ throw new Error("boom");
145
+ }),
146
+ ).toThrow("boom");
147
+ expect(fs.existsSync(p)).toBe(false); // released on the way out
148
+ });
149
+
150
+ it("HQ_DISABLE_OP_LOCK=1 makes acquisition a no-op", () => {
151
+ process.env.HQ_DISABLE_OP_LOCK = "1";
152
+ // Even with a live holder on record, the escape hatch acquires without error.
153
+ writeLock(lockPathFor(rootA), { pid: process.pid, command: "sync" });
154
+ const h = acquireOperationLock(rootA, "rescue");
155
+ expect(h.path).toBe(""); // no real lock file written
156
+ h.release(); // no-op, no throw
157
+ });
158
+
159
+ it("OPERATION_LOCKED_EXIT is a stable non-zero code", () => {
160
+ expect(OPERATION_LOCKED_EXIT).toBe(17);
161
+ });
162
+ });
@@ -0,0 +1,293 @@
1
+ /**
2
+ * Per-HQ-root mutual exclusion for the long-running operations
3
+ * (`sync`, `rescue`, `reindex`).
4
+ *
5
+ * Contract:
6
+ * - At most ONE of sync / rescue / reindex runs at a time **per HQ root**.
7
+ * The lock is shared across all three (keyed only by the root, not the
8
+ * command), so e.g. a rescue refuses while a sync holds it. Different HQ
9
+ * roots are fully independent — they hash to different lock files.
10
+ * - The push watcher / watch+event-push runner is EXEMPT: it never calls in
11
+ * here, so it neither takes the lock nor is blocked by it (its targeted
12
+ * in-process push passes are likewise lock-free).
13
+ *
14
+ * ## Where the lock lives — and why
15
+ *
16
+ * `<stateDir>/locks/operation-<hash(canonicalRoot)>.lock`, where
17
+ * `stateDir = $HQ_STATE_DIR || ~/.hq`. This is deliberately NOT inside the HQ
18
+ * root:
19
+ * - It must never round-trip to the cloud. A lock is machine-local, per-run
20
+ * state; syncing it to S3 (and thence to other machines/roots) would be a
21
+ * correctness bug. `~/.hq` is the established machine-local state dir
22
+ * (journals already live there) and is never synced.
23
+ * - `rescue` repairs a possibly-broken HQ root; a lock that depends on the
24
+ * root being healthy is exactly backwards. `~/.hq` is independent of the
25
+ * root's health.
26
+ * - Keying the filename by a hash of the *canonical* root path makes the
27
+ * lock per-root and prevents leakage across roots, while keeping the path
28
+ * short and filesystem-safe.
29
+ *
30
+ * ## Atomicity, liveness, takeover
31
+ *
32
+ * - Acquisition uses `open(…, "wx")` (O_CREAT | O_EXCL) — an atomic
33
+ * create-if-absent. Exactly one racer can create the file; the loser sees
34
+ * EEXIST and re-evaluates.
35
+ * - The lock records the holder's `{ pid, command, startedAt, hqRoot }`. On
36
+ * EEXIST we test the recorded PID with `process.kill(pid, 0)`:
37
+ * * ESRCH → the holder is gone (crashed / killed -9 / stale file) →
38
+ * reclaim the lock.
39
+ * * EPERM → the PID exists but is owned by another user → treat as ALIVE
40
+ * (conservative: refuse rather than risk two concurrent ops).
41
+ * * success → alive → refuse fast with {@link OperationLockedError}
42
+ * naming the holding command + PID.
43
+ * - PID reuse is an inherent, un-eliminable race for any PID-based scheme: if
44
+ * the original holder crashed and the OS later handed its PID to an
45
+ * unrelated process, we conservatively read that as "still held" and
46
+ * refuse. We accept that false-busy over the far worse false-free, and
47
+ * record `startedAt`/`command` so an operator can diagnose a wedged lock.
48
+ *
49
+ * ## Release
50
+ *
51
+ * - Normal exit: the `with*` wrappers release in a `finally`.
52
+ * - Signals (SIGINT/SIGTERM/SIGHUP): a one-time handler releases every held lock,
53
+ * then re-raises the default disposition so exit status is unchanged.
54
+ * - Hard crash (SIGKILL / power loss): nothing runs, but the stale-PID
55
+ * takeover above reclaims the lock on the next attempt.
56
+ * - `process.on("exit")`: a final best-effort synchronous unlink.
57
+ *
58
+ * ## Escape hatch
59
+ *
60
+ * `HQ_DISABLE_OP_LOCK=1` makes acquisition a no-op (returns a handle whose
61
+ * release does nothing). For emergencies and for callers that manage
62
+ * exclusion themselves; documented, off by default.
63
+ */
64
+
65
+ import * as crypto from "crypto";
66
+ import * as fs from "fs";
67
+ import * as os from "os";
68
+ import * as path from "path";
69
+
70
+ /** Process exit code used when an operation is refused because the lock is held. */
71
+ export const OPERATION_LOCKED_EXIT = 17;
72
+
73
/** Holder record that is serialized as JSON into the lock file. */
export interface LockInfo {
  /** PID of the process that took the lock. */
  pid: number;
  /** Name of the operation that took the lock. */
  command: string;
  /** ISO-8601 acquisition time. */
  startedAt: string;
  /** Canonical HQ root the lock guards (diagnostic only). */
  hqRoot: string;
}
81
+
82
/**
 * Raised by `acquireOperationLock` when the lock is owned by a live holder.
 * Carries the holder's record and the name of the command that was refused.
 */
export class OperationLockedError extends Error {
  /** Record read from the lock file of the current holder. */
  public readonly holder: LockInfo;
  /** Command that tried (and failed) to take the lock. */
  public readonly attempted: string;

  constructor(holder: LockInfo, attempted: string) {
    const sentence = [
      `Refusing to start "${attempted}": another HQ operation is already`,
      `running for this HQ root — "${holder.command}" (pid ${holder.pid},`,
      `started ${holder.startedAt}). Wait for it to finish, or stop that`,
      `process, then retry.`,
    ].join(" ");
    super(sentence);
    this.holder = holder;
    this.attempted = attempted;
    this.name = "OperationLockedError";
  }
}
97
+
98
/** What a successful acquisition hands back to the caller. */
export interface LockHandle {
  /** Absolute path of the lock file. */
  readonly path: string;
  /** The holder record written for this acquisition. */
  readonly info: LockInfo;
  /** Release the lock; safe to call repeatedly, and only removes a lock this process still owns. */
  release(): void;
}
106
+
107
/**
 * Directory for machine-local state: `$HQ_STATE_DIR` when it is set to a
 * non-empty value, otherwise `.hq` under the current user's home directory.
 */
function stateDir(): string {
  const override = process.env.HQ_STATE_DIR;
  if (override) return override;
  return path.join(os.homedir(), ".hq");
}
110
+
111
/**
 * Absolute lock path for a given HQ root. Exported for tests.
 *
 * The root is made absolute and then symlink-resolved, so that a root reached
 * through a symlink and the same root reached by its real path map to ONE
 * lock file. If the root cannot be resolved on disk (missing, unreadable —
 * e.g. a broken root about to be rescued) the lexically resolved path is used
 * instead.
 *
 * @param hqRoot HQ root directory; may be relative or carry a trailing separator.
 * @returns `<stateDir>/locks/operation-<first 16 hex chars of sha1(root)>.lock`
 */
export function lockPathFor(hqRoot: string): string {
  const resolved = path.resolve(hqRoot);
  let canon = resolved;
  try {
    canon = fs.realpathSync(resolved);
  } catch {
    // Root does not exist or cannot be traversed: keep the lexical path so a
    // lock can still be taken for it.
  }
  const key = crypto.createHash("sha1").update(canon).digest("hex").slice(0, 16);
  return path.join(stateDir(), "locks", `operation-${key}.lock`);
}
117
+
118
/**
 * Is `pid` a live process? `kill(pid, 0)` sends no signal; it only probes.
 *
 * - Not a positive 32-bit integer → `false` without probing. `process.kill`
 *   rejects such values with an argument error rather than ESRCH, which would
 *   otherwise be read as "alive" and leave a corrupt lock file wedged forever.
 * - ESRCH → no such process (dead/stale) → `false`.
 * - EPERM → the process exists but is not ours → `true` (conservative).
 * - Any other failure → `true`, rather than risk a double-run.
 *
 * @param pid PID recorded in a lock file.
 * @returns whether the PID should be treated as belonging to a running process.
 */
function pidAlive(pid: number): boolean {
  const MAX_PID = 0x7fffffff;
  if (!Number.isInteger(pid) || pid <= 0 || pid > MAX_PID) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    const code = (err as { code?: unknown } | null)?.code;
    // Only a definite "no such process" counts as dead.
    return code !== "ESRCH";
  }
}
134
+
135
/**
 * Read and parse the lock file at `p`.
 *
 * @returns the parsed record when the file holds JSON with a numeric `pid`
 *   and a string `command`; `null` when the file is missing, unreadable, not
 *   valid JSON, or lacks those two fields. The remaining fields are not checked.
 */
function readLockInfo(p: string): LockInfo | null {
  let candidate: unknown;
  try {
    candidate = JSON.parse(fs.readFileSync(p, "utf8"));
  } catch {
    return null;
  }
  if (typeof candidate !== "object" || candidate === null) return null;

  const fields = candidate as { pid?: unknown; command?: unknown };
  const wellFormed = typeof fields.pid === "number" && typeof fields.command === "string";
  return wellFormed ? (candidate as LockInfo) : null;
}
146
+
147
+ // ── Process-wide release plumbing ──────────────────────────────────────────
148
+ // Track every lock this process currently holds so the signal/exit hooks can
149
+ // release all of them. The hooks are installed exactly once.
150
+
151
+ const heldLocks = new Set<LockHandle>();
152
+ let hooksInstalled = false;
153
+
154
/**
 * Delete the lock file at `p`, but only when the pid recorded in it is this
 * process. A file that is missing, unparsable, or records another pid is left
 * alone, so a lock another process took over is never clobbered.
 */
function unlinkIfOwned(p: string): void {
  const recorded = readLockInfo(p);
  const ours = recorded !== null && recorded.pid === process.pid;
  if (!ours) return;
  try {
    fs.unlinkSync(p);
  } catch {
    /* already gone — fine */
  }
}
166
+
167
/**
 * Install the process-wide release hooks, at most once per process.
 *
 * - `exit`: synchronously removes every lock file this process still owns.
 * - SIGINT / SIGTERM / SIGHUP: when this is the ONLY listener for the signal,
 *   release the held locks, detach this listener, and re-send the signal so
 *   the process ends with the normal signal status. When some other listener
 *   is also registered for the signal, that listener owns shutdown: this hook
 *   leaves it (and the locks) alone, and the `exit` hook releases the locks
 *   whenever the process actually exits.
 *
 * Only this module's own listener is ever removed; handlers registered by
 * other code are never touched.
 */
function installHooksOnce(): void {
  if (hooksInstalled) return;
  hooksInstalled = true;

  const releaseAllHeld = (): void => {
    for (const h of heldLocks) unlinkIfOwned(h.path);
  };

  process.on("exit", releaseAllHeld);

  for (const sig of ["SIGINT", "SIGTERM", "SIGHUP"] as const) {
    const onSignal = (): void => {
      // Any registered listener suppresses the default exit, so if another
      // handler exists it decides whether and when the process stops.
      // Re-raising here would cut its shutdown short or invoke it twice.
      if (process.listenerCount(sig) > 1) return;

      releaseAllHeld();
      // Detach only our own listener so the re-sent signal gets the default
      // disposition instead of recursing back into this handler.
      process.removeListener(sig, onSignal);
      process.kill(process.pid, sig);
    };
    process.on(sig, onSignal);
  }
}
186
+
187
/**
 * Wrap a freshly written lock file in a {@link LockHandle}, register the
 * handle in the process-wide set of held locks, and make sure the
 * signal/exit release hooks are installed.
 */
function makeHandle(p: string, info: LockInfo): LockHandle {
  const release = (): void => {
    // Forget the handle first, then remove the file if it is still ours.
    heldLocks.delete(handle);
    unlinkIfOwned(p);
  };
  const handle: LockHandle = { path: p, info, release };

  heldLocks.add(handle);
  installHooksOnce();
  return handle;
}
200
+
201
+ const NOOP_HANDLE_BASE = { release() {} };
202
+
203
/**
 * Take the per-root operation lock on behalf of `command`.
 *
 * With `HQ_DISABLE_OP_LOCK=1` nothing is written: the returned handle has an
 * empty `path` and a no-op `release`. Otherwise the lock file is created
 * exclusively and the holder record is written into it. If the file already
 * exists and records a different, live pid, acquisition is refused. A file
 * that is stale (dead holder), unreadable/torn, or records this process's own
 * pid is removed and creation is retried, up to a fixed number of attempts.
 *
 * NOTE(review): the file is created empty and filled afterwards, so a
 * concurrent caller that reads it inside that window sees it as torn and
 * reclaims it — confirm whether that window matters for your deployment.
 *
 * @param hqRoot HQ root the lock guards.
 * @param command Name of the operation taking the lock.
 * @returns a {@link LockHandle} for the acquired lock.
 * @throws OperationLockedError when a live holder in another process owns the lock.
 * @throws Error when the lock could not be created within the attempt budget,
 *   or on any filesystem error other than the lock file already existing.
 */
export function acquireOperationLock(hqRoot: string, command: string): LockHandle {
  const info: LockInfo = {
    pid: process.pid,
    command,
    startedAt: new Date().toISOString(),
    hqRoot: path.resolve(hqRoot),
  };

  if (process.env.HQ_DISABLE_OP_LOCK === "1") {
    return { ...NOOP_HANDLE_BASE, path: "", info };
  }

  const p = lockPathFor(hqRoot);
  fs.mkdirSync(path.dirname(p), { recursive: true });
  const payload = JSON.stringify(info, null, 2);

  // Each pass makes exactly one exclusive-create attempt.
  const MAX_ATTEMPTS = 5;
  let remaining = MAX_ATTEMPTS;
  while (remaining-- > 0) {
    let fd: number | undefined;
    try {
      fd = fs.openSync(p, "wx"); // O_CREAT | O_EXCL — atomic
    } catch (err) {
      if ((err as NodeJS.ErrnoException)?.code !== "EEXIST") throw err;
    }

    if (fd !== undefined) {
      // We created the file: record who holds it, then hand out the handle.
      try {
        fs.writeSync(fd, payload);
      } finally {
        fs.closeSync(fd);
      }
      return makeHandle(p, info);
    }

    // The file already exists: refuse only for a live holder in another process.
    const holder = readLockInfo(p);
    if (holder !== null && holder.pid !== process.pid && pidAlive(holder.pid)) {
      throw new OperationLockedError(holder, command);
    }

    // Stale (dead holder), unreadable/torn, or our own leftover → reclaim.
    try {
      fs.unlinkSync(p);
    } catch {
      /* someone else reclaimed it first; the next openSync re-evaluates */
    }
  }

  // Pathological churn (another process reclaiming in lockstep). Surface it
  // rather than spin forever.
  throw new Error(
    `Could not acquire HQ operation lock at ${p} after ${MAX_ATTEMPTS} attempts`,
  );
}
266
+
267
/**
 * Async wrapper: acquire the per-root lock for `command`, await `fn`, and
 * release the lock whether `fn` resolves or rejects. Acquisition errors
 * propagate and `fn` is not called.
 */
export async function withOperationLock<T>(
  hqRoot: string,
  command: string,
  fn: () => Promise<T>,
): Promise<T> {
  const lock = acquireOperationLock(hqRoot, command);
  try {
    const outcome = await fn();
    return outcome;
  } finally {
    lock.release();
  }
}
280
+
281
/**
 * Synchronous wrapper: acquire the per-root lock for `command`, run `fn`, and
 * release the lock whether `fn` returns or throws. Acquisition errors
 * propagate and `fn` is not called.
 */
export function withOperationLockSync<T>(
  hqRoot: string,
  command: string,
  fn: () => T,
): T {
  const lock = acquireOperationLock(hqRoot, command);
  try {
    const outcome = fn();
    return outcome;
  } finally {
    lock.release();
  }
}