@indigoai-us/hq-cloud 6.2.7 → 6.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/sync-runner.d.ts +22 -2
- package/dist/bin/sync-runner.d.ts.map +1 -1
- package/dist/bin/sync-runner.js +105 -3
- package/dist/bin/sync-runner.js.map +1 -1
- package/dist/bin/sync-runner.test.js +262 -0
- package/dist/bin/sync-runner.test.js.map +1 -1
- package/dist/cli/reindex.d.ts +8 -0
- package/dist/cli/reindex.d.ts.map +1 -1
- package/dist/cli/reindex.js +222 -198
- package/dist/cli/reindex.js.map +1 -1
- package/dist/cli/reindex.test.js +35 -0
- package/dist/cli/reindex.test.js.map +1 -1
- package/dist/cli/rescue-core.js +14 -2
- package/dist/cli/rescue-core.js.map +1 -1
- package/dist/cli/rescue-hq-root-guard.test.d.ts +2 -0
- package/dist/cli/rescue-hq-root-guard.test.d.ts.map +1 -0
- package/dist/cli/rescue-hq-root-guard.test.js +176 -0
- package/dist/cli/rescue-hq-root-guard.test.js.map +1 -0
- package/dist/cli/rescue.d.ts.map +1 -1
- package/dist/cli/rescue.js +39 -16
- package/dist/cli/rescue.js.map +1 -1
- package/dist/cli/rescue.reindex.test.js +15 -2
- package/dist/cli/rescue.reindex.test.js.map +1 -1
- package/dist/cli/sync.d.ts.map +1 -1
- package/dist/cli/sync.js +3 -1
- package/dist/cli/sync.js.map +1 -1
- package/dist/cli/sync.test.js +2 -1
- package/dist/cli/sync.test.js.map +1 -1
- package/dist/operation-lock.d.ts +100 -0
- package/dist/operation-lock.d.ts.map +1 -0
- package/dist/operation-lock.js +256 -0
- package/dist/operation-lock.js.map +1 -0
- package/dist/operation-lock.test.d.ts +5 -0
- package/dist/operation-lock.test.d.ts.map +1 -0
- package/dist/operation-lock.test.js +140 -0
- package/dist/operation-lock.test.js.map +1 -0
- package/dist/sync/event-sync.d.ts +181 -0
- package/dist/sync/event-sync.d.ts.map +1 -0
- package/dist/sync/event-sync.js +316 -0
- package/dist/sync/event-sync.js.map +1 -0
- package/dist/sync/event-sync.test.d.ts +14 -0
- package/dist/sync/event-sync.test.d.ts.map +1 -0
- package/dist/sync/event-sync.test.js +440 -0
- package/dist/sync/event-sync.test.js.map +1 -0
- package/package.json +1 -1
- package/src/bin/sync-runner.test.ts +323 -0
- package/src/bin/sync-runner.ts +139 -4
- package/src/cli/reindex.test.ts +45 -0
- package/src/cli/reindex.ts +36 -0
- package/src/cli/rescue-core.ts +15 -2
- package/src/cli/rescue-hq-root-guard.test.ts +193 -0
- package/src/cli/rescue.reindex.test.ts +17 -2
- package/src/cli/rescue.ts +40 -15
- package/src/cli/sync.test.ts +2 -1
- package/src/cli/sync.ts +3 -1
- package/src/operation-lock.test.ts +162 -0
- package/src/operation-lock.ts +293 -0
- package/src/sync/event-sync.test.ts +533 -0
- package/src/sync/event-sync.ts +481 -0
- package/test/e2e/sync/cross-tenant-isolation.test.ts +126 -0
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
/**
 * Per-HQ-root mutual exclusion for the long-running operations
 * (`sync`, `rescue`, `reindex`).
 *
 * Contract:
 *  - At most ONE of sync / rescue / reindex runs at a time **per HQ root**.
 *    The lock is shared across all three (keyed only by the root, not the
 *    command), so e.g. a rescue refuses while a sync holds it. Different HQ
 *    roots are fully independent — they hash to different lock files.
 *  - The push watcher / watch+event-push runner is EXEMPT: it never calls in
 *    here, so it neither takes the lock nor is blocked by it (its targeted
 *    in-process push passes are likewise lock-free).
 *
 * ## Where the lock lives — and why
 *
 * `<stateDir>/locks/operation-<hash(canonicalRoot)>.lock`, where
 * `stateDir = $HQ_STATE_DIR || ~/.hq`. This is deliberately NOT inside the HQ
 * root:
 *  - It must never round-trip to the cloud. A lock is machine-local, per-run
 *    state; syncing it to S3 (and thence to other machines/roots) would be a
 *    correctness bug. `~/.hq` is the established machine-local state dir
 *    (journals already live there) and is never synced.
 *  - `rescue` repairs a possibly-broken HQ root; a lock that depends on the
 *    root being healthy is exactly backwards. `~/.hq` is independent of the
 *    root's health.
 *  - Keying the filename by a hash of the *canonical* root path makes the
 *    lock per-root and prevents leakage across roots, while keeping the path
 *    short and filesystem-safe.
 *
 * ## Atomicity, liveness, takeover
 *
 *  - Acquisition uses `open(…, "wx")` (O_CREAT | O_EXCL) — an atomic
 *    create-if-absent. Exactly one racer can create the file; the loser sees
 *    EEXIST and re-evaluates.
 *  - The lock records the holder's `{ pid, command, startedAt, hqRoot }`. On
 *    EEXIST we test the recorded PID with `process.kill(pid, 0)`:
 *      * ESRCH → the holder is gone (crashed / killed -9 / stale file) →
 *        reclaim the lock.
 *      * EPERM → the PID exists but is owned by another user → treat as ALIVE
 *        (conservative: refuse rather than risk two concurrent ops).
 *      * success → alive → refuse fast with {@link OperationLockedError}
 *        naming the holding command + PID.
 *  - PID reuse is an inherent, un-eliminable race for any PID-based scheme: if
 *    the original holder crashed and the OS later handed its PID to an
 *    unrelated process, we conservatively read that as "still held" and
 *    refuse. We accept that false-busy over the far worse false-free, and
 *    record `startedAt`/`command` so an operator can diagnose a wedged lock.
 *
 * ## Release
 *
 *  - Normal exit: the `with*` wrappers release in a `finally`.
 *  - Signals (SIGINT/SIGTERM/SIGHUP): a one-time handler releases every held
 *    lock, then re-raises the default disposition so exit status is unchanged.
 *  - Hard crash (SIGKILL / power loss): nothing runs, but the stale-PID
 *    takeover above reclaims the lock on the next attempt.
 *  - `process.on("exit")`: a final best-effort synchronous unlink.
 *
 * ## Escape hatch
 *
 * `HQ_DISABLE_OP_LOCK=1` makes acquisition a no-op (returns a handle whose
 * release does nothing). For emergencies and for callers that manage
 * exclusion themselves; documented, off by default.
 */
|
|
64
|
+
|
|
65
|
+
import * as crypto from "crypto";
|
|
66
|
+
import * as fs from "fs";
|
|
67
|
+
import * as os from "os";
|
|
68
|
+
import * as path from "path";
|
|
69
|
+
|
|
70
|
+
/**
 * Process exit code used when an operation is refused because the lock is held.
 * Exported so callers can exit with a stable, recognisable status.
 */
export const OPERATION_LOCKED_EXIT = 17;
|
|
72
|
+
|
|
73
|
+
/**
 * The record serialized as JSON into a lock file, describing the process that
 * holds the lock.
 */
export interface LockInfo {
  /** PID of the holding process; probed for liveness when the lock is contended. */
  pid: number;
  /** Name of the operation that took the lock, as passed to `acquireOperationLock`. */
  command: string;
  /** ISO-8601 acquisition time. */
  startedAt: string;
  /** Canonical HQ root the lock guards (diagnostic only). */
  hqRoot: string;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Raised by `acquireOperationLock` when the lock is owned by a holder that is
 * still alive. Carries the holder's recorded info and the name of the
 * operation that was refused, and renders both into the message.
 */
export class OperationLockedError extends Error {
  /** Info read from the existing lock file. */
  public readonly holder: LockInfo;
  /** The command that tried (and failed) to take the lock. */
  public readonly attempted: string;

  constructor(holder: LockInfo, attempted: string) {
    super(
      `Refusing to start "${attempted}": another HQ operation is already running for this HQ root — ` +
        `"${holder.command}" (pid ${holder.pid}, started ${holder.startedAt}). ` +
        `Wait for it to finish, or stop that process, then retry.`,
    );
    this.holder = holder;
    this.attempted = attempted;
    this.name = "OperationLockedError";
  }
}
|
|
97
|
+
|
|
98
|
+
/**
 * What a successful `acquireOperationLock` call hands back: where the lock
 * file is, what was written into it, and how to give the lock up.
 */
export interface LockHandle {
  /** Absolute path of the lock file. */
  readonly path: string;
  /** The info written for this holder. */
  readonly info: LockInfo;
  /** Idempotently release the lock iff this process still owns it. */
  release(): void;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Directory for machine-local state: `$HQ_STATE_DIR` when it is set to a
 * non-empty value, otherwise `.hq` under the current user's home directory.
 */
function stateDir(): string {
  const override = process.env.HQ_STATE_DIR;
  if (override) {
    return override;
  }
  return path.join(os.homedir(), ".hq");
}
|
|
110
|
+
|
|
111
|
+
/**
 * Compute the lock-file path for an HQ root. Exported for tests.
 *
 * The root is made absolute with `path.resolve`, hashed with SHA-1, and the
 * first 16 hex characters of the digest become the file-name key under
 * `<stateDir>/locks/`.
 *
 * NOTE(review): `path.resolve` is purely lexical — it does not follow
 * symlinks — so two different spellings of the same directory would yield
 * different lock files. Confirm callers pass an already-canonical root.
 *
 * @param hqRoot HQ root directory (absolute or relative to the cwd).
 * @returns The lock file path for that root.
 */
export function lockPathFor(hqRoot: string): string {
  const absoluteRoot = path.resolve(hqRoot);
  const digest = crypto.createHash("sha1").update(absoluteRoot).digest("hex");
  const fileName = `operation-${digest.slice(0, 16)}.lock`;
  return path.join(stateDir(), "locks", fileName);
}
|
|
117
|
+
|
|
118
|
+
/**
 * Probe whether `pid` refers to a live process.
 *
 * Signal 0 delivers nothing; `process.kill` only reports whether the target
 * could be signalled. Outcomes:
 *  - non-integer or non-positive `pid` → `false` without probing;
 *  - probe succeeds → `true`;
 *  - probe fails with ESRCH (no such process) → `false`;
 *  - any other failure, EPERM included → `true`, erring on the side of
 *    "still running" rather than risking a double-run.
 */
function pidAlive(pid: number): boolean {
  const plausible = Number.isInteger(pid) && pid > 0;
  if (!plausible) return false;

  try {
    process.kill(pid, 0);
  } catch (err) {
    // Only a definite "no such process" counts as dead.
    return (err as NodeJS.ErrnoException)?.code !== "ESRCH";
  }
  return true;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Read and minimally validate the lock file at `p`.
 *
 * @param p Path of the lock file.
 * @returns The parsed record when it is a JSON object with a numeric `pid`
 *   and a string `command`; `null` when the file is missing, unreadable, not
 *   valid JSON, or lacks those two fields. `startedAt` and `hqRoot` are not
 *   checked.
 */
function readLockInfo(p: string): LockInfo | null {
  let raw: unknown;
  try {
    raw = JSON.parse(fs.readFileSync(p, "utf8"));
  } catch {
    // Missing, unreadable or torn file — callers treat all of these alike.
    return null;
  }

  if (typeof raw !== "object" || raw === null) return null;

  const candidate = raw as Partial<LockInfo>;
  const wellFormed =
    typeof candidate.pid === "number" && typeof candidate.command === "string";
  return wellFormed ? (raw as LockInfo) : null;
}
|
|
146
|
+
|
|
147
|
+
// ── Process-wide release plumbing ──────────────────────────────────────────
// Track every lock this process currently holds so the signal/exit hooks can
// release all of them. The hooks are installed exactly once.

/** Handles for locks this process holds: added by `makeHandle`, removed by `release()`. */
const heldLocks = new Set<LockHandle>();
/** Flipped to `true` by `installHooksOnce` so the process hooks are registered a single time. */
let hooksInstalled = false;
|
|
153
|
+
|
|
154
|
+
/**
 * Delete the lock file at `p`, but only when the PID recorded inside it is
 * this process's. A file that is missing, unreadable, or records another PID
 * is left untouched, so a lock another process has since taken over is never
 * clobbered. A failing unlink is ignored.
 */
function unlinkIfOwned(p: string): void {
  const recorded = readLockInfo(p);
  const ownedByUs = recorded !== null && recorded.pid === process.pid;
  if (!ownedByUs) return;

  try {
    fs.unlinkSync(p);
  } catch {
    /* already gone — fine */
  }
}
|
|
166
|
+
|
|
167
|
+
/**
 * Register the process-level hooks that release held locks, the first time it
 * is called; later calls return immediately.
 *
 *  - `exit`: synchronously unlink every lock still in `heldLocks`.
 *  - SIGINT / SIGTERM / SIGHUP: unlink every held lock, drop the listeners for
 *    that signal, then send the same signal to this process again.
 */
function installHooksOnce(): void {
  if (hooksInstalled) return;
  hooksInstalled = true;

  const releaseAllHeld = (): void => {
    for (const held of heldLocks) unlinkIfOwned(held.path);
  };

  process.on("exit", () => {
    releaseAllHeld();
  });

  const trappedSignals = ["SIGINT", "SIGTERM", "SIGHUP"] as const;
  trappedSignals.forEach((sig) => {
    process.on(sig, () => {
      releaseAllHeld();
      // `removeAllListeners` clears EVERY listener registered for this signal
      // (not only this one). With none left, the re-sent signal below is not
      // routed back into a handler — including this one, so no recursion —
      // and the exit status stays the normal signal status.
      process.removeAllListeners(sig);
      process.kill(process.pid, sig);
    });
  });
}
|
|
186
|
+
|
|
187
|
+
/**
 * Build the handle for a lock this process has just created at `p`, register
 * it in `heldLocks`, and make sure the process-level release hooks exist.
 *
 * @param p Path of the lock file.
 * @param info The record that was written into the lock file.
 * @returns A handle whose `release()` deregisters it and unlinks the file if
 *   this process still owns it; calling it again is harmless.
 */
function makeHandle(p: string, info: LockInfo): LockHandle {
  const handle: LockHandle = {
    path: p,
    info,
    release: (): void => {
      heldLocks.delete(handle);
      unlinkIfOwned(p);
    },
  };

  heldLocks.add(handle);
  installHooksOnce();
  return handle;
}
|
|
200
|
+
|
|
201
|
+
/** Do-nothing `release()` spread into the handle that `acquireOperationLock` returns when `HQ_DISABLE_OP_LOCK=1`. */
const NOOP_HANDLE_BASE = { release() {} };
|
|
202
|
+
|
|
203
|
+
/**
 * Atomically create the lock file at `p` with `payload` as its content.
 *
 * The payload is first written to a private temp file beside `p` and then
 * published with a hard link, which fails with EEXIST when `p` already exists.
 * The lock therefore never exists in an empty or half-written state that a
 * racing process could mistake for a torn file and reclaim. Where hard links
 * are not supported, this falls back to an exclusive create and removes the
 * file again if the write fails.
 *
 * @returns `true` when this call created the lock, `false` when `p` already exists.
 */
function publishLockFile(p: string, payload: string): boolean {
  const tmp = `${p}.${process.pid}.${crypto.randomBytes(6).toString("hex")}.tmp`;
  fs.writeFileSync(tmp, payload, { flag: "wx" });
  try {
    try {
      fs.linkSync(tmp, p); // atomic; EEXIST if the lock is already there
      return true;
    } catch (err) {
      const code = (err as NodeJS.ErrnoException)?.code;
      if (code === "EEXIST") return false;
      const linkUnsupported =
        code === "EPERM" || code === "ENOTSUP" || code === "EOPNOTSUPP" || code === "ENOSYS";
      if (!linkUnsupported) throw err;
      // fall through to the exclusive-create fallback below
    }
  } finally {
    try {
      fs.unlinkSync(tmp);
    } catch {
      /* best effort — a leftover temp file is harmless */
    }
  }

  // Fallback: O_CREAT | O_EXCL, then write through the descriptor.
  let fd: number;
  try {
    fd = fs.openSync(p, "wx");
  } catch (err) {
    if ((err as NodeJS.ErrnoException)?.code === "EEXIST") return false;
    throw err;
  }
  let written = false;
  try {
    fs.writeSync(fd, payload);
    written = true;
  } finally {
    fs.closeSync(fd);
    if (!written) {
      // Do not leave a torn lock file behind when the write failed.
      try {
        fs.unlinkSync(p);
      } catch {
        /* best effort */
      }
    }
  }
  return true;
}

/**
 * Acquire the per-root operation lock for `command`.
 *
 * With `HQ_DISABLE_OP_LOCK=1` nothing is touched on disk and a handle with an
 * empty `path` and a no-op `release()` is returned. Otherwise the lock
 * directory is created if needed and up to five create attempts are made:
 *  - created → the handle is returned;
 *  - already exists and records a different, live PID → refuse;
 *  - already exists but is stale (dead PID), unreadable/torn, or records this
 *    process's own PID → unlink it and try again.
 *
 * NOTE(review): two processes reclaiming the same stale lock at the same
 * moment can still race on the unlink; that window is not closed here.
 *
 * @param hqRoot HQ root directory the lock guards.
 * @param command Name of the operation taking the lock; recorded in the file.
 * @returns A {@link LockHandle} for the held lock.
 * @throws {OperationLockedError} When a live holder owns the lock.
 * @throws {Error} When the lock could not be created within the attempt
 *   budget, or on any unexpected filesystem error.
 */
export function acquireOperationLock(hqRoot: string, command: string): LockHandle {
  const info: LockInfo = {
    pid: process.pid,
    command,
    startedAt: new Date().toISOString(),
    hqRoot: path.resolve(hqRoot),
  };

  if (process.env.HQ_DISABLE_OP_LOCK === "1") {
    return { ...NOOP_HANDLE_BASE, path: "", info };
  }

  const p = lockPathFor(hqRoot);
  fs.mkdirSync(path.dirname(p), { recursive: true });

  const payload = JSON.stringify(info, null, 2);

  // Bounded retry: each iteration is one atomic create attempt. An existing
  // stale lock is reclaimed and retried; an existing live lock refuses.
  const MAX_ATTEMPTS = 5;
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    if (publishLockFile(p, payload)) {
      return makeHandle(p, info);
    }

    const holder = readLockInfo(p);
    if (holder && holder.pid !== process.pid && pidAlive(holder.pid)) {
      throw new OperationLockedError(holder, command);
    }

    // Stale (dead holder), unreadable/torn, or our own leftover → reclaim.
    try {
      fs.unlinkSync(p);
    } catch {
      /* someone else reclaimed it first; the next attempt re-evaluates */
    }
  }

  // Pathological churn (another process reclaiming in lockstep). Surface it
  // rather than spin forever.
  throw new Error(
    `Could not acquire HQ operation lock at ${p} after ${MAX_ATTEMPTS} attempts`,
  );
}
|
|
266
|
+
|
|
267
|
+
/**
 * Async wrapper: take the per-root lock for `command`, await `fn`, and release
 * the lock whether `fn` resolves or rejects.
 *
 * @param hqRoot HQ root directory the lock guards.
 * @param command Name of the operation taking the lock.
 * @param fn Work to run while the lock is held.
 * @returns Whatever `fn` resolves to.
 * @throws Anything `acquireOperationLock` throws (in which case `fn` is never
 *   called), or whatever `fn` rejects with.
 */
export async function withOperationLock<T>(
  hqRoot: string,
  command: string,
  fn: () => Promise<T>,
): Promise<T> {
  const lock = acquireOperationLock(hqRoot, command);
  try {
    const result = await fn();
    return result;
  } finally {
    lock.release();
  }
}
|
|
280
|
+
|
|
281
|
+
/**
 * Synchronous wrapper: take the per-root lock for `command`, call `fn`, and
 * release the lock whether `fn` returns or throws.
 *
 * @param hqRoot HQ root directory the lock guards.
 * @param command Name of the operation taking the lock.
 * @param fn Work to run while the lock is held.
 * @returns Whatever `fn` returns.
 * @throws Anything `acquireOperationLock` throws (in which case `fn` is never
 *   called), or whatever `fn` throws.
 */
export function withOperationLockSync<T>(
  hqRoot: string,
  command: string,
  fn: () => T,
): T {
  const lock = acquireOperationLock(hqRoot, command);
  try {
    const result = fn();
    return result;
  } finally {
    lock.release();
  }
}
|