mcp-coordinator 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
import { Command } from "commander";
/**
 * Check whether the coordinator daemon appears to be running.
 * Returns the pid if alive, or null otherwise.
 */
export declare function getRunningCoordinatorPid(configDir: string): number | null;
/**
 * Build the `server backup` subcommand: snapshots config.json + the SQLite
 * data directory into a tar.gz archive.
 */
export declare function createServerBackupCommand(): Command;
@@ -0,0 +1,162 @@
1
+ import { Command } from "commander";
2
+ import { existsSync, readFileSync, statSync } from "fs";
3
+ import { join, resolve, basename } from "path";
4
+ import { create as tarCreate } from "tar";
5
+ import { getConfigDir, loadConfig } from "../config.js";
6
/**
 * Format a timestamp suitable for filenames: YYYY-MM-DD-HHMMSS (UTC).
 */
function timestampSlug(date = new Date()) {
    const two = (value) => String(value).padStart(2, "0");
    const day = [date.getUTCFullYear(), two(date.getUTCMonth() + 1), two(date.getUTCDate())].join("-");
    const clock = [date.getUTCHours(), date.getUTCMinutes(), date.getUTCSeconds()].map(two).join("");
    return `${day}-${clock}`;
}
14
/**
 * Returns true if a process with the given pid is currently alive.
 * On both POSIX and Windows, signal 0 is a pure liveness probe:
 * ESRCH means no such process; EPERM means the process exists but
 * belongs to another user — still "alive".
 */
function isProcessAlive(pid) {
    try {
        process.kill(pid, 0);
    }
    catch (err) {
        return err.code === "EPERM";
    }
    return true;
}
31
+ /**
32
+ * Check whether the coordinator daemon appears to be running.
33
+ * Returns the pid if alive, or null otherwise.
34
+ */
35
+ export function getRunningCoordinatorPid(configDir) {
36
+ const pidPath = join(configDir, "server.pid");
37
+ if (!existsSync(pidPath))
38
+ return null;
39
+ const raw = readFileSync(pidPath, "utf-8").trim();
40
+ const pid = parseInt(raw, 10);
41
+ if (Number.isNaN(pid))
42
+ return null;
43
+ return isProcessAlive(pid) ? pid : null;
44
+ }
45
+ /**
46
+ * Recursively walk a directory and yield relative file paths.
47
+ * Used purely for reporting (file count) — tar handles the actual packing.
48
+ */
49
+ async function countFiles(root) {
50
+ const { readdir, stat } = await import("fs/promises");
51
+ let count = 0;
52
+ const walk = async (dir) => {
53
+ const entries = await readdir(dir, { withFileTypes: true });
54
+ for (const entry of entries) {
55
+ const full = join(dir, entry.name);
56
+ if (entry.isDirectory()) {
57
+ await walk(full);
58
+ }
59
+ else if (entry.isFile()) {
60
+ count += 1;
61
+ }
62
+ }
63
+ };
64
+ if (existsSync(root))
65
+ await walk(root);
66
+ return count;
67
+ }
68
/**
 * Build the list of entries (relative to configDir) to include in the
 * tarball. The layout is deliberately flat — the same paths used at
 * runtime — so a `tar -xzf` into `~/.mcp-coordinator/` is a valid restore.
 *
 * NOTE on live backups: the backup command refuses to run while the
 * coordinator is up because better-sqlite3's WAL journal may hold
 * uncommitted writes that a plain file copy will miss. For online backups,
 * switch to SQLite's Online Backup API (`db.backup(path)` from
 * better-sqlite3) and snapshot config.json separately — see
 * docs/superpowers/working/v04/backup-integration.md.
 */
function buildEntries(configDir, dataDirAbsolute) {
    const entries = [];
    if (existsSync(join(configDir, "config.json"))) {
        entries.push("config.json");
    }
    // A custom --data-dir may live outside ~/.mcp-coordinator. tar's `cwd`
    // option can only point at one directory, so only the default data dir is
    // listed here; the caller packs a custom dir separately under its
    // absolute parent (preserving structure for round-trip restore).
    const defaultDataDir = join(configDir, "data");
    const usesDefaultDataDir = resolve(dataDirAbsolute) === resolve(defaultDataDir);
    if (usesDefaultDataDir && existsSync(defaultDataDir)) {
        entries.push("data");
    }
    return entries;
}
93
/**
 * Derive the sibling archive path used when the data directory is custom.
 *
 * Fix: the previous inline `outputPath.replace(/\.tar\.gz$/, ".data.tar.gz")`
 * was a silent no-op whenever the user passed an --output that does not end
 * in ".tar.gz" — the derived path then equalled outputPath, and the custom
 * data archive overwrote the main archive. Fall back to appending the
 * suffix instead.
 */
function dataArchivePathFor(outputPath) {
    if (outputPath.endsWith(".tar.gz")) {
        return outputPath.replace(/\.tar\.gz$/, ".data.tar.gz");
    }
    return `${outputPath}.data.tar.gz`;
}
/**
 * `mcp-coordinator server backup` — snapshot config.json + the SQLite data
 * directory into a tar.gz archive.
 *
 * Refuses to run while the coordinator daemon is up (unless --force):
 * better-sqlite3's WAL journal may hold uncommitted writes that a plain
 * file copy would miss.
 */
export function createServerBackupCommand() {
    return new Command("backup")
        .description("Snapshot the coordinator config + SQLite database to a tar.gz archive")
        .option("--output <path>", "Output tarball path (default ./mcp-coordinator-backup-<ts>.tar.gz)")
        .option("--data-dir <path>", "Data directory to back up (overrides config.server.data_dir)")
        .option("--force", "Skip the running-coordinator safety check")
        .action(async (opts) => {
        const configDir = getConfigDir();
        const config = loadConfig(configDir);
        // Precedence: CLI flag > environment variable > config file.
        const dataDir = resolve(opts.dataDir ?? process.env.COORDINATOR_DATA_DIR ?? config.server.data_dir);
        // Safety: refuse when the daemon is up. WAL writes might be in flight.
        const runningPid = getRunningCoordinatorPid(configDir);
        if (runningPid !== null && !opts.force) {
            console.error(`Coordinator is running (PID ${runningPid}).`);
            console.error("Refusing to back up: live SQLite WAL writes may be in flight.");
            console.error("Either stop it first ('mcp-coordinator server stop') or pass --force.");
            process.exit(1);
        }
        if (!existsSync(configDir)) {
            console.error(`No coordinator config directory at ${configDir} — nothing to back up.`);
            process.exit(1);
        }
        const ts = timestampSlug();
        const outputPath = resolve(opts.output ?? join(process.cwd(), `mcp-coordinator-backup-${ts}.tar.gz`));
        const defaultDataDir = join(configDir, "data");
        const dataIsCustom = resolve(dataDir) !== resolve(defaultDataDir);
        // Pack ~/.mcp-coordinator entries from configDir as cwd.
        const entries = buildEntries(configDir, dataDir);
        // A custom data dir is packed under its absolute parent so restore can
        // reproduce the original location (or be redirected with --data-dir).
        const customDataEntries = [];
        if (dataIsCustom && existsSync(dataDir)) {
            customDataEntries.push({ cwd: resolve(dataDir, ".."), entry: basename(dataDir) });
        }
        if (entries.length === 0 && customDataEntries.length === 0) {
            console.error("Nothing to back up: no config.json and no data directory found.");
            process.exit(1);
        }
        // First archive pass: config + default data (if any).
        if (entries.length > 0) {
            await tarCreate({ gzip: true, file: outputPath, cwd: configDir, portable: true }, entries);
        }
        // Second pass for a custom data dir. tar's gzip mode cannot append to
        // an existing archive, so a sibling .data.tar.gz is emitted instead of
        // re-packing the first archive.
        if (customDataEntries.length > 0) {
            const dataArchive = dataArchivePathFor(outputPath);
            for (const { cwd, entry } of customDataEntries) {
                await tarCreate({ gzip: true, file: dataArchive, cwd, portable: true }, [entry]);
            }
            console.log(`Custom data dir packed separately: ${dataArchive}`);
        }
        // outputPath only exists if entries.length > 0; report on whichever
        // archive(s) we actually produced.
        const reportPath = entries.length > 0 ? outputPath : dataArchivePathFor(outputPath);
        const sizeBytes = existsSync(reportPath) ? statSync(reportPath).size : 0;
        const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2);
        const fileCount = (existsSync(join(configDir, "config.json")) ? 1 : 0) +
            (await countFiles(dataIsCustom ? dataDir : defaultDataDir));
        console.log("Backup complete.");
        console.log(` Archive: ${reportPath}`);
        console.log(` Size: ${sizeMB} MB (${sizeBytes} bytes)`);
        console.log(` Files: ${fileCount}`);
        console.log(` ConfigDir: ${configDir}`);
        console.log(` DataDir: ${dataDir}${dataIsCustom ? " (custom)" : ""}`);
    });
}
@@ -3,11 +3,16 @@ import { createServerStartCommand } from "./start.js";
3
3
  import { createServerStopCommand } from "./stop.js";
4
4
  import { createServerStatusCommand } from "./status.js";
5
5
  import { createServerLogsCommand } from "./logs.js";
6
+ import { createServerBackupCommand } from "./backup.js";
7
+ import { createServerRestoreCommand } from "./restore.js";
6
8
/**
 * Assemble the `server` subcommand tree: lifecycle commands plus the
 * v0.4 operability commands (backup/restore).
 */
export function createServerProgram() {
    const program = new Command("server").description("Manage the coordination server");
    const subcommands = [
        createServerStartCommand(),
        createServerStopCommand(),
        createServerStatusCommand(),
        createServerLogsCommand(),
        // v0.4 Operability
        createServerBackupCommand(),
        createServerRestoreCommand(),
    ];
    for (const subcommand of subcommands) {
        program.addCommand(subcommand);
    }
    return program;
}
@@ -0,0 +1,2 @@
1
import { Command } from "commander";
/**
 * Build the `server restore` subcommand: restores a coordinator snapshot
 * (config.json and/or data/) produced by `server backup` into the config dir.
 */
export declare function createServerRestoreCommand(): Command;
@@ -0,0 +1,117 @@
1
+ import { Command } from "commander";
2
+ import { existsSync, mkdirSync, renameSync, statSync } from "fs";
3
+ import { resolve } from "path";
4
+ import { extract as tarExtract, list as tarList } from "tar";
5
+ import { getConfigDir } from "../config.js";
6
+ import { getRunningCoordinatorPid } from "./backup.js";
7
// Filename-safe UTC timestamp: YYYY-MM-DD-HHMMSS.
// NOTE(review): duplicated in backup.js — consider extracting a shared helper.
function timestampSlug(date = new Date()) {
    const two = (n) => String(n).padStart(2, "0");
    const parts = [
        `${date.getUTCFullYear()}-${two(date.getUTCMonth() + 1)}-${two(date.getUTCDate())}`,
        `${two(date.getUTCHours())}${two(date.getUTCMinutes())}${two(date.getUTCSeconds())}`,
    ];
    return parts.join("-");
}
12
/**
 * Inspect the tarball without extracting and return its unique top-level
 * entry names (file or dir). Used to validate structure before touching
 * the user's existing config dir.
 */
async function listTarballEntries(tarPath) {
    const seen = new Set();
    await tarList({
        file: tarPath,
        onReadEntry: (entry) => {
            // Normalize Windows separators, then keep only the first path
            // segment (a dir's trailing slash splits away naturally).
            const [head] = entry.path.replace(/\\/g, "/").split("/");
            if (head)
                seen.add(head);
        },
    });
    // Set preserves insertion order, matching the original dedupe-by-includes.
    return [...seen];
}
30
/**
 * `mcp-coordinator server restore` — restore a config + database snapshot
 * produced by `server backup` into the coordinator config directory.
 *
 * Safety properties, in order:
 *  1. refuses while the daemon is running (unless --force);
 *  2. validates the tarball's top-level entries before touching anything;
 *  3. moves the existing config dir aside (unless --no-backup) so a failed
 *     extraction can be rolled back.
 */
export function createServerRestoreCommand() {
    return new Command("restore")
        .description("Restore a coordinator config + database snapshot from a tar.gz archive")
        .argument("<tarball>", "Path to the backup .tar.gz produced by 'mcp-coordinator server backup'")
        .option("--force", "Skip the running-coordinator safety check")
        .option("--no-backup", "Do not snapshot the existing config dir before overwriting")
        .option("--data-dir <path>", "Override data directory (rarely needed)")
        .action(async (tarballArg, opts) => {
        const tarPath = resolve(tarballArg);
        if (!existsSync(tarPath)) {
            console.error(`Tarball not found: ${tarPath}`);
            process.exit(1);
        }
        const tarStat = statSync(tarPath);
        if (!tarStat.isFile()) {
            console.error(`Not a regular file: ${tarPath}`);
            process.exit(1);
        }
        const configDir = getConfigDir();
        // Safety: refuse when the daemon is running so we don't clobber an
        // open SQLite handle (would corrupt the WAL on the daemon side).
        const runningPid = getRunningCoordinatorPid(configDir);
        if (runningPid !== null && !opts.force) {
            console.error(`Coordinator is running (PID ${runningPid}).`);
            console.error("Refusing to restore: stop the coordinator first or pass --force.");
            process.exit(1);
        }
        // Validate tarball contents BEFORE moving anything aside.
        let entries;
        try {
            entries = await listTarballEntries(tarPath);
        }
        catch (err) {
            console.error(`Failed to read tarball: ${err.message}`);
            process.exit(1);
        }
        const hasConfig = entries.includes("config.json");
        const hasData = entries.includes("data");
        if (!hasConfig && !hasData) {
            console.error("Tarball does not contain expected entries (config.json or data/).");
            console.error(`Top-level entries found: ${entries.join(", ") || "(none)"}`);
            process.exit(1);
        }
        // Snapshot the existing config dir before overwriting. Commander's
        // negatable `--no-backup` flag seeds `opts.backup = true` by default
        // and flips it to false only when the flag is passed.
        const shouldSnapshot = opts.backup !== false;
        let snapshotPath = null;
        if (shouldSnapshot && existsSync(configDir)) {
            snapshotPath = `${configDir}.bak-${timestampSlug()}`;
            renameSync(configDir, snapshotPath);
            console.log(`Existing config moved aside: ${snapshotPath}`);
        }
        // Recreate the target dir and extract.
        mkdirSync(configDir, { recursive: true });
        try {
            await tarExtract({ file: tarPath, cwd: configDir });
        }
        catch (err) {
            console.error(`Extraction failed: ${err.message}`);
            // Best-effort rollback when a snapshot was made: the partially
            // extracted dir is moved aside (kept as *.failed-<ts> for
            // inspection — it is NOT checked for emptiness or deleted),
            // then the snapshot is put back in place.
            if (snapshotPath !== null && existsSync(snapshotPath)) {
                try {
                    renameSync(configDir, `${configDir}.failed-${timestampSlug()}`);
                    renameSync(snapshotPath, configDir);
                    console.error("Rolled back to previous config dir.");
                }
                catch {
                    // Either rename failed mid-way — leave everything for the
                    // operator rather than risk further damage.
                    console.error(`Manual recovery required — snapshot at: ${snapshotPath}`);
                }
            }
            process.exit(1);
        }
        console.log("Restore complete.");
        console.log(` Source: ${tarPath}`);
        console.log(` ConfigDir: ${configDir}`);
        console.log(` Restored: ${entries.filter((e) => e === "config.json" || e === "data").join(", ")}`);
        if (snapshotPath !== null) {
            console.log(` Previous: ${snapshotPath} (delete once verified)`);
        }
        if (opts.dataDir !== undefined) {
            // Deliberate: restore always extracts to the default layout; the
            // flag is accepted but only triggers this advisory note.
            console.log(` Note: --data-dir was provided but restore extracts to default location.\n` +
                ` Update config.json or COORDINATOR_DATA_DIR if you need a non-default path.`);
        }
    });
}
@@ -74,6 +74,14 @@ export declare class Consultation {
74
74
  * parsing the thread list themselves.
75
75
  */
76
76
  assigned_to_me?: string;
77
+ /**
78
+ * P2 perf: bound resolved-thread queries to a recency window. Without
79
+ * this, the impact scorer would scan all-time resolved threads on every
80
+ * announce_work call (O(historical-threads) per scoring pass). The window
81
+ * applies to resolved_at when status='resolved', otherwise to created_at,
82
+ * so the filter is meaningful for both states.
83
+ */
84
+ since_minutes?: number;
77
85
  }): Thread[];
78
86
  getThreadUpdates(agentId: string, since?: string): ThreadMessage[];
79
87
  logActionSummary(params: {
@@ -320,6 +320,14 @@ export class Consultation {
320
320
  sql += " AND (assigned_to IS NULL OR assigned_to = ?)";
321
321
  params.push(filters.assigned_to_me);
322
322
  }
323
+ if (typeof filters.since_minutes === "number") {
324
+ // For resolved threads, gate on resolved_at (the moment that matters
325
+ // for "recent enough to still influence scoring"). For open/resolving
326
+ // threads, gate on created_at since they have no resolved_at yet.
327
+ // COALESCE picks the right column per row.
328
+ sql += " AND COALESCE(resolved_at, created_at) > datetime('now', '-' || ? || ' minutes')";
329
+ params.push(filters.since_minutes);
330
+ }
323
331
  sql += " ORDER BY created_at DESC";
324
332
  return db.prepare(sql).all(...params);
325
333
  }
@@ -1,3 +1,15 @@
1
+ /**
2
+ * Database adapter surface.
3
+ *
4
+ * Design intent: this file is the *contract* both `createBetterSqlite3` and
5
+ * `createBunSqlite` (in `database.ts`) implement. The interfaces are a strict
6
+ * subset of better-sqlite3's API that Bun:sqlite also satisfies, so callers
7
+ * stay portable across both runtimes.
8
+ *
9
+ * Helpers (e.g. `withTransaction`) live here so portable code paths can use
10
+ * one canonical entry point without each call site re-deriving the
11
+ * `db.transaction(fn)()` two-step pattern.
12
+ */
1
13
  export interface RunResult {
2
14
  changes: number;
3
15
  lastInsertRowid: number;
@@ -13,3 +25,21 @@ export interface DatabaseAdapter {
13
25
  close(): void;
14
26
  transaction<T>(fn: () => T): () => T;
15
27
  }
28
+ /**
29
+ * Run `fn` inside a single SQLite transaction and return its result.
30
+ *
31
+ * Replaces the verbose two-step pattern:
32
+ *
33
+ * const tx = db.transaction(() => { ...; return value; });
34
+ * const value = tx();
35
+ *
36
+ * with:
37
+ *
38
+ * const value = withTransaction(db, () => { ...; return value; });
39
+ *
40
+ * Errors thrown inside `fn` propagate to the caller and the transaction is
41
+ * rolled back by the underlying driver (better-sqlite3 / bun:sqlite both do
42
+ * this). Use this for any read-modify-write block where multiple statements
43
+ * must be atomic.
44
+ */
45
+ export declare function withTransaction<T>(db: DatabaseAdapter, fn: () => T): T;
@@ -1 +1,32 @@
1
- export {};
1
+ /**
2
+ * Database adapter surface.
3
+ *
4
+ * Design intent: this file is the *contract* both `createBetterSqlite3` and
5
+ * `createBunSqlite` (in `database.ts`) implement. The interfaces are a strict
6
+ * subset of better-sqlite3's API that Bun:sqlite also satisfies, so callers
7
+ * stay portable across both runtimes.
8
+ *
9
+ * Helpers (e.g. `withTransaction`) live here so portable code paths can use
10
+ * one canonical entry point without each call site re-deriving the
11
+ * `db.transaction(fn)()` two-step pattern.
12
+ */
13
+ /**
14
+ * Run `fn` inside a single SQLite transaction and return its result.
15
+ *
16
+ * Replaces the verbose two-step pattern:
17
+ *
18
+ * const tx = db.transaction(() => { ...; return value; });
19
+ * const value = tx();
20
+ *
21
+ * with:
22
+ *
23
+ * const value = withTransaction(db, () => { ...; return value; });
24
+ *
25
+ * Errors thrown inside `fn` propagate to the caller and the transaction is
26
+ * rolled back by the underlying driver (better-sqlite3 / bun:sqlite both do
27
+ * this). Use this for any read-modify-write block where multiple statements
28
+ * must be atomic.
29
+ */
30
+ export function withTransaction(db, fn) {
31
+ return db.transaction(fn)();
32
+ }
@@ -1,4 +1,5 @@
1
1
  import { getDb } from "./database.js";
2
+ import { withTransaction } from "./db-adapter.js";
2
3
  export class DependencyMapper {
3
4
  getMap() {
4
5
  const db = getDb();
@@ -20,12 +21,11 @@ export class DependencyMapper {
20
21
  VALUES (?, ?, ?, ?)
21
22
  ON CONFLICT(module_id) DO UPDATE SET
22
23
  depends_on = excluded.depends_on, exports = excluded.exports, owners = excluded.owners`);
23
- const tx = db.transaction(() => {
24
+ withTransaction(db, () => {
24
25
  for (const [id, info] of Object.entries(map)) {
25
26
  stmt.run(id, JSON.stringify(info.depends_on), JSON.stringify(info.exports), JSON.stringify(info.owners));
26
27
  }
27
28
  });
28
- tx();
29
29
  }
30
30
  getModuleInfo(moduleId) {
31
31
  const db = getDb();
@@ -17,5 +17,15 @@ export declare class FileTracker {
17
17
  conflict: boolean;
18
18
  agents: string[];
19
19
  };
20
+ /**
21
+ * P2 perf: batch lookup of recent file→agents activity. Replaces N
22
+ * `checkFileConflict` calls (one per file) with a single SQL query, then
23
+ * builds an in-memory reverse index. The impact scorer uses this so its
24
+ * per-file inner loop is O(1) Map.get() rather than O(F) SQL round-trips.
25
+ *
26
+ * Excludes the calling agent (so the scorer doesn't flag the announcer
27
+ * against themselves). Returns Map<file_path, Set<agent_id>>.
28
+ */
29
+ getFileToAgentsIndex(filePaths: string[], excludeAgentId: string, withinMinutes?: number): Map<string, Set<string>>;
20
30
  fileToModule(filePath: string): string;
21
31
  }
@@ -31,6 +31,38 @@ export class FileTracker {
31
31
  AND created_at > datetime('now', '-' || ? || ' minutes')`).all(filePath, agentId, withinMinutes);
32
32
  return { conflict: rows.length > 0, agents: rows.map((r) => r.agent_id) };
33
33
  }
34
+ /**
35
+ * P2 perf: batch lookup of recent file→agents activity. Replaces N
36
+ * `checkFileConflict` calls (one per file) with a single SQL query, then
37
+ * builds an in-memory reverse index. The impact scorer uses this so its
38
+ * per-file inner loop is O(1) Map.get() rather than O(F) SQL round-trips.
39
+ *
40
+ * Excludes the calling agent (so the scorer doesn't flag the announcer
41
+ * against themselves). Returns Map<file_path, Set<agent_id>>.
42
+ */
43
+ getFileToAgentsIndex(filePaths, excludeAgentId, withinMinutes = 30) {
44
+ const index = new Map();
45
+ if (filePaths.length === 0)
46
+ return index;
47
+ const db = getDb();
48
+ // Dynamic IN-list — better-sqlite3 binds each ? positionally. Cheap because
49
+ // the impact scorer only passes target_files + depends_on_files (typically
50
+ // a handful of files per announce_work call).
51
+ const placeholders = filePaths.map(() => "?").join(",");
52
+ const rows = db.prepare(`SELECT DISTINCT file_path, agent_id FROM file_activity
53
+ WHERE file_path IN (${placeholders})
54
+ AND agent_id != ?
55
+ AND created_at > datetime('now', '-' || ? || ' minutes')`).all(...filePaths, excludeAgentId, withinMinutes);
56
+ for (const r of rows) {
57
+ let set = index.get(r.file_path);
58
+ if (!set) {
59
+ set = new Set();
60
+ index.set(r.file_path, set);
61
+ }
62
+ set.add(r.agent_id);
63
+ }
64
+ return index;
65
+ }
34
66
  fileToModule(filePath) {
35
67
  // Strip leading / so "/server/src/x.ts" and "server/src/x.ts" produce the
36
68
  // same module name. Without this, split("/") on an absolute path yields
@@ -0,0 +1,23 @@
1
+ import type { IncomingMessage, ServerResponse } from "http";
2
+ import type { CoordinatorServices } from "../server-setup.js";
3
+ /**
4
+ * Liveness probe — process is alive. Always returns 200 with no dep checks
5
+ * so orchestrators don't restart the pod over transient downstream failures.
6
+ */
7
+ export declare function handleLivez(_req: IncomingMessage, res: ServerResponse): void;
8
+ /**
9
+ * Readiness probe — downstream deps must all be green for the LB to route
10
+ * traffic here. 503 when any check fails so the pod is drained until ready.
11
+ *
12
+ * Each check is wrapped in try/catch so a thrown DB/MQTT error becomes a
13
+ * structured `{ok:false,error:"…"}` instead of a 500. The response shape is
14
+ * identical between 200 and 503 so consumers can parse uniformly.
15
+ */
16
+ export declare function handleReadyz(_req: IncomingMessage, res: ServerResponse, services: Pick<CoordinatorServices, "mqttBridge">): void;
17
+ /**
18
+ * Backwards-compatible alias. The original /health route returned a fixed
19
+ * {status:"ok",version} payload with no dep checks; semantically that is a
20
+ * liveness probe, so we delegate. Anything that polled /health for "is the
21
+ * process up" continues to work without changes.
22
+ */
23
+ export declare function handleHealth(req: IncomingMessage, res: ServerResponse): void;
@@ -0,0 +1,86 @@
1
+ import { getDb } from "../database.js";
2
+ import { json } from "./utils.js";
3
+ import { getVersion } from "../../cli/version.js";
4
+ /**
5
+ * v0.4 Operability: Kubernetes-style health probes.
6
+ *
7
+ * - /livez → is the process alive? Used by an orchestrator (k8s, systemd,
8
+ * docker swarm) to decide whether to restart the pod. MUST NOT
9
+ * check downstream deps; an unreachable DB does not mean the
10
+ * coordinator process should be killed and restarted.
11
+ *
12
+ * - /readyz → are downstream deps ready? Used by a load balancer / service
13
+ * mesh to decide whether to add this pod to rotation. Returns 503
14
+ * when the DB or MQTT broker is not reachable so the LB drains
15
+ * traffic until the coordinator can actually serve it.
16
+ *
17
+ * - /health → backwards-compat alias for /livez. The original stub returned
18
+ * {status:"ok",version} unconditionally; preserving alive-only
19
+ * semantics keeps existing dashboards and uptime probes green
20
+ * without forcing them to migrate.
21
+ */
22
// Captured once at module load so /livez can report process uptime.
const STARTED_AT_MS = Date.now();
// Resolved once — the version cannot change while the process is running.
const VERSION = getVersion();
/** Whole seconds elapsed since this module was loaded. */
function uptimeSeconds() {
    const elapsedMs = Date.now() - STARTED_AT_MS;
    return Math.floor(elapsedMs / 1000);
}
27
/**
 * Liveness probe — the process is alive. Always 200 and never checks
 * downstream deps, so orchestrators don't restart the pod over transient
 * downstream failures.
 */
export function handleLivez(_req, res) {
    const payload = {
        status: "alive",
        uptime_seconds: uptimeSeconds(),
        version: VERSION,
    };
    json(res, payload);
}
38
/**
 * Readiness probe — every downstream dep must be green before the LB routes
 * traffic here; a 503 drains the pod until it can actually serve.
 *
 * Each check is wrapped in try/catch so a thrown DB/MQTT error becomes a
 * structured `{ok:false,error:"…"}` rather than a 500. The payload shape is
 * identical for 200 and 503 so consumers can parse uniformly.
 */
export function handleReadyz(_req, res, services) {
    const checks = { db: { ok: false }, mqtt: { ok: false } };
    try {
        // Cheapest round-trip that exercises the connection without touching
        // application tables. Throws if the handle is closed or the file is
        // locked beyond busy_timeout.
        getDb().prepare("SELECT 1").get();
        checks.db.ok = true;
    }
    catch (err) {
        checks.db.error = err.message;
    }
    try {
        if (services.mqttBridge.isConnected())
            checks.mqtt.ok = true;
        else
            checks.mqtt.error = "not connected";
    }
    catch (err) {
        checks.mqtt.error = err.message;
    }
    const ready = checks.db.ok && checks.mqtt.ok;
    json(res, { status: ready ? "ready" : "not_ready", checks }, ready ? 200 : 503);
}
78
/**
 * Backwards-compatible alias for /livez. The original /health returned a
 * fixed {status:"ok",version} payload with no dep checks — semantically a
 * liveness probe — so delegate. Anything polling /health for "is the
 * process up" keeps working unchanged.
 */
export function handleHealth(req, res) {
    return handleLivez(req, res);
}