alvin-bot 4.25.1 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +167 -0
- package/bin/cli.js +159 -4
- package/dist/index.js +39 -0
- package/dist/services/auto-diagnostic.js +228 -0
- package/dist/services/critical-notify.js +203 -0
- package/dist/services/heartbeat-file.js +65 -0
- package/dist/services/preflight.js +292 -0
- package/dist/services/self-diagnosis.js +272 -0
- package/dist/services/trends.js +309 -0
- package/dist/services/watchdog.js +47 -0
- package/package.json +1 -1
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Critical-Event Cross-Channel Notify (Self-Preservation Phase 1, feature 1D).
|
|
3
|
+
*
|
|
4
|
+
* When something genuinely critical happens — watchdog brake engaged,
|
|
5
|
+
* repeated Telegram 409s, all providers dead, disk full, memory blow-up —
|
|
6
|
+
* deliver the alert through a fallback chain so the user actually finds
|
|
7
|
+
* out even if Telegram (the primary channel) is itself the failure mode.
|
|
8
|
+
*
|
|
9
|
+
* Channel cascade — ALL fire, in order of preference:
|
|
10
|
+
* 1. File flag at ~/.alvin-bot/CRITICAL.log [durable audit trail, always written]
|
|
11
|
+
* 2. macOS native notification (osascript) [if darwin, visible immediately]
|
|
12
|
+
* 3. Telegram DM to admin (detached curl) [survives process exit via spawn+unref]
|
|
13
|
+
*
|
|
14
|
+
* Order is deliberate: we ALWAYS persist the audit (1) first, so even
|
|
15
|
+
* if the process crashes mid-notify we have a forensic record. Then we
|
|
16
|
+
* try the user-facing channels (2, 3) best-effort.
|
|
17
|
+
*
|
|
18
|
+
* The Telegram channel uses a detached child `curl` process precisely
|
|
19
|
+
* because critical events often come paired with process.exit() — most
|
|
20
|
+
* notably the watchdog brake. A normal in-process fetch() wouldn't
|
|
21
|
+
* survive parent termination. `spawn + detached + unref` does.
|
|
22
|
+
*
|
|
23
|
+
* Performance: ZERO steady-state overhead. Only the file-flag write
|
|
24
|
+
* runs at all, and only when emitCritical() is called.
|
|
25
|
+
*
|
|
26
|
+
* Opt-out:
|
|
27
|
+
* ALVIN_DISABLE_CRITICAL_NOTIFY=true → skip Tier 1/2/3 entirely
|
|
28
|
+
* ALVIN_DISABLE_SELF_PRESERVATION=true → skip ALL Phase-1 features
|
|
29
|
+
*/
|
|
30
|
+
import { spawn, execFileSync, spawnSync } from "child_process";
|
|
31
|
+
import { appendFileSync, mkdirSync } from "fs";
|
|
32
|
+
import { join } from "path";
|
|
33
|
+
import { homedir } from "os";
|
|
34
|
+
/**
 * Report whether the critical-notify feature has been switched off.
 *
 * @returns {boolean} true when ALVIN_DISABLE_CRITICAL_NOTIFY or
 *   ALVIN_DISABLE_SELF_PRESERVATION is exactly the string "true".
 */
function isDisabled() {
    const { ALVIN_DISABLE_CRITICAL_NOTIFY, ALVIN_DISABLE_SELF_PRESERVATION } = process.env;
    return [ALVIN_DISABLE_CRITICAL_NOTIFY, ALVIN_DISABLE_SELF_PRESERVATION].includes("true");
}
|
|
38
|
+
/**
 * Work out which Telegram credentials to use for the admin DM.
 *
 * @param {{botToken?: string, adminChatId?: number}} [opts] explicit overrides
 * @returns {{botToken: (string|undefined), adminChatId: (number|undefined)}}
 *   botToken falls back to the BOT_TOKEN env var; adminChatId falls back to
 *   the first comma-separated entry of ALLOWED_USERS when that entry parses
 *   as a base-10 integer.
 */
function resolveOptions(opts) {
    const env = process.env;
    const botToken = opts?.botToken ?? env.BOT_TOKEN ?? undefined;
    const explicitChatId = opts?.adminChatId;
    // Only an `undefined` chat id triggers the env fallback; any other
    // explicit value is passed through untouched.
    if (explicitChatId !== undefined || !env.ALLOWED_USERS) {
        return { botToken, adminChatId: explicitChatId };
    }
    const [firstEntry] = env.ALLOWED_USERS.split(",");
    const candidate = firstEntry.trim();
    const numericId = candidate ? parseInt(candidate, 10) : NaN;
    return {
        botToken,
        adminChatId: Number.isFinite(numericId) ? numericId : undefined,
    };
}
|
|
51
|
+
// ── Tier 3: Durable file flag — ALWAYS written first ──────────────────────
/**
 * Append a human-readable record of `event` to ~/.alvin-bot/CRITICAL.log,
 * creating the directory if needed.
 *
 * The record must not be lost just because the event object is incomplete,
 * so every field is coerced defensively: a missing severity/category/title/
 * detail or a non-Date `ts` still produces a log entry instead of throwing
 * into the catch and silently dropping the audit trail.
 *
 * @param {object} event critical event (ts, severity, category, title,
 *   detail, suggestedAction are read)
 * @returns {boolean} true when the block was appended, false on any I/O error
 */
function writeFileFlag(event) {
    try {
        const dir = join(homedir(), ".alvin-bot");
        mkdirSync(dir, { recursive: true });
        const path = join(dir, "CRITICAL.log");
        // Accept Date, epoch millis or ISO string; fall back to "now" when
        // the value is absent or does not parse to a valid date.
        const requested = new Date(event?.ts || Date.now());
        const stamp = Number.isNaN(requested.getTime()) ? new Date() : requested;
        const severity = String(event?.severity ?? "critical").toUpperCase();
        const category = String(event?.category ?? "unknown");
        const detailLines = String(event?.detail ?? "").split("\n");
        const block = [
            `[${stamp.toISOString()}] ${severity} ${category}`,
            ` ${String(event?.title ?? "")}`,
            ...detailLines.map((l) => ` ${l}`),
            ...(event?.suggestedAction ? [` Suggested: ${event.suggestedAction}`] : []),
            "",
        ].join("\n");
        appendFileSync(path, block);
        return true;
    }
    catch {
        // Disk full / permissions: the caller reports the failure via its
        // per-tier outcome, so nothing to rethrow here.
        return false;
    }
}
|
|
72
|
+
// ── Tier 2: macOS native notification (silent on Linux/Windows) ───────────
/**
 * Show a native notification through `osascript` when running on macOS.
 *
 * The message is "<title> — <first line of detail>". It is embedded in an
 * AppleScript string literal, so backslashes are escaped BEFORE double
 * quotes; escaping quotes alone lets an input such as `\"` terminate the
 * literal early and either break the script or inject AppleScript.
 *
 * @param {object} event critical event (title, detail, severity are read)
 * @returns {boolean} true when osascript ran successfully; false on other
 *   platforms or when osascript failed / timed out (3s)
 */
function macosNotification(event) {
    if (process.platform !== "darwin")
        return false;
    try {
        const escapeForAppleScript = (value) => String(value)
            .replace(/\\/g, "\\\\")
            .replace(/"/g, '\\"');
        const firstDetailLine = String(event.detail ?? "").split("\n")[0];
        const message = escapeForAppleScript(`${event.title} — ${firstDetailLine}`);
        const title = `Alvin Bot ${event.severity === "critical" ? "🚨" : "⚠️"}`;
        execFileSync("osascript", ["-e", `display notification "${message}" with title "${title}"`], { timeout: 3000, stdio: "pipe" });
        return true;
    }
    catch {
        return false;
    }
}
|
|
87
|
+
// ── Tier 1: Telegram DM to admin via curl ─────────────────────────────────
//
// Why curl in a child process instead of in-process fetch:
// - emitCritical() is sometimes called moments before process.exit()
//   (notably from the watchdog brake path). In-process async work
//   would be cancelled.
// - A detached child with stdio:'ignore' + unref() is not tied to the
//   parent's event loop; callers that exit immediately should still use
//   blockTelegram:true (see below).
// - curl is universally available on macOS + Linux. No node-only deps.
/**
 * Send the event as a plain-text Telegram message to the admin chat.
 *
 * @param {object} event critical event (severity, title, detail,
 *   suggestedAction are read)
 * @param {{botToken?: string, adminChatId?: number, blockTelegram?: boolean}} opts
 *   credentials plus delivery mode; blockTelegram=true runs curl
 *   synchronously and checks the HTTP status
 * @returns {boolean} false when credentials are missing or the send could
 *   not be started / did not return 2xx; true when the blocking send
 *   succeeded or the detached send was launched
 */
function telegramAdminDM(event, opts) {
    if (!opts?.botToken || !opts?.adminChatId)
        return false;
    // Plain text — NOT Markdown. Critical events frequently contain shell
    // commands in `suggestedAction` (paths with quotes, `&&` chains, etc.)
    // which break Telegram's Markdown parser with HTTP 400. Reliability >
    // visual prettiness for an alarm channel. The emoji prefix already
    // makes it visually obvious.
    const lines = [
        // Coerced so an event without a severity cannot throw out of the
        // notifier.
        `🚨 Alvin Bot — ${String(event?.severity ?? "critical").toUpperCase()}`,
        "",
        String(event?.title ?? ""),
        "",
        String(event?.detail ?? ""),
    ];
    if (event?.suggestedAction) {
        lines.push("", `Suggested: ${event.suggestedAction}`);
    }
    // sendMessage rejects text longer than 4096 characters with HTTP 400,
    // which would lose the whole alert. Truncate instead.
    const TELEGRAM_TEXT_LIMIT = 4096;
    let text = lines.join("\n");
    if (text.length > TELEGRAM_TEXT_LIMIT) {
        let cut = TELEGRAM_TEXT_LIMIT - 1;
        const lastUnit = text.charCodeAt(cut - 1);
        // Do not leave a dangling high surrogate at the cut point.
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff)
            cut -= 1;
        text = `${text.slice(0, cut)}…`;
    }
    const requestArgs = [
        "-X", "POST",
        "--max-time", "5",
        `https://api.telegram.org/bot${opts.botToken}/sendMessage`,
        "-d", `chat_id=${opts.adminChatId}`,
        "--data-urlencode", `text=${text}`,
    ];
    if (opts.blockTelegram) {
        // Synchronous: caller is about to process.exit(). spawnSync blocks
        // up to max-time + a small buffer, then returns. Guaranteed delivery
        // attempt — no fork-race with process termination.
        try {
            // No -s -o /dev/null here so the HTTP response is captured; it
            // is logged to stderr if Telegram returns a non-2xx.
            const verboseArgs = [...requestArgs, "-w", "HTTP=%{http_code}"];
            const result = spawnSync("curl", verboseArgs, { timeout: 7000, encoding: "utf-8" });
            const stdout = (result.stdout || "").toString();
            const stderr = (result.stderr || "").toString();
            // Diagnostic — only logs in failure path. Helps debug "DM never arrived".
            if (result.status !== 0 || !/HTTP=2\d\d/.test(stdout)) {
                // result.error is set when curl could not be started at all
                // (e.g. ENOENT) or hit the spawnSync timeout.
                const spawnError = result.error ? ` error=${result.error.message}` : "";
                console.error(`[critical-notify] telegram sync curl status=${result.status} stdout=${stdout.slice(0, 200)} stderr=${stderr.slice(0, 200)}${spawnError}`);
                return false;
            }
            return true;
        }
        catch (err) {
            console.error(`[critical-notify] telegram sync curl threw: ${err instanceof Error ? err.message : String(err)}`);
            return false;
        }
    }
    // Async detached: bot keeps running afterwards, no need to block.
    // detached + stdio:ignore + unref is the standard pattern for
    // "fire and forget". Note: NOT safe if caller calls process.exit()
    // immediately after — use blockTelegram:true for those cases.
    try {
        const child = spawn("curl", ["-s", "-o", "/dev/null", ...requestArgs], { detached: true, stdio: "ignore" });
        // A failed launch (e.g. curl not installed) is reported through an
        // asynchronous 'error' event; without a listener Node raises it as
        // an uncaught exception and takes the bot down.
        child.on("error", (err) => {
            console.error(`[critical-notify] telegram detached curl failed to start: ${err instanceof Error ? err.message : String(err)}`);
        });
        child.unref();
        return true;
    }
    catch {
        return false;
    }
}
|
|
161
|
+
/**
 * Emit a critical event across all configured channels.
 *
 * File flag + osascript run inline. Telegram is detached by default, in
 * which case `telegram: true` only means the send was launched, not that it
 * succeeded. With `opts.blockTelegram` the send runs synchronously and
 * `telegram: true` means Telegram answered with a 2xx status.
 *
 * Always safe to call. Never throws: each tier is individually guarded.
 * Worst-case blocking time is the osascript timeout (~3s) plus, when
 * `opts.blockTelegram` is set, the synchronous curl timeout (~7s).
 *
 * Outcome of each tier is also logged to stderr so users can diagnose
 * "why didn't I get the Telegram DM?" by reading their err.log.
 *
 * @param {object} event critical event passed to every tier
 * @param {{botToken?: string, adminChatId?: number, blockTelegram?: boolean}} [opts]
 * @returns {{fileFlag: boolean, macos: boolean, telegram: boolean, reachedAtLeastOne: boolean}}
 */
export function emitCritical(event, opts) {
    if (isDisabled()) {
        console.error("[critical-notify] skipped — opt-out via env var");
        return { fileFlag: false, macos: false, telegram: false, reachedAtLeastOne: false };
    }
    // A tier that throws counts as "not delivered"; it must never prevent
    // the remaining tiers from running or escape to the caller.
    const attempt = (tier) => {
        try {
            return Boolean(tier());
        }
        catch {
            return false;
        }
    };
    // Tier 3 first — most durable, cheapest.
    const fileFlag = attempt(() => writeFileFlag(event));
    // Tier 2 — macOS user-facing.
    const macos = attempt(() => macosNotification(event));
    // Tier 1 — Telegram DM (sync if caller signaled exit, else detached).
    const resolved = resolveOptions(opts);
    const blocking = Boolean(opts?.blockTelegram);
    const telegram = attempt(() => telegramAdminDM(event, { ...resolved, blockTelegram: opts?.blockTelegram }));
    // Diagnostics — written to stderr so even brake-context invocations
    // leave a paper trail in err.log. Distinguishes "credentials missing"
    // (skip) from "attempted but failed" (fail), and a confirmed blocking
    // send (sent) from a launched detached one (scheduled).
    const credentialsPresent = Boolean(resolved.botToken && resolved.adminChatId);
    const telegramLabel = telegram
        ? (blocking ? "sent" : "scheduled")
        : (credentialsPresent ? "fail" : "skip");
    console.error(`[critical-notify] event="${event?.category}" ` +
        `file=${fileFlag ? "ok" : "fail"} ` +
        `macos=${macos ? "ok" : "skip"} ` +
        `telegram=${telegramLabel}` +
        (telegram ? "" : ` (botToken=${resolved.botToken ? "set" : "missing"} adminChatId=${resolved.adminChatId ?? "missing"})`));
    return {
        fileFlag,
        macos,
        telegram,
        reachedAtLeastOne: fileFlag || macos || telegram,
    };
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Heartbeat-File Writer (Self-Preservation Phase 1, feature 2E).
|
|
3
|
+
*
|
|
4
|
+
* Writes a unix timestamp (seconds) to ~/.alvin-bot/heartbeat.txt every
|
|
5
|
+
* 60 seconds. An external launchd-managed dead-man watcher reads this
|
|
6
|
+
* file every 5 minutes — if the timestamp is older than 10 minutes,
|
|
7
|
+
* the bot is presumed frozen (event-loop deadlock, blocked I/O,
|
|
8
|
+
* unresponsive but alive process) and the watcher force-restarts via
|
|
9
|
+
* `launchctl kickstart -k`.
|
|
10
|
+
*
|
|
11
|
+
* This complements the in-process watchdog (src/services/watchdog.ts)
|
|
12
|
+
* which only catches process exits — it cannot catch "process alive
|
|
13
|
+
* but frozen" because that's exactly the state where the watchdog's
|
|
14
|
+
* own beacon writer also stops.
|
|
15
|
+
*
|
|
16
|
+
* Why a file + external watcher instead of an internal timer:
|
|
17
|
+
* - An internal "I'm frozen" timer is a contradiction in terms.
|
|
18
|
+
* If the event loop is dead, the timer doesn't fire either.
|
|
19
|
+
* - The file-based external watcher is the only architecturally
|
|
20
|
+
* sound way to detect this class of failure.
|
|
21
|
+
*
|
|
22
|
+
* Performance: file write of 11 bytes every 60s. CPU cost ~1ms/min,
|
|
23
|
+
* disk I/O ~0.7 KB/day. Truly negligible.
|
|
24
|
+
*
|
|
25
|
+
* Opt-out:
|
|
26
|
+
* ALVIN_DISABLE_DEAD_MAN=true → skip heartbeat writer
|
|
27
|
+
* ALVIN_DISABLE_SELF_PRESERVATION=true → skip all Phase-1
|
|
28
|
+
*/
|
|
29
|
+
import { writeFileSync, mkdirSync } from "fs";
|
|
30
|
+
import { join } from "path";
|
|
31
|
+
import { homedir } from "os";
|
|
32
|
+
// File the heartbeat timestamp is written to.
const HEARTBEAT_PATH = join(homedir(), ".alvin-bot", "heartbeat.txt");
// Delay between two heartbeat writes: one minute.
const HEARTBEAT_INTERVAL_MS = 60 * 1000;
// Handle of the running interval, or null while the writer is stopped.
let heartbeatTimer = null;
|
|
35
|
+
/**
 * Overwrite the heartbeat file with the current Unix time in seconds,
 * followed by a newline. Creates ~/.alvin-bot if it is missing. Errors are
 * swallowed on purpose.
 */
function writeHeartbeat() {
    try {
        const dataDir = join(homedir(), ".alvin-bot");
        mkdirSync(dataDir, { recursive: true });
        // 11 bytes — Unix seconds + newline. Easy to parse from shell.
        const unixSeconds = Math.floor(Date.now() / 1000);
        writeFileSync(HEARTBEAT_PATH, `${unixSeconds}\n`);
    }
    catch {
        // Disk full or permissions — non-fatal. The dead-man watcher will
        // see a stale file and kickstart, which is the right behaviour:
        // a bot that can't write its heartbeat IS effectively stuck.
    }
}
|
|
47
|
+
/**
 * Start the periodic heartbeat writer.
 *
 * No-op when ALVIN_DISABLE_DEAD_MAN or ALVIN_DISABLE_SELF_PRESERVATION is
 * "true", and also when the writer is already running: a second call must
 * not overwrite the stored timer handle, otherwise the first interval keeps
 * firing and can no longer be cleared by stopHeartbeatWriter().
 */
export function startHeartbeatWriter() {
    const { ALVIN_DISABLE_DEAD_MAN, ALVIN_DISABLE_SELF_PRESERVATION } = process.env;
    if (ALVIN_DISABLE_DEAD_MAN === "true" || ALVIN_DISABLE_SELF_PRESERVATION === "true") {
        return;
    }
    if (heartbeatTimer) {
        return;
    }
    // Write immediately so the dead-man watcher doesn't see a stale file
    // from the previous process incarnation.
    writeHeartbeat();
    heartbeatTimer = setInterval(writeHeartbeat, HEARTBEAT_INTERVAL_MS);
    // Allow the process to exit without waiting for this timer.
    heartbeatTimer.unref?.();
}
|
|
60
|
+
/**
 * Stop the periodic heartbeat writer. Does nothing when it is not running.
 */
export function stopHeartbeatWriter() {
    if (!heartbeatTimer) {
        return;
    }
    clearInterval(heartbeatTimer);
    heartbeatTimer = null;
}
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pre-Flight Sanity Check (Self-Preservation Phase 1, feature 1A).
|
|
3
|
+
*
|
|
4
|
+
* Runs in PARALLEL at startup, fire-and-forget: never blocks the bot's main
|
|
5
|
+
* startup sequence. Each check has a tight timeout. Results are logged with
|
|
6
|
+
* a severity classification (ok / warn / critical). Critical findings can
|
|
7
|
+
* optionally feed into the cross-channel notify pipeline (1D).
|
|
8
|
+
*
|
|
9
|
+
* Provider-agnostic: AI-provider check is routed through the active
|
|
10
|
+
* Provider's `isAvailable()` method, which every concrete provider
|
|
11
|
+
* implements — so the same check works for claude-sdk, codex-cli,
|
|
12
|
+
* groq, gemini, openai, openrouter, ollama (gemma), nvidia.
|
|
13
|
+
*
|
|
14
|
+
* Opt-out:
|
|
15
|
+
* ALVIN_DISABLE_PREFLIGHT=true → skip Pre-Flight specifically
|
|
16
|
+
* ALVIN_DISABLE_SELF_PRESERVATION=true → skip ALL Phase-1 features
|
|
17
|
+
*
|
|
18
|
+
* Performance budget (measured on Apple Silicon M-series):
|
|
19
|
+
* - Telegram getMe: typical 150-400ms, timeout 3000ms
|
|
20
|
+
* - AI Provider isAvailable: typical 50-800ms, timeout 5000ms
|
|
21
|
+
 * - SQLite PRAGMA quick_check: typical 5-50ms (the call is synchronous, so the 10000ms timeout cannot interrupt it)
|
|
22
|
+
* - df disk space: typical 5-15ms, timeout 2000ms
|
|
23
|
+
* - Total wall-clock = max of all four (Promise.all) — typically <1s
|
|
24
|
+
*/
|
|
25
|
+
import { existsSync } from "fs";
|
|
26
|
+
import { join } from "path";
|
|
27
|
+
import { homedir } from "os";
|
|
28
|
+
/**
 * Report whether the pre-flight check has been switched off.
 *
 * @returns {boolean} true when ALVIN_DISABLE_PREFLIGHT or
 *   ALVIN_DISABLE_SELF_PRESERVATION is exactly the string "true".
 */
function isDisabled() {
    const optOutVars = ["ALVIN_DISABLE_PREFLIGHT", "ALVIN_DISABLE_SELF_PRESERVATION"];
    return optOutVars.some((name) => process.env[name] === "true");
}
|
|
32
|
+
/**
 * Run a promise with a wall-clock timeout. Resolves with the promise's
 * value, or with `fallback` if the promise rejects or doesn't settle in
 * time. Never rejects.
 *
 * The input is normalised with Promise.resolve(), so a plain value (e.g. a
 * check that answered synchronously) is accepted too instead of throwing on
 * a missing `.then`. The timer is always cleared once the race is decided.
 *
 * @param {Promise<T>|T} promise work to wait for
 * @param {number} ms timeout in milliseconds
 * @param {F} fallback value used on timeout or rejection
 * @returns {Promise<T|F>}
 * @template T, F
 */
function withTimeout(promise, ms, fallback) {
    let timer;
    const timedOut = new Promise((resolve) => {
        timer = setTimeout(() => resolve(fallback), ms);
    });
    const guarded = Promise.resolve(promise).catch(() => fallback);
    return Promise.race([guarded, timedOut]).finally(() => clearTimeout(timer));
}
|
|
60
|
+
/**
 * Verify that the Telegram Bot API is reachable with the configured token
 * by calling getMe (3s budget).
 *
 * Outcomes:
 *  - no token                → ok, skipped (WebUI-only mode)
 *  - no answer within 3s     → warn (request is aborted)
 *  - request failed outright → warn, reported as a failure, not a timeout
 *  - HTTP 401 / 404          → critical, the token is most likely invalid
 *  - any other non-2xx       → warn, the API answered but says nothing
 *                              about the token (rate limit, outage, …)
 *  - 2xx                     → ok
 *
 * @param {string|undefined} botToken Telegram bot token
 * @returns {Promise<{name: string, ok: boolean, severity: string, message: string, durationMs: number}>}
 */
async function checkTelegram(botToken) {
    const start = Date.now();
    const report = (ok, severity, message) => ({
        name: "telegram",
        ok,
        severity,
        message,
        durationMs: Date.now() - start,
    });
    if (!botToken) {
        return report(true, "ok", "skipped (WebUI-only mode, no BOT_TOKEN)");
    }
    const url = `https://api.telegram.org/bot${botToken}/getMe`;
    const controller = new AbortController();
    // Rejections are mapped to a marker object here so that `null` from
    // withTimeout() unambiguously means "timed out". Only the error code /
    // name is kept for the message.
    const request = fetch(url, { signal: controller.signal }).then(async (r) => ({ ok: r.ok, status: r.status, body: await r.json().catch(() => null) }), (err) => ({ failed: true, reason: err?.cause?.code ?? err?.name ?? "error" }));
    const result = await withTimeout(request, 3000, null);
    if (!result) {
        // Don't leave the socket open after we've given up on the answer.
        controller.abort();
        return report(false, "warn", "getMe timed out (3s) — bot may have network / Telegram issues");
    }
    if (result.failed) {
        return report(false, "warn", `getMe request failed (${result.reason}) — bot may have network / Telegram issues`);
    }
    if (!result.ok) {
        const tokenRejected = result.status === 401 || result.status === 404;
        return tokenRejected
            ? report(false, "critical", `getMe HTTP ${result.status} — token may be invalid`)
            : report(false, "warn", `getMe HTTP ${result.status} — Telegram API error, token not verified`);
    }
    const username = result.body?.result?.username;
    return report(true, "ok", username ? `bot=@${username}` : "bot reachable");
}
|
|
100
|
+
/**
 * Ask the registry's active provider whether it is available (5s budget).
 *
 * Never rejects: a provider whose isAvailable() throws synchronously, or
 * returns a plain value instead of a promise, is handled like any other
 * "not reachable" answer, so one faulty provider cannot abort the whole
 * pre-flight run.
 *
 * @param {{getActive: Function, getActiveKey: Function}|null|undefined} registry
 * @returns {Promise<{name: string, ok: boolean, severity: string, message: string, durationMs: number}>}
 *   severity is "ok" when available and "warn" in every other case
 */
async function checkAiProvider(registry) {
    const start = Date.now();
    const report = (ok, message) => ({
        name: "ai-provider",
        ok,
        severity: ok ? "ok" : "warn",
        message,
        durationMs: Date.now() - start,
    });
    if (!registry) {
        return report(false, "no provider configured (AI features will be disabled)");
    }
    let provider;
    let activeKey = "(unknown)";
    try {
        provider = registry.getActive();
        activeKey = registry.getActiveKey();
    }
    catch {
        return report(false, "no active provider in registry");
    }
    if (!provider) {
        return report(false, "no active provider in registry");
    }
    // Invoke inside a promise chain so a synchronous throw becomes a
    // rejection, which withTimeout() turns into the `false` fallback.
    const probe = Promise.resolve().then(() => provider.isAvailable());
    const available = Boolean(await withTimeout(probe, 5000, false));
    return report(available, available
        ? `${activeKey} reachable`
        : `${activeKey} not reachable / not configured — bot will degrade gracefully on AI calls`);
}
|
|
146
|
+
/**
 * Run `PRAGMA quick_check` against the embeddings database, if it exists.
 *
 * Outcomes:
 *  - DB file missing                    → ok (created lazily on first use)
 *  - better-sqlite3 cannot be loaded    → ok, check skipped
 *  - quick_check returns "ok"           → ok
 *  - quick_check returns anything else  → critical
 *  - open/prepare fails with a SQLITE_CORRUPT* or SQLITE_NOTADB code
 *                                       → critical (previously misreported
 *                                         as "check skipped")
 *  - any other error                    → ok, check skipped
 *
 * The query API used here is synchronous, so no timeout is applied: a timer
 * cannot fire while the call is running. The handle is closed in `finally`
 * so it is released on the error paths as well.
 *
 * @returns {Promise<{name: string, ok: boolean, severity: string, message: string, durationMs: number}>}
 */
async function checkSqliteIntegrity() {
    const start = Date.now();
    const report = (ok, severity, message) => ({
        name: "sqlite",
        ok,
        severity,
        message,
        durationMs: Date.now() - start,
    });
    const firstLine = (err) => (err instanceof Error ? err.message : String(err)).split("\n")[0];
    const dbPath = join(homedir(), ".alvin-bot", "memory", ".embeddings.db");
    if (!existsSync(dbPath)) {
        return report(true, "ok", "embeddings DB not yet created (lazily on first use)");
    }
    let Database;
    try {
        const { createRequire } = await import("module");
        Database = createRequire(import.meta.url)("better-sqlite3");
    }
    catch (err) {
        // Driver not installed / not loadable: nothing we can verify.
        return report(true, "ok", `check skipped: ${firstLine(err)}`);
    }
    let db;
    try {
        db = new Database(dbPath, { readonly: true });
        // PRAGMA quick_check is materially faster than integrity_check
        // (catches the same classes of corruption but doesn't verify every
        // page). For our purpose — "is the file readable + structurally
        // sane?" — quick_check is the right tool.
        const row = db.prepare("PRAGMA quick_check").get();
        const checkResult = row?.quick_check || "(unknown)";
        const ok = checkResult === "ok";
        return report(ok, ok ? "ok" : "critical", ok ? "embeddings DB integrity ok" : `embeddings DB integrity FAILED: ${checkResult}`);
    }
    catch (err) {
        // NOTE(review): assumes the driver exposes SQLite result codes as a
        // string `code` property on thrown errors — confirm against the
        // installed better-sqlite3 version.
        const code = typeof err?.code === "string" ? err.code : "";
        if (code.startsWith("SQLITE_CORRUPT") || code === "SQLITE_NOTADB") {
            return report(false, "critical", `embeddings DB integrity FAILED: ${firstLine(err)}`);
        }
        return report(true, "ok", `check skipped: ${firstLine(err)}`);
    }
    finally {
        try {
            db?.close();
        }
        catch {
            // Closing is best-effort; the verdict above already stands.
        }
    }
}
|
|
200
|
+
/**
 * Report free disk space on the volume holding ~/.alvin-bot.
 *
 * Severity: "critical" below 512 MiB free, "warn" below 1 GiB, else "ok".
 * If `df` cannot be run or its output cannot be parsed the check is
 * reported as ok/skipped rather than raising a false alarm.
 *
 * `df` is run through execFile (no shell, path passed as an argument, does
 * not block the event loop) with `-P` so every filesystem is printed on a
 * single line; without it a long device name can wrap and shift the
 * "Available" column.
 *
 * @returns {Promise<{name: string, ok: boolean, severity: string, message: string, durationMs: number}>}
 */
async function checkDiskSpace() {
    const start = Date.now();
    const report = (severity, message) => ({
        name: "disk",
        ok: severity === "ok",
        severity,
        message,
        durationMs: Date.now() - start,
    });
    try {
        const { execFile } = await import("child_process");
        const dataDir = join(homedir(), ".alvin-bot");
        const out = await new Promise((resolve, reject) => {
            execFile("df", ["-kP", dataDir], { encoding: "utf-8", timeout: 2000 }, (err, stdout) => (err ? reject(err) : resolve(stdout)));
        });
        const lines = String(out).trim().split("\n");
        const data = lines[lines.length - 1].trim().split(/\s+/);
        // df -kP output: Filesystem 1024-blocks Used Available Capacity Mounted on
        const availableKB = Number.parseInt(data[3], 10);
        if (!Number.isFinite(availableKB)) {
            return report("ok", "could not parse df output");
        }
        const availableGB = availableKB / 1024 / 1024;
        const severity = availableKB < 512 * 1024 ? "critical" :
            availableKB < 1024 * 1024 ? "warn" :
                "ok";
        return report(severity, `${availableGB.toFixed(2)} GB free`);
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        return report("ok", `check skipped: ${message.split("\n")[0]}`);
    }
}
|
|
242
|
+
/**
 * Run the full pre-flight suite in parallel. Always resolves (never
 * throws). Returns a structured report so the caller can decide how
 * to react.
 *
 * Each check is individually guarded: if one throws or rejects it is
 * recorded as a "warn" result for that check instead of rejecting the whole
 * Promise.all and discarding the other results.
 *
 * @param {string|undefined} botToken Telegram bot token, forwarded to the Telegram check
 * @param {object|null|undefined} registry provider registry, forwarded to the AI-provider check
 * @returns {Promise<{results: object[], slowestMs: number, totalMs: number, anyCritical: boolean, anyWarning: boolean, skipped: boolean}>}
 */
export async function runPreFlight(botToken, registry) {
    if (isDisabled()) {
        return {
            results: [],
            slowestMs: 0,
            totalMs: 0,
            anyCritical: false,
            anyWarning: false,
            skipped: true,
        };
    }
    const start = Date.now();
    const guarded = (name, run) => {
        const began = Date.now();
        return Promise.resolve().then(run).catch((err) => ({
            name,
            ok: false,
            severity: "warn",
            message: `check crashed: ${(err instanceof Error ? err.message : String(err)).split("\n")[0]}`,
            durationMs: Date.now() - began,
        }));
    };
    const results = await Promise.all([
        guarded("telegram", () => checkTelegram(botToken)),
        guarded("ai-provider", () => checkAiProvider(registry)),
        guarded("sqlite", () => checkSqliteIntegrity()),
        guarded("disk", () => checkDiskSpace()),
    ]);
    return {
        results,
        slowestMs: Math.max(...results.map((r) => r.durationMs)),
        totalMs: Date.now() - start,
        anyCritical: results.some((r) => r.severity === "critical"),
        anyWarning: results.some((r) => r.severity === "warn"),
        skipped: false,
    };
}
|
|
274
|
+
/**
 * Render a pre-flight report as console text: one headline carrying the
 * overall verdict and total duration, followed by one line per check with
 * its severity icon, padded name, message and duration.
 *
 * @param {{skipped: boolean, anyCritical: boolean, anyWarning: boolean, totalMs: number, results: object[]}} report
 * @returns {string}
 */
export function formatPreFlightReport(report) {
    if (report.skipped) {
        return "🩺 Pre-Flight: skipped (ALVIN_DISABLE_PREFLIGHT=true)";
    }
    const icons = { ok: "✓", warn: "⚠", critical: "❌" };
    let headline;
    if (report.anyCritical) {
        headline = "❌ Pre-Flight: critical issues";
    }
    else if (report.anyWarning) {
        headline = "⚠️ Pre-Flight: warnings";
    }
    else {
        headline = "✅ Pre-Flight: all checks ok";
    }
    const body = report.results
        .map(({ severity, name, message, durationMs }) => ` ${icons[severity]} ${name.padEnd(12)} ${message} (${durationMs}ms)`)
        .join("\n");
    return `🩺 ${headline} — ${report.totalMs}ms total\n${body}`;
}
|