alvin-bot 4.25.1 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +167 -0
- package/bin/cli.js +159 -4
- package/dist/index.js +39 -0
- package/dist/services/auto-diagnostic.js +228 -0
- package/dist/services/critical-notify.js +203 -0
- package/dist/services/heartbeat-file.js +65 -0
- package/dist/services/preflight.js +292 -0
- package/dist/services/self-diagnosis.js +272 -0
- package/dist/services/trends.js +309 -0
- package/dist/services/watchdog.js +47 -0
- package/package.json +1 -1
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AI-driven Self-Diagnosis (Self-Preservation Phase 2, feature 3I).
|
|
3
|
+
*
|
|
4
|
+
* Trigger model — IMPORTANT:
|
|
5
|
+
*
|
|
6
|
+
* 3I does NOT run at watchdog-brake time. The bot is mid-exit then;
|
|
7
|
+
* spawning a fresh process just to make one AI call would be heavy
|
|
8
|
+
* and racy. Instead, 3I runs at the next successful bot start:
|
|
9
|
+
* it scans ~/.alvin-bot/diagnostics/ for forensic bundles that don't
|
|
10
|
+
* yet have a sidecar .analysis.md file, runs AI analysis on each,
|
|
11
|
+
* writes the sidecar, and delivers a Telegram summary via 1D.
|
|
12
|
+
*
|
|
13
|
+
* User-visible consequence: when the bot recovers from a brake, the
|
|
14
|
+
* first thing it does after Pre-Flight is analyze why it crashed —
|
|
15
|
+
* the operator gets the diagnosis on their phone within ~30 s of
|
|
16
|
+
* the bot coming back up.
|
|
17
|
+
*
|
|
18
|
+
* Provider-agnostic — uses the active Provider's query() async generator,
|
|
19
|
+
* works for claude-sdk / codex-cli / groq / gemini / openai / offline-gemma4.
|
|
20
|
+
* The prompt is deliberately tight (~250 tokens base + bundle content,
|
|
21
|
+
* truncated to ~12 KB) so even small-context models can handle it.
|
|
22
|
+
*
|
|
23
|
+
* Output shape — we force a structured plain-text response (no JSON;
|
|
24
|
+
* JSON parsing reliability is uneven across providers, especially with
|
|
25
|
+
* smaller models). The 5-line format is hard to mess up:
|
|
26
|
+
*
|
|
27
|
+
* HYPOTHESIS: ...
|
|
28
|
+
* ROOT_CAUSE_CATEGORY: ...
|
|
29
|
+
* REMEDIATION: ...
|
|
30
|
+
* CONFIDENCE: HIGH|MEDIUM|LOW
|
|
31
|
+
* EXPLANATION: ...
|
|
32
|
+
*
|
|
33
|
+
* Privacy: forensic bundles are already curated by 2F to exclude
|
|
34
|
+
* secrets (BOT_TOKEN, API keys); only whitelisted non-secret env vars
|
|
35
|
+
* are included. So the AI request contains: bot version, logs, env
|
|
36
|
+
* keys (non-secret), tool inventory, disk state. No tokens leave the
|
|
37
|
+
* machine.
|
|
38
|
+
*
|
|
39
|
+
* Auto-remediation policy (v1, intentionally conservative):
|
|
40
|
+
* - We NEVER auto-apply any remediation. The AI's REMEDIATION line
|
|
41
|
+
* is shown to the operator as a suggestion only.
|
|
42
|
+
* - Operator runs it manually if it looks right.
|
|
43
|
+
* - This will likely relax in a future release once we build
|
|
44
|
+
* confidence in the AI's track record per remediation category.
|
|
45
|
+
*
|
|
46
|
+
* Opt-out:
|
|
47
|
+
* ALVIN_DISABLE_SELF_DIAGNOSIS=true → skip 3I specifically
|
|
48
|
+
* ALVIN_DISABLE_SELF_PRESERVATION=true → skip ALL Phase-1/2
|
|
49
|
+
*/
|
|
50
|
+
import { existsSync, readFileSync, writeFileSync, readdirSync } from "fs";
|
|
51
|
+
import { join } from "path";
|
|
52
|
+
import { homedir } from "os";
|
|
53
|
+
import { emitCritical } from "./critical-notify.js";
|
|
54
|
+
const PROMPT_TEMPLATE = `You are an SRE assistant. An Alvin Bot instance hit a critical failure.
|
|
55
|
+
Below is the forensic dump. Read it, then respond in EXACTLY this 5-line format —
|
|
56
|
+
no markdown, no commentary, no extra lines:
|
|
57
|
+
|
|
58
|
+
HYPOTHESIS: <one short sentence: what likely went wrong>
|
|
59
|
+
ROOT_CAUSE_CATEGORY: <pick ONE: config-error | resource-exhaustion | external-failure | code-bug | environment-conflict | unknown>
|
|
60
|
+
REMEDIATION: <one shell command the operator can run, OR "no automated action available">
|
|
61
|
+
CONFIDENCE: <HIGH | MEDIUM | LOW>
|
|
62
|
+
EXPLANATION: <2-4 sentences explaining your reasoning, plain text>
|
|
63
|
+
|
|
64
|
+
--- FORENSIC DUMP ---
|
|
65
|
+
{BUNDLE_CONTENT}
|
|
66
|
+
--- END OF DUMP ---`;
|
|
67
|
+
/**
 * Report whether AI self-diagnosis has been opted out of via environment.
 *
 * @returns {boolean} true when either ALVIN_DISABLE_SELF_DIAGNOSIS or
 *   ALVIN_DISABLE_SELF_PRESERVATION is set to the exact string "true".
 */
function isDisabled() {
  const optOutFlags = [
    "ALVIN_DISABLE_SELF_DIAGNOSIS",
    "ALVIN_DISABLE_SELF_PRESERVATION",
  ];
  return optOutFlags.some((flag) => process.env[flag] === "true");
}
|
|
71
|
+
/**
 * Parse the provider's labelled plain-text reply into a diagnosis record.
 *
 * Each field is the text following `KEY:` at the start of a line, up to the
 * next line that starts with one of the five known keys, or to the end of
 * the reply. Values may therefore span several lines.
 *
 * @param {string} text - raw reply from the provider.
 * @returns {{hypothesis: string, rootCauseCategory: string, remediation: string,
 *   confidence: string, explanation: string, raw: string}} Missing fields are
 *   filled with placeholder strings; `confidence` is one of HIGH / MEDIUM /
 *   LOW, or "UNKNOWN" when the reply does not contain a recognisable level.
 */
function parseAIResponse(text) {
  const KNOWN_KEYS = "HYPOTHESIS|ROOT_CAUSE_CATEGORY|REMEDIATION|CONFIDENCE|EXPLANATION";
  const get = (key) => {
    // The value ends at the next `^KEY:` line or at the true end of input.
    // `(?![\s\S])` is used for end-of-input because a bare `$` under the `m`
    // flag also matches at the end of the FIRST line, which would cut every
    // multi-line value down to a single line.
    const re = new RegExp(
      `^${key}:\\s*([\\s\\S]*?)(?=^(?:${KNOWN_KEYS}):|(?![\\s\\S]))`,
      "m",
    );
    const m = text.match(re);
    return m ? m[1].trim() : "";
  };
  // Take the first alphabetic word so "HIGH.", "**high**" or "<HIGH>" still
  // resolve to a level instead of falling through to UNKNOWN.
  const conf = (get("CONFIDENCE").match(/[A-Za-z]+/)?.[0] ?? "").toUpperCase();
  return {
    hypothesis: get("HYPOTHESIS") || "(no hypothesis returned)",
    rootCauseCategory: get("ROOT_CAUSE_CATEGORY") || "unknown",
    remediation: get("REMEDIATION") || "(no remediation)",
    confidence: ["HIGH", "MEDIUM", "LOW"].includes(conf) ? conf : "UNKNOWN",
    explanation: get("EXPLANATION") || "(no explanation)",
    raw: text,
  };
}
|
|
88
|
+
/**
 * Truncate the forensic bundle to fit within small-context windows.
 *
 * Keeps the first 2000 characters (event detail, process state, env) and
 * as much of the end of the text as the remaining budget allows (recent
 * logs, where the actual error usually surfaces). With the default budget
 * of 12 000 that is the last 9950 characters; 50 characters are reserved
 * for the elision marker inserted between the two parts.
 *
 * @param {string} text - full bundle content.
 * @param {number} [maxChars=12000] - character budget for the result.
 * @returns {string} `text` unchanged when it already fits, otherwise
 *   head + marker + tail. The marker states how many characters were
 *   actually dropped.
 */
function truncateBundle(text, maxChars = 12_000) {
  if (text.length <= maxChars) {
    return text;
  }
  const HEAD_CHARS = 2000;
  const MARKER_RESERVE = 50;
  // Clamp both parts so a small budget can never turn into slice(-0) or a
  // positive slice index, either of which would return (nearly) the whole text.
  const headLen = Math.max(0, Math.min(HEAD_CHARS, maxChars));
  const tailLen = Math.max(0, maxChars - headLen - MARKER_RESERVE);
  const head = text.slice(0, headLen);
  const tail = tailLen > 0 ? text.slice(-tailLen) : "";
  const elided = text.length - headLen - tailLen;
  return `${head}\n\n[... ${elided} chars elided ...]\n\n${tail}`;
}
|
|
100
|
+
/**
 * Run AI analysis on a single forensic bundle.
 *
 * @param {string} bundlePath - path of the forensic bundle to analyse.
 * @param {object} registry - provider registry; getActive() and
 *   getActiveKey() are called on it.
 * @param {{timeoutMs?: number}} [opts] - `timeoutMs` (default 120 000)
 *   is the delay after which the abort signal handed to the provider fires.
 * @returns {Promise<object|null>} the fields from parseAIResponse() plus
 *   `durationMs` and `provider`, or null when the feature is opted out,
 *   the bundle is missing or unreadable, no provider is active, the
 *   provider yields an error chunk or throws, or the reply is empty.
 *   A reply that ignores the requested format is NOT null: the parser
 *   fills in placeholder values.
 *
 * Side effect: writes a `.analysis.md` sidecar next to the bundle
 * (mode 0600). A failed sidecar write is logged and otherwise ignored.
 */
export async function analyzeBundle(bundlePath, registry, opts = {}) {
  if (isDisabled()) {
    return null;
  }
  if (!existsSync(bundlePath)) {
    return null;
  }
  let provider;
  let activeKey = "(unknown)";
  try {
    provider = registry.getActive();
    activeKey = registry.getActiveKey();
  } catch {
    return null;
  }
  if (!provider) {
    return null;
  }
  const t0 = Date.now();
  let bundleRaw;
  try {
    bundleRaw = readFileSync(bundlePath, "utf-8");
  } catch {
    // The bundle vanished or became unreadable after the existsSync check.
    return null;
  }
  const bundleClipped = truncateBundle(bundleRaw);
  // Replacer function, not a replacement string: with a string, sequences
  // such as "$&" or "$'" occurring in log text would be expanded by
  // String.prototype.replace and corrupt the prompt.
  const prompt = PROMPT_TEMPLATE.replace("{BUNDLE_CONTENT}", () => bundleClipped);
  // Timeout — fires the abort signal so a hung provider call does not
  // wedge the bot startup forever. Stream may be aborted mid-flight.
  // NOTE(review): this only helps if the provider honours abortSignal.
  const timeoutMs = opts.timeoutMs ?? 120_000;
  const abortController = new AbortController();
  const timer = setTimeout(() => abortController.abort(), timeoutMs);
  let fullText = "";
  try {
    for await (const chunk of provider.query({
      prompt,
      systemPrompt: "You are a precise SRE assistant. Reply ONLY in the requested format.",
      abortSignal: abortController.signal,
    })) {
      if (chunk.type === "text") {
        // Incremental chunks append; a chunk carrying full text replaces.
        if (chunk.delta) {
          fullText += chunk.delta;
        } else if (chunk.text) {
          fullText = chunk.text;
        }
      } else if (chunk.type === "error") {
        return null;
      } else if (chunk.type === "done") {
        if (chunk.text) {
          fullText = chunk.text;
        }
        break;
      }
    }
  } catch {
    return null;
  } finally {
    // Single cleanup point for every exit path of the streaming loop.
    clearTimeout(timer);
  }
  if (!fullText.trim()) {
    return null;
  }
  const parsed = parseAIResponse(fullText);
  const result = {
    ...parsed,
    durationMs: Date.now() - t0,
    provider: activeKey,
  };
  // Write sidecar — overwrites if user re-runs analysis. When the bundle
  // path does not end in ".md" the suffix is appended instead of swapped,
  // so the sidecar path can never equal the bundle path and clobber it.
  const sidecarPath = /\.md$/.test(bundlePath)
    ? bundlePath.replace(/\.md$/, ".analysis.md")
    : `${bundlePath}.analysis.md`;
  try {
    writeFileSync(sidecarPath, formatAnalysis(result, bundlePath), { mode: 0o600 });
  } catch (err) {
    // Sidecar write failed — non-fatal, we still return the result.
    console.warn(`self-diagnosis: could not write ${sidecarPath}: ${err instanceof Error ? err.message : String(err)}`);
  }
  return result;
}
|
|
176
|
+
/**
 * Render an analysis result as the Markdown text of the sidecar file.
 *
 * @param {object} r - result carrying provider, durationMs, hypothesis,
 *   rootCauseCategory, remediation, confidence, explanation and raw.
 * @param {string} bundlePath - path of the analysed bundle, shown in the header.
 * @returns {string} Markdown document ending with a newline; the
 *   "Generated" line carries the current time in ISO-8601 form.
 */
function formatAnalysis(r, bundlePath) {
  const fence = "```";
  const metadata = [
    `**Bundle:** \`${bundlePath}\``,
    `**Generated:** ${new Date().toISOString()}`,
    `**Provider:** ${r.provider}`,
    `**Duration:** ${r.durationMs} ms`,
  ];
  // Each section is heading + body lines; a blank line is added after each.
  const sections = [
    ["## Hypothesis", r.hypothesis],
    ["## Root Cause Category", `\`${r.rootCauseCategory}\``],
    [
      "## Suggested Remediation",
      `${fence}bash`,
      r.remediation,
      fence,
      "> **Note:** the bot does NOT auto-apply this. Run it yourself only if it makes sense.",
    ],
    ["## Confidence", `**${r.confidence}**`],
    ["## Explanation", r.explanation],
  ];
  const lines = ["# AI Self-Diagnosis", "", ...metadata, ""];
  for (const section of sections) {
    lines.push(...section, "");
  }
  lines.push("---", "", "### Raw AI response", fence, r.raw, fence, "");
  return lines.join("\n");
}
|
|
212
|
+
/**
 * List the diagnostic bundles under ~/.alvin-bot/diagnostics/ that have no
 * `.analysis.md` sidecar next to them yet. Used by the startup scanner.
 *
 * @returns {string[]} full paths of `*.md` bundles still awaiting analysis,
 *   in directory-listing order; empty when the directory is missing or
 *   cannot be read.
 */
function findUnanalyzedBundles() {
  const diagnosticsDir = join(homedir(), ".alvin-bot", "diagnostics");
  if (!existsSync(diagnosticsDir)) {
    return [];
  }
  const pending = [];
  try {
    for (const name of readdirSync(diagnosticsDir)) {
      // Only bundles qualify — sidecars themselves also end in ".md".
      if (!name.endsWith(".md") || name.endsWith(".analysis.md")) {
        continue;
      }
      const sidecar = join(diagnosticsDir, name.replace(/\.md$/, ".analysis.md"));
      if (existsSync(sidecar)) {
        continue;
      }
      pending.push(join(diagnosticsDir, name));
    }
  } catch {
    return [];
  }
  return pending;
}
|
|
230
|
+
/**
 * Analyse every bundle that has no sidecar yet, one after another, in
 * sorted path order. Meant to be called once at bot startup, in the
 * background.
 *
 * For each bundle that yields a result, one notification is emitted
 * through emitCritical() with severity "warn", carrying the hypothesis,
 * explanation, sidecar path and suggested remediation. Bundles whose
 * analysis returns nothing, or throws, are logged and skipped.
 *
 * @param {object} registry - provider registry handed to analyzeBundle().
 * @returns {Promise<void>}
 */
export async function runStartupAnalyzer(registry) {
  if (isDisabled() || !registry) {
    return;
  }
  const pending = findUnanalyzedBundles();
  if (pending.length === 0) {
    return;
  }
  // Process oldest first (FIFO so the operator sees them in order)
  pending.sort();
  console.log(`🧠 Self-diagnosis: ${pending.length} unanalyzed bundle(s) found — analyzing...`);
  for (const bundlePath of pending) {
    try {
      const diagnosis = await analyzeBundle(bundlePath, registry);
      if (diagnosis) {
        const bundleName = bundlePath.split("/").pop();
        console.log(` ✓ ${bundleName} → ${diagnosis.rootCauseCategory} (${diagnosis.confidence}, ${diagnosis.durationMs}ms via ${diagnosis.provider})`);
        const sidecarPath = bundlePath.replace(/\.md$/, ".analysis.md");
        // Deliver via 1D — severity warn (informational), not critical
        emitCritical({
          category: "custom",
          severity: "warn",
          title: `AI diagnosis ready — ${diagnosis.rootCauseCategory} (${diagnosis.confidence} confidence)`,
          detail: [
            `Hypothesis: ${diagnosis.hypothesis}`,
            `Explanation: ${diagnosis.explanation}`,
            `Full analysis: ${sidecarPath}`,
          ].join("\n\n"),
          suggestedAction: diagnosis.remediation,
        });
      } else {
        console.warn(` ⚠ ${bundlePath}: analysis returned no result (provider error or opt-out)`);
      }
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(` ⚠ ${bundlePath}: analyzer threw — ${reason}`);
    }
  }
}
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Predictive Maintenance via Trends (Self-Preservation Phase 2, 3J).
|
|
3
|
+
*
|
|
4
|
+
* Mechanism:
|
|
5
|
+
* 1. Once every 24 h: snapshot lightweight health metrics and append
|
|
6
|
+
* one JSON line to ~/.alvin-bot/state/trends.jsonl
|
|
7
|
+
* 2. After the file has ≥ 7 days of data: also run a daily AI
|
|
8
|
+
* "anomaly detection" pass over the last 30 days of snapshots
|
|
9
|
+
* 3. If the AI flags a concerning trend → DM operator via 1D
|
|
10
|
+
*
|
|
11
|
+
* Why daily and not continuous: trends are about slow degradation
|
|
12
|
+
* (memory growth, error-rate increase, crash-frequency drift). A
|
|
13
|
+
* 24-hour sampling cadence is right for these timescales and keeps
|
|
14
|
+
* the storage + AI-call cost near zero.
|
|
15
|
+
*
|
|
16
|
+
* Storage format — JSONL, one line per day:
|
|
17
|
+
*
|
|
18
|
+
* {"ts": "2026-05-13T04:00:00Z", "uptime_s": 86400, "rss_mb": 105,
|
|
19
|
+
* "crashes_24h": 0, "diag_24h": 0, "errors_24h": 3,
|
|
20
|
+
* "provider": "claude-sdk", "version": "5.0.0"}
|
|
21
|
+
*
|
|
22
|
+
* The JSONL design is deliberate: easy to inspect with tail/head/awk,
|
|
23
|
+
* easy to truncate to last N days, no parsing pitfalls. The AI gets
|
|
24
|
+
* the whole tail as plain text — works with small-context models.
|
|
25
|
+
*
|
|
26
|
+
* Provider-agnostic — same engine.query() pipeline as 3I.
|
|
27
|
+
*
|
|
28
|
+
* Opt-out:
|
|
29
|
+
* ALVIN_DISABLE_TRENDS=true → skip 3J entirely
|
|
30
|
+
* ALVIN_DISABLE_SELF_PRESERVATION=true → skip all Phase-1/2
|
|
31
|
+
*
|
|
32
|
+
* Tunable for testing:
|
|
33
|
+
* ALVIN_TRENDS_INTERVAL_HOURS=24 → snapshot cadence
|
|
34
|
+
* ALVIN_TRENDS_AI_AFTER_DAYS=7 → days of data before AI analysis kicks in
|
|
35
|
+
*/
|
|
36
|
+
import { appendFileSync, existsSync, mkdirSync, readdirSync, readFileSync, statSync } from "fs";
|
|
37
|
+
import { join, dirname } from "path";
|
|
38
|
+
import { homedir } from "os";
|
|
39
|
+
import { BOT_VERSION } from "../version.js";
|
|
40
|
+
import { emitCritical } from "./critical-notify.js";
|
|
41
|
+
// File the snapshots are appended to, one JSON object per line.
const TRENDS_PATH = join(homedir(), ".alvin-bot", "state", "trends.jsonl");
// Snapshot cadence used when ALVIN_TRENDS_INTERVAL_HOURS is unset,
// unparseable, or 0.
const DEFAULT_INTERVAL_HOURS = 24;
// Minimum number of stored snapshots before the AI pass runs, used when
// ALVIN_TRENDS_AI_AFTER_DAYS is unset, unparseable, or 0.
const DEFAULT_AI_THRESHOLD_DAYS = 7;
// NOTE(review): not referenced anywhere else in this module — trends.jsonl
// is only ever appended to, so no retention limit is enforced. Confirm
// whether pruning was intended.
const MAX_RETAIN_DAYS = 90;
// Handle of the repeating snapshot timer; null while none is scheduled.
let trendsTimer = null;
|
|
46
|
+
/**
 * Report whether trend collection has been opted out of via environment.
 *
 * @returns {boolean} true when either ALVIN_DISABLE_TRENDS or
 *   ALVIN_DISABLE_SELF_PRESERVATION is set to the exact string "true".
 */
function isDisabled() {
  const optOutFlags = ["ALVIN_DISABLE_TRENDS", "ALVIN_DISABLE_SELF_PRESERVATION"];
  return optOutFlags.some((flag) => process.env[flag] === "true");
}
|
|
50
|
+
/**
 * Count the lines of a log file under ~/.alvin-bot/logs/ that were written
 * in the last 24 hours.
 *
 * Only lines that begin with an ISO-8601 UTC timestamp including
 * fractional seconds (e.g. 2026-05-13T04:00:00.123Z) are considered;
 * anything else is skipped.
 *
 * @param {string} filename - log file name inside the logs directory.
 * @param {RegExp} [pattern] - when given, only lines matching it are counted.
 * @returns {number} number of qualifying lines; 0 when the file is missing
 *   or cannot be read.
 */
function countLogLinesLast24h(filename, pattern) {
  const path = join(homedir(), ".alvin-bot", "logs", filename);
  if (!existsSync(path)) {
    return 0;
  }
  try {
    const content = readFileSync(path, "utf-8");
    const cutoff = Date.now() - 24 * 60 * 60 * 1000;
    let count = 0;
    for (const line of content.split("\n")) {
      const tsMatch = line.match(/^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)/);
      if (!tsMatch) {
        continue;
      }
      const lineTs = new Date(tsMatch[1]).getTime();
      if (!Number.isFinite(lineTs) || lineTs < cutoff) {
        continue;
      }
      if (pattern) {
        // A /g or /y regex remembers lastIndex between test() calls, which
        // would make it skip matches on later lines. Start from 0 each time.
        pattern.lastIndex = 0;
        if (!pattern.test(line)) {
          continue;
        }
      }
      count++;
    }
    return count;
  } catch {
    return 0;
  }
}
|
|
74
|
+
/**
 * Read the daily crash counter from the watchdog state file.
 *
 * The state directory is resolved the same way the watchdog module does it:
 * `$ALVIN_DATA_DIR/state` when ALVIN_DATA_DIR is set, otherwise
 * `~/.alvin-bot/state`. Reading a fixed home-directory path would silently
 * report 0 crashes on installs that relocate the data directory.
 *
 * NOTE(review): the file name "watchdog.json" and the field name
 * "dailyCrashCount" are taken on trust — the code that writes them is not
 * visible from here.
 *
 * @returns {number} the stored counter, or 0 when the file is missing,
 *   unparseable, or does not hold a finite number in that field.
 */
function readWatchdogCrashes24h() {
  try {
    const dataDir = process.env.ALVIN_DATA_DIR || join(homedir(), ".alvin-bot");
    const path = join(dataDir, "state", "watchdog.json");
    if (!existsSync(path)) {
      return 0;
    }
    const data = JSON.parse(readFileSync(path, "utf-8"));
    const crashes = data.dailyCrashCount;
    // Keep the snapshot field numeric even if the state file is malformed.
    return typeof crashes === "number" && Number.isFinite(crashes) ? crashes : 0;
  } catch {
    return 0;
  }
}
|
|
86
|
+
/**
 * Count the diagnostic bundles under ~/.alvin-bot/diagnostics/ whose
 * modification time falls within the last 24 hours.
 *
 * `.analysis.md` sidecars are not counted. readdirSync and statSync come
 * from the module's top-level "fs" import: this file is an ES module, where
 * `require` is not defined, so an inline require() would throw and the
 * surrounding catch would turn every call into 0.
 *
 * @returns {number} number of recent bundles; 0 when the directory is
 *   missing or cannot be listed.
 */
function countDiagnosticBundlesLast24h() {
  try {
    const dir = join(homedir(), ".alvin-bot", "diagnostics");
    if (!existsSync(dir)) {
      return 0;
    }
    const cutoff = Date.now() - 24 * 60 * 60 * 1000;
    return readdirSync(dir)
      .filter((f) => f.endsWith(".md") && !f.endsWith(".analysis.md"))
      .filter((f) => {
        try {
          return statSync(join(dir, f)).mtimeMs >= cutoff;
        } catch {
          // File disappeared between listing and stat — just don't count it.
          return false;
        }
      }).length;
  } catch {
    return 0;
  }
}
|
|
108
|
+
/**
 * Collect one health snapshot of the running process.
 *
 * @param {string} activeProvider - key of the active provider, stored as-is.
 * @returns {object} record with the ISO timestamp, uptime in seconds, RSS
 *   and used heap in whole MiB, the three 24-hour counters (crashes,
 *   diagnostic bundles, error-log lines), provider and bot version.
 */
function takeSnapshot(activeProvider) {
  const { rss, heapUsed } = process.memoryUsage();
  const toMb = (bytes) => Math.round(bytes / 1024 / 1024);
  const snapshot = {
    ts: new Date().toISOString(),
    uptime_s: Math.round(process.uptime()),
    rss_mb: toMb(rss),
    heap_mb: toMb(heapUsed),
    crashes_24h: readWatchdogCrashes24h(),
    diag_24h: countDiagnosticBundlesLast24h(),
    errors_24h: countLogLinesLast24h("alvin-bot.err.log"),
    provider: activeProvider,
    version: BOT_VERSION,
  };
  return snapshot;
}
|
|
122
|
+
/**
 * Append one snapshot to the trends file as a single JSON line, creating
 * the parent directory first if needed. Any failure is ignored.
 *
 * @param {object} snap - snapshot record to serialise.
 */
function appendSnapshot(snap) {
  try {
    const stateDir = dirname(TRENDS_PATH);
    mkdirSync(stateDir, { recursive: true });
    const record = `${JSON.stringify(snap)}\n`;
    appendFileSync(TRENDS_PATH, record);
  } catch {
    // Disk full / permissions — non-fatal
  }
}
|
|
131
|
+
/**
 * Load the most recent snapshots from the trends file.
 *
 * The last `lastN` non-blank lines are taken first and parsed afterwards,
 * so lines that fail to parse reduce the number of snapshots returned.
 *
 * @param {number} [lastN=30] - how many trailing lines to consider.
 * @returns {object[]} parsed snapshots, oldest first; empty when the file
 *   is missing or unreadable.
 */
function readSnapshots(lastN = 30) {
  if (!existsSync(TRENDS_PATH)) {
    return [];
  }
  try {
    const nonBlank = readFileSync(TRENDS_PATH, "utf-8")
      .split("\n")
      .filter((l) => l.trim());
    const snapshots = [];
    for (const entry of nonBlank.slice(-lastN)) {
      let snap = null;
      try {
        snap = JSON.parse(entry);
      } catch {
        snap = null;
      }
      if (snap !== null) {
        snapshots.push(snap);
      }
    }
    return snapshots;
  } catch {
    return [];
  }
}
|
|
153
|
+
const TREND_PROMPT_TEMPLATE = `You are an SRE monitoring an Alvin Bot instance over the last days.
|
|
154
|
+
Below is one JSON line per day with the bot's daily health metrics.
|
|
155
|
+
|
|
156
|
+
Detect any CONCERNING trend that suggests slow degradation — like:
|
|
157
|
+
- Memory (rss_mb / heap_mb) growing day over day
|
|
158
|
+
- Error rate (errors_24h) climbing
|
|
159
|
+
- Crashes (crashes_24h) above 0 for multiple days
|
|
160
|
+
- Diagnostic bundles (diag_24h) > 0 repeatedly
|
|
161
|
+
|
|
162
|
+
If there is NO concerning trend, respond with EXACTLY this one line:
|
|
163
|
+
ANOMALY: NONE
|
|
164
|
+
|
|
165
|
+
If there IS a concerning trend, respond in this 3-line format — nothing else:
|
|
166
|
+
|
|
167
|
+
ANOMALY: <one short sentence — what trend you noticed>
|
|
168
|
+
SEVERITY: <warn | critical>
|
|
169
|
+
SUGGESTION: <one shell command OR observation for the operator>
|
|
170
|
+
|
|
171
|
+
--- LAST {N} DAYS OF SNAPSHOTS ---
|
|
172
|
+
{SNAPSHOTS}
|
|
173
|
+
--- END ---`;
|
|
174
|
+
/**
 * Parse the provider's reply to the trend prompt.
 *
 * @param {string} text - raw reply from the provider.
 * @returns {{anomalyDetected: boolean, description: string, severity: string,
 *   suggestion: string, raw: string}} When the reply contains a line
 *   `ANOMALY: NONE`, `anomalyDetected` is false and `severity` is "none".
 *   Otherwise `anomalyDetected` is true, `severity` is "critical" or "warn"
 *   (anything not recognisably critical becomes "warn"), and missing fields
 *   get placeholder strings.
 */
function parseTrendResponse(text) {
  // `\b` makes NONE a whole word: a description that merely starts with
  // those letters ("Nonexistent log rotation …") is a real anomaly and
  // must not be mistaken for the all-clear.
  if (/^ANOMALY:\s*NONE\b/im.test(text)) {
    return {
      anomalyDetected: false,
      description: "no concerning trend detected",
      severity: "none",
      suggestion: "",
      raw: text,
    };
  }
  const get = (key) => {
    // Only spaces/tabs may follow the colon, so an empty value cannot
    // swallow the newline and capture the next labelled line instead.
    const m = text.match(new RegExp(`^${key}:[ \\t]*(.+?)$`, "m"));
    return m ? m[1].trim() : "";
  };
  // Tolerate decoration such as "critical." or "<critical>".
  const sev = /^\W*critical\b/.test(get("SEVERITY").toLowerCase()) ? "critical" : "warn";
  return {
    anomalyDetected: true,
    description: get("ANOMALY") || "(no description)",
    severity: sev,
    suggestion: get("SUGGESTION") || "(no suggestion)",
    raw: text,
  };
}
|
|
198
|
+
/**
 * Ask the active provider whether the stored snapshots show a concerning
 * trend.
 *
 * @param {object} registry - provider registry; getActive() is called on it.
 * @param {{timeoutMs?: number}} [opts] - `timeoutMs` (default 60 000) is
 *   the delay after which the abort signal handed to the provider fires.
 * @returns {Promise<object|null>} the record from parseTrendResponse(), or
 *   null when the feature is opted out, no registry is given, fewer
 *   snapshots are stored than the threshold (ALVIN_TRENDS_AI_AFTER_DAYS,
 *   default 7), no provider is active, the provider yields an error chunk
 *   or throws, or the reply is empty.
 */
export async function analyzeTrends(registry, opts = {}) {
  if (isDisabled()) {
    return null;
  }
  if (!registry) {
    return null;
  }
  const snaps = readSnapshots(30);
  const threshold = parseInt(process.env.ALVIN_TRENDS_AI_AFTER_DAYS || "", 10) || DEFAULT_AI_THRESHOLD_DAYS;
  // The threshold is compared against the number of stored snapshots.
  if (snaps.length < threshold) {
    return null;
  }
  let provider;
  try {
    provider = registry.getActive();
  } catch {
    return null;
  }
  if (!provider) {
    return null;
  }
  const snapsBlock = snaps.map((s) => JSON.stringify(s)).join("\n");
  // Replacer functions, not replacement strings: with a string, "$&"-style
  // sequences inside snapshot values would be expanded by
  // String.prototype.replace and corrupt the prompt.
  const prompt = TREND_PROMPT_TEMPLATE
    .replace("{N}", () => String(snaps.length))
    .replace("{SNAPSHOTS}", () => snapsBlock);
  // NOTE(review): the timeout only helps if the provider honours abortSignal.
  const timeoutMs = opts.timeoutMs ?? 60_000;
  const abortController = new AbortController();
  const timer = setTimeout(() => abortController.abort(), timeoutMs);
  let fullText = "";
  try {
    for await (const chunk of provider.query({
      prompt,
      systemPrompt: "You are a precise SRE assistant. Reply ONLY in the requested format.",
      abortSignal: abortController.signal,
    })) {
      if (chunk.type === "text") {
        // Incremental chunks append; a chunk carrying full text replaces.
        if (chunk.delta) {
          fullText += chunk.delta;
        } else if (chunk.text) {
          fullText = chunk.text;
        }
      } else if (chunk.type === "error") {
        return null;
      } else if (chunk.type === "done") {
        if (chunk.text) {
          fullText = chunk.text;
        }
        break;
      }
    }
  } catch {
    return null;
  } finally {
    // Single cleanup point for every exit path of the streaming loop.
    clearTimeout(timer);
  }
  if (!fullText.trim()) {
    return null;
  }
  return parseTrendResponse(fullText);
}
|
|
253
|
+
/**
 * One collector cycle: record a snapshot, then run the AI trend pass and
 * notify the operator if it reports an anomaly.
 *
 * The snapshot is taken and appended before the AI pass is attempted.
 * Errors are logged with console.warn and never propagate to the caller.
 *
 * @param {object} [registry] - provider registry; may be absent, in which
 *   case the snapshot records the provider as "none".
 * @returns {Promise<void>}
 */
async function dailyTask(registry) {
  try {
    // Snapshot first — always, regardless of AI being available
    let providerKey;
    try {
      providerKey = registry?.getActiveKey() || "none";
    } catch {
      providerKey = "none";
    }
    const snap = takeSnapshot(providerKey);
    appendSnapshot(snap);
    console.log(`📊 Trends snapshot taken: rss=${snap.rss_mb}MB errors=${snap.errors_24h} crashes=${snap.crashes_24h}`);
    // Then attempt AI analysis if we have enough history
    const verdict = await analyzeTrends(registry);
    if (!verdict) {
      return;
    }
    if (verdict.anomalyDetected) {
      console.log(`📊 Trends AI: ANOMALY (${verdict.severity}) — ${verdict.description}`);
      const severity = verdict.severity === "critical" ? "critical" : "warn";
      emitCritical({
        category: "custom",
        severity,
        title: `Trend anomaly detected: ${verdict.description}`,
        detail: [
          `30-day trend analysis flagged a concerning pattern.`,
          `Suggestion: ${verdict.suggestion}`,
          `Trend data: ${TRENDS_PATH}`,
        ].join("\n\n"),
        suggestedAction: verdict.suggestion,
      });
    } else {
      console.log(`📊 Trends AI: no anomaly detected`);
    }
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`📊 Trends daily task threw: ${reason}`);
  }
}
|
|
290
|
+
/**
 * Start the periodic trends collector.
 *
 * The first snapshot is taken 60 s after the call, so the startup
 * transient is not measured; later ones follow every
 * ALVIN_TRENDS_INTERVAL_HOURS hours (default 24). Does nothing when the
 * feature is opted out or a collector is already scheduled.
 *
 * @param {object} registry - provider registry passed on to each cycle.
 */
export function startTrendsCollector(registry) {
  if (isDisabled()) {
    return;
  }
  // A second start would otherwise stack another timer on top of the first
  // and lose the only handle that can stop it.
  if (trendsTimer) {
    return;
  }
  const parsedHours = Number.parseInt(process.env.ALVIN_TRENDS_INTERVAL_HOURS || "", 10);
  const intervalH = parsedHours > 0 ? parsedHours : DEFAULT_INTERVAL_HOURS;
  // Node turns a delay above 2^31-1 ms (or below 1) into 1 ms, which would
  // make the collector fire continuously — cap it instead.
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const intervalMs = Math.min(intervalH * 60 * 60 * 1000, MAX_TIMER_DELAY_MS);
  // The initial timeout is kept in trendsTimer too, so a stop during the
  // first 60 s cancels it (clearInterval also clears timeouts in Node).
  trendsTimer = setTimeout(() => {
    void dailyTask(registry);
    trendsTimer = setInterval(() => void dailyTask(registry), intervalMs);
    if (trendsTimer.unref) {
      // The interval alone must not keep the process alive.
      trendsTimer.unref();
    }
  }, 60_000);
}
|
|
304
|
+
/**
 * Stop the periodic trends collector. Safe to call when none is running.
 */
export function stopTrendsCollector() {
  if (!trendsTimer) {
    return;
  }
  clearInterval(trendsTimer);
  trendsTimer = null;
}
|
|
@@ -27,6 +27,8 @@ import { resolve } from "path";
|
|
|
27
27
|
import os from "os";
|
|
28
28
|
import { execSync } from "child_process";
|
|
29
29
|
import { BOT_VERSION } from "../version.js";
|
|
30
|
+
import { emitCritical } from "./critical-notify.js";
|
|
31
|
+
import { writeDiagnosticBundle } from "./auto-diagnostic.js";
|
|
30
32
|
import { decideBrakeAction, shouldResetCrashCounter, DEFAULTS, } from "./watchdog-brake.js";
|
|
31
33
|
const DATA_DIR = process.env.ALVIN_DATA_DIR || resolve(os.homedir(), ".alvin-bot");
|
|
32
34
|
const STATE_DIR = resolve(DATA_DIR, "state");
|
|
@@ -164,6 +166,51 @@ export function startWatchdog() {
|
|
|
164
166
|
if (decision.action === "brake") {
|
|
165
167
|
console.error(`[watchdog] crash-loop brake triggered: ${decision.reason}`);
|
|
166
168
|
writeAlert(decision.reason, previous?.crashCount ?? 0);
|
|
169
|
+
// Critical-event notify (Self-Preservation Phase 1, feature 1D).
|
|
170
|
+
// emitCritical is synchronous-fast (file flag + osascript inline)
|
|
171
|
+
// and schedules a detached Telegram DM via curl that survives the
|
|
172
|
+
// process.exit(3) below — exactly the case this mechanism was
|
|
173
|
+
// built for.
|
|
174
|
+
// Auto-diagnostic (feature 2F) — collect forensic bundle BEFORE
|
|
175
|
+
// emitCritical so the Telegram DM can reference the file path.
|
|
176
|
+
let bundlePath = null;
|
|
177
|
+
try {
|
|
178
|
+
bundlePath = writeDiagnosticBundle({
|
|
179
|
+
category: "watchdog-brake",
|
|
180
|
+
severity: "critical",
|
|
181
|
+
title: "Watchdog crash-loop brake engaged",
|
|
182
|
+
detail: `${decision.reason}\n` +
|
|
183
|
+
`Bot version: ${BOT_VERSION}`,
|
|
184
|
+
suggestedAction: `rm "${ALERT_FILE}" && alvin-bot launchd install`,
|
|
185
|
+
});
|
|
186
|
+
if (bundlePath) {
|
|
187
|
+
console.error(`[auto-diagnostic] forensic bundle written: ${bundlePath}`);
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
catch (err) {
|
|
191
|
+
console.error("[watchdog] auto-diagnostic failed:", err);
|
|
192
|
+
}
|
|
193
|
+
try {
|
|
194
|
+
emitCritical({
|
|
195
|
+
category: "watchdog-brake",
|
|
196
|
+
severity: "critical",
|
|
197
|
+
title: "Watchdog crash-loop brake engaged",
|
|
198
|
+
detail: `${decision.reason}\n` +
|
|
199
|
+
`Bot version: ${BOT_VERSION}\n` +
|
|
200
|
+
`The bot has stopped itself to prevent further damage.` +
|
|
201
|
+
(bundlePath ? `\n\nDiagnostic bundle: ${bundlePath}` : ""),
|
|
202
|
+
suggestedAction: `rm "${ALERT_FILE}" && alvin-bot launchd install`,
|
|
203
|
+
}, {
|
|
204
|
+
// We're about to process.exit(3). Block on the Telegram POST
|
|
205
|
+
// synchronously — detached spawn races the exit on macOS+launchd
|
|
206
|
+
// and the alert silently never lands. Adds ~1-2 s before exit;
|
|
207
|
+
// worth it to actually inform the user their bot just braked.
|
|
208
|
+
blockTelegram: true,
|
|
209
|
+
});
|
|
210
|
+
}
|
|
211
|
+
catch (err) {
|
|
212
|
+
console.error("[watchdog] critical-notify failed:", err);
|
|
213
|
+
}
|
|
167
214
|
// checkCrashLoopBrake tries to unload the LaunchAgent so launchd stops
|
|
168
215
|
// retrying. It only runs the exit path if ALERT_FILE exists, which is
|
|
169
216
|
// normally true after writeAlert — but if writeAlert failed silently
|