@steadwing/openalerts 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +155 -0
- package/dist/core/alert-channel.d.ts +23 -0
- package/dist/core/alert-channel.js +44 -0
- package/dist/core/bounded-map.d.ts +51 -0
- package/dist/core/bounded-map.js +128 -0
- package/dist/core/engine.d.ts +41 -0
- package/dist/core/engine.js +167 -0
- package/dist/core/evaluator.d.ts +18 -0
- package/dist/core/evaluator.js +150 -0
- package/dist/core/event-bus.d.ts +17 -0
- package/dist/core/event-bus.js +31 -0
- package/dist/core/formatter.d.ts +14 -0
- package/dist/core/formatter.js +124 -0
- package/dist/core/index.d.ts +11 -0
- package/dist/core/index.js +21 -0
- package/dist/core/platform.d.ts +17 -0
- package/dist/core/platform.js +93 -0
- package/dist/core/rules.d.ts +2 -0
- package/dist/core/rules.js +274 -0
- package/dist/core/store.d.ts +12 -0
- package/dist/core/store.js +125 -0
- package/dist/core/types.d.ts +152 -0
- package/dist/core/types.js +17 -0
- package/dist/index.js +5 -5
- package/dist/{src → plugin}/adapter.d.ts +1 -1
- package/dist/{src → plugin}/commands.d.ts +1 -1
- package/dist/{src → plugin}/commands.js +1 -1
- package/dist/{src → plugin}/dashboard-routes.d.ts +1 -1
- package/dist/{src → plugin}/log-bridge.d.ts +1 -1
- package/dist/{src → plugin}/log-bridge.js +1 -1
- package/package.json +9 -7
- /package/dist/{src → plugin}/adapter.js +0 -0
- /package/dist/{src → plugin}/dashboard-html.d.ts +0 -0
- /package/dist/{src → plugin}/dashboard-html.js +0 -0
- /package/dist/{src → plugin}/dashboard-routes.js +0 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
import { DEFAULTS, } from "./types.js";
|
|
2
|
+
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
3
|
+
/**
 * Build the identifier of a single alert occurrence.
 *
 * @param ruleId - Id of the rule that produced the alert.
 * @param fingerprint - Fingerprint string of the alert.
 * @param ts - Timestamp of the alert.
 * @returns The three parts joined with ":" in the order given.
 */
function makeAlertId(ruleId, fingerprint, ts) {
    const separator = ":";
    return "".concat(ruleId, separator, fingerprint, separator, ts);
}
|
|
6
|
+
/**
 * Append an entry to the named sliding window, creating the window on first use.
 *
 * The window is capped at `DEFAULTS.maxWindowEntries`; when the cap is
 * exceeded the oldest entries (front of the array) are removed in place.
 *
 * @param ctx - Rule context; `ctx.state.windows` is mutated.
 * @param name - Window key.
 * @param entry - Entry to append.
 */
function pushWindow(ctx, name, entry) {
    const windows = ctx.state.windows;
    const existing = windows.get(name);
    const entries = existing ? existing : [];
    if (!existing) {
        windows.set(name, entries);
    }
    entries.push(entry);
    // Trim from the front so only the newest entries survive.
    const overflow = entries.length - DEFAULTS.maxWindowEntries;
    if (overflow > 0) {
        entries.splice(0, overflow);
    }
}
|
|
18
|
+
/**
 * Count the entries of a named window that are no older than `windowMs`.
 *
 * @param ctx - Rule context; reads `ctx.state.windows` and `ctx.now`.
 * @param name - Window key.
 * @param windowMs - Look-back span, subtracted from `ctx.now` to get the cutoff.
 * @returns Number of entries with `ts >= ctx.now - windowMs`; 0 when the
 *   window does not exist.
 */
function countInWindow(ctx, name, windowMs) {
    const entries = ctx.state.windows.get(name);
    if (!entries) {
        return 0;
    }
    const oldestAllowed = ctx.now - windowMs;
    return entries.reduce((hits, entry) => (entry.ts >= oldestAllowed ? hits + 1 : hits), 0);
}
|
|
25
|
+
/**
 * Resolve the threshold for a rule, preferring the configured override.
 *
 * @param ctx - Rule context; reads `ctx.config.rules`.
 * @param ruleId - Rule whose override is looked up.
 * @param defaultVal - Value used when no override threshold is set.
 * @returns The override threshold unless it is null/undefined (an explicit 0
 *   is honoured), otherwise `defaultVal`.
 */
function getRuleThreshold(ctx, ruleId, defaultVal) {
    const override = ctx.config.rules?.[ruleId];
    const configured = override?.threshold;
    return configured ?? defaultVal;
}
|
|
28
|
+
/**
 * Tell whether a rule is enabled. Rules are on by default: only an explicit
 * `enabled: false` override turns one off.
 *
 * @param ctx - Rule context; reads `ctx.config.rules`.
 * @param ruleId - Rule to check.
 * @returns `false` only when the override sets `enabled` to exactly `false`.
 */
function isRuleEnabled(ctx, ruleId) {
    const override = ctx.config.rules?.[ruleId];
    const explicitlyDisabled = override?.enabled === false;
    return !explicitlyDisabled;
}
|
|
31
|
+
// ─── Rule: infra-errors (was: webhook-errors) ───────────────────────────────
|
|
32
|
+
/**
 * Rule "infra-errors": raises an "error" alert when `infra.error` events
 * accumulate within a five-minute sliding window.
 *
 * NOTE(review): the window key is shared by all channels, while the
 * fingerprint and detail text name only the triggering event's channel, so the
 * reported count may include errors from other channels — confirm intent.
 */
const infraErrors = {
    id: "infra-errors",
    defaultCooldownMs: 15 * 60 * 1000,
    defaultThreshold: 3,
    /**
     * @param event - Incoming event; only `infra.error` is considered.
     * @param ctx - Rule context (state, config, current time).
     * @returns An alert once the windowed count reaches the threshold, else null.
     */
    evaluate(event, ctx) {
        const ruleId = "infra-errors";
        if (event.type !== "infra.error" || !isRuleEnabled(ctx, ruleId)) {
            return null;
        }
        // Record this occurrence first so it is part of the count below.
        pushWindow(ctx, ruleId, { ts: ctx.now });
        const fiveMinutesMs = 5 * 60 * 1000;
        const count = countInWindow(ctx, ruleId, fiveMinutesMs);
        if (count < getRuleThreshold(ctx, ruleId, 3)) {
            return null;
        }
        const channel = event.channel ?? "unknown";
        const fingerprint = `infra-errors:${channel}`;
        const alert = {
            type: "alert",
            id: makeAlertId(ruleId, fingerprint, ctx.now),
            ruleId: "infra-errors",
            severity: "error",
            title: "Infrastructure errors spike",
            detail: `${count} infra errors on ${channel} in the last 5 minutes.`,
            ts: ctx.now,
            fingerprint,
        };
        return alert;
    },
};
|
|
61
|
+
// ─── Rule: llm-errors (was: message-errors) ─────────────────────────────────
|
|
62
|
+
/**
 * Rule "llm-errors": raises an "error" alert when failed `llm.call` events
 * accumulate within a five-minute sliding window. It also maintains the
 * `messagesProcessed` / `messageErrors` aggregate counters.
 */
const llmErrors = {
    id: "llm-errors",
    defaultCooldownMs: 15 * 60 * 1000,
    defaultThreshold: 3,
    /**
     * @param event - Incoming event; only `llm.call` is considered.
     * @param ctx - Rule context; `ctx.state.stats` and the "llm-errors" window
     *   are mutated.
     * @returns An alert once the windowed error count reaches the threshold,
     *   else null.
     */
    evaluate(event, ctx) {
        if (event.type !== "llm.call")
            return null;
        // Aggregate stats are bookkeeping, not alerting. Update them before the
        // enabled check so that turning this rule off does not freeze the
        // counters.
        const failed = event.outcome === "error";
        ctx.state.stats.messagesProcessed++;
        if (failed)
            ctx.state.stats.messageErrors++;
        if (!failed || !isRuleEnabled(ctx, "llm-errors"))
            return null;
        const channel = event.channel ?? "unknown";
        // Record this failure first so it is part of the count below.
        pushWindow(ctx, "llm-errors", { ts: ctx.now });
        const threshold = getRuleThreshold(ctx, "llm-errors", 3);
        const windowMs = 5 * 60 * 1000; // 5 minutes
        const count = countInWindow(ctx, "llm-errors", windowMs);
        if (count < threshold)
            return null;
        const fingerprint = `llm-errors:${channel}`;
        return {
            type: "alert",
            id: makeAlertId("llm-errors", fingerprint, ctx.now),
            ruleId: "llm-errors",
            severity: "error",
            title: "LLM call errors",
            detail: `${count} LLM errors on ${channel} in the last 5 minutes.`,
            ts: ctx.now,
            fingerprint,
        };
    },
};
|
|
96
|
+
// ─── Rule: session-stuck ─────────────────────────────────────────────────────
|
|
97
|
+
/**
 * Rule "session-stuck": raises a "warn" alert for a `session.stuck` event whose
 * reported age reaches the threshold. It also maintains the `stuckSessions`
 * aggregate counter.
 */
const sessionStuck = {
    id: "session-stuck",
    defaultCooldownMs: 30 * 60 * 1000,
    defaultThreshold: 120_000, // 120 seconds
    /**
     * @param event - Incoming event; only `session.stuck` is considered.
     * @param ctx - Rule context; `ctx.state.stats.stuckSessions` is mutated.
     * @returns An alert when `event.ageMs` (0 if absent) reaches the threshold,
     *   else null.
     */
    evaluate(event, ctx) {
        if (event.type !== "session.stuck")
            return null;
        // Count the event before the enabled check: the aggregate counter is
        // bookkeeping and must keep working when alerting for this rule is off.
        ctx.state.stats.stuckSessions++;
        if (!isRuleEnabled(ctx, "session-stuck"))
            return null;
        const ageMs = event.ageMs ?? 0;
        const threshold = getRuleThreshold(ctx, "session-stuck", 120_000);
        if (ageMs < threshold)
            return null;
        const sessionKey = event.sessionKey ?? "unknown";
        const fingerprint = `session-stuck:${sessionKey}`;
        const ageSec = Math.round(ageMs / 1000);
        return {
            type: "alert",
            id: makeAlertId("session-stuck", fingerprint, ctx.now),
            ruleId: "session-stuck",
            severity: "warn",
            title: "Session stuck",
            detail: `Session ${sessionKey} stuck in processing for ${ageSec}s.`,
            ts: ctx.now,
            fingerprint,
        };
    },
};
|
|
126
|
+
// ─── Rule: heartbeat-fail ────────────────────────────────────────────────────
|
|
127
|
+
/**
 * Rule "heartbeat-fail": raises an "error" alert after a run of consecutive
 * failed `infra.heartbeat` events; a successful heartbeat resets the run.
 *
 * NOTE(review): the failure counter uses one global key while the fingerprint
 * includes the event's channel — confirm that mixing channels is intended.
 */
const heartbeatFail = {
    id: "heartbeat-fail",
    defaultCooldownMs: 30 * 60 * 1000,
    defaultThreshold: 3, // consecutive failures
    /**
     * @param event - Incoming event; only `infra.heartbeat` is considered.
     * @param ctx - Rule context; `ctx.state.consecutives` is mutated.
     * @returns An alert when the consecutive-failure count reaches the
     *   threshold, else null.
     */
    evaluate(event, ctx) {
        if (event.type !== "infra.heartbeat" || !isRuleEnabled(ctx, "heartbeat-fail")) {
            return null;
        }
        const counterKey = "heartbeat-consecutive-fail";
        switch (event.outcome) {
            case "error":
                break;
            case "success":
                // A good heartbeat ends the failure streak.
                ctx.state.consecutives.set(counterKey, 0);
                return null;
            default:
                // Any other outcome neither extends nor resets the streak.
                return null;
        }
        const failures = (ctx.state.consecutives.get(counterKey) ?? 0) + 1;
        ctx.state.consecutives.set(counterKey, failures);
        if (failures < getRuleThreshold(ctx, "heartbeat-fail", 3)) {
            return null;
        }
        const channel = event.channel ?? "";
        const fingerprint = `heartbeat-fail:${channel}`;
        const channelNote = channel ? ` Channel: ${channel}.` : "";
        return {
            type: "alert",
            id: makeAlertId("heartbeat-fail", fingerprint, ctx.now),
            ruleId: "heartbeat-fail",
            severity: "error",
            title: "Heartbeat delivery failing",
            detail: `${failures} consecutive heartbeat failures.${channelNote}`,
            ts: ctx.now,
            fingerprint,
        };
    },
};
|
|
163
|
+
// ─── Rule: queue-depth ───────────────────────────────────────────────────────
|
|
164
|
+
/**
 * Rule "queue-depth": raises a "warn" alert when the reported queue depth
 * reaches the threshold. It also records the time of the latest heartbeat in
 * `ctx.state.lastHeartbeatTs`, which the gateway-down rule reads.
 */
const queueDepth = {
    id: "queue-depth",
    defaultCooldownMs: 15 * 60 * 1000,
    defaultThreshold: 10,
    /**
     * @param event - Incoming event; `infra.heartbeat` (which carries queue
     *   depth) and `infra.queue_depth` are considered.
     * @param ctx - Rule context; `ctx.state.lastHeartbeatTs` is mutated on
     *   heartbeats.
     * @returns An alert when `event.queueDepth` (0 if absent) reaches the
     *   threshold, else null.
     */
    evaluate(event, ctx) {
        const isHeartbeat = event.type === "infra.heartbeat";
        if (!isHeartbeat && event.type !== "infra.queue_depth")
            return null;
        // Record the heartbeat BEFORE the enabled check. The gateway-down rule
        // depends on this timestamp, so disabling queue-depth alerts must not
        // stop it from being refreshed.
        if (isHeartbeat) {
            ctx.state.lastHeartbeatTs = ctx.now;
        }
        if (!isRuleEnabled(ctx, "queue-depth"))
            return null;
        const queued = event.queueDepth ?? 0;
        const threshold = getRuleThreshold(ctx, "queue-depth", 10);
        if (queued < threshold)
            return null;
        const fingerprint = "queue-depth";
        return {
            type: "alert",
            id: makeAlertId("queue-depth", fingerprint, ctx.now),
            ruleId: "queue-depth",
            severity: "warn",
            title: "Queue depth high",
            detail: `${queued} items queued for processing.`,
            ts: ctx.now,
            fingerprint,
        };
    },
};
|
|
195
|
+
// ─── Rule: high-error-rate ───────────────────────────────────────────────────
|
|
196
|
+
/**
 * Rule "high-error-rate": raises an "error" alert when the share of failed
 * calls among the 20 most recent `llm.call` events reaches the threshold
 * (a percentage).
 */
const highErrorRate = {
    id: "high-error-rate",
    defaultCooldownMs: 30 * 60 * 1000,
    defaultThreshold: 50, // percent
    /**
     * @param event - Incoming event; only `llm.call` is considered.
     * @param ctx - Rule context; the "msg-outcomes" window is mutated.
     * @returns An alert when the failure percentage of the last 20 outcomes
     *   reaches the threshold, else null (including while fewer than 20
     *   outcomes have been recorded).
     */
    evaluate(event, ctx) {
        if (event.type !== "llm.call" || !isRuleEnabled(ctx, "high-error-rate")) {
            return null;
        }
        const sampleSize = 20;
        // Each outcome is stored as 1 (error) or 0 (anything else).
        const outcomeFlag = event.outcome === "error" ? 1 : 0;
        pushWindow(ctx, "msg-outcomes", { ts: ctx.now, value: outcomeFlag });
        const outcomes = ctx.state.windows.get("msg-outcomes");
        if (!outcomes || outcomes.length < sampleSize) {
            return null; // Not enough samples yet
        }
        const sample = outcomes.slice(-sampleSize);
        const failed = sample.reduce((n, entry) => (entry.value === 1 ? n + 1 : n), 0);
        const rate = (failed / sample.length) * 100;
        if (rate < getRuleThreshold(ctx, "high-error-rate", 50)) {
            return null;
        }
        const fingerprint = "high-error-rate";
        return {
            type: "alert",
            id: makeAlertId("high-error-rate", fingerprint, ctx.now),
            ruleId: "high-error-rate",
            severity: "error",
            title: "High error rate",
            detail: `${Math.round(rate)}% of the last ${sample.length} messages failed.`,
            ts: ctx.now,
            fingerprint,
        };
    },
};
|
|
230
|
+
// ─── Rule: gateway-down ──────────────────────────────────────────────────────
|
|
231
|
+
/**
 * Rule "gateway-down": raises a "critical" alert on a `watchdog.tick` when the
 * time since `ctx.state.lastHeartbeatTs` reaches the threshold.
 */
const gatewayDown = {
    id: "gateway-down",
    defaultCooldownMs: 60 * 60 * 1000,
    defaultThreshold: 90_000, // 90 seconds
    /**
     * @param event - Incoming event; only `watchdog.tick` is considered.
     * @param ctx - Rule context; reads `ctx.state.lastHeartbeatTs` and `ctx.now`.
     * @returns An alert when the heartbeat silence reaches the threshold, else
     *   null (including when no heartbeat has been recorded yet).
     */
    evaluate(event, ctx) {
        // Driven by watchdog ticks rather than by regular events.
        if (event.type !== "watchdog.tick" || !isRuleEnabled(ctx, "gateway-down")) {
            return null;
        }
        const lastBeat = ctx.state.lastHeartbeatTs;
        if (lastBeat === 0) {
            return null; // Nothing to measure silence against yet
        }
        const silenceMs = ctx.now - lastBeat;
        if (silenceMs < getRuleThreshold(ctx, "gateway-down", DEFAULTS.gatewayDownThresholdMs)) {
            return null;
        }
        const clockFormat = { hour: "2-digit", minute: "2-digit" };
        const lastTime = new Date(lastBeat).toLocaleTimeString([], clockFormat);
        const silenceSec = Math.round(silenceMs / 1000);
        const fingerprint = "gateway-down";
        return {
            type: "alert",
            id: makeAlertId("gateway-down", fingerprint, ctx.now),
            ruleId: "gateway-down",
            severity: "critical",
            title: "Gateway unresponsive",
            detail: `No heartbeat received for ${silenceSec}s. Last successful: ${lastTime}.`,
            ts: ctx.now,
            fingerprint,
        };
    },
};
|
|
265
|
+
// ─── Export all rules ────────────────────────────────────────────────────────
|
|
266
|
+
/** Every built-in alert rule defined in this module, in declaration order. */
export const ALL_RULES = [
    // Windowed error counts
    infraErrors,
    llmErrors,
    // Single-event and consecutive-failure checks
    sessionStuck,
    heartbeatFail,
    queueDepth,
    // Rate over the recent-outcome sample
    highErrorRate,
    // Watchdog-driven
    gatewayDown,
];
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { type StoredEvent } from "./types.js";
|
|
2
|
+
/**
 * Append one event to the JSONL log as a single JSON line.
 *
 * @param stateDir - Base state directory; the log lives in a subdirectory of it.
 * @param event - Event to serialize and append.
 */
export declare function appendEvent(stateDir: string, event: StoredEvent): void;
/**
 * Read the most recent events from the log.
 *
 * Lines that are not valid JSON, or that lack a string `type` and a numeric
 * `ts`, are skipped. Returns an empty array when the log is missing or
 * unreadable.
 *
 * @param stateDir - Base state directory.
 * @param limit - Maximum number of trailing log lines to consider.
 */
export declare function readRecentEvents(stateDir: string, limit: number): StoredEvent[];
/**
 * Read events for warm-start; capped at the 1000 most recent log lines.
 *
 * @param stateDir - Base state directory.
 */
export declare function readAllEvents(stateDir: string): StoredEvent[];
/**
 * Prune the log by age and size, rewriting it via a `.tmp` file plus rename.
 *
 * @param stateDir - Base state directory.
 * @param opts - Optional limits; the package defaults apply when omitted.
 */
export declare function pruneLog(stateDir: string, opts?: {
    /** Maximum age, in milliseconds, of entries to keep. */
    maxAgeMs?: number;
    /** Maximum log size in kilobytes. */
    maxSizeKb?: number;
}): void;
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { DEFAULTS, LOG_FILENAME, STORE_DIR_NAME } from "./types.js";
|
|
4
|
+
/**
 * Resolve the directory that holds the event log.
 *
 * @param stateDir - Base state directory.
 * @returns `stateDir` joined with `STORE_DIR_NAME`.
 */
function resolveDir(stateDir) {
    const storeDir = path.join(stateDir, STORE_DIR_NAME);
    return storeDir;
}
|
|
7
|
+
/**
 * Resolve the full path of the JSONL event log.
 *
 * @param stateDir - Base state directory.
 * @returns The store directory joined with `LOG_FILENAME`.
 */
function resolveLogPath(stateDir) {
    const storeDir = resolveDir(stateDir);
    return path.join(storeDir, LOG_FILENAME);
}
|
|
10
|
+
/**
 * Make sure the store directory exists.
 *
 * `mkdirSync` with `recursive: true` succeeds when the directory is already
 * present, so no separate existence check is needed (and there is no
 * check-then-create window).
 *
 * @param stateDir - Base state directory.
 */
function ensureDir(stateDir) {
    fs.mkdirSync(resolveDir(stateDir), { recursive: true });
}
|
|
16
|
+
/**
 * Append a single event to the JSONL log.
 *
 * Creates the store directory if needed, then synchronously appends the event
 * serialized as one JSON line.
 *
 * @param stateDir - Base state directory.
 * @param event - Event to serialize and append.
 */
export function appendEvent(stateDir, event) {
    ensureDir(stateDir);
    const logPath = resolveLogPath(stateDir);
    fs.appendFileSync(logPath, `${JSON.stringify(event)}\n`, "utf-8");
}
|
|
22
|
+
/**
 * Read the most recent N events from the log. Skips malformed lines.
 *
 * The last `limit` non-empty lines are taken first and validated afterwards,
 * so fewer than `limit` events are returned when some of those lines are
 * malformed.
 *
 * @param stateDir - Base state directory.
 * @param limit - Maximum number of trailing log lines to consider. Values that
 *   are not greater than zero (0, negatives, NaN) yield an empty result.
 * @returns Parsed events in log order; empty when the log is missing or
 *   unreadable.
 */
export function readRecentEvents(stateDir, limit) {
    // `slice(-0)` is `slice(0)` (the whole array) and a negative limit would
    // drop the OLDEST lines instead, so reject non-positive limits up front.
    if (!(limit > 0))
        return [];
    const logPath = resolveLogPath(stateDir);
    if (!fs.existsSync(logPath))
        return [];
    let content;
    try {
        content = fs.readFileSync(logPath, "utf-8");
    }
    catch {
        return [];
    }
    const lines = content.trim().split("\n").filter(Boolean);
    const recent = lines.slice(-limit);
    const events = [];
    for (const line of recent) {
        try {
            const parsed = JSON.parse(line);
            // Minimal shape check: every stored event has a string type and a numeric ts.
            if (parsed && typeof parsed.type === "string" && typeof parsed.ts === "number") {
                events.push(parsed);
            }
        }
        catch {
            // Skip malformed lines silently
        }
    }
    return events;
}
|
|
50
|
+
/**
 * Read events for warm-start. Capped at the 1000 most recent log lines.
 *
 * @param stateDir - Base state directory.
 * @returns The events returned by `readRecentEvents` for that cap.
 */
export function readAllEvents(stateDir) {
    const warmStartLimit = 1000;
    return readRecentEvents(stateDir, warmStartLimit);
}
|
|
54
|
+
/**
 * Prune the log by age and size. Rewrites go through `writeAtomic`.
 *
 * - Below 80% of the size limit: drop expired/malformed lines and rewrite only
 *   if something was dropped.
 * - Otherwise: drop expired/malformed lines, then repeatedly drop the oldest
 *   quarter while the result exceeds the size limit and more than 10 lines
 *   remain; the file is always rewritten.
 *
 * @param stateDir - Base state directory.
 * @param opts - Optional `maxAgeMs` / `maxSizeKb`; `DEFAULTS` apply otherwise.
 */
export function pruneLog(stateDir, opts) {
    const logPath = resolveLogPath(stateDir);
    if (!fs.existsSync(logPath)) {
        return;
    }
    const maxAgeMs = opts?.maxAgeMs ?? DEFAULTS.maxLogAgeDays * 24 * 60 * 60 * 1000;
    const maxSizeBytes = (opts?.maxSizeKb ?? DEFAULTS.maxLogSizeKb) * 1024;
    let content;
    try {
        content = fs.readFileSync(logPath, "utf-8");
    }
    catch {
        return;
    }
    const cutoff = Date.now() - maxAgeMs;
    // A line survives only if it parses and carries a numeric, unexpired ts;
    // malformed lines are dropped during prune.
    const isFresh = (line) => {
        try {
            const parsed = JSON.parse(line);
            return typeof parsed.ts === "number" && parsed.ts >= cutoff;
        }
        catch {
            return false;
        }
    };
    const serialize = (rows) => rows.join("\n") + "\n";
    const allLines = content.trim().split("\n").filter(Boolean);
    let kept = allLines.filter(isFresh);
    // Comfortably under the size limit: age-based pruning is all that is needed.
    if (Buffer.byteLength(content, "utf-8") < maxSizeBytes * 0.8) {
        if (kept.length < allLines.length) {
            writeAtomic(logPath, serialize(kept));
        }
        return;
    }
    // Near or over the size limit: after the age filter, trim oldest lines until it fits.
    let output = serialize(kept);
    while (Buffer.byteLength(output, "utf-8") > maxSizeBytes && kept.length > 10) {
        kept = kept.slice(Math.floor(kept.length * 0.25)); // Drop oldest quarter
        output = serialize(kept);
    }
    writeAtomic(logPath, output);
}
|
|
108
|
+
/**
 * Atomic write: write to `<filePath>.tmp`, then rename over the target.
 *
 * If either step fails (for example a rename that is refused on Windows), fall
 * back to writing the target directly. The temporary file is removed on the
 * fallback path whether or not the direct write succeeds, so a failed prune
 * does not leave a stale `.tmp` behind.
 *
 * @param filePath - Destination file.
 * @param content - Full new content, written as UTF-8.
 * @throws Whatever the direct fallback write throws when it also fails.
 */
function writeAtomic(filePath, content) {
    const tmpPath = filePath + ".tmp";
    try {
        fs.writeFileSync(tmpPath, content, "utf-8");
        fs.renameSync(tmpPath, filePath);
    }
    catch {
        try {
            // Fallback: direct, non-atomic write
            fs.writeFileSync(filePath, content, "utf-8");
        }
        finally {
            try {
                fs.unlinkSync(tmpPath);
            }
            catch {
                // Ignore cleanup failure
            }
        }
    }
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/** Severity levels an alert or event can carry. */
export type AlertSeverity = "info" | "warn" | "error" | "critical";
/** All event type tags accepted by the engine. */
export type OpenAlertsEventType =
    | "llm.call"
    | "llm.error"
    | "llm.token_usage"
    | "tool.call"
    | "tool.error"
    | "agent.start"
    | "agent.end"
    | "agent.error"
    | "agent.stuck"
    | "session.start"
    | "session.end"
    | "session.stuck"
    | "infra.error"
    | "infra.heartbeat"
    | "infra.queue_depth"
    | "custom"
    | "watchdog.tick";
/**
 * A monitoring event fed to the rules. Only `type` and `ts` are required;
 * the remaining fields are optional and read by individual rules as needed.
 */
export type OpenAlertsEvent = {
    type: OpenAlertsEventType;
    /** Event timestamp. */
    ts: number;
    severity?: AlertSeverity;
    channel?: string;
    sessionKey?: string;
    agentId?: string;
    durationMs?: number;
    tokenCount?: number;
    /** Queue depth; read by the queue-depth rule. */
    queueDepth?: number;
    /** Age in milliseconds; read by the session-stuck rule. */
    ageMs?: number;
    costUsd?: number;
    outcome?: "success" | "error" | "skipped" | "timeout";
    error?: string;
    meta?: Record<string, unknown>;
};
|
|
19
|
+
/** A destination that alerts can be sent to. */
export interface AlertChannel {
    /** Channel name. */
    readonly name: string;
    /**
     * Deliver an alert.
     *
     * @param alert - The structured alert.
     * @param formatted - The alert rendered as text.
     */
    send(alert: AlertEvent, formatted: string): Promise<void> | void;
}
/** An alert produced by a rule. */
export type AlertEvent = {
    type: "alert";
    /** Identifier of this alert occurrence. */
    id: string;
    /** Id of the rule that produced the alert. */
    ruleId: string;
    severity: AlertSeverity;
    title: string;
    detail: string;
    /** Alert timestamp. */
    ts: number;
    /** Fingerprint string of the alert. */
    fingerprint: string;
};
|
|
33
|
+
/** A diagnostic record as kept in the event log. */
export type DiagnosticSnapshot = {
    type: "diagnostic";
    eventType: string;
    ts: number;
    summary: string;
    channel?: string;
    sessionKey?: string;
};
/** A heartbeat record as kept in the event log. */
export type HeartbeatSnapshot = {
    type: "heartbeat";
    status: string;
    ts: number;
    reason?: string;
    channel?: string;
};
/** Any record that can appear in the event log, discriminated by `type`. */
export type StoredEvent =
    | AlertEvent
    | DiagnosticSnapshot
    | HeartbeatSnapshot;
|
|
49
|
+
/** Where an alert should be delivered. */
export type AlertTarget = {
    channel: string;
    to: string;
    accountId?: string;
};
/** Per-rule configuration override. */
export type RuleOverride = {
    /** Set to `false` to turn the rule off; rules are on otherwise. */
    enabled?: boolean;
    /** Replaces the rule's default threshold. */
    threshold?: number;
    cooldownMinutes?: number;
};
/** User-facing monitor configuration. */
export type MonitorConfig = {
    apiKey?: string;
    alertChannel?: string;
    alertTo?: string;
    alertAccountId?: string;
    cooldownMinutes?: number;
    maxLogSizeKb?: number;
    maxLogAgeDays?: number;
    quiet?: boolean;
    /** Overrides keyed by rule id. */
    rules?: Record<string, RuleOverride>;
};
|
|
70
|
+
/** Options used to initialize the engine. */
export type OpenAlertsInitOptions = {
    /** Where to store JSONL event logs */
    stateDir: string;
    /** Monitor config (rules, cooldowns, etc.) */
    config: MonitorConfig;
    /** Alert channels to send to */
    channels?: AlertChannel[];
    /** Logger (defaults to console) */
    logger?: OpenAlertsLogger;
    /** Log prefix for messages */
    logPrefix?: string;
    /** Diagnosis hint shown in critical alerts (e.g., 'Run "openclaw doctor"') */
    diagnosisHint?: string;
};
/** Minimal logger contract: one method per level, each taking a message. */
export type OpenAlertsLogger = {
    info: (msg: string) => void;
    warn: (msg: string) => void;
    error: (msg: string) => void;
};
|
|
89
|
+
/** One sample in a sliding window. */
export type WindowEntry = {
    /** Sample timestamp. */
    ts: number;
    /** Optional numeric payload of the sample. */
    value?: number;
};
/** Mutable state shared by the rules across evaluations. */
export type EvaluatorState = {
    /** Sliding window counters keyed by window name */
    windows: Map<string, WindowEntry[]>;
    /** Cooldown: fingerprint → last alerted timestamp */
    cooldowns: Map<string, number>;
    /** Consecutive failure counters keyed by counter name */
    consecutives: Map<string, number>;
    /** Hourly alert count for hard cap */
    hourlyAlerts: {
        count: number;
        resetAt: number;
    };
    /** Last diagnostic heartbeat timestamp (for gateway-down detection) */
    lastHeartbeatTs: number;
    /** Startup timestamp */
    startedAt: number;
    /** Aggregate 24h counters for /health display */
    stats: {
        messagesProcessed: number;
        messageErrors: number;
        messagesReceived: number;
        webhookErrors: number;
        stuckSessions: number;
        toolCalls: number;
        toolErrors: number;
        agentStarts: number;
        agentErrors: number;
        sessionsStarted: number;
        compactions: number;
        totalTokens: number;
        totalCostUsd: number;
        lastResetTs: number;
    };
};
|
|
127
|
+
/** Everything a rule receives besides the event itself. */
export type RuleContext = {
    /** Shared mutable evaluator state. */
    state: EvaluatorState;
    /** Active monitor configuration. */
    config: MonitorConfig;
    /** Current time used for this evaluation. */
    now: number;
};
/** Shape of an alert rule. */
export type AlertRuleDefinition = {
    /** Rule id; also the key for per-rule overrides in the config. */
    id: string;
    defaultCooldownMs: number;
    defaultThreshold: number;
    /** Inspect one event and return an alert, or null when nothing fires. */
    evaluate: (event: OpenAlertsEvent, ctx: RuleContext) => AlertEvent | null;
};
|
|
138
|
+
/** Name of the subdirectory (under the state dir) that holds the log. */
export declare const STORE_DIR_NAME = "openalerts";
/** File name of the JSONL event log. */
export declare const LOG_FILENAME = "events.jsonl";
/** Built-in default limits and intervals. */
export declare const DEFAULTS: {
    readonly cooldownMs: number;
    readonly maxLogSizeKb: 512;
    readonly maxLogAgeDays: 7;
    /** Cap on entries kept per sliding window. */
    readonly maxWindowEntries: 100;
    readonly maxCooldownEntries: 50;
    readonly maxAlertsPerHour: 5;
    readonly watchdogIntervalMs: 30000;
    readonly pruneIntervalMs: number;
    readonly platformFlushIntervalMs: number;
    readonly platformBatchSize: 100;
    /** Default threshold of the gateway-down rule. */
    readonly gatewayDownThresholdMs: 90000;
};
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
// ─── Alert Severity ──────────────────────────────────────────────────────────
|
|
2
|
+
// ─── Constants ───────────────────────────────────────────────────────────────
|
|
3
|
+
/** Name of the subdirectory (under the state dir) that holds the log. */
export const STORE_DIR_NAME = "openalerts";
/** File name of the JSONL event log. */
export const LOG_FILENAME = "events.jsonl";
// Time units, in milliseconds, used to spell out the durations below.
const SECOND_MS = 1000;
const MINUTE_MS = 60 * SECOND_MS;
const HOUR_MS = 60 * MINUTE_MS;
/** Built-in default limits and intervals. */
export const DEFAULTS = {
    cooldownMs: 15 * MINUTE_MS,
    maxLogSizeKb: 512,
    maxLogAgeDays: 7,
    maxWindowEntries: 100,
    maxCooldownEntries: 50,
    maxAlertsPerHour: 5,
    watchdogIntervalMs: 30 * SECOND_MS,
    pruneIntervalMs: 6 * HOUR_MS,
    platformFlushIntervalMs: 5 * MINUTE_MS,
    platformBatchSize: 100,
    gatewayDownThresholdMs: 90 * SECOND_MS,
};
|
package/dist/index.js
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { OpenAlertsEngine } from "
|
|
1
|
+
import { OpenAlertsEngine } from "./core/index.js";
|
|
2
2
|
import { onDiagnosticEvent, registerLogTransport } from "openclaw/plugin-sdk";
|
|
3
|
-
import { createLogBridge } from "./
|
|
4
|
-
import { OpenClawAlertChannel, parseConfig, resolveAlertTarget, translateOpenClawEvent, translateToolCallHook, translateAgentStartHook, translateAgentEndHook, translateSessionStartHook, translateSessionEndHook, translateMessageSentHook, translateMessageReceivedHook, translateBeforeToolCallHook, translateBeforeCompactionHook, translateAfterCompactionHook, translateMessageSendingHook, translateToolResultPersistHook, translateGatewayStartHook, translateGatewayStopHook, } from "./
|
|
5
|
-
import { bindEngine, createMonitorCommands } from "./
|
|
6
|
-
import { createDashboardHandler, closeDashboardConnections, } from "./
|
|
3
|
+
import { createLogBridge } from "./plugin/log-bridge.js";
|
|
4
|
+
import { OpenClawAlertChannel, parseConfig, resolveAlertTarget, translateOpenClawEvent, translateToolCallHook, translateAgentStartHook, translateAgentEndHook, translateSessionStartHook, translateSessionEndHook, translateMessageSentHook, translateMessageReceivedHook, translateBeforeToolCallHook, translateBeforeCompactionHook, translateAfterCompactionHook, translateMessageSendingHook, translateToolResultPersistHook, translateGatewayStartHook, translateGatewayStopHook, } from "./plugin/adapter.js";
|
|
5
|
+
import { bindEngine, createMonitorCommands } from "./plugin/commands.js";
|
|
6
|
+
import { createDashboardHandler, closeDashboardConnections, } from "./plugin/dashboard-routes.js";
|
|
7
7
|
const PLUGIN_ID = "openalerts";
|
|
8
8
|
const LOG_PREFIX = "openalerts";
|
|
9
9
|
let engine = null;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { AlertChannel, AlertEvent, AlertTarget, MonitorConfig, OpenAlertsEvent } from "
|
|
1
|
+
import type { AlertChannel, AlertEvent, AlertTarget, MonitorConfig, OpenAlertsEvent } from "../core/index.js";
|
|
2
2
|
import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
|
|
3
3
|
/**
|
|
4
4
|
* Translate an OpenClaw diagnostic event into a universal OpenAlertsEvent.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { IncomingMessage, ServerResponse } from "node:http";
|
|
2
|
-
import type { OpenAlertsEngine } from "
|
|
2
|
+
import type { OpenAlertsEngine } from "../core/index.js";
|
|
3
3
|
type HttpHandler = (req: IncomingMessage, res: ServerResponse) => Promise<boolean> | boolean;
|
|
4
4
|
/** Close all active SSE connections. Call on engine stop. */
|
|
5
5
|
export declare function closeDashboardConnections(): void;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { BoundedMap } from "
|
|
1
|
+
import { BoundedMap } from "../core/index.js";
|
|
2
2
|
// ─── Parsing Helpers ─────────────────────────────────────────────────────────
|
|
3
3
|
const KV_RE = /(\w+)=([\S]+)/g;
|
|
4
4
|
/** Parse key=value pairs from a log message string. */
|