@littlebearapps/create-platform 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +98 -0
- package/dist/index.d.ts +6 -1
- package/dist/index.js +36 -6
- package/dist/prompts.d.ts +14 -2
- package/dist/prompts.js +29 -7
- package/dist/templates.js +78 -0
- package/package.json +3 -2
- package/templates/full/workers/lib/pattern-discovery/ai-prompt.ts +644 -0
- package/templates/full/workers/lib/pattern-discovery/clustering.ts +278 -0
- package/templates/full/workers/lib/pattern-discovery/shadow-evaluation.ts +603 -0
- package/templates/full/workers/lib/pattern-discovery/storage.ts +806 -0
- package/templates/full/workers/lib/pattern-discovery/types.ts +159 -0
- package/templates/full/workers/lib/pattern-discovery/validation.ts +278 -0
- package/templates/full/workers/pattern-discovery.ts +661 -0
- package/templates/full/workers/platform-alert-router.ts +1809 -0
- package/templates/full/workers/platform-notifications.ts +424 -0
- package/templates/full/workers/platform-search.ts +480 -0
- package/templates/full/workers/platform-settings.ts +436 -0
- package/templates/shared/workers/lib/analytics-engine.ts +357 -0
- package/templates/shared/workers/lib/billing.ts +293 -0
- package/templates/shared/workers/lib/circuit-breaker-middleware.ts +25 -0
- package/templates/shared/workers/lib/control.ts +292 -0
- package/templates/shared/workers/lib/economics.ts +368 -0
- package/templates/shared/workers/lib/metrics.ts +103 -0
- package/templates/shared/workers/lib/platform-settings.ts +407 -0
- package/templates/shared/workers/lib/shared/allowances.ts +333 -0
- package/templates/shared/workers/lib/shared/cloudflare.ts +1362 -0
- package/templates/shared/workers/lib/shared/types.ts +58 -0
- package/templates/shared/workers/lib/telemetry-sampling.ts +360 -0
- package/templates/shared/workers/lib/usage/collectors/example.ts +96 -0
- package/templates/shared/workers/lib/usage/collectors/index.ts +128 -0
- package/templates/shared/workers/lib/usage/handlers/audit.ts +306 -0
- package/templates/shared/workers/lib/usage/handlers/backfill.ts +845 -0
- package/templates/shared/workers/lib/usage/handlers/behavioral.ts +429 -0
- package/templates/shared/workers/lib/usage/handlers/data-queries.ts +507 -0
- package/templates/shared/workers/lib/usage/handlers/dlq-admin.ts +364 -0
- package/templates/shared/workers/lib/usage/handlers/health-trends.ts +222 -0
- package/templates/shared/workers/lib/usage/handlers/index.ts +35 -0
- package/templates/shared/workers/lib/usage/handlers/usage-admin.ts +421 -0
- package/templates/shared/workers/lib/usage/handlers/usage-features.ts +1262 -0
- package/templates/shared/workers/lib/usage/handlers/usage-metrics.ts +2420 -0
- package/templates/shared/workers/lib/usage/handlers/usage-settings.ts +610 -0
- package/templates/shared/workers/lib/usage/queue/budget-enforcement.ts +1032 -0
- package/templates/shared/workers/lib/usage/queue/cost-budget-enforcement.ts +128 -0
- package/templates/shared/workers/lib/usage/queue/cost-calculator.ts +77 -0
- package/templates/shared/workers/lib/usage/queue/dlq-handler.ts +161 -0
- package/templates/shared/workers/lib/usage/queue/index.ts +19 -0
- package/templates/shared/workers/lib/usage/queue/telemetry-processor.ts +790 -0
- package/templates/shared/workers/lib/usage/scheduled/anomaly-detection.ts +732 -0
- package/templates/shared/workers/lib/usage/scheduled/data-collection.ts +956 -0
- package/templates/shared/workers/lib/usage/scheduled/error-digest.ts +343 -0
- package/templates/shared/workers/lib/usage/scheduled/index.ts +18 -0
- package/templates/shared/workers/lib/usage/scheduled/rollups.ts +1561 -0
- package/templates/shared/workers/lib/usage/shared/constants.ts +362 -0
- package/templates/shared/workers/lib/usage/shared/index.ts +14 -0
- package/templates/shared/workers/lib/usage/shared/types.ts +1066 -0
- package/templates/shared/workers/lib/usage/shared/utils.ts +795 -0
- package/templates/shared/workers/platform-usage.ts +1915 -0
- package/templates/standard/workers/error-collector.ts +2670 -0
- package/templates/standard/workers/lib/error-collector/capture.ts +213 -0
- package/templates/standard/workers/lib/error-collector/digest.ts +448 -0
- package/templates/standard/workers/lib/error-collector/email-health-alerts.ts +262 -0
- package/templates/standard/workers/lib/error-collector/fingerprint.ts +258 -0
- package/templates/standard/workers/lib/error-collector/gap-alerts.ts +293 -0
- package/templates/standard/workers/lib/error-collector/github.ts +329 -0
- package/templates/standard/workers/lib/error-collector/types.ts +262 -0
- package/templates/standard/workers/lib/sentinel/gap-detection.ts +734 -0
- package/templates/standard/workers/lib/shared/slack-alerts.ts +585 -0
- package/templates/standard/workers/platform-sentinel.ts +1744 -0
|
@@ -0,0 +1,1809 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Alert Router Worker
|
|
3
|
+
*
|
|
4
|
+
* Consolidates alerts from multiple monitoring sources:
|
|
5
|
+
* - Gatus (uptime monitors + heartbeats -- primary)
|
|
6
|
+
* - HetrixTools (external HTTP checks)
|
|
7
|
+
* - Netdata (VPS internal metrics)
|
|
8
|
+
* - GitHub Actions (deployment failures)
|
|
9
|
+
*
|
|
10
|
+
* Features:
|
|
11
|
+
* - Alert normalization (common event format)
|
|
12
|
+
* - Deduplication (KV-based, 1-hour TTL)
|
|
13
|
+
* - Dependency correlation (suppress child alerts when parent down)
|
|
14
|
+
* - Priority-based routing (Slack channels + in-app notifications)
|
|
15
|
+
* - Incident grouping (related alerts)
|
|
16
|
+
*
|
|
17
|
+
* Cost: $0/month (within Workers free tier)
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import {
|
|
21
|
+
withFeatureBudget,
|
|
22
|
+
CircuitBreakerError,
|
|
23
|
+
completeTracking,
|
|
24
|
+
MONITOR_ALERT_ROUTER,
|
|
25
|
+
createLoggerFromRequest,
|
|
26
|
+
type Logger,
|
|
27
|
+
} from '@littlebearapps/platform-sdk';
|
|
28
|
+
|
|
29
|
+
/** Worker bindings and configuration for the alert router. */
interface Env {
  PLATFORM_DB: D1Database;
  PLATFORM_CACHE: KVNamespace;
  PLATFORM_ALERTS: KVNamespace; // For deduplication
  SLACK_WEBHOOK_URL: string;
  SERVICE_REGISTRY: KVNamespace; // Cached service registry
  GITHUB_TOKEN: string; // For issue creation
  PLATFORM_TELEMETRY: Queue; // For SDK telemetry
  NOTIFICATIONS_API: Fetcher; // In-app notifications via platform-notifications
  CLOUDFLARE_ACCOUNT_ID: string;
  // Dashboard URL for action links (e.g. "https://admin.example.com")
  DASHBOARD_URL?: string;
  // Gatus status page URL (e.g. "https://status.example.com")
  GATUS_URL?: string;
}

/** Common normalized event format that every alert source is mapped into. */
interface Alert {
  id: string; // Generated UUID
  source: 'hetrixtools' | 'netdata' | 'github' | 'github-security' | 'gatus' | 'custom';
  severity: 'p0' | 'p1' | 'p2'; // Critical, High, Medium
  status: 'firing' | 'resolved';
  service_id: string; // Maps to service registry
  monitor_id?: string; // Source-specific monitor ID
  summary: string; // Short description
  message: string; // Detailed message
  timestamp: string; // ISO 8601
  metadata?: Record<string, any>; // Source-specific data
}

/** Result of deduplication/correlation over a single Alert. */
interface NormalizedIncident {
  incident_key: string; // For deduplication
  alert: Alert;
  parent_down: boolean; // If dependency is down
  suppressed: boolean; // If alert should be suppressed
  baseline_suppressed?: boolean; // If CodeQL baseline alert (48h window)
  related_alerts: string[]; // Other alerts in same incident
}

// Webhook payload interfaces

/** Body posted by HetrixTools uptime-monitor webhooks. */
interface HetrixToolsPayload {
  monitor_id: string;
  monitor_name: string;
  monitor_target: string;
  monitor_type: string; // 'website' | 'ping' | 'service[X]' | 'smtp[X]'
  monitor_category: string;
  monitor_status: string; // 'online' | 'offline'
  timestamp: number; // UNIX timestamp
  monitor_errors?: Record<string, string>; // location -> error message (only when offline)
}

/** Body posted by Netdata alarm notifications. */
interface NetdataPayload {
  status: string; // 'WARNING' | 'CRITICAL' | 'CLEAR'
  alarm: string;
  chart: string;
  info: string;
  family?: string;
  host: string;
  value: string;
  units?: string;
}

/** Gatus custom-alerting webhook body (after JSON or template parsing). */
interface GatusWebhookPayload {
  endpoint_name: string;
  endpoint_group: string;
  endpoint_url: string;
  alert_description: string;
  resolved: boolean;
}
|
|
97
|
+
|
|
98
|
+
/**
|
|
99
|
+
* Parse Gatus default template body format (non-JSON with [PLACEHOLDER] syntax).
|
|
100
|
+
* Extracts key-value pairs from lines like: "endpoint_name": [ENDPOINT_NAME]
|
|
101
|
+
*/
|
|
102
|
+
function parseGatusTemplateBody(body: string): GatusWebhookPayload {
|
|
103
|
+
const extract = (key: string): string => {
|
|
104
|
+
const match = body.match(new RegExp(`"${key}"\\s*:\\s*(.+)`));
|
|
105
|
+
return match ? match[1].trim().replace(/^"/, '').replace(/"$/, '').replace(/,\s*$/, '') : '';
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
// Gatus substitutes [RESOLVED] as true/false text, or the literal [RESOLVED]/[NOT_RESOLVED]
|
|
109
|
+
const resolvedRaw = extract('resolved');
|
|
110
|
+
const resolved = resolvedRaw === 'true' || resolvedRaw === '[RESOLVED]';
|
|
111
|
+
|
|
112
|
+
return {
|
|
113
|
+
endpoint_name: extract('endpoint_name'),
|
|
114
|
+
endpoint_group: extract('endpoint_group'),
|
|
115
|
+
endpoint_url: extract('endpoint_url'),
|
|
116
|
+
alert_description: extract('alert_description'),
|
|
117
|
+
resolved,
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
/** Payload posted by GitHub Actions workflows reporting deploy outcomes. */
interface GitHubActionsPayload {
  event: string;
  service: string;
  status: string; // 'failure' | 'success'
  commit: string;
  workflow: string;
  timestamp: string;
}

// Error alert payload from platform-usage worker
interface ErrorAlertPayload {
  type: 'p0_immediate' | 'p1_digest' | 'p2_summary';
  feature_key: string; // e.g. 'my-project:scanner:github'
  project: string;
  category: string;
  feature: string;
  worker?: string;
  correlation_id?: string;

  // P0 fields
  error_category?: string; // 'CIRCUIT_BREAKER', 'NETWORK', etc.
  error_code?: string;
  error_message?: string;
  error_rate?: number; // Percentage 0-100
  window_minutes?: number;

  // P1/P2 digest fields
  total_errors?: number;
  distinct_types?: number;
  top_errors?: Array<{
    feature_key: string;
    error_category: string;
    count: number;
  }>;
  period_start?: string;
  period_end?: string;
}

/** GitHub `code_scanning_alert` webhook event payload (subset used here). */
interface GitHubCodeScanningPayload {
  action: string; // 'created' | 'reopened' | 'closed_by_user' | 'fixed' | 'appeared_in_branch' | 'closed_by_push'
  alert: {
    number: number;
    created_at: string;
    updated_at?: string;
    url: string;
    html_url: string;
    state: string; // 'open' | 'dismissed' | 'fixed'
    dismissed_by?: any;
    dismissed_at?: string;
    dismissed_reason?: string;
    rule: {
      id: string;
      severity: string; // 'error' | 'warning' | 'note'
      security_severity_level?: string; // 'critical' | 'high' | 'medium' | 'low'
      description: string;
      name?: string;
      tags?: string[];
    };
    tool: {
      name: string; // 'CodeQL'
      version?: string;
    };
    most_recent_instance: {
      ref: string;
      state: string;
      commit_sha: string;
      message: {
        text: string;
      };
      location: {
        path: string;
        start_line?: number;
        end_line?: number;
      };
    };
  };
  repository: {
    id: number;
    name: string;
    full_name: string;
    html_url: string;
  };
  sender: {
    login: string;
  };
}
|
|
207
|
+
|
|
208
|
+
/**
|
|
209
|
+
* Helper: get dashboard URL from env or fallback
|
|
210
|
+
*/
|
|
211
|
+
function getDashboardUrl(env: Env): string {
|
|
212
|
+
return env.DASHBOARD_URL || '/dashboard';
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
/**
|
|
216
|
+
* Helper: get Gatus status page URL from env or fallback
|
|
217
|
+
*/
|
|
218
|
+
function getGatusUrl(env: Env): string {
|
|
219
|
+
return env.GATUS_URL || '';
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
/**
 * Worker entry point. Routes each incoming webhook to its source-specific
 * handler, wrapping all processing in SDK feature-budget tracking. The
 * /health endpoint is answered before any SDK setup so probes stay cheap.
 */
export default {
  async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise<Response> {
    const url = new URL(request.url);

    // Health check - bypass SDK for lightweight endpoint
    if (url.pathname === '/health') {
      return new Response(JSON.stringify({ status: 'ok', service: 'alert-router' }), {
        headers: { 'Content-Type': 'application/json' },
      });
    }

    // Create structured logger
    const log = createLoggerFromRequest(request, env, 'alert-router', MONITOR_ALERT_ROUTER);

    // Capture raw Fetcher BEFORE SDK proxying -- the triple-layer Proxy in
    // withFeatureBudget() wraps Fetcher.fetch() in an async wrapper that causes
    // "Illegal invocation" on Cloudflare's native service bindings.
    const notificationsApi = env.NOTIFICATIONS_API;

    // Wrap with SDK tracking for all alert processing
    try {
      const trackedEnv = withFeatureBudget(env, MONITOR_ALERT_ROUTER, { ctx });

      let response: Response;

      // Route by source (exact pathname match per source).
      if (url.pathname === '/gatus') {
        response = await handleGatus(request, trackedEnv, log, notificationsApi);
      } else if (url.pathname === '/hetrixtools') {
        response = await handleHetrixTools(request, trackedEnv, log, notificationsApi);
      } else if (url.pathname === '/netdata') {
        response = await handleNetdata(request, trackedEnv, log, notificationsApi);
      } else if (url.pathname === '/github/code-scanning') {
        response = await handleGitHubCodeScanning(request, trackedEnv, log, notificationsApi);
      } else if (url.pathname === '/github') {
        response = await handleGitHubActions(request, trackedEnv, log, notificationsApi);
      } else if (url.pathname === '/errors') {
        response = await handleErrorAlert(request, trackedEnv, log, notificationsApi);
      } else {
        response = new Response('Alert Router Worker', { status: 200 });
      }

      // Flush SDK telemetry before returning the handler's response.
      await completeTracking(trackedEnv);
      return response;
    } catch (e) {
      // Circuit breaker trips map to 503; anything else propagates to the runtime.
      if (e instanceof CircuitBreakerError) {
        log.warn('Circuit breaker STOP', e, { reason: e.reason });
        return new Response(
          JSON.stringify({
            error: 'Service temporarily unavailable',
            reason: e.reason,
          }),
          { status: 503, headers: { 'Content-Type': 'application/json' } }
        );
      }
      throw e;
    }
  },
};
|
|
283
|
+
|
|
284
|
+
/**
 * Handle HetrixTools webhook (HTTP uptime monitoring).
 *
 * Normalizes the payload into the common Alert shape ('offline' -> p1/firing,
 * anything else -> p2/resolved), then runs the shared processAlert/routeAlert
 * pipeline. Returns 200 with the incident on success, 500 with the error
 * message on failure.
 */
async function handleHetrixTools(request: Request, env: Env, log: Logger, notificationsApi?: Fetcher): Promise<Response> {
  try {
    const payload = (await request.json()) as HetrixToolsPayload;

    // Build error message from location-specific errors
    let errorMessage = 'No details provided';
    if (payload.monitor_errors && Object.keys(payload.monitor_errors).length > 0) {
      errorMessage = Object.entries(payload.monitor_errors)
        .map(([location, error]) => `${location}: ${error}`)
        .join(', ');
    }

    // Extract service_id from monitor_name: "Platform: error-collector /health" -> "error-collector"
    const serviceId = extractHetrixToolsServiceId(payload.monitor_name);

    const alert: Alert = {
      id: crypto.randomUUID(),
      source: 'hetrixtools',
      severity: payload.monitor_status === 'offline' ? 'p1' : 'p2',
      status: payload.monitor_status === 'offline' ? 'firing' : 'resolved',
      service_id: serviceId,
      monitor_id: payload.monitor_id,
      summary: `${payload.monitor_name}: ${payload.monitor_status}`,
      message: errorMessage,
      // HetrixTools sends a UNIX timestamp (seconds); convert to ISO 8601.
      timestamp: new Date(payload.timestamp * 1000).toISOString(),
      metadata: {
        monitorTarget: payload.monitor_target,
        monitorType: payload.monitor_type,
        monitorCategory: payload.monitor_category,
        monitorErrors: payload.monitor_errors,
        rawPayload: payload,
      },
    };

    // Dedup/correlate first, then route -- order matters.
    const incident = await processAlert(alert, env, log);
    await routeAlert(incident, env, log, notificationsApi);

    log.info('HetrixTools alert processed', {
      incident_key: incident.incident_key,
      status: alert.status,
    });

    return new Response(JSON.stringify({ status: 'processed', incident }), {
      headers: { 'Content-Type': 'application/json' },
    });
  } catch (error) {
    log.error('HetrixTools webhook error', error);
    return new Response(
      JSON.stringify({ error: error instanceof Error ? error.message : 'Unknown error' }),
      {
        status: 500,
        headers: { 'Content-Type': 'application/json' },
      }
    );
  }
}
|
|
343
|
+
|
|
344
|
+
/**
|
|
345
|
+
* Handle Netdata webhook
|
|
346
|
+
*/
|
|
347
|
+
async function handleNetdata(request: Request, env: Env, log: Logger, notificationsApi?: Fetcher): Promise<Response> {
|
|
348
|
+
try {
|
|
349
|
+
const payload = (await request.json()) as NetdataPayload;
|
|
350
|
+
|
|
351
|
+
const alert: Alert = {
|
|
352
|
+
id: crypto.randomUUID(),
|
|
353
|
+
source: 'netdata',
|
|
354
|
+
severity: payload.status === 'CRITICAL' ? 'p0' : payload.status === 'WARNING' ? 'p1' : 'p2',
|
|
355
|
+
status: payload.status === 'CLEAR' ? 'resolved' : 'firing',
|
|
356
|
+
service_id: extractNetdataServiceId(payload.alarm, payload.host),
|
|
357
|
+
monitor_id: `${payload.host}:${payload.alarm}`,
|
|
358
|
+
summary: `${payload.alarm} on ${payload.host}`,
|
|
359
|
+
message: `${payload.info} (value: ${payload.value}${payload.units})`,
|
|
360
|
+
timestamp: new Date().toISOString(),
|
|
361
|
+
metadata: {
|
|
362
|
+
chart: payload.chart,
|
|
363
|
+
family: payload.family,
|
|
364
|
+
rawPayload: payload,
|
|
365
|
+
},
|
|
366
|
+
};
|
|
367
|
+
|
|
368
|
+
const incident = await processAlert(alert, env, log);
|
|
369
|
+
await routeAlert(incident, env, log, notificationsApi);
|
|
370
|
+
|
|
371
|
+
log.info('Netdata alert processed', {
|
|
372
|
+
incident_key: incident.incident_key,
|
|
373
|
+
status: alert.status,
|
|
374
|
+
});
|
|
375
|
+
|
|
376
|
+
return new Response(JSON.stringify({ status: 'processed', incident }), {
|
|
377
|
+
headers: { 'Content-Type': 'application/json' },
|
|
378
|
+
});
|
|
379
|
+
} catch (error) {
|
|
380
|
+
log.error('Netdata webhook error', error);
|
|
381
|
+
return new Response(
|
|
382
|
+
JSON.stringify({ error: error instanceof Error ? error.message : 'Unknown error' }),
|
|
383
|
+
{
|
|
384
|
+
status: 500,
|
|
385
|
+
headers: { 'Content-Type': 'application/json' },
|
|
386
|
+
}
|
|
387
|
+
);
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
/**
 * Handle Gatus custom webhook alert
 *
 * Gatus sends alerts via its custom alerting provider when endpoints go down/up.
 * Payload template is configured in your Gatus config.yaml.
 *
 * Heartbeat-group endpoints that go down are treated as p0; other down
 * endpoints are p1; recoveries are p2/resolved.
 */
async function handleGatus(request: Request, env: Env, log: Logger, notificationsApi?: Fetcher): Promise<Response> {
  try {
    // Parse body defensively - Gatus may send non-JSON with [PLACEHOLDER] syntax
    const bodyText = await request.text();
    let payload: GatusWebhookPayload;
    try {
      payload = JSON.parse(bodyText) as GatusWebhookPayload;
    } catch {
      // Gatus default template uses [RESOLVED], [ENDPOINT_NAME] etc. - not valid JSON
      log.warn('Gatus sent non-JSON body, parsing as template format', undefined, {
        bodyPreview: bodyText.slice(0, 200),
      });
      payload = parseGatusTemplateBody(bodyText);
    }

    const isHeartbeat = payload.endpoint_group === 'heartbeats';
    const isDown = !payload.resolved;
    const name = payload.endpoint_name || 'Unknown endpoint';

    const alert: Alert = {
      id: crypto.randomUUID(),
      source: 'gatus',
      // Heartbeat failures are the primary "something is dead" signal -> p0.
      severity: isDown ? (isHeartbeat ? 'p0' : 'p1') : 'p2',
      status: isDown ? 'firing' : 'resolved',
      service_id: extractGatusServiceId(name),
      monitor_id: `gatus:${payload.endpoint_group}:${name}`,
      summary: `${name}: ${isDown ? 'DOWN' : 'UP'}`,
      message: payload.alert_description || `Endpoint status: ${isDown ? 'down' : 'up'}`,
      // Gatus payload carries no timestamp; use receipt time.
      timestamp: new Date().toISOString(),
      metadata: {
        endpointUrl: payload.endpoint_url,
        endpointGroup: payload.endpoint_group,
        rawPayload: payload,
      },
    };

    // Dedup/correlate first, then route -- order matters.
    const incident = await processAlert(alert, env, log);
    await routeAlert(incident, env, log, notificationsApi);

    log.info('Gatus alert processed', {
      incident_key: incident.incident_key,
      status: alert.status,
    });

    return new Response(JSON.stringify({ status: 'processed', incident }), {
      headers: { 'Content-Type': 'application/json' },
    });
  } catch (error) {
    log.error('Gatus webhook error', error);
    return new Response(
      JSON.stringify({ error: error instanceof Error ? error.message : 'Unknown error' }),
      {
        status: 500,
        headers: { 'Content-Type': 'application/json' },
      }
    );
  }
}
|
|
455
|
+
|
|
456
|
+
/**
 * Handle GitHub Actions webhook
 *
 * Deployment failures are always treated as p0. 'failure' fires the alert;
 * any other status resolves it. Returns 200 with the incident on success,
 * 500 on failure.
 */
async function handleGitHubActions(request: Request, env: Env, log: Logger, notificationsApi?: Fetcher): Promise<Response> {
  try {
    const payload = (await request.json()) as GitHubActionsPayload;

    const alert: Alert = {
      id: crypto.randomUUID(),
      source: 'github',
      severity: 'p0', // Deployment failures are critical
      status: payload.status === 'failure' ? 'firing' : 'resolved',
      service_id: payload.service,
      monitor_id: `github:${payload.service}:${payload.event}`,
      summary: `${payload.workflow} failed for ${payload.service}`,
      message: `Commit: ${payload.commit}`,
      timestamp: payload.timestamp,
      metadata: {
        workflow: payload.workflow,
        commit: payload.commit,
        rawPayload: payload,
      },
    };

    // Dedup/correlate first, then route -- order matters.
    const incident = await processAlert(alert, env, log);
    await routeAlert(incident, env, log, notificationsApi);

    log.info('GitHub Actions alert processed', {
      incident_key: incident.incident_key,
      status: alert.status,
    });

    return new Response(JSON.stringify({ status: 'processed', incident }), {
      headers: { 'Content-Type': 'application/json' },
    });
  } catch (error) {
    log.error('GitHub Actions webhook error', error);
    return new Response(
      JSON.stringify({ error: error instanceof Error ? error.message : 'Unknown error' }),
      {
        status: 500,
        headers: { 'Content-Type': 'application/json' },
      }
    );
  }
}
|
|
502
|
+
|
|
503
|
+
/**
 * Handle GitHub Code Scanning webhook (CodeQL, etc.)
 *
 * Normalizes the code_scanning_alert event into the common Alert shape,
 * applies a 48-hour "baseline" silent window per repository (the first
 * firing alert from a repo starts the window via a KV entry with a 48h TTL;
 * firing alerts inside that window are recorded but not routed to Slack),
 * and opens a GitHub Issue for unsuppressed P0 alerts.
 */
async function handleGitHubCodeScanning(
  request: Request,
  env: Env,
  log: Logger,
  notificationsApi?: Fetcher
): Promise<Response> {
  try {
    const payload = (await request.json()) as GitHubCodeScanningPayload;

    // Map CodeQL severity to Platform severity. When the rule has no
    // security_severity_level, fall back on the rule severity:
    // 'error' -> 'high', anything else -> 'medium'.
    const securityLevel =
      payload.alert.rule.security_severity_level ||
      (payload.alert.rule.severity === 'error' ? 'high' : 'medium');

    const severity = mapCodeQLSeverity(securityLevel);

    // Determine status from action
    const status = mapCodeQLAction(payload.action);

    // Extract repository name for service_id
    const repoName = payload.repository.name;
    const serviceId = `github-${repoName}`;

    // Build alert summary
    const location = payload.alert.most_recent_instance?.location?.path || 'unknown';
    const line = payload.alert.most_recent_instance?.location?.start_line || 0;
    const ruleName = payload.alert.rule.name || payload.alert.rule.id;

    const alert: Alert = {
      id: crypto.randomUUID(),
      source: 'github-security',
      severity,
      status,
      service_id: serviceId,
      monitor_id: `codeql:${payload.repository.full_name}:${payload.alert.number}`,
      summary: `CodeQL: ${ruleName} in ${location}${line > 0 ? `:${line}` : ''}`,
      message: payload.alert.rule.description,
      timestamp: payload.alert.created_at,
      metadata: {
        repository: payload.repository.full_name,
        alert_number: payload.alert.number,
        rule_id: payload.alert.rule.id,
        security_level: securityLevel,
        html_url: payload.alert.html_url,
        tool: payload.alert.tool.name,
        action: payload.action,
        location: {
          path: location,
          line: line,
        },
        rawPayload: payload,
      },
    };

    // Check 48h silent window for baseline suppression
    const silentWindowKey = `codeql-scan-start:${payload.repository.full_name}`;
    const scanStartTime = await env.PLATFORM_ALERTS.get(silentWindowKey);

    let baselineSuppressed = false;

    if (!scanStartTime && status === 'firing') {
      // First time seeing alerts from this repo - start 48h window
      await env.PLATFORM_ALERTS.put(silentWindowKey, new Date().toISOString(), {
        expirationTtl: 48 * 3600, // 48 hours
      });
      baselineSuppressed = true;
      log.info('Started 48h silent window', { repository: payload.repository.full_name });
    } else if (scanStartTime && status === 'firing') {
      // Check if we're still within 48h window
      const startTime = new Date(scanStartTime).getTime();
      const now = new Date().getTime();
      const hoursSinceStart = (now - startTime) / (1000 * 3600);

      if (hoursSinceStart < 48) {
        baselineSuppressed = true;
        log.info('Suppressing baseline alert', {
          repository: payload.repository.full_name,
          hours_since_start: hoursSinceStart.toFixed(1),
        });
      }
    }

    // Add baseline suppression to metadata
    if (baselineSuppressed) {
      if (!alert.metadata) alert.metadata = {};
      alert.metadata.baseline_suppressed = true;
      alert.metadata.suppression_reason = 'Within 48h of first CodeQL scan (baseline alerts)';
    }

    // Always record the incident, even when suppressed.
    const incident = await processAlert(alert, env, log, baselineSuppressed);

    // Only route to Slack if not baseline-suppressed
    if (!baselineSuppressed) {
      await routeAlert(incident, env, log, notificationsApi);

      // Create GitHub Issue for P0 alerts
      if (severity === 'p0' && status === 'firing') {
        await createGitHubIssue(alert, payload, env, log);
      }
    }

    log.info('GitHub Code Scanning alert processed', {
      incident_key: incident.incident_key,
      status: alert.status,
      baseline_suppressed: baselineSuppressed,
    });

    return new Response(
      JSON.stringify({
        status: 'processed',
        incident,
        baseline_suppressed: baselineSuppressed,
      }),
      {
        headers: { 'Content-Type': 'application/json' },
      }
    );
  } catch (error) {
    log.error('GitHub Code Scanning webhook error', error);
    return new Response(
      JSON.stringify({ error: error instanceof Error ? error.message : 'Unknown error' }),
      {
        status: 500,
        headers: { 'Content-Type': 'application/json' },
      }
    );
  }
}
|
|
634
|
+
|
|
635
|
+
/**
|
|
636
|
+
* Map CodeQL security severity to Platform severity
|
|
637
|
+
*/
|
|
638
|
+
function mapCodeQLSeverity(securityLevel: string): 'p0' | 'p1' | 'p2' {
|
|
639
|
+
const mapping: Record<string, 'p0' | 'p1' | 'p2'> = {
|
|
640
|
+
critical: 'p0',
|
|
641
|
+
high: 'p0',
|
|
642
|
+
medium: 'p1',
|
|
643
|
+
low: 'p2',
|
|
644
|
+
warning: 'p2',
|
|
645
|
+
note: 'p2',
|
|
646
|
+
};
|
|
647
|
+
|
|
648
|
+
return mapping[securityLevel.toLowerCase()] || 'p1';
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
/**
|
|
652
|
+
* Map CodeQL action to alert status
|
|
653
|
+
*/
|
|
654
|
+
function mapCodeQLAction(action: string): 'firing' | 'resolved' {
|
|
655
|
+
const firingActions = ['created', 'reopened', 'appeared_in_branch'];
|
|
656
|
+
const resolvedActions = ['closed_by_user', 'fixed', 'closed_by_push'];
|
|
657
|
+
|
|
658
|
+
if (firingActions.includes(action)) {
|
|
659
|
+
return 'firing';
|
|
660
|
+
} else if (resolvedActions.includes(action)) {
|
|
661
|
+
return 'resolved';
|
|
662
|
+
}
|
|
663
|
+
|
|
664
|
+
// Default to firing for unknown actions
|
|
665
|
+
return 'firing';
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
/**
 * Handle error alerts from platform-usage worker.
 *
 * Routes:
 * - P0: Immediate alert (circuit breaker, >50% error rate)
 * - P1: Hourly digest (>20% error rate, >100 errors)
 * - P2: Daily summary
 *
 * Pipeline: dedupe (P0 only, via KV) -> Slack webhook -> D1 persistence ->
 * optional in-app notification via the platform-notifications service binding.
 *
 * @param request          Incoming request whose body is an ErrorAlertPayload.
 * @param env              Worker bindings (PLATFORM_ALERTS KV, SLACK_WEBHOOK_URL, D1).
 * @param log              Structured logger.
 * @param notificationsApi Raw Fetcher binding to platform-notifications (optional).
 * @returns 200 with a status JSON on success, 500 on failure.
 */
async function handleErrorAlert(request: Request, env: Env, log: Logger, notificationsApi?: Fetcher): Promise<Response> {
  try {
    const payload = (await request.json()) as ErrorAlertPayload;
    log.info('Processing error alert', { type: payload.type, feature_key: payload.feature_key });

    // Check deduplication for P0 alerts. The key buckets by feature, error
    // category, and the current hour (Date.now()/3600000), so at most one P0
    // alert per feature/category fires per hour.
    if (payload.type === 'p0_immediate') {
      const dedupeKey = `error:${payload.feature_key}:${payload.error_category}:${Math.floor(Date.now() / 3600000)}`;
      const existing = await env.PLATFORM_ALERTS.get(dedupeKey);
      if (existing) {
        log.info('Error alert deduplicated', { dedupe_key: dedupeKey });
        return new Response(JSON.stringify({ status: 'deduplicated', key: dedupeKey }), {
          headers: { 'Content-Type': 'application/json' },
        });
      }

      // Store with TTL based on priority.
      // NOTE(review): the dedupe entry is written *before* the Slack send
      // below; if Slack fails, retries within the hour are suppressed —
      // confirm this trade-off is intended.
      await env.PLATFORM_ALERTS.put(dedupeKey, JSON.stringify(payload), {
        expirationTtl: 3600, // 1 hour for P0
      });
    }

    // Build and send Slack message
    const slackMessage = buildErrorSlackMessage(payload, env);

    const response = await fetch(env.SLACK_WEBHOOK_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(slackMessage),
    });

    // A failed Slack delivery aborts the handler (caught below -> 500),
    // skipping D1 persistence and the in-app notification.
    if (!response.ok) {
      throw new Error(`Slack webhook failed: ${response.status}`);
    }

    // Store in D1 for historical analysis (best-effort inside the helper).
    await storeErrorAlertInD1(payload, env, log);

    // Create in-app notification (uses raw Fetcher, not proxied env)
    if (notificationsApi) {
      const dashboardUrl = getDashboardUrl(env);
      // Alert type -> notification priority.
      const priorityMap: Record<string, string> = {
        p0_immediate: 'critical',
        p1_digest: 'high',
        p2_summary: 'medium',
      };
      // Alert type -> notification category.
      const categoryMap: Record<string, string> = {
        p0_immediate: 'error',
        p1_digest: 'warning',
        p2_summary: 'info',
      };
      try {
        await notificationsApi.fetch(
          'https://platform-notifications/notifications',
          {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              category: categoryMap[payload.type] || 'warning',
              source: 'alert-router:error',
              source_id: payload.correlation_id || `error-${Date.now()}`,
              title: payload.type === 'p0_immediate'
                ? `P0 Error: ${payload.feature_key}`
                : payload.type === 'p1_digest'
                ? `Error Digest: ${payload.total_errors} errors`
                : `Daily Summary: ${payload.total_errors} errors`,
              description: payload.error_message?.slice(0, 500) || `${payload.total_errors || 1} errors detected`,
              priority: priorityMap[payload.type] || 'medium',
              action_url: `${dashboardUrl}/errors`,
              action_label: 'View Errors',
              project: payload.project,
            }),
          }
        );
      } catch (error) {
        // Notification failure is non-fatal: the alert was already delivered
        // to Slack and persisted.
        log.error('Failed to create error notification', error);
      }
    }

    return new Response(JSON.stringify({ status: 'processed', type: payload.type }), {
      headers: { 'Content-Type': 'application/json' },
    });
  } catch (error) {
    log.error('Error alert processing failed', error);
    return new Response(
      JSON.stringify({ error: error instanceof Error ? error.message : 'Unknown error' }),
      { status: 500, headers: { 'Content-Type': 'application/json' } }
    );
  }
}
|
|
766
|
+
|
|
767
|
+
/**
|
|
768
|
+
* Build Slack message for error alerts with rich investigation context
|
|
769
|
+
*/
|
|
770
|
+
function buildErrorSlackMessage(payload: ErrorAlertPayload, env: Env): Record<string, unknown> {
|
|
771
|
+
const timestamp = new Date().toISOString();
|
|
772
|
+
const dashboardUrl = getDashboardUrl(env);
|
|
773
|
+
|
|
774
|
+
// Dashboard and observability URLs
|
|
775
|
+
const monitorUrl = `${dashboardUrl}/usage/monitor`;
|
|
776
|
+
const observabilityUrl = `https://dash.cloudflare.com/?to=/:account/workers/observability`;
|
|
777
|
+
const featureKey = payload.feature_key;
|
|
778
|
+
|
|
779
|
+
if (payload.type === 'p0_immediate') {
|
|
780
|
+
// P0: Critical immediate alert with full context
|
|
781
|
+
const isCircuitBreaker = payload.error_category === 'CIRCUIT_BREAKER';
|
|
782
|
+
const emoji = isCircuitBreaker ? '🔴' : '🚨';
|
|
783
|
+
const title = isCircuitBreaker
|
|
784
|
+
? `Circuit Breaker Tripped: ${featureKey}`
|
|
785
|
+
: `High Error Rate: ${featureKey}`;
|
|
786
|
+
|
|
787
|
+
return {
|
|
788
|
+
text: `${emoji} [P0] ${title}`,
|
|
789
|
+
blocks: [
|
|
790
|
+
{
|
|
791
|
+
type: 'header',
|
|
792
|
+
text: {
|
|
793
|
+
type: 'plain_text',
|
|
794
|
+
text: `${emoji} [P0] ${title}`,
|
|
795
|
+
emoji: true,
|
|
796
|
+
},
|
|
797
|
+
},
|
|
798
|
+
{
|
|
799
|
+
type: 'section',
|
|
800
|
+
fields: [
|
|
801
|
+
{ type: 'mrkdwn', text: `*Feature:*\n\`${featureKey}\`` },
|
|
802
|
+
{ type: 'mrkdwn', text: `*Error Category:*\n${payload.error_category || 'Unknown'}` },
|
|
803
|
+
{ type: 'mrkdwn', text: `*Error Code:*\n${payload.error_code || 'N/A'}` },
|
|
804
|
+
{
|
|
805
|
+
type: 'mrkdwn',
|
|
806
|
+
text: `*Error Rate:*\n${payload.error_rate?.toFixed(1)}% (last ${payload.window_minutes}min)`,
|
|
807
|
+
},
|
|
808
|
+
{ type: 'mrkdwn', text: `*Worker:*\n${payload.worker || 'Unknown'}` },
|
|
809
|
+
{ type: 'mrkdwn', text: `*Time:*\n${timestamp}` },
|
|
810
|
+
],
|
|
811
|
+
},
|
|
812
|
+
{
|
|
813
|
+
type: 'section',
|
|
814
|
+
text: {
|
|
815
|
+
type: 'mrkdwn',
|
|
816
|
+
text: `*Error Message:*\n\`\`\`${(payload.error_message || 'No message available').slice(0, 500)}\`\`\``,
|
|
817
|
+
},
|
|
818
|
+
},
|
|
819
|
+
{
|
|
820
|
+
type: 'divider',
|
|
821
|
+
},
|
|
822
|
+
{
|
|
823
|
+
type: 'section',
|
|
824
|
+
text: {
|
|
825
|
+
type: 'mrkdwn',
|
|
826
|
+
text:
|
|
827
|
+
`*Investigation Context*\n` +
|
|
828
|
+
`• *Correlation ID:* \`${payload.correlation_id || 'N/A'}\`\n` +
|
|
829
|
+
`• *Project:* ${payload.project} | *Category:* ${payload.category} | *Feature:* ${payload.feature}\n` +
|
|
830
|
+
`• *Pattern:* Check if this is a recurring failure or new issue`,
|
|
831
|
+
},
|
|
832
|
+
},
|
|
833
|
+
{
|
|
834
|
+
type: 'section',
|
|
835
|
+
text: {
|
|
836
|
+
type: 'mrkdwn',
|
|
837
|
+
text:
|
|
838
|
+
`*Suggested Investigation Steps*\n` +
|
|
839
|
+
`1. Check Workers Observability for recent logs with correlation ID\n` +
|
|
840
|
+
`2. Review feature budget status in dashboard\n` +
|
|
841
|
+
`3. Check if upstream dependencies are healthy\n` +
|
|
842
|
+
`4. Look for recent deployments or config changes\n` +
|
|
843
|
+
isCircuitBreaker
|
|
844
|
+
? `5. If safe, reset circuit breaker: \`KV delete CIRCUIT:${featureKey}:state\``
|
|
845
|
+
: `5. Consider increasing error budget threshold if expected`,
|
|
846
|
+
},
|
|
847
|
+
},
|
|
848
|
+
{
|
|
849
|
+
type: 'actions',
|
|
850
|
+
elements: [
|
|
851
|
+
{
|
|
852
|
+
type: 'button',
|
|
853
|
+
text: { type: 'plain_text', text: 'Usage Monitor', emoji: true },
|
|
854
|
+
url: monitorUrl,
|
|
855
|
+
action_id: 'open_dashboard',
|
|
856
|
+
},
|
|
857
|
+
{
|
|
858
|
+
type: 'button',
|
|
859
|
+
text: { type: 'plain_text', text: 'Workers Observability', emoji: true },
|
|
860
|
+
url: observabilityUrl,
|
|
861
|
+
action_id: 'open_observability',
|
|
862
|
+
},
|
|
863
|
+
],
|
|
864
|
+
},
|
|
865
|
+
],
|
|
866
|
+
attachments: [
|
|
867
|
+
{
|
|
868
|
+
color: 'danger',
|
|
869
|
+
footer: `Platform Alert Router | Feature: ${featureKey}`,
|
|
870
|
+
ts: Math.floor(Date.now() / 1000),
|
|
871
|
+
},
|
|
872
|
+
],
|
|
873
|
+
};
|
|
874
|
+
} else if (payload.type === 'p1_digest') {
|
|
875
|
+
// P1: Hourly digest with aggregated errors
|
|
876
|
+
return {
|
|
877
|
+
text: `[P1] Error Digest: ${payload.total_errors} errors (last hour)`,
|
|
878
|
+
blocks: [
|
|
879
|
+
{
|
|
880
|
+
type: 'header',
|
|
881
|
+
text: {
|
|
882
|
+
type: 'plain_text',
|
|
883
|
+
text: `[P1] Hourly Error Digest`,
|
|
884
|
+
emoji: true,
|
|
885
|
+
},
|
|
886
|
+
},
|
|
887
|
+
{
|
|
888
|
+
type: 'section',
|
|
889
|
+
fields: [
|
|
890
|
+
{ type: 'mrkdwn', text: `*Total Errors:*\n${payload.total_errors}` },
|
|
891
|
+
{ type: 'mrkdwn', text: `*Distinct Types:*\n${payload.distinct_types}` },
|
|
892
|
+
{ type: 'mrkdwn', text: `*Period:*\n${payload.period_start} - ${payload.period_end}` },
|
|
893
|
+
],
|
|
894
|
+
},
|
|
895
|
+
{
|
|
896
|
+
type: 'section',
|
|
897
|
+
text: {
|
|
898
|
+
type: 'mrkdwn',
|
|
899
|
+
text:
|
|
900
|
+
'*Top Errors:*\n' +
|
|
901
|
+
(payload.top_errors || [])
|
|
902
|
+
.slice(0, 5)
|
|
903
|
+
.map((e, i) => `${i + 1}. \`${e.feature_key}\` - ${e.error_category} (${e.count})`)
|
|
904
|
+
.join('\n'),
|
|
905
|
+
},
|
|
906
|
+
},
|
|
907
|
+
{
|
|
908
|
+
type: 'divider',
|
|
909
|
+
},
|
|
910
|
+
{
|
|
911
|
+
type: 'section',
|
|
912
|
+
text: {
|
|
913
|
+
type: 'mrkdwn',
|
|
914
|
+
text:
|
|
915
|
+
`*Investigation Context*\n` +
|
|
916
|
+
`• Review the top error features for patterns\n` +
|
|
917
|
+
`• Check if errors correlate with traffic spikes\n` +
|
|
918
|
+
`• Look for common error categories across features`,
|
|
919
|
+
},
|
|
920
|
+
},
|
|
921
|
+
{
|
|
922
|
+
type: 'actions',
|
|
923
|
+
elements: [
|
|
924
|
+
{
|
|
925
|
+
type: 'button',
|
|
926
|
+
text: { type: 'plain_text', text: 'Usage Monitor', emoji: true },
|
|
927
|
+
url: monitorUrl,
|
|
928
|
+
action_id: 'open_dashboard',
|
|
929
|
+
},
|
|
930
|
+
],
|
|
931
|
+
},
|
|
932
|
+
],
|
|
933
|
+
attachments: [
|
|
934
|
+
{
|
|
935
|
+
color: 'warning',
|
|
936
|
+
footer: 'Platform Alert Router | Hourly Digest',
|
|
937
|
+
ts: Math.floor(Date.now() / 1000),
|
|
938
|
+
},
|
|
939
|
+
],
|
|
940
|
+
};
|
|
941
|
+
} else {
|
|
942
|
+
// P2: Daily summary
|
|
943
|
+
return {
|
|
944
|
+
text: `[P2] Daily Error Summary: ${payload.total_errors} errors`,
|
|
945
|
+
blocks: [
|
|
946
|
+
{
|
|
947
|
+
type: 'header',
|
|
948
|
+
text: {
|
|
949
|
+
type: 'plain_text',
|
|
950
|
+
text: `[P2] Daily Error Summary`,
|
|
951
|
+
emoji: true,
|
|
952
|
+
},
|
|
953
|
+
},
|
|
954
|
+
{
|
|
955
|
+
type: 'section',
|
|
956
|
+
fields: [
|
|
957
|
+
{ type: 'mrkdwn', text: `*Total Errors:*\n${payload.total_errors}` },
|
|
958
|
+
{ type: 'mrkdwn', text: `*Features Affected:*\n${payload.distinct_types}` },
|
|
959
|
+
{ type: 'mrkdwn', text: `*Period:*\n${payload.period_start} - ${payload.period_end}` },
|
|
960
|
+
],
|
|
961
|
+
},
|
|
962
|
+
{
|
|
963
|
+
type: 'section',
|
|
964
|
+
text: {
|
|
965
|
+
type: 'mrkdwn',
|
|
966
|
+
text:
|
|
967
|
+
'*Error Breakdown:*\n' +
|
|
968
|
+
(payload.top_errors || [])
|
|
969
|
+
.map((e, i) => `${i + 1}. \`${e.feature_key}\` - ${e.error_category} (${e.count})`)
|
|
970
|
+
.join('\n'),
|
|
971
|
+
},
|
|
972
|
+
},
|
|
973
|
+
{
|
|
974
|
+
type: 'context',
|
|
975
|
+
elements: [
|
|
976
|
+
{
|
|
977
|
+
type: 'mrkdwn',
|
|
978
|
+
text: `_Low-priority errors aggregated for review. No immediate action required._`,
|
|
979
|
+
},
|
|
980
|
+
],
|
|
981
|
+
},
|
|
982
|
+
],
|
|
983
|
+
attachments: [
|
|
984
|
+
{
|
|
985
|
+
color: '#439FE0',
|
|
986
|
+
footer: 'Platform Alert Router | Daily Summary',
|
|
987
|
+
ts: Math.floor(Date.now() / 1000),
|
|
988
|
+
},
|
|
989
|
+
],
|
|
990
|
+
};
|
|
991
|
+
}
|
|
992
|
+
}
|
|
993
|
+
|
|
994
|
+
/**
|
|
995
|
+
* Store error alert in D1 for historical analysis
|
|
996
|
+
*/
|
|
997
|
+
async function storeErrorAlertInD1(
|
|
998
|
+
payload: ErrorAlertPayload,
|
|
999
|
+
env: Env,
|
|
1000
|
+
log: Logger
|
|
1001
|
+
): Promise<void> {
|
|
1002
|
+
try {
|
|
1003
|
+
await env.PLATFORM_DB.prepare(
|
|
1004
|
+
`INSERT INTO error_alerts (
|
|
1005
|
+
feature_key,
|
|
1006
|
+
alert_type,
|
|
1007
|
+
error_category,
|
|
1008
|
+
error_code,
|
|
1009
|
+
error_count,
|
|
1010
|
+
error_rate,
|
|
1011
|
+
correlation_id,
|
|
1012
|
+
worker,
|
|
1013
|
+
created_at
|
|
1014
|
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
1015
|
+
)
|
|
1016
|
+
.bind(
|
|
1017
|
+
payload.feature_key,
|
|
1018
|
+
payload.type,
|
|
1019
|
+
payload.error_category || null,
|
|
1020
|
+
payload.error_code || null,
|
|
1021
|
+
payload.total_errors || 1,
|
|
1022
|
+
payload.error_rate || null,
|
|
1023
|
+
payload.correlation_id || null,
|
|
1024
|
+
payload.worker || null,
|
|
1025
|
+
Math.floor(Date.now() / 1000)
|
|
1026
|
+
)
|
|
1027
|
+
.run();
|
|
1028
|
+
} catch (error) {
|
|
1029
|
+
log.error('Failed to store error alert in D1', error);
|
|
1030
|
+
// Don't fail alert processing if D1 fails
|
|
1031
|
+
}
|
|
1032
|
+
}
|
|
1033
|
+
|
|
1034
|
+
/**
|
|
1035
|
+
* Create GitHub Issue for P0 CodeQL alert
|
|
1036
|
+
*/
|
|
1037
|
+
async function createGitHubIssue(
|
|
1038
|
+
alert: Alert,
|
|
1039
|
+
payload: GitHubCodeScanningPayload,
|
|
1040
|
+
env: Env,
|
|
1041
|
+
log: Logger
|
|
1042
|
+
): Promise<void> {
|
|
1043
|
+
try {
|
|
1044
|
+
// Extract repo details
|
|
1045
|
+
const repo = payload.repository.full_name;
|
|
1046
|
+
const [owner, repoName] = repo.split('/');
|
|
1047
|
+
|
|
1048
|
+
// Check for existing issue (idempotency)
|
|
1049
|
+
const existingIssue = await findExistingIssue(owner, repoName, alert, env, log);
|
|
1050
|
+
if (existingIssue) {
|
|
1051
|
+
log.info('Issue already exists for alert', {
|
|
1052
|
+
monitor_id: alert.monitor_id,
|
|
1053
|
+
issue_number: existingIssue.number,
|
|
1054
|
+
});
|
|
1055
|
+
return;
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
// Build labels
|
|
1059
|
+
const labels = buildIssueLabels(alert, payload);
|
|
1060
|
+
|
|
1061
|
+
// Build issue body
|
|
1062
|
+
const body = buildIssueBody(alert, payload);
|
|
1063
|
+
|
|
1064
|
+
// Create issue via GitHub API
|
|
1065
|
+
const response = await fetch(`https://api.github.com/repos/${repo}/issues`, {
|
|
1066
|
+
method: 'POST',
|
|
1067
|
+
headers: {
|
|
1068
|
+
Authorization: `Bearer ${env.GITHUB_TOKEN}`,
|
|
1069
|
+
Accept: 'application/vnd.github+json',
|
|
1070
|
+
'Content-Type': 'application/json',
|
|
1071
|
+
'User-Agent': 'Platform-Alert-Router/1.0',
|
|
1072
|
+
},
|
|
1073
|
+
body: JSON.stringify({
|
|
1074
|
+
title: `[P0][CodeQL] ${alert.summary}`,
|
|
1075
|
+
body: body,
|
|
1076
|
+
labels: labels,
|
|
1077
|
+
}),
|
|
1078
|
+
});
|
|
1079
|
+
|
|
1080
|
+
if (!response.ok) {
|
|
1081
|
+
const error = await response.text();
|
|
1082
|
+
log.error('Failed to create GitHub Issue', { status: response.status, error });
|
|
1083
|
+
return;
|
|
1084
|
+
}
|
|
1085
|
+
|
|
1086
|
+
const issue = (await response.json()) as { number: number; html_url: string };
|
|
1087
|
+
log.info('Created GitHub Issue', {
|
|
1088
|
+
issue_number: issue.number,
|
|
1089
|
+
monitor_id: alert.monitor_id,
|
|
1090
|
+
url: issue.html_url,
|
|
1091
|
+
});
|
|
1092
|
+
} catch (error) {
|
|
1093
|
+
log.error('Failed to create GitHub Issue', error);
|
|
1094
|
+
// Don't fail alert processing if issue creation fails
|
|
1095
|
+
}
|
|
1096
|
+
}
|
|
1097
|
+
|
|
1098
|
+
/**
|
|
1099
|
+
* Build labels for GitHub Issue
|
|
1100
|
+
*/
|
|
1101
|
+
function buildIssueLabels(alert: Alert, payload: GitHubCodeScanningPayload): string[] {
|
|
1102
|
+
const labels: string[] = [
|
|
1103
|
+
'security',
|
|
1104
|
+
'codeql',
|
|
1105
|
+
`severity/${alert.severity.toUpperCase()}`, // severity/P0
|
|
1106
|
+
];
|
|
1107
|
+
|
|
1108
|
+
// Add rule label (best-effort)
|
|
1109
|
+
if (payload.alert.rule?.id) {
|
|
1110
|
+
labels.push(`rule/${payload.alert.rule.id}`);
|
|
1111
|
+
}
|
|
1112
|
+
|
|
1113
|
+
// Extract CWE from tags (best-effort)
|
|
1114
|
+
const cweTags = payload.alert.rule?.tags?.filter((tag: string) =>
|
|
1115
|
+
tag.startsWith('external/cwe/')
|
|
1116
|
+
);
|
|
1117
|
+
|
|
1118
|
+
if (cweTags && cweTags.length > 0) {
|
|
1119
|
+
// Extract first CWE: "external/cwe/cwe-079" -> "cwe/CWE-79"
|
|
1120
|
+
const cweMatch = cweTags[0].match(/cwe-(\d+)/i);
|
|
1121
|
+
if (cweMatch) {
|
|
1122
|
+
labels.push(`cwe/CWE-${cweMatch[1]}`);
|
|
1123
|
+
}
|
|
1124
|
+
}
|
|
1125
|
+
|
|
1126
|
+
return labels;
|
|
1127
|
+
}
|
|
1128
|
+
|
|
1129
|
+
/**
 * Build issue body with alert details.
 *
 * Produces the markdown body for the auto-created GitHub Issue.
 * IMPORTANT: findExistingIssue matches the literal substring
 * `Alert Number**: #<n>` in this body for idempotency — keep that line's
 * wording stable if editing the template.
 */
function buildIssueBody(alert: Alert, payload: GitHubCodeScanningPayload): string {
  // NOTE(review): these `as` assertions assume the alert was built by the
  // CodeQL webhook handler and always carries location/html_url/alert_number/
  // rule_id in metadata; `location.path` below throws if location is absent —
  // confirm against the producer.
  const location = alert.metadata?.location as { path: string; line: number };
  const htmlUrl = alert.metadata?.html_url as string;
  const alertNumber = alert.metadata?.alert_number as number;
  const ruleId = alert.metadata?.rule_id as string;

  return `## CodeQL Security Alert

**Severity**: ${alert.severity.toUpperCase()} (${payload.alert.rule.security_severity_level || 'N/A'})
**Rule**: ${ruleId}
**File**: \`${location.path}:${location.line}\`

### Description

${payload.alert.rule.description}

### Remediation

See CodeQL documentation for remediation guidance for rule \`${ruleId}\`.

### Alert Details

- **Alert Number**: #${alertNumber}
- **Created**: ${payload.alert.created_at}
- **Tool**: ${payload.alert.tool.name} ${payload.alert.tool.version || ''}
- **View on GitHub**: ${htmlUrl}

### Fix Workflow

1. Create feature branch: \`git checkout -b fix/codeql-${ruleId}-${alertNumber}\`
2. Open file at line: \`${location.path}:${location.line}\`
3. Apply fix (see remediation guidance above)
4. Run tests: \`npm test\`
5. Commit: \`git commit -m "fix: CodeQL ${ruleId} (alert #${alertNumber})"\`
6. Create PR: \`gh pr create --fill\`
7. Merge and close this issue

---

Auto-created by Platform Alert Router
`;
}
|
|
1174
|
+
|
|
1175
|
+
/**
|
|
1176
|
+
* Find existing issue for alert (idempotency)
|
|
1177
|
+
*/
|
|
1178
|
+
async function findExistingIssue(
|
|
1179
|
+
owner: string,
|
|
1180
|
+
repo: string,
|
|
1181
|
+
alert: Alert,
|
|
1182
|
+
env: Env,
|
|
1183
|
+
log: Logger
|
|
1184
|
+
): Promise<{ number: number } | null> {
|
|
1185
|
+
try {
|
|
1186
|
+
// Search for open issues with codeql label and alert number in title
|
|
1187
|
+
const alertNumber = alert.metadata?.alert_number as number;
|
|
1188
|
+
const ruleId = alert.metadata?.rule_id as string;
|
|
1189
|
+
|
|
1190
|
+
const query = `repo:${owner}/${repo} is:issue is:open label:codeql "${ruleId}" in:title "${alertNumber}" in:body`;
|
|
1191
|
+
|
|
1192
|
+
const response = await fetch(
|
|
1193
|
+
`https://api.github.com/search/issues?q=${encodeURIComponent(query)}`,
|
|
1194
|
+
{
|
|
1195
|
+
headers: {
|
|
1196
|
+
Authorization: `Bearer ${env.GITHUB_TOKEN}`,
|
|
1197
|
+
Accept: 'application/vnd.github+json',
|
|
1198
|
+
'User-Agent': 'Platform-Alert-Router/1.0',
|
|
1199
|
+
},
|
|
1200
|
+
}
|
|
1201
|
+
);
|
|
1202
|
+
|
|
1203
|
+
if (!response.ok) {
|
|
1204
|
+
log.error('Failed to search for existing issues', { status: response.status });
|
|
1205
|
+
return null;
|
|
1206
|
+
}
|
|
1207
|
+
|
|
1208
|
+
const data = (await response.json()) as {
|
|
1209
|
+
items?: Array<{ number: number; title: string; body: string }>;
|
|
1210
|
+
};
|
|
1211
|
+
|
|
1212
|
+
// Check if any results match this alert number
|
|
1213
|
+
const match = data.items?.find((issue) =>
|
|
1214
|
+
issue.body?.includes(`Alert Number**: #${alertNumber}`)
|
|
1215
|
+
);
|
|
1216
|
+
|
|
1217
|
+
return match ? { number: match.number } : null;
|
|
1218
|
+
} catch (error) {
|
|
1219
|
+
log.error('Error searching for existing issues', error);
|
|
1220
|
+
return null;
|
|
1221
|
+
}
|
|
1222
|
+
}
|
|
1223
|
+
|
|
1224
|
+
/**
|
|
1225
|
+
* Handle custom alerts from internal sources (anomaly detection, etc.)
|
|
1226
|
+
*/
|
|
1227
|
+
async function handleCustomAlert(request: Request, env: Env, log: Logger, notificationsApi?: Fetcher): Promise<Response> {
|
|
1228
|
+
try {
|
|
1229
|
+
const payload = (await request.json()) as {
|
|
1230
|
+
source: string;
|
|
1231
|
+
severity: string;
|
|
1232
|
+
status: string;
|
|
1233
|
+
service_id: string;
|
|
1234
|
+
summary: string;
|
|
1235
|
+
message: string;
|
|
1236
|
+
timestamp: string;
|
|
1237
|
+
metadata?: Record<string, unknown>;
|
|
1238
|
+
};
|
|
1239
|
+
|
|
1240
|
+
const alert: Alert = {
|
|
1241
|
+
id: crypto.randomUUID(),
|
|
1242
|
+
source: 'custom',
|
|
1243
|
+
severity: (['p0', 'p1', 'p2'].includes(payload.severity) ? payload.severity : 'p2') as 'p0' | 'p1' | 'p2',
|
|
1244
|
+
status: payload.status === 'resolved' ? 'resolved' : 'firing',
|
|
1245
|
+
service_id: payload.service_id || 'unknown',
|
|
1246
|
+
summary: payload.summary,
|
|
1247
|
+
message: payload.message,
|
|
1248
|
+
timestamp: payload.timestamp || new Date().toISOString(),
|
|
1249
|
+
metadata: {
|
|
1250
|
+
...payload.metadata,
|
|
1251
|
+
customSource: payload.source,
|
|
1252
|
+
},
|
|
1253
|
+
};
|
|
1254
|
+
|
|
1255
|
+
const incident = await processAlert(alert, env, log);
|
|
1256
|
+
await routeAlert(incident, env, log, notificationsApi);
|
|
1257
|
+
|
|
1258
|
+
log.info('Custom alert processed', {
|
|
1259
|
+
tag: 'CUSTOM_ALERT',
|
|
1260
|
+
incident_key: incident.incident_key,
|
|
1261
|
+
custom_source: payload.source,
|
|
1262
|
+
});
|
|
1263
|
+
|
|
1264
|
+
return new Response(
|
|
1265
|
+
JSON.stringify({ status: 'processed', incident_key: incident.incident_key }),
|
|
1266
|
+
{ headers: { 'Content-Type': 'application/json' } }
|
|
1267
|
+
);
|
|
1268
|
+
} catch (error) {
|
|
1269
|
+
log.error('Custom alert webhook error', error instanceof Error ? error : undefined);
|
|
1270
|
+
return new Response(
|
|
1271
|
+
JSON.stringify({ error: error instanceof Error ? error.message : 'Unknown error' }),
|
|
1272
|
+
{ status: 500, headers: { 'Content-Type': 'application/json' } }
|
|
1273
|
+
);
|
|
1274
|
+
}
|
|
1275
|
+
}
|
|
1276
|
+
|
|
1277
|
+
/**
|
|
1278
|
+
* Process alert: deduplicate, correlate, suppress
|
|
1279
|
+
*/
|
|
1280
|
+
async function processAlert(
|
|
1281
|
+
alert: Alert,
|
|
1282
|
+
env: Env,
|
|
1283
|
+
log: Logger,
|
|
1284
|
+
baselineSuppressed: boolean = false
|
|
1285
|
+
): Promise<NormalizedIncident> {
|
|
1286
|
+
// 1. Generate incident key (for deduplication)
|
|
1287
|
+
const incident_key = `${alert.service_id}:${alert.status}:${alert.summary}`;
|
|
1288
|
+
|
|
1289
|
+
// 2. Check deduplication (KV cache)
|
|
1290
|
+
const existingIncident = await env.PLATFORM_ALERTS.get(incident_key);
|
|
1291
|
+
if (existingIncident && alert.status === 'firing') {
|
|
1292
|
+
log.info('Alert deduplicated', { incident_key });
|
|
1293
|
+
return JSON.parse(existingIncident);
|
|
1294
|
+
}
|
|
1295
|
+
|
|
1296
|
+
// 3. Load service registry (for dependency correlation)
|
|
1297
|
+
const serviceRegistry = await loadServiceRegistry(env, log);
|
|
1298
|
+
|
|
1299
|
+
// 4. Check if parent service is down
|
|
1300
|
+
const parentDown = await checkParentDown(alert.service_id, serviceRegistry, env, log);
|
|
1301
|
+
|
|
1302
|
+
// 5. Build incident
|
|
1303
|
+
const incident: NormalizedIncident = {
|
|
1304
|
+
incident_key,
|
|
1305
|
+
alert,
|
|
1306
|
+
parent_down: parentDown,
|
|
1307
|
+
suppressed: parentDown && alert.severity !== 'p0', // Suppress child alerts if parent down (unless P0)
|
|
1308
|
+
baseline_suppressed: baselineSuppressed, // CodeQL 48h baseline window suppression
|
|
1309
|
+
related_alerts: [],
|
|
1310
|
+
};
|
|
1311
|
+
|
|
1312
|
+
// 6. Store in KV (1-hour TTL for deduplication)
|
|
1313
|
+
if (alert.status === 'firing') {
|
|
1314
|
+
await env.PLATFORM_ALERTS.put(incident_key, JSON.stringify(incident), {
|
|
1315
|
+
expirationTtl: 3600, // 1 hour
|
|
1316
|
+
});
|
|
1317
|
+
} else {
|
|
1318
|
+
// Clear resolved alerts
|
|
1319
|
+
await env.PLATFORM_ALERTS.delete(incident_key);
|
|
1320
|
+
}
|
|
1321
|
+
|
|
1322
|
+
// 7. Store in D1 (historical record)
|
|
1323
|
+
await storeIncidentInD1(incident, env, log);
|
|
1324
|
+
|
|
1325
|
+
return incident;
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
/**
|
|
1329
|
+
* Load service registry from KV cache
|
|
1330
|
+
*/
|
|
1331
|
+
async function loadServiceRegistry(env: Env, log: Logger): Promise<any> {
|
|
1332
|
+
const registryJSON = await env.SERVICE_REGISTRY.get('registry:latest');
|
|
1333
|
+
|
|
1334
|
+
if (!registryJSON) {
|
|
1335
|
+
log.warn('Service registry not found in KV, using empty registry');
|
|
1336
|
+
return { services: [], connections: [] };
|
|
1337
|
+
}
|
|
1338
|
+
|
|
1339
|
+
return JSON.parse(registryJSON);
|
|
1340
|
+
}
|
|
1341
|
+
|
|
1342
|
+
/**
|
|
1343
|
+
* Check if parent service is down
|
|
1344
|
+
*/
|
|
1345
|
+
async function checkParentDown(
|
|
1346
|
+
serviceId: string,
|
|
1347
|
+
registry: any,
|
|
1348
|
+
env: Env,
|
|
1349
|
+
log: Logger
|
|
1350
|
+
): Promise<boolean> {
|
|
1351
|
+
// Find service in registry
|
|
1352
|
+
const service = registry.services.find((s: any) => s.id === serviceId);
|
|
1353
|
+
|
|
1354
|
+
if (!service || !service.dependencies || service.dependencies.length === 0) {
|
|
1355
|
+
return false; // No dependencies
|
|
1356
|
+
}
|
|
1357
|
+
|
|
1358
|
+
// Check if any parent dependency has active DOWN alert
|
|
1359
|
+
for (const parentId of service.dependencies) {
|
|
1360
|
+
const parentIncidentKey = `${parentId}:firing:`;
|
|
1361
|
+
|
|
1362
|
+
// Scan KV for parent incidents
|
|
1363
|
+
const keys = await env.PLATFORM_ALERTS.list({ prefix: parentIncidentKey });
|
|
1364
|
+
|
|
1365
|
+
if (keys.keys.length > 0) {
|
|
1366
|
+
log.info('Parent service down, suppressing child alert', {
|
|
1367
|
+
parent_id: parentId,
|
|
1368
|
+
service_id: serviceId,
|
|
1369
|
+
});
|
|
1370
|
+
return true;
|
|
1371
|
+
}
|
|
1372
|
+
}
|
|
1373
|
+
|
|
1374
|
+
return false;
|
|
1375
|
+
}
|
|
1376
|
+
|
|
1377
|
+
/**
 * Store incident in D1 for historical analysis.
 *
 * Best-effort: a D1 failure is logged and swallowed so alert routing is
 * never blocked by persistence problems.
 *
 * @param incident Normalized incident produced by processAlert.
 * @param env      Worker bindings (PLATFORM_DB).
 * @param log      Structured logger.
 */
async function storeIncidentInD1(
  incident: NormalizedIncident,
  env: Env,
  log: Logger
): Promise<void> {
  try {
    await env.PLATFORM_DB.prepare(
      `INSERT INTO incidents (
        incident_key,
        source,
        severity,
        status,
        service_id,
        summary,
        message,
        timestamp,
        parent_down,
        suppressed,
        metadata
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
    )
      .bind(
        incident.incident_key,
        incident.alert.source,
        incident.alert.severity,
        incident.alert.status,
        incident.alert.service_id,
        incident.alert.summary,
        incident.alert.message,
        incident.alert.timestamp,
        // Booleans stored as 0/1 integers for SQLite.
        incident.parent_down ? 1 : 0,
        incident.suppressed ? 1 : 0,
        JSON.stringify(incident.alert.metadata)
      )
      .run();
  } catch (error) {
    log.error('Failed to store incident in D1', error);
    // Don't fail alert routing if D1 fails
  }
}
|
|
1420
|
+
|
|
1421
|
+
/**
|
|
1422
|
+
* Create in-app notification for an incident via platform-notifications.
|
|
1423
|
+
* Accepts the raw Fetcher binding directly (not proxied env) to avoid
|
|
1424
|
+
* the SDK triple-proxy wrapping Fetcher.fetch() incorrectly.
|
|
1425
|
+
*/
|
|
1426
|
+
async function createRouterNotification(
|
|
1427
|
+
api: Fetcher | undefined,
|
|
1428
|
+
incident: NormalizedIncident,
|
|
1429
|
+
env: Env,
|
|
1430
|
+
log: Logger
|
|
1431
|
+
): Promise<void> {
|
|
1432
|
+
if (!api) return;
|
|
1433
|
+
|
|
1434
|
+
const { alert } = incident;
|
|
1435
|
+
const priorityMap: Record<string, string> = { p0: 'critical', p1: 'high', p2: 'medium' };
|
|
1436
|
+
const category = alert.status === 'resolved' ? 'success' : alert.severity === 'p2' ? 'warning' : 'error';
|
|
1437
|
+
|
|
1438
|
+
const project = mapServiceToProject(alert.service_id);
|
|
1439
|
+
const actionUrl = getAlertActionUrl(alert, env);
|
|
1440
|
+
|
|
1441
|
+
try {
|
|
1442
|
+
const notifResponse = await api.fetch(
|
|
1443
|
+
'https://platform-notifications/notifications',
|
|
1444
|
+
{
|
|
1445
|
+
method: 'POST',
|
|
1446
|
+
headers: { 'Content-Type': 'application/json' },
|
|
1447
|
+
body: JSON.stringify({
|
|
1448
|
+
category,
|
|
1449
|
+
source: `alert-router:${alert.source}`,
|
|
1450
|
+
source_id: alert.id,
|
|
1451
|
+
title: alert.summary,
|
|
1452
|
+
description: alert.message.slice(0, 500),
|
|
1453
|
+
priority: priorityMap[alert.severity] || 'medium',
|
|
1454
|
+
action_url: actionUrl,
|
|
1455
|
+
action_label: 'Investigate',
|
|
1456
|
+
project,
|
|
1457
|
+
}),
|
|
1458
|
+
}
|
|
1459
|
+
);
|
|
1460
|
+
if (!notifResponse.ok) {
|
|
1461
|
+
const body = await notifResponse.text();
|
|
1462
|
+
log.error('Router notification failed', undefined, { status: notifResponse.status, body: body.slice(0, 300) });
|
|
1463
|
+
} else {
|
|
1464
|
+
log.info('Router notification created', { project, source: alert.source });
|
|
1465
|
+
}
|
|
1466
|
+
} catch (error) {
|
|
1467
|
+
log.error('Failed to create router notification', error);
|
|
1468
|
+
}
|
|
1469
|
+
}
|
|
1470
|
+
|
|
1471
|
+
/**
|
|
1472
|
+
* Get dashboard action URL based on alert source
|
|
1473
|
+
*/
|
|
1474
|
+
function getAlertActionUrl(alert: Alert, env: Env): string {
|
|
1475
|
+
const dashboardUrl = getDashboardUrl(env);
|
|
1476
|
+
switch (alert.source) {
|
|
1477
|
+
case 'hetrixtools':
|
|
1478
|
+
case 'gatus':
|
|
1479
|
+
return `${dashboardUrl}/infrastructure`;
|
|
1480
|
+
case 'github':
|
|
1481
|
+
return `${dashboardUrl}/infrastructure`;
|
|
1482
|
+
case 'github-security':
|
|
1483
|
+
return (alert.metadata?.html_url as string) || `${dashboardUrl}/dashboard`;
|
|
1484
|
+
case 'netdata':
|
|
1485
|
+
return `${dashboardUrl}/infrastructure`;
|
|
1486
|
+
default:
|
|
1487
|
+
return `${dashboardUrl}/dashboard`;
|
|
1488
|
+
}
|
|
1489
|
+
}
|
|
1490
|
+
|
|
1491
|
+
/**
|
|
1492
|
+
* Map service_id to a project name for notification filtering.
|
|
1493
|
+
* Customise this function to match your project naming conventions.
|
|
1494
|
+
*/
|
|
1495
|
+
function mapServiceToProject(serviceId: string): string {
|
|
1496
|
+
// Add your project prefix mappings here, e.g.:
|
|
1497
|
+
// if (serviceId.startsWith('my-project')) return 'my-project';
|
|
1498
|
+
if (serviceId.startsWith('platform')) return 'platform';
|
|
1499
|
+
if (serviceId.startsWith('github-')) return serviceId.replace('github-', '');
|
|
1500
|
+
if (serviceId.startsWith('vps-')) return 'infrastructure';
|
|
1501
|
+
return serviceId;
|
|
1502
|
+
}
|
|
1503
|
+
|
|
1504
|
+
/**
 * Route alert to Slack and create in-app notification.
 *
 * Suppressed incidents (parent service down, or inside the CodeQL baseline
 * window) are logged and dropped before any delivery is attempted. A Slack
 * delivery failure is logged but never aborts processing; the in-app
 * notification is created regardless of Slack success.
 *
 * @param incident - Normalized incident wrapping the raw alert plus suppression flags.
 * @param env - Worker environment; SLACK_WEBHOOK_URL and dashboard URLs come from here.
 * @param log - Structured logger.
 * @param notificationsApi - Optional service binding for the notifications worker.
 */
async function routeAlert(incident: NormalizedIncident, env: Env, log: Logger, notificationsApi?: Fetcher): Promise<void> {
  // Skip suppressed alerts (parent down OR baseline)
  if (incident.suppressed) {
    log.info('Alert suppressed due to parent down', { incident_key: incident.incident_key });
    return;
  }

  if (incident.baseline_suppressed) {
    log.info('Alert suppressed due to CodeQL baseline (48h window)', {
      incident_key: incident.incident_key,
    });
    return;
  }

  const { alert } = incident;

  // Build Slack message: color/emoji are severity- and status-dependent.
  const color = getSeverityColor(alert.severity, alert.status);
  const emoji = getSeverityEmoji(alert.severity, alert.status);

  // Get investigation context (shell commands + link buttons) based on alert source
  const investigationContext = getMonitoringInvestigationContext(alert, env);

  // Block Kit layout: headline section, fenced investigation commands,
  // action buttons; a legacy colored attachment carries incident metadata.
  const slackMessage = {
    text: `${emoji} [${alert.severity.toUpperCase()}] ${alert.summary}`,
    blocks: [
      {
        type: 'section',
        text: {
          type: 'mrkdwn',
          text: `${emoji} *[${alert.severity.toUpperCase()}] ${alert.summary}*\n\n*Status*: ${alert.status}\n*Service*: ${alert.service_id}\n*Message*: ${alert.message}\n*Source*: ${alert.source}\n*Time*: ${alert.timestamp}`,
        },
      },
      {
        type: 'section',
        text: {
          type: 'mrkdwn',
          text: `*Investigation:*\n\`\`\`${investigationContext.commands}\`\`\``,
        },
      },
      {
        type: 'actions',
        elements: investigationContext.buttons,
      },
    ],
    attachments: [
      {
        color: color,
        fields: [
          {
            title: 'Incident Key',
            value: incident.incident_key,
            short: true,
          },
          {
            title: 'Parent Down',
            value: incident.parent_down ? 'Yes' : 'No',
            short: true,
          },
        ],
      },
    ],
  };

  // Send to Slack via incoming webhook.
  try {
    const response = await fetch(env.SLACK_WEBHOOK_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(slackMessage),
    });

    if (!response.ok) {
      throw new Error(`Slack webhook failed: ${response.status}`);
    }

    log.info('Alert routed to Slack', { incident_key: incident.incident_key });
  } catch (error) {
    log.error('Failed to send alert to Slack', error);
    // Don't fail alert processing if Slack fails
  }

  // Create in-app notification (non-blocking, independent of Slack success)
  await createRouterNotification(notificationsApi, incident, env, log);
}
|
|
1592
|
+
|
|
1593
|
+
/**
|
|
1594
|
+
* Helper: Extract service ID from monitor name
|
|
1595
|
+
*/
|
|
1596
|
+
function extractServiceId(monitorName: string): string {
|
|
1597
|
+
const normalized = monitorName
|
|
1598
|
+
.toLowerCase()
|
|
1599
|
+
.replace(/[^a-z0-9\s-]/g, '')
|
|
1600
|
+
.replace(/\s+/g, '-')
|
|
1601
|
+
.replace(/-+/g, '-')
|
|
1602
|
+
.split('-')
|
|
1603
|
+
.slice(0, 3) // First 3 parts
|
|
1604
|
+
.join('-');
|
|
1605
|
+
|
|
1606
|
+
return normalized;
|
|
1607
|
+
}
|
|
1608
|
+
|
|
1609
|
+
/**
|
|
1610
|
+
* Helper: Extract service ID from Gatus endpoint name
|
|
1611
|
+
*/
|
|
1612
|
+
function extractGatusServiceId(endpointName: string): string {
|
|
1613
|
+
return endpointName
|
|
1614
|
+
.toLowerCase()
|
|
1615
|
+
.replace(/\.com$/, '')
|
|
1616
|
+
.replace(/\s+/g, '-')
|
|
1617
|
+
.replace(/[^a-z0-9-]/g, '');
|
|
1618
|
+
}
|
|
1619
|
+
|
|
1620
|
+
/**
|
|
1621
|
+
* Helper: Extract service ID from HetrixTools monitor name
|
|
1622
|
+
* "Platform: error-collector /health" -> "error-collector"
|
|
1623
|
+
* "Platform: platform-usage /health" -> "platform-usage"
|
|
1624
|
+
*/
|
|
1625
|
+
function extractHetrixToolsServiceId(monitorName: string): string {
|
|
1626
|
+
// Try "Platform: <service> /health" pattern first
|
|
1627
|
+
const platformMatch = monitorName.match(/^Platform:\s+(\S+)/i);
|
|
1628
|
+
if (platformMatch) {
|
|
1629
|
+
return platformMatch[1].toLowerCase();
|
|
1630
|
+
}
|
|
1631
|
+
|
|
1632
|
+
// Fallback: normalise from monitor name
|
|
1633
|
+
return extractServiceId(monitorName);
|
|
1634
|
+
}
|
|
1635
|
+
|
|
1636
|
+
/**
|
|
1637
|
+
* Helper: Extract service ID from Netdata alarm.
|
|
1638
|
+
* Customise the service-specific alarm detection for your projects.
|
|
1639
|
+
*/
|
|
1640
|
+
function extractNetdataServiceId(alarm: string, host: string): string {
|
|
1641
|
+
// Add your service-specific alarm mappings here, e.g.:
|
|
1642
|
+
// if (alarm.includes('myservice')) return 'my-service';
|
|
1643
|
+
|
|
1644
|
+
// VPS-level alarms: "cpu_usage_high" -> "vps-<host>"
|
|
1645
|
+
return `vps-${host}`;
|
|
1646
|
+
}
|
|
1647
|
+
|
|
1648
|
+
/**
|
|
1649
|
+
* Helper: Get Slack color for severity
|
|
1650
|
+
*/
|
|
1651
|
+
function getSeverityColor(severity: string, status: string): string {
|
|
1652
|
+
if (status === 'resolved') {
|
|
1653
|
+
return 'good'; // Green
|
|
1654
|
+
}
|
|
1655
|
+
|
|
1656
|
+
const colors: Record<string, string> = {
|
|
1657
|
+
p0: 'danger', // Red
|
|
1658
|
+
p1: 'warning', // Orange
|
|
1659
|
+
p2: '#439FE0', // Blue
|
|
1660
|
+
};
|
|
1661
|
+
|
|
1662
|
+
return colors[severity] || colors.p2;
|
|
1663
|
+
}
|
|
1664
|
+
|
|
1665
|
+
/**
|
|
1666
|
+
* Helper: Get emoji for severity
|
|
1667
|
+
*/
|
|
1668
|
+
function getSeverityEmoji(severity: string, status: string): string {
|
|
1669
|
+
if (status === 'resolved') {
|
|
1670
|
+
return 'OK';
|
|
1671
|
+
}
|
|
1672
|
+
|
|
1673
|
+
const emojis: Record<string, string> = {
|
|
1674
|
+
p0: 'CRITICAL',
|
|
1675
|
+
p1: 'WARNING',
|
|
1676
|
+
p2: 'INFO',
|
|
1677
|
+
};
|
|
1678
|
+
|
|
1679
|
+
return emojis[severity] || emojis.p2;
|
|
1680
|
+
}
|
|
1681
|
+
|
|
1682
|
+
/**
 * Get investigation context for monitoring alerts.
 * Provides actionable shell commands (rendered in a Slack code fence) and
 * link buttons tailored to the alert source; unknown sources get a generic
 * Workers-logs fallback.
 *
 * NOTE: the template literals below are runtime text sent to Slack — their
 * exact content and line breaks are intentional.
 */
function getMonitoringInvestigationContext(alert: Alert, env: Env): {
  commands: string;
  buttons: Array<{
    type: 'button';
    text: { type: 'plain_text'; text: string; emoji: boolean };
    url: string;
  }>;
} {
  const dashboardUrl = getDashboardUrl(env);
  const gatusUrl = getGatusUrl(env);

  // Every variant includes this dashboard link; source-specific buttons are appended.
  const baseButtons = [
    {
      type: 'button' as const,
      text: { type: 'plain_text' as const, text: 'Platform Dashboard', emoji: true },
      url: `${dashboardUrl}/usage/monitor`,
    },
  ];

  // Source-specific investigation commands
  // Gatus: heartbeat endpoints get cron/heartbeat guidance, uptime endpoints
  // get a direct curl probe. Gatus button only appears when GATUS_URL is set.
  if (alert.source === 'gatus') {
    const endpointUrl = (alert.metadata?.endpointUrl as string) || 'N/A';
    const endpointGroup = (alert.metadata?.endpointGroup as string) || 'monitors';
    const isHeartbeat = endpointGroup === 'heartbeats';
    return {
      commands: isHeartbeat
        ? `# Check worker logs (last 15 min)
npx wrangler tail ${alert.service_id} --format pretty

# Check cron triggers
# Cloudflare dashboard > Workers > ${alert.service_id} > Triggers

# Manual heartbeat test
${gatusUrl ? `curl -X POST "${gatusUrl}/api/v1/endpoints/heartbeats_${alert.service_id}/external?success=true" -H "Authorization: Bearer $GATUS_TOKEN"` : '# Configure GATUS_URL to enable heartbeat testing'}`
        : `# Check endpoint directly
curl -s -o /dev/null -w "%{http_code}" "${endpointUrl}"

# Check worker logs
npx wrangler tail ${alert.service_id} --format pretty`,
      buttons: [
        ...baseButtons,
        ...(gatusUrl ? [{
          type: 'button' as const,
          text: { type: 'plain_text' as const, text: 'Gatus Status', emoji: true },
          url: gatusUrl,
        }] : []),
      ],
    };
  }

  // HetrixTools: probe the monitored target, list per-location errors from
  // webhook metadata, and query recent incidents from D1.
  if (alert.source === 'hetrixtools') {
    const monitorTarget = (alert.metadata?.monitorTarget as string) || 'N/A';
    const errors = alert.metadata?.monitorErrors as Record<string, string> | undefined;
    const errorDetail = errors
      ? Object.entries(errors).map(([loc, err]) => ` ${loc}: ${err}`).join('\n')
      : ' No error details';
    return {
      commands: `# Check endpoint directly
curl -s -o /dev/null -w "%{http_code}" "${monitorTarget}"

# Location errors:
${errorDetail}

# Check worker logs (last 15 min)
npx wrangler tail ${alert.service_id} --format pretty

# Check recent incidents
npx wrangler d1 execute platform-metrics --remote --command "SELECT * FROM incidents WHERE service_id = '${alert.service_id}' ORDER BY created_at DESC LIMIT 5"`,
      buttons: [
        ...baseButtons,
        {
          type: 'button' as const,
          text: { type: 'plain_text' as const, text: 'HetrixTools', emoji: true },
          url: 'https://hetrixtools.com/dashboard/uptime-monitors/',
        },
      ],
    };
  }

  // Netdata: point at the host's Netdata dashboard (port 19999) and basic
  // resource checks over SSH.
  if (alert.source === 'netdata') {
    const host = alert.metadata?.rawPayload?.host || 'unknown';
    const chart = alert.metadata?.chart || 'system.cpu';
    return {
      commands: `# SSH to VPS and check metrics
ssh ${host}

# Check Netdata dashboard
# URL: http://${host}:19999

# View specific chart
# Chart: ${chart}

# Check system resources
htop
df -h
free -m`,
      buttons: [
        ...baseButtons,
        {
          type: 'button' as const,
          text: { type: 'plain_text' as const, text: 'Netdata Dashboard', emoji: true },
          url: `http://${host}:19999`,
        },
      ],
    };
  }

  // Default for other sources
  return {
    commands: `# Check worker logs
npx wrangler tail ${alert.service_id} --format pretty

# View Workers Observability
# Filter by service: ${alert.service_id}`,
    buttons: [
      ...baseButtons,
      {
        type: 'button' as const,
        text: { type: 'plain_text' as const, text: 'Workers Observability', emoji: true },
        url: 'https://dash.cloudflare.com/?to=/:account/workers/observability',
      },
    ],
  };
}
|