@littlebearapps/platform-admin-sdk 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +112 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.js +89 -0
- package/dist/prompts.d.ts +27 -0
- package/dist/prompts.js +80 -0
- package/dist/scaffold.d.ts +5 -0
- package/dist/scaffold.js +65 -0
- package/dist/templates.d.ts +16 -0
- package/dist/templates.js +131 -0
- package/package.json +46 -0
- package/templates/full/migrations/006_pattern_discovery.sql +199 -0
- package/templates/full/migrations/007_notifications_search.sql +127 -0
- package/templates/full/workers/lib/pattern-discovery/ai-prompt.ts +644 -0
- package/templates/full/workers/lib/pattern-discovery/clustering.ts +278 -0
- package/templates/full/workers/lib/pattern-discovery/shadow-evaluation.ts +603 -0
- package/templates/full/workers/lib/pattern-discovery/storage.ts +806 -0
- package/templates/full/workers/lib/pattern-discovery/types.ts +159 -0
- package/templates/full/workers/lib/pattern-discovery/validation.ts +278 -0
- package/templates/full/workers/pattern-discovery.ts +661 -0
- package/templates/full/workers/platform-alert-router.ts +1809 -0
- package/templates/full/workers/platform-notifications.ts +424 -0
- package/templates/full/workers/platform-search.ts +480 -0
- package/templates/full/workers/platform-settings.ts +436 -0
- package/templates/full/wrangler.alert-router.jsonc.hbs +34 -0
- package/templates/full/wrangler.notifications.jsonc.hbs +23 -0
- package/templates/full/wrangler.pattern-discovery.jsonc.hbs +33 -0
- package/templates/full/wrangler.search.jsonc.hbs +16 -0
- package/templates/full/wrangler.settings.jsonc.hbs +23 -0
- package/templates/shared/README.md.hbs +69 -0
- package/templates/shared/config/budgets.yaml.hbs +72 -0
- package/templates/shared/config/services.yaml.hbs +45 -0
- package/templates/shared/migrations/001_core_tables.sql +117 -0
- package/templates/shared/migrations/002_usage_warehouse.sql +830 -0
- package/templates/shared/migrations/003_feature_tracking.sql +250 -0
- package/templates/shared/migrations/004_settings_alerts.sql +452 -0
- package/templates/shared/migrations/seed.sql.hbs +4 -0
- package/templates/shared/package.json.hbs +21 -0
- package/templates/shared/scripts/sync-config.ts +242 -0
- package/templates/shared/tsconfig.json +12 -0
- package/templates/shared/workers/lib/analytics-engine.ts +357 -0
- package/templates/shared/workers/lib/billing.ts +293 -0
- package/templates/shared/workers/lib/circuit-breaker-middleware.ts +25 -0
- package/templates/shared/workers/lib/control.ts +292 -0
- package/templates/shared/workers/lib/economics.ts +368 -0
- package/templates/shared/workers/lib/metrics.ts +103 -0
- package/templates/shared/workers/lib/platform-settings.ts +407 -0
- package/templates/shared/workers/lib/shared/allowances.ts +333 -0
- package/templates/shared/workers/lib/shared/cloudflare.ts +1362 -0
- package/templates/shared/workers/lib/shared/types.ts +58 -0
- package/templates/shared/workers/lib/telemetry-sampling.ts +360 -0
- package/templates/shared/workers/lib/usage/collectors/example.ts +96 -0
- package/templates/shared/workers/lib/usage/collectors/index.ts +128 -0
- package/templates/shared/workers/lib/usage/handlers/audit.ts +306 -0
- package/templates/shared/workers/lib/usage/handlers/backfill.ts +845 -0
- package/templates/shared/workers/lib/usage/handlers/behavioral.ts +429 -0
- package/templates/shared/workers/lib/usage/handlers/data-queries.ts +507 -0
- package/templates/shared/workers/lib/usage/handlers/dlq-admin.ts +364 -0
- package/templates/shared/workers/lib/usage/handlers/health-trends.ts +222 -0
- package/templates/shared/workers/lib/usage/handlers/index.ts +35 -0
- package/templates/shared/workers/lib/usage/handlers/usage-admin.ts +421 -0
- package/templates/shared/workers/lib/usage/handlers/usage-features.ts +1262 -0
- package/templates/shared/workers/lib/usage/handlers/usage-metrics.ts +2420 -0
- package/templates/shared/workers/lib/usage/handlers/usage-settings.ts +610 -0
- package/templates/shared/workers/lib/usage/queue/budget-enforcement.ts +1032 -0
- package/templates/shared/workers/lib/usage/queue/cost-budget-enforcement.ts +128 -0
- package/templates/shared/workers/lib/usage/queue/cost-calculator.ts +77 -0
- package/templates/shared/workers/lib/usage/queue/dlq-handler.ts +161 -0
- package/templates/shared/workers/lib/usage/queue/index.ts +19 -0
- package/templates/shared/workers/lib/usage/queue/telemetry-processor.ts +790 -0
- package/templates/shared/workers/lib/usage/scheduled/anomaly-detection.ts +732 -0
- package/templates/shared/workers/lib/usage/scheduled/data-collection.ts +956 -0
- package/templates/shared/workers/lib/usage/scheduled/error-digest.ts +343 -0
- package/templates/shared/workers/lib/usage/scheduled/index.ts +18 -0
- package/templates/shared/workers/lib/usage/scheduled/rollups.ts +1561 -0
- package/templates/shared/workers/lib/usage/shared/constants.ts +362 -0
- package/templates/shared/workers/lib/usage/shared/index.ts +14 -0
- package/templates/shared/workers/lib/usage/shared/types.ts +1066 -0
- package/templates/shared/workers/lib/usage/shared/utils.ts +795 -0
- package/templates/shared/workers/platform-usage.ts +1915 -0
- package/templates/shared/wrangler.usage.jsonc.hbs +58 -0
- package/templates/standard/migrations/005_error_collection.sql +162 -0
- package/templates/standard/workers/error-collector.ts +2670 -0
- package/templates/standard/workers/lib/error-collector/capture.ts +213 -0
- package/templates/standard/workers/lib/error-collector/digest.ts +448 -0
- package/templates/standard/workers/lib/error-collector/email-health-alerts.ts +262 -0
- package/templates/standard/workers/lib/error-collector/fingerprint.ts +258 -0
- package/templates/standard/workers/lib/error-collector/gap-alerts.ts +293 -0
- package/templates/standard/workers/lib/error-collector/github.ts +329 -0
- package/templates/standard/workers/lib/error-collector/types.ts +262 -0
- package/templates/standard/workers/lib/sentinel/gap-detection.ts +734 -0
- package/templates/standard/workers/lib/shared/slack-alerts.ts +585 -0
- package/templates/standard/workers/platform-sentinel.ts +1744 -0
- package/templates/standard/wrangler.error-collector.jsonc.hbs +44 -0
- package/templates/standard/wrangler.sentinel.jsonc.hbs +45 -0
|
@@ -0,0 +1,732 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anomaly Detection Module
|
|
3
|
+
*
|
|
4
|
+
* Functions for detecting usage anomalies using rolling statistics,
|
|
5
|
+
* dataset drift detection, and alerting via Slack.
|
|
6
|
+
* Extracted from platform-usage.ts as part of scheduled task modularisation.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { Env, RollingStats } from '../shared';
|
|
10
|
+
import { KNOWN_DATASETS, QUERIED_DATASETS, generateId, fetchWithRetry } from '../shared';
|
|
11
|
+
import { createLoggerFromEnv } from '@littlebearapps/platform-consumer-sdk';
|
|
12
|
+
|
|
13
|
+
// =============================================================================
|
|
14
|
+
// SLACK ALERTING
|
|
15
|
+
// =============================================================================
|
|
16
|
+
|
|
17
|
+
/**
 * Slack alert payload structure.
 *
 * Minimal shape of a Slack incoming-webhook message as used by this module:
 * a top-level `text` plus optional legacy `attachments` with colour-coded
 * title/value fields. Only the fields actually sent below are modelled.
 */
interface SlackAlertPayload {
  // Top-level message text (shown as the notification summary).
  text: string;
  // Optional attachment blocks; `color` is a Slack colour name ('danger',
  // 'warning') or hex code, `short` renders two fields per row.
  attachments?: Array<{
    color: string;
    fields: Array<{ title: string; value: string; short?: boolean }>;
  }>;
}
|
|
27
|
+
|
|
28
|
+
/**
|
|
29
|
+
* Send a Slack alert.
|
|
30
|
+
*/
|
|
31
|
+
async function sendSlackAlert(env: Env, payload: SlackAlertPayload): Promise<void> {
|
|
32
|
+
if (!env.SLACK_WEBHOOK_URL) return;
|
|
33
|
+
|
|
34
|
+
try {
|
|
35
|
+
await fetchWithRetry(env.SLACK_WEBHOOK_URL, {
|
|
36
|
+
method: 'POST',
|
|
37
|
+
headers: { 'Content-Type': 'application/json' },
|
|
38
|
+
body: JSON.stringify(payload),
|
|
39
|
+
});
|
|
40
|
+
} catch (error) {
|
|
41
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:slack');
|
|
42
|
+
log.error('Failed to send Slack alert', error instanceof Error ? error : undefined, {
|
|
43
|
+
tag: 'SLACK_ERROR',
|
|
44
|
+
});
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
// =============================================================================
|
|
49
|
+
// ROLLING STATISTICS
|
|
50
|
+
// =============================================================================
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Allowed metrics for rolling stats calculation.
|
|
54
|
+
* These metrics can be used in parameterised SQL queries.
|
|
55
|
+
*/
|
|
56
|
+
const ALLOWED_ROLLING_METRICS = [
|
|
57
|
+
'workers_requests',
|
|
58
|
+
'workers_errors',
|
|
59
|
+
'workers_cost_usd',
|
|
60
|
+
'd1_rows_read',
|
|
61
|
+
'd1_rows_written',
|
|
62
|
+
'd1_cost_usd',
|
|
63
|
+
'kv_reads',
|
|
64
|
+
'kv_writes',
|
|
65
|
+
'kv_cost_usd',
|
|
66
|
+
'r2_class_a_ops',
|
|
67
|
+
'r2_class_b_ops',
|
|
68
|
+
'r2_cost_usd',
|
|
69
|
+
'aigateway_requests',
|
|
70
|
+
'aigateway_cost_usd',
|
|
71
|
+
'workersai_requests',
|
|
72
|
+
'workersai_neurons',
|
|
73
|
+
'workersai_cost_usd',
|
|
74
|
+
'total_cost_usd',
|
|
75
|
+
] as const;
|
|
76
|
+
|
|
77
|
+
type AllowedMetric = (typeof ALLOWED_ROLLING_METRICS)[number];
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* Check if a metric is in the allowed list.
|
|
81
|
+
*/
|
|
82
|
+
function isAllowedMetric(metric: string): metric is AllowedMetric {
|
|
83
|
+
return ALLOWED_ROLLING_METRICS.includes(metric as AllowedMetric);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* Calculate 7-day rolling statistics for a metric.
|
|
88
|
+
* Uses daily rollups for efficient computation.
|
|
89
|
+
*/
|
|
90
|
+
export async function calculate7DayRollingStats(
|
|
91
|
+
env: Env,
|
|
92
|
+
metric: string,
|
|
93
|
+
project: string
|
|
94
|
+
): Promise<RollingStats | null> {
|
|
95
|
+
if (!isAllowedMetric(metric)) {
|
|
96
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
97
|
+
log.warn('Invalid metric for rolling stats', undefined, {
|
|
98
|
+
tag: 'INVALID_METRIC',
|
|
99
|
+
metric,
|
|
100
|
+
});
|
|
101
|
+
return null;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
try {
|
|
105
|
+
// SQLite doesn't have native STDDEV, so calculate manually using sum and sum of squares
|
|
106
|
+
const result = await env.PLATFORM_DB.prepare(
|
|
107
|
+
`
|
|
108
|
+
SELECT
|
|
109
|
+
COUNT(*) as sample_count,
|
|
110
|
+
SUM(${metric}) as sum_value,
|
|
111
|
+
SUM(${metric} * ${metric}) as sum_squared,
|
|
112
|
+
AVG(${metric}) as avg_value
|
|
113
|
+
FROM daily_usage_rollups
|
|
114
|
+
WHERE project = ?
|
|
115
|
+
AND snapshot_date >= date('now', '-7 days')
|
|
116
|
+
AND snapshot_date < date('now')
|
|
117
|
+
`
|
|
118
|
+
)
|
|
119
|
+
.bind(project)
|
|
120
|
+
.first<{
|
|
121
|
+
sample_count: number;
|
|
122
|
+
sum_value: number;
|
|
123
|
+
sum_squared: number;
|
|
124
|
+
avg_value: number;
|
|
125
|
+
}>();
|
|
126
|
+
|
|
127
|
+
if (!result || result.sample_count === 0) {
|
|
128
|
+
return null;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
const n = result.sample_count;
|
|
132
|
+
const avg = result.avg_value;
|
|
133
|
+
// Variance = (sum of squares - n * mean^2) / n
|
|
134
|
+
const variance = (result.sum_squared - n * avg * avg) / n;
|
|
135
|
+
const stddev = Math.sqrt(Math.max(0, variance)); // Ensure non-negative
|
|
136
|
+
|
|
137
|
+
return {
|
|
138
|
+
avg,
|
|
139
|
+
stddev,
|
|
140
|
+
samples: n,
|
|
141
|
+
};
|
|
142
|
+
} catch (error) {
|
|
143
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
144
|
+
log.error('Error calculating rolling stats', error instanceof Error ? error : undefined, {
|
|
145
|
+
tag: 'ROLLING_STATS_ERROR',
|
|
146
|
+
metric,
|
|
147
|
+
project,
|
|
148
|
+
});
|
|
149
|
+
return null;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// =============================================================================
|
|
154
|
+
// TODAY'S METRIC VALUE
|
|
155
|
+
// =============================================================================
|
|
156
|
+
|
|
157
|
+
/**
|
|
158
|
+
* Get today's value for a metric from hourly snapshots.
|
|
159
|
+
*/
|
|
160
|
+
export async function getTodayMetricValue(
|
|
161
|
+
env: Env,
|
|
162
|
+
metric: string,
|
|
163
|
+
project: string = 'all'
|
|
164
|
+
): Promise<number> {
|
|
165
|
+
if (!isAllowedMetric(metric)) {
|
|
166
|
+
return 0;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
try {
|
|
170
|
+
const result = await env.PLATFORM_DB.prepare(
|
|
171
|
+
`
|
|
172
|
+
SELECT SUM(${metric}) as total
|
|
173
|
+
FROM hourly_usage_snapshots
|
|
174
|
+
WHERE project = ?
|
|
175
|
+
AND snapshot_hour >= datetime('now', 'start of day')
|
|
176
|
+
AND snapshot_hour < datetime('now', '+1 day', 'start of day')
|
|
177
|
+
`
|
|
178
|
+
)
|
|
179
|
+
.bind(project)
|
|
180
|
+
.first<{ total: number }>();
|
|
181
|
+
|
|
182
|
+
return result?.total ?? 0;
|
|
183
|
+
} catch (error) {
|
|
184
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
185
|
+
log.error('Error getting today metric', error instanceof Error ? error : undefined, {
|
|
186
|
+
tag: 'TODAY_METRIC_ERROR',
|
|
187
|
+
metric,
|
|
188
|
+
project,
|
|
189
|
+
});
|
|
190
|
+
return 0;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// =============================================================================
|
|
195
|
+
// ANOMALY RECORDING
|
|
196
|
+
// =============================================================================
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Record an anomaly to the D1 database.
|
|
200
|
+
*/
|
|
201
|
+
export async function recordAnomaly(
|
|
202
|
+
env: Env,
|
|
203
|
+
metric: string,
|
|
204
|
+
currentValue: number,
|
|
205
|
+
stats: RollingStats,
|
|
206
|
+
deviation: number,
|
|
207
|
+
project: string = 'all'
|
|
208
|
+
): Promise<void> {
|
|
209
|
+
try {
|
|
210
|
+
await env.PLATFORM_DB.prepare(
|
|
211
|
+
`
|
|
212
|
+
INSERT INTO usage_anomalies (
|
|
213
|
+
id, detected_at, metric_name, project,
|
|
214
|
+
current_value, rolling_avg, rolling_stddev, deviation_factor
|
|
215
|
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
216
|
+
`
|
|
217
|
+
)
|
|
218
|
+
.bind(
|
|
219
|
+
generateId(),
|
|
220
|
+
Math.floor(Date.now() / 1000),
|
|
221
|
+
metric,
|
|
222
|
+
project,
|
|
223
|
+
currentValue,
|
|
224
|
+
stats.avg,
|
|
225
|
+
stats.stddev,
|
|
226
|
+
deviation
|
|
227
|
+
)
|
|
228
|
+
.run();
|
|
229
|
+
} catch (error) {
|
|
230
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
231
|
+
log.error('Error recording anomaly', error instanceof Error ? error : undefined, {
|
|
232
|
+
tag: 'RECORD_ANOMALY_ERROR',
|
|
233
|
+
metric,
|
|
234
|
+
project,
|
|
235
|
+
});
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
// =============================================================================
|
|
240
|
+
// ANOMALY ALERTING
|
|
241
|
+
// =============================================================================
|
|
242
|
+
|
|
243
|
+
/**
|
|
244
|
+
* Send a Slack alert for detected anomaly.
|
|
245
|
+
*/
|
|
246
|
+
export async function sendAnomalySlackAlert(
|
|
247
|
+
env: Env,
|
|
248
|
+
metric: string,
|
|
249
|
+
currentValue: number,
|
|
250
|
+
stats: RollingStats,
|
|
251
|
+
deviation: number
|
|
252
|
+
): Promise<void> {
|
|
253
|
+
// Determine severity color
|
|
254
|
+
const color = deviation > 5 ? 'danger' : 'warning';
|
|
255
|
+
|
|
256
|
+
// Format metric for display
|
|
257
|
+
const metricDisplay = metric.replace(/_/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
|
|
258
|
+
|
|
259
|
+
// Format values
|
|
260
|
+
const formatValue = (val: number): string => {
|
|
261
|
+
if (metric.includes('cost')) {
|
|
262
|
+
return `$${val.toFixed(4)}`;
|
|
263
|
+
}
|
|
264
|
+
if (val >= 1_000_000) {
|
|
265
|
+
return `${(val / 1_000_000).toFixed(2)}M`;
|
|
266
|
+
}
|
|
267
|
+
if (val >= 1_000) {
|
|
268
|
+
return `${(val / 1_000).toFixed(2)}K`;
|
|
269
|
+
}
|
|
270
|
+
return val.toFixed(2);
|
|
271
|
+
};
|
|
272
|
+
|
|
273
|
+
const payload: SlackAlertPayload = {
|
|
274
|
+
text: `:warning: Usage Anomaly Detected`,
|
|
275
|
+
attachments: [
|
|
276
|
+
{
|
|
277
|
+
color,
|
|
278
|
+
fields: [
|
|
279
|
+
{ title: 'Metric', value: metricDisplay, short: true },
|
|
280
|
+
{ title: 'Deviation', value: `${deviation.toFixed(1)} stddev`, short: true },
|
|
281
|
+
{ title: 'Current Value', value: formatValue(currentValue), short: true },
|
|
282
|
+
{ title: '7-Day Avg', value: formatValue(stats.avg), short: true },
|
|
283
|
+
{ title: 'Stddev', value: formatValue(stats.stddev), short: true },
|
|
284
|
+
{ title: 'Samples', value: `${stats.samples} days`, short: true },
|
|
285
|
+
],
|
|
286
|
+
},
|
|
287
|
+
],
|
|
288
|
+
};
|
|
289
|
+
|
|
290
|
+
await sendSlackAlert(env, payload);
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
// =============================================================================
|
|
294
|
+
// ALERT ROUTER INTEGRATION
|
|
295
|
+
// =============================================================================
|
|
296
|
+
|
|
297
|
+
/**
|
|
298
|
+
* Route anomaly alert through the central alert-router.
|
|
299
|
+
* Provides unified Slack alerting + in-app notifications.
|
|
300
|
+
* Falls back to direct Slack webhook if alert-router unavailable.
|
|
301
|
+
*/
|
|
302
|
+
async function sendAnomalyToAlertRouter(
|
|
303
|
+
env: Env,
|
|
304
|
+
metric: string,
|
|
305
|
+
currentValue: number,
|
|
306
|
+
stats: RollingStats,
|
|
307
|
+
deviation: number,
|
|
308
|
+
project: string = 'all'
|
|
309
|
+
): Promise<void> {
|
|
310
|
+
if (!env.ALERT_ROUTER) {
|
|
311
|
+
await sendAnomalySlackAlert(env, metric, currentValue, stats, deviation);
|
|
312
|
+
return;
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
const metricDisplay = metric.replace(/_/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
|
|
316
|
+
const severity = deviation > 5 ? 'p0' : deviation > 3 ? 'p1' : 'p2';
|
|
317
|
+
|
|
318
|
+
const formatVal = (val: number): string => {
|
|
319
|
+
if (metric.includes('cost')) return `$${val.toFixed(4)}`;
|
|
320
|
+
if (val >= 1_000_000) return `${(val / 1_000_000).toFixed(2)}M`;
|
|
321
|
+
if (val >= 1_000) return `${(val / 1_000).toFixed(2)}K`;
|
|
322
|
+
return val.toFixed(2);
|
|
323
|
+
};
|
|
324
|
+
|
|
325
|
+
const payload = {
|
|
326
|
+
source: 'anomaly-detection',
|
|
327
|
+
severity,
|
|
328
|
+
status: 'firing',
|
|
329
|
+
service_id: 'platform-usage',
|
|
330
|
+
summary: `Usage Anomaly: ${metricDisplay} (${deviation.toFixed(1)} stddev)`,
|
|
331
|
+
message: `Current: ${formatVal(currentValue)}, 7-day avg: ${formatVal(stats.avg)}, StdDev: ${formatVal(stats.stddev)}, Project: ${project}`,
|
|
332
|
+
timestamp: new Date().toISOString(),
|
|
333
|
+
metadata: {
|
|
334
|
+
metric,
|
|
335
|
+
project,
|
|
336
|
+
currentValue,
|
|
337
|
+
rollingAvg: stats.avg,
|
|
338
|
+
rollingStddev: stats.stddev,
|
|
339
|
+
deviationFactor: deviation,
|
|
340
|
+
samples: stats.samples,
|
|
341
|
+
},
|
|
342
|
+
};
|
|
343
|
+
|
|
344
|
+
try {
|
|
345
|
+
const response = await env.ALERT_ROUTER.fetch(
|
|
346
|
+
// Service binding URL — the hostname is ignored; only the path matters
|
|
347
|
+
'https://platform-alert-router.internal/custom',
|
|
348
|
+
{
|
|
349
|
+
method: 'POST',
|
|
350
|
+
headers: { 'Content-Type': 'application/json' },
|
|
351
|
+
body: JSON.stringify(payload),
|
|
352
|
+
}
|
|
353
|
+
);
|
|
354
|
+
|
|
355
|
+
if (!response.ok) {
|
|
356
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
357
|
+
log.warn('Alert router returned non-OK, falling back to direct Slack', undefined, {
|
|
358
|
+
tag: 'ALERT_ROUTER_FALLBACK',
|
|
359
|
+
status: response.status,
|
|
360
|
+
});
|
|
361
|
+
await sendAnomalySlackAlert(env, metric, currentValue, stats, deviation);
|
|
362
|
+
}
|
|
363
|
+
} catch (error) {
|
|
364
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
365
|
+
log.error('Alert router failed, falling back to direct Slack', error instanceof Error ? error : undefined, {
|
|
366
|
+
tag: 'ALERT_ROUTER_ERROR',
|
|
367
|
+
});
|
|
368
|
+
await sendAnomalySlackAlert(env, metric, currentValue, stats, deviation);
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
// =============================================================================
|
|
373
|
+
// MAIN ANOMALY DETECTION
|
|
374
|
+
// =============================================================================
|
|
375
|
+
|
|
376
|
+
/**
 * Metrics to monitor for anomalies.
 *
 * Each entry must also appear in ALLOWED_ROLLING_METRICS, since the
 * detection sweep feeds these names into the rolling-stats queries.
 */
const MONITORED_METRICS = [
  'workers_requests',
  'd1_rows_written',
  'total_cost_usd',
  'aigateway_requests',
  'workersai_neurons',
] as const;

/**
 * Projects monitored for per-project anomaly detection.
 * Includes 'all' (aggregate) plus individual projects.
 */
// TODO: Add your project IDs here (must match project_registry in D1)
const MONITORED_PROJECTS = ['all', 'platform'] as const;
|
|
393
|
+
|
|
394
|
+
/**
|
|
395
|
+
* Run anomaly detection for key metrics across all monitored projects.
|
|
396
|
+
* Called during scheduled runs (typically at midnight).
|
|
397
|
+
*
|
|
398
|
+
* @returns Number of anomalies detected
|
|
399
|
+
*/
|
|
400
|
+
export async function detectAnomalies(env: Env): Promise<number> {
|
|
401
|
+
let anomaliesDetected = 0;
|
|
402
|
+
|
|
403
|
+
for (const project of MONITORED_PROJECTS) {
|
|
404
|
+
for (const metric of MONITORED_METRICS) {
|
|
405
|
+
try {
|
|
406
|
+
const stats = await calculate7DayRollingStats(env, metric, project);
|
|
407
|
+
|
|
408
|
+
// Need at least 7 days of data for reliable anomaly detection
|
|
409
|
+
if (!stats || stats.samples < 7) {
|
|
410
|
+
continue;
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
const todayValue = await getTodayMetricValue(env, metric, project);
|
|
414
|
+
|
|
415
|
+
// Skip if stddev is 0 (no variation in data)
|
|
416
|
+
if (stats.stddev === 0) {
|
|
417
|
+
continue;
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
const deviation = (todayValue - stats.avg) / stats.stddev;
|
|
421
|
+
|
|
422
|
+
// Detect anomaly if deviation > 3 standard deviations
|
|
423
|
+
if (deviation > 3) {
|
|
424
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
425
|
+
log.info('Anomaly detected', {
|
|
426
|
+
tag: 'ANOMALY_DETECTED',
|
|
427
|
+
metric,
|
|
428
|
+
project,
|
|
429
|
+
todayValue,
|
|
430
|
+
deviation: deviation.toFixed(1),
|
|
431
|
+
avg: stats.avg.toFixed(2),
|
|
432
|
+
});
|
|
433
|
+
|
|
434
|
+
await recordAnomaly(env, metric, todayValue, stats, deviation, project);
|
|
435
|
+
await sendAnomalyToAlertRouter(env, metric, todayValue, stats, deviation, project);
|
|
436
|
+
anomaliesDetected++;
|
|
437
|
+
}
|
|
438
|
+
} catch (error) {
|
|
439
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
440
|
+
log.error('Error checking metric for anomaly', error instanceof Error ? error : undefined, {
|
|
441
|
+
tag: 'CHECK_ANOMALY_ERROR',
|
|
442
|
+
metric,
|
|
443
|
+
project,
|
|
444
|
+
});
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
return anomaliesDetected;
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
// =============================================================================
|
|
453
|
+
// HOURLY D1 WRITE ANOMALY DETECTION
|
|
454
|
+
// =============================================================================
|
|
455
|
+
|
|
456
|
+
/**
|
|
457
|
+
* Calculate rolling stats from hourly snapshots (168 hours = 7 days).
|
|
458
|
+
* Used for hourly anomaly detection where daily rollups are too coarse.
|
|
459
|
+
*/
|
|
460
|
+
export async function calculateHourlyRollingStats(
|
|
461
|
+
env: Env,
|
|
462
|
+
metric: string,
|
|
463
|
+
project: string
|
|
464
|
+
): Promise<RollingStats | null> {
|
|
465
|
+
if (!isAllowedMetric(metric)) {
|
|
466
|
+
return null;
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
try {
|
|
470
|
+
const result = await env.PLATFORM_DB.prepare(
|
|
471
|
+
`
|
|
472
|
+
SELECT
|
|
473
|
+
COUNT(*) as sample_count,
|
|
474
|
+
SUM(${metric}) as sum_value,
|
|
475
|
+
SUM(${metric} * ${metric}) as sum_squared,
|
|
476
|
+
AVG(${metric}) as avg_value
|
|
477
|
+
FROM hourly_usage_snapshots
|
|
478
|
+
WHERE project = ?
|
|
479
|
+
AND snapshot_hour >= datetime('now', '-7 days')
|
|
480
|
+
AND snapshot_hour < datetime('now', '-1 hour')
|
|
481
|
+
`
|
|
482
|
+
)
|
|
483
|
+
.bind(project)
|
|
484
|
+
.first<{
|
|
485
|
+
sample_count: number;
|
|
486
|
+
sum_value: number;
|
|
487
|
+
sum_squared: number;
|
|
488
|
+
avg_value: number;
|
|
489
|
+
}>();
|
|
490
|
+
|
|
491
|
+
if (!result || result.sample_count < 48) {
|
|
492
|
+
return null; // Need at least 2 days of hourly data
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
const n = result.sample_count;
|
|
496
|
+
const avg = result.avg_value;
|
|
497
|
+
const variance = (result.sum_squared - n * avg * avg) / n;
|
|
498
|
+
const stddev = Math.sqrt(Math.max(0, variance));
|
|
499
|
+
|
|
500
|
+
return { avg, stddev, samples: n };
|
|
501
|
+
} catch (error) {
|
|
502
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
503
|
+
log.error('Error calculating hourly rolling stats', error instanceof Error ? error : undefined, {
|
|
504
|
+
tag: 'HOURLY_ROLLING_STATS_ERROR',
|
|
505
|
+
metric,
|
|
506
|
+
project,
|
|
507
|
+
});
|
|
508
|
+
return null;
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
/**
|
|
513
|
+
* Hourly D1 write anomaly check.
|
|
514
|
+
* Runs every hour to catch write spikes within hours, not days.
|
|
515
|
+
* Only checks d1_rows_written (highest-risk metric from Jan 2026 incident).
|
|
516
|
+
*
|
|
517
|
+
* @returns Number of anomalies detected (0 or 1)
|
|
518
|
+
*/
|
|
519
|
+
export async function detectHourlyD1WriteAnomalies(env: Env): Promise<number> {
|
|
520
|
+
const metric = 'd1_rows_written';
|
|
521
|
+
const project = 'all';
|
|
522
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:anomaly');
|
|
523
|
+
|
|
524
|
+
try {
|
|
525
|
+
// Get the last completed hour's value
|
|
526
|
+
const lastHourResult = await env.PLATFORM_DB.prepare(
|
|
527
|
+
`
|
|
528
|
+
SELECT ${metric} as value
|
|
529
|
+
FROM hourly_usage_snapshots
|
|
530
|
+
WHERE project = ?
|
|
531
|
+
AND snapshot_hour >= datetime('now', '-2 hours')
|
|
532
|
+
AND snapshot_hour < datetime('now', '-1 hour')
|
|
533
|
+
ORDER BY snapshot_hour DESC
|
|
534
|
+
LIMIT 1
|
|
535
|
+
`
|
|
536
|
+
)
|
|
537
|
+
.bind(project)
|
|
538
|
+
.first<{ value: number }>();
|
|
539
|
+
|
|
540
|
+
if (!lastHourResult || lastHourResult.value === 0) {
|
|
541
|
+
return 0; // No data or zero writes — nothing to flag
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
const stats = await calculateHourlyRollingStats(env, metric, project);
|
|
545
|
+
if (!stats || stats.stddev === 0) {
|
|
546
|
+
return 0;
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
const deviation = (lastHourResult.value - stats.avg) / stats.stddev;
|
|
550
|
+
|
|
551
|
+
if (deviation > 3) {
|
|
552
|
+
log.info('Hourly D1 write anomaly detected', {
|
|
553
|
+
tag: 'HOURLY_D1_ANOMALY',
|
|
554
|
+
value: lastHourResult.value,
|
|
555
|
+
deviation: deviation.toFixed(1),
|
|
556
|
+
avg: stats.avg.toFixed(2),
|
|
557
|
+
stddev: stats.stddev.toFixed(2),
|
|
558
|
+
});
|
|
559
|
+
|
|
560
|
+
await recordAnomaly(env, metric, lastHourResult.value, stats, deviation, project);
|
|
561
|
+
await sendAnomalyToAlertRouter(env, metric, lastHourResult.value, stats, deviation, project);
|
|
562
|
+
return 1;
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
return 0;
|
|
566
|
+
} catch (error) {
|
|
567
|
+
log.error('Error in hourly D1 write anomaly check', error instanceof Error ? error : undefined, {
|
|
568
|
+
tag: 'HOURLY_D1_CHECK_ERROR',
|
|
569
|
+
});
|
|
570
|
+
return 0;
|
|
571
|
+
}
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
// =============================================================================
|
|
575
|
+
// DATASET REGISTRY - Drift Detection for Cloudflare GraphQL Datasets
|
|
576
|
+
// =============================================================================
|
|
577
|
+
|
|
578
|
+
/**
|
|
579
|
+
* Probe a single GraphQL dataset to check if it's available.
|
|
580
|
+
* Returns true if the dataset exists and is queryable.
|
|
581
|
+
*/
|
|
582
|
+
export async function probeDataset(env: Env, datasetName: string): Promise<boolean> {
|
|
583
|
+
const GRAPHQL_ENDPOINT = 'https://api.cloudflare.com/client/v4/graphql';
|
|
584
|
+
const now = new Date();
|
|
585
|
+
const yesterday = new Date(now);
|
|
586
|
+
yesterday.setUTCDate(yesterday.getUTCDate() - 1);
|
|
587
|
+
|
|
588
|
+
// Build a minimal probe query
|
|
589
|
+
const query = `
|
|
590
|
+
query ProbeDataset($accountTag: String!, $limit: Int!) {
|
|
591
|
+
viewer {
|
|
592
|
+
accounts(filter: { accountTag: $accountTag }) {
|
|
593
|
+
${datasetName}(limit: $limit, filter: {
|
|
594
|
+
datetime_geq: "${yesterday.toISOString().split('T')[0]}T00:00:00Z",
|
|
595
|
+
datetime_leq: "${now.toISOString().split('T')[0]}T00:00:00Z"
|
|
596
|
+
}) {
|
|
597
|
+
dimensions {
|
|
598
|
+
datetime
|
|
599
|
+
}
|
|
600
|
+
}
|
|
601
|
+
}
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
`;
|
|
605
|
+
|
|
606
|
+
try {
|
|
607
|
+
const response = await fetchWithRetry(GRAPHQL_ENDPOINT, {
|
|
608
|
+
method: 'POST',
|
|
609
|
+
headers: {
|
|
610
|
+
'Content-Type': 'application/json',
|
|
611
|
+
Authorization: `Bearer ${env.CLOUDFLARE_API_TOKEN}`,
|
|
612
|
+
},
|
|
613
|
+
body: JSON.stringify({
|
|
614
|
+
query,
|
|
615
|
+
variables: {
|
|
616
|
+
accountTag: env.CLOUDFLARE_ACCOUNT_ID,
|
|
617
|
+
limit: 1,
|
|
618
|
+
},
|
|
619
|
+
}),
|
|
620
|
+
});
|
|
621
|
+
|
|
622
|
+
if (!response.ok) {
|
|
623
|
+
return false;
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
const result = (await response.json()) as { errors?: Array<{ message: string }> };
|
|
627
|
+
|
|
628
|
+
// Check for GraphQL errors indicating dataset doesn't exist
|
|
629
|
+
if (result.errors) {
|
|
630
|
+
const errorStr = JSON.stringify(result.errors);
|
|
631
|
+
if (
|
|
632
|
+
errorStr.includes('Cannot query field') ||
|
|
633
|
+
errorStr.includes('Unknown field') ||
|
|
634
|
+
errorStr.includes('not enabled') ||
|
|
635
|
+
errorStr.includes('not available')
|
|
636
|
+
) {
|
|
637
|
+
return false;
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
return true;
|
|
642
|
+
} catch {
|
|
643
|
+
// Network errors or other issues - assume unavailable
|
|
644
|
+
return false;
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
|
|
648
|
+
/**
 * Discover and update the dataset registry.
 * Probes known datasets, updates last_seen, and alerts on new billable datasets.
 *
 * For every entry in KNOWN_DATASETS: probes availability via the GraphQL
 * API, upserts the dataset_registry row (refreshing last_seen/updated_at on
 * conflict), and — for billable datasets we do not currently query — logs
 * and sends a Slack warning. Per-dataset D1 failures are logged and skipped.
 * A 50ms pause between probes avoids hammering the API.
 *
 * @param env - Worker environment (requires PLATFORM_DB; uses
 *              SLACK_WEBHOOK_URL when set).
 * @returns Object with counts of datasets checked and alerts generated
 */
export async function discoverAndUpdateDatasetRegistry(
  env: Env
): Promise<{ datasetsChecked: number; newBillableAlerts: number; d1Writes: number }> {
  const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:dataset-registry');
  log.info('Starting weekly dataset discovery');

  let datasetsChecked = 0;
  let newBillableAlerts = 0;
  let d1Writes = 0;
  // Single timestamp reused for first_seen/last_seen/created_at/updated_at.
  const now = new Date().toISOString();

  for (const dataset of KNOWN_DATASETS) {
    const available = await probeDataset(env, dataset.name);
    datasetsChecked++;

    if (available) {
      // Update last_seen for this dataset (insert on first sighting).
      try {
        await env.PLATFORM_DB.prepare(
          `
          INSERT INTO dataset_registry (dataset_name, first_seen, last_seen, is_queried, is_billable, category, created_at, updated_at)
          VALUES (?, ?, ?, ?, ?, ?, ?, ?)
          ON CONFLICT (dataset_name) DO UPDATE SET
            last_seen = excluded.last_seen,
            updated_at = excluded.updated_at
          `
        )
          .bind(
            dataset.name,
            now,
            now,
            QUERIED_DATASETS.has(dataset.name) ? 1 : 0,
            dataset.billable ? 1 : 0,
            dataset.category,
            now,
            now
          )
          .run();
        d1Writes++;

        // Alert if this is a billable dataset we're not querying
        if (dataset.billable && !QUERIED_DATASETS.has(dataset.name)) {
          log.info('Available billable dataset not queried', { dataset: dataset.name });
          newBillableAlerts++;

          // Send Slack alert for new billable dataset
          if (env.SLACK_WEBHOOK_URL) {
            await sendSlackAlert(env, {
              text: ':warning: Billable Dataset Not Queried',
              attachments: [
                {
                  color: 'warning',
                  fields: [
                    { title: 'Dataset', value: dataset.name, short: true },
                    { title: 'Category', value: dataset.category, short: true },
                    {
                      title: 'Action Required',
                      value: 'Consider adding query for accurate cost tracking',
                      short: false,
                    },
                  ],
                },
              ],
            });
          }
        }
      } catch (error) {
        log.error(`Error updating ${dataset.name}`, error instanceof Error ? error : undefined);
      }
    }

    // Small delay between probes to avoid rate limiting
    await new Promise((resolve) => setTimeout(resolve, 50));
  }

  log.info('Discovery complete', { datasetsChecked, newBillableAlerts, d1Writes });

  return { datasetsChecked, newBillableAlerts, d1Writes };
}
|