@littlebearapps/platform-admin-sdk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +112 -0
  2. package/dist/index.d.ts +16 -0
  3. package/dist/index.js +89 -0
  4. package/dist/prompts.d.ts +27 -0
  5. package/dist/prompts.js +80 -0
  6. package/dist/scaffold.d.ts +5 -0
  7. package/dist/scaffold.js +65 -0
  8. package/dist/templates.d.ts +16 -0
  9. package/dist/templates.js +131 -0
  10. package/package.json +46 -0
  11. package/templates/full/migrations/006_pattern_discovery.sql +199 -0
  12. package/templates/full/migrations/007_notifications_search.sql +127 -0
  13. package/templates/full/workers/lib/pattern-discovery/ai-prompt.ts +644 -0
  14. package/templates/full/workers/lib/pattern-discovery/clustering.ts +278 -0
  15. package/templates/full/workers/lib/pattern-discovery/shadow-evaluation.ts +603 -0
  16. package/templates/full/workers/lib/pattern-discovery/storage.ts +806 -0
  17. package/templates/full/workers/lib/pattern-discovery/types.ts +159 -0
  18. package/templates/full/workers/lib/pattern-discovery/validation.ts +278 -0
  19. package/templates/full/workers/pattern-discovery.ts +661 -0
  20. package/templates/full/workers/platform-alert-router.ts +1809 -0
  21. package/templates/full/workers/platform-notifications.ts +424 -0
  22. package/templates/full/workers/platform-search.ts +480 -0
  23. package/templates/full/workers/platform-settings.ts +436 -0
  24. package/templates/full/wrangler.alert-router.jsonc.hbs +34 -0
  25. package/templates/full/wrangler.notifications.jsonc.hbs +23 -0
  26. package/templates/full/wrangler.pattern-discovery.jsonc.hbs +33 -0
  27. package/templates/full/wrangler.search.jsonc.hbs +16 -0
  28. package/templates/full/wrangler.settings.jsonc.hbs +23 -0
  29. package/templates/shared/README.md.hbs +69 -0
  30. package/templates/shared/config/budgets.yaml.hbs +72 -0
  31. package/templates/shared/config/services.yaml.hbs +45 -0
  32. package/templates/shared/migrations/001_core_tables.sql +117 -0
  33. package/templates/shared/migrations/002_usage_warehouse.sql +830 -0
  34. package/templates/shared/migrations/003_feature_tracking.sql +250 -0
  35. package/templates/shared/migrations/004_settings_alerts.sql +452 -0
  36. package/templates/shared/migrations/seed.sql.hbs +4 -0
  37. package/templates/shared/package.json.hbs +21 -0
  38. package/templates/shared/scripts/sync-config.ts +242 -0
  39. package/templates/shared/tsconfig.json +12 -0
  40. package/templates/shared/workers/lib/analytics-engine.ts +357 -0
  41. package/templates/shared/workers/lib/billing.ts +293 -0
  42. package/templates/shared/workers/lib/circuit-breaker-middleware.ts +25 -0
  43. package/templates/shared/workers/lib/control.ts +292 -0
  44. package/templates/shared/workers/lib/economics.ts +368 -0
  45. package/templates/shared/workers/lib/metrics.ts +103 -0
  46. package/templates/shared/workers/lib/platform-settings.ts +407 -0
  47. package/templates/shared/workers/lib/shared/allowances.ts +333 -0
  48. package/templates/shared/workers/lib/shared/cloudflare.ts +1362 -0
  49. package/templates/shared/workers/lib/shared/types.ts +58 -0
  50. package/templates/shared/workers/lib/telemetry-sampling.ts +360 -0
  51. package/templates/shared/workers/lib/usage/collectors/example.ts +96 -0
  52. package/templates/shared/workers/lib/usage/collectors/index.ts +128 -0
  53. package/templates/shared/workers/lib/usage/handlers/audit.ts +306 -0
  54. package/templates/shared/workers/lib/usage/handlers/backfill.ts +845 -0
  55. package/templates/shared/workers/lib/usage/handlers/behavioral.ts +429 -0
  56. package/templates/shared/workers/lib/usage/handlers/data-queries.ts +507 -0
  57. package/templates/shared/workers/lib/usage/handlers/dlq-admin.ts +364 -0
  58. package/templates/shared/workers/lib/usage/handlers/health-trends.ts +222 -0
  59. package/templates/shared/workers/lib/usage/handlers/index.ts +35 -0
  60. package/templates/shared/workers/lib/usage/handlers/usage-admin.ts +421 -0
  61. package/templates/shared/workers/lib/usage/handlers/usage-features.ts +1262 -0
  62. package/templates/shared/workers/lib/usage/handlers/usage-metrics.ts +2420 -0
  63. package/templates/shared/workers/lib/usage/handlers/usage-settings.ts +610 -0
  64. package/templates/shared/workers/lib/usage/queue/budget-enforcement.ts +1032 -0
  65. package/templates/shared/workers/lib/usage/queue/cost-budget-enforcement.ts +128 -0
  66. package/templates/shared/workers/lib/usage/queue/cost-calculator.ts +77 -0
  67. package/templates/shared/workers/lib/usage/queue/dlq-handler.ts +161 -0
  68. package/templates/shared/workers/lib/usage/queue/index.ts +19 -0
  69. package/templates/shared/workers/lib/usage/queue/telemetry-processor.ts +790 -0
  70. package/templates/shared/workers/lib/usage/scheduled/anomaly-detection.ts +732 -0
  71. package/templates/shared/workers/lib/usage/scheduled/data-collection.ts +956 -0
  72. package/templates/shared/workers/lib/usage/scheduled/error-digest.ts +343 -0
  73. package/templates/shared/workers/lib/usage/scheduled/index.ts +18 -0
  74. package/templates/shared/workers/lib/usage/scheduled/rollups.ts +1561 -0
  75. package/templates/shared/workers/lib/usage/shared/constants.ts +362 -0
  76. package/templates/shared/workers/lib/usage/shared/index.ts +14 -0
  77. package/templates/shared/workers/lib/usage/shared/types.ts +1066 -0
  78. package/templates/shared/workers/lib/usage/shared/utils.ts +795 -0
  79. package/templates/shared/workers/platform-usage.ts +1915 -0
  80. package/templates/shared/wrangler.usage.jsonc.hbs +58 -0
  81. package/templates/standard/migrations/005_error_collection.sql +162 -0
  82. package/templates/standard/workers/error-collector.ts +2670 -0
  83. package/templates/standard/workers/lib/error-collector/capture.ts +213 -0
  84. package/templates/standard/workers/lib/error-collector/digest.ts +448 -0
  85. package/templates/standard/workers/lib/error-collector/email-health-alerts.ts +262 -0
  86. package/templates/standard/workers/lib/error-collector/fingerprint.ts +258 -0
  87. package/templates/standard/workers/lib/error-collector/gap-alerts.ts +293 -0
  88. package/templates/standard/workers/lib/error-collector/github.ts +329 -0
  89. package/templates/standard/workers/lib/error-collector/types.ts +262 -0
  90. package/templates/standard/workers/lib/sentinel/gap-detection.ts +734 -0
  91. package/templates/standard/workers/lib/shared/slack-alerts.ts +585 -0
  92. package/templates/standard/workers/platform-sentinel.ts +1744 -0
  93. package/templates/standard/wrangler.error-collector.jsonc.hbs +44 -0
  94. package/templates/standard/wrangler.sentinel.jsonc.hbs +45 -0
@@ -0,0 +1,1744 @@
1
+ /**
2
+ * Platform Sentinel Worker
3
+ *
4
+ * Monitors Cloudflare resource costs and sends alerts via Slack and Email
5
+ * when costs exceed configured thresholds or spike significantly.
6
+ *
7
+ * Runs on a cron schedule (every 15 minutes) and uses KV for rate limiting
8
+ * to prevent alert fatigue.
9
+ *
10
+ * @module workers/platform-sentinel
11
+ * @created 2026-01-05
12
+ * @renamed 2026-01-23 (from cost-spike-alerter)
13
+ * @task task-17.20 - Slack webhook alerts for cost spikes
14
+ * @task task-17.21 - Email alerts via Resend
15
+ */
16
+
17
+ import type {
18
+ KVNamespace,
19
+ ExecutionContext,
20
+ ScheduledEvent,
21
+ D1Database,
22
+ Fetcher,
23
+ } from '@cloudflare/workers-types';
24
+ import {
25
+ withFeatureBudget,
26
+ withCronBudget,
27
+ CircuitBreakerError,
28
+ completeTracking,
29
+ MONITOR_COST_SPIKE,
30
+ HEARTBEAT_HEALTH,
31
+ createLogger,
32
+ createLoggerFromRequest,
33
+ createTraceContext,
34
+ health,
35
+ type Logger,
36
+ } from '@littlebearapps/platform-consumer-sdk';
37
+ import {
38
+ detectGaps,
39
+ storeGapReport,
40
+ alertGaps,
41
+ alertGapsEmail,
42
+ detectProjectGaps,
43
+ type ProjectGap,
44
+ } from './lib/sentinel/gap-detection';
45
+ import { pingHeartbeat } from '@littlebearapps/platform-consumer-sdk';
46
+ import { PAID_ALLOWANCES, PRICING_TIERS } from '@littlebearapps/platform-consumer-sdk';
47
+
48
/**
 * Bindings and secrets this worker reads from its environment.
 * Optional members are features that are skipped when the binding is absent.
 */
interface Env {
  CLOUDFLARE_API_TOKEN: string;
  CLOUDFLARE_ACCOUNT_ID: string;
  SLACK_WEBHOOK_URL: string;
  RESEND_API_KEY: string;
  ALERT_EMAIL_TO: string;
  /** D1 database used for system health checks and cost queries. */
  PLATFORM_DB: D1Database;
  PLATFORM_CACHE: KVNamespace;
  /** KV namespace used for alert rate limiting. */
  PLATFORM_ALERTS: KVNamespace;
  /** Queue receiving SDK telemetry. */
  PLATFORM_TELEMETRY: Queue;
  /** Gatus heartbeat ping URL for cron monitoring. */
  GATUS_HEARTBEAT_URL?: string;
  /** Bearer token for Gatus external endpoints. */
  GATUS_TOKEN?: string;
  /** Service binding used to create dashboard notifications. */
  NOTIFICATIONS_API?: Fetcher;
  /** Service binding used to create gap-alert GitHub issues. */
  ERROR_COLLECTOR?: Fetcher;
}
63
+
64
// TODO: Set your dashboard URL and alert email address
/** Base URL of the dashboard (placeholder; replace per deployment). */
const DASHBOARD_URL = 'https://your-dashboard.example.com';
/** "From" address used for alert emails (placeholder; replace per deployment). */
const ALERT_FROM_EMAIL = 'Usage Alerts <alerts@mail.your-domain.com>';

// Raw (unwrapped) Fetcher bindings held at module scope. scheduled() assigns
// them BEFORE the env is wrapped by the SDK: the SDK proxy wraps .fetch(),
// which causes "Illegal invocation" on native Fetcher bindings.
let _rawNotificationsApi: Fetcher | undefined;
let _rawErrorCollector: Fetcher | undefined;
72
+
73
/**
 * Alerting thresholds for a single service, as stored in KV.
 */
interface ServiceThreshold {
  /** Percentage at which the "warning" level applies. */
  warningPct: number;
  /** Percentage at which the "high" level applies. */
  highPct: number;
  /** Percentage at which the "critical" level applies. */
  criticalPct: number;
  /** Absolute ceiling for the service (presumably USD — confirm against evaluateAlerts). */
  absoluteMax: number;
  /** Whether alerting is switched on for this service. */
  enabled: boolean;
}
83
+
84
/**
 * Threshold table keyed by service name (e.g. the keys of DEFAULT_THRESHOLDS).
 */
interface AlertThresholds {
  [key: string]: ServiceThreshold;
}
87
+
88
/**
 * Cost per Cloudflare service plus the overall total.
 * Member names match the column aliases produced by the D1 fallback query.
 */
interface CostBreakdown {
  workers: number;
  d1: number;
  kv: number;
  r2: number;
  durableObjects: number;
  vectorize: number;
  aiGateway: number;
  workersAI: number;
  pages: number;
  queues: number;
  workflows: number;
  /** Sum across all services. */
  total: number;
}
105
+
106
/**
 * Payload describing one cost alert, including the context rendered into
 * Slack / email notifications.
 */
interface CostSpikeAlert {
  id: string;
  serviceType: string;
  resourceName: string;
  currentCost: number;
  previousCost: number;
  costDeltaPct: number;
  costPercentOfMax: number;
  thresholdLevel: 'normal' | 'warning' | 'high' | 'critical';
  absoluteMax: number;
  timestamp: string;

  // --- Billing period context ---
  billingPeriodStart: string;
  billingPeriodEnd: string;
  billingDaysElapsed: number;
  billingDaysTotal: number;

  // --- Workers Paid plan allowance context ---
  monthlyAllowance: string;
  isWithinAllowance: boolean;
  overageCost: number;

  /** Top contributing projects by cost. */
  topProjects: Array<{ project: string; cost: number; pctOfTotal: number }>;
  /** Top contributing features by usage. */
  topFeatures: Array<{ featureKey: string; usage: number; pctOfTotal: number }>;
  /** Usage versus plan allowance, one entry per metric. */
  usageBreakdown: UsageMetricBreakdown[];
}
136
+
137
/**
 * Human-readable summary of what each service includes per month on the
 * Workers Paid plan. Used purely as descriptive context in alerts.
 */
const SERVICE_ALLOWANCE_DESCRIPTIONS: Record<string, string> = {
  // Compute and storage
  workers: '10M requests + 30M CPU-ms/mo (Workers Paid)',
  d1: '25B reads + 50M writes/mo (Workers Paid)',
  kv: '10M reads + 1M writes + 1M deletes + 1M lists/mo',
  r2: '10GB storage + 1M Class A + 10M Class B ops/mo',
  durableObjects: '1M requests + 400K GB-s/mo',
  vectorize: '10M stored + 50M queried dimensions/mo',
  // Gateway, hosting and messaging
  aiGateway: 'Free (pass-through)',
  pages: '500 builds/mo + 100GB bandwidth',
  queues: '1M operations/mo',
  workflows: 'Beta (free)',
  workersAI: 'Usage-based (10K neurons/day free)',
};
154
+
155
/**
 * Built-in thresholds, used when KV holds no (usable) configuration.
 *
 * NOTE(review): there is no `workersAI` entry even though CostBreakdown and
 * SERVICE_ALLOWANCE_DESCRIPTIONS both carry one — confirm this is intentional.
 */
const DEFAULT_THRESHOLDS: AlertThresholds = {
  workers: {
    warningPct: 50,
    highPct: 75,
    criticalPct: 90,
    absoluteMax: 5,
    enabled: true,
  },
  d1: {
    warningPct: 40,
    highPct: 60,
    criticalPct: 80,
    absoluteMax: 20,
    enabled: true,
  },
  kv: {
    warningPct: 50,
    highPct: 75,
    criticalPct: 90,
    absoluteMax: 5,
    enabled: true,
  },
  r2: {
    warningPct: 50,
    highPct: 75,
    criticalPct: 90,
    absoluteMax: 20,
    enabled: true,
  },
  durableObjects: {
    warningPct: 50,
    highPct: 75,
    criticalPct: 90,
    absoluteMax: 10,
    enabled: true,
  },
  vectorize: {
    warningPct: 50,
    highPct: 75,
    criticalPct: 90,
    absoluteMax: 5,
    enabled: true,
  },
  // Disabled by default
  aiGateway: {
    warningPct: 50,
    highPct: 75,
    criticalPct: 90,
    absoluteMax: 0,
    enabled: false,
  },
  pages: {
    warningPct: 50,
    highPct: 75,
    criticalPct: 90,
    absoluteMax: 5,
    enabled: true,
  },
  queues: {
    warningPct: 50,
    highPct: 75,
    criticalPct: 90,
    absoluteMax: 5,
    enabled: true,
  },
  // Disabled by default
  workflows: {
    warningPct: 50,
    highPct: 75,
    criticalPct: 90,
    absoluteMax: 0,
    enabled: false,
  },
};
170
+
171
/** Slack rate-limit window in seconds: one alert per resource per hour. */
const SLACK_RATE_LIMIT_TTL = 3600;

/** Email rate-limit window in seconds: one alert per resource per 4 hours. */
const EMAIL_RATE_LIMIT_TTL = 14400;
180
+
181
export default {
  /**
   * Cron trigger handler.
   *
   * Order of work: gap detection, per-project gap detection, stale-heartbeat
   * check, then cost alerting (only when cost data is available), followed by
   * the SDK heartbeat, tracking completion and the Gatus success ping.
   *
   * Errors never propagate to the caller: a CircuitBreakerError is logged as a
   * warning and the run stops; any other error pings Gatus with failure and
   * is logged.
   *
   * @param event Scheduled event; only `scheduledTime` is read here.
   * @param env   Worker bindings.
   * @param ctx   Execution context, passed to the SDK and heartbeat helpers.
   */
  async scheduled(event: ScheduledEvent, env: Env, ctx: ExecutionContext): Promise<void> {
    const log = createLogger({ worker: 'platform-sentinel', featureId: MONITOR_COST_SPIKE });
    log.info('Cron triggered', { scheduled_time: new Date(event.scheduledTime).toISOString() });

    // Gatus heartbeat is pinged on success/fail only (no /start support)

    // CRITICAL: Capture raw Fetcher bindings BEFORE SDK wrapping.
    // The SDK triple-proxy wraps .fetch() in async wrapper causing "Illegal invocation"
    // on native Cloudflare Fetcher bindings. See platform-alert-router.ts for same pattern.
    _rawNotificationsApi = env.NOTIFICATIONS_API;
    _rawErrorCollector = env.ERROR_COLLECTOR;

    try {
      // Wrap with Platform SDK for usage tracking and circuit breaker protection
      const trackedEnv = withCronBudget(env, MONITOR_COST_SPIKE, {
        ctx,
        cronExpression: '*/15 * * * *', // Every 15 minutes
      });

      // 1. Gap detection - check for missing hourly snapshots (ALWAYS runs, independent of cost data)
      // This was previously step 8, but must run regardless of cache state (fix for task-312)
      const gaps = await detectGaps(trackedEnv, log);
      if (gaps.severity !== 'ok') {
        // Store gap report for aggregation by platform-auditor
        await storeGapReport(trackedEnv, gaps, log);
        // Send alerts
        await alertGaps(trackedEnv, gaps, log);
        await alertGapsEmail(trackedEnv, gaps, log);
      }

      // 1b. Per-project gap detection - check resource_usage_snapshots coverage
      // Creates GitHub issues in correct repo when coverage drops below 90%
      const projectGaps = await detectProjectGaps(trackedEnv, log);
      if (projectGaps.length > 0 && _rawErrorCollector) {
        log.info('Detected per-project gaps, sending to error-collector', {
          projectCount: projectGaps.length,
        });
        for (const gap of projectGaps) {
          try {
            const response = await _rawErrorCollector.fetch(
              'https://error-collector.internal/gap-alerts',
              {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                  project: gap.project,
                  hoursWithData: gap.hoursWithData,
                  expectedHours: gap.expectedHours,
                  coveragePct: gap.coveragePct,
                  missingHours: gap.missingHours,
                  repository: gap.repository,
                }),
              }
            );
            // A non-2xx reply means the alert was NOT accepted. Surface it through
            // the catch below instead of logging it at debug level as a result.
            if (!response.ok) {
              const detail = await response.text();
              throw new Error(
                `error-collector responded with HTTP ${response.status}: ${detail.slice(0, 200)}`
              );
            }
            const result = await response.json();
            log.debug('Gap alert result', { project: gap.project, result });
          } catch (e) {
            // One failing project must not stop alerts for the remaining ones.
            log.error('Failed to send gap alert to error-collector', e, {
              project: gap.project,
            });
          }
        }
      }

      // 2. Check for stale heartbeats (DO health monitoring) - also runs always
      await checkStaleHeartbeats(trackedEnv, log);

      // 3. Load thresholds from KV (or use defaults)
      const thresholds = await loadThresholds(trackedEnv, log);

      // 4. Fetch current costs from Usage API (optional - may be cache cold)
      const currentCosts = await fetchCurrentCosts(trackedEnv, log);
      if (currentCosts) {
        // 5. Load previous costs from KV (for delta comparison)
        const previousCosts = await loadPreviousCosts(trackedEnv, log);

        // 6. Evaluate alerts (async — queries D1 for per-project/feature attribution)
        const alerts = await evaluateAlerts(currentCosts, previousCosts, thresholds, trackedEnv, log);
        log.info('Evaluated potential alerts', { alert_count: alerts.length });

        // 7. Send alerts (with rate limiting)
        for (const alert of alerts) {
          await sendAlerts(alert, trackedEnv, log);
        }

        // 8. Store current costs for next comparison
        await storeCosts(currentCosts, trackedEnv, log);
      } else {
        // Not an error - cache may be cold (expected during cold starts or low traffic)
        log.debug('No cost data available (cache cold), skipping cost alerting', {
          hint: 'Call GET /usage on platform-usage to populate cache',
        });
      }

      // 9. Send Platform SDK heartbeat
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      await health(HEARTBEAT_HEALTH, env.PLATFORM_CACHE as any, env.PLATFORM_TELEMETRY, ctx);
      log.debug('Heartbeat sent');

      // Complete SDK tracking
      await completeTracking(trackedEnv);

      // Signal success to Gatus heartbeat
      pingHeartbeat(ctx, env.GATUS_HEARTBEAT_URL, env.GATUS_TOKEN, true);

      log.info('Completed successfully');
    } catch (error) {
      // Handle circuit breaker gracefully - skip execution
      if (error instanceof CircuitBreakerError) {
        log.warn('Circuit breaker STOP', error, { reason: error.reason });
        return;
      }

      // Signal failure to Gatus heartbeat
      pingHeartbeat(ctx, env.GATUS_HEARTBEAT_URL, env.GATUS_TOKEN, false);

      log.error('Error', error);
    }
  },

  /**
   * HTTP handler (manual trigger / health check).
   *
   * Routes:
   * - `/health` (any method): static OK payload; bypasses the SDK.
   * - `POST /trigger`: runs scheduled() with a synthetic event.
   * - anything else: JSON listing of the available endpoints.
   *
   * NOTE(review): no authentication check is performed in this handler, so
   * `/trigger` is callable by anyone who can reach the worker — confirm it is
   * protected upstream.
   *
   * @returns 200 JSON on success, 503 when the circuit breaker trips, 500 otherwise.
   */
  async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise<Response> {
    const url = new URL(request.url);

    // Health check bypasses SDK for lightweight responses
    if (url.pathname === '/health') {
      return new Response(
        JSON.stringify({
          status: 'ok',
          service: 'platform-sentinel',
          timestamp: new Date().toISOString(),
        }),
        { headers: { 'Content-Type': 'application/json' } }
      );
    }

    // Create logger with trace context from request
    const traceContext = createTraceContext(request, env);
    const log = createLoggerFromRequest(request, env, 'platform-sentinel', MONITOR_COST_SPIKE);

    log.info('Request received', {
      method: request.method,
      path: url.pathname,
      traceId: traceContext.traceId,
    });

    try {
      // Wrap with Platform SDK for usage tracking
      const trackedEnv = withFeatureBudget(env, MONITOR_COST_SPIKE, { ctx });

      if (url.pathname === '/trigger' && request.method === 'POST') {
        // Manual trigger (for testing)
        log.info('Manual trigger requested');
        const event = {
          scheduledTime: Date.now(),
          cron: '*/15 * * * *',
          noRetry: () => {},
        } as unknown as ScheduledEvent;
        // scheduled() handles its own errors, so this resolves even if the run failed.
        await this.scheduled(event, env, ctx);
        await completeTracking(trackedEnv);
        log.info('Manual trigger completed');
        return new Response(
          JSON.stringify({ status: 'triggered', traceId: traceContext.traceId }),
          {
            headers: { 'Content-Type': 'application/json' },
          }
        );
      }

      await completeTracking(trackedEnv);
      return new Response(
        JSON.stringify({
          service: 'platform-sentinel',
          endpoints: ['/health', '/trigger (POST)'],
        }),
        { headers: { 'Content-Type': 'application/json' } }
      );
    } catch (error) {
      if (error instanceof CircuitBreakerError) {
        log.warn('Circuit breaker tripped', error, {
          path: url.pathname,
          reason: error.reason,
        });
        return new Response(
          JSON.stringify({
            error: 'Service temporarily unavailable',
            code: 'CIRCUIT_BREAKER',
            traceId: traceContext.traceId,
          }),
          {
            status: 503,
            headers: { 'Content-Type': 'application/json', 'Retry-After': '60' },
          }
        );
      }

      // Log full error with stack trace for debugging
      log.error('Request failed', error, {
        path: url.pathname,
        method: request.method,
        traceId: traceContext.traceId,
      });

      return new Response(
        JSON.stringify({
          error: 'Internal server error',
          traceId: traceContext.traceId,
        }),
        {
          status: 500,
          headers: { 'Content-Type': 'application/json' },
        }
      );
    }
  },
};
403
+
404
/**
 * Load alert thresholds from KV key `alert-thresholds:config`, layered over
 * DEFAULT_THRESHOLDS.
 *
 * Overrides are merged per service, so a stored entry that sets only some
 * fields keeps the remaining defaults for that service. Malformed payloads
 * (non-object JSON, or a non-object entry for a service) are ignored rather
 * than allowed to replace a valid default.
 *
 * @param env Worker bindings; only PLATFORM_CACHE is read.
 * @param log Logger for malformed-config warnings and KV/parse failures.
 * @returns The merged thresholds, or DEFAULT_THRESHOLDS when nothing usable is stored.
 */
async function loadThresholds(env: Env, log: Logger): Promise<AlertThresholds> {
  try {
    const stored = await env.PLATFORM_CACHE.get('alert-thresholds:config');
    if (stored) {
      const parsed: unknown = JSON.parse(stored);
      if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        log.warn('Ignoring malformed alert thresholds config', {
          data_type: Array.isArray(parsed) ? 'array' : typeof parsed,
        });
        return DEFAULT_THRESHOLDS;
      }

      const merged: AlertThresholds = { ...DEFAULT_THRESHOLDS };
      for (const [service, override] of Object.entries(parsed as Record<string, unknown>)) {
        if (override === null || typeof override !== 'object' || Array.isArray(override)) {
          log.warn('Ignoring malformed threshold override', { service });
          continue;
        }
        // Field-level merge: fields missing from the override fall back to the default.
        merged[service] = {
          ...DEFAULT_THRESHOLDS[service],
          ...(override as Partial<ServiceThreshold>),
        } as ServiceThreshold;
      }
      return merged;
    }
  } catch (error) {
    log.error('Failed to load thresholds from KV', error);
  }
  return DEFAULT_THRESHOLDS;
}
420
+
421
/**
 * Build the hour-bucketed usage cache key (must match usage-api.ts).
 * Format: usage:{period}:{project}:{hourTimestamp}
 *
 * @param period     Usage period segment of the key (e.g. '30d').
 * @param project    Project segment of the key (e.g. 'all').
 * @param hourOffset Hours to add to the current hour bucket (-1 = previous hour).
 */
function getUsageCacheKey(period: string, project: string, hourOffset = 0): string {
  const MS_PER_HOUR = 60 * 60 * 1000;
  const currentHourBucket = Math.floor(Date.now() / MS_PER_HOUR);
  const targetBucket = currentHourBucket + hourOffset;
  return ['usage', period, project, targetBucket].join(':');
}
429
+
430
/**
 * Resolve the current cost breakdown.
 *
 * Reads the usage payload cached in KV for the current hour, then for the
 * previous hour. Whenever no usable cached value exists (missing, blank,
 * invalid JSON, no `costs` property, or the KV read throws) the result of
 * fetchCostsFromD1() is returned instead.
 * The cache is populated by platform-usage /usage endpoint calls (30-min TTL).
 *
 * @returns The cached `costs` object, or whatever the D1 fallback yields.
 */
async function fetchCurrentCosts(env: Env, log: Logger): Promise<CostBreakdown | null> {
  const currentCacheKey = getUsageCacheKey('30d', 'all', 0);
  const prevCacheKey = getUsageCacheKey('30d', 'all', -1);

  try {
    // Current hour first.
    let cacheKeyUsed = currentCacheKey;
    let usageData = await env.PLATFORM_CACHE.get(currentCacheKey);

    if (!usageData) {
      // Previous hour covers cache cold starts.
      cacheKeyUsed = prevCacheKey;
      usageData = await env.PLATFORM_CACHE.get(prevCacheKey);

      if (!usageData) {
        // Nothing cached at all: compute from D1.
        log.info('KV cache cold, falling back to D1 cost computation', {
          current_key: currentCacheKey,
          prev_key: prevCacheKey,
        });
        return fetchCostsFromD1(env, log);
      }

      log.debug('Using previous hour cache', { cache_key: prevCacheKey });
    }

    // Guard: the payload must be a non-blank string.
    const isUsableString = typeof usageData === 'string' && usageData.trim() !== '';
    if (!isUsableString) {
      log.warn('Invalid cache data (empty or non-string)', {
        cache_key: cacheKeyUsed,
        data_type: typeof usageData,
        data_length: usageData?.length ?? 0,
      });
      return fetchCostsFromD1(env, log);
    }

    // Guard: the payload must be valid JSON.
    let usage: { costs?: CostBreakdown };
    try {
      usage = JSON.parse(usageData);
    } catch (parseError) {
      const message = parseError instanceof Error ? parseError.message : String(parseError);
      log.warn('Cache data is not valid JSON', {
        cache_key: cacheKeyUsed,
        data_preview: usageData.slice(0, 100),
        error: message,
      });
      return fetchCostsFromD1(env, log);
    }

    // Guard: the payload must carry a costs object.
    if (usage.costs) {
      return usage.costs;
    }
    log.warn('Cache data missing costs property', {
      cache_key: cacheKeyUsed,
      available_keys: Object.keys(usage),
    });
    return fetchCostsFromD1(env, log);
  } catch (error) {
    // Unexpected failures (KV errors, etc.)
    log.error('Failed to fetch costs from KV', error, {
      current_key: currentCacheKey,
      prev_key: prevCacheKey,
    });
    return fetchCostsFromD1(env, log);
  }
}
504
+
505
/**
 * Compute the cost breakdown directly from D1 `hourly_usage_snapshots`,
 * summing the per-service cost columns of every `project = 'all'` row dated on
 * or after the start of the current billing period.
 * Used as fallback when the KV cache is cold (no recent dashboard API calls).
 *
 * @param env Worker bindings; only PLATFORM_DB is used.
 * @param log Logger for the outcome / failure.
 * @returns The summed breakdown, or null when no snapshot rows match or the
 *          query fails.
 */
async function fetchCostsFromD1(env: Env, log: Logger): Promise<CostBreakdown | null> {
  try {
    const billing = getBillingPeriod();
    const result = await env.PLATFORM_DB.prepare(`
      SELECT
        SUM(COALESCE(workers_cost_usd, 0)) as workers,
        SUM(COALESCE(d1_cost_usd, 0)) as d1,
        SUM(COALESCE(kv_cost_usd, 0)) as kv,
        SUM(COALESCE(r2_cost_usd, 0)) as r2,
        SUM(COALESCE(do_cost_usd, 0)) as durableObjects,
        SUM(COALESCE(vectorize_cost_usd, 0)) as vectorize,
        SUM(COALESCE(aigateway_cost_usd, 0)) as aiGateway,
        SUM(COALESCE(workersai_cost_usd, 0)) as workersAI,
        SUM(COALESCE(pages_cost_usd, 0)) as pages,
        SUM(COALESCE(queues_cost_usd, 0)) as queues,
        SUM(COALESCE(workflows_cost_usd, 0)) as workflows,
        SUM(COALESCE(total_cost_usd, 0)) as total
      FROM hourly_usage_snapshots
      WHERE project = 'all' AND DATE(snapshot_hour) >= ?
    `).bind(billing.start).first<CostBreakdown>();

    // An aggregate query without GROUP BY always yields one row; when no
    // snapshot matches, every SUM() in it is NULL. The COALESCE inside SUM
    // guarantees a non-NULL total whenever at least one row matched, so a NULL
    // total reliably means "no data" and must not be reported as a breakdown.
    if (!result || result.total == null) {
      log.warn('D1 fallback returned no data');
      return null;
    }

    log.info('Computed costs from D1 fallback', {
      total: result.total,
      billing_start: billing.start,
      source: 'd1-fallback',
    });

    return result;
  } catch (error) {
    log.error('D1 fallback cost computation failed', error);
    return null;
  }
}
548
+
549
/**
 * Read the cost breakdown saved by the previous run from KV key
 * `platform-sentinel:previous-costs`.
 *
 * @returns The parsed breakdown, or null when nothing is stored or the
 *          read/parse fails (failures are logged, not thrown).
 */
async function loadPreviousCosts(env: Env, log: Logger): Promise<CostBreakdown | null> {
  let previous: CostBreakdown | null = null;
  try {
    const raw = await env.PLATFORM_CACHE.get('platform-sentinel:previous-costs');
    if (raw) {
      previous = JSON.parse(raw);
    }
  } catch (error) {
    log.error('Failed to load previous costs', error);
    previous = null;
  }
  return previous;
}
563
+
564
/**
 * Persist the given cost breakdown to KV key `platform-sentinel:previous-costs`
 * so the next run can compare against it. Write failures are logged, not thrown.
 */
async function storeCosts(costs: CostBreakdown, env: Env, log: Logger): Promise<void> {
  const serialized = JSON.stringify(costs);
  const putOptions = {
    expirationTtl: 86400 * 7, // Keep for 7 days
  };
  try {
    await env.PLATFORM_CACHE.put('platform-sentinel:previous-costs', serialized, putOptions);
  } catch (error) {
    log.error('Failed to store costs', error);
  }
}
576
+
577
+ // =============================================================================
578
+ // PER-PROJECT / PER-FEATURE ATTRIBUTION
579
+ // =============================================================================
580
+
581
/**
 * Map service names to resource_usage_snapshots config for per-project cost attribution.
 *
 * Each entry gives the `resource_type` value to filter on and a SQL aggregate
 * expression that converts raw usage columns into USD using Cloudflare's
 * published overage prices (intended to mirror PRICING_TIERS in
 * workers/lib/costs.ts — keep the two in sync).
 * Note: allowances are account-level so NOT subtracted here — this shows proportional attribution.
 *
 * Prices used (USD):
 * - workers:        0.30 / 1M requests, 0.02 / 1M CPU-ms
 * - d1:             0.001 / 1M rows read, 1.00 / 1M rows written
 * - kv:             0.50 / 1M reads, 5.00 / 1M writes, 5.00 / 1M deletes
 * - r2:             4.50 / 1M Class A ops, 0.36 / 1M Class B ops
 * - durableObjects: 0.15 / 1M requests, 12.50 / 1M GB-s
 * - queues:         0.40 / 1M operations
 */
const SERVICE_RESOURCE_CONFIG: Record<string, { resourceType: string; costExpr: string }> = {
  workers: {
    resourceType: 'worker',
    costExpr: `SUM(COALESCE(requests, 0)) / 1000000.0 * 0.30 + SUM(COALESCE(cpu_time_ms, 0)) / 1000000.0 * 0.02`,
  },
  d1: {
    resourceType: 'd1',
    // Rows read are billed per MILLION rows (not per billion); pricing them per
    // billion under-weights read-heavy projects by 1000x relative to writes.
    costExpr: `SUM(COALESCE(rows_read, 0)) / 1000000.0 * 0.001 + SUM(COALESCE(rows_written, 0)) / 1000000.0 * 1.00`,
  },
  kv: {
    resourceType: 'kv',
    costExpr: `SUM(COALESCE(reads, 0)) / 1000000.0 * 0.50 + SUM(COALESCE(writes, 0)) / 1000000.0 * 5.00 + SUM(COALESCE(deletes, 0)) / 1000000.0 * 5.00`,
  },
  r2: {
    resourceType: 'r2',
    costExpr: `SUM(COALESCE(class_a_ops, 0)) / 1000000.0 * 4.50 + SUM(COALESCE(class_b_ops, 0)) / 1000000.0 * 0.36`,
  },
  durableObjects: {
    resourceType: 'do',
    costExpr: `SUM(COALESCE(requests, 0)) / 1000000.0 * 0.15 + SUM(COALESCE(gb_seconds, 0)) / 1000000.0 * 12.50`,
  },
  queues: {
    resourceType: 'queues',
    // Queues operations are billed at $0.40 per million.
    costExpr: `(SUM(COALESCE(reads, 0)) + SUM(COALESCE(writes, 0))) / 1000000.0 * 0.40`,
  },
  // vectorize: excluded — resource_usage_snapshots only stores vector storage, not queried dimensions
  // pages: excluded — no meaningful per-project cost metrics
  // aiGateway: excluded — free service
  // workersAI: excluded — neurons tracked but not per-project in resource_usage_snapshots
};
616
+
617
/**
 * Primary usage column in `feature_usage_daily` for each service.
 * Services without an entry have no per-feature breakdown.
 * Values are interpolated into SQL as identifiers, so they must remain
 * hard-coded column names.
 */
const SERVICE_FEATURE_COLUMN: Record<string, string> = {
  workers: 'requests',
  d1: 'd1_writes',
  kv: 'kv_writes',
  r2: 'r2_class_a',
  durableObjects: 'do_requests',
  pages: 'requests',
  queues: 'queue_messages',
  workersAI: 'ai_neurons',
};
631
+
632
/**
 * Return up to five projects with the highest computed cost for one service.
 *
 * Costs are calculated on the fly from `resource_usage_snapshots` using the
 * service's expression in SERVICE_RESOURCE_CONFIG. `pctOfTotal` is each
 * project's share of the summed cost of the returned rows.
 *
 * @param env          Worker bindings; only PLATFORM_DB is used.
 * @param serviceName  Key into SERVICE_RESOURCE_CONFIG.
 * @param billingStart Lower bound compared against `snapshot_hour`.
 * @param log          Logger for query failures.
 * @returns Ranked projects; an empty array when the service has no config,
 *          no rows qualify, or the query fails.
 */
async function queryTopProjects(
  env: Env,
  serviceName: string,
  billingStart: string,
  log: Logger
): Promise<Array<{ project: string; cost: number; pctOfTotal: number }>> {
  const serviceConfig = SERVICE_RESOURCE_CONFIG[serviceName];
  if (!serviceConfig) {
    // Vectorize, Pages, AI Gateway — no per-project data available
    return [];
  }

  const sql = `SELECT project, (${serviceConfig.costExpr}) as cost
       FROM resource_usage_snapshots
       WHERE resource_type = ?
         AND snapshot_hour >= ?
         AND project NOT IN ('_unattributed', 'unknown')
       GROUP BY project
       HAVING cost > 0.001
       ORDER BY cost DESC
       LIMIT 5`;

  try {
    const statement = env.PLATFORM_DB.prepare(sql).bind(serviceConfig.resourceType, billingStart);
    const { results: rows } = await statement.all<{ project: string; cost: number }>();

    if (!rows || rows.length === 0) {
      return [];
    }

    let combinedCost = 0;
    for (const row of rows) {
      combinedCost = combinedCost + row.cost;
    }

    return rows.map(({ project, cost }) => {
      const pctOfTotal = combinedCost > 0 ? Math.round((cost / combinedCost) * 100) : 0;
      return { project, cost, pctOfTotal };
    });
  } catch (error) {
    log.error('Failed to query top projects', error, { service: serviceName });
    return [];
  }
}
674
+
675
/**
 * Return up to five features with the highest usage for one service over the
 * last 7 days, read from `feature_usage_daily`.
 *
 * The usage column comes from SERVICE_FEATURE_COLUMN. `pctOfTotal` is each
 * feature's share of the summed usage of the returned rows.
 *
 * @param env         Worker bindings; only PLATFORM_DB is used.
 * @param serviceName Key into SERVICE_FEATURE_COLUMN.
 * @param log         Logger for query failures.
 * @returns Ranked features; an empty array when the service has no usage
 *          column, no rows qualify, or the query fails.
 */
async function queryTopFeatures(
  env: Env,
  serviceName: string,
  log: Logger
): Promise<Array<{ featureKey: string; usage: number; pctOfTotal: number }>> {
  const column = SERVICE_FEATURE_COLUMN[serviceName];
  if (!column) {
    return [];
  }

  const sql = `SELECT feature_key, SUM(${column}) as usage
       FROM feature_usage_daily
       WHERE usage_date >= date('now', '-7 days')
         AND ${column} > 0
       GROUP BY feature_key
       ORDER BY usage DESC
       LIMIT 5`;

  try {
    const { results: rows } = await env.PLATFORM_DB.prepare(sql).all<{
      feature_key: string;
      usage: number;
    }>();

    if (!rows || rows.length === 0) {
      return [];
    }

    let combinedUsage = 0;
    for (const row of rows) {
      combinedUsage = combinedUsage + row.usage;
    }

    return rows.map(({ feature_key, usage }) => {
      const pctOfTotal = combinedUsage > 0 ? Math.round((usage / combinedUsage) * 100) : 0;
      return { featureKey: feature_key, usage, pctOfTotal };
    });
  } catch (error) {
    log.error('Failed to query top features', error, { service: serviceName });
    return [];
  }
}
712
+
713
+ // =============================================================================
714
+ // ALLOWANCE STATUS (Direct D1 query for accurate usage-vs-allowance)
715
+ // =============================================================================
716
+
717
/**
 * Usage of a single metric compared with its plan allowance.
 */
interface UsageMetricBreakdown {
  /** Machine-readable metric key (e.g. 'rows_read') */
  metric: string;
  /** Human-readable metric name used in alert output */
  label: string;
  /** Usage measured for the billing period so far */
  used: number;
  /** Allowance the usage is compared against */
  allowance: number;
  /** `used` as a percentage of `allowance` */
  pctOfAllowance: number;
  /** Units consumed beyond the allowance (0 when within it) */
  overageUnits: number;
  /** Cost attributed to `overageUnits` */
  overageCost: number;
}
729
+
730
/**
 * Result of comparing a service's usage with its plan allowance; alert
 * evaluation uses it to decide whether an alert should fire.
 */
interface AllowanceStatus {
  /** True when every metric of the service is at or below its plan allowance */
  withinAllowance: boolean;
  /** One entry per metric checked for the service */
  metrics: UsageMetricBreakdown[];
  /** Sum of the overage cost of all metrics */
  totalOverageCost: number;
}
741
+
742
/**
 * Allowance-check definitions, keyed by service name.
 *
 * Each entry names a metric, the SQL aggregate used to read it, the plan
 * allowance it is compared with, and the pricing inputs for overage:
 * overage cost = (units over allowance / unitDivisor) * pricePerUnit.
 */
const SERVICE_ALLOWANCE_METRICS: Record<string, Array<{
  metric: string;
  label: string;
  sqlExpr: string;
  allowance: number;
  pricePerUnit: number;
  unitDivisor: number;
}>> = {
  d1: [
    {
      metric: 'rows_read',
      label: 'Rows Read',
      sqlExpr: 'SUM(COALESCE(d1_rows_read, 0))',
      allowance: PAID_ALLOWANCES.d1.rowsRead,
      pricePerUnit: PRICING_TIERS.d1.rowsReadPerBillion,
      unitDivisor: 1_000_000_000,
    },
    {
      metric: 'rows_written',
      label: 'Rows Written',
      sqlExpr: 'SUM(COALESCE(d1_rows_written, 0))',
      allowance: PAID_ALLOWANCES.d1.rowsWritten,
      pricePerUnit: PRICING_TIERS.d1.rowsWrittenPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  kv: [
    {
      metric: 'reads',
      label: 'Reads',
      sqlExpr: 'SUM(COALESCE(kv_reads, 0))',
      allowance: PAID_ALLOWANCES.kv.reads,
      pricePerUnit: PRICING_TIERS.kv.readsPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'writes',
      label: 'Writes',
      sqlExpr: 'SUM(COALESCE(kv_writes, 0))',
      allowance: PAID_ALLOWANCES.kv.writes,
      pricePerUnit: PRICING_TIERS.kv.writesPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'deletes',
      label: 'Deletes',
      sqlExpr: 'SUM(COALESCE(kv_deletes, 0))',
      allowance: PAID_ALLOWANCES.kv.deletes,
      pricePerUnit: PRICING_TIERS.kv.deletesPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'list_ops',
      label: 'List Ops',
      sqlExpr: 'SUM(COALESCE(kv_list_ops, 0))',
      allowance: PAID_ALLOWANCES.kv.lists,
      pricePerUnit: PRICING_TIERS.kv.listsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  r2: [
    {
      metric: 'class_a',
      label: 'Class A Ops',
      sqlExpr: 'SUM(COALESCE(r2_class_a_ops, 0))',
      allowance: PAID_ALLOWANCES.r2.classA,
      pricePerUnit: PRICING_TIERS.r2.classAPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'class_b',
      label: 'Class B Ops',
      sqlExpr: 'SUM(COALESCE(r2_class_b_ops, 0))',
      allowance: PAID_ALLOWANCES.r2.classB,
      pricePerUnit: PRICING_TIERS.r2.classBPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      // Read with MAX rather than SUM, unlike the operation counters above.
      metric: 'storage',
      label: 'Storage',
      sqlExpr: 'MAX(COALESCE(r2_storage_bytes, 0))',
      allowance: PAID_ALLOWANCES.r2.storage,
      pricePerUnit: PRICING_TIERS.r2.storagePerGbMonth,
      unitDivisor: 1_000_000_000,
    },
  ],
  durableObjects: [
    {
      metric: 'requests',
      label: 'Requests',
      sqlExpr: 'SUM(COALESCE(do_requests, 0))',
      allowance: PAID_ALLOWANCES.durableObjects.requests,
      pricePerUnit: PRICING_TIERS.durableObjects.requestsPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'gb_seconds',
      label: 'GB-seconds',
      sqlExpr: 'MAX(COALESCE(do_gb_seconds, 0))',
      allowance: PAID_ALLOWANCES.durableObjects.gbSeconds,
      pricePerUnit: PRICING_TIERS.durableObjects.gbSecondsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  vectorize: [
    {
      metric: 'queried_dimensions',
      label: 'Queried Dimensions',
      sqlExpr: 'SUM(COALESCE(vectorize_queries, 0))',
      allowance: PAID_ALLOWANCES.vectorize.queriedDimensions,
      pricePerUnit: PRICING_TIERS.vectorize.queriedDimensionsPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'stored_dimensions',
      label: 'Stored Dimensions',
      sqlExpr: 'MAX(COALESCE(vectorize_vectors_stored, 0))',
      allowance: PAID_ALLOWANCES.vectorize.storedDimensions,
      pricePerUnit: PRICING_TIERS.vectorize.storedDimensionsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  workers: [
    {
      // Allowance is a literal here rather than a PAID_ALLOWANCES lookup.
      metric: 'requests',
      label: 'Requests',
      sqlExpr: 'SUM(COALESCE(workers_requests, 0))',
      allowance: 10_000_000,
      pricePerUnit: PRICING_TIERS.workers.requestsPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'cpu_ms',
      label: 'CPU Time (ms)',
      sqlExpr: 'SUM(COALESCE(workers_cpu_time_ms, 0))',
      allowance: 30_000_000,
      pricePerUnit: PRICING_TIERS.workers.cpuMsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  queues: [
    {
      // Produced and consumed messages are counted together as operations.
      metric: 'operations',
      label: 'Operations',
      sqlExpr: 'SUM(COALESCE(queues_messages_produced, 0)) + SUM(COALESCE(queues_messages_consumed, 0))',
      allowance: PAID_ALLOWANCES.queues.operations,
      pricePerUnit: PRICING_TIERS.queues.operationsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  // pages, workflows — no meaningful allowance thresholds for alerting
  // workersAI — 10K neurons/day free (daily reset, not monthly; too complex for MTD SUM check)
};
787
+
788
/**
 * Read month-to-date usage for one service from D1 and compare every metric
 * defined in SERVICE_ALLOWANCE_METRICS with its plan allowance.
 *
 * Returns "within allowance" with no metrics when the service has no metric
 * definitions, when the query yields no row, or when the query fails (the
 * failure is logged). Otherwise returns one breakdown per metric plus the
 * summed overage cost, rounded to cents.
 */
async function queryAllowanceStatus(
  env: Env,
  serviceName: string,
  billingStart: string,
  log: Logger
): Promise<AllowanceStatus> {
  const definitions = SERVICE_ALLOWANCE_METRICS[serviceName];
  if (!definitions || definitions.length === 0) {
    // No allowance definitions for this service — always "within"
    return { withinAllowance: true, metrics: [], totalOverageCost: 0 };
  }

  try {
    // One SELECT covers all metrics; each aggregate is aliased metric_<index>.
    const columns: string[] = [];
    definitions.forEach((def, idx) => {
      columns.push(`${def.sqlExpr} as metric_${idx}`);
    });
    const sql = `SELECT ${columns.join(', ')} FROM hourly_usage_snapshots WHERE project = 'all' AND DATE(snapshot_hour) >= ?`;

    const row = await env.PLATFORM_DB.prepare(sql)
      .bind(billingStart)
      .first<Record<string, number>>();

    if (!row) {
      return { withinAllowance: true, metrics: [], totalOverageCost: 0 };
    }

    const metrics: UsageMetricBreakdown[] = [];
    let withinAllowance = true;
    let overageSum = 0;

    definitions.forEach((def, idx) => {
      // A null/undefined aggregate is treated as zero usage.
      const used = row[`metric_${idx}`] ?? 0;
      const rawPct = def.allowance > 0 ? (used / def.allowance) * 100 : 0;
      const overageUnits = Math.max(0, used - def.allowance);
      const rawOverageCost = (overageUnits / def.unitDivisor) * def.pricePerUnit;

      const breakdown: UsageMetricBreakdown = {
        metric: def.metric,
        label: def.label,
        used,
        allowance: def.allowance,
        pctOfAllowance: Math.round(rawPct * 10) / 10, // one decimal place
        overageUnits,
        overageCost: Math.round(rawOverageCost * 100) / 100, // cents
      };
      metrics.push(breakdown);

      if (!(breakdown.used <= breakdown.allowance)) {
        withinAllowance = false;
      }
      overageSum = overageSum + breakdown.overageCost;
    });

    return {
      withinAllowance,
      metrics,
      totalOverageCost: Math.round(overageSum * 100) / 100,
    };
  } catch (error) {
    // On failure the service is reported as within allowance, so no alert fires.
    log.error('Failed to query allowance status', error, { service: serviceName });
    return { withinAllowance: true, metrics: [], totalOverageCost: 0 };
  }
}
847
+
848
/**
 * Compute the calendar-month billing period (in UTC) that contains `now`.
 * Cloudflare bills from the 1st to the last day of each calendar month.
 *
 * @param now - Reference instant; defaults to the current time. Passing an
 *   explicit date makes the result deterministic (useful in tests and replays).
 * @returns `start` and `end` as YYYY-MM-DD strings, `daysElapsed` as the UTC
 *   day of the month of `now`, and `daysTotal` as the number of days in that month.
 */
function getBillingPeriod(now: Date = new Date()): {
  start: string;
  end: string;
  daysElapsed: number;
  daysTotal: number;
} {
  const year = now.getUTCFullYear();
  const month = now.getUTCMonth();
  const start = new Date(Date.UTC(year, month, 1));
  // Day 0 of the following month is the last day of the current month.
  const end = new Date(Date.UTC(year, month + 1, 0));
  return {
    start: start.toISOString().slice(0, 10),
    end: end.toISOString().slice(0, 10),
    daysElapsed: now.getUTCDate(),
    daysTotal: end.getUTCDate(),
  };
}
872
+
873
/**
 * Build the list of cost alerts for the current billing period.
 *
 * For every service with an enabled threshold, usage is checked against the
 * plan allowance via queryAllowanceStatus(). Services that are within their
 * allowance are skipped entirely (a debug line is logged). For services in
 * overage, the D1-derived overage cost is graded against the threshold's
 * absoluteMax and an alert is produced when it reaches a warning band or
 * exceeds absoluteMax.
 *
 * NOTE(review): `previousCost` is read from the cached `previous` breakdown,
 * while the current figure is the D1-derived overage cost, so `costDeltaPct`
 * compares two differently sourced numbers. The `current` argument is not read.
 */
async function evaluateAlerts(
  current: CostBreakdown,
  previous: CostBreakdown | null,
  thresholds: AlertThresholds,
  env: Env,
  log: Logger
): Promise<CostSpikeAlert[]> {
  const billing = getBillingPeriod();
  const alerts: CostSpikeAlert[] = [];

  const services: (keyof CostBreakdown)[] = [
    'workers',
    'd1',
    'kv',
    'r2',
    'durableObjects',
    'vectorize',
    'pages',
    'queues',
    'workersAI',
  ];

  for (const service of services) {
    if (service === 'total') continue;

    const threshold = thresholds[service];
    if (!threshold || !threshold.enabled) continue;

    // Actual usage vs plan allowance comes from D1, not from the cached cost data.
    const allowanceStatus = await queryAllowanceStatus(env, service, billing.start, log);

    // Everything covered by the plan: nothing to alert on for this service.
    if (allowanceStatus.withinAllowance) {
      log.debug('Service within plan allowance, skipping alert', {
        service,
        metrics: allowanceStatus.metrics.map((m) => `${m.label}: ${m.pctOfAllowance}%`),
      });
      continue;
    }

    // Real overage: grade the D1-derived overage cost.
    const overageCost = allowanceStatus.totalOverageCost;
    const previousCost = previous ? previous[service] : 0;
    let costDeltaPct = 0;
    if (previousCost > 0) {
      costDeltaPct = ((overageCost - previousCost) / previousCost) * 100;
    }

    let costPercentOfMax = 0;
    if (threshold.absoluteMax > 0) {
      costPercentOfMax = (overageCost / threshold.absoluteMax) * 100;
    }

    // Highest band whose percentage is reached wins.
    let band: CostSpikeAlert['thresholdLevel'] = 'normal';
    if (costPercentOfMax >= threshold.criticalPct) {
      band = 'critical';
    } else if (costPercentOfMax >= threshold.highPct) {
      band = 'high';
    } else if (costPercentOfMax >= threshold.warningPct) {
      band = 'warning';
    }

    // Alert when overage is above $0.10 in a warning-or-higher band,
    // or whenever the overage exceeds the absolute maximum.
    const exceedsMax = threshold.absoluteMax > 0 && overageCost > threshold.absoluteMax;
    const inAlertBand = overageCost > 0.10 && band !== 'normal';
    if (!inAlertBand && !exceedsMax) continue;

    // Exceeding the absolute maximum always escalates to critical.
    const thresholdLevel: CostSpikeAlert['thresholdLevel'] = exceedsMax ? 'critical' : band;

    // Per-project and per-feature attribution run concurrently; both helpers
    // return an empty list on failure.
    const [topProjects, topFeatures] = await Promise.all([
      queryTopProjects(env, service, billing.start, log),
      queryTopFeatures(env, service, log),
    ]);

    const alert: CostSpikeAlert = {
      id: crypto.randomUUID(),
      serviceType: formatServiceName(service),
      resourceName: service,
      currentCost: overageCost,
      previousCost,
      costDeltaPct,
      costPercentOfMax,
      thresholdLevel,
      absoluteMax: threshold.absoluteMax,
      timestamp: new Date().toISOString(),
      billingPeriodStart: billing.start,
      billingPeriodEnd: billing.end,
      billingDaysElapsed: billing.daysElapsed,
      billingDaysTotal: billing.daysTotal,
      monthlyAllowance: SERVICE_ALLOWANCE_DESCRIPTIONS[service] ?? 'N/A',
      isWithinAllowance: false,
      overageCost,
      topProjects,
      topFeatures,
      usageBreakdown: allowanceStatus.metrics,
    };
    alerts.push(alert);
  }

  return alerts;
}
986
+
987
/**
 * Deliver one alert over Slack, email and the dashboard notification API.
 *
 * Slack and email are each rate limited by a marker stored in the
 * PLATFORM_ALERTS KV namespace; a marker is written only after a successful
 * send. Email is restricted to high/critical alerts that are not within the
 * plan allowance.
 *
 * NOTE(review): the dashboard notification is gated on the Slack marker read
 * at the top of this function. That marker is only written after a successful
 * Slack send, so with no webhook configured (or a failing webhook) a
 * notification is attempted on every call.
 */
async function sendAlerts(alert: CostSpikeAlert, env: Env, log: Logger): Promise<void> {
  const alertKey = `cost-spike:${alert.resourceName}`;

  // --- Slack ---
  const slackKey = `slack:${alertKey}`;
  const slackSent = await env.PLATFORM_ALERTS.get(slackKey);

  if (slackSent) {
    log.debug('Slack rate limited', { resource: alert.resourceName });
  } else if (env.SLACK_WEBHOOK_URL) {
    const slackResult = await sendSlackAlert(alert, env);
    if (!slackResult.success) {
      log.error('Slack alert failed', { resource: alert.resourceName, error: slackResult.error });
    } else {
      await env.PLATFORM_ALERTS.put(slackKey, new Date().toISOString(), {
        expirationTtl: SLACK_RATE_LIMIT_TTL,
      });
      log.info('Sent Slack alert', { resource: alert.resourceName });
    }
  }

  // --- Email (high/critical alerts that exceed the plan allowance only) ---
  const isSevere = alert.thresholdLevel === 'high' || alert.thresholdLevel === 'critical';
  if (isSevere && !alert.isWithinAllowance) {
    const emailKey = `email:${alertKey}`;
    const emailSent = await env.PLATFORM_ALERTS.get(emailKey);

    if (emailSent) {
      log.debug('Email rate limited', { resource: alert.resourceName });
    } else if (env.RESEND_API_KEY && env.ALERT_EMAIL_TO) {
      const emailResult = await sendEmailAlert(alert, env);
      if (!emailResult.success) {
        log.error('Email alert failed', { resource: alert.resourceName, error: emailResult.error });
      } else {
        await env.PLATFORM_ALERTS.put(emailKey, new Date().toISOString(), {
          expirationTtl: EMAIL_RATE_LIMIT_TTL,
        });
        log.info('Sent email alert', { resource: alert.resourceName });
      }
    }
  }

  // --- Dashboard notification (shares the Slack rate-limit marker) ---
  if (!slackSent && _rawNotificationsApi) {
    await createCostNotification(alert, env, log);
  }
}
1037
+
1038
/**
 * Post a cost-overage notification to the dashboard notifications API.
 *
 * Does nothing when the notifications binding is not available. Failures are
 * logged and never thrown, so callers are not interrupted.
 */
async function createCostNotification(
  alert: CostSpikeAlert,
  env: Env,
  log: Logger
): Promise<void> {
  if (!_rawNotificationsApi) return;

  // Threshold level -> notification priority
  const priorityByLevel: Record<CostSpikeAlert['thresholdLevel'], string> = {
    critical: 'critical',
    high: 'high',
    warning: 'medium',
    normal: 'low',
  };

  // Threshold level -> notification category
  const categoryByLevel: Record<CostSpikeAlert['thresholdLevel'], string> = {
    critical: 'error',
    high: 'error',
    warning: 'warning',
    normal: 'info',
  };

  try {
    const overage = formatCurrency(alert.overageCost);
    const payload = JSON.stringify({
      category: categoryByLevel[alert.thresholdLevel],
      source: 'sentinel',
      source_id: alert.id,
      title: `${alert.serviceType}: ${overage} overage`,
      description: `${alert.serviceType} has exceeded plan allowance. Overage cost: ${overage} (threshold: ${formatCurrency(alert.absoluteMax)})`,
      priority: priorityByLevel[alert.thresholdLevel],
      action_url: '/costs',
      action_label: 'View Costs',
      project: 'platform',
    });

    const resp = await _rawNotificationsApi!.fetch(
      'https://platform-notifications.internal/notifications',
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: payload,
      }
    );

    // The response body is always read; it is only reported on failure.
    const body = await resp.text();
    if (!resp.ok) {
      log.warn('Cost notification failed', { status: resp.status, body });
    } else {
      log.debug('Created cost notification', { resource: alert.resourceName });
    }
  } catch (error) {
    // Non-blocking - log and continue
    log.error('Failed to create cost notification', error);
  }
}
1094
+
1095
/**
 * Send a cost-overage alert to the Slack incoming webhook.
 *
 * The message contains:
 * - a header and summary fields (service, billing period, overage cost, threshold)
 * - usage-vs-allowance, top-project and top-feature sections when data is present
 * - investigation commands for follow-up
 * - links to the usage dashboard
 *
 * @param alert - The alert to render.
 * @param env - Provides SLACK_WEBHOOK_URL.
 * @returns `{ success: true }` on a 2xx response; otherwise `{ success: false, error }`.
 *   Network errors are caught and reported through the same shape.
 */
async function sendSlackAlert(
  alert: CostSpikeAlert,
  env: Env
): Promise<{ success: boolean; error?: string }> {
  const emoji = getEmoji(alert.thresholdLevel);
  const colour = getColour(alert.thresholdLevel);

  // getInvestigationCommands() keys on the raw service id (e.g. 'd1') and
  // interpolates it into a SQL column name, so it must receive resourceName,
  // not the display name held in serviceType (e.g. 'D1 Database').
  const investigationCommands = getInvestigationCommands(alert.resourceName);

  // One line per metric: status icon, used / allowance, percentage, optional overage.
  const usageBreakdownText = alert.usageBreakdown.length > 0
    ? alert.usageBreakdown.map((m) => {
        const status = m.pctOfAllowance > 100 ? ':red_circle:' : ':white_check_mark:';
        const overageText = m.overageCost > 0 ? ` \u2014 ${formatCurrency(m.overageCost)} overage` : '';
        return `${status} *${m.label}:* ${formatLargeNumber(m.used)} / ${formatLargeNumber(m.allowance)} (${m.pctOfAllowance}%)${overageText}`;
      }).join('\n')
    : '';

  const message = {
    text: `[${alert.thresholdLevel.toUpperCase()}] ${alert.serviceType}: ${formatCurrency(alert.overageCost)} overage`,
    blocks: [
      {
        type: 'header',
        text: {
          type: 'plain_text',
          text: `${emoji} ${alert.serviceType}: ${formatCurrency(alert.overageCost)} overage`,
        },
      },
      {
        type: 'section',
        fields: [
          { type: 'mrkdwn', text: `*Service:*\n${alert.serviceType}` },
          { type: 'mrkdwn', text: `*Billing Period:*\n${formatBillingPeriod(alert)}` },
          { type: 'mrkdwn', text: `*Overage Cost:*\n${formatCurrency(alert.overageCost)}` },
          { type: 'mrkdwn', text: `*Alert Threshold:*\n${formatCurrency(alert.absoluteMax)}` },
        ],
      },
      // Optional sections are spread in only when they have content.
      ...(usageBreakdownText ? [{
        type: 'section' as const,
        text: {
          type: 'mrkdwn' as const,
          text: `*Usage vs Plan Allowance:*\n${usageBreakdownText}`,
        },
      }] : []),
      ...(alert.topProjects.length > 0 ? [{
        type: 'section' as const,
        text: {
          type: 'mrkdwn' as const,
          text: `*Top Projects:*\n${alert.topProjects.map((p) =>
            `\u2022 *${p.project}*: ${formatCurrency(p.cost)} (${p.pctOfTotal}%)`
          ).join('\n')}`,
        },
      }] : []),
      ...(alert.topFeatures.length > 0 ? [{
        type: 'section' as const,
        text: {
          type: 'mrkdwn' as const,
          text: `*Top Features:*\n${alert.topFeatures.map((f) =>
            `\u2022 \`${f.featureKey}\` \u2014 ${f.usage.toLocaleString()} ops (${f.pctOfTotal}%)`
          ).join('\n')}`,
        },
      }] : []),
      {
        type: 'section',
        text: {
          type: 'mrkdwn',
          text: `*Investigation Commands:*\n\`\`\`${investigationCommands}\`\`\``,
        },
      },
      {
        type: 'context',
        elements: [
          {
            type: 'mrkdwn',
            text: `Alert ID: ${alert.id} | ${new Date(alert.timestamp).toLocaleString('en-AU')}`,
          },
        ],
      },
      {
        type: 'actions',
        elements: [
          {
            type: 'button',
            text: {
              type: 'plain_text',
              text: 'Usage Dashboard',
              emoji: true,
            },
            url: `${DASHBOARD_URL}/usage`,
          },
          {
            type: 'button',
            text: {
              type: 'plain_text',
              text: 'Usage Monitor',
              emoji: true,
            },
            url: `${DASHBOARD_URL}/usage/monitor`,
          },
        ],
      },
    ],
    attachments: [
      {
        color: colour,
        fields: [
          {
            title: 'Action Required',
            value: getActionText(alert.thresholdLevel),
            short: false,
          },
        ],
      },
    ],
  };

  try {
    const response = await fetch(env.SLACK_WEBHOOK_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(message),
    });

    if (!response.ok) {
      const text = await response.text();
      return { success: false, error: `Slack error: ${response.status} ${text}` };
    }

    return { success: true };
  } catch (error) {
    return {
      success: false,
      error: `Slack error: ${error instanceof Error ? error.message : 'Unknown'}`,
    };
  }
}
1242
+
1243
/**
 * Send a cost-overage alert email through the Resend API.
 *
 * The HTML body contains a summary table, the usage-vs-allowance breakdown and,
 * when present, the top projects and top features. Project names and feature
 * keys are read from the database, so they are HTML-escaped before being
 * placed in the markup.
 *
 * @param alert - The alert to render.
 * @param env - Provides RESEND_API_KEY and ALERT_EMAIL_TO.
 * @returns `{ success: true }` on a 2xx response; otherwise `{ success: false, error }`.
 *   Network errors are caught and reported through the same shape.
 */
async function sendEmailAlert(
  alert: CostSpikeAlert,
  env: Env
): Promise<{ success: boolean; error?: string }> {
  // Minimal HTML escaping for values that do not originate from this file.
  const escapeHtml = (value: string): string =>
    String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  const colour = getColour(alert.thresholdLevel);
  const billingPeriodText = formatBillingPeriod(alert);

  // Usage breakdown table: one row per metric, percentage coloured by severity.
  const usageBreakdownHtml = alert.usageBreakdown.length > 0 ? `
    <div style="margin-top: 15px;">
      <strong style="font-size: 14px;">Usage vs Plan Allowance</strong>
      <table style="width: 100%; border-collapse: collapse; margin-top: 8px;">
        <tr style="background: #f8f9fa;">
          <th style="padding: 8px; text-align: left; font-size: 12px;">Metric</th>
          <th style="padding: 8px; text-align: right; font-size: 12px;">Used</th>
          <th style="padding: 8px; text-align: right; font-size: 12px;">Allowance</th>
          <th style="padding: 8px; text-align: right; font-size: 12px;">%</th>
          <th style="padding: 8px; text-align: right; font-size: 12px;">Overage Cost</th>
        </tr>
        ${alert.usageBreakdown.map((m) => {
          const pctColour = m.pctOfAllowance > 100 ? '#dc3545' : m.pctOfAllowance > 75 ? '#ffc107' : '#28a745';
          return `<tr style="border-bottom: 1px solid #eee;">
          <td style="padding: 8px; font-size: 13px;">${m.label}</td>
          <td style="padding: 8px; text-align: right; font-size: 13px;">${formatLargeNumber(m.used)}</td>
          <td style="padding: 8px; text-align: right; font-size: 13px; color: #666;">${formatLargeNumber(m.allowance)}</td>
          <td style="padding: 8px; text-align: right; font-size: 13px; font-weight: bold; color: ${pctColour};">${m.pctOfAllowance}%</td>
          <td style="padding: 8px; text-align: right; font-size: 13px;">${m.overageCost > 0 ? formatCurrency(m.overageCost) : '-'}</td>
        </tr>`;
        }).join('')}
      </table>
    </div>` : '';

  const html = `
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <title>Usage Alert: ${alert.serviceType}</title>
</head>
<body style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; margin: 0; padding: 20px; background-color: #f5f5f5;">
  <div style="max-width: 600px; margin: 0 auto; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
    <div style="background-color: ${colour}; color: white; padding: 20px;">
      <h1 style="margin: 0; font-size: 20px;">[${alert.thresholdLevel.toUpperCase()}] ${alert.serviceType}: ${formatCurrency(alert.overageCost)} overage</h1>
    </div>
    <div style="padding: 20px;">
      <table style="width: 100%; border-collapse: collapse;">
        <tr><td style="padding: 10px 0; border-bottom: 1px solid #eee;"><strong>Service</strong></td><td style="padding: 10px 0; border-bottom: 1px solid #eee;">${alert.serviceType}</td></tr>
        <tr><td style="padding: 10px 0; border-bottom: 1px solid #eee;"><strong>Billing Period</strong></td><td style="padding: 10px 0; border-bottom: 1px solid #eee;">${billingPeriodText}</td></tr>
        <tr><td style="padding: 10px 0; border-bottom: 1px solid #eee;"><strong>Overage Cost</strong></td><td style="padding: 10px 0; border-bottom: 1px solid #eee; color: #dc3545; font-weight: bold;">${formatCurrency(alert.overageCost)}</td></tr>
        <tr><td style="padding: 10px 0; border-bottom: 1px solid #eee;"><strong>Alert Threshold</strong></td><td style="padding: 10px 0; border-bottom: 1px solid #eee;">${formatCurrency(alert.absoluteMax)}</td></tr>
        <tr><td style="padding: 10px 0;"><strong>Plan Allowance</strong></td><td style="padding: 10px 0;">${alert.monthlyAllowance}</td></tr>
      </table>
      <div style="margin-top: 15px; padding: 15px; background: #f8d7da; border-radius: 4px; border-left: 4px solid #dc3545;">
        <strong>&#9888; Plan Allowance Exceeded</strong>
        <p style="margin: 8px 0 0 0; color: #555; font-size: 14px;">You have exceeded your monthly plan allowance. Overage cost: ${formatCurrency(alert.overageCost)}</p>
      </div>
      ${usageBreakdownHtml}
      ${alert.topProjects.length > 0 ? `
      <div style="margin-top: 15px;">
        <strong style="font-size: 14px;">Top Projects by Cost</strong>
        <table style="width: 100%; border-collapse: collapse; margin-top: 8px;">
          ${alert.topProjects.map((p) => `
          <tr>
            <td style="padding: 6px 0; width: 40%;"><strong>${escapeHtml(p.project)}</strong></td>
            <td style="padding: 6px 0; width: 25%; text-align: right;">${formatCurrency(p.cost)}</td>
            <td style="padding: 6px 8px; width: 35%;">
              <div style="background: #e9ecef; border-radius: 3px; height: 16px; position: relative;">
                <div style="background: #0d6efd; border-radius: 3px; height: 16px; width: ${Math.min(p.pctOfTotal, 100)}%; display: flex; align-items: center; justify-content: flex-end; padding-right: 4px;">
                  <span style="color: white; font-size: 10px; font-weight: bold;">${p.pctOfTotal}%</span>
                </div>
              </div>
            </td>
          </tr>`).join('')}
        </table>
      </div>` : ''}
      ${alert.topFeatures.length > 0 ? `
      <div style="margin-top: 15px;">
        <strong style="font-size: 14px;">Top Features by Usage</strong>
        <table style="width: 100%; border-collapse: collapse; margin-top: 8px;">
          ${alert.topFeatures.map((f) => `
          <tr>
            <td style="padding: 6px 0; width: 40%; font-family: monospace; font-size: 12px;">${escapeHtml(f.featureKey)}</td>
            <td style="padding: 6px 0; width: 25%; text-align: right;">${f.usage.toLocaleString()} ops</td>
            <td style="padding: 6px 8px; width: 35%;">
              <div style="background: #e9ecef; border-radius: 3px; height: 16px; position: relative;">
                <div style="background: #6f42c1; border-radius: 3px; height: 16px; width: ${Math.min(f.pctOfTotal, 100)}%; display: flex; align-items: center; justify-content: flex-end; padding-right: 4px;">
                  <span style="color: white; font-size: 10px; font-weight: bold;">${f.pctOfTotal}%</span>
                </div>
              </div>
            </td>
          </tr>`).join('')}
        </table>
      </div>` : ''}
      <div style="margin-top: 15px; padding: 15px; background: #f8f9fa; border-radius: 4px;">
        <strong>Recommended Action:</strong>
        <p style="margin: 10px 0 0 0; color: #666;">${getActionText(alert.thresholdLevel)}</p>
      </div>
    </div>
    <div style="background: #f8f9fa; padding: 15px 20px; font-size: 12px; color: #666;">
      <p style="margin: 0;">Alert ID: ${alert.id}</p>
      <p style="margin: 5px 0 0 0;">Generated: ${new Date(alert.timestamp).toLocaleString('en-AU')}</p>
    </div>
  </div>
</body>
</html>`;

  try {
    const response = await fetch('https://api.resend.com/emails', {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${env.RESEND_API_KEY}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        from: ALERT_FROM_EMAIL,
        to: env.ALERT_EMAIL_TO,
        subject: `[${alert.thresholdLevel.toUpperCase()}] ${alert.serviceType}: ${formatCurrency(alert.overageCost)} overage (threshold: ${formatCurrency(alert.absoluteMax)})`,
        html,
      }),
    });

    if (!response.ok) {
      const text = await response.text();
      return { success: false, error: `Resend error: ${response.status} ${text}` };
    }

    return { success: true };
  } catch (error) {
    return {
      success: false,
      error: `Resend error: ${error instanceof Error ? error.message : 'Unknown'}`,
    };
  }
}
1380
+
1381
/**
 * Map a raw service key (e.g. 'd1') to its display name (e.g. 'D1 Database').
 * Keys without an entry are returned unchanged.
 */
function formatServiceName(service: string): string {
  const names: Record<string, string> = {
    workers: 'Workers',
    d1: 'D1 Database',
    kv: 'KV Storage',
    r2: 'R2 Storage',
    durableObjects: 'Durable Objects',
    vectorize: 'Vectorize',
    aiGateway: 'AI Gateway',
    pages: 'Pages',
    queues: 'Queues',
    workflows: 'Workflows',
    // evaluateAlerts() iterates 'workersAI'; without this entry the raw key
    // would appear in alert titles.
    workersAI: 'Workers AI',
  };
  return names[service] || service;
}
1399
+
1400
/**
 * Slack emoji shortcode for a threshold level; ':bell:' when the level has no entry.
 */
function getEmoji(level: CostSpikeAlert['thresholdLevel']): string {
  const byLevel: Record<string, string> = {
    critical: ':rotating_light:',
    high: ':warning:',
    warning: ':yellow_circle:',
    normal: ':white_check_mark:',
  };
  const shortcode = byLevel[level];
  return shortcode ? shortcode : ':bell:';
}
1412
+
1413
/**
 * Hex colour for a threshold level; '#17a2b8' when the level has no entry.
 */
function getColour(level: CostSpikeAlert['thresholdLevel']): string {
  const RED = '#dc3545';
  const byLevel: Record<string, string> = {
    critical: RED,
    high: RED, // same as critical
    warning: '#ffc107', // yellow
    normal: '#28a745', // green
  };
  const hex = byLevel[level];
  return hex ? hex : '#17a2b8';
}
1425
+
1426
/**
 * Build the wrangler commands shown in alerts for follow-up investigation.
 *
 * `serviceType` is interpolated into a column name (`<serviceType>_cost_usd`)
 * and used as the lookup key for the service-specific snippet, so it is
 * expected to be a raw service key such as 'd1' or 'kv', not a display name.
 * Services without a specific snippet get only the daily-rollup query.
 */
function getInvestigationCommands(serviceType: string): string {
  const rollupQuery = `# Query daily usage rollups
npx wrangler d1 execute platform-metrics --remote --command "SELECT snapshot_date, SUM(${serviceType}_cost_usd) as cost FROM daily_usage_rollups WHERE snapshot_date >= date('now', '-7 days') GROUP BY snapshot_date ORDER BY snapshot_date DESC"`;

  // Extra per-service queries, appended after the rollup query.
  const extras: Record<string, string> = {
    d1: `
# Check D1 per-feature usage
npx wrangler d1 execute platform-metrics --remote --command "SELECT feature_key, SUM(d1_writes) as writes, SUM(d1_reads) as reads FROM feature_usage_daily WHERE snapshot_date = date('now', '-1 day') GROUP BY feature_key ORDER BY writes DESC LIMIT 10"`,
    kv: `
# Check KV per-feature usage
npx wrangler d1 execute platform-metrics --remote --command "SELECT feature_key, SUM(kv_writes) as writes, SUM(kv_reads) as reads FROM feature_usage_daily WHERE snapshot_date = date('now', '-1 day') GROUP BY feature_key ORDER BY writes DESC LIMIT 10"`,
    workers: `
# Check Workers per-project usage
npx wrangler d1 execute platform-metrics --remote --command "SELECT project, SUM(workers_requests) as requests, SUM(workers_cpu_time) as cpu_ms FROM daily_usage_rollups WHERE snapshot_date = date('now', '-1 day') GROUP BY project ORDER BY requests DESC"`,
    vectorize: `
# Check Vectorize per-feature usage
npx wrangler d1 execute platform-metrics --remote --command "SELECT feature_key, SUM(vectorize_queries) as queries FROM feature_usage_daily WHERE snapshot_date = date('now', '-1 day') AND vectorize_queries > 0 GROUP BY feature_key ORDER BY queries DESC LIMIT 10"`,
  };

  const extra = extras[serviceType];
  return extra ? `${rollupQuery}${extra}` : rollupQuery;
}
1451
+
1452
/**
 * Describe the recommended follow-up for a threshold level.
 *
 * @param level - Threshold level of the cost spike alert.
 * @returns Guidance text for 'critical', 'high' and 'warning'; any other
 *   level yields 'No action required'.
 */
function getActionText(level: CostSpikeAlert['thresholdLevel']): string {
  if (level === 'critical') {
    return 'Investigate immediately - usage significantly exceeds budget';
  }
  if (level === 'high') {
    return 'Review usage patterns and consider optimisation';
  }
  if (level === 'warning') {
    return 'Monitor closely - approaching threshold';
  }
  return 'No action required';
}
1467
+
1468
/**
 * Render an amount as a dollar string with exactly two decimal places.
 *
 * @param amount - Amount to render.
 * @returns `$` followed by the amount fixed to two decimals, e.g. "$12.50".
 */
function formatCurrency(amount: number): string {
  const twoDecimals = amount.toFixed(2);
  return '$' + twoDecimals;
}
1474
+
1475
/**
 * Render a percentage with one decimal place and an explicit sign.
 *
 * @param pct - Percentage value to render.
 * @returns The value fixed to one decimal followed by `%`; values >= 0 are
 *   prefixed with `+`, negative values keep their own minus sign.
 */
function formatPercentage(pct: number): string {
  const oneDecimal = pct.toFixed(1);
  if (pct >= 0) {
    return `+${oneDecimal}%`;
  }
  return `${oneDecimal}%`;
}
1482
+
1483
/**
 * Format large numbers with K/M/B suffixes for readability.
 *
 * Values of at least 1,000 are scaled and shown with one decimal place;
 * smaller values are rounded to a whole number. When rounding would display
 * "1000" in a unit, the next unit is used instead, so 999,999 becomes "1.0M"
 * rather than "1000.0K". Negative values are scaled by magnitude and keep
 * their sign.
 *
 * @param n - Number to format.
 * @returns Compact string such as "999", "1.5K", "2.5M" or "3.0B".
 */
function formatLargeNumber(n: number): string {
  // Ordered largest first so the first unit that fits is the right one.
  const units: Array<[number, string]> = [
    [1_000_000_000, 'B'],
    [1_000_000, 'M'],
    [1_000, 'K'],
  ];

  const sign = n < 0 ? '-' : '';
  const magnitude = Math.abs(n);

  for (let i = 0; i < units.length; i++) {
    const [divisor, suffix] = units[i];
    if (magnitude < divisor) {
      continue;
    }

    const scaled = (magnitude / divisor).toFixed(1);
    // Rounding can carry into the next unit (999.95K and above); promote it.
    // There is nothing above B, so the largest unit is left as is.
    if (scaled === '1000.0' && i > 0) {
      return `${sign}1.0${units[i - 1][1]}`;
    }
    return `${sign}${scaled}${suffix}`;
  }

  const whole = magnitude.toFixed(0);
  if (whole === '1000') {
    // 999.5 up to (but excluding) 1000 rounds into the K range.
    return `${sign}1.0K`;
  }
  // Avoid rendering "-0" for small negative fractions.
  return whole === '0' ? '0' : `${sign}${whole}`;
}
1492
+
1493
/**
 * Build the human-readable billing period line for an alert.
 * Example: "1 Feb - 28 Feb 2026 (Day 6 of 28)"
 *
 * Both period boundaries are parsed as UTC midnight and read back with UTC
 * getters, so the output does not depend on the runtime's time zone. Only the
 * end date carries the year.
 *
 * @param alert - Alert providing billingPeriodStart, billingPeriodEnd,
 *   billingDaysElapsed and billingDaysTotal.
 * @returns "<day> <Mon> - <day> <Mon> <year> (Day <elapsed> of <total>)".
 */
function formatBillingPeriod(alert: CostSpikeAlert): string {
  const monthNames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
  const dayAndMonth = (date: Date): string => `${date.getUTCDate()} ${monthNames[date.getUTCMonth()]}`;

  const periodStart = new Date(alert.billingPeriodStart + 'T00:00:00Z');
  const periodEnd = new Date(alert.billingPeriodEnd + 'T00:00:00Z');

  const range = `${dayAndMonth(periodStart)} - ${dayAndMonth(periodEnd)} ${periodEnd.getUTCFullYear()}`;
  const progress = `Day ${alert.billingDaysElapsed} of ${alert.billingDaysTotal}`;

  return `${range} (${progress})`;
}
1505
+
1506
+ // =============================================================================
1507
+ // STALE HEARTBEAT DETECTION
1508
+ // =============================================================================
1509
+
1510
+ /**
1511
+ * Stale threshold in seconds: 3x the default heartbeat interval (5 minutes).
1512
+ * Features that haven't sent a heartbeat in 15 minutes are considered stale.
1513
+ */
1514
+ const STALE_THRESHOLD_SECONDS = 15 * 60;
1515
+
1516
+ /**
1517
+ * Stale heartbeat alert rate limit: 1 alert per feature per hour.
+ * Value is in seconds; it is passed as expirationTtl when the rate-limit key is stored.
1518
+ */
1519
+ const STALE_HEARTBEAT_RATE_LIMIT_TTL = 3600;
1520
+
1521
+ /**
1522
+ * Row type for stale heartbeat query results:
+ * one row of system_health_checks as selected by checkStaleHeartbeats.
1523
+ */
1524
+ interface StaleHeartbeatRow {
1525
+ project_id: string;
1526
+ feature_id: string; // used as the key for the status update and the alert rate limit
1527
+ last_heartbeat: number; // Unix epoch seconds (multiplied by 1000 before building a Date)
1528
+ age_seconds: number; // computed in the query as "now - last_heartbeat"
1529
+ status: string; // the selecting query only returns rows whose status is 'healthy'
1530
+ }
1531
+
1532
/**
 * Check for Durable Objects that have stopped sending heartbeats.
 *
 * Queries the system_health_checks table for features that:
 * 1. Have status = 'healthy'
 * 2. Haven't sent a heartbeat in STALE_THRESHOLD_SECONDS
 *
 * Each match is updated to status 'stale' (incrementing
 * consecutive_failures) and a Slack alert is fired for it.
 *
 * Errors never propagate to the caller. A failure of the initial query is
 * logged and ends the check; a failure while handling one feature is logged
 * and the remaining features are still processed.
 *
 * @param env - Worker environment; PLATFORM_DB is used here.
 * @param log - Logger for diagnostics.
 */
async function checkStaleHeartbeats(env: Env, log: Logger): Promise<void> {
  // Heartbeat timestamps are compared in whole epoch seconds.
  const now = Math.floor(Date.now() / 1000);

  let staleRows: StaleHeartbeatRow[];
  try {
    // Find healthy features that haven't sent heartbeats recently
    const staleResult = await env.PLATFORM_DB.prepare(
      `
      SELECT
        project_id,
        feature_id,
        last_heartbeat,
        ? - last_heartbeat as age_seconds,
        status
      FROM system_health_checks
      WHERE status = 'healthy' AND ? - last_heartbeat > ?
      `
    )
      .bind(now, now, STALE_THRESHOLD_SECONDS)
      .all<StaleHeartbeatRow>();

    if (!staleResult.results || staleResult.results.length === 0) {
      log.debug('No stale heartbeats detected');
      return;
    }
    staleRows = staleResult.results;
  } catch (error) {
    log.error('Failed to check stale heartbeats', error);
    return;
  }

  log.warn('Stale heartbeats detected', { count: staleRows.length });

  for (const stale of staleRows) {
    // Errors are isolated per feature so that one failing update or alert
    // does not prevent the remaining stale features from being handled.
    try {
      // Update status to 'stale'
      // NOTE(review): the row is matched on feature_id alone - confirm that
      // feature_id is unique across projects in system_health_checks.
      await env.PLATFORM_DB.prepare(
        `
        UPDATE system_health_checks
        SET status = 'stale',
            consecutive_failures = consecutive_failures + 1,
            updated_at = ?
        WHERE feature_id = ?
        `
      )
        .bind(now, stale.feature_id)
        .run();

      log.info('Marked feature as stale', {
        feature_id: stale.feature_id,
        project_id: stale.project_id,
        age_seconds: stale.age_seconds,
      });

      // Fire Slack alert (with rate limiting)
      await fireStaleHeartbeatAlert(stale, env, log);
    } catch (error) {
      log.error(`Failed to process stale heartbeat for ${stale.feature_id}`, error);
    }
  }
}
1595
+
1596
/**
 * Send the Slack alert for a single stale heartbeat.
 *
 * Flow:
 * 1. Return early when a rate-limit key for the feature already exists in
 *    env.PLATFORM_ALERTS.
 * 2. Return early when env.SLACK_WEBHOOK_URL is not configured.
 * 3. POST the message to the webhook.
 * 4. On a successful response, store the rate-limit key with a TTL of
 *    STALE_HEARTBEAT_RATE_LIMIT_TTL and, when _rawNotificationsApi is set,
 *    create a dashboard notification.
 *
 * Errors raised from the webhook call onwards are logged instead of being
 * rethrown; the initial rate-limit lookup is not guarded.
 *
 * @param stale - Row describing the feature whose heartbeat is overdue.
 * @param env - Worker environment (PLATFORM_ALERTS, SLACK_WEBHOOK_URL).
 * @param log - Logger for diagnostics.
 */
async function fireStaleHeartbeatAlert(
  stale: StaleHeartbeatRow,
  env: Env,
  log: Logger
): Promise<void> {
  const featureId = stale.feature_id;

  // Rate limit: the key is present while an earlier alert is still within its TTL.
  const rateLimitKey = `stale-heartbeat:${featureId}`;
  const earlierAlert = await env.PLATFORM_ALERTS.get(rateLimitKey);
  if (earlierAlert) {
    log.debug('Stale heartbeat alert rate limited', { feature_id: featureId });
    return;
  }

  if (!env.SLACK_WEBHOOK_URL) {
    log.debug('No SLACK_WEBHOOK_URL configured, skipping stale heartbeat alert');
    return;
  }

  const ageMinutes = Math.round(stale.age_seconds / 60);
  // last_heartbeat is in seconds; Date expects milliseconds.
  const lastHeartbeatTime = new Date(stale.last_heartbeat * 1000).toISOString();

  const headerBlock = {
    type: 'header',
    text: {
      type: 'plain_text',
      text: ':broken_heart: Stale Heartbeat Detected',
    },
  };

  const detailsBlock = {
    type: 'section',
    fields: [
      { type: 'mrkdwn', text: `*Feature:*\n${featureId}` },
      { type: 'mrkdwn', text: `*Project:*\n${stale.project_id}` },
      { type: 'mrkdwn', text: `*Last Heartbeat:*\n${lastHeartbeatTime}` },
      { type: 'mrkdwn', text: `*Age:*\n${ageMinutes} minutes` },
    ],
  };

  const commandsBlock = {
    type: 'section',
    text: {
      type: 'mrkdwn',
      text: `*Investigation Commands:*\n\`\`\`# Check DO status in D1
npx wrangler d1 execute platform-metrics --remote --command "SELECT * FROM system_health_checks WHERE feature_id = '${featureId}'"

# Check recent telemetry for this feature
npx wrangler d1 execute platform-metrics --remote --command "SELECT * FROM feature_usage_daily WHERE feature_key = '${featureId}' ORDER BY snapshot_date DESC LIMIT 5"\`\`\``,
    },
  };

  const contextBlock = {
    type: 'context',
    elements: [
      {
        type: 'mrkdwn',
        text: `Expected heartbeat interval: 5 minutes | Stale threshold: 15 minutes`,
      },
    ],
  };

  const actionsBlock = {
    type: 'actions',
    elements: [
      {
        type: 'button',
        text: {
          type: 'plain_text',
          text: 'Features Dashboard',
          emoji: true,
        },
        url: `${DASHBOARD_URL}/usage/features`,
      },
    ],
  };

  const message = {
    text: `[STALE] Durable Object ${featureId} has not sent a heartbeat in ${ageMinutes} minutes`,
    blocks: [headerBlock, detailsBlock, commandsBlock, contextBlock, actionsBlock],
    attachments: [
      {
        color: '#dc3545', // Red
        fields: [
          {
            title: 'Action Required',
            value:
              'Durable Object may be unhealthy or stopped. Check DO logs in Cloudflare dashboard and verify the worker is deployed correctly.',
            short: false,
          },
        ],
      },
    ],
  };

  try {
    const slackResponse = await fetch(env.SLACK_WEBHOOK_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(message),
    });

    if (!slackResponse.ok) {
      const failureBody = await slackResponse.text();
      log.error('Failed to send stale heartbeat Slack alert', {
        feature_id: featureId,
        status: slackResponse.status,
        error: failureBody,
      });
      return;
    }

    // The rate-limit key is only written once Slack has accepted the alert.
    await env.PLATFORM_ALERTS.put(rateLimitKey, new Date().toISOString(), {
      expirationTtl: STALE_HEARTBEAT_RATE_LIMIT_TTL,
    });
    log.info('Sent stale heartbeat Slack alert', { feature_id: featureId });

    // The dashboard notification is optional and must not affect the Slack result.
    if (!_rawNotificationsApi) {
      return;
    }

    try {
      const notificationPayload = JSON.stringify({
        category: 'warning',
        source: 'sentinel',
        source_id: featureId,
        title: `Stale Heartbeat: ${featureId}`,
        description: `Durable Object has not sent a heartbeat in ${ageMinutes} minutes. Last seen: ${lastHeartbeatTime}`,
        priority: 'high',
        action_url: '/usage/features',
        action_label: 'View Features',
        project: stale.project_id,
      });
      const notifResp = await _rawNotificationsApi.fetch(
        'https://platform-notifications.internal/notifications',
        {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: notificationPayload,
        }
      );
      // The body is read on every response; it is only logged on failure.
      const notifBody = await notifResp.text();
      if (!notifResp.ok) {
        log.warn('Stale heartbeat notification failed', { status: notifResp.status, body: notifBody });
      }
    } catch (notifError) {
      log.error('Failed to create stale heartbeat notification', notifError);
    }
  } catch (error) {
    log.error('Error sending stale heartbeat Slack alert', error);
  }
}