@littlebearapps/platform-admin-sdk 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +112 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.js +89 -0
- package/dist/prompts.d.ts +27 -0
- package/dist/prompts.js +80 -0
- package/dist/scaffold.d.ts +5 -0
- package/dist/scaffold.js +65 -0
- package/dist/templates.d.ts +16 -0
- package/dist/templates.js +131 -0
- package/package.json +46 -0
- package/templates/full/migrations/006_pattern_discovery.sql +199 -0
- package/templates/full/migrations/007_notifications_search.sql +127 -0
- package/templates/full/workers/lib/pattern-discovery/ai-prompt.ts +644 -0
- package/templates/full/workers/lib/pattern-discovery/clustering.ts +278 -0
- package/templates/full/workers/lib/pattern-discovery/shadow-evaluation.ts +603 -0
- package/templates/full/workers/lib/pattern-discovery/storage.ts +806 -0
- package/templates/full/workers/lib/pattern-discovery/types.ts +159 -0
- package/templates/full/workers/lib/pattern-discovery/validation.ts +278 -0
- package/templates/full/workers/pattern-discovery.ts +661 -0
- package/templates/full/workers/platform-alert-router.ts +1809 -0
- package/templates/full/workers/platform-notifications.ts +424 -0
- package/templates/full/workers/platform-search.ts +480 -0
- package/templates/full/workers/platform-settings.ts +436 -0
- package/templates/full/wrangler.alert-router.jsonc.hbs +34 -0
- package/templates/full/wrangler.notifications.jsonc.hbs +23 -0
- package/templates/full/wrangler.pattern-discovery.jsonc.hbs +33 -0
- package/templates/full/wrangler.search.jsonc.hbs +16 -0
- package/templates/full/wrangler.settings.jsonc.hbs +23 -0
- package/templates/shared/README.md.hbs +69 -0
- package/templates/shared/config/budgets.yaml.hbs +72 -0
- package/templates/shared/config/services.yaml.hbs +45 -0
- package/templates/shared/migrations/001_core_tables.sql +117 -0
- package/templates/shared/migrations/002_usage_warehouse.sql +830 -0
- package/templates/shared/migrations/003_feature_tracking.sql +250 -0
- package/templates/shared/migrations/004_settings_alerts.sql +452 -0
- package/templates/shared/migrations/seed.sql.hbs +4 -0
- package/templates/shared/package.json.hbs +21 -0
- package/templates/shared/scripts/sync-config.ts +242 -0
- package/templates/shared/tsconfig.json +12 -0
- package/templates/shared/workers/lib/analytics-engine.ts +357 -0
- package/templates/shared/workers/lib/billing.ts +293 -0
- package/templates/shared/workers/lib/circuit-breaker-middleware.ts +25 -0
- package/templates/shared/workers/lib/control.ts +292 -0
- package/templates/shared/workers/lib/economics.ts +368 -0
- package/templates/shared/workers/lib/metrics.ts +103 -0
- package/templates/shared/workers/lib/platform-settings.ts +407 -0
- package/templates/shared/workers/lib/shared/allowances.ts +333 -0
- package/templates/shared/workers/lib/shared/cloudflare.ts +1362 -0
- package/templates/shared/workers/lib/shared/types.ts +58 -0
- package/templates/shared/workers/lib/telemetry-sampling.ts +360 -0
- package/templates/shared/workers/lib/usage/collectors/example.ts +96 -0
- package/templates/shared/workers/lib/usage/collectors/index.ts +128 -0
- package/templates/shared/workers/lib/usage/handlers/audit.ts +306 -0
- package/templates/shared/workers/lib/usage/handlers/backfill.ts +845 -0
- package/templates/shared/workers/lib/usage/handlers/behavioral.ts +429 -0
- package/templates/shared/workers/lib/usage/handlers/data-queries.ts +507 -0
- package/templates/shared/workers/lib/usage/handlers/dlq-admin.ts +364 -0
- package/templates/shared/workers/lib/usage/handlers/health-trends.ts +222 -0
- package/templates/shared/workers/lib/usage/handlers/index.ts +35 -0
- package/templates/shared/workers/lib/usage/handlers/usage-admin.ts +421 -0
- package/templates/shared/workers/lib/usage/handlers/usage-features.ts +1262 -0
- package/templates/shared/workers/lib/usage/handlers/usage-metrics.ts +2420 -0
- package/templates/shared/workers/lib/usage/handlers/usage-settings.ts +610 -0
- package/templates/shared/workers/lib/usage/queue/budget-enforcement.ts +1032 -0
- package/templates/shared/workers/lib/usage/queue/cost-budget-enforcement.ts +128 -0
- package/templates/shared/workers/lib/usage/queue/cost-calculator.ts +77 -0
- package/templates/shared/workers/lib/usage/queue/dlq-handler.ts +161 -0
- package/templates/shared/workers/lib/usage/queue/index.ts +19 -0
- package/templates/shared/workers/lib/usage/queue/telemetry-processor.ts +790 -0
- package/templates/shared/workers/lib/usage/scheduled/anomaly-detection.ts +732 -0
- package/templates/shared/workers/lib/usage/scheduled/data-collection.ts +956 -0
- package/templates/shared/workers/lib/usage/scheduled/error-digest.ts +343 -0
- package/templates/shared/workers/lib/usage/scheduled/index.ts +18 -0
- package/templates/shared/workers/lib/usage/scheduled/rollups.ts +1561 -0
- package/templates/shared/workers/lib/usage/shared/constants.ts +362 -0
- package/templates/shared/workers/lib/usage/shared/index.ts +14 -0
- package/templates/shared/workers/lib/usage/shared/types.ts +1066 -0
- package/templates/shared/workers/lib/usage/shared/utils.ts +795 -0
- package/templates/shared/workers/platform-usage.ts +1915 -0
- package/templates/shared/wrangler.usage.jsonc.hbs +58 -0
- package/templates/standard/migrations/005_error_collection.sql +162 -0
- package/templates/standard/workers/error-collector.ts +2670 -0
- package/templates/standard/workers/lib/error-collector/capture.ts +213 -0
- package/templates/standard/workers/lib/error-collector/digest.ts +448 -0
- package/templates/standard/workers/lib/error-collector/email-health-alerts.ts +262 -0
- package/templates/standard/workers/lib/error-collector/fingerprint.ts +258 -0
- package/templates/standard/workers/lib/error-collector/gap-alerts.ts +293 -0
- package/templates/standard/workers/lib/error-collector/github.ts +329 -0
- package/templates/standard/workers/lib/error-collector/types.ts +262 -0
- package/templates/standard/workers/lib/sentinel/gap-detection.ts +734 -0
- package/templates/standard/workers/lib/shared/slack-alerts.ts +585 -0
- package/templates/standard/workers/platform-sentinel.ts +1744 -0
- package/templates/standard/wrangler.error-collector.jsonc.hbs +44 -0
- package/templates/standard/wrangler.sentinel.jsonc.hbs +45 -0
|
@@ -0,0 +1,1744 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Platform Sentinel Worker
|
|
3
|
+
*
|
|
4
|
+
* Monitors Cloudflare resource costs and sends alerts via Slack and Email
|
|
5
|
+
* when costs exceed configured thresholds or spike significantly.
|
|
6
|
+
*
|
|
7
|
+
* Runs on a cron schedule (every 15 minutes) and uses KV for rate limiting
|
|
8
|
+
* to prevent alert fatigue.
|
|
9
|
+
*
|
|
10
|
+
* @module workers/platform-sentinel
|
|
11
|
+
* @created 2026-01-05
|
|
12
|
+
* @renamed 2026-01-23 (from cost-spike-alerter)
|
|
13
|
+
* @task task-17.20 - Slack webhook alerts for cost spikes
|
|
14
|
+
* @task task-17.21 - Email alerts via Resend
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import type {
|
|
18
|
+
KVNamespace,
|
|
19
|
+
ExecutionContext,
|
|
20
|
+
ScheduledEvent,
|
|
21
|
+
D1Database,
|
|
22
|
+
Fetcher,
|
|
23
|
+
} from '@cloudflare/workers-types';
|
|
24
|
+
import {
|
|
25
|
+
withFeatureBudget,
|
|
26
|
+
withCronBudget,
|
|
27
|
+
CircuitBreakerError,
|
|
28
|
+
completeTracking,
|
|
29
|
+
MONITOR_COST_SPIKE,
|
|
30
|
+
HEARTBEAT_HEALTH,
|
|
31
|
+
createLogger,
|
|
32
|
+
createLoggerFromRequest,
|
|
33
|
+
createTraceContext,
|
|
34
|
+
health,
|
|
35
|
+
type Logger,
|
|
36
|
+
} from '@littlebearapps/platform-consumer-sdk';
|
|
37
|
+
import {
|
|
38
|
+
detectGaps,
|
|
39
|
+
storeGapReport,
|
|
40
|
+
alertGaps,
|
|
41
|
+
alertGapsEmail,
|
|
42
|
+
detectProjectGaps,
|
|
43
|
+
type ProjectGap,
|
|
44
|
+
} from './lib/sentinel/gap-detection';
|
|
45
|
+
import { pingHeartbeat } from '@littlebearapps/platform-consumer-sdk';
|
|
46
|
+
import { PAID_ALLOWANCES, PRICING_TIERS } from '@littlebearapps/platform-consumer-sdk';
|
|
47
|
+
|
|
48
|
+
/**
 * Secrets and bindings the platform-sentinel worker expects at runtime.
 */
interface Env {
  CLOUDFLARE_API_TOKEN: string;
  CLOUDFLARE_ACCOUNT_ID: string;
  SLACK_WEBHOOK_URL: string;
  RESEND_API_KEY: string;
  ALERT_EMAIL_TO: string;
  /** For system health checks */
  PLATFORM_DB: D1Database;
  /** KV namespace read for alert thresholds, cached usage data and previous costs. */
  PLATFORM_CACHE: KVNamespace;
  /** For rate limiting */
  PLATFORM_ALERTS: KVNamespace;
  /** For SDK telemetry */
  PLATFORM_TELEMETRY: Queue;
  /** Gatus heartbeat ping URL for cron monitoring */
  GATUS_HEARTBEAT_URL?: string;
  /** Bearer token for Gatus external endpoints */
  GATUS_TOKEN?: string;
  /** For creating dashboard notifications */
  NOTIFICATIONS_API?: Fetcher;
  /** For creating gap alert GitHub issues */
  ERROR_COLLECTOR?: Fetcher;
}
|
|
63
|
+
|
|
64
|
+
// TODO: Replace both placeholders below with your dashboard URL and alert sender address.
const DASHBOARD_URL = 'https://your-dashboard.example.com';
const ALERT_FROM_EMAIL = 'Usage Alerts <alerts@mail.your-domain.com>';

/**
 * Unwrapped service bindings, assigned at the top of scheduled() before the env
 * is handed to the SDK wrapper. Calling .fetch() through the SDK proxy raises
 * "Illegal invocation" on native Fetcher bindings, so the raw references are
 * kept here at module scope instead.
 */
let _rawNotificationsApi: Fetcher | undefined = undefined;
let _rawErrorCollector: Fetcher | undefined = undefined;
|
|
72
|
+
|
|
73
|
+
/**
 * Alerting thresholds for one service, as stored in KV.
 *
 * NOTE(review): the *Pct fields look like percentages of `absoluteMax`
 * (see CostSpikeAlert.costPercentOfMax) — the evaluation code is outside
 * this section, so confirm before relying on that.
 */
interface ServiceThreshold {
  /** Percentage at which the "warning" level starts. */
  warningPct: number;
  /** Percentage at which the "high" level starts. */
  highPct: number;
  /** Percentage at which the "critical" level starts. */
  criticalPct: number;
  /** Ceiling the percentages are measured against. */
  absoluteMax: number;
  /** Whether alerting is switched on for this service. */
  enabled: boolean;
}

/**
 * Threshold configuration keyed by service name (e.g. `workers`, `d1`).
 */
interface AlertThresholds {
  [key: string]: ServiceThreshold;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Cost figures split by Cloudflare service, plus an overall total.
 *
 * The D1 fallback fills these from the `*_cost_usd` columns of
 * hourly_usage_snapshots, so values from that path are US dollars.
 */
interface CostBreakdown {
  workers: number;
  d1: number;
  kv: number;
  r2: number;
  durableObjects: number;
  vectorize: number;
  aiGateway: number;
  workersAI: number;
  pages: number;
  queues: number;
  workflows: number;
  /** Sum across all services. */
  total: number;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Everything a cost alert carries to the Slack / email renderers.
 */
interface CostSpikeAlert {
  id: string;
  serviceType: string;
  resourceName: string;
  currentCost: number;
  previousCost: number;
  costDeltaPct: number;
  costPercentOfMax: number;
  thresholdLevel: 'normal' | 'warning' | 'high' | 'critical';
  absoluteMax: number;
  timestamp: string;

  // --- Billing period context ---
  billingPeriodStart: string;
  billingPeriodEnd: string;
  billingDaysElapsed: number;
  billingDaysTotal: number;

  // --- Workers Paid plan allowance context ---
  monthlyAllowance: string;
  isWithinAllowance: boolean;
  overageCost: number;

  /** Per-project cost breakdown (top contributors) */
  topProjects: { project: string; cost: number; pctOfTotal: number }[];
  /** Per-feature usage breakdown (top contributors) */
  topFeatures: { featureKey: string; usage: number; pctOfTotal: number }[];
  /** Per-metric usage vs plan allowance breakdown */
  usageBreakdown: UsageMetricBreakdown[];
}
|
|
136
|
+
|
|
137
|
+
/**
 * Human-readable summary of the monthly allowance bundled with each service,
 * keyed by service name. Used purely as descriptive text in alerts.
 */
const SERVICE_ALLOWANCE_DESCRIPTIONS: Record<string, string> = {
  // Compute and storage
  workers: '10M requests + 30M CPU-ms/mo (Workers Paid)',
  d1: '25B reads + 50M writes/mo (Workers Paid)',
  kv: '10M reads + 1M writes + 1M deletes + 1M lists/mo',
  r2: '10GB storage + 1M Class A + 10M Class B ops/mo',
  durableObjects: '1M requests + 400K GB-s/mo',
  vectorize: '10M stored + 50M queried dimensions/mo',

  // Everything else
  aiGateway: 'Free (pass-through)',
  pages: '500 builds/mo + 100GB bandwidth',
  queues: '1M operations/mo',
  workflows: 'Beta (free)',
  workersAI: 'Usage-based (10K neurons/day free)',
};
|
|
154
|
+
|
|
155
|
+
/** Percentage levels shared by every service except D1. */
const STANDARD_ALERT_PCTS = { warningPct: 50, highPct: 75, criticalPct: 90 };

/**
 * Built-in thresholds, used when KV holds no configuration (or loading fails).
 *
 * D1 alerts earlier (40/60/80) than the other services. aiGateway and
 * workflows ship disabled with an `absoluteMax` of 0.
 */
const DEFAULT_THRESHOLDS: AlertThresholds = {
  workers: { ...STANDARD_ALERT_PCTS, absoluteMax: 5, enabled: true },
  d1: { warningPct: 40, highPct: 60, criticalPct: 80, absoluteMax: 20, enabled: true },
  kv: { ...STANDARD_ALERT_PCTS, absoluteMax: 5, enabled: true },
  r2: { ...STANDARD_ALERT_PCTS, absoluteMax: 20, enabled: true },
  durableObjects: { ...STANDARD_ALERT_PCTS, absoluteMax: 10, enabled: true },
  vectorize: { ...STANDARD_ALERT_PCTS, absoluteMax: 5, enabled: true },
  aiGateway: { ...STANDARD_ALERT_PCTS, absoluteMax: 0, enabled: false },
  pages: { ...STANDARD_ALERT_PCTS, absoluteMax: 5, enabled: true },
  queues: { ...STANDARD_ALERT_PCTS, absoluteMax: 5, enabled: true },
  workflows: { ...STANDARD_ALERT_PCTS, absoluteMax: 0, enabled: false },
};
|
|
170
|
+
|
|
171
|
+
/**
 * Rate-limit windows, in seconds.
 *
 * Slack: 1 alert per resource per hour.
 * Email: 1 alert per resource per 4 hours.
 */
const SLACK_RATE_LIMIT_TTL = 60 * 60;
const EMAIL_RATE_LIMIT_TTL = 4 * 60 * 60;
|
|
180
|
+
|
|
181
|
+
/**
 * Forward per-project snapshot coverage gaps to the error-collector service
 * binding, one POST per project. Does nothing when there are no gaps or the
 * binding was not captured.
 *
 * A failure for one project is logged and does not stop the remaining ones.
 * Non-2xx replies are reported as warnings rather than being parsed and logged
 * as if the alert had been accepted.
 *
 * @param projectGaps - Result of detectProjectGaps() for this run.
 * @param log - Logger for this invocation.
 */
async function forwardProjectGapsToErrorCollector(
  projectGaps: Awaited<ReturnType<typeof detectProjectGaps>>,
  log: Logger
): Promise<void> {
  // Use the raw binding: the SDK-wrapped Fetcher throws "Illegal invocation".
  const errorCollector = _rawErrorCollector;
  if (projectGaps.length === 0 || !errorCollector) {
    return;
  }

  log.info('Detected per-project gaps, sending to error-collector', {
    projectCount: projectGaps.length,
  });

  for (const gap of projectGaps) {
    try {
      const response = await errorCollector.fetch('https://error-collector.internal/gap-alerts', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          project: gap.project,
          hoursWithData: gap.hoursWithData,
          expectedHours: gap.expectedHours,
          coveragePct: gap.coveragePct,
          missingHours: gap.missingHours,
          repository: gap.repository,
        }),
      });

      if (!response.ok) {
        // Surface rejected alerts instead of hiding them in a debug line.
        const bodyPreview = (await response.text()).slice(0, 200);
        log.warn('error-collector rejected gap alert', {
          project: gap.project,
          status: response.status,
          body_preview: bodyPreview,
        });
        continue;
      }

      const result = await response.json();
      log.debug('Gap alert result', { project: gap.project, result });
    } catch (e) {
      log.error('Failed to send gap alert to error-collector', e, {
        project: gap.project,
      });
    }
  }
}

/**
 * Cost-alerting portion of a sentinel run: load thresholds, fetch current
 * costs, compare against the previously stored costs, send any alerts, then
 * store the current costs for the next run.
 *
 * When no cost data is available the step is skipped with a debug log; that
 * is not treated as an error.
 *
 * @param trackedEnv - SDK-wrapped environment for this run.
 * @param log - Logger for this invocation.
 */
async function runCostAlertingPipeline(trackedEnv: Env, log: Logger): Promise<void> {
  // Load thresholds from KV (or use defaults)
  const thresholds = await loadThresholds(trackedEnv, log);

  // Fetch current costs from Usage API (optional - may be cache cold)
  const currentCosts = await fetchCurrentCosts(trackedEnv, log);
  if (!currentCosts) {
    // Not an error - cache may be cold (expected during cold starts or low traffic)
    log.debug('No cost data available (cache cold), skipping cost alerting', {
      hint: 'Call GET /usage on platform-usage to populate cache',
    });
    return;
  }

  // Load previous costs from KV (for delta comparison)
  const previousCosts = await loadPreviousCosts(trackedEnv, log);

  // Evaluate alerts (async — queries D1 for per-project/feature attribution)
  const alerts = await evaluateAlerts(currentCosts, previousCosts, thresholds, trackedEnv, log);
  log.info('Evaluated potential alerts', { alert_count: alerts.length });

  // Send alerts (with rate limiting)
  for (const alert of alerts) {
    await sendAlerts(alert, trackedEnv, log);
  }

  // Store current costs for next comparison
  await storeCosts(currentCosts, trackedEnv, log);
}

export default {
  /**
   * Cron trigger handler.
   *
   * Order of work: snapshot gap detection, per-project gap forwarding, stale
   * heartbeat check, cost alerting, SDK heartbeat, tracking completion, Gatus
   * success ping. A CircuitBreakerError ends the run quietly; any other error
   * pings Gatus with failure and is logged. Errors are never rethrown.
   */
  async scheduled(event: ScheduledEvent, env: Env, ctx: ExecutionContext): Promise<void> {
    const log = createLogger({ worker: 'platform-sentinel', featureId: MONITOR_COST_SPIKE });
    log.info('Cron triggered', { scheduled_time: new Date(event.scheduledTime).toISOString() });

    // Gatus heartbeat is pinged on success/fail only (no /start support)

    // CRITICAL: Capture raw Fetcher bindings BEFORE SDK wrapping.
    // The SDK triple-proxy wraps .fetch() in async wrapper causing "Illegal invocation"
    // on native Cloudflare Fetcher bindings. See platform-alert-router.ts for same pattern.
    _rawNotificationsApi = env.NOTIFICATIONS_API;
    _rawErrorCollector = env.ERROR_COLLECTOR;

    try {
      // Wrap with Platform SDK for usage tracking and circuit breaker protection
      const trackedEnv = withCronBudget(env, MONITOR_COST_SPIKE, {
        ctx,
        cronExpression: '*/15 * * * *', // Every 15 minutes
      });

      // 1. Gap detection - check for missing hourly snapshots (ALWAYS runs, independent of cost data)
      // This was previously step 8, but must run regardless of cache state (fix for task-312)
      const gaps = await detectGaps(trackedEnv, log);
      if (gaps.severity !== 'ok') {
        // Store gap report for aggregation by platform-auditor
        await storeGapReport(trackedEnv, gaps, log);
        // Send alerts
        await alertGaps(trackedEnv, gaps, log);
        await alertGapsEmail(trackedEnv, gaps, log);
      }

      // 1b. Per-project gap detection - check resource_usage_snapshots coverage
      // Creates GitHub issues in correct repo when coverage drops below 90%
      const projectGaps = await detectProjectGaps(trackedEnv, log);
      await forwardProjectGapsToErrorCollector(projectGaps, log);

      // 2. Check for stale heartbeats (DO health monitoring) - also runs always
      await checkStaleHeartbeats(trackedEnv, log);

      // 3-8. Thresholds, current/previous costs, alert evaluation, sending, storing
      await runCostAlertingPipeline(trackedEnv, log);

      // 9. Send Platform SDK heartbeat
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      await health(HEARTBEAT_HEALTH, env.PLATFORM_CACHE as any, env.PLATFORM_TELEMETRY, ctx);
      log.debug('Heartbeat sent');

      // Complete SDK tracking
      await completeTracking(trackedEnv);

      // Signal success to Gatus heartbeat
      pingHeartbeat(ctx, env.GATUS_HEARTBEAT_URL, env.GATUS_TOKEN, true);

      log.info('Completed successfully');
    } catch (error) {
      // Handle circuit breaker gracefully - skip execution
      if (error instanceof CircuitBreakerError) {
        log.warn('Circuit breaker STOP', error, { reason: error.reason });
        return;
      }

      // Signal failure to Gatus heartbeat
      pingHeartbeat(ctx, env.GATUS_HEARTBEAT_URL, env.GATUS_TOKEN, false);

      log.error('Error', error);
    }
  },

  /**
   * HTTP handler (for manual trigger / health check).
   *
   * - `/health` answers immediately without touching the SDK.
   * - `POST /trigger` runs the scheduled handler inline. Because scheduled()
   *   swallows its own errors, the response is `triggered` even if the run
   *   logged a failure.
   * - Any other path returns a small service descriptor.
   *
   * A CircuitBreakerError maps to 503 with Retry-After; anything else to 500.
   */
  async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise<Response> {
    const url = new URL(request.url);

    // Health check bypasses SDK for lightweight responses
    if (url.pathname === '/health') {
      return new Response(
        JSON.stringify({
          status: 'ok',
          service: 'platform-sentinel',
          timestamp: new Date().toISOString(),
        }),
        { headers: { 'Content-Type': 'application/json' } }
      );
    }

    // Create logger with trace context from request
    const traceContext = createTraceContext(request, env);
    const log = createLoggerFromRequest(request, env, 'platform-sentinel', MONITOR_COST_SPIKE);

    log.info('Request received', {
      method: request.method,
      path: url.pathname,
      traceId: traceContext.traceId,
    });

    try {
      // Wrap with Platform SDK for usage tracking
      const trackedEnv = withFeatureBudget(env, MONITOR_COST_SPIKE, { ctx });

      if (url.pathname === '/trigger' && request.method === 'POST') {
        // Manual trigger (for testing)
        log.info('Manual trigger requested');
        const event = {
          scheduledTime: Date.now(),
          cron: '*/15 * * * *',
          noRetry: () => {},
        } as unknown as ScheduledEvent;
        await this.scheduled(event, env, ctx);
        await completeTracking(trackedEnv);
        log.info('Manual trigger completed');
        return new Response(
          JSON.stringify({ status: 'triggered', traceId: traceContext.traceId }),
          {
            headers: { 'Content-Type': 'application/json' },
          }
        );
      }

      await completeTracking(trackedEnv);
      return new Response(
        JSON.stringify({
          service: 'platform-sentinel',
          endpoints: ['/health', '/trigger (POST)'],
        }),
        { headers: { 'Content-Type': 'application/json' } }
      );
    } catch (error) {
      if (error instanceof CircuitBreakerError) {
        log.warn('Circuit breaker tripped', error, {
          path: url.pathname,
          reason: error.reason,
        });
        return new Response(
          JSON.stringify({
            error: 'Service temporarily unavailable',
            code: 'CIRCUIT_BREAKER',
            traceId: traceContext.traceId,
          }),
          {
            status: 503,
            headers: { 'Content-Type': 'application/json', 'Retry-After': '60' },
          }
        );
      }

      // Log full error with stack trace for debugging
      log.error('Request failed', error, {
        path: url.pathname,
        method: request.method,
        traceId: traceContext.traceId,
      });

      return new Response(
        JSON.stringify({
          error: 'Internal server error',
          traceId: traceContext.traceId,
        }),
        {
          status: 500,
          headers: { 'Content-Type': 'application/json' },
        }
      );
    }
  },
};
|
|
403
|
+
|
|
404
|
+
/**
 * Load alert thresholds from KV (`alert-thresholds:config` in PLATFORM_CACHE),
 * layered over DEFAULT_THRESHOLDS.
 *
 * The merge is per service: a stored entry that sets only some fields keeps
 * the default values for the rest, instead of replacing the whole default
 * entry and leaving fields undefined. Stored entries that are not objects are
 * ignored with a warning. If the stored payload is missing, is not a JSON
 * object, or reading/parsing throws, the defaults are returned.
 *
 * @param env - Worker environment; only PLATFORM_CACHE is read.
 * @param log - Logger for this invocation.
 * @returns Thresholds covering at least every service in DEFAULT_THRESHOLDS.
 */
async function loadThresholds(env: Env, log: Logger): Promise<AlertThresholds> {
  try {
    const stored = await env.PLATFORM_CACHE.get('alert-thresholds:config');
    if (!stored) {
      return DEFAULT_THRESHOLDS;
    }

    const parsed: unknown = JSON.parse(stored);
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      log.warn('Stored alert thresholds are not a JSON object, using defaults', {
        data_type: Array.isArray(parsed) ? 'array' : typeof parsed,
      });
      return DEFAULT_THRESHOLDS;
    }

    // Start from a copy so the shared defaults object is never mutated.
    const merged: AlertThresholds = { ...DEFAULT_THRESHOLDS };
    for (const [service, override] of Object.entries(parsed as Record<string, unknown>)) {
      if (typeof override !== 'object' || override === null || Array.isArray(override)) {
        log.warn('Ignoring malformed threshold override', { service });
        continue;
      }
      merged[service] = {
        ...DEFAULT_THRESHOLDS[service],
        ...(override as Partial<ServiceThreshold>),
      } as ServiceThreshold;
    }
    return merged;
  } catch (error) {
    log.error('Failed to load thresholds from KV', error);
  }
  return DEFAULT_THRESHOLDS;
}
|
|
420
|
+
|
|
421
|
+
/**
 * Build the hourly usage cache key (must match usage-api.ts).
 * Format: usage:{period}:{project}:{hourTimestamp}
 *
 * @param period - Period segment of the key, e.g. `30d`.
 * @param project - Project segment of the key, e.g. `all`.
 * @param hourOffset - Hours to add to the current hour bucket (use -1 for the previous hour).
 * @returns The cache key for the requested hour bucket.
 */
function getUsageCacheKey(period: string, project: string, hourOffset = 0): string {
  const MS_PER_HOUR = 3_600_000;
  // Whole hours since the Unix epoch identify the bucket.
  const currentHourBucket = Math.floor(Date.now() / MS_PER_HOUR);
  return ['usage', period, project, currentHourBucket + hourOffset].join(':');
}
|
|
429
|
+
|
|
430
|
+
/**
 * Fetch current costs from Usage API
 *
 * Lookup order: the current hour's KV cache entry, then the previous hour's.
 * Whenever the cached value is missing, blank, not valid JSON, lacks a `costs`
 * property, or reading KV throws, the result of fetchCostsFromD1() is returned
 * instead. Cache is populated by platform-usage /usage endpoint calls (30-min TTL).
 *
 * @param env - Worker environment (PLATFORM_CACHE, plus whatever the D1 fallback needs).
 * @param log - Logger for this invocation.
 * @returns The cached cost breakdown, or the D1 fallback result (which may be null).
 */
async function fetchCurrentCosts(env: Env, log: Logger): Promise<CostBreakdown | null> {
  const currentCacheKey = getUsageCacheKey('30d', 'all', 0);
  const prevCacheKey = getUsageCacheKey('30d', 'all', -1);
  const computeFromD1 = () => fetchCostsFromD1(env, log);

  try {
    // Current hour first...
    let cacheKeyUsed = currentCacheKey;
    let usageData = await env.PLATFORM_CACHE.get(cacheKeyUsed);

    if (!usageData) {
      // ...then the previous hour (covers cache cold starts)
      cacheKeyUsed = prevCacheKey;
      usageData = await env.PLATFORM_CACHE.get(cacheKeyUsed);

      if (usageData) {
        log.debug('Using previous hour cache', { cache_key: prevCacheKey });
      } else {
        // Both entries missing — compute from D1 instead
        log.info('KV cache cold, falling back to D1 cost computation', {
          current_key: currentCacheKey,
          prev_key: prevCacheKey,
        });
        return computeFromD1();
      }
    }

    // Reject anything that is not a non-blank string before parsing
    const isUsableString = typeof usageData === 'string' && usageData.trim() !== '';
    if (!isUsableString) {
      log.warn('Invalid cache data (empty or non-string)', {
        cache_key: cacheKeyUsed,
        data_type: typeof usageData,
        data_length: usageData?.length ?? 0,
      });
      return computeFromD1();
    }

    // JSON errors get their own log line rather than the generic KV failure below
    let usage: { costs?: CostBreakdown };
    try {
      usage = JSON.parse(usageData);
    } catch (parseError) {
      const reason = parseError instanceof Error ? parseError.message : String(parseError);
      log.warn('Cache data is not valid JSON', {
        cache_key: cacheKeyUsed,
        data_preview: usageData.slice(0, 100),
        error: reason,
      });
      return computeFromD1();
    }

    if (usage.costs) {
      return usage.costs;
    }

    log.warn('Cache data missing costs property', {
      cache_key: cacheKeyUsed,
      available_keys: Object.keys(usage),
    });
    return computeFromD1();
  } catch (error) {
    // Unexpected errors (KV failures, etc.)
    log.error('Failed to fetch costs from KV', error, {
      current_key: currentCacheKey,
      prev_key: prevCacheKey,
    });
    return computeFromD1();
  }
}
|
|
504
|
+
|
|
505
|
+
/**
 * Compute MTD cost breakdown directly from D1 hourly_usage_snapshots.
 * Used as fallback when KV cache is cold (no recent dashboard API calls).
 * Sums the per-service cost columns already stored in each hourly row
 * (project = 'all', from the start of the current billing period).
 *
 * An aggregate query without GROUP BY always yields exactly one row; when no
 * snapshot matches, every SUM() in that row is NULL. That case is treated as
 * "no data" and returns null, instead of handing NULL costs to the caller.
 *
 * @param env - Worker environment; only PLATFORM_DB is used.
 * @param log - Logger for this invocation.
 * @returns Summed costs, or null when there is no data or the query fails.
 */
async function fetchCostsFromD1(env: Env, log: Logger): Promise<CostBreakdown | null> {
  try {
    const billing = getBillingPeriod();
    const result = await env.PLATFORM_DB.prepare(`
      SELECT
        SUM(COALESCE(workers_cost_usd, 0)) as workers,
        SUM(COALESCE(d1_cost_usd, 0)) as d1,
        SUM(COALESCE(kv_cost_usd, 0)) as kv,
        SUM(COALESCE(r2_cost_usd, 0)) as r2,
        SUM(COALESCE(do_cost_usd, 0)) as durableObjects,
        SUM(COALESCE(vectorize_cost_usd, 0)) as vectorize,
        SUM(COALESCE(aigateway_cost_usd, 0)) as aiGateway,
        SUM(COALESCE(workersai_cost_usd, 0)) as workersAI,
        SUM(COALESCE(pages_cost_usd, 0)) as pages,
        SUM(COALESCE(queues_cost_usd, 0)) as queues,
        SUM(COALESCE(workflows_cost_usd, 0)) as workflows,
        SUM(COALESCE(total_cost_usd, 0)) as total
      FROM hourly_usage_snapshots
      WHERE project = 'all' AND DATE(snapshot_hour) >= ?
    `).bind(billing.start).first<CostBreakdown>();

    // The inner COALESCE means each SUM input is never NULL, so a NULL total
    // can only mean that zero rows matched the billing period.
    if (!result || result.total == null) {
      log.warn('D1 fallback returned no data');
      return null;
    }

    log.info('Computed costs from D1 fallback', {
      total: result.total,
      billing_start: billing.start,
      source: 'd1-fallback',
    });

    return result;
  } catch (error) {
    log.error('D1 fallback cost computation failed', error);
    return null;
  }
}
|
|
548
|
+
|
|
549
|
+
/**
 * Read the cost breakdown saved by the previous run
 * (`platform-sentinel:previous-costs` in PLATFORM_CACHE).
 *
 * @param env - Worker environment; only PLATFORM_CACHE is read.
 * @param log - Logger for this invocation.
 * @returns The parsed breakdown, or null when nothing is stored or reading/parsing fails.
 */
async function loadPreviousCosts(env: Env, log: Logger): Promise<CostBreakdown | null> {
  try {
    const serialized = await env.PLATFORM_CACHE.get('platform-sentinel:previous-costs');
    return serialized ? JSON.parse(serialized) : null;
  } catch (error) {
    // A failed read only disables the delta comparison; report it and continue.
    log.error('Failed to load previous costs', error);
    return null;
  }
}
|
|
563
|
+
|
|
564
|
+
/**
 * Persist the current cost breakdown under `platform-sentinel:previous-costs`
 * so the next run can compare against it. Failures are logged, never thrown.
 *
 * @param costs - Breakdown to save.
 * @param env - Worker environment; only PLATFORM_CACHE is written.
 * @param log - Logger for this invocation.
 */
async function storeCosts(costs: CostBreakdown, env: Env, log: Logger): Promise<void> {
  const SECONDS_PER_DAY = 86400;
  const retentionSeconds = SECONDS_PER_DAY * 7; // Keep for 7 days

  try {
    const serialized = JSON.stringify(costs);
    await env.PLATFORM_CACHE.put('platform-sentinel:previous-costs', serialized, {
      expirationTtl: retentionSeconds,
    });
  } catch (error) {
    log.error('Failed to store costs', error);
  }
}
|
|
576
|
+
|
|
577
|
+
// =============================================================================
|
|
578
|
+
// PER-PROJECT / PER-FEATURE ATTRIBUTION
|
|
579
|
+
// =============================================================================
|
|
580
|
+
|
|
581
|
+
/**
 * Map service names to resource_usage_snapshots config for per-project cost attribution.
 *
 * Each entry names the `resource_type` to filter on and a SQL aggregate
 * expression that prices the summed usage columns (USD per million units).
 * Note: allowances are account-level so NOT subtracted here — this shows proportional attribution.
 *
 * Rates follow Cloudflare's published overage pricing. Two were corrected:
 * D1 rows read are $0.001 per *million* rows (previously divided by a billion,
 * under-weighting reads 1000x against writes), and Queues operations are
 * $0.40 per million (previously 0.04).
 * NOTE(review): confirm PRICING_TIERS in workers/lib/costs.ts uses the same
 * rates so attribution and headline costs stay consistent.
 */
const SERVICE_RESOURCE_CONFIG: Record<string, { resourceType: string; costExpr: string }> = {
  workers: {
    resourceType: 'worker',
    costExpr: `SUM(COALESCE(requests, 0)) / 1000000.0 * 0.30 + SUM(COALESCE(cpu_time_ms, 0)) / 1000000.0 * 0.02`,
  },
  d1: {
    resourceType: 'd1',
    costExpr: `SUM(COALESCE(rows_read, 0)) / 1000000.0 * 0.001 + SUM(COALESCE(rows_written, 0)) / 1000000.0 * 1.00`,
  },
  kv: {
    resourceType: 'kv',
    costExpr: `SUM(COALESCE(reads, 0)) / 1000000.0 * 0.50 + SUM(COALESCE(writes, 0)) / 1000000.0 * 5.00 + SUM(COALESCE(deletes, 0)) / 1000000.0 * 5.00`,
  },
  r2: {
    resourceType: 'r2',
    costExpr: `SUM(COALESCE(class_a_ops, 0)) / 1000000.0 * 4.50 + SUM(COALESCE(class_b_ops, 0)) / 1000000.0 * 0.36`,
  },
  durableObjects: {
    resourceType: 'do',
    costExpr: `SUM(COALESCE(requests, 0)) / 1000000.0 * 0.15 + SUM(COALESCE(gb_seconds, 0)) / 1000000.0 * 12.50`,
  },
  queues: {
    resourceType: 'queues',
    costExpr: `(SUM(COALESCE(reads, 0)) + SUM(COALESCE(writes, 0))) / 1000000.0 * 0.40`,
  },
  // vectorize: excluded — resource_usage_snapshots only stores vector storage, not queried dimensions
  // pages: excluded — no meaningful per-project cost metrics
  // aiGateway: excluded — free service
  // workersAI: excluded — neurons tracked but not per-project in resource_usage_snapshots
};
|
|
616
|
+
|
|
617
|
+
/**
 * Primary usage column in feature_usage_daily for each service.
 *
 * queryTopFeatures() interpolates the chosen column name directly into SQL,
 * so every value here must be a trusted, literal column identifier.
 * Services without an entry get no per-feature breakdown.
 */
const SERVICE_FEATURE_COLUMN: Record<string, string> = {
  // Request-driven services
  workers: 'requests',

  // Storage services — ranked by their write-side / class A metric
  d1: 'd1_writes',
  kv: 'kv_writes',
  r2: 'r2_class_a',

  // Stateful and static hosting
  durableObjects: 'do_requests',
  pages: 'requests',

  // Messaging and inference
  queues: 'queue_messages',
  workersAI: 'ai_neurons',
};
|
|
631
|
+
|
|
632
|
+
/**
 * Query the per-project cost breakdown for one service.
 *
 * Reads resource_usage_snapshots and prices usage on the fly with the
 * service's `costExpr` from SERVICE_RESOURCE_CONFIG. The '_unattributed' and
 * 'unknown' projects are excluded, as is any project costing $0.001 or less.
 *
 * @param env - Worker bindings; only PLATFORM_DB is used.
 * @param serviceName - Key into SERVICE_RESOURCE_CONFIG (e.g. 'd1', 'kv').
 * @param billingStart - Lower bound compared against `snapshot_hour`.
 * @param log - Logger used to report query failures.
 * @returns At most five projects, most expensive first. `pctOfTotal` is the
 *   project's share of the cost of ALL qualifying projects (not only the five
 *   returned), rounded to a whole percent. Returns [] when the service has no
 *   config, nothing matches, or the query fails (the failure is logged).
 */
async function queryTopProjects(
  env: Env,
  serviceName: string,
  billingStart: string,
  log: Logger
): Promise<Array<{ project: string; cost: number; pctOfTotal: number }>> {
  const config = SERVICE_RESOURCE_CONFIG[serviceName];
  if (!config) return []; // Vectorize, Pages, AI Gateway — no per-project data available

  const maxProjects = 5;

  try {
    // No LIMIT in SQL: the percentage denominator must cover every qualifying
    // project, otherwise the top rows always add up to ~100% and overstate
    // each project's share. Truncation to the top rows happens below.
    const result = await env.PLATFORM_DB.prepare(
      `SELECT project, (${config.costExpr}) as cost
       FROM resource_usage_snapshots
       WHERE resource_type = ?
         AND snapshot_hour >= ?
         AND project NOT IN ('_unattributed', 'unknown')
       GROUP BY project
       HAVING cost > 0.001
       ORDER BY cost DESC`
    )
      .bind(config.resourceType, billingStart)
      .all<{ project: string; cost: number }>();

    const rows = result.results ?? [];
    if (rows.length === 0) return [];

    const totalCost = rows.reduce((sum, r) => sum + r.cost, 0);
    return rows.slice(0, maxProjects).map((r) => ({
      project: r.project,
      cost: r.cost,
      pctOfTotal: totalCost > 0 ? Math.round((r.cost / totalCost) * 100) : 0,
    }));
  } catch (error) {
    // Attribution is supplementary detail; a failure must not block the alert.
    log.error('Failed to query top projects', error, { service: serviceName });
    return [];
  }
}
|
|
674
|
+
|
|
675
|
+
/**
 * Query the per-feature usage breakdown for one service.
 *
 * Aggregates the service's primary usage column (from SERVICE_FEATURE_COLUMN)
 * in feature_usage_daily over the trailing 7 days. Note that this is a fixed
 * window, independent of the billing period.
 *
 * @param env - Worker bindings; only PLATFORM_DB is used.
 * @param serviceName - Key into SERVICE_FEATURE_COLUMN.
 * @param log - Logger used to report query failures.
 * @returns At most five features, highest usage first. `pctOfTotal` is the
 *   feature's share of the usage of ALL features with non-zero usage (not only
 *   the five returned), rounded to a whole percent. Returns [] when the
 *   service has no mapped column, nothing matches, or the query fails (the
 *   failure is logged).
 */
async function queryTopFeatures(
  env: Env,
  serviceName: string,
  log: Logger
): Promise<Array<{ featureKey: string; usage: number; pctOfTotal: number }>> {
  // The column name comes from a fixed in-file map, never from user input,
  // which is what makes interpolating it into the SQL below acceptable.
  const usageCol = SERVICE_FEATURE_COLUMN[serviceName];
  if (!usageCol) return [];

  const maxFeatures = 5;

  try {
    // No LIMIT in SQL: the percentage denominator must cover every feature
    // with usage, otherwise the top rows always add up to ~100%. Truncation
    // to the top rows happens below.
    const result = await env.PLATFORM_DB.prepare(
      `SELECT feature_key, SUM(${usageCol}) as usage
       FROM feature_usage_daily
       WHERE usage_date >= date('now', '-7 days')
         AND ${usageCol} > 0
       GROUP BY feature_key
       ORDER BY usage DESC`
    )
      .all<{ feature_key: string; usage: number }>();

    const rows = result.results ?? [];
    if (rows.length === 0) return [];

    const totalUsage = rows.reduce((sum, r) => sum + r.usage, 0);
    return rows.slice(0, maxFeatures).map((r) => ({
      featureKey: r.feature_key,
      usage: r.usage,
      pctOfTotal: totalUsage > 0 ? Math.round((r.usage / totalUsage) * 100) : 0,
    }));
  } catch (error) {
    // Attribution is supplementary detail; a failure must not block the alert.
    log.error('Failed to query top features', error, { service: serviceName });
    return [];
  }
}
|
|
712
|
+
|
|
713
|
+
// =============================================================================
|
|
714
|
+
// ALLOWANCE STATUS (Direct D1 query for accurate usage-vs-allowance)
|
|
715
|
+
// =============================================================================
|
|
716
|
+
|
|
717
|
+
/**
 * Usage of a single billable metric compared with its plan allowance.
 * One entry is produced per metric definition by queryAllowanceStatus().
 */
interface UsageMetricBreakdown {
  /** Machine-readable metric id, e.g. 'rows_read'. */
  metric: string;
  /** Human-readable name shown in alerts, e.g. 'Rows Read'. */
  label: string;
  /** Aggregated usage for the period, in raw metric units. */
  used: number;
  /** Included plan allowance, in the same units as `used`. */
  allowance: number;
  /** `used` as a percentage of `allowance`, rounded to one decimal place (0 when the allowance is 0). */
  pctOfAllowance: number;
  /** Units consumed beyond the allowance; never negative. */
  overageUnits: number;
  /** Cost of `overageUnits`, rounded to two decimal places. */
  overageCost: number;
}
|
|
729
|
+
|
|
730
|
+
/**
 * Outcome of comparing one service's usage with its plan allowances.
 * evaluateAlerts() uses `withinAllowance` to decide whether to consider an
 * alert for the service at all.
 */
interface AllowanceStatus {
  /** True only when every metric of the service is at or below its allowance. */
  withinAllowance: boolean;
  /** One breakdown entry per metric; empty when no usage data was evaluated. */
  metrics: UsageMetricBreakdown[];
  /** Sum of the per-metric overage costs, rounded to two decimal places. */
  totalOverageCost: number;
}
|
|
741
|
+
|
|
742
|
+
/**
 * Allowance-check definitions, keyed by service.
 *
 * Each metric entry carries:
 * - `metric` / `label`: identifier and display name,
 * - `sqlExpr`: aggregate over hourly_usage_snapshots columns that yields usage,
 * - `allowance`: included plan quota, in raw units,
 * - `pricePerUnit` and `unitDivisor`: overage is priced as
 *   (overage units / unitDivisor) * pricePerUnit.
 *
 * Storage-style metrics are aggregated with MAX rather than SUM.
 * NOTE(review): `do_gb_seconds` also uses MAX here, while
 * SERVICE_RESOURCE_CONFIG sums `gb_seconds` — confirm which is intended.
 */
const SERVICE_ALLOWANCE_METRICS: Record<string, Array<{
  metric: string;
  label: string;
  sqlExpr: string;
  allowance: number;
  pricePerUnit: number;
  unitDivisor: number;
}>> = {
  d1: [
    {
      metric: 'rows_read',
      label: 'Rows Read',
      sqlExpr: 'SUM(COALESCE(d1_rows_read, 0))',
      allowance: PAID_ALLOWANCES.d1.rowsRead,
      pricePerUnit: PRICING_TIERS.d1.rowsReadPerBillion,
      unitDivisor: 1_000_000_000,
    },
    {
      metric: 'rows_written',
      label: 'Rows Written',
      sqlExpr: 'SUM(COALESCE(d1_rows_written, 0))',
      allowance: PAID_ALLOWANCES.d1.rowsWritten,
      pricePerUnit: PRICING_TIERS.d1.rowsWrittenPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  kv: [
    {
      metric: 'reads',
      label: 'Reads',
      sqlExpr: 'SUM(COALESCE(kv_reads, 0))',
      allowance: PAID_ALLOWANCES.kv.reads,
      pricePerUnit: PRICING_TIERS.kv.readsPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'writes',
      label: 'Writes',
      sqlExpr: 'SUM(COALESCE(kv_writes, 0))',
      allowance: PAID_ALLOWANCES.kv.writes,
      pricePerUnit: PRICING_TIERS.kv.writesPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'deletes',
      label: 'Deletes',
      sqlExpr: 'SUM(COALESCE(kv_deletes, 0))',
      allowance: PAID_ALLOWANCES.kv.deletes,
      pricePerUnit: PRICING_TIERS.kv.deletesPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'list_ops',
      label: 'List Ops',
      sqlExpr: 'SUM(COALESCE(kv_list_ops, 0))',
      allowance: PAID_ALLOWANCES.kv.lists,
      pricePerUnit: PRICING_TIERS.kv.listsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  r2: [
    {
      metric: 'class_a',
      label: 'Class A Ops',
      sqlExpr: 'SUM(COALESCE(r2_class_a_ops, 0))',
      allowance: PAID_ALLOWANCES.r2.classA,
      pricePerUnit: PRICING_TIERS.r2.classAPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'class_b',
      label: 'Class B Ops',
      sqlExpr: 'SUM(COALESCE(r2_class_b_ops, 0))',
      allowance: PAID_ALLOWANCES.r2.classB,
      pricePerUnit: PRICING_TIERS.r2.classBPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'storage',
      label: 'Storage',
      sqlExpr: 'MAX(COALESCE(r2_storage_bytes, 0))',
      allowance: PAID_ALLOWANCES.r2.storage,
      pricePerUnit: PRICING_TIERS.r2.storagePerGbMonth,
      unitDivisor: 1_000_000_000,
    },
  ],
  durableObjects: [
    {
      metric: 'requests',
      label: 'Requests',
      sqlExpr: 'SUM(COALESCE(do_requests, 0))',
      allowance: PAID_ALLOWANCES.durableObjects.requests,
      pricePerUnit: PRICING_TIERS.durableObjects.requestsPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'gb_seconds',
      label: 'GB-seconds',
      sqlExpr: 'MAX(COALESCE(do_gb_seconds, 0))',
      allowance: PAID_ALLOWANCES.durableObjects.gbSeconds,
      pricePerUnit: PRICING_TIERS.durableObjects.gbSecondsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  vectorize: [
    {
      metric: 'queried_dimensions',
      label: 'Queried Dimensions',
      sqlExpr: 'SUM(COALESCE(vectorize_queries, 0))',
      allowance: PAID_ALLOWANCES.vectorize.queriedDimensions,
      pricePerUnit: PRICING_TIERS.vectorize.queriedDimensionsPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'stored_dimensions',
      label: 'Stored Dimensions',
      sqlExpr: 'MAX(COALESCE(vectorize_vectors_stored, 0))',
      allowance: PAID_ALLOWANCES.vectorize.storedDimensions,
      pricePerUnit: PRICING_TIERS.vectorize.storedDimensionsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  workers: [
    // Workers allowances are literals here rather than PAID_ALLOWANCES entries.
    {
      metric: 'requests',
      label: 'Requests',
      sqlExpr: 'SUM(COALESCE(workers_requests, 0))',
      allowance: 10_000_000,
      pricePerUnit: PRICING_TIERS.workers.requestsPerMillion,
      unitDivisor: 1_000_000,
    },
    {
      metric: 'cpu_ms',
      label: 'CPU Time (ms)',
      sqlExpr: 'SUM(COALESCE(workers_cpu_time_ms, 0))',
      allowance: 30_000_000,
      pricePerUnit: PRICING_TIERS.workers.cpuMsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  queues: [
    {
      metric: 'operations',
      label: 'Operations',
      sqlExpr: 'SUM(COALESCE(queues_messages_produced, 0)) + SUM(COALESCE(queues_messages_consumed, 0))',
      allowance: PAID_ALLOWANCES.queues.operations,
      pricePerUnit: PRICING_TIERS.queues.operationsPerMillion,
      unitDivisor: 1_000_000,
    },
  ],
  // pages, workflows — no meaningful allowance thresholds for alerting
  // workersAI — 10K neurons/day free (daily reset, not monthly; too complex for MTD SUM check)
};
|
|
787
|
+
|
|
788
|
+
/**
 * Compare a service's usage since `billingStart` with its plan allowances.
 *
 * Runs one aggregate query against hourly_usage_snapshots (rows where
 * project = 'all') covering every metric defined for the service in
 * SERVICE_ALLOWANCE_METRICS, then derives percentage used, overage units and
 * overage cost per metric.
 *
 * @param env - Worker bindings; only PLATFORM_DB is used.
 * @param serviceName - Key into SERVICE_ALLOWANCE_METRICS.
 * @param billingStart - Date compared against DATE(snapshot_hour).
 * @param log - Logger used to report query failures.
 * @returns The allowance status. Reports "within allowance" with no metrics
 *   when the service has no metric definitions, the query yields no row, or
 *   the query throws (the failure is logged) — i.e. this fails open.
 */
async function queryAllowanceStatus(
  env: Env,
  serviceName: string,
  billingStart: string,
  log: Logger
): Promise<AllowanceStatus> {
  // Result used whenever usage cannot be evaluated.
  const nothingToReport = (): AllowanceStatus => ({
    withinAllowance: true,
    metrics: [],
    totalOverageCost: 0,
  });

  const metricDefs = SERVICE_ALLOWANCE_METRICS[serviceName];
  if (!metricDefs?.length) {
    // Services with no entry in SERVICE_ALLOWANCE_METRICS — always "within"
    return nothingToReport();
  }

  try {
    // One SELECT with a positional alias (metric_0, metric_1, ...) per metric.
    const columns = metricDefs.map((def, idx) => `${def.sqlExpr} as metric_${idx}`);
    const row = await env.PLATFORM_DB.prepare(
      `SELECT ${columns.join(', ')} FROM hourly_usage_snapshots WHERE project = 'all' AND DATE(snapshot_hour) >= ?`
    )
      .bind(billingStart)
      .first<Record<string, number>>();

    if (!row) {
      return nothingToReport();
    }

    const metrics: UsageMetricBreakdown[] = [];
    let withinAllowance = true;
    let overageSum = 0;

    metricDefs.forEach((def, idx) => {
      // Missing/NULL aggregates count as zero usage.
      const used = row[`metric_${idx}`] ?? 0;
      const rawPct = def.allowance > 0 ? (used / def.allowance) * 100 : 0;
      const overageUnits = Math.max(0, used - def.allowance);
      const rawCost = (overageUnits / def.unitDivisor) * def.pricePerUnit;
      const overageCost = Math.round(rawCost * 100) / 100; // cents

      metrics.push({
        metric: def.metric,
        label: def.label,
        used,
        allowance: def.allowance,
        pctOfAllowance: Math.round(rawPct * 10) / 10, // one decimal place
        overageUnits,
        overageCost,
      });

      withinAllowance = withinAllowance && used <= def.allowance;
      overageSum += overageCost;
    });

    return {
      withinAllowance,
      metrics,
      totalOverageCost: Math.round(overageSum * 100) / 100,
    };
  } catch (error) {
    log.error('Failed to query allowance status', error, { service: serviceName });
    return nothingToReport();
  }
}
|
|
847
|
+
|
|
848
|
+
/**
 * Compute the calendar-month billing period (in UTC) that contains `now`.
 * Cloudflare bills from the 1st to the last day of each calendar month.
 *
 * @param now - Instant to evaluate; defaults to the current time. Passing an
 *   explicit value makes the result deterministic (useful for tests and
 *   back-dated reports).
 * @returns `start` and `end` as 'YYYY-MM-DD' strings for the first and last day
 *   of the month, `daysElapsed` as the UTC day-of-month of `now` (1-based, so
 *   the current day counts as elapsed), and `daysTotal` as the month length.
 */
function getBillingPeriod(now: Date = new Date()): {
  start: string;
  end: string;
  daysElapsed: number;
  daysTotal: number;
} {
  const year = now.getUTCFullYear();
  const month = now.getUTCMonth();
  const start = new Date(Date.UTC(year, month, 1));
  // Day 0 of the following month is the last day of this month.
  const end = new Date(Date.UTC(year, month + 1, 0));
  return {
    start: start.toISOString().slice(0, 10),
    end: end.toISOString().slice(0, 10),
    daysElapsed: now.getUTCDate(),
    daysTotal: end.getUTCDate(),
  };
}
|
|
872
|
+
|
|
873
|
+
/**
 * Build the list of cost alerts for the current billing period.
 *
 * For every service with an enabled threshold:
 *  1. Usage is checked against the plan allowance via queryAllowanceStatus().
 *  2. If every metric is within its allowance the service is skipped
 *     entirely — no alert of any level is produced for it.
 *  3. Otherwise the overage cost is graded against the threshold's
 *     warning/high/critical percentages of `absoluteMax`, and an alert is
 *     emitted when the grade is at least 'warning' with more than $0.10 of
 *     overage, or when the overage exceeds `absoluteMax` (forced 'critical').
 *
 * @param current - Current cost breakdown. Not read by this function; the
 *   overage cost comes from queryAllowanceStatus() instead.
 * @param previous - Earlier cost breakdown used for `costDeltaPct`, or null.
 * @param thresholds - Per-service alert thresholds.
 * @param env - Worker bindings, passed through to the query helpers.
 * @param log - Logger.
 * @returns Alerts in the fixed service order below; empty when nothing fires.
 */
async function evaluateAlerts(
  current: CostBreakdown,
  previous: CostBreakdown | null,
  thresholds: AlertThresholds,
  env: Env,
  log: Logger
): Promise<CostSpikeAlert[]> {
  const billing = getBillingPeriod();
  const alerts: CostSpikeAlert[] = [];

  const services: (keyof CostBreakdown)[] = [
    'workers',
    'd1',
    'kv',
    'r2',
    'durableObjects',
    'vectorize',
    'pages',
    'queues',
    'workersAI',
  ];

  for (const service of services) {
    // 'total' is not in the list above; presumably this guard exists to narrow
    // the key type for the lookups below — keep it.
    if (service === 'total') continue;

    const threshold = thresholds[service];
    if (!threshold?.enabled) continue;

    // Allowance check comes from a direct usage query, not the cached costs.
    const allowanceStatus = await queryAllowanceStatus(env, service, billing.start, log);

    if (allowanceStatus.withinAllowance) {
      // Covered by the plan inclusion: nothing to alert on.
      log.debug('Service within plan allowance, skipping alert', {
        service,
        metrics: allowanceStatus.metrics.map((m) => `${m.label}: ${m.pctOfAllowance}%`),
      });
      continue;
    }

    // Real overage: work from the query-derived overage cost.
    const overageCost = allowanceStatus.totalOverageCost;
    const previousCost = previous ? previous[service] : 0;
    const costDeltaPct = previousCost > 0 ? ((overageCost - previousCost) / previousCost) * 100 : 0;

    const costPercentOfMax =
      threshold.absoluteMax > 0 ? (overageCost / threshold.absoluteMax) * 100 : 0;
    const exceedsMax = threshold.absoluteMax > 0 && overageCost > threshold.absoluteMax;

    // Grade the overage, most severe band first.
    let level: CostSpikeAlert['thresholdLevel'];
    if (costPercentOfMax >= threshold.criticalPct) {
      level = 'critical';
    } else if (costPercentOfMax >= threshold.highPct) {
      level = 'high';
    } else if (costPercentOfMax >= threshold.warningPct) {
      level = 'warning';
    } else {
      level = 'normal';
    }

    // Fire when the overage is non-trivial and graded, or when it tops the max.
    const gradedOverage = overageCost > 0.10 && level !== 'normal';
    if (!gradedOverage && !exceedsMax) continue;

    // Exceeding the absolute maximum always escalates to critical.
    if (exceedsMax) {
      level = 'critical';
    }

    // Attribution lookups run concurrently; each returns [] on failure, so
    // they cannot prevent the alert from being emitted.
    const [topProjects, topFeatures] = await Promise.all([
      queryTopProjects(env, service, billing.start, log),
      queryTopFeatures(env, service, log),
    ]);

    alerts.push({
      id: crypto.randomUUID(),
      serviceType: formatServiceName(service),
      resourceName: service,
      currentCost: overageCost,
      previousCost,
      costDeltaPct,
      costPercentOfMax,
      thresholdLevel: level,
      absoluteMax: threshold.absoluteMax,
      timestamp: new Date().toISOString(),
      billingPeriodStart: billing.start,
      billingPeriodEnd: billing.end,
      billingDaysElapsed: billing.daysElapsed,
      billingDaysTotal: billing.daysTotal,
      monthlyAllowance: SERVICE_ALLOWANCE_DESCRIPTIONS[service] ?? 'N/A',
      isWithinAllowance: false,
      overageCost,
      topProjects,
      topFeatures,
      usageBreakdown: allowanceStatus.metrics,
    });
  }

  return alerts;
}
|
|
986
|
+
|
|
987
|
+
/**
 * Deliver one alert to Slack, email and the dashboard, rate limiting each
 * channel independently through keys in the PLATFORM_ALERTS KV namespace.
 *
 * - Slack: sent when SLACK_WEBHOOK_URL is set and no `slack:` key exists.
 * - Email: only for 'high'/'critical' alerts that are not within allowance,
 *   when RESEND_API_KEY and ALERT_EMAIL_TO are set and no `email:` key exists.
 * - Dashboard: created when the notifications API is available and no
 *   `dashboard:` key exists.
 *
 * A Slack or email key is written only after a successful send, so a failed
 * delivery is retried on the next run.
 *
 * @param alert - The alert to deliver.
 * @param env - Worker bindings and channel configuration.
 * @param log - Logger.
 */
async function sendAlerts(alert: CostSpikeAlert, env: Env, log: Logger): Promise<void> {
  const alertKey = `cost-spike:${alert.resourceName}`;

  // Check Slack rate limit
  const slackKey = `slack:${alertKey}`;
  const slackSent = await env.PLATFORM_ALERTS.get(slackKey);

  if (!slackSent && env.SLACK_WEBHOOK_URL) {
    const slackResult = await sendSlackAlert(alert, env);
    if (slackResult.success) {
      await env.PLATFORM_ALERTS.put(slackKey, new Date().toISOString(), {
        expirationTtl: SLACK_RATE_LIMIT_TTL,
      });
      log.info('Sent Slack alert', { resource: alert.resourceName });
    } else {
      log.error('Slack alert failed', { resource: alert.resourceName, error: slackResult.error });
    }
  } else if (slackSent) {
    log.debug('Slack rate limited', { resource: alert.resourceName });
  }

  // Check Email rate limit (only for high/critical that EXCEED plan allowance).
  // evaluateAlerts() only emits alerts with isWithinAllowance false, but guard explicitly.
  if ((alert.thresholdLevel === 'high' || alert.thresholdLevel === 'critical') && !alert.isWithinAllowance) {
    const emailKey = `email:${alertKey}`;
    const emailSent = await env.PLATFORM_ALERTS.get(emailKey);

    if (!emailSent && env.RESEND_API_KEY && env.ALERT_EMAIL_TO) {
      const emailResult = await sendEmailAlert(alert, env);
      if (emailResult.success) {
        await env.PLATFORM_ALERTS.put(emailKey, new Date().toISOString(), {
          expirationTtl: EMAIL_RATE_LIMIT_TTL,
        });
        log.info('Sent email alert', { resource: alert.resourceName });
      } else {
        log.error('Email alert failed', { resource: alert.resourceName, error: emailResult.error });
      }
    } else if (emailSent) {
      log.debug('Email rate limited', { resource: alert.resourceName });
    }
  }

  // Create dashboard notification, rate limited with its own key (same TTL as
  // Slack). It must not piggyback on the Slack key: that key is only written
  // after a successful Slack send, so with Slack unconfigured or failing a
  // new notification would be created on every run.
  if (_rawNotificationsApi) {
    const dashboardKey = `dashboard:${alertKey}`;
    const dashboardSent = await env.PLATFORM_ALERTS.get(dashboardKey);

    if (!dashboardSent) {
      await createCostNotification(alert, env, log);
      await env.PLATFORM_ALERTS.put(dashboardKey, new Date().toISOString(), {
        expirationTtl: SLACK_RATE_LIMIT_TTL,
      });
    } else {
      log.debug('Dashboard notification rate limited', { resource: alert.resourceName });
    }
  }
}
|
|
1037
|
+
|
|
1038
|
+
/**
 * Post a cost alert to the dashboard notifications service.
 *
 * Best effort: a non-OK response is logged as a warning and a thrown error is
 * logged and swallowed, so this never rejects. Does nothing when the
 * notifications API binding is unavailable.
 *
 * @param alert - The alert to surface on the dashboard.
 * @param env - Worker bindings (not read by this function).
 * @param log - Logger used for the outcome.
 */
async function createCostNotification(
  alert: CostSpikeAlert,
  env: Env,
  log: Logger
): Promise<void> {
  const notificationsApi = _rawNotificationsApi;
  if (!notificationsApi) return;

  // Threshold level -> notification priority
  const priorityByLevel: Record<CostSpikeAlert['thresholdLevel'], string> = {
    critical: 'critical',
    high: 'high',
    warning: 'medium',
    normal: 'low',
  };

  // Threshold level -> notification category
  const categoryByLevel: Record<CostSpikeAlert['thresholdLevel'], string> = {
    critical: 'error',
    high: 'error',
    warning: 'warning',
    normal: 'info',
  };

  try {
    const payload = {
      category: categoryByLevel[alert.thresholdLevel],
      source: 'sentinel',
      source_id: alert.id,
      title: `${alert.serviceType}: ${formatCurrency(alert.overageCost)} overage`,
      description: `${alert.serviceType} has exceeded plan allowance. Overage cost: ${formatCurrency(alert.overageCost)} (threshold: ${formatCurrency(alert.absoluteMax)})`,
      priority: priorityByLevel[alert.thresholdLevel],
      action_url: '/costs',
      action_label: 'View Costs',
      project: 'platform',
    };

    const resp = await notificationsApi.fetch(
      'https://platform-notifications.internal/notifications',
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
      }
    );

    // The body is always read, including on success.
    const body = await resp.text();
    if (!resp.ok) {
      log.warn('Cost notification failed', { status: resp.status, body });
      return;
    }
    log.debug('Created cost notification', { resource: alert.resourceName });
  } catch (error) {
    // Non-blocking - log and continue
    log.error('Failed to create cost notification', error);
  }
}
|
|
1094
|
+
|
|
1095
|
+
/**
 * Post a cost alert to the Slack webhook in env.SLACK_WEBHOOK_URL.
 *
 * The message contains, in order:
 * - a header and summary fields (service, billing period, overage cost,
 *   alert threshold),
 * - a usage-vs-allowance section (when the alert has a usage breakdown),
 * - top projects and top features (when present),
 * - investigation commands for the service,
 * - the alert id and timestamp,
 * - buttons linking to the usage dashboard and usage monitor,
 * - a coloured attachment with the required action.
 *
 * @param alert - The alert to render.
 * @param env - Worker bindings; SLACK_WEBHOOK_URL is read.
 * @returns `{ success: true }` on a 2xx response; otherwise
 *   `{ success: false, error }` describing the HTTP status and body, or the
 *   thrown error's message. Never rejects.
 */
async function sendSlackAlert(
  alert: CostSpikeAlert,
  env: Env
): Promise<{ success: boolean; error?: string }> {
  const emoji = getEmoji(alert.thresholdLevel);
  const colour = getColour(alert.thresholdLevel);

  // Build investigation commands based on service type
  const investigationCommands = getInvestigationCommands(alert.serviceType);

  // One line per metric; empty string when there is no breakdown.
  const usageBreakdownText = alert.usageBreakdown
    .map((m) => {
      const status = m.pctOfAllowance > 100 ? ':red_circle:' : ':white_check_mark:';
      const overageText = m.overageCost > 0 ? ` \u2014 ${formatCurrency(m.overageCost)} overage` : '';
      return `${status} *${m.label}:* ${formatLargeNumber(m.used)} / ${formatLargeNumber(m.allowance)} (${m.pctOfAllowance}%)${overageText}`;
    })
    .join('\n');

  const headline = `${alert.serviceType}: ${formatCurrency(alert.overageCost)} overage`;

  const message = {
    // Plain-text fallback shown in notifications.
    text: `[${alert.thresholdLevel.toUpperCase()}] ${headline}`,
    blocks: [
      {
        type: 'header',
        text: {
          type: 'plain_text',
          text: `${emoji} ${headline}`,
        },
      },
      {
        type: 'section',
        fields: [
          { type: 'mrkdwn', text: `*Service:*\n${alert.serviceType}` },
          { type: 'mrkdwn', text: `*Billing Period:*\n${formatBillingPeriod(alert)}` },
          { type: 'mrkdwn', text: `*Overage Cost:*\n${formatCurrency(alert.overageCost)}` },
          { type: 'mrkdwn', text: `*Alert Threshold:*\n${formatCurrency(alert.absoluteMax)}` },
        ],
      },
      // Optional sections are spread in only when they have content.
      ...(usageBreakdownText ? [{
        type: 'section' as const,
        text: {
          type: 'mrkdwn' as const,
          text: `*Usage vs Plan Allowance:*\n${usageBreakdownText}`,
        },
      }] : []),
      ...(alert.topProjects.length > 0 ? [{
        type: 'section' as const,
        text: {
          type: 'mrkdwn' as const,
          text: `*Top Projects:*\n${alert.topProjects.map(p =>
            `\u2022 *${p.project}*: ${formatCurrency(p.cost)} (${p.pctOfTotal}%)`
          ).join('\n')}`,
        },
      }] : []),
      ...(alert.topFeatures.length > 0 ? [{
        type: 'section' as const,
        text: {
          type: 'mrkdwn' as const,
          text: `*Top Features:*\n${alert.topFeatures.map(f =>
            `\u2022 \`${f.featureKey}\` \u2014 ${f.usage.toLocaleString()} ops (${f.pctOfTotal}%)`
          ).join('\n')}`,
        },
      }] : []),
      {
        type: 'section',
        text: {
          type: 'mrkdwn',
          text: `*Investigation Commands:*\n\`\`\`${investigationCommands}\`\`\``,
        },
      },
      {
        type: 'context',
        elements: [
          {
            type: 'mrkdwn',
            text: `Alert ID: ${alert.id} | ${new Date(alert.timestamp).toLocaleString('en-AU')}`,
          },
        ],
      },
      {
        type: 'actions',
        elements: [
          {
            type: 'button',
            text: {
              type: 'plain_text',
              text: 'Usage Dashboard',
              emoji: true,
            },
            url: `${DASHBOARD_URL}/usage`,
          },
          {
            type: 'button',
            text: {
              type: 'plain_text',
              text: 'Usage Monitor',
              emoji: true,
            },
            url: `${DASHBOARD_URL}/usage/monitor`,
          },
        ],
      },
    ],
    attachments: [
      {
        color: colour,
        fields: [
          {
            title: 'Action Required',
            value: getActionText(alert.thresholdLevel),
            short: false,
          },
        ],
      },
    ],
  };

  try {
    const response = await fetch(env.SLACK_WEBHOOK_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(message),
    });

    if (!response.ok) {
      const text = await response.text();
      return { success: false, error: `Slack error: ${response.status} ${text}` };
    }

    return { success: true };
  } catch (error) {
    // Network-level failure: report it to the caller instead of throwing.
    return {
      success: false,
      error: `Slack error: ${error instanceof Error ? error.message : 'Unknown'}`,
    };
  }
}
|
|
1242
|
+
|
|
1243
|
+
/**
|
|
1244
|
+
* Send Email alert via Resend
|
|
1245
|
+
*/
|
|
1246
|
+
async function sendEmailAlert(
|
|
1247
|
+
alert: CostSpikeAlert,
|
|
1248
|
+
env: Env
|
|
1249
|
+
): Promise<{ success: boolean; error?: string }> {
|
|
1250
|
+
const colour = getColour(alert.thresholdLevel);
|
|
1251
|
+
const billingPeriodText = formatBillingPeriod(alert);
|
|
1252
|
+
|
|
1253
|
+
// Build usage breakdown HTML rows
|
|
1254
|
+
const usageBreakdownHtml = alert.usageBreakdown.length > 0 ? `
|
|
1255
|
+
<div style="margin-top: 15px;">
|
|
1256
|
+
<strong style="font-size: 14px;">Usage vs Plan Allowance</strong>
|
|
1257
|
+
<table style="width: 100%; border-collapse: collapse; margin-top: 8px;">
|
|
1258
|
+
<tr style="background: #f8f9fa;">
|
|
1259
|
+
<th style="padding: 8px; text-align: left; font-size: 12px;">Metric</th>
|
|
1260
|
+
<th style="padding: 8px; text-align: right; font-size: 12px;">Used</th>
|
|
1261
|
+
<th style="padding: 8px; text-align: right; font-size: 12px;">Allowance</th>
|
|
1262
|
+
<th style="padding: 8px; text-align: right; font-size: 12px;">%</th>
|
|
1263
|
+
<th style="padding: 8px; text-align: right; font-size: 12px;">Overage Cost</th>
|
|
1264
|
+
</tr>
|
|
1265
|
+
${alert.usageBreakdown.map(m => {
|
|
1266
|
+
const pctColour = m.pctOfAllowance > 100 ? '#dc3545' : m.pctOfAllowance > 75 ? '#ffc107' : '#28a745';
|
|
1267
|
+
return `<tr style="border-bottom: 1px solid #eee;">
|
|
1268
|
+
<td style="padding: 8px; font-size: 13px;">${m.label}</td>
|
|
1269
|
+
<td style="padding: 8px; text-align: right; font-size: 13px;">${formatLargeNumber(m.used)}</td>
|
|
1270
|
+
<td style="padding: 8px; text-align: right; font-size: 13px; color: #666;">${formatLargeNumber(m.allowance)}</td>
|
|
1271
|
+
<td style="padding: 8px; text-align: right; font-size: 13px; font-weight: bold; color: ${pctColour};">${m.pctOfAllowance}%</td>
|
|
1272
|
+
<td style="padding: 8px; text-align: right; font-size: 13px;">${m.overageCost > 0 ? formatCurrency(m.overageCost) : '-'}</td>
|
|
1273
|
+
</tr>`;
|
|
1274
|
+
}).join('')}
|
|
1275
|
+
</table>
|
|
1276
|
+
</div>` : '';
|
|
1277
|
+
|
|
1278
|
+
const html = `
|
|
1279
|
+
<!DOCTYPE html>
|
|
1280
|
+
<html>
|
|
1281
|
+
<head>
|
|
1282
|
+
<meta charset="UTF-8">
|
|
1283
|
+
<title>Usage Alert: ${alert.serviceType}</title>
|
|
1284
|
+
</head>
|
|
1285
|
+
<body style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; margin: 0; padding: 20px; background-color: #f5f5f5;">
|
|
1286
|
+
<div style="max-width: 600px; margin: 0 auto; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
1287
|
+
<div style="background-color: ${colour}; color: white; padding: 20px;">
|
|
1288
|
+
<h1 style="margin: 0; font-size: 20px;">[${alert.thresholdLevel.toUpperCase()}] ${alert.serviceType}: ${formatCurrency(alert.overageCost)} overage</h1>
|
|
1289
|
+
</div>
|
|
1290
|
+
<div style="padding: 20px;">
|
|
1291
|
+
<table style="width: 100%; border-collapse: collapse;">
|
|
1292
|
+
<tr><td style="padding: 10px 0; border-bottom: 1px solid #eee;"><strong>Service</strong></td><td style="padding: 10px 0; border-bottom: 1px solid #eee;">${alert.serviceType}</td></tr>
|
|
1293
|
+
<tr><td style="padding: 10px 0; border-bottom: 1px solid #eee;"><strong>Billing Period</strong></td><td style="padding: 10px 0; border-bottom: 1px solid #eee;">${billingPeriodText}</td></tr>
|
|
1294
|
+
<tr><td style="padding: 10px 0; border-bottom: 1px solid #eee;"><strong>Overage Cost</strong></td><td style="padding: 10px 0; border-bottom: 1px solid #eee; color: #dc3545; font-weight: bold;">${formatCurrency(alert.overageCost)}</td></tr>
|
|
1295
|
+
<tr><td style="padding: 10px 0; border-bottom: 1px solid #eee;"><strong>Alert Threshold</strong></td><td style="padding: 10px 0; border-bottom: 1px solid #eee;">${formatCurrency(alert.absoluteMax)}</td></tr>
|
|
1296
|
+
<tr><td style="padding: 10px 0;"><strong>Plan Allowance</strong></td><td style="padding: 10px 0;">${alert.monthlyAllowance}</td></tr>
|
|
1297
|
+
</table>
|
|
1298
|
+
<div style="margin-top: 15px; padding: 15px; background: #f8d7da; border-radius: 4px; border-left: 4px solid #dc3545;">
|
|
1299
|
+
<strong>⚠ Plan Allowance Exceeded</strong>
|
|
1300
|
+
<p style="margin: 8px 0 0 0; color: #555; font-size: 14px;">You have exceeded your monthly plan allowance. Overage cost: ${formatCurrency(alert.overageCost)}</p>
|
|
1301
|
+
</div>
|
|
1302
|
+
${usageBreakdownHtml}
|
|
1303
|
+
${alert.topProjects.length > 0 ? `
|
|
1304
|
+
<div style="margin-top: 15px;">
|
|
1305
|
+
<strong style="font-size: 14px;">Top Projects by Cost</strong>
|
|
1306
|
+
<table style="width: 100%; border-collapse: collapse; margin-top: 8px;">
|
|
1307
|
+
${alert.topProjects.map(p => `
|
|
1308
|
+
<tr>
|
|
1309
|
+
<td style="padding: 6px 0; width: 40%;"><strong>${p.project}</strong></td>
|
|
1310
|
+
<td style="padding: 6px 0; width: 25%; text-align: right;">${formatCurrency(p.cost)}</td>
|
|
1311
|
+
<td style="padding: 6px 8px; width: 35%;">
|
|
1312
|
+
<div style="background: #e9ecef; border-radius: 3px; height: 16px; position: relative;">
|
|
1313
|
+
<div style="background: #0d6efd; border-radius: 3px; height: 16px; width: ${Math.min(p.pctOfTotal, 100)}%; display: flex; align-items: center; justify-content: flex-end; padding-right: 4px;">
|
|
1314
|
+
<span style="color: white; font-size: 10px; font-weight: bold;">${p.pctOfTotal}%</span>
|
|
1315
|
+
</div>
|
|
1316
|
+
</div>
|
|
1317
|
+
</td>
|
|
1318
|
+
</tr>`).join('')}
|
|
1319
|
+
</table>
|
|
1320
|
+
</div>` : ''}
|
|
1321
|
+
${alert.topFeatures.length > 0 ? `
|
|
1322
|
+
<div style="margin-top: 15px;">
|
|
1323
|
+
<strong style="font-size: 14px;">Top Features by Usage</strong>
|
|
1324
|
+
<table style="width: 100%; border-collapse: collapse; margin-top: 8px;">
|
|
1325
|
+
${alert.topFeatures.map(f => `
|
|
1326
|
+
<tr>
|
|
1327
|
+
<td style="padding: 6px 0; width: 40%; font-family: monospace; font-size: 12px;">${f.featureKey}</td>
|
|
1328
|
+
<td style="padding: 6px 0; width: 25%; text-align: right;">${f.usage.toLocaleString()} ops</td>
|
|
1329
|
+
<td style="padding: 6px 8px; width: 35%;">
|
|
1330
|
+
<div style="background: #e9ecef; border-radius: 3px; height: 16px; position: relative;">
|
|
1331
|
+
<div style="background: #6f42c1; border-radius: 3px; height: 16px; width: ${Math.min(f.pctOfTotal, 100)}%; display: flex; align-items: center; justify-content: flex-end; padding-right: 4px;">
|
|
1332
|
+
<span style="color: white; font-size: 10px; font-weight: bold;">${f.pctOfTotal}%</span>
|
|
1333
|
+
</div>
|
|
1334
|
+
</div>
|
|
1335
|
+
</td>
|
|
1336
|
+
</tr>`).join('')}
|
|
1337
|
+
</table>
|
|
1338
|
+
</div>` : ''}
|
|
1339
|
+
<div style="margin-top: 15px; padding: 15px; background: #f8f9fa; border-radius: 4px;">
|
|
1340
|
+
<strong>Recommended Action:</strong>
|
|
1341
|
+
<p style="margin: 10px 0 0 0; color: #666;">${getActionText(alert.thresholdLevel)}</p>
|
|
1342
|
+
</div>
|
|
1343
|
+
</div>
|
|
1344
|
+
<div style="background: #f8f9fa; padding: 15px 20px; font-size: 12px; color: #666;">
|
|
1345
|
+
<p style="margin: 0;">Alert ID: ${alert.id}</p>
|
|
1346
|
+
<p style="margin: 5px 0 0 0;">Generated: ${new Date(alert.timestamp).toLocaleString('en-AU')}</p>
|
|
1347
|
+
</div>
|
|
1348
|
+
</div>
|
|
1349
|
+
</body>
|
|
1350
|
+
</html>`;
|
|
1351
|
+
|
|
1352
|
+
try {
|
|
1353
|
+
const response = await fetch('https://api.resend.com/emails', {
|
|
1354
|
+
method: 'POST',
|
|
1355
|
+
headers: {
|
|
1356
|
+
Authorization: `Bearer ${env.RESEND_API_KEY}`,
|
|
1357
|
+
'Content-Type': 'application/json',
|
|
1358
|
+
},
|
|
1359
|
+
body: JSON.stringify({
|
|
1360
|
+
from: ALERT_FROM_EMAIL,
|
|
1361
|
+
to: env.ALERT_EMAIL_TO,
|
|
1362
|
+
subject: `[${alert.thresholdLevel.toUpperCase()}] ${alert.serviceType}: ${formatCurrency(alert.overageCost)} overage (threshold: ${formatCurrency(alert.absoluteMax)})`,
|
|
1363
|
+
html,
|
|
1364
|
+
}),
|
|
1365
|
+
});
|
|
1366
|
+
|
|
1367
|
+
if (!response.ok) {
|
|
1368
|
+
const text = await response.text();
|
|
1369
|
+
return { success: false, error: `Resend error: ${response.status} ${text}` };
|
|
1370
|
+
}
|
|
1371
|
+
|
|
1372
|
+
return { success: true };
|
|
1373
|
+
} catch (error) {
|
|
1374
|
+
return {
|
|
1375
|
+
success: false,
|
|
1376
|
+
error: `Resend error: ${error instanceof Error ? error.message : 'Unknown'}`,
|
|
1377
|
+
};
|
|
1378
|
+
}
|
|
1379
|
+
}
|
|
1380
|
+
|
|
1381
|
+
/**
 * Map an internal service key to its human-readable display name.
 *
 * Known keys (for example `d1`, `kv`, `durableObjects`) are translated to
 * their display names; any other key is returned unchanged.
 *
 * @param service - Internal service key.
 * @returns The display name, or `service` itself when no mapping exists.
 */
function formatServiceName(service: string): string {
  const names: Record<string, string> = {
    workers: 'Workers',
    d1: 'D1 Database',
    kv: 'KV Storage',
    r2: 'R2 Storage',
    durableObjects: 'Durable Objects',
    vectorize: 'Vectorize',
    aiGateway: 'AI Gateway',
    pages: 'Pages',
    queues: 'Queues',
    workflows: 'Workflows',
  };

  // Only consult own keys: a bare `names[service]` lookup would resolve
  // inherited members such as `constructor` or `toString` to non-string
  // values and return them instead of the original key.
  if (!Object.prototype.hasOwnProperty.call(names, service)) {
    return service;
  }
  return names[service] || service;
}
|
|
1399
|
+
|
|
1400
|
+
/**
 * Resolve the emoji shortcode that represents an alert threshold level.
 *
 * @param level - Threshold level of the alert.
 * @returns The shortcode for the level, or `:bell:` when the level has no entry.
 */
function getEmoji(level: CostSpikeAlert['thresholdLevel']): string {
  const shortcodeByLevel: Record<string, string> = {
    critical: ':rotating_light:',
    high: ':warning:',
    warning: ':yellow_circle:',
    normal: ':white_check_mark:',
  };

  const shortcode = shortcodeByLevel[level];
  if (shortcode) {
    return shortcode;
  }
  return ':bell:';
}
|
|
1412
|
+
|
|
1413
|
+
/**
 * Resolve the hex colour used to render an alert threshold level.
 *
 * `critical` and `high` intentionally share the same red.
 *
 * @param level - Threshold level of the alert.
 * @returns A `#rrggbb` colour string; `#17a2b8` when the level has no entry.
 */
function getColour(level: CostSpikeAlert['thresholdLevel']): string {
  const RED = '#dc3545';
  const hexByLevel: Record<string, string> = {
    critical: RED,
    high: RED,
    warning: '#ffc107', // Yellow
    normal: '#28a745', // Green
  };

  const hex = hexByLevel[level];
  if (hex) {
    return hex;
  }
  return '#17a2b8';
}
|
|
1425
|
+
|
|
1426
|
+
/**
 * Build the block of shell commands suggested for investigating an alert.
 *
 * The result always starts with a query over `daily_usage_rollups` for the
 * last seven days, using `<serviceType>_cost_usd` as the cost column. For
 * `d1`, `kv`, `workers` and `vectorize` a second, service-specific query is
 * appended.
 *
 * @param serviceType - Service key; interpolated verbatim into the column name.
 * @returns Commented `wrangler d1 execute` commands as a single string.
 */
function getInvestigationCommands(serviceType: string): string {
  // Each entry begins with a newline so it can be appended directly.
  const perServiceCommands: Record<string, string> = {
    d1: `
# Check D1 per-feature usage
npx wrangler d1 execute platform-metrics --remote --command "SELECT feature_key, SUM(d1_writes) as writes, SUM(d1_reads) as reads FROM feature_usage_daily WHERE snapshot_date = date('now', '-1 day') GROUP BY feature_key ORDER BY writes DESC LIMIT 10"`,
    kv: `
# Check KV per-feature usage
npx wrangler d1 execute platform-metrics --remote --command "SELECT feature_key, SUM(kv_writes) as writes, SUM(kv_reads) as reads FROM feature_usage_daily WHERE snapshot_date = date('now', '-1 day') GROUP BY feature_key ORDER BY writes DESC LIMIT 10"`,
    workers: `
# Check Workers per-project usage
npx wrangler d1 execute platform-metrics --remote --command "SELECT project, SUM(workers_requests) as requests, SUM(workers_cpu_time) as cpu_ms FROM daily_usage_rollups WHERE snapshot_date = date('now', '-1 day') GROUP BY project ORDER BY requests DESC"`,
    vectorize: `
# Check Vectorize per-feature usage
npx wrangler d1 execute platform-metrics --remote --command "SELECT feature_key, SUM(vectorize_queries) as queries FROM feature_usage_daily WHERE snapshot_date = date('now', '-1 day') AND vectorize_queries > 0 GROUP BY feature_key ORDER BY queries DESC LIMIT 10"`,
  };

  const rollupCommands = `# Query daily usage rollups
npx wrangler d1 execute platform-metrics --remote --command "SELECT snapshot_date, SUM(${serviceType}_cost_usd) as cost FROM daily_usage_rollups WHERE snapshot_date >= date('now', '-7 days') GROUP BY snapshot_date ORDER BY snapshot_date DESC"`;

  const extraCommands = perServiceCommands[serviceType];
  if (!extraCommands) {
    return rollupCommands;
  }
  return rollupCommands + extraCommands;
}
|
|
1451
|
+
|
|
1452
|
+
/**
 * Return the recommended-action sentence for an alert threshold level.
 *
 * @param level - Threshold level of the alert.
 * @returns Guidance text for `critical`, `high` and `warning`; every other
 *          level yields "No action required".
 */
function getActionText(level: CostSpikeAlert['thresholdLevel']): string {
  if (level === 'critical') {
    return 'Investigate immediately - usage significantly exceeds budget';
  }
  if (level === 'high') {
    return 'Review usage patterns and consider optimisation';
  }
  if (level === 'warning') {
    return 'Monitor closely - approaching threshold';
  }
  return 'No action required';
}
|
|
1467
|
+
|
|
1468
|
+
/**
 * Render an amount as a dollar string with exactly two decimal places.
 *
 * @param amount - Amount to format.
 * @returns The amount prefixed with `$`, e.g. `$12.50`.
 */
function formatCurrency(amount: number): string {
  const twoDecimals = amount.toFixed(2);
  return `$${twoDecimals}`;
}
|
|
1474
|
+
|
|
1475
|
+
/**
 * Render a percentage with one decimal place and an explicit sign.
 *
 * Non-negative values get a leading `+`; negative values keep the `-`
 * produced by `toFixed`.
 *
 * @param pct - Percentage value to format.
 * @returns A string such as `+12.3%` or `-4.0%`.
 */
function formatPercentage(pct: number): string {
  const oneDecimal = pct.toFixed(1);
  if (pct >= 0) {
    return `+${oneDecimal}%`;
  }
  return `${oneDecimal}%`;
}
|
|
1482
|
+
|
|
1483
|
+
/**
 * Format large numbers with K/M/B suffixes for readability.
 *
 * Values with a magnitude of at least 1,000 are scaled and shown with one
 * decimal place; smaller values are rounded to an integer. Negative values
 * are abbreviated the same way as positive ones and keep their sign.
 *
 * @param n - Number to format.
 * @returns The abbreviated string, e.g. `1.5K`, `-2.5M`, `42`.
 */
function formatLargeNumber(n: number): string {
  const units = [
    { threshold: 1_000_000_000, suffix: 'B' },
    { threshold: 1_000_000, suffix: 'M' },
    { threshold: 1_000, suffix: 'K' },
  ];
  const sign = n < 0 ? '-' : '';
  const magnitude = Math.abs(n);

  for (let i = 0; i < units.length; i++) {
    const { threshold, suffix } = units[i];
    // Written as a negated `>=` so NaN falls through to the final return.
    if (!(magnitude >= threshold)) {
      continue;
    }

    const scaled = (magnitude / threshold).toFixed(1);
    // Rounding can push a value just under the next unit to "1000.0"
    // (e.g. 999,999 -> "1000.0K"); promote it to the next unit instead.
    if (i > 0 && Number(scaled) >= 1000) {
      return `${sign}1.0${units[i - 1].suffix}`;
    }
    return `${sign}${scaled}${suffix}`;
  }

  return n.toFixed(0);
}
|
|
1492
|
+
|
|
1493
|
+
/**
 * Describe an alert's billing period as a single display string.
 *
 * The start and end dates are read as UTC midnight. The year is printed once,
 * taken from the end date, followed by the elapsed/total day counters.
 * Example: "1 Feb - 28 Feb 2026 (Day 6 of 28)"
 *
 * @param alert - Alert carrying `billingPeriodStart`, `billingPeriodEnd`,
 *                `billingDaysElapsed` and `billingDaysTotal`.
 * @returns The formatted billing period.
 */
function formatBillingPeriod(alert: CostSpikeAlert): string {
  const monthNames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];

  // Pin both dates to UTC midnight so the output does not depend on the host timezone.
  const periodStart = new Date(alert.billingPeriodStart + 'T00:00:00Z');
  const periodEnd = new Date(alert.billingPeriodEnd + 'T00:00:00Z');

  const dayAndMonth = (date: Date): string => `${date.getUTCDate()} ${monthNames[date.getUTCMonth()]}`;

  const range = `${dayAndMonth(periodStart)} - ${dayAndMonth(periodEnd)} ${periodEnd.getUTCFullYear()}`;
  const progress = `(Day ${alert.billingDaysElapsed} of ${alert.billingDaysTotal})`;
  return `${range} ${progress}`;
}
|
|
1505
|
+
|
|
1506
|
+
// =============================================================================
|
|
1507
|
+
// STALE HEARTBEAT DETECTION
|
|
1508
|
+
// =============================================================================
|
|
1509
|
+
|
|
1510
|
+
/**
|
|
1511
|
+
* Stale threshold: 3x the default heartbeat interval (5 minutes).
|
|
1512
|
+
* Features that haven't sent a heartbeat in 15 minutes are considered stale.
|
|
1513
|
+
*/
|
|
1514
|
+
// Three missed 5-minute heartbeat intervals = 15 minutes, in seconds.
const STALE_THRESHOLD_SECONDS = 3 * (5 * 60);

/**
 * TTL, in seconds, of the per-feature rate-limit key written after a stale
 * heartbeat alert is sent: at most one alert per feature per hour.
 */
const STALE_HEARTBEAT_RATE_LIMIT_TTL = 60 * 60;

/**
 * Shape of one row returned by the stale heartbeat query against
 * `system_health_checks`.
 */
interface StaleHeartbeatRow {
  /** Project the feature belongs to. */
  project_id: string;
  /** Identifier of the feature whose heartbeat went quiet. */
  feature_id: string;
  /** Time of the last heartbeat, as a Unix timestamp in seconds. */
  last_heartbeat: number;
  /** Seconds between the check time and `last_heartbeat`, computed by the query. */
  age_seconds: number;
  /** Status stored for the row at query time. */
  status: string;
}
|
|
1531
|
+
|
|
1532
|
+
/**
 * Check for Durable Objects that have stopped sending heartbeats.
 *
 * Queries the system_health_checks table for features that:
 * 1. Have status = 'healthy'
 * 2. Haven't sent a heartbeat in STALE_THRESHOLD_SECONDS
 *
 * Each match is updated to status 'stale' (incrementing
 * consecutive_failures) and then passed to fireStaleHeartbeatAlert.
 *
 * Failures are logged, never thrown. Each feature is processed in isolation,
 * so a failing update or alert for one feature does not stop the remaining
 * features from being handled.
 *
 * @param env - Worker bindings; PLATFORM_DB is used here.
 * @param log - Logger used for debug, info, warn and error output.
 */
async function checkStaleHeartbeats(env: Env, log: Logger): Promise<void> {
  // Heartbeat timestamps are compared in Unix seconds.
  const now = Math.floor(Date.now() / 1000);

  let staleRows: StaleHeartbeatRow[];
  try {
    // Find healthy features that haven't sent heartbeats recently
    const staleResult = await env.PLATFORM_DB.prepare(
      `
      SELECT
        project_id,
        feature_id,
        last_heartbeat,
        ? - last_heartbeat as age_seconds,
        status
      FROM system_health_checks
      WHERE status = 'healthy' AND ? - last_heartbeat > ?
      `
    )
      .bind(now, now, STALE_THRESHOLD_SECONDS)
      .all<StaleHeartbeatRow>();

    staleRows = staleResult.results || [];
  } catch (error) {
    log.error('Failed to check stale heartbeats', error);
    return;
  }

  if (staleRows.length === 0) {
    log.debug('No stale heartbeats detected');
    return;
  }

  log.warn('Stale heartbeats detected', { count: staleRows.length });

  for (const stale of staleRows) {
    // Isolate each feature: one failing update or alert must not prevent
    // the remaining stale features from being marked and alerted.
    try {
      // Update status to 'stale'
      await env.PLATFORM_DB.prepare(
        `
        UPDATE system_health_checks
        SET status = 'stale',
            consecutive_failures = consecutive_failures + 1,
            updated_at = ?
        WHERE feature_id = ?
        `
      )
        .bind(now, stale.feature_id)
        .run();

      log.info('Marked feature as stale', {
        feature_id: stale.feature_id,
        project_id: stale.project_id,
        age_seconds: stale.age_seconds,
      });

      // Fire Slack alert (with rate limiting)
      await fireStaleHeartbeatAlert(stale, env, log);
    } catch (error) {
      log.error(`Failed to process stale heartbeat for ${stale.feature_id}`, error);
    }
  }
}
|
|
1595
|
+
|
|
1596
|
+
/**
 * Build the Slack webhook payload describing one stale heartbeat.
 *
 * @param stale - Row describing the stale feature.
 * @param ageMinutes - Heartbeat age rounded to whole minutes.
 * @param lastHeartbeatTime - ISO-8601 timestamp of the last heartbeat.
 * @returns The message object to be JSON-encoded and posted to Slack.
 */
function buildStaleHeartbeatSlackMessage(
  stale: StaleHeartbeatRow,
  ageMinutes: number,
  lastHeartbeatTime: string
) {
  return {
    text: `[STALE] Durable Object ${stale.feature_id} has not sent a heartbeat in ${ageMinutes} minutes`,
    blocks: [
      {
        type: 'header',
        text: {
          type: 'plain_text',
          text: ':broken_heart: Stale Heartbeat Detected',
        },
      },
      {
        type: 'section',
        fields: [
          { type: 'mrkdwn', text: `*Feature:*\n${stale.feature_id}` },
          { type: 'mrkdwn', text: `*Project:*\n${stale.project_id}` },
          { type: 'mrkdwn', text: `*Last Heartbeat:*\n${lastHeartbeatTime}` },
          { type: 'mrkdwn', text: `*Age:*\n${ageMinutes} minutes` },
        ],
      },
      {
        type: 'section',
        text: {
          type: 'mrkdwn',
          text: `*Investigation Commands:*\n\`\`\`# Check DO status in D1
npx wrangler d1 execute platform-metrics --remote --command "SELECT * FROM system_health_checks WHERE feature_id = '${stale.feature_id}'"

# Check recent telemetry for this feature
npx wrangler d1 execute platform-metrics --remote --command "SELECT * FROM feature_usage_daily WHERE feature_key = '${stale.feature_id}' ORDER BY snapshot_date DESC LIMIT 5"\`\`\``,
        },
      },
      {
        type: 'context',
        elements: [
          {
            type: 'mrkdwn',
            text: `Expected heartbeat interval: 5 minutes | Stale threshold: 15 minutes`,
          },
        ],
      },
      {
        type: 'actions',
        elements: [
          {
            type: 'button',
            text: {
              type: 'plain_text',
              text: 'Features Dashboard',
              emoji: true,
            },
            url: `${DASHBOARD_URL}/usage/features`,
          },
        ],
      },
    ],
    attachments: [
      {
        color: '#dc3545', // Red
        fields: [
          {
            title: 'Action Required',
            value:
              'Durable Object may be unhealthy or stopped. Check DO logs in Cloudflare dashboard and verify the worker is deployed correctly.',
            short: false,
          },
        ],
      },
    ],
  };
}

/**
 * Send a Slack alert for a stale heartbeat, at most once per feature per
 * rate-limit window.
 *
 * Flow:
 * 1. Return early if the `stale-heartbeat:<feature_id>` key is present in
 *    PLATFORM_ALERTS, or if SLACK_WEBHOOK_URL is not configured.
 * 2. POST the alert to the Slack webhook.
 * 3. On a successful response, write the rate-limit key (TTL
 *    STALE_HEARTBEAT_RATE_LIMIT_TTL) and, when the notifications service
 *    binding is available, create a dashboard notification.
 *
 * Errors while posting to Slack or creating the notification are logged and
 * not rethrown.
 *
 * @param stale - Row describing the stale feature.
 * @param env - Worker bindings; PLATFORM_ALERTS and SLACK_WEBHOOK_URL are used here.
 * @param log - Logger used for debug, info, warn and error output.
 */
async function fireStaleHeartbeatAlert(
  stale: StaleHeartbeatRow,
  env: Env,
  log: Logger
): Promise<void> {
  // Rate limit: skip when an alert for this feature was sent recently.
  const rateLimitKey = `stale-heartbeat:${stale.feature_id}`;
  const recentAlert = await env.PLATFORM_ALERTS.get(rateLimitKey);
  if (recentAlert) {
    log.debug('Stale heartbeat alert rate limited', { feature_id: stale.feature_id });
    return;
  }

  if (!env.SLACK_WEBHOOK_URL) {
    log.debug('No SLACK_WEBHOOK_URL configured, skipping stale heartbeat alert');
    return;
  }

  const ageMinutes = Math.round(stale.age_seconds / 60);
  // last_heartbeat is in seconds; Date expects milliseconds.
  const lastHeartbeatTime = new Date(stale.last_heartbeat * 1000).toISOString();
  const slackPayload = buildStaleHeartbeatSlackMessage(stale, ageMinutes, lastHeartbeatTime);

  try {
    const slackResponse = await fetch(env.SLACK_WEBHOOK_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(slackPayload),
    });

    if (!slackResponse.ok) {
      const failureBody = await slackResponse.text();
      log.error('Failed to send stale heartbeat Slack alert', {
        feature_id: stale.feature_id,
        status: slackResponse.status,
        error: failureBody,
      });
      return;
    }

    // Record the send only after Slack accepted it, so failed attempts can be retried.
    await env.PLATFORM_ALERTS.put(rateLimitKey, new Date().toISOString(), {
      expirationTtl: STALE_HEARTBEAT_RATE_LIMIT_TTL,
    });
    log.info('Sent stale heartbeat Slack alert', { feature_id: stale.feature_id });

    // Dashboard notification is optional: only when the service binding exists.
    if (!_rawNotificationsApi) {
      return;
    }

    try {
      const notificationRequest = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          category: 'warning',
          source: 'sentinel',
          source_id: stale.feature_id,
          title: `Stale Heartbeat: ${stale.feature_id}`,
          description: `Durable Object has not sent a heartbeat in ${ageMinutes} minutes. Last seen: ${lastHeartbeatTime}`,
          priority: 'high',
          action_url: '/usage/features',
          action_label: 'View Features',
          project: stale.project_id,
        }),
      };
      const notifResp = await _rawNotificationsApi.fetch(
        'https://platform-notifications.internal/notifications',
        notificationRequest
      );
      // Always read the body, even on success, as the original flow does.
      const notifBody = await notifResp.text();
      if (!notifResp.ok) {
        log.warn('Stale heartbeat notification failed', { status: notifResp.status, body: notifBody });
      }
    } catch (notifError) {
      log.error('Failed to create stale heartbeat notification', notifError);
    }
  } catch (error) {
    log.error('Error sending stale heartbeat Slack alert', error);
  }
}
|