@littlebearapps/platform-admin-sdk 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +112 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.js +89 -0
- package/dist/prompts.d.ts +27 -0
- package/dist/prompts.js +80 -0
- package/dist/scaffold.d.ts +5 -0
- package/dist/scaffold.js +65 -0
- package/dist/templates.d.ts +16 -0
- package/dist/templates.js +131 -0
- package/package.json +46 -0
- package/templates/full/migrations/006_pattern_discovery.sql +199 -0
- package/templates/full/migrations/007_notifications_search.sql +127 -0
- package/templates/full/workers/lib/pattern-discovery/ai-prompt.ts +644 -0
- package/templates/full/workers/lib/pattern-discovery/clustering.ts +278 -0
- package/templates/full/workers/lib/pattern-discovery/shadow-evaluation.ts +603 -0
- package/templates/full/workers/lib/pattern-discovery/storage.ts +806 -0
- package/templates/full/workers/lib/pattern-discovery/types.ts +159 -0
- package/templates/full/workers/lib/pattern-discovery/validation.ts +278 -0
- package/templates/full/workers/pattern-discovery.ts +661 -0
- package/templates/full/workers/platform-alert-router.ts +1809 -0
- package/templates/full/workers/platform-notifications.ts +424 -0
- package/templates/full/workers/platform-search.ts +480 -0
- package/templates/full/workers/platform-settings.ts +436 -0
- package/templates/full/wrangler.alert-router.jsonc.hbs +34 -0
- package/templates/full/wrangler.notifications.jsonc.hbs +23 -0
- package/templates/full/wrangler.pattern-discovery.jsonc.hbs +33 -0
- package/templates/full/wrangler.search.jsonc.hbs +16 -0
- package/templates/full/wrangler.settings.jsonc.hbs +23 -0
- package/templates/shared/README.md.hbs +69 -0
- package/templates/shared/config/budgets.yaml.hbs +72 -0
- package/templates/shared/config/services.yaml.hbs +45 -0
- package/templates/shared/migrations/001_core_tables.sql +117 -0
- package/templates/shared/migrations/002_usage_warehouse.sql +830 -0
- package/templates/shared/migrations/003_feature_tracking.sql +250 -0
- package/templates/shared/migrations/004_settings_alerts.sql +452 -0
- package/templates/shared/migrations/seed.sql.hbs +4 -0
- package/templates/shared/package.json.hbs +21 -0
- package/templates/shared/scripts/sync-config.ts +242 -0
- package/templates/shared/tsconfig.json +12 -0
- package/templates/shared/workers/lib/analytics-engine.ts +357 -0
- package/templates/shared/workers/lib/billing.ts +293 -0
- package/templates/shared/workers/lib/circuit-breaker-middleware.ts +25 -0
- package/templates/shared/workers/lib/control.ts +292 -0
- package/templates/shared/workers/lib/economics.ts +368 -0
- package/templates/shared/workers/lib/metrics.ts +103 -0
- package/templates/shared/workers/lib/platform-settings.ts +407 -0
- package/templates/shared/workers/lib/shared/allowances.ts +333 -0
- package/templates/shared/workers/lib/shared/cloudflare.ts +1362 -0
- package/templates/shared/workers/lib/shared/types.ts +58 -0
- package/templates/shared/workers/lib/telemetry-sampling.ts +360 -0
- package/templates/shared/workers/lib/usage/collectors/example.ts +96 -0
- package/templates/shared/workers/lib/usage/collectors/index.ts +128 -0
- package/templates/shared/workers/lib/usage/handlers/audit.ts +306 -0
- package/templates/shared/workers/lib/usage/handlers/backfill.ts +845 -0
- package/templates/shared/workers/lib/usage/handlers/behavioral.ts +429 -0
- package/templates/shared/workers/lib/usage/handlers/data-queries.ts +507 -0
- package/templates/shared/workers/lib/usage/handlers/dlq-admin.ts +364 -0
- package/templates/shared/workers/lib/usage/handlers/health-trends.ts +222 -0
- package/templates/shared/workers/lib/usage/handlers/index.ts +35 -0
- package/templates/shared/workers/lib/usage/handlers/usage-admin.ts +421 -0
- package/templates/shared/workers/lib/usage/handlers/usage-features.ts +1262 -0
- package/templates/shared/workers/lib/usage/handlers/usage-metrics.ts +2420 -0
- package/templates/shared/workers/lib/usage/handlers/usage-settings.ts +610 -0
- package/templates/shared/workers/lib/usage/queue/budget-enforcement.ts +1032 -0
- package/templates/shared/workers/lib/usage/queue/cost-budget-enforcement.ts +128 -0
- package/templates/shared/workers/lib/usage/queue/cost-calculator.ts +77 -0
- package/templates/shared/workers/lib/usage/queue/dlq-handler.ts +161 -0
- package/templates/shared/workers/lib/usage/queue/index.ts +19 -0
- package/templates/shared/workers/lib/usage/queue/telemetry-processor.ts +790 -0
- package/templates/shared/workers/lib/usage/scheduled/anomaly-detection.ts +732 -0
- package/templates/shared/workers/lib/usage/scheduled/data-collection.ts +956 -0
- package/templates/shared/workers/lib/usage/scheduled/error-digest.ts +343 -0
- package/templates/shared/workers/lib/usage/scheduled/index.ts +18 -0
- package/templates/shared/workers/lib/usage/scheduled/rollups.ts +1561 -0
- package/templates/shared/workers/lib/usage/shared/constants.ts +362 -0
- package/templates/shared/workers/lib/usage/shared/index.ts +14 -0
- package/templates/shared/workers/lib/usage/shared/types.ts +1066 -0
- package/templates/shared/workers/lib/usage/shared/utils.ts +795 -0
- package/templates/shared/workers/platform-usage.ts +1915 -0
- package/templates/shared/wrangler.usage.jsonc.hbs +58 -0
- package/templates/standard/migrations/005_error_collection.sql +162 -0
- package/templates/standard/workers/error-collector.ts +2670 -0
- package/templates/standard/workers/lib/error-collector/capture.ts +213 -0
- package/templates/standard/workers/lib/error-collector/digest.ts +448 -0
- package/templates/standard/workers/lib/error-collector/email-health-alerts.ts +262 -0
- package/templates/standard/workers/lib/error-collector/fingerprint.ts +258 -0
- package/templates/standard/workers/lib/error-collector/gap-alerts.ts +293 -0
- package/templates/standard/workers/lib/error-collector/github.ts +329 -0
- package/templates/standard/workers/lib/error-collector/types.ts +262 -0
- package/templates/standard/workers/lib/sentinel/gap-detection.ts +734 -0
- package/templates/standard/workers/lib/shared/slack-alerts.ts +585 -0
- package/templates/standard/workers/platform-sentinel.ts +1744 -0
- package/templates/standard/wrangler.error-collector.jsonc.hbs +44 -0
- package/templates/standard/wrangler.sentinel.jsonc.hbs +45 -0
|
@@ -0,0 +1,1032 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Budget Enforcement
|
|
3
|
+
*
|
|
4
|
+
* Circuit breaker management and budget enforcement for platform-usage queue processing.
|
|
5
|
+
* Handles D1 write limits, DO GB-seconds tracking, and feature-level budget violations.
|
|
6
|
+
*
|
|
7
|
+
* Extracted from platform-usage.ts as part of Phase D modularisation.
|
|
8
|
+
*
|
|
9
|
+
* Key Components:
|
|
10
|
+
* - determineCircuitBreakerStatus: Tiered status (CLOSED/WARNING/OPEN) from usage vs limit
|
|
11
|
+
* - checkAndTripCircuitBreakers: Evaluates D1/DO limits and trips project-level breakers
|
|
12
|
+
* - checkAndUpdateBudgetStatus: Feature-level budget checking from telemetry metrics
|
|
13
|
+
* - logCircuitBreakerEvent: D1 audit trail for CB events
|
|
14
|
+
* - sendSlackAlert: Alert delivery to Slack webhook
|
|
15
|
+
* - D1/KV tracking helpers: Read/write usage counters
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import type { Env, DailyLimits } from '../shared';
|
|
19
|
+
import type { FeatureMetrics } from '@littlebearapps/platform-consumer-sdk';
|
|
20
|
+
import { CB_KEYS, METRIC_TO_BUDGET_KEY } from '../shared';
|
|
21
|
+
import { generateId, fetchWithRetry } from '../shared';
|
|
22
|
+
import { createLoggerFromEnv } from '@littlebearapps/platform-consumer-sdk';
|
|
23
|
+
import { CB_STATUS, type CircuitBreakerStatusValue } from '../../circuit-breaker-middleware';
|
|
24
|
+
/**
|
|
25
|
+
* Hard limit multiplier for circuit breaker enforcement.
|
|
26
|
+
 * When usage reaches or exceeds (soft_limit * HARD_LIMIT_MULTIPLIER), the project is hard-paused.
|
|
27
|
+
*/
|
|
28
|
+
const HARD_LIMIT_MULTIPLIER = 1.5;
|
|
29
|
+
import {
|
|
30
|
+
getPlatformSettings,
|
|
31
|
+
getProjectSetting,
|
|
32
|
+
DEFAULT_PLATFORM_SETTINGS,
|
|
33
|
+
} from '../../platform-settings';
|
|
34
|
+
|
|
35
|
+
// =============================================================================
|
|
36
|
+
// CIRCUIT BREAKER STATUS DETERMINATION
|
|
37
|
+
// =============================================================================
|
|
38
|
+
|
|
39
|
+
/**
 * Map a usage reading onto a tiered circuit breaker status.
 *
 * The hard limit is `limit * HARD_LIMIT_MULTIPLIER`. Boundaries are inclusive:
 * - OPEN:    usage >= hard limit (hard limit reached - block requests)
 * - WARNING: usage >= limit      (soft limit reached - allow with warnings)
 * - CLOSED:  usage <  limit      (normal operation)
 *
 * @param usage - Current usage value
 * @param limit - Soft limit threshold
 * @returns Circuit breaker status value
 */
export function determineCircuitBreakerStatus(
  usage: number,
  limit: number
): CircuitBreakerStatusValue {
  // The hard limit is checked first so the most severe tier wins.
  if (usage >= limit * HARD_LIMIT_MULTIPLIER) {
    return CB_STATUS.OPEN;
  }

  if (usage >= limit) {
    return CB_STATUS.WARNING;
  }

  return CB_STATUS.CLOSED;
}
|
|
64
|
+
|
|
65
|
+
// =============================================================================
|
|
66
|
+
// D1 WRITE TRACKING
|
|
67
|
+
// =============================================================================
|
|
68
|
+
|
|
69
|
+
/**
 * Read the D1 write counter stored in KV under CB_KEYS.D1_WRITES_24H.
 *
 * Best-effort: a missing key, an unparsable value, or a KV read error all
 * yield 0 rather than throwing.
 *
 * @param env - Worker environment
 * @returns Current D1 write count, or 0 when unavailable
 */
export async function getD1WriteCount(env: Env): Promise<number> {
  try {
    const stored = await env.PLATFORM_CACHE.get(CB_KEYS.D1_WRITES_24H);
    if (!stored) {
      return 0;
    }

    const parsed = Number.parseInt(stored, 10);
    // A corrupted value parses to NaN. Returning it would let
    // incrementD1WriteCount persist "NaN", and every later limit comparison
    // against NaN is false, so the breaker could never trip.
    return Number.isNaN(parsed) ? 0 : parsed;
  } catch {
    return 0;
  }
}
|
|
84
|
+
|
|
85
|
+
/**
 * Add `count` to the D1 write counter held in KV.
 *
 * This is a read-then-write sequence (getD1WriteCount followed by put), not an
 * atomic increment. Every write stores the value with a 24-hour expiration.
 *
 * @param env - Worker environment
 * @param count - Number of writes to add
 */
export async function incrementD1WriteCount(env: Env, count: number): Promise<void> {
  const updatedTotal = (await getD1WriteCount(env)) + count;

  // 86400 seconds = 24 hours, so the key cleans itself up.
  const ttlOptions = { expirationTtl: 86400 };
  await env.PLATFORM_CACHE.put(CB_KEYS.D1_WRITES_24H, String(updatedTotal), ttlOptions);
}
|
|
100
|
+
|
|
101
|
+
// =============================================================================
|
|
102
|
+
// DO GB-SECONDS TRACKING (per-project)
|
|
103
|
+
// =============================================================================
|
|
104
|
+
|
|
105
|
+
/**
 * Read the DO GB-seconds counter for a project from KV.
 *
 * The key is CB_KEYS.DO_GB_SECONDS_24H_PREFIX followed by the project id.
 * Best-effort: a missing key, an unparsable value, or a KV read error all
 * yield 0 rather than throwing.
 *
 * @param env - Worker environment
 * @param project - Project identifier
 * @returns Current DO GB-seconds count, or 0 when unavailable
 */
export async function getDOGbSecondsCount(env: Env, project: string): Promise<number> {
  try {
    const stored = await env.PLATFORM_CACHE.get(`${CB_KEYS.DO_GB_SECONDS_24H_PREFIX}${project}`);
    if (!stored) {
      return 0;
    }

    const parsed = Number.parseFloat(stored);
    // A corrupted value parses to NaN, and NaN compares false against every
    // threshold, which would silently disable the breaker for this project.
    return Number.isNaN(parsed) ? 0 : parsed;
  } catch {
    return 0;
  }
}
|
|
120
|
+
|
|
121
|
+
/**
 * Overwrite the stored DO GB-seconds value for a project.
 *
 * The value is written as a string under CB_KEYS.DO_GB_SECONDS_24H_PREFIX
 * followed by the project id, with a 24-hour expiration.
 *
 * @param env - Worker environment
 * @param project - Project identifier
 * @param gbSeconds - New GB-seconds value
 */
export async function setDOGbSecondsCount(
  env: Env,
  project: string,
  gbSeconds: number
): Promise<void> {
  const cacheKey = `${CB_KEYS.DO_GB_SECONDS_24H_PREFIX}${project}`;
  const serialised = String(gbSeconds);

  // 86400 seconds = 24 hours, so stale entries clean themselves up.
  await env.PLATFORM_CACHE.put(cacheKey, serialised, { expirationTtl: 86400 });
}
|
|
139
|
+
|
|
140
|
+
/**
 * Resolve the DO GB-seconds daily limit for a project.
 *
 * Delegates to getProjectSetting with the 'do_gb_seconds_daily_limit' key and
 * DEFAULT_PLATFORM_SETTINGS.doGbSecondsDailyLimit as the default value. The
 * lookup and fallback order is implemented in the platform-settings module.
 *
 * @param env - Worker environment
 * @param project - Project identifier
 * @returns DO GB-seconds threshold
 */
export async function getDOGbSecondsThreshold(env: Env, project: string): Promise<number> {
  const settingKey = 'do_gb_seconds_daily_limit';
  const fallbackLimit = DEFAULT_PLATFORM_SETTINGS.doGbSecondsDailyLimit;

  return getProjectSetting(env, project, settingKey, fallbackLimit);
}
|
|
157
|
+
|
|
158
|
+
// =============================================================================
|
|
159
|
+
// CIRCUIT BREAKER EVENT LOGGING
|
|
160
|
+
// =============================================================================
|
|
161
|
+
|
|
162
|
+
/**
 * Insert one row into circuit_breaker_logs as an audit trail entry.
 *
 * Optional numeric readings are stored as NULL only when they are absent;
 * a reading of 0 is stored as 0. do_gb_seconds_limit is stored only when a
 * DO GB-seconds reading was supplied.
 *
 * alert_sent / alert_channel reflect whether env.SLACK_WEBHOOK_URL is
 * configured; this function does not send or confirm any alert itself.
 *
 * @param env - Worker environment
 * @param eventType - Type of event (trip, reset, sample_reduce, sample_restore)
 * @param service - Service/project name
 * @param reason - Human-readable reason for the event
 * @param d1Writes24h - Current D1 write count (optional)
 * @param samplingMode - Current sampling mode (optional)
 * @param previousSamplingMode - Previous sampling mode (optional)
 * @param doGbSeconds24h - Current DO GB-seconds count (optional)
 * @param d1Limit - D1 write limit threshold
 * @param doGbSecondsLimit - DO GB-seconds limit threshold
 */
export async function logCircuitBreakerEvent(
  env: Env,
  eventType: 'trip' | 'reset' | 'sample_reduce' | 'sample_restore',
  service: string,
  reason: string,
  d1Writes24h?: number,
  samplingMode?: string,
  previousSamplingMode?: string,
  doGbSeconds24h?: number,
  d1Limit: number = DEFAULT_PLATFORM_SETTINGS.d1WriteLimit,
  doGbSecondsLimit: number = DEFAULT_PLATFORM_SETTINGS.doGbSecondsDailyLimit
): Promise<void> {
  // `== null` matches both undefined and null, so a genuine reading of 0 is
  // kept instead of being collapsed to NULL as `||` would do.
  const hasDoReading = !(doGbSeconds24h == null);
  const slackConfigured = Boolean(env.SLACK_WEBHOOK_URL);

  await env.PLATFORM_DB.prepare(
    `
    INSERT INTO circuit_breaker_logs (
      id, event_type, service, reason,
      d1_writes_24h, d1_limit, sampling_mode, previous_sampling_mode,
      do_gb_seconds_24h, do_gb_seconds_limit,
      alert_sent, alert_channel
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `
  )
    .bind(
      generateId(),
      eventType,
      service,
      reason,
      d1Writes24h ?? null,
      d1Limit,
      // Empty strings are treated as "no mode" and stored as NULL.
      samplingMode || null,
      previousSamplingMode || null,
      doGbSeconds24h ?? null,
      hasDoReading ? doGbSecondsLimit : null,
      slackConfigured ? 1 : 0,
      slackConfigured ? 'slack' : null
    )
    .run();
}
|
|
214
|
+
|
|
215
|
+
// =============================================================================
|
|
216
|
+
// SLACK ALERTING
|
|
217
|
+
// =============================================================================
|
|
218
|
+
|
|
219
|
+
/**
 * Shape of the JSON body that sendSlackAlert posts to the Slack webhook.
 */
export interface SlackAlertPayload {
  /** Top-level message text. */
  text: string;
  /** Optional attachments, each with a colour and a list of titled fields. */
  attachments?: {
    color: string;
    fields: { title: string; value: string; short?: boolean }[];
  }[];
}
|
|
229
|
+
|
|
230
|
+
/**
 * Post an alert to the Slack webhook configured in the environment.
 *
 * Does nothing when env.SLACK_WEBHOOK_URL is not set. Delivery errors are
 * logged with the SLACK_ERROR tag and never propagate to the caller.
 *
 * @param env - Worker environment
 * @param payload - Slack message payload
 */
export async function sendSlackAlert(env: Env, payload: SlackAlertPayload): Promise<void> {
  const webhookUrl = env.SLACK_WEBHOOK_URL;
  if (!webhookUrl) {
    return;
  }

  try {
    const requestInit = {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    };
    await fetchWithRetry(webhookUrl, requestInit);
  } catch (error) {
    // Only Error instances are forwarded to the logger; anything else is
    // logged without an error object.
    const cause = error instanceof Error ? error : undefined;
    const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:slack');
    log.error('Failed to send Slack alert', cause, { tag: 'SLACK_ERROR' });
  }
}
|
|
252
|
+
|
|
253
|
+
// =============================================================================
|
|
254
|
+
// DASHBOARD NOTIFICATIONS
|
|
255
|
+
// =============================================================================
|
|
256
|
+
|
|
257
|
+
/**
 * JSON body sent to the platform-notifications API by
 * createDashboardNotification.
 */
interface NotificationPayload {
  /** Notification category. */
  category:
    | 'error'
    | 'warning'
    | 'info'
    | 'success';
  /** Originating subsystem (this file uses 'circuit-breaker'). */
  source: string;
  source_id?: string;
  title: string;
  description?: string;
  /** Notification priority. */
  priority:
    | 'critical'
    | 'high'
    | 'medium'
    | 'low'
    | 'info';
  /** Optional link target and its label. */
  action_url?: string;
  action_label?: string;
  /** Related project, if any. */
  project?: string | null;
}
|
|
271
|
+
|
|
272
|
+
/**
 * Create a dashboard notification via the platform-notifications API.
 *
 * Best-effort: does nothing when the binding is absent, and never throws.
 * Both thrown errors and non-2xx responses are written to console.error.
 *
 * @param api - The NOTIFICATIONS_API fetcher binding
 * @param payload - Notification data
 */
async function createDashboardNotification(
  api: Fetcher | undefined,
  payload: NotificationPayload
): Promise<void> {
  if (!api) return;

  try {
    // Service binding URL — the hostname is ignored; only the path matters
    const response = await api.fetch('https://platform-notifications.internal/notifications', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

    // fetch resolves on HTTP error statuses, so a rejected notification
    // would otherwise pass unnoticed.
    if (!response.ok) {
      console.error(`Failed to create dashboard notification: HTTP ${response.status}`);
    }
  } catch (error) {
    // Notifications are non-critical: log and continue rather than rethrow.
    console.error('Failed to create dashboard notification:', error);
  }
}
|
|
296
|
+
|
|
297
|
+
// =============================================================================
|
|
298
|
+
// PROJECT CB KEY HELPER
|
|
299
|
+
// =============================================================================
|
|
300
|
+
|
|
301
|
+
/**
 * Build the circuit breaker KV key for each registered project.
 *
 * Reads up to 50 project ids from project_registry in D1 (excluding 'all')
 * and maps each to `PROJECT:{PROJECT_ID_UPPERCASE}:STATUS`. Hyphens in the
 * project id are kept as-is. A 'platform' entry is always present.
 *
 * If the query fails for any reason (for example, project_registry does not
 * exist yet), only the 'platform' entry is returned.
 *
 * TODO: Ensure your projects are registered in project_registry.
 *
 * @param env - Worker environment
 * @returns Record mapping project ID to its CB KV key
 */
async function getProjectCBKeys(env: Env): Promise<Record<string, string>> {
  try {
    const rows = await env.PLATFORM_DB.prepare(
      `SELECT project_id FROM project_registry WHERE project_id != 'all' LIMIT 50`
    ).all<{ project_id: string }>();

    const keys: Record<string, string> = {};
    for (const row of rows.results ?? []) {
      // The former `.replace(/-/g, '-')` swapped hyphens for hyphens (a
      // no-op); dropping it leaves the generated keys unchanged.
      keys[row.project_id] = `PROJECT:${row.project_id.toUpperCase()}:STATUS`;
    }

    // Always include 'platform' as a fallback
    if (!keys['platform']) {
      keys['platform'] = 'PROJECT:PLATFORM:STATUS';
    }

    return keys;
  } catch {
    // Fallback if project_registry doesn't exist yet
    return {
      platform: 'PROJECT:PLATFORM:STATUS',
    };
  }
}
|
|
335
|
+
|
|
336
|
+
// =============================================================================
|
|
337
|
+
// PROJECT-LEVEL CIRCUIT BREAKER CHECKING
|
|
338
|
+
// =============================================================================
|
|
339
|
+
|
|
340
|
+
/**
|
|
341
|
+
* Check and update circuit breakers for all registered projects based on usage limits.
|
|
342
|
+
*
|
|
343
|
+
* Tiered approach (HARD_LIMIT_MULTIPLIER = 1.5):
|
|
344
|
+
* - CLOSED (active): usage < limit - normal operation
|
|
345
|
+
* - WARNING: usage >= limit but < limit*1.5 - requests pass with warning logged
|
|
346
|
+
* - OPEN (paused): usage >= limit*1.5 - requests blocked with 503
|
|
347
|
+
*
|
|
348
|
+
* This allows background jobs to complete even when slightly over budget,
|
|
349
|
+
* while still alerting operators.
|
|
350
|
+
*
|
|
351
|
+
* @param env - Worker environment
|
|
352
|
+
* @returns True if any circuit breaker was tripped
|
|
353
|
+
*/
|
|
354
|
+
export async function checkAndTripCircuitBreakers(env: Env): Promise<boolean> {
|
|
355
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:circuitbreaker');
|
|
356
|
+
let tripped = false;
|
|
357
|
+
|
|
358
|
+
// Fetch settings and D1 writes in parallel
|
|
359
|
+
const [settings, writes24h] = await Promise.all([getPlatformSettings(env), getD1WriteCount(env)]);
|
|
360
|
+
const d1WriteLimit = settings.d1WriteLimit;
|
|
361
|
+
|
|
362
|
+
// Check D1 write limit (global)
|
|
363
|
+
const d1Status = determineCircuitBreakerStatus(writes24h, d1WriteLimit);
|
|
364
|
+
const hardLimit = d1WriteLimit * HARD_LIMIT_MULTIPLIER;
|
|
365
|
+
|
|
366
|
+
if (d1Status === CB_STATUS.OPEN) {
|
|
367
|
+
// OPEN: Hard limit exceeded - block all requests
|
|
368
|
+
log.info('D1 writes exceeded HARD limit, setting status to OPEN (paused)', {
|
|
369
|
+
tag: 'CB_OPEN',
|
|
370
|
+
writes24h,
|
|
371
|
+
hardLimit,
|
|
372
|
+
});
|
|
373
|
+
|
|
374
|
+
// Set OPEN status for all registered projects (24h expiry)
|
|
375
|
+
// TODO: Add your project IDs to project_registry in D1
|
|
376
|
+
const projectCBKeys = await getProjectCBKeys(env);
|
|
377
|
+
for (const cbKey of Object.values(projectCBKeys)) {
|
|
378
|
+
await env.PLATFORM_CACHE.put(cbKey, CB_STATUS.OPEN, { expirationTtl: 86400 });
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
// Log the event
|
|
382
|
+
await logCircuitBreakerEvent(
|
|
383
|
+
env,
|
|
384
|
+
'trip',
|
|
385
|
+
'all',
|
|
386
|
+
`D1 writes exceeded hard limit ${hardLimit.toLocaleString()} (1.5x soft limit)`,
|
|
387
|
+
writes24h,
|
|
388
|
+
undefined, // samplingMode
|
|
389
|
+
undefined, // previousSamplingMode
|
|
390
|
+
undefined, // doGbSeconds24h
|
|
391
|
+
d1WriteLimit
|
|
392
|
+
);
|
|
393
|
+
|
|
394
|
+
// Send Slack alert for OPEN
|
|
395
|
+
if (env.SLACK_WEBHOOK_URL) {
|
|
396
|
+
const projectNames = Object.keys(projectCBKeys).join(', ');
|
|
397
|
+
await sendSlackAlert(env, {
|
|
398
|
+
text: ':rotating_light: Circuit Breaker OPEN - Requests Blocked',
|
|
399
|
+
attachments: [
|
|
400
|
+
{
|
|
401
|
+
color: 'danger',
|
|
402
|
+
fields: [
|
|
403
|
+
{ title: 'Event', value: 'D1 write HARD limit exceeded', short: true },
|
|
404
|
+
{ title: 'Writes (24h)', value: writes24h.toLocaleString(), short: true },
|
|
405
|
+
{ title: 'Soft Limit', value: d1WriteLimit.toLocaleString(), short: true },
|
|
406
|
+
{ title: 'Hard Limit (1.5x)', value: hardLimit.toLocaleString(), short: true },
|
|
407
|
+
{ title: 'Status', value: 'OPEN (paused)', short: true },
|
|
408
|
+
{ title: 'Action', value: `Projects blocked for 24h: ${projectNames}`, short: true },
|
|
409
|
+
],
|
|
410
|
+
},
|
|
411
|
+
],
|
|
412
|
+
});
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
// Create dashboard notification for OPEN state
|
|
416
|
+
await createDashboardNotification(env.NOTIFICATIONS_API, {
|
|
417
|
+
category: 'error',
|
|
418
|
+
source: 'circuit-breaker',
|
|
419
|
+
title: 'Circuit Breaker OPEN - All Requests Blocked',
|
|
420
|
+
description: `D1 writes (${writes24h.toLocaleString()}) exceeded hard limit (${hardLimit.toLocaleString()}). All projects blocked for 24h.`,
|
|
421
|
+
priority: 'critical',
|
|
422
|
+
action_url: '/circuit-breakers',
|
|
423
|
+
action_label: 'View Status',
|
|
424
|
+
project: 'platform',
|
|
425
|
+
});
|
|
426
|
+
|
|
427
|
+
tripped = true;
|
|
428
|
+
} else if (d1Status === CB_STATUS.WARNING) {
|
|
429
|
+
// WARNING: Soft limit exceeded - allow requests but log warning
|
|
430
|
+
log.info('D1 writes exceeded soft limit, setting status to WARNING', {
|
|
431
|
+
tag: 'CB_WARNING',
|
|
432
|
+
writes24h,
|
|
433
|
+
softLimit: d1WriteLimit,
|
|
434
|
+
});
|
|
435
|
+
|
|
436
|
+
// Set WARNING status for all registered projects (24h expiry)
|
|
437
|
+
const projectCBKeysWarn = await getProjectCBKeys(env);
|
|
438
|
+
for (const cbKey of Object.values(projectCBKeysWarn)) {
|
|
439
|
+
await env.PLATFORM_CACHE.put(cbKey, CB_STATUS.WARNING, { expirationTtl: 86400 });
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
// Log the event as 'warning' (not 'trip')
|
|
443
|
+
await logCircuitBreakerEvent(
|
|
444
|
+
env,
|
|
445
|
+
'sample_reduce', // Reusing for warning events
|
|
446
|
+
'all',
|
|
447
|
+
`D1 writes exceeded soft limit ${d1WriteLimit.toLocaleString()}`,
|
|
448
|
+
writes24h,
|
|
449
|
+
'warning',
|
|
450
|
+
undefined, // previousSamplingMode
|
|
451
|
+
undefined, // doGbSeconds24h
|
|
452
|
+
d1WriteLimit
|
|
453
|
+
);
|
|
454
|
+
|
|
455
|
+
// Send Slack alert for WARNING
|
|
456
|
+
if (env.SLACK_WEBHOOK_URL) {
|
|
457
|
+
await sendSlackAlert(env, {
|
|
458
|
+
text: ':warning: Circuit Breaker WARNING - Budget Exceeded',
|
|
459
|
+
attachments: [
|
|
460
|
+
{
|
|
461
|
+
color: 'warning',
|
|
462
|
+
fields: [
|
|
463
|
+
{ title: 'Event', value: 'D1 write soft limit exceeded', short: true },
|
|
464
|
+
{ title: 'Writes (24h)', value: writes24h.toLocaleString(), short: true },
|
|
465
|
+
{ title: 'Soft Limit', value: d1WriteLimit.toLocaleString(), short: true },
|
|
466
|
+
{ title: 'Hard Limit (1.5x)', value: hardLimit.toLocaleString(), short: true },
|
|
467
|
+
{ title: 'Status', value: 'WARNING (requests allowed)', short: true },
|
|
468
|
+
{ title: 'Action', value: 'Monitoring - will block at hard limit', short: true },
|
|
469
|
+
],
|
|
470
|
+
},
|
|
471
|
+
],
|
|
472
|
+
});
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
// Create dashboard notification for WARNING state
|
|
476
|
+
await createDashboardNotification(env.NOTIFICATIONS_API, {
|
|
477
|
+
category: 'warning',
|
|
478
|
+
source: 'circuit-breaker',
|
|
479
|
+
title: 'Circuit Breaker WARNING - Budget Exceeded',
|
|
480
|
+
description: `D1 writes (${writes24h.toLocaleString()}) exceeded soft limit (${d1WriteLimit.toLocaleString()}). Will block at ${hardLimit.toLocaleString()}.`,
|
|
481
|
+
priority: 'high',
|
|
482
|
+
action_url: '/circuit-breakers',
|
|
483
|
+
action_label: 'View Status',
|
|
484
|
+
project: 'platform',
|
|
485
|
+
});
|
|
486
|
+
} else {
|
|
487
|
+
// CLOSED: Under limit - ensure status is reset to active
|
|
488
|
+
const projectCBKeysClosed = await getProjectCBKeys(env);
|
|
489
|
+
for (const cbKey of Object.values(projectCBKeysClosed)) {
|
|
490
|
+
await env.PLATFORM_CACHE.put(cbKey, CB_STATUS.CLOSED, { expirationTtl: 86400 });
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
// Check DO GB-seconds per project
|
|
495
|
+
// Uses the same dynamic project CB keys from project_registry
|
|
496
|
+
const projectStatusKeys = await getProjectCBKeys(env);
|
|
497
|
+
|
|
498
|
+
for (const [project, statusKey] of Object.entries(projectStatusKeys)) {
|
|
499
|
+
const gbSeconds24h = await getDOGbSecondsCount(env, project);
|
|
500
|
+
const threshold = await getDOGbSecondsThreshold(env, project);
|
|
501
|
+
const doStatus = determineCircuitBreakerStatus(gbSeconds24h, threshold);
|
|
502
|
+
const doHardLimit = threshold * HARD_LIMIT_MULTIPLIER;
|
|
503
|
+
|
|
504
|
+
if (doStatus === CB_STATUS.OPEN) {
|
|
505
|
+
// OPEN: Hard limit exceeded - block requests
|
|
506
|
+
log.info('DO GB-seconds exceeded HARD limit, setting status to OPEN', {
|
|
507
|
+
tag: 'CB_DO_OPEN',
|
|
508
|
+
project,
|
|
509
|
+
gbSeconds24h: Math.round(gbSeconds24h),
|
|
510
|
+
hardLimit: Math.round(doHardLimit),
|
|
511
|
+
});
|
|
512
|
+
|
|
513
|
+
await env.PLATFORM_CACHE.put(statusKey, CB_STATUS.OPEN, { expirationTtl: 86400 });
|
|
514
|
+
|
|
515
|
+
await logCircuitBreakerEvent(
|
|
516
|
+
env,
|
|
517
|
+
'trip',
|
|
518
|
+
project,
|
|
519
|
+
`DO GB-seconds exceeded hard limit ${doHardLimit.toFixed(0)} (1.5x soft limit)`,
|
|
520
|
+
undefined, // d1Writes24h
|
|
521
|
+
undefined, // samplingMode
|
|
522
|
+
undefined, // previousSamplingMode
|
|
523
|
+
gbSeconds24h,
|
|
524
|
+
d1WriteLimit,
|
|
525
|
+
threshold
|
|
526
|
+
);
|
|
527
|
+
|
|
528
|
+
if (env.SLACK_WEBHOOK_URL) {
|
|
529
|
+
const estimatedCost = (gbSeconds24h / 1_000_000) * 12.5;
|
|
530
|
+
await sendSlackAlert(env, {
|
|
531
|
+
text: ':rotating_light: DO Circuit Breaker OPEN',
|
|
532
|
+
attachments: [
|
|
533
|
+
{
|
|
534
|
+
color: 'danger',
|
|
535
|
+
fields: [
|
|
536
|
+
{ title: 'Project', value: project, short: true },
|
|
537
|
+
{ title: 'Event', value: 'DO GB-seconds HARD limit exceeded', short: true },
|
|
538
|
+
{
|
|
539
|
+
title: 'GB-seconds (24h)',
|
|
540
|
+
value: gbSeconds24h.toLocaleString(undefined, { maximumFractionDigits: 0 }),
|
|
541
|
+
short: true,
|
|
542
|
+
},
|
|
543
|
+
{
|
|
544
|
+
title: 'Soft Limit',
|
|
545
|
+
value: threshold.toLocaleString(undefined, { maximumFractionDigits: 0 }),
|
|
546
|
+
short: true,
|
|
547
|
+
},
|
|
548
|
+
{
|
|
549
|
+
title: 'Hard Limit (1.5x)',
|
|
550
|
+
value: doHardLimit.toLocaleString(undefined, { maximumFractionDigits: 0 }),
|
|
551
|
+
short: true,
|
|
552
|
+
},
|
|
553
|
+
{
|
|
554
|
+
title: 'Est. Cost',
|
|
555
|
+
value: `$${estimatedCost.toFixed(2)}`,
|
|
556
|
+
short: true,
|
|
557
|
+
},
|
|
558
|
+
{ title: 'Status', value: 'OPEN (paused)', short: true },
|
|
559
|
+
{ title: 'Action', value: `${project} blocked for 24h`, short: true },
|
|
560
|
+
],
|
|
561
|
+
},
|
|
562
|
+
],
|
|
563
|
+
});
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
// Create dashboard notification for DO OPEN state
|
|
567
|
+
const estimatedCostOpen = (gbSeconds24h / 1_000_000) * 12.5;
|
|
568
|
+
await createDashboardNotification(env.NOTIFICATIONS_API, {
|
|
569
|
+
category: 'error',
|
|
570
|
+
source: 'circuit-breaker',
|
|
571
|
+
title: `Circuit Breaker OPEN - ${project} Blocked`,
|
|
572
|
+
description: `DO GB-seconds (${Math.round(gbSeconds24h).toLocaleString()}) exceeded hard limit. Est. cost: $${estimatedCostOpen.toFixed(2)}`,
|
|
573
|
+
priority: 'critical',
|
|
574
|
+
action_url: '/circuit-breakers',
|
|
575
|
+
action_label: 'View Status',
|
|
576
|
+
project,
|
|
577
|
+
});
|
|
578
|
+
|
|
579
|
+
tripped = true;
|
|
580
|
+
} else if (doStatus === CB_STATUS.WARNING) {
|
|
581
|
+
// WARNING: Soft limit exceeded - allow with logging
|
|
582
|
+
log.info('DO GB-seconds exceeded soft limit, setting status to WARNING', {
|
|
583
|
+
tag: 'CB_DO_WARNING',
|
|
584
|
+
project,
|
|
585
|
+
gbSeconds24h: Math.round(gbSeconds24h),
|
|
586
|
+
softLimit: threshold,
|
|
587
|
+
});
|
|
588
|
+
|
|
589
|
+
await env.PLATFORM_CACHE.put(statusKey, CB_STATUS.WARNING, { expirationTtl: 86400 });
|
|
590
|
+
|
|
591
|
+
await logCircuitBreakerEvent(
|
|
592
|
+
env,
|
|
593
|
+
'sample_reduce',
|
|
594
|
+
project,
|
|
595
|
+
`DO GB-seconds exceeded soft limit ${threshold.toFixed(0)}`,
|
|
596
|
+
undefined, // d1Writes24h
|
|
597
|
+
'warning', // samplingMode
|
|
598
|
+
undefined, // previousSamplingMode
|
|
599
|
+
gbSeconds24h,
|
|
600
|
+
d1WriteLimit,
|
|
601
|
+
threshold
|
|
602
|
+
);
|
|
603
|
+
|
|
604
|
+
if (env.SLACK_WEBHOOK_URL) {
|
|
605
|
+
const estimatedCost = (gbSeconds24h / 1_000_000) * 12.5;
|
|
606
|
+
await sendSlackAlert(env, {
|
|
607
|
+
text: ':warning: DO Circuit Breaker WARNING',
|
|
608
|
+
attachments: [
|
|
609
|
+
{
|
|
610
|
+
color: 'warning',
|
|
611
|
+
fields: [
|
|
612
|
+
{ title: 'Project', value: project, short: true },
|
|
613
|
+
{ title: 'Event', value: 'DO GB-seconds soft limit exceeded', short: true },
|
|
614
|
+
{
|
|
615
|
+
title: 'GB-seconds (24h)',
|
|
616
|
+
value: gbSeconds24h.toLocaleString(undefined, { maximumFractionDigits: 0 }),
|
|
617
|
+
short: true,
|
|
618
|
+
},
|
|
619
|
+
{
|
|
620
|
+
title: 'Soft Limit',
|
|
621
|
+
value: threshold.toLocaleString(undefined, { maximumFractionDigits: 0 }),
|
|
622
|
+
short: true,
|
|
623
|
+
},
|
|
624
|
+
{
|
|
625
|
+
title: 'Hard Limit (1.5x)',
|
|
626
|
+
value: doHardLimit.toLocaleString(undefined, { maximumFractionDigits: 0 }),
|
|
627
|
+
short: true,
|
|
628
|
+
},
|
|
629
|
+
{
|
|
630
|
+
title: 'Est. Cost',
|
|
631
|
+
value: `$${estimatedCost.toFixed(2)}`,
|
|
632
|
+
short: true,
|
|
633
|
+
},
|
|
634
|
+
{ title: 'Status', value: 'WARNING (requests allowed)', short: true },
|
|
635
|
+
{ title: 'Action', value: 'Monitoring - will block at hard limit', short: true },
|
|
636
|
+
],
|
|
637
|
+
},
|
|
638
|
+
],
|
|
639
|
+
});
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
// Create dashboard notification for DO WARNING state
|
|
643
|
+
const estimatedCostWarning = (gbSeconds24h / 1_000_000) * 12.5;
|
|
644
|
+
await createDashboardNotification(env.NOTIFICATIONS_API, {
|
|
645
|
+
category: 'warning',
|
|
646
|
+
source: 'circuit-breaker',
|
|
647
|
+
title: `Circuit Breaker WARNING - ${project} Budget Exceeded`,
|
|
648
|
+
description: `DO GB-seconds (${Math.round(gbSeconds24h).toLocaleString()}) exceeded soft limit. Est. cost: $${estimatedCostWarning.toFixed(2)}`,
|
|
649
|
+
priority: 'high',
|
|
650
|
+
action_url: '/circuit-breakers',
|
|
651
|
+
action_label: 'View Status',
|
|
652
|
+
project,
|
|
653
|
+
});
|
|
654
|
+
} else {
|
|
655
|
+
// CLOSED: Under limit - reset to active
|
|
656
|
+
await env.PLATFORM_CACHE.put(statusKey, CB_STATUS.CLOSED, { expirationTtl: 86400 });
|
|
657
|
+
}
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
return tripped;
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
// =============================================================================
|
|
664
|
+
// FEATURE-LEVEL BUDGET CHECKING
|
|
665
|
+
// =============================================================================
|
|
666
|
+
|
|
667
|
+
/**
 * Check a feature's metrics against its configured budget and update its status.
 * Reads budget from CONFIG:FEATURE:{key}:BUDGET, writes to CONFIG:FEATURE:{key}:STATUS.
 *
 * This is called during queue processing for each telemetry message to enforce
 * feature-level circuit breakers based on configured budgets.
 *
 * Outcomes:
 * - No budget stored in KV for the feature: returns immediately.
 * - A metric is at 70% or more of its limit without exceeding it: a Slack warning
 *   is sent, deduplicated per feature/metric/threshold (70 or 90) for one hour.
 * - A metric exceeds its limit: the status key is set to 'STOP' for one hour,
 *   a Slack alert is sent when SLACK_WEBHOOK_URL is set, and a 'trip' row is
 *   inserted into feature_circuit_breaker_events.
 *
 * Errors raised while checking are caught and logged rather than propagated, so
 * a failing budget check does not fail the caller's telemetry write.
 *
 * @param featureKey - Feature identifier (e.g., 'my-app:scanner:harvest')
 * @param metrics - Feature metrics from telemetry message
 * @param env - Worker environment
 */
export async function checkAndUpdateBudgetStatus(
  featureKey: string,
  metrics: FeatureMetrics,
  env: Env
): Promise<void> {
  const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:budget');
  const budgetKvKey = `CONFIG:FEATURE:${featureKey}:BUDGET`;
  const statusKey = `CONFIG:FEATURE:${featureKey}:STATUS`;

  try {
    const budgetJson = await env.PLATFORM_CACHE.get(budgetKvKey);
    if (!budgetJson) {
      // No budget configured for this feature - skip checking
      return;
    }

    const budget = JSON.parse(budgetJson) as DailyLimits;

    // The first segment of the feature key is the project; the remainder is the feature name.
    const [project, ...featureParts] = featureKey.split(':');
    const featureName = featureParts.join(':') || featureKey;

    // Violations are kept as structured records so the exact numbers can be
    // reported later without re-parsing a formatted string.
    const violations: Array<{ metricKey: string; value: number; limit: number }> = [];
    const warnings: Array<{ metricKey: string; value: number; limit: number; percent: number }> = [];

    // Check each metric against budget
    for (const [metricKey, value] of Object.entries(metrics)) {
      if (value === undefined || value === 0) continue;

      const budgetKey = METRIC_TO_BUDGET_KEY[metricKey as keyof typeof METRIC_TO_BUDGET_KEY];
      if (!budgetKey) continue;

      const rawLimit = budget[budgetKey];
      if (rawLimit === undefined) continue;
      // Defense-in-depth: YAML 1.2 may store "1_000" as string in KV.
      // Type says number but runtime may be string from JSON.parse of KV value.
      const rawLimitAny = rawLimit as unknown;
      const limit = typeof rawLimitAny === 'string' ? Number(rawLimitAny.replace(/_/g, '')) : Number(rawLimitAny);
      // A limit of 0 (or an unparseable one) is treated as "no limit for this metric".
      if (Number.isNaN(limit) || limit === 0) continue;

      const numValue = value as number;
      if (numValue > limit) {
        violations.push({ metricKey, value: numValue, limit });
      } else {
        const percent = (numValue / limit) * 100;
        if (percent >= 70) {
          warnings.push({ metricKey, value: numValue, limit, percent });
        }
      }
    }

    // Send warnings for metrics approaching budget limits (70% and 90%)
    for (const warn of warnings) {
      const threshold = warn.percent >= 90 ? 90 : 70;
      const dedupKey = `BUDGET_WARN:${featureKey}:${warn.metricKey}:${threshold}`;
      try {
        const alreadySent = await env.PLATFORM_CACHE.get(dedupKey);
        if (!alreadySent) {
          // The dedup marker is written before sending, so a failed send is not
          // retried until the marker expires (1 hour).
          await env.PLATFORM_CACHE.put(dedupKey, '1', { expirationTtl: 3600 });
          await sendSlackAlert(env, {
            text: `:warning: Feature Budget Warning (${threshold}%)`,
            attachments: [{
              color: threshold >= 90 ? '#ff9800' : '#ffc107',
              fields: [
                { title: 'Feature', value: `${featureName} (${featureKey})`, short: false },
                { title: 'Project', value: project, short: true },
                { title: 'Metric', value: warn.metricKey, short: true },
                {
                  title: 'Usage',
                  value: `${warn.percent.toFixed(0)}% (${warn.value.toLocaleString()} / ${warn.limit.toLocaleString()})`,
                  short: false,
                },
              ],
            }],
          });
        }
      } catch (warnError) {
        // A failed warning must not stop the remaining warnings or the trip logic.
        log.error(`Failed to send budget warning for ${featureKey}`, warnError);
      }
    }

    if (violations.length > 0) {
      // Human-readable summary, one "metricKey=value>limit" entry per violation.
      const reason = violations
        .map((violation) => `${violation.metricKey}=${violation.value}>${violation.limit}`)
        .join(', ');
      const trippedAt = new Date().toISOString();

      // Trip the circuit breaker in KV
      await env.PLATFORM_CACHE.put(statusKey, 'STOP', {
        metadata: { reason, trippedAt },
        expirationTtl: 3600,
      });

      // Only the first violation is recorded in the dedicated detail columns;
      // the full list is preserved in `reason`.
      const firstViolation = violations[0];
      const violatedResource = firstViolation.metricKey;
      const currentValue = firstViolation.value;
      const budgetLimit = firstViolation.limit;

      // Send Slack alert for feature-level circuit breaker trip
      let alertSent = 0;
      if (env.SLACK_WEBHOOK_URL) {
        try {
          await sendSlackAlert(env, {
            text: `:zap: Feature Circuit Breaker Tripped`,
            attachments: [
              {
                color: 'danger',
                fields: [
                  { title: 'Feature', value: featureKey, short: false },
                  { title: 'Project', value: project, short: true },
                  { title: 'Status', value: 'STOP (blocked)', short: true },
                  { title: 'Violation', value: reason, short: false },
                  {
                    title: 'Violated Resource',
                    value: violatedResource,
                    short: true,
                  },
                  {
                    title: 'Current / Limit',
                    value: `${currentValue.toLocaleString()} / ${budgetLimit.toLocaleString()}`,
                    short: true,
                  },
                  { title: 'Time', value: trippedAt, short: false },
                ],
              },
            ],
          });
          alertSent = 1;
        } catch (slackError) {
          log.error(`Failed to send Slack alert for ${featureKey}`, slackError);
        }
      }

      // Log to D1 for historical tracking
      try {
        await env.PLATFORM_DB.prepare(
          `INSERT INTO feature_circuit_breaker_events
           (id, feature_key, event_type, reason, violated_resource, current_value, budget_limit, auto_reset, alert_sent, created_at)
           VALUES (?1, ?2, 'trip', ?3, ?4, ?5, ?6, 0, ?7, unixepoch())`
        )
          .bind(
            crypto.randomUUID(),
            featureKey,
            reason,
            violatedResource,
            // Non-finite numbers are stored as NULL rather than bound as-is.
            Number.isFinite(currentValue) ? currentValue : null,
            Number.isFinite(budgetLimit) ? budgetLimit : null,
            alertSent
          )
          .run();
      } catch (d1Error) {
        // D1 logging failure should not prevent KV trip
        log.error(`Failed to log CB event to D1 for ${featureKey}`, d1Error);
      }

      log.warn(`${featureKey} exceeded: ${reason}`, { alertSent });
    }
  } catch (error) {
    // Budget check failures should not fail the telemetry write
    log.error(`Error checking ${featureKey}`, error);
  }
}
|
|
843
|
+
|
|
844
|
+
// =============================================================================
|
|
845
|
+
// MONTHLY BUDGET CHECK (runs at midnight UTC)
|
|
846
|
+
// =============================================================================
|
|
847
|
+
|
|
848
|
+
/**
 * Lookup table from a DailyLimits budget key to the daily_usage_rollups column
 * that holds the matching usage figure.
 * Budget keys that are absent here have no rollup column and are skipped by
 * the monthly check.
 */
const MONTHLY_METRIC_TO_COLUMN: Record<string, string> = {
  // D1 — `d1_writes` and `d1_rows_written` both resolve to the same column
  d1_writes: 'd1_rows_written',
  d1_rows_written: 'd1_rows_written',
  d1_rows_read: 'd1_rows_read',

  // KV
  kv_reads: 'kv_reads',
  kv_writes: 'kv_writes',
  kv_deletes: 'kv_deletes',

  // R2
  r2_class_a: 'r2_class_a_ops',
  r2_class_b: 'r2_class_b_ops',

  // Workers AI
  ai_requests: 'workersai_requests',
  ai_neurons: 'workersai_neurons',

  // Workers / Queues
  requests: 'workers_requests',
  queue_messages: 'queues_messages_produced',

  // Vectorize
  vectorize_queries: 'vectorize_queries',
  vectorize_inserts: 'vectorize_inserts',
};

/** Every column name that appears in the table above; used as an allowlist of safe column names. */
const ALLOWED_MONTHLY_COLUMNS = new Set<string>(Object.values(MONTHLY_METRIC_TO_COLUMN));

// TODO: Add your project IDs here (must match project_registry in D1)
const MONTHLY_PROJECTS = ['all', 'platform'] as const;
|
|
874
|
+
|
|
875
|
+
/**
 * Check monthly budget usage against limits.
 * Runs once daily at midnight. Sums daily_usage_rollups for the current calendar month
 * and compares against monthly limits stored in KV (BUDGET_MONTHLY keys).
 *
 * Only features that have a CONFIG:FEATURE:{key}:BUDGET_MONTHLY entry are checked.
 * This function does not derive monthly limits from daily limits; a feature
 * without an explicit BUDGET_MONTHLY entry is not evaluated here.
 *
 * For each configured limit a Slack alert is sent when month-to-date usage
 * exceeds the limit, or reaches 70% / 90% of it. Alerts are deduplicated per
 * feature/metric/level for 24 hours.
 *
 * Any error aborts the remaining checks, is logged, and the count reached so
 * far is returned.
 *
 * @param env - Worker environment
 * @returns Number of monthly violations detected
 */
export async function checkMonthlyBudgets(env: Env): Promise<number> {
  const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:monthly-budget');
  let violations = 0;

  const keyPrefix = 'CONFIG:FEATURE:';
  const keySuffix = ':BUDGET_MONTHLY';

  try {
    // Collect feature keys that have BUDGET_MONTHLY entries.
    // KV list() is paginated: keep following the cursor until the listing is
    // complete, otherwise budgets beyond the first page would never be checked.
    const monthlyFeatures: Array<{ featureKey: string; limits: DailyLimits }> = [];
    const listOptions: { prefix: string; limit: number; cursor?: string } = {
      prefix: keyPrefix,
      limit: 1000,
    };
    for (;;) {
      const page = await env.PLATFORM_CACHE.list(listOptions);

      for (const key of page.keys) {
        if (!key.name.endsWith(keySuffix)) continue;
        // Every listed key starts with the prefix and (after the check above)
        // ends with the suffix, so the feature key is what lies between them.
        const featureKey = key.name.slice(keyPrefix.length, -keySuffix.length);
        const limitsJson = await env.PLATFORM_CACHE.get(key.name);
        if (!limitsJson) continue;
        try {
          monthlyFeatures.push({ featureKey, limits: JSON.parse(limitsJson) as DailyLimits });
        } catch {
          log.error(`Invalid monthly budget JSON for ${featureKey}`);
        }
      }

      // Stop on the last page; the cursor guard also prevents an endless loop
      // should an incomplete page ever come back without a cursor.
      if (page.list_complete || !page.cursor) break;
      listOptions.cursor = page.cursor;
    }

    if (monthlyFeatures.length === 0) {
      log.info('No features with monthly budgets configured');
      return 0;
    }

    // Same for every alert in this run (UTC, matching SQLite's date('now')).
    const period = `${new Date().toISOString().slice(0, 7)} (month to date)`;

    // For each project, get monthly totals from daily_usage_rollups
    for (const project of MONTHLY_PROJECTS) {
      // Get the monthly sum for this project
      const monthlyTotals = await env.PLATFORM_DB.prepare(`
        SELECT
          SUM(d1_rows_written) as d1_rows_written,
          SUM(d1_rows_read) as d1_rows_read,
          SUM(kv_reads) as kv_reads,
          SUM(kv_writes) as kv_writes,
          SUM(kv_deletes) as kv_deletes,
          SUM(r2_class_a_ops) as r2_class_a_ops,
          SUM(r2_class_b_ops) as r2_class_b_ops,
          SUM(workersai_requests) as workersai_requests,
          SUM(workersai_neurons) as workersai_neurons,
          SUM(workers_requests) as workers_requests,
          SUM(queues_messages_produced) as queues_messages_produced,
          SUM(vectorize_queries) as vectorize_queries,
          SUM(vectorize_inserts) as vectorize_inserts
        FROM daily_usage_rollups
        WHERE project = ? AND snapshot_date >= date('now', 'start of month')
        LIMIT 1
      `).bind(project).first<Record<string, number | null>>();

      if (!monthlyTotals) continue;

      // Check each feature that maps to this project
      for (const { featureKey, limits } of monthlyFeatures) {
        const [featureProject] = featureKey.split(':');
        // A specific project only checks features carrying its own prefix.
        if (project !== 'all' && featureProject !== project) continue;
        // The 'all' totals are applied only to features prefixed 'all' or 'platform'.
        // NOTE(review): 'platform' features are therefore evaluated in both passes;
        // the dedup key has no project component, so only the first alert is sent,
        // but `violations` is incremented in each pass. Confirm this is intended.
        if (project === 'all' && featureProject !== 'all' && featureProject !== 'platform') continue;

        for (const [limitKey, rawLimitValue] of Object.entries(limits)) {
          if (rawLimitValue === undefined || rawLimitValue === 0) continue;
          const column = MONTHLY_METRIC_TO_COLUMN[limitKey];
          if (!column || !ALLOWED_MONTHLY_COLUMNS.has(column)) continue;

          // Defense-in-depth: YAML 1.2 may store "1_000_000" as string in KV.
          // Type says number but runtime may be string from JSON.parse of KV value.
          const rawAny = rawLimitValue as unknown;
          const limitValue = typeof rawAny === 'string'
            ? Number(rawAny.replace(/_/g, ''))
            : Number(rawAny);
          if (Number.isNaN(limitValue) || limitValue === 0) continue;

          // SUM() yields NULL when no rows matched; treat that as no usage.
          const currentValue = monthlyTotals[column] ?? 0;
          if (currentValue === 0) continue;

          const percent = (currentValue / limitValue) * 100;
          const exceeded = currentValue > limitValue;
          // Below the 70% warning threshold and not over the limit: nothing to report.
          if (!exceeded && percent < 70) continue;

          if (exceeded) {
            violations++;
          }

          // Warning alerts are bucketed at 70% or 90%; exceeded alerts have their own bucket.
          const threshold = percent >= 90 ? 90 : 70;
          const level = exceeded ? 'exceeded' : String(threshold);
          const dedupKey = `BUDGET_WARN_MONTHLY:${featureKey}:${limitKey}:${level}`;
          const alreadySent = await env.PLATFORM_CACHE.get(dedupKey);
          if (alreadySent) continue;

          await env.PLATFORM_CACHE.put(dedupKey, '1', { expirationTtl: 86400 }); // 24hr dedup

          const warningColor = threshold >= 90 ? '#ff9800' : '#ffc107';
          await sendSlackAlert(env, {
            text: exceeded
              ? `:rotating_light: Monthly Budget Exceeded`
              : `:warning: Monthly Budget Warning (${threshold}%)`,
            attachments: [{
              color: exceeded ? '#e53e3e' : warningColor,
              fields: [
                { title: 'Feature', value: featureKey, short: false },
                { title: 'Project', value: project, short: true },
                { title: 'Metric', value: limitKey, short: true },
                {
                  title: 'Monthly Usage',
                  value: `${percent.toFixed(0)}% (${currentValue.toLocaleString()} / ${limitValue.toLocaleString()})`,
                  short: false,
                },
                { title: 'Period', value: period, short: false },
              ],
            }],
          });
        }
      }
    }

    log.info(`Monthly budget check complete`, { violations, featuresChecked: monthlyFeatures.length });
  } catch (error) {
    log.error('Monthly budget check failed', error instanceof Error ? error : new Error(String(error)));
  }

  return violations;
}
|
|
1025
|
+
|
|
1026
|
+
// =============================================================================
|
|
1027
|
+
// RE-EXPORTS
|
|
1028
|
+
// =============================================================================
|
|
1029
|
+
|
|
1030
|
+
// Re-export CB_STATUS and CircuitBreakerStatusValue for convenience
|
|
1031
|
+
export { CB_STATUS, type CircuitBreakerStatusValue } from '../../circuit-breaker-middleware';
|
|
1032
|
+
export { HARD_LIMIT_MULTIPLIER };
|