@littlebearapps/platform-admin-sdk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +112 -0
  2. package/dist/index.d.ts +16 -0
  3. package/dist/index.js +89 -0
  4. package/dist/prompts.d.ts +27 -0
  5. package/dist/prompts.js +80 -0
  6. package/dist/scaffold.d.ts +5 -0
  7. package/dist/scaffold.js +65 -0
  8. package/dist/templates.d.ts +16 -0
  9. package/dist/templates.js +131 -0
  10. package/package.json +46 -0
  11. package/templates/full/migrations/006_pattern_discovery.sql +199 -0
  12. package/templates/full/migrations/007_notifications_search.sql +127 -0
  13. package/templates/full/workers/lib/pattern-discovery/ai-prompt.ts +644 -0
  14. package/templates/full/workers/lib/pattern-discovery/clustering.ts +278 -0
  15. package/templates/full/workers/lib/pattern-discovery/shadow-evaluation.ts +603 -0
  16. package/templates/full/workers/lib/pattern-discovery/storage.ts +806 -0
  17. package/templates/full/workers/lib/pattern-discovery/types.ts +159 -0
  18. package/templates/full/workers/lib/pattern-discovery/validation.ts +278 -0
  19. package/templates/full/workers/pattern-discovery.ts +661 -0
  20. package/templates/full/workers/platform-alert-router.ts +1809 -0
  21. package/templates/full/workers/platform-notifications.ts +424 -0
  22. package/templates/full/workers/platform-search.ts +480 -0
  23. package/templates/full/workers/platform-settings.ts +436 -0
  24. package/templates/full/wrangler.alert-router.jsonc.hbs +34 -0
  25. package/templates/full/wrangler.notifications.jsonc.hbs +23 -0
  26. package/templates/full/wrangler.pattern-discovery.jsonc.hbs +33 -0
  27. package/templates/full/wrangler.search.jsonc.hbs +16 -0
  28. package/templates/full/wrangler.settings.jsonc.hbs +23 -0
  29. package/templates/shared/README.md.hbs +69 -0
  30. package/templates/shared/config/budgets.yaml.hbs +72 -0
  31. package/templates/shared/config/services.yaml.hbs +45 -0
  32. package/templates/shared/migrations/001_core_tables.sql +117 -0
  33. package/templates/shared/migrations/002_usage_warehouse.sql +830 -0
  34. package/templates/shared/migrations/003_feature_tracking.sql +250 -0
  35. package/templates/shared/migrations/004_settings_alerts.sql +452 -0
  36. package/templates/shared/migrations/seed.sql.hbs +4 -0
  37. package/templates/shared/package.json.hbs +21 -0
  38. package/templates/shared/scripts/sync-config.ts +242 -0
  39. package/templates/shared/tsconfig.json +12 -0
  40. package/templates/shared/workers/lib/analytics-engine.ts +357 -0
  41. package/templates/shared/workers/lib/billing.ts +293 -0
  42. package/templates/shared/workers/lib/circuit-breaker-middleware.ts +25 -0
  43. package/templates/shared/workers/lib/control.ts +292 -0
  44. package/templates/shared/workers/lib/economics.ts +368 -0
  45. package/templates/shared/workers/lib/metrics.ts +103 -0
  46. package/templates/shared/workers/lib/platform-settings.ts +407 -0
  47. package/templates/shared/workers/lib/shared/allowances.ts +333 -0
  48. package/templates/shared/workers/lib/shared/cloudflare.ts +1362 -0
  49. package/templates/shared/workers/lib/shared/types.ts +58 -0
  50. package/templates/shared/workers/lib/telemetry-sampling.ts +360 -0
  51. package/templates/shared/workers/lib/usage/collectors/example.ts +96 -0
  52. package/templates/shared/workers/lib/usage/collectors/index.ts +128 -0
  53. package/templates/shared/workers/lib/usage/handlers/audit.ts +306 -0
  54. package/templates/shared/workers/lib/usage/handlers/backfill.ts +845 -0
  55. package/templates/shared/workers/lib/usage/handlers/behavioral.ts +429 -0
  56. package/templates/shared/workers/lib/usage/handlers/data-queries.ts +507 -0
  57. package/templates/shared/workers/lib/usage/handlers/dlq-admin.ts +364 -0
  58. package/templates/shared/workers/lib/usage/handlers/health-trends.ts +222 -0
  59. package/templates/shared/workers/lib/usage/handlers/index.ts +35 -0
  60. package/templates/shared/workers/lib/usage/handlers/usage-admin.ts +421 -0
  61. package/templates/shared/workers/lib/usage/handlers/usage-features.ts +1262 -0
  62. package/templates/shared/workers/lib/usage/handlers/usage-metrics.ts +2420 -0
  63. package/templates/shared/workers/lib/usage/handlers/usage-settings.ts +610 -0
  64. package/templates/shared/workers/lib/usage/queue/budget-enforcement.ts +1032 -0
  65. package/templates/shared/workers/lib/usage/queue/cost-budget-enforcement.ts +128 -0
  66. package/templates/shared/workers/lib/usage/queue/cost-calculator.ts +77 -0
  67. package/templates/shared/workers/lib/usage/queue/dlq-handler.ts +161 -0
  68. package/templates/shared/workers/lib/usage/queue/index.ts +19 -0
  69. package/templates/shared/workers/lib/usage/queue/telemetry-processor.ts +790 -0
  70. package/templates/shared/workers/lib/usage/scheduled/anomaly-detection.ts +732 -0
  71. package/templates/shared/workers/lib/usage/scheduled/data-collection.ts +956 -0
  72. package/templates/shared/workers/lib/usage/scheduled/error-digest.ts +343 -0
  73. package/templates/shared/workers/lib/usage/scheduled/index.ts +18 -0
  74. package/templates/shared/workers/lib/usage/scheduled/rollups.ts +1561 -0
  75. package/templates/shared/workers/lib/usage/shared/constants.ts +362 -0
  76. package/templates/shared/workers/lib/usage/shared/index.ts +14 -0
  77. package/templates/shared/workers/lib/usage/shared/types.ts +1066 -0
  78. package/templates/shared/workers/lib/usage/shared/utils.ts +795 -0
  79. package/templates/shared/workers/platform-usage.ts +1915 -0
  80. package/templates/shared/wrangler.usage.jsonc.hbs +58 -0
  81. package/templates/standard/migrations/005_error_collection.sql +162 -0
  82. package/templates/standard/workers/error-collector.ts +2670 -0
  83. package/templates/standard/workers/lib/error-collector/capture.ts +213 -0
  84. package/templates/standard/workers/lib/error-collector/digest.ts +448 -0
  85. package/templates/standard/workers/lib/error-collector/email-health-alerts.ts +262 -0
  86. package/templates/standard/workers/lib/error-collector/fingerprint.ts +258 -0
  87. package/templates/standard/workers/lib/error-collector/gap-alerts.ts +293 -0
  88. package/templates/standard/workers/lib/error-collector/github.ts +329 -0
  89. package/templates/standard/workers/lib/error-collector/types.ts +262 -0
  90. package/templates/standard/workers/lib/sentinel/gap-detection.ts +734 -0
  91. package/templates/standard/workers/lib/shared/slack-alerts.ts +585 -0
  92. package/templates/standard/workers/platform-sentinel.ts +1744 -0
  93. package/templates/standard/wrangler.error-collector.jsonc.hbs +44 -0
  94. package/templates/standard/wrangler.sentinel.jsonc.hbs +45 -0
@@ -0,0 +1,790 @@
1
+ /**
2
+ * Telemetry Processor
3
+ *
4
+ * Queue consumer for platform telemetry messages.
5
+ * Handles:
6
+ * - Main queue processing (handleQueue)
7
+ * - Heartbeat messages (handleHeartbeat)
8
+ * - Intelligent degradation (processIntelligentDegradation)
9
+ * - Error alerting (checkAndAlertErrors)
10
+ * - AI model usage persistence (persistFeatureAIModelUsage)
11
+ *
12
+ * Budget enforcement (checkAndUpdateBudgetStatus) is imported from ./budget-enforcement.
13
+ *
14
+ * Extracted from platform-usage.ts as part of Phase D modularization.
15
+ */
16
+
17
+ import type { MessageBatch } from '@cloudflare/workers-types';
18
+ import type { Env, TelemetryMessage, FeatureBatchState, ErrorAlertPayload } from '../shared';
19
+ import { ERROR_RATE_THRESHOLDS } from '../shared';
20
+ import { generateId } from '../shared';
21
+ import { createLoggerFromEnv } from '@littlebearapps/platform-consumer-sdk';
22
+ import { checkAndUpdateBudgetStatus } from './budget-enforcement';
23
+ import { calculateCFCostFromMetrics } from './cost-calculator';
24
+ import { checkAndUpdateCostBudgetStatus } from './cost-budget-enforcement';
25
+ import {
26
+ getPIDState,
27
+ savePIDState,
28
+ computePID,
29
+ calculateUtilisation,
30
+ shouldUpdatePID,
31
+ formatThrottleRate,
32
+ } from '../../control';
33
+ import {
34
+ getReservoirState,
35
+ saveReservoirState,
36
+ addSample,
37
+ getPercentiles,
38
+ formatPercentiles,
39
+ } from '../../telemetry-sampling';
40
+ import { calculateBCU, formatBCUResult, type BCUResult } from '../../economics';
41
+ import { categoriseError, extractErrorCode } from '@littlebearapps/platform-consumer-sdk';
42
+
43
+ // =============================================================================
44
+ // ERROR LOGGING HELPERS
45
+ // =============================================================================
46
+
47
+ /**
48
+ * Create a safe partial payload for logging (truncates to maxLength chars).
49
+ * Redacts correlation_id to keep logs shorter while preserving debugging context.
50
+ */
51
+ function createPartialPayload(telemetry: TelemetryMessage, maxLength = 500): string {
52
+ const summary = {
53
+ feature_key: telemetry.feature_key,
54
+ project: telemetry.project,
55
+ category: telemetry.category,
56
+ feature: telemetry.feature,
57
+ timestamp: telemetry.timestamp,
58
+ is_heartbeat: telemetry.is_heartbeat,
59
+ error_category: telemetry.error_category,
60
+ error_count: telemetry.error_count,
61
+ metrics_keys: Object.keys(telemetry.metrics).filter(
62
+ (k) => (telemetry.metrics as Record<string, number>)[k] > 0
63
+ ),
64
+ };
65
+ const json = JSON.stringify(summary);
66
+ return json.length > maxLength ? json.slice(0, maxLength) + '...' : json;
67
+ }
68
+
69
+ /**
70
+ * Generate an error fingerprint for deduplication in logs.
71
+ * Combines error name, category, and first line of stack trace.
72
+ */
73
+ function generateErrorFingerprint(error: unknown): string {
74
+ if (!(error instanceof Error)) {
75
+ return `unknown:${String(error).slice(0, 50)}`;
76
+ }
77
+
78
+ const category = categoriseError(error);
79
+ const code = extractErrorCode(error) || 'no_code';
80
+ const stackLine = error.stack?.split('\n')[1]?.trim().slice(0, 80) || 'no_stack';
81
+
82
+ return `${category}:${code}:${error.name}:${stackLine}`;
83
+ }
84
+
85
+ // =============================================================================
86
+ // ERROR SAMPLING
87
+ // =============================================================================
88
+
89
/**
 * Error sampling configuration.
 * Reduces D1 writes during high error rate periods: once the batch error rate
 * crosses triggerThreshold, only a random sampleRate fraction of errors is
 * stored (see shouldStoreError).
 */
interface ErrorSamplingConfig {
  /** Error rate threshold to trigger sampling (e.g., 0.1 = 10%) */
  triggerThreshold: number;
  /** Sample rate when triggered (e.g., 0.1 = keep 10%) */
  sampleRate: number;
  /** Error categories that are never sampled (always stored) */
  neverSampleCategories: string[];
}

// Module-level sampling policy shared by all batches.
const ERROR_SAMPLING_CONFIG: ErrorSamplingConfig = {
  triggerThreshold: 0.1, // 10% error rate
  sampleRate: 0.1, // Keep 10% of errors when sampling
  neverSampleCategories: ['CIRCUIT_BREAKER', 'AUTH', 'INTERNAL'],
};

/**
 * Per-batch error sampling state.
 * Tracks error counts across the batch for adaptive sampling. Mutated in
 * place by shouldStoreError (samplingActive) and the queue handler (counters).
 */
interface ErrorSamplingState {
  // Errors seen in the batch so far
  totalErrors: number;
  // Errors actually written to D1
  sampledErrors: number;
  // Total messages in the batch (set once up front by the queue handler)
  totalMessages: number;
  // True once probabilistic sampling has kicked in for this batch
  samplingActive: boolean;
}
118
+
119
+ /**
120
+ * Determine if an error should be sampled (stored in D1).
121
+ * Returns true if the error should be stored, false to skip.
122
+ *
123
+ * @param telemetry - The telemetry message with error
124
+ * @param state - Current batch sampling state
125
+ * @returns Whether to store this error in D1
126
+ */
127
+ function shouldStoreError(telemetry: TelemetryMessage, state: ErrorSamplingState): boolean {
128
+ // Never sample critical error categories
129
+ if (
130
+ telemetry.error_category &&
131
+ ERROR_SAMPLING_CONFIG.neverSampleCategories.includes(telemetry.error_category)
132
+ ) {
133
+ return true;
134
+ }
135
+
136
+ // Calculate error rate for the batch
137
+ const errorRate = state.totalMessages > 0 ? state.totalErrors / state.totalMessages : 0;
138
+
139
+ // If error rate below threshold, store all errors
140
+ if (errorRate < ERROR_SAMPLING_CONFIG.triggerThreshold) {
141
+ return true;
142
+ }
143
+
144
+ // Sampling is active - use probabilistic sampling
145
+ state.samplingActive = true;
146
+ return Math.random() < ERROR_SAMPLING_CONFIG.sampleRate;
147
+ }
148
+
149
+ /**
150
+ * Create initial sampling state for a batch.
151
+ */
152
+ function createSamplingState(): ErrorSamplingState {
153
+ return {
154
+ totalErrors: 0,
155
+ sampledErrors: 0,
156
+ totalMessages: 0,
157
+ samplingActive: false,
158
+ };
159
+ }
160
+
161
+ // =============================================================================
162
+ // HEARTBEAT HANDLING
163
+ // =============================================================================
164
+
165
/**
 * Handle a heartbeat message from health checks.
 * Writes a zero-valued data point to Analytics Engine (so heartbeat
 * processing is visible in AE) and upserts the feature's row in the D1
 * system_health_checks table, marking it healthy and resetting
 * consecutive_failures.
 *
 * @param telemetry - Heartbeat telemetry (is_heartbeat === true)
 * @param env - Worker environment (PLATFORM_ANALYTICS, PLATFORM_DB)
 */
async function handleHeartbeat(telemetry: TelemetryMessage, env: Env): Promise<void> {
  // Unix seconds, used for both last_heartbeat and updated_at (?4 bound twice).
  const now = Math.floor(Date.now() / 1000);

  // Write to Analytics Engine with zeros (for consistency, shows heartbeat was processed)
  env.PLATFORM_ANALYTICS.writeDataPoint({
    blobs: [telemetry.project, telemetry.category, telemetry.feature],
    doubles: new Array(20).fill(0),
    indexes: [telemetry.feature_key],
  });

  // Upsert to D1 system_health_checks table.
  // NOTE(review): the feature_id column is bound to telemetry.feature_key (the
  // composite key), not telemetry.feature — confirm the schema expects this.
  await env.PLATFORM_DB.prepare(
    `
    INSERT INTO system_health_checks (id, project_id, feature_id, last_heartbeat, status, updated_at)
    VALUES (?1, ?2, ?3, ?4, 'healthy', ?4)
    ON CONFLICT (project_id, feature_id) DO UPDATE SET
      last_heartbeat = excluded.last_heartbeat,
      status = 'healthy',
      consecutive_failures = 0,
      updated_at = excluded.updated_at
  `
  )
    // Random id is only used on first insert; conflicts keep the existing row id.
    .bind(crypto.randomUUID(), telemetry.project, telemetry.feature_key, now)
    .run();

  // Note: logger not created per call - this is a hot path
  // Using inline log to avoid overhead
}
197
+
198
+ // =============================================================================
199
+ // ERROR ALERTING
200
+ // =============================================================================
201
+
202
/**
 * Check if telemetry message contains errors that warrant alerting.
 * Detects P0 conditions: circuit breaker trips, high error rates.
 * Uses adaptive sampling to reduce D1 writes during high error rate periods.
 *
 * Side effects: may send alerts via sendErrorAlert, store error events via
 * storeErrorEvent, and increment samplingState.sampledErrors.
 *
 * @param telemetry - The telemetry message to inspect
 * @param env - Worker environment (passed through to alert/store helpers)
 * @param samplingState - Mutable per-batch sampling state
 */
async function checkAndAlertErrors(
  telemetry: TelemetryMessage,
  env: Env,
  samplingState: ErrorSamplingState
): Promise<void> {
  // Skip if no errors reported
  if (!telemetry.error_count || telemetry.error_count === 0) {
    return;
  }

  // P0 Condition 1: Circuit breaker error (always alert, always store,
  // and short-circuit — the rate check below is skipped for this message)
  if (telemetry.error_category === 'CIRCUIT_BREAKER') {
    await sendErrorAlert(env, {
      type: 'p0_immediate',
      feature_key: telemetry.feature_key,
      project: telemetry.project,
      category: telemetry.category,
      feature: telemetry.feature,
      correlation_id: telemetry.correlation_id,
      error_category: telemetry.error_category,
      error_code: telemetry.error_codes?.[0],
      window_minutes: ERROR_RATE_THRESHOLDS.windowMinutes,
    });
    // Always store P0 errors (bypasses sampling)
    await storeErrorEvent(telemetry, env);
    samplingState.sampledErrors++;
    return;
  }

  // Apply adaptive sampling for error storage
  if (shouldStoreError(telemetry, samplingState)) {
    await storeErrorEvent(telemetry, env);
    samplingState.sampledErrors++;
  }

  // Check error rate over window for P0/P1 conditions.
  // NOTE(review): the rate below is only meaningful if getErrorRateStats
  // returns a true request total rather than an error-only count — verify.
  const errorStats = await getErrorRateStats(telemetry.feature_key, env);

  if (errorStats.totalRequests >= ERROR_RATE_THRESHOLDS.minRequests) {
    const errorRate = (errorStats.errorCount / errorStats.totalRequests) * 100;

    if (errorRate >= ERROR_RATE_THRESHOLDS.p0) {
      // P0: High error rate (threshold defined by ERROR_RATE_THRESHOLDS.p0;
      // the ">50%" figure is presumed from that constant — confirm)
      await sendErrorAlert(env, {
        type: 'p0_immediate',
        feature_key: telemetry.feature_key,
        project: telemetry.project,
        category: telemetry.category,
        feature: telemetry.feature,
        correlation_id: telemetry.correlation_id,
        error_category: telemetry.error_category,
        error_code: telemetry.error_codes?.[0],
        error_rate: errorRate,
        window_minutes: ERROR_RATE_THRESHOLDS.windowMinutes,
      });
    }
  }
}
265
+
266
/**
 * Store error event in D1 for aggregation and historical analysis.
 *
 * Best-effort: failures are logged and swallowed so a D1 outage never fails
 * the telemetry batch.
 *
 * @param telemetry - Telemetry message carrying the error details
 * @param env - Worker environment (PLATFORM_DB)
 */
async function storeErrorEvent(telemetry: TelemetryMessage, env: Env): Promise<void> {
  const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:error-tracking');
  try {
    await env.PLATFORM_DB.prepare(
      `INSERT INTO feature_error_events (
        id, feature_key, error_category, error_code, error_message,
        correlation_id, worker, priority, created_at
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
    )
      .bind(
        crypto.randomUUID(),
        telemetry.feature_key,
        // Uncategorised errors default to INTERNAL
        telemetry.error_category || 'INTERNAL',
        // Only the first error code is persisted
        telemetry.error_codes?.[0] || null,
        null, // No message in telemetry (truncated for space)
        telemetry.correlation_id || null,
        null, // Worker name not in telemetry
        'P2', // Default priority, upgraded by alert detection
        Math.floor(Date.now() / 1000)
      )
      .run();
  } catch (error) {
    log.error('Failed to store error event', error);
  }
}
294
+
295
/**
 * Update error budget window for SLA tracking.
 * Aggregates success/error counts in 5-minute windows.
 *
 * Each call contributes exactly one event to the window containing "now":
 * one success OR one error (with at most one category-specific counter).
 * The upsert keys on (feature_key, window_start) via a deterministic row id
 * of the form `${feature_key}:${windowStart}`.
 *
 * Best-effort: failures are logged and swallowed.
 *
 * @param telemetry - Telemetry message to classify as success or error
 * @param env - Worker environment (PLATFORM_DB)
 */
async function updateErrorBudgetWindow(telemetry: TelemetryMessage, env: Env): Promise<void> {
  const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:sla-tracking');

  try {
    // Calculate 5-minute window boundaries (aligned to epoch multiples)
    const WINDOW_SIZE_SECONDS = 5 * 60; // 5 minutes
    const now = Math.floor(Date.now() / 1000);
    const windowStart = Math.floor(now / WINDOW_SIZE_SECONDS) * WINDOW_SIZE_SECONDS;
    const windowEnd = windowStart + WINDOW_SIZE_SECONDS;

    const hasError = (telemetry.error_count ?? 0) > 0;
    const errorCategory = telemetry.error_category;

    // Determine error category counts (0/1 increments, mutually exclusive)
    const timeoutIncrement = errorCategory === 'TIMEOUT' ? 1 : 0;
    const validationIncrement = errorCategory === 'VALIDATION' ? 1 : 0;
    const internalIncrement = errorCategory === 'INTERNAL' ? 1 : 0;
    const externalIncrement = errorCategory === 'EXTERNAL_API' ? 1 : 0;
    // "Other" catches errored messages with a missing or unrecognised category
    const otherIncrement =
      hasError &&
      !['TIMEOUT', 'VALIDATION', 'INTERNAL', 'EXTERNAL_API'].includes(errorCategory || '')
        ? 1
        : 0;

    // Upsert window record. On conflict, SQLite resolves the bare column names
    // on the right-hand side to the EXISTING row's values, so each counter is
    // incremented by this call's 0/1 contribution; total_count grows by 1 per
    // call in both branches.
    await env.PLATFORM_DB.prepare(
      `INSERT INTO error_budget_windows (
        id, feature_key, project, window_start, window_end,
        success_count, error_count, total_count,
        timeout_count, validation_count, internal_count, external_count, other_count,
        created_at, updated_at
      ) VALUES (
        ?1, ?2, ?3, ?4, ?5,
        ?6, ?7, 1,
        ?8, ?9, ?10, ?11, ?12,
        unixepoch(), unixepoch()
      )
      ON CONFLICT(feature_key, window_start) DO UPDATE SET
        success_count = success_count + excluded.success_count,
        error_count = error_count + excluded.error_count,
        total_count = total_count + 1,
        timeout_count = timeout_count + excluded.timeout_count,
        validation_count = validation_count + excluded.validation_count,
        internal_count = internal_count + excluded.internal_count,
        external_count = external_count + excluded.external_count,
        other_count = other_count + excluded.other_count,
        updated_at = unixepoch()`
    )
      .bind(
        // Deterministic id keys the row to its (feature, window) pair
        `${telemetry.feature_key}:${windowStart}`,
        telemetry.feature_key,
        telemetry.project,
        windowStart,
        windowEnd,
        hasError ? 0 : 1, // success_count
        hasError ? 1 : 0, // error_count
        timeoutIncrement,
        validationIncrement,
        internalIncrement,
        externalIncrement,
        otherIncrement
      )
      .run();
  } catch (error) {
    log.error('Failed to update error budget window', error);
  }
}
366
+
367
+ /**
368
+ * Get error rate statistics for a feature over the sliding window.
369
+ */
370
+ async function getErrorRateStats(
371
+ featureKey: string,
372
+ env: Env
373
+ ): Promise<{ errorCount: number; totalRequests: number }> {
374
+ try {
375
+ const windowStart = Math.floor(Date.now() / 1000) - ERROR_RATE_THRESHOLDS.windowMinutes * 60;
376
+
377
+ const result = await env.PLATFORM_DB.prepare(
378
+ `SELECT
379
+ COUNT(*) as error_count,
380
+ (SELECT COUNT(*) FROM feature_error_events
381
+ WHERE feature_key = ?1 AND created_at >= ?2) as total_events
382
+ FROM feature_error_events
383
+ WHERE feature_key = ?1 AND created_at >= ?2`
384
+ )
385
+ .bind(featureKey, windowStart)
386
+ .first<{ error_count: number; total_events: number }>();
387
+
388
+ return {
389
+ errorCount: result?.error_count ?? 0,
390
+ totalRequests: result?.total_events ?? 0,
391
+ };
392
+ } catch (error) {
393
+ const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:error-tracking');
394
+ log.error('Failed to get error rate', error);
395
+ return { errorCount: 0, totalRequests: 0 };
396
+ }
397
+ }
398
+
399
+ /**
400
+ * Send error alert to alert-router.
401
+ * Uses service binding if available, falls back to direct Slack.
402
+ */
403
+ async function sendErrorAlert(env: Env, payload: ErrorAlertPayload): Promise<void> {
404
+ const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:error-alerting');
405
+ try {
406
+ if (env.ALERT_ROUTER) {
407
+ // Use service binding to call alert-router
408
+ const response = await env.ALERT_ROUTER.fetch('https://alert-router/errors', {
409
+ method: 'POST',
410
+ headers: { 'Content-Type': 'application/json' },
411
+ body: JSON.stringify(payload),
412
+ });
413
+
414
+ if (!response.ok) {
415
+ log.error(`alert-router returned ${response.status}`);
416
+ } else {
417
+ log.info('Alert sent', { type: payload.type, featureKey: payload.feature_key });
418
+ }
419
+ } else if (env.SLACK_WEBHOOK_URL) {
420
+ // Fallback: send directly to Slack (basic format)
421
+ const emoji = payload.type === 'p0_immediate' ? '🚨' : '⚠️';
422
+ await fetch(env.SLACK_WEBHOOK_URL, {
423
+ method: 'POST',
424
+ headers: { 'Content-Type': 'application/json' },
425
+ body: JSON.stringify({
426
+ text: `${emoji} [${payload.type.toUpperCase()}] Error in ${payload.feature_key}: ${payload.error_category}`,
427
+ }),
428
+ });
429
+ }
430
+ } catch (error) {
431
+ log.error('Failed to send alert', error);
432
+ }
433
+ }
434
+
435
+ // =============================================================================
436
+ // AI MODEL USAGE PERSISTENCE
437
+ // =============================================================================
438
+
439
/**
 * Persist feature-level AI model usage to D1.
 * Called from queue consumer when telemetry includes aiModelBreakdown.
 * Uses upsert to aggregate invocations for the same feature/model/date.
 *
 * @param env - Worker environment (PLATFORM_DB)
 * @param featureKey - Composite feature key the usage belongs to
 * @param modelBreakdown - Map of model name -> invocation count for this message
 * @param timestamp - Event time; only the UTC date portion is used for bucketing
 * @returns Number of rows written (models with invocations > 0)
 */
async function persistFeatureAIModelUsage(
  env: Env,
  featureKey: string,
  modelBreakdown: Record<string, number>,
  timestamp: Date
): Promise<number> {
  const usageDate = timestamp.toISOString().split('T')[0]; // YYYY-MM-DD (UTC)
  let writes = 0;

  // Sequential writes, one per model. NOTE(review): presumably intentional to
  // bound D1 concurrency; confirm before parallelising or batching.
  for (const [model, invocations] of Object.entries(modelBreakdown)) {
    // Skip zero/negative counts — nothing to aggregate
    if (invocations <= 0) continue;

    await env.PLATFORM_DB.prepare(
      `
      INSERT INTO feature_ai_model_usage (
        id, feature_key, model, usage_date, invocations, updated_at
      ) VALUES (?, ?, ?, ?, ?, unixepoch())
      ON CONFLICT (feature_key, model, usage_date) DO UPDATE SET
        invocations = invocations + excluded.invocations,
        updated_at = unixepoch()
    `
    )
      // Fresh id is only used on first insert; conflicts keep the existing row
      .bind(generateId(), featureKey, model, usageDate, invocations)
      .run();
    writes++;
  }

  return writes;
}
473
+
474
+ // =============================================================================
475
+ // INTELLIGENT DEGRADATION
476
+ // =============================================================================
477
+
478
/**
 * Process intelligent degradation updates for features seen in a batch.
 * Updates reservoir sampling and PID controller state in KV.
 *
 * Shadow mode: Currently logs throttle rates without applying them.
 * Set ENABLE_THROTTLE_WRITES=true in env to write throttle rates to KV.
 *
 * Per-feature failures are caught and logged so one bad feature never fails
 * the whole batch.
 *
 * @param featureStates - Accumulated per-feature batch state (cpuMs samples,
 *   BCU totals, message counts) built by the queue handler
 * @param env - Worker environment (PLATFORM_CACHE KV)
 */
async function processIntelligentDegradation(
  featureStates: Map<string, FeatureBatchState>,
  env: Env
): Promise<void> {
  // Fast exit: nothing accumulated in this batch
  if (featureStates.size === 0) return;

  const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:intelligent-degradation');

  // Cast KV to work around type version mismatch between workers and lib modules
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const kv = env.PLATFORM_CACHE as any;
  const shadowMode = true; // TODO: Make configurable via env.ENABLE_THROTTLE_WRITES

  for (const [featureKey, batchState] of featureStates) {
    try {
      // 1. Update reservoir sampling with cpuMs samples
      if (batchState.cpuMsSamples.length > 0) {
        const reservoirState = await getReservoirState(featureKey, kv);
        for (const sample of batchState.cpuMsSamples) {
          addSample(reservoirState, sample);
        }
        await saveReservoirState(featureKey, reservoirState, kv);

        // Log percentiles periodically (every 100 samples).
        // NOTE(review): batches add multiple samples at once, so totalSeen can
        // jump past a multiple of 100 and skip a log — confirm acceptable.
        if (reservoirState.totalSeen % 100 === 0) {
          const percentiles = getPercentiles(reservoirState);
          if (percentiles) {
            log.info('Feature latency', {
              featureKey,
              latency: formatPercentiles(percentiles),
            });
          }
        }
      }

      // 2. Update PID controller if enough time has passed (60s interval)
      const pidState = await getPIDState(featureKey, kv);
      if (shouldUpdatePID(pidState.lastUpdate, 60_000)) {
        // Get current budget utilisation from KV
        // For now, use BCU as a proxy for utilisation
        // TODO: Get actual budget limit from CONFIG:FEATURE:{id}:BUDGET
        const budgetLimit = 10000; // Default BCU budget per 60s interval
        const currentUsage = calculateUtilisation(batchState.bcuTotal, budgetLimit);
        const deltaTimeMs = Date.now() - pidState.lastUpdate;

        const pidOutput = computePID(pidState, { currentUsage, deltaTimeMs });

        if (shadowMode) {
          // Shadow mode: log but don't write throttle rate to KV
          if (pidOutput.throttleRate > 0.01) {
            log.info('SHADOW throttle', {
              featureKey,
              throttle: formatThrottleRate(pidOutput.throttleRate),
              usagePct: (currentUsage * 100).toFixed(1),
              bcu: batchState.bcuTotal,
            });
          }
          // Still save PID state to maintain continuity
          pidOutput.newState.throttleRate = 0; // Don't persist throttle in shadow mode
          // NOTE(review): assumes STATE:PID:{featureKey} matches the key format
          // used internally by savePIDState — confirm they stay in sync.
          await kv.put(`STATE:PID:${featureKey}`, JSON.stringify(pidOutput.newState), {
            expirationTtl: 86400,
          });
        } else {
          // Active mode: save state and write throttle rate to KV
          await savePIDState(featureKey, pidOutput.newState, kv);
          if (pidOutput.throttleRate > 0.01) {
            log.info('Throttle applied', {
              featureKey,
              throttle: formatThrottleRate(pidOutput.throttleRate),
              usagePct: (currentUsage * 100).toFixed(1),
            });
          }
        }
      }

      // 3. Log BCU summary for monitoring
      if (batchState.bcuTotal > 1000) {
        // Only log significant BCU usage.
        // NOTE(review): partial cast — only `total` is populated; confirm
        // formatBCUResult tolerates the other BCUResult fields being absent.
        const bcuResult = { total: batchState.bcuTotal } as BCUResult;
        log.info('BCU summary', {
          featureKey,
          bcu: formatBCUResult(bcuResult),
          messages: batchState.messageCount,
        });
      }
    } catch (error) {
      // Don't fail the batch for intelligent degradation errors
      log.error(`Error processing ${featureKey}`, error);
    }
  }
}
576
+
577
+ // =============================================================================
578
+ // MAIN QUEUE HANDLER
579
+ // =============================================================================
580
+
581
+ /**
582
+ * Main queue consumer handler for telemetry messages.
583
+ * Processes batches of TelemetryMessage from the platform-telemetry queue.
584
+ *
585
+ * Processing steps per message:
586
+ * 1. Handle heartbeat messages (write zeros, update health check)
587
+ * 2. Write metrics to Analytics Engine
588
+ * 3. Accumulate intelligent degradation data
589
+ * 4. Check budget and update status if exceeded
590
+ * 5. Check for errors and send alerts if needed
591
+ * 6. Persist AI model breakdown to D1 if present
592
+ *
593
+ * After batch processing:
594
+ * - Process intelligent degradation updates for all features seen
595
+ */
596
+ async function handleQueue(batch: MessageBatch<TelemetryMessage>, env: Env): Promise<void> {
597
+ const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:telemetry');
598
+ log.info('Processing batch', { messages: batch.messages.length });
599
+
600
+ let successCount = 0;
601
+ let errorCount = 0;
602
+ let heartbeatCount = 0;
603
+
604
+ // Accumulate state per feature for intelligent degradation
605
+ // This allows batch-level KV operations instead of per-message
606
+ const featureStates = new Map<string, FeatureBatchState>();
607
+
608
+ // Error sampling state for adaptive D1 write reduction during incidents
609
+ const samplingState = createSamplingState();
610
+ samplingState.totalMessages = batch.messages.length;
611
+
612
+ for (const message of batch.messages) {
613
+ try {
614
+ const telemetry = message.body;
615
+
616
+ // Handle heartbeat messages differently (skip budget check)
617
+ if (telemetry.is_heartbeat) {
618
+ await handleHeartbeat(telemetry, env);
619
+ message.ack();
620
+ heartbeatCount++;
621
+ successCount++;
622
+ continue;
623
+ }
624
+
625
+ // Calculate total cost (CF resources + external APIs)
626
+ const cfCost = calculateCFCostFromMetrics(telemetry.metrics);
627
+ const externalCost = telemetry.external_cost_usd ?? 0;
628
+ const totalCost = cfCost + externalCost;
629
+
630
+ // Write to Analytics Engine
631
+ // Schema must match METRIC_FIELDS order from constants.ts:
632
+ // - blobs: [project, category, feature] (feature_key is in indexes)
633
+ // - doubles: ordered per METRIC_FIELDS (d1Writes, d1Reads, kvReads, ...)
634
+ // Note: AE has a hard limit of 20 doubles
635
+ env.PLATFORM_ANALYTICS.writeDataPoint({
636
+ blobs: [
637
+ telemetry.project, // blob1: project
638
+ telemetry.category, // blob2: category
639
+ telemetry.feature, // blob3: feature
640
+ ],
641
+ doubles: [
642
+ // Legacy fields (positions 1-12) - DO NOT REORDER
643
+ telemetry.metrics.d1Writes ?? 0, // double1
644
+ telemetry.metrics.d1Reads ?? 0, // double2
645
+ telemetry.metrics.kvReads ?? 0, // double3
646
+ telemetry.metrics.kvWrites ?? 0, // double4
647
+ telemetry.metrics.doRequests ?? 0, // double5
648
+ telemetry.metrics.doGbSeconds ?? 0, // double6
649
+ telemetry.metrics.r2ClassA ?? 0, // double7
650
+ telemetry.metrics.r2ClassB ?? 0, // double8
651
+ telemetry.metrics.aiNeurons ?? 0, // double9
652
+ telemetry.metrics.queueMessages ?? 0, // double10
653
+ telemetry.metrics.requests ?? 0, // double11
654
+ telemetry.metrics.cpuMs ?? 0, // double12
655
+ // Extended fields (positions 13-20) - APPEND ONLY (20 field limit)
656
+ telemetry.metrics.d1RowsRead ?? 0, // double13
657
+ telemetry.metrics.d1RowsWritten ?? 0, // double14
658
+ telemetry.metrics.kvDeletes ?? 0, // double15
659
+ telemetry.metrics.kvLists ?? 0, // double16
660
+ telemetry.metrics.aiRequests ?? 0, // double17
661
+ telemetry.metrics.vectorizeQueries ?? 0, // double18
662
+ telemetry.metrics.vectorizeInserts ?? 0, // double19
663
+ // 2026-01-27: Repurposed from workflowInvocations (free in beta) for external API cost tracking
664
+ externalCost, // double20: external_cost_usd (OpenAI, Apify, etc.)
665
+ ],
666
+ indexes: [telemetry.feature_key],
667
+ });
668
+
669
+ // Accumulate intelligent degradation data for this feature
670
+ const featureKey = telemetry.feature_key;
671
+ let state = featureStates.get(featureKey);
672
+ if (!state) {
673
+ state = { cpuMsSamples: [], bcuTotal: 0, messageCount: 0, lastTimestamp: 0 };
674
+ featureStates.set(featureKey, state);
675
+ }
676
+
677
+ // Collect cpuMs sample for reservoir
678
+ const cpuMs = telemetry.metrics.cpuMs ?? 0;
679
+ if (cpuMs > 0) {
680
+ state.cpuMsSamples.push(cpuMs);
681
+ }
682
+
683
+ // Calculate BCU for this message
684
+ const bcuResult = calculateBCU(telemetry.metrics);
685
+ state.bcuTotal += bcuResult.total;
686
+ state.messageCount++;
687
+ state.lastTimestamp = Math.max(state.lastTimestamp, telemetry.timestamp);
688
+
689
+ // Check budget and update status if exceeded
690
+ await checkAndUpdateBudgetStatus(telemetry.feature_key, telemetry.metrics, env);
691
+
692
+ // Check cost budget if there's a cost to track
693
+ if (totalCost > 0) {
694
+ await checkAndUpdateCostBudgetStatus(telemetry.feature_key, totalCost, env);
695
+ }
696
+
697
+ // Track total errors for sampling calculation
698
+ if (telemetry.error_count && telemetry.error_count > 0) {
699
+ samplingState.totalErrors++;
700
+ }
701
+
702
+ // Check for errors and send alerts if needed (with adaptive sampling)
703
+ await checkAndAlertErrors(telemetry, env, samplingState);
704
+
705
+ // Update error budget window for SLA tracking
706
+ await updateErrorBudgetWindow(telemetry, env);
707
+
708
+ // Persist AI model breakdown to D1 if present
709
+ if (telemetry.metrics.aiModelBreakdown) {
710
+ await persistFeatureAIModelUsage(
711
+ env,
712
+ telemetry.feature_key,
713
+ telemetry.metrics.aiModelBreakdown,
714
+ new Date(telemetry.timestamp)
715
+ );
716
+ }
717
+
718
+ message.ack();
719
+ successCount++;
720
+ } catch (error) {
721
+ // Enhanced error logging with full context for debugging
722
+ const telemetry = message.body;
723
+ const errorCategory = categoriseError(error);
724
+ const errorCode = extractErrorCode(error);
725
+ const fingerprint = generateErrorFingerprint(error);
726
+
727
+ log.error('Error processing telemetry message', error, {
728
+ feature_key: telemetry.feature_key,
729
+ project: telemetry.project,
730
+ category: telemetry.category,
731
+ error_category: errorCategory,
732
+ error_code: errorCode,
733
+ fingerprint,
734
+ partial_payload: createPartialPayload(telemetry),
735
+ correlation_id: telemetry.correlation_id,
736
+ });
737
+
738
+ message.retry();
739
+ errorCount++;
740
+ }
741
+ }
742
+
743
+ // Process intelligent degradation updates for each feature seen in batch
744
+ // This is done after message processing to not block the main loop
745
+ await processIntelligentDegradation(featureStates, env);
746
+
747
+ // Log batch summary with error rate and sampling info for monitoring
748
+ if (errorCount > 0) {
749
+ const errorRate = ((errorCount / batch.messages.length) * 100).toFixed(1);
750
+ log.warn('Batch complete with errors', {
751
+ success: successCount,
752
+ heartbeats: heartbeatCount,
753
+ errors: errorCount,
754
+ total: batch.messages.length,
755
+ error_rate_pct: errorRate,
756
+ sampling_active: samplingState.samplingActive,
757
+ errors_sampled: samplingState.sampledErrors,
758
+ errors_total: samplingState.totalErrors,
759
+ });
760
+ } else {
761
+ log.info('Batch complete', {
762
+ success: successCount,
763
+ heartbeats: heartbeatCount,
764
+ errors: errorCount,
765
+ });
766
+ }
767
+ }
768
+
769
+ // =============================================================================
770
+ // EXPORTS
771
+ // =============================================================================
772
+
773
+ export {
774
+ // Main queue consumer — per batch: writes telemetry to Analytics Engine, runs budget/cost checks and error alerting, then processes intelligent degradation
775
+ handleQueue,
776
+ // Heartbeat handling — called for is_heartbeat messages before ack (bypasses budget checks)
777
+ handleHeartbeat,
778
+ // Intelligent degradation — runs once per batch over the accumulated per-feature state (after the message loop, so it does not block it)
779
+ handleQueue,
780
+ processIntelligentDegradation,
781
+ // Error alerting — checkAndAlertErrors applies adaptive sampling via SamplingState; the others are its helpers (bodies not shown here — verify before relying on their contracts)
782
+ checkAndAlertErrors,
783
+ storeErrorEvent,
784
+ getErrorRateStats,
785
+ sendErrorAlert,
786
+ // AI model usage — persists per-feature aiModelBreakdown metrics to D1
787
+ persistFeatureAIModelUsage,
788
+ };
789
+
790
+ // Re-export checkAndUpdateBudgetStatus from budget-enforcement so existing importers of this module keep working (backward compatibility)
791
+ export { checkAndUpdateBudgetStatus } from './budget-enforcement';