@littlebearapps/platform-admin-sdk 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +112 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.js +89 -0
- package/dist/prompts.d.ts +27 -0
- package/dist/prompts.js +80 -0
- package/dist/scaffold.d.ts +5 -0
- package/dist/scaffold.js +65 -0
- package/dist/templates.d.ts +16 -0
- package/dist/templates.js +131 -0
- package/package.json +46 -0
- package/templates/full/migrations/006_pattern_discovery.sql +199 -0
- package/templates/full/migrations/007_notifications_search.sql +127 -0
- package/templates/full/workers/lib/pattern-discovery/ai-prompt.ts +644 -0
- package/templates/full/workers/lib/pattern-discovery/clustering.ts +278 -0
- package/templates/full/workers/lib/pattern-discovery/shadow-evaluation.ts +603 -0
- package/templates/full/workers/lib/pattern-discovery/storage.ts +806 -0
- package/templates/full/workers/lib/pattern-discovery/types.ts +159 -0
- package/templates/full/workers/lib/pattern-discovery/validation.ts +278 -0
- package/templates/full/workers/pattern-discovery.ts +661 -0
- package/templates/full/workers/platform-alert-router.ts +1809 -0
- package/templates/full/workers/platform-notifications.ts +424 -0
- package/templates/full/workers/platform-search.ts +480 -0
- package/templates/full/workers/platform-settings.ts +436 -0
- package/templates/full/wrangler.alert-router.jsonc.hbs +34 -0
- package/templates/full/wrangler.notifications.jsonc.hbs +23 -0
- package/templates/full/wrangler.pattern-discovery.jsonc.hbs +33 -0
- package/templates/full/wrangler.search.jsonc.hbs +16 -0
- package/templates/full/wrangler.settings.jsonc.hbs +23 -0
- package/templates/shared/README.md.hbs +69 -0
- package/templates/shared/config/budgets.yaml.hbs +72 -0
- package/templates/shared/config/services.yaml.hbs +45 -0
- package/templates/shared/migrations/001_core_tables.sql +117 -0
- package/templates/shared/migrations/002_usage_warehouse.sql +830 -0
- package/templates/shared/migrations/003_feature_tracking.sql +250 -0
- package/templates/shared/migrations/004_settings_alerts.sql +452 -0
- package/templates/shared/migrations/seed.sql.hbs +4 -0
- package/templates/shared/package.json.hbs +21 -0
- package/templates/shared/scripts/sync-config.ts +242 -0
- package/templates/shared/tsconfig.json +12 -0
- package/templates/shared/workers/lib/analytics-engine.ts +357 -0
- package/templates/shared/workers/lib/billing.ts +293 -0
- package/templates/shared/workers/lib/circuit-breaker-middleware.ts +25 -0
- package/templates/shared/workers/lib/control.ts +292 -0
- package/templates/shared/workers/lib/economics.ts +368 -0
- package/templates/shared/workers/lib/metrics.ts +103 -0
- package/templates/shared/workers/lib/platform-settings.ts +407 -0
- package/templates/shared/workers/lib/shared/allowances.ts +333 -0
- package/templates/shared/workers/lib/shared/cloudflare.ts +1362 -0
- package/templates/shared/workers/lib/shared/types.ts +58 -0
- package/templates/shared/workers/lib/telemetry-sampling.ts +360 -0
- package/templates/shared/workers/lib/usage/collectors/example.ts +96 -0
- package/templates/shared/workers/lib/usage/collectors/index.ts +128 -0
- package/templates/shared/workers/lib/usage/handlers/audit.ts +306 -0
- package/templates/shared/workers/lib/usage/handlers/backfill.ts +845 -0
- package/templates/shared/workers/lib/usage/handlers/behavioral.ts +429 -0
- package/templates/shared/workers/lib/usage/handlers/data-queries.ts +507 -0
- package/templates/shared/workers/lib/usage/handlers/dlq-admin.ts +364 -0
- package/templates/shared/workers/lib/usage/handlers/health-trends.ts +222 -0
- package/templates/shared/workers/lib/usage/handlers/index.ts +35 -0
- package/templates/shared/workers/lib/usage/handlers/usage-admin.ts +421 -0
- package/templates/shared/workers/lib/usage/handlers/usage-features.ts +1262 -0
- package/templates/shared/workers/lib/usage/handlers/usage-metrics.ts +2420 -0
- package/templates/shared/workers/lib/usage/handlers/usage-settings.ts +610 -0
- package/templates/shared/workers/lib/usage/queue/budget-enforcement.ts +1032 -0
- package/templates/shared/workers/lib/usage/queue/cost-budget-enforcement.ts +128 -0
- package/templates/shared/workers/lib/usage/queue/cost-calculator.ts +77 -0
- package/templates/shared/workers/lib/usage/queue/dlq-handler.ts +161 -0
- package/templates/shared/workers/lib/usage/queue/index.ts +19 -0
- package/templates/shared/workers/lib/usage/queue/telemetry-processor.ts +790 -0
- package/templates/shared/workers/lib/usage/scheduled/anomaly-detection.ts +732 -0
- package/templates/shared/workers/lib/usage/scheduled/data-collection.ts +956 -0
- package/templates/shared/workers/lib/usage/scheduled/error-digest.ts +343 -0
- package/templates/shared/workers/lib/usage/scheduled/index.ts +18 -0
- package/templates/shared/workers/lib/usage/scheduled/rollups.ts +1561 -0
- package/templates/shared/workers/lib/usage/shared/constants.ts +362 -0
- package/templates/shared/workers/lib/usage/shared/index.ts +14 -0
- package/templates/shared/workers/lib/usage/shared/types.ts +1066 -0
- package/templates/shared/workers/lib/usage/shared/utils.ts +795 -0
- package/templates/shared/workers/platform-usage.ts +1915 -0
- package/templates/shared/wrangler.usage.jsonc.hbs +58 -0
- package/templates/standard/migrations/005_error_collection.sql +162 -0
- package/templates/standard/workers/error-collector.ts +2670 -0
- package/templates/standard/workers/lib/error-collector/capture.ts +213 -0
- package/templates/standard/workers/lib/error-collector/digest.ts +448 -0
- package/templates/standard/workers/lib/error-collector/email-health-alerts.ts +262 -0
- package/templates/standard/workers/lib/error-collector/fingerprint.ts +258 -0
- package/templates/standard/workers/lib/error-collector/gap-alerts.ts +293 -0
- package/templates/standard/workers/lib/error-collector/github.ts +329 -0
- package/templates/standard/workers/lib/error-collector/types.ts +262 -0
- package/templates/standard/workers/lib/sentinel/gap-detection.ts +734 -0
- package/templates/standard/workers/lib/shared/slack-alerts.ts +585 -0
- package/templates/standard/workers/platform-sentinel.ts +1744 -0
- package/templates/standard/wrangler.error-collector.jsonc.hbs +44 -0
- package/templates/standard/wrangler.sentinel.jsonc.hbs +45 -0
|
@@ -0,0 +1,790 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Telemetry Processor
|
|
3
|
+
*
|
|
4
|
+
* Queue consumer for platform telemetry messages.
|
|
5
|
+
* Handles:
|
|
6
|
+
* - Main queue processing (handleQueue)
|
|
7
|
+
* - Heartbeat messages (handleHeartbeat)
|
|
8
|
+
* - Intelligent degradation (processIntelligentDegradation)
|
|
9
|
+
* - Error alerting (checkAndAlertErrors)
|
|
10
|
+
* - AI model usage persistence (persistFeatureAIModelUsage)
|
|
11
|
+
*
|
|
12
|
+
* Budget enforcement (checkAndUpdateBudgetStatus) is imported from ./budget-enforcement.
|
|
13
|
+
*
|
|
14
|
+
* Extracted from platform-usage.ts as part of Phase D modularization.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import type { MessageBatch } from '@cloudflare/workers-types';
|
|
18
|
+
import type { Env, TelemetryMessage, FeatureBatchState, ErrorAlertPayload } from '../shared';
|
|
19
|
+
import { ERROR_RATE_THRESHOLDS } from '../shared';
|
|
20
|
+
import { generateId } from '../shared';
|
|
21
|
+
import { createLoggerFromEnv } from '@littlebearapps/platform-consumer-sdk';
|
|
22
|
+
import { checkAndUpdateBudgetStatus } from './budget-enforcement';
|
|
23
|
+
import { calculateCFCostFromMetrics } from './cost-calculator';
|
|
24
|
+
import { checkAndUpdateCostBudgetStatus } from './cost-budget-enforcement';
|
|
25
|
+
import {
|
|
26
|
+
getPIDState,
|
|
27
|
+
savePIDState,
|
|
28
|
+
computePID,
|
|
29
|
+
calculateUtilisation,
|
|
30
|
+
shouldUpdatePID,
|
|
31
|
+
formatThrottleRate,
|
|
32
|
+
} from '../../control';
|
|
33
|
+
import {
|
|
34
|
+
getReservoirState,
|
|
35
|
+
saveReservoirState,
|
|
36
|
+
addSample,
|
|
37
|
+
getPercentiles,
|
|
38
|
+
formatPercentiles,
|
|
39
|
+
} from '../../telemetry-sampling';
|
|
40
|
+
import { calculateBCU, formatBCUResult, type BCUResult } from '../../economics';
|
|
41
|
+
import { categoriseError, extractErrorCode } from '@littlebearapps/platform-consumer-sdk';
|
|
42
|
+
|
|
43
|
+
// =============================================================================
|
|
44
|
+
// ERROR LOGGING HELPERS
|
|
45
|
+
// =============================================================================
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Create a safe partial payload for logging (truncates to maxLength chars).
|
|
49
|
+
* Redacts correlation_id to keep logs shorter while preserving debugging context.
|
|
50
|
+
*/
|
|
51
|
+
function createPartialPayload(telemetry: TelemetryMessage, maxLength = 500): string {
|
|
52
|
+
const summary = {
|
|
53
|
+
feature_key: telemetry.feature_key,
|
|
54
|
+
project: telemetry.project,
|
|
55
|
+
category: telemetry.category,
|
|
56
|
+
feature: telemetry.feature,
|
|
57
|
+
timestamp: telemetry.timestamp,
|
|
58
|
+
is_heartbeat: telemetry.is_heartbeat,
|
|
59
|
+
error_category: telemetry.error_category,
|
|
60
|
+
error_count: telemetry.error_count,
|
|
61
|
+
metrics_keys: Object.keys(telemetry.metrics).filter(
|
|
62
|
+
(k) => (telemetry.metrics as Record<string, number>)[k] > 0
|
|
63
|
+
),
|
|
64
|
+
};
|
|
65
|
+
const json = JSON.stringify(summary);
|
|
66
|
+
return json.length > maxLength ? json.slice(0, maxLength) + '...' : json;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
/**
|
|
70
|
+
* Generate an error fingerprint for deduplication in logs.
|
|
71
|
+
* Combines error name, category, and first line of stack trace.
|
|
72
|
+
*/
|
|
73
|
+
function generateErrorFingerprint(error: unknown): string {
|
|
74
|
+
if (!(error instanceof Error)) {
|
|
75
|
+
return `unknown:${String(error).slice(0, 50)}`;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
const category = categoriseError(error);
|
|
79
|
+
const code = extractErrorCode(error) || 'no_code';
|
|
80
|
+
const stackLine = error.stack?.split('\n')[1]?.trim().slice(0, 80) || 'no_stack';
|
|
81
|
+
|
|
82
|
+
return `${category}:${code}:${error.name}:${stackLine}`;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// =============================================================================
|
|
86
|
+
// ERROR SAMPLING
|
|
87
|
+
// =============================================================================
|
|
88
|
+
|
|
89
|
+
/**
 * Error sampling configuration.
 * Reduces D1 writes during high error rate periods.
 */
interface ErrorSamplingConfig {
  /** Error rate threshold to trigger sampling (e.g., 0.1 = 10%) */
  triggerThreshold: number;
  /** Sample rate when triggered (e.g., 0.1 = keep 10%) */
  sampleRate: number;
  /** Error categories that are never sampled (always stored) */
  neverSampleCategories: string[];
}

// Module-level constant consumed by shouldStoreError(); categories listed in
// neverSampleCategories bypass probabilistic sampling entirely.
const ERROR_SAMPLING_CONFIG: ErrorSamplingConfig = {
  triggerThreshold: 0.1, // 10% error rate
  sampleRate: 0.1, // Keep 10% of errors when sampling
  neverSampleCategories: ['CIRCUIT_BREAKER', 'AUTH', 'INTERNAL'],
};

/**
 * Per-batch error sampling state.
 * Tracks error counts across the batch for adaptive sampling.
 */
interface ErrorSamplingState {
  /** Errors observed in the batch so far. */
  totalErrors: number;
  /** Errors actually persisted to D1 (post-sampling). */
  sampledErrors: number;
  /** Total messages in the batch (set once up front by the queue handler). */
  totalMessages: number;
  /** Flipped to true once probabilistic sampling has kicked in. */
  samplingActive: boolean;
}
|
|
118
|
+
|
|
119
|
+
/**
|
|
120
|
+
* Determine if an error should be sampled (stored in D1).
|
|
121
|
+
* Returns true if the error should be stored, false to skip.
|
|
122
|
+
*
|
|
123
|
+
* @param telemetry - The telemetry message with error
|
|
124
|
+
* @param state - Current batch sampling state
|
|
125
|
+
* @returns Whether to store this error in D1
|
|
126
|
+
*/
|
|
127
|
+
function shouldStoreError(telemetry: TelemetryMessage, state: ErrorSamplingState): boolean {
|
|
128
|
+
// Never sample critical error categories
|
|
129
|
+
if (
|
|
130
|
+
telemetry.error_category &&
|
|
131
|
+
ERROR_SAMPLING_CONFIG.neverSampleCategories.includes(telemetry.error_category)
|
|
132
|
+
) {
|
|
133
|
+
return true;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// Calculate error rate for the batch
|
|
137
|
+
const errorRate = state.totalMessages > 0 ? state.totalErrors / state.totalMessages : 0;
|
|
138
|
+
|
|
139
|
+
// If error rate below threshold, store all errors
|
|
140
|
+
if (errorRate < ERROR_SAMPLING_CONFIG.triggerThreshold) {
|
|
141
|
+
return true;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// Sampling is active - use probabilistic sampling
|
|
145
|
+
state.samplingActive = true;
|
|
146
|
+
return Math.random() < ERROR_SAMPLING_CONFIG.sampleRate;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
/**
|
|
150
|
+
* Create initial sampling state for a batch.
|
|
151
|
+
*/
|
|
152
|
+
function createSamplingState(): ErrorSamplingState {
|
|
153
|
+
return {
|
|
154
|
+
totalErrors: 0,
|
|
155
|
+
sampledErrors: 0,
|
|
156
|
+
totalMessages: 0,
|
|
157
|
+
samplingActive: false,
|
|
158
|
+
};
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// =============================================================================
|
|
162
|
+
// HEARTBEAT HANDLING
|
|
163
|
+
// =============================================================================
|
|
164
|
+
|
|
165
|
+
/**
 * Handle a heartbeat message from health checks.
 * Writes to Analytics Engine (zeros) and upserts to D1 health_checks table.
 *
 * @param telemetry - Heartbeat telemetry message (is_heartbeat === true).
 * @param env - Worker bindings (PLATFORM_ANALYTICS, PLATFORM_DB).
 */
async function handleHeartbeat(telemetry: TelemetryMessage, env: Env): Promise<void> {
  // Unix seconds; stored as both last_heartbeat and updated_at below.
  const now = Math.floor(Date.now() / 1000);

  // Write to Analytics Engine with zeros (for consistency, shows heartbeat was processed)
  // NOTE: writeDataPoint is fire-and-forget; it is not awaited by design.
  env.PLATFORM_ANALYTICS.writeDataPoint({
    blobs: [telemetry.project, telemetry.category, telemetry.feature],
    doubles: new Array(20).fill(0),
    indexes: [telemetry.feature_key],
  });

  // Upsert to D1 system_health_checks table
  // NOTE(review): telemetry.feature_key is bound to the feature_id column and
  // telemetry.project to project_id — presumably feature_id stores the
  // composite key; confirm against the migration schema.
  await env.PLATFORM_DB.prepare(
    `
    INSERT INTO system_health_checks (id, project_id, feature_id, last_heartbeat, status, updated_at)
    VALUES (?1, ?2, ?3, ?4, 'healthy', ?4)
    ON CONFLICT (project_id, feature_id) DO UPDATE SET
      last_heartbeat = excluded.last_heartbeat,
      status = 'healthy',
      consecutive_failures = 0,
      updated_at = excluded.updated_at
  `
  )
    .bind(crypto.randomUUID(), telemetry.project, telemetry.feature_key, now)
    .run();

  // Note: logger not created per call - this is a hot path
  // Using inline log to avoid overhead
}
|
|
197
|
+
|
|
198
|
+
// =============================================================================
|
|
199
|
+
// ERROR ALERTING
|
|
200
|
+
// =============================================================================
|
|
201
|
+
|
|
202
|
+
/**
|
|
203
|
+
* Check if telemetry message contains errors that warrant alerting.
|
|
204
|
+
* Detects P0 conditions: circuit breaker trips, high error rates.
|
|
205
|
+
* Uses adaptive sampling to reduce D1 writes during high error rate periods.
|
|
206
|
+
*/
|
|
207
|
+
async function checkAndAlertErrors(
|
|
208
|
+
telemetry: TelemetryMessage,
|
|
209
|
+
env: Env,
|
|
210
|
+
samplingState: ErrorSamplingState
|
|
211
|
+
): Promise<void> {
|
|
212
|
+
// Skip if no errors reported
|
|
213
|
+
if (!telemetry.error_count || telemetry.error_count === 0) {
|
|
214
|
+
return;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// P0 Condition 1: Circuit breaker error (always alert, always store)
|
|
218
|
+
if (telemetry.error_category === 'CIRCUIT_BREAKER') {
|
|
219
|
+
await sendErrorAlert(env, {
|
|
220
|
+
type: 'p0_immediate',
|
|
221
|
+
feature_key: telemetry.feature_key,
|
|
222
|
+
project: telemetry.project,
|
|
223
|
+
category: telemetry.category,
|
|
224
|
+
feature: telemetry.feature,
|
|
225
|
+
correlation_id: telemetry.correlation_id,
|
|
226
|
+
error_category: telemetry.error_category,
|
|
227
|
+
error_code: telemetry.error_codes?.[0],
|
|
228
|
+
window_minutes: ERROR_RATE_THRESHOLDS.windowMinutes,
|
|
229
|
+
});
|
|
230
|
+
// Always store P0 errors
|
|
231
|
+
await storeErrorEvent(telemetry, env);
|
|
232
|
+
samplingState.sampledErrors++;
|
|
233
|
+
return;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
// Apply adaptive sampling for error storage
|
|
237
|
+
if (shouldStoreError(telemetry, samplingState)) {
|
|
238
|
+
await storeErrorEvent(telemetry, env);
|
|
239
|
+
samplingState.sampledErrors++;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
// Check error rate over window for P0/P1 conditions
|
|
243
|
+
const errorStats = await getErrorRateStats(telemetry.feature_key, env);
|
|
244
|
+
|
|
245
|
+
if (errorStats.totalRequests >= ERROR_RATE_THRESHOLDS.minRequests) {
|
|
246
|
+
const errorRate = (errorStats.errorCount / errorStats.totalRequests) * 100;
|
|
247
|
+
|
|
248
|
+
if (errorRate >= ERROR_RATE_THRESHOLDS.p0) {
|
|
249
|
+
// P0: High error rate (>50%)
|
|
250
|
+
await sendErrorAlert(env, {
|
|
251
|
+
type: 'p0_immediate',
|
|
252
|
+
feature_key: telemetry.feature_key,
|
|
253
|
+
project: telemetry.project,
|
|
254
|
+
category: telemetry.category,
|
|
255
|
+
feature: telemetry.feature,
|
|
256
|
+
correlation_id: telemetry.correlation_id,
|
|
257
|
+
error_category: telemetry.error_category,
|
|
258
|
+
error_code: telemetry.error_codes?.[0],
|
|
259
|
+
error_rate: errorRate,
|
|
260
|
+
window_minutes: ERROR_RATE_THRESHOLDS.windowMinutes,
|
|
261
|
+
});
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
/**
 * Store error event in D1 for aggregation and historical analysis.
 *
 * Best-effort: failures are logged and swallowed so a D1 outage cannot fail
 * the telemetry batch.
 *
 * @param telemetry - Message whose error fields are persisted.
 * @param env - Worker bindings (PLATFORM_DB, logger config).
 */
async function storeErrorEvent(telemetry: TelemetryMessage, env: Env): Promise<void> {
  const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:error-tracking');
  try {
    // 9 positional placeholders; bind order must match the column list exactly.
    await env.PLATFORM_DB.prepare(
      `INSERT INTO feature_error_events (
        id, feature_key, error_category, error_code, error_message,
        correlation_id, worker, priority, created_at
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
    )
      .bind(
        crypto.randomUUID(),
        telemetry.feature_key,
        telemetry.error_category || 'INTERNAL',
        telemetry.error_codes?.[0] || null,
        null, // No message in telemetry (truncated for space)
        telemetry.correlation_id || null,
        null, // Worker name not in telemetry
        'P2', // Default priority, upgraded by alert detection
        Math.floor(Date.now() / 1000)
      )
      .run();
  } catch (error) {
    // Deliberate swallow: error storage must not fail the batch.
    log.error('Failed to store error event', error);
  }
}
|
|
294
|
+
|
|
295
|
+
/**
 * Update error budget window for SLA tracking.
 * Aggregates success/error counts in 5-minute windows.
 *
 * Each call represents exactly one request: on insert the row starts with
 * total_count = 1, and on conflict total_count is incremented by 1 while the
 * per-category columns are incremented by the excluded (this call's) values.
 * Best-effort: failures are logged and swallowed.
 *
 * @param telemetry - Message contributing one success or one error.
 * @param env - Worker bindings (PLATFORM_DB, logger config).
 */
async function updateErrorBudgetWindow(telemetry: TelemetryMessage, env: Env): Promise<void> {
  const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:sla-tracking');

  try {
    // Calculate 5-minute window boundaries
    const WINDOW_SIZE_SECONDS = 5 * 60; // 5 minutes
    const now = Math.floor(Date.now() / 1000);
    // Floor to the containing window so concurrent calls hit the same row.
    const windowStart = Math.floor(now / WINDOW_SIZE_SECONDS) * WINDOW_SIZE_SECONDS;
    const windowEnd = windowStart + WINDOW_SIZE_SECONDS;

    const hasError = (telemetry.error_count ?? 0) > 0;
    const errorCategory = telemetry.error_category;

    // Determine error category counts (one-hot per known category).
    const timeoutIncrement = errorCategory === 'TIMEOUT' ? 1 : 0;
    const validationIncrement = errorCategory === 'VALIDATION' ? 1 : 0;
    const internalIncrement = errorCategory === 'INTERNAL' ? 1 : 0;
    const externalIncrement = errorCategory === 'EXTERNAL_API' ? 1 : 0;
    // "other" covers errors whose category is missing or not one of the above.
    const otherIncrement =
      hasError &&
      !['TIMEOUT', 'VALIDATION', 'INTERNAL', 'EXTERNAL_API'].includes(errorCategory || '')
        ? 1
        : 0;

    // Upsert window record
    // Row id is deterministic per (feature_key, windowStart) so the same id is
    // produced on every call within a window.
    await env.PLATFORM_DB.prepare(
      `INSERT INTO error_budget_windows (
        id, feature_key, project, window_start, window_end,
        success_count, error_count, total_count,
        timeout_count, validation_count, internal_count, external_count, other_count,
        created_at, updated_at
      ) VALUES (
        ?1, ?2, ?3, ?4, ?5,
        ?6, ?7, 1,
        ?8, ?9, ?10, ?11, ?12,
        unixepoch(), unixepoch()
      )
      ON CONFLICT(feature_key, window_start) DO UPDATE SET
        success_count = success_count + excluded.success_count,
        error_count = error_count + excluded.error_count,
        total_count = total_count + 1,
        timeout_count = timeout_count + excluded.timeout_count,
        validation_count = validation_count + excluded.validation_count,
        internal_count = internal_count + excluded.internal_count,
        external_count = external_count + excluded.external_count,
        other_count = other_count + excluded.other_count,
        updated_at = unixepoch()`
    )
      .bind(
        `${telemetry.feature_key}:${windowStart}`,
        telemetry.feature_key,
        telemetry.project,
        windowStart,
        windowEnd,
        hasError ? 0 : 1, // success_count
        hasError ? 1 : 0, // error_count
        timeoutIncrement,
        validationIncrement,
        internalIncrement,
        externalIncrement,
        otherIncrement
      )
      .run();
  } catch (error) {
    // Deliberate swallow: SLA bookkeeping must not fail the batch.
    log.error('Failed to update error budget window', error);
  }
}
|
|
366
|
+
|
|
367
|
+
/**
 * Get error rate statistics for a feature over the sliding window.
 *
 * NOTE(review): both error_count and total_events are computed from the SAME
 * table (feature_error_events) with the SAME filter, so they are always equal
 * and any caller-computed rate (errorCount / totalRequests) evaluates to 100%
 * whenever at least one error exists. The denominator was presumably meant to
 * come from a request/usage table — confirm the intended source before
 * relying on ERROR_RATE_THRESHOLDS.p0 comparisons. Also note that adaptive
 * sampling (shouldStoreError) undercounts stored errors during incidents.
 *
 * Best-effort: on query failure returns zeros rather than throwing.
 *
 * @param featureKey - Feature whose recent error events are counted.
 * @param env - Worker bindings (PLATFORM_DB, logger config).
 * @returns Counts over the last ERROR_RATE_THRESHOLDS.windowMinutes minutes.
 */
async function getErrorRateStats(
  featureKey: string,
  env: Env
): Promise<{ errorCount: number; totalRequests: number }> {
  try {
    // Window lower bound in unix seconds.
    const windowStart = Math.floor(Date.now() / 1000) - ERROR_RATE_THRESHOLDS.windowMinutes * 60;

    const result = await env.PLATFORM_DB.prepare(
      `SELECT
        COUNT(*) as error_count,
        (SELECT COUNT(*) FROM feature_error_events
         WHERE feature_key = ?1 AND created_at >= ?2) as total_events
      FROM feature_error_events
      WHERE feature_key = ?1 AND created_at >= ?2`
    )
      .bind(featureKey, windowStart)
      .first<{ error_count: number; total_events: number }>();

    return {
      errorCount: result?.error_count ?? 0,
      totalRequests: result?.total_events ?? 0,
    };
  } catch (error) {
    const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:error-tracking');
    log.error('Failed to get error rate', error);
    // Zeros keep callers' threshold checks inert on failure.
    return { errorCount: 0, totalRequests: 0 };
  }
}
|
|
398
|
+
|
|
399
|
+
/**
|
|
400
|
+
* Send error alert to alert-router.
|
|
401
|
+
* Uses service binding if available, falls back to direct Slack.
|
|
402
|
+
*/
|
|
403
|
+
async function sendErrorAlert(env: Env, payload: ErrorAlertPayload): Promise<void> {
|
|
404
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:error-alerting');
|
|
405
|
+
try {
|
|
406
|
+
if (env.ALERT_ROUTER) {
|
|
407
|
+
// Use service binding to call alert-router
|
|
408
|
+
const response = await env.ALERT_ROUTER.fetch('https://alert-router/errors', {
|
|
409
|
+
method: 'POST',
|
|
410
|
+
headers: { 'Content-Type': 'application/json' },
|
|
411
|
+
body: JSON.stringify(payload),
|
|
412
|
+
});
|
|
413
|
+
|
|
414
|
+
if (!response.ok) {
|
|
415
|
+
log.error(`alert-router returned ${response.status}`);
|
|
416
|
+
} else {
|
|
417
|
+
log.info('Alert sent', { type: payload.type, featureKey: payload.feature_key });
|
|
418
|
+
}
|
|
419
|
+
} else if (env.SLACK_WEBHOOK_URL) {
|
|
420
|
+
// Fallback: send directly to Slack (basic format)
|
|
421
|
+
const emoji = payload.type === 'p0_immediate' ? '🚨' : '⚠️';
|
|
422
|
+
await fetch(env.SLACK_WEBHOOK_URL, {
|
|
423
|
+
method: 'POST',
|
|
424
|
+
headers: { 'Content-Type': 'application/json' },
|
|
425
|
+
body: JSON.stringify({
|
|
426
|
+
text: `${emoji} [${payload.type.toUpperCase()}] Error in ${payload.feature_key}: ${payload.error_category}`,
|
|
427
|
+
}),
|
|
428
|
+
});
|
|
429
|
+
}
|
|
430
|
+
} catch (error) {
|
|
431
|
+
log.error('Failed to send alert', error);
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
// =============================================================================
|
|
436
|
+
// AI MODEL USAGE PERSISTENCE
|
|
437
|
+
// =============================================================================
|
|
438
|
+
|
|
439
|
+
/**
|
|
440
|
+
* Persist feature-level AI model usage to D1.
|
|
441
|
+
* Called from queue consumer when telemetry includes aiModelBreakdown.
|
|
442
|
+
* Uses upsert to aggregate invocations for the same feature/model/date.
|
|
443
|
+
*/
|
|
444
|
+
async function persistFeatureAIModelUsage(
|
|
445
|
+
env: Env,
|
|
446
|
+
featureKey: string,
|
|
447
|
+
modelBreakdown: Record<string, number>,
|
|
448
|
+
timestamp: Date
|
|
449
|
+
): Promise<number> {
|
|
450
|
+
const usageDate = timestamp.toISOString().split('T')[0]; // YYYY-MM-DD
|
|
451
|
+
let writes = 0;
|
|
452
|
+
|
|
453
|
+
for (const [model, invocations] of Object.entries(modelBreakdown)) {
|
|
454
|
+
if (invocations <= 0) continue;
|
|
455
|
+
|
|
456
|
+
await env.PLATFORM_DB.prepare(
|
|
457
|
+
`
|
|
458
|
+
INSERT INTO feature_ai_model_usage (
|
|
459
|
+
id, feature_key, model, usage_date, invocations, updated_at
|
|
460
|
+
) VALUES (?, ?, ?, ?, ?, unixepoch())
|
|
461
|
+
ON CONFLICT (feature_key, model, usage_date) DO UPDATE SET
|
|
462
|
+
invocations = invocations + excluded.invocations,
|
|
463
|
+
updated_at = unixepoch()
|
|
464
|
+
`
|
|
465
|
+
)
|
|
466
|
+
.bind(generateId(), featureKey, model, usageDate, invocations)
|
|
467
|
+
.run();
|
|
468
|
+
writes++;
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
return writes;
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
// =============================================================================
|
|
475
|
+
// INTELLIGENT DEGRADATION
|
|
476
|
+
// =============================================================================
|
|
477
|
+
|
|
478
|
+
/**
 * Process intelligent degradation updates for features seen in a batch.
 * Updates reservoir sampling and PID controller state in KV.
 *
 * Shadow mode: Currently logs throttle rates without applying them.
 * Set ENABLE_THROTTLE_WRITES=true in env to write throttle rates to KV.
 *
 * Per-feature failures are logged and swallowed so one feature cannot fail
 * the whole batch.
 *
 * @param featureStates - Accumulated per-feature batch state keyed by feature key.
 * @param env - Worker bindings (PLATFORM_CACHE KV, logger config).
 */
async function processIntelligentDegradation(
  featureStates: Map<string, FeatureBatchState>,
  env: Env
): Promise<void> {
  if (featureStates.size === 0) return;

  const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:intelligent-degradation');

  // Cast KV to work around type version mismatch between workers and lib modules
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const kv = env.PLATFORM_CACHE as any;
  const shadowMode = true; // TODO: Make configurable via env.ENABLE_THROTTLE_WRITES

  for (const [featureKey, batchState] of featureStates) {
    try {
      // 1. Update reservoir sampling with cpuMs samples
      if (batchState.cpuMsSamples.length > 0) {
        // Read-modify-write of reservoir state; no locking, so concurrent
        // consumers could race on the same KV key (last write wins).
        const reservoirState = await getReservoirState(featureKey, kv);
        for (const sample of batchState.cpuMsSamples) {
          addSample(reservoirState, sample);
        }
        await saveReservoirState(featureKey, reservoirState, kv);

        // Log percentiles periodically (every 100 samples)
        if (reservoirState.totalSeen % 100 === 0) {
          const percentiles = getPercentiles(reservoirState);
          if (percentiles) {
            log.info('Feature latency', {
              featureKey,
              latency: formatPercentiles(percentiles),
            });
          }
        }
      }

      // 2. Update PID controller if enough time has passed (60s interval)
      const pidState = await getPIDState(featureKey, kv);
      if (shouldUpdatePID(pidState.lastUpdate, 60_000)) {
        // Get current budget utilisation from KV
        // For now, use BCU as a proxy for utilisation
        // TODO: Get actual budget limit from CONFIG:FEATURE:{id}:BUDGET
        const budgetLimit = 10000; // Default BCU budget per 60s interval
        const currentUsage = calculateUtilisation(batchState.bcuTotal, budgetLimit);
        const deltaTimeMs = Date.now() - pidState.lastUpdate;

        const pidOutput = computePID(pidState, { currentUsage, deltaTimeMs });

        if (shadowMode) {
          // Shadow mode: log but don't write throttle rate to KV
          if (pidOutput.throttleRate > 0.01) {
            log.info('SHADOW throttle', {
              featureKey,
              throttle: formatThrottleRate(pidOutput.throttleRate),
              usagePct: (currentUsage * 100).toFixed(1),
              bcu: batchState.bcuTotal,
            });
          }
          // Still save PID state to maintain continuity
          pidOutput.newState.throttleRate = 0; // Don't persist throttle in shadow mode
          // NOTE(review): shadow mode writes the raw key `STATE:PID:{feature}`
          // directly instead of calling savePIDState() — presumably the same
          // key/TTL the helper uses; confirm against control.ts so the two
          // paths cannot diverge.
          await kv.put(`STATE:PID:${featureKey}`, JSON.stringify(pidOutput.newState), {
            expirationTtl: 86400,
          });
        } else {
          // Active mode: save state and write throttle rate to KV
          await savePIDState(featureKey, pidOutput.newState, kv);
          if (pidOutput.throttleRate > 0.01) {
            log.info('Throttle applied', {
              featureKey,
              throttle: formatThrottleRate(pidOutput.throttleRate),
              usagePct: (currentUsage * 100).toFixed(1),
            });
          }
        }
      }

      // 3. Log BCU summary for monitoring
      if (batchState.bcuTotal > 1000) {
        // Only log significant BCU usage
        // NOTE(review): partial cast — only `total` is populated; assumes
        // formatBCUResult tolerates missing BCUResult fields. Confirm.
        const bcuResult = { total: batchState.bcuTotal } as BCUResult;
        log.info('BCU summary', {
          featureKey,
          bcu: formatBCUResult(bcuResult),
          messages: batchState.messageCount,
        });
      }
    } catch (error) {
      // Don't fail the batch for intelligent degradation errors
      log.error(`Error processing ${featureKey}`, error);
    }
  }
}
|
|
576
|
+
|
|
577
|
+
// =============================================================================
|
|
578
|
+
// MAIN QUEUE HANDLER
|
|
579
|
+
// =============================================================================
|
|
580
|
+
|
|
581
|
+
/**
|
|
582
|
+
* Main queue consumer handler for telemetry messages.
|
|
583
|
+
* Processes batches of TelemetryMessage from the platform-telemetry queue.
|
|
584
|
+
*
|
|
585
|
+
* Processing steps per message:
|
|
586
|
+
* 1. Handle heartbeat messages (write zeros, update health check)
|
|
587
|
+
* 2. Write metrics to Analytics Engine
|
|
588
|
+
* 3. Accumulate intelligent degradation data
|
|
589
|
+
* 4. Check budget and update status if exceeded
|
|
590
|
+
* 5. Check for errors and send alerts if needed
|
|
591
|
+
* 6. Persist AI model breakdown to D1 if present
|
|
592
|
+
*
|
|
593
|
+
* After batch processing:
|
|
594
|
+
* - Process intelligent degradation updates for all features seen
|
|
595
|
+
*/
|
|
596
|
+
async function handleQueue(batch: MessageBatch<TelemetryMessage>, env: Env): Promise<void> {
|
|
597
|
+
const log = createLoggerFromEnv(env, 'platform-usage', 'platform:usage:telemetry');
|
|
598
|
+
log.info('Processing batch', { messages: batch.messages.length });
|
|
599
|
+
|
|
600
|
+
let successCount = 0;
|
|
601
|
+
let errorCount = 0;
|
|
602
|
+
let heartbeatCount = 0;
|
|
603
|
+
|
|
604
|
+
// Accumulate state per feature for intelligent degradation
|
|
605
|
+
// This allows batch-level KV operations instead of per-message
|
|
606
|
+
const featureStates = new Map<string, FeatureBatchState>();
|
|
607
|
+
|
|
608
|
+
// Error sampling state for adaptive D1 write reduction during incidents
|
|
609
|
+
const samplingState = createSamplingState();
|
|
610
|
+
samplingState.totalMessages = batch.messages.length;
|
|
611
|
+
|
|
612
|
+
for (const message of batch.messages) {
|
|
613
|
+
try {
|
|
614
|
+
const telemetry = message.body;
|
|
615
|
+
|
|
616
|
+
// Handle heartbeat messages differently (skip budget check)
|
|
617
|
+
if (telemetry.is_heartbeat) {
|
|
618
|
+
await handleHeartbeat(telemetry, env);
|
|
619
|
+
message.ack();
|
|
620
|
+
heartbeatCount++;
|
|
621
|
+
successCount++;
|
|
622
|
+
continue;
|
|
623
|
+
}
|
|
624
|
+
|
|
625
|
+
// Calculate total cost (CF resources + external APIs)
|
|
626
|
+
const cfCost = calculateCFCostFromMetrics(telemetry.metrics);
|
|
627
|
+
const externalCost = telemetry.external_cost_usd ?? 0;
|
|
628
|
+
const totalCost = cfCost + externalCost;
|
|
629
|
+
|
|
630
|
+
// Write to Analytics Engine
|
|
631
|
+
// Schema must match METRIC_FIELDS order from constants.ts:
|
|
632
|
+
// - blobs: [project, category, feature] (feature_key is in indexes)
|
|
633
|
+
// - doubles: ordered per METRIC_FIELDS (d1Writes, d1Reads, kvReads, ...)
|
|
634
|
+
// Note: AE has a hard limit of 20 doubles
|
|
635
|
+
env.PLATFORM_ANALYTICS.writeDataPoint({
|
|
636
|
+
blobs: [
|
|
637
|
+
telemetry.project, // blob1: project
|
|
638
|
+
telemetry.category, // blob2: category
|
|
639
|
+
telemetry.feature, // blob3: feature
|
|
640
|
+
],
|
|
641
|
+
doubles: [
|
|
642
|
+
// Legacy fields (positions 1-12) - DO NOT REORDER
|
|
643
|
+
telemetry.metrics.d1Writes ?? 0, // double1
|
|
644
|
+
telemetry.metrics.d1Reads ?? 0, // double2
|
|
645
|
+
telemetry.metrics.kvReads ?? 0, // double3
|
|
646
|
+
telemetry.metrics.kvWrites ?? 0, // double4
|
|
647
|
+
telemetry.metrics.doRequests ?? 0, // double5
|
|
648
|
+
telemetry.metrics.doGbSeconds ?? 0, // double6
|
|
649
|
+
telemetry.metrics.r2ClassA ?? 0, // double7
|
|
650
|
+
telemetry.metrics.r2ClassB ?? 0, // double8
|
|
651
|
+
telemetry.metrics.aiNeurons ?? 0, // double9
|
|
652
|
+
telemetry.metrics.queueMessages ?? 0, // double10
|
|
653
|
+
telemetry.metrics.requests ?? 0, // double11
|
|
654
|
+
telemetry.metrics.cpuMs ?? 0, // double12
|
|
655
|
+
// Extended fields (positions 13-20) - APPEND ONLY (20 field limit)
|
|
656
|
+
telemetry.metrics.d1RowsRead ?? 0, // double13
|
|
657
|
+
telemetry.metrics.d1RowsWritten ?? 0, // double14
|
|
658
|
+
telemetry.metrics.kvDeletes ?? 0, // double15
|
|
659
|
+
telemetry.metrics.kvLists ?? 0, // double16
|
|
660
|
+
telemetry.metrics.aiRequests ?? 0, // double17
|
|
661
|
+
telemetry.metrics.vectorizeQueries ?? 0, // double18
|
|
662
|
+
telemetry.metrics.vectorizeInserts ?? 0, // double19
|
|
663
|
+
// 2026-01-27: Repurposed from workflowInvocations (free in beta) for external API cost tracking
|
|
664
|
+
externalCost, // double20: external_cost_usd (OpenAI, Apify, etc.)
|
|
665
|
+
],
|
|
666
|
+
indexes: [telemetry.feature_key],
|
|
667
|
+
});
|
|
668
|
+
|
|
669
|
+
// Accumulate intelligent degradation data for this feature
|
|
670
|
+
const featureKey = telemetry.feature_key;
|
|
671
|
+
let state = featureStates.get(featureKey);
|
|
672
|
+
if (!state) {
|
|
673
|
+
state = { cpuMsSamples: [], bcuTotal: 0, messageCount: 0, lastTimestamp: 0 };
|
|
674
|
+
featureStates.set(featureKey, state);
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
// Collect cpuMs sample for reservoir
|
|
678
|
+
const cpuMs = telemetry.metrics.cpuMs ?? 0;
|
|
679
|
+
if (cpuMs > 0) {
|
|
680
|
+
state.cpuMsSamples.push(cpuMs);
|
|
681
|
+
}
|
|
682
|
+
|
|
683
|
+
// Calculate BCU for this message
|
|
684
|
+
const bcuResult = calculateBCU(telemetry.metrics);
|
|
685
|
+
state.bcuTotal += bcuResult.total;
|
|
686
|
+
state.messageCount++;
|
|
687
|
+
state.lastTimestamp = Math.max(state.lastTimestamp, telemetry.timestamp);
|
|
688
|
+
|
|
689
|
+
// Check budget and update status if exceeded
|
|
690
|
+
await checkAndUpdateBudgetStatus(telemetry.feature_key, telemetry.metrics, env);
|
|
691
|
+
|
|
692
|
+
// Check cost budget if there's a cost to track
|
|
693
|
+
if (totalCost > 0) {
|
|
694
|
+
await checkAndUpdateCostBudgetStatus(telemetry.feature_key, totalCost, env);
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
// Track total errors for sampling calculation
|
|
698
|
+
if (telemetry.error_count && telemetry.error_count > 0) {
|
|
699
|
+
samplingState.totalErrors++;
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
// Check for errors and send alerts if needed (with adaptive sampling)
|
|
703
|
+
await checkAndAlertErrors(telemetry, env, samplingState);
|
|
704
|
+
|
|
705
|
+
// Update error budget window for SLA tracking
|
|
706
|
+
await updateErrorBudgetWindow(telemetry, env);
|
|
707
|
+
|
|
708
|
+
// Persist AI model breakdown to D1 if present
|
|
709
|
+
if (telemetry.metrics.aiModelBreakdown) {
|
|
710
|
+
await persistFeatureAIModelUsage(
|
|
711
|
+
env,
|
|
712
|
+
telemetry.feature_key,
|
|
713
|
+
telemetry.metrics.aiModelBreakdown,
|
|
714
|
+
new Date(telemetry.timestamp)
|
|
715
|
+
);
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
message.ack();
|
|
719
|
+
successCount++;
|
|
720
|
+
} catch (error) {
|
|
721
|
+
// Enhanced error logging with full context for debugging
|
|
722
|
+
const telemetry = message.body;
|
|
723
|
+
const errorCategory = categoriseError(error);
|
|
724
|
+
const errorCode = extractErrorCode(error);
|
|
725
|
+
const fingerprint = generateErrorFingerprint(error);
|
|
726
|
+
|
|
727
|
+
log.error('Error processing telemetry message', error, {
|
|
728
|
+
feature_key: telemetry.feature_key,
|
|
729
|
+
project: telemetry.project,
|
|
730
|
+
category: telemetry.category,
|
|
731
|
+
error_category: errorCategory,
|
|
732
|
+
error_code: errorCode,
|
|
733
|
+
fingerprint,
|
|
734
|
+
partial_payload: createPartialPayload(telemetry),
|
|
735
|
+
correlation_id: telemetry.correlation_id,
|
|
736
|
+
});
|
|
737
|
+
|
|
738
|
+
message.retry();
|
|
739
|
+
errorCount++;
|
|
740
|
+
}
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
// Process intelligent degradation updates for each feature seen in batch
|
|
744
|
+
// This is done after message processing to not block the main loop
|
|
745
|
+
await processIntelligentDegradation(featureStates, env);
|
|
746
|
+
|
|
747
|
+
// Log batch summary with error rate and sampling info for monitoring
|
|
748
|
+
if (errorCount > 0) {
|
|
749
|
+
const errorRate = ((errorCount / batch.messages.length) * 100).toFixed(1);
|
|
750
|
+
log.warn('Batch complete with errors', {
|
|
751
|
+
success: successCount,
|
|
752
|
+
heartbeats: heartbeatCount,
|
|
753
|
+
errors: errorCount,
|
|
754
|
+
total: batch.messages.length,
|
|
755
|
+
error_rate_pct: errorRate,
|
|
756
|
+
sampling_active: samplingState.samplingActive,
|
|
757
|
+
errors_sampled: samplingState.sampledErrors,
|
|
758
|
+
errors_total: samplingState.totalErrors,
|
|
759
|
+
});
|
|
760
|
+
} else {
|
|
761
|
+
log.info('Batch complete', {
|
|
762
|
+
success: successCount,
|
|
763
|
+
heartbeats: heartbeatCount,
|
|
764
|
+
errors: errorCount,
|
|
765
|
+
});
|
|
766
|
+
}
|
|
767
|
+
}
|
|
768
|
+
|
|
769
|
+
// =============================================================================
|
|
770
|
+
// EXPORTS
|
|
771
|
+
// =============================================================================
|
|
772
|
+
|
|
773
|
+
export {
|
|
774
|
+
// Main queue handler
|
|
775
|
+
handleQueue,
|
|
776
|
+
// Heartbeat handling
|
|
777
|
+
handleHeartbeat,
|
|
778
|
+
// Intelligent degradation
|
|
779
|
+
processIntelligentDegradation,
|
|
780
|
+
// Error alerting
|
|
781
|
+
checkAndAlertErrors,
|
|
782
|
+
storeErrorEvent,
|
|
783
|
+
getErrorRateStats,
|
|
784
|
+
sendErrorAlert,
|
|
785
|
+
// AI model usage
|
|
786
|
+
persistFeatureAIModelUsage,
|
|
787
|
+
};
|
|
788
|
+
|
|
789
|
+
// Re-export checkAndUpdateBudgetStatus from budget-enforcement for backward compatibility
|
|
790
|
+
export { checkAndUpdateBudgetStatus } from './budget-enforcement';
|