@dotdo/postgres 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/backup/backup-manager.d.ts +244 -0
- package/dist/backup/backup-manager.d.ts.map +1 -0
- package/dist/backup/backup-manager.js +726 -0
- package/dist/backup/backup-manager.js.map +1 -0
- package/dist/observability/production-metrics.d.ts +318 -0
- package/dist/observability/production-metrics.d.ts.map +1 -0
- package/dist/observability/production-metrics.js +747 -0
- package/dist/observability/production-metrics.js.map +1 -0
- package/dist/pglite-assets/pglite.data +0 -0
- package/dist/pglite-assets/pglite.wasm +0 -0
- package/dist/pitr/pitr-manager.d.ts +240 -0
- package/dist/pitr/pitr-manager.d.ts.map +1 -0
- package/dist/pitr/pitr-manager.js +837 -0
- package/dist/pitr/pitr-manager.js.map +1 -0
- package/dist/streaming/cdc-iceberg-connector.d.ts +1 -1
- package/dist/streaming/cdc-iceberg-connector.js +1 -1
- package/dist/streaming/live-cdc-stream.d.ts +1 -1
- package/dist/streaming/live-cdc-stream.js +1 -1
- package/dist/worker/auth.d.ts.map +1 -1
- package/dist/worker/auth.js +16 -6
- package/dist/worker/auth.js.map +1 -1
- package/dist/worker/entry.d.ts.map +1 -1
- package/dist/worker/entry.js +108 -26
- package/dist/worker/entry.js.map +1 -1
- package/package.json +7 -6
- package/src/__tests__/backup.test.ts +944 -0
- package/src/__tests__/observability.test.ts +1089 -0
- package/src/__tests__/pitr.test.ts +1240 -0
- package/src/backup/backup-manager.ts +1006 -0
- package/src/observability/production-metrics.ts +1054 -0
- package/src/pglite-assets/pglite.data +0 -0
- package/src/pglite-assets/pglite.wasm +0 -0
- package/src/pitr/pitr-manager.ts +1136 -0
- package/src/worker/auth.ts +17 -6
- package/src/worker/entry.ts +112 -30
|
@@ -0,0 +1,747 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Production Observability Metrics for PostgreSQL Durable Objects
|
|
3
|
+
*
|
|
4
|
+
* Provides query metrics, connection stats, storage tier monitoring,
|
|
5
|
+
* health checks, Prometheus export, and alerting capabilities.
|
|
6
|
+
*/
|
|
7
|
+
// =============================================================================
|
|
8
|
+
// Constants
|
|
9
|
+
// =============================================================================
|
|
10
|
+
/** Queries whose duration is at or above this many milliseconds are added to the slow query log. */
const DEFAULT_SLOW_QUERY_THRESHOLD_MS = 100;
/** Maximum number of entries kept in the slow query log; older entries are dropped first. */
const MAX_SLOW_QUERY_LOG_SIZE = 100;
/** Default cap on the number of distinct normalized query patterns tracked as digests. */
const DEFAULT_MAX_QUERY_DIGESTS = 1000;
/** Default error-rate alert threshold, expressed in percent (0-100). */
const DEFAULT_ERROR_RATE_THRESHOLD_PERCENT = 5;
/** Default P99 latency alert threshold (milliseconds). */
const DEFAULT_P99_LATENCY_THRESHOLD_MS = 1000;
/** Number of durations kept in the reservoir when percentiles are computed from a sample. */
const PERCENTILE_SAMPLE_SIZE = 1000;
/** Query count above which percentiles are computed from a reservoir sample instead of all durations. */
const RESERVOIR_SAMPLING_THRESHOLD = 10000;
/**
 * Number of microtask yields the PGLite health check waits before reporting a timeout.
 * This is a count of event-loop microtask turns, not a wall-clock duration.
 */
const HEALTH_CHECK_TIMEOUT_YIELD_COUNT = 20;
/** Default health check timeout (milliseconds). */
export const DEFAULT_HEALTH_CHECK_TIMEOUT_MS = 5000;
/** Fixed placeholder reported as heap usage (50 MiB); it is a constant, not a measurement. */
const SIMULATED_HEAP_USED_BYTES = 50 * 1024 * 1024;
/** Value reported as total heap: 128 MiB, the Cloudflare Workers memory limit. */
const WORKER_MEMORY_LIMIT_BYTES = 128 * 1024 * 1024;
/** Alert evaluation frequency - alerts are re-evaluated every N recorded queries to limit overhead. */
const ALERT_EVALUATION_INTERVAL = 100;
/** Time window durations for metrics windowing (milliseconds). */
const ONE_MINUTE_MS = 60_000;
const FIVE_MINUTES_MS = 300_000;
const FIFTEEN_MINUTES_MS = 900_000;
/** Simplified storage cost estimates, multiplied by the bytes recorded for each tier. */
const COST_PER_BYTE_HOT = 0.000001; // ~$1/MB (DO SQLite blob storage)
// NOTE(review): this tier was previously annotated "Free (Cloudflare Cache)", yet the value
// is non-zero - confirm whether the constant or the annotation is the intended one.
const COST_PER_BYTE_WARM = 0.0000001;
const COST_PER_BYTE_COLD = 0.000000015; // R2 pricing
/** Default Prometheus histogram bucket upper bounds (seconds), in ascending order. */
const DEFAULT_HISTOGRAM_BOUNDARIES = [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0];
|
|
44
|
+
// =============================================================================
|
|
45
|
+
// Utility Functions
|
|
46
|
+
// =============================================================================
|
|
47
|
+
/**
 * Classifies a SQL statement by its leading keyword.
 *
 * Leading/trailing whitespace is ignored and matching is case-insensitive.
 *
 * @param {string} sql - The SQL text to classify.
 * @returns {string} 'SELECT', 'INSERT', 'UPDATE' or 'DELETE' when the statement
 *   starts with that keyword, otherwise 'OTHER'.
 */
function extractOperation(sql) {
    const head = sql.trim().toUpperCase();
    const knownKeywords = ['SELECT', 'INSERT', 'UPDATE', 'DELETE'];
    return knownKeywords.find((keyword) => head.startsWith(keyword)) ?? 'OTHER';
}
|
|
60
|
+
/**
 * Produces the digest key for a query by replacing every standalone run of
 * digits with the literal text `$N`, so queries that differ only in numeric
 * literals share one pattern. Digits embedded in identifiers (e.g. `col1`) are
 * left alone because they are not delimited by word boundaries.
 *
 * @param {string} sql - The SQL text to normalize.
 * @returns {string} The normalized pattern.
 */
function normalizeQuery(sql) {
    const standaloneNumber = /\b\d+\b/g;
    return sql.replace(standaloneNumber, () => '$N');
}
|
|
64
|
+
/**
 * Calculates the p-th percentile from a pre-sorted (ascending) array using the
 * nearest-rank method: the value at 1-based rank ceil(p/100 * n).
 *
 * @param {number[]} sorted - Values sorted in ascending order.
 * @param {number} p - Percentile to compute, in the range 0-100.
 * @returns {number} The selected element, or 0 for an empty array. Ranks that
 *   fall outside the array (p <= 0 or p > 100) are clamped to the first/last element.
 */
function percentile(sorted, p) {
    const count = sorted.length;
    if (count === 0) {
        return 0;
    }
    // Multiply before dividing to keep the rank exact for whole-number percentiles.
    const rank = Math.ceil((p * count) / 100);
    // Nearest-rank is 1-based; convert to a 0-based index and clamp into range.
    const index = Math.min(Math.max(rank - 1, 0), count - 1);
    return sorted[index];
}
|
|
76
|
+
// =============================================================================
|
|
77
|
+
// ProductionMetrics Class
|
|
78
|
+
// =============================================================================
|
|
79
|
+
/**
|
|
80
|
+
* Collects and exposes production metrics for PostgreSQL Durable Objects.
|
|
81
|
+
* Tracks query performance, connection statistics, storage tier operations,
|
|
82
|
+
* and provides health checks, Prometheus export, and alerting.
|
|
83
|
+
*/
|
|
84
|
+
export class ProductionMetrics {
|
|
85
|
+
config;
|
|
86
|
+
startTime;
|
|
87
|
+
// Query metrics state
|
|
88
|
+
queries = [];
|
|
89
|
+
slowQueries = [];
|
|
90
|
+
digestMap = new Map();
|
|
91
|
+
totalErrors = 0;
|
|
92
|
+
totalRowsReturned = 0;
|
|
93
|
+
// Connection state
|
|
94
|
+
connections = [];
|
|
95
|
+
totalConnectionsOpened = 0;
|
|
96
|
+
peakConnections = 0;
|
|
97
|
+
connectionErrors = 0;
|
|
98
|
+
connectionDurations = [];
|
|
99
|
+
waitTimes = [];
|
|
100
|
+
idleConnections = 0;
|
|
101
|
+
// Storage state
|
|
102
|
+
storageOps = [];
|
|
103
|
+
tierPromotions = { coldToWarm: 0, warmToHot: 0 };
|
|
104
|
+
tierDemotions = { hotToWarm: 0, warmToCold: 0 };
|
|
105
|
+
tierErrors = { hot: 0, warm: 0, cold: 0 };
|
|
106
|
+
tierUsage = { hot: 0, warm: 0, cold: 0 };
|
|
107
|
+
tierHealth = { hot: 'healthy', warm: 'healthy', cold: 'healthy' };
|
|
108
|
+
// Dependencies
|
|
109
|
+
pgliteInstance = null;
|
|
110
|
+
storageOrchestrator = null;
|
|
111
|
+
// Alert state
|
|
112
|
+
activeAlerts = [];
|
|
113
|
+
customThresholds = [];
|
|
114
|
+
constructor(config) {
|
|
115
|
+
this.config = config;
|
|
116
|
+
this.startTime = Date.now();
|
|
117
|
+
}
|
|
118
|
+
// ===========================================================================
|
|
119
|
+
// Query Metrics
|
|
120
|
+
// ===========================================================================
|
|
121
|
+
/** Records a completed query, updating metrics, digests, slow query log, and alerts */
|
|
122
|
+
recordQuery(record) {
|
|
123
|
+
const operation = extractOperation(record.sql);
|
|
124
|
+
const entry = {
|
|
125
|
+
durationMs: Math.max(0, record.durationMs),
|
|
126
|
+
success: record.success,
|
|
127
|
+
timestamp: Date.now(),
|
|
128
|
+
operation,
|
|
129
|
+
rowsReturned: record.rowsReturned,
|
|
130
|
+
};
|
|
131
|
+
this.queries.push(entry);
|
|
132
|
+
this.totalRowsReturned += record.rowsReturned;
|
|
133
|
+
if (!record.success) {
|
|
134
|
+
this.totalErrors++;
|
|
135
|
+
}
|
|
136
|
+
this.trackSlowQuery(record);
|
|
137
|
+
this.updateQueryDigest(record);
|
|
138
|
+
// Evaluate alerts periodically to avoid performance overhead on high-throughput workloads
|
|
139
|
+
if (this.queries.length <= ALERT_EVALUATION_INTERVAL || this.queries.length % ALERT_EVALUATION_INTERVAL === 0) {
|
|
140
|
+
this.evaluateAlerts();
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
/** Adds a query to the slow query log if it exceeds the configured threshold */
|
|
144
|
+
trackSlowQuery(record) {
|
|
145
|
+
const threshold = this.config.slowQueryThresholdMs || DEFAULT_SLOW_QUERY_THRESHOLD_MS;
|
|
146
|
+
if (record.durationMs >= threshold) {
|
|
147
|
+
this.slowQueries.push({
|
|
148
|
+
sql: record.sql,
|
|
149
|
+
durationMs: record.durationMs,
|
|
150
|
+
rowsReturned: record.rowsReturned,
|
|
151
|
+
timestamp: Date.now(),
|
|
152
|
+
});
|
|
153
|
+
if (this.slowQueries.length > MAX_SLOW_QUERY_LOG_SIZE) {
|
|
154
|
+
this.slowQueries = this.slowQueries.slice(-MAX_SLOW_QUERY_LOG_SIZE);
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
/** Updates the query digest map with a new query record */
|
|
159
|
+
updateQueryDigest(record) {
|
|
160
|
+
const pattern = normalizeQuery(record.sql);
|
|
161
|
+
const existing = this.digestMap.get(pattern);
|
|
162
|
+
if (existing) {
|
|
163
|
+
existing.count++;
|
|
164
|
+
existing.totalDurationMs += record.durationMs;
|
|
165
|
+
existing.avgDurationMs = existing.totalDurationMs / existing.count;
|
|
166
|
+
existing.lastSeen = Date.now();
|
|
167
|
+
}
|
|
168
|
+
else {
|
|
169
|
+
const maxDigests = this.config.maxQueryDigests || DEFAULT_MAX_QUERY_DIGESTS;
|
|
170
|
+
if (this.digestMap.size < maxDigests) {
|
|
171
|
+
this.digestMap.set(pattern, {
|
|
172
|
+
pattern,
|
|
173
|
+
count: 1,
|
|
174
|
+
avgDurationMs: record.durationMs,
|
|
175
|
+
totalDurationMs: record.durationMs,
|
|
176
|
+
lastSeen: Date.now(),
|
|
177
|
+
});
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
/** Returns a comprehensive snapshot of query performance metrics */
|
|
182
|
+
getQueryMetrics() {
|
|
183
|
+
const total = this.queries.length;
|
|
184
|
+
const durations = this.getSortedDurationsSample(total);
|
|
185
|
+
const totalDurationSum = total > 0 ? this.queries.reduce((s, q) => s + q.durationMs, 0) : 0;
|
|
186
|
+
const avgDuration = total > 0 ? totalDurationSum / total : 0;
|
|
187
|
+
const errors = this.queries.filter((q) => !q.success).length;
|
|
188
|
+
const errorRate = total > 0 ? errors / total : 0;
|
|
189
|
+
const now = Date.now();
|
|
190
|
+
const elapsedSeconds = Math.max(1, (now - this.startTime) / 1000);
|
|
191
|
+
const queriesPerSecond = total / elapsedSeconds;
|
|
192
|
+
const byOperation = this.computeOperationBreakdown();
|
|
193
|
+
const windows = {
|
|
194
|
+
oneMinute: this.getWindowMetrics(now - ONE_MINUTE_MS),
|
|
195
|
+
fiveMinutes: this.getWindowMetrics(now - FIVE_MINUTES_MS),
|
|
196
|
+
fifteenMinutes: this.getWindowMetrics(now - FIFTEEN_MINUTES_MS),
|
|
197
|
+
};
|
|
198
|
+
return {
|
|
199
|
+
totalQueries: total,
|
|
200
|
+
avgDurationMs: avgDuration,
|
|
201
|
+
p50DurationMs: percentile(durations, 50),
|
|
202
|
+
p95DurationMs: percentile(durations, 95),
|
|
203
|
+
p99DurationMs: percentile(durations, 99),
|
|
204
|
+
queriesPerSecond,
|
|
205
|
+
errorRate,
|
|
206
|
+
totalErrors: errors,
|
|
207
|
+
totalRowsReturned: this.totalRowsReturned,
|
|
208
|
+
byOperation,
|
|
209
|
+
windows,
|
|
210
|
+
};
|
|
211
|
+
}
|
|
212
|
+
/** Returns sorted duration samples, using reservoir sampling for large datasets */
|
|
213
|
+
getSortedDurationsSample(total) {
|
|
214
|
+
if (total > RESERVOIR_SAMPLING_THRESHOLD) {
|
|
215
|
+
const sample = [];
|
|
216
|
+
for (let i = 0; i < total; i++) {
|
|
217
|
+
if (i < PERCENTILE_SAMPLE_SIZE) {
|
|
218
|
+
sample.push(this.queries[i].durationMs);
|
|
219
|
+
}
|
|
220
|
+
else {
|
|
221
|
+
const j = Math.floor(Math.random() * (i + 1));
|
|
222
|
+
if (j < PERCENTILE_SAMPLE_SIZE) {
|
|
223
|
+
sample[j] = this.queries[i].durationMs;
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
return sample.sort((a, b) => a - b);
|
|
228
|
+
}
|
|
229
|
+
return this.queries.map((q) => q.durationMs).sort((a, b) => a - b);
|
|
230
|
+
}
|
|
231
|
+
/** Computes per-operation count and average duration breakdown */
|
|
232
|
+
computeOperationBreakdown() {
|
|
233
|
+
const byOperation = {};
|
|
234
|
+
for (const q of this.queries) {
|
|
235
|
+
if (!byOperation[q.operation]) {
|
|
236
|
+
byOperation[q.operation] = { count: 0, avgDurationMs: 0 };
|
|
237
|
+
}
|
|
238
|
+
byOperation[q.operation].count++;
|
|
239
|
+
}
|
|
240
|
+
for (const op of Object.keys(byOperation)) {
|
|
241
|
+
const opQueries = this.queries.filter((q) => q.operation === op);
|
|
242
|
+
const sum = opQueries.reduce((s, q) => s + q.durationMs, 0);
|
|
243
|
+
byOperation[op].avgDurationMs = opQueries.length > 0 ? sum / opQueries.length : 0;
|
|
244
|
+
}
|
|
245
|
+
return byOperation;
|
|
246
|
+
}
|
|
247
|
+
/** Returns a copy of the slow query log */
|
|
248
|
+
getSlowQueryLog() {
|
|
249
|
+
return [...this.slowQueries];
|
|
250
|
+
}
|
|
251
|
+
/** Returns all tracked query digest patterns */
|
|
252
|
+
getQueryDigests() {
|
|
253
|
+
return Array.from(this.digestMap.values());
|
|
254
|
+
}
|
|
255
|
+
/** Resets all query-related metrics, including digests and slow query log */
|
|
256
|
+
resetQueryMetrics() {
|
|
257
|
+
this.queries = [];
|
|
258
|
+
this.slowQueries = [];
|
|
259
|
+
this.digestMap.clear();
|
|
260
|
+
this.totalErrors = 0;
|
|
261
|
+
this.totalRowsReturned = 0;
|
|
262
|
+
}
|
|
263
|
+
// ===========================================================================
|
|
264
|
+
// Connection Stats
|
|
265
|
+
// ===========================================================================
|
|
266
|
+
/** Records a new connection being opened */
|
|
267
|
+
recordConnectionOpen(options) {
|
|
268
|
+
const conn = {
|
|
269
|
+
type: options?.type || 'unknown',
|
|
270
|
+
openedAt: Date.now(),
|
|
271
|
+
idle: false,
|
|
272
|
+
};
|
|
273
|
+
this.connections.push(conn);
|
|
274
|
+
this.totalConnectionsOpened++;
|
|
275
|
+
if (this.connections.length > this.peakConnections) {
|
|
276
|
+
this.peakConnections = this.connections.length;
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
/** Records a connection being closed, optionally with its total duration */
|
|
280
|
+
recordConnectionClose(options) {
|
|
281
|
+
if (this.connections.length > 0) {
|
|
282
|
+
const conn = this.connections.pop();
|
|
283
|
+
if (conn.idle) {
|
|
284
|
+
this.idleConnections = Math.max(0, this.idleConnections - 1);
|
|
285
|
+
}
|
|
286
|
+
if (options?.durationMs !== undefined) {
|
|
287
|
+
this.connectionDurations.push(options.durationMs);
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
/** Records a connection error */
|
|
292
|
+
recordConnectionError(_message) {
|
|
293
|
+
this.connectionErrors++;
|
|
294
|
+
}
|
|
295
|
+
/** Records a connection transitioning to idle state */
|
|
296
|
+
recordConnectionIdle() {
|
|
297
|
+
this.idleConnections++;
|
|
298
|
+
}
|
|
299
|
+
/** Records a connection being acquired from the pool, with wait time */
|
|
300
|
+
recordConnectionAcquired(options) {
|
|
301
|
+
this.waitTimes.push(options.waitTimeMs);
|
|
302
|
+
}
|
|
303
|
+
/** Returns a snapshot of connection statistics */
|
|
304
|
+
getConnectionStats() {
|
|
305
|
+
const wsCount = this.connections.filter((c) => c.type === 'websocket').length;
|
|
306
|
+
const httpCount = this.connections.filter((c) => c.type === 'http').length;
|
|
307
|
+
const avgDuration = this.connectionDurations.length > 0
|
|
308
|
+
? this.connectionDurations.reduce((s, d) => s + d, 0) / this.connectionDurations.length
|
|
309
|
+
: 0;
|
|
310
|
+
const avgWait = this.waitTimes.length > 0
|
|
311
|
+
? this.waitTimes.reduce((s, w) => s + w, 0) / this.waitTimes.length
|
|
312
|
+
: 0;
|
|
313
|
+
return {
|
|
314
|
+
activeConnections: this.connections.length,
|
|
315
|
+
totalConnectionsOpened: this.totalConnectionsOpened,
|
|
316
|
+
peakConnections: this.peakConnections,
|
|
317
|
+
connectionErrors: this.connectionErrors,
|
|
318
|
+
avgConnectionDurationMs: avgDuration,
|
|
319
|
+
idleConnections: this.idleConnections,
|
|
320
|
+
poolUtilization: Math.min(1.0, this.connections.length), // max_connections=1 in DO model
|
|
321
|
+
websocketConnections: wsCount,
|
|
322
|
+
httpConnections: httpCount,
|
|
323
|
+
avgWaitTimeMs: avgWait,
|
|
324
|
+
uptimeMs: Date.now() - this.startTime,
|
|
325
|
+
};
|
|
326
|
+
}
|
|
327
|
+
// ===========================================================================
|
|
328
|
+
// Storage Tier Stats
|
|
329
|
+
// ===========================================================================
|
|
330
|
+
/** Records a storage tier operation (read/write) with optional hit/miss and timing */
|
|
331
|
+
recordStorageOperation(tier, operation, options) {
|
|
332
|
+
this.storageOps.push({
|
|
333
|
+
tier,
|
|
334
|
+
operation,
|
|
335
|
+
hit: options.hit,
|
|
336
|
+
bytes: options.bytes,
|
|
337
|
+
durationMs: options.durationMs,
|
|
338
|
+
timestamp: Date.now(),
|
|
339
|
+
});
|
|
340
|
+
}
|
|
341
|
+
/** Records a data promotion between storage tiers */
|
|
342
|
+
recordTierPromotion(from, to, _details) {
|
|
343
|
+
if (from === 'cold' && to === 'warm')
|
|
344
|
+
this.tierPromotions.coldToWarm++;
|
|
345
|
+
if (from === 'warm' && to === 'hot')
|
|
346
|
+
this.tierPromotions.warmToHot++;
|
|
347
|
+
}
|
|
348
|
+
/** Records a data demotion between storage tiers */
|
|
349
|
+
recordTierDemotion(from, to, _details) {
|
|
350
|
+
if (from === 'hot' && to === 'warm')
|
|
351
|
+
this.tierDemotions.hotToWarm++;
|
|
352
|
+
if (from === 'warm' && to === 'cold')
|
|
353
|
+
this.tierDemotions.warmToCold++;
|
|
354
|
+
}
|
|
355
|
+
/** Records a storage error and triggers alert if threshold is exceeded */
|
|
356
|
+
recordStorageError(tier, _message) {
|
|
357
|
+
this.tierErrors[tier] = (this.tierErrors[tier] || 0) + 1;
|
|
358
|
+
// Check alert threshold
|
|
359
|
+
const totalStorageErrors = Object.values(this.tierErrors).reduce((s, e) => s + e, 0);
|
|
360
|
+
const threshold = this.config.alertThresholds?.storageErrorRate;
|
|
361
|
+
if (threshold !== undefined && totalStorageErrors > threshold) {
|
|
362
|
+
this.addAlert({
|
|
363
|
+
type: 'storage_error',
|
|
364
|
+
severity: 'critical',
|
|
365
|
+
message: `Storage error rate exceeded threshold: ${totalStorageErrors} errors`,
|
|
366
|
+
triggeredAt: Date.now(),
|
|
367
|
+
value: totalStorageErrors,
|
|
368
|
+
});
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
/** Records the current storage usage for a tier */
|
|
372
|
+
recordStorageUsage(tier, bytes) {
|
|
373
|
+
this.tierUsage[tier] = bytes;
|
|
374
|
+
}
|
|
375
|
+
/** Records a change in tier health status */
|
|
376
|
+
recordTierHealthChange(tier, status) {
|
|
377
|
+
this.tierHealth[tier] = status;
|
|
378
|
+
}
|
|
379
|
+
/** Returns a comprehensive snapshot of storage tier statistics including costs */
|
|
380
|
+
getStorageTierStats() {
|
|
381
|
+
const getTierStats = (tier) => {
|
|
382
|
+
const ops = this.storageOps.filter((o) => o.tier === tier);
|
|
383
|
+
const reads = ops.filter((o) => o.operation === 'read');
|
|
384
|
+
const hits = reads.filter((o) => o.hit === true).length;
|
|
385
|
+
const hitRate = reads.length > 0 ? hits / reads.length : 0;
|
|
386
|
+
const bytesRead = ops.filter((o) => o.operation === 'read').reduce((s, o) => s + o.bytes, 0);
|
|
387
|
+
const bytesWritten = ops.filter((o) => o.operation === 'write').reduce((s, o) => s + o.bytes, 0);
|
|
388
|
+
const durationsWithValues = ops.filter((o) => o.durationMs !== undefined);
|
|
389
|
+
const avgLatencyMs = durationsWithValues.length > 0
|
|
390
|
+
? durationsWithValues.reduce((s, o) => s + (o.durationMs || 0), 0) / durationsWithValues.length
|
|
391
|
+
: 0;
|
|
392
|
+
return {
|
|
393
|
+
hitRate,
|
|
394
|
+
bytesRead,
|
|
395
|
+
bytesWritten,
|
|
396
|
+
totalOperations: ops.length,
|
|
397
|
+
avgLatencyMs,
|
|
398
|
+
errors: this.tierErrors[tier] || 0,
|
|
399
|
+
usageBytes: this.tierUsage[tier] || 0,
|
|
400
|
+
healthStatus: (this.tierHealth[tier] || 'healthy'),
|
|
401
|
+
};
|
|
402
|
+
};
|
|
403
|
+
const hotStats = getTierStats('hot');
|
|
404
|
+
const warmStats = getTierStats('warm');
|
|
405
|
+
const coldStats = getTierStats('cold');
|
|
406
|
+
// Estimate costs based on bytes stored
|
|
407
|
+
const hotCost = (this.tierUsage['hot'] || 0) * COST_PER_BYTE_HOT;
|
|
408
|
+
const warmCost = (this.tierUsage['warm'] || 0) * COST_PER_BYTE_WARM;
|
|
409
|
+
const coldCost = (this.tierUsage['cold'] || 0) * COST_PER_BYTE_COLD;
|
|
410
|
+
// Calculate tiering efficiency (hot hit rate as primary metric)
|
|
411
|
+
const hotReads = this.storageOps.filter((o) => o.tier === 'hot' && o.operation === 'read');
|
|
412
|
+
const hotHits = hotReads.filter((o) => o.hit === true).length;
|
|
413
|
+
const tieringEfficiency = hotReads.length > 0 ? hotHits / hotReads.length : 0;
|
|
414
|
+
return {
|
|
415
|
+
hot: hotStats,
|
|
416
|
+
warm: warmStats,
|
|
417
|
+
cold: coldStats,
|
|
418
|
+
promotions: { ...this.tierPromotions },
|
|
419
|
+
demotions: { ...this.tierDemotions },
|
|
420
|
+
estimatedCosts: {
|
|
421
|
+
hot: hotCost,
|
|
422
|
+
warm: warmCost,
|
|
423
|
+
cold: coldCost,
|
|
424
|
+
total: hotCost + warmCost + coldCost,
|
|
425
|
+
},
|
|
426
|
+
tieringEfficiency,
|
|
427
|
+
};
|
|
428
|
+
}
|
|
429
|
+
// ===========================================================================
|
|
430
|
+
// Health Checks
|
|
431
|
+
// ===========================================================================
|
|
432
|
+
/** Sets the PGLite instance for health check queries */
|
|
433
|
+
setPGLiteInstance(pglite) {
|
|
434
|
+
this.pgliteInstance = pglite;
|
|
435
|
+
}
|
|
436
|
+
/** Sets the storage orchestrator for health check tier status */
|
|
437
|
+
setStorageOrchestrator(orchestrator) {
|
|
438
|
+
this.storageOrchestrator = orchestrator;
|
|
439
|
+
}
|
|
440
|
+
/** Returns a simple liveness probe result (always healthy if the service is running) */
|
|
441
|
+
liveness() {
|
|
442
|
+
return {
|
|
443
|
+
status: 'healthy',
|
|
444
|
+
service: this.config.serviceName,
|
|
445
|
+
uptimeMs: Date.now() - this.startTime,
|
|
446
|
+
};
|
|
447
|
+
}
|
|
448
|
+
/** Performs a readiness check including PGLite, storage, and memory health */
|
|
449
|
+
async readiness(_options) {
|
|
450
|
+
const startTime = Date.now();
|
|
451
|
+
const checks = {};
|
|
452
|
+
checks.pglite = await this.checkPGLiteHealth();
|
|
453
|
+
checks.storage = this.checkStorageHealth();
|
|
454
|
+
checks.memory = {
|
|
455
|
+
status: 'healthy',
|
|
456
|
+
details: {
|
|
457
|
+
heapUsedBytes: SIMULATED_HEAP_USED_BYTES,
|
|
458
|
+
heapTotalBytes: WORKER_MEMORY_LIMIT_BYTES,
|
|
459
|
+
},
|
|
460
|
+
};
|
|
461
|
+
const overallStatus = this.determineOverallStatus(checks);
|
|
462
|
+
return {
|
|
463
|
+
status: overallStatus,
|
|
464
|
+
checks,
|
|
465
|
+
responseTimeMs: Date.now() - startTime,
|
|
466
|
+
};
|
|
467
|
+
}
|
|
468
|
+
/**
 * Checks PGLite health by running `SELECT 1 as result` against the registered
 * instance, racing the query against a microtask-based timeout.
 *
 * The timeout is not wall-clock based: it gives the query up to
 * HEALTH_CHECK_TIMEOUT_YIELD_COUNT microtask turns to settle and rejects with
 * 'Health check timeout' if it has not.
 *
 * @returns {Promise<{status: string, error?: string}>} `{ status: 'healthy' }`
 *   when the query fulfils in time; otherwise `{ status: 'unhealthy', error }`
 *   (no instance registered, query failure, or timeout). Never rejects.
 */
async checkPGLiteHealth() {
    if (!this.pgliteInstance) {
        return { status: 'unhealthy', error: 'PGLite not initialized' };
    }
    try {
        const queryPromise = this.pgliteInstance.query('SELECT 1 as result');
        // Flag flipped as soon as the query fulfils or rejects, so the
        // timeout loop below can stop polling.
        let settled = false;
        const wrappedQuery = queryPromise.then((v) => { settled = true; return v; }, (e) => { settled = true; throw e; });
        const timeoutCheck = new Promise(async (_, reject) => {
            // Yield one microtask per iteration; bail out quietly once the
            // query has settled (this promise then simply stays pending).
            for (let i = 0; i < HEALTH_CHECK_TIMEOUT_YIELD_COUNT; i++) {
                await Promise.resolve();
                if (settled)
                    return;
            }
            if (!settled) {
                reject(new Error('Health check timeout'));
            }
        });
        // Whichever settles first decides the outcome; a rejection from either
        // side lands in the catch block below.
        await Promise.race([wrappedQuery, timeoutCheck]);
        return { status: 'healthy' };
    }
    catch (e) {
        const errorMsg = e instanceof Error ? e.message : 'Unknown health check error';
        return {
            status: 'unhealthy',
            // Any message mentioning 'timeout' is normalised to one fixed string.
            error: errorMsg.includes('timeout') ? 'Health check timeout' : errorMsg,
        };
    }
}
|
|
498
|
+
/** Checks storage orchestrator health */
|
|
499
|
+
checkStorageHealth() {
|
|
500
|
+
if (!this.storageOrchestrator) {
|
|
501
|
+
return { status: 'healthy', details: { message: 'No orchestrator configured' } };
|
|
502
|
+
}
|
|
503
|
+
const tierHealth = this.storageOrchestrator.getTierHealth();
|
|
504
|
+
const allHealthy = Object.values(tierHealth).every((t) => t.status === 'healthy');
|
|
505
|
+
const anyDegraded = Object.values(tierHealth).some((t) => t.status === 'degraded');
|
|
506
|
+
return {
|
|
507
|
+
status: allHealthy ? 'healthy' : (anyDegraded ? 'degraded' : 'unhealthy'),
|
|
508
|
+
details: tierHealth,
|
|
509
|
+
};
|
|
510
|
+
}
|
|
511
|
+
/** Determines the worst overall status from all component checks */
|
|
512
|
+
determineOverallStatus(checks) {
|
|
513
|
+
const statuses = Object.values(checks).map((c) => c.status);
|
|
514
|
+
if (statuses.includes('unhealthy'))
|
|
515
|
+
return 'unhealthy';
|
|
516
|
+
if (statuses.includes('degraded'))
|
|
517
|
+
return 'degraded';
|
|
518
|
+
return 'healthy';
|
|
519
|
+
}
|
|
520
|
+
/** Performs a deep health check including WAL status */
|
|
521
|
+
async deepCheck() {
|
|
522
|
+
const startTime = Date.now();
|
|
523
|
+
const readinessResult = await this.readiness();
|
|
524
|
+
// Add WAL check
|
|
525
|
+
readinessResult.checks.wal = {
|
|
526
|
+
status: 'healthy',
|
|
527
|
+
details: { lastArchive: 'N/A' },
|
|
528
|
+
};
|
|
529
|
+
readinessResult.responseTimeMs = Date.now() - startTime;
|
|
530
|
+
return readinessResult;
|
|
531
|
+
}
|
|
532
|
+
// ===========================================================================
|
|
533
|
+
// Metrics Export
|
|
534
|
+
// ===========================================================================
|
|
535
|
+
/** Exports all metrics in Prometheus text exposition format */
|
|
536
|
+
exportPrometheus() {
|
|
537
|
+
const lines = [];
|
|
538
|
+
const labels = `service="${this.config.serviceName}",do_id="${this.config.doId}"`;
|
|
539
|
+
const queryMetrics = this.getQueryMetrics();
|
|
540
|
+
// Query total counter
|
|
541
|
+
lines.push('# HELP postgres_query_total Total number of queries executed');
|
|
542
|
+
lines.push('# TYPE postgres_query_total counter');
|
|
543
|
+
lines.push(`postgres_query_total{${labels}} ${queryMetrics.totalQueries}`);
|
|
544
|
+
// Query errors
|
|
545
|
+
lines.push('# HELP postgres_query_errors_total Total number of query errors');
|
|
546
|
+
lines.push('# TYPE postgres_query_errors_total counter');
|
|
547
|
+
lines.push(`postgres_query_errors_total{${labels}} ${queryMetrics.totalErrors}`);
|
|
548
|
+
// Query duration histogram
|
|
549
|
+
lines.push('# HELP postgres_query_duration_seconds Query execution time in seconds');
|
|
550
|
+
lines.push('# TYPE postgres_query_duration_seconds histogram');
|
|
551
|
+
const boundaries = this.config.histogramBoundaries || DEFAULT_HISTOGRAM_BOUNDARIES;
|
|
552
|
+
const durations = this.queries.map((q) => q.durationMs / 1000).sort((a, b) => a - b);
|
|
553
|
+
let sum = 0;
|
|
554
|
+
for (const boundary of boundaries) {
|
|
555
|
+
const count = durations.filter((d) => d <= boundary).length;
|
|
556
|
+
lines.push(`postgres_query_duration_seconds_bucket{${labels},le="${boundary}"} ${count}`);
|
|
557
|
+
}
|
|
558
|
+
lines.push(`postgres_query_duration_seconds_bucket{${labels},le="+Inf"} ${durations.length}`);
|
|
559
|
+
sum = durations.reduce((s, d) => s + d, 0);
|
|
560
|
+
lines.push(`postgres_query_duration_seconds_count{${labels}} ${durations.length}`);
|
|
561
|
+
lines.push(`postgres_query_duration_seconds_sum{${labels}} ${sum}`);
|
|
562
|
+
// Connections
|
|
563
|
+
const connStats = this.getConnectionStats();
|
|
564
|
+
lines.push('# HELP postgres_connections_active Current active connections');
|
|
565
|
+
lines.push('# TYPE postgres_connections_active gauge');
|
|
566
|
+
lines.push(`postgres_connections_active{${labels}} ${connStats.activeConnections}`);
|
|
567
|
+
// Storage operations
|
|
568
|
+
lines.push('# HELP postgres_storage_operations_total Total storage operations');
|
|
569
|
+
lines.push('# TYPE postgres_storage_operations_total counter');
|
|
570
|
+
for (const tier of ['hot', 'warm', 'cold']) {
|
|
571
|
+
const count = this.storageOps.filter((o) => o.tier === tier).length;
|
|
572
|
+
lines.push(`postgres_storage_operations_total{${labels},tier="${tier}"} ${count}`);
|
|
573
|
+
}
|
|
574
|
+
return lines.join('\n');
|
|
575
|
+
}
|
|
576
|
+
/** Exports all metrics as a structured JSON object */
|
|
577
|
+
exportJSON() {
|
|
578
|
+
return {
|
|
579
|
+
metrics: {
|
|
580
|
+
queries: this.getQueryMetrics(),
|
|
581
|
+
connections: this.getConnectionStats(),
|
|
582
|
+
storage: this.getStorageTierStats(),
|
|
583
|
+
},
|
|
584
|
+
timestamp: Date.now(),
|
|
585
|
+
service: this.config.serviceName,
|
|
586
|
+
};
|
|
587
|
+
}
|
|
588
|
+
/** Creates an HTTP request handler that serves metrics in Prometheus or JSON format */
|
|
589
|
+
createMetricsHandler() {
|
|
590
|
+
return async (request) => {
|
|
591
|
+
const accept = request.headers.get('Accept') || 'text/plain';
|
|
592
|
+
if (accept.includes('application/json')) {
|
|
593
|
+
const json = this.exportJSON();
|
|
594
|
+
return new Response(JSON.stringify(json), {
|
|
595
|
+
status: 200,
|
|
596
|
+
headers: { 'content-type': 'application/json' },
|
|
597
|
+
});
|
|
598
|
+
}
|
|
599
|
+
// Default: Prometheus format
|
|
600
|
+
const prometheus = this.exportPrometheus();
|
|
601
|
+
return new Response(prometheus, {
|
|
602
|
+
status: 200,
|
|
603
|
+
headers: { 'content-type': 'text/plain; charset=utf-8' },
|
|
604
|
+
});
|
|
605
|
+
};
|
|
606
|
+
}
|
|
607
|
+
// ===========================================================================
|
|
608
|
+
// Dashboard
|
|
609
|
+
// ===========================================================================
|
|
610
|
+
/** Returns a complete metrics dashboard snapshot for display */
|
|
611
|
+
getDashboard() {
|
|
612
|
+
return {
|
|
613
|
+
queries: this.getQueryMetrics(),
|
|
614
|
+
connections: this.getConnectionStats(),
|
|
615
|
+
storage: this.getStorageTierStats(),
|
|
616
|
+
health: {
|
|
617
|
+
status: 'healthy',
|
|
618
|
+
checks: {},
|
|
619
|
+
},
|
|
620
|
+
service: {
|
|
621
|
+
name: this.config.serviceName,
|
|
622
|
+
doId: this.config.doId,
|
|
623
|
+
uptimeMs: Date.now() - this.startTime,
|
|
624
|
+
version: this.config.serviceVersion,
|
|
625
|
+
},
|
|
626
|
+
memory: {
|
|
627
|
+
heapUsedBytes: SIMULATED_HEAP_USED_BYTES,
|
|
628
|
+
heapTotalBytes: WORKER_MEMORY_LIMIT_BYTES,
|
|
629
|
+
},
|
|
630
|
+
alerts: this.getActiveAlerts(),
|
|
631
|
+
timestamp: Date.now(),
|
|
632
|
+
};
|
|
633
|
+
}
|
|
634
|
+
// ===========================================================================
|
|
635
|
+
// Alerts
|
|
636
|
+
// ===========================================================================
|
|
637
|
+
/** Evaluates all alert conditions against current metrics, adding/removing alerts as needed */
|
|
638
|
+
evaluateAlerts() {
|
|
639
|
+
const queryMetrics = this.getQueryMetrics();
|
|
640
|
+
const errorThreshold = this.config.alertThresholds?.errorRatePercent ?? DEFAULT_ERROR_RATE_THRESHOLD_PERCENT;
|
|
641
|
+
const errorRatePercent = queryMetrics.errorRate * 100;
|
|
642
|
+
if (errorRatePercent > errorThreshold) {
|
|
643
|
+
this.addAlert({
|
|
644
|
+
type: 'error_rate',
|
|
645
|
+
severity: 'critical',
|
|
646
|
+
message: `Error rate ${errorRatePercent.toFixed(1)}% exceeds threshold ${errorThreshold}%`,
|
|
647
|
+
triggeredAt: Date.now(),
|
|
648
|
+
value: errorRatePercent,
|
|
649
|
+
});
|
|
650
|
+
}
|
|
651
|
+
else {
|
|
652
|
+
// Resolve error rate alerts
|
|
653
|
+
this.activeAlerts = this.activeAlerts.filter((a) => a.type !== 'error_rate');
|
|
654
|
+
}
|
|
655
|
+
const p99Threshold = this.config.alertThresholds?.p99LatencyMs ?? DEFAULT_P99_LATENCY_THRESHOLD_MS;
|
|
656
|
+
if (queryMetrics.p99DurationMs > p99Threshold) {
|
|
657
|
+
this.addAlert({
|
|
658
|
+
type: 'high_latency',
|
|
659
|
+
severity: 'warning',
|
|
660
|
+
message: `P99 latency ${queryMetrics.p99DurationMs}ms exceeds threshold ${p99Threshold}ms`,
|
|
661
|
+
triggeredAt: Date.now(),
|
|
662
|
+
value: queryMetrics.p99DurationMs,
|
|
663
|
+
});
|
|
664
|
+
}
|
|
665
|
+
else {
|
|
666
|
+
this.activeAlerts = this.activeAlerts.filter((a) => a.type !== 'high_latency');
|
|
667
|
+
}
|
|
668
|
+
// Custom thresholds (e.g. slow query count)
|
|
669
|
+
for (const threshold of this.customThresholds) {
|
|
670
|
+
if (threshold.type === 'slow_query_count') {
|
|
671
|
+
const slowCount = this.slowQueries.length;
|
|
672
|
+
if (slowCount > threshold.threshold) {
|
|
673
|
+
this.addAlert({
|
|
674
|
+
name: threshold.name,
|
|
675
|
+
type: threshold.type,
|
|
676
|
+
severity: threshold.severity,
|
|
677
|
+
message: `Slow query count ${slowCount} exceeds threshold ${threshold.threshold}`,
|
|
678
|
+
triggeredAt: Date.now(),
|
|
679
|
+
value: slowCount,
|
|
680
|
+
});
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
/** Returns a copy of all currently active alerts */
|
|
686
|
+
getActiveAlerts() {
|
|
687
|
+
return [...this.activeAlerts];
|
|
688
|
+
}
|
|
689
|
+
/** Registers a custom alert threshold for evaluation */
|
|
690
|
+
registerAlertThreshold(threshold) {
|
|
691
|
+
this.customThresholds.push(threshold);
|
|
692
|
+
}
|
|
693
|
+
// ===========================================================================
|
|
694
|
+
// Reset
|
|
695
|
+
// ===========================================================================
|
|
696
|
+
/** Resets all metrics state (queries, connections, storage, alerts) */
|
|
697
|
+
resetAll() {
|
|
698
|
+
this.resetQueryMetrics();
|
|
699
|
+
this.connections = [];
|
|
700
|
+
this.totalConnectionsOpened = 0;
|
|
701
|
+
this.peakConnections = 0;
|
|
702
|
+
this.connectionErrors = 0;
|
|
703
|
+
this.connectionDurations = [];
|
|
704
|
+
this.waitTimes = [];
|
|
705
|
+
this.idleConnections = 0;
|
|
706
|
+
this.storageOps = [];
|
|
707
|
+
this.tierPromotions = { coldToWarm: 0, warmToHot: 0 };
|
|
708
|
+
this.tierDemotions = { hotToWarm: 0, warmToCold: 0 };
|
|
709
|
+
this.tierErrors = { hot: 0, warm: 0, cold: 0 };
|
|
710
|
+
this.tierUsage = { hot: 0, warm: 0, cold: 0 };
|
|
711
|
+
this.activeAlerts = [];
|
|
712
|
+
}
|
|
713
|
+
// ===========================================================================
|
|
714
|
+
// Private Helpers
|
|
715
|
+
// ===========================================================================
|
|
716
|
+
getWindowMetrics(sinceTimestamp) {
|
|
717
|
+
const windowQueries = this.queries.filter((q) => q.timestamp >= sinceTimestamp);
|
|
718
|
+
const total = windowQueries.length;
|
|
719
|
+
const errors = windowQueries.filter((q) => !q.success).length;
|
|
720
|
+
const avgDuration = total > 0
|
|
721
|
+
? windowQueries.reduce((s, q) => s + q.durationMs, 0) / total
|
|
722
|
+
: 0;
|
|
723
|
+
return {
|
|
724
|
+
totalQueries: total,
|
|
725
|
+
avgDurationMs: avgDuration,
|
|
726
|
+
errorRate: total > 0 ? errors / total : 0,
|
|
727
|
+
};
|
|
728
|
+
}
|
|
729
|
+
addAlert(alert) {
|
|
730
|
+
// Don't add duplicate alerts of same type
|
|
731
|
+
const existing = this.activeAlerts.find((a) => a.type === alert.type && (alert.name ? a.name === alert.name : true));
|
|
732
|
+
if (!existing) {
|
|
733
|
+
this.activeAlerts.push(alert);
|
|
734
|
+
}
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
// =============================================================================
|
|
738
|
+
// Factory Function
|
|
739
|
+
// =============================================================================
|
|
740
|
+
/**
 * Creates a ProductionMetrics instance after validating the configuration.
 *
 * @param {object} config - metrics configuration; must carry a truthy
 *   `serviceName`
 * @returns {ProductionMetrics} a new metrics collector built from `config`
 * @throws {Error} when `config` is missing (null/undefined) or its
 *   `serviceName` is empty or absent
 */
export function createProductionMetrics(config) {
    // Optional chaining makes a null/undefined config fail with the same
    // descriptive validation error instead of a raw TypeError on property access.
    if (!config?.serviceName) {
        throw new Error('ProductionMetrics requires a non-empty serviceName');
    }
    return new ProductionMetrics(config);
}
|
|
747
|
+
//# sourceMappingURL=production-metrics.js.map
|