@dotdo/postgres 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35):
  1. package/dist/backup/backup-manager.d.ts +244 -0
  2. package/dist/backup/backup-manager.d.ts.map +1 -0
  3. package/dist/backup/backup-manager.js +726 -0
  4. package/dist/backup/backup-manager.js.map +1 -0
  5. package/dist/observability/production-metrics.d.ts +318 -0
  6. package/dist/observability/production-metrics.d.ts.map +1 -0
  7. package/dist/observability/production-metrics.js +747 -0
  8. package/dist/observability/production-metrics.js.map +1 -0
  9. package/dist/pglite-assets/pglite.data +0 -0
  10. package/dist/pglite-assets/pglite.wasm +0 -0
  11. package/dist/pitr/pitr-manager.d.ts +240 -0
  12. package/dist/pitr/pitr-manager.d.ts.map +1 -0
  13. package/dist/pitr/pitr-manager.js +837 -0
  14. package/dist/pitr/pitr-manager.js.map +1 -0
  15. package/dist/streaming/cdc-iceberg-connector.d.ts +1 -1
  16. package/dist/streaming/cdc-iceberg-connector.js +1 -1
  17. package/dist/streaming/live-cdc-stream.d.ts +1 -1
  18. package/dist/streaming/live-cdc-stream.js +1 -1
  19. package/dist/worker/auth.d.ts.map +1 -1
  20. package/dist/worker/auth.js +16 -6
  21. package/dist/worker/auth.js.map +1 -1
  22. package/dist/worker/entry.d.ts.map +1 -1
  23. package/dist/worker/entry.js +108 -26
  24. package/dist/worker/entry.js.map +1 -1
  25. package/package.json +7 -6
  26. package/src/__tests__/backup.test.ts +944 -0
  27. package/src/__tests__/observability.test.ts +1089 -0
  28. package/src/__tests__/pitr.test.ts +1240 -0
  29. package/src/backup/backup-manager.ts +1006 -0
  30. package/src/observability/production-metrics.ts +1054 -0
  31. package/src/pglite-assets/pglite.data +0 -0
  32. package/src/pglite-assets/pglite.wasm +0 -0
  33. package/src/pitr/pitr-manager.ts +1136 -0
  34. package/src/worker/auth.ts +17 -6
  35. package/src/worker/entry.ts +112 -30
@@ -0,0 +1,747 @@
1
+ /**
2
+ * Production Observability Metrics for PostgreSQL Durable Objects
3
+ *
4
+ * Provides query metrics, connection stats, storage tier monitoring,
5
+ * health checks, Prometheus export, and alerting capabilities.
6
+ */
7
+ // =============================================================================
8
+ // Constants
9
+ // =============================================================================
10
/** Duration (ms) at or above which a query is logged as slow when no threshold is configured */
const DEFAULT_SLOW_QUERY_THRESHOLD_MS = 100;

/** Upper bound on the number of entries kept in the slow query log */
const MAX_SLOW_QUERY_LOG_SIZE = 100;

/** Upper bound on distinct query digest patterns when none is configured */
const DEFAULT_MAX_QUERY_DIGESTS = 1_000;

/** Error-rate alert threshold (percent) used when none is configured */
const DEFAULT_ERROR_RATE_THRESHOLD_PERCENT = 5;

/** P99 latency alert threshold (milliseconds) used when none is configured */
const DEFAULT_P99_LATENCY_THRESHOLD_MS = 1_000;

/** Number of durations kept when percentiles are computed from a reservoir sample */
const PERCENTILE_SAMPLE_SIZE = 1_000;

/** Query count above which percentiles are computed from a reservoir sample */
const RESERVOIR_SAMPLING_THRESHOLD = 10_000;

/** How many microtask yields the PGLite health check waits before reporting a timeout */
const HEALTH_CHECK_TIMEOUT_YIELD_COUNT = 20;

/** Default health check timeout (milliseconds) */
export const DEFAULT_HEALTH_CHECK_TIMEOUT_MS = 5_000;

/** Fixed stand-in for heap usage, in bytes (50 MiB); not a live measurement */
const SIMULATED_HEAP_USED_BYTES = 50 * 2 ** 20;

/** Cloudflare Workers memory limit in bytes (128 MiB) */
const WORKER_MEMORY_LIMIT_BYTES = 128 * 2 ** 20;

/** Once past the first N recorded queries, alerts are re-evaluated only on every N-th query */
const ALERT_EVALUATION_INTERVAL = 100;

/** Lengths of the rolling windows reported alongside the query metrics */
const ONE_MINUTE_MS = 60 * 1000;
const FIVE_MINUTES_MS = 5 * 60 * 1000;
const FIFTEEN_MINUTES_MS = 15 * 60 * 1000;

/** Simplified per-byte storage cost estimates for each tier */
const COST_PER_BYTE_HOT = 1e-6; // ~$1/MB (DO SQLite blob storage)
// NOTE(review): the previous comment labelled the warm tier (Cloudflare Cache)
// as "Free", yet the estimate is non-zero - confirm which one is intended.
const COST_PER_BYTE_WARM = 1e-7;
const COST_PER_BYTE_COLD = 1.5e-8; // R2 pricing

/** Histogram bucket upper bounds (seconds) used when none are configured */
const DEFAULT_HISTOGRAM_BOUNDARIES = [
    0.001,
    0.005,
    0.01,
    0.025,
    0.05,
    0.1,
    0.25,
    0.5,
    1.0,
];
44
+ // =============================================================================
45
+ // Utility Functions
46
+ // =============================================================================
47
/**
 * Classifies a SQL statement by the keyword it starts with.
 *
 * The statement is trimmed and upper-cased before matching, so leading
 * whitespace and letter case do not matter.
 *
 * @param {string} sql - SQL text to classify.
 * @returns {string} 'SELECT', 'INSERT', 'UPDATE' or 'DELETE' when the
 *   statement starts with that keyword, otherwise 'OTHER'.
 */
function extractOperation(sql) {
    const statement = sql.trim().toUpperCase();
    const knownOperations = ['SELECT', 'INSERT', 'UPDATE', 'DELETE'];
    const matched = knownOperations.find((keyword) => statement.startsWith(keyword));
    return matched ?? 'OTHER';
}
60
/**
 * Builds the digest form of a query by replacing every standalone run of
 * digits with the placeholder `$N`, so statements that differ only in those
 * numbers share one digest. Digits that are part of an identifier (for
 * example `col1`) are left alone because no word boundary precedes them.
 *
 * @param {string} sql - SQL text to normalize.
 * @returns {string} The SQL text with digit runs replaced by `$N`.
 */
function normalizeQuery(sql) {
    const standaloneDigits = /\b\d+\b/g;
    return sql.replace(standaloneDigits, () => '$N');
}
64
/**
 * Calculates the p-th percentile from a pre-sorted array using the
 * nearest-rank method.
 *
 * The nearest rank is `ceil(p / 100 * n)` and is 1-based, so the matching
 * array index is `rank - 1`. The rank is clamped to `[1, n]` so that p = 0
 * yields the smallest value and p = 100 the largest.
 *
 * @param {number[]} sorted - Values in ascending order.
 * @param {number} p - Percentile to compute, from 0 to 100.
 * @returns {number} The percentile value, or 0 for an empty array.
 */
function percentile(sorted, p) {
    const size = sorted.length;
    if (size === 0) {
        return 0;
    }
    const rank = Math.ceil((p / 100) * size);
    const clampedRank = Math.min(Math.max(rank, 1), size);
    return sorted[clampedRank - 1];
}
76
+ // =============================================================================
77
+ // ProductionMetrics Class
78
+ // =============================================================================
79
/**
 * Collects and exposes production metrics for PostgreSQL Durable Objects.
 *
 * Query, connection and storage-tier events are recorded in memory; health
 * checks, Prometheus/JSON exports, a dashboard snapshot and alerts are
 * derived from that recorded state.
 */
export class ProductionMetrics {
    /** Configuration object passed to the constructor */
    config;
    /** Epoch milliseconds captured at construction; basis for uptime and queries-per-second */
    startTime;
    // Query metrics state
    // NOTE(review): `queries` gains one entry per recordQuery() call and is only
    // emptied by resetQueryMetrics()/resetAll(), so it grows with query volume.
    queries = [];
    slowQueries = [];
    digestMap = new Map();
    totalErrors = 0;
    totalRowsReturned = 0;
    // Connection state
    connections = [];
    totalConnectionsOpened = 0;
    peakConnections = 0;
    connectionErrors = 0;
    connectionDurations = [];
    waitTimes = [];
    idleConnections = 0;
    // Storage state
    storageOps = [];
    tierPromotions = { coldToWarm: 0, warmToHot: 0 };
    tierDemotions = { hotToWarm: 0, warmToCold: 0 };
    tierErrors = { hot: 0, warm: 0, cold: 0 };
    tierUsage = { hot: 0, warm: 0, cold: 0 };
    tierHealth = { hot: 'healthy', warm: 'healthy', cold: 'healthy' };
    // Dependencies
    pgliteInstance = null;
    storageOrchestrator = null;
    // Alert state
    activeAlerts = [];
    customThresholds = [];

    /**
     * @param {object} config - Metrics configuration. Fields read by this class:
     *   serviceName, doId, serviceVersion, slowQueryThresholdMs,
     *   maxQueryDigests, histogramBoundaries and alertThresholds.
     */
    constructor(config) {
        this.config = config;
        this.startTime = Date.now();
    }

    // ===========================================================================
    // Query Metrics
    // ===========================================================================

    /**
     * Records a completed query, updating metrics, digests, slow query log, and alerts.
     *
     * @param {object} record - Query outcome with `sql`, `durationMs`, `success`
     *   and `rowsReturned`. A missing `rowsReturned` counts as 0 rows.
     */
    recordQuery(record) {
        this.queries.push({
            // Negative durations are clamped so they cannot skew averages.
            durationMs: Math.max(0, record.durationMs),
            success: record.success,
            timestamp: Date.now(),
            operation: extractOperation(record.sql),
            rowsReturned: record.rowsReturned,
        });
        // Guard against a missing row count turning the running total into NaN.
        this.totalRowsReturned += record.rowsReturned ?? 0;
        if (!record.success) {
            this.totalErrors++;
        }
        this.trackSlowQuery(record);
        this.updateQueryDigest(record);
        // Evaluate on every query for the first ALERT_EVALUATION_INTERVAL queries,
        // then only on every N-th query to limit overhead on busy workloads.
        const recorded = this.queries.length;
        if (recorded <= ALERT_EVALUATION_INTERVAL || recorded % ALERT_EVALUATION_INTERVAL === 0) {
            this.evaluateAlerts();
        }
    }

    /**
     * Adds a query to the slow query log if its duration is at or above the
     * configured threshold. The log keeps only the most recent
     * MAX_SLOW_QUERY_LOG_SIZE entries.
     *
     * @param {object} record - Query outcome (see recordQuery).
     */
    trackSlowQuery(record) {
        // `??` rather than `||` so an explicit threshold of 0 is honoured.
        const threshold = this.config.slowQueryThresholdMs ?? DEFAULT_SLOW_QUERY_THRESHOLD_MS;
        if (record.durationMs < threshold) {
            return;
        }
        this.slowQueries.push({
            sql: record.sql,
            durationMs: record.durationMs,
            rowsReturned: record.rowsReturned,
            timestamp: Date.now(),
        });
        if (this.slowQueries.length > MAX_SLOW_QUERY_LOG_SIZE) {
            this.slowQueries = this.slowQueries.slice(-MAX_SLOW_QUERY_LOG_SIZE);
        }
    }

    /**
     * Updates the digest entry for the query's normalized pattern, creating it
     * if the pattern is new and the digest limit has not been reached. New
     * patterns beyond the limit are dropped.
     *
     * @param {object} record - Query outcome (see recordQuery).
     */
    updateQueryDigest(record) {
        const pattern = normalizeQuery(record.sql);
        const existing = this.digestMap.get(pattern);
        if (existing) {
            existing.count++;
            existing.totalDurationMs += record.durationMs;
            existing.avgDurationMs = existing.totalDurationMs / existing.count;
            existing.lastSeen = Date.now();
            return;
        }
        // `??` rather than `||` so an explicit limit of 0 is honoured.
        const maxDigests = this.config.maxQueryDigests ?? DEFAULT_MAX_QUERY_DIGESTS;
        if (this.digestMap.size < maxDigests) {
            this.digestMap.set(pattern, {
                pattern,
                count: 1,
                avgDurationMs: record.durationMs,
                totalDurationMs: record.durationMs,
                lastSeen: Date.now(),
            });
        }
    }

    /**
     * Returns a comprehensive snapshot of query performance metrics.
     *
     * @returns {object} Totals, average and p50/p95/p99 durations, queries per
     *   second since construction, error rate, per-operation breakdown and
     *   1/5/15-minute window metrics.
     */
    getQueryMetrics() {
        const total = this.queries.length;
        const durations = this.getSortedDurationsSample(total);
        const totalDurationSum = total > 0 ? this.queries.reduce((s, q) => s + q.durationMs, 0) : 0;
        const avgDuration = total > 0 ? totalDurationSum / total : 0;
        const errors = this.queries.filter((q) => !q.success).length;
        const errorRate = total > 0 ? errors / total : 0;
        const now = Date.now();
        // Elapsed time is floored at one second so a fresh instance does not
        // report an inflated rate.
        const elapsedSeconds = Math.max(1, (now - this.startTime) / 1000);
        const queriesPerSecond = total / elapsedSeconds;
        const byOperation = this.computeOperationBreakdown();
        const windows = {
            oneMinute: this.getWindowMetrics(now - ONE_MINUTE_MS),
            fiveMinutes: this.getWindowMetrics(now - FIVE_MINUTES_MS),
            fifteenMinutes: this.getWindowMetrics(now - FIFTEEN_MINUTES_MS),
        };
        return {
            totalQueries: total,
            avgDurationMs: avgDuration,
            p50DurationMs: percentile(durations, 50),
            p95DurationMs: percentile(durations, 95),
            p99DurationMs: percentile(durations, 99),
            queriesPerSecond,
            errorRate,
            totalErrors: errors,
            totalRowsReturned: this.totalRowsReturned,
            byOperation,
            windows,
        };
    }

    /**
     * Returns query durations sorted ascending. Above
     * RESERVOIR_SAMPLING_THRESHOLD queries, a random reservoir sample of
     * PERCENTILE_SAMPLE_SIZE durations is returned instead of the full set,
     * so results are then approximate and vary between calls.
     *
     * @param {number} total - Number of leading entries of `queries` to consider.
     * @returns {number[]} Sorted durations in milliseconds.
     */
    getSortedDurationsSample(total) {
        if (total > RESERVOIR_SAMPLING_THRESHOLD) {
            const sample = [];
            for (let i = 0; i < total; i++) {
                if (i < PERCENTILE_SAMPLE_SIZE) {
                    sample.push(this.queries[i].durationMs);
                }
                else {
                    // Replace a random slot with probability SAMPLE_SIZE / (i + 1).
                    const j = Math.floor(Math.random() * (i + 1));
                    if (j < PERCENTILE_SAMPLE_SIZE) {
                        sample[j] = this.queries[i].durationMs;
                    }
                }
            }
            return sample.sort((a, b) => a - b);
        }
        return this.queries.map((q) => q.durationMs).sort((a, b) => a - b);
    }

    /**
     * Computes per-operation count and average duration in a single pass over
     * the recorded queries.
     *
     * @returns {Object<string, {count: number, avgDurationMs: number}>}
     */
    computeOperationBreakdown() {
        const totals = new Map();
        for (const q of this.queries) {
            let bucket = totals.get(q.operation);
            if (!bucket) {
                bucket = { count: 0, sumMs: 0 };
                totals.set(q.operation, bucket);
            }
            bucket.count++;
            bucket.sumMs += q.durationMs;
        }
        const byOperation = {};
        for (const [operation, { count, sumMs }] of totals) {
            byOperation[operation] = { count, avgDurationMs: sumMs / count };
        }
        return byOperation;
    }

    /** Returns a copy of the slow query log */
    getSlowQueryLog() {
        return [...this.slowQueries];
    }

    /** Returns all tracked query digest patterns */
    getQueryDigests() {
        return Array.from(this.digestMap.values());
    }

    /** Resets all query-related metrics, including digests and slow query log */
    resetQueryMetrics() {
        this.queries = [];
        this.slowQueries = [];
        this.digestMap.clear();
        this.totalErrors = 0;
        this.totalRowsReturned = 0;
    }

    // ===========================================================================
    // Connection Stats
    // ===========================================================================

    /**
     * Records a new connection being opened and updates the peak count.
     *
     * @param {{type?: string}} [options] - Connection type; 'unknown' when omitted.
     */
    recordConnectionOpen(options) {
        this.connections.push({
            type: options?.type || 'unknown',
            openedAt: Date.now(),
            idle: false,
        });
        this.totalConnectionsOpened++;
        if (this.connections.length > this.peakConnections) {
            this.peakConnections = this.connections.length;
        }
    }

    /**
     * Records a connection being closed, optionally with its total duration.
     * The most recently opened connection is the one removed. Does nothing
     * when no connection is open.
     *
     * @param {{durationMs?: number}} [options]
     */
    recordConnectionClose(options) {
        if (this.connections.length === 0) {
            return;
        }
        const conn = this.connections.pop();
        // NOTE(review): nothing in this class sets `idle` to true after a
        // connection is opened, so this branch only fires if a caller flips the
        // flag on an entry of `connections` directly.
        if (conn.idle) {
            this.idleConnections = Math.max(0, this.idleConnections - 1);
        }
        if (options?.durationMs !== undefined) {
            this.connectionDurations.push(options.durationMs);
        }
    }

    /** Records a connection error (the message itself is not stored) */
    recordConnectionError(_message) {
        this.connectionErrors++;
    }

    /** Records a connection transitioning to idle state */
    recordConnectionIdle() {
        this.idleConnections++;
    }

    /**
     * Records a connection being acquired from the pool, with wait time.
     *
     * @param {{waitTimeMs: number}} options
     */
    recordConnectionAcquired(options) {
        this.waitTimes.push(options.waitTimeMs);
    }

    /** Returns a snapshot of connection statistics */
    getConnectionStats() {
        const wsCount = this.connections.filter((c) => c.type === 'websocket').length;
        const httpCount = this.connections.filter((c) => c.type === 'http').length;
        const avgDuration = this.connectionDurations.length > 0
            ? this.connectionDurations.reduce((s, d) => s + d, 0) / this.connectionDurations.length
            : 0;
        const avgWait = this.waitTimes.length > 0
            ? this.waitTimes.reduce((s, w) => s + w, 0) / this.waitTimes.length
            : 0;
        return {
            activeConnections: this.connections.length,
            totalConnectionsOpened: this.totalConnectionsOpened,
            peakConnections: this.peakConnections,
            connectionErrors: this.connectionErrors,
            avgConnectionDurationMs: avgDuration,
            idleConnections: this.idleConnections,
            poolUtilization: Math.min(1.0, this.connections.length), // max_connections=1 in DO model
            websocketConnections: wsCount,
            httpConnections: httpCount,
            avgWaitTimeMs: avgWait,
            uptimeMs: Date.now() - this.startTime,
        };
    }

    // ===========================================================================
    // Storage Tier Stats
    // ===========================================================================

    /**
     * Records a storage tier operation (read/write) with optional hit/miss and timing.
     *
     * @param {string} tier - 'hot', 'warm' or 'cold'.
     * @param {string} operation - 'read' or 'write'.
     * @param {{hit?: boolean, bytes?: number, durationMs?: number}} [options] -
     *   Operation details. A missing `bytes` is stored as 0 so byte totals
     *   stay numeric.
     */
    recordStorageOperation(tier, operation, options = {}) {
        this.storageOps.push({
            tier,
            operation,
            hit: options.hit,
            bytes: options.bytes ?? 0,
            durationMs: options.durationMs,
            timestamp: Date.now(),
        });
    }

    /** Records a data promotion between storage tiers (cold->warm or warm->hot; other pairs are ignored) */
    recordTierPromotion(from, to, _details) {
        if (from === 'cold' && to === 'warm') {
            this.tierPromotions.coldToWarm++;
        }
        if (from === 'warm' && to === 'hot') {
            this.tierPromotions.warmToHot++;
        }
    }

    /** Records a data demotion between storage tiers (hot->warm or warm->cold; other pairs are ignored) */
    recordTierDemotion(from, to, _details) {
        if (from === 'hot' && to === 'warm') {
            this.tierDemotions.hotToWarm++;
        }
        if (from === 'warm' && to === 'cold') {
            this.tierDemotions.warmToCold++;
        }
    }

    /**
     * Records a storage error and raises a critical `storage_error` alert once
     * the total error count across all tiers exceeds
     * `config.alertThresholds.storageErrorRate` (when that threshold is set).
     */
    recordStorageError(tier, _message) {
        this.tierErrors[tier] = (this.tierErrors[tier] || 0) + 1;
        const totalStorageErrors = Object.values(this.tierErrors).reduce((s, e) => s + e, 0);
        const threshold = this.config.alertThresholds?.storageErrorRate;
        if (threshold !== undefined && totalStorageErrors > threshold) {
            this.addAlert({
                type: 'storage_error',
                severity: 'critical',
                message: `Storage error rate exceeded threshold: ${totalStorageErrors} errors`,
                triggeredAt: Date.now(),
                value: totalStorageErrors,
            });
        }
    }

    /** Records the current storage usage for a tier (overwrites the previous value) */
    recordStorageUsage(tier, bytes) {
        this.tierUsage[tier] = bytes;
    }

    /** Records a change in tier health status */
    recordTierHealthChange(tier, status) {
        this.tierHealth[tier] = status;
    }

    /**
     * Returns a comprehensive snapshot of storage tier statistics including costs.
     *
     * @returns {object} Per-tier stats (`hot`, `warm`, `cold`), promotion and
     *   demotion counters, estimated costs derived from recorded usage, and
     *   `tieringEfficiency` (the hot tier's read hit rate).
     */
    getStorageTierStats() {
        const sumBytes = (ops) => ops.reduce((s, o) => s + (o.bytes ?? 0), 0);
        const getTierStats = (tier) => {
            const ops = this.storageOps.filter((o) => o.tier === tier);
            const reads = ops.filter((o) => o.operation === 'read');
            const writes = ops.filter((o) => o.operation === 'write');
            const hits = reads.filter((o) => o.hit === true).length;
            // Only operations that reported a duration contribute to latency.
            const timed = ops.filter((o) => o.durationMs !== undefined);
            const avgLatencyMs = timed.length > 0
                ? timed.reduce((s, o) => s + (o.durationMs || 0), 0) / timed.length
                : 0;
            return {
                hitRate: reads.length > 0 ? hits / reads.length : 0,
                bytesRead: sumBytes(reads),
                bytesWritten: sumBytes(writes),
                totalOperations: ops.length,
                avgLatencyMs,
                errors: this.tierErrors[tier] || 0,
                usageBytes: this.tierUsage[tier] || 0,
                healthStatus: (this.tierHealth[tier] || 'healthy'),
            };
        };
        const hotStats = getTierStats('hot');
        const warmStats = getTierStats('warm');
        const coldStats = getTierStats('cold');
        // Estimate costs based on bytes stored
        const hotCost = (this.tierUsage['hot'] || 0) * COST_PER_BYTE_HOT;
        const warmCost = (this.tierUsage['warm'] || 0) * COST_PER_BYTE_WARM;
        const coldCost = (this.tierUsage['cold'] || 0) * COST_PER_BYTE_COLD;
        return {
            hot: hotStats,
            warm: warmStats,
            cold: coldStats,
            promotions: { ...this.tierPromotions },
            demotions: { ...this.tierDemotions },
            estimatedCosts: {
                hot: hotCost,
                warm: warmCost,
                cold: coldCost,
                total: hotCost + warmCost + coldCost,
            },
            // Tiering efficiency is defined as the hot tier's read hit rate.
            tieringEfficiency: hotStats.hitRate,
        };
    }

    // ===========================================================================
    // Health Checks
    // ===========================================================================

    /** Sets the PGLite instance for health check queries */
    setPGLiteInstance(pglite) {
        this.pgliteInstance = pglite;
    }

    /** Sets the storage orchestrator for health check tier status */
    setStorageOrchestrator(orchestrator) {
        this.storageOrchestrator = orchestrator;
    }

    /** Returns a simple liveness probe result (always healthy if the service is running) */
    liveness() {
        return {
            status: 'healthy',
            service: this.config.serviceName,
            uptimeMs: Date.now() - this.startTime,
        };
    }

    /**
     * Performs a readiness check including PGLite, storage, and memory health.
     * The memory check always reports 'healthy' with fixed placeholder figures.
     *
     * @param {object} [_options] - Currently unused.
     * @returns {Promise<{status: string, checks: object, responseTimeMs: number}>}
     */
    async readiness(_options) {
        const startTime = Date.now();
        const checks = {};
        checks.pglite = await this.checkPGLiteHealth();
        checks.storage = this.checkStorageHealth();
        checks.memory = {
            status: 'healthy',
            details: {
                heapUsedBytes: SIMULATED_HEAP_USED_BYTES,
                heapTotalBytes: WORKER_MEMORY_LIMIT_BYTES,
            },
        };
        const overallStatus = this.determineOverallStatus(checks);
        return {
            status: overallStatus,
            checks,
            responseTimeMs: Date.now() - startTime,
        };
    }

    /**
     * Checks PGLite health by running `SELECT 1 as result`.
     *
     * The "timeout" is a fixed number of microtask yields
     * (HEALTH_CHECK_TIMEOUT_YIELD_COUNT), not wall-clock time: a query that has
     * not settled after that many yields is reported as timed out.
     * NOTE(review): DEFAULT_HEALTH_CHECK_TIMEOUT_MS is not consulted here.
     *
     * @returns {Promise<{status: string, error?: string}>}
     */
    async checkPGLiteHealth() {
        if (!this.pgliteInstance) {
            return { status: 'unhealthy', error: 'PGLite not initialized' };
        }
        try {
            const queryPromise = this.pgliteInstance.query('SELECT 1 as result');
            let settled = false;
            const wrappedQuery = queryPromise.then((v) => { settled = true; return v; }, (e) => { settled = true; throw e; });
            // This promise only ever rejects; when the query settles first it is
            // deliberately left pending so it cannot influence the race.
            const timeoutCheck = new Promise(async (_, reject) => {
                for (let i = 0; i < HEALTH_CHECK_TIMEOUT_YIELD_COUNT; i++) {
                    await Promise.resolve();
                    if (settled)
                        return;
                }
                if (!settled) {
                    reject(new Error('Health check timeout'));
                }
            });
            await Promise.race([wrappedQuery, timeoutCheck]);
            return { status: 'healthy' };
        }
        catch (e) {
            const errorMsg = e instanceof Error ? e.message : 'Unknown health check error';
            return {
                status: 'unhealthy',
                // Any message mentioning "timeout" is normalized to one fixed string.
                error: errorMsg.includes('timeout') ? 'Health check timeout' : errorMsg,
            };
        }
    }

    /**
     * Checks storage orchestrator health.
     *
     * Reports the worst tier status, using the same precedence as
     * determineOverallStatus: any tier that is neither 'healthy' nor
     * 'degraded' makes the result 'unhealthy'; otherwise any 'degraded' tier
     * makes it 'degraded'. Without an orchestrator the check is 'healthy'.
     *
     * @returns {{status: string, details: object}}
     */
    checkStorageHealth() {
        if (!this.storageOrchestrator) {
            return { status: 'healthy', details: { message: 'No orchestrator configured' } };
        }
        const tierHealth = this.storageOrchestrator.getTierHealth();
        const statuses = Object.values(tierHealth).map((t) => t.status);
        let status = 'healthy';
        if (statuses.some((s) => s !== 'healthy' && s !== 'degraded')) {
            status = 'unhealthy';
        }
        else if (statuses.includes('degraded')) {
            status = 'degraded';
        }
        return { status, details: tierHealth };
    }

    /** Determines the worst overall status from all component checks */
    determineOverallStatus(checks) {
        const statuses = Object.values(checks).map((c) => c.status);
        if (statuses.includes('unhealthy')) {
            return 'unhealthy';
        }
        if (statuses.includes('degraded')) {
            return 'degraded';
        }
        return 'healthy';
    }

    /**
     * Performs a deep health check: the readiness result plus a `wal` check.
     * The WAL check is a fixed placeholder ('healthy', lastArchive 'N/A').
     */
    async deepCheck() {
        const startTime = Date.now();
        const readinessResult = await this.readiness();
        readinessResult.checks.wal = {
            status: 'healthy',
            details: { lastArchive: 'N/A' },
        };
        readinessResult.responseTimeMs = Date.now() - startTime;
        return readinessResult;
    }

    // ===========================================================================
    // Metrics Export
    // ===========================================================================

    /**
     * Exports query, connection and storage-operation metrics in Prometheus
     * text exposition format.
     *
     * @returns {string} Metric lines joined with '\n'.
     */
    exportPrometheus() {
        // Label values must have backslash, double quote and line feed escaped,
        // otherwise a service name containing them corrupts the whole scrape.
        const escapeLabelValue = (value) => String(value)
            .replace(/\\/g, '\\\\')
            .replace(/"/g, '\\"')
            .replace(/\n/g, '\\n');
        const labels = `service="${escapeLabelValue(this.config.serviceName)}",do_id="${escapeLabelValue(this.config.doId)}"`;
        const queryMetrics = this.getQueryMetrics();
        const lines = [];
        // Query total counter
        lines.push('# HELP postgres_query_total Total number of queries executed');
        lines.push('# TYPE postgres_query_total counter');
        lines.push(`postgres_query_total{${labels}} ${queryMetrics.totalQueries}`);
        // Query errors
        lines.push('# HELP postgres_query_errors_total Total number of query errors');
        lines.push('# TYPE postgres_query_errors_total counter');
        lines.push(`postgres_query_errors_total{${labels}} ${queryMetrics.totalErrors}`);
        // Query duration histogram (buckets are cumulative, durations in seconds)
        lines.push('# HELP postgres_query_duration_seconds Query execution time in seconds');
        lines.push('# TYPE postgres_query_duration_seconds histogram');
        const boundaries = this.config.histogramBoundaries || DEFAULT_HISTOGRAM_BOUNDARIES;
        const durations = this.queries.map((q) => q.durationMs / 1000).sort((a, b) => a - b);
        for (const boundary of boundaries) {
            const count = durations.filter((d) => d <= boundary).length;
            lines.push(`postgres_query_duration_seconds_bucket{${labels},le="${boundary}"} ${count}`);
        }
        lines.push(`postgres_query_duration_seconds_bucket{${labels},le="+Inf"} ${durations.length}`);
        const sum = durations.reduce((s, d) => s + d, 0);
        lines.push(`postgres_query_duration_seconds_count{${labels}} ${durations.length}`);
        lines.push(`postgres_query_duration_seconds_sum{${labels}} ${sum}`);
        // Connections
        const connStats = this.getConnectionStats();
        lines.push('# HELP postgres_connections_active Current active connections');
        lines.push('# TYPE postgres_connections_active gauge');
        lines.push(`postgres_connections_active{${labels}} ${connStats.activeConnections}`);
        // Storage operations
        lines.push('# HELP postgres_storage_operations_total Total storage operations');
        lines.push('# TYPE postgres_storage_operations_total counter');
        for (const tier of ['hot', 'warm', 'cold']) {
            const count = this.storageOps.filter((o) => o.tier === tier).length;
            lines.push(`postgres_storage_operations_total{${labels},tier="${tier}"} ${count}`);
        }
        return lines.join('\n');
    }

    /** Exports all metrics as a structured JSON object */
    exportJSON() {
        return {
            metrics: {
                queries: this.getQueryMetrics(),
                connections: this.getConnectionStats(),
                storage: this.getStorageTierStats(),
            },
            timestamp: Date.now(),
            service: this.config.serviceName,
        };
    }

    /**
     * Creates an HTTP request handler that serves metrics in Prometheus or JSON format.
     * JSON is served when the request's Accept header contains
     * 'application/json'; otherwise the Prometheus text format is served.
     *
     * @returns {(request: Request) => Promise<Response>}
     */
    createMetricsHandler() {
        return async (request) => {
            const accept = request.headers.get('Accept') || 'text/plain';
            if (accept.includes('application/json')) {
                return new Response(JSON.stringify(this.exportJSON()), {
                    status: 200,
                    headers: { 'content-type': 'application/json' },
                });
            }
            // Default: Prometheus format
            return new Response(this.exportPrometheus(), {
                status: 200,
                headers: { 'content-type': 'text/plain; charset=utf-8' },
            });
        };
    }

    // ===========================================================================
    // Dashboard
    // ===========================================================================

    /**
     * Returns a complete metrics dashboard snapshot for display.
     * The `health` section is a fixed 'healthy' placeholder with no checks;
     * use readiness() or deepCheck() for real health results.
     */
    getDashboard() {
        return {
            queries: this.getQueryMetrics(),
            connections: this.getConnectionStats(),
            storage: this.getStorageTierStats(),
            health: {
                status: 'healthy',
                checks: {},
            },
            service: {
                name: this.config.serviceName,
                doId: this.config.doId,
                uptimeMs: Date.now() - this.startTime,
                version: this.config.serviceVersion,
            },
            memory: {
                heapUsedBytes: SIMULATED_HEAP_USED_BYTES,
                heapTotalBytes: WORKER_MEMORY_LIMIT_BYTES,
            },
            alerts: this.getActiveAlerts(),
            timestamp: Date.now(),
        };
    }

    // ===========================================================================
    // Alerts
    // ===========================================================================

    /**
     * Evaluates all alert conditions against current metrics, adding alerts
     * whose condition holds and removing those whose condition no longer
     * holds. Covers the error-rate alert, the P99 latency alert and any
     * registered custom `slow_query_count` thresholds.
     */
    evaluateAlerts() {
        const queryMetrics = this.getQueryMetrics();
        const errorThreshold = this.config.alertThresholds?.errorRatePercent ?? DEFAULT_ERROR_RATE_THRESHOLD_PERCENT;
        const errorRatePercent = queryMetrics.errorRate * 100;
        if (errorRatePercent > errorThreshold) {
            this.addAlert({
                type: 'error_rate',
                severity: 'critical',
                message: `Error rate ${errorRatePercent.toFixed(1)}% exceeds threshold ${errorThreshold}%`,
                triggeredAt: Date.now(),
                value: errorRatePercent,
            });
        }
        else {
            // Resolve error rate alerts
            this.activeAlerts = this.activeAlerts.filter((a) => a.type !== 'error_rate');
        }
        const p99Threshold = this.config.alertThresholds?.p99LatencyMs ?? DEFAULT_P99_LATENCY_THRESHOLD_MS;
        if (queryMetrics.p99DurationMs > p99Threshold) {
            this.addAlert({
                type: 'high_latency',
                severity: 'warning',
                message: `P99 latency ${queryMetrics.p99DurationMs}ms exceeds threshold ${p99Threshold}ms`,
                triggeredAt: Date.now(),
                value: queryMetrics.p99DurationMs,
            });
        }
        else {
            this.activeAlerts = this.activeAlerts.filter((a) => a.type !== 'high_latency');
        }
        // Custom thresholds; only the 'slow_query_count' type is evaluated.
        for (const threshold of this.customThresholds) {
            if (threshold.type !== 'slow_query_count') {
                continue;
            }
            const slowCount = this.slowQueries.length;
            if (slowCount > threshold.threshold) {
                this.addAlert({
                    name: threshold.name,
                    type: threshold.type,
                    severity: threshold.severity,
                    message: `Slow query count ${slowCount} exceeds threshold ${threshold.threshold}`,
                    triggeredAt: Date.now(),
                    value: slowCount,
                });
            }
            else {
                // Resolve this threshold's alert once the condition clears, the
                // same way the built-in alerts above are resolved.
                this.activeAlerts = this.activeAlerts.filter((a) => !(a.type === threshold.type && a.name === threshold.name));
            }
        }
    }

    /** Returns a copy of all currently active alerts */
    getActiveAlerts() {
        return [...this.activeAlerts];
    }

    /**
     * Registers a custom alert threshold for evaluation.
     *
     * @param {{name?: string, type: string, severity: string, threshold: number}} threshold
     */
    registerAlertThreshold(threshold) {
        this.customThresholds.push(threshold);
    }

    // ===========================================================================
    // Reset
    // ===========================================================================

    /**
     * Resets all metrics state (queries, connections, storage, alerts).
     * Tier health is returned to 'healthy' along with the other tier state.
     * Registered custom thresholds, injected dependencies and `startTime`
     * are left untouched.
     */
    resetAll() {
        this.resetQueryMetrics();
        this.connections = [];
        this.totalConnectionsOpened = 0;
        this.peakConnections = 0;
        this.connectionErrors = 0;
        this.connectionDurations = [];
        this.waitTimes = [];
        this.idleConnections = 0;
        this.storageOps = [];
        this.tierPromotions = { coldToWarm: 0, warmToHot: 0 };
        this.tierDemotions = { hotToWarm: 0, warmToCold: 0 };
        this.tierErrors = { hot: 0, warm: 0, cold: 0 };
        this.tierUsage = { hot: 0, warm: 0, cold: 0 };
        this.tierHealth = { hot: 'healthy', warm: 'healthy', cold: 'healthy' };
        this.activeAlerts = [];
    }

    // ===========================================================================
    // Private Helpers
    // ===========================================================================

    /**
     * Summarizes the queries recorded at or after the given timestamp.
     *
     * @param {number} sinceTimestamp - Epoch milliseconds marking the window start.
     * @returns {{totalQueries: number, avgDurationMs: number, errorRate: number}}
     */
    getWindowMetrics(sinceTimestamp) {
        const windowQueries = this.queries.filter((q) => q.timestamp >= sinceTimestamp);
        const total = windowQueries.length;
        const errors = windowQueries.filter((q) => !q.success).length;
        const avgDuration = total > 0
            ? windowQueries.reduce((s, q) => s + q.durationMs, 0) / total
            : 0;
        return {
            totalQueries: total,
            avgDurationMs: avgDuration,
            errorRate: total > 0 ? errors / total : 0,
        };
    }

    /**
     * Adds an alert unless an equivalent one is already active. Alerts are
     * considered equivalent when their type matches and, if the new alert has
     * a name, the name matches too.
     */
    addAlert(alert) {
        const isDuplicate = this.activeAlerts.some((a) => a.type === alert.type && (alert.name ? a.name === alert.name : true));
        if (!isDuplicate) {
            this.activeAlerts.push(alert);
        }
    }
}
737
+ // =============================================================================
738
+ // Factory Function
739
+ // =============================================================================
740
/**
 * Creates a ProductionMetrics instance, validating required configuration.
 *
 * @param {object} config - Metrics configuration; must carry a non-empty
 *   `serviceName`.
 * @returns {ProductionMetrics} A new metrics collector.
 * @throws {Error} When `config` is missing (null/undefined) or its
 *   `serviceName` is empty or absent.
 */
export function createProductionMetrics(config) {
    // Optional chaining so a missing config produces the same descriptive
    // error as a missing name, rather than an opaque TypeError.
    if (!config?.serviceName) {
        throw new Error('ProductionMetrics requires a non-empty serviceName');
    }
    return new ProductionMetrics(config);
}
747
+ //# sourceMappingURL=production-metrics.js.map