@dotdo/postgres 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1054 @@
1
+ /**
2
+ * Production Observability Metrics for PostgreSQL Durable Objects
3
+ *
4
+ * Provides query metrics, connection stats, storage tier monitoring,
5
+ * health checks, Prometheus export, and alerting capabilities.
6
+ */
7
+
8
+ // =============================================================================
9
+ // Constants
10
+ // =============================================================================
11
+
12
+ /** Default threshold for classifying queries as "slow" (milliseconds) */
13
+ const DEFAULT_SLOW_QUERY_THRESHOLD_MS = 100
14
+
15
+ /** Maximum number of slow queries to retain in the log */
16
+ const MAX_SLOW_QUERY_LOG_SIZE = 100
17
+
18
+ /** Default maximum number of query digest patterns to track */
19
+ const DEFAULT_MAX_QUERY_DIGESTS = 1000
20
+
21
+ /** Default error rate alert threshold (percent) */
22
+ const DEFAULT_ERROR_RATE_THRESHOLD_PERCENT = 5
23
+
24
+ /** Default P99 latency alert threshold (milliseconds) */
25
+ const DEFAULT_P99_LATENCY_THRESHOLD_MS = 1000
26
+
27
+ /** Reservoir sample size for large dataset percentile calculations */
28
+ const PERCENTILE_SAMPLE_SIZE = 1000
29
+
30
+ /** Threshold for switching to reservoir sampling */
31
+ const RESERVOIR_SAMPLING_THRESHOLD = 10000
32
+
33
+ /** Number of microtask yields for the health check timeout */
34
+ const HEALTH_CHECK_TIMEOUT_YIELD_COUNT = 20
35
+
36
+ /** Default health check timeout (milliseconds) */
37
+ export const DEFAULT_HEALTH_CHECK_TIMEOUT_MS = 5000
38
+
39
+ /** Simulated heap usage in bytes (Workers environment approximation) */
40
+ const SIMULATED_HEAP_USED_BYTES = 50 * 1024 * 1024
41
+
42
+ /** Cloudflare Workers memory limit in bytes */
43
+ const WORKER_MEMORY_LIMIT_BYTES = 128 * 1024 * 1024
44
+
45
+ /** Alert evaluation frequency - evaluate every N queries to avoid performance issues */
46
+ const ALERT_EVALUATION_INTERVAL = 100
47
+
48
+ /** Time window durations for metrics windowing */
49
+ const ONE_MINUTE_MS = 60_000
50
+ const FIVE_MINUTES_MS = 300_000
51
+ const FIFTEEN_MINUTES_MS = 900_000
52
+
53
+ /** Storage cost estimates per byte (simplified) */
54
+ const COST_PER_BYTE_HOT = 0.000001 // ~$1/MB (DO SQLite blob storage)
55
+ const COST_PER_BYTE_WARM = 0.0000001 // Nominal (Cloudflare Cache reads are effectively free)
56
+ const COST_PER_BYTE_COLD = 0.000000015 // R2-backed cold tier (simplified estimate)
57
+
58
+ /** Default Prometheus histogram boundaries (seconds) */
59
+ const DEFAULT_HISTOGRAM_BOUNDARIES = [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]
60
+
61
+ // =============================================================================
62
+ // Types
63
+ // =============================================================================
64
+
65
+ /** Configuration for the ProductionMetrics collector */
66
+ export interface ProductionMetricsConfig {
67
+ /** Service name used in metric labels and exports */
68
+ serviceName: string
69
+ /** Service version for dashboard display */
70
+ serviceVersion?: string
71
+ /** Durable Object identifier */
72
+ doId: string
73
+ /** Deployment environment (e.g., 'production', 'staging') */
74
+ environment?: string
75
+ /** Collection interval for periodic metrics gathering (milliseconds) */
76
+ collectIntervalMs?: number
77
+ /** Custom histogram bucket boundaries for query duration (seconds) */
78
+ histogramBoundaries?: number[]
79
+ /** Threshold for classifying queries as slow (milliseconds) */
80
+ slowQueryThresholdMs?: number
81
+ /** Maximum number of unique query digest patterns to track */
82
+ maxQueryDigests?: number
83
+ /** Whether to collect detailed per-tier storage metrics */
84
+ enableDetailedStorageMetrics?: boolean
85
+ /** Alert threshold configuration */
86
+ alertThresholds?: AlertThresholdConfig
87
+ }
88
+
89
+ interface AlertThresholdConfig {
90
+ errorRatePercent?: number
91
+ p99LatencyMs?: number
92
+ memoryUsagePercent?: number
93
+ storageErrorRate?: number
94
+ }
95
+
96
+ export interface QueryRecord {
97
+ sql: string
98
+ durationMs: number
99
+ rowsReturned: number
100
+ success: boolean
101
+ error?: string
102
+ }
103
+
104
+ export interface QueryMetricsSnapshot {
105
+ totalQueries: number
106
+ avgDurationMs: number
107
+ p50DurationMs: number
108
+ p95DurationMs: number
109
+ p99DurationMs: number
110
+ queriesPerSecond: number
111
+ errorRate: number
112
+ totalErrors: number
113
+ totalRowsReturned: number
114
+ byOperation: Record<string, { count: number; avgDurationMs: number }>
115
+ windows: {
116
+ oneMinute: WindowMetrics
117
+ fiveMinutes: WindowMetrics
118
+ fifteenMinutes: WindowMetrics
119
+ }
120
+ }
121
+
122
+ interface WindowMetrics {
123
+ totalQueries: number
124
+ avgDurationMs: number
125
+ errorRate: number
126
+ }
127
+
128
+ export interface ConnectionStats {
129
+ activeConnections: number
130
+ totalConnectionsOpened: number
131
+ peakConnections: number
132
+ connectionErrors: number
133
+ avgConnectionDurationMs: number
134
+ idleConnections: number
135
+ poolUtilization: number
136
+ websocketConnections: number
137
+ httpConnections: number
138
+ avgWaitTimeMs: number
139
+ uptimeMs: number
140
+ }
141
+
142
+ export interface StorageTierSnapshot {
143
+ hot: TierStats
144
+ warm: TierStats
145
+ cold: TierStats
146
+ promotions: { coldToWarm: number; warmToHot: number }
147
+ demotions: { hotToWarm: number; warmToCold: number }
148
+ estimatedCosts: { hot: number; warm: number; cold: number; total: number }
149
+ tieringEfficiency: number
150
+ }
151
+
152
+ interface TierStats {
153
+ hitRate: number
154
+ bytesRead: number
155
+ bytesWritten: number
156
+ totalOperations: number
157
+ avgLatencyMs: number
158
+ errors: number
159
+ usageBytes: number
160
+ healthStatus: 'healthy' | 'degraded' | 'unhealthy'
161
+ }
162
+
163
+ export interface HealthCheckResult {
164
+ status: 'healthy' | 'degraded' | 'unhealthy'
165
+ service?: string
166
+ uptimeMs?: number
167
+ checks: Record<string, ComponentHealth>
168
+ responseTimeMs?: number
169
+ }
170
+
171
+ interface ComponentHealth {
172
+ status: 'healthy' | 'degraded' | 'unhealthy'
173
+ error?: string
174
+ details?: any
175
+ }
176
+
177
+ export interface MetricsDashboard {
178
+ queries: QueryMetricsSnapshot
179
+ connections: ConnectionStats
180
+ storage: StorageTierSnapshot
181
+ health: HealthCheckResult
182
+ service: { name: string; doId: string; uptimeMs: number; version?: string }
183
+ memory: { heapUsedBytes: number; heapTotalBytes: number }
184
+ alerts: Alert[]
185
+ timestamp: number
186
+ }
187
+
188
+ export interface AlertThreshold {
189
+ name: string
190
+ type: string
191
+ threshold: number
192
+ windowMs?: number
193
+ severity: 'warning' | 'critical'
194
+ }
195
+
196
+ interface Alert {
197
+ name?: string
198
+ type: string
199
+ severity: 'warning' | 'critical'
200
+ message: string
201
+ triggeredAt: number
202
+ value?: number
203
+ }
204
+
205
+ export type MetricsExportFormat = 'prometheus' | 'json'
206
+
207
+ export interface PrometheusMetric {
208
+ name: string
209
+ help: string
210
+ type: 'counter' | 'gauge' | 'histogram'
211
+ value?: number
212
+ labels?: Record<string, string>
213
+ }
214
+
215
+ export interface QueryDigest {
216
+ pattern: string
217
+ count: number
218
+ avgDurationMs: number
219
+ totalDurationMs: number
220
+ lastSeen: number
221
+ }
222
+
223
+ export interface SlowQueryLog {
224
+ sql: string
225
+ durationMs: number
226
+ rowsReturned: number
227
+ timestamp: number
228
+ }
229
+
230
+ // =============================================================================
231
+ // Internal State Types
232
+ // =============================================================================
233
+
234
+ interface QueryEntry {
235
+ durationMs: number
236
+ success: boolean
237
+ timestamp: number
238
+ operation: string
239
+ rowsReturned: number
240
+ }
241
+
242
+ interface ConnectionEntry {
243
+ type: 'websocket' | 'http' | 'unknown'
244
+ openedAt: number
245
+ idle: boolean
246
+ }
247
+
248
+ interface StorageOp {
249
+ tier: 'hot' | 'warm' | 'cold'
250
+ operation: 'read' | 'write'
251
+ hit?: boolean
252
+ bytes: number
253
+ durationMs?: number
254
+ timestamp: number
255
+ }
256
+
257
+ // =============================================================================
258
+ // Utility Functions
259
+ // =============================================================================
260
+
261
+ /** Extracts the SQL operation type (SELECT, INSERT, UPDATE, DELETE, OTHER) from a query string */
262
+ function extractOperation(sql: string): string {
263
+ const trimmed = sql.trim().toUpperCase()
264
+ if (trimmed.startsWith('SELECT')) return 'SELECT'
265
+ if (trimmed.startsWith('INSERT')) return 'INSERT'
266
+ if (trimmed.startsWith('UPDATE')) return 'UPDATE'
267
+ if (trimmed.startsWith('DELETE')) return 'DELETE'
268
+ return 'OTHER'
269
+ }
270
+
271
+ /** Normalizes a query by replacing numeric literals with placeholders for digest grouping */
272
+ function normalizeQuery(sql: string): string {
273
+ return sql.replace(/\b\d+\b/g, '$N')
274
+ }
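A quick illustration of the digest grouping this enables; the SQL strings are hypothetical:

// Both hypothetical queries collapse to the same digest pattern:
normalizeQuery('SELECT * FROM users WHERE id = 42')  // -> 'SELECT * FROM users WHERE id = $N'
normalizeQuery('SELECT * FROM users WHERE id = 7')   // -> 'SELECT * FROM users WHERE id = $N'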
275
+
276
+ /**
277
+ * Calculates the p-th percentile from a pre-sorted array using the nearest-rank method.
278
+ * Returns 0 for empty arrays.
279
+ */
280
+ function percentile(sorted: number[], p: number): number {
281
+ if (sorted.length === 0) return 0
282
+ if (sorted.length === 1) return sorted[0]
283
+ const rank = Math.max(1, Math.ceil((p / 100) * sorted.length))
284
+ return sorted[Math.min(rank, sorted.length) - 1]
285
+ }
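A worked example of the nearest-rank calculation described above, using made-up durations:

// sorted durations (ms): [2, 4, 7, 9, 12]  (N = 5)
// p = 50: rank = ceil(0.50 * 5) = 3  -> sorted[2] = 7
// p = 95: rank = ceil(0.95 * 5) = 5  -> sorted[4] = 12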
286
+
287
+ // =============================================================================
288
+ // ProductionMetrics Class
289
+ // =============================================================================
290
+
291
+ /**
292
+ * Collects and exposes production metrics for PostgreSQL Durable Objects.
293
+ * Tracks query performance, connection statistics, storage tier operations,
294
+ * and provides health checks, Prometheus export, and alerting.
295
+ */
296
+ export class ProductionMetrics {
297
+ private config: ProductionMetricsConfig
298
+ private startTime: number
299
+
300
+ // Query metrics state
301
+ private queries: QueryEntry[] = []
302
+ private slowQueries: SlowQueryLog[] = []
303
+ private digestMap: Map<string, QueryDigest> = new Map()
304
+ private totalErrors = 0
305
+ private totalRowsReturned = 0
306
+
307
+ // Connection state
308
+ private connections: ConnectionEntry[] = []
309
+ private totalConnectionsOpened = 0
310
+ private peakConnections = 0
311
+ private connectionErrors = 0
312
+ private connectionDurations: number[] = []
313
+ private waitTimes: number[] = []
314
+ private idleConnections = 0
315
+
316
+ // Storage state
317
+ private storageOps: StorageOp[] = []
318
+ private tierPromotions = { coldToWarm: 0, warmToHot: 0 }
319
+ private tierDemotions = { hotToWarm: 0, warmToCold: 0 }
320
+ private tierErrors: Record<string, number> = { hot: 0, warm: 0, cold: 0 }
321
+ private tierUsage: Record<string, number> = { hot: 0, warm: 0, cold: 0 }
322
+ private tierHealth: Record<string, string> = { hot: 'healthy', warm: 'healthy', cold: 'healthy' }
323
+
324
+ // Dependencies
325
+ private pgliteInstance: any = null
326
+ private storageOrchestrator: any = null
327
+
328
+ // Alert state
329
+ private activeAlerts: Alert[] = []
330
+ private customThresholds: AlertThreshold[] = []
331
+
332
+ constructor(config: ProductionMetricsConfig) {
333
+ this.config = config
334
+ this.startTime = Date.now()
335
+ }
336
+
337
+ // ===========================================================================
338
+ // Query Metrics
339
+ // ===========================================================================
340
+
341
+ /** Records a completed query, updating metrics, digests, slow query log, and alerts */
342
+ recordQuery(record: QueryRecord): void {
343
+ const operation = extractOperation(record.sql)
344
+ const entry: QueryEntry = {
345
+ durationMs: Math.max(0, record.durationMs),
346
+ success: record.success,
347
+ timestamp: Date.now(),
348
+ operation,
349
+ rowsReturned: record.rowsReturned,
350
+ }
351
+
352
+ this.queries.push(entry)
353
+ this.totalRowsReturned += record.rowsReturned
354
+
355
+ if (!record.success) {
356
+ this.totalErrors++
357
+ }
358
+
359
+ this.trackSlowQuery(record)
360
+ this.updateQueryDigest(record)
361
+
362
+ // Evaluate alerts on every query at first, then on every ALERT_EVALUATION_INTERVAL-th query to limit overhead on high-throughput workloads
363
+ if (this.queries.length <= ALERT_EVALUATION_INTERVAL || this.queries.length % ALERT_EVALUATION_INTERVAL === 0) {
364
+ this.evaluateAlerts()
365
+ }
366
+ }
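A minimal sketch of recording a completed query; `metrics` is assumed to be an existing ProductionMetrics instance and the values are illustrative:

metrics.recordQuery({
  sql: 'SELECT * FROM orders WHERE customer_id = 42',
  durationMs: 12.5,
  rowsReturned: 3,
  success: true,
})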
367
+
368
+ /** Adds a query to the slow query log if its duration meets or exceeds the configured threshold */
369
+ private trackSlowQuery(record: QueryRecord): void {
370
+ const threshold = this.config.slowQueryThresholdMs || DEFAULT_SLOW_QUERY_THRESHOLD_MS
371
+ if (record.durationMs >= threshold) {
372
+ this.slowQueries.push({
373
+ sql: record.sql,
374
+ durationMs: record.durationMs,
375
+ rowsReturned: record.rowsReturned,
376
+ timestamp: Date.now(),
377
+ })
378
+ if (this.slowQueries.length > MAX_SLOW_QUERY_LOG_SIZE) {
379
+ this.slowQueries = this.slowQueries.slice(-MAX_SLOW_QUERY_LOG_SIZE)
380
+ }
381
+ }
382
+ }
383
+
384
+ /** Updates the query digest map with a new query record */
385
+ private updateQueryDigest(record: QueryRecord): void {
386
+ const pattern = normalizeQuery(record.sql)
387
+ const existing = this.digestMap.get(pattern)
388
+ if (existing) {
389
+ existing.count++
390
+ existing.totalDurationMs += record.durationMs
391
+ existing.avgDurationMs = existing.totalDurationMs / existing.count
392
+ existing.lastSeen = Date.now()
393
+ } else {
394
+ const maxDigests = this.config.maxQueryDigests || DEFAULT_MAX_QUERY_DIGESTS
395
+ if (this.digestMap.size < maxDigests) {
396
+ this.digestMap.set(pattern, {
397
+ pattern,
398
+ count: 1,
399
+ avgDurationMs: record.durationMs,
400
+ totalDurationMs: record.durationMs,
401
+ lastSeen: Date.now(),
402
+ })
403
+ }
404
+ }
405
+ }
406
+
407
+ /** Returns a comprehensive snapshot of query performance metrics */
408
+ getQueryMetrics(): QueryMetricsSnapshot {
409
+ const total = this.queries.length
410
+ const durations = this.getSortedDurationsSample(total)
411
+ const totalDurationSum = total > 0 ? this.queries.reduce((s, q) => s + q.durationMs, 0) : 0
412
+ const avgDuration = total > 0 ? totalDurationSum / total : 0
413
+ const errors = this.queries.filter((q) => !q.success).length
414
+ const errorRate = total > 0 ? errors / total : 0
415
+
416
+ const now = Date.now()
417
+ const elapsedSeconds = Math.max(1, (now - this.startTime) / 1000)
418
+ const queriesPerSecond = total / elapsedSeconds
419
+
420
+ const byOperation = this.computeOperationBreakdown()
421
+
422
+ const windows = {
423
+ oneMinute: this.getWindowMetrics(now - ONE_MINUTE_MS),
424
+ fiveMinutes: this.getWindowMetrics(now - FIVE_MINUTES_MS),
425
+ fifteenMinutes: this.getWindowMetrics(now - FIFTEEN_MINUTES_MS),
426
+ }
427
+
428
+ return {
429
+ totalQueries: total,
430
+ avgDurationMs: avgDuration,
431
+ p50DurationMs: percentile(durations, 50),
432
+ p95DurationMs: percentile(durations, 95),
433
+ p99DurationMs: percentile(durations, 99),
434
+ queriesPerSecond,
435
+ errorRate,
436
+ totalErrors: errors,
437
+ totalRowsReturned: this.totalRowsReturned,
438
+ byOperation,
439
+ windows,
440
+ }
441
+ }
442
+
443
+ /** Returns sorted duration samples, using reservoir sampling for large datasets */
444
+ private getSortedDurationsSample(total: number): number[] {
445
+ if (total > RESERVOIR_SAMPLING_THRESHOLD) {
446
+ const sample: number[] = []
447
+ for (let i = 0; i < total; i++) {
448
+ if (i < PERCENTILE_SAMPLE_SIZE) {
449
+ sample.push(this.queries[i].durationMs)
450
+ } else {
451
+ const j = Math.floor(Math.random() * (i + 1))
452
+ if (j < PERCENTILE_SAMPLE_SIZE) {
453
+ sample[j] = this.queries[i].durationMs
454
+ }
455
+ }
456
+ }
457
+ return sample.sort((a, b) => a - b)
458
+ }
459
+ return this.queries.map((q) => q.durationMs).sort((a, b) => a - b)
460
+ }
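Once more than RESERVOIR_SAMPLING_THRESHOLD (10,000) queries have been recorded, only PERCENTILE_SAMPLE_SIZE (1,000) durations are kept via reservoir sampling, so with, say, 50,000 recorded queries the percentile sort runs over 1,000 samples rather than 50,000.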
461
+
462
+ /** Computes per-operation count and average duration breakdown */
463
+ private computeOperationBreakdown(): Record<string, { count: number; avgDurationMs: number }> {
464
+ const byOperation: Record<string, { count: number; avgDurationMs: number }> = {}
465
+ for (const q of this.queries) {
466
+ if (!byOperation[q.operation]) {
467
+ byOperation[q.operation] = { count: 0, avgDurationMs: 0 }
468
+ }
469
+ byOperation[q.operation].count++
470
+ }
471
+ for (const op of Object.keys(byOperation)) {
472
+ const opQueries = this.queries.filter((q) => q.operation === op)
473
+ const sum = opQueries.reduce((s, q) => s + q.durationMs, 0)
474
+ byOperation[op].avgDurationMs = opQueries.length > 0 ? sum / opQueries.length : 0
475
+ }
476
+ return byOperation
477
+ }
478
+
479
+ /** Returns a copy of the slow query log */
480
+ getSlowQueryLog(): SlowQueryLog[] {
481
+ return [...this.slowQueries]
482
+ }
483
+
484
+ /** Returns all tracked query digest patterns */
485
+ getQueryDigests(): QueryDigest[] {
486
+ return Array.from(this.digestMap.values())
487
+ }
488
+
489
+ /** Resets all query-related metrics, including digests and slow query log */
490
+ resetQueryMetrics(): void {
491
+ this.queries = []
492
+ this.slowQueries = []
493
+ this.digestMap.clear()
494
+ this.totalErrors = 0
495
+ this.totalRowsReturned = 0
496
+ }
497
+
498
+ // ===========================================================================
499
+ // Connection Stats
500
+ // ===========================================================================
501
+
502
+ /** Records a new connection being opened */
503
+ recordConnectionOpen(options?: { type?: 'websocket' | 'http' }): void {
504
+ const conn: ConnectionEntry = {
505
+ type: options?.type || 'unknown',
506
+ openedAt: Date.now(),
507
+ idle: false,
508
+ }
509
+ this.connections.push(conn)
510
+ this.totalConnectionsOpened++
511
+ if (this.connections.length > this.peakConnections) {
512
+ this.peakConnections = this.connections.length
513
+ }
514
+ }
515
+
516
+ /** Records a connection being closed, optionally with its total duration */
517
+ recordConnectionClose(options?: { durationMs?: number }): void {
518
+ if (this.connections.length > 0) {
519
+ const conn = this.connections.pop()!
520
+ if (conn.idle) {
521
+ this.idleConnections = Math.max(0, this.idleConnections - 1)
522
+ }
523
+ if (options?.durationMs !== undefined) {
524
+ this.connectionDurations.push(options.durationMs)
525
+ }
526
+ }
527
+ }
528
+
529
+ /** Records a connection error */
530
+ recordConnectionError(_message: string): void {
531
+ this.connectionErrors++
532
+ }
533
+
534
+ /** Records a connection transitioning to idle state */
535
+ recordConnectionIdle(): void {
536
+ this.idleConnections++
537
+ }
538
+
539
+ /** Records a connection being acquired from the pool, with wait time */
540
+ recordConnectionAcquired(options: { waitTimeMs: number }): void {
541
+ this.waitTimes.push(options.waitTimeMs)
542
+ }
543
+
544
+ /** Returns a snapshot of connection statistics */
545
+ getConnectionStats(): ConnectionStats {
546
+ const wsCount = this.connections.filter((c) => c.type === 'websocket').length
547
+ const httpCount = this.connections.filter((c) => c.type === 'http').length
548
+
549
+ const avgDuration = this.connectionDurations.length > 0
550
+ ? this.connectionDurations.reduce((s, d) => s + d, 0) / this.connectionDurations.length
551
+ : 0
552
+
553
+ const avgWait = this.waitTimes.length > 0
554
+ ? this.waitTimes.reduce((s, w) => s + w, 0) / this.waitTimes.length
555
+ : 0
556
+
557
+ return {
558
+ activeConnections: this.connections.length,
559
+ totalConnectionsOpened: this.totalConnectionsOpened,
560
+ peakConnections: this.peakConnections,
561
+ connectionErrors: this.connectionErrors,
562
+ avgConnectionDurationMs: avgDuration,
563
+ idleConnections: this.idleConnections,
564
+ poolUtilization: Math.min(1.0, this.connections.length), // max_connections=1 in DO model
565
+ websocketConnections: wsCount,
566
+ httpConnections: httpCount,
567
+ avgWaitTimeMs: avgWait,
568
+ uptimeMs: Date.now() - this.startTime,
569
+ }
570
+ }
571
+
572
+ // ===========================================================================
573
+ // Storage Tier Stats
574
+ // ===========================================================================
575
+
576
+ /** Records a storage tier operation (read/write) with optional hit/miss and timing */
577
+ recordStorageOperation(
578
+ tier: 'hot' | 'warm' | 'cold',
579
+ operation: 'read' | 'write',
580
+ options: { hit?: boolean; bytes: number; durationMs?: number }
581
+ ): void {
582
+ this.storageOps.push({
583
+ tier,
584
+ operation,
585
+ hit: options.hit,
586
+ bytes: options.bytes,
587
+ durationMs: options.durationMs,
588
+ timestamp: Date.now(),
589
+ })
590
+ }
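Sketch of recording tiered storage activity; the tiers, sizes, and latencies here are made up:

// Read served from the warm tier (cache hit), ~2 KB payload
metrics.recordStorageOperation('warm', 'read', { hit: true, bytes: 2048, durationMs: 3.2 })
// Write persisted to the cold tier
metrics.recordStorageOperation('cold', 'write', { bytes: 2048, durationMs: 41 })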
591
+
592
+ /** Records a data promotion between storage tiers */
593
+ recordTierPromotion(from: string, to: string, _details: { key: string; bytes: number }): void {
594
+ if (from === 'cold' && to === 'warm') this.tierPromotions.coldToWarm++
595
+ if (from === 'warm' && to === 'hot') this.tierPromotions.warmToHot++
596
+ }
597
+
598
+ /** Records a data demotion between storage tiers */
599
+ recordTierDemotion(from: string, to: string, _details: { key: string; bytes: number }): void {
600
+ if (from === 'hot' && to === 'warm') this.tierDemotions.hotToWarm++
601
+ if (from === 'warm' && to === 'cold') this.tierDemotions.warmToCold++
602
+ }
603
+
604
+ /** Records a storage error and raises an alert if the configured error threshold is exceeded */
605
+ recordStorageError(tier: string, _message: string): void {
606
+ this.tierErrors[tier] = (this.tierErrors[tier] || 0) + 1
607
+
608
+ // Check alert threshold
609
+ const totalStorageErrors = Object.values(this.tierErrors).reduce((s, e) => s + e, 0)
610
+ const threshold = this.config.alertThresholds?.storageErrorRate
611
+ if (threshold !== undefined && totalStorageErrors > threshold) {
612
+ this.addAlert({
613
+ type: 'storage_error',
614
+ severity: 'critical',
615
+ message: `Storage error count exceeded threshold: ${totalStorageErrors} errors`,
616
+ triggeredAt: Date.now(),
617
+ value: totalStorageErrors,
618
+ })
619
+ }
620
+ }
621
+
622
+ /** Records the current storage usage for a tier */
623
+ recordStorageUsage(tier: 'hot' | 'warm' | 'cold', bytes: number): void {
624
+ this.tierUsage[tier] = bytes
625
+ }
626
+
627
+ /** Records a change in tier health status */
628
+ recordTierHealthChange(tier: string, status: 'healthy' | 'degraded' | 'unhealthy'): void {
629
+ this.tierHealth[tier] = status
630
+ }
631
+
632
+ /** Returns a comprehensive snapshot of storage tier statistics including costs */
633
+ getStorageTierStats(): StorageTierSnapshot {
634
+ const getTierStats = (tier: 'hot' | 'warm' | 'cold'): TierStats => {
635
+ const ops = this.storageOps.filter((o) => o.tier === tier)
636
+ const reads = ops.filter((o) => o.operation === 'read')
637
+ const hits = reads.filter((o) => o.hit === true).length
638
+ const hitRate = reads.length > 0 ? hits / reads.length : 0
639
+
640
+ const bytesRead = ops.filter((o) => o.operation === 'read').reduce((s, o) => s + o.bytes, 0)
641
+ const bytesWritten = ops.filter((o) => o.operation === 'write').reduce((s, o) => s + o.bytes, 0)
642
+
643
+ const durationsWithValues = ops.filter((o) => o.durationMs !== undefined)
644
+ const avgLatencyMs = durationsWithValues.length > 0
645
+ ? durationsWithValues.reduce((s, o) => s + (o.durationMs || 0), 0) / durationsWithValues.length
646
+ : 0
647
+
648
+ return {
649
+ hitRate,
650
+ bytesRead,
651
+ bytesWritten,
652
+ totalOperations: ops.length,
653
+ avgLatencyMs,
654
+ errors: this.tierErrors[tier] || 0,
655
+ usageBytes: this.tierUsage[tier] || 0,
656
+ healthStatus: (this.tierHealth[tier] || 'healthy') as 'healthy' | 'degraded' | 'unhealthy',
657
+ }
658
+ }
659
+
660
+ const hotStats = getTierStats('hot')
661
+ const warmStats = getTierStats('warm')
662
+ const coldStats = getTierStats('cold')
663
+
664
+ // Estimate costs based on bytes stored
665
+ const hotCost = (this.tierUsage['hot'] || 0) * COST_PER_BYTE_HOT
666
+ const warmCost = (this.tierUsage['warm'] || 0) * COST_PER_BYTE_WARM
667
+ const coldCost = (this.tierUsage['cold'] || 0) * COST_PER_BYTE_COLD
668
+
669
+ // Calculate tiering efficiency (hot hit rate as primary metric)
670
+ const hotReads = this.storageOps.filter((o) => o.tier === 'hot' && o.operation === 'read')
671
+ const hotHits = hotReads.filter((o) => o.hit === true).length
672
+ const tieringEfficiency = hotReads.length > 0 ? hotHits / hotReads.length : 0
673
+
674
+ return {
675
+ hot: hotStats,
676
+ warm: warmStats,
677
+ cold: coldStats,
678
+ promotions: { ...this.tierPromotions },
679
+ demotions: { ...this.tierDemotions },
680
+ estimatedCosts: {
681
+ hot: hotCost,
682
+ warm: warmCost,
683
+ cold: coldCost,
684
+ total: hotCost + warmCost + coldCost,
685
+ },
686
+ tieringEfficiency,
687
+ }
688
+ }
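As a worked example of the simplified cost model: 10 MB resident in the hot tier contributes roughly 10 * 1024 * 1024 * 0.000001 ≈ $10.49 to estimatedCosts.hot, while the same 10 MB in the cold tier contributes about 10 * 1024 * 1024 * 0.000000015 ≈ $0.16.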
689
+
690
+ // ===========================================================================
691
+ // Health Checks
692
+ // ===========================================================================
693
+
694
+ /** Sets the PGLite instance for health check queries */
695
+ setPGLiteInstance(pglite: any): void {
696
+ this.pgliteInstance = pglite
697
+ }
698
+
699
+ /** Sets the storage orchestrator for health check tier status */
700
+ setStorageOrchestrator(orchestrator: any): void {
701
+ this.storageOrchestrator = orchestrator
702
+ }
703
+
704
+ /** Returns a simple liveness probe result (always healthy if the service is running) */
705
+ liveness(): { status: 'healthy'; service: string; uptimeMs: number } {
706
+ return {
707
+ status: 'healthy',
708
+ service: this.config.serviceName,
709
+ uptimeMs: Date.now() - this.startTime,
710
+ }
711
+ }
712
+
713
+ /** Performs a readiness check including PGLite, storage, and memory health */
714
+ async readiness(_options?: { timeoutMs?: number }): Promise<HealthCheckResult> {
715
+ const startTime = Date.now()
716
+ const checks: Record<string, ComponentHealth> = {}
717
+
718
+ checks.pglite = await this.checkPGLiteHealth()
719
+ checks.storage = this.checkStorageHealth()
720
+ checks.memory = {
721
+ status: 'healthy',
722
+ details: {
723
+ heapUsedBytes: SIMULATED_HEAP_USED_BYTES,
724
+ heapTotalBytes: WORKER_MEMORY_LIMIT_BYTES,
725
+ },
726
+ }
727
+
728
+ const overallStatus = this.determineOverallStatus(checks)
729
+
730
+ return {
731
+ status: overallStatus,
732
+ checks,
733
+ responseTimeMs: Date.now() - startTime,
734
+ }
735
+ }
736
+
737
+ /** Checks PGLite health with a microtask-based timeout */
738
+ private async checkPGLiteHealth(): Promise<ComponentHealth> {
739
+ if (!this.pgliteInstance) {
740
+ return { status: 'unhealthy', error: 'PGLite not initialized' }
741
+ }
742
+
743
+ try {
744
+ const queryPromise = this.pgliteInstance.query('SELECT 1 as result')
745
+ let settled = false
746
+ const wrappedQuery = queryPromise.then(
747
+ (v: any) => { settled = true; return v },
748
+ (e: any) => { settled = true; throw e }
749
+ )
750
+ const timeoutCheck = new Promise<never>(async (_, reject) => {
751
+ for (let i = 0; i < HEALTH_CHECK_TIMEOUT_YIELD_COUNT; i++) {
752
+ await Promise.resolve()
753
+ if (settled) return
754
+ }
755
+ if (!settled) {
756
+ reject(new Error('Health check timeout'))
757
+ }
758
+ })
759
+ await Promise.race([wrappedQuery, timeoutCheck])
760
+ return { status: 'healthy' }
761
+ } catch (e) {
762
+ const errorMsg = e instanceof Error ? e.message : 'Unknown health check error'
763
+ return {
764
+ status: 'unhealthy',
765
+ error: errorMsg.includes('timeout') ? 'Health check timeout' : errorMsg,
766
+ }
767
+ }
768
+ }
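Note that the timeout here is approximate: instead of setTimeout, the check yields to the microtask queue up to HEALTH_CHECK_TIMEOUT_YIELD_COUNT (20) times and rejects with 'Health check timeout' if the PGLite query has not settled by then. The exported DEFAULT_HEALTH_CHECK_TIMEOUT_MS constant is not consumed by this code path.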
769
+
770
+ /** Checks storage orchestrator health */
771
+ private checkStorageHealth(): ComponentHealth {
772
+ if (!this.storageOrchestrator) {
773
+ return { status: 'healthy', details: { message: 'No orchestrator configured' } }
774
+ }
775
+
776
+ const tierHealth = this.storageOrchestrator.getTierHealth()
777
+ const allHealthy = Object.values(tierHealth).every((t: any) => t.status === 'healthy')
778
+ const anyDegraded = Object.values(tierHealth).some((t: any) => t.status === 'degraded')
779
+
780
+ return {
781
+ status: allHealthy ? 'healthy' : (anyDegraded ? 'degraded' : 'unhealthy'),
782
+ details: tierHealth,
783
+ }
784
+ }
785
+
786
+ /** Determines the worst overall status from all component checks */
787
+ private determineOverallStatus(checks: Record<string, ComponentHealth>): 'healthy' | 'degraded' | 'unhealthy' {
788
+ const statuses = Object.values(checks).map((c) => c.status)
789
+ if (statuses.includes('unhealthy')) return 'unhealthy'
790
+ if (statuses.includes('degraded')) return 'degraded'
791
+ return 'healthy'
792
+ }
793
+
794
+ /** Performs a deep health check including WAL status */
795
+ async deepCheck(): Promise<HealthCheckResult> {
796
+ const startTime = Date.now()
797
+ const readinessResult = await this.readiness()
798
+
799
+ // Add WAL check
800
+ readinessResult.checks.wal = {
801
+ status: 'healthy',
802
+ details: { lastArchive: 'N/A' },
803
+ }
804
+
805
+ readinessResult.responseTimeMs = Date.now() - startTime
806
+ return readinessResult
807
+ }
808
+
809
+ // ===========================================================================
810
+ // Metrics Export
811
+ // ===========================================================================
812
+
813
+ /** Exports all metrics in Prometheus text exposition format */
814
+ exportPrometheus(): string {
815
+ const lines: string[] = []
816
+ const labels = `service="${this.config.serviceName}",do_id="${this.config.doId}"`
817
+ const queryMetrics = this.getQueryMetrics()
818
+
819
+ // Query total counter
820
+ lines.push('# HELP postgres_query_total Total number of queries executed')
821
+ lines.push('# TYPE postgres_query_total counter')
822
+ lines.push(`postgres_query_total{${labels}} ${queryMetrics.totalQueries}`)
823
+
824
+ // Query errors
825
+ lines.push('# HELP postgres_query_errors_total Total number of query errors')
826
+ lines.push('# TYPE postgres_query_errors_total counter')
827
+ lines.push(`postgres_query_errors_total{${labels}} ${queryMetrics.totalErrors}`)
828
+
829
+ // Query duration histogram
830
+ lines.push('# HELP postgres_query_duration_seconds Query execution time in seconds')
831
+ lines.push('# TYPE postgres_query_duration_seconds histogram')
832
+
833
+ const boundaries = this.config.histogramBoundaries || DEFAULT_HISTOGRAM_BOUNDARIES
834
+ const durations = this.queries.map((q) => q.durationMs / 1000).sort((a, b) => a - b)
835
+ let sum = 0
836
+
837
+ for (const boundary of boundaries) {
838
+ const count = durations.filter((d) => d <= boundary).length
839
+ lines.push(`postgres_query_duration_seconds_bucket{${labels},le="${boundary}"} ${count}`)
840
+ }
841
+ lines.push(`postgres_query_duration_seconds_bucket{${labels},le="+Inf"} ${durations.length}`)
842
+
843
+ sum = durations.reduce((s, d) => s + d, 0)
844
+ lines.push(`postgres_query_duration_seconds_count{${labels}} ${durations.length}`)
845
+ lines.push(`postgres_query_duration_seconds_sum{${labels}} ${sum}`)
846
+
847
+ // Connections
848
+ const connStats = this.getConnectionStats()
849
+ lines.push('# HELP postgres_connections_active Current active connections')
850
+ lines.push('# TYPE postgres_connections_active gauge')
851
+ lines.push(`postgres_connections_active{${labels}} ${connStats.activeConnections}`)
852
+
853
+ // Storage operations
854
+ lines.push('# HELP postgres_storage_operations_total Total storage operations')
855
+ lines.push('# TYPE postgres_storage_operations_total counter')
856
+ for (const tier of ['hot', 'warm', 'cold'] as const) {
857
+ const count = this.storageOps.filter((o) => o.tier === tier).length
858
+ lines.push(`postgres_storage_operations_total{${labels},tier="${tier}"} ${count}`)
859
+ }
860
+
861
+ return lines.join('\n')
862
+ }
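For reference, the exposition text produced by exportPrometheus() looks roughly like this; the service name, DO id, and values are illustrative:

# HELP postgres_query_total Total number of queries executed
# TYPE postgres_query_total counter
postgres_query_total{service="orders-db",do_id="do-1234"} 1542
postgres_query_duration_seconds_bucket{service="orders-db",do_id="do-1234",le="0.005"} 1210
postgres_query_duration_seconds_bucket{service="orders-db",do_id="do-1234",le="+Inf"} 1542
postgres_query_duration_seconds_count{service="orders-db",do_id="do-1234"} 1542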
863
+
864
+ /** Exports all metrics as a structured JSON object */
865
+ exportJSON(): { metrics: any; timestamp: number; service: string } {
866
+ return {
867
+ metrics: {
868
+ queries: this.getQueryMetrics(),
869
+ connections: this.getConnectionStats(),
870
+ storage: this.getStorageTierStats(),
871
+ },
872
+ timestamp: Date.now(),
873
+ service: this.config.serviceName,
874
+ }
875
+ }
876
+
877
+ /** Creates an HTTP request handler that serves metrics in Prometheus or JSON format */
878
+ createMetricsHandler(): (request: Request) => Promise<Response> {
879
+ return async (request: Request) => {
880
+ const accept = request.headers.get('Accept') || 'text/plain'
881
+
882
+ if (accept.includes('application/json')) {
883
+ const json = this.exportJSON()
884
+ return new Response(JSON.stringify(json), {
885
+ status: 200,
886
+ headers: { 'content-type': 'application/json' },
887
+ })
888
+ }
889
+
890
+ // Default: Prometheus format
891
+ const prometheus = this.exportPrometheus()
892
+ return new Response(prometheus, {
893
+ status: 200,
894
+ headers: { 'content-type': 'text/plain; charset=utf-8' },
895
+ })
896
+ }
897
+ }
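A hedged sketch of routing a metrics endpoint through this handler inside a Worker or Durable Object fetch; the '/metrics' path and surrounding function are assumptions, not part of this package:

const handleMetrics = metrics.createMetricsHandler()

async function handleFetch(request: Request): Promise<Response> {
  const url = new URL(request.url)
  if (url.pathname === '/metrics') {
    // Returns JSON when the client sends Accept: application/json, Prometheus text otherwise
    return handleMetrics(request)
  }
  return new Response('not found', { status: 404 })
}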
898
+
899
+ // ===========================================================================
900
+ // Dashboard
901
+ // ===========================================================================
902
+
903
+ /** Returns a complete metrics dashboard snapshot for display */
904
+ getDashboard(): MetricsDashboard {
905
+ return {
906
+ queries: this.getQueryMetrics(),
907
+ connections: this.getConnectionStats(),
908
+ storage: this.getStorageTierStats(),
909
+ health: {
910
+ status: 'healthy',
911
+ checks: {},
912
+ },
913
+ service: {
914
+ name: this.config.serviceName,
915
+ doId: this.config.doId,
916
+ uptimeMs: Date.now() - this.startTime,
917
+ version: this.config.serviceVersion,
918
+ },
919
+ memory: {
920
+ heapUsedBytes: SIMULATED_HEAP_USED_BYTES,
921
+ heapTotalBytes: WORKER_MEMORY_LIMIT_BYTES,
922
+ },
923
+ alerts: this.getActiveAlerts(),
924
+ timestamp: Date.now(),
925
+ }
926
+ }
927
+
928
+ // ===========================================================================
929
+ // Alerts
930
+ // ===========================================================================
931
+
932
+ /** Evaluates all alert conditions against current metrics, adding/removing alerts as needed */
933
+ evaluateAlerts(): void {
934
+ const queryMetrics = this.getQueryMetrics()
935
+
936
+ const errorThreshold = this.config.alertThresholds?.errorRatePercent ?? DEFAULT_ERROR_RATE_THRESHOLD_PERCENT
937
+ const errorRatePercent = queryMetrics.errorRate * 100
938
+ if (errorRatePercent > errorThreshold) {
939
+ this.addAlert({
940
+ type: 'error_rate',
941
+ severity: 'critical',
942
+ message: `Error rate ${errorRatePercent.toFixed(1)}% exceeds threshold ${errorThreshold}%`,
943
+ triggeredAt: Date.now(),
944
+ value: errorRatePercent,
945
+ })
946
+ } else {
947
+ // Resolve error rate alerts
948
+ this.activeAlerts = this.activeAlerts.filter((a) => a.type !== 'error_rate')
949
+ }
950
+
951
+ const p99Threshold = this.config.alertThresholds?.p99LatencyMs ?? DEFAULT_P99_LATENCY_THRESHOLD_MS
952
+ if (queryMetrics.p99DurationMs > p99Threshold) {
953
+ this.addAlert({
954
+ type: 'high_latency',
955
+ severity: 'warning',
956
+ message: `P99 latency ${queryMetrics.p99DurationMs}ms exceeds threshold ${p99Threshold}ms`,
957
+ triggeredAt: Date.now(),
958
+ value: queryMetrics.p99DurationMs,
959
+ })
960
+ } else {
961
+ this.activeAlerts = this.activeAlerts.filter((a) => a.type !== 'high_latency')
962
+ }
963
+
964
+ // Custom thresholds (e.g. slow query count)
965
+ for (const threshold of this.customThresholds) {
966
+ if (threshold.type === 'slow_query_count') {
967
+ const slowCount = this.slowQueries.length
968
+ if (slowCount > threshold.threshold) {
969
+ this.addAlert({
970
+ name: threshold.name,
971
+ type: threshold.type,
972
+ severity: threshold.severity,
973
+ message: `Slow query count ${slowCount} exceeds threshold ${threshold.threshold}`,
974
+ triggeredAt: Date.now(),
975
+ value: slowCount,
976
+ })
977
+ }
978
+ }
979
+ }
980
+ }
981
+
982
+ /** Returns a copy of all currently active alerts */
983
+ getActiveAlerts(): Alert[] {
984
+ return [...this.activeAlerts]
985
+ }
986
+
987
+ /** Registers a custom alert threshold for evaluation */
988
+ registerAlertThreshold(threshold: AlertThreshold): void {
989
+ this.customThresholds.push(threshold)
990
+ }
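Usage sketch; the threshold name and value are hypothetical, and 'slow_query_count' is the only custom type that evaluateAlerts() currently acts on:

metrics.registerAlertThreshold({
  name: 'too-many-slow-queries',
  type: 'slow_query_count',
  threshold: 25,
  severity: 'warning',
})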
991
+
992
+ // ===========================================================================
993
+ // Reset
994
+ // ===========================================================================
995
+
996
+ /** Resets all metrics state (queries, connections, storage, alerts) */
997
+ resetAll(): void {
998
+ this.resetQueryMetrics()
999
+ this.connections = []
1000
+ this.totalConnectionsOpened = 0
1001
+ this.peakConnections = 0
1002
+ this.connectionErrors = 0
1003
+ this.connectionDurations = []
1004
+ this.waitTimes = []
1005
+ this.idleConnections = 0
1006
+ this.storageOps = []
1007
+ this.tierPromotions = { coldToWarm: 0, warmToHot: 0 }
1008
+ this.tierDemotions = { hotToWarm: 0, warmToCold: 0 }
1009
+ this.tierErrors = { hot: 0, warm: 0, cold: 0 }
1010
+ this.tierUsage = { hot: 0, warm: 0, cold: 0 }
1011
+ this.activeAlerts = []
1012
+ }
1013
+
1014
+ // ===========================================================================
1015
+ // Private Helpers
1016
+ // ===========================================================================
1017
+
1018
+ private getWindowMetrics(sinceTimestamp: number): WindowMetrics {
1019
+ const windowQueries = this.queries.filter((q) => q.timestamp >= sinceTimestamp)
1020
+ const total = windowQueries.length
1021
+ const errors = windowQueries.filter((q) => !q.success).length
1022
+ const avgDuration = total > 0
1023
+ ? windowQueries.reduce((s, q) => s + q.durationMs, 0) / total
1024
+ : 0
1025
+
1026
+ return {
1027
+ totalQueries: total,
1028
+ avgDurationMs: avgDuration,
1029
+ errorRate: total > 0 ? errors / total : 0,
1030
+ }
1031
+ }
1032
+
1033
+ private addAlert(alert: Alert): void {
1034
+ // Don't add duplicate alerts of the same type (and name, when one is provided)
1035
+ const existing = this.activeAlerts.find((a) =>
1036
+ a.type === alert.type && (alert.name ? a.name === alert.name : true)
1037
+ )
1038
+ if (!existing) {
1039
+ this.activeAlerts.push(alert)
1040
+ }
1041
+ }
1042
+ }
1043
+
1044
+ // =============================================================================
1045
+ // Factory Function
1046
+ // =============================================================================
1047
+
1048
+ /** Creates a ProductionMetrics instance, validating required configuration */
1049
+ export function createProductionMetrics(config: ProductionMetricsConfig): ProductionMetrics {
1050
+ if (!config.serviceName) {
1051
+ throw new Error('ProductionMetrics requires a non-empty serviceName')
1052
+ }
1053
+ return new ProductionMetrics(config)
1054
+ }
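A minimal end-to-end sketch of the factory and collector; identifiers and values are illustrative:

const metrics = createProductionMetrics({
  serviceName: 'orders-db',
  doId: 'do-1234',
  environment: 'production',
  slowQueryThresholdMs: 250,
})

metrics.recordConnectionOpen({ type: 'websocket' })
metrics.recordQuery({ sql: 'SELECT 1', durationMs: 0.8, rowsReturned: 1, success: true })

console.log(metrics.getQueryMetrics().totalQueries)    // 1
console.log(metrics.getDashboard().service.name)       // 'orders-db'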