@dotdo/postgres 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1089 @@
1
+ /**
2
+ * Tests for Production Observability Metrics
3
+ * Task: postgres-7yr6.7 - Production readiness: Observability metrics
4
+ *
5
+ * RED phase TDD - These tests define the expected API surface for:
6
+ * - Query metrics (latency percentiles, throughput, error rates)
7
+ * - Connection stats (pool utilization, active connections)
8
+ * - Storage tier stats (hit rates, promotion/demotion, costs)
9
+ * - Health checks (liveness, readiness, deep checks)
10
+ * - Production-ready metrics endpoint
11
+ */
12
+
13
+ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'
14
+
15
+ // These imports define the expected API surface - they will fail until implemented
16
+ import {
17
+ ProductionMetrics,
18
+ createProductionMetrics,
19
+ type ProductionMetricsConfig,
20
+ type QueryMetricsSnapshot,
21
+ type ConnectionStats,
22
+ type StorageTierSnapshot,
23
+ type HealthCheckResult,
24
+ type MetricsDashboard,
25
+ type AlertThreshold,
26
+ type MetricsExportFormat,
27
+ type PrometheusMetric,
28
+ type QueryDigest,
29
+ type SlowQueryLog,
30
+ } from '../observability/production-metrics'
31
+
32
+ // =============================================================================
33
+ // Mock Setup
34
+ // =============================================================================
35
+
36
/**
 * Builds a stand-in metrics collector in which every method is a Vitest spy.
 * `getMetrics` is pre-programmed to return an empty array; the other spies
 * keep the default `vi.fn()` behaviour.
 */
const createMockMetricsCollector = () => {
  const getMetrics = vi.fn().mockReturnValue([])

  return {
    increment: vi.fn(),
    gauge: vi.fn(),
    histogram: vi.fn(),
    getMetrics,
    reset: vi.fn(),
  }
}
43
+
44
/**
 * Builds a fake PGLite handle exposing only `query` and `exec`.
 * `query` resolves to an empty row set and `exec` resolves to `undefined`
 * unless a test overrides them.
 */
const createMockPGLite = () => {
  const query = vi.fn().mockResolvedValue({ rows: [] })
  const exec = vi.fn().mockResolvedValue(undefined)

  return { query, exec }
}
48
+
49
/**
 * Builds a fake storage orchestrator.
 *
 * - `read` / `write` are bare spies.
 * - `getStats` returns all-zero counters for the hot, warm and cold tiers plus
 *   zeroed promotion/demotion counts.
 * - `getTierHealth` reports every tier as healthy with no consecutive failures.
 */
const createMockStorageOrchestrator = () => {
  // Each call yields a new object so the three tiers never share a reference.
  const zeroedTierCounters = () => ({
    reads: 0,
    writes: 0,
    hits: 0,
    misses: 0,
    bytesRead: 0,
    bytesWritten: 0,
    errors: 0,
    hitRatio: 0,
    deletes: 0,
  })
  const healthyTier = () => ({ status: 'healthy', consecutiveFailures: 0 })

  const stats = {
    tiers: {
      hot: zeroedTierCounters(),
      warm: zeroedTierCounters(),
      cold: zeroedTierCounters(),
    },
    promotions: { coldToWarm: 0, warmToHot: 0 },
    demotions: { hotToWarm: 0, warmToCold: 0 },
  }
  const tierHealth = {
    hot: healthyTier(),
    warm: healthyTier(),
    cold: healthyTier(),
  }

  return {
    read: vi.fn(),
    write: vi.fn(),
    getStats: vi.fn().mockReturnValue(stats),
    getTierHealth: vi.fn().mockReturnValue(tierHealth),
  }
}
67
+
68
+ // =============================================================================
69
+ // Tests: ProductionMetrics Creation
70
+ // =============================================================================
71
+
72
+ describe('ProductionMetrics', () => {
73
+ let mockMetrics: ReturnType<typeof createMockMetricsCollector>
74
+ let mockPGLite: ReturnType<typeof createMockPGLite>
75
+ let mockStorage: ReturnType<typeof createMockStorageOrchestrator>
76
+
77
// Each test starts on fake timers with a newly built set of mocks.
beforeEach(() => {
  vi.useFakeTimers()
  mockStorage = createMockStorageOrchestrator()
  mockPGLite = createMockPGLite()
  mockMetrics = createMockMetricsCollector()
})

// Switch back to real timers and clear recorded mock calls between tests.
afterEach(() => {
  vi.useRealTimers()
  vi.clearAllMocks()
})
88
+
89
// Factory behaviour: accepted configurations and input validation.
describe('createProductionMetrics()', () => {
  it('should create a ProductionMetrics instance with minimal config', () => {
    // Only the two identifying fields are supplied.
    const created = createProductionMetrics({
      serviceName: 'postgres-do',
      doId: 'test-do-123',
    })

    expect(created).toBeInstanceOf(ProductionMetrics)
  })

  it('should create with full configuration', () => {
    // Exercises every option the config type is expected to expose.
    const fullConfig: ProductionMetricsConfig = {
      serviceName: 'postgres-do',
      serviceVersion: '1.0.0',
      doId: 'test-do-123',
      environment: 'production',
      collectIntervalMs: 10000,
      histogramBoundaries: [1, 5, 10, 25, 50, 100, 250, 500, 1000],
      slowQueryThresholdMs: 100,
      maxQueryDigests: 1000,
      enableDetailedStorageMetrics: true,
      alertThresholds: {
        errorRatePercent: 5,
        p99LatencyMs: 1000,
        memoryUsagePercent: 90,
        storageErrorRate: 1,
      },
    }

    expect(createProductionMetrics(fullConfig)).toBeInstanceOf(ProductionMetrics)
  })

  it('should throw if serviceName is empty', () => {
    const buildWithBlankName = () =>
      createProductionMetrics({
        serviceName: '',
        doId: 'test-do-123',
      })

    expect(buildWithBlankName).toThrow()
  })
})
129
+
130
+ // ===========================================================================
131
+ // Tests: Query Metrics
132
+ // ===========================================================================
133
+
134
// Query-level metrics: counts, latency percentiles, throughput, errors,
// slow-query log, digests, time windows and reset.
describe('Query Metrics', () => {
  let metrics: ProductionMetrics

  // Shorthand for recording one successful query on the instance under test.
  const recordOk = (sql: string, durationMs: number, rowsReturned: number): void => {
    metrics.recordQuery({ sql, durationMs, rowsReturned, success: true })
  }

  beforeEach(() => {
    // The slow-query tests below rely on this 100ms threshold.
    metrics = createProductionMetrics({
      serviceName: 'postgres-do',
      doId: 'test-do-123',
      slowQueryThresholdMs: 100,
    })
  })

  it('should record query execution time', () => {
    recordOk('SELECT * FROM users WHERE id = $1', 15, 1)

    const { totalQueries, avgDurationMs } = metrics.getQueryMetrics()

    expect(totalQueries).toBe(1)
    expect(avgDurationMs).toBe(15)
  })

  it('should calculate latency percentiles (p50, p95, p99)', () => {
    // A spread of latencies so the percentiles have something to separate.
    const sampleLatencies = [5, 10, 15, 20, 25, 50, 75, 100, 200, 500]
    sampleLatencies.forEach((latencyMs) => recordOk('SELECT 1', latencyMs, 1))

    const { p50DurationMs, p95DurationMs, p99DurationMs } = metrics.getQueryMetrics()

    expect(p50DurationMs).toBeDefined()
    expect(p95DurationMs).toBeDefined()
    expect(p99DurationMs).toBeDefined()
    // Percentiles must be monotonically non-decreasing.
    expect(p50DurationMs).toBeLessThanOrEqual(p95DurationMs)
    expect(p95DurationMs).toBeLessThanOrEqual(p99DurationMs)
  })

  it('should track query throughput (queries per second)', () => {
    let remaining = 100
    while (remaining-- > 0) {
      recordOk('SELECT 1', 5, 1)
    }

    vi.advanceTimersByTime(10000) // 10 seconds

    const { queriesPerSecond } = metrics.getQueryMetrics()

    expect(queriesPerSecond).toBeDefined()
    expect(queriesPerSecond).toBeGreaterThan(0)
  })

  it('should track error rate', () => {
    // The first eight attempts succeed; the last two fail.
    for (let attempt = 0; attempt < 10; attempt++) {
      const succeeded = attempt < 8
      metrics.recordQuery({
        sql: 'SELECT 1',
        durationMs: 5,
        rowsReturned: succeeded ? 1 : 0,
        success: succeeded,
        error: succeeded ? undefined : 'Query failed',
      })
    }

    const { errorRate, totalErrors } = metrics.getQueryMetrics()

    expect(errorRate).toBeCloseTo(0.2, 1) // 2 out of 10
    expect(totalErrors).toBe(2)
  })

  it('should track queries by operation type (SELECT, INSERT, UPDATE, DELETE)', () => {
    // One statement per operation type: [sql, durationMs, rowsReturned].
    const statements: Array<[string, number, number]> = [
      ['SELECT * FROM users', 5, 10],
      ['INSERT INTO users VALUES ($1)', 10, 0],
      ['UPDATE users SET name = $1', 8, 0],
      ['DELETE FROM users WHERE id = $1', 6, 0],
    ]
    for (const [sql, durationMs, rowsReturned] of statements) {
      recordOk(sql, durationMs, rowsReturned)
    }

    const { byOperation } = metrics.getQueryMetrics()

    expect(byOperation.SELECT.count).toBe(1)
    expect(byOperation.INSERT.count).toBe(1)
    expect(byOperation.UPDATE.count).toBe(1)
    expect(byOperation.DELETE.count).toBe(1)
  })

  it('should detect and log slow queries', () => {
    // 500ms is above the 100ms threshold configured in beforeEach.
    recordOk('SELECT * FROM large_table', 500, 10000)

    const logged = metrics.getSlowQueryLog()

    expect(logged.length).toBe(1)
    expect(logged[0].durationMs).toBe(500)
    expect(logged[0].sql).toBe('SELECT * FROM large_table')
  })

  it('should limit slow query log size', () => {
    // 200 distinct slow queries, each over the threshold.
    Array.from({ length: 200 }, (_, tableIndex) => tableIndex).forEach((tableIndex) => {
      recordOk(`SELECT * FROM table_${tableIndex}`, 200, 1)
    })

    const logged = metrics.getSlowQueryLog()

    expect(logged.length).toBeLessThanOrEqual(100) // Default limit
  })

  it('should generate query digests (normalized patterns)', () => {
    // Same statement shape, differing only in the literal id.
    const lookups: Array<[string, number]> = [
      ['SELECT * FROM users WHERE id = 1', 5],
      ['SELECT * FROM users WHERE id = 2', 8],
      ['SELECT * FROM users WHERE id = 3', 6],
    ]
    for (const [sql, durationMs] of lookups) {
      recordOk(sql, durationMs, 1)
    }

    const digests = metrics.getQueryDigests()

    // All three should be normalized to the same pattern
    expect(digests.length).toBe(1)
    expect(digests[0].pattern).toContain('SELECT * FROM users WHERE id = $')
    expect(digests[0].count).toBe(3)
    expect(digests[0].avgDurationMs).toBeCloseTo(6.33, 0)
  })

  it('should track total rows returned/affected', () => {
    recordOk('SELECT * FROM users', 5, 100)
    recordOk('SELECT * FROM orders', 10, 50)

    const { totalRowsReturned } = metrics.getQueryMetrics()

    expect(totalRowsReturned).toBe(150)
  })

  it('should provide windowed metrics (last 1m, 5m, 15m)', () => {
    // One query per simulated second for a minute, each slightly slower.
    for (let second = 0; second < 60; second++) {
      recordOk('SELECT 1', 5 + second, 1)
      vi.advanceTimersByTime(1000) // 1 second intervals
    }

    const { windows } = metrics.getQueryMetrics()

    expect(windows).toBeDefined()
    expect(windows.oneMinute).toBeDefined()
    expect(windows.fiveMinutes).toBeDefined()
    expect(windows.fifteenMinutes).toBeDefined()
  })

  it('should reset query metrics', () => {
    recordOk('SELECT 1', 5, 1)

    metrics.resetQueryMetrics()

    expect(metrics.getQueryMetrics().totalQueries).toBe(0)
  })
})
306
+
307
+ // ===========================================================================
308
+ // Tests: Connection Stats
309
+ // ===========================================================================
310
+
311
// Connection bookkeeping: open/close counts, peaks, errors, durations,
// idle tracking, pool utilization, per-transport counts, wait time, uptime.
describe('Connection Stats', () => {
  let metrics: ProductionMetrics

  // Opens `count` connections with no options.
  const openConnections = (count: number): void => {
    for (let opened = 0; opened < count; opened++) {
      metrics.recordConnectionOpen()
    }
  }

  // Closes `count` connections with no options.
  const closeConnections = (count: number): void => {
    for (let closed = 0; closed < count; closed++) {
      metrics.recordConnectionClose()
    }
  }

  beforeEach(() => {
    metrics = createProductionMetrics({
      serviceName: 'postgres-do',
      doId: 'test-do-123',
    })
  })

  it('should track active connections', () => {
    openConnections(2)

    const { activeConnections } = metrics.getConnectionStats()

    expect(activeConnections).toBe(2)
  })

  it('should track connection close', () => {
    openConnections(2)
    closeConnections(1)

    const { activeConnections } = metrics.getConnectionStats()

    expect(activeConnections).toBe(1)
  })

  it('should track total connections ever opened', () => {
    // open -> close -> open: two opened in total, one still active.
    openConnections(1)
    closeConnections(1)
    openConnections(1)

    const { totalConnectionsOpened, activeConnections } = metrics.getConnectionStats()

    expect(totalConnectionsOpened).toBe(2)
    expect(activeConnections).toBe(1)
  })

  it('should track peak concurrent connections', () => {
    openConnections(3)
    closeConnections(2)

    const { peakConnections, activeConnections } = metrics.getConnectionStats()

    expect(peakConnections).toBe(3)
    expect(activeConnections).toBe(1)
  })

  it('should track connection errors', () => {
    for (const reason of ['Connection refused', 'Timeout']) {
      metrics.recordConnectionError(reason)
    }

    const { connectionErrors } = metrics.getConnectionStats()

    expect(connectionErrors).toBe(2)
  })

  it('should track average connection duration', () => {
    // Two connections held for 5s and 10s of fake time respectively.
    for (const heldMs of [5000, 10000]) {
      metrics.recordConnectionOpen()
      vi.advanceTimersByTime(heldMs)
      metrics.recordConnectionClose({ durationMs: heldMs })
    }

    const { avgConnectionDurationMs } = metrics.getConnectionStats()

    expect(avgConnectionDurationMs).toBe(7500)
  })

  it('should track idle connections', () => {
    openConnections(1)
    metrics.recordConnectionIdle()

    const { idleConnections } = metrics.getConnectionStats()

    expect(idleConnections).toBe(1)
  })

  it('should track connection pool utilization for DO single-connection model', () => {
    // In the DO model, max_connections=1
    openConnections(1)

    const { poolUtilization } = metrics.getConnectionStats()

    expect(poolUtilization).toBe(1.0) // 100% utilized
  })

  it('should track WebSocket connections separately', () => {
    metrics.recordConnectionOpen({ type: 'websocket' })
    metrics.recordConnectionOpen({ type: 'http' })

    const { websocketConnections, httpConnections } = metrics.getConnectionStats()

    expect(websocketConnections).toBe(1)
    expect(httpConnections).toBe(1)
  })

  it('should track connection wait time (time to acquire)', () => {
    for (const waitTimeMs of [50, 100]) {
      metrics.recordConnectionAcquired({ waitTimeMs })
    }

    const { avgWaitTimeMs } = metrics.getConnectionStats()

    expect(avgWaitTimeMs).toBe(75)
  })

  it('should provide connection uptime', () => {
    const { uptimeMs } = metrics.getConnectionStats()

    expect(uptimeMs).toBeDefined()
    expect(uptimeMs).toBeGreaterThanOrEqual(0)
  })
})
431
+
432
+ // ===========================================================================
433
+ // Tests: Storage Tier Stats
434
+ // ===========================================================================
435
+
436
// Per-tier storage metrics: hit rates, byte counters, promotions/demotions,
// latency, errors, cost estimates, usage, health and tiering efficiency.
describe('Storage Tier Stats', () => {
  let metrics: ProductionMetrics

  // Byte-size unit used by the cost and usage tests.
  const ONE_MIB = 1024 * 1024

  beforeEach(() => {
    metrics = createProductionMetrics({
      serviceName: 'postgres-do',
      doId: 'test-do-123',
      enableDetailedStorageMetrics: true,
    })
  })

  it('should track hit rates for each tier', () => {
    // Two hits and one miss on the hot tier.
    const hotReads = [
      { hit: true, bytes: 1024 },
      { hit: true, bytes: 2048 },
      { hit: false, bytes: 512 },
    ]
    for (const read of hotReads) {
      metrics.recordStorageOperation('hot', 'read', read)
    }

    const tierStats = metrics.getStorageTierStats()

    expect(tierStats.hot.hitRate).toBeCloseTo(2 / 3, 2)
  })

  it('should track bytes read/written per tier', () => {
    metrics.recordStorageOperation('warm', 'write', { bytes: 4096 })
    metrics.recordStorageOperation('warm', 'read', { hit: true, bytes: 2048 })

    const { warm } = metrics.getStorageTierStats()

    expect(warm.bytesWritten).toBe(4096)
    expect(warm.bytesRead).toBe(2048)
  })

  it('should track promotion events (cold to warm, warm to hot)', () => {
    metrics.recordTierPromotion('cold', 'warm', { key: 'page-123', bytes: 1024 })
    metrics.recordTierPromotion('warm', 'hot', { key: 'page-456', bytes: 2048 })

    const { promotions } = metrics.getStorageTierStats()

    expect(promotions.coldToWarm).toBe(1)
    expect(promotions.warmToHot).toBe(1)
  })

  it('should track demotion events (hot to warm, warm to cold)', () => {
    metrics.recordTierDemotion('hot', 'warm', { key: 'page-789', bytes: 1024 })
    metrics.recordTierDemotion('warm', 'cold', { key: 'page-012', bytes: 4096 })

    const { demotions } = metrics.getStorageTierStats()

    expect(demotions.hotToWarm).toBe(1)
    expect(demotions.warmToCold).toBe(1)
  })

  it('should track storage operation latency per tier', () => {
    // A single read per tier, so each average equals that read's duration.
    metrics.recordStorageOperation('hot', 'read', { hit: true, bytes: 1024, durationMs: 1 })
    metrics.recordStorageOperation('warm', 'read', { hit: true, bytes: 1024, durationMs: 10 })
    metrics.recordStorageOperation('cold', 'read', { hit: true, bytes: 1024, durationMs: 100 })

    const { hot, warm, cold } = metrics.getStorageTierStats()

    expect(hot.avgLatencyMs).toBe(1)
    expect(warm.avgLatencyMs).toBe(10)
    expect(cold.avgLatencyMs).toBe(100)
  })

  it('should track storage errors per tier', () => {
    metrics.recordStorageError('hot', 'Cache API error')
    metrics.recordStorageError('cold', 'R2 timeout')
    metrics.recordStorageError('cold', 'R2 permission denied')

    const { hot, cold } = metrics.getStorageTierStats()

    expect(hot.errors).toBe(1)
    expect(cold.errors).toBe(2)
  })

  it('should estimate storage costs per tier', () => {
    metrics.recordStorageOperation('warm', 'write', { bytes: ONE_MIB }) // 1MB
    metrics.recordStorageOperation('cold', 'write', { bytes: 10 * ONE_MIB }) // 10MB

    const { estimatedCosts } = metrics.getStorageTierStats()

    // Only non-negativity is asserted; exact pricing is left to the implementation.
    expect(estimatedCosts).toBeDefined()
    expect(estimatedCosts.hot).toBeGreaterThanOrEqual(0)
    expect(estimatedCosts.warm).toBeGreaterThanOrEqual(0)
    expect(estimatedCosts.cold).toBeGreaterThanOrEqual(0)
    expect(estimatedCosts.total).toBeGreaterThanOrEqual(0)
  })

  it('should track total storage used per tier', () => {
    metrics.recordStorageUsage('hot', ONE_MIB) // 1MB
    metrics.recordStorageUsage('warm', 10 * ONE_MIB) // 10MB
    metrics.recordStorageUsage('cold', 100 * ONE_MIB) // 100MB

    const { hot, warm, cold } = metrics.getStorageTierStats()

    expect(hot.usageBytes).toBe(1024 * 1024)
    expect(warm.usageBytes).toBe(10 * 1024 * 1024)
    expect(cold.usageBytes).toBe(100 * 1024 * 1024)
  })

  it('should track tier health status', () => {
    metrics.recordTierHealthChange('hot', 'healthy')
    metrics.recordTierHealthChange('warm', 'degraded')
    metrics.recordTierHealthChange('cold', 'healthy')

    const { hot, warm, cold } = metrics.getStorageTierStats()

    expect(hot.healthStatus).toBe('healthy')
    expect(warm.healthStatus).toBe('degraded')
    expect(cold.healthStatus).toBe('healthy')
  })

  it('should provide tiering efficiency ratio', () => {
    // Hot tier should have high hit rate (hot data is frequently accessed):
    // 100 hits followed by 10 misses.
    const hotOutcomes: Array<[boolean, number]> = [
      [true, 100],
      [false, 10],
    ]
    for (const [hit, repetitions] of hotOutcomes) {
      for (let n = 0; n < repetitions; n++) {
        metrics.recordStorageOperation('hot', 'read', { hit, bytes: 1024 })
      }
    }

    const { tieringEfficiency } = metrics.getStorageTierStats()

    expect(tieringEfficiency).toBeDefined()
    expect(tieringEfficiency).toBeGreaterThan(0.5)
  })
})
562
+
563
+ // ===========================================================================
564
+ // Tests: Health Checks
565
+ // ===========================================================================
566
+
567
// Health endpoints: liveness (synchronous), readiness and deep checks (async),
// including component aggregation and timeout handling.
describe('Health Checks', () => {
  let metrics: InstanceType<typeof ProductionMetrics>

  beforeEach(() => {
    metrics = createProductionMetrics({
      serviceName: 'postgres-do',
      doId: 'test-do-123',
    })
  })

  it('should provide liveness check', () => {
    const health = metrics.liveness()

    expect(health.status).toBe('healthy')
    expect(health.service).toBe('postgres-do')
  })

  it('should provide readiness check', async () => {
    const health = await metrics.readiness()

    expect(health.status).toBeDefined()
    expect(health.checks).toBeDefined()
  })

  it('should check PGLite connectivity in readiness', async () => {
    metrics.setPGLiteInstance(mockPGLite as any)
    // The next probe query succeeds with a single row.
    mockPGLite.query.mockResolvedValueOnce({ rows: [{ result: 1 }] })

    const health = await metrics.readiness()

    expect(health.checks.pglite).toBeDefined()
    expect(health.checks.pglite.status).toBe('healthy')
  })

  it('should report unhealthy PGLite in readiness', async () => {
    metrics.setPGLiteInstance(mockPGLite as any)
    // The next probe query rejects; the message must surface in the check result.
    mockPGLite.query.mockRejectedValueOnce(new Error('PGLite crashed'))

    const health = await metrics.readiness()

    expect(health.checks.pglite.status).toBe('unhealthy')
    expect(health.checks.pglite.error).toContain('PGLite crashed')
  })

  it('should check storage tier health in readiness', async () => {
    // The default mock reports every tier as healthy.
    metrics.setStorageOrchestrator(mockStorage as any)

    const health = await metrics.readiness()

    expect(health.checks.storage).toBeDefined()
    expect(health.checks.storage.status).toBe('healthy')
  })

  it('should report degraded storage in readiness', async () => {
    // Override the tier health so the warm tier is degraded.
    mockStorage.getTierHealth.mockReturnValue({
      hot: { status: 'healthy', consecutiveFailures: 0 },
      warm: { status: 'degraded', consecutiveFailures: 3 },
      cold: { status: 'healthy', consecutiveFailures: 0 },
    })
    metrics.setStorageOrchestrator(mockStorage as any)

    const health = await metrics.readiness()

    expect(health.checks.storage.status).toBe('degraded')
  })

  it('should check memory usage in readiness', async () => {
    const health = await metrics.readiness()

    expect(health.checks.memory).toBeDefined()
    expect(health.checks.memory.details).toBeDefined()
  })

  it('should provide deep health check with all components', async () => {
    metrics.setPGLiteInstance(mockPGLite as any)
    metrics.setStorageOrchestrator(mockStorage as any)
    mockPGLite.query.mockResolvedValueOnce({ rows: [{ result: 1 }] })

    const health = await metrics.deepCheck()

    expect(health.status).toBeDefined()
    expect(health.checks.pglite).toBeDefined()
    expect(health.checks.storage).toBeDefined()
    expect(health.checks.memory).toBeDefined()
    expect(health.checks.wal).toBeDefined()
    expect(health.responseTimeMs).toBeDefined()
  })

  it('should include uptime in health response', () => {
    vi.advanceTimersByTime(60000) // 1 minute

    const health = metrics.liveness()

    expect(health.uptimeMs).toBeGreaterThanOrEqual(60000)
  })

  it('should aggregate component health into overall status', async () => {
    metrics.setPGLiteInstance(mockPGLite as any)
    mockPGLite.query.mockRejectedValueOnce(new Error('Dead'))

    const health = await metrics.readiness()

    // If any critical component is unhealthy, overall should be unhealthy
    expect(health.status).toBe('unhealthy')
  })

  it('should handle health check timeout', async () => {
    metrics.setPGLiteInstance(mockPGLite as any)
    // The probe query only settles after 30s of fake time, well past the 5s budget.
    mockPGLite.query.mockImplementation(
      () => new Promise((resolve) => setTimeout(resolve, 30000))
    )

    // Start the check BEFORE moving the fake clock. Timers created inside
    // readiness() (presumably the timeoutMs guard; confirm against the
    // implementation) do not exist until the call begins, so advancing the
    // clock first would leave the awaited promise pending forever.
    const pendingHealth = metrics.readiness({ timeoutMs: 5000 })

    // The async variant also lets pending promise callbacks run while the
    // clock advances, so the timeout can reject or resolve.
    await vi.advanceTimersByTimeAsync(5000)

    const health = await pendingHealth

    expect(health.checks.pglite.status).toBe('unhealthy')
    expect(health.checks.pglite.error).toContain('timeout')
  })
})
687
+
688
+ // ===========================================================================
689
+ // Tests: Metrics Export
690
+ // ===========================================================================
691
+
692
// Export surfaces: Prometheus text, JSON, and the HTTP metrics handler.
describe('Metrics Export', () => {
  let metrics: ProductionMetrics

  // Records the trivial successful query most export tests use as seed data.
  const seedOneQuery = (): void => {
    metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 1, success: true })
  }

  // Builds a GET request for the metrics endpoint with the given Accept header.
  const metricsRequest = (accept: string): Request =>
    new Request('http://localhost/metrics', {
      headers: { Accept: accept },
    })

  beforeEach(() => {
    metrics = createProductionMetrics({
      serviceName: 'postgres-do',
      doId: 'test-do-123',
    })
  })

  it('should export metrics in Prometheus format', () => {
    seedOneQuery()

    const prometheusText = metrics.exportPrometheus()

    expect(prometheusText).toContain('# HELP')
    expect(prometheusText).toContain('# TYPE')
    expect(prometheusText).toContain('postgres_query_duration_seconds')
  })

  it('should export metrics in JSON format', () => {
    seedOneQuery()

    const exported = metrics.exportJSON()

    expect(exported.metrics).toBeDefined()
    expect(exported.timestamp).toBeDefined()
    expect(exported.service).toBe('postgres-do')
  })

  it('should include labels in Prometheus metrics', () => {
    seedOneQuery()

    const prometheusText = metrics.exportPrometheus()

    expect(prometheusText).toContain('do_id="test-do-123"')
    expect(prometheusText).toContain('service="postgres-do"')
  })

  it('should export histogram buckets in Prometheus format', () => {
    // Durations 0, 10, 20, ... 990 ms to populate several buckets.
    for (let step = 0; step < 100; step++) {
      metrics.recordQuery({ sql: 'SELECT 1', durationMs: step * 10, rowsReturned: 1, success: true })
    }

    const prometheusText = metrics.exportPrometheus()

    for (const fragment of ['_bucket{', 'le="', '_count', '_sum']) {
      expect(prometheusText).toContain(fragment)
    }
  })

  it('should export query metrics', () => {
    seedOneQuery()

    const prometheusText = metrics.exportPrometheus()

    expect(prometheusText).toContain('postgres_query_total')
    expect(prometheusText).toContain('postgres_query_duration_seconds')
    expect(prometheusText).toContain('postgres_query_errors_total')
  })

  it('should export storage tier metrics', () => {
    metrics.recordStorageOperation('hot', 'read', { hit: true, bytes: 1024 })

    const prometheusText = metrics.exportPrometheus()

    expect(prometheusText).toContain('postgres_storage_operations_total')
    expect(prometheusText).toContain('tier="hot"')
  })

  it('should export connection metrics', () => {
    metrics.recordConnectionOpen()

    const prometheusText = metrics.exportPrometheus()

    expect(prometheusText).toContain('postgres_connections_active')
  })

  it('should handle metrics endpoint request', async () => {
    seedOneQuery()

    const handler = metrics.createMetricsHandler()
    const response = await handler(metricsRequest('text/plain'))

    expect(response.status).toBe(200)
    expect(response.headers.get('content-type')).toContain('text/plain')
  })

  it('should support content negotiation for metrics endpoint', async () => {
    const handler = metrics.createMetricsHandler()

    // JSON first, then Prometheus text; the response type must follow Accept.
    const jsonResponse = await handler(metricsRequest('application/json'))
    expect(jsonResponse.headers.get('content-type')).toContain('application/json')

    const promResponse = await handler(metricsRequest('text/plain'))
    expect(promResponse.headers.get('content-type')).toContain('text/plain')
  })
})
801
+
802
+ // ===========================================================================
803
+ // Tests: Dashboard / Summary
804
+ // ===========================================================================
805
+
806
// Aggregated dashboard view: sections present, service info, alerts, memory.
describe('Metrics Dashboard', () => {
  let metrics: ProductionMetrics

  beforeEach(() => {
    metrics = createProductionMetrics({
      serviceName: 'postgres-do',
      doId: 'test-do-123',
    })
  })

  it('should provide a complete metrics dashboard', () => {
    // Touch each subsystem once so every section has data behind it.
    metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 1, success: true })
    metrics.recordConnectionOpen()
    metrics.recordStorageOperation('hot', 'read', { hit: true, bytes: 1024 })

    const { queries, connections, storage, health, timestamp } = metrics.getDashboard()

    expect(queries).toBeDefined()
    expect(connections).toBeDefined()
    expect(storage).toBeDefined()
    expect(health).toBeDefined()
    expect(timestamp).toBeDefined()
  })

  it('should include service info in dashboard', () => {
    const { service } = metrics.getDashboard()

    expect(service.name).toBe('postgres-do')
    expect(service.doId).toBe('test-do-123')
    expect(service.uptimeMs).toBeDefined()
  })

  it('should include alerts in dashboard', () => {
    // Trigger high error rate: every one of the 100 queries fails.
    let failuresLeft = 100
    while (failuresLeft-- > 0) {
      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 0, success: false, error: 'Error' })
    }

    const { alerts } = metrics.getDashboard()

    expect(alerts).toBeDefined()
    expect(alerts.length).toBeGreaterThan(0)
  })

  it('should include memory metrics in dashboard', () => {
    const { memory } = metrics.getDashboard()

    expect(memory).toBeDefined()
    expect(memory.heapUsedBytes).toBeDefined()
    expect(memory.heapTotalBytes).toBeDefined()
  })
})
858
+
859
+ // ===========================================================================
860
+ // Tests: Alert Thresholds
861
+ // ===========================================================================
862
+
863
/**
 * Alert threshold behaviour.
 *
 * Each test starts from a fresh metrics instance configured with explicit
 * thresholds (5% error rate, 1000ms p99 latency, 90% memory usage and a
 * storage error rate of 1), then drives it with recorded queries or storage
 * errors and inspects `getActiveAlerts()`.
 */
describe('Alert Thresholds', () => {
  let metrics: InstanceType<typeof ProductionMetrics>

  beforeEach(() => {
    metrics = createProductionMetrics({
      serviceName: 'postgres-do',
      doId: 'test-do-123',
      alertThresholds: {
        errorRatePercent: 5,
        p99LatencyMs: 1000,
        memoryUsagePercent: 90,
        storageErrorRate: 1,
      },
    })
  })

  it('should trigger alert when error rate exceeds threshold', () => {
    // Generate high error rate (> 5%): the last 10 of 100 queries fail
    for (let i = 0; i < 100; i++) {
      const failed = i >= 90
      metrics.recordQuery({
        sql: 'SELECT 1',
        durationMs: 5,
        rowsReturned: 0,
        success: !failed, // 10% error rate
        error: failed ? 'Error' : undefined,
      })
    }

    const alerts = metrics.getActiveAlerts()

    expect(alerts.some((a) => a.type === 'error_rate')).toBe(true)
  })

  it('should trigger alert when p99 latency exceeds threshold', () => {
    // Make the slowest 5% of queries take 2000ms. A single slow sample out of
    // 100 is not enough: nearest-rank p99 would pick the 99th value (5ms) and
    // linear interpolation would give ~25ms, both below the 1000ms threshold.
    // With 5 slow samples p99 is 2000ms under every common percentile method.
    for (let i = 0; i < 100; i++) {
      metrics.recordQuery({
        sql: 'SELECT 1',
        durationMs: i < 95 ? 5 : 2000,
        rowsReturned: 1,
        success: true,
      })
    }

    const alerts = metrics.getActiveAlerts()

    expect(alerts.some((a) => a.type === 'high_latency')).toBe(true)
  })

  it('should trigger alert when storage errors exceed threshold', () => {
    // Two cold-tier errors against a configured storageErrorRate of 1
    metrics.recordStorageError('cold', 'R2 error 1')
    metrics.recordStorageError('cold', 'R2 error 2')

    const alerts = metrics.getActiveAlerts()

    expect(alerts.some((a) => a.type === 'storage_error')).toBe(true)
  })

  it('should resolve alerts when conditions return to normal', () => {
    // First trigger an alert
    for (let i = 0; i < 100; i++) {
      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 0, success: false, error: 'Error' })
    }

    expect(metrics.getActiveAlerts().length).toBeGreaterThan(0)

    // Now generate successful queries to bring error rate down
    // (100 failures out of 10100 queries is just under 1%)
    for (let i = 0; i < 10000; i++) {
      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 1, success: true })
    }

    metrics.evaluateAlerts()

    const alerts = metrics.getActiveAlerts()
    const errorAlerts = alerts.filter((a) => a.type === 'error_rate')
    expect(errorAlerts.length).toBe(0)
  })

  it('should include alert severity level', () => {
    for (let i = 0; i < 100; i++) {
      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 0, success: false, error: 'Error' })
    }

    const alerts = metrics.getActiveAlerts()

    // Fail with a clear assertion (not a TypeError) if no alert was raised
    expect(alerts.length).toBeGreaterThan(0)

    const [firstAlert] = alerts
    expect(firstAlert?.severity).toBeDefined()
    expect(['warning', 'critical']).toContain(firstAlert?.severity)
  })

  it('should register custom alert thresholds', () => {
    metrics.registerAlertThreshold({
      name: 'custom_slow_queries',
      type: 'slow_query_count',
      threshold: 10,
      windowMs: 60000,
      severity: 'warning',
    })

    // Trigger the custom alert: 15 queries against a threshold of 10.
    // NOTE(review): assumes a 500ms query counts as "slow" under the default
    // slow-query threshold, which is not configured in this suite - confirm.
    for (let i = 0; i < 15; i++) {
      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 500, rowsReturned: 1, success: true })
    }

    const alerts = metrics.getActiveAlerts()
    expect(alerts.some((a) => a.name === 'custom_slow_queries')).toBe(true)
  })
})
970
+
971
+ // ===========================================================================
972
+ // Tests: Edge Cases
973
+ // ===========================================================================
974
+
975
/**
 * Edge cases for metric recording and snapshots.
 *
 * Each test starts from a fresh metrics instance created with only a service
 * name and DO id (no thresholds, no PGLite instance attached).
 */
describe('Edge Cases', () => {
  let metrics: InstanceType<typeof ProductionMetrics>

  beforeEach(() => {
    metrics = createProductionMetrics({
      serviceName: 'postgres-do',
      doId: 'test-do-123',
    })
  })

  it('should handle recording metrics with no queries', () => {
    const snapshot = metrics.getQueryMetrics()

    expect(snapshot.totalQueries).toBe(0)
    expect(snapshot.avgDurationMs).toBe(0)
    expect(snapshot.errorRate).toBe(0)
  })

  it('should handle very large number of metrics', () => {
    // Durations cycle deterministically through 0..999ms so every run feeds
    // the same samples (no Math.random), keeping failures reproducible.
    for (let i = 0; i < 100000; i++) {
      metrics.recordQuery({ sql: 'SELECT 1', durationMs: i % 1000, rowsReturned: 1, success: true })
    }

    const snapshot = metrics.getQueryMetrics()

    expect(snapshot.totalQueries).toBe(100000)
  })

  it('should handle zero-duration queries', () => {
    metrics.recordQuery({ sql: 'SELECT 1', durationMs: 0, rowsReturned: 0, success: true })

    const snapshot = metrics.getQueryMetrics()

    expect(snapshot.totalQueries).toBe(1)
    expect(snapshot.avgDurationMs).toBe(0)
  })

  it('should handle negative duration gracefully', () => {
    // Should not throw
    expect(() =>
      metrics.recordQuery({ sql: 'SELECT 1', durationMs: -5, rowsReturned: 0, success: true })
    ).not.toThrow()
  })

  it('should handle empty SQL in query recording', () => {
    metrics.recordQuery({ sql: '', durationMs: 5, rowsReturned: 0, success: true })

    const snapshot = metrics.getQueryMetrics()
    expect(snapshot.totalQueries).toBe(1)
  })

  it('should handle concurrent metric recording', async () => {
    // Defer each recording to its own microtask. Calling recordQuery directly
    // inside Promise.resolve(...) would run all 100 calls synchronously while
    // the array is being built, before anything is awaited.
    const promises = Array.from({ length: 100 }, (_, i) =>
      Promise.resolve().then(() =>
        metrics.recordQuery({ sql: `SELECT ${i}`, durationMs: i, rowsReturned: 1, success: true })
      )
    )

    await Promise.all(promises)

    const snapshot = metrics.getQueryMetrics()
    expect(snapshot.totalQueries).toBe(100)
  })

  it('should not exceed memory bounds for digest storage', () => {
    // Record many unique queries
    for (let i = 0; i < 10000; i++) {
      metrics.recordQuery({
        sql: `SELECT * FROM table_${i} WHERE id = ${i}`,
        durationMs: 5,
        rowsReturned: 1,
        success: true,
      })
    }

    const digests = metrics.getQueryDigests()

    // Should cap at the maximum digest count.
    // NOTE(review): 1000 is presumably the default cap; this suite does not
    // configure one explicitly - confirm against the implementation default.
    expect(digests.length).toBeLessThanOrEqual(1000)
  })

  it('should handle metrics reset during collection', () => {
    metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 1, success: true })
    metrics.resetAll()

    // Query, connection and storage views should all read as empty after reset
    const snapshot = metrics.getQueryMetrics()
    expect(snapshot.totalQueries).toBe(0)

    const connStats = metrics.getConnectionStats()
    expect(connStats.activeConnections).toBe(0)

    const storageStats = metrics.getStorageTierStats()
    expect(storageStats.hot.totalOperations).toBe(0)
  })

  it('should provide stable percentiles with small sample size', () => {
    metrics.recordQuery({ sql: 'SELECT 1', durationMs: 10, rowsReturned: 1, success: true })

    const snapshot = metrics.getQueryMetrics()

    // With single sample, all percentiles should equal that value
    expect(snapshot.p50DurationMs).toBe(10)
    expect(snapshot.p95DurationMs).toBe(10)
    expect(snapshot.p99DurationMs).toBe(10)
  })

  it('should survive PGLite instance being null for health checks', async () => {
    // Don't set PGLite instance: readiness should report it as unhealthy
    // instead of throwing
    const health = await metrics.readiness()

    expect(health.checks.pglite.status).toBe('unhealthy')
    expect(health.checks.pglite.error).toContain('not initialized')
  })
})
1089
+ })