npm - @dotdo/postgres - Versions diffs - 0.1.0 → 0.1.1 - Mend

@dotdo/postgres 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/dist/backup/backup-manager.d.ts +244 -0
package/dist/backup/backup-manager.d.ts.map +1 -0
package/dist/backup/backup-manager.js +726 -0
package/dist/backup/backup-manager.js.map +1 -0
package/dist/observability/production-metrics.d.ts +318 -0
package/dist/observability/production-metrics.d.ts.map +1 -0
package/dist/observability/production-metrics.js +747 -0
package/dist/observability/production-metrics.js.map +1 -0
package/dist/pitr/pitr-manager.d.ts +240 -0
package/dist/pitr/pitr-manager.d.ts.map +1 -0
package/dist/pitr/pitr-manager.js +837 -0
package/dist/pitr/pitr-manager.js.map +1 -0
package/dist/streaming/cdc-iceberg-connector.d.ts +1 -1
package/dist/streaming/cdc-iceberg-connector.js +1 -1
package/dist/streaming/live-cdc-stream.d.ts +1 -1
package/dist/streaming/live-cdc-stream.js +1 -1
package/package.json +4 -4
package/src/__tests__/backup.test.ts +944 -0
package/src/__tests__/observability.test.ts +1089 -0
package/src/__tests__/pitr.test.ts +1240 -0
package/src/backup/backup-manager.ts +1006 -0
package/src/observability/production-metrics.ts +1054 -0
package/src/pitr/pitr-manager.ts +1136 -0

package/src/__tests__/observability.test.ts ADDED Viewed

@@ -0,0 +1,1089 @@
+/**
+ * Tests for Production Observability Metrics
+ * Task: postgres-7yr6.7 - Production readiness: Observability metrics
+ *
+ * RED phase TDD - These tests define the expected API surface for:
+ * - Query metrics (latency percentiles, throughput, error rates)
+ * - Connection stats (pool utilization, active connections)
+ * - Storage tier stats (hit rates, promotion/demotion, costs)
+ * - Health checks (liveness, readiness, deep checks)
+ * - Production-ready metrics endpoint
+ */
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'
+// These imports define the expected API surface - they will fail until implemented
+import {
+  ProductionMetrics,
+  createProductionMetrics,
+  type ProductionMetricsConfig,
+  type QueryMetricsSnapshot,
+  type ConnectionStats,
+  type StorageTierSnapshot,
+  type HealthCheckResult,
+  type MetricsDashboard,
+  type AlertThreshold,
+  type MetricsExportFormat,
+  type PrometheusMetric,
+  type QueryDigest,
+  type SlowQueryLog,
+} from '../observability/production-metrics'
+// =============================================================================
+// Mock Setup
+// =============================================================================
+/**
+ * Builds a stand-in metrics collector whose methods are all vitest spies.
+ * `getMetrics` is stubbed to return an empty array; the other spies have no
+ * stubbed return value.
+ */
+const createMockMetricsCollector = () => {
+  const getMetrics = vi.fn().mockReturnValue([])
+  return {
+    increment: vi.fn(),
+    gauge: vi.fn(),
+    histogram: vi.fn(),
+    getMetrics,
+    reset: vi.fn(),
+  }
+}
+/**
+ * Builds a stand-in PGLite client: `query` resolves to an empty row set and
+ * `exec` resolves to undefined.
+ */
+const createMockPGLite = () => {
+  const query = vi.fn().mockResolvedValue({ rows: [] })
+  const exec = vi.fn().mockResolvedValue(undefined)
+  return { query, exec }
+}
+/**
+ * Builds a stand-in storage orchestrator. `getStats` reports zeroed counters
+ * for the hot, warm and cold tiers with no promotions or demotions, and
+ * `getTierHealth` reports every tier as healthy with no consecutive failures.
+ */
+const createMockStorageOrchestrator = () => {
+  // Each call yields a fresh object so the tiers never share a reference.
+  const zeroedTier = () => ({
+    reads: 0,
+    writes: 0,
+    hits: 0,
+    misses: 0,
+    bytesRead: 0,
+    bytesWritten: 0,
+    errors: 0,
+    hitRatio: 0,
+    deletes: 0,
+  })
+  const healthyTier = () => ({ status: 'healthy', consecutiveFailures: 0 })
+  const stats = {
+    tiers: { hot: zeroedTier(), warm: zeroedTier(), cold: zeroedTier() },
+    promotions: { coldToWarm: 0, warmToHot: 0 },
+    demotions: { hotToWarm: 0, warmToCold: 0 },
+  }
+  const tierHealth = { hot: healthyTier(), warm: healthyTier(), cold: healthyTier() }
+  return {
+    read: vi.fn(),
+    write: vi.fn(),
+    getStats: vi.fn().mockReturnValue(stats),
+    getTierHealth: vi.fn().mockReturnValue(tierHealth),
+  }
+}
+// =============================================================================
+// Tests: ProductionMetrics Creation
+// =============================================================================
+describe('ProductionMetrics', () => {
+  // Test doubles shared by the nested suites; each one is rebuilt before every test.
+  let mockMetrics: ReturnType<typeof createMockMetricsCollector>
+  let mockPGLite: ReturnType<typeof createMockPGLite>
+  let mockStorage: ReturnType<typeof createMockStorageOrchestrator>
+  beforeEach(() => {
+    // Fake timers are installed for every test; cases move the clock explicitly
+    // with vi.advanceTimersByTime().
+    vi.useFakeTimers()
+    mockStorage = createMockStorageOrchestrator()
+    mockPGLite = createMockPGLite()
+    mockMetrics = createMockMetricsCollector()
+  })
+  afterEach(() => {
+    // Drop recorded spy calls and hand the real clock back.
+    vi.clearAllMocks()
+    vi.useRealTimers()
+  })
+  describe('createProductionMetrics()', () => {
+    // Only the two identifying fields are supplied here.
+    it('should create a ProductionMetrics instance with minimal config', () => {
+      const instance = createProductionMetrics({ serviceName: 'postgres-do', doId: 'test-do-123' })
+      expect(instance).toBeInstanceOf(ProductionMetrics)
+    })
+    // A fully populated configuration, including alert thresholds.
+    it('should create with full configuration', () => {
+      const fullConfig: ProductionMetricsConfig = {
+        serviceName: 'postgres-do',
+        serviceVersion: '1.0.0',
+        doId: 'test-do-123',
+        environment: 'production',
+        collectIntervalMs: 10000,
+        histogramBoundaries: [1, 5, 10, 25, 50, 100, 250, 500, 1000],
+        slowQueryThresholdMs: 100,
+        maxQueryDigests: 1000,
+        enableDetailedStorageMetrics: true,
+        alertThresholds: {
+          errorRatePercent: 5,
+          p99LatencyMs: 1000,
+          memoryUsagePercent: 90,
+          storageErrorRate: 1,
+        },
+      }
+      expect(createProductionMetrics(fullConfig)).toBeInstanceOf(ProductionMetrics)
+    })
+    // An empty service name must be rejected by the factory.
+    it('should throw if serviceName is empty', () => {
+      const buildWithEmptyName = () =>
+        createProductionMetrics({ serviceName: '', doId: 'test-do-123' })
+      expect(buildWithEmptyName).toThrow()
+    })
+  })
+  // ===========================================================================
+  // Tests: Query Metrics
+  // ===========================================================================
+  describe('Query Metrics', () => {
+    let metrics: InstanceType<typeof ProductionMetrics>
+    // Records one successful query; rowsReturned defaults to a single row.
+    const recordOk = (sql: string, durationMs: number, rowsReturned = 1) =>
+      metrics.recordQuery({ sql, durationMs, rowsReturned, success: true })
+    beforeEach(() => {
+      // The 100ms slow-query threshold is what the slow-query log cases rely on.
+      metrics = createProductionMetrics({
+        serviceName: 'postgres-do',
+        doId: 'test-do-123',
+        slowQueryThresholdMs: 100,
+      })
+    })
+    it('should record query execution time', () => {
+      recordOk('SELECT * FROM users WHERE id = $1', 15)
+      const { totalQueries, avgDurationMs } = metrics.getQueryMetrics()
+      expect(totalQueries).toBe(1)
+      expect(avgDurationMs).toBe(15)
+    })
+    it('should calculate latency percentiles (p50, p95, p99)', () => {
+      // Ten samples spread from 5ms to 500ms
+      const sampleLatencies = [5, 10, 15, 20, 25, 50, 75, 100, 200, 500]
+      sampleLatencies.forEach((latencyMs) => recordOk('SELECT 1', latencyMs))
+      const { p50DurationMs, p95DurationMs, p99DurationMs } = metrics.getQueryMetrics()
+      expect(p50DurationMs).toBeDefined()
+      expect(p95DurationMs).toBeDefined()
+      expect(p99DurationMs).toBeDefined()
+      // Higher percentiles may never report a smaller latency than lower ones.
+      expect(p50DurationMs).toBeLessThanOrEqual(p95DurationMs)
+      expect(p95DurationMs).toBeLessThanOrEqual(p99DurationMs)
+    })
+    it('should track query throughput (queries per second)', () => {
+      for (let n = 0; n < 100; n += 1) {
+        recordOk('SELECT 1', 5)
+      }
+      vi.advanceTimersByTime(10000) // 10 seconds
+      const { queriesPerSecond } = metrics.getQueryMetrics()
+      expect(queriesPerSecond).toBeDefined()
+      expect(queriesPerSecond).toBeGreaterThan(0)
+    })
+    it('should track error rate', () => {
+      // The first eight attempts succeed, the last two fail.
+      for (let attempt = 0; attempt < 10; attempt += 1) {
+        const succeeded = attempt < 8
+        metrics.recordQuery({
+          sql: 'SELECT 1',
+          durationMs: 5,
+          rowsReturned: succeeded ? 1 : 0,
+          success: succeeded,
+          error: succeeded ? undefined : 'Query failed',
+        })
+      }
+      const { errorRate, totalErrors } = metrics.getQueryMetrics()
+      expect(errorRate).toBeCloseTo(0.2, 1) // 2 out of 10
+      expect(totalErrors).toBe(2)
+    })
+    it('should track queries by operation type (SELECT, INSERT, UPDATE, DELETE)', () => {
+      recordOk('SELECT * FROM users', 5, 10)
+      recordOk('INSERT INTO users VALUES ($1)', 10, 0)
+      recordOk('UPDATE users SET name = $1', 8, 0)
+      recordOk('DELETE FROM users WHERE id = $1', 6, 0)
+      const { byOperation } = metrics.getQueryMetrics()
+      expect(byOperation.SELECT.count).toBe(1)
+      expect(byOperation.INSERT.count).toBe(1)
+      expect(byOperation.UPDATE.count).toBe(1)
+      expect(byOperation.DELETE.count).toBe(1)
+    })
+    it('should detect and log slow queries', () => {
+      recordOk('SELECT * FROM large_table', 500, 10000) // 500ms is above the 100ms threshold
+      const slowLog = metrics.getSlowQueryLog()
+      expect(slowLog.length).toBe(1)
+      const entry = slowLog[0]
+      expect(entry.durationMs).toBe(500)
+      expect(entry.sql).toBe('SELECT * FROM large_table')
+    })
+    it('should limit slow query log size', () => {
+      // 200 slow queries, each against a distinct table name
+      for (let n = 0; n < 200; n += 1) {
+        recordOk(`SELECT * FROM table_${n}`, 200)
+      }
+      const slowLog = metrics.getSlowQueryLog()
+      expect(slowLog.length).toBeLessThanOrEqual(100) // Default limit
+    })
+    it('should generate query digests (normalized patterns)', () => {
+      recordOk('SELECT * FROM users WHERE id = 1', 5)
+      recordOk('SELECT * FROM users WHERE id = 2', 8)
+      recordOk('SELECT * FROM users WHERE id = 3', 6)
+      const digests = metrics.getQueryDigests()
+      // The three statements differ only in the literal, so one pattern is expected.
+      expect(digests.length).toBe(1)
+      const onlyDigest = digests[0]
+      expect(onlyDigest.pattern).toContain('SELECT * FROM users WHERE id = $')
+      expect(onlyDigest.count).toBe(3)
+      expect(onlyDigest.avgDurationMs).toBeCloseTo(6.33, 0)
+    })
+    it('should track total rows returned/affected', () => {
+      recordOk('SELECT * FROM users', 5, 100)
+      recordOk('SELECT * FROM orders', 10, 50)
+      const { totalRowsReturned } = metrics.getQueryMetrics()
+      expect(totalRowsReturned).toBe(150)
+    })
+    it('should provide windowed metrics (last 1m, 5m, 15m)', () => {
+      // One query per fake second for a full minute
+      for (let second = 0; second < 60; second += 1) {
+        recordOk('SELECT 1', 5 + second)
+        vi.advanceTimersByTime(1000) // 1 second intervals
+      }
+      const { windows } = metrics.getQueryMetrics()
+      expect(windows).toBeDefined()
+      expect(windows.oneMinute).toBeDefined()
+      expect(windows.fiveMinutes).toBeDefined()
+      expect(windows.fifteenMinutes).toBeDefined()
+    })
+    it('should reset query metrics', () => {
+      recordOk('SELECT 1', 5)
+      metrics.resetQueryMetrics()
+      const { totalQueries } = metrics.getQueryMetrics()
+      expect(totalQueries).toBe(0)
+    })
+  })
+  // ===========================================================================
+  // Tests: Connection Stats
+  // ===========================================================================
+  describe('Connection Stats', () => {
+    let metrics: InstanceType<typeof ProductionMetrics>
+    // Opens `count` connections back to back, passing no options.
+    const openMany = (count: number) => {
+      for (let n = 0; n < count; n += 1) metrics.recordConnectionOpen()
+    }
+    // Closes `count` connections back to back, passing no options.
+    const closeMany = (count: number) => {
+      for (let n = 0; n < count; n += 1) metrics.recordConnectionClose()
+    }
+    beforeEach(() => {
+      metrics = createProductionMetrics({
+        serviceName: 'postgres-do',
+        doId: 'test-do-123',
+      })
+    })
+    it('should track active connections', () => {
+      openMany(2)
+      const { activeConnections } = metrics.getConnectionStats()
+      expect(activeConnections).toBe(2)
+    })
+    it('should track connection close', () => {
+      openMany(2)
+      closeMany(1)
+      const { activeConnections } = metrics.getConnectionStats()
+      expect(activeConnections).toBe(1)
+    })
+    it('should track total connections ever opened', () => {
+      openMany(1)
+      closeMany(1)
+      openMany(1)
+      const { totalConnectionsOpened, activeConnections } = metrics.getConnectionStats()
+      expect(totalConnectionsOpened).toBe(2)
+      expect(activeConnections).toBe(1)
+    })
+    it('should track peak concurrent connections', () => {
+      openMany(3)
+      closeMany(2)
+      const { peakConnections, activeConnections } = metrics.getConnectionStats()
+      expect(peakConnections).toBe(3)
+      expect(activeConnections).toBe(1)
+    })
+    it('should track connection errors', () => {
+      metrics.recordConnectionError('Connection refused')
+      metrics.recordConnectionError('Timeout')
+      const { connectionErrors } = metrics.getConnectionStats()
+      expect(connectionErrors).toBe(2)
+    })
+    it('should track average connection duration', () => {
+      // Two connections living 5s and 10s; the fake clock moves by the same amount.
+      for (const lifetimeMs of [5000, 10000]) {
+        metrics.recordConnectionOpen()
+        vi.advanceTimersByTime(lifetimeMs)
+        metrics.recordConnectionClose({ durationMs: lifetimeMs })
+      }
+      const { avgConnectionDurationMs } = metrics.getConnectionStats()
+      expect(avgConnectionDurationMs).toBe(7500)
+    })
+    it('should track idle connections', () => {
+      openMany(1)
+      metrics.recordConnectionIdle()
+      const { idleConnections } = metrics.getConnectionStats()
+      expect(idleConnections).toBe(1)
+    })
+    it('should track connection pool utilization for DO single-connection model', () => {
+      // In the DO model, max_connections=1
+      openMany(1)
+      const { poolUtilization } = metrics.getConnectionStats()
+      expect(poolUtilization).toBe(1.0) // 100% utilized
+    })
+    it('should track WebSocket connections separately', () => {
+      metrics.recordConnectionOpen({ type: 'websocket' })
+      metrics.recordConnectionOpen({ type: 'http' })
+      const { websocketConnections, httpConnections } = metrics.getConnectionStats()
+      expect(websocketConnections).toBe(1)
+      expect(httpConnections).toBe(1)
+    })
+    it('should track connection wait time (time to acquire)', () => {
+      metrics.recordConnectionAcquired({ waitTimeMs: 50 })
+      metrics.recordConnectionAcquired({ waitTimeMs: 100 })
+      const { avgWaitTimeMs } = metrics.getConnectionStats()
+      expect(avgWaitTimeMs).toBe(75)
+    })
+    it('should provide connection uptime', () => {
+      const { uptimeMs } = metrics.getConnectionStats()
+      expect(uptimeMs).toBeDefined()
+      expect(uptimeMs).toBeGreaterThanOrEqual(0)
+    })
+  })
+  // ===========================================================================
+  // Tests: Storage Tier Stats
+  // ===========================================================================
+  describe('Storage Tier Stats', () => {
+    let metrics: InstanceType<typeof ProductionMetrics>
+    // Byte-size shorthands used by the usage and cost cases.
+    const KIB = 1024
+    const MIB = 1024 * 1024
+    beforeEach(() => {
+      metrics = createProductionMetrics({
+        serviceName: 'postgres-do',
+        doId: 'test-do-123',
+        enableDetailedStorageMetrics: true,
+      })
+    })
+    it('should track hit rates for each tier', () => {
+      // Two hits followed by one miss on the hot tier
+      metrics.recordStorageOperation('hot', 'read', { hit: true, bytes: 1024 })
+      metrics.recordStorageOperation('hot', 'read', { hit: true, bytes: 2048 })
+      metrics.recordStorageOperation('hot', 'read', { hit: false, bytes: 512 })
+      const { hot } = metrics.getStorageTierStats()
+      expect(hot.hitRate).toBeCloseTo(2 / 3, 2)
+    })
+    it('should track bytes read/written per tier', () => {
+      metrics.recordStorageOperation('warm', 'write', { bytes: 4096 })
+      metrics.recordStorageOperation('warm', 'read', { hit: true, bytes: 2048 })
+      const { warm } = metrics.getStorageTierStats()
+      expect(warm.bytesWritten).toBe(4096)
+      expect(warm.bytesRead).toBe(2048)
+    })
+    it('should track promotion events (cold to warm, warm to hot)', () => {
+      metrics.recordTierPromotion('cold', 'warm', { key: 'page-123', bytes: 1024 })
+      metrics.recordTierPromotion('warm', 'hot', { key: 'page-456', bytes: 2048 })
+      const { promotions } = metrics.getStorageTierStats()
+      expect(promotions.coldToWarm).toBe(1)
+      expect(promotions.warmToHot).toBe(1)
+    })
+    it('should track demotion events (hot to warm, warm to cold)', () => {
+      metrics.recordTierDemotion('hot', 'warm', { key: 'page-789', bytes: 1024 })
+      metrics.recordTierDemotion('warm', 'cold', { key: 'page-012', bytes: 4096 })
+      const { demotions } = metrics.getStorageTierStats()
+      expect(demotions.hotToWarm).toBe(1)
+      expect(demotions.warmToCold).toBe(1)
+    })
+    it('should track storage operation latency per tier', () => {
+      // One read per tier, each with a distinct duration
+      metrics.recordStorageOperation('hot', 'read', { hit: true, bytes: 1024, durationMs: 1 })
+      metrics.recordStorageOperation('warm', 'read', { hit: true, bytes: 1024, durationMs: 10 })
+      metrics.recordStorageOperation('cold', 'read', { hit: true, bytes: 1024, durationMs: 100 })
+      const { hot, warm, cold } = metrics.getStorageTierStats()
+      expect(hot.avgLatencyMs).toBe(1)
+      expect(warm.avgLatencyMs).toBe(10)
+      expect(cold.avgLatencyMs).toBe(100)
+    })
+    it('should track storage errors per tier', () => {
+      metrics.recordStorageError('hot', 'Cache API error')
+      metrics.recordStorageError('cold', 'R2 timeout')
+      metrics.recordStorageError('cold', 'R2 permission denied')
+      const { hot, cold } = metrics.getStorageTierStats()
+      expect(hot.errors).toBe(1)
+      expect(cold.errors).toBe(2)
+    })
+    it('should estimate storage costs per tier', () => {
+      metrics.recordStorageOperation('warm', 'write', { bytes: MIB }) // 1MB
+      metrics.recordStorageOperation('cold', 'write', { bytes: 10 * MIB }) // 10MB
+      const { estimatedCosts } = metrics.getStorageTierStats()
+      expect(estimatedCosts).toBeDefined()
+      expect(estimatedCosts.hot).toBeGreaterThanOrEqual(0)
+      expect(estimatedCosts.warm).toBeGreaterThanOrEqual(0)
+      expect(estimatedCosts.cold).toBeGreaterThanOrEqual(0)
+      expect(estimatedCosts.total).toBeGreaterThanOrEqual(0)
+    })
+    it('should track total storage used per tier', () => {
+      metrics.recordStorageUsage('hot', MIB) // 1MB
+      metrics.recordStorageUsage('warm', 10 * MIB) // 10MB
+      metrics.recordStorageUsage('cold', 100 * MIB) // 100MB
+      const { hot, warm, cold } = metrics.getStorageTierStats()
+      expect(hot.usageBytes).toBe(MIB)
+      expect(warm.usageBytes).toBe(10 * MIB)
+      expect(cold.usageBytes).toBe(100 * MIB)
+    })
+    it('should track tier health status', () => {
+      metrics.recordTierHealthChange('hot', 'healthy')
+      metrics.recordTierHealthChange('warm', 'degraded')
+      metrics.recordTierHealthChange('cold', 'healthy')
+      const { hot, warm, cold } = metrics.getStorageTierStats()
+      expect(hot.healthStatus).toBe('healthy')
+      expect(warm.healthStatus).toBe('degraded')
+      expect(cold.healthStatus).toBe('healthy')
+    })
+    it('should provide tiering efficiency ratio', () => {
+      // 100 hot-tier hits followed by 10 hot-tier misses
+      for (let n = 0; n < 110; n += 1) {
+        metrics.recordStorageOperation('hot', 'read', { hit: n < 100, bytes: KIB })
+      }
+      const { tieringEfficiency } = metrics.getStorageTierStats()
+      expect(tieringEfficiency).toBeDefined()
+      expect(tieringEfficiency).toBeGreaterThan(0.5)
+    })
+  })
+  // ===========================================================================
+  // Tests: Health Checks
+  // ===========================================================================
+  describe('Health Checks', () => {
+    let metrics: InstanceType<typeof ProductionMetrics>
+    beforeEach(() => {
+      metrics = createProductionMetrics({
+        serviceName: 'postgres-do',
+        doId: 'test-do-123',
+      })
+    })
+    // Liveness is synchronous and needs no attached dependencies.
+    it('should provide liveness check', () => {
+      const health = metrics.liveness()
+      expect(health.status).toBe('healthy')
+      expect(health.service).toBe('postgres-do')
+    })
+    it('should provide readiness check', async () => {
+      const health = await metrics.readiness()
+      expect(health.status).toBeDefined()
+      expect(health.checks).toBeDefined()
+    })
+    it('should check PGLite connectivity in readiness', async () => {
+      metrics.setPGLiteInstance(mockPGLite as any)
+      // A successful probe query should surface as a healthy pglite check.
+      mockPGLite.query.mockResolvedValueOnce({ rows: [{ result: 1 }] })
+      const health = await metrics.readiness()
+      expect(health.checks.pglite).toBeDefined()
+      expect(health.checks.pglite.status).toBe('healthy')
+    })
+    it('should report unhealthy PGLite in readiness', async () => {
+      metrics.setPGLiteInstance(mockPGLite as any)
+      // A rejected probe query should surface its message on the pglite check.
+      mockPGLite.query.mockRejectedValueOnce(new Error('PGLite crashed'))
+      const health = await metrics.readiness()
+      expect(health.checks.pglite.status).toBe('unhealthy')
+      expect(health.checks.pglite.error).toContain('PGLite crashed')
+    })
+    it('should check storage tier health in readiness', async () => {
+      // The default mock reports every tier as healthy.
+      metrics.setStorageOrchestrator(mockStorage as any)
+      const health = await metrics.readiness()
+      expect(health.checks.storage).toBeDefined()
+      expect(health.checks.storage.status).toBe('healthy')
+    })
+    it('should report degraded storage in readiness', async () => {
+      // Only the warm tier is degraded; the storage check should reflect that.
+      mockStorage.getTierHealth.mockReturnValue({
+        hot: { status: 'healthy', consecutiveFailures: 0 },
+        warm: { status: 'degraded', consecutiveFailures: 3 },
+        cold: { status: 'healthy', consecutiveFailures: 0 },
+      })
+      metrics.setStorageOrchestrator(mockStorage as any)
+      const health = await metrics.readiness()
+      expect(health.checks.storage.status).toBe('degraded')
+    })
+    it('should check memory usage in readiness', async () => {
+      const health = await metrics.readiness()
+      expect(health.checks.memory).toBeDefined()
+      expect(health.checks.memory.details).toBeDefined()
+    })
+    it('should provide deep health check with all components', async () => {
+      metrics.setPGLiteInstance(mockPGLite as any)
+      metrics.setStorageOrchestrator(mockStorage as any)
+      mockPGLite.query.mockResolvedValueOnce({ rows: [{ result: 1 }] })
+      const health = await metrics.deepCheck()
+      expect(health.status).toBeDefined()
+      expect(health.checks.pglite).toBeDefined()
+      expect(health.checks.storage).toBeDefined()
+      expect(health.checks.memory).toBeDefined()
+      expect(health.checks.wal).toBeDefined()
+      expect(health.responseTimeMs).toBeDefined()
+    })
+    it('should include uptime in health response', () => {
+      vi.advanceTimersByTime(60000) // 1 minute
+      const health = metrics.liveness()
+      expect(health.uptimeMs).toBeGreaterThanOrEqual(60000)
+    })
+    it('should aggregate component health into overall status', async () => {
+      metrics.setPGLiteInstance(mockPGLite as any)
+      mockPGLite.query.mockRejectedValueOnce(new Error('Dead'))
+      const health = await metrics.readiness()
+      // If any critical component is unhealthy, overall should be unhealthy
+      expect(health.status).toBe('unhealthy')
+    })
+    it('should handle health check timeout', async () => {
+      metrics.setPGLiteInstance(mockPGLite as any)
+      // The probe only settles after 30s of fake time, well past the 5s budget.
+      mockPGLite.query.mockImplementation(
+        () => new Promise((resolve) => setTimeout(resolve, 30000))
+      )
+      // Start the check BEFORE moving the clock so its timers exist when time
+      // advances. Advancing first fires nothing, and under fake timers the
+      // awaited check would then wait on timers that nobody advances
+      // (assuming the timeout is timer-driven; the implementation is not visible here).
+      const pending = metrics.readiness({ timeoutMs: 5000 })
+      // The async variant also lets promise callbacks run between timer ticks.
+      await vi.advanceTimersByTimeAsync(5000)
+      const health = await pending
+      expect(health.checks.pglite.status).toBe('unhealthy')
+      expect(health.checks.pglite.error).toContain('timeout')
+    })
+  })
+  // ===========================================================================
+  // Tests: Metrics Export
+  // ===========================================================================
+  describe('Metrics Export', () => {
+    let metrics: InstanceType<typeof ProductionMetrics>
+    // Records the single fast, successful query most export cases use as input.
+    const recordSampleQuery = () =>
+      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 1, success: true })
+    // Builds a request for the metrics endpoint with the given Accept header.
+    const metricsRequest = (accept: string) =>
+      new Request('http://localhost/metrics', {
+        headers: { Accept: accept },
+      })
+    beforeEach(() => {
+      metrics = createProductionMetrics({
+        serviceName: 'postgres-do',
+        doId: 'test-do-123',
+      })
+    })
+    it('should export metrics in Prometheus format', () => {
+      recordSampleQuery()
+      const exposition = metrics.exportPrometheus()
+      for (const fragment of ['# HELP', '# TYPE', 'postgres_query_duration_seconds']) {
+        expect(exposition).toContain(fragment)
+      }
+    })
+    it('should export metrics in JSON format', () => {
+      recordSampleQuery()
+      const exported = metrics.exportJSON()
+      expect(exported.metrics).toBeDefined()
+      expect(exported.timestamp).toBeDefined()
+      expect(exported.service).toBe('postgres-do')
+    })
+    it('should include labels in Prometheus metrics', () => {
+      recordSampleQuery()
+      const exposition = metrics.exportPrometheus()
+      expect(exposition).toContain('do_id="test-do-123"')
+      expect(exposition).toContain('service="postgres-do"')
+    })
+    it('should export histogram buckets in Prometheus format', () => {
+      // 100 queries with durations 0ms, 10ms, ... 990ms
+      for (let n = 0; n < 100; n += 1) {
+        metrics.recordQuery({ sql: 'SELECT 1', durationMs: n * 10, rowsReturned: 1, success: true })
+      }
+      const exposition = metrics.exportPrometheus()
+      for (const fragment of ['_bucket{', 'le="', '_count', '_sum']) {
+        expect(exposition).toContain(fragment)
+      }
+    })
+    it('should export query metrics', () => {
+      recordSampleQuery()
+      const exposition = metrics.exportPrometheus()
+      const expectedNames = [
+        'postgres_query_total',
+        'postgres_query_duration_seconds',
+        'postgres_query_errors_total',
+      ]
+      for (const metricName of expectedNames) {
+        expect(exposition).toContain(metricName)
+      }
+    })
+    it('should export storage tier metrics', () => {
+      metrics.recordStorageOperation('hot', 'read', { hit: true, bytes: 1024 })
+      const exposition = metrics.exportPrometheus()
+      expect(exposition).toContain('postgres_storage_operations_total')
+      expect(exposition).toContain('tier="hot"')
+    })
+    it('should export connection metrics', () => {
+      metrics.recordConnectionOpen()
+      const exposition = metrics.exportPrometheus()
+      expect(exposition).toContain('postgres_connections_active')
+    })
+    it('should handle metrics endpoint request', async () => {
+      recordSampleQuery()
+      const handler = metrics.createMetricsHandler()
+      const response = await handler(metricsRequest('text/plain'))
+      expect(response.status).toBe(200)
+      expect(response.headers.get('content-type')).toContain('text/plain')
+    })
+    it('should support content negotiation for metrics endpoint', async () => {
+      const handler = metrics.createMetricsHandler()
+      // JSON is requested first, then the plain-text format.
+      const jsonResponse = await handler(metricsRequest('application/json'))
+      expect(jsonResponse.headers.get('content-type')).toContain('application/json')
+      const promResponse = await handler(metricsRequest('text/plain'))
+      expect(promResponse.headers.get('content-type')).toContain('text/plain')
+    })
+  })
+  // ===========================================================================
+  // Tests: Dashboard / Summary
+  // ===========================================================================
+  describe('Metrics Dashboard', () => {
+    let metrics: InstanceType<typeof ProductionMetrics>
+    beforeEach(() => {
+      metrics = createProductionMetrics({
+        serviceName: 'postgres-do',
+        doId: 'test-do-123',
+      })
+    })
+    it('should provide a complete metrics dashboard', () => {
+      // Feed one event into each of the query, connection and storage recorders.
+      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 1, success: true })
+      metrics.recordConnectionOpen()
+      metrics.recordStorageOperation('hot', 'read', { hit: true, bytes: 1024 })
+      const { queries, connections, storage, health, timestamp } = metrics.getDashboard()
+      expect(queries).toBeDefined()
+      expect(connections).toBeDefined()
+      expect(storage).toBeDefined()
+      expect(health).toBeDefined()
+      expect(timestamp).toBeDefined()
+    })
+    it('should include service info in dashboard', () => {
+      const { service } = metrics.getDashboard()
+      expect(service.name).toBe('postgres-do')
+      expect(service.doId).toBe('test-do-123')
+      expect(service.uptimeMs).toBeDefined()
+    })
+    it('should include alerts in dashboard', () => {
+      // Every one of the 100 recorded queries fails, to trigger a high error rate.
+      for (let n = 0; n < 100; n += 1) {
+        metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 0, success: false, error: 'Error' })
+      }
+      const { alerts } = metrics.getDashboard()
+      expect(alerts).toBeDefined()
+      expect(alerts.length).toBeGreaterThan(0)
+    })
+    it('should include memory metrics in dashboard', () => {
+      const { memory } = metrics.getDashboard()
+      expect(memory).toBeDefined()
+      expect(memory.heapUsedBytes).toBeDefined()
+      expect(memory.heapTotalBytes).toBeDefined()
+    })
+  })
+  // ===========================================================================
+  // Tests: Alert Thresholds
+  // ===========================================================================
+  describe('Alert Thresholds', () => {
+    let metrics: InstanceType<typeof ProductionMetrics>
+    // Every test starts from a fresh collector created with explicit alert thresholds.
+    beforeEach(() => {
+      metrics = createProductionMetrics({
+        serviceName: 'postgres-do',
+        doId: 'test-do-123',
+        alertThresholds: {
+          errorRatePercent: 5,
+          p99LatencyMs: 1000,
+          memoryUsagePercent: 90,
+          storageErrorRate: 1,
+        },
+      })
+    })
+    it('should trigger alert when error rate exceeds threshold', () => {
+      // 90 successes followed by 10 failures: 10% errors against the 5% threshold.
+      for (let i = 0; i < 100; i++) {
+        metrics.recordQuery({
+          sql: 'SELECT 1',
+          durationMs: 5,
+          rowsReturned: 0,
+          success: i < 90, // 10% error rate
+          error: i >= 90 ? 'Error' : undefined,
+        })
+      }
+      const alerts = metrics.getActiveAlerts()
+      expect(alerts.some((a) => a.type === 'error_rate')).toBe(true)
+    })
+    it('should trigger alert when p99 latency exceeds threshold', () => {
+      // 5 of the 100 samples are slow. With a single outlier the 99th percentile
+      // depends on the percentile formula (nearest-rank would report 5ms), so
+      // several slow samples are used to make p99 equal 2000ms under any
+      // common definition.
+      for (let i = 0; i < 100; i++) {
+        metrics.recordQuery({
+          sql: 'SELECT 1',
+          durationMs: i < 95 ? 5 : 2000, // top 5% at 2000ms => p99 is 2000ms
+          rowsReturned: 1,
+          success: true,
+        })
+      }
+      const alerts = metrics.getActiveAlerts()
+      expect(alerts.some((a) => a.type === 'high_latency')).toBe(true)
+    })
+    it('should trigger alert when storage errors exceed threshold', () => {
+      // NOTE(review): no storage operations are recorded here, so this presumes
+      // two errors on their own exceed storageErrorRate: 1 - confirm how the
+      // rate is computed.
+      metrics.recordStorageError('cold', 'R2 error 1')
+      metrics.recordStorageError('cold', 'R2 error 2')
+      const alerts = metrics.getActiveAlerts()
+      expect(alerts.some((a) => a.type === 'storage_error')).toBe(true)
+    })
+    it('should resolve alerts when conditions return to normal', () => {
+      // First trigger an alert
+      for (let i = 0; i < 100; i++) {
+        metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 0, success: false, error: 'Error' })
+      }
+      expect(metrics.getActiveAlerts().length).toBeGreaterThan(0)
+      // Now generate successful queries to bring error rate down:
+      // 100 failures out of 10100 recorded queries is roughly 1%.
+      for (let i = 0; i < 10000; i++) {
+        metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 1, success: true })
+      }
+      // Re-evaluate explicitly before reading the active alert list.
+      metrics.evaluateAlerts()
+      const alerts = metrics.getActiveAlerts()
+      const errorAlerts = alerts.filter((a) => a.type === 'error_rate')
+      expect(errorAlerts.length).toBe(0)
+    })
+    it('should include alert severity level', () => {
+      for (let i = 0; i < 100; i++) {
+        metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 0, success: false, error: 'Error' })
+      }
+      const alerts = metrics.getActiveAlerts()
+      // Assert the list is non-empty first so a missing alert is reported as an
+      // assertion failure instead of a TypeError on alerts[0].
+      expect(alerts.length).toBeGreaterThan(0)
+      expect(alerts[0].severity).toBeDefined()
+      expect(['warning', 'critical']).toContain(alerts[0].severity)
+    })
+    it('should register custom alert thresholds', () => {
+      metrics.registerAlertThreshold({
+        name: 'custom_slow_queries',
+        type: 'slow_query_count',
+        threshold: 10,
+        windowMs: 60000,
+        severity: 'warning',
+      })
+      // Trigger the custom alert with 15 queries (threshold is 10).
+      // NOTE(review): presumes 500ms counts as a slow query under the default
+      // configuration - confirm against the collector's slow-query cutoff.
+      for (let i = 0; i < 15; i++) {
+        metrics.recordQuery({ sql: 'SELECT 1', durationMs: 500, rowsReturned: 1, success: true })
+      }
+      const alerts = metrics.getActiveAlerts()
+      expect(alerts.some((a) => a.name === 'custom_slow_queries')).toBe(true)
+    })
+  })
+  // ===========================================================================
+  // Tests: Edge Cases
+  // ===========================================================================
+  describe('Edge Cases', () => {
+    let metrics: InstanceType<typeof ProductionMetrics>
+    // Every test starts from a fresh collector created without custom thresholds.
+    beforeEach(() => {
+      metrics = createProductionMetrics({
+        serviceName: 'postgres-do',
+        doId: 'test-do-123',
+      })
+    })
+    it('should handle recording metrics with no queries', () => {
+      // Nothing has been recorded: counters and derived values must all be zero.
+      const snapshot = metrics.getQueryMetrics()
+      expect(snapshot.totalQueries).toBe(0)
+      expect(snapshot.avgDurationMs).toBe(0)
+      expect(snapshot.errorRate).toBe(0)
+    })
+    it('should handle very large number of metrics', () => {
+      // Durations are random; only the total count is asserted, so the test stays deterministic.
+      for (let i = 0; i < 100000; i++) {
+        metrics.recordQuery({ sql: 'SELECT 1', durationMs: Math.random() * 1000, rowsReturned: 1, success: true })
+      }
+      const snapshot = metrics.getQueryMetrics()
+      expect(snapshot.totalQueries).toBe(100000)
+    })
+    it('should handle zero-duration queries', () => {
+      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 0, rowsReturned: 0, success: true })
+      const snapshot = metrics.getQueryMetrics()
+      expect(snapshot.totalQueries).toBe(1)
+      expect(snapshot.avgDurationMs).toBe(0)
+    })
+    it('should handle negative duration gracefully', () => {
+      // Should not throw; how the negative value is stored is not asserted here.
+      expect(() =>
+        metrics.recordQuery({ sql: 'SELECT 1', durationMs: -5, rowsReturned: 0, success: true })
+      ).not.toThrow()
+    })
+    it('should handle empty SQL in query recording', () => {
+      metrics.recordQuery({ sql: '', durationMs: 5, rowsReturned: 0, success: true })
+      const snapshot = metrics.getQueryMetrics()
+      expect(snapshot.totalQueries).toBe(1)
+    })
+    it('should handle concurrent metric recording', async () => {
+      // Each recording is deferred behind an await so the 100 calls run as
+      // separate microtasks while Promise.all is pending, rather than all
+      // executing synchronously while the array is being built.
+      const promises = Array.from({ length: 100 }, async (_, i) => {
+        await Promise.resolve()
+        return metrics.recordQuery({ sql: `SELECT ${i}`, durationMs: i, rowsReturned: 1, success: true })
+      })
+      await Promise.all(promises)
+      const snapshot = metrics.getQueryMetrics()
+      expect(snapshot.totalQueries).toBe(100)
+    })
+    it('should not exceed memory bounds for digest storage', () => {
+      // Record many unique queries (10000 distinct SQL strings)
+      for (let i = 0; i < 10000; i++) {
+        metrics.recordQuery({
+          sql: `SELECT * FROM table_${i} WHERE id = ${i}`,
+          durationMs: 5,
+          rowsReturned: 1,
+          success: true,
+        })
+      }
+      const digests = metrics.getQueryDigests()
+      // Should cap at configured max
+      // NOTE(review): no cap is configured in this describe, so 1000 is presumably the default - confirm.
+      expect(digests.length).toBeLessThanOrEqual(1000)
+    })
+    it('should handle metrics reset during collection', () => {
+      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 5, rowsReturned: 1, success: true })
+      metrics.resetAll()
+      // After the reset, query, connection and storage views must all read zero.
+      const snapshot = metrics.getQueryMetrics()
+      expect(snapshot.totalQueries).toBe(0)
+      const connStats = metrics.getConnectionStats()
+      expect(connStats.activeConnections).toBe(0)
+      const storageStats = metrics.getStorageTierStats()
+      expect(storageStats.hot.totalOperations).toBe(0)
+    })
+    it('should provide stable percentiles with small sample size', () => {
+      metrics.recordQuery({ sql: 'SELECT 1', durationMs: 10, rowsReturned: 1, success: true })
+      const snapshot = metrics.getQueryMetrics()
+      // With single sample, all percentiles should equal that value
+      expect(snapshot.p50DurationMs).toBe(10)
+      expect(snapshot.p95DurationMs).toBe(10)
+      expect(snapshot.p99DurationMs).toBe(10)
+    })
+    it('should survive PGLite instance being null for health checks', async () => {
+      // Don't set PGLite instance: the setup above only creates the collector.
+      const health = await metrics.readiness()
+      expect(health.checks.pglite.status).toBe('unhealthy')
+      expect(health.checks.pglite.error).toContain('not initialized')
+    })
+  })
+})