@vorionsys/infrastructure 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,920 @@
1
+ /**
2
+ * PostgreSQL High Availability Replication Module
3
+ *
4
+ * Provides infrastructure for PostgreSQL HA with:
5
+ * - Streaming replication configuration
6
+ * - Patroni cluster management helpers
7
+ * - pg_auto_failover setup and monitoring
8
+ * - Automatic failover configuration
9
+ * - Replication lag monitoring
10
+ * - Health check endpoints for replicas
11
+ *
12
+ * @packageDocumentation
13
+ */
14
+ import { Pool } from 'pg';
15
+ import { Counter, Gauge, Histogram, Registry } from 'prom-client';
16
+ import { EventEmitter } from 'events';
17
// =============================================================================
// Metrics Registry
// =============================================================================
/**
 * Dedicated registry for replication metrics.
 *
 * Kept separate from prom-client's default registry so consumers can expose
 * these metrics on a dedicated scrape endpoint.
 */
export const replicationRegistry = new Registry();
// Replication lag metrics
// (populated from pg_stat_replication data by ReplicationManager.updateMetrics)
export const replicationLagBytesGauge = new Gauge({
    name: 'vorion_pg_replication_lag_bytes',
    help: 'PostgreSQL replication lag in bytes',
    labelNames: ['cluster', 'replica_id', 'replica_host'],
    registers: [replicationRegistry],
});
export const replicationLagSecondsGauge = new Gauge({
    name: 'vorion_pg_replication_lag_seconds',
    help: 'PostgreSQL replication lag in seconds',
    labelNames: ['cluster', 'replica_id', 'replica_host'],
    registers: [replicationRegistry],
});
// Replica status metrics
// Encodes the replica's pg_stat_replication state as a number so dashboards
// can alert on it; see the help string for the value mapping.
export const replicaStatusGauge = new Gauge({
    name: 'vorion_pg_replica_status',
    help: 'PostgreSQL replica status (1=streaming, 0.75=catchup, 0.5=potential, 0.25=disconnected, 0=failed)',
    labelNames: ['cluster', 'replica_id', 'replica_host', 'status'],
    registers: [replicationRegistry],
});
// Cluster health metrics
export const clusterHealthGauge = new Gauge({
    name: 'vorion_pg_cluster_health',
    help: 'PostgreSQL cluster health (1=healthy, 0=unhealthy)',
    labelNames: ['cluster'],
    registers: [replicationRegistry],
});
export const healthyReplicaCountGauge = new Gauge({
    name: 'vorion_pg_healthy_replica_count',
    help: 'Number of healthy PostgreSQL replicas',
    labelNames: ['cluster'],
    registers: [replicationRegistry],
});
// Failover metrics
export const failoverEventsCounter = new Counter({
    name: 'vorion_pg_failover_events_total',
    help: 'Total PostgreSQL failover events',
    labelNames: ['cluster', 'reason', 'success'],
    registers: [replicationRegistry],
});
export const failoverDurationHistogram = new Histogram({
    name: 'vorion_pg_failover_duration_seconds',
    help: 'PostgreSQL failover duration in seconds',
    labelNames: ['cluster', 'reason'],
    buckets: [1, 5, 10, 30, 60, 120, 300],
    registers: [replicationRegistry],
});
// Health check metrics
export const healthCheckDurationHistogram = new Histogram({
    name: 'vorion_pg_health_check_duration_seconds',
    help: 'PostgreSQL health check duration',
    labelNames: ['cluster', 'node_id', 'role'],
    buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5],
    registers: [replicationRegistry],
});
export const healthCheckErrorsCounter = new Counter({
    name: 'vorion_pg_health_check_errors_total',
    help: 'Total PostgreSQL health check errors',
    labelNames: ['cluster', 'node_id', 'error_type'],
    registers: [replicationRegistry],
});
85
+ // =============================================================================
86
+ // Utility Functions
87
+ // =============================================================================
88
/**
 * Create a pg connection pool for a single cluster node.
 *
 * @param {object} config - Node connection settings: host, port, database,
 *   user, password, plus optional maxConnections, idleTimeoutMs,
 *   connectionTimeoutMs, applicationName, and ssl (boolean or TLS options).
 * @returns {Pool} A new pg Pool configured for the node.
 */
function createNodePool(config) {
    const poolConfig = {
        host: config.host,
        port: config.port,
        database: config.database,
        user: config.user,
        password: config.password,
        max: config.maxConnections ?? 5,
        idleTimeoutMillis: config.idleTimeoutMs ?? 30000,
        connectionTimeoutMillis: config.connectionTimeoutMs ?? 10000,
        application_name: config.applicationName ?? 'vorion-replication-monitor',
    };
    if (config.ssl) {
        // pg accepts either a boolean or a TLS options object here, so pass it
        // straight through. (The previous ternary returned config.ssl on both
        // branches and was a no-op.)
        poolConfig.ssl = config.ssl;
    }
    return new Pool(poolConfig);
}
108
/**
 * Parse a PostgreSQL LSN (Log Sequence Number, "XXXXXXXX/YYYYYYYY" hex pair)
 * into an absolute byte offset.
 *
 * Malformed input (missing '/', non-hex digits, null/empty) yields 0n rather
 * than throwing — previously a string without '/' produced
 * BigInt('0xundefined') and crashed the caller with a SyntaxError.
 *
 * @param {string|null|undefined} lsn - LSN text from pg_stat_replication.
 * @returns {bigint} Byte offset: (segment << 32) + offset, or 0n when unparseable.
 */
function parseLsn(lsn) {
    if (!lsn)
        return BigInt(0);
    const parts = lsn.split('/');
    // Both halves must be non-empty hex; anything else is treated as "no lag info".
    const HEX = /^[0-9A-Fa-f]+$/;
    if (parts.length !== 2 || !HEX.test(parts[0]) || !HEX.test(parts[1])) {
        return BigInt(0);
    }
    const segmentBigInt = BigInt('0x' + parts[0]);
    const offsetBigInt = BigInt('0x' + parts[1]);
    return (segmentBigInt << BigInt(32)) + offsetBigInt;
}
119
/**
 * Compute how many bytes a replica trails the primary, given both LSNs.
 *
 * @param {string} primaryLsn - Current WAL LSN on the primary.
 * @param {string} replicaLsn - Replayed/written/flushed LSN on the replica.
 * @returns {number} Lag in bytes; clamped to 0 when the replica is not behind.
 */
function calculateLagBytes(primaryLsn, replicaLsn) {
    const delta = parseLsn(primaryLsn) - parseLsn(replicaLsn);
    // A replica can momentarily report an LSN ahead of our snapshot of the
    // primary; never report negative lag.
    return delta > BigInt(0) ? Number(delta) : 0;
}
128
/**
 * Normalize a pg_stat_replication state string to a ReplicaStatus value.
 *
 * Matching is case-insensitive; anything outside the four known streaming
 * states (including null/empty input) maps to 'unknown'.
 *
 * @param {string|null|undefined} state - Raw `state` column value.
 * @returns {string} One of 'streaming' | 'catchup' | 'potential' | 'disconnected' | 'unknown'.
 */
function mapReplicaState(state) {
    if (!state)
        return 'unknown';
    const recognized = new Set(['streaming', 'catchup', 'potential', 'disconnected']);
    const normalized = state.toLowerCase();
    return recognized.has(normalized) ? normalized : 'unknown';
}
147
/**
 * Encode a ReplicaStatus as the numeric value exported on replicaStatusGauge.
 *
 * @param {string} status - 'streaming' | 'catchup' | 'potential' |
 *   'disconnected' | 'failed' (anything else scores 0).
 * @returns {number} 1, 0.75, 0.5, 0.25 or 0 — higher is healthier.
 */
function replicaStatusToNumber(status) {
    const scores = new Map([
        ['streaming', 1],
        ['catchup', 0.75],
        ['potential', 0.5],
        ['disconnected', 0.25],
        ['failed', 0],
    ]);
    return scores.get(status) ?? 0;
}
166
+ // =============================================================================
167
+ // PostgreSQL Replication Manager
168
+ // =============================================================================
169
/**
 * PostgreSQL Replication Manager
 *
 * Manages PostgreSQL HA clusters with streaming replication,
 * health monitoring, and failover coordination.
 *
 * Events emitted: 'started', 'stopped', 'unhealthy', 'lag_threshold_exceeded',
 * 'primary_failure', 'health_check_error', 'switchover_started',
 * 'switchover_completed', 'switchover_failed'.
 *
 * @example
 * ```typescript
 * const manager = new ReplicationManager({
 *   clusterName: 'vorion-production',
 *   replicationMode: 'streaming',
 *   orchestrator: 'patroni',
 *   primary: { nodeId: 'primary', host: 'pg-primary', port: 5432, ... },
 *   replicas: [
 *     { nodeId: 'replica-1', host: 'pg-replica-1', port: 5432, ... },
 *     { nodeId: 'replica-2', host: 'pg-replica-2', port: 5432, ... },
 *   ],
 *   maxReplicationLagSeconds: 30,
 *   healthCheckIntervalMs: 10000,
 * });
 *
 * await manager.start();
 *
 * // Get cluster health
 * const health = await manager.getClusterHealth();
 *
 * // Get replication lag for a specific replica
 * const stats = await manager.getReplicationStats('replica-1');
 *
 * // Manual switchover
 * await manager.switchover('replica-1', 'scheduled_maintenance');
 * ```
 */
export class ReplicationManager extends EventEmitter {
    // Effective configuration: caller-supplied values merged over defaults
    // (see constructor).
    config;
    // Connection pool for the primary node; created in start(), closed in stop().
    primaryPool = null;
    // Connection pools keyed by replica nodeId.
    replicaPools = new Map();
    // Timer handle for the periodic health-check loop (null while stopped).
    healthCheckInterval = null;
    // True between start() and stop().
    isRunning = false;
    // nodeId of the node currently tracked as primary.
    currentPrimaryId;
    // Most recent cluster health snapshot; null before the first check completes.
    lastHealth = null;
    /**
     * @param config Cluster configuration (clusterName, primary, replicas,
     *   orchestrator settings). Lag thresholds, health-check interval,
     *   failover timeout and sync-standby count default to sensible values
     *   when omitted.
     */
    constructor(config) {
        super();
        this.config = {
            maxReplicationLagBytes: 100 * 1024 * 1024, // 100MB default
            maxReplicationLagSeconds: 30,
            healthCheckIntervalMs: 10000,
            failoverTimeoutMs: 60000,
            autoFailoverEnabled: true,
            synchronousStandbyCount: 1,
            ...config,
        };
        this.currentPrimaryId = config.primary.nodeId;
    }
    /**
     * Start the replication manager: create connection pools for the primary
     * and every replica, then begin the periodic health-check loop.
     * Idempotent — a second call while running is a no-op.
     */
    async start() {
        if (this.isRunning) {
            return;
        }
        // Initialize primary pool
        this.primaryPool = createNodePool(this.config.primary);
        // Initialize replica pools
        for (const replica of this.config.replicas) {
            const pool = createNodePool(replica);
            this.replicaPools.set(replica.nodeId, pool);
        }
        // Start health check loop
        this.startHealthChecks();
        this.isRunning = true;
        this.emit('started', { clusterName: this.config.clusterName });
    }
    /**
     * Stop the replication manager: cancel the health-check loop and drain
     * all connection pools. Idempotent — a call while stopped is a no-op.
     */
    async stop() {
        if (!this.isRunning) {
            return;
        }
        // Stop health checks
        if (this.healthCheckInterval) {
            clearInterval(this.healthCheckInterval);
            this.healthCheckInterval = null;
        }
        // Close all pools
        if (this.primaryPool) {
            await this.primaryPool.end();
            this.primaryPool = null;
        }
        for (const [, pool] of this.replicaPools) {
            await pool.end();
        }
        this.replicaPools.clear();
        this.isRunning = false;
        this.emit('stopped', { clusterName: this.config.clusterName });
    }
    /**
     * Start the health check interval: run one check immediately, then every
     * healthCheckIntervalMs. Check failures are reported via the
     * 'health_check_error' event and the error counter — the loop never throws.
     */
    startHealthChecks() {
        const runHealthCheck = async () => {
            try {
                const health = await this.getClusterHealth();
                this.lastHealth = health;
                // Update metrics
                this.updateMetrics(health);
                // Check for issues
                if (!health.healthy) {
                    this.emit('unhealthy', health);
                }
                if (health.lagThresholdExceeded) {
                    this.emit('lag_threshold_exceeded', health);
                }
                // Auto-failover check
                if (this.config.autoFailoverEnabled && !health.primaryHealthy) {
                    this.emit('primary_failure', {
                        currentPrimary: this.currentPrimaryId,
                        health,
                    });
                    // Note: Actual failover should be handled by Patroni/pg_auto_failover
                    // This just emits events for monitoring
                }
            }
            catch (error) {
                this.emit('health_check_error', { error });
                healthCheckErrorsCounter.inc({
                    cluster: this.config.clusterName,
                    node_id: 'cluster',
                    error_type: error instanceof Error ? error.name : 'unknown',
                });
            }
        };
        // Run immediately. The floating promise is safe: the entire body is
        // wrapped in try/catch, so it can never reject.
        void runHealthCheck();
        // Schedule interval
        this.healthCheckInterval = setInterval(runHealthCheck, this.config.healthCheckIntervalMs);
    }
    /**
     * Push a health snapshot into the Prometheus gauges.
     * @param health Result of getClusterHealth().
     */
    updateMetrics(health) {
        const cluster = this.config.clusterName;
        // Cluster health
        clusterHealthGauge.set({ cluster }, health.healthy ? 1 : 0);
        healthyReplicaCountGauge.set({ cluster }, health.healthyReplicaCount);
        // Per-replica metrics
        for (const stats of health.replicationStats) {
            const replica = this.config.replicas.find((r) => r.nodeId === stats.nodeId);
            const host = replica?.host ?? 'unknown';
            replicationLagBytesGauge.set({ cluster, replica_id: stats.nodeId, replica_host: host }, stats.lagBytes);
            replicationLagSecondsGauge.set({ cluster, replica_id: stats.nodeId, replica_host: host }, stats.lagSeconds);
            replicaStatusGauge.set({ cluster, replica_id: stats.nodeId, replica_host: host, status: stats.status }, replicaStatusToNumber(stats.status));
        }
    }
    /**
     * Get a comprehensive cluster health snapshot: primary reachability,
     * per-replica replication stats, aggregate lag figures and whether any
     * configured lag threshold is exceeded.
     */
    async getClusterHealth() {
        const startTime = performance.now();
        // Check primary health
        const primaryHealth = await this.checkNodeHealth(this.config.primary, 'primary');
        // Get replication stats from primary
        const replicationStats = await this.getAllReplicationStats();
        // Calculate health metrics
        const healthyReplicaCount = replicationStats.filter((s) => s.status === 'streaming' && s.isInSync).length;
        const lagValues = replicationStats.map((s) => s.lagBytes);
        const averageLagBytes = lagValues.length > 0
            ? lagValues.reduce((a, b) => a + b, 0) / lagValues.length
            : 0;
        const maxLagBytes = lagValues.length > 0 ? Math.max(...lagValues) : 0;
        const lagThresholdExceeded = maxLagBytes > (this.config.maxReplicationLagBytes ?? Infinity) ||
            replicationStats.some((s) => s.lagSeconds > (this.config.maxReplicationLagSeconds ?? Infinity));
        const checkDurationMs = performance.now() - startTime;
        const health = {
            healthy: primaryHealth.reachable &&
                healthyReplicaCount >= (this.config.synchronousStandbyCount ?? 1),
            primaryHealthy: primaryHealth.reachable && !primaryHealth.isInRecovery,
            healthyReplicaCount,
            totalReplicaCount: this.config.replicas.length,
            currentPrimary: this.currentPrimaryId,
            replicationStats,
            averageLagBytes: Math.round(averageLagBytes),
            maxLagBytes,
            lagThresholdExceeded,
            lastCheckAt: new Date(),
            checkDurationMs: Math.round(checkDurationMs),
        };
        healthCheckDurationHistogram.observe({ cluster: this.config.clusterName, node_id: 'cluster', role: 'cluster' }, checkDurationMs / 1000);
        return health;
    }
    /**
     * Get replication statistics for all replicas, as observed from the
     * primary's pg_stat_replication view. Configured replicas that do not
     * appear in the view are reported as 'disconnected' with -1 lag values.
     * @throws {Error} If called before start() initialized the primary pool.
     */
    async getAllReplicationStats() {
        if (!this.primaryPool) {
            throw new Error('Primary pool not initialized');
        }
        let client = null;
        try {
            client = await this.primaryPool.connect();
            // Query pg_stat_replication for all replicas
            const result = await client.query(`
      SELECT
        pid,
        usename,
        application_name,
        client_addr,
        client_hostname,
        client_port,
        backend_start,
        state,
        sent_lsn,
        write_lsn,
        flush_lsn,
        replay_lsn,
        write_lag,
        flush_lag,
        replay_lag,
        sync_priority,
        sync_state,
        reply_time
      FROM pg_stat_replication
      WHERE state IS NOT NULL
    `);
            // Get current primary LSN
            const lsnResult = await client.query('SELECT pg_current_wal_lsn() as current_lsn');
            const currentLsn = lsnResult.rows[0]?.current_lsn ?? '0/0';
            const stats = [];
            for (const row of result.rows) {
                // Find matching replica config: match by client address, hostname,
                // or a nodeId embedded in application_name.
                const replica = this.config.replicas.find((r) => r.host === row.client_addr ||
                    r.host === row.client_hostname ||
                    row.application_name?.includes(r.nodeId));
                const nodeId = replica?.nodeId ?? row.application_name ?? row.client_addr;
                const lagBytes = calculateLagBytes(currentLsn, row.replay_lsn ?? '0/0');
                const writeLagBytes = calculateLagBytes(currentLsn, row.write_lsn ?? '0/0');
                const flushLagBytes = calculateLagBytes(currentLsn, row.flush_lsn ?? '0/0');
                const replayLagBytes = calculateLagBytes(row.flush_lsn ?? '0/0', row.replay_lsn ?? '0/0');
                // Parse lag interval to seconds
                const lagSeconds = this.parseIntervalToSeconds(row.replay_lag);
                stats.push({
                    nodeId,
                    status: mapReplicaState(row.state),
                    lagBytes,
                    lagSeconds,
                    receivedLsn: row.sent_lsn ?? '0/0',
                    replayedLsn: row.replay_lsn ?? '0/0',
                    replayLagBytes,
                    writeLagBytes,
                    flushLagBytes,
                    isInSync: row.sync_state === 'sync' || row.sync_state === 'quorum',
                    collectedAt: new Date(),
                });
            }
            // Add stats for replicas not in pg_stat_replication (disconnected);
            // -1 lag values signal "no data" rather than zero lag.
            for (const replica of this.config.replicas) {
                if (!stats.find((s) => s.nodeId === replica.nodeId)) {
                    stats.push({
                        nodeId: replica.nodeId,
                        status: 'disconnected',
                        lagBytes: -1,
                        lagSeconds: -1,
                        receivedLsn: '0/0',
                        replayedLsn: '0/0',
                        replayLagBytes: -1,
                        writeLagBytes: -1,
                        flushLagBytes: -1,
                        isInSync: false,
                        collectedAt: new Date(),
                    });
                }
            }
            return stats;
        }
        finally {
            if (client) {
                client.release();
            }
        }
    }
    /**
     * Get replication statistics for a specific replica.
     * @param nodeId Replica identifier from the cluster configuration.
     * @returns The replica's stats, or null when the nodeId is unknown.
     */
    async getReplicationStats(nodeId) {
        const allStats = await this.getAllReplicationStats();
        return allStats.find((s) => s.nodeId === nodeId) ?? null;
    }
    /**
     * Check health of a specific node: connectivity, server version,
     * recovery state and timeline, plus round-trip latency.
     * Never throws — failures are reported in the returned record.
     * @param nodeConfig Node connection settings.
     * @param role 'primary' or 'replica' (selects which pool to use).
     */
    async checkNodeHealth(nodeConfig, role) {
        const startTime = performance.now();
        const pool = role === 'primary'
            ? this.primaryPool
            : this.replicaPools.get(nodeConfig.nodeId);
        if (!pool) {
            return {
                nodeId: nodeConfig.nodeId,
                role,
                reachable: false,
                latencyMs: 0,
                isInRecovery: false,
                errorMessage: 'Pool not initialized',
                checkedAt: new Date(),
            };
        }
        let client = null;
        try {
            client = await pool.connect();
            // Run health check query. txid_current() raises an error during
            // recovery, so it is guarded by a (lazily evaluated) CASE: replicas
            // report NULL current_xid instead of failing the whole check.
            const result = await client.query(`
      SELECT
        version() as pg_version,
        pg_is_in_recovery() as is_in_recovery,
        (CASE WHEN pg_is_in_recovery() THEN NULL ELSE txid_current() END) as current_xid,
        (SELECT timeline_id FROM pg_control_checkpoint()) as timeline_id
    `);
            const row = result.rows[0];
            const latencyMs = performance.now() - startTime;
            healthCheckDurationHistogram.observe({ cluster: this.config.clusterName, node_id: nodeConfig.nodeId, role }, latencyMs / 1000);
            return {
                nodeId: nodeConfig.nodeId,
                role,
                reachable: true,
                latencyMs: Math.round(latencyMs),
                pgVersion: row.pg_version,
                isInRecovery: row.is_in_recovery,
                currentXid: row.current_xid?.toString(),
                timelineId: row.timeline_id,
                checkedAt: new Date(),
            };
        }
        catch (error) {
            const latencyMs = performance.now() - startTime;
            healthCheckErrorsCounter.inc({
                cluster: this.config.clusterName,
                node_id: nodeConfig.nodeId,
                error_type: error instanceof Error ? error.name : 'unknown',
            });
            return {
                nodeId: nodeConfig.nodeId,
                role,
                reachable: false,
                latencyMs: Math.round(latencyMs),
                isInRecovery: false,
                errorMessage: error instanceof Error ? error.message : String(error),
                checkedAt: new Date(),
            };
        }
        finally {
            if (client) {
                client.release();
            }
        }
    }
    /**
     * Parse a PostgreSQL interval string to seconds.
     * Accepts "HH:MM:SS[.fraction]" with an optional "N day(s) " prefix
     * (PostgreSQL emits the prefix once lag exceeds 24 hours; previously such
     * values were silently parsed as 0). Unparseable input returns 0.
     * @param interval Interval text (e.g. replay_lag), possibly null.
     * @returns {number} Duration in seconds.
     */
    parseIntervalToSeconds(interval) {
        if (!interval)
            return 0;
        // PostgreSQL interval format: "[N day(s) ]HH:MM:SS.microseconds"
        const match = interval.match(/^(?:(\d+)\s+days?\s+)?(\d+):(\d+):(\d+(?:\.\d+)?)$/);
        if (!match)
            return 0;
        const days = match[1] ? parseInt(match[1], 10) : 0;
        const hours = parseInt(match[2], 10);
        const minutes = parseInt(match[3], 10);
        const seconds = parseFloat(match[4]);
        return days * 86400 + hours * 3600 + minutes * 60 + seconds;
    }
    /**
     * Initiate manual switchover to a specified replica.
     * Note: For Patroni/pg_auto_failover, this coordinates with the orchestrator.
     * Never throws — the returned event records success/failure, and
     * 'switchover_started' / 'switchover_completed' / 'switchover_failed'
     * events are emitted along the way.
     * @param targetReplicaId Replica to promote.
     * @param reason Label recorded on events and metrics.
     */
    async switchover(targetReplicaId, reason = 'manual_switchover') {
        // slice(2, 11) takes 9 random base-36 chars (String.prototype.substr is deprecated).
        const eventId = `failover-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
        const initiatedAt = new Date();
        const event = {
            eventId,
            previousPrimary: this.currentPrimaryId,
            newPrimary: targetReplicaId,
            reason,
            initiatedAt,
            success: false,
        };
        this.emit('switchover_started', event);
        try {
            // Get current lag before switchover
            const stats = await this.getReplicationStats(targetReplicaId);
            event.lagAtFailover = stats?.lagBytes ?? 0;
            // Perform switchover based on orchestrator
            if (this.config.orchestrator === 'patroni' && this.config.patroni) {
                await this.performPatroniSwitchover(targetReplicaId);
            }
            else if (this.config.orchestrator === 'pg_auto_failover' &&
                this.config.pgAutoFailover) {
                await this.performPgAutoFailoverSwitchover(targetReplicaId);
            }
            else {
                throw new Error(`Manual switchover not supported for orchestrator: ${this.config.orchestrator}`);
            }
            event.success = true;
            event.completedAt = new Date();
            // Update tracking
            this.currentPrimaryId = targetReplicaId;
            // Record metrics
            const durationSeconds = (event.completedAt.getTime() - initiatedAt.getTime()) / 1000;
            failoverEventsCounter.inc({
                cluster: this.config.clusterName,
                reason,
                success: 'true',
            });
            failoverDurationHistogram.observe({ cluster: this.config.clusterName, reason }, durationSeconds);
            this.emit('switchover_completed', event);
        }
        catch (error) {
            event.success = false;
            event.errorMessage = error instanceof Error ? error.message : String(error);
            event.completedAt = new Date();
            failoverEventsCounter.inc({
                cluster: this.config.clusterName,
                reason,
                success: 'false',
            });
            this.emit('switchover_failed', event);
        }
        return event;
    }
    /**
     * Perform switchover via the Patroni REST API (POST /switchover), using
     * HTTP basic auth when apiUsername/apiPassword are configured.
     * @throws {Error} When Patroni config is missing or the API responds non-2xx.
     */
    async performPatroniSwitchover(targetReplicaId) {
        const patroni = this.config.patroni;
        if (!patroni) {
            throw new Error('Patroni configuration not provided');
        }
        const url = `${patroni.apiEndpoint}/switchover`;
        const headers = {
            'Content-Type': 'application/json',
        };
        if (patroni.apiUsername && patroni.apiPassword) {
            const auth = Buffer.from(`${patroni.apiUsername}:${patroni.apiPassword}`).toString('base64');
            headers['Authorization'] = `Basic ${auth}`;
        }
        const response = await fetch(url, {
            method: 'POST',
            headers,
            body: JSON.stringify({
                leader: this.currentPrimaryId,
                candidate: targetReplicaId,
            }),
        });
        if (!response.ok) {
            const text = await response.text();
            throw new Error(`Patroni switchover failed: ${response.status} ${text}`);
        }
    }
    /**
     * Perform switchover via pg_auto_failover by calling
     * pgautofailover.perform_failover() on the monitor node. The monitor
     * chooses the promotion target itself, so targetReplicaId is unused here.
     * @throws {Error} When pg_auto_failover config is missing or the call fails.
     */
    async performPgAutoFailoverSwitchover(_targetReplicaId) {
        const pgaf = this.config.pgAutoFailover;
        if (!pgaf) {
            throw new Error('pg_auto_failover configuration not provided');
        }
        // Connect to monitor node with a single-use, single-connection pool.
        const monitorPool = new Pool({
            connectionString: pgaf.monitorConnectionString,
            max: 1,
        });
        try {
            const client = await monitorPool.connect();
            try {
                await client.query('SELECT pgautofailover.perform_failover($1, $2)', [pgaf.formation, pgaf.groupId]);
            }
            finally {
                client.release();
            }
        }
        finally {
            await monitorPool.end();
        }
    }
    /**
     * Get a shallow copy of the current cluster configuration.
     */
    getConfig() {
        return { ...this.config };
    }
    /**
     * Get the nodeId currently tracked as primary.
     */
    getCurrentPrimary() {
        return this.currentPrimaryId;
    }
    /**
     * Get the last cached health snapshot (null before the first check).
     */
    getLastHealth() {
        return this.lastHealth;
    }
    /**
     * Check whether start() has been called and stop() has not.
     */
    isManagerRunning() {
        return this.isRunning;
    }
}
678
/**
 * Generate streaming replication settings for a primary and its replicas.
 *
 * Produces postgresql.conf key/value maps for both roles, pg_hba.conf
 * replication entries for the primary, and recovery settings pointing the
 * replicas at the primary.
 *
 * @param {object} options - Replication user, primary endpoint, replica hosts
 *   and optional tuning knobs (sync mode, WAL sizing, archiving).
 * @returns {{primary: object, replica: object}} Configuration fragments.
 */
export function generateStreamingReplicationConfig(options) {
    const {
        replicationUser,
        primaryHost,
        primaryPort,
        replicaHosts,
        synchronousMode = 'on',
        synchronousStandbyNames = [],
        maxWalSenders = 10,
        maxReplicationSlots = 10,
        walKeepSize = 1024,
        hotStandby = true,
        archiveMode = true,
        archiveCommand = '/bin/true',
    } = options;
    // postgresql.conf settings for the primary.
    const primaryPostgresqlConf = {
        wal_level: 'replica',
        max_wal_senders: maxWalSenders,
        max_replication_slots: maxReplicationSlots,
        wal_keep_size: `${walKeepSize}MB`,
        hot_standby: hotStandby,
        archive_mode: archiveMode,
        archive_command: archiveCommand,
    };
    // Synchronous replication is enabled only when requested AND standbys are named.
    const wantsSynchronous = synchronousMode !== 'off' && synchronousStandbyNames.length > 0;
    if (wantsSynchronous) {
        primaryPostgresqlConf.synchronous_commit = synchronousMode;
        primaryPostgresqlConf.synchronous_standby_names =
            synchronousStandbyNames.length > 1
                ? `FIRST ${synchronousStandbyNames.length} (${synchronousStandbyNames.join(', ')})`
                : synchronousStandbyNames[0];
    }
    // pg_hba.conf entries admitting the replication user from localhost and
    // every replica host; bare IPs are widened to /32 host masks.
    const toCidr = (host) => (host.includes('/') ? host : `${host}/32`);
    const pgHbaEntries = [
        '# Replication connections',
        `host replication ${replicationUser} 127.0.0.1/32 scram-sha-256`,
        `host replication ${replicationUser} ::1/128 scram-sha-256`,
        ...replicaHosts.map((host) => `host replication ${replicationUser} ${toCidr(host)} scram-sha-256`),
    ];
    // postgresql.conf settings for replicas.
    const replicaPostgresqlConf = {
        hot_standby: hotStandby,
        hot_standby_feedback: true,
        max_standby_streaming_delay: '30s',
        max_standby_archive_delay: '30s',
        wal_receiver_status_interval: '10s',
        wal_receiver_timeout: '60s',
    };
    // Recovery/standby settings pointing each replica at the primary.
    const recoveryConf = {
        primary_conninfo: `host=${primaryHost} port=${primaryPort} user=${replicationUser}`,
        primary_slot_name: 'replica_slot',
        recovery_target_timeline: 'latest',
        restore_command: archiveMode ? 'cp /archive/%f %p' : '',
    };
    return {
        primary: {
            postgresqlConf: primaryPostgresqlConf,
            pgHbaEntries,
        },
        replica: {
            postgresqlConf: replicaPostgresqlConf,
            recoveryConf,
        },
    };
}
737
+ // =============================================================================
738
+ // Patroni Configuration Generator
739
+ // =============================================================================
740
/**
 * Build a Patroni node configuration object (YAML-serializable).
 *
 * Emits the scope/restapi/postgresql/bootstrap sections plus the DCS section
 * matching `dcsType`. Caller-supplied `bootstrap.dcs` entries override the
 * built-in DCS defaults; `bootstrap.initdb` / `bootstrap.pgHba` replace the
 * defaults wholesale when provided.
 *
 * @param {object} options - Cluster/node identity, paths, ports, DCS type and
 *   hosts, plus superuser/replication credentials.
 * @returns {object} Patroni configuration object.
 */
export function generatePatroniConfig(options) {
    const { clusterName, nodeName, dataDir, binDir, pgPort, restApiPort, dcsType, dcsHosts, superuser, replication, bootstrap, } = options;
    // Parameters shared by the live postgresql section and the bootstrap DCS defaults.
    const baseParameters = {
        wal_level: 'replica',
        hot_standby: 'on',
        max_connections: 200,
        max_wal_senders: 10,
        max_replication_slots: 10,
    };
    const config = {
        scope: clusterName,
        name: nodeName,
        restapi: {
            listen: `0.0.0.0:${restApiPort}`,
            connect_address: `${nodeName}:${restApiPort}`,
        },
        postgresql: {
            listen: `0.0.0.0:${pgPort}`,
            connect_address: `${nodeName}:${pgPort}`,
            data_dir: dataDir,
            ...(binDir ? { bin_dir: binDir } : {}),
            authentication: {
                superuser,
                replication,
            },
            parameters: {
                ...baseParameters,
                wal_keep_size: '1GB',
                hot_standby_feedback: 'on',
            },
        },
        bootstrap: {
            dcs: {
                ttl: 30,
                loop_wait: 10,
                retry_timeout: 10,
                maximum_lag_on_failover: 1048576, // 1MB
                postgresql: {
                    use_pg_rewind: true,
                    use_slots: true,
                    parameters: { ...baseParameters },
                },
                // Caller-supplied DCS settings win over the defaults above.
                ...bootstrap?.dcs,
            },
            initdb: bootstrap?.initdb ?? [
                'encoding: UTF8',
                'data-checksums',
            ],
            pg_hba: bootstrap?.pgHba ?? [
                'host replication replicator 0.0.0.0/0 md5',
                'host all all 0.0.0.0/0 md5',
            ],
        },
    };
    // Attach the distributed-configuration-store section for the chosen type;
    // unknown types add no section (matching the original switch's behavior).
    const dcsSections = {
        etcd: () => ({ hosts: dcsHosts }),
        consul: () => ({ host: dcsHosts[0] }),
        zookeeper: () => ({ hosts: dcsHosts }),
        kubernetes: () => ({ use_endpoints: true }),
    };
    if (Object.hasOwn(dcsSections, dcsType)) {
        config[dcsType] = dcsSections[dcsType]();
    }
    return config;
}
825
+ // =============================================================================
826
+ // pg_auto_failover Configuration Generator
827
+ // =============================================================================
828
/**
 * Generate the pg_autoctl shell commands that set up a pg_auto_failover node.
 *
 * Role 'monitor' yields `pg_autoctl create monitor` followed by the run
 * command; any other role yields `pg_autoctl create postgres` registered
 * against the monitor, followed by the run command.
 *
 * @param {object} options - formation, role, dataDir, pgPort, hostname,
 *   nodeName, plus optional monitorConnectionString, groupId and sslMode.
 * @returns {string[]} Shell command lines (multi-line commands use `\` continuations).
 * @throws {Error} When a data node is requested without a monitor connection string.
 */
export function generatePgAutoFailoverCommands(options) {
    const { formation, role, dataDir, pgPort, monitorConnectionString, nodeName, hostname, groupId = 0, sslMode = 'require', } = options;
    if (role === 'monitor') {
        return [
            `pg_autoctl create monitor \\`,
            ` --pgdata "${dataDir}" \\`,
            ` --pgport ${pgPort} \\`,
            ` --hostname "${hostname}" \\`,
            ` --ssl-mode ${sslMode} \\`,
            ` --auth trust`,
            '',
            '# Start the monitor',
            `pg_autoctl run --pgdata "${dataDir}"`,
        ];
    }
    // Data nodes (primary/secondary) must register themselves with the monitor.
    if (!monitorConnectionString) {
        throw new Error('Monitor connection string required for primary/secondary');
    }
    return [
        `pg_autoctl create postgres \\`,
        ` --pgdata "${dataDir}" \\`,
        ` --pgport ${pgPort} \\`,
        ` --pghost "${hostname}" \\`,
        ` --name "${nodeName}" \\`,
        ` --formation "${formation}" \\`,
        ` --group ${groupId} \\`,
        ` --monitor "${monitorConnectionString}" \\`,
        ` --ssl-mode ${sslMode} \\`,
        ` --auth trust`,
        '',
        '# Start the node',
        `pg_autoctl run --pgdata "${dataDir}"`,
    ];
}
845
/**
 * Build a load-balancer-friendly health check response for a cluster.
 *
 * Status mapping:
 * - primary unreachable or in recovery -> 'unhealthy' (HTTP 503);
 * - too few healthy replicas, manager lag threshold exceeded, or lag above
 *   this endpoint's own budget -> 'degraded' (HTTP 200: traffic is still
 *   accepted, with a warning);
 * - otherwise -> 'healthy' (HTTP 200).
 * A failure while collecting health itself yields 'unhealthy' / 503 with the
 * error message in the body.
 *
 * @param {object} options
 * @param {ReplicationManager} options.manager - Manager to query.
 * @param {number} [options.maxLagBytes=104857600] - Endpoint lag-bytes budget.
 * @param {number} [options.maxLagSeconds=30] - Endpoint lag-seconds budget.
 * @param {number} [options.minHealthyReplicas=1] - Minimum healthy replicas.
 * @returns {Promise<{status: string, statusCode: number, body: object}>}
 */
export async function createHealthCheckResponse(options) {
    const { manager, maxLagBytes = 100 * 1024 * 1024, // 100MB
    maxLagSeconds = 30, minHealthyReplicas = 1, } = options;
    try {
        const health = await manager.getClusterHealth();
        const lagOverBudget = health.maxLagBytes > maxLagBytes ||
            health.replicationStats.some((s) => s.lagSeconds > maxLagSeconds);
        let status = 'healthy';
        let statusCode = 200;
        if (!health.primaryHealthy) {
            // No usable primary: take this endpoint out of rotation.
            status = 'unhealthy';
            statusCode = 503;
        }
        else if (health.healthyReplicaCount < minHealthyReplicas ||
            health.lagThresholdExceeded ||
            lagOverBudget) {
            // Degraded: keep serving traffic (200) but surface the warning.
            status = 'degraded';
        }
        return {
            status,
            statusCode,
            body: {
                status,
                cluster: manager.getConfig().clusterName,
                primary: {
                    nodeId: health.currentPrimary,
                    healthy: health.primaryHealthy,
                },
                replicas: {
                    healthy: health.healthyReplicaCount,
                    total: health.totalReplicaCount,
                    stats: health.replicationStats.map((s) => ({
                        nodeId: s.nodeId,
                        status: s.status,
                        lagBytes: s.lagBytes,
                        lagSeconds: s.lagSeconds,
                        isInSync: s.isInSync,
                    })),
                },
                metrics: {
                    averageLagBytes: health.averageLagBytes,
                    maxLagBytes: health.maxLagBytes,
                    lagThresholdExceeded: health.lagThresholdExceeded,
                },
                timestamp: health.lastCheckAt.toISOString(),
                checkDurationMs: health.checkDurationMs,
            },
        };
    }
    catch (error) {
        return {
            status: 'unhealthy',
            statusCode: 503,
            body: {
                status: 'unhealthy',
                error: error instanceof Error ? error.message : String(error),
                timestamp: new Date().toISOString(),
            },
        };
    }
}
916
// =============================================================================
// Exports
// =============================================================================
// Default export: ReplicationManager is the module's primary entry point;
// the metrics, config generators and helpers above are named exports.
export default ReplicationManager;
920
+ //# sourceMappingURL=replication.js.map