@vorionsys/infrastructure 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/LICENSE +190 -0
- package/README.md +89 -0
- package/dist/database/index.d.ts +14 -0
- package/dist/database/index.d.ts.map +1 -0
- package/dist/database/index.js +14 -0
- package/dist/database/index.js.map +1 -0
- package/dist/database/replication.d.ts +493 -0
- package/dist/database/replication.d.ts.map +1 -0
- package/dist/database/replication.js +920 -0
- package/dist/database/replication.js.map +1 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +17 -0
- package/dist/index.js.map +1 -0
- package/package.json +85 -0
|
@@ -0,0 +1,920 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PostgreSQL High Availability Replication Module
|
|
3
|
+
*
|
|
4
|
+
* Provides infrastructure for PostgreSQL HA with:
|
|
5
|
+
* - Streaming replication configuration
|
|
6
|
+
* - Patroni cluster management helpers
|
|
7
|
+
* - pg_auto_failover setup and monitoring
|
|
8
|
+
* - Automatic failover configuration
|
|
9
|
+
* - Replication lag monitoring
|
|
10
|
+
* - Health check endpoints for replicas
|
|
11
|
+
*
|
|
12
|
+
* @packageDocumentation
|
|
13
|
+
*/
|
|
14
|
+
import { Pool } from 'pg';
|
|
15
|
+
import { Counter, Gauge, Histogram, Registry } from 'prom-client';
|
|
16
|
+
import { EventEmitter } from 'events';
|
|
17
|
+
// =============================================================================
// Metrics Registry
// =============================================================================
/**
 * Dedicated registry for replication metrics
 *
 * All metrics in this module register here (not the default prom-client
 * registry), so callers can expose them on a dedicated scrape endpoint.
 */
export const replicationRegistry = new Registry();
// Replication lag metrics
// NOTE: lag gauges are set to -1 for replicas that currently have no row in
// pg_stat_replication (disconnected) — see getAllReplicationStats().
export const replicationLagBytesGauge = new Gauge({
    name: 'vorion_pg_replication_lag_bytes',
    help: 'PostgreSQL replication lag in bytes',
    labelNames: ['cluster', 'replica_id', 'replica_host'],
    registers: [replicationRegistry],
});
export const replicationLagSecondsGauge = new Gauge({
    name: 'vorion_pg_replication_lag_seconds',
    help: 'PostgreSQL replication lag in seconds',
    labelNames: ['cluster', 'replica_id', 'replica_host'],
    registers: [replicationRegistry],
});
// Replica status metrics
// Gauge value encodes the replication state numerically; the mapping is
// implemented in replicaStatusToNumber().
export const replicaStatusGauge = new Gauge({
    name: 'vorion_pg_replica_status',
    help: 'PostgreSQL replica status (1=streaming, 0.75=catchup, 0.5=potential, 0.25=disconnected, 0=failed)',
    labelNames: ['cluster', 'replica_id', 'replica_host', 'status'],
    registers: [replicationRegistry],
});
// Cluster health metrics
export const clusterHealthGauge = new Gauge({
    name: 'vorion_pg_cluster_health',
    help: 'PostgreSQL cluster health (1=healthy, 0=unhealthy)',
    labelNames: ['cluster'],
    registers: [replicationRegistry],
});
export const healthyReplicaCountGauge = new Gauge({
    name: 'vorion_pg_healthy_replica_count',
    help: 'Number of healthy PostgreSQL replicas',
    labelNames: ['cluster'],
    registers: [replicationRegistry],
});
// Failover metrics
export const failoverEventsCounter = new Counter({
    name: 'vorion_pg_failover_events_total',
    help: 'Total PostgreSQL failover events',
    labelNames: ['cluster', 'reason', 'success'],
    registers: [replicationRegistry],
});
export const failoverDurationHistogram = new Histogram({
    name: 'vorion_pg_failover_duration_seconds',
    help: 'PostgreSQL failover duration in seconds',
    labelNames: ['cluster', 'reason'],
    // Failovers are expected to take seconds-to-minutes; buckets up to 5 min.
    buckets: [1, 5, 10, 30, 60, 120, 300],
    registers: [replicationRegistry],
});
// Health check metrics
export const healthCheckDurationHistogram = new Histogram({
    name: 'vorion_pg_health_check_duration_seconds',
    help: 'PostgreSQL health check duration',
    labelNames: ['cluster', 'node_id', 'role'],
    buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5],
    registers: [replicationRegistry],
});
export const healthCheckErrorsCounter = new Counter({
    name: 'vorion_pg_health_check_errors_total',
    help: 'Total PostgreSQL health check errors',
    labelNames: ['cluster', 'node_id', 'error_type'],
    registers: [replicationRegistry],
});
|
|
85
|
+
// =============================================================================
|
|
86
|
+
// Utility Functions
|
|
87
|
+
// =============================================================================
|
|
88
|
+
/**
 * Create a pg connection pool for a single cluster node.
 *
 * @param {object} config - Node connection settings: host, port, database,
 *   user, password, plus optional maxConnections, idleTimeoutMs,
 *   connectionTimeoutMs, applicationName, and ssl.
 * @returns {Pool} A configured pg Pool dedicated to this node.
 */
function createNodePool(config) {
    const poolConfig = {
        host: config.host,
        port: config.port,
        database: config.database,
        user: config.user,
        password: config.password,
        max: config.maxConnections ?? 5,
        idleTimeoutMillis: config.idleTimeoutMs ?? 30000,
        connectionTimeoutMillis: config.connectionTimeoutMs ?? 10000,
        application_name: config.applicationName ?? 'vorion-replication-monitor',
    };
    if (config.ssl) {
        // Pass through either `true` or a TLS options object unchanged.
        // (The original ternary here had identical branches — dead conditional.)
        poolConfig.ssl = config.ssl;
    }
    return new Pool(poolConfig);
}
|
|
108
|
+
/**
 * Parse an LSN (Log Sequence Number) to a bytes offset.
 *
 * PostgreSQL LSNs are formatted as two hex fields, e.g. "16/B374D848";
 * the absolute offset is (high << 32) + low.
 *
 * @param {string|null|undefined} lsn - LSN string, e.g. "16/B374D848".
 * @returns {bigint} Absolute byte offset; 0n for missing or malformed input.
 */
function parseLsn(lsn) {
    if (!lsn)
        return BigInt(0);
    const parts = lsn.split('/');
    // Guard against malformed input: without this, a missing separator would
    // make `offset` undefined and BigInt('0xundefined') would throw.
    if (parts.length !== 2 ||
        !/^[0-9a-fA-F]+$/.test(parts[0]) ||
        !/^[0-9a-fA-F]+$/.test(parts[1])) {
        return BigInt(0);
    }
    const segmentBigInt = BigInt('0x' + parts[0]);
    const offsetBigInt = BigInt('0x' + parts[1]);
    return (segmentBigInt << BigInt(32)) + offsetBigInt;
}
|
|
119
|
+
/**
 * Calculate lag between two LSNs in bytes.
 *
 * @param {string} primaryLsn - Current WAL position on the primary.
 * @param {string} replicaLsn - WAL position reached by the replica.
 * @returns {number} Non-negative byte difference (clamped at zero when the
 *   replica reports a position at or ahead of the primary's).
 */
function calculateLagBytes(primaryLsn, replicaLsn) {
    const diff = parseLsn(primaryLsn) - parseLsn(replicaLsn);
    // A replica can momentarily report a position >= the primary's snapshot;
    // treat that as zero lag rather than a negative value.
    return diff > BigInt(0) ? Number(diff) : 0;
}
|
|
128
|
+
/**
 * Map a pg_stat_replication state string to a ReplicaStatus value.
 *
 * @param {string|null|undefined} state - Raw `state` column value.
 * @returns {string} One of 'streaming' | 'catchup' | 'potential' |
 *   'disconnected' | 'unknown'.
 */
function mapReplicaState(state) {
    // Recognized pg_stat_replication states pass through (lower-cased);
    // anything else — including null/empty — maps to 'unknown'.
    const recognized = new Set(['streaming', 'catchup', 'potential', 'disconnected']);
    const normalized = state ? state.toLowerCase() : '';
    return recognized.has(normalized) ? normalized : 'unknown';
}
|
|
147
|
+
/**
 * Map a replica status to its numeric gauge value for metrics.
 *
 * @param {string} status - A ReplicaStatus string.
 * @returns {number} 1 (streaming), 0.75 (catchup), 0.5 (potential),
 *   0.25 (disconnected), 0 (failed or anything unrecognized).
 */
function replicaStatusToNumber(status) {
    const scores = new Map([
        ['streaming', 1],
        ['catchup', 0.75],
        ['potential', 0.5],
        ['disconnected', 0.25],
        ['failed', 0],
    ]);
    return scores.get(status) ?? 0;
}
|
|
166
|
+
// =============================================================================
|
|
167
|
+
// PostgreSQL Replication Manager
|
|
168
|
+
// =============================================================================
|
|
169
|
+
/**
 * PostgreSQL Replication Manager
 *
 * Manages PostgreSQL HA clusters with streaming replication,
 * health monitoring, and failover coordination.
 *
 * Emits: 'started', 'stopped', 'unhealthy', 'lag_threshold_exceeded',
 * 'primary_failure', 'health_check_error', 'switchover_started',
 * 'switchover_completed', 'switchover_failed'.
 *
 * @example
 * ```typescript
 * const manager = new ReplicationManager({
 *   clusterName: 'vorion-production',
 *   replicationMode: 'streaming',
 *   orchestrator: 'patroni',
 *   primary: { nodeId: 'primary', host: 'pg-primary', port: 5432, ... },
 *   replicas: [
 *     { nodeId: 'replica-1', host: 'pg-replica-1', port: 5432, ... },
 *     { nodeId: 'replica-2', host: 'pg-replica-2', port: 5432, ... },
 *   ],
 *   maxReplicationLagSeconds: 30,
 *   healthCheckIntervalMs: 10000,
 * });
 *
 * await manager.start();
 *
 * // Get cluster health
 * const health = await manager.getClusterHealth();
 *
 * // Get replication lag for a specific replica
 * const stats = await manager.getReplicationStats('replica-1');
 *
 * // Manual switchover
 * await manager.switchover('replica-1', 'scheduled_maintenance');
 * ```
 */
export class ReplicationManager extends EventEmitter {
    config;
    primaryPool = null;
    replicaPools = new Map();
    healthCheckInterval = null;
    isRunning = false;
    currentPrimaryId;
    lastHealth = null;
    constructor(config) {
        super();
        this.config = {
            maxReplicationLagBytes: 100 * 1024 * 1024, // 100MB default
            maxReplicationLagSeconds: 30,
            healthCheckIntervalMs: 10000,
            failoverTimeoutMs: 60000,
            autoFailoverEnabled: true,
            synchronousStandbyCount: 1,
            ...config,
        };
        this.currentPrimaryId = config.primary.nodeId;
    }
    /**
     * Start the replication manager: open pools for the primary and every
     * replica, then begin the periodic health check loop. No-op if already
     * running.
     */
    async start() {
        if (this.isRunning) {
            return;
        }
        // Initialize primary pool
        this.primaryPool = createNodePool(this.config.primary);
        // Initialize replica pools
        for (const replica of this.config.replicas) {
            const pool = createNodePool(replica);
            this.replicaPools.set(replica.nodeId, pool);
        }
        // Start health check loop
        this.startHealthChecks();
        this.isRunning = true;
        this.emit('started', { clusterName: this.config.clusterName });
    }
    /**
     * Stop the replication manager: cancel the health check timer and drain
     * all connection pools. No-op if not running.
     */
    async stop() {
        if (!this.isRunning) {
            return;
        }
        // Stop health checks
        if (this.healthCheckInterval) {
            clearInterval(this.healthCheckInterval);
            this.healthCheckInterval = null;
        }
        // Close all pools
        if (this.primaryPool) {
            await this.primaryPool.end();
            this.primaryPool = null;
        }
        for (const [, pool] of this.replicaPools) {
            await pool.end();
        }
        this.replicaPools.clear();
        this.isRunning = false;
        this.emit('stopped', { clusterName: this.config.clusterName });
    }
    /**
     * Start the health check interval: runs one check immediately, then every
     * `healthCheckIntervalMs`. Each run updates metrics and emits events for
     * unhealthy clusters, lag-threshold breaches, and primary failures.
     */
    startHealthChecks() {
        const runHealthCheck = async () => {
            try {
                const health = await this.getClusterHealth();
                this.lastHealth = health;
                // Update metrics
                this.updateMetrics(health);
                // Check for issues
                if (!health.healthy) {
                    this.emit('unhealthy', health);
                }
                if (health.lagThresholdExceeded) {
                    this.emit('lag_threshold_exceeded', health);
                }
                // Auto-failover check
                if (this.config.autoFailoverEnabled && !health.primaryHealthy) {
                    this.emit('primary_failure', {
                        currentPrimary: this.currentPrimaryId,
                        health,
                    });
                    // Note: Actual failover should be handled by Patroni/pg_auto_failover
                    // This just emits events for monitoring
                }
            }
            catch (error) {
                this.emit('health_check_error', { error });
                healthCheckErrorsCounter.inc({
                    cluster: this.config.clusterName,
                    node_id: 'cluster',
                    error_type: error instanceof Error ? error.name : 'unknown',
                });
            }
        };
        // Run immediately. `void` marks the promise as intentionally not
        // awaited; runHealthCheck never rejects (errors are handled above).
        void runHealthCheck();
        // Schedule interval
        this.healthCheckInterval = setInterval(runHealthCheck, this.config.healthCheckIntervalMs);
    }
    /**
     * Update Prometheus metrics from health data.
     *
     * @param {object} health - Result of getClusterHealth().
     */
    updateMetrics(health) {
        const cluster = this.config.clusterName;
        // Cluster health
        clusterHealthGauge.set({ cluster }, health.healthy ? 1 : 0);
        healthyReplicaCountGauge.set({ cluster }, health.healthyReplicaCount);
        // Per-replica metrics
        for (const stats of health.replicationStats) {
            const replica = this.config.replicas.find((r) => r.nodeId === stats.nodeId);
            const host = replica?.host ?? 'unknown';
            replicationLagBytesGauge.set({ cluster, replica_id: stats.nodeId, replica_host: host }, stats.lagBytes);
            replicationLagSecondsGauge.set({ cluster, replica_id: stats.nodeId, replica_host: host }, stats.lagSeconds);
            replicaStatusGauge.set({ cluster, replica_id: stats.nodeId, replica_host: host, status: stats.status }, replicaStatusToNumber(stats.status));
        }
    }
    /**
     * Get comprehensive cluster health status.
     *
     * Combines a primary health probe with per-replica replication stats and
     * derives aggregate lag figures and threshold flags.
     *
     * @returns {Promise<object>} Cluster health snapshot (also cached as
     *   `lastHealth` by the health check loop).
     */
    async getClusterHealth() {
        const startTime = performance.now();
        // Check primary health
        const primaryHealth = await this.checkNodeHealth(this.config.primary, 'primary');
        // Get replication stats from primary
        const replicationStats = await this.getAllReplicationStats();
        // Calculate health metrics
        const healthyReplicaCount = replicationStats.filter((s) => s.status === 'streaming' && s.isInSync).length;
        const lagValues = replicationStats.map((s) => s.lagBytes);
        const averageLagBytes = lagValues.length > 0
            ? lagValues.reduce((a, b) => a + b, 0) / lagValues.length
            : 0;
        const maxLagBytes = lagValues.length > 0 ? Math.max(...lagValues) : 0;
        const lagThresholdExceeded = maxLagBytes > (this.config.maxReplicationLagBytes ?? Infinity) ||
            replicationStats.some((s) => s.lagSeconds > (this.config.maxReplicationLagSeconds ?? Infinity));
        const checkDurationMs = performance.now() - startTime;
        const health = {
            healthy: primaryHealth.reachable &&
                healthyReplicaCount >= (this.config.synchronousStandbyCount ?? 1),
            primaryHealthy: primaryHealth.reachable && !primaryHealth.isInRecovery,
            healthyReplicaCount,
            totalReplicaCount: this.config.replicas.length,
            currentPrimary: this.currentPrimaryId,
            replicationStats,
            averageLagBytes: Math.round(averageLagBytes),
            maxLagBytes,
            lagThresholdExceeded,
            lastCheckAt: new Date(),
            checkDurationMs: Math.round(checkDurationMs),
        };
        healthCheckDurationHistogram.observe({ cluster: this.config.clusterName, node_id: 'cluster', role: 'cluster' }, checkDurationMs / 1000);
        return health;
    }
    /**
     * Get replication statistics for all replicas.
     *
     * Queries pg_stat_replication on the primary; configured replicas that
     * have no row there are reported as 'disconnected' with -1 lag sentinels.
     *
     * @returns {Promise<object[]>} One stats entry per known replica.
     * @throws {Error} If the primary pool is not initialized (start() not called).
     */
    async getAllReplicationStats() {
        if (!this.primaryPool) {
            throw new Error('Primary pool not initialized');
        }
        let client = null;
        try {
            client = await this.primaryPool.connect();
            // Query pg_stat_replication for all replicas
            const result = await client.query(`
      SELECT
        pid,
        usename,
        application_name,
        client_addr,
        client_hostname,
        client_port,
        backend_start,
        state,
        sent_lsn,
        write_lsn,
        flush_lsn,
        replay_lsn,
        write_lag,
        flush_lag,
        replay_lag,
        sync_priority,
        sync_state,
        reply_time
      FROM pg_stat_replication
      WHERE state IS NOT NULL
    `);
            // Get current primary LSN
            const lsnResult = await client.query('SELECT pg_current_wal_lsn() as current_lsn');
            const currentLsn = lsnResult.rows[0]?.current_lsn ?? '0/0';
            const stats = [];
            for (const row of result.rows) {
                // Find matching replica config by address, hostname, or node id
                // embedded in application_name.
                const replica = this.config.replicas.find((r) => r.host === row.client_addr ||
                    r.host === row.client_hostname ||
                    row.application_name?.includes(r.nodeId));
                const nodeId = replica?.nodeId ?? row.application_name ?? row.client_addr;
                const lagBytes = calculateLagBytes(currentLsn, row.replay_lsn ?? '0/0');
                const writeLagBytes = calculateLagBytes(currentLsn, row.write_lsn ?? '0/0');
                const flushLagBytes = calculateLagBytes(currentLsn, row.flush_lsn ?? '0/0');
                const replayLagBytes = calculateLagBytes(row.flush_lsn ?? '0/0', row.replay_lsn ?? '0/0');
                // Parse lag interval to seconds
                const lagSeconds = this.parseIntervalToSeconds(row.replay_lag);
                stats.push({
                    nodeId,
                    status: mapReplicaState(row.state),
                    lagBytes,
                    lagSeconds,
                    receivedLsn: row.sent_lsn ?? '0/0',
                    replayedLsn: row.replay_lsn ?? '0/0',
                    replayLagBytes,
                    writeLagBytes,
                    flushLagBytes,
                    isInSync: row.sync_state === 'sync' || row.sync_state === 'quorum',
                    collectedAt: new Date(),
                });
            }
            // Add stats for replicas not in pg_stat_replication (disconnected);
            // -1 is the "no data" sentinel for lag fields.
            for (const replica of this.config.replicas) {
                if (!stats.find((s) => s.nodeId === replica.nodeId)) {
                    stats.push({
                        nodeId: replica.nodeId,
                        status: 'disconnected',
                        lagBytes: -1,
                        lagSeconds: -1,
                        receivedLsn: '0/0',
                        replayedLsn: '0/0',
                        replayLagBytes: -1,
                        writeLagBytes: -1,
                        flushLagBytes: -1,
                        isInSync: false,
                        collectedAt: new Date(),
                    });
                }
            }
            return stats;
        }
        finally {
            if (client) {
                client.release();
            }
        }
    }
    /**
     * Get replication statistics for a specific replica.
     *
     * @param {string} nodeId - Replica node id.
     * @returns {Promise<object|null>} Stats entry, or null if unknown.
     */
    async getReplicationStats(nodeId) {
        const allStats = await this.getAllReplicationStats();
        return allStats.find((s) => s.nodeId === nodeId) ?? null;
    }
    /**
     * Check health of a specific node.
     *
     * @param {object} nodeConfig - Node connection config (must carry nodeId).
     * @param {string} role - 'primary' or any replica role; selects the pool.
     * @returns {Promise<object>} Health record; never throws — query/connect
     *   errors are captured in `errorMessage` with `reachable: false`.
     */
    async checkNodeHealth(nodeConfig, role) {
        const startTime = performance.now();
        const pool = role === 'primary'
            ? this.primaryPool
            : this.replicaPools.get(nodeConfig.nodeId);
        if (!pool) {
            return {
                nodeId: nodeConfig.nodeId,
                role,
                reachable: false,
                latencyMs: 0,
                isInRecovery: false,
                errorMessage: 'Pool not initialized',
                checkedAt: new Date(),
            };
        }
        let client = null;
        try {
            client = await pool.connect();
            // Run health check query.
            // txid_current() cannot run during recovery (it would raise an
            // error on hot standbys), so it is guarded: standbys report
            // current_xid as NULL instead of failing the whole health check.
            const result = await client.query(`
      SELECT
        version() as pg_version,
        pg_is_in_recovery() as is_in_recovery,
        CASE WHEN pg_is_in_recovery() THEN NULL ELSE txid_current() END as current_xid,
        (SELECT timeline_id FROM pg_control_checkpoint()) as timeline_id
    `);
            const row = result.rows[0];
            const latencyMs = performance.now() - startTime;
            healthCheckDurationHistogram.observe({ cluster: this.config.clusterName, node_id: nodeConfig.nodeId, role }, latencyMs / 1000);
            return {
                nodeId: nodeConfig.nodeId,
                role,
                reachable: true,
                latencyMs: Math.round(latencyMs),
                pgVersion: row.pg_version,
                isInRecovery: row.is_in_recovery,
                currentXid: row.current_xid?.toString(),
                timelineId: row.timeline_id,
                checkedAt: new Date(),
            };
        }
        catch (error) {
            const latencyMs = performance.now() - startTime;
            healthCheckErrorsCounter.inc({
                cluster: this.config.clusterName,
                node_id: nodeConfig.nodeId,
                error_type: error instanceof Error ? error.name : 'unknown',
            });
            return {
                nodeId: nodeConfig.nodeId,
                role,
                reachable: false,
                latencyMs: Math.round(latencyMs),
                isInRecovery: false,
                errorMessage: error instanceof Error ? error.message : String(error),
                checkedAt: new Date(),
            };
        }
        finally {
            if (client) {
                client.release();
            }
        }
    }
    /**
     * Parse a PostgreSQL interval string to seconds.
     *
     * Handles both "HH:MM:SS[.ffffff]" and "D day(s) HH:MM:SS[.ffffff]" —
     * the latter is what PostgreSQL emits once lag exceeds 24 hours (the
     * previous regex silently returned 0 for it).
     *
     * @param {string|null|undefined} interval - Interval text from pg.
     * @returns {number} Total seconds (0 for null/unparseable input).
     */
    parseIntervalToSeconds(interval) {
        if (!interval)
            return 0;
        const match = interval.match(/^(?:(\d+)\s+days?\s+)?(\d+):(\d+):(\d+(?:\.\d+)?)$/);
        if (!match)
            return 0;
        const days = match[1] ? parseInt(match[1], 10) : 0;
        const hours = parseInt(match[2], 10);
        const minutes = parseInt(match[3], 10);
        const seconds = parseFloat(match[4]);
        return days * 86400 + hours * 3600 + minutes * 60 + seconds;
    }
    /**
     * Initiate manual switchover to a specified replica.
     * Note: For Patroni/pg_auto_failover, this coordinates with the orchestrator.
     *
     * @param {string} targetReplicaId - Node id of the promotion candidate.
     * @param {string} [reason='manual_switchover'] - Reason label for metrics/events.
     * @returns {Promise<object>} The failover event record; `success` reflects
     *   the outcome (this method does not throw on switchover failure).
     */
    async switchover(targetReplicaId, reason = 'manual_switchover') {
        // slice(2, 11) replaces deprecated substr(2, 9) — identical output.
        const eventId = `failover-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
        const initiatedAt = new Date();
        const event = {
            eventId,
            previousPrimary: this.currentPrimaryId,
            newPrimary: targetReplicaId,
            reason,
            initiatedAt,
            success: false,
        };
        this.emit('switchover_started', event);
        try {
            // Get current lag before switchover
            const stats = await this.getReplicationStats(targetReplicaId);
            event.lagAtFailover = stats?.lagBytes ?? 0;
            // Perform switchover based on orchestrator
            if (this.config.orchestrator === 'patroni' && this.config.patroni) {
                await this.performPatroniSwitchover(targetReplicaId);
            }
            else if (this.config.orchestrator === 'pg_auto_failover' &&
                this.config.pgAutoFailover) {
                await this.performPgAutoFailoverSwitchover(targetReplicaId);
            }
            else {
                throw new Error(`Manual switchover not supported for orchestrator: ${this.config.orchestrator}`);
            }
            event.success = true;
            event.completedAt = new Date();
            // Update tracking
            this.currentPrimaryId = targetReplicaId;
            // Record metrics
            const durationSeconds = (event.completedAt.getTime() - initiatedAt.getTime()) / 1000;
            failoverEventsCounter.inc({
                cluster: this.config.clusterName,
                reason,
                success: 'true',
            });
            failoverDurationHistogram.observe({ cluster: this.config.clusterName, reason }, durationSeconds);
            this.emit('switchover_completed', event);
        }
        catch (error) {
            event.success = false;
            event.errorMessage = error instanceof Error ? error.message : String(error);
            event.completedAt = new Date();
            failoverEventsCounter.inc({
                cluster: this.config.clusterName,
                reason,
                success: 'false',
            });
            this.emit('switchover_failed', event);
        }
        return event;
    }
    /**
     * Perform switchover via Patroni REST API (POST /switchover with the
     * current leader and the candidate; optional basic auth).
     *
     * @param {string} targetReplicaId - Candidate node id.
     * @throws {Error} If Patroni config is missing or the API call fails.
     */
    async performPatroniSwitchover(targetReplicaId) {
        const patroni = this.config.patroni;
        if (!patroni) {
            throw new Error('Patroni configuration not provided');
        }
        const url = `${patroni.apiEndpoint}/switchover`;
        const headers = {
            'Content-Type': 'application/json',
        };
        if (patroni.apiUsername && patroni.apiPassword) {
            const auth = Buffer.from(`${patroni.apiUsername}:${patroni.apiPassword}`).toString('base64');
            headers['Authorization'] = `Basic ${auth}`;
        }
        const response = await fetch(url, {
            method: 'POST',
            headers,
            body: JSON.stringify({
                leader: this.currentPrimaryId,
                candidate: targetReplicaId,
            }),
        });
        if (!response.ok) {
            const text = await response.text();
            throw new Error(`Patroni switchover failed: ${response.status} ${text}`);
        }
    }
    /**
     * Perform switchover via pg_auto_failover by calling
     * pgautofailover.perform_failover() on the monitor node.
     *
     * @param {string} _targetReplicaId - Unused: the monitor chooses the
     *   promotion target within the formation/group.
     * @throws {Error} If pg_auto_failover config is missing or the call fails.
     */
    async performPgAutoFailoverSwitchover(_targetReplicaId) {
        const pgaf = this.config.pgAutoFailover;
        if (!pgaf) {
            throw new Error('pg_auto_failover configuration not provided');
        }
        // Connect to monitor node with a single-connection throwaway pool.
        const monitorPool = new Pool({
            connectionString: pgaf.monitorConnectionString,
            max: 1,
        });
        try {
            const client = await monitorPool.connect();
            try {
                await client.query('SELECT pgautofailover.perform_failover($1, $2)', [pgaf.formation, pgaf.groupId]);
            }
            finally {
                client.release();
            }
        }
        finally {
            await monitorPool.end();
        }
    }
    /**
     * Get the current cluster configuration (shallow copy).
     */
    getConfig() {
        return { ...this.config };
    }
    /**
     * Get the current primary node ID.
     */
    getCurrentPrimary() {
        return this.currentPrimaryId;
    }
    /**
     * Get the last cached health status (null before the first check).
     */
    getLastHealth() {
        return this.lastHealth;
    }
    /**
     * Check if the manager is running.
     */
    isManagerRunning() {
        return this.isRunning;
    }
}
|
|
678
|
+
/**
 * Generate streaming replication configuration.
 *
 * Produces postgresql.conf key/value maps for primary and replica nodes,
 * pg_hba.conf replication entries, and replica recovery settings.
 *
 * @param {object} options
 * @param {string} options.replicationUser - Role used for replication connections.
 * @param {string} options.primaryHost - Primary host for primary_conninfo.
 * @param {number} options.primaryPort - Primary port for primary_conninfo.
 * @param {string[]} options.replicaHosts - Replica IPs or CIDR ranges for pg_hba entries.
 * @param {string} [options.synchronousMode='on'] - synchronous_commit mode; 'off' disables synchronous replication settings.
 * @param {string[]} [options.synchronousStandbyNames=[]] - Names for synchronous_standby_names.
 * @param {number} [options.maxWalSenders=10]
 * @param {number} [options.maxReplicationSlots=10]
 * @param {number} [options.walKeepSize=1024] - wal_keep_size in MB.
 * @param {boolean} [options.hotStandby=true]
 * @param {boolean} [options.archiveMode=true]
 * @param {string} [options.archiveCommand='/bin/true']
 * @param {string} [options.primarySlotName='replica_slot'] - Replication slot
 *   name for primary_slot_name (previously hard-coded; parameterized so each
 *   replica can be given its own slot).
 * @returns {{primary: {postgresqlConf: object, pgHbaEntries: string[]},
 *            replica: {postgresqlConf: object, recoveryConf: object}}}
 */
export function generateStreamingReplicationConfig(options) {
    const {
        replicationUser,
        primaryHost,
        primaryPort,
        replicaHosts,
        synchronousMode = 'on',
        synchronousStandbyNames = [],
        maxWalSenders = 10,
        maxReplicationSlots = 10,
        walKeepSize = 1024,
        hotStandby = true,
        archiveMode = true,
        archiveCommand = '/bin/true',
        primarySlotName = 'replica_slot',
    } = options;
    // Primary configuration
    const primaryPostgresqlConf = {
        wal_level: 'replica',
        max_wal_senders: maxWalSenders,
        max_replication_slots: maxReplicationSlots,
        wal_keep_size: `${walKeepSize}MB`,
        hot_standby: hotStandby,
        archive_mode: archiveMode,
        archive_command: archiveCommand,
    };
    if (synchronousMode !== 'off' && synchronousStandbyNames.length > 0) {
        primaryPostgresqlConf.synchronous_commit = synchronousMode;
        // Multiple standbys use FIRST-n quorum syntax; a single standby is
        // referenced by bare name.
        primaryPostgresqlConf.synchronous_standby_names =
            synchronousStandbyNames.length > 1
                ? `FIRST ${synchronousStandbyNames.length} (${synchronousStandbyNames.join(', ')})`
                : synchronousStandbyNames[0];
    }
    // Generate pg_hba.conf entries for replication
    const pgHbaEntries = [
        '# Replication connections',
        `host replication ${replicationUser} 127.0.0.1/32 scram-sha-256`,
        `host replication ${replicationUser} ::1/128 scram-sha-256`,
    ];
    for (const host of replicaHosts) {
        // Handle both IP addresses and CIDR notation
        const hostEntry = host.includes('/') ? host : `${host}/32`;
        pgHbaEntries.push(`host replication ${replicationUser} ${hostEntry} scram-sha-256`);
    }
    // Replica configuration
    const replicaPostgresqlConf = {
        hot_standby: hotStandby,
        hot_standby_feedback: true,
        max_standby_streaming_delay: '30s',
        max_standby_archive_delay: '30s',
        wal_receiver_status_interval: '10s',
        wal_receiver_timeout: '60s',
    };
    const recoveryConf = {
        primary_conninfo: `host=${primaryHost} port=${primaryPort} user=${replicationUser}`,
        primary_slot_name: primarySlotName,
        recovery_target_timeline: 'latest',
        restore_command: archiveMode ? 'cp /archive/%f %p' : '',
    };
    return {
        primary: {
            postgresqlConf: primaryPostgresqlConf,
            pgHbaEntries,
        },
        replica: {
            postgresqlConf: replicaPostgresqlConf,
            recoveryConf,
        },
    };
}
|
|
737
|
+
// =============================================================================
// Patroni Configuration Generator
// =============================================================================
/**
 * Build a Patroni cluster configuration object for a single node.
 *
 * Produces the scope/restapi/postgresql/bootstrap sections and appends the
 * section for the chosen distributed configuration store (etcd, consul,
 * zookeeper, or kubernetes). Caller-supplied bootstrap overrides take
 * precedence over the built-in DCS defaults.
 */
export function generatePatroniConfig(options) {
    const { clusterName, nodeName, dataDir, binDir, pgPort, restApiPort, dcsType, dcsHosts, superuser, replication, bootstrap, } = options;
    // PostgreSQL parameters shared between the live node config and the
    // bootstrap DCS config (the node config adds two extra settings below).
    const sharedPgParameters = {
        wal_level: 'replica',
        hot_standby: 'on',
        max_connections: 200,
        max_wal_senders: 10,
        max_replication_slots: 10,
    };
    const patroniConfig = {
        scope: clusterName,
        name: nodeName,
        restapi: {
            listen: `0.0.0.0:${restApiPort}`,
            connect_address: `${nodeName}:${restApiPort}`,
        },
        postgresql: {
            listen: `0.0.0.0:${pgPort}`,
            connect_address: `${nodeName}:${pgPort}`,
            data_dir: dataDir,
            // Only emit bin_dir when the caller supplied one.
            ...(binDir && { bin_dir: binDir }),
            authentication: {
                superuser,
                replication,
            },
            parameters: {
                ...sharedPgParameters,
                wal_keep_size: '1GB',
                hot_standby_feedback: 'on',
            },
        },
        bootstrap: {
            dcs: {
                ttl: 30,
                loop_wait: 10,
                retry_timeout: 10,
                maximum_lag_on_failover: 1048576, // 1MB
                postgresql: {
                    use_pg_rewind: true,
                    use_slots: true,
                    parameters: { ...sharedPgParameters },
                },
                // Caller-provided DCS settings win over the defaults above.
                ...bootstrap?.dcs,
            },
            initdb: bootstrap?.initdb ?? [
                'encoding: UTF8',
                'data-checksums',
            ],
            pg_hba: bootstrap?.pgHba ?? [
                'host replication replicator 0.0.0.0/0 md5',
                'host all all 0.0.0.0/0 md5',
            ],
        },
    };
    // Attach the DCS-specific section; unknown types add nothing.
    if (dcsType === 'etcd') {
        patroniConfig.etcd = { hosts: dcsHosts };
    }
    else if (dcsType === 'consul') {
        patroniConfig.consul = { host: dcsHosts[0] };
    }
    else if (dcsType === 'zookeeper') {
        patroniConfig.zookeeper = { hosts: dcsHosts };
    }
    else if (dcsType === 'kubernetes') {
        patroniConfig.kubernetes = { use_endpoints: true };
    }
    return patroniConfig;
}
|
|
825
|
+
// =============================================================================
// pg_auto_failover Configuration Generator
// =============================================================================
/**
 * Generate the shell commands needed to set up a pg_auto_failover node.
 *
 * For role 'monitor' this emits `pg_autoctl create monitor`; for any other
 * role it emits `pg_autoctl create postgres` joined to the given monitor.
 * Each returned element is one line of a multi-line shell invocation,
 * followed by a blank line, a comment, and the `pg_autoctl run` command.
 *
 * @throws {Error} when role is not 'monitor' and monitorConnectionString is missing.
 */
export function generatePgAutoFailoverCommands(options) {
    const { formation, role, dataDir, pgPort, monitorConnectionString, nodeName, hostname, groupId = 0, sslMode = 'require', } = options;
    if (role === 'monitor') {
        return [
            `pg_autoctl create monitor \\`,
            ` --pgdata "${dataDir}" \\`,
            ` --pgport ${pgPort} \\`,
            ` --hostname "${hostname}" \\`,
            ` --ssl-mode ${sslMode} \\`,
            ` --auth trust`,
            '',
            '# Start the monitor',
            `pg_autoctl run --pgdata "${dataDir}"`,
        ];
    }
    // Primary/secondary nodes must register with an existing monitor.
    if (!monitorConnectionString) {
        throw new Error('Monitor connection string required for primary/secondary');
    }
    return [
        `pg_autoctl create postgres \\`,
        ` --pgdata "${dataDir}" \\`,
        ` --pgport ${pgPort} \\`,
        ` --pghost "${hostname}" \\`,
        ` --name "${nodeName}" \\`,
        ` --formation "${formation}" \\`,
        ` --group ${groupId} \\`,
        ` --monitor "${monitorConnectionString}" \\`,
        ` --ssl-mode ${sslMode} \\`,
        ` --auth trust`,
        '',
        '# Start the node',
        `pg_autoctl run --pgdata "${dataDir}"`,
    ];
}
|
|
845
|
+
/**
 * Create a health check response for load balancer integration.
 *
 * Queries the replication manager for cluster health and maps it onto an
 * HTTP-style response: 503 when the primary is unhealthy (or the probe
 * itself throws), otherwise 200 with status 'healthy' or 'degraded'.
 * Degraded clusters still return 200 so traffic keeps flowing.
 *
 * @param options.manager            replication manager exposing getClusterHealth()/getConfig()
 * @param options.maxLagBytes        byte-lag threshold before reporting 'degraded' (default 100MB)
 * @param options.maxLagSeconds      per-replica time-lag threshold in seconds (default 30)
 * @param options.minHealthyReplicas minimum healthy replicas before reporting 'degraded' (default 1)
 */
export async function createHealthCheckResponse(options) {
    const {
        manager,
        maxLagBytes = 100 * 1024 * 1024, // 100MB
        maxLagSeconds = 30,
        minHealthyReplicas = 1,
    } = options;
    try {
        const clusterHealth = await manager.getClusterHealth();
        // Start from fully healthy and downgrade as problems are found.
        let overallStatus = 'healthy';
        let httpStatus = 200;
        if (!clusterHealth.primaryHealthy) {
            // No writable primary: tell the load balancer to stop routing here.
            overallStatus = 'unhealthy';
            httpStatus = 503;
        }
        else if (clusterHealth.healthyReplicaCount < minHealthyReplicas ||
            clusterHealth.lagThresholdExceeded ||
            clusterHealth.maxLagBytes > maxLagBytes ||
            clusterHealth.replicationStats.some((stat) => stat.lagSeconds > maxLagSeconds)) {
            // Degraded still accepts traffic, but surfaces a warning status.
            overallStatus = 'degraded';
            httpStatus = 200;
        }
        return {
            status: overallStatus,
            statusCode: httpStatus,
            body: {
                status: overallStatus,
                cluster: manager.getConfig().clusterName,
                primary: {
                    nodeId: clusterHealth.currentPrimary,
                    healthy: clusterHealth.primaryHealthy,
                },
                replicas: {
                    healthy: clusterHealth.healthyReplicaCount,
                    total: clusterHealth.totalReplicaCount,
                    stats: clusterHealth.replicationStats.map((stat) => ({
                        nodeId: stat.nodeId,
                        status: stat.status,
                        lagBytes: stat.lagBytes,
                        lagSeconds: stat.lagSeconds,
                        isInSync: stat.isInSync,
                    })),
                },
                metrics: {
                    averageLagBytes: clusterHealth.averageLagBytes,
                    maxLagBytes: clusterHealth.maxLagBytes,
                    lagThresholdExceeded: clusterHealth.lagThresholdExceeded,
                },
                timestamp: clusterHealth.lastCheckAt.toISOString(),
                checkDurationMs: clusterHealth.checkDurationMs,
            },
        };
    }
    catch (error) {
        // The health probe itself failed: report unhealthy rather than throwing.
        return {
            status: 'unhealthy',
            statusCode: 503,
            body: {
                status: 'unhealthy',
                error: error instanceof Error ? error.message : String(error),
                timestamp: new Date().toISOString(),
            },
        };
    }
}
|
|
916
|
+
// =============================================================================
// Exports
// =============================================================================
// Default export is the ReplicationManager class defined earlier in this file;
// the named helpers (config generators, health check) are exported inline above.
export default ReplicationManager;
//# sourceMappingURL=replication.js.map
|