@topgunbuild/server 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -31,16 +31,25 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
  var index_exports = {};
  __export(index_exports, {
  BufferPool: () => BufferPool,
+ ClusterCoordinator: () => ClusterCoordinator,
+ ClusterManager: () => ClusterManager,
  ConnectionRateLimiter: () => ConnectionRateLimiter,
+ DEFAULT_CLUSTER_COORDINATOR_CONFIG: () => DEFAULT_CLUSTER_COORDINATOR_CONFIG,
+ DEFAULT_LAG_TRACKER_CONFIG: () => DEFAULT_LAG_TRACKER_CONFIG,
  FilterTasklet: () => FilterTasklet,
  ForEachTasklet: () => ForEachTasklet,
  IteratorTasklet: () => IteratorTasklet,
+ LagTracker: () => LagTracker,
+ LockManager: () => LockManager,
  MapTasklet: () => MapTasklet,
  MemoryServerAdapter: () => MemoryServerAdapter,
+ MigrationManager: () => MigrationManager,
  ObjectPool: () => ObjectPool,
+ PartitionService: () => PartitionService,
  PostgresAdapter: () => PostgresAdapter,
  RateLimitInterceptor: () => RateLimitInterceptor,
  ReduceTasklet: () => ReduceTasklet,
+ ReplicationPipeline: () => ReplicationPipeline,
  SecurityManager: () => SecurityManager,
  ServerCoordinator: () => ServerCoordinator,
  TaskletScheduler: () => TaskletScheduler,
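
Editor's note: the hunk above widens the public API with the new cluster primitives. A minimal consumer sketch, assuming `@topgunbuild/server@0.3.0` is installed; note that `FailureDetector` itself is absent from this export map and is only reachable through `ClusterManager` (see the accessor added in a later hunk):

```ts
import {
  ClusterManager,
  PartitionService,
  MigrationManager,
  LagTracker,
  ReplicationPipeline,
  DEFAULT_LAG_TRACKER_CONFIG,
} from "@topgunbuild/server";
```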
@@ -73,7 +82,7 @@ var import_http = require("http");
  var import_https = require("https");
  var import_fs2 = require("fs");
  var import_ws3 = require("ws");
- var import_core7 = require("@topgunbuild/core");
+ var import_core10 = require("@topgunbuild/core");
  var jwt = __toESM(require("jsonwebtoken"));
  var crypto = __toESM(require("crypto"));

@@ -633,11 +642,268 @@ var TopicManager = class {

  // src/cluster/ClusterManager.ts
  var import_ws = require("ws");
- var import_events = require("events");
+ var import_events2 = require("events");
  var dns = __toESM(require("dns"));
  var import_fs = require("fs");
  var https = __toESM(require("https"));
- var ClusterManager = class extends import_events.EventEmitter {
+
+ // src/cluster/FailureDetector.ts
+ var import_events = require("events");
+ var DEFAULT_FAILURE_DETECTOR_CONFIG = {
+ heartbeatIntervalMs: 1e3,
+ suspicionTimeoutMs: 5e3,
+ confirmationTimeoutMs: 1e4,
+ phiThreshold: 8,
+ minSamples: 10,
+ maxSamples: 100,
+ initialHeartbeatIntervalMs: 1e3
+ };
+ var FailureDetector = class extends import_events.EventEmitter {
+ constructor(config = {}) {
+ super();
+ this.nodeStates = /* @__PURE__ */ new Map();
+ this.monitoringNodes = /* @__PURE__ */ new Set();
+ this.confirmationTimers = /* @__PURE__ */ new Map();
+ this.started = false;
+ this.config = { ...DEFAULT_FAILURE_DETECTOR_CONFIG, ...config };
+ }
+ /**
+ * Start the failure detector monitoring loop.
+ */
+ start() {
+ if (this.started) return;
+ this.started = true;
+ this.checkTimer = setInterval(() => {
+ this.checkAllNodes();
+ }, this.config.heartbeatIntervalMs);
+ logger.info({ config: this.config }, "FailureDetector started");
+ }
+ /**
+ * Stop the failure detector and clean up.
+ */
+ stop() {
+ if (!this.started) return;
+ this.started = false;
+ if (this.checkTimer) {
+ clearInterval(this.checkTimer);
+ this.checkTimer = void 0;
+ }
+ for (const timer of this.confirmationTimers.values()) {
+ clearTimeout(timer);
+ }
+ this.confirmationTimers.clear();
+ logger.info("FailureDetector stopped");
+ }
+ /**
+ * Start monitoring a node.
+ */
+ startMonitoring(nodeId) {
+ if (this.monitoringNodes.has(nodeId)) return;
+ this.monitoringNodes.add(nodeId);
+ this.nodeStates.set(nodeId, {
+ lastHeartbeat: Date.now(),
+ intervalHistory: [],
+ isSuspected: false,
+ isConfirmedFailed: false
+ });
+ logger.debug({ nodeId }, "Started monitoring node");
+ }
+ /**
+ * Stop monitoring a node.
+ */
+ stopMonitoring(nodeId) {
+ this.monitoringNodes.delete(nodeId);
+ this.nodeStates.delete(nodeId);
+ const timer = this.confirmationTimers.get(nodeId);
+ if (timer) {
+ clearTimeout(timer);
+ this.confirmationTimers.delete(nodeId);
+ }
+ logger.debug({ nodeId }, "Stopped monitoring node");
+ }
+ /**
+ * Record a heartbeat from a node.
+ * This updates the node's state and clears any suspicion.
+ */
+ recordHeartbeat(nodeId) {
+ const state = this.nodeStates.get(nodeId);
+ if (!state) {
+ this.startMonitoring(nodeId);
+ return;
+ }
+ const now = Date.now();
+ const interval = now - state.lastHeartbeat;
+ state.intervalHistory.push(interval);
+ if (state.intervalHistory.length > this.config.maxSamples) {
+ state.intervalHistory.shift();
+ }
+ state.lastHeartbeat = now;
+ if (state.isSuspected) {
+ state.isSuspected = false;
+ state.suspicionStartTime = void 0;
+ state.isConfirmedFailed = false;
+ const timer = this.confirmationTimers.get(nodeId);
+ if (timer) {
+ clearTimeout(timer);
+ this.confirmationTimers.delete(nodeId);
+ }
+ this.emit("nodeRecovered", { nodeId });
+ logger.info({ nodeId }, "Node recovered");
+ }
+ }
+ /**
+ * Check all monitored nodes for failure.
+ */
+ checkAllNodes() {
+ for (const nodeId of this.monitoringNodes) {
+ const phi = this.calculatePhi(nodeId);
+ const state = this.nodeStates.get(nodeId);
+ if (!state) continue;
+ if (phi > this.config.phiThreshold) {
+ if (!state.isSuspected) {
+ state.isSuspected = true;
+ state.suspicionStartTime = Date.now();
+ this.emit("nodeSuspected", { nodeId, phi });
+ logger.warn({ nodeId, phi }, "Node suspected");
+ this.scheduleConfirmation(nodeId);
+ }
+ }
+ }
+ }
+ /**
+ * Schedule failure confirmation after suspicion timeout.
+ */
+ scheduleConfirmation(nodeId) {
+ const existingTimer = this.confirmationTimers.get(nodeId);
+ if (existingTimer) {
+ clearTimeout(existingTimer);
+ }
+ const timer = setTimeout(() => {
+ this.confirmFailure(nodeId);
+ }, this.config.confirmationTimeoutMs);
+ this.confirmationTimers.set(nodeId, timer);
+ }
+ /**
+ * Confirm node failure after confirmation timeout.
+ */
+ confirmFailure(nodeId) {
+ const state = this.nodeStates.get(nodeId);
+ if (!state) return;
+ if (state.isSuspected && !state.isConfirmedFailed) {
+ state.isConfirmedFailed = true;
+ this.emit("nodeConfirmedFailed", { nodeId });
+ logger.error({ nodeId }, "Node failure confirmed");
+ }
+ this.confirmationTimers.delete(nodeId);
+ }
+ /**
+ * Calculate the phi value for a node using the Phi Accrual algorithm.
+ *
+ * Phi = -log10(P_later(t_now - t_last))
+ *
+ * where P_later is the probability that a heartbeat will arrive later than expected.
+ */
+ calculatePhi(nodeId) {
+ const state = this.nodeStates.get(nodeId);
+ if (!state) return 0;
+ const now = Date.now();
+ const timeSinceLastHeartbeat = now - state.lastHeartbeat;
+ if (state.intervalHistory.length < this.config.minSamples) {
+ const expectedInterval = this.config.initialHeartbeatIntervalMs;
+ return timeSinceLastHeartbeat / expectedInterval;
+ }
+ const mean = this.calculateMean(state.intervalHistory);
+ const variance = this.calculateVariance(state.intervalHistory, mean);
+ const stdDev = Math.sqrt(variance);
+ if (timeSinceLastHeartbeat <= mean) {
+ return 0;
+ }
+ const deviations = stdDev > 0 ? (timeSinceLastHeartbeat - mean) / stdDev : 0;
+ const phi = Math.max(0, deviations);
+ return phi;
+ }
+ /**
+ * Calculate mean of an array of numbers.
+ */
+ calculateMean(values) {
+ if (values.length === 0) return 0;
+ return values.reduce((sum, v) => sum + v, 0) / values.length;
+ }
+ /**
+ * Calculate variance of an array of numbers.
+ */
+ calculateVariance(values, mean) {
+ if (values.length < 2) return 0;
+ return values.reduce((sum, v) => sum + Math.pow(v - mean, 2), 0) / values.length;
+ }
+ /**
+ * Get list of currently suspected nodes.
+ */
+ getSuspectedNodes() {
+ const suspected = [];
+ for (const [nodeId, state] of this.nodeStates) {
+ if (state.isSuspected) {
+ suspected.push(nodeId);
+ }
+ }
+ return suspected;
+ }
+ /**
+ * Get list of confirmed failed nodes.
+ */
+ getConfirmedFailedNodes() {
+ const failed = [];
+ for (const [nodeId, state] of this.nodeStates) {
+ if (state.isConfirmedFailed) {
+ failed.push(nodeId);
+ }
+ }
+ return failed;
+ }
+ /**
+ * Check if a specific node is suspected.
+ */
+ isSuspected(nodeId) {
+ return this.nodeStates.get(nodeId)?.isSuspected ?? false;
+ }
+ /**
+ * Check if a specific node's failure is confirmed.
+ */
+ isConfirmedFailed(nodeId) {
+ return this.nodeStates.get(nodeId)?.isConfirmedFailed ?? false;
+ }
+ /**
+ * Get the current phi value for a node.
+ */
+ getPhi(nodeId) {
+ return this.calculatePhi(nodeId);
+ }
+ /**
+ * Get all monitored nodes.
+ */
+ getMonitoredNodes() {
+ return Array.from(this.monitoringNodes);
+ }
+ /**
+ * Get metrics for monitoring.
+ */
+ getMetrics() {
+ let suspectedCount = 0;
+ let confirmedCount = 0;
+ for (const state of this.nodeStates.values()) {
+ if (state.isSuspected) suspectedCount++;
+ if (state.isConfirmedFailed) confirmedCount++;
+ }
+ return {
+ monitoredNodes: this.monitoringNodes.size,
+ suspectedNodes: suspectedCount,
+ confirmedFailedNodes: confirmedCount
+ };
+ }
+ };
+
+ // src/cluster/ClusterManager.ts
+ var ClusterManager = class extends import_events2.EventEmitter {
  constructor(config) {
  super();
  this.members = /* @__PURE__ */ new Map();
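
Editor's note: the hunk above introduces a phi-accrual-style failure detector. One wording caveat: the doc comment quotes the textbook formula `Phi = -log10(P_later(t_now - t_last))`, but `calculatePhi()` actually returns a simplified statistic — the number of standard deviations by which the current silence exceeds the mean observed heartbeat interval (a z-score), with a plain ratio fallback until `minSamples` heartbeats have been seen. A usage sketch against the API shown above, assuming `cluster` is a started `ClusterManager` from this package:

```ts
import { ClusterManager } from "@topgunbuild/server";

declare const cluster: ClusterManager; // a started instance

// FailureDetector is internal; it is reached via the accessor added below.
const detector = cluster.getFailureDetector();

detector.on("nodeSuspected", ({ nodeId, phi }: { nodeId: string; phi: number }) => {
  // Fired when phi exceeds phiThreshold (default 8): the node has been silent
  // much longer than its heartbeat history predicts.
  console.warn(`suspecting ${nodeId}, phi=${phi.toFixed(2)}`);
});

detector.on("nodeRecovered", ({ nodeId }: { nodeId: string }) => {
  // A late heartbeat clears suspicion and cancels the confirmation timer.
  console.info(`${nodeId} recovered`);
});

detector.on("nodeConfirmedFailed", ({ nodeId }: { nodeId: string }) => {
  // Fired confirmationTimeoutMs (default 10s) after suspicion if no heartbeat
  // arrived in the meantime.
  console.error(`confirmed failure: ${nodeId}`);
});
```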
@@ -645,6 +911,30 @@ var ClusterManager = class extends import_events.EventEmitter {
  this.reconnectIntervals = /* @__PURE__ */ new Map();
  this._actualPort = 0;
  this.config = config;
+ this.failureDetector = new FailureDetector({
+ ...DEFAULT_FAILURE_DETECTOR_CONFIG,
+ heartbeatIntervalMs: config.heartbeatIntervalMs ?? 1e3,
+ ...config.failureDetection
+ });
+ this.failureDetector.on("nodeSuspected", (event) => {
+ logger.warn({ nodeId: event.nodeId, phi: event.phi }, "Node suspected (failure detector)");
+ this.emit("nodeSuspected", event.nodeId, event.phi);
+ });
+ this.failureDetector.on("nodeRecovered", (event) => {
+ logger.info({ nodeId: event.nodeId }, "Node recovered (failure detector)");
+ this.emit("nodeRecovered", event.nodeId);
+ });
+ this.failureDetector.on("nodeConfirmedFailed", (event) => {
+ logger.error({ nodeId: event.nodeId }, "Node failure confirmed");
+ this.emit("nodeConfirmedFailed", event.nodeId);
+ this.handleNodeFailure(event.nodeId);
+ });
+ }
+ /**
+ * Get the failure detector instance.
+ */
+ getFailureDetector() {
+ return this.failureDetector;
  }
  /** Get the actual port the cluster is listening on */
  get port() {
@@ -696,6 +986,8 @@ var ClusterManager = class extends import_events.EventEmitter {
  }
  stop() {
  logger.info({ port: this.config.port }, "Stopping Cluster Manager");
+ this.stopHeartbeat();
+ this.failureDetector.stop();
  for (const timeout of this.reconnectIntervals.values()) {
  clearTimeout(timeout);
  }
715
1007
  this.server.close();
716
1008
  }
717
1009
  }
1010
+ /**
1011
+ * Start sending heartbeats to all peers.
1012
+ */
1013
+ startHeartbeat() {
1014
+ if (this.heartbeatTimer) return;
1015
+ const intervalMs = this.config.heartbeatIntervalMs ?? 1e3;
1016
+ this.heartbeatTimer = setInterval(() => {
1017
+ this.sendHeartbeatToAll();
1018
+ }, intervalMs);
1019
+ this.failureDetector.start();
1020
+ logger.debug({ intervalMs }, "Heartbeat started");
1021
+ }
1022
+ /**
1023
+ * Stop sending heartbeats.
1024
+ */
1025
+ stopHeartbeat() {
1026
+ if (this.heartbeatTimer) {
1027
+ clearInterval(this.heartbeatTimer);
1028
+ this.heartbeatTimer = void 0;
1029
+ }
1030
+ }
1031
+ /**
1032
+ * Send heartbeat to all connected peers.
1033
+ */
1034
+ sendHeartbeatToAll() {
1035
+ for (const [nodeId, member] of this.members) {
1036
+ if (member.isSelf) continue;
1037
+ if (member.socket && member.socket.readyState === import_ws.WebSocket.OPEN) {
1038
+ this.send(nodeId, "HEARTBEAT", { timestamp: Date.now() });
1039
+ }
1040
+ }
1041
+ }
1042
+ /**
1043
+ * Handle incoming heartbeat from a peer.
1044
+ */
1045
+ handleHeartbeat(senderId, _payload) {
1046
+ this.failureDetector.recordHeartbeat(senderId);
1047
+ }
1048
+ /**
1049
+ * Handle confirmed node failure.
1050
+ */
1051
+ handleNodeFailure(nodeId) {
1052
+ const member = this.members.get(nodeId);
1053
+ if (!member) return;
1054
+ logger.warn({ nodeId }, "Removing failed node from cluster");
1055
+ if (member.socket && member.socket.readyState !== import_ws.WebSocket.CLOSED) {
1056
+ try {
1057
+ member.socket.terminate();
1058
+ } catch (e) {
1059
+ }
1060
+ }
1061
+ this.members.delete(nodeId);
1062
+ this.failureDetector.stopMonitoring(nodeId);
1063
+ this.emit("memberLeft", nodeId);
1064
+ }
718
1065
  connectToPeers() {
719
1066
  for (const peer of this.config.peers) {
720
1067
  this.connectToPeer(peer);
@@ -833,7 +1180,13 @@ var ClusterManager = class extends import_events.EventEmitter {
833
1180
  socket: ws,
834
1181
  isSelf: false
835
1182
  });
1183
+ this.failureDetector.startMonitoring(remoteNodeId);
1184
+ this.startHeartbeat();
836
1185
  this.emit("memberJoined", remoteNodeId);
1186
+ } else if (msg.type === "HEARTBEAT") {
1187
+ if (remoteNodeId) {
1188
+ this.handleHeartbeat(remoteNodeId, msg.payload);
1189
+ }
837
1190
  } else {
838
1191
  this.emit("message", msg);
839
1192
  }
@@ -847,6 +1200,7 @@ var ClusterManager = class extends import_events.EventEmitter {
847
1200
  if (current && current.socket === ws) {
848
1201
  logger.info({ nodeId: remoteNodeId }, "Peer disconnected");
849
1202
  this.members.delete(remoteNodeId);
1203
+ this.failureDetector.stopMonitoring(remoteNodeId);
850
1204
  this.emit("memberLeft", remoteNodeId);
851
1205
  if (initiated && peerAddress) {
852
1206
  this.scheduleReconnect(peerAddress, 0);
@@ -900,21 +1254,639 @@ var ClusterManager = class extends import_events.EventEmitter {
900
1254
  };
901
1255
 
902
1256
  // src/cluster/PartitionService.ts
1257
+ var import_events4 = require("events");
1258
+
1259
+ // src/cluster/MigrationManager.ts
1260
+ var import_events3 = require("events");
903
1261
  var import_core3 = require("@topgunbuild/core");
904
- var PartitionService = class {
905
- // Standard Hazelcast default
906
- constructor(cluster) {
1262
+ var import_native = require("@topgunbuild/native");
1263
+ var MigrationManager = class extends import_events3.EventEmitter {
1264
+ constructor(clusterManager, partitionService, config = {}) {
1265
+ super();
1266
+ // Active outgoing migrations (this node is source)
1267
+ this.activeMigrations = /* @__PURE__ */ new Map();
1268
+ // Queue of migrations to process
1269
+ this.migrationQueue = [];
1270
+ // Incoming migrations (this node is target)
1271
+ this.incomingMigrations = /* @__PURE__ */ new Map();
1272
+ // Pending chunk acknowledgments
1273
+ this.pendingChunkAcks = /* @__PURE__ */ new Map();
1274
+ // Pending verification results
1275
+ this.pendingVerifications = /* @__PURE__ */ new Map();
1276
+ // Metrics tracking
1277
+ this.metrics = {
1278
+ migrationsStarted: 0,
1279
+ migrationsCompleted: 0,
1280
+ migrationsFailed: 0,
1281
+ chunksTransferred: 0,
1282
+ bytesTransferred: 0,
1283
+ activeMigrations: 0,
1284
+ queuedMigrations: 0
1285
+ };
1286
+ // Batch processing timer
1287
+ this.batchTimer = null;
1288
+ // Data collection callback (injected from ServerCoordinator)
1289
+ this.dataCollector = null;
1290
+ // Data storage callback (injected from ServerCoordinator)
1291
+ this.dataStorer = null;
1292
+ this.clusterManager = clusterManager;
1293
+ this.partitionService = partitionService;
1294
+ this.config = {
1295
+ ...import_core3.DEFAULT_MIGRATION_CONFIG,
1296
+ ...config
1297
+ };
1298
+ this.setupMessageHandlers();
1299
+ }
1300
+ // ============================================
1301
+ // Configuration
1302
+ // ============================================
1303
+ /**
1304
+ * Set the data collector callback
1305
+ * Called to collect all records for a partition before migration
1306
+ */
1307
+ setDataCollector(collector) {
1308
+ this.dataCollector = collector;
1309
+ }
1310
+ /**
1311
+ * Set the data storer callback
1312
+ * Called to store received records after successful migration
1313
+ */
1314
+ setDataStorer(storer) {
1315
+ this.dataStorer = storer;
1316
+ }
1317
+ // ============================================
1318
+ // Migration Planning
1319
+ // ============================================
1320
+ /**
1321
+ * Plan migration for topology change
1322
+ */
1323
+ planMigration(oldDistribution, newDistribution) {
1324
+ const migrations = [];
1325
+ for (const [partitionId, newDist] of newDistribution) {
1326
+ const oldDist = oldDistribution.get(partitionId);
1327
+ const oldOwner = oldDist?.owner ?? this.clusterManager.config.nodeId;
1328
+ const newOwner = newDist.owner;
1329
+ if (oldOwner !== newOwner && oldOwner === this.clusterManager.config.nodeId) {
1330
+ migrations.push({
1331
+ partitionId,
1332
+ state: import_core3.PartitionState.STABLE,
1333
+ sourceNode: oldOwner,
1334
+ targetNode: newOwner,
1335
+ startTime: 0,
1336
+ bytesTransferred: 0,
1337
+ totalBytes: 0,
1338
+ retryCount: 0
1339
+ });
1340
+ }
1341
+ }
1342
+ migrations.sort((a, b) => a.partitionId - b.partitionId);
1343
+ this.migrationQueue = migrations;
1344
+ this.metrics.queuedMigrations = migrations.length;
1345
+ logger.info({ total: migrations.length }, "Migration planned");
1346
+ this.emit("migrationPlanned", { total: migrations.length });
1347
+ if (migrations.length > 0) {
1348
+ this.startBatchProcessing();
1349
+ }
1350
+ }
1351
+ /**
1352
+ * Start batch processing timer
1353
+ */
1354
+ startBatchProcessing() {
1355
+ if (this.batchTimer) return;
1356
+ this.startNextBatch().catch((err) => {
1357
+ logger.error({ error: err }, "Failed to start first migration batch");
1358
+ this.emit("error", err);
1359
+ });
1360
+ this.batchTimer = setInterval(() => {
1361
+ this.startNextBatch().catch((err) => {
1362
+ logger.error({ error: err }, "Failed to start migration batch");
1363
+ this.emit("error", err);
1364
+ });
1365
+ }, this.config.batchIntervalMs);
1366
+ }
1367
+ /**
1368
+ * Stop batch processing
1369
+ */
1370
+ stopBatchProcessing() {
1371
+ if (this.batchTimer) {
1372
+ clearInterval(this.batchTimer);
1373
+ this.batchTimer = null;
1374
+ }
1375
+ }
1376
+ /**
1377
+ * Start next batch of migrations
1378
+ */
1379
+ async startNextBatch() {
1380
+ if (this.activeMigrations.size >= this.config.parallelTransfers) {
1381
+ return;
1382
+ }
1383
+ const slotsAvailable = this.config.parallelTransfers - this.activeMigrations.size;
1384
+ const batch = this.migrationQueue.splice(0, Math.min(slotsAvailable, this.config.batchSize));
1385
+ if (batch.length === 0) {
1386
+ if (this.migrationQueue.length === 0 && this.activeMigrations.size === 0) {
1387
+ this.stopBatchProcessing();
1388
+ }
1389
+ return;
1390
+ }
1391
+ for (const migration of batch) {
1392
+ migration.state = import_core3.PartitionState.MIGRATING;
1393
+ migration.startTime = Date.now();
1394
+ this.activeMigrations.set(migration.partitionId, migration);
1395
+ this.metrics.migrationsStarted++;
1396
+ this.metrics.activeMigrations = this.activeMigrations.size;
1397
+ this.metrics.queuedMigrations = this.migrationQueue.length;
1398
+ this.startPartitionMigration(migration).catch((error) => {
1399
+ this.onMigrationFailed(migration.partitionId, error);
1400
+ });
1401
+ }
1402
+ logger.info({ count: batch.length, remaining: this.migrationQueue.length }, "Batch started");
1403
+ this.emit("batchStarted", { count: batch.length, remaining: this.migrationQueue.length });
1404
+ }
1405
+ // ============================================
1406
+ // Migration Execution
1407
+ // ============================================
1408
+ /**
1409
+ * Start migration for a single partition
1410
+ */
1411
+ async startPartitionMigration(migration) {
1412
+ const { partitionId, targetNode } = migration;
1413
+ logger.info({ partitionId, targetNode }, "Starting partition migration");
1414
+ let records;
1415
+ if (this.dataCollector) {
1416
+ records = await this.dataCollector(partitionId);
1417
+ } else {
1418
+ records = [];
1419
+ }
1420
+ migration.totalBytes = records.reduce((sum, r) => sum + r.length, 0);
1421
+ this.clusterManager.send(targetNode, "OP_FORWARD", {
1422
+ _migration: {
1423
+ type: "MIGRATION_START",
1424
+ payload: {
1425
+ partitionId,
1426
+ sourceNode: this.clusterManager.config.nodeId,
1427
+ estimatedSize: migration.totalBytes
1428
+ }
1429
+ }
1430
+ });
1431
+ const chunks = this.chunkify(records);
1432
+ for (let i = 0; i < chunks.length; i++) {
1433
+ const chunk = chunks[i];
1434
+ const checksum = this.calculateChecksum(chunk);
1435
+ this.clusterManager.send(targetNode, "OP_FORWARD", {
1436
+ _migration: {
1437
+ type: "MIGRATION_CHUNK",
1438
+ payload: {
1439
+ partitionId,
1440
+ chunkIndex: i,
1441
+ totalChunks: chunks.length,
1442
+ data: Array.from(chunk),
1443
+ // Convert Uint8Array to array for JSON serialization
1444
+ checksum
1445
+ }
1446
+ }
1447
+ });
1448
+ await this.waitForChunkAck(partitionId, i);
1449
+ migration.bytesTransferred += chunk.length;
1450
+ this.metrics.chunksTransferred++;
1451
+ this.metrics.bytesTransferred += chunk.length;
1452
+ this.emit("migrationProgress", migration);
1453
+ }
1454
+ const fullChecksum = this.calculatePartitionChecksum(records);
1455
+ migration.state = import_core3.PartitionState.SYNC;
1456
+ this.clusterManager.send(targetNode, "OP_FORWARD", {
1457
+ _migration: {
1458
+ type: "MIGRATION_COMPLETE",
1459
+ payload: {
1460
+ partitionId,
1461
+ totalRecords: records.length,
1462
+ checksum: fullChecksum
1463
+ }
1464
+ }
1465
+ });
1466
+ const verified = await this.waitForVerification(partitionId);
1467
+ if (verified) {
1468
+ await this.onMigrationComplete(partitionId);
1469
+ } else {
1470
+ throw new Error(`Migration verification failed for partition ${partitionId}`);
1471
+ }
1472
+ }
1473
+ /**
1474
+ * Split records into chunks
1475
+ */
1476
+ chunkify(records) {
1477
+ const chunks = [];
1478
+ let currentChunk = [];
1479
+ let currentSize = 0;
1480
+ for (const record of records) {
1481
+ const lengthPrefix = new Uint8Array(4);
1482
+ new DataView(lengthPrefix.buffer).setUint32(0, record.length, true);
1483
+ currentChunk.push(...lengthPrefix, ...record);
1484
+ currentSize += 4 + record.length;
1485
+ if (currentSize >= this.config.transferChunkSize) {
1486
+ chunks.push(new Uint8Array(currentChunk));
1487
+ currentChunk = [];
1488
+ currentSize = 0;
1489
+ }
1490
+ }
1491
+ if (currentChunk.length > 0) {
1492
+ chunks.push(new Uint8Array(currentChunk));
1493
+ }
1494
+ if (chunks.length === 0) {
1495
+ chunks.push(new Uint8Array(0));
1496
+ }
1497
+ return chunks;
1498
+ }
1499
+ /**
1500
+ * Calculate checksum for a chunk using native xxhash
1501
+ */
1502
+ calculateChecksum(data) {
1503
+ return String((0, import_native.xxhash64AsNumber)(data));
1504
+ }
1505
+ /**
1506
+ * Calculate checksum for all partition records using streaming xxhash
1507
+ */
1508
+ calculatePartitionChecksum(records) {
1509
+ const state = (0, import_native.createXxHash64State)();
1510
+ for (const record of records) {
1511
+ state.update(record);
1512
+ }
1513
+ return String(state.digestAsNumber());
1514
+ }
1515
+ /**
1516
+ * Wait for chunk acknowledgment
1517
+ */
1518
+ waitForChunkAck(partitionId, chunkIndex) {
1519
+ return new Promise((resolve, reject) => {
1520
+ const key = `${partitionId}:${chunkIndex}`;
1521
+ const timeout = setTimeout(() => {
1522
+ this.pendingChunkAcks.delete(key);
1523
+ reject(new Error(`Chunk ack timeout for partition ${partitionId}, chunk ${chunkIndex}`));
1524
+ }, this.config.syncTimeoutMs);
1525
+ this.pendingChunkAcks.set(key, { resolve, reject, timeout });
1526
+ });
1527
+ }
1528
+ /**
1529
+ * Wait for migration verification
1530
+ */
1531
+ waitForVerification(partitionId) {
1532
+ return new Promise((resolve) => {
1533
+ const timeout = setTimeout(() => {
1534
+ this.pendingVerifications.delete(partitionId);
1535
+ resolve(false);
1536
+ }, this.config.syncTimeoutMs);
1537
+ this.pendingVerifications.set(partitionId, { resolve, timeout });
1538
+ });
1539
+ }
1540
+ // ============================================
1541
+ // Migration Completion
1542
+ // ============================================
1543
+ /**
1544
+ * Handle successful migration completion
1545
+ */
1546
+ async onMigrationComplete(partitionId) {
1547
+ const migration = this.activeMigrations.get(partitionId);
1548
+ if (!migration) return;
1549
+ migration.state = import_core3.PartitionState.STABLE;
1550
+ this.activeMigrations.delete(partitionId);
1551
+ this.metrics.migrationsCompleted++;
1552
+ this.metrics.activeMigrations = this.activeMigrations.size;
1553
+ logger.info({
1554
+ partitionId,
1555
+ duration: Date.now() - migration.startTime,
1556
+ bytesTransferred: migration.bytesTransferred
1557
+ }, "Migration completed");
1558
+ this.emit("migrationComplete", partitionId);
1559
+ }
1560
+ /**
1561
+ * Handle migration failure
1562
+ */
1563
+ async onMigrationFailed(partitionId, error) {
1564
+ const migration = this.activeMigrations.get(partitionId);
1565
+ if (!migration) return;
1566
+ migration.retryCount++;
1567
+ if (migration.retryCount <= this.config.maxRetries) {
1568
+ migration.state = import_core3.PartitionState.STABLE;
1569
+ migration.bytesTransferred = 0;
1570
+ this.activeMigrations.delete(partitionId);
1571
+ this.migrationQueue.unshift(migration);
1572
+ this.metrics.queuedMigrations = this.migrationQueue.length;
1573
+ this.metrics.activeMigrations = this.activeMigrations.size;
1574
+ logger.warn({
1575
+ partitionId,
1576
+ retryCount: migration.retryCount,
1577
+ error: error.message
1578
+ }, "Migration failed, will retry");
1579
+ } else {
1580
+ migration.state = import_core3.PartitionState.FAILED;
1581
+ this.activeMigrations.delete(partitionId);
1582
+ this.metrics.migrationsFailed++;
1583
+ this.metrics.activeMigrations = this.activeMigrations.size;
1584
+ logger.error({
1585
+ partitionId,
1586
+ retryCount: migration.retryCount,
1587
+ error: error.message
1588
+ }, "Migration failed permanently");
1589
+ this.emit("migrationFailed", partitionId, error);
1590
+ }
1591
+ }
1592
+ // ============================================
1593
+ // Incoming Migration Handlers (Target Node)
1594
+ // ============================================
1595
+ /**
1596
+ * Handle MIGRATION_START message
1597
+ */
1598
+ handleMigrationStart(payload) {
1599
+ const { partitionId, sourceNode, estimatedSize } = payload;
1600
+ logger.info({ partitionId, sourceNode, estimatedSize }, "Receiving migration");
1601
+ this.incomingMigrations.set(partitionId, {
1602
+ sourceNode,
1603
+ chunks: [],
1604
+ expectedSize: estimatedSize,
1605
+ receivedSize: 0,
1606
+ startTime: Date.now()
1607
+ });
1608
+ }
1609
+ /**
1610
+ * Handle MIGRATION_CHUNK message
1611
+ */
1612
+ handleMigrationChunk(payload) {
1613
+ const { partitionId, chunkIndex, data, checksum } = payload;
1614
+ const incoming = this.incomingMigrations.get(partitionId);
1615
+ if (!incoming) {
1616
+ logger.warn({ partitionId, chunkIndex }, "Received chunk for unknown migration");
1617
+ return;
1618
+ }
1619
+ const chunkData = new Uint8Array(data);
1620
+ const actualChecksum = this.calculateChecksum(chunkData);
1621
+ const success = actualChecksum === checksum;
1622
+ if (success) {
1623
+ incoming.chunks[chunkIndex] = chunkData;
1624
+ incoming.receivedSize += chunkData.length;
1625
+ } else {
1626
+ logger.warn({ partitionId, chunkIndex, expected: checksum, actual: actualChecksum }, "Chunk checksum mismatch");
1627
+ }
1628
+ this.clusterManager.send(incoming.sourceNode, "OP_FORWARD", {
1629
+ _migration: {
1630
+ type: "MIGRATION_CHUNK_ACK",
1631
+ payload: {
1632
+ partitionId,
1633
+ chunkIndex,
1634
+ success
1635
+ }
1636
+ }
1637
+ });
1638
+ }
1639
+ /**
1640
+ * Handle MIGRATION_COMPLETE message
1641
+ */
1642
+ async handleMigrationComplete(payload) {
1643
+ const { partitionId, totalRecords, checksum } = payload;
1644
+ const incoming = this.incomingMigrations.get(partitionId);
1645
+ if (!incoming) {
1646
+ logger.warn({ partitionId }, "Received complete for unknown migration");
1647
+ return;
1648
+ }
1649
+ const allData = this.reassemble(incoming.chunks);
1650
+ const records = this.deserializeRecords(allData);
1651
+ const actualChecksum = this.calculatePartitionChecksum(records);
1652
+ const checksumMatch = actualChecksum === checksum;
1653
+ const success = checksumMatch && records.length === totalRecords;
1654
+ if (success && this.dataStorer) {
1655
+ await this.dataStorer(partitionId, records);
1656
+ }
1657
+ logger.info({
1658
+ partitionId,
1659
+ duration: Date.now() - incoming.startTime,
1660
+ records: records.length,
1661
+ checksumMatch
1662
+ }, "Migration received");
1663
+ this.clusterManager.send(incoming.sourceNode, "OP_FORWARD", {
1664
+ _migration: {
1665
+ type: "MIGRATION_VERIFY",
1666
+ payload: {
1667
+ partitionId,
1668
+ success,
1669
+ checksumMatch
1670
+ }
1671
+ }
1672
+ });
1673
+ this.incomingMigrations.delete(partitionId);
1674
+ }
1675
+ /**
1676
+ * Handle MIGRATION_CHUNK_ACK message
1677
+ */
1678
+ handleMigrationChunkAck(payload) {
1679
+ const { partitionId, chunkIndex, success } = payload;
1680
+ const key = `${partitionId}:${chunkIndex}`;
1681
+ const pending = this.pendingChunkAcks.get(key);
1682
+ if (pending) {
1683
+ clearTimeout(pending.timeout);
1684
+ this.pendingChunkAcks.delete(key);
1685
+ if (success) {
1686
+ pending.resolve();
1687
+ } else {
1688
+ pending.reject(new Error(`Chunk ${chunkIndex} rejected by target`));
1689
+ }
1690
+ }
1691
+ }
1692
+ /**
1693
+ * Handle MIGRATION_VERIFY message
1694
+ */
1695
+ handleMigrationVerify(payload) {
1696
+ const { partitionId, success } = payload;
1697
+ const pending = this.pendingVerifications.get(partitionId);
1698
+ if (pending) {
1699
+ clearTimeout(pending.timeout);
1700
+ this.pendingVerifications.delete(partitionId);
1701
+ pending.resolve(success);
1702
+ }
1703
+ }
1704
+ /**
1705
+ * Reassemble chunks into continuous data
1706
+ */
1707
+ reassemble(chunks) {
1708
+ const totalLength = chunks.reduce((sum, c) => sum + (c?.length ?? 0), 0);
1709
+ const result = new Uint8Array(totalLength);
1710
+ let offset = 0;
1711
+ for (const chunk of chunks) {
1712
+ if (chunk) {
1713
+ result.set(chunk, offset);
1714
+ offset += chunk.length;
1715
+ }
1716
+ }
1717
+ return result;
1718
+ }
1719
+ /**
1720
+ * Deserialize records from chunk data
1721
+ */
1722
+ deserializeRecords(data) {
1723
+ const records = [];
1724
+ let offset = 0;
1725
+ while (offset < data.length) {
1726
+ if (offset + 4 > data.length) break;
1727
+ const length = new DataView(data.buffer, data.byteOffset + offset, 4).getUint32(0, true);
1728
+ offset += 4;
1729
+ if (offset + length > data.length) break;
1730
+ records.push(data.slice(offset, offset + length));
1731
+ offset += length;
1732
+ }
1733
+ return records;
1734
+ }
1735
+ // ============================================
1736
+ // Message Handling
1737
+ // ============================================
1738
+ /**
1739
+ * Setup cluster message handlers
1740
+ */
1741
+ setupMessageHandlers() {
1742
+ this.clusterManager.on("message", (msg) => {
1743
+ if (msg.payload?._migration) {
1744
+ const migration = msg.payload._migration;
1745
+ switch (migration.type) {
1746
+ case "MIGRATION_START":
1747
+ this.handleMigrationStart(migration.payload);
1748
+ break;
1749
+ case "MIGRATION_CHUNK":
1750
+ this.handleMigrationChunk(migration.payload);
1751
+ break;
1752
+ case "MIGRATION_COMPLETE":
1753
+ this.handleMigrationComplete(migration.payload).catch((err) => {
1754
+ logger.error({ error: err }, "Error handling migration complete");
1755
+ });
1756
+ break;
1757
+ case "MIGRATION_CHUNK_ACK":
1758
+ this.handleMigrationChunkAck(migration.payload);
1759
+ break;
1760
+ case "MIGRATION_VERIFY":
1761
+ this.handleMigrationVerify(migration.payload);
1762
+ break;
1763
+ }
1764
+ }
1765
+ });
1766
+ }
1767
+ // ============================================
1768
+ // Status and Metrics
1769
+ // ============================================
1770
+ /**
1771
+ * Check if a partition is currently migrating
1772
+ */
1773
+ isActive(partitionId) {
1774
+ return this.activeMigrations.has(partitionId) || this.incomingMigrations.has(partitionId);
1775
+ }
1776
+ /**
1777
+ * Get migration status
1778
+ */
1779
+ getStatus() {
1780
+ const avgMigrationTime = this.metrics.migrationsCompleted > 0 ? Date.now() - (this.activeMigrations.values().next().value?.startTime ?? Date.now()) : 0;
1781
+ const estimatedTimeRemainingMs = (this.migrationQueue.length + this.activeMigrations.size) * (avgMigrationTime || 1e3);
1782
+ return {
1783
+ inProgress: this.activeMigrations.size > 0 || this.migrationQueue.length > 0,
1784
+ active: Array.from(this.activeMigrations.values()),
1785
+ queued: this.migrationQueue.length,
1786
+ completed: this.metrics.migrationsCompleted,
1787
+ failed: this.metrics.migrationsFailed,
1788
+ estimatedTimeRemainingMs
1789
+ };
1790
+ }
1791
+ /**
1792
+ * Get migration metrics
1793
+ */
1794
+ getMetrics() {
1795
+ return { ...this.metrics };
1796
+ }
1797
+ /**
1798
+ * Cancel all active and queued migrations
1799
+ */
1800
+ async cancelAll() {
1801
+ this.stopBatchProcessing();
1802
+ this.migrationQueue = [];
1803
+ this.metrics.queuedMigrations = 0;
1804
+ for (const [partitionId, migration] of this.activeMigrations) {
1805
+ migration.state = import_core3.PartitionState.FAILED;
1806
+ this.metrics.migrationsFailed++;
1807
+ this.emit("migrationFailed", partitionId, new Error("Migration cancelled"));
1808
+ }
1809
+ this.activeMigrations.clear();
1810
+ this.metrics.activeMigrations = 0;
1811
+ for (const pending of this.pendingChunkAcks.values()) {
1812
+ clearTimeout(pending.timeout);
1813
+ pending.reject(new Error("Migration cancelled"));
1814
+ }
1815
+ this.pendingChunkAcks.clear();
1816
+ for (const pending of this.pendingVerifications.values()) {
1817
+ clearTimeout(pending.timeout);
1818
+ pending.resolve(false);
1819
+ }
1820
+ this.pendingVerifications.clear();
1821
+ this.incomingMigrations.clear();
1822
+ logger.info("All migrations cancelled");
1823
+ }
1824
+ /**
1825
+ * Cleanup resources (sync version for backwards compatibility)
1826
+ */
1827
+ close() {
1828
+ this.cancelAll();
1829
+ }
1830
+ /**
1831
+ * Async cleanup - waits for cancellation to complete
1832
+ */
1833
+ async closeAsync() {
1834
+ await this.cancelAll();
1835
+ this.removeAllListeners();
1836
+ }
1837
+ };
1838
+
1839
+ // src/cluster/PartitionService.ts
1840
+ var import_core4 = require("@topgunbuild/core");
1841
+ var DEFAULT_PARTITION_SERVICE_CONFIG = {
1842
+ gradualRebalancing: false,
1843
+ migration: import_core4.DEFAULT_MIGRATION_CONFIG
1844
+ };
1845
+ var PartitionService = class extends import_events4.EventEmitter {
1846
+ constructor(cluster, config = {}) {
1847
+ super();
907
1848
  // partitionId -> { owner, backups }
908
1849
  this.partitions = /* @__PURE__ */ new Map();
909
- this.PARTITION_COUNT = 271;
910
- this.BACKUP_COUNT = 1;
1850
+ this.PARTITION_COUNT = import_core4.PARTITION_COUNT;
1851
+ this.BACKUP_COUNT = import_core4.DEFAULT_BACKUP_COUNT;
1852
+ // Phase 4: Version tracking for partition map
1853
+ this.mapVersion = 0;
1854
+ this.lastRebalanceTime = 0;
1855
+ this.migrationManager = null;
911
1856
  this.cluster = cluster;
912
- this.cluster.on("memberJoined", () => this.rebalance());
913
- this.cluster.on("memberLeft", () => this.rebalance());
914
- this.rebalance();
1857
+ this.config = {
1858
+ ...DEFAULT_PARTITION_SERVICE_CONFIG,
1859
+ ...config
1860
+ };
1861
+ if (this.config.gradualRebalancing) {
1862
+ this.migrationManager = new MigrationManager(
1863
+ cluster,
1864
+ this,
1865
+ this.config.migration
1866
+ );
1867
+ this.migrationManager.on("migrationComplete", (partitionId) => {
1868
+ logger.info({ partitionId }, "Migration completed, updating ownership");
1869
+ });
1870
+ this.migrationManager.on("migrationFailed", (partitionId, error) => {
1871
+ logger.error({ partitionId, error: error.message }, "Migration failed");
1872
+ });
1873
+ }
1874
+ this.cluster.on("memberJoined", (nodeId) => this.onMembershipChange("JOIN", nodeId));
1875
+ this.cluster.on("memberLeft", (nodeId) => this.onMembershipChange("LEAVE", nodeId));
1876
+ this.rebalance("REBALANCE");
1877
+ }
1878
+ /**
1879
+ * Handle membership change
1880
+ */
1881
+ onMembershipChange(reason, nodeId) {
1882
+ if (this.config.gradualRebalancing && this.migrationManager) {
1883
+ this.rebalanceGradual(reason, nodeId);
1884
+ } else {
1885
+ this.rebalance(reason, nodeId);
1886
+ }
915
1887
  }
916
1888
  getPartitionId(key) {
917
- return Math.abs((0, import_core3.hashString)(key)) % this.PARTITION_COUNT;
1889
+ return Math.abs((0, import_core4.hashString)(key)) % this.PARTITION_COUNT;
918
1890
  }
919
1891
  getDistribution(key) {
920
1892
  const pId = this.getPartitionId(key);
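
Editor's note: the MigrationManager above moves a partition with five message types, all tunneled through `OP_FORWARD` under a `_migration` key — `MIGRATION_START` announces the estimated size, each `MIGRATION_CHUNK` carries length-prefixed records (4-byte little-endian size + payload, see `chunkify()`) plus an xxhash64 checksum and waits for a per-chunk `MIGRATION_CHUNK_ACK`, `MIGRATION_COMPLETE` sends the streaming whole-partition checksum, and `MIGRATION_VERIFY` closes the loop. Both data callbacks are injected; a wiring sketch, where the `storage` API is invented purely for illustration:

```ts
import { MigrationManager } from "@topgunbuild/server";

declare const migrations: MigrationManager;
declare const storage: {
  scanPartition(partitionId: number): Promise<Uint8Array[]>;
  putPartition(partitionId: number, records: Uint8Array[]): Promise<void>;
};

// Source side: collect every record of a partition before chunkify() frames it.
migrations.setDataCollector((partitionId: number) => storage.scanPartition(partitionId));

// Target side: invoked only after handleMigrationComplete() has verified both
// the whole-partition checksum and the record count.
migrations.setDataStorer((partitionId: number, records: Uint8Array[]) =>
  storage.putPartition(partitionId, records)
);
```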
@@ -936,12 +1908,78 @@ var PartitionService = class {
  isRelated(key) {
  return this.isLocalOwner(key) || this.isLocalBackup(key);
  }
- rebalance() {
+ // ============================================
+ // Phase 4: Partition Map Methods
+ // ============================================
+ /**
+ * Get current partition map version
+ */
+ getMapVersion() {
+ return this.mapVersion;
+ }
+ /**
+ * Generate full PartitionMap for client consumption
+ */
+ getPartitionMap() {
+ const nodes = [];
+ const partitions = [];
+ for (const nodeId of this.cluster.getMembers()) {
+ const isSelf = nodeId === this.cluster.config.nodeId;
+ const host = isSelf ? this.cluster.config.host : "unknown";
+ const port = isSelf ? this.cluster.port : 0;
+ nodes.push({
+ nodeId,
+ endpoints: {
+ websocket: `ws://${host}:${port}`
+ },
+ status: "ACTIVE"
+ });
+ }
+ for (let i = 0; i < this.PARTITION_COUNT; i++) {
+ const dist = this.partitions.get(i);
+ if (dist) {
+ partitions.push({
+ partitionId: i,
+ ownerNodeId: dist.owner,
+ backupNodeIds: dist.backups
+ });
+ }
+ }
+ return {
+ version: this.mapVersion,
+ partitionCount: this.PARTITION_COUNT,
+ nodes,
+ partitions,
+ generatedAt: Date.now()
+ };
+ }
+ /**
+ * Get partition info by ID
+ */
+ getPartitionInfo(partitionId) {
+ const dist = this.partitions.get(partitionId);
+ if (!dist) return null;
+ return {
+ partitionId,
+ ownerNodeId: dist.owner,
+ backupNodeIds: dist.backups
+ };
+ }
+ /**
+ * Get owner node for a partition ID
+ */
+ getPartitionOwner(partitionId) {
+ const dist = this.partitions.get(partitionId);
+ return dist?.owner ?? null;
+ }
+ rebalance(reason = "REBALANCE", triggerNodeId) {
+ const oldPartitions = new Map(this.partitions);
  let allMembers = this.cluster.getMembers().sort();
  if (allMembers.length === 0) {
  allMembers = [this.cluster.config.nodeId];
  }
- logger.info({ memberCount: allMembers.length, members: allMembers }, "Rebalancing partitions");
+ logger.info({ memberCount: allMembers.length, members: allMembers, reason }, "Rebalancing partitions");
+ const changes = [];
  for (let i = 0; i < this.PARTITION_COUNT; i++) {
  const ownerIndex = i % allMembers.length;
  const owner = allMembers[ownerIndex];
@@ -952,14 +1990,141 @@ var PartitionService = class {
  backups.push(allMembers[backupIndex]);
  }
  }
+ const oldDist = oldPartitions.get(i);
+ if (oldDist && oldDist.owner !== owner) {
+ changes.push({
+ partitionId: i,
+ previousOwner: oldDist.owner,
+ newOwner: owner,
+ reason
+ });
+ }
  this.partitions.set(i, { owner, backups });
  }
+ if (changes.length > 0 || this.mapVersion === 0) {
+ this.mapVersion++;
+ this.lastRebalanceTime = Date.now();
+ logger.info({
+ version: this.mapVersion,
+ changesCount: changes.length,
+ reason
+ }, "Partition map updated");
+ this.emit("rebalanced", this.getPartitionMap(), changes);
+ }
+ }
+ // ============================================
+ // Phase 4 Task 03: Gradual Rebalancing
+ // ============================================
+ /**
+ * Perform gradual rebalancing using MigrationManager
+ */
+ rebalanceGradual(reason, triggerNodeId) {
+ if (!this.migrationManager) {
+ this.rebalance(reason, triggerNodeId);
+ return;
+ }
+ const oldDistribution = new Map(this.partitions);
+ let allMembers = this.cluster.getMembers().sort();
+ if (allMembers.length === 0) {
+ allMembers = [this.cluster.config.nodeId];
+ }
+ const newDistribution = /* @__PURE__ */ new Map();
+ for (let i = 0; i < this.PARTITION_COUNT; i++) {
+ const ownerIndex = i % allMembers.length;
+ const owner = allMembers[ownerIndex];
+ const backups = [];
+ if (allMembers.length > 1) {
+ for (let b = 1; b <= this.BACKUP_COUNT; b++) {
+ const backupIndex = (ownerIndex + b) % allMembers.length;
+ backups.push(allMembers[backupIndex]);
+ }
+ }
+ newDistribution.set(i, { owner, backups });
+ }
+ logger.info({ memberCount: allMembers.length, reason, triggerNodeId }, "Planning gradual rebalance");
+ this.migrationManager.planMigration(oldDistribution, newDistribution);
+ for (const [partitionId, dist] of newDistribution) {
+ this.partitions.set(partitionId, dist);
+ }
+ this.mapVersion++;
+ this.lastRebalanceTime = Date.now();
+ const changes = [];
+ for (const [partitionId, newDist] of newDistribution) {
+ const oldDist = oldDistribution.get(partitionId);
+ if (oldDist && oldDist.owner !== newDist.owner) {
+ changes.push({
+ partitionId,
+ previousOwner: oldDist.owner,
+ newOwner: newDist.owner,
+ reason
+ });
+ }
+ }
+ this.emit("rebalanced", this.getPartitionMap(), changes);
+ }
+ /**
+ * Set partition owner (called after migration completes)
+ */
+ setOwner(partitionId, nodeId) {
+ const partition = this.partitions.get(partitionId);
+ if (!partition) return;
+ const previousOwner = partition.owner;
+ if (previousOwner === nodeId) return;
+ partition.owner = nodeId;
+ this.mapVersion++;
+ logger.info({ partitionId, previousOwner, newOwner: nodeId, version: this.mapVersion }, "Partition owner updated");
+ this.emit("partitionMoved", {
+ partitionId,
+ previousOwner,
+ newOwner: nodeId,
+ version: this.mapVersion
+ });
+ }
+ /**
+ * Get backups for a partition
+ */
+ getBackups(partitionId) {
+ const dist = this.partitions.get(partitionId);
+ return dist?.backups ?? [];
+ }
+ /**
+ * Get migration status
+ */
+ getMigrationStatus() {
+ return this.migrationManager?.getStatus() ?? null;
+ }
+ /**
+ * Check if partition is currently migrating
+ */
+ isMigrating(partitionId) {
+ return this.migrationManager?.isActive(partitionId) ?? false;
+ }
+ /**
+ * Check if any partition is currently migrating
+ */
+ isRebalancing() {
+ const status = this.getMigrationStatus();
+ return status?.inProgress ?? false;
+ }
+ /**
+ * Get MigrationManager for configuration
+ */
+ getMigrationManager() {
+ return this.migrationManager;
+ }
+ /**
+ * Cancel all migrations
+ */
+ async cancelMigrations() {
+ if (this.migrationManager) {
+ await this.migrationManager.cancelAll();
+ }
  }
  };

  // src/cluster/LockManager.ts
- var import_events2 = require("events");
- var _LockManager = class _LockManager extends import_events2.EventEmitter {
+ var import_events5 = require("events");
+ var _LockManager = class _LockManager extends import_events5.EventEmitter {
  // 5 minutes
  constructor() {
  super();
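
Editor's note: PartitionService keeps its modulo ownership scheme (partition i is owned by sorted-members[i % n], with the next BACKUP_COUNT members as backups) but now versions the partition map and, when `gradualRebalancing` is enabled, hands topology changes to the MigrationManager instead of flipping ownership instantly. An opt-in sketch, assuming `cluster` is a started `ClusterManager`; the PartitionMap type is not shown in this diff, hence the loose typings:

```ts
import { ClusterManager, PartitionService } from "@topgunbuild/server";

declare const cluster: ClusterManager;

const partitions = new PartitionService(cluster, {
  gradualRebalancing: true,
  // `migration` falls back to DEFAULT_MIGRATION_CONFIG from @topgunbuild/core
});

partitions.on("rebalanced", (map: { version: number }, changes: unknown[]) => {
  // map.version increments on every ownership change, letting clients detect
  // a stale partition map.
  console.log(`partition map v${map.version}: ${changes.length} partitions moved`);
});
```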
@@ -1474,8 +2639,8 @@ var SystemManager = class {
  };

  // src/utils/BoundedEventQueue.ts
- var import_events3 = require("events");
- var BoundedEventQueue = class extends import_events3.EventEmitter {
+ var import_events6 = require("events");
+ var BoundedEventQueue = class extends import_events6.EventEmitter {
  constructor(options) {
  super();
  this.queue = [];
@@ -1907,7 +3072,7 @@ var BackpressureRegulator = class {

  // src/utils/CoalescingWriter.ts
  var import_ws2 = require("ws");
- var import_core4 = require("@topgunbuild/core");
+ var import_core5 = require("@topgunbuild/core");

  // src/memory/BufferPool.ts
  var DEFAULT_CONFIG2 = {
@@ -2438,7 +3603,7 @@ var CoalescingWriter = class {
  if (this.closed) {
  return;
  }
- const data = (0, import_core4.serialize)(message);
+ const data = (0, import_core5.serialize)(message);
  this.writeRaw(data, urgent);
  }
  /**
@@ -2622,7 +3787,7 @@ var CoalescingWriter = class {
  offset += msg.data.length;
  }
  const usedBatch = batch.subarray(0, totalSize);
- const batchEnvelope = (0, import_core4.serialize)({
+ const batchEnvelope = (0, import_core5.serialize)({
  type: "BATCH",
  count: messages.length,
  data: usedBatch
@@ -2637,13 +3802,23 @@ var CoalescingWriter = class {
  // src/utils/coalescingPresets.ts
  var coalescingPresets = {
  /**
- * Conservative defaults - good for low-latency workloads.
- * Minimizes batching delay at the cost of more network calls.
- * Use for: gaming, real-time chat, interactive applications.
+ * Low latency - optimized for minimal response time.
+ * Best for: gaming, real-time chat, interactive applications.
+ * Benchmark: p50=2ms, ~18K ops/sec
+ */
+ lowLatency: {
+ maxBatchSize: 100,
+ maxDelayMs: 1,
+ maxBatchBytes: 65536
+ // 64KB
+ },
+ /**
+ * Conservative - good balance of latency and batching.
+ * Use for: general purpose with latency sensitivity.
  */
  conservative: {
  maxBatchSize: 100,
- maxDelayMs: 5,
+ maxDelayMs: 2,
  maxBatchBytes: 65536
  // 64KB
  },
@@ -2654,7 +3829,7 @@ var coalescingPresets = {
  */
  balanced: {
  maxBatchSize: 300,
- maxDelayMs: 8,
+ maxDelayMs: 2,
  maxBatchBytes: 131072
  // 128KB
  },
@@ -2662,10 +3837,11 @@ var coalescingPresets = {
  * High throughput - optimized for write-heavy workloads.
  * Higher batching for better network utilization.
  * Use for: data ingestion, logging, IoT data streams.
+ * Benchmark: p50=7ms, ~18K ops/sec
  */
  highThroughput: {
  maxBatchSize: 500,
- maxDelayMs: 10,
+ maxDelayMs: 2,
  maxBatchBytes: 262144
  // 256KB
  },
@@ -2676,7 +3852,7 @@ var coalescingPresets = {
  */
  aggressive: {
  maxBatchSize: 1e3,
- maxDelayMs: 15,
+ maxDelayMs: 5,
  maxBatchBytes: 524288
  // 512KB
  }
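
Editor's note: the preset changes above are an across-the-board latency tightening — `conservative` drops from 5 ms to 2 ms, `balanced` from 8 to 2, `highThroughput` from 10 to 2, `aggressive` from 15 to 5 — and a new `lowLatency` preset (1 ms) takes over the gaming/chat use case that `conservative` previously documented. A sketch of the trade-off each preset encodes; whether `coalescingPresets` is re-exported from the package root is not visible in this diff, so the values are inlined:

```ts
// A batch flushes as soon as the first of three limits is hit.
const preset = { maxBatchSize: 100, maxDelayMs: 1, maxBatchBytes: 65536 }; // lowLatency values from above

const worstCaseQueueingMs = preset.maxDelayMs;   // latency added to a lone message
const maxMessagesPerFlush = preset.maxBatchSize; // count limit per batch
const maxBytesPerFlush = preset.maxBatchBytes;   // 64KB size limit per batch
```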
@@ -3207,6 +4383,7 @@ var WorkerPool = class {

  // src/workers/MerkleWorker.ts
  var import_path2 = require("path");
+ var import_core6 = require("@topgunbuild/core");
  var WORKER_THRESHOLD = 10;
  var taskIdCounter = 0;
  function generateTaskId() {
@@ -3416,12 +4593,7 @@ var MerkleWorker = class {
  }
  // ============ Hash utilities ============
  hashString(str) {
- let hash = 2166136261;
- for (let i = 0; i < str.length; i++) {
- hash ^= str.charCodeAt(i);
- hash = Math.imul(hash, 16777619);
- }
- return hash >>> 0;
+ return (0, import_core6.hashString)(str);
  }
  buildTree(entries, depth) {
  const root = { hash: 0, children: {} };
@@ -3648,7 +4820,7 @@ CRDTMergeWorker.BATCH_THRESHOLD = WORKER_THRESHOLD2;

  // src/workers/SerializationWorker.ts
  var import_path3 = require("path");
- var import_core5 = require("@topgunbuild/core");
+ var import_core7 = require("@topgunbuild/core");
  var WORKER_BATCH_THRESHOLD = 10;
  var WORKER_SIZE_THRESHOLD = 50 * 1024;
  var taskIdCounter3 = 0;
@@ -3775,26 +4947,26 @@ var SerializationWorker = class {
  * Serialize a single object (always inline, too small for worker)
  */
  serialize(data) {
- return (0, import_core5.serialize)(data);
+ return (0, import_core7.serialize)(data);
  }
  /**
  * Deserialize a single payload (always inline, too small for worker)
  */
  deserialize(data) {
- return (0, import_core5.deserialize)(data);
+ return (0, import_core7.deserialize)(data);
  }
  // ============ Inline implementations for small batches ============
  serializeBatchInline(items) {
  const results = [];
  for (const item of items) {
- results.push((0, import_core5.serialize)(item));
+ results.push((0, import_core7.serialize)(item));
  }
  return results;
  }
  deserializeBatchInline(items) {
  const results = [];
  for (const item of items) {
- results.push((0, import_core5.deserialize)(item));
+ results.push((0, import_core7.deserialize)(item));
  }
  return results;
  }
@@ -4382,13 +5554,13 @@ var ReduceTasklet = class extends IteratorTasklet {
  };

  // src/ack/WriteAckManager.ts
- var import_events4 = require("events");
- var import_core6 = require("@topgunbuild/core");
- var WriteAckManager = class extends import_events4.EventEmitter {
+ var import_events7 = require("events");
+ var import_core8 = require("@topgunbuild/core");
+ var WriteAckManager = class extends import_events7.EventEmitter {
  constructor(config) {
  super();
  this.pending = /* @__PURE__ */ new Map();
- this.defaultTimeout = config?.defaultTimeout ?? import_core6.DEFAULT_WRITE_CONCERN_TIMEOUT;
+ this.defaultTimeout = config?.defaultTimeout ?? import_core8.DEFAULT_WRITE_CONCERN_TIMEOUT;
  }
  /**
  * Register a pending write operation.
@@ -4400,11 +5572,11 @@ var WriteAckManager = class extends import_events4.EventEmitter {
  * @returns Promise that resolves with WriteResult
  */
  registerPending(opId, writeConcern, timeout) {
- if (writeConcern === import_core6.WriteConcern.FIRE_AND_FORGET) {
+ if (writeConcern === import_core8.WriteConcern.FIRE_AND_FORGET) {
  return Promise.resolve({
  success: true,
  opId,
- achievedLevel: import_core6.WriteConcern.FIRE_AND_FORGET,
+ achievedLevel: import_core8.WriteConcern.FIRE_AND_FORGET,
  latencyMs: 0
  });
  }
@@ -4418,7 +5590,7 @@ var WriteAckManager = class extends import_events4.EventEmitter {
  timeout: effectiveTimeout,
  resolve,
  reject,
- achievedLevels: /* @__PURE__ */ new Set([import_core6.WriteConcern.FIRE_AND_FORGET])
+ achievedLevels: /* @__PURE__ */ new Set([import_core8.WriteConcern.FIRE_AND_FORGET])
  };
  pendingWrite.timeoutHandle = setTimeout(() => {
  this.handleTimeout(opId);
@@ -4428,8 +5600,8 @@ var WriteAckManager = class extends import_events4.EventEmitter {
  { opId, writeConcern, timeout: effectiveTimeout },
  "Registered pending write"
  );
- if (writeConcern === import_core6.WriteConcern.MEMORY) {
- this.notifyLevel(opId, import_core6.WriteConcern.MEMORY);
+ if (writeConcern === import_core8.WriteConcern.MEMORY) {
+ this.notifyLevel(opId, import_core8.WriteConcern.MEMORY);
  }
  });
  }
@@ -4449,7 +5621,7 @@ var WriteAckManager = class extends import_events4.EventEmitter {
  { opId, level, target: pending.writeConcern },
  "Write Concern level achieved"
  );
- if ((0, import_core6.isWriteConcernAchieved)(pending.achievedLevels, pending.writeConcern)) {
+ if ((0, import_core8.isWriteConcernAchieved)(pending.achievedLevels, pending.writeConcern)) {
  this.resolvePending(opId, level);
  }
  }
@@ -4492,7 +5664,7 @@ var WriteAckManager = class extends import_events4.EventEmitter {
  getAchievedLevel(opId) {
  const pending = this.pending.get(opId);
  if (!pending) return void 0;
- return (0, import_core6.getHighestWriteConcernLevel)(pending.achievedLevels);
+ return (0, import_core8.getHighestWriteConcernLevel)(pending.achievedLevels);
  }
  /**
  * Resolve a pending write with success.
@@ -4524,7 +5696,7 @@ var WriteAckManager = class extends import_events4.EventEmitter {
  handleTimeout(opId) {
  const pending = this.pending.get(opId);
  if (!pending) return;
- const highestAchieved = (0, import_core6.getHighestWriteConcernLevel)(pending.achievedLevels);
+ const highestAchieved = (0, import_core8.getHighestWriteConcernLevel)(pending.achievedLevels);
  const latencyMs = Date.now() - pending.timestamp;
  const result = {
  success: false,
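
Editor's note: WriteAckManager resolves each write's promise once the requested WriteConcern rung is acknowledged, and on timeout it resolves (rather than rejects) with `success: false` plus the highest level actually reached. A sketch of the ladder, assuming the `WriteConcern` levels from `@topgunbuild/core` referenced throughout this diff; `acks` stands in for the server's WriteAckManager instance, with a hand-written interface since the typings are not shown here:

```ts
import { WriteConcern } from "@topgunbuild/core";

declare const acks: {
  registerPending(
    opId: string,
    wc: unknown,
    timeoutMs?: number
  ): Promise<{ success: boolean; achievedLevel: unknown; latencyMs: number }>;
  notifyLevel(opId: string, level: unknown): void;
};

const result = acks.registerPending("op-42", WriteConcern.REPLICATED, 5_000);

// As the write moves through the pipeline, each stage reports its level:
acks.notifyLevel("op-42", WriteConcern.APPLIED);
acks.notifyLevel("op-42", WriteConcern.REPLICATED); // target reached: resolves

result.then((r) => console.log(r.success, r.achievedLevel, r.latencyMs));
```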
@@ -4552,89 +5724,747 @@ var WriteAckManager = class extends import_events4.EventEmitter {
4552
5724
  * @param opId - Operation ID
4553
5725
  * @param error - Error message
4554
5726
  */
4555
- failPending(opId, error) {
4556
- const pending = this.pending.get(opId);
5727
+ failPending(opId, error) {
5728
+ const pending = this.pending.get(opId);
5729
+ if (!pending) return;
5730
+ if (pending.timeoutHandle) {
5731
+ clearTimeout(pending.timeoutHandle);
5732
+ }
5733
+ const latencyMs = Date.now() - pending.timestamp;
5734
+ const highestAchieved = (0, import_core8.getHighestWriteConcernLevel)(pending.achievedLevels);
5735
+ const result = {
5736
+ success: false,
5737
+ opId,
5738
+ achievedLevel: highestAchieved,
5739
+ latencyMs,
5740
+ error
5741
+ };
5742
+ pending.resolve(result);
5743
+ this.pending.delete(opId);
5744
+ logger.error({ opId, error, latencyMs }, "Write failed");
5745
+ this.emit("failed", result);
5746
+ }
5747
+ /**
5748
+ * Get pending writes statistics.
5749
+ */
5750
+ getStats() {
5751
+ const byLevel = {
5752
+ [import_core8.WriteConcern.FIRE_AND_FORGET]: 0,
5753
+ [import_core8.WriteConcern.MEMORY]: 0,
5754
+ [import_core8.WriteConcern.APPLIED]: 0,
5755
+ [import_core8.WriteConcern.REPLICATED]: 0,
5756
+ [import_core8.WriteConcern.PERSISTED]: 0
5757
+ };
5758
+ for (const pending of this.pending.values()) {
5759
+ byLevel[pending.writeConcern]++;
5760
+ }
5761
+ return { pending: this.pending.size, byLevel };
5762
+ }
5763
+ /**
5764
+ * Get all pending operation IDs.
5765
+ */
5766
+ getPendingIds() {
5767
+ return Array.from(this.pending.keys());
5768
+ }
5769
+ /**
5770
+ * Clear all pending writes (for shutdown).
5771
+ * Rejects all pending promises with an error.
5772
+ */
5773
+ clear() {
5774
+ const count = this.pending.size;
5775
+ for (const pending of this.pending.values()) {
5776
+ if (pending.timeoutHandle) {
5777
+ clearTimeout(pending.timeoutHandle);
5778
+ }
5779
+ pending.reject(new Error("WriteAckManager cleared"));
5780
+ }
5781
+ this.pending.clear();
5782
+ if (count > 0) {
5783
+ logger.info({ count }, "WriteAckManager cleared");
5784
+ }
5785
+ }
5786
+ /**
5787
+ * Graceful shutdown - resolves all pending writes with their current achieved level.
5788
+ */
5789
+ shutdown() {
5790
+ const count = this.pending.size;
5791
+ for (const [opId, pending] of this.pending.entries()) {
5792
+ if (pending.timeoutHandle) {
5793
+ clearTimeout(pending.timeoutHandle);
5794
+ }
5795
+ const highestAchieved = (0, import_core8.getHighestWriteConcernLevel)(pending.achievedLevels);
5796
+ const latencyMs = Date.now() - pending.timestamp;
5797
+ const result = {
5798
+ success: highestAchieved === pending.writeConcern,
5799
+ opId,
5800
+ achievedLevel: highestAchieved,
5801
+ latencyMs,
5802
+ error: highestAchieved !== pending.writeConcern ? `Shutdown: achieved ${highestAchieved}, requested ${pending.writeConcern}` : void 0
5803
+ };
5804
+ pending.resolve(result);
5805
+ }
5806
+ this.pending.clear();
5807
+ if (count > 0) {
5808
+ logger.info({ count }, "WriteAckManager shutdown");
5809
+ }
5810
+ }
5811
+ };
5812
+
5813
+ // src/cluster/ReplicationPipeline.ts
5814
+ var import_events8 = require("events");
5815
+ var import_core9 = require("@topgunbuild/core");
5816
+
5817
+ // src/cluster/LagTracker.ts
5818
+ var DEFAULT_LAG_TRACKER_CONFIG = {
5819
+ historySize: 100,
5820
+ laggyThresholdMs: 5e3,
5821
+ unhealthyThresholdMs: 3e4
5822
+ };
5823
+ var LagTracker = class {
5824
+ constructor(config = {}) {
5825
+ this.lagByNode = /* @__PURE__ */ new Map();
5826
+ this.config = {
5827
+ ...DEFAULT_LAG_TRACKER_CONFIG,
5828
+ ...config
5829
+ };
5830
+ }
5831
+ /**
5832
+ * Update lag measurement for a node
5833
+ */
5834
+ update(nodeId, lagMs) {
5835
+ let info = this.lagByNode.get(nodeId);
5836
+ if (!info) {
5837
+ info = {
5838
+ current: 0,
5839
+ history: [],
5840
+ lastUpdate: Date.now(),
5841
+ pendingOps: 0
5842
+ };
5843
+ this.lagByNode.set(nodeId, info);
5844
+ }
5845
+ info.current = lagMs;
5846
+ info.history.push(lagMs);
5847
+ if (info.history.length > this.config.historySize) {
5848
+ info.history.shift();
5849
+ }
5850
+ info.lastUpdate = Date.now();
5851
+ }
5852
+ /**
5853
+ * Record acknowledgment from a node (lag effectively becomes 0)
5854
+ */
5855
+ recordAck(nodeId) {
5856
+ const info = this.lagByNode.get(nodeId);
5857
+ if (info) {
5858
+ info.current = 0;
5859
+ info.lastUpdate = Date.now();
5860
+ if (info.pendingOps > 0) {
5861
+ info.pendingOps--;
5862
+ }
5863
+ }
5864
+ }
5865
+ /**
5866
+ * Increment pending operations counter for a node
5867
+ */
5868
+ incrementPending(nodeId) {
5869
+ let info = this.lagByNode.get(nodeId);
5870
+ if (!info) {
5871
+ info = {
5872
+ current: 0,
5873
+ history: [],
5874
+ lastUpdate: Date.now(),
5875
+ pendingOps: 0
5876
+ };
5877
+ this.lagByNode.set(nodeId, info);
5878
+ }
5879
+ info.pendingOps++;
5880
+ }
5881
+ /**
5882
+ * Get lag statistics for a specific node
5883
+ */
5884
+ getLag(nodeId) {
5885
+ const info = this.lagByNode.get(nodeId);
5886
+ if (!info || info.history.length === 0) {
5887
+ return { current: 0, avg: 0, max: 0, percentile99: 0 };
5888
+ }
5889
+ const sorted = [...info.history].sort((a, b) => a - b);
5890
+ const avg = sorted.reduce((a, b) => a + b, 0) / sorted.length;
5891
+ const max = sorted[sorted.length - 1] || 0;
5892
+ const p99Index = Math.floor(sorted.length * 0.99);
5893
+ const percentile99 = sorted[p99Index] || max;
5894
+ return {
5895
+ current: info.current,
5896
+ avg: Math.round(avg * 100) / 100,
5897
+ // Round to 2 decimal places
5898
+ max,
5899
+ percentile99
5900
+ };
5901
+ }
5902
+ /**
5903
+ * Get pending operations count for a node
5904
+ */
5905
+ getPendingOps(nodeId) {
5906
+ const info = this.lagByNode.get(nodeId);
5907
+ return info?.pendingOps ?? 0;
5908
+ }
5909
+ /**
5910
+ * Get overall replication health status
5911
+ */
5912
+ getHealth() {
5913
+ const unhealthyNodes = [];
5914
+ const laggyNodes = [];
5915
+ let totalLag = 0;
5916
+ let nodeCount = 0;
5917
+ const now = Date.now();
5918
+ for (const [nodeId, info] of this.lagByNode) {
5919
+ const timeSinceUpdate = now - info.lastUpdate;
5920
+ if (timeSinceUpdate > this.config.unhealthyThresholdMs) {
5921
+ unhealthyNodes.push(nodeId);
5922
+ } else if (info.current > this.config.laggyThresholdMs) {
5923
+ laggyNodes.push(nodeId);
5924
+ }
5925
+ totalLag += info.current;
5926
+ nodeCount++;
5927
+ }
5928
+ const avgLagMs = nodeCount > 0 ? totalLag / nodeCount : 0;
5929
+ return {
5930
+ healthy: unhealthyNodes.length === 0,
5931
+ unhealthyNodes,
5932
+ laggyNodes,
5933
+ avgLagMs: Math.round(avgLagMs * 100) / 100
5934
+ };
5935
+ }
5936
+ /**
5937
+ * Get average lag across all tracked nodes
5938
+ */
5939
+ getAverageLag() {
5940
+ let total = 0;
5941
+ let count = 0;
5942
+ for (const info of this.lagByNode.values()) {
5943
+ total += info.current;
5944
+ count++;
5945
+ }
5946
+ return count > 0 ? total / count : 0;
5947
+ }
5948
+ /**
5949
+ * Check if a specific node is considered healthy
5950
+ */
5951
+ isNodeHealthy(nodeId) {
5952
+ const info = this.lagByNode.get(nodeId);
5953
+ if (!info) return true;
5954
+ const timeSinceUpdate = Date.now() - info.lastUpdate;
5955
+ return timeSinceUpdate < this.config.unhealthyThresholdMs;
5956
+ }
5957
+ /**
5958
+ * Check if a specific node is considered laggy
5959
+ */
5960
+ isNodeLaggy(nodeId) {
5961
+ const info = this.lagByNode.get(nodeId);
5962
+ if (!info) return false;
5963
+ return info.current > this.config.laggyThresholdMs;
5964
+ }
5965
+ /**
5966
+ * Remove a node from tracking
5967
+ */
5968
+ removeNode(nodeId) {
5969
+ this.lagByNode.delete(nodeId);
5970
+ }
5971
+ /**
5972
+ * Get all tracked node IDs
5973
+ */
5974
+ getTrackedNodes() {
5975
+ return Array.from(this.lagByNode.keys());
5976
+ }
5977
+ /**
5978
+ * Get raw lag info for a node (for advanced monitoring)
5979
+ */
5980
+ getRawLagInfo(nodeId) {
5981
+ return this.lagByNode.get(nodeId);
5982
+ }
5983
+ /**
5984
+ * Clear all tracking data
5985
+ */
5986
+ clear() {
5987
+ this.lagByNode.clear();
5988
+ }
5989
+ /**
5990
+ * Export metrics in Prometheus format
5991
+ */
5992
+ toPrometheusMetrics() {
5993
+ const lines = [
5994
+ "# HELP topgun_replication_lag_ms Current replication lag in milliseconds",
5995
+ "# TYPE topgun_replication_lag_ms gauge"
5996
+ ];
5997
+ for (const [nodeId, info] of this.lagByNode) {
5998
+ lines.push(`topgun_replication_lag_ms{node="${nodeId}"} ${info.current}`);
5999
+ }
6000
+ lines.push("");
6001
+ lines.push("# HELP topgun_replication_pending_ops Pending replication operations");
6002
+ lines.push("# TYPE topgun_replication_pending_ops gauge");
6003
+ for (const [nodeId, info] of this.lagByNode) {
6004
+ lines.push(`topgun_replication_pending_ops{node="${nodeId}"} ${info.pendingOps}`);
6005
+ }
6006
+ const health = this.getHealth();
6007
+ lines.push("");
6008
+ lines.push("# HELP topgun_replication_healthy Cluster replication health (1=healthy, 0=unhealthy)");
6009
+ lines.push("# TYPE topgun_replication_healthy gauge");
6010
+ lines.push(`topgun_replication_healthy ${health.healthy ? 1 : 0}`);
6011
+ lines.push("");
6012
+ lines.push("# HELP topgun_replication_avg_lag_ms Average replication lag across all nodes");
6013
+ lines.push("# TYPE topgun_replication_avg_lag_ms gauge");
6014
+ lines.push(`topgun_replication_avg_lag_ms ${health.avgLagMs}`);
6015
+ return lines.join("\n");
6016
+ }
6017
+ };
6018
+
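Note: a short usage sketch of the tracker above; LagTracker is exported from this package, and the threshold value here is illustrative:

  const { LagTracker } = require("@topgunbuild/server");

  const tracker = new LagTracker({ laggyThresholdMs: 2000 });
  tracker.incrementPending("node-b");    // an op was sent to node-b
  tracker.update("node-b", 120);         // measured 120 ms of lag
  tracker.recordAck("node-b");           // ack received: current lag resets to 0
  console.log(tracker.getLag("node-b")); // { current: 0, avg: 120, max: 120, percentile99: 120 }
  console.log(tracker.getHealth());      // { healthy: true, unhealthyNodes: [], laggyNodes: [], avgLagMs: 0 }
  console.log(tracker.toPrometheusMetrics()); // gauge lines for lag, pending ops, and health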
6019
+ // src/cluster/ReplicationPipeline.ts
6020
+ var ReplicationTimeoutError = class extends Error {
6021
+ constructor(opId, targetNodes, ackedNodes) {
6022
+ super(
6023
+ `Replication timeout for operation ${opId}. Expected: ${targetNodes.join(", ")}, Acked: ${ackedNodes.join(", ")}`
6024
+ );
6025
+ this.opId = opId;
6026
+ this.targetNodes = targetNodes;
6027
+ this.ackedNodes = ackedNodes;
6028
+ this.name = "ReplicationTimeoutError";
6029
+ }
6030
+ };
6031
+ var ReplicationPipeline = class extends import_events8.EventEmitter {
6032
+ constructor(clusterManager, partitionService, config = {}) {
6033
+ super();
6034
+ // Replication queues per node (for EVENTUAL mode)
6035
+ this.replicationQueue = /* @__PURE__ */ new Map();
6036
+ // Pending acknowledgments (for STRONG/QUORUM mode)
6037
+ this.pendingAcks = /* @__PURE__ */ new Map();
6038
+ // Queue processor timer
6039
+ this.queueProcessorTimer = null;
6040
+ // Operation applier callback (injected by ServerCoordinator)
6041
+ this.operationApplier = null;
6042
+ this.clusterManager = clusterManager;
6043
+ this.partitionService = partitionService;
6044
+ this.nodeId = clusterManager.config.nodeId;
6045
+ this.config = {
6046
+ ...import_core9.DEFAULT_REPLICATION_CONFIG,
6047
+ ...config
6048
+ };
6049
+ this.lagTracker = new LagTracker();
6050
+ this.setupMessageHandlers();
6051
+ this.startQueueProcessor();
6052
+ }
6053
+ // ============================================
6054
+ // Configuration
6055
+ // ============================================
6056
+ /**
6057
+ * Set the operation applier callback
6058
+ * This is called when replicated operations are received from other nodes
6059
+ */
6060
+ setOperationApplier(applier) {
6061
+ this.operationApplier = applier;
6062
+ }
6063
+ // ============================================
6064
+ // Replication API
6065
+ // ============================================
6066
+ /**
6067
+ * Replicate operation to backup nodes
6068
+ */
6069
+ async replicate(operation, opId, key, options = {}) {
6070
+ const consistency = options.consistency ?? this.config.defaultConsistency;
6071
+ const partitionId = this.partitionService.getPartitionId(key);
6072
+ const backups = this.partitionService.getBackups(partitionId);
6073
+ if (backups.length === 0) {
6074
+ return { success: true, ackedBy: [this.nodeId] };
6075
+ }
6076
+ switch (consistency) {
6077
+ case import_core9.ConsistencyLevel.STRONG:
6078
+ return this.replicateStrong(operation, opId, backups, options.timeout);
6079
+ case import_core9.ConsistencyLevel.QUORUM:
6080
+ return this.replicateQuorum(operation, opId, backups, options.timeout);
6081
+ case import_core9.ConsistencyLevel.EVENTUAL:
6082
+ return this.replicateEventual(operation, opId, backups);
6083
+ }
6084
+ }
6085
+ /**
6086
+ * STRONG: Wait for all replicas to acknowledge
6087
+ */
6088
+ async replicateStrong(operation, opId, backups, timeout) {
6089
+ const targetNodes = backups;
6090
+ return new Promise((resolve, reject) => {
6091
+ const pending = {
6092
+ opId,
6093
+ consistency: import_core9.ConsistencyLevel.STRONG,
6094
+ targetNodes,
6095
+ ackedNodes: /* @__PURE__ */ new Set(),
6096
+ resolve: () => resolve({
6097
+ success: true,
6098
+ ackedBy: [this.nodeId, ...targetNodes]
6099
+ }),
6100
+ reject: (error) => reject(error),
6101
+ timeout: setTimeout(() => {
6102
+ this.pendingAcks.delete(opId);
6103
+ const ackedList = Array.from(pending.ackedNodes);
6104
+ reject(new ReplicationTimeoutError(opId, targetNodes, ackedList));
6105
+ }, timeout ?? this.config.ackTimeoutMs),
6106
+ startTime: Date.now()
6107
+ };
6108
+ this.pendingAcks.set(opId, pending);
6109
+ for (const nodeId of targetNodes) {
6110
+ this.lagTracker.incrementPending(nodeId);
6111
+ }
6112
+ for (const nodeId of targetNodes) {
6113
+ this.sendReplication(nodeId, operation, opId, import_core9.ConsistencyLevel.STRONG);
6114
+ }
6115
+ });
6116
+ }
6117
+ /**
6118
+ * QUORUM: Wait for majority of replicas
6119
+ */
6120
+ async replicateQuorum(operation, opId, backups, timeout) {
6121
+ const targetNodes = backups;
6122
+ const quorumSize = Math.floor(targetNodes.length / 2) + 1;
6123
+ return new Promise((resolve, reject) => {
6124
+ const ackedNodes = /* @__PURE__ */ new Set();
6125
+ const pending = {
6126
+ opId,
6127
+ consistency: import_core9.ConsistencyLevel.QUORUM,
6128
+ targetNodes,
6129
+ ackedNodes,
6130
+ resolve: () => {
6131
+ const ackedSnapshot = Array.from(ackedNodes);
6132
+ const ackedBy = [this.nodeId, ...ackedSnapshot];
6133
+ resolve({ success: true, ackedBy });
6134
+ },
6135
+ reject: (error) => reject(error),
6136
+ timeout: setTimeout(() => {
6137
+ this.pendingAcks.delete(opId);
6138
+ const ackedList = Array.from(ackedNodes);
6139
+ reject(new ReplicationTimeoutError(opId, targetNodes, ackedList));
6140
+ }, timeout ?? this.config.ackTimeoutMs),
6141
+ startTime: Date.now()
6142
+ };
6143
+ this.pendingAcks.set(opId, pending);
6144
+ for (const nodeId of targetNodes) {
6145
+ this.lagTracker.incrementPending(nodeId);
6146
+ }
6147
+ for (const nodeId of targetNodes) {
6148
+ this.sendReplication(nodeId, operation, opId, import_core9.ConsistencyLevel.QUORUM);
6149
+ }
6150
+ });
6151
+ }
6152
+ /**
6153
+ * EVENTUAL: Fire-and-forget with queue
6154
+ */
6155
+ async replicateEventual(operation, opId, backups) {
6156
+ for (const nodeId of backups) {
6157
+ this.enqueue(nodeId, {
6158
+ opId,
6159
+ operation,
6160
+ consistency: import_core9.ConsistencyLevel.EVENTUAL,
6161
+ timestamp: Date.now(),
6162
+ retryCount: 0
6163
+ });
6164
+ }
6165
+ return { success: true, ackedBy: [this.nodeId] };
6166
+ }
6167
+ // ============================================
6168
+ // Queue Management
6169
+ // ============================================
6170
+ /**
6171
+ * Add task to replication queue
6172
+ */
6173
+ enqueue(nodeId, task) {
6174
+ let queue = this.replicationQueue.get(nodeId);
6175
+ if (!queue) {
6176
+ queue = [];
6177
+ this.replicationQueue.set(nodeId, queue);
6178
+ }
6179
+ if (queue.length >= this.config.queueSizeLimit) {
6180
+ this.emit("queueOverflow", nodeId);
6181
+ logger.warn({ nodeId, queueSize: queue.length }, "Replication queue overflow, dropping oldest");
6182
+ queue.shift();
6183
+ }
6184
+ queue.push(task);
6185
+ this.lagTracker.incrementPending(nodeId);
6186
+ }
6187
+ /**
6188
+ * Start queue processor
6189
+ */
6190
+ startQueueProcessor() {
6191
+ if (this.queueProcessorTimer) return;
6192
+ this.queueProcessorTimer = setInterval(() => {
6193
+ for (const nodeId of this.replicationQueue.keys()) {
6194
+ this.processQueue(nodeId).catch((err) => {
6195
+ logger.error({ nodeId, error: err }, "Error processing replication queue");
6196
+ this.emit("error", err);
6197
+ });
6198
+ }
6199
+ }, this.config.batchIntervalMs);
6200
+ }
6201
+ /**
6202
+ * Stop queue processor
6203
+ */
6204
+ stopQueueProcessor() {
6205
+ if (this.queueProcessorTimer) {
6206
+ clearInterval(this.queueProcessorTimer);
6207
+ this.queueProcessorTimer = null;
6208
+ }
6209
+ }
6210
+ /**
6211
+ * Process replication queue for a node
6212
+ */
6213
+ async processQueue(nodeId) {
6214
+ const queue = this.replicationQueue.get(nodeId);
6215
+ if (!queue || queue.length === 0) return;
6216
+ const batch = queue.splice(0, this.config.batchSize);
6217
+ try {
6218
+ this.clusterManager.send(nodeId, "OP_FORWARD", {
6219
+ _replication: {
6220
+ type: "REPLICATION_BATCH",
6221
+ payload: {
6222
+ operations: batch.map((t) => t.operation),
6223
+ opIds: batch.map((t) => t.opId)
6224
+ }
6225
+ }
6226
+ });
6227
+ const oldestTimestamp = Math.min(...batch.map((t) => t.timestamp));
6228
+ this.lagTracker.update(nodeId, Date.now() - oldestTimestamp);
6229
+ logger.debug({ nodeId, batchSize: batch.length }, "Sent replication batch");
6230
+ } catch (error) {
6231
+ for (const task of batch) {
6232
+ task.retryCount++;
6233
+ if (task.retryCount <= this.config.maxRetries) {
6234
+ queue.unshift(task);
6235
+ } else {
6236
+ logger.warn({ nodeId, opId: task.opId, retries: task.retryCount }, "Replication task exceeded max retries");
6237
+ this.emit("replicationFailed", task.opId, new Error("Max retries exceeded"));
6238
+ }
6239
+ }
6240
+ }
6241
+ }
6242
+ // ============================================
6243
+ // Message Handling
6244
+ // ============================================
6245
+ /**
6246
+ * Send replication message to a node
6247
+ */
6248
+ sendReplication(nodeId, operation, opId, consistency) {
6249
+ this.clusterManager.send(nodeId, "OP_FORWARD", {
6250
+ _replication: {
6251
+ type: "REPLICATION",
6252
+ payload: {
6253
+ opId,
6254
+ operation,
6255
+ consistency
6256
+ }
6257
+ }
6258
+ });
6259
+ }
6260
+ /**
6261
+ * Setup cluster message handlers
6262
+ */
6263
+ setupMessageHandlers() {
6264
+ this.clusterManager.on("message", (msg) => {
6265
+ if (msg.payload?._replication) {
6266
+ const replication = msg.payload._replication;
6267
+ switch (replication.type) {
6268
+ case "REPLICATION":
6269
+ this.handleReplication(msg.senderId, replication.payload);
6270
+ break;
6271
+ case "REPLICATION_BATCH":
6272
+ this.handleReplicationBatch(msg.senderId, replication.payload);
6273
+ break;
6274
+ case "REPLICATION_ACK":
6275
+ this.handleReplicationAck(msg.senderId, replication.payload);
6276
+ break;
6277
+ case "REPLICATION_BATCH_ACK":
6278
+ this.handleReplicationBatchAck(msg.senderId, replication.payload);
6279
+ break;
6280
+ }
6281
+ }
6282
+ });
6283
+ }
6284
+ /**
6285
+ * Handle incoming replication request (on backup node)
6286
+ */
6287
+ async handleReplication(sourceNode, payload) {
6288
+ const { opId, operation, consistency } = payload;
6289
+ logger.debug({ sourceNode, opId, consistency }, "Received replication");
6290
+ let success = true;
6291
+ if (this.operationApplier) {
6292
+ try {
6293
+ success = await this.operationApplier(operation, opId, sourceNode);
6294
+ } catch (error) {
6295
+ logger.error({ sourceNode, opId, error }, "Failed to apply replicated operation");
6296
+ success = false;
6297
+ }
6298
+ } else {
6299
+ logger.warn({ sourceNode, opId }, "No operation applier set, operation not applied");
6300
+ }
6301
+ if (consistency === import_core9.ConsistencyLevel.STRONG || consistency === import_core9.ConsistencyLevel.QUORUM) {
6302
+ this.clusterManager.send(sourceNode, "OP_FORWARD", {
6303
+ _replication: {
6304
+ type: "REPLICATION_ACK",
6305
+ payload: {
6306
+ opId,
6307
+ success,
6308
+ timestamp: Date.now()
6309
+ }
6310
+ }
6311
+ });
6312
+ }
6313
+ }
6314
+ /**
6315
+ * Handle incoming batch replication (on backup node)
6316
+ */
6317
+ async handleReplicationBatch(sourceNode, payload) {
6318
+ const { operations, opIds } = payload;
6319
+ logger.debug({ sourceNode, count: operations.length }, "Received replication batch");
6320
+ let allSuccess = true;
6321
+ if (this.operationApplier) {
6322
+ for (let i = 0; i < operations.length; i++) {
6323
+ try {
6324
+ const success = await this.operationApplier(operations[i], opIds[i], sourceNode);
6325
+ if (!success) {
6326
+ allSuccess = false;
6327
+ }
6328
+ } catch (error) {
6329
+ logger.error({ sourceNode, opId: opIds[i], error }, "Failed to apply replicated operation in batch");
6330
+ allSuccess = false;
6331
+ }
6332
+ }
6333
+ } else {
6334
+ logger.warn({ sourceNode, count: operations.length }, "No operation applier set, batch not applied");
6335
+ }
6336
+ this.clusterManager.send(sourceNode, "OP_FORWARD", {
6337
+ _replication: {
6338
+ type: "REPLICATION_BATCH_ACK",
6339
+ payload: {
6340
+ opIds,
6341
+ success: allSuccess,
6342
+ timestamp: Date.now()
6343
+ }
6344
+ }
6345
+ });
6346
+ }
6347
+ /**
6348
+ * Handle replication acknowledgment (on owner node)
6349
+ */
6350
+ handleReplicationAck(sourceNode, payload) {
6351
+ const { opId, success } = payload;
6352
+ this.lagTracker.recordAck(sourceNode);
6353
+ const pending = this.pendingAcks.get(opId);
4557
6354
  if (!pending) return;
4558
- if (pending.timeoutHandle) {
4559
- clearTimeout(pending.timeoutHandle);
6355
+ if (!success) {
6356
+ logger.warn({ sourceNode, opId }, "Replication rejected by backup");
6357
+ return;
6358
+ }
6359
+ pending.ackedNodes.add(sourceNode);
6360
+ const lag = Date.now() - pending.startTime;
6361
+ this.lagTracker.update(sourceNode, lag);
6362
+ const ackedCount = pending.ackedNodes.size;
6363
+ const targetCount = pending.targetNodes.length;
6364
+ switch (pending.consistency) {
6365
+ case import_core9.ConsistencyLevel.STRONG:
6366
+ if (ackedCount === targetCount) {
6367
+ clearTimeout(pending.timeout);
6368
+ this.pendingAcks.delete(opId);
6369
+ pending.resolve();
6370
+ this.emit("replicationComplete", opId, [this.nodeId, ...pending.ackedNodes]);
6371
+ }
6372
+ break;
6373
+ case import_core9.ConsistencyLevel.QUORUM:
6374
+ const quorumSize = Math.floor(targetCount / 2) + 1;
6375
+ if (ackedCount >= quorumSize) {
6376
+ clearTimeout(pending.timeout);
6377
+ this.pendingAcks.delete(opId);
6378
+ pending.resolve();
6379
+ this.emit("replicationComplete", opId, [this.nodeId, ...pending.ackedNodes]);
6380
+ }
6381
+ break;
4560
6382
  }
4561
- const latencyMs = Date.now() - pending.timestamp;
4562
- const highestAchieved = (0, import_core6.getHighestWriteConcernLevel)(pending.achievedLevels);
4563
- const result = {
4564
- success: false,
4565
- opId,
4566
- achievedLevel: highestAchieved,
4567
- latencyMs,
4568
- error
4569
- };
4570
- pending.resolve(result);
4571
- this.pending.delete(opId);
4572
- logger.error({ opId, error, latencyMs }, "Write failed");
4573
- this.emit("failed", result);
4574
6383
  }
4575
6384
  /**
4576
- * Get pending writes statistics.
6385
+ * Handle batch acknowledgment (on owner node)
4577
6386
  */
4578
- getStats() {
4579
- const byLevel = {
4580
- [import_core6.WriteConcern.FIRE_AND_FORGET]: 0,
4581
- [import_core6.WriteConcern.MEMORY]: 0,
4582
- [import_core6.WriteConcern.APPLIED]: 0,
4583
- [import_core6.WriteConcern.REPLICATED]: 0,
4584
- [import_core6.WriteConcern.PERSISTED]: 0
4585
- };
4586
- for (const pending of this.pending.values()) {
4587
- byLevel[pending.writeConcern]++;
6387
+ handleReplicationBatchAck(sourceNode, payload) {
6388
+ const { success } = payload;
6389
+ this.lagTracker.recordAck(sourceNode);
6390
+ if (!success) {
6391
+ logger.warn({ sourceNode, count: payload.opIds.length }, "Batch replication rejected");
4588
6392
  }
4589
- return { pending: this.pending.size, byLevel };
4590
6393
  }
6394
+ // ============================================
6395
+ // Status and Metrics
6396
+ // ============================================
4591
6397
  /**
4592
- * Get all pending operation IDs.
6398
+ * Get replication lag for a specific node
4593
6399
  */
4594
- getPendingIds() {
4595
- return Array.from(this.pending.keys());
6400
+ getLag(nodeId) {
6401
+ return this.lagTracker.getLag(nodeId);
4596
6402
  }
4597
6403
  /**
4598
- * Clear all pending writes (for shutdown).
4599
- * Rejects all pending promises with an error.
6404
+ * Get overall replication health
4600
6405
  */
4601
- clear() {
4602
- const count = this.pending.size;
4603
- for (const pending of this.pending.values()) {
4604
- if (pending.timeoutHandle) {
4605
- clearTimeout(pending.timeoutHandle);
4606
- }
4607
- pending.reject(new Error("WriteAckManager cleared"));
4608
- }
4609
- this.pending.clear();
4610
- if (count > 0) {
4611
- logger.info({ count }, "WriteAckManager cleared");
6406
+ getHealth() {
6407
+ return this.lagTracker.getHealth();
6408
+ }
6409
+ /**
6410
+ * Get queue size for a specific node
6411
+ */
6412
+ getQueueSize(nodeId) {
6413
+ return this.replicationQueue.get(nodeId)?.length ?? 0;
6414
+ }
6415
+ /**
6416
+ * Get total pending operations across all nodes
6417
+ */
6418
+ getTotalPending() {
6419
+ let total = 0;
6420
+ for (const queue of this.replicationQueue.values()) {
6421
+ total += queue.length;
4612
6422
  }
6423
+ return total + this.pendingAcks.size;
4613
6424
  }
4614
6425
  /**
4615
- * Graceful shutdown - resolves all pending writes with their current achieved level.
6426
+ * Check if a node is considered synced (low lag)
4616
6427
  */
4617
- shutdown() {
4618
- const count = this.pending.size;
4619
- for (const [opId, pending] of this.pending.entries()) {
4620
- if (pending.timeoutHandle) {
4621
- clearTimeout(pending.timeoutHandle);
4622
- }
4623
- const highestAchieved = (0, import_core6.getHighestWriteConcernLevel)(pending.achievedLevels);
4624
- const latencyMs = Date.now() - pending.timestamp;
4625
- const result = {
4626
- success: highestAchieved === pending.writeConcern,
4627
- opId,
4628
- achievedLevel: highestAchieved,
4629
- latencyMs,
4630
- error: highestAchieved !== pending.writeConcern ? `Shutdown: achieved ${highestAchieved}, requested ${pending.writeConcern}` : void 0
4631
- };
4632
- pending.resolve(result);
6428
+ isSynced(nodeId, maxLagMs = 1e3) {
6429
+ const lag = this.lagTracker.getLag(nodeId);
6430
+ return lag.current < maxLagMs;
6431
+ }
6432
+ /**
6433
+ * Get LagTracker for advanced monitoring
6434
+ */
6435
+ getLagTracker() {
6436
+ return this.lagTracker;
6437
+ }
6438
+ /**
6439
+ * Export metrics in Prometheus format
6440
+ */
6441
+ toPrometheusMetrics() {
6442
+ const lines = [];
6443
+ lines.push("# HELP topgun_replication_queue_size Pending operations in replication queue");
6444
+ lines.push("# TYPE topgun_replication_queue_size gauge");
6445
+ for (const [nodeId, queue] of this.replicationQueue) {
6446
+ lines.push(`topgun_replication_queue_size{node="${nodeId}"} ${queue.length}`);
4633
6447
  }
4634
- this.pending.clear();
4635
- if (count > 0) {
4636
- logger.info({ count }, "WriteAckManager shutdown");
6448
+ lines.push("");
6449
+ lines.push("# HELP topgun_replication_pending_acks Pending synchronous acknowledgments");
6450
+ lines.push("# TYPE topgun_replication_pending_acks gauge");
6451
+ lines.push(`topgun_replication_pending_acks ${this.pendingAcks.size}`);
6452
+ lines.push("");
6453
+ lines.push(this.lagTracker.toPrometheusMetrics());
6454
+ return lines.join("\n");
6455
+ }
6456
+ /**
6457
+ * Cleanup resources
6458
+ */
6459
+ close() {
6460
+ this.stopQueueProcessor();
6461
+ for (const [opId, pending] of this.pendingAcks) {
6462
+ clearTimeout(pending.timeout);
6463
+ pending.reject(new Error("ReplicationPipeline closed"));
4637
6464
  }
6465
+ this.pendingAcks.clear();
6466
+ this.replicationQueue.clear();
6467
+ this.lagTracker.clear();
4638
6468
  }
4639
6469
  };
4640
6470
 
@@ -4662,7 +6492,7 @@ var ServerCoordinator = class {
4662
6492
  this._readyPromise = new Promise((resolve) => {
4663
6493
  this._readyResolve = resolve;
4664
6494
  });
4665
- this.hlc = new import_core7.HLC(config.nodeId);
6495
+ this.hlc = new import_core10.HLC(config.nodeId);
4666
6496
  this.storage = config.storage;
4667
6497
  const rawSecret = config.jwtSecret || process.env.JWT_SECRET || "topgun-secret-dev";
4668
6498
  this.jwtSecret = rawSecret.replace(/\\n/g, "\n");
@@ -4799,6 +6629,22 @@ var ServerCoordinator = class {
4799
6629
  tls: config.clusterTls
4800
6630
  });
4801
6631
  this.partitionService = new PartitionService(this.cluster);
6632
+ if (config.replicationEnabled !== false) {
6633
+ this.replicationPipeline = new ReplicationPipeline(
6634
+ this.cluster,
6635
+ this.partitionService,
6636
+ {
6637
+ ...import_core10.DEFAULT_REPLICATION_CONFIG,
6638
+ defaultConsistency: config.defaultConsistency ?? import_core10.ConsistencyLevel.EVENTUAL,
6639
+ ...config.replicationConfig
6640
+ }
6641
+ );
6642
+ this.replicationPipeline.setOperationApplier(this.applyReplicatedOperation.bind(this));
6643
+ logger.info({ nodeId: config.nodeId }, "ReplicationPipeline initialized");
6644
+ }
6645
+ this.partitionService.on("rebalanced", (partitionMap, changes) => {
6646
+ this.broadcastPartitionMap(partitionMap);
6647
+ });
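Note: as wired above, replication is opt-out: the pipeline is created unless `replicationEnabled` is explicitly false. A hedged sketch of the replication-related ServerCoordinator config fields (other required fields such as nodeId, port, and storage are omitted):

  const { ConsistencyLevel } = require("@topgunbuild/core");

  const serverConfig = {
    // ...nodeId, port, storage, cluster settings...
    replicationEnabled: true,                    // default behaviour; false skips the pipeline
    defaultConsistency: ConsistencyLevel.QUORUM, // falls back to EVENTUAL when unset
    replicationConfig: { ackTimeoutMs: 2000 }    // merged over DEFAULT_REPLICATION_CONFIG
  };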
4802
6648
  this.lockManager = new LockManager();
4803
6649
  this.lockManager.on("lockGranted", (evt) => this.handleLockGranted(evt));
4804
6650
  this.topicManager = new TopicManager({
@@ -4915,7 +6761,7 @@ var ServerCoordinator = class {
4915
6761
  this.metricsService.destroy();
4916
6762
  this.wss.close();
4917
6763
  logger.info(`Closing ${this.clients.size} client connections...`);
4918
- const shutdownMsg = (0, import_core7.serialize)({ type: "SHUTDOWN_PENDING", retryAfter: 5e3 });
6764
+ const shutdownMsg = (0, import_core10.serialize)({ type: "SHUTDOWN_PENDING", retryAfter: 5e3 });
4919
6765
  for (const client of this.clients.values()) {
4920
6766
  try {
4921
6767
  if (client.socket.readyState === import_ws3.WebSocket.OPEN) {
@@ -4937,6 +6783,9 @@ var ServerCoordinator = class {
4937
6783
  await this.workerPool.shutdown(5e3);
4938
6784
  logger.info("Worker pool shutdown complete.");
4939
6785
  }
6786
+ if (this.replicationPipeline) {
6787
+ this.replicationPipeline.close();
6788
+ }
4940
6789
  if (this.cluster) {
4941
6790
  this.cluster.stop();
4942
6791
  }
@@ -5033,7 +6882,7 @@ var ServerCoordinator = class {
5033
6882
  buf = Buffer.from(message);
5034
6883
  }
5035
6884
  try {
5036
- data = (0, import_core7.deserialize)(buf);
6885
+ data = (0, import_core10.deserialize)(buf);
5037
6886
  } catch (e) {
5038
6887
  try {
5039
6888
  const text = Buffer.isBuffer(buf) ? buf.toString() : new TextDecoder().decode(buf);
@@ -5084,10 +6933,10 @@ var ServerCoordinator = class {
5084
6933
  this.clients.delete(clientId);
5085
6934
  this.metricsService.setConnectedClients(this.clients.size);
5086
6935
  });
5087
- ws.send((0, import_core7.serialize)({ type: "AUTH_REQUIRED" }));
6936
+ ws.send((0, import_core10.serialize)({ type: "AUTH_REQUIRED" }));
5088
6937
  }
5089
6938
  async handleMessage(client, rawMessage) {
5090
- const parseResult = import_core7.MessageSchema.safeParse(rawMessage);
6939
+ const parseResult = import_core10.MessageSchema.safeParse(rawMessage);
5091
6940
  if (!parseResult.success) {
5092
6941
  logger.error({ clientId: client.id, error: parseResult.error }, "Invalid message format from client");
5093
6942
  client.writer.write({
@@ -5327,7 +7176,7 @@ var ServerCoordinator = class {
5327
7176
  this.metricsService.incOp("GET", message.mapName);
5328
7177
  try {
5329
7178
  const mapForSync = await this.getMapAsync(message.mapName);
5330
- if (mapForSync instanceof import_core7.LWWMap) {
7179
+ if (mapForSync instanceof import_core10.LWWMap) {
5331
7180
  const tree = mapForSync.getMerkleTree();
5332
7181
  const rootHash = tree.getRootHash();
5333
7182
  client.writer.write({
@@ -5365,7 +7214,7 @@ var ServerCoordinator = class {
5365
7214
  const { mapName, path } = message.payload;
5366
7215
  try {
5367
7216
  const mapForBucket = await this.getMapAsync(mapName);
5368
- if (mapForBucket instanceof import_core7.LWWMap) {
7217
+ if (mapForBucket instanceof import_core10.LWWMap) {
5369
7218
  const treeForBucket = mapForBucket.getMerkleTree();
5370
7219
  const buckets = treeForBucket.getBuckets(path);
5371
7220
  const node = treeForBucket.getNode(path);
@@ -5494,6 +7343,23 @@ var ServerCoordinator = class {
5494
7343
  }
5495
7344
  break;
5496
7345
  }
7346
+ // ============ Phase 4: Partition Map Request Handler ============
7347
+ case "PARTITION_MAP_REQUEST": {
7348
+ const clientVersion = message.payload?.currentVersion ?? 0;
7349
+ const currentMap = this.partitionService.getPartitionMap();
7350
+ if (clientVersion < currentMap.version) {
7351
+ client.writer.write({
7352
+ type: "PARTITION_MAP",
7353
+ payload: currentMap
7354
+ });
7355
+ logger.debug({
7356
+ clientId: client.id,
7357
+ clientVersion,
7358
+ serverVersion: currentMap.version
7359
+ }, "Sent partition map to client");
7360
+ }
7361
+ break;
7362
+ }
5497
7363
  // ============ ORMap Sync Message Handlers ============
5498
7364
  case "ORMAP_SYNC_INIT": {
5499
7365
  if (!this.securityManager.checkPermission(client.principal, message.mapName, "READ")) {
@@ -5517,7 +7383,7 @@ var ServerCoordinator = class {
5517
7383
  this.metricsService.incOp("GET", message.mapName);
5518
7384
  try {
5519
7385
  const mapForSync = await this.getMapAsync(message.mapName, "OR");
5520
- if (mapForSync instanceof import_core7.ORMap) {
7386
+ if (mapForSync instanceof import_core10.ORMap) {
5521
7387
  const tree = mapForSync.getMerkleTree();
5522
7388
  const rootHash = tree.getRootHash();
5523
7389
  client.writer.write({
@@ -5554,7 +7420,7 @@ var ServerCoordinator = class {
5554
7420
  const { mapName, path } = message.payload;
5555
7421
  try {
5556
7422
  const mapForBucket = await this.getMapAsync(mapName, "OR");
5557
- if (mapForBucket instanceof import_core7.ORMap) {
7423
+ if (mapForBucket instanceof import_core10.ORMap) {
5558
7424
  const tree = mapForBucket.getMerkleTree();
5559
7425
  const buckets = tree.getBuckets(path);
5560
7426
  const isLeaf = tree.isLeaf(path);
@@ -5598,7 +7464,7 @@ var ServerCoordinator = class {
5598
7464
  const { mapName: diffMapName, keys } = message.payload;
5599
7465
  try {
5600
7466
  const mapForDiff = await this.getMapAsync(diffMapName, "OR");
5601
- if (mapForDiff instanceof import_core7.ORMap) {
7467
+ if (mapForDiff instanceof import_core10.ORMap) {
5602
7468
  const entries = [];
5603
7469
  const allTombstones = mapForDiff.getTombstones();
5604
7470
  for (const key of keys) {
@@ -5630,7 +7496,7 @@ var ServerCoordinator = class {
5630
7496
  const { mapName: pushMapName, entries: pushEntries } = message.payload;
5631
7497
  try {
5632
7498
  const mapForPush = await this.getMapAsync(pushMapName, "OR");
5633
- if (mapForPush instanceof import_core7.ORMap) {
7499
+ if (mapForPush instanceof import_core10.ORMap) {
5634
7500
  let totalAdded = 0;
5635
7501
  let totalUpdated = 0;
5636
7502
  for (const entry of pushEntries) {
@@ -5685,7 +7551,7 @@ var ServerCoordinator = class {
5685
7551
  } else if (op.orRecord && op.orRecord.timestamp) {
5686
7552
  } else if (op.orTag) {
5687
7553
  try {
5688
- ts = import_core7.HLC.parse(op.orTag);
7554
+ ts = import_core10.HLC.parse(op.orTag);
5689
7555
  } catch (e) {
5690
7556
  }
5691
7557
  }
@@ -5697,6 +7563,28 @@ var ServerCoordinator = class {
5697
7563
  client.lastActiveHlc = this.hlc.now();
5698
7564
  }
5699
7565
  }
7566
+ // ============ Phase 4: Partition Map Broadcast ============
7567
+ /**
7568
+ * Broadcast partition map to all connected and authenticated clients.
7569
+ * Called when partition topology changes (node join/leave/failover).
7570
+ */
7571
+ broadcastPartitionMap(partitionMap) {
7572
+ const message = {
7573
+ type: "PARTITION_MAP",
7574
+ payload: partitionMap
7575
+ };
7576
+ let broadcastCount = 0;
7577
+ for (const client of this.clients.values()) {
7578
+ if (client.isAuthenticated && client.socket.readyState === import_ws3.WebSocket.OPEN) {
7579
+ client.writer.write(message);
7580
+ broadcastCount++;
7581
+ }
7582
+ }
7583
+ logger.info({
7584
+ version: partitionMap.version,
7585
+ clientCount: broadcastCount
7586
+ }, "Broadcast partition map to clients");
7587
+ }
5700
7588
  broadcast(message, excludeClientId) {
5701
7589
  const isServerEvent = message.type === "SERVER_EVENT";
5702
7590
  if (isServerEvent) {
@@ -5727,7 +7615,7 @@ var ServerCoordinator = class {
5727
7615
  client.writer.write({ ...message, payload: newPayload });
5728
7616
  }
5729
7617
  } else {
5730
- const msgData = (0, import_core7.serialize)(message);
7618
+ const msgData = (0, import_core10.serialize)(message);
5731
7619
  for (const [id, client] of this.clients) {
5732
7620
  if (id !== excludeClientId && client.socket.readyState === 1) {
5733
7621
  client.writer.writeRaw(msgData);
@@ -5805,7 +7693,7 @@ var ServerCoordinator = class {
5805
7693
  payload: { events: filteredEvents },
5806
7694
  timestamp: this.hlc.now()
5807
7695
  };
5808
- const serializedBatch = (0, import_core7.serialize)(batchMessage);
7696
+ const serializedBatch = (0, import_core10.serialize)(batchMessage);
5809
7697
  for (const client of clients) {
5810
7698
  try {
5811
7699
  client.writer.writeRaw(serializedBatch);
@@ -5890,7 +7778,7 @@ var ServerCoordinator = class {
5890
7778
  payload: { events: filteredEvents },
5891
7779
  timestamp: this.hlc.now()
5892
7780
  };
5893
- const serializedBatch = (0, import_core7.serialize)(batchMessage);
7781
+ const serializedBatch = (0, import_core10.serialize)(batchMessage);
5894
7782
  for (const client of clients) {
5895
7783
  sendPromises.push(new Promise((resolve, reject) => {
5896
7784
  try {
@@ -6034,14 +7922,14 @@ var ServerCoordinator = class {
6034
7922
  async executeLocalQuery(mapName, query) {
6035
7923
  const map = await this.getMapAsync(mapName);
6036
7924
  const records = /* @__PURE__ */ new Map();
6037
- if (map instanceof import_core7.LWWMap) {
7925
+ if (map instanceof import_core10.LWWMap) {
6038
7926
  for (const key of map.allKeys()) {
6039
7927
  const rec = map.getRecord(key);
6040
7928
  if (rec && rec.value !== null) {
6041
7929
  records.set(key, rec);
6042
7930
  }
6043
7931
  }
6044
- } else if (map instanceof import_core7.ORMap) {
7932
+ } else if (map instanceof import_core10.ORMap) {
6045
7933
  const items = map.items;
6046
7934
  for (const key of items.keys()) {
6047
7935
  const values = map.get(key);
@@ -6111,11 +7999,11 @@ var ServerCoordinator = class {
6111
7999
  applyOpToMap(op) {
6112
8000
  const typeHint = op.opType === "OR_ADD" || op.opType === "OR_REMOVE" ? "OR" : "LWW";
6113
8001
  const map = this.getMap(op.mapName, typeHint);
6114
- if (typeHint === "OR" && map instanceof import_core7.LWWMap) {
8002
+ if (typeHint === "OR" && map instanceof import_core10.LWWMap) {
6115
8003
  logger.error({ mapName: op.mapName }, "Map type mismatch: LWWMap but received OR op");
6116
8004
  throw new Error("Map type mismatch: LWWMap but received OR op");
6117
8005
  }
6118
- if (typeHint === "LWW" && map instanceof import_core7.ORMap) {
8006
+ if (typeHint === "LWW" && map instanceof import_core10.ORMap) {
6119
8007
  logger.error({ mapName: op.mapName }, "Map type mismatch: ORMap but received LWW op");
6120
8008
  throw new Error("Map type mismatch: ORMap but received LWW op");
6121
8009
  }
@@ -6126,13 +8014,13 @@ var ServerCoordinator = class {
6126
8014
  mapName: op.mapName,
6127
8015
  key: op.key
6128
8016
  };
6129
- if (map instanceof import_core7.LWWMap) {
8017
+ if (map instanceof import_core10.LWWMap) {
6130
8018
  oldRecord = map.getRecord(op.key);
6131
8019
  map.merge(op.key, op.record);
6132
8020
  recordToStore = op.record;
6133
8021
  eventPayload.eventType = "UPDATED";
6134
8022
  eventPayload.record = op.record;
6135
- } else if (map instanceof import_core7.ORMap) {
8023
+ } else if (map instanceof import_core10.ORMap) {
6136
8024
  oldRecord = map.getRecords(op.key);
6137
8025
  if (op.opType === "OR_ADD") {
6138
8026
  map.apply(op.key, op.orRecord);
@@ -6148,7 +8036,7 @@ var ServerCoordinator = class {
6148
8036
  }
6149
8037
  }
6150
8038
  this.queryRegistry.processChange(op.mapName, map, op.key, op.record || op.orRecord, oldRecord);
6151
- const mapSize = map instanceof import_core7.ORMap ? map.totalRecords : map.size;
8039
+ const mapSize = map instanceof import_core10.ORMap ? map.totalRecords : map.size;
6152
8040
  this.metricsService.setMapSize(op.mapName, mapSize);
6153
8041
  if (this.storage) {
6154
8042
  if (recordToStore) {
@@ -6175,6 +8063,26 @@ var ServerCoordinator = class {
6175
8063
  }
6176
8064
  }
6177
8065
  }
8066
+ /**
8067
+ * Apply replicated operation from another node (callback for ReplicationPipeline)
8068
+ * This is called when we receive a replicated operation as a backup node
8069
+ */
8070
+ async applyReplicatedOperation(operation, opId, sourceNode) {
8071
+ try {
8072
+ const op = operation;
8073
+ logger.debug({ sourceNode, opId, mapName: op.mapName, key: op.key }, "Applying replicated operation");
8074
+ const { eventPayload } = this.applyOpToMap(op);
8075
+ this.broadcast({
8076
+ type: "SERVER_EVENT",
8077
+ payload: eventPayload,
8078
+ timestamp: this.hlc.now()
8079
+ });
8080
+ return true;
8081
+ } catch (error) {
8082
+ logger.error({ sourceNode, opId, error }, "Failed to apply replicated operation");
8083
+ return false;
8084
+ }
8085
+ }
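Note: ServerCoordinator registers this method via setOperationApplier during startup. For a custom host the same contract looks roughly like the sketch below; `applyToLocalState` is a hypothetical stand-in for whatever applies the operation locally:

  function wireApplier(pipeline, applyToLocalState) {
    pipeline.setOperationApplier(async (operation, opId, sourceNode) => {
      try {
        applyToLocalState(operation); // hypothetical local apply step
        return true;  // backup acks with success=true
      } catch (err) {
        return false; // owner then logs "Replication rejected by backup"
      }
    });
  }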
6178
8086
  /**
6179
8087
  * Build OpContext for interceptors.
6180
8088
  */
@@ -6263,6 +8171,12 @@ var ServerCoordinator = class {
6263
8171
  throw err;
6264
8172
  }
6265
8173
  const { eventPayload } = this.applyOpToMap(op);
8174
+ if (this.replicationPipeline && !fromCluster) {
8175
+ const opId = op.id || `${op.mapName}:${op.key}:${Date.now()}`;
8176
+ this.replicationPipeline.replicate(op, opId, op.key).catch((err) => {
8177
+ logger.warn({ opId, key: op.key, err }, "Replication failed (non-fatal)");
8178
+ });
8179
+ }
6266
8180
  this.broadcast({
6267
8181
  type: "SERVER_EVENT",
6268
8182
  payload: eventPayload,
@@ -6385,6 +8299,12 @@ var ServerCoordinator = class {
6385
8299
  throw err;
6386
8300
  }
6387
8301
  const { eventPayload } = this.applyOpToMap(op);
8302
+ if (this.replicationPipeline) {
8303
+ const opId = op.id || `${op.mapName}:${op.key}:${Date.now()}`;
8304
+ this.replicationPipeline.replicate(op, opId, op.key).catch((err) => {
8305
+ logger.warn({ opId, key: op.key, err }, "Batch replication failed (non-fatal)");
8306
+ });
8307
+ }
6388
8308
  batchedEvents.push(eventPayload);
6389
8309
  this.broadcastToCluster(eventPayload);
6390
8310
  this.runAfterInterceptors(op, context);
@@ -6392,11 +8312,11 @@ var ServerCoordinator = class {
6392
8312
  handleClusterEvent(payload) {
6393
8313
  const { mapName, key, eventType } = payload;
6394
8314
  const map = this.getMap(mapName, eventType === "OR_ADD" || eventType === "OR_REMOVE" ? "OR" : "LWW");
6395
- const oldRecord = map instanceof import_core7.LWWMap ? map.getRecord(key) : null;
8315
+ const oldRecord = map instanceof import_core10.LWWMap ? map.getRecord(key) : null;
6396
8316
  if (this.partitionService.isRelated(key)) {
6397
- if (map instanceof import_core7.LWWMap && payload.record) {
8317
+ if (map instanceof import_core10.LWWMap && payload.record) {
6398
8318
  map.merge(key, payload.record);
6399
- } else if (map instanceof import_core7.ORMap) {
8319
+ } else if (map instanceof import_core10.ORMap) {
6400
8320
  if (eventType === "OR_ADD" && payload.orRecord) {
6401
8321
  map.apply(key, payload.orRecord);
6402
8322
  } else if (eventType === "OR_REMOVE" && payload.orTag) {
@@ -6415,9 +8335,9 @@ var ServerCoordinator = class {
6415
8335
  if (!this.maps.has(name)) {
6416
8336
  let map;
6417
8337
  if (typeHint === "OR") {
6418
- map = new import_core7.ORMap(this.hlc);
8338
+ map = new import_core10.ORMap(this.hlc);
6419
8339
  } else {
6420
- map = new import_core7.LWWMap(this.hlc);
8340
+ map = new import_core10.LWWMap(this.hlc);
6421
8341
  }
6422
8342
  this.maps.set(name, map);
6423
8343
  if (this.storage) {
@@ -6440,7 +8360,7 @@ var ServerCoordinator = class {
6440
8360
  this.getMap(name, typeHint);
6441
8361
  const loadingPromise = this.mapLoadingPromises.get(name);
6442
8362
  const map = this.maps.get(name);
6443
- const mapSize = map instanceof import_core7.LWWMap ? Array.from(map.entries()).length : map instanceof import_core7.ORMap ? map.size : 0;
8363
+ const mapSize = map instanceof import_core10.LWWMap ? Array.from(map.entries()).length : map instanceof import_core10.ORMap ? map.size : 0;
6444
8364
  logger.info({
6445
8365
  mapName: name,
6446
8366
  mapExisted,
@@ -6450,7 +8370,7 @@ var ServerCoordinator = class {
6450
8370
  if (loadingPromise) {
6451
8371
  logger.info({ mapName: name }, "[getMapAsync] Waiting for loadMapFromStorage...");
6452
8372
  await loadingPromise;
6453
- const newMapSize = map instanceof import_core7.LWWMap ? Array.from(map.entries()).length : map instanceof import_core7.ORMap ? map.size : 0;
8373
+ const newMapSize = map instanceof import_core10.LWWMap ? Array.from(map.entries()).length : map instanceof import_core10.ORMap ? map.size : 0;
6454
8374
  logger.info({ mapName: name, mapSizeAfterLoad: newMapSize }, "[getMapAsync] Load completed");
6455
8375
  }
6456
8376
  return this.maps.get(name);
@@ -6476,16 +8396,16 @@ var ServerCoordinator = class {
6476
8396
  const currentMap = this.maps.get(name);
6477
8397
  if (!currentMap) return;
6478
8398
  let targetMap = currentMap;
6479
- if (isOR && currentMap instanceof import_core7.LWWMap) {
8399
+ if (isOR && currentMap instanceof import_core10.LWWMap) {
6480
8400
  logger.info({ mapName: name }, "Map auto-detected as ORMap. Switching type.");
6481
- targetMap = new import_core7.ORMap(this.hlc);
8401
+ targetMap = new import_core10.ORMap(this.hlc);
6482
8402
  this.maps.set(name, targetMap);
6483
- } else if (!isOR && currentMap instanceof import_core7.ORMap && typeHint !== "OR") {
8403
+ } else if (!isOR && currentMap instanceof import_core10.ORMap && typeHint !== "OR") {
6484
8404
  logger.info({ mapName: name }, "Map auto-detected as LWWMap. Switching type.");
6485
- targetMap = new import_core7.LWWMap(this.hlc);
8405
+ targetMap = new import_core10.LWWMap(this.hlc);
6486
8406
  this.maps.set(name, targetMap);
6487
8407
  }
6488
- if (targetMap instanceof import_core7.ORMap) {
8408
+ if (targetMap instanceof import_core10.ORMap) {
6489
8409
  for (const [key, record] of records) {
6490
8410
  if (key === "__tombstones__") {
6491
8411
  const t = record;
@@ -6498,7 +8418,7 @@ var ServerCoordinator = class {
6498
8418
  }
6499
8419
  }
6500
8420
  }
6501
- } else if (targetMap instanceof import_core7.LWWMap) {
8421
+ } else if (targetMap instanceof import_core10.LWWMap) {
6502
8422
  for (const [key, record] of records) {
6503
8423
  if (!record.type) {
6504
8424
  targetMap.merge(key, record);
@@ -6509,7 +8429,7 @@ var ServerCoordinator = class {
6509
8429
  if (count > 0) {
6510
8430
  logger.info({ mapName: name, count }, "Loaded records for map");
6511
8431
  this.queryRegistry.refreshSubscriptions(name, targetMap);
6512
- const mapSize = targetMap instanceof import_core7.ORMap ? targetMap.totalRecords : targetMap.size;
8432
+ const mapSize = targetMap instanceof import_core10.ORMap ? targetMap.totalRecords : targetMap.size;
6513
8433
  this.metricsService.setMapSize(name, mapSize);
6514
8434
  }
6515
8435
  } catch (err) {
@@ -6591,7 +8511,7 @@ var ServerCoordinator = class {
6591
8511
  reportLocalHlc() {
6592
8512
  let minHlc = this.hlc.now();
6593
8513
  for (const client of this.clients.values()) {
6594
- if (import_core7.HLC.compare(client.lastActiveHlc, minHlc) < 0) {
8514
+ if (import_core10.HLC.compare(client.lastActiveHlc, minHlc) < 0) {
6595
8515
  minHlc = client.lastActiveHlc;
6596
8516
  }
6597
8517
  }
@@ -6612,7 +8532,7 @@ var ServerCoordinator = class {
6612
8532
  let globalSafe = this.hlc.now();
6613
8533
  let initialized = false;
6614
8534
  for (const ts of this.gcReports.values()) {
6615
- if (!initialized || import_core7.HLC.compare(ts, globalSafe) < 0) {
8535
+ if (!initialized || import_core10.HLC.compare(ts, globalSafe) < 0) {
6616
8536
  globalSafe = ts;
6617
8537
  initialized = true;
6618
8538
  }
@@ -6647,7 +8567,7 @@ var ServerCoordinator = class {
6647
8567
  logger.info({ olderThanMillis: olderThan.millis }, "Performing Garbage Collection");
6648
8568
  const now = Date.now();
6649
8569
  for (const [name, map] of this.maps) {
6650
- if (map instanceof import_core7.LWWMap) {
8570
+ if (map instanceof import_core10.LWWMap) {
6651
8571
  for (const key of map.allKeys()) {
6652
8572
  const record = map.getRecord(key);
6653
8573
  if (record && record.value !== null && record.ttlMs) {
@@ -6699,7 +8619,7 @@ var ServerCoordinator = class {
6699
8619
  });
6700
8620
  }
6701
8621
  }
6702
- } else if (map instanceof import_core7.ORMap) {
8622
+ } else if (map instanceof import_core10.ORMap) {
6703
8623
  const items = map.items;
6704
8624
  const tombstonesSet = map.tombstones;
6705
8625
  const tagsToExpire = [];
@@ -6802,17 +8722,17 @@ var ServerCoordinator = class {
6802
8722
  stringToWriteConcern(value) {
6803
8723
  switch (value) {
6804
8724
  case "FIRE_AND_FORGET":
6805
- return import_core7.WriteConcern.FIRE_AND_FORGET;
8725
+ return import_core10.WriteConcern.FIRE_AND_FORGET;
6806
8726
  case "MEMORY":
6807
- return import_core7.WriteConcern.MEMORY;
8727
+ return import_core10.WriteConcern.MEMORY;
6808
8728
  case "APPLIED":
6809
- return import_core7.WriteConcern.APPLIED;
8729
+ return import_core10.WriteConcern.APPLIED;
6810
8730
  case "REPLICATED":
6811
- return import_core7.WriteConcern.REPLICATED;
8731
+ return import_core10.WriteConcern.REPLICATED;
6812
8732
  case "PERSISTED":
6813
- return import_core7.WriteConcern.PERSISTED;
8733
+ return import_core10.WriteConcern.PERSISTED;
6814
8734
  default:
6815
- return import_core7.WriteConcern.MEMORY;
8735
+ return import_core10.WriteConcern.MEMORY;
6816
8736
  }
6817
8737
  }
6818
8738
  /**
@@ -6869,7 +8789,7 @@ var ServerCoordinator = class {
6869
8789
  }
6870
8790
  });
6871
8791
  if (op.id) {
6872
- this.writeAckManager.notifyLevel(op.id, import_core7.WriteConcern.REPLICATED);
8792
+ this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.REPLICATED);
6873
8793
  }
6874
8794
  }
6875
8795
  }
@@ -6877,7 +8797,7 @@ var ServerCoordinator = class {
6877
8797
  this.broadcastBatch(batchedEvents, clientId);
6878
8798
  for (const op of ops) {
6879
8799
  if (op.id && this.partitionService.isLocalOwner(op.key)) {
6880
- this.writeAckManager.notifyLevel(op.id, import_core7.WriteConcern.REPLICATED);
8800
+ this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.REPLICATED);
6881
8801
  }
6882
8802
  }
6883
8803
  }
@@ -6905,7 +8825,7 @@ var ServerCoordinator = class {
6905
8825
  const owner = this.partitionService.getOwner(op.key);
6906
8826
  await this.forwardOpAndWait(op, owner);
6907
8827
  if (op.id) {
6908
- this.writeAckManager.notifyLevel(op.id, import_core7.WriteConcern.REPLICATED);
8828
+ this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.REPLICATED);
6909
8829
  }
6910
8830
  }
6911
8831
  }
@@ -6913,7 +8833,7 @@ var ServerCoordinator = class {
6913
8833
  await this.broadcastBatchSync(batchedEvents, clientId);
6914
8834
  for (const op of ops) {
6915
8835
  if (op.id && this.partitionService.isLocalOwner(op.key)) {
6916
- this.writeAckManager.notifyLevel(op.id, import_core7.WriteConcern.REPLICATED);
8836
+ this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.REPLICATED);
6917
8837
  }
6918
8838
  }
6919
8839
  }
@@ -6941,7 +8861,7 @@ var ServerCoordinator = class {
6941
8861
  }
6942
8862
  const { eventPayload } = this.applyOpToMap(op);
6943
8863
  if (op.id) {
6944
- this.writeAckManager.notifyLevel(op.id, import_core7.WriteConcern.APPLIED);
8864
+ this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.APPLIED);
6945
8865
  }
6946
8866
  if (eventPayload) {
6947
8867
  batchedEvents.push({
@@ -6955,7 +8875,7 @@ var ServerCoordinator = class {
6955
8875
  try {
6956
8876
  await this.persistOpSync(op);
6957
8877
  if (op.id) {
6958
- this.writeAckManager.notifyLevel(op.id, import_core7.WriteConcern.PERSISTED);
8878
+ this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.PERSISTED);
6959
8879
  }
6960
8880
  } catch (err) {
6961
8881
  logger.error({ opId: op.id, err }, "Persistence failed");
@@ -7298,10 +9218,10 @@ var RateLimitInterceptor = class {
7298
9218
  };
7299
9219
 
7300
9220
  // src/utils/nativeStats.ts
7301
- var import_core8 = require("@topgunbuild/core");
9221
+ var import_core11 = require("@topgunbuild/core");
7302
9222
  function getNativeModuleStatus() {
7303
9223
  return {
7304
- nativeHash: (0, import_core8.isUsingNativeHash)(),
9224
+ nativeHash: (0, import_core11.isUsingNativeHash)(),
7305
9225
  sharedArrayBuffer: SharedMemoryManager.isAvailable()
7306
9226
  };
7307
9227
  }
@@ -7332,19 +9252,401 @@ function logNativeStatus() {
7332
9252
  ` - SharedArrayBuffer: ${status.sharedArrayBuffer ? "available" : "unavailable"}`
7333
9253
  );
7334
9254
  }
9255
+
9256
+ // src/cluster/ClusterCoordinator.ts
9257
+ var import_events9 = require("events");
9258
+ var import_core12 = require("@topgunbuild/core");
9259
+ var DEFAULT_CLUSTER_COORDINATOR_CONFIG = {
9260
+ gradualRebalancing: true,
9261
+ migration: import_core12.DEFAULT_MIGRATION_CONFIG,
9262
+ replication: import_core12.DEFAULT_REPLICATION_CONFIG,
9263
+ replicationEnabled: true
9264
+ };
9265
+ var ClusterCoordinator = class extends import_events9.EventEmitter {
9266
+ constructor(config) {
9267
+ super();
9268
+ this.replicationPipeline = null;
9269
+ // State
9270
+ this.started = false;
9271
+ this.actualPort = 0;
9272
+ this.config = {
9273
+ ...DEFAULT_CLUSTER_COORDINATOR_CONFIG,
9274
+ ...config
9275
+ };
9276
+ this.clusterManager = new ClusterManager(this.config.cluster);
9277
+ this.lagTracker = new LagTracker();
9278
+ const partitionServiceConfig = {
9279
+ gradualRebalancing: this.config.gradualRebalancing,
9280
+ migration: this.config.migration
9281
+ };
9282
+ this.partitionService = new PartitionService(this.clusterManager, partitionServiceConfig);
9283
+ if (this.config.replicationEnabled) {
9284
+ this.replicationPipeline = new ReplicationPipeline(
9285
+ this.clusterManager,
9286
+ this.partitionService,
9287
+ this.config.replication
9288
+ );
9289
+ }
9290
+ this.setupEventHandlers();
9291
+ }
9292
+ // ============================================
9293
+ // Lifecycle Methods
9294
+ // ============================================
9295
+ /**
9296
+ * Start the cluster coordinator
9297
+ */
9298
+ async start() {
9299
+ if (this.started) {
9300
+ return this.actualPort;
9301
+ }
9302
+ logger.info({ nodeId: this.config.cluster.nodeId }, "Starting ClusterCoordinator");
9303
+ this.actualPort = await this.clusterManager.start();
9304
+ const migrationManager = this.partitionService.getMigrationManager();
9305
+ if (migrationManager && this.config.dataCollector) {
9306
+ migrationManager.setDataCollector(this.config.dataCollector);
9307
+ }
9308
+ if (migrationManager && this.config.dataStorer) {
9309
+ migrationManager.setDataStorer(this.config.dataStorer);
9310
+ }
9311
+ this.started = true;
9312
+ this.emit("started");
9313
+ logger.info({ nodeId: this.config.cluster.nodeId, port: this.actualPort }, "ClusterCoordinator started");
9314
+ return this.actualPort;
9315
+ }
9316
+ /**
9317
+ * Stop the cluster coordinator
9318
+ */
9319
+ async stop() {
9320
+ if (!this.started) return;
9321
+ logger.info({ nodeId: this.config.cluster.nodeId }, "Stopping ClusterCoordinator");
9322
+ await this.partitionService.cancelMigrations();
9323
+ this.replicationPipeline?.close();
9324
+ this.clusterManager.stop();
9325
+ this.started = false;
9326
+ this.emit("stopped");
9327
+ logger.info({ nodeId: this.config.cluster.nodeId }, "ClusterCoordinator stopped");
9328
+ }
9329
+ // ============================================
9330
+ // Cluster Information
9331
+ // ============================================
9332
+ /**
9333
+ * Get local node ID
9334
+ */
9335
+ getNodeId() {
9336
+ return this.config.cluster.nodeId;
9337
+ }
9338
+ /**
9339
+ * Get cluster port
9340
+ */
9341
+ getPort() {
9342
+ return this.actualPort;
9343
+ }
9344
+ /**
9345
+ * Get all cluster members
9346
+ */
9347
+ getMembers() {
9348
+ return this.clusterManager.getMembers();
9349
+ }
9350
+ /**
9351
+ * Check if this is the local node
9352
+ */
9353
+ isLocal(nodeId) {
9354
+ return this.clusterManager.isLocal(nodeId);
9355
+ }
9356
+ /**
9357
+ * Check if coordinator is started
9358
+ */
9359
+ isStarted() {
9360
+ return this.started;
9361
+ }
+ // ============================================
+ // Partition Operations
+ // ============================================
+ /**
+ * Get current partition map
+ */
+ getPartitionMap() {
+ return this.partitionService.getPartitionMap();
+ }
+ /**
+ * Get partition map version
+ */
+ getPartitionMapVersion() {
+ return this.partitionService.getMapVersion();
+ }
+ /**
+ * Get partition ID for a key
+ */
+ getPartitionId(key) {
+ return this.partitionService.getPartitionId(key);
+ }
+ /**
+ * Get owner node for a key
+ */
+ getOwner(key) {
+ return this.partitionService.getOwner(key);
+ }
+ /**
+ * Check if this node owns the key
+ */
+ isLocalOwner(key) {
+ return this.partitionService.isLocalOwner(key);
+ }
+ /**
+ * Check if this node is a backup for the key
+ */
+ isLocalBackup(key) {
+ return this.partitionService.isLocalBackup(key);
+ }
+ /**
+ * Get backup nodes for a partition
+ */
+ getBackups(partitionId) {
+ return this.partitionService.getBackups(partitionId);
+ }
+ /**
+ * Check if partition is currently migrating
+ */
+ isMigrating(partitionId) {
+ return this.partitionService.isMigrating(partitionId);
+ }
+ /**
+ * Check if any rebalancing is in progress
+ */
+ isRebalancing() {
+ return this.partitionService.isRebalancing();
+ }
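// --- Editorial sketch (not part of the published package) ----------------
// How the partition accessors above compose into owner-based routing,
// reusing `coordinator` from the lifecycle sketch earlier. applyLocally is a
// hypothetical placeholder for node-local storage logic.
function routeWrite(key, message) {
  if (coordinator.isLocalOwner(key)) {
    applyLocally(key, message); // hypothetical local apply
  } else {
    const owner = coordinator.getOwner(key); // owner node id from the partition map
    coordinator.send(owner, message);        // forward over the cluster transport
  }
}
// --------------------------------------------------------------------------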
+ // ============================================
+ // Migration Operations
+ // ============================================
+ /**
+ * Get migration status
+ */
+ getMigrationStatus() {
+ return this.partitionService.getMigrationStatus();
+ }
+ /**
+ * Get migration metrics
+ */
+ getMigrationMetrics() {
+ return this.partitionService.getMigrationManager()?.getMetrics() ?? null;
+ }
+ /**
+ * Cancel all active migrations
+ */
+ async cancelMigrations() {
+ await this.partitionService.cancelMigrations();
+ }
+ /**
+ * Set data collector for migrations
+ */
+ setDataCollector(collector) {
+ const migrationManager = this.partitionService.getMigrationManager();
+ if (migrationManager) {
+ migrationManager.setDataCollector(collector);
+ }
+ }
+ /**
+ * Set data storer for incoming migrations
+ */
+ setDataStorer(storer) {
+ const migrationManager = this.partitionService.getMigrationManager();
+ if (migrationManager) {
+ migrationManager.setDataStorer(storer);
+ }
+ }
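// --- Editorial sketch (not part of the published package) ----------------
// Wiring the migration hooks above. The collector/storer call signatures are
// not visible in this diff; the (partitionId) => entries and
// (partitionId, entries) shapes below are assumptions, as is `store`.
coordinator.setDataCollector(async (partitionId) => {
  // hand back this node's data for a partition migrating out
  return store.entriesForPartition(partitionId); // hypothetical store helper
});
coordinator.setDataStorer(async (partitionId, entries) => {
  // persist data arriving for a partition migrating in
  await store.putEntries(partitionId, entries); // hypothetical store helper
});
// --------------------------------------------------------------------------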
+ // ============================================
+ // Replication Operations
+ // ============================================
+ /**
+ * Replicate an operation to backup nodes
+ */
+ async replicate(operation, opId, key, options = {}) {
+ if (!this.replicationPipeline) {
+ return { success: true, ackedBy: [] };
+ }
+ return this.replicationPipeline.replicate(operation, opId, key, options);
+ }
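// --- Editorial sketch (not part of the published package) ----------------
// Calling replicate() as defined above. The operation payload and options
// shapes are not shown in this diff and are assumed for illustration; the
// { success, ackedBy } result and the disabled-pipeline short-circuit are
// taken from the code itself.
async function writeReplicated() {
  const result = await coordinator.replicate(
    { type: "put", value: { name: "Ada" } }, // assumed payload shape
    "op-123",  // opId: caller-supplied operation identifier
    "user:42", // key: resolves the partition whose backups receive the op
    {}         // options: shape not visible in this diff
  );
  console.log(result.success, result.ackedBy);
}
// --------------------------------------------------------------------------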
+ /**
+ * Get replication health status
+ */
+ getReplicationHealth() {
+ return this.lagTracker.getHealth();
+ }
+ /**
+ * Get replication lag for a specific node
+ */
+ getReplicationLag(nodeId) {
+ return this.lagTracker.getLag(nodeId);
+ }
+ /**
+ * Check if a node is healthy for replication
+ */
+ isNodeHealthy(nodeId) {
+ return this.lagTracker.isNodeHealthy(nodeId);
+ }
+ /**
+ * Check if a node is laggy
+ */
+ isNodeLaggy(nodeId) {
+ return this.lagTracker.isNodeLaggy(nodeId);
+ }
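// --- Editorial sketch (not part of the published package) ----------------
// Periodic health sweep over the lag queries above; the return shape of
// getReplicationLag is not shown in this diff, so it is logged opaquely.
setInterval(() => {
  for (const nodeId of coordinator.getMembers()) {
    if (coordinator.isLocal(nodeId)) continue;
    if (coordinator.isNodeLaggy(nodeId)) {
      console.warn(`replica ${nodeId} is laggy`, coordinator.getReplicationLag(nodeId));
    }
  }
}, 5000);
// --------------------------------------------------------------------------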
+ // ============================================
+ // Cluster Communication
+ // ============================================
+ /**
+ * Send message to a specific node
+ */
+ send(nodeId, message) {
+ this.clusterManager.sendToNode(nodeId, message);
+ }
+ /**
+ * Broadcast message to all nodes
+ */
+ broadcast(message) {
+ for (const nodeId of this.clusterManager.getMembers()) {
+ if (!this.clusterManager.isLocal(nodeId)) {
+ this.clusterManager.sendToNode(nodeId, message);
+ }
+ }
+ }
+ // ============================================
+ // Component Access
+ // ============================================
+ /**
+ * Get underlying ClusterManager
+ */
+ getClusterManager() {
+ return this.clusterManager;
+ }
+ /**
+ * Get underlying PartitionService
+ */
+ getPartitionService() {
+ return this.partitionService;
+ }
+ /**
+ * Get underlying ReplicationPipeline
+ */
+ getReplicationPipeline() {
+ return this.replicationPipeline;
+ }
+ /**
+ * Get underlying LagTracker
+ */
+ getLagTracker() {
+ return this.lagTracker;
+ }
+ // ============================================
+ // Metrics Export
+ // ============================================
+ /**
+ * Get all metrics in Prometheus format
+ */
+ getPrometheusMetrics() {
+ const lines = [];
+ lines.push("# HELP topgun_cluster_members Number of cluster members");
+ lines.push("# TYPE topgun_cluster_members gauge");
+ lines.push(`topgun_cluster_members ${this.clusterManager.getMembers().length}`);
+ lines.push("");
+ lines.push("# HELP topgun_cluster_started Cluster started status (1=started, 0=stopped)");
+ lines.push("# TYPE topgun_cluster_started gauge");
+ lines.push(`topgun_cluster_started ${this.started ? 1 : 0}`);
+ lines.push("");
+ lines.push("# HELP topgun_partition_map_version Current partition map version");
+ lines.push("# TYPE topgun_partition_map_version gauge");
+ lines.push(`topgun_partition_map_version ${this.partitionService.getMapVersion()}`);
+ const migrationMetrics = this.getMigrationMetrics();
+ if (migrationMetrics) {
+ lines.push("");
+ lines.push("# HELP topgun_migrations_started Total migrations started");
+ lines.push("# TYPE topgun_migrations_started counter");
+ lines.push(`topgun_migrations_started ${migrationMetrics.migrationsStarted}`);
+ lines.push("");
+ lines.push("# HELP topgun_migrations_completed Total migrations completed");
+ lines.push("# TYPE topgun_migrations_completed counter");
+ lines.push(`topgun_migrations_completed ${migrationMetrics.migrationsCompleted}`);
+ lines.push("");
+ lines.push("# HELP topgun_migrations_failed Total migrations failed");
+ lines.push("# TYPE topgun_migrations_failed counter");
+ lines.push(`topgun_migrations_failed ${migrationMetrics.migrationsFailed}`);
+ lines.push("");
+ lines.push("# HELP topgun_migrations_active Currently active migrations");
+ lines.push("# TYPE topgun_migrations_active gauge");
+ lines.push(`topgun_migrations_active ${migrationMetrics.activeMigrations}`);
+ lines.push("");
+ lines.push("# HELP topgun_migrations_queued Queued migrations");
+ lines.push("# TYPE topgun_migrations_queued gauge");
+ lines.push(`topgun_migrations_queued ${migrationMetrics.queuedMigrations}`);
+ }
+ lines.push("");
+ lines.push(this.lagTracker.toPrometheusMetrics());
+ return lines.join("\n");
+ }
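// --- Editorial sketch (not part of the published package) ----------------
// Exposing the Prometheus text built above on a scrape endpoint using only
// Node's built-in http module; the port is an arbitrary choice.
const http = require("http");
http.createServer((req, res) => {
  if (req.url === "/metrics") {
    res.writeHead(200, { "Content-Type": "text/plain; version=0.0.4" });
    res.end(coordinator.getPrometheusMetrics());
  } else {
    res.writeHead(404);
    res.end();
  }
}).listen(9464);
// --------------------------------------------------------------------------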
+ // ============================================
+ // Private Methods
+ // ============================================
+ setupEventHandlers() {
+ this.clusterManager.on("memberJoined", (nodeId) => {
+ logger.info({ nodeId }, "Cluster member joined");
+ this.emit("member:joined", nodeId);
+ });
+ this.clusterManager.on("memberLeft", (nodeId) => {
+ logger.info({ nodeId }, "Cluster member left");
+ this.lagTracker.removeNode(nodeId);
+ this.emit("member:left", nodeId);
+ });
+ this.partitionService.on("rebalanced", (map, changes) => {
+ logger.info({ version: map.version, changesCount: changes.length }, "Partition map rebalanced");
+ this.emit("partition:rebalanced", map, changes);
+ });
+ this.partitionService.on("partitionMoved", (info) => {
+ this.emit("partition:moved", info);
+ });
+ const migrationManager = this.partitionService.getMigrationManager();
+ if (migrationManager) {
+ migrationManager.on("migrationStarted", (partitionId, targetNode) => {
+ this.emit("migration:started", partitionId, targetNode);
+ });
+ migrationManager.on("migrationComplete", (partitionId) => {
+ this.emit("migration:completed", partitionId);
+ });
+ migrationManager.on("migrationFailed", (partitionId, error) => {
+ this.emit("migration:failed", partitionId, error);
+ });
+ }
+ if (this.replicationPipeline) {
+ this.replicationPipeline.on("ackReceived", (nodeId) => {
+ this.lagTracker.recordAck(nodeId);
+ });
+ this.replicationPipeline.on("replicationSent", (nodeId) => {
+ this.lagTracker.incrementPending(nodeId);
+ });
+ }
+ }
+ };
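// --- Editorial sketch (not part of the published package) ----------------
// Subscribing to the events re-emitted by setupEventHandlers above; the
// event names and payloads are taken directly from this diff.
coordinator.on("member:joined", (nodeId) => console.log("joined:", nodeId));
coordinator.on("member:left", (nodeId) => console.log("left:", nodeId));
coordinator.on("partition:rebalanced", (map, changes) =>
  console.log(`partition map v${map.version}, ${changes.length} changes`));
coordinator.on("migration:failed", (partitionId, error) =>
  console.error(`migration of partition ${partitionId} failed`, error));
// --------------------------------------------------------------------------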
  // Annotate the CommonJS export names for ESM import in node:
  0 && (module.exports = {
  BufferPool,
+ ClusterCoordinator,
+ ClusterManager,
  ConnectionRateLimiter,
+ DEFAULT_CLUSTER_COORDINATOR_CONFIG,
+ DEFAULT_LAG_TRACKER_CONFIG,
  FilterTasklet,
  ForEachTasklet,
  IteratorTasklet,
+ LagTracker,
+ LockManager,
  MapTasklet,
  MemoryServerAdapter,
+ MigrationManager,
  ObjectPool,
+ PartitionService,
  PostgresAdapter,
  RateLimitInterceptor,
  ReduceTasklet,
+ ReplicationPipeline,
  SecurityManager,
  ServerCoordinator,
  TaskletScheduler,