@topgunbuild/server 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +910 -9
- package/dist/index.d.ts +910 -9
- package/dist/index.js +2472 -170
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +3323 -1016
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/dist/index.js (CHANGED)
@@ -31,16 +31,25 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
 var index_exports = {};
 __export(index_exports, {
   BufferPool: () => BufferPool,
+  ClusterCoordinator: () => ClusterCoordinator,
+  ClusterManager: () => ClusterManager,
   ConnectionRateLimiter: () => ConnectionRateLimiter,
+  DEFAULT_CLUSTER_COORDINATOR_CONFIG: () => DEFAULT_CLUSTER_COORDINATOR_CONFIG,
+  DEFAULT_LAG_TRACKER_CONFIG: () => DEFAULT_LAG_TRACKER_CONFIG,
   FilterTasklet: () => FilterTasklet,
   ForEachTasklet: () => ForEachTasklet,
   IteratorTasklet: () => IteratorTasklet,
+  LagTracker: () => LagTracker,
+  LockManager: () => LockManager,
   MapTasklet: () => MapTasklet,
   MemoryServerAdapter: () => MemoryServerAdapter,
+  MigrationManager: () => MigrationManager,
   ObjectPool: () => ObjectPool,
+  PartitionService: () => PartitionService,
   PostgresAdapter: () => PostgresAdapter,
   RateLimitInterceptor: () => RateLimitInterceptor,
   ReduceTasklet: () => ReduceTasklet,
+  ReplicationPipeline: () => ReplicationPipeline,
   SecurityManager: () => SecurityManager,
   ServerCoordinator: () => ServerCoordinator,
   TaskletScheduler: () => TaskletScheduler,
@@ -73,7 +82,7 @@ var import_http = require("http");
 var import_https = require("https");
 var import_fs2 = require("fs");
 var import_ws3 = require("ws");
-var
+var import_core10 = require("@topgunbuild/core");
 var jwt = __toESM(require("jsonwebtoken"));
 var crypto = __toESM(require("crypto"));
 
@@ -633,11 +642,268 @@ var TopicManager = class {
 
 // src/cluster/ClusterManager.ts
 var import_ws = require("ws");
-var
+var import_events2 = require("events");
 var dns = __toESM(require("dns"));
 var import_fs = require("fs");
 var https = __toESM(require("https"));
-
+
+// src/cluster/FailureDetector.ts
+var import_events = require("events");
+var DEFAULT_FAILURE_DETECTOR_CONFIG = {
+  heartbeatIntervalMs: 1e3,
+  suspicionTimeoutMs: 5e3,
+  confirmationTimeoutMs: 1e4,
+  phiThreshold: 8,
+  minSamples: 10,
+  maxSamples: 100,
+  initialHeartbeatIntervalMs: 1e3
+};
+var FailureDetector = class extends import_events.EventEmitter {
+  constructor(config = {}) {
+    super();
+    this.nodeStates = /* @__PURE__ */ new Map();
+    this.monitoringNodes = /* @__PURE__ */ new Set();
+    this.confirmationTimers = /* @__PURE__ */ new Map();
+    this.started = false;
+    this.config = { ...DEFAULT_FAILURE_DETECTOR_CONFIG, ...config };
+  }
+  /**
+   * Start the failure detector monitoring loop.
+   */
+  start() {
+    if (this.started) return;
+    this.started = true;
+    this.checkTimer = setInterval(() => {
+      this.checkAllNodes();
+    }, this.config.heartbeatIntervalMs);
+    logger.info({ config: this.config }, "FailureDetector started");
+  }
+  /**
+   * Stop the failure detector and clean up.
+   */
+  stop() {
+    if (!this.started) return;
+    this.started = false;
+    if (this.checkTimer) {
+      clearInterval(this.checkTimer);
+      this.checkTimer = void 0;
+    }
+    for (const timer of this.confirmationTimers.values()) {
+      clearTimeout(timer);
+    }
+    this.confirmationTimers.clear();
+    logger.info("FailureDetector stopped");
+  }
+  /**
+   * Start monitoring a node.
+   */
+  startMonitoring(nodeId) {
+    if (this.monitoringNodes.has(nodeId)) return;
+    this.monitoringNodes.add(nodeId);
+    this.nodeStates.set(nodeId, {
+      lastHeartbeat: Date.now(),
+      intervalHistory: [],
+      isSuspected: false,
+      isConfirmedFailed: false
+    });
+    logger.debug({ nodeId }, "Started monitoring node");
+  }
+  /**
+   * Stop monitoring a node.
+   */
+  stopMonitoring(nodeId) {
+    this.monitoringNodes.delete(nodeId);
+    this.nodeStates.delete(nodeId);
+    const timer = this.confirmationTimers.get(nodeId);
+    if (timer) {
+      clearTimeout(timer);
+      this.confirmationTimers.delete(nodeId);
+    }
+    logger.debug({ nodeId }, "Stopped monitoring node");
+  }
+  /**
+   * Record a heartbeat from a node.
+   * This updates the node's state and clears any suspicion.
+   */
+  recordHeartbeat(nodeId) {
+    const state = this.nodeStates.get(nodeId);
+    if (!state) {
+      this.startMonitoring(nodeId);
+      return;
+    }
+    const now = Date.now();
+    const interval = now - state.lastHeartbeat;
+    state.intervalHistory.push(interval);
+    if (state.intervalHistory.length > this.config.maxSamples) {
+      state.intervalHistory.shift();
+    }
+    state.lastHeartbeat = now;
+    if (state.isSuspected) {
+      state.isSuspected = false;
+      state.suspicionStartTime = void 0;
+      state.isConfirmedFailed = false;
+      const timer = this.confirmationTimers.get(nodeId);
+      if (timer) {
+        clearTimeout(timer);
+        this.confirmationTimers.delete(nodeId);
+      }
+      this.emit("nodeRecovered", { nodeId });
+      logger.info({ nodeId }, "Node recovered");
+    }
+  }
+  /**
+   * Check all monitored nodes for failure.
+   */
+  checkAllNodes() {
+    for (const nodeId of this.monitoringNodes) {
+      const phi = this.calculatePhi(nodeId);
+      const state = this.nodeStates.get(nodeId);
+      if (!state) continue;
+      if (phi > this.config.phiThreshold) {
+        if (!state.isSuspected) {
+          state.isSuspected = true;
+          state.suspicionStartTime = Date.now();
+          this.emit("nodeSuspected", { nodeId, phi });
+          logger.warn({ nodeId, phi }, "Node suspected");
+          this.scheduleConfirmation(nodeId);
+        }
+      }
+    }
+  }
+  /**
+   * Schedule failure confirmation after suspicion timeout.
+   */
+  scheduleConfirmation(nodeId) {
+    const existingTimer = this.confirmationTimers.get(nodeId);
+    if (existingTimer) {
+      clearTimeout(existingTimer);
+    }
+    const timer = setTimeout(() => {
+      this.confirmFailure(nodeId);
+    }, this.config.confirmationTimeoutMs);
+    this.confirmationTimers.set(nodeId, timer);
+  }
+  /**
+   * Confirm node failure after confirmation timeout.
+   */
+  confirmFailure(nodeId) {
+    const state = this.nodeStates.get(nodeId);
+    if (!state) return;
+    if (state.isSuspected && !state.isConfirmedFailed) {
+      state.isConfirmedFailed = true;
+      this.emit("nodeConfirmedFailed", { nodeId });
+      logger.error({ nodeId }, "Node failure confirmed");
+    }
+    this.confirmationTimers.delete(nodeId);
+  }
+  /**
+   * Calculate the phi value for a node using the Phi Accrual algorithm.
+   *
+   * Phi = -log10(P_later(t_now - t_last))
+   *
+   * where P_later is the probability that a heartbeat will arrive later than expected.
+   */
+  calculatePhi(nodeId) {
+    const state = this.nodeStates.get(nodeId);
+    if (!state) return 0;
+    const now = Date.now();
+    const timeSinceLastHeartbeat = now - state.lastHeartbeat;
+    if (state.intervalHistory.length < this.config.minSamples) {
+      const expectedInterval = this.config.initialHeartbeatIntervalMs;
+      return timeSinceLastHeartbeat / expectedInterval;
+    }
+    const mean = this.calculateMean(state.intervalHistory);
+    const variance = this.calculateVariance(state.intervalHistory, mean);
+    const stdDev = Math.sqrt(variance);
+    if (timeSinceLastHeartbeat <= mean) {
+      return 0;
+    }
+    const deviations = stdDev > 0 ? (timeSinceLastHeartbeat - mean) / stdDev : 0;
+    const phi = Math.max(0, deviations);
+    return phi;
+  }
+  /**
+   * Calculate mean of an array of numbers.
+   */
+  calculateMean(values) {
+    if (values.length === 0) return 0;
+    return values.reduce((sum, v) => sum + v, 0) / values.length;
+  }
+  /**
+   * Calculate variance of an array of numbers.
+   */
+  calculateVariance(values, mean) {
+    if (values.length < 2) return 0;
+    return values.reduce((sum, v) => sum + Math.pow(v - mean, 2), 0) / values.length;
+  }
+  /**
+   * Get list of currently suspected nodes.
+   */
+  getSuspectedNodes() {
+    const suspected = [];
+    for (const [nodeId, state] of this.nodeStates) {
+      if (state.isSuspected) {
+        suspected.push(nodeId);
+      }
+    }
+    return suspected;
+  }
+  /**
+   * Get list of confirmed failed nodes.
+   */
+  getConfirmedFailedNodes() {
+    const failed = [];
+    for (const [nodeId, state] of this.nodeStates) {
+      if (state.isConfirmedFailed) {
+        failed.push(nodeId);
+      }
+    }
+    return failed;
+  }
+  /**
+   * Check if a specific node is suspected.
+   */
+  isSuspected(nodeId) {
+    return this.nodeStates.get(nodeId)?.isSuspected ?? false;
+  }
+  /**
+   * Check if a specific node's failure is confirmed.
+   */
+  isConfirmedFailed(nodeId) {
+    return this.nodeStates.get(nodeId)?.isConfirmedFailed ?? false;
+  }
+  /**
+   * Get the current phi value for a node.
+   */
+  getPhi(nodeId) {
+    return this.calculatePhi(nodeId);
+  }
+  /**
+   * Get all monitored nodes.
+   */
+  getMonitoredNodes() {
+    return Array.from(this.monitoringNodes);
+  }
+  /**
+   * Get metrics for monitoring.
+   */
+  getMetrics() {
+    let suspectedCount = 0;
+    let confirmedCount = 0;
+    for (const state of this.nodeStates.values()) {
+      if (state.isSuspected) suspectedCount++;
+      if (state.isConfirmedFailed) confirmedCount++;
+    }
+    return {
+      monitoredNodes: this.monitoringNodes.size,
+      suspectedNodes: suspectedCount,
+      confirmedFailedNodes: confirmedCount
+    };
+  }
+};
+
+// src/cluster/ClusterManager.ts
+var ClusterManager = class extends import_events2.EventEmitter {
   constructor(config) {
     super();
     this.members = /* @__PURE__ */ new Map();
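Reviewer note: the docblock above cites the textbook Phi Accrual formula (phi = -log10(P_later(t_now - t_last))), but the shipped calculatePhi returns a simpler statistic: how many standard deviations the current silence exceeds the mean heartbeat interval (or elapsed/expected while fewer than minSamples intervals have been observed). A minimal sketch of that arithmetic with hypothetical numbers, not taken from the package:

// Hypothetical values; only the formula mirrors calculatePhi above.
const mean = 1000;    // observed mean heartbeat interval, ms
const stdDev = 100;   // observed standard deviation, ms
const silence = 1900; // ms since the last heartbeat
const phi = Math.max(0, (silence - mean) / stdDev); // 9
console.log(phi > 8); // true: crosses the default phiThreshold of 8, so the node is suspected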
@@ -645,6 +911,30 @@ var ClusterManager = class extends import_events.EventEmitter {
     this.reconnectIntervals = /* @__PURE__ */ new Map();
     this._actualPort = 0;
     this.config = config;
+    this.failureDetector = new FailureDetector({
+      ...DEFAULT_FAILURE_DETECTOR_CONFIG,
+      heartbeatIntervalMs: config.heartbeatIntervalMs ?? 1e3,
+      ...config.failureDetection
+    });
+    this.failureDetector.on("nodeSuspected", (event) => {
+      logger.warn({ nodeId: event.nodeId, phi: event.phi }, "Node suspected (failure detector)");
+      this.emit("nodeSuspected", event.nodeId, event.phi);
+    });
+    this.failureDetector.on("nodeRecovered", (event) => {
+      logger.info({ nodeId: event.nodeId }, "Node recovered (failure detector)");
+      this.emit("nodeRecovered", event.nodeId);
+    });
+    this.failureDetector.on("nodeConfirmedFailed", (event) => {
+      logger.error({ nodeId: event.nodeId }, "Node failure confirmed");
+      this.emit("nodeConfirmedFailed", event.nodeId);
+      this.handleNodeFailure(event.nodeId);
+    });
+  }
+  /**
+   * Get the failure detector instance.
+   */
+  getFailureDetector() {
+    return this.failureDetector;
   }
   /** Get the actual port the cluster is listening on */
   get port() {
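The constructor above merges config.failureDetection over DEFAULT_FAILURE_DETECTOR_CONFIG, so detection appears to be tunable per node. A hedged configuration sketch; the field names inside failureDetection come from the defaults shown earlier, while the rest of the ClusterManager config shape is assumed from how it is read elsewhere in this diff:

// Hypothetical config sketch, not a documented API.
const clusterConfig = {
  nodeId: "node-a",          // assumed; config.nodeId is read elsewhere in the diff
  port: 7000,                // assumed; config.port appears in the stop() log
  peers: [],                 // assumed; config.peers is used by connectToPeers()
  heartbeatIntervalMs: 500,  // drives both heartbeat sending and detection
  failureDetection: {
    phiThreshold: 10,        // suspect peers more slowly than the default of 8
    confirmationTimeoutMs: 15e3
  }
};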
@@ -696,6 +986,8 @@ var ClusterManager = class extends import_events.EventEmitter {
   }
   stop() {
     logger.info({ port: this.config.port }, "Stopping Cluster Manager");
+    this.stopHeartbeat();
+    this.failureDetector.stop();
     for (const timeout of this.reconnectIntervals.values()) {
       clearTimeout(timeout);
     }
@@ -715,6 +1007,61 @@ var ClusterManager = class extends import_events.EventEmitter {
       this.server.close();
     }
   }
+  /**
+   * Start sending heartbeats to all peers.
+   */
+  startHeartbeat() {
+    if (this.heartbeatTimer) return;
+    const intervalMs = this.config.heartbeatIntervalMs ?? 1e3;
+    this.heartbeatTimer = setInterval(() => {
+      this.sendHeartbeatToAll();
+    }, intervalMs);
+    this.failureDetector.start();
+    logger.debug({ intervalMs }, "Heartbeat started");
+  }
+  /**
+   * Stop sending heartbeats.
+   */
+  stopHeartbeat() {
+    if (this.heartbeatTimer) {
+      clearInterval(this.heartbeatTimer);
+      this.heartbeatTimer = void 0;
+    }
+  }
+  /**
+   * Send heartbeat to all connected peers.
+   */
+  sendHeartbeatToAll() {
+    for (const [nodeId, member] of this.members) {
+      if (member.isSelf) continue;
+      if (member.socket && member.socket.readyState === import_ws.WebSocket.OPEN) {
+        this.send(nodeId, "HEARTBEAT", { timestamp: Date.now() });
+      }
+    }
+  }
+  /**
+   * Handle incoming heartbeat from a peer.
+   */
+  handleHeartbeat(senderId, _payload) {
+    this.failureDetector.recordHeartbeat(senderId);
+  }
+  /**
+   * Handle confirmed node failure.
+   */
+  handleNodeFailure(nodeId) {
+    const member = this.members.get(nodeId);
+    if (!member) return;
+    logger.warn({ nodeId }, "Removing failed node from cluster");
+    if (member.socket && member.socket.readyState !== import_ws.WebSocket.CLOSED) {
+      try {
+        member.socket.terminate();
+      } catch (e) {
+      }
+    }
+    this.members.delete(nodeId);
+    this.failureDetector.stopMonitoring(nodeId);
+    this.emit("memberLeft", nodeId);
+  }
   connectToPeers() {
     for (const peer of this.config.peers) {
       this.connectToPeer(peer);
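These heartbeat handlers re-emit the detector's events at the cluster level. A sketch of how an application might listen for them; the event names and argument order are taken from the emit calls shown above, while the `cluster` variable (a ClusterManager instance) is assumed:

// Assumes `cluster` is an already-constructed ClusterManager.
cluster.on("nodeSuspected", (nodeId, phi) => {
  console.warn(`peer ${nodeId} suspected, phi=${phi}`);
});
cluster.on("nodeRecovered", (nodeId) => {
  console.info(`peer ${nodeId} recovered`);
});
cluster.on("nodeConfirmedFailed", (nodeId) => {
  // by this point the manager has terminated the socket and emitted "memberLeft"
  console.error(`peer ${nodeId} confirmed failed`);
});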
@@ -833,7 +1180,13 @@ var ClusterManager = class extends import_events.EventEmitter {
           socket: ws,
           isSelf: false
         });
+        this.failureDetector.startMonitoring(remoteNodeId);
+        this.startHeartbeat();
         this.emit("memberJoined", remoteNodeId);
+      } else if (msg.type === "HEARTBEAT") {
+        if (remoteNodeId) {
+          this.handleHeartbeat(remoteNodeId, msg.payload);
+        }
       } else {
         this.emit("message", msg);
       }
@@ -847,6 +1200,7 @@ var ClusterManager = class extends import_events.EventEmitter {
       if (current && current.socket === ws) {
         logger.info({ nodeId: remoteNodeId }, "Peer disconnected");
         this.members.delete(remoteNodeId);
+        this.failureDetector.stopMonitoring(remoteNodeId);
         this.emit("memberLeft", remoteNodeId);
         if (initiated && peerAddress) {
           this.scheduleReconnect(peerAddress, 0);
@@ -900,21 +1254,639 @@ var ClusterManager = class extends import_events.EventEmitter {
|
|
|
900
1254
|
};
|
|
901
1255
|
|
|
902
1256
|
// src/cluster/PartitionService.ts
|
|
1257
|
+
var import_events4 = require("events");
|
|
1258
|
+
|
|
1259
|
+
// src/cluster/MigrationManager.ts
|
|
1260
|
+
var import_events3 = require("events");
|
|
903
1261
|
var import_core3 = require("@topgunbuild/core");
|
|
904
|
-
var
|
|
905
|
-
|
|
906
|
-
constructor(
|
|
1262
|
+
var import_native = require("@topgunbuild/native");
|
|
1263
|
+
var MigrationManager = class extends import_events3.EventEmitter {
|
|
1264
|
+
constructor(clusterManager, partitionService, config = {}) {
|
|
1265
|
+
super();
|
|
1266
|
+
// Active outgoing migrations (this node is source)
|
|
1267
|
+
this.activeMigrations = /* @__PURE__ */ new Map();
|
|
1268
|
+
// Queue of migrations to process
|
|
1269
|
+
this.migrationQueue = [];
|
|
1270
|
+
// Incoming migrations (this node is target)
|
|
1271
|
+
this.incomingMigrations = /* @__PURE__ */ new Map();
|
|
1272
|
+
// Pending chunk acknowledgments
|
|
1273
|
+
this.pendingChunkAcks = /* @__PURE__ */ new Map();
|
|
1274
|
+
// Pending verification results
|
|
1275
|
+
this.pendingVerifications = /* @__PURE__ */ new Map();
|
|
1276
|
+
// Metrics tracking
|
|
1277
|
+
this.metrics = {
|
|
1278
|
+
migrationsStarted: 0,
|
|
1279
|
+
migrationsCompleted: 0,
|
|
1280
|
+
migrationsFailed: 0,
|
|
1281
|
+
chunksTransferred: 0,
|
|
1282
|
+
bytesTransferred: 0,
|
|
1283
|
+
activeMigrations: 0,
|
|
1284
|
+
queuedMigrations: 0
|
|
1285
|
+
};
|
|
1286
|
+
// Batch processing timer
|
|
1287
|
+
this.batchTimer = null;
|
|
1288
|
+
// Data collection callback (injected from ServerCoordinator)
|
|
1289
|
+
this.dataCollector = null;
|
|
1290
|
+
// Data storage callback (injected from ServerCoordinator)
|
|
1291
|
+
this.dataStorer = null;
|
|
1292
|
+
this.clusterManager = clusterManager;
|
|
1293
|
+
this.partitionService = partitionService;
|
|
1294
|
+
this.config = {
|
|
1295
|
+
...import_core3.DEFAULT_MIGRATION_CONFIG,
|
|
1296
|
+
...config
|
|
1297
|
+
};
|
|
1298
|
+
this.setupMessageHandlers();
|
|
1299
|
+
}
|
|
1300
|
+
// ============================================
|
|
1301
|
+
// Configuration
|
|
1302
|
+
// ============================================
|
|
1303
|
+
/**
|
|
1304
|
+
* Set the data collector callback
|
|
1305
|
+
* Called to collect all records for a partition before migration
|
|
1306
|
+
*/
|
|
1307
|
+
setDataCollector(collector) {
|
|
1308
|
+
this.dataCollector = collector;
|
|
1309
|
+
}
|
|
1310
|
+
/**
|
|
1311
|
+
* Set the data storer callback
|
|
1312
|
+
* Called to store received records after successful migration
|
|
1313
|
+
*/
|
|
1314
|
+
setDataStorer(storer) {
|
|
1315
|
+
this.dataStorer = storer;
|
|
1316
|
+
}
|
|
1317
|
+
// ============================================
|
|
1318
|
+
// Migration Planning
|
|
1319
|
+
// ============================================
|
|
1320
|
+
/**
|
|
1321
|
+
* Plan migration for topology change
|
|
1322
|
+
*/
|
|
1323
|
+
planMigration(oldDistribution, newDistribution) {
|
|
1324
|
+
const migrations = [];
|
|
1325
|
+
for (const [partitionId, newDist] of newDistribution) {
|
|
1326
|
+
const oldDist = oldDistribution.get(partitionId);
|
|
1327
|
+
const oldOwner = oldDist?.owner ?? this.clusterManager.config.nodeId;
|
|
1328
|
+
const newOwner = newDist.owner;
|
|
1329
|
+
if (oldOwner !== newOwner && oldOwner === this.clusterManager.config.nodeId) {
|
|
1330
|
+
migrations.push({
|
|
1331
|
+
partitionId,
|
|
1332
|
+
state: import_core3.PartitionState.STABLE,
|
|
1333
|
+
sourceNode: oldOwner,
|
|
1334
|
+
targetNode: newOwner,
|
|
1335
|
+
startTime: 0,
|
|
1336
|
+
bytesTransferred: 0,
|
|
1337
|
+
totalBytes: 0,
|
|
1338
|
+
retryCount: 0
|
|
1339
|
+
});
|
|
1340
|
+
}
|
|
1341
|
+
}
|
|
1342
|
+
migrations.sort((a, b) => a.partitionId - b.partitionId);
|
|
1343
|
+
this.migrationQueue = migrations;
|
|
1344
|
+
this.metrics.queuedMigrations = migrations.length;
|
|
1345
|
+
logger.info({ total: migrations.length }, "Migration planned");
|
|
1346
|
+
this.emit("migrationPlanned", { total: migrations.length });
|
|
1347
|
+
if (migrations.length > 0) {
|
|
1348
|
+
this.startBatchProcessing();
|
|
1349
|
+
}
|
|
1350
|
+
}
|
|
1351
|
+
/**
|
|
1352
|
+
* Start batch processing timer
|
|
1353
|
+
*/
|
|
1354
|
+
startBatchProcessing() {
|
|
1355
|
+
if (this.batchTimer) return;
|
|
1356
|
+
this.startNextBatch().catch((err) => {
|
|
1357
|
+
logger.error({ error: err }, "Failed to start first migration batch");
|
|
1358
|
+
this.emit("error", err);
|
|
1359
|
+
});
|
|
1360
|
+
this.batchTimer = setInterval(() => {
|
|
1361
|
+
this.startNextBatch().catch((err) => {
|
|
1362
|
+
logger.error({ error: err }, "Failed to start migration batch");
|
|
1363
|
+
this.emit("error", err);
|
|
1364
|
+
});
|
|
1365
|
+
}, this.config.batchIntervalMs);
|
|
1366
|
+
}
|
|
1367
|
+
/**
|
|
1368
|
+
* Stop batch processing
|
|
1369
|
+
*/
|
|
1370
|
+
stopBatchProcessing() {
|
|
1371
|
+
if (this.batchTimer) {
|
|
1372
|
+
clearInterval(this.batchTimer);
|
|
1373
|
+
this.batchTimer = null;
|
|
1374
|
+
}
|
|
1375
|
+
}
|
|
1376
|
+
/**
|
|
1377
|
+
* Start next batch of migrations
|
|
1378
|
+
*/
|
|
1379
|
+
async startNextBatch() {
|
|
1380
|
+
if (this.activeMigrations.size >= this.config.parallelTransfers) {
|
|
1381
|
+
return;
|
|
1382
|
+
}
|
|
1383
|
+
const slotsAvailable = this.config.parallelTransfers - this.activeMigrations.size;
|
|
1384
|
+
const batch = this.migrationQueue.splice(0, Math.min(slotsAvailable, this.config.batchSize));
|
|
1385
|
+
if (batch.length === 0) {
|
|
1386
|
+
if (this.migrationQueue.length === 0 && this.activeMigrations.size === 0) {
|
|
1387
|
+
this.stopBatchProcessing();
|
|
1388
|
+
}
|
|
1389
|
+
return;
|
|
1390
|
+
}
|
|
1391
|
+
for (const migration of batch) {
|
|
1392
|
+
migration.state = import_core3.PartitionState.MIGRATING;
|
|
1393
|
+
migration.startTime = Date.now();
|
|
1394
|
+
this.activeMigrations.set(migration.partitionId, migration);
|
|
1395
|
+
this.metrics.migrationsStarted++;
|
|
1396
|
+
this.metrics.activeMigrations = this.activeMigrations.size;
|
|
1397
|
+
this.metrics.queuedMigrations = this.migrationQueue.length;
|
|
1398
|
+
this.startPartitionMigration(migration).catch((error) => {
|
|
1399
|
+
this.onMigrationFailed(migration.partitionId, error);
|
|
1400
|
+
});
|
|
1401
|
+
}
|
|
1402
|
+
logger.info({ count: batch.length, remaining: this.migrationQueue.length }, "Batch started");
|
|
1403
|
+
this.emit("batchStarted", { count: batch.length, remaining: this.migrationQueue.length });
|
|
1404
|
+
}
|
|
1405
|
+
// ============================================
|
|
1406
|
+
// Migration Execution
|
|
1407
|
+
// ============================================
|
|
1408
|
+
/**
|
|
1409
|
+
* Start migration for a single partition
|
|
1410
|
+
*/
|
|
1411
|
+
async startPartitionMigration(migration) {
|
|
1412
|
+
const { partitionId, targetNode } = migration;
|
|
1413
|
+
logger.info({ partitionId, targetNode }, "Starting partition migration");
|
|
1414
|
+
let records;
|
|
1415
|
+
if (this.dataCollector) {
|
|
1416
|
+
records = await this.dataCollector(partitionId);
|
|
1417
|
+
} else {
|
|
1418
|
+
records = [];
|
|
1419
|
+
}
|
|
1420
|
+
migration.totalBytes = records.reduce((sum, r) => sum + r.length, 0);
|
|
1421
|
+
this.clusterManager.send(targetNode, "OP_FORWARD", {
|
|
1422
|
+
_migration: {
|
|
1423
|
+
type: "MIGRATION_START",
|
|
1424
|
+
payload: {
|
|
1425
|
+
partitionId,
|
|
1426
|
+
sourceNode: this.clusterManager.config.nodeId,
|
|
1427
|
+
estimatedSize: migration.totalBytes
|
|
1428
|
+
}
|
|
1429
|
+
}
|
|
1430
|
+
});
|
|
1431
|
+
const chunks = this.chunkify(records);
|
|
1432
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
1433
|
+
const chunk = chunks[i];
|
|
1434
|
+
const checksum = this.calculateChecksum(chunk);
|
|
1435
|
+
this.clusterManager.send(targetNode, "OP_FORWARD", {
|
|
1436
|
+
_migration: {
|
|
1437
|
+
type: "MIGRATION_CHUNK",
|
|
1438
|
+
payload: {
|
|
1439
|
+
partitionId,
|
|
1440
|
+
chunkIndex: i,
|
|
1441
|
+
totalChunks: chunks.length,
|
|
1442
|
+
data: Array.from(chunk),
|
|
1443
|
+
// Convert Uint8Array to array for JSON serialization
|
|
1444
|
+
checksum
|
|
1445
|
+
}
|
|
1446
|
+
}
|
|
1447
|
+
});
|
|
1448
|
+
await this.waitForChunkAck(partitionId, i);
|
|
1449
|
+
migration.bytesTransferred += chunk.length;
|
|
1450
|
+
this.metrics.chunksTransferred++;
|
|
1451
|
+
this.metrics.bytesTransferred += chunk.length;
|
|
1452
|
+
this.emit("migrationProgress", migration);
|
|
1453
|
+
}
|
|
1454
|
+
const fullChecksum = this.calculatePartitionChecksum(records);
|
|
1455
|
+
migration.state = import_core3.PartitionState.SYNC;
|
|
1456
|
+
this.clusterManager.send(targetNode, "OP_FORWARD", {
|
|
1457
|
+
_migration: {
|
|
1458
|
+
type: "MIGRATION_COMPLETE",
|
|
1459
|
+
payload: {
|
|
1460
|
+
partitionId,
|
|
1461
|
+
totalRecords: records.length,
|
|
1462
|
+
checksum: fullChecksum
|
|
1463
|
+
}
|
|
1464
|
+
}
|
|
1465
|
+
});
|
|
1466
|
+
const verified = await this.waitForVerification(partitionId);
|
|
1467
|
+
if (verified) {
|
|
1468
|
+
await this.onMigrationComplete(partitionId);
|
|
1469
|
+
} else {
|
|
1470
|
+
throw new Error(`Migration verification failed for partition ${partitionId}`);
|
|
1471
|
+
}
|
|
1472
|
+
}
|
|
1473
|
+
/**
|
|
1474
|
+
* Split records into chunks
|
|
1475
|
+
*/
|
|
1476
|
+
chunkify(records) {
|
|
1477
|
+
const chunks = [];
|
|
1478
|
+
let currentChunk = [];
|
|
1479
|
+
let currentSize = 0;
|
|
1480
|
+
for (const record of records) {
|
|
1481
|
+
const lengthPrefix = new Uint8Array(4);
|
|
1482
|
+
new DataView(lengthPrefix.buffer).setUint32(0, record.length, true);
|
|
1483
|
+
currentChunk.push(...lengthPrefix, ...record);
|
|
1484
|
+
currentSize += 4 + record.length;
|
|
1485
|
+
if (currentSize >= this.config.transferChunkSize) {
|
|
1486
|
+
chunks.push(new Uint8Array(currentChunk));
|
|
1487
|
+
currentChunk = [];
|
|
1488
|
+
currentSize = 0;
|
|
1489
|
+
}
|
|
1490
|
+
}
|
|
1491
|
+
if (currentChunk.length > 0) {
|
|
1492
|
+
chunks.push(new Uint8Array(currentChunk));
|
|
1493
|
+
}
|
|
1494
|
+
if (chunks.length === 0) {
|
|
1495
|
+
chunks.push(new Uint8Array(0));
|
|
1496
|
+
}
|
|
1497
|
+
return chunks;
|
|
1498
|
+
}
|
|
1499
|
+
/**
|
|
1500
|
+
* Calculate checksum for a chunk using native xxhash
|
|
1501
|
+
*/
|
|
1502
|
+
calculateChecksum(data) {
|
|
1503
|
+
return String((0, import_native.xxhash64AsNumber)(data));
|
|
1504
|
+
}
|
|
1505
|
+
/**
|
|
1506
|
+
* Calculate checksum for all partition records using streaming xxhash
|
|
1507
|
+
*/
|
|
1508
|
+
calculatePartitionChecksum(records) {
|
|
1509
|
+
const state = (0, import_native.createXxHash64State)();
|
|
1510
|
+
for (const record of records) {
|
|
1511
|
+
state.update(record);
|
|
1512
|
+
}
|
|
1513
|
+
return String(state.digestAsNumber());
|
|
1514
|
+
}
|
|
1515
|
+
/**
|
|
1516
|
+
* Wait for chunk acknowledgment
|
|
1517
|
+
*/
|
|
1518
|
+
waitForChunkAck(partitionId, chunkIndex) {
|
|
1519
|
+
return new Promise((resolve, reject) => {
|
|
1520
|
+
const key = `${partitionId}:${chunkIndex}`;
|
|
1521
|
+
const timeout = setTimeout(() => {
|
|
1522
|
+
this.pendingChunkAcks.delete(key);
|
|
1523
|
+
reject(new Error(`Chunk ack timeout for partition ${partitionId}, chunk ${chunkIndex}`));
|
|
1524
|
+
}, this.config.syncTimeoutMs);
|
|
1525
|
+
this.pendingChunkAcks.set(key, { resolve, reject, timeout });
|
|
1526
|
+
});
|
|
1527
|
+
}
|
|
1528
|
+
/**
|
|
1529
|
+
* Wait for migration verification
|
|
1530
|
+
*/
|
|
1531
|
+
waitForVerification(partitionId) {
|
|
1532
|
+
return new Promise((resolve) => {
|
|
1533
|
+
const timeout = setTimeout(() => {
|
|
1534
|
+
this.pendingVerifications.delete(partitionId);
|
|
1535
|
+
resolve(false);
|
|
1536
|
+
}, this.config.syncTimeoutMs);
|
|
1537
|
+
this.pendingVerifications.set(partitionId, { resolve, timeout });
|
|
1538
|
+
});
|
|
1539
|
+
}
|
|
1540
|
+
// ============================================
|
|
1541
|
+
// Migration Completion
|
|
1542
|
+
// ============================================
|
|
1543
|
+
/**
|
|
1544
|
+
* Handle successful migration completion
|
|
1545
|
+
*/
|
|
1546
|
+
async onMigrationComplete(partitionId) {
|
|
1547
|
+
const migration = this.activeMigrations.get(partitionId);
|
|
1548
|
+
if (!migration) return;
|
|
1549
|
+
migration.state = import_core3.PartitionState.STABLE;
|
|
1550
|
+
this.activeMigrations.delete(partitionId);
|
|
1551
|
+
this.metrics.migrationsCompleted++;
|
|
1552
|
+
this.metrics.activeMigrations = this.activeMigrations.size;
|
|
1553
|
+
logger.info({
|
|
1554
|
+
partitionId,
|
|
1555
|
+
duration: Date.now() - migration.startTime,
|
|
1556
|
+
bytesTransferred: migration.bytesTransferred
|
|
1557
|
+
}, "Migration completed");
|
|
1558
|
+
this.emit("migrationComplete", partitionId);
|
|
1559
|
+
}
|
|
1560
|
+
/**
|
|
1561
|
+
* Handle migration failure
|
|
1562
|
+
*/
|
|
1563
|
+
async onMigrationFailed(partitionId, error) {
|
|
1564
|
+
const migration = this.activeMigrations.get(partitionId);
|
|
1565
|
+
if (!migration) return;
|
|
1566
|
+
migration.retryCount++;
|
|
1567
|
+
if (migration.retryCount <= this.config.maxRetries) {
|
|
1568
|
+
migration.state = import_core3.PartitionState.STABLE;
|
|
1569
|
+
migration.bytesTransferred = 0;
|
|
1570
|
+
this.activeMigrations.delete(partitionId);
|
|
1571
|
+
this.migrationQueue.unshift(migration);
|
|
1572
|
+
this.metrics.queuedMigrations = this.migrationQueue.length;
|
|
1573
|
+
this.metrics.activeMigrations = this.activeMigrations.size;
|
|
1574
|
+
logger.warn({
|
|
1575
|
+
partitionId,
|
|
1576
|
+
retryCount: migration.retryCount,
|
|
1577
|
+
error: error.message
|
|
1578
|
+
}, "Migration failed, will retry");
|
|
1579
|
+
} else {
|
|
1580
|
+
migration.state = import_core3.PartitionState.FAILED;
|
|
1581
|
+
this.activeMigrations.delete(partitionId);
|
|
1582
|
+
this.metrics.migrationsFailed++;
|
|
1583
|
+
this.metrics.activeMigrations = this.activeMigrations.size;
|
|
1584
|
+
logger.error({
|
|
1585
|
+
partitionId,
|
|
1586
|
+
retryCount: migration.retryCount,
|
|
1587
|
+
error: error.message
|
|
1588
|
+
}, "Migration failed permanently");
|
|
1589
|
+
this.emit("migrationFailed", partitionId, error);
|
|
1590
|
+
}
|
|
1591
|
+
}
|
|
1592
|
+
// ============================================
|
|
1593
|
+
// Incoming Migration Handlers (Target Node)
|
|
1594
|
+
// ============================================
|
|
1595
|
+
/**
|
|
1596
|
+
* Handle MIGRATION_START message
|
|
1597
|
+
*/
|
|
1598
|
+
handleMigrationStart(payload) {
|
|
1599
|
+
const { partitionId, sourceNode, estimatedSize } = payload;
|
|
1600
|
+
logger.info({ partitionId, sourceNode, estimatedSize }, "Receiving migration");
|
|
1601
|
+
this.incomingMigrations.set(partitionId, {
|
|
1602
|
+
sourceNode,
|
|
1603
|
+
chunks: [],
|
|
1604
|
+
expectedSize: estimatedSize,
|
|
1605
|
+
receivedSize: 0,
|
|
1606
|
+
startTime: Date.now()
|
|
1607
|
+
});
|
|
1608
|
+
}
|
|
1609
|
+
/**
|
|
1610
|
+
* Handle MIGRATION_CHUNK message
|
|
1611
|
+
*/
|
|
1612
|
+
handleMigrationChunk(payload) {
|
|
1613
|
+
const { partitionId, chunkIndex, data, checksum } = payload;
|
|
1614
|
+
const incoming = this.incomingMigrations.get(partitionId);
|
|
1615
|
+
if (!incoming) {
|
|
1616
|
+
logger.warn({ partitionId, chunkIndex }, "Received chunk for unknown migration");
|
|
1617
|
+
return;
|
|
1618
|
+
}
|
|
1619
|
+
const chunkData = new Uint8Array(data);
|
|
1620
|
+
const actualChecksum = this.calculateChecksum(chunkData);
|
|
1621
|
+
const success = actualChecksum === checksum;
|
|
1622
|
+
if (success) {
|
|
1623
|
+
incoming.chunks[chunkIndex] = chunkData;
|
|
1624
|
+
incoming.receivedSize += chunkData.length;
|
|
1625
|
+
} else {
|
|
1626
|
+
logger.warn({ partitionId, chunkIndex, expected: checksum, actual: actualChecksum }, "Chunk checksum mismatch");
|
|
1627
|
+
}
|
|
1628
|
+
this.clusterManager.send(incoming.sourceNode, "OP_FORWARD", {
|
|
1629
|
+
_migration: {
|
|
1630
|
+
type: "MIGRATION_CHUNK_ACK",
|
|
1631
|
+
payload: {
|
|
1632
|
+
partitionId,
|
|
1633
|
+
chunkIndex,
|
|
1634
|
+
success
|
|
1635
|
+
}
|
|
1636
|
+
}
|
|
1637
|
+
});
|
|
1638
|
+
}
|
|
1639
|
+
/**
|
|
1640
|
+
* Handle MIGRATION_COMPLETE message
|
|
1641
|
+
*/
|
|
1642
|
+
async handleMigrationComplete(payload) {
|
|
1643
|
+
const { partitionId, totalRecords, checksum } = payload;
|
|
1644
|
+
const incoming = this.incomingMigrations.get(partitionId);
|
|
1645
|
+
if (!incoming) {
|
|
1646
|
+
logger.warn({ partitionId }, "Received complete for unknown migration");
|
|
1647
|
+
return;
|
|
1648
|
+
}
|
|
1649
|
+
const allData = this.reassemble(incoming.chunks);
|
|
1650
|
+
const records = this.deserializeRecords(allData);
|
|
1651
|
+
const actualChecksum = this.calculatePartitionChecksum(records);
|
|
1652
|
+
const checksumMatch = actualChecksum === checksum;
|
|
1653
|
+
const success = checksumMatch && records.length === totalRecords;
|
|
1654
|
+
if (success && this.dataStorer) {
|
|
1655
|
+
await this.dataStorer(partitionId, records);
|
|
1656
|
+
}
|
|
1657
|
+
logger.info({
|
|
1658
|
+
partitionId,
|
|
1659
|
+
duration: Date.now() - incoming.startTime,
|
|
1660
|
+
records: records.length,
|
|
1661
|
+
checksumMatch
|
|
1662
|
+
}, "Migration received");
|
|
1663
|
+
this.clusterManager.send(incoming.sourceNode, "OP_FORWARD", {
|
|
1664
|
+
_migration: {
|
|
1665
|
+
type: "MIGRATION_VERIFY",
|
|
1666
|
+
payload: {
|
|
1667
|
+
partitionId,
|
|
1668
|
+
success,
|
|
1669
|
+
checksumMatch
|
|
1670
|
+
}
|
|
1671
|
+
}
|
|
1672
|
+
});
|
|
1673
|
+
this.incomingMigrations.delete(partitionId);
|
|
1674
|
+
}
|
|
1675
|
+
/**
|
|
1676
|
+
* Handle MIGRATION_CHUNK_ACK message
|
|
1677
|
+
*/
|
|
1678
|
+
handleMigrationChunkAck(payload) {
|
|
1679
|
+
const { partitionId, chunkIndex, success } = payload;
|
|
1680
|
+
const key = `${partitionId}:${chunkIndex}`;
|
|
1681
|
+
const pending = this.pendingChunkAcks.get(key);
|
|
1682
|
+
if (pending) {
|
|
1683
|
+
clearTimeout(pending.timeout);
|
|
1684
|
+
this.pendingChunkAcks.delete(key);
|
|
1685
|
+
if (success) {
|
|
1686
|
+
pending.resolve();
|
|
1687
|
+
} else {
|
|
1688
|
+
pending.reject(new Error(`Chunk ${chunkIndex} rejected by target`));
|
|
1689
|
+
}
|
|
1690
|
+
}
|
|
1691
|
+
}
|
|
1692
|
+
/**
|
|
1693
|
+
* Handle MIGRATION_VERIFY message
|
|
1694
|
+
*/
|
|
1695
|
+
handleMigrationVerify(payload) {
|
|
1696
|
+
const { partitionId, success } = payload;
|
|
1697
|
+
const pending = this.pendingVerifications.get(partitionId);
|
|
1698
|
+
if (pending) {
|
|
1699
|
+
clearTimeout(pending.timeout);
|
|
1700
|
+
this.pendingVerifications.delete(partitionId);
|
|
1701
|
+
pending.resolve(success);
|
|
1702
|
+
}
|
|
1703
|
+
}
|
|
1704
|
+
/**
|
|
1705
|
+
* Reassemble chunks into continuous data
|
|
1706
|
+
*/
|
|
1707
|
+
reassemble(chunks) {
|
|
1708
|
+
const totalLength = chunks.reduce((sum, c) => sum + (c?.length ?? 0), 0);
|
|
1709
|
+
const result = new Uint8Array(totalLength);
|
|
1710
|
+
let offset = 0;
|
|
1711
|
+
for (const chunk of chunks) {
|
|
1712
|
+
if (chunk) {
|
|
1713
|
+
result.set(chunk, offset);
|
|
1714
|
+
offset += chunk.length;
|
|
1715
|
+
}
|
|
1716
|
+
}
|
|
1717
|
+
return result;
|
|
1718
|
+
}
|
|
1719
|
+
/**
|
|
1720
|
+
* Deserialize records from chunk data
|
|
1721
|
+
*/
|
|
1722
|
+
deserializeRecords(data) {
|
|
1723
|
+
const records = [];
|
|
1724
|
+
let offset = 0;
|
|
1725
|
+
while (offset < data.length) {
|
|
1726
|
+
if (offset + 4 > data.length) break;
|
|
1727
|
+
const length = new DataView(data.buffer, data.byteOffset + offset, 4).getUint32(0, true);
|
|
1728
|
+
offset += 4;
|
|
1729
|
+
if (offset + length > data.length) break;
|
|
1730
|
+
records.push(data.slice(offset, offset + length));
|
|
1731
|
+
offset += length;
|
|
1732
|
+
}
|
|
1733
|
+
return records;
|
|
1734
|
+
}
|
|
1735
|
+
// ============================================
|
|
1736
|
+
// Message Handling
|
|
1737
|
+
// ============================================
|
|
1738
|
+
/**
|
|
1739
|
+
* Setup cluster message handlers
|
|
1740
|
+
*/
|
|
1741
|
+
setupMessageHandlers() {
|
|
1742
|
+
this.clusterManager.on("message", (msg) => {
|
|
1743
|
+
if (msg.payload?._migration) {
|
|
1744
|
+
const migration = msg.payload._migration;
|
|
1745
|
+
switch (migration.type) {
|
|
1746
|
+
case "MIGRATION_START":
|
|
1747
|
+
this.handleMigrationStart(migration.payload);
|
|
1748
|
+
break;
|
|
1749
|
+
case "MIGRATION_CHUNK":
|
|
1750
|
+
this.handleMigrationChunk(migration.payload);
|
|
1751
|
+
break;
|
|
1752
|
+
case "MIGRATION_COMPLETE":
|
|
1753
|
+
this.handleMigrationComplete(migration.payload).catch((err) => {
|
|
1754
|
+
logger.error({ error: err }, "Error handling migration complete");
|
|
1755
|
+
});
|
|
1756
|
+
break;
|
|
1757
|
+
case "MIGRATION_CHUNK_ACK":
|
|
1758
|
+
this.handleMigrationChunkAck(migration.payload);
|
|
1759
|
+
break;
|
|
1760
|
+
case "MIGRATION_VERIFY":
|
|
1761
|
+
this.handleMigrationVerify(migration.payload);
|
|
1762
|
+
break;
|
|
1763
|
+
}
|
|
1764
|
+
}
|
|
1765
|
+
});
|
|
1766
|
+
}
|
|
1767
|
+
// ============================================
|
|
1768
|
+
// Status and Metrics
|
|
1769
|
+
// ============================================
|
|
1770
|
+
/**
|
|
1771
|
+
* Check if a partition is currently migrating
|
|
1772
|
+
*/
|
|
1773
|
+
isActive(partitionId) {
|
|
1774
|
+
return this.activeMigrations.has(partitionId) || this.incomingMigrations.has(partitionId);
|
|
1775
|
+
}
|
|
1776
|
+
/**
|
|
1777
|
+
* Get migration status
|
|
1778
|
+
*/
|
|
1779
|
+
getStatus() {
|
|
1780
|
+
const avgMigrationTime = this.metrics.migrationsCompleted > 0 ? Date.now() - (this.activeMigrations.values().next().value?.startTime ?? Date.now()) : 0;
|
|
1781
|
+
const estimatedTimeRemainingMs = (this.migrationQueue.length + this.activeMigrations.size) * (avgMigrationTime || 1e3);
|
|
1782
|
+
return {
|
|
1783
|
+
inProgress: this.activeMigrations.size > 0 || this.migrationQueue.length > 0,
|
|
1784
|
+
active: Array.from(this.activeMigrations.values()),
|
|
1785
|
+
queued: this.migrationQueue.length,
|
|
1786
|
+
completed: this.metrics.migrationsCompleted,
|
|
1787
|
+
failed: this.metrics.migrationsFailed,
|
|
1788
|
+
estimatedTimeRemainingMs
|
|
1789
|
+
};
|
|
1790
|
+
}
|
|
1791
|
+
/**
|
|
1792
|
+
* Get migration metrics
|
|
1793
|
+
*/
|
|
1794
|
+
getMetrics() {
|
|
1795
|
+
return { ...this.metrics };
|
|
1796
|
+
}
|
|
1797
|
+
/**
|
|
1798
|
+
* Cancel all active and queued migrations
|
|
1799
|
+
*/
|
|
1800
|
+
async cancelAll() {
|
|
1801
|
+
this.stopBatchProcessing();
|
|
1802
|
+
this.migrationQueue = [];
|
|
1803
|
+
this.metrics.queuedMigrations = 0;
|
|
1804
|
+
for (const [partitionId, migration] of this.activeMigrations) {
|
|
1805
|
+
migration.state = import_core3.PartitionState.FAILED;
|
|
1806
|
+
this.metrics.migrationsFailed++;
|
|
1807
|
+
this.emit("migrationFailed", partitionId, new Error("Migration cancelled"));
|
|
1808
|
+
}
|
|
1809
|
+
this.activeMigrations.clear();
|
|
1810
|
+
this.metrics.activeMigrations = 0;
|
|
1811
|
+
for (const pending of this.pendingChunkAcks.values()) {
|
|
1812
|
+
clearTimeout(pending.timeout);
|
|
1813
|
+
pending.reject(new Error("Migration cancelled"));
|
|
1814
|
+
}
|
|
1815
|
+
this.pendingChunkAcks.clear();
|
|
1816
|
+
for (const pending of this.pendingVerifications.values()) {
|
|
1817
|
+
clearTimeout(pending.timeout);
|
|
1818
|
+
pending.resolve(false);
|
|
1819
|
+
}
|
|
1820
|
+
this.pendingVerifications.clear();
|
|
1821
|
+
this.incomingMigrations.clear();
|
|
1822
|
+
logger.info("All migrations cancelled");
|
|
1823
|
+
}
|
|
1824
|
+
/**
|
|
1825
|
+
* Cleanup resources (sync version for backwards compatibility)
|
|
1826
|
+
*/
|
|
1827
|
+
close() {
|
|
1828
|
+
this.cancelAll();
|
|
1829
|
+
}
|
|
1830
|
+
/**
|
|
1831
|
+
* Async cleanup - waits for cancellation to complete
|
|
1832
|
+
*/
|
|
1833
|
+
async closeAsync() {
|
|
1834
|
+
await this.cancelAll();
|
|
1835
|
+
this.removeAllListeners();
|
|
1836
|
+
}
|
|
1837
|
+
};
|
|
1838
|
+
|
|
1839
|
+
// src/cluster/PartitionService.ts
|
|
1840
|
+
var import_core4 = require("@topgunbuild/core");
|
|
1841
|
+
var DEFAULT_PARTITION_SERVICE_CONFIG = {
|
|
1842
|
+
gradualRebalancing: false,
|
|
1843
|
+
migration: import_core4.DEFAULT_MIGRATION_CONFIG
|
|
1844
|
+
};
|
|
1845
|
+
var PartitionService = class extends import_events4.EventEmitter {
|
|
1846
|
+
constructor(cluster, config = {}) {
|
|
1847
|
+
super();
|
|
907
1848
|
// partitionId -> { owner, backups }
|
|
908
1849
|
this.partitions = /* @__PURE__ */ new Map();
|
|
909
|
-
this.PARTITION_COUNT =
|
|
910
|
-
this.BACKUP_COUNT =
|
|
1850
|
+
this.PARTITION_COUNT = import_core4.PARTITION_COUNT;
|
|
1851
|
+
this.BACKUP_COUNT = import_core4.DEFAULT_BACKUP_COUNT;
|
|
1852
|
+
// Phase 4: Version tracking for partition map
|
|
1853
|
+
this.mapVersion = 0;
|
|
1854
|
+
this.lastRebalanceTime = 0;
|
|
1855
|
+
this.migrationManager = null;
|
|
911
1856
|
this.cluster = cluster;
|
|
912
|
-
this.
|
|
913
|
-
|
|
914
|
-
|
|
1857
|
+
this.config = {
|
|
1858
|
+
...DEFAULT_PARTITION_SERVICE_CONFIG,
|
|
1859
|
+
...config
|
|
1860
|
+
};
|
|
1861
|
+
if (this.config.gradualRebalancing) {
|
|
1862
|
+
this.migrationManager = new MigrationManager(
|
|
1863
|
+
cluster,
|
|
1864
|
+
this,
|
|
1865
|
+
this.config.migration
|
|
1866
|
+
);
|
|
1867
|
+
this.migrationManager.on("migrationComplete", (partitionId) => {
|
|
1868
|
+
logger.info({ partitionId }, "Migration completed, updating ownership");
|
|
1869
|
+
});
|
|
1870
|
+
this.migrationManager.on("migrationFailed", (partitionId, error) => {
|
|
1871
|
+
logger.error({ partitionId, error: error.message }, "Migration failed");
|
|
1872
|
+
});
|
|
1873
|
+
}
|
|
1874
|
+
this.cluster.on("memberJoined", (nodeId) => this.onMembershipChange("JOIN", nodeId));
|
|
1875
|
+
this.cluster.on("memberLeft", (nodeId) => this.onMembershipChange("LEAVE", nodeId));
|
|
1876
|
+
this.rebalance("REBALANCE");
|
|
1877
|
+
}
|
|
1878
|
+
/**
|
|
1879
|
+
* Handle membership change
|
|
1880
|
+
*/
|
|
1881
|
+
onMembershipChange(reason, nodeId) {
|
|
1882
|
+
if (this.config.gradualRebalancing && this.migrationManager) {
|
|
1883
|
+
this.rebalanceGradual(reason, nodeId);
|
|
1884
|
+
} else {
|
|
1885
|
+
this.rebalance(reason, nodeId);
|
|
1886
|
+
}
|
|
915
1887
|
}
|
|
916
1888
|
getPartitionId(key) {
|
|
917
|
-
return Math.abs((0,
|
|
1889
|
+
return Math.abs((0, import_core4.hashString)(key)) % this.PARTITION_COUNT;
|
|
918
1890
|
}
|
|
919
1891
|
getDistribution(key) {
|
|
920
1892
|
const pId = this.getPartitionId(key);
|
|
@@ -936,12 +1908,78 @@ var PartitionService = class {
|
|
|
936
1908
|
isRelated(key) {
|
|
937
1909
|
return this.isLocalOwner(key) || this.isLocalBackup(key);
|
|
938
1910
|
}
|
|
939
|
-
|
|
1911
|
+
// ============================================
|
|
1912
|
+
// Phase 4: Partition Map Methods
|
|
1913
|
+
// ============================================
|
|
1914
|
+
/**
|
|
1915
|
+
* Get current partition map version
|
|
1916
|
+
*/
|
|
1917
|
+
getMapVersion() {
|
|
1918
|
+
return this.mapVersion;
|
|
1919
|
+
}
|
|
1920
|
+
/**
|
|
1921
|
+
* Generate full PartitionMap for client consumption
|
|
1922
|
+
*/
|
|
1923
|
+
getPartitionMap() {
|
|
1924
|
+
const nodes = [];
|
|
1925
|
+
const partitions = [];
|
|
1926
|
+
for (const nodeId of this.cluster.getMembers()) {
|
|
1927
|
+
const isSelf = nodeId === this.cluster.config.nodeId;
|
|
1928
|
+
const host = isSelf ? this.cluster.config.host : "unknown";
|
|
1929
|
+
const port = isSelf ? this.cluster.port : 0;
|
|
1930
|
+
nodes.push({
|
|
1931
|
+
nodeId,
|
|
1932
|
+
endpoints: {
|
|
1933
|
+
websocket: `ws://${host}:${port}`
|
|
1934
|
+
},
|
|
1935
|
+
status: "ACTIVE"
|
|
1936
|
+
});
|
|
1937
|
+
}
|
|
1938
|
+
for (let i = 0; i < this.PARTITION_COUNT; i++) {
|
|
1939
|
+
const dist = this.partitions.get(i);
|
|
1940
|
+
if (dist) {
|
|
1941
|
+
partitions.push({
|
|
1942
|
+
partitionId: i,
|
|
1943
|
+
ownerNodeId: dist.owner,
|
|
1944
|
+
backupNodeIds: dist.backups
|
|
1945
|
+
});
|
|
1946
|
+
}
|
|
1947
|
+
}
|
|
1948
|
+
return {
|
|
1949
|
+
version: this.mapVersion,
|
|
1950
|
+
partitionCount: this.PARTITION_COUNT,
|
|
1951
|
+
nodes,
|
|
1952
|
+
partitions,
|
|
1953
|
+
generatedAt: Date.now()
|
|
1954
|
+
};
|
|
1955
|
+
}
|
|
1956
|
+
/**
|
|
1957
|
+
* Get partition info by ID
|
|
1958
|
+
*/
|
|
1959
|
+
getPartitionInfo(partitionId) {
|
|
1960
|
+
const dist = this.partitions.get(partitionId);
|
|
1961
|
+
if (!dist) return null;
|
|
1962
|
+
return {
|
|
1963
|
+
partitionId,
|
|
1964
|
+
ownerNodeId: dist.owner,
|
|
1965
|
+
backupNodeIds: dist.backups
|
|
1966
|
+
};
|
|
1967
|
+
}
|
|
1968
|
+
/**
|
|
1969
|
+
* Get owner node for a partition ID
|
|
1970
|
+
*/
|
|
1971
|
+
getPartitionOwner(partitionId) {
|
|
1972
|
+
const dist = this.partitions.get(partitionId);
|
|
1973
|
+
return dist?.owner ?? null;
|
|
1974
|
+
}
|
|
1975
|
+
rebalance(reason = "REBALANCE", triggerNodeId) {
|
|
1976
|
+
const oldPartitions = new Map(this.partitions);
|
|
940
1977
|
let allMembers = this.cluster.getMembers().sort();
|
|
941
1978
|
if (allMembers.length === 0) {
|
|
942
1979
|
allMembers = [this.cluster.config.nodeId];
|
|
943
1980
|
}
|
|
944
|
-
logger.info({ memberCount: allMembers.length, members: allMembers }, "Rebalancing partitions");
|
|
1981
|
+
logger.info({ memberCount: allMembers.length, members: allMembers, reason }, "Rebalancing partitions");
|
|
1982
|
+
const changes = [];
|
|
945
1983
|
for (let i = 0; i < this.PARTITION_COUNT; i++) {
|
|
946
1984
|
const ownerIndex = i % allMembers.length;
|
|
947
1985
|
const owner = allMembers[ownerIndex];
|
|
@@ -952,14 +1990,141 @@ var PartitionService = class {
|
|
|
952
1990
|
backups.push(allMembers[backupIndex]);
|
|
953
1991
|
}
|
|
954
1992
|
}
|
|
1993
|
+
const oldDist = oldPartitions.get(i);
|
|
1994
|
+
if (oldDist && oldDist.owner !== owner) {
|
|
1995
|
+
changes.push({
|
|
1996
|
+
partitionId: i,
|
|
1997
|
+
previousOwner: oldDist.owner,
|
|
1998
|
+
newOwner: owner,
|
|
1999
|
+
reason
|
|
2000
|
+
});
|
|
2001
|
+
}
|
|
955
2002
|
this.partitions.set(i, { owner, backups });
|
|
956
2003
|
}
|
|
2004
|
+
if (changes.length > 0 || this.mapVersion === 0) {
|
|
2005
|
+
this.mapVersion++;
|
|
2006
|
+
this.lastRebalanceTime = Date.now();
|
|
2007
|
+
logger.info({
|
|
2008
|
+
version: this.mapVersion,
|
|
2009
|
+
changesCount: changes.length,
|
|
2010
|
+
reason
|
|
2011
|
+
}, "Partition map updated");
|
|
2012
|
+
this.emit("rebalanced", this.getPartitionMap(), changes);
|
|
2013
|
+
}
|
|
2014
|
+
}
|
|
2015
|
+
// ============================================
|
|
2016
|
+
// Phase 4 Task 03: Gradual Rebalancing
|
|
2017
|
+
// ============================================
|
|
2018
|
+
/**
|
|
2019
|
+
* Perform gradual rebalancing using MigrationManager
|
|
2020
|
+
*/
|
|
2021
|
+
rebalanceGradual(reason, triggerNodeId) {
|
|
2022
|
+
if (!this.migrationManager) {
|
|
2023
|
+
this.rebalance(reason, triggerNodeId);
|
|
2024
|
+
return;
|
|
2025
|
+
}
|
|
2026
|
+
const oldDistribution = new Map(this.partitions);
|
|
2027
|
+
let allMembers = this.cluster.getMembers().sort();
|
|
2028
|
+
if (allMembers.length === 0) {
|
|
2029
|
+
allMembers = [this.cluster.config.nodeId];
|
|
2030
|
+
}
|
|
2031
|
+
const newDistribution = /* @__PURE__ */ new Map();
|
|
2032
|
+
for (let i = 0; i < this.PARTITION_COUNT; i++) {
|
|
2033
|
+
const ownerIndex = i % allMembers.length;
|
|
2034
|
+
const owner = allMembers[ownerIndex];
|
|
2035
|
+
const backups = [];
|
|
2036
|
+
if (allMembers.length > 1) {
|
|
2037
|
+
for (let b = 1; b <= this.BACKUP_COUNT; b++) {
|
|
2038
|
+
const backupIndex = (ownerIndex + b) % allMembers.length;
|
|
2039
|
+
backups.push(allMembers[backupIndex]);
|
|
2040
|
+
}
|
|
2041
|
+
}
|
|
2042
|
+
newDistribution.set(i, { owner, backups });
|
|
2043
|
+
}
|
|
2044
|
+
logger.info({ memberCount: allMembers.length, reason, triggerNodeId }, "Planning gradual rebalance");
|
|
2045
|
+
this.migrationManager.planMigration(oldDistribution, newDistribution);
|
|
2046
|
+
for (const [partitionId, dist] of newDistribution) {
|
|
2047
|
+
this.partitions.set(partitionId, dist);
|
|
2048
|
+
}
|
|
2049
|
+
this.mapVersion++;
|
|
2050
|
+
this.lastRebalanceTime = Date.now();
|
|
2051
|
+
const changes = [];
|
|
2052
|
+
for (const [partitionId, newDist] of newDistribution) {
|
|
2053
|
+
const oldDist = oldDistribution.get(partitionId);
|
|
2054
|
+
if (oldDist && oldDist.owner !== newDist.owner) {
|
|
2055
|
+
changes.push({
|
|
2056
|
+
partitionId,
|
|
2057
|
+
previousOwner: oldDist.owner,
|
|
2058
|
+
newOwner: newDist.owner,
|
|
2059
|
+
reason
|
|
2060
|
+
});
|
|
2061
|
+
}
|
|
2062
|
+
}
|
|
2063
|
+
this.emit("rebalanced", this.getPartitionMap(), changes);
|
|
2064
|
+
}
|
|
2065
|
+
/**
|
|
2066
|
+
* Set partition owner (called after migration completes)
|
|
2067
|
+
*/
|
|
2068
|
+
setOwner(partitionId, nodeId) {
|
|
2069
|
+
const partition = this.partitions.get(partitionId);
|
|
2070
|
+
if (!partition) return;
|
|
2071
|
+
const previousOwner = partition.owner;
|
|
2072
|
+
if (previousOwner === nodeId) return;
|
|
2073
|
+
partition.owner = nodeId;
|
|
2074
|
+
this.mapVersion++;
|
|
2075
|
+
logger.info({ partitionId, previousOwner, newOwner: nodeId, version: this.mapVersion }, "Partition owner updated");
|
|
2076
|
+
this.emit("partitionMoved", {
|
|
2077
|
+
partitionId,
|
|
2078
|
+
previousOwner,
|
|
2079
|
+
newOwner: nodeId,
|
|
2080
|
+
version: this.mapVersion
|
|
2081
|
+
});
|
|
2082
|
+
}
|
|
2083
|
+
/**
|
|
2084
|
+
* Get backups for a partition
|
|
2085
|
+
*/
|
|
2086
|
+
getBackups(partitionId) {
|
|
2087
|
+
const dist = this.partitions.get(partitionId);
|
|
2088
|
+
return dist?.backups ?? [];
|
|
2089
|
+
}
|
|
2090
|
+
/**
|
|
2091
|
+
* Get migration status
|
|
2092
|
+
*/
|
|
2093
|
+
getMigrationStatus() {
|
|
2094
|
+
return this.migrationManager?.getStatus() ?? null;
|
|
2095
|
+
}
|
|
2096
|
+
/**
|
|
2097
|
+
* Check if partition is currently migrating
|
|
2098
|
+
*/
|
|
2099
|
+
isMigrating(partitionId) {
|
|
2100
|
+
return this.migrationManager?.isActive(partitionId) ?? false;
|
|
2101
|
+
}
|
|
2102
|
+
/**
|
|
2103
|
+
* Check if any partition is currently migrating
|
|
2104
|
+
*/
|
|
2105
|
+
isRebalancing() {
|
|
2106
|
+
const status = this.getMigrationStatus();
|
|
2107
|
+
return status?.inProgress ?? false;
|
|
2108
|
+
}
|
|
2109
|
+
/**
|
|
2110
|
+
* Get MigrationManager for configuration
|
|
2111
|
+
*/
|
|
2112
|
+
getMigrationManager() {
|
|
2113
|
+
return this.migrationManager;
|
|
2114
|
+
}
|
|
2115
|
+
/**
|
|
2116
|
+
* Cancel all migrations
|
|
2117
|
+
*/
|
|
2118
|
+
async cancelMigrations() {
|
|
2119
|
+
if (this.migrationManager) {
|
|
2120
|
+
await this.migrationManager.cancelAll();
|
|
2121
|
+
}
|
|
957
2122
|
}
|
|
958
2123
|
};
|
|
959
2124
|
|
|
960
2125
|
// src/cluster/LockManager.ts
|
|
961
|
-
var
|
|
962
|
-
var _LockManager = class _LockManager extends
|
|
2126
|
+
var import_events5 = require("events");
|
|
2127
|
+
var _LockManager = class _LockManager extends import_events5.EventEmitter {
|
|
963
2128
|
// 5 minutes
|
|
964
2129
|
constructor() {
|
|
965
2130
|
super();
|
|
@@ -1474,8 +2639,8 @@ var SystemManager = class {
 };
 
 // src/utils/BoundedEventQueue.ts
-var
-var BoundedEventQueue = class extends
+var import_events6 = require("events");
+var BoundedEventQueue = class extends import_events6.EventEmitter {
   constructor(options) {
     super();
     this.queue = [];
@@ -1907,7 +3072,7 @@ var BackpressureRegulator = class {
 
 // src/utils/CoalescingWriter.ts
 var import_ws2 = require("ws");
-var
+var import_core5 = require("@topgunbuild/core");
 
 // src/memory/BufferPool.ts
 var DEFAULT_CONFIG2 = {
@@ -2438,7 +3603,7 @@ var CoalescingWriter = class {
     if (this.closed) {
       return;
     }
-    const data = (0,
+    const data = (0, import_core5.serialize)(message);
     this.writeRaw(data, urgent);
   }
   /**
@@ -2622,7 +3787,7 @@ var CoalescingWriter = class {
       offset += msg.data.length;
     }
     const usedBatch = batch.subarray(0, totalSize);
-    const batchEnvelope = (0,
+    const batchEnvelope = (0, import_core5.serialize)({
       type: "BATCH",
       count: messages.length,
       data: usedBatch
@@ -2637,13 +3802,23 @@ var CoalescingWriter = class {
 // src/utils/coalescingPresets.ts
 var coalescingPresets = {
   /**
-   *
-   *
-   *
+   * Low latency - optimized for minimal response time.
+   * Best for: gaming, real-time chat, interactive applications.
+   * Benchmark: p50=2ms, ~18K ops/sec
+   */
+  lowLatency: {
+    maxBatchSize: 100,
+    maxDelayMs: 1,
+    maxBatchBytes: 65536
+    // 64KB
+  },
+  /**
+   * Conservative - good balance of latency and batching.
+   * Use for: general purpose with latency sensitivity.
    */
   conservative: {
     maxBatchSize: 100,
-    maxDelayMs:
+    maxDelayMs: 2,
     maxBatchBytes: 65536
     // 64KB
   },
@@ -2654,7 +3829,7 @@ var coalescingPresets = {
    */
   balanced: {
     maxBatchSize: 300,
-    maxDelayMs:
+    maxDelayMs: 2,
     maxBatchBytes: 131072
     // 128KB
   },
@@ -2662,10 +3837,11 @@ var coalescingPresets = {
    * High throughput - optimized for write-heavy workloads.
    * Higher batching for better network utilization.
    * Use for: data ingestion, logging, IoT data streams.
+   * Benchmark: p50=7ms, ~18K ops/sec
    */
   highThroughput: {
     maxBatchSize: 500,
-    maxDelayMs:
+    maxDelayMs: 2,
     maxBatchBytes: 262144
     // 256KB
   },
@@ -2676,7 +3852,7 @@ var coalescingPresets = {
|
|
|
2676
3852
|
*/
|
|
2677
3853
|
aggressive: {
|
|
2678
3854
|
maxBatchSize: 1e3,
|
|
2679
|
-
maxDelayMs:
|
|
3855
|
+
maxDelayMs: 5,
|
|
2680
3856
|
maxBatchBytes: 524288
|
|
2681
3857
|
// 512KB
|
|
2682
3858
|
}
|
|
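The presets above trade flush delay against batch size: `lowLatency` flushes after at most 1 ms, while `aggressive` waits up to 5 ms to fill batches of up to 1000 messages / 512 KB. A minimal sketch of picking a preset for a workload; the import path and the exact writer options shape are assumptions for illustration, only the preset objects themselves appear in this diff:

```ts
import { coalescingPresets } from "@topgunbuild/server"; // assumed export

// Assumed options shape, mirroring the preset fields shown above.
interface CoalescingOptions {
  maxBatchSize: number;  // max messages per batch
  maxDelayMs: number;    // max time a message waits before flush
  maxBatchBytes: number; // max serialized bytes per batch
}

// Latency-sensitive settings for interactive apps, throughput-oriented
// settings for ingestion pipelines.
const options: CoalescingOptions = process.env.WORKLOAD === "ingest"
  ? coalescingPresets.highThroughput
  : coalescingPresets.lowLatency;
```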
@@ -3207,6 +4383,7 @@ var WorkerPool = class {
 
 // src/workers/MerkleWorker.ts
 var import_path2 = require("path");
+var import_core6 = require("@topgunbuild/core");
 var WORKER_THRESHOLD = 10;
 var taskIdCounter = 0;
 function generateTaskId() {
@@ -3416,12 +4593,7 @@ var MerkleWorker = class {
   }
   // ============ Hash utilities ============
   hashString(str) {
-
-    for (let i = 0; i < str.length; i++) {
-      hash ^= str.charCodeAt(i);
-      hash = Math.imul(hash, 16777619);
-    }
-    return hash >>> 0;
+    return (0, import_core6.hashString)(str);
   }
   buildTree(entries, depth) {
     const root = { hash: 0, children: {} };
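The inlined hash loop (xor with the char code, multiply by the FNV prime 16777619, unsigned shift) is replaced with the shared `hashString` from @topgunbuild/core. For reference, a sketch of the routine the worker previously inlined; the deleted initializer line is truncated in this diff, so the offset-basis value below is an assumption (the standard FNV-1a 32-bit seed that pairs with the 16777619 prime):

```ts
// FNV-1a 32-bit string hash, as suggested by the removed lines above.
function fnv1a32(str: string): number {
  let hash = 2166136261; // assumed offset basis; the original line is truncated in this diff
  for (let i = 0; i < str.length; i++) {
    hash ^= str.charCodeAt(i);        // fold in the next UTF-16 code unit
    hash = Math.imul(hash, 16777619); // multiply by the FNV prime, mod 2^32
  }
  return hash >>> 0; // coerce to an unsigned 32-bit integer
}
```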
@@ -3648,7 +4820,7 @@ CRDTMergeWorker.BATCH_THRESHOLD = WORKER_THRESHOLD2;
 
 // src/workers/SerializationWorker.ts
 var import_path3 = require("path");
-var
+var import_core7 = require("@topgunbuild/core");
 var WORKER_BATCH_THRESHOLD = 10;
 var WORKER_SIZE_THRESHOLD = 50 * 1024;
 var taskIdCounter3 = 0;
@@ -3775,26 +4947,26 @@ var SerializationWorker = class {
|
|
|
3775
4947
|
* Serialize a single object (always inline, too small for worker)
|
|
3776
4948
|
*/
|
|
3777
4949
|
serialize(data) {
|
|
3778
|
-
return (0,
|
|
4950
|
+
return (0, import_core7.serialize)(data);
|
|
3779
4951
|
}
|
|
3780
4952
|
/**
|
|
3781
4953
|
* Deserialize a single payload (always inline, too small for worker)
|
|
3782
4954
|
*/
|
|
3783
4955
|
deserialize(data) {
|
|
3784
|
-
return (0,
|
|
4956
|
+
return (0, import_core7.deserialize)(data);
|
|
3785
4957
|
}
|
|
3786
4958
|
// ============ Inline implementations for small batches ============
|
|
3787
4959
|
serializeBatchInline(items) {
|
|
3788
4960
|
const results = [];
|
|
3789
4961
|
for (const item of items) {
|
|
3790
|
-
results.push((0,
|
|
4962
|
+
results.push((0, import_core7.serialize)(item));
|
|
3791
4963
|
}
|
|
3792
4964
|
return results;
|
|
3793
4965
|
}
|
|
3794
4966
|
deserializeBatchInline(items) {
|
|
3795
4967
|
const results = [];
|
|
3796
4968
|
for (const item of items) {
|
|
3797
|
-
results.push((0,
|
|
4969
|
+
results.push((0, import_core7.deserialize)(item));
|
|
3798
4970
|
}
|
|
3799
4971
|
return results;
|
|
3800
4972
|
}
|
|
@@ -4382,13 +5554,13 @@ var ReduceTasklet = class extends IteratorTasklet {
 };
 
 // src/ack/WriteAckManager.ts
-var
-var
-var WriteAckManager = class extends
+var import_events7 = require("events");
+var import_core8 = require("@topgunbuild/core");
+var WriteAckManager = class extends import_events7.EventEmitter {
   constructor(config) {
     super();
     this.pending = /* @__PURE__ */ new Map();
-    this.defaultTimeout = config?.defaultTimeout ??
+    this.defaultTimeout = config?.defaultTimeout ?? import_core8.DEFAULT_WRITE_CONCERN_TIMEOUT;
   }
   /**
    * Register a pending write operation.
@@ -4400,11 +5572,11 @@ var WriteAckManager = class extends import_events4.EventEmitter {
    * @returns Promise that resolves with WriteResult
    */
   registerPending(opId, writeConcern, timeout) {
-    if (writeConcern ===
+    if (writeConcern === import_core8.WriteConcern.FIRE_AND_FORGET) {
       return Promise.resolve({
         success: true,
         opId,
-        achievedLevel:
+        achievedLevel: import_core8.WriteConcern.FIRE_AND_FORGET,
         latencyMs: 0
       });
     }
@@ -4418,7 +5590,7 @@ var WriteAckManager = class extends import_events4.EventEmitter {
         timeout: effectiveTimeout,
         resolve,
         reject,
-        achievedLevels: /* @__PURE__ */ new Set([
+        achievedLevels: /* @__PURE__ */ new Set([import_core8.WriteConcern.FIRE_AND_FORGET])
       };
       pendingWrite.timeoutHandle = setTimeout(() => {
         this.handleTimeout(opId);
@@ -4428,8 +5600,8 @@ var WriteAckManager = class extends import_events4.EventEmitter {
         { opId, writeConcern, timeout: effectiveTimeout },
         "Registered pending write"
       );
-      if (writeConcern ===
-        this.notifyLevel(opId,
+      if (writeConcern === import_core8.WriteConcern.MEMORY) {
+        this.notifyLevel(opId, import_core8.WriteConcern.MEMORY);
       }
     });
   }
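`registerPending` returns a promise that resolves once the requested write-concern level is reached: FIRE_AND_FORGET resolves immediately, MEMORY is notified as soon as the write is registered, and higher levels wait for `notifyLevel` calls from later stages. A hedged sketch of how a caller might use it; the wiring and types here are simplified for illustration and only the method names shown in this diff are relied on:

```ts
import { WriteConcern } from "@topgunbuild/core";

// ackManager is assumed to be a WriteAckManager instance; opId identifies one write.
async function writeWithAck(ackManager: any, opId: string): Promise<void> {
  const resultPromise = ackManager.registerPending(opId, WriteConcern.APPLIED, 5000);

  // ...apply the operation locally, then report the achieved level:
  ackManager.notifyLevel(opId, WriteConcern.APPLIED);

  const result = await resultPromise; // { success, opId, achievedLevel, latencyMs, error? }
  if (!result.success) {
    console.warn(`write ${opId} only reached ${result.achievedLevel}`);
  }
}
```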
@@ -4449,7 +5621,7 @@ var WriteAckManager = class extends import_events4.EventEmitter {
       { opId, level, target: pending.writeConcern },
       "Write Concern level achieved"
     );
-    if ((0,
+    if ((0, import_core8.isWriteConcernAchieved)(pending.achievedLevels, pending.writeConcern)) {
       this.resolvePending(opId, level);
     }
   }
@@ -4492,7 +5664,7 @@ var WriteAckManager = class extends import_events4.EventEmitter {
   getAchievedLevel(opId) {
     const pending = this.pending.get(opId);
     if (!pending) return void 0;
-    return (0,
+    return (0, import_core8.getHighestWriteConcernLevel)(pending.achievedLevels);
   }
   /**
    * Resolve a pending write with success.
@@ -4524,7 +5696,7 @@ var WriteAckManager = class extends import_events4.EventEmitter {
   handleTimeout(opId) {
     const pending = this.pending.get(opId);
     if (!pending) return;
-    const highestAchieved = (0,
+    const highestAchieved = (0, import_core8.getHighestWriteConcernLevel)(pending.achievedLevels);
     const latencyMs = Date.now() - pending.timestamp;
     const result = {
       success: false,
@@ -4552,89 +5724,747 @@ var WriteAckManager = class extends import_events4.EventEmitter {
|
|
|
4552
5724
|
* @param opId - Operation ID
|
|
4553
5725
|
* @param error - Error message
|
|
4554
5726
|
*/
|
|
4555
|
-
failPending(opId, error) {
|
|
4556
|
-
const pending = this.pending.get(opId);
|
|
5727
|
+
failPending(opId, error) {
|
|
5728
|
+
const pending = this.pending.get(opId);
|
|
5729
|
+
if (!pending) return;
|
|
5730
|
+
if (pending.timeoutHandle) {
|
|
5731
|
+
clearTimeout(pending.timeoutHandle);
|
|
5732
|
+
}
|
|
5733
|
+
const latencyMs = Date.now() - pending.timestamp;
|
|
5734
|
+
const highestAchieved = (0, import_core8.getHighestWriteConcernLevel)(pending.achievedLevels);
|
|
5735
|
+
const result = {
|
|
5736
|
+
success: false,
|
|
5737
|
+
opId,
|
|
5738
|
+
achievedLevel: highestAchieved,
|
|
5739
|
+
latencyMs,
|
|
5740
|
+
error
|
|
5741
|
+
};
|
|
5742
|
+
pending.resolve(result);
|
|
5743
|
+
this.pending.delete(opId);
|
|
5744
|
+
logger.error({ opId, error, latencyMs }, "Write failed");
|
|
5745
|
+
this.emit("failed", result);
|
|
5746
|
+
}
|
|
5747
|
+
/**
|
|
5748
|
+
* Get pending writes statistics.
|
|
5749
|
+
*/
|
|
5750
|
+
getStats() {
|
|
5751
|
+
const byLevel = {
|
|
5752
|
+
[import_core8.WriteConcern.FIRE_AND_FORGET]: 0,
|
|
5753
|
+
[import_core8.WriteConcern.MEMORY]: 0,
|
|
5754
|
+
[import_core8.WriteConcern.APPLIED]: 0,
|
|
5755
|
+
[import_core8.WriteConcern.REPLICATED]: 0,
|
|
5756
|
+
[import_core8.WriteConcern.PERSISTED]: 0
|
|
5757
|
+
};
|
|
5758
|
+
for (const pending of this.pending.values()) {
|
|
5759
|
+
byLevel[pending.writeConcern]++;
|
|
5760
|
+
}
|
|
5761
|
+
return { pending: this.pending.size, byLevel };
|
|
5762
|
+
}
|
|
5763
|
+
/**
|
|
5764
|
+
* Get all pending operation IDs.
|
|
5765
|
+
*/
|
|
5766
|
+
getPendingIds() {
|
|
5767
|
+
return Array.from(this.pending.keys());
|
|
5768
|
+
}
|
|
5769
|
+
/**
|
|
5770
|
+
* Clear all pending writes (for shutdown).
|
|
5771
|
+
* Rejects all pending promises with an error.
|
|
5772
|
+
*/
|
|
5773
|
+
clear() {
|
|
5774
|
+
const count = this.pending.size;
|
|
5775
|
+
for (const pending of this.pending.values()) {
|
|
5776
|
+
if (pending.timeoutHandle) {
|
|
5777
|
+
clearTimeout(pending.timeoutHandle);
|
|
5778
|
+
}
|
|
5779
|
+
pending.reject(new Error("WriteAckManager cleared"));
|
|
5780
|
+
}
|
|
5781
|
+
this.pending.clear();
|
|
5782
|
+
if (count > 0) {
|
|
5783
|
+
logger.info({ count }, "WriteAckManager cleared");
|
|
5784
|
+
}
|
|
5785
|
+
}
|
|
5786
|
+
/**
|
|
5787
|
+
* Graceful shutdown - resolves all pending writes with their current achieved level.
|
|
5788
|
+
*/
|
|
5789
|
+
shutdown() {
|
|
5790
|
+
const count = this.pending.size;
|
|
5791
|
+
for (const [opId, pending] of this.pending.entries()) {
|
|
5792
|
+
if (pending.timeoutHandle) {
|
|
5793
|
+
clearTimeout(pending.timeoutHandle);
|
|
5794
|
+
}
|
|
5795
|
+
const highestAchieved = (0, import_core8.getHighestWriteConcernLevel)(pending.achievedLevels);
|
|
5796
|
+
const latencyMs = Date.now() - pending.timestamp;
|
|
5797
|
+
const result = {
|
|
5798
|
+
success: highestAchieved === pending.writeConcern,
|
|
5799
|
+
opId,
|
|
5800
|
+
achievedLevel: highestAchieved,
|
|
5801
|
+
latencyMs,
|
|
5802
|
+
error: highestAchieved !== pending.writeConcern ? `Shutdown: achieved ${highestAchieved}, requested ${pending.writeConcern}` : void 0
|
|
5803
|
+
};
|
|
5804
|
+
pending.resolve(result);
|
|
5805
|
+
}
|
|
5806
|
+
this.pending.clear();
|
|
5807
|
+
if (count > 0) {
|
|
5808
|
+
logger.info({ count }, "WriteAckManager shutdown");
|
|
5809
|
+
}
|
|
5810
|
+
}
|
|
5811
|
+
};
|
|
5812
|
+
|
|
5813
|
+
// src/cluster/ReplicationPipeline.ts
|
|
5814
|
+
var import_events8 = require("events");
|
|
5815
|
+
var import_core9 = require("@topgunbuild/core");
|
|
5816
|
+
|
|
5817
|
+
// src/cluster/LagTracker.ts
|
|
5818
|
+
var DEFAULT_LAG_TRACKER_CONFIG = {
|
|
5819
|
+
historySize: 100,
|
|
5820
|
+
laggyThresholdMs: 5e3,
|
|
5821
|
+
unhealthyThresholdMs: 3e4
|
|
5822
|
+
};
|
|
5823
|
+
var LagTracker = class {
|
|
5824
|
+
constructor(config = {}) {
|
|
5825
|
+
this.lagByNode = /* @__PURE__ */ new Map();
|
|
5826
|
+
this.config = {
|
|
5827
|
+
...DEFAULT_LAG_TRACKER_CONFIG,
|
|
5828
|
+
...config
|
|
5829
|
+
};
|
|
5830
|
+
}
|
|
5831
|
+
/**
|
|
5832
|
+
* Update lag measurement for a node
|
|
5833
|
+
*/
|
|
5834
|
+
update(nodeId, lagMs) {
|
|
5835
|
+
let info = this.lagByNode.get(nodeId);
|
|
5836
|
+
if (!info) {
|
|
5837
|
+
info = {
|
|
5838
|
+
current: 0,
|
|
5839
|
+
history: [],
|
|
5840
|
+
lastUpdate: Date.now(),
|
|
5841
|
+
pendingOps: 0
|
|
5842
|
+
};
|
|
5843
|
+
this.lagByNode.set(nodeId, info);
|
|
5844
|
+
}
|
|
5845
|
+
info.current = lagMs;
|
|
5846
|
+
info.history.push(lagMs);
|
|
5847
|
+
if (info.history.length > this.config.historySize) {
|
|
5848
|
+
info.history.shift();
|
|
5849
|
+
}
|
|
5850
|
+
info.lastUpdate = Date.now();
|
|
5851
|
+
}
|
|
5852
|
+
/**
|
|
5853
|
+
* Record acknowledgment from a node (lag effectively becomes 0)
|
|
5854
|
+
*/
|
|
5855
|
+
recordAck(nodeId) {
|
|
5856
|
+
const info = this.lagByNode.get(nodeId);
|
|
5857
|
+
if (info) {
|
|
5858
|
+
info.current = 0;
|
|
5859
|
+
info.lastUpdate = Date.now();
|
|
5860
|
+
if (info.pendingOps > 0) {
|
|
5861
|
+
info.pendingOps--;
|
|
5862
|
+
}
|
|
5863
|
+
}
|
|
5864
|
+
}
|
|
5865
|
+
/**
|
|
5866
|
+
* Increment pending operations counter for a node
|
|
5867
|
+
*/
|
|
5868
|
+
incrementPending(nodeId) {
|
|
5869
|
+
let info = this.lagByNode.get(nodeId);
|
|
5870
|
+
if (!info) {
|
|
5871
|
+
info = {
|
|
5872
|
+
current: 0,
|
|
5873
|
+
history: [],
|
|
5874
|
+
lastUpdate: Date.now(),
|
|
5875
|
+
pendingOps: 0
|
|
5876
|
+
};
|
|
5877
|
+
this.lagByNode.set(nodeId, info);
|
|
5878
|
+
}
|
|
5879
|
+
info.pendingOps++;
|
|
5880
|
+
}
|
|
5881
|
+
/**
|
|
5882
|
+
* Get lag statistics for a specific node
|
|
5883
|
+
*/
|
|
5884
|
+
getLag(nodeId) {
|
|
5885
|
+
const info = this.lagByNode.get(nodeId);
|
|
5886
|
+
if (!info || info.history.length === 0) {
|
|
5887
|
+
return { current: 0, avg: 0, max: 0, percentile99: 0 };
|
|
5888
|
+
}
|
|
5889
|
+
const sorted = [...info.history].sort((a, b) => a - b);
|
|
5890
|
+
const avg = sorted.reduce((a, b) => a + b, 0) / sorted.length;
|
|
5891
|
+
const max = sorted[sorted.length - 1] || 0;
|
|
5892
|
+
const p99Index = Math.floor(sorted.length * 0.99);
|
|
5893
|
+
const percentile99 = sorted[p99Index] || max;
|
|
5894
|
+
return {
|
|
5895
|
+
current: info.current,
|
|
5896
|
+
avg: Math.round(avg * 100) / 100,
|
|
5897
|
+
// Round to 2 decimal places
|
|
5898
|
+
max,
|
|
5899
|
+
percentile99
|
|
5900
|
+
};
|
|
5901
|
+
}
|
|
5902
|
+
/**
|
|
5903
|
+
* Get pending operations count for a node
|
|
5904
|
+
*/
|
|
5905
|
+
getPendingOps(nodeId) {
|
|
5906
|
+
const info = this.lagByNode.get(nodeId);
|
|
5907
|
+
return info?.pendingOps ?? 0;
|
|
5908
|
+
}
|
|
5909
|
+
/**
|
|
5910
|
+
* Get overall replication health status
|
|
5911
|
+
*/
|
|
5912
|
+
getHealth() {
|
|
5913
|
+
const unhealthyNodes = [];
|
|
5914
|
+
const laggyNodes = [];
|
|
5915
|
+
let totalLag = 0;
|
|
5916
|
+
let nodeCount = 0;
|
|
5917
|
+
const now = Date.now();
|
|
5918
|
+
for (const [nodeId, info] of this.lagByNode) {
|
|
5919
|
+
const timeSinceUpdate = now - info.lastUpdate;
|
|
5920
|
+
if (timeSinceUpdate > this.config.unhealthyThresholdMs) {
|
|
5921
|
+
unhealthyNodes.push(nodeId);
|
|
5922
|
+
} else if (info.current > this.config.laggyThresholdMs) {
|
|
5923
|
+
laggyNodes.push(nodeId);
|
|
5924
|
+
}
|
|
5925
|
+
totalLag += info.current;
|
|
5926
|
+
nodeCount++;
|
|
5927
|
+
}
|
|
5928
|
+
const avgLagMs = nodeCount > 0 ? totalLag / nodeCount : 0;
|
|
5929
|
+
return {
|
|
5930
|
+
healthy: unhealthyNodes.length === 0,
|
|
5931
|
+
unhealthyNodes,
|
|
5932
|
+
laggyNodes,
|
|
5933
|
+
avgLagMs: Math.round(avgLagMs * 100) / 100
|
|
5934
|
+
};
|
|
5935
|
+
}
|
|
5936
|
+
/**
|
|
5937
|
+
* Get average lag across all tracked nodes
|
|
5938
|
+
*/
|
|
5939
|
+
getAverageLag() {
|
|
5940
|
+
let total = 0;
|
|
5941
|
+
let count = 0;
|
|
5942
|
+
for (const info of this.lagByNode.values()) {
|
|
5943
|
+
total += info.current;
|
|
5944
|
+
count++;
|
|
5945
|
+
}
|
|
5946
|
+
return count > 0 ? total / count : 0;
|
|
5947
|
+
}
|
|
5948
|
+
/**
|
|
5949
|
+
* Check if a specific node is considered healthy
|
|
5950
|
+
*/
|
|
5951
|
+
isNodeHealthy(nodeId) {
|
|
5952
|
+
const info = this.lagByNode.get(nodeId);
|
|
5953
|
+
if (!info) return true;
|
|
5954
|
+
const timeSinceUpdate = Date.now() - info.lastUpdate;
|
|
5955
|
+
return timeSinceUpdate < this.config.unhealthyThresholdMs;
|
|
5956
|
+
}
|
|
5957
|
+
/**
|
|
5958
|
+
* Check if a specific node is considered laggy
|
|
5959
|
+
*/
|
|
5960
|
+
isNodeLaggy(nodeId) {
|
|
5961
|
+
const info = this.lagByNode.get(nodeId);
|
|
5962
|
+
if (!info) return false;
|
|
5963
|
+
return info.current > this.config.laggyThresholdMs;
|
|
5964
|
+
}
|
|
5965
|
+
/**
|
|
5966
|
+
* Remove a node from tracking
|
|
5967
|
+
*/
|
|
5968
|
+
removeNode(nodeId) {
|
|
5969
|
+
this.lagByNode.delete(nodeId);
|
|
5970
|
+
}
|
|
5971
|
+
/**
|
|
5972
|
+
* Get all tracked node IDs
|
|
5973
|
+
*/
|
|
5974
|
+
getTrackedNodes() {
|
|
5975
|
+
return Array.from(this.lagByNode.keys());
|
|
5976
|
+
}
|
|
5977
|
+
/**
|
|
5978
|
+
* Get raw lag info for a node (for advanced monitoring)
|
|
5979
|
+
*/
|
|
5980
|
+
getRawLagInfo(nodeId) {
|
|
5981
|
+
return this.lagByNode.get(nodeId);
|
|
5982
|
+
}
|
|
5983
|
+
/**
|
|
5984
|
+
* Clear all tracking data
|
|
5985
|
+
*/
|
|
5986
|
+
clear() {
|
|
5987
|
+
this.lagByNode.clear();
|
|
5988
|
+
}
|
|
5989
|
+
/**
|
|
5990
|
+
* Export metrics in Prometheus format
|
|
5991
|
+
*/
|
|
5992
|
+
toPrometheusMetrics() {
|
|
5993
|
+
const lines = [
|
|
5994
|
+
"# HELP topgun_replication_lag_ms Current replication lag in milliseconds",
|
|
5995
|
+
"# TYPE topgun_replication_lag_ms gauge"
|
|
5996
|
+
];
|
|
5997
|
+
for (const [nodeId, info] of this.lagByNode) {
|
|
5998
|
+
lines.push(`topgun_replication_lag_ms{node="${nodeId}"} ${info.current}`);
|
|
5999
|
+
}
|
|
6000
|
+
lines.push("");
|
|
6001
|
+
lines.push("# HELP topgun_replication_pending_ops Pending replication operations");
|
|
6002
|
+
lines.push("# TYPE topgun_replication_pending_ops gauge");
|
|
6003
|
+
for (const [nodeId, info] of this.lagByNode) {
|
|
6004
|
+
lines.push(`topgun_replication_pending_ops{node="${nodeId}"} ${info.pendingOps}`);
|
|
6005
|
+
}
|
|
6006
|
+
const health = this.getHealth();
|
|
6007
|
+
lines.push("");
|
|
6008
|
+
lines.push("# HELP topgun_replication_healthy Cluster replication health (1=healthy, 0=unhealthy)");
|
|
6009
|
+
lines.push("# TYPE topgun_replication_healthy gauge");
|
|
6010
|
+
lines.push(`topgun_replication_healthy ${health.healthy ? 1 : 0}`);
|
|
6011
|
+
lines.push("");
|
|
6012
|
+
lines.push("# HELP topgun_replication_avg_lag_ms Average replication lag across all nodes");
|
|
6013
|
+
lines.push("# TYPE topgun_replication_avg_lag_ms gauge");
|
|
6014
|
+
lines.push(`topgun_replication_avg_lag_ms ${health.avgLagMs}`);
|
|
6015
|
+
return lines.join("\n");
|
|
6016
|
+
}
|
|
6017
|
+
};
|
|
6018
|
+
|
|
6019
|
+
// src/cluster/ReplicationPipeline.ts
|
|
6020
|
+
var ReplicationTimeoutError = class extends Error {
|
|
6021
|
+
constructor(opId, targetNodes, ackedNodes) {
|
|
6022
|
+
super(
|
|
6023
|
+
`Replication timeout for operation ${opId}. Expected: ${targetNodes.join(", ")}, Acked: ${ackedNodes.join(", ")}`
|
|
6024
|
+
);
|
|
6025
|
+
this.opId = opId;
|
|
6026
|
+
this.targetNodes = targetNodes;
|
|
6027
|
+
this.ackedNodes = ackedNodes;
|
|
6028
|
+
this.name = "ReplicationTimeoutError";
|
|
6029
|
+
}
|
|
6030
|
+
};
|
|
6031
|
+
var ReplicationPipeline = class extends import_events8.EventEmitter {
|
|
6032
|
+
constructor(clusterManager, partitionService, config = {}) {
|
|
6033
|
+
super();
|
|
6034
|
+
// Replication queues per node (for EVENTUAL mode)
|
|
6035
|
+
this.replicationQueue = /* @__PURE__ */ new Map();
|
|
6036
|
+
// Pending acknowledgments (for STRONG/QUORUM mode)
|
|
6037
|
+
this.pendingAcks = /* @__PURE__ */ new Map();
|
|
6038
|
+
// Queue processor timer
|
|
6039
|
+
this.queueProcessorTimer = null;
|
|
6040
|
+
// Operation applier callback (injected by ServerCoordinator)
|
|
6041
|
+
this.operationApplier = null;
|
|
6042
|
+
this.clusterManager = clusterManager;
|
|
6043
|
+
this.partitionService = partitionService;
|
|
6044
|
+
this.nodeId = clusterManager.config.nodeId;
|
|
6045
|
+
this.config = {
|
|
6046
|
+
...import_core9.DEFAULT_REPLICATION_CONFIG,
|
|
6047
|
+
...config
|
|
6048
|
+
};
|
|
6049
|
+
this.lagTracker = new LagTracker();
|
|
6050
|
+
this.setupMessageHandlers();
|
|
6051
|
+
this.startQueueProcessor();
|
|
6052
|
+
}
|
|
6053
|
+
// ============================================
|
|
6054
|
+
// Configuration
|
|
6055
|
+
// ============================================
|
|
6056
|
+
/**
|
|
6057
|
+
* Set the operation applier callback
|
|
6058
|
+
* This is called when replicated operations are received from other nodes
|
|
6059
|
+
*/
|
|
6060
|
+
setOperationApplier(applier) {
|
|
6061
|
+
this.operationApplier = applier;
|
|
6062
|
+
}
|
|
6063
|
+
// ============================================
|
|
6064
|
+
// Replication API
|
|
6065
|
+
// ============================================
|
|
6066
|
+
/**
|
|
6067
|
+
* Replicate operation to backup nodes
|
|
6068
|
+
*/
|
|
6069
|
+
async replicate(operation, opId, key, options = {}) {
|
|
6070
|
+
const consistency = options.consistency ?? this.config.defaultConsistency;
|
|
6071
|
+
const partitionId = this.partitionService.getPartitionId(key);
|
|
6072
|
+
const backups = this.partitionService.getBackups(partitionId);
|
|
6073
|
+
if (backups.length === 0) {
|
|
6074
|
+
return { success: true, ackedBy: [this.nodeId] };
|
|
6075
|
+
}
|
|
6076
|
+
switch (consistency) {
|
|
6077
|
+
case import_core9.ConsistencyLevel.STRONG:
|
|
6078
|
+
return this.replicateStrong(operation, opId, backups, options.timeout);
|
|
6079
|
+
case import_core9.ConsistencyLevel.QUORUM:
|
|
6080
|
+
return this.replicateQuorum(operation, opId, backups, options.timeout);
|
|
6081
|
+
case import_core9.ConsistencyLevel.EVENTUAL:
|
|
6082
|
+
return this.replicateEventual(operation, opId, backups);
|
|
6083
|
+
}
|
|
6084
|
+
}
|
|
6085
|
+
/**
|
|
6086
|
+
* STRONG: Wait for all replicas to acknowledge
|
|
6087
|
+
*/
|
|
6088
|
+
async replicateStrong(operation, opId, backups, timeout) {
|
|
6089
|
+
const targetNodes = backups;
|
|
6090
|
+
return new Promise((resolve, reject) => {
|
|
6091
|
+
const pending = {
|
|
6092
|
+
opId,
|
|
6093
|
+
consistency: import_core9.ConsistencyLevel.STRONG,
|
|
6094
|
+
targetNodes,
|
|
6095
|
+
ackedNodes: /* @__PURE__ */ new Set(),
|
|
6096
|
+
resolve: () => resolve({
|
|
6097
|
+
success: true,
|
|
6098
|
+
ackedBy: [this.nodeId, ...targetNodes]
|
|
6099
|
+
}),
|
|
6100
|
+
reject: (error) => reject(error),
|
|
6101
|
+
timeout: setTimeout(() => {
|
|
6102
|
+
this.pendingAcks.delete(opId);
|
|
6103
|
+
const ackedList = Array.from(pending.ackedNodes);
|
|
6104
|
+
reject(new ReplicationTimeoutError(opId, targetNodes, ackedList));
|
|
6105
|
+
}, timeout ?? this.config.ackTimeoutMs),
|
|
6106
|
+
startTime: Date.now()
|
|
6107
|
+
};
|
|
6108
|
+
this.pendingAcks.set(opId, pending);
|
|
6109
|
+
for (const nodeId of targetNodes) {
|
|
6110
|
+
this.lagTracker.incrementPending(nodeId);
|
|
6111
|
+
}
|
|
6112
|
+
for (const nodeId of targetNodes) {
|
|
6113
|
+
this.sendReplication(nodeId, operation, opId, import_core9.ConsistencyLevel.STRONG);
|
|
6114
|
+
}
|
|
6115
|
+
});
|
|
6116
|
+
}
|
|
6117
|
+
/**
|
|
6118
|
+
* QUORUM: Wait for majority of replicas
|
|
6119
|
+
*/
|
|
6120
|
+
async replicateQuorum(operation, opId, backups, timeout) {
|
|
6121
|
+
const targetNodes = backups;
|
|
6122
|
+
const quorumSize = Math.floor(targetNodes.length / 2) + 1;
|
|
6123
|
+
return new Promise((resolve, reject) => {
|
|
6124
|
+
const ackedNodes = /* @__PURE__ */ new Set();
|
|
6125
|
+
const pending = {
|
|
6126
|
+
opId,
|
|
6127
|
+
consistency: import_core9.ConsistencyLevel.QUORUM,
|
|
6128
|
+
targetNodes,
|
|
6129
|
+
ackedNodes,
|
|
6130
|
+
resolve: () => {
|
|
6131
|
+
const ackedSnapshot = Array.from(ackedNodes);
|
|
6132
|
+
const ackedBy = [this.nodeId, ...ackedSnapshot];
|
|
6133
|
+
resolve({ success: true, ackedBy });
|
|
6134
|
+
},
|
|
6135
|
+
reject: (error) => reject(error),
|
|
6136
|
+
timeout: setTimeout(() => {
|
|
6137
|
+
this.pendingAcks.delete(opId);
|
|
6138
|
+
const ackedList = Array.from(ackedNodes);
|
|
6139
|
+
reject(new ReplicationTimeoutError(opId, targetNodes, ackedList));
|
|
6140
|
+
}, timeout ?? this.config.ackTimeoutMs),
|
|
6141
|
+
startTime: Date.now()
|
|
6142
|
+
};
|
|
6143
|
+
this.pendingAcks.set(opId, pending);
|
|
6144
|
+
for (const nodeId of targetNodes) {
|
|
6145
|
+
this.lagTracker.incrementPending(nodeId);
|
|
6146
|
+
}
|
|
6147
|
+
for (const nodeId of targetNodes) {
|
|
6148
|
+
this.sendReplication(nodeId, operation, opId, import_core9.ConsistencyLevel.QUORUM);
|
|
6149
|
+
}
|
|
6150
|
+
});
|
|
6151
|
+
}
|
|
6152
|
+
/**
|
|
6153
|
+
* EVENTUAL: Fire-and-forget with queue
|
|
6154
|
+
*/
|
|
6155
|
+
async replicateEventual(operation, opId, backups) {
|
|
6156
|
+
for (const nodeId of backups) {
|
|
6157
|
+
this.enqueue(nodeId, {
|
|
6158
|
+
opId,
|
|
6159
|
+
operation,
|
|
6160
|
+
consistency: import_core9.ConsistencyLevel.EVENTUAL,
|
|
6161
|
+
timestamp: Date.now(),
|
|
6162
|
+
retryCount: 0
|
|
6163
|
+
});
|
|
6164
|
+
}
|
|
6165
|
+
return { success: true, ackedBy: [this.nodeId] };
|
|
6166
|
+
}
|
|
6167
|
+
// ============================================
|
|
6168
|
+
// Queue Management
|
|
6169
|
+
// ============================================
|
|
6170
|
+
/**
|
|
6171
|
+
* Add task to replication queue
|
|
6172
|
+
*/
|
|
6173
|
+
enqueue(nodeId, task) {
|
|
6174
|
+
let queue = this.replicationQueue.get(nodeId);
|
|
6175
|
+
if (!queue) {
|
|
6176
|
+
queue = [];
|
|
6177
|
+
this.replicationQueue.set(nodeId, queue);
|
|
6178
|
+
}
|
|
6179
|
+
if (queue.length >= this.config.queueSizeLimit) {
|
|
6180
|
+
this.emit("queueOverflow", nodeId);
|
|
6181
|
+
logger.warn({ nodeId, queueSize: queue.length }, "Replication queue overflow, dropping oldest");
|
|
6182
|
+
queue.shift();
|
|
6183
|
+
}
|
|
6184
|
+
queue.push(task);
|
|
6185
|
+
this.lagTracker.incrementPending(nodeId);
|
|
6186
|
+
}
|
|
6187
|
+
/**
|
|
6188
|
+
* Start queue processor
|
|
6189
|
+
*/
|
|
6190
|
+
startQueueProcessor() {
|
|
6191
|
+
if (this.queueProcessorTimer) return;
|
|
6192
|
+
this.queueProcessorTimer = setInterval(() => {
|
|
6193
|
+
for (const nodeId of this.replicationQueue.keys()) {
|
|
6194
|
+
this.processQueue(nodeId).catch((err) => {
|
|
6195
|
+
logger.error({ nodeId, error: err }, "Error processing replication queue");
|
|
6196
|
+
this.emit("error", err);
|
|
6197
|
+
});
|
|
6198
|
+
}
|
|
6199
|
+
}, this.config.batchIntervalMs);
|
|
6200
|
+
}
|
|
6201
|
+
/**
|
|
6202
|
+
* Stop queue processor
|
|
6203
|
+
*/
|
|
6204
|
+
stopQueueProcessor() {
|
|
6205
|
+
if (this.queueProcessorTimer) {
|
|
6206
|
+
clearInterval(this.queueProcessorTimer);
|
|
6207
|
+
this.queueProcessorTimer = null;
|
|
6208
|
+
}
|
|
6209
|
+
}
|
|
6210
|
+
/**
|
|
6211
|
+
* Process replication queue for a node
|
|
6212
|
+
*/
|
|
6213
|
+
async processQueue(nodeId) {
|
|
6214
|
+
const queue = this.replicationQueue.get(nodeId);
|
|
6215
|
+
if (!queue || queue.length === 0) return;
|
|
6216
|
+
const batch = queue.splice(0, this.config.batchSize);
|
|
6217
|
+
try {
|
|
6218
|
+
this.clusterManager.send(nodeId, "OP_FORWARD", {
|
|
6219
|
+
_replication: {
|
|
6220
|
+
type: "REPLICATION_BATCH",
|
|
6221
|
+
payload: {
|
|
6222
|
+
operations: batch.map((t) => t.operation),
|
|
6223
|
+
opIds: batch.map((t) => t.opId)
|
|
6224
|
+
}
|
|
6225
|
+
}
|
|
6226
|
+
});
|
|
6227
|
+
const oldestTimestamp = Math.min(...batch.map((t) => t.timestamp));
|
|
6228
|
+
this.lagTracker.update(nodeId, Date.now() - oldestTimestamp);
|
|
6229
|
+
logger.debug({ nodeId, batchSize: batch.length }, "Sent replication batch");
|
|
6230
|
+
} catch (error) {
|
|
6231
|
+
for (const task of batch) {
|
|
6232
|
+
task.retryCount++;
|
|
6233
|
+
if (task.retryCount <= this.config.maxRetries) {
|
|
6234
|
+
queue.unshift(task);
|
|
6235
|
+
} else {
|
|
6236
|
+
logger.warn({ nodeId, opId: task.opId, retries: task.retryCount }, "Replication task exceeded max retries");
|
|
6237
|
+
this.emit("replicationFailed", task.opId, new Error("Max retries exceeded"));
|
|
6238
|
+
}
|
|
6239
|
+
}
|
|
6240
|
+
}
|
|
6241
|
+
}
|
|
6242
|
+
// ============================================
|
|
6243
|
+
// Message Handling
|
|
6244
|
+
// ============================================
|
|
6245
|
+
/**
|
|
6246
|
+
* Send replication message to a node
|
|
6247
|
+
*/
|
|
6248
|
+
sendReplication(nodeId, operation, opId, consistency) {
|
|
6249
|
+
this.clusterManager.send(nodeId, "OP_FORWARD", {
|
|
6250
|
+
_replication: {
|
|
6251
|
+
type: "REPLICATION",
|
|
6252
|
+
payload: {
|
|
6253
|
+
opId,
|
|
6254
|
+
operation,
|
|
6255
|
+
consistency
|
|
6256
|
+
}
|
|
6257
|
+
}
|
|
6258
|
+
});
|
|
6259
|
+
}
|
|
6260
|
+
/**
|
|
6261
|
+
* Setup cluster message handlers
|
|
6262
|
+
*/
|
|
6263
|
+
setupMessageHandlers() {
|
|
6264
|
+
this.clusterManager.on("message", (msg) => {
|
|
6265
|
+
if (msg.payload?._replication) {
|
|
6266
|
+
const replication = msg.payload._replication;
|
|
6267
|
+
switch (replication.type) {
|
|
6268
|
+
case "REPLICATION":
|
|
6269
|
+
this.handleReplication(msg.senderId, replication.payload);
|
|
6270
|
+
break;
|
|
6271
|
+
case "REPLICATION_BATCH":
|
|
6272
|
+
this.handleReplicationBatch(msg.senderId, replication.payload);
|
|
6273
|
+
break;
|
|
6274
|
+
case "REPLICATION_ACK":
|
|
6275
|
+
this.handleReplicationAck(msg.senderId, replication.payload);
|
|
6276
|
+
break;
|
|
6277
|
+
case "REPLICATION_BATCH_ACK":
|
|
6278
|
+
this.handleReplicationBatchAck(msg.senderId, replication.payload);
|
|
6279
|
+
break;
|
|
6280
|
+
}
|
|
6281
|
+
}
|
|
6282
|
+
});
|
|
6283
|
+
}
|
|
6284
|
+
/**
|
|
6285
|
+
* Handle incoming replication request (on backup node)
|
|
6286
|
+
*/
|
|
6287
|
+
async handleReplication(sourceNode, payload) {
|
|
6288
|
+
const { opId, operation, consistency } = payload;
|
|
6289
|
+
logger.debug({ sourceNode, opId, consistency }, "Received replication");
|
|
6290
|
+
let success = true;
|
|
6291
|
+
if (this.operationApplier) {
|
|
6292
|
+
try {
|
|
6293
|
+
success = await this.operationApplier(operation, opId, sourceNode);
|
|
6294
|
+
} catch (error) {
|
|
6295
|
+
logger.error({ sourceNode, opId, error }, "Failed to apply replicated operation");
|
|
6296
|
+
success = false;
|
|
6297
|
+
}
|
|
6298
|
+
} else {
|
|
6299
|
+
logger.warn({ sourceNode, opId }, "No operation applier set, operation not applied");
|
|
6300
|
+
}
|
|
6301
|
+
if (consistency === import_core9.ConsistencyLevel.STRONG || consistency === import_core9.ConsistencyLevel.QUORUM) {
|
|
6302
|
+
this.clusterManager.send(sourceNode, "OP_FORWARD", {
|
|
6303
|
+
_replication: {
|
|
6304
|
+
type: "REPLICATION_ACK",
|
|
6305
|
+
payload: {
|
|
6306
|
+
opId,
|
|
6307
|
+
success,
|
|
6308
|
+
timestamp: Date.now()
|
|
6309
|
+
}
|
|
6310
|
+
}
|
|
6311
|
+
});
|
|
6312
|
+
}
|
|
6313
|
+
}
|
|
6314
|
+
/**
|
|
6315
|
+
* Handle incoming batch replication (on backup node)
|
|
6316
|
+
*/
|
|
6317
|
+
async handleReplicationBatch(sourceNode, payload) {
|
|
6318
|
+
const { operations, opIds } = payload;
|
|
6319
|
+
logger.debug({ sourceNode, count: operations.length }, "Received replication batch");
|
|
6320
|
+
let allSuccess = true;
|
|
6321
|
+
if (this.operationApplier) {
|
|
6322
|
+
for (let i = 0; i < operations.length; i++) {
|
|
6323
|
+
try {
|
|
6324
|
+
const success = await this.operationApplier(operations[i], opIds[i], sourceNode);
|
|
6325
|
+
if (!success) {
|
|
6326
|
+
allSuccess = false;
|
|
6327
|
+
}
|
|
6328
|
+
} catch (error) {
|
|
6329
|
+
logger.error({ sourceNode, opId: opIds[i], error }, "Failed to apply replicated operation in batch");
|
|
6330
|
+
allSuccess = false;
|
|
6331
|
+
}
|
|
6332
|
+
}
|
|
6333
|
+
} else {
|
|
6334
|
+
logger.warn({ sourceNode, count: operations.length }, "No operation applier set, batch not applied");
|
|
6335
|
+
}
|
|
6336
|
+
this.clusterManager.send(sourceNode, "OP_FORWARD", {
|
|
6337
|
+
_replication: {
|
|
6338
|
+
type: "REPLICATION_BATCH_ACK",
|
|
6339
|
+
payload: {
|
|
6340
|
+
opIds,
|
|
6341
|
+
success: allSuccess,
|
|
6342
|
+
timestamp: Date.now()
|
|
6343
|
+
}
|
|
6344
|
+
}
|
|
6345
|
+
});
|
|
6346
|
+
}
|
|
6347
|
+
/**
|
|
6348
|
+
* Handle replication acknowledgment (on owner node)
|
|
6349
|
+
*/
|
|
6350
|
+
handleReplicationAck(sourceNode, payload) {
|
|
6351
|
+
const { opId, success } = payload;
|
|
6352
|
+
this.lagTracker.recordAck(sourceNode);
|
|
6353
|
+
const pending = this.pendingAcks.get(opId);
|
|
4557
6354
|
if (!pending) return;
|
|
4558
|
-
if (
|
|
4559
|
-
|
|
6355
|
+
if (!success) {
|
|
6356
|
+
logger.warn({ sourceNode, opId }, "Replication rejected by backup");
|
|
6357
|
+
return;
|
|
6358
|
+
}
|
|
6359
|
+
pending.ackedNodes.add(sourceNode);
|
|
6360
|
+
const lag = Date.now() - pending.startTime;
|
|
6361
|
+
this.lagTracker.update(sourceNode, lag);
|
|
6362
|
+
const ackedCount = pending.ackedNodes.size;
|
|
6363
|
+
const targetCount = pending.targetNodes.length;
|
|
6364
|
+
switch (pending.consistency) {
|
|
6365
|
+
case import_core9.ConsistencyLevel.STRONG:
|
|
6366
|
+
if (ackedCount === targetCount) {
|
|
6367
|
+
clearTimeout(pending.timeout);
|
|
6368
|
+
this.pendingAcks.delete(opId);
|
|
6369
|
+
pending.resolve();
|
|
6370
|
+
this.emit("replicationComplete", opId, [this.nodeId, ...pending.ackedNodes]);
|
|
6371
|
+
}
|
|
6372
|
+
break;
|
|
6373
|
+
case import_core9.ConsistencyLevel.QUORUM:
|
|
6374
|
+
const quorumSize = Math.floor(targetCount / 2) + 1;
|
|
6375
|
+
if (ackedCount >= quorumSize) {
|
|
6376
|
+
clearTimeout(pending.timeout);
|
|
6377
|
+
this.pendingAcks.delete(opId);
|
|
6378
|
+
pending.resolve();
|
|
6379
|
+
this.emit("replicationComplete", opId, [this.nodeId, ...pending.ackedNodes]);
|
|
6380
|
+
}
|
|
6381
|
+
break;
|
|
4560
6382
|
}
|
|
4561
|
-
const latencyMs = Date.now() - pending.timestamp;
|
|
4562
|
-
const highestAchieved = (0, import_core6.getHighestWriteConcernLevel)(pending.achievedLevels);
|
|
4563
|
-
const result = {
|
|
4564
|
-
success: false,
|
|
4565
|
-
opId,
|
|
4566
|
-
achievedLevel: highestAchieved,
|
|
4567
|
-
latencyMs,
|
|
4568
|
-
error
|
|
4569
|
-
};
|
|
4570
|
-
pending.resolve(result);
|
|
4571
|
-
this.pending.delete(opId);
|
|
4572
|
-
logger.error({ opId, error, latencyMs }, "Write failed");
|
|
4573
|
-
this.emit("failed", result);
|
|
4574
6383
|
}
|
|
4575
6384
|
/**
|
|
4576
|
-
*
|
|
6385
|
+
* Handle batch acknowledgment (on owner node)
|
|
4577
6386
|
*/
|
|
4578
|
-
|
|
4579
|
-
const
|
|
4580
|
-
|
|
4581
|
-
|
|
4582
|
-
|
|
4583
|
-
[import_core6.WriteConcern.REPLICATED]: 0,
|
|
4584
|
-
[import_core6.WriteConcern.PERSISTED]: 0
|
|
4585
|
-
};
|
|
4586
|
-
for (const pending of this.pending.values()) {
|
|
4587
|
-
byLevel[pending.writeConcern]++;
|
|
6387
|
+
handleReplicationBatchAck(sourceNode, payload) {
|
|
6388
|
+
const { success } = payload;
|
|
6389
|
+
this.lagTracker.recordAck(sourceNode);
|
|
6390
|
+
if (!success) {
|
|
6391
|
+
logger.warn({ sourceNode, count: payload.opIds.length }, "Batch replication rejected");
|
|
4588
6392
|
}
|
|
4589
|
-
return { pending: this.pending.size, byLevel };
|
|
4590
6393
|
}
|
|
6394
|
+
// ============================================
|
|
6395
|
+
// Status and Metrics
|
|
6396
|
+
// ============================================
|
|
4591
6397
|
/**
|
|
4592
|
-
* Get
|
|
6398
|
+
* Get replication lag for a specific node
|
|
4593
6399
|
*/
|
|
4594
|
-
|
|
4595
|
-
return
|
|
6400
|
+
getLag(nodeId) {
|
|
6401
|
+
return this.lagTracker.getLag(nodeId);
|
|
4596
6402
|
}
|
|
4597
6403
|
/**
|
|
4598
|
-
*
|
|
4599
|
-
* Rejects all pending promises with an error.
|
|
6404
|
+
* Get overall replication health
|
|
4600
6405
|
*/
|
|
4601
|
-
|
|
4602
|
-
|
|
4603
|
-
|
|
4604
|
-
|
|
4605
|
-
|
|
4606
|
-
|
|
4607
|
-
|
|
4608
|
-
|
|
4609
|
-
|
|
4610
|
-
|
|
4611
|
-
|
|
6406
|
+
getHealth() {
|
|
6407
|
+
return this.lagTracker.getHealth();
|
|
6408
|
+
}
|
|
6409
|
+
/**
|
|
6410
|
+
* Get queue size for a specific node
|
|
6411
|
+
*/
|
|
6412
|
+
getQueueSize(nodeId) {
|
|
6413
|
+
return this.replicationQueue.get(nodeId)?.length ?? 0;
|
|
6414
|
+
}
|
|
6415
|
+
/**
|
|
6416
|
+
* Get total pending operations across all nodes
|
|
6417
|
+
*/
|
|
6418
|
+
getTotalPending() {
|
|
6419
|
+
let total = 0;
|
|
6420
|
+
for (const queue of this.replicationQueue.values()) {
|
|
6421
|
+
total += queue.length;
|
|
4612
6422
|
}
|
|
6423
|
+
return total + this.pendingAcks.size;
|
|
4613
6424
|
}
|
|
4614
6425
|
/**
|
|
4615
|
-
*
|
|
6426
|
+
* Check if a node is considered synced (low lag)
|
|
4616
6427
|
*/
|
|
4617
|
-
|
|
4618
|
-
const
|
|
4619
|
-
|
|
4620
|
-
|
|
4621
|
-
|
|
4622
|
-
|
|
4623
|
-
|
|
4624
|
-
|
|
4625
|
-
|
|
4626
|
-
|
|
4627
|
-
|
|
4628
|
-
|
|
4629
|
-
|
|
4630
|
-
|
|
4631
|
-
|
|
4632
|
-
|
|
6428
|
+
isSynced(nodeId, maxLagMs = 1e3) {
|
|
6429
|
+
const lag = this.lagTracker.getLag(nodeId);
|
|
6430
|
+
return lag.current < maxLagMs;
|
|
6431
|
+
}
|
|
6432
|
+
/**
|
|
6433
|
+
* Get LagTracker for advanced monitoring
|
|
6434
|
+
*/
|
|
6435
|
+
getLagTracker() {
|
|
6436
|
+
return this.lagTracker;
|
|
6437
|
+
}
|
|
6438
|
+
/**
|
|
6439
|
+
* Export metrics in Prometheus format
|
|
6440
|
+
*/
|
|
6441
|
+
toPrometheusMetrics() {
|
|
6442
|
+
const lines = [];
|
|
6443
|
+
lines.push("# HELP topgun_replication_queue_size Pending operations in replication queue");
|
|
6444
|
+
lines.push("# TYPE topgun_replication_queue_size gauge");
|
|
6445
|
+
for (const [nodeId, queue] of this.replicationQueue) {
|
|
6446
|
+
lines.push(`topgun_replication_queue_size{node="${nodeId}"} ${queue.length}`);
|
|
4633
6447
|
}
|
|
4634
|
-
|
|
4635
|
-
|
|
4636
|
-
|
|
6448
|
+
lines.push("");
|
|
6449
|
+
lines.push("# HELP topgun_replication_pending_acks Pending synchronous acknowledgments");
|
|
6450
|
+
lines.push("# TYPE topgun_replication_pending_acks gauge");
|
|
6451
|
+
lines.push(`topgun_replication_pending_acks ${this.pendingAcks.size}`);
|
|
6452
|
+
lines.push("");
|
|
6453
|
+
lines.push(this.lagTracker.toPrometheusMetrics());
|
|
6454
|
+
return lines.join("\n");
|
|
6455
|
+
}
|
|
6456
|
+
/**
|
|
6457
|
+
* Cleanup resources
|
|
6458
|
+
*/
|
|
6459
|
+
close() {
|
|
6460
|
+
this.stopQueueProcessor();
|
|
6461
|
+
for (const [opId, pending] of this.pendingAcks) {
|
|
6462
|
+
clearTimeout(pending.timeout);
|
|
6463
|
+
pending.reject(new Error("ReplicationPipeline closed"));
|
|
4637
6464
|
}
|
|
6465
|
+
this.pendingAcks.clear();
|
|
6466
|
+
this.replicationQueue.clear();
|
|
6467
|
+
this.lagTracker.clear();
|
|
4638
6468
|
}
|
|
4639
6469
|
};
|
|
4640
6470
|
|
|
@@ -4662,7 +6492,7 @@ var ServerCoordinator = class {
     this._readyPromise = new Promise((resolve) => {
       this._readyResolve = resolve;
     });
-    this.hlc = new
+    this.hlc = new import_core10.HLC(config.nodeId);
     this.storage = config.storage;
     const rawSecret = config.jwtSecret || process.env.JWT_SECRET || "topgun-secret-dev";
     this.jwtSecret = rawSecret.replace(/\\n/g, "\n");
@@ -4799,6 +6629,22 @@ var ServerCoordinator = class {
       tls: config.clusterTls
     });
     this.partitionService = new PartitionService(this.cluster);
+    if (config.replicationEnabled !== false) {
+      this.replicationPipeline = new ReplicationPipeline(
+        this.cluster,
+        this.partitionService,
+        {
+          ...import_core10.DEFAULT_REPLICATION_CONFIG,
+          defaultConsistency: config.defaultConsistency ?? import_core10.ConsistencyLevel.EVENTUAL,
+          ...config.replicationConfig
+        }
+      );
+      this.replicationPipeline.setOperationApplier(this.applyReplicatedOperation.bind(this));
+      logger.info({ nodeId: config.nodeId }, "ReplicationPipeline initialized");
+    }
+    this.partitionService.on("rebalanced", (partitionMap, changes) => {
+      this.broadcastPartitionMap(partitionMap);
+    });
     this.lockManager = new LockManager();
     this.lockManager.on("lockGranted", (evt) => this.handleLockGranted(evt));
     this.topicManager = new TopicManager({
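Replication is enabled unless the server is configured with `replicationEnabled: false`, and the consistency level falls back to EVENTUAL when `defaultConsistency` is unset. A minimal sketch of the replication-related server options read above; the rest of the ServerCoordinator config (ports, storage, auth, ...) is omitted, and the exact option names beyond those shown in this diff are assumptions:

```ts
import { ConsistencyLevel } from "@topgunbuild/core";

const serverConfig = {
  nodeId: "node-1",
  replicationEnabled: true,                    // default: enabled unless explicitly false
  defaultConsistency: ConsistencyLevel.QUORUM, // falls back to EVENTUAL when unset
  replicationConfig: {
    // merged over DEFAULT_REPLICATION_CONFIG; ackTimeoutMs is one of the
    // fields the ReplicationPipeline code reads (see this.config.ackTimeoutMs)
    ackTimeoutMs: 3000
  }
};
```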
@@ -4915,7 +6761,7 @@ var ServerCoordinator = class {
|
|
|
4915
6761
|
this.metricsService.destroy();
|
|
4916
6762
|
this.wss.close();
|
|
4917
6763
|
logger.info(`Closing ${this.clients.size} client connections...`);
|
|
4918
|
-
const shutdownMsg = (0,
|
|
6764
|
+
const shutdownMsg = (0, import_core10.serialize)({ type: "SHUTDOWN_PENDING", retryAfter: 5e3 });
|
|
4919
6765
|
for (const client of this.clients.values()) {
|
|
4920
6766
|
try {
|
|
4921
6767
|
if (client.socket.readyState === import_ws3.WebSocket.OPEN) {
|
|
@@ -4937,6 +6783,9 @@ var ServerCoordinator = class {
       await this.workerPool.shutdown(5e3);
       logger.info("Worker pool shutdown complete.");
     }
+    if (this.replicationPipeline) {
+      this.replicationPipeline.close();
+    }
     if (this.cluster) {
       this.cluster.stop();
     }
@@ -5033,7 +6882,7 @@ var ServerCoordinator = class {
|
|
|
5033
6882
|
buf = Buffer.from(message);
|
|
5034
6883
|
}
|
|
5035
6884
|
try {
|
|
5036
|
-
data = (0,
|
|
6885
|
+
data = (0, import_core10.deserialize)(buf);
|
|
5037
6886
|
} catch (e) {
|
|
5038
6887
|
try {
|
|
5039
6888
|
const text = Buffer.isBuffer(buf) ? buf.toString() : new TextDecoder().decode(buf);
|
|
@@ -5084,10 +6933,10 @@ var ServerCoordinator = class {
|
|
|
5084
6933
|
this.clients.delete(clientId);
|
|
5085
6934
|
this.metricsService.setConnectedClients(this.clients.size);
|
|
5086
6935
|
});
|
|
5087
|
-
ws.send((0,
|
|
6936
|
+
ws.send((0, import_core10.serialize)({ type: "AUTH_REQUIRED" }));
|
|
5088
6937
|
}
|
|
5089
6938
|
async handleMessage(client, rawMessage) {
|
|
5090
|
-
const parseResult =
|
|
6939
|
+
const parseResult = import_core10.MessageSchema.safeParse(rawMessage);
|
|
5091
6940
|
if (!parseResult.success) {
|
|
5092
6941
|
logger.error({ clientId: client.id, error: parseResult.error }, "Invalid message format from client");
|
|
5093
6942
|
client.writer.write({
|
|
@@ -5327,7 +7176,7 @@ var ServerCoordinator = class {
|
|
|
5327
7176
|
this.metricsService.incOp("GET", message.mapName);
|
|
5328
7177
|
try {
|
|
5329
7178
|
const mapForSync = await this.getMapAsync(message.mapName);
|
|
5330
|
-
if (mapForSync instanceof
|
|
7179
|
+
if (mapForSync instanceof import_core10.LWWMap) {
|
|
5331
7180
|
const tree = mapForSync.getMerkleTree();
|
|
5332
7181
|
const rootHash = tree.getRootHash();
|
|
5333
7182
|
client.writer.write({
|
|
@@ -5365,7 +7214,7 @@ var ServerCoordinator = class {
|
|
|
5365
7214
|
const { mapName, path } = message.payload;
|
|
5366
7215
|
try {
|
|
5367
7216
|
const mapForBucket = await this.getMapAsync(mapName);
|
|
5368
|
-
if (mapForBucket instanceof
|
|
7217
|
+
if (mapForBucket instanceof import_core10.LWWMap) {
|
|
5369
7218
|
const treeForBucket = mapForBucket.getMerkleTree();
|
|
5370
7219
|
const buckets = treeForBucket.getBuckets(path);
|
|
5371
7220
|
const node = treeForBucket.getNode(path);
|
|
@@ -5494,6 +7343,23 @@ var ServerCoordinator = class {
         }
         break;
       }
+      // ============ Phase 4: Partition Map Request Handler ============
+      case "PARTITION_MAP_REQUEST": {
+        const clientVersion = message.payload?.currentVersion ?? 0;
+        const currentMap = this.partitionService.getPartitionMap();
+        if (clientVersion < currentMap.version) {
+          client.writer.write({
+            type: "PARTITION_MAP",
+            payload: currentMap
+          });
+          logger.debug({
+            clientId: client.id,
+            clientVersion,
+            serverVersion: currentMap.version
+          }, "Sent partition map to client");
+        }
+        break;
+      }
       // ============ ORMap Sync Message Handlers ============
       case "ORMAP_SYNC_INIT": {
         if (!this.securityManager.checkPermission(client.principal, message.mapName, "READ")) {
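The handler above only replies when the client's partition-map version is stale. A sketch of the message shapes as read from that handler; how a client library actually sends the request is an assumption for illustration:

```ts
// Client -> server: ask for a newer partition map than the one cached locally.
const request = {
  type: "PARTITION_MAP_REQUEST",
  payload: { currentVersion: 3 } // omit or send 0 to force a refresh
};

// Server -> client, sent only when the server's map is newer:
// { type: "PARTITION_MAP", payload: { version: number, /* partition assignments */ } }
```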
@@ -5517,7 +7383,7 @@ var ServerCoordinator = class {
|
|
|
5517
7383
|
this.metricsService.incOp("GET", message.mapName);
|
|
5518
7384
|
try {
|
|
5519
7385
|
const mapForSync = await this.getMapAsync(message.mapName, "OR");
|
|
5520
|
-
if (mapForSync instanceof
|
|
7386
|
+
if (mapForSync instanceof import_core10.ORMap) {
|
|
5521
7387
|
const tree = mapForSync.getMerkleTree();
|
|
5522
7388
|
const rootHash = tree.getRootHash();
|
|
5523
7389
|
client.writer.write({
|
|
@@ -5554,7 +7420,7 @@ var ServerCoordinator = class {
|
|
|
5554
7420
|
const { mapName, path } = message.payload;
|
|
5555
7421
|
try {
|
|
5556
7422
|
const mapForBucket = await this.getMapAsync(mapName, "OR");
|
|
5557
|
-
if (mapForBucket instanceof
|
|
7423
|
+
if (mapForBucket instanceof import_core10.ORMap) {
|
|
5558
7424
|
const tree = mapForBucket.getMerkleTree();
|
|
5559
7425
|
const buckets = tree.getBuckets(path);
|
|
5560
7426
|
const isLeaf = tree.isLeaf(path);
|
|
@@ -5598,7 +7464,7 @@ var ServerCoordinator = class {
|
|
|
5598
7464
|
const { mapName: diffMapName, keys } = message.payload;
|
|
5599
7465
|
try {
|
|
5600
7466
|
const mapForDiff = await this.getMapAsync(diffMapName, "OR");
|
|
5601
|
-
if (mapForDiff instanceof
|
|
7467
|
+
if (mapForDiff instanceof import_core10.ORMap) {
|
|
5602
7468
|
const entries = [];
|
|
5603
7469
|
const allTombstones = mapForDiff.getTombstones();
|
|
5604
7470
|
for (const key of keys) {
|
|
@@ -5630,7 +7496,7 @@ var ServerCoordinator = class {
|
|
|
5630
7496
|
const { mapName: pushMapName, entries: pushEntries } = message.payload;
|
|
5631
7497
|
try {
|
|
5632
7498
|
const mapForPush = await this.getMapAsync(pushMapName, "OR");
|
|
5633
|
-
if (mapForPush instanceof
|
|
7499
|
+
if (mapForPush instanceof import_core10.ORMap) {
|
|
5634
7500
|
let totalAdded = 0;
|
|
5635
7501
|
let totalUpdated = 0;
|
|
5636
7502
|
for (const entry of pushEntries) {
|
|
@@ -5685,7 +7551,7 @@ var ServerCoordinator = class {
|
|
|
5685
7551
|
} else if (op.orRecord && op.orRecord.timestamp) {
|
|
5686
7552
|
} else if (op.orTag) {
|
|
5687
7553
|
try {
|
|
5688
|
-
ts =
|
|
7554
|
+
ts = import_core10.HLC.parse(op.orTag);
|
|
5689
7555
|
} catch (e) {
|
|
5690
7556
|
}
|
|
5691
7557
|
}
|
|
@@ -5697,6 +7563,28 @@ var ServerCoordinator = class {
       client.lastActiveHlc = this.hlc.now();
     }
   }
+  // ============ Phase 4: Partition Map Broadcast ============
+  /**
+   * Broadcast partition map to all connected and authenticated clients.
+   * Called when partition topology changes (node join/leave/failover).
+   */
+  broadcastPartitionMap(partitionMap) {
+    const message = {
+      type: "PARTITION_MAP",
+      payload: partitionMap
+    };
+    let broadcastCount = 0;
+    for (const client of this.clients.values()) {
+      if (client.isAuthenticated && client.socket.readyState === import_ws3.WebSocket.OPEN) {
+        client.writer.write(message);
+        broadcastCount++;
+      }
+    }
+    logger.info({
+      version: partitionMap.version,
+      clientCount: broadcastCount
+    }, "Broadcast partition map to clients");
+  }
   broadcast(message, excludeClientId) {
     const isServerEvent = message.type === "SERVER_EVENT";
     if (isServerEvent) {
@@ -5727,7 +7615,7 @@ var ServerCoordinator = class {
|
|
|
5727
7615
|
client.writer.write({ ...message, payload: newPayload });
|
|
5728
7616
|
}
|
|
5729
7617
|
} else {
|
|
5730
|
-
const msgData = (0,
|
|
7618
|
+
const msgData = (0, import_core10.serialize)(message);
|
|
5731
7619
|
for (const [id, client] of this.clients) {
|
|
5732
7620
|
if (id !== excludeClientId && client.socket.readyState === 1) {
|
|
5733
7621
|
client.writer.writeRaw(msgData);
|
|
@@ -5805,7 +7693,7 @@ var ServerCoordinator = class {
|
|
|
5805
7693
|
payload: { events: filteredEvents },
|
|
5806
7694
|
timestamp: this.hlc.now()
|
|
5807
7695
|
};
|
|
5808
|
-
const serializedBatch = (0,
|
|
7696
|
+
const serializedBatch = (0, import_core10.serialize)(batchMessage);
|
|
5809
7697
|
for (const client of clients) {
|
|
5810
7698
|
try {
|
|
5811
7699
|
client.writer.writeRaw(serializedBatch);
|
|
@@ -5890,7 +7778,7 @@ var ServerCoordinator = class {
|
|
|
5890
7778
|
payload: { events: filteredEvents },
|
|
5891
7779
|
timestamp: this.hlc.now()
|
|
5892
7780
|
};
|
|
5893
|
-
const serializedBatch = (0,
|
|
7781
|
+
const serializedBatch = (0, import_core10.serialize)(batchMessage);
|
|
5894
7782
|
for (const client of clients) {
|
|
5895
7783
|
sendPromises.push(new Promise((resolve, reject) => {
|
|
5896
7784
|
try {
|
|
@@ -6034,14 +7922,14 @@ var ServerCoordinator = class {
|
|
|
6034
7922
|
async executeLocalQuery(mapName, query) {
|
|
6035
7923
|
const map = await this.getMapAsync(mapName);
|
|
6036
7924
|
const records = /* @__PURE__ */ new Map();
|
|
6037
|
-
if (map instanceof
|
|
7925
|
+
if (map instanceof import_core10.LWWMap) {
|
|
6038
7926
|
for (const key of map.allKeys()) {
|
|
6039
7927
|
const rec = map.getRecord(key);
|
|
6040
7928
|
if (rec && rec.value !== null) {
|
|
6041
7929
|
records.set(key, rec);
|
|
6042
7930
|
}
|
|
6043
7931
|
}
|
|
6044
|
-
} else if (map instanceof
|
|
7932
|
+
} else if (map instanceof import_core10.ORMap) {
|
|
6045
7933
|
const items = map.items;
|
|
6046
7934
|
for (const key of items.keys()) {
|
|
6047
7935
|
const values = map.get(key);
|
|
@@ -6111,11 +7999,11 @@ var ServerCoordinator = class {
|
|
|
6111
7999
|
applyOpToMap(op) {
|
|
6112
8000
|
const typeHint = op.opType === "OR_ADD" || op.opType === "OR_REMOVE" ? "OR" : "LWW";
|
|
6113
8001
|
const map = this.getMap(op.mapName, typeHint);
|
|
6114
|
-
if (typeHint === "OR" && map instanceof
|
|
8002
|
+
if (typeHint === "OR" && map instanceof import_core10.LWWMap) {
|
|
6115
8003
|
logger.error({ mapName: op.mapName }, "Map type mismatch: LWWMap but received OR op");
|
|
6116
8004
|
throw new Error("Map type mismatch: LWWMap but received OR op");
|
|
6117
8005
|
}
|
|
6118
|
-
if (typeHint === "LWW" && map instanceof
|
|
8006
|
+
if (typeHint === "LWW" && map instanceof import_core10.ORMap) {
|
|
6119
8007
|
logger.error({ mapName: op.mapName }, "Map type mismatch: ORMap but received LWW op");
|
|
6120
8008
|
throw new Error("Map type mismatch: ORMap but received LWW op");
|
|
6121
8009
|
}
|
|
@@ -6126,13 +8014,13 @@ var ServerCoordinator = class {
|
|
|
6126
8014
|
mapName: op.mapName,
|
|
6127
8015
|
key: op.key
|
|
6128
8016
|
};
|
|
6129
|
-
if (map instanceof
|
|
8017
|
+
if (map instanceof import_core10.LWWMap) {
|
|
6130
8018
|
oldRecord = map.getRecord(op.key);
|
|
6131
8019
|
map.merge(op.key, op.record);
|
|
6132
8020
|
recordToStore = op.record;
|
|
6133
8021
|
eventPayload.eventType = "UPDATED";
|
|
6134
8022
|
eventPayload.record = op.record;
|
|
6135
|
-
} else if (map instanceof
|
|
8023
|
+
} else if (map instanceof import_core10.ORMap) {
|
|
6136
8024
|
oldRecord = map.getRecords(op.key);
|
|
6137
8025
|
if (op.opType === "OR_ADD") {
|
|
6138
8026
|
map.apply(op.key, op.orRecord);
|
|
@@ -6148,7 +8036,7 @@ var ServerCoordinator = class {
|
|
|
6148
8036
|
}
|
|
6149
8037
|
}
|
|
6150
8038
|
this.queryRegistry.processChange(op.mapName, map, op.key, op.record || op.orRecord, oldRecord);
|
|
6151
|
-
const mapSize = map instanceof
|
|
8039
|
+
const mapSize = map instanceof import_core10.ORMap ? map.totalRecords : map.size;
|
|
6152
8040
|
this.metricsService.setMapSize(op.mapName, mapSize);
|
|
6153
8041
|
if (this.storage) {
|
|
6154
8042
|
if (recordToStore) {
|
|
@@ -6175,6 +8063,26 @@ var ServerCoordinator = class {
       }
     }
   }
+  /**
+   * Apply replicated operation from another node (callback for ReplicationPipeline)
+   * This is called when we receive a replicated operation as a backup node
+   */
+  async applyReplicatedOperation(operation, opId, sourceNode) {
+    try {
+      const op = operation;
+      logger.debug({ sourceNode, opId, mapName: op.mapName, key: op.key }, "Applying replicated operation");
+      const { eventPayload } = this.applyOpToMap(op);
+      this.broadcast({
+        type: "SERVER_EVENT",
+        payload: eventPayload,
+        timestamp: this.hlc.now()
+      });
+      return true;
+    } catch (error) {
+      logger.error({ sourceNode, opId, error }, "Failed to apply replicated operation");
+      return false;
+    }
+  }
   /**
    * Build OpContext for interceptors.
    */
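The pipeline stays storage-agnostic by delegating application of incoming replicas to this callback: the backup node returns true or false, and that boolean drives the REPLICATION_ACK the owner waits on. A hedged sketch of registering a custom applier; `pipeline` and `applyToLocalState` are hypothetical names and the types are simplified:

```ts
// ReplicationPipeline calls the applier for every replicated operation
// it receives from an owner node.
pipeline.setOperationApplier(async (operation: unknown, opId: string, sourceNode: string) => {
  try {
    await applyToLocalState(operation); // hypothetical local apply step
    return true;                        // acked back to the owner as success
  } catch (err) {
    console.error(`replica apply failed for ${opId} from ${sourceNode}`, err);
    return false;                       // owner logs "Replication rejected by backup"
  }
});
```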
@@ -6263,6 +8171,12 @@ var ServerCoordinator = class {
       throw err;
     }
     const { eventPayload } = this.applyOpToMap(op);
+    if (this.replicationPipeline && !fromCluster) {
+      const opId = op.id || `${op.mapName}:${op.key}:${Date.now()}`;
+      this.replicationPipeline.replicate(op, opId, op.key).catch((err) => {
+        logger.warn({ opId, key: op.key, err }, "Replication failed (non-fatal)");
+      });
+    }
     this.broadcast({
       type: "SERVER_EVENT",
       payload: eventPayload,
@@ -6385,6 +8299,12 @@ var ServerCoordinator = class {
       throw err;
     }
     const { eventPayload } = this.applyOpToMap(op);
+    if (this.replicationPipeline) {
+      const opId = op.id || `${op.mapName}:${op.key}:${Date.now()}`;
+      this.replicationPipeline.replicate(op, opId, op.key).catch((err) => {
+        logger.warn({ opId, key: op.key, err }, "Batch replication failed (non-fatal)");
+      });
+    }
     batchedEvents.push(eventPayload);
     this.broadcastToCluster(eventPayload);
     this.runAfterInterceptors(op, context);
@@ -6392,11 +8312,11 @@ var ServerCoordinator = class {
|
|
|
6392
8312
|
handleClusterEvent(payload) {
|
|
6393
8313
|
const { mapName, key, eventType } = payload;
|
|
6394
8314
|
const map = this.getMap(mapName, eventType === "OR_ADD" || eventType === "OR_REMOVE" ? "OR" : "LWW");
|
|
6395
|
-
const oldRecord = map instanceof
|
|
8315
|
+
const oldRecord = map instanceof import_core10.LWWMap ? map.getRecord(key) : null;
|
|
6396
8316
|
if (this.partitionService.isRelated(key)) {
|
|
6397
|
-
if (map instanceof
|
|
8317
|
+
if (map instanceof import_core10.LWWMap && payload.record) {
|
|
6398
8318
|
map.merge(key, payload.record);
|
|
6399
|
-
} else if (map instanceof
|
|
8319
|
+
} else if (map instanceof import_core10.ORMap) {
|
|
6400
8320
|
if (eventType === "OR_ADD" && payload.orRecord) {
|
|
6401
8321
|
map.apply(key, payload.orRecord);
|
|
6402
8322
|
} else if (eventType === "OR_REMOVE" && payload.orTag) {
|
|
@@ -6415,9 +8335,9 @@ var ServerCoordinator = class {
|
|
|
6415
8335
|
if (!this.maps.has(name)) {
|
|
6416
8336
|
let map;
|
|
6417
8337
|
if (typeHint === "OR") {
|
|
6418
|
-
map = new
|
|
8338
|
+
map = new import_core10.ORMap(this.hlc);
|
|
6419
8339
|
} else {
|
|
6420
|
-
map = new
|
|
8340
|
+
map = new import_core10.LWWMap(this.hlc);
|
|
6421
8341
|
}
|
|
6422
8342
|
this.maps.set(name, map);
|
|
6423
8343
|
if (this.storage) {
|
|
@@ -6440,7 +8360,7 @@ var ServerCoordinator = class {
|
|
|
6440
8360
|
this.getMap(name, typeHint);
|
|
6441
8361
|
const loadingPromise = this.mapLoadingPromises.get(name);
|
|
6442
8362
|
const map = this.maps.get(name);
|
|
6443
|
-
const mapSize = map instanceof
|
|
8363
|
+
const mapSize = map instanceof import_core10.LWWMap ? Array.from(map.entries()).length : map instanceof import_core10.ORMap ? map.size : 0;
|
|
6444
8364
|
logger.info({
|
|
6445
8365
|
mapName: name,
|
|
6446
8366
|
mapExisted,
|
|
@@ -6450,7 +8370,7 @@ var ServerCoordinator = class {
|
|
|
6450
8370
|
if (loadingPromise) {
|
|
6451
8371
|
logger.info({ mapName: name }, "[getMapAsync] Waiting for loadMapFromStorage...");
|
|
6452
8372
|
await loadingPromise;
|
|
6453
|
-
const newMapSize = map instanceof
|
|
8373
|
+
const newMapSize = map instanceof import_core10.LWWMap ? Array.from(map.entries()).length : map instanceof import_core10.ORMap ? map.size : 0;
|
|
6454
8374
|
logger.info({ mapName: name, mapSizeAfterLoad: newMapSize }, "[getMapAsync] Load completed");
|
|
6455
8375
|
}
|
|
6456
8376
|
return this.maps.get(name);
|
|
@@ -6476,16 +8396,16 @@ var ServerCoordinator = class {
 const currentMap = this.maps.get(name);
 if (!currentMap) return;
 let targetMap = currentMap;
-if (isOR && currentMap instanceof
+if (isOR && currentMap instanceof import_core10.LWWMap) {
   logger.info({ mapName: name }, "Map auto-detected as ORMap. Switching type.");
-  targetMap = new
+  targetMap = new import_core10.ORMap(this.hlc);
   this.maps.set(name, targetMap);
-} else if (!isOR && currentMap instanceof
+} else if (!isOR && currentMap instanceof import_core10.ORMap && typeHint !== "OR") {
   logger.info({ mapName: name }, "Map auto-detected as LWWMap. Switching type.");
-  targetMap = new
+  targetMap = new import_core10.LWWMap(this.hlc);
   this.maps.set(name, targetMap);
 }
-if (targetMap instanceof
+if (targetMap instanceof import_core10.ORMap) {
   for (const [key, record] of records) {
     if (key === "__tombstones__") {
       const t = record;
@@ -6498,7 +8418,7 @@ var ServerCoordinator = class {
       }
     }
   }
-} else if (targetMap instanceof
+} else if (targetMap instanceof import_core10.LWWMap) {
   for (const [key, record] of records) {
     if (!record.type) {
       targetMap.merge(key, record);
@@ -6509,7 +8429,7 @@ var ServerCoordinator = class {
   if (count > 0) {
     logger.info({ mapName: name, count }, "Loaded records for map");
     this.queryRegistry.refreshSubscriptions(name, targetMap);
-    const mapSize = targetMap instanceof
+    const mapSize = targetMap instanceof import_core10.ORMap ? targetMap.totalRecords : targetMap.size;
     this.metricsService.setMapSize(name, mapSize);
   }
 } catch (err) {
@@ -6591,7 +8511,7 @@ var ServerCoordinator = class {
 reportLocalHlc() {
   let minHlc = this.hlc.now();
   for (const client of this.clients.values()) {
-    if (
+    if (import_core10.HLC.compare(client.lastActiveHlc, minHlc) < 0) {
       minHlc = client.lastActiveHlc;
     }
   }
@@ -6612,7 +8532,7 @@ var ServerCoordinator = class {
 let globalSafe = this.hlc.now();
 let initialized = false;
 for (const ts of this.gcReports.values()) {
-  if (!initialized ||
+  if (!initialized || import_core10.HLC.compare(ts, globalSafe) < 0) {
     globalSafe = ts;
     initialized = true;
   }
@@ -6647,7 +8567,7 @@ var ServerCoordinator = class {
 logger.info({ olderThanMillis: olderThan.millis }, "Performing Garbage Collection");
 const now = Date.now();
 for (const [name, map] of this.maps) {
-  if (map instanceof
+  if (map instanceof import_core10.LWWMap) {
     for (const key of map.allKeys()) {
       const record = map.getRecord(key);
       if (record && record.value !== null && record.ttlMs) {
@@ -6699,7 +8619,7 @@ var ServerCoordinator = class {
         });
       }
     }
-  } else if (map instanceof
+  } else if (map instanceof import_core10.ORMap) {
     const items = map.items;
     const tombstonesSet = map.tombstones;
     const tagsToExpire = [];
@@ -6802,17 +8722,17 @@ var ServerCoordinator = class {
 stringToWriteConcern(value) {
   switch (value) {
     case "FIRE_AND_FORGET":
-      return
+      return import_core10.WriteConcern.FIRE_AND_FORGET;
     case "MEMORY":
-      return
+      return import_core10.WriteConcern.MEMORY;
     case "APPLIED":
-      return
+      return import_core10.WriteConcern.APPLIED;
     case "REPLICATED":
-      return
+      return import_core10.WriteConcern.REPLICATED;
     case "PERSISTED":
-      return
+      return import_core10.WriteConcern.PERSISTED;
     default:
-      return
+      return import_core10.WriteConcern.MEMORY;
   }
 }
 /**
@@ -6869,7 +8789,7 @@ var ServerCoordinator = class {
       }
     });
     if (op.id) {
-      this.writeAckManager.notifyLevel(op.id,
+      this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.REPLICATED);
     }
   }
 }
@@ -6877,7 +8797,7 @@ var ServerCoordinator = class {
   this.broadcastBatch(batchedEvents, clientId);
   for (const op of ops) {
     if (op.id && this.partitionService.isLocalOwner(op.key)) {
-      this.writeAckManager.notifyLevel(op.id,
+      this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.REPLICATED);
     }
   }
 }
@@ -6905,7 +8825,7 @@ var ServerCoordinator = class {
     const owner = this.partitionService.getOwner(op.key);
     await this.forwardOpAndWait(op, owner);
     if (op.id) {
-      this.writeAckManager.notifyLevel(op.id,
+      this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.REPLICATED);
     }
   }
 }
@@ -6913,7 +8833,7 @@ var ServerCoordinator = class {
   await this.broadcastBatchSync(batchedEvents, clientId);
   for (const op of ops) {
     if (op.id && this.partitionService.isLocalOwner(op.key)) {
-      this.writeAckManager.notifyLevel(op.id,
+      this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.REPLICATED);
     }
   }
 }
@@ -6941,7 +8861,7 @@ var ServerCoordinator = class {
   }
   const { eventPayload } = this.applyOpToMap(op);
   if (op.id) {
-    this.writeAckManager.notifyLevel(op.id,
+    this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.APPLIED);
   }
   if (eventPayload) {
     batchedEvents.push({
@@ -6955,7 +8875,7 @@ var ServerCoordinator = class {
 try {
   await this.persistOpSync(op);
   if (op.id) {
-    this.writeAckManager.notifyLevel(op.id,
+    this.writeAckManager.notifyLevel(op.id, import_core10.WriteConcern.PERSISTED);
   }
 } catch (err) {
   logger.error({ opId: op.id, err }, "Persistence failed");
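
The six hunks above (@@ -6869 through @@ -6955) are the same mechanical change: the `WriteConcern` level passed to `writeAckManager.notifyLevel` now comes from the renumbered `import_core10` binding. Read together they show the acknowledgement ladder a write climbs: APPLIED once the op lands in the local map, REPLICATED once the partition owner and backups have it, PERSISTED once `persistOpSync` succeeds. A compressed sketch of that ladder follows; `WriteAckManager` is a stand-in interface (only `notifyLevel(opId, level)` is taken from the bundle), and treating `WriteConcern` as an enum type is an assumption.

    // Sketch of the write-acknowledgement ladder seen in the hunks above.
    import { WriteConcern } from "@topgunbuild/core";

    interface WriteAckManager {
      notifyLevel(opId: string, level: WriteConcern): void;
    }

    async function acknowledgeWrite(
      op: { id?: string; key: string },
      acks: WriteAckManager,
      applyLocally: () => void,
      replicateToCluster: () => Promise<void>,
      persist: () => Promise<void>
    ): Promise<void> {
      applyLocally();
      if (op.id) acks.notifyLevel(op.id, WriteConcern.APPLIED);      // op is in the local map

      await replicateToCluster();
      if (op.id) acks.notifyLevel(op.id, WriteConcern.REPLICATED);   // owner/backups acknowledged

      try {
        await persist();
        if (op.id) acks.notifyLevel(op.id, WriteConcern.PERSISTED);  // durable in storage
      } catch (err) {
        // The bundle logs persistence failures; further handling is not shown in this hunk.
        console.error("Persistence failed", { opId: op.id, err });
      }
    }
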
@@ -7298,10 +9218,10 @@ var RateLimitInterceptor = class {
 };

 // src/utils/nativeStats.ts
-var
+var import_core11 = require("@topgunbuild/core");
 function getNativeModuleStatus() {
   return {
-    nativeHash: (0,
+    nativeHash: (0, import_core11.isUsingNativeHash)(),
     sharedArrayBuffer: SharedMemoryManager.isAvailable()
   };
 }
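
`getNativeModuleStatus` above reports two runtime capabilities: whether `@topgunbuild/core`'s native hash implementation is active (`isUsingNativeHash`) and whether SharedArrayBuffer-backed shared memory is available. A small sketch of the returned shape and how a caller might surface it; the field names come from the bundle, while the helper around them is illustrative and this diff does not show whether these utilities are re-exported from the package entry point.

    // Sketch: the status object produced by getNativeModuleStatus() in the bundle.
    // describeNativeStatus is an illustrative helper, not package API.
    interface NativeModuleStatus {
      nativeHash: boolean;         // true when the native hash from @topgunbuild/core is in use
      sharedArrayBuffer: boolean;  // true when SharedArrayBuffer-backed memory is available
    }

    function describeNativeStatus(status: NativeModuleStatus): string {
      return [
        `native hash: ${status.nativeHash ? "enabled" : "JS fallback"}`,
        `SharedArrayBuffer: ${status.sharedArrayBuffer ? "available" : "unavailable"}`,
      ].join(", ");
    }
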
@@ -7332,19 +9252,401 @@ function logNativeStatus() {
     ` - SharedArrayBuffer: ${status.sharedArrayBuffer ? "available" : "unavailable"}`
   );
 }
+
+// src/cluster/ClusterCoordinator.ts
+var import_events9 = require("events");
+var import_core12 = require("@topgunbuild/core");
+var DEFAULT_CLUSTER_COORDINATOR_CONFIG = {
+  gradualRebalancing: true,
+  migration: import_core12.DEFAULT_MIGRATION_CONFIG,
+  replication: import_core12.DEFAULT_REPLICATION_CONFIG,
+  replicationEnabled: true
+};
+var ClusterCoordinator = class extends import_events9.EventEmitter {
+  constructor(config) {
+    super();
+    this.replicationPipeline = null;
+    // State
+    this.started = false;
+    this.actualPort = 0;
+    this.config = {
+      ...DEFAULT_CLUSTER_COORDINATOR_CONFIG,
+      ...config
+    };
+    this.clusterManager = new ClusterManager(this.config.cluster);
+    this.lagTracker = new LagTracker();
+    const partitionServiceConfig = {
+      gradualRebalancing: this.config.gradualRebalancing,
+      migration: this.config.migration
+    };
+    this.partitionService = new PartitionService(this.clusterManager, partitionServiceConfig);
+    if (this.config.replicationEnabled) {
+      this.replicationPipeline = new ReplicationPipeline(
+        this.clusterManager,
+        this.partitionService,
+        this.config.replication
+      );
+    }
+    this.setupEventHandlers();
+  }
+  // ============================================
+  // Lifecycle Methods
+  // ============================================
+  /**
+   * Start the cluster coordinator
+   */
+  async start() {
+    if (this.started) {
+      return this.actualPort;
+    }
+    logger.info({ nodeId: this.config.cluster.nodeId }, "Starting ClusterCoordinator");
+    this.actualPort = await this.clusterManager.start();
+    const migrationManager = this.partitionService.getMigrationManager();
+    if (migrationManager && this.config.dataCollector) {
+      migrationManager.setDataCollector(this.config.dataCollector);
+    }
+    if (migrationManager && this.config.dataStorer) {
+      migrationManager.setDataStorer(this.config.dataStorer);
+    }
+    this.started = true;
+    this.emit("started");
+    logger.info({ nodeId: this.config.cluster.nodeId, port: this.actualPort }, "ClusterCoordinator started");
+    return this.actualPort;
+  }
+  /**
+   * Stop the cluster coordinator
+   */
+  async stop() {
+    if (!this.started) return;
+    logger.info({ nodeId: this.config.cluster.nodeId }, "Stopping ClusterCoordinator");
+    await this.partitionService.cancelMigrations();
+    this.replicationPipeline?.close();
+    this.clusterManager.stop();
+    this.started = false;
+    this.emit("stopped");
+    logger.info({ nodeId: this.config.cluster.nodeId }, "ClusterCoordinator stopped");
+  }
+  // ============================================
+  // Cluster Information
+  // ============================================
+  /**
+   * Get local node ID
+   */
+  getNodeId() {
+    return this.config.cluster.nodeId;
+  }
+  /**
+   * Get cluster port
+   */
+  getPort() {
+    return this.actualPort;
+  }
+  /**
+   * Get all cluster members
+   */
+  getMembers() {
+    return this.clusterManager.getMembers();
+  }
+  /**
+   * Check if this is the local node
+   */
+  isLocal(nodeId) {
+    return this.clusterManager.isLocal(nodeId);
+  }
+  /**
+   * Check if coordinator is started
+   */
+  isStarted() {
+    return this.started;
+  }
+  // ============================================
+  // Partition Operations
+  // ============================================
+  /**
+   * Get current partition map
+   */
+  getPartitionMap() {
+    return this.partitionService.getPartitionMap();
+  }
+  /**
+   * Get partition map version
+   */
+  getPartitionMapVersion() {
+    return this.partitionService.getMapVersion();
+  }
+  /**
+   * Get partition ID for a key
+   */
+  getPartitionId(key) {
+    return this.partitionService.getPartitionId(key);
+  }
+  /**
+   * Get owner node for a key
+   */
+  getOwner(key) {
+    return this.partitionService.getOwner(key);
+  }
+  /**
+   * Check if this node owns the key
+   */
+  isLocalOwner(key) {
+    return this.partitionService.isLocalOwner(key);
+  }
+  /**
+   * Check if this node is a backup for the key
+   */
+  isLocalBackup(key) {
+    return this.partitionService.isLocalBackup(key);
+  }
+  /**
+   * Get backup nodes for a partition
+   */
+  getBackups(partitionId) {
+    return this.partitionService.getBackups(partitionId);
+  }
+  /**
+   * Check if partition is currently migrating
+   */
+  isMigrating(partitionId) {
+    return this.partitionService.isMigrating(partitionId);
+  }
+  /**
+   * Check if any rebalancing is in progress
+   */
+  isRebalancing() {
+    return this.partitionService.isRebalancing();
+  }
+  // ============================================
+  // Migration Operations
+  // ============================================
+  /**
+   * Get migration status
+   */
+  getMigrationStatus() {
+    return this.partitionService.getMigrationStatus();
+  }
+  /**
+   * Get migration metrics
+   */
+  getMigrationMetrics() {
+    return this.partitionService.getMigrationManager()?.getMetrics() ?? null;
+  }
+  /**
+   * Cancel all active migrations
+   */
+  async cancelMigrations() {
+    await this.partitionService.cancelMigrations();
+  }
+  /**
+   * Set data collector for migrations
+   */
+  setDataCollector(collector) {
+    const migrationManager = this.partitionService.getMigrationManager();
+    if (migrationManager) {
+      migrationManager.setDataCollector(collector);
+    }
+  }
+  /**
+   * Set data storer for incoming migrations
+   */
+  setDataStorer(storer) {
+    const migrationManager = this.partitionService.getMigrationManager();
+    if (migrationManager) {
+      migrationManager.setDataStorer(storer);
+    }
+  }
+  // ============================================
+  // Replication Operations
+  // ============================================
+  /**
+   * Replicate an operation to backup nodes
+   */
+  async replicate(operation, opId, key, options = {}) {
+    if (!this.replicationPipeline) {
+      return { success: true, ackedBy: [] };
+    }
+    return this.replicationPipeline.replicate(operation, opId, key, options);
+  }
+  /**
+   * Get replication health status
+   */
+  getReplicationHealth() {
+    return this.lagTracker.getHealth();
+  }
+  /**
+   * Get replication lag for a specific node
+   */
+  getReplicationLag(nodeId) {
+    return this.lagTracker.getLag(nodeId);
+  }
+  /**
+   * Check if a node is healthy for replication
+   */
+  isNodeHealthy(nodeId) {
+    return this.lagTracker.isNodeHealthy(nodeId);
+  }
+  /**
+   * Check if a node is laggy
+   */
+  isNodeLaggy(nodeId) {
+    return this.lagTracker.isNodeLaggy(nodeId);
+  }
+  // ============================================
+  // Cluster Communication
+  // ============================================
+  /**
+   * Send message to a specific node
+   */
+  send(nodeId, message) {
+    this.clusterManager.sendToNode(nodeId, message);
+  }
+  /**
+   * Broadcast message to all nodes
+   */
+  broadcast(message) {
+    for (const nodeId of this.clusterManager.getMembers()) {
+      if (!this.clusterManager.isLocal(nodeId)) {
+        this.clusterManager.sendToNode(nodeId, message);
+      }
+    }
+  }
+  // ============================================
+  // Component Access
+  // ============================================
+  /**
+   * Get underlying ClusterManager
+   */
+  getClusterManager() {
+    return this.clusterManager;
+  }
+  /**
+   * Get underlying PartitionService
+   */
+  getPartitionService() {
+    return this.partitionService;
+  }
+  /**
+   * Get underlying ReplicationPipeline
+   */
+  getReplicationPipeline() {
+    return this.replicationPipeline;
+  }
+  /**
+   * Get underlying LagTracker
+   */
+  getLagTracker() {
+    return this.lagTracker;
+  }
+  // ============================================
+  // Metrics Export
+  // ============================================
+  /**
+   * Get all metrics in Prometheus format
+   */
+  getPrometheusMetrics() {
+    const lines = [];
+    lines.push("# HELP topgun_cluster_members Number of cluster members");
+    lines.push("# TYPE topgun_cluster_members gauge");
+    lines.push(`topgun_cluster_members ${this.clusterManager.getMembers().length}`);
+    lines.push("");
+    lines.push("# HELP topgun_cluster_started Cluster started status (1=started, 0=stopped)");
+    lines.push("# TYPE topgun_cluster_started gauge");
+    lines.push(`topgun_cluster_started ${this.started ? 1 : 0}`);
+    lines.push("");
+    lines.push("# HELP topgun_partition_map_version Current partition map version");
+    lines.push("# TYPE topgun_partition_map_version gauge");
+    lines.push(`topgun_partition_map_version ${this.partitionService.getMapVersion()}`);
+    const migrationMetrics = this.getMigrationMetrics();
+    if (migrationMetrics) {
+      lines.push("");
+      lines.push("# HELP topgun_migrations_started Total migrations started");
+      lines.push("# TYPE topgun_migrations_started counter");
+      lines.push(`topgun_migrations_started ${migrationMetrics.migrationsStarted}`);
+      lines.push("");
+      lines.push("# HELP topgun_migrations_completed Total migrations completed");
+      lines.push("# TYPE topgun_migrations_completed counter");
+      lines.push(`topgun_migrations_completed ${migrationMetrics.migrationsCompleted}`);
+      lines.push("");
+      lines.push("# HELP topgun_migrations_failed Total migrations failed");
+      lines.push("# TYPE topgun_migrations_failed counter");
+      lines.push(`topgun_migrations_failed ${migrationMetrics.migrationsFailed}`);
+      lines.push("");
+      lines.push("# HELP topgun_migrations_active Currently active migrations");
+      lines.push("# TYPE topgun_migrations_active gauge");
+      lines.push(`topgun_migrations_active ${migrationMetrics.activeMigrations}`);
+      lines.push("");
+      lines.push("# HELP topgun_migrations_queued Queued migrations");
+      lines.push("# TYPE topgun_migrations_queued gauge");
+      lines.push(`topgun_migrations_queued ${migrationMetrics.queuedMigrations}`);
+    }
+    lines.push("");
+    lines.push(this.lagTracker.toPrometheusMetrics());
+    return lines.join("\n");
+  }
+  // ============================================
+  // Private Methods
+  // ============================================
+  setupEventHandlers() {
+    this.clusterManager.on("memberJoined", (nodeId) => {
+      logger.info({ nodeId }, "Cluster member joined");
+      this.emit("member:joined", nodeId);
+    });
+    this.clusterManager.on("memberLeft", (nodeId) => {
+      logger.info({ nodeId }, "Cluster member left");
+      this.lagTracker.removeNode(nodeId);
+      this.emit("member:left", nodeId);
+    });
+    this.partitionService.on("rebalanced", (map, changes) => {
+      logger.info({ version: map.version, changesCount: changes.length }, "Partition map rebalanced");
+      this.emit("partition:rebalanced", map, changes);
+    });
+    this.partitionService.on("partitionMoved", (info) => {
+      this.emit("partition:moved", info);
+    });
+    const migrationManager = this.partitionService.getMigrationManager();
+    if (migrationManager) {
+      migrationManager.on("migrationStarted", (partitionId, targetNode) => {
+        this.emit("migration:started", partitionId, targetNode);
+      });
+      migrationManager.on("migrationComplete", (partitionId) => {
+        this.emit("migration:completed", partitionId);
+      });
+      migrationManager.on("migrationFailed", (partitionId, error) => {
+        this.emit("migration:failed", partitionId, error);
+      });
+    }
+    if (this.replicationPipeline) {
+      this.replicationPipeline.on("ackReceived", (nodeId) => {
+        this.lagTracker.recordAck(nodeId);
+      });
+      this.replicationPipeline.on("replicationSent", (nodeId) => {
+        this.lagTracker.incrementPending(nodeId);
+      });
+    }
+  }
+};
9628
|
// Annotate the CommonJS export names for ESM import in node:
|
|
7336
9629
|
0 && (module.exports = {
|
|
7337
9630
|
BufferPool,
|
|
9631
|
+
ClusterCoordinator,
|
|
9632
|
+
ClusterManager,
|
|
7338
9633
|
ConnectionRateLimiter,
|
|
9634
|
+
DEFAULT_CLUSTER_COORDINATOR_CONFIG,
|
|
9635
|
+
DEFAULT_LAG_TRACKER_CONFIG,
|
|
7339
9636
|
FilterTasklet,
|
|
7340
9637
|
ForEachTasklet,
|
|
7341
9638
|
IteratorTasklet,
|
|
9639
|
+
LagTracker,
|
|
9640
|
+
LockManager,
|
|
7342
9641
|
MapTasklet,
|
|
7343
9642
|
MemoryServerAdapter,
|
|
9643
|
+
MigrationManager,
|
|
7344
9644
|
ObjectPool,
|
|
9645
|
+
PartitionService,
|
|
7345
9646
|
PostgresAdapter,
|
|
7346
9647
|
RateLimitInterceptor,
|
|
7347
9648
|
ReduceTasklet,
|
|
9649
|
+
ReplicationPipeline,
|
|
7348
9650
|
SecurityManager,
|
|
7349
9651
|
ServerCoordinator,
|
|
7350
9652
|
TaskletScheduler,
|