@topgunbuild/server 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +97 -0
- package/dist/index.d.mts +573 -3
- package/dist/index.d.ts +573 -3
- package/dist/index.js +1786 -262
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1697 -183
- package/dist/index.mjs.map +1 -1
- package/package.json +12 -12
package/dist/index.mjs
CHANGED
@@ -10,7 +10,7 @@ import { createServer as createHttpServer } from "http";
 import { createServer as createHttpsServer } from "https";
 import { readFileSync as readFileSync2 } from "fs";
 import { WebSocketServer as WebSocketServer2, WebSocket as WebSocket3 } from "ws";
-import { HLC as HLC2, LWWMap as LWWMap3, ORMap as ORMap2, serialize as serialize4, deserialize, MessageSchema, WriteConcern as WriteConcern2, ConsistencyLevel as
+import { HLC as HLC2, LWWMap as LWWMap3, ORMap as ORMap2, serialize as serialize4, deserialize, MessageSchema, WriteConcern as WriteConcern2, ConsistencyLevel as ConsistencyLevel3, DEFAULT_REPLICATION_CONFIG as DEFAULT_REPLICATION_CONFIG2, IndexedLWWMap as IndexedLWWMap2, IndexedORMap as IndexedORMap2 } from "@topgunbuild/core";
 import * as jwt from "jsonwebtoken";
 import * as crypto from "crypto";
 
@@ -1126,6 +1126,47 @@ var ClusterManager = class extends EventEmitter2 {
   handleHeartbeat(senderId, _payload) {
     this.failureDetector.recordHeartbeat(senderId);
   }
+  /**
+   * Send current member list to a specific node (gossip protocol).
+   * Called when a new node joins to propagate cluster topology.
+   */
+  sendMemberList(targetNodeId) {
+    const members = [];
+    for (const [nodeId, member] of this.members) {
+      members.push({
+        nodeId,
+        host: member.host,
+        port: member.port
+      });
+    }
+    this.send(targetNodeId, "MEMBER_LIST", { members });
+    logger.debug({ targetNodeId, memberCount: members.length }, "Sent member list");
+  }
+  /**
+   * Broadcast member list to all connected nodes.
+   * Called when cluster membership changes.
+   */
+  broadcastMemberList() {
+    for (const [nodeId, member] of this.members) {
+      if (member.isSelf) continue;
+      if (member.socket && member.socket.readyState === WebSocket.OPEN) {
+        this.sendMemberList(nodeId);
+      }
+    }
+  }
+  /**
+   * Handle incoming member list from a peer (gossip protocol).
+   * Attempts to connect to unknown members.
+   */
+  handleMemberList(payload) {
+    for (const memberInfo of payload.members) {
+      if (memberInfo.nodeId === this.config.nodeId) continue;
+      if (this.members.has(memberInfo.nodeId)) continue;
+      const peerAddress = `${memberInfo.host}:${memberInfo.port}`;
+      logger.info({ nodeId: memberInfo.nodeId, peerAddress }, "Discovered new member via gossip");
+      this.connectToPeer(peerAddress);
+    }
+  }
   /**
    * Handle confirmed node failure.
    */
@@ -1264,6 +1305,9 @@ var ClusterManager = class extends EventEmitter2 {
         this.failureDetector.startMonitoring(remoteNodeId);
         this.startHeartbeat();
         this.emit("memberJoined", remoteNodeId);
+        this.broadcastMemberList();
+      } else if (msg.type === "MEMBER_LIST") {
+        this.handleMemberList(msg.payload);
       } else if (msg.type === "HEARTBEAT") {
         if (remoteNodeId) {
           this.handleHeartbeat(remoteNodeId, msg.payload);
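Taken together with the MEMBER_LIST dispatch above, these two hunks add a simple push-based gossip for cluster topology: a node that observes a join rebroadcasts its full member list, and every receiver dials any peer it has not seen yet. A sketch of the resulting exchange between three nodes (node names are hypothetical; the payload shape is the one built in sendMemberList):

  // a <-> b are already connected; c connects to a only.
  // 1. a handles c's join, emits "memberJoined", then runs broadcastMemberList().
  // 2. b receives:
  //    { type: "MEMBER_LIST", payload: { members: [
  //        { nodeId: "node-a", host: "10.0.0.1", port: 7000 },
  //        { nodeId: "node-c", host: "10.0.0.3", port: 7002 }
  //    ] } }
  // 3. handleMemberList() on b sees "node-c" is unknown and calls
  //    connectToPeer("10.0.0.3:7002"), so the mesh converges without manual config.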
@@ -6565,236 +6609,1436 @@ var ReplicationPipeline = class extends EventEmitter8 {
   }
 };
 
-// src/
-import {
-
-
-
-
-
-
+// src/cluster/PartitionReassigner.ts
+import { EventEmitter as EventEmitter9 } from "events";
+import { DEFAULT_BACKUP_COUNT as DEFAULT_BACKUP_COUNT2 } from "@topgunbuild/core";
+var DEFAULT_REASSIGNER_CONFIG = {
+  reassignmentDelayMs: 1e3,
+  maxConcurrentTransfers: 10,
+  autoPromoteBackups: true,
+  autoAssignNewBackups: true
+};
+var PartitionReassigner = class extends EventEmitter9 {
+  constructor(clusterManager, partitionService, config = {}) {
+    super();
+    this.failoverInProgress = false;
+    this.partitionsReassigned = 0;
+    this.pendingReassignments = /* @__PURE__ */ new Set();
+    this.clusterManager = clusterManager;
+    this.partitionService = partitionService;
+    this.config = { ...DEFAULT_REASSIGNER_CONFIG, ...config };
+    this.setupEventHandlers();
+  }
+  setupEventHandlers() {
+    this.clusterManager.on("nodeConfirmedFailed", (nodeId) => {
+      logger.warn({ nodeId }, "Node failure confirmed, initiating partition reassignment");
+      this.handleNodeFailure(nodeId);
+    });
+    this.clusterManager.on("memberLeft", (nodeId) => {
+      if (this.currentFailedNode !== nodeId) {
+        logger.info({ nodeId }, "Member left cluster, checking partition reassignment");
+        this.handleNodeDeparture(nodeId);
+      }
+    });
   }
   /**
-   *
+   * Handle a node failure - initiates failover process
    */
-
-
-
-
-    this.counters.set(name, counter);
-    logger.debug({ name }, "Created new counter");
+  handleNodeFailure(failedNodeId) {
+    if (this.failoverInProgress && this.currentFailedNode === failedNodeId) {
+      logger.debug({ failedNodeId }, "Failover already in progress for this node");
+      return;
     }
-
+    if (this.reassignmentTimer) {
+      clearTimeout(this.reassignmentTimer);
+    }
+    this.reassignmentTimer = setTimeout(() => {
+      this.executeFailover(failedNodeId);
+    }, this.config.reassignmentDelayMs);
   }
   /**
-   * Handle
-   * @returns Response message to send back to client
+   * Handle a graceful node departure
    */
-
-    const
-
-
-
-
-      type: "COUNTER_RESPONSE",
-      payload: {
-        name,
-        state: this.stateToObject(state)
-      }
-    };
+  handleNodeDeparture(nodeId) {
+    const orphanedPartitions = this.findOrphanedPartitions(nodeId);
+    if (orphanedPartitions.length > 0) {
+      logger.warn({ nodeId, count: orphanedPartitions.length }, "Found orphaned partitions after departure");
+      this.executeFailover(nodeId);
+    }
   }
   /**
-   *
-   * @returns Merged state and list of clients to broadcast to
+   * Execute the failover process for a failed node
    */
-
-
-
-
-
-    logger.
-
-
-
-
-
-
-    return {
-      // Response to the sending client
-      response: {
-        type: "COUNTER_UPDATE",
-        payload: {
-          name,
-          state: mergedStateObj
-        }
-      },
-      // Broadcast to other clients
-      broadcastTo,
-      broadcastMessage: {
-        type: "COUNTER_UPDATE",
-        payload: {
-          name,
-          state: mergedStateObj
-        }
+  async executeFailover(failedNodeId) {
+    this.failoverInProgress = true;
+    this.currentFailedNode = failedNodeId;
+    this.reassignmentStartTime = Date.now();
+    this.partitionsReassigned = 0;
+    this.pendingReassignments.clear();
+    logger.info({ failedNodeId }, "Starting partition failover");
+    try {
+      const orphanedPartitions = this.findOrphanedPartitions(failedNodeId);
+      if (orphanedPartitions.length === 0) {
+        logger.info({ failedNodeId }, "No partitions to reassign");
+        this.completeFailover();
+        return;
       }
-
+      logger.info({
+        failedNodeId,
+        partitionCount: orphanedPartitions.length
+      }, "Reassigning partitions from failed node");
+      for (const partitionId of orphanedPartitions) {
+        this.pendingReassignments.add(partitionId);
+      }
+      const changes = [];
+      for (const partitionId of orphanedPartitions) {
+        const change = await this.reassignPartition(partitionId, failedNodeId);
+        if (change) {
+          changes.push(change);
+          this.partitionsReassigned++;
+        }
+        this.pendingReassignments.delete(partitionId);
+      }
+      if (changes.length > 0) {
+        this.emit("partitionsReassigned", {
+          failedNodeId,
+          changes,
+          partitionMap: this.partitionService.getPartitionMap()
+        });
+      }
+      this.completeFailover();
+    } catch (error) {
+      logger.error({ failedNodeId, error }, "Failover failed");
+      this.emit("failoverError", { failedNodeId, error });
+      this.completeFailover();
+    }
   }
   /**
-   *
+   * Find all partitions that need reassignment
    */
-
-
-
+  findOrphanedPartitions(failedNodeId) {
+    const orphaned = [];
+    const partitionMap = this.partitionService.getPartitionMap();
+    for (const partition of partitionMap.partitions) {
+      if (partition.ownerNodeId === failedNodeId) {
+        orphaned.push(partition.partitionId);
+      }
     }
-
-    logger.debug({ clientId, counterName }, "Client subscribed to counter");
+    return orphaned;
   }
   /**
-   *
+   * Reassign a single partition
    */
-
-    const
-
-
-
-
+  async reassignPartition(partitionId, failedNodeId) {
+    const currentBackups = this.partitionService.getBackups(partitionId);
+    const aliveMembers = this.clusterManager.getMembers().filter((m) => m !== failedNodeId);
+    if (aliveMembers.length === 0) {
+      logger.error({ partitionId }, "No alive members to reassign partition to");
+      return null;
+    }
+    let newOwner = null;
+    if (this.config.autoPromoteBackups) {
+      for (const backup of currentBackups) {
+        if (aliveMembers.includes(backup)) {
+          newOwner = backup;
+          break;
+        }
       }
     }
+    if (!newOwner) {
+      const ownerIndex = partitionId % aliveMembers.length;
+      newOwner = aliveMembers.sort()[ownerIndex];
+    }
+    this.partitionService.setOwner(partitionId, newOwner);
+    logger.info({
+      partitionId,
+      previousOwner: failedNodeId,
+      newOwner
+    }, "Partition owner promoted");
+    this.emit("reassignment", {
+      type: "backup-promoted",
+      partitionId,
+      previousOwner: failedNodeId,
+      newOwner
+    });
+    if (this.config.autoAssignNewBackups) {
+      const newBackups = this.selectBackups(partitionId, newOwner, aliveMembers);
+    }
+    return {
+      partitionId,
+      previousOwner: failedNodeId,
+      newOwner,
+      reason: "FAILOVER"
+    };
   }
   /**
-   *
+   * Select backup nodes for a partition
    */
-
-
-
-
-
-
+  selectBackups(partitionId, owner, aliveMembers) {
+    const backups = [];
+    const sortedMembers = aliveMembers.filter((m) => m !== owner).sort();
+    const startIndex = partitionId % sortedMembers.length;
+    for (let i = 0; i < Math.min(DEFAULT_BACKUP_COUNT2, sortedMembers.length); i++) {
+      const backupIndex = (startIndex + i) % sortedMembers.length;
+      backups.push(sortedMembers[backupIndex]);
     }
-
+    return backups;
   }
   /**
-   *
+   * Complete the failover process
    */
-
-    const
-
+  completeFailover() {
+    const duration = this.reassignmentStartTime ? Date.now() - this.reassignmentStartTime : 0;
+    logger.info({
+      failedNodeId: this.currentFailedNode,
+      partitionsReassigned: this.partitionsReassigned,
+      durationMs: duration
+    }, "Failover completed");
+    this.emit("failoverComplete", {
+      failedNodeId: this.currentFailedNode,
+      partitionsReassigned: this.partitionsReassigned,
+      durationMs: duration
+    });
+    this.failoverInProgress = false;
+    this.currentFailedNode = void 0;
+    this.reassignmentStartTime = void 0;
+    this.pendingReassignments.clear();
   }
   /**
-   * Get
+   * Get current failover status
    */
-
-    return
+  getStatus() {
+    return {
+      inProgress: this.failoverInProgress,
+      failedNodeId: this.currentFailedNode,
+      partitionsReassigned: this.partitionsReassigned,
+      partitionsPending: this.pendingReassignments.size,
+      startedAt: this.reassignmentStartTime,
+      completedAt: this.failoverInProgress ? void 0 : Date.now()
+    };
   }
   /**
-   *
+   * Check if failover is in progress
    */
-
-    return this.
+  isFailoverInProgress() {
+    return this.failoverInProgress;
   }
   /**
-   *
+   * Force immediate reassignment (for testing/manual intervention)
    */
-
-
-
-
-
+  forceReassignment(failedNodeId) {
+    if (this.reassignmentTimer) {
+      clearTimeout(this.reassignmentTimer);
+    }
+    this.executeFailover(failedNodeId);
   }
   /**
-   *
+   * Stop any pending reassignment
    */
-
-
-
-
-  }
+  stop() {
+    if (this.reassignmentTimer) {
+      clearTimeout(this.reassignmentTimer);
+      this.reassignmentTimer = void 0;
+    }
+    this.failoverInProgress = false;
+    this.pendingReassignments.clear();
   }
 };
 
-// src/
-import {
-
-
-
-
-
-
-} from "@topgunbuild/core";
-var ivm = null;
-try {
-  ivm = __require("isolated-vm");
-} catch {
-  const isProduction = process.env.NODE_ENV === "production";
-  if (isProduction) {
-    logger.error(
-      "SECURITY WARNING: isolated-vm not available in production! Entry processors will run in less secure fallback mode. Install isolated-vm for production environments: pnpm add isolated-vm"
-    );
-  } else {
-    logger.warn("isolated-vm not available, falling back to less secure VM");
-  }
-}
-var DEFAULT_SANDBOX_CONFIG = {
-  memoryLimitMb: 8,
-  timeoutMs: 100,
-  maxCachedIsolates: 100,
-  strictValidation: true
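PartitionReassigner is driven by ClusterManager events rather than being called directly. A minimal wiring sketch, using the constructor arguments and event names from the bundled code above (the clusterManager and partitionService objects are assumed to exist):

  const reassigner = new PartitionReassigner(clusterManager, partitionService, {
    reassignmentDelayMs: 1e3  // grace period before failover, in case the node returns
  });
  reassigner.on("reassignment", (change) => {
    // change = { type: "backup-promoted", partitionId, previousOwner, newOwner }
  });
  reassigner.on("failoverComplete", ({ failedNodeId, partitionsReassigned, durationMs }) => {
    console.log("failover done", failedNodeId, partitionsReassigned, durationMs);
  });

When "nodeConfirmedFailed" fires, the reassigner waits reassignmentDelayMs, then for each orphaned partition promotes a surviving backup to owner (falling back to a deterministic index over the sorted alive members) and emits "partitionsReassigned" with the updated partition map.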
+// src/cluster/ReadReplicaHandler.ts
+import { EventEmitter as EventEmitter10 } from "events";
+import { ConsistencyLevel as ConsistencyLevel2 } from "@topgunbuild/core";
+var DEFAULT_READ_REPLICA_CONFIG = {
+  defaultConsistency: ConsistencyLevel2.STRONG,
+  maxStalenessMs: 5e3,
+  preferLocalReplica: true,
+  loadBalancing: "latency-based"
 };
-var
-  constructor(config = {}) {
-
-
-    this.
-    this.
-    this.
+var ReadReplicaHandler = class extends EventEmitter10 {
+  constructor(partitionService, clusterManager, nodeId, lagTracker, config = {}) {
+    super();
+    // Round-robin counters for load balancing
+    this.roundRobinCounters = /* @__PURE__ */ new Map();
+    this.partitionService = partitionService;
+    this.clusterManager = clusterManager;
+    this.nodeId = nodeId;
+    this.lagTracker = lagTracker;
+    this.config = { ...DEFAULT_READ_REPLICA_CONFIG, ...config };
   }
   /**
-   *
-   *
-   * @param processor The processor definition (name, code, args)
-   * @param value The current value for the key (or undefined)
-   * @param key The key being processed
-   * @returns Result containing success status, result, and new value
+   * Determine if a read request can be served locally
    */
-
-
-
-
-      error: "Sandbox has been disposed"
-    };
+  canServeLocally(request) {
+    const consistency = request.options?.consistency ?? this.config.defaultConsistency;
+    if (consistency === ConsistencyLevel2.STRONG) {
+      return this.partitionService.isLocalOwner(request.key);
    }
-
-
-
-
-
-
-
+    return this.partitionService.isRelated(request.key);
+  }
+  /**
+   * Determine which node should handle the read
+   */
+  selectReadNode(request) {
+    const key = request.key;
+    const consistency = request.options?.consistency ?? this.config.defaultConsistency;
+    const partitionId = this.partitionService.getPartitionId(key);
+    const distribution = this.partitionService.getDistribution(key);
+    if (consistency === ConsistencyLevel2.STRONG) {
+      if (!this.isNodeAlive(distribution.owner)) {
+        if (request.options?.allowStale) {
+          return this.selectAliveBackup(distribution.backups);
+        }
+        return null;
      }
+      return distribution.owner;
    }
-
-
-
-    return
+    const allReplicas = [distribution.owner, ...distribution.backups];
+    const aliveReplicas = allReplicas.filter((n) => this.isNodeAlive(n));
+    if (aliveReplicas.length === 0) {
+      return null;
+    }
+    if (request.options?.maxStaleness) {
+      const withinStaleness = aliveReplicas.filter(
+        (n) => this.getNodeStaleness(n) <= (request.options?.maxStaleness ?? Infinity)
+      );
+      if (withinStaleness.length > 0) {
+        return this.selectByStrategy(withinStaleness, partitionId);
+      }
+      if (this.isNodeAlive(distribution.owner)) {
+        return distribution.owner;
+      }
    }
+    if (this.config.preferLocalReplica && aliveReplicas.includes(this.nodeId)) {
+      return this.nodeId;
+    }
+    return this.selectByStrategy(aliveReplicas, partitionId);
  }
   /**
-   *
+   * Select replica using configured load balancing strategy
    */
-
-    if (
-
+  selectByStrategy(replicas, partitionId) {
+    if (replicas.length === 0) {
+      throw new Error("No replicas available");
    }
-
-
-
-
-
-
-
-
-
+    if (replicas.length === 1) {
+      return replicas[0];
+    }
+    switch (this.config.loadBalancing) {
+      case "round-robin":
+        return this.selectRoundRobin(replicas, partitionId);
+      case "latency-based":
+        return this.selectByLatency(replicas);
+      case "least-connections":
+        return this.selectRoundRobin(replicas, partitionId);
+      default:
+        return replicas[0];
+    }
+  }
+  /**
+   * Round-robin selection
+   */
+  selectRoundRobin(replicas, partitionId) {
+    const counter = this.roundRobinCounters.get(partitionId) ?? 0;
+    const selected = replicas[counter % replicas.length];
+    this.roundRobinCounters.set(partitionId, counter + 1);
+    return selected;
+  }
+  /**
+   * Latency-based selection using lag tracker
+   */
+  selectByLatency(replicas) {
+    if (!this.lagTracker) {
+      return replicas[0];
+    }
+    let bestNode = replicas[0];
+    let bestLatency = Infinity;
+    for (const nodeId of replicas) {
+      const lag = this.lagTracker.getLag(nodeId);
+      if (lag && lag.current < bestLatency) {
+        bestLatency = lag.current;
+        bestNode = nodeId;
+      }
+    }
+    return bestNode;
+  }
+  /**
+   * Get estimated staleness for a node in ms
+   */
+  getNodeStaleness(nodeId) {
+    if (nodeId === this.partitionService.getOwner("")) {
+      return 0;
+    }
+    if (this.lagTracker) {
+      const lag = this.lagTracker.getLag(nodeId);
+      return lag?.current ?? 0;
+    }
+    return 0;
+  }
+  /**
+   * Check if a node is alive in the cluster
+   */
+  isNodeAlive(nodeId) {
+    const members = this.clusterManager.getMembers();
+    return members.includes(nodeId);
+  }
+  /**
+   * Select first alive backup from list
+   */
+  selectAliveBackup(backups) {
+    for (const backup of backups) {
+      if (this.isNodeAlive(backup)) {
+        return backup;
+      }
+    }
+    return null;
+  }
+  /**
+   * Create read response metadata
+   */
+  createReadMetadata(key, options) {
+    const consistency = options?.consistency ?? this.config.defaultConsistency;
+    const isOwner = this.partitionService.isLocalOwner(key);
+    return {
+      source: this.nodeId,
+      isOwner,
+      consistency
+    };
+  }
+  /**
+   * Check if local node should forward read to owner
+   */
+  shouldForwardRead(request) {
+    const consistency = request.options?.consistency ?? this.config.defaultConsistency;
+    if (consistency === ConsistencyLevel2.STRONG) {
+      return !this.partitionService.isLocalOwner(request.key);
+    }
+    if (!this.partitionService.isRelated(request.key)) {
+      return true;
+    }
+    return false;
+  }
+  /**
+   * Get metrics for monitoring
+   */
+  getMetrics() {
+    return {
+      defaultConsistency: this.config.defaultConsistency,
+      preferLocalReplica: this.config.preferLocalReplica,
+      loadBalancing: this.config.loadBalancing,
+      roundRobinPartitions: this.roundRobinCounters.size
+    };
+  }
+};
+
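ReadReplicaHandler only picks a target node; the caller still executes or forwards the read. A sketch of how a request path might consult it (the request shape matches the fields the methods above read; readLocal and forwardToNode are hypothetical helpers, not part of the bundle):

  const reads = new ReadReplicaHandler(partitionService, clusterManager, nodeId, lagTracker, {
    loadBalancing: "round-robin"
  });
  // request = { key, options?: { consistency?, allowStale?, maxStaleness? } }
  async function handleGet(request) {
    if (!reads.shouldForwardRead(request)) {
      const value = await readLocal(request.key);
      return { value, meta: reads.createReadMetadata(request.key, request.options) };
    }
    const target = reads.selectReadNode(request); // owner for STRONG, otherwise a live replica
    if (!target) throw new Error("no live replica for key");
    return forwardToNode(target, request);
  }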
+// src/cluster/MerkleTreeManager.ts
+import { EventEmitter as EventEmitter11 } from "events";
+import { MerkleTree, hashString as hashString2 } from "@topgunbuild/core";
+var DEFAULT_MERKLE_TREE_CONFIG = {
+  treeDepth: 3,
+  autoUpdate: true,
+  lazyInit: true
+};
+var MerkleTreeManager = class extends EventEmitter11 {
+  constructor(nodeId, config = {}) {
+    super();
+    this.trees = /* @__PURE__ */ new Map();
+    this.keyCounts = /* @__PURE__ */ new Map();
+    this.lastUpdated = /* @__PURE__ */ new Map();
+    this.nodeId = nodeId;
+    this.config = { ...DEFAULT_MERKLE_TREE_CONFIG, ...config };
+  }
+  /**
+   * Get or create a Merkle tree for a partition
+   */
+  getTree(partitionId) {
+    let tree = this.trees.get(partitionId);
+    if (!tree) {
+      tree = new MerkleTree(/* @__PURE__ */ new Map(), this.config.treeDepth);
+      this.trees.set(partitionId, tree);
+      this.keyCounts.set(partitionId, 0);
+      this.lastUpdated.set(partitionId, Date.now());
+    }
+    return tree;
+  }
+  /**
+   * Build tree for a partition from existing data
+   */
+  buildTree(partitionId, records) {
+    const tree = new MerkleTree(records, this.config.treeDepth);
+    this.trees.set(partitionId, tree);
+    this.keyCounts.set(partitionId, records.size);
+    this.lastUpdated.set(partitionId, Date.now());
+    logger.debug({
+      partitionId,
+      keyCount: records.size,
+      rootHash: tree.getRootHash()
+    }, "Built Merkle tree for partition");
+  }
+  /**
+   * Incrementally update tree when a record changes
+   */
+  updateRecord(partitionId, key, record) {
+    if (!this.config.autoUpdate) return;
+    const tree = this.getTree(partitionId);
+    const previousKeyCount = this.keyCounts.get(partitionId) ?? 0;
+    const existingBuckets = tree.getBuckets("");
+    const wasNewKey = Object.keys(existingBuckets).length === 0 || !tree.getKeysInBucket(this.getKeyPath(key)).includes(key);
+    tree.update(key, record);
+    if (wasNewKey) {
+      this.keyCounts.set(partitionId, previousKeyCount + 1);
+    }
+    this.lastUpdated.set(partitionId, Date.now());
+    this.emit("treeUpdated", {
+      partitionId,
+      key,
+      rootHash: tree.getRootHash()
+    });
+  }
+  /**
+   * Remove a key from the tree (e.g., after GC)
+   */
+  removeRecord(partitionId, key) {
+    const tree = this.trees.get(partitionId);
+    if (!tree) return;
+    tree.remove(key);
+    const currentCount = this.keyCounts.get(partitionId) ?? 0;
+    if (currentCount > 0) {
+      this.keyCounts.set(partitionId, currentCount - 1);
+    }
+    this.lastUpdated.set(partitionId, Date.now());
+    this.emit("treeUpdated", {
+      partitionId,
+      key,
+      rootHash: tree.getRootHash()
+    });
+  }
+  /**
+   * Get the path prefix for a key in the Merkle tree
+   */
+  getKeyPath(key) {
+    const hash = hashString2(key).toString(16).padStart(8, "0");
+    return hash.slice(0, this.config.treeDepth);
+  }
+  /**
+   * Get root hash for a partition
+   */
+  getRootHash(partitionId) {
+    const tree = this.trees.get(partitionId);
+    return tree?.getRootHash() ?? 0;
+  }
+  /**
+   * Compare local tree with remote root hash
+   */
+  compareWithRemote(partitionId, remoteRoot) {
+    const tree = this.getTree(partitionId);
+    const localRoot = tree.getRootHash();
+    return {
+      partitionId,
+      localRoot,
+      remoteRoot,
+      needsSync: localRoot !== remoteRoot,
+      differingBuckets: localRoot !== remoteRoot ? this.findDifferingBuckets(tree, remoteRoot) : []
+    };
+  }
+  /**
+   * Find buckets that differ between local and remote tree
+   * Note: This is a simplified version - full implementation would
+   * need to exchange bucket hashes with the remote node
+   */
+  findDifferingBuckets(tree, _remoteRoot) {
+    const buckets = [];
+    this.collectLeafBuckets(tree, "", buckets);
+    return buckets;
+  }
+  /**
+   * Recursively collect all leaf bucket paths
+   */
+  collectLeafBuckets(tree, path, result) {
+    if (path.length >= this.config.treeDepth) {
+      const keys = tree.getKeysInBucket(path);
+      if (keys.length > 0) {
+        result.push(path);
+      }
+      return;
+    }
+    const buckets = tree.getBuckets(path);
+    for (const char of Object.keys(buckets)) {
+      this.collectLeafBuckets(tree, path + char, result);
+    }
+  }
+  /**
+   * Get bucket hashes for a partition at a given path
+   */
+  getBuckets(partitionId, path) {
+    const tree = this.trees.get(partitionId);
+    return tree?.getBuckets(path) ?? {};
+  }
+  /**
+   * Get keys in a specific bucket
+   */
+  getKeysInBucket(partitionId, path) {
+    const tree = this.trees.get(partitionId);
+    return tree?.getKeysInBucket(path) ?? [];
+  }
+  /**
+   * Get all keys across all buckets for a partition
+   */
+  getAllKeys(partitionId) {
+    const tree = this.trees.get(partitionId);
+    if (!tree) return [];
+    const keys = [];
+    this.collectAllKeys(tree, "", keys);
+    return keys;
+  }
+  /**
+   * Recursively collect all keys from the tree
+   */
+  collectAllKeys(tree, path, result) {
+    if (path.length >= this.config.treeDepth) {
+      const keys = tree.getKeysInBucket(path);
+      result.push(...keys);
+      return;
+    }
+    const buckets = tree.getBuckets(path);
+    for (const char of Object.keys(buckets)) {
+      this.collectAllKeys(tree, path + char, result);
+    }
+  }
+  /**
+   * Get info about all managed partitions
+   */
+  getPartitionInfos() {
+    const infos = [];
+    for (const [partitionId, tree] of this.trees) {
+      infos.push({
+        partitionId,
+        rootHash: tree.getRootHash(),
+        keyCount: this.keyCounts.get(partitionId) ?? 0,
+        lastUpdated: this.lastUpdated.get(partitionId) ?? 0
+      });
+    }
+    return infos;
+  }
+  /**
+   * Get info for a specific partition
+   */
+  getPartitionInfo(partitionId) {
+    const tree = this.trees.get(partitionId);
+    if (!tree) return null;
+    return {
+      partitionId,
+      rootHash: tree.getRootHash(),
+      keyCount: this.keyCounts.get(partitionId) ?? 0,
+      lastUpdated: this.lastUpdated.get(partitionId) ?? 0
+    };
+  }
+  /**
+   * Clear tree for a partition (e.g., after migration)
+   */
+  clearPartition(partitionId) {
+    this.trees.delete(partitionId);
+    this.keyCounts.delete(partitionId);
+    this.lastUpdated.delete(partitionId);
+  }
+  /**
+   * Clear all trees
+   */
+  clearAll() {
+    this.trees.clear();
+    this.keyCounts.clear();
+    this.lastUpdated.clear();
+  }
+  /**
+   * Get metrics for monitoring
+   */
+  getMetrics() {
+    let totalKeys = 0;
+    for (const count of this.keyCounts.values()) {
+      totalKeys += count;
+    }
+    return {
+      totalPartitions: this.trees.size,
+      totalKeys,
+      averageKeysPerPartition: this.trees.size > 0 ? totalKeys / this.trees.size : 0
+    };
+  }
+  /**
+   * Serialize tree state for network transfer
+   */
+  serializeTree(partitionId) {
+    const tree = this.trees.get(partitionId);
+    if (!tree) return null;
+    const buckets = {};
+    for (let depth = 0; depth < this.config.treeDepth; depth++) {
+      this.collectBucketsAtDepth(tree, "", depth, buckets);
+    }
+    return {
+      rootHash: tree.getRootHash(),
+      buckets
+    };
+  }
+  collectBucketsAtDepth(tree, path, targetDepth, result) {
+    if (path.length === targetDepth) {
+      const buckets2 = tree.getBuckets(path);
+      if (Object.keys(buckets2).length > 0) {
+        result[path] = buckets2;
+      }
+      return;
+    }
+    if (path.length > targetDepth) return;
+    const buckets = tree.getBuckets(path);
+    for (const char of Object.keys(buckets)) {
+      this.collectBucketsAtDepth(tree, path + char, targetDepth, result);
+    }
+  }
+};
+
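Each partition gets its own fixed-depth tree keyed by a hex prefix of hashString2(key), so two nodes can compare a partition with a single root hash and only walk buckets that differ. A usage sketch (the records map is hypothetical):

  const merkle = new MerkleTreeManager("node-1", { treeDepth: 3 });
  merkle.buildTree(42, recordsForPartition42);        // Map<string, record>
  merkle.updateRecord(42, "user:1", changedRecord);   // keep the tree current on writes
  const cmp = merkle.compareWithRemote(42, remoteRootHash);
  if (cmp.needsSync) {
    // exchange getBuckets(42, path) / getKeysInBucket(42, path) with the peer
    // to narrow the difference down to individual keys (see RepairScheduler below)
  }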
+// src/cluster/RepairScheduler.ts
+import { EventEmitter as EventEmitter12 } from "events";
+import { PARTITION_COUNT as PARTITION_COUNT4 } from "@topgunbuild/core";
+var DEFAULT_REPAIR_CONFIG = {
+  enabled: true,
+  scanIntervalMs: 36e5,
+  // 1 hour
+  repairBatchSize: 1e3,
+  maxConcurrentRepairs: 2,
+  throttleMs: 100,
+  prioritizeRecent: true,
+  requestTimeoutMs: 5e3
+};
+var RepairScheduler = class extends EventEmitter12 {
+  constructor(merkleManager, clusterManager, partitionService, nodeId, config = {}) {
+    super();
+    this.repairQueue = [];
+    this.activeRepairs = /* @__PURE__ */ new Set();
+    this.started = false;
+    // Pending network requests
+    this.pendingRequests = /* @__PURE__ */ new Map();
+    // Metrics
+    this.metrics = {
+      scansCompleted: 0,
+      repairsExecuted: 0,
+      keysRepaired: 0,
+      errorsEncountered: 0,
+      averageRepairDurationMs: 0
+    };
+    this.merkleManager = merkleManager;
+    this.clusterManager = clusterManager;
+    this.partitionService = partitionService;
+    this.nodeId = nodeId;
+    this.config = { ...DEFAULT_REPAIR_CONFIG, ...config };
+    this.setupNetworkHandlers();
+  }
+  /**
+   * Set data access callbacks
+   */
+  setDataAccessors(getRecord, setRecord) {
+    this.getRecord = getRecord;
+    this.setRecord = setRecord;
+  }
+  /**
+   * Setup network message handlers
+   */
+  setupNetworkHandlers() {
+    this.clusterManager.on("message", (msg) => {
+      this.handleClusterMessage(msg);
+    });
+  }
+  /**
+   * Handle incoming cluster messages
+   */
+  handleClusterMessage(msg) {
+    switch (msg.type) {
+      case "CLUSTER_MERKLE_ROOT_REQ":
+        this.handleMerkleRootReq(msg);
+        break;
+      case "CLUSTER_MERKLE_ROOT_RESP":
+        this.handleResponse(msg);
+        break;
+      case "CLUSTER_MERKLE_BUCKETS_REQ":
+        this.handleMerkleBucketsReq(msg);
+        break;
+      case "CLUSTER_MERKLE_BUCKETS_RESP":
+        this.handleResponse(msg);
+        break;
+      case "CLUSTER_MERKLE_KEYS_REQ":
+        this.handleMerkleKeysReq(msg);
+        break;
+      case "CLUSTER_MERKLE_KEYS_RESP":
+        this.handleResponse(msg);
+        break;
+      case "CLUSTER_REPAIR_DATA_REQ":
+        this.handleRepairDataReq(msg);
+        break;
+      case "CLUSTER_REPAIR_DATA_RESP":
+        this.handleResponse(msg);
+        break;
+    }
+  }
+  // === Request Handlers (Passive) ===
+  handleMerkleRootReq(msg) {
+    const { requestId, partitionId } = msg.payload;
+    const rootHash = this.merkleManager.getRootHash(partitionId);
+    this.clusterManager.send(msg.senderId, "CLUSTER_MERKLE_ROOT_RESP", {
+      requestId,
+      partitionId,
+      rootHash
+    });
+  }
+  handleMerkleBucketsReq(msg) {
+    const { requestId, partitionId } = msg.payload;
+    const tree = this.merkleManager.serializeTree(partitionId);
+    this.clusterManager.send(msg.senderId, "CLUSTER_MERKLE_BUCKETS_RESP", {
+      requestId,
+      partitionId,
+      buckets: tree?.buckets || {}
+    });
+  }
+  handleMerkleKeysReq(msg) {
+    const { requestId, partitionId, path } = msg.payload;
+    const keys = this.merkleManager.getKeysInBucket(partitionId, path);
+    this.clusterManager.send(msg.senderId, "CLUSTER_MERKLE_KEYS_RESP", {
+      requestId,
+      partitionId,
+      path,
+      keys
+    });
+  }
+  handleRepairDataReq(msg) {
+    const { requestId, key } = msg.payload;
+    if (!this.getRecord) return;
+    const record = this.getRecord(key);
+    this.clusterManager.send(msg.senderId, "CLUSTER_REPAIR_DATA_RESP", {
+      requestId,
+      key,
+      record
+    });
+  }
+  handleResponse(msg) {
+    const { requestId } = msg.payload;
+    const pending = this.pendingRequests.get(requestId);
+    if (pending) {
+      clearTimeout(pending.timer);
+      this.pendingRequests.delete(requestId);
+      pending.resolve(msg.payload);
+    }
+  }
+  // === Lifecycle Methods ===
+  /**
+   * Start the repair scheduler
+   */
+  start() {
+    if (this.started || !this.config.enabled) return;
+    this.started = true;
+    logger.info({ config: this.config }, "Starting RepairScheduler");
+    this.scanTimer = setInterval(() => {
+      this.scheduleFullScan();
+    }, this.config.scanIntervalMs);
+    this.processTimer = setInterval(() => {
+      this.processRepairQueue();
+    }, 1e3);
+    setTimeout(() => {
+      this.scheduleFullScan();
+    }, 6e4);
+  }
+  /**
+   * Stop the repair scheduler
+   */
+  stop() {
+    if (!this.started) return;
+    this.started = false;
+    if (this.scanTimer) {
+      clearInterval(this.scanTimer);
+      this.scanTimer = void 0;
+    }
+    if (this.processTimer) {
+      clearInterval(this.processTimer);
+      this.processTimer = void 0;
+    }
+    this.repairQueue = [];
+    this.activeRepairs.clear();
+    for (const [id, req] of this.pendingRequests) {
+      clearTimeout(req.timer);
+      req.reject(new Error("Scheduler stopped"));
+    }
+    this.pendingRequests.clear();
+    logger.info("RepairScheduler stopped");
+  }
+  /**
+   * Schedule a full scan of all owned partitions
+   */
+  scheduleFullScan() {
+    const ownedPartitions = this.getOwnedPartitions();
+    const replicas = this.getReplicaPartitions();
+    const allPartitions = [.../* @__PURE__ */ new Set([...ownedPartitions, ...replicas])];
+    logger.info({
+      ownedCount: ownedPartitions.length,
+      replicaCount: replicas.length,
+      totalPartitions: allPartitions.length
+    }, "Scheduling full anti-entropy scan");
+    for (const partitionId of allPartitions) {
+      this.schedulePartitionRepair(partitionId);
+    }
+    this.metrics.scansCompleted++;
+    this.metrics.lastScanTime = Date.now();
+  }
+  /**
+   * Schedule repair for a specific partition
+   */
+  schedulePartitionRepair(partitionId, priority = "normal") {
+    const backups = this.partitionService.getBackups(partitionId);
+    const owner = this.partitionService.getPartitionOwner(partitionId);
+    const replicas = this.nodeId === owner ? backups : owner ? [owner] : [];
+    for (const replicaNodeId of replicas) {
+      const exists = this.repairQueue.some(
+        (t) => t.partitionId === partitionId && t.replicaNodeId === replicaNodeId
+      );
+      if (exists) continue;
+      this.repairQueue.push({
+        partitionId,
+        replicaNodeId,
+        priority,
+        scheduledAt: Date.now()
+      });
+    }
+    this.sortRepairQueue();
+  }
+  /**
+   * Sort repair queue by priority
+   */
+  sortRepairQueue() {
+    const priorityOrder = { high: 0, normal: 1, low: 2 };
+    this.repairQueue.sort((a, b) => {
+      const priorityDiff = priorityOrder[a.priority] - priorityOrder[b.priority];
+      if (priorityDiff !== 0) return priorityDiff;
+      if (this.config.prioritizeRecent) {
+        const infoA = this.merkleManager.getPartitionInfo(a.partitionId);
+        const infoB = this.merkleManager.getPartitionInfo(b.partitionId);
+        if (infoA && infoB) {
+          return infoB.lastUpdated - infoA.lastUpdated;
+        }
+      }
+      return a.scheduledAt - b.scheduledAt;
+    });
+  }
+  /**
+   * Process the repair queue
+   */
+  async processRepairQueue() {
+    if (this.activeRepairs.size >= this.config.maxConcurrentRepairs) {
+      return;
+    }
+    const task = this.repairQueue.shift();
+    if (!task) return;
+    if (this.activeRepairs.has(task.partitionId)) {
+      return;
+    }
+    if (!this.clusterManager.getMembers().includes(task.replicaNodeId)) {
+      logger.debug({ task }, "Skipping repair - replica not available");
+      return;
+    }
+    this.activeRepairs.add(task.partitionId);
+    try {
+      const result = await this.executeRepair(task);
+      this.emit("repairComplete", result);
+      if (result.success) {
+        this.metrics.repairsExecuted++;
+        this.metrics.keysRepaired += result.keysRepaired;
+        this.updateAverageRepairDuration(result.durationMs);
+      } else {
+        this.metrics.errorsEncountered++;
+      }
+    } catch (error) {
+      logger.error({ task, error }, "Repair failed");
+      this.metrics.errorsEncountered++;
+    } finally {
+      this.activeRepairs.delete(task.partitionId);
+    }
+  }
+  /**
+   * Execute repair for a partition-replica pair
+   */
+  async executeRepair(task) {
+    const startTime = Date.now();
+    let keysScanned = 0;
+    let keysRepaired = 0;
+    try {
+      const localRoot = this.merkleManager.getRootHash(task.partitionId);
+      const remoteRoot = await this.requestRemoteMerkleRoot(task.replicaNodeId, task.partitionId);
+      if (localRoot === remoteRoot) {
+        logger.debug({
+          partitionId: task.partitionId,
+          replicaNodeId: task.replicaNodeId
+        }, "Partition in sync");
+        return {
+          partitionId: task.partitionId,
+          replicaNodeId: task.replicaNodeId,
+          keysScanned: 0,
+          keysRepaired: 0,
+          durationMs: Date.now() - startTime,
+          success: true
+        };
+      }
+      const differences = await this.findDifferences(task.partitionId, task.replicaNodeId);
+      keysScanned = differences.length;
+      for (const key of differences) {
+        const repaired = await this.repairKey(task.partitionId, task.replicaNodeId, key);
+        if (repaired) {
+          keysRepaired++;
+        }
+        if (keysRepaired % this.config.repairBatchSize === 0) {
+          await this.sleep(this.config.throttleMs);
+        }
+      }
+      logger.info({
+        partitionId: task.partitionId,
+        replicaNodeId: task.replicaNodeId,
+        keysScanned,
+        keysRepaired,
+        durationMs: Date.now() - startTime
+      }, "Partition repair completed");
+      return {
+        partitionId: task.partitionId,
+        replicaNodeId: task.replicaNodeId,
+        keysScanned,
+        keysRepaired,
+        durationMs: Date.now() - startTime,
+        success: true
+      };
+    } catch (error) {
+      return {
+        partitionId: task.partitionId,
+        replicaNodeId: task.replicaNodeId,
+        keysScanned,
+        keysRepaired,
+        durationMs: Date.now() - startTime,
+        success: false,
+        error: String(error)
+      };
+    }
+  }
+  /**
+   * Send a request and wait for response
+   */
+  sendRequest(nodeId, type, payload) {
+    return new Promise((resolve, reject) => {
+      const requestId = Math.random().toString(36).substring(7);
+      const timer = setTimeout(() => {
+        this.pendingRequests.delete(requestId);
+        reject(new Error(`Request timeout: ${type} to ${nodeId}`));
+      }, this.config.requestTimeoutMs);
+      this.pendingRequests.set(requestId, { resolve, reject, timer });
+      this.clusterManager.send(nodeId, type, { ...payload, requestId });
+    });
+  }
+  /**
+   * Request Merkle root from remote node
+   */
+  async requestRemoteMerkleRoot(nodeId, partitionId) {
+    const response = await this.sendRequest(
+      nodeId,
+      "CLUSTER_MERKLE_ROOT_REQ",
+      { partitionId }
+    );
+    return response.rootHash;
+  }
+  /**
+   * Find keys that differ between local and remote using bucket exchange
+   */
+  async findDifferences(partitionId, replicaNodeId) {
+    const response = await this.sendRequest(
+      replicaNodeId,
+      "CLUSTER_MERKLE_BUCKETS_REQ",
+      { partitionId }
+    );
+    const remoteBuckets = response.buckets;
+    const localTree = this.merkleManager.getTree(partitionId);
+    if (!localTree) return [];
+    const differingKeys = /* @__PURE__ */ new Set();
+    const queue = [""];
+    const maxDepth = 3;
+    while (queue.length > 0) {
+      const path = queue.shift();
+      const localChildren = localTree.getBuckets(path);
+      const remoteChildren = remoteBuckets[path] || {};
+      const allChars = /* @__PURE__ */ new Set([...Object.keys(localChildren), ...Object.keys(remoteChildren)]);
+      for (const char of allChars) {
+        const localHash = localChildren[char] || 0;
+        const remoteHash = remoteChildren[char] || 0;
+        if (localHash !== remoteHash) {
+          const nextPath = path + char;
+          if (nextPath.length >= maxDepth) {
+            const bucketKeysResp = await this.sendRequest(
+              replicaNodeId,
+              "CLUSTER_MERKLE_KEYS_REQ",
+              { partitionId, path: nextPath }
+            );
+            const localBucketKeys = localTree.getKeysInBucket(nextPath);
+            const remoteBucketKeys = bucketKeysResp.keys;
+            for (const k of localBucketKeys) differingKeys.add(k);
+            for (const k of remoteBucketKeys) differingKeys.add(k);
+          } else {
+            queue.push(nextPath);
+          }
+        }
+      }
+    }
+    return Array.from(differingKeys);
+  }
+  /**
+   * Repair a single key
+   */
+  async repairKey(partitionId, replicaNodeId, key) {
+    if (!this.getRecord || !this.setRecord) {
+      return false;
+    }
+    const localRecord = this.getRecord(key);
+    let remoteRecord;
+    try {
+      const response = await this.sendRequest(
+        replicaNodeId,
+        "CLUSTER_REPAIR_DATA_REQ",
+        { key }
+      );
+      remoteRecord = response.record;
+    } catch (e) {
+      logger.warn({ key, replicaNodeId, err: e }, "Failed to fetch remote record for repair");
+      return false;
+    }
+    const resolved = this.resolveConflict(localRecord, remoteRecord);
+    if (!resolved) return false;
+    if (JSON.stringify(resolved) !== JSON.stringify(localRecord)) {
+      this.setRecord(key, resolved);
+      if (JSON.stringify(resolved) !== JSON.stringify(remoteRecord)) {
+        this.clusterManager.send(replicaNodeId, "CLUSTER_REPAIR_DATA_RESP", {
+          // In future: Use dedicated WRITE/REPAIR message
+          // For now we rely on the fact that repair will eventually run on other node too
+        });
+      }
+      return true;
+    }
+    return false;
+  }
+  /**
+   * Resolve conflict between two records using LWW
+   */
+  resolveConflict(a, b) {
+    if (!a && !b) return null;
+    if (!a) return b;
+    if (!b) return a;
+    if (this.compareTimestamps(a.timestamp, b.timestamp) > 0) {
+      return a;
+    }
+    if (this.compareTimestamps(b.timestamp, a.timestamp) > 0) {
+      return b;
+    }
+    if (a.timestamp.nodeId > b.timestamp.nodeId) {
+      return a;
+    }
+    return b;
+  }
+  /**
+   * Compare two timestamps
+   */
+  compareTimestamps(a, b) {
+    if (a.millis !== b.millis) {
+      return a.millis - b.millis;
+    }
+    return a.counter - b.counter;
+  }
+  /**
+   * Get partitions owned by this node
+   */
+  getOwnedPartitions() {
+    const owned = [];
+    for (let i = 0; i < PARTITION_COUNT4; i++) {
+      if (this.partitionService.getPartitionOwner(i) === this.nodeId) {
+        owned.push(i);
+      }
+    }
+    return owned;
+  }
+  /**
+   * Get partitions where this node is a backup
+   */
+  getReplicaPartitions() {
+    const replicas = [];
+    for (let i = 0; i < PARTITION_COUNT4; i++) {
+      const backups = this.partitionService.getBackups(i);
+      if (backups.includes(this.nodeId)) {
+        replicas.push(i);
+      }
+    }
+    return replicas;
+  }
+  /**
+   * Update average repair duration
+   */
+  updateAverageRepairDuration(durationMs) {
+    const count = this.metrics.repairsExecuted;
+    const currentAvg = this.metrics.averageRepairDurationMs;
+    this.metrics.averageRepairDurationMs = (currentAvg * (count - 1) + durationMs) / count;
+  }
+  /**
+   * Get repair metrics
+   */
+  getMetrics() {
+    return { ...this.metrics };
+  }
+  /**
+   * Get repair queue status
+   */
+  getQueueStatus() {
+    return {
+      queueLength: this.repairQueue.length,
+      activeRepairs: this.activeRepairs.size,
+      maxConcurrent: this.config.maxConcurrentRepairs
+    };
+  }
+  /**
+   * Force immediate repair for a partition
+   */
+  forceRepair(partitionId) {
+    this.schedulePartitionRepair(partitionId, "high");
+  }
+  /**
+   * Sleep utility
+   */
+  sleep(ms) {
+    return new Promise((resolve) => setTimeout(resolve, ms));
+  }
+};
+
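RepairScheduler ties the Merkle trees to the cluster transport: it periodically compares root hashes with each replica, walks differing buckets via the CLUSTER_MERKLE_* request/response messages, and resolves individual keys with last-write-wins. It can only repair keys once it is given access to the local store. A wiring sketch (the store object is hypothetical):

  const repair = new RepairScheduler(merkleManager, clusterManager, partitionService, nodeId, {
    scanIntervalMs: 36e5,      // hourly anti-entropy scan
    maxConcurrentRepairs: 2
  });
  repair.setDataAccessors(
    (key) => store.get(key),                 // read a local record
    (key, record) => store.set(key, record)  // write back the LWW winner
  );
  repair.start();
  repair.on("repairComplete", (result) => {
    console.log(result.partitionId, result.keysRepaired, result.durationMs);
  });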
+// src/handlers/CounterHandler.ts
+import { PNCounterImpl } from "@topgunbuild/core";
+var CounterHandler = class {
+  // counterName -> Set<clientId>
+  constructor(nodeId = "server") {
+    this.nodeId = nodeId;
+    this.counters = /* @__PURE__ */ new Map();
+    this.subscriptions = /* @__PURE__ */ new Map();
+  }
+  /**
+   * Get or create a counter by name.
+   */
+  getOrCreateCounter(name) {
+    let counter = this.counters.get(name);
+    if (!counter) {
+      counter = new PNCounterImpl({ nodeId: this.nodeId });
+      this.counters.set(name, counter);
+      logger.debug({ name }, "Created new counter");
+    }
+    return counter;
+  }
+  /**
+   * Handle COUNTER_REQUEST - client wants initial state.
+   * @returns Response message to send back to client
+   */
+  handleCounterRequest(clientId, name) {
+    const counter = this.getOrCreateCounter(name);
+    this.subscribe(clientId, name);
+    const state = counter.getState();
+    logger.debug({ clientId, name, value: counter.get() }, "Counter request handled");
+    return {
+      type: "COUNTER_RESPONSE",
+      payload: {
+        name,
+        state: this.stateToObject(state)
+      }
+    };
+  }
+  /**
+   * Handle COUNTER_SYNC - client sends their state to merge.
+   * @returns Merged state and list of clients to broadcast to
+   */
+  handleCounterSync(clientId, name, stateObj) {
+    const counter = this.getOrCreateCounter(name);
+    const incomingState = this.objectToState(stateObj);
+    counter.merge(incomingState);
+    const mergedState = counter.getState();
+    const mergedStateObj = this.stateToObject(mergedState);
+    logger.debug(
+      { clientId, name, value: counter.get() },
+      "Counter sync handled"
+    );
+    this.subscribe(clientId, name);
+    const subscribers = this.subscriptions.get(name) || /* @__PURE__ */ new Set();
+    const broadcastTo = Array.from(subscribers).filter((id) => id !== clientId);
+    return {
+      // Response to the sending client
+      response: {
+        type: "COUNTER_UPDATE",
+        payload: {
+          name,
+          state: mergedStateObj
+        }
+      },
+      // Broadcast to other clients
+      broadcastTo,
+      broadcastMessage: {
+        type: "COUNTER_UPDATE",
+        payload: {
+          name,
+          state: mergedStateObj
+        }
+      }
+    };
+  }
+  /**
+   * Subscribe a client to counter updates.
+   */
+  subscribe(clientId, counterName) {
+    if (!this.subscriptions.has(counterName)) {
+      this.subscriptions.set(counterName, /* @__PURE__ */ new Set());
+    }
+    this.subscriptions.get(counterName).add(clientId);
+    logger.debug({ clientId, counterName }, "Client subscribed to counter");
+  }
+  /**
+   * Unsubscribe a client from counter updates.
+   */
+  unsubscribe(clientId, counterName) {
+    const subs = this.subscriptions.get(counterName);
+    if (subs) {
+      subs.delete(clientId);
+      if (subs.size === 0) {
+        this.subscriptions.delete(counterName);
+      }
+    }
+  }
+  /**
+   * Unsubscribe a client from all counters (e.g., on disconnect).
+   */
+  unsubscribeAll(clientId) {
+    for (const [counterName, subs] of this.subscriptions) {
+      subs.delete(clientId);
+      if (subs.size === 0) {
+        this.subscriptions.delete(counterName);
+      }
+    }
+    logger.debug({ clientId }, "Client unsubscribed from all counters");
+  }
+  /**
+   * Get current counter value (for monitoring/debugging).
+   */
+  getCounterValue(name) {
+    const counter = this.counters.get(name);
+    return counter ? counter.get() : 0;
+  }
+  /**
+   * Get all counter names.
+   */
+  getCounterNames() {
+    return Array.from(this.counters.keys());
+  }
+  /**
+   * Get number of subscribers for a counter.
+   */
+  getSubscriberCount(name) {
+    return this.subscriptions.get(name)?.size || 0;
+  }
+  /**
+   * Convert Map-based state to plain object for serialization.
+   */
+  stateToObject(state) {
+    return {
+      p: Object.fromEntries(state.positive),
+      n: Object.fromEntries(state.negative)
+    };
+  }
+  /**
+   * Convert plain object to Map-based state.
+   */
+  objectToState(obj) {
+
return {
|
|
7954
|
+
positive: new Map(Object.entries(obj.p || {})),
|
|
7955
|
+
negative: new Map(Object.entries(obj.n || {}))
|
|
7956
|
+
};
|
|
7957
|
+
}
|
|
7958
|
+
};
|
|
7959
|
+
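CounterHandler's two entry points pair naturally with a message loop: COUNTER_REQUEST yields a single response, while COUNTER_SYNC yields both a response and a broadcast set. A minimal dispatch sketch for server-internal use, where CounterHandler is in scope; `send(clientId, msg)` and the incoming payload field names are placeholders, not confirmed by the package:

// Sketch only: `send` stands in for the server's real client transport; payload field
// names on the incoming messages are assumptions.
const handler = new CounterHandler("node-a");
function onClientMessage(clientId, msg) {
  if (msg.type === "COUNTER_REQUEST") {
    send(clientId, handler.handleCounterRequest(clientId, msg.payload.name));
  } else if (msg.type === "COUNTER_SYNC") {
    const { response, broadcastTo, broadcastMessage } =
      handler.handleCounterSync(clientId, msg.payload.name, msg.payload.state);
    send(clientId, response);
    for (const otherId of broadcastTo) send(otherId, broadcastMessage);
  }
}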
+// src/handlers/EntryProcessorHandler.ts
+import {
+  EntryProcessorDefSchema
+} from "@topgunbuild/core";
+
+// src/ProcessorSandbox.ts
+import {
+  validateProcessorCode
+} from "@topgunbuild/core";
+var ivm = null;
+try {
+  ivm = __require("isolated-vm");
+} catch {
+  const isProduction = process.env.NODE_ENV === "production";
+  if (isProduction) {
+    logger.error(
+      "SECURITY WARNING: isolated-vm not available in production! Entry processors will run in less secure fallback mode. Install isolated-vm for production environments: pnpm add isolated-vm"
+    );
+  } else {
+    logger.warn("isolated-vm not available, falling back to less secure VM");
+  }
+}
+var DEFAULT_SANDBOX_CONFIG = {
+  memoryLimitMb: 8,
+  timeoutMs: 100,
+  maxCachedIsolates: 100,
+  strictValidation: true
+};
+var ProcessorSandbox = class {
+  constructor(config = {}) {
+    this.isolateCache = /* @__PURE__ */ new Map();
+    this.scriptCache = /* @__PURE__ */ new Map();
+    this.fallbackScriptCache = /* @__PURE__ */ new Map();
+    this.disposed = false;
+    this.config = { ...DEFAULT_SANDBOX_CONFIG, ...config };
+  }
+  /**
+   * Execute an entry processor in the sandbox.
+   *
+   * @param processor The processor definition (name, code, args)
+   * @param value The current value for the key (or undefined)
+   * @param key The key being processed
+   * @returns Result containing success status, result, and new value
+   */
+  async execute(processor, value, key) {
+    if (this.disposed) {
+      return {
+        success: false,
+        error: "Sandbox has been disposed"
+      };
+    }
+    if (this.config.strictValidation) {
+      const validation = validateProcessorCode(processor.code);
+      if (!validation.valid) {
+        return {
+          success: false,
+          error: validation.error
+        };
+      }
+    }
+    if (ivm) {
+      return this.executeInIsolate(processor, value, key);
+    } else {
+      return this.executeInFallback(processor, value, key);
+    }
+  }
+  /**
+   * Execute processor in isolated-vm (secure production mode).
+   */
+  async executeInIsolate(processor, value, key) {
+    if (!ivm) {
+      return { success: false, error: "isolated-vm not available" };
+    }
+    const isolate = this.getOrCreateIsolate(processor.name);
+    try {
+      const context = await isolate.createContext();
+      const jail = context.global;
+      await jail.set("global", jail.derefInto());
+      await context.eval(`
+        var value = ${JSON.stringify(value)};
+        var key = ${JSON.stringify(key)};
+        var args = ${JSON.stringify(processor.args)};
       `);
       const wrappedCode = `
       (function() {
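The sandbox defaults above (8 MB memory limit, 100 ms timeout, strict validation) apply unless overridden in the constructor; a minimal sketch of pushing one processor through execute(), where the processor body, args, and the shape of the result beyond success/error are illustrative rather than taken from the package:

// Sketch only: the processor definition below is illustrative.
const sandbox = new ProcessorSandbox({ timeoutMs: 50 });
const result = await sandbox.execute(
  { name: "increment", code: "value = (value || 0) + args.by; value;", args: { by: 1 } },
  41,
  "counters:visits"
);
console.log(result.success, result.error); // other fields per the JSDoc above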
@@ -8112,7 +9356,7 @@ var ServerCoordinator = class {
       this.partitionService,
       {
         ...DEFAULT_REPLICATION_CONFIG2,
-        defaultConsistency: config.defaultConsistency ??
+        defaultConsistency: config.defaultConsistency ?? ConsistencyLevel3.EVENTUAL,
         ...config.replicationConfig
       }
     );
@@ -8154,6 +9398,52 @@ var ServerCoordinator = class {
         logger.error({ err }, "Failed to initialize EventJournalService");
       });
     }
+    this.partitionReassigner = new PartitionReassigner(
+      this.cluster,
+      this.partitionService,
+      { reassignmentDelayMs: 1e3 }
+    );
+    this.partitionReassigner.on("failoverComplete", (event) => {
+      logger.info({
+        failedNodeId: event.failedNodeId,
+        partitionsReassigned: event.partitionsReassigned,
+        durationMs: event.durationMs
+      }, "Partition failover completed");
+      this.broadcastPartitionMap(this.partitionService.getPartitionMap());
+    });
+    logger.info("PartitionReassigner initialized");
+    this.readReplicaHandler = new ReadReplicaHandler(
+      this.partitionService,
+      this.cluster,
+      this._nodeId,
+      void 0,
+      // LagTracker - can be added later
+      {
+        defaultConsistency: config.defaultConsistency ?? ConsistencyLevel3.STRONG,
+        preferLocalReplica: true,
+        loadBalancing: "latency-based"
+      }
+    );
+    logger.info("ReadReplicaHandler initialized");
+    this.merkleTreeManager = new MerkleTreeManager(this._nodeId);
+    this.repairScheduler = new RepairScheduler(
+      this.merkleTreeManager,
+      this.cluster,
+      this.partitionService,
+      this._nodeId,
+      {
+        enabled: true,
+        scanIntervalMs: 3e5,
+        // 5 minutes
+        maxConcurrentRepairs: 2
+      }
+    );
+    this.repairScheduler.setDataAccessors(
+      (key) => this.getLocalRecord(key) ?? void 0,
+      (key, record) => this.applyRepairRecord(key, record)
+    );
+    this.repairScheduler.start();
+    logger.info("MerkleTreeManager and RepairScheduler initialized");
     this.systemManager = new SystemManager(
       this.cluster,
       this.metricsService,
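Both blocks above read the same config.defaultConsistency field but fall back differently (EVENTUAL for the replication pipeline, STRONG for read routing), so setting it once pins both paths. A hedged sketch, assuming the coordinator accepts this config shape directly at construction time:

// Sketch only: constructing the coordinator this way, and any options beyond
// defaultConsistency/replicationConfig, are assumptions.
import { ConsistencyLevel } from "@topgunbuild/core";
import { ServerCoordinator } from "@topgunbuild/server";

const coordinator = new ServerCoordinator({
  defaultConsistency: ConsistencyLevel.STRONG, // consulted by replication and read routing above
  replicationConfig: {}                        // merged over DEFAULT_REPLICATION_CONFIG
});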
@@ -8250,8 +9540,84 @@ var ServerCoordinator = class {
   getTaskletScheduler() {
     return this.taskletScheduler;
   }
+  /**
+   * Phase 10.02: Graceful cluster departure
+   *
+   * Notifies the cluster that this node is leaving and allows time for:
+   * 1. Pending replication to complete
+   * 2. Other nodes to detect departure
+   * 3. Partition reassignment to begin
+   */
+  async gracefulClusterDeparture() {
+    if (!this.cluster || this.cluster.getMembers().length <= 1) {
+      return;
+    }
+    const nodeId = this._nodeId;
+    const ownedPartitions = this.partitionService ? this.getOwnedPartitions() : [];
+    logger.info({
+      nodeId,
+      ownedPartitions: ownedPartitions.length,
+      clusterMembers: this.cluster.getMembers().length
+    }, "Initiating graceful cluster departure");
+    const departureMessage = {
+      type: "NODE_LEAVING",
+      nodeId,
+      partitions: ownedPartitions,
+      timestamp: Date.now()
+    };
+    for (const memberId of this.cluster.getMembers()) {
+      if (memberId !== nodeId) {
+        try {
+          this.cluster.send(memberId, "CLUSTER_EVENT", departureMessage);
+        } catch (e) {
+          logger.warn({ memberId, err: e }, "Failed to notify peer of departure");
+        }
+      }
+    }
+    if (this.replicationPipeline) {
+      logger.info("Waiting for pending replication to complete...");
+      try {
+        await this.waitForReplicationFlush(3e3);
+        logger.info("Replication flush complete");
+      } catch (e) {
+        logger.warn({ err: e }, "Replication flush timeout - some data may not be replicated");
+      }
+    }
+    await new Promise((resolve) => setTimeout(resolve, 500));
+    logger.info({ nodeId }, "Graceful cluster departure complete");
+  }
+  /**
+   * Get list of partition IDs owned by this node
+   */
+  getOwnedPartitions() {
+    if (!this.partitionService) return [];
+    const partitionMap = this.partitionService.getPartitionMap();
+    const owned = [];
+    for (const partition of partitionMap.partitions) {
+      if (partition.ownerNodeId === this._nodeId) {
+        owned.push(partition.partitionId);
+      }
+    }
+    return owned;
+  }
+  /**
+   * Wait for replication pipeline to flush pending operations
+   */
+  async waitForReplicationFlush(timeoutMs) {
+    if (!this.replicationPipeline) return;
+    const startTime = Date.now();
+    while (Date.now() - startTime < timeoutMs) {
+      const pendingOps = this.replicationPipeline.getTotalPending();
+      if (pendingOps === 0) {
+        return;
+      }
+      await new Promise((resolve) => setTimeout(resolve, 100));
+    }
+    throw new Error("Replication flush timeout");
+  }
   async shutdown() {
     logger.info("Shutting down Server Coordinator...");
+    await this.gracefulClusterDeparture();
     this.httpServer.close();
     if (this.metricsServer) {
       this.metricsServer.close();
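Because shutdown() now begins with gracefulClusterDeparture(), wiring it to process signals gives a departing node the roughly 3.5 s window the method budgets (up to 3 s of replication flush plus a 500 ms settle) before sockets close. A minimal sketch, assuming a `coordinator` instance is in scope:

// Sketch only: `coordinator` is assumed to be the running ServerCoordinator instance.
process.on("SIGTERM", async () => {
  await coordinator.shutdown(); // NODE_LEAVING broadcast + replication flush run first
  process.exit(0);
});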
@@ -8284,6 +9650,14 @@ var ServerCoordinator = class {
     if (this.replicationPipeline) {
       this.replicationPipeline.close();
     }
+    if (this.repairScheduler) {
+      this.repairScheduler.stop();
+      logger.info("RepairScheduler stopped");
+    }
+    if (this.partitionReassigner) {
+      this.partitionReassigner.stop();
+      logger.info("PartitionReassigner stopped");
+    }
     if (this.cluster) {
       this.cluster.stop();
     }
@@ -8499,7 +9873,32 @@ var ServerCoordinator = class {
     logger.info({ clientId: client.id, mapName, query }, "Client subscribed");
     this.metricsService.incOp("SUBSCRIBE", mapName);
     const allMembers = this.cluster.getMembers();
-
+    let remoteMembers = allMembers.filter((id) => !this.cluster.isLocal(id));
+    const queryKey = query._id || query.where?._id;
+    if (queryKey && typeof queryKey === "string" && this.readReplicaHandler) {
+      try {
+        const targetNode = this.readReplicaHandler.selectReadNode({
+          mapName,
+          key: queryKey,
+          options: {
+            // Default to EVENTUAL for read scaling unless specified otherwise
+            // In future, we could extract consistency from query options if available
+            consistency: ConsistencyLevel3.EVENTUAL
+          }
+        });
+        if (targetNode) {
+          if (this.cluster.isLocal(targetNode)) {
+            remoteMembers = [];
+            logger.debug({ clientId: client.id, mapName, key: queryKey }, "Read optimization: Serving locally");
+          } else if (remoteMembers.includes(targetNode)) {
+            remoteMembers = [targetNode];
+            logger.debug({ clientId: client.id, mapName, key: queryKey, targetNode }, "Read optimization: Routing to replica");
+          }
+        }
+      } catch (e) {
+        logger.warn({ err: e }, "Error in ReadReplicaHandler selection");
+      }
+    }
     const requestId = crypto.randomUUID();
     const pending = {
       requestId,
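The subscribe path above narrows remoteMembers based on what selectReadNode returns: the local node means answer from local data, a known remote replica means fan out to just that node, and anything else leaves the full member list untouched. A condensed restatement of that decision, with the handler and cluster standing in for the instances used above:

// Sketch only: condensed restatement of the routing logic shown above.
function pickReadTargets(allMembers, cluster, readReplicaHandler, mapName, key) {
  const targets = allMembers.filter((id) => !cluster.isLocal(id));
  const node = readReplicaHandler.selectReadNode({
    mapName,
    key,
    options: { consistency: ConsistencyLevel3.EVENTUAL }
  });
  if (node && cluster.isLocal(node)) return [];       // serve from the local replica
  if (node && targets.includes(node)) return [node];  // route to a single remote replica
  return targets;                                      // fall back to the full scatter
}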
@@ -9377,7 +10776,7 @@ var ServerCoordinator = class {
     };
     let broadcastCount = 0;
     for (const client of this.clients.values()) {
-      if (client.isAuthenticated && client.socket.readyState === WebSocket3.OPEN) {
+      if (client.isAuthenticated && client.socket.readyState === WebSocket3.OPEN && client.writer) {
         client.writer.write(message);
         broadcastCount++;
       }
@@ -9644,7 +11043,14 @@ var ServerCoordinator = class {
     this.cluster.on("message", (msg) => {
       switch (msg.type) {
         case "OP_FORWARD":
+          if (msg.payload._replication || msg.payload._migration) {
+            break;
+          }
           logger.info({ senderId: msg.senderId }, "Received forwarded op");
+          if (!msg.payload.key) {
+            logger.warn({ senderId: msg.senderId }, "OP_FORWARD missing key, dropping");
+            break;
+          }
           if (this.partitionService.isLocalOwner(msg.payload.key)) {
             this.processLocalOp(msg.payload, true, msg.senderId).catch((err) => {
               logger.error({ err, senderId: msg.senderId }, "Forwarded op failed");
@@ -9751,6 +11157,51 @@ var ServerCoordinator = class {
           this.topicManager.publish(topic, data, originalSenderId, true);
           break;
         }
+        // Phase 10.04: Anti-entropy repair messages
+        case "CLUSTER_MERKLE_ROOT_REQ": {
+          const { partitionId, requestId } = msg.payload;
+          const rootHash = this.merkleTreeManager?.getRootHash(partitionId) ?? 0;
+          this.cluster.send(msg.senderId, "CLUSTER_MERKLE_ROOT_RESP", {
+            requestId,
+            partitionId,
+            rootHash
+          });
+          break;
+        }
+        case "CLUSTER_MERKLE_ROOT_RESP": {
+          if (this.repairScheduler) {
+            this.repairScheduler.emit("merkleRootResponse", {
+              nodeId: msg.senderId,
+              ...msg.payload
+            });
+          }
+          break;
+        }
+        case "CLUSTER_REPAIR_DATA_REQ": {
+          const { partitionId, keys, requestId } = msg.payload;
+          const records = {};
+          for (const key of keys) {
+            const record = this.getLocalRecord(key);
+            if (record) {
+              records[key] = record;
+            }
+          }
+          this.cluster.send(msg.senderId, "CLUSTER_REPAIR_DATA_RESP", {
+            requestId,
+            partitionId,
+            records
+          });
+          break;
+        }
+        case "CLUSTER_REPAIR_DATA_RESP": {
+          if (this.repairScheduler) {
+            this.repairScheduler.emit("repairDataResponse", {
+              nodeId: msg.senderId,
+              ...msg.payload
+            });
+          }
+          break;
+        }
       }
     });
   }
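The four repair messages above form two request/response pairs; their payload shapes, read off the handlers (field names come from the code, the value types are inferred and not confirmed by the package):

// Inferred from the handlers above; rootHash/records value types are assumptions.
/** @typedef {{ partitionId: number, requestId: string }} ClusterMerkleRootReq */
/** @typedef {{ requestId: string, partitionId: number, rootHash: number }} ClusterMerkleRootResp */
/** @typedef {{ partitionId: number, keys: string[], requestId: string }} ClusterRepairDataReq */
/** @typedef {{ requestId: string, partitionId: number, records: Object }} ClusterRepairDataResp */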
@@ -10020,6 +11471,10 @@ var ServerCoordinator = class {
         nodeId: this._nodeId
       });
     }
+    if (this.merkleTreeManager && recordToStore && op.key) {
+      const partitionId = this.partitionService.getPartitionId(op.key);
+      this.merkleTreeManager.updateRecord(partitionId, op.key, recordToStore);
+    }
     return { eventPayload, oldRecord };
   }
   /**
@@ -10147,7 +11602,7 @@ var ServerCoordinator = class {
     if (rejected || !eventPayload) {
       return;
     }
-    if (this.replicationPipeline
+    if (this.replicationPipeline) {
       const opId = op.id || `${op.mapName}:${op.key}:${Date.now()}`;
       this.replicationPipeline.replicate(op, opId, op.key).catch((err) => {
         logger.warn({ opId, key: op.key, err }, "Replication failed (non-fatal)");
@@ -10290,6 +11745,10 @@ var ServerCoordinator = class {
   }
   handleClusterEvent(payload) {
     const { mapName, key, eventType } = payload;
+    if (!key) {
+      logger.warn({ mapName, eventType }, "Received cluster event with undefined key, ignoring");
+      return;
+    }
     const map = this.getMap(mapName, eventType === "OR_ADD" || eventType === "OR_REMOVE" ? "OR" : "LWW");
     const oldRecord = map instanceof LWWMap3 ? map.getRecord(key) : null;
     if (this.partitionService.isRelated(key)) {
@@ -10354,6 +11813,51 @@ var ServerCoordinator = class {
     }
     return this.maps.get(name);
   }
+  /**
+   * Phase 10.04: Get local record for anti-entropy repair
+   * Returns the LWWRecord for a key, used by RepairScheduler
+   */
+  getLocalRecord(key) {
+    const separatorIndex = key.indexOf(":");
+    if (separatorIndex === -1) {
+      return null;
+    }
+    const mapName = key.substring(0, separatorIndex);
+    const actualKey = key.substring(separatorIndex + 1);
+    const map = this.maps.get(mapName);
+    if (!map || !(map instanceof LWWMap3)) {
+      return null;
+    }
+    return map.getRecord(actualKey) ?? null;
+  }
+  /**
+   * Phase 10.04: Apply repaired record from anti-entropy repair
+   * Used by RepairScheduler to apply resolved conflicts
+   */
+  applyRepairRecord(key, record) {
+    const separatorIndex = key.indexOf(":");
+    if (separatorIndex === -1) {
+      logger.warn({ key }, "Invalid key format for repair");
+      return;
+    }
+    const mapName = key.substring(0, separatorIndex);
+    const actualKey = key.substring(separatorIndex + 1);
+    const map = this.getMap(mapName, "LWW");
+    const existingRecord = map.getRecord(actualKey);
+    if (!existingRecord || record.timestamp.millis > existingRecord.timestamp.millis || record.timestamp.millis === existingRecord.timestamp.millis && record.timestamp.counter > existingRecord.timestamp.counter) {
+      map.merge(actualKey, record);
+      logger.debug({ mapName, key: actualKey }, "Applied repair record");
+      if (this.storage) {
+        this.storage.store(mapName, actualKey, record).catch((err) => {
+          logger.error({ err, mapName, key: actualKey }, "Failed to persist repair record");
+        });
+      }
+      if (this.merkleTreeManager) {
+        const partitionId = this.partitionService.getPartitionId(actualKey);
+        this.merkleTreeManager.updateRecord(partitionId, actualKey, record);
+      }
+    }
+  }
   async loadMapFromStorage(name, typeHint) {
     try {
       const keys = await this.storage.loadAllKeys(name);
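getLocalRecord and applyRepairRecord above both address data through a composite "mapName:key" string, splitting on the first colon only, so entry keys may themselves contain colons. A tiny sketch of building and splitting such keys the same way (buildRepairKey/splitRepairKey are illustrative helpers, not package exports):

// Mirrors the parsing above; helper names are illustrative only.
function buildRepairKey(mapName, key) {
  return `${mapName}:${key}`;
}
function splitRepairKey(composite) {
  const i = composite.indexOf(":");
  if (i === -1) return null; // invalid format, as in the handlers above
  return { mapName: composite.slice(0, i), key: composite.slice(i + 1) };
}
splitRepairKey(buildRepairKey("users", "user:42")); // -> { mapName: "users", key: "user:42" }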
@@ -11239,7 +12743,7 @@ function logNativeStatus() {
 }

 // src/cluster/ClusterCoordinator.ts
-import { EventEmitter as
+import { EventEmitter as EventEmitter13 } from "events";
 import {
   DEFAULT_MIGRATION_CONFIG as DEFAULT_MIGRATION_CONFIG3,
   DEFAULT_REPLICATION_CONFIG as DEFAULT_REPLICATION_CONFIG3
@@ -11250,7 +12754,7 @@ var DEFAULT_CLUSTER_COORDINATOR_CONFIG = {
   replication: DEFAULT_REPLICATION_CONFIG3,
   replicationEnabled: true
 };
-var ClusterCoordinator = class extends
+var ClusterCoordinator = class extends EventEmitter13 {
   constructor(config) {
     super();
     this.replicationPipeline = null;
@@ -12064,12 +13568,18 @@ export {
   ConnectionRateLimiter,
   DEFAULT_CLUSTER_COORDINATOR_CONFIG,
   DEFAULT_CONFLICT_RESOLVER_CONFIG,
+  DEFAULT_FAILURE_DETECTOR_CONFIG,
   DEFAULT_INDEX_CONFIG,
   DEFAULT_JOURNAL_SERVICE_CONFIG,
   DEFAULT_LAG_TRACKER_CONFIG,
+  DEFAULT_MERKLE_TREE_CONFIG,
+  DEFAULT_READ_REPLICA_CONFIG,
+  DEFAULT_REASSIGNER_CONFIG,
+  DEFAULT_REPAIR_CONFIG,
   DEFAULT_SANDBOX_CONFIG,
   EntryProcessorHandler,
   EventJournalService,
+  FailureDetector,
   FilterTasklet,
   ForEachTasklet,
   IteratorTasklet,
@@ -12079,13 +13589,17 @@ export {
   MapTasklet,
   MapWithResolver,
   MemoryServerAdapter,
+  MerkleTreeManager,
   MigrationManager,
   ObjectPool,
+  PartitionReassigner,
   PartitionService,
   PostgresAdapter,
   ProcessorSandbox,
   RateLimitInterceptor,
+  ReadReplicaHandler,
   ReduceTasklet,
+  RepairScheduler,
   ReplicationPipeline,
   SecurityManager,
   ServerCoordinator,
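The export list now surfaces the Phase 10 cluster components added in this release. A minimal import sketch, with names taken verbatim from the export block above; that they resolve via the package's public entry point is an assumption about the package.json exports map:

// Names come from the export block above; construction of each is shown earlier in this diff.
import {
  FailureDetector,
  MerkleTreeManager,
  PartitionReassigner,
  ReadReplicaHandler,
  RepairScheduler,
  DEFAULT_REPAIR_CONFIG
} from "@topgunbuild/server";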