@topgunbuild/server 0.6.0 → 0.7.0

package/dist/index.mjs CHANGED
@@ -10,7 +10,7 @@ import { createServer as createHttpServer } from "http";
  import { createServer as createHttpsServer } from "https";
  import { readFileSync as readFileSync2 } from "fs";
  import { WebSocketServer as WebSocketServer2, WebSocket as WebSocket3 } from "ws";
- import { HLC as HLC2, LWWMap as LWWMap3, ORMap as ORMap2, serialize as serialize4, deserialize, MessageSchema, WriteConcern as WriteConcern2, ConsistencyLevel as ConsistencyLevel2, DEFAULT_REPLICATION_CONFIG as DEFAULT_REPLICATION_CONFIG2, IndexedLWWMap as IndexedLWWMap2, IndexedORMap as IndexedORMap2 } from "@topgunbuild/core";
+ import { HLC as HLC2, LWWMap as LWWMap3, ORMap as ORMap2, serialize as serialize4, deserialize, MessageSchema, WriteConcern as WriteConcern2, ConsistencyLevel as ConsistencyLevel3, DEFAULT_REPLICATION_CONFIG as DEFAULT_REPLICATION_CONFIG2, IndexedLWWMap as IndexedLWWMap2, IndexedORMap as IndexedORMap2 } from "@topgunbuild/core";
  import * as jwt from "jsonwebtoken";
  import * as crypto from "crypto";
 
@@ -1126,6 +1126,47 @@ var ClusterManager = class extends EventEmitter2 {
  handleHeartbeat(senderId, _payload) {
  this.failureDetector.recordHeartbeat(senderId);
  }
+ /**
+ * Send current member list to a specific node (gossip protocol).
+ * Called when a new node joins to propagate cluster topology.
+ */
+ sendMemberList(targetNodeId) {
+ const members = [];
+ for (const [nodeId, member] of this.members) {
+ members.push({
+ nodeId,
+ host: member.host,
+ port: member.port
+ });
+ }
+ this.send(targetNodeId, "MEMBER_LIST", { members });
+ logger.debug({ targetNodeId, memberCount: members.length }, "Sent member list");
+ }
+ /**
+ * Broadcast member list to all connected nodes.
+ * Called when cluster membership changes.
+ */
+ broadcastMemberList() {
+ for (const [nodeId, member] of this.members) {
+ if (member.isSelf) continue;
+ if (member.socket && member.socket.readyState === WebSocket.OPEN) {
+ this.sendMemberList(nodeId);
+ }
+ }
+ }
+ /**
+ * Handle incoming member list from a peer (gossip protocol).
+ * Attempts to connect to unknown members.
+ */
+ handleMemberList(payload) {
+ for (const memberInfo of payload.members) {
+ if (memberInfo.nodeId === this.config.nodeId) continue;
+ if (this.members.has(memberInfo.nodeId)) continue;
+ const peerAddress = `${memberInfo.host}:${memberInfo.port}`;
+ logger.info({ nodeId: memberInfo.nodeId, peerAddress }, "Discovered new member via gossip");
+ this.connectToPeer(peerAddress);
+ }
+ }
  /**
  * Handle confirmed node failure.
  */
@@ -1264,6 +1305,9 @@ var ClusterManager = class extends EventEmitter2 {
  this.failureDetector.startMonitoring(remoteNodeId);
  this.startHeartbeat();
  this.emit("memberJoined", remoteNodeId);
+ this.broadcastMemberList();
+ } else if (msg.type === "MEMBER_LIST") {
+ this.handleMemberList(msg.payload);
  } else if (msg.type === "HEARTBEAT") {
  if (remoteNodeId) {
  this.handleHeartbeat(remoteNodeId, msg.payload);
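
The three methods above implement a minimal push-based gossip: every JOIN triggers broadcastMemberList(), and each receiver connects to any member it has not yet seen, so a joining node needs only one reachable seed for the topology to converge. A hedged sketch of what that looks like from the operator's side; the ClusterManager options shape and the peers field are assumptions for illustration (only nodeId, host, and port appear in the diff), and the class may not be part of the public API:

    // TypeScript sketch, not the package's documented API.
    const nodeC = new ClusterManager({
      nodeId: "node-c",
      host: "10.0.0.3",
      port: 7003,
      peers: ["10.0.0.1:7001"], // one seed; node-b arrives via MEMBER_LIST gossip
    });
    nodeC.on("memberJoined", (nodeId: string) => {
      // fires for the seed (node-a), then again once gossip reveals node-b
      console.log("member joined:", nodeId);
    });
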
@@ -6565,236 +6609,1436 @@ var ReplicationPipeline = class extends EventEmitter8 {
  }
  };
 
- // src/handlers/CounterHandler.ts
- import { PNCounterImpl } from "@topgunbuild/core";
- var CounterHandler = class {
- // counterName -> Set<clientId>
- constructor(nodeId = "server") {
- this.nodeId = nodeId;
- this.counters = /* @__PURE__ */ new Map();
- this.subscriptions = /* @__PURE__ */ new Map();
+ // src/cluster/PartitionReassigner.ts
+ import { EventEmitter as EventEmitter9 } from "events";
+ import { DEFAULT_BACKUP_COUNT as DEFAULT_BACKUP_COUNT2 } from "@topgunbuild/core";
+ var DEFAULT_REASSIGNER_CONFIG = {
+ reassignmentDelayMs: 1e3,
+ maxConcurrentTransfers: 10,
+ autoPromoteBackups: true,
+ autoAssignNewBackups: true
+ };
+ var PartitionReassigner = class extends EventEmitter9 {
+ constructor(clusterManager, partitionService, config = {}) {
+ super();
+ this.failoverInProgress = false;
+ this.partitionsReassigned = 0;
+ this.pendingReassignments = /* @__PURE__ */ new Set();
+ this.clusterManager = clusterManager;
+ this.partitionService = partitionService;
+ this.config = { ...DEFAULT_REASSIGNER_CONFIG, ...config };
+ this.setupEventHandlers();
+ }
+ setupEventHandlers() {
+ this.clusterManager.on("nodeConfirmedFailed", (nodeId) => {
+ logger.warn({ nodeId }, "Node failure confirmed, initiating partition reassignment");
+ this.handleNodeFailure(nodeId);
+ });
+ this.clusterManager.on("memberLeft", (nodeId) => {
+ if (this.currentFailedNode !== nodeId) {
+ logger.info({ nodeId }, "Member left cluster, checking partition reassignment");
+ this.handleNodeDeparture(nodeId);
+ }
+ });
  }
  /**
- * Get or create a counter by name.
+ * Handle a node failure - initiates failover process
  */
- getOrCreateCounter(name) {
- let counter = this.counters.get(name);
- if (!counter) {
- counter = new PNCounterImpl({ nodeId: this.nodeId });
- this.counters.set(name, counter);
- logger.debug({ name }, "Created new counter");
+ handleNodeFailure(failedNodeId) {
+ if (this.failoverInProgress && this.currentFailedNode === failedNodeId) {
+ logger.debug({ failedNodeId }, "Failover already in progress for this node");
+ return;
  }
- return counter;
+ if (this.reassignmentTimer) {
+ clearTimeout(this.reassignmentTimer);
+ }
+ this.reassignmentTimer = setTimeout(() => {
+ this.executeFailover(failedNodeId);
+ }, this.config.reassignmentDelayMs);
  }
  /**
- * Handle COUNTER_REQUEST - client wants initial state.
- * @returns Response message to send back to client
+ * Handle a graceful node departure
  */
- handleCounterRequest(clientId, name) {
- const counter = this.getOrCreateCounter(name);
- this.subscribe(clientId, name);
- const state = counter.getState();
- logger.debug({ clientId, name, value: counter.get() }, "Counter request handled");
- return {
- type: "COUNTER_RESPONSE",
- payload: {
- name,
- state: this.stateToObject(state)
- }
- };
+ handleNodeDeparture(nodeId) {
+ const orphanedPartitions = this.findOrphanedPartitions(nodeId);
+ if (orphanedPartitions.length > 0) {
+ logger.warn({ nodeId, count: orphanedPartitions.length }, "Found orphaned partitions after departure");
+ this.executeFailover(nodeId);
+ }
  }
  /**
- * Handle COUNTER_SYNC - client sends their state to merge.
- * @returns Merged state and list of clients to broadcast to
+ * Execute the failover process for a failed node
  */
- handleCounterSync(clientId, name, stateObj) {
- const counter = this.getOrCreateCounter(name);
- const incomingState = this.objectToState(stateObj);
- counter.merge(incomingState);
- const mergedState = counter.getState();
- const mergedStateObj = this.stateToObject(mergedState);
- logger.debug(
- { clientId, name, value: counter.get() },
- "Counter sync handled"
- );
- this.subscribe(clientId, name);
- const subscribers = this.subscriptions.get(name) || /* @__PURE__ */ new Set();
- const broadcastTo = Array.from(subscribers).filter((id) => id !== clientId);
- return {
- // Response to the sending client
- response: {
- type: "COUNTER_UPDATE",
- payload: {
- name,
- state: mergedStateObj
- }
- },
- // Broadcast to other clients
- broadcastTo,
- broadcastMessage: {
- type: "COUNTER_UPDATE",
- payload: {
- name,
- state: mergedStateObj
- }
+ async executeFailover(failedNodeId) {
+ this.failoverInProgress = true;
+ this.currentFailedNode = failedNodeId;
+ this.reassignmentStartTime = Date.now();
+ this.partitionsReassigned = 0;
+ this.pendingReassignments.clear();
+ logger.info({ failedNodeId }, "Starting partition failover");
+ try {
+ const orphanedPartitions = this.findOrphanedPartitions(failedNodeId);
+ if (orphanedPartitions.length === 0) {
+ logger.info({ failedNodeId }, "No partitions to reassign");
+ this.completeFailover();
+ return;
  }
- };
+ logger.info({
+ failedNodeId,
+ partitionCount: orphanedPartitions.length
+ }, "Reassigning partitions from failed node");
+ for (const partitionId of orphanedPartitions) {
+ this.pendingReassignments.add(partitionId);
+ }
+ const changes = [];
+ for (const partitionId of orphanedPartitions) {
+ const change = await this.reassignPartition(partitionId, failedNodeId);
+ if (change) {
+ changes.push(change);
+ this.partitionsReassigned++;
+ }
+ this.pendingReassignments.delete(partitionId);
+ }
+ if (changes.length > 0) {
+ this.emit("partitionsReassigned", {
+ failedNodeId,
+ changes,
+ partitionMap: this.partitionService.getPartitionMap()
+ });
+ }
+ this.completeFailover();
+ } catch (error) {
+ logger.error({ failedNodeId, error }, "Failover failed");
+ this.emit("failoverError", { failedNodeId, error });
+ this.completeFailover();
+ }
  }
  /**
- * Subscribe a client to counter updates.
+ * Find all partitions that need reassignment
  */
- subscribe(clientId, counterName) {
- if (!this.subscriptions.has(counterName)) {
- this.subscriptions.set(counterName, /* @__PURE__ */ new Set());
+ findOrphanedPartitions(failedNodeId) {
+ const orphaned = [];
+ const partitionMap = this.partitionService.getPartitionMap();
+ for (const partition of partitionMap.partitions) {
+ if (partition.ownerNodeId === failedNodeId) {
+ orphaned.push(partition.partitionId);
+ }
  }
- this.subscriptions.get(counterName).add(clientId);
- logger.debug({ clientId, counterName }, "Client subscribed to counter");
+ return orphaned;
  }
  /**
- * Unsubscribe a client from counter updates.
+ * Reassign a single partition
  */
- unsubscribe(clientId, counterName) {
- const subs = this.subscriptions.get(counterName);
- if (subs) {
- subs.delete(clientId);
- if (subs.size === 0) {
- this.subscriptions.delete(counterName);
+ async reassignPartition(partitionId, failedNodeId) {
+ const currentBackups = this.partitionService.getBackups(partitionId);
+ const aliveMembers = this.clusterManager.getMembers().filter((m) => m !== failedNodeId);
+ if (aliveMembers.length === 0) {
+ logger.error({ partitionId }, "No alive members to reassign partition to");
+ return null;
+ }
+ let newOwner = null;
+ if (this.config.autoPromoteBackups) {
+ for (const backup of currentBackups) {
+ if (aliveMembers.includes(backup)) {
+ newOwner = backup;
+ break;
+ }
  }
  }
+ if (!newOwner) {
+ const ownerIndex = partitionId % aliveMembers.length;
+ newOwner = aliveMembers.sort()[ownerIndex];
+ }
+ this.partitionService.setOwner(partitionId, newOwner);
+ logger.info({
+ partitionId,
+ previousOwner: failedNodeId,
+ newOwner
+ }, "Partition owner promoted");
+ this.emit("reassignment", {
+ type: "backup-promoted",
+ partitionId,
+ previousOwner: failedNodeId,
+ newOwner
+ });
+ if (this.config.autoAssignNewBackups) {
+ const newBackups = this.selectBackups(partitionId, newOwner, aliveMembers);
+ }
+ return {
+ partitionId,
+ previousOwner: failedNodeId,
+ newOwner,
+ reason: "FAILOVER"
+ };
  }
  /**
- * Unsubscribe a client from all counters (e.g., on disconnect).
+ * Select backup nodes for a partition
  */
- unsubscribeAll(clientId) {
- for (const [counterName, subs] of this.subscriptions) {
- subs.delete(clientId);
- if (subs.size === 0) {
- this.subscriptions.delete(counterName);
- }
+ selectBackups(partitionId, owner, aliveMembers) {
+ const backups = [];
+ const sortedMembers = aliveMembers.filter((m) => m !== owner).sort();
+ const startIndex = partitionId % sortedMembers.length;
+ for (let i = 0; i < Math.min(DEFAULT_BACKUP_COUNT2, sortedMembers.length); i++) {
+ const backupIndex = (startIndex + i) % sortedMembers.length;
+ backups.push(sortedMembers[backupIndex]);
  }
- logger.debug({ clientId }, "Client unsubscribed from all counters");
+ return backups;
  }
  /**
- * Get current counter value (for monitoring/debugging).
+ * Complete the failover process
  */
- getCounterValue(name) {
- const counter = this.counters.get(name);
- return counter ? counter.get() : 0;
+ completeFailover() {
+ const duration = this.reassignmentStartTime ? Date.now() - this.reassignmentStartTime : 0;
+ logger.info({
+ failedNodeId: this.currentFailedNode,
+ partitionsReassigned: this.partitionsReassigned,
+ durationMs: duration
+ }, "Failover completed");
+ this.emit("failoverComplete", {
+ failedNodeId: this.currentFailedNode,
+ partitionsReassigned: this.partitionsReassigned,
+ durationMs: duration
+ });
+ this.failoverInProgress = false;
+ this.currentFailedNode = void 0;
+ this.reassignmentStartTime = void 0;
+ this.pendingReassignments.clear();
  }
  /**
- * Get all counter names.
+ * Get current failover status
  */
- getCounterNames() {
- return Array.from(this.counters.keys());
+ getStatus() {
+ return {
+ inProgress: this.failoverInProgress,
+ failedNodeId: this.currentFailedNode,
+ partitionsReassigned: this.partitionsReassigned,
+ partitionsPending: this.pendingReassignments.size,
+ startedAt: this.reassignmentStartTime,
+ completedAt: this.failoverInProgress ? void 0 : Date.now()
+ };
  }
  /**
- * Get number of subscribers for a counter.
+ * Check if failover is in progress
  */
- getSubscriberCount(name) {
- return this.subscriptions.get(name)?.size || 0;
+ isFailoverInProgress() {
+ return this.failoverInProgress;
  }
  /**
- * Convert Map-based state to plain object for serialization.
+ * Force immediate reassignment (for testing/manual intervention)
  */
- stateToObject(state) {
- return {
- p: Object.fromEntries(state.positive),
- n: Object.fromEntries(state.negative)
- };
+ forceReassignment(failedNodeId) {
+ if (this.reassignmentTimer) {
+ clearTimeout(this.reassignmentTimer);
+ }
+ this.executeFailover(failedNodeId);
  }
  /**
- * Convert plain object to Map-based state.
+ * Stop any pending reassignment
  */
- objectToState(obj) {
- return {
- positive: new Map(Object.entries(obj.p || {})),
- negative: new Map(Object.entries(obj.n || {}))
- };
+ stop() {
+ if (this.reassignmentTimer) {
+ clearTimeout(this.reassignmentTimer);
+ this.reassignmentTimer = void 0;
+ }
+ this.failoverInProgress = false;
+ this.pendingReassignments.clear();
  }
  };
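
PartitionReassigner turns failure-detector verdicts into ownership changes: after reassignmentDelayMs it promotes the first alive backup of each orphaned partition (falling back to a deterministic partitionId % aliveMembers.length pick), then emits a single partitionsReassigned batch. A hedged usage sketch, assuming the class were importable and that cluster and partitionService instances already exist; the event payload fields are taken from the emit() calls above:

    // TypeScript sketch.
    const reassigner = new PartitionReassigner(cluster, partitionService, {
      reassignmentDelayMs: 1000, // grace period in case the node comes back
    });
    reassigner.on("reassignment", (e: any) => {
      // e.type === "backup-promoted" when a backup replica took ownership
      console.log(`partition ${e.partitionId}: ${e.previousOwner} -> ${e.newOwner}`);
    });
    reassigner.on("failoverComplete", (e: any) => {
      console.log(`failover done: ${e.partitionsReassigned} partitions in ${e.durationMs}ms`);
    });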
 
- // src/handlers/EntryProcessorHandler.ts
- import {
- EntryProcessorDefSchema
- } from "@topgunbuild/core";
-
- // src/ProcessorSandbox.ts
- import {
- validateProcessorCode
- } from "@topgunbuild/core";
- var ivm = null;
- try {
- ivm = __require("isolated-vm");
- } catch {
- const isProduction = process.env.NODE_ENV === "production";
- if (isProduction) {
- logger.error(
- "SECURITY WARNING: isolated-vm not available in production! Entry processors will run in less secure fallback mode. Install isolated-vm for production environments: pnpm add isolated-vm"
- );
- } else {
- logger.warn("isolated-vm not available, falling back to less secure VM");
- }
- }
- var DEFAULT_SANDBOX_CONFIG = {
- memoryLimitMb: 8,
- timeoutMs: 100,
- maxCachedIsolates: 100,
- strictValidation: true
+ // src/cluster/ReadReplicaHandler.ts
+ import { EventEmitter as EventEmitter10 } from "events";
+ import { ConsistencyLevel as ConsistencyLevel2 } from "@topgunbuild/core";
+ var DEFAULT_READ_REPLICA_CONFIG = {
+ defaultConsistency: ConsistencyLevel2.STRONG,
+ maxStalenessMs: 5e3,
+ preferLocalReplica: true,
+ loadBalancing: "latency-based"
  };
- var ProcessorSandbox = class {
- constructor(config = {}) {
- this.isolateCache = /* @__PURE__ */ new Map();
- this.scriptCache = /* @__PURE__ */ new Map();
- this.fallbackScriptCache = /* @__PURE__ */ new Map();
- this.disposed = false;
- this.config = { ...DEFAULT_SANDBOX_CONFIG, ...config };
+ var ReadReplicaHandler = class extends EventEmitter10 {
+ constructor(partitionService, clusterManager, nodeId, lagTracker, config = {}) {
+ super();
+ // Round-robin counters for load balancing
+ this.roundRobinCounters = /* @__PURE__ */ new Map();
+ this.partitionService = partitionService;
+ this.clusterManager = clusterManager;
+ this.nodeId = nodeId;
+ this.lagTracker = lagTracker;
+ this.config = { ...DEFAULT_READ_REPLICA_CONFIG, ...config };
  }
  /**
- * Execute an entry processor in the sandbox.
- *
- * @param processor The processor definition (name, code, args)
- * @param value The current value for the key (or undefined)
- * @param key The key being processed
- * @returns Result containing success status, result, and new value
+ * Determine if a read request can be served locally
  */
- async execute(processor, value, key) {
- if (this.disposed) {
- return {
- success: false,
- error: "Sandbox has been disposed"
- };
+ canServeLocally(request) {
+ const consistency = request.options?.consistency ?? this.config.defaultConsistency;
+ if (consistency === ConsistencyLevel2.STRONG) {
+ return this.partitionService.isLocalOwner(request.key);
  }
- if (this.config.strictValidation) {
- const validation = validateProcessorCode(processor.code);
- if (!validation.valid) {
- return {
- success: false,
- error: validation.error
- };
+ return this.partitionService.isRelated(request.key);
+ }
+ /**
+ * Determine which node should handle the read
+ */
+ selectReadNode(request) {
+ const key = request.key;
+ const consistency = request.options?.consistency ?? this.config.defaultConsistency;
+ const partitionId = this.partitionService.getPartitionId(key);
+ const distribution = this.partitionService.getDistribution(key);
+ if (consistency === ConsistencyLevel2.STRONG) {
+ if (!this.isNodeAlive(distribution.owner)) {
+ if (request.options?.allowStale) {
+ return this.selectAliveBackup(distribution.backups);
+ }
+ return null;
  }
+ return distribution.owner;
  }
- if (ivm) {
- return this.executeInIsolate(processor, value, key);
- } else {
- return this.executeInFallback(processor, value, key);
+ const allReplicas = [distribution.owner, ...distribution.backups];
+ const aliveReplicas = allReplicas.filter((n) => this.isNodeAlive(n));
+ if (aliveReplicas.length === 0) {
+ return null;
+ }
+ if (request.options?.maxStaleness) {
+ const withinStaleness = aliveReplicas.filter(
+ (n) => this.getNodeStaleness(n) <= (request.options?.maxStaleness ?? Infinity)
+ );
+ if (withinStaleness.length > 0) {
+ return this.selectByStrategy(withinStaleness, partitionId);
+ }
+ if (this.isNodeAlive(distribution.owner)) {
+ return distribution.owner;
+ }
  }
+ if (this.config.preferLocalReplica && aliveReplicas.includes(this.nodeId)) {
+ return this.nodeId;
+ }
+ return this.selectByStrategy(aliveReplicas, partitionId);
  }
  /**
- * Execute processor in isolated-vm (secure production mode).
+ * Select replica using configured load balancing strategy
  */
- async executeInIsolate(processor, value, key) {
- if (!ivm) {
- return { success: false, error: "isolated-vm not available" };
+ selectByStrategy(replicas, partitionId) {
+ if (replicas.length === 0) {
+ throw new Error("No replicas available");
  }
- const isolate = this.getOrCreateIsolate(processor.name);
- try {
- const context = await isolate.createContext();
- const jail = context.global;
- await jail.set("global", jail.derefInto());
- await context.eval(`
- var value = ${JSON.stringify(value)};
- var key = ${JSON.stringify(key)};
- var args = ${JSON.stringify(processor.args)};
+ if (replicas.length === 1) {
+ return replicas[0];
+ }
+ switch (this.config.loadBalancing) {
+ case "round-robin":
+ return this.selectRoundRobin(replicas, partitionId);
+ case "latency-based":
+ return this.selectByLatency(replicas);
+ case "least-connections":
+ return this.selectRoundRobin(replicas, partitionId);
+ default:
+ return replicas[0];
+ }
+ }
+ /**
+ * Round-robin selection
+ */
+ selectRoundRobin(replicas, partitionId) {
+ const counter = this.roundRobinCounters.get(partitionId) ?? 0;
+ const selected = replicas[counter % replicas.length];
+ this.roundRobinCounters.set(partitionId, counter + 1);
+ return selected;
+ }
+ /**
+ * Latency-based selection using lag tracker
+ */
+ selectByLatency(replicas) {
+ if (!this.lagTracker) {
+ return replicas[0];
+ }
+ let bestNode = replicas[0];
+ let bestLatency = Infinity;
+ for (const nodeId of replicas) {
+ const lag = this.lagTracker.getLag(nodeId);
+ if (lag && lag.current < bestLatency) {
+ bestLatency = lag.current;
+ bestNode = nodeId;
+ }
+ }
+ return bestNode;
+ }
+ /**
+ * Get estimated staleness for a node in ms
+ */
+ getNodeStaleness(nodeId) {
+ if (nodeId === this.partitionService.getOwner("")) {
+ return 0;
+ }
+ if (this.lagTracker) {
+ const lag = this.lagTracker.getLag(nodeId);
+ return lag?.current ?? 0;
+ }
+ return 0;
+ }
+ /**
+ * Check if a node is alive in the cluster
+ */
+ isNodeAlive(nodeId) {
+ const members = this.clusterManager.getMembers();
+ return members.includes(nodeId);
+ }
+ /**
+ * Select first alive backup from list
+ */
+ selectAliveBackup(backups) {
+ for (const backup of backups) {
+ if (this.isNodeAlive(backup)) {
+ return backup;
+ }
+ }
+ return null;
+ }
+ /**
+ * Create read response metadata
+ */
+ createReadMetadata(key, options) {
+ const consistency = options?.consistency ?? this.config.defaultConsistency;
+ const isOwner = this.partitionService.isLocalOwner(key);
+ return {
+ source: this.nodeId,
+ isOwner,
+ consistency
+ };
+ }
+ /**
+ * Check if local node should forward read to owner
+ */
+ shouldForwardRead(request) {
+ const consistency = request.options?.consistency ?? this.config.defaultConsistency;
+ if (consistency === ConsistencyLevel2.STRONG) {
+ return !this.partitionService.isLocalOwner(request.key);
+ }
+ if (!this.partitionService.isRelated(request.key)) {
+ return true;
+ }
+ return false;
+ }
+ /**
+ * Get metrics for monitoring
+ */
+ getMetrics() {
+ return {
+ defaultConsistency: this.config.defaultConsistency,
+ preferLocalReplica: this.config.preferLocalReplica,
+ loadBalancing: this.config.loadBalancing,
+ roundRobinPartitions: this.roundRobinCounters.size
+ };
+ }
+ };
+
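
The routing rule implemented by selectReadNode is: STRONG reads must reach the partition owner (or an alive backup only when allowStale is set), while EVENTUAL reads may hit any alive replica, filtered by maxStaleness when a lag tracker is present, preferring the local node before falling back to the configured load-balancing strategy. A hedged caller sketch, assuming a readReplicaHandler instance and ConsistencyLevel imported from @topgunbuild/core; the request shape { key, options } is inferred from the accessors above:

    // TypeScript sketch.
    const target = readReplicaHandler.selectReadNode({
      key: "user:42",
      options: {
        consistency: ConsistencyLevel.EVENTUAL, // any alive replica qualifies
        maxStaleness: 2000, // skip replicas lagging more than 2s, when tracked
      },
    });
    if (target === null) {
      // no replica satisfied the constraints: fail or retry
    } else if (target === localNodeId) {
      // serve from the local store
    } else {
      // forward the read to `target` over the cluster transport
    }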
+ // src/cluster/MerkleTreeManager.ts
+ import { EventEmitter as EventEmitter11 } from "events";
+ import { MerkleTree, hashString as hashString2 } from "@topgunbuild/core";
+ var DEFAULT_MERKLE_TREE_CONFIG = {
+ treeDepth: 3,
+ autoUpdate: true,
+ lazyInit: true
+ };
+ var MerkleTreeManager = class extends EventEmitter11 {
+ constructor(nodeId, config = {}) {
+ super();
+ this.trees = /* @__PURE__ */ new Map();
+ this.keyCounts = /* @__PURE__ */ new Map();
+ this.lastUpdated = /* @__PURE__ */ new Map();
+ this.nodeId = nodeId;
+ this.config = { ...DEFAULT_MERKLE_TREE_CONFIG, ...config };
+ }
+ /**
+ * Get or create a Merkle tree for a partition
+ */
+ getTree(partitionId) {
+ let tree = this.trees.get(partitionId);
+ if (!tree) {
+ tree = new MerkleTree(/* @__PURE__ */ new Map(), this.config.treeDepth);
+ this.trees.set(partitionId, tree);
+ this.keyCounts.set(partitionId, 0);
+ this.lastUpdated.set(partitionId, Date.now());
+ }
+ return tree;
+ }
+ /**
+ * Build tree for a partition from existing data
+ */
+ buildTree(partitionId, records) {
+ const tree = new MerkleTree(records, this.config.treeDepth);
+ this.trees.set(partitionId, tree);
+ this.keyCounts.set(partitionId, records.size);
+ this.lastUpdated.set(partitionId, Date.now());
+ logger.debug({
+ partitionId,
+ keyCount: records.size,
+ rootHash: tree.getRootHash()
+ }, "Built Merkle tree for partition");
+ }
+ /**
+ * Incrementally update tree when a record changes
+ */
+ updateRecord(partitionId, key, record) {
+ if (!this.config.autoUpdate) return;
+ const tree = this.getTree(partitionId);
+ const previousKeyCount = this.keyCounts.get(partitionId) ?? 0;
+ const existingBuckets = tree.getBuckets("");
+ const wasNewKey = Object.keys(existingBuckets).length === 0 || !tree.getKeysInBucket(this.getKeyPath(key)).includes(key);
+ tree.update(key, record);
+ if (wasNewKey) {
+ this.keyCounts.set(partitionId, previousKeyCount + 1);
+ }
+ this.lastUpdated.set(partitionId, Date.now());
+ this.emit("treeUpdated", {
+ partitionId,
+ key,
+ rootHash: tree.getRootHash()
+ });
+ }
+ /**
+ * Remove a key from the tree (e.g., after GC)
+ */
+ removeRecord(partitionId, key) {
+ const tree = this.trees.get(partitionId);
+ if (!tree) return;
+ tree.remove(key);
+ const currentCount = this.keyCounts.get(partitionId) ?? 0;
+ if (currentCount > 0) {
+ this.keyCounts.set(partitionId, currentCount - 1);
+ }
+ this.lastUpdated.set(partitionId, Date.now());
+ this.emit("treeUpdated", {
+ partitionId,
+ key,
+ rootHash: tree.getRootHash()
+ });
+ }
+ /**
+ * Get the path prefix for a key in the Merkle tree
+ */
+ getKeyPath(key) {
+ const hash = hashString2(key).toString(16).padStart(8, "0");
+ return hash.slice(0, this.config.treeDepth);
+ }
+ /**
+ * Get root hash for a partition
+ */
+ getRootHash(partitionId) {
+ const tree = this.trees.get(partitionId);
+ return tree?.getRootHash() ?? 0;
+ }
+ /**
+ * Compare local tree with remote root hash
+ */
+ compareWithRemote(partitionId, remoteRoot) {
+ const tree = this.getTree(partitionId);
+ const localRoot = tree.getRootHash();
+ return {
+ partitionId,
+ localRoot,
+ remoteRoot,
+ needsSync: localRoot !== remoteRoot,
+ differingBuckets: localRoot !== remoteRoot ? this.findDifferingBuckets(tree, remoteRoot) : []
+ };
+ }
+ /**
+ * Find buckets that differ between local and remote tree
+ * Note: This is a simplified version - full implementation would
+ * need to exchange bucket hashes with the remote node
+ */
+ findDifferingBuckets(tree, _remoteRoot) {
+ const buckets = [];
+ this.collectLeafBuckets(tree, "", buckets);
+ return buckets;
+ }
+ /**
+ * Recursively collect all leaf bucket paths
+ */
+ collectLeafBuckets(tree, path, result) {
+ if (path.length >= this.config.treeDepth) {
+ const keys = tree.getKeysInBucket(path);
+ if (keys.length > 0) {
+ result.push(path);
+ }
+ return;
+ }
+ const buckets = tree.getBuckets(path);
+ for (const char of Object.keys(buckets)) {
+ this.collectLeafBuckets(tree, path + char, result);
+ }
+ }
+ /**
+ * Get bucket hashes for a partition at a given path
+ */
+ getBuckets(partitionId, path) {
+ const tree = this.trees.get(partitionId);
+ return tree?.getBuckets(path) ?? {};
+ }
+ /**
+ * Get keys in a specific bucket
+ */
+ getKeysInBucket(partitionId, path) {
+ const tree = this.trees.get(partitionId);
+ return tree?.getKeysInBucket(path) ?? [];
+ }
+ /**
+ * Get all keys across all buckets for a partition
+ */
+ getAllKeys(partitionId) {
+ const tree = this.trees.get(partitionId);
+ if (!tree) return [];
+ const keys = [];
+ this.collectAllKeys(tree, "", keys);
+ return keys;
+ }
+ /**
+ * Recursively collect all keys from the tree
+ */
+ collectAllKeys(tree, path, result) {
+ if (path.length >= this.config.treeDepth) {
+ const keys = tree.getKeysInBucket(path);
+ result.push(...keys);
+ return;
+ }
+ const buckets = tree.getBuckets(path);
+ for (const char of Object.keys(buckets)) {
+ this.collectAllKeys(tree, path + char, result);
+ }
+ }
+ /**
+ * Get info about all managed partitions
+ */
+ getPartitionInfos() {
+ const infos = [];
+ for (const [partitionId, tree] of this.trees) {
+ infos.push({
+ partitionId,
+ rootHash: tree.getRootHash(),
+ keyCount: this.keyCounts.get(partitionId) ?? 0,
+ lastUpdated: this.lastUpdated.get(partitionId) ?? 0
+ });
+ }
+ return infos;
+ }
+ /**
+ * Get info for a specific partition
+ */
+ getPartitionInfo(partitionId) {
+ const tree = this.trees.get(partitionId);
+ if (!tree) return null;
+ return {
+ partitionId,
+ rootHash: tree.getRootHash(),
+ keyCount: this.keyCounts.get(partitionId) ?? 0,
+ lastUpdated: this.lastUpdated.get(partitionId) ?? 0
+ };
+ }
+ /**
+ * Clear tree for a partition (e.g., after migration)
+ */
+ clearPartition(partitionId) {
+ this.trees.delete(partitionId);
+ this.keyCounts.delete(partitionId);
+ this.lastUpdated.delete(partitionId);
+ }
+ /**
+ * Clear all trees
+ */
+ clearAll() {
+ this.trees.clear();
+ this.keyCounts.clear();
+ this.lastUpdated.clear();
+ }
+ /**
+ * Get metrics for monitoring
+ */
+ getMetrics() {
+ let totalKeys = 0;
+ for (const count of this.keyCounts.values()) {
+ totalKeys += count;
+ }
+ return {
+ totalPartitions: this.trees.size,
+ totalKeys,
+ averageKeysPerPartition: this.trees.size > 0 ? totalKeys / this.trees.size : 0
+ };
+ }
+ /**
+ * Serialize tree state for network transfer
+ */
+ serializeTree(partitionId) {
+ const tree = this.trees.get(partitionId);
+ if (!tree) return null;
+ const buckets = {};
+ for (let depth = 0; depth < this.config.treeDepth; depth++) {
+ this.collectBucketsAtDepth(tree, "", depth, buckets);
+ }
+ return {
+ rootHash: tree.getRootHash(),
+ buckets
+ };
+ }
+ collectBucketsAtDepth(tree, path, targetDepth, result) {
+ if (path.length === targetDepth) {
+ const buckets2 = tree.getBuckets(path);
+ if (Object.keys(buckets2).length > 0) {
+ result[path] = buckets2;
+ }
+ return;
+ }
+ if (path.length > targetDepth) return;
+ const buckets = tree.getBuckets(path);
+ for (const char of Object.keys(buckets)) {
+ this.collectBucketsAtDepth(tree, path + char, targetDepth, result);
+ }
+ }
+ };
+
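
getKeyPath buckets each key by the first treeDepth hex digits of its hash, so the default depth of 3 yields at most 16^3 = 4096 leaf buckets and a single root-hash comparison decides whether two replicas of a partition diverge. A hedged divergence-detection sketch; the record and timestamp shapes are illustrative only, and in practice the remote root arrives via CLUSTER_MERKLE_ROOT_REQ/RESP:

    // TypeScript sketch.
    const trees = new MerkleTreeManager("node-a"); // default treeDepth: 3
    trees.buildTree(7, new Map([
      ["user:42", { value: 1, timestamp: { millis: 1700000000000, counter: 0, nodeId: "node-a" } }],
    ]));
    const diff = trees.compareWithRemote(7, 0 /* remote root hash */);
    if (diff.needsSync) {
      // descend diff.differingBuckets, exchange bucket hashes, repair keys
    }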
+ // src/cluster/RepairScheduler.ts
+ import { EventEmitter as EventEmitter12 } from "events";
+ import { PARTITION_COUNT as PARTITION_COUNT4 } from "@topgunbuild/core";
+ var DEFAULT_REPAIR_CONFIG = {
+ enabled: true,
+ scanIntervalMs: 36e5,
+ // 1 hour
+ repairBatchSize: 1e3,
+ maxConcurrentRepairs: 2,
+ throttleMs: 100,
+ prioritizeRecent: true,
+ requestTimeoutMs: 5e3
+ };
+ var RepairScheduler = class extends EventEmitter12 {
+ constructor(merkleManager, clusterManager, partitionService, nodeId, config = {}) {
+ super();
+ this.repairQueue = [];
+ this.activeRepairs = /* @__PURE__ */ new Set();
+ this.started = false;
+ // Pending network requests
+ this.pendingRequests = /* @__PURE__ */ new Map();
+ // Metrics
+ this.metrics = {
+ scansCompleted: 0,
+ repairsExecuted: 0,
+ keysRepaired: 0,
+ errorsEncountered: 0,
+ averageRepairDurationMs: 0
+ };
+ this.merkleManager = merkleManager;
+ this.clusterManager = clusterManager;
+ this.partitionService = partitionService;
+ this.nodeId = nodeId;
+ this.config = { ...DEFAULT_REPAIR_CONFIG, ...config };
+ this.setupNetworkHandlers();
+ }
+ /**
+ * Set data access callbacks
+ */
+ setDataAccessors(getRecord, setRecord) {
+ this.getRecord = getRecord;
+ this.setRecord = setRecord;
+ }
+ /**
+ * Setup network message handlers
+ */
+ setupNetworkHandlers() {
+ this.clusterManager.on("message", (msg) => {
+ this.handleClusterMessage(msg);
+ });
+ }
+ /**
+ * Handle incoming cluster messages
+ */
+ handleClusterMessage(msg) {
+ switch (msg.type) {
+ case "CLUSTER_MERKLE_ROOT_REQ":
+ this.handleMerkleRootReq(msg);
+ break;
+ case "CLUSTER_MERKLE_ROOT_RESP":
+ this.handleResponse(msg);
+ break;
+ case "CLUSTER_MERKLE_BUCKETS_REQ":
+ this.handleMerkleBucketsReq(msg);
+ break;
+ case "CLUSTER_MERKLE_BUCKETS_RESP":
+ this.handleResponse(msg);
+ break;
+ case "CLUSTER_MERKLE_KEYS_REQ":
+ this.handleMerkleKeysReq(msg);
+ break;
+ case "CLUSTER_MERKLE_KEYS_RESP":
+ this.handleResponse(msg);
+ break;
+ case "CLUSTER_REPAIR_DATA_REQ":
+ this.handleRepairDataReq(msg);
+ break;
+ case "CLUSTER_REPAIR_DATA_RESP":
+ this.handleResponse(msg);
+ break;
+ }
+ }
+ // === Request Handlers (Passive) ===
+ handleMerkleRootReq(msg) {
+ const { requestId, partitionId } = msg.payload;
+ const rootHash = this.merkleManager.getRootHash(partitionId);
+ this.clusterManager.send(msg.senderId, "CLUSTER_MERKLE_ROOT_RESP", {
+ requestId,
+ partitionId,
+ rootHash
+ });
+ }
+ handleMerkleBucketsReq(msg) {
+ const { requestId, partitionId } = msg.payload;
+ const tree = this.merkleManager.serializeTree(partitionId);
+ this.clusterManager.send(msg.senderId, "CLUSTER_MERKLE_BUCKETS_RESP", {
+ requestId,
+ partitionId,
+ buckets: tree?.buckets || {}
+ });
+ }
+ handleMerkleKeysReq(msg) {
+ const { requestId, partitionId, path } = msg.payload;
+ const keys = this.merkleManager.getKeysInBucket(partitionId, path);
+ this.clusterManager.send(msg.senderId, "CLUSTER_MERKLE_KEYS_RESP", {
+ requestId,
+ partitionId,
+ path,
+ keys
+ });
+ }
+ handleRepairDataReq(msg) {
+ const { requestId, key } = msg.payload;
+ if (!this.getRecord) return;
+ const record = this.getRecord(key);
+ this.clusterManager.send(msg.senderId, "CLUSTER_REPAIR_DATA_RESP", {
+ requestId,
+ key,
+ record
+ });
+ }
+ handleResponse(msg) {
+ const { requestId } = msg.payload;
+ const pending = this.pendingRequests.get(requestId);
+ if (pending) {
+ clearTimeout(pending.timer);
+ this.pendingRequests.delete(requestId);
+ pending.resolve(msg.payload);
+ }
+ }
+ // === Lifecycle Methods ===
+ /**
+ * Start the repair scheduler
+ */
+ start() {
+ if (this.started || !this.config.enabled) return;
+ this.started = true;
+ logger.info({ config: this.config }, "Starting RepairScheduler");
+ this.scanTimer = setInterval(() => {
+ this.scheduleFullScan();
+ }, this.config.scanIntervalMs);
+ this.processTimer = setInterval(() => {
+ this.processRepairQueue();
+ }, 1e3);
+ setTimeout(() => {
+ this.scheduleFullScan();
+ }, 6e4);
+ }
+ /**
+ * Stop the repair scheduler
+ */
+ stop() {
+ if (!this.started) return;
+ this.started = false;
+ if (this.scanTimer) {
+ clearInterval(this.scanTimer);
+ this.scanTimer = void 0;
+ }
+ if (this.processTimer) {
+ clearInterval(this.processTimer);
+ this.processTimer = void 0;
+ }
+ this.repairQueue = [];
+ this.activeRepairs.clear();
+ for (const [id, req] of this.pendingRequests) {
+ clearTimeout(req.timer);
+ req.reject(new Error("Scheduler stopped"));
+ }
+ this.pendingRequests.clear();
+ logger.info("RepairScheduler stopped");
+ }
+ /**
+ * Schedule a full scan of all owned partitions
+ */
+ scheduleFullScan() {
+ const ownedPartitions = this.getOwnedPartitions();
+ const replicas = this.getReplicaPartitions();
+ const allPartitions = [.../* @__PURE__ */ new Set([...ownedPartitions, ...replicas])];
+ logger.info({
+ ownedCount: ownedPartitions.length,
+ replicaCount: replicas.length,
+ totalPartitions: allPartitions.length
+ }, "Scheduling full anti-entropy scan");
+ for (const partitionId of allPartitions) {
+ this.schedulePartitionRepair(partitionId);
+ }
+ this.metrics.scansCompleted++;
+ this.metrics.lastScanTime = Date.now();
+ }
+ /**
+ * Schedule repair for a specific partition
+ */
+ schedulePartitionRepair(partitionId, priority = "normal") {
+ const backups = this.partitionService.getBackups(partitionId);
+ const owner = this.partitionService.getPartitionOwner(partitionId);
+ const replicas = this.nodeId === owner ? backups : owner ? [owner] : [];
+ for (const replicaNodeId of replicas) {
+ const exists = this.repairQueue.some(
+ (t) => t.partitionId === partitionId && t.replicaNodeId === replicaNodeId
+ );
+ if (exists) continue;
+ this.repairQueue.push({
+ partitionId,
+ replicaNodeId,
+ priority,
+ scheduledAt: Date.now()
+ });
+ }
+ this.sortRepairQueue();
+ }
+ /**
+ * Sort repair queue by priority
+ */
+ sortRepairQueue() {
+ const priorityOrder = { high: 0, normal: 1, low: 2 };
+ this.repairQueue.sort((a, b) => {
+ const priorityDiff = priorityOrder[a.priority] - priorityOrder[b.priority];
+ if (priorityDiff !== 0) return priorityDiff;
+ if (this.config.prioritizeRecent) {
+ const infoA = this.merkleManager.getPartitionInfo(a.partitionId);
+ const infoB = this.merkleManager.getPartitionInfo(b.partitionId);
+ if (infoA && infoB) {
+ return infoB.lastUpdated - infoA.lastUpdated;
+ }
+ }
+ return a.scheduledAt - b.scheduledAt;
+ });
+ }
+ /**
+ * Process the repair queue
+ */
+ async processRepairQueue() {
+ if (this.activeRepairs.size >= this.config.maxConcurrentRepairs) {
+ return;
+ }
+ const task = this.repairQueue.shift();
+ if (!task) return;
+ if (this.activeRepairs.has(task.partitionId)) {
+ return;
+ }
+ if (!this.clusterManager.getMembers().includes(task.replicaNodeId)) {
+ logger.debug({ task }, "Skipping repair - replica not available");
+ return;
+ }
+ this.activeRepairs.add(task.partitionId);
+ try {
+ const result = await this.executeRepair(task);
+ this.emit("repairComplete", result);
+ if (result.success) {
+ this.metrics.repairsExecuted++;
+ this.metrics.keysRepaired += result.keysRepaired;
+ this.updateAverageRepairDuration(result.durationMs);
+ } else {
+ this.metrics.errorsEncountered++;
+ }
+ } catch (error) {
+ logger.error({ task, error }, "Repair failed");
+ this.metrics.errorsEncountered++;
+ } finally {
+ this.activeRepairs.delete(task.partitionId);
+ }
+ }
+ /**
+ * Execute repair for a partition-replica pair
+ */
+ async executeRepair(task) {
+ const startTime = Date.now();
+ let keysScanned = 0;
+ let keysRepaired = 0;
+ try {
+ const localRoot = this.merkleManager.getRootHash(task.partitionId);
+ const remoteRoot = await this.requestRemoteMerkleRoot(task.replicaNodeId, task.partitionId);
+ if (localRoot === remoteRoot) {
+ logger.debug({
+ partitionId: task.partitionId,
+ replicaNodeId: task.replicaNodeId
+ }, "Partition in sync");
+ return {
+ partitionId: task.partitionId,
+ replicaNodeId: task.replicaNodeId,
+ keysScanned: 0,
+ keysRepaired: 0,
+ durationMs: Date.now() - startTime,
+ success: true
+ };
+ }
+ const differences = await this.findDifferences(task.partitionId, task.replicaNodeId);
+ keysScanned = differences.length;
+ for (const key of differences) {
+ const repaired = await this.repairKey(task.partitionId, task.replicaNodeId, key);
+ if (repaired) {
+ keysRepaired++;
+ }
+ if (keysRepaired % this.config.repairBatchSize === 0) {
+ await this.sleep(this.config.throttleMs);
+ }
+ }
+ logger.info({
+ partitionId: task.partitionId,
+ replicaNodeId: task.replicaNodeId,
+ keysScanned,
+ keysRepaired,
+ durationMs: Date.now() - startTime
+ }, "Partition repair completed");
+ return {
+ partitionId: task.partitionId,
+ replicaNodeId: task.replicaNodeId,
+ keysScanned,
+ keysRepaired,
+ durationMs: Date.now() - startTime,
+ success: true
+ };
+ } catch (error) {
+ return {
+ partitionId: task.partitionId,
+ replicaNodeId: task.replicaNodeId,
+ keysScanned,
+ keysRepaired,
+ durationMs: Date.now() - startTime,
+ success: false,
+ error: String(error)
+ };
+ }
+ }
+ /**
+ * Send a request and wait for response
+ */
+ sendRequest(nodeId, type, payload) {
+ return new Promise((resolve, reject) => {
+ const requestId = Math.random().toString(36).substring(7);
+ const timer = setTimeout(() => {
+ this.pendingRequests.delete(requestId);
+ reject(new Error(`Request timeout: ${type} to ${nodeId}`));
+ }, this.config.requestTimeoutMs);
+ this.pendingRequests.set(requestId, { resolve, reject, timer });
+ this.clusterManager.send(nodeId, type, { ...payload, requestId });
+ });
+ }
+ /**
+ * Request Merkle root from remote node
+ */
+ async requestRemoteMerkleRoot(nodeId, partitionId) {
+ const response = await this.sendRequest(
+ nodeId,
+ "CLUSTER_MERKLE_ROOT_REQ",
+ { partitionId }
+ );
+ return response.rootHash;
+ }
+ /**
+ * Find keys that differ between local and remote using bucket exchange
+ */
+ async findDifferences(partitionId, replicaNodeId) {
+ const response = await this.sendRequest(
+ replicaNodeId,
+ "CLUSTER_MERKLE_BUCKETS_REQ",
+ { partitionId }
+ );
+ const remoteBuckets = response.buckets;
+ const localTree = this.merkleManager.getTree(partitionId);
+ if (!localTree) return [];
+ const differingKeys = /* @__PURE__ */ new Set();
+ const queue = [""];
+ const maxDepth = 3;
+ while (queue.length > 0) {
+ const path = queue.shift();
+ const localChildren = localTree.getBuckets(path);
+ const remoteChildren = remoteBuckets[path] || {};
+ const allChars = /* @__PURE__ */ new Set([...Object.keys(localChildren), ...Object.keys(remoteChildren)]);
+ for (const char of allChars) {
+ const localHash = localChildren[char] || 0;
+ const remoteHash = remoteChildren[char] || 0;
+ if (localHash !== remoteHash) {
+ const nextPath = path + char;
+ if (nextPath.length >= maxDepth) {
+ const bucketKeysResp = await this.sendRequest(
+ replicaNodeId,
+ "CLUSTER_MERKLE_KEYS_REQ",
+ { partitionId, path: nextPath }
+ );
+ const localBucketKeys = localTree.getKeysInBucket(nextPath);
+ const remoteBucketKeys = bucketKeysResp.keys;
+ for (const k of localBucketKeys) differingKeys.add(k);
+ for (const k of remoteBucketKeys) differingKeys.add(k);
+ } else {
+ queue.push(nextPath);
+ }
+ }
+ }
+ }
+ return Array.from(differingKeys);
+ }
+ /**
+ * Repair a single key
+ */
+ async repairKey(partitionId, replicaNodeId, key) {
+ if (!this.getRecord || !this.setRecord) {
+ return false;
+ }
+ const localRecord = this.getRecord(key);
+ let remoteRecord;
+ try {
+ const response = await this.sendRequest(
+ replicaNodeId,
+ "CLUSTER_REPAIR_DATA_REQ",
+ { key }
+ );
+ remoteRecord = response.record;
+ } catch (e) {
+ logger.warn({ key, replicaNodeId, err: e }, "Failed to fetch remote record for repair");
+ return false;
+ }
+ const resolved = this.resolveConflict(localRecord, remoteRecord);
+ if (!resolved) return false;
+ if (JSON.stringify(resolved) !== JSON.stringify(localRecord)) {
+ this.setRecord(key, resolved);
+ if (JSON.stringify(resolved) !== JSON.stringify(remoteRecord)) {
+ this.clusterManager.send(replicaNodeId, "CLUSTER_REPAIR_DATA_RESP", {
+ // In future: Use dedicated WRITE/REPAIR message
+ // For now we rely on the fact that repair will eventually run on other node too
+ });
+ }
+ return true;
+ }
+ return false;
+ }
+ /**
+ * Resolve conflict between two records using LWW
+ */
+ resolveConflict(a, b) {
+ if (!a && !b) return null;
+ if (!a) return b;
+ if (!b) return a;
+ if (this.compareTimestamps(a.timestamp, b.timestamp) > 0) {
+ return a;
+ }
+ if (this.compareTimestamps(b.timestamp, a.timestamp) > 0) {
+ return b;
+ }
+ if (a.timestamp.nodeId > b.timestamp.nodeId) {
+ return a;
+ }
+ return b;
+ }
+ /**
+ * Compare two timestamps
+ */
+ compareTimestamps(a, b) {
+ if (a.millis !== b.millis) {
+ return a.millis - b.millis;
+ }
+ return a.counter - b.counter;
+ }
+ /**
+ * Get partitions owned by this node
+ */
+ getOwnedPartitions() {
+ const owned = [];
+ for (let i = 0; i < PARTITION_COUNT4; i++) {
+ if (this.partitionService.getPartitionOwner(i) === this.nodeId) {
+ owned.push(i);
+ }
+ }
+ return owned;
+ }
+ /**
+ * Get partitions where this node is a backup
+ */
+ getReplicaPartitions() {
+ const replicas = [];
+ for (let i = 0; i < PARTITION_COUNT4; i++) {
+ const backups = this.partitionService.getBackups(i);
+ if (backups.includes(this.nodeId)) {
+ replicas.push(i);
+ }
+ }
+ return replicas;
+ }
+ /**
+ * Update average repair duration
+ */
+ updateAverageRepairDuration(durationMs) {
+ const count = this.metrics.repairsExecuted;
+ const currentAvg = this.metrics.averageRepairDurationMs;
+ this.metrics.averageRepairDurationMs = (currentAvg * (count - 1) + durationMs) / count;
+ }
+ /**
+ * Get repair metrics
+ */
+ getMetrics() {
+ return { ...this.metrics };
+ }
+ /**
+ * Get repair queue status
+ */
+ getQueueStatus() {
+ return {
+ queueLength: this.repairQueue.length,
+ activeRepairs: this.activeRepairs.size,
+ maxConcurrent: this.config.maxConcurrentRepairs
+ };
+ }
+ /**
+ * Force immediate repair for a partition
+ */
+ forceRepair(partitionId) {
+ this.schedulePartitionRepair(partitionId, "high");
+ }
+ /**
+ * Sleep utility
+ */
+ sleep(ms) {
+ return new Promise((resolve) => setTimeout(resolve, ms));
+ }
+ };
+
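
RepairScheduler is the active half of anti-entropy: compare root hashes, walk differing buckets breadth-first, pull the differing keys, and keep the last-writer-wins winner (millis, then counter, then nodeId as the tie-breaker). A hedged wiring sketch against a plain in-memory Map, assuming merkleManager, cluster, and partitionService instances already exist:

    // TypeScript sketch.
    const store = new Map<string, any>();
    const scheduler = new RepairScheduler(merkleManager, cluster, partitionService, "node-a", {
      scanIntervalMs: 5 * 60 * 1000, // full scan every 5 minutes
      maxConcurrentRepairs: 2, // bound concurrent partition repairs
    });
    scheduler.setDataAccessors(
      (key) => store.get(key), // read a local record
      (key, record) => { store.set(key, record); }, // apply the resolved winner
    );
    scheduler.on("repairComplete", (r: any) => {
      if (r.keysRepaired > 0) {
        console.log(`partition ${r.partitionId}: repaired ${r.keysRepaired}/${r.keysScanned} keys`);
      }
    });
    scheduler.start(); // first scan after ~60s, then on the interval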
+ // src/handlers/CounterHandler.ts
+ import { PNCounterImpl } from "@topgunbuild/core";
+ var CounterHandler = class {
+ // counterName -> Set<clientId>
+ constructor(nodeId = "server") {
+ this.nodeId = nodeId;
+ this.counters = /* @__PURE__ */ new Map();
+ this.subscriptions = /* @__PURE__ */ new Map();
+ }
+ /**
+ * Get or create a counter by name.
+ */
+ getOrCreateCounter(name) {
+ let counter = this.counters.get(name);
+ if (!counter) {
+ counter = new PNCounterImpl({ nodeId: this.nodeId });
+ this.counters.set(name, counter);
+ logger.debug({ name }, "Created new counter");
+ }
+ return counter;
+ }
+ /**
+ * Handle COUNTER_REQUEST - client wants initial state.
+ * @returns Response message to send back to client
+ */
+ handleCounterRequest(clientId, name) {
+ const counter = this.getOrCreateCounter(name);
+ this.subscribe(clientId, name);
+ const state = counter.getState();
+ logger.debug({ clientId, name, value: counter.get() }, "Counter request handled");
+ return {
+ type: "COUNTER_RESPONSE",
+ payload: {
+ name,
+ state: this.stateToObject(state)
+ }
+ };
+ }
+ /**
+ * Handle COUNTER_SYNC - client sends their state to merge.
+ * @returns Merged state and list of clients to broadcast to
+ */
+ handleCounterSync(clientId, name, stateObj) {
+ const counter = this.getOrCreateCounter(name);
+ const incomingState = this.objectToState(stateObj);
+ counter.merge(incomingState);
+ const mergedState = counter.getState();
+ const mergedStateObj = this.stateToObject(mergedState);
+ logger.debug(
+ { clientId, name, value: counter.get() },
+ "Counter sync handled"
+ );
+ this.subscribe(clientId, name);
+ const subscribers = this.subscriptions.get(name) || /* @__PURE__ */ new Set();
+ const broadcastTo = Array.from(subscribers).filter((id) => id !== clientId);
+ return {
+ // Response to the sending client
+ response: {
+ type: "COUNTER_UPDATE",
+ payload: {
+ name,
+ state: mergedStateObj
+ }
+ },
+ // Broadcast to other clients
+ broadcastTo,
+ broadcastMessage: {
+ type: "COUNTER_UPDATE",
+ payload: {
+ name,
+ state: mergedStateObj
+ }
+ }
+ };
+ }
+ /**
+ * Subscribe a client to counter updates.
+ */
+ subscribe(clientId, counterName) {
+ if (!this.subscriptions.has(counterName)) {
+ this.subscriptions.set(counterName, /* @__PURE__ */ new Set());
+ }
+ this.subscriptions.get(counterName).add(clientId);
+ logger.debug({ clientId, counterName }, "Client subscribed to counter");
+ }
+ /**
+ * Unsubscribe a client from counter updates.
+ */
+ unsubscribe(clientId, counterName) {
+ const subs = this.subscriptions.get(counterName);
+ if (subs) {
+ subs.delete(clientId);
+ if (subs.size === 0) {
+ this.subscriptions.delete(counterName);
+ }
+ }
+ }
+ /**
+ * Unsubscribe a client from all counters (e.g., on disconnect).
+ */
+ unsubscribeAll(clientId) {
+ for (const [counterName, subs] of this.subscriptions) {
+ subs.delete(clientId);
+ if (subs.size === 0) {
+ this.subscriptions.delete(counterName);
+ }
+ }
+ logger.debug({ clientId }, "Client unsubscribed from all counters");
+ }
+ /**
+ * Get current counter value (for monitoring/debugging).
+ */
+ getCounterValue(name) {
+ const counter = this.counters.get(name);
+ return counter ? counter.get() : 0;
+ }
+ /**
+ * Get all counter names.
+ */
+ getCounterNames() {
+ return Array.from(this.counters.keys());
+ }
+ /**
+ * Get number of subscribers for a counter.
+ */
+ getSubscriberCount(name) {
+ return this.subscriptions.get(name)?.size || 0;
+ }
+ /**
+ * Convert Map-based state to plain object for serialization.
+ */
+ stateToObject(state) {
+ return {
+ p: Object.fromEntries(state.positive),
+ n: Object.fromEntries(state.negative)
+ };
+ }
+ /**
+ * Convert plain object to Map-based state.
+ */
+ objectToState(obj) {
+ return {
+ positive: new Map(Object.entries(obj.p || {})),
+ negative: new Map(Object.entries(obj.n || {}))
+ };
+ }
+ };
+
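
CounterHandler itself is unchanged in 0.7.0; the bundler has only relocated it after the new cluster modules. For reference, its wire format { p, n } maps node IDs to increment and decrement totals, and a sync merges the incoming state before fanning out. A small worked example using only the methods shown above:

    // TypeScript sketch.
    const handler = new CounterHandler("server");
    // client-a incremented twice and decremented once:
    const result = handler.handleCounterSync("client-a", "likes", {
      p: { "client-a": 2 },
      n: { "client-a": 1 },
    });
    // result.response echoes the merged state to client-a;
    // result.broadcastTo lists every *other* subscriber to notify.
    console.log(handler.getCounterValue("likes")); // 1 (= 2 - 1)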
7960
+ // src/handlers/EntryProcessorHandler.ts
7961
+ import {
7962
+ EntryProcessorDefSchema
7963
+ } from "@topgunbuild/core";
7964
+
7965
+ // src/ProcessorSandbox.ts
7966
+ import {
7967
+ validateProcessorCode
7968
+ } from "@topgunbuild/core";
7969
+ var ivm = null;
7970
+ try {
7971
+ ivm = __require("isolated-vm");
7972
+ } catch {
7973
+ const isProduction = process.env.NODE_ENV === "production";
7974
+ if (isProduction) {
7975
+ logger.error(
7976
+ "SECURITY WARNING: isolated-vm not available in production! Entry processors will run in less secure fallback mode. Install isolated-vm for production environments: pnpm add isolated-vm"
7977
+ );
7978
+ } else {
7979
+ logger.warn("isolated-vm not available, falling back to less secure VM");
7980
+ }
7981
+ }
7982
+ var DEFAULT_SANDBOX_CONFIG = {
7983
+ memoryLimitMb: 8,
7984
+ timeoutMs: 100,
7985
+ maxCachedIsolates: 100,
7986
+ strictValidation: true
7987
+ };
7988
+ var ProcessorSandbox = class {
7989
+ constructor(config = {}) {
7990
+ this.isolateCache = /* @__PURE__ */ new Map();
7991
+ this.scriptCache = /* @__PURE__ */ new Map();
7992
+ this.fallbackScriptCache = /* @__PURE__ */ new Map();
7993
+ this.disposed = false;
7994
+ this.config = { ...DEFAULT_SANDBOX_CONFIG, ...config };
7995
+ }
7996
+ /**
7997
+ * Execute an entry processor in the sandbox.
7998
+ *
7999
+ * @param processor The processor definition (name, code, args)
8000
+ * @param value The current value for the key (or undefined)
8001
+ * @param key The key being processed
8002
+ * @returns Result containing success status, result, and new value
8003
+ */
8004
+ async execute(processor, value, key) {
8005
+ if (this.disposed) {
8006
+ return {
8007
+ success: false,
8008
+ error: "Sandbox has been disposed"
8009
+ };
8010
+ }
8011
+ if (this.config.strictValidation) {
8012
+ const validation = validateProcessorCode(processor.code);
8013
+ if (!validation.valid) {
8014
+ return {
8015
+ success: false,
8016
+ error: validation.error
8017
+ };
8018
+ }
8019
+ }
8020
+ if (ivm) {
8021
+ return this.executeInIsolate(processor, value, key);
8022
+ } else {
8023
+ return this.executeInFallback(processor, value, key);
8024
+ }
8025
+ }
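A hypothetical invocation of the `execute` method above. The processor shape (`name`, `code`, `args`) and result fields (`success`, `error`) are read from this diff, and `ProcessorSandbox` appears in the package's export list below, but the wrapper that runs the code is truncated here, so the processor body is an assumption:

```ts
import { ProcessorSandbox } from "@topgunbuild/server";

const sandbox = new ProcessorSandbox({ timeoutMs: 50 });

const outcome = await sandbox.execute(
  {
    name: "incrementViews",
    // Inside the sandbox, `value`, `key`, and `args` are available as
    // globals (see the injected eval preamble in executeInIsolate above).
    code: "(value ?? 0) + args.step",
    args: { step: 1 },
  },
  41,          // current value stored for the key (or undefined)
  "post:123"   // key being processed
);

if (!outcome.success) {
  console.error("processor failed:", outcome.error);
}

// The class tracks a `disposed` flag, so a dispose/cleanup method presumably
// exists, but it is not shown in this diff.
```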
8026
+ /**
8027
+ * Execute processor in isolated-vm (secure production mode).
8028
+ */
8029
+ async executeInIsolate(processor, value, key) {
8030
+ if (!ivm) {
8031
+ return { success: false, error: "isolated-vm not available" };
8032
+ }
8033
+ const isolate = this.getOrCreateIsolate(processor.name);
8034
+ try {
8035
+ const context = await isolate.createContext();
8036
+ const jail = context.global;
8037
+ await jail.set("global", jail.derefInto());
8038
+ await context.eval(`
8039
+ var value = ${JSON.stringify(value)};
8040
+ var key = ${JSON.stringify(key)};
8041
+ var args = ${JSON.stringify(processor.args)};
6798
8042
  `);
6799
8043
  const wrappedCode = `
6800
8044
  (function() {
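The `executeInIsolate` hunk above passes state into the isolate by JSON-stringifying it into the evaluated source, so no live host references ever cross the boundary. A self-contained isolated-vm sketch of the same pattern (memory limit in MB, CPU timeout, result copied out by value); the wrapper body here is a simplification of the truncated `wrappedCode`:

```ts
import ivm from "isolated-vm";

async function runIsolated(code: string, value: unknown, key: string, args: unknown) {
  const isolate = new ivm.Isolate({ memoryLimit: 8 }); // MB, matching DEFAULT_SANDBOX_CONFIG
  try {
    const context = await isolate.createContext();
    const jail = context.global;
    await jail.set("global", jail.derefInto());
    // Only serialized copies enter the isolate:
    await context.eval(`
      var value = ${JSON.stringify(value)};
      var key = ${JSON.stringify(key)};
      var args = ${JSON.stringify(args)};
    `);
    // timeout bounds CPU time; copy:true returns a detached copy of the result
    return await context.eval(`(function() { return ${code}; })()`, { timeout: 100, copy: true });
  } finally {
    isolate.dispose();
  }
}

// await runIsolated("(value ?? 0) + args.step", 41, "post:123", { step: 1 }) // => 42
```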
@@ -8112,7 +9356,7 @@ var ServerCoordinator = class {
8112
9356
  this.partitionService,
8113
9357
  {
8114
9358
  ...DEFAULT_REPLICATION_CONFIG2,
8115
- defaultConsistency: config.defaultConsistency ?? ConsistencyLevel2.EVENTUAL,
9359
+ defaultConsistency: config.defaultConsistency ?? ConsistencyLevel3.EVENTUAL,
8116
9360
  ...config.replicationConfig
8117
9361
  }
8118
9362
  );
@@ -8154,6 +9398,52 @@ var ServerCoordinator = class {
8154
9398
  logger.error({ err }, "Failed to initialize EventJournalService");
8155
9399
  });
8156
9400
  }
9401
+ this.partitionReassigner = new PartitionReassigner(
9402
+ this.cluster,
9403
+ this.partitionService,
9404
+ { reassignmentDelayMs: 1e3 }
9405
+ );
9406
+ this.partitionReassigner.on("failoverComplete", (event) => {
9407
+ logger.info({
9408
+ failedNodeId: event.failedNodeId,
9409
+ partitionsReassigned: event.partitionsReassigned,
9410
+ durationMs: event.durationMs
9411
+ }, "Partition failover completed");
9412
+ this.broadcastPartitionMap(this.partitionService.getPartitionMap());
9413
+ });
9414
+ logger.info("PartitionReassigner initialized");
9415
+ this.readReplicaHandler = new ReadReplicaHandler(
9416
+ this.partitionService,
9417
+ this.cluster,
9418
+ this._nodeId,
9419
+ void 0,
9420
+ // LagTracker - can be added later
9421
+ {
9422
+ defaultConsistency: config.defaultConsistency ?? ConsistencyLevel3.STRONG,
9423
+ preferLocalReplica: true,
9424
+ loadBalancing: "latency-based"
9425
+ }
9426
+ );
9427
+ logger.info("ReadReplicaHandler initialized");
9428
+ this.merkleTreeManager = new MerkleTreeManager(this._nodeId);
9429
+ this.repairScheduler = new RepairScheduler(
9430
+ this.merkleTreeManager,
9431
+ this.cluster,
9432
+ this.partitionService,
9433
+ this._nodeId,
9434
+ {
9435
+ enabled: true,
9436
+ scanIntervalMs: 3e5,
9437
+ // 5 minutes
9438
+ maxConcurrentRepairs: 2
9439
+ }
9440
+ );
9441
+ this.repairScheduler.setDataAccessors(
9442
+ (key) => this.getLocalRecord(key) ?? void 0,
9443
+ (key, record) => this.applyRepairRecord(key, record)
9444
+ );
9445
+ this.repairScheduler.start();
9446
+ logger.info("MerkleTreeManager and RepairScheduler initialized");
8157
9447
  this.systemManager = new SystemManager(
8158
9448
  this.cluster,
8159
9449
  this.metricsService,
@@ -8250,8 +9540,84 @@ var ServerCoordinator = class {
8250
9540
  getTaskletScheduler() {
8251
9541
  return this.taskletScheduler;
8252
9542
  }
9543
+ /**
9544
+ * Phase 10.02: Graceful cluster departure
9545
+ *
9546
+ * Notifies the cluster that this node is leaving and allows time for:
9547
+ * 1. Pending replication to complete
9548
+ * 2. Other nodes to detect departure
9549
+ * 3. Partition reassignment to begin
9550
+ */
9551
+ async gracefulClusterDeparture() {
9552
+ if (!this.cluster || this.cluster.getMembers().length <= 1) {
9553
+ return;
9554
+ }
9555
+ const nodeId = this._nodeId;
9556
+ const ownedPartitions = this.partitionService ? this.getOwnedPartitions() : [];
9557
+ logger.info({
9558
+ nodeId,
9559
+ ownedPartitions: ownedPartitions.length,
9560
+ clusterMembers: this.cluster.getMembers().length
9561
+ }, "Initiating graceful cluster departure");
9562
+ const departureMessage = {
9563
+ type: "NODE_LEAVING",
9564
+ nodeId,
9565
+ partitions: ownedPartitions,
9566
+ timestamp: Date.now()
9567
+ };
9568
+ for (const memberId of this.cluster.getMembers()) {
9569
+ if (memberId !== nodeId) {
9570
+ try {
9571
+ this.cluster.send(memberId, "CLUSTER_EVENT", departureMessage);
9572
+ } catch (e) {
9573
+ logger.warn({ memberId, err: e }, "Failed to notify peer of departure");
9574
+ }
9575
+ }
9576
+ }
9577
+ if (this.replicationPipeline) {
9578
+ logger.info("Waiting for pending replication to complete...");
9579
+ try {
9580
+ await this.waitForReplicationFlush(3e3);
9581
+ logger.info("Replication flush complete");
9582
+ } catch (e) {
9583
+ logger.warn({ err: e }, "Replication flush timeout - some data may not be replicated");
9584
+ }
9585
+ }
9586
+ await new Promise((resolve) => setTimeout(resolve, 500));
9587
+ logger.info({ nodeId }, "Graceful cluster departure complete");
9588
+ }
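A peer that receives the NODE_LEAVING broadcast above can begin reassigning the departing node's partitions immediately instead of waiting for the failure detector to time out, since the departure is voluntary and replication was flushed. A sketch of such a consumer; the handler and `reassign` callback are illustrative:

```ts
interface NodeLeavingEvent {
  type: "NODE_LEAVING";
  nodeId: string;
  partitions: number[];
  timestamp: number;
}

function onClusterEvent(
  event: NodeLeavingEvent,
  reassign: (partitions: number[], fromNode: string) => void
): void {
  if (event.type !== "NODE_LEAVING") return;
  // Voluntary departure: hand off partitions right away rather than after
  // a heartbeat timeout.
  reassign(event.partitions, event.nodeId);
}

onClusterEvent(
  { type: "NODE_LEAVING", nodeId: "node-3", partitions: [4, 17, 230], timestamp: Date.now() },
  (parts, from) => console.log(`reassigning ${parts.length} partitions from ${from}`)
);
```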
9589
+ /**
9590
+ * Get list of partition IDs owned by this node
9591
+ */
9592
+ getOwnedPartitions() {
9593
+ if (!this.partitionService) return [];
9594
+ const partitionMap = this.partitionService.getPartitionMap();
9595
+ const owned = [];
9596
+ for (const partition of partitionMap.partitions) {
9597
+ if (partition.ownerNodeId === this._nodeId) {
9598
+ owned.push(partition.partitionId);
9599
+ }
9600
+ }
9601
+ return owned;
9602
+ }
9603
+ /**
9604
+ * Wait for replication pipeline to flush pending operations
9605
+ */
9606
+ async waitForReplicationFlush(timeoutMs) {
9607
+ if (!this.replicationPipeline) return;
9608
+ const startTime = Date.now();
9609
+ while (Date.now() - startTime < timeoutMs) {
9610
+ const pendingOps = this.replicationPipeline.getTotalPending();
9611
+ if (pendingOps === 0) {
9612
+ return;
9613
+ }
9614
+ await new Promise((resolve) => setTimeout(resolve, 100));
9615
+ }
9616
+ throw new Error("Replication flush timeout");
9617
+ }
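`waitForReplicationFlush` is a poll-until-quiescent loop: check pending work every 100 ms until a hard deadline, then throw. The same shape as a reusable helper (illustrative, not part of the package):

```ts
async function waitUntil(
  cond: () => boolean,
  timeoutMs: number,
  intervalMs = 100
): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    if (cond()) return;
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
  throw new Error(`condition not met within ${timeoutMs}ms`);
}

// e.g. await waitUntil(() => pipeline.getTotalPending() === 0, 3000);
```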
8253
9618
  async shutdown() {
8254
9619
  logger.info("Shutting down Server Coordinator...");
9620
+ await this.gracefulClusterDeparture();
8255
9621
  this.httpServer.close();
8256
9622
  if (this.metricsServer) {
8257
9623
  this.metricsServer.close();
@@ -8284,6 +9650,14 @@ var ServerCoordinator = class {
8284
9650
  if (this.replicationPipeline) {
8285
9651
  this.replicationPipeline.close();
8286
9652
  }
9653
+ if (this.repairScheduler) {
9654
+ this.repairScheduler.stop();
9655
+ logger.info("RepairScheduler stopped");
9656
+ }
9657
+ if (this.partitionReassigner) {
9658
+ this.partitionReassigner.stop();
9659
+ logger.info("PartitionReassigner stopped");
9660
+ }
8287
9661
  if (this.cluster) {
8288
9662
  this.cluster.stop();
8289
9663
  }
@@ -8499,7 +9873,32 @@ var ServerCoordinator = class {
8499
9873
  logger.info({ clientId: client.id, mapName, query }, "Client subscribed");
8500
9874
  this.metricsService.incOp("SUBSCRIBE", mapName);
8501
9875
  const allMembers = this.cluster.getMembers();
8502
- const remoteMembers = allMembers.filter((id) => !this.cluster.isLocal(id));
9876
+ let remoteMembers = allMembers.filter((id) => !this.cluster.isLocal(id));
9877
+ const queryKey = query._id || query.where?._id;
9878
+ if (queryKey && typeof queryKey === "string" && this.readReplicaHandler) {
9879
+ try {
9880
+ const targetNode = this.readReplicaHandler.selectReadNode({
9881
+ mapName,
9882
+ key: queryKey,
9883
+ options: {
9884
+ // Default to EVENTUAL for read scaling unless specified otherwise
9885
+ // In future, we could extract consistency from query options if available
9886
+ consistency: ConsistencyLevel3.EVENTUAL
9887
+ }
9888
+ });
9889
+ if (targetNode) {
9890
+ if (this.cluster.isLocal(targetNode)) {
9891
+ remoteMembers = [];
9892
+ logger.debug({ clientId: client.id, mapName, key: queryKey }, "Read optimization: Serving locally");
9893
+ } else if (remoteMembers.includes(targetNode)) {
9894
+ remoteMembers = [targetNode];
9895
+ logger.debug({ clientId: client.id, mapName, key: queryKey, targetNode }, "Read optimization: Routing to replica");
9896
+ }
9897
+ }
9898
+ } catch (e) {
9899
+ logger.warn({ err: e }, "Error in ReadReplicaHandler selection");
9900
+ }
9901
+ }
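The routing block above narrows a subscription's cluster fan-out: a single-key query needs one node's answer, so the broadcast list collapses to the selected replica, or to nothing when this node already owns the data. The same logic as a standalone function (names are illustrative):

```ts
function narrowFanOut(
  remoteMembers: string[],
  targetNode: string | null,
  isLocal: (nodeId: string) => boolean
): string[] {
  if (!targetNode) return remoteMembers;                       // no routing info: ask everyone
  if (isLocal(targetNode)) return [];                          // serve locally, no cluster traffic
  if (remoteMembers.includes(targetNode)) return [targetNode]; // route to the one replica
  return remoteMembers;                                        // unknown target: fall back to broadcast
}

console.log(narrowFanOut(["node-2", "node-3"], "node-2", () => false));             // ["node-2"]
console.log(narrowFanOut(["node-2", "node-3"], "node-1", (id) => id === "node-1")); // []
```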
8503
9902
  const requestId = crypto.randomUUID();
8504
9903
  const pending = {
8505
9904
  requestId,
@@ -9377,7 +10776,7 @@ var ServerCoordinator = class {
9377
10776
  };
9378
10777
  let broadcastCount = 0;
9379
10778
  for (const client of this.clients.values()) {
9380
- if (client.isAuthenticated && client.socket.readyState === WebSocket3.OPEN) {
10779
+ if (client.isAuthenticated && client.socket.readyState === WebSocket3.OPEN && client.writer) {
9381
10780
  client.writer.write(message);
9382
10781
  broadcastCount++;
9383
10782
  }
@@ -9644,7 +11043,14 @@ var ServerCoordinator = class {
9644
11043
  this.cluster.on("message", (msg) => {
9645
11044
  switch (msg.type) {
9646
11045
  case "OP_FORWARD":
11046
+ if (msg.payload._replication || msg.payload._migration) {
11047
+ break;
11048
+ }
9647
11049
  logger.info({ senderId: msg.senderId }, "Received forwarded op");
11050
+ if (!msg.payload.key) {
11051
+ logger.warn({ senderId: msg.senderId }, "OP_FORWARD missing key, dropping");
11052
+ break;
11053
+ }
9648
11054
  if (this.partitionService.isLocalOwner(msg.payload.key)) {
9649
11055
  this.processLocalOp(msg.payload, true, msg.senderId).catch((err) => {
9650
11056
  logger.error({ err, senderId: msg.senderId }, "Forwarded op failed");
@@ -9751,6 +11157,51 @@ var ServerCoordinator = class {
9751
11157
  this.topicManager.publish(topic, data, originalSenderId, true);
9752
11158
  break;
9753
11159
  }
11160
+ // Phase 10.04: Anti-entropy repair messages
11161
+ case "CLUSTER_MERKLE_ROOT_REQ": {
11162
+ const { partitionId, requestId } = msg.payload;
11163
+ const rootHash = this.merkleTreeManager?.getRootHash(partitionId) ?? 0;
11164
+ this.cluster.send(msg.senderId, "CLUSTER_MERKLE_ROOT_RESP", {
11165
+ requestId,
11166
+ partitionId,
11167
+ rootHash
11168
+ });
11169
+ break;
11170
+ }
11171
+ case "CLUSTER_MERKLE_ROOT_RESP": {
11172
+ if (this.repairScheduler) {
11173
+ this.repairScheduler.emit("merkleRootResponse", {
11174
+ nodeId: msg.senderId,
11175
+ ...msg.payload
11176
+ });
11177
+ }
11178
+ break;
11179
+ }
11180
+ case "CLUSTER_REPAIR_DATA_REQ": {
11181
+ const { partitionId, keys, requestId } = msg.payload;
11182
+ const records = {};
11183
+ for (const key of keys) {
11184
+ const record = this.getLocalRecord(key);
11185
+ if (record) {
11186
+ records[key] = record;
11187
+ }
11188
+ }
11189
+ this.cluster.send(msg.senderId, "CLUSTER_REPAIR_DATA_RESP", {
11190
+ requestId,
11191
+ partitionId,
11192
+ records
11193
+ });
11194
+ break;
11195
+ }
11196
+ case "CLUSTER_REPAIR_DATA_RESP": {
11197
+ if (this.repairScheduler) {
11198
+ this.repairScheduler.emit("repairDataResponse", {
11199
+ nodeId: msg.senderId,
11200
+ ...msg.payload
11201
+ });
11202
+ }
11203
+ break;
11204
+ }
9754
11205
  }
9755
11206
  });
9756
11207
  }
@@ -10020,6 +11471,10 @@ var ServerCoordinator = class {
10020
11471
  nodeId: this._nodeId
10021
11472
  });
10022
11473
  }
11474
+ if (this.merkleTreeManager && recordToStore && op.key) {
11475
+ const partitionId = this.partitionService.getPartitionId(op.key);
11476
+ this.merkleTreeManager.updateRecord(partitionId, op.key, recordToStore);
11477
+ }
10023
11478
  return { eventPayload, oldRecord };
10024
11479
  }
10025
11480
  /**
@@ -10147,7 +11602,7 @@ var ServerCoordinator = class {
10147
11602
  if (rejected || !eventPayload) {
10148
11603
  return;
10149
11604
  }
10150
- if (this.replicationPipeline && !fromCluster) {
11605
+ if (this.replicationPipeline) {
10151
11606
  const opId = op.id || `${op.mapName}:${op.key}:${Date.now()}`;
10152
11607
  this.replicationPipeline.replicate(op, opId, op.key).catch((err) => {
10153
11608
  logger.warn({ opId, key: op.key, err }, "Replication failed (non-fatal)");
@@ -10290,6 +11745,10 @@ var ServerCoordinator = class {
10290
11745
  }
10291
11746
  handleClusterEvent(payload) {
10292
11747
  const { mapName, key, eventType } = payload;
11748
+ if (!key) {
11749
+ logger.warn({ mapName, eventType }, "Received cluster event with undefined key, ignoring");
11750
+ return;
11751
+ }
10293
11752
  const map = this.getMap(mapName, eventType === "OR_ADD" || eventType === "OR_REMOVE" ? "OR" : "LWW");
10294
11753
  const oldRecord = map instanceof LWWMap3 ? map.getRecord(key) : null;
10295
11754
  if (this.partitionService.isRelated(key)) {
@@ -10354,6 +11813,51 @@ var ServerCoordinator = class {
10354
11813
  }
10355
11814
  return this.maps.get(name);
10356
11815
  }
11816
+ /**
11817
+ * Phase 10.04: Get local record for anti-entropy repair
11818
+ * Returns the LWWRecord for a key, used by RepairScheduler
11819
+ */
11820
+ getLocalRecord(key) {
11821
+ const separatorIndex = key.indexOf(":");
11822
+ if (separatorIndex === -1) {
11823
+ return null;
11824
+ }
11825
+ const mapName = key.substring(0, separatorIndex);
11826
+ const actualKey = key.substring(separatorIndex + 1);
11827
+ const map = this.maps.get(mapName);
11828
+ if (!map || !(map instanceof LWWMap3)) {
11829
+ return null;
11830
+ }
11831
+ return map.getRecord(actualKey) ?? null;
11832
+ }
11833
+ /**
11834
+ * Phase 10.04: Apply repaired record from anti-entropy repair
11835
+ * Used by RepairScheduler to apply resolved conflicts
11836
+ */
11837
+ applyRepairRecord(key, record) {
11838
+ const separatorIndex = key.indexOf(":");
11839
+ if (separatorIndex === -1) {
11840
+ logger.warn({ key }, "Invalid key format for repair");
11841
+ return;
11842
+ }
11843
+ const mapName = key.substring(0, separatorIndex);
11844
+ const actualKey = key.substring(separatorIndex + 1);
11845
+ const map = this.getMap(mapName, "LWW");
11846
+ const existingRecord = map.getRecord(actualKey);
11847
+ if (!existingRecord || record.timestamp.millis > existingRecord.timestamp.millis || record.timestamp.millis === existingRecord.timestamp.millis && record.timestamp.counter > existingRecord.timestamp.counter) {
11848
+ map.merge(actualKey, record);
11849
+ logger.debug({ mapName, key: actualKey }, "Applied repair record");
11850
+ if (this.storage) {
11851
+ this.storage.store(mapName, actualKey, record).catch((err) => {
11852
+ logger.error({ err, mapName, key: actualKey }, "Failed to persist repair record");
11853
+ });
11854
+ }
11855
+ if (this.merkleTreeManager) {
11856
+ const partitionId = this.partitionService.getPartitionId(actualKey);
11857
+ this.merkleTreeManager.updateRecord(partitionId, actualKey, record);
11858
+ }
11859
+ }
11860
+ }
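`applyRepairRecord` merges a repaired record only when its hybrid logical clock timestamp is strictly newer, comparing wall-clock millis first and the logical counter as a tiebreaker (the full HLC in @topgunbuild/core may also break ties on node id, which this diff does not show). The ordering in isolation:

```ts
interface HLCTimestamp {
  millis: number;  // wall-clock component
  counter: number; // logical tiebreaker within the same millisecond
}

function isNewer(incoming: HLCTimestamp, existing: HLCTimestamp): boolean {
  if (incoming.millis !== existing.millis) return incoming.millis > existing.millis;
  return incoming.counter > existing.counter;
}

console.log(isNewer({ millis: 1700000000500, counter: 0 }, { millis: 1700000000000, counter: 9 })); // true
console.log(isNewer({ millis: 1700000000000, counter: 2 }, { millis: 1700000000000, counter: 2 })); // false: equal, keep existing
```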
10357
11861
  async loadMapFromStorage(name, typeHint) {
10358
11862
  try {
10359
11863
  const keys = await this.storage.loadAllKeys(name);
@@ -11239,7 +12743,7 @@ function logNativeStatus() {
11239
12743
  }
11240
12744
 
11241
12745
  // src/cluster/ClusterCoordinator.ts
11242
- import { EventEmitter as EventEmitter9 } from "events";
12746
+ import { EventEmitter as EventEmitter13 } from "events";
11243
12747
  import {
11244
12748
  DEFAULT_MIGRATION_CONFIG as DEFAULT_MIGRATION_CONFIG3,
11245
12749
  DEFAULT_REPLICATION_CONFIG as DEFAULT_REPLICATION_CONFIG3
@@ -11250,7 +12754,7 @@ var DEFAULT_CLUSTER_COORDINATOR_CONFIG = {
11250
12754
  replication: DEFAULT_REPLICATION_CONFIG3,
11251
12755
  replicationEnabled: true
11252
12756
  };
11253
- var ClusterCoordinator = class extends EventEmitter9 {
12757
+ var ClusterCoordinator = class extends EventEmitter13 {
11254
12758
  constructor(config) {
11255
12759
  super();
11256
12760
  this.replicationPipeline = null;
@@ -12064,12 +13568,18 @@ export {
12064
13568
  ConnectionRateLimiter,
12065
13569
  DEFAULT_CLUSTER_COORDINATOR_CONFIG,
12066
13570
  DEFAULT_CONFLICT_RESOLVER_CONFIG,
13571
+ DEFAULT_FAILURE_DETECTOR_CONFIG,
12067
13572
  DEFAULT_INDEX_CONFIG,
12068
13573
  DEFAULT_JOURNAL_SERVICE_CONFIG,
12069
13574
  DEFAULT_LAG_TRACKER_CONFIG,
13575
+ DEFAULT_MERKLE_TREE_CONFIG,
13576
+ DEFAULT_READ_REPLICA_CONFIG,
13577
+ DEFAULT_REASSIGNER_CONFIG,
13578
+ DEFAULT_REPAIR_CONFIG,
12070
13579
  DEFAULT_SANDBOX_CONFIG,
12071
13580
  EntryProcessorHandler,
12072
13581
  EventJournalService,
13582
+ FailureDetector,
12073
13583
  FilterTasklet,
12074
13584
  ForEachTasklet,
12075
13585
  IteratorTasklet,
@@ -12079,13 +13589,17 @@ export {
12079
13589
  MapTasklet,
12080
13590
  MapWithResolver,
12081
13591
  MemoryServerAdapter,
13592
+ MerkleTreeManager,
12082
13593
  MigrationManager,
12083
13594
  ObjectPool,
13595
+ PartitionReassigner,
12084
13596
  PartitionService,
12085
13597
  PostgresAdapter,
12086
13598
  ProcessorSandbox,
12087
13599
  RateLimitInterceptor,
13600
+ ReadReplicaHandler,
12088
13601
  ReduceTasklet,
13602
+ RepairScheduler,
12089
13603
  ReplicationPipeline,
12090
13604
  SecurityManager,
12091
13605
  ServerCoordinator,