@peerbit/shared-log 13.0.2 → 13.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -137,6 +137,7 @@ import {
137
137
  AddedReplicationSegmentMessage,
138
138
  AllReplicatingSegmentsMessage,
139
139
  MinReplicas,
140
+ ReplicationPingMessage,
140
141
  ReplicationError,
141
142
  type ReplicationLimits,
142
143
  RequestReplicationInfoMessage,
@@ -444,6 +445,13 @@ const DEFAULT_DISTRIBUTION_DEBOUNCE_TIME = 500;
444
445
  const RECENT_REPAIR_DISPATCH_TTL_MS = 5_000;
445
446
  const REPAIR_SWEEP_ENTRY_BATCH_SIZE = 1_000;
446
447
  const REPAIR_SWEEP_TARGET_BUFFER_SIZE = 1024;
448
+ // In sparse topologies (browser/relay), peers can learn about replicators via broadcast
449
+ // replication announcements without having a direct connection that emits unsubscribe
450
+ // on abrupt churn. Probe conservatively so a single missed ACK does not evict a
451
+ // healthy replicator, and rely on replication-info refresh to recover membership.
452
+ const REPLICATOR_LIVENESS_SWEEP_INTERVAL_MS = 2_000;
453
+ const REPLICATOR_LIVENESS_IDLE_THRESHOLD_MS = 8_000;
454
+ const REPLICATOR_LIVENESS_PROBE_FAILURES_TO_EVICT = 2;
447
455
  // Churn/join repair can race with pruning and transient missed sync requests under
448
456
  // heavy event-loop load. Keep retries alive with a longer tail so reassigned
449
457
  // entries are retried after short bursts and slower recovery windows.
@@ -634,6 +642,13 @@ export class SharedLog<
634
642
  { attempts: number; timer?: ReturnType<typeof setTimeout> }
635
643
  >;
636
644
  private _replicationInfoApplyQueueByPeer!: Map<string, Promise<void>>;
645
+ private _replicatorLivenessSweepRunning!: boolean;
646
+ private _replicatorLivenessTimer?: ReturnType<typeof setInterval>;
647
+ private _replicatorLivenessTargets!: string[];
648
+ private _replicatorLivenessTargetsSize!: number;
649
+ private _replicatorLivenessCursor!: number;
650
+ private _replicatorLivenessFailures!: Map<string, number>;
651
+ private _replicatorLastActivityAt!: Map<string, number>;
637
652
 
638
653
  private remoteBlocks!: RemoteBlocks;
639
654
 
@@ -2763,23 +2778,30 @@ export class SharedLog<
2763
2778
  this.domain.resolution,
2764
2779
  );
2765
2780
  this._respondToIHaveTimeout = options?.respondToIHaveTimeout ?? 2e4;
2766
- this._pendingDeletes = new Map();
2767
- this._pendingIHave = new Map();
2768
- this.latestReplicationInfoMessage = new Map();
2769
- this._replicationInfoBlockedPeers = new Set();
2770
- this._replicationInfoRequestByPeer = new Map();
2771
- this._replicationInfoApplyQueueByPeer = new Map();
2772
- this._repairRetryTimers = new Set();
2773
- this._recentRepairDispatch = new Map();
2774
- this._repairSweepRunning = false;
2775
- this._repairSweepForceFreshPending = false;
2776
- this._repairSweepAddedPeersPending = new Set();
2777
- this.coordinateToHash = new Cache<string>({ max: 1e6, ttl: 1e4 });
2778
- this.recentlyRebalanced = new Cache<string>({ max: 1e4, ttl: 1e5 });
2779
-
2780
- this.uniqueReplicators = new Set();
2781
- this._replicatorJoinEmitted = new Set();
2782
- this._replicatorsReconciled = false;
2781
+ this._pendingDeletes = new Map();
2782
+ this._pendingIHave = new Map();
2783
+ this.latestReplicationInfoMessage = new Map();
2784
+ this._replicationInfoBlockedPeers = new Set();
2785
+ this._replicationInfoRequestByPeer = new Map();
2786
+ this._replicationInfoApplyQueueByPeer = new Map();
2787
+ this._repairRetryTimers = new Set();
2788
+ this._recentRepairDispatch = new Map();
2789
+ this._repairSweepRunning = false;
2790
+ this._repairSweepForceFreshPending = false;
2791
+ this._repairSweepAddedPeersPending = new Set();
2792
+ this.coordinateToHash = new Cache<string>({ max: 1e6, ttl: 1e4 });
2793
+ this.recentlyRebalanced = new Cache<string>({ max: 1e4, ttl: 1e5 });
2794
+
2795
+ this.uniqueReplicators = new Set();
2796
+ this._replicatorJoinEmitted = new Set();
2797
+ this._replicatorsReconciled = false;
2798
+ this._replicatorLivenessSweepRunning = false;
2799
+ this._replicatorLivenessTimer = undefined;
2800
+ this._replicatorLivenessTargets = [];
2801
+ this._replicatorLivenessTargetsSize = 0;
2802
+ this._replicatorLivenessCursor = 0;
2803
+ this._replicatorLivenessFailures = new Map();
2804
+ this._replicatorLastActivityAt = new Map();
2783
2805
 
2784
2806
  this.openTime = +new Date();
2785
2807
  this.oldestOpenTime = this.openTime;
@@ -3233,18 +3255,20 @@ export class SharedLog<
3233
3255
  await super.afterOpen();
3234
3256
 
3235
3257
  // We do this here, because these calls requires this.closed == false
3236
- void this.pruneOfflineReplicators()
3237
- .then(() => {
3238
- this._replicatorsReconciled = true;
3239
- })
3258
+ void this.pruneOfflineReplicators()
3259
+ .then(() => {
3260
+ this._replicatorsReconciled = true;
3261
+ })
3240
3262
  .catch((error) => {
3241
3263
  if (isNotStartedError(error as Error)) {
3242
3264
  return;
3243
3265
  }
3244
- logger.error(error);
3245
- });
3266
+ logger.error(error);
3267
+ });
3246
3268
 
3247
- await this.rebalanceParticipation();
3269
+ this.startReplicatorLivenessSweep();
3270
+
3271
+ await this.rebalanceParticipation();
3248
3272
 
3249
3273
  // Take into account existing subscription
3250
3274
  (await this._getTopicSubscribers(this.topic))?.forEach((v) => {
@@ -3263,12 +3287,12 @@ export class SharedLog<
3263
3287
  }
3264
3288
 
3265
3289
  async pruneOfflineReplicators() {
3266
- // go through all segments and for waitForAll replicators to become reachable if not prune them away
3267
-
3290
+ // Go through all segments and wait for replicators to become reachable;
3291
+ // otherwise prune them away from the local membership view.
3268
3292
  try {
3269
3293
  const promises: Promise<any>[] = [];
3270
3294
  const iterator = this.replicationIndex.iterate();
3271
- let checkedIsAlive = new Set<string>();
3295
+ const checkedIsAlive = new Set<string>();
3272
3296
 
3273
3297
  while (!iterator.done()) {
3274
3298
  for (const segment of await iterator.next(1000)) {
@@ -3288,7 +3312,6 @@ export class SharedLog<
3288
3312
  signal: this._closeController.signal,
3289
3313
  })
3290
3314
  .then(async () => {
3291
- // is reachable, announce change events
3292
3315
  const key = await this._resolvePublicKeyFromHash(
3293
3316
  segment.value.hash,
3294
3317
  );
@@ -3299,49 +3322,261 @@ export class SharedLog<
3299
3322
  );
3300
3323
  }
3301
3324
 
3302
- const keyHash = key.hashcode();
3303
- this.uniqueReplicators.add(keyHash);
3325
+ const keyHash = key.hashcode();
3326
+ this.uniqueReplicators.add(keyHash);
3304
3327
 
3305
- if (!this._replicatorJoinEmitted.has(keyHash)) {
3306
- this._replicatorJoinEmitted.add(keyHash);
3307
- this.events.dispatchEvent(
3308
- new CustomEvent<ReplicatorJoinEvent>("replicator:join", {
3309
- detail: { publicKey: key },
3310
- }),
3311
- );
3312
- this.events.dispatchEvent(
3313
- new CustomEvent<ReplicationChangeEvent>(
3314
- "replication:change",
3315
- {
3316
- detail: { publicKey: key },
3317
- },
3318
- ),
3319
- );
3320
- }
3321
- })
3322
- .catch(async (e) => {
3323
- if (isNotStartedError(e)) {
3324
- return; // TODO test this path
3328
+ if (!this._replicatorJoinEmitted.has(keyHash)) {
3329
+ this._replicatorJoinEmitted.add(keyHash);
3330
+ this.events.dispatchEvent(
3331
+ new CustomEvent<ReplicatorJoinEvent>("replicator:join", {
3332
+ detail: { publicKey: key },
3333
+ }),
3334
+ );
3335
+ this.events.dispatchEvent(
3336
+ new CustomEvent<ReplicationChangeEvent>("replication:change", {
3337
+ detail: { publicKey: key },
3338
+ }),
3339
+ );
3340
+ }
3341
+ })
3342
+ .catch(async (error) => {
3343
+ if (isNotStartedError(error as Error)) {
3344
+ return;
3325
3345
  }
3326
3346
 
3327
- // not reachable
3328
3347
  return this.removeReplicator(segment.value.hash, {
3329
3348
  noEvent: true,
3330
- }); // done announce since replicator was never reachable
3349
+ });
3331
3350
  }),
3332
3351
  );
3333
3352
  }
3334
3353
  }
3335
- const results = await Promise.all(promises);
3336
- return results;
3337
- } catch (error: any) {
3338
- if (isNotStartedError(error)) {
3354
+
3355
+ return Promise.all(promises);
3356
+ } catch (error) {
3357
+ if (isNotStartedError(error as Error)) {
3339
3358
  return;
3340
3359
  }
3341
3360
  throw error;
3342
3361
  }
3343
3362
  }
3344
3363
 
3364
+ private startReplicatorLivenessSweep() {
3365
+ if (this._replicatorLivenessTimer) {
3366
+ return;
3367
+ }
3368
+ this._replicatorLivenessTimer = setInterval(() => {
3369
+ void this.runReplicatorLivenessSweep();
3370
+ }, REPLICATOR_LIVENESS_SWEEP_INTERVAL_MS);
3371
+ this._replicatorLivenessTimer.unref?.();
3372
+ }
3373
+
3374
+ private stopReplicatorLivenessSweep() {
3375
+ if (this._replicatorLivenessTimer) {
3376
+ clearInterval(this._replicatorLivenessTimer);
3377
+ this._replicatorLivenessTimer = undefined;
3378
+ }
3379
+ this._replicatorLivenessSweepRunning = false;
3380
+ this._replicatorLivenessTargets = [];
3381
+ this._replicatorLivenessTargetsSize = 0;
3382
+ this._replicatorLivenessCursor = 0;
3383
+ this._replicatorLivenessFailures.clear();
3384
+ this._replicatorLastActivityAt.clear();
3385
+ }
3386
+
3387
+ private rebuildReplicatorLivenessTargets() {
3388
+ const selfHash = this.node.identity.publicKey.hashcode();
3389
+ this._replicatorLivenessTargets = [...this.uniqueReplicators].filter(
3390
+ (hash) => hash !== selfHash,
3391
+ );
3392
+ this._replicatorLivenessTargetsSize = this.uniqueReplicators.size;
3393
+ if (this._replicatorLivenessCursor >= this._replicatorLivenessTargets.length) {
3394
+ this._replicatorLivenessCursor = 0;
3395
+ }
3396
+ }
3397
+
3398
+ private getReplicatorLivenessTargets() {
3399
+ const selfHash = this.node.identity.publicKey.hashcode();
3400
+ const expected =
3401
+ this.uniqueReplicators.size - (this.uniqueReplicators.has(selfHash) ? 1 : 0);
3402
+
3403
+ if (this._replicatorLivenessTargets.length > 0) {
3404
+ // Keep the cursor stable, but purge stale hashes (membership can change while
3405
+ // the total size stays constant).
3406
+ this._replicatorLivenessTargets = this._replicatorLivenessTargets.filter(
3407
+ (hash) => hash !== selfHash && this.uniqueReplicators.has(hash),
3408
+ );
3409
+ }
3410
+
3411
+ if (
3412
+ this._replicatorLivenessTargetsSize !== this.uniqueReplicators.size ||
3413
+ this._replicatorLivenessTargets.length !== expected
3414
+ ) {
3415
+ this.rebuildReplicatorLivenessTargets();
3416
+ }
3417
+
3418
+ return this._replicatorLivenessTargets;
3419
+ }
3420
+
3421
+ private cleanupPeerDisconnectTracking(peerHash: string) {
3422
+ this.cancelReplicationInfoRequests(peerHash);
3423
+ this._replicatorLivenessFailures.delete(peerHash);
3424
+ this._replicatorLastActivityAt.delete(peerHash);
3425
+
3426
+ for (const [hash, peers] of this._requestIPruneSent) {
3427
+ peers.delete(peerHash);
3428
+ if (peers.size === 0) {
3429
+ this._requestIPruneSent.delete(hash);
3430
+ }
3431
+ }
3432
+
3433
+ for (const [hash, peers] of this._requestIPruneResponseReplicatorSet) {
3434
+ peers.delete(peerHash);
3435
+ if (peers.size === 0) {
3436
+ this._requestIPruneResponseReplicatorSet.delete(hash);
3437
+ }
3438
+ }
3439
+ }
3440
+
3441
+ private markReplicatorActivity(peerHash: string, now = Date.now()) {
3442
+ this._replicatorLastActivityAt.set(peerHash, now);
3443
+ }
3444
+
3445
+ private hasRecentReplicatorActivity(peerHash: string, now = Date.now()) {
3446
+ const lastActivityAt = this._replicatorLastActivityAt.get(peerHash);
3447
+ if (
3448
+ lastActivityAt != null &&
3449
+ now - lastActivityAt < REPLICATOR_LIVENESS_IDLE_THRESHOLD_MS
3450
+ ) {
3451
+ this._replicatorLivenessFailures.delete(peerHash);
3452
+ return true;
3453
+ }
3454
+ return false;
3455
+ }
3456
+
3457
+ private async evictReplicatorFromLiveness(
3458
+ peerHash: string,
3459
+ publicKey: PublicSignKey,
3460
+ ) {
3461
+ const wasReplicator = this.uniqueReplicators.has(peerHash);
3462
+ const watermark = BigInt(+new Date());
3463
+ const previousWatermark = this.latestReplicationInfoMessage.get(peerHash);
3464
+ if (!previousWatermark || previousWatermark < watermark) {
3465
+ this.latestReplicationInfoMessage.set(peerHash, watermark);
3466
+ }
3467
+
3468
+ try {
3469
+ await this.removeReplicator(publicKey, { noEvent: true });
3470
+ } catch (error) {
3471
+ if (!isNotStartedError(error as Error)) {
3472
+ throw error;
3473
+ }
3474
+ }
3475
+
3476
+ this.cleanupPeerDisconnectTracking(peerHash);
3477
+
3478
+ if (wasReplicator) {
3479
+ this.events.dispatchEvent(
3480
+ new CustomEvent<ReplicatorLeaveEvent>("replicator:leave", {
3481
+ detail: { publicKey },
3482
+ }),
3483
+ );
3484
+ }
3485
+
3486
+ if (!this._replicationInfoBlockedPeers.has(peerHash)) {
3487
+ this.scheduleReplicationInfoRequests(publicKey);
3488
+ }
3489
+ this._replicatorLivenessTargetsSize = -1;
3490
+ }
3491
+
3492
+ private async runReplicatorLivenessSweep() {
3493
+ if (this.closed || this._closeController.signal.aborted) {
3494
+ return;
3495
+ }
3496
+ if (this._replicatorLivenessSweepRunning) {
3497
+ return;
3498
+ }
3499
+
3500
+ const targets = this.getReplicatorLivenessTargets();
3501
+ if (targets.length === 0) {
3502
+ return;
3503
+ }
3504
+
3505
+ this._replicatorLivenessSweepRunning = true;
3506
+ try {
3507
+ if (this._replicatorLivenessCursor >= targets.length) {
3508
+ this._replicatorLivenessCursor = 0;
3509
+ }
3510
+ const peerHash = targets[this._replicatorLivenessCursor]!;
3511
+ this._replicatorLivenessCursor =
3512
+ (this._replicatorLivenessCursor + 1) % targets.length;
3513
+ await this.probeReplicatorLiveness(peerHash);
3514
+ } catch (error) {
3515
+ if (!isNotStartedError(error as Error)) {
3516
+ logger.error((error as any)?.toString?.() ?? String(error));
3517
+ }
3518
+ } finally {
3519
+ this._replicatorLivenessSweepRunning = false;
3520
+ }
3521
+ }
3522
+
3523
+ private async probeReplicatorLiveness(peerHash: string) {
3524
+ if (this.closed || this._closeController.signal.aborted) {
3525
+ return;
3526
+ }
3527
+ if (!this.uniqueReplicators.has(peerHash)) {
3528
+ this._replicatorLivenessFailures.delete(peerHash);
3529
+ return;
3530
+ }
3531
+ if (this.hasRecentReplicatorActivity(peerHash)) {
3532
+ return;
3533
+ }
3534
+
3535
+ const publicKey = await this._resolvePublicKeyFromHash(peerHash);
3536
+ if (!publicKey) {
3537
+ try {
3538
+ await this.removeReplicator(peerHash, { noEvent: true });
3539
+ } catch (error) {
3540
+ if (!isNotStartedError(error as Error)) {
3541
+ throw error;
3542
+ }
3543
+ }
3544
+ this.cleanupPeerDisconnectTracking(peerHash);
3545
+ this._replicatorLivenessTargetsSize = -1;
3546
+ return;
3547
+ }
3548
+
3549
+ try {
3550
+ // Explicit ping (ACKed) instead of RequestReplicationInfoMessage to avoid
3551
+ // triggering large segment snapshots just to prove liveness.
3552
+ await this.rpc.send(new ReplicationPingMessage(), {
3553
+ mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
3554
+ priority: 1,
3555
+ });
3556
+ this.markReplicatorActivity(peerHash);
3557
+ this._replicatorLivenessFailures.delete(peerHash);
3558
+ return;
3559
+ } catch (error) {
3560
+ if (isNotStartedError(error as Error)) {
3561
+ return;
3562
+ }
3563
+ }
3564
+
3565
+ const failures = (this._replicatorLivenessFailures.get(peerHash) ?? 0) + 1;
3566
+ this._replicatorLivenessFailures.set(peerHash, failures);
3567
+ this.scheduleReplicationInfoRequests(publicKey);
3568
+
3569
+ if (failures < REPLICATOR_LIVENESS_PROBE_FAILURES_TO_EVICT) {
3570
+ return;
3571
+ }
3572
+ if (!this.uniqueReplicators.has(peerHash)) {
3573
+ this._replicatorLivenessFailures.delete(peerHash);
3574
+ return;
3575
+ }
3576
+
3577
+ await this.evictReplicatorFromLiveness(peerHash, publicKey);
3578
+ }
3579
+
3345
3580
  async getMemoryUsage() {
3346
3581
  return this.log.blocks.size();
3347
3582
  /* ((await this.log.entryIndex?.getMemoryUsage()) || 0) */ // + (await this.log.blocks.size())
@@ -3533,13 +3768,14 @@ export class SharedLog<
3533
3768
  this.coordinateToHash.clear();
3534
3769
  this.recentlyRebalanced.clear();
3535
3770
  this.uniqueReplicators.clear();
3536
- this._closeController.abort();
3771
+ this._closeController.abort();
3537
3772
 
3538
- clearInterval(this.interval);
3773
+ clearInterval(this.interval);
3774
+ this.stopReplicatorLivenessSweep();
3539
3775
 
3540
- this.node.services.pubsub.removeEventListener(
3541
- "subscribe",
3542
- this._onSubscriptionFn,
3776
+ this.node.services.pubsub.removeEventListener(
3777
+ "subscribe",
3778
+ this._onSubscriptionFn,
3543
3779
  );
3544
3780
 
3545
3781
  this.node.services.pubsub.removeEventListener(
@@ -3712,6 +3948,9 @@ export class SharedLog<
3712
3948
  if (!context.from) {
3713
3949
  throw new Error("Missing from in update role message");
3714
3950
  }
3951
+ if (!context.from.equals(this.node.identity.publicKey)) {
3952
+ this.markReplicatorActivity(context.from.hashcode());
3953
+ }
3715
3954
 
3716
3955
  if (msg instanceof ResponseRoleMessage) {
3717
3956
  msg = msg.toReplicationInfoMessage(); // migration
@@ -4053,25 +4292,27 @@ export class SharedLog<
4053
4292
  msg.message,
4054
4293
  context.from!.hashcode(),
4055
4294
  );
4295
+ } else if (msg instanceof ReplicationPingMessage) {
4296
+ // No-op: used as an ACKed unicast liveness probe.
4056
4297
  } else if (msg instanceof RequestReplicationInfoMessage) {
4057
4298
  if (context.from.equals(this.node.identity.publicKey)) {
4058
4299
  return;
4059
4300
  }
4060
4301
 
4061
- const segments = (await this.getMyReplicationSegments()).map((x) =>
4062
- x.toReplicationRange(),
4063
- );
4302
+ const segments = (await this.getMyReplicationSegments()).map((x) =>
4303
+ x.toReplicationRange(),
4304
+ );
4064
4305
 
4065
- this.rpc
4066
- .send(new AllReplicatingSegmentsMessage({ segments }), {
4067
- mode: new AcknowledgeDelivery({ to: [context.from], redundancy: 1 }),
4068
- })
4069
- .catch((e) => logger.error(e.toString()));
4306
+ this.rpc
4307
+ .send(new AllReplicatingSegmentsMessage({ segments }), {
4308
+ mode: new AcknowledgeDelivery({ to: [context.from], redundancy: 1 }),
4309
+ })
4310
+ .catch((e) => logger.error(e.toString()));
4070
4311
 
4071
- // for backwards compatibility (v8) remove this when we are sure that all nodes are v9+
4072
- if (this.v8Behaviour) {
4073
- const role = this.getRole();
4074
- if (role instanceof Replicator) {
4312
+ // for backwards compatibility (v8) remove this when we are sure that all nodes are v9+
4313
+ if (this.v8Behaviour) {
4314
+ const role = this.getRole();
4315
+ if (role instanceof Replicator) {
4075
4316
  const fixedSettings = !this._isAdaptiveReplicating;
4076
4317
  if (fixedSettings) {
4077
4318
  await this.rpc.send(
@@ -4096,38 +4337,39 @@ export class SharedLog<
4096
4337
  return;
4097
4338
  }
4098
4339
 
4099
- const replicationInfoMessage = msg as
4100
- | AllReplicatingSegmentsMessage
4101
- | AddedReplicationSegmentMessage;
4102
-
4103
- // Process replication updates even if the sender isn't yet considered "ready" by
4104
- // `Program.waitFor()`. Dropping these messages can lead to missing replicator info
4105
- // (and downstream `waitForReplicator()` timeouts) under timing-sensitive joins.
4106
- const from = context.from!;
4107
- const fromHash = from.hashcode();
4108
- if (this._replicationInfoBlockedPeers.has(fromHash)) {
4109
- return;
4110
- }
4111
- const messageTimestamp = context.message.header.timestamp;
4112
- await this.withReplicationInfoApplyQueue(fromHash, async () => {
4113
- try {
4114
- // The peer may have unsubscribed after this message was queued.
4115
- if (this._replicationInfoBlockedPeers.has(fromHash)) {
4116
- return;
4117
- }
4340
+ const replicationInfoMessage = msg as
4341
+ | AllReplicatingSegmentsMessage
4342
+ | AddedReplicationSegmentMessage;
4343
+
4344
+ // Process replication updates even if the sender isn't yet considered "ready" by
4345
+ // `Program.waitFor()`. Dropping these messages can lead to missing replicator info
4346
+ // (and downstream `waitForReplicator()` timeouts) under timing-sensitive joins.
4347
+ const from = context.from!;
4348
+ const fromHash = from.hashcode();
4349
+ if (this._replicationInfoBlockedPeers.has(fromHash)) {
4350
+ return;
4351
+ }
4352
+ const messageTimestamp = context.message.header.timestamp;
4353
+ await this.withReplicationInfoApplyQueue(fromHash, async () => {
4354
+ try {
4355
+ // The peer may have unsubscribed after this message was queued.
4356
+ if (this._replicationInfoBlockedPeers.has(fromHash)) {
4357
+ return;
4358
+ }
4118
4359
 
4119
- // Process in-order to avoid races where repeated reset messages arrive
4120
- // concurrently and trigger spurious "added" diffs / rebalancing.
4121
- const prev = this.latestReplicationInfoMessage.get(fromHash);
4122
- if (prev && prev > messageTimestamp) {
4123
- return;
4124
- }
4360
+ // Process in-order to avoid races where repeated reset messages arrive
4361
+ // concurrently and trigger spurious "added" diffs / rebalancing.
4362
+ const prev = this.latestReplicationInfoMessage.get(fromHash);
4363
+ if (prev && prev > messageTimestamp) {
4364
+ return;
4365
+ }
4125
4366
 
4126
- this.latestReplicationInfoMessage.set(fromHash, messageTimestamp);
4367
+ this.latestReplicationInfoMessage.set(fromHash, messageTimestamp);
4368
+ this._replicatorLivenessFailures.delete(fromHash);
4127
4369
 
4128
- if (this.closed) {
4129
- return;
4130
- }
4370
+ if (this.closed) {
4371
+ return;
4372
+ }
4131
4373
 
4132
4374
  const reset = msg instanceof AllReplicatingSegmentsMessage;
4133
4375
  await this.addReplicationRange(
@@ -4142,39 +4384,40 @@ export class SharedLog<
4142
4384
  },
4143
4385
  );
4144
4386
 
4145
- // If the peer reports any replication segments, stop re-requesting.
4146
- // (Empty reports can be transient during startup.)
4147
- if (replicationInfoMessage.segments.length > 0) {
4148
- this.cancelReplicationInfoRequests(fromHash);
4149
- }
4150
- } catch (e) {
4151
- if (isNotStartedError(e as Error)) {
4152
- return;
4153
- }
4154
- logger.error(
4155
- `Failed to apply replication settings from '${fromHash}': ${
4156
- (e as any)?.message ?? e
4157
- }`,
4158
- );
4387
+ // If the peer reports any replication segments, stop re-requesting.
4388
+ // (Empty reports can be transient during startup.)
4389
+ if (replicationInfoMessage.segments.length > 0) {
4390
+ this.cancelReplicationInfoRequests(fromHash);
4159
4391
  }
4160
- });
4161
- } else if (msg instanceof StoppedReplicating) {
4162
- if (context.from.equals(this.node.identity.publicKey)) {
4163
- return;
4164
- }
4165
- const fromHash = context.from.hashcode();
4166
- if (this._replicationInfoBlockedPeers.has(fromHash)) {
4392
+ } catch (e) {
4393
+ if (isNotStartedError(e as Error)) {
4167
4394
  return;
4168
4395
  }
4396
+ logger.error(
4397
+ `Failed to apply replication settings from '${fromHash}': ${
4398
+ (e as any)?.message ?? e
4399
+ }`,
4400
+ );
4401
+ }
4402
+ });
4403
+ } else if (msg instanceof StoppedReplicating) {
4404
+ if (context.from.equals(this.node.identity.publicKey)) {
4405
+ return;
4406
+ }
4407
+ const fromHash = context.from.hashcode();
4408
+ if (this._replicationInfoBlockedPeers.has(fromHash)) {
4409
+ return;
4410
+ }
4411
+ this._replicatorLivenessFailures.delete(fromHash);
4169
4412
 
4170
- const rangesToRemove = await this.resolveReplicationRangesFromIdsAndKey(
4171
- msg.segmentIds,
4172
- context.from,
4413
+ const rangesToRemove = await this.resolveReplicationRangesFromIdsAndKey(
4414
+ msg.segmentIds,
4415
+ context.from,
4173
4416
  );
4174
4417
 
4175
- await this.removeReplicationRanges(rangesToRemove, context.from);
4176
- const timestamp = BigInt(+new Date());
4177
- for (const range of rangesToRemove) {
4418
+ await this.removeReplicationRanges(rangesToRemove, context.from);
4419
+ const timestamp = BigInt(+new Date());
4420
+ for (const range of rangesToRemove) {
4178
4421
  this.replicationChangeDebounceFn.add({
4179
4422
  range,
4180
4423
  type: "removed",
@@ -5180,97 +5423,80 @@ export class SharedLog<
5180
5423
  tick();
5181
5424
  }
5182
5425
 
5183
- async handleSubscriptionChange(
5184
- publicKey: PublicSignKey,
5185
- topics: string[],
5186
- subscribed: boolean,
5187
- ) {
5188
- if (!topics.includes(this.topic)) {
5189
- return;
5190
- }
5426
+ async handleSubscriptionChange(
5427
+ publicKey: PublicSignKey,
5428
+ topics: string[],
5429
+ subscribed: boolean,
5430
+ ) {
5431
+ if (!topics.includes(this.topic)) {
5432
+ return;
5433
+ }
5191
5434
 
5192
- const peerHash = publicKey.hashcode();
5193
- if (subscribed) {
5194
- this._replicationInfoBlockedPeers.delete(peerHash);
5195
- } else {
5196
- this._replicationInfoBlockedPeers.add(peerHash);
5435
+ const peerHash = publicKey.hashcode();
5436
+ if (!subscribed) {
5437
+ this._replicationInfoBlockedPeers.add(peerHash);
5438
+
5439
+ const now = BigInt(+new Date());
5440
+ const previous = this.latestReplicationInfoMessage.get(peerHash);
5441
+ if (!previous || previous < now) {
5442
+ this.latestReplicationInfoMessage.set(peerHash, now);
5197
5443
  }
5198
5444
 
5199
- if (!subscribed) {
5200
- const wasReplicator = this.uniqueReplicators.has(peerHash);
5201
- try {
5202
- // Unsubscribe can race with the peer's final replication reset message.
5203
- // Proactively evict its ranges so leader selection doesn't keep stale owners.
5204
- await this.removeReplicator(publicKey, { noEvent: true });
5205
- } catch (error) {
5206
- if (!isNotStartedError(error as Error)) {
5207
- throw error;
5208
- }
5445
+ const wasReplicator = this.uniqueReplicators.has(peerHash);
5446
+ try {
5447
+ // Unsubscribe can race with the peer's final replication reset message.
5448
+ // Proactively evict its ranges so leader selection doesn't keep stale owners.
5449
+ await this.removeReplicator(publicKey, { noEvent: true });
5450
+ } catch (error) {
5451
+ if (!isNotStartedError(error as Error)) {
5452
+ throw error;
5209
5453
  }
5454
+ }
5210
5455
 
5211
- // Emit replicator:leave at most once per (join -> leave) transition, even if we
5212
- // concurrently process unsubscribe + replication reset messages for the same peer.
5213
- const stoppedTransition = wasReplicator;
5214
- this._replicatorJoinEmitted.delete(peerHash);
5215
-
5216
- this.cancelReplicationInfoRequests(peerHash);
5217
- this.removePeerFromGidPeerHistory(peerHash);
5456
+ this._replicatorJoinEmitted.delete(peerHash);
5457
+ this.cleanupPeerDisconnectTracking(peerHash);
5218
5458
 
5219
- for (const [k, v] of this._requestIPruneSent) {
5220
- v.delete(peerHash);
5221
- if (v.size === 0) {
5222
- this._requestIPruneSent.delete(k);
5223
- }
5224
- }
5459
+ if (wasReplicator) {
5460
+ this.events.dispatchEvent(
5461
+ new CustomEvent<ReplicatorLeaveEvent>("replicator:leave", {
5462
+ detail: { publicKey },
5463
+ }),
5464
+ );
5465
+ }
5466
+ return;
5467
+ }
5225
5468
 
5226
- for (const [k, v] of this._requestIPruneResponseReplicatorSet) {
5227
- v.delete(peerHash);
5228
- if (v.size === 0) {
5229
- this._requestIPruneResponseReplicatorSet.delete(k);
5230
- }
5231
- }
5469
+ this._replicationInfoBlockedPeers.delete(peerHash);
5470
+ this._replicatorLivenessFailures.delete(peerHash);
5471
+ this.markReplicatorActivity(peerHash);
5232
5472
 
5233
- this.syncronizer.onPeerDisconnected(publicKey);
5473
+ const replicationSegments = await this.getMyReplicationSegments();
5474
+ if (replicationSegments.length > 0) {
5475
+ this.rpc
5476
+ .send(
5477
+ new AllReplicatingSegmentsMessage({
5478
+ segments: replicationSegments.map((x) => x.toReplicationRange()),
5479
+ }),
5480
+ {
5481
+ mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
5482
+ },
5483
+ )
5484
+ .catch((e) => logger.error(e.toString()));
5234
5485
 
5235
- stoppedTransition &&
5236
- this.events.dispatchEvent(
5237
- new CustomEvent<ReplicatorLeaveEvent>("replicator:leave", {
5238
- detail: { publicKey },
5239
- }),
5240
- );
5486
+ if (this.v8Behaviour) {
5487
+ // for backwards compatibility
5488
+ this.rpc
5489
+ .send(new ResponseRoleMessage({ role: await this.getRole() }), {
5490
+ mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
5491
+ })
5492
+ .catch((e) => logger.error(e.toString()));
5241
5493
  }
5242
-
5243
- if (subscribed) {
5244
- const replicationSegments = await this.getMyReplicationSegments();
5245
- if (replicationSegments.length > 0) {
5246
- this.rpc
5247
- .send(
5248
- new AllReplicatingSegmentsMessage({
5249
- segments: replicationSegments.map((x) => x.toReplicationRange()),
5250
- }),
5251
- {
5252
- mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
5253
- },
5254
- )
5255
- .catch((e) => logger.error(e.toString()));
5256
-
5257
- if (this.v8Behaviour) {
5258
- // for backwards compatibility
5259
- this.rpc
5260
- .send(new ResponseRoleMessage({ role: await this.getRole() }), {
5261
- mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
5262
- })
5263
- .catch((e) => logger.error(e.toString()));
5264
- }
5265
- }
5266
-
5267
- // Request the remote peer's replication info. This makes joins resilient to
5268
- // timing-sensitive delivery/order issues where we may miss their initial
5269
- // replication announcement.
5270
- this.scheduleReplicationInfoRequests(publicKey);
5271
- } else {
5272
- await this.removeReplicator(publicKey);
5273
5494
  }
5495
+
5496
+ // Request the remote peer's replication info. This makes joins resilient to
5497
+ // timing-sensitive delivery/order issues where we may miss their initial
5498
+ // replication announcement.
5499
+ this.scheduleReplicationInfoRequests(publicKey);
5274
5500
  }
5275
5501
 
5276
5502
  private getClampedReplicas(customValue?: MinReplicas) {