@peerbit/shared-log 13.0.1-06e7585 → 13.0.2-a54459d
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/index.d.ts +17 -0
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +250 -57
- package/dist/src/index.js.map +1 -1
- package/dist/src/replication.d.ts +3 -0
- package/dist/src/replication.d.ts.map +1 -1
- package/dist/src/replication.js +25 -0
- package/dist/src/replication.js.map +1 -1
- package/dist/src/sync/simple.d.ts +1 -1
- package/dist/src/sync/simple.d.ts.map +1 -1
- package/dist/src/sync/simple.js +15 -8
- package/dist/src/sync/simple.js.map +1 -1
- package/package.json +18 -18
- package/src/index.ts +435 -209
- package/src/replication.ts +10 -0
- package/src/sync/simple.ts +24 -24
package/src/index.ts
CHANGED
|
@@ -137,6 +137,7 @@ import {
|
|
|
137
137
|
AddedReplicationSegmentMessage,
|
|
138
138
|
AllReplicatingSegmentsMessage,
|
|
139
139
|
MinReplicas,
|
|
140
|
+
ReplicationPingMessage,
|
|
140
141
|
ReplicationError,
|
|
141
142
|
type ReplicationLimits,
|
|
142
143
|
RequestReplicationInfoMessage,
|
|
@@ -444,6 +445,13 @@ const DEFAULT_DISTRIBUTION_DEBOUNCE_TIME = 500;
|
|
|
444
445
|
const RECENT_REPAIR_DISPATCH_TTL_MS = 5_000;
|
|
445
446
|
const REPAIR_SWEEP_ENTRY_BATCH_SIZE = 1_000;
|
|
446
447
|
const REPAIR_SWEEP_TARGET_BUFFER_SIZE = 1024;
|
|
448
|
+
// In sparse topologies (browser/relay), peers can learn about replicators via broadcast
|
|
449
|
+
// replication announcements without having a direct connection that emits unsubscribe
|
|
450
|
+
// on abrupt churn. Probe conservatively so a single missed ACK does not evict a
|
|
451
|
+
// healthy replicator, and rely on replication-info refresh to recover membership.
|
|
452
|
+
const REPLICATOR_LIVENESS_SWEEP_INTERVAL_MS = 2_000;
|
|
453
|
+
const REPLICATOR_LIVENESS_IDLE_THRESHOLD_MS = 8_000;
|
|
454
|
+
const REPLICATOR_LIVENESS_PROBE_FAILURES_TO_EVICT = 2;
|
|
447
455
|
// Churn/join repair can race with pruning and transient missed sync requests under
|
|
448
456
|
// heavy event-loop load. Keep retries alive with a longer tail so reassigned
|
|
449
457
|
// entries are retried after short bursts and slower recovery windows.
|
|
@@ -634,6 +642,13 @@ export class SharedLog<
|
|
|
634
642
|
{ attempts: number; timer?: ReturnType<typeof setTimeout> }
|
|
635
643
|
>;
|
|
636
644
|
private _replicationInfoApplyQueueByPeer!: Map<string, Promise<void>>;
|
|
645
|
+
private _replicatorLivenessSweepRunning!: boolean;
|
|
646
|
+
private _replicatorLivenessTimer?: ReturnType<typeof setInterval>;
|
|
647
|
+
private _replicatorLivenessTargets!: string[];
|
|
648
|
+
private _replicatorLivenessTargetsSize!: number;
|
|
649
|
+
private _replicatorLivenessCursor!: number;
|
|
650
|
+
private _replicatorLivenessFailures!: Map<string, number>;
|
|
651
|
+
private _replicatorLastActivityAt!: Map<string, number>;
|
|
637
652
|
|
|
638
653
|
private remoteBlocks!: RemoteBlocks;
|
|
639
654
|
|
|
@@ -2763,23 +2778,30 @@ export class SharedLog<
|
|
|
2763
2778
|
this.domain.resolution,
|
|
2764
2779
|
);
|
|
2765
2780
|
this._respondToIHaveTimeout = options?.respondToIHaveTimeout ?? 2e4;
|
|
2766
|
-
|
|
2767
|
-
|
|
2768
|
-
|
|
2769
|
-
|
|
2770
|
-
|
|
2771
|
-
|
|
2772
|
-
|
|
2773
|
-
|
|
2774
|
-
|
|
2775
|
-
|
|
2776
|
-
|
|
2777
|
-
|
|
2778
|
-
|
|
2779
|
-
|
|
2780
|
-
|
|
2781
|
-
|
|
2782
|
-
|
|
2781
|
+
this._pendingDeletes = new Map();
|
|
2782
|
+
this._pendingIHave = new Map();
|
|
2783
|
+
this.latestReplicationInfoMessage = new Map();
|
|
2784
|
+
this._replicationInfoBlockedPeers = new Set();
|
|
2785
|
+
this._replicationInfoRequestByPeer = new Map();
|
|
2786
|
+
this._replicationInfoApplyQueueByPeer = new Map();
|
|
2787
|
+
this._repairRetryTimers = new Set();
|
|
2788
|
+
this._recentRepairDispatch = new Map();
|
|
2789
|
+
this._repairSweepRunning = false;
|
|
2790
|
+
this._repairSweepForceFreshPending = false;
|
|
2791
|
+
this._repairSweepAddedPeersPending = new Set();
|
|
2792
|
+
this.coordinateToHash = new Cache<string>({ max: 1e6, ttl: 1e4 });
|
|
2793
|
+
this.recentlyRebalanced = new Cache<string>({ max: 1e4, ttl: 1e5 });
|
|
2794
|
+
|
|
2795
|
+
this.uniqueReplicators = new Set();
|
|
2796
|
+
this._replicatorJoinEmitted = new Set();
|
|
2797
|
+
this._replicatorsReconciled = false;
|
|
2798
|
+
this._replicatorLivenessSweepRunning = false;
|
|
2799
|
+
this._replicatorLivenessTimer = undefined;
|
|
2800
|
+
this._replicatorLivenessTargets = [];
|
|
2801
|
+
this._replicatorLivenessTargetsSize = 0;
|
|
2802
|
+
this._replicatorLivenessCursor = 0;
|
|
2803
|
+
this._replicatorLivenessFailures = new Map();
|
|
2804
|
+
this._replicatorLastActivityAt = new Map();
|
|
2783
2805
|
|
|
2784
2806
|
this.openTime = +new Date();
|
|
2785
2807
|
this.oldestOpenTime = this.openTime;
|
|
@@ -3233,18 +3255,20 @@ export class SharedLog<
|
|
|
3233
3255
|
await super.afterOpen();
|
|
3234
3256
|
|
|
3235
3257
|
// We do this here, because these calls requires this.closed == false
|
|
3236
|
-
|
|
3237
|
-
|
|
3238
|
-
|
|
3239
|
-
|
|
3258
|
+
void this.pruneOfflineReplicators()
|
|
3259
|
+
.then(() => {
|
|
3260
|
+
this._replicatorsReconciled = true;
|
|
3261
|
+
})
|
|
3240
3262
|
.catch((error) => {
|
|
3241
3263
|
if (isNotStartedError(error as Error)) {
|
|
3242
3264
|
return;
|
|
3243
3265
|
}
|
|
3244
|
-
|
|
3245
|
-
|
|
3266
|
+
logger.error(error);
|
|
3267
|
+
});
|
|
3246
3268
|
|
|
3247
|
-
|
|
3269
|
+
this.startReplicatorLivenessSweep();
|
|
3270
|
+
|
|
3271
|
+
await this.rebalanceParticipation();
|
|
3248
3272
|
|
|
3249
3273
|
// Take into account existing subscription
|
|
3250
3274
|
(await this._getTopicSubscribers(this.topic))?.forEach((v) => {
|
|
@@ -3263,12 +3287,12 @@ export class SharedLog<
|
|
|
3263
3287
|
}
|
|
3264
3288
|
|
|
3265
3289
|
async pruneOfflineReplicators() {
|
|
3266
|
-
//
|
|
3267
|
-
|
|
3290
|
+
// Go through all segments and wait for replicators to become reachable;
|
|
3291
|
+
// otherwise prune them away from the local membership view.
|
|
3268
3292
|
try {
|
|
3269
3293
|
const promises: Promise<any>[] = [];
|
|
3270
3294
|
const iterator = this.replicationIndex.iterate();
|
|
3271
|
-
|
|
3295
|
+
const checkedIsAlive = new Set<string>();
|
|
3272
3296
|
|
|
3273
3297
|
while (!iterator.done()) {
|
|
3274
3298
|
for (const segment of await iterator.next(1000)) {
|
|
@@ -3288,7 +3312,6 @@ export class SharedLog<
|
|
|
3288
3312
|
signal: this._closeController.signal,
|
|
3289
3313
|
})
|
|
3290
3314
|
.then(async () => {
|
|
3291
|
-
// is reachable, announce change events
|
|
3292
3315
|
const key = await this._resolvePublicKeyFromHash(
|
|
3293
3316
|
segment.value.hash,
|
|
3294
3317
|
);
|
|
@@ -3299,49 +3322,261 @@ export class SharedLog<
|
|
|
3299
3322
|
);
|
|
3300
3323
|
}
|
|
3301
3324
|
|
|
3302
|
-
|
|
3303
|
-
|
|
3325
|
+
const keyHash = key.hashcode();
|
|
3326
|
+
this.uniqueReplicators.add(keyHash);
|
|
3304
3327
|
|
|
3305
|
-
|
|
3306
|
-
|
|
3307
|
-
|
|
3308
|
-
|
|
3309
|
-
|
|
3310
|
-
|
|
3311
|
-
|
|
3312
|
-
|
|
3313
|
-
|
|
3314
|
-
|
|
3315
|
-
|
|
3316
|
-
|
|
3317
|
-
|
|
3318
|
-
|
|
3319
|
-
|
|
3320
|
-
|
|
3321
|
-
|
|
3322
|
-
.catch(async (e) => {
|
|
3323
|
-
if (isNotStartedError(e)) {
|
|
3324
|
-
return; // TODO test this path
|
|
3328
|
+
if (!this._replicatorJoinEmitted.has(keyHash)) {
|
|
3329
|
+
this._replicatorJoinEmitted.add(keyHash);
|
|
3330
|
+
this.events.dispatchEvent(
|
|
3331
|
+
new CustomEvent<ReplicatorJoinEvent>("replicator:join", {
|
|
3332
|
+
detail: { publicKey: key },
|
|
3333
|
+
}),
|
|
3334
|
+
);
|
|
3335
|
+
this.events.dispatchEvent(
|
|
3336
|
+
new CustomEvent<ReplicationChangeEvent>("replication:change", {
|
|
3337
|
+
detail: { publicKey: key },
|
|
3338
|
+
}),
|
|
3339
|
+
);
|
|
3340
|
+
}
|
|
3341
|
+
})
|
|
3342
|
+
.catch(async (error) => {
|
|
3343
|
+
if (isNotStartedError(error as Error)) {
|
|
3344
|
+
return;
|
|
3325
3345
|
}
|
|
3326
3346
|
|
|
3327
|
-
// not reachable
|
|
3328
3347
|
return this.removeReplicator(segment.value.hash, {
|
|
3329
3348
|
noEvent: true,
|
|
3330
|
-
});
|
|
3349
|
+
});
|
|
3331
3350
|
}),
|
|
3332
3351
|
);
|
|
3333
3352
|
}
|
|
3334
3353
|
}
|
|
3335
|
-
|
|
3336
|
-
return
|
|
3337
|
-
} catch (error
|
|
3338
|
-
if (isNotStartedError(error)) {
|
|
3354
|
+
|
|
3355
|
+
return Promise.all(promises);
|
|
3356
|
+
} catch (error) {
|
|
3357
|
+
if (isNotStartedError(error as Error)) {
|
|
3339
3358
|
return;
|
|
3340
3359
|
}
|
|
3341
3360
|
throw error;
|
|
3342
3361
|
}
|
|
3343
3362
|
}
|
|
3344
3363
|
|
|
3364
|
+
private startReplicatorLivenessSweep() {
|
|
3365
|
+
if (this._replicatorLivenessTimer) {
|
|
3366
|
+
return;
|
|
3367
|
+
}
|
|
3368
|
+
this._replicatorLivenessTimer = setInterval(() => {
|
|
3369
|
+
void this.runReplicatorLivenessSweep();
|
|
3370
|
+
}, REPLICATOR_LIVENESS_SWEEP_INTERVAL_MS);
|
|
3371
|
+
this._replicatorLivenessTimer.unref?.();
|
|
3372
|
+
}
|
|
3373
|
+
|
|
3374
|
+
private stopReplicatorLivenessSweep() {
|
|
3375
|
+
if (this._replicatorLivenessTimer) {
|
|
3376
|
+
clearInterval(this._replicatorLivenessTimer);
|
|
3377
|
+
this._replicatorLivenessTimer = undefined;
|
|
3378
|
+
}
|
|
3379
|
+
this._replicatorLivenessSweepRunning = false;
|
|
3380
|
+
this._replicatorLivenessTargets = [];
|
|
3381
|
+
this._replicatorLivenessTargetsSize = 0;
|
|
3382
|
+
this._replicatorLivenessCursor = 0;
|
|
3383
|
+
this._replicatorLivenessFailures.clear();
|
|
3384
|
+
this._replicatorLastActivityAt.clear();
|
|
3385
|
+
}
|
|
3386
|
+
|
|
3387
|
+
private rebuildReplicatorLivenessTargets() {
|
|
3388
|
+
const selfHash = this.node.identity.publicKey.hashcode();
|
|
3389
|
+
this._replicatorLivenessTargets = [...this.uniqueReplicators].filter(
|
|
3390
|
+
(hash) => hash !== selfHash,
|
|
3391
|
+
);
|
|
3392
|
+
this._replicatorLivenessTargetsSize = this.uniqueReplicators.size;
|
|
3393
|
+
if (this._replicatorLivenessCursor >= this._replicatorLivenessTargets.length) {
|
|
3394
|
+
this._replicatorLivenessCursor = 0;
|
|
3395
|
+
}
|
|
3396
|
+
}
|
|
3397
|
+
|
|
3398
|
+
private getReplicatorLivenessTargets() {
|
|
3399
|
+
const selfHash = this.node.identity.publicKey.hashcode();
|
|
3400
|
+
const expected =
|
|
3401
|
+
this.uniqueReplicators.size - (this.uniqueReplicators.has(selfHash) ? 1 : 0);
|
|
3402
|
+
|
|
3403
|
+
if (this._replicatorLivenessTargets.length > 0) {
|
|
3404
|
+
// Keep the cursor stable, but purge stale hashes (membership can change while
|
|
3405
|
+
// the total size stays constant).
|
|
3406
|
+
this._replicatorLivenessTargets = this._replicatorLivenessTargets.filter(
|
|
3407
|
+
(hash) => hash !== selfHash && this.uniqueReplicators.has(hash),
|
|
3408
|
+
);
|
|
3409
|
+
}
|
|
3410
|
+
|
|
3411
|
+
if (
|
|
3412
|
+
this._replicatorLivenessTargetsSize !== this.uniqueReplicators.size ||
|
|
3413
|
+
this._replicatorLivenessTargets.length !== expected
|
|
3414
|
+
) {
|
|
3415
|
+
this.rebuildReplicatorLivenessTargets();
|
|
3416
|
+
}
|
|
3417
|
+
|
|
3418
|
+
return this._replicatorLivenessTargets;
|
|
3419
|
+
}
|
|
3420
|
+
|
|
3421
|
+
private cleanupPeerDisconnectTracking(peerHash: string) {
|
|
3422
|
+
this.cancelReplicationInfoRequests(peerHash);
|
|
3423
|
+
this._replicatorLivenessFailures.delete(peerHash);
|
|
3424
|
+
this._replicatorLastActivityAt.delete(peerHash);
|
|
3425
|
+
|
|
3426
|
+
for (const [hash, peers] of this._requestIPruneSent) {
|
|
3427
|
+
peers.delete(peerHash);
|
|
3428
|
+
if (peers.size === 0) {
|
|
3429
|
+
this._requestIPruneSent.delete(hash);
|
|
3430
|
+
}
|
|
3431
|
+
}
|
|
3432
|
+
|
|
3433
|
+
for (const [hash, peers] of this._requestIPruneResponseReplicatorSet) {
|
|
3434
|
+
peers.delete(peerHash);
|
|
3435
|
+
if (peers.size === 0) {
|
|
3436
|
+
this._requestIPruneResponseReplicatorSet.delete(hash);
|
|
3437
|
+
}
|
|
3438
|
+
}
|
|
3439
|
+
}
|
|
3440
|
+
|
|
3441
|
+
private markReplicatorActivity(peerHash: string, now = Date.now()) {
|
|
3442
|
+
this._replicatorLastActivityAt.set(peerHash, now);
|
|
3443
|
+
}
|
|
3444
|
+
|
|
3445
|
+
private hasRecentReplicatorActivity(peerHash: string, now = Date.now()) {
|
|
3446
|
+
const lastActivityAt = this._replicatorLastActivityAt.get(peerHash);
|
|
3447
|
+
if (
|
|
3448
|
+
lastActivityAt != null &&
|
|
3449
|
+
now - lastActivityAt < REPLICATOR_LIVENESS_IDLE_THRESHOLD_MS
|
|
3450
|
+
) {
|
|
3451
|
+
this._replicatorLivenessFailures.delete(peerHash);
|
|
3452
|
+
return true;
|
|
3453
|
+
}
|
|
3454
|
+
return false;
|
|
3455
|
+
}
|
|
3456
|
+
|
|
3457
|
+
private async evictReplicatorFromLiveness(
|
|
3458
|
+
peerHash: string,
|
|
3459
|
+
publicKey: PublicSignKey,
|
|
3460
|
+
) {
|
|
3461
|
+
const wasReplicator = this.uniqueReplicators.has(peerHash);
|
|
3462
|
+
const watermark = BigInt(+new Date());
|
|
3463
|
+
const previousWatermark = this.latestReplicationInfoMessage.get(peerHash);
|
|
3464
|
+
if (!previousWatermark || previousWatermark < watermark) {
|
|
3465
|
+
this.latestReplicationInfoMessage.set(peerHash, watermark);
|
|
3466
|
+
}
|
|
3467
|
+
|
|
3468
|
+
try {
|
|
3469
|
+
await this.removeReplicator(publicKey, { noEvent: true });
|
|
3470
|
+
} catch (error) {
|
|
3471
|
+
if (!isNotStartedError(error as Error)) {
|
|
3472
|
+
throw error;
|
|
3473
|
+
}
|
|
3474
|
+
}
|
|
3475
|
+
|
|
3476
|
+
this.cleanupPeerDisconnectTracking(peerHash);
|
|
3477
|
+
|
|
3478
|
+
if (wasReplicator) {
|
|
3479
|
+
this.events.dispatchEvent(
|
|
3480
|
+
new CustomEvent<ReplicatorLeaveEvent>("replicator:leave", {
|
|
3481
|
+
detail: { publicKey },
|
|
3482
|
+
}),
|
|
3483
|
+
);
|
|
3484
|
+
}
|
|
3485
|
+
|
|
3486
|
+
if (!this._replicationInfoBlockedPeers.has(peerHash)) {
|
|
3487
|
+
this.scheduleReplicationInfoRequests(publicKey);
|
|
3488
|
+
}
|
|
3489
|
+
this._replicatorLivenessTargetsSize = -1;
|
|
3490
|
+
}
|
|
3491
|
+
|
|
3492
|
+
private async runReplicatorLivenessSweep() {
|
|
3493
|
+
if (this.closed || this._closeController.signal.aborted) {
|
|
3494
|
+
return;
|
|
3495
|
+
}
|
|
3496
|
+
if (this._replicatorLivenessSweepRunning) {
|
|
3497
|
+
return;
|
|
3498
|
+
}
|
|
3499
|
+
|
|
3500
|
+
const targets = this.getReplicatorLivenessTargets();
|
|
3501
|
+
if (targets.length === 0) {
|
|
3502
|
+
return;
|
|
3503
|
+
}
|
|
3504
|
+
|
|
3505
|
+
this._replicatorLivenessSweepRunning = true;
|
|
3506
|
+
try {
|
|
3507
|
+
if (this._replicatorLivenessCursor >= targets.length) {
|
|
3508
|
+
this._replicatorLivenessCursor = 0;
|
|
3509
|
+
}
|
|
3510
|
+
const peerHash = targets[this._replicatorLivenessCursor]!;
|
|
3511
|
+
this._replicatorLivenessCursor =
|
|
3512
|
+
(this._replicatorLivenessCursor + 1) % targets.length;
|
|
3513
|
+
await this.probeReplicatorLiveness(peerHash);
|
|
3514
|
+
} catch (error) {
|
|
3515
|
+
if (!isNotStartedError(error as Error)) {
|
|
3516
|
+
logger.error((error as any)?.toString?.() ?? String(error));
|
|
3517
|
+
}
|
|
3518
|
+
} finally {
|
|
3519
|
+
this._replicatorLivenessSweepRunning = false;
|
|
3520
|
+
}
|
|
3521
|
+
}
|
|
3522
|
+
|
|
3523
|
+
private async probeReplicatorLiveness(peerHash: string) {
|
|
3524
|
+
if (this.closed || this._closeController.signal.aborted) {
|
|
3525
|
+
return;
|
|
3526
|
+
}
|
|
3527
|
+
if (!this.uniqueReplicators.has(peerHash)) {
|
|
3528
|
+
this._replicatorLivenessFailures.delete(peerHash);
|
|
3529
|
+
return;
|
|
3530
|
+
}
|
|
3531
|
+
if (this.hasRecentReplicatorActivity(peerHash)) {
|
|
3532
|
+
return;
|
|
3533
|
+
}
|
|
3534
|
+
|
|
3535
|
+
const publicKey = await this._resolvePublicKeyFromHash(peerHash);
|
|
3536
|
+
if (!publicKey) {
|
|
3537
|
+
try {
|
|
3538
|
+
await this.removeReplicator(peerHash, { noEvent: true });
|
|
3539
|
+
} catch (error) {
|
|
3540
|
+
if (!isNotStartedError(error as Error)) {
|
|
3541
|
+
throw error;
|
|
3542
|
+
}
|
|
3543
|
+
}
|
|
3544
|
+
this.cleanupPeerDisconnectTracking(peerHash);
|
|
3545
|
+
this._replicatorLivenessTargetsSize = -1;
|
|
3546
|
+
return;
|
|
3547
|
+
}
|
|
3548
|
+
|
|
3549
|
+
try {
|
|
3550
|
+
// Explicit ping (ACKed) instead of RequestReplicationInfoMessage to avoid
|
|
3551
|
+
// triggering large segment snapshots just to prove liveness.
|
|
3552
|
+
await this.rpc.send(new ReplicationPingMessage(), {
|
|
3553
|
+
mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
|
|
3554
|
+
priority: 1,
|
|
3555
|
+
});
|
|
3556
|
+
this.markReplicatorActivity(peerHash);
|
|
3557
|
+
this._replicatorLivenessFailures.delete(peerHash);
|
|
3558
|
+
return;
|
|
3559
|
+
} catch (error) {
|
|
3560
|
+
if (isNotStartedError(error as Error)) {
|
|
3561
|
+
return;
|
|
3562
|
+
}
|
|
3563
|
+
}
|
|
3564
|
+
|
|
3565
|
+
const failures = (this._replicatorLivenessFailures.get(peerHash) ?? 0) + 1;
|
|
3566
|
+
this._replicatorLivenessFailures.set(peerHash, failures);
|
|
3567
|
+
this.scheduleReplicationInfoRequests(publicKey);
|
|
3568
|
+
|
|
3569
|
+
if (failures < REPLICATOR_LIVENESS_PROBE_FAILURES_TO_EVICT) {
|
|
3570
|
+
return;
|
|
3571
|
+
}
|
|
3572
|
+
if (!this.uniqueReplicators.has(peerHash)) {
|
|
3573
|
+
this._replicatorLivenessFailures.delete(peerHash);
|
|
3574
|
+
return;
|
|
3575
|
+
}
|
|
3576
|
+
|
|
3577
|
+
await this.evictReplicatorFromLiveness(peerHash, publicKey);
|
|
3578
|
+
}
|
|
3579
|
+
|
|
3345
3580
|
async getMemoryUsage() {
|
|
3346
3581
|
return this.log.blocks.size();
|
|
3347
3582
|
/* ((await this.log.entryIndex?.getMemoryUsage()) || 0) */ // + (await this.log.blocks.size())
|
|
@@ -3533,13 +3768,14 @@ export class SharedLog<
|
|
|
3533
3768
|
this.coordinateToHash.clear();
|
|
3534
3769
|
this.recentlyRebalanced.clear();
|
|
3535
3770
|
this.uniqueReplicators.clear();
|
|
3536
|
-
|
|
3771
|
+
this._closeController.abort();
|
|
3537
3772
|
|
|
3538
|
-
|
|
3773
|
+
clearInterval(this.interval);
|
|
3774
|
+
this.stopReplicatorLivenessSweep();
|
|
3539
3775
|
|
|
3540
|
-
|
|
3541
|
-
|
|
3542
|
-
|
|
3776
|
+
this.node.services.pubsub.removeEventListener(
|
|
3777
|
+
"subscribe",
|
|
3778
|
+
this._onSubscriptionFn,
|
|
3543
3779
|
);
|
|
3544
3780
|
|
|
3545
3781
|
this.node.services.pubsub.removeEventListener(
|
|
@@ -3712,6 +3948,9 @@ export class SharedLog<
|
|
|
3712
3948
|
if (!context.from) {
|
|
3713
3949
|
throw new Error("Missing from in update role message");
|
|
3714
3950
|
}
|
|
3951
|
+
if (!context.from.equals(this.node.identity.publicKey)) {
|
|
3952
|
+
this.markReplicatorActivity(context.from.hashcode());
|
|
3953
|
+
}
|
|
3715
3954
|
|
|
3716
3955
|
if (msg instanceof ResponseRoleMessage) {
|
|
3717
3956
|
msg = msg.toReplicationInfoMessage(); // migration
|
|
@@ -4053,25 +4292,27 @@ export class SharedLog<
|
|
|
4053
4292
|
msg.message,
|
|
4054
4293
|
context.from!.hashcode(),
|
|
4055
4294
|
);
|
|
4295
|
+
} else if (msg instanceof ReplicationPingMessage) {
|
|
4296
|
+
// No-op: used as an ACKed unicast liveness probe.
|
|
4056
4297
|
} else if (msg instanceof RequestReplicationInfoMessage) {
|
|
4057
4298
|
if (context.from.equals(this.node.identity.publicKey)) {
|
|
4058
4299
|
return;
|
|
4059
4300
|
}
|
|
4060
4301
|
|
|
4061
|
-
|
|
4062
|
-
|
|
4063
|
-
|
|
4302
|
+
const segments = (await this.getMyReplicationSegments()).map((x) =>
|
|
4303
|
+
x.toReplicationRange(),
|
|
4304
|
+
);
|
|
4064
4305
|
|
|
4065
|
-
|
|
4066
|
-
|
|
4067
|
-
|
|
4068
|
-
|
|
4069
|
-
|
|
4306
|
+
this.rpc
|
|
4307
|
+
.send(new AllReplicatingSegmentsMessage({ segments }), {
|
|
4308
|
+
mode: new AcknowledgeDelivery({ to: [context.from], redundancy: 1 }),
|
|
4309
|
+
})
|
|
4310
|
+
.catch((e) => logger.error(e.toString()));
|
|
4070
4311
|
|
|
4071
|
-
|
|
4072
|
-
|
|
4073
|
-
|
|
4074
|
-
|
|
4312
|
+
// for backwards compatibility (v8) remove this when we are sure that all nodes are v9+
|
|
4313
|
+
if (this.v8Behaviour) {
|
|
4314
|
+
const role = this.getRole();
|
|
4315
|
+
if (role instanceof Replicator) {
|
|
4075
4316
|
const fixedSettings = !this._isAdaptiveReplicating;
|
|
4076
4317
|
if (fixedSettings) {
|
|
4077
4318
|
await this.rpc.send(
|
|
@@ -4096,38 +4337,39 @@ export class SharedLog<
|
|
|
4096
4337
|
return;
|
|
4097
4338
|
}
|
|
4098
4339
|
|
|
4099
|
-
|
|
4100
|
-
|
|
4101
|
-
|
|
4102
|
-
|
|
4103
|
-
|
|
4104
|
-
|
|
4105
|
-
|
|
4106
|
-
|
|
4107
|
-
|
|
4108
|
-
|
|
4109
|
-
|
|
4110
|
-
|
|
4111
|
-
|
|
4112
|
-
|
|
4113
|
-
|
|
4114
|
-
|
|
4115
|
-
|
|
4116
|
-
|
|
4117
|
-
|
|
4340
|
+
const replicationInfoMessage = msg as
|
|
4341
|
+
| AllReplicatingSegmentsMessage
|
|
4342
|
+
| AddedReplicationSegmentMessage;
|
|
4343
|
+
|
|
4344
|
+
// Process replication updates even if the sender isn't yet considered "ready" by
|
|
4345
|
+
// `Program.waitFor()`. Dropping these messages can lead to missing replicator info
|
|
4346
|
+
// (and downstream `waitForReplicator()` timeouts) under timing-sensitive joins.
|
|
4347
|
+
const from = context.from!;
|
|
4348
|
+
const fromHash = from.hashcode();
|
|
4349
|
+
if (this._replicationInfoBlockedPeers.has(fromHash)) {
|
|
4350
|
+
return;
|
|
4351
|
+
}
|
|
4352
|
+
const messageTimestamp = context.message.header.timestamp;
|
|
4353
|
+
await this.withReplicationInfoApplyQueue(fromHash, async () => {
|
|
4354
|
+
try {
|
|
4355
|
+
// The peer may have unsubscribed after this message was queued.
|
|
4356
|
+
if (this._replicationInfoBlockedPeers.has(fromHash)) {
|
|
4357
|
+
return;
|
|
4358
|
+
}
|
|
4118
4359
|
|
|
4119
|
-
|
|
4120
|
-
|
|
4121
|
-
|
|
4122
|
-
|
|
4123
|
-
|
|
4124
|
-
|
|
4360
|
+
// Process in-order to avoid races where repeated reset messages arrive
|
|
4361
|
+
// concurrently and trigger spurious "added" diffs / rebalancing.
|
|
4362
|
+
const prev = this.latestReplicationInfoMessage.get(fromHash);
|
|
4363
|
+
if (prev && prev > messageTimestamp) {
|
|
4364
|
+
return;
|
|
4365
|
+
}
|
|
4125
4366
|
|
|
4126
|
-
|
|
4367
|
+
this.latestReplicationInfoMessage.set(fromHash, messageTimestamp);
|
|
4368
|
+
this._replicatorLivenessFailures.delete(fromHash);
|
|
4127
4369
|
|
|
4128
|
-
|
|
4129
|
-
|
|
4130
|
-
|
|
4370
|
+
if (this.closed) {
|
|
4371
|
+
return;
|
|
4372
|
+
}
|
|
4131
4373
|
|
|
4132
4374
|
const reset = msg instanceof AllReplicatingSegmentsMessage;
|
|
4133
4375
|
await this.addReplicationRange(
|
|
@@ -4142,39 +4384,40 @@ export class SharedLog<
|
|
|
4142
4384
|
},
|
|
4143
4385
|
);
|
|
4144
4386
|
|
|
4145
|
-
|
|
4146
|
-
|
|
4147
|
-
|
|
4148
|
-
|
|
4149
|
-
}
|
|
4150
|
-
} catch (e) {
|
|
4151
|
-
if (isNotStartedError(e as Error)) {
|
|
4152
|
-
return;
|
|
4153
|
-
}
|
|
4154
|
-
logger.error(
|
|
4155
|
-
`Failed to apply replication settings from '${fromHash}': ${
|
|
4156
|
-
(e as any)?.message ?? e
|
|
4157
|
-
}`,
|
|
4158
|
-
);
|
|
4387
|
+
// If the peer reports any replication segments, stop re-requesting.
|
|
4388
|
+
// (Empty reports can be transient during startup.)
|
|
4389
|
+
if (replicationInfoMessage.segments.length > 0) {
|
|
4390
|
+
this.cancelReplicationInfoRequests(fromHash);
|
|
4159
4391
|
}
|
|
4160
|
-
})
|
|
4161
|
-
|
|
4162
|
-
if (context.from.equals(this.node.identity.publicKey)) {
|
|
4163
|
-
return;
|
|
4164
|
-
}
|
|
4165
|
-
const fromHash = context.from.hashcode();
|
|
4166
|
-
if (this._replicationInfoBlockedPeers.has(fromHash)) {
|
|
4392
|
+
} catch (e) {
|
|
4393
|
+
if (isNotStartedError(e as Error)) {
|
|
4167
4394
|
return;
|
|
4168
4395
|
}
|
|
4396
|
+
logger.error(
|
|
4397
|
+
`Failed to apply replication settings from '${fromHash}': ${
|
|
4398
|
+
(e as any)?.message ?? e
|
|
4399
|
+
}`,
|
|
4400
|
+
);
|
|
4401
|
+
}
|
|
4402
|
+
});
|
|
4403
|
+
} else if (msg instanceof StoppedReplicating) {
|
|
4404
|
+
if (context.from.equals(this.node.identity.publicKey)) {
|
|
4405
|
+
return;
|
|
4406
|
+
}
|
|
4407
|
+
const fromHash = context.from.hashcode();
|
|
4408
|
+
if (this._replicationInfoBlockedPeers.has(fromHash)) {
|
|
4409
|
+
return;
|
|
4410
|
+
}
|
|
4411
|
+
this._replicatorLivenessFailures.delete(fromHash);
|
|
4169
4412
|
|
|
4170
|
-
|
|
4171
|
-
|
|
4172
|
-
|
|
4413
|
+
const rangesToRemove = await this.resolveReplicationRangesFromIdsAndKey(
|
|
4414
|
+
msg.segmentIds,
|
|
4415
|
+
context.from,
|
|
4173
4416
|
);
|
|
4174
4417
|
|
|
4175
|
-
|
|
4176
|
-
|
|
4177
|
-
|
|
4418
|
+
await this.removeReplicationRanges(rangesToRemove, context.from);
|
|
4419
|
+
const timestamp = BigInt(+new Date());
|
|
4420
|
+
for (const range of rangesToRemove) {
|
|
4178
4421
|
this.replicationChangeDebounceFn.add({
|
|
4179
4422
|
range,
|
|
4180
4423
|
type: "removed",
|
|
@@ -5180,97 +5423,80 @@ export class SharedLog<
|
|
|
5180
5423
|
tick();
|
|
5181
5424
|
}
|
|
5182
5425
|
|
|
5183
|
-
|
|
5184
|
-
|
|
5185
|
-
|
|
5186
|
-
|
|
5187
|
-
|
|
5188
|
-
|
|
5189
|
-
|
|
5190
|
-
|
|
5426
|
+
async handleSubscriptionChange(
|
|
5427
|
+
publicKey: PublicSignKey,
|
|
5428
|
+
topics: string[],
|
|
5429
|
+
subscribed: boolean,
|
|
5430
|
+
) {
|
|
5431
|
+
if (!topics.includes(this.topic)) {
|
|
5432
|
+
return;
|
|
5433
|
+
}
|
|
5191
5434
|
|
|
5192
|
-
|
|
5193
|
-
|
|
5194
|
-
|
|
5195
|
-
|
|
5196
|
-
|
|
5435
|
+
const peerHash = publicKey.hashcode();
|
|
5436
|
+
if (!subscribed) {
|
|
5437
|
+
this._replicationInfoBlockedPeers.add(peerHash);
|
|
5438
|
+
|
|
5439
|
+
const now = BigInt(+new Date());
|
|
5440
|
+
const previous = this.latestReplicationInfoMessage.get(peerHash);
|
|
5441
|
+
if (!previous || previous < now) {
|
|
5442
|
+
this.latestReplicationInfoMessage.set(peerHash, now);
|
|
5197
5443
|
}
|
|
5198
5444
|
|
|
5199
|
-
|
|
5200
|
-
|
|
5201
|
-
|
|
5202
|
-
|
|
5203
|
-
|
|
5204
|
-
|
|
5205
|
-
|
|
5206
|
-
|
|
5207
|
-
throw error;
|
|
5208
|
-
}
|
|
5445
|
+
const wasReplicator = this.uniqueReplicators.has(peerHash);
|
|
5446
|
+
try {
|
|
5447
|
+
// Unsubscribe can race with the peer's final replication reset message.
|
|
5448
|
+
// Proactively evict its ranges so leader selection doesn't keep stale owners.
|
|
5449
|
+
await this.removeReplicator(publicKey, { noEvent: true });
|
|
5450
|
+
} catch (error) {
|
|
5451
|
+
if (!isNotStartedError(error as Error)) {
|
|
5452
|
+
throw error;
|
|
5209
5453
|
}
|
|
5454
|
+
}
|
|
5210
5455
|
|
|
5211
|
-
|
|
5212
|
-
|
|
5213
|
-
const stoppedTransition = wasReplicator;
|
|
5214
|
-
this._replicatorJoinEmitted.delete(peerHash);
|
|
5215
|
-
|
|
5216
|
-
this.cancelReplicationInfoRequests(peerHash);
|
|
5217
|
-
this.removePeerFromGidPeerHistory(peerHash);
|
|
5456
|
+
this._replicatorJoinEmitted.delete(peerHash);
|
|
5457
|
+
this.cleanupPeerDisconnectTracking(peerHash);
|
|
5218
5458
|
|
|
5219
|
-
|
|
5220
|
-
|
|
5221
|
-
|
|
5222
|
-
|
|
5223
|
-
}
|
|
5224
|
-
|
|
5459
|
+
if (wasReplicator) {
|
|
5460
|
+
this.events.dispatchEvent(
|
|
5461
|
+
new CustomEvent<ReplicatorLeaveEvent>("replicator:leave", {
|
|
5462
|
+
detail: { publicKey },
|
|
5463
|
+
}),
|
|
5464
|
+
);
|
|
5465
|
+
}
|
|
5466
|
+
return;
|
|
5467
|
+
}
|
|
5225
5468
|
|
|
5226
|
-
|
|
5227
|
-
|
|
5228
|
-
|
|
5229
|
-
this._requestIPruneResponseReplicatorSet.delete(k);
|
|
5230
|
-
}
|
|
5231
|
-
}
|
|
5469
|
+
this._replicationInfoBlockedPeers.delete(peerHash);
|
|
5470
|
+
this._replicatorLivenessFailures.delete(peerHash);
|
|
5471
|
+
this.markReplicatorActivity(peerHash);
|
|
5232
5472
|
|
|
5233
|
-
|
|
5473
|
+
const replicationSegments = await this.getMyReplicationSegments();
|
|
5474
|
+
if (replicationSegments.length > 0) {
|
|
5475
|
+
this.rpc
|
|
5476
|
+
.send(
|
|
5477
|
+
new AllReplicatingSegmentsMessage({
|
|
5478
|
+
segments: replicationSegments.map((x) => x.toReplicationRange()),
|
|
5479
|
+
}),
|
|
5480
|
+
{
|
|
5481
|
+
mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
|
|
5482
|
+
},
|
|
5483
|
+
)
|
|
5484
|
+
.catch((e) => logger.error(e.toString()));
|
|
5234
5485
|
|
|
5235
|
-
|
|
5236
|
-
|
|
5237
|
-
|
|
5238
|
-
|
|
5239
|
-
}),
|
|
5240
|
-
)
|
|
5486
|
+
if (this.v8Behaviour) {
|
|
5487
|
+
// for backwards compatibility
|
|
5488
|
+
this.rpc
|
|
5489
|
+
.send(new ResponseRoleMessage({ role: await this.getRole() }), {
|
|
5490
|
+
mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
|
|
5491
|
+
})
|
|
5492
|
+
.catch((e) => logger.error(e.toString()));
|
|
5241
5493
|
}
|
|
5242
|
-
|
|
5243
|
-
if (subscribed) {
|
|
5244
|
-
const replicationSegments = await this.getMyReplicationSegments();
|
|
5245
|
-
if (replicationSegments.length > 0) {
|
|
5246
|
-
this.rpc
|
|
5247
|
-
.send(
|
|
5248
|
-
new AllReplicatingSegmentsMessage({
|
|
5249
|
-
segments: replicationSegments.map((x) => x.toReplicationRange()),
|
|
5250
|
-
}),
|
|
5251
|
-
{
|
|
5252
|
-
mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
|
|
5253
|
-
},
|
|
5254
|
-
)
|
|
5255
|
-
.catch((e) => logger.error(e.toString()));
|
|
5256
|
-
|
|
5257
|
-
if (this.v8Behaviour) {
|
|
5258
|
-
// for backwards compatibility
|
|
5259
|
-
this.rpc
|
|
5260
|
-
.send(new ResponseRoleMessage({ role: await this.getRole() }), {
|
|
5261
|
-
mode: new AcknowledgeDelivery({ redundancy: 1, to: [publicKey] }),
|
|
5262
|
-
})
|
|
5263
|
-
.catch((e) => logger.error(e.toString()));
|
|
5264
|
-
}
|
|
5265
|
-
}
|
|
5266
|
-
|
|
5267
|
-
// Request the remote peer's replication info. This makes joins resilient to
|
|
5268
|
-
// timing-sensitive delivery/order issues where we may miss their initial
|
|
5269
|
-
// replication announcement.
|
|
5270
|
-
this.scheduleReplicationInfoRequests(publicKey);
|
|
5271
|
-
} else {
|
|
5272
|
-
await this.removeReplicator(publicKey);
|
|
5273
5494
|
}
|
|
5495
|
+
|
|
5496
|
+
// Request the remote peer's replication info. This makes joins resilient to
|
|
5497
|
+
// timing-sensitive delivery/order issues where we may miss their initial
|
|
5498
|
+
// replication announcement.
|
|
5499
|
+
this.scheduleReplicationInfoRequests(publicKey);
|
|
5274
5500
|
}
|
|
5275
5501
|
|
|
5276
5502
|
private getClampedReplicas(customValue?: MinReplicas) {
|