@peerbit/shared-log 13.1.0 → 13.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmark/join-backfill-repair.d.ts +2 -0
- package/dist/benchmark/join-backfill-repair.d.ts.map +1 -0
- package/dist/benchmark/join-backfill-repair.js +288 -0
- package/dist/benchmark/join-backfill-repair.js.map +1 -0
- package/dist/src/exchange-heads.d.ts +1 -0
- package/dist/src/exchange-heads.d.ts.map +1 -1
- package/dist/src/exchange-heads.js +2 -0
- package/dist/src/exchange-heads.js.map +1 -1
- package/dist/src/index.d.ts +38 -2
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +1011 -123
- package/dist/src/index.js.map +1 -1
- package/dist/src/pid.d.ts.map +1 -1
- package/dist/src/pid.js +40 -9
- package/dist/src/pid.js.map +1 -1
- package/dist/src/ranges.d.ts.map +1 -1
- package/dist/src/ranges.js +8 -1
- package/dist/src/ranges.js.map +1 -1
- package/dist/src/sync/simple.d.ts +7 -0
- package/dist/src/sync/simple.d.ts.map +1 -1
- package/dist/src/sync/simple.js +71 -22
- package/dist/src/sync/simple.js.map +1 -1
- package/package.json +7 -6
- package/src/exchange-heads.ts +3 -0
- package/src/index.ts +1376 -233
- package/src/pid.ts +56 -9
- package/src/ranges.ts +9 -1
- package/src/sync/simple.ts +56 -23
package/dist/src/index.js
CHANGED
@@ -54,7 +54,7 @@ import { CPUUsageIntervalLag } from "./cpu.js";
 import { debouncedAccumulatorMap, } from "./debounce.js";
 import { NoPeersError } from "./errors.js";
 const getSharedLogFanoutService = (services) => services.fanout;
-import { EntryWithRefs, ExchangeHeadsMessage, RequestIPrune, ResponseIPrune, createExchangeHeadsMessages, } from "./exchange-heads.js";
+import { EXCHANGE_HEADS_REPAIR_HINT, EntryWithRefs, ExchangeHeadsMessage, RequestIPrune, ResponseIPrune, createExchangeHeadsMessages, } from "./exchange-heads.js";
 import { FanoutEnvelope } from "./fanout-envelope.js";
 import { MAX_U32, MAX_U64, bytesToNumber, createNumbers, denormalizer, } from "./integers.js";
 import { TransportMessage } from "./message.js";
@@ -66,7 +66,7 @@ import {} from "./replication-domain.js";
 import { AbsoluteReplicas, AddedReplicationSegmentMessage, AllReplicatingSegmentsMessage, MinReplicas, ReplicationPingMessage, ReplicationError, RequestReplicationInfoMessage, ResponseRoleMessage, StoppedReplicating, decodeReplicas, encodeReplicas, maxReplicas, } from "./replication.js";
 import { Observer, Replicator } from "./role.js";
 import { RatelessIBLTSynchronizer } from "./sync/rateless-iblt.js";
-import { SimpleSyncronizer } from "./sync/simple.js";
+import { ConfirmEntriesMessage, SimpleSyncronizer } from "./sync/simple.js";
 import { groupByGid } from "./utils.js";
 const toLocalPublicSignKey = (key) => {
 if (typeof key === "string") {
@@ -263,10 +263,95 @@ const REPLICATOR_LIVENESS_PROBE_FAILURES_TO_EVICT = 2;
 // Churn/join repair can race with pruning and transient missed sync requests under
 // heavy event-loop load. Keep retries alive with a longer tail so reassigned
 // entries are retried after short bursts and slower recovery windows.
-const
+const CHURN_REPAIR_RETRY_SCHEDULE_MS = [
 0, 1_000, 3_000, 7_000, 15_000, 30_000, 45_000,
 ];
-const JOIN_WARMUP_RETRY_SCHEDULE_MS = [
+const JOIN_WARMUP_RETRY_SCHEDULE_MS = [
+0,
+1_000,
+3_000,
+7_000,
+15_000,
+30_000,
+60_000,
+];
+const JOIN_AUTHORITATIVE_RETRY_SCHEDULE_MS = [
+0,
+1_000,
+3_000,
+7_000,
+15_000,
+30_000,
+60_000,
+];
+const APPEND_BACKFILL_RETRY_SCHEDULE_MS = [0, 1_000, 3_000, 7_000];
+const JOIN_AUTHORITATIVE_REPAIR_DELAY_MS = 2_000;
+const JOIN_AUTHORITATIVE_REPAIR_SWEEP_DELAYS_MS = [
+JOIN_AUTHORITATIVE_REPAIR_DELAY_MS,
+7_000,
+15_000,
+30_000,
+];
+const APPEND_BACKFILL_DELAY_MS = 500;
+const ASSUME_SYNCED_REPAIR_SUPPRESSION_MS = 5_000;
+const REPAIR_CONFIRMATION_HASH_BATCH_SIZE = 1_024;
+const REPAIR_DISPATCH_MODES = [
+"join-warmup",
+"join-authoritative",
+"append-backfill",
+"churn",
+];
+const createRepairMetricBucket = () => ({
+dispatches: 0,
+entries: 0,
+ratelessFirstPasses: 0,
+simpleFallbackPasses: 0,
+});
+const createRepairMetrics = () => ({
+"join-warmup": createRepairMetricBucket(),
+"join-authoritative": createRepairMetricBucket(),
+"append-backfill": createRepairMetricBucket(),
+churn: createRepairMetricBucket(),
+});
+const createRepairPendingPeersByMode = () => new Map(REPAIR_DISPATCH_MODES.map((mode) => [mode, new Set()]));
+const cloneRepairPendingPeersByMode = (pending) => new Map(REPAIR_DISPATCH_MODES.map((mode) => [mode, new Set(pending.get(mode) ?? [])]));
+const createRepairFrontierByMode = () => new Map(REPAIR_DISPATCH_MODES.map((mode) => [mode, new Map()]));
+const createRepairActiveTargetsByMode = () => new Map(REPAIR_DISPATCH_MODES.map((mode) => [mode, new Set()]));
+const getRepairRetrySchedule = (mode) => {
+switch (mode) {
+case "join-warmup":
+return JOIN_WARMUP_RETRY_SCHEDULE_MS;
+case "join-authoritative":
+return JOIN_AUTHORITATIVE_RETRY_SCHEDULE_MS;
+case "append-backfill":
+return APPEND_BACKFILL_RETRY_SCHEDULE_MS;
+case "churn":
+return CHURN_REPAIR_RETRY_SCHEDULE_MS;
+}
+};
+const resolveRepairRetrySchedule = (mode, override, trackedFrontier = false) => {
+const fallback = getRepairRetrySchedule(mode);
+if (!override || override.length === 0) {
+return fallback;
+}
+if (trackedFrontier &&
+override.length === 1 &&
+override[0] === 0 &&
+fallback.length > 1) {
+// A tracked frontier with only an immediate retry would otherwise stay on
+// attempt 0 forever, which means rateless-only retries and no sparse-tail
+// simple fallback. Keep the immediate seed, then continue with the normal
+// tracked repair schedule.
+return [0, ...fallback.slice(1)];
+}
+return override;
+};
+const getRepairTransportForAttempt = (mode, attemptIndex) => {
+if (mode === "churn") {
+return "simple";
+}
+return attemptIndex === 0 ? "rateless" : "simple";
+};
 const toPositiveInteger = (value, fallback, label) => {
 if (value == null) {
 return fallback;
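The retry-schedule helpers added in this hunk are pure functions, so the `[0]`-override edge case is easy to exercise in isolation. A minimal standalone sketch (TypeScript; the fallback values are copied from the schedules above, nothing else is imported from the package):

```ts
// Sketch only — mirrors resolveRepairRetrySchedule/getRepairTransportForAttempt above.
type RepairTransport = "rateless" | "simple";

// Same values as JOIN_WARMUP_RETRY_SCHEDULE_MS in the hunk above.
const FALLBACK_MS = [0, 1_000, 3_000, 7_000, 15_000, 30_000, 60_000];

const resolveSchedule = (override: number[] | undefined, trackedFrontier: boolean): number[] => {
  if (!override || override.length === 0) return FALLBACK_MS;
  // A tracked frontier given a bare [0] override would never advance past
  // attempt 0, so the fallback tail is spliced back in behind the immediate seed.
  if (trackedFrontier && override.length === 1 && override[0] === 0 && FALLBACK_MS.length > 1) {
    return [0, ...FALLBACK_MS.slice(1)];
  }
  return override;
};

const transportFor = (mode: string, attemptIndex: number): RepairTransport =>
  mode === "churn" ? "simple" : attemptIndex === 0 ? "rateless" : "simple";

console.log(resolveSchedule([0], true));  // [0, 1000, 3000, 7000, 15000, 30000, 60000]
console.log(resolveSchedule([0], false)); // [0] — untracked modes keep the override as-is
console.log(transportFor("churn", 0));       // "simple": churn always pushes directly
console.log(transportFor("join-warmup", 1)); // "simple" after the first rateless pass
```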
@@ -375,8 +460,18 @@ let SharedLog = (() => {
 _repairRetryTimers;
 _recentRepairDispatch;
 _repairSweepRunning;
-
-
+_repairSweepPendingModes;
+_repairSweepPendingPeersByMode;
+_repairFrontierByMode;
+_repairFrontierActiveTargetsByMode;
+_repairSweepOptimisticGidPeersPending;
+_entryKnownPeers;
+_joinAuthoritativeRepairTimersByDelay;
+_joinAuthoritativeRepairPeersByDelay;
+_assumeSyncedRepairSuppressedUntil;
+_appendBackfillTimer;
+_appendBackfillPendingByTarget;
+_repairMetrics;
 _topicSubscribersCache;
 // regular distribution checks
 distributeQueue;
@@ -716,7 +811,7 @@ let SharedLog = (() => {
 }),
 });
 }
-async _appendDeliverToReplicators(entry, minReplicasValue, leaders, selfHash, isLeader, deliveryArg) {
+async _appendDeliverToReplicators(entry, coordinates, minReplicasValue, leaders, selfHash, isLeader, deliveryArg) {
 const { delivery, reliability, requireRecipients, minAcks, wrap } = this._parseDeliveryOptions(deliveryArg);
 const pending = [];
 const track = (promise) => {
@@ -725,10 +820,32 @@
 const fanoutUnicastOptions = delivery?.timeout != null || delivery?.signal != null
 ? { timeoutMs: delivery.timeout, signal: delivery.signal }
 : undefined;
+const fullReplicaDeliveryCandidates = await this.getFullReplicaRepairCandidates(undefined, {
+includeSubscribers: false,
+});
+if (minReplicasValue >= Math.max(1, fullReplicaDeliveryCandidates.size)) {
+for (const peer of fullReplicaDeliveryCandidates) {
+if (!leaders.has(peer)) {
+leaders.set(peer, { intersecting: true });
+}
+}
+}
+const entryReplicatedForRepair = this.createEntryReplicatedForRepair({
+entry,
+coordinates,
+leaders: leaders,
+replicas: minReplicasValue,
+});
 for await (const message of createExchangeHeadsMessages(this.log, [entry])) {
 await this._mergeLeadersFromGidReferences(message, minReplicasValue, leaders);
-const
-const
+const authoritativeRecipients = new Set(leaders.keys());
+const leadersForDelivery = delivery
+? new Set(authoritativeRecipients)
+: undefined;
+// Outbound append delivery only tells us who we intend to send to, not who has
+// actually stored the entry. Keep this recipient set local so later repair
+// sweeps can still backfill peers that missed the initial delivery.
+const set = new Set(leaders.keys());
 let hasRemotePeers = set.has(selfHash) ? set.size > 1 : set.size > 0;
 const allowSubscriberFallback = this.syncronizer instanceof SimpleSyncronizer ||
 (this.compatibility ?? Number.MAX_VALUE) < 10;
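The full-replica merge guard in this hunk fires only when the requested replication degree is at least the number of known candidates (self included), i.e. when every candidate must end up holding the entry anyway. A small worked example with hypothetical peer hashes:

```ts
// Hypothetical values; only the guard's arithmetic is taken from the hunk above.
const leaders = new Map<string, { intersecting: boolean }>([
  ["peerA", { intersecting: true }],
]);
const fullReplicaDeliveryCandidates = new Set(["self", "peerA", "peerB"]);
const minReplicasValue = 3; // replication degree requested by the append

// 3 >= max(1, 3): with three requested replicas and three known candidates,
// each candidate is a mandatory holder, so all of them become delivery targets.
if (minReplicasValue >= Math.max(1, fullReplicaDeliveryCandidates.size)) {
  for (const peer of fullReplicaDeliveryCandidates) {
    if (!leaders.has(peer)) {
      leaders.set(peer, { intersecting: true });
    }
  }
}
console.log([...leaders.keys()]); // ["peerA", "self", "peerB"]
// With minReplicasValue = 2 the guard would not fire and leaders stays untouched.
```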
@@ -758,6 +875,17 @@
 continue;
 }
 if (!delivery) {
+for (const peer of authoritativeRecipients) {
+if (peer === selfHash) {
+continue;
+}
+// Default live append delivery is still optimistic. If one remote misses
+// the initial heads exchange and the caller did not opt into explicit
+// delivery acks, we still need a targeted backfill source of truth for the
+// authoritative recipients or one entry can get stuck at 2/3 replicas
+// forever. Best-effort fallback subscribers are not repair-worthy.
+this.queueAppendBackfill(peer, entryReplicatedForRepair);
+}
 this.rpc
 .send(message, {
 mode: isLeader
@@ -785,12 +913,16 @@
 }
 const ackTo = [];
 let silentTo;
+const repairTargets = new Set();
 // Default delivery semantics: require enough remote ACKs to reach the requested
 // replication degree (local append counts as 1).
 const defaultMinAcks = Math.max(0, minReplicasValue - 1);
 const ackLimitRaw = reliability === "ack" ? (minAcks ?? defaultMinAcks) : 0;
 const ackLimit = Math.max(0, Math.min(Math.floor(ackLimitRaw), orderedRemoteRecipients.length));
 for (const peer of orderedRemoteRecipients) {
+if (authoritativeRecipients.has(peer)) {
+repairTargets.add(peer);
+}
 if (ackTo.length < ackLimit) {
 ackTo.push(peer);
 }
@@ -825,6 +957,12 @@
 })
 .catch((error) => logger.error(error));
 }
+for (const peer of repairTargets) {
+// Direct append delivery is intentionally optimistic. Queue one delayed,
+// batched maybe-sync pass for the intended recipients so stable 3-peer
+// append workloads do not depend on perfect first-try delivery ordering.
+this.queueAppendBackfill(peer, entryReplicatedForRepair);
+}
 }
 if (pending.length > 0) {
 await Promise.all(pending);
@@ -1376,6 +1514,7 @@
 // Keep local sync/prune state consistent even when a peer disappears
 // through replication-info updates without a topic unsubscribe event.
 this.removePeerFromGidPeerHistory(keyHash);
+this.removeRepairFrontierTarget(keyHash);
 this._recentRepairDispatch.delete(keyHash);
 if (!isMe) {
 this.syncronizer.onPeerDisconnected(keyHash);
@@ -1726,6 +1865,7 @@
 for (const key of this._gidPeersHistory.keys()) {
 this.removePeerFromGidPeerHistory(publicKeyHash, key);
 }
+this.removePeerFromEntryKnownPeers(publicKeyHash);
 }
 }
 addPeersToGidPeerHistory(gid, publicKeys, reset) {
@@ -1744,10 +1884,344 @@
 }
 return set;
 }
+markEntriesKnownByPeer(hashes, peer) {
+for (const hash of hashes) {
+let peers = this._entryKnownPeers.get(hash);
+if (!peers) {
+peers = new Set();
+this._entryKnownPeers.set(hash, peers);
+}
+peers.add(peer);
+}
+}
+removeEntriesKnownByPeer(hashes, peer) {
+for (const hash of hashes) {
+const peers = this._entryKnownPeers.get(hash);
+if (!peers) {
+continue;
+}
+peers.delete(peer);
+if (peers.size === 0) {
+this._entryKnownPeers.delete(hash);
+}
+}
+}
+removePeerFromEntryKnownPeers(peer) {
+for (const [hash, peers] of this._entryKnownPeers) {
+peers.delete(peer);
+if (peers.size === 0) {
+this._entryKnownPeers.delete(hash);
+}
+}
+}
+isEntryKnownByPeer(hash, peer) {
+return this._entryKnownPeers.get(hash)?.has(peer) === true;
+}
+markRepairSweepOptimisticPeer(gid, peer) {
+let peers = this._repairSweepOptimisticGidPeersPending.get(gid);
+if (!peers) {
+peers = new Map();
+this._repairSweepOptimisticGidPeersPending.set(gid, peers);
+}
+peers.set(peer, (peers.get(peer) || 0) + 1);
+}
+hasPendingRepairSweepOptimisticPeer(gid, peer) {
+return (this._repairSweepOptimisticGidPeersPending.get(gid)?.get(peer) || 0) > 0;
+}
+createEntryReplicatedForRepair(properties) {
+const assignedToRangeBoundary = shouldAssignToRangeBoundary(properties.leaders, properties.replicas);
+const cidObject = cidifyString(properties.entry.hash);
+const hashNumber = this.indexableDomain.numbers.bytesToNumber(cidObject.multihash.digest);
+return new this.indexableDomain.constructorEntry({
+assignedToRangeBoundary,
+coordinates: properties.coordinates,
+meta: properties.entry.meta,
+hash: properties.entry.hash,
+hashNumber,
+});
+}
+isAssumeSyncedRepairSuppressed() {
+return this._assumeSyncedRepairSuppressedUntil > Date.now();
+}
+isFrontierTrackedRepairMode(mode) {
+return mode !== "join-warmup";
+}
+async sleepTracked(delayMs) {
+if (delayMs <= 0) {
+return;
+}
+await new Promise((resolve) => {
+const timer = setTimeout(() => {
+this._repairRetryTimers.delete(timer);
+resolve();
+}, delayMs);
+timer.unref?.();
+this._repairRetryTimers.add(timer);
+});
+}
+queueRepairFrontierEntries(mode, target, entries) {
+let targets = this._repairFrontierByMode.get(mode);
+if (!targets) {
+targets = new Map();
+this._repairFrontierByMode.set(mode, targets);
+}
+let pending = targets.get(target);
+if (!pending) {
+pending = new Map();
+targets.set(target, pending);
+}
+for (const [hash, entry] of entries) {
+pending.set(hash, entry);
+}
+}
+clearRepairFrontierHashes(target, hashes) {
+const hashList = [...hashes];
+if (hashList.length === 0) {
+return;
+}
+for (const mode of REPAIR_DISPATCH_MODES) {
+const pending = this._repairFrontierByMode.get(mode)?.get(target);
+if (!pending) {
+continue;
+}
+for (const hash of hashList) {
+pending.delete(hash);
+}
+if (pending.size === 0) {
+this._repairFrontierByMode.get(mode)?.delete(target);
+}
+}
+}
+async getFullReplicaRepairCandidates(extraPeers, options) {
+const candidates = new Set([
+this.node.identity.publicKey.hashcode(),
+]);
+try {
+for (const peer of await this.getReplicators()) {
+candidates.add(peer);
+}
+}
+catch {
+for (const peer of this.uniqueReplicators) {
+candidates.add(peer);
+}
+}
+for (const peer of extraPeers ?? []) {
+candidates.add(peer);
+}
+if (options?.includeSubscribers !== false) {
+try {
+for (const subscriber of (await this._getTopicSubscribers(this.topic)) ?? []) {
+candidates.add(subscriber.hashcode());
+}
+}
+catch {
+// Best-effort only; explicit repair peers still keep the path safe.
+}
+}
+return candidates;
+}
+removeRepairFrontierTarget(target) {
+for (const mode of REPAIR_DISPATCH_MODES) {
+this._repairFrontierByMode.get(mode)?.delete(target);
+this._repairFrontierActiveTargetsByMode.get(mode)?.delete(target);
+}
+}
+async sendRepairConfirmation(target, hashes) {
+const uniqueHashes = [...new Set(hashes)];
+for (let i = 0; i < uniqueHashes.length; i += REPAIR_CONFIRMATION_HASH_BATCH_SIZE) {
+const chunk = uniqueHashes.slice(i, i + REPAIR_CONFIRMATION_HASH_BATCH_SIZE);
+await this.rpc.send(new ConfirmEntriesMessage({ hashes: chunk }), {
+priority: 1,
+mode: new SilentDelivery({ to: [target], redundancy: 1 }),
+});
+}
+}
+async pushRepairEntries(target, entries) {
+for await (const message of createExchangeHeadsMessages(this.log, [...entries.keys()])) {
+message.reserved[0] |= EXCHANGE_HEADS_REPAIR_HINT;
+await this.rpc.send(message, {
+priority: 1,
+mode: new SilentDelivery({ to: [target], redundancy: 1 }),
+});
+}
+}
+async sendRepairEntriesWithTransport(target, entries, transport, options) {
+const unknownEntries = new Map();
+const knownHashes = [];
+for (const [hash, entry] of entries) {
+if (options?.bypassKnownPeers || !this.isEntryKnownByPeer(hash, target)) {
+unknownEntries.set(hash, entry);
+}
+else {
+knownHashes.push(hash);
+}
+}
+this.clearRepairFrontierHashes(target, knownHashes);
+if (unknownEntries.size === 0) {
+return;
+}
+if (transport === "simple") {
+// Fallback repair should not depend on the target completing the
+// RequestMaybeSync -> ResponseMaybeSync round trip.
+await this.pushRepairEntries(target, unknownEntries);
+return;
+}
+await this.syncronizer.onMaybeMissingEntries({
+entries: unknownEntries,
+targets: [target],
+});
+}
+async sendMaybeMissingEntriesNow(target, entries, options) {
+if (entries.size === 0) {
+return;
+}
+const now = Date.now();
+let recentlyDispatchedByHash = this._recentRepairDispatch.get(target);
+if (!recentlyDispatchedByHash) {
+recentlyDispatchedByHash = new Map();
+this._recentRepairDispatch.set(target, recentlyDispatchedByHash);
+}
+for (const [hash, ts] of recentlyDispatchedByHash) {
+if (now - ts > RECENT_REPAIR_DISPATCH_TTL_MS) {
+recentlyDispatchedByHash.delete(hash);
+}
+}
+const filteredEntries = options.bypassRecentDedupe === true
+? new Map(entries)
+: new Map();
+if (options.bypassRecentDedupe !== true) {
+for (const [hash, entry] of entries) {
+const prev = recentlyDispatchedByHash.get(hash);
+if (prev != null && now - prev <= RECENT_REPAIR_DISPATCH_TTL_MS) {
+continue;
+}
+recentlyDispatchedByHash.set(hash, now);
+filteredEntries.set(hash, entry);
+}
+}
+else {
+for (const hash of entries.keys()) {
+recentlyDispatchedByHash.set(hash, now);
+}
+}
+if (filteredEntries.size === 0) {
+return;
+}
+const bucket = this._repairMetrics[options.mode];
+bucket.dispatches += 1;
+bucket.entries += filteredEntries.size;
+if (options.transport === "simple") {
+bucket.simpleFallbackPasses += 1;
+}
+else {
+bucket.ratelessFirstPasses += 1;
+}
+await Promise.resolve(this.sendRepairEntriesWithTransport(target, filteredEntries, options.transport, { bypassKnownPeers: options.mode === "churn" })).catch((error) => logger.error(error));
+}
+ensureRepairFrontierRunner(mode, target, retryScheduleMs) {
+const activeTargets = this._repairFrontierActiveTargetsByMode.get(mode);
+if (!activeTargets || activeTargets.has(target) || this.closed) {
+return;
+}
+activeTargets.add(target);
+const retrySchedule = resolveRepairRetrySchedule(mode, retryScheduleMs, this.isFrontierTrackedRepairMode(mode));
+const steadyStateDelay = retrySchedule.length > 1
+? Math.max(1, retrySchedule[retrySchedule.length - 1] - retrySchedule[retrySchedule.length - 2])
+: Math.max(retrySchedule[0] || 1_000, 1_000);
+void (async () => {
+let attemptIndex = 0;
+try {
+for (;;) {
+if (this.closed) {
+return;
+}
+const pending = this._repairFrontierByMode.get(mode)?.get(target);
+if (!pending || pending.size === 0) {
+return;
+}
+if ((mode === "join-warmup" || mode === "join-authoritative") &&
+this.isAssumeSyncedRepairSuppressed()) {
+await this.sleepTracked(Math.max(250, this._assumeSyncedRepairSuppressedUntil - Date.now()));
+continue;
+}
+await this.sendMaybeMissingEntriesNow(target, pending, {
+mode,
+transport: getRepairTransportForAttempt(mode, attemptIndex),
+bypassRecentDedupe: true,
+});
+const remaining = this._repairFrontierByMode.get(mode)?.get(target);
+if (!remaining || remaining.size === 0) {
+return;
+}
+const waitMs = attemptIndex + 1 < retrySchedule.length
+? Math.max(0, retrySchedule[attemptIndex + 1] - retrySchedule[attemptIndex])
+: steadyStateDelay;
+attemptIndex = Math.min(attemptIndex + 1, retrySchedule.length - 1);
+await this.sleepTracked(waitMs);
+}
+}
+finally {
+activeTargets.delete(target);
+if (!this.closed &&
+(this._repairFrontierByMode.get(mode)?.get(target)?.size || 0) > 0) {
+this.ensureRepairFrontierRunner(mode, target, retryScheduleMs);
+}
+}
+})().catch((error) => {
+activeTargets.delete(target);
+logger.error(error);
+});
+}
+flushAppendBackfill() {
+if (this._appendBackfillPendingByTarget.size === 0) {
+return;
+}
+const pending = this._appendBackfillPendingByTarget;
+this._appendBackfillPendingByTarget = new Map();
+for (const [target, entries] of pending) {
+this.dispatchMaybeMissingEntries(target, entries, {
+mode: "append-backfill",
+});
+}
+}
+queueAppendBackfill(target, entry) {
+let entries = this._appendBackfillPendingByTarget.get(target);
+if (!entries) {
+entries = new Map();
+this._appendBackfillPendingByTarget.set(target, entries);
+}
+entries.set(entry.hash, entry);
+if (entries.size >= this.repairSweepTargetBufferSize) {
+this.flushAppendBackfill();
+return;
+}
+if (this._appendBackfillTimer || this.closed) {
+return;
+}
+const timer = setTimeout(() => {
+this._repairRetryTimers.delete(timer);
+if (this._appendBackfillTimer === timer) {
+this._appendBackfillTimer = undefined;
+}
+if (this.closed) {
+return;
+}
+this.flushAppendBackfill();
+}, APPEND_BACKFILL_DELAY_MS);
+timer.unref?.();
+this._repairRetryTimers.add(timer);
+this._appendBackfillTimer = timer;
+}
 dispatchMaybeMissingEntries(target, entries, options) {
 if (entries.size === 0) {
 return;
 }
+if (this.isFrontierTrackedRepairMode(options.mode)) {
+this.queueRepairFrontierEntries(options.mode, target, entries);
+this.ensureRepairFrontierRunner(options.mode, target, options.retryScheduleMs);
+return;
+}
 const now = Date.now();
 let recentlyDispatchedByHash = this._recentRepairDispatch.get(target);
 if (!recentlyDispatchedByHash) {
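The frontier runner added above treats schedule entries as cumulative timestamps: each iteration sleeps the delta between adjacent entries, and once the schedule is exhausted it keeps retrying at a steady-state interval derived from the last two entries. A small sketch that reproduces just that wait computation (schedule values copied from JOIN_WARMUP_RETRY_SCHEDULE_MS above):

```ts
// Sketch of the wait computation in ensureRepairFrontierRunner.
const schedule = [0, 1_000, 3_000, 7_000, 15_000, 30_000, 60_000];
const steadyStateDelay = schedule.length > 1
  ? Math.max(1, schedule[schedule.length - 1] - schedule[schedule.length - 2])
  : Math.max(schedule[0] || 1_000, 1_000);

let attemptIndex = 0;
const waits: number[] = [];
for (let i = 0; i < 9; i++) {
  // Per-iteration sleep is the delta between adjacent cumulative entries;
  // past the end of the schedule the runner settles into steadyStateDelay.
  const waitMs = attemptIndex + 1 < schedule.length
    ? Math.max(0, schedule[attemptIndex + 1] - schedule[attemptIndex])
    : steadyStateDelay;
  attemptIndex = Math.min(attemptIndex + 1, schedule.length - 1);
  waits.push(waitMs);
}
console.log(waits); // [1000, 2000, 4000, 8000, 15000, 30000, 30000, 30000, 30000]
```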
@@ -1759,10 +2233,10 @@
 recentlyDispatchedByHash.delete(hash);
 }
 }
-const filteredEntries = options
+const filteredEntries = options.bypassRecentDedupe === true
 ? new Map(entries)
 : new Map();
-if (options
+if (options.bypassRecentDedupe !== true) {
 for (const [hash, entry] of entries) {
 const prev = recentlyDispatchedByHash.get(hash);
 if (prev != null && now - prev <= RECENT_REPAIR_DISPATCH_TTL_MS) {
@@ -1780,95 +2254,186 @@
 if (filteredEntries.size === 0) {
 return;
 }
-
-
-
-
-
-const
-
-
-
-
-if (
-
-
-
-
-
-}
-return Promise.resolve(this.syncronizer.onMaybeMissingEntries({
-entries: filteredEntries,
-targets: [target],
-})).catch((error) => logger.error(error));
+if ((options.mode === "join-warmup" ||
+options.mode === "join-authoritative") &&
+this.isAssumeSyncedRepairSuppressed()) {
+return;
+}
+const retrySchedule = resolveRepairRetrySchedule(options.mode, options.retryScheduleMs, this.isFrontierTrackedRepairMode(options.mode));
+const bucket = this._repairMetrics[options.mode];
+bucket.dispatches += 1;
+bucket.entries += filteredEntries.size;
+const run = (transport) => {
+if (transport === "simple") {
+bucket.simpleFallbackPasses += 1;
+}
+else {
+bucket.ratelessFirstPasses += 1;
+}
+return Promise.resolve(this.sendRepairEntriesWithTransport(target, filteredEntries, transport, { bypassKnownPeers: options.mode === "churn" })).catch((error) => logger.error(error));
 };
-
+retrySchedule.forEach((delayMs, index) => {
+const transport = getRepairTransportForAttempt(options.mode, index);
 if (delayMs === 0) {
-void run();
-
+void run(transport);
+return;
 }
 const timer = setTimeout(() => {
 this._repairRetryTimers.delete(timer);
 if (this.closed) {
 return;
 }
-void run();
+void run(transport);
 }, delayMs);
 timer.unref?.();
 this._repairRetryTimers.add(timer);
-}
+});
 }
 scheduleRepairSweep(options) {
-
-
-
-
-
+this._repairSweepPendingModes.add(options.mode);
+const pendingPeers = this._repairSweepPendingPeersByMode.get(options.mode);
+if (pendingPeers) {
+for (const peer of options.peers ?? []) {
+pendingPeers.add(peer);
+}
 }
 if (!this._repairSweepRunning && !this.closed) {
 this._repairSweepRunning = true;
 void this.runRepairSweep();
 }
 }
+scheduleJoinAuthoritativeRepair(peers) {
+if (this.closed || peers.size === 0) {
+return;
+}
+for (const delayMs of JOIN_AUTHORITATIVE_REPAIR_SWEEP_DELAYS_MS) {
+let pendingPeers = this._joinAuthoritativeRepairPeersByDelay.get(delayMs);
+if (!pendingPeers) {
+pendingPeers = new Set();
+this._joinAuthoritativeRepairPeersByDelay.set(delayMs, pendingPeers);
+}
+for (const peer of peers) {
+pendingPeers.add(peer);
+}
+if (this._joinAuthoritativeRepairTimersByDelay.has(delayMs)) {
+continue;
+}
+const timer = setTimeout(() => {
+this._repairRetryTimers.delete(timer);
+this._joinAuthoritativeRepairTimersByDelay.delete(delayMs);
+if (this.closed) {
+return;
+}
+const peersForSweep = new Set(this._joinAuthoritativeRepairPeersByDelay.get(delayMs) ?? []);
+this._joinAuthoritativeRepairPeersByDelay.delete(delayMs);
+if (peersForSweep.size === 0) {
+return;
+}
+// A joiner's leader view can still be partial on the first delayed pass
+// under pubsub jitter. Bounded per-peer rescans widen the authoritative
+// frontier without adding per-append sweeps.
+this.scheduleRepairSweep({
+mode: "join-authoritative",
+peers: peersForSweep,
+});
+}, delayMs);
+timer.unref?.();
+this._repairRetryTimers.add(timer);
+this._joinAuthoritativeRepairTimersByDelay.set(delayMs, timer);
+}
+}
 async runRepairSweep() {
 try {
 while (!this.closed) {
-const
-const
-this.
-this.
-
+const pendingModes = new Set(this._repairSweepPendingModes);
+const pendingPeersByMode = cloneRepairPendingPeersByMode(this._repairSweepPendingPeersByMode);
+this._repairSweepPendingModes.clear();
+for (const peers of this._repairSweepPendingPeersByMode.values()) {
+peers.clear();
+}
+if (pendingModes.size === 0) {
 return;
 }
-const
-const
-
+const optimisticGidPeersByMode = new Map();
+const optimisticGidPeersConsumedByMode = new Map();
+for (const mode of pendingModes) {
+const modePeers = pendingPeersByMode.get(mode);
+if (!modePeers || modePeers.size === 0) {
+continue;
+}
+const optimisticGidPeers = new Map();
+const optimisticGidPeersConsumed = new Map();
+for (const [gid, peerCounts] of this._repairSweepOptimisticGidPeersPending) {
+let matchedPeers;
+let matchedCounts;
+for (const [peer, count] of peerCounts) {
+if (!modePeers.has(peer)) {
+continue;
+}
+matchedPeers ||= new Set();
+matchedCounts ||= new Map();
+matchedPeers.add(peer);
+matchedCounts.set(peer, count);
+}
+if (matchedPeers && matchedCounts) {
+optimisticGidPeers.set(gid, matchedPeers);
+optimisticGidPeersConsumed.set(gid, matchedCounts);
+}
+}
+if (optimisticGidPeers.size > 0) {
+optimisticGidPeersByMode.set(mode, optimisticGidPeers);
+optimisticGidPeersConsumedByMode.set(mode, optimisticGidPeersConsumed);
+}
+}
+const pendingByMode = new Map(REPAIR_DISPATCH_MODES.map((mode) => [mode, new Map()]));
+const pendingRepairPeers = new Set();
+for (const peers of pendingPeersByMode.values()) {
+for (const peer of peers) {
+pendingRepairPeers.add(peer);
+}
+}
+const fullReplicaRepairCandidates = await this.getFullReplicaRepairCandidates(pendingRepairPeers, {
+includeSubscribers: false,
+});
+const fullReplicaRepairCandidateCount = Math.max(1, fullReplicaRepairCandidates.size);
+const nextFrontierByMode = new Map([
+["join-authoritative", new Map()],
+["churn", new Map()],
+]);
+const flushTarget = (mode, target) => {
+const targets = pendingByMode.get(mode);
+const entries = targets?.get(target);
 if (!entries || entries.size === 0) {
 return;
 }
-const isJoinWarmupTarget = addedPeers.has(target);
-const bypassRecentDedupe = isJoinWarmupTarget || forceFreshDelivery;
 this.dispatchMaybeMissingEntries(target, entries, {
-bypassRecentDedupe,
-
-? JOIN_WARMUP_RETRY_SCHEDULE_MS
-: undefined,
-forceFreshDelivery,
+bypassRecentDedupe: true,
+mode,
 });
-
+targets?.delete(target);
 };
-const queueEntryForTarget = (target, entry) => {
-
+const queueEntryForTarget = (mode, target, entry) => {
+const sweepTargets = nextFrontierByMode.get(mode);
+if (sweepTargets) {
+let sweepSet = sweepTargets.get(target);
+if (!sweepSet) {
+sweepSet = new Map();
+sweepTargets.set(target, sweepSet);
+}
+sweepSet.set(entry.hash, entry);
+}
+const targets = pendingByMode.get(mode);
+let set = targets.get(target);
 if (!set) {
 set = new Map();
-
+targets.set(target, set);
 }
 if (set.has(entry.hash)) {
 return;
 }
 set.set(entry.hash, entry);
 if (set.size >= this.repairSweepTargetBufferSize) {
-flushTarget(target);
+flushTarget(mode, target);
 }
 };
 const iterator = this.entryCoordinatesIndex.iterate({});
@@ -1877,20 +2442,42 @@
 const entries = await iterator.next(REPAIR_SWEEP_ENTRY_BATCH_SIZE);
 for (const entry of entries) {
 const entryReplicated = entry.value;
-const
+const gid = entryReplicated.gid;
+const knownPeers = this._gidPeersHistory.get(gid);
+const requestedReplicas = decodeReplicas(entryReplicated).getValue(this);
 const currentPeers = await this.findLeaders(entryReplicated.coordinates, entryReplicated, { roleAge: 0 });
-if (
+if (pendingModes.has("churn")) {
 for (const [currentPeer] of currentPeers) {
 if (currentPeer === this.node.identity.publicKey.hashcode()) {
 continue;
 }
-queueEntryForTarget(currentPeer, entryReplicated);
+queueEntryForTarget("churn", currentPeer, entryReplicated);
 }
 }
-
-
-
-
+for (const mode of pendingModes) {
+const modePeers = pendingPeersByMode.get(mode);
+if (!modePeers || modePeers.size === 0) {
+continue;
+}
+const optimisticPeers = optimisticGidPeersByMode.get(mode)?.get(gid);
+for (const peer of modePeers) {
+if (this.isEntryKnownByPeer(entryReplicated.hash, peer)) {
+continue;
+}
+const wasOptimisticallyAssigned = optimisticPeers?.has(peer) === true;
+const isCoveredByFullReplicaRepair = mode === "join-authoritative" &&
+fullReplicaRepairCandidates.has(peer) &&
+requestedReplicas >= fullReplicaRepairCandidateCount;
+const shouldQueue = mode === "join-authoritative"
+? currentPeers.has(peer) || isCoveredByFullReplicaRepair
+: wasOptimisticallyAssigned ||
+(currentPeers.has(peer) && !knownPeers?.has(peer));
+if (shouldQueue) {
+// Authoritative join repair must not trust partial gid peer history,
+// otherwise a late joiner can get stuck with a partial historical
+// backfill forever. Once we enter the authoritative pass, queue every
+// entry whose current leader set still includes the added peer.
+queueEntryForTarget(mode, peer, entryReplicated);
 }
 }
 }
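The queueing decision in this hunk splits by mode family. A condensed restatement of that predicate (the option names are descriptive stand-ins for the sweep-loop locals above, not package API):

```ts
type RepairMode = "join-warmup" | "join-authoritative" | "append-backfill" | "churn";

const shouldQueueForRepair = (opts: {
  mode: RepairMode;
  peerIsCurrentLeader: boolean;        // currentPeers.has(peer)
  coveredByFullReplicaRepair: boolean; // small-network join-authoritative case
  wasOptimisticallyAssigned: boolean;  // optimisticPeers?.has(peer)
  peerAlreadyInGidHistory: boolean;    // knownPeers?.has(peer)
}): boolean =>
  opts.mode === "join-authoritative"
    ? opts.peerIsCurrentLeader || opts.coveredByFullReplicaRepair
    : opts.wasOptimisticallyAssigned ||
      (opts.peerIsCurrentLeader && !opts.peerAlreadyInGidHistory);

// A joining peer that is a current leader gets queued even when gid history
// already lists it — history can be partial, which is exactly the stuck
// historical-backfill case the comment above warns about.
console.log(shouldQueueForRepair({
  mode: "join-authoritative",
  peerIsCurrentLeader: true,
  coveredByFullReplicaRepair: false,
  wasOptimisticallyAssigned: false,
  peerAlreadyInGidHistory: true,
})); // true
```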
@@ -1900,8 +2487,64 @@
 finally {
 await iterator.close();
 }
-for (const
-
+for (const [, optimisticGidPeersConsumed] of optimisticGidPeersConsumedByMode) {
+for (const [gid, peerCounts] of optimisticGidPeersConsumed) {
+const pendingPeerCounts = this._repairSweepOptimisticGidPeersPending.get(gid);
+if (!pendingPeerCounts) {
+continue;
+}
+for (const [peer, count] of peerCounts) {
+const current = pendingPeerCounts.get(peer) || 0;
+const next = current - count;
+if (next > 0) {
+pendingPeerCounts.set(peer, next);
+}
+else {
+pendingPeerCounts.delete(peer);
+}
+}
+if (pendingPeerCounts.size === 0) {
+this._repairSweepOptimisticGidPeersPending.delete(gid);
+}
+}
+}
+for (const mode of pendingModes) {
+if (mode !== "join-authoritative" && mode !== "churn") {
+continue;
+}
+const nextTargets = nextFrontierByMode.get(mode) ?? new Map();
+const frontierTargets = this._repairFrontierByMode.get(mode);
+for (const target of pendingPeersByMode.get(mode) ?? []) {
+const replacement = nextTargets.get(target);
+if (mode === "join-authoritative") {
+// Authoritative join repair is receipt-driven: a later sweep can have a
+// narrower transient leader view, but it must not forget unconfirmed
+// hashes that were already queued for this joiner.
+if (replacement && replacement.size > 0) {
+const existing = frontierTargets?.get(target);
+if (existing && existing.size > 0) {
+for (const [hash, entry] of replacement) {
+existing.set(hash, entry);
+}
+}
+else {
+frontierTargets?.set(target, replacement);
+}
+}
+continue;
+}
+if (replacement && replacement.size > 0) {
+frontierTargets?.set(target, replacement);
+}
+else {
+frontierTargets?.delete(target);
+}
+}
+}
+for (const [mode, targets] of pendingByMode) {
+for (const target of [...targets.keys()]) {
+flushTarget(mode, target);
+}
 }
 }
 }
@@ -1912,17 +2555,78 @@
 }
 finally {
 this._repairSweepRunning = false;
-if (!this.closed &&
-(this._repairSweepForceFreshPending ||
-this._repairSweepAddedPeersPending.size > 0)) {
+if (!this.closed && this._repairSweepPendingModes.size > 0) {
 this._repairSweepRunning = true;
 void this.runRepairSweep();
 }
 }
 }
 async pruneDebouncedFnAddIfNotKeeping(args) {
-if (
-return
+if (this.keep && (await this.keep(args.value.entry))) {
+return false;
+}
+void this.pruneDebouncedFn.add(args);
+return true;
+}
+async pruneJoinedEntriesNoLongerLed(entries) {
+const selfHash = this.node.identity.publicKey.hashcode();
+for (const entry of entries) {
+if (this.closed || this._pendingDeletes.has(entry.hash)) {
+continue;
+}
+const leaders = await this.findLeadersFromEntry(entry, decodeReplicas(entry).getValue(this), { roleAge: 0 });
+if (leaders.has(selfHash)) {
+this.pruneDebouncedFn.delete(entry.hash);
+continue;
+}
+if (leaders.size === 0) {
+continue;
+}
+await this.pruneDebouncedFnAddIfNotKeeping({
+key: entry.hash,
+value: { entry, leaders },
+});
+this.responseToPruneDebouncedFn.delete(entry.hash);
+}
+}
+async pruneIndexedEntriesNoLongerLed() {
+const selfHash = this.node.identity.publicKey.hashcode();
+const iterator = this.entryCoordinatesIndex.iterate({});
+let enqueuedPrune = false;
+try {
+while (!this.closed && !iterator.done()) {
+const entries = await iterator.next(REPAIR_SWEEP_ENTRY_BATCH_SIZE);
+for (const entry of entries) {
+const entryReplicated = entry.value;
+if (this.closed || this._pendingDeletes.has(entryReplicated.hash)) {
+continue;
+}
+const leaders = await this.findLeaders(entryReplicated.coordinates, entryReplicated, { roleAge: 0 });
+if (leaders.has(selfHash)) {
+this.pruneDebouncedFn.delete(entryReplicated.hash);
+await this._pendingDeletes
+.get(entryReplicated.hash)
+?.reject(new Error("Failed to delete, is leader again"));
+this.removePruneRequestSent(entryReplicated.hash);
+continue;
+}
+if (leaders.size === 0) {
+continue;
+}
+enqueuedPrune =
+(await this.pruneDebouncedFnAddIfNotKeeping({
+key: entryReplicated.hash,
+value: { entry: entryReplicated, leaders },
+})) || enqueuedPrune;
+this.responseToPruneDebouncedFn.delete(entryReplicated.hash);
+}
+}
+}
+finally {
+await iterator.close();
+}
+if (enqueuedPrune && !this.closed) {
+await this.pruneDebouncedFn.flush();
 }
 }
 clearCheckedPruneRetry(hash) {
@@ -2065,16 +2769,17 @@
 await this._appendDeliverToAllFanout(result.entry);
 }
 else {
-await this._appendDeliverToReplicators(result.entry, minReplicasValue, leaders, selfHash, isLeader, deliveryArg);
+await this._appendDeliverToReplicators(result.entry, coordinates, minReplicasValue, leaders, selfHash, isLeader, deliveryArg);
 }
 }
-
+const delayAdaptiveRebalance = this.shouldDelayAdaptiveRebalance();
+if (!isLeader && !delayAdaptiveRebalance) {
 this.pruneDebouncedFnAddIfNotKeeping({
 key: result.entry.hash,
 value: { entry: result.entry, leaders },
 });
 }
-if (!
+if (!delayAdaptiveRebalance) {
 this.rebalanceParticipationDebounced?.call();
 }
 return result;
@@ -2108,8 +2813,18 @@
 this._repairRetryTimers = new Set();
 this._recentRepairDispatch = new Map();
 this._repairSweepRunning = false;
-this.
-this.
+this._repairSweepPendingModes = new Set();
+this._repairSweepPendingPeersByMode = createRepairPendingPeersByMode();
+this._repairFrontierByMode = createRepairFrontierByMode();
+this._repairFrontierActiveTargetsByMode = createRepairActiveTargetsByMode();
+this._repairSweepOptimisticGidPeersPending = new Map();
+this._entryKnownPeers = new Map();
+this._joinAuthoritativeRepairTimersByDelay = new Map();
+this._joinAuthoritativeRepairPeersByDelay = new Map();
+this._assumeSyncedRepairSuppressedUntil = 0;
+this._appendBackfillTimer = undefined;
+this._appendBackfillPendingByTarget = new Map();
+this._repairMetrics = createRepairMetrics();
 this._topicSubscribersCache = new Map();
 this.coordinateToHash = new Cache({ max: 1e6, ttl: 1e4 });
 this.recentlyRebalanced = new Cache({ max: 1e4, ttl: 1e5 });
@@ -2167,7 +2882,10 @@
 this.keep = options?.keep;
 this.pendingMaturity = new Map();
 const id = sha256Base64Sync(this.log.id);
-const storage = await
+const [storage, logScope] = await Promise.all([
+this.node.storage.sublevel(id),
+this.node.indexer.scope(id),
+]);
 const localBlocks = await new AnyBlockStore(await storage.sublevel("blocks"));
 const fanoutService = getSharedLogFanoutService(this.node.services);
 const blockProviderNamespace = (cid) => `cid:${cid}`;
@@ -2223,16 +2941,18 @@
 }
 },
 });
-
-const
-
+const remoteBlocksStartPromise = this.remoteBlocks.start();
+const [replicationIndex, logIndex] = await Promise.all([
+logScope.scope("replication"),
+logScope.scope("log"),
+]);
 this._replicationRangeIndex = await replicationIndex.init({
 schema: this.indexableDomain.constructorRange,
 });
 this._entryCoordinatesIndex = await replicationIndex.init({
 schema: this.indexableDomain.constructorEntry,
 });
-
+await remoteBlocksStartPromise;
 const hasIndexedReplicationInfo = (await this.replicationIndex.count({
 query: [
 new StringMatch({
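This hunk (and the open/afterOpen hunks below) trade serial awaits for overlapped startup work: an independent task is kicked off, other awaits run, and the saved promise is joined only when its result is actually needed. A generic sketch of the pattern, with hypothetical callbacks standing in for the real stores:

```ts
// Sketch only — illustrates the startup-overlap pattern, not the package's API.
async function openStores(
  startRemoteBlocks: () => Promise<void>,
  openScope: (name: string) => Promise<unknown>,
) {
  const remoteBlocksStartPromise = startRemoteBlocks(); // runs concurrently
  const [replicationIndex, logIndex] = await Promise.all([
    openScope("replication"),
    openScope("log"),
  ]);
  await remoteBlocksStartPromise; // join before anything depends on remote blocks
  return { replicationIndex, logIndex };
}
```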
@@ -2360,27 +3080,33 @@
 }
 }
 // Open for communcation
-await this.rpc.open({
-queryType: TransportMessage,
-responseType: TransportMessage,
-responseHandler: (query, context) => this.onMessage(query, context),
-topic: this.topic,
-});
 this._onSubscriptionFn =
 this._onSubscriptionFn || this._onSubscription.bind(this);
-await this.node.services.pubsub.addEventListener("subscribe", this._onSubscriptionFn);
 this._onUnsubscriptionFn =
 this._onUnsubscriptionFn || this._onUnsubscription.bind(this);
-await
-
-
-
-
+await Promise.all([
+this.rpc.open({
+queryType: TransportMessage,
+responseType: TransportMessage,
+responseHandler: (query, context) => this.onMessage(query, context),
+topic: this.topic,
+}),
+this.node.services.pubsub.addEventListener("subscribe", this._onSubscriptionFn),
+this.node.services.pubsub.addEventListener("unsubscribe", this._onUnsubscriptionFn),
+]);
+const fanoutOpenPromise = this._openFanoutChannel(options?.fanout);
+// Mark previously-owned replication ranges as "new" only when they already exist.
+// Fresh opens have nothing to touch here, so skip the extra scan/write entirely.
+const updateOwnedReplicationPromise = hasIndexedReplicationInfo
+? this.updateTimestampOfOwnedReplicationRanges()
+: Promise.resolve();
+await Promise.all([fanoutOpenPromise, updateOwnedReplicationPromise]);
 // if we had a previous session with replication info, and new replication info dictates that we unreplicate
 // we should do that. Otherwise if options is a unreplication we dont need to do anything because
 // we are already unreplicated (as we are just opening)
-
-const canResumeReplication =
+const isUnreplicationOptionsDefined = isUnreplicationOptions(options?.replicate);
+const canResumeReplication = hasIndexedReplicationInfo &&
+(await isReplicationOptionsDependentOnPreviousState(options?.replicate, this.replicationIndex, this.node.identity.publicKey));
 if (hasIndexedReplicationInfo && isUnreplicationOptionsDefined) {
 await this.replicate(options?.replicate, { checkDuplicates: true });
 }
@@ -2423,6 +3149,7 @@
 }
 async afterOpen() {
 await super.afterOpen();
+const existingSubscribersPromise = this._getTopicSubscribers(this.topic);
 // We do this here, because these calls requires this.closed == false
 void this.pruneOfflineReplicators()
 .then(() => {
@@ -2437,7 +3164,7 @@
 this.startReplicatorLivenessSweep();
 await this.rebalanceParticipation();
 // Take into account existing subscription
-(await
+(await existingSubscribersPromise)?.forEach((v) => {
 if (v.equals(this.node.identity.publicKey)) {
 return;
 }
@@ -2952,8 +3679,28 @@
 this._repairRetryTimers.clear();
 this._recentRepairDispatch.clear();
 this._repairSweepRunning = false;
-this.
-this.
+this._repairSweepPendingModes.clear();
+for (const peers of this._repairSweepPendingPeersByMode.values()) {
+peers.clear();
+}
+this._repairSweepOptimisticGidPeersPending.clear();
+this._entryKnownPeers.clear();
+for (const timer of this._joinAuthoritativeRepairTimersByDelay.values()) {
+clearTimeout(timer);
+}
+this._joinAuthoritativeRepairTimersByDelay.clear();
+this._joinAuthoritativeRepairPeersByDelay.clear();
+for (const targets of this._repairFrontierByMode.values()) {
+targets.clear();
+}
+for (const targets of this._repairFrontierActiveTargetsByMode.values()) {
+targets.clear();
+}
+if (this._appendBackfillTimer) {
+clearTimeout(this._appendBackfillTimer);
+this._appendBackfillTimer = undefined;
+}
+this._appendBackfillPendingByTarget.clear();
 for (const [_k, v] of this._pendingDeletes) {
 v.clear();
 v.promise.resolve(); // TODO or reject?
@@ -3111,9 +3858,11 @@
 * I can use them to load associated logs and join/sync them with the data stores I own
 */
 const { heads } = msg;
+const isRepairHint = (msg.reserved[0] & EXCHANGE_HEADS_REPAIR_HINT) !== 0;
 logger.trace(`${this.node.identity.publicKey.hashcode()}: Recieved heads: ${heads.length === 1 ? heads[0].entry.hash : "#" + heads.length}, logId: ${this.log.idString}`);
 if (heads) {
 const filteredHeads = [];
+const confirmedHashes = new Set();
 for (const head of heads) {
 if (!(await this.log.has(head.entry.hash))) {
 head.entry.init({
@@ -3123,8 +3872,18 @@
 });
 filteredHeads.push(head);
 }
+else {
+confirmedHashes.add(head.entry.hash);
+}
+}
+const fromIsSelf = context.from.equals(this.node.identity.publicKey);
+if (!fromIsSelf) {
+this.markEntriesKnownByPeer(heads.map((head) => head.entry.hash), context.from.hashcode());
 }
 if (filteredHeads.length === 0) {
+if (confirmedHashes.size > 0 && !fromIsSelf) {
+await this.sendRepairConfirmation(context.from, confirmedHashes);
+}
 return;
 }
 const groupedByGid = await groupByGid(filteredHeads);
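For context on the confirmation path added above: heads the receiver already holds are collected into `confirmedHashes` and acknowledged back to the sender via `sendRepairConfirmation`, which de-duplicates and batches hashes (1,024 per message, per REPAIR_CONFIRMATION_HASH_BATCH_SIZE in the earlier hunk) so the sender can drop them from its repair frontier. A minimal sketch of that chunking, with a hypothetical `send` callback standing in for the rpc call:

```ts
// Sketch only — mirrors the batching in sendRepairConfirmation above.
const BATCH = 1_024; // REPAIR_CONFIRMATION_HASH_BATCH_SIZE

async function confirm(
  hashes: Iterable<string>,
  send: (chunk: string[]) => Promise<void>,
): Promise<void> {
  const unique = [...new Set(hashes)]; // never confirm the same hash twice per pass
  for (let i = 0; i < unique.length; i += BATCH) {
    await send(unique.slice(i, i + BATCH)); // fixed-size batches bound message size
  }
}

// Usage: 2_500 confirmed heads produce three messages (1024 + 1024 + 452).
void confirm(Array.from({ length: 2_500 }, (_, i) => `hash-${i}`), async (chunk) => {
  console.log(chunk.length);
});
```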
@@ -3186,8 +3945,15 @@ let SharedLog = (() => {
 }
 let maybeDelete;
 let toMerge = [];
+let toPersist = [];
 let toDelete;
-
+// Targeted repair is sent only to peers the sender currently believes
+// should store the entry. Accept it while local membership catches up;
+// the normal checked-prune path below can still remove it if this peer
+// truly no longer owns the entry.
+const acceptsTargetedRepair = isRepairHint && fromIsLeader;
+const keepAsLeader = isLeader || acceptsTargetedRepair;
+if (keepAsLeader) {
     for (const entry of entries) {
         this.pruneDebouncedFn.delete(entry.entry.hash);
         this.removePruneRequestSent(entry.entry.hash);
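The hunk above distills to a two-input acceptance rule. A sketch under assumed names (none of these are the package API):

```ts
// isLeader: this node currently believes it leads the entry.
// fromIsLeader: the sender leads the entry.
// isRepairHint: the message was flagged as targeted repair.
const keepAsLeader = (
    isLeader: boolean,
    fromIsLeader: boolean,
    isRepairHint: boolean,
): boolean => isLeader || (isRepairHint && fromIsLeader);

// A node with a stale membership view still accepts targeted repair from a
// leader; the checked prune path removes the entry later if ownership
// really moved away.
console.log(keepAsLeader(false, true, true)); // true
console.log(keepAsLeader(false, true, false)); // false
```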
@@ -3203,8 +3969,9 @@ let SharedLog = (() => {
     }
 }
 outer: for (const entry of entries) {
-    if (
+    if (keepAsLeader || (await this.keep?.(entry.entry))) {
         toMerge.push(entry.entry);
+        toPersist.push(entry.entry);
     }
     else {
         for (const ref of entry.gidRefrences) {
@@ -3222,7 +3989,18 @@ let SharedLog = (() => {
     return;
 }
 if (toMerge.length > 0) {
+    this.markEntriesKnownByPeer(toMerge.map((entry) => entry.hash), context.from.hashcode());
     await this.log.join(toMerge);
+    // Network joins bypass SharedLog.join(), but churn repair scans
+    // the coordinate index to redistribute entries after membership changes.
+    for (const entry of toPersist) {
+        const replicas = decodeReplicas(entry).getValue(this);
+        await this.findLeaders(await this.createCoordinates(entry, replicas), entry, { roleAge: 0, persist: {} });
+    }
+    for (const merged of toMerge) {
+        confirmedHashes.add(merged.hash);
+    }
+    await this.pruneJoinedEntriesNoLongerLed(toMerge);
     toDelete?.map((x) =>
     // TODO types
     this.pruneDebouncedFnAddIfNotKeeping({
@@ -3261,6 +4039,10 @@ let SharedLog = (() => {
     promises.push(fn()); // we do this concurrently since waitForIsLeader might be a blocking operation for some entries
 }
 await Promise.all(promises);
+if (confirmedHashes.size > 0 && !context.from.equals(this.node.identity.publicKey)) {
+    this.markEntriesKnownByPeer(confirmedHashes, context.from.hashcode());
+    await this.sendRepairConfirmation(context.from, confirmedHashes);
+}
 }
 }
 else if (msg instanceof RequestIPrune) {
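Both the merge path and the everything-already-stored path end the same way: mark the hashes as known to the sender and reply with a confirmation. A sketch of the hash-to-peers bookkeeping this implies; the class and method names are hypothetical, not the internal structure:

```ts
// Once a peer confirms (or sends us) an entry, repair stops re-dispatching
// that hash to that peer.
class EntryKnownPeers {
    private byHash = new Map<string, Set<string>>();

    mark(hashes: Iterable<string>, peer: string): void {
        for (const hash of hashes) {
            let peers = this.byHash.get(hash);
            if (!peers) this.byHash.set(hash, (peers = new Set()));
            peers.add(peer);
        }
    }

    // A RequestIPrune from a peer means it no longer intends to hold the entry.
    unmark(hashes: Iterable<string>, peer: string): void {
        for (const hash of hashes) this.byHash.get(hash)?.delete(peer);
    }

    knows(hash: string, peer: string): boolean {
        return this.byHash.get(hash)?.has(peer) ?? false;
    }
}
```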
@@ -3268,6 +4050,7 @@ let SharedLog = (() => {
 const from = context.from.hashcode();
 for (const hash of msg.hashes) {
     this.removePruneRequestSent(hash, from);
+    this.removeEntriesKnownByPeer([hash], from);
     // if we expect the remote to be owner of this entry because we are to prune ourselves, then we need to remove the remote
     // this is due to that the remote has previously indicated to be a replicator to help us prune but now has changed their mind
     const outGoingPrunes = this._requestIPruneResponseReplicatorSet.get(hash);
@@ -3276,7 +4059,9 @@ let SharedLog = (() => {
 }
 const indexedEntry = await this.log.entryIndex.getShallow(hash);
 let isLeader = false;
-if (indexedEntry
+if (indexedEntry &&
+    !this._pendingDeletes.has(hash) &&
+    (await this.log.blocks.has(hash))) {
     this.removePeerFromGidPeerHistory(context.from.hashcode(), indexedEntry.value.meta.gid);
     await this._waitForReplicators(await this.createCoordinates(indexedEntry.value, decodeReplicas(indexedEntry.value).getValue(this)), indexedEntry.value, [
     {
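The widened condition above only vouches for an entry when it is indexed, not already queued for deletion, and its payload block is actually present, so the requester never prunes against a phantom copy. A compressed sketch of that triple check, with the store shapes assumed for illustration:

```ts
// All three checks must pass before this node counts as a holder of the
// entry during a prune negotiation.
const canVouchForEntry = async (
    hash: string,
    entryIndex: { getShallow: (h: string) => Promise<unknown | undefined> },
    pendingDeletes: Map<string, unknown>,
    blocks: { has: (h: string) => Promise<boolean> },
): Promise<boolean> =>
    (await entryIndex.getShallow(hash)) !== undefined &&
    !pendingDeletes.has(hash) &&
    (await blocks.has(hash));
```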
@@ -3355,6 +4140,11 @@ let SharedLog = (() => {
     this._pendingDeletes.get(hash)?.resolve(context.from.hashcode());
 }
 }
+else if (msg instanceof ConfirmEntriesMessage) {
+    this.markEntriesKnownByPeer(msg.hashes, context.from.hashcode());
+    this.clearRepairFrontierHashes(context.from.hashcode(), msg.hashes);
+    return;
+}
 else if (await this.syncronizer.onMessage(msg, context)) {
     return; // the syncronizer has handled the message
 }
@@ -3679,6 +4469,11 @@ let SharedLog = (() => {
 if (options?.replicate) {
     let messageToSend = undefined;
     if (assumeSynced) {
+        // `assumeSynced` is an explicit contract that this join should trust the
+        // supplied history and avoid initiating outbound repair while the local
+        // replication ranges settle.
+        this._assumeSyncedRepairSuppressedUntil =
+            Date.now() + ASSUME_SYNCED_REPAIR_SUPPRESSION_MS;
         for (const entry of entriesToReplicate) {
             await seedAssumeSyncedPeerHistory(entry);
         }
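The suppression field above is a plain deadline timestamp rather than a boolean. A sketch of the gate, with the constant's value invented for illustration:

```ts
// Assumed value; the package defines its own ASSUME_SYNCED_REPAIR_SUPPRESSION_MS.
const ASSUME_SYNCED_REPAIR_SUPPRESSION_MS = 10_000;

let assumeSyncedRepairSuppressedUntil = 0;

const onAssumeSyncedJoin = (): void => {
    assumeSyncedRepairSuppressedUntil =
        Date.now() + ASSUME_SYNCED_REPAIR_SUPPRESSION_MS;
};

// Outbound repair consults the deadline instead of a flag, so suppression
// expires on its own with no timer to clean up at close().
const mayDispatchRepair = (): boolean =>
    Date.now() >= assumeSyncedRepairSuppressedUntil;
```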
@@ -3747,9 +4542,14 @@ let SharedLog = (() => {
 clear();
 // `waitForReplicator()` is typically used as a precondition before join/replicate
 // flows. A replicator can become mature and enqueue a debounced rebalance
-// (`replicationChangeDebounceFn`) slightly later.
-//
-
+// (`replicationChangeDebounceFn`) slightly later. Kick the flush, but do not
+// make membership waits depend on all rebalance work finishing; callers that
+// need settled distribution already wait for that explicitly.
+this.replicationChangeDebounceFn?.flush?.().catch((error) => {
+    if (!isNotStartedError(error)) {
+        logger.error(error?.toString?.() ?? String(error));
+    }
+});
 deferred.resolve();
 };
 const reject = (error) => {
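The fire-and-forget flush above is a general pattern: kick pending debounced work, swallow only the expected shutdown error, and never await from the membership path. A sketch with `isNotStartedError` stubbed in as an assumption:

```ts
type DebouncedFn = { flush?: () => Promise<void> };

// Assumed predicate; the package ships its own not-started detection.
const isNotStartedError = (e: unknown): boolean =>
    e instanceof Error && /not started/i.test(e.message);

const kickFlush = (
    fn: DebouncedFn | undefined,
    logError: (s: string) => void,
): void => {
    // Intentionally not awaited: the caller resolves immediately and the
    // rebalance completes in the background.
    fn?.flush?.().catch((error) => {
        if (!isNotStartedError(error)) {
            logError(error?.toString?.() ?? String(error));
        }
    });
};
```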
@@ -4141,11 +4941,51 @@ let SharedLog = (() => {
         }
     }
 }
+if (!options?.candidates) {
+    const fullReplicaLeaders = await this.findFullReplicaLeaders(cursors.length, roleAge, peerFilter);
+    if (fullReplicaLeaders) {
+        return fullReplicaLeaders;
+    }
+}
 return getSamples(cursors, this.replicationIndex, roleAge, this.indexableDomain.numbers, {
     peerFilter,
     uniqueReplicators: peerFilter,
 });
 }
+async findFullReplicaLeaders(replicas, roleAge, peerFilter) {
+    const now = Date.now();
+    const leaders = new Map();
+    const includeStrict = this._logProperties?.strictFullReplicaFallback !== false;
+    const iterator = this.replicationIndex.iterate({}, { shape: { hash: true, timestamp: true, mode: true } });
+    try {
+        for (;;) {
+            const batch = await iterator.next(64);
+            if (batch.length === 0) {
+                break;
+            }
+            for (const result of batch) {
+                const range = result.value;
+                if (peerFilter && !peerFilter.has(range.hash)) {
+                    continue;
+                }
+                if (!isMatured(range, now, roleAge)) {
+                    continue;
+                }
+                if (range.mode === ReplicationIntent.Strict && !includeStrict) {
+                    continue;
+                }
+                leaders.set(range.hash, { intersecting: true });
+                if (leaders.size > replicas) {
+                    return undefined;
+                }
+            }
+        }
+    }
+    finally {
+        await iterator.close();
+    }
+    return leaders.size > 0 ? leaders : undefined;
+}
 async findLeadersFromEntry(entry, replicas, options) {
     const coordinates = await this.createCoordinates(entry, replicas);
     const result = await this._findLeaders(coordinates, options);
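The new `findFullReplicaLeaders` fast path rests on one observation: if the number of matured candidate ranges never exceeds the replica target, every candidate must be a leader, so per-cursor sampling can be skipped entirely. A simplified sketch of that early-exit scan over plain arrays rather than the package's replication index:

```ts
// Illustrative shape; the real code iterates an index in batches.
type Range = { hash: string; maturedAt: number };

const findFullReplicaLeadersSketch = (
    ranges: Range[],
    replicas: number,
    now: number,
    roleAge: number,
): Map<string, { intersecting: boolean }> | undefined => {
    const leaders = new Map<string, { intersecting: boolean }>();
    for (const range of ranges) {
        if (now - range.maturedAt < roleAge) continue; // not matured yet
        leaders.set(range.hash, { intersecting: true });
        if (leaders.size > replicas) return undefined; // too many: fall back to sampling
    }
    return leaders.size > 0 ? leaders : undefined;
};
```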
@@ -4613,13 +5453,25 @@ let SharedLog = (() => {
 }
 const changed = false;
 const addedPeers = new Set();
+const authoritativeRepairPeers = new Set();
 const warmupPeers = new Set();
+const churnRepairPeers = new Set();
 const hasSelfWarmupChange = changes.some((change) => change.range.hash === selfHash &&
     (change.type === "added" || change.type === "replaced"));
+const hasSelfRangeRemoval = changes.some((change) => change.range.hash === selfHash &&
+    (change.type === "removed" || change.type === "replaced"));
 for (const change of changes) {
+    if (change.range.hash !== selfHash &&
+        (change.type === "removed" || change.type === "replaced")) {
+        this.removePeerFromEntryKnownPeers(change.range.hash);
+    }
     if (change.type === "added" || change.type === "replaced") {
         const hash = change.range.hash;
         if (hash !== selfHash) {
+            // Existing peers can widen/shift ranges after the initial join. If we
+            // only rescan on first-seen "added", late authoritative range updates can
+            // leave historical backfill permanently partial under load.
+            authoritativeRepairPeers.add(hash);
             // Range updates can reassign entries to an existing peer shortly after it
             // already received a subset. Avoid suppressing legitimate follow-up repair.
             this._recentRepairDispatch.delete(hash);
@@ -4651,17 +5503,24 @@ let SharedLog = (() => {
     return;
 }
 const isWarmupTarget = warmupPeers.has(target);
-const
+const mode = forceFreshDelivery
+    ? "churn"
+    : isWarmupTarget
+        ? "join-warmup"
+        : "join-authoritative";
 this.dispatchMaybeMissingEntries(target, entries, {
-    bypassRecentDedupe,
-
+    bypassRecentDedupe: isWarmupTarget || forceFreshDelivery,
+    mode,
+    retryScheduleMs: mode === "join-warmup"
         ? JOIN_WARMUP_RETRY_SCHEDULE_MS
-        :
-
+        : mode === "join-authoritative"
+            ? [0]
+            : undefined,
 });
 uncheckedDeliver.delete(target);
 };
 const queueUncheckedDeliver = (target, entry) => {
+    churnRepairPeers.add(target);
 let set = uncheckedDeliver.get(target);
 if (!set) {
     set = new Map();
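The dispatch mode and its retry schedule are derived together in the hunk above. A sketch of the mapping, with the warmup schedule values invented for illustration:

```ts
type RepairMode = "churn" | "join-warmup" | "join-authoritative";

// Assumed values; the package defines its own JOIN_WARMUP_RETRY_SCHEDULE_MS.
const JOIN_WARMUP_RETRY_SCHEDULE_MS = [0, 250, 1000];

const selectMode = (
    forceFreshDelivery: boolean,
    isWarmupTarget: boolean,
): RepairMode =>
    forceFreshDelivery ? "churn" : isWarmupTarget ? "join-warmup" : "join-authoritative";

const retryScheduleFor = (mode: RepairMode): number[] | undefined =>
    mode === "join-warmup"
        ? JOIN_WARMUP_RETRY_SCHEDULE_MS // longer tail for warming peers
        : mode === "join-authoritative"
            ? [0] // single immediate pass; the sweep machinery handles follow-ups
            : undefined; // churn uses the default schedule
```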
@@ -4715,7 +5574,14 @@ let SharedLog = (() => {
         }
     }
 }
-
+for (const [peer] of currentPeers) {
+    if (warmupPeers.has(peer)) {
+        this.markRepairSweepOptimisticPeer(entryReplicated.gid, peer);
+    }
+}
+const authoritativePeers = [...currentPeers.keys()].filter((peer) => !warmupPeers.has(peer) &&
+    !this.hasPendingRepairSweepOptimisticPeer(entryReplicated.gid, peer));
+this.addPeersToGidPeerHistory(entryReplicated.gid, authoritativePeers, true);
 if (!currentPeers.has(selfHash)) {
     this.pruneDebouncedFnAddIfNotKeeping({
         key: entryReplicated.hash,
@@ -4763,7 +5629,14 @@ let SharedLog = (() => {
         }
     }
 }
-
+for (const [peer] of currentPeers) {
+    if (addedPeers.has(peer)) {
+        this.markRepairSweepOptimisticPeer(entryReplicated.gid, peer);
+    }
+}
+const authoritativePeers = [...currentPeers.keys()].filter((peer) => !addedPeers.has(peer) &&
+    !this.hasPendingRepairSweepOptimisticPeer(entryReplicated.gid, peer));
+this.addPeersToGidPeerHistory(entryReplicated.gid, authoritativePeers, true);
 if (!isLeader) {
     this.pruneDebouncedFnAddIfNotKeeping({
         key: entryReplicated.hash,
@@ -4780,9 +5653,17 @@ let SharedLog = (() => {
     }
 }
 }
+if (this._isAdaptiveReplicating && hasSelfRangeRemoval) {
+    await this.pruneIndexedEntriesNoLongerLed();
+}
 if (forceFreshDelivery) {
-    //
-
+    // Pure leave/shrink churn can have zero `addedPeers`, but the peers that
+    // received redistributed entries still need a follow-up repair pass if the
+    // immediate maybe-sync misses one entry.
+    this.scheduleRepairSweep({
+        mode: "churn",
+        peers: churnRepairPeers,
+    });
 }
 else if (useJoinWarmupFastPath) {
     // Pure join warmup uses the cheap immediate maybe-missing dispatch above,
@@ -4795,19 +5676,22 @@ let SharedLog = (() => {
     return;
 }
 this.scheduleRepairSweep({
-
-
+    mode: "join-warmup",
+    peers,
 });
 }, 250);
 timer.unref?.();
 this._repairRetryTimers.add(timer);
 }
-else if (
+else if (authoritativeRepairPeers.size > 0) {
 this.scheduleRepairSweep({
-
-
+    mode: "join-authoritative",
+    peers: authoritativeRepairPeers,
 });
 }
+if (!forceFreshDelivery && authoritativeRepairPeers.size > 0) {
+    this.scheduleJoinAuthoritativeRepair(authoritativeRepairPeers);
+}
 for (const target of [...uncheckedDeliver.keys()]) {
     flushUncheckedDeliverTarget(target);
 }
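Several call sites above schedule sweeps with a mode plus a peer set, and the close()-time teardown earlier clears `_repairSweepPendingModes` and per-mode peer sets, which suggests pending requests coalesce per mode before a sweep runs. A hypothetical sketch of such a queue (not the internal scheduler):

```ts
// Repeated schedule requests merge their peer sets; one drain loop serves
// all pending modes so concurrent sweeps never overlap.
class RepairSweepQueue {
    private pendingPeersByMode = new Map<string, Set<string>>();
    private running = false;

    constructor(
        private runSweep: (mode: string, peers: Set<string>) => Promise<void>,
    ) {}

    schedule(mode: string, peers: Iterable<string>): void {
        let set = this.pendingPeersByMode.get(mode);
        if (!set) this.pendingPeersByMode.set(mode, (set = new Set()));
        for (const peer of peers) set.add(peer);
        void this.drain();
    }

    private async drain(): Promise<void> {
        if (this.running) return; // the active drain will pick up new work
        this.running = true;
        try {
            while (this.pendingPeersByMode.size > 0) {
                const [mode, peers] = this.pendingPeersByMode.entries().next().value!;
                this.pendingPeersByMode.delete(mode);
                await this.runSweep(mode, peers);
            }
        } finally {
            this.running = false;
        }
    }
}
```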
@@ -4879,6 +5763,10 @@ let SharedLog = (() => {
 if (!dynamicRange) {
     return; // not allowed to replicate
 }
+if (this.replicationController.maxMemoryLimit != null &&
+    usedMemory > this.replicationController.maxMemoryLimit) {
+    await this.pruneIndexedEntriesNoLongerLed();
+}
 const peersSize = (await peers.getSize()) || 1;
 const totalParticipation = await this.calculateTotalParticipation();
 const newFactor = this.replicationController.step({