@enbox/agent 0.5.13 → 0.5.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,7 +21,9 @@ import { createClosureContext, invalidateClosureCache } from './sync-closure-typ
21
21
  import { AgentPermissionsApi } from './permissions-api.js';
22
22
  import { DwnInterface } from './types/dwn.js';
23
23
  import { isRecordsWrite } from './utils.js';
24
+ import { SyncLinkReconciler } from './sync-link-reconciler.js';
24
25
  import { topologicalSort } from './sync-topological-sort.js';
26
+ import { buildLegacyCursorKey, buildLinkId } from './sync-link-id.js';
25
27
  import { fetchRemoteMessages, pullMessages, pushMessages } from './sync-messages.js';
26
28
 
27
29
  export type SyncEngineLevelParams = {
@@ -46,20 +48,16 @@ const MAX_DIFF_DEPTH = 16;
46
48
  const BATCHED_DIFF_DEPTH = 8;
47
49
 
48
50
  /**
49
- /**
50
- * Key for the subscription cursor sublevel. Cursors are keyed by
51
- * `{did}^{dwnUrl}[^{protocol}]` and store an opaque EventLog cursor string.
52
- */
53
- const CURSOR_SEPARATOR = '^';
54
-
55
- /**
56
- * Debounce window for push-on-write. When the local EventLog emits events,
57
- * we batch them and push after this delay to avoid a push per individual write.
51
+ * Debounce window for batching writes that arrive while a push is in flight.
52
+ * The first write in a quiet window triggers an immediate push; subsequent
53
+ * writes arriving during the push are batched and flushed after this delay
54
+ * once the in-flight push completes.
58
55
  */
59
- const PUSH_DEBOUNCE_MS = 250;
56
+ const PUSH_DEBOUNCE_MS = 100;
60
57
 
61
58
  /** Tracks a live subscription to a remote DWN for one sync target. */
62
59
  type LiveSubscription = {
60
+ linkKey: string;
63
61
  did: string;
64
62
  dwnUrl: string;
65
63
  delegateDid?: string;
@@ -69,6 +67,7 @@ type LiveSubscription = {
69
67
 
70
68
  /** Tracks a local EventLog subscription for push-on-write. */
71
69
  type LocalSubscription = {
70
+ linkKey: string;
72
71
  did: string;
73
72
  dwnUrl: string;
74
73
  delegateDid?: string;
@@ -150,6 +149,18 @@ type LinkRuntimeState = {
150
149
  inflight: Map<number, InFlightCommit>;
151
150
  };
152
151
 
152
+ type PushRuntimeState = {
153
+ did: string;
154
+ dwnUrl: string;
155
+ delegateDid?: string;
156
+ protocol?: string;
157
+ entries: { cid: string }[];
158
+ retryCount: number;
159
+ timer?: ReturnType<typeof setTimeout>;
160
+ /** True while a push HTTP request is in flight for this link. */
161
+ flushing?: boolean;
162
+ };
163
+
153
164
  export class SyncEngineLevel implements SyncEngine {
154
165
  /**
155
166
  * Holds the instance of a `EnboxPlatformAgent` that represents the current execution context for
@@ -170,8 +181,7 @@ export class SyncEngineLevel implements SyncEngine {
170
181
 
171
182
  /**
172
183
  * Durable replication ledger — persists per-link checkpoint state.
173
- * Used by live sync to track pull/push progression independently per link.
174
- * Poll-mode sync still uses the legacy `getCursor`/`setCursor` path.
184
+ * Used by live sync to track pull progression per link.
175
185
  * Lazily initialized on first use to avoid sublevel() calls on mock dbs.
176
186
  */
177
187
  private _ledger?: ReplicationLedger;
@@ -211,7 +221,7 @@ export class SyncEngineLevel implements SyncEngine {
211
221
  * and bail if it has changed — this prevents stale work from mutating
212
222
  * state after teardown or mode switch.
213
223
  */
214
- private _syncGeneration = 0;
224
+ private _engineGeneration = 0;
215
225
 
216
226
  /** Active live pull subscriptions (remote -> local via MessagesSubscribe). */
217
227
  private _liveSubscriptions: LiveSubscription[] = [];
@@ -222,17 +232,11 @@ export class SyncEngineLevel implements SyncEngine {
222
232
  /** Connectivity state derived from subscription health. */
223
233
  private _connectivityState: SyncConnectivityState = 'unknown';
224
234
 
225
- /** Debounce timer for batched push-on-write. */
226
- private _pushDebounceTimer?: ReturnType<typeof setTimeout>;
227
-
228
235
  /** Registered event listeners for observability. */
229
236
  private _eventListeners: Set<SyncEventListener> = new Set();
230
237
 
231
- /** Entry in the pending push queue — a message CID with its local EventLog token. */
232
- private _pendingPushCids: Map<string, {
233
- did: string; dwnUrl: string; delegateDid?: string; protocol?: string;
234
- entries: { cid: string; localToken?: ProgressToken }[];
235
- }> = new Map();
238
+ /** Per-link push runtime: queue, debounce timer, retry state. */
239
+ private _pushRuntimes: Map<string, PushRuntimeState> = new Map();
236
240
 
237
241
  /**
238
242
  * CIDs recently received via pull subscription, keyed by `cid|dwnUrl` to
@@ -334,11 +338,13 @@ export class SyncEngineLevel implements SyncEngine {
334
338
  }
335
339
 
336
340
  public async clear(): Promise<void> {
341
+ await this.teardownLiveSync();
337
342
  await this._permissionsApi.clear();
338
343
  await this._db.clear();
339
344
  }
340
345
 
341
346
  public async close(): Promise<void> {
347
+ await this.teardownLiveSync();
342
348
  await this._db.close();
343
349
  }
344
350
 
@@ -405,98 +411,61 @@ export class SyncEngineLevel implements SyncEngine {
405
411
 
406
412
  this._syncLock = true;
407
413
  try {
408
- // Iterate over all registered identities and their DWN endpoints.
414
+ // Group targets by remote endpoint so each URL group can be reconciled
415
+ // concurrently. Within a group, targets are processed sequentially so
416
+ // that a single network failure skips the rest of that group.
409
417
  const syncTargets = await this.getSyncTargets();
410
- const errored = new Set<string>();
411
- let hadFailure = false;
412
-
418
+ const byUrl = new Map<string, typeof syncTargets>();
413
419
  for (const target of syncTargets) {
414
- const { did, delegateDid, dwnUrl, protocol } = target;
415
-
416
- if (errored.has(dwnUrl)) {
417
- continue;
420
+ let group = byUrl.get(target.dwnUrl);
421
+ if (!group) {
422
+ group = [];
423
+ byUrl.set(target.dwnUrl, group);
418
424
  }
425
+ group.push(target);
426
+ }
419
427
 
420
- try {
421
- // Phase 1: Compare SMT roots between local and remote.
422
- const localRoot = await this.getLocalRoot(did, delegateDid, protocol);
423
- const remoteRoot = await this.getRemoteRoot(did, dwnUrl, delegateDid, protocol);
424
-
425
- if (localRoot === remoteRoot) {
426
- // Trees are identical — nothing to sync for this target.
427
- continue;
428
- }
429
-
430
- // Phase 2: Compute the diff in a single round-trip using the
431
- // batched 'diff' action. This replaces the per-node tree walk
432
- // that previously required dozens of HTTP requests.
433
- const diff = await this.diffWithRemote({
434
- did, dwnUrl, delegateDid, protocol,
435
- });
428
+ let groupsSucceeded = 0;
429
+ let groupsFailed = 0;
436
430
 
437
- // Phase 3: Pull missing messages (remote has, local doesn't).
438
- // The diff response may include inline message data — use it
439
- // directly instead of re-fetching via individual MessagesRead calls.
440
- if (!direction || direction === 'pull') {
441
- if (diff.onlyRemote.length > 0) {
442
- // Separate entries into three categories:
443
- // 1. Fully prefetched: have message + inline data (or no data needed)
444
- // 2. Need data fetch: have message but missing data for RecordsWrite
445
- // 3. Need full fetch: no message at all
446
- const prefetched: (MessagesSyncDiffEntry & { message: GenericMessage })[] = [];
447
- const needsFetchCids: string[] = [];
448
-
449
- for (const entry of diff.onlyRemote) {
450
- if (!entry.message) {
451
- // No message at all — need full fetch.
452
- needsFetchCids.push(entry.messageCid);
453
- } else if (
454
- entry.message.descriptor.interface === 'Records' &&
455
- entry.message.descriptor.method === 'Write' &&
456
- (entry.message.descriptor as any).dataCid &&
457
- !entry.encodedData
458
- ) {
459
- // RecordsWrite with data but data wasn't inlined (too large).
460
- // Need to fetch individually to get the data stream.
461
- needsFetchCids.push(entry.messageCid);
462
- } else {
463
- // Fully prefetched (message + data or no data needed).
464
- prefetched.push(entry as MessagesSyncDiffEntry & { message: GenericMessage });
465
- }
466
- }
467
- await this.pullMessages({
468
- did, dwnUrl, delegateDid, protocol,
469
- messageCids: needsFetchCids,
470
- prefetched,
471
- });
472
- }
431
+ const results = await Promise.allSettled([...byUrl.entries()].map(async ([dwnUrl, targets]) => {
432
+ for (const target of targets) {
433
+ const { did, delegateDid, protocol } = target;
434
+ try {
435
+ await this.createLinkReconciler().reconcile({
436
+ did, dwnUrl, delegateDid, protocol,
437
+ }, { direction });
438
+ } catch (error: any) {
439
+ // Skip remaining targets for this DWN endpoint.
440
+ groupsFailed++;
441
+ console.error(`SyncEngineLevel: Error syncing ${did} with ${dwnUrl}`, error);
442
+ return;
473
443
  }
444
+ }
445
+ groupsSucceeded++;
446
+ }));
474
447
 
475
- // Phase 4: Push missing messages (local has, remote doesn't).
476
- if (!direction || direction === 'push') {
477
- if (diff.onlyLocal.length > 0) {
478
- await this.pushMessages({ did, dwnUrl, delegateDid, protocol, messageCids: diff.onlyLocal });
479
- }
480
- }
481
- } catch (error: any) {
482
- // Skip this DWN endpoint for remaining targets and log the real cause.
483
- errored.add(dwnUrl);
484
- hadFailure = true;
485
- console.error(`SyncEngineLevel: Error syncing ${did} with ${dwnUrl}`, error);
448
+ // Check for unexpected rejections (should not happen given inner try/catch).
449
+ for (const result of results) {
450
+ if (result.status === 'rejected') {
451
+ groupsFailed++;
486
452
  }
487
453
  }
488
454
 
489
- // Track consecutive failures for backoff in poll mode.
490
- if (hadFailure) {
455
+ // Track connectivity based on per-group outcomes. If at least one
456
+ // group succeeded, stay online — partial reachability is still online.
457
+ if (groupsSucceeded > 0) {
458
+ this._consecutiveFailures = 0;
459
+ this._connectivityState = 'online';
460
+ } else if (groupsFailed > 0) {
491
461
  this._consecutiveFailures++;
492
462
  if (this._connectivityState === 'online') {
493
463
  this._connectivityState = 'offline';
494
464
  }
495
- } else {
465
+ } else if (syncTargets.length > 0) {
466
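+ // Defensive fallback: every endpoint group above increments either groupsSucceeded or groupsFailed (matching roots count as success), so this branch is not expected to run.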
496
467
  this._consecutiveFailures = 0;
497
- if (syncTargets.length > 0) {
498
- this._connectivityState = 'online';
499
- }
468
+ this._connectivityState = 'online';
500
469
  }
501
470
  } finally {
502
471
  this._syncLock = false;
@@ -535,6 +504,7 @@ export class SyncEngineLevel implements SyncEngine {
535
504
  * and tearing down any live subscriptions.
536
505
  */
537
506
  public async stopSync(timeout: number = 2000): Promise<void> {
507
+ this._engineGeneration++;
538
508
  let elapsedTimeout = 0;
539
509
 
540
510
  while (this._syncLock) {
@@ -559,7 +529,9 @@ export class SyncEngineLevel implements SyncEngine {
559
529
  // ---------------------------------------------------------------------------
560
530
 
561
531
  private async startPollSync(intervalMilliseconds: number): Promise<void> {
532
+ const generation = this._engineGeneration;
562
533
  const intervalSync = async (): Promise<void> => {
534
+ if (this._engineGeneration !== generation) { return; }
563
535
  if (this._syncLock) {
564
536
  return;
565
537
  }
@@ -582,6 +554,7 @@ export class SyncEngineLevel implements SyncEngine {
582
554
  ? intervalMilliseconds * backoffMultiplier
583
555
  : intervalMilliseconds;
584
556
 
557
+ if (this._engineGeneration !== generation) { return; }
585
558
  if (!this._syncIntervalId) {
586
559
  this._syncIntervalId = setInterval(intervalSync, effectiveInterval);
587
560
  }
@@ -619,8 +592,9 @@ export class SyncEngineLevel implements SyncEngine {
619
592
  }
620
593
 
621
594
  // Step 2: Initialize replication links and open live subscriptions.
595
+ // Each target's link initialization is independent — process concurrently.
622
596
  const syncTargets = await this.getSyncTargets();
623
- for (const target of syncTargets) {
597
+ await Promise.allSettled(syncTargets.map(async (target) => {
624
598
  let link: ReplicationLinkState | undefined;
625
599
  try {
626
600
  // Get or create the link in the durable ledger.
@@ -637,20 +611,34 @@ export class SyncEngineLevel implements SyncEngine {
637
611
  });
638
612
 
639
613
  // Cache the link for fast access by subscription handlers.
640
- const linkKey = this.buildCursorKey(target.did, target.dwnUrl, target.protocol);
614
+ // Use scopeId from the link for consistent runtime identity.
615
+ const linkKey = this.buildLinkKey(target.did, target.dwnUrl, link.scopeId);
616
+
617
+ // One-time migration: if the link has no pull checkpoint, check for
618
+ // a legacy cursor in the old syncCursors sublevel. The legacy key
619
+ // used protocol, not scopeId, so we must build it the old way.
620
+ if (!link.pull.contiguousAppliedToken) {
621
+ const legacyKey = buildLegacyCursorKey(target.did, target.dwnUrl, target.protocol);
622
+ const legacyCursor = await this.getCursor(legacyKey);
623
+ if (legacyCursor) {
624
+ ReplicationLedger.resetCheckpoint(link.pull, legacyCursor);
625
+ await this.ledger.saveLink(link);
626
+ await this.deleteLegacyCursor(legacyKey);
627
+ }
628
+ }
629
+
641
630
  this._activeLinks.set(linkKey, link);
642
631
 
643
632
  // Open subscriptions — only transition to live if both succeed.
644
633
  // If pull succeeds but push fails, close the pull subscription to
645
634
  // avoid a resource leak with inconsistent state.
646
- await this.openLivePullSubscription(target);
635
+ const targetWithKey = { ...target, linkKey };
636
+ await this.openLivePullSubscription(targetWithKey);
647
637
  try {
648
- await this.openLocalPushSubscription(target);
638
+ await this.openLocalPushSubscription(targetWithKey);
649
639
  } catch (pushError) {
650
640
  // Close the already-opened pull subscription.
651
- const pullSub = this._liveSubscriptions.find(
652
- s => s.did === target.did && s.dwnUrl === target.dwnUrl && s.protocol === target.protocol
653
- );
641
+ const pullSub = this._liveSubscriptions.find((s) => s.linkKey === linkKey);
654
642
  if (pullSub) {
655
643
  try { await pullSub.close(); } catch { /* best effort */ }
656
644
  this._liveSubscriptions = this._liveSubscriptions.filter(s => s !== pullSub);
@@ -660,8 +648,16 @@ export class SyncEngineLevel implements SyncEngine {
660
648
 
661
649
  this.emitEvent({ type: 'link:status-change', tenantDid: target.did, remoteEndpoint: target.dwnUrl, protocol: target.protocol, from: 'initializing', to: 'live' });
662
650
  await this.ledger.setStatus(link!, 'live');
651
+
652
+ // If the link was marked dirty in a previous session, schedule
653
+ // immediate reconciliation now that subscriptions are open.
654
+ if (link!.needsReconcile) {
655
+ this.scheduleReconcile(linkKey, 1000);
656
+ }
663
657
  } catch (error: any) {
664
- const linkKey = this.buildCursorKey(target.did, target.dwnUrl, target.protocol);
658
+ const linkKey = link
659
+ ? this.buildLinkKey(target.did, target.dwnUrl, link.scopeId)
660
+ : buildLegacyCursorKey(target.did, target.dwnUrl, target.protocol);
665
661
 
666
662
  // Detect ProgressGap (410) — the cursor is stale, link needs SMT repair.
667
663
  if ((error as any).isProgressGap && link) {
@@ -671,7 +667,7 @@ export class SyncEngineLevel implements SyncEngine {
671
667
  await this.transitionToRepairing(linkKey, link, {
672
668
  resumeToken: gapInfo?.latestAvailable,
673
669
  });
674
- continue;
670
+ return;
675
671
  }
676
672
 
677
673
  console.error(`SyncEngineLevel: Failed to open live subscription for ${target.did} -> ${target.dwnUrl}`, error);
@@ -686,7 +682,7 @@ export class SyncEngineLevel implements SyncEngine {
686
682
  this._connectivityState = 'unknown';
687
683
  }
688
684
  }
689
- }
685
+ }));
690
686
 
691
687
  // Step 3: Schedule infrequent SMT integrity check.
692
688
  const integrityCheck = async (): Promise<void> => {
@@ -833,12 +829,12 @@ export class SyncEngineLevel implements SyncEngine {
833
829
  const backoff = SyncEngineLevel.REPAIR_BACKOFF_MS;
834
830
  const delayMs = backoff[Math.min(attempts - 1, backoff.length - 1)];
835
831
 
836
- const timerGeneration = this._syncGeneration;
832
+ const timerGeneration = this._engineGeneration;
837
833
  const timer = setTimeout(async (): Promise<void> => {
838
834
  this._repairRetryTimers.delete(linkKey);
839
835
 
840
836
  // Bail if teardown occurred since this timer was scheduled.
841
- if (this._syncGeneration !== timerGeneration) { return; }
837
+ if (this._engineGeneration !== timerGeneration) { return; }
842
838
 
843
839
  // Verify link still exists and is still repairing.
844
840
  const currentLink = this._activeLinks.get(linkKey);
@@ -868,6 +864,15 @@ export class SyncEngineLevel implements SyncEngine {
868
864
 
869
865
  const promise = this.doRepairLink(linkKey).finally(() => {
870
866
  this._activeRepairs.delete(linkKey);
867
+
868
+ // Post-repair reconcile: if doRepairLink() marked needsReconcile
869
+ // (to close the gap between diff snapshot and new push subscription),
870
+ // schedule reconciliation NOW — after _activeRepairs is cleared so
871
+ // scheduleReconcile() won't skip it.
872
+ const link = this._activeLinks.get(linkKey);
873
+ if (link?.needsReconcile && link.status === 'live') {
874
+ this.scheduleReconcile(linkKey, 500);
875
+ }
871
876
  });
872
877
  this._activeRepairs.set(linkKey, promise);
873
878
  return promise;
@@ -886,7 +891,7 @@ export class SyncEngineLevel implements SyncEngine {
886
891
  // Capture the sync generation at repair start. If teardown occurs during
887
892
  // any await, the generation will have incremented and we bail before
888
893
  // mutating state — preventing the race where repair continues after teardown.
889
- const generation = this._syncGeneration;
894
+ const generation = this._engineGeneration;
890
895
 
891
896
  const { tenantDid: did, remoteEndpoint: dwnUrl, delegateDid, protocol } = link;
892
897
 
@@ -897,7 +902,7 @@ export class SyncEngineLevel implements SyncEngine {
897
902
  // Step 1: Close existing subscriptions FIRST to stop old events from
898
903
  // mutating local state while repair runs.
899
904
  await this.closeLinkSubscriptions(link);
900
- if (this._syncGeneration !== generation) { return; } // Teardown occurred.
905
+ if (this._engineGeneration !== generation) { return; } // Teardown occurred.
901
906
 
902
907
  // Step 2: Clear runtime ordinals immediately — stale state must not
903
908
  // persist across repair attempts (successful or failed).
@@ -908,72 +913,64 @@ export class SyncEngineLevel implements SyncEngine {
908
913
 
909
914
  try {
910
915
  // Step 3: Run SMT reconciliation for this link.
911
- const localRoot = await this.getLocalRoot(did, delegateDid, protocol);
912
- if (this._syncGeneration !== generation) { return; }
913
- const remoteRoot = await this.getRemoteRoot(did, dwnUrl, delegateDid, protocol);
914
- if (this._syncGeneration !== generation) { return; }
915
-
916
- if (localRoot !== remoteRoot) {
917
- const diff = await this.diffWithRemote({ did, dwnUrl, delegateDid, protocol });
918
- if (this._syncGeneration !== generation) { return; }
919
-
920
- if (diff.onlyRemote.length > 0) {
921
- const prefetched: (MessagesSyncDiffEntry & { message: GenericMessage })[] = [];
922
- const needsFetchCids: string[] = [];
923
- for (const entry of diff.onlyRemote) {
924
- if (!entry.message || (entry.message.descriptor.interface === 'Records' &&
925
- entry.message.descriptor.method === 'Write' &&
926
- (entry.message.descriptor as any).dataCid && !entry.encodedData)) {
927
- needsFetchCids.push(entry.messageCid);
928
- } else {
929
- prefetched.push(entry as MessagesSyncDiffEntry & { message: GenericMessage });
930
- }
931
- }
932
- await this.pullMessages({ did, dwnUrl, delegateDid, protocol, messageCids: needsFetchCids, prefetched });
933
- if (this._syncGeneration !== generation) { return; }
934
- }
935
-
936
- if (diff.onlyLocal.length > 0) {
937
- await this.pushMessages({ did, dwnUrl, delegateDid, protocol, messageCids: diff.onlyLocal });
938
- if (this._syncGeneration !== generation) { return; }
939
- }
940
- }
916
+ const reconcileOutcome = await this.createLinkReconciler(
917
+ () => this._engineGeneration === generation
918
+ ).reconcile({ did, dwnUrl, delegateDid, protocol });
919
+ if (reconcileOutcome.aborted) { return; }
941
920
 
942
- // Step 4: Determine the post-repair resume token.
921
+ // Step 4: Determine the post-repair pull resume token.
943
922
  // - If repair was triggered by ProgressGap, use the stored resumeToken
944
923
  // (from gapInfo.latestAvailable) so the reopened subscription replays
945
924
  // from a valid boundary, closing the race window between SMT and resubscribe.
946
925
  // - Otherwise, use the existing contiguousAppliedToken if still valid.
947
- // - Push checkpoint is NOT reset during repair: push frontier tracks what
948
- // the local EventLog has delivered to the remote. SMT repair handles
949
- // pull-side convergence; push-side convergence is handled by the diff's
950
- // onlyLocal push. The push checkpoint remains the local authority.
926
+ // Push is opportunistic — no push checkpoint to reset.
951
927
  const repairCtx = this._repairContext.get(linkKey);
952
928
  const resumeToken = repairCtx?.resumeToken ?? link.pull.contiguousAppliedToken;
953
929
  ReplicationLedger.resetCheckpoint(link.pull, resumeToken);
954
930
  await this.ledger.saveLink(link);
955
- if (this._syncGeneration !== generation) { return; }
931
+ if (this._engineGeneration !== generation) { return; }
932
+
933
+ // Step 5: Reopen subscriptions.
934
+ // Mark needsReconcile BEFORE reopening — local push starts from "now",
935
+ // so any writes between the diff snapshot (step 3) and the new push
936
+ // subscription are invisible to both mechanisms. A short post-reopen
937
+ // reconcile will close this gap (cheap: SMT root comparison short-circuits
938
+ // if roots already match).
939
+ link.needsReconcile = true;
940
+ await this.ledger.saveLink(link);
941
+ if (this._engineGeneration !== generation) { return; }
956
942
 
957
- // Step 5: Reopen subscriptions with the repaired checkpoints.
958
- const target = { did, dwnUrl, delegateDid, protocol };
959
- await this.openLivePullSubscription(target);
960
- if (this._syncGeneration !== generation) { return; }
943
+ const target = { did, dwnUrl, delegateDid, protocol, linkKey };
961
944
  try {
962
- await this.openLocalPushSubscription({
963
- ...target,
964
- pushCursor: link.push.contiguousAppliedToken,
965
- });
945
+ await this.openLivePullSubscription(target);
946
+ } catch (pullErr: any) {
947
+ if (pullErr.isProgressGap) {
948
+ console.warn(`SyncEngineLevel: Stale pull resume token for ${did} -> ${dwnUrl}, resetting to start fresh`);
949
+ ReplicationLedger.resetCheckpoint(link.pull);
950
+ await this.ledger.saveLink(link);
951
+ if (this._engineGeneration !== generation) { return; }
952
+ await this.openLivePullSubscription(target);
953
+ } else {
954
+ throw pullErr;
955
+ }
956
+ }
957
+ if (this._engineGeneration !== generation) { return; }
958
+ try {
959
+ await this.openLocalPushSubscription(target);
966
960
  } catch (pushError) {
967
- const pullSub = this._liveSubscriptions.find(
968
- s => s.did === did && s.dwnUrl === dwnUrl && s.protocol === protocol
969
- );
961
+ const pullSub = this._liveSubscriptions.find((s) => s.linkKey === linkKey);
970
962
  if (pullSub) {
971
963
  try { await pullSub.close(); } catch { /* best effort */ }
972
964
  this._liveSubscriptions = this._liveSubscriptions.filter(s => s !== pullSub);
973
965
  }
974
966
  throw pushError;
975
967
  }
976
- if (this._syncGeneration !== generation) { return; }
968
+ if (this._engineGeneration !== generation) { return; }
969
+
970
+ // Note: post-repair reconcile to close the repair-window gap is
971
+ // scheduled by repairLink() AFTER _activeRepairs is cleared — not
972
+ // here, because scheduleReconcile() would skip it while _activeRepairs
973
+ // still contains this link.
977
974
 
978
975
  // Step 6: Clean up repair context and transition to live.
979
976
  this._repairContext.delete(linkKey);
@@ -991,7 +988,7 @@ export class SyncEngineLevel implements SyncEngine {
991
988
 
992
989
  } catch (error: any) {
993
990
  // If teardown occurred during repair, don't retry or enter degraded_poll.
994
- if (this._syncGeneration !== generation) { return; }
991
+ if (this._engineGeneration !== generation) { return; }
995
992
 
996
993
  console.error(`SyncEngineLevel: Repair failed for ${did} -> ${dwnUrl} (attempt ${attempts})`, error);
997
994
  this.emitEvent({ type: 'repair:failed', tenantDid: did, remoteEndpoint: dwnUrl, protocol, attempt: attempts, error: String(error.message ?? error) });
@@ -1011,21 +1008,18 @@ export class SyncEngineLevel implements SyncEngine {
1011
1008
  * Close pull and push subscriptions for a specific link.
1012
1009
  */
1013
1010
  private async closeLinkSubscriptions(link: ReplicationLinkState): Promise<void> {
1014
- const { tenantDid: did, remoteEndpoint: dwnUrl, protocol } = link;
1011
+ const { tenantDid: did, remoteEndpoint: dwnUrl } = link;
1012
+ const linkKey = this.buildLinkKey(did, dwnUrl, link.scopeId);
1015
1013
 
1016
1014
  // Close pull subscription.
1017
- const pullSub = this._liveSubscriptions.find(
1018
- s => s.did === did && s.dwnUrl === dwnUrl && s.protocol === protocol
1019
- );
1015
+ const pullSub = this._liveSubscriptions.find((s) => s.linkKey === linkKey);
1020
1016
  if (pullSub) {
1021
1017
  try { await pullSub.close(); } catch { /* best effort */ }
1022
1018
  this._liveSubscriptions = this._liveSubscriptions.filter(s => s !== pullSub);
1023
1019
  }
1024
1020
 
1025
1021
  // Close local push subscription.
1026
- const pushSub = this._localSubscriptions.find(
1027
- s => s.did === did && s.dwnUrl === dwnUrl && s.protocol === protocol
1028
- );
1022
+ const pushSub = this._localSubscriptions.find((s) => s.linkKey === linkKey);
1029
1023
  if (pushSub) {
1030
1024
  try { await pushSub.close(); } catch { /* best effort */ }
1031
1025
  this._localSubscriptions = this._localSubscriptions.filter(s => s !== pushSub);
@@ -1057,10 +1051,10 @@ export class SyncEngineLevel implements SyncEngine {
1057
1051
  const jitter = Math.floor(Math.random() * 15_000);
1058
1052
  const interval = baseInterval + jitter;
1059
1053
 
1060
- const pollGeneration = this._syncGeneration;
1054
+ const pollGeneration = this._engineGeneration;
1061
1055
  const timer = setInterval(async (): Promise<void> => {
1062
1056
  // Bail if teardown occurred since this timer was created.
1063
- if (this._syncGeneration !== pollGeneration) {
1057
+ if (this._engineGeneration !== pollGeneration) {
1064
1058
  clearInterval(timer);
1065
1059
  this._degradedPollTimers.delete(linkKey);
1066
1060
  return;
@@ -1105,16 +1099,15 @@ export class SyncEngineLevel implements SyncEngine {
1105
1099
  // Increment generation to invalidate all in-flight async operations
1106
1100
  // (repairs, retry timers, degraded-poll ticks). Any async work that
1107
1101
  // captured the previous generation will bail on its next checkpoint.
1108
- this._syncGeneration++;
1102
+ this._engineGeneration++;
1109
1103
 
1110
- // Clear the push debounce timer.
1111
- if (this._pushDebounceTimer) {
1112
- clearTimeout(this._pushDebounceTimer);
1113
- this._pushDebounceTimer = undefined;
1104
+ // Clear per-link push runtime state.
1105
+ for (const pushRuntime of this._pushRuntimes.values()) {
1106
+ if (pushRuntime.timer) {
1107
+ clearTimeout(pushRuntime.timer);
1108
+ }
1114
1109
  }
1115
-
1116
- // Flush any pending push CIDs.
1117
- this._pendingPushCids.clear();
1110
+ this._pushRuntimes.clear();
1118
1111
 
1119
1112
  // Close all live pull subscriptions.
1120
1113
  for (const sub of this._liveSubscriptions) {
@@ -1149,8 +1142,16 @@ export class SyncEngineLevel implements SyncEngine {
1149
1142
  this._repairRetryTimers.clear();
1150
1143
  this._repairContext.clear();
1151
1144
 
1145
+ // Clear reconcile timers and in-flight operations.
1146
+ for (const timer of this._reconcileTimers.values()) {
1147
+ clearTimeout(timer);
1148
+ }
1149
+ this._reconcileTimers.clear();
1150
+ this._reconcileInFlight.clear();
1151
+
1152
1152
  // Clear closure evaluation contexts.
1153
1153
  this._closureContexts.clear();
1154
+ this._recentlyPulledCids.clear();
1154
1155
 
1155
1156
  // Clear the in-memory link and runtime state.
1156
1157
  this._activeLinks.clear();
@@ -1167,13 +1168,15 @@ export class SyncEngineLevel implements SyncEngine {
1167
1168
  */
1168
1169
  private async openLivePullSubscription(target: {
1169
1170
  did: string; dwnUrl: string; delegateDid?: string; protocol?: string;
1171
+ linkKey: string;
1170
1172
  }): Promise<void> {
1171
1173
  const { did, delegateDid, dwnUrl, protocol } = target;
1172
1174
 
1173
- // Resolve the cursor from the link's pull checkpoint (preferred) or legacy storage.
1174
- const cursorKey = this.buildCursorKey(did, dwnUrl, protocol);
1175
+ // Resolve the cursor from the link's durable pull checkpoint.
1176
+ // Legacy syncCursors migration happens at link load time in startLiveSync().
1177
+ const cursorKey = target.linkKey;
1175
1178
  const link = this._activeLinks.get(cursorKey);
1176
- let cursor = link?.pull.contiguousAppliedToken ?? await this.getCursor(cursorKey);
1179
+ let cursor = link?.pull.contiguousAppliedToken;
1177
1180
 
1178
1181
  // Guard against corrupted tokens with empty fields — these would fail
1179
1182
  // MessagesSubscribe JSON schema validation (minLength: 1). Discard and
@@ -1217,11 +1220,17 @@ export class SyncEngineLevel implements SyncEngine {
1217
1220
  permissionGrantId = grant.grant.id;
1218
1221
  }
1219
1222
 
1223
+ const handlerGeneration = this._engineGeneration;
1224
+
1220
1225
  // Define the subscription handler that processes incoming events.
1221
1226
  // NOTE: The WebSocket client fires handlers without awaiting (fire-and-forget),
1222
1227
  // so multiple handlers can be in-flight concurrently. The ordinal tracker
1223
1228
  // ensures the checkpoint advances only when all earlier deliveries are committed.
1224
1229
  const subscriptionHandler = async (subMessage: SubscriptionMessage): Promise<void> => {
1230
+ if (this._engineGeneration !== handlerGeneration) {
1231
+ return;
1232
+ }
1233
+
1225
1234
  if (subMessage.type === 'eose') {
1226
1235
  // End-of-stored-events — catch-up complete.
1227
1236
  if (link) {
@@ -1243,8 +1252,6 @@ export class SyncEngineLevel implements SyncEngine {
1243
1252
  // far as the contiguous drain reaches.
1244
1253
  this.drainCommittedPull(cursorKey);
1245
1254
  await this.ledger.saveLink(link);
1246
- } else {
1247
- await this.setCursor(cursorKey, subMessage.cursor);
1248
1255
  }
1249
1256
  // Transport is reachable — set connectivity to online.
1250
1257
  if (link) {
@@ -1253,6 +1260,10 @@ export class SyncEngineLevel implements SyncEngine {
1253
1260
  if (prevEoseConnectivity !== 'online') {
1254
1261
  this.emitEvent({ type: 'link:connectivity-change', tenantDid: did, remoteEndpoint: dwnUrl, protocol, from: prevEoseConnectivity, to: 'online' });
1255
1262
  }
1263
+ // If the link was marked dirty, schedule reconciliation now that it's healthy.
1264
+ if (link.needsReconcile) {
1265
+ this.scheduleReconcile(cursorKey, 500);
1266
+ }
1256
1267
  } else {
1257
1268
  this._connectivityState = 'online';
1258
1269
  }
@@ -1399,9 +1410,6 @@ export class SyncEngineLevel implements SyncEngine {
1399
1410
  console.warn(`SyncEngineLevel: Pull in-flight overflow for ${did} -> ${dwnUrl}, transitioning to repairing`);
1400
1411
  await this.transitionToRepairing(cursorKey, link);
1401
1412
  }
1402
- } else if (!link) {
1403
- // Legacy path: no link available, use simple cursor persistence.
1404
- await this.setCursor(cursorKey, subMessage.cursor);
1405
1413
  }
1406
1414
  } catch (error: any) {
1407
1415
  console.error(`SyncEngineLevel: Error processing live-pull event for ${did}`, error);
@@ -1480,15 +1488,16 @@ export class SyncEngineLevel implements SyncEngine {
1480
1488
  }
1481
1489
 
1482
1490
  this._liveSubscriptions.push({
1491
+ linkKey : cursorKey,
1483
1492
  did,
1484
1493
  dwnUrl,
1485
1494
  delegateDid,
1486
1495
  protocol,
1487
- close: async (): Promise<void> => { await reply.subscription!.close(); },
1496
+ close : async (): Promise<void> => { await reply.subscription!.close(); },
1488
1497
  });
1489
1498
 
1490
1499
  // Set per-link connectivity to online after successful subscription setup.
1491
- const pullLink = this._activeLinks.get(this.buildCursorKey(did, dwnUrl, protocol));
1500
+ const pullLink = this._activeLinks.get(cursorKey);
1492
1501
  if (pullLink) {
1493
1502
  const prevPullConnectivity = pullLink.connectivity;
1494
1503
  pullLink.connectivity = 'online';
@@ -1508,23 +1517,10 @@ export class SyncEngineLevel implements SyncEngine {
1508
1517
  */
1509
1518
  private async openLocalPushSubscription(target: {
1510
1519
  did: string; dwnUrl: string; delegateDid?: string; protocol?: string;
1511
- pushCursor?: ProgressToken;
1520
+ linkKey: string;
1512
1521
  }): Promise<void> {
1513
1522
  const { did, delegateDid, dwnUrl, protocol } = target;
1514
1523
 
1515
- // Guard against corrupted push cursors — same validation as the pull side.
1516
- let pushCursor = target.pushCursor;
1517
- if (pushCursor && (!pushCursor.streamId || !pushCursor.messageCid || !pushCursor.epoch || !pushCursor.position)) {
1518
- console.warn(`SyncEngineLevel: Discarding stored push cursor with empty field(s) for ${did} -> ${dwnUrl}`);
1519
- pushCursor = undefined;
1520
- const cursorKey = this.buildCursorKey(did, dwnUrl, protocol);
1521
- const link = this._activeLinks.get(cursorKey);
1522
- if (link) {
1523
- ReplicationLedger.resetCheckpoint(link.push);
1524
- await this.ledger.saveLink(link);
1525
- }
1526
- }
1527
-
1528
1524
  // Build filters scoped to the protocol (if any).
1529
1525
  const filters = protocol ? [{ protocol }] : [];
1530
1526
 
@@ -1541,41 +1537,28 @@ export class SyncEngineLevel implements SyncEngine {
1541
1537
  permissionGrantId = grant.grant.id;
1542
1538
  }
1543
1539
 
1540
+ const handlerGeneration = this._engineGeneration;
1541
+
1544
1542
  // Subscribe to the local DWN's EventLog.
1545
1543
  const subscriptionHandler = async (subMessage: SubscriptionMessage): Promise<void> => {
1544
+ if (this._engineGeneration !== handlerGeneration) {
1545
+ return;
1546
+ }
1547
+
1546
1548
  if (subMessage.type !== 'event') {
1547
1549
  return;
1548
1550
  }
1549
1551
 
1550
- // Subset scope filtering for push: only push events that match the
1551
- // link's scope prefixes. Events outside the scope are not our responsibility.
1552
- // Skipped events MUST advance the push checkpoint to prevent infinite
1553
- // replay after repair/reconnect (same reason as the pull side).
1554
- const pushLink = this._activeLinks.get(this.buildCursorKey(did, dwnUrl, protocol));
1552
+ // Subset scope filtering: only push events that match the link's
1553
+ // scope prefixes. Events outside the scope are not our responsibility.
1554
+ const pushLinkKey = target.linkKey;
1555
+ const pushLink = this._activeLinks.get(pushLinkKey);
1555
1556
  if (pushLink && !isEventInScope(subMessage.event.message, pushLink.scope)) {
1556
- // Guard: only mutate durable state when the link is live/initializing.
1557
- // During repair/degraded_poll, orchestration owns checkpoint progression.
1558
- if (pushLink.status !== 'live' && pushLink.status !== 'initializing') {
1559
- return;
1560
- }
1561
-
1562
- // Validate token domain before committing — a stream/epoch mismatch
1563
- // on the local EventLog should trigger repair, not silently overwrite.
1564
- if (!ReplicationLedger.validateTokenDomain(pushLink.push, subMessage.cursor)) {
1565
- await this.transitionToRepairing(
1566
- this.buildCursorKey(did, dwnUrl, protocol), pushLink
1567
- );
1568
- return;
1569
- }
1570
-
1571
- ReplicationLedger.setReceivedToken(pushLink.push, subMessage.cursor);
1572
- ReplicationLedger.commitContiguousToken(pushLink.push, subMessage.cursor);
1573
- await this.ledger.saveLink(pushLink);
1574
1557
  return;
1575
1558
  }
1576
1559
 
1577
1560
  // Accumulate the message CID for a debounced push.
1578
- const targetKey = this.buildCursorKey(did, dwnUrl, protocol);
1561
+ const targetKey = pushLinkKey;
1579
1562
  const cid = await Message.getCid(subMessage.event.message);
1580
1563
  if (cid === undefined) {
1581
1564
  return;
@@ -1588,32 +1571,28 @@ export class SyncEngineLevel implements SyncEngine {
1588
1571
  return;
1589
1572
  }
1590
1573
 
1591
- let pending = this._pendingPushCids.get(targetKey);
1592
- if (!pending) {
1593
- pending = { did, dwnUrl, delegateDid, protocol, entries: [] };
1594
- this._pendingPushCids.set(targetKey, pending);
1595
- }
1596
- pending.entries.push({ cid, localToken: subMessage.cursor });
1574
+ const pushRuntime = this.getOrCreatePushRuntime(targetKey, {
1575
+ did, dwnUrl, delegateDid, protocol,
1576
+ });
1577
+ pushRuntime.entries.push({ cid });
1597
1578
 
1598
- // Debounce the push.
1599
- if (this._pushDebounceTimer) {
1600
- clearTimeout(this._pushDebounceTimer);
1579
+ // Immediate-first: if no push is in flight and no batch timer is
1580
+ // pending, push immediately. Otherwise, the pending batch timer
1581
+ // or the post-flush drain will pick up the new entry.
1582
+ if (!pushRuntime.flushing && !pushRuntime.timer) {
1583
+ void this.flushPendingPushesForLink(targetKey);
1601
1584
  }
1602
- this._pushDebounceTimer = setTimeout((): void => {
1603
- void this.flushPendingPushes();
1604
- }, PUSH_DEBOUNCE_MS);
1605
1585
  };
1606
1586
 
1607
- // Process the local subscription request.
1608
- // When a push cursor is provided (e.g., after repair), the local subscription
1609
- // replays events from that position, closing the race window where local
1610
- // writes during repair would otherwise be missed by push-on-write.
1587
+ // Subscribe to the local DWN EventLog from "now" — opportunistic push
1588
+ // does not replay from a stored cursor. Any writes missed during outages
1589
+ // are recovered by the post-repair reconciliation path.
1611
1590
  const response = await this.agent.dwn.processRequest({
1612
1591
  author : did,
1613
1592
  target : did,
1614
1593
  messageType : DwnInterface.MessagesSubscribe,
1615
1594
  granteeDid : delegateDid,
1616
- messageParams : { filters, permissionGrantId, cursor: pushCursor },
1595
+ messageParams : { filters, permissionGrantId },
1617
1596
  subscriptionHandler : subscriptionHandler as any,
1618
1597
  });
1619
1598
 
@@ -1623,11 +1602,12 @@ export class SyncEngineLevel implements SyncEngine {
1623
1602
  }
1624
1603
 
1625
1604
  this._localSubscriptions.push({
1605
+ linkKey : target.linkKey ?? buildLegacyCursorKey(did, dwnUrl, protocol),
1626
1606
  did,
1627
1607
  dwnUrl,
1628
1608
  delegateDid,
1629
1609
  protocol,
1630
- close: async (): Promise<void> => { await reply.subscription!.close(); },
1610
+ close : async (): Promise<void> => { await reply.subscription!.close(); },
1631
1611
  });
1632
1612
  }
1633
1613
 
@@ -1635,112 +1615,261 @@ export class SyncEngineLevel implements SyncEngine {
1635
1615
  * Flushes accumulated push CIDs to remote DWNs.
1636
1616
  */
1637
1617
  private async flushPendingPushes(): Promise<void> {
1638
- this._pushDebounceTimer = undefined;
1618
+ await Promise.all([...this._pushRuntimes.keys()].map(async (linkKey) => {
1619
+ await this.flushPendingPushesForLink(linkKey);
1620
+ }));
1621
+ }
1622
+
1623
+ private async flushPendingPushesForLink(linkKey: string): Promise<void> {
1624
+ const pushRuntime = this._pushRuntimes.get(linkKey);
1625
+ if (!pushRuntime) {
1626
+ return;
1627
+ }
1639
1628
 
1640
- const batches = [...this._pendingPushCids.entries()];
1641
- this._pendingPushCids.clear();
1629
+ const { did, dwnUrl, delegateDid, protocol, entries: pushEntries, retryCount } = pushRuntime;
1630
+ pushRuntime.entries = [];
1642
1631
 
1643
- // Push to all endpoints in parallel — each target is independent.
1644
- await Promise.all(batches.map(async ([targetKey, pending]) => {
1645
- const { did, dwnUrl, delegateDid, protocol, entries: pushEntries } = pending;
1646
- if (pushEntries.length === 0) {
1647
- return;
1632
+ if (pushEntries.length === 0) {
1633
+ if (!pushRuntime.timer && !pushRuntime.flushing && retryCount === 0) {
1634
+ this._pushRuntimes.delete(linkKey);
1648
1635
  }
1636
+ return;
1637
+ }
1649
1638
 
1650
- const cids = pushEntries.map(e => e.cid);
1639
+ const cids = pushEntries.map((entry) => entry.cid);
1640
+ pushRuntime.flushing = true;
1651
1641
 
1652
- try {
1653
- const result = await pushMessages({
1642
+ try {
1643
+ const result = await pushMessages({
1644
+ did, dwnUrl, delegateDid, protocol,
1645
+ messageCids : cids,
1646
+ agent : this.agent,
1647
+ permissionsApi : this._permissionsApi,
1648
+ });
1649
+
1650
+ if (result.failed.length > 0) {
1651
+ const failedSet = new Set(result.failed);
1652
+ const failedEntries = pushEntries.filter((entry) => failedSet.has(entry.cid));
1653
+ this.requeueOrReconcile(linkKey, {
1654
1654
  did, dwnUrl, delegateDid, protocol,
1655
- messageCids : cids,
1656
- agent : this.agent,
1657
- permissionsApi : this._permissionsApi,
1655
+ entries : failedEntries,
1656
+ retryCount : retryCount + 1,
1658
1657
  });
1659
-
1660
- // Advance the push checkpoint for successfully pushed entries.
1661
- // Push is sequential (single batch, in-order processing) so we can
1662
- // commit directly without ordinal tracking — there's no concurrent
1663
- // completion to reorder.
1664
- const link = this._activeLinks.get(targetKey);
1665
- if (link) {
1666
- const succeededSet = new Set(result.succeeded);
1667
- // Track highest contiguous success: if a CID fails, we stop advancing.
1668
- let hitFailure = false;
1669
- for (const entry of pushEntries) {
1670
- if (hitFailure) { break; }
1671
- if (succeededSet.has(entry.cid) && entry.localToken) {
1672
- if (!ReplicationLedger.validateTokenDomain(link.push, entry.localToken)) {
1673
- console.warn(`SyncEngineLevel: Push checkpoint domain mismatch for ${did} -> ${dwnUrl}, transitioning to repairing`);
1674
- await this.transitionToRepairing(targetKey, link);
1675
- break;
1676
- }
1677
- ReplicationLedger.setReceivedToken(link.push, entry.localToken);
1678
- ReplicationLedger.commitContiguousToken(link.push, entry.localToken);
1679
- } else {
1680
- // This CID failed or had no token — stop advancing.
1681
- hitFailure = true;
1682
- }
1683
- }
1684
- await this.ledger.saveLink(link);
1658
+ } else {
1659
+ // Successful push — reset the retry count so subsequent unrelated
1660
+ // batches on this link start with a fresh budget.
1661
+ pushRuntime.retryCount = 0;
1662
+ if (!pushRuntime.timer && pushRuntime.entries.length === 0) {
1663
+ this._pushRuntimes.delete(linkKey);
1685
1664
  }
1665
+ }
1666
+ } catch (error: any) {
1667
+ console.error(`SyncEngineLevel: Push batch failed for ${did} -> ${dwnUrl}`, error);
1668
+ this.requeueOrReconcile(linkKey, {
1669
+ did, dwnUrl, delegateDid, protocol,
1670
+ entries : pushEntries,
1671
+ retryCount : retryCount + 1,
1672
+ });
1673
+ } finally {
1674
+ pushRuntime.flushing = false;
1675
+
1676
+ // If new entries accumulated while this push was in flight, schedule
1677
+ // a short drain to flush them. This gives a brief batching window
1678
+ // for burst writes while keeping single-write latency low.
1679
+ const rt = this._pushRuntimes.get(linkKey);
1680
+ if (rt && rt.entries.length > 0 && !rt.timer) {
1681
+ rt.timer = setTimeout((): void => {
1682
+ rt.timer = undefined;
1683
+ void this.flushPendingPushesForLink(linkKey);
1684
+ }, PUSH_DEBOUNCE_MS);
1685
+ }
1686
+ }
1687
+ }
1686
1688
 
1687
- // Re-queue only TRANSIENT failures for retry. Permanent failures (400/401/403)
1688
- // are dropped — they will never succeed regardless of retry.
1689
- if (result.failed.length > 0) {
1690
- console.error(
1691
- `SyncEngineLevel: Push-on-write failed for ${did} -> ${dwnUrl}: ` +
1692
- `${result.failed.length} transient failures of ${cids.length} messages`
1693
- );
1694
- const failedSet = new Set(result.failed);
1695
- const failedEntries = pushEntries.filter(e => failedSet.has(e.cid));
1696
- let requeued = this._pendingPushCids.get(targetKey);
1697
- if (!requeued) {
1698
- requeued = { did, dwnUrl, delegateDid, protocol, entries: [] };
1699
- this._pendingPushCids.set(targetKey, requeued);
1700
- }
1701
- requeued.entries.push(...failedEntries);
1689
+ /** Push retry backoff schedule (ms), indexed by retry count: the initial attempt is immediate, retries wait 250ms, 1s, then 2s, then give up. */
1690
+ private static readonly PUSH_RETRY_BACKOFF_MS = [0, 250, 1000, 2000];
1702
1691
 
1703
- // Schedule a retry after a short delay.
1704
- if (!this._pushDebounceTimer) {
1705
- this._pushDebounceTimer = setTimeout((): void => {
1706
- void this.flushPendingPushes();
1707
- }, PUSH_DEBOUNCE_MS * 4);
1708
- }
1709
- }
1710
- // Permanent failures are logged by pushMessages but NOT re-queued.
1711
- // They will be rediscovered by the next SMT integrity check if the
1712
- // local/remote state has changed, but won't spin in a retry loop.
1713
- } catch (error: any) {
1714
- // Truly unexpected error (not per-message failure). Re-queue entire
1715
- // batch so entries aren't silently dropped from the debounce queue.
1716
- console.error(`SyncEngineLevel: Push-on-write failed for ${did} -> ${dwnUrl}`, error);
1717
- let requeued = this._pendingPushCids.get(targetKey);
1718
- if (!requeued) {
1719
- requeued = { did, dwnUrl, delegateDid, protocol, entries: [] };
1720
- this._pendingPushCids.set(targetKey, requeued);
1721
- }
1722
- requeued.entries.push(...pushEntries);
1692
+ /**
1693
+ * Re-queues a failed push batch for retry, or marks the link
1694
+ * `needsReconcile` if retries are exhausted. Bounded to prevent
1695
+ * infinite retry loops.
1696
+ */
1697
+ private requeueOrReconcile(targetKey: string, pending: {
1698
+ did: string; dwnUrl: string; delegateDid?: string; protocol?: string;
1699
+ entries: { cid: string }[];
1700
+ retryCount: number;
1701
+ }): void {
1702
+ const maxRetries = SyncEngineLevel.PUSH_RETRY_BACKOFF_MS.length;
1703
+ const pushRuntime = this.getOrCreatePushRuntime(targetKey, pending);
1704
+
1705
+ if (pending.retryCount >= maxRetries) {
1706
+ // Retry budget exhausted — mark link dirty for reconciliation.
1707
+ if (pushRuntime.timer) {
1708
+ clearTimeout(pushRuntime.timer);
1709
+ }
1710
+ this._pushRuntimes.delete(targetKey);
1711
+ const link = this._activeLinks.get(targetKey);
1712
+ if (link && !link.needsReconcile) {
1713
+ link.needsReconcile = true;
1714
+ void this.ledger.saveLink(link).then(() => {
1715
+ this.emitEvent({ type: 'reconcile:needed', tenantDid: pending.did, remoteEndpoint: pending.dwnUrl, protocol: pending.protocol, reason: 'push-retry-exhausted' });
1716
+ this.scheduleReconcile(targetKey);
1717
+ });
1718
+ }
1719
+ return;
1720
+ }
1723
1721
 
1724
- if (!this._pushDebounceTimer) {
1725
- this._pushDebounceTimer = setTimeout((): void => {
1726
- void this.flushPendingPushes();
1727
- }, PUSH_DEBOUNCE_MS * 4);
1728
- }
1722
+ pushRuntime.entries.push(...pending.entries);
1723
+ pushRuntime.retryCount = pending.retryCount;
1724
+ const delayMs = SyncEngineLevel.PUSH_RETRY_BACKOFF_MS[pending.retryCount] ?? 2000;
1725
+ if (pushRuntime.timer) {
1726
+ clearTimeout(pushRuntime.timer);
1727
+ }
1728
+ pushRuntime.timer = setTimeout((): void => {
1729
+ pushRuntime.timer = undefined;
1730
+ void this.flushPendingPushesForLink(targetKey);
1731
+ }, delayMs);
1732
+ }
1733
+
1734
+ private createLinkReconciler(shouldContinue?: () => boolean): SyncLinkReconciler {
1735
+ return new SyncLinkReconciler({
1736
+ getLocalRoot : async (did, delegateDid, protocol) => this.getLocalRoot(did, delegateDid, protocol),
1737
+ getRemoteRoot : async (did, dwnUrl, delegateDid, protocol) => this.getRemoteRoot(did, dwnUrl, delegateDid, protocol),
1738
+ diffWithRemote : async (target) => this.diffWithRemote(target),
1739
+ pullMessages : async (params) => this.pullMessages(params),
1740
+ pushMessages : async (params) => this.pushMessages(params),
1741
+ shouldContinue,
1742
+ });
1743
+ }
1744
+
1745
+ // ---------------------------------------------------------------------------
1746
+ // Per-link reconciliation
1747
+ // ---------------------------------------------------------------------------
1748
+
1749
+ /** Active reconcile timers, keyed by link key. */
1750
+ private _reconcileTimers: Map<string, ReturnType<typeof setTimeout>> = new Map();
1751
+
1752
+ /** Active reconcile operations, keyed by link key (dedup). */
1753
+ private _reconcileInFlight: Map<string, Promise<void>> = new Map();
1754
+
1755
+ /**
1756
+ * Schedule a per-link reconciliation after a short debounce. Coalesces
1757
+ * repeated requests for the same link.
1758
+ */
1759
+ private scheduleReconcile(linkKey: string, delayMs: number = 1500): void {
1760
+ if (this._reconcileTimers.has(linkKey)) { return; }
1761
+ if (this._reconcileInFlight.has(linkKey)) { return; }
1762
+ if (this._activeRepairs.has(linkKey)) { return; }
1763
+
1764
+ const generation = this._engineGeneration;
1765
+ const timer = setTimeout((): void => {
1766
+ this._reconcileTimers.delete(linkKey);
1767
+ if (this._engineGeneration !== generation) { return; }
1768
+ void this.reconcileLink(linkKey);
1769
+ }, delayMs);
1770
+ this._reconcileTimers.set(linkKey, timer);
1771
+ }
1772
+
1773
+ /**
1774
+ * Run SMT reconciliation for a single link. Deduplicates concurrent calls.
1775
+ * On success, clears `needsReconcile`. On failure, schedules retry.
1776
+ */
1777
+ private async reconcileLink(linkKey: string): Promise<void> {
1778
+ const existing = this._reconcileInFlight.get(linkKey);
1779
+ if (existing) { return existing; }
1780
+
1781
+ const promise = this.doReconcileLink(linkKey).finally(() => {
1782
+ this._reconcileInFlight.delete(linkKey);
1783
+ });
1784
+ this._reconcileInFlight.set(linkKey, promise);
1785
+ return promise;
1786
+ }
1787
+
1788
+ /**
1789
+ * Internal reconciliation implementation for a single link. Runs the
1790
+ * same SMT diff + pull/push that `sync()` does, but scoped to one link.
1791
+ */
1792
+ private async doReconcileLink(linkKey: string): Promise<void> {
1793
+ const link = this._activeLinks.get(linkKey);
1794
+ if (!link) { return; }
1795
+
1796
+ // Only reconcile live links — repairing/degraded links have their own
1797
+ // recovery path. Reconciling during repair would race with SMT diff.
1798
+ if (link.status !== 'live') {
1799
+ return;
1800
+ }
1801
+
1802
+ // Skip if a repair is in progress for this link.
1803
+ if (this._activeRepairs.has(linkKey)) {
1804
+ return;
1805
+ }
1806
+
1807
+ const generation = this._engineGeneration;
1808
+ const { tenantDid: did, remoteEndpoint: dwnUrl, delegateDid, protocol } = link;
1809
+
1810
+ try {
1811
+ const reconcileOutcome = await this.createLinkReconciler(
1812
+ () => this._engineGeneration === generation
1813
+ ).reconcile({ did, dwnUrl, delegateDid, protocol }, { verifyConvergence: true });
1814
+ if (reconcileOutcome.aborted) { return; }
1815
+
1816
+ if (reconcileOutcome.converged) {
1817
+ await this.ledger.clearNeedsReconcile(link);
1818
+ this.emitEvent({ type: 'reconcile:completed', tenantDid: did, remoteEndpoint: dwnUrl, protocol });
1819
+ } else {
1820
+ // Roots still differ — retry after a delay. This can happen when
1821
+ // pushMessages() had permanent failures, pullMessages() partially
1822
+ // failed, or new writes arrived during reconciliation.
1823
+ this.scheduleReconcile(linkKey, 5000);
1729
1824
  }
1730
- }));
1825
+ } catch (error: any) {
1826
+ console.error(`SyncEngineLevel: Reconciliation failed for ${did} -> ${dwnUrl}`, error);
1827
+ // Schedule a retry with a longer delay than the default debounce.
1828
+ this.scheduleReconcile(linkKey, 5000);
1829
+ }
1830
+ }
1831
+
1832
+ private getOrCreatePushRuntime(linkKey: string, params: {
1833
+ did: string;
1834
+ dwnUrl: string;
1835
+ delegateDid?: string;
1836
+ protocol?: string;
1837
+ }): PushRuntimeState {
1838
+ let pushRuntime = this._pushRuntimes.get(linkKey);
1839
+ if (!pushRuntime) {
1840
+ pushRuntime = {
1841
+ ...params,
1842
+ entries : [],
1843
+ retryCount : 0,
1844
+ };
1845
+ this._pushRuntimes.set(linkKey, pushRuntime);
1846
+ }
1847
+
1848
+ return pushRuntime;
1731
1849
  }
1732
1850
 
1733
1851
  // ---------------------------------------------------------------------------
1734
1852
  // Cursor persistence
1735
1853
  // ---------------------------------------------------------------------------
1736
1854
 
1737
- private buildCursorKey(did: string, dwnUrl: string, protocol?: string): string {
1738
- const base = `${did}${CURSOR_SEPARATOR}${dwnUrl}`;
1739
- return protocol ? `${base}${CURSOR_SEPARATOR}${protocol}` : base;
1855
+ /**
1856
+ * Build the runtime key for a replication link.
1857
+ *
1858
+ * Live-mode subscription methods (`openLivePullSubscription`,
1859
+ * `openLocalPushSubscription`) receive `linkKey` directly and never
1860
+ * call this. The remaining callers are poll-mode `sync()` and the
1861
+ * live-mode startup/error paths that already have `link.scopeId`.
1862
+ *
1863
+ * The `undefined` fallback (which produces a legacy cursor key) exists
1864
+ * only for the no-protocol full-tenant targets in poll mode.
1865
+ */
1866
+ private buildLinkKey(did: string, dwnUrl: string, scopeIdOrProtocol?: string): string {
1867
+ return scopeIdOrProtocol ? buildLinkId(did, dwnUrl, scopeIdOrProtocol) : buildLegacyCursorKey(did, dwnUrl);
1740
1868
  }
1741
1869
 
1742
1870
  /**
1743
- * Retrieves a stored progress token. Handles migration from old string cursors:
1871
+ * @deprecated Used by poll-mode sync and one-time migration only. Live mode
1872
+ * uses ReplicationLedger checkpoints. Handles migration from old string cursors:
1744
1873
  * if the stored value is a bare string (pre-ProgressToken format), it is treated
1745
1874
  * as absent — the sync engine will do a full SMT reconciliation on first startup
1746
1875
  * after upgrade, which is correct and safe.
@@ -1759,8 +1888,11 @@ export class SyncEngineLevel implements SyncEngine {
1759
1888
  return parsed as ProgressToken;
1760
1889
  }
1761
1890
  } catch {
1762
- // Not valid JSON (old string cursor) — treat as absent.
1891
+ // Not valid JSON (old string cursor) — fall through to delete.
1763
1892
  }
1893
+ // Entry exists but is unparseable or has invalid/empty fields. Delete it
1894
+ // so it is not re-checked on every subsequent startup.
1895
+ await this.deleteLegacyCursor(key);
1764
1896
  return undefined;
1765
1897
  } catch (error) {
1766
1898
  const e = error as { code: string };
@@ -1771,9 +1903,20 @@ export class SyncEngineLevel implements SyncEngine {
1771
1903
  }
1772
1904
  }
1773
1905
 
1774
- private async setCursor(key: string, cursor: ProgressToken): Promise<void> {
1906
+
1907
+ /**
1908
+ * Delete a legacy cursor from the old syncCursors sublevel.
1909
+ * Called as part of one-time migration to ReplicationLedger.
1910
+ */
1911
+ private async deleteLegacyCursor(key: string): Promise<void> {
1775
1912
  const cursors = this._db.sublevel('syncCursors');
1776
- await cursors.put(key, JSON.stringify(cursor));
1913
+ try {
1914
+ await cursors.del(key);
1915
+ } catch {
1916
+ // Best-effort — ignore LEVEL_NOT_FOUND and transient I/O errors alike.
1917
+ // A failed delete leaves the bad entry for one more re-check on the
1918
+ // next startup, which is harmless.
1919
+ }
1777
1920
  }
1778
1921
 
1779
1922
  // ---------------------------------------------------------------------------
@@ -1791,8 +1934,11 @@ export class SyncEngineLevel implements SyncEngine {
1791
1934
  }
1792
1935
 
1793
1936
  // Check for inline base64url-encoded data (small records from EventLog).
1937
+ // Delete the transport-level field so the DWN schema validator does not
1938
+ // reject the message for having unevaluated properties.
1794
1939
  const encodedData = (event.message as any).encodedData as string | undefined;
1795
1940
  if (encodedData) {
1941
+ delete (event.message as any).encodedData;
1796
1942
  const bytes = Encoder.base64UrlToBytes(encodedData);
1797
1943
  return new ReadableStream<Uint8Array>({
1798
1944
  start(controller): void {