@enbox/agent 0.5.10 → 0.5.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/dist/browser.mjs +9 -9
  2. package/dist/browser.mjs.map +4 -4
  3. package/dist/esm/dwn-api.js.map +1 -1
  4. package/dist/esm/dwn-record-upgrade.js +1 -1
  5. package/dist/esm/dwn-record-upgrade.js.map +1 -1
  6. package/dist/esm/index.js +4 -0
  7. package/dist/esm/index.js.map +1 -1
  8. package/dist/esm/sync-closure-resolver.js +855 -0
  9. package/dist/esm/sync-closure-resolver.js.map +1 -0
  10. package/dist/esm/sync-closure-types.js +189 -0
  11. package/dist/esm/sync-closure-types.js.map +1 -0
  12. package/dist/esm/sync-engine-level.js +956 -37
  13. package/dist/esm/sync-engine-level.js.map +1 -1
  14. package/dist/esm/sync-messages.js +42 -5
  15. package/dist/esm/sync-messages.js.map +1 -1
  16. package/dist/esm/sync-replication-ledger.js +220 -0
  17. package/dist/esm/sync-replication-ledger.js.map +1 -0
  18. package/dist/esm/types/sync.js +54 -1
  19. package/dist/esm/types/sync.js.map +1 -1
  20. package/dist/types/dwn-api.d.ts.map +1 -1
  21. package/dist/types/index.d.ts +5 -0
  22. package/dist/types/index.d.ts.map +1 -1
  23. package/dist/types/sync-closure-resolver.d.ts +19 -0
  24. package/dist/types/sync-closure-resolver.d.ts.map +1 -0
  25. package/dist/types/sync-closure-types.d.ts +122 -0
  26. package/dist/types/sync-closure-types.d.ts.map +1 -0
  27. package/dist/types/sync-engine-level.d.ts +137 -2
  28. package/dist/types/sync-engine-level.d.ts.map +1 -1
  29. package/dist/types/sync-messages.d.ts +15 -1
  30. package/dist/types/sync-messages.d.ts.map +1 -1
  31. package/dist/types/sync-replication-ledger.d.ts +72 -0
  32. package/dist/types/sync-replication-ledger.d.ts.map +1 -0
  33. package/dist/types/types/sync.d.ts +190 -0
  34. package/dist/types/types/sync.d.ts.map +1 -1
  35. package/package.json +3 -3
  36. package/src/dwn-api.ts +2 -1
  37. package/src/dwn-record-upgrade.ts +1 -1
  38. package/src/index.ts +5 -0
  39. package/src/sync-closure-resolver.ts +919 -0
  40. package/src/sync-closure-types.ts +270 -0
  41. package/src/sync-engine-level.ts +1041 -45
  42. package/src/sync-messages.ts +44 -6
  43. package/src/sync-replication-ledger.ts +197 -0
  44. package/src/types/sync.ts +204 -0
@@ -1,16 +1,22 @@
1
1
  import type { AbstractLevel } from 'abstract-level';
2
2
 
3
3
  import type { DwnSubscriptionHandler, ResubscribeFactory } from '@enbox/dwn-clients';
4
- import type { GenericMessage, MessageEvent, MessagesSubscribeReply, MessagesSyncDiffEntry, MessagesSyncReply, StateIndex, SubscriptionMessage } from '@enbox/dwn-sdk-js';
4
+ import type { GenericMessage, MessageEvent, MessagesSubscribeReply, MessagesSyncDiffEntry, MessagesSyncReply, ProgressToken, StateIndex, SubscriptionMessage } from '@enbox/dwn-sdk-js';
5
5
 
6
6
  import ms from 'ms';
7
7
 
8
8
  import { Level } from 'level';
9
9
  import { Encoder, hashToHex, initDefaultHashes, Message } from '@enbox/dwn-sdk-js';
10
10
 
11
+ import type { ClosureEvaluationContext } from './sync-closure-types.js';
11
12
  import type { PermissionsApi } from './types/permissions.js';
12
13
  import type { EnboxAgent, EnboxPlatformAgent } from './types/agent.js';
13
- import type { StartSyncParams, SyncConnectivityState, SyncEngine, SyncIdentityOptions, SyncMode } from './types/sync.js';
14
+ import type { PushResult, ReplicationLinkState, StartSyncParams, SyncConnectivityState, SyncEngine, SyncEvent, SyncEventListener, SyncIdentityOptions, SyncMode, SyncScope } from './types/sync.js';
15
+
16
+ import { evaluateClosure } from './sync-closure-resolver.js';
17
+ import { MAX_PENDING_TOKENS } from './types/sync.js';
18
+ import { ReplicationLedger } from './sync-replication-ledger.js';
19
+ import { createClosureContext, invalidateClosureCache } from './sync-closure-types.js';
14
20
 
15
21
  import { AgentPermissionsApi } from './permissions-api.js';
16
22
  import { DwnInterface } from './types/dwn.js';
@@ -70,6 +76,80 @@ type LocalSubscription = {
70
76
  close: () => Promise<void>;
71
77
  };
72
78
 
79
+ // ---------------------------------------------------------------------------
80
+ // Per-link in-memory delivery-order tracking (not persisted to ledger)
81
+ // ---------------------------------------------------------------------------
82
+
83
/**
 * Bookkeeping record for one pull delivery that has been handed off for
 * processing but whose completion has not yet been folded into the link's
 * checkpoint. Completions may arrive out of delivery order, so each
 * delivery is remembered under its ordinal until it can be drained.
 */
type InFlightCommit = {
  /** Position of this delivery in the link's monotonic delivery sequence. */
  ordinal: number;

  /** Progress token that arrived with this delivery. */
  token: ProgressToken;

  /** Set once processing of the delivered message has finished successfully. */
  committed: boolean;
};
97
+
98
/**
 * Returns true when `value` is a non-empty string that equals one of
 * `prefixes` or sits beneath one of them on a `/` segment boundary
 * (`post` matches `post` and `post/comment`, but not `poster`).
 */
function matchesSegmentPrefix(value: unknown, prefixes: readonly string[]): boolean {
  if (typeof value !== 'string' || value === '') { return false; }
  const text: string = value;
  return prefixes.some(prefix => text === prefix || text.startsWith(`${prefix}/`));
}

/**
 * Checks whether a message falls inside a link's sync scope.
 *
 * - A `kind: 'full'` scope matches every message.
 * - A scope with no (or empty) prefix lists matches every message.
 * - When `protocolPathPrefixes` and/or `contextIdPrefixes` are non-empty,
 *   the message must match at least one prefix in EACH specified list.
 *   A message missing the corresponding field is out of scope.
 *
 * `protocolPath` is read from `message.descriptor`; `contextId` is read from
 * the top level of the message.
 *
 * @param message - The message to test.
 * @param scope - The link scope to test against.
 * @returns `true` if the message is in scope, otherwise `false`.
 */
function isEventInScope(message: GenericMessage, scope: SyncScope): boolean {
  if (scope.kind === 'full') { return true; }

  const pathPrefixes = scope.protocolPathPrefixes;
  if (pathPrefixes && pathPrefixes.length > 0) {
    const descriptor = message.descriptor as Record<string, unknown>;
    if (!matchesSegmentPrefix(descriptor.protocolPath, pathPrefixes)) { return false; }
  }

  const contextPrefixes = scope.contextIdPrefixes;
  if (contextPrefixes && contextPrefixes.length > 0) {
    // contextId is not part of the GenericMessage shape, so widen locally
    // instead of dropping to `any`.
    const { contextId } = message as GenericMessage & { contextId?: unknown };
    if (!matchesSegmentPrefix(contextId, contextPrefixes)) { return false; }
  }

  return true;
}
139
+
140
/**
 * Memory-only ordering state kept for a single link. Nothing here is
 * written to the ledger; after a crash, replay resumes from
 * `contiguousAppliedToken` and relies on applies being idempotent.
 */
type LinkRuntimeState = {
  /** Ordinal that will be handed to the next delivered pull event. */
  nextDeliveryOrdinal: number;

  /** Ordinal the drain step will inspect next for a committed entry. */
  nextCommitOrdinal: number;

  /** Deliveries still awaiting drain, indexed by their ordinal. */
  inflight: Map<number, InFlightCommit>;
};
152
+
73
153
  export class SyncEngineLevel implements SyncEngine {
74
154
  /**
75
155
  * Holds the instance of a `EnboxPlatformAgent` that represents the current execution context for
@@ -88,6 +168,29 @@ export class SyncEngineLevel implements SyncEngine {
88
168
  private _syncIntervalId?: ReturnType<typeof setInterval>;
89
169
  private _syncLock = false;
90
170
 
171
+ /**
172
+ * Durable replication ledger — persists per-link checkpoint state.
173
+ * Used by live sync to track pull/push progression independently per link.
174
+ * Poll-mode sync still uses the legacy `getCursor`/`setCursor` path.
175
+ * Lazily initialized on first use to avoid sublevel() calls on mock dbs.
176
+ */
177
+ private _ledger?: ReplicationLedger;
178
+
179
+ /**
180
+ * In-memory cache of active links, keyed by `{did}^{dwnUrl}^{protocol}`.
181
+ * Populated from the ledger on `startLiveSync`, used by subscription handlers
182
+ * to avoid async ledger lookups on every event.
183
+ */
184
+ private _activeLinks: Map<string, ReplicationLinkState> = new Map();
185
+
186
+ /**
187
+ * Per-link in-memory delivery-order tracking for the pull path. Keyed by
188
+ * the same link key as `_activeLinks`. Not persisted — on crash, replay
189
+ * restarts from `contiguousAppliedToken` and idempotent apply handles
190
+ * re-delivered events.
191
+ */
192
+ private _linkRuntimes: Map<string, LinkRuntimeState> = new Map();
193
+
91
194
  /**
92
195
  * Hex-encoded default hashes for empty subtrees at each depth, keyed by depth.
93
196
  * Lazily initialized on first use. Used by `walkTreeDiff` to detect empty subtrees
@@ -102,6 +205,14 @@ export class SyncEngineLevel implements SyncEngine {
102
205
  /** Current sync mode, set by `startSync`. */
103
206
  private _syncMode: SyncMode = 'poll';
104
207
 
208
+ /**
209
+ * Monotonic session generation counter. Incremented on every teardown.
210
+ * Async operations (repair, retry timers) capture the generation at start
211
+ * and bail if it has changed — this prevents stale work from mutating
212
+ * state after teardown or mode switch.
213
+ */
214
+ private _syncGeneration = 0;
215
+
105
216
  /** Active live pull subscriptions (remote -> local via MessagesSubscribe). */
106
217
  private _liveSubscriptions: LiveSubscription[] = [];
107
218
 
@@ -114,8 +225,35 @@ export class SyncEngineLevel implements SyncEngine {
114
225
  /** Debounce timer for batched push-on-write. */
115
226
  private _pushDebounceTimer?: ReturnType<typeof setTimeout>;
116
227
 
117
- /** Pending message CIDs to push, accumulated during the debounce window. */
118
- private _pendingPushCids: Map<string, { did: string; dwnUrl: string; delegateDid?: string; protocol?: string; cids: string[] }> = new Map();
228
+ /** Registered event listeners for observability. */
229
+ private _eventListeners: Set<SyncEventListener> = new Set();
230
+
231
+ /** Entry in the pending push queue — a message CID with its local EventLog token. */
232
+ private _pendingPushCids: Map<string, {
233
+ did: string; dwnUrl: string; delegateDid?: string; protocol?: string;
234
+ entries: { cid: string; localToken?: ProgressToken }[];
235
+ }> = new Map();
236
+
237
+ /**
238
+ * CIDs recently received via pull subscription, keyed by `cid|dwnUrl` to
239
+ * scope suppression per remote endpoint. A message pulled from Provider A
240
+ * is only suppressed for push back to Provider A — it still fans out to
241
+ * Provider B and C. TTL: 60 seconds. Cap: 10,000 entries.
242
+ */
243
+ private _recentlyPulledCids: Map<string, number> = new Map();
244
+
245
+ /** TTL for echo-loop suppression entries (60 seconds). */
246
+ private static readonly ECHO_SUPPRESS_TTL_MS = 60_000;
247
+
248
+ /**
249
+ * Per-tenant closure evaluation contexts for the current live sync session.
250
+ * Caches ProtocolsConfigure and grant lookups across events for the same
251
+ * tenant. Keyed by tenantDid to prevent cross-tenant cache pollution.
252
+ */
253
+ private _closureContexts: Map<string, ClosureEvaluationContext> = new Map();
254
+
255
+ /** Maximum entries in the echo-loop suppression cache. */
256
+ private static readonly ECHO_SUPPRESS_MAX_ENTRIES = 10_000;
119
257
 
120
258
  /** Count of consecutive SMT sync failures (for backoff in poll mode). */
121
259
  private _consecutiveFailures = 0;
@@ -132,6 +270,14 @@ export class SyncEngineLevel implements SyncEngine {
132
270
  this._db = (db) ? db : new Level<string, string>(dataPath ?? 'DATA/AGENT/SYNC_STORE');
133
271
  }
134
272
 
273
+ /** Lazy accessor for the replication ledger. */
274
+ private get ledger(): ReplicationLedger {
275
+ if (!this._ledger) {
276
+ this._ledger = new ReplicationLedger(this._db);
277
+ }
278
+ return this._ledger;
279
+ }
280
+
135
281
  /**
136
282
  * Retrieves the `EnboxPlatformAgent` execution context.
137
283
  *
@@ -152,7 +298,39 @@ export class SyncEngineLevel implements SyncEngine {
152
298
  }
153
299
 
154
300
  get connectivityState(): SyncConnectivityState {
155
- return this._connectivityState;
301
+ // Aggregate per-link connectivity: if any link is online, report online.
302
+ // If all are offline, report offline. If all unknown, report unknown.
303
+ // Falls back to the global _connectivityState for poll-mode (no active links).
304
+ if (this._activeLinks.size === 0) {
305
+ return this._connectivityState;
306
+ }
307
+
308
+ let hasOnline = false;
309
+ let hasOffline = false;
310
+ for (const link of this._activeLinks.values()) {
311
+ if (link.connectivity === 'online') { hasOnline = true; }
312
+ if (link.connectivity === 'offline') { hasOffline = true; }
313
+ }
314
+
315
+ if (hasOnline) { return 'online'; }
316
+ if (hasOffline) { return 'offline'; }
317
+ return 'unknown';
318
+ }
319
+
320
+ public on(listener: SyncEventListener): () => void {
321
+ this._eventListeners.add(listener);
322
+ return (): void => { this._eventListeners.delete(listener); };
323
+ }
324
+
325
+ /** Emit a sync event to all registered listeners. */
326
+ private emitEvent(event: SyncEvent): void {
327
+ for (const listener of this._eventListeners) {
328
+ try {
329
+ listener(event);
330
+ } catch {
331
+ // Don't let listener errors propagate into sync engine logic.
332
+ }
333
+ }
156
334
  }
157
335
 
158
336
  public async clear(): Promise<void> {
@@ -440,14 +618,73 @@ export class SyncEngineLevel implements SyncEngine {
440
618
  console.error('SyncEngineLevel: Error during initial live-sync catch-up', error);
441
619
  }
442
620
 
443
- // Step 2: Open live subscriptions for each sync target.
621
+ // Step 2: Initialize replication links and open live subscriptions.
444
622
  const syncTargets = await this.getSyncTargets();
445
623
  for (const target of syncTargets) {
624
+ let link: ReplicationLinkState | undefined;
446
625
  try {
626
+ // Get or create the link in the durable ledger.
627
+ // Use protocol-scoped scope when a protocol is specified, otherwise full-tenant.
628
+ const linkScope: SyncScope = target.protocol
629
+ ? { kind: 'protocol', protocol: target.protocol }
630
+ : { kind: 'full' };
631
+ link = await this.ledger.getOrCreateLink({
632
+ tenantDid : target.did,
633
+ remoteEndpoint : target.dwnUrl,
634
+ scope : linkScope,
635
+ delegateDid : target.delegateDid,
636
+ protocol : target.protocol,
637
+ });
638
+
639
+ // Cache the link for fast access by subscription handlers.
640
+ const linkKey = this.buildCursorKey(target.did, target.dwnUrl, target.protocol);
641
+ this._activeLinks.set(linkKey, link);
642
+
643
+ // Open subscriptions — only transition to live if both succeed.
644
+ // If pull succeeds but push fails, close the pull subscription to
645
+ // avoid a resource leak with inconsistent state.
447
646
  await this.openLivePullSubscription(target);
448
- await this.openLocalPushSubscription(target);
647
+ try {
648
+ await this.openLocalPushSubscription(target);
649
+ } catch (pushError) {
650
+ // Close the already-opened pull subscription.
651
+ const pullSub = this._liveSubscriptions.find(
652
+ s => s.did === target.did && s.dwnUrl === target.dwnUrl && s.protocol === target.protocol
653
+ );
654
+ if (pullSub) {
655
+ try { await pullSub.close(); } catch { /* best effort */ }
656
+ this._liveSubscriptions = this._liveSubscriptions.filter(s => s !== pullSub);
657
+ }
658
+ throw pushError;
659
+ }
660
+
661
+ this.emitEvent({ type: 'link:status-change', tenantDid: target.did, remoteEndpoint: target.dwnUrl, protocol: target.protocol, from: 'initializing', to: 'live' });
662
+ await this.ledger.setStatus(link!, 'live');
449
663
  } catch (error: any) {
664
+ const linkKey = this.buildCursorKey(target.did, target.dwnUrl, target.protocol);
665
+
666
+ // Detect ProgressGap (410) — the cursor is stale, link needs SMT repair.
667
+ if ((error as any).isProgressGap && link) {
668
+ console.warn(`SyncEngineLevel: ProgressGap detected for ${target.did} -> ${target.dwnUrl}, initiating repair`);
669
+ this.emitEvent({ type: 'gap:detected', tenantDid: target.did, remoteEndpoint: target.dwnUrl, protocol: target.protocol, reason: 'ProgressGap' });
670
+ const gapInfo = (error as any).gapInfo;
671
+ await this.transitionToRepairing(linkKey, link, {
672
+ resumeToken: gapInfo?.latestAvailable,
673
+ });
674
+ continue;
675
+ }
676
+
450
677
  console.error(`SyncEngineLevel: Failed to open live subscription for ${target.did} -> ${target.dwnUrl}`, error);
678
+
679
+ // Clean up in-memory state for the failed link so it doesn't appear
680
+ // active to later code. The durable link remains at 'initializing'.
681
+ this._activeLinks.delete(linkKey);
682
+ this._linkRuntimes.delete(linkKey);
683
+
684
+ // Recompute connectivity — if no live subscriptions remain, reset to unknown.
685
+ if (this._liveSubscriptions.length === 0) {
686
+ this._connectivityState = 'unknown';
687
+ }
451
688
  }
452
689
  }
453
690
 
@@ -467,10 +704,409 @@ export class SyncEngineLevel implements SyncEngine {
467
704
  this._syncIntervalId = setInterval(integrityCheck, intervalMilliseconds);
468
705
  }
469
706
 
707
+ /**
708
+ * Get or create the runtime state for a link.
709
+ */
710
+ private getOrCreateRuntime(linkKey: string): LinkRuntimeState {
711
+ let rt = this._linkRuntimes.get(linkKey);
712
+ if (!rt) {
713
+ rt = { nextDeliveryOrdinal: 0, nextCommitOrdinal: 0, inflight: new Map() };
714
+ this._linkRuntimes.set(linkKey, rt);
715
+ }
716
+ return rt;
717
+ }
718
+
719
+ /**
720
+ * Drain contiguously committed ordinals from the runtime state, advancing
721
+ * the link's pull checkpoint for each drained entry. Returns the number of
722
+ * entries drained (0 if the next ordinal is not yet committed).
723
+ */
724
+ private drainCommittedPull(linkKey: string): number {
725
+ const rt = this._linkRuntimes.get(linkKey);
726
+ const link = this._activeLinks.get(linkKey);
727
+ if (!rt || !link) { return 0; }
728
+
729
+ let drained = 0;
730
+ while (true) {
731
+ const entry = rt.inflight.get(rt.nextCommitOrdinal);
732
+ if (!entry || !entry.committed) { break; }
733
+
734
+ // This ordinal is committed — advance the durable checkpoint.
735
+ ReplicationLedger.commitContiguousToken(link.pull, entry.token);
736
+ ReplicationLedger.setReceivedToken(link.pull, entry.token);
737
+ rt.inflight.delete(rt.nextCommitOrdinal);
738
+ rt.nextCommitOrdinal++;
739
+ drained++;
740
+ // Note: checkpoint:pull-advance event is emitted AFTER saveLink succeeds
741
+ // in the caller, not here. "Advanced" means durably persisted.
742
+ }
743
+
744
+ return drained;
745
+ }
746
+
747
+ // ---------------------------------------------------------------------------
748
+ // Per-link repair and degraded-poll orchestration (Phase 2)
749
+ // ---------------------------------------------------------------------------
750
+
751
+ /** Maximum consecutive repair attempts before falling back to degraded_poll. */
752
+ private static readonly MAX_REPAIR_ATTEMPTS = 3;
753
+
754
+ /** Per-link degraded-poll interval timers. */
755
+ private _degradedPollTimers: Map<string, ReturnType<typeof setInterval>> = new Map();
756
+
757
+ /** Per-link repair attempt counters. */
758
+ private _repairAttempts: Map<string, number> = new Map();
759
+
760
+ /** Per-link active repair promises — prevents concurrent repair for the same link. */
761
+ private _activeRepairs: Map<string, Promise<void>> = new Map();
762
+
763
+ /** Per-link retry timers for failed repairs below max attempts. */
764
+ private _repairRetryTimers: Map<string, ReturnType<typeof setTimeout>> = new Map();
765
+
766
+ /** Backoff schedule for repair retries (milliseconds). */
767
+ private static readonly REPAIR_BACKOFF_MS = [1_000, 3_000, 10_000];
768
+
769
+ /**
770
+ * Per-link repair context — stores ProgressGap metadata for use during
771
+ * repair. The `resumeToken` (from `gapInfo.latestAvailable`) is used as
772
+ * the post-repair checkpoint so the reopened subscription replays from
773
+ * a valid boundary instead of starting live-only.
774
+ */
775
+ private _repairContext: Map<string, { resumeToken?: ProgressToken }> = new Map();
776
+
777
+ /**
778
+ * Central helper for transitioning a link to `repairing`. Encapsulates:
779
+ * - status change
780
+ * - optional gap context storage
781
+ * - repair kick-off with retry scheduling on failure
782
+ *
783
+ * All code paths that set `repairing` should go through this helper to
784
+ * guarantee a future retry path.
785
+ */
786
+ private async transitionToRepairing(
787
+ linkKey: string,
788
+ link: ReplicationLinkState,
789
+ options?: { resumeToken?: ProgressToken },
790
+ ): Promise<void> {
791
+ const prevStatus = link.status;
792
+ const prevConnectivity = link.connectivity;
793
+ link.connectivity = 'offline';
794
+ await this.ledger.setStatus(link, 'repairing');
795
+
796
+ this.emitEvent({ type: 'link:status-change', tenantDid: link.tenantDid, remoteEndpoint: link.remoteEndpoint, protocol: link.protocol, from: prevStatus, to: 'repairing' });
797
+ if (prevConnectivity !== 'offline') {
798
+ this.emitEvent({ type: 'link:connectivity-change', tenantDid: link.tenantDid, remoteEndpoint: link.remoteEndpoint, protocol: link.protocol, from: prevConnectivity, to: 'offline' });
799
+ }
800
+
801
+ if (options?.resumeToken) {
802
+ this._repairContext.set(linkKey, { resumeToken: options.resumeToken });
803
+ }
804
+
805
+ // Clear runtime ordinals immediately — stale state must not linger
806
+ // across repair attempts.
807
+ const rt = this._linkRuntimes.get(linkKey);
808
+ if (rt) {
809
+ rt.inflight.clear();
810
+ rt.nextCommitOrdinal = rt.nextDeliveryOrdinal;
811
+ }
812
+
813
+ // Kick off repair with retry scheduling on failure.
814
+ void this.repairLink(linkKey).catch(() => {
815
+ this.scheduleRepairRetry(linkKey);
816
+ });
817
+ }
818
+
819
+ /**
820
+ * Schedule a retry for a failed repair. Uses exponential backoff.
821
+ * No-op if the link is already in `degraded_poll` (timer loop owns retries)
822
+ * or if a retry is already scheduled.
823
+ */
824
+ private scheduleRepairRetry(linkKey: string): void {
825
+ // Don't schedule if already in degraded_poll or retry pending.
826
+ const link = this._activeLinks.get(linkKey);
827
+ if (!link || link.status === 'degraded_poll') { return; }
828
+ if (this._repairRetryTimers.has(linkKey)) { return; }
829
+
830
+ // attempts is already post-increment from doRepairLink, so subtract 1
831
+ // for the backoff index: first failure (attempts=1) → backoff[0]=1s.
832
+ const attempts = this._repairAttempts.get(linkKey) ?? 1;
833
+ const backoff = SyncEngineLevel.REPAIR_BACKOFF_MS;
834
+ const delayMs = backoff[Math.min(attempts - 1, backoff.length - 1)];
835
+
836
+ const timerGeneration = this._syncGeneration;
837
+ const timer = setTimeout(async (): Promise<void> => {
838
+ this._repairRetryTimers.delete(linkKey);
839
+
840
+ // Bail if teardown occurred since this timer was scheduled.
841
+ if (this._syncGeneration !== timerGeneration) { return; }
842
+
843
+ // Verify link still exists and is still repairing.
844
+ const currentLink = this._activeLinks.get(linkKey);
845
+ if (!currentLink || currentLink.status !== 'repairing') { return; }
846
+
847
+ try {
848
+ await this.repairLink(linkKey);
849
+ } catch {
850
+ // repairLink handles max attempts → degraded_poll internally.
851
+ // If still below max, schedule another retry.
852
+ if (currentLink.status === 'repairing') {
853
+ this.scheduleRepairRetry(linkKey);
854
+ }
855
+ }
856
+ }, delayMs);
857
+
858
+ this._repairRetryTimers.set(linkKey, timer);
859
+ }
860
+
861
+ /**
862
+ * Repair a single link. Deduplicates concurrent calls via `_activeRepairs`.
863
+ * If repair is already running for this link, returns the existing promise.
864
+ */
865
+ private repairLink(linkKey: string): Promise<void> {
866
+ const existing = this._activeRepairs.get(linkKey);
867
+ if (existing) { return existing; }
868
+
869
+ const promise = this.doRepairLink(linkKey).finally(() => {
870
+ this._activeRepairs.delete(linkKey);
871
+ });
872
+ this._activeRepairs.set(linkKey, promise);
873
+ return promise;
874
+ }
875
+
876
+ /**
877
+ * Internal repair implementation. Runs SMT set reconciliation for a single
878
+ * link, then attempts to re-establish live subscriptions. If repair succeeds,
879
+ * transitions to `live`. If it fails, throws so callers (degraded_poll timer,
880
+ * startup) can handle retry scheduling.
881
+ */
882
+ private async doRepairLink(linkKey: string): Promise<void> {
883
+ const link = this._activeLinks.get(linkKey);
884
+ if (!link) { return; }
885
+
886
+ // Capture the sync generation at repair start. If teardown occurs during
887
+ // any await, the generation will have incremented and we bail before
888
+ // mutating state — preventing the race where repair continues after teardown.
889
+ const generation = this._syncGeneration;
890
+
891
+ const { tenantDid: did, remoteEndpoint: dwnUrl, delegateDid, protocol } = link;
892
+
893
+ this.emitEvent({ type: 'repair:started', tenantDid: did, remoteEndpoint: dwnUrl, protocol, attempt: (this._repairAttempts.get(linkKey) ?? 0) + 1 });
894
+ const attempts = (this._repairAttempts.get(linkKey) ?? 0) + 1;
895
+ this._repairAttempts.set(linkKey, attempts);
896
+
897
+ // Step 1: Close existing subscriptions FIRST to stop old events from
898
+ // mutating local state while repair runs.
899
+ await this.closeLinkSubscriptions(link);
900
+ if (this._syncGeneration !== generation) { return; } // Teardown occurred.
901
+
902
+ // Step 2: Clear runtime ordinals immediately — stale state must not
903
+ // persist across repair attempts (successful or failed).
904
+ const rt = this.getOrCreateRuntime(linkKey);
905
+ rt.inflight.clear();
906
+ rt.nextDeliveryOrdinal = 0;
907
+ rt.nextCommitOrdinal = 0;
908
+
909
+ try {
910
+ // Step 3: Run SMT reconciliation for this link.
911
+ const localRoot = await this.getLocalRoot(did, delegateDid, protocol);
912
+ if (this._syncGeneration !== generation) { return; }
913
+ const remoteRoot = await this.getRemoteRoot(did, dwnUrl, delegateDid, protocol);
914
+ if (this._syncGeneration !== generation) { return; }
915
+
916
+ if (localRoot !== remoteRoot) {
917
+ const diff = await this.diffWithRemote({ did, dwnUrl, delegateDid, protocol });
918
+ if (this._syncGeneration !== generation) { return; }
919
+
920
+ if (diff.onlyRemote.length > 0) {
921
+ const prefetched: (MessagesSyncDiffEntry & { message: GenericMessage })[] = [];
922
+ const needsFetchCids: string[] = [];
923
+ for (const entry of diff.onlyRemote) {
924
+ if (!entry.message || (entry.message.descriptor.interface === 'Records' &&
925
+ entry.message.descriptor.method === 'Write' &&
926
+ (entry.message.descriptor as any).dataCid && !entry.encodedData)) {
927
+ needsFetchCids.push(entry.messageCid);
928
+ } else {
929
+ prefetched.push(entry as MessagesSyncDiffEntry & { message: GenericMessage });
930
+ }
931
+ }
932
+ await this.pullMessages({ did, dwnUrl, delegateDid, protocol, messageCids: needsFetchCids, prefetched });
933
+ if (this._syncGeneration !== generation) { return; }
934
+ }
935
+
936
+ if (diff.onlyLocal.length > 0) {
937
+ await this.pushMessages({ did, dwnUrl, delegateDid, protocol, messageCids: diff.onlyLocal });
938
+ if (this._syncGeneration !== generation) { return; }
939
+ }
940
+ }
941
+
942
+ // Step 4: Determine the post-repair resume token.
943
+ // - If repair was triggered by ProgressGap, use the stored resumeToken
944
+ // (from gapInfo.latestAvailable) so the reopened subscription replays
945
+ // from a valid boundary, closing the race window between SMT and resubscribe.
946
+ // - Otherwise, use the existing contiguousAppliedToken if still valid.
947
+ // - Push checkpoint is NOT reset during repair: push frontier tracks what
948
+ // the local EventLog has delivered to the remote. SMT repair handles
949
+ // pull-side convergence; push-side convergence is handled by the diff's
950
+ // onlyLocal push. The push checkpoint remains the local authority.
951
+ const repairCtx = this._repairContext.get(linkKey);
952
+ const resumeToken = repairCtx?.resumeToken ?? link.pull.contiguousAppliedToken;
953
+ ReplicationLedger.resetCheckpoint(link.pull, resumeToken);
954
+ await this.ledger.saveLink(link);
955
+ if (this._syncGeneration !== generation) { return; }
956
+
957
+ // Step 5: Reopen subscriptions with the repaired checkpoints.
958
+ const target = { did, dwnUrl, delegateDid, protocol };
959
+ await this.openLivePullSubscription(target);
960
+ if (this._syncGeneration !== generation) { return; }
961
+ try {
962
+ await this.openLocalPushSubscription({
963
+ ...target,
964
+ pushCursor: link.push.contiguousAppliedToken,
965
+ });
966
+ } catch (pushError) {
967
+ const pullSub = this._liveSubscriptions.find(
968
+ s => s.did === did && s.dwnUrl === dwnUrl && s.protocol === protocol
969
+ );
970
+ if (pullSub) {
971
+ try { await pullSub.close(); } catch { /* best effort */ }
972
+ this._liveSubscriptions = this._liveSubscriptions.filter(s => s !== pullSub);
973
+ }
974
+ throw pushError;
975
+ }
976
+ if (this._syncGeneration !== generation) { return; }
977
+
978
+ // Step 6: Clean up repair context and transition to live.
979
+ this._repairContext.delete(linkKey);
980
+ this._repairAttempts.delete(linkKey);
981
+ const retryTimer = this._repairRetryTimers.get(linkKey);
982
+ if (retryTimer) { clearTimeout(retryTimer); this._repairRetryTimers.delete(linkKey); }
983
+ const prevRepairConnectivity = link.connectivity;
984
+ link.connectivity = 'online';
985
+ await this.ledger.setStatus(link, 'live');
986
+ this.emitEvent({ type: 'repair:completed', tenantDid: did, remoteEndpoint: dwnUrl, protocol });
987
+ if (prevRepairConnectivity !== 'online') {
988
+ this.emitEvent({ type: 'link:connectivity-change', tenantDid: did, remoteEndpoint: dwnUrl, protocol, from: prevRepairConnectivity, to: 'online' });
989
+ }
990
+ this.emitEvent({ type: 'link:status-change', tenantDid: did, remoteEndpoint: dwnUrl, protocol, from: 'repairing', to: 'live' });
991
+
992
+ } catch (error: any) {
993
+ // If teardown occurred during repair, don't retry or enter degraded_poll.
994
+ if (this._syncGeneration !== generation) { return; }
995
+
996
+ console.error(`SyncEngineLevel: Repair failed for ${did} -> ${dwnUrl} (attempt ${attempts})`, error);
997
+ this.emitEvent({ type: 'repair:failed', tenantDid: did, remoteEndpoint: dwnUrl, protocol, attempt: attempts, error: String(error.message ?? error) });
998
+
999
+ if (attempts >= SyncEngineLevel.MAX_REPAIR_ATTEMPTS) {
1000
+ console.warn(`SyncEngineLevel: Max repair attempts reached for ${did} -> ${dwnUrl}, entering degraded_poll`);
1001
+ await this.enterDegradedPoll(linkKey);
1002
+ return;
1003
+ }
1004
+
1005
+ // Re-throw so callers (degraded_poll timer) can handle retry scheduling.
1006
+ throw error;
1007
+ }
1008
+ }
1009
+
1010
+ /**
1011
+ * Close pull and push subscriptions for a specific link.
1012
+ */
1013
+ private async closeLinkSubscriptions(link: ReplicationLinkState): Promise<void> {
1014
+ const { tenantDid: did, remoteEndpoint: dwnUrl, protocol } = link;
1015
+
1016
+ // Close pull subscription.
1017
+ const pullSub = this._liveSubscriptions.find(
1018
+ s => s.did === did && s.dwnUrl === dwnUrl && s.protocol === protocol
1019
+ );
1020
+ if (pullSub) {
1021
+ try { await pullSub.close(); } catch { /* best effort */ }
1022
+ this._liveSubscriptions = this._liveSubscriptions.filter(s => s !== pullSub);
1023
+ }
1024
+
1025
+ // Close local push subscription.
1026
+ const pushSub = this._localSubscriptions.find(
1027
+ s => s.did === did && s.dwnUrl === dwnUrl && s.protocol === protocol
1028
+ );
1029
+ if (pushSub) {
1030
+ try { await pushSub.close(); } catch { /* best effort */ }
1031
+ this._localSubscriptions = this._localSubscriptions.filter(s => s !== pushSub);
1032
+ }
1033
+ }
1034
+
1035
+ /**
1036
+ * Transition a link to `degraded_poll` and start a per-link polling timer.
1037
+ * The timer runs SMT reconciliation at a reduced frequency (30s with jitter)
1038
+ * and attempts to re-establish live subscriptions after each successful repair.
1039
+ */
1040
+ private async enterDegradedPoll(linkKey: string): Promise<void> {
1041
+ const link = this._activeLinks.get(linkKey);
1042
+ if (!link) { return; }
1043
+ link.connectivity = 'offline';
1044
+
1045
+ const prevDegradedStatus = link.status;
1046
+ await this.ledger.setStatus(link, 'degraded_poll');
1047
+ this._repairAttempts.delete(linkKey);
1048
+ this.emitEvent({ type: 'link:status-change', tenantDid: link.tenantDid, remoteEndpoint: link.remoteEndpoint, protocol: link.protocol, from: prevDegradedStatus, to: 'degraded_poll' });
1049
+ this.emitEvent({ type: 'degraded-poll:entered', tenantDid: link.tenantDid, remoteEndpoint: link.remoteEndpoint, protocol: link.protocol });
1050
+
1051
+ // Clear any existing timer for this link.
1052
+ const existing = this._degradedPollTimers.get(linkKey);
1053
+ if (existing) { clearInterval(existing); }
1054
+
1055
+ // Schedule per-link polling with jitter (15-30 seconds).
1056
+ const baseInterval = 15_000;
1057
+ const jitter = Math.floor(Math.random() * 15_000);
1058
+ const interval = baseInterval + jitter;
1059
+
1060
+ const pollGeneration = this._syncGeneration;
1061
+ const timer = setInterval(async (): Promise<void> => {
1062
+ // Bail if teardown occurred since this timer was created.
1063
+ if (this._syncGeneration !== pollGeneration) {
1064
+ clearInterval(timer);
1065
+ this._degradedPollTimers.delete(linkKey);
1066
+ return;
1067
+ }
1068
+
1069
+ // If the link was transitioned out of degraded_poll externally (e.g.,
1070
+ // by teardown or manual intervention), stop polling.
1071
+ if (link.status !== 'degraded_poll') {
1072
+ clearInterval(timer);
1073
+ this._degradedPollTimers.delete(linkKey);
1074
+ return;
1075
+ }
1076
+
1077
+ try {
1078
+ // Attempt repair. Reset attempt counter so repairLink doesn't
1079
+ // immediately re-enter degraded_poll on failure.
1080
+ this._repairAttempts.set(linkKey, 0);
1081
+ await this.ledger.setStatus(link, 'repairing');
1082
+ await this.repairLink(linkKey);
1083
+
1084
+ // If repairLink succeeded, link is now 'live' — stop polling.
1085
+ if ((link.status as string) === 'live') {
1086
+ clearInterval(timer);
1087
+ this._degradedPollTimers.delete(linkKey);
1088
+ }
1089
+ } catch {
1090
+ // Repair failed — restore degraded_poll status so the timer continues.
1091
+ // This is critical: repairLink sets status to 'repairing' internally,
1092
+ // and if we don't restore degraded_poll, the next tick would see
1093
+ // status !== 'degraded_poll' and stop the timer permanently.
1094
+ await this.ledger.setStatus(link, 'degraded_poll');
1095
+ }
1096
+ }, interval);
1097
+
1098
+ this._degradedPollTimers.set(linkKey, timer);
1099
+ }
1100
+
470
1101
  /**
471
1102
  * Tears down all live subscriptions and push listeners.
472
1103
  */
473
1104
  private async teardownLiveSync(): Promise<void> {
1105
+ // Increment generation to invalidate all in-flight async operations
1106
+ // (repairs, retry timers, degraded-poll ticks). Any async work that
1107
+ // captured the previous generation will bail on its next checkpoint.
1108
+ this._syncGeneration++;
1109
+
474
1110
  // Clear the push debounce timer.
475
1111
  if (this._pushDebounceTimer) {
476
1112
  clearTimeout(this._pushDebounceTimer);
@@ -499,6 +1135,26 @@ export class SyncEngineLevel implements SyncEngine {
499
1135
  }
500
1136
  }
501
1137
  this._localSubscriptions = [];
1138
+
1139
+ // Clear degraded-poll timers and repair state.
1140
+ for (const timer of this._degradedPollTimers.values()) {
1141
+ clearInterval(timer);
1142
+ }
1143
+ this._degradedPollTimers.clear();
1144
+ this._repairAttempts.clear();
1145
+ this._activeRepairs.clear();
1146
+ for (const timer of this._repairRetryTimers.values()) {
1147
+ clearTimeout(timer);
1148
+ }
1149
+ this._repairRetryTimers.clear();
1150
+ this._repairContext.clear();
1151
+
1152
+ // Clear closure evaluation contexts.
1153
+ this._closureContexts.clear();
1154
+
1155
+ // Clear the in-memory link and runtime state.
1156
+ this._activeLinks.clear();
1157
+ this._linkRuntimes.clear();
502
1158
  }
503
1159
 
504
1160
  // ---------------------------------------------------------------------------
@@ -514,12 +1170,24 @@ export class SyncEngineLevel implements SyncEngine {
514
1170
  }): Promise<void> {
515
1171
  const { did, delegateDid, dwnUrl, protocol } = target;
516
1172
 
517
- // Resolve the cursor from the last session (if any).
1173
+ // Resolve the cursor from the link's pull checkpoint (preferred) or legacy storage.
518
1174
  const cursorKey = this.buildCursorKey(did, dwnUrl, protocol);
519
- const cursor = await this.getCursor(cursorKey);
1175
+ const link = this._activeLinks.get(cursorKey);
1176
+ const cursor = link?.pull.contiguousAppliedToken ?? await this.getCursor(cursorKey);
520
1177
 
521
1178
  // Build the MessagesSubscribe filters.
522
- const filters = protocol ? [{ protocol }] : [];
1179
+ // When the link has protocolPathPrefixes, include them in the filter so the
1180
+ // EventLog delivers only matching events (server-side filtering). This replaces
1181
+ // the less efficient agent-side isEventInScope filtering for the pull path.
1182
+ // Note: only the first prefix is used as the MessagesFilter field because
1183
+ // MessagesFilter.protocolPathPrefix is a single string. Multiple prefixes
1184
+ // would need multiple filters (OR semantics) — for now we use the first one.
1185
+ const protocolPathPrefix = link?.scope.kind === 'protocol'
1186
+ ? link.scope.protocolPathPrefixes?.[0]
1187
+ : undefined;
1188
+ const filters = protocol
1189
+ ? [{ protocol, ...(protocolPathPrefix ? { protocolPathPrefix } : {}) }]
1190
+ : [];
523
1191
 
524
1192
  // Look up permission grant for MessagesSubscribe if using a delegate.
525
1193
  // The unified scope matching in AgentPermissionsApi accepts a
@@ -538,16 +1206,88 @@ export class SyncEngineLevel implements SyncEngine {
538
1206
  }
539
1207
 
540
1208
  // Define the subscription handler that processes incoming events.
1209
+ // NOTE: The WebSocket client fires handlers without awaiting (fire-and-forget),
1210
+ // so multiple handlers can be in-flight concurrently. The ordinal tracker
1211
+ // ensures the checkpoint advances only when all earlier deliveries are committed.
541
1212
  const subscriptionHandler = async (subMessage: SubscriptionMessage): Promise<void> => {
542
1213
  if (subMessage.type === 'eose') {
543
- // End-of-stored-events — catch-up complete, persist cursor.
544
- await this.setCursor(cursorKey, subMessage.cursor);
545
- this._connectivityState = 'online';
1214
+ // End-of-stored-events — catch-up complete.
1215
+ if (link) {
1216
+ // Guard: if the link transitioned to repairing while catch-up events
1217
+ // were being processed, skip all mutations — repair owns the state now.
1218
+ if (link.status !== 'live' && link.status !== 'initializing') {
1219
+ return;
1220
+ }
1221
+
1222
+ if (!ReplicationLedger.validateTokenDomain(link.pull, subMessage.cursor)) {
1223
+ console.warn(`SyncEngineLevel: Token domain mismatch on EOSE for ${did} -> ${dwnUrl}, transitioning to repairing`);
1224
+ await this.transitionToRepairing(cursorKey, link);
1225
+ return;
1226
+ }
1227
+ ReplicationLedger.setReceivedToken(link.pull, subMessage.cursor);
1228
+ // Drain committed entries. Do NOT unconditionally advance to the
1229
+ // EOSE cursor — earlier stored events may still be in-flight
1230
+ // (handlers are fire-and-forget). The checkpoint advances only as
1231
+ // far as the contiguous drain reaches.
1232
+ this.drainCommittedPull(cursorKey);
1233
+ await this.ledger.saveLink(link);
1234
+ } else {
1235
+ await this.setCursor(cursorKey, subMessage.cursor);
1236
+ }
1237
+ // Transport is reachable — set connectivity to online.
1238
+ if (link) {
1239
+ const prevEoseConnectivity = link.connectivity;
1240
+ link.connectivity = 'online';
1241
+ if (prevEoseConnectivity !== 'online') {
1242
+ this.emitEvent({ type: 'link:connectivity-change', tenantDid: did, remoteEndpoint: dwnUrl, protocol, from: prevEoseConnectivity, to: 'online' });
1243
+ }
1244
+ } else {
1245
+ this._connectivityState = 'online';
1246
+ }
546
1247
  return;
547
1248
  }
548
1249
 
549
1250
  if (subMessage.type === 'event') {
550
1251
  const event: MessageEvent = subMessage.event;
1252
+
1253
+ // Guard: if the link is not live (e.g., repairing, degraded_poll, paused),
1254
+ // skip all processing. Old subscription handlers may still fire after the
1255
+ // link transitions — these events should be ignored entirely, not just
1256
+ // skipped at the checkpoint level.
1257
+ if (link && link.status !== 'live' && link.status !== 'initializing') {
1258
+ return;
1259
+ }
1260
+
1261
+ // Domain validation: reject tokens from a different stream/epoch.
1262
+ if (link && !ReplicationLedger.validateTokenDomain(link.pull, subMessage.cursor)) {
1263
+ console.warn(`SyncEngineLevel: Token domain mismatch for ${did} -> ${dwnUrl}, transitioning to repairing`);
1264
+ await this.transitionToRepairing(cursorKey, link);
1265
+ return;
1266
+ }
1267
+
1268
+ // Subset scope filtering: if the link has protocolPath/contextId prefixes,
1269
+ // skip events that don't match. This agent-side check is still needed because
1270
+ // the subscribe filter built above carries only the protocol and the first protocolPathPrefix.
1271
+ //
1272
+ // Skipped events MUST advance contiguousAppliedToken — otherwise the
1273
+ // link would replay the same filtered-out events indefinitely after
1274
+ // reconnect/repair. This is safe because the event is intentionally
1275
+ // excluded from this scope and doesn't need processing.
1276
+ if (link && !isEventInScope(event.message, link.scope)) {
1277
+ ReplicationLedger.setReceivedToken(link.pull, subMessage.cursor);
1278
+ ReplicationLedger.commitContiguousToken(link.pull, subMessage.cursor);
1279
+ await this.ledger.saveLink(link);
1280
+ return;
1281
+ }
1282
+
1283
+ // Assign a delivery ordinal BEFORE async processing begins.
1284
+ // This captures the delivery order even if processing completes out of order.
1285
+ const rt = link ? this.getOrCreateRuntime(cursorKey) : undefined;
1286
+ const ordinal = rt ? rt.nextDeliveryOrdinal++ : -1;
1287
+ if (rt) {
1288
+ rt.inflight.set(ordinal, { ordinal, token: subMessage.cursor, committed: false });
1289
+ }
1290
+
551
1291
  try {
552
1292
  // Extract inline data from the event (available for records <= 30 KB).
553
1293
  let dataStream = this.extractDataStream(event);
@@ -569,12 +1309,97 @@ export class SyncEngineLevel implements SyncEngine {
569
1309
 
570
1310
  await this.agent.dwn.processRawMessage(did, event.message, { dataStream });
571
1311
 
572
- // Only advance the cursor after successful processing.
573
- // If processing fails, the event will be re-delivered on
574
- // reconnection (cursor-based resume from the last good point).
575
- await this.setCursor(cursorKey, subMessage.cursor);
1312
+ // Invalidate closure cache entries that may be affected by this message.
1313
+ // Must run before closure validation so subsequent evaluations in the
1314
+ // same session see the updated local state.
1315
+ const closureCtxForInvalidation = this._closureContexts.get(did);
1316
+ if (closureCtxForInvalidation) {
1317
+ invalidateClosureCache(closureCtxForInvalidation, event.message);
1318
+ }
1319
+
1320
+ // Closure validation for scoped subset sync (Phase 3).
1321
+ // For protocol-scoped links, verify that all hard dependencies for
1322
+ // this operation are locally present before considering it committed.
1323
+ // Full-tenant scope bypasses this entirely (returns complete with 0 queries).
1324
+ if (link && link.scope.kind === 'protocol') {
1325
+ const messageStore = this.agent.dwn.node.storage.messageStore;
1326
+ let closureCtx = this._closureContexts.get(did);
1327
+ if (!closureCtx) {
1328
+ closureCtx = createClosureContext(did);
1329
+ this._closureContexts.set(did, closureCtx);
1330
+ }
1331
+
1332
+ const closureResult = await evaluateClosure(
1333
+ event.message, messageStore, link.scope, closureCtx
1334
+ );
1335
+
1336
+ if (!closureResult.complete) {
1337
+ console.warn(
1338
+ `SyncEngineLevel: Closure incomplete for ${did} -> ${dwnUrl}: ` +
1339
+ `${closureResult.failure!.code} — ${closureResult.failure!.detail}`
1340
+ );
1341
+ await this.transitionToRepairing(cursorKey, link);
1342
+ return;
1343
+ }
1344
+ }
1345
+
1346
+ // Squash convergence: processRawMessage triggers the DWN's built-in
1347
+ // squash resumable task (performRecordsSquash) which runs inline and
1348
+ // handles subset consumers correctly:
1349
+ // - If older siblings are locally present → purges them
1350
+ // - If squash arrives before older siblings → backstop rejects them (409)
1351
+ // - If no older siblings are local → no-op (correct)
1352
+ // Both sync orderings (squash-first or siblings-first) converge to
1353
+ // the same final state. No additional sync-engine side-effect is needed.
1354
+
1355
+ // Track this CID for echo-loop suppression, scoped to the source endpoint.
1356
+ const pulledCid = await Message.getCid(event.message);
1357
+ this._recentlyPulledCids.set(`${pulledCid}|${dwnUrl}`, Date.now() + SyncEngineLevel.ECHO_SUPPRESS_TTL_MS);
1358
+ this.evictExpiredEchoEntries();
1359
+
1360
+ // Mark this ordinal as committed and drain the checkpoint.
1361
+ // Guard: if the link transitioned to repairing while this handler was
1362
+ // in-flight (e.g., an earlier ordinal's handler failed concurrently),
1363
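+ // skip all state mutations — the repair process owns progression now. NOTE(review): an 'initializing' link passes the entry guard above but not this 'live'-only check, so its ordinal is left uncommitted — confirm this is intended.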
1364
+ if (link && rt && link.status === 'live') {
1365
+ const entry = rt.inflight.get(ordinal);
1366
+ if (entry) { entry.committed = true; }
1367
+
1368
+ ReplicationLedger.setReceivedToken(link.pull, subMessage.cursor);
1369
+ const drained = this.drainCommittedPull(cursorKey);
1370
+ if (drained > 0) {
1371
+ await this.ledger.saveLink(link);
1372
+ // Emit after durable save — "advanced" means persisted.
1373
+ if (link.pull.contiguousAppliedToken) {
1374
+ this.emitEvent({
1375
+ type : 'checkpoint:pull-advance',
1376
+ tenantDid : link.tenantDid,
1377
+ remoteEndpoint : link.remoteEndpoint,
1378
+ protocol : link.protocol,
1379
+ position : link.pull.contiguousAppliedToken.position,
1380
+ messageCid : link.pull.contiguousAppliedToken.messageCid,
1381
+ });
1382
+ }
1383
+ }
1384
+
1385
+ // Overflow: too many in-flight ordinals without draining.
1386
+ if (rt.inflight.size > MAX_PENDING_TOKENS) {
1387
+ console.warn(`SyncEngineLevel: Pull in-flight overflow for ${did} -> ${dwnUrl}, transitioning to repairing`);
1388
+ await this.transitionToRepairing(cursorKey, link);
1389
+ }
1390
+ } else if (!link) {
1391
+ // Legacy path: no link available, use simple cursor persistence.
1392
+ await this.setCursor(cursorKey, subMessage.cursor);
1393
+ }
576
1394
  } catch (error: any) {
577
1395
  console.error(`SyncEngineLevel: Error processing live-pull event for ${did}`, error);
1396
+ // A failed processRawMessage means local state is incomplete.
1397
+ // Transition to repairing immediately — do NOT advance the checkpoint
1398
+ // past this failure or let later ordinals commit past it. SMT
1399
+ // reconciliation will discover and fill the gap.
1400
+ if (link) {
1401
+ await this.transitionToRepairing(cursorKey, link);
1402
+ }
578
1403
  }
579
1404
  }
580
1405
  };
@@ -599,10 +1424,12 @@ export class SyncEngineLevel implements SyncEngine {
599
1424
 
600
1425
  // Build a resubscribe factory so the WebSocket client can resume with
601
1426
  // a fresh cursor-stamped message after reconnection.
602
- const resubscribeFactory: ResubscribeFactory = async (resumeCursor?: string) => {
1427
+ const resubscribeFactory: ResubscribeFactory = async (resumeCursor?: ProgressToken) => {
1428
+ // On reconnect, use the latest durable checkpoint position if available.
1429
+ const effectiveCursor = resumeCursor ?? link?.pull.contiguousAppliedToken ?? cursor;
603
1430
  const resumeRequest = {
604
1431
  ...subscribeRequest,
605
- messageParams: { ...subscribeRequest.messageParams, cursor: resumeCursor ?? cursor },
1432
+ messageParams: { ...subscribeRequest.messageParams, cursor: effectiveCursor },
606
1433
  };
607
1434
  const { message: resumeMsg } = await this.agent.dwn.processRequest(resumeRequest);
608
1435
  if (!resumeMsg) {
@@ -625,9 +1452,15 @@ export class SyncEngineLevel implements SyncEngine {
625
1452
  resubscribeFactory,
626
1453
  },
627
1454
  }) as MessagesSubscribeReply;
1455
+ if (reply.status.code === 410) {
1456
+ // ProgressGap — the cursor is no longer replayable. The link needs repair.
1457
+ const gapError = new Error(`SyncEngineLevel: ProgressGap for ${did} -> ${dwnUrl}: ${reply.status.detail}`);
1458
+ (gapError as any).isProgressGap = true;
1459
+ (gapError as any).gapInfo = reply.error;
1460
+ throw gapError;
1461
+ }
628
1462
  if (reply.status.code !== 200 || !reply.subscription) {
629
- console.error(`SyncEngineLevel: MessagesSubscribe failed for ${did} -> ${dwnUrl}: ${reply.status.code} ${reply.status.detail}`);
630
- return;
1463
+ throw new Error(`SyncEngineLevel: MessagesSubscribe failed for ${did} -> ${dwnUrl}: ${reply.status.code} ${reply.status.detail}`);
631
1464
  }
632
1465
 
633
1466
  this._liveSubscriptions.push({
@@ -638,7 +1471,15 @@ export class SyncEngineLevel implements SyncEngine {
638
1471
  close: async (): Promise<void> => { await reply.subscription!.close(); },
639
1472
  });
640
1473
 
641
- this._connectivityState = 'online';
1474
+ // Set per-link connectivity to online after successful subscription setup.
1475
+ const pullLink = this._activeLinks.get(this.buildCursorKey(did, dwnUrl, protocol));
1476
+ if (pullLink) {
1477
+ const prevPullConnectivity = pullLink.connectivity;
1478
+ pullLink.connectivity = 'online';
1479
+ if (prevPullConnectivity !== 'online') {
1480
+ this.emitEvent({ type: 'link:connectivity-change', tenantDid: did, remoteEndpoint: dwnUrl, protocol, from: prevPullConnectivity, to: 'online' });
1481
+ }
1482
+ }
642
1483
  }
643
1484
 
644
1485
  // ---------------------------------------------------------------------------
@@ -651,6 +1492,7 @@ export class SyncEngineLevel implements SyncEngine {
651
1492
  */
652
1493
  private async openLocalPushSubscription(target: {
653
1494
  did: string; dwnUrl: string; delegateDid?: string; protocol?: string;
1495
+ pushCursor?: ProgressToken;
654
1496
  }): Promise<void> {
655
1497
  const { did, delegateDid, dwnUrl, protocol } = target;
656
1498
 
@@ -676,6 +1518,33 @@ export class SyncEngineLevel implements SyncEngine {
676
1518
  return;
677
1519
  }
678
1520
 
1521
+ // Subset scope filtering for push: only push events that match the
1522
+ // link's scope prefixes. Events outside the scope are not our responsibility.
1523
+ // Skipped events MUST advance the push checkpoint to prevent infinite
1524
+ // replay after repair/reconnect (same reason as the pull side).
1525
+ const pushLink = this._activeLinks.get(this.buildCursorKey(did, dwnUrl, protocol));
1526
+ if (pushLink && !isEventInScope(subMessage.event.message, pushLink.scope)) {
1527
+ // Guard: only mutate durable state when the link is live/initializing.
1528
+ // During repair/degraded_poll, orchestration owns checkpoint progression.
1529
+ if (pushLink.status !== 'live' && pushLink.status !== 'initializing') {
1530
+ return;
1531
+ }
1532
+
1533
+ // Validate token domain before committing — a stream/epoch mismatch
1534
+ // on the local EventLog should trigger repair, not silently overwrite.
1535
+ if (!ReplicationLedger.validateTokenDomain(pushLink.push, subMessage.cursor)) {
1536
+ await this.transitionToRepairing(
1537
+ this.buildCursorKey(did, dwnUrl, protocol), pushLink
1538
+ );
1539
+ return;
1540
+ }
1541
+
1542
+ ReplicationLedger.setReceivedToken(pushLink.push, subMessage.cursor);
1543
+ ReplicationLedger.commitContiguousToken(pushLink.push, subMessage.cursor);
1544
+ await this.ledger.saveLink(pushLink);
1545
+ return;
1546
+ }
1547
+
679
1548
  // Accumulate the message CID for a debounced push.
680
1549
  const targetKey = this.buildCursorKey(did, dwnUrl, protocol);
681
1550
  const cid = await Message.getCid(subMessage.event.message);
@@ -683,12 +1552,19 @@ export class SyncEngineLevel implements SyncEngine {
683
1552
  return;
684
1553
  }
685
1554
 
1555
+ // Echo-loop suppression: skip CIDs that were recently pulled from this
1556
+ // specific remote. A message pulled from Provider A is only suppressed
1557
+ // for push to A — it still fans out to Provider B and C.
1558
+ if (this.isRecentlyPulled(cid, dwnUrl)) {
1559
+ return;
1560
+ }
1561
+
686
1562
  let pending = this._pendingPushCids.get(targetKey);
687
1563
  if (!pending) {
688
- pending = { did, dwnUrl, delegateDid, protocol, cids: [] };
1564
+ pending = { did, dwnUrl, delegateDid, protocol, entries: [] };
689
1565
  this._pendingPushCids.set(targetKey, pending);
690
1566
  }
691
- pending.cids.push(cid);
1567
+ pending.entries.push({ cid, localToken: subMessage.cursor });
692
1568
 
693
1569
  // Debounce the push.
694
1570
  if (this._pushDebounceTimer) {
@@ -700,19 +1576,21 @@ export class SyncEngineLevel implements SyncEngine {
700
1576
  };
701
1577
 
702
1578
  // Process the local subscription request.
1579
+ // When a push cursor is provided (e.g., after repair), the local subscription
1580
+ // replays events from that position, closing the race window where local
1581
+ // writes during repair would otherwise be missed by push-on-write.
703
1582
  const response = await this.agent.dwn.processRequest({
704
1583
  author : did,
705
1584
  target : did,
706
1585
  messageType : DwnInterface.MessagesSubscribe,
707
1586
  granteeDid : delegateDid,
708
- messageParams : { filters, permissionGrantId },
1587
+ messageParams : { filters, permissionGrantId, cursor: target.pushCursor },
709
1588
  subscriptionHandler : subscriptionHandler as any,
710
1589
  });
711
1590
 
712
1591
  const reply = response.reply as MessagesSubscribeReply;
713
1592
  if (reply.status.code !== 200 || !reply.subscription) {
714
- console.error(`SyncEngineLevel: Local MessagesSubscribe failed for ${did}: ${reply.status.code} ${reply.status.detail}`);
715
- return;
1593
+ throw new Error(`SyncEngineLevel: Local MessagesSubscribe failed for ${did}: ${reply.status.code} ${reply.status.detail}`);
716
1594
  }
717
1595
 
718
1596
  this._localSubscriptions.push({
@@ -730,41 +1608,94 @@ export class SyncEngineLevel implements SyncEngine {
730
1608
  private async flushPendingPushes(): Promise<void> {
731
1609
  this._pushDebounceTimer = undefined;
732
1610
 
733
- const entries = [...this._pendingPushCids.entries()];
1611
+ const batches = [...this._pendingPushCids.entries()];
734
1612
  this._pendingPushCids.clear();
735
1613
 
736
1614
  // Push to all endpoints in parallel — each target is independent.
737
- await Promise.all(entries.map(async ([, pending]) => {
738
- const { did, dwnUrl, delegateDid, protocol, cids } = pending;
739
- if (cids.length === 0) {
1615
+ await Promise.all(batches.map(async ([targetKey, pending]) => {
1616
+ const { did, dwnUrl, delegateDid, protocol, entries: pushEntries } = pending;
1617
+ if (pushEntries.length === 0) {
740
1618
  return;
741
1619
  }
742
1620
 
1621
+ const cids = pushEntries.map(e => e.cid);
1622
+
743
1623
  try {
744
- await pushMessages({
1624
+ const result = await pushMessages({
745
1625
  did, dwnUrl, delegateDid, protocol,
746
1626
  messageCids : cids,
747
1627
  agent : this.agent,
748
1628
  permissionsApi : this._permissionsApi,
749
1629
  });
1630
+
1631
+ // Advance the push checkpoint for successfully pushed entries.
1632
+ // Push is sequential (single batch, in-order processing) so we can
1633
+ // commit directly without ordinal tracking — there's no concurrent
1634
+ // completion to reorder.
1635
+ const link = this._activeLinks.get(targetKey);
1636
+ if (link) {
1637
+ const succeededSet = new Set(result.succeeded);
1638
+ // Track highest contiguous success: if a CID fails, we stop advancing.
1639
+ let hitFailure = false;
1640
+ for (const entry of pushEntries) {
1641
+ if (hitFailure) { break; }
1642
+ if (succeededSet.has(entry.cid) && entry.localToken) {
1643
+ if (!ReplicationLedger.validateTokenDomain(link.push, entry.localToken)) {
1644
+ console.warn(`SyncEngineLevel: Push checkpoint domain mismatch for ${did} -> ${dwnUrl}, transitioning to repairing`);
1645
+ await this.transitionToRepairing(targetKey, link);
1646
+ break;
1647
+ }
1648
+ ReplicationLedger.setReceivedToken(link.push, entry.localToken);
1649
+ ReplicationLedger.commitContiguousToken(link.push, entry.localToken);
1650
+ } else {
1651
+ // This CID failed or had no token — stop advancing.
1652
+ hitFailure = true;
1653
+ }
1654
+ }
1655
+ await this.ledger.saveLink(link);
1656
+ }
1657
+
1658
+ // Re-queue only TRANSIENT failures for retry. Permanent failures (400/401/403)
1659
+ // are dropped — they will never succeed regardless of retry.
1660
+ if (result.failed.length > 0) {
1661
+ console.error(
1662
+ `SyncEngineLevel: Push-on-write failed for ${did} -> ${dwnUrl}: ` +
1663
+ `${result.failed.length} transient failures of ${cids.length} messages`
1664
+ );
1665
+ const failedSet = new Set(result.failed);
1666
+ const failedEntries = pushEntries.filter(e => failedSet.has(e.cid));
1667
+ let requeued = this._pendingPushCids.get(targetKey);
1668
+ if (!requeued) {
1669
+ requeued = { did, dwnUrl, delegateDid, protocol, entries: [] };
1670
+ this._pendingPushCids.set(targetKey, requeued);
1671
+ }
1672
+ requeued.entries.push(...failedEntries);
1673
+
1674
+ // Schedule a retry after a short delay.
1675
+ if (!this._pushDebounceTimer) {
1676
+ this._pushDebounceTimer = setTimeout((): void => {
1677
+ void this.flushPendingPushes();
1678
+ }, PUSH_DEBOUNCE_MS * 4);
1679
+ }
1680
+ }
1681
+ // Permanent failures are logged by pushMessages but NOT re-queued.
1682
+ // They will be rediscovered by the next SMT integrity check if the
1683
+ // local/remote state has changed, but won't spin in a retry loop.
750
1684
  } catch (error: any) {
1685
+ // Truly unexpected error (not per-message failure). Re-queue entire
1686
+ // batch so entries aren't silently dropped from the debounce queue.
751
1687
  console.error(`SyncEngineLevel: Push-on-write failed for ${did} -> ${dwnUrl}`, error);
752
-
753
- // Re-queue the failed CIDs so they are retried on the next
754
- // debounce cycle (or picked up by the SMT integrity check).
755
- const targetKey = this.buildCursorKey(did, dwnUrl, protocol);
756
1688
  let requeued = this._pendingPushCids.get(targetKey);
757
1689
  if (!requeued) {
758
- requeued = { did, dwnUrl, delegateDid, protocol, cids: [] };
1690
+ requeued = { did, dwnUrl, delegateDid, protocol, entries: [] };
759
1691
  this._pendingPushCids.set(targetKey, requeued);
760
1692
  }
761
- requeued.cids.push(...cids);
1693
+ requeued.entries.push(...pushEntries);
762
1694
 
763
- // Schedule a retry after a short delay.
764
1695
  if (!this._pushDebounceTimer) {
765
1696
  this._pushDebounceTimer = setTimeout((): void => {
766
1697
  void this.flushPendingPushes();
767
- }, PUSH_DEBOUNCE_MS * 4); // Back off: 1 second instead of 250ms.
1698
+ }, PUSH_DEBOUNCE_MS * 4);
768
1699
  }
769
1700
  }
770
1701
  }));
@@ -779,10 +1710,29 @@ export class SyncEngineLevel implements SyncEngine {
779
1710
  return protocol ? `${base}${CURSOR_SEPARATOR}${protocol}` : base;
780
1711
  }
781
1712
 
782
- private async getCursor(key: string): Promise<string | undefined> {
1713
+ /**
1714
+ * Retrieves a stored progress token. Handles migration from old string cursors:
1715
+ * if the stored value is a bare string (pre-ProgressToken format), it is treated
1716
+ * as absent — the sync engine will do a full SMT reconciliation on first startup
1717
+ * after upgrade, which is correct and safe.
1718
+ */
1719
+ private async getCursor(key: string): Promise<ProgressToken | undefined> {
783
1720
  const cursors = this._db.sublevel('syncCursors');
784
1721
  try {
785
- return await cursors.get(key);
1722
+ const raw = await cursors.get(key);
1723
+ try {
1724
+ const parsed = JSON.parse(raw);
1725
+ if (parsed && typeof parsed === 'object' &&
1726
+ typeof parsed.streamId === 'string' &&
1727
+ typeof parsed.epoch === 'string' &&
1728
+ typeof parsed.position === 'string' &&
1729
+ typeof parsed.messageCid === 'string') {
1730
+ return parsed as ProgressToken;
1731
+ }
1732
+ } catch {
1733
+ // Not valid JSON (old string cursor) — treat as absent.
1734
+ }
1735
+ return undefined;
786
1736
  } catch (error) {
787
1737
  const e = error as { code: string };
788
1738
  if (e.code === 'LEVEL_NOT_FOUND') {
@@ -792,9 +1742,9 @@ export class SyncEngineLevel implements SyncEngine {
792
1742
  }
793
1743
  }
794
1744
 
795
- private async setCursor(key: string, cursor: string): Promise<void> {
1745
+ private async setCursor(key: string, cursor: ProgressToken): Promise<void> {
796
1746
  const cursors = this._db.sublevel('syncCursors'); // Same 'syncCursors' sublevel that getCursor reads from.
797
- await cursors.put(key, cursor);
1747
+ await cursors.put(key, JSON.stringify(cursor)); // Persisted as JSON text; getCursor JSON.parses it and shape-checks the fields on read.
798
1748
  }
799
1749
 
800
1750
  // ---------------------------------------------------------------------------
@@ -1164,6 +2114,52 @@ export class SyncEngineLevel implements SyncEngine {
1164
2114
  });
1165
2115
  }
1166
2116
 
2117
+ // ---------------------------------------------------------------------------
2118
+ // Echo-loop suppression
2119
+ // ---------------------------------------------------------------------------
2120
+
2121
+ /**
2122
+ * Prunes the echo-loop suppression cache (`_recentlyPulledCids`), whose keys are `cid|dwnUrl` strings and whose values are expiry timestamps in ms.
2123
+ * First removes every entry whose expiry has been reached; then, if the cache is still larger than ECHO_SUPPRESS_MAX_ENTRIES, deletes entries from the front of its iteration order until it fits.
2124
+ */
2125
+ private evictExpiredEchoEntries(): void {
2126
+ const now = Date.now();
2127
+
2128
+ // Pass 1: drop entries whose expiry time has been reached. The loop variable `cid` holds the full composite `cid|dwnUrl` key, not a bare CID.
2129
+ for (const [cid, expiry] of this._recentlyPulledCids) {
2130
+ if (now >= expiry) {
2131
+ this._recentlyPulledCids.delete(cid);
2132
+ }
2133
+ }
2134
+
2135
+ // Pass 2: if still over the cap, delete the first `excess` keys in iteration order (oldest-inserted first, assuming `_recentlyPulledCids` is a Map — TODO confirm).
2136
+ if (this._recentlyPulledCids.size > SyncEngineLevel.ECHO_SUPPRESS_MAX_ENTRIES) {
2137
+ const excess = this._recentlyPulledCids.size - SyncEngineLevel.ECHO_SUPPRESS_MAX_ENTRIES;
2138
+ let evicted = 0;
2139
+ for (const key of this._recentlyPulledCids.keys()) {
2140
+ if (evicted >= excess) { break; }
2141
+ this._recentlyPulledCids.delete(key);
2142
+ evicted++;
2143
+ }
2144
+ }
2145
+ }
2146
+
2147
+ /**
2148
+ * Reports whether `cid` was pulled from the remote endpoint `dwnUrl` and its suppression entry has not yet expired,
2149
+ * in which case it should not be pushed back to that same endpoint (echo-loop suppression).
2150
+ * The lookup key includes the endpoint, so pushes to other endpoints are not suppressed — multi-provider fan-out works. An expired entry is deleted as a side effect and reported as not recent.
2151
+ */
2152
+ private isRecentlyPulled(cid: string, dwnUrl: string): boolean {
2153
+ const key = `${cid}|${dwnUrl}`; // Same composite key format the live-pull handler writes after processing an event.
2154
+ const expiry = this._recentlyPulledCids.get(key);
2155
+ if (expiry === undefined) { return false; } // Never pulled from this endpoint, or already evicted.
2156
+ if (Date.now() >= expiry) {
2157
+ this._recentlyPulledCids.delete(key); // Lazy eviction of the stale entry.
2158
+ return false;
2159
+ }
2160
+ return true;
2161
+ }
2162
+
1167
2163
  /**
1168
2164
  * Reads missing messages from the local DWN and pushes them to the remote DWN
1169
2165
  * in dependency order (topological sort).
@@ -1174,7 +2170,7 @@ export class SyncEngineLevel implements SyncEngine {
1174
2170
  delegateDid?: string;
1175
2171
  protocol?: string;
1176
2172
  messageCids: string[];
1177
- }): Promise<void> {
2173
+ }): Promise<PushResult> {
1178
2174
  return pushMessages({
1179
2175
  did, dwnUrl, delegateDid, protocol, messageCids,
1180
2176
  agent : this.agent,