@enbox/agent 0.5.10 → 0.5.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/dist/browser.mjs +9 -9
  2. package/dist/browser.mjs.map +4 -4
  3. package/dist/esm/dwn-api.js.map +1 -1
  4. package/dist/esm/dwn-record-upgrade.js +1 -1
  5. package/dist/esm/dwn-record-upgrade.js.map +1 -1
  6. package/dist/esm/index.js +4 -0
  7. package/dist/esm/index.js.map +1 -1
  8. package/dist/esm/sync-closure-resolver.js +855 -0
  9. package/dist/esm/sync-closure-resolver.js.map +1 -0
  10. package/dist/esm/sync-closure-types.js +189 -0
  11. package/dist/esm/sync-closure-types.js.map +1 -0
  12. package/dist/esm/sync-engine-level.js +956 -37
  13. package/dist/esm/sync-engine-level.js.map +1 -1
  14. package/dist/esm/sync-messages.js +42 -5
  15. package/dist/esm/sync-messages.js.map +1 -1
  16. package/dist/esm/sync-replication-ledger.js +220 -0
  17. package/dist/esm/sync-replication-ledger.js.map +1 -0
  18. package/dist/esm/types/sync.js +54 -1
  19. package/dist/esm/types/sync.js.map +1 -1
  20. package/dist/types/dwn-api.d.ts.map +1 -1
  21. package/dist/types/index.d.ts +5 -0
  22. package/dist/types/index.d.ts.map +1 -1
  23. package/dist/types/sync-closure-resolver.d.ts +19 -0
  24. package/dist/types/sync-closure-resolver.d.ts.map +1 -0
  25. package/dist/types/sync-closure-types.d.ts +122 -0
  26. package/dist/types/sync-closure-types.d.ts.map +1 -0
  27. package/dist/types/sync-engine-level.d.ts +137 -2
  28. package/dist/types/sync-engine-level.d.ts.map +1 -1
  29. package/dist/types/sync-messages.d.ts +15 -1
  30. package/dist/types/sync-messages.d.ts.map +1 -1
  31. package/dist/types/sync-replication-ledger.d.ts +72 -0
  32. package/dist/types/sync-replication-ledger.d.ts.map +1 -0
  33. package/dist/types/types/sync.d.ts +190 -0
  34. package/dist/types/types/sync.d.ts.map +1 -1
  35. package/package.json +3 -3
  36. package/src/dwn-api.ts +2 -1
  37. package/src/dwn-record-upgrade.ts +1 -1
  38. package/src/index.ts +5 -0
  39. package/src/sync-closure-resolver.ts +919 -0
  40. package/src/sync-closure-types.ts +270 -0
  41. package/src/sync-engine-level.ts +1041 -45
  42. package/src/sync-messages.ts +44 -6
  43. package/src/sync-replication-ledger.ts +197 -0
  44. package/src/types/sync.ts +204 -0
@@ -17,6 +17,10 @@ var __asyncValues = (this && this.__asyncValues) || function (o) {
17
17
  import ms from 'ms';
18
18
  import { Level } from 'level';
19
19
  import { Encoder, hashToHex, initDefaultHashes, Message } from '@enbox/dwn-sdk-js';
20
+ import { evaluateClosure } from './sync-closure-resolver.js';
21
+ import { MAX_PENDING_TOKENS } from './types/sync.js';
22
+ import { ReplicationLedger } from './sync-replication-ledger.js';
23
+ import { createClosureContext, invalidateClosureCache } from './sync-closure-types.js';
20
24
  import { AgentPermissionsApi } from './permissions-api.js';
21
25
  import { DwnInterface } from './types/dwn.js';
22
26
  import { isRecordsWrite } from './utils.js';
@@ -46,28 +50,130 @@ const CURSOR_SEPARATOR = '^';
46
50
  * we batch them and push after this delay to avoid a push per individual write.
47
51
  */
48
52
  const PUSH_DEBOUNCE_MS = 250;
53
+ /**
54
+ * Checks whether a message's protocolPath and contextId match the link's
55
+ * subset scope prefixes. Returns true if the message is in scope.
56
+ *
57
+ * When the scope has no prefixes (or is kind:'full'), all messages match.
58
+ * When protocolPathPrefixes or contextIdPrefixes are specified, the message
59
+ * must match at least one prefix in each specified set.
60
+ *
61
+ * This is agent-side filtering for subset scopes. The underlying
62
+ * MessagesSubscribe filter only supports protocol-level scoping today —
63
+ * protocolPath/contextId prefix filtering at the EventLog level is a
64
+ * follow-up (requires dwn-sdk-js MessagesFilter extension).
65
+ */
66
+ function isEventInScope(message, scope) {
67
+ if (scope.kind === 'full') {
68
+ return true;
69
+ }
70
+ if (!scope.protocolPathPrefixes && !scope.contextIdPrefixes) {
71
+ return true;
72
+ }
73
+ const desc = message.descriptor;
74
+ // Check protocolPath prefix.
75
+ if (scope.protocolPathPrefixes && scope.protocolPathPrefixes.length > 0) {
76
+ const protocolPath = desc.protocolPath;
77
+ if (!protocolPath) {
78
+ return false;
79
+ }
80
+ const matches = scope.protocolPathPrefixes.some(prefix => protocolPath === prefix || protocolPath.startsWith(prefix + '/'));
81
+ if (!matches) {
82
+ return false;
83
+ }
84
+ }
85
+ // Check contextId prefix.
86
+ if (scope.contextIdPrefixes && scope.contextIdPrefixes.length > 0) {
87
+ const contextId = message.contextId;
88
+ if (!contextId) {
89
+ return false;
90
+ }
91
+ const matches = scope.contextIdPrefixes.some(prefix => contextId === prefix || contextId.startsWith(prefix + '/'));
92
+ if (!matches) {
93
+ return false;
94
+ }
95
+ }
96
+ return true;
97
+ }
49
98
  export class SyncEngineLevel {
50
99
  constructor({ agent, dataPath, db }) {
51
100
  this._syncLock = false;
101
+ /**
102
+ * In-memory cache of active links, keyed by `{did}^{dwnUrl}^{protocol}`.
103
+ * Populated from the ledger on `startLiveSync`, used by subscription handlers
104
+ * to avoid async ledger lookups on every event.
105
+ */
106
+ this._activeLinks = new Map();
107
+ /**
108
+ * Per-link in-memory delivery-order tracking for the pull path. Keyed by
109
+ * the same link key as `_activeLinks`. Not persisted — on crash, replay
110
+ * restarts from `contiguousAppliedToken` and idempotent apply handles
111
+ * re-delivered events.
112
+ */
113
+ this._linkRuntimes = new Map();
52
114
  // ---------------------------------------------------------------------------
53
115
  // Live sync state
54
116
  // ---------------------------------------------------------------------------
55
117
  /** Current sync mode, set by `startSync`. */
56
118
  this._syncMode = 'poll';
119
+ /**
120
+ * Monotonic session generation counter. Incremented on every teardown.
121
+ * Async operations (repair, retry timers) capture the generation at start
122
+ * and bail if it has changed — this prevents stale work from mutating
123
+ * state after teardown or mode switch.
124
+ */
125
+ this._syncGeneration = 0;
57
126
  /** Active live pull subscriptions (remote -> local via MessagesSubscribe). */
58
127
  this._liveSubscriptions = [];
59
128
  /** Active local EventLog subscriptions for push-on-write (local -> remote). */
60
129
  this._localSubscriptions = [];
61
130
  /** Connectivity state derived from subscription health. */
62
131
  this._connectivityState = 'unknown';
63
- /** Pending message CIDs to push, accumulated during the debounce window. */
132
+ /** Registered event listeners for observability. */
133
+ this._eventListeners = new Set();
134
+ /** Entry in the pending push queue — a message CID with its local EventLog token. */
64
135
  this._pendingPushCids = new Map();
136
+ /**
137
+ * CIDs recently received via pull subscription, keyed by `cid|dwnUrl` to
138
+ * scope suppression per remote endpoint. A message pulled from Provider A
139
+ * is only suppressed for push back to Provider A — it still fans out to
140
+ * Provider B and C. TTL: 60 seconds. Cap: 10,000 entries.
141
+ */
142
+ this._recentlyPulledCids = new Map();
143
+ /**
144
+ * Per-tenant closure evaluation contexts for the current live sync session.
145
+ * Caches ProtocolsConfigure and grant lookups across events for the same
146
+ * tenant. Keyed by tenantDid to prevent cross-tenant cache pollution.
147
+ */
148
+ this._closureContexts = new Map();
65
149
  /** Count of consecutive SMT sync failures (for backoff in poll mode). */
66
150
  this._consecutiveFailures = 0;
151
+ /** Per-link degraded-poll interval timers. */
152
+ this._degradedPollTimers = new Map();
153
+ /** Per-link repair attempt counters. */
154
+ this._repairAttempts = new Map();
155
+ /** Per-link active repair promises — prevents concurrent repair for the same link. */
156
+ this._activeRepairs = new Map();
157
+ /** Per-link retry timers for failed repairs below max attempts. */
158
+ this._repairRetryTimers = new Map();
159
+ /**
160
+ * Per-link repair context — stores ProgressGap metadata for use during
161
+ * repair. The `resumeToken` (from `gapInfo.latestAvailable`) is used as
162
+ * the post-repair checkpoint so the reopened subscription replays from
163
+ * a valid boundary instead of starting live-only.
164
+ */
165
+ this._repairContext = new Map();
67
166
  this._agent = agent;
68
167
  this._permissionsApi = new AgentPermissionsApi({ agent: agent });
69
168
  this._db = (db) ? db : new Level(dataPath !== null && dataPath !== void 0 ? dataPath : 'DATA/AGENT/SYNC_STORE');
70
169
  }
170
+ /** Lazy accessor for the replication ledger. */
171
+ get ledger() {
172
+ if (!this._ledger) {
173
+ this._ledger = new ReplicationLedger(this._db);
174
+ }
175
+ return this._ledger;
176
+ }
71
177
  /**
72
178
  * Retrieves the `EnboxPlatformAgent` execution context.
73
179
  *
@@ -85,7 +191,44 @@ export class SyncEngineLevel {
85
191
  this._permissionsApi = new AgentPermissionsApi({ agent: agent });
86
192
  }
87
193
  get connectivityState() {
88
- return this._connectivityState;
194
+ // Aggregate per-link connectivity: if any link is online, report online.
195
+ // If all are offline, report offline. If all unknown, report unknown.
196
+ // Falls back to the global _connectivityState for poll-mode (no active links).
197
+ if (this._activeLinks.size === 0) {
198
+ return this._connectivityState;
199
+ }
200
+ let hasOnline = false;
201
+ let hasOffline = false;
202
+ for (const link of this._activeLinks.values()) {
203
+ if (link.connectivity === 'online') {
204
+ hasOnline = true;
205
+ }
206
+ if (link.connectivity === 'offline') {
207
+ hasOffline = true;
208
+ }
209
+ }
210
+ if (hasOnline) {
211
+ return 'online';
212
+ }
213
+ if (hasOffline) {
214
+ return 'offline';
215
+ }
216
+ return 'unknown';
217
+ }
218
+ on(listener) {
219
+ this._eventListeners.add(listener);
220
+ return () => { this._eventListeners.delete(listener); };
221
+ }
222
+ /** Emit a sync event to all registered listeners. */
223
+ emitEvent(event) {
224
+ for (const listener of this._eventListeners) {
225
+ try {
226
+ listener(event);
227
+ }
228
+ catch (_a) {
229
+ // Don't let listener errors propagate into sync engine logic.
230
+ }
231
+ }
89
232
  }
90
233
  clear() {
91
234
  return __awaiter(this, void 0, void 0, function* () {
@@ -355,15 +498,69 @@ export class SyncEngineLevel {
355
498
  catch (error) {
356
499
  console.error('SyncEngineLevel: Error during initial live-sync catch-up', error);
357
500
  }
358
- // Step 2: Open live subscriptions for each sync target.
501
+ // Step 2: Initialize replication links and open live subscriptions.
359
502
  const syncTargets = yield this.getSyncTargets();
360
503
  for (const target of syncTargets) {
504
+ let link;
361
505
  try {
506
+ // Get or create the link in the durable ledger.
507
+ // Use protocol-scoped scope when a protocol is specified, otherwise full-tenant.
508
+ const linkScope = target.protocol
509
+ ? { kind: 'protocol', protocol: target.protocol }
510
+ : { kind: 'full' };
511
+ link = yield this.ledger.getOrCreateLink({
512
+ tenantDid: target.did,
513
+ remoteEndpoint: target.dwnUrl,
514
+ scope: linkScope,
515
+ delegateDid: target.delegateDid,
516
+ protocol: target.protocol,
517
+ });
518
+ // Cache the link for fast access by subscription handlers.
519
+ const linkKey = this.buildCursorKey(target.did, target.dwnUrl, target.protocol);
520
+ this._activeLinks.set(linkKey, link);
521
+ // Open subscriptions — only transition to live if both succeed.
522
+ // If pull succeeds but push fails, close the pull subscription to
523
+ // avoid a resource leak with inconsistent state.
362
524
  yield this.openLivePullSubscription(target);
363
- yield this.openLocalPushSubscription(target);
525
+ try {
526
+ yield this.openLocalPushSubscription(target);
527
+ }
528
+ catch (pushError) {
529
+ // Close the already-opened pull subscription.
530
+ const pullSub = this._liveSubscriptions.find(s => s.did === target.did && s.dwnUrl === target.dwnUrl && s.protocol === target.protocol);
531
+ if (pullSub) {
532
+ try {
533
+ yield pullSub.close();
534
+ }
535
+ catch ( /* best effort */_a) { /* best effort */ }
536
+ this._liveSubscriptions = this._liveSubscriptions.filter(s => s !== pullSub);
537
+ }
538
+ throw pushError;
539
+ }
540
+ this.emitEvent({ type: 'link:status-change', tenantDid: target.did, remoteEndpoint: target.dwnUrl, protocol: target.protocol, from: 'initializing', to: 'live' });
541
+ yield this.ledger.setStatus(link, 'live');
364
542
  }
365
543
  catch (error) {
544
+ const linkKey = this.buildCursorKey(target.did, target.dwnUrl, target.protocol);
545
+ // Detect ProgressGap (410) — the cursor is stale, link needs SMT repair.
546
+ if (error.isProgressGap && link) {
547
+ console.warn(`SyncEngineLevel: ProgressGap detected for ${target.did} -> ${target.dwnUrl}, initiating repair`);
548
+ this.emitEvent({ type: 'gap:detected', tenantDid: target.did, remoteEndpoint: target.dwnUrl, protocol: target.protocol, reason: 'ProgressGap' });
549
+ const gapInfo = error.gapInfo;
550
+ yield this.transitionToRepairing(linkKey, link, {
551
+ resumeToken: gapInfo === null || gapInfo === void 0 ? void 0 : gapInfo.latestAvailable,
552
+ });
553
+ continue;
554
+ }
366
555
  console.error(`SyncEngineLevel: Failed to open live subscription for ${target.did} -> ${target.dwnUrl}`, error);
556
+ // Clean up in-memory state for the failed link so it doesn't appear
557
+ // active to later code. The durable link remains at 'initializing'.
558
+ this._activeLinks.delete(linkKey);
559
+ this._linkRuntimes.delete(linkKey);
560
+ // Recompute connectivity — if no live subscriptions remain, reset to unknown.
561
+ if (this._liveSubscriptions.length === 0) {
562
+ this._connectivityState = 'unknown';
563
+ }
367
564
  }
368
565
  }
369
566
  // Step 3: Schedule infrequent SMT integrity check.
@@ -381,11 +578,385 @@ export class SyncEngineLevel {
381
578
  this._syncIntervalId = setInterval(integrityCheck, intervalMilliseconds);
382
579
  });
383
580
  }
581
+ /**
582
+ * Get or create the runtime state for a link.
583
+ */
584
+ getOrCreateRuntime(linkKey) {
585
+ let rt = this._linkRuntimes.get(linkKey);
586
+ if (!rt) {
587
+ rt = { nextDeliveryOrdinal: 0, nextCommitOrdinal: 0, inflight: new Map() };
588
+ this._linkRuntimes.set(linkKey, rt);
589
+ }
590
+ return rt;
591
+ }
592
+ /**
593
+ * Drain contiguously committed ordinals from the runtime state, advancing
594
+ * the link's pull checkpoint for each drained entry. Returns the number of
595
+ * entries drained (0 if the next ordinal is not yet committed).
596
+ */
597
+ drainCommittedPull(linkKey) {
598
+ const rt = this._linkRuntimes.get(linkKey);
599
+ const link = this._activeLinks.get(linkKey);
600
+ if (!rt || !link) {
601
+ return 0;
602
+ }
603
+ let drained = 0;
604
+ while (true) {
605
+ const entry = rt.inflight.get(rt.nextCommitOrdinal);
606
+ if (!entry || !entry.committed) {
607
+ break;
608
+ }
609
+ // This ordinal is committed — advance the durable checkpoint.
610
+ ReplicationLedger.commitContiguousToken(link.pull, entry.token);
611
+ ReplicationLedger.setReceivedToken(link.pull, entry.token);
612
+ rt.inflight.delete(rt.nextCommitOrdinal);
613
+ rt.nextCommitOrdinal++;
614
+ drained++;
615
+ // Note: checkpoint:pull-advance event is emitted AFTER saveLink succeeds
616
+ // in the caller, not here. "Advanced" means durably persisted.
617
+ }
618
+ return drained;
619
+ }
620
+ /**
621
+ * Central helper for transitioning a link to `repairing`. Encapsulates:
622
+ * - status change
623
+ * - optional gap context storage
624
+ * - repair kick-off with retry scheduling on failure
625
+ *
626
+ * All code paths that set `repairing` should go through this helper to
627
+ * guarantee a future retry path.
628
+ */
629
+ transitionToRepairing(linkKey, link, options) {
630
+ return __awaiter(this, void 0, void 0, function* () {
631
+ const prevStatus = link.status;
632
+ const prevConnectivity = link.connectivity;
633
+ link.connectivity = 'offline';
634
+ yield this.ledger.setStatus(link, 'repairing');
635
+ this.emitEvent({ type: 'link:status-change', tenantDid: link.tenantDid, remoteEndpoint: link.remoteEndpoint, protocol: link.protocol, from: prevStatus, to: 'repairing' });
636
+ if (prevConnectivity !== 'offline') {
637
+ this.emitEvent({ type: 'link:connectivity-change', tenantDid: link.tenantDid, remoteEndpoint: link.remoteEndpoint, protocol: link.protocol, from: prevConnectivity, to: 'offline' });
638
+ }
639
+ if (options === null || options === void 0 ? void 0 : options.resumeToken) {
640
+ this._repairContext.set(linkKey, { resumeToken: options.resumeToken });
641
+ }
642
+ // Clear runtime ordinals immediately — stale state must not linger
643
+ // across repair attempts.
644
+ const rt = this._linkRuntimes.get(linkKey);
645
+ if (rt) {
646
+ rt.inflight.clear();
647
+ rt.nextCommitOrdinal = rt.nextDeliveryOrdinal;
648
+ }
649
+ // Kick off repair with retry scheduling on failure.
650
+ void this.repairLink(linkKey).catch(() => {
651
+ this.scheduleRepairRetry(linkKey);
652
+ });
653
+ });
654
+ }
655
+ /**
656
+ * Schedule a retry for a failed repair. Uses exponential backoff.
657
+ * No-op if the link is already in `degraded_poll` (timer loop owns retries)
658
+ * or if a retry is already scheduled.
659
+ */
660
+ scheduleRepairRetry(linkKey) {
661
+ var _a;
662
+ // Don't schedule if already in degraded_poll or retry pending.
663
+ const link = this._activeLinks.get(linkKey);
664
+ if (!link || link.status === 'degraded_poll') {
665
+ return;
666
+ }
667
+ if (this._repairRetryTimers.has(linkKey)) {
668
+ return;
669
+ }
670
+ // attempts is already post-increment from doRepairLink, so subtract 1
671
+ // for the backoff index: first failure (attempts=1) → backoff[0]=1s.
672
+ const attempts = (_a = this._repairAttempts.get(linkKey)) !== null && _a !== void 0 ? _a : 1;
673
+ const backoff = SyncEngineLevel.REPAIR_BACKOFF_MS;
674
+ const delayMs = backoff[Math.min(attempts - 1, backoff.length - 1)];
675
+ const timerGeneration = this._syncGeneration;
676
+ const timer = setTimeout(() => __awaiter(this, void 0, void 0, function* () {
677
+ this._repairRetryTimers.delete(linkKey);
678
+ // Bail if teardown occurred since this timer was scheduled.
679
+ if (this._syncGeneration !== timerGeneration) {
680
+ return;
681
+ }
682
+ // Verify link still exists and is still repairing.
683
+ const currentLink = this._activeLinks.get(linkKey);
684
+ if (!currentLink || currentLink.status !== 'repairing') {
685
+ return;
686
+ }
687
+ try {
688
+ yield this.repairLink(linkKey);
689
+ }
690
+ catch (_a) {
691
+ // repairLink handles max attempts → degraded_poll internally.
692
+ // If still below max, schedule another retry.
693
+ if (currentLink.status === 'repairing') {
694
+ this.scheduleRepairRetry(linkKey);
695
+ }
696
+ }
697
+ }), delayMs);
698
+ this._repairRetryTimers.set(linkKey, timer);
699
+ }
700
+ /**
701
+ * Repair a single link. Deduplicates concurrent calls via `_activeRepairs`.
702
+ * If repair is already running for this link, returns the existing promise.
703
+ */
704
+ repairLink(linkKey) {
705
+ const existing = this._activeRepairs.get(linkKey);
706
+ if (existing) {
707
+ return existing;
708
+ }
709
+ const promise = this.doRepairLink(linkKey).finally(() => {
710
+ this._activeRepairs.delete(linkKey);
711
+ });
712
+ this._activeRepairs.set(linkKey, promise);
713
+ return promise;
714
+ }
715
+ /**
716
+ * Internal repair implementation. Runs SMT set reconciliation for a single
717
+ * link, then attempts to re-establish live subscriptions. If repair succeeds,
718
+ * transitions to `live`. If it fails, throws so callers (degraded_poll timer,
719
+ * startup) can handle retry scheduling.
720
+ */
721
+ doRepairLink(linkKey) {
722
+ return __awaiter(this, void 0, void 0, function* () {
723
+ var _a, _b, _c, _d;
724
+ const link = this._activeLinks.get(linkKey);
725
+ if (!link) {
726
+ return;
727
+ }
728
+ // Capture the sync generation at repair start. If teardown occurs during
729
+ // any await, the generation will have incremented and we bail before
730
+ // mutating state — preventing the race where repair continues after teardown.
731
+ const generation = this._syncGeneration;
732
+ const { tenantDid: did, remoteEndpoint: dwnUrl, delegateDid, protocol } = link;
733
+ this.emitEvent({ type: 'repair:started', tenantDid: did, remoteEndpoint: dwnUrl, protocol, attempt: ((_a = this._repairAttempts.get(linkKey)) !== null && _a !== void 0 ? _a : 0) + 1 });
734
+ const attempts = ((_b = this._repairAttempts.get(linkKey)) !== null && _b !== void 0 ? _b : 0) + 1;
735
+ this._repairAttempts.set(linkKey, attempts);
736
+ // Step 1: Close existing subscriptions FIRST to stop old events from
737
+ // mutating local state while repair runs.
738
+ yield this.closeLinkSubscriptions(link);
739
+ if (this._syncGeneration !== generation) {
740
+ return;
741
+ } // Teardown occurred.
742
+ // Step 2: Clear runtime ordinals immediately — stale state must not
743
+ // persist across repair attempts (successful or failed).
744
+ const rt = this.getOrCreateRuntime(linkKey);
745
+ rt.inflight.clear();
746
+ rt.nextDeliveryOrdinal = 0;
747
+ rt.nextCommitOrdinal = 0;
748
+ try {
749
+ // Step 3: Run SMT reconciliation for this link.
750
+ const localRoot = yield this.getLocalRoot(did, delegateDid, protocol);
751
+ if (this._syncGeneration !== generation) {
752
+ return;
753
+ }
754
+ const remoteRoot = yield this.getRemoteRoot(did, dwnUrl, delegateDid, protocol);
755
+ if (this._syncGeneration !== generation) {
756
+ return;
757
+ }
758
+ if (localRoot !== remoteRoot) {
759
+ const diff = yield this.diffWithRemote({ did, dwnUrl, delegateDid, protocol });
760
+ if (this._syncGeneration !== generation) {
761
+ return;
762
+ }
763
+ if (diff.onlyRemote.length > 0) {
764
+ const prefetched = [];
765
+ const needsFetchCids = [];
766
+ for (const entry of diff.onlyRemote) {
767
+ if (!entry.message || (entry.message.descriptor.interface === 'Records' &&
768
+ entry.message.descriptor.method === 'Write' &&
769
+ entry.message.descriptor.dataCid && !entry.encodedData)) {
770
+ needsFetchCids.push(entry.messageCid);
771
+ }
772
+ else {
773
+ prefetched.push(entry);
774
+ }
775
+ }
776
+ yield this.pullMessages({ did, dwnUrl, delegateDid, protocol, messageCids: needsFetchCids, prefetched });
777
+ if (this._syncGeneration !== generation) {
778
+ return;
779
+ }
780
+ }
781
+ if (diff.onlyLocal.length > 0) {
782
+ yield this.pushMessages({ did, dwnUrl, delegateDid, protocol, messageCids: diff.onlyLocal });
783
+ if (this._syncGeneration !== generation) {
784
+ return;
785
+ }
786
+ }
787
+ }
788
+ // Step 4: Determine the post-repair resume token.
789
+ // - If repair was triggered by ProgressGap, use the stored resumeToken
790
+ // (from gapInfo.latestAvailable) so the reopened subscription replays
791
+ // from a valid boundary, closing the race window between SMT and resubscribe.
792
+ // - Otherwise, use the existing contiguousAppliedToken if still valid.
793
+ // - Push checkpoint is NOT reset during repair: push frontier tracks what
794
+ // the local EventLog has delivered to the remote. SMT repair handles
795
+ // pull-side convergence; push-side convergence is handled by the diff's
796
+ // onlyLocal push. The push checkpoint remains the local authority.
797
+ const repairCtx = this._repairContext.get(linkKey);
798
+ const resumeToken = (_c = repairCtx === null || repairCtx === void 0 ? void 0 : repairCtx.resumeToken) !== null && _c !== void 0 ? _c : link.pull.contiguousAppliedToken;
799
+ ReplicationLedger.resetCheckpoint(link.pull, resumeToken);
800
+ yield this.ledger.saveLink(link);
801
+ if (this._syncGeneration !== generation) {
802
+ return;
803
+ }
804
+ // Step 5: Reopen subscriptions with the repaired checkpoints.
805
+ const target = { did, dwnUrl, delegateDid, protocol };
806
+ yield this.openLivePullSubscription(target);
807
+ if (this._syncGeneration !== generation) {
808
+ return;
809
+ }
810
+ try {
811
+ yield this.openLocalPushSubscription(Object.assign(Object.assign({}, target), { pushCursor: link.push.contiguousAppliedToken }));
812
+ }
813
+ catch (pushError) {
814
+ const pullSub = this._liveSubscriptions.find(s => s.did === did && s.dwnUrl === dwnUrl && s.protocol === protocol);
815
+ if (pullSub) {
816
+ try {
817
+ yield pullSub.close();
818
+ }
819
+ catch ( /* best effort */_e) { /* best effort */ }
820
+ this._liveSubscriptions = this._liveSubscriptions.filter(s => s !== pullSub);
821
+ }
822
+ throw pushError;
823
+ }
824
+ if (this._syncGeneration !== generation) {
825
+ return;
826
+ }
827
+ // Step 6: Clean up repair context and transition to live.
828
+ this._repairContext.delete(linkKey);
829
+ this._repairAttempts.delete(linkKey);
830
+ const retryTimer = this._repairRetryTimers.get(linkKey);
831
+ if (retryTimer) {
832
+ clearTimeout(retryTimer);
833
+ this._repairRetryTimers.delete(linkKey);
834
+ }
835
+ const prevRepairConnectivity = link.connectivity;
836
+ link.connectivity = 'online';
837
+ yield this.ledger.setStatus(link, 'live');
838
+ this.emitEvent({ type: 'repair:completed', tenantDid: did, remoteEndpoint: dwnUrl, protocol });
839
+ if (prevRepairConnectivity !== 'online') {
840
+ this.emitEvent({ type: 'link:connectivity-change', tenantDid: did, remoteEndpoint: dwnUrl, protocol, from: prevRepairConnectivity, to: 'online' });
841
+ }
842
+ this.emitEvent({ type: 'link:status-change', tenantDid: did, remoteEndpoint: dwnUrl, protocol, from: 'repairing', to: 'live' });
843
+ }
844
+ catch (error) {
845
+ // If teardown occurred during repair, don't retry or enter degraded_poll.
846
+ if (this._syncGeneration !== generation) {
847
+ return;
848
+ }
849
+ console.error(`SyncEngineLevel: Repair failed for ${did} -> ${dwnUrl} (attempt ${attempts})`, error);
850
+ this.emitEvent({ type: 'repair:failed', tenantDid: did, remoteEndpoint: dwnUrl, protocol, attempt: attempts, error: String((_d = error.message) !== null && _d !== void 0 ? _d : error) });
851
+ if (attempts >= SyncEngineLevel.MAX_REPAIR_ATTEMPTS) {
852
+ console.warn(`SyncEngineLevel: Max repair attempts reached for ${did} -> ${dwnUrl}, entering degraded_poll`);
853
+ yield this.enterDegradedPoll(linkKey);
854
+ return;
855
+ }
856
+ // Re-throw so callers (degraded_poll timer) can handle retry scheduling.
857
+ throw error;
858
+ }
859
+ });
860
+ }
861
+ /**
862
+ * Close pull and push subscriptions for a specific link.
863
+ */
864
+ closeLinkSubscriptions(link) {
865
+ return __awaiter(this, void 0, void 0, function* () {
866
+ const { tenantDid: did, remoteEndpoint: dwnUrl, protocol } = link;
867
+ // Close pull subscription.
868
+ const pullSub = this._liveSubscriptions.find(s => s.did === did && s.dwnUrl === dwnUrl && s.protocol === protocol);
869
+ if (pullSub) {
870
+ try {
871
+ yield pullSub.close();
872
+ }
873
+ catch ( /* best effort */_a) { /* best effort */ }
874
+ this._liveSubscriptions = this._liveSubscriptions.filter(s => s !== pullSub);
875
+ }
876
+ // Close local push subscription.
877
+ const pushSub = this._localSubscriptions.find(s => s.did === did && s.dwnUrl === dwnUrl && s.protocol === protocol);
878
+ if (pushSub) {
879
+ try {
880
+ yield pushSub.close();
881
+ }
882
+ catch ( /* best effort */_b) { /* best effort */ }
883
+ this._localSubscriptions = this._localSubscriptions.filter(s => s !== pushSub);
884
+ }
885
+ });
886
+ }
887
+ /**
888
+ * Transition a link to `degraded_poll` and start a per-link polling timer.
889
+ * The timer runs SMT reconciliation at a reduced frequency (30s with jitter)
890
+ * and attempts to re-establish live subscriptions after each successful repair.
891
+ */
892
+ enterDegradedPoll(linkKey) {
893
+ return __awaiter(this, void 0, void 0, function* () {
894
+ const link = this._activeLinks.get(linkKey);
895
+ if (!link) {
896
+ return;
897
+ }
898
+ link.connectivity = 'offline';
899
+ const prevDegradedStatus = link.status;
900
+ yield this.ledger.setStatus(link, 'degraded_poll');
901
+ this._repairAttempts.delete(linkKey);
902
+ this.emitEvent({ type: 'link:status-change', tenantDid: link.tenantDid, remoteEndpoint: link.remoteEndpoint, protocol: link.protocol, from: prevDegradedStatus, to: 'degraded_poll' });
903
+ this.emitEvent({ type: 'degraded-poll:entered', tenantDid: link.tenantDid, remoteEndpoint: link.remoteEndpoint, protocol: link.protocol });
904
+ // Clear any existing timer for this link.
905
+ const existing = this._degradedPollTimers.get(linkKey);
906
+ if (existing) {
907
+ clearInterval(existing);
908
+ }
909
+ // Schedule per-link polling with jitter (15-30 seconds).
910
+ const baseInterval = 15000;
911
+ const jitter = Math.floor(Math.random() * 15000);
912
+ const interval = baseInterval + jitter;
913
+ const pollGeneration = this._syncGeneration;
914
+ const timer = setInterval(() => __awaiter(this, void 0, void 0, function* () {
915
+ // Bail if teardown occurred since this timer was created.
916
+ if (this._syncGeneration !== pollGeneration) {
917
+ clearInterval(timer);
918
+ this._degradedPollTimers.delete(linkKey);
919
+ return;
920
+ }
921
+ // If the link was transitioned out of degraded_poll externally (e.g.,
922
+ // by teardown or manual intervention), stop polling.
923
+ if (link.status !== 'degraded_poll') {
924
+ clearInterval(timer);
925
+ this._degradedPollTimers.delete(linkKey);
926
+ return;
927
+ }
928
+ try {
929
+ // Attempt repair. Reset attempt counter so repairLink doesn't
930
+ // immediately re-enter degraded_poll on failure.
931
+ this._repairAttempts.set(linkKey, 0);
932
+ yield this.ledger.setStatus(link, 'repairing');
933
+ yield this.repairLink(linkKey);
934
+ // If repairLink succeeded, link is now 'live' — stop polling.
935
+ if (link.status === 'live') {
936
+ clearInterval(timer);
937
+ this._degradedPollTimers.delete(linkKey);
938
+ }
939
+ }
940
+ catch (_a) {
941
+ // Repair failed — restore degraded_poll status so the timer continues.
942
+ // This is critical: repairLink sets status to 'repairing' internally,
943
+ // and if we don't restore degraded_poll, the next tick would see
944
+ // status !== 'degraded_poll' and stop the timer permanently.
945
+ yield this.ledger.setStatus(link, 'degraded_poll');
946
+ }
947
+ }), interval);
948
+ this._degradedPollTimers.set(linkKey, timer);
949
+ });
950
+ }
384
951
  /**
385
952
  * Tears down all live subscriptions and push listeners.
386
953
  */
387
954
  teardownLiveSync() {
388
955
  return __awaiter(this, void 0, void 0, function* () {
956
+ // Increment generation to invalidate all in-flight async operations
957
+ // (repairs, retry timers, degraded-poll ticks). Any async work that
958
+ // captured the previous generation will bail on its next checkpoint.
959
+ this._syncGeneration++;
389
960
  // Clear the push debounce timer.
390
961
  if (this._pushDebounceTimer) {
391
962
  clearTimeout(this._pushDebounceTimer);
@@ -413,6 +984,23 @@ export class SyncEngineLevel {
413
984
  }
414
985
  }
415
986
  this._localSubscriptions = [];
987
+ // Clear degraded-poll timers and repair state.
988
+ for (const timer of this._degradedPollTimers.values()) {
989
+ clearInterval(timer);
990
+ }
991
+ this._degradedPollTimers.clear();
992
+ this._repairAttempts.clear();
993
+ this._activeRepairs.clear();
994
+ for (const timer of this._repairRetryTimers.values()) {
995
+ clearTimeout(timer);
996
+ }
997
+ this._repairRetryTimers.clear();
998
+ this._repairContext.clear();
999
+ // Clear closure evaluation contexts.
1000
+ this._closureContexts.clear();
1001
+ // Clear the in-memory link and runtime state.
1002
+ this._activeLinks.clear();
1003
+ this._linkRuntimes.clear();
416
1004
  });
417
1005
  }
418
1006
  // ---------------------------------------------------------------------------
@@ -424,12 +1012,25 @@ export class SyncEngineLevel {
424
1012
  */
425
1013
  openLivePullSubscription(target) {
426
1014
  return __awaiter(this, void 0, void 0, function* () {
1015
+ var _a, _b;
427
1016
  const { did, delegateDid, dwnUrl, protocol } = target;
428
- // Resolve the cursor from the last session (if any).
1017
+ // Resolve the cursor from the link's pull checkpoint (preferred) or legacy storage.
429
1018
  const cursorKey = this.buildCursorKey(did, dwnUrl, protocol);
430
- const cursor = yield this.getCursor(cursorKey);
1019
+ const link = this._activeLinks.get(cursorKey);
1020
+ const cursor = (_a = link === null || link === void 0 ? void 0 : link.pull.contiguousAppliedToken) !== null && _a !== void 0 ? _a : yield this.getCursor(cursorKey);
431
1021
  // Build the MessagesSubscribe filters.
432
- const filters = protocol ? [{ protocol }] : [];
1022
+ // When the link has protocolPathPrefixes, include them in the filter so the
1023
+ // EventLog delivers only matching events (server-side filtering). This replaces
1024
+ // the less efficient agent-side isEventInScope filtering for the pull path.
1025
+ // Note: only the first prefix is used as the MessagesFilter field because
1026
+ // MessagesFilter.protocolPathPrefix is a single string. Multiple prefixes
1027
+ // would need multiple filters (OR semantics) — for now we use the first one.
1028
+ const protocolPathPrefix = (link === null || link === void 0 ? void 0 : link.scope.kind) === 'protocol'
1029
+ ? (_b = link.scope.protocolPathPrefixes) === null || _b === void 0 ? void 0 : _b[0]
1030
+ : undefined;
1031
+ const filters = protocol
1032
+ ? [Object.assign({ protocol }, (protocolPathPrefix ? { protocolPathPrefix } : {}))]
1033
+ : [];
433
1034
  // Look up permission grant for MessagesSubscribe if using a delegate.
434
1035
  // The unified scope matching in AgentPermissionsApi accepts a
435
1036
  // Messages.Read grant for MessagesSubscribe requests, so a single
@@ -446,15 +1047,83 @@ export class SyncEngineLevel {
446
1047
  permissionGrantId = grant.grant.id;
447
1048
  }
448
1049
  // Define the subscription handler that processes incoming events.
1050
+ // NOTE: The WebSocket client fires handlers without awaiting (fire-and-forget),
1051
+ // so multiple handlers can be in-flight concurrently. The ordinal tracker
1052
+ // ensures the checkpoint advances only when all earlier deliveries are committed.
449
1053
  const subscriptionHandler = (subMessage) => __awaiter(this, void 0, void 0, function* () {
450
1054
  if (subMessage.type === 'eose') {
451
- // End-of-stored-events — catch-up complete, persist cursor.
452
- yield this.setCursor(cursorKey, subMessage.cursor);
453
- this._connectivityState = 'online';
1055
+ // End-of-stored-events — catch-up complete.
1056
+ if (link) {
1057
+ // Guard: if the link transitioned to repairing while catch-up events
1058
+ // were being processed, skip all mutations — repair owns the state now.
1059
+ if (link.status !== 'live' && link.status !== 'initializing') {
1060
+ return;
1061
+ }
1062
+ if (!ReplicationLedger.validateTokenDomain(link.pull, subMessage.cursor)) {
1063
+ console.warn(`SyncEngineLevel: Token domain mismatch on EOSE for ${did} -> ${dwnUrl}, transitioning to repairing`);
1064
+ yield this.transitionToRepairing(cursorKey, link);
1065
+ return;
1066
+ }
1067
+ ReplicationLedger.setReceivedToken(link.pull, subMessage.cursor);
1068
+ // Drain committed entries. Do NOT unconditionally advance to the
1069
+ // EOSE cursor — earlier stored events may still be in-flight
1070
+ // (handlers are fire-and-forget). The checkpoint advances only as
1071
+ // far as the contiguous drain reaches.
1072
+ this.drainCommittedPull(cursorKey);
1073
+ yield this.ledger.saveLink(link);
1074
+ }
1075
+ else {
1076
+ yield this.setCursor(cursorKey, subMessage.cursor);
1077
+ }
1078
+ // Transport is reachable — set connectivity to online.
1079
+ if (link) {
1080
+ const prevEoseConnectivity = link.connectivity;
1081
+ link.connectivity = 'online';
1082
+ if (prevEoseConnectivity !== 'online') {
1083
+ this.emitEvent({ type: 'link:connectivity-change', tenantDid: did, remoteEndpoint: dwnUrl, protocol, from: prevEoseConnectivity, to: 'online' });
1084
+ }
1085
+ }
1086
+ else {
1087
+ this._connectivityState = 'online';
1088
+ }
454
1089
  return;
455
1090
  }
456
1091
  if (subMessage.type === 'event') {
457
1092
  const event = subMessage.event;
1093
+ // Guard: if the link is not live (e.g., repairing, degraded_poll, paused),
1094
+ // skip all processing. Old subscription handlers may still fire after the
1095
+ // link transitions — these events should be ignored entirely, not just
1096
+ // skipped at the checkpoint level.
1097
+ if (link && link.status !== 'live' && link.status !== 'initializing') {
1098
+ return;
1099
+ }
1100
+ // Domain validation: reject tokens from a different stream/epoch.
1101
+ if (link && !ReplicationLedger.validateTokenDomain(link.pull, subMessage.cursor)) {
1102
+ console.warn(`SyncEngineLevel: Token domain mismatch for ${did} -> ${dwnUrl}, transitioning to repairing`);
1103
+ yield this.transitionToRepairing(cursorKey, link);
1104
+ return;
1105
+ }
1106
+ // Subset scope filtering: if the link has protocolPath/contextId prefixes,
1107
+ // skip events that don't match. This is agent-side filtering because
1108
+ // MessagesSubscribe only supports protocol-level filtering today.
1109
+ //
1110
+ // Skipped events MUST advance contiguousAppliedToken — otherwise the
1111
+ // link would replay the same filtered-out events indefinitely after
1112
+ // reconnect/repair. This is safe because the event is intentionally
1113
+ // excluded from this scope and doesn't need processing.
1114
+ if (link && !isEventInScope(event.message, link.scope)) {
1115
+ ReplicationLedger.setReceivedToken(link.pull, subMessage.cursor);
1116
+ ReplicationLedger.commitContiguousToken(link.pull, subMessage.cursor);
1117
+ yield this.ledger.saveLink(link);
1118
+ return;
1119
+ }
1120
+ // Assign a delivery ordinal BEFORE async processing begins.
1121
+ // This captures the delivery order even if processing completes out of order.
1122
+ const rt = link ? this.getOrCreateRuntime(cursorKey) : undefined;
1123
+ const ordinal = rt ? rt.nextDeliveryOrdinal++ : -1;
1124
+ if (rt) {
1125
+ rt.inflight.set(ordinal, { ordinal, token: subMessage.cursor, committed: false });
1126
+ }
458
1127
  try {
459
1128
  // Extract inline data from the event (available for records <= 30 KB).
460
1129
  let dataStream = this.extractDataStream(event);
@@ -473,13 +1142,89 @@ export class SyncEngineLevel {
473
1142
  }
474
1143
  }
475
1144
  yield this.agent.dwn.processRawMessage(did, event.message, { dataStream });
476
- // Only advance the cursor after successful processing.
477
- // If processing fails, the event will be re-delivered on
478
- // reconnection (cursor-based resume from the last good point).
479
- yield this.setCursor(cursorKey, subMessage.cursor);
1145
+ // Invalidate closure cache entries that may be affected by this message.
1146
+ // Must run before closure validation so subsequent evaluations in the
1147
+ // same session see the updated local state.
1148
+ const closureCtxForInvalidation = this._closureContexts.get(did);
1149
+ if (closureCtxForInvalidation) {
1150
+ invalidateClosureCache(closureCtxForInvalidation, event.message);
1151
+ }
1152
+ // Closure validation for scoped subset sync (Phase 3).
1153
+ // For protocol-scoped links, verify that all hard dependencies for
1154
+ // this operation are locally present before considering it committed.
1155
+ // Full-tenant scope bypasses this entirely (returns complete with 0 queries).
1156
+ if (link && link.scope.kind === 'protocol') {
1157
+ const messageStore = this.agent.dwn.node.storage.messageStore;
1158
+ let closureCtx = this._closureContexts.get(did);
1159
+ if (!closureCtx) {
1160
+ closureCtx = createClosureContext(did);
1161
+ this._closureContexts.set(did, closureCtx);
1162
+ }
1163
+ const closureResult = yield evaluateClosure(event.message, messageStore, link.scope, closureCtx);
1164
+ if (!closureResult.complete) {
1165
+ console.warn(`SyncEngineLevel: Closure incomplete for ${did} -> ${dwnUrl}: ` +
1166
+ `${closureResult.failure.code} — ${closureResult.failure.detail}`);
1167
+ yield this.transitionToRepairing(cursorKey, link);
1168
+ return;
1169
+ }
1170
+ }
1171
+ // Squash convergence: processRawMessage triggers the DWN's built-in
1172
+ // squash resumable task (performRecordsSquash) which runs inline and
1173
+ // handles subset consumers correctly:
1174
+ // - If older siblings are locally present → purges them
1175
+ // - If squash arrives before older siblings → backstop rejects them (409)
1176
+ // - If no older siblings are local → no-op (correct)
1177
+ // Both sync orderings (squash-first or siblings-first) converge to
1178
+ // the same final state. No additional sync-engine side-effect is needed.
1179
+ // Track this CID for echo-loop suppression, scoped to the source endpoint.
1180
+ const pulledCid = yield Message.getCid(event.message);
1181
+ this._recentlyPulledCids.set(`${pulledCid}|${dwnUrl}`, Date.now() + SyncEngineLevel.ECHO_SUPPRESS_TTL_MS);
1182
+ this.evictExpiredEchoEntries();
1183
+ // Mark this ordinal as committed and drain the checkpoint.
1184
+ // Guard: if the link transitioned to repairing while this handler was
1185
+ // in-flight (e.g., an earlier ordinal's handler failed concurrently),
1186
+ // skip all state mutations — the repair process owns progression now.
1187
+ if (link && rt && link.status === 'live') {
1188
+ const entry = rt.inflight.get(ordinal);
1189
+ if (entry) {
1190
+ entry.committed = true;
1191
+ }
1192
+ ReplicationLedger.setReceivedToken(link.pull, subMessage.cursor);
1193
+ const drained = this.drainCommittedPull(cursorKey);
1194
+ if (drained > 0) {
1195
+ yield this.ledger.saveLink(link);
1196
+ // Emit after durable save — "advanced" means persisted.
1197
+ if (link.pull.contiguousAppliedToken) {
1198
+ this.emitEvent({
1199
+ type: 'checkpoint:pull-advance',
1200
+ tenantDid: link.tenantDid,
1201
+ remoteEndpoint: link.remoteEndpoint,
1202
+ protocol: link.protocol,
1203
+ position: link.pull.contiguousAppliedToken.position,
1204
+ messageCid: link.pull.contiguousAppliedToken.messageCid,
1205
+ });
1206
+ }
1207
+ }
1208
+ // Overflow: too many in-flight ordinals without draining.
1209
+ if (rt.inflight.size > MAX_PENDING_TOKENS) {
1210
+ console.warn(`SyncEngineLevel: Pull in-flight overflow for ${did} -> ${dwnUrl}, transitioning to repairing`);
1211
+ yield this.transitionToRepairing(cursorKey, link);
1212
+ }
1213
+ }
1214
+ else if (!link) {
1215
+ // Legacy path: no link available, use simple cursor persistence.
1216
+ yield this.setCursor(cursorKey, subMessage.cursor);
1217
+ }
480
1218
  }
481
1219
  catch (error) {
482
1220
  console.error(`SyncEngineLevel: Error processing live-pull event for ${did}`, error);
1221
+ // A failed processRawMessage means local state is incomplete.
1222
+ // Transition to repairing immediately — do NOT advance the checkpoint
1223
+ // past this failure or let later ordinals commit past it. SMT
1224
+ // reconciliation will discover and fill the gap.
1225
+ if (link) {
1226
+ yield this.transitionToRepairing(cursorKey, link);
1227
+ }
483
1228
  }
484
1229
  }
485
1230
  });
@@ -502,7 +1247,10 @@ export class SyncEngineLevel {
502
1247
  // Build a resubscribe factory so the WebSocket client can resume with
503
1248
  // a fresh cursor-stamped message after reconnection.
504
1249
  const resubscribeFactory = (resumeCursor) => __awaiter(this, void 0, void 0, function* () {
505
- const resumeRequest = Object.assign(Object.assign({}, subscribeRequest), { messageParams: Object.assign(Object.assign({}, subscribeRequest.messageParams), { cursor: resumeCursor !== null && resumeCursor !== void 0 ? resumeCursor : cursor }) });
1250
+ var _a;
1251
+ // On reconnect, use the latest durable checkpoint position if available.
1252
+ const effectiveCursor = (_a = resumeCursor !== null && resumeCursor !== void 0 ? resumeCursor : link === null || link === void 0 ? void 0 : link.pull.contiguousAppliedToken) !== null && _a !== void 0 ? _a : cursor;
1253
+ const resumeRequest = Object.assign(Object.assign({}, subscribeRequest), { messageParams: Object.assign(Object.assign({}, subscribeRequest.messageParams), { cursor: effectiveCursor }) });
506
1254
  const { message: resumeMsg } = yield this.agent.dwn.processRequest(resumeRequest);
507
1255
  if (!resumeMsg) {
508
1256
  throw new Error(`SyncEngineLevel: Failed to construct resume MessagesSubscribe for ${dwnUrl}`);
@@ -522,9 +1270,15 @@ export class SyncEngineLevel {
522
1270
  resubscribeFactory,
523
1271
  },
524
1272
  });
1273
+ if (reply.status.code === 410) {
1274
+ // ProgressGap — the cursor is no longer replayable. The link needs repair.
1275
+ const gapError = new Error(`SyncEngineLevel: ProgressGap for ${did} -> ${dwnUrl}: ${reply.status.detail}`);
1276
+ gapError.isProgressGap = true;
1277
+ gapError.gapInfo = reply.error;
1278
+ throw gapError;
1279
+ }
525
1280
  if (reply.status.code !== 200 || !reply.subscription) {
526
- console.error(`SyncEngineLevel: MessagesSubscribe failed for ${did} -> ${dwnUrl}: ${reply.status.code} ${reply.status.detail}`);
527
- return;
1281
+ throw new Error(`SyncEngineLevel: MessagesSubscribe failed for ${did} -> ${dwnUrl}: ${reply.status.code} ${reply.status.detail}`);
528
1282
  }
529
1283
  this._liveSubscriptions.push({
530
1284
  did,
@@ -533,7 +1287,15 @@ export class SyncEngineLevel {
533
1287
  protocol,
534
1288
  close: () => __awaiter(this, void 0, void 0, function* () { yield reply.subscription.close(); }),
535
1289
  });
536
- this._connectivityState = 'online';
1290
+ // Set per-link connectivity to online after successful subscription setup.
1291
+ const pullLink = this._activeLinks.get(this.buildCursorKey(did, dwnUrl, protocol));
1292
+ if (pullLink) {
1293
+ const prevPullConnectivity = pullLink.connectivity;
1294
+ pullLink.connectivity = 'online';
1295
+ if (prevPullConnectivity !== 'online') {
1296
+ this.emitEvent({ type: 'link:connectivity-change', tenantDid: did, remoteEndpoint: dwnUrl, protocol, from: prevPullConnectivity, to: 'online' });
1297
+ }
1298
+ }
537
1299
  });
538
1300
  }
539
1301
  // ---------------------------------------------------------------------------
@@ -565,18 +1327,46 @@ export class SyncEngineLevel {
565
1327
  if (subMessage.type !== 'event') {
566
1328
  return;
567
1329
  }
1330
+ // Subset scope filtering for push: only push events that match the
1331
+ // link's scope prefixes. Events outside the scope are not our responsibility.
1332
+ // Skipped events MUST advance the push checkpoint to prevent infinite
1333
+ // replay after repair/reconnect (same reason as the pull side).
1334
+ const pushLink = this._activeLinks.get(this.buildCursorKey(did, dwnUrl, protocol));
1335
+ if (pushLink && !isEventInScope(subMessage.event.message, pushLink.scope)) {
1336
+ // Guard: only mutate durable state when the link is live/initializing.
1337
+ // During repair/degraded_poll, orchestration owns checkpoint progression.
1338
+ if (pushLink.status !== 'live' && pushLink.status !== 'initializing') {
1339
+ return;
1340
+ }
1341
+ // Validate token domain before committing — a stream/epoch mismatch
1342
+ // on the local EventLog should trigger repair, not silently overwrite.
1343
+ if (!ReplicationLedger.validateTokenDomain(pushLink.push, subMessage.cursor)) {
1344
+ yield this.transitionToRepairing(this.buildCursorKey(did, dwnUrl, protocol), pushLink);
1345
+ return;
1346
+ }
1347
+ ReplicationLedger.setReceivedToken(pushLink.push, subMessage.cursor);
1348
+ ReplicationLedger.commitContiguousToken(pushLink.push, subMessage.cursor);
1349
+ yield this.ledger.saveLink(pushLink);
1350
+ return;
1351
+ }
568
1352
  // Accumulate the message CID for a debounced push.
569
1353
  const targetKey = this.buildCursorKey(did, dwnUrl, protocol);
570
1354
  const cid = yield Message.getCid(subMessage.event.message);
571
1355
  if (cid === undefined) {
572
1356
  return;
573
1357
  }
1358
+ // Echo-loop suppression: skip CIDs that were recently pulled from this
1359
+ // specific remote. A message pulled from Provider A is only suppressed
1360
+ // for push to A — it still fans out to Provider B and C.
1361
+ if (this.isRecentlyPulled(cid, dwnUrl)) {
1362
+ return;
1363
+ }
574
1364
  let pending = this._pendingPushCids.get(targetKey);
575
1365
  if (!pending) {
576
- pending = { did, dwnUrl, delegateDid, protocol, cids: [] };
1366
+ pending = { did, dwnUrl, delegateDid, protocol, entries: [] };
577
1367
  this._pendingPushCids.set(targetKey, pending);
578
1368
  }
579
- pending.cids.push(cid);
1369
+ pending.entries.push({ cid, localToken: subMessage.cursor });
580
1370
  // Debounce the push.
581
1371
  if (this._pushDebounceTimer) {
582
1372
  clearTimeout(this._pushDebounceTimer);
@@ -586,18 +1376,20 @@ export class SyncEngineLevel {
586
1376
  }, PUSH_DEBOUNCE_MS);
587
1377
  });
588
1378
  // Process the local subscription request.
1379
+ // When a push cursor is provided (e.g., after repair), the local subscription
1380
+ // replays events from that position, closing the race window where local
1381
+ // writes during repair would otherwise be missed by push-on-write.
589
1382
  const response = yield this.agent.dwn.processRequest({
590
1383
  author: did,
591
1384
  target: did,
592
1385
  messageType: DwnInterface.MessagesSubscribe,
593
1386
  granteeDid: delegateDid,
594
- messageParams: { filters, permissionGrantId },
1387
+ messageParams: { filters, permissionGrantId, cursor: target.pushCursor },
595
1388
  subscriptionHandler: subscriptionHandler,
596
1389
  });
597
1390
  const reply = response.reply;
598
1391
  if (reply.status.code !== 200 || !reply.subscription) {
599
- console.error(`SyncEngineLevel: Local MessagesSubscribe failed for ${did}: ${reply.status.code} ${reply.status.detail}`);
600
- return;
1392
+ throw new Error(`SyncEngineLevel: Local MessagesSubscribe failed for ${did}: ${reply.status.code} ${reply.status.detail}`);
601
1393
  }
602
1394
  this._localSubscriptions.push({
603
1395
  did,
@@ -614,38 +1406,89 @@ export class SyncEngineLevel {
614
1406
  flushPendingPushes() {
615
1407
  return __awaiter(this, void 0, void 0, function* () {
616
1408
  this._pushDebounceTimer = undefined;
617
- const entries = [...this._pendingPushCids.entries()];
1409
+ const batches = [...this._pendingPushCids.entries()];
618
1410
  this._pendingPushCids.clear();
619
1411
  // Push to all endpoints in parallel — each target is independent.
620
- yield Promise.all(entries.map((_a) => __awaiter(this, [_a], void 0, function* ([, pending]) {
621
- const { did, dwnUrl, delegateDid, protocol, cids } = pending;
622
- if (cids.length === 0) {
1412
+ yield Promise.all(batches.map((_a) => __awaiter(this, [_a], void 0, function* ([targetKey, pending]) {
1413
+ const { did, dwnUrl, delegateDid, protocol, entries: pushEntries } = pending;
1414
+ if (pushEntries.length === 0) {
623
1415
  return;
624
1416
  }
1417
+ const cids = pushEntries.map(e => e.cid);
625
1418
  try {
626
- yield pushMessages({
1419
+ const result = yield pushMessages({
627
1420
  did, dwnUrl, delegateDid, protocol,
628
1421
  messageCids: cids,
629
1422
  agent: this.agent,
630
1423
  permissionsApi: this._permissionsApi,
631
1424
  });
1425
+ // Advance the push checkpoint for successfully pushed entries.
1426
+ // Push is sequential (single batch, in-order processing) so we can
1427
+ // commit directly without ordinal tracking — there's no concurrent
1428
+ // completion to reorder.
1429
+ const link = this._activeLinks.get(targetKey);
1430
+ if (link) {
1431
+ const succeededSet = new Set(result.succeeded);
1432
+ // Track highest contiguous success: if a CID fails, we stop advancing.
1433
+ let hitFailure = false;
1434
+ for (const entry of pushEntries) {
1435
+ if (hitFailure) {
1436
+ break;
1437
+ }
1438
+ if (succeededSet.has(entry.cid) && entry.localToken) {
1439
+ if (!ReplicationLedger.validateTokenDomain(link.push, entry.localToken)) {
1440
+ console.warn(`SyncEngineLevel: Push checkpoint domain mismatch for ${did} -> ${dwnUrl}, transitioning to repairing`);
1441
+ yield this.transitionToRepairing(targetKey, link);
1442
+ break;
1443
+ }
1444
+ ReplicationLedger.setReceivedToken(link.push, entry.localToken);
1445
+ ReplicationLedger.commitContiguousToken(link.push, entry.localToken);
1446
+ }
1447
+ else {
1448
+ // This CID failed or had no token — stop advancing.
1449
+ hitFailure = true;
1450
+ }
1451
+ }
1452
+ yield this.ledger.saveLink(link);
1453
+ }
1454
+ // Re-queue only TRANSIENT failures for retry. Permanent failures (400/401/403)
1455
+ // are dropped — they will never succeed regardless of retry.
1456
+ if (result.failed.length > 0) {
1457
+ console.error(`SyncEngineLevel: Push-on-write failed for ${did} -> ${dwnUrl}: ` +
1458
+ `${result.failed.length} transient failures of ${cids.length} messages`);
1459
+ const failedSet = new Set(result.failed);
1460
+ const failedEntries = pushEntries.filter(e => failedSet.has(e.cid));
1461
+ let requeued = this._pendingPushCids.get(targetKey);
1462
+ if (!requeued) {
1463
+ requeued = { did, dwnUrl, delegateDid, protocol, entries: [] };
1464
+ this._pendingPushCids.set(targetKey, requeued);
1465
+ }
1466
+ requeued.entries.push(...failedEntries);
1467
+ // Schedule a retry after a short delay.
1468
+ if (!this._pushDebounceTimer) {
1469
+ this._pushDebounceTimer = setTimeout(() => {
1470
+ void this.flushPendingPushes();
1471
+ }, PUSH_DEBOUNCE_MS * 4);
1472
+ }
1473
+ }
1474
+ // Permanent failures are logged by pushMessages but NOT re-queued.
1475
+ // They will be rediscovered by the next SMT integrity check if the
1476
+ // local/remote state has changed, but won't spin in a retry loop.
632
1477
  }
633
1478
  catch (error) {
1479
+ // Truly unexpected error (not per-message failure). Re-queue entire
1480
+ // batch so entries aren't silently dropped from the debounce queue.
634
1481
  console.error(`SyncEngineLevel: Push-on-write failed for ${did} -> ${dwnUrl}`, error);
635
- // Re-queue the failed CIDs so they are retried on the next
636
- // debounce cycle (or picked up by the SMT integrity check).
637
- const targetKey = this.buildCursorKey(did, dwnUrl, protocol);
638
1482
  let requeued = this._pendingPushCids.get(targetKey);
639
1483
  if (!requeued) {
640
- requeued = { did, dwnUrl, delegateDid, protocol, cids: [] };
1484
+ requeued = { did, dwnUrl, delegateDid, protocol, entries: [] };
641
1485
  this._pendingPushCids.set(targetKey, requeued);
642
1486
  }
643
- requeued.cids.push(...cids);
644
- // Schedule a retry after a short delay.
1487
+ requeued.entries.push(...pushEntries);
645
1488
  if (!this._pushDebounceTimer) {
646
1489
  this._pushDebounceTimer = setTimeout(() => {
647
1490
  void this.flushPendingPushes();
648
- }, PUSH_DEBOUNCE_MS * 4); // Back off: 1 second instead of 250ms.
1491
+ }, PUSH_DEBOUNCE_MS * 4);
649
1492
  }
650
1493
  }
651
1494
  })));
@@ -658,11 +1501,31 @@ export class SyncEngineLevel {
658
1501
  const base = `${did}${CURSOR_SEPARATOR}${dwnUrl}`;
659
1502
  return protocol ? `${base}${CURSOR_SEPARATOR}${protocol}` : base;
660
1503
  }
1504
+ /**
1505
+ * Retrieves a stored progress token. Handles migration from old string cursors:
1506
+ * if the stored value is a bare string (pre-ProgressToken format), it is treated
1507
+ * as absent — the sync engine will do a full SMT reconciliation on first startup
1508
+ * after upgrade, which is correct and safe.
1509
+ */
661
1510
  getCursor(key) {
662
1511
  return __awaiter(this, void 0, void 0, function* () {
663
1512
  const cursors = this._db.sublevel('syncCursors');
664
1513
  try {
665
- return yield cursors.get(key);
1514
+ const raw = yield cursors.get(key);
1515
+ try {
1516
+ const parsed = JSON.parse(raw);
1517
+ if (parsed && typeof parsed === 'object' &&
1518
+ typeof parsed.streamId === 'string' &&
1519
+ typeof parsed.epoch === 'string' &&
1520
+ typeof parsed.position === 'string' &&
1521
+ typeof parsed.messageCid === 'string') {
1522
+ return parsed;
1523
+ }
1524
+ }
1525
+ catch (_a) {
1526
+ // Not valid JSON (old string cursor) — treat as absent.
1527
+ }
1528
+ return undefined;
666
1529
  }
667
1530
  catch (error) {
668
1531
  const e = error;
@@ -676,7 +1539,7 @@ export class SyncEngineLevel {
676
1539
  setCursor(key, cursor) {
677
1540
  return __awaiter(this, void 0, void 0, function* () {
678
1541
  const cursors = this._db.sublevel('syncCursors');
679
- yield cursors.put(key, cursor);
1542
+ yield cursors.put(key, JSON.stringify(cursor));
680
1543
  });
681
1544
  }
682
1545
  // ---------------------------------------------------------------------------
@@ -1012,6 +1875,51 @@ export class SyncEngineLevel {
1012
1875
  });
1013
1876
  });
1014
1877
  }
1878
+ // ---------------------------------------------------------------------------
1879
+ // Echo-loop suppression
1880
+ // ---------------------------------------------------------------------------
1881
+ /**
1882
+ * Evicts expired entries from the echo-loop suppression cache.
1883
+ * Also enforces the size cap by evicting oldest entries first.
1884
+ */
1885
+ evictExpiredEchoEntries() {
1886
+ const now = Date.now();
1887
+ // Evict expired entries.
1888
+ for (const [cid, expiry] of this._recentlyPulledCids) {
1889
+ if (now >= expiry) {
1890
+ this._recentlyPulledCids.delete(cid);
1891
+ }
1892
+ }
1893
+ // Enforce size cap by evicting oldest entries.
1894
+ if (this._recentlyPulledCids.size > SyncEngineLevel.ECHO_SUPPRESS_MAX_ENTRIES) {
1895
+ const excess = this._recentlyPulledCids.size - SyncEngineLevel.ECHO_SUPPRESS_MAX_ENTRIES;
1896
+ let evicted = 0;
1897
+ for (const key of this._recentlyPulledCids.keys()) {
1898
+ if (evicted >= excess) {
1899
+ break;
1900
+ }
1901
+ this._recentlyPulledCids.delete(key);
1902
+ evicted++;
1903
+ }
1904
+ }
1905
+ }
1906
+ /**
1907
+ * Checks whether a CID was recently pulled from a specific remote endpoint
1908
+ * and should not be pushed back to that same endpoint (echo-loop suppression).
1909
+ * Does not suppress pushes to other endpoints — multi-provider fan-out works.
1910
+ */
1911
+ isRecentlyPulled(cid, dwnUrl) {
1912
+ const key = `${cid}|${dwnUrl}`;
1913
+ const expiry = this._recentlyPulledCids.get(key);
1914
+ if (expiry === undefined) {
1915
+ return false;
1916
+ }
1917
+ if (Date.now() >= expiry) {
1918
+ this._recentlyPulledCids.delete(key);
1919
+ return false;
1920
+ }
1921
+ return true;
1922
+ }
1015
1923
  /**
1016
1924
  * Reads missing messages from the local DWN and pushes them to the remote DWN
1017
1925
  * in dependency order (topological sort).
@@ -1103,8 +2011,19 @@ export class SyncEngineLevel {
1103
2011
  });
1104
2012
  }
1105
2013
  }
2014
+ /** TTL for echo-loop suppression entries (60 seconds). */
2015
+ SyncEngineLevel.ECHO_SUPPRESS_TTL_MS = 60000;
2016
+ /** Maximum entries in the echo-loop suppression cache. */
2017
+ SyncEngineLevel.ECHO_SUPPRESS_MAX_ENTRIES = 10000;
1106
2018
  /** Maximum consecutive failures before entering backoff. */
1107
2019
  SyncEngineLevel.MAX_CONSECUTIVE_FAILURES = 5;
1108
2020
  /** Backoff multiplier for consecutive failures (caps at 4x the configured interval). */
1109
2021
  SyncEngineLevel.MAX_BACKOFF_MULTIPLIER = 4;
2022
+ // ---------------------------------------------------------------------------
2023
+ // Per-link repair and degraded-poll orchestration (Phase 2)
2024
+ // ---------------------------------------------------------------------------
2025
+ /** Maximum consecutive repair attempts before falling back to degraded_poll. */
2026
+ SyncEngineLevel.MAX_REPAIR_ATTEMPTS = 3;
2027
+ /** Backoff schedule for repair retries (milliseconds). */
2028
+ SyncEngineLevel.REPAIR_BACKOFF_MS = [1000, 3000, 10000];
1110
2029
  //# sourceMappingURL=sync-engine-level.js.map