@pylonsync/sync 0.3.252 → 0.3.254

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -3,7 +3,7 @@
3
3
  "publishConfig": {
4
4
  "access": "public"
5
5
  },
6
- "version": "0.3.252",
6
+ "version": "0.3.254",
7
7
  "type": "module",
8
8
  "main": "src/index.ts",
9
9
  "types": "src/index.ts",
package/src/index.ts CHANGED
@@ -308,6 +308,22 @@ export class SyncEngine {
308
308
  */
309
309
  private applyQueue: Promise<void> = Promise.resolve();
310
310
 
311
+ /**
312
+ * Live-event hold buffer, active ONLY while a from-zero snapshot pull is in
313
+ * flight. A snapshot is full state as-of `snapshot_seq` S; its rows arrive
314
+ * tagged `seq = S`. If a live WS frame (or a tab broadcast) at `seq = S+k`
315
+ * applies FIRST — during the snapshot's (possibly multi-page) HTTP fetch — it
316
+ * advances the cursor past S, and then EVERY snapshot row (seq ≤ S) is
317
+ * dropped by the monotonic filter in enqueueApply, leaving a near-empty
318
+ * replica with the cursor persisted ahead (no 410, no heal until a reconcile
319
+ * happens to fire). The store has no per-row seq guard, so we can't just
320
+ * apply the snapshot unconditionally — an older snapshot row would clobber a
321
+ * newer live update. So we instead ORDER them: hold live/broadcast applies
322
+ * here while snapshotting, then replay them (seq-filtered) AFTER the snapshot
323
+ * lands. null = not snapshotting → normal apply.
324
+ */
325
+ private snapshotHold: ChangeEvent[] | null = null;
326
+
311
327
  /**
312
328
  * Serialized channel for outbound network ops (pull, push, reconcile,
313
329
  * refresh, resetReplica). Replaces the per-op `inFlightX` mutexes +
@@ -596,12 +612,15 @@ export class SyncEngine {
596
612
  const mqPersistence = new IndexedDBMutationPersistence(persistence);
597
613
  this.mutations.attachPersistence(mqPersistence);
598
614
  await this.mutations.hydrate();
599
- // Fire-and-forget the actual mutation HTTP calls happen
600
- // async, and we don't want to block engine startup on them.
601
- // pull()/reconcile() below run in parallel; push()'s
602
- // mutations carry op_ids so racing the broadcasts won't
603
- // double-apply.
604
- void this.push();
615
+ // The hydrated offline writes are drained in the leader path
616
+ // below (after `initMultiTab` settles), NOT here. We're still
617
+ // pre-election at this point, so `isMultiTabLeader` is false
618
+ // and a push() now would hit the follower branch — broadcasting
619
+ // the batch to a not-yet-constructed orchestrator (a silent
620
+ // no-op) and stranding every offline write until an unrelated
621
+ // mutation happened to fire push() again. The drain moved to
622
+ // the leader path so it runs once we actually own the network.
623
+ // Test: `hydrated offline writes are pushed once leader-elected`.
605
624
  } catch {
606
625
  // Queue persistence optional — memory-only still works.
607
626
  }
@@ -670,6 +689,20 @@ export class SyncEngine {
670
689
  await this.refreshResolvedSession();
671
690
  }
672
691
 
692
+ // Drain hydrated offline writes now that we ARE the leader. The
693
+ // startup hydrate (in the persist block above) ran pre-election,
694
+ // when a push() would have taken the follower branch and broadcast
695
+ // into a not-yet-running orchestrator — a no-op that stranded the
696
+ // writes. We own the network now, so the follower gate in pushInner
697
+ // passes and the batch actually reaches /api/sync/push. Fire-and-
698
+ // forget (op_ids dedupe against the broadcasts) and ahead of pull()
699
+ // so the server has the writes before the cold-load snapshot lands —
700
+ // the snapshot then returns them as canonical instead of leaving the
701
+ // reconcile backstop to recover the optimistic ghosts.
702
+ if (this.mutations.pending().length > 0) {
703
+ void this.push();
704
+ }
705
+
673
706
  // Pull from server, then connect real-time transport.
674
707
  await this.pull();
675
708
 
@@ -1054,8 +1087,18 @@ export class SyncEngine {
1054
1087
  private enqueueApply(
1055
1088
  changes: ChangeEvent[],
1056
1089
  targetCursor?: SyncCursor,
1057
- opts: { fromBroadcast?: boolean } = {},
1090
+ opts: { fromBroadcast?: boolean; isPull?: boolean } = {},
1058
1091
  ): Promise<void> {
1092
+ // Snapshot fence: while a from-zero snapshot is in flight, hold live WS
1093
+ // frames + tab broadcasts so they can't advance the cursor past the
1094
+ // snapshot's rows and filter them out (see `snapshotHold`). The pull's own
1095
+ // apply (`isPull`) is exempt — it IS the snapshot. Held events are replayed
1096
+ // in arrival (≈seq) order once the snapshot lands. Synchronous + before the
1097
+ // queue chain so held events never interleave into the applyQueue.
1098
+ if (this.snapshotHold !== null && !opts.isPull) {
1099
+ this.snapshotHold.push(...changes);
1100
+ return Promise.resolve();
1101
+ }
1059
1102
  const prev = this.applyQueue;
1060
1103
  const next = prev.then(async () => {
1061
1104
  // Per-event monotonic filter: re-applies of an already-seen seq
@@ -1371,6 +1414,12 @@ export class SyncEngine {
1371
1414
  // bootstrap reconcile (the snapshot path already returned every
1372
1415
  // policy-visible row, per-entity refetch right after is waste).
1373
1416
  const startedFromZero = this.cursor.last_seq === 0;
1417
+ // A from-zero pull is a SNAPSHOT — open the live-event hold for its whole
1418
+ // (possibly multi-page) duration so a racing WS frame can't leapfrog the
1419
+ // cursor and filter the snapshot rows out. Nested non-zero-cursor pulls
1420
+ // (delta tail / has_more) don't touch this; a from-zero re-pull after a 410
1421
+ // reset opens its own hold. Pull applies pass `isPull`, so none are held.
1422
+ if (startedFromZero) this.snapshotHold = [];
1374
1423
  try {
1375
1424
  // Snapshot pagination: when the cursor is 0 and the server's
1376
1425
  // table is larger than a single batch, the response carries
@@ -1390,7 +1439,7 @@ export class SyncEngine {
1390
1439
  const resp = await this.request<
1391
1440
  PullResponse & { snapshot_after?: string | null }
1392
1441
  >("GET", `/api/sync/pull?${params.toString()}`);
1393
- await this.enqueueApply(resp.changes, resp.cursor);
1442
+ await this.enqueueApply(resp.changes, resp.cursor, { isPull: true });
1394
1443
  // `snapshot_after` is only set when the server is mid-snapshot.
1395
1444
  // Continue paginating in the same loop iteration so we don't
1396
1445
  // leave a fresh client with a partial replica.
@@ -1434,6 +1483,16 @@ export class SyncEngine {
1434
1483
  // truth. Record it so onConnected skips the reconcile that would
1435
1484
  // otherwise re-fetch every entity via cursor pagination.
1436
1485
  this.lastPullStartedFromZero = startedFromZero;
1486
+ // Snapshot landed cleanly → replay the live events we held during it, in
1487
+ // arrival (≈seq) order. They filter against the now-correct cursor
1488
+ // (snapshot_seq), so events newer than the snapshot apply and older ones
1489
+ // (already in the snapshot) are deduped. Clearing `snapshotHold` first
1490
+ // means this replay applies normally (it isn't re-held).
1491
+ if (startedFromZero && this.snapshotHold) {
1492
+ const held = this.snapshotHold;
1493
+ this.snapshotHold = null;
1494
+ if (held.length > 0) await this.enqueueApply(held);
1495
+ }
1437
1496
  } catch (err) {
1438
1497
  // Swallow network + transient errors so the poll/reconnect loop
1439
1498
  // keeps trying — but on 429 bump the backoff counter so the next
@@ -1488,6 +1547,15 @@ export class SyncEngine {
1488
1547
  }, delayMs);
1489
1548
  }
1490
1549
  }
1550
+ } finally {
1551
+ // Snapshot pull failed (network error / 410 mid-fetch): DISCARD any
1552
+ // still-held live events rather than applying them. The cursor stays at 0
1553
+ // so the retry resnapshots and re-covers them; applying them here would
1554
+ // advance the cursor and turn the retry into a gappy delta. On success
1555
+ // the try already drained + nulled the hold, so this is a no-op there.
1556
+ // Nested non-zero pulls never set `snapshotHold`, so this only fires for
1557
+ // the from-zero snapshot that owns it.
1558
+ if (startedFromZero && this.snapshotHold !== null) this.snapshotHold = null;
1491
1559
  }
1492
1560
  }
1493
1561
 
@@ -822,4 +822,123 @@ describe("sync scenarios", () => {
822
822
  // And the replica survives — token rotation must not wipe it.
823
823
  expect(env.engine.store.get("Recording", "r1")).not.toBeNull();
824
824
  });
825
+
826
+ // SNAPSHOT-DROP RACE (pins the `snapshotHold` fence, TS F1). A live WS
827
+ // event that lands DURING a from-zero snapshot fetch leapfrogs the
828
+ // cursor: the event applies at seq S+1 first, so when the snapshot's
829
+ // own rows (pinned at seq ≤ S) finally apply, the monotonic
830
+ // `seq > cursor.last_seq` filter drops every one of them. The user's
831
+ // whole replica vanishes for a beat (until the next reconcile),
832
+ // because one concurrent broadcast raced the snapshot.
833
+ //
834
+ // The fix buffers live/broadcast applies in `snapshotHold` for the
835
+ // duration of a from-zero pull, applies the snapshot first, then
836
+ // flushes the held events in order — so the snapshot is never behind
837
+ // the cursor when it lands. This reproduces the race deterministically
838
+ // via a 410 (which forces a from-zero re-pull AFTER the WS is
839
+ // connected) plus a `beforePull` hook that injects a higher-seq live
840
+ // event right as the snapshot is being served.
841
+ test("a live WS event during a from-zero snapshot doesn't drop the snapshot", async () => {
842
+ let armed = false;
843
+ let injected = false;
844
+ env = createTestEnv({
845
+ transport: "websocket",
846
+ // Fires server-side just before the pull response is built. On the
847
+ // armed from-zero re-pull, deliver a live insert for a DIFFERENT
848
+ // row at a seq just past the snapshot — the exact interleave that
849
+ // leapfrogs the cursor mid-fetch.
850
+ beforePull: (_auth, since) => {
851
+ if (since !== 0 || !armed || injected) return;
852
+ injected = true;
853
+ const liveSeq = env!.server.nextSeqValue() + 1; // > snapshot_seq
854
+ env!.server.pushToUser("u1", {
855
+ seq: liveSeq,
856
+ entity: "Note",
857
+ row_id: "n2",
858
+ kind: "insert",
859
+ data: { id: "n2", title: "live" },
860
+ timestamp: "",
861
+ });
862
+ },
863
+ });
864
+ env.signIn({ userId: "u1" });
865
+ env.server.seed("Note", [{ id: "n1", title: "snapshot" }]);
866
+ await env.start();
867
+ await env.flush();
868
+ // Baseline: the snapshot row is present and the WS is connected.
869
+ expect(env.engine.store.get("Note", "n1")).not.toBeNull();
870
+
871
+ // Arm the injection, then force a from-zero re-pull via a 410. The
872
+ // delta pull 410s → resetReplica wipes the replica → pullInner
873
+ // re-pulls from seq=0. The beforePull hook injects the racing live
874
+ // event while that snapshot is served.
875
+ armed = true;
876
+ env.server.primeNextPullStatus(410);
877
+ await env.engine.pull();
878
+ await env.flush();
879
+
880
+ // The injection must actually have fired — otherwise the test is
881
+ // vacuous (no race happened).
882
+ expect(injected).toBe(true);
883
+ // The snapshot row MUST survive even though a higher-seq live event
884
+ // arrived during the fetch. Pre-fix, n1 was dropped by the monotonic
885
+ // cursor filter (cursor had already leapfrogged to the live seq).
886
+ expect(env.engine.store.get("Note", "n1")).not.toBeNull();
887
+ // ...and the racing live event is also present.
888
+ expect(env.engine.store.get("Note", "n2")).not.toBeNull();
889
+ });
890
+
891
+ // STRANDED OFFLINE WRITES (pins the leader-path mutation drain, TS F3).
892
+ // A write made offline is queued and persisted; on next boot the engine
893
+ // hydrates it into the mutation queue. The hydrate runs PRE-election,
894
+ // while `isMultiTabLeader` is still false — so the old startup push()
895
+ // took the follower branch and broadcast the batch to a not-yet-running
896
+ // orchestrator (a silent no-op). `onInitialLeader` only flips the flag;
897
+ // nothing re-pushed. The offline write was stranded until an unrelated
898
+ // mutation fired push() again — the yapless "recording I made on the
899
+ // plane never uploaded until I recorded another one" class of bug.
900
+ //
901
+ // The fix drains the pending queue in the leader path, once election
902
+ // has settled and we own the network. This test injects a hydrated-
903
+ // style pending mutation BEFORE start() (persistence is off in the
904
+ // harness, so this stands in for `mutations.hydrate()`), then asserts
905
+ // start() actually shipped it to /api/sync/push with no further action.
906
+ test("hydrated offline writes are pushed once leader-elected", async () => {
907
+ // WS transport specifically: poll mode's `performPollTick` pushes on
908
+ // every tick, which would mask the bug. In WS-only mode the sole
909
+ // startup push is the leader-path drain — onConnected pulls but never
910
+ // pushes — so this isolates exactly the path the fix restores.
911
+ env = createTestEnv({ transport: "websocket" });
912
+ env.signIn({ userId: "u1" });
913
+
914
+ // An offline write rehydrated into the queue before the engine boots.
915
+ env.engine.mutations.add({
916
+ entity: "Note",
917
+ row_id: "n1",
918
+ kind: "insert",
919
+ data: { id: "n1", title: "written offline" },
920
+ });
921
+ // Mirror the optimistic row the offline write left in the replica.
922
+ env.engine.store.applyChange({
923
+ seq: 0,
924
+ entity: "Note",
925
+ row_id: "n1",
926
+ kind: "insert",
927
+ data: { id: "n1", title: "written offline" },
928
+ timestamp: "",
929
+ });
930
+ expect(env.engine.mutations.pending()).toHaveLength(1);
931
+ expect(env.server.receivedPushKeys).not.toContain("Note/n1");
932
+
933
+ await env.start();
934
+ await env.flush();
935
+
936
+ // The leader must have drained THIS write to the server with no extra
937
+ // user action. Asserting the specific op key (not just "a push
938
+ // happened") keeps the test robust against a stray retry timer from
939
+ // an unrelated engine firing against the shared fetch mock. Pre-fix
940
+ // Note/n1 never reached the server — the startup push was a no-op
941
+ // follower broadcast and nothing re-pushed.
942
+ expect(env.server.receivedPushKeys).toContain("Note/n1");
943
+ });
825
944
  });
@@ -102,6 +102,17 @@ export class TestServer {
102
102
  /** Count of snapshot pulls served (since = 0). The egress storm was a
103
103
  * runaway count here; the regression test bounds it. */
104
104
  snapshotPullCount = 0;
105
+ /** Count of /api/sync/push requests received. Lets a test assert the
106
+ * engine actually shipped a batch (e.g. hydrated offline writes that
107
+ * must drain once leader-elected), independent of the no-op push
108
+ * response the harness returns. */
109
+ pushRequestCount = 0;
110
+ /** `${entity}/${row_id}` of every op the engine pushed, across all
111
+ * push requests. Lets a test assert a SPECIFIC mutation reached the
112
+ * server — robust against a stray retry from an unrelated engine
113
+ * whose pending timer fires against the globally-installed fetch
114
+ * mock (that pushes ITS ops, never this test's row). */
115
+ readonly receivedPushKeys: string[] = [];
105
116
  /** Captured outbound WS messages from clients — tests assert against
106
117
  * this to verify `reactive-subscribe`, `crdt-subscribe`, etc., were
107
118
  * actually sent over the wire. */
@@ -279,6 +279,20 @@ async function handle(
279
279
 
280
280
  // /api/sync/push — accept ops from optimistic mutations.
281
281
  if (url.endsWith("/api/sync/push") && method === "POST") {
282
+ server.pushRequestCount += 1;
283
+ // Record which ops were pushed so a test can assert a specific
284
+ // mutation reached the server (not just "some push happened").
285
+ try {
286
+ const body = typeof _init?.body === "string" ? JSON.parse(_init.body) : null;
287
+ const changes = Array.isArray(body?.changes) ? body.changes : [];
288
+ for (const c of changes) {
289
+ if (c && typeof c.entity === "string" && typeof c.row_id === "string") {
290
+ server.receivedPushKeys.push(`${c.entity}/${c.row_id}`);
291
+ }
292
+ }
293
+ } catch {
294
+ /* body not JSON / no changes — count still recorded above */
295
+ }
282
296
  const outcome = server.consumeNextPushOutcome();
283
297
  if (outcome?.kind === "network") {
284
298
  // Reject like a real offline fetch: no HTTP status → the engine