npm - @pylonsync/sync - Versions diffs - 0.3.253 → 0.3.254 - Mend

@pylonsync/sync 0.3.253 → 0.3.254

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +1 -1
package/src/index.ts +76 -8
package/src/scenarios.test.ts +119 -0
package/src/test-harness/server.ts +11 -0
package/src/test-harness/transport.ts +14 -0

package/package.json CHANGED Viewed

@@ -3,7 +3,7 @@
   "publishConfig": {
     "access": "public"
   },
-  "version": "0.3.253",
+  "version": "0.3.254",
   "type": "module",
   "main": "src/index.ts",
   "types": "src/index.ts",

package/src/index.ts CHANGED Viewed

@@ -308,6 +308,22 @@ export class SyncEngine {
    */
   private applyQueue: Promise<void> = Promise.resolve();
+  /**
+   * Live-event hold buffer, active ONLY while a from-zero snapshot pull is in
+   * flight. A snapshot is full state as-of `snapshot_seq` S; its rows arrive
+   * tagged `seq = S`. If a live WS frame (or a tab broadcast) at `seq = S+k`
+   * applies FIRST — during the snapshot's (possibly multi-page) HTTP fetch — it
+   * advances the cursor past S, and then EVERY snapshot row (seq ≤ S) is
+   * dropped by the monotonic filter in enqueueApply, leaving a near-empty
+   * replica with the cursor persisted ahead (no 410, no heal until a reconcile
+   * happens to fire). The store has no per-row seq guard, so we can't just
+   * apply the snapshot unconditionally — an older snapshot row would clobber a
+   * newer live update. So we instead ORDER them: hold live/broadcast applies
+   * here while snapshotting, then replay them (seq-filtered) AFTER the snapshot
+   * lands. null = not snapshotting → normal apply.
+   */
+  private snapshotHold: ChangeEvent[] | null = null;
   /**
    * Serialized channel for outbound network ops (pull, push, reconcile,
    * refresh, resetReplica). Replaces the per-op `inFlightX` mutexes +
@@ -596,12 +612,15 @@ export class SyncEngine {
           const mqPersistence = new IndexedDBMutationPersistence(persistence);
           this.mutations.attachPersistence(mqPersistence);
           await this.mutations.hydrate();
-          // Fire-and-forget — the actual mutation HTTP calls happen
-          // async, and we don't want to block engine startup on them.
-          // pull()/reconcile() below run in parallel; push()'s
-          // mutations carry op_ids so racing the broadcasts won't
-          // double-apply.
-          void this.push();
+          // The hydrated offline writes are drained in the leader path
+          // below (after `initMultiTab` settles), NOT here. We're still
+          // pre-election at this point, so `isMultiTabLeader` is false
+          // and a push() now would hit the follower branch — broadcasting
+          // the batch to a not-yet-constructed orchestrator (a silent
+          // no-op) and stranding every offline write until an unrelated
+          // mutation happened to fire push() again. The drain moved to
+          // the leader path so it runs once we actually own the network.
+          // Test: `hydrated offline writes are pushed once leader-elected`.
         } catch {
           // Queue persistence optional — memory-only still works.
         }
@@ -670,6 +689,20 @@ export class SyncEngine {
       await this.refreshResolvedSession();
     }
+    // Drain hydrated offline writes now that we ARE the leader. The
+    // startup hydrate (in the persist block above) ran pre-election,
+    // when a push() would have taken the follower branch and broadcast
+    // into a not-yet-running orchestrator — a no-op that stranded the
+    // writes. We own the network now, so the follower gate in pushInner
+    // passes and the batch actually reaches /api/sync/push. Fire-and-
+    // forget (op_ids dedupe against the broadcasts) and ahead of pull()
+    // so the server has the writes before the cold-load snapshot lands —
+    // the snapshot then returns them as canonical instead of leaving the
+    // reconcile backstop to recover the optimistic ghosts.
+    if (this.mutations.pending().length > 0) {
+      void this.push();
+    }
     // Pull from server, then connect real-time transport.
     await this.pull();
@@ -1054,8 +1087,18 @@ export class SyncEngine {
   private enqueueApply(
     changes: ChangeEvent[],
     targetCursor?: SyncCursor,
-    opts: { fromBroadcast?: boolean } = {},
+    opts: { fromBroadcast?: boolean; isPull?: boolean } = {},
   ): Promise<void> {
+    // Snapshot fence: while a from-zero snapshot is in flight, hold live WS
+    // frames + tab broadcasts so they can't advance the cursor past the
+    // snapshot's rows and filter them out (see `snapshotHold`). The pull's own
+    // apply (`isPull`) is exempt — it IS the snapshot. Held events are replayed
+    // in arrival (≈seq) order once the snapshot lands. Synchronous + before the
+    // queue chain so held events never interleave into the applyQueue.
+    if (this.snapshotHold !== null && !opts.isPull) {
+      this.snapshotHold.push(...changes);
+      return Promise.resolve();
+    }
     const prev = this.applyQueue;
     const next = prev.then(async () => {
       // Per-event monotonic filter: re-applies of an already-seen seq
@@ -1371,6 +1414,12 @@ export class SyncEngine {
     // bootstrap reconcile (the snapshot path already returned every
     // policy-visible row, per-entity refetch right after is waste).
     const startedFromZero = this.cursor.last_seq === 0;
+    // A from-zero pull is a SNAPSHOT — open the live-event hold for its whole
+    // (possibly multi-page) duration so a racing WS frame can't leapfrog the
+    // cursor and filter the snapshot rows out. Nested pulls at a non-zero
+    // cursor (delta tail / has_more) don't touch this; a 410 re-pull restarts
+    // from seq 0 and opens its own hold. Pull applies pass `isPull` → never held.
+    if (startedFromZero) this.snapshotHold = [];
     try {
       // Snapshot pagination: when the cursor is 0 and the server's
       // table is larger than a single batch, the response carries
@@ -1390,7 +1439,7 @@ export class SyncEngine {
         const resp = await this.request<
           PullResponse & { snapshot_after?: string | null }
         >("GET", `/api/sync/pull?${params.toString()}`);
-        await this.enqueueApply(resp.changes, resp.cursor);
+        await this.enqueueApply(resp.changes, resp.cursor, { isPull: true });
         // `snapshot_after` is only set when the server is mid-snapshot.
         // Continue paginating in the same loop iteration so we don't
         // leave a fresh client with a partial replica.
@@ -1434,6 +1483,16 @@ export class SyncEngine {
       // truth. Record it so onConnected skips the reconcile that would
       // otherwise re-fetch every entity via cursor pagination.
       this.lastPullStartedFromZero = startedFromZero;
+      // Snapshot landed cleanly → replay the live events we held during it, in
+      // arrival (≈seq) order. They filter against the now-correct cursor
+      // (snapshot_seq), so events newer than the snapshot apply and older ones
+      // (already in the snapshot) are deduped. Clearing `snapshotHold` first
+      // means this replay applies normally (it isn't re-held).
+      if (startedFromZero && this.snapshotHold) {
+        const held = this.snapshotHold;
+        this.snapshotHold = null;
+        if (held.length > 0) await this.enqueueApply(held);
+      }
     } catch (err) {
       // Swallow network + transient errors so the poll/reconnect loop
       // keeps trying — but on 429 bump the backoff counter so the next
@@ -1488,6 +1547,15 @@ export class SyncEngine {
           }, delayMs);
         }
       }
+    } finally {
+      // Snapshot pull failed (network error / 410 mid-fetch): DISCARD any
+      // still-held live events rather than applying them. The cursor stays at 0
+      // so the retry resnapshots and re-covers them; applying them here would
+      // advance the cursor and turn the retry into a gappy delta. On success
+      // the try already drained + nulled the hold, so this is a no-op there.
+      // Nested non-zero pulls never set `snapshotHold`, so this only fires for
+      // the from-zero snapshot that owns it.
+      if (startedFromZero && this.snapshotHold !== null) this.snapshotHold = null;
     }
   }

package/src/scenarios.test.ts CHANGED Viewed

@@ -822,4 +822,123 @@ describe("sync scenarios", () => {
     // And the replica survives — token rotation must not wipe it.
     expect(env.engine.store.get("Recording", "r1")).not.toBeNull();
   });
+  // SNAPSHOT-DROP RACE (pins the `snapshotHold` fence, TS F1). A live WS
+  // event that lands DURING a from-zero snapshot fetch leapfrogs the
+  // cursor: the event applies at seq S+1 first, so when the snapshot's
+  // own rows (pinned at seq ≤ S) finally apply, the monotonic
+  // `seq > cursor.last_seq` filter drops every one of them. The user's
+  // whole replica vanishes for a beat (until the next reconcile),
+  // because one concurrent broadcast raced the snapshot.
+  //
+  // The fix buffers live/broadcast applies in `snapshotHold` for the
+  // duration of a from-zero pull, applies the snapshot first, then
+  // flushes the held events in order — so the snapshot is never behind
+  // the cursor when it lands. This reproduces the race deterministically
+  // via a 410 (which forces a from-zero re-pull AFTER the WS is
+  // connected) plus a `beforePull` hook that injects a higher-seq live
+  // event right as the snapshot is being served.
+  test("a live WS event during a from-zero snapshot doesn't drop the snapshot", async () => {
+    let armed = false;
+    let injected = false;
+    env = createTestEnv({
+      transport: "websocket",
+      // Fires server-side just before the pull response is built. On the
+      // armed from-zero re-pull, deliver a live insert for a DIFFERENT
+      // row at a seq just past the snapshot — the exact interleave that
+      // leapfrogs the cursor mid-fetch.
+      beforePull: (_auth, since) => {
+        if (since !== 0 || !armed || injected) return;
+        injected = true;
+        const liveSeq = env!.server.nextSeqValue() + 1; // > snapshot_seq
+        env!.server.pushToUser("u1", {
+          seq: liveSeq,
+          entity: "Note",
+          row_id: "n2",
+          kind: "insert",
+          data: { id: "n2", title: "live" },
+          timestamp: "",
+        });
+      },
+    });
+    env.signIn({ userId: "u1" });
+    env.server.seed("Note", [{ id: "n1", title: "snapshot" }]);
+    await env.start();
+    await env.flush();
+    // Baseline: the snapshot row is present and the WS is connected.
+    expect(env.engine.store.get("Note", "n1")).not.toBeNull();
+    // Arm the injection, then force a from-zero re-pull via a 410. The
+    // delta pull 410s → resetReplica wipes the replica → pullInner
+    // re-pulls from seq=0. The beforePull hook injects the racing live
+    // event while that snapshot is served.
+    armed = true;
+    env.server.primeNextPullStatus(410);
+    await env.engine.pull();
+    await env.flush();
+    // The injection must actually have fired — otherwise the test is
+    // vacuous (no race happened).
+    expect(injected).toBe(true);
+    // The snapshot row MUST survive even though a higher-seq live event
+    // applied during the fetch. Pre-fix, n1 was dropped by the monotonic
+    // cursor filter (cursor had already leapfrogged to the live seq).
+    expect(env.engine.store.get("Note", "n1")).not.toBeNull();
+    // ...and the racing live event is also present.
+    expect(env.engine.store.get("Note", "n2")).not.toBeNull();
+  });
+  // STRANDED OFFLINE WRITES (pins the leader-path mutation drain, TS F3).
+  // A write made offline is queued and persisted; on next boot the engine
+  // hydrates it into the mutation queue. The hydrate runs PRE-election,
+  // while `isMultiTabLeader` is still false — so the old startup push()
+  // took the follower branch and broadcast the batch to a not-yet-running
+  // orchestrator (a silent no-op). `onInitialLeader` only flips the flag;
+  // nothing re-pushed. The offline write was stranded until an unrelated
+  // mutation fired push() again — the yapless "recording I made on the
+  // plane never uploaded until I recorded another one" class of bug.
+  //
+  // The fix drains the pending queue in the leader path, once election
+  // has settled and we own the network. This test injects a hydrated-
+  // style pending mutation BEFORE start() (persistence is off in the
+  // harness, so this stands in for `mutations.hydrate()`), then asserts
+  // start() actually shipped it to /api/sync/push with no further action.
+  test("hydrated offline writes are pushed once leader-elected", async () => {
+    // WS transport specifically: poll mode's `performPollTick` pushes on
+    // every tick, which would mask the bug. In WS-only mode the sole
+    // startup push is the leader-path drain — onConnected pulls but never
+    // pushes — so this isolates exactly the path the fix restores.
+    env = createTestEnv({ transport: "websocket" });
+    env.signIn({ userId: "u1" });
+    // An offline write rehydrated into the queue before the engine boots.
+    env.engine.mutations.add({
+      entity: "Note",
+      row_id: "n1",
+      kind: "insert",
+      data: { id: "n1", title: "written offline" },
+    });
+    // Mirror the optimistic row the offline write left in the replica.
+    env.engine.store.applyChange({
+      seq: 0,
+      entity: "Note",
+      row_id: "n1",
+      kind: "insert",
+      data: { id: "n1", title: "written offline" },
+      timestamp: "",
+    });
+    expect(env.engine.mutations.pending()).toHaveLength(1);
+    expect(env.server.receivedPushKeys).not.toContain("Note/n1");
+    await env.start();
+    await env.flush();
+    // The leader must have drained THIS write to the server with no extra
+    // user action. Asserting the specific op key (not just "a push
+    // happened") keeps the test robust against a stray retry timer from
+    // an unrelated engine firing against the shared fetch mock. Pre-fix
+    // Note/n1 never reached the server — the startup push was a no-op
+    // follower broadcast and nothing re-pushed.
+    expect(env.server.receivedPushKeys).toContain("Note/n1");
+  });
 });

package/src/test-harness/server.ts CHANGED Viewed

@@ -102,6 +102,17 @@ export class TestServer {
   /** Count of snapshot pulls served (since = 0). The egress storm was a
    *  runaway count here; the regression test bounds it. */
   snapshotPullCount = 0;
+  /** Count of /api/sync/push requests received. Lets a test assert the
+   *  engine actually shipped a batch (e.g. hydrated offline writes that
+   *  must drain once leader-elected), independent of the no-op push
+   *  response the harness returns. */
+  pushRequestCount = 0;
+  /** `${entity}/${row_id}` of every op the engine pushed, across all
+   *  push requests. Lets a test assert a SPECIFIC mutation reached the
+   *  server — robust against a stray retry from an unrelated engine
+   *  whose pending timer fires against the globally-installed fetch
+   *  mock (that pushes ITS ops, never this test's row). */
+  readonly receivedPushKeys: string[] = [];
   /** Captured outbound WS messages from clients — tests assert against
    *  this to verify `reactive-subscribe`, `crdt-subscribe`, etc., were
    *  actually sent over the wire. */

package/src/test-harness/transport.ts CHANGED Viewed

@@ -279,6 +279,20 @@ async function handle(
   // /api/sync/push — accept ops from optimistic mutations.
   if (url.endsWith("/api/sync/push") && method === "POST") {
+    server.pushRequestCount += 1;
+    // Record which ops were pushed so a test can assert a specific
+    // mutation reached the server (not just "some push happened").
+    try {
+      const body = typeof _init?.body === "string" ? JSON.parse(_init.body) : null;
+      const changes = Array.isArray(body?.changes) ? body.changes : [];
+      for (const c of changes) {
+        if (c && typeof c.entity === "string" && typeof c.row_id === "string") {
+          server.receivedPushKeys.push(`${c.entity}/${c.row_id}`);
+        }
+      }
+    } catch {
+      /* body not JSON / no changes — count still recorded above */
+    }
     const outcome = server.consumeNextPushOutcome();
     if (outcome?.kind === "network") {
       // Reject like a real offline fetch: no HTTP status → the engine