@pylonsync/sync 0.3.253 → 0.3.254
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.ts +76 -8
- package/src/scenarios.test.ts +119 -0
- package/src/test-harness/server.ts +11 -0
- package/src/test-harness/transport.ts +14 -0
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -308,6 +308,22 @@ export class SyncEngine {
|
|
|
308
308
|
*/
|
|
309
309
|
private applyQueue: Promise<void> = Promise.resolve();
|
|
310
310
|
|
|
311
|
+
/**
|
|
312
|
+
* Live-event hold buffer, active ONLY while a from-zero snapshot pull is in
|
|
313
|
+
* flight. A snapshot is full state as-of `snapshot_seq` S; its rows arrive
|
|
314
|
+
* tagged `seq = S`. If a live WS frame (or a tab broadcast) at `seq = S+k`
|
|
315
|
+
* applies FIRST — during the snapshot's (possibly multi-page) HTTP fetch — it
|
|
316
|
+
* advances the cursor past S, and then EVERY snapshot row (seq ≤ S) is
|
|
317
|
+
* dropped by the monotonic filter in enqueueApply, leaving a near-empty
|
|
318
|
+
* replica with the cursor persisted ahead (no 410, no heal until a reconcile
|
|
319
|
+
* happens to fire). The store has no per-row seq guard, so we can't just
|
|
320
|
+
* apply the snapshot unconditionally — an older snapshot row would clobber a
|
|
321
|
+
* newer live update. So we instead ORDER them: hold live/broadcast applies
|
|
322
|
+
* here while snapshotting, then replay them (seq-filtered) AFTER the snapshot
|
|
323
|
+
* lands. null = not snapshotting → normal apply.
|
|
324
|
+
*/
|
|
325
|
+
private snapshotHold: ChangeEvent[] | null = null;
|
|
326
|
+
|
|
311
327
|
/**
|
|
312
328
|
* Serialized channel for outbound network ops (pull, push, reconcile,
|
|
313
329
|
* refresh, resetReplica). Replaces the per-op `inFlightX` mutexes +
|
|
@@ -596,12 +612,15 @@ export class SyncEngine {
|
|
|
596
612
|
const mqPersistence = new IndexedDBMutationPersistence(persistence);
|
|
597
613
|
this.mutations.attachPersistence(mqPersistence);
|
|
598
614
|
await this.mutations.hydrate();
|
|
599
|
-
//
|
|
600
|
-
//
|
|
601
|
-
//
|
|
602
|
-
//
|
|
603
|
-
//
|
|
604
|
-
|
|
615
|
+
// The hydrated offline writes are drained in the leader path
|
|
616
|
+
// below (after `initMultiTab` settles), NOT here. We're still
|
|
617
|
+
// pre-election at this point, so `isMultiTabLeader` is false
|
|
618
|
+
// and a push() now would hit the follower branch — broadcasting
|
|
619
|
+
// the batch to a not-yet-constructed orchestrator (a silent
|
|
620
|
+
// no-op) and stranding every offline write until an unrelated
|
|
621
|
+
// mutation happened to fire push() again. The drain moved to
|
|
622
|
+
// the leader path so it runs once we actually own the network.
|
|
623
|
+
// Test: `hydrated offline writes are pushed once leader-elected`.
|
|
605
624
|
} catch {
|
|
606
625
|
// Queue persistence optional — memory-only still works.
|
|
607
626
|
}
|
|
@@ -670,6 +689,20 @@ export class SyncEngine {
|
|
|
670
689
|
await this.refreshResolvedSession();
|
|
671
690
|
}
|
|
672
691
|
|
|
692
|
+
// Drain hydrated offline writes now that we ARE the leader. The
|
|
693
|
+
// startup hydrate (in the persist block above) ran pre-election,
|
|
694
|
+
// when a push() would have taken the follower branch and broadcast
|
|
695
|
+
// into a not-yet-running orchestrator — a no-op that stranded the
|
|
696
|
+
// writes. We own the network now, so the follower gate in pushInner
|
|
697
|
+
// passes and the batch actually reaches /api/sync/push. Fire-and-
|
|
698
|
+
// forget (op_ids dedupe against the broadcasts) and ahead of pull()
|
|
699
|
+
// so the server has the writes before the cold-load snapshot lands —
|
|
700
|
+
// the snapshot then returns them as canonical instead of leaving the
|
|
701
|
+
// reconcile backstop to recover the optimistic ghosts.
|
|
702
|
+
if (this.mutations.pending().length > 0) {
|
|
703
|
+
void this.push();
|
|
704
|
+
}
|
|
705
|
+
|
|
673
706
|
// Pull from server, then connect real-time transport.
|
|
674
707
|
await this.pull();
|
|
675
708
|
|
|
@@ -1054,8 +1087,18 @@ export class SyncEngine {
|
|
|
1054
1087
|
private enqueueApply(
|
|
1055
1088
|
changes: ChangeEvent[],
|
|
1056
1089
|
targetCursor?: SyncCursor,
|
|
1057
|
-
opts: { fromBroadcast?: boolean } = {},
|
|
1090
|
+
opts: { fromBroadcast?: boolean; isPull?: boolean } = {},
|
|
1058
1091
|
): Promise<void> {
|
|
1092
|
+
// Snapshot fence: while a from-zero snapshot is in flight, hold live WS
|
|
1093
|
+
// frames + tab broadcasts so they can't advance the cursor past the
|
|
1094
|
+
// snapshot's rows and filter them out (see `snapshotHold`). The pull's own
|
|
1095
|
+
// apply (`isPull`) is exempt — it IS the snapshot. Held events are replayed
|
|
1096
|
+
// in arrival (≈seq) order once the snapshot lands. Synchronous + before the
|
|
1097
|
+
// queue chain so held events never interleave into the applyQueue.
|
|
1098
|
+
if (this.snapshotHold !== null && !opts.isPull) {
|
|
1099
|
+
this.snapshotHold.push(...changes);
|
|
1100
|
+
return Promise.resolve();
|
|
1101
|
+
}
|
|
1059
1102
|
const prev = this.applyQueue;
|
|
1060
1103
|
const next = prev.then(async () => {
|
|
1061
1104
|
// Per-event monotonic filter: re-applies of an already-seen seq
|
|
@@ -1371,6 +1414,12 @@ export class SyncEngine {
|
|
|
1371
1414
|
// bootstrap reconcile (the snapshot path already returned every
|
|
1372
1415
|
// policy-visible row, per-entity refetch right after is waste).
|
|
1373
1416
|
const startedFromZero = this.cursor.last_seq === 0;
|
|
1417
|
+
// A from-zero pull is a SNAPSHOT — open the live-event hold for its whole
|
|
1418
|
+
// (possibly multi-page) duration so a racing WS frame can't leapfrog the
|
|
1419
|
+
// cursor and filter the snapshot rows out. Nested pulls (delta tail /
|
|
1420
|
+
// has_more) run at a non-zero cursor (a 410 re-pull restarts from zero and opens its own hold) → they don't touch
|
|
1421
|
+
// this, and their applies pass `isPull` so they're never held.
|
|
1422
|
+
if (startedFromZero) this.snapshotHold = [];
|
|
1374
1423
|
try {
|
|
1375
1424
|
// Snapshot pagination: when the cursor is 0 and the server's
|
|
1376
1425
|
// table is larger than a single batch, the response carries
|
|
@@ -1390,7 +1439,7 @@ export class SyncEngine {
|
|
|
1390
1439
|
const resp = await this.request<
|
|
1391
1440
|
PullResponse & { snapshot_after?: string | null }
|
|
1392
1441
|
>("GET", `/api/sync/pull?${params.toString()}`);
|
|
1393
|
-
await this.enqueueApply(resp.changes, resp.cursor);
|
|
1442
|
+
await this.enqueueApply(resp.changes, resp.cursor, { isPull: true });
|
|
1394
1443
|
// `snapshot_after` is only set when the server is mid-snapshot.
|
|
1395
1444
|
// Continue paginating in the same loop iteration so we don't
|
|
1396
1445
|
// leave a fresh client with a partial replica.
|
|
@@ -1434,6 +1483,16 @@ export class SyncEngine {
|
|
|
1434
1483
|
// truth. Record it so onConnected skips the reconcile that would
|
|
1435
1484
|
// otherwise re-fetch every entity via cursor pagination.
|
|
1436
1485
|
this.lastPullStartedFromZero = startedFromZero;
|
|
1486
|
+
// Snapshot landed cleanly → replay the live events we held during it, in
|
|
1487
|
+
// arrival (≈seq) order. They filter against the now-correct cursor
|
|
1488
|
+
// (snapshot_seq), so events newer than the snapshot apply and older ones
|
|
1489
|
+
// (already in the snapshot) are deduped. Clearing `snapshotHold` first
|
|
1490
|
+
// means this replay applies normally (it isn't re-held).
|
|
1491
|
+
if (startedFromZero && this.snapshotHold) {
|
|
1492
|
+
const held = this.snapshotHold;
|
|
1493
|
+
this.snapshotHold = null;
|
|
1494
|
+
if (held.length > 0) await this.enqueueApply(held);
|
|
1495
|
+
}
|
|
1437
1496
|
} catch (err) {
|
|
1438
1497
|
// Swallow network + transient errors so the poll/reconnect loop
|
|
1439
1498
|
// keeps trying — but on 429 bump the backoff counter so the next
|
|
@@ -1488,6 +1547,15 @@ export class SyncEngine {
|
|
|
1488
1547
|
}, delayMs);
|
|
1489
1548
|
}
|
|
1490
1549
|
}
|
|
1550
|
+
} finally {
|
|
1551
|
+
// Snapshot pull failed (network error / 410 mid-fetch): DISCARD any
|
|
1552
|
+
// still-held live events rather than applying them. The cursor stays at 0
|
|
1553
|
+
// so the retry resnapshots and re-covers them; applying them here would
|
|
1554
|
+
// advance the cursor and turn the retry into a gappy delta. On success
|
|
1555
|
+
// the try already drained + nulled the hold, so this is a no-op there.
|
|
1556
|
+
// Nested non-zero pulls never set `snapshotHold`, so this only fires for
|
|
1557
|
+
// the from-zero snapshot that owns it.
|
|
1558
|
+
if (startedFromZero && this.snapshotHold !== null) this.snapshotHold = null;
|
|
1491
1559
|
}
|
|
1492
1560
|
}
|
|
1493
1561
|
|
package/src/scenarios.test.ts
CHANGED
|
@@ -822,4 +822,123 @@ describe("sync scenarios", () => {
|
|
|
822
822
|
// And the replica survives — token rotation must not wipe it.
|
|
823
823
|
expect(env.engine.store.get("Recording", "r1")).not.toBeNull();
|
|
824
824
|
});
|
|
825
|
+
|
|
826
|
+
// SNAPSHOT-DROP RACE (pins the `snapshotHold` fence, TS F1). A live WS
|
|
827
|
+
// event that lands DURING a from-zero snapshot fetch leapfrogs the
|
|
828
|
+
// cursor: the event applies at seq S+1 first, so when the snapshot's
|
|
829
|
+
// own rows (pinned at seq ≤ S) finally apply, the monotonic
|
|
830
|
+
// `seq > cursor.last_seq` filter drops every one of them. The user's
|
|
831
|
+
// whole replica vanishes for a beat (until the next reconcile),
|
|
832
|
+
// because one concurrent broadcast raced the snapshot.
|
|
833
|
+
//
|
|
834
|
+
// The fix buffers live/broadcast applies in `snapshotHold` for the
|
|
835
|
+
// duration of a from-zero pull, applies the snapshot first, then
|
|
836
|
+
// flushes the held events in order — so the snapshot is never behind
|
|
837
|
+
// the cursor when it lands. This reproduces the race deterministically
|
|
838
|
+
// via a 410 (which forces a from-zero re-pull AFTER the WS is
|
|
839
|
+
// connected) plus a `beforePull` hook that injects a higher-seq live
|
|
840
|
+
// event right as the snapshot is being served.
|
|
841
|
+
test("a live WS event during a from-zero snapshot doesn't drop the snapshot", async () => {
|
|
842
|
+
let armed = false;
|
|
843
|
+
let injected = false;
|
|
844
|
+
env = createTestEnv({
|
|
845
|
+
transport: "websocket",
|
|
846
|
+
// Fires server-side just before the pull response is built. On the
|
|
847
|
+
// armed from-zero re-pull, deliver a live insert for a DIFFERENT
|
|
848
|
+
// row at a seq just past the snapshot — the exact interleave that
|
|
849
|
+
// leapfrogs the cursor mid-fetch.
|
|
850
|
+
beforePull: (_auth, since) => {
|
|
851
|
+
if (since !== 0 || !armed || injected) return;
|
|
852
|
+
injected = true;
|
|
853
|
+
const liveSeq = env!.server.nextSeqValue() + 1; // > snapshot_seq
|
|
854
|
+
env!.server.pushToUser("u1", {
|
|
855
|
+
seq: liveSeq,
|
|
856
|
+
entity: "Note",
|
|
857
|
+
row_id: "n2",
|
|
858
|
+
kind: "insert",
|
|
859
|
+
data: { id: "n2", title: "live" },
|
|
860
|
+
timestamp: "",
|
|
861
|
+
});
|
|
862
|
+
},
|
|
863
|
+
});
|
|
864
|
+
env.signIn({ userId: "u1" });
|
|
865
|
+
env.server.seed("Note", [{ id: "n1", title: "snapshot" }]);
|
|
866
|
+
await env.start();
|
|
867
|
+
await env.flush();
|
|
868
|
+
// Baseline: the snapshot row is present and the WS is connected.
|
|
869
|
+
expect(env.engine.store.get("Note", "n1")).not.toBeNull();
|
|
870
|
+
|
|
871
|
+
// Arm the injection, then force a from-zero re-pull via a 410. The
|
|
872
|
+
// delta pull 410s → resetReplica wipes the replica → pullInner
|
|
873
|
+
// re-pulls from seq=0. The beforePull hook injects the racing live
|
|
874
|
+
// event while that snapshot is served.
|
|
875
|
+
armed = true;
|
|
876
|
+
env.server.primeNextPullStatus(410);
|
|
877
|
+
await env.engine.pull();
|
|
878
|
+
await env.flush();
|
|
879
|
+
|
|
880
|
+
// The injection must actually have fired — otherwise the test is
|
|
881
|
+
// vacuous (no race happened).
|
|
882
|
+
expect(injected).toBe(true);
|
|
883
|
+
// The snapshot row MUST survive even though a higher-seq live event
|
|
884
|
+
// arrived during the fetch (it is now held until the snapshot lands). Pre-fix, n1 was dropped by the monotonic
|
|
885
|
+
// cursor filter (cursor had already leapfrogged to the live seq).
|
|
886
|
+
expect(env.engine.store.get("Note", "n1")).not.toBeNull();
|
|
887
|
+
// ...and the racing live event is also present.
|
|
888
|
+
expect(env.engine.store.get("Note", "n2")).not.toBeNull();
|
|
889
|
+
});
|
|
890
|
+
|
|
891
|
+
// STRANDED OFFLINE WRITES (pins the leader-path mutation drain, TS F3).
|
|
892
|
+
// A write made offline is queued and persisted; on next boot the engine
|
|
893
|
+
// hydrates it into the mutation queue. The hydrate runs PRE-election,
|
|
894
|
+
// while `isMultiTabLeader` is still false — so the old startup push()
|
|
895
|
+
// took the follower branch and broadcast the batch to a not-yet-running
|
|
896
|
+
// orchestrator (a silent no-op). `onInitialLeader` only flips the flag;
|
|
897
|
+
// nothing re-pushed. The offline write was stranded until an unrelated
|
|
898
|
+
// mutation fired push() again — the yapless "recording I made on the
|
|
899
|
+
// plane never uploaded until I recorded another one" class of bug.
|
|
900
|
+
//
|
|
901
|
+
// The fix drains the pending queue in the leader path, once election
|
|
902
|
+
// has settled and we own the network. This test injects a hydrated-
|
|
903
|
+
// style pending mutation BEFORE start() (persistence is off in the
|
|
904
|
+
// harness, so this stands in for `mutations.hydrate()`), then asserts
|
|
905
|
+
// start() actually shipped it to /api/sync/push with no further action.
|
|
906
|
+
test("hydrated offline writes are pushed once leader-elected", async () => {
|
|
907
|
+
// WS transport specifically: poll mode's `performPollTick` pushes on
|
|
908
|
+
// every tick, which would mask the bug. In WS-only mode the sole
|
|
909
|
+
// startup push is the leader-path drain — onConnected pulls but never
|
|
910
|
+
// pushes — so this isolates exactly the path the fix restores.
|
|
911
|
+
env = createTestEnv({ transport: "websocket" });
|
|
912
|
+
env.signIn({ userId: "u1" });
|
|
913
|
+
|
|
914
|
+
// An offline write rehydrated into the queue before the engine boots.
|
|
915
|
+
env.engine.mutations.add({
|
|
916
|
+
entity: "Note",
|
|
917
|
+
row_id: "n1",
|
|
918
|
+
kind: "insert",
|
|
919
|
+
data: { id: "n1", title: "written offline" },
|
|
920
|
+
});
|
|
921
|
+
// Mirror the optimistic row the offline write left in the replica.
|
|
922
|
+
env.engine.store.applyChange({
|
|
923
|
+
seq: 0,
|
|
924
|
+
entity: "Note",
|
|
925
|
+
row_id: "n1",
|
|
926
|
+
kind: "insert",
|
|
927
|
+
data: { id: "n1", title: "written offline" },
|
|
928
|
+
timestamp: "",
|
|
929
|
+
});
|
|
930
|
+
expect(env.engine.mutations.pending()).toHaveLength(1);
|
|
931
|
+
expect(env.server.receivedPushKeys).not.toContain("Note/n1");
|
|
932
|
+
|
|
933
|
+
await env.start();
|
|
934
|
+
await env.flush();
|
|
935
|
+
|
|
936
|
+
// The leader must have drained THIS write to the server with no extra
|
|
937
|
+
// user action. Asserting the specific op key (not just "a push
|
|
938
|
+
// happened") keeps the test robust against a stray retry timer from
|
|
939
|
+
// an unrelated engine firing against the shared fetch mock. Pre-fix
|
|
940
|
+
// Note/n1 never reached the server — the startup push was a no-op
|
|
941
|
+
// follower broadcast and nothing re-pushed.
|
|
942
|
+
expect(env.server.receivedPushKeys).toContain("Note/n1");
|
|
943
|
+
});
|
|
825
944
|
});
|
|
@@ -102,6 +102,17 @@ export class TestServer {
|
|
|
102
102
|
/** Count of snapshot pulls served (since = 0). The egress storm was a
|
|
103
103
|
* runaway count here; the regression test bounds it. */
|
|
104
104
|
snapshotPullCount = 0;
|
|
105
|
+
/** Count of /api/sync/push requests received. Lets a test assert the
|
|
106
|
+
* engine actually shipped a batch (e.g. hydrated offline writes that
|
|
107
|
+
* must drain once leader-elected), independent of the no-op push
|
|
108
|
+
* response the harness returns. */
|
|
109
|
+
pushRequestCount = 0;
|
|
110
|
+
/** `${entity}/${row_id}` of every op the engine pushed, across all
|
|
111
|
+
* push requests. Lets a test assert a SPECIFIC mutation reached the
|
|
112
|
+
* server — robust against a stray retry from an unrelated engine
|
|
113
|
+
* whose pending timer fires against the globally-installed fetch
|
|
114
|
+
* mock (that pushes ITS ops, never this test's row). */
|
|
115
|
+
readonly receivedPushKeys: string[] = [];
|
|
105
116
|
/** Captured outbound WS messages from clients — tests assert against
|
|
106
117
|
* this to verify `reactive-subscribe`, `crdt-subscribe`, etc., were
|
|
107
118
|
* actually sent over the wire. */
|
|
@@ -279,6 +279,20 @@ async function handle(
|
|
|
279
279
|
|
|
280
280
|
// /api/sync/push — accept ops from optimistic mutations.
|
|
281
281
|
if (url.endsWith("/api/sync/push") && method === "POST") {
|
|
282
|
+
server.pushRequestCount += 1;
|
|
283
|
+
// Record which ops were pushed so a test can assert a specific
|
|
284
|
+
// mutation reached the server (not just "some push happened").
|
|
285
|
+
try {
|
|
286
|
+
const body = typeof _init?.body === "string" ? JSON.parse(_init.body) : null;
|
|
287
|
+
const changes = Array.isArray(body?.changes) ? body.changes : [];
|
|
288
|
+
for (const c of changes) {
|
|
289
|
+
if (c && typeof c.entity === "string" && typeof c.row_id === "string") {
|
|
290
|
+
server.receivedPushKeys.push(`${c.entity}/${c.row_id}`);
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
} catch {
|
|
294
|
+
/* body not JSON / no changes — count still recorded above */
|
|
295
|
+
}
|
|
282
296
|
const outcome = server.consumeNextPushOutcome();
|
|
283
297
|
if (outcome?.kind === "network") {
|
|
284
298
|
// Reject like a real offline fetch: no HTTP status → the engine
|