@pylonsync/sync 0.3.228 → 0.3.230

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -3,7 +3,7 @@
3
3
  "publishConfig": {
4
4
  "access": "public"
5
5
  },
6
- "version": "0.3.228",
6
+ "version": "0.3.230",
7
7
  "type": "module",
8
8
  "main": "src/index.ts",
9
9
  "types": "src/index.ts",
@@ -423,4 +423,148 @@ describe("IDB warm-load hydration", () => {
423
423
  // Fast path — no warning expected on a trivial load.
424
424
  expect(warned).toBe(false);
425
425
  });
426
+
427
+ // IDB WRITE HANG (pins persistence.commit). A write tx that ABORTS
428
+ // (quota-exceeded, or any storage error) must let the persist promise
429
+ // SETTLE — not hang. The engine awaits the persist before advancing
430
+ // the cursor in enqueueApply, so a hung write would wedge the whole
431
+ // apply queue and silently kill live sync. Pre-fix saveRow/deleteRow/
432
+ // saveCursor registered only `oncomplete`, so an abort never resolved.
433
+ test("a write tx that aborts resolves (degrades) instead of hanging", async () => {
434
+ const origWarn = console.warn;
435
+ console.warn = () => {};
436
+ try {
437
+ const p = new IndexedDBPersistence("idb-abort-degrade");
438
+ await p.open();
439
+ const db = p.connection!;
440
+ const realTx = db.transaction.bind(db);
441
+ // Abort the NEXT readwrite tx right after handing it back (one-shot: `armed` is cleared on the first match, so later transactions pass through untouched).
442
+ let armed = true;
443
+ (db as unknown as { transaction: typeof db.transaction }).transaction = ((
444
+ ...args: Parameters<typeof db.transaction>
445
+ ) => {
446
+ const tx = realTx(...args);
447
+ if (armed && String(args[1]) === "readwrite") {
448
+ armed = false;
449
+ queueMicrotask(() => {
450
+ try {
451
+ tx.abort();
452
+ } catch {
453
+ /* already settled */
454
+ }
455
+ });
456
+ }
457
+ return tx;
458
+ }) as typeof db.transaction;
459
+
460
+ // Pre-fix: this promise never settles → the 1s timeout wins and the race rejects. NOTE(review): the timer is never cleared, so it stays scheduled for up to 1s after the race settles.
461
+ await Promise.race([
462
+ p.saveRow("Note", "n1", { id: "n1", title: "x" } as Row),
463
+ new Promise((_, reject) =>
464
+ setTimeout(() => reject(new Error("saveRow hung on abort")), 1000),
465
+ ),
466
+ ]);
467
+
468
+ // A subsequent (un-armed) write still commits — the persistence layer
469
+ // degrades for the aborted write only; it isn't permanently broken.
470
+ await p.saveRow("Note", "n2", { id: "n2", title: "y" } as Row);
471
+ const rows = await p.loadAll("Note");
472
+ expect(rows.some((r) => (r as { id?: string }).id === "n2")).toBe(true);
473
+ } finally {
474
+ console.warn = origWarn;
475
+ }
476
+ });
477
+
478
+ // CURSOR-AHEAD-OF-DISK (pins the persistDegraded gate). When a row write
479
+ // ABORTS, the row never reaches disk — so the engine MUST NOT persist a
480
+ // cursor past it, or the next cold start's warm-load skips that row
481
+ // forever (cursor ahead of replica). The in-memory cursor still advances
482
+ // (live session stays correct); only the ON-DISK cursor is held back so
483
+ // a restart re-pulls the gap. This is the regression test for the IDB-hang
484
+ // fix that (before this gate) traded a hang for silent data loss. NOTE(review): engine.stop() below runs only when every assertion passes; consider moving it into the finally block.
485
+ test("a row write that aborts holds the on-disk cursor back", async () => {
486
+ const origWarn = console.warn;
487
+ console.warn = () => {};
488
+ try {
489
+ const appName = "idb-cursor-drift";
490
+ const engine = makeEngine(appName);
491
+ await engine.start();
492
+
493
+ const internal = engine as unknown as {
494
+ persistence: IndexedDBPersistence;
495
+ cursor: { last_seq: number };
496
+ persistDegraded: boolean;
497
+ enqueueApply(
498
+ changes: unknown[],
499
+ targetCursor?: { last_seq: number },
500
+ ): Promise<void>;
501
+ };
502
+ const persistence = internal.persistence;
503
+ // Cursor on disk after start() (server.serverSeq seed) — capture it
504
+ // so we can assert it does NOT advance to 50 below (falls back to 0 when no cursor is stored).
505
+ const onDiskBefore = (await persistence.loadCursor())?.last_seq ?? 0;
506
+
507
+ // Abort the next ENTITIES (row) readwrite, leaving the separate
508
+ // CURSOR-store tx alone — mirrors a quota abort on a row write. Assumes the object stores are named "entities" and "cursors" — TODO confirm against ./persistence.
509
+ const db = persistence.connection!;
510
+ const realTx = db.transaction.bind(db);
511
+ let armed = true;
512
+ (db as unknown as { transaction: typeof db.transaction }).transaction = ((
513
+ ...args: Parameters<typeof db.transaction>
514
+ ) => {
515
+ const tx = realTx(...args);
516
+ const stores = args[0];
517
+ const touchesEntities = Array.isArray(stores)
518
+ ? stores.includes("entities")
519
+ : stores === "entities";
520
+ const touchesCursors = Array.isArray(stores)
521
+ ? stores.includes("cursors")
522
+ : stores === "cursors";
523
+ if (
524
+ armed &&
525
+ String(args[1]) === "readwrite" &&
526
+ touchesEntities &&
527
+ !touchesCursors
528
+ ) {
529
+ armed = false;
530
+ queueMicrotask(() => {
531
+ try {
532
+ tx.abort();
533
+ } catch {
534
+ /* already settled */
535
+ }
536
+ });
537
+ }
538
+ return tx;
539
+ }) as typeof db.transaction;
540
+
541
+ // Apply a change with a target cursor far ahead (seq 50). The row write
542
+ // aborts; the on-disk cursor must stay where it was.
543
+ await internal.enqueueApply(
544
+ [
545
+ {
546
+ seq: 50,
547
+ entity: "Note",
548
+ row_id: "n1",
549
+ kind: "insert",
550
+ data: { id: "n1", title: "x" },
551
+ timestamp: "",
552
+ },
553
+ ],
554
+ { last_seq: 50 },
555
+ );
556
+
557
+ // In-memory cursor advanced (live sync correct); degrade flag latched.
558
+ expect(internal.cursor.last_seq).toBe(50);
559
+ expect(internal.persistDegraded).toBe(true);
560
+ // The on-disk cursor did NOT advance past the un-persisted row: it equals the value captured before the apply.
561
+ const onDiskAfter = (await persistence.loadCursor())?.last_seq ?? 0;
562
+ expect(onDiskAfter).toBe(onDiskBefore);
563
+ expect(onDiskAfter).toBeLessThan(50);
564
+
565
+ engine.stop();
566
+ } finally {
567
+ console.warn = origWarn;
568
+ }
569
+ });
426
570
  });
package/src/index.ts CHANGED
@@ -233,6 +233,20 @@ export class SyncEngine {
233
233
  */
234
234
  private _hadCachedReplica = false;
235
235
 
236
+ /**
237
+ * Sticky flag: a persisted row/cursor write degraded (IDB quota /
238
+ * abort), so the on-disk replica is known to be behind the in-memory
239
+ * cursor. Once set, `enqueueApply` STOPS advancing the persisted
240
+ * cursor — persisting a cursor ahead of the durable rows would make
241
+ * the next cold start skip them forever (cursor-ahead-of-replica). The
242
+ * in-memory replica stays authoritative for the live session; on
243
+ * restart the lagging on-disk cursor simply re-pulls the gap. Resets to
244
+ * false only on `resetReplicaInner` (full wipe + resync, disk is clean
245
+ * again). A storage-pressured tab thus degrades to "re-pull on restart"
246
+ * — like a memory-only client — instead of silently losing rows.
+ *
+ * Latched when `store.applyChangesAsync` resolves to a falsy
+ * ("not durable") result for an applied batch. NOTE(review): no latch
+ * on a failed cursor write is visible in this diff — confirm whether
+ * "row/cursor" above should read "row".
247
+ */
248
+ private persistDegraded = false;
249
+
236
250
  readonly store: LocalStore;
237
251
  readonly mutations: MutationQueue;
238
252
 
@@ -556,13 +570,16 @@ export class SyncEngine {
556
570
  this.cursor = cachedCursor;
557
571
  }
558
572
 
559
- // Auto-save changes to IndexedDB. Returns a Promise so the async
560
- // apply path (applyChangesAsync) can await the write before the
561
- // cursor advances the fix for "cursor ahead of replica" on crash.
573
+ // Auto-save changes to IndexedDB. Returns a Promise<boolean>
574
+ // (true = durable) so the async apply path (applyChangesAsync)
575
+ // can both await the write before the cursor advances AND hold
576
+ // the persisted cursor back when a write degraded — the fix for
577
+ // "cursor ahead of replica" on crash AND on quota/abort.
562
578
  const persistence = this.persistence;
563
579
  this.store._persistFn = async (change: ChangeEvent) => {
564
580
  const { persistChange } = await import("./persistence");
565
- if (persistence) await persistChange(persistence, change);
581
+ if (!persistence) return true;
582
+ return persistChange(persistence, change);
566
583
  };
567
584
 
568
585
  // Hydrate the mutation queue from disk. Any offline writes
@@ -663,7 +680,7 @@ export class SyncEngine {
663
680
  // runs; the apply path's idempotent op_id-keyed merge handles the
664
681
  // worst case (one re-applied batch on next cold pull if the tab
665
682
  // crashes between this line and the saveCursor task completing).
666
- if (this.persistence) {
683
+ if (this.persistence && !this.persistDegraded) {
667
684
  void this.persistence.saveCursor(this.cursor);
668
685
  }
669
686
 
@@ -756,8 +773,8 @@ export class SyncEngine {
756
773
  fromBroadcast: true,
757
774
  });
758
775
  },
759
- onResetReceived: () => {
760
- void this.resetReplicaInner();
776
+ onResetReceived: (wipeMutations: boolean) => {
777
+ void this.resetReplicaInner({ wipeMutations });
761
778
  },
762
779
  onSessionReceived: (resolved: ResolvedSession) => {
763
780
  // Funnel through the shared session chain so concurrent triggers
@@ -768,7 +785,15 @@ export class SyncEngine {
768
785
  },
769
786
  onMutationsForwarded: (ops: PendingMutation[]) => {
770
787
  for (const op of ops) {
771
- this.mutations.add(op.change);
788
+ // Thread the follower's captured `prevRow` so a server
789
+ // rejection of this forwarded update/delete restores the
790
+ // canonical value rather than deleting it. Without it the
791
+ // leader's queue entry has prevRow === undefined, and
792
+ // failPushedMutation's restoreRow(undefined ?? null) would
793
+ // DELETE the leader's still-valid row. The follower's prevRow
794
+ // (its pre-edit value) equals the leader's canonical row, so
795
+ // restoring it is correct on both tabs.
796
+ this.mutations.add(op.change, op.prevRow);
772
797
  }
773
798
  void this.push();
774
799
  },
@@ -777,8 +802,19 @@ export class SyncEngine {
777
802
  this.mutations.clear();
778
803
  },
779
804
  onMutationsFailed: (ops: { opId: string; error: string }[]) => {
805
+ // The leader pushed this follower's forwarded mutation and the
806
+ // server rejected it. Roll back the follower's OWN optimistic
807
+ // ghost (the leader already rolled back its copy) — calling
808
+ // markFailed alone left the ghost row stuck in the very tab the
809
+ // user is looking at. failPushedMutation restores prevRow for
810
+ // update/delete and removes the insert ghost, then marks failed.
780
811
  for (const op of ops) {
781
- this.mutations.markFailed(op.opId, op.error);
812
+ const m = this.mutations.get(op.opId);
813
+ if (m) {
814
+ this.failPushedMutation(m, op.error);
815
+ } else {
816
+ this.mutations.markFailed(op.opId, op.error);
817
+ }
782
818
  }
783
819
  },
784
820
  onBinaryReceived: (bytes: Uint8Array) => {
@@ -882,6 +918,28 @@ export class SyncEngine {
882
918
  });
883
919
  }
884
920
  },
921
+ onEntityObserve: (entity: string) => {
922
+ // Leader path: a follower's useQuery observed this entity. Add
923
+ // it to our reconcile sweep and fetch it now if we have no local
924
+ // rows — the resulting `reconciled` batch is broadcast to every
925
+ // tab, so the follower's view populates. Same shape as the
926
+ // leader half of observeEntity; the `has` guard dedupes against
927
+ // our own interest.
928
+ if (!this.isMultiTabLeader) return;
929
+ if (this.observedEntities.has(entity)) return;
930
+ this.observedEntities.add(entity);
931
+ if (this.isHydrated() && this.store.list(entity).length === 0) {
932
+ void this.reconcile([entity]);
933
+ }
934
+ },
935
+ onReplayObservedEntities: () => {
936
+ // Follower path: re-declare every observed entity to the new
937
+ // leader so its reconcile sweep covers them after a leader flip.
938
+ if (this.isMultiTabLeader) return;
939
+ for (const entity of this.observedEntities) {
940
+ this.broadcastToTabs({ type: "entity-observe", entity });
941
+ }
942
+ },
885
943
  };
886
944
  }
887
945
 
@@ -1008,7 +1066,11 @@ export class SyncEngine {
1008
1066
  (c) => typeof c.seq === "number" && c.seq > this.cursor.last_seq,
1009
1067
  );
1010
1068
  if (filtered.length > 0) {
1011
- await this.store.applyChangesAsync(filtered);
1069
+ const durable = await this.store.applyChangesAsync(filtered);
1070
+ // A row in this batch didn't reach disk (quota / abort). Latch
1071
+ // the degraded flag so we never persist a cursor ahead of the
1072
+ // durable replica — the next cold start must re-pull this gap.
1073
+ if (!durable) this.persistDegraded = true;
1012
1074
  }
1013
1075
  // Pick the cursor target. Explicit `targetCursor` (from pull) wins
1014
1076
  // — pull's response carries the server's authoritative current_seq
@@ -1020,8 +1082,12 @@ export class SyncEngine {
1020
1082
  ? { last_seq: filtered[filtered.length - 1].seq }
1021
1083
  : null);
1022
1084
  if (candidate && candidate.last_seq > this.cursor.last_seq) {
1085
+ // In-memory cursor ALWAYS advances — live sync stays correct.
1023
1086
  this.cursor = candidate;
1024
- if (this.persistence) {
1087
+ // The on-disk cursor only advances while persistence is healthy.
1088
+ // Once degraded, freezing it keeps disk self-consistent (cursor
1089
+ // never exceeds the rows actually written) so restart re-pulls.
1090
+ if (this.persistence && !this.persistDegraded) {
1025
1091
  await this.persistence.saveCursor(this.cursor);
1026
1092
  }
1027
1093
  }
@@ -1126,16 +1192,41 @@ export class SyncEngine {
1126
1192
  * rehydrated on the next page load — phantom rows that no purge of
1127
1193
  * in-memory state could fix.
1128
1194
  */
1129
- async resetReplica(): Promise<void> {
1195
+ async resetReplica(opts: { wipeMutations?: boolean } = {}): Promise<void> {
1130
1196
  // Public callers go through the queue so a reset can't race with
1131
1197
  // an in-flight pull / push / reconcile. Internal callers that
1132
1198
  // already hold the queue slot use `resetReplicaInner` directly.
1133
- return this.opQueue.enqueue("reset", () => this.resetReplicaInner());
1199
+ return this.opQueue.enqueue("reset", () => this.resetReplicaInner(opts));
1134
1200
  }
1135
1201
 
1136
- private async resetReplicaInner(): Promise<void> {
1202
+ /**
1203
+ * Drop the local replica and pull fresh. `wipeMutations` decides the
1204
+ * fate of the durable offline write queue:
1205
+ * - `false` (default, 410 RESYNC, SAME user): KEEP pending writes —
1206
+ * they survive the snapshot refresh and re-push under the same
1207
+ * session.
1208
+ * - `true` (token/tenant flip, DIFFERENT identity): DROP them — the
1209
+ * queued writes belong to the outgoing identity and must never be
1210
+ * replayed as the incoming one (cross-identity write leak).
1211
+ */
1212
+ private async resetReplicaInner(
1213
+ opts: { wipeMutations?: boolean } = {},
1214
+ ): Promise<void> {
1215
+ const wipeMutations = opts.wipeMutations === true;
1137
1216
  this.cursor = { last_seq: 0 };
1138
1217
  this.store.clearAll();
1218
+ // Disk is about to be wiped + re-pulled from 0, so any prior
1219
+ // persist degradation is moot — start the durability invariant
1220
+ // fresh. (If the fresh snapshot also fails to persist, enqueueApply
1221
+ // re-latches the flag.)
1222
+ this.persistDegraded = false;
1223
+ if (wipeMutations) {
1224
+ // Identity flip: discard the outgoing identity's pending offline
1225
+ // writes (and persist the empty queue to disk via the mutation
1226
+ // backend). persistence.clear() deliberately leaves MUTATIONS_STORE
1227
+ // alone for the 410 path, so this is the only site that drops them.
1228
+ this.mutations.clearAll();
1229
+ }
1139
1230
  // The cache is now empty. The next pull will start from 0 and
1140
1231
  // return a full snapshot — that's a true cold start, so the
1141
1232
  // onConnected fast-path may skip the post-pull reconcile. Without
@@ -1154,9 +1245,11 @@ export class SyncEngine {
1154
1245
  }
1155
1246
  // Leader broadcasts the reset so follower replicas wipe their
1156
1247
  // own copies in lockstep — otherwise a follower keeps stale
1157
- // rows under the old identity until its own pull catches up.
1248
+ // rows under the old identity until its own pull catches up. The
1249
+ // `wipeMutations` flag rides along so followers make the same
1250
+ // keep-vs-drop decision for THEIR forwarded offline writes.
1158
1251
  if (this.isMultiTabLeader) {
1159
- this.broadcastToTabs({ type: "reset" });
1252
+ this.broadcastToTabs({ type: "reset", wipeMutations });
1160
1253
  }
1161
1254
  }
1162
1255
 
@@ -1264,8 +1357,9 @@ export class SyncEngine {
1264
1357
  const { tokenChanged } = this.session.observeToken(this.currentToken());
1265
1358
  if (tokenChanged) {
1266
1359
  // We're holding the "pull" slot in the op queue — bypass the
1267
- // queue's reset path to avoid self-deadlock.
1268
- await this.resetReplicaInner();
1360
+ // queue's reset path to avoid self-deadlock. Identity flipped, so
1361
+ // wipe the old identity's pending offline writes.
1362
+ await this.resetReplicaInner({ wipeMutations: true });
1269
1363
  // Token flipped → the cached tenant is for the previous user. Pull
1270
1364
  // the fresh session in parallel with the cursor catch-up below.
1271
1365
  void this.refreshResolvedSession();
@@ -1301,12 +1395,18 @@ export class SyncEngine {
1301
1395
  // Continue paginating in the same loop iteration so we don't
1302
1396
  // leave a fresh client with a partial replica.
1303
1397
  snapshotAfter = resp.snapshot_after ?? undefined;
1304
- // The change-log tail also paginates via `has_more` — handle
1305
- // that one recursively after the snapshot loop completes so
1306
- // backpressure on the change-log path uses the existing
1307
- // tail-pull semantics.
1398
+ // The change-log tail also paginates via `has_more` — drain it
1399
+ // by recursing into `pullInner` directly. We are INSIDE the
1400
+ // `pull` op-queue slot right now; calling the public `pull()`
1401
+ // would re-enqueue under the same "pull" key, which coalesces
1402
+ // to the promise we're currently running inside (op-queue.ts
1403
+ // deletes the key only after `fn` resolves) and `await` it →
1404
+ // permanent self-deadlock that bricks the entire pull path for
1405
+ // the session. This is the exact hazard the 410 handler avoids;
1406
+ // `pullInner` re-reads `this.cursor.last_seq` (already advanced
1407
+ // by enqueueApply) so the recursion resumes at the right cursor.
1308
1408
  if (!snapshotAfter && resp.has_more) {
1309
- await this.pull();
1409
+ await this.pullInner();
1310
1410
  break;
1311
1411
  }
1312
1412
  }
@@ -1397,6 +1497,12 @@ export class SyncEngine {
1397
1497
  * that doesn't throw a 410. */
1398
1498
  private consecutive_410s = 0;
1399
1499
 
1500
+ /** Consecutive TRANSIENT push failures (offline / 5xx / 429 / 401)
1501
+ * since the last push request that returned without throwing. Drives the exponential backoff on
1502
+ * the retry of a transient-failed push (1s doubling per failure, capped at 30s) so an offline tab doesn't
1503
+ * hot-loop. Reset to 0 as soon as a push request returns normally (success or per-op rejections); a thrown error response (e.g. 5xx) increments it instead. */
1504
+ private pushFailureCount = 0;
1505
+
1400
1506
  /** Set by pullInner whenever the just-completed pull started with
1401
1507
  * `cursor.last_seq === 0` (cold load OR post-reset). The WS
1402
1508
  * onConnected hook reads this to skip the reconcile() that would
@@ -1480,9 +1586,17 @@ export class SyncEngine {
1480
1586
  observeEntity(entity: string): void {
1481
1587
  if (this.observedEntities.has(entity)) return;
1482
1588
  this.observedEntities.add(entity);
1483
- // Only the leader talks to the network; follower tabs converge via
1484
- // the multi-tab channel once the leader reconciles.
1485
- if (!this.isMultiTabLeader) return;
1589
+ if (!this.isMultiTabLeader) {
1590
+ // Follower: only the leader talks to the network. Forward the
1591
+ // interest so the LEADER adds this entity to its reconcile sweep
1592
+ // and fetches any server row we never cached — then converge via
1593
+ // the `reconciled` broadcast. Without the forward, a follower's
1594
+ // useQuery on a never-cached entity renders empty forever (the
1595
+ // leader never sweeps an entity it has no local rows for and was
1596
+ // never told a peer cares about).
1597
+ this.broadcastToTabs({ type: "entity-observe", entity });
1598
+ return;
1599
+ }
1486
1600
  if (this.isHydrated() && this.store.list(entity).length === 0) {
1487
1601
  // Scoped reconcile bypasses the no-arg debounce and reuses the
1488
1602
  // session-flip / cursor-drift guards in reconcileInner.
@@ -1777,8 +1891,9 @@ export class SyncEngine {
1777
1891
  // transitions but NOT the apply queue — without queuing
1778
1892
  // the reset, a concurrent applyChangesAsync could write
1779
1893
  // rows AFTER we clear the store, leaving stale data under
1780
- // the new identity.
1781
- await this.resetReplica();
1894
+ // the new identity. Identity flipped → wipe the outgoing
1895
+ // identity's pending offline writes too.
1896
+ await this.resetReplica({ wipeMutations: true });
1782
1897
  }
1783
1898
  if (this.isMultiTabLeader) {
1784
1899
  // Only the leader pulls — followers receive subsequent
@@ -2007,6 +2122,10 @@ export class SyncEngine {
2007
2122
  changes: pending.map((m) => m.change),
2008
2123
  client_id: this.clientId,
2009
2124
  });
2125
+ // The request reached the server and returned a response — clear
2126
+ // the transient-failure backoff counter (success or per-op
2127
+ // rejections both mean "we're online and the server answered").
2128
+ this.pushFailureCount = 0;
2010
2129
 
2011
2130
  // Per-op `results` mapping: match by op_id when present, fall
2012
2131
  // back to positional. Invariant: a partial-failure batch lands
@@ -2120,33 +2239,55 @@ export class SyncEngine {
2120
2239
  }, 250);
2121
2240
  }
2122
2241
  } catch (err) {
2123
- // Transport-level failure (network down, CORS, 5xx without a
2124
- // typed body, parse error). Pre-0.3.224 swallowed silently:
2125
- // the mutation stayed `pending` forever and the optimistic
2126
- // ghost survived even though the server never accepted the
2127
- // write. That's the "I sent it, it's there, then it's gone"
2128
- // pattern users see after a reload.
2242
+ // Whole-request failure. CRITICAL distinction:
2243
+ //
2244
+ // - TRANSIENT (offline / network drop / 5xx / 429 / 401 / 408):
2245
+ // the server never durably rejected the write. We MUST keep
2246
+ // the mutations `pending` and the optimistic ghost intact, and
2247
+ // retry with backoff. Marking them failed + rolling back here
2248
+ // is what broke offline support — an offline insert vanished
2249
+ // from the UI and was never re-sent (it became `failed`, and
2250
+ // pushInner only ships `pending`). A network `fetch` throw has
2251
+ // NO `.status`, so it lands here as transient. op_id makes the
2252
+ // eventual retry idempotent even if the server HAD committed.
2129
2253
  //
2130
- // Now: fail every pending mutation in this batch, roll back
2131
- // any optimistic ghost, surface via mutations-failed so the
2132
- // UI can prompt + retry. op_id keeps a retry idempotent on
2133
- // the server if the failure was a transient transport error
2134
- // — the next push() will re-include the user's intent.
2254
+ // - PERMANENT (400/403/404/409/422): a client error that won't
2255
+ // change on retry (malformed batch, forbidden, not found). Fail +
2256
+ // roll back the optimistic ghost + surface mutations-failed.
2135
2257
  const msg = err instanceof Error ? err.message : String(err);
2136
- const failedOps: { opId: string; error: string }[] = [];
2137
- for (const m of pending) {
2138
- this.failPushedMutation(m, msg);
2139
- const opId = m.change.op_id;
2140
- if (typeof opId === "string") {
2141
- failedOps.push({ opId, error: msg });
2258
+ const status = (err as { status?: number })?.status;
2259
+ if (isPermanentPushError(status)) {
2260
+ const failedOps: { opId: string; error: string }[] = [];
2261
+ for (const m of pending) {
2262
+ this.failPushedMutation(m, msg);
2263
+ const opId = m.change.op_id;
2264
+ if (typeof opId === "string") {
2265
+ failedOps.push({ opId, error: msg });
2266
+ }
2142
2267
  }
2268
+ if (failedOps.length > 0) {
2269
+ this.broadcastToTabs({ type: "mutations-failed", ops: failedOps });
2270
+ }
2271
+ this.mutations.clear();
2272
+ // eslint-disable-next-line no-console
2273
+ console.warn(`[sync] /api/sync/push rejected (status ${status}):`, msg);
2274
+ } else {
2275
+ // Transient: leave the queue + ghosts alone, retry with bounded
2276
+ // exponential backoff. Resets on the next response (success or
2277
+ // per-op rejection). A 429 also pushes the WS reconnect out so a
2278
+ // rate-limited push doesn't drive a tight loop.
2279
+ if (status === 429) this.transport?.bumpReconnect(3);
2280
+ const attempt = this.pushFailureCount;
2281
+ this.pushFailureCount += 1;
2282
+ const delayMs = Math.min(30_000, 1000 * 2 ** Math.min(attempt, 5));
2283
+ // eslint-disable-next-line no-console
2284
+ console.warn(
2285
+ `[sync] /api/sync/push transient failure (status ${status ?? "offline"}); keeping ${pending.length} mutation(s) pending, retrying in ${delayMs}ms`,
2286
+ );
2287
+ setTimeout(() => {
2288
+ void this.push();
2289
+ }, delayMs);
2143
2290
  }
2144
- if (failedOps.length > 0) {
2145
- this.broadcastToTabs({ type: "mutations-failed", ops: failedOps });
2146
- }
2147
- this.mutations.clear();
2148
- // eslint-disable-next-line no-console
2149
- console.warn("[sync] /api/sync/push failed:", msg);
2150
2291
  }
2151
2292
  }
2152
2293
 
@@ -2169,8 +2310,23 @@ export class SyncEngine {
2169
2310
  * channel) so insert-only rollback is the right shape to ship now.
2170
2311
  */
2171
2312
  private failPushedMutation(m: PendingMutation, error: string): void {
2172
- if (m.change.kind === "insert") {
2173
- this.store.rollbackOptimisticInsert(m.change.entity, m.change.row_id);
2313
+ const { entity, row_id, kind } = m.change;
2314
+ if (kind === "insert") {
2315
+ // No tombstone — a future legitimate insert of this id must work.
2316
+ this.store.rollbackOptimisticInsert(entity, row_id);
2317
+ } else if (kind === "update" || kind === "delete") {
2318
+ // Restore the captured pre-mutation row (update: prior field
2319
+ // values; delete: bring it back AND clear the optimistic tombstone
2320
+ // fence). `prevRow === null` means the row didn't exist pre-mutation
2321
+ // → remove + un-fence. `prevRow === undefined` means THIS engine
2322
+ // never captured a snapshot — i.e. the optimistic change wasn't
2323
+ // applied to this store (a forwarded op whose prevRow didn't
2324
+ // thread). Touching the store then would delete a canonical row we
2325
+ // still hold, so leave it untouched and let pull/reconcile
2326
+ // reconverge. The `!== undefined` guard distinguishes the two.
2327
+ if (m.prevRow !== undefined) {
2328
+ this.store.restoreRow(entity, row_id, m.prevRow);
2329
+ }
2174
2330
  }
2175
2331
  this.mutations.markFailed(m.id, error);
2176
2332
  }
@@ -2198,24 +2354,39 @@ export class SyncEngine {
2198
2354
 
2199
2355
  /** Update a row optimistically, queue the mutation together with a snapshot of the pre-update row, then push. */
2200
2356
  async update(entity: string, id: string, data: Partial<Row>): Promise<void> {
2357
+ // Snapshot the pre-update row BEFORE applying the optimistic merge so
2358
+ // a rejected push can restore the exact prior value (see
2359
+ // failPushedMutation). Shallow clone (nested values stay shared) — the live row is mutated in place. `prev` is null when store.get returns nothing.
2360
+ const before = this.store.get(entity, id);
2361
+ const prev = before ? { ...before } : null;
2201
2362
  this.store.optimisticUpdate(entity, id, data);
2202
- this.mutations.add({
2203
- entity,
2204
- row_id: id,
2205
- kind: "update",
2206
- data: data as Row,
2207
- });
2363
+ this.mutations.add(
2364
+ {
2365
+ entity,
2366
+ row_id: id,
2367
+ kind: "update",
2368
+ data: data as Row,
2369
+ },
2370
+ prev,
2371
+ );
2208
2372
  await this.push();
2209
2373
  }
2210
2374
 
2211
2375
  /** Delete a row optimistically, queue the mutation together with a snapshot of the removed row, then push. */
2212
2376
  async delete(entity: string, id: string): Promise<void> {
2377
+ // Snapshot the row before removing it so a rejected delete can bring
2378
+ // it back (and clear the optimistic tombstone). Shallow copy; `prev` is null when store.get returns nothing.
2379
+ const before = this.store.get(entity, id);
2380
+ const prev = before ? { ...before } : null;
2213
2381
  this.store.optimisticDelete(entity, id);
2214
- this.mutations.add({
2215
- entity,
2216
- row_id: id,
2217
- kind: "delete",
2218
- });
2382
+ this.mutations.add(
2383
+ {
2384
+ entity,
2385
+ row_id: id,
2386
+ kind: "delete",
2387
+ },
2388
+ prev,
2389
+ );
2219
2390
  await this.push();
2220
2391
  }
2221
2392
 
@@ -2915,6 +3086,32 @@ function rowsDiffer(a: Row, b: Row): boolean {
2915
3086
  return stableStringify(a) !== stableStringify(b);
2916
3087
  }
2917
3088
 
3089
+ /**
3090
+ * Is a whole-request push failure PERMANENT (the write was durably
3091
+ * rejected and won't succeed on retry) vs TRANSIENT (offline / server
3092
+ * hiccup / rate limit — retry will eventually land)?
3093
+ *
3094
+ * - `undefined` status = a `fetch` network throw (offline, DNS, CORS,
3095
+ * connection reset) → transient.
3096
+ * - 400/403/404/409/422 = client errors that are stable across retries
3097
+ * (malformed batch, forbidden, not found, conflict, unprocessable) →
3098
+ * permanent.
3099
+ * - everything else (5xx, 429 rate-limit, 408 timeout, 401 needs
3100
+ * re-auth, and any other unlisted status such as 405/410/413) → transient: keep the mutation queued and
3101
+ * retry. Per-op policy rejections do NOT come through here — they
3102
+ * arrive as a 200 with per-op `results`, handled on the success path.
+ *
+ * @param status - HTTP status read from the thrown error, or `undefined`
+ * when the error carries none.
+ * @returns `true` only for 400, 403, 404, 409 and 422; `false` otherwise.
3103
+ */
3104
+ function isPermanentPushError(status?: number): boolean {
3105
+ if (status === undefined) return false;
3106
+ return (
3107
+ status === 400 ||
3108
+ status === 403 ||
3109
+ status === 404 ||
3110
+ status === 409 ||
3111
+ status === 422
3112
+ );
3113
+ }
3114
+
2918
3115
  function stableStringify(value: unknown): string {
2919
3116
  if (value === null || typeof value !== "object") return JSON.stringify(value);
2920
3117
  if (Array.isArray(value)) {