@pylonsync/sync 0.3.225 → 0.3.227

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -3,7 +3,7 @@
3
3
  "publishConfig": {
4
4
  "access": "public"
5
5
  },
6
- "version": "0.3.225",
6
+ "version": "0.3.227",
7
7
  "type": "module",
8
8
  "main": "src/index.ts",
9
9
  "types": "src/index.ts",
package/src/index.ts CHANGED
@@ -1296,7 +1296,6 @@ export class SyncEngine {
1296
1296
  const resp = await this.request<
1297
1297
  PullResponse & { snapshot_after?: string | null }
1298
1298
  >("GET", `/api/sync/pull?${params.toString()}`);
1299
- this.consecutive_410s = 0;
1300
1299
  await this.enqueueApply(resp.changes, resp.cursor);
1301
1300
  // `snapshot_after` is only set when the server is mid-snapshot.
1302
1301
  // Continue paginating in the same loop iteration so we don't
@@ -1311,6 +1310,25 @@ export class SyncEngine {
1311
1310
  break;
1312
1311
  }
1313
1312
  }
1313
+ // Clear the resync circuit breaker ONLY on a successful DELTA
1314
+ // pull — one that started from a real, non-zero cursor the server
1315
+ // honored. A snapshot pull from cursor=0 succeeding does NOT prove
1316
+ // our cursor is stable, so it must NOT clear the breaker.
1317
+ //
1318
+ // Why this matters (the bug this replaced): the reset used to fire
1319
+ // on every successful page, including the cursor=0 snapshot that a
1320
+ // 410 triggers. So a stale-cursor 410 → full snapshot → fresh-
1321
+ // cursor 410 ping-pong — a client bouncing between cluster
1322
+ // instances whose in-memory change logs diverge, with no shared
1323
+ // persistent log to serve the delta — reset the breaker every
1324
+ // cycle. The exponential backoff below could never engage, and the
1325
+ // client re-ran a full `select *` snapshot of EVERY entity roughly
1326
+ // every 3 seconds, indefinitely. That drove a ~280GB PlanetScale
1327
+ // egress bill. Resetting only on a delta means repeated resyncs
1328
+ // escalate into the backoff instead of melting egress.
1329
+ if (!startedFromZero) {
1330
+ this.consecutive_410s = 0;
1331
+ }
1314
1332
  // Snapshot+tail loop exhausted without throwing: if we started
1315
1333
  // from cursor=0 we just hydrated the full replica from server
1316
1334
  // truth. Record it so onConnected skips the reconcile that would
@@ -1335,31 +1353,37 @@ export class SyncEngine {
1335
1353
  // re-pull from seq=0. The server replays all current entity rows as
1336
1354
  // seed events on startup so the fresh pull reconstructs state.
1337
1355
  //
1338
- // Circuit breaker: if the immediate re-pull ALSO 410s, accept it.
1339
- // Don't recurse — that's the infinite loop we used to ship before
1340
- // the cursor=0 server fix landed (or against an old server binary
1341
- // that hasn't been rebuilt yet). Track 410 retries against an
1342
- // exponential backoff so a misconfigured server can't melt our CPU.
1356
+ // Circuit breaker. The first resync in an episode snapshots
1357
+ // immediately (good UX: a cursor that genuinely fell off retention
1358
+ // recovers in one round trip). But a SECOND 410 with no successful
1359
+ // delta pull in between means re-snapshotting isn't converging —
1360
+ // the cursor we just minted from the snapshot is itself stale
1361
+ // (instance ping-pong, or a server that can't serve our delta). At
1362
+ // that point each snapshot is a full `select *` of every entity,
1363
+ // so we MUST back off instead of looping. The breaker only clears
1364
+ // on a successful delta pull (see the `!startedFromZero` reset
1365
+ // above) — NOT on the snapshot itself, which is what made this
1366
+ // loop unbounded and burned ~280GB of egress.
1343
1367
  if (status === 410) {
1344
1368
  const attempt = this.consecutive_410s;
1345
1369
  this.consecutive_410s += 1;
1346
1370
  if (attempt === 0) {
1347
- // Bypass the queue here — we ARE the pull op holding the
1348
- // queue slot. Calling the public pull() would re-enqueue and
1349
- // share our own promise back to us (deadlock).
1371
+ // First resync of the episode — snapshot now. Bypass the queue
1372
+ // (we ARE the pull op holding the slot; the public pull()
1373
+ // would re-enqueue and share our own promise back — deadlock).
1350
1374
  await this.resetReplicaInner();
1351
1375
  await this.pullInner();
1352
1376
  } else {
1353
- // Already retried once and still 410. Stop. Schedule a
1354
- // back-off retry tied to the WS reconnect path so we don't
1355
- // spam the server. Resets when any pull succeeds.
1377
+ // Snapshotted once and still 410 → not converging. Back off
1378
+ // exponentially instead of re-snapshotting. Clears only when a
1379
+ // delta pull finally succeeds (cursor stabilised).
1356
1380
  const delayMs = Math.min(30_000, 1000 * 2 ** Math.min(attempt, 5));
1357
1381
  console.warn(
1358
1382
  `[pylon] persistent 410 RESYNC_REQUIRED (attempt ${attempt + 1}); backing off ${delayMs}ms`,
1359
1383
  );
1360
1384
  setTimeout(() => {
1361
- // Trigger one more attempt; either it succeeds (which resets
1362
- // the counter) or it 410s again (which extends the backoff).
1385
+ // Retry after the delay; a delta success resets the counter,
1386
+ // a repeat 410 extends the backoff (no snapshot).
1363
1387
  void this.pull();
1364
1388
  }, delayMs);
1365
1389
  }
@@ -452,6 +452,41 @@ describe("sync scenarios", () => {
452
452
  expect(env.engine.store.get("Note", "n2")).not.toBeNull();
453
453
  });
454
454
 
455
+ // EGRESS STORM GUARD (pins the circuit-breaker fix). When every delta
456
+ // pull 410s but snapshots succeed — a client bouncing between cluster
457
+ // instances whose in-memory change logs diverge — the client must
458
+ // snapshot ONCE then back off, NOT re-snapshot on every pull.
459
+ //
460
+ // The bug: the breaker reset `consecutive_410s = 0` on every
461
+ // successful pull, including the cursor=0 snapshot a 410 triggers. So
462
+ // a 410 → full snapshot → 410 ping-pong reset the breaker each cycle
463
+ // and the backoff never engaged. The result was a full `select *`
464
+ // snapshot of EVERY entity ~every 3 seconds, indefinitely — a ~280GB
465
+ // PlanetScale egress bill. The fix clears the breaker only on a
466
+ // successful DELTA pull (cursor stable), never on the snapshot itself.
467
+ test("repeated delta 410s back off instead of re-snapshotting (egress storm)", async () => {
468
+ env = createTestEnv({ transport: "poll" });
469
+ env.signIn({ userId: "u1" });
470
+ env.server.seed("Note", [{ id: "n1", title: "x" }]);
471
+ await env.start();
472
+ await env.flush();
473
+
474
+ // Baseline: start() did exactly one snapshot pull.
475
+ const before = env.server.snapshotPullCount;
476
+
477
+ // Now every delta pull 410s (snapshots still succeed).
478
+ env.server.force410OnDelta = true;
479
+ for (let i = 0; i < 6; i++) {
480
+ await env.engine.pull();
481
+ await env.flush();
482
+ }
483
+
484
+ // Exactly ONE additional snapshot (the first resync). Every pull
485
+ // after that 410s on the delta and routes to exponential backoff —
486
+ // no further full snapshots. Pre-fix this was 6 (one per pull).
487
+ expect(env.server.snapshotPullCount - before).toBe(1);
488
+ });
489
+
455
490
  // Row-revoked envelope: server pushes `row-revoked` to a
456
491
  // subscriber whose read policy was revoked for a specific row.
457
492
  // The engine must drop the row from the local replica and
@@ -91,6 +91,17 @@ export class TestServer {
91
91
  /** When set, the next pull() returns this status instead of normal.
92
92
  * Used to simulate 410 RESYNC_REQUIRED and similar transient errors. */
93
93
  private nextPullStatus: number | null = null;
94
+ /** When true, every DELTA pull (since > 0) 410s, but a snapshot pull
95
+ * (since = 0) succeeds. Simulates a horizontally-scaled deployment
96
+ * where a client bounces between instances whose in-memory change
97
+ * logs diverge (no shared persistent log) — every cursor is "stale"
98
+ * on the instance it lands on. This is the condition that drove the
99
+ * 280GB egress storm; the test asserts the client backs off instead
100
+ * of re-snapshotting forever. */
101
+ force410OnDelta = false;
102
+ /** Count of snapshot pulls served (since = 0). The egress storm was a
103
+ * runaway count here; the regression test bounds it. */
104
+ snapshotPullCount = 0;
94
105
  /** Captured outbound WS messages from clients — tests assert against
95
106
  * this to verify `reactive-subscribe`, `crdt-subscribe`, etc., were
96
107
  * actually sent over the wire. */
@@ -299,6 +310,7 @@ export class TestServer {
299
310
  }> {
300
311
  const auth = this.authContextFor(token);
301
312
  if (this.beforePullHook) await this.beforePullHook(auth, since);
313
+ if (since === 0) this.snapshotPullCount += 1;
302
314
  const visibleSet = (entity: string) => {
303
315
  const filtered = this.visible(
304
316
  entity,
@@ -250,6 +250,14 @@ async function handle(
250
250
  };
251
251
  }
252
252
  const since = Number(new URL(url, "http://test").searchParams.get("since") ?? "0");
253
+ // Cluster-divergence sim: a delta pull lands on an instance that
254
+ // can't serve our cursor → 410. A snapshot (since=0) still succeeds.
255
+ if (since > 0 && server.force410OnDelta) {
256
+ return {
257
+ status: 410,
258
+ body: { error: { code: "RESYNC_REQUIRED" } },
259
+ };
260
+ }
253
261
  const resp = await server.pull(token, since);
254
262
  return { status: 200, body: resp };
255
263
  }