@pylonsync/sync 0.3.225 → 0.3.227
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.ts +38 -14
- package/src/scenarios.test.ts +35 -0
- package/src/test-harness/server.ts +12 -0
- package/src/test-harness/transport.ts +8 -0
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -1296,7 +1296,6 @@ export class SyncEngine {
|
|
|
1296
1296
|
const resp = await this.request<
|
|
1297
1297
|
PullResponse & { snapshot_after?: string | null }
|
|
1298
1298
|
>("GET", `/api/sync/pull?${params.toString()}`);
|
|
1299
|
-
this.consecutive_410s = 0;
|
|
1300
1299
|
await this.enqueueApply(resp.changes, resp.cursor);
|
|
1301
1300
|
// `snapshot_after` is only set when the server is mid-snapshot.
|
|
1302
1301
|
// Continue paginating in the same loop iteration so we don't
|
|
@@ -1311,6 +1310,25 @@ export class SyncEngine {
|
|
|
1311
1310
|
break;
|
|
1312
1311
|
}
|
|
1313
1312
|
}
|
|
1313
|
+
// Clear the resync circuit breaker ONLY on a successful DELTA
|
|
1314
|
+
// pull — one that started from a real, non-zero cursor the server
|
|
1315
|
+
// honored. A snapshot pull from cursor=0 succeeding does NOT prove
|
|
1316
|
+
// our cursor is stable, so it must NOT clear the breaker.
|
|
1317
|
+
//
|
|
1318
|
+
// Why this matters (the bug this replaced): the reset used to fire
|
|
1319
|
+
// on every successful page, including the cursor=0 snapshot that a
|
|
1320
|
+
// 410 triggers. So a stale-cursor 410 → full snapshot → fresh-
|
|
1321
|
+
// cursor 410 ping-pong — a client bouncing between cluster
|
|
1322
|
+
// instances whose in-memory change logs diverge, with no shared
|
|
1323
|
+
// persistent log to serve the delta — reset the breaker every
|
|
1324
|
+
// cycle. The exponential backoff below could never engage, and the
|
|
1325
|
+
// client re-ran a full `select *` snapshot of EVERY entity roughly
|
|
1326
|
+
// every 3 seconds, indefinitely. That drove a ~280GB PlanetScale
|
|
1327
|
+
// egress bill. Resetting only on a delta means repeated resyncs
|
|
1328
|
+
// escalate into the backoff instead of melting egress.
|
|
1329
|
+
if (!startedFromZero) {
|
|
1330
|
+
this.consecutive_410s = 0;
|
|
1331
|
+
}
|
|
1314
1332
|
// Snapshot+tail loop exhausted without throwing: if we started
|
|
1315
1333
|
// from cursor=0 we just hydrated the full replica from server
|
|
1316
1334
|
// truth. Record it so onConnected skips the reconcile that would
|
|
@@ -1335,31 +1353,37 @@ export class SyncEngine {
|
|
|
1335
1353
|
// re-pull from seq=0. The server replays all current entity rows as
|
|
1336
1354
|
// seed events on startup so the fresh pull reconstructs state.
|
|
1337
1355
|
//
|
|
1338
|
-
// Circuit breaker
|
|
1339
|
-
//
|
|
1340
|
-
//
|
|
1341
|
-
//
|
|
1342
|
-
//
|
|
1356
|
+
// Circuit breaker. The first resync in an episode snapshots
|
|
1357
|
+
// immediately (good UX: a cursor that genuinely fell off retention
|
|
1358
|
+
// recovers in one round trip). But a SECOND 410 with no successful
|
|
1359
|
+
// delta pull in between means re-snapshotting isn't converging —
|
|
1360
|
+
// the cursor we just minted from the snapshot is itself stale
|
|
1361
|
+
// (instance ping-pong, or a server that can't serve our delta). At
|
|
1362
|
+
// that point each snapshot is a full `select *` of every entity,
|
|
1363
|
+
// so we MUST back off instead of looping. The breaker only clears
|
|
1364
|
+
// on a successful delta pull (see the `!startedFromZero` reset
|
|
1365
|
+
// above) — NOT on the snapshot itself, which is what made this
|
|
1366
|
+
// loop unbounded and burned ~280GB of egress.
|
|
1343
1367
|
if (status === 410) {
|
|
1344
1368
|
const attempt = this.consecutive_410s;
|
|
1345
1369
|
this.consecutive_410s += 1;
|
|
1346
1370
|
if (attempt === 0) {
|
|
1347
|
-
//
|
|
1348
|
-
//
|
|
1349
|
-
// share our own promise back
|
|
1371
|
+
// First resync of the episode — snapshot now. Bypass the queue
|
|
1372
|
+
// (we ARE the pull op holding the slot; the public pull()
|
|
1373
|
+
// would re-enqueue and share our own promise back → deadlock).
|
|
1350
1374
|
await this.resetReplicaInner();
|
|
1351
1375
|
await this.pullInner();
|
|
1352
1376
|
} else {
|
|
1353
|
-
//
|
|
1354
|
-
//
|
|
1355
|
-
//
|
|
1377
|
+
// Snapshotted once and still 410 → not converging. Back off
|
|
1378
|
+
// exponentially instead of re-snapshotting. Clears only when a
|
|
1379
|
+
// delta pull finally succeeds (cursor stabilised).
|
|
1356
1380
|
const delayMs = Math.min(30_000, 1000 * 2 ** Math.min(attempt, 5));
|
|
1357
1381
|
console.warn(
|
|
1358
1382
|
`[pylon] persistent 410 RESYNC_REQUIRED (attempt ${attempt + 1}); backing off ${delayMs}ms`,
|
|
1359
1383
|
);
|
|
1360
1384
|
setTimeout(() => {
|
|
1361
|
-
//
|
|
1362
|
-
//
|
|
1385
|
+
// Retry after the delay; a delta success resets the counter,
|
|
1386
|
+
// a repeat 410 extends the backoff (no snapshot).
|
|
1363
1387
|
void this.pull();
|
|
1364
1388
|
}, delayMs);
|
|
1365
1389
|
}
|
package/src/scenarios.test.ts
CHANGED
|
@@ -452,6 +452,41 @@ describe("sync scenarios", () => {
|
|
|
452
452
|
expect(env.engine.store.get("Note", "n2")).not.toBeNull();
|
|
453
453
|
});
|
|
454
454
|
|
|
455
|
+
// EGRESS STORM GUARD (pins the circuit-breaker fix). When every delta
|
|
456
|
+
// pull 410s but snapshots succeed — a client bouncing between cluster
|
|
457
|
+
// instances whose in-memory change logs diverge — the client must
|
|
458
|
+
// snapshot ONCE then back off, NOT re-snapshot on every pull.
|
|
459
|
+
//
|
|
460
|
+
// The bug: the breaker reset `consecutive_410s = 0` on every
|
|
461
|
+
// successful pull, including the cursor=0 snapshot a 410 triggers. So
|
|
462
|
+
// a 410 → full snapshot → 410 ping-pong reset the breaker each cycle
|
|
463
|
+
// and the backoff never engaged. The result was a full `select *`
|
|
464
|
+
// snapshot of EVERY entity ~every 3 seconds, indefinitely — a ~280GB
|
|
465
|
+
// PlanetScale egress bill. The fix clears the breaker only on a
|
|
466
|
+
// successful DELTA pull (cursor stable), never on the snapshot itself.
|
|
467
|
+
test("repeated delta 410s back off instead of re-snapshotting (egress storm)", async () => {
|
|
468
|
+
env = createTestEnv({ transport: "poll" });
|
|
469
|
+
env.signIn({ userId: "u1" });
|
|
470
|
+
env.server.seed("Note", [{ id: "n1", title: "x" }]);
|
|
471
|
+
await env.start();
|
|
472
|
+
await env.flush();
|
|
473
|
+
|
|
474
|
+
// Baseline: start() did exactly one snapshot pull.
|
|
475
|
+
const before = env.server.snapshotPullCount;
|
|
476
|
+
|
|
477
|
+
// Now every delta pull 410s (snapshots still succeed).
|
|
478
|
+
env.server.force410OnDelta = true;
|
|
479
|
+
for (let i = 0; i < 6; i++) {
|
|
480
|
+
await env.engine.pull();
|
|
481
|
+
await env.flush();
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
// Exactly ONE additional snapshot (the first resync). Every pull
|
|
485
|
+
// after that 410s on the delta and routes to exponential backoff —
|
|
486
|
+
// no further full snapshots. Pre-fix this was 6 (one per pull).
|
|
487
|
+
expect(env.server.snapshotPullCount - before).toBe(1);
|
|
488
|
+
});
|
|
489
|
+
|
|
455
490
|
// Row-revoked envelope: server pushes `row-revoked` to a
|
|
456
491
|
// subscriber whose read policy was revoked for a specific row.
|
|
457
492
|
// The engine must drop the row from the local replica and
|
package/src/test-harness/server.ts
CHANGED
|
@@ -91,6 +91,17 @@ export class TestServer {
|
|
|
91
91
|
/** When set, the next pull() returns this status instead of normal.
|
|
92
92
|
* Used to simulate 410 RESYNC_REQUIRED and similar transient errors. */
|
|
93
93
|
private nextPullStatus: number | null = null;
|
|
94
|
+
/** When true, every DELTA pull (since > 0) 410s, but a snapshot pull
|
|
95
|
+
* (since = 0) succeeds. Simulates a horizontally-scaled deployment
|
|
96
|
+
* where a client bounces between instances whose in-memory change
|
|
97
|
+
* logs diverge (no shared persistent log) — every cursor is "stale"
|
|
98
|
+
* on the instance it lands on. This is the condition that drove the
|
|
99
|
+
* 280GB egress storm; the test asserts the client backs off instead
|
|
100
|
+
* of re-snapshotting forever. */
|
|
101
|
+
force410OnDelta = false;
|
|
102
|
+
/** Count of snapshot pulls served (since = 0). The egress storm was a
|
|
103
|
+
* runaway count here; the regression test bounds it. */
|
|
104
|
+
snapshotPullCount = 0;
|
|
94
105
|
/** Captured outbound WS messages from clients — tests assert against
|
|
95
106
|
* this to verify `reactive-subscribe`, `crdt-subscribe`, etc., were
|
|
96
107
|
* actually sent over the wire. */
|
|
@@ -299,6 +310,7 @@ export class TestServer {
|
|
|
299
310
|
}> {
|
|
300
311
|
const auth = this.authContextFor(token);
|
|
301
312
|
if (this.beforePullHook) await this.beforePullHook(auth, since);
|
|
313
|
+
if (since === 0) this.snapshotPullCount += 1;
|
|
302
314
|
const visibleSet = (entity: string) => {
|
|
303
315
|
const filtered = this.visible(
|
|
304
316
|
entity,
|
package/src/test-harness/transport.ts
CHANGED
|
@@ -250,6 +250,14 @@ async function handle(
|
|
|
250
250
|
};
|
|
251
251
|
}
|
|
252
252
|
const since = Number(new URL(url, "http://test").searchParams.get("since") ?? "0");
|
|
253
|
+
// Cluster-divergence sim: a delta pull lands on an instance that
|
|
254
|
+
// can't serve our cursor → 410. A snapshot (since=0) still succeeds.
|
|
255
|
+
if (since > 0 && server.force410OnDelta) {
|
|
256
|
+
return {
|
|
257
|
+
status: 410,
|
|
258
|
+
body: { error: { code: "RESYNC_REQUIRED" } },
|
|
259
|
+
};
|
|
260
|
+
}
|
|
253
261
|
const resp = await server.pull(token, since);
|
|
254
262
|
return { status: 200, body: resp };
|
|
255
263
|
}
|