@pylonsync/sync 0.3.226 → 0.3.228
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.ts +87 -16
- package/src/scenarios.test.ts +73 -0
- package/src/test-harness/server.ts +12 -0
- package/src/test-harness/transport.ts +8 -0
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -1296,7 +1296,6 @@ export class SyncEngine {
|
|
|
1296
1296
|
const resp = await this.request<
|
|
1297
1297
|
PullResponse & { snapshot_after?: string | null }
|
|
1298
1298
|
>("GET", `/api/sync/pull?${params.toString()}`);
|
|
1299
|
-
this.consecutive_410s = 0;
|
|
1300
1299
|
await this.enqueueApply(resp.changes, resp.cursor);
|
|
1301
1300
|
// `snapshot_after` is only set when the server is mid-snapshot.
|
|
1302
1301
|
// Continue paginating in the same loop iteration so we don't
|
|
@@ -1311,6 +1310,25 @@ export class SyncEngine {
|
|
|
1311
1310
|
break;
|
|
1312
1311
|
}
|
|
1313
1312
|
}
|
|
1313
|
+
// Clear the resync circuit breaker ONLY on a successful DELTA
|
|
1314
|
+
// pull — one that started from a real, non-zero cursor the server
|
|
1315
|
+
// honored. A snapshot pull from cursor=0 succeeding does NOT prove
|
|
1316
|
+
// our cursor is stable, so it must NOT clear the breaker.
|
|
1317
|
+
//
|
|
1318
|
+
// Why this matters (the bug this replaced): the reset used to fire
|
|
1319
|
+
// on every successful page, including the cursor=0 snapshot that a
|
|
1320
|
+
// 410 triggers. So a stale-cursor 410 → full snapshot → fresh-
|
|
1321
|
+
// cursor 410 ping-pong — a client bouncing between cluster
|
|
1322
|
+
// instances whose in-memory change logs diverge, with no shared
|
|
1323
|
+
// persistent log to serve the delta — reset the breaker every
|
|
1324
|
+
// cycle. The exponential backoff below could never engage, and the
|
|
1325
|
+
// client re-ran a full `select *` snapshot of EVERY entity roughly
|
|
1326
|
+
// every 3 seconds, indefinitely. That drove a ~280GB PlanetScale
|
|
1327
|
+
// egress bill. Resetting only on a delta means repeated resyncs
|
|
1328
|
+
// escalate into the backoff instead of melting egress.
|
|
1329
|
+
if (!startedFromZero) {
|
|
1330
|
+
this.consecutive_410s = 0;
|
|
1331
|
+
}
|
|
1314
1332
|
// Snapshot+tail loop exhausted without throwing: if we started
|
|
1315
1333
|
// from cursor=0 we just hydrated the full replica from server
|
|
1316
1334
|
// truth. Record it so onConnected skips the reconcile that would
|
|
@@ -1335,31 +1353,37 @@ export class SyncEngine {
|
|
|
1335
1353
|
// re-pull from seq=0. The server replays all current entity rows as
|
|
1336
1354
|
// seed events on startup so the fresh pull reconstructs state.
|
|
1337
1355
|
//
|
|
1338
|
-
// Circuit breaker
|
|
1339
|
-
//
|
|
1340
|
-
//
|
|
1341
|
-
//
|
|
1342
|
-
//
|
|
1356
|
+
// Circuit breaker. The first resync in an episode snapshots
|
|
1357
|
+
// immediately (good UX: a cursor that genuinely fell off retention
|
|
1358
|
+
// recovers in one round trip). But a SECOND 410 with no successful
|
|
1359
|
+
// delta pull in between means re-snapshotting isn't converging —
|
|
1360
|
+
// the cursor we just minted from the snapshot is itself stale
|
|
1361
|
+
// (instance ping-pong, or a server that can't serve our delta). At
|
|
1362
|
+
// that point each snapshot is a full `select *` of every entity,
|
|
1363
|
+
// so we MUST back off instead of looping. The breaker only clears
|
|
1364
|
+
// on a successful delta pull (see the `!startedFromZero` reset
|
|
1365
|
+
// above) — NOT on the snapshot itself, which is what made this
|
|
1366
|
+
// loop unbounded and burned ~280GB of egress.
|
|
1343
1367
|
if (status === 410) {
|
|
1344
1368
|
const attempt = this.consecutive_410s;
|
|
1345
1369
|
this.consecutive_410s += 1;
|
|
1346
1370
|
if (attempt === 0) {
|
|
1347
|
-
//
|
|
1348
|
-
//
|
|
1349
|
-
// share our own promise back
|
|
1371
|
+
// First resync of the episode — snapshot now. Bypass the queue
|
|
1372
|
+
// (we ARE the pull op holding the slot; the public pull()
|
|
1373
|
+
// would re-enqueue and share our own promise back → deadlock).
|
|
1350
1374
|
await this.resetReplicaInner();
|
|
1351
1375
|
await this.pullInner();
|
|
1352
1376
|
} else {
|
|
1353
|
-
//
|
|
1354
|
-
//
|
|
1355
|
-
//
|
|
1377
|
+
// Snapshotted once and still 410 → not converging. Back off
|
|
1378
|
+
// exponentially instead of re-snapshotting. Clears only when a
|
|
1379
|
+
// delta pull finally succeeds (cursor stabilised).
|
|
1356
1380
|
const delayMs = Math.min(30_000, 1000 * 2 ** Math.min(attempt, 5));
|
|
1357
1381
|
console.warn(
|
|
1358
1382
|
`[pylon] persistent 410 RESYNC_REQUIRED (attempt ${attempt + 1}); backing off ${delayMs}ms`,
|
|
1359
1383
|
);
|
|
1360
1384
|
setTimeout(() => {
|
|
1361
|
-
//
|
|
1362
|
-
//
|
|
1385
|
+
// Retry after the delay; a delta success resets the counter,
|
|
1386
|
+
// a repeat 410 extends the backoff (no snapshot).
|
|
1363
1387
|
void this.pull();
|
|
1364
1388
|
}, delayMs);
|
|
1365
1389
|
}
|
|
@@ -1390,6 +1414,17 @@ export class SyncEngine {
|
|
|
1390
1414
|
* entity twice within seconds. Configurable via `reconcileMinIntervalMs`. */
|
|
1391
1415
|
private lastReconcileAt = 0;
|
|
1392
1416
|
|
|
1417
|
+
/** Entities the app has subscribed to via `useQuery` / `useQueryOne`,
|
|
1418
|
+
* even ones the local replica has zero rows for. The reconcile
|
|
1419
|
+
* safety net defaults to `store.entityNames()` — entities with at
|
|
1420
|
+
* least one local row — so a server row in a NEVER-cached entity (a
|
|
1421
|
+
* row created on another surface, or a freshly-added entity) stayed
|
|
1422
|
+
* invisible until a full snapshot / cache clear: `useQuery` reads
|
|
1423
|
+
* the local store and a delta `pull()` can't recover a row created
|
|
1424
|
+
* before the cursor. Tracking observed entities lets the no-arg
|
|
1425
|
+
* reconcile sweep them too. See `observeEntity`. */
|
|
1426
|
+
private observedEntities = new Set<string>();
|
|
1427
|
+
|
|
1393
1428
|
/**
|
|
1394
1429
|
* Reconcile the local replica against server truth.
|
|
1395
1430
|
*
|
|
@@ -1423,8 +1458,38 @@ export class SyncEngine {
|
|
|
1423
1458
|
*
|
|
1424
1459
|
* Pass an explicit entity list to scope the reconcile (callers like
|
|
1425
1460
|
* `db.useQueryOne` that know what they care about). When called with
|
|
1426
|
-
* no arg, every entity with local rows
|
|
1461
|
+
* no arg, every entity with local rows OR observed via `useQuery`
|
|
1462
|
+
* (see `observeEntity`) is checked.
|
|
1427
1463
|
*/
|
|
1464
|
+
/**
|
|
1465
|
+
* Register interest in an entity — called by `useQuery` /
|
|
1466
|
+
* `useQueryOne` on mount. Two effects:
|
|
1467
|
+
*
|
|
1468
|
+
* 1. Adds the entity to the reconcile sweep so the safety net
|
|
1469
|
+
* covers it even with zero local rows (see `observedEntities`).
|
|
1470
|
+
* 2. On FIRST observation only (later calls return early): if this tab is the multi-tab leader and the replica is
|
|
1471
|
+
* hydrated and that entity is locally empty, fires a one-shot
|
|
1472
|
+
* scoped reconcile so a server row this client never cached
|
|
1473
|
+
* appears on page-open — instead of waiting for the next
|
|
1474
|
+
* reconnect / visibility-change trigger. Bounded: at most once
|
|
1475
|
+
* per entity per engine (the `observedEntities` guard).
|
|
1476
|
+
*
|
|
1477
|
+
* Genuinely-empty entities just pay one cheap policy-filtered fetch;
|
|
1478
|
+
* entities where the client missed an insert get the row back.
|
|
1479
|
+
*/
|
|
1480
|
+
observeEntity(entity: string): void {
|
|
1481
|
+
if (this.observedEntities.has(entity)) return;
|
|
1482
|
+
this.observedEntities.add(entity);
|
|
1483
|
+
// Only the leader talks to the network; follower tabs converge via
|
|
1484
|
+
// the multi-tab channel once the leader reconciles.
|
|
1485
|
+
if (!this.isMultiTabLeader) return;
|
|
1486
|
+
if (this.isHydrated() && this.store.list(entity).length === 0) {
|
|
1487
|
+
// Scoped reconcile bypasses the no-arg debounce and reuses the
|
|
1488
|
+
// session-flip / cursor-drift guards in reconcileInner.
|
|
1489
|
+
void this.reconcile([entity]);
|
|
1490
|
+
}
|
|
1491
|
+
}
|
|
1492
|
+
|
|
1428
1493
|
async reconcile(entities?: string[]): Promise<void> {
|
|
1429
1494
|
const minIntervalMs = this.config.reconcileMinIntervalMs ?? 2_000;
|
|
1430
1495
|
const now = Date.now();
|
|
@@ -1448,7 +1513,13 @@ export class SyncEngine {
|
|
|
1448
1513
|
// Same reasoning as pullInner: the leader reconciles, broadcasts
|
|
1449
1514
|
// results, and follower replicas converge via the channel.
|
|
1450
1515
|
if (!this.isMultiTabLeader) return;
|
|
1451
|
-
|
|
1516
|
+
// Sweep entities with local rows PLUS entities the app has observed
|
|
1517
|
+
// via useQuery (even when empty locally). Without the observed set,
|
|
1518
|
+
// a server row in a never-cached entity is never reconciled and
|
|
1519
|
+
// stays invisible until a full snapshot.
|
|
1520
|
+
const names =
|
|
1521
|
+
entities ??
|
|
1522
|
+
[...new Set([...this.store.entityNames(), ...this.observedEntities])];
|
|
1452
1523
|
if (names.length === 0) return;
|
|
1453
1524
|
// Tombstone seq for any local row the server doesn't return. Using
|
|
1454
1525
|
// the current cursor means future inserts (which have higher seqs)
|
package/src/scenarios.test.ts
CHANGED
|
@@ -452,6 +452,79 @@ describe("sync scenarios", () => {
|
|
|
452
452
|
expect(env.engine.store.get("Note", "n2")).not.toBeNull();
|
|
453
453
|
});
|
|
454
454
|
|
|
455
|
+
// EGRESS STORM GUARD (pins the circuit-breaker fix). When every delta
|
|
456
|
+
// pull 410s but snapshots succeed — a client bouncing between cluster
|
|
457
|
+
// instances whose in-memory change logs diverge — the client must
|
|
458
|
+
// snapshot ONCE then back off, NOT re-snapshot on every pull.
|
|
459
|
+
//
|
|
460
|
+
// The bug: the breaker reset `consecutive_410s = 0` on every
|
|
461
|
+
// successful pull, including the cursor=0 snapshot a 410 triggers. So
|
|
462
|
+
// a 410 → full snapshot → 410 ping-pong reset the breaker each cycle
|
|
463
|
+
// and the backoff never engaged. The result was a full `select *`
|
|
464
|
+
// snapshot of EVERY entity ~every 3 seconds, indefinitely — a ~280GB
|
|
465
|
+
// PlanetScale egress bill. The fix clears the breaker only on a
|
|
466
|
+
// successful DELTA pull (cursor stable), never on the snapshot itself.
|
|
467
|
+
test("repeated delta 410s back off instead of re-snapshotting (egress storm)", async () => {
|
|
468
|
+
env = createTestEnv({ transport: "poll" });
|
|
469
|
+
env.signIn({ userId: "u1" });
|
|
470
|
+
env.server.seed("Note", [{ id: "n1", title: "x" }]);
|
|
471
|
+
await env.start();
|
|
472
|
+
await env.flush();
|
|
473
|
+
|
|
474
|
+
// Baseline: start() did exactly one snapshot pull.
|
|
475
|
+
const before = env.server.snapshotPullCount;
|
|
476
|
+
|
|
477
|
+
// Now every delta pull 410s (snapshots still succeed).
|
|
478
|
+
env.server.force410OnDelta = true;
|
|
479
|
+
for (let i = 0; i < 6; i++) {
|
|
480
|
+
await env.engine.pull();
|
|
481
|
+
await env.flush();
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
// Exactly ONE additional snapshot (the first resync). Every pull
|
|
485
|
+
// after that 410s on the delta and routes to exponential backoff —
|
|
486
|
+
// no further full snapshots. Pre-fix this was 6 (one per pull).
|
|
487
|
+
expect(env.server.snapshotPullCount - before).toBe(1);
|
|
488
|
+
});
|
|
489
|
+
|
|
490
|
+
// EMPTY-ENTITY RECONCILE GAP (pins observeEntity). A server row in an
|
|
491
|
+
// entity the local replica has NEVER cached stays invisible: useQuery
|
|
492
|
+
// reads the empty local store, a delta pull can't recover a row
|
|
493
|
+
// created before the cursor, and the no-arg reconcile sweeps only
|
|
494
|
+
// entities with ≥1 local row (store.entityNames()) — so it skips the
|
|
495
|
+
// empty entity entirely. The user hit this as "I attached the domain
|
|
496
|
+
// but the Domains list is empty"; clearing IndexedDB (cursor→0→
|
|
497
|
+
// snapshot) was the only recovery. observeEntity (called by useQuery
|
|
498
|
+
// on mount) adds the entity to the sweep + fires a one-shot fetch.
|
|
499
|
+
test("observing an entity recovers a server row the local cache never had", async () => {
|
|
500
|
+
env = createTestEnv({ transport: "poll" });
|
|
501
|
+
env.signIn({ userId: "u1" });
|
|
502
|
+
// Client has rows for Note, but none for Domain.
|
|
503
|
+
env.server.seed("Note", [{ id: "n1", title: "x" }]);
|
|
504
|
+
await env.start();
|
|
505
|
+
await env.flush();
|
|
506
|
+
expect(env.engine.store.list("Domain")).toHaveLength(0);
|
|
507
|
+
|
|
508
|
+
// A Domain row exists server-side but was never delivered to this
|
|
509
|
+
// client (created on another surface / a missed insert). Inserted
|
|
510
|
+
// after start so the initial snapshot didn't include it; poll
|
|
511
|
+
// transport means no auto-delivery.
|
|
512
|
+
env.server.insert("Domain", { id: "d1", host: "chat.example.com" });
|
|
513
|
+
|
|
514
|
+
// Reconcile scoped to Note, which is all a no-arg sweep would cover here (local rows only),
|
|
515
|
+
// so it never touches Domain — the row stays invisible. The bug.
|
|
516
|
+
await env.engine.reconcile(["Note"]);
|
|
517
|
+
await env.flush();
|
|
518
|
+
expect(env.engine.store.get("Domain", "d1")).toBeNull();
|
|
519
|
+
|
|
520
|
+
// observeEntity (what useQuery now calls on mount) adds Domain to
|
|
521
|
+
// the sweep and fires a one-shot scoped reconcile — the row appears
|
|
522
|
+
// without a cache clear.
|
|
523
|
+
env.engine.observeEntity("Domain");
|
|
524
|
+
await env.flush();
|
|
525
|
+
expect(env.engine.store.get("Domain", "d1")).not.toBeNull();
|
|
526
|
+
});
|
|
527
|
+
|
|
455
528
|
// Row-revoked envelope: server pushes `row-revoked` to a
|
|
456
529
|
// subscriber whose read policy was revoked for a specific row.
|
|
457
530
|
// The engine must drop the row from the local replica and
|
package/src/test-harness/server.ts
CHANGED
|
@@ -91,6 +91,17 @@ export class TestServer {
|
|
|
91
91
|
/** When set, the next pull() returns this status instead of the normal response.
|
|
92
92
|
* Used to simulate 410 RESYNC_REQUIRED and similar transient errors. */
|
|
93
93
|
private nextPullStatus: number | null = null;
|
|
94
|
+
/** When true, every DELTA pull (since > 0) 410s, but a snapshot pull
|
|
95
|
+
* (since = 0) succeeds. Simulates a horizontally-scaled deployment
|
|
96
|
+
* where a client bounces between instances whose in-memory change
|
|
97
|
+
* logs diverge (no shared persistent log) — every cursor is "stale"
|
|
98
|
+
* on the instance it lands on. This is the condition that drove the
|
|
99
|
+
* 280GB egress storm; the test asserts the client backs off instead
|
|
100
|
+
* of re-snapshotting forever. */
|
|
101
|
+
force410OnDelta = false;
|
|
102
|
+
/** Count of snapshot pulls served (since = 0). The egress storm was a
|
|
103
|
+
* runaway count here; the regression test bounds it. */
|
|
104
|
+
snapshotPullCount = 0;
|
|
94
105
|
/** Captured outbound WS messages from clients — tests assert against
|
|
95
106
|
* this to verify `reactive-subscribe`, `crdt-subscribe`, etc., were
|
|
96
107
|
* actually sent over the wire. */
|
|
@@ -299,6 +310,7 @@ export class TestServer {
|
|
|
299
310
|
}> {
|
|
300
311
|
const auth = this.authContextFor(token);
|
|
301
312
|
if (this.beforePullHook) await this.beforePullHook(auth, since);
|
|
313
|
+
if (since === 0) this.snapshotPullCount += 1;
|
|
302
314
|
const visibleSet = (entity: string) => {
|
|
303
315
|
const filtered = this.visible(
|
|
304
316
|
entity,
|
package/src/test-harness/transport.ts
CHANGED
|
@@ -250,6 +250,14 @@ async function handle(
|
|
|
250
250
|
};
|
|
251
251
|
}
|
|
252
252
|
const since = Number(new URL(url, "http://test").searchParams.get("since") ?? "0");
|
|
253
|
+
// Cluster-divergence sim: a delta pull lands on an instance that
|
|
254
|
+
// can't serve our cursor → 410. A snapshot (since=0) still succeeds.
|
|
255
|
+
if (since > 0 && server.force410OnDelta) {
|
|
256
|
+
return {
|
|
257
|
+
status: 410,
|
|
258
|
+
body: { error: { code: "RESYNC_REQUIRED" } },
|
|
259
|
+
};
|
|
260
|
+
}
|
|
253
261
|
const resp = await server.pull(token, since);
|
|
254
262
|
return { status: 200, body: resp };
|
|
255
263
|
}
|