@powersync/service-module-postgres 0.0.0-dev-20251111070830 → 0.0.0-dev-20251111093449
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -5
- package/dist/replication/WalStream.d.ts +2 -5
- package/dist/replication/WalStream.js +64 -95
- package/dist/replication/WalStream.js.map +1 -1
- package/package.json +8 -8
- package/src/replication/WalStream.ts +71 -116
- package/test/src/wal_stream.test.ts +15 -7
- package/test/src/wal_stream_utils.ts +23 -4
- package/tsconfig.tsbuildinfo +1 -1
package/src/replication/WalStream.ts

@@ -4,7 +4,6 @@ import {
   DatabaseConnectionError,
   logger as defaultLogger,
   ErrorCode,
-  errors,
   Logger,
   ReplicationAbortedError,
   ReplicationAssertionError
@@ -100,8 +99,10 @@ export const sendKeepAlive = async (db: pgwire.PgClient) => {
 };

 export class MissingReplicationSlotError extends Error {
-  constructor(message: string) {
+  constructor(message: string, cause?: any) {
     super(message);
+
+    this.cause = cause;
   }
 }

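Note: the new `cause` parameter mirrors the standard ES2022 `Error.cause` property; assigning it manually presumably keeps the class independent of the ES2022 constructor option. A minimal sketch of how the wrapped error can be consumed (the logging helper is illustrative, not part of the package):

// Illustrative consumer of MissingReplicationSlotError and its cause.
function logSlotError(e: unknown) {
  if (e instanceof MissingReplicationSlotError) {
    // The original pgwire error is preserved on `cause` for diagnostics.
    console.error(`slot error: ${e.message}`, e.cause);
  }
}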
@@ -304,135 +305,54 @@ export class WalStream {
       })
     )[0];

+    // Previously we also used pg_catalog.pg_logical_slot_peek_binary_changes to confirm that we can query the slot.
+    // However, there were some edge cases where the query times out, repeating the query, ultimately
+    // causing high load on the source database and never recovering automatically.
+    // We now instead jump straight to replication if the wal_status is not "lost", detecting those
+    // errors during streaming replication instead, which is a little more robust.
+
+    // We can have:
+    // 1. needsInitialSync: true, lost slot -> MissingReplicationSlotError (starts new sync rules version).
+    //    Theoretically we could handle this the same as (2).
+    // 2. needsInitialSync: true, no slot -> create new slot
+    // 3. needsInitialSync: true, valid slot -> resume initial sync
+    // 4. needsInitialSync: false, lost slot -> MissingReplicationSlotError (starts new sync rules version)
+    // 5. needsInitialSync: false, no slot -> MissingReplicationSlotError (starts new sync rules version)
+    // 6. needsInitialSync: false, valid slot -> resume streaming replication
+    // The main advantages of MissingReplicationSlotError are:
+    // 1. If there was a complete snapshot already (cases 4/5), users can still sync from that snapshot while
+    //    we do the reprocessing under a new slot name.
+    // 2. If there was a partial snapshot (case 1), we can start with the new slot faster by not waiting for
+    //    the partial data to be cleared.
     if (slot != null) {
       // This checks that the slot is still valid
-
-
-
-
+
+      // wal_status is present in postgres 13+
+      // invalidation_reason is present in postgres 17+
+      const lost = slot.wal_status == 'lost';
+      if (lost) {
+        // Case 1 / 4
+        throw new MissingReplicationSlotError(
+          `Replication slot ${slotName} is not valid anymore. invalidation_reason: ${slot.invalidation_reason ?? 'unknown'}`
+        );
       }
-      //
-      // needsInitialSync: true, needsNewSlot: true -> initial sync from scratch
-      // needsInitialSync: true, needsNewSlot: false -> resume initial sync
-      // needsInitialSync: false, needsNewSlot: true -> handled above
-      // needsInitialSync: false, needsNewSlot: false -> resume streaming replication
+      // Case 3 / 6
       return {
         needsInitialSync: !snapshotDone,
-        needsNewSlot:
+        needsNewSlot: false
       };
     } else {
       if (snapshotDone) {
+        // Case 5
         // This will create a new slot, while keeping the current sync rules active
         throw new MissingReplicationSlotError(`Replication slot ${slotName} is missing`);
       }
-      //
+      // Case 2
+      // This will clear data (if any) and re-create the same slot
       return { needsInitialSync: true, needsNewSlot: true };
     }
   }

-  /**
-   * If a replication slot exists, check that it is healthy.
-   */
-  private async checkReplicationSlot(slot: {
-    // postgres 13+
-    wal_status?: string;
-    // postgres 17+
-    invalidation_reason?: string | null;
-  }): Promise<{ needsNewSlot: boolean }> {
-    // Start with a placeholder error, should be replaced if there is an actual issue.
-    let last_error = new ReplicationAssertionError(`Slot health check failed to execute`);
-
-    const slotName = this.slot_name;
-
-    const lost = slot.wal_status == 'lost';
-    if (lost) {
-      this.logger.warn(
-        `Replication slot ${slotName} is invalidated. invalidation_reason: ${slot.invalidation_reason ?? 'unknown'}`
-      );
-      return {
-        needsNewSlot: true
-      };
-    }
-
-    // Check that replication slot exists, trying for up to 2 minutes.
-    const startAt = performance.now();
-    while (performance.now() - startAt < 120_000) {
-      this.touch();
-
-      try {
-        // We peek a large number of changes here, to make it more likely to pick up replication slot errors.
-        // For example, "publication does not exist" only occurs here if the peek actually includes changes related
-        // to the slot.
-        this.logger.info(`Checking ${slotName}`);
-
-        // The actual results can be quite large, so we don't actually return everything
-        // due to memory and processing overhead that would create.
-        const cursor = await this.connections.pool.stream({
-          statement: `SELECT 1 FROM pg_catalog.pg_logical_slot_peek_binary_changes($1, NULL, 1000, 'proto_version', '1', 'publication_names', $2)`,
-          params: [
-            { type: 'varchar', value: slotName },
-            { type: 'varchar', value: PUBLICATION_NAME }
-          ]
-        });
-
-        for await (let _chunk of cursor) {
-          // No-op, just exhaust the cursor
-        }
-
-        // Success
-        this.logger.info(`Slot ${slotName} appears healthy`);
-        return { needsNewSlot: false };
-      } catch (e) {
-        last_error = e;
-        this.logger.warn(`Replication slot error`, e);
-
-        if (this.stopped) {
-          throw e;
-        }
-
-        if (
-          /incorrect prev-link/.test(e.message) ||
-          /replication slot.*does not exist/.test(e.message) ||
-          /publication.*does not exist/.test(e.message) ||
-          // Postgres 18 - exceeded max_slot_wal_keep_size
-          /can no longer access replication slot/.test(e.message) ||
-          // Postgres 17 - exceeded max_slot_wal_keep_size
-          /can no longer get changes from replication slot/.test(e.message)
-        ) {
-          // Fatal error. In most cases since Postgres 13+, the `wal_status == 'lost'` check should pick this up, but this
-          // works as a fallback.
-
-          container.reporter.captureException(e, {
-            level: errors.ErrorSeverity.WARNING,
-            metadata: {
-              replication_slot: slotName
-            }
-          });
-          // Sample: record with incorrect prev-link 10000/10000 at 0/18AB778
-          // Seen during development. Some internal error, fixed by re-creating slot.
-          //
-          // Sample: publication "powersync" does not exist
-          // Happens when publication deleted or never created.
-          // Slot must be re-created in this case.
-          this.logger.info(`${slotName} is not valid anymore`);
-
-          return { needsNewSlot: true };
-        }
-        // Try again after a pause
-        await new Promise((resolve) => setTimeout(resolve, 1000));
-      }
-    }
-
-    container.reporter.captureException(last_error, {
-      level: errors.ErrorSeverity.ERROR,
-      metadata: {
-        replication_slot: slotName
-      }
-    });
-
-    throw last_error;
-  }
-
   async estimatedCountNumber(db: pgwire.PgConnection, table: storage.SourceTable): Promise<number> {
     const results = await db.query({
       statement: `SELECT reltuples::bigint AS estimate
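Note: the six numbered cases in the comments above collapse into a small decision table. A minimal sketch of that logic with simplified types (`SlotInfo` and `decide` are illustrative names, not the package's actual API):

// Sketch of the slot/snapshot decision table described in the comments above.
interface SlotInfo {
  wal_status?: string; // postgres 13+
  invalidation_reason?: string | null; // postgres 17+
}

function decide(slot: SlotInfo | null, snapshotDone: boolean) {
  if (slot != null) {
    if (slot.wal_status == 'lost') {
      // Cases 1 / 4: invalidated slot -> reprocess under a new slot name.
      throw new MissingReplicationSlotError(
        `slot lost: ${slot.invalidation_reason ?? 'unknown'}`
      );
    }
    // Cases 3 / 6: valid slot -> resume initial sync or streaming.
    return { needsInitialSync: !snapshotDone, needsNewSlot: false };
  }
  if (snapshotDone) {
    // Case 5: slot missing after a complete snapshot -> new sync rules version.
    throw new MissingReplicationSlotError('slot missing');
  }
  // Case 2: no slot, no snapshot -> create the slot and start initial sync.
  return { needsInitialSync: true, needsNewSlot: true };
}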
@@ -915,6 +835,17 @@ WHERE oid = $1::regclass`,
   }

   async streamChanges(replicationConnection: pgwire.PgConnection) {
+    try {
+      await this.streamChangesInternal(replicationConnection);
+    } catch (e) {
+      if (isReplicationSlotInvalidError(e)) {
+        throw new MissingReplicationSlotError(e.message, e);
+      }
+      throw e;
+    }
+  }
+
+  private async streamChangesInternal(replicationConnection: pgwire.PgConnection) {
     // When changing any logic here, check /docs/wal-lsns.md.
     const { createEmptyCheckpoints } = await this.ensureStorageCompatibility();

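Note: wrapping `streamChangesInternal` means callers only need to handle one error type for an invalidated slot. A hypothetical sketch of the caller side (the test comments further down state that WalStreamReplicationJob responds by creating a new replication slot; `recreateSlot` is an invented placeholder, not the job's actual code):

// Hypothetical caller-side handling (not the actual WalStreamReplicationJob code).
async function runStream(
  stream: WalStream,
  connection: pgwire.PgConnection,
  recreateSlot: () => Promise<void> // invented placeholder for the recovery path
) {
  try {
    await stream.streamChanges(connection);
  } catch (e) {
    if (e instanceof MissingReplicationSlotError) {
      // Per the test comments below, the service reacts by creating a new
      // replication slot under a new sync rules version.
      await recreateSlot();
      return;
    }
    throw e;
  }
}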
@@ -1179,3 +1110,27 @@ WHERE oid = $1::regclass`,
     });
   }
 }
+
+function isReplicationSlotInvalidError(e: any) {
+  // We could access the error code from pgwire using this:
+  //   e[Symbol.for('pg.ErrorCode')]
+  // However, we typically get a generic code such as 42704 (undefined_object), which does not
+  // help much. So we check the actual error message.
+  const message = e.message ?? '';
+
+  // Sample: record with incorrect prev-link 10000/10000 at 0/18AB778
+  // Seen during development. Some internal error, fixed by re-creating slot.
+  //
+  // Sample: publication "powersync" does not exist
+  // Happens when publication deleted or never created.
+  // Slot must be re-created in this case.
+  return (
+    /incorrect prev-link/.test(message) ||
+    /replication slot.*does not exist/.test(message) ||
+    /publication.*does not exist/.test(message) ||
+    // Postgres 18 - exceeded max_slot_wal_keep_size
+    /can no longer access replication slot/.test(message) ||
+    // Postgres 17 - exceeded max_slot_wal_keep_size
+    /can no longer get changes from replication slot/.test(message)
+  );
+}
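Note: the matcher keys off Postgres error message text rather than SQLSTATE codes, as the comments explain. A quick illustrative check against the sample messages quoted above (slot and publication names are examples only):

// Illustrative: each sample message should be classified as a fatal slot error.
const samples = [
  'record with incorrect prev-link 10000/10000 at 0/18AB778',
  'publication "powersync" does not exist',
  'replication slot "powersync_1" does not exist', // example slot name
  'can no longer get changes from replication slot "powersync_1"'
];
for (const message of samples) {
  console.assert(isReplicationSlotInvalidError(new Error(message)));
}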
package/test/src/wal_stream.test.ts

@@ -295,7 +295,7 @@ bucket_definitions:
       `INSERT INTO test_data(id, description) VALUES('8133cd37-903b-4937-a022-7c8294015a3a', 'test1') returning id as test_id`
     );
     await context.replicateSnapshot();
-
+    context.startStreaming();

     const data = await context.getBucketData('global[]');

@@ -320,17 +320,25 @@ bucket_definitions:

     await context.loadActiveSyncRules();

+    // Previously, the `replicateSnapshot` call picked up on this error.
+    // Now that we have removed that check, this only comes up when we start actually streaming.
+    // We don't get the streaming response directly here, but getCheckpoint() checks for that.
+    await context.replicateSnapshot();
+    context.startStreaming();
+
     if (serverVersion!.compareMain('18.0.0') >= 0) {
-      await context.replicateSnapshot();
       // No error expected in Postgres 18. Replication keeps on working despite the
       // publication being re-created.
+      await context.getCheckpoint();
     } else {
+      // await context.getCheckpoint();
       // Postgres < 18 invalidates the replication slot when the publication is re-created.
-      //
+      // In the service, this error is handled in WalStreamReplicationJob,
       // creating a new replication slot.
       await expect(async () => {
-        await context.
+        await context.getCheckpoint();
       }).rejects.toThrowError(MissingReplicationSlotError);
+      context.clearStreamError();
     }
   }
 });
@@ -352,7 +360,7 @@ bucket_definitions:
       `INSERT INTO test_data(id, description) VALUES('8133cd37-903b-4937-a022-7c8294015a3a', 'test1') returning id as test_id`
     );
     await context.replicateSnapshot();
-
+    context.startStreaming();

     const data = await context.getBucketData('global[]');

@@ -415,7 +423,7 @@ bucket_definitions:
       `INSERT INTO test_data(id, description) VALUES('8133cd37-903b-4937-a022-7c8294015a3a', 'test1') returning id as test_id`
     );
     await context.replicateSnapshot();
-
+    context.startStreaming();

     const data = await context.getBucketData('global[]');

@@ -572,7 +580,7 @@ config:
     );

     await context.replicateSnapshot();
-
+    context.startStreaming();

     await pool.query(`UPDATE test_data SET description = 'test2' WHERE id = '${test_id}'`);

package/test/src/wal_stream_utils.ts

@@ -55,12 +55,31 @@ export class WalStreamTestContext implements AsyncDisposable {
     await this.dispose();
   }

+  /**
+   * Clear any errors from startStream, to allow for a graceful dispose when streaming errors
+   * were expected.
+   */
+  async clearStreamError() {
+    if (this.streamPromise != null) {
+      this.streamPromise = this.streamPromise.catch((e) => {});
+    }
+  }
+
   async dispose() {
     this.abortController.abort();
-    await this.snapshotPromise;
-    await this.streamPromise;
-    await this.connectionManager.destroy();
-    await this.factory?.[Symbol.asyncDispose]();
+    try {
+      await this.snapshotPromise;
+      await this.streamPromise;
+      await this.connectionManager.destroy();
+      await this.factory?.[Symbol.asyncDispose]();
+    } catch (e) {
+      // Throwing here may result in SuppressedError. The underlying errors often don't show up
+      // in the test output, so we log it here.
+      // If we could get vitest to log SuppressedError.error and SuppressedError.suppressed, we
+      // could remove this.
+      console.error('Error during WalStreamTestContext dispose', e);
+      throw e;
+    }
   }

   get pool() {