@powersync/service-module-postgres 0.0.0-dev-20250507154604 → 0.0.0-dev-20250611110033

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (38)
  1. package/CHANGELOG.md +47 -8
  2. package/dist/api/PostgresRouteAPIAdapter.d.ts +1 -1
  3. package/dist/api/PostgresRouteAPIAdapter.js +5 -1
  4. package/dist/api/PostgresRouteAPIAdapter.js.map +1 -1
  5. package/dist/auth/SupabaseKeyCollector.d.ts +3 -10
  6. package/dist/auth/SupabaseKeyCollector.js +6 -4
  7. package/dist/auth/SupabaseKeyCollector.js.map +1 -1
  8. package/dist/replication/SnapshotQuery.d.ts +75 -0
  9. package/dist/replication/SnapshotQuery.js +172 -0
  10. package/dist/replication/SnapshotQuery.js.map +1 -0
  11. package/dist/replication/WalStream.d.ts +37 -4
  12. package/dist/replication/WalStream.js +284 -88
  13. package/dist/replication/WalStream.js.map +1 -1
  14. package/dist/replication/WalStreamReplicationJob.d.ts +2 -0
  15. package/dist/replication/WalStreamReplicationJob.js +10 -3
  16. package/dist/replication/WalStreamReplicationJob.js.map +1 -1
  17. package/dist/replication/WalStreamReplicator.d.ts +1 -0
  18. package/dist/replication/WalStreamReplicator.js +22 -0
  19. package/dist/replication/WalStreamReplicator.js.map +1 -1
  20. package/package.json +12 -12
  21. package/src/api/PostgresRouteAPIAdapter.ts +5 -1
  22. package/src/auth/SupabaseKeyCollector.ts +14 -5
  23. package/src/replication/SnapshotQuery.ts +206 -0
  24. package/src/replication/WalStream.ts +338 -95
  25. package/src/replication/WalStreamReplicationJob.ts +11 -3
  26. package/src/replication/WalStreamReplicator.ts +26 -0
  27. package/test/src/__snapshots__/schema_changes.test.ts.snap +2 -2
  28. package/test/src/checkpoints.test.ts +10 -3
  29. package/test/src/chunked_snapshots.test.ts +156 -0
  30. package/test/src/large_batch.test.ts +5 -154
  31. package/test/src/resuming_snapshots.test.ts +150 -0
  32. package/test/src/schema_changes.test.ts +5 -10
  33. package/test/src/slow_tests.test.ts +13 -30
  34. package/test/src/util.ts +12 -1
  35. package/test/src/validation.test.ts +0 -1
  36. package/test/src/wal_stream.test.ts +4 -9
  37. package/test/src/wal_stream_utils.ts +15 -7
  38. package/tsconfig.tsbuildinfo +1 -1
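
The headline change is chunked, resumable initial replication: `WalStream` now snapshots tables through the new `SnapshotQuery` classes (`SimpleSnapshotQuery`, `ChunkedSnapshotQuery`, `IdSnapshotQuery`) and persists per-table progress (`replicatedCount`, `totalEstimatedCount`, `lastKey`) via `batch.updateTableProgress()`. In the actual code this runs over a server-side cursor (DECLARE CURSOR / FETCH, per the comments in the diff below); the sketch here shows just the keyset-pagination idea behind `ChunkedSnapshotQuery` using plain LIMIT queries. The function and parameter names are illustrative, not the package's API.

```ts
// A minimal sketch, not the package's actual SnapshotQuery API.
// Assumes a table with a single primary key column, as required by the
// ChunkedSnapshotQuery.supports() check in the diff below, and that the
// identifiers are already escaped (the real code uses table.escapedIdentifier).
type Row = Record<string, unknown>;

async function* chunkedSnapshot(
  query: (sql: string, params: unknown[]) => Promise<Row[]>,
  table: string,
  pkColumn: string,
  chunkLength: number,
  resumeAfter: unknown = null // e.g. the lastKey persisted via batch.updateTableProgress()
): AsyncGenerator<Row[]> {
  let lastKey = resumeAfter;
  while (true) {
    // Keyset pagination: only rows strictly after the last replicated key,
    // so an interrupted snapshot can resume instead of restarting.
    const rows =
      lastKey == null
        ? await query(`SELECT * FROM ${table} ORDER BY ${pkColumn} LIMIT $1`, [chunkLength])
        : await query(
            `SELECT * FROM ${table} WHERE ${pkColumn} > $1 ORDER BY ${pkColumn} LIMIT $2`,
            [lastKey, chunkLength]
          );
    if (rows.length == 0) {
      break;
    }
    lastKey = rows[rows.length - 1][pkColumn];
    // The caller saves the rows, flushes, and only then persists lastKey as
    // the resume point, matching the "flush before marking progress" comment
    // in the diff below.
    yield rows;
  }
}
```
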
@@ -1,12 +1,13 @@
  import * as lib_postgres from '@powersync/lib-service-postgres';
- import { container, DatabaseConnectionError, ErrorCode, errors, logger, ReplicationAbortedError, ReplicationAssertionError } from '@powersync/lib-services-framework';
- import { getUuidReplicaIdentityBson, storage } from '@powersync/service-core';
+ import { container, DatabaseConnectionError, ErrorCode, errors, logger as defaultLogger, ReplicationAssertionError, ReplicationAbortedError } from '@powersync/lib-services-framework';
+ import { getUuidReplicaIdentityBson, RelationCache, storage } from '@powersync/service-core';
  import * as pgwire from '@powersync/service-jpgwire';
  import { toSyncRulesRow } from '@powersync/service-sync-rules';
  import * as pg_utils from '../utils/pgwire_utils.js';
  import { getPgOutputRelation, getRelId } from './PgRelation.js';
  import { checkSourceConfiguration, getReplicationIdentityColumns } from './replication-utils.js';
  import { ReplicationMetric } from '@powersync/service-types';
+ import { ChunkedSnapshotQuery, IdSnapshotQuery, SimpleSnapshotQuery } from './SnapshotQuery.js';
  export const ZERO_LSN = '00000000/00000000';
  export const PUBLICATION_NAME = 'powersync';
  export const POSTGRES_DEFAULT_SCHEMA = 'public';
@@ -39,20 +40,39 @@ export class WalStream {
  sync_rules;
  group_id;
  connection_id = 1;
+ logger;
  storage;
  metrics;
  slot_name;
  connections;
  abort_signal;
- relation_cache = new Map();
+ relationCache = new RelationCache((relation) => {
+ if (typeof relation == 'number') {
+ return relation;
+ }
+ return relation.objectId;
+ });
  startedStreaming = false;
+ snapshotChunkLength;
+ /**
+ * Time of the oldest uncommitted change, according to the source db.
+ * This is used to determine the replication lag.
+ */
+ oldestUncommittedChange = null;
+ /**
+ * Keep track of whether we have done a commit or keepalive yet.
+ * We can only compute replication lag if isStartingReplication == false, or oldestUncommittedChange is present.
+ */
+ isStartingReplication = true;
  constructor(options) {
+ this.logger = options.logger ?? defaultLogger;
  this.storage = options.storage;
  this.metrics = options.metrics;
  this.sync_rules = options.storage.getParsedSyncRules({ defaultSchema: POSTGRES_DEFAULT_SCHEMA });
  this.group_id = options.storage.group_id;
  this.slot_name = options.storage.slot_name;
  this.connections = options.connections;
+ this.snapshotChunkLength = options.snapshotChunkLength ?? 10_000;
  this.abort_signal = options.abort_signal;
  this.abort_signal.addEventListener('abort', () => {
  if (this.startedStreaming) {
@@ -62,7 +82,7 @@ export class WalStream {
  const promise = sendKeepAlive(this.connections.pool);
  promise.catch((e) => {
  // Failures here are okay - this only speeds up stopping the process.
- logger.warn('Failed to ping connection', e);
+ this.logger.warn('Failed to ping connection', e);
  });
  }
  else {
@@ -130,7 +150,7 @@ export class WalStream {
  ]
  });
  if (rs.rows.length == 0) {
- logger.info(`Skipping ${tablePattern.schema}.${name} - not part of ${PUBLICATION_NAME} publication`);
+ this.logger.info(`Skipping ${tablePattern.schema}.${name} - not part of ${PUBLICATION_NAME} publication`);
  continue;
  }
  const cresult = await getReplicationIdentityColumns(db, relid);
@@ -152,7 +172,7 @@ export class WalStream {
  const snapshotDone = status.snapshot_done && status.checkpoint_lsn != null;
  if (snapshotDone) {
  // Snapshot is done, but we still need to check the replication slot status
- logger.info(`${slotName} Initial replication already done`);
+ this.logger.info(`Initial replication already done`);
  }
  // Check if replication slot exists
  const rs = await this.connections.pool.query({
@@ -208,7 +228,7 @@ export class WalStream {
  // We peek a large number of changes here, to make it more likely to pick up replication slot errors.
  // For example, "publication does not exist" only occurs here if the peek actually includes changes related
  // to the slot.
- logger.info(`Checking ${slotName}`);
+ this.logger.info(`Checking ${slotName}`);
  // The actual results can be quite large, so we don't actually return everything
  // due to memory and processing overhead that would create.
  const cursor = await this.connections.pool.stream({
@@ -222,12 +242,12 @@ export class WalStream {
  // No-op, just exhaust the cursor
  }
  // Success
- logger.info(`Slot ${slotName} appears healthy`);
+ this.logger.info(`Slot ${slotName} appears healthy`);
  return { needsNewSlot: false };
  }
  catch (e) {
  last_error = e;
- logger.warn(`${slotName} Replication slot error`, e);
+ this.logger.warn(`Replication slot error`, e);
  if (this.stopped) {
  throw e;
  }
@@ -249,7 +269,7 @@ export class WalStream {
  // Sample: publication "powersync" does not exist
  // Happens when publication deleted or never created.
  // Slot must be re-created in this case.
- logger.info(`${slotName} is not valid anymore`);
+ this.logger.info(`${slotName} is not valid anymore`);
  return { needsNewSlot: true };
  }
  // Try again after a pause
@@ -258,7 +278,7 @@ export class WalStream {
  }
  throw new ReplicationAssertionError('Unreachable');
  }
- async estimatedCount(db, table) {
+ async estimatedCountNumber(db, table) {
  const results = await db.query({
  statement: `SELECT reltuples::bigint AS estimate
  FROM pg_class
@@ -267,10 +287,10 @@ WHERE oid = $1::regclass`,
  });
  const row = results.rows[0];
  if ((row?.[0] ?? -1n) == -1n) {
- return '?';
+ return -1;
  }
  else {
- return `~${row[0]}`;
+ return Number(row[0]);
  }
  }
  /**
@@ -298,27 +318,39 @@ WHERE oid = $1::regclass`,
  // We use the replication connection here, not a pool.
  // The replication slot must be created before we start snapshotting tables.
  await replicationConnection.query(`CREATE_REPLICATION_SLOT ${slotName} LOGICAL pgoutput`);
- logger.info(`Created replication slot ${slotName}`);
+ this.logger.info(`Created replication slot ${slotName}`);
  }
  await this.initialReplication(db);
  }
  async initialReplication(db) {
  const sourceTables = this.sync_rules.getSourceTables();
- await this.storage.startBatch({ zeroLSN: ZERO_LSN, defaultSchema: POSTGRES_DEFAULT_SCHEMA, storeCurrentData: true, skipExistingRows: true }, async (batch) => {
+ await this.storage.startBatch({
+ logger: this.logger,
+ zeroLSN: ZERO_LSN,
+ defaultSchema: POSTGRES_DEFAULT_SCHEMA,
+ storeCurrentData: true,
+ skipExistingRows: true
+ }, async (batch) => {
+ let tablesWithStatus = [];
  for (let tablePattern of sourceTables) {
  const tables = await this.getQualifiedTableNames(batch, db, tablePattern);
+ // Pre-get counts
  for (let table of tables) {
  if (table.snapshotComplete) {
- logger.info(`${this.slot_name} Skipping ${table.qualifiedName} - snapshot already done`);
+ this.logger.info(`Skipping ${table.qualifiedName} - snapshot already done`);
  continue;
  }
- await this.snapshotTable(batch, db, table);
- const rs = await db.query(`select pg_current_wal_lsn() as lsn`);
- const tableLsnNotBefore = rs.rows[0][0];
- await batch.markSnapshotDone([table], tableLsnNotBefore);
- await touch();
+ const count = await this.estimatedCountNumber(db, table);
+ table = await batch.updateTableProgress(table, { totalEstimatedCount: count });
+ this.relationCache.update(table);
+ tablesWithStatus.push(table);
+ this.logger.info(`To replicate: ${table.qualifiedName} ${table.formatSnapshotProgress()}`);
  }
  }
+ for (let table of tablesWithStatus) {
+ await this.snapshotTableInTx(batch, db, table);
+ await touch();
+ }
  // Always commit the initial snapshot at zero.
  // This makes sure we don't skip any changes applied before starting this snapshot,
  // in the case of snapshot retries.
@@ -340,53 +372,130 @@ WHERE oid = $1::regclass`,
  yield toSyncRulesRow(row);
  }
  }
- async snapshotTable(batch, db, table) {
- logger.info(`${this.slot_name} Replicating ${table.qualifiedName}`);
- const estimatedCount = await this.estimatedCount(db, table);
- let at = 0;
- let lastLogIndex = 0;
- const cursor = db.stream({ statement: `SELECT * FROM ${table.escapedIdentifier}` });
+ async snapshotTableInTx(batch, db, table, limited) {
+ await db.query('BEGIN');
+ try {
+ let tableLsnNotBefore;
+ await this.snapshotTable(batch, db, table, limited);
+ // Get the current LSN.
+ // The data will only be consistent once incremental replication
+ // has passed that point.
+ // We have to get this LSN _after_ we have started the snapshot query.
+ const rs = await db.query(`select pg_current_wal_lsn() as lsn`);
+ tableLsnNotBefore = rs.rows[0][0];
+ await db.query('COMMIT');
+ const [resultTable] = await batch.markSnapshotDone([table], tableLsnNotBefore);
+ this.relationCache.update(resultTable);
+ return resultTable;
+ }
+ catch (e) {
+ await db.query('ROLLBACK');
+ throw e;
+ }
+ }
+ async snapshotTable(batch, db, table, limited) {
+ let totalEstimatedCount = table.snapshotStatus?.totalEstimatedCount;
+ let at = table.snapshotStatus?.replicatedCount ?? 0;
+ let lastCountTime = 0;
+ let q;
+ // We do streaming on two levels:
+ // 1. Coarse level: DECLARE CURSOR, FETCH 10000 at a time.
+ // 2. Fine level: Stream chunks from each fetch call.
+ if (limited) {
+ q = new IdSnapshotQuery(db, table, limited);
+ }
+ else if (ChunkedSnapshotQuery.supports(table)) {
+ // Single primary key - we can use the primary key for chunking
+ const orderByKey = table.replicaIdColumns[0];
+ q = new ChunkedSnapshotQuery(db, table, this.snapshotChunkLength, table.snapshotStatus?.lastKey ?? null);
+ if (table.snapshotStatus?.lastKey != null) {
+ this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()} - resuming from ${orderByKey.name} > ${q.lastKey}`);
+ }
+ else {
+ this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()} - resumable`);
+ }
+ }
+ else {
+ // Fallback case - query the entire table
+ this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()} - not resumable`);
+ q = new SimpleSnapshotQuery(db, table, this.snapshotChunkLength);
+ at = 0;
+ }
+ await q.initialize();
  let columns = [];
- // pgwire streams rows in chunks.
- // These chunks can be quite small (as little as 16KB), so we don't flush chunks automatically.
- for await (let chunk of cursor) {
- if (chunk.tag == 'RowDescription') {
- let i = 0;
- columns = chunk.payload.map((c) => {
- return { i: i++, name: c.name };
+ let hasRemainingData = true;
+ while (hasRemainingData) {
+ // Fetch 10k at a time.
+ // The balance here is between latency overhead per FETCH call,
+ // and not spending too much time on each FETCH call.
+ // We aim for a couple of seconds on each FETCH call.
+ const cursor = q.nextChunk();
+ hasRemainingData = false;
+ // pgwire streams rows in chunks.
+ // These chunks can be quite small (as little as 16KB), so we don't flush chunks automatically.
+ // There are typically 100-200 rows per chunk.
+ for await (let chunk of cursor) {
+ if (chunk.tag == 'RowDescription') {
+ // We get a RowDescription for each FETCH call, but they should
+ // all be the same.
+ let i = 0;
+ columns = chunk.payload.map((c) => {
+ return { i: i++, name: c.name };
+ });
+ continue;
+ }
+ const rows = chunk.rows.map((row) => {
+ let q = {};
+ for (let c of columns) {
+ q[c.name] = row[c.i];
+ }
+ return q;
  });
- continue;
+ if (rows.length > 0) {
+ hasRemainingData = true;
+ }
+ for (const record of WalStream.getQueryData(rows)) {
+ // This auto-flushes when the batch reaches its size limit
+ await batch.save({
+ tag: storage.SaveOperationTag.INSERT,
+ sourceTable: table,
+ before: undefined,
+ beforeReplicaId: undefined,
+ after: record,
+ afterReplicaId: getUuidReplicaIdentityBson(record, table.replicaIdColumns)
+ });
+ }
+ at += rows.length;
+ this.metrics.getCounter(ReplicationMetric.ROWS_REPLICATED).add(rows.length);
+ await touch();
  }
- const rows = chunk.rows.map((row) => {
- let q = {};
- for (let c of columns) {
- q[c.name] = row[c.i];
+ // Important: flush before marking progress
+ await batch.flush();
+ if (limited == null) {
+ let lastKey;
+ if (q instanceof ChunkedSnapshotQuery) {
+ lastKey = q.getLastKeySerialized();
  }
- return q;
- });
- if (rows.length > 0 && at - lastLogIndex >= 5000) {
- logger.info(`${this.slot_name} Replicating ${table.qualifiedName} ${at}/${estimatedCount}`);
- lastLogIndex = at;
+ if (lastCountTime < performance.now() - 10 * 60 * 1000) {
+ totalEstimatedCount = await this.estimatedCountNumber(db, table);
+ lastCountTime = performance.now();
+ }
+ table = await batch.updateTableProgress(table, {
+ lastKey: lastKey,
+ replicatedCount: at,
+ totalEstimatedCount: totalEstimatedCount
+ });
+ this.relationCache.update(table);
+ this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()}`);
  }
- if (this.abort_signal.aborted) {
- throw new ReplicationAbortedError(`Aborted initial replication of ${this.slot_name}`);
+ else {
+ this.logger.info(`Replicating ${table.qualifiedName} ${at}/${limited.length} for resnapshot`);
  }
- for (const record of WalStream.getQueryData(rows)) {
- // This auto-flushes when the batch reaches its size limit
- await batch.save({
- tag: storage.SaveOperationTag.INSERT,
- sourceTable: table,
- before: undefined,
- beforeReplicaId: undefined,
- after: record,
- afterReplicaId: getUuidReplicaIdentityBson(record, table.replicaIdColumns)
- });
+ if (this.abort_signal.aborted) {
+ // We only abort after flushing
+ throw new ReplicationAbortedError(`Initial replication interrupted`);
  }
- at += rows.length;
- this.metrics.getCounter(ReplicationMetric.ROWS_REPLICATED).add(rows.length);
- await touch();
  }
- await batch.flush();
  }
  async handleRelation(batch, descriptor, snapshot) {
  if (!descriptor.objectId && typeof descriptor.objectId != 'number') {
@@ -399,7 +508,7 @@ WHERE oid = $1::regclass`,
  entity_descriptor: descriptor,
  sync_rules: this.sync_rules
  });
- this.relation_cache.set(descriptor.objectId, result.table);
+ this.relationCache.update(result.table);
  // Drop conflicting tables. This includes for example renamed tables.
  await batch.drop(result.dropTables);
  // Snapshot if:
@@ -410,38 +519,53 @@ WHERE oid = $1::regclass`,
  if (shouldSnapshot) {
  // Truncate this table, in case a previous snapshot was interrupted.
  await batch.truncate([result.table]);
- let lsn = ZERO_LSN;
  // Start the snapshot inside a transaction.
  // We use a dedicated connection for this.
  const db = await this.connections.snapshotConnection();
  try {
- await db.query('BEGIN');
- try {
- await this.snapshotTable(batch, db, result.table);
- // Get the current LSN.
- // The data will only be consistent once incremental replication
- // has passed that point.
- // We have to get this LSN _after_ we have started the snapshot query.
- const rs = await db.query(`select pg_current_wal_lsn() as lsn`);
- lsn = rs.rows[0][0];
- await db.query('COMMIT');
- }
- catch (e) {
- await db.query('ROLLBACK');
- // TODO: Wrap with custom error type
- throw e;
- }
+ const table = await this.snapshotTableInTx(batch, db, result.table);
+ // After the table snapshot, we wait for replication to catch up.
+ // To make sure there is actually something to replicate, we send a keepalive
+ // message.
+ await sendKeepAlive(db);
+ return table;
  }
  finally {
  await db.end();
  }
- const [table] = await batch.markSnapshotDone([result.table], lsn);
- return table;
  }
  return result.table;
  }
+ /**
+ * Process rows that have missing TOAST values.
+ *
+ * This can happen during edge cases in the chunked initial snapshot process.
+ *
+ * We handle this similarly to an inline table snapshot, but limited to the specific
+ * set of rows.
+ */
+ async resnapshot(batch, rows) {
+ const byTable = new Map();
+ for (let row of rows) {
+ const relId = row.table.objectId; // always a number for postgres
+ if (!byTable.has(relId)) {
+ byTable.set(relId, []);
+ }
+ byTable.get(relId).push(row);
+ }
+ const db = await this.connections.snapshotConnection();
+ try {
+ for (let rows of byTable.values()) {
+ const table = rows[0].table;
+ await this.snapshotTableInTx(batch, db, table, rows.map((r) => r.key));
+ }
+ }
+ finally {
+ await db.end();
+ }
+ }
  getTable(relationId) {
- const table = this.relation_cache.get(relationId);
+ const table = this.relationCache.get(relationId);
  if (table == null) {
  // We should always receive a replication message before the relation is used.
  // If we can't find it, it's a bug.
@@ -456,7 +580,7 @@ WHERE oid = $1::regclass`,
  if (msg.tag == 'insert' || msg.tag == 'update' || msg.tag == 'delete') {
  const table = this.getTable(getRelId(msg.relation));
  if (!table.syncAny) {
- logger.debug(`Table ${table.qualifiedName} not used in sync rules - skipping`);
+ this.logger.debug(`Table ${table.qualifiedName} not used in sync rules - skipping`);
  return null;
  }
  if (msg.tag == 'insert') {
@@ -556,7 +680,36 @@ WHERE oid = $1::regclass`,
  this.startedStreaming = true;
  // Auto-activate as soon as initial replication is done
  await this.storage.autoActivate();
- await this.storage.startBatch({ zeroLSN: ZERO_LSN, defaultSchema: POSTGRES_DEFAULT_SCHEMA, storeCurrentData: true, skipExistingRows: false }, async (batch) => {
+ let resnapshot = [];
+ const markRecordUnavailable = (record) => {
+ if (!IdSnapshotQuery.supports(record.sourceTable)) {
+ // If it's not supported, it's also safe to ignore
+ return;
+ }
+ let key = {};
+ for (let column of record.sourceTable.replicaIdColumns) {
+ const name = column.name;
+ const value = record.after[name];
+ if (value == null) {
+ // We don't expect this to actually happen.
+ // The key should always be present in the "after" record.
+ return;
+ }
+ key[name] = value;
+ }
+ resnapshot.push({
+ table: record.sourceTable,
+ key: key
+ });
+ };
+ await this.storage.startBatch({
+ logger: this.logger,
+ zeroLSN: ZERO_LSN,
+ defaultSchema: POSTGRES_DEFAULT_SCHEMA,
+ storeCurrentData: true,
+ skipExistingRows: false,
+ markRecordUnavailable
+ }, async (batch) => {
  // We don't handle any plain keepalive messages while we have transactions.
  // While we have transactions, we use that to advance the position.
  // Replication never starts in the middle of a transaction, so this starts as false.
@@ -585,6 +738,9 @@ WHERE oid = $1::regclass`,
  else if (msg.tag == 'begin') {
  // This may span multiple transactions in the same chunk, or even across chunks.
  skipKeepalive = true;
+ if (this.oldestUncommittedChange == null) {
+ this.oldestUncommittedChange = new Date(Number(msg.commitTime / 1000n));
+ }
  }
  else if (msg.tag == 'commit') {
  this.metrics.getCounter(ReplicationMetric.TRANSACTIONS_REPLICATED).add(1);
@@ -593,13 +749,30 @@ WHERE oid = $1::regclass`,
  // This effectively lets us batch multiple transactions within the same chunk
  // into a single flush, increasing throughput for many small transactions.
  skipKeepalive = false;
- await batch.commit(msg.lsn, { createEmptyCheckpoints });
+ // flush() must be before the resnapshot check - that is
+ // typically what reports the resnapshot records.
+ await batch.flush();
+ // This _must_ be checked after the flush(), and before
+ // commit() or ack(). We never persist the resnapshot list,
+ // so we have to process it before marking our progress.
+ if (resnapshot.length > 0) {
+ await this.resnapshot(batch, resnapshot);
+ resnapshot = [];
+ }
+ const didCommit = await batch.commit(msg.lsn, {
+ createEmptyCheckpoints,
+ oldestUncommittedChange: this.oldestUncommittedChange
+ });
  await this.ack(msg.lsn, replicationStream);
+ if (didCommit) {
+ this.oldestUncommittedChange = null;
+ this.isStartingReplication = false;
+ }
  }
  }
  else {
  if (count % 100 == 0) {
- logger.info(`${this.slot_name} replicating op ${count} ${msg.lsn}`);
+ this.logger.info(`Replicating op ${count} ${msg.lsn}`);
  }
  /**
  * If we can see the contents of logical messages, then we can check if a keepalive
@@ -611,7 +784,14 @@ WHERE oid = $1::regclass`,
  keepAliveDetected = true;
  }
  count += 1;
- await this.writeChange(batch, msg);
+ const flushResult = await this.writeChange(batch, msg);
+ if (flushResult != null && resnapshot.length > 0) {
+ // If we have large transactions, we also need to flush the resnapshot list
+ // periodically.
+ // TODO: make sure this bit is actually triggered
+ await this.resnapshot(batch, resnapshot);
+ resnapshot = [];
+ }
  }
  }
  if (!skipKeepalive) {
@@ -624,6 +804,7 @@ WHERE oid = $1::regclass`,
  // may be in the middle of the next transaction.
  // It must only be used to associate checkpoints with LSNs.
  await batch.keepalive(chunkLastLsn);
+ this.isStartingReplication = false;
  }
  // We receive chunks with empty messages often (about each second).
  // Acknowledging here progresses the slot past these and frees up resources.
@@ -649,7 +830,8 @@ WHERE oid = $1::regclass`,
  if (storageIdentifier.type != lib_postgres.POSTGRES_CONNECTION_TYPE) {
  return {
  // Keep the same behaviour as before allowing Postgres storage.
- createEmptyCheckpoints: true
+ createEmptyCheckpoints: true,
+ oldestUncommittedChange: null
  };
  }
  const parsedStorageIdentifier = lib_postgres.utils.decodePostgresSystemIdentifier(storageIdentifier.id);
@@ -665,7 +847,8 @@ WHERE oid = $1::regclass`,
  * Don't create empty checkpoints if the same Postgres database is used for the data source
  * and sync bucket storage. Creating empty checkpoints will cause WAL feedback loops.
  */
- createEmptyCheckpoints: replicationIdentifier.database_name != parsedStorageIdentifier.database_name
+ createEmptyCheckpoints: replicationIdentifier.database_name != parsedStorageIdentifier.database_name,
+ oldestUncommittedChange: null
  };
  }
  /**
@@ -676,6 +859,19 @@ WHERE oid = $1::regclass`,
  const version = await this.connections.getServerVersion();
  return version ? version.compareMain('14.0.0') >= 0 : false;
  }
+ async getReplicationLagMillis() {
+ if (this.oldestUncommittedChange == null) {
+ if (this.isStartingReplication) {
+ // We don't have anything to compute replication lag with yet.
+ return undefined;
+ }
+ else {
+ // We don't have any uncommitted changes, so replication is up-to-date.
+ return 0;
+ }
+ }
+ return Date.now() - this.oldestUncommittedChange.getTime();
+ }
  }
  async function touch() {
  // FIXME: The hosted Kubernetes probe does not actually check the timestamp on this.
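
The other small mechanism this version adds is replication-lag tracking via `oldestUncommittedChange` and `isStartingReplication`. The sketch below distills the logic visible in the WalStream diff above into a standalone class; the class itself is hypothetical (not part of the package), and it assumes `commitTime` is microseconds since the Unix epoch, as the `Number(msg.commitTime / 1000n)` conversion in the diff implies.

```ts
// Standalone distillation of the lag bookkeeping added to WalStream above.
class ReplicationLagTracker {
  private oldestUncommittedChange: Date | null = null;
  private isStartingReplication = true;

  // 'begin' handler: remember the upstream commit time of the oldest
  // transaction that has not yet been committed to sync storage.
  onBegin(commitTime: bigint) {
    if (this.oldestUncommittedChange == null) {
      this.oldestUncommittedChange = new Date(Number(commitTime / 1000n));
    }
  }

  // After batch.commit() reports success: all pending changes are persisted.
  onDidCommit() {
    this.oldestUncommittedChange = null;
    this.isStartingReplication = false;
  }

  // After batch.keepalive(): we have a position, but lag tracking only resets
  // on a successful commit.
  onKeepalive() {
    this.isStartingReplication = false;
  }

  getReplicationLagMillis(): number | undefined {
    if (this.oldestUncommittedChange == null) {
      // undefined until the first commit or keepalive; 0 once fully caught up.
      return this.isStartingReplication ? undefined : 0;
    }
    return Date.now() - this.oldestUncommittedChange.getTime();
  }
}
```
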