@powersync/service-module-postgres 0.13.1 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/CHANGELOG.md +32 -0
  2. package/dist/api/PostgresRouteAPIAdapter.d.ts +1 -1
  3. package/dist/api/PostgresRouteAPIAdapter.js +1 -1
  4. package/dist/api/PostgresRouteAPIAdapter.js.map +1 -1
  5. package/dist/replication/SnapshotQuery.d.ts +78 -0
  6. package/dist/replication/SnapshotQuery.js +175 -0
  7. package/dist/replication/SnapshotQuery.js.map +1 -0
  8. package/dist/replication/WalStream.d.ts +37 -4
  9. package/dist/replication/WalStream.js +318 -91
  10. package/dist/replication/WalStream.js.map +1 -1
  11. package/dist/replication/WalStreamReplicationJob.d.ts +2 -0
  12. package/dist/replication/WalStreamReplicationJob.js +14 -3
  13. package/dist/replication/WalStreamReplicationJob.js.map +1 -1
  14. package/dist/replication/WalStreamReplicator.d.ts +1 -0
  15. package/dist/replication/WalStreamReplicator.js +22 -0
  16. package/dist/replication/WalStreamReplicator.js.map +1 -1
  17. package/dist/replication/replication-utils.d.ts +4 -0
  18. package/dist/replication/replication-utils.js +46 -2
  19. package/dist/replication/replication-utils.js.map +1 -1
  20. package/package.json +10 -9
  21. package/src/api/PostgresRouteAPIAdapter.ts +1 -1
  22. package/src/replication/SnapshotQuery.ts +209 -0
  23. package/src/replication/WalStream.ts +373 -98
  24. package/src/replication/WalStreamReplicationJob.ts +15 -3
  25. package/src/replication/WalStreamReplicator.ts +26 -0
  26. package/src/replication/replication-utils.ts +60 -2
  27. package/test/src/__snapshots__/schema_changes.test.ts.snap +2 -2
  28. package/test/src/checkpoints.test.ts +7 -5
  29. package/test/src/chunked_snapshots.test.ts +156 -0
  30. package/test/src/large_batch.test.ts +5 -154
  31. package/test/src/resuming_snapshots.test.ts +150 -0
  32. package/test/src/schema_changes.test.ts +5 -10
  33. package/test/src/slow_tests.test.ts +13 -30
  34. package/test/src/util.ts +12 -1
  35. package/test/src/validation.test.ts +0 -1
  36. package/test/src/wal_stream.test.ts +4 -9
  37. package/test/src/wal_stream_utils.ts +15 -7
  38. package/tsconfig.tsbuildinfo +1 -1
package/src/replication/WalStream.ts

@@ -4,25 +4,53 @@ import {
  DatabaseConnectionError,
  ErrorCode,
  errors,
- logger,
- ReplicationAbortedError,
- ReplicationAssertionError
+ Logger,
+ logger as defaultLogger,
+ ReplicationAssertionError,
+ ReplicationAbortedError
  } from '@powersync/lib-services-framework';
- import { getUuidReplicaIdentityBson, MetricsEngine, SourceEntityDescriptor, storage } from '@powersync/service-core';
+ import {
+ BucketStorageBatch,
+ getUuidReplicaIdentityBson,
+ MetricsEngine,
+ RelationCache,
+ SaveUpdate,
+ SourceEntityDescriptor,
+ SourceTable,
+ storage
+ } from '@powersync/service-core';
  import * as pgwire from '@powersync/service-jpgwire';
  import { DatabaseInputRow, SqliteRow, SqlSyncRules, TablePattern, toSyncRulesRow } from '@powersync/service-sync-rules';
  import * as pg_utils from '../utils/pgwire_utils.js';

  import { PgManager } from './PgManager.js';
  import { getPgOutputRelation, getRelId } from './PgRelation.js';
- import { checkSourceConfiguration, getReplicationIdentityColumns } from './replication-utils.js';
+ import { checkSourceConfiguration, checkTableRls, getReplicationIdentityColumns } from './replication-utils.js';
  import { ReplicationMetric } from '@powersync/service-types';
+ import {
+ ChunkedSnapshotQuery,
+ IdSnapshotQuery,
+ MissingRow,
+ PrimaryKeyValue,
+ SimpleSnapshotQuery,
+ SnapshotQuery
+ } from './SnapshotQuery.js';

  export interface WalStreamOptions {
+ logger?: Logger;
  connections: PgManager;
  storage: storage.SyncRulesBucketStorage;
  metrics: MetricsEngine;
  abort_signal: AbortSignal;
+
+ /**
+ * Override snapshot chunk length (number of rows), for testing.
+ *
+ * Defaults to 10_000.
+ *
+ * Note that queries are streamed, so we don't actually keep that much data in memory.
+ */
+ snapshotChunkLength?: number;
  }

  interface InitResult {
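Both new `WalStreamOptions` fields are optional, so existing callers keep working: `logger` falls back to the framework's global logger, and `snapshotChunkLength` defaults to 10_000. A minimal sketch of how a test might wire them up, written as if it lived next to `WalStream.ts`; the `declare const` placeholders stand in for the connection manager, bucket storage, and metrics objects that the real service constructs elsewhere:

```ts
import { logger } from '@powersync/lib-services-framework';
import { MetricsEngine, storage } from '@powersync/service-core';
import { PgManager } from './PgManager.js';
import { WalStream, WalStreamOptions } from './WalStream.js';

// Placeholders for dependencies that the real service wires up elsewhere.
declare const connections: PgManager;
declare const bucketStorage: storage.SyncRulesBucketStorage;
declare const metrics: MetricsEngine;

const abortController = new AbortController();

const options: WalStreamOptions = {
  logger, // optional: omit to use the global logger
  connections,
  storage: bucketStorage,
  metrics,
  abort_signal: abortController.signal,
  // Small chunks make the resumable-snapshot behaviour easy to exercise in tests;
  // production can rely on the 10_000 default.
  snapshotChunkLength: 100
};

const stream = new WalStream(options);
```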
@@ -73,6 +101,8 @@ export class WalStream {

  connection_id = 1;

+ private logger: Logger;
+
  private readonly storage: storage.SyncRulesBucketStorage;
  private readonly metrics: MetricsEngine;
  private readonly slot_name: string;
@@ -81,17 +111,37 @@ export class WalStream {

  private abort_signal: AbortSignal;

- private relation_cache = new Map<string | number, storage.SourceTable>();
+ private relationCache = new RelationCache((relation: number | SourceTable) => {
+ if (typeof relation == 'number') {
+ return relation;
+ }
+ return relation.objectId!;
+ });

  private startedStreaming = false;

+ private snapshotChunkLength: number;
+
+ /**
+ * Time of the oldest uncommitted change, according to the source db.
+ * This is used to determine the replication lag.
+ */
+ private oldestUncommittedChange: Date | null = null;
+ /**
+ * Keep track of whether we have done a commit or keepalive yet.
+ * We can only compute replication lag if isStartingReplication == false, or oldestUncommittedChange is present.
+ */
+ private isStartingReplication = true;
+
  constructor(options: WalStreamOptions) {
+ this.logger = options.logger ?? defaultLogger;
  this.storage = options.storage;
  this.metrics = options.metrics;
  this.sync_rules = options.storage.getParsedSyncRules({ defaultSchema: POSTGRES_DEFAULT_SCHEMA });
  this.group_id = options.storage.group_id;
  this.slot_name = options.storage.slot_name;
  this.connections = options.connections;
+ this.snapshotChunkLength = options.snapshotChunkLength ?? 10_000;

  this.abort_signal = options.abort_signal;
  this.abort_signal.addEventListener(
@@ -104,7 +154,7 @@ export class WalStream {
  const promise = sendKeepAlive(this.connections.pool);
  promise.catch((e) => {
  // Failures here are okay - this only speeds up stopping the process.
- logger.warn('Failed to ping connection', e);
+ this.logger.warn('Failed to ping connection', e);
  });
  } else {
  // If we haven't started streaming yet, it could be due to something like
@@ -183,10 +233,21 @@ export class WalStream {
  ]
  });
  if (rs.rows.length == 0) {
- logger.info(`Skipping ${tablePattern.schema}.${name} - not part of ${PUBLICATION_NAME} publication`);
+ this.logger.info(`Skipping ${tablePattern.schema}.${name} - not part of ${PUBLICATION_NAME} publication`);
  continue;
  }

+ try {
+ const result = await checkTableRls(db, relid);
+ if (!result.canRead) {
+ // We log the message, then continue anyway, since the check does not cover all cases.
+ this.logger.warn(result.message!);
+ }
+ } catch (e) {
+ // It's possible that we just don't have permission to access pg_roles - log the error and continue.
+ this.logger.warn(`Could not check RLS access for ${tablePattern.schema}.${name}`, e);
+ }
+
  const cresult = await getReplicationIdentityColumns(db, relid);

  const table = await this.handleRelation(
@@ -215,7 +276,7 @@ export class WalStream {
  const snapshotDone = status.snapshot_done && status.checkpoint_lsn != null;
  if (snapshotDone) {
  // Snapshot is done, but we still need to check the replication slot status
- logger.info(`${slotName} Initial replication already done`);
+ this.logger.info(`Initial replication already done`);
  }

  // Check if replication slot exists
@@ -276,7 +337,7 @@ export class WalStream {
  // We peek a large number of changes here, to make it more likely to pick up replication slot errors.
  // For example, "publication does not exist" only occurs here if the peek actually includes changes related
  // to the slot.
- logger.info(`Checking ${slotName}`);
+ this.logger.info(`Checking ${slotName}`);

  // The actual results can be quite large, so we don't actually return everything
  // due to memory and processing overhead that would create.
@@ -293,11 +354,11 @@ export class WalStream {
  }

  // Success
- logger.info(`Slot ${slotName} appears healthy`);
+ this.logger.info(`Slot ${slotName} appears healthy`);
  return { needsNewSlot: false };
  } catch (e) {
  last_error = e;
- logger.warn(`${slotName} Replication slot error`, e);
+ this.logger.warn(`Replication slot error`, e);

  if (this.stopped) {
  throw e;
@@ -324,7 +385,7 @@ export class WalStream {
  // Sample: publication "powersync" does not exist
  // Happens when publication deleted or never created.
  // Slot must be re-created in this case.
- logger.info(`${slotName} is not valid anymore`);
+ this.logger.info(`${slotName} is not valid anymore`);

  return { needsNewSlot: true };
  }
@@ -336,7 +397,7 @@ export class WalStream {
  throw new ReplicationAssertionError('Unreachable');
  }

- async estimatedCount(db: pgwire.PgConnection, table: storage.SourceTable): Promise<string> {
+ async estimatedCountNumber(db: pgwire.PgConnection, table: storage.SourceTable): Promise<number> {
  const results = await db.query({
  statement: `SELECT reltuples::bigint AS estimate
  FROM pg_class
@@ -345,9 +406,9 @@ WHERE oid = $1::regclass`,
  });
  const row = results.rows[0];
  if ((row?.[0] ?? -1n) == -1n) {
- return '?';
+ return -1;
  } else {
- return `~${row[0]}`;
+ return Number(row[0]);
  }
  }

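The renamed `estimatedCountNumber` keeps using planner statistics rather than a `COUNT(*)` scan: `pg_class.reltuples` is cheap to read but only as fresh as the last `ANALYZE`/`VACUUM`, and on PostgreSQL 14+ it is `-1` for a table that has never been analyzed (older versions report `0`). Returning `-1` as a numeric "unknown" sentinel replaces the old `'?'` string so the value can feed into snapshot progress tracking. The same query can be run standalone; a sketch using node-postgres (`pg`) rather than the module's pgwire client:

```ts
import { Client } from 'pg';

// Returns the planner's row estimate for a table, or -1 if unknown.
// Cheap compared to COUNT(*), but only as accurate as the latest statistics.
async function estimatedCount(client: Client, table: string): Promise<number> {
  const result = await client.query<{ estimate: string }>(
    'SELECT reltuples::bigint AS estimate FROM pg_class WHERE oid = $1::regclass',
    [table]
  );
  const estimate = Number(result.rows[0]?.estimate ?? -1);
  return estimate < 0 ? -1 : estimate;
}

// Usage:
// const client = new Client({ connectionString: process.env.DATABASE_URL });
// await client.connect();
// console.log(await estimatedCount(client, 'public.lists'));
```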
@@ -370,7 +431,7 @@ WHERE oid = $1::regclass`,
  // In those cases, we have to start replication from scratch.
  // If there is an existing healthy slot, we can skip this and continue
  // initial replication where we left off.
- await this.storage.clear();
+ await this.storage.clear({ signal: this.abort_signal });

  await db.query({
  statement: 'SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots WHERE slot_name = $1',
@@ -381,7 +442,7 @@ WHERE oid = $1::regclass`,
  // The replication slot must be created before we start snapshotting tables.
  await replicationConnection.query(`CREATE_REPLICATION_SLOT ${slotName} LOGICAL pgoutput`);

- logger.info(`Created replication slot ${slotName}`);
+ this.logger.info(`Created replication slot ${slotName}`);
  }

  await this.initialReplication(db);
@@ -390,24 +451,37 @@ WHERE oid = $1::regclass`,
  async initialReplication(db: pgwire.PgConnection) {
  const sourceTables = this.sync_rules.getSourceTables();
  await this.storage.startBatch(
- { zeroLSN: ZERO_LSN, defaultSchema: POSTGRES_DEFAULT_SCHEMA, storeCurrentData: true, skipExistingRows: true },
+ {
+ logger: this.logger,
+ zeroLSN: ZERO_LSN,
+ defaultSchema: POSTGRES_DEFAULT_SCHEMA,
+ storeCurrentData: true,
+ skipExistingRows: true
+ },
  async (batch) => {
+ let tablesWithStatus: SourceTable[] = [];
  for (let tablePattern of sourceTables) {
  const tables = await this.getQualifiedTableNames(batch, db, tablePattern);
+ // Pre-get counts
  for (let table of tables) {
  if (table.snapshotComplete) {
- logger.info(`${this.slot_name} Skipping ${table.qualifiedName} - snapshot already done`);
+ this.logger.info(`Skipping ${table.qualifiedName} - snapshot already done`);
  continue;
  }
- await this.snapshotTable(batch, db, table);
+ const count = await this.estimatedCountNumber(db, table);
+ table = await batch.updateTableProgress(table, { totalEstimatedCount: count });
+ this.relationCache.update(table);
+ tablesWithStatus.push(table);

- const rs = await db.query(`select pg_current_wal_lsn() as lsn`);
- const tableLsnNotBefore = rs.rows[0][0];
- await batch.markSnapshotDone([table], tableLsnNotBefore);
- await touch();
+ this.logger.info(`To replicate: ${table.qualifiedName} ${table.formatSnapshotProgress()}`);
  }
  }

+ for (let table of tablesWithStatus) {
+ await this.snapshotTableInTx(batch, db, table);
+ await touch();
+ }
+
  // Always commit the initial snapshot at zero.
  // This makes sure we don't skip any changes applied before starting this snapshot,
  // in the case of snapshot retries.
@@ -431,60 +505,164 @@ WHERE oid = $1::regclass`,
  yield toSyncRulesRow(row);
  }
  }
+ private async snapshotTableInTx(
+ batch: storage.BucketStorageBatch,
+ db: pgwire.PgConnection,
+ table: storage.SourceTable,
+ limited?: PrimaryKeyValue[]
+ ): Promise<storage.SourceTable> {
+ // Note: We use the default "Read Committed" isolation level here, not snapshot isolation.
+ // The data may change during the transaction, but that is compensated for in the streaming
+ // replication afterwards.
+ await db.query('BEGIN');
+ try {
+ let tableLsnNotBefore: string;
+ await this.snapshotTable(batch, db, table, limited);
+
+ // Get the current LSN.
+ // The data will only be consistent once incremental replication has passed that point.
+ // We have to get this LSN _after_ we have finished the table snapshot.
+ //
+ // There are basically two relevant LSNs here:
+ // A: The LSN before the snapshot starts. We don't explicitly record this on the PowerSync side,
+ // but it is implicitly recorded in the replication slot.
+ // B: The LSN after the table snapshot is complete, which is what we get here.
+ // When we do the snapshot queries, the data that we get back for each chunk could match the state
+ // anywhere between A and B. To actually have a consistent state on our side, we need to:
+ // 1. Complete the snapshot.
+ // 2. Wait until logical replication has caught up with all the change between A and B.
+ // Calling `markSnapshotDone(LSN B)` covers that.
+ const rs = await db.query(`select pg_current_wal_lsn() as lsn`);
+ tableLsnNotBefore = rs.rows[0][0];
+ // Side note: A ROLLBACK would probably also be fine here, since we only read in this transaction.
+ await db.query('COMMIT');
+ const [resultTable] = await batch.markSnapshotDone([table], tableLsnNotBefore);
+ this.relationCache.update(resultTable);
+ return resultTable;
+ } catch (e) {
+ await db.query('ROLLBACK');
+ throw e;
+ }
+ }
+
+ private async snapshotTable(
+ batch: storage.BucketStorageBatch,
+ db: pgwire.PgConnection,
+ table: storage.SourceTable,
+ limited?: PrimaryKeyValue[]
+ ) {
+ let totalEstimatedCount = table.snapshotStatus?.totalEstimatedCount;
+ let at = table.snapshotStatus?.replicatedCount ?? 0;
+ let lastCountTime = 0;
+ let q: SnapshotQuery;
+ // We do streaming on two levels:
+ // 1. Coarse level: DELCARE CURSOR, FETCH 10000 at a time.
+ // 2. Fine level: Stream chunks from each fetch call.
+ if (limited) {
+ q = new IdSnapshotQuery(db, table, limited);
+ } else if (ChunkedSnapshotQuery.supports(table)) {
+ // Single primary key - we can use the primary key for chunking
+ const orderByKey = table.replicaIdColumns[0];
+ q = new ChunkedSnapshotQuery(db, table, this.snapshotChunkLength, table.snapshotStatus?.lastKey ?? null);
+ if (table.snapshotStatus?.lastKey != null) {
+ this.logger.info(
+ `Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()} - resuming from ${orderByKey.name} > ${(q as ChunkedSnapshotQuery).lastKey}`
+ );
+ } else {
+ this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()} - resumable`);
+ }
+ } else {
+ // Fallback case - query the entire table
+ this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()} - not resumable`);
+ q = new SimpleSnapshotQuery(db, table, this.snapshotChunkLength);
+ at = 0;
+ }
+ await q.initialize();

- private async snapshotTable(batch: storage.BucketStorageBatch, db: pgwire.PgConnection, table: storage.SourceTable) {
- logger.info(`${this.slot_name} Replicating ${table.qualifiedName}`);
- const estimatedCount = await this.estimatedCount(db, table);
- let at = 0;
- let lastLogIndex = 0;
- const cursor = db.stream({ statement: `SELECT * FROM ${table.escapedIdentifier}` });
  let columns: { i: number; name: string }[] = [];
- // pgwire streams rows in chunks.
- // These chunks can be quite small (as little as 16KB), so we don't flush chunks automatically.
-
- for await (let chunk of cursor) {
- if (chunk.tag == 'RowDescription') {
- let i = 0;
- columns = chunk.payload.map((c) => {
- return { i: i++, name: c.name };
+ let hasRemainingData = true;
+ while (hasRemainingData) {
+ // Fetch 10k at a time.
+ // The balance here is between latency overhead per FETCH call,
+ // and not spending too much time on each FETCH call.
+ // We aim for a couple of seconds on each FETCH call.
+ const cursor = q.nextChunk();
+ hasRemainingData = false;
+ // pgwire streams rows in chunks.
+ // These chunks can be quite small (as little as 16KB), so we don't flush chunks automatically.
+ // There are typically 100-200 rows per chunk.
+ for await (let chunk of cursor) {
+ if (chunk.tag == 'RowDescription') {
+ // We get a RowDescription for each FETCH call, but they should
+ // all be the same.
+ let i = 0;
+ columns = chunk.payload.map((c) => {
+ return { i: i++, name: c.name };
+ });
+ continue;
+ }
+
+ const rows = chunk.rows.map((row) => {
+ let q: DatabaseInputRow = {};
+ for (let c of columns) {
+ q[c.name] = row[c.i];
+ }
+ return q;
  });
- continue;
- }
+ if (rows.length > 0) {
+ hasRemainingData = true;
+ }

- const rows = chunk.rows.map((row) => {
- let q: DatabaseInputRow = {};
- for (let c of columns) {
- q[c.name] = row[c.i];
+ for (const record of WalStream.getQueryData(rows)) {
+ // This auto-flushes when the batch reaches its size limit
+ await batch.save({
+ tag: storage.SaveOperationTag.INSERT,
+ sourceTable: table,
+ before: undefined,
+ beforeReplicaId: undefined,
+ after: record,
+ afterReplicaId: getUuidReplicaIdentityBson(record, table.replicaIdColumns)
+ });
  }
- return q;
- });
- if (rows.length > 0 && at - lastLogIndex >= 5000) {
- logger.info(`${this.slot_name} Replicating ${table.qualifiedName} ${at}/${estimatedCount}`);
- lastLogIndex = at;
- }
- if (this.abort_signal.aborted) {
- throw new ReplicationAbortedError(`Aborted initial replication of ${this.slot_name}`);
+
+ at += rows.length;
+ this.metrics.getCounter(ReplicationMetric.ROWS_REPLICATED).add(rows.length);
+
+ await touch();
  }

- for (const record of WalStream.getQueryData(rows)) {
- // This auto-flushes when the batch reaches its size limit
- await batch.save({
- tag: storage.SaveOperationTag.INSERT,
- sourceTable: table,
- before: undefined,
- beforeReplicaId: undefined,
- after: record,
- afterReplicaId: getUuidReplicaIdentityBson(record, table.replicaIdColumns)
+ // Important: flush before marking progress
+ await batch.flush();
+ if (limited == null) {
+ let lastKey: Uint8Array | undefined;
+ if (q instanceof ChunkedSnapshotQuery) {
+ lastKey = q.getLastKeySerialized();
+ }
+ if (lastCountTime < performance.now() - 10 * 60 * 1000) {
+ // Even though we're doing the snapshot inside a transaction, the transaction uses
+ // the default "Read Committed" isolation level. This means we can get new data
+ // within the transaction, so we re-estimate the count every 10 minutes when replicating
+ // large tables.
+ totalEstimatedCount = await this.estimatedCountNumber(db, table);
+ lastCountTime = performance.now();
+ }
+ table = await batch.updateTableProgress(table, {
+ lastKey: lastKey,
+ replicatedCount: at,
+ totalEstimatedCount: totalEstimatedCount
  });
- }
+ this.relationCache.update(table);

- at += rows.length;
- this.metrics.getCounter(ReplicationMetric.ROWS_REPLICATED).add(rows.length);
+ this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()}`);
+ } else {
+ this.logger.info(`Replicating ${table.qualifiedName} ${at}/${limited.length} for resnapshot`);
+ }

- await touch();
+ if (this.abort_signal.aborted) {
+ // We only abort after flushing
+ throw new ReplicationAbortedError(`Initial replication interrupted`);
+ }
  }
-
- await batch.flush();
  }

  async handleRelation(batch: storage.BucketStorageBatch, descriptor: SourceEntityDescriptor, snapshot: boolean) {
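The `ChunkedSnapshotQuery` used above lives in the new `SnapshotQuery.ts` (not part of this hunk), but the log lines in the diff show the underlying idea: when a table has a single primary-key column, the snapshot is ordinary keyset pagination over that key, and the last key of each flushed chunk is persisted via `updateTableProgress` so an interrupted snapshot can resume with a `key > lastKey` predicate instead of starting over. A rough standalone sketch of that idea using node-postgres (`pg`), not the pgwire client or the real `SnapshotQuery` API:

```ts
import { Client } from 'pg';

// Keyset-paginated table scan: fetch rows in chunks ordered by a single
// primary-key column, yielding the last key of each chunk so the caller can
// persist it and resume after an interruption.
async function* chunkedScan(
  client: Client,
  table: string, // trusted, already-escaped identifier, e.g. '"public"."lists"'
  keyColumn: string, // trusted name of the single primary-key column, e.g. 'id'
  chunkLength: number,
  resumeAfter?: unknown
): AsyncGenerator<{ rows: Record<string, unknown>[]; lastKey: unknown }> {
  let lastKey = resumeAfter;
  while (true) {
    // Strictly-greater-than the last key keeps chunks non-overlapping, without
    // OFFSET's cost of re-skipping everything already replicated.
    const where = lastKey == null ? '' : `WHERE "${keyColumn}" > $2`;
    const params = lastKey == null ? [chunkLength] : [chunkLength, lastKey];
    const result = await client.query(
      `SELECT * FROM ${table} ${where} ORDER BY "${keyColumn}" LIMIT $1`,
      params
    );
    if (result.rows.length == 0) {
      return;
    }
    lastKey = result.rows[result.rows.length - 1][keyColumn];
    yield { rows: result.rows, lastKey };
  }
}
```

The real implementation additionally serializes the resume key (`getLastKeySerialized()` returns a `Uint8Array`) so progress survives a process restart, and falls back to `SimpleSnapshotQuery` (not resumable) for tables without a usable single-column key.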
@@ -498,7 +676,7 @@ WHERE oid = $1::regclass`,
  entity_descriptor: descriptor,
  sync_rules: this.sync_rules
  });
- this.relation_cache.set(descriptor.objectId, result.table);
+ this.relationCache.update(result.table);

  // Drop conflicting tables. This includes for example renamed tables.
  await batch.drop(result.dropTables);
@@ -513,40 +691,59 @@ WHERE oid = $1::regclass`,
  // Truncate this table, in case a previous snapshot was interrupted.
  await batch.truncate([result.table]);

- let lsn: string = ZERO_LSN;
  // Start the snapshot inside a transaction.
  // We use a dedicated connection for this.
  const db = await this.connections.snapshotConnection();
  try {
- await db.query('BEGIN');
- try {
- await this.snapshotTable(batch, db, result.table);
-
- // Get the current LSN.
- // The data will only be consistent once incremental replication
- // has passed that point.
- // We have to get this LSN _after_ we have started the snapshot query.
- const rs = await db.query(`select pg_current_wal_lsn() as lsn`);
- lsn = rs.rows[0][0];
-
- await db.query('COMMIT');
- } catch (e) {
- await db.query('ROLLBACK');
- // TODO: Wrap with custom error type
- throw e;
- }
+ const table = await this.snapshotTableInTx(batch, db, result.table);
+ // After the table snapshot, we wait for replication to catch up.
+ // To make sure there is actually something to replicate, we send a keepalive
+ // message.
+ await sendKeepAlive(db);
+ return table;
  } finally {
  await db.end();
  }
- const [table] = await batch.markSnapshotDone([result.table], lsn);
- return table;
  }

  return result.table;
  }

+ /**
+ * Process rows that have missing TOAST values.
+ *
+ * This can happen during edge cases in the chunked intial snapshot process.
+ *
+ * We handle this similar to an inline table snapshot, but limited to the specific
+ * set of rows.
+ */
+ private async resnapshot(batch: BucketStorageBatch, rows: MissingRow[]) {
+ const byTable = new Map<number, MissingRow[]>();
+ for (let row of rows) {
+ const relId = row.table.objectId as number; // always a number for postgres
+ if (!byTable.has(relId)) {
+ byTable.set(relId, []);
+ }
+ byTable.get(relId)!.push(row);
+ }
+ const db = await this.connections.snapshotConnection();
+ try {
+ for (let rows of byTable.values()) {
+ const table = rows[0].table;
+ await this.snapshotTableInTx(
+ batch,
+ db,
+ table,
+ rows.map((r) => r.key)
+ );
+ }
+ } finally {
+ await db.end();
+ }
+ }
+
  private getTable(relationId: number): storage.SourceTable {
- const table = this.relation_cache.get(relationId);
+ const table = this.relationCache.get(relationId);
  if (table == null) {
  // We should always receive a replication message before the relation is used.
  // If we can't find it, it's a bug.
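The re-snapshot path exists because a logical-replication UPDATE message does not necessarily carry every column: with the default replica identity, Postgres omits unchanged TOAST-ed (large) values. Normally the previously stored row fills those gaps, but while a chunked snapshot is still in progress an update can arrive for a row that has not been stored yet; such rows are reported by the batch (via the `markRecordUnavailable` callback further down in this diff) and then re-read by primary key, one transaction per table. The real `IdSnapshotQuery` is in `SnapshotQuery.ts` and is not shown here; the sketch below, using node-postgres (`pg`) and assuming a single trusted key column, only illustrates the kind of query a targeted re-read needs:

```ts
import { Client } from 'pg';

// Re-read a specific set of rows by primary key, as a targeted re-snapshot
// needs to do. Illustrative only - not the real IdSnapshotQuery API.
async function fetchRowsByKey(
  client: Client,
  table: string, // trusted, already-escaped identifier
  keyColumn: string, // trusted name of the primary-key column
  keys: unknown[]
): Promise<Record<string, unknown>[]> {
  const result = await client.query(
    `SELECT * FROM ${table} WHERE "${keyColumn}" = ANY($1)`,
    [keys]
  );
  return result.rows;
}
```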
@@ -565,7 +762,7 @@ WHERE oid = $1::regclass`,
  if (msg.tag == 'insert' || msg.tag == 'update' || msg.tag == 'delete') {
  const table = this.getTable(getRelId(msg.relation));
  if (!table.syncAny) {
- logger.debug(`Table ${table.qualifiedName} not used in sync rules - skipping`);
+ this.logger.debug(`Table ${table.qualifiedName} not used in sync rules - skipping`);
  return null;
  }

@@ -673,8 +870,39 @@ WHERE oid = $1::regclass`,
  // Auto-activate as soon as initial replication is done
  await this.storage.autoActivate();

+ let resnapshot: { table: storage.SourceTable; key: PrimaryKeyValue }[] = [];
+
+ const markRecordUnavailable = (record: SaveUpdate) => {
+ if (!IdSnapshotQuery.supports(record.sourceTable)) {
+ // If it's not supported, it's also safe to ignore
+ return;
+ }
+ let key: PrimaryKeyValue = {};
+ for (let column of record.sourceTable.replicaIdColumns) {
+ const name = column.name;
+ const value = record.after[name];
+ if (value == null) {
+ // We don't expect this to actually happen.
+ // The key should always be present in the "after" record.
+ return;
+ }
+ key[name] = value;
+ }
+ resnapshot.push({
+ table: record.sourceTable,
+ key: key
+ });
+ };
+
  await this.storage.startBatch(
- { zeroLSN: ZERO_LSN, defaultSchema: POSTGRES_DEFAULT_SCHEMA, storeCurrentData: true, skipExistingRows: false },
+ {
+ logger: this.logger,
+ zeroLSN: ZERO_LSN,
+ defaultSchema: POSTGRES_DEFAULT_SCHEMA,
+ storeCurrentData: true,
+ skipExistingRows: false,
+ markRecordUnavailable
+ },
  async (batch) => {
  // We don't handle any plain keepalive messages while we have transactions.
  // While we have transactions, we use that to advance the position.
@@ -708,6 +936,9 @@ WHERE oid = $1::regclass`,
  } else if (msg.tag == 'begin') {
  // This may span multiple transactions in the same chunk, or even across chunks.
  skipKeepalive = true;
+ if (this.oldestUncommittedChange == null) {
+ this.oldestUncommittedChange = new Date(Number(msg.commitTime / 1000n));
+ }
  } else if (msg.tag == 'commit') {
  this.metrics.getCounter(ReplicationMetric.TRANSACTIONS_REPLICATED).add(1);
  if (msg == lastCommit) {
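`oldestUncommittedChange` is seeded from the `begin` message of the first source transaction that has not yet been committed to the sync bucket storage, and cleared again once a commit or keepalive goes through; it is what `getReplicationLagMillis()` at the end of this diff measures against. The `commitTime / 1000n` expression suggests pgwire exposes the commit timestamp as a bigint of microseconds since the Unix epoch; under that assumption (not confirmed by this diff) the conversion is simply:

```ts
// Assumption: commitTime is a bigint of microseconds since the Unix epoch,
// as the `/ 1000n` above implies. Integer division truncates to whole
// milliseconds, which is all the Date constructor can represent.
function commitTimeToDate(commitTime: bigint): Date {
  return new Date(Number(commitTime / 1000n));
}

// Example: 2024-01-01T00:00:00Z expressed in microseconds.
console.log(commitTimeToDate(1_704_067_200_000_000n).toISOString());
// -> 2024-01-01T00:00:00.000Z
```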
@@ -715,12 +946,29 @@ WHERE oid = $1::regclass`,
  // This effectively lets us batch multiple transactions within the same chunk
  // into a single flush, increasing throughput for many small transactions.
  skipKeepalive = false;
- await batch.commit(msg.lsn!, { createEmptyCheckpoints });
+ // flush() must be before the resnapshot check - that is
+ // typically what reports the resnapshot records.
+ await batch.flush({ oldestUncommittedChange: this.oldestUncommittedChange });
+ // This _must_ be checked after the flush(), and before
+ // commit() or ack(). We never persist the resnapshot list,
+ // so we have to process it before marking our progress.
+ if (resnapshot.length > 0) {
+ await this.resnapshot(batch, resnapshot);
+ resnapshot = [];
+ }
+ const didCommit = await batch.commit(msg.lsn!, {
+ createEmptyCheckpoints,
+ oldestUncommittedChange: this.oldestUncommittedChange
+ });
  await this.ack(msg.lsn!, replicationStream);
+ if (didCommit) {
+ this.oldestUncommittedChange = null;
+ this.isStartingReplication = false;
+ }
  }
  } else {
  if (count % 100 == 0) {
- logger.info(`${this.slot_name} replicating op ${count} ${msg.lsn}`);
+ this.logger.info(`Replicating op ${count} ${msg.lsn}`);
  }

  /**
@@ -734,7 +982,14 @@ WHERE oid = $1::regclass`,
  }

  count += 1;
- await this.writeChange(batch, msg);
+ const flushResult = await this.writeChange(batch, msg);
+ if (flushResult != null && resnapshot.length > 0) {
+ // If we have large transactions, we also need to flush the resnapshot list
+ // periodically.
+ // TODO: make sure this bit is actually triggered
+ await this.resnapshot(batch, resnapshot);
+ resnapshot = [];
+ }
  }
  }

@@ -748,7 +1003,12 @@ WHERE oid = $1::regclass`,
  // Big caveat: This _must not_ be used to skip individual messages, since this LSN
  // may be in the middle of the next transaction.
  // It must only be used to associate checkpoints with LSNs.
- await batch.keepalive(chunkLastLsn);
+ const didCommit = await batch.keepalive(chunkLastLsn);
+ if (didCommit) {
+ this.oldestUncommittedChange = null;
+ }
+
+ this.isStartingReplication = false;
  }

  // We receive chunks with empty messages often (about each second).
@@ -781,7 +1041,8 @@ WHERE oid = $1::regclass`,
  if (storageIdentifier.type != lib_postgres.POSTGRES_CONNECTION_TYPE) {
  return {
  // Keep the same behaviour as before allowing Postgres storage.
- createEmptyCheckpoints: true
+ createEmptyCheckpoints: true,
+ oldestUncommittedChange: null
  };
  }

@@ -804,7 +1065,8 @@ WHERE oid = $1::regclass`,
  * Don't create empty checkpoints if the same Postgres database is used for the data source
  * and sync bucket storage. Creating empty checkpoints will cause WAL feedback loops.
  */
- createEmptyCheckpoints: replicationIdentifier.database_name != parsedStorageIdentifier.database_name
+ createEmptyCheckpoints: replicationIdentifier.database_name != parsedStorageIdentifier.database_name,
+ oldestUncommittedChange: null
  };
  }

@@ -816,6 +1078,19 @@ WHERE oid = $1::regclass`,
  const version = await this.connections.getServerVersion();
  return version ? version.compareMain('14.0.0') >= 0 : false;
  }
+
+ async getReplicationLagMillis(): Promise<number | undefined> {
+ if (this.oldestUncommittedChange == null) {
+ if (this.isStartingReplication) {
+ // We don't have anything to compute replication lag with yet.
+ return undefined;
+ } else {
+ // We don't have any uncommitted changes, so replication is up-to-date.
+ return 0;
+ }
+ }
+ return Date.now() - this.oldestUncommittedChange.getTime();
+ }
  }

  async function touch() {
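Replication lag, as exposed by the new `getReplicationLagMillis()`, is the wall-clock age of the oldest source change that has not yet been committed to storage: `undefined` while no commit or keepalive has been processed yet (lag unknown), `0` when there is no pending change, otherwise `now - oldestUncommittedChange`. A self-contained sketch of the same decision table, written outside the class purely for illustration:

```ts
// Mirrors the getReplicationLagMillis() logic above.
function replicationLagMillis(
  oldestUncommittedChange: Date | null,
  isStartingReplication: boolean,
  now: number = Date.now()
): number | undefined {
  if (oldestUncommittedChange == null) {
    // Either we have not seen a commit/keepalive yet (lag unknown),
    // or everything received so far has been committed (lag zero).
    return isStartingReplication ? undefined : 0;
  }
  return now - oldestUncommittedChange.getTime();
}

// Example: a transaction that began 1500ms ago and is still being applied.
const began = new Date(Date.now() - 1500);
console.log(replicationLagMillis(began, false)); // ~1500
```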