@powersync/service-module-mongodb 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -0
- package/dist/api/MongoRouteAPIAdapter.js +12 -21
- package/dist/api/MongoRouteAPIAdapter.js.map +1 -1
- package/dist/replication/ChangeStream.d.ts +18 -37
- package/dist/replication/ChangeStream.js +136 -351
- package/dist/replication/ChangeStream.js.map +1 -1
- package/dist/replication/MongoRelation.d.ts +1 -1
- package/dist/replication/MongoRelation.js +41 -21
- package/dist/replication/MongoRelation.js.map +1 -1
- package/dist/replication/MongoSnapshotter.d.ts +81 -0
- package/dist/replication/MongoSnapshotter.js +594 -0
- package/dist/replication/MongoSnapshotter.js.map +1 -0
- package/package.json +8 -8
- package/src/api/MongoRouteAPIAdapter.ts +13 -21
- package/src/replication/ChangeStream.ts +150 -426
- package/src/replication/MongoRelation.ts +51 -25
- package/src/replication/MongoSnapshotter.ts +729 -0
- package/test/src/change_stream.test.ts +210 -17
- package/test/src/change_stream_utils.ts +24 -17
- package/test/src/checkpoint_retry.test.ts +131 -0
- package/test/src/resuming_snapshots.test.ts +10 -6
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -7,7 +7,7 @@ import { MongoLSN } from '../common/MongoLSN.js';
|
|
|
7
7
|
import { PostImagesOption } from '../types/types.js';
|
|
8
8
|
import { escapeRegExp } from '../utils.js';
|
|
9
9
|
import { createCheckpoint, getCacheIdentifier, getMongoRelation, STANDALONE_CHECKPOINT_ID } from './MongoRelation.js';
|
|
10
|
-
import {
|
|
10
|
+
import { MongoSnapshotter } from './MongoSnapshotter.js';
|
|
11
11
|
import { parseChangeDocument, rawChangeStream } from './RawChangeStream.js';
|
|
12
12
|
import { CHECKPOINTS_COLLECTION, timestampToDate } from './replication-utils.js';
|
|
13
13
|
import { DirectSourceRowConverter } from './SourceRowConverter.js';
|
|
@@ -34,13 +34,20 @@ export class ChangeStream {
|
|
|
34
34
|
defaultDb;
|
|
35
35
|
metrics;
|
|
36
36
|
maxAwaitTimeMS;
|
|
37
|
-
|
|
37
|
+
abortController = new AbortController();
|
|
38
|
+
abortSignal = this.abortController.signal;
|
|
39
|
+
initPromise = null;
|
|
40
|
+
snapshotter;
|
|
41
|
+
/**
|
|
42
|
+
* We use the relationCache _only_ for caching static SourceTable info, not for snapshot status.
|
|
43
|
+
*/
|
|
38
44
|
relationCache = new RelationCache(getCacheIdentifier);
|
|
39
45
|
replicationLag = new ReplicationLagTracker();
|
|
40
46
|
checkpointStreamId = new mongo.ObjectId();
|
|
41
47
|
logger;
|
|
42
48
|
snapshotChunkLength;
|
|
43
49
|
changeStreamTimeout;
|
|
50
|
+
storageHooks;
|
|
44
51
|
sourceRowConverter;
|
|
45
52
|
constructor(options) {
|
|
46
53
|
this.storage = options.storage;
|
|
@@ -49,6 +56,7 @@ export class ChangeStream {
|
|
|
49
56
|
this.connections = options.connections;
|
|
50
57
|
this.maxAwaitTimeMS = options.maxAwaitTimeMS ?? 10_000;
|
|
51
58
|
this.snapshotChunkLength = options.snapshotChunkLength ?? 6_000;
|
|
59
|
+
this.storageHooks = options.storageHooks;
|
|
52
60
|
this.client = this.connections.client;
|
|
53
61
|
this.defaultDb = this.connections.db;
|
|
54
62
|
this.sync_rules = options.storage.getParsedSyncRules({
|
|
@@ -58,14 +66,22 @@ export class ChangeStream {
|
|
|
58
66
|
// The change stream aggregation command should timeout before the socket times out,
|
|
59
67
|
// so we use 90% of the socket timeout value.
|
|
60
68
|
this.changeStreamTimeout = Math.ceil(this.client.options.socketTimeoutMS * 0.9);
|
|
61
|
-
this.abort_signal = options.abort_signal;
|
|
62
|
-
this.abort_signal.addEventListener('abort', () => {
|
|
63
|
-
// TODO: Fast abort?
|
|
64
|
-
}, { once: true });
|
|
65
69
|
this.logger = options.logger ?? this.storage.logger;
|
|
70
|
+
this.snapshotter = new MongoSnapshotter({
|
|
71
|
+
...options,
|
|
72
|
+
abortSignal: this.abortSignal,
|
|
73
|
+
logger: this.logger,
|
|
74
|
+
checkpointStreamId: this.checkpointStreamId
|
|
75
|
+
});
|
|
76
|
+
options.abort_signal.addEventListener('abort', () => {
|
|
77
|
+
this.abortController.abort(options.abort_signal.reason);
|
|
78
|
+
}, { once: true });
|
|
79
|
+
if (options.abort_signal.aborted) {
|
|
80
|
+
this.abortController.abort(options.abort_signal.reason);
|
|
81
|
+
}
|
|
66
82
|
}
|
|
67
83
|
get stopped() {
|
|
68
|
-
return this.
|
|
84
|
+
return this.abortSignal.aborted;
|
|
69
85
|
}
|
|
70
86
|
get usePostImages() {
|
|
71
87
|
return this.connections.options.postImages != PostImagesOption.OFF;
|
|
@@ -73,229 +89,6 @@ export class ChangeStream {
|
|
|
73
89
|
get configurePostImages() {
|
|
74
90
|
return this.connections.options.postImages == PostImagesOption.AUTO_CONFIGURE;
|
|
75
91
|
}
|
|
76
|
-
/**
|
|
77
|
-
* This resolves a pattern, persists the related metadata, and returns
|
|
78
|
-
* the resulting SourceTables.
|
|
79
|
-
*
|
|
80
|
-
* This implicitly checks the collection postImage configuration.
|
|
81
|
-
*/
|
|
82
|
-
async resolveQualifiedTableNames(batch, tablePattern) {
|
|
83
|
-
const schema = tablePattern.schema;
|
|
84
|
-
if (tablePattern.connectionTag != this.connections.connectionTag) {
|
|
85
|
-
return [];
|
|
86
|
-
}
|
|
87
|
-
let nameFilter;
|
|
88
|
-
if (tablePattern.isWildcard) {
|
|
89
|
-
nameFilter = new RegExp('^' + escapeRegExp(tablePattern.tablePrefix));
|
|
90
|
-
}
|
|
91
|
-
else {
|
|
92
|
-
nameFilter = tablePattern.name;
|
|
93
|
-
}
|
|
94
|
-
let result = [];
|
|
95
|
-
// Check if the collection exists
|
|
96
|
-
const collections = await this.client
|
|
97
|
-
.db(schema)
|
|
98
|
-
.listCollections({
|
|
99
|
-
name: nameFilter
|
|
100
|
-
}, { nameOnly: false })
|
|
101
|
-
.toArray();
|
|
102
|
-
if (!tablePattern.isWildcard && collections.length == 0) {
|
|
103
|
-
this.logger.warn(`Collection ${schema}.${tablePattern.name} not found`);
|
|
104
|
-
}
|
|
105
|
-
for (let collection of collections) {
|
|
106
|
-
const table = await this.handleRelation(batch, getMongoRelation({ db: schema, coll: collection.name }),
|
|
107
|
-
// This is done as part of the initial setup - snapshot is handled elsewhere
|
|
108
|
-
{ snapshot: false, collectionInfo: collection });
|
|
109
|
-
result.push(table);
|
|
110
|
-
}
|
|
111
|
-
return result;
|
|
112
|
-
}
|
|
113
|
-
async initSlot() {
|
|
114
|
-
const status = await this.storage.getStatus();
|
|
115
|
-
if (status.snapshot_done && status.checkpoint_lsn) {
|
|
116
|
-
this.logger.info(`Initial replication already done`);
|
|
117
|
-
return { needsInitialSync: false, snapshotLsn: null };
|
|
118
|
-
}
|
|
119
|
-
return { needsInitialSync: true, snapshotLsn: status.snapshot_lsn };
|
|
120
|
-
}
|
|
121
|
-
async estimatedCount(table) {
|
|
122
|
-
const count = await this.estimatedCountNumber(table);
|
|
123
|
-
return `~${count}`;
|
|
124
|
-
}
|
|
125
|
-
async estimatedCountNumber(table) {
|
|
126
|
-
const db = this.client.db(table.schema);
|
|
127
|
-
return await db.collection(table.name).estimatedDocumentCount();
|
|
128
|
-
}
|
|
129
|
-
/**
|
|
130
|
-
* This gets a LSN before starting a snapshot, which we can resume streaming from after the snapshot.
|
|
131
|
-
*
|
|
132
|
-
* This LSN can survive initial replication restarts.
|
|
133
|
-
*/
|
|
134
|
-
async getSnapshotLsn() {
|
|
135
|
-
const hello = await this.defaultDb.command({ hello: 1 });
|
|
136
|
-
// Basic sanity check
|
|
137
|
-
if (hello.msg == 'isdbgrid') {
|
|
138
|
-
throw new ServiceError(ErrorCode.PSYNC_S1341, 'Sharded MongoDB Clusters are not supported yet (including MongoDB Serverless instances).');
|
|
139
|
-
}
|
|
140
|
-
else if (hello.setName == null) {
|
|
141
|
-
throw new ServiceError(ErrorCode.PSYNC_S1342, 'Standalone MongoDB instances are not supported - use a replicaset.');
|
|
142
|
-
}
|
|
143
|
-
// Open a change stream just to get a resume token for later use.
|
|
144
|
-
// We could use clusterTime from the hello command, but that won't tell us if the
|
|
145
|
-
// snapshot isn't valid anymore.
|
|
146
|
-
// If we just use the first resumeToken from the stream, we get two potential issues:
|
|
147
|
-
// 1. The resumeToken may just be a wrapped clusterTime, which does not detect changes
|
|
148
|
-
// in source db or other stream issues.
|
|
149
|
-
// 2. The first actual change we get may have the same clusterTime, causing us to incorrect
|
|
150
|
-
// skip that event.
|
|
151
|
-
// Instead, we create a new checkpoint document, and wait until we get that document back in the stream.
|
|
152
|
-
// To avoid potential race conditions with the checkpoint creation, we create a new checkpoint document
|
|
153
|
-
// periodically until the timeout is reached.
|
|
154
|
-
const LSN_TIMEOUT_SECONDS = 60;
|
|
155
|
-
const LSN_CREATE_INTERVAL_SECONDS = 1;
|
|
156
|
-
// Create a checkpoint, and open a change stream using startAtOperationTime with the checkpoint's operationTime.
|
|
157
|
-
const firstCheckpointLsn = await createCheckpoint(this.client, this.defaultDb, this.checkpointStreamId);
|
|
158
|
-
const startTime = performance.now();
|
|
159
|
-
let lastCheckpointCreated = performance.now();
|
|
160
|
-
let eventsSeen = 0;
|
|
161
|
-
let batchesSeen = 0;
|
|
162
|
-
const filters = this.getSourceNamespaceFilters();
|
|
163
|
-
const iter = this.rawChangeStreamBatches({
|
|
164
|
-
lsn: firstCheckpointLsn,
|
|
165
|
-
maxAwaitTimeMS: 0,
|
|
166
|
-
signal: this.abort_signal,
|
|
167
|
-
filters
|
|
168
|
-
});
|
|
169
|
-
for await (let { events } of iter) {
|
|
170
|
-
if (performance.now() - startTime >= LSN_TIMEOUT_SECONDS * 1000) {
|
|
171
|
-
break;
|
|
172
|
-
}
|
|
173
|
-
if (performance.now() - lastCheckpointCreated >= LSN_CREATE_INTERVAL_SECONDS * 1000) {
|
|
174
|
-
await createCheckpoint(this.client, this.defaultDb, this.checkpointStreamId);
|
|
175
|
-
lastCheckpointCreated = performance.now();
|
|
176
|
-
}
|
|
177
|
-
batchesSeen += 1;
|
|
178
|
-
for (let rawChangeDocument of events) {
|
|
179
|
-
const changeDocument = parseChangeDocument(rawChangeDocument);
|
|
180
|
-
const ns = 'ns' in changeDocument && 'coll' in changeDocument.ns ? changeDocument.ns : undefined;
|
|
181
|
-
if (ns?.coll == CHECKPOINTS_COLLECTION && 'documentKey' in changeDocument) {
|
|
182
|
-
const checkpointId = changeDocument.documentKey._id;
|
|
183
|
-
if (!this.checkpointStreamId.equals(checkpointId)) {
|
|
184
|
-
continue;
|
|
185
|
-
}
|
|
186
|
-
const { comparable: lsn } = new MongoLSN({
|
|
187
|
-
timestamp: changeDocument.clusterTime,
|
|
188
|
-
resume_token: changeDocument._id
|
|
189
|
-
});
|
|
190
|
-
return lsn;
|
|
191
|
-
}
|
|
192
|
-
eventsSeen += 1;
|
|
193
|
-
}
|
|
194
|
-
}
|
|
195
|
-
// Could happen if there is a very large replication lag?
|
|
196
|
-
throw new ServiceError(ErrorCode.PSYNC_S1301, `Timeout after while waiting for checkpoint document for ${LSN_TIMEOUT_SECONDS}s. Streamed events = ${eventsSeen}, batches = ${batchesSeen}`);
|
|
197
|
-
}
|
|
198
|
-
/**
|
|
199
|
-
* Given a snapshot LSN, validate that we can read from it, by opening a change stream.
|
|
200
|
-
*/
|
|
201
|
-
async validateSnapshotLsn(lsn) {
|
|
202
|
-
const filters = this.getSourceNamespaceFilters();
|
|
203
|
-
const stream = this.rawChangeStreamBatches({
|
|
204
|
-
lsn: lsn,
|
|
205
|
-
// maxAwaitTimeMS should never actually be used here
|
|
206
|
-
maxAwaitTimeMS: 0,
|
|
207
|
-
filters
|
|
208
|
-
});
|
|
209
|
-
for await (let _batch of stream) {
|
|
210
|
-
// We got a response from the aggregate command, so consider the LSN valid.
|
|
211
|
-
// Close the stream immediately.
|
|
212
|
-
break;
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
async initialReplication(snapshotLsn) {
|
|
216
|
-
const sourceTables = this.sync_rules.getSourceTables();
|
|
217
|
-
await this.client.connect();
|
|
218
|
-
const tracer = new PerformanceTracer('MongoDB initial replication');
|
|
219
|
-
const flushResult = await this.storage.startBatch({
|
|
220
|
-
logger: this.logger,
|
|
221
|
-
zeroLSN: MongoLSN.ZERO.comparable,
|
|
222
|
-
defaultSchema: this.defaultDb.databaseName,
|
|
223
|
-
storeCurrentData: false,
|
|
224
|
-
skipExistingRows: true,
|
|
225
|
-
tracer
|
|
226
|
-
}, async (batch) => {
|
|
227
|
-
if (snapshotLsn == null) {
|
|
228
|
-
// First replication attempt - get a snapshot and store the timestamp
|
|
229
|
-
snapshotLsn = await this.getSnapshotLsn();
|
|
230
|
-
await batch.setResumeLsn(snapshotLsn);
|
|
231
|
-
this.logger.info(`Marking snapshot at ${snapshotLsn}`);
|
|
232
|
-
}
|
|
233
|
-
else {
|
|
234
|
-
this.logger.info(`Resuming snapshot at ${snapshotLsn}`);
|
|
235
|
-
// Check that the snapshot is still valid.
|
|
236
|
-
await this.validateSnapshotLsn(snapshotLsn);
|
|
237
|
-
}
|
|
238
|
-
// Start by resolving all tables.
|
|
239
|
-
// This checks postImage configuration, and that should fail as
|
|
240
|
-
// early as possible.
|
|
241
|
-
let allSourceTables = [];
|
|
242
|
-
for (let tablePattern of sourceTables) {
|
|
243
|
-
const tables = await this.resolveQualifiedTableNames(batch, tablePattern);
|
|
244
|
-
allSourceTables.push(...tables);
|
|
245
|
-
}
|
|
246
|
-
let tablesWithStatus = [];
|
|
247
|
-
for (let table of allSourceTables) {
|
|
248
|
-
if (table.snapshotComplete) {
|
|
249
|
-
this.logger.info(`Skipping ${table.qualifiedName} - snapshot already done`);
|
|
250
|
-
continue;
|
|
251
|
-
}
|
|
252
|
-
let count = await this.estimatedCountNumber(table);
|
|
253
|
-
const updated = await batch.updateTableProgress(table, {
|
|
254
|
-
totalEstimatedCount: count
|
|
255
|
-
});
|
|
256
|
-
tablesWithStatus.push(updated);
|
|
257
|
-
this.relationCache.update(updated);
|
|
258
|
-
this.logger.info(`To replicate: ${table.qualifiedName}: ${updated.snapshotStatus?.replicatedCount}/~${updated.snapshotStatus?.totalEstimatedCount}`);
|
|
259
|
-
}
|
|
260
|
-
for (let table of tablesWithStatus) {
|
|
261
|
-
await this.snapshotTable(batch, table);
|
|
262
|
-
await batch.markTableSnapshotDone([table]);
|
|
263
|
-
this.touch();
|
|
264
|
-
}
|
|
265
|
-
// The checkpoint here is a marker - we need to replicate up to at least this
|
|
266
|
-
// point before the data can be considered consistent.
|
|
267
|
-
// We could do this for each individual table, but may as well just do it once for the entire snapshot.
|
|
268
|
-
const checkpoint = await createCheckpoint(this.client, this.defaultDb, STANDALONE_CHECKPOINT_ID);
|
|
269
|
-
await batch.markAllSnapshotDone(checkpoint);
|
|
270
|
-
// This will not create a consistent checkpoint yet, but will persist the op.
|
|
271
|
-
// Actual checkpoint will be created when streaming replication caught up.
|
|
272
|
-
await batch.commit(snapshotLsn);
|
|
273
|
-
this.logger.info(`Snapshot done. Need to replicate from ${snapshotLsn} to ${checkpoint} to be consistent`);
|
|
274
|
-
});
|
|
275
|
-
return { lastOpId: flushResult?.flushed_op };
|
|
276
|
-
}
|
|
277
|
-
async setupCheckpointsCollection() {
|
|
278
|
-
const collection = await this.getCollectionInfo(this.defaultDb.databaseName, CHECKPOINTS_COLLECTION);
|
|
279
|
-
if (collection == null) {
|
|
280
|
-
await this.defaultDb.createCollection(CHECKPOINTS_COLLECTION, {
|
|
281
|
-
changeStreamPreAndPostImages: { enabled: true }
|
|
282
|
-
});
|
|
283
|
-
}
|
|
284
|
-
else if (this.usePostImages && collection.options?.changeStreamPreAndPostImages?.enabled != true) {
|
|
285
|
-
// Drop + create requires less permissions than collMod,
|
|
286
|
-
// and we don't care about the data in this collection.
|
|
287
|
-
await this.defaultDb.dropCollection(CHECKPOINTS_COLLECTION);
|
|
288
|
-
await this.defaultDb.createCollection(CHECKPOINTS_COLLECTION, {
|
|
289
|
-
changeStreamPreAndPostImages: { enabled: true }
|
|
290
|
-
});
|
|
291
|
-
}
|
|
292
|
-
else {
|
|
293
|
-
// Clear the collection on startup, to keep it clean
|
|
294
|
-
// We never query this collection directly, and don't want to keep the data around.
|
|
295
|
-
// We only use this to get data into the oplog/changestream.
|
|
296
|
-
await this.defaultDb.collection(CHECKPOINTS_COLLECTION).deleteMany({});
|
|
297
|
-
}
|
|
298
|
-
}
|
|
299
92
|
getSourceNamespaceFilters() {
|
|
300
93
|
const sourceTables = this.sync_rules.getSourceTables();
|
|
301
94
|
let $inFilters = [
|
|
@@ -343,73 +136,10 @@ export class ChangeStream {
|
|
|
343
136
|
}
|
|
344
137
|
return { $match: nsFilter, multipleDatabases };
|
|
345
138
|
}
|
|
346
|
-
async
|
|
347
|
-
const
|
|
348
|
-
const bytesReplicatedMetric = this.metrics.getCounter(ReplicationMetric.DATA_REPLICATED_BYTES);
|
|
349
|
-
const chunksReplicatedMetric = this.metrics.getCounter(ReplicationMetric.CHUNKS_REPLICATED);
|
|
350
|
-
const totalEstimatedCount = await this.estimatedCountNumber(table);
|
|
351
|
-
let at = table.snapshotStatus?.replicatedCount ?? 0;
|
|
352
|
-
const db = this.client.db(table.schema);
|
|
353
|
-
const collection = db.collection(table.name);
|
|
354
|
-
await using query = new ChunkedSnapshotQuery({
|
|
355
|
-
collection,
|
|
356
|
-
key: table.snapshotStatus?.lastKey,
|
|
357
|
-
batchSize: this.snapshotChunkLength
|
|
358
|
-
});
|
|
359
|
-
if (query.lastKey != null) {
|
|
360
|
-
this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()} - resuming at _id > ${query.lastKey}`);
|
|
361
|
-
}
|
|
362
|
-
else {
|
|
363
|
-
this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()}`);
|
|
364
|
-
}
|
|
365
|
-
let lastBatch = performance.now();
|
|
366
|
-
let nextChunkPromise = query.nextChunk();
|
|
367
|
-
while (true) {
|
|
368
|
-
const { docs: docBatch, lastKey, bytes: chunkBytes } = await nextChunkPromise;
|
|
369
|
-
if (docBatch.length == 0) {
|
|
370
|
-
// No more data - stop iterating
|
|
371
|
-
break;
|
|
372
|
-
}
|
|
373
|
-
bytesReplicatedMetric.add(chunkBytes);
|
|
374
|
-
chunksReplicatedMetric.add(1);
|
|
375
|
-
if (this.abort_signal.aborted) {
|
|
376
|
-
throw new ReplicationAbortedError(`Aborted initial replication`, this.abort_signal.reason);
|
|
377
|
-
}
|
|
378
|
-
// Pre-fetch next batch, so that we can read and write concurrently
|
|
379
|
-
nextChunkPromise = query.nextChunk();
|
|
380
|
-
for (let buffer of docBatch) {
|
|
381
|
-
const { row: record, replicaId: replicaId } = this.rawToSqliteRow(buffer);
|
|
382
|
-
// This auto-flushes when the batch reaches its size limit
|
|
383
|
-
await batch.save({
|
|
384
|
-
tag: SaveOperationTag.INSERT,
|
|
385
|
-
sourceTable: table,
|
|
386
|
-
before: undefined,
|
|
387
|
-
beforeReplicaId: undefined,
|
|
388
|
-
after: record,
|
|
389
|
-
afterReplicaId: replicaId
|
|
390
|
-
});
|
|
391
|
-
}
|
|
392
|
-
// Important: flush before marking progress
|
|
393
|
-
await batch.flush();
|
|
394
|
-
at += docBatch.length;
|
|
395
|
-
rowsReplicatedMetric.add(docBatch.length);
|
|
396
|
-
table = await batch.updateTableProgress(table, {
|
|
397
|
-
lastKey,
|
|
398
|
-
replicatedCount: at,
|
|
399
|
-
totalEstimatedCount: totalEstimatedCount
|
|
400
|
-
});
|
|
401
|
-
this.relationCache.update(table);
|
|
402
|
-
const duration = performance.now() - lastBatch;
|
|
403
|
-
lastBatch = performance.now();
|
|
404
|
-
this.logger.info(`Replicating ${table.qualifiedName} ${table.formatSnapshotProgress()} in ${duration.toFixed(0)}ms`);
|
|
405
|
-
this.touch();
|
|
406
|
-
}
|
|
407
|
-
// In case the loop was interrupted, make sure we await the last promise.
|
|
408
|
-
await nextChunkPromise;
|
|
409
|
-
}
|
|
410
|
-
async getRelation(batch, descriptor, options) {
|
|
411
|
-
const existing = this.relationCache.get(descriptor);
|
|
139
|
+
async getRelations(batch, descriptor, options) {
|
|
140
|
+
const existing = this.relationCache.getAll(descriptor);
|
|
412
141
|
if (existing != null) {
|
|
142
|
+
// We do this even when it's an empty result: Empty means nothing to sync, and we don't need to re-resolve.
|
|
413
143
|
return existing;
|
|
414
144
|
}
|
|
415
145
|
// Note: collection may have been dropped at this point, so we handle
|
|
@@ -452,14 +182,11 @@ export class ChangeStream {
|
|
|
452
182
|
// Ignore the postImages check in this case.
|
|
453
183
|
}
|
|
454
184
|
const snapshot = options.snapshot;
|
|
455
|
-
const result = await
|
|
456
|
-
group_id: this.group_id,
|
|
185
|
+
const result = await batch.resolveTables({
|
|
457
186
|
connection_id: this.connection_id,
|
|
458
|
-
|
|
459
|
-
entity_descriptor: descriptor,
|
|
460
|
-
sync_rules: this.sync_rules
|
|
187
|
+
source: descriptor
|
|
461
188
|
});
|
|
462
|
-
this.relationCache.
|
|
189
|
+
this.relationCache.updateAll(descriptor, result.tables);
|
|
463
190
|
// Drop conflicting collections.
|
|
464
191
|
// This is generally not expected for MongoDB source dbs, so we log an error.
|
|
465
192
|
if (result.dropTables.length > 0) {
|
|
@@ -470,17 +197,12 @@ export class ChangeStream {
|
|
|
470
197
|
// 1. Snapshot is requested (false for initial snapshot, since that process handles it elsewhere)
|
|
471
198
|
// 2. Snapshot is not already done, AND:
|
|
472
199
|
// 3. The table is used in sync config.
|
|
473
|
-
const
|
|
474
|
-
if (
|
|
200
|
+
const snapshotCandidates = result.tables.filter((table) => snapshot && !table.snapshotComplete && table.syncAny);
|
|
201
|
+
if (snapshotCandidates.length > 0) {
|
|
475
202
|
this.logger.info(`New collection: ${descriptor.schema}.${descriptor.name}`);
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
const no_checkpoint_before_lsn = await createCheckpoint(this.client, this.defaultDb, STANDALONE_CHECKPOINT_ID);
|
|
480
|
-
const [table] = await batch.markTableSnapshotDone([result.table], no_checkpoint_before_lsn);
|
|
481
|
-
return table;
|
|
482
|
-
}
|
|
483
|
-
return result.table;
|
|
203
|
+
await this.snapshotter.snapshotTables(batch, snapshotCandidates);
|
|
204
|
+
}
|
|
205
|
+
return result.tables;
|
|
484
206
|
}
|
|
485
207
|
async writeChange(batch, table, change) {
|
|
486
208
|
if (!table.syncAny) {
|
|
@@ -535,34 +257,60 @@ export class ChangeStream {
|
|
|
535
257
|
}
|
|
536
258
|
}
|
|
537
259
|
async replicate() {
|
|
260
|
+
let streamPromise = null;
|
|
261
|
+
let loopPromise = null;
|
|
538
262
|
try {
|
|
539
263
|
// If anything errors here, the entire replication process is halted, and
|
|
540
264
|
// all connections automatically closed, including this one.
|
|
541
|
-
|
|
542
|
-
await this.
|
|
265
|
+
this.initPromise = this.initReplication();
|
|
266
|
+
await this.initPromise;
|
|
267
|
+
loopPromise = this.snapshotter
|
|
268
|
+
.replicationLoop()
|
|
269
|
+
.then(() => {
|
|
270
|
+
throw new ReplicationAssertionError(`Replication snapshotter exited unexpectedly`);
|
|
271
|
+
})
|
|
272
|
+
.catch((e) => {
|
|
273
|
+
this.abortController.abort(e);
|
|
274
|
+
throw e;
|
|
275
|
+
});
|
|
276
|
+
if (!this.snapshotter.supportsConcurrentSnapshots) {
|
|
277
|
+
await Promise.race([this.snapshotter.waitForInitialSnapshot(), loopPromise]);
|
|
278
|
+
}
|
|
279
|
+
streamPromise = this.streamChanges()
|
|
280
|
+
.then(() => {
|
|
281
|
+
throw new ReplicationAssertionError(`Replication stream exited unexpectedly`);
|
|
282
|
+
})
|
|
283
|
+
.catch((e) => {
|
|
284
|
+
this.abortController.abort(e);
|
|
285
|
+
throw e;
|
|
286
|
+
});
|
|
287
|
+
const results = await Promise.allSettled([loopPromise, streamPromise]);
|
|
288
|
+
throw replicationLoopError(results);
|
|
543
289
|
}
|
|
544
290
|
catch (e) {
|
|
545
291
|
await this.storage.reportError(e);
|
|
546
292
|
throw e;
|
|
547
293
|
}
|
|
294
|
+
finally {
|
|
295
|
+
this.abortController.abort();
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
async waitForInitialSnapshot() {
|
|
299
|
+
if (this.initPromise == null) {
|
|
300
|
+
throw new ReplicationAssertionError('replicate() must be called before waitForInitialSnapshot()');
|
|
301
|
+
}
|
|
302
|
+
await this.initPromise;
|
|
303
|
+
await this.snapshotter.waitForInitialSnapshot();
|
|
548
304
|
}
|
|
549
305
|
async initReplication() {
|
|
550
|
-
const result = await this.
|
|
551
|
-
await this.setupCheckpointsCollection();
|
|
306
|
+
const result = await this.snapshotter.checkSlot();
|
|
307
|
+
await this.snapshotter.setupCheckpointsCollection();
|
|
552
308
|
if (result.needsInitialSync) {
|
|
553
309
|
if (result.snapshotLsn == null) {
|
|
554
310
|
// Snapshot LSN is not present, so we need to start replication from scratch.
|
|
555
|
-
await this.storage.clear({ signal: this.
|
|
556
|
-
}
|
|
557
|
-
const { lastOpId } = await this.initialReplication(result.snapshotLsn);
|
|
558
|
-
if (lastOpId != null) {
|
|
559
|
-
// Populate the cache _after_ initial replication, but _before_ we switch to this replication stream.
|
|
560
|
-
await this.storage.populatePersistentChecksumCache({
|
|
561
|
-
signal: this.abort_signal,
|
|
562
|
-
// No checkpoint yet, but we do have the opId.
|
|
563
|
-
maxOpId: lastOpId
|
|
564
|
-
});
|
|
311
|
+
await this.storage.clear({ signal: this.abortSignal });
|
|
565
312
|
}
|
|
313
|
+
await this.snapshotter.queueSnapshotTables(result.snapshotLsn);
|
|
566
314
|
}
|
|
567
315
|
}
|
|
568
316
|
async streamChanges() {
|
|
@@ -649,6 +397,7 @@ export class ChangeStream {
|
|
|
649
397
|
defaultSchema: this.defaultDb.databaseName,
|
|
650
398
|
// We get a complete postimage for every change, so we don't need to store the current data.
|
|
651
399
|
storeCurrentData: false,
|
|
400
|
+
hooks: this.storageHooks,
|
|
652
401
|
tracer
|
|
653
402
|
}, async (batch) => {
|
|
654
403
|
const { resumeFromLsn } = batch;
|
|
@@ -667,7 +416,7 @@ export class ChangeStream {
|
|
|
667
416
|
const batchStream = this.rawChangeStreamBatches({
|
|
668
417
|
lsn: resumeFromLsn,
|
|
669
418
|
filters,
|
|
670
|
-
signal: this.
|
|
419
|
+
signal: this.abortSignal,
|
|
671
420
|
tracer
|
|
672
421
|
});
|
|
673
422
|
// Always start with a checkpoint.
|
|
@@ -683,7 +432,7 @@ export class ChangeStream {
|
|
|
683
432
|
using batchSpan = tracer.span('processing');
|
|
684
433
|
bytesReplicatedMetric.add(eventBatch.byteSize);
|
|
685
434
|
chunksReplicatedMetric.add(1);
|
|
686
|
-
if (this.
|
|
435
|
+
if (this.abortSignal.aborted) {
|
|
687
436
|
break;
|
|
688
437
|
}
|
|
689
438
|
this.touch();
|
|
@@ -712,7 +461,7 @@ export class ChangeStream {
|
|
|
712
461
|
for (let eventIndex = 0; eventIndex < events.length; eventIndex++) {
|
|
713
462
|
const rawChangeDocument = events[eventIndex];
|
|
714
463
|
const originalChangeDocument = parseChangeDocument(rawChangeDocument);
|
|
715
|
-
if (this.
|
|
464
|
+
if (this.abortSignal.aborted) {
|
|
716
465
|
break;
|
|
717
466
|
}
|
|
718
467
|
if (startAfter != null && originalChangeDocument.clusterTime?.lte(startAfter)) {
|
|
@@ -799,12 +548,19 @@ export class ChangeStream {
|
|
|
799
548
|
// change stream events, collapse standalone checkpoints into the normal batch
|
|
800
549
|
// checkpoint flow to avoid commit churn under sustained load.
|
|
801
550
|
const hasBufferedChanges = eventIndex < events.length - 1;
|
|
802
|
-
if (waitForCheckpointLsn
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
551
|
+
if (hasBufferedChanges && waitForCheckpointLsn == null) {
|
|
552
|
+
// Buffered changes - create a new batch checkpoint to rate limit commits
|
|
553
|
+
using _ = tracer.span('source_checkpoint');
|
|
554
|
+
waitForCheckpointLsn = await createCheckpoint(this.client, this.defaultDb, this.checkpointStreamId);
|
|
555
|
+
continue;
|
|
556
|
+
}
|
|
557
|
+
else if (waitForCheckpointLsn != null) {
|
|
558
|
+
// Skip this checkpoint - wait for the batch checkpoint.
|
|
806
559
|
continue;
|
|
807
560
|
}
|
|
561
|
+
else {
|
|
562
|
+
// No buffered changes, and no batch checkpoint pending - commit immediately.
|
|
563
|
+
}
|
|
808
564
|
}
|
|
809
565
|
else if (!this.checkpointStreamId.equals(checkpointId)) {
|
|
810
566
|
continue;
|
|
@@ -836,17 +592,19 @@ export class ChangeStream {
|
|
|
836
592
|
changeDocument.operationType == 'replace' ||
|
|
837
593
|
changeDocument.operationType == 'delete') {
|
|
838
594
|
if (waitForCheckpointLsn == null) {
|
|
595
|
+
using _ = tracer.span('source_checkpoint');
|
|
839
596
|
waitForCheckpointLsn = await createCheckpoint(this.client, this.defaultDb, this.checkpointStreamId);
|
|
840
597
|
}
|
|
841
|
-
const rel = getMongoRelation(changeDocument.ns);
|
|
842
|
-
const
|
|
598
|
+
const rel = getMongoRelation(changeDocument.ns, this.connections.connectionTag);
|
|
599
|
+
const tables = await this.getRelations(batch, rel, {
|
|
843
600
|
// In most cases, we should not need to snapshot this. But if this is the first time we see the collection
|
|
844
601
|
// for whatever reason, then we do need to snapshot it.
|
|
845
602
|
// This may result in some duplicate operations when a collection is created for the first time after
|
|
846
603
|
// sync config was deployed.
|
|
847
604
|
snapshot: true
|
|
848
605
|
});
|
|
849
|
-
|
|
606
|
+
const tablesToReplicate = tables.filter((table) => table.syncAny);
|
|
607
|
+
if (tablesToReplicate.length > 0) {
|
|
850
608
|
this.replicationLag.trackUncommittedChange(changeDocument.clusterTime == null ? null : timestampToDate(changeDocument.clusterTime));
|
|
851
609
|
const transactionKeyValue = transactionKey(changeDocument);
|
|
852
610
|
if (transactionKeyValue == null || lastTxnKey != transactionKeyValue) {
|
|
@@ -856,31 +614,35 @@ export class ChangeStream {
|
|
|
856
614
|
lastTxnKey = transactionKeyValue;
|
|
857
615
|
transactionsReplicatedMetric.add(1);
|
|
858
616
|
}
|
|
859
|
-
|
|
617
|
+
for (const table of tablesToReplicate) {
|
|
618
|
+
await this.writeChange(batch, table, changeDocument);
|
|
619
|
+
}
|
|
860
620
|
}
|
|
861
621
|
}
|
|
862
622
|
else if (changeDocument.operationType == 'drop') {
|
|
863
|
-
const rel = getMongoRelation(changeDocument.ns);
|
|
864
|
-
const
|
|
623
|
+
const rel = getMongoRelation(changeDocument.ns, this.connections.connectionTag);
|
|
624
|
+
const tables = await this.getRelations(batch, rel, {
|
|
865
625
|
// We're "dropping" this collection, so never snapshot it.
|
|
866
626
|
snapshot: false
|
|
867
627
|
});
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
628
|
+
const tablesToDrop = tables.filter((table) => table.syncAny);
|
|
629
|
+
if (tablesToDrop.length > 0) {
|
|
630
|
+
await batch.drop(tablesToDrop);
|
|
871
631
|
}
|
|
632
|
+
this.relationCache.delete(rel);
|
|
872
633
|
}
|
|
873
634
|
else if (changeDocument.operationType == 'rename') {
|
|
874
|
-
const relFrom = getMongoRelation(changeDocument.ns);
|
|
875
|
-
const relTo = getMongoRelation(changeDocument.to);
|
|
876
|
-
const
|
|
635
|
+
const relFrom = getMongoRelation(changeDocument.ns, this.connections.connectionTag);
|
|
636
|
+
const relTo = getMongoRelation(changeDocument.to, this.connections.connectionTag);
|
|
637
|
+
const tablesFrom = await this.getRelations(batch, relFrom, {
|
|
877
638
|
// We're "dropping" this collection, so never snapshot it.
|
|
878
639
|
snapshot: false
|
|
879
640
|
});
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
641
|
+
const tablesToDrop = tablesFrom.filter((table) => table.syncAny);
|
|
642
|
+
if (tablesToDrop.length > 0) {
|
|
643
|
+
await batch.drop(tablesToDrop);
|
|
883
644
|
}
|
|
645
|
+
this.relationCache.delete(relFrom);
|
|
884
646
|
// Here we do need to snapshot the new table
|
|
885
647
|
const collection = await this.getCollectionInfo(relTo.schema, relTo.name);
|
|
886
648
|
await this.handleRelation(batch, relTo, {
|
|
@@ -901,17 +663,18 @@ export class ChangeStream {
|
|
|
901
663
|
await batch.setResumeLsn(lsn);
|
|
902
664
|
}
|
|
903
665
|
batchSpan.end();
|
|
904
|
-
const
|
|
905
|
-
const duration = batchSpan.
|
|
666
|
+
const durationsMicroseconds = outerSpan.end();
|
|
667
|
+
const duration = batchSpan.durationMillis;
|
|
906
668
|
this.logger.info(`Processed batch of ${events.length} changes / ${eventBatch.byteSize} bytes in ${duration}ms`, {
|
|
907
669
|
count: events.length,
|
|
908
670
|
bytes: eventBatch.byteSize,
|
|
909
671
|
duration,
|
|
910
|
-
t:
|
|
672
|
+
t: durationsMicroseconds
|
|
911
673
|
});
|
|
912
674
|
outerSpan = tracer.span('batch');
|
|
913
675
|
}
|
|
914
676
|
});
|
|
677
|
+
throw new ReplicationAbortedError(`Replication stream aborted`, this.abortSignal.reason);
|
|
915
678
|
}
|
|
916
679
|
getReplicationLagMillis() {
|
|
917
680
|
return this.replicationLag.getLagMillis();
|
|
@@ -936,4 +699,26 @@ function transactionKey(doc) {
|
|
|
936
699
|
}
|
|
937
700
|
return `${doc.lsid.id.toString('hex')}:${doc.txnNumber}`;
|
|
938
701
|
}
|
|
702
|
+
/**
|
|
703
|
+
* Prioritize errors that are _not_ ReplicationAbortedError. Any error on either loopPromise or
|
|
704
|
+
* streamPromise aborts the other one, which then results in a ReplicationAbortedError, hiding the
|
|
705
|
+
* original cause.
|
|
706
|
+
*/
|
|
707
|
+
function replicationLoopError(results) {
|
|
708
|
+
// 1. Prioritize not ReplicationAbortedError.
|
|
709
|
+
for (const result of results) {
|
|
710
|
+
if (result.status == 'rejected' && !(result.reason instanceof ReplicationAbortedError)) {
|
|
711
|
+
return result.reason;
|
|
712
|
+
}
|
|
713
|
+
}
|
|
714
|
+
// 2. Fallback to ReplicationAbortedError.
|
|
715
|
+
for (const result of results) {
|
|
716
|
+
if (result.status == 'rejected') {
|
|
717
|
+
// At this point only ReplicationAbortedError remains
|
|
718
|
+
return result.reason;
|
|
719
|
+
}
|
|
720
|
+
}
|
|
721
|
+
// 3. Should never happen, but we cover this case.
|
|
722
|
+
return new ReplicationAssertionError(`Replication loop exited unexpectedly`);
|
|
723
|
+
}
|
|
939
724
|
//# sourceMappingURL=ChangeStream.js.map
|