@powersync/service-module-mongodb-storage 0.13.0 → 0.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -62,6 +62,7 @@ const DEFAULT_CLEAR_BATCH_LIMIT = 5000;
62
62
  const DEFAULT_MOVE_BATCH_LIMIT = 2000;
63
63
  const DEFAULT_MOVE_BATCH_QUERY_LIMIT = 10_000;
64
64
  const DEFAULT_MIN_BUCKET_CHANGES = 10;
65
+ const DEFAULT_MIN_CHANGE_RATIO = 0.1;
65
66
 
66
67
  /** This default is primarily for tests. */
67
68
  const DEFAULT_MEMORY_LIMIT_MB = 64;
@@ -75,6 +76,7 @@ export class MongoCompactor {
75
76
  private moveBatchQueryLimit: number;
76
77
  private clearBatchLimit: number;
77
78
  private minBucketChanges: number;
79
+ private minChangeRatio: number;
78
80
  private maxOpId: bigint;
79
81
  private buckets: string[] | undefined;
80
82
  private signal?: AbortSignal;
@@ -91,6 +93,7 @@ export class MongoCompactor {
91
93
  this.moveBatchQueryLimit = options?.moveBatchQueryLimit ?? DEFAULT_MOVE_BATCH_QUERY_LIMIT;
92
94
  this.clearBatchLimit = options?.clearBatchLimit ?? DEFAULT_CLEAR_BATCH_LIMIT;
93
95
  this.minBucketChanges = options?.minBucketChanges ?? DEFAULT_MIN_BUCKET_CHANGES;
96
+ this.minChangeRatio = options?.minChangeRatio ?? DEFAULT_MIN_CHANGE_RATIO;
94
97
  this.maxOpId = options?.maxOpId ?? 0n;
95
98
  this.buckets = options?.compactBuckets;
96
99
  this.signal = options?.signal;
@@ -115,27 +118,19 @@ export class MongoCompactor {
115
118
  }
116
119
 
117
120
  private async compactDirtyBuckets() {
118
- while (!this.signal?.aborted) {
119
- // Process all buckets with 10 or more changes since last time.
120
- // We exclude the last 100 compacted buckets, to avoid repeatedly re-compacting the same buckets over and over
121
- // if they are modified while compacting.
122
- const TRACK_RECENTLY_COMPACTED_NUMBER = 100;
123
-
124
- let recentlyCompacted: string[] = [];
125
- const buckets = await this.dirtyBucketBatch({
126
- minBucketChanges: this.minBucketChanges,
127
- exclude: recentlyCompacted
128
- });
129
- if (buckets.length == 0) {
130
- // All done
121
+ for await (let buckets of this.dirtyBucketBatches({
122
+ minBucketChanges: this.minBucketChanges,
123
+ minChangeRatio: this.minChangeRatio
124
+ })) {
125
+ if (this.signal?.aborted) {
131
126
  break;
132
127
  }
128
+ if (buckets.length == 0) {
129
+ continue;
130
+ }
131
+
133
132
  for (let { bucket } of buckets) {
134
133
  await this.compactSingleBucket(bucket);
135
- recentlyCompacted.push(bucket);
136
- }
137
- if (recentlyCompacted.length > TRACK_RECENTLY_COMPACTED_NUMBER) {
138
- recentlyCompacted = recentlyCompacted.slice(-TRACK_RECENTLY_COMPACTED_NUMBER);
139
134
  }
140
135
  }
141
136
  }
@@ -491,13 +486,13 @@ export class MongoCompactor {
491
486
  async populateChecksums(options: { minBucketChanges: number }): Promise<PopulateChecksumCacheResults> {
492
487
  let count = 0;
493
488
  while (!this.signal?.aborted) {
494
- const buckets = await this.dirtyBucketBatch(options);
495
- if (buckets.length == 0) {
489
+ const buckets = await this.dirtyBucketBatchForChecksums(options);
490
+ if (buckets.length == 0 || this.signal?.aborted) {
496
491
  // All done
497
492
  break;
498
493
  }
494
+
499
495
  const start = Date.now();
500
- logger.info(`Calculating checksums for batch of ${buckets.length} buckets`);
501
496
 
502
497
  // Filter batch by estimated bucket size, to reduce possibility of timeouts
503
498
  let checkBuckets: typeof buckets = [];
@@ -509,22 +504,97 @@ export class MongoCompactor {
509
504
  break;
510
505
  }
511
506
  }
507
+ logger.info(
508
+ `Calculating checksums for batch of ${buckets.length} buckets, estimated count of ${totalCountEstimate}`
509
+ );
512
510
  await this.updateChecksumsBatch(checkBuckets.map((b) => b.bucket));
513
511
  logger.info(`Updated checksums for batch of ${checkBuckets.length} buckets in ${Date.now() - start}ms`);
514
- count += buckets.length;
512
+ count += checkBuckets.length;
515
513
  }
516
514
  return { buckets: count };
517
515
  }
518
516
 
517
+ /**
518
+ * Return batches of dirty buckets.
519
+ *
520
+ * Can be used to iterate through all buckets.
521
+ *
522
+ * minBucketChanges: minimum number of changes for a bucket to be included in the results.
523
+ * minChangeRatio: minimum ratio of changes to total ops for a bucket to be included in the results, number between 0 and 1.
524
+ */
525
+ private async *dirtyBucketBatches(options: {
526
+ minBucketChanges: number;
527
+ minChangeRatio: number;
528
+ }): AsyncGenerator<{ bucket: string; estimatedCount: number }[]> {
529
+ // Previously, we used an index on {_id.g: 1, estimate_since_compact.count: 1} to find only buckets with changes.
530
+ // This works well if there are only a small number of buckets with changes.
531
+ // However, if buckets are continuously modified while we are compacting, we get the same buckets over and over again.
532
+ // This has caused the compact process to re-read the same collection around 5x times in total, which is very inefficient.
533
+ // To solve this, we now just iterate through all buckets, and filter out the ones with low changes.
534
+
535
+ if (options.minBucketChanges <= 0) {
536
+ throw new ReplicationAssertionError('minBucketChanges must be >= 1');
537
+ }
538
+ let lastId = { g: this.group_id, b: new mongo.MinKey() as any };
539
+ const maxId = { g: this.group_id, b: new mongo.MaxKey() as any };
540
+ while (true) {
541
+ const batch = await this.db.bucket_state
542
+ .find(
543
+ {
544
+ _id: { $gt: lastId, $lt: maxId },
545
+ 'estimate_since_compact.count': { $gte: options.minBucketChanges }
546
+ },
547
+ {
548
+ projection: {
549
+ _id: 1,
550
+ estimate_since_compact: 1,
551
+ compacted_state: 1
552
+ },
553
+ sort: {
554
+ _id: 1
555
+ },
556
+ limit: 2000,
557
+ maxTimeMS: MONGO_OPERATION_TIMEOUT_MS
558
+ }
559
+ )
560
+ .toArray();
561
+ if (batch.length == 0) {
562
+ break;
563
+ }
564
+ lastId = batch[batch.length - 1]._id;
565
+ const mapped = batch.map((b) => {
566
+ const updatedCount = b.estimate_since_compact?.count ?? 0;
567
+ const totalCount = (b.compacted_state?.count ?? 0) + updatedCount;
568
+ const updatedBytes = b.estimate_since_compact?.bytes ?? 0;
569
+ const totalBytes = (b.compacted_state?.bytes ?? 0) + updatedBytes;
570
+ const dirtyChangeNumber = totalCount > 0 ? updatedCount / totalCount : 0;
571
+ const dirtyChangeBytes = totalBytes > 0 ? updatedBytes / totalBytes : 0;
572
+ return {
573
+ bucket: b._id.b,
574
+ estimatedCount: totalCount,
575
+ dirtyRatio: Math.max(dirtyChangeNumber, dirtyChangeBytes)
576
+ };
577
+ });
578
+ const filtered = mapped.filter(
579
+ (b) => b.estimatedCount >= options.minBucketChanges && b.dirtyRatio >= options.minChangeRatio
580
+ );
581
+ yield filtered;
582
+ }
583
+ }
584
+
519
585
  /**
520
586
  * Returns a batch of dirty buckets - buckets with most changes first.
521
587
  *
522
588
  * This cannot be used to iterate on its own - the client is expected to process these buckets and
523
589
  * set estimate_since_compact.count: 0 when done, before fetching the next batch.
590
+ *
591
+ * Unlike dirtyBucketBatches, used for compacting, this is specifically designed to be resumable after a restart,
592
+ * since it is used as the last step for initial replication.
593
+ *
594
+ * We currently don't get new data while doing populateChecksums, so we don't need to worry about buckets changing while processing.
524
595
  */
525
- private async dirtyBucketBatch(options: {
596
+ private async dirtyBucketBatchForChecksums(options: {
526
597
  minBucketChanges: number;
527
- exclude?: string[];
528
598
  }): Promise<{ bucket: string; estimatedCount: number }[]> {
529
599
  if (options.minBucketChanges <= 0) {
530
600
  throw new ReplicationAssertionError('minBucketChanges must be >= 1');
@@ -534,8 +604,7 @@ export class MongoCompactor {
534
604
  .find(
535
605
  {
536
606
  '_id.g': this.group_id,
537
- 'estimate_since_compact.count': { $gte: options.minBucketChanges },
538
- '_id.b': { $nin: options.exclude ?? [] }
607
+ 'estimate_since_compact.count': { $gte: options.minBucketChanges }
539
608
  },
540
609
  {
541
610
  projection: {
@@ -1,4 +1,4 @@
1
- import { SqlSyncRules, HydratedSyncRules, versionedHydrationState } from '@powersync/service-sync-rules';
1
+ import { SyncConfigWithErrors, HydratedSyncRules, versionedHydrationState } from '@powersync/service-sync-rules';
2
2
 
3
3
  import { storage } from '@powersync/service-core';
4
4
 
@@ -7,7 +7,7 @@ export class MongoPersistedSyncRules implements storage.PersistedSyncRules {
7
7
 
8
8
  constructor(
9
9
  public readonly id: number,
10
- public readonly sync_rules: SqlSyncRules,
10
+ public readonly sync_rules: SyncConfigWithErrors,
11
11
  public readonly checkpoint_lsn: string | null,
12
12
  slot_name: string | null
13
13
  ) {
@@ -15,6 +15,6 @@ export class MongoPersistedSyncRules implements storage.PersistedSyncRules {
15
15
  }
16
16
 
17
17
  hydratedSyncRules(): HydratedSyncRules {
18
- return this.sync_rules.hydrate({ hydrationState: versionedHydrationState(this.id) });
18
+ return this.sync_rules.config.hydrate({ hydrationState: versionedHydrationState(this.id) });
19
19
  }
20
20
  }
@@ -15,6 +15,7 @@ import {
15
15
  InternalOpId,
16
16
  internalToExternalOpId,
17
17
  maxLsn,
18
+ mergeAsyncIterables,
18
19
  PopulateChecksumCacheOptions,
19
20
  PopulateChecksumCacheResults,
20
21
  ProtocolOpId,
@@ -694,53 +695,39 @@ export class MongoSyncBucketStorage
694
695
  * Instance-wide watch on the latest available checkpoint (op_id + lsn).
695
696
  */
696
697
  private async *watchActiveCheckpoint(signal: AbortSignal): AsyncIterable<ReplicationCheckpoint> {
697
- const stream = this.checkpointChangesStream(signal);
698
-
699
698
  if (signal.aborted) {
700
699
  return;
701
700
  }
702
701
 
702
+ // If the stream is idle, we wait a max of a minute (CHECKPOINT_TIMEOUT_MS) before we get another checkpoint,
703
+ // to avoid stale checkpoint snapshots. This is what checkpointTimeoutStream() is for.
704
+ // Essentially, even if there are no actual checkpoint changes, we want a new snapshotTime every minute or so,
705
+ // to ensure that any new clients connecting will get a valid snapshotTime.
706
+ const stream = mergeAsyncIterables(
707
+ [this.checkpointChangesStream(signal), this.checkpointTimeoutStream(signal)],
708
+ signal
709
+ );
710
+
703
711
  // We only watch changes to the active sync rules.
704
712
  // If it changes to inactive, we abort and restart with the new sync rules.
705
- try {
706
- while (true) {
707
- // If the stream is idle, we wait a max of a minute (CHECKPOINT_TIMEOUT_MS)
708
- // before we get another checkpoint, to avoid stale checkpoint snapshots.
709
- const timeout = timers
710
- .setTimeout(CHECKPOINT_TIMEOUT_MS, { done: false }, { signal })
711
- .catch(() => ({ done: true }));
712
- try {
713
- const result = await Promise.race([stream.next(), timeout]);
714
- if (result.done) {
715
- break;
716
- }
717
- } catch (e) {
718
- if (e.name == 'AbortError') {
719
- break;
720
- }
721
- throw e;
722
- }
723
-
724
- if (signal.aborted) {
725
- // Would likely have been caught by the signal on the timeout or the upstream stream, but we check here anyway
726
- break;
727
- }
728
-
729
- const op = await this.getCheckpointInternal();
730
- if (op == null) {
731
- // Sync rules have changed - abort and restart.
732
- // We do a soft close of the stream here - no error
733
- break;
734
- }
713
+ for await (const _ of stream) {
714
+ if (signal.aborted) {
715
+ // Would likely have been caught by the signal on the timeout or the upstream stream, but we check here anyway
716
+ break;
717
+ }
735
718
 
736
- // Previously, we only yielded when the checkpoint or lsn changed.
737
- // However, we always want to use the latest snapshotTime, so we skip that filtering here.
738
- // That filtering could be added in the per-user streams if needed, but in general the capped collection
739
- // should already only contain useful changes in most cases.
740
- yield op;
719
+ const op = await this.getCheckpointInternal();
720
+ if (op == null) {
721
+ // Sync rules have changed - abort and restart.
722
+ // We do a soft close of the stream here - no error
723
+ break;
741
724
  }
742
- } finally {
743
- await stream.return(null);
725
+
726
+ // Previously, we only yielded when the checkpoint or lsn changed.
727
+ // However, we always want to use the latest snapshotTime, so we skip that filtering here.
728
+ // That filtering could be added in the per-user streams if needed, but in general the capped collection
729
+ // should already only contain useful changes in most cases.
730
+ yield op;
744
731
  }
745
732
  }
746
733
 
@@ -900,6 +887,24 @@ export class MongoSyncBucketStorage
900
887
  }
901
888
  }
902
889
 
890
+ private async *checkpointTimeoutStream(signal: AbortSignal): AsyncGenerator<void> {
891
+ while (!signal.aborted) {
892
+ try {
893
+ await timers.setTimeout(CHECKPOINT_TIMEOUT_MS, undefined, { signal });
894
+ } catch (e) {
895
+ if (e.name == 'AbortError') {
896
+ // This is how we typically abort this stream, when all listeners are done
897
+ return;
898
+ }
899
+ throw e;
900
+ }
901
+
902
+ if (!signal.aborted) {
903
+ yield;
904
+ }
905
+ }
906
+ }
907
+
903
908
  private async getDataBucketChanges(
904
909
  options: GetCheckpointChangesOptions
905
910
  ): Promise<Pick<CheckpointChanges, 'updatedDataBuckets' | 'invalidateDataBuckets'>> {
@@ -57,7 +57,9 @@ describe('Connection reporting storage', async () => {
57
57
  user_agent: userData.user_week.user_agent
58
58
  });
59
59
 
60
- const connection = await factory.db.connection_report_events.find({ user_id: userData.user_week.user_id }).toArray();
60
+ const connection = await factory.db.connection_report_events
61
+ .find({ user_id: userData.user_week.user_id })
62
+ .toArray();
61
63
  expect(connection).toHaveLength(2);
62
64
  const cleaned = removeVolatileFields(connection);
63
65
  expect(cleaned).toMatchSnapshot();
@@ -111,7 +113,9 @@ describe('Connection reporting storage', async () => {
111
113
  connected_at: userData.user_three.connected_at
112
114
  });
113
115
 
114
- const connection = await factory.db.connection_report_events.find({ user_id: userData.user_three.user_id }).toArray();
116
+ const connection = await factory.db.connection_report_events
117
+ .find({ user_id: userData.user_three.user_id })
118
+ .toArray();
115
119
  expect(connection).toHaveLength(1);
116
120
  expect(new Date(connection[0].disconnected_at!)).toEqual(disconnectAt);
117
121
  const cleaned = removeVolatileFields(connection);
@@ -63,6 +63,7 @@ bucket_definitions:
63
63
  moveBatchLimit: 10,
64
64
  moveBatchQueryLimit: 10,
65
65
  minBucketChanges: 1,
66
+ minChangeRatio: 0,
66
67
  maxOpId: checkpoint,
67
68
  signal: null as any
68
69
  });
@@ -1,17 +1,13 @@
1
1
  {
2
- "extends": "../../../tsconfig.base.json",
2
+ "extends": "../../../tsconfig.tests.json",
3
3
  "compilerOptions": {
4
- "rootDir": "src",
5
4
  "baseUrl": "./",
6
- "noEmit": true,
7
- "esModuleInterop": true,
8
- "skipLibCheck": true,
9
- "sourceMap": true,
10
5
  "paths": {
11
6
  "@/*": ["../../../packages/service-core/src/*"],
12
7
  "@module/*": ["../src/*"],
13
8
  "@core-tests/*": ["../../../packages/service-core/test/src/*"]
14
- }
9
+ },
10
+ "rootDir": "src"
15
11
  },
16
12
  "include": ["src"],
17
13
  "references": [