@powersync/service-module-mongodb-storage 0.15.3 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CHANGELOG.md +54 -0
  2. package/dist/migrations/db/migrations/1688556755264-initial-sync-rules.js +1 -1
  3. package/dist/migrations/db/migrations/1688556755264-initial-sync-rules.js.map +1 -1
  4. package/dist/migrations/db/migrations/1702295701188-sync-rule-state.js +3 -3
  5. package/dist/migrations/db/migrations/1702295701188-sync-rule-state.js.map +1 -1
  6. package/dist/migrations/db/migrations/1770213298299-storage-version.js.map +1 -1
  7. package/dist/storage/MongoBucketStorage.d.ts +5 -3
  8. package/dist/storage/MongoBucketStorage.js +50 -36
  9. package/dist/storage/MongoBucketStorage.js.map +1 -1
  10. package/dist/storage/MongoReportStorage.js.map +1 -1
  11. package/dist/storage/implementation/BucketDefinitionMapping.d.ts +17 -0
  12. package/dist/storage/implementation/BucketDefinitionMapping.js +58 -0
  13. package/dist/storage/implementation/BucketDefinitionMapping.js.map +1 -0
  14. package/dist/storage/implementation/MongoBucketBatch.d.ts +16 -14
  15. package/dist/storage/implementation/MongoBucketBatch.js +80 -115
  16. package/dist/storage/implementation/MongoBucketBatch.js.map +1 -1
  17. package/dist/storage/implementation/MongoBucketBatchShared.d.ts +5 -0
  18. package/dist/storage/implementation/MongoBucketBatchShared.js +8 -0
  19. package/dist/storage/implementation/MongoBucketBatchShared.js.map +1 -0
  20. package/dist/storage/implementation/MongoChecksums.d.ts +28 -17
  21. package/dist/storage/implementation/MongoChecksums.js +13 -72
  22. package/dist/storage/implementation/MongoChecksums.js.map +1 -1
  23. package/dist/storage/implementation/MongoCompactor.d.ts +98 -58
  24. package/dist/storage/implementation/MongoCompactor.js +229 -296
  25. package/dist/storage/implementation/MongoCompactor.js.map +1 -1
  26. package/dist/storage/implementation/MongoParameterCompactor.d.ts +11 -6
  27. package/dist/storage/implementation/MongoParameterCompactor.js +11 -8
  28. package/dist/storage/implementation/MongoParameterCompactor.js.map +1 -1
  29. package/dist/storage/implementation/MongoPersistedSyncRules.d.ts +14 -0
  30. package/dist/storage/implementation/MongoPersistedSyncRules.js +64 -0
  31. package/dist/storage/implementation/MongoPersistedSyncRules.js.map +1 -0
  32. package/dist/storage/implementation/MongoPersistedSyncRulesContent.d.ts +3 -0
  33. package/dist/storage/implementation/MongoPersistedSyncRulesContent.js +9 -0
  34. package/dist/storage/implementation/MongoPersistedSyncRulesContent.js.map +1 -1
  35. package/dist/storage/implementation/MongoStorageProvider.js +1 -1
  36. package/dist/storage/implementation/MongoStorageProvider.js.map +1 -1
  37. package/dist/storage/implementation/MongoSyncBucketStorage.d.ts +49 -30
  38. package/dist/storage/implementation/MongoSyncBucketStorage.js +96 -388
  39. package/dist/storage/implementation/MongoSyncBucketStorage.js.map +1 -1
  40. package/dist/storage/implementation/MongoSyncRulesLock.d.ts +5 -3
  41. package/dist/storage/implementation/MongoSyncRulesLock.js +12 -10
  42. package/dist/storage/implementation/MongoSyncRulesLock.js.map +1 -1
  43. package/dist/storage/implementation/MongoWriteCheckpointAPI.js +1 -1
  44. package/dist/storage/implementation/MongoWriteCheckpointAPI.js.map +1 -1
  45. package/dist/storage/implementation/OperationBatch.js +1 -1
  46. package/dist/storage/implementation/common/BucketDataDoc.d.ts +35 -0
  47. package/dist/storage/implementation/common/BucketDataDoc.js +2 -0
  48. package/dist/storage/implementation/common/BucketDataDoc.js.map +1 -0
  49. package/dist/storage/implementation/common/MongoSyncBucketStorageContext.d.ts +13 -0
  50. package/dist/storage/implementation/common/MongoSyncBucketStorageContext.js +2 -0
  51. package/dist/storage/implementation/common/MongoSyncBucketStorageContext.js.map +1 -0
  52. package/dist/storage/implementation/common/PersistedBatch.d.ts +108 -0
  53. package/dist/storage/implementation/common/PersistedBatch.js +237 -0
  54. package/dist/storage/implementation/common/PersistedBatch.js.map +1 -0
  55. package/dist/storage/implementation/common/SingleBucketStore.d.ts +54 -0
  56. package/dist/storage/implementation/common/SingleBucketStore.js +3 -0
  57. package/dist/storage/implementation/common/SingleBucketStore.js.map +1 -0
  58. package/dist/storage/implementation/common/SourceRecordStore.d.ts +36 -0
  59. package/dist/storage/implementation/common/SourceRecordStore.js +2 -0
  60. package/dist/storage/implementation/common/SourceRecordStore.js.map +1 -0
  61. package/dist/storage/implementation/common/VersionedPowerSyncMongoBase.d.ts +27 -0
  62. package/dist/storage/implementation/common/VersionedPowerSyncMongoBase.js +57 -0
  63. package/dist/storage/implementation/common/VersionedPowerSyncMongoBase.js.map +1 -0
  64. package/dist/storage/implementation/createMongoSyncBucketStorage.d.ts +7 -0
  65. package/dist/storage/implementation/createMongoSyncBucketStorage.js +9 -0
  66. package/dist/storage/implementation/createMongoSyncBucketStorage.js.map +1 -0
  67. package/dist/storage/implementation/db.d.ts +34 -34
  68. package/dist/storage/implementation/db.js +78 -98
  69. package/dist/storage/implementation/db.js.map +1 -1
  70. package/dist/storage/implementation/models.d.ts +63 -34
  71. package/dist/storage/implementation/models.js +21 -2
  72. package/dist/storage/implementation/models.js.map +1 -1
  73. package/dist/storage/implementation/v1/MongoBucketBatchV1.d.ts +13 -0
  74. package/dist/storage/implementation/v1/MongoBucketBatchV1.js +22 -0
  75. package/dist/storage/implementation/v1/MongoBucketBatchV1.js.map +1 -0
  76. package/dist/storage/implementation/v1/MongoChecksumsV1.d.ts +12 -0
  77. package/dist/storage/implementation/v1/MongoChecksumsV1.js +56 -0
  78. package/dist/storage/implementation/v1/MongoChecksumsV1.js.map +1 -0
  79. package/dist/storage/implementation/v1/MongoCompactorV1.d.ts +23 -0
  80. package/dist/storage/implementation/v1/MongoCompactorV1.js +52 -0
  81. package/dist/storage/implementation/v1/MongoCompactorV1.js.map +1 -0
  82. package/dist/storage/implementation/v1/MongoParameterCompactorV1.d.ts +9 -0
  83. package/dist/storage/implementation/v1/MongoParameterCompactorV1.js +20 -0
  84. package/dist/storage/implementation/v1/MongoParameterCompactorV1.js.map +1 -0
  85. package/dist/storage/implementation/v1/MongoSyncBucketStorageV1.d.ts +41 -0
  86. package/dist/storage/implementation/v1/MongoSyncBucketStorageV1.js +283 -0
  87. package/dist/storage/implementation/v1/MongoSyncBucketStorageV1.js.map +1 -0
  88. package/dist/storage/implementation/v1/PersistedBatchV1.d.ts +26 -0
  89. package/dist/storage/implementation/v1/PersistedBatchV1.js +183 -0
  90. package/dist/storage/implementation/v1/PersistedBatchV1.js.map +1 -0
  91. package/dist/storage/implementation/v1/SingleBucketStoreV1.d.ts +18 -0
  92. package/dist/storage/implementation/v1/SingleBucketStoreV1.js +57 -0
  93. package/dist/storage/implementation/v1/SingleBucketStoreV1.js.map +1 -0
  94. package/dist/storage/implementation/v1/SourceRecordStoreV1.d.ts +19 -0
  95. package/dist/storage/implementation/v1/SourceRecordStoreV1.js +105 -0
  96. package/dist/storage/implementation/v1/SourceRecordStoreV1.js.map +1 -0
  97. package/dist/storage/implementation/v1/VersionedPowerSyncMongoV1.d.ts +12 -0
  98. package/dist/storage/implementation/v1/VersionedPowerSyncMongoV1.js +20 -0
  99. package/dist/storage/implementation/v1/VersionedPowerSyncMongoV1.js.map +1 -0
  100. package/dist/storage/implementation/v1/models.d.ts +34 -0
  101. package/dist/storage/implementation/v1/models.js +37 -0
  102. package/dist/storage/implementation/v1/models.js.map +1 -0
  103. package/dist/storage/implementation/v3/MongoBucketBatchV3.d.ts +13 -0
  104. package/dist/storage/implementation/v3/MongoBucketBatchV3.js +34 -0
  105. package/dist/storage/implementation/v3/MongoBucketBatchV3.js.map +1 -0
  106. package/dist/storage/implementation/v3/MongoChecksumsV3.d.ts +15 -0
  107. package/dist/storage/implementation/v3/MongoChecksumsV3.js +84 -0
  108. package/dist/storage/implementation/v3/MongoChecksumsV3.js.map +1 -0
  109. package/dist/storage/implementation/v3/MongoCompactorV3.d.ts +23 -0
  110. package/dist/storage/implementation/v3/MongoCompactorV3.js +68 -0
  111. package/dist/storage/implementation/v3/MongoCompactorV3.js.map +1 -0
  112. package/dist/storage/implementation/v3/MongoParameterCompactorV3.d.ts +9 -0
  113. package/dist/storage/implementation/v3/MongoParameterCompactorV3.js +18 -0
  114. package/dist/storage/implementation/v3/MongoParameterCompactorV3.js.map +1 -0
  115. package/dist/storage/implementation/v3/MongoParameterLookupV3.d.ts +5 -0
  116. package/dist/storage/implementation/v3/MongoParameterLookupV3.js +9 -0
  117. package/dist/storage/implementation/v3/MongoParameterLookupV3.js.map +1 -0
  118. package/dist/storage/implementation/v3/MongoSyncBucketStorageV3.d.ts +41 -0
  119. package/dist/storage/implementation/v3/MongoSyncBucketStorageV3.js +407 -0
  120. package/dist/storage/implementation/v3/MongoSyncBucketStorageV3.js.map +1 -0
  121. package/dist/storage/implementation/v3/PersistedBatchV3.d.ts +29 -0
  122. package/dist/storage/implementation/v3/PersistedBatchV3.js +259 -0
  123. package/dist/storage/implementation/v3/PersistedBatchV3.js.map +1 -0
  124. package/dist/storage/implementation/v3/SingleBucketStoreV3.d.ts +18 -0
  125. package/dist/storage/implementation/v3/SingleBucketStoreV3.js +48 -0
  126. package/dist/storage/implementation/v3/SingleBucketStoreV3.js.map +1 -0
  127. package/dist/storage/implementation/v3/SourceRecordStoreV3.d.ts +22 -0
  128. package/dist/storage/implementation/v3/SourceRecordStoreV3.js +164 -0
  129. package/dist/storage/implementation/v3/SourceRecordStoreV3.js.map +1 -0
  130. package/dist/storage/implementation/v3/VersionedPowerSyncMongoV3.d.ts +21 -0
  131. package/dist/storage/implementation/v3/VersionedPowerSyncMongoV3.js +71 -0
  132. package/dist/storage/implementation/v3/VersionedPowerSyncMongoV3.js.map +1 -0
  133. package/dist/storage/implementation/v3/models.d.ts +43 -0
  134. package/dist/storage/implementation/v3/models.js +34 -0
  135. package/dist/storage/implementation/v3/models.js.map +1 -0
  136. package/dist/storage/storage-index.d.ts +8 -5
  137. package/dist/storage/storage-index.js +8 -5
  138. package/dist/storage/storage-index.js.map +1 -1
  139. package/dist/utils/util.d.ts +11 -4
  140. package/dist/utils/util.js +25 -4
  141. package/dist/utils/util.js.map +1 -1
  142. package/package.json +9 -9
  143. package/src/migrations/db/migrations/1688556755264-initial-sync-rules.ts +1 -1
  144. package/src/migrations/db/migrations/1702295701188-sync-rule-state.ts +7 -7
  145. package/src/migrations/db/migrations/1770213298299-storage-version.ts +1 -1
  146. package/src/storage/MongoBucketStorage.ts +97 -62
  147. package/src/storage/MongoReportStorage.ts +2 -2
  148. package/src/storage/implementation/BucketDefinitionMapping.ts +72 -0
  149. package/src/storage/implementation/MongoBucketBatch.ts +110 -144
  150. package/src/storage/implementation/MongoBucketBatchShared.ts +11 -0
  151. package/src/storage/implementation/MongoChecksums.ts +53 -76
  152. package/src/storage/implementation/MongoCompactor.ts +374 -404
  153. package/src/storage/implementation/MongoParameterCompactor.ts +37 -24
  154. package/src/storage/implementation/MongoPersistedSyncRules.ts +76 -0
  155. package/src/storage/implementation/MongoPersistedSyncRulesContent.ts +18 -1
  156. package/src/storage/implementation/MongoStorageProvider.ts +1 -1
  157. package/src/storage/implementation/MongoSyncBucketStorage.ts +190 -457
  158. package/src/storage/implementation/MongoSyncRulesLock.ts +12 -14
  159. package/src/storage/implementation/MongoWriteCheckpointAPI.ts +4 -2
  160. package/src/storage/implementation/OperationBatch.ts +1 -1
  161. package/src/storage/implementation/common/BucketDataDoc.ts +37 -0
  162. package/src/storage/implementation/common/MongoSyncBucketStorageContext.ts +15 -0
  163. package/src/storage/implementation/common/PersistedBatch.ts +364 -0
  164. package/src/storage/implementation/common/SingleBucketStore.ts +63 -0
  165. package/src/storage/implementation/common/SourceRecordStore.ts +49 -0
  166. package/src/storage/implementation/common/VersionedPowerSyncMongoBase.ts +80 -0
  167. package/src/storage/implementation/createMongoSyncBucketStorage.ts +25 -0
  168. package/src/storage/implementation/db.ts +107 -128
  169. package/src/storage/implementation/models.ts +84 -38
  170. package/src/storage/implementation/v1/MongoBucketBatchV1.ts +32 -0
  171. package/src/storage/implementation/v1/MongoChecksumsV1.ts +75 -0
  172. package/src/storage/implementation/v1/MongoCompactorV1.ts +93 -0
  173. package/src/storage/implementation/v1/MongoParameterCompactorV1.ts +26 -0
  174. package/src/storage/implementation/v1/MongoSyncBucketStorageV1.ts +448 -0
  175. package/src/storage/implementation/v1/PersistedBatchV1.ts +230 -0
  176. package/src/storage/implementation/v1/SingleBucketStoreV1.ts +74 -0
  177. package/src/storage/implementation/v1/SourceRecordStoreV1.ts +156 -0
  178. package/src/storage/implementation/v1/VersionedPowerSyncMongoV1.ts +28 -0
  179. package/src/storage/implementation/v1/models.ts +84 -0
  180. package/src/storage/implementation/v3/MongoBucketBatchV3.ts +44 -0
  181. package/src/storage/implementation/v3/MongoChecksumsV3.ts +120 -0
  182. package/src/storage/implementation/v3/MongoCompactorV3.ts +107 -0
  183. package/src/storage/implementation/v3/MongoParameterCompactorV3.ts +24 -0
  184. package/src/storage/implementation/v3/MongoParameterLookupV3.ts +12 -0
  185. package/src/storage/implementation/v3/MongoSyncBucketStorageV3.ts +550 -0
  186. package/src/storage/implementation/v3/PersistedBatchV3.ts +318 -0
  187. package/src/storage/implementation/v3/SingleBucketStoreV3.ts +68 -0
  188. package/src/storage/implementation/v3/SourceRecordStoreV3.ts +226 -0
  189. package/src/storage/implementation/v3/VersionedPowerSyncMongoV3.ts +112 -0
  190. package/src/storage/implementation/v3/models.ts +96 -0
  191. package/src/storage/storage-index.ts +8 -5
  192. package/src/utils/util.ts +36 -7
  193. package/test/src/__snapshots__/storage_sync.test.ts.snap +282 -0
  194. package/test/src/connection-report-storage.test.ts +3 -3
  195. package/test/src/setup.ts +1 -1
  196. package/test/src/storage.test.ts +2 -2
  197. package/test/src/storage_compacting.test.ts +57 -29
  198. package/test/src/storage_sync.test.ts +351 -5
  199. package/test/tsconfig.json +0 -1
  200. package/tsconfig.tsbuildinfo +1 -1
  201. package/dist/storage/implementation/PersistedBatch.d.ts +0 -71
  202. package/dist/storage/implementation/PersistedBatch.js +0 -354
  203. package/dist/storage/implementation/PersistedBatch.js.map +0 -1
  204. package/src/storage/implementation/PersistedBatch.ts +0 -432
@@ -1,5 +1,10 @@
1
1
  import { isMongoServerError, mongo, MONGO_OPERATION_TIMEOUT_MS } from '@powersync/lib-service-mongodb';
2
- import { logger, ReplicationAssertionError, ServiceAssertionError } from '@powersync/lib-services-framework';
2
+ import {
3
+ logger as defaultLogger,
4
+ Logger,
5
+ ReplicationAssertionError,
6
+ ServiceAssertionError
7
+ } from '@powersync/lib-services-framework';
3
8
  import {
4
9
  addChecksums,
5
10
  InternalOpId,
@@ -9,15 +14,18 @@ import {
9
14
  utils
10
15
  } from '@powersync/service-core';
11
16
 
12
- import { VersionedPowerSyncMongo } from './db.js';
13
- import { BucketDataDocument, BucketDataKey, BucketStateDocument } from './models.js';
14
- import { MongoSyncBucketStorage } from './MongoSyncBucketStorage.js';
17
+ import { BucketDefinitionId } from './BucketDefinitionMapping.js';
18
+ import { BucketDataDoc, BucketKey } from './common/BucketDataDoc.js';
19
+ import { BucketDataDocumentGeneric, SingleBucketStore } from './common/SingleBucketStore.js';
20
+ import type { VersionedPowerSyncMongo } from './db.js';
21
+ import { BucketStateDocumentBase } from './models.js';
22
+ import type { MongoSyncBucketStorage } from './MongoSyncBucketStorage.js';
15
23
  import { cacheKey } from './OperationBatch.js';
16
24
 
17
25
  interface CurrentBucketState {
18
26
  /** Bucket name */
19
27
  bucket: string;
20
-
28
+ definitionId: BucketDefinitionId;
21
29
  /**
22
30
  * Rows seen in the bucket, with the last op_id of each.
23
31
  */
@@ -26,36 +34,30 @@ interface CurrentBucketState {
26
34
  * Estimated memory usage of the seen Map.
27
35
  */
28
36
  trackingSize: number;
29
-
30
37
  /**
31
38
  * Last (lowest) seen op_id that is not a PUT.
32
39
  */
33
40
  lastNotPut: InternalOpId | null;
34
-
35
41
  /**
36
42
  * Number of REMOVE/MOVE operations seen since lastNotPut.
37
43
  */
38
44
  opsSincePut: number;
39
-
40
45
  /**
41
- * Incrementally-updated checksum, up to maxOpId
46
+ * Incrementally-updated checksum, up to maxOpId.
42
47
  */
43
48
  checksum: number;
44
-
45
49
  /**
46
- * op count for the checksum
50
+ * Op count for the checksum.
47
51
  */
48
52
  opCount: number;
49
-
50
53
  /**
51
54
  * Byte size of ops covered by the checksum.
52
55
  */
53
56
  opBytes: number;
54
57
  }
55
58
 
56
- /**
57
- * Additional options, primarily for testing.
58
- */
59
+ type CompactClearProperties = 'op' | 'checksum' | 'target_op';
60
+
59
61
  export interface MongoCompactOptions extends storage.CompactOptions {}
60
62
 
61
63
  const DEFAULT_CLEAR_BATCH_LIMIT = 5000;
@@ -64,28 +66,36 @@ const DEFAULT_MOVE_BATCH_QUERY_LIMIT = 10_000;
64
66
  const DEFAULT_MIN_BUCKET_CHANGES = 10;
65
67
  const DEFAULT_MIN_CHANGE_RATIO = 0.1;
66
68
  const DIRTY_BUCKET_SCAN_BATCH_SIZE = 2_000;
67
-
68
69
  /** This default is primarily for tests. */
69
70
  const DEFAULT_MEMORY_LIMIT_MB = 64;
70
71
 
71
- export class MongoCompactor {
72
- private updates: mongo.AnyBulkWriteOperation<BucketDataDocument>[] = [];
73
- private bucketStateUpdates: mongo.AnyBulkWriteOperation<BucketStateDocument>[] = [];
74
-
75
- private idLimitBytes: number;
76
- private moveBatchLimit: number;
77
- private moveBatchQueryLimit: number;
78
- private clearBatchLimit: number;
79
- private minBucketChanges: number;
80
- private minChangeRatio: number;
81
- private maxOpId: bigint;
82
- private buckets: string[] | undefined;
83
- private signal?: AbortSignal;
84
- private group_id: number;
72
+ export interface DirtyBucket {
73
+ bucket: string;
74
+ definitionId: BucketDefinitionId | null;
75
+ estimatedCount: number;
76
+ dirtyRatio?: number;
77
+ }
78
+
79
+ export abstract class MongoCompactor {
80
+ protected updates: mongo.AnyBulkWriteOperation<BucketDataDocumentGeneric>[] = [];
81
+ protected bucketStateUpdates: mongo.AnyBulkWriteOperation<BucketStateDocumentBase>[] = [];
82
+
83
+ protected readonly idLimitBytes: number;
84
+ protected readonly moveBatchLimit: number;
85
+ protected readonly moveBatchQueryLimit: number;
86
+ protected readonly clearBatchLimit: number;
87
+ protected readonly minBucketChanges: number;
88
+ protected readonly minChangeRatio: number;
89
+ protected readonly maxOpId: bigint;
90
+ protected readonly buckets: string[] | undefined;
91
+ protected readonly signal?: AbortSignal;
92
+ protected readonly group_id: number;
93
+
94
+ protected readonly logger: Logger;
85
95
 
86
96
  constructor(
87
- private storage: MongoSyncBucketStorage,
88
- private db: VersionedPowerSyncMongo,
97
+ protected readonly storage: MongoSyncBucketStorage,
98
+ protected readonly db: VersionedPowerSyncMongo,
89
99
  options: MongoCompactOptions
90
100
  ) {
91
101
  this.group_id = storage.group_id;
@@ -98,6 +108,7 @@ export class MongoCompactor {
98
108
  this.maxOpId = options.maxOpId ?? 0n;
99
109
  this.buckets = options.compactBuckets;
100
110
  this.signal = options.signal;
111
+ this.logger = options.logger ?? defaultLogger;
101
112
  }
102
113
 
103
114
  /**
@@ -107,9 +118,8 @@ export class MongoCompactor {
107
118
  */
108
119
  async compact() {
109
120
  if (this.buckets) {
110
- for (let bucket of this.buckets) {
111
- // We can make this more efficient later on by iterating
112
- // through the buckets in a single query.
121
+ for (const bucket of this.buckets) {
122
+ // We can make this more efficient later on by iterating through the buckets in a single query.
113
123
  // That makes batching more tricky, so we leave for later.
114
124
  await this.compactSingleBucketRetried(bucket);
115
125
  }
@@ -118,8 +128,161 @@ export class MongoCompactor {
118
128
  }
119
129
  }
120
130
 
121
- private async compactDirtyBuckets() {
122
- for await (let buckets of this.dirtyBucketBatches({
131
+ /**
132
+ * Subset of compact, only populating checksums where relevant.
133
+ */
134
+ async populateChecksums(options: { minBucketChanges: number }): Promise<PopulateChecksumCacheResults> {
135
+ let count = 0;
136
+ while (true) {
137
+ this.signal?.throwIfAborted();
138
+ const buckets = await this.dirtyBucketBatchForChecksums(options);
139
+ if (buckets.length == 0) {
140
+ break;
141
+ }
142
+ this.signal?.throwIfAborted();
143
+
144
+ const start = Date.now();
145
+ // Filter batch by estimated bucket size, to reduce possibility of timeouts.
146
+ const checkBuckets: typeof buckets = [];
147
+ let totalCountEstimate = 0;
148
+ for (const bucket of buckets) {
149
+ checkBuckets.push(bucket);
150
+ totalCountEstimate += bucket.estimatedCount;
151
+ if (totalCountEstimate > 50_000) {
152
+ break;
153
+ }
154
+ }
155
+ this.logger.info(
156
+ `Calculating checksums for batch of ${buckets.length} buckets, estimated count of ${totalCountEstimate}`
157
+ );
158
+ await this.updateChecksumsBatch(checkBuckets);
159
+ this.logger.info(`Updated checksums for batch of ${checkBuckets.length} buckets in ${Date.now() - start}ms`);
160
+ count += checkBuckets.length;
161
+ }
162
+ return { buckets: count };
163
+ }
164
+
165
+ protected async *dirtyBucketBatchesForCollection<TCollectionBucketState extends BucketStateDocumentBase>(
166
+ collection: mongo.Collection<TCollectionBucketState>,
167
+ lastId: TCollectionBucketState['_id'],
168
+ maxId: TCollectionBucketState['_id'],
169
+ options: {
170
+ minBucketChanges: number;
171
+ minChangeRatio: number;
172
+ },
173
+ getDefinitionId: (state: TCollectionBucketState) => BucketDefinitionId | null
174
+ ): AsyncGenerator<DirtyBucket[]> {
175
+ while (true) {
176
+ // To avoid timeouts from too many buckets not meeting the minBucketChanges criteria, use an aggregation pipeline
177
+ // to scan a fixed batch of buckets at a time, but only return buckets that meet the criteria.
178
+ const [result] = await collection
179
+ .aggregate<{
180
+ buckets: TCollectionBucketState[];
181
+ cursor: Pick<TCollectionBucketState, '_id'>[];
182
+ }>(
183
+ [
184
+ {
185
+ $match: {
186
+ _id: { $gt: lastId, $lt: maxId }
187
+ }
188
+ },
189
+ {
190
+ $sort: { _id: 1 }
191
+ },
192
+ {
193
+ // Scan a fixed number of docs each query so sparse matches don't block progress.
194
+ $limit: DIRTY_BUCKET_SCAN_BATCH_SIZE
195
+ },
196
+ {
197
+ $facet: {
198
+ buckets: [
199
+ {
200
+ $match: {
201
+ 'estimate_since_compact.count': { $gte: options.minBucketChanges }
202
+ }
203
+ },
204
+ {
205
+ $project: {
206
+ _id: 1,
207
+ estimate_since_compact: 1,
208
+ compacted_state: 1
209
+ }
210
+ }
211
+ ],
212
+ // This is used for the next query.
213
+ cursor: [{ $sort: { _id: -1 } }, { $limit: 1 }, { $project: { _id: 1 } }]
214
+ }
215
+ }
216
+ ],
217
+ { maxTimeMS: MONGO_OPERATION_TIMEOUT_MS }
218
+ )
219
+ .toArray();
220
+
221
+ const cursor = result?.cursor?.[0];
222
+ if (cursor == null) {
223
+ break;
224
+ }
225
+ lastId = cursor._id;
226
+
227
+ const mapped = (result?.buckets ?? []).map((bucketState) => {
228
+ // The numbers, specifically the bytes, could be a bigint. Convert to Number to allow calculating ratios.
229
+ // BigInt precision is not needed here since this is only an estimate.
230
+ const updatedCount = bucketState.estimate_since_compact?.count ?? 0;
231
+ const totalCount = (bucketState.compacted_state?.count ?? 0) + updatedCount;
232
+ const updatedBytes = Number(bucketState.estimate_since_compact?.bytes ?? 0);
233
+ const totalBytes = Number(bucketState.compacted_state?.bytes ?? 0) + updatedBytes;
234
+ const dirtyChangeNumber = totalCount > 0 ? updatedCount / totalCount : 0;
235
+ const dirtyChangeBytes = totalBytes > 0 ? updatedBytes / totalBytes : 0;
236
+ return {
237
+ bucket: bucketState._id.b,
238
+ definitionId: getDefinitionId(bucketState),
239
+ estimatedCount: totalCount,
240
+ dirtyRatio: Math.max(dirtyChangeNumber, dirtyChangeBytes)
241
+ };
242
+ });
243
+
244
+ yield mapped.filter(
245
+ (bucket) => bucket.estimatedCount >= options.minBucketChanges && bucket.dirtyRatio >= options.minChangeRatio
246
+ );
247
+ }
248
+ }
249
+
250
+ protected async dirtyBucketBatchForChecksumsForCollection<TBucketState extends BucketStateDocumentBase>(
251
+ collection: mongo.Collection<TBucketState>,
252
+ filter: mongo.Filter<TBucketState>,
253
+ getDefinitionId: (state: mongo.WithId<TBucketState>) => BucketDefinitionId | null
254
+ ): Promise<DirtyBucket[]> {
255
+ const dirtyBuckets = await collection
256
+ .find(filter, {
257
+ projection: {
258
+ _id: 1,
259
+ estimate_since_compact: 1,
260
+ compacted_state: 1
261
+ },
262
+ sort: {
263
+ 'estimate_since_compact.count': -1
264
+ },
265
+ limit: 200,
266
+ maxTimeMS: MONGO_OPERATION_TIMEOUT_MS
267
+ })
268
+ .toArray();
269
+
270
+ return dirtyBuckets.map((bucket) => ({
271
+ bucket: bucket._id.b,
272
+ definitionId: getDefinitionId(bucket),
273
+ estimatedCount: Number(bucket.estimate_since_compact!.count) + Number(bucket.compacted_state?.count ?? 0)
274
+ }));
275
+ }
276
+
277
+ public abstract dirtyBucketBatches(options: {
278
+ minBucketChanges: number;
279
+ minChangeRatio: number;
280
+ }): AsyncGenerator<DirtyBucket[]>;
281
+
282
+ public abstract dirtyBucketBatchForChecksums(options: { minBucketChanges: number }): Promise<DirtyBucket[]>;
283
+
284
+ protected async compactDirtyBuckets() {
285
+ for await (const buckets of this.dirtyBucketBatches({
123
286
  minBucketChanges: this.minBucketChanges,
124
287
  minChangeRatio: this.minChangeRatio
125
288
  })) {
@@ -128,8 +291,8 @@ export class MongoCompactor {
128
291
  continue;
129
292
  }
130
293
 
131
- for (let { bucket } of buckets) {
132
- await this.compactSingleBucketRetried(bucket);
294
+ for (const { bucket, definitionId } of buckets) {
295
+ await this.compactSingleBucketRetried(bucket, definitionId);
133
296
  }
134
297
  }
135
298
  }
@@ -139,15 +302,15 @@ export class MongoCompactor {
139
302
  *
140
303
  * This covers against occasional network or other database errors during a long compact job.
141
304
  */
142
- private async compactSingleBucketRetried(bucket: string) {
305
+ protected async compactSingleBucketRetried(bucket: string, definitionId: BucketDefinitionId | null = null) {
143
306
  let retryCount = 0;
144
307
  while (true) {
145
308
  try {
146
- await this.compactSingleBucket(bucket);
309
+ await this.compactSingleBucket(bucket, definitionId);
147
310
  break;
148
311
  } catch (e) {
149
312
  if (retryCount < 3 && isMongoServerError(e)) {
150
- logger.warn(`Error compacting bucket ${bucket}, retrying...`, e);
313
+ this.logger.warn(`Error compacting bucket ${bucket}, retrying...`, e);
151
314
  retryCount++;
152
315
  await new Promise((resolve) => setTimeout(resolve, 1000 * retryCount));
153
316
  } else {
@@ -157,64 +320,64 @@ export class MongoCompactor {
157
320
  }
158
321
  }
159
322
 
160
- private async compactSingleBucket(bucket: string) {
323
+ protected async compactSingleBucket(bucket: string, definitionId: BucketDefinitionId | null = null) {
161
324
  const idLimitBytes = this.idLimitBytes;
162
-
163
- let currentState: CurrentBucketState = {
325
+ const bucketContext = await this.getBucketDataContext(bucket, definitionId);
326
+ if (bucketContext == null) {
327
+ return;
328
+ }
329
+ const currentState: CurrentBucketState = {
164
330
  bucket,
331
+ definitionId: bucketContext.key.definitionId,
165
332
  seen: new Map(),
166
333
  trackingSize: 0,
167
334
  lastNotPut: null,
168
335
  opsSincePut: 0,
169
-
170
336
  checksum: 0,
171
337
  opCount: 0,
172
338
  opBytes: 0
173
339
  };
174
340
 
175
- // Constant lower bound
176
- const lowerBound: BucketDataKey = {
177
- g: this.group_id,
178
- b: bucket,
179
- o: new mongo.MinKey() as any
180
- };
181
-
182
- // Upper bound is adjusted for each batch
183
- let upperBound: BucketDataKey = {
184
- g: this.group_id,
185
- b: bucket,
186
- o: new mongo.MaxKey() as any
187
- };
341
+ // Constant lower bound.
342
+ const lowerBound = bucketContext.minId;
343
+ // Upper bound is adjusted for each batch.
344
+ let upperBound = bucketContext.maxId;
188
345
 
189
346
  while (true) {
190
347
  this.signal?.throwIfAborted();
191
348
 
192
- // Query one batch at a time, to avoid cursor timeouts
193
- const cursor = this.db.bucket_data.aggregate<BucketDataDocument & { size: number | bigint }>(
194
- [
195
- {
196
- $match: {
197
- _id: {
198
- $gte: lowerBound,
199
- $lt: upperBound
200
- }
201
- }
202
- },
203
- { $sort: { _id: -1 } },
204
- { $limit: this.moveBatchQueryLimit },
205
- {
206
- $project: {
207
- _id: 1,
208
- op: 1,
209
- table: 1,
210
- row_id: 1,
211
- source_table: 1,
212
- source_key: 1,
213
- checksum: 1,
214
- size: { $bsonSize: '$$ROOT' }
215
- }
349
+ // Query one batch at a time, to avoid cursor timeouts.
350
+ const pipeline = [
351
+ {
352
+ $match: {
353
+ _id: {
354
+ $gte: lowerBound,
355
+ $lt: upperBound
356
+ },
357
+ // Workaround for a clustered collection bug where the $lt operator may include upperBound.
358
+ // Technically only needed for storage V3.
359
+ // https://jira.mongodb.org/browse/SERVER-121822
360
+ '_id.o': { $lt: upperBound.o }
216
361
  }
217
- ],
362
+ },
363
+ { $sort: { _id: -1 } },
364
+ { $limit: this.moveBatchQueryLimit },
365
+ {
366
+ $project: {
367
+ _id: 1,
368
+ op: 1,
369
+ table: 1,
370
+ row_id: 1,
371
+ source_table: 1,
372
+ source_key: 1,
373
+ checksum: 1,
374
+ size: { $bsonSize: '$$ROOT' }
375
+ }
376
+ }
377
+ ];
378
+
379
+ const cursor = bucketContext.collection.aggregate<BucketDataDocumentGeneric & { size: number | bigint }>(
380
+ pipeline,
218
381
  {
219
382
  // batchSize is 1 more than limit to auto-close the cursor.
220
383
  // See https://github.com/mongodb/node-mongodb-native/pull/4580
@@ -223,18 +386,25 @@ export class MongoCompactor {
223
386
  );
224
387
  // We don't limit to a single batch here, since that often causes MongoDB to scan through more than it returns.
225
388
  // Instead, we load up to the limit.
226
- const batch = await cursor.toArray();
389
+ const rawBatch = await cursor.toArray();
390
+ const batch = rawBatch.map((document) => {
391
+ const { size, ...rest } = document;
392
+ return {
393
+ doc: bucketContext.fromPersistedDocument(rest),
394
+ size
395
+ };
396
+ });
227
397
 
228
398
  if (batch.length == 0) {
229
- // We've reached the end
399
+ // We've reached the end.
230
400
  break;
231
401
  }
232
402
 
233
- // Set upperBound for the next batch
234
- upperBound = batch[batch.length - 1]._id;
403
+ // Reuse the exact collection _id value from Mongo for the next bound.
404
+ upperBound = rawBatch[rawBatch.length - 1]._id;
235
405
 
236
- for (let doc of batch) {
237
- if (doc._id.o > this.maxOpId) {
406
+ for (const { doc, size } of batch) {
407
+ if (doc.o > this.maxOpId) {
238
408
  continue;
239
409
  }
240
410
 
@@ -243,19 +413,17 @@ export class MongoCompactor {
243
413
 
244
414
  let isPersistentPut = doc.op == 'PUT';
245
415
 
246
- currentState.opBytes += Number(doc.size);
416
+ currentState.opBytes += Number(size);
247
417
  if (doc.op == 'REMOVE' || doc.op == 'PUT') {
248
418
  const key = `${doc.table}/${doc.row_id}/${cacheKey(doc.source_table!, doc.source_key!)}`;
249
419
  const targetOp = currentState.seen.get(key);
250
420
  if (targetOp) {
251
- // Will convert to MOVE, so don't count as PUT
421
+ // Will convert to MOVE, so don't count as PUT.
252
422
  isPersistentPut = false;
253
423
 
254
424
  this.updates.push({
255
425
  updateOne: {
256
- filter: {
257
- _id: doc._id
258
- },
426
+ filter: { _id: bucketContext.docId(doc.o) },
259
427
  update: {
260
428
  $set: {
261
429
  op: 'MOVE',
@@ -268,24 +436,20 @@ export class MongoCompactor {
268
436
  row_id: 1,
269
437
  data: 1
270
438
  }
271
- }
439
+ } satisfies mongo.UpdateFilter<BucketDataDocumentGeneric>
272
440
  }
273
441
  });
274
442
 
275
- currentState.opBytes += 200 - Number(doc.size); // TODO: better estimate for this
276
- } else {
277
- if (currentState.trackingSize >= idLimitBytes) {
278
- // Reached memory limit.
279
- // Keep the highest seen values in this case.
280
- } else {
281
- // flatstr reduces the memory usage by flattening the string
282
- currentState.seen.set(utils.flatstr(key), doc._id.o);
283
- // length + 16 for the string
284
- // 24 for the bigint
285
- // 50 for map overhead
286
- // 50 for additional overhead
287
- currentState.trackingSize += key.length + 140;
288
- }
443
+ // TODO: better estimate for this.
444
+ currentState.opBytes += 200 - Number(size);
445
+ } else if (currentState.trackingSize < idLimitBytes) {
446
+ // flatstr reduces the memory usage by flattening the string.
447
+ currentState.seen.set(utils.flatstr(key), doc.o);
448
+ // length + 16 for the string
449
+ // 24 for the bigint
450
+ // 50 for map overhead
451
+ // 50 for additional overhead
452
+ currentState.trackingSize += key.length + 140;
289
453
  }
290
454
  }
291
455
 
@@ -294,41 +458,37 @@ export class MongoCompactor {
294
458
  currentState.opsSincePut = 0;
295
459
  } else if (doc.op != 'CLEAR') {
296
460
  if (currentState.lastNotPut == null) {
297
- currentState.lastNotPut = doc._id.o;
461
+ currentState.lastNotPut = doc.o;
298
462
  }
299
463
  currentState.opsSincePut += 1;
300
464
  }
301
465
 
302
466
  if (this.updates.length + this.bucketStateUpdates.length >= this.moveBatchLimit) {
303
- await this.flush();
467
+ await this.flush(bucketContext);
304
468
  }
305
469
  }
306
470
 
307
- logger.info(`Processed batch of length ${batch.length} current bucket: ${bucket}`);
471
+ this.logger.info(`Processed batch of length ${batch.length} current bucket: ${bucket}`);
308
472
  }
309
473
 
310
- // Free memory before clearing bucket
474
+ // Free memory before clearing the bucket.
311
475
  currentState.seen.clear();
312
476
  if (currentState.lastNotPut != null && currentState.opsSincePut >= 1) {
313
- logger.info(
477
+ this.logger.info(
314
478
  `Inserting CLEAR at ${this.group_id}:${bucket}:${currentState.lastNotPut} to remove ${currentState.opsSincePut} operations`
315
479
  );
316
- // Need flush() before clear()
317
- await this.flush();
318
- await this.clearBucket(currentState);
480
+ // Need flush() before clear().
481
+ await this.flush(bucketContext);
482
+ await this.clearBucket(currentState, bucketContext);
319
483
  }
320
484
 
321
- // Do this _after_ clearBucket so that we have accurate counts.
485
+ // Do this after clearBucket so we have accurate counts.
322
486
  this.updateBucketChecksums(currentState);
323
-
324
- // Need another flush after updateBucketChecksums()
325
- await this.flush();
487
+ // Need another flush after updateBucketChecksums().
488
+ await this.flush(bucketContext);
326
489
  }
327
490
 
328
- /**
329
- * Call when done with a bucket.
330
- */
331
- private updateBucketChecksums(state: CurrentBucketState) {
491
+ protected updateBucketChecksums(state: CurrentBucketState) {
332
492
  if (state.opCount < 0) {
333
493
  throw new ServiceAssertionError(
334
494
  `Invalid opCount: ${state.opCount} checksum ${state.checksum} opsSincePut: ${state.opsSincePut} maxOpId: ${this.maxOpId}`
@@ -336,12 +496,7 @@ export class MongoCompactor {
336
496
  }
337
497
  this.bucketStateUpdates.push({
338
498
  updateOne: {
339
- filter: {
340
- _id: {
341
- g: this.group_id,
342
- b: state.bucket
343
- }
344
- },
499
+ filter: this.bucketStateFilter(state.bucket, state.definitionId),
345
500
  update: {
346
501
  $set: {
347
502
  compacted_state: {
@@ -351,14 +506,13 @@ export class MongoCompactor {
351
506
  bytes: state.opBytes
352
507
  },
353
508
  estimate_since_compact: {
354
- // Note: There could have been a whole bunch of new operations added to the bucket _while_ compacting,
355
- // which we don't currently cater for.
356
- // We could potentially query for that, but that could add overhead.
509
+ // There could have been a whole bunch of new operations added to the bucket while compacting,
510
+ // which we don't currently cater for. We could potentially query for that, but that adds overhead.
357
511
  count: 0,
358
512
  bytes: 0
359
513
  }
360
514
  }
361
- },
515
+ } satisfies mongo.UpdateFilter<BucketStateDocumentBase>,
362
516
  // We generally expect this to have been created before.
363
517
  // We don't create new ones here, to avoid issues with the unique index on bucket_updates.
364
518
  upsert: false
@@ -366,23 +520,24 @@ export class MongoCompactor {
366
520
  });
367
521
  }
368
522
 
369
- private async flush() {
523
+ protected async flush(col: SingleBucketStore) {
370
524
  if (this.updates.length > 0) {
371
- logger.info(`Compacting ${this.updates.length} ops`);
372
- await this.db.bucket_data.bulkWrite(this.updates, {
373
- // Order is not important.
374
- // Since checksums are not affected, these operations can happen in any order,
375
- // and it's fine if the operations are partially applied.
376
- // Each individual operation is atomic.
525
+ this.logger.info(`Compacting ${this.updates.length} ops`);
526
+ await col.collection.bulkWrite(this.updates, {
527
+ // Order is not important. Since checksums are not affected, these operations can happen in any order,
528
+ // and it's fine if the operations are partially applied. Each individual operation is atomic.
377
529
  ordered: false
378
530
  });
379
531
  this.updates = [];
380
532
  }
533
+
534
+ await this.flushBucketStateUpdates();
535
+ }
536
+
537
+ private async flushBucketStateUpdates() {
381
538
  if (this.bucketStateUpdates.length > 0) {
382
- logger.info(`Updating ${this.bucketStateUpdates.length} bucket states`);
383
- await this.db.bucket_state.bulkWrite(this.bucketStateUpdates, {
384
- ordered: false
385
- });
539
+ this.logger.info(`Updating ${this.bucketStateUpdates.length} bucket states`);
540
+ await this.writeBucketStateUpdates();
386
541
  this.bucketStateUpdates = [];
387
542
  }
388
543
  }
@@ -390,26 +545,15 @@ export class MongoCompactor {
390
545
  /**
391
546
  * Perform a CLEAR compact for a bucket.
392
547
  *
393
- *
394
- * @param bucket bucket name
395
- * @param op op_id of the last non-PUT operation, which will be converted to CLEAR.
548
+ * @param currentState tracks the last non-PUT op, which will be converted to CLEAR.
396
549
  */
397
- private async clearBucket(currentState: CurrentBucketState) {
398
- const bucket = currentState.bucket;
550
+ protected async clearBucket(currentState: CurrentBucketState, col: SingleBucketStore) {
399
551
  const clearOp = currentState.lastNotPut!;
400
552
 
401
553
  const opFilter = {
402
554
  _id: {
403
- $gte: {
404
- g: this.group_id,
405
- b: bucket,
406
- o: new mongo.MinKey() as any
407
- },
408
- $lte: {
409
- g: this.group_id,
410
- b: bucket,
411
- o: clearOp
412
- }
555
+ $gte: col.minId,
556
+ $lte: col.docId(clearOp)
413
557
  }
414
558
  };
415
559
 
@@ -424,39 +568,40 @@ export class MongoCompactor {
424
568
  // We need a transaction per batch to make sure checksums stay consistent.
425
569
  await session.withTransaction(
426
570
  async () => {
427
- const query = this.db.bucket_data.find(opFilter, {
428
- session,
429
- sort: { _id: 1 },
430
- projection: {
431
- _id: 1,
432
- op: 1,
433
- checksum: 1,
434
- target_op: 1
435
- },
436
- limit: this.clearBatchLimit
437
- });
571
+ const query = col.collection.find<Pick<BucketDataDocumentGeneric, '_id' | CompactClearProperties>>(
572
+ opFilter,
573
+ {
574
+ session,
575
+ sort: { _id: 1 },
576
+ projection: {
577
+ _id: 1,
578
+ op: 1,
579
+ checksum: 1,
580
+ target_op: 1
581
+ },
582
+ limit: this.clearBatchLimit
583
+ }
584
+ );
438
585
  let checksum = 0;
439
- let lastOpId: BucketDataKey | null = null;
586
+ let lastOp: Pick<BucketDataDoc, 'o' | CompactClearProperties> | null = null;
440
587
  let targetOp: bigint | null = null;
441
588
  let gotAnOp = false;
442
589
  let numberOfOpsToClear = 0;
443
- for await (let op of query.stream()) {
590
+ for await (const rawOp of query.stream()) {
591
+ const op = col.fromPartialPersistedDocument(rawOp);
592
+
444
593
  if (op.op == 'MOVE' || op.op == 'REMOVE' || op.op == 'CLEAR') {
445
594
  checksum = utils.addChecksums(checksum, Number(op.checksum));
446
- lastOpId = op._id;
595
+ lastOp = op;
447
596
  numberOfOpsToClear += 1;
448
597
  if (op.op != 'CLEAR') {
449
598
  gotAnOp = true;
450
599
  }
451
- if (op.target_op != null) {
452
- if (targetOp == null || op.target_op > targetOp) {
453
- targetOp = op.target_op;
454
- }
600
+ if (op.target_op != null && (targetOp == null || op.target_op > targetOp)) {
601
+ targetOp = op.target_op;
455
602
  }
456
603
  } else {
457
- throw new ReplicationAssertionError(
458
- `Unexpected ${op.op} operation at ${op._id.g}:${op._id.b}:${op._id.o}`
459
- );
604
+ throw new ReplicationAssertionError(`Unexpected ${op.op} operation at ${this.formatBucketDataKey(op)}`);
460
605
  }
461
606
  }
462
607
  if (!gotAnOp) {
@@ -464,31 +609,25 @@ export class MongoCompactor {
464
609
  return;
465
610
  }
466
611
 
467
- logger.info(`Flushing CLEAR for ${numberOfOpsToClear} ops at ${lastOpId?.o}`);
468
- await this.db.bucket_data.deleteMany(
612
+ this.logger.info(`Flushing CLEAR for ${numberOfOpsToClear} ops at ${lastOp?.o}`);
613
+ await col.collection.deleteMany(
469
614
  {
470
615
  _id: {
471
- $gte: {
472
- g: this.group_id,
473
- b: bucket,
474
- o: new mongo.MinKey() as any
475
- },
476
- $lte: lastOpId!
616
+ $gte: col.minId,
617
+ $lte: col.docId(lastOp!.o)
477
618
  }
478
619
  },
479
620
  { session }
480
621
  );
481
622
 
482
- await this.db.bucket_data.insertOne(
483
- {
484
- _id: lastOpId!,
485
- op: 'CLEAR',
486
- checksum: BigInt(checksum),
487
- data: null,
488
- target_op: targetOp
489
- },
490
- { session }
491
- );
623
+ const op = col.toPersistedDocument({
624
+ o: lastOp!.o,
625
+ op: 'CLEAR',
626
+ checksum: BigInt(checksum),
627
+ data: null,
628
+ target_op: targetOp
629
+ });
630
+ await col.collection.insertOne(op, { session });
492
631
 
493
632
  opCountDiff = -numberOfOpsToClear + 1;
494
633
  },
@@ -497,7 +636,7 @@ export class MongoCompactor {
497
636
  readConcern: { level: 'snapshot' }
498
637
  }
499
638
  );
500
- // Update _outside_ the transaction, since the transaction can be retried multiple times.
639
+ // Update outside the transaction, since the transaction can be retried multiple times.
501
640
  currentState.opCount += opCountDiff;
502
641
  }
503
642
  } finally {
@@ -505,211 +644,22 @@ export class MongoCompactor {
505
644
  }
506
645
  }
507
646
 
508
- /**
509
- * Subset of compact, only populating checksums where relevant.
510
- */
511
- async populateChecksums(options: { minBucketChanges: number }): Promise<PopulateChecksumCacheResults> {
512
- let count = 0;
513
- while (true) {
514
- this.signal?.throwIfAborted();
515
- const buckets = await this.dirtyBucketBatchForChecksums(options);
516
- if (buckets.length == 0) {
517
- // All done
518
- break;
519
- }
520
- this.signal?.throwIfAborted();
521
-
522
- const start = Date.now();
523
-
524
- // Filter batch by estimated bucket size, to reduce possibility of timeouts
525
- let checkBuckets: typeof buckets = [];
526
- let totalCountEstimate = 0;
527
- for (let bucket of buckets) {
528
- checkBuckets.push(bucket);
529
- totalCountEstimate += bucket.estimatedCount;
530
- if (totalCountEstimate > 50_000) {
531
- break;
532
- }
533
- }
534
- logger.info(
535
- `Calculating checksums for batch of ${buckets.length} buckets, estimated count of ${totalCountEstimate}`
536
- );
537
- await this.updateChecksumsBatch(checkBuckets.map((b) => b.bucket));
538
- logger.info(`Updated checksums for batch of ${checkBuckets.length} buckets in ${Date.now() - start}ms`);
539
- count += checkBuckets.length;
540
- }
541
- return { buckets: count };
542
- }
543
-
544
- /**
545
- * Return batches of dirty buckets.
546
- *
547
- * Can be used to iterate through all buckets.
548
- *
549
- * minBucketChanges: minimum number of changes for a bucket to be included in the results.
550
- * minChangeRatio: minimum ratio of changes to total ops for a bucket to be included in the results, number between 0 and 1.
551
- */
552
- private async *dirtyBucketBatches(options: {
553
- minBucketChanges: number;
554
- minChangeRatio: number;
555
- }): AsyncGenerator<{ bucket: string; estimatedCount: number }[]> {
556
- // Previously, we used an index on {_id.g: 1, estimate_since_compact.count: 1} to only buckets with changes.
557
- // This works well if there are only a small number of buckets with changes.
558
- // However, if buckets are continuosly modified while we are compacting, we get the same buckets over and over again.
559
- // This has caused the compact process to re-read the same collection around 5x times in total, which is very inefficient.
560
- // To solve this, we now just iterate through all buckets, and filter out the ones with low changes.
561
-
562
- if (options.minBucketChanges <= 0) {
563
- throw new ReplicationAssertionError('minBucketChanges must be >= 1');
564
- }
565
- let lastId = { g: this.group_id, b: new mongo.MinKey() as any };
566
- const maxId = { g: this.group_id, b: new mongo.MaxKey() as any };
567
- while (true) {
568
- // To avoid timeouts from too many buckets not meeting the minBucketChanges criteria, we use an aggregation pipeline
569
- // to scan a fixed batch of buckets at a time, but only return buckets that meet the criteria, rather than limiting
570
- // on the output number.
571
- const [result] = await this.db.bucket_state
572
- .aggregate<{
573
- buckets: Pick<BucketStateDocument, '_id' | 'estimate_since_compact' | 'compacted_state'>[];
574
- cursor: Pick<BucketStateDocument, '_id'>[];
575
- }>(
576
- [
577
- {
578
- $match: {
579
- _id: { $gt: lastId, $lt: maxId }
580
- }
581
- },
582
- {
583
- $sort: { _id: 1 }
584
- },
585
- {
586
- // Scan a fixed number of docs each query so sparse matches don't block progress.
587
- $limit: DIRTY_BUCKET_SCAN_BATCH_SIZE
588
- },
589
- {
590
- $facet: {
591
- // This is the results for the batch
592
- buckets: [
593
- {
594
- $match: {
595
- 'estimate_since_compact.count': { $gte: options.minBucketChanges }
596
- }
597
- },
598
- {
599
- $project: {
600
- _id: 1,
601
- estimate_since_compact: 1,
602
- compacted_state: 1
603
- }
604
- }
605
- ],
606
- // This is used for the next query.
607
- cursor: [{ $sort: { _id: -1 } }, { $limit: 1 }, { $project: { _id: 1 } }]
608
- }
609
- }
610
- ],
611
- { maxTimeMS: MONGO_OPERATION_TIMEOUT_MS }
612
- )
613
- .toArray();
614
-
615
- const cursor = result?.cursor?.[0];
616
- if (cursor == null) {
617
- break;
618
- }
619
- lastId = cursor._id;
620
-
621
- const mapped = (result?.buckets ?? []).map((b) => {
622
- // The numbers, specifically the bytes, could be a bigint. We convert to Number to allow calculating the ratios.
623
- // BigInt precision is not needed here since it's just an estimate.
624
- const updatedCount = b.estimate_since_compact?.count ?? 0;
625
- const totalCount = (b.compacted_state?.count ?? 0) + updatedCount;
626
- const updatedBytes = Number(b.estimate_since_compact?.bytes ?? 0);
627
- const totalBytes = Number(b.compacted_state?.bytes ?? 0) + updatedBytes;
628
- const dirtyChangeNumber = totalCount > 0 ? updatedCount / totalCount : 0;
629
- const dirtyChangeBytes = totalBytes > 0 ? updatedBytes / totalBytes : 0;
630
- return {
631
- bucket: b._id.b,
632
- estimatedCount: totalCount,
633
- dirtyRatio: Math.max(dirtyChangeNumber, dirtyChangeBytes)
634
- };
635
- });
636
- const filtered = mapped.filter(
637
- (b) => b.estimatedCount >= options.minBucketChanges && b.dirtyRatio >= options.minChangeRatio
638
- );
639
- yield filtered;
640
- }
641
- }
642
-
643
- /**
644
- * Returns a batch of dirty buckets - buckets with most changes first.
645
- *
646
- * This cannot be used to iterate on its own - the client is expected to process these buckets and
647
- * set estimate_since_compact.count: 0 when done, before fetching the next batch.
648
- *
649
- * Unlike dirtyBucketBatches, used for compacting, this is specifically designed to be resuamble after a restart,
650
- * since it is used as the last step for initial replication.
651
- *
652
- * We currently don't get new data while doing populateChecksums, so we don't need to worry about buckets changing while processing.
653
- */
654
- private async dirtyBucketBatchForChecksums(options: {
655
- minBucketChanges: number;
656
- }): Promise<{ bucket: string; estimatedCount: number }[]> {
657
- if (options.minBucketChanges <= 0) {
658
- throw new ReplicationAssertionError('minBucketChanges must be >= 1');
659
- }
660
- // We make use of an index on {_id.g: 1, 'estimate_since_compact.count': -1}
661
- const dirtyBuckets = await this.db.bucket_state
662
- .find(
663
- {
664
- '_id.g': this.group_id,
665
- 'estimate_since_compact.count': { $gte: options.minBucketChanges }
666
- },
667
- {
668
- projection: {
669
- _id: 1,
670
- estimate_since_compact: 1,
671
- compacted_state: 1
672
- },
673
- sort: {
674
- 'estimate_since_compact.count': -1
675
- },
676
- limit: 200,
677
- maxTimeMS: MONGO_OPERATION_TIMEOUT_MS
678
- }
679
- )
680
- .toArray();
647
+ protected async updateChecksumsBatch(buckets: Pick<DirtyBucket, 'bucket' | 'definitionId'>[]) {
648
+ const checksums = await this.computeChecksumsForBuckets(buckets);
649
+ const definitionIdByBucket = new Map(buckets.map((bucket) => [bucket.bucket, bucket.definitionId]));
681
650
 
682
- return dirtyBuckets.map((bucket) => ({
683
- bucket: bucket._id.b,
684
- estimatedCount: Number(bucket.estimate_since_compact!.count) + Number(bucket.compacted_state?.count ?? 0)
685
- }));
686
- }
687
-
688
- private async updateChecksumsBatch(buckets: string[]) {
689
- const checksums = await this.storage.checksums.computePartialChecksumsDirect(
690
- buckets.map((bucket) => {
691
- return {
692
- bucket,
693
- source: {} as any,
694
- end: this.maxOpId
695
- };
696
- })
697
- );
698
-
699
- for (let bucketChecksum of checksums.values()) {
651
+ for (const bucketChecksum of checksums.values()) {
700
652
  if (isPartialChecksum(bucketChecksum)) {
701
- // Should never happen since we don't specify `start`
653
+ // Should never happen since we don't specify `start`.
702
654
  throw new ServiceAssertionError(`Full checksum expected, got ${JSON.stringify(bucketChecksum)}`);
703
655
  }
704
656
 
705
657
  this.bucketStateUpdates.push({
706
658
  updateOne: {
707
- filter: {
708
- _id: {
709
- g: this.group_id,
710
- b: bucketChecksum.bucket
711
- }
712
- },
659
+ filter: this.bucketStateFilter(
660
+ bucketChecksum.bucket,
661
+ definitionIdByBucket.get(bucketChecksum.bucket) ?? null
662
+ ),
713
663
  update: {
714
664
  $set: {
715
665
  compacted_state: {
@@ -723,14 +673,34 @@ export class MongoCompactor {
723
673
  bytes: 0
724
674
  }
725
675
  }
726
- },
727
- // We don't create new ones here - it gets tricky to get the last_op right with the unique index on:
728
- // bucket_updates: {'id.g': 1, 'last_op': 1}
676
+ } satisfies mongo.UpdateFilter<BucketStateDocumentBase>,
677
+ // We don't create new ones here - it gets tricky to get the last_op right with the unique index on
678
+ // bucket_updates.
729
679
  upsert: false
730
680
  }
731
681
  });
732
682
  }
733
683
 
734
- await this.flush();
684
+ await this.flushBucketStateUpdates();
685
+ }
686
+
687
+ protected formatBucketDataKey(doc: Pick<BucketDataDoc, 'bucketKey' | 'o'>) {
688
+ return `${doc.bucketKey.replicationStreamId}:${doc.bucketKey.bucket}:${doc.o}`;
735
689
  }
690
+
691
+ protected abstract writeBucketStateUpdates(): Promise<void>;
692
+ protected abstract computeChecksumsForBuckets(
693
+ buckets: Pick<DirtyBucket, 'bucket' | 'definitionId'>[]
694
+ ): Promise<storage.PartialChecksumMap>;
695
+ protected abstract bucketStateFilter(bucket: string, definitionId: BucketDefinitionId | null): mongo.Document;
696
+
697
+ protected abstract getBucketDataContext(
698
+ bucket: string,
699
+ definitionId: BucketDefinitionId | null
700
+ ): Promise<SingleBucketStore | null>;
701
+ }
702
+
703
+ export interface BucketDataCollectionContext<TBucketData extends mongo.Document> {
704
+ bucketKey: BucketKey;
705
+ collection: mongo.Collection<TBucketData>;
736
706
  }