@tmlmobilidade/utils 20260320.1741.37 → 20260320.1746.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/batching/index.d.ts +2 -0
- package/dist/batching/index.js +2 -0
- package/dist/batching/perform-in-time-chunks.d.ts +14 -0
- package/dist/batching/perform-in-time-chunks.js +38 -0
- package/dist/batching/replicate.d.ts +80 -0
- package/dist/batching/replicate.js +82 -0
- package/package.json +5 -1
package/dist/batching/index.d.ts
CHANGED
package/dist/batching/index.js
CHANGED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { UnixTimestamp } from '@tmlmobilidade/types';
import { DurationObjectUnits } from 'luxon';
/**
 * Describes a single time chunk handed to the `onChunk` callback.
 */
export interface PerformInTimeChunksItem {
    /** End of this chunk as a Unix timestamp (within a chunk, end > start). */
    end: UnixTimestamp;
    /** Zero-based position of this chunk in processing order (most recent chunk first). */
    index: number;
    /** Start of this chunk as a Unix timestamp. */
    start: UnixTimestamp;
    /** Total number of chunks that will be processed. */
    total: number;
}
/**
 * Options for `performInTimeChunks`.
 */
export interface PerformInTimeChunksOptions {
    /** Called once per chunk; awaited before the next chunk is started. */
    onChunk: (chunk: PerformInTimeChunksItem) => Promise<void>;
    /** Luxon duration used to split the overall interval (e.g. `{ day: 1 }`). */
    splitBy: DurationObjectUnits;
    /** Earliest point in time to cover, as a Unix timestamp. */
    startDate: UnixTimestamp;
}
/**
 * Splits the interval from `startDate` until roughly now (30 seconds ago) into
 * chunks of `splitBy` duration and sequentially awaits `onChunk` for each one,
 * processing the most recent chunk first.
 */
export declare function performInTimeChunks({ onChunk, splitBy, startDate }: PerformInTimeChunksOptions): Promise<void>;
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/* * */
import { Dates } from '@tmlmobilidade/dates';
import { Interval } from 'luxon';
/**
 * Splits the period between `startDate` and ~30 seconds ago into chunks of
 * `splitBy` duration and sequentially awaits `onChunk` for each chunk,
 * most recent chunk first.
 */
export async function performInTimeChunks({ onChunk, splitBy, startDate }) {
    //
    // In order to sync both collections in a manageable way, due to the high volume of data,
    // it is necessary to divide the process into smaller blocks. Instead of syncing all documents at once,
    // divide the process by timestamps chunks and iterate over each one, getting all document IDs from both databases.
    // Like this we can more easily compare the IDs in memory and sync only the missing documents.
    // More recent data is more important than older data, so we start syncing the most recent data first.
    // It makes sense to divide chunks by day, but this should be adjusted according to the volume of data in each chunk.
    // The 30-second offset keeps the interval's upper bound safely in the past.
    const thirtySecondsAgo = Dates
        .now('Europe/Lisbon')
        .minus({ seconds: 30 });
    const earliestDataNeeded = Dates.fromUnixTimestamp(startDate);
    // NOTE(review): if `startDate` is less than 30 seconds in the past, the ISO
    // interval below is invalid and `splitBy()` yields no chunks, making this a
    // silent no-op — confirm this is the intended behavior.
    const allTimestampChunks = Interval
        .fromISO(`${earliestDataNeeded.iso}/${thirtySecondsAgo.iso}`)
        .splitBy(splitBy)
        .map(interval => ({ end: interval.end.toMillis(), start: interval.start.toMillis() }))
        .sort((a, b) => b.start - a.start);
    //
    // Iterate over each timestamp chunk and sync the documents.
    // Timestamp chunks are sorted in descending order, so that more recent data is processed first.
    // Timestamp chunks are in the format { start: day1, end: day2 }, so end is always greater than start.
    // This might be confusing as the array of chunks itself is sorted in descending order, but the chunks individually are not.
    for (const [chunkIndex, chunkData] of allTimestampChunks.entries()) {
        //
        // Re-wrap the millisecond bounds as project Dates pinned to the
        // Europe/Lisbon zone before exposing them to the callback.
        // (presumably 'offset_only' keeps the instant and only changes the zone — TODO confirm against @tmlmobilidade/dates)
        const chunkStartDate = Dates
            .fromUnixTimestamp(chunkData.start)
            .setZone('Europe/Lisbon', 'offset_only');
        const chunkEndDate = Dates
            .fromUnixTimestamp(chunkData.end)
            .setZone('Europe/Lisbon', 'offset_only');
        // Each chunk is awaited before the next one starts (strictly sequential).
        await onChunk({ end: chunkEndDate.unix_timestamp, index: chunkIndex, start: chunkStartDate.unix_timestamp, total: allTimestampChunks.length });
    }
    //
}
;
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
interface ReplicateProps<SourceDocType> {
|
|
2
|
+
/**
|
|
3
|
+
* A function to count the total number of documents
|
|
4
|
+
* in the destination database. This must return a number.
|
|
5
|
+
* @returns A promise that resolves to a number.
|
|
6
|
+
*/
|
|
7
|
+
countDestinationDbFn: () => Promise<number>;
|
|
8
|
+
/**
|
|
9
|
+
* A function to count the total number of documents
|
|
10
|
+
* in the source database. This must return a number.
|
|
11
|
+
* @returns A promise that resolves to a number.
|
|
12
|
+
*/
|
|
13
|
+
countSourceDbFn: () => Promise<number>;
|
|
14
|
+
/**
|
|
15
|
+
* A function that deletes documents in the destination database,
|
|
16
|
+
* from an array of unique document IDs. This is used to remove any extra documents
|
|
17
|
+
* that are present in the destination database but not in the source database.
|
|
18
|
+
* This function should return a promise that resolves when the deletion is complete.
|
|
19
|
+
* @param uniqueIds An array of unique document IDs to be deleted from the destination database.
|
|
20
|
+
* @returns A promise that resolves when the deletion is complete.
|
|
21
|
+
*/
|
|
22
|
+
deleteDestinationDbFn: (uniqueIds: string[]) => Promise<void>;
|
|
23
|
+
/**
|
|
24
|
+
* A function to get the distinct document IDs from the destination database.
|
|
25
|
+
* This must return an array of strings.
|
|
26
|
+
* @returns A promise that resolves to an array of strings.
|
|
27
|
+
*/
|
|
28
|
+
distinctDestinationDbFn: () => Promise<string[]>;
|
|
29
|
+
/**
|
|
30
|
+
* A function to get the distinct document IDs from the source database.
|
|
31
|
+
* This must return an array of strings.
|
|
32
|
+
* @returns A promise that resolves to an array of strings.
|
|
33
|
+
*/
|
|
34
|
+
distinctSourceDbFn: () => Promise<string[]>;
|
|
35
|
+
/**
|
|
36
|
+
* This is the function that should query the source database for the missing documents based on their IDs.
|
|
37
|
+
* It should return an async iterable (e.g., an async generator or a MongoDB `.stream()`) that yields
|
|
38
|
+
* the missing documents one by one.
|
|
39
|
+
* @param missingDocumentIds An array of document IDs that are missing in the destination database.
|
|
40
|
+
* @returns An async iterable that yields source documents one by one.
|
|
41
|
+
*/
|
|
42
|
+
missingDocumentsSourceDbAsyncIterator: (missingDocumentIds: string[]) => AsyncIterable<SourceDocType>;
|
|
43
|
+
/**
|
|
44
|
+
* An optional callback function that will be executed after the replication process is complete.
|
|
45
|
+
* This can be used to perform any necessary cleanup tasks, such as flushing writers or logging.
|
|
46
|
+
*/
|
|
47
|
+
onCompleteCallbackFn?: () => Promise<void>;
|
|
48
|
+
/**
|
|
49
|
+
* This function receives a document from the source database and should write it to the destination database.
|
|
50
|
+
* You can use any method you prefer to write the document to the destination database, such as a bulk insert or individual writes,
|
|
51
|
+
* and perform any necessary transformations on the document before writing it.
|
|
52
|
+
* @param sourceDocument The source document to be written to the destination database.
|
|
53
|
+
* @returns A promise that resolves when the document has been successfully written to the destination database.
|
|
54
|
+
*/
|
|
55
|
+
writeSourceDocumentToDestinationDbFn: (sourceDocument: SourceDocType) => Promise<void>;
|
|
56
|
+
}
|
|
57
|
+
/**
|
|
58
|
+
* Copy documents from a source database to a destination database in multiple steps.
|
|
59
|
+
* The goal of this function is to ensure that the destination database has the same documents
|
|
60
|
+
* as the source database. The replication process is designed to be efficient and to minimize
|
|
61
|
+
* the amount of data transferred between the two databases by only syncing the missing documents.
|
|
62
|
+
*
|
|
63
|
+
* 1. First count the total number of documents in both databases to check if they match. This is a
|
|
64
|
+
* crucial optimization step, as it allows us to skip the replication process if both databases already
|
|
65
|
+
* have the same number of documents, which would indicate that they are already in sync.
|
|
66
|
+
* Though, it's important to note that having the same document count does not guarantee
|
|
67
|
+
* that the documents are identical, but it is a quick check to potentially avoid unnecessary replication.
|
|
68
|
+
*
|
|
69
|
+
* 2. If the counts do not match, get the distinct document IDs from both databases and compare them
|
|
70
|
+
* to find out which ones are missing in the destination database. This step is essential to identify
|
|
71
|
+
* the specific documents that need to be replicated, rather than syncing all documents again.
|
|
72
|
+
* Sync only the missing documents from the source database to the destination database,
|
|
73
|
+
*
|
|
74
|
+
* 3. Delete any extra documents in the destination database that are not present in the source database.
|
|
75
|
+
*
|
|
76
|
+
* 4. Run the onComplete callback function if provided. This allows for any additional actions to be performed after
|
|
77
|
+
* the replication process is complete, such as logging, flushing writers or any other necessary cleanup tasks.
|
|
78
|
+
*/
|
|
79
|
+
export declare function replicate<SourceDocType>({ countDestinationDbFn, countSourceDbFn, deleteDestinationDbFn, distinctDestinationDbFn, distinctSourceDbFn, missingDocumentsSourceDbAsyncIterator, onCompleteCallbackFn, writeSourceDocumentToDestinationDbFn }: ReplicateProps<SourceDocType>): Promise<void>;
|
|
80
|
+
export {};
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/* * */
import { Logger } from '@tmlmobilidade/logger';
import { Timer } from '@tmlmobilidade/timer';
/**
 * Copy documents from a source database to a destination database in multiple steps.
 * The goal of this function is to ensure that the destination database has the same documents
 * as the source database. The replication process is designed to be efficient and to minimize
 * the amount of data transferred between the two databases by only syncing the missing documents.
 *
 * 1. First count the total number of documents in both databases to check if they match. This is a
 * crucial optimization step, as it allows us to skip the replication process if both databases already
 * have the same number of documents, which would indicate that they are already in sync.
 * Though, it's important to note that having the same document count does not guarantee
 * that the documents are identical, but it is a quick check to potentially avoid unnecessary replication.
 *
 * 2. If the counts do not match, get the distinct document IDs from both databases and compare them
 * to find out which ones are missing in the destination database. This step is essential to identify
 * the specific documents that need to be replicated, rather than syncing all documents again.
 *
 * 3. Delete any extra documents in the destination database that are not present in the source database,
 * then sync only the missing documents from the source database to the destination database.
 *
 * 4. Run the onComplete callback function if provided. This allows for any additional actions to be performed after
 * the replication process is complete, such as logging, flushing writers or any other necessary cleanup tasks.
 */
export async function replicate({ countDestinationDbFn, countSourceDbFn, deleteDestinationDbFn, distinctDestinationDbFn, distinctSourceDbFn, missingDocumentsSourceDbAsyncIterator, onCompleteCallbackFn, writeSourceDocumentToDestinationDbFn }) {
    //
    const globalTimer = new Timer();
    //
    // Run the count functions for both databases, if enabled, to get the total number
    // of documents that match a given query. This is done to check if the document count
    // is the same for both databases, which would indicate that all documents are already synced.
    const countStepTimer = new Timer();
    const sourceDbCount = await countSourceDbFn();
    const destinationDbCount = await countDestinationDbFn();
    if (sourceDbCount === destinationDbCount) {
        Logger.success(`MATCH: Found the same number of documents in both databases: ${sourceDbCount} Source = ${destinationDbCount} Destination (${countStepTimer.get()})`);
        return;
    }
    Logger.info(`MISMATCH: Document count was different for both databases: ${sourceDbCount} Source != ${destinationDbCount} Destination (${countStepTimer.get()})`);
    //
    // If the document count was different, then check which documents are missing.
    // Instead of syncing all documents again, only the missing IDs are synced.
    // This is done to get the distinct values from each database and comparing
    // them to find the missing ones.
    const distinctStepTimer = new Timer();
    const sourceDbDocIds = await distinctSourceDbFn();
    const sourceDbDocIdsUnique = new Set(sourceDbDocIds);
    const destinationDbDocIds = await distinctDestinationDbFn();
    const destinationDbDocIdsUnique = new Set(destinationDbDocIds);
    const missingDocumentIds = sourceDbDocIds.filter((documentId) => !destinationDbDocIdsUnique.has(documentId));
    const extraDocumentIds = destinationDbDocIds.filter(doc => !sourceDbDocIdsUnique.has(doc));
    Logger.info(`Source Total: ${sourceDbCount} | Source Unique: ${sourceDbDocIdsUnique.size} | Source ▲: ${sourceDbCount - sourceDbDocIdsUnique.size} | Destination Total: ${destinationDbCount} | Destination Unique: ${destinationDbDocIdsUnique.size} | Destination ▲: ${destinationDbCount - destinationDbDocIdsUnique.size} | Destination Missing: ${missingDocumentIds.length} | Destination Extra: ${extraDocumentIds.length} (${distinctStepTimer.get()})`);
    //
    // Extra documents in the destination database should be removed,
    // as they are not present in the source database.
    // BUGFIX: this must happen BEFORE the "no missing documents" branch below.
    // Previously, when the counts mismatched only because the destination had
    // EXTRA documents (nothing missing), an early return skipped both this
    // deletion and the onComplete callback, leaving the databases permanently
    // out of sync.
    const deleteStepTimer = new Timer();
    if (extraDocumentIds.length > 0 && deleteDestinationDbFn) {
        await deleteDestinationDbFn(extraDocumentIds);
        Logger.info(`Deleted ${extraDocumentIds.length} extra documents in the Destination database. (${deleteStepTimer.get()})`);
    }
    //
    // If there are missing documents, then they are synced.
    // We query the Source database for the missing documents
    // and write them to the Destination database.
    if (missingDocumentIds.length > 0) {
        const missingStepTimer = new Timer();
        Logger.info(`Found ${missingDocumentIds.length} missing documents in the Destination database. (${missingStepTimer.get()})`);
        for await (const sourceDbDocument of missingDocumentsSourceDbAsyncIterator(missingDocumentIds)) {
            await writeSourceDocumentToDestinationDbFn(sourceDbDocument);
        }
    }
    else {
        Logger.success(`Chunk complete. All document IDs matched. (${distinctStepTimer.get()})`);
    }
    //
    // After syncing the missing documents,
    // run the onComplete callback function if provided.
    if (onCompleteCallbackFn)
        await onCompleteCallbackFn();
    Logger.success(`Replication complete (${globalTimer.get()})`);
    //
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tmlmobilidade/utils",
|
|
3
|
-
"version": "20260320.
|
|
3
|
+
"version": "20260320.1746.41",
|
|
4
4
|
"author": {
|
|
5
5
|
"email": "iso@tmlmobilidade.pt",
|
|
6
6
|
"name": "TML-ISO"
|
|
@@ -37,7 +37,11 @@
|
|
|
37
37
|
},
|
|
38
38
|
"dependencies": {
|
|
39
39
|
"@tmlmobilidade/consts": "*",
|
|
40
|
+
"@tmlmobilidade/dates": "*",
|
|
41
|
+
"@tmlmobilidade/logger": "*",
|
|
42
|
+
"@tmlmobilidade/timer": "*",
|
|
40
43
|
"@tmlmobilidade/types": "*",
|
|
44
|
+
"luxon": "3.7.2",
|
|
41
45
|
"mergekit": "3.0.6"
|
|
42
46
|
},
|
|
43
47
|
"devDependencies": {
|