@memberjunction/ai-vector-dupe 5.21.0 → 5.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +266 -229
- package/dist/duplicateRecordDetector.d.ts +180 -18
- package/dist/duplicateRecordDetector.d.ts.map +1 -1
- package/dist/duplicateRecordDetector.js +746 -267
- package/dist/duplicateRecordDetector.js.map +1 -1
- package/dist/index.d.ts +2 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -3
- package/dist/index.js.map +1 -1
- package/dist/scoring/ReciprocalRankFusion.d.ts +45 -0
- package/dist/scoring/ReciprocalRankFusion.d.ts.map +1 -0
- package/dist/scoring/ReciprocalRankFusion.js +63 -0
- package/dist/scoring/ReciprocalRankFusion.js.map +1 -0
- package/package.json +10 -10
- package/dist/config.d.ts +0 -13
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js +0 -15
- package/dist/config.js.map +0 -1
- package/dist/generic/vectorSyncBase.d.ts +0 -20
- package/dist/generic/vectorSyncBase.d.ts.map +0 -1
- package/dist/generic/vectorSyncBase.js +0 -42
- package/dist/generic/vectorSyncBase.js.map +0 -1
- package/dist/models/entitySyncConfig.d.ts +0 -36
- package/dist/models/entitySyncConfig.d.ts.map +0 -1
- package/dist/models/entitySyncConfig.js +0 -2
- package/dist/models/entitySyncConfig.js.map +0 -1
|
@@ -1,332 +1,811 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Modernized duplicate record detection engine.
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates the full pipeline: vectorize records, query for similar candidates,
|
|
5
|
+
* optionally apply hybrid search (RRF) and reranking, persist match results,
|
|
6
|
+
* and auto-merge high-confidence duplicates.
|
|
7
|
+
*
|
|
8
|
+
* Supports three record-source modes:
|
|
9
|
+
* 1. List-based batch detection (ListID provided)
|
|
10
|
+
* 2. View-based detection (ViewID provided)
|
|
11
|
+
* 3. Entity-wide detection (no ListID/ViewID — scans all records or applies ExtraFilter)
|
|
12
|
+
*
|
|
13
|
+
* Also supports single-record checks via CheckSingleRecord().
|
|
14
|
+
*
|
|
15
|
+
* @module @memberjunction/ai-vector-dupe
|
|
16
|
+
*/
|
|
1
17
|
import { BaseEmbeddings, GetAIAPIKey } from "@memberjunction/ai";
|
|
2
|
-
import { PotentialDuplicateResponse,
|
|
3
|
-
import { LogStatus } from "@memberjunction/core";
|
|
18
|
+
import { PotentialDuplicateResponse, CompositeKey, PotentialDuplicateResult, LogError, LogStatus, RecordMergeRequest, PotentialDuplicate, RunView, } from "@memberjunction/core";
|
|
4
19
|
import { VectorDBBase } from "@memberjunction/ai-vectordb";
|
|
5
20
|
import { MJGlobal, UUIDsEqual } from "@memberjunction/global";
|
|
6
21
|
import { VectorBase } from "@memberjunction/ai-vectors";
|
|
7
22
|
import { EntityDocumentTemplateParser, EntityVectorSyncer } from "@memberjunction/ai-vector-sync";
|
|
23
|
+
import { TemplateEngineServer } from "@memberjunction/templates";
|
|
24
|
+
/** Default number of nearest neighbors to retrieve per record */
|
|
25
|
+
const DEFAULT_TOP_K = 5;
|
|
26
|
+
/** Default concurrency limit for parallel vector queries */
|
|
27
|
+
const DEFAULT_QUERY_CONCURRENCY = 10;
|
|
28
|
+
/** Default batch size for loading records and parallel database saves */
|
|
29
|
+
const DEFAULT_BATCH_SIZE = 500;
|
|
30
|
+
/** Default batch size for parallel database saves */
|
|
31
|
+
const SAVE_BATCH_SIZE = 20;
|
|
32
|
+
/**
|
|
33
|
+
* Modernized duplicate record detection engine.
|
|
34
|
+
*
|
|
35
|
+
* Supports:
|
|
36
|
+
* - List-based batch detection (getDuplicateRecords)
|
|
37
|
+
* - View/filter/full-entity batch detection (vector-first approach)
|
|
38
|
+
* - Single-record duplicate check (CheckSingleRecord)
|
|
39
|
+
* - Hybrid search via RRF when vector DB supports it
|
|
40
|
+
* - Optional post-retrieval reranking via MJ's BaseReranker
|
|
41
|
+
* - Configurable topK, thresholds, and progress reporting
|
|
42
|
+
*/
|
|
8
43
|
export class DuplicateRecordDetector extends VectorBase {
|
|
9
44
|
constructor() {
|
|
10
|
-
super();
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
let response = new PotentialDuplicateResponse();
|
|
45
|
+
super(...arguments);
|
|
46
|
+
/**
|
|
47
|
+
* Tracks already-seen source↔match pairs across the entire run to suppress inverse duplicates.
|
|
48
|
+
* If A→B is persisted, B→A is skipped. Key format: "smallerID::largerID" for consistent ordering.
|
|
49
|
+
*/
|
|
50
|
+
this._seenPairs = new Set();
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Run duplicate detection for records identified by ListID, ViewID, ExtraFilter,
|
|
54
|
+
* or all records in the entity (vector-first approach).
|
|
55
|
+
*
|
|
56
|
+
* Flow: validate -> vectorize -> init providers -> load/create run ->
|
|
57
|
+
* load record IDs -> batch(embed -> query -> persist) -> complete run -> auto-merge
|
|
58
|
+
*/
|
|
59
|
+
async GetDuplicateRecords(params, contextUser) {
|
|
60
|
+
this.CurrentUser = contextUser;
|
|
61
|
+
this._seenPairs.clear(); // Reset for each new run
|
|
62
|
+
const options = params.Options ?? {};
|
|
63
|
+
const startTime = Date.now();
|
|
64
|
+
const response = new PotentialDuplicateResponse();
|
|
31
65
|
response.PotentialDuplicateResult = [];
|
|
66
|
+
// Step 1: Validate entity document
|
|
67
|
+
const entityDocument = await this.ValidateEntityDocument(params.EntityDocumentID);
|
|
32
68
|
if (!entityDocument) {
|
|
33
|
-
response.ErrorMessage = `No active Entity Document found for
|
|
69
|
+
response.ErrorMessage = `No active Entity Document found for ID ${params.EntityDocumentID}`;
|
|
34
70
|
response.Status = 'Error';
|
|
35
71
|
return response;
|
|
36
72
|
}
|
|
37
|
-
//
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
73
|
+
// Step 2: Optionally vectorize source records (default: skip — vectors should already exist from sync)
|
|
74
|
+
if (options.Revectorize) {
|
|
75
|
+
this.reportProgress(options, 'Vectorizing', 0, 0, 0, startTime);
|
|
76
|
+
await this.VectorizeSourceRecords(entityDocument, contextUser);
|
|
77
|
+
}
|
|
78
|
+
// Step 3: Initialize providers
|
|
79
|
+
await this.InitializeProviders(entityDocument);
|
|
80
|
+
// Step 4: Create or load DuplicateRun
|
|
81
|
+
const duplicateRun = await this.ResolveOrCreateDuplicateRun(params, entityDocument, options);
|
|
82
|
+
// Step 5: Load record IDs to check (batch-friendly — only IDs)
|
|
83
|
+
this.reportProgress(options, 'Loading', 0, 0, 0, startTime);
|
|
84
|
+
const entityInfo = this.Metadata.EntityByID(entityDocument.EntityID);
|
|
85
|
+
if (!entityInfo) {
|
|
86
|
+
response.ErrorMessage = `Entity not found for ID ${entityDocument.EntityID}`;
|
|
87
|
+
response.Status = 'Error';
|
|
88
|
+
return response;
|
|
89
|
+
}
|
|
90
|
+
const recordIDs = await this.LoadRecordIDsToCheck(params, entityInfo);
|
|
91
|
+
if (recordIDs.length === 0) {
|
|
92
|
+
response.ErrorMessage = 'No records found to check for duplicates';
|
|
93
|
+
response.Status = 'Error';
|
|
94
|
+
return response;
|
|
95
|
+
}
|
|
96
|
+
// Step 6: Process in batches
|
|
97
|
+
const batchSize = DEFAULT_BATCH_SIZE;
|
|
98
|
+
const concurrency = this.GetQueryConcurrency(entityDocument);
|
|
99
|
+
const topK = options.TopK ?? DEFAULT_TOP_K;
|
|
100
|
+
const templateParser = EntityDocumentTemplateParser.CreateInstance();
|
|
101
|
+
let totalMatchesFound = 0;
|
|
102
|
+
for (let offset = 0; offset < recordIDs.length; offset += batchSize) {
|
|
103
|
+
const batchIDs = recordIDs.slice(offset, offset + batchSize);
|
|
104
|
+
const batchResults = await this.ProcessBatch(batchIDs, entityInfo, entityDocument, templateParser, duplicateRun.ID, topK, concurrency, options, startTime, recordIDs.length, offset, totalMatchesFound, contextUser);
|
|
105
|
+
response.PotentialDuplicateResult.push(...batchResults.Results);
|
|
106
|
+
totalMatchesFound += batchResults.MatchesFound;
|
|
107
|
+
}
|
|
108
|
+
// Step 7: Complete the duplicate run
|
|
109
|
+
duplicateRun.ProcessingStatus = 'Complete';
|
|
110
|
+
duplicateRun.EndedAt = new Date();
|
|
111
|
+
const runSaveSuccess = await this.SaveEntity(duplicateRun);
|
|
112
|
+
if (!runSaveSuccess) {
|
|
113
|
+
throw new Error(`Failed to update Duplicate Run record ${duplicateRun.ID}`);
|
|
114
|
+
}
|
|
115
|
+
// Step 8: Auto-merge high-confidence matches
|
|
116
|
+
this.reportProgress(options, 'Merging', recordIDs.length, recordIDs.length, totalMatchesFound, startTime);
|
|
117
|
+
await this.ProcessAutoMerges(response, entityDocument, options);
|
|
118
|
+
response.Status = 'Success';
|
|
119
|
+
LogStatus(`Duplicate detection complete: ${recordIDs.length} records checked, ${totalMatchesFound} matches found`);
|
|
120
|
+
return response;
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* Check a single record for duplicates without requiring a list.
|
|
124
|
+
* Embeds the record and queries for matches directly.
|
|
125
|
+
*/
|
|
126
|
+
async CheckSingleRecord(EntityDocumentID, RecordID, Options, ContextUser) {
|
|
127
|
+
this.CurrentUser = ContextUser;
|
|
128
|
+
const options = Options ?? {};
|
|
129
|
+
const entityDocument = await this.ValidateEntityDocument(EntityDocumentID);
|
|
130
|
+
if (!entityDocument) {
|
|
131
|
+
throw new Error(`No active Entity Document found for ID ${EntityDocumentID}`);
|
|
132
|
+
}
|
|
133
|
+
await this.InitializeProviders(entityDocument);
|
|
134
|
+
// Load the single record
|
|
135
|
+
const entityInfo = this.Metadata.EntityByID(entityDocument.EntityID);
|
|
136
|
+
if (!entityInfo) {
|
|
137
|
+
throw new Error(`Entity not found for ID ${entityDocument.EntityID}`);
|
|
138
|
+
}
|
|
139
|
+
const records = await this.RunView.RunView({
|
|
140
|
+
EntityName: entityInfo.Name,
|
|
141
|
+
ExtraFilter: this.BuildExtraFilter([RecordID]),
|
|
142
|
+
ResultType: 'entity_object',
|
|
143
|
+
}, this.CurrentUser);
|
|
144
|
+
if (!records.Success || records.Results.length === 0) {
|
|
145
|
+
throw new Error(`Record not found: ${RecordID.ToString()}`);
|
|
146
|
+
}
|
|
147
|
+
const record = records.Results[0];
|
|
46
148
|
const templateParser = EntityDocumentTemplateParser.CreateInstance();
|
|
47
|
-
await
|
|
48
|
-
const
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
149
|
+
const templateTexts = await this.GenerateTemplateTexts(templateParser, entityDocument, [record], ContextUser);
|
|
150
|
+
const embedResult = await this.embedding.EmbedTexts({ texts: templateTexts, model: null });
|
|
151
|
+
const topK = options.TopK ?? DEFAULT_TOP_K;
|
|
152
|
+
const queryResults = await this.QueryDuplicatesForRecords([record], embedResult.vectors, templateTexts, entityDocument, topK, options, this.GetQueryConcurrency(entityDocument));
|
|
153
|
+
return queryResults.length > 0 ? queryResults[0].Duplicates : new PotentialDuplicateResult();
|
|
154
|
+
}
|
|
155
|
+
// ─────────────────────────────────────────────
|
|
156
|
+
// Batch Processing
|
|
157
|
+
// ─────────────────────────────────────────────
|
|
158
|
+
/**
|
|
159
|
+
* Result from processing a single batch of records.
|
|
160
|
+
*/
|
|
161
|
+
async ProcessBatch(batchIDs, entityInfo, entityDocument, templateParser, duplicateRunID, topK, concurrency, options, startTime, totalRecords, processedSoFar, matchesSoFar, contextUser) {
|
|
162
|
+
// 6a: Load full record data for this batch (needed for template rendering)
|
|
163
|
+
const compositeKeys = batchIDs.map(id => {
|
|
164
|
+
const ck = new CompositeKey();
|
|
165
|
+
ck.KeyValuePairs.push({ FieldName: entityInfo.FirstPrimaryKey.Name, Value: id });
|
|
166
|
+
return ck;
|
|
167
|
+
});
|
|
168
|
+
const records = await this.LoadRecordsByKeys(compositeKeys, entityInfo);
|
|
169
|
+
if (records.length === 0) {
|
|
170
|
+
return { Results: [], MatchesFound: 0 };
|
|
171
|
+
}
|
|
172
|
+
// 6b: Build source record metadata map for rich UI display
|
|
173
|
+
const sourceMetadataMap = this.buildSourceMetadataMap(records, entityInfo);
|
|
174
|
+
// 6c: Create DuplicateRunDetail records for this batch
|
|
175
|
+
const duplicateRunDetails = await this.CreateRunDetailRecords(batchIDs, duplicateRunID, entityInfo, sourceMetadataMap);
|
|
176
|
+
// 6c: Generate template texts and embed
|
|
177
|
+
this.reportProgress(options, 'Embedding', totalRecords, processedSoFar, matchesSoFar, startTime);
|
|
178
|
+
const templateTexts = await this.GenerateTemplateTexts(templateParser, entityDocument, records, contextUser);
|
|
179
|
+
const embedResult = await this.embedding.EmbedTexts({ texts: templateTexts, model: null });
|
|
180
|
+
// 6d: Query vector DB for each record with concurrency control
|
|
181
|
+
this.reportProgress(options, 'Querying', totalRecords, processedSoFar, matchesSoFar, startTime);
|
|
182
|
+
const queryResults = await this.QueryDuplicatesForRecords(records, embedResult.vectors, templateTexts, entityDocument, topK, options, concurrency);
|
|
183
|
+
// 6e: Persist match results and update run details
|
|
184
|
+
this.reportProgress(options, 'Matching', totalRecords, processedSoFar + records.length, matchesSoFar, startTime);
|
|
185
|
+
const results = await this.PersistMatchResults(queryResults, duplicateRunDetails, entityDocument, options, startTime);
|
|
186
|
+
const batchMatches = results.reduce((sum, r) => sum + r.Duplicates.length, 0);
|
|
187
|
+
return { Results: results, MatchesFound: batchMatches };
|
|
188
|
+
}
|
|
189
|
+
// ─────────────────────────────────────────────
|
|
190
|
+
// Record ID Loading (multiple strategies)
|
|
191
|
+
// ─────────────────────────────────────────────
|
|
192
|
+
/**
|
|
193
|
+
* Load the IDs of records to check, using the appropriate strategy based on the request.
|
|
194
|
+
* Returns an array of primary key value strings.
|
|
195
|
+
*/
|
|
196
|
+
async LoadRecordIDsToCheck(params, entityInfo) {
|
|
197
|
+
if (params.ListID) {
|
|
198
|
+
return this.LoadRecordIDsFromList(params.ListID);
|
|
199
|
+
}
|
|
200
|
+
if (params.ViewID) {
|
|
201
|
+
return this.LoadRecordIDsFromView(params.ViewID, entityInfo);
|
|
202
|
+
}
|
|
203
|
+
// ExtraFilter or all records
|
|
204
|
+
return this.LoadRecordIDsFromEntity(entityInfo, params.ExtraFilter);
|
|
205
|
+
}
|
|
206
|
+
/**
|
|
207
|
+
* Load record IDs from a list's detail records.
|
|
208
|
+
*/
|
|
209
|
+
async LoadRecordIDsFromList(listID) {
|
|
210
|
+
const sanitizedListID = listID.replace(/'/g, "''");
|
|
211
|
+
const viewResults = await this.RunView.RunView({
|
|
212
|
+
EntityName: 'MJ: List Details',
|
|
213
|
+
ExtraFilter: `ListID = '${sanitizedListID}'`,
|
|
214
|
+
Fields: ['RecordID'],
|
|
215
|
+
ResultType: 'simple',
|
|
216
|
+
}, this.CurrentUser);
|
|
217
|
+
if (!viewResults.Success) {
|
|
218
|
+
throw new Error(`Failed to load list details: ${viewResults.ErrorMessage}`);
|
|
219
|
+
}
|
|
220
|
+
return viewResults.Results.map(r => r.RecordID);
|
|
221
|
+
}
|
|
222
|
+
/**
|
|
223
|
+
* Load record IDs by running a saved view.
|
|
224
|
+
*/
|
|
225
|
+
async LoadRecordIDsFromView(viewID, entityInfo) {
|
|
226
|
+
const pkField = entityInfo.FirstPrimaryKey.Name;
|
|
227
|
+
const sanitizedViewID = viewID.replace(/'/g, "''");
|
|
228
|
+
// Load the view definition to get its filter
|
|
229
|
+
const viewEntity = await this.RunViewForSingleValue('Views', `ID = '${sanitizedViewID}'`);
|
|
230
|
+
if (!viewEntity) {
|
|
231
|
+
throw new Error(`View not found: ${viewID}`);
|
|
232
|
+
}
|
|
233
|
+
// Run the entity with the view's filter to get IDs
|
|
234
|
+
const viewResults = await this.RunView.RunView({
|
|
235
|
+
ViewID: viewID,
|
|
236
|
+
Fields: [pkField],
|
|
237
|
+
ResultType: 'simple',
|
|
238
|
+
}, this.CurrentUser);
|
|
239
|
+
if (!viewResults.Success) {
|
|
240
|
+
throw new Error(`Failed to run view ${viewID}: ${viewResults.ErrorMessage}`);
|
|
241
|
+
}
|
|
242
|
+
return viewResults.Results.map(r => r[pkField]);
|
|
243
|
+
}
|
|
244
|
+
/**
|
|
245
|
+
* Load record IDs directly from the entity, optionally filtered.
|
|
246
|
+
* Uses Fields: ['ID'] and ResultType: 'simple' for efficiency.
|
|
247
|
+
*/
|
|
248
|
+
async LoadRecordIDsFromEntity(entityInfo, extraFilter) {
|
|
249
|
+
const pkField = entityInfo.FirstPrimaryKey.Name;
|
|
250
|
+
const viewResults = await this.RunView.RunView({
|
|
251
|
+
EntityName: entityInfo.Name,
|
|
252
|
+
ExtraFilter: extraFilter,
|
|
253
|
+
Fields: [pkField],
|
|
254
|
+
ResultType: 'simple',
|
|
255
|
+
}, this.CurrentUser);
|
|
256
|
+
if (!viewResults.Success) {
|
|
257
|
+
throw new Error(`Failed to load record IDs from ${entityInfo.Name}: ${viewResults.ErrorMessage}`);
|
|
258
|
+
}
|
|
259
|
+
return viewResults.Results.map(r => r[pkField]);
|
|
260
|
+
}
|
|
261
|
+
// ─────────────────────────────────────────────
|
|
262
|
+
// Validation & Setup
|
|
263
|
+
// ─────────────────────────────────────────────
|
|
264
|
+
/**
|
|
265
|
+
* Validate and return an entity document, or null if not found.
|
|
266
|
+
*/
|
|
267
|
+
async ValidateEntityDocument(entityDocumentID) {
|
|
268
|
+
const vectorizer = new EntityVectorSyncer();
|
|
269
|
+
vectorizer.CurrentUser = this.CurrentUser;
|
|
270
|
+
return vectorizer.GetEntityDocument(entityDocumentID);
|
|
271
|
+
}
|
|
272
|
+
/**
|
|
273
|
+
* Initialize embedding and vector DB providers via ClassFactory.
|
|
274
|
+
*/
|
|
275
|
+
async InitializeProviders(entityDocument) {
|
|
276
|
+
const aiModel = this.GetAIModel(entityDocument.AIModelID);
|
|
277
|
+
const vectorDB = this.GetVectorDatabase(entityDocument.VectorDatabaseID);
|
|
58
278
|
const embeddingAPIKey = GetAIAPIKey(aiModel.DriverClass);
|
|
59
279
|
const vectorDBAPIKey = GetAIAPIKey(vectorDB.ClassKey);
|
|
60
280
|
if (!embeddingAPIKey) {
|
|
61
|
-
throw Error(`No API Key found for AI Model ${aiModel.DriverClass}`);
|
|
281
|
+
throw new Error(`No API Key found for AI Model ${aiModel.DriverClass}`);
|
|
62
282
|
}
|
|
63
283
|
if (!vectorDBAPIKey) {
|
|
64
|
-
throw Error(`No API Key found for Vector Database ${vectorDB.ClassKey}`);
|
|
284
|
+
throw new Error(`No API Key found for Vector Database ${vectorDB.ClassKey}`);
|
|
65
285
|
}
|
|
66
|
-
|
|
67
|
-
this.
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
throw Error(`Failed to create Embeddings instance for AI Model ${aiModel.DriverClass}`);
|
|
286
|
+
this.embedding = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, aiModel.DriverClass, embeddingAPIKey);
|
|
287
|
+
this.vectorDB = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, vectorDB.ClassKey, vectorDBAPIKey);
|
|
288
|
+
if (!this.embedding) {
|
|
289
|
+
throw new Error(`Failed to create Embeddings instance for ${aiModel.DriverClass}`);
|
|
71
290
|
}
|
|
72
|
-
if (!this.
|
|
73
|
-
throw Error(`Failed to create
|
|
291
|
+
if (!this.vectorDB) {
|
|
292
|
+
throw new Error(`Failed to create VectorDB instance for ${vectorDB.ClassKey}`);
|
|
74
293
|
}
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
const template = await templateParser.Parse(sampleTemplate, entityDocument.EntityID, record, contextUser);
|
|
89
|
-
recordTemplates.push(template);
|
|
90
|
-
}
|
|
91
|
-
let embedTextsResult = await this._embedding.EmbedTexts({ texts: recordTemplates, model: null });
|
|
92
|
-
const topK = 5;
|
|
93
|
-
let results = [];
|
|
94
|
-
for (const [index, vector] of embedTextsResult.vectors.entries()) {
|
|
95
|
-
const compositeKey = records[index].PrimaryKey;
|
|
96
|
-
let filterResult = await this._vectorDB.queryIndex({ vector: vector, topK: topK, includeMetadata: true, includeValues: false });
|
|
97
|
-
if (!filterResult.success) {
|
|
98
|
-
LogError(`Failed to query index for record ${compositeKey.ToString()}`);
|
|
99
|
-
continue;
|
|
294
|
+
// Resolve the vector index name from the entity document's VectorIndexID
|
|
295
|
+
// This is the actual Pinecone/pgvector/Qdrant index name needed for QueryIndex calls
|
|
296
|
+
if (entityDocument.VectorIndexID) {
|
|
297
|
+
const rv = new RunView();
|
|
298
|
+
const indexResult = await rv.RunView({
|
|
299
|
+
EntityName: 'MJ: Vector Indexes',
|
|
300
|
+
ExtraFilter: `ID='${entityDocument.VectorIndexID}'`,
|
|
301
|
+
Fields: ['Name'],
|
|
302
|
+
ResultType: 'simple',
|
|
303
|
+
MaxRows: 1
|
|
304
|
+
}, this.CurrentUser);
|
|
305
|
+
if (indexResult.Success && indexResult.Results.length > 0) {
|
|
306
|
+
this.indexName = indexResult.Results[0].Name;
|
|
100
307
|
}
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
308
|
+
}
|
|
309
|
+
if (!this.indexName) {
|
|
310
|
+
throw new Error(`No vector index found for entity document "${entityDocument.Name}". Ensure VectorIndexID is set on the entity document.`);
|
|
311
|
+
}
|
|
312
|
+
LogStatus(`Providers initialized: AI Model=${aiModel.DriverClass}, VectorDB=${vectorDB.ClassKey}, Index=${this.indexName}`);
|
|
313
|
+
}
|
|
314
|
+
/**
|
|
315
|
+
* Run vectorization for the entity document's records.
|
|
316
|
+
*/
|
|
317
|
+
async VectorizeSourceRecords(entityDocument, contextUser) {
|
|
318
|
+
const vectorizer = new EntityVectorSyncer();
|
|
319
|
+
vectorizer.CurrentUser = contextUser;
|
|
320
|
+
const request = {
|
|
321
|
+
entityID: entityDocument.EntityID,
|
|
322
|
+
entityDocumentID: entityDocument.ID,
|
|
323
|
+
listBatchCount: 20,
|
|
324
|
+
options: {},
|
|
325
|
+
CurrentUser: contextUser,
|
|
326
|
+
};
|
|
327
|
+
LogStatus(`Vectorizing entity records for document ${entityDocument.Name}`);
|
|
328
|
+
await vectorizer.VectorizeEntity(request, contextUser);
|
|
329
|
+
}
|
|
330
|
+
/**
|
|
331
|
+
* Read the maxConcurrentRequests from the VectorDatabase entity's Configuration column,
|
|
332
|
+
* falling back to DEFAULT_QUERY_CONCURRENCY if not set.
|
|
333
|
+
*/
|
|
334
|
+
GetQueryConcurrency(entityDocument) {
|
|
335
|
+
const vectorDBEntity = this.GetVectorDatabase(entityDocument.VectorDatabaseID);
|
|
336
|
+
if (vectorDBEntity.Configuration) {
|
|
337
|
+
try {
|
|
338
|
+
const config = JSON.parse(vectorDBEntity.Configuration);
|
|
339
|
+
if (config.throughput?.maxConcurrentRequests != null) {
|
|
340
|
+
return config.throughput.maxConcurrentRequests;
|
|
117
341
|
}
|
|
118
342
|
}
|
|
119
|
-
|
|
120
|
-
|
|
343
|
+
catch {
|
|
344
|
+
// Invalid JSON in Configuration — fall through to default
|
|
121
345
|
}
|
|
122
346
|
}
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
347
|
+
return DEFAULT_QUERY_CONCURRENCY;
|
|
348
|
+
}
|
|
349
|
+
// ─────────────────────────────────────────────
|
|
350
|
+
// DuplicateRun Management
|
|
351
|
+
// ─────────────────────────────────────────────
|
|
352
|
+
/**
|
|
353
|
+
* Resolve an existing DuplicateRun or create a new one.
|
|
354
|
+
* Supports both list-based and list-free operation.
|
|
355
|
+
*/
|
|
356
|
+
async ResolveOrCreateDuplicateRun(params, entityDocument, options) {
|
|
357
|
+
// If a specific run ID was provided, load it
|
|
358
|
+
if (options.DuplicateRunID) {
|
|
359
|
+
return this.LoadDuplicateRun(options.DuplicateRunID);
|
|
360
|
+
}
|
|
361
|
+
// If a ListID is provided, try to find an existing run for that list
|
|
362
|
+
if (params.ListID) {
|
|
363
|
+
const existing = await this.FindDuplicateRunByListID(params.ListID);
|
|
364
|
+
if (existing) {
|
|
365
|
+
return existing;
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
// Create a new DuplicateRun
|
|
369
|
+
return this.CreateDuplicateRun(entityDocument, params.ListID);
|
|
370
|
+
}
|
|
371
|
+
/**
|
|
372
|
+
* Create a new DuplicateRun record.
|
|
373
|
+
*/
|
|
374
|
+
async CreateDuplicateRun(entityDocument, listID) {
|
|
375
|
+
const dupeRun = await this.Metadata.GetEntityObject('MJ: Duplicate Runs', this.CurrentUser);
|
|
376
|
+
dupeRun.NewRecord();
|
|
377
|
+
dupeRun.EntityID = entityDocument.EntityID;
|
|
378
|
+
dupeRun.StartedByUserID = this.CurrentUser?.ID;
|
|
379
|
+
dupeRun.StartedAt = new Date();
|
|
380
|
+
dupeRun.ProcessingStatus = 'In Progress';
|
|
381
|
+
dupeRun.ApprovalStatus = 'Pending';
|
|
382
|
+
if (listID) {
|
|
383
|
+
dupeRun.SourceListID = listID;
|
|
384
|
+
}
|
|
385
|
+
const success = await this.SaveEntity(dupeRun);
|
|
127
386
|
if (!success) {
|
|
128
|
-
throw new Error(
|
|
387
|
+
throw new Error('Failed to create Duplicate Run record');
|
|
129
388
|
}
|
|
130
|
-
|
|
131
|
-
response.PotentialDuplicateResult = results;
|
|
132
|
-
response.Status = 'Success';
|
|
133
|
-
LogStatus("Dupe Run complete. Response:");
|
|
134
|
-
LogStatus(JSON.stringify(response, null, "\t"));
|
|
135
|
-
return response;
|
|
389
|
+
return dupeRun;
|
|
136
390
|
}
|
|
137
|
-
async
|
|
138
|
-
const
|
|
391
|
+
async LoadDuplicateRun(duplicateRunID) {
|
|
392
|
+
const dupeRun = await this.Metadata.GetEntityObject('MJ: Duplicate Runs', this.CurrentUser);
|
|
393
|
+
dupeRun.ContextCurrentUser = this.CurrentUser;
|
|
394
|
+
const success = await dupeRun.Load(duplicateRunID);
|
|
395
|
+
if (!success) {
|
|
396
|
+
throw new Error(`Failed to load Duplicate Run record ${duplicateRunID}`);
|
|
397
|
+
}
|
|
398
|
+
return dupeRun;
|
|
399
|
+
}
|
|
400
|
+
/**
|
|
401
|
+
* Try to find an existing DuplicateRun for a given ListID. Returns null if none found.
|
|
402
|
+
*/
|
|
403
|
+
async FindDuplicateRunByListID(listID) {
|
|
404
|
+
return this.RunViewForSingleValue('MJ: Duplicate Runs', `SourceListID = '${listID.replace(/'/g, "''")}'`);
|
|
405
|
+
}
|
|
406
|
+
// ─────────────────────────────────────────────
|
|
407
|
+
// Entity Loading
|
|
408
|
+
// ─────────────────────────────────────────────
|
|
409
|
+
/**
|
|
410
|
+
* Load full entity objects for a batch of composite keys.
|
|
411
|
+
*/
|
|
412
|
+
async LoadRecordsByKeys(compositeKeys, entityInfo) {
|
|
413
|
+
if (compositeKeys.length === 0) {
|
|
414
|
+
return [];
|
|
415
|
+
}
|
|
416
|
+
const rvResult = await this.RunView.RunView({
|
|
417
|
+
EntityName: entityInfo.Name,
|
|
418
|
+
ExtraFilter: this.BuildExtraFilter(compositeKeys),
|
|
419
|
+
ResultType: 'entity_object',
|
|
420
|
+
}, this.CurrentUser);
|
|
421
|
+
if (!rvResult.Success) {
|
|
422
|
+
throw new Error(rvResult.ErrorMessage);
|
|
423
|
+
}
|
|
424
|
+
return rvResult.Results;
|
|
425
|
+
}
|
|
426
|
+
/**
|
|
427
|
+
* Load records from an entity that are members of the specified list.
|
|
428
|
+
* Kept for backward compatibility.
|
|
429
|
+
*/
|
|
430
|
+
async LoadRecordsByListID(listID, entityID) {
|
|
431
|
+
const entityInfo = this.Metadata.EntityByID(entityID);
|
|
139
432
|
if (!entityInfo) {
|
|
140
|
-
throw new Error(`
|
|
433
|
+
throw new Error(`Entity not found for ID ${entityID}`);
|
|
141
434
|
}
|
|
142
|
-
const
|
|
435
|
+
const sanitizedListID = listID.replace(/'/g, "''");
|
|
436
|
+
const rvResult = await this.RunView.RunView({
|
|
143
437
|
EntityName: entityInfo.Name,
|
|
144
|
-
ExtraFilter: `ID IN (SELECT RecordID FROM __mj.vwListDetails WHERE ListID = '${
|
|
145
|
-
ResultType: 'entity_object'
|
|
146
|
-
},
|
|
438
|
+
ExtraFilter: `ID IN (SELECT RecordID FROM __mj.vwListDetails WHERE ListID = '${sanitizedListID}')`,
|
|
439
|
+
ResultType: 'entity_object',
|
|
440
|
+
}, this.CurrentUser);
|
|
147
441
|
if (!rvResult.Success) {
|
|
148
442
|
throw new Error(rvResult.ErrorMessage);
|
|
149
443
|
}
|
|
150
444
|
return rvResult.Results;
|
|
151
445
|
}
|
|
152
|
-
async
|
|
153
|
-
const
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
duplicateRun.StartedAt = new Date();
|
|
159
|
-
duplicateRun.ProcessingStatus = 'In Progress';
|
|
160
|
-
duplicateRun.ApprovalStatus = 'Pending';
|
|
161
|
-
duplicateRun.SourceListID = listID;
|
|
162
|
-
const saveResult = await super.SaveEntity(duplicateRun);
|
|
163
|
-
if (!saveResult) {
|
|
164
|
-
throw new Error(`Failed to save list for Potential Duplicate Run`);
|
|
165
|
-
}
|
|
166
|
-
return duplicateRun;
|
|
167
|
-
}
|
|
168
|
-
async createDuplicateRunDetailRecords(recordIDs, duplicateRunID) {
|
|
169
|
-
let results = [];
|
|
170
|
-
const md = new Metadata();
|
|
171
|
-
for (const recordID of recordIDs) {
|
|
172
|
-
let runDetail = await md.GetEntityObject('MJ: Duplicate Run Details');
|
|
173
|
-
runDetail.NewRecord();
|
|
174
|
-
runDetail.DuplicateRunID = duplicateRunID;
|
|
175
|
-
runDetail.RecordID = recordID.ToString();
|
|
176
|
-
runDetail.MatchStatus = 'Pending';
|
|
177
|
-
runDetail.MergeStatus = 'Pending';
|
|
178
|
-
const success = await super.SaveEntity(runDetail);
|
|
179
|
-
if (success) {
|
|
180
|
-
results.push(runDetail);
|
|
181
|
-
}
|
|
446
|
+
async LoadListEntity(listID) {
|
|
447
|
+
const list = await this.Metadata.GetEntityObject('MJ: Lists');
|
|
448
|
+
list.ContextCurrentUser = this.CurrentUser;
|
|
449
|
+
const success = await list.Load(listID);
|
|
450
|
+
if (!success) {
|
|
451
|
+
throw new Error(`Failed to load List record ${listID}`);
|
|
182
452
|
}
|
|
183
|
-
return
|
|
453
|
+
return list;
|
|
184
454
|
}
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
const
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
const
|
|
205
|
-
if (
|
|
206
|
-
|
|
455
|
+
// ─────────────────────────────────────────────
|
|
456
|
+
// Template Generation & Embedding
|
|
457
|
+
// ─────────────────────────────────────────────
|
|
458
|
+
/**
|
|
459
|
+
* Generate human-readable template text for each record using the entity document template.
|
|
460
|
+
*
|
|
461
|
+
* Loads the template from TemplateEngineServer and renders it via Nunjucks,
|
|
462
|
+
* matching the same approach used by the vectorization pipeline.
|
|
463
|
+
*/
|
|
464
|
+
async GenerateTemplateTexts(templateParser, entityDocument, records, contextUser) {
|
|
465
|
+
await TemplateEngineServer.Instance.Config(false, contextUser);
|
|
466
|
+
const template = this.loadTemplate(entityDocument);
|
|
467
|
+
const templateContent = template.Content[0];
|
|
468
|
+
TemplateEngineServer.Instance.SetupNunjucks();
|
|
469
|
+
const templateTexts = [];
|
|
470
|
+
for (const record of records) {
|
|
471
|
+
// NEW convention: main entity fields are TOP-LEVEL variables (no Entity. prefix).
|
|
472
|
+
// Spread record fields directly into root context so templates use {{FieldName}}.
|
|
473
|
+
const data = { ...record.GetAll() };
|
|
474
|
+
const result = await TemplateEngineServer.Instance.RenderTemplate(template, templateContent, data, true);
|
|
475
|
+
if (result.Success) {
|
|
476
|
+
templateTexts.push(result.Output);
|
|
207
477
|
}
|
|
208
478
|
else {
|
|
209
|
-
LogError(
|
|
479
|
+
LogError(`Template render failed for record ${record.PrimaryKey.ToString()}: ${result.Message}`);
|
|
480
|
+
templateTexts.push('');
|
|
210
481
|
}
|
|
211
482
|
}
|
|
212
|
-
return
|
|
483
|
+
return templateTexts;
|
|
213
484
|
}
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
const
|
|
219
|
-
if (!
|
|
220
|
-
throw new Error(`
|
|
485
|
+
/**
|
|
486
|
+
* Load the template entity from TemplateEngineServer for the given entity document.
|
|
487
|
+
*/
|
|
488
|
+
loadTemplate(entityDocument) {
|
|
489
|
+
const template = TemplateEngineServer.Instance.Templates.find((t) => UUIDsEqual(t.ID, entityDocument.TemplateID));
|
|
490
|
+
if (!template) {
|
|
491
|
+
throw new Error(`Template not found for ID ${entityDocument.TemplateID}`);
|
|
221
492
|
}
|
|
222
|
-
|
|
493
|
+
if (template.Content.length === 0) {
|
|
494
|
+
throw new Error(`Template ${template.ID} has no content records`);
|
|
495
|
+
}
|
|
496
|
+
return template;
|
|
223
497
|
}
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
498
|
+
// ─────────────────────────────────────────────
|
|
499
|
+
// Vector Query & Hybrid Search
|
|
500
|
+
// ─────────────────────────────────────────────
|
|
501
|
+
/**
|
|
502
|
+
* Query the vector DB for duplicates of each record, with concurrency control.
|
|
503
|
+
* Supports hybrid search and RRF fusion when the vector DB supports it.
|
|
504
|
+
*/
|
|
505
|
+
async QueryDuplicatesForRecords(records, vectors, templateTexts, entityDocument, topK, options, concurrency) {
|
|
506
|
+
const tasks = records.map((record, index) => async () => {
|
|
507
|
+
const compositeKey = record.PrimaryKey;
|
|
508
|
+
const vector = vectors[index];
|
|
509
|
+
const templateText = templateTexts[index];
|
|
510
|
+
const queryResponse = await this.executeVectorQuery(vector, templateText, topK, options);
|
|
511
|
+
if (!queryResponse.success) {
|
|
512
|
+
LogError(`Failed to query index for record ${compositeKey.ToString()}`);
|
|
513
|
+
const emptyResult = new PotentialDuplicateResult();
|
|
514
|
+
emptyResult.EntityID = entityDocument.EntityID;
|
|
515
|
+
emptyResult.RecordCompositeKey = compositeKey;
|
|
516
|
+
return { SourceKey: compositeKey, TemplateText: templateText, Duplicates: emptyResult };
|
|
517
|
+
}
|
|
518
|
+
const dupeResult = this.ParseVectorMatches(queryResponse, compositeKey);
|
|
519
|
+
dupeResult.Duplicates = this.FilterSelfMatches(dupeResult.Duplicates, compositeKey);
|
|
520
|
+
const potentialThreshold = options.PotentialMatchThreshold ?? entityDocument.PotentialMatchThreshold;
|
|
521
|
+
dupeResult.Duplicates = dupeResult.Duplicates.filter((d) => d.ProbabilityScore >= potentialThreshold);
|
|
522
|
+
dupeResult.EntityID = entityDocument.EntityID;
|
|
523
|
+
dupeResult.RecordCompositeKey = compositeKey;
|
|
524
|
+
return { SourceKey: compositeKey, TemplateText: templateText, Duplicates: dupeResult };
|
|
525
|
+
});
|
|
526
|
+
return RunWithConcurrency(tasks, concurrency);
|
|
527
|
+
}
|
|
528
|
+
/**
|
|
529
|
+
* Execute a vector query — uses hybrid search with RRF when the provider supports it.
|
|
530
|
+
*/
|
|
531
|
+
async executeVectorQuery(vector, templateText, topK, options) {
|
|
532
|
+
if (this.vectorDB.SupportsHybridSearch && templateText) {
|
|
533
|
+
return this.vectorDB.HybridQuery({
|
|
534
|
+
vector,
|
|
535
|
+
topK,
|
|
536
|
+
KeywordQuery: templateText,
|
|
537
|
+
Alpha: options.KeywordSearchWeight != null ? (1.0 - options.KeywordSearchWeight) : 0.7,
|
|
538
|
+
FusionMethod: options.FusionMethod ?? 'rrf',
|
|
539
|
+
includeMetadata: true,
|
|
540
|
+
includeValues: false,
|
|
541
|
+
});
|
|
231
542
|
}
|
|
232
|
-
return
|
|
543
|
+
return this.vectorDB.QueryIndex({
|
|
544
|
+
id: this.indexName,
|
|
545
|
+
vector,
|
|
546
|
+
topK,
|
|
547
|
+
includeMetadata: true,
|
|
548
|
+
includeValues: false,
|
|
549
|
+
});
|
|
233
550
|
}
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
async createListForDupeRun(entityDocument) {
|
|
242
|
-
const md = new Metadata();
|
|
243
|
-
const list = await md.GetEntityObject('MJ: Lists');
|
|
244
|
-
list.NewRecord();
|
|
245
|
-
list.Name = `Potential Duplicate Run`;
|
|
246
|
-
list.Description = `Potential Duplicate Run for ${entityDocument.Entity} Entity`;
|
|
247
|
-
list.EntityID = entityDocument.EntityID;
|
|
248
|
-
list.UserID = super.CurrentUser.ID;
|
|
249
|
-
const saveResult = await super.SaveEntity(list);
|
|
250
|
-
if (!saveResult) {
|
|
251
|
-
throw new Error(`Failed to save list for Potential Duplicate Run`);
|
|
551
|
+
/**
|
|
552
|
+
* Parse raw vector DB matches into a PotentialDuplicateResult.
|
|
553
|
+
*/
|
|
554
|
+
ParseVectorMatches(queryResponse, sourceKey) {
|
|
555
|
+
const result = new PotentialDuplicateResult();
|
|
556
|
+
if (!queryResponse.data?.matches) {
|
|
557
|
+
return result;
|
|
252
558
|
}
|
|
253
|
-
|
|
559
|
+
for (const match of queryResponse.data.matches) {
|
|
560
|
+
if (!match?.id)
|
|
561
|
+
continue;
|
|
562
|
+
if (!match.metadata?.RecordID) {
|
|
563
|
+
LogError(`Invalid vector metadata for match: ${match.id}`);
|
|
564
|
+
continue;
|
|
565
|
+
}
|
|
566
|
+
const duplicate = new PotentialDuplicate();
|
|
567
|
+
duplicate.LoadFromConcatenatedString(match.metadata.RecordID);
|
|
568
|
+
duplicate.ProbabilityScore = match.score;
|
|
569
|
+
// Capture the full vector metadata for rich UI display
|
|
570
|
+
duplicate.VectorMetadata = { ...match.metadata };
|
|
571
|
+
result.Duplicates.push(duplicate);
|
|
572
|
+
}
|
|
573
|
+
return result;
|
|
254
574
|
}
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
575
|
+
/**
|
|
576
|
+
* Filter out self-matches where the candidate is the same record as the source.
|
|
577
|
+
*/
|
|
578
|
+
/**
|
|
579
|
+
* Build a map of recordID → JSON metadata string from loaded BaseEntity records.
|
|
580
|
+
* Extracts the entity's name field and a few key display fields for rich UI rendering.
|
|
581
|
+
*/
|
|
582
|
+
buildSourceMetadataMap(records, entityInfo) {
|
|
583
|
+
const metadataMap = new Map();
|
|
584
|
+
const nameField = entityInfo.NameField;
|
|
585
|
+
// Collect a small set of useful display fields
|
|
586
|
+
const displayFieldNames = ['Name', 'Title', 'Description', 'Status', 'Type']
|
|
587
|
+
.filter(fn => entityInfo.Fields.find(f => f.Name === fn));
|
|
588
|
+
for (const record of records) {
|
|
589
|
+
const pk = record.PrimaryKey;
|
|
590
|
+
const id = pk.KeyValuePairs.length === 1 ? String(pk.KeyValuePairs[0].Value) : pk.Values();
|
|
591
|
+
const meta = {
|
|
592
|
+
Entity: entityInfo.Name,
|
|
593
|
+
};
|
|
594
|
+
if (entityInfo.Icon) {
|
|
595
|
+
meta['EntityIcon'] = entityInfo.Icon;
|
|
596
|
+
}
|
|
597
|
+
if (nameField) {
|
|
598
|
+
const nameVal = record.Get(nameField.Name);
|
|
599
|
+
if (nameVal != null)
|
|
600
|
+
meta['Name'] = String(nameVal);
|
|
601
|
+
}
|
|
602
|
+
for (const fn of displayFieldNames) {
|
|
603
|
+
if (fn !== nameField?.Name) {
|
|
604
|
+
const val = record.Get(fn);
|
|
605
|
+
if (val != null) {
|
|
606
|
+
const str = String(val);
|
|
607
|
+
meta[fn] = str.length > 200 ? str.substring(0, 197) + '...' : str;
|
|
608
|
+
}
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
metadataMap.set(id, JSON.stringify(meta));
|
|
612
|
+
}
|
|
613
|
+
return metadataMap;
|
|
614
|
+
}
|
|
615
|
+
FilterSelfMatches(duplicates, sourceKey) {
|
|
616
|
+
return duplicates.filter((d) => d.ToString() !== sourceKey.ToString());
|
|
617
|
+
}
|
|
618
|
+
// ─────────────────────────────────────────────
|
|
619
|
+
// Run Detail & Match Persistence (Batched)
|
|
620
|
+
// ─────────────────────────────────────────────
|
|
621
|
+
/**
|
|
622
|
+
* Create DuplicateRunDetail records for a batch of record IDs.
|
|
623
|
+
*/
|
|
624
|
+
async CreateRunDetailRecords(recordIDs, duplicateRunID, entityInfo, metadataMap) {
|
|
625
|
+
const results = [];
|
|
626
|
+
const pkFieldName = entityInfo.FirstPrimaryKey.Name;
|
|
627
|
+
for (const batch of chunkArray(recordIDs, SAVE_BATCH_SIZE)) {
|
|
628
|
+
const batchResults = await Promise.all(batch.map(async (recordID) => {
|
|
629
|
+
const runDetail = await this.Metadata.GetEntityObject('MJ: Duplicate Run Details', this.CurrentUser);
|
|
630
|
+
runDetail.NewRecord();
|
|
631
|
+
runDetail.DuplicateRunID = duplicateRunID;
|
|
632
|
+
// Store RecordID in standard MJ URL segment format (e.g., "ID|uuid")
|
|
633
|
+
runDetail.RecordID = `${pkFieldName}|${recordID}`;
|
|
634
|
+
runDetail.MatchStatus = 'Pending';
|
|
635
|
+
runDetail.MergeStatus = 'Pending';
|
|
636
|
+
runDetail.RecordMetadata = metadataMap?.get(recordID) ?? null;
|
|
637
|
+
const success = await this.SaveEntity(runDetail);
|
|
638
|
+
if (!success) {
|
|
639
|
+
LogError("Failed to save MJDuplicateRunDetailEntity", undefined, runDetail.LatestResult);
|
|
640
|
+
return null;
|
|
641
|
+
}
|
|
642
|
+
return runDetail;
|
|
643
|
+
}));
|
|
644
|
+
for (const r of batchResults) {
|
|
645
|
+
if (r)
|
|
646
|
+
results.push(r);
|
|
647
|
+
}
|
|
648
|
+
}
|
|
649
|
+
return results;
|
|
650
|
+
}
|
|
/**
 * Persist match results for each queried record and mark the corresponding
 * run detail rows complete.
 *
 * Side effects:
 *  - Mutates each qr.Duplicates.Duplicates array in place (inverse-pair filter).
 *  - Adds surviving pair keys to this._seenPairs (run-level shared state).
 *  - Saves match records and updates run detail MatchStatus via SaveEntity.
 *
 * @returns The (mutated) PotentialDuplicateResult objects, one per query result.
 */
async PersistMatchResults(queryResults, duplicateRunDetails, entityDocument, options, startTime) {
    const results = [];
    let matchesFound = 0;
    for (const qr of queryResults) {
        // Filter out inverse duplicates: if A→B was already persisted, skip B→A
        const sourceId = qr.SourceKey.Values();
        qr.Duplicates.Duplicates = qr.Duplicates.Duplicates.filter(dupe => {
            const matchId = dupe.Values();
            // Order-independent pair key so (A,B) and (B,A) collide.
            const pairKey = sourceId < matchId ? `${sourceId}::${matchId}` : `${matchId}::${sourceId}`;
            if (this._seenPairs.has(pairKey)) {
                return false; // Inverse already recorded
            }
            this._seenPairs.add(pairKey);
            return true;
        });
        results.push(qr.Duplicates);
        matchesFound += qr.Duplicates.Duplicates.length;
        const sourceKey = qr.SourceKey;
        // Locate the run detail row created earlier for this source record;
        // RecordID is stored in concatenated key format, so parse before comparing.
        const detail = duplicateRunDetails.find((d) => {
            const detailKey = new CompositeKey();
            detailKey.LoadFromConcatenatedString(d.RecordID);
            return detailKey.Equals(sourceKey);
        });
        if (detail) {
            const matchRecords = await this.CreateMatchRecordsForDetail(detail.ID, qr.Duplicates);
            // NOTE(review): CreateMatchRecordsForDetail drops failed saves, so
            // these IDs may not stay index-aligned with Duplicates; auto-merge
            // indexes into this array — confirm alignment assumption.
            qr.Duplicates.DuplicateRunDetailMatchRecordIDs = matchRecords.map((m) => m.ID);
            detail.MatchStatus = 'Complete';
            const success = await this.SaveEntity(detail);
            if (!success) {
                LogError(`Failed to update Duplicate Run Detail record ${detail.ID}`);
            }
        }
        else {
            LogError(`No Duplicate Run Detail found for ${qr.SourceKey.ToString()}`);
        }
        this.reportProgress(options, 'Matching', queryResults.length, results.length, matchesFound, startTime);
    }
    return results;
}
|
|
693
|
+
/**
|
|
694
|
+
* Create match records for a single run detail, saving in parallel batches.
|
|
695
|
+
*/
|
|
696
|
+
async CreateMatchRecordsForDetail(duplicateRunDetailID, duplicateResult) {
|
|
697
|
+
const matchRecords = [];
|
|
698
|
+
for (const batch of chunkArray(duplicateResult.Duplicates, SAVE_BATCH_SIZE)) {
|
|
699
|
+
const batchResults = await Promise.all(batch.map(async (dupe) => {
|
|
700
|
+
const match = await this.Metadata.GetEntityObject('MJ: Duplicate Run Detail Matches', this.CurrentUser);
|
|
701
|
+
match.NewRecord();
|
|
702
|
+
match.DuplicateRunDetailID = duplicateRunDetailID;
|
|
703
|
+
match.MatchRecordID = dupe.ToURLSegment();
|
|
704
|
+
match.MatchProbability = dupe.ProbabilityScore;
|
|
705
|
+
match.MatchedAt = new Date();
|
|
706
|
+
match.Action = '';
|
|
707
|
+
match.ApprovalStatus = 'Pending';
|
|
708
|
+
match.MergeStatus = 'Pending';
|
|
709
|
+
match.RecordMetadata = dupe.VectorMetadata ? JSON.stringify(dupe.VectorMetadata) : null;
|
|
710
|
+
const success = await this.SaveEntity(match);
|
|
711
|
+
return success ? match : null;
|
|
712
|
+
}));
|
|
713
|
+
for (const m of batchResults) {
|
|
714
|
+
if (m)
|
|
715
|
+
matchRecords.push(m);
|
|
272
716
|
}
|
|
273
717
|
}
|
|
274
718
|
return matchRecords;
|
|
275
719
|
}
|
|
// ─────────────────────────────────────────────
// Auto-Merge
// ─────────────────────────────────────────────
/**
 * Automatically merge duplicates whose probability meets or exceeds the
 * absolute match threshold (the per-call option overrides the entity
 * document's configured threshold). The source record of each result is
 * always the surviving record. Merge failures are logged and processing
 * continues with the next candidate.
 */
async ProcessAutoMerges(response, entityDocument, options = {}) {
    const absoluteThreshold = options.AbsoluteMatchThreshold ?? entityDocument.AbsoluteMatchThreshold;
    for (const dupeResult of response.PotentialDuplicateResult) {
        for (const [index, dupe] of dupeResult.Duplicates.entries()) {
            // Below the auto-merge bar: leave for manual review.
            if (dupe.ProbabilityScore < absoluteThreshold) {
                continue;
            }
            const mergeParams = new RecordMergeRequest();
            mergeParams.EntityName = entityDocument.Entity;
            mergeParams.SurvivingRecordCompositeKey = dupeResult.RecordCompositeKey;
            mergeParams.RecordsToMerge = [dupe];
            const mergeResult = await this.Metadata.MergeRecords(mergeParams, this.CurrentUser);
            if (mergeResult.Success) {
                // NOTE(review): assumes DuplicateRunDetailMatchRecordIDs is
                // index-aligned with Duplicates; failed match-record saves
                // upstream could shift indices — confirm.
                await this.updateMatchRecordAfterMerge(dupeResult.DuplicateRunDetailMatchRecordIDs[index]);
            }
            else {
                LogError(`Failed to merge ${dupeResult.RecordCompositeKey.ToString()} and ${dupe.ToString()}`);
            }
        }
    }
}
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
747
|
+
/**
|
|
748
|
+
* Update a match record's status after a successful merge.
|
|
749
|
+
*/
|
|
750
|
+
async updateMatchRecordAfterMerge(matchRecordID) {
|
|
751
|
+
const matchRecord = await this.Metadata.GetEntityObject('MJ: Duplicate Run Detail Matches', this.CurrentUser);
|
|
752
|
+
const loaded = await matchRecord.Load(matchRecordID);
|
|
753
|
+
if (!loaded) {
|
|
754
|
+
LogError(`Failed to load match record ${matchRecordID} for merge status update`);
|
|
755
|
+
return;
|
|
756
|
+
}
|
|
757
|
+
matchRecord.MergeStatus = 'Complete';
|
|
758
|
+
matchRecord.Action = 'Merged';
|
|
759
|
+
matchRecord.MergedAt = new Date();
|
|
760
|
+
const saved = await matchRecord.Save();
|
|
761
|
+
if (!saved) {
|
|
762
|
+
LogError(`Failed to update match record ${matchRecordID} after merge`);
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
// ─────────────────────────────────────────────
|
|
766
|
+
// Progress Reporting
|
|
767
|
+
// ─────────────────────────────────────────────
|
|
768
|
+
reportProgress(options, phase, totalRecords, processedRecords, matchesFound, startTime, currentRecordID) {
|
|
769
|
+
if (options.OnProgress) {
|
|
770
|
+
options.OnProgress({
|
|
771
|
+
Phase: phase,
|
|
772
|
+
TotalRecords: totalRecords,
|
|
773
|
+
ProcessedRecords: processedRecords,
|
|
774
|
+
MatchesFound: matchesFound,
|
|
775
|
+
CurrentRecordID: currentRecordID,
|
|
776
|
+
ElapsedMs: Date.now() - startTime,
|
|
777
|
+
});
|
|
778
|
+
}
|
|
779
|
+
}
|
|
780
|
+
}
|
|
// ─────────────────────────────────────────────
// Utility Functions
// ─────────────────────────────────────────────
/**
 * Split an array into consecutive chunks of at most `chunkSize` elements.
 * The final chunk may be shorter. The input array is not modified.
 *
 * @param {Array} array - Items to partition.
 * @param {number} chunkSize - Maximum chunk length; must be a positive integer.
 * @returns {Array<Array>} Array of chunks (empty when `array` is empty).
 * @throws {RangeError} When `chunkSize` is not a positive integer — the
 *         previous implementation looped forever on chunkSize <= 0.
 */
function chunkArray(array, chunkSize) {
    if (!Number.isInteger(chunkSize) || chunkSize < 1) {
        throw new RangeError(`chunkSize must be a positive integer, got ${chunkSize}`);
    }
    const chunks = [];
    for (let i = 0; i < array.length; i += chunkSize) {
        chunks.push(array.slice(i, i + chunkSize));
    }
    return chunks;
}
|
|
/**
 * Run an array of zero-argument async task functions with a concurrency limit.
 * Up to `limit` tasks execute in parallel; results are returned in task order.
 *
 * @param {Array<() => Promise<*>>} tasks - Task thunks to execute.
 * @param {number} limit - Maximum number of tasks in flight. Values below 1
 *        are clamped to 1 — previously a limit of 0 (or negative) spawned no
 *        workers and silently returned an empty array without running any task.
 * @returns {Promise<Array<*>>} Results aligned with `tasks` by index.
 */
async function RunWithConcurrency(tasks, limit) {
    const results = [];
    let nextIndex = 0;
    // Each worker repeatedly claims the next unclaimed task until none remain.
    async function runNext() {
        while (nextIndex < tasks.length) {
            const currentIndex = nextIndex++;
            results[currentIndex] = await tasks[currentIndex]();
        }
    }
    const workerCount = Math.max(1, Math.min(limit, tasks.length));
    const workers = Array.from({ length: workerCount }, () => runNext());
    await Promise.all(workers);
    return results;
}
|
|
332
811
|
//# sourceMappingURL=duplicateRecordDetector.js.map
|