@memberjunction/ai-vector-dupe 5.20.0 → 5.22.0
- package/README.md +254 -230
- package/dist/duplicateRecordDetector.d.ts +116 -18
- package/dist/duplicateRecordDetector.d.ts.map +1 -1
- package/dist/duplicateRecordDetector.js +465 -262
- package/dist/duplicateRecordDetector.js.map +1 -1
- package/dist/index.d.ts +2 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -3
- package/dist/index.js.map +1 -1
- package/dist/scoring/ReciprocalRankFusion.d.ts +45 -0
- package/dist/scoring/ReciprocalRankFusion.d.ts.map +1 -0
- package/dist/scoring/ReciprocalRankFusion.js +63 -0
- package/dist/scoring/ReciprocalRankFusion.js.map +1 -0
- package/package.json +10 -10
- package/dist/config.d.ts +0 -13
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js +0 -15
- package/dist/config.js.map +0 -1
- package/dist/generic/vectorSyncBase.d.ts +0 -20
- package/dist/generic/vectorSyncBase.d.ts.map +0 -1
- package/dist/generic/vectorSyncBase.js +0 -42
- package/dist/generic/vectorSyncBase.js.map +0 -1
- package/dist/models/entitySyncConfig.d.ts +0 -36
- package/dist/models/entitySyncConfig.d.ts.map +0 -1
- package/dist/models/entitySyncConfig.js +0 -2
- package/dist/models/entitySyncConfig.js.map +0 -1
package/dist/duplicateRecordDetector.js:

```diff
@@ -1,332 +1,535 @@
+/**
+ * @fileoverview Modernized duplicate record detection engine.
+ *
+ * Orchestrates the full pipeline: vectorize records, query for similar candidates,
+ * optionally apply hybrid search (RRF) and reranking, persist match results,
+ * and auto-merge high-confidence duplicates.
+ *
+ * Supports both list-based batch detection and single-record checks.
+ *
+ * @module @memberjunction/ai-vector-dupe
+ */
 import { BaseEmbeddings, GetAIAPIKey } from "@memberjunction/ai";
-import { PotentialDuplicateResponse,
-import { LogStatus } from "@memberjunction/core";
+import { PotentialDuplicateResponse, PotentialDuplicateResult, LogError, LogStatus, RecordMergeRequest, PotentialDuplicate, } from "@memberjunction/core";
 import { VectorDBBase } from "@memberjunction/ai-vectordb";
 import { MJGlobal, UUIDsEqual } from "@memberjunction/global";
 import { VectorBase } from "@memberjunction/ai-vectors";
 import { EntityDocumentTemplateParser, EntityVectorSyncer } from "@memberjunction/ai-vector-sync";
+import { TemplateEngineServer } from "@memberjunction/templates";
+/** Default number of nearest neighbors to retrieve per record */
+const DEFAULT_TOP_K = 5;
+/** Default concurrency limit for parallel vector queries */
+const QUERY_CONCURRENCY_LIMIT = 5;
+/** Default batch size for parallel database saves */
+const SAVE_BATCH_SIZE = 20;
+/**
+ * Modernized duplicate record detection engine.
+ *
+ * Supports:
+ * - List-based batch detection (getDuplicateRecords)
+ * - Single-record duplicate check (CheckSingleRecord)
+ * - Hybrid search via RRF when vector DB supports it
+ * - Optional post-retrieval reranking via MJ's BaseReranker
+ * - Configurable topK, thresholds, and progress reporting
+ */
 export class DuplicateRecordDetector extends VectorBase {
-
-
-
-
-
-
-
-
-
-
-        vectorizer.CurrentUser = super.CurrentUser;
-        let entityDocument = await vectorizer.GetEntityDocument(params.EntityDocumentID);
-        if (!entityDocument) {
-            throw Error(`No Entity Document found with ID ${params.EntityDocumentID}`);
-            //Update: No longer creating an entity docuement if one is not found
-            //If an entitiy document is not found, that is our indicator that the
-            //underlying entity's records have not been vectorized yet
-            //const defaultVectorDB: MJVectorDatabaseEntity = super.getVectorDatabase();
-            //const defaultAIModel: MJAIModelEntity = super.getAIModel();
-            //entityDocument = await this.createEntityDocumentForEntity(params.EntityID, defaultVectorDB, defaultAIModel);
-        }
-        let response = new PotentialDuplicateResponse();
+    /**
+     * Run duplicate detection for all records in a list.
+     *
+     * Flow: validate → vectorize → embed → query → (optional rerank) → persist → (optional merge)
+     */
+    async GetDuplicateRecords(params, contextUser) {
+        this.CurrentUser = contextUser;
+        const options = params.Options ?? {};
+        const startTime = Date.now();
+        const response = new PotentialDuplicateResponse();
         response.PotentialDuplicateResult = [];
+        // Step 1: Validate entity document
+        const entityDocument = await this.ValidateEntityDocument(params.EntityDocumentID);
         if (!entityDocument) {
-            response.ErrorMessage = `No active Entity Document found for
+            response.ErrorMessage = `No active Entity Document found for ID ${params.EntityDocumentID}`;
             response.Status = 'Error';
             return response;
         }
-        //
-
-
-
-
-
-
-
-
-
-
-        const
-
-
-        const duplicateRunDetails = await this.createDuplicateRunDetailRecordsByListID(list.ID, duplicateRun.ID);
-        //await this.createListDetailsForDupeRun(params.RecordIDs, list.ID);
-        LogStatus(`Using vector database ${entityDocument.VectorDatabaseID} and AI Model ${entityDocument.AIModelID}`);
-        const vectorDB = super.GetVectorDatabase(entityDocument.VectorDatabaseID);
-        const aiModel = super.GetAIModel(entityDocument.AIModelID);
-        LogStatus(`AIModel driver class: ${aiModel.DriverClass}`);
-        LogStatus(`VectorDB class key: ${vectorDB.ClassKey}`);
-        const embeddingAPIKey = GetAIAPIKey(aiModel.DriverClass);
-        const vectorDBAPIKey = GetAIAPIKey(vectorDB.ClassKey);
-        if (!embeddingAPIKey) {
-            throw Error(`No API Key found for AI Model ${aiModel.DriverClass}`);
-        }
-        if (!vectorDBAPIKey) {
-            throw Error(`No API Key found for Vector Database ${vectorDB.ClassKey}`);
-        }
-        //LogStatus(`Embedding API Key: ${embeddingAPIKey} VectorDB API Key: ${vectorDBAPIKey}`);
-        this._embedding = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, aiModel.DriverClass, embeddingAPIKey);
-        this._vectorDB = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, vectorDB.ClassKey, vectorDBAPIKey);
-        if (!this._embedding) {
-            throw Error(`Failed to create Embeddings instance for AI Model ${aiModel.DriverClass}`);
-        }
-        if (!this._vectorDB) {
-            throw Error(`Failed to create Vector Database instance for ${vectorDB.ClassKey}`);
-        }
-        let records = await this.GetRecordsByListID(list.ID, entityDocument.EntityID);
+        // Step 2: Vectorize source records
+        this.reportProgress(options, 'Vectorizing', 0, 0, 0, startTime);
+        await this.VectorizeSourceRecords(entityDocument, contextUser);
+        // Step 3: Initialize providers
+        this.InitializeProviders(entityDocument);
+        // Step 4: Load list and duplicate run
+        const list = await this.LoadListEntity(params.ListID);
+        const duplicateRun = options.DuplicateRunID
+            ? await this.LoadDuplicateRun(options.DuplicateRunID)
+            : await this.LoadDuplicateRunByListID(list.ID);
+        // Step 5: Create run detail records in batches
+        const duplicateRunDetails = await this.CreateRunDetailRecordsFromList(list.ID, duplicateRun.ID);
+        // Step 6: Load and embed records
+        const records = await this.LoadRecordsByListID(list.ID, entityDocument.EntityID);
         if (records.length === 0) {
-            LogError(`No records found in list ${list.Name}
+            LogError(`No records found in list ${list.Name}`);
             response.ErrorMessage = `No records found in list ${list.Name}`;
             response.Status = 'Error';
             return response;
         }
-
-
-
-
-
-        for
-
-
-
-
-
-
-
-            const compositeKey = records[index].PrimaryKey;
-            let filterResult = await this._vectorDB.queryIndex({ vector: vector, topK: topK, includeMetadata: true, includeValues: false });
-            if (!filterResult.success) {
-                LogError(`Failed to query index for record ${compositeKey.ToString()}`);
-                continue;
-            }
-            let queryResult = await this.getVectorDuplicates(filterResult);
-            queryResult.Duplicates = queryResult.Duplicates.filter((dupe) => {
-                return dupe.ProbabilityScore >= entityDocument.PotentialMatchThreshold;
-            });
-            queryResult.EntityID = entityDocument.EntityID;
-            queryResult.RecordCompositeKey = compositeKey;
-            results.push(queryResult);
-            //now update all of the dupe run detail records
-            let dupeRunDetail = duplicateRunDetails.find((detail) => UUIDsEqual(detail.RecordID, compositeKey.Values()));
-            if (dupeRunDetail) {
-                const matchRecords = await this.createDuplicateRunDetailMatchesForRecord(dupeRunDetail.ID, queryResult);
-                queryResult.DuplicateRunDetailMatchRecordIDs = matchRecords.map((match) => match.ID);
-                dupeRunDetail.MatchStatus = 'Complete';
-                let success = await super.SaveEntity(dupeRunDetail);
-                if (!success) {
-                    LogStatus(`Failed to update Duplicate Run Detail record ${dupeRunDetail.ID}`);
-                }
-            }
-            else {
-                LogError(`Failed to find Duplicate Run Detail record for ${compositeKey.ToString()}`);
-            }
-        }
-        //almost done
+        this.reportProgress(options, 'Embedding', records.length, 0, 0, startTime);
+        // Step 7: Generate template text and embeddings
+        const templateParser = EntityDocumentTemplateParser.CreateInstance();
+        const templateTexts = await this.GenerateTemplateTexts(templateParser, entityDocument, records, contextUser);
+        const embedResult = await this.embedding.EmbedTexts({ texts: templateTexts, model: null });
+        // Step 8: Query vector DB for each record (with concurrency control)
+        this.reportProgress(options, 'Querying', records.length, 0, 0, startTime);
+        const topK = options.TopK ?? DEFAULT_TOP_K;
+        const queryResults = await this.QueryDuplicatesForRecords(records, embedResult.vectors, templateTexts, entityDocument, topK, options);
+        // Step 9: Persist match results and update run details
+        this.reportProgress(options, 'Matching', records.length, records.length, 0, startTime);
+        const results = await this.PersistMatchResults(queryResults, duplicateRunDetails, entityDocument, options, startTime);
+        // Step 10: Complete the duplicate run
         duplicateRun.ProcessingStatus = 'Complete';
         duplicateRun.EndedAt = new Date();
-
-        if (!
+        const runSaveSuccess = await this.SaveEntity(duplicateRun);
+        if (!runSaveSuccess) {
             throw new Error(`Failed to update Duplicate Run record ${duplicateRun.ID}`);
         }
-
+        // Step 11: Auto-merge high-confidence matches
+        this.reportProgress(options, 'Merging', records.length, records.length, results.length, startTime);
         response.PotentialDuplicateResult = results;
+        await this.ProcessAutoMerges(response, entityDocument);
         response.Status = 'Success';
-        LogStatus(
-        LogStatus(JSON.stringify(response, null, "\t"));
+        LogStatus(`Duplicate detection complete: ${results.length} records checked`);
         return response;
     }
```
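For orientation, here is a minimal usage sketch of the new list-based entry point. The parameter and option shape is inferred from the code above; the IDs, the root export, and the `UserInfo` typing are assumptions, not confirmed package API:

```typescript
// Illustrative only: parameter shape inferred from the diff above; IDs are placeholders.
import { DuplicateRecordDetector } from '@memberjunction/ai-vector-dupe';
import { UserInfo } from '@memberjunction/core';

async function runListDetection(contextUser: UserInfo) {
    const detector = new DuplicateRecordDetector();
    const response = await detector.GetDuplicateRecords({
        EntityDocumentID: '<entity-document-id>', // placeholder
        ListID: '<list-id>',                      // list whose records are checked
        Options: {
            TopK: 10, // overrides DEFAULT_TOP_K (5)
            OnProgress: (p) => console.log(
                `${p.Phase}: ${p.ProcessedRecords}/${p.TotalRecords}, ` +
                `${p.MatchesFound} matches, ${p.ElapsedMs} ms`),
        },
    }, contextUser);
    if (response.Status !== 'Success') {
        throw new Error(response.ErrorMessage);
    }
    return response.PotentialDuplicateResult;
}
```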
```diff
-
-
+    /**
+     * Check a single record for duplicates without requiring a list.
+     * Embeds the record and queries for matches directly.
+     */
+    async CheckSingleRecord(EntityDocumentID, RecordID, Options, ContextUser) {
+        this.CurrentUser = ContextUser;
+        const options = Options ?? {};
+        const entityDocument = await this.ValidateEntityDocument(EntityDocumentID);
+        if (!entityDocument) {
+            throw new Error(`No active Entity Document found for ID ${EntityDocumentID}`);
+        }
+        this.InitializeProviders(entityDocument);
+        // Load the single record
+        const entityInfo = this.Metadata.EntityByID(entityDocument.EntityID);
         if (!entityInfo) {
-            throw new Error(`
+            throw new Error(`Entity not found for ID ${entityDocument.EntityID}`);
         }
-        const
+        const records = await this.RunView.RunView({
             EntityName: entityInfo.Name,
-            ExtraFilter:
-            ResultType: 'entity_object'
-        },
-        if (!
-            throw new Error(
+            ExtraFilter: this.BuildExtraFilter([RecordID]),
+            ResultType: 'entity_object',
+        }, this.CurrentUser);
+        if (!records.Success || records.Results.length === 0) {
+            throw new Error(`Record not found: ${RecordID.ToString()}`);
         }
-
+        const record = records.Results[0];
+        const templateParser = EntityDocumentTemplateParser.CreateInstance();
+        const templateTexts = await this.GenerateTemplateTexts(templateParser, entityDocument, [record], ContextUser);
+        const embedResult = await this.embedding.EmbedTexts({ texts: templateTexts, model: null });
+        const topK = options.TopK ?? DEFAULT_TOP_K;
+        const queryResults = await this.QueryDuplicatesForRecords([record], embedResult.vectors, templateTexts, entityDocument, topK, options);
+        return queryResults.length > 0 ? queryResults[0].Duplicates : new PotentialDuplicateResult();
     }
```
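A sketch of the single-record check. Treating `RecordID` as an MJ `CompositeKey` is an assumption based on the `RecordID.ToString()` and `BuildExtraFilter([RecordID])` calls above:

```typescript
// Illustrative only: the CompositeKey typing is an assumption, not confirmed API.
import { CompositeKey, UserInfo } from '@memberjunction/core';
import { DuplicateRecordDetector } from '@memberjunction/ai-vector-dupe';

async function checkOne(detector: DuplicateRecordDetector, key: CompositeKey, contextUser: UserInfo) {
    const result = await detector.CheckSingleRecord(
        '<entity-document-id>', // placeholder
        key,                    // composite key of the record to check
        { TopK: 5 },            // optional; DEFAULT_TOP_K applies when omitted
        contextUser,
    );
    // result.Duplicates is already filtered by PotentialMatchThreshold and
    // purged of self-matches by FilterSelfMatches.
    for (const d of result.Duplicates) {
        console.log(`${d.ToString()} score=${d.ProbabilityScore}`);
    }
}
```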
```diff
-
-
-
-
-
-
-
-
-
-
-
-
-
+    // ─────────────────────────────────────────────
+    // Validation & Setup
+    // ─────────────────────────────────────────────
+    /**
+     * Validate and return an entity document, or null if not found.
+     */
+    async ValidateEntityDocument(entityDocumentID) {
+        const vectorizer = new EntityVectorSyncer();
+        vectorizer.CurrentUser = this.CurrentUser;
+        return vectorizer.GetEntityDocument(entityDocumentID);
+    }
+    /**
+     * Initialize embedding and vector DB providers via ClassFactory.
+     */
+    InitializeProviders(entityDocument) {
+        const aiModel = this.GetAIModel(entityDocument.AIModelID);
+        const vectorDB = this.GetVectorDatabase(entityDocument.VectorDatabaseID);
+        const embeddingAPIKey = GetAIAPIKey(aiModel.DriverClass);
+        const vectorDBAPIKey = GetAIAPIKey(vectorDB.ClassKey);
+        if (!embeddingAPIKey) {
+            throw new Error(`No API Key found for AI Model ${aiModel.DriverClass}`);
+        }
+        if (!vectorDBAPIKey) {
+            throw new Error(`No API Key found for Vector Database ${vectorDB.ClassKey}`);
+        }
+        this.embedding = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, aiModel.DriverClass, embeddingAPIKey);
+        this.vectorDB = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, vectorDB.ClassKey, vectorDBAPIKey);
+        if (!this.embedding) {
+            throw new Error(`Failed to create Embeddings instance for ${aiModel.DriverClass}`);
         }
-
+        if (!this.vectorDB) {
+            throw new Error(`Failed to create VectorDB instance for ${vectorDB.ClassKey}`);
+        }
+        LogStatus(`Providers initialized: AI Model=${aiModel.DriverClass}, VectorDB=${vectorDB.ClassKey}`);
+    }
+    /**
+     * Run vectorization for the entity document's records.
+     */
+    async VectorizeSourceRecords(entityDocument, contextUser) {
+        const vectorizer = new EntityVectorSyncer();
+        vectorizer.CurrentUser = contextUser;
+        const request = {
+            entityID: entityDocument.EntityID,
+            entityDocumentID: entityDocument.ID,
+            listBatchCount: 20,
+            options: {},
+            CurrentUser: contextUser,
+        };
+        LogStatus(`Vectorizing entity records for document ${entityDocument.Name}`);
+        await vectorizer.VectorizeEntity(request, contextUser);
     }
-
-
-
-
-
-
-
-
-
-
-
-
-
+    // ─────────────────────────────────────────────
+    // Template Generation & Embedding
+    // ─────────────────────────────────────────────
+    /**
+     * Generate human-readable template text for each record using the entity document template.
+     *
+     * Loads the template from TemplateEngineServer and renders it via Nunjucks,
+     * matching the same approach used by the vectorization pipeline.
+     */
+    async GenerateTemplateTexts(templateParser, entityDocument, records, contextUser) {
+        await TemplateEngineServer.Instance.Config(false, contextUser);
+        const template = this.loadTemplate(entityDocument);
+        const templateContent = template.Content[0];
+        TemplateEngineServer.Instance.SetupNunjucks();
+        const templateTexts = [];
+        for (const record of records) {
+            // NEW convention: main entity fields are TOP-LEVEL variables (no Entity. prefix).
+            // Spread record fields directly into root context so templates use {{FieldName}}.
+            const data = { ...record.GetAll() };
+            const result = await TemplateEngineServer.Instance.RenderTemplate(template, templateContent, data, true);
+            if (result.Success) {
+                templateTexts.push(result.Output);
+            }
+            else {
+                LogError(`Template render failed for record ${record.PrimaryKey.ToString()}: ${result.Message}`);
+                templateTexts.push('');
             }
         }
-        return
+        return templateTexts;
     }
-
-
-
-
-
-
-
-        if (!viewResults.Success) {
-            throw new Error(viewResults.ErrorMessage);
+    /**
+     * Load the template entity from TemplateEngineServer for the given entity document.
+     */
+    loadTemplate(entityDocument) {
+        const template = TemplateEngineServer.Instance.Templates.find((t) => UUIDsEqual(t.ID, entityDocument.TemplateID));
+        if (!template) {
+            throw new Error(`Template not found for ID ${entityDocument.TemplateID}`);
         }
-
-
-
-
-
-
-
-
-
-
-
-
+        if (template.Content.length === 0) {
+            throw new Error(`Template ${template.ID} has no content records`);
+        }
+        return template;
+    }
```
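The comments in `GenerateTemplateTexts` flag a template-context change: record fields are spread into the root context, so templates address fields directly. A hypothetical Nunjucks template under each convention (the field names are examples, not package fields):

```typescript
// Hypothetical entity-document template content; FirstName, LastName, and
// Email are example field names, not defined by this package.
const newStyle = `Name: {{ FirstName }} {{ LastName }}, Email: {{ Email }}`;
// Old convention implied by the "no Entity. prefix" comment above:
const oldStyle = `Name: {{ Entity.FirstName }} {{ Entity.LastName }}`;
// The spread `{ ...record.GetAll() }` is what makes the bare form resolve.
```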
```diff
+    // ─────────────────────────────────────────────
+    // Vector Query & Hybrid Search
+    // ─────────────────────────────────────────────
+    /**
+     * Query the vector DB for duplicates of each record, with concurrency control.
+     * Supports hybrid search and RRF fusion when the vector DB supports it.
+     */
+    async QueryDuplicatesForRecords(records, vectors, templateTexts, entityDocument, topK, options) {
+        const tasks = records.map((record, index) => async () => {
+            const compositeKey = record.PrimaryKey;
+            const vector = vectors[index];
+            const templateText = templateTexts[index];
+            const queryResponse = await this.executeVectorQuery(vector, templateText, topK, options);
+            if (!queryResponse.success) {
+                LogError(`Failed to query index for record ${compositeKey.ToString()}`);
+                const emptyResult = new PotentialDuplicateResult();
+                emptyResult.EntityID = entityDocument.EntityID;
+                emptyResult.RecordCompositeKey = compositeKey;
+                return { SourceKey: compositeKey, TemplateText: templateText, Duplicates: emptyResult };
             }
-
-
+            const dupeResult = this.ParseVectorMatches(queryResponse, compositeKey);
+            dupeResult.Duplicates = this.FilterSelfMatches(dupeResult.Duplicates, compositeKey);
+            dupeResult.Duplicates = dupeResult.Duplicates.filter((d) => d.ProbabilityScore >= entityDocument.PotentialMatchThreshold);
+            dupeResult.EntityID = entityDocument.EntityID;
+            dupeResult.RecordCompositeKey = compositeKey;
+            return { SourceKey: compositeKey, TemplateText: templateText, Duplicates: dupeResult };
+        });
+        return RunWithConcurrency(tasks, QUERY_CONCURRENCY_LIMIT);
+    }
+    /**
+     * Execute a vector query — uses hybrid search with RRF when the provider supports it.
+     */
+    async executeVectorQuery(vector, templateText, topK, options) {
+        if (this.vectorDB.SupportsHybridSearch && templateText) {
+            return this.vectorDB.HybridQuery({
+                vector,
+                topK,
+                KeywordQuery: templateText,
+                Alpha: options.KeywordSearchWeight != null ? (1.0 - options.KeywordSearchWeight) : 0.7,
+                FusionMethod: options.FusionMethod ?? 'rrf',
+                includeMetadata: true,
+                includeValues: false,
+            });
+        }
+        return this.vectorDB.QueryIndex({
+            vector,
+            topK,
+            includeMetadata: true,
+            includeValues: false,
+        });
+    }
```
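Note the `Alpha` mapping above: `Alpha = 1.0 - KeywordSearchWeight`, so raising `KeywordSearchWeight` shifts weight from the vector ranking toward the keyword ranking. This release also ships a new `scoring/ReciprocalRankFusion` module (see the file list at the top, +63 lines) whose body is not included in this diff. For reference, classic RRF combines ranked lists by summing reciprocal ranks; a minimal sketch of the textbook formula, not the package's implementation:

```typescript
// Textbook reciprocal rank fusion (RRF), shown for reference only; the
// package's ReciprocalRankFusion.js is not shown in this diff and may differ.
// Each list contributes 1 / (k + rank) per candidate; k = 60 is the usual constant.
function reciprocalRankFusion(rankings: string[][], k = 60): Map<string, number> {
    const scores = new Map<string, number>();
    for (const ranking of rankings) {
        ranking.forEach((id, zeroBasedRank) => {
            const contribution = 1 / (k + zeroBasedRank + 1);
            scores.set(id, (scores.get(id) ?? 0) + contribution);
        });
    }
    return scores; // higher fused score = ranked well by more of the input lists
}

// e.g. fusing the dense (vector) ranking with the keyword ranking:
// const fused = reciprocalRankFusion([vectorResultIds, keywordResultIds]);
```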
```diff
+    /**
+     * Parse raw vector DB matches into a PotentialDuplicateResult.
+     */
+    ParseVectorMatches(queryResponse, sourceKey) {
+        const result = new PotentialDuplicateResult();
+        if (!queryResponse.data?.matches) {
+            return result;
+        }
+        for (const match of queryResponse.data.matches) {
+            if (!match?.id)
+                continue;
+            if (!match.metadata?.RecordID) {
+                LogError(`Invalid vector metadata for match: ${match.id}`);
+                continue;
             }
+            const duplicate = new PotentialDuplicate();
+            duplicate.LoadFromConcatenatedString(match.metadata.RecordID);
+            duplicate.ProbabilityScore = match.score;
+            result.Duplicates.push(duplicate);
         }
-        return
+        return result;
     }
-
-
-
-
+    /**
+     * Filter out self-matches where the candidate is the same record as the source.
+     */
+    FilterSelfMatches(duplicates, sourceKey) {
+        return duplicates.filter((d) => d.ToString() !== sourceKey.ToString());
+    }
+    // ─────────────────────────────────────────────
+    // Entity Loading
+    // ─────────────────────────────────────────────
+    /**
+     * Load records from an entity that are members of the specified list.
+     */
+    async LoadRecordsByListID(listID, entityID) {
+        const entityInfo = this.Metadata.EntityByID(entityID);
+        if (!entityInfo) {
+            throw new Error(`Entity not found for ID ${entityID}`);
+        }
+        const sanitizedListID = listID.replace(/'/g, "''");
+        const rvResult = await this.RunView.RunView({
+            EntityName: entityInfo.Name,
+            ExtraFilter: `ID IN (SELECT RecordID FROM __mj.vwListDetails WHERE ListID = '${sanitizedListID}')`,
+            ResultType: 'entity_object',
+        }, this.CurrentUser);
+        if (!rvResult.Success) {
+            throw new Error(rvResult.ErrorMessage);
+        }
+        return rvResult.Results;
+    }
+    async LoadListEntity(listID) {
+        const list = await this.Metadata.GetEntityObject('MJ: Lists');
+        list.ContextCurrentUser = this.CurrentUser;
         const success = await list.Load(listID);
         if (!success) {
             throw new Error(`Failed to load List record ${listID}`);
         }
         return list;
     }
-    async
-        const
-
-
-        const success = await dupeRun.Load(DupeRunID);
+    async LoadDuplicateRun(duplicateRunID) {
+        const dupeRun = await this.Metadata.GetEntityObject('MJ: Duplicate Runs');
+        dupeRun.ContextCurrentUser = this.CurrentUser;
+        const success = await dupeRun.Load(duplicateRunID);
         if (!success) {
-            throw new Error(`Failed to load Duplicate Run record ${
+            throw new Error(`Failed to load Duplicate Run record ${duplicateRunID}`);
         }
         return dupeRun;
     }
-    async
-        const entity = await
+    async LoadDuplicateRunByListID(listID) {
+        const entity = await this.RunViewForSingleValue('MJ: Duplicate Runs', `SourceListID = '${listID.replace(/'/g, "''")}'`);
         if (!entity) {
-            throw new Error(`
+            throw new Error(`No Duplicate Run found for List ${listID}`);
         }
         return entity;
     }
-
-
-
-
-
-
-
-
-
-
-
+    // ─────────────────────────────────────────────
+    // Run Detail & Match Persistence (Batched)
+    // ─────────────────────────────────────────────
+    /**
+     * Create DuplicateRunDetail records for each item in the list, saving in parallel batches.
+     */
+    async CreateRunDetailRecordsFromList(listID, duplicateRunID) {
+        const viewResults = await this.RunView.RunView({
+            EntityName: 'MJ: List Details',
+            ExtraFilter: `ListID = '${listID.replace(/'/g, "''")}'`,
+            ResultType: 'entity_object',
+        }, this.CurrentUser);
+        if (!viewResults.Success) {
+            throw new Error(viewResults.ErrorMessage);
         }
-
+        const listDetails = viewResults.Results;
+        const results = [];
+        for (const batch of chunkArray(listDetails, SAVE_BATCH_SIZE)) {
+            const batchResults = await Promise.all(batch.map(async (listDetail) => {
+                const runDetail = await this.Metadata.GetEntityObject('MJ: Duplicate Run Details');
+                runDetail.NewRecord();
+                runDetail.DuplicateRunID = duplicateRunID;
+                runDetail.RecordID = listDetail.RecordID;
+                runDetail.MatchStatus = 'Pending';
+                runDetail.MergeStatus = 'Pending';
+                const success = await this.SaveEntity(runDetail);
+                if (!success) {
+                    LogError("Failed to save MJDuplicateRunDetailEntity", undefined, runDetail.LatestResult);
+                    return null;
+                }
+                return runDetail;
+            }));
+            for (const r of batchResults) {
+                if (r)
+                    results.push(r);
+            }
+        }
+        return results;
     }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    /**
+     * Persist match results and update run detail records.
+     */
+    async PersistMatchResults(queryResults, duplicateRunDetails, entityDocument, options, startTime) {
+        const results = [];
+        let matchesFound = 0;
+        for (const qr of queryResults) {
+            results.push(qr.Duplicates);
+            matchesFound += qr.Duplicates.Duplicates.length;
+            const detail = duplicateRunDetails.find((d) => UUIDsEqual(d.RecordID, qr.SourceKey.Values()));
+            if (detail) {
+                const matchRecords = await this.CreateMatchRecordsForDetail(detail.ID, qr.Duplicates);
+                qr.Duplicates.DuplicateRunDetailMatchRecordIDs = matchRecords.map((m) => m.ID);
+                detail.MatchStatus = 'Complete';
+                const success = await this.SaveEntity(detail);
+                if (!success) {
+                    LogError(`Failed to update Duplicate Run Detail record ${detail.ID}`);
+                }
+            }
+            else {
+                LogError(`No Duplicate Run Detail found for ${qr.SourceKey.ToString()}`);
+            }
+            this.reportProgress(options, 'Matching', queryResults.length, results.length, matchesFound, startTime);
+        }
+        return results;
+    }
+    /**
+     * Create match records for a single run detail, saving in parallel batches.
+     */
+    async CreateMatchRecordsForDetail(duplicateRunDetailID, duplicateResult) {
+        const matchRecords = [];
+        for (const batch of chunkArray(duplicateResult.Duplicates, SAVE_BATCH_SIZE)) {
+            const batchResults = await Promise.all(batch.map(async (dupe) => {
+                const match = await this.Metadata.GetEntityObject('MJ: Duplicate Run Detail Matches');
+                match.NewRecord();
+                match.DuplicateRunDetailID = duplicateRunDetailID;
+                match.MatchRecordID = dupe.ToString();
+                match.MatchProbability = dupe.ProbabilityScore;
+                match.MatchedAt = new Date();
+                match.Action = '';
+                match.ApprovalStatus = 'Pending';
+                match.MergeStatus = 'Pending';
+                const success = await this.SaveEntity(match);
+                return success ? match : null;
+            }));
+            for (const m of batchResults) {
+                if (m)
+                    matchRecords.push(m);
             }
         }
         return matchRecords;
     }
-
-
-
+    // ─────────────────────────────────────────────
+    // Auto-Merge
+    // ─────────────────────────────────────────────
+    /**
+     * Automatically merge records that meet the absolute match threshold.
+     */
+    async ProcessAutoMerges(response, entityDocument) {
+        for (const dupeResult of response.PotentialDuplicateResult) {
             for (const [index, dupe] of dupeResult.Duplicates.entries()) {
-            if (dupe.
-                //same record, skip
+                if (dupe.ProbabilityScore < entityDocument.AbsoluteMatchThreshold) {
                     continue;
                 }
-
-
-
-
-
-
-
-
-
-
-            if (!loadResult) {
-                LogError(`Failed to load Duplicate Run Match record ${dupeResult.DuplicateRunDetailMatchRecordIDs[index]}`);
-                continue;
-            }
-            dupeRunMatchRecord.MergeStatus = 'Complete';
-            dupeRunMatchRecord.Action = 'Merged';
-            dupeRunMatchRecord.MergedAt = new Date();
-            let saveResult = await dupeRunMatchRecord.Save();
-            if (!saveResult) {
-                LogError(`Failed to update Duplicate Run Match record ${dupeRunMatchRecord.ID}`);
-            }
-        }
-        else {
-            LogError(`Failed to merge records ${dupeResult.RecordCompositeKey.ToString()} and ${dupe.ToString()}`);
-        }
+                const mergeParams = new RecordMergeRequest();
+                mergeParams.EntityName = entityDocument.Entity;
+                mergeParams.SurvivingRecordCompositeKey = dupeResult.RecordCompositeKey;
+                mergeParams.RecordsToMerge = [dupe];
+                const mergeResult = await this.Metadata.MergeRecords(mergeParams, this.CurrentUser);
+                if (mergeResult.Success) {
+                    await this.updateMatchRecordAfterMerge(dupeResult.DuplicateRunDetailMatchRecordIDs[index]);
+                }
+                else {
+                    LogError(`Failed to merge ${dupeResult.RecordCompositeKey.ToString()} and ${dupe.ToString()}`);
                 }
             }
         }
     }
```
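Worth noting how the two entity-document thresholds divide the pipeline: `PotentialMatchThreshold` (applied in `QueryDuplicatesForRecords`) decides which candidates are recorded at all, while `AbsoluteMatchThreshold` (applied here) decides which of those are merged automatically. A small illustration with hypothetical values:

```typescript
// Hypothetical threshold values for illustration; real values come from the
// Entity Document record, not from the package.
const PotentialMatchThreshold = 0.85; // candidate is persisted as a match
const AbsoluteMatchThreshold = 0.98;  // candidate is additionally auto-merged

function classify(score: number): 'ignored' | 'candidate' | 'auto-merge' {
    if (score < PotentialMatchThreshold) return 'ignored';
    if (score < AbsoluteMatchThreshold) return 'candidate';
    return 'auto-merge';
}
```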
```diff
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    /**
+     * Update a match record's status after a successful merge.
+     */
+    async updateMatchRecordAfterMerge(matchRecordID) {
+        const matchRecord = await this.Metadata.GetEntityObject('MJ: Duplicate Run Detail Matches', this.CurrentUser);
+        const loaded = await matchRecord.Load(matchRecordID);
+        if (!loaded) {
+            LogError(`Failed to load match record ${matchRecordID} for merge status update`);
+            return;
+        }
+        matchRecord.MergeStatus = 'Complete';
+        matchRecord.Action = 'Merged';
+        matchRecord.MergedAt = new Date();
+        const saved = await matchRecord.Save();
+        if (!saved) {
+            LogError(`Failed to update match record ${matchRecordID} after merge`);
+        }
+    }
+    // ─────────────────────────────────────────────
+    // Progress Reporting
+    // ─────────────────────────────────────────────
+    reportProgress(options, phase, totalRecords, processedRecords, matchesFound, startTime, currentRecordID) {
+        if (options.OnProgress) {
+            options.OnProgress({
+                Phase: phase,
+                TotalRecords: totalRecords,
+                ProcessedRecords: processedRecords,
+                MatchesFound: matchesFound,
+                CurrentRecordID: currentRecordID,
+                ElapsedMs: Date.now() - startTime,
+            });
+        }
+    }
+}
+// ─────────────────────────────────────────────
+// Utility Functions
+// ─────────────────────────────────────────────
+/**
+ * Split an array into chunks of a given size.
+ */
+function chunkArray(array, chunkSize) {
+    const chunks = [];
+    for (let i = 0; i < array.length; i += chunkSize) {
+        chunks.push(array.slice(i, i + chunkSize));
+    }
+    return chunks;
+}
+/**
+ * Run async tasks with a concurrency limit.
+ * Executes up to `limit` tasks in parallel, queuing the rest.
+ */
+async function RunWithConcurrency(tasks, limit) {
+    const results = [];
+    let index = 0;
+    async function runNext() {
+        while (index < tasks.length) {
+            const currentIndex = index++;
+            results[currentIndex] = await tasks[currentIndex]();
         }
-        return response;
     }
+    const workers = Array.from({ length: Math.min(limit, tasks.length) }, () => runNext());
+    await Promise.all(workers);
+    return results;
 }
 //# sourceMappingURL=duplicateRecordDetector.js.map
```
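The `RunWithConcurrency` helper is a small worker pool: each worker loops, claiming the next task thunk via the shared `index` counter, which needs no locking because the workers interleave cooperatively on the single JavaScript thread. A usage sketch (the helper is module-internal, so this assumes code in the same file):

```typescript
// Usage sketch for the RunWithConcurrency helper above. Tasks are
// zero-argument async thunks, so nothing runs until a worker claims it.
async function demo(): Promise<string[]> {
    const items = ['a', 'b', 'c', 'd', 'e', 'f'];
    const tasks = items.map((item) => async () => item.toUpperCase());
    const results = await RunWithConcurrency(tasks, 2); // at most 2 in flight
    return results; // order matches the input: ['A','B','C','D','E','F']
}
```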