@memberjunction/ai-vector-dupe 5.20.0 → 5.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,332 +1,535 @@
1
+ /**
2
+ * @fileoverview Modernized duplicate record detection engine.
3
+ *
4
+ * Orchestrates the full pipeline: vectorize records, query for similar candidates,
5
+ * optionally apply hybrid search (RRF) and reranking, persist match results,
6
+ * and auto-merge high-confidence duplicates.
7
+ *
8
+ * Supports both list-based batch detection and single-record checks.
9
+ *
10
+ * @module @memberjunction/ai-vector-dupe
11
+ */
1
12
  import { BaseEmbeddings, GetAIAPIKey } from "@memberjunction/ai";
2
- import { PotentialDuplicateResponse, RunView, PotentialDuplicateResult, Metadata, LogError, RecordMergeRequest, PotentialDuplicate } from "@memberjunction/core";
3
- import { LogStatus } from "@memberjunction/core";
13
+ import { PotentialDuplicateResponse, PotentialDuplicateResult, LogError, LogStatus, RecordMergeRequest, PotentialDuplicate, } from "@memberjunction/core";
4
14
  import { VectorDBBase } from "@memberjunction/ai-vectordb";
5
15
  import { MJGlobal, UUIDsEqual } from "@memberjunction/global";
6
16
  import { VectorBase } from "@memberjunction/ai-vectors";
7
17
  import { EntityDocumentTemplateParser, EntityVectorSyncer } from "@memberjunction/ai-vector-sync";
18
+ import { TemplateEngineServer } from "@memberjunction/templates";
19
+ /** Default number of nearest neighbors to retrieve per record */
20
+ const DEFAULT_TOP_K = 5;
21
+ /** Default concurrency limit for parallel vector queries */
22
+ const QUERY_CONCURRENCY_LIMIT = 5;
23
+ /** Default batch size for parallel database saves */
24
+ const SAVE_BATCH_SIZE = 20;
25
+ /**
26
+ * Modernized duplicate record detection engine.
27
+ *
28
+ * Supports:
29
+ * - List-based batch detection (getDuplicateRecords)
30
+ * - Single-record duplicate check (CheckSingleRecord)
31
+ * - Hybrid search via RRF when vector DB supports it
32
+ * - Optional post-retrieval reranking via MJ's BaseReranker
33
+ * - Configurable topK, thresholds, and progress reporting
34
+ */
8
35
  export class DuplicateRecordDetector extends VectorBase {
9
- constructor() {
10
- super();
11
- this._runView = new RunView();
12
- }
13
- async getDuplicateRecords(params, contextUser) {
14
- super.CurrentUser = contextUser;
15
- //for testing
16
- //params.EntityID = 25050001;
17
- //params.EntityDocumentID = 17;
18
- let vectorizer = new EntityVectorSyncer();
19
- vectorizer.CurrentUser = super.CurrentUser;
20
- let entityDocument = await vectorizer.GetEntityDocument(params.EntityDocumentID);
21
- if (!entityDocument) {
22
- throw Error(`No Entity Document found with ID ${params.EntityDocumentID}`);
23
- //Update: No longer creating an entity docuement if one is not found
24
- //If an entitiy document is not found, that is our indicator that the
25
- //underlying entity's records have not been vectorized yet
26
- //const defaultVectorDB: MJVectorDatabaseEntity = super.getVectorDatabase();
27
- //const defaultAIModel: MJAIModelEntity = super.getAIModel();
28
- //entityDocument = await this.createEntityDocumentForEntity(params.EntityID, defaultVectorDB, defaultAIModel);
29
- }
30
- let response = new PotentialDuplicateResponse();
36
+ /**
37
+ * Run duplicate detection for all records in a list.
38
+ *
39
+ * Flow: validate → vectorize → embed → query → (optional rerank) → persist → (optional merge)
40
+ */
41
+ async GetDuplicateRecords(params, contextUser) {
42
+ this.CurrentUser = contextUser;
43
+ const options = params.Options ?? {};
44
+ const startTime = Date.now();
45
+ const response = new PotentialDuplicateResponse();
31
46
  response.PotentialDuplicateResult = [];
47
+ // Step 1: Validate entity document
48
+ const entityDocument = await this.ValidateEntityDocument(params.EntityDocumentID);
32
49
  if (!entityDocument) {
33
- response.ErrorMessage = `No active Entity Document found for entity ${params.EntityID}`;
50
+ response.ErrorMessage = `No active Entity Document found for ID ${params.EntityDocumentID}`;
34
51
  response.Status = 'Error';
35
52
  return response;
36
53
  }
37
- //for testing
38
- const request = {
39
- entityID: entityDocument.EntityID,
40
- entityDocumentID: entityDocument.ID,
41
- listBatchCount: 20,
42
- options: {},
43
- CurrentUser: contextUser
44
- };
45
- console.log("vectorizing entity...");
46
- const templateParser = EntityDocumentTemplateParser.CreateInstance();
47
- await vectorizer.VectorizeEntity(request, super.CurrentUser);
48
- const list = await this.getListEntity(params.ListID);
49
- let duplicateRun = params.Options?.DuplicateRunID ? await this.getDuplicateRunEntity(params.Options?.DuplicateRunID) : await this.getDuplicateRunEntityByListID(list.ID);
50
- //let duplicateRun: MJDuplicateRunEntity = await this.createDuplicateRunRecord(entityDocument, list.ID);
51
- const duplicateRunDetails = await this.createDuplicateRunDetailRecordsByListID(list.ID, duplicateRun.ID);
52
- //await this.createListDetailsForDupeRun(params.RecordIDs, list.ID);
53
- LogStatus(`Using vector database ${entityDocument.VectorDatabaseID} and AI Model ${entityDocument.AIModelID}`);
54
- const vectorDB = super.GetVectorDatabase(entityDocument.VectorDatabaseID);
55
- const aiModel = super.GetAIModel(entityDocument.AIModelID);
56
- LogStatus(`AIModel driver class: ${aiModel.DriverClass}`);
57
- LogStatus(`VectorDB class key: ${vectorDB.ClassKey}`);
58
- const embeddingAPIKey = GetAIAPIKey(aiModel.DriverClass);
59
- const vectorDBAPIKey = GetAIAPIKey(vectorDB.ClassKey);
60
- if (!embeddingAPIKey) {
61
- throw Error(`No API Key found for AI Model ${aiModel.DriverClass}`);
62
- }
63
- if (!vectorDBAPIKey) {
64
- throw Error(`No API Key found for Vector Database ${vectorDB.ClassKey}`);
65
- }
66
- //LogStatus(`Embedding API Key: ${embeddingAPIKey} VectorDB API Key: ${vectorDBAPIKey}`);
67
- this._embedding = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, aiModel.DriverClass, embeddingAPIKey);
68
- this._vectorDB = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, vectorDB.ClassKey, vectorDBAPIKey);
69
- if (!this._embedding) {
70
- throw Error(`Failed to create Embeddings instance for AI Model ${aiModel.DriverClass}`);
71
- }
72
- if (!this._vectorDB) {
73
- throw Error(`Failed to create Vector Database instance for ${vectorDB.ClassKey}`);
74
- }
75
- let records = await this.GetRecordsByListID(list.ID, entityDocument.EntityID);
54
+ // Step 2: Vectorize source records
55
+ this.reportProgress(options, 'Vectorizing', 0, 0, 0, startTime);
56
+ await this.VectorizeSourceRecords(entityDocument, contextUser);
57
+ // Step 3: Initialize providers
58
+ this.InitializeProviders(entityDocument);
59
+ // Step 4: Load list and duplicate run
60
+ const list = await this.LoadListEntity(params.ListID);
61
+ const duplicateRun = options.DuplicateRunID
62
+ ? await this.LoadDuplicateRun(options.DuplicateRunID)
63
+ : await this.LoadDuplicateRunByListID(list.ID);
64
+ // Step 5: Create run detail records in batches
65
+ const duplicateRunDetails = await this.CreateRunDetailRecordsFromList(list.ID, duplicateRun.ID);
66
+ // Step 6: Load and embed records
67
+ const records = await this.LoadRecordsByListID(list.ID, entityDocument.EntityID);
76
68
  if (records.length === 0) {
77
- LogError(`No records found in list ${list.Name}, with listID ${list.ID} and EntityID ${entityDocument.EntityID} exiting early`);
69
+ LogError(`No records found in list ${list.Name}`);
78
70
  response.ErrorMessage = `No records found in list ${list.Name}`;
79
71
  response.Status = 'Error';
80
72
  return response;
81
73
  }
82
- LogStatus("Vectorizing " + records.length + " records");
83
- const recordTemplates = [];
84
- //Relationship(entityID: number, entityRecord: any, relationshipName: string, maxRows: number, entityDocumentName: string)
85
- let sampleTemplate = entityDocument.Template;
86
- //sampleTemplate += " ${Relationship('Deals', 5, 'Sample Relationship Document for crm.Deals Entity')} ${Relationship('Deals', 5, 'Second Sample Relationship Document for crm.Deals Entity')}";
87
- for (const record of records) {
88
- const template = await templateParser.Parse(sampleTemplate, entityDocument.EntityID, record, contextUser);
89
- recordTemplates.push(template);
90
- }
91
- let embedTextsResult = await this._embedding.EmbedTexts({ texts: recordTemplates, model: null });
92
- const topK = 5;
93
- let results = [];
94
- for (const [index, vector] of embedTextsResult.vectors.entries()) {
95
- const compositeKey = records[index].PrimaryKey;
96
- let filterResult = await this._vectorDB.queryIndex({ vector: vector, topK: topK, includeMetadata: true, includeValues: false });
97
- if (!filterResult.success) {
98
- LogError(`Failed to query index for record ${compositeKey.ToString()}`);
99
- continue;
100
- }
101
- let queryResult = await this.getVectorDuplicates(filterResult);
102
- queryResult.Duplicates = queryResult.Duplicates.filter((dupe) => {
103
- return dupe.ProbabilityScore >= entityDocument.PotentialMatchThreshold;
104
- });
105
- queryResult.EntityID = entityDocument.EntityID;
106
- queryResult.RecordCompositeKey = compositeKey;
107
- results.push(queryResult);
108
- //now update all of the dupe run detail records
109
- let dupeRunDetail = duplicateRunDetails.find((detail) => UUIDsEqual(detail.RecordID, compositeKey.Values()));
110
- if (dupeRunDetail) {
111
- const matchRecords = await this.createDuplicateRunDetailMatchesForRecord(dupeRunDetail.ID, queryResult);
112
- queryResult.DuplicateRunDetailMatchRecordIDs = matchRecords.map((match) => match.ID);
113
- dupeRunDetail.MatchStatus = 'Complete';
114
- let success = await super.SaveEntity(dupeRunDetail);
115
- if (!success) {
116
- LogStatus(`Failed to update Duplicate Run Detail record ${dupeRunDetail.ID}`);
117
- }
118
- }
119
- else {
120
- LogError(`Failed to find Duplicate Run Detail record for ${compositeKey.ToString()}`);
121
- }
122
- }
123
- //almost done
74
+ this.reportProgress(options, 'Embedding', records.length, 0, 0, startTime);
75
+ // Step 7: Generate template text and embeddings
76
+ const templateParser = EntityDocumentTemplateParser.CreateInstance();
77
+ const templateTexts = await this.GenerateTemplateTexts(templateParser, entityDocument, records, contextUser);
78
+ const embedResult = await this.embedding.EmbedTexts({ texts: templateTexts, model: null });
79
+ // Step 8: Query vector DB for each record (with concurrency control)
80
+ this.reportProgress(options, 'Querying', records.length, 0, 0, startTime);
81
+ const topK = options.TopK ?? DEFAULT_TOP_K;
82
+ const queryResults = await this.QueryDuplicatesForRecords(records, embedResult.vectors, templateTexts, entityDocument, topK, options);
83
+ // Step 9: Persist match results and update run details
84
+ this.reportProgress(options, 'Matching', records.length, records.length, 0, startTime);
85
+ const results = await this.PersistMatchResults(queryResults, duplicateRunDetails, entityDocument, options, startTime);
86
+ // Step 10: Complete the duplicate run
124
87
  duplicateRun.ProcessingStatus = 'Complete';
125
88
  duplicateRun.EndedAt = new Date();
126
- let success = await super.SaveEntity(duplicateRun);
127
- if (!success) {
89
+ const runSaveSuccess = await this.SaveEntity(duplicateRun);
90
+ if (!runSaveSuccess) {
128
91
  throw new Error(`Failed to update Duplicate Run record ${duplicateRun.ID}`);
129
92
  }
130
- await this.mergeRecords(response, entityDocument);
93
+ // Step 11: Auto-merge high-confidence matches
94
+ this.reportProgress(options, 'Merging', records.length, records.length, results.length, startTime);
131
95
  response.PotentialDuplicateResult = results;
96
+ await this.ProcessAutoMerges(response, entityDocument);
132
97
  response.Status = 'Success';
133
- LogStatus("Dupe Run complete. Response:");
134
- LogStatus(JSON.stringify(response, null, "\t"));
98
+ LogStatus(`Duplicate detection complete: ${results.length} records checked`);
135
99
  return response;
136
100
  }
137
- async GetRecordsByListID(listID, entityID) {
138
- const entityInfo = super.Metadata.EntityByID(entityID);
101
+ /**
102
+ * Check a single record for duplicates without requiring a list.
103
+ * Embeds the record and queries for matches directly.
104
+ */
105
+ async CheckSingleRecord(EntityDocumentID, RecordID, Options, ContextUser) {
106
+ this.CurrentUser = ContextUser;
107
+ const options = Options ?? {};
108
+ const entityDocument = await this.ValidateEntityDocument(EntityDocumentID);
109
+ if (!entityDocument) {
110
+ throw new Error(`No active Entity Document found for ID ${EntityDocumentID}`);
111
+ }
112
+ this.InitializeProviders(entityDocument);
113
+ // Load the single record
114
+ const entityInfo = this.Metadata.EntityByID(entityDocument.EntityID);
139
115
  if (!entityInfo) {
140
- throw new Error(`Failed to load Entity Info with ID ${entityID}`);
116
+ throw new Error(`Entity not found for ID ${entityDocument.EntityID}`);
141
117
  }
142
- const rvResult = await super.RunView.RunView({
118
+ const records = await this.RunView.RunView({
143
119
  EntityName: entityInfo.Name,
144
- ExtraFilter: `ID IN (SELECT RecordID FROM __mj.vwListDetails WHERE ListID = '${listID}')`,
145
- ResultType: 'entity_object'
146
- }, super.CurrentUser);
147
- if (!rvResult.Success) {
148
- throw new Error(rvResult.ErrorMessage);
120
+ ExtraFilter: this.BuildExtraFilter([RecordID]),
121
+ ResultType: 'entity_object',
122
+ }, this.CurrentUser);
123
+ if (!records.Success || records.Results.length === 0) {
124
+ throw new Error(`Record not found: ${RecordID.ToString()}`);
149
125
  }
150
- return rvResult.Results;
126
+ const record = records.Results[0];
127
+ const templateParser = EntityDocumentTemplateParser.CreateInstance();
128
+ const templateTexts = await this.GenerateTemplateTexts(templateParser, entityDocument, [record], ContextUser);
129
+ const embedResult = await this.embedding.EmbedTexts({ texts: templateTexts, model: null });
130
+ const topK = options.TopK ?? DEFAULT_TOP_K;
131
+ const queryResults = await this.QueryDuplicatesForRecords([record], embedResult.vectors, templateTexts, entityDocument, topK, options);
132
+ return queryResults.length > 0 ? queryResults[0].Duplicates : new PotentialDuplicateResult();
151
133
  }
152
- async createDuplicateRunRecord(entityDocument, listID) {
153
- const md = new Metadata();
154
- let duplicateRun = await md.GetEntityObject('MJ: Duplicate Runs');
155
- duplicateRun.NewRecord();
156
- duplicateRun.EntityID = entityDocument.EntityID;
157
- duplicateRun.StartedByUserID = super.CurrentUser.ID;
158
- duplicateRun.StartedAt = new Date();
159
- duplicateRun.ProcessingStatus = 'In Progress';
160
- duplicateRun.ApprovalStatus = 'Pending';
161
- duplicateRun.SourceListID = listID;
162
- const saveResult = await super.SaveEntity(duplicateRun);
163
- if (!saveResult) {
164
- throw new Error(`Failed to save list for Potential Duplicate Run`);
134
+ // ─────────────────────────────────────────────
135
+ // Validation & Setup
136
+ // ─────────────────────────────────────────────
137
+ /**
138
+ * Validate and return an entity document, or null if not found.
139
+ */
140
+ async ValidateEntityDocument(entityDocumentID) {
141
+ const vectorizer = new EntityVectorSyncer();
142
+ vectorizer.CurrentUser = this.CurrentUser;
143
+ return vectorizer.GetEntityDocument(entityDocumentID);
144
+ }
145
+ /**
146
+ * Initialize embedding and vector DB providers via ClassFactory.
147
+ */
148
+ InitializeProviders(entityDocument) {
149
+ const aiModel = this.GetAIModel(entityDocument.AIModelID);
150
+ const vectorDB = this.GetVectorDatabase(entityDocument.VectorDatabaseID);
151
+ const embeddingAPIKey = GetAIAPIKey(aiModel.DriverClass);
152
+ const vectorDBAPIKey = GetAIAPIKey(vectorDB.ClassKey);
153
+ if (!embeddingAPIKey) {
154
+ throw new Error(`No API Key found for AI Model ${aiModel.DriverClass}`);
155
+ }
156
+ if (!vectorDBAPIKey) {
157
+ throw new Error(`No API Key found for Vector Database ${vectorDB.ClassKey}`);
158
+ }
159
+ this.embedding = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, aiModel.DriverClass, embeddingAPIKey);
160
+ this.vectorDB = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, vectorDB.ClassKey, vectorDBAPIKey);
161
+ if (!this.embedding) {
162
+ throw new Error(`Failed to create Embeddings instance for ${aiModel.DriverClass}`);
165
163
  }
166
- return duplicateRun;
164
+ if (!this.vectorDB) {
165
+ throw new Error(`Failed to create VectorDB instance for ${vectorDB.ClassKey}`);
166
+ }
167
+ LogStatus(`Providers initialized: AI Model=${aiModel.DriverClass}, VectorDB=${vectorDB.ClassKey}`);
168
+ }
169
+ /**
170
+ * Run vectorization for the entity document's records.
171
+ */
172
+ async VectorizeSourceRecords(entityDocument, contextUser) {
173
+ const vectorizer = new EntityVectorSyncer();
174
+ vectorizer.CurrentUser = contextUser;
175
+ const request = {
176
+ entityID: entityDocument.EntityID,
177
+ entityDocumentID: entityDocument.ID,
178
+ listBatchCount: 20,
179
+ options: {},
180
+ CurrentUser: contextUser,
181
+ };
182
+ LogStatus(`Vectorizing entity records for document ${entityDocument.Name}`);
183
+ await vectorizer.VectorizeEntity(request, contextUser);
167
184
  }
168
- async createDuplicateRunDetailRecords(recordIDs, duplicateRunID) {
169
- let results = [];
170
- const md = new Metadata();
171
- for (const recordID of recordIDs) {
172
- let runDetail = await md.GetEntityObject('MJ: Duplicate Run Details');
173
- runDetail.NewRecord();
174
- runDetail.DuplicateRunID = duplicateRunID;
175
- runDetail.RecordID = recordID.ToString();
176
- runDetail.MatchStatus = 'Pending';
177
- runDetail.MergeStatus = 'Pending';
178
- const success = await super.SaveEntity(runDetail);
179
- if (success) {
180
- results.push(runDetail);
185
+ // ─────────────────────────────────────────────
186
+ // Template Generation & Embedding
187
+ // ─────────────────────────────────────────────
188
+ /**
189
+ * Generate human-readable template text for each record using the entity document template.
190
+ *
191
+ * Loads the template from TemplateEngineServer and renders it via Nunjucks,
192
+ * matching the same approach used by the vectorization pipeline.
193
+ */
194
+ async GenerateTemplateTexts(templateParser, entityDocument, records, contextUser) {
195
+ await TemplateEngineServer.Instance.Config(false, contextUser);
196
+ const template = this.loadTemplate(entityDocument);
197
+ const templateContent = template.Content[0];
198
+ TemplateEngineServer.Instance.SetupNunjucks();
199
+ const templateTexts = [];
200
+ for (const record of records) {
201
+ // NEW convention: main entity fields are TOP-LEVEL variables (no Entity. prefix).
202
+ // Spread record fields directly into root context so templates use {{FieldName}}.
203
+ const data = { ...record.GetAll() };
204
+ const result = await TemplateEngineServer.Instance.RenderTemplate(template, templateContent, data, true);
205
+ if (result.Success) {
206
+ templateTexts.push(result.Output);
207
+ }
208
+ else {
209
+ LogError(`Template render failed for record ${record.PrimaryKey.ToString()}: ${result.Message}`);
210
+ templateTexts.push('');
181
211
  }
182
212
  }
183
- return results;
213
+ return templateTexts;
184
214
  }
185
- async createDuplicateRunDetailRecordsByListID(listID, duplicateRunID) {
186
- let results = [];
187
- const viewResults = await super.RunView.RunView({
188
- EntityName: 'MJ: List Details',
189
- ExtraFilter: `ListID = '${listID}'`,
190
- ResultType: 'entity_object'
191
- }, super.CurrentUser);
192
- if (!viewResults.Success) {
193
- throw new Error(viewResults.ErrorMessage);
215
+ /**
216
+ * Load the template entity from TemplateEngineServer for the given entity document.
217
+ */
218
+ loadTemplate(entityDocument) {
219
+ const template = TemplateEngineServer.Instance.Templates.find((t) => UUIDsEqual(t.ID, entityDocument.TemplateID));
220
+ if (!template) {
221
+ throw new Error(`Template not found for ID ${entityDocument.TemplateID}`);
194
222
  }
195
- const md = new Metadata();
196
- const listDetails = viewResults.Results;
197
- for (const listDetail of listDetails) {
198
- let runDetail = await md.GetEntityObject('MJ: Duplicate Run Details');
199
- runDetail.NewRecord();
200
- runDetail.DuplicateRunID = duplicateRunID;
201
- runDetail.RecordID = listDetail.RecordID;
202
- runDetail.MatchStatus = 'Pending';
203
- runDetail.MergeStatus = 'Pending';
204
- const success = await super.SaveEntity(runDetail);
205
- if (success) {
206
- results.push(runDetail);
223
+ if (template.Content.length === 0) {
224
+ throw new Error(`Template ${template.ID} has no content records`);
225
+ }
226
+ return template;
227
+ }
228
+ // ─────────────────────────────────────────────
229
+ // Vector Query & Hybrid Search
230
+ // ─────────────────────────────────────────────
231
+ /**
232
+ * Query the vector DB for duplicates of each record, with concurrency control.
233
+ * Supports hybrid search and RRF fusion when the vector DB supports it.
234
+ */
235
+ async QueryDuplicatesForRecords(records, vectors, templateTexts, entityDocument, topK, options) {
236
+ const tasks = records.map((record, index) => async () => {
237
+ const compositeKey = record.PrimaryKey;
238
+ const vector = vectors[index];
239
+ const templateText = templateTexts[index];
240
+ const queryResponse = await this.executeVectorQuery(vector, templateText, topK, options);
241
+ if (!queryResponse.success) {
242
+ LogError(`Failed to query index for record ${compositeKey.ToString()}`);
243
+ const emptyResult = new PotentialDuplicateResult();
244
+ emptyResult.EntityID = entityDocument.EntityID;
245
+ emptyResult.RecordCompositeKey = compositeKey;
246
+ return { SourceKey: compositeKey, TemplateText: templateText, Duplicates: emptyResult };
207
247
  }
208
- else {
209
- LogError("Failed to save MJDuplicateRunDetailEntity", undefined, runDetail.LatestResult);
248
+ const dupeResult = this.ParseVectorMatches(queryResponse, compositeKey);
249
+ dupeResult.Duplicates = this.FilterSelfMatches(dupeResult.Duplicates, compositeKey);
250
+ dupeResult.Duplicates = dupeResult.Duplicates.filter((d) => d.ProbabilityScore >= entityDocument.PotentialMatchThreshold);
251
+ dupeResult.EntityID = entityDocument.EntityID;
252
+ dupeResult.RecordCompositeKey = compositeKey;
253
+ return { SourceKey: compositeKey, TemplateText: templateText, Duplicates: dupeResult };
254
+ });
255
+ return RunWithConcurrency(tasks, QUERY_CONCURRENCY_LIMIT);
256
+ }
257
+ /**
258
+ * Execute a vector query — uses hybrid search with RRF when the provider supports it.
259
+ */
260
+ async executeVectorQuery(vector, templateText, topK, options) {
261
+ if (this.vectorDB.SupportsHybridSearch && templateText) {
262
+ return this.vectorDB.HybridQuery({
263
+ vector,
264
+ topK,
265
+ KeywordQuery: templateText,
266
+ Alpha: options.KeywordSearchWeight != null ? (1.0 - options.KeywordSearchWeight) : 0.7,
267
+ FusionMethod: options.FusionMethod ?? 'rrf',
268
+ includeMetadata: true,
269
+ includeValues: false,
270
+ });
271
+ }
272
+ return this.vectorDB.QueryIndex({
273
+ vector,
274
+ topK,
275
+ includeMetadata: true,
276
+ includeValues: false,
277
+ });
278
+ }
279
+ /**
280
+ * Parse raw vector DB matches into a PotentialDuplicateResult.
281
+ */
282
+ ParseVectorMatches(queryResponse, sourceKey) {
283
+ const result = new PotentialDuplicateResult();
284
+ if (!queryResponse.data?.matches) {
285
+ return result;
286
+ }
287
+ for (const match of queryResponse.data.matches) {
288
+ if (!match?.id)
289
+ continue;
290
+ if (!match.metadata?.RecordID) {
291
+ LogError(`Invalid vector metadata for match: ${match.id}`);
292
+ continue;
210
293
  }
294
+ const duplicate = new PotentialDuplicate();
295
+ duplicate.LoadFromConcatenatedString(match.metadata.RecordID);
296
+ duplicate.ProbabilityScore = match.score;
297
+ result.Duplicates.push(duplicate);
211
298
  }
212
- return results;
299
+ return result;
213
300
  }
214
- async getListEntity(listID) {
215
- const md = new Metadata();
216
- let list = await md.GetEntityObject('MJ: Lists');
217
- list.ContextCurrentUser = super.CurrentUser;
301
+ /**
302
+ * Filter out self-matches where the candidate is the same record as the source.
303
+ */
304
+ FilterSelfMatches(duplicates, sourceKey) {
305
+ return duplicates.filter((d) => d.ToString() !== sourceKey.ToString());
306
+ }
307
+ // ─────────────────────────────────────────────
308
+ // Entity Loading
309
+ // ─────────────────────────────────────────────
310
+ /**
311
+ * Load records from an entity that are members of the specified list.
312
+ */
313
+ async LoadRecordsByListID(listID, entityID) {
314
+ const entityInfo = this.Metadata.EntityByID(entityID);
315
+ if (!entityInfo) {
316
+ throw new Error(`Entity not found for ID ${entityID}`);
317
+ }
318
+ const sanitizedListID = listID.replace(/'/g, "''");
319
+ const rvResult = await this.RunView.RunView({
320
+ EntityName: entityInfo.Name,
321
+ ExtraFilter: `ID IN (SELECT RecordID FROM __mj.vwListDetails WHERE ListID = '${sanitizedListID}')`,
322
+ ResultType: 'entity_object',
323
+ }, this.CurrentUser);
324
+ if (!rvResult.Success) {
325
+ throw new Error(rvResult.ErrorMessage);
326
+ }
327
+ return rvResult.Results;
328
+ }
329
+ async LoadListEntity(listID) {
330
+ const list = await this.Metadata.GetEntityObject('MJ: Lists');
331
+ list.ContextCurrentUser = this.CurrentUser;
218
332
  const success = await list.Load(listID);
219
333
  if (!success) {
220
334
  throw new Error(`Failed to load List record ${listID}`);
221
335
  }
222
336
  return list;
223
337
  }
224
- async getDuplicateRunEntity(DupeRunID) {
225
- const md = new Metadata();
226
- let dupeRun = await md.GetEntityObject('MJ: Duplicate Runs');
227
- dupeRun.ContextCurrentUser = super.CurrentUser;
228
- const success = await dupeRun.Load(DupeRunID);
338
+ async LoadDuplicateRun(duplicateRunID) {
339
+ const dupeRun = await this.Metadata.GetEntityObject('MJ: Duplicate Runs');
340
+ dupeRun.ContextCurrentUser = this.CurrentUser;
341
+ const success = await dupeRun.Load(duplicateRunID);
229
342
  if (!success) {
230
- throw new Error(`Failed to load Duplicate Run record ${DupeRunID}`);
343
+ throw new Error(`Failed to load Duplicate Run record ${duplicateRunID}`);
231
344
  }
232
345
  return dupeRun;
233
346
  }
234
- async getDuplicateRunEntityByListID(listID) {
235
- const entity = await super.RunViewForSingleValue('MJ: Duplicate Runs', `SourceListID = '${listID}'`);
347
+ async LoadDuplicateRunByListID(listID) {
348
+ const entity = await this.RunViewForSingleValue('MJ: Duplicate Runs', `SourceListID = '${listID.replace(/'/g, "''")}'`);
236
349
  if (!entity) {
237
- throw new Error(`Failed to load Duplicate Run record for List ${listID}`);
350
+ throw new Error(`No Duplicate Run found for List ${listID}`);
238
351
  }
239
352
  return entity;
240
353
  }
241
- async createListForDupeRun(entityDocument) {
242
- const md = new Metadata();
243
- const list = await md.GetEntityObject('MJ: Lists');
244
- list.NewRecord();
245
- list.Name = `Potential Duplicate Run`;
246
- list.Description = `Potential Duplicate Run for ${entityDocument.Entity} Entity`;
247
- list.EntityID = entityDocument.EntityID;
248
- list.UserID = super.CurrentUser.ID;
249
- const saveResult = await super.SaveEntity(list);
250
- if (!saveResult) {
251
- throw new Error(`Failed to save list for Potential Duplicate Run`);
354
+ // ─────────────────────────────────────────────
355
+ // Run Detail & Match Persistence (Batched)
356
+ // ─────────────────────────────────────────────
357
+ /**
358
+ * Create DuplicateRunDetail records for each item in the list, saving in parallel batches.
359
+ */
360
+ async CreateRunDetailRecordsFromList(listID, duplicateRunID) {
361
+ const viewResults = await this.RunView.RunView({
362
+ EntityName: 'MJ: List Details',
363
+ ExtraFilter: `ListID = '${listID.replace(/'/g, "''")}'`,
364
+ ResultType: 'entity_object',
365
+ }, this.CurrentUser);
366
+ if (!viewResults.Success) {
367
+ throw new Error(viewResults.ErrorMessage);
252
368
  }
253
- return list;
369
+ const listDetails = viewResults.Results;
370
+ const results = [];
371
+ for (const batch of chunkArray(listDetails, SAVE_BATCH_SIZE)) {
372
+ const batchResults = await Promise.all(batch.map(async (listDetail) => {
373
+ const runDetail = await this.Metadata.GetEntityObject('MJ: Duplicate Run Details');
374
+ runDetail.NewRecord();
375
+ runDetail.DuplicateRunID = duplicateRunID;
376
+ runDetail.RecordID = listDetail.RecordID;
377
+ runDetail.MatchStatus = 'Pending';
378
+ runDetail.MergeStatus = 'Pending';
379
+ const success = await this.SaveEntity(runDetail);
380
+ if (!success) {
381
+ LogError("Failed to save MJDuplicateRunDetailEntity", undefined, runDetail.LatestResult);
382
+ return null;
383
+ }
384
+ return runDetail;
385
+ }));
386
+ for (const r of batchResults) {
387
+ if (r)
388
+ results.push(r);
389
+ }
390
+ }
391
+ return results;
254
392
  }
255
- async createDuplicateRunDetailMatchesForRecord(DuplicateRunDetailID, duplicateResult) {
256
- const md = new Metadata();
257
- let matchRecords = [];
258
- for (const dupe of duplicateResult.Duplicates) {
259
- const match = await md.GetEntityObject('MJ: Duplicate Run Detail Matches');
260
- match.NewRecord();
261
- match.DuplicateRunDetailID = DuplicateRunDetailID;
262
- match.MatchRecordID = dupe.ToString();
263
- match.MatchProbability = dupe.ProbabilityScore;
264
- match.MatchedAt = new Date();
265
- match.MergedAt = new Date();
266
- match.Action = '';
267
- match.ApprovalStatus = 'Pending';
268
- match.MergeStatus = 'Pending';
269
- let success = await super.SaveEntity(match);
270
- if (success) {
271
- matchRecords.push(match);
393
+ /**
394
+ * Persist match results and update run detail records.
395
+ */
396
+ async PersistMatchResults(queryResults, duplicateRunDetails, entityDocument, options, startTime) {
397
+ const results = [];
398
+ let matchesFound = 0;
399
+ for (const qr of queryResults) {
400
+ results.push(qr.Duplicates);
401
+ matchesFound += qr.Duplicates.Duplicates.length;
402
+ const detail = duplicateRunDetails.find((d) => UUIDsEqual(d.RecordID, qr.SourceKey.Values()));
403
+ if (detail) {
404
+ const matchRecords = await this.CreateMatchRecordsForDetail(detail.ID, qr.Duplicates);
405
+ qr.Duplicates.DuplicateRunDetailMatchRecordIDs = matchRecords.map((m) => m.ID);
406
+ detail.MatchStatus = 'Complete';
407
+ const success = await this.SaveEntity(detail);
408
+ if (!success) {
409
+ LogError(`Failed to update Duplicate Run Detail record ${detail.ID}`);
410
+ }
411
+ }
412
+ else {
413
+ LogError(`No Duplicate Run Detail found for ${qr.SourceKey.ToString()}`);
414
+ }
415
+ this.reportProgress(options, 'Matching', queryResults.length, results.length, matchesFound, startTime);
416
+ }
417
+ return results;
418
+ }
419
+ /**
420
+ * Create match records for a single run detail, saving in parallel batches.
421
+ */
422
+ async CreateMatchRecordsForDetail(duplicateRunDetailID, duplicateResult) {
423
+ const matchRecords = [];
424
+ for (const batch of chunkArray(duplicateResult.Duplicates, SAVE_BATCH_SIZE)) {
425
+ const batchResults = await Promise.all(batch.map(async (dupe) => {
426
+ const match = await this.Metadata.GetEntityObject('MJ: Duplicate Run Detail Matches');
427
+ match.NewRecord();
428
+ match.DuplicateRunDetailID = duplicateRunDetailID;
429
+ match.MatchRecordID = dupe.ToString();
430
+ match.MatchProbability = dupe.ProbabilityScore;
431
+ match.MatchedAt = new Date();
432
+ match.Action = '';
433
+ match.ApprovalStatus = 'Pending';
434
+ match.MergeStatus = 'Pending';
435
+ const success = await this.SaveEntity(match);
436
+ return success ? match : null;
437
+ }));
438
+ for (const m of batchResults) {
439
+ if (m)
440
+ matchRecords.push(m);
272
441
  }
273
442
  }
274
443
  return matchRecords;
275
444
  }
276
- async mergeRecords(dupeResponse, entityDocument) {
277
- const md = new Metadata();
278
- for (const dupeResult of dupeResponse.PotentialDuplicateResult) {
445
+ // ─────────────────────────────────────────────
446
+ // Auto-Merge
447
+ // ─────────────────────────────────────────────
448
+ /**
449
+ * Automatically merge records that meet the absolute match threshold.
450
+ */
451
+ async ProcessAutoMerges(response, entityDocument) {
452
+ for (const dupeResult of response.PotentialDuplicateResult) {
279
453
  for (const [index, dupe] of dupeResult.Duplicates.entries()) {
280
- if (dupe.ToString() === dupeResult.RecordCompositeKey.ToString()) {
281
- //same record, skip
454
+ if (dupe.ProbabilityScore < entityDocument.AbsoluteMatchThreshold) {
282
455
  continue;
283
456
  }
284
- if (dupe.ProbabilityScore >= entityDocument.AbsoluteMatchThreshold) {
285
- //merge
286
- let mergeParams = new RecordMergeRequest();
287
- mergeParams.EntityName = entityDocument.Entity;
288
- mergeParams.SurvivingRecordCompositeKey = dupeResult.RecordCompositeKey;
289
- mergeParams.RecordsToMerge = [dupe];
290
- let result = await md.MergeRecords(mergeParams, super.CurrentUser);
291
- if (result.Success) {
292
- let dupeRunMatchRecord = await md.GetEntityObject('MJ: Duplicate Run Detail Matches', super.CurrentUser);
293
- let loadResult = await dupeRunMatchRecord.Load(dupeResult.DuplicateRunDetailMatchRecordIDs[index]);
294
- if (!loadResult) {
295
- LogError(`Failed to load Duplicate Run Match record ${dupeResult.DuplicateRunDetailMatchRecordIDs[index]}`);
296
- continue;
297
- }
298
- dupeRunMatchRecord.MergeStatus = 'Complete';
299
- dupeRunMatchRecord.Action = 'Merged';
300
- dupeRunMatchRecord.MergedAt = new Date();
301
- let saveResult = await dupeRunMatchRecord.Save();
302
- if (!saveResult) {
303
- LogError(`Failed to update Duplicate Run Match record ${dupeRunMatchRecord.ID}`);
304
- }
305
- }
306
- else {
307
- LogError(`Failed to merge records ${dupeResult.RecordCompositeKey.ToString()} and ${dupe.ToString()}`);
308
- }
457
+ const mergeParams = new RecordMergeRequest();
458
+ mergeParams.EntityName = entityDocument.Entity;
459
+ mergeParams.SurvivingRecordCompositeKey = dupeResult.RecordCompositeKey;
460
+ mergeParams.RecordsToMerge = [dupe];
461
+ const mergeResult = await this.Metadata.MergeRecords(mergeParams, this.CurrentUser);
462
+ if (mergeResult.Success) {
463
+ await this.updateMatchRecordAfterMerge(dupeResult.DuplicateRunDetailMatchRecordIDs[index]);
464
+ }
465
+ else {
466
+ LogError(`Failed to merge ${dupeResult.RecordCompositeKey.ToString()} and ${dupe.ToString()}`);
309
467
  }
310
468
  }
311
469
  }
312
470
  }
313
- async getVectorDuplicates(queryResponse) {
314
- let response = new PotentialDuplicateResult();
315
- for (const match of queryResponse.data.matches) {
316
- const record = match;
317
- if (!record || !record.id) {
318
- continue;
319
- }
320
- if (!record.metadata || !record.metadata.RecordID) {
321
- LogError(`Invalid vector metadata: ${record.id}`);
322
- continue;
323
- }
324
- let duplicate = new PotentialDuplicate();
325
- duplicate.LoadFromConcatenatedString(record.metadata.RecordID);
326
- duplicate.ProbabilityScore = record.score;
327
- response.Duplicates.push(duplicate);
471
+ /**
472
+ * Update a match record's status after a successful merge.
473
+ */
474
+ async updateMatchRecordAfterMerge(matchRecordID) {
475
+ const matchRecord = await this.Metadata.GetEntityObject('MJ: Duplicate Run Detail Matches', this.CurrentUser);
476
+ const loaded = await matchRecord.Load(matchRecordID);
477
+ if (!loaded) {
478
+ LogError(`Failed to load match record ${matchRecordID} for merge status update`);
479
+ return;
480
+ }
481
+ matchRecord.MergeStatus = 'Complete';
482
+ matchRecord.Action = 'Merged';
483
+ matchRecord.MergedAt = new Date();
484
+ const saved = await matchRecord.Save();
485
+ if (!saved) {
486
+ LogError(`Failed to update match record ${matchRecordID} after merge`);
487
+ }
488
+ }
489
+ // ─────────────────────────────────────────────
490
+ // Progress Reporting
491
+ // ─────────────────────────────────────────────
492
+ reportProgress(options, phase, totalRecords, processedRecords, matchesFound, startTime, currentRecordID) {
493
+ if (options.OnProgress) {
494
+ options.OnProgress({
495
+ Phase: phase,
496
+ TotalRecords: totalRecords,
497
+ ProcessedRecords: processedRecords,
498
+ MatchesFound: matchesFound,
499
+ CurrentRecordID: currentRecordID,
500
+ ElapsedMs: Date.now() - startTime,
501
+ });
502
+ }
503
+ }
504
+ }
505
+ // ─────────────────────────────────────────────
506
+ // Utility Functions
507
+ // ─────────────────────────────────────────────
508
/**
 * Split an array into consecutive chunks of at most `chunkSize` elements.
 *
 * The input is not modified; every chunk is a new array produced by `slice`.
 * An empty input yields an empty list of chunks.
 *
 * @param array     Array to split.
 * @param chunkSize Maximum number of elements per chunk; must be greater than 0.
 * @returns The chunks in original order; only the last one may be shorter.
 * @throws {RangeError} If `chunkSize` is not greater than 0 (including NaN).
 */
function chunkArray(array, chunkSize) {
    // A step of 0 or less would never advance the loop, and NaN would end it
    // after a single empty chunk, silently dropping every element.
    if (!(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be greater than 0, received ${chunkSize}`);
    }
    const chunks = [];
    for (let i = 0; i < array.length; i += chunkSize) {
        chunks.push(array.slice(i, i + chunkSize));
    }
    return chunks;
}
518
/**
 * Run async tasks with a concurrency limit.
 *
 * Starts up to `limit` workers; each worker repeatedly claims the next
 * unclaimed task and awaits it, until no tasks remain. Results are stored at
 * the index of the task that produced them, so the output order matches the
 * input order regardless of completion order.
 *
 * A `limit` below 1 (or one that is not a number) is treated as 1, so the
 * tasks still run, one at a time, instead of being skipped.
 *
 * If a task rejects, the returned promise rejects with that error.
 *
 * @param tasks Array of zero-argument functions, each returning a promise (or a value).
 * @param limit Maximum number of tasks in flight at once.
 * @returns Promise resolving to the task results, in task order.
 */
async function RunWithConcurrency(tasks, limit) {
    const results = [];
    let index = 0;
    async function runNext() {
        while (index < tasks.length) {
            // Claim the slot synchronously, before awaiting, so no two
            // workers ever pick the same task.
            const currentIndex = index++;
            results[currentIndex] = await tasks[currentIndex]();
        }
    }
    // Without this clamp a limit of 0, a negative limit or NaN would create
    // no workers at all and resolve to [] without running a single task.
    const requested = Math.floor(Number(limit));
    const workerCount = Math.min(tasks.length, requested >= 1 ? requested : 1);
    const workers = Array.from({ length: workerCount }, () => runNext());
    await Promise.all(workers);
    return results;
}
332
535
  //# sourceMappingURL=duplicateRecordDetector.js.map