@memberjunction/ai-vector-dupe 5.21.0 → 5.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,332 +1,811 @@
1
+ /**
2
+ * @fileoverview Modernized duplicate record detection engine.
3
+ *
4
+ * Orchestrates the full pipeline: vectorize records, query for similar candidates,
5
+ * optionally apply hybrid search (RRF) and reranking, persist match results,
6
+ * and auto-merge high-confidence duplicates.
7
+ *
8
+ * Supports three record-source modes:
9
+ * 1. List-based batch detection (ListID provided)
10
+ * 2. View-based detection (ViewID provided)
11
+ * 3. Entity-wide detection (no ListID/ViewID — scans all records or applies ExtraFilter)
12
+ *
13
+ * Also supports single-record checks via CheckSingleRecord().
14
+ *
15
+ * @module @memberjunction/ai-vector-dupe
16
+ */
1
17
  import { BaseEmbeddings, GetAIAPIKey } from "@memberjunction/ai";
2
- import { PotentialDuplicateResponse, RunView, PotentialDuplicateResult, Metadata, LogError, RecordMergeRequest, PotentialDuplicate } from "@memberjunction/core";
3
- import { LogStatus } from "@memberjunction/core";
18
+ import { PotentialDuplicateResponse, CompositeKey, PotentialDuplicateResult, LogError, LogStatus, RecordMergeRequest, PotentialDuplicate, RunView, } from "@memberjunction/core";
4
19
  import { VectorDBBase } from "@memberjunction/ai-vectordb";
5
20
  import { MJGlobal, UUIDsEqual } from "@memberjunction/global";
6
21
  import { VectorBase } from "@memberjunction/ai-vectors";
7
22
  import { EntityDocumentTemplateParser, EntityVectorSyncer } from "@memberjunction/ai-vector-sync";
23
+ import { TemplateEngineServer } from "@memberjunction/templates";
24
+ /** Default number of nearest neighbors to retrieve per record */
25
+ const DEFAULT_TOP_K = 5;
26
+ /** Default concurrency limit for parallel vector queries */
27
+ const DEFAULT_QUERY_CONCURRENCY = 10;
28
+ /** Default batch size for loading records to process */
29
+ const DEFAULT_BATCH_SIZE = 500;
30
+ /** Default batch size for parallel database saves */
31
+ const SAVE_BATCH_SIZE = 20;
32
+ /**
33
+ * Modernized duplicate record detection engine.
34
+ *
35
+ * Supports:
36
+ * - List-based batch detection (getDuplicateRecords)
37
+ * - View/filter/full-entity batch detection (vector-first approach)
38
+ * - Single-record duplicate check (CheckSingleRecord)
39
+ * - Hybrid search via RRF when vector DB supports it
40
+ * - Optional post-retrieval reranking via MJ's BaseReranker
41
+ * - Configurable topK, thresholds, and progress reporting
42
+ */
8
43
  export class DuplicateRecordDetector extends VectorBase {
9
44
  constructor() {
10
- super();
11
- this._runView = new RunView();
12
- }
13
- async getDuplicateRecords(params, contextUser) {
14
- super.CurrentUser = contextUser;
15
- //for testing
16
- //params.EntityID = 25050001;
17
- //params.EntityDocumentID = 17;
18
- let vectorizer = new EntityVectorSyncer();
19
- vectorizer.CurrentUser = super.CurrentUser;
20
- let entityDocument = await vectorizer.GetEntityDocument(params.EntityDocumentID);
21
- if (!entityDocument) {
22
- throw Error(`No Entity Document found with ID ${params.EntityDocumentID}`);
23
- //Update: No longer creating an entity docuement if one is not found
24
- //If an entitiy document is not found, that is our indicator that the
25
- //underlying entity's records have not been vectorized yet
26
- //const defaultVectorDB: MJVectorDatabaseEntity = super.getVectorDatabase();
27
- //const defaultAIModel: MJAIModelEntity = super.getAIModel();
28
- //entityDocument = await this.createEntityDocumentForEntity(params.EntityID, defaultVectorDB, defaultAIModel);
29
- }
30
- let response = new PotentialDuplicateResponse();
45
+ super(...arguments);
46
+ /**
47
+ * Tracks already-seen source↔match pairs across the entire run to suppress inverse duplicates.
48
+ * If A→B is persisted, B→A is skipped. Key format: "smallerID::largerID" for consistent ordering.
49
+ */
50
+ this._seenPairs = new Set();
51
+ }
52
+ /**
53
+ * Run duplicate detection for records identified by ListID, ViewID, ExtraFilter,
54
+ * or all records in the entity (vector-first approach).
55
+ *
56
+ * Flow: validate -> vectorize -> init providers -> load/create run ->
57
+ * load record IDs -> batch(embed -> query -> persist) -> complete run -> auto-merge
58
+ */
59
+ async GetDuplicateRecords(params, contextUser) {
60
+ this.CurrentUser = contextUser;
61
+ this._seenPairs.clear(); // Reset for each new run
62
+ const options = params.Options ?? {};
63
+ const startTime = Date.now();
64
+ const response = new PotentialDuplicateResponse();
31
65
  response.PotentialDuplicateResult = [];
66
+ // Step 1: Validate entity document
67
+ const entityDocument = await this.ValidateEntityDocument(params.EntityDocumentID);
32
68
  if (!entityDocument) {
33
- response.ErrorMessage = `No active Entity Document found for entity ${params.EntityID}`;
69
+ response.ErrorMessage = `No active Entity Document found for ID ${params.EntityDocumentID}`;
34
70
  response.Status = 'Error';
35
71
  return response;
36
72
  }
37
- //for testing
38
- const request = {
39
- entityID: entityDocument.EntityID,
40
- entityDocumentID: entityDocument.ID,
41
- listBatchCount: 20,
42
- options: {},
43
- CurrentUser: contextUser
44
- };
45
- console.log("vectorizing entity...");
73
+ // Step 2: Optionally vectorize source records (default: skip — vectors should already exist from sync)
74
+ if (options.Revectorize) {
75
+ this.reportProgress(options, 'Vectorizing', 0, 0, 0, startTime);
76
+ await this.VectorizeSourceRecords(entityDocument, contextUser);
77
+ }
78
+ // Step 3: Initialize providers
79
+ await this.InitializeProviders(entityDocument);
80
+ // Step 4: Create or load DuplicateRun
81
+ const duplicateRun = await this.ResolveOrCreateDuplicateRun(params, entityDocument, options);
82
+ // Step 5: Load record IDs to check (batch-friendly — only IDs)
83
+ this.reportProgress(options, 'Loading', 0, 0, 0, startTime);
84
+ const entityInfo = this.Metadata.EntityByID(entityDocument.EntityID);
85
+ if (!entityInfo) {
86
+ response.ErrorMessage = `Entity not found for ID ${entityDocument.EntityID}`;
87
+ response.Status = 'Error';
88
+ return response;
89
+ }
90
+ const recordIDs = await this.LoadRecordIDsToCheck(params, entityInfo);
91
+ if (recordIDs.length === 0) {
92
+ response.ErrorMessage = 'No records found to check for duplicates';
93
+ response.Status = 'Error';
94
+ return response;
95
+ }
96
+ // Step 6: Process in batches
97
+ const batchSize = DEFAULT_BATCH_SIZE;
98
+ const concurrency = this.GetQueryConcurrency(entityDocument);
99
+ const topK = options.TopK ?? DEFAULT_TOP_K;
100
+ const templateParser = EntityDocumentTemplateParser.CreateInstance();
101
+ let totalMatchesFound = 0;
102
+ for (let offset = 0; offset < recordIDs.length; offset += batchSize) {
103
+ const batchIDs = recordIDs.slice(offset, offset + batchSize);
104
+ const batchResults = await this.ProcessBatch(batchIDs, entityInfo, entityDocument, templateParser, duplicateRun.ID, topK, concurrency, options, startTime, recordIDs.length, offset, totalMatchesFound, contextUser);
105
+ response.PotentialDuplicateResult.push(...batchResults.Results);
106
+ totalMatchesFound += batchResults.MatchesFound;
107
+ }
108
+ // Step 7: Complete the duplicate run
109
+ duplicateRun.ProcessingStatus = 'Complete';
110
+ duplicateRun.EndedAt = new Date();
111
+ const runSaveSuccess = await this.SaveEntity(duplicateRun);
112
+ if (!runSaveSuccess) {
113
+ throw new Error(`Failed to update Duplicate Run record ${duplicateRun.ID}`);
114
+ }
115
+ // Step 8: Auto-merge high-confidence matches
116
+ this.reportProgress(options, 'Merging', recordIDs.length, recordIDs.length, totalMatchesFound, startTime);
117
+ await this.ProcessAutoMerges(response, entityDocument, options);
118
+ response.Status = 'Success';
119
+ LogStatus(`Duplicate detection complete: ${recordIDs.length} records checked, ${totalMatchesFound} matches found`);
120
+ return response;
121
+ }
122
+ /**
123
+ * Check a single record for duplicates without requiring a list.
124
+ * Embeds the record and queries for matches directly.
125
+ */
126
+ async CheckSingleRecord(EntityDocumentID, RecordID, Options, ContextUser) {
127
+ this.CurrentUser = ContextUser;
128
+ const options = Options ?? {};
129
+ const entityDocument = await this.ValidateEntityDocument(EntityDocumentID);
130
+ if (!entityDocument) {
131
+ throw new Error(`No active Entity Document found for ID ${EntityDocumentID}`);
132
+ }
133
+ await this.InitializeProviders(entityDocument);
134
+ // Load the single record
135
+ const entityInfo = this.Metadata.EntityByID(entityDocument.EntityID);
136
+ if (!entityInfo) {
137
+ throw new Error(`Entity not found for ID ${entityDocument.EntityID}`);
138
+ }
139
+ const records = await this.RunView.RunView({
140
+ EntityName: entityInfo.Name,
141
+ ExtraFilter: this.BuildExtraFilter([RecordID]),
142
+ ResultType: 'entity_object',
143
+ }, this.CurrentUser);
144
+ if (!records.Success || records.Results.length === 0) {
145
+ throw new Error(`Record not found: ${RecordID.ToString()}`);
146
+ }
147
+ const record = records.Results[0];
46
148
  const templateParser = EntityDocumentTemplateParser.CreateInstance();
47
- await vectorizer.VectorizeEntity(request, super.CurrentUser);
48
- const list = await this.getListEntity(params.ListID);
49
- let duplicateRun = params.Options?.DuplicateRunID ? await this.getDuplicateRunEntity(params.Options?.DuplicateRunID) : await this.getDuplicateRunEntityByListID(list.ID);
50
- //let duplicateRun: MJDuplicateRunEntity = await this.createDuplicateRunRecord(entityDocument, list.ID);
51
- const duplicateRunDetails = await this.createDuplicateRunDetailRecordsByListID(list.ID, duplicateRun.ID);
52
- //await this.createListDetailsForDupeRun(params.RecordIDs, list.ID);
53
- LogStatus(`Using vector database ${entityDocument.VectorDatabaseID} and AI Model ${entityDocument.AIModelID}`);
54
- const vectorDB = super.GetVectorDatabase(entityDocument.VectorDatabaseID);
55
- const aiModel = super.GetAIModel(entityDocument.AIModelID);
56
- LogStatus(`AIModel driver class: ${aiModel.DriverClass}`);
57
- LogStatus(`VectorDB class key: ${vectorDB.ClassKey}`);
149
+ const templateTexts = await this.GenerateTemplateTexts(templateParser, entityDocument, [record], ContextUser);
150
+ const embedResult = await this.embedding.EmbedTexts({ texts: templateTexts, model: null });
151
+ const topK = options.TopK ?? DEFAULT_TOP_K;
152
+ const queryResults = await this.QueryDuplicatesForRecords([record], embedResult.vectors, templateTexts, entityDocument, topK, options, this.GetQueryConcurrency(entityDocument));
153
+ return queryResults.length > 0 ? queryResults[0].Duplicates : new PotentialDuplicateResult();
154
+ }
155
+ // ─────────────────────────────────────────────
156
+ // Batch Processing
157
+ // ─────────────────────────────────────────────
158
+ /**
159
+ * Process a single batch of records: load data, embed, query the vector DB, and persist match results.
160
+ */
161
+ async ProcessBatch(batchIDs, entityInfo, entityDocument, templateParser, duplicateRunID, topK, concurrency, options, startTime, totalRecords, processedSoFar, matchesSoFar, contextUser) {
162
+ // 6a: Load full record data for this batch (needed for template rendering)
163
+ const compositeKeys = batchIDs.map(id => {
164
+ const ck = new CompositeKey();
165
+ ck.KeyValuePairs.push({ FieldName: entityInfo.FirstPrimaryKey.Name, Value: id });
166
+ return ck;
167
+ });
168
+ const records = await this.LoadRecordsByKeys(compositeKeys, entityInfo);
169
+ if (records.length === 0) {
170
+ return { Results: [], MatchesFound: 0 };
171
+ }
172
+ // 6b: Build source record metadata map for rich UI display
173
+ const sourceMetadataMap = this.buildSourceMetadataMap(records, entityInfo);
174
+ // 6c: Create DuplicateRunDetail records for this batch
175
+ const duplicateRunDetails = await this.CreateRunDetailRecords(batchIDs, duplicateRunID, entityInfo, sourceMetadataMap);
176
+ // Generate template texts for the batch and embed them
177
+ this.reportProgress(options, 'Embedding', totalRecords, processedSoFar, matchesSoFar, startTime);
178
+ const templateTexts = await this.GenerateTemplateTexts(templateParser, entityDocument, records, contextUser);
179
+ const embedResult = await this.embedding.EmbedTexts({ texts: templateTexts, model: null });
180
+ // 6d: Query vector DB for each record with concurrency control
181
+ this.reportProgress(options, 'Querying', totalRecords, processedSoFar, matchesSoFar, startTime);
182
+ const queryResults = await this.QueryDuplicatesForRecords(records, embedResult.vectors, templateTexts, entityDocument, topK, options, concurrency);
183
+ // 6e: Persist match results and update run details
184
+ this.reportProgress(options, 'Matching', totalRecords, processedSoFar + records.length, matchesSoFar, startTime);
185
+ const results = await this.PersistMatchResults(queryResults, duplicateRunDetails, entityDocument, options, startTime);
186
+ const batchMatches = results.reduce((sum, r) => sum + r.Duplicates.length, 0);
187
+ return { Results: results, MatchesFound: batchMatches };
188
+ }
189
+ // ─────────────────────────────────────────────
190
+ // Record ID Loading (multiple strategies)
191
+ // ─────────────────────────────────────────────
192
+ /**
193
+ * Load the IDs of records to check, using the appropriate strategy based on the request.
194
+ * Returns an array of primary key value strings.
195
+ */
196
+ async LoadRecordIDsToCheck(params, entityInfo) {
197
+ if (params.ListID) {
198
+ return this.LoadRecordIDsFromList(params.ListID);
199
+ }
200
+ if (params.ViewID) {
201
+ return this.LoadRecordIDsFromView(params.ViewID, entityInfo);
202
+ }
203
+ // ExtraFilter or all records
204
+ return this.LoadRecordIDsFromEntity(entityInfo, params.ExtraFilter);
205
+ }
206
+ /**
207
+ * Load record IDs from a list's detail records.
208
+ */
209
+ async LoadRecordIDsFromList(listID) {
210
+ const sanitizedListID = listID.replace(/'/g, "''");
211
+ const viewResults = await this.RunView.RunView({
212
+ EntityName: 'MJ: List Details',
213
+ ExtraFilter: `ListID = '${sanitizedListID}'`,
214
+ Fields: ['RecordID'],
215
+ ResultType: 'simple',
216
+ }, this.CurrentUser);
217
+ if (!viewResults.Success) {
218
+ throw new Error(`Failed to load list details: ${viewResults.ErrorMessage}`);
219
+ }
220
+ return viewResults.Results.map(r => r.RecordID);
221
+ }
222
+ /**
223
+ * Load record IDs by running a saved view.
224
+ */
225
+ async LoadRecordIDsFromView(viewID, entityInfo) {
226
+ const pkField = entityInfo.FirstPrimaryKey.Name;
227
+ const sanitizedViewID = viewID.replace(/'/g, "''");
228
+ // Load the view definition to get its filter
229
+ const viewEntity = await this.RunViewForSingleValue('Views', `ID = '${sanitizedViewID}'`);
230
+ if (!viewEntity) {
231
+ throw new Error(`View not found: ${viewID}`);
232
+ }
233
+ // Run the entity with the view's filter to get IDs
234
+ const viewResults = await this.RunView.RunView({
235
+ ViewID: viewID,
236
+ Fields: [pkField],
237
+ ResultType: 'simple',
238
+ }, this.CurrentUser);
239
+ if (!viewResults.Success) {
240
+ throw new Error(`Failed to run view ${viewID}: ${viewResults.ErrorMessage}`);
241
+ }
242
+ return viewResults.Results.map(r => r[pkField]);
243
+ }
244
+ /**
245
+ * Load record IDs directly from the entity, optionally filtered.
246
+ * Requests only the primary-key field with ResultType: 'simple' for efficiency.
247
+ */
248
+ async LoadRecordIDsFromEntity(entityInfo, extraFilter) {
249
+ const pkField = entityInfo.FirstPrimaryKey.Name;
250
+ const viewResults = await this.RunView.RunView({
251
+ EntityName: entityInfo.Name,
252
+ ExtraFilter: extraFilter,
253
+ Fields: [pkField],
254
+ ResultType: 'simple',
255
+ }, this.CurrentUser);
256
+ if (!viewResults.Success) {
257
+ throw new Error(`Failed to load record IDs from ${entityInfo.Name}: ${viewResults.ErrorMessage}`);
258
+ }
259
+ return viewResults.Results.map(r => r[pkField]);
260
+ }
261
+ // ─────────────────────────────────────────────
262
+ // Validation & Setup
263
+ // ─────────────────────────────────────────────
264
+ /**
265
+ * Validate and return an entity document, or null if not found.
266
+ */
267
+ async ValidateEntityDocument(entityDocumentID) {
268
+ const vectorizer = new EntityVectorSyncer();
269
+ vectorizer.CurrentUser = this.CurrentUser;
270
+ return vectorizer.GetEntityDocument(entityDocumentID);
271
+ }
272
+ /**
273
+ * Initialize embedding and vector DB providers via ClassFactory.
274
+ */
275
+ async InitializeProviders(entityDocument) {
276
+ const aiModel = this.GetAIModel(entityDocument.AIModelID);
277
+ const vectorDB = this.GetVectorDatabase(entityDocument.VectorDatabaseID);
58
278
  const embeddingAPIKey = GetAIAPIKey(aiModel.DriverClass);
59
279
  const vectorDBAPIKey = GetAIAPIKey(vectorDB.ClassKey);
60
280
  if (!embeddingAPIKey) {
61
- throw Error(`No API Key found for AI Model ${aiModel.DriverClass}`);
281
+ throw new Error(`No API Key found for AI Model ${aiModel.DriverClass}`);
62
282
  }
63
283
  if (!vectorDBAPIKey) {
64
- throw Error(`No API Key found for Vector Database ${vectorDB.ClassKey}`);
284
+ throw new Error(`No API Key found for Vector Database ${vectorDB.ClassKey}`);
65
285
  }
66
- //LogStatus(`Embedding API Key: ${embeddingAPIKey} VectorDB API Key: ${vectorDBAPIKey}`);
67
- this._embedding = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, aiModel.DriverClass, embeddingAPIKey);
68
- this._vectorDB = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, vectorDB.ClassKey, vectorDBAPIKey);
69
- if (!this._embedding) {
70
- throw Error(`Failed to create Embeddings instance for AI Model ${aiModel.DriverClass}`);
286
+ this.embedding = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, aiModel.DriverClass, embeddingAPIKey);
287
+ this.vectorDB = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, vectorDB.ClassKey, vectorDBAPIKey);
288
+ if (!this.embedding) {
289
+ throw new Error(`Failed to create Embeddings instance for ${aiModel.DriverClass}`);
71
290
  }
72
- if (!this._vectorDB) {
73
- throw Error(`Failed to create Vector Database instance for ${vectorDB.ClassKey}`);
291
+ if (!this.vectorDB) {
292
+ throw new Error(`Failed to create VectorDB instance for ${vectorDB.ClassKey}`);
74
293
  }
75
- let records = await this.GetRecordsByListID(list.ID, entityDocument.EntityID);
76
- if (records.length === 0) {
77
- LogError(`No records found in list ${list.Name}, with listID ${list.ID} and EntityID ${entityDocument.EntityID} exiting early`);
78
- response.ErrorMessage = `No records found in list ${list.Name}`;
79
- response.Status = 'Error';
80
- return response;
81
- }
82
- LogStatus("Vectorizing " + records.length + " records");
83
- const recordTemplates = [];
84
- //Relationship(entityID: number, entityRecord: any, relationshipName: string, maxRows: number, entityDocumentName: string)
85
- let sampleTemplate = entityDocument.Template;
86
- //sampleTemplate += " ${Relationship('Deals', 5, 'Sample Relationship Document for crm.Deals Entity')} ${Relationship('Deals', 5, 'Second Sample Relationship Document for crm.Deals Entity')}";
87
- for (const record of records) {
88
- const template = await templateParser.Parse(sampleTemplate, entityDocument.EntityID, record, contextUser);
89
- recordTemplates.push(template);
90
- }
91
- let embedTextsResult = await this._embedding.EmbedTexts({ texts: recordTemplates, model: null });
92
- const topK = 5;
93
- let results = [];
94
- for (const [index, vector] of embedTextsResult.vectors.entries()) {
95
- const compositeKey = records[index].PrimaryKey;
96
- let filterResult = await this._vectorDB.queryIndex({ vector: vector, topK: topK, includeMetadata: true, includeValues: false });
97
- if (!filterResult.success) {
98
- LogError(`Failed to query index for record ${compositeKey.ToString()}`);
99
- continue;
294
+ // Resolve the vector index name from the entity document's VectorIndexID
295
+ // This is the actual Pinecone/pgvector/Qdrant index name needed for QueryIndex calls
296
+ if (entityDocument.VectorIndexID) {
297
+ const rv = new RunView();
298
+ const indexResult = await rv.RunView({
299
+ EntityName: 'MJ: Vector Indexes',
300
+ ExtraFilter: `ID='${entityDocument.VectorIndexID}'`,
301
+ Fields: ['Name'],
302
+ ResultType: 'simple',
303
+ MaxRows: 1
304
+ }, this.CurrentUser);
305
+ if (indexResult.Success && indexResult.Results.length > 0) {
306
+ this.indexName = indexResult.Results[0].Name;
100
307
  }
101
- let queryResult = await this.getVectorDuplicates(filterResult);
102
- queryResult.Duplicates = queryResult.Duplicates.filter((dupe) => {
103
- return dupe.ProbabilityScore >= entityDocument.PotentialMatchThreshold;
104
- });
105
- queryResult.EntityID = entityDocument.EntityID;
106
- queryResult.RecordCompositeKey = compositeKey;
107
- results.push(queryResult);
108
- //now update all of the dupe run detail records
109
- let dupeRunDetail = duplicateRunDetails.find((detail) => UUIDsEqual(detail.RecordID, compositeKey.Values()));
110
- if (dupeRunDetail) {
111
- const matchRecords = await this.createDuplicateRunDetailMatchesForRecord(dupeRunDetail.ID, queryResult);
112
- queryResult.DuplicateRunDetailMatchRecordIDs = matchRecords.map((match) => match.ID);
113
- dupeRunDetail.MatchStatus = 'Complete';
114
- let success = await super.SaveEntity(dupeRunDetail);
115
- if (!success) {
116
- LogStatus(`Failed to update Duplicate Run Detail record ${dupeRunDetail.ID}`);
308
+ }
309
+ if (!this.indexName) {
310
+ throw new Error(`No vector index found for entity document "${entityDocument.Name}". Ensure VectorIndexID is set on the entity document.`);
311
+ }
312
+ LogStatus(`Providers initialized: AI Model=${aiModel.DriverClass}, VectorDB=${vectorDB.ClassKey}, Index=${this.indexName}`);
313
+ }
314
+ /**
315
+ * Run vectorization for the entity document's records.
316
+ */
317
+ async VectorizeSourceRecords(entityDocument, contextUser) {
318
+ const vectorizer = new EntityVectorSyncer();
319
+ vectorizer.CurrentUser = contextUser;
320
+ const request = {
321
+ entityID: entityDocument.EntityID,
322
+ entityDocumentID: entityDocument.ID,
323
+ listBatchCount: 20,
324
+ options: {},
325
+ CurrentUser: contextUser,
326
+ };
327
+ LogStatus(`Vectorizing entity records for document ${entityDocument.Name}`);
328
+ await vectorizer.VectorizeEntity(request, contextUser);
329
+ }
330
+ /**
331
+ * Read the maxConcurrentRequests from the VectorDatabase entity's Configuration column,
332
+ * falling back to DEFAULT_QUERY_CONCURRENCY if not set.
333
+ */
334
+ GetQueryConcurrency(entityDocument) {
335
+ const vectorDBEntity = this.GetVectorDatabase(entityDocument.VectorDatabaseID);
336
+ if (vectorDBEntity.Configuration) {
337
+ try {
338
+ const config = JSON.parse(vectorDBEntity.Configuration);
339
+ if (config.throughput?.maxConcurrentRequests != null) {
340
+ return config.throughput.maxConcurrentRequests;
117
341
  }
118
342
  }
119
- else {
120
- LogError(`Failed to find Duplicate Run Detail record for ${compositeKey.ToString()}`);
343
+ catch {
344
+ // Invalid JSON in Configuration — fall through to the default
121
345
  }
122
346
  }
123
- //almost done
124
- duplicateRun.ProcessingStatus = 'Complete';
125
- duplicateRun.EndedAt = new Date();
126
- let success = await super.SaveEntity(duplicateRun);
347
+ return DEFAULT_QUERY_CONCURRENCY;
348
+ }
349
+ // ─────────────────────────────────────────────
350
+ // DuplicateRun Management
351
+ // ─────────────────────────────────────────────
352
+ /**
353
+ * Resolve an existing DuplicateRun or create a new one.
354
+ * Supports both list-based and list-free operation.
355
+ */
356
+ async ResolveOrCreateDuplicateRun(params, entityDocument, options) {
357
+ // If a specific run ID was provided, load it
358
+ if (options.DuplicateRunID) {
359
+ return this.LoadDuplicateRun(options.DuplicateRunID);
360
+ }
361
+ // If a ListID is provided, try to find an existing run for that list
362
+ if (params.ListID) {
363
+ const existing = await this.FindDuplicateRunByListID(params.ListID);
364
+ if (existing) {
365
+ return existing;
366
+ }
367
+ }
368
+ // Create a new DuplicateRun
369
+ return this.CreateDuplicateRun(entityDocument, params.ListID);
370
+ }
371
+ /**
372
+ * Create a new DuplicateRun record.
373
+ */
374
+ async CreateDuplicateRun(entityDocument, listID) {
375
+ const dupeRun = await this.Metadata.GetEntityObject('MJ: Duplicate Runs', this.CurrentUser);
376
+ dupeRun.NewRecord();
377
+ dupeRun.EntityID = entityDocument.EntityID;
378
+ dupeRun.StartedByUserID = this.CurrentUser?.ID;
379
+ dupeRun.StartedAt = new Date();
380
+ dupeRun.ProcessingStatus = 'In Progress';
381
+ dupeRun.ApprovalStatus = 'Pending';
382
+ if (listID) {
383
+ dupeRun.SourceListID = listID;
384
+ }
385
+ const success = await this.SaveEntity(dupeRun);
127
386
  if (!success) {
128
- throw new Error(`Failed to update Duplicate Run record ${duplicateRun.ID}`);
387
+ throw new Error('Failed to create Duplicate Run record');
129
388
  }
130
- await this.mergeRecords(response, entityDocument);
131
- response.PotentialDuplicateResult = results;
132
- response.Status = 'Success';
133
- LogStatus("Dupe Run complete. Response:");
134
- LogStatus(JSON.stringify(response, null, "\t"));
135
- return response;
389
+ return dupeRun;
136
390
  }
137
- async GetRecordsByListID(listID, entityID) {
138
- const entityInfo = super.Metadata.EntityByID(entityID);
391
+ async LoadDuplicateRun(duplicateRunID) {
392
+ const dupeRun = await this.Metadata.GetEntityObject('MJ: Duplicate Runs', this.CurrentUser);
393
+ dupeRun.ContextCurrentUser = this.CurrentUser;
394
+ const success = await dupeRun.Load(duplicateRunID);
395
+ if (!success) {
396
+ throw new Error(`Failed to load Duplicate Run record ${duplicateRunID}`);
397
+ }
398
+ return dupeRun;
399
+ }
400
+ /**
401
+ * Try to find an existing DuplicateRun for a given ListID. Returns null if none found.
402
+ */
403
+ async FindDuplicateRunByListID(listID) {
404
+ return this.RunViewForSingleValue('MJ: Duplicate Runs', `SourceListID = '${listID.replace(/'/g, "''")}'`);
405
+ }
406
+ // ─────────────────────────────────────────────
407
+ // Entity Loading
408
+ // ─────────────────────────────────────────────
409
+ /**
410
+ * Load full entity objects for a batch of composite keys.
411
+ */
412
+ async LoadRecordsByKeys(compositeKeys, entityInfo) {
413
+ if (compositeKeys.length === 0) {
414
+ return [];
415
+ }
416
+ const rvResult = await this.RunView.RunView({
417
+ EntityName: entityInfo.Name,
418
+ ExtraFilter: this.BuildExtraFilter(compositeKeys),
419
+ ResultType: 'entity_object',
420
+ }, this.CurrentUser);
421
+ if (!rvResult.Success) {
422
+ throw new Error(rvResult.ErrorMessage);
423
+ }
424
+ return rvResult.Results;
425
+ }
426
+ /**
427
+ * Load records from an entity that are members of the specified list.
428
+ * Kept for backward compatibility.
429
+ */
430
+ async LoadRecordsByListID(listID, entityID) {
431
+ const entityInfo = this.Metadata.EntityByID(entityID);
139
432
  if (!entityInfo) {
140
- throw new Error(`Failed to load Entity Info with ID ${entityID}`);
433
+ throw new Error(`Entity not found for ID ${entityID}`);
141
434
  }
142
- const rvResult = await super.RunView.RunView({
435
+ const sanitizedListID = listID.replace(/'/g, "''");
436
+ const rvResult = await this.RunView.RunView({
143
437
  EntityName: entityInfo.Name,
144
- ExtraFilter: `ID IN (SELECT RecordID FROM __mj.vwListDetails WHERE ListID = '${listID}')`,
145
- ResultType: 'entity_object'
146
- }, super.CurrentUser);
438
+ ExtraFilter: `ID IN (SELECT RecordID FROM __mj.vwListDetails WHERE ListID = '${sanitizedListID}')`,
439
+ ResultType: 'entity_object',
440
+ }, this.CurrentUser);
147
441
  if (!rvResult.Success) {
148
442
  throw new Error(rvResult.ErrorMessage);
149
443
  }
150
444
  return rvResult.Results;
151
445
  }
152
- async createDuplicateRunRecord(entityDocument, listID) {
153
- const md = new Metadata();
154
- let duplicateRun = await md.GetEntityObject('MJ: Duplicate Runs');
155
- duplicateRun.NewRecord();
156
- duplicateRun.EntityID = entityDocument.EntityID;
157
- duplicateRun.StartedByUserID = super.CurrentUser.ID;
158
- duplicateRun.StartedAt = new Date();
159
- duplicateRun.ProcessingStatus = 'In Progress';
160
- duplicateRun.ApprovalStatus = 'Pending';
161
- duplicateRun.SourceListID = listID;
162
- const saveResult = await super.SaveEntity(duplicateRun);
163
- if (!saveResult) {
164
- throw new Error(`Failed to save list for Potential Duplicate Run`);
165
- }
166
- return duplicateRun;
167
- }
168
- async createDuplicateRunDetailRecords(recordIDs, duplicateRunID) {
169
- let results = [];
170
- const md = new Metadata();
171
- for (const recordID of recordIDs) {
172
- let runDetail = await md.GetEntityObject('MJ: Duplicate Run Details');
173
- runDetail.NewRecord();
174
- runDetail.DuplicateRunID = duplicateRunID;
175
- runDetail.RecordID = recordID.ToString();
176
- runDetail.MatchStatus = 'Pending';
177
- runDetail.MergeStatus = 'Pending';
178
- const success = await super.SaveEntity(runDetail);
179
- if (success) {
180
- results.push(runDetail);
181
- }
446
+ async LoadListEntity(listID) {
447
+ const list = await this.Metadata.GetEntityObject('MJ: Lists');
448
+ list.ContextCurrentUser = this.CurrentUser;
449
+ const success = await list.Load(listID);
450
+ if (!success) {
451
+ throw new Error(`Failed to load List record ${listID}`);
182
452
  }
183
- return results;
453
+ return list;
184
454
  }
185
- async createDuplicateRunDetailRecordsByListID(listID, duplicateRunID) {
186
- let results = [];
187
- const viewResults = await super.RunView.RunView({
188
- EntityName: 'MJ: List Details',
189
- ExtraFilter: `ListID = '${listID}'`,
190
- ResultType: 'entity_object'
191
- }, super.CurrentUser);
192
- if (!viewResults.Success) {
193
- throw new Error(viewResults.ErrorMessage);
194
- }
195
- const md = new Metadata();
196
- const listDetails = viewResults.Results;
197
- for (const listDetail of listDetails) {
198
- let runDetail = await md.GetEntityObject('MJ: Duplicate Run Details');
199
- runDetail.NewRecord();
200
- runDetail.DuplicateRunID = duplicateRunID;
201
- runDetail.RecordID = listDetail.RecordID;
202
- runDetail.MatchStatus = 'Pending';
203
- runDetail.MergeStatus = 'Pending';
204
- const success = await super.SaveEntity(runDetail);
205
- if (success) {
206
- results.push(runDetail);
455
+ // ─────────────────────────────────────────────
456
+ // Template Generation & Embedding
457
+ // ─────────────────────────────────────────────
458
+ /**
459
+ * Generate human-readable template text for each record using the entity document template.
460
+ *
461
+ * Loads the template from TemplateEngineServer and renders it via Nunjucks,
462
+ * matching the same approach used by the vectorization pipeline.
463
+ */
464
+ async GenerateTemplateTexts(templateParser, entityDocument, records, contextUser) {
465
+ await TemplateEngineServer.Instance.Config(false, contextUser);
466
+ const template = this.loadTemplate(entityDocument);
467
+ const templateContent = template.Content[0];
468
+ TemplateEngineServer.Instance.SetupNunjucks();
469
+ const templateTexts = [];
470
+ for (const record of records) {
471
+ // NEW convention: main entity fields are TOP-LEVEL variables (no Entity. prefix).
472
+ // Spread record fields directly into root context so templates use {{FieldName}}.
473
+ const data = { ...record.GetAll() };
474
+ const result = await TemplateEngineServer.Instance.RenderTemplate(template, templateContent, data, true);
475
+ if (result.Success) {
476
+ templateTexts.push(result.Output);
207
477
  }
208
478
  else {
209
- LogError("Failed to save MJDuplicateRunDetailEntity", undefined, runDetail.LatestResult);
479
+ LogError(`Template render failed for record ${record.PrimaryKey.ToString()}: ${result.Message}`);
480
+ templateTexts.push('');
210
481
  }
211
482
  }
212
- return results;
483
+ return templateTexts;
213
484
  }
214
- async getListEntity(listID) {
215
- const md = new Metadata();
216
- let list = await md.GetEntityObject('MJ: Lists');
217
- list.ContextCurrentUser = super.CurrentUser;
218
- const success = await list.Load(listID);
219
- if (!success) {
220
- throw new Error(`Failed to load List record ${listID}`);
485
+ /**
486
+ * Load the template entity from TemplateEngineServer for the given entity document.
487
+ */
488
+ loadTemplate(entityDocument) {
489
+ const template = TemplateEngineServer.Instance.Templates.find((t) => UUIDsEqual(t.ID, entityDocument.TemplateID));
490
+ if (!template) {
491
+ throw new Error(`Template not found for ID ${entityDocument.TemplateID}`);
221
492
  }
222
- return list;
493
+ if (template.Content.length === 0) {
494
+ throw new Error(`Template ${template.ID} has no content records`);
495
+ }
496
+ return template;
223
497
  }
224
- async getDuplicateRunEntity(DupeRunID) {
225
- const md = new Metadata();
226
- let dupeRun = await md.GetEntityObject('MJ: Duplicate Runs');
227
- dupeRun.ContextCurrentUser = super.CurrentUser;
228
- const success = await dupeRun.Load(DupeRunID);
229
- if (!success) {
230
- throw new Error(`Failed to load Duplicate Run record ${DupeRunID}`);
498
+ // ─────────────────────────────────────────────
499
+ // Vector Query & Hybrid Search
500
+ // ─────────────────────────────────────────────
501
+ /**
502
+ * Query the vector DB for duplicates of each record, with concurrency control.
503
+ * Supports hybrid search and RRF fusion when the vector DB supports it.
504
+ */
505
+ async QueryDuplicatesForRecords(records, vectors, templateTexts, entityDocument, topK, options, concurrency) {
506
+ const tasks = records.map((record, index) => async () => {
507
+ const compositeKey = record.PrimaryKey;
508
+ const vector = vectors[index];
509
+ const templateText = templateTexts[index];
510
+ const queryResponse = await this.executeVectorQuery(vector, templateText, topK, options);
511
+ if (!queryResponse.success) {
512
+ LogError(`Failed to query index for record ${compositeKey.ToString()}`);
513
+ const emptyResult = new PotentialDuplicateResult();
514
+ emptyResult.EntityID = entityDocument.EntityID;
515
+ emptyResult.RecordCompositeKey = compositeKey;
516
+ return { SourceKey: compositeKey, TemplateText: templateText, Duplicates: emptyResult };
517
+ }
518
+ const dupeResult = this.ParseVectorMatches(queryResponse, compositeKey);
519
+ dupeResult.Duplicates = this.FilterSelfMatches(dupeResult.Duplicates, compositeKey);
520
+ const potentialThreshold = options.PotentialMatchThreshold ?? entityDocument.PotentialMatchThreshold;
521
+ dupeResult.Duplicates = dupeResult.Duplicates.filter((d) => d.ProbabilityScore >= potentialThreshold);
522
+ dupeResult.EntityID = entityDocument.EntityID;
523
+ dupeResult.RecordCompositeKey = compositeKey;
524
+ return { SourceKey: compositeKey, TemplateText: templateText, Duplicates: dupeResult };
525
+ });
526
+ return RunWithConcurrency(tasks, concurrency);
527
+ }
528
+ /**
529
+ * Execute a vector query — uses hybrid search with RRF when the provider supports it.
530
+ */
531
+ async executeVectorQuery(vector, templateText, topK, options) {
532
+ if (this.vectorDB.SupportsHybridSearch && templateText) {
533
+ return this.vectorDB.HybridQuery({
534
+ vector,
535
+ topK,
536
+ KeywordQuery: templateText,
537
+ Alpha: options.KeywordSearchWeight != null ? (1.0 - options.KeywordSearchWeight) : 0.7,
538
+ FusionMethod: options.FusionMethod ?? 'rrf',
539
+ includeMetadata: true,
540
+ includeValues: false,
541
+ });
231
542
  }
232
- return dupeRun;
543
+ return this.vectorDB.QueryIndex({
544
+ id: this.indexName,
545
+ vector,
546
+ topK,
547
+ includeMetadata: true,
548
+ includeValues: false,
549
+ });
233
550
  }
234
- async getDuplicateRunEntityByListID(listID) {
235
- const entity = await super.RunViewForSingleValue('MJ: Duplicate Runs', `SourceListID = '${listID}'`);
236
- if (!entity) {
237
- throw new Error(`Failed to load Duplicate Run record for List ${listID}`);
238
- }
239
- return entity;
240
- }
241
- async createListForDupeRun(entityDocument) {
242
- const md = new Metadata();
243
- const list = await md.GetEntityObject('MJ: Lists');
244
- list.NewRecord();
245
- list.Name = `Potential Duplicate Run`;
246
- list.Description = `Potential Duplicate Run for ${entityDocument.Entity} Entity`;
247
- list.EntityID = entityDocument.EntityID;
248
- list.UserID = super.CurrentUser.ID;
249
- const saveResult = await super.SaveEntity(list);
250
- if (!saveResult) {
251
- throw new Error(`Failed to save list for Potential Duplicate Run`);
551
+ /**
552
+ * Parse raw vector DB matches into a PotentialDuplicateResult.
553
+ */
554
+ ParseVectorMatches(queryResponse, sourceKey) {
555
+ const result = new PotentialDuplicateResult();
556
+ if (!queryResponse.data?.matches) {
557
+ return result;
252
558
  }
253
- return list;
559
+ for (const match of queryResponse.data.matches) {
560
+ if (!match?.id)
561
+ continue;
562
+ if (!match.metadata?.RecordID) {
563
+ LogError(`Invalid vector metadata for match: ${match.id}`);
564
+ continue;
565
+ }
566
+ const duplicate = new PotentialDuplicate();
567
+ duplicate.LoadFromConcatenatedString(match.metadata.RecordID);
568
+ duplicate.ProbabilityScore = match.score;
569
+ // Capture the full vector metadata for rich UI display
570
+ duplicate.VectorMetadata = { ...match.metadata };
571
+ result.Duplicates.push(duplicate);
572
+ }
573
+ return result;
254
574
  }
255
- async createDuplicateRunDetailMatchesForRecord(DuplicateRunDetailID, duplicateResult) {
256
- const md = new Metadata();
257
- let matchRecords = [];
258
- for (const dupe of duplicateResult.Duplicates) {
259
- const match = await md.GetEntityObject('MJ: Duplicate Run Detail Matches');
260
- match.NewRecord();
261
- match.DuplicateRunDetailID = DuplicateRunDetailID;
262
- match.MatchRecordID = dupe.ToString();
263
- match.MatchProbability = dupe.ProbabilityScore;
264
- match.MatchedAt = new Date();
265
- match.MergedAt = new Date();
266
- match.Action = '';
267
- match.ApprovalStatus = 'Pending';
268
- match.MergeStatus = 'Pending';
269
- let success = await super.SaveEntity(match);
270
- if (success) {
271
- matchRecords.push(match);
575
+ /**
576
+ * Filter out self-matches where the candidate is the same record as the source.
577
+ */
578
+ /**
579
+ * Build a map of recordID JSON metadata string from loaded BaseEntity records.
580
+ * Extracts the entity's name field and a few key display fields for rich UI rendering.
581
+ */
582
+ buildSourceMetadataMap(records, entityInfo) {
583
+ const metadataMap = new Map();
584
+ const nameField = entityInfo.NameField;
585
+ // Collect a small set of useful display fields
586
+ const displayFieldNames = ['Name', 'Title', 'Description', 'Status', 'Type']
587
+ .filter(fn => entityInfo.Fields.find(f => f.Name === fn));
588
+ for (const record of records) {
589
+ const pk = record.PrimaryKey;
590
+ const id = pk.KeyValuePairs.length === 1 ? String(pk.KeyValuePairs[0].Value) : pk.Values();
591
+ const meta = {
592
+ Entity: entityInfo.Name,
593
+ };
594
+ if (entityInfo.Icon) {
595
+ meta['EntityIcon'] = entityInfo.Icon;
596
+ }
597
+ if (nameField) {
598
+ const nameVal = record.Get(nameField.Name);
599
+ if (nameVal != null)
600
+ meta['Name'] = String(nameVal);
601
+ }
602
+ for (const fn of displayFieldNames) {
603
+ if (fn !== nameField?.Name) {
604
+ const val = record.Get(fn);
605
+ if (val != null) {
606
+ const str = String(val);
607
+ meta[fn] = str.length > 200 ? str.substring(0, 197) + '...' : str;
608
+ }
609
+ }
610
+ }
611
+ metadataMap.set(id, JSON.stringify(meta));
612
+ }
613
+ return metadataMap;
614
+ }
615
+ FilterSelfMatches(duplicates, sourceKey) {
616
+ return duplicates.filter((d) => d.ToString() !== sourceKey.ToString());
617
+ }
618
+ // ─────────────────────────────────────────────
619
+ // Run Detail & Match Persistence (Batched)
620
+ // ─────────────────────────────────────────────
621
+ /**
622
+ * Create DuplicateRunDetail records for a batch of record IDs.
623
+ */
624
+ async CreateRunDetailRecords(recordIDs, duplicateRunID, entityInfo, metadataMap) {
625
+ const results = [];
626
+ const pkFieldName = entityInfo.FirstPrimaryKey.Name;
627
+ for (const batch of chunkArray(recordIDs, SAVE_BATCH_SIZE)) {
628
+ const batchResults = await Promise.all(batch.map(async (recordID) => {
629
+ const runDetail = await this.Metadata.GetEntityObject('MJ: Duplicate Run Details', this.CurrentUser);
630
+ runDetail.NewRecord();
631
+ runDetail.DuplicateRunID = duplicateRunID;
632
+ // Store RecordID in standard MJ URL segment format (e.g., "ID|uuid")
633
+ runDetail.RecordID = `${pkFieldName}|${recordID}`;
634
+ runDetail.MatchStatus = 'Pending';
635
+ runDetail.MergeStatus = 'Pending';
636
+ runDetail.RecordMetadata = metadataMap?.get(recordID) ?? null;
637
+ const success = await this.SaveEntity(runDetail);
638
+ if (!success) {
639
+ LogError("Failed to save MJDuplicateRunDetailEntity", undefined, runDetail.LatestResult);
640
+ return null;
641
+ }
642
+ return runDetail;
643
+ }));
644
+ for (const r of batchResults) {
645
+ if (r)
646
+ results.push(r);
647
+ }
648
+ }
649
+ return results;
650
+ }
651
+ /**
652
+ * Persist match results and update run detail records.
653
+ */
654
+ async PersistMatchResults(queryResults, duplicateRunDetails, entityDocument, options, startTime) {
655
+ const results = [];
656
+ let matchesFound = 0;
657
+ for (const qr of queryResults) {
658
+ // Filter out inverse duplicates: if A→B was already persisted, skip B→A
659
+ const sourceId = qr.SourceKey.Values();
660
+ qr.Duplicates.Duplicates = qr.Duplicates.Duplicates.filter(dupe => {
661
+ const matchId = dupe.Values();
662
+ const pairKey = sourceId < matchId ? `${sourceId}::${matchId}` : `${matchId}::${sourceId}`;
663
+ if (this._seenPairs.has(pairKey)) {
664
+ return false; // Inverse already recorded
665
+ }
666
+ this._seenPairs.add(pairKey);
667
+ return true;
668
+ });
669
+ results.push(qr.Duplicates);
670
+ matchesFound += qr.Duplicates.Duplicates.length;
671
+ const sourceKey = qr.SourceKey;
672
+ const detail = duplicateRunDetails.find((d) => {
673
+ const detailKey = new CompositeKey();
674
+ detailKey.LoadFromConcatenatedString(d.RecordID);
675
+ return detailKey.Equals(sourceKey);
676
+ });
677
+ if (detail) {
678
+ const matchRecords = await this.CreateMatchRecordsForDetail(detail.ID, qr.Duplicates);
679
+ qr.Duplicates.DuplicateRunDetailMatchRecordIDs = matchRecords.map((m) => m.ID);
680
+ detail.MatchStatus = 'Complete';
681
+ const success = await this.SaveEntity(detail);
682
+ if (!success) {
683
+ LogError(`Failed to update Duplicate Run Detail record ${detail.ID}`);
684
+ }
685
+ }
686
+ else {
687
+ LogError(`No Duplicate Run Detail found for ${qr.SourceKey.ToString()}`);
688
+ }
689
+ this.reportProgress(options, 'Matching', queryResults.length, results.length, matchesFound, startTime);
690
+ }
691
+ return results;
692
+ }
693
+ /**
694
+ * Create match records for a single run detail, saving in parallel batches.
695
+ */
696
+ async CreateMatchRecordsForDetail(duplicateRunDetailID, duplicateResult) {
697
+ const matchRecords = [];
698
+ for (const batch of chunkArray(duplicateResult.Duplicates, SAVE_BATCH_SIZE)) {
699
+ const batchResults = await Promise.all(batch.map(async (dupe) => {
700
+ const match = await this.Metadata.GetEntityObject('MJ: Duplicate Run Detail Matches', this.CurrentUser);
701
+ match.NewRecord();
702
+ match.DuplicateRunDetailID = duplicateRunDetailID;
703
+ match.MatchRecordID = dupe.ToURLSegment();
704
+ match.MatchProbability = dupe.ProbabilityScore;
705
+ match.MatchedAt = new Date();
706
+ match.Action = '';
707
+ match.ApprovalStatus = 'Pending';
708
+ match.MergeStatus = 'Pending';
709
+ match.RecordMetadata = dupe.VectorMetadata ? JSON.stringify(dupe.VectorMetadata) : null;
710
+ const success = await this.SaveEntity(match);
711
+ return success ? match : null;
712
+ }));
713
+ for (const m of batchResults) {
714
+ if (m)
715
+ matchRecords.push(m);
272
716
  }
273
717
  }
274
718
  return matchRecords;
275
719
  }
276
- async mergeRecords(dupeResponse, entityDocument) {
277
- const md = new Metadata();
278
- for (const dupeResult of dupeResponse.PotentialDuplicateResult) {
720
+ // ─────────────────────────────────────────────
721
+ // Auto-Merge
722
+ // ─────────────────────────────────────────────
723
+ /**
724
+ * Automatically merge records that meet the absolute match threshold.
725
+ */
726
+ async ProcessAutoMerges(response, entityDocument, options = {}) {
727
+ const absoluteThreshold = options.AbsoluteMatchThreshold ?? entityDocument.AbsoluteMatchThreshold;
728
+ for (const dupeResult of response.PotentialDuplicateResult) {
279
729
  for (const [index, dupe] of dupeResult.Duplicates.entries()) {
280
- if (dupe.ToString() === dupeResult.RecordCompositeKey.ToString()) {
281
- //same record, skip
730
+ if (dupe.ProbabilityScore < absoluteThreshold) {
282
731
  continue;
283
732
  }
284
- if (dupe.ProbabilityScore >= entityDocument.AbsoluteMatchThreshold) {
285
- //merge
286
- let mergeParams = new RecordMergeRequest();
287
- mergeParams.EntityName = entityDocument.Entity;
288
- mergeParams.SurvivingRecordCompositeKey = dupeResult.RecordCompositeKey;
289
- mergeParams.RecordsToMerge = [dupe];
290
- let result = await md.MergeRecords(mergeParams, super.CurrentUser);
291
- if (result.Success) {
292
- let dupeRunMatchRecord = await md.GetEntityObject('MJ: Duplicate Run Detail Matches', super.CurrentUser);
293
- let loadResult = await dupeRunMatchRecord.Load(dupeResult.DuplicateRunDetailMatchRecordIDs[index]);
294
- if (!loadResult) {
295
- LogError(`Failed to load Duplicate Run Match record ${dupeResult.DuplicateRunDetailMatchRecordIDs[index]}`);
296
- continue;
297
- }
298
- dupeRunMatchRecord.MergeStatus = 'Complete';
299
- dupeRunMatchRecord.Action = 'Merged';
300
- dupeRunMatchRecord.MergedAt = new Date();
301
- let saveResult = await dupeRunMatchRecord.Save();
302
- if (!saveResult) {
303
- LogError(`Failed to update Duplicate Run Match record ${dupeRunMatchRecord.ID}`);
304
- }
305
- }
306
- else {
307
- LogError(`Failed to merge records ${dupeResult.RecordCompositeKey.ToString()} and ${dupe.ToString()}`);
308
- }
733
+ const mergeParams = new RecordMergeRequest();
734
+ mergeParams.EntityName = entityDocument.Entity;
735
+ mergeParams.SurvivingRecordCompositeKey = dupeResult.RecordCompositeKey;
736
+ mergeParams.RecordsToMerge = [dupe];
737
+ const mergeResult = await this.Metadata.MergeRecords(mergeParams, this.CurrentUser);
738
+ if (mergeResult.Success) {
739
+ await this.updateMatchRecordAfterMerge(dupeResult.DuplicateRunDetailMatchRecordIDs[index]);
740
+ }
741
+ else {
742
+ LogError(`Failed to merge ${dupeResult.RecordCompositeKey.ToString()} and ${dupe.ToString()}`);
309
743
  }
310
744
  }
311
745
  }
312
746
  }
313
- async getVectorDuplicates(queryResponse) {
314
- let response = new PotentialDuplicateResult();
315
- for (const match of queryResponse.data.matches) {
316
- const record = match;
317
- if (!record || !record.id) {
318
- continue;
319
- }
320
- if (!record.metadata || !record.metadata.RecordID) {
321
- LogError(`Invalid vector metadata: ${record.id}`);
322
- continue;
323
- }
324
- let duplicate = new PotentialDuplicate();
325
- duplicate.LoadFromConcatenatedString(record.metadata.RecordID);
326
- duplicate.ProbabilityScore = record.score;
327
- response.Duplicates.push(duplicate);
747
+ /**
748
+ * Update a match record's status after a successful merge.
749
+ */
750
+ async updateMatchRecordAfterMerge(matchRecordID) {
751
+ const matchRecord = await this.Metadata.GetEntityObject('MJ: Duplicate Run Detail Matches', this.CurrentUser);
752
+ const loaded = await matchRecord.Load(matchRecordID);
753
+ if (!loaded) {
754
+ LogError(`Failed to load match record ${matchRecordID} for merge status update`);
755
+ return;
756
+ }
757
+ matchRecord.MergeStatus = 'Complete';
758
+ matchRecord.Action = 'Merged';
759
+ matchRecord.MergedAt = new Date();
760
+ const saved = await matchRecord.Save();
761
+ if (!saved) {
762
+ LogError(`Failed to update match record ${matchRecordID} after merge`);
763
+ }
764
+ }
765
+ // ─────────────────────────────────────────────
766
+ // Progress Reporting
767
+ // ─────────────────────────────────────────────
768
+ reportProgress(options, phase, totalRecords, processedRecords, matchesFound, startTime, currentRecordID) {
769
+ if (options.OnProgress) {
770
+ options.OnProgress({
771
+ Phase: phase,
772
+ TotalRecords: totalRecords,
773
+ ProcessedRecords: processedRecords,
774
+ MatchesFound: matchesFound,
775
+ CurrentRecordID: currentRecordID,
776
+ ElapsedMs: Date.now() - startTime,
777
+ });
778
+ }
779
+ }
780
+ }
781
// ─────────────────────────────────────────────
// Utility Functions
// ─────────────────────────────────────────────
/**
 * Split an array into consecutive chunks of at most `chunkSize` elements.
 * The final chunk may be shorter; an empty input yields an empty array.
 *
 * @throws {RangeError} if chunkSize is not positive — previously the loop
 * index never advanced for chunkSize <= 0, spinning forever.
 */
function chunkArray(array, chunkSize) {
    if (!(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be positive, got ${chunkSize}`);
    }
    const chunks = [];
    for (let i = 0; i < array.length; i += chunkSize) {
        chunks.push(array.slice(i, i + chunkSize));
    }
    return chunks;
}
794
/**
 * Run async task factories with a concurrency limit.
 *
 * Up to `limit` workers pull tasks in submission order; each result is stored
 * at the index of its originating task, so output order matches input order.
 * The worker count is clamped to at least 1 — previously a limit of 0 (or a
 * negative value) spawned zero workers and silently returned an empty array,
 * dropping every task.
 */
async function RunWithConcurrency(tasks, limit) {
    const results = [];
    let next = 0;
    // Each worker repeatedly claims the next unclaimed index and awaits its task.
    const worker = async () => {
        while (next < tasks.length) {
            const current = next++;
            results[current] = await tasks[current]();
        }
    };
    const workerCount = Math.min(Math.max(1, limit), tasks.length);
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
}
332
811
  //# sourceMappingURL=duplicateRecordDetector.js.map