@memberjunction/ai-vector-dupe 5.21.0 → 5.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +254 -230
- package/dist/duplicateRecordDetector.d.ts +116 -18
- package/dist/duplicateRecordDetector.d.ts.map +1 -1
- package/dist/duplicateRecordDetector.js +465 -262
- package/dist/duplicateRecordDetector.js.map +1 -1
- package/dist/index.d.ts +2 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -3
- package/dist/index.js.map +1 -1
- package/dist/scoring/ReciprocalRankFusion.d.ts +45 -0
- package/dist/scoring/ReciprocalRankFusion.d.ts.map +1 -0
- package/dist/scoring/ReciprocalRankFusion.js +63 -0
- package/dist/scoring/ReciprocalRankFusion.js.map +1 -0
- package/package.json +10 -10
- package/dist/config.d.ts +0 -13
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js +0 -15
- package/dist/config.js.map +0 -1
- package/dist/generic/vectorSyncBase.d.ts +0 -20
- package/dist/generic/vectorSyncBase.d.ts.map +0 -1
- package/dist/generic/vectorSyncBase.js +0 -42
- package/dist/generic/vectorSyncBase.js.map +0 -1
- package/dist/models/entitySyncConfig.d.ts +0 -36
- package/dist/models/entitySyncConfig.d.ts.map +0 -1
- package/dist/models/entitySyncConfig.js +0 -2
- package/dist/models/entitySyncConfig.js.map +0 -1
|
@@ -1,22 +1,120 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Modernized duplicate record detection engine.
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates the full pipeline: vectorize records, query for similar candidates,
|
|
5
|
+
* optionally apply hybrid search (RRF) and reranking, persist match results,
|
|
6
|
+
* and auto-merge high-confidence duplicates.
|
|
7
|
+
*
|
|
8
|
+
* Supports both list-based batch detection and single-record checks.
|
|
9
|
+
*
|
|
10
|
+
* @module @memberjunction/ai-vector-dupe
|
|
11
|
+
*/
|
|
12
|
+
import { PotentialDuplicateRequest, PotentialDuplicateResponse, CompositeKey, UserInfo, BaseEntity, PotentialDuplicateResult, PotentialDuplicate, DuplicateDetectionOptions } from "@memberjunction/core";
|
|
13
|
+
import { BaseResponse } from "@memberjunction/ai-vectordb";
|
|
14
|
+
import { MJDuplicateRunDetailEntity, MJDuplicateRunDetailMatchEntity, MJDuplicateRunEntity, MJEntityDocumentEntity, MJListEntity } from "@memberjunction/core-entities";
|
|
4
15
|
import { VectorBase } from "@memberjunction/ai-vectors";
|
|
16
|
+
import { EntityDocumentTemplateParser } from "@memberjunction/ai-vector-sync";
|
|
17
|
+
import type { MJTemplateEntityExtended } from "@memberjunction/core-entities";
|
|
18
|
+
/**
|
|
19
|
+
* Internal result from querying duplicates for a single source record.
|
|
20
|
+
*/
|
|
21
|
+
interface RecordQueryResult {
|
|
22
|
+
SourceKey: CompositeKey;
|
|
23
|
+
TemplateText: string;
|
|
24
|
+
Duplicates: PotentialDuplicateResult;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Modernized duplicate record detection engine.
|
|
28
|
+
*
|
|
29
|
+
* Supports:
|
|
30
|
+
* - List-based batch detection (GetDuplicateRecords)
|
|
31
|
+
* - Single-record duplicate check (CheckSingleRecord)
|
|
32
|
+
* - Hybrid search via RRF when vector DB supports it
|
|
33
|
+
* - Optional post-retrieval reranking via MJ's BaseReranker
|
|
34
|
+
* - Configurable topK, thresholds, and progress reporting
|
|
35
|
+
*/
|
|
5
36
|
export declare class DuplicateRecordDetector extends VectorBase {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
37
|
+
private vectorDB;
|
|
38
|
+
private embedding;
|
|
39
|
+
/**
|
|
40
|
+
* Run duplicate detection for all records in a list.
|
|
41
|
+
*
|
|
42
|
+
* Flow: validate → vectorize → embed → query → (optional rerank) → persist → (optional merge)
|
|
43
|
+
*/
|
|
44
|
+
GetDuplicateRecords(params: PotentialDuplicateRequest, contextUser?: UserInfo): Promise<PotentialDuplicateResponse>;
|
|
45
|
+
/**
|
|
46
|
+
* Check a single record for duplicates without requiring a list.
|
|
47
|
+
* Embeds the record and queries for matches directly.
|
|
48
|
+
*/
|
|
49
|
+
CheckSingleRecord(EntityDocumentID: string, RecordID: CompositeKey, Options?: DuplicateDetectionOptions, ContextUser?: UserInfo): Promise<PotentialDuplicateResult>;
|
|
50
|
+
/**
|
|
51
|
+
* Validate and return an entity document, or null if not found.
|
|
52
|
+
*/
|
|
53
|
+
protected ValidateEntityDocument(entityDocumentID: string): Promise<MJEntityDocumentEntity | null>;
|
|
54
|
+
/**
|
|
55
|
+
* Initialize embedding and vector DB providers via ClassFactory.
|
|
56
|
+
*/
|
|
57
|
+
protected InitializeProviders(entityDocument: MJEntityDocumentEntity): void;
|
|
58
|
+
/**
|
|
59
|
+
* Run vectorization for the entity document's records.
|
|
60
|
+
*/
|
|
61
|
+
protected VectorizeSourceRecords(entityDocument: MJEntityDocumentEntity, contextUser: UserInfo): Promise<void>;
|
|
62
|
+
/**
|
|
63
|
+
* Generate human-readable template text for each record using the entity document template.
|
|
64
|
+
*
|
|
65
|
+
* Loads the template from TemplateEngineServer and renders it via Nunjucks,
|
|
66
|
+
* matching the same approach used by the vectorization pipeline.
|
|
67
|
+
*/
|
|
68
|
+
protected GenerateTemplateTexts(templateParser: ReturnType<typeof EntityDocumentTemplateParser.CreateInstance>, entityDocument: MJEntityDocumentEntity, records: BaseEntity[], contextUser: UserInfo): Promise<string[]>;
|
|
69
|
+
/**
|
|
70
|
+
* Load the template entity from TemplateEngineServer for the given entity document.
|
|
71
|
+
*/
|
|
72
|
+
protected loadTemplate(entityDocument: MJEntityDocumentEntity): MJTemplateEntityExtended;
|
|
73
|
+
/**
|
|
74
|
+
* Query the vector DB for duplicates of each record, with concurrency control.
|
|
75
|
+
* Supports hybrid search and RRF fusion when the vector DB supports it.
|
|
76
|
+
*/
|
|
77
|
+
protected QueryDuplicatesForRecords(records: BaseEntity[], vectors: number[][], templateTexts: string[], entityDocument: MJEntityDocumentEntity, topK: number, options: DuplicateDetectionOptions): Promise<RecordQueryResult[]>;
|
|
78
|
+
/**
|
|
79
|
+
* Execute a vector query — uses hybrid search with RRF when the provider supports it.
|
|
80
|
+
*/
|
|
81
|
+
protected executeVectorQuery(vector: number[], templateText: string, topK: number, options: DuplicateDetectionOptions): Promise<BaseResponse>;
|
|
82
|
+
/**
|
|
83
|
+
* Parse raw vector DB matches into a PotentialDuplicateResult.
|
|
84
|
+
*/
|
|
85
|
+
ParseVectorMatches(queryResponse: BaseResponse, sourceKey?: CompositeKey): PotentialDuplicateResult;
|
|
86
|
+
/**
|
|
87
|
+
* Filter out self-matches where the candidate is the same record as the source.
|
|
88
|
+
*/
|
|
89
|
+
protected FilterSelfMatches(duplicates: PotentialDuplicate[], sourceKey: CompositeKey): PotentialDuplicate[];
|
|
90
|
+
/**
|
|
91
|
+
* Load records from an entity that are members of the specified list.
|
|
92
|
+
*/
|
|
93
|
+
protected LoadRecordsByListID(listID: string, entityID: string): Promise<BaseEntity[]>;
|
|
94
|
+
protected LoadListEntity(listID: string): Promise<MJListEntity>;
|
|
95
|
+
protected LoadDuplicateRun(duplicateRunID: string): Promise<MJDuplicateRunEntity>;
|
|
96
|
+
protected LoadDuplicateRunByListID(listID: string): Promise<MJDuplicateRunEntity>;
|
|
97
|
+
/**
|
|
98
|
+
* Create DuplicateRunDetail records for each item in the list, saving in parallel batches.
|
|
99
|
+
*/
|
|
100
|
+
protected CreateRunDetailRecordsFromList(listID: string, duplicateRunID: string): Promise<MJDuplicateRunDetailEntity[]>;
|
|
101
|
+
/**
|
|
102
|
+
* Persist match results and update run detail records.
|
|
103
|
+
*/
|
|
104
|
+
protected PersistMatchResults(queryResults: RecordQueryResult[], duplicateRunDetails: MJDuplicateRunDetailEntity[], entityDocument: MJEntityDocumentEntity, options: DuplicateDetectionOptions, startTime: number): Promise<PotentialDuplicateResult[]>;
|
|
105
|
+
/**
|
|
106
|
+
* Create match records for a single run detail, saving in parallel batches.
|
|
107
|
+
*/
|
|
108
|
+
protected CreateMatchRecordsForDetail(duplicateRunDetailID: string, duplicateResult: PotentialDuplicateResult): Promise<MJDuplicateRunDetailMatchEntity[]>;
|
|
109
|
+
/**
|
|
110
|
+
* Automatically merge records that meet the absolute match threshold.
|
|
111
|
+
*/
|
|
112
|
+
protected ProcessAutoMerges(response: PotentialDuplicateResponse, entityDocument: MJEntityDocumentEntity): Promise<void>;
|
|
113
|
+
/**
|
|
114
|
+
* Update a match record's status after a successful merge.
|
|
115
|
+
*/
|
|
116
|
+
private updateMatchRecordAfterMerge;
|
|
117
|
+
private reportProgress;
|
|
21
118
|
}
|
|
119
|
+
export {};
|
|
22
120
|
//# sourceMappingURL=duplicateRecordDetector.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"duplicateRecordDetector.d.ts","sourceRoot":"","sources":["../src/duplicateRecordDetector.ts"],"names":[],"mappings":"AAAA,OAAO,
|
|
1
|
+
{"version":3,"file":"duplicateRecordDetector.d.ts","sourceRoot":"","sources":["../src/duplicateRecordDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAGH,OAAO,EACH,yBAAyB,EACzB,0BAA0B,EAC1B,YAAY,EACZ,QAAQ,EACR,UAAU,EACV,wBAAwB,EAKxB,kBAAkB,EAClB,yBAAyB,EAE5B,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,YAAY,EAAgB,MAAM,6BAA6B,CAAC;AAEzE,OAAO,EACH,0BAA0B,EAC1B,+BAA+B,EAC/B,oBAAoB,EACpB,sBAAsB,EAEtB,YAAY,EACf,MAAM,+BAA+B,CAAC;AACvC,OAAO,EAAE,UAAU,EAAE,MAAM,4BAA4B,CAAC;AACxD,OAAO,EAAE,4BAA4B,EAA6C,MAAM,gCAAgC,CAAC;AAEzH,OAAO,KAAK,EAAE,wBAAwB,EAA2B,MAAM,+BAA+B,CAAC;AA8BvG;;GAEG;AACH,UAAU,iBAAiB;IACvB,SAAS,EAAE,YAAY,CAAC;IACxB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,wBAAwB,CAAC;CACxC;AAED;;;;;;;;;GASG;AACH,qBAAa,uBAAwB,SAAQ,UAAU;IACnD,OAAO,CAAC,QAAQ,CAAe;IAC/B,OAAO,CAAC,SAAS,CAAiB;IAElC;;;;OAIG;IACU,mBAAmB,CAAC,MAAM,EAAE,yBAAyB,EAAE,WAAW,CAAC,EAAE,QAAQ,GAAG,OAAO,CAAC,0BAA0B,CAAC;IA+EhI;;;OAGG;IACU,iBAAiB,CAC1B,gBAAgB,EAAE,MAAM,EACxB,QAAQ,EAAE,YAAY,EACtB,OAAO,CAAC,EAAE,yBAAyB,EACnC,WAAW,CAAC,EAAE,QAAQ,GACvB,OAAO,CAAC,wBAAwB,CAAC;IA4CpC;;OAEG;cACa,sBAAsB,CAAC,gBAAgB,EAAE,MAAM,GAAG,OAAO,CAAC,sBAAsB,GAAG,IAAI,CAAC;IAMxG;;OAEG;IACH,SAAS,CAAC,mBAAmB,CAAC,cAAc,EAAE,sBAAsB,GAAG,IAAI;IA+B3E;;OAEG;cACa,sBAAsB,CAAC,cAAc,EAAE,sBAAsB,EAAE,WAAW,EAAE,QAAQ,GAAG,OAAO,CAAC,IAAI,CAAC;IAoBpH;;;;;OAKG;cACa,qBAAqB,CACjC,cAAc,EAAE,UAAU,CAAC,OAAO,4BAA4B,CAAC,cAAc,CAAC,EAC9E,cAAc,EAAE,sBAAsB,EACtC,OAAO,EAAE,UAAU,EAAE,EACrB,WAAW,EAAE,QAAQ,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;IA0BpB;;OAEG;IACH,SAAS,CAAC,YAAY,CAAC,cAAc,EAAE,sBAAsB,GAAG,wBAAwB;IAkBxF;;;OAGG;cACa,yBAAyB,CACrC,OAAO,EAAE,UAAU,EAAE,EACrB,OAAO,EAAE,MAAM,EAAE,EAAE,EACnB,aAAa,EAAE,MAAM,EAAE,EACvB,cAAc,EAAE,sBAAsB,EACtC,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,yBAAyB,GACnC,OAAO,CAAC,iBAAiB,EAAE,CAAC;IA8B/B;;OAEG;cACa,kBAAkB,CAC9B,MAAM,EAAE,MAAM,EAAE,EAChB,YAAY,EAAE,MAAM,EACpB,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,yBAAyB,GACnC,OAAO,CAAC,YAAY,CAAC;IAqBxB;;OAEG;IACI,kBAAkB,CAAC,aAAa,EAAE,YAAY,EAAE,SAAS,CAAC,EAAE,YAAY,GAAG,wBAAwB;IAuB1G;;OAEG;IACH,SAAS,CAAC,iBAAiB,CAAC,UAAU,EAA
E,kBAAkB,EAAE,EAAE,SAAS,EAAE,YAAY,GAAG,kBAAkB,EAAE;IAQ5G;;OAEG;cACa,mBAAmB,CAAC,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC;cAmB5E,cAAc,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC;cAUrD,gBAAgB,CAAC,cAAc,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC;cAUvE,wBAAwB,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC;IAcvF;;OAEG;cACa,8BAA8B,CAAC,MAAM,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,GAAG,OAAO,CAAC,0BAA0B,EAAE,CAAC;IAuC7H;;OAEG;cACa,mBAAmB,CAC/B,YAAY,EAAE,iBAAiB,EAAE,EACjC,mBAAmB,EAAE,0BAA0B,EAAE,EACjD,cAAc,EAAE,sBAAsB,EACtC,OAAO,EAAE,yBAAyB,EAClC,SAAS,EAAE,MAAM,GAClB,OAAO,CAAC,wBAAwB,EAAE,CAAC;IA8BtC;;OAEG;cACa,2BAA2B,CACvC,oBAAoB,EAAE,MAAM,EAC5B,eAAe,EAAE,wBAAwB,GAC1C,OAAO,CAAC,+BAA+B,EAAE,CAAC;IAiC7C;;OAEG;cACa,iBAAiB,CAC7B,QAAQ,EAAE,0BAA0B,EACpC,cAAc,EAAE,sBAAsB,GACvC,OAAO,CAAC,IAAI,CAAC;IAsBhB;;OAEG;YACW,2BAA2B;IAwBzC,OAAO,CAAC,cAAc;CAoBzB"}
|