@memberjunction/ai-vector-dupe 4.0.0 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +212 -157
- package/package.json +10 -10
package/README.md
CHANGED
|
@@ -1,14 +1,42 @@
|
|
|
1
1
|
# @memberjunction/ai-vector-dupe
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
##
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
-
|
|
10
|
-
|
|
11
|
-
|
|
3
|
+
AI-powered duplicate record detection for MemberJunction entities. This package uses vector embeddings and similarity search to find potential duplicate records, track detection runs, and optionally auto-merge high-confidence matches.
|
|
4
|
+
|
|
5
|
+
## Architecture
|
|
6
|
+
|
|
7
|
+
```mermaid
|
|
8
|
+
graph TD
|
|
9
|
+
subgraph DupePkg["@memberjunction/ai-vector-dupe"]
|
|
10
|
+
DRD["DuplicateRecordDetector"]
|
|
11
|
+
VSB["VectorSyncBase"]
|
|
12
|
+
ESC["EntitySyncConfig"]
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
subgraph Pipeline["Detection Pipeline"]
|
|
16
|
+
LIST["Load Records<br/>from List"] --> VECT["Vectorize Records<br/>via Templates"]
|
|
17
|
+
VECT --> EMBED["Generate<br/>Embeddings"]
|
|
18
|
+
EMBED --> QUERY["Query Vector DB<br/>for Matches"]
|
|
19
|
+
QUERY --> FILTER["Filter by<br/>Threshold"]
|
|
20
|
+
FILTER --> TRACK["Track Results<br/>in Duplicate Runs"]
|
|
21
|
+
TRACK --> MERGE["Auto-Merge<br/>Above Threshold"]
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
subgraph Dependencies["Key Dependencies"]
|
|
25
|
+
VB["ai-vectors<br/>(VectorBase)"]
|
|
26
|
+
SYNC["ai-vector-sync<br/>(EntityVectorSyncer)"]
|
|
27
|
+
VDBB["ai-vectordb<br/>(VectorDBBase)"]
|
|
28
|
+
AI["ai<br/>(BaseEmbeddings)"]
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
DRD -->|extends| VB
|
|
32
|
+
DRD --> SYNC
|
|
33
|
+
DRD --> VDBB
|
|
34
|
+
DRD --> AI
|
|
35
|
+
|
|
36
|
+
style DupePkg fill:#2d6a9f,stroke:#1a4971,color:#fff
|
|
37
|
+
style Pipeline fill:#2d8659,stroke:#1a5c3a,color:#fff
|
|
38
|
+
style Dependencies fill:#7c5295,stroke:#563a6b,color:#fff
|
|
39
|
+
```
|
|
12
40
|
|
|
13
41
|
## Installation
|
|
14
42
|
|
|
@@ -16,49 +44,99 @@ The AI Vector Dupe package provides sophisticated duplicate detection capabiliti
|
|
|
16
44
|
npm install @memberjunction/ai-vector-dupe
|
|
17
45
|
```
|
|
18
46
|
|
|
19
|
-
##
|
|
47
|
+
## Overview
|
|
20
48
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
49
|
+
The package provides the `DuplicateRecordDetector` class, which orchestrates a complete duplicate detection workflow:
|
|
50
|
+
|
|
51
|
+
1. Loads records from a MemberJunction List
|
|
52
|
+
2. Vectorizes them using a configured Entity Document template and embedding model
|
|
53
|
+
3. Queries the vector database for similarity matches
|
|
54
|
+
4. Filters results against configurable thresholds
|
|
55
|
+
5. Creates Duplicate Run, Duplicate Run Detail, and Duplicate Run Detail Match records for tracking
|
|
56
|
+
6. Optionally auto-merges records that exceed the absolute match threshold
|
|
57
|
+
|
|
58
|
+
## Duplicate Detection Flow
|
|
59
|
+
|
|
60
|
+
```mermaid
|
|
61
|
+
sequenceDiagram
|
|
62
|
+
participant Caller
|
|
63
|
+
participant DRD as DuplicateRecordDetector
|
|
64
|
+
participant EVS as EntityVectorSyncer
|
|
65
|
+
participant Embed as Embedding Model
|
|
66
|
+
participant VDB as Vector Database
|
|
67
|
+
participant DB as MJ Database
|
|
68
|
+
|
|
69
|
+
Caller->>DRD: getDuplicateRecords(request, user)
|
|
70
|
+
DRD->>DB: Load Entity Document
|
|
71
|
+
DRD->>EVS: VectorizeEntity (ensure all records are indexed)
|
|
72
|
+
DRD->>DB: Load records from List
|
|
73
|
+
|
|
74
|
+
loop For each record
|
|
75
|
+
DRD->>Embed: Generate embedding from template
|
|
76
|
+
DRD->>VDB: queryIndex (topK=5)
|
|
77
|
+
VDB-->>DRD: Scored matches
|
|
78
|
+
DRD->>DRD: Filter by PotentialMatchThreshold
|
|
79
|
+
DRD->>DB: Create DuplicateRunDetailMatch records
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
DRD->>DRD: Check AbsoluteMatchThreshold
|
|
83
|
+
DRD->>DB: Auto-merge high-confidence duplicates
|
|
84
|
+
DRD-->>Caller: PotentialDuplicateResponse
|
|
85
|
+
```
|
|
25
86
|
|
|
26
87
|
## Core Components
|
|
27
88
|
|
|
28
89
|
### DuplicateRecordDetector
|
|
29
90
|
|
|
30
|
-
The main class that
|
|
91
|
+
The main class of the package. It extends `VectorBase` from `@memberjunction/ai-vectors` and orchestrates the duplicate detection workflow.
|
|
31
92
|
|
|
32
|
-
|
|
33
|
-
import { DuplicateRecordDetector } from '@memberjunction/ai-vector-dupe';
|
|
34
|
-
import { PotentialDuplicateRequest, UserInfo } from '@memberjunction/core';
|
|
93
|
+
**Key method:**
|
|
35
94
|
|
|
36
|
-
|
|
95
|
+
```typescript
|
|
96
|
+
getDuplicateRecords(
|
|
97
|
+
params: PotentialDuplicateRequest,
|
|
98
|
+
contextUser?: UserInfo
|
|
99
|
+
): Promise<PotentialDuplicateResponse>
|
|
37
100
|
```
|
|
38
101
|
|
|
102
|
+
**Parameters in `PotentialDuplicateRequest`:**
|
|
103
|
+
|
|
104
|
+
| Field | Type | Description |
|
|
105
|
+
|---|---|---|
|
|
106
|
+
| `ListID` | `string` | ID of the List containing records to check |
|
|
107
|
+
| `EntityID` | `string` | ID of the entity type |
|
|
108
|
+
| `EntityDocumentID` | `string` | ID of the Entity Document with vectorization template |
|
|
109
|
+
| `Options.DuplicateRunID` | `string` (optional) | ID of an existing Duplicate Run to resume instead of starting a new one |
|
|
110
|
+
|
|
111
|
+
**Thresholds (configured on Entity Document):**
|
|
112
|
+
|
|
113
|
+
| Threshold | Purpose |
|
|
114
|
+
|---|---|
|
|
115
|
+
| `PotentialMatchThreshold` | Minimum similarity score for a match to be reported as a potential duplicate |
|
|
116
|
+
| `AbsoluteMatchThreshold` | Minimum similarity score for a match to be merged automatically |
|
|
117
|
+
|
|
39
118
|
### VectorSyncBase
|
|
40
119
|
|
|
41
|
-
|
|
120
|
+
A utility base class providing helper methods for vector synchronization operations:
|
|
42
121
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
122
|
+
- `parseStringTemplate(str, obj)` -- simple template variable substitution
|
|
123
|
+
- `timer(ms)` -- async delay
|
|
124
|
+
- `start()` / `end()` / `timeDiff()` -- execution timing
|
|
125
|
+
- `saveJSONData(data, path)` -- JSON file output
|
|
46
126
|
|
|
47
127
|
### EntitySyncConfig
|
|
48
128
|
|
|
49
|
-
|
|
129
|
+
Configuration type for entity synchronization scheduling:
|
|
50
130
|
|
|
51
131
|
```typescript
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
VectorIndexID: 1,
|
|
61
|
-
VectorID: 1
|
|
132
|
+
type EntitySyncConfig = {
|
|
133
|
+
EntityDocumentID: string; // Entity Document to use
|
|
134
|
+
Interval: number; // Sync interval in seconds
|
|
135
|
+
RunViewParams: RunViewParams; // View parameters for fetching
|
|
136
|
+
IncludeInSync: boolean; // Whether to include in sync
|
|
137
|
+
LastRunDate: string; // Last sync timestamp
|
|
138
|
+
VectorIndexID: number; // Vector index ID
|
|
139
|
+
VectorID: number; // Vector database ID
|
|
62
140
|
};
|
|
63
141
|
```
|
|
64
142
|
|
|
@@ -70,106 +148,109 @@ const config: EntitySyncConfig = {
|
|
|
70
148
|
import { DuplicateRecordDetector } from '@memberjunction/ai-vector-dupe';
|
|
71
149
|
import { PotentialDuplicateRequest, UserInfo } from '@memberjunction/core';
|
|
72
150
|
|
|
73
|
-
// Initialize the detector
|
|
74
151
|
const detector = new DuplicateRecordDetector();
|
|
75
152
|
|
|
76
|
-
// Define the request parameters
|
|
77
153
|
const request: PotentialDuplicateRequest = {
|
|
78
|
-
ListID: '
|
|
79
|
-
EntityID: '
|
|
80
|
-
EntityDocumentID: 'doc-
|
|
81
|
-
Options: {
|
|
82
|
-
DuplicateRunID: 'run-id' // Optional: existing duplicate run to continue
|
|
83
|
-
}
|
|
154
|
+
ListID: 'list-uuid',
|
|
155
|
+
EntityID: 'entity-uuid',
|
|
156
|
+
EntityDocumentID: 'doc-uuid'
|
|
84
157
|
};
|
|
85
158
|
|
|
86
|
-
// Execute duplicate detection
|
|
87
159
|
const response = await detector.getDuplicateRecords(request, currentUser);
|
|
88
160
|
|
|
89
161
|
if (response.Status === 'Success') {
|
|
90
|
-
console.log(`Found ${response.PotentialDuplicateResult.length} records with potential duplicates`);
|
|
91
|
-
|
|
92
162
|
for (const result of response.PotentialDuplicateResult) {
|
|
93
|
-
console.log(`Record ${result.RecordCompositeKey.ToString()}
|
|
94
|
-
for (const
|
|
95
|
-
console.log(`
|
|
163
|
+
console.log(`Record: ${result.RecordCompositeKey.ToString()}`);
|
|
164
|
+
for (const dupe of result.Duplicates) {
|
|
165
|
+
console.log(` Match: ${dupe.ToString()} (${(dupe.ProbabilityScore * 100).toFixed(1)}%)`);
|
|
96
166
|
}
|
|
97
167
|
}
|
|
98
168
|
}
|
|
99
169
|
```
|
|
100
170
|
|
|
101
|
-
###
|
|
171
|
+
### Resuming an Existing Run
|
|
102
172
|
|
|
103
173
|
```typescript
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
```
|
|
113
|
-
|
|
114
|
-
## API Reference
|
|
115
|
-
|
|
116
|
-
### DuplicateRecordDetector
|
|
117
|
-
|
|
118
|
-
#### `getDuplicateRecords(params: PotentialDuplicateRequest, contextUser?: UserInfo): Promise<PotentialDuplicateResponse>`
|
|
119
|
-
|
|
120
|
-
Performs duplicate detection on records in a list.
|
|
121
|
-
|
|
122
|
-
**Parameters:**
|
|
123
|
-
- `params`: Request parameters including:
|
|
124
|
-
- `ListID`: ID of the list containing records to analyze
|
|
125
|
-
- `EntityID`: ID of the entity type
|
|
126
|
-
- `EntityDocumentID`: ID of the entity document configuration
|
|
127
|
-
- `Options`: Optional configuration including `DuplicateRunID`
|
|
128
|
-
- `contextUser`: Optional user context for permissions
|
|
129
|
-
|
|
130
|
-
**Returns:** `PotentialDuplicateResponse` containing:
|
|
131
|
-
- `Status`: 'Success' or 'Error'
|
|
132
|
-
- `ErrorMessage`: Error details if failed
|
|
133
|
-
- `PotentialDuplicateResult[]`: Array of results for each analyzed record
|
|
134
|
-
|
|
135
|
-
### VectorSyncBase
|
|
136
|
-
|
|
137
|
-
Base class providing utility methods:
|
|
138
|
-
|
|
139
|
-
- `parseStringTemplate(str: string, obj: any): string` - Parse template strings
|
|
140
|
-
- `timer(ms: number): Promise<unknown>` - Async delay utility
|
|
141
|
-
- `start()` / `end()` / `timeDiff()` - Timing utilities
|
|
142
|
-
- `saveJSONData(data: any, path: string)` - JSON file operations
|
|
174
|
+
const request: PotentialDuplicateRequest = {
|
|
175
|
+
ListID: 'list-uuid',
|
|
176
|
+
EntityID: 'entity-uuid',
|
|
177
|
+
EntityDocumentID: 'doc-uuid',
|
|
178
|
+
Options: {
|
|
179
|
+
DuplicateRunID: 'existing-run-uuid'
|
|
180
|
+
}
|
|
181
|
+
};
|
|
143
182
|
|
|
144
|
-
|
|
183
|
+
const response = await detector.getDuplicateRecords(request, currentUser);
|
|
184
|
+
```
|
|
145
185
|
|
|
146
|
-
|
|
186
|
+
## Database Entities Used
|
|
187
|
+
|
|
188
|
+
The package reads from and writes to these MemberJunction entities:
|
|
189
|
+
|
|
190
|
+
```mermaid
|
|
191
|
+
erDiagram
|
|
192
|
+
DUPLICATE_RUN {
|
|
193
|
+
string ID PK
|
|
194
|
+
string EntityID
|
|
195
|
+
string StartedByUserID
|
|
196
|
+
datetime StartedAt
|
|
197
|
+
datetime EndedAt
|
|
198
|
+
string ProcessingStatus
|
|
199
|
+
string ApprovalStatus
|
|
200
|
+
string SourceListID
|
|
201
|
+
}
|
|
147
202
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
203
|
+
DUPLICATE_RUN_DETAIL {
|
|
204
|
+
string ID PK
|
|
205
|
+
string DuplicateRunID FK
|
|
206
|
+
string RecordID
|
|
207
|
+
string MatchStatus
|
|
208
|
+
string MergeStatus
|
|
209
|
+
}
|
|
153
210
|
|
|
154
|
-
|
|
211
|
+
DUPLICATE_RUN_DETAIL_MATCH {
|
|
212
|
+
string ID PK
|
|
213
|
+
string DuplicateRunDetailID FK
|
|
214
|
+
string MatchRecordID
|
|
215
|
+
float MatchProbability
|
|
216
|
+
datetime MatchedAt
|
|
217
|
+
string Action
|
|
218
|
+
string ApprovalStatus
|
|
219
|
+
string MergeStatus
|
|
220
|
+
}
|
|
155
221
|
|
|
156
|
-
|
|
222
|
+
LIST {
|
|
223
|
+
string ID PK
|
|
224
|
+
string Name
|
|
225
|
+
string EntityID
|
|
226
|
+
}
|
|
157
227
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
- **Entity Documents**: Configuration for entity vectorization
|
|
228
|
+
LIST_DETAIL {
|
|
229
|
+
string ID PK
|
|
230
|
+
string ListID FK
|
|
231
|
+
string RecordID
|
|
232
|
+
}
|
|
164
233
|
|
|
165
|
-
|
|
234
|
+
ENTITY_DOCUMENT {
|
|
235
|
+
string ID PK
|
|
236
|
+
string EntityID
|
|
237
|
+
string TemplateID
|
|
238
|
+
string AIModelID
|
|
239
|
+
string VectorDatabaseID
|
|
240
|
+
float PotentialMatchThreshold
|
|
241
|
+
float AbsoluteMatchThreshold
|
|
242
|
+
}
|
|
166
243
|
|
|
167
|
-
|
|
244
|
+
DUPLICATE_RUN ||--o{ DUPLICATE_RUN_DETAIL : contains
|
|
245
|
+
DUPLICATE_RUN_DETAIL ||--o{ DUPLICATE_RUN_DETAIL_MATCH : has
|
|
246
|
+
DUPLICATE_RUN }o--|| LIST : "source"
|
|
247
|
+
LIST ||--o{ LIST_DETAIL : contains
|
|
248
|
+
```
|
|
168
249
|
|
|
169
|
-
|
|
250
|
+
## Environment Variables
|
|
170
251
|
|
|
171
252
|
```env
|
|
172
|
-
# AI Model
|
|
253
|
+
# AI Model API Keys
|
|
173
254
|
OPENAI_API_KEY=your-openai-key
|
|
174
255
|
MISTRAL_API_KEY=your-mistral-key
|
|
175
256
|
|
|
@@ -189,63 +270,37 @@ DB_DATABASE=your-database
|
|
|
189
270
|
CURRENT_USER_EMAIL=user@example.com
|
|
190
271
|
```
|
|
191
272
|
|
|
192
|
-
### Entity Document Templates
|
|
193
|
-
|
|
194
|
-
Entity documents use template syntax to define how records are converted to text for vectorization:
|
|
195
|
-
|
|
196
|
-
```javascript
|
|
197
|
-
// Example template
|
|
198
|
-
const template = "${FirstName} ${LastName} works at ${Company} as ${Title}";
|
|
199
|
-
```
|
|
200
|
-
|
|
201
273
|
## Dependencies
|
|
202
274
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
275
|
+
| Package | Purpose |
|
|
276
|
+
|---|---|
|
|
277
|
+
| `@memberjunction/ai` | `BaseEmbeddings`, `GetAIAPIKey` |
|
|
278
|
+
| `@memberjunction/ai-vectordb` | `VectorDBBase`, `BaseResponse` |
|
|
279
|
+
| `@memberjunction/ai-vectors` | `VectorBase` base class |
|
|
280
|
+
| `@memberjunction/ai-vectors-pinecone` | Pinecone implementation |
|
|
281
|
+
| `@memberjunction/ai-vector-sync` | `EntityVectorSyncer`, `EntityDocumentTemplateParser` |
|
|
282
|
+
| `@memberjunction/aiengine` | AI engine integration |
|
|
283
|
+
| `@memberjunction/core` | Core MJ types and data access |
|
|
284
|
+
| `@memberjunction/core-entities` | Entity type definitions |
|
|
285
|
+
| `@memberjunction/global` | MJGlobal class factory |
|
|
210
286
|
|
|
211
|
-
##
|
|
287
|
+
## Limitations
|
|
212
288
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
5. **Monitor Performance**: Track processing times and optimize for large datasets
|
|
289
|
+
- Duplicate detection operates within a single entity type only; records are not compared across entity types
|
|
290
|
+
- Requires pre-configured Entity Documents with templates
|
|
291
|
+
- Currently supports only Pinecone as the vector database provider
|
|
292
|
+
- Records must be added to a List before detection can run
|
|
218
293
|
|
|
219
|
-
##
|
|
294
|
+
## Development
|
|
220
295
|
|
|
221
|
-
|
|
296
|
+
```bash
|
|
297
|
+
# Build
|
|
298
|
+
npm run build
|
|
222
299
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
const response = await detector.getDuplicateRecords(request, user);
|
|
226
|
-
if (response.Status === 'Error') {
|
|
227
|
-
console.error('Duplicate detection failed:', response.ErrorMessage);
|
|
228
|
-
}
|
|
229
|
-
} catch (error) {
|
|
230
|
-
console.error('Unexpected error:', error.message);
|
|
231
|
-
}
|
|
300
|
+
# Development mode
|
|
301
|
+
npm run start
|
|
232
302
|
```
|
|
233
303
|
|
|
234
|
-
##
|
|
235
|
-
|
|
236
|
-
- Currently supports duplicate detection within a single entity type only
|
|
237
|
-
- Requires pre-configured entity documents with templates
|
|
238
|
-
- Vector database support limited to Pinecone
|
|
239
|
-
- Performance depends on vector database query capabilities
|
|
240
|
-
|
|
241
|
-
## Future Enhancements
|
|
242
|
-
|
|
243
|
-
- Cross-entity duplicate detection
|
|
244
|
-
- Additional vector database providers
|
|
245
|
-
- Batch processing improvements
|
|
246
|
-
- Real-time duplicate prevention
|
|
247
|
-
- Advanced merge strategies
|
|
248
|
-
|
|
249
|
-
## Support
|
|
304
|
+
## License
|
|
250
305
|
|
|
251
|
-
|
|
306
|
+
ISC
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@memberjunction/ai-vector-dupe",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "4.
|
|
4
|
+
"version": "4.2.0",
|
|
5
5
|
"description": "MemberJunction: AI Vector/Entity Sync Package - Handles synchronization between Vector DB and MJ CDP Data",
|
|
6
6
|
"main": "dist/index.js",
|
|
7
7
|
"types": "dist/index.d.ts",
|
|
@@ -16,15 +16,15 @@
|
|
|
16
16
|
"author": "MemberJunction.com",
|
|
17
17
|
"license": "ISC",
|
|
18
18
|
"dependencies": {
|
|
19
|
-
"@memberjunction/ai": "4.
|
|
20
|
-
"@memberjunction/ai-vectordb": "4.
|
|
21
|
-
"@memberjunction/ai-vectors": "4.
|
|
22
|
-
"@memberjunction/ai-vectors-pinecone": "4.
|
|
23
|
-
"@memberjunction/ai-vector-sync": "4.
|
|
24
|
-
"@memberjunction/aiengine": "4.
|
|
25
|
-
"@memberjunction/core": "4.
|
|
26
|
-
"@memberjunction/global": "4.
|
|
27
|
-
"@memberjunction/core-entities": "4.
|
|
19
|
+
"@memberjunction/ai": "4.2.0",
|
|
20
|
+
"@memberjunction/ai-vectordb": "4.2.0",
|
|
21
|
+
"@memberjunction/ai-vectors": "4.2.0",
|
|
22
|
+
"@memberjunction/ai-vectors-pinecone": "4.2.0",
|
|
23
|
+
"@memberjunction/ai-vector-sync": "4.2.0",
|
|
24
|
+
"@memberjunction/aiengine": "4.2.0",
|
|
25
|
+
"@memberjunction/core": "4.2.0",
|
|
26
|
+
"@memberjunction/global": "4.2.0",
|
|
27
|
+
"@memberjunction/core-entities": "4.2.0",
|
|
28
28
|
"dotenv": "^17.2.4"
|
|
29
29
|
},
|
|
30
30
|
"devDependencies": {
|