@memberjunction/ai-vector-dupe 2.43.0 → 2.45.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +219 -58
- package/package.json +10 -10
package/README.md
CHANGED
|
@@ -1,90 +1,251 @@
|
|
|
1
|
-
#
|
|
1
|
+
# @memberjunction/ai-vector-dupe
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
A MemberJunction package for identifying and managing duplicate records using AI-powered vector similarity search. This package generates vector representations of records and uses similarity scoring to detect potential duplicates, with options for automatic merging.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The AI Vector Dupe package provides sophisticated duplicate detection capabilities by:
|
|
8
|
+
- Converting records into vector embeddings using AI models
|
|
9
|
+
- Performing similarity searches in vector databases
|
|
10
|
+
- Tracking duplicate detection runs and results
|
|
11
|
+
- Optionally merging duplicates based on configurable thresholds
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm install @memberjunction/ai-vector-dupe
|
|
17
|
+
```
|
|
6
18
|
|
|
7
19
|
## Prerequisites
|
|
8
20
|
|
|
9
|
-
|
|
21
|
+
1. **MemberJunction Framework**: A properly configured MemberJunction database with the core schema
|
|
22
|
+
2. **AI Model Provider**: API key for embedding models (OpenAI, Mistral, or other supported providers)
|
|
23
|
+
3. **Vector Database**: Pinecone is currently the only supported vector database; valid Pinecone API credentials are required
|
|
24
|
+
4. **Entity Documents**: Configured entity documents with templates for the entities you want to analyze
|
|
25
|
+
|
|
26
|
+
## Core Components
|
|
27
|
+
|
|
28
|
+
### DuplicateRecordDetector
|
|
29
|
+
|
|
30
|
+
The main class that handles duplicate detection operations.
|
|
31
|
+
|
|
32
|
+
```typescript
|
|
33
|
+
import { DuplicateRecordDetector } from '@memberjunction/ai-vector-dupe';
|
|
34
|
+
import { PotentialDuplicateRequest, UserInfo } from '@memberjunction/core';
|
|
35
|
+
|
|
36
|
+
const detector = new DuplicateRecordDetector();
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### VectorSyncBase
|
|
40
|
+
|
|
41
|
+
Abstract base class providing utilities for vector synchronization operations.
|
|
42
|
+
|
|
43
|
+
```typescript
|
|
44
|
+
import { VectorSyncBase } from '@memberjunction/ai-vector-dupe';
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### EntitySyncConfig
|
|
48
|
+
|
|
49
|
+
Type definition for entity synchronization configuration.
|
|
50
|
+
|
|
51
|
+
```typescript
|
|
52
|
+
import { EntitySyncConfig } from '@memberjunction/ai-vector-dupe';
|
|
53
|
+
|
|
54
|
+
const config: EntitySyncConfig = {
|
|
55
|
+
EntityDocumentID: 'entity-doc-id',
|
|
56
|
+
Interval: 3600,
|
|
57
|
+
RunViewParams: { /* RunView parameters */ },
|
|
58
|
+
IncludeInSync: true,
|
|
59
|
+
LastRunDate: 'January 1, 2024 00:00:00',
|
|
60
|
+
VectorIndexID: 1,
|
|
61
|
+
VectorID: 1
|
|
62
|
+
};
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
|
|
67
|
+
### Basic Duplicate Detection
|
|
68
|
+
|
|
69
|
+
```typescript
|
|
70
|
+
import { DuplicateRecordDetector } from '@memberjunction/ai-vector-dupe';
|
|
71
|
+
import { PotentialDuplicateRequest, UserInfo } from '@memberjunction/core';
|
|
72
|
+
|
|
73
|
+
// Initialize the detector
|
|
74
|
+
const detector = new DuplicateRecordDetector();
|
|
75
|
+
|
|
76
|
+
// Define the request parameters
|
|
77
|
+
const request: PotentialDuplicateRequest = {
|
|
78
|
+
ListID: 'your-list-id', // ID of the list containing records to check
|
|
79
|
+
EntityID: 'your-entity-id', // ID of the entity type
|
|
80
|
+
EntityDocumentID: 'doc-id', // ID of the entity document with template
|
|
81
|
+
Options: {
|
|
82
|
+
DuplicateRunID: 'run-id' // Optional: existing duplicate run to continue
|
|
83
|
+
}
|
|
84
|
+
};
|
|
85
|
+
|
|
86
|
+
// Execute duplicate detection
|
|
87
|
+
const response = await detector.getDuplicateRecords(request, currentUser);
|
|
88
|
+
|
|
89
|
+
if (response.Status === 'Success') {
|
|
90
|
+
console.log(`Found ${response.PotentialDuplicateResult.length} records with potential duplicates`);
|
|
91
|
+
|
|
92
|
+
for (const result of response.PotentialDuplicateResult) {
|
|
93
|
+
console.log(`Record ${result.RecordCompositeKey.ToString()}:`);
|
|
94
|
+
for (const duplicate of result.Duplicates) {
|
|
95
|
+
console.log(` - Potential duplicate: ${duplicate.ToString()} (${(duplicate.ProbabilityScore * 100).toFixed(1)}% match)`);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Advanced Configuration
|
|
10
102
|
|
|
11
|
-
|
|
12
|
-
|
|
103
|
+
```typescript
|
|
104
|
+
// Configure thresholds via Entity Document settings
|
|
105
|
+
// PotentialMatchThreshold: Minimum score to consider as potential duplicate (e.g., 0.8)
|
|
106
|
+
// AbsoluteMatchThreshold: Score at which automatic merging occurs (e.g., 0.95)
|
|
13
107
|
|
|
14
|
-
|
|
15
|
-
|
|
108
|
+
const entityDocument = await vectorizer.GetEntityDocument(entityDocumentID);
|
|
109
|
+
entityDocument.PotentialMatchThreshold = 0.8; // 80% similarity
|
|
110
|
+
entityDocument.AbsoluteMatchThreshold = 0.95; // 95% for auto-merge
|
|
111
|
+
await entityDocument.Save();
|
|
112
|
+
```
|
|
16
113
|
|
|
17
|
-
|
|
18
|
-
Currently, only **Pinecone** is supported for vector storage.
|
|
114
|
+
## API Reference
|
|
19
115
|
|
|
20
|
-
|
|
116
|
+
### DuplicateRecordDetector
|
|
21
117
|
|
|
22
|
-
|
|
118
|
+
#### `getDuplicateRecords(params: PotentialDuplicateRequest, contextUser?: UserInfo): Promise<PotentialDuplicateResponse>`
|
|
23
119
|
|
|
24
|
-
|
|
120
|
+
Performs duplicate detection on records in a list.
|
|
25
121
|
|
|
26
|
-
|
|
27
|
-
|
|
122
|
+
**Parameters:**
|
|
123
|
+
- `params`: Request parameters including:
|
|
124
|
+
- `ListID`: ID of the list containing records to analyze
|
|
125
|
+
- `EntityID`: ID of the entity type
|
|
126
|
+
- `EntityDocumentID`: ID of the entity document configuration
|
|
127
|
+
- `Options`: Optional configuration including `DuplicateRunID`
|
|
128
|
+
- `contextUser`: Optional user context for permissions
|
|
28
129
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
130
|
+
**Returns:** `PotentialDuplicateResponse` containing:
|
|
131
|
+
- `Status`: 'Success' or 'Error'
|
|
132
|
+
- `ErrorMessage`: Error details if failed
|
|
133
|
+
- `PotentialDuplicateResult`: Array of results, one entry per analyzed record
|
|
32
134
|
|
|
33
|
-
|
|
34
|
-
Create an instance of the `DuplicateRecordDetector` class and call the `getDuplicateRecords` function with the following parameters:
|
|
135
|
+
### VectorSyncBase
|
|
35
136
|
|
|
36
|
-
|
|
37
|
-
|--------------------|----------------|-----------------------------------------------------------------------------|
|
|
38
|
-
| `listID` | `string` | The ID of the list containing the records to analyze. |
|
|
39
|
-
| `entityID` | `string` | The ID of the entity the records belong to. |
|
|
40
|
-
| `probabilityScore` | `number` (optional) | The minimum similarity score to consider a record as a potential duplicate. |
|
|
137
|
+
Base class providing utility methods:
|
|
41
138
|
|
|
42
|
-
|
|
139
|
+
- `parseStringTemplate(str: string, obj: any): string` - Parse template strings
|
|
140
|
+
- `timer(ms: number): Promise<unknown>` - Async delay utility
|
|
141
|
+
- `start()` / `end()` / `timeDiff()` - Timing utilities
|
|
142
|
+
- `saveJSONData(data: any, path: string)` - JSON file operations
|
|
43
143
|
|
|
44
|
-
|
|
144
|
+
## Workflow Details
|
|
45
145
|
|
|
46
|
-
|
|
146
|
+
The duplicate detection process follows these steps:
|
|
47
147
|
|
|
48
|
-
|
|
148
|
+
1. **Vectorization**: Records are converted to vector embeddings using the configured AI model
|
|
149
|
+
2. **Similarity Search**: Each vector is compared against others in the vector database
|
|
150
|
+
3. **Threshold Filtering**: Results are filtered based on the potential match threshold
|
|
151
|
+
4. **Result Tracking**: All operations are logged in duplicate run tables
|
|
152
|
+
5. **Optional Merging**: Records exceeding the absolute match threshold are automatically merged
|
|
49
153
|
|
|
50
|
-
|
|
51
|
-
Fetches the list by `listID` and retrieves all records contained within it.
|
|
154
|
+
## Database Schema Integration
|
|
52
155
|
|
|
53
|
-
|
|
54
|
-
- If configured, generates new vectors for all records associated with the specified `entityID` and upserts them into the vector database.
|
|
55
|
-
- If not configured to upsert new vectors, it queries the vector database to fetch existing vectors for the records.
|
|
156
|
+
The package integrates with these MemberJunction entities:
|
|
56
157
|
|
|
57
|
-
|
|
58
|
-
|
|
158
|
+
- **Duplicate Runs**: Master record for each duplicate detection execution
|
|
159
|
+
- **Duplicate Run Details**: Individual record analysis results
|
|
160
|
+
- **Duplicate Run Detail Matches**: Specific duplicate matches found
|
|
161
|
+
- **Lists**: Source lists containing records to analyze
|
|
162
|
+
- **List Details**: Individual records within lists
|
|
163
|
+
- **Entity Documents**: Configuration for entity vectorization
|
|
59
164
|
|
|
60
|
-
|
|
61
|
-
Fetches database records corresponding to the similar vectors retrieved.
|
|
165
|
+
## Configuration
|
|
62
166
|
|
|
63
|
-
|
|
64
|
-
If configured, merges records marked as duplicates into the source record based on a **similarity probability threshold**.
|
|
65
|
-
- Example: If the similarity score exceeds `0.95`, the record is merged.
|
|
167
|
+
### Environment Variables
|
|
66
168
|
|
|
67
|
-
|
|
68
|
-
Records are created in the database to log:
|
|
69
|
-
- The duplicate record search run.
|
|
70
|
-
- Which records were analyzed.
|
|
71
|
-
- Which records were marked as potential duplicates.
|
|
169
|
+
Create a `.env` file with:
|
|
72
170
|
|
|
73
|
-
|
|
171
|
+
```env
|
|
172
|
+
# AI Model Configuration
|
|
173
|
+
OPENAI_API_KEY=your-openai-key
|
|
174
|
+
MISTRAL_API_KEY=your-mistral-key
|
|
74
175
|
|
|
75
|
-
|
|
176
|
+
# Vector Database
|
|
177
|
+
PINECONE_API_KEY=your-pinecone-key
|
|
178
|
+
PINECONE_HOST=your-pinecone-host
|
|
179
|
+
PINECONE_DEFAULT_INDEX=your-index-name
|
|
76
180
|
|
|
77
|
-
|
|
181
|
+
# Database Connection
|
|
182
|
+
DB_HOST=your-sql-server
|
|
183
|
+
DB_PORT=1433
|
|
184
|
+
DB_USERNAME=your-username
|
|
185
|
+
DB_PASSWORD=your-password
|
|
186
|
+
DB_DATABASE=your-database
|
|
187
|
+
|
|
188
|
+
# User Context
|
|
189
|
+
CURRENT_USER_EMAIL=user@example.com
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Entity Document Templates
|
|
193
|
+
|
|
194
|
+
Entity documents use template syntax to define how records are converted to text for vectorization:
|
|
78
195
|
|
|
79
196
|
```javascript
|
|
80
|
-
|
|
197
|
+
// Example template
|
|
198
|
+
const template = "${FirstName} ${LastName} works at ${Company} as ${Title}";
|
|
199
|
+
```
|
|
81
200
|
|
|
82
|
-
|
|
83
|
-
|
|
201
|
+
## Dependencies
|
|
202
|
+
|
|
203
|
+
- `@memberjunction/ai`: AI model abstractions
|
|
204
|
+
- `@memberjunction/ai-vectordb`: Vector database interfaces
|
|
205
|
+
- `@memberjunction/ai-vectors`: Vector operations
|
|
206
|
+
- `@memberjunction/ai-vectors-pinecone`: Pinecone implementation
|
|
207
|
+
- `@memberjunction/ai-vector-sync`: Entity vectorization
|
|
208
|
+
- `@memberjunction/core`: Core MJ functionality
|
|
209
|
+
- `@memberjunction/core-entities`: Entity definitions
|
|
210
|
+
|
|
211
|
+
## Best Practices
|
|
212
|
+
|
|
213
|
+
1. **Batch Processing**: For large datasets, process records in batches to avoid timeouts
|
|
214
|
+
2. **Threshold Tuning**: Start with conservative thresholds and adjust based on results
|
|
215
|
+
3. **Template Design**: Create comprehensive templates that capture all relevant fields
|
|
216
|
+
4. **Regular Sync**: Keep vector databases synchronized with source data
|
|
217
|
+
5. **Monitor Performance**: Track processing times and optimize for large datasets
|
|
218
|
+
|
|
219
|
+
## Error Handling
|
|
220
|
+
|
|
221
|
+
The package provides detailed error messages for common issues:
|
|
222
|
+
|
|
223
|
+
```typescript
|
|
224
|
+
try {
|
|
225
|
+
const response = await detector.getDuplicateRecords(request, user);
|
|
226
|
+
if (response.Status === 'Error') {
|
|
227
|
+
console.error('Duplicate detection failed:', response.ErrorMessage);
|
|
228
|
+
}
|
|
229
|
+
} catch (error) {
|
|
230
|
+
console.error('Unexpected error:', error instanceof Error ? error.message : error);
|
|
231
|
+
}
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## Limitations
|
|
235
|
+
|
|
236
|
+
- Currently supports duplicate detection within a single entity type only
|
|
237
|
+
- Requires pre-configured entity documents with templates
|
|
238
|
+
- Vector database support limited to Pinecone
|
|
239
|
+
- Performance depends on vector database query capabilities
|
|
240
|
+
|
|
241
|
+
## Future Enhancements
|
|
242
|
+
|
|
243
|
+
- Cross-entity duplicate detection
|
|
244
|
+
- Additional vector database providers
|
|
245
|
+
- Batch processing improvements
|
|
246
|
+
- Real-time duplicate prevention
|
|
247
|
+
- Advanced merge strategies
|
|
248
|
+
|
|
249
|
+
## Support
|
|
84
250
|
|
|
85
|
-
|
|
86
|
-
detector.getDuplicateRecords({
|
|
87
|
-
listID: 'example-list-id',
|
|
88
|
-
entityID: 'example-entity-id',
|
|
89
|
-
probabilityScore: 0.9
|
|
90
|
-
});
|
|
251
|
+
For issues, questions, or contributions, please refer to the [MemberJunction documentation](https://docs.memberjunction.org) or contact the development team.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@memberjunction/ai-vector-dupe",
|
|
3
|
-
"version": "2.43.0",
|
|
3
|
+
"version": "2.45.0",
|
|
4
4
|
"description": "MemberJunction: AI Vector/Entity Sync Package - Handles synchronization between Vector DB and MJ CDP Data",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -15,15 +15,15 @@
|
|
|
15
15
|
"author": "MemberJunction.com",
|
|
16
16
|
"license": "ISC",
|
|
17
17
|
"dependencies": {
|
|
18
|
-
"@memberjunction/ai": "2.43.0",
|
|
19
|
-
"@memberjunction/ai-vectordb": "2.43.0",
|
|
20
|
-
"@memberjunction/ai-vectors": "2.43.0",
|
|
21
|
-
"@memberjunction/ai-vectors-pinecone": "2.43.0",
|
|
22
|
-
"@memberjunction/ai-vector-sync": "2.43.0",
|
|
23
|
-
"@memberjunction/aiengine": "2.43.0",
|
|
24
|
-
"@memberjunction/core": "2.43.0",
|
|
25
|
-
"@memberjunction/global": "2.43.0",
|
|
26
|
-
"@memberjunction/core-entities": "2.43.0",
|
|
18
|
+
"@memberjunction/ai": "2.45.0",
|
|
19
|
+
"@memberjunction/ai-vectordb": "2.45.0",
|
|
20
|
+
"@memberjunction/ai-vectors": "2.45.0",
|
|
21
|
+
"@memberjunction/ai-vectors-pinecone": "2.45.0",
|
|
22
|
+
"@memberjunction/ai-vector-sync": "2.45.0",
|
|
23
|
+
"@memberjunction/aiengine": "2.45.0",
|
|
24
|
+
"@memberjunction/core": "2.45.0",
|
|
25
|
+
"@memberjunction/global": "2.45.0",
|
|
26
|
+
"@memberjunction/core-entities": "2.45.0",
|
|
27
27
|
"dotenv": "^16.4.1",
|
|
28
28
|
"typeorm": "^0.3.20"
|
|
29
29
|
},
|