entity-predictor 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +56 -17
- package/index.d.ts +26 -0
- package/package.json +4 -6
- package/src/predictor.js +148 -62
- package/src/string-similarity.js +83 -0
package/README.md
CHANGED
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
# Entity Predictor
|
|
2
2
|
|
|
3
|
-
A lightweight,
|
|
3
|
+
A lightweight, **Zero Dependency** Node.js library for entity name prediction and normalization.
|
|
4
|
+
|
|
5
|
+
It uses **fuzzy matching** to identify entities from messy input, supporting:
|
|
6
|
+
|
|
7
|
+
- **Aliases & Acronyms** (e.g., "SBI" -> "STATE BANK OF INDIA")
|
|
8
|
+
- **Confidence Scoring** ("Trustable", "High Confidence", etc.)
|
|
9
|
+
- **Top-N Matches** (Get the top 3 best guesses)
|
|
10
|
+
- **Configurable Stop Words** (Ignore "The", "Inc", etc.)
|
|
4
11
|
|
|
5
12
|
## Features
|
|
6
13
|
|
|
@@ -77,7 +84,41 @@ Output:
|
|
|
77
84
|
*/
|
|
78
85
|
```
|
|
79
86
|
|
|
80
|
-
### 3.
|
|
87
|
+
### 3. Top-N Matches
|
|
88
|
+
|
|
89
|
+
Get a list of best matches instead of just one.
|
|
90
|
+
|
|
91
|
+
```javascript
|
|
92
|
+
const results = predictor.predictTop("Apple", 3);
|
|
93
|
+
// Returns array of matches: [{ entity: "Apple Inc", ... }, ...]
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### 4. Stop Words Filtering
|
|
97
|
+
|
|
98
|
+
Automatically remove noise words like "The", "Inc", "Ltd". **Disabled by default.**
|
|
99
|
+
|
|
100
|
+
```javascript
|
|
101
|
+
// Enable with default list
|
|
102
|
+
const predictor = new EntityPredictor(entities, { ignoreStopWords: true });
|
|
103
|
+
|
|
104
|
+
// Enable with custom list
|
|
105
|
+
const predictor = new EntityPredictor(entities, {
|
|
106
|
+
ignoreStopWords: true,
|
|
107
|
+
stopWords: ["inc", "co", "corp"],
|
|
108
|
+
});
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### 5. Custom Normalization
|
|
112
|
+
|
|
113
|
+
Pass a custom normalizer to clean data your way.
|
|
114
|
+
|
|
115
|
+
```javascript
|
|
116
|
+
const predictor = new EntityPredictor(entities, {
|
|
117
|
+
normalizer: (text) => text.toUpperCase(),
|
|
118
|
+
});
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 6. Add Entities Dynamically
|
|
81
122
|
|
|
82
123
|
You can add new entities to an existing predictor instance.
|
|
83
124
|
|
|
@@ -87,28 +128,26 @@ predictor.addEntity("PUNJAB NATIONAL BANK", ["PNB"]);
|
|
|
87
128
|
|
|
88
129
|
## API Reference
|
|
89
130
|
|
|
90
|
-
### `new EntityPredictor(entities)`
|
|
131
|
+
### `new EntityPredictor(entities, options)`
|
|
91
132
|
|
|
92
133
|
- `entities`: Array of strings or objects `{ name: string, aliases: string[] }`.
|
|
134
|
+
- `options`: (Optional)
|
|
135
|
+
- `ignoreStopWords`: boolean (default `false`)
|
|
136
|
+
- `stopWords`: string[] (optional, defaults to internal list)
|
|
137
|
+
- `normalizer`: (text: string) => string
|
|
138
|
+
- **Throws**: `TypeError` if `entities` is not an array.
|
|
93
139
|
|
|
94
140
|
### `predict(input, threshold)`
|
|
95
141
|
|
|
96
142
|
- `input`: String to search for.
|
|
97
|
-
- `threshold`: (Optional) Minimum confidence score
|
|
143
|
+
- `threshold`: (Optional) Minimum confidence score (default `0.6`).
|
|
144
|
+
- **Returns**: Best match object, `{ entity: "UNKNOWN", ... }` if no match meets the threshold, or `null` if `input` is empty or not a string.
|
|
98
145
|
|
|
99
|
-
|
|
146
|
+
### `predictTop(input, limit, threshold)`
|
|
100
147
|
|
|
101
|
-
- `
|
|
102
|
-
-
|
|
103
|
-
- `confidenceLevel`:
|
|
104
|
-
- `"Trustable"` (1.0)
|
|
105
|
-
- `"High Confidence"` (>= 0.8)
|
|
106
|
-
- `"Moderate Confidence"` (>= 0.6)
|
|
107
|
-
- `"Low Confidence"` (< 0.6)
|
|
108
|
-
- Returns `null` if the input is invalid.
|
|
109
|
-
- Returns `{ entity: "UNKNOWN", ... }` if no match meets the threshold.
|
|
148
|
+
- `limit`: Max number of results (default `5`).
|
|
149
|
+
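- **Returns**: Array of match objects sorted by confidence (highest first), at most one per entity; empty if nothing meets `threshold` (default `0.6`).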
|
|
110
150
|
|
|
111
|
-
###
|
|
151
|
+
### TypeScript Support
|
|
112
152
|
|
|
113
|
-
|
|
114
|
-
- `aliases`: (Optional) Array of alias strings.
|
|
153
|
+
Includes `index.d.ts` for full TypeScript support.
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
 * Object form of an entity passed to the predictor.
 */
export interface EntityOption {
  /** Canonical entity name; this is the value reported back as `entity` in results. */
  name: string;
  /** Alternative spellings or acronyms that should resolve to `name`. */
  aliases?: string[];
}
|
|
5
|
+
|
|
6
|
+
/**
 * Configuration accepted by the `EntityPredictor` constructor.
 */
export interface PredictorOptions {
  /** Strip stop words during the built-in normalization. Treated as `false` unless exactly `true`. */
  ignoreStopWords?: boolean;
  /** Words to strip when `ignoreStopWords` is enabled; falls back to the built-in list when omitted. */
  stopWords?: string[];
  /** Custom normalizer. When provided it replaces the built-in normalization (including stop-word removal). */
  normalizer?: (text: string) => string;
}
|
|
11
|
+
|
|
12
|
+
/**
 * A single match produced by `predict` / `predictTop`.
 */
export interface PredictionResult {
  /** Canonical name of the matched entity, or `"UNKNOWN"` when `predict` finds nothing above the threshold. */
  entity: string;
  /** Similarity score between 0 and 1. */
  confidence: number;
  /** Label derived from `confidence`: 1 = "Trustable", >= 0.8 = "High Confidence", >= 0.6 = "Moderate Confidence", otherwise "Low Confidence". */
  confidenceLevel: "Trustable" | "High Confidence" | "Moderate Confidence" | "Low Confidence";
}
|
|
17
|
+
|
|
18
|
+
/**
 * Fuzzy entity-name predictor.
 *
 * Typings for the class implemented in `src/predictor.js`.
 */
export declare class EntityPredictor {
  /**
   * @param entities - Entity names or `{ name, aliases }` objects. Optional: the
   *   implementation defaults to an empty list.
   * @param options - Normalization / stop-word configuration.
   * @throws TypeError if `entities` is provided and is not an array.
   */
  constructor(entities?: readonly (string | EntityOption)[], options?: PredictorOptions);

  /**
   * Normalizes text the same way candidates and inputs are normalized before
   * comparison (custom normalizer if configured, built-in otherwise).
   */
  normalize(text: string): string;

  /**
   * Returns the single best match at or above `threshold` (default `0.6`),
   * an `"UNKNOWN"` result when nothing qualifies, or `null` when `input` is
   * empty or not a string.
   */
  predict(input: string, threshold?: number): PredictionResult | null;

  /**
   * Returns up to `limit` (default `5`) matches at or above `threshold`
   * (default `0.6`), best first, one per entity.
   */
  predictTop(input: string, limit?: number, threshold?: number): PredictionResult[];

  /** Registers an additional entity on an existing instance. */
  addEntity(entity: string | EntityOption): void;
}
|
package/package.json
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "entity-predictor",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "Lightweight entity
|
|
3
|
+
"version": "1.2.0",
|
|
4
|
+
"description": "Lightweight entity prediction with fuzzy matching, aliases, and confidence scoring.",
|
|
5
|
+
"types": "index.d.ts",
|
|
5
6
|
"type": "module",
|
|
6
7
|
"main": "src/index.js",
|
|
7
8
|
"keywords": [
|
|
@@ -12,8 +13,5 @@
|
|
|
12
13
|
],
|
|
13
14
|
"author": "Sahil",
|
|
14
15
|
"email": "dev.sahilkumar02@gmail.com",
|
|
15
|
-
"license": "MIT"
|
|
16
|
-
"dependencies": {
|
|
17
|
-
"string-similarity": "^4.0.4"
|
|
18
|
-
}
|
|
16
|
+
"license": "MIT"
|
|
19
17
|
}
|
package/src/predictor.js
CHANGED
|
@@ -1,93 +1,179 @@
|
|
|
1
|
-
import
|
|
1
|
+
import { findBestMatch } from "./string-similarity.js";
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
3
|
+
// Built-in stop words used when stop-word filtering is enabled and the caller
// supplies no list of their own. Entries are lower-case because text is
// lower-cased before the stop words are matched against it.
const DEFAULT_STOP_WORDS = [
  "the",
  "inc",
  "ltd",
  "pvt",
  "corp",
  "corporation",
  "co",
  "company",
  "limited",
  "private",
  "bank",
];
|
|
16
|
+
|
|
17
|
+
/**
 * Built-in text normalizer: lower-cases the text, optionally strips stop
 * words, then removes every character that is not a-z.
 *
 * @param {string} text - Raw text to normalize.
 * @param {boolean} [ignoreStopWords=true] - Whether to strip stop words.
 * @param {string[]} [stopWords=DEFAULT_STOP_WORDS] - Words to strip.
 * @returns {string} Text reduced to lower-case letters only.
 */
function defaultNormalize(
  text,
  ignoreStopWords = true,
  stopWords = DEFAULT_STOP_WORDS
) {
  let processed = text.toLowerCase();

  if (ignoreStopWords) {
    // The text was lower-cased above, so stop words must be lower-cased too
    // or entries such as "Inc" would never match. Regex metacharacters are
    // escaped so a custom word like "c++" or "(" cannot corrupt the pattern
    // or make the RegExp constructor throw.
    const alternatives = stopWords
      .filter((word) => typeof word === "string" && word !== "")
      .map((word) => word.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));

    if (alternatives.length > 0) {
      // Remove stop words (must be surrounded by word boundaries or start/end)
      const regex = new RegExp(`\\b(${alternatives.join("|")})\\b`, "g");
      processed = processed.replace(regex, " ");
    }
  }

  // Spaces, digits and punctuation are all dropped here.
  return processed.replace(/[^a-z]/g, "").trim();
}
|
|
9
32
|
|
|
33
|
+
/**
|
|
34
|
+
* EntityPredictor class for fuzzy matching entities.
|
|
35
|
+
*/
|
|
10
36
|
export class EntityPredictor {
|
|
11
|
-
|
|
37
|
+
/**
|
|
38
|
+
* Creates an instance of EntityPredictor.
|
|
39
|
+
* @param {Array<string | {name: string, aliases: string[]}>} entities - List of entities.
|
|
40
|
+
* @param {Object} [options] - Configuration options.
|
|
41
|
+
* @param {boolean} [options.ignoreStopWords=false] - Whether to ignore stop words.
|
|
42
|
+
* @param {string[]} [options.stopWords] - Custom list of stop words.
|
|
43
|
+
* @param {function(string): string} [options.normalizer] - Custom normalizer function.
|
|
44
|
+
* @throws {TypeError} If entities is not an array.
|
|
45
|
+
*/
|
|
46
|
+
constructor(entities = [], options = {}) {
|
|
47
|
+
if (!Array.isArray(entities)) {
|
|
48
|
+
throw new TypeError("Entities must be an array.");
|
|
49
|
+
}
|
|
50
|
+
|
|
12
51
|
this.entities = [];
|
|
13
52
|
this.searchCandidates = [];
|
|
14
53
|
this.candidateToEntity = [];
|
|
15
54
|
|
|
55
|
+
this.ignoreStopWords = options.ignoreStopWords === true; // Default false
|
|
56
|
+
this.stopWords = options.stopWords || DEFAULT_STOP_WORDS;
|
|
57
|
+
this.customNormalizer = options.normalizer;
|
|
58
|
+
|
|
16
59
|
entities.forEach((item) => {
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
60
|
+
this.addEntity(item, true); // true = internal call
|
|
61
|
+
});
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Normalizes text for comparison.
|
|
66
|
+
* @param {string} text - The text to normalize.
|
|
67
|
+
* @returns {string} Normalized text.
|
|
68
|
+
*/
|
|
69
|
+
normalize(text) {
|
|
70
|
+
if (this.customNormalizer) {
|
|
71
|
+
return this.customNormalizer(text);
|
|
72
|
+
}
|
|
73
|
+
return defaultNormalize(text, this.ignoreStopWords, this.stopWords);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/**
|
|
77
|
+
* Adds an entity to the predictor.
|
|
78
|
+
* @param {string | {name: string, aliases: string[]}} item - The entity to add.
|
|
79
|
+
* @param {boolean} [isInternal=false] - Internal flag.
|
|
80
|
+
*/
|
|
81
|
+
addEntity(item, isInternal = false) {
|
|
82
|
+
let entityName;
|
|
83
|
+
let aliases = [];
|
|
84
|
+
|
|
85
|
+
if (typeof item === "string") {
|
|
86
|
+
entityName = item;
|
|
87
|
+
} else if (typeof item === "object" && item.name) {
|
|
88
|
+
entityName = item.name;
|
|
89
|
+
if (Array.isArray(item.aliases)) {
|
|
90
|
+
aliases = item.aliases;
|
|
29
91
|
}
|
|
92
|
+
} else {
|
|
93
|
+
// Invalid entity format, skip silently or could warn in future
|
|
94
|
+
return;
|
|
95
|
+
}
|
|
30
96
|
|
|
97
|
+
if (!this.entities.includes(entityName)) {
|
|
31
98
|
this.entities.push(entityName);
|
|
99
|
+
}
|
|
32
100
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
this.candidateToEntity.push(entityName);
|
|
101
|
+
// Add canonical
|
|
102
|
+
this.searchCandidates.push(this.normalize(entityName));
|
|
103
|
+
this.candidateToEntity.push(entityName);
|
|
37
104
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
});
|
|
105
|
+
// Add aliases
|
|
106
|
+
aliases.forEach((alias) => {
|
|
107
|
+
this.searchCandidates.push(this.normalize(alias));
|
|
108
|
+
this.candidateToEntity.push(entityName);
|
|
43
109
|
});
|
|
44
110
|
}
|
|
45
111
|
|
|
112
|
+
/**
|
|
113
|
+
* Predicts the best match for the input.
|
|
114
|
+
* @param {string} input - The input string.
|
|
115
|
+
* @param {number} [threshold=0.6] - The confidence threshold.
|
|
116
|
+
* @returns {{entity: string, confidence: number, confidenceLevel: string} | null} The best match or null/UNKNOWN.
|
|
117
|
+
*/
|
|
46
118
|
predict(input, threshold = 0.6) {
|
|
47
119
|
if (!input || typeof input !== "string") {
|
|
48
120
|
return null;
|
|
49
121
|
}
|
|
122
|
+
const results = this.predictTop(input, 1, threshold);
|
|
123
|
+
if (results.length > 0) {
|
|
124
|
+
return results[0];
|
|
125
|
+
}
|
|
126
|
+
return {
|
|
127
|
+
entity: "UNKNOWN",
|
|
128
|
+
confidence: 0,
|
|
129
|
+
confidenceLevel: "Low Confidence",
|
|
130
|
+
};
|
|
131
|
+
}
|
|
50
132
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
133
|
+
/**
|
|
134
|
+
* Predicts the top N best matches.
|
|
135
|
+
* @param {string} input - The input string.
|
|
136
|
+
* @param {number} [limit=5] - The number of results to return.
|
|
137
|
+
* @param {number} [threshold=0.6] - The confidence threshold.
|
|
138
|
+
* @returns {Array<{entity: string, confidence: number, confidenceLevel: string}>} Array of matches.
|
|
139
|
+
*/
|
|
140
|
+
predictTop(input, limit = 5, threshold = 0.6) {
|
|
141
|
+
if (!input || typeof input !== "string") {
|
|
142
|
+
return [];
|
|
143
|
+
}
|
|
55
144
|
|
|
56
|
-
const
|
|
57
|
-
|
|
145
|
+
const normalizedInput = this.normalize(input);
|
|
146
|
+
const matches = findBestMatch(normalizedInput, this.searchCandidates);
|
|
58
147
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
148
|
+
// Map all ratings to our format and sort
|
|
149
|
+
const sortedMatches = matches.ratings
|
|
150
|
+
.map((rating, index) => ({
|
|
151
|
+
entity: this.candidateToEntity[index],
|
|
152
|
+
confidence: rating.rating,
|
|
153
|
+
confidenceLevel: this._getConfidenceLevel(rating.rating),
|
|
154
|
+
}))
|
|
155
|
+
.filter((m) => m.confidence >= threshold)
|
|
156
|
+
.sort((a, b) => b.confidence - a.confidence);
|
|
66
157
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
158
|
+
// Deduplicate entities (picking the highest score for each unique entity)
|
|
159
|
+
const uniqueMatches = [];
|
|
160
|
+
const seenEntities = new Set();
|
|
161
|
+
|
|
162
|
+
for (const match of sortedMatches) {
|
|
163
|
+
if (!seenEntities.has(match.entity)) {
|
|
164
|
+
uniqueMatches.push(match);
|
|
165
|
+
seenEntities.add(match.entity);
|
|
166
|
+
if (uniqueMatches.length >= limit) break;
|
|
167
|
+
}
|
|
73
168
|
}
|
|
74
169
|
|
|
75
|
-
return
|
|
76
|
-
entity: "UNKNOWN",
|
|
77
|
-
confidence: rating,
|
|
78
|
-
confidenceLevel,
|
|
79
|
-
};
|
|
170
|
+
return uniqueMatches;
|
|
80
171
|
}
|
|
81
172
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
aliases.forEach((alias) => {
|
|
89
|
-
this.searchCandidates.push(normalize(alias));
|
|
90
|
-
this.candidateToEntity.push(entity);
|
|
91
|
-
});
|
|
173
|
+
_getConfidenceLevel(rating) {
|
|
174
|
+
if (rating === 1) return "Trustable";
|
|
175
|
+
if (rating >= 0.8) return "High Confidence";
|
|
176
|
+
if (rating >= 0.6) return "Moderate Confidence";
|
|
177
|
+
return "Low Confidence";
|
|
92
178
|
}
|
|
93
179
|
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
 * Scores the similarity of two strings with the Dice coefficient computed
 * over adjacent character pairs (bigrams). All whitespace is ignored.
 *
 * @param {string} first - The first string to compare.
 * @param {string} second - The second string to compare.
 * @returns {number} A value between 0 (nothing shared) and 1 (identical).
 */
export function compareTwoStrings(first, second) {
  const left = first.replace(/\s+/g, "");
  const right = second.replace(/\s+/g, "");

  // Identical strings (including two empty ones) are a perfect match.
  if (left === right) return 1;
  // A string shorter than two characters has no bigrams to share.
  if (left.length < 2 || right.length < 2) return 0;

  // Count how often each bigram occurs in the left string.
  const pool = new Map();
  for (let pos = 0; pos + 1 < left.length; pos++) {
    const pair = left.slice(pos, pos + 2);
    pool.set(pair, (pool.get(pair) || 0) + 1);
  }

  // Consume one occurrence from the pool for every bigram of the right
  // string that is still available there.
  let shared = 0;
  for (let pos = 0; pos + 1 < right.length; pos++) {
    const pair = right.slice(pos, pos + 2);
    const remaining = pool.get(pair) || 0;
    if (remaining > 0) {
      pool.set(pair, remaining - 1);
      shared += 1;
    }
  }

  // Each string of length n has n - 1 bigrams, hence the "- 2".
  return (2.0 * shared) / (left.length + right.length - 2);
}
|
|
36
|
+
|
|
37
|
+
/**
 * Rates every target string against the main string and reports the winner.
 *
 * @param {string} mainString - The string to match.
 * @param {string[]} targetStrings - The array of strings to match against.
 * @returns {{ ratings: Array<{target: string, rating: number}>, bestMatch: {target: string, rating: number}, bestMatchIndex: number }}
 *   One rating per target (same order as the input), plus the highest-rated
 *   entry and its index. On a tie the earliest target wins.
 * @throws {TypeError} If arguments are invalid.
 */
export function findBestMatch(mainString, targetStrings) {
  if (!areArgsValid(mainString, targetStrings)) {
    throw new TypeError(
      "Bad arguments: First argument should be a string, second should be an array of strings"
    );
  }

  const ratings = [];
  let bestMatchIndex = 0;

  for (const [position, candidate] of targetStrings.entries()) {
    const score = compareTwoStrings(mainString, candidate);
    ratings.push({ target: candidate, rating: score });

    // Strict ">" keeps the first of several equally rated targets.
    if (score > ratings[bestMatchIndex].rating) {
      bestMatchIndex = position;
    }
  }

  return {
    ratings,
    bestMatch: ratings[bestMatchIndex],
    bestMatchIndex,
  };
}
|
|
71
|
+
|
|
72
|
+
function areArgsValid(mainString, targetStrings) {
|
|
73
|
+
if (typeof mainString !== "string") return false;
|
|
74
|
+
if (!Array.isArray(targetStrings)) return false;
|
|
75
|
+
if (!targetStrings.length) return false;
|
|
76
|
+
if (
|
|
77
|
+
targetStrings.find(function (s) {
|
|
78
|
+
return typeof s !== "string";
|
|
79
|
+
})
|
|
80
|
+
)
|
|
81
|
+
return false;
|
|
82
|
+
return true;
|
|
83
|
+
}
|