entity-predictor 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/index.d.ts +1 -1
- package/package.json +16 -9
- package/src/predictor.js +43 -6
- package/src/string-similarity.js +83 -0
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Entity Predictor
|
|
2
2
|
|
|
3
|
-
A lightweight,
|
|
3
|
+
A lightweight, **Zero Dependency** Node.js library for entity name prediction and normalization.
|
|
4
4
|
|
|
5
5
|
It uses **fuzzy matching** to identify entities from messy input, supporting:
|
|
6
6
|
|
|
@@ -135,6 +135,7 @@ predictor.addEntity("PUNJAB NATIONAL BANK", ["PNB"]);
|
|
|
135
135
|
- `ignoreStopWords`: boolean (default `false`)
|
|
136
136
|
- `stopWords`: string[] (optional, defaults to internal list)
|
|
137
137
|
- `normalizer`: (text: string) => string
|
|
138
|
+
- **Throws**: `TypeError` if `entities` is not an array.
|
|
138
139
|
|
|
139
140
|
### `predict(input, threshold)`
|
|
140
141
|
|
package/index.d.ts
CHANGED
|
@@ -18,7 +18,7 @@ export interface PredictionResult {
|
|
|
18
18
|
export class EntityPredictor {
|
|
19
19
|
constructor(entities: (string | EntityOption)[], options?: PredictorOptions);
|
|
20
20
|
|
|
21
|
-
predict(input: string, threshold?: number): PredictionResult;
|
|
21
|
+
predict(input: string, threshold?: number): PredictionResult | null;
|
|
22
22
|
|
|
23
23
|
predictTop(input: string, limit?: number, threshold?: number): PredictionResult[];
|
|
24
24
|
|
package/package.json
CHANGED
|
@@ -1,20 +1,27 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "entity-predictor",
|
|
3
|
-
"version": "1.1.0",
|
|
4
|
-
"description": "Lightweight
|
|
3
|
+
"version": "1.2.1",
|
|
4
|
+
"description": "Lightweight, Zero Dependency Node.js library for entity name prediction and normalization.",
|
|
5
5
|
"types": "index.d.ts",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"main": "src/index.js",
|
|
8
|
+
"author": "Sahil <dev.sahilkumar02@gmail.com> (github.com/Sahilkr02)",
|
|
8
9
|
"keywords": [
|
|
9
10
|
"nlp",
|
|
10
11
|
"entity",
|
|
11
12
|
"prediction",
|
|
12
|
-
"nodejs"
|
|
13
|
+
"nodejs",
|
|
14
|
+
"fuzzy-matching",
|
|
15
|
+
"string-similarity",
|
|
16
|
+
"entity-normalization",
|
|
17
|
+
"entity-prediction"
|
|
13
18
|
],
|
|
14
|
-
"
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
"
|
|
19
|
-
|
|
19
|
+
"files": [
|
|
20
|
+
"src/index.js",
|
|
21
|
+
"src/predictor.js",
|
|
22
|
+
"src/string-similarity.js",
|
|
23
|
+
"index.d.ts",
|
|
24
|
+
"package.json"
|
|
25
|
+
],
|
|
26
|
+
"license": "MIT"
|
|
20
27
|
}
|
package/src/predictor.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import
|
|
1
|
+
import { findBestMatch } from "./string-similarity.js";
|
|
2
2
|
|
|
3
3
|
const DEFAULT_STOP_WORDS = [
|
|
4
4
|
"the",
|
|
@@ -30,8 +30,24 @@ function defaultNormalize(
|
|
|
30
30
|
return processed.replace(/[^a-z]/g, "").trim();
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
+
/**
|
|
34
|
+
* EntityPredictor class for fuzzy matching entities.
|
|
35
|
+
*/
|
|
33
36
|
export class EntityPredictor {
|
|
37
|
+
/**
|
|
38
|
+
* Creates an instance of EntityPredictor.
|
|
39
|
+
* @param {Array<string | {name: string, aliases: string[]}>} entities - List of entities.
|
|
40
|
+
* @param {Object} [options] - Configuration options.
|
|
41
|
+
* @param {boolean} [options.ignoreStopWords=false] - Whether to ignore stop words.
|
|
42
|
+
* @param {string[]} [options.stopWords] - Custom list of stop words.
|
|
43
|
+
* @param {function(string): string} [options.normalizer] - Custom normalizer function.
|
|
44
|
+
* @throws {TypeError} If entities is not an array.
|
|
45
|
+
*/
|
|
34
46
|
constructor(entities = [], options = {}) {
|
|
47
|
+
if (!Array.isArray(entities)) {
|
|
48
|
+
throw new TypeError("Entities must be an array.");
|
|
49
|
+
}
|
|
50
|
+
|
|
35
51
|
this.entities = [];
|
|
36
52
|
this.searchCandidates = [];
|
|
37
53
|
this.candidateToEntity = [];
|
|
@@ -41,10 +57,15 @@ export class EntityPredictor {
|
|
|
41
57
|
this.customNormalizer = options.normalizer;
|
|
42
58
|
|
|
43
59
|
entities.forEach((item) => {
|
|
44
|
-
this.addEntity(item, true); // true = internal call
|
|
60
|
+
this.addEntity(item, true); // true = internal call
|
|
45
61
|
});
|
|
46
62
|
}
|
|
47
63
|
|
|
64
|
+
/**
|
|
65
|
+
* Normalizes text for comparison.
|
|
66
|
+
* @param {string} text - The text to normalize.
|
|
67
|
+
* @returns {string} Normalized text.
|
|
68
|
+
*/
|
|
48
69
|
normalize(text) {
|
|
49
70
|
if (this.customNormalizer) {
|
|
50
71
|
return this.customNormalizer(text);
|
|
@@ -52,6 +73,11 @@ export class EntityPredictor {
|
|
|
52
73
|
return defaultNormalize(text, this.ignoreStopWords, this.stopWords);
|
|
53
74
|
}
|
|
54
75
|
|
|
76
|
+
/**
|
|
77
|
+
* Adds an entity to the predictor.
|
|
78
|
+
* @param {string | {name: string, aliases: string[]}} item - The entity to add.
|
|
79
|
+
* @param {boolean} [isInternal=false] - Internal flag.
|
|
80
|
+
*/
|
|
55
81
|
addEntity(item, isInternal = false) {
|
|
56
82
|
let entityName;
|
|
57
83
|
let aliases = [];
|
|
@@ -64,6 +90,7 @@ export class EntityPredictor {
|
|
|
64
90
|
aliases = item.aliases;
|
|
65
91
|
}
|
|
66
92
|
} else {
|
|
93
|
+
// Invalid entity format, skip silently or could warn in future
|
|
67
94
|
return;
|
|
68
95
|
}
|
|
69
96
|
|
|
@@ -82,6 +109,12 @@ export class EntityPredictor {
|
|
|
82
109
|
});
|
|
83
110
|
}
|
|
84
111
|
|
|
112
|
+
/**
|
|
113
|
+
* Predicts the best match for the input.
|
|
114
|
+
* @param {string} input - The input string.
|
|
115
|
+
* @param {number} [threshold=0.6] - The confidence threshold.
|
|
116
|
+
* @returns {{entity: string, confidence: number, confidenceLevel: string} | null} The best match or null/UNKNOWN.
|
|
117
|
+
*/
|
|
85
118
|
predict(input, threshold = 0.6) {
|
|
86
119
|
if (!input || typeof input !== "string") {
|
|
87
120
|
return null;
|
|
@@ -97,16 +130,20 @@ export class EntityPredictor {
|
|
|
97
130
|
};
|
|
98
131
|
}
|
|
99
132
|
|
|
133
|
+
/**
|
|
134
|
+
* Predicts the top N best matches.
|
|
135
|
+
* @param {string} input - The input string.
|
|
136
|
+
* @param {number} [limit=5] - The number of results to return.
|
|
137
|
+
* @param {number} [threshold=0.6] - The confidence threshold.
|
|
138
|
+
* @returns {Array<{entity: string, confidence: number, confidenceLevel: string}>} Array of matches.
|
|
139
|
+
*/
|
|
100
140
|
predictTop(input, limit = 5, threshold = 0.6) {
|
|
101
141
|
if (!input || typeof input !== "string") {
|
|
102
142
|
return [];
|
|
103
143
|
}
|
|
104
144
|
|
|
105
145
|
const normalizedInput = this.normalize(input);
|
|
106
|
-
const matches =
|
|
107
|
-
normalizedInput,
|
|
108
|
-
this.searchCandidates
|
|
109
|
-
);
|
|
146
|
+
const matches = findBestMatch(normalizedInput, this.searchCandidates);
|
|
110
147
|
|
|
111
148
|
// Map all ratings to our format and sort
|
|
112
149
|
const sortedMatches = matches.ratings
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
 * Scores how alike two strings are using the Dice coefficient over
 * overlapping two-character substrings (bigrams).
 *
 * All whitespace is stripped from both inputs before comparing. No argument
 * validation happens here: both arguments must be strings.
 *
 * @param {string} first - The first string to compare.
 * @param {string} second - The second string to compare.
 * @returns {number} A value between 0 and 1. Returns 1 when the stripped
 *   strings are identical (including both empty) and 0 when they differ and
 *   either one has fewer than two characters.
 */
export function compareTwoStrings(first, second) {
  const left = first.replace(/\s+/g, "");
  const right = second.replace(/\s+/g, "");

  // Identical strings (this also covers two empty strings) are a perfect match.
  if (left === right) return 1;
  // A string shorter than two characters contains no bigram to compare.
  if (left.length < 2 || right.length < 2) return 0;

  // Tally every bigram of `left`, keeping duplicates as a count.
  const remaining = new Map();
  for (let pos = 0; pos + 1 < left.length; pos += 1) {
    const pair = left.slice(pos, pos + 2);
    remaining.set(pair, (remaining.get(pair) || 0) + 1);
  }

  // Each bigram of `right` consumes one matching occurrence from the tally,
  // so a repeated bigram is only matched as often as it occurs in `left`.
  let shared = 0;
  for (let pos = 0; pos + 1 < right.length; pos += 1) {
    const pair = right.slice(pos, pos + 2);
    const available = remaining.get(pair) || 0;
    if (available > 0) {
      remaining.set(pair, available - 1);
      shared += 1;
    }
  }

  // Dice coefficient: 2 * |shared| / (bigrams in left + bigrams in right).
  return (2.0 * shared) / (left.length + right.length - 2);
}
|
|
36
|
+
|
|
37
|
+
/**
 * Rates every candidate in `targetStrings` against `mainString` and reports
 * which candidate scored highest.
 *
 * @param {string} mainString - The string to match.
 * @param {string[]} targetStrings - The candidates to rate; must be non-empty.
 * @returns {{ ratings: Array<{target: string, rating: number}>, bestMatch: {target: string, rating: number}, bestMatchIndex: number }}
 *   `ratings` holds one entry per candidate in input order; `bestMatch` is the
 *   entry at `bestMatchIndex`.
 * @throws {TypeError} If the arguments fail validation.
 */
export function findBestMatch(mainString, targetStrings) {
  if (!areArgsValid(mainString, targetStrings)) {
    throw new TypeError(
      "Bad arguments: First argument should be a string, second should be an array of strings"
    );
  }

  const ratings = [];
  let bestMatchIndex = 0;

  for (const [index, target] of targetStrings.entries()) {
    const rating = compareTwoStrings(mainString, target);
    ratings.push({ target, rating });

    // Strict ">" keeps the earliest candidate when several share the top score.
    if (rating > ratings[bestMatchIndex].rating) {
      bestMatchIndex = index;
    }
  }

  return {
    ratings,
    bestMatch: ratings[bestMatchIndex],
    bestMatchIndex,
  };
}
|
|
71
|
+
|
|
72
|
+
/**
 * Checks that the arguments are a string and a non-empty array of strings.
 *
 * @param {*} mainString - Value expected to be a string.
 * @param {*} targetStrings - Value expected to be a non-empty array of strings.
 * @returns {boolean} `true` only when `mainString` is a string and
 *   `targetStrings` is a non-empty array whose every slot holds a string.
 */
function areArgsValid(mainString, targetStrings) {
  if (typeof mainString !== "string") return false;
  if (!Array.isArray(targetStrings)) return false;
  if (targetStrings.length === 0) return false;

  // Scan by index and return a real boolean. The previous `find`-based check
  // returned the offending element itself, so falsy non-strings (null,
  // undefined, 0, false, NaN) were mistaken for "nothing found". An index
  // loop also visits holes in sparse arrays, which read as undefined.
  for (let i = 0; i < targetStrings.length; i++) {
    if (typeof targetStrings[i] !== "string") return false;
  }
  return true;
}
|