entity-predictor 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -17
- package/index.d.ts +26 -0
- package/package.json +3 -2
- package/src/predictor.js +108 -59
package/README.md
CHANGED
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
# Entity Predictor
|
|
2
2
|
|
|
3
|
-
A lightweight, zero-dependency (almost) Node.js library for entity name prediction and normalization.
|
|
3
|
+
A lightweight, zero-dependency (almost) Node.js library for entity name prediction and normalization.
|
|
4
|
+
|
|
5
|
+
It uses **fuzzy matching** to identify entities from messy input, supporting:
|
|
6
|
+
|
|
7
|
+
- **Aliases & Acronyms** (e.g., "SBI" -> "STATE BANK OF INDIA")
|
|
8
|
+
- **Confidence Scoring** ("Trustable", "High Confidence", etc.)
|
|
9
|
+
- **Top-N Matches** (Get the top 3 best guesses)
|
|
10
|
+
- **Configurable Stop Words** (Ignore "The", "Inc", etc.)
|
|
4
11
|
|
|
5
12
|
## Features
|
|
6
13
|
|
|
@@ -77,7 +84,41 @@ Output:
|
|
|
77
84
|
*/
|
|
78
85
|
```
|
|
79
86
|
|
|
80
|
-
### 3.
|
|
87
|
+
### 3. Top-N Matches
|
|
88
|
+
|
|
89
|
+
Get a list of best matches instead of just one.
|
|
90
|
+
|
|
91
|
+
```javascript
|
|
92
|
+
const results = predictor.predictTop("Apple", 3);
|
|
93
|
+
// Returns array of matches: [{ entity: "Apple Inc", ... }, ...]
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### 4. Stop Words Filtering
|
|
97
|
+
|
|
98
|
+
Automatically remove noise words like "The", "Inc", "Ltd". **Disabled by default.**
|
|
99
|
+
|
|
100
|
+
```javascript
|
|
101
|
+
// Enable with default list
|
|
102
|
+
const predictor = new EntityPredictor(entities, { ignoreStopWords: true });
|
|
103
|
+
|
|
104
|
+
// Enable with custom list
|
|
105
|
+
const predictor = new EntityPredictor(entities, {
|
|
106
|
+
ignoreStopWords: true,
|
|
107
|
+
stopWords: ["inc", "co", "corp"],
|
|
108
|
+
});
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### 5. Custom Normalization
|
|
112
|
+
|
|
113
|
+
Pass a custom normalizer to clean data your way.
|
|
114
|
+
|
|
115
|
+
```javascript
|
|
116
|
+
const predictor = new EntityPredictor(entities, {
|
|
117
|
+
normalizer: (text) => text.toUpperCase(),
|
|
118
|
+
});
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 6. Add Entities Dynamically
|
|
81
122
|
|
|
82
123
|
You can add new entities to an existing predictor instance.
|
|
83
124
|
|
|
@@ -87,28 +128,25 @@ predictor.addEntity("PUNJAB NATIONAL BANK", ["PNB"]);
|
|
|
87
128
|
|
|
88
129
|
## API Reference
|
|
89
130
|
|
|
90
|
-
### `new EntityPredictor(entities)`
|
|
131
|
+
### `new EntityPredictor(entities, options)`
|
|
91
132
|
|
|
92
133
|
- `entities`: Array of strings or objects `{ name: string, aliases: string[] }`.
|
|
134
|
+
- `options`: (Optional)
|
|
135
|
+
- `ignoreStopWords`: boolean (default `false`)
|
|
136
|
+
- `stopWords`: string[] (optional, defaults to internal list)
|
|
137
|
+
- `normalizer`: (text: string) => string
|
|
93
138
|
|
|
94
139
|
### `predict(input, threshold)`
|
|
95
140
|
|
|
96
141
|
- `input`: String to search for.
|
|
97
|
-
- `threshold`: (Optional) Minimum confidence score
|
|
142
|
+
- `threshold`: (Optional) Minimum confidence score (default `0.6`).
|
|
143
|
+
- **Returns**: Best match object, `{ entity: "UNKNOWN", ... }` if no match meets the threshold, or `null` if the input is invalid.
|
|
98
144
|
|
|
99
|
-
|
|
145
|
+
### `predictTop(input, limit, threshold)`
|
|
100
146
|
|
|
101
|
-
- `
|
|
102
|
-
-
|
|
103
|
-
- `confidenceLevel`:
|
|
104
|
-
- `"Trustable"` (1.0)
|
|
105
|
-
- `"High Confidence"` (>= 0.8)
|
|
106
|
-
- `"Moderate Confidence"` (>= 0.6)
|
|
107
|
-
- `"Low Confidence"` (< 0.6)
|
|
108
|
-
- Returns `null` if the input is invalid.
|
|
109
|
-
- Returns `{ entity: "UNKNOWN", ... }` if no match meets the threshold.
|
|
147
|
+
- `limit`: Max number of results (default `5`).
|
|
148
|
+
- **Returns**: Array of match objects sorted by descending confidence; empty if the input is invalid or no match meets the threshold.
|
|
110
149
|
|
|
111
|
-
###
|
|
150
|
+
### TypeScript Support
|
|
112
151
|
|
|
113
|
-
|
|
114
|
-
- `aliases`: (Optional) Array of alias strings.
|
|
152
|
+
Includes `index.d.ts` for full TypeScript support.
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
export interface EntityOption {
|
|
2
|
+
name: string;
|
|
3
|
+
aliases?: string[];
|
|
4
|
+
}
|
|
5
|
+
|
|
6
|
+
export interface PredictorOptions {
|
|
7
|
+
ignoreStopWords?: boolean;
|
|
8
|
+
stopWords?: string[];
|
|
9
|
+
normalizer?: (text: string) => string;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
export interface PredictionResult {
|
|
13
|
+
entity: string;
|
|
14
|
+
confidence: number;
|
|
15
|
+
confidenceLevel: "Trustable" | "High Confidence" | "Moderate Confidence" | "Low Confidence";
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export class EntityPredictor {
|
|
19
|
+
constructor(entities: (string | EntityOption)[], options?: PredictorOptions);
|
|
20
|
+
|
|
21
|
+
predict(input: string, threshold?: number): PredictionResult;
|
|
22
|
+
|
|
23
|
+
predictTop(input: string, limit?: number, threshold?: number): PredictionResult[];
|
|
24
|
+
|
|
25
|
+
addEntity(entity: string | EntityOption): void;
|
|
26
|
+
}
|
package/package.json
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "entity-predictor",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "Lightweight entity
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "Lightweight entity prediction with fuzzy matching, aliases, and confidence scoring.",
|
|
5
|
+
"types": "index.d.ts",
|
|
5
6
|
"type": "module",
|
|
6
7
|
"main": "src/index.js",
|
|
7
8
|
"keywords": [
|
package/src/predictor.js
CHANGED
|
@@ -1,45 +1,84 @@
|
|
|
1
1
|
import stringSimilarity from "string-similarity";
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
3
|
+
const DEFAULT_STOP_WORDS = [
|
|
4
|
+
"the",
|
|
5
|
+
"inc",
|
|
6
|
+
"ltd",
|
|
7
|
+
"pvt",
|
|
8
|
+
"corp",
|
|
9
|
+
"corporation",
|
|
10
|
+
"co",
|
|
11
|
+
"company",
|
|
12
|
+
"limited",
|
|
13
|
+
"private",
|
|
14
|
+
"bank",
|
|
15
|
+
];
|
|
16
|
+
|
|
17
|
+
function defaultNormalize(
|
|
18
|
+
text,
|
|
19
|
+
ignoreStopWords = true,
|
|
20
|
+
stopWords = DEFAULT_STOP_WORDS
|
|
21
|
+
) {
|
|
22
|
+
let processed = text.toLowerCase();
|
|
23
|
+
|
|
24
|
+
if (ignoreStopWords) {
|
|
25
|
+
// Remove stop words (must be surrounded by word boundaries or start/end)
|
|
26
|
+
const regex = new RegExp(`\\b(${stopWords.join("|")})\\b`, "g");
|
|
27
|
+
processed = processed.replace(regex, " ");
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
return processed.replace(/[^a-z]/g, "").trim();
|
|
8
31
|
}
|
|
9
32
|
|
|
10
33
|
export class EntityPredictor {
|
|
11
|
-
constructor(entities = []) {
|
|
34
|
+
constructor(entities = [], options = {}) {
|
|
12
35
|
this.entities = [];
|
|
13
36
|
this.searchCandidates = [];
|
|
14
37
|
this.candidateToEntity = [];
|
|
15
38
|
|
|
39
|
+
this.ignoreStopWords = options.ignoreStopWords === true; // Default false
|
|
40
|
+
this.stopWords = options.stopWords || DEFAULT_STOP_WORDS;
|
|
41
|
+
this.customNormalizer = options.normalizer;
|
|
42
|
+
|
|
16
43
|
entities.forEach((item) => {
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
44
|
+
this.addEntity(item, true); // true = internal call, delay re-indexing if needed (not needed here)
|
|
45
|
+
});
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
normalize(text) {
|
|
49
|
+
if (this.customNormalizer) {
|
|
50
|
+
return this.customNormalizer(text);
|
|
51
|
+
}
|
|
52
|
+
return defaultNormalize(text, this.ignoreStopWords, this.stopWords);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
addEntity(item, isInternal = false) {
|
|
56
|
+
let entityName;
|
|
57
|
+
let aliases = [];
|
|
58
|
+
|
|
59
|
+
if (typeof item === "string") {
|
|
60
|
+
entityName = item;
|
|
61
|
+
} else if (typeof item === "object" && item.name) {
|
|
62
|
+
entityName = item.name;
|
|
63
|
+
if (Array.isArray(item.aliases)) {
|
|
64
|
+
aliases = item.aliases;
|
|
29
65
|
}
|
|
66
|
+
} else {
|
|
67
|
+
return;
|
|
68
|
+
}
|
|
30
69
|
|
|
70
|
+
if (!this.entities.includes(entityName)) {
|
|
31
71
|
this.entities.push(entityName);
|
|
72
|
+
}
|
|
32
73
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
this.candidateToEntity.push(entityName);
|
|
74
|
+
// Add canonical
|
|
75
|
+
this.searchCandidates.push(this.normalize(entityName));
|
|
76
|
+
this.candidateToEntity.push(entityName);
|
|
37
77
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
});
|
|
78
|
+
// Add aliases
|
|
79
|
+
aliases.forEach((alias) => {
|
|
80
|
+
this.searchCandidates.push(this.normalize(alias));
|
|
81
|
+
this.candidateToEntity.push(entityName);
|
|
43
82
|
});
|
|
44
83
|
}
|
|
45
84
|
|
|
@@ -47,47 +86,57 @@ export class EntityPredictor {
|
|
|
47
86
|
if (!input || typeof input !== "string") {
|
|
48
87
|
return null;
|
|
49
88
|
}
|
|
89
|
+
const results = this.predictTop(input, 1, threshold);
|
|
90
|
+
if (results.length > 0) {
|
|
91
|
+
return results[0];
|
|
92
|
+
}
|
|
93
|
+
return {
|
|
94
|
+
entity: "UNKNOWN",
|
|
95
|
+
confidence: 0,
|
|
96
|
+
confidenceLevel: "Low Confidence",
|
|
97
|
+
};
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
predictTop(input, limit = 5, threshold = 0.6) {
|
|
101
|
+
if (!input || typeof input !== "string") {
|
|
102
|
+
return [];
|
|
103
|
+
}
|
|
50
104
|
|
|
51
|
-
const
|
|
52
|
-
|
|
105
|
+
const normalizedInput = this.normalize(input);
|
|
106
|
+
const matches = stringSimilarity.findBestMatch(
|
|
107
|
+
normalizedInput,
|
|
53
108
|
this.searchCandidates
|
|
54
109
|
);
|
|
55
110
|
|
|
56
|
-
|
|
57
|
-
|
|
111
|
+
// Map all ratings to our format and sort
|
|
112
|
+
const sortedMatches = matches.ratings
|
|
113
|
+
.map((rating, index) => ({
|
|
114
|
+
entity: this.candidateToEntity[index],
|
|
115
|
+
confidence: rating.rating,
|
|
116
|
+
confidenceLevel: this._getConfidenceLevel(rating.rating),
|
|
117
|
+
}))
|
|
118
|
+
.filter((m) => m.confidence >= threshold)
|
|
119
|
+
.sort((a, b) => b.confidence - a.confidence);
|
|
58
120
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
confidenceLevel = "High Confidence";
|
|
63
|
-
} else if (rating >= 0.6) {
|
|
64
|
-
confidenceLevel = "Moderate Confidence";
|
|
65
|
-
}
|
|
121
|
+
// Deduplicate entities (picking the highest score for each unique entity)
|
|
122
|
+
const uniqueMatches = [];
|
|
123
|
+
const seenEntities = new Set();
|
|
66
124
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
}
|
|
125
|
+
for (const match of sortedMatches) {
|
|
126
|
+
if (!seenEntities.has(match.entity)) {
|
|
127
|
+
uniqueMatches.push(match);
|
|
128
|
+
seenEntities.add(match.entity);
|
|
129
|
+
if (uniqueMatches.length >= limit) break;
|
|
130
|
+
}
|
|
73
131
|
}
|
|
74
132
|
|
|
75
|
-
return
|
|
76
|
-
entity: "UNKNOWN",
|
|
77
|
-
confidence: rating,
|
|
78
|
-
confidenceLevel,
|
|
79
|
-
};
|
|
133
|
+
return uniqueMatches;
|
|
80
134
|
}
|
|
81
135
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
aliases.forEach((alias) => {
|
|
89
|
-
this.searchCandidates.push(normalize(alias));
|
|
90
|
-
this.candidateToEntity.push(entity);
|
|
91
|
-
});
|
|
136
|
+
_getConfidenceLevel(rating) {
|
|
137
|
+
if (rating === 1) return "Trustable";
|
|
138
|
+
if (rating >= 0.8) return "High Confidence";
|
|
139
|
+
if (rating >= 0.6) return "Moderate Confidence";
|
|
140
|
+
return "Low Confidence";
|
|
92
141
|
}
|
|
93
142
|
}
|