entity-predictor 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,13 @@
1
1
  # Entity Predictor
2
2
 
3
- A lightweight, zero-dependency (almost) Node.js library for entity name prediction and normalization. It uses fuzzy matching to identify entities from messy input, supporting aliases, acronyms, and common typos.
3
+ A lightweight, **zero-dependency** Node.js library for entity name prediction and normalization.
4
+
5
+ It uses **fuzzy matching** to identify entities from messy input, supporting:
6
+
7
+ - **Aliases & Acronyms** (e.g., "SBI" -> "STATE BANK OF INDIA")
8
+ - **Confidence Scoring** ("Trustable", "High Confidence", etc.)
9
+ - **Top-N Matches** (Get the top 3 best guesses)
10
+ - **Configurable Stop Words** (Ignore "The", "Inc", etc.)
4
11
 
5
12
  ## Features
6
13
 
@@ -77,7 +84,41 @@ Output:
77
84
  */
78
85
  ```
79
86
 
80
- ### 3. Add Entities Dynamically
87
+ ### 3. Top-N Matches
88
+
89
+ Get a list of best matches instead of just one.
90
+
91
+ ```javascript
92
+ const results = predictor.predictTop("Apple", 3);
93
+ // Returns array of matches: [{ entity: "Apple Inc", ... }, ...]
94
+ ```
95
+
96
+ ### 4. Stop Words Filtering
97
+
98
+ Automatically remove noise words like "The", "Inc", "Ltd". **Disabled by default.**
99
+
100
+ ```javascript
101
+ // Enable with default list
102
+ const predictor = new EntityPredictor(entities, { ignoreStopWords: true });
103
+
104
+ // Enable with custom list
105
+ const customPredictor = new EntityPredictor(entities, {
106
+ ignoreStopWords: true,
107
+ stopWords: ["inc", "co", "corp"],
108
+ });
109
+ ```
110
+
111
+ ### 5. Custom Normalization
112
+
113
+ Pass a custom normalizer to clean data your way.
114
+
115
+ ```javascript
116
+ const predictor = new EntityPredictor(entities, {
117
+ normalizer: (text) => text.toUpperCase(),
118
+ });
119
+ ```
120
+
121
+ ### 6. Add Entities Dynamically
81
122
 
82
123
  You can add new entities to an existing predictor instance.
83
124
 
@@ -87,28 +128,26 @@ predictor.addEntity("PUNJAB NATIONAL BANK", ["PNB"]);
87
128
 
88
129
  ## API Reference
89
130
 
90
- ### `new EntityPredictor(entities)`
131
+ ### `new EntityPredictor(entities, options)`
91
132
 
92
133
  - `entities`: Array of strings or objects `{ name: string, aliases: string[] }`.
134
+ - `options`: (Optional)
135
+ - `ignoreStopWords`: boolean (default `false`)
136
+ - `stopWords`: string[] (optional, defaults to internal list)
137
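+ - `normalizer`: (text: string) => string (optional). When provided, it replaces the built-in normalization entirely, so `ignoreStopWords` and `stopWords` are not applied.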
138
+ - **Throws**: `TypeError` if `entities` is not an array.
93
139
 
94
140
  ### `predict(input, threshold)`
95
141
 
96
142
  - `input`: String to search for.
97
- - `threshold`: (Optional) Minimum confidence score to return a match. Default is `0.6`.
143
+ - `threshold`: (Optional) Minimum confidence score (default `0.6`).
144
+ - **Returns**: Best match object, `{ entity: "UNKNOWN", ... }` if no match meets the threshold, or `null` if `input` is not a non-empty string.
98
145
 
99
- **Returns:**
146
+ ### `predictTop(input, limit, threshold)`
100
147
 
101
- - `entity`: The canonical name of the matched entity.
102
- - `confidence`: Score between 0 and 1.
103
- - `confidenceLevel`:
104
- - `"Trustable"` (1.0)
105
- - `"High Confidence"` (>= 0.8)
106
- - `"Moderate Confidence"` (>= 0.6)
107
- - `"Low Confidence"` (< 0.6)
108
- - Returns `null` if the input is invalid.
109
- - Returns `{ entity: "UNKNOWN", ... }` if no match meets the threshold.
148
+ - `limit`: Max number of results (default `5`); `input` and `threshold` are the same as for `predict`.
149
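+ - **Returns**: Array of match objects sorted by descending confidence, with at most one entry per entity; empty if nothing meets the threshold or `input` is not a non-empty string.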
110
150
 
111
- ### `addEntity(name, aliases)`
151
+ ### TypeScript Support
112
152
 
113
- - `name`: Canonical name of the entity.
114
- - `aliases`: (Optional) Array of alias strings.
153
+ Includes `index.d.ts` for full TypeScript support.
package/index.d.ts ADDED
@@ -0,0 +1,26 @@
1
/** An entity described by its canonical name plus optional alternative spellings. */
export interface EntityOption {
  /** Canonical name; this is the value reported as `entity` in results. */
  name: string;
  /** Aliases or acronyms that should resolve to `name`. Only read, never mutated. */
  aliases?: readonly string[];
}

/** Options accepted by the `EntityPredictor` constructor. */
export interface PredictorOptions {
  /** Strip stop words before matching. Treated as `false` unless exactly `true`. */
  ignoreStopWords?: boolean;
  /** Stop words to strip when `ignoreStopWords` is enabled; defaults to the built-in list. */
  stopWords?: readonly string[];
  /** Custom normalizer. When provided it replaces the built-in normalization entirely. */
  normalizer?: (text: string) => string;
}

/**
 * Label derived from the numeric confidence score:
 * `1` → "Trustable", `>= 0.8` → "High Confidence",
 * `>= 0.6` → "Moderate Confidence", otherwise "Low Confidence".
 */
export type ConfidenceLevel =
  | "Trustable"
  | "High Confidence"
  | "Moderate Confidence"
  | "Low Confidence";

/** A single match returned by `predict` / `predictTop`. */
export interface PredictionResult {
  /** Canonical entity name, or `"UNKNOWN"` when `predict` finds nothing above the threshold. */
  entity: string;
  /** Similarity score between 0 and 1. */
  confidence: number;
  /** Human-readable bucket for `confidence`. */
  confidenceLevel: ConfidenceLevel;
}
17
+
18
/** Fuzzy matcher that maps messy input strings onto a known list of entities. */
export class EntityPredictor {
  /**
   * @param entities - Entity names, or objects carrying a name and aliases.
   *   Optional (the implementation defaults it to an empty list) and only
   *   iterated, so readonly arrays are accepted.
   * @param options - Stop-word and normalization settings.
   * @throws TypeError if `entities` is provided but is not an array.
   */
  constructor(entities?: readonly (string | EntityOption)[], options?: PredictorOptions);

  /**
   * Returns the single best match for `input`.
   *
   * @param input - Text to resolve.
   * @param threshold - Minimum confidence for a match (default `0.6`).
   * @returns The best match, an `"UNKNOWN"` result when nothing meets the
   *   threshold, or `null` when `input` is not a non-empty string.
   */
  predict(input: string, threshold?: number): PredictionResult | null;

  /**
   * Returns up to `limit` matches, best first, with at most one entry per entity.
   *
   * @param input - Text to resolve.
   * @param limit - Maximum number of results (default `5`).
   * @param threshold - Minimum confidence for a match (default `0.6`).
   * @returns Matches sorted by descending confidence; empty when `input` is
   *   not a non-empty string or nothing meets the threshold.
   */
  predictTop(input: string, limit?: number, threshold?: number): PredictionResult[];

  /**
   * Registers another entity on an existing predictor. Values that are neither
   * a string nor an object with a `name` are ignored.
   *
   * @param entity - Entity name, or an object carrying a name and aliases.
   */
  addEntity(entity: string | EntityOption): void;
}
package/package.json CHANGED
@@ -1,7 +1,8 @@
1
1
  {
2
2
  "name": "entity-predictor",
3
- "version": "1.0.0",
4
- "description": "Lightweight entity name prediction and normalization library",
3
+ "version": "1.2.0",
4
+ "description": "Lightweight entity prediction with fuzzy matching, aliases, and confidence scoring.",
5
+ "types": "index.d.ts",
5
6
  "type": "module",
6
7
  "main": "src/index.js",
7
8
  "keywords": [
@@ -12,8 +13,5 @@
12
13
  ],
13
14
  "author": "Sahil",
14
15
  "email": "dev.sahilkumar02@gmail.com",
15
- "license": "MIT",
16
- "dependencies": {
17
- "string-similarity": "^4.0.4"
18
- }
16
+ "license": "MIT"
19
17
  }
package/src/predictor.js CHANGED
@@ -1,93 +1,179 @@
1
- import stringSimilarity from "string-similarity";
1
+ import { findBestMatch } from "./string-similarity.js";
2
2
 
3
- function normalize(text) {
4
- return text
5
- .toLowerCase()
6
- .replace(/[^a-z]/g, "")
7
- .trim();
3
/** Built-in noise words removed from names when stop-word filtering is enabled. */
const DEFAULT_STOP_WORDS = [
  "the",
  "inc",
  "ltd",
  "pvt",
  "corp",
  "corporation",
  "co",
  "company",
  "limited",
  "private",
  "bank",
];

/**
 * Built-in normalizer: lowercases the text, optionally removes stop words,
 * then keeps only the letters a-z.
 *
 * Note: `ignoreStopWords` defaults to `true` here, but EntityPredictor always
 * passes its own setting explicitly (which is `false` unless enabled).
 *
 * @param {string} text - Raw text to normalize.
 * @param {boolean} [ignoreStopWords=true] - Whether to strip stop words.
 * @param {string[]} [stopWords=DEFAULT_STOP_WORDS] - Words to strip; matched
 *   case-insensitively as whole words. Non-string and blank entries are ignored.
 * @returns {string} The normalized, letters-only string.
 */
function defaultNormalize(
  text,
  ignoreStopWords = true,
  stopWords = DEFAULT_STOP_WORDS
) {
  const lowered = text.toLowerCase();
  const lettersOnly = (value) => value.replace(/[^a-z]/g, "");

  if (!ignoreStopWords) {
    return lettersOnly(lowered);
  }

  // The text is lowercased, so stop words must be too; regex metacharacters
  // are escaped so a word such as "a.b" or "(" is matched literally.
  const escapedWords = stopWords
    .filter((word) => typeof word === "string" && word.trim() !== "")
    .map((word) =>
      word.trim().toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
    );

  if (escapedWords.length === 0) {
    return lettersOnly(lowered);
  }

  const stopWordPattern = new RegExp(`\\b(?:${escapedWords.join("|")})\\b`, "g");
  const stripped = lettersOnly(lowered.replace(stopWordPattern, " "));

  // A name made only of stop words (e.g. "The Bank") would collapse to "",
  // and two empty strings compare as identical. Fall back to the unstripped
  // form so such names stay distinguishable.
  return stripped !== "" ? stripped : lettersOnly(lowered);
}
9
32
 
33
+ /**
34
+ * EntityPredictor class for fuzzy matching entities.
35
+ */
10
36
  export class EntityPredictor {
11
- constructor(entities = []) {
37
+ /**
38
+ * Creates an instance of EntityPredictor.
39
+ * @param {Array<string | {name: string, aliases: string[]}>} entities - List of entities.
40
+ * @param {Object} [options] - Configuration options.
41
+ * @param {boolean} [options.ignoreStopWords=false] - Whether to ignore stop words.
42
+ * @param {string[]} [options.stopWords] - Custom list of stop words.
43
+ * @param {function(string): string} [options.normalizer] - Custom normalizer function.
44
+ * @throws {TypeError} If entities is not an array.
45
+ */
46
+ constructor(entities = [], options = {}) {
47
+ if (!Array.isArray(entities)) {
48
+ throw new TypeError("Entities must be an array.");
49
+ }
50
+
12
51
  this.entities = [];
13
52
  this.searchCandidates = [];
14
53
  this.candidateToEntity = [];
15
54
 
55
+ this.ignoreStopWords = options.ignoreStopWords === true; // Default false
56
+ this.stopWords = options.stopWords || DEFAULT_STOP_WORDS;
57
+ this.customNormalizer = options.normalizer;
58
+
16
59
  entities.forEach((item) => {
17
- let entityName;
18
- let aliases = [];
19
-
20
- if (typeof item === "string") {
21
- entityName = item;
22
- } else if (typeof item === "object" && item.name) {
23
- entityName = item.name;
24
- if (Array.isArray(item.aliases)) {
25
- aliases = item.aliases;
26
- }
27
- } else {
28
- return; // Skip invalid entries
60
+ this.addEntity(item, true); // true = internal call
61
+ });
62
+ }
63
+
64
+ /**
65
+ * Normalizes text for comparison.
66
+ * @param {string} text - The text to normalize.
67
+ * @returns {string} Normalized text.
68
+ */
69
+ normalize(text) {
70
+ if (this.customNormalizer) {
71
+ return this.customNormalizer(text);
72
+ }
73
+ return defaultNormalize(text, this.ignoreStopWords, this.stopWords);
74
+ }
75
+
76
+ /**
77
+ * Adds an entity to the predictor.
78
+ * @param {string | {name: string, aliases: string[]}} item - The entity to add.
79
+ * @param {boolean} [isInternal=false] - Internal flag.
80
+ */
81
+ addEntity(item, isInternal = false) {
82
+ let entityName;
83
+ let aliases = [];
84
+
85
+ if (typeof item === "string") {
86
+ entityName = item;
87
+ } else if (typeof item === "object" && item.name) {
88
+ entityName = item.name;
89
+ if (Array.isArray(item.aliases)) {
90
+ aliases = item.aliases;
29
91
  }
92
+ } else {
93
+ // Invalid entity format, skip silently or could warn in future
94
+ return;
95
+ }
30
96
 
97
+ if (!this.entities.includes(entityName)) {
31
98
  this.entities.push(entityName);
99
+ }
32
100
 
33
- // Add canonical name to search candidates
34
- const normalizedName = normalize(entityName);
35
- this.searchCandidates.push(normalizedName);
36
- this.candidateToEntity.push(entityName);
101
+ // Add canonical
102
+ this.searchCandidates.push(this.normalize(entityName));
103
+ this.candidateToEntity.push(entityName);
37
104
 
38
- // Add aliases to search candidates
39
- aliases.forEach((alias) => {
40
- this.searchCandidates.push(normalize(alias));
41
- this.candidateToEntity.push(entityName);
42
- });
105
+ // Add aliases
106
+ aliases.forEach((alias) => {
107
+ this.searchCandidates.push(this.normalize(alias));
108
+ this.candidateToEntity.push(entityName);
43
109
  });
44
110
  }
45
111
 
112
+ /**
113
+ * Predicts the best match for the input.
114
+ * @param {string} input - The input string.
115
+ * @param {number} [threshold=0.6] - The confidence threshold.
116
+ * @returns {{entity: string, confidence: number, confidenceLevel: string} | null} The best match or null/UNKNOWN.
117
+ */
46
118
  predict(input, threshold = 0.6) {
47
119
  if (!input || typeof input !== "string") {
48
120
  return null;
49
121
  }
122
+ const results = this.predictTop(input, 1, threshold);
123
+ if (results.length > 0) {
124
+ return results[0];
125
+ }
126
+ return {
127
+ entity: "UNKNOWN",
128
+ confidence: 0,
129
+ confidenceLevel: "Low Confidence",
130
+ };
131
+ }
50
132
 
51
- const match = stringSimilarity.findBestMatch(
52
- normalize(input),
53
- this.searchCandidates
54
- );
133
+ /**
134
+ * Predicts the top N best matches.
135
+ * @param {string} input - The input string.
136
+ * @param {number} [limit=5] - The number of results to return.
137
+ * @param {number} [threshold=0.6] - The confidence threshold.
138
+ * @returns {Array<{entity: string, confidence: number, confidenceLevel: string}>} Array of matches.
139
+ */
140
+ predictTop(input, limit = 5, threshold = 0.6) {
141
+ if (!input || typeof input !== "string") {
142
+ return [];
143
+ }
55
144
 
56
- const rating = match.bestMatch.rating;
57
- let confidenceLevel = "Low Confidence";
145
+ const normalizedInput = this.normalize(input);
146
+ const matches = findBestMatch(normalizedInput, this.searchCandidates);
58
147
 
59
- if (rating === 1) {
60
- confidenceLevel = "Trustable";
61
- } else if (rating >= 0.8) {
62
- confidenceLevel = "High Confidence";
63
- } else if (rating >= 0.6) {
64
- confidenceLevel = "Moderate Confidence";
65
- }
148
+ // Map all ratings to our format and sort
149
+ const sortedMatches = matches.ratings
150
+ .map((rating, index) => ({
151
+ entity: this.candidateToEntity[index],
152
+ confidence: rating.rating,
153
+ confidenceLevel: this._getConfidenceLevel(rating.rating),
154
+ }))
155
+ .filter((m) => m.confidence >= threshold)
156
+ .sort((a, b) => b.confidence - a.confidence);
66
157
 
67
- if (rating >= threshold) {
68
- return {
69
- entity: this.candidateToEntity[match.bestMatchIndex],
70
- confidence: rating,
71
- confidenceLevel,
72
- };
158
+ // Deduplicate entities (picking the highest score for each unique entity)
159
+ const uniqueMatches = [];
160
+ const seenEntities = new Set();
161
+
162
+ for (const match of sortedMatches) {
163
+ if (!seenEntities.has(match.entity)) {
164
+ uniqueMatches.push(match);
165
+ seenEntities.add(match.entity);
166
+ if (uniqueMatches.length >= limit) break;
167
+ }
73
168
  }
74
169
 
75
- return {
76
- entity: "UNKNOWN",
77
- confidence: rating,
78
- confidenceLevel,
79
- };
170
+ return uniqueMatches;
80
171
  }
81
172
 
82
- addEntity(entity, aliases = []) {
83
- this.entities.push(entity);
84
- const normalizedName = normalize(entity);
85
- this.searchCandidates.push(normalizedName);
86
- this.candidateToEntity.push(entity);
87
-
88
- aliases.forEach((alias) => {
89
- this.searchCandidates.push(normalize(alias));
90
- this.candidateToEntity.push(entity);
91
- });
173
+ _getConfidenceLevel(rating) {
174
+ if (rating === 1) return "Trustable";
175
+ if (rating >= 0.8) return "High Confidence";
176
+ if (rating >= 0.6) return "Moderate Confidence";
177
+ return "Low Confidence";
92
178
  }
93
179
  }
@@ -0,0 +1,83 @@
1
/**
 * Scores how alike two strings are using the Dice coefficient over their
 * character bigrams. Whitespace is ignored in both inputs.
 *
 * @param {string} first - One string to compare.
 * @param {string} second - The other string to compare.
 * @returns {number} A value from 0 (no shared bigrams) to 1 (identical).
 */
export function compareTwoStrings(first, second) {
  const left = first.replace(/\s+/g, "");
  const right = second.replace(/\s+/g, "");

  // Identical strings (including two empty ones) are a perfect match.
  if (left === right) return 1;
  // A string shorter than two characters has no bigrams to share.
  if (left.length < 2 || right.length < 2) return 0;

  // Multiset of bigrams in `left`: bigram -> occurrences not yet matched.
  const unmatched = new Map();
  for (let i = 0; i + 1 < left.length; i++) {
    const pair = left.slice(i, i + 2);
    unmatched.set(pair, (unmatched.get(pair) || 0) + 1);
  }

  // Each bigram of `right` consumes at most one occurrence from `left`.
  let shared = 0;
  for (let j = 0; j + 1 < right.length; j++) {
    const pair = right.slice(j, j + 2);
    const available = unmatched.get(pair) || 0;
    if (available > 0) {
      unmatched.set(pair, available - 1);
      shared += 1;
    }
  }

  // A string of length n has n - 1 bigrams.
  const totalBigrams = left.length + right.length - 2;
  return (2.0 * shared) / totalBigrams;
}
36
+
37
/**
 * Rates every target string against the main string and reports the
 * highest-rated one. When several targets tie, the earliest one wins.
 *
 * @param {string} mainString - The string to match.
 * @param {string[]} targetStrings - The candidate strings to rate.
 * @returns {{ ratings: Array<{target: string, rating: number}>, bestMatch: {target: string, rating: number}, bestMatchIndex: number }}
 *   All ratings in input order, plus the best entry and its index.
 * @throws {TypeError} If arguments are invalid.
 */
export function findBestMatch(mainString, targetStrings) {
  if (!areArgsValid(mainString, targetStrings)) {
    throw new TypeError(
      "Bad arguments: First argument should be a string, second should be an array of strings"
    );
  }

  const ratings = [];
  let bestMatchIndex = 0;

  for (const [position, target] of targetStrings.entries()) {
    const rating = compareTwoStrings(mainString, target);
    ratings.push({ target, rating });
    // Strict comparison keeps the first of any equally rated targets.
    if (rating > ratings[bestMatchIndex].rating) {
      bestMatchIndex = position;
    }
  }

  return {
    ratings,
    bestMatch: ratings[bestMatchIndex],
    bestMatchIndex,
  };
}
71
+
72
/**
 * Checks the argument contract of findBestMatch: a string plus a non-empty
 * array containing only strings.
 *
 * @param {*} mainString - Value expected to be a string.
 * @param {*} targetStrings - Value expected to be a non-empty array of strings.
 * @returns {boolean} True when both arguments are usable.
 */
function areArgsValid(mainString, targetStrings) {
  if (typeof mainString !== "string") return false;
  if (!Array.isArray(targetStrings)) return false;
  if (targetStrings.length === 0) return false;

  // Test each element's type directly rather than relying on the truthiness
  // of a found element: falsy non-strings (null, undefined, 0, false) must be
  // rejected too. for...of also visits holes in sparse arrays as undefined.
  for (const candidate of targetStrings) {
    if (typeof candidate !== "string") return false;
  }
  return true;
}