entity-predictor 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,13 @@
1
1
  # Entity Predictor
2
2
 
3
- A lightweight, zero-dependency (almost) Node.js library for entity name prediction and normalization. It uses fuzzy matching to identify entities from messy input, supporting aliases, acronyms, and common typos.
3
+ A lightweight, zero-dependency (almost) Node.js library for entity name prediction and normalization.
4
+
5
+ It uses **fuzzy matching** to identify entities from messy input, supporting:
6
+
7
+ - **Aliases & Acronyms** (e.g., "SBI" -> "STATE BANK OF INDIA")
8
+ - **Confidence Scoring** ("Trustable", "High Confidence", etc.)
9
+ - **Top-N Matches** (Get the top 3 best guesses)
10
+ - **Configurable Stop Words** (Ignore "The", "Inc", etc.)
4
11
 
5
12
  ## Features
6
13
 
@@ -77,7 +84,41 @@ Output:
77
84
  */
78
85
  ```
79
86
 
80
- ### 3. Add Entities Dynamically
87
+ ### 3. Top-N Matches
88
+
89
+ Get a list of best matches instead of just one.
90
+
91
+ ```javascript
92
+ const results = predictor.predictTop("Apple", 3);
93
+ // Returns array of matches: [{ entity: "Apple Inc", ... }, ...]
94
+ ```
95
+
96
+ ### 4. Stop Words Filtering
97
+
98
+ Automatically remove noise words like "The", "Inc", "Ltd". **Disabled by default.**
99
+
100
+ ```javascript
101
+ // Enable with default list
102
+ const predictor = new EntityPredictor(entities, { ignoreStopWords: true });
103
+
104
+ // Enable with custom list
105
+ const predictor = new EntityPredictor(entities, {
106
+ ignoreStopWords: true,
107
+ stopWords: ["inc", "co", "corp"],
108
+ });
109
+ ```
110
+
111
+ ### 5. Custom Normalization
112
+
113
+ Pass a custom normalizer to clean data your way.
114
+
115
+ ```javascript
116
+ const predictor = new EntityPredictor(entities, {
117
+ normalizer: (text) => text.toUpperCase(),
118
+ });
119
+ ```
120
+
121
+ ### 6. Add Entities Dynamically
81
122
 
82
123
  You can add new entities to an existing predictor instance.
83
124
 
@@ -87,28 +128,25 @@ predictor.addEntity("PUNJAB NATIONAL BANK", ["PNB"]);
87
128
 
88
129
  ## API Reference
89
130
 
90
- ### `new EntityPredictor(entities)`
131
+ ### `new EntityPredictor(entities, options)`
91
132
 
92
133
  - `entities`: Array of strings or objects `{ name: string, aliases: string[] }`.
134
+ - `options`: (Optional)
135
+ - `ignoreStopWords`: boolean (default `false`)
136
+ - `stopWords`: string[] (optional, defaults to internal list)
137
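+ - `normalizer`: (text: string) => string (optional). Replaces the built-in normalization entirely; when provided, `ignoreStopWords` and `stopWords` are not applied.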
93
138
 
94
139
  ### `predict(input, threshold)`
95
140
 
96
141
  - `input`: String to search for.
97
- - `threshold`: (Optional) Minimum confidence score to return a match. Default is `0.6`.
142
+ - `threshold`: (Optional) Minimum confidence score (default `0.6`).
143
+ - **Returns**: Best match object, `{ entity: "UNKNOWN", ... }` if no match meets the threshold, or `null` if `input` is empty or not a string.
98
144
 
99
- **Returns:**
145
+ ### `predictTop(input, limit, threshold)`
100
146
 
101
- - `entity`: The canonical name of the matched entity.
102
- - `confidence`: Score between 0 and 1.
103
- - `confidenceLevel`:
104
- - `"Trustable"` (1.0)
105
- - `"High Confidence"` (>= 0.8)
106
- - `"Moderate Confidence"` (>= 0.6)
107
- - `"Low Confidence"` (< 0.6)
108
- - Returns `null` if the input is invalid.
109
- - Returns `{ entity: "UNKNOWN", ... }` if no match meets the threshold.
147
+ - `limit`: Max number of results (default `5`).
148
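+ - **Returns**: Array of match objects sorted by descending confidence, with at most one entry per entity; an empty array if `input` is empty or not a string, or if no match meets the threshold.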
110
149
 
111
- ### `addEntity(name, aliases)`
150
+ ### TypeScript Support
112
151
 
113
- - `name`: Canonical name of the entity.
114
- - `aliases`: (Optional) Array of alias strings.
152
+ Includes `index.d.ts` for full TypeScript support.
package/index.d.ts ADDED
@@ -0,0 +1,26 @@
1
+ export interface EntityOption {
2
+ name: string;
3
+ aliases?: string[];
4
+ }
5
+
6
+ export interface PredictorOptions {
7
+ ignoreStopWords?: boolean;
8
+ stopWords?: string[];
9
+ normalizer?: (text: string) => string;
10
+ }
11
+
12
+ export interface PredictionResult {
13
+ entity: string;
14
+ confidence: number;
15
+ confidenceLevel: "Trustable" | "High Confidence" | "Moderate Confidence" | "Low Confidence";
16
+ }
17
+
18
+ export class EntityPredictor {
19
+ constructor(entities: (string | EntityOption)[], options?: PredictorOptions);
20
+
21
+ predict(input: string, threshold?: number): PredictionResult;
22
+
23
+ predictTop(input: string, limit?: number, threshold?: number): PredictionResult[];
24
+
25
+ addEntity(entity: string | EntityOption): void;
26
+ }
package/package.json CHANGED
@@ -1,7 +1,8 @@
1
1
  {
2
2
  "name": "entity-predictor",
3
- "version": "1.0.0",
4
- "description": "Lightweight entity name prediction and normalization library",
3
+ "version": "1.1.0",
4
+ "description": "Lightweight entity prediction with fuzzy matching, aliases, and confidence scoring.",
5
+ "types": "index.d.ts",
5
6
  "type": "module",
6
7
  "main": "src/index.js",
7
8
  "keywords": [
package/src/predictor.js CHANGED
@@ -1,45 +1,84 @@
1
1
  import stringSimilarity from "string-similarity";
2
2
 
3
- function normalize(text) {
4
- return text
5
- .toLowerCase()
6
- .replace(/[^a-z]/g, "")
7
- .trim();
3
+ const DEFAULT_STOP_WORDS = [
4
+ "the",
5
+ "inc",
6
+ "ltd",
7
+ "pvt",
8
+ "corp",
9
+ "corporation",
10
+ "co",
11
+ "company",
12
+ "limited",
13
+ "private",
14
+ "bank",
15
+ ];
16
+
17
+ function defaultNormalize(
18
+ text,
19
+ ignoreStopWords = true,
20
+ stopWords = DEFAULT_STOP_WORDS
21
+ ) {
22
+ let processed = text.toLowerCase();
23
+
24
+ if (ignoreStopWords) {
25
+ // Remove stop words (must be surrounded by word boundaries or start/end)
26
+ const regex = new RegExp(`\\b(${stopWords.join("|")})\\b`, "g");
27
+ processed = processed.replace(regex, " ");
28
+ }
29
+
30
+ return processed.replace(/[^a-z]/g, "").trim();
8
31
  }
9
32
 
10
33
  export class EntityPredictor {
11
- constructor(entities = []) {
34
+ constructor(entities = [], options = {}) {
12
35
  this.entities = [];
13
36
  this.searchCandidates = [];
14
37
  this.candidateToEntity = [];
15
38
 
39
+ this.ignoreStopWords = options.ignoreStopWords === true; // Default false
40
+ this.stopWords = options.stopWords || DEFAULT_STOP_WORDS;
41
+ this.customNormalizer = options.normalizer;
42
+
16
43
  entities.forEach((item) => {
17
- let entityName;
18
- let aliases = [];
19
-
20
- if (typeof item === "string") {
21
- entityName = item;
22
- } else if (typeof item === "object" && item.name) {
23
- entityName = item.name;
24
- if (Array.isArray(item.aliases)) {
25
- aliases = item.aliases;
26
- }
27
- } else {
28
- return; // Skip invalid entries
44
+ this.addEntity(item, true); // true = internal call, delay re-indexing if needed (not needed here)
45
+ });
46
+ }
47
+
48
+ normalize(text) {
49
+ if (this.customNormalizer) {
50
+ return this.customNormalizer(text);
51
+ }
52
+ return defaultNormalize(text, this.ignoreStopWords, this.stopWords);
53
+ }
54
+
55
+ addEntity(item, isInternal = false) {
56
+ let entityName;
57
+ let aliases = [];
58
+
59
+ if (typeof item === "string") {
60
+ entityName = item;
61
+ } else if (typeof item === "object" && item.name) {
62
+ entityName = item.name;
63
+ if (Array.isArray(item.aliases)) {
64
+ aliases = item.aliases;
29
65
  }
66
+ } else {
67
+ return;
68
+ }
30
69
 
70
+ if (!this.entities.includes(entityName)) {
31
71
  this.entities.push(entityName);
72
+ }
32
73
 
33
- // Add canonical name to search candidates
34
- const normalizedName = normalize(entityName);
35
- this.searchCandidates.push(normalizedName);
36
- this.candidateToEntity.push(entityName);
74
+ // Add canonical
75
+ this.searchCandidates.push(this.normalize(entityName));
76
+ this.candidateToEntity.push(entityName);
37
77
 
38
- // Add aliases to search candidates
39
- aliases.forEach((alias) => {
40
- this.searchCandidates.push(normalize(alias));
41
- this.candidateToEntity.push(entityName);
42
- });
78
+ // Add aliases
79
+ aliases.forEach((alias) => {
80
+ this.searchCandidates.push(this.normalize(alias));
81
+ this.candidateToEntity.push(entityName);
43
82
  });
44
83
  }
45
84
 
@@ -47,47 +86,57 @@ export class EntityPredictor {
47
86
  if (!input || typeof input !== "string") {
48
87
  return null;
49
88
  }
89
+ const results = this.predictTop(input, 1, threshold);
90
+ if (results.length > 0) {
91
+ return results[0];
92
+ }
93
+ return {
94
+ entity: "UNKNOWN",
95
+ confidence: 0,
96
+ confidenceLevel: "Low Confidence",
97
+ };
98
+ }
99
+
100
+ predictTop(input, limit = 5, threshold = 0.6) {
101
+ if (!input || typeof input !== "string") {
102
+ return [];
103
+ }
50
104
 
51
- const match = stringSimilarity.findBestMatch(
52
- normalize(input),
105
+ const normalizedInput = this.normalize(input);
106
+ const matches = stringSimilarity.findBestMatch(
107
+ normalizedInput,
53
108
  this.searchCandidates
54
109
  );
55
110
 
56
- const rating = match.bestMatch.rating;
57
- let confidenceLevel = "Low Confidence";
111
+ // Map all ratings to our format and sort
112
+ const sortedMatches = matches.ratings
113
+ .map((rating, index) => ({
114
+ entity: this.candidateToEntity[index],
115
+ confidence: rating.rating,
116
+ confidenceLevel: this._getConfidenceLevel(rating.rating),
117
+ }))
118
+ .filter((m) => m.confidence >= threshold)
119
+ .sort((a, b) => b.confidence - a.confidence);
58
120
 
59
- if (rating === 1) {
60
- confidenceLevel = "Trustable";
61
- } else if (rating >= 0.8) {
62
- confidenceLevel = "High Confidence";
63
- } else if (rating >= 0.6) {
64
- confidenceLevel = "Moderate Confidence";
65
- }
121
+ // Deduplicate entities (picking the highest score for each unique entity)
122
+ const uniqueMatches = [];
123
+ const seenEntities = new Set();
66
124
 
67
- if (rating >= threshold) {
68
- return {
69
- entity: this.candidateToEntity[match.bestMatchIndex],
70
- confidence: rating,
71
- confidenceLevel,
72
- };
125
+ for (const match of sortedMatches) {
126
+ if (!seenEntities.has(match.entity)) {
127
+ uniqueMatches.push(match);
128
+ seenEntities.add(match.entity);
129
+ if (uniqueMatches.length >= limit) break;
130
+ }
73
131
  }
74
132
 
75
- return {
76
- entity: "UNKNOWN",
77
- confidence: rating,
78
- confidenceLevel,
79
- };
133
+ return uniqueMatches;
80
134
  }
81
135
 
82
- addEntity(entity, aliases = []) {
83
- this.entities.push(entity);
84
- const normalizedName = normalize(entity);
85
- this.searchCandidates.push(normalizedName);
86
- this.candidateToEntity.push(entity);
87
-
88
- aliases.forEach((alias) => {
89
- this.searchCandidates.push(normalize(alias));
90
- this.candidateToEntity.push(entity);
91
- });
136
+ _getConfidenceLevel(rating) {
137
+ if (rating === 1) return "Trustable";
138
+ if (rating >= 0.8) return "High Confidence";
139
+ if (rating >= 0.6) return "Moderate Confidence";
140
+ return "Low Confidence";
92
141
  }
93
142
  }