entity-predictor 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Entity Predictor
2
2
 
3
- A lightweight, zero-dependency (almost) Node.js library for entity name prediction and normalization.
3
+ A lightweight, **zero-dependency** Node.js library for entity name prediction and normalization.
4
4
 
5
5
  It uses **fuzzy matching** to identify entities from messy input, supporting:
6
6
 
@@ -135,6 +135,7 @@ predictor.addEntity("PUNJAB NATIONAL BANK", ["PNB"]);
135
135
  - `ignoreStopWords`: boolean (default `false`)
136
136
  - `stopWords`: string[] (optional, defaults to internal list)
137
137
  - `normalizer`: (text: string) => string
138
+ - **Throws**: `TypeError` if `entities` is not an array.
138
139
 
139
140
  ### `predict(input, threshold)`
140
141
 
package/index.d.ts CHANGED
@@ -18,7 +18,7 @@ export interface PredictionResult {
18
18
  export class EntityPredictor {
19
19
  constructor(entities: (string | EntityOption)[], options?: PredictorOptions);
20
20
 
21
- predict(input: string, threshold?: number): PredictionResult;
21
+ predict(input: string, threshold?: number): PredictionResult | null;
22
22
 
23
23
  predictTop(input: string, limit?: number, threshold?: number): PredictionResult[];
24
24
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "entity-predictor",
3
- "version": "1.1.0",
3
+ "version": "1.2.0",
4
4
  "description": "Lightweight entity prediction with fuzzy matching, aliases, and confidence scoring.",
5
5
  "types": "index.d.ts",
6
6
  "type": "module",
@@ -13,8 +13,5 @@
13
13
  ],
14
14
  "author": "Sahil",
15
15
  "email": "dev.sahilkumar02@gmail.com",
16
- "license": "MIT",
17
- "dependencies": {
18
- "string-similarity": "^4.0.4"
19
- }
16
+ "license": "MIT"
20
17
  }
package/src/predictor.js CHANGED
@@ -1,4 +1,4 @@
1
- import stringSimilarity from "string-similarity";
1
+ import { findBestMatch } from "./string-similarity.js";
2
2
 
3
3
  const DEFAULT_STOP_WORDS = [
4
4
  "the",
@@ -30,8 +30,24 @@ function defaultNormalize(
30
30
  return processed.replace(/[^a-z]/g, "").trim();
31
31
  }
32
32
 
33
+ /**
34
+ * EntityPredictor class for fuzzy matching entities.
35
+ */
33
36
  export class EntityPredictor {
37
+ /**
38
+ * Creates an instance of EntityPredictor.
39
+ * @param {Array<string | {name: string, aliases: string[]}>} entities - List of entities.
40
+ * @param {Object} [options] - Configuration options.
41
+ * @param {boolean} [options.ignoreStopWords=false] - Whether to ignore stop words.
42
+ * @param {string[]} [options.stopWords] - Custom list of stop words.
43
+ * @param {function(string): string} [options.normalizer] - Custom normalizer function.
44
+ * @throws {TypeError} If entities is not an array.
45
+ */
34
46
  constructor(entities = [], options = {}) {
47
+ if (!Array.isArray(entities)) {
48
+ throw new TypeError("Entities must be an array.");
49
+ }
50
+
35
51
  this.entities = [];
36
52
  this.searchCandidates = [];
37
53
  this.candidateToEntity = [];
@@ -41,10 +57,15 @@ export class EntityPredictor {
41
57
  this.customNormalizer = options.normalizer;
42
58
 
43
59
  entities.forEach((item) => {
44
- this.addEntity(item, true); // true = internal call, delay re-indexing if needed (not needed here)
60
+ this.addEntity(item, true); // true = internal call
45
61
  });
46
62
  }
47
63
 
64
+ /**
65
+ * Normalizes text for comparison.
66
+ * @param {string} text - The text to normalize.
67
+ * @returns {string} Normalized text.
68
+ */
48
69
  normalize(text) {
49
70
  if (this.customNormalizer) {
50
71
  return this.customNormalizer(text);
@@ -52,6 +73,11 @@ export class EntityPredictor {
52
73
  return defaultNormalize(text, this.ignoreStopWords, this.stopWords);
53
74
  }
54
75
 
76
+ /**
77
+ * Adds an entity to the predictor.
78
+ * @param {string | {name: string, aliases: string[]}} item - The entity to add.
79
+ * @param {boolean} [isInternal=false] - Internal flag.
80
+ */
55
81
  addEntity(item, isInternal = false) {
56
82
  let entityName;
57
83
  let aliases = [];
@@ -64,6 +90,7 @@ export class EntityPredictor {
64
90
  aliases = item.aliases;
65
91
  }
66
92
  } else {
93
+ // Invalid entity format: skip it silently (a warning could be added in the future).
67
94
  return;
68
95
  }
69
96
 
@@ -82,6 +109,12 @@ export class EntityPredictor {
82
109
  });
83
110
  }
84
111
 
112
+ /**
113
+ * Predicts the best match for the input.
114
+ * @param {string} input - The input string.
115
+ * @param {number} [threshold=0.6] - The confidence threshold.
116
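+ * @returns {{entity: string, confidence: number, confidenceLevel: string} | null} The best match, or `null` when `input` is empty or not a string. An "UNKNOWN" result may be returned otherwise (presumably when no candidate reaches `threshold`; confirm against the method body).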
117
+ */
85
118
  predict(input, threshold = 0.6) {
86
119
  if (!input || typeof input !== "string") {
87
120
  return null;
@@ -97,16 +130,20 @@ export class EntityPredictor {
97
130
  };
98
131
  }
99
132
 
133
+ /**
134
+ * Predicts the top N best matches.
135
+ * @param {string} input - The input string.
136
+ * @param {number} [limit=5] - The number of results to return.
137
+ * @param {number} [threshold=0.6] - The confidence threshold.
138
+ * @returns {Array<{entity: string, confidence: number, confidenceLevel: string}>} Array of matches.
139
+ */
100
140
  predictTop(input, limit = 5, threshold = 0.6) {
101
141
  if (!input || typeof input !== "string") {
102
142
  return [];
103
143
  }
104
144
 
105
145
  const normalizedInput = this.normalize(input);
106
- const matches = stringSimilarity.findBestMatch(
107
- normalizedInput,
108
- this.searchCandidates
109
- );
146
+ const matches = findBestMatch(normalizedInput, this.searchCandidates);
110
147
 
111
148
  // Map all ratings to our format and sort
112
149
  const sortedMatches = matches.ratings
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Compares two strings using bigram comparison (Dice Coefficient).
3
+ *
4
+ * @param {string} first - The first string to compare.
5
+ * @param {string} second - The second string to compare.
6
+ * @returns {number} A fraction between 0 and 1, which indicates the degree of similarity.
7
+ */
8
+ export function compareTwoStrings(first, second) {
9
+ first = first.replace(/\s+/g, "");
10
+ second = second.replace(/\s+/g, "");
11
+
12
+ if (first === second) return 1; // identical or empty
13
+ if (first.length < 2 || second.length < 2) return 0; // if either is a 0-letter or 1-letter string
14
+
15
+ let firstBigrams = new Map();
16
+ for (let i = 0; i < first.length - 1; i++) {
17
+ const bigram = first.substring(i, i + 2);
18
+ const count = firstBigrams.has(bigram) ? firstBigrams.get(bigram) + 1 : 1;
19
+
20
+ firstBigrams.set(bigram, count);
21
+ }
22
+
23
+ let intersectionSize = 0;
24
+ for (let i = 0; i < second.length - 1; i++) {
25
+ const bigram = second.substring(i, i + 2);
26
+ const count = firstBigrams.has(bigram) ? firstBigrams.get(bigram) : 0;
27
+
28
+ if (count > 0) {
29
+ firstBigrams.set(bigram, count - 1);
30
+ intersectionSize++;
31
+ }
32
+ }
33
+
34
+ return (2.0 * intersectionSize) / (first.length + second.length - 2);
35
+ }
36
+
37
+ /**
38
+ * Finds the best match for a main string from a target list of strings.
39
+ *
40
+ * @param {string} mainString - The string to match.
41
+ * @param {string[]} targetStrings - The array of strings to match against.
42
+ * @returns {{ ratings: Array<{target: string, rating: number}>, bestMatch: {target: string, rating: number}, bestMatchIndex: number }}
43
+ * @throws {TypeError} If arguments are invalid.
44
+ */
45
+ export function findBestMatch(mainString, targetStrings) {
46
+ if (!areArgsValid(mainString, targetStrings))
47
+ throw new TypeError(
48
+ "Bad arguments: First argument should be a string, second should be an array of strings"
49
+ );
50
+
51
+ const ratings = [];
52
+ let bestMatchIndex = 0;
53
+
54
+ for (let i = 0; i < targetStrings.length; i++) {
55
+ const currentTargetString = targetStrings[i];
56
+ const currentRating = compareTwoStrings(mainString, currentTargetString);
57
+ ratings.push({ target: currentTargetString, rating: currentRating });
58
+ if (currentRating > ratings[bestMatchIndex].rating) {
59
+ bestMatchIndex = i;
60
+ }
61
+ }
62
+
63
+ const bestMatch = ratings[bestMatchIndex];
64
+
65
+ return {
66
+ ratings: ratings,
67
+ bestMatch: bestMatch,
68
+ bestMatchIndex: bestMatchIndex,
69
+ };
70
+ }
71
+
72
+ function areArgsValid(mainString, targetStrings) {
73
+ if (typeof mainString !== "string") return false;
74
+ if (!Array.isArray(targetStrings)) return false;
75
+ if (!targetStrings.length) return false;
76
+ if (
77
+ targetStrings.find(function (s) {
78
+ return typeof s !== "string";
79
+ })
80
+ )
81
+ return false;
82
+ return true;
83
+ }