npm - entity-predictor - Versions diffs - 1.0.0 → 1.2.0 - Mend

entity-predictor 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md CHANGED Viewed

@@ -1,6 +1,13 @@
 # Entity Predictor
-A lightweight, zero-dependency (almost) Node.js library for entity name prediction and normalization. It uses fuzzy matching to identify entities from messy input, supporting aliases, acronyms, and common typos.
+A lightweight, **Zero Dependency** Node.js library for entity name prediction and normalization.
+It uses **fuzzy matching** to identify entities from messy input, supporting:
+- **Aliases & Acronyms** (e.g., "SBI" -> "STATE BANK OF INDIA")
+- **Confidence Scoring** ("Trustable", "High Confidence", etc.)
+- **Top-N Matches** (Get the top 3 best guesses)
+- **Configurable Stop Words** (Ignore "The", "Inc", etc.)
 ## Features
@@ -77,7 +84,41 @@ Output:
 */
 ```
-### 3. Add Entities Dynamically
+### 3. Top-N Matches
+Get a list of best matches instead of just one.
+```javascript
+const results = predictor.predictTop("Apple", 3);
+// Returns array of matches: [{ entity: "Apple Inc", ... }, ...]
+```
+### 4. Stop Words Filtering
+Automatically remove noise words like "The", "Inc", "Ltd". **Disabled by default.**
+```javascript
+// Enable with default list
+const predictor = new EntityPredictor(entities, { ignoreStopWords: true });
+// Enable with custom list
+const predictor = new EntityPredictor(entities, {
+  ignoreStopWords: true,
+  stopWords: ["inc", "co", "corp"],
+});
+```
+### 5. Custom Normalization
+Pass a custom normalizer to clean data your way.
+```javascript
+const predictor = new EntityPredictor(entities, {
+  normalizer: (text) => text.toUpperCase(),
+});
+```
+### 6. Add Entities Dynamically
 You can add new entities to an existing predictor instance.
@@ -87,28 +128,26 @@ predictor.addEntity("PUNJAB NATIONAL BANK", ["PNB"]);
 ## API Reference
-### `new EntityPredictor(entities)`
+### `new EntityPredictor(entities, options)`
 - `entities`: Array of strings or objects `{ name: string, aliases: string[] }`.
+- `options`: (Optional)
+  - `ignoreStopWords`: boolean (default `false`)
+  - `stopWords`: string[] (optional, defaults to internal list)
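+  - `normalizer`: (text: string) => string (optional; when provided it replaces the built-in normalization entirely, including stop-word removal)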
+- **Throws**: `TypeError` if `entities` is not an array.
 ### `predict(input, threshold)`
 - `input`: String to search for.
-- `threshold`: (Optional) Minimum confidence score to return a match. Default is `0.6`.
+- `threshold`: (Optional) Minimum confidence score (default `0.6`).
+- **Returns**: Best match object, `{ entity: "UNKNOWN", ... }` if no match meets the threshold, or `null` if the input is empty or not a string.
-**Returns:**
+### `predictTop(input, limit, threshold)`
-- `entity`: The canonical name of the matched entity.
-- `confidence`: Score between 0 and 1.
-- `confidenceLevel`:
-  - `"Trustable"` (1.0)
-  - `"High Confidence"` (>= 0.8)
-  - `"Moderate Confidence"` (>= 0.6)
-  - `"Low Confidence"` (< 0.6)
-- Returns `null` if the input is invalid.
-- Returns `{ entity: "UNKNOWN", ... }` if no match meets the threshold.
+- `limit`: Max number of results (default `5`).
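+- **Returns**: Array of match objects sorted by descending confidence, at most one per entity; empty if the input is empty or not a string, or if no match meets the threshold.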
-### `addEntity(name, aliases)`
+### TypeScript Support
-- `name`: Canonical name of the entity.
-- `aliases`: (Optional) Array of alias strings.
+Includes `index.d.ts` for full TypeScript support.

package/index.d.ts ADDED Viewed

@@ -0,0 +1,26 @@
+export interface EntityOption {
+    name: string;
+    aliases?: string[];
+}
+export interface PredictorOptions {
+    ignoreStopWords?: boolean;
+    stopWords?: string[];
+    normalizer?: (text: string) => string;
+}
+export interface PredictionResult {
+    entity: string;
+    confidence: number;
+    confidenceLevel: "Trustable" | "High Confidence" | "Moderate Confidence" | "Low Confidence";
+}
+export class EntityPredictor {
+    constructor(entities: (string | EntityOption)[], options?: PredictorOptions);
+    predict(input: string, threshold?: number): PredictionResult | null;
+    predictTop(input: string, limit?: number, threshold?: number): PredictionResult[];
+    addEntity(entity: string | EntityOption): void;
+}

package/package.json CHANGED Viewed

@@ -1,7 +1,8 @@
 {
   "name": "entity-predictor",
-  "version": "1.0.0",
-  "description": "Lightweight entity name prediction and normalization library",
+  "version": "1.2.0",
+  "description": "Lightweight entity prediction with fuzzy matching, aliases, and confidence scoring.",
+  "types": "index.d.ts",
   "type": "module",
   "main": "src/index.js",
   "keywords": [
@@ -12,8 +13,5 @@
   ],
   "author": "Sahil",
   "email": "dev.sahilkumar02@gmail.com",
-  "license": "MIT",
-  "dependencies": {
-    "string-similarity": "^4.0.4"
-  }
+  "license": "MIT"
 }

package/src/predictor.js CHANGED Viewed

@@ -1,93 +1,179 @@
-import stringSimilarity from "string-similarity";
+import { findBestMatch } from "./string-similarity.js";
-function normalize(text) {
-  return text
-    .toLowerCase()
-    .replace(/[^a-z]/g, "")
-    .trim();
+const DEFAULT_STOP_WORDS = [
+  "the",
+  "inc",
+  "ltd",
+  "pvt",
+  "corp",
+  "corporation",
+  "co",
+  "company",
+  "limited",
+  "private",
+  "bank",
+];
+function defaultNormalize(
+  text,
+  ignoreStopWords = true,
+  stopWords = DEFAULT_STOP_WORDS
+) {
+  let processed = text.toLowerCase();
+  if (ignoreStopWords) {
+    // Remove stop words (must be surrounded by word boundaries or start/end)
+    const regex = new RegExp(`\\b(${stopWords.join("|")})\\b`, "g");
+    processed = processed.replace(regex, " ");
+  }
+  return processed.replace(/[^a-z]/g, "").trim();
 }
+/**
+ * EntityPredictor class for fuzzy matching entities.
+ */
 export class EntityPredictor {
-  constructor(entities = []) {
+  /**
+   * Creates an instance of EntityPredictor.
+   * @param {Array<string | {name: string, aliases: string[]}>} entities - List of entities.
+   * @param {Object} [options] - Configuration options.
+   * @param {boolean} [options.ignoreStopWords=false] - Whether to ignore stop words.
+   * @param {string[]} [options.stopWords] - Custom list of stop words.
+   * @param {function(string): string} [options.normalizer] - Custom normalizer function.
+   * @throws {TypeError} If entities is not an array.
+   */
+  constructor(entities = [], options = {}) {
+    if (!Array.isArray(entities)) {
+      throw new TypeError("Entities must be an array.");
+    }
     this.entities = [];
     this.searchCandidates = [];
     this.candidateToEntity = [];
+    this.ignoreStopWords = options.ignoreStopWords === true; // Default false
+    this.stopWords = options.stopWords || DEFAULT_STOP_WORDS;
+    this.customNormalizer = options.normalizer;
     entities.forEach((item) => {
-      let entityName;
-      let aliases = [];
-      if (typeof item === "string") {
-        entityName = item;
-      } else if (typeof item === "object" && item.name) {
-        entityName = item.name;
-        if (Array.isArray(item.aliases)) {
-          aliases = item.aliases;
-        }
-      } else {
-        return; // Skip invalid entries
+      this.addEntity(item, true); // true = internal call
+    });
+  }
+  /**
+   * Normalizes text for comparison.
+   * @param {string} text - The text to normalize.
+   * @returns {string} Normalized text.
+   */
+  normalize(text) {
+    if (this.customNormalizer) {
+      return this.customNormalizer(text);
+    }
+    return defaultNormalize(text, this.ignoreStopWords, this.stopWords);
+  }
+  /**
+   * Adds an entity to the predictor.
+   * @param {string | {name: string, aliases: string[]}} item - The entity to add.
+   * @param {boolean} [isInternal=false] - Internal flag.
+   */
+  addEntity(item, isInternal = false) {
+    let entityName;
+    let aliases = [];
+    if (typeof item === "string") {
+      entityName = item;
+    } else if (typeof item === "object" && item.name) {
+      entityName = item.name;
+      if (Array.isArray(item.aliases)) {
+        aliases = item.aliases;
       }
+    } else {
+      // Invalid entity format: skip silently (a warning could be added in the future).
+      return;
+    }
+    if (!this.entities.includes(entityName)) {
       this.entities.push(entityName);
+    }
-      // Add canonical name to search candidates
-      const normalizedName = normalize(entityName);
-      this.searchCandidates.push(normalizedName);
-      this.candidateToEntity.push(entityName);
+    // Add canonical
+    this.searchCandidates.push(this.normalize(entityName));
+    this.candidateToEntity.push(entityName);
-      // Add aliases to search candidates
-      aliases.forEach((alias) => {
-        this.searchCandidates.push(normalize(alias));
-        this.candidateToEntity.push(entityName);
-      });
+    // Add aliases
+    aliases.forEach((alias) => {
+      this.searchCandidates.push(this.normalize(alias));
+      this.candidateToEntity.push(entityName);
     });
   }
+  /**
+   * Predicts the best match for the input.
+   * @param {string} input - The input string.
+   * @param {number} [threshold=0.6] - The confidence threshold.
+   * @returns {{entity: string, confidence: number, confidenceLevel: string} | null} The best match; an "UNKNOWN" result if no candidate meets the threshold; null if input is empty or not a string.
+   */
   predict(input, threshold = 0.6) {
     if (!input || typeof input !== "string") {
       return null;
     }
+    const results = this.predictTop(input, 1, threshold);
+    if (results.length > 0) {
+      return results[0];
+    }
+    return {
+      entity: "UNKNOWN",
+      confidence: 0,
+      confidenceLevel: "Low Confidence",
+    };
+  }
-    const match = stringSimilarity.findBestMatch(
-      normalize(input),
-      this.searchCandidates
-    );
+  /**
+   * Predicts the top N best matches.
+   * @param {string} input - The input string.
+   * @param {number} [limit=5] - The number of results to return.
+   * @param {number} [threshold=0.6] - The confidence threshold.
+   * @returns {Array<{entity: string, confidence: number, confidenceLevel: string}>} Array of matches.
+   */
+  predictTop(input, limit = 5, threshold = 0.6) {
+    if (!input || typeof input !== "string") {
+      return [];
+    }
-    const rating = match.bestMatch.rating;
-    let confidenceLevel = "Low Confidence";
+    const normalizedInput = this.normalize(input);
+    const matches = findBestMatch(normalizedInput, this.searchCandidates);
-    if (rating === 1) {
-      confidenceLevel = "Trustable";
-    } else if (rating >= 0.8) {
-      confidenceLevel = "High Confidence";
-    } else if (rating >= 0.6) {
-      confidenceLevel = "Moderate Confidence";
-    }
+    // Map all ratings to our format and sort
+    const sortedMatches = matches.ratings
+      .map((rating, index) => ({
+        entity: this.candidateToEntity[index],
+        confidence: rating.rating,
+        confidenceLevel: this._getConfidenceLevel(rating.rating),
+      }))
+      .filter((m) => m.confidence >= threshold)
+      .sort((a, b) => b.confidence - a.confidence);
-    if (rating >= threshold) {
-      return {
-        entity: this.candidateToEntity[match.bestMatchIndex],
-        confidence: rating,
-        confidenceLevel,
-      };
+    // Deduplicate entities (picking the highest score for each unique entity)
+    const uniqueMatches = [];
+    const seenEntities = new Set();
+    for (const match of sortedMatches) {
+      if (!seenEntities.has(match.entity)) {
+        uniqueMatches.push(match);
+        seenEntities.add(match.entity);
+        if (uniqueMatches.length >= limit) break;
+      }
     }
-    return {
-      entity: "UNKNOWN",
-      confidence: rating,
-      confidenceLevel,
-    };
+    return uniqueMatches;
   }
-  addEntity(entity, aliases = []) {
-    this.entities.push(entity);
-    const normalizedName = normalize(entity);
-    this.searchCandidates.push(normalizedName);
-    this.candidateToEntity.push(entity);
-    aliases.forEach((alias) => {
-      this.searchCandidates.push(normalize(alias));
-      this.candidateToEntity.push(entity);
-    });
+  _getConfidenceLevel(rating) {
+    if (rating === 1) return "Trustable";
+    if (rating >= 0.8) return "High Confidence";
+    if (rating >= 0.6) return "Moderate Confidence";
+    return "Low Confidence";
   }
 }

package/src/string-similarity.js ADDED Viewed

@@ -0,0 +1,83 @@
+/**
+ * Compares two strings using bigram comparison (Dice Coefficient).
+ *
+ * @param {string} first - The first string to compare.
+ * @param {string} second - The second string to compare.
+ * @returns {number} A fraction between 0 and 1, which indicates the degree of similarity.
+ */
+export function compareTwoStrings(first, second) {
+  first = first.replace(/\s+/g, "");
+  second = second.replace(/\s+/g, "");
+  if (first === second) return 1; // identical or empty
+  if (first.length < 2 || second.length < 2) return 0; // if either is a 0-letter or 1-letter string
+  let firstBigrams = new Map();
+  for (let i = 0; i < first.length - 1; i++) {
+    const bigram = first.substring(i, i + 2);
+    const count = firstBigrams.has(bigram) ? firstBigrams.get(bigram) + 1 : 1;
+    firstBigrams.set(bigram, count);
+  }
+  let intersectionSize = 0;
+  for (let i = 0; i < second.length - 1; i++) {
+    const bigram = second.substring(i, i + 2);
+    const count = firstBigrams.has(bigram) ? firstBigrams.get(bigram) : 0;
+    if (count > 0) {
+      firstBigrams.set(bigram, count - 1);
+      intersectionSize++;
+    }
+  }
+  return (2.0 * intersectionSize) / (first.length + second.length - 2);
+}
+/**
+ * Finds the best match for a main string from a target list of strings.
+ *
+ * @param {string} mainString - The string to match.
+ * @param {string[]} targetStrings - The array of strings to match against.
+ * @returns {{ ratings: Array<{target: string, rating: number}>, bestMatch: {target: string, rating: number}, bestMatchIndex: number }}
+ * @throws {TypeError} If arguments are invalid.
+ */
+export function findBestMatch(mainString, targetStrings) {
+  if (!areArgsValid(mainString, targetStrings))
+    throw new TypeError(
+      "Bad arguments: First argument should be a string, second should be an array of strings"
+    );
+  const ratings = [];
+  let bestMatchIndex = 0;
+  for (let i = 0; i < targetStrings.length; i++) {
+    const currentTargetString = targetStrings[i];
+    const currentRating = compareTwoStrings(mainString, currentTargetString);
+    ratings.push({ target: currentTargetString, rating: currentRating });
+    if (currentRating > ratings[bestMatchIndex].rating) {
+      bestMatchIndex = i;
+    }
+  }
+  const bestMatch = ratings[bestMatchIndex];
+  return {
+    ratings: ratings,
+    bestMatch: bestMatch,
+    bestMatchIndex: bestMatchIndex,
+  };
+}
+function areArgsValid(mainString, targetStrings) {
+  if (typeof mainString !== "string") return false;
+  if (!Array.isArray(targetStrings)) return false;
+  if (!targetStrings.length) return false;
+  if (
+    targetStrings.find(function (s) {
+      return typeof s !== "string";
+    })
+  )
+    return false;
+  return true;
+}