npm - @stll/text-search - Versions diffs - 0.1.0 → 0.2.0 - Mend

@stll/text-search 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,150 @@
+/**
+ * A single match result. Same shape as
+ * @stll/regex-set and @stll/aho-corasick.
+ */
+type Match = {
+    /** Index of the pattern that matched, i.e. its
+     *  position in the pattern array given to the
+     *  TextSearch constructor. */
+    pattern: number;
+    /** Start UTF-16 code unit offset. */
+    start: number;
+    /** End offset (exclusive), counted in the same
+     *  units as `start`, so that
+     *  `haystack.slice(start, end)` spans the match. */
+    end: number;
+    /** The matched text. */
+    text: string;
+    /** Pattern name (if provided). */
+    name?: string;
+    /** Edit distance (fuzzy matches only). */
+    distance?: number;
+};
+/** A pattern entry for TextSearch.
+ *
+ *  - A bare string is matched literally (via
+ *    Aho-Corasick) when it is non-empty and has no
+ *    regex metacharacters; otherwise it is handed
+ *    to the regex engine.
+ *  - A bare RegExp always goes to the regex engine.
+ *  - `{ pattern, name? }` behaves like the bare
+ *    forms and attaches a name to the matches.
+ *  - `{ pattern, distance }` is a fuzzy pattern.
+ *  - `{ pattern, literal: true }` forces literal
+ *    matching.
+ */
+type PatternEntry = string | RegExp | {
+    pattern: string | RegExp;
+    name?: string;
+} | {
+    pattern: string;
+    name?: string;
+    /** Fuzzy matching distance. Routes to
+     *  @stll/fuzzy-search instead of regex. */
+    distance: number | "auto";
+} | {
+    pattern: string;
+    name?: string;
+    /** Force literal matching via Aho-Corasick.
+     *  Skips regex metacharacter detection so
+     *  patterns like "č.p." or "s.r.o." are
+     *  matched literally, not as regex. */
+    literal: true;
+    /** Per-pattern case-insensitive for AC.
+     *  Overrides the global option for this
+     *  pattern only. */
+    caseInsensitive?: boolean;
+    /** Per-pattern whole-word matching for AC.
+     *  Overrides the global option for this
+     *  pattern only. */
+    wholeWords?: boolean;
+};
+/** Options for TextSearch. */
+type TextSearchOptions = {
+    /**
+     * Use Unicode word boundaries.
+     * @default true
+     */
+    unicodeBoundaries?: boolean;
+    /**
+     * Only match whole words.
+     * @default false
+     */
+    wholeWords?: boolean;
+    /**
+     * Max alternation branches before auto-splitting
+     * into a separate engine instance. Prevents DFA
+     * state explosion when large-alternation patterns
+     * are combined with other patterns. A regex
+     * pattern is split off only when its branch count
+     * is strictly greater than this value.
+     * @default 50
+     */
+    maxAlternations?: number;
+    /**
+     * Fuzzy matching metric.
+     * @default "levenshtein"
+     */
+    fuzzyMetric?: "levenshtein" | "damerau-levenshtein";
+    /**
+     * Normalize diacritics for fuzzy matching.
+     * @default false
+     */
+    normalizeDiacritics?: boolean;
+    /**
+     * Case-insensitive matching for AC literals
+     * and fuzzy patterns.
+     *
+     * NOTE(review): the flag is also forwarded in the
+     * options given to the regex engine; whether it
+     * changes regex matching depends on
+     * @stll/regex-set — confirm.
+     * @default false
+     */
+    caseInsensitive?: boolean;
+    /**
+     * How to handle overlapping matches from
+     * different engines or patterns.
+     *
+     * - "longest": keep longest non-overlapping match
+     *   at each position (default).
+     * - "all": return all matches including overlaps.
+     *   Useful when the caller applies its own dedup.
+     *
+     * Only consulted by findIter when more than one
+     * engine was built; with a single engine that
+     * engine's matches are returned as produced.
+     * replaceAll always uses longest-wins selection.
+     *
+     * @default "longest"
+     */
+    overlapStrategy?: "longest" | "all";
+    /**
+     * Treat ALL string patterns as literals (route
+     * to AC, skip metacharacter detection). Useful
+     * for deny-list patterns where "s.r.o." means
+     * the literal string, not a regex with wildcards.
+     * Applies to bare strings and to entries whose
+     * `pattern` is a string; RegExp patterns and
+     * fuzzy entries are routed as usual.
+     * @default false
+     */
+    allLiteral?: boolean;
+};
+/**
+ * Multi-engine text search orchestrator.
+ *
+ * Routes patterns to the optimal engine
+ * configuration:
+ * - Fuzzy entries (those carrying `distance`) go
+ *   to a FuzzySearch instance
+ * - Literal strings go to Aho-Corasick, grouped by
+ *   their effective case/whole-word settings
+ * - Large alternation patterns get their own
+ *   RegexSet instance (prevents DFA state explosion)
+ * - Normal patterns share a single RegexSet
+ *   (single-pass multi-pattern DFA), unless a
+ *   construction-time timing probe finds one
+ *   RegexSet per pattern to be faster
+ *
+ * Merges results from all engines into a unified
+ * non-overlapping Match[] sorted by position (or
+ * all matches when overlapStrategy is "all").
+ */
+declare class TextSearch {
+    private engines;
+    private patternCount;
+    private overlapAll;
+    /**
+     * True when there's exactly one engine and all
+     * patterns map to identity indices (0→0, 1→1, ...).
+     * Enables zero-overhead findIter: return raw engine
+     * output without remapping or object allocation.
+     *
+     * Set by the constructor when a single engine was
+     * built and none of its patterns has a name.
+     */
+    private zeroOverhead;
+    constructor(patterns: PatternEntry[], options?: TextSearchOptions);
+    /** Number of patterns, as passed to the
+     *  constructor. */
+    get length(): number;
+    /** Returns true if any pattern matches. */
+    isMatch(haystack: string): boolean;
+    /**
+     * Find matches in text.
+     *
+     * With `overlapStrategy: "longest"` (default):
+     * returns non-overlapping matches, longest wins.
+     *
+     * With `overlapStrategy: "all"`: returns all
+     * matches including overlaps, sorted by position.
+     *
+     * When only one engine was built, that engine's
+     * matches are returned without the merge step.
+     */
+    findIter(haystack: string): Match[];
+    /** Which pattern indices matched (not where).
+     *  Each index appears once, in the order it was
+     *  first encountered while scanning engine by
+     *  engine; the result is not sorted. */
+    whichMatch(haystack: string): number[];
+    /**
+     * Replace all non-overlapping matches.
+     * replacements[i] replaces pattern i.
+     *
+     * Overlaps are always resolved longest-wins here,
+     * whatever `overlapStrategy` was configured.
+     *
+     * @throws Error when `replacements.length` differs
+     * from the number of patterns.
+     */
+    replaceAll(haystack: string, replacements: string[]): string;
+}
+export { type Match, type PatternEntry, TextSearch, type TextSearchOptions };

package/dist/index.js ADDED Viewed

@@ -0,0 +1,464 @@
+// src/text-search.ts
+import { AhoCorasick } from "@stll/aho-corasick";
+import { FuzzySearch } from "@stll/fuzzy-search";
+import { RegexSet } from "@stll/regex-set";
+// src/classify.ts
+function isLiteralPattern(pattern) {
+  for (let i = 0; i < pattern.length; i++) {
+    const ch = pattern[i];
+    if (ch === "\\" || ch === "." || ch === "^" || ch === "$" || ch === "*" || ch === "+" || ch === "?" || ch === "{" || ch === "}" || ch === "(" || ch === ")" || ch === "[" || ch === "]" || ch === "|") {
+      return false;
+    }
+  }
+  return pattern.length > 0;
+}
+function countAlternations(pattern) {
+  let depth = 0;
+  let inClass = false;
+  let i = 0;
+  let max = 1;
+  let currentCount = 1;
+  const stack = [];
+  while (i < pattern.length) {
+    const ch = pattern[i];
+    if (ch === "\\" && i + 1 < pattern.length) {
+      i += 2;
+      continue;
+    }
+    if (ch === "[") inClass = true;
+    if (ch === "]") inClass = false;
+    if (!inClass) {
+      if (ch === "(") {
+        stack.push(currentCount);
+        currentCount = 1;
+        depth++;
+      }
+      if (ch === ")") {
+        if (currentCount > max) max = currentCount;
+        currentCount = stack.pop() ?? 1;
+        depth--;
+      }
+      if (ch === "|") {
+        currentCount++;
+      }
+    }
+    i++;
+  }
+  if (currentCount > max) max = currentCount;
+  return max;
+}
+function classifyPatterns(entries, allLiteral = false) {
+  return entries.map((entry, i) => {
+    if (typeof entry === "string") {
+      return {
+        originalIndex: i,
+        pattern: entry,
+        alternationCount: allLiteral ? 0 : countAlternations(entry),
+        isLiteral: allLiteral || isLiteralPattern(entry)
+      };
+    }
+    if (entry instanceof RegExp) {
+      return {
+        originalIndex: i,
+        pattern: entry,
+        alternationCount: countAlternations(
+          entry.source
+        ),
+        isLiteral: false
+        // RegExp is never literal
+      };
+    }
+    if ("distance" in entry) {
+      const result2 = {
+        originalIndex: i,
+        pattern: entry.pattern,
+        alternationCount: 0,
+        isLiteral: false,
+        fuzzyDistance: entry.distance
+      };
+      if (entry.name !== void 0) result2.name = entry.name;
+      return result2;
+    }
+    if ("literal" in entry && entry.literal) {
+      const hasPerPatternOpts = "caseInsensitive" in entry || "wholeWords" in entry;
+      const result2 = {
+        originalIndex: i,
+        pattern: entry.pattern,
+        alternationCount: 0,
+        isLiteral: true
+      };
+      if (entry.name !== void 0) result2.name = entry.name;
+      if (hasPerPatternOpts) {
+        const opts = {};
+        if (entry.caseInsensitive !== void 0)
+          opts.caseInsensitive = entry.caseInsensitive;
+        if (entry.wholeWords !== void 0)
+          opts.wholeWords = entry.wholeWords;
+        result2.acOptions = opts;
+      }
+      return result2;
+    }
+    const pat = entry.pattern;
+    const source = pat instanceof RegExp ? pat.source : pat;
+    const result = {
+      originalIndex: i,
+      pattern: pat,
+      alternationCount: allLiteral ? 0 : countAlternations(source),
+      isLiteral: typeof pat === "string" && (allLiteral || isLiteralPattern(pat))
+    };
+    if (entry.name !== void 0) result.name = entry.name;
+    return result;
+  });
+}
+// src/merge.ts
+function mergeAndSelect(matches) {
+  if (matches.length <= 1) return matches;
+  matches.sort((a, b) => {
+    if (a.start !== b.start) {
+      return a.start - b.start;
+    }
+    return b.end - b.start - (a.end - a.start);
+  });
+  const selected = [];
+  let lastEnd = 0;
+  for (const m of matches) {
+    if (m.start >= lastEnd) {
+      selected.push(m);
+      lastEnd = m.end;
+    }
+  }
+  return selected;
+}
+// src/text-search.ts
+var TextSearch = class {
+  engines = []; // engine wrappers in build order: fuzzy, AC groups, shared regex, isolated regex
+  patternCount; // number of entries passed to the constructor
+  overlapAll; // true only when overlapStrategy is "all"
+  /**
+   * True when there's exactly one engine and all
+   * patterns map to identity indices (0→0, 1→1, ...).
+   * Enables zero-overhead findIter: return raw engine
+   * output without remapping or object allocation.
+   *
+   * The constructor sets this when exactly one engine
+   * was built and none of its patterns has a name; the
+   * index map itself is not inspected there.
+   */
+  zeroOverhead = false;
+  constructor(patterns, options) { // classifies the patterns, then builds one or more engines for them
+    this.patternCount = patterns.length;
+    this.overlapAll = options?.overlapStrategy === "all";
+    const maxAlt = options?.maxAlternations ?? 50; // regex patterns with more branches than this are isolated
+    const classified = classifyPatterns(
+      patterns,
+      options?.allLiteral ?? false
+    );
+    const fuzzy = []; // entries carrying a fuzzy distance
+    const literals = []; // literal strings, matched by Aho-Corasick
+    const shared = []; // regex patterns that may share one RegexSet
+    const isolated = []; // regex patterns that each get their own RegexSet
+    for (const cp of classified) {
+      if (cp.fuzzyDistance !== void 0) {
+        fuzzy.push(cp);
+      } else if (cp.isLiteral) {
+        literals.push(cp);
+      } else if (cp.alternationCount > maxAlt) {
+        isolated.push(cp);
+      } else {
+        shared.push(cp);
+      }
+    }
+    const rsOptions = { // global defaults; passed to RegexSet as is and used as the base for AC and fuzzy options
+      unicodeBoundaries: options?.unicodeBoundaries ?? true,
+      wholeWords: options?.wholeWords ?? false,
+      caseInsensitive: options?.caseInsensitive ?? false
+    };
+    if (fuzzy.length > 0) {
+      const fuzzyOpts = {
+        unicodeBoundaries: rsOptions.unicodeBoundaries,
+        wholeWords: rsOptions.wholeWords
+      };
+      if (options?.fuzzyMetric !== void 0) // optional settings are copied only when the caller supplied them
+        fuzzyOpts.metric = options.fuzzyMetric;
+      if (options?.normalizeDiacritics !== void 0)
+        fuzzyOpts.normalizeDiacritics = options.normalizeDiacritics;
+      if (options?.caseInsensitive !== void 0)
+        fuzzyOpts.caseInsensitive = options.caseInsensitive;
+      this.engines.push(
+        buildFuzzyEngine(fuzzy, fuzzyOpts)
+      );
+    }
+    if (literals.length > 0) {
+      const groups = /* @__PURE__ */ new Map(); // literals bucketed by effective (caseInsensitive, wholeWords)
+      for (const cp of literals) {
+        const ci = cp.acOptions?.caseInsensitive ?? rsOptions.caseInsensitive; // per-pattern value wins over the global one
+        const ww = cp.acOptions?.wholeWords ?? rsOptions.wholeWords;
+        const key = `${ci ? 1 : 0}:${ww ? 1 : 0}`;
+        const group = groups.get(key);
+        if (group) {
+          group.push(cp);
+        } else {
+          groups.set(key, [cp]);
+        }
+      }
+      for (const [key, group] of groups) { // one Aho-Corasick engine per distinct option pair
+        const [ci, ww] = key.split(":");
+        this.engines.push(
+          buildAcEngine(group, {
+            ...rsOptions,
+            caseInsensitive: ci === "1",
+            wholeWords: ww === "1"
+          })
+        );
+      }
+    }
+    if (shared.length > 1) { // time one combined RegexSet against one RegexSet per pattern on a fixed probe text
+      const combined = buildRegexEngine(
+        shared,
+        rsOptions
+      );
+      const probe = "Hello World 123 test@example.com 2025-01-01 +420 123 456 789 Ing. Jan Nov\xE1k, s.r.o. Praha 1 ".repeat(10);
+      const t0 = performance.now();
+      combined.rs.findIter(probe);
+      const combinedMs = performance.now() - t0;
+      let individualMs = 0; // summed findIter time of the per-pattern engines
+      const individualEngines = [];
+      for (const cp of shared) {
+        const eng = buildRegexEngine(
+          [cp],
+          rsOptions
+        );
+        const t1 = performance.now();
+        eng.rs.findIter(probe);
+        individualMs += performance.now() - t1;
+        individualEngines.push(eng);
+      }
+      if (combinedMs > individualMs * 1.5) { // split only when the combined engine was more than 1.5x slower
+        for (const eng of individualEngines) {
+          this.engines.push(eng);
+        }
+      } else {
+        this.engines.push(combined);
+      }
+    } else if (shared.length === 1) {
+      this.engines.push(
+        buildRegexEngine(shared, rsOptions)
+      );
+    }
+    for (const cp of isolated) {
+      this.engines.push(
+        buildRegexEngine([cp], rsOptions)
+      );
+    }
+    if (this.engines.length === 1) { // a lone engine without names lets findIter return its output untouched
+      const engine = this.engines[0];
+      const hasNames = engine.nameMap.some(
+        (n) => n !== void 0
+      );
+      if (!hasNames) {
+        this.zeroOverhead = true;
+      }
+    }
+  }
+  /** Number of patterns, as passed to the constructor. */
+  get length() {
+    return this.patternCount;
+  }
+  /** Returns true if any pattern matches.
+   *  Engines are tried in build order and the scan
+   *  stops at the first one that reports a match. */
+  isMatch(haystack) {
+    for (const engine of this.engines) {
+      if (engineIsMatch(engine, haystack)) {
+        return true;
+      }
+    }
+    return false;
+  }
+  /**
+   * Find matches in text.
+   *
+   * With `overlapStrategy: "longest"` (default):
+   * returns non-overlapping matches, longest wins.
+   *
+   * With `overlapStrategy: "all"`: returns all
+   * matches including overlaps, sorted by position.
+   *
+   * With a single engine its matches are returned
+   * directly (remapped when names are present) and
+   * neither the merge nor the sort below is applied.
+   */
+  findIter(haystack) {
+    if (this.zeroOverhead) {
+      return engineFindIter(
+        this.engines[0],
+        haystack
+      );
+    }
+    if (this.engines.length === 1) {
+      return remapMatches(
+        engineFindIter(this.engines[0], haystack),
+        this.engines[0]
+      );
+    }
+    const all = []; // remapped matches from every engine, in engine order
+    for (const engine of this.engines) {
+      const matches = engineFindIter(
+        engine,
+        haystack
+      );
+      for (const m of remapMatches(matches, engine)) {
+        all.push(m);
+      }
+    }
+    if (this.overlapAll) {
+      return all.sort(
+        (a, b) => a.start - b.start
+      );
+    }
+    return mergeAndSelect(all);
+  }
+  /** Which pattern indices matched (not where).
+   *  Runs a full findIter on every engine and returns
+   *  each original index once, in first-seen order. */
+  whichMatch(haystack) {
+    const seen = /* @__PURE__ */ new Set();
+    for (const engine of this.engines) {
+      const matches = engineFindIter(
+        engine,
+        haystack
+      );
+      for (const m of matches) {
+        seen.add(engine.indexMap[m.pattern]); // translate the engine-local index to the caller's index
+      }
+    }
+    return [...seen];
+  }
+  /**
+   * Replace all non-overlapping matches.
+   * replacements[i] replaces pattern i.
+   *
+   * Overlaps are always resolved longest-wins here,
+   * whatever overlapStrategy was configured.
+   *
+   * @throws Error when replacements.length differs
+   * from the number of patterns.
+   */
+  replaceAll(haystack, replacements) {
+    if (replacements.length !== this.patternCount) {
+      throw new Error(
+        `Expected ${this.patternCount} replacements, got ${replacements.length}`
+      );
+    }
+    const all = [];
+    for (const engine of this.engines) {
+      const matches2 = engineFindIter(
+        engine,
+        haystack
+      );
+      for (const m of remapMatches(matches2, engine)) {
+        all.push(m);
+      }
+    }
+    const matches = mergeAndSelect(all);
+    let result = "";
+    let last = 0; // offset just past the previous replaced match
+    for (const m of matches) {
+      result += haystack.slice(last, m.start); // untouched text between matches
+      result += replacements[m.pattern];
+      last = m.end;
+    }
+    result += haystack.slice(last);
+    return result;
+  }
+};
+function buildRegexEngine(patterns, options) {
+  const rsPatterns = [];
+  const indexMap = [];
+  const nameMap = [];
+  for (const cp of patterns) {
+    if (cp.name !== void 0) {
+      rsPatterns.push({
+        pattern: cp.pattern,
+        name: cp.name
+      });
+    } else {
+      rsPatterns.push(cp.pattern);
+    }
+    indexMap.push(cp.originalIndex);
+    nameMap.push(cp.name);
+  }
+  const rs = new RegexSet(rsPatterns, options);
+  return { type: "regex", rs, indexMap, nameMap };
+}
+function buildAcEngine(patterns, options) {
+  const literals = [];
+  const indexMap = [];
+  const nameMap = [];
+  for (const cp of patterns) {
+    literals.push(cp.pattern);
+    indexMap.push(cp.originalIndex);
+    nameMap.push(cp.name);
+  }
+  const ac = new AhoCorasick(literals, {
+    wholeWords: options.wholeWords,
+    unicodeBoundaries: options.unicodeBoundaries,
+    caseInsensitive: options.caseInsensitive
+  });
+  return { type: "ac", ac, indexMap, nameMap };
+}
+function buildFuzzyEngine(patterns, options) {
+  const fsPatterns = [];
+  const indexMap = [];
+  const nameMap = [];
+  for (const cp of patterns) {
+    const entry = {
+      pattern: cp.pattern
+    };
+    if (cp.fuzzyDistance !== void 0)
+      entry.distance = cp.fuzzyDistance;
+    if (cp.name !== void 0) entry.name = cp.name;
+    fsPatterns.push(entry);
+    indexMap.push(cp.originalIndex);
+    nameMap.push(cp.name);
+  }
+  const fsOptions = {
+    unicodeBoundaries: options.unicodeBoundaries,
+    wholeWords: options.wholeWords
+  };
+  if (options.metric !== void 0)
+    fsOptions.metric = options.metric;
+  if (options.normalizeDiacritics !== void 0)
+    fsOptions.normalizeDiacritics = options.normalizeDiacritics;
+  if (options.caseInsensitive !== void 0)
+    fsOptions.caseInsensitive = options.caseInsensitive;
+  const fs = new FuzzySearch(fsPatterns, fsOptions);
+  return { type: "fuzzy", fs, indexMap, nameMap };
+}
+function engineIsMatch(engine, haystack) {
+  switch (engine.type) {
+    case "ac":
+      return engine.ac.isMatch(haystack);
+    case "fuzzy":
+      return engine.fs.isMatch(haystack);
+    case "regex":
+      return engine.rs.isMatch(haystack);
+  }
+}
+function engineFindIter(engine, haystack) {
+  switch (engine.type) {
+    case "ac":
+      return engine.ac.findIter(haystack);
+    case "fuzzy":
+      return engine.fs.findIter(haystack);
+    case "regex":
+      return engine.rs.findIter(haystack);
+  }
+}
+function remapMatches(matches, engine) {
+  return matches.map((m) => {
+    const originalIdx = engine.indexMap[m.pattern];
+    const name = engine.nameMap[m.pattern];
+    const result = {
+      pattern: originalIdx,
+      start: m.start,
+      end: m.end,
+      text: m.text
+    };
+    if (name !== void 0) {
+      result.name = name;
+    }
+    if ("distance" in m && m.distance !== void 0) {
+      result.distance = m.distance;
+    }
+    return result;
+  });
+}
+export {
+  TextSearch
+};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@stll/text-search",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "Multi-engine text search orchestrator. Routes patterns to optimal engines: Aho-Corasick, RegexSet, or FuzzySearch.",
   "keywords": [
     "text-search",
@@ -20,16 +20,23 @@
     "url": "https://github.com/stella/text-search"
   },
   "type": "module",
-  "main": "src/index.ts",
-  "module": "src/index.ts",
+  "main": "dist/index.js",
+  "module": "dist/index.js",
+  "types": "dist/index.d.ts",
   "exports": {
-    ".": "./src/index.ts"
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js",
+      "default": "./dist/index.js"
+    }
   },
   "files": [
+    "src",
     "dist"
   ],
   "scripts": {
-    "build": "bun build src/index.ts --outdir dist --target node",
+    "build": "tsup",
+    "prepublishOnly": "bun run build",
     "test": "bun test",
     "lint": "oxlint .",
     "format": "oxfmt ."
@@ -43,7 +50,9 @@
     "@types/node": "^22.0.0",
     "bun-types": "^1.3.10",
     "oxfmt": "^0.40.0",
-    "oxlint": "^1.55.0"
+    "oxlint": "^1.55.0",
+    "tsup": "^8.5.1",
+    "typescript": "^5.9.3"
   },
   "engines": {
     "node": ">= 18"