@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +164 -0
- package/dist/index.cjs +508 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +169 -0
- package/dist/index.d.ts +169 -0
- package/dist/index.js +505 -0
- package/dist/index.js.map +1 -0
- package/package.json +54 -0
package/README.md
ADDED
@@ -0,0 +1,164 @@
# @wcs-colab/plugin-fuzzy-phrase

Advanced fuzzy phrase matching plugin for Orama with semantic weighting and synonym expansion.

## Features

- ✅ **Independent from QPS** - Direct radix tree access, no QPS dependency
- ✅ **Fuzzy matching** - Uses the `boundedLevenshtein` algorithm (same as match-highlight)
- ✅ **Phrase-level scoring** - Multi-factor scoring algorithm
- ✅ **Synonym expansion** - Loads synonyms from Supabase
- ✅ **Adaptive tolerance** - Dynamically scales with query length
- ✅ **Semantic weighting** - TF-IDF scoring for relevance
- ✅ **Configurable** - All weights and thresholds are configurable

## Installation

```bash
npm install @wcs-colab/plugin-fuzzy-phrase
```

## Basic Usage

```typescript
import { create, search } from '@wcs-colab/orama';
import { pluginFuzzyPhrase } from '@wcs-colab/plugin-fuzzy-phrase';

const db = await create({
  schema: {
    content: 'string',
    title: 'string'
  },
  plugins: [
    pluginFuzzyPhrase({
      textProperty: 'content',
      tolerance: 1,
      adaptiveTolerance: true
    })
  ]
});

// Search with fuzzy phrase matching
const results = await search(db, {
  term: 'fuzzy search example',
  properties: ['content']
});
```
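
The package also exports `searchWithFuzzyPhrase` for invoking the phrase pipeline directly. A minimal sketch based on the shipped `dist` build (the `_phrases` field on each hit carries the matched phrase positions and scores, e.g. for highlighting):

```typescript
import { searchWithFuzzyPhrase } from '@wcs-colab/plugin-fuzzy-phrase';

const result = await searchWithFuzzyPhrase(db, {
  term: 'fuzzy search example',
  properties: ['content']
});

console.log(result.count, result.elapsed.formatted);
for (const hit of result.hits) {
  console.log(hit.id, hit.score, hit._phrases); // phrases attached for highlighting
}
```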

## Configuration

```typescript
interface FuzzyPhraseConfig {
  // Text property to search in
  textProperty?: string; // default: 'content'

  // Base fuzzy matching tolerance (edit distance)
  tolerance?: number; // default: 1

  // Enable adaptive tolerance (scales with query length)
  adaptiveTolerance?: boolean; // default: true

  // Enable synonym expansion
  enableSynonyms?: boolean; // default: false

  // Supabase configuration for loading synonyms
  supabase?: {
    url: string;
    serviceKey: string;
  };

  // Scoring weight for synonym matches (0-1)
  synonymMatchScore?: number; // default: 0.8

  // Scoring weights for the different components
  weights?: {
    exact?: number;     // default: 1.0
    fuzzy?: number;     // default: 0.8
    order?: number;     // default: 0.3
    proximity?: number; // default: 0.2
    density?: number;   // default: 0.2
    semantic?: number;  // default: 0.15
  };

  // Maximum gap (in tokens) between words in a phrase
  maxGap?: number; // default: 5

  // Minimum candidate match score to keep (weaker candidates are dropped)
  minScore?: number; // default: 0.1
}
```
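
Unspecified fields keep their defaults. For example, to penalize fuzzy matches more heavily and require tighter phrases (a sketch; the values are illustrative):

```typescript
pluginFuzzyPhrase({
  textProperty: 'content',
  tolerance: 2,
  adaptiveTolerance: false, // keep the tolerance fixed regardless of query length
  weights: {
    fuzzy: 0.6,    // down from the 0.8 default
    proximity: 0.3 // reward phrases whose words sit close together
  },
  maxGap: 3,
  minScore: 0.2
})
```

With `adaptiveTolerance: true`, the effective tolerance grows with query length: +1 for 3-4 token queries, +2 for 5-6 tokens, +3 beyond that.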

## With Synonyms (Supabase)

```typescript
import { create } from '@wcs-colab/orama';
import { pluginFuzzyPhrase } from '@wcs-colab/plugin-fuzzy-phrase';

const db = await create({
  schema: {
    content: 'string'
  },
  plugins: [
    pluginFuzzyPhrase({
      textProperty: 'content',
      enableSynonyms: true,
      supabase: {
        url: process.env.SUPABASE_URL!,
        serviceKey: process.env.SUPABASE_SERVICE_ROLE_KEY!
      }
    })
  ]
});

// Searches will now also include synonym matches,
// e.g. "humanité" will also match "homme" and "humain"
```
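
Synonyms are loaded once at startup via a `get_synonym_map` RPC on Supabase and kept in memory. The RPC is expected to return a plain word-to-synonyms map (shape inferred from the shipped build; the values below are illustrative). Keys should be lowercase, since query tokens are lowercased before lookup:

```typescript
// Expected return shape of the get_synonym_map RPC
type SynonymMap = Record<string, string[]>;

const example: SynonymMap = {
  'humanité': ['homme', 'humain'],
  'rapide': ['vite', 'prompt']
};
```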

## How It Works

### 1. Candidate Expansion

For each query token, the plugin finds three kinds of candidates (scoring sketched below):
- **Exact matches** - Exact word match (score: 1.0)
- **Fuzzy matches** - Within the edit-distance tolerance (score drops by 0.2 per edit, floored at 0.1; prefix matches score 0.95)
- **Synonym matches** - From the synonym dictionary (score: 0.8 by default)
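
A condensed sketch of the tiered candidate scoring, mirroring the `fuzzyMatch` helper in the shipped build (`boundedLevenshtein` is the bounded edit-distance function from `dist/index.cjs`):

```typescript
function candidateScore(word: string, queryToken: string, tolerance: number): number {
  if (word === queryToken) return 1.0;          // exact match
  if (word.startsWith(queryToken)) return 0.95; // prefix match
  const { isBounded, distance } = boundedLevenshtein(word, queryToken, tolerance);
  if (isBounded) return Math.max(0.1, 1 - distance * 0.2); // 0.2 penalty per edit
  return 0; // outside tolerance: not a candidate
}
```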

### 2. Phrase Finding

A sliding window finds phrases where (see the sketch below):
- Words are within `maxGap` distance of each other
- Multiple query tokens are present
- Phrases don't overlap
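
A simplified sketch of the window construction, condensed from `buildPhraseFromPosition` in the shipped build (each `WordMatch` records a matched document position and the query token it came from):

```typescript
interface WordMatch { position: number; queryToken: string; }

function buildPhrase(matches: WordMatch[], start: number, queryTokens: string[], maxGap: number): WordMatch[] {
  const phrase = [matches[start]];
  const covered = new Set([matches[start].queryToken]);
  for (let i = start + 1; i < matches.length; i++) {
    const gap = matches[i].position - phrase[phrase.length - 1].position - 1;
    if (gap > maxGap) break;                   // window closed: next word is too far away
    if (!covered.has(matches[i].queryToken)) { // each query token is counted once
      phrase.push(matches[i]);
      covered.add(matches[i].queryToken);
    }
    if (covered.size === queryTokens.length) break; // all query tokens covered
  }
  return phrase;
}
```

Overlapping candidate phrases are then deduplicated greedily, keeping the highest-scoring phrase for each covered span.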

### 3. Multi-Factor Scoring

Each phrase is scored using (the combined formula is sketched below):
- **Base score** - Quality of the word matches
- **Order bonus** - Words appear in query order
- **Proximity bonus** - Words close together
- **Density bonus** - Fraction of the query covered
- **Semantic bonus** - TF-IDF relevance weighting
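
Condensed from `calculatePhraseScore` in the shipped build: every component lies in [0, 1], and the weighted sum is normalized by the maximum achievable score so the final phrase score also stays in [0, 1]:

```typescript
// base:  average per-word match quality, weighted exact vs. fuzzy
// order: 1 if words are in query order, else 0.5
// prox:  max(0, 1 - span / (queryTokens.length * 5))
// dens:  matched tokens / query tokens
// sem:   min(1, avg IDF / 10), with IDF = ln(totalDocs / docFreq)
function combineScores(
  base: number, order: number, prox: number, dens: number, sem: number,
  w: { order: number; proximity: number; density: number; semantic: number }
): number {
  const total = base + order * w.order + prox * w.proximity + dens * w.density + sem * w.semantic;
  const maxPossible = 1 + w.order + w.proximity + w.density + w.semantic;
  return Math.min(1, total / maxPossible);
}
```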

### 4. Result Ranking

Documents are ranked by their highest-scoring phrase.

## Architecture

The plugin is completely independent from QPS:
- Accesses Orama's radix tree directly
- Uses the same `boundedLevenshtein` as the match-highlight plugin
- Implements custom phrase-level scoring
- Loads synonyms from Supabase (optional)

## Performance

- **Bounded Levenshtein** - Early termination once the edit distance exceeds the bound
- **Vocabulary extraction** - A single radix-tree walk per search
- **TF-IDF** - Document frequencies are pre-calculated at plugin initialization
- **Deduplication** - Non-overlapping phrase optimization

## License

Apache-2.0

## Version

3.1.16-custom.1

Compatible with `@wcs-colab/orama@3.1.16-custom.9`
package/dist/index.cjs
ADDED
@@ -0,0 +1,508 @@
'use strict';

// src/fuzzy.ts
// Two-row dynamic-programming Levenshtein with early termination: once every
// cell in the current row exceeds `bound`, the distance can no longer recover.
function boundedLevenshtein(a, b, bound) {
  if (a === b) {
    return { isBounded: true, distance: 0 };
  }
  const aLen = a.length;
  const bLen = b.length;
  if (Math.abs(aLen - bLen) > bound) {
    return { isBounded: false, distance: bound + 1 };
  }
  if (aLen > bLen) {
    [a, b] = [b, a];
  }
  const m = a.length;
  const n = b.length;
  let prevRow = new Array(n + 1);
  let currRow = new Array(n + 1);
  for (let j = 0; j <= n; j++) {
    prevRow[j] = j;
  }
  for (let i = 1; i <= m; i++) {
    currRow[0] = i;
    let minInRow = i;
    for (let j = 1; j <= n; j++) {
      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
      currRow[j] = Math.min(
        prevRow[j] + 1,        // deletion
        currRow[j - 1] + 1,    // insertion
        prevRow[j - 1] + cost  // substitution
      );
      minInRow = Math.min(minInRow, currRow[j]);
    }
    if (minInRow > bound) {
      return { isBounded: false, distance: bound + 1 };
    }
    [prevRow, currRow] = [currRow, prevRow];
  }
  const distance = prevRow[n];
  return {
    isBounded: distance <= bound,
    distance
  };
}
function fuzzyMatch(word, queryToken, tolerance) {
  if (word === queryToken) {
    return { matches: true, distance: 0, score: 1 };
  }
  if (word.startsWith(queryToken)) {
    return { matches: true, distance: 0, score: 0.95 };
  }
  const result = boundedLevenshtein(word, queryToken, tolerance);
  if (result.isBounded) {
    const score = 1 - result.distance * 0.2;
    return {
      matches: true,
      distance: result.distance,
      score: Math.max(0.1, score) // minimum score of 0.1
    };
  }
  return { matches: false, distance: tolerance + 1, score: 0 };
}
function calculateAdaptiveTolerance(queryTokens, baseTolerance) {
  const queryLength = queryTokens.length;
  if (queryLength <= 2) {
    return baseTolerance;
  } else if (queryLength <= 4) {
    return baseTolerance + 1;
  } else if (queryLength <= 6) {
    return baseTolerance + 2;
  } else {
    return baseTolerance + 3;
  }
}

// src/candidates.ts
function extractVocabularyFromRadixTree(radixNode) {
  const vocabulary = /* @__PURE__ */ new Set();
  function traverse(node) {
    if (node.w) {
      vocabulary.add(node.w);
    }
    if (node.c) {
      for (const child of Object.values(node.c)) {
        traverse(child);
      }
    }
  }
  traverse(radixNode);
  return vocabulary;
}
function findCandidatesForToken(queryToken, vocabulary, tolerance, synonyms, synonymScore = 0.8) {
  const candidates = [];
  const seen = /* @__PURE__ */ new Set();
  if (vocabulary.has(queryToken)) {
    candidates.push({
      word: queryToken,
      type: "exact",
      queryToken,
      distance: 0,
      score: 1
    });
    seen.add(queryToken);
  }
  for (const word of vocabulary) {
    if (seen.has(word)) continue;
    const match = fuzzyMatch(word, queryToken, tolerance);
    if (match.matches) {
      candidates.push({
        word,
        type: "fuzzy",
        queryToken,
        distance: match.distance,
        score: match.score
      });
      seen.add(word);
    }
  }
  if (synonyms && synonyms[queryToken]) {
    for (const synonym of synonyms[queryToken]) {
      if (seen.has(synonym)) continue;
      if (vocabulary.has(synonym)) {
        candidates.push({
          word: synonym,
          type: "synonym",
          queryToken,
          distance: 0,
          score: synonymScore
        });
        seen.add(synonym);
      }
    }
  }
  return candidates;
}
function findAllCandidates(queryTokens, vocabulary, tolerance, synonyms, synonymScore = 0.8) {
  const candidatesMap = /* @__PURE__ */ new Map();
  for (const token of queryTokens) {
    const tokenCandidates = findCandidatesForToken(
      token,
      vocabulary,
      tolerance,
      synonyms,
      synonymScore
    );
    candidatesMap.set(token, tokenCandidates);
  }
  return candidatesMap;
}
function filterCandidatesByScore(candidatesMap, minScore) {
  const filtered = /* @__PURE__ */ new Map();
  for (const [token, candidates] of candidatesMap.entries()) {
    const filteredCandidates = candidates.filter((c) => c.score >= minScore);
    if (filteredCandidates.length > 0) {
      filtered.set(token, filteredCandidates);
    }
  }
  return filtered;
}

// src/scoring.ts
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
  const phrases = [];
  const queryTokens = Array.from(candidatesMap.keys());
  const wordMatches = [];
  for (let i = 0; i < documentTokens.length; i++) {
    const docWord = documentTokens[i];
    for (const [queryToken, candidates] of candidatesMap.entries()) {
      for (const candidate of candidates) {
        if (candidate.word === docWord) {
          wordMatches.push({
            word: docWord,
            queryToken,
            position: i,
            type: candidate.type,
            distance: candidate.distance,
            score: candidate.score
          });
        }
      }
    }
  }
  for (let i = 0; i < wordMatches.length; i++) {
    const phrase = buildPhraseFromPosition(
      wordMatches,
      i,
      queryTokens,
      config,
      documentFrequency,
      totalDocuments
    );
    if (phrase && phrase.words.length > 0) {
      phrases.push(phrase);
    }
  }
  return deduplicatePhrases(phrases);
}
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments) {
  const startMatch = wordMatches[startIndex];
  const phraseWords = [startMatch];
  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
  for (let i = startIndex + 1; i < wordMatches.length; i++) {
    const match = wordMatches[i];
    const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
    if (gap > config.maxGap) {
      break;
    }
    if (!coveredTokens.has(match.queryToken)) {
      phraseWords.push(match);
      coveredTokens.add(match.queryToken);
    }
    if (coveredTokens.size === queryTokens.length) {
      break;
    }
  }
  if (phraseWords.length > 0) {
    const score = calculatePhraseScore(
      phraseWords,
      queryTokens,
      config,
      documentFrequency,
      totalDocuments
    );
    return {
      words: phraseWords,
      startPosition: phraseWords[0].position,
      endPosition: phraseWords[phraseWords.length - 1].position,
      gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
      inOrder: isInOrder(phraseWords, queryTokens),
      score
    };
  }
  return null;
}
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments) {
  let baseScore = 0;
  for (const word of phraseWords) {
    const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
    baseScore += word.score * weight;
  }
  baseScore /= phraseWords.length;
  const inOrder = isInOrder(phraseWords, queryTokens);
  const orderScore = inOrder ? 1 : 0.5;
  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
  const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
  const densityScore = phraseWords.length / queryTokens.length;
  const semanticScore = calculateSemanticScore(
    phraseWords,
    documentFrequency,
    totalDocuments
  );
  const weights = config.weights;
  const totalScore = baseScore + orderScore * weights.order + proximityScore * weights.proximity + densityScore * weights.density + semanticScore * weights.semantic;
  const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
  return Math.min(1, totalScore / maxPossibleScore);
}
function isInOrder(phraseWords, queryTokens) {
  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
  for (let i = 1; i < phraseWords.length; i++) {
    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
    if (currOrder < prevOrder) {
      return false;
    }
  }
  return true;
}
function calculateSemanticScore(phraseWords, documentFrequency, totalDocuments) {
  let tfidfSum = 0;
  for (const word of phraseWords) {
    const df = documentFrequency.get(word.word) || 1;
    const idf = Math.log(totalDocuments / df);
    tfidfSum += idf;
  }
  const avgTfidf = tfidfSum / phraseWords.length;
  return Math.min(1, avgTfidf / 10);
}
function deduplicatePhrases(phrases) {
  if (phrases.length === 0) return [];
  const sorted = phrases.slice().sort((a, b) => b.score - a.score);
  const result = [];
  const covered = /* @__PURE__ */ new Set();
  for (const phrase of sorted) {
    let overlaps = false;
    for (let pos = phrase.startPosition; pos <= phrase.endPosition; pos++) {
      if (covered.has(pos)) {
        overlaps = true;
        break;
      }
    }
    if (!overlaps) {
      result.push(phrase);
      for (let pos = phrase.startPosition; pos <= phrase.endPosition; pos++) {
        covered.add(pos);
      }
    }
  }
  return result.sort((a, b) => b.score - a.score);
}

// src/index.ts
var DEFAULT_CONFIG = {
  textProperty: "content",
  tolerance: 1,
  adaptiveTolerance: true,
  enableSynonyms: false,
  supabase: void 0,
  synonymMatchScore: 0.8,
  weights: {
    exact: 1,
    fuzzy: 0.8,
    order: 0.3,
    proximity: 0.2,
    density: 0.2,
    semantic: 0.15
  },
  maxGap: 5,
  minScore: 0.1
};
var pluginStates = /* @__PURE__ */ new WeakMap();
function pluginFuzzyPhrase(userConfig = {}) {
  const config = {
    textProperty: userConfig.textProperty ?? DEFAULT_CONFIG.textProperty,
    tolerance: userConfig.tolerance ?? DEFAULT_CONFIG.tolerance,
    adaptiveTolerance: userConfig.adaptiveTolerance ?? DEFAULT_CONFIG.adaptiveTolerance,
    enableSynonyms: userConfig.enableSynonyms ?? DEFAULT_CONFIG.enableSynonyms,
    supabase: userConfig.supabase || DEFAULT_CONFIG.supabase,
    synonymMatchScore: userConfig.synonymMatchScore ?? DEFAULT_CONFIG.synonymMatchScore,
    weights: {
      exact: userConfig.weights?.exact ?? DEFAULT_CONFIG.weights.exact,
      fuzzy: userConfig.weights?.fuzzy ?? DEFAULT_CONFIG.weights.fuzzy,
      order: userConfig.weights?.order ?? DEFAULT_CONFIG.weights.order,
      proximity: userConfig.weights?.proximity ?? DEFAULT_CONFIG.weights.proximity,
      density: userConfig.weights?.density ?? DEFAULT_CONFIG.weights.density,
      semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
    },
    maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
  };
  const plugin = {
    name: "fuzzy-phrase",
    /**
     * Initialize plugin after index is created
     */
    afterCreate: async (orama) => {
      console.log("\u{1F52E} Initializing Fuzzy Phrase Plugin...");
      const state = {
        synonymMap: {},
        config,
        documentFrequency: /* @__PURE__ */ new Map(),
        totalDocuments: 0
      };
      if (config.enableSynonyms && config.supabase) {
        try {
          console.log("\u{1F4D6} Loading synonyms from Supabase...");
          state.synonymMap = await loadSynonymsFromSupabase(config.supabase);
          console.log(`\u2705 Loaded ${Object.keys(state.synonymMap).length} words with synonyms`);
        } catch (error) {
          console.error("\u26A0\uFE0F Failed to load synonyms:", error);
        }
      }
      if (orama.data && typeof orama.data === "object") {
        const docs = orama.data.docs || {};
        state.totalDocuments = Object.keys(docs).length;
        state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
        console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
      }
      pluginStates.set(orama, state);
      console.log("\u2705 Fuzzy Phrase Plugin initialized");
    }
  };
  return plugin;
}
async function searchWithFuzzyPhrase(orama, params, language) {
  const startTime = performance.now();
  const state = pluginStates.get(orama);
  if (!state) {
    console.error("\u274C Plugin state not initialized");
    throw new Error("Fuzzy Phrase Plugin not properly initialized");
  }
  const { term, properties } = params;
  if (!term || typeof term !== "string") {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const textProperty = properties && properties[0] || state.config.textProperty;
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
  console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
  let vocabulary;
  try {
    const radixNode = orama.index?.indexes?.[textProperty]?.node;
    if (!radixNode) {
      console.error("\u274C Radix tree not found for property:", textProperty);
      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
    }
    vocabulary = extractVocabularyFromRadixTree(radixNode);
    console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
  } catch (error) {
    console.error("\u274C Failed to extract vocabulary:", error);
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const candidatesMap = findAllCandidates(
    queryTokens,
    vocabulary,
    tolerance,
    state.config.enableSynonyms ? state.synonymMap : void 0,
    state.config.synonymMatchScore
  );
  const filteredCandidates = filterCandidatesByScore(
    candidatesMap,
    state.config.minScore
  );
  console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
  const documentMatches = [];
  const docs = orama.data?.docs || {};
  for (const [docId, doc] of Object.entries(docs)) {
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    const docTokens = tokenize(text);
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredCandidates,
      {
        weights: state.config.weights,
        maxGap: state.config.maxGap
      },
      state.documentFrequency,
      state.totalDocuments
    );
    if (phrases.length > 0) {
      const docScore = Math.max(...phrases.map((p) => p.score));
      documentMatches.push({
        id: docId,
        phrases,
        score: docScore,
        document: doc
      });
    }
  }
  documentMatches.sort((a, b) => b.score - a.score);
  const hits = documentMatches.map((match) => ({
    id: match.id,
    score: match.score,
    document: match.document,
    // Store phrases for highlighting
    _phrases: match.phrases
  }));
  const elapsed = performance.now() - startTime;
  console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      raw: Math.floor(elapsed * 1e6) // nanoseconds
    },
    hits,
    count: hits.length
  };
}
async function loadSynonymsFromSupabase(supabaseConfig) {
  try {
    const { createClient } = await import('@supabase/supabase-js');
    const supabase = createClient(supabaseConfig.url, supabaseConfig.serviceKey);
    const { data, error } = await supabase.rpc("get_synonym_map");
    if (error) {
      throw new Error(`Supabase error: ${error.message}`);
    }
    return data || {};
  } catch (error) {
    console.error("Failed to load synonyms from Supabase:", error);
    throw error;
  }
}
function calculateDocumentFrequencies(docs, textProperty) {
  const df = /* @__PURE__ */ new Map();
  for (const doc of Object.values(docs)) {
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    const words = new Set(tokenize(text));
    for (const word of words) {
      df.set(word, (df.get(word) || 0) + 1);
    }
  }
  return df;
}
// Simple whitespace tokenizer: lowercases, no punctuation stripping
function tokenize(text) {
  return text.toLowerCase().split(/\s+/).filter((token) => token.length > 0);
}

exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
//# sourceMappingURL=index.cjs.map