@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.9 → 3.1.16-custom.newbase.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE.md ADDED
@@ -0,0 +1,13 @@
+ Copyright 2023 OramaSearch Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
package/README.md CHANGED
@@ -159,6 +159,6 @@ Apache-2.0
 
 ## Version
 
- 3.1.16-custom.1
+ 3.1.16-custom.newbase.1
 
 Compatible with `@wcs-colab/orama@3.1.16-custom.9`
package/dist/index.cjs CHANGED
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
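
Note: removing the `startsWith` shortcut means a prefix match like "testing" against query token "test" no longer gets an automatic 0.95; it must now fit within the Levenshtein tolerance like any other candidate. A minimal sketch of the distance involved, using a plain dynamic-programming edit distance for illustration (the package itself uses `boundedLevenshtein`, which bails out early once the bound is exceeded):

```js
// Classic edit distance; illustrative only, not the package's helper.
function levenshtein(a, b) {
  const dp = Array.from({ length: a.length + 1 }, (_, i) =>
    Array.from({ length: b.length + 1 }, (_, j) => (i === 0 ? j : j === 0 ? i : 0))
  );
  for (let i = 1; i <= a.length; i++) {
    for (let j = 1; j <= b.length; j++) {
      dp[i][j] = Math.min(
        dp[i - 1][j] + 1,                                   // deletion
        dp[i][j - 1] + 1,                                   // insertion
        dp[i - 1][j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1)  // substitution
      );
    }
  }
  return dp[a.length][b.length];
}

levenshtein("testing", "test"); // 3: rejected at tolerance 1, though the old prefix rule matched it
// A candidate within tolerance scores 1 - distance * 0.2, as in the hunk above.
```
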
@@ -82,30 +79,13 @@ function calculateAdaptiveTolerance(queryTokens, baseTolerance) {
 function extractVocabularyFromRadixTree(radixNode) {
   const vocabulary = /* @__PURE__ */ new Set();
   let nodesVisited = 0;
-  let wordsFound = 0;
   function traverse(node, depth = 0) {
     if (!node) {
-      console.log(`\u26A0\uFE0F Null node at depth ${depth}`);
       return;
     }
     nodesVisited++;
-    if (nodesVisited <= 3) {
-      const cInfo = node.c ? {
-        isArray: Array.isArray(node.c),
-        isMap: node.c instanceof Map,
-        type: typeof node.c,
-        constructor: node.c.constructor?.name,
-        keys: node.c instanceof Map ? Array.from(node.c.keys()).slice(0, 3) : Object.keys(node.c).slice(0, 3),
-        valuesCount: node.c instanceof Map ? node.c.size : Array.isArray(node.c) ? node.c.length : Object.keys(node.c).length
-      } : "null";
-      console.log(`\u{1F50D} Node ${nodesVisited}:`, { w: node.w, e: node.e, has_c: !!node.c, c_info: cInfo });
-    }
     if (node.e && node.w && typeof node.w === "string" && node.w.length > 0) {
       vocabulary.add(node.w);
-      wordsFound++;
-      if (wordsFound <= 5) {
-        console.log(`\u2705 Found word ${wordsFound}: "${node.w}"`);
-      }
     }
     if (node.c) {
       if (node.c instanceof Map) {
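
The traversal implies a node shape of `w` (the accumulated word), `e` (end-of-word flag), and `c` (children, either a `Map` or a plain object). A standalone sketch of the same walk with the debug logging gone, over a hand-built tree; the node shape is inferred from this hunk, not taken from Orama's documentation:

```js
// Two-word radix tree in the inferred { w, e, c } shape.
const root = {
  w: "", e: false,
  c: new Map([
    ["test", { w: "test", e: true, c: new Map([["ing", { w: "testing", e: true, c: null }]]) }]
  ])
};

function collectWords(node, out = new Set()) {
  if (!node) return out;
  if (node.e && typeof node.w === "string" && node.w.length > 0) out.add(node.w);
  if (node.c) {
    const children = node.c instanceof Map ? node.c.values() : Object.values(node.c);
    for (const child of children) collectWords(child, out);
  }
  return out;
}

console.log([...collectWords(root)]); // [ 'test', 'testing' ]
```
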
@@ -227,52 +207,85 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       queryTokens,
       config,
       documentFrequency,
-      totalDocuments
+      totalDocuments,
+      wordMatches,
+      documentTokens
+      // Pass document tokens to extract gap words
     );
     if (phrase && phrase.words.length > 0) {
       phrases.push(phrase);
     }
   }
-  return deduplicatePhrases(phrases);
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
-function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments) {
+function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
+  const gapWords = [];
+  let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
-    const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
+    const lastPos = phraseWords[phraseWords.length - 1].position;
+    const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-    if (!coveredTokens.has(match.queryToken)) {
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-      coveredTokens.add(match.queryToken);
-    }
-    if (coveredTokens.size === queryTokens.length) {
-      break;
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
-    const score = calculatePhraseScore(
+    const coverage = phraseWords.length / queryTokens.length;
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const { score, breakdown } = calculatePhraseScore(
       phraseWords,
       queryTokens,
       config,
       documentFrequency,
-      totalDocuments
+      totalDocuments,
+      allWordMatches,
+      coverage
     );
     return {
       words: phraseWords,
+      gapWords,
+      gapUsed: totalGapUsed,
+      coverage,
       startPosition: phraseWords[0].position,
       endPosition: phraseWords[phraseWords.length - 1].position,
-      gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
+      span,
       inOrder: isInOrder(phraseWords, queryTokens),
-      score
+      score,
+      scoreBreakdown: breakdown
     };
   }
   return null;
 }
-function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments) {
+function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
   let baseScore = 0;
   for (const word of phraseWords) {
     const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
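
The core change in `buildPhraseFromPosition`: the old `coveredTokens` `Set` could consume each distinct query token only once, so a query like "never say never" was satisfied by a single "never". The `Map`-based counters treat the query as a multiset. The pattern in isolation:

```js
// Same multiset counting as queryTokenCounts above.
function countTokens(tokens) {
  const counts = new Map();
  for (const t of tokens) counts.set(t, (counts.get(t) || 0) + 1);
  return counts;
}

const need = countTokens(["never", "say", "never"]);
need.get("never"); // 2, so a phrase must supply "never" twice to reach full coverage
// A match is consumed only while matchedCounts.get(token) < queryTokenCounts.get(token).
```
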
@@ -281,18 +294,53 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
-  const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
-  const densityScore = phraseWords.length / queryTokens.length;
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
+  let densityScore = 0;
+  if (queryTokens.length === 1) {
+    const totalOccurrences = allWordMatches.length;
+    densityScore = Math.min(1, totalOccurrences / 10);
+  }
   const semanticScore = calculateSemanticScore(
     phraseWords,
     documentFrequency,
     totalDocuments
   );
   const weights = config.weights;
-  const totalScore = baseScore + orderScore * weights.order + proximityScore * weights.proximity + densityScore * weights.density + semanticScore * weights.semantic;
-  const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
-  return Math.min(1, totalScore / maxPossibleScore);
+  const weightedBase = baseScore;
+  const weightedOrder = orderScore * weights.order;
+  const weightedProximity = proximityScore * weights.proximity;
+  const weightedDensity = densityScore * weights.density;
+  const weightedSemantic = semanticScore * weights.semantic;
+  const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
+  const normalizedScore = totalScore / maxPossibleScore;
+  const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
+  const score = normalizedScore * coverageMultiplier;
+  const base = weightedBase / maxPossibleScore;
+  const order = weightedOrder / maxPossibleScore;
+  const proximity = weightedProximity / maxPossibleScore;
+  const density = weightedDensity / maxPossibleScore;
+  const semantic = weightedSemantic / maxPossibleScore;
+  return {
+    score,
+    breakdown: {
+      base,
+      order,
+      proximity,
+      density,
+      semantic,
+      coverage: coverageMultiplier
+      // Show coverage multiplier in breakdown
+    }
+  };
 }
 function isInOrder(phraseWords, queryTokens) {
   const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
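
The rewritten normalization is worth a worked example. `maxPossibleScore` now includes only components that are attainable under the current config: the base ceiling is `max(exact, fuzzy)` only when fuzzy matches are possible, and proximity counts only for multi-token queries with `maxGap > 0`. The normalized score is then scaled by coverage, so partial matches can no longer outrank complete ones. With illustrative weights (only `semantic: 0.15` is visible in this diff; the other numbers are made up for the arithmetic):

```js
// Illustrative numbers, not the package defaults.
const weights = { exact: 1, fuzzy: 0.8, order: 0.3, proximity: 0.2, density: 0.1, semantic: 0.15 };
const canHaveFuzzyMatches = true; // tolerance > 0 && weights.fuzzy > 0
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
const maxPossibleScore =
  maxBaseWeight + weights.order + weights.proximity + weights.density + weights.semantic; // 1.75

// A phrase with totalScore 1.4 covering 2 of 3 query tokens:
const normalized = 1.4 / maxPossibleScore; // 0.8
const score = normalized * (2 / 3);        // ≈ 0.533; the coverage multiplier does the demotion
```
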
@@ -306,6 +354,9 @@ function isInOrder(phraseWords, queryTokens) {
   return true;
 }
 function calculateSemanticScore(phraseWords, documentFrequency, totalDocuments) {
+  if (totalDocuments === 0) {
+    return 0;
+  }
   let tfidfSum = 0;
   for (const word of phraseWords) {
     const df = documentFrequency.get(word.word) || 1;
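
The new `totalDocuments === 0` guard protects the TF-IDF sum below it: with an empty index, any IDF term derived from `totalDocuments` would be degenerate (zero, or a log of zero, depending on the exact formula, which is not visible in this hunk), so the semantic component is simply zeroed out.
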
@@ -341,7 +392,8 @@ function deduplicatePhrases(phrases) {
 
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "content",
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
@@ -356,7 +408,10 @@ var DEFAULT_CONFIG = {
     semantic: 0.15
   },
   maxGap: 5,
-  minScore: 0.1
+  minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
+  proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
 function pluginFuzzyPhrase(userConfig = {}) {
@@ -376,7 +431,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
       semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
     },
     maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
-    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
+    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+    enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+    finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
+    proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
   };
   const plugin = {
     name: "fuzzy-phrase",
@@ -400,14 +458,22 @@ function pluginFuzzyPhrase(userConfig = {}) {
         console.error("\u26A0\uFE0F Failed to load synonyms:", error);
       }
     }
-    if (orama.data && typeof orama.data === "object") {
-      const docs = orama.data.docs || {};
+    const docs = orama.data?.docs?.docs;
+    if (docs) {
       state.totalDocuments = Object.keys(docs).length;
       state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
       console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
     }
     pluginStates.set(orama, state);
     console.log("\u2705 Fuzzy Phrase Plugin initialized");
+    setImmediate(() => {
+      if (typeof globalThis.fuzzyPhrasePluginReady === "function") {
+        console.log("\u{1F4E1} Signaling plugin ready...");
+        globalThis.fuzzyPhrasePluginReady();
+      } else {
+        console.warn("\u26A0\uFE0F fuzzyPhrasePluginReady callback not found");
+      }
+    });
   }
 };
 return plugin;
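
The `setImmediate` block gives the host process a post-initialization hook (Node-only: `setImmediate` is not a browser API). A host that must not serve queries before the plugin is ready could register the callback before the Orama instance is created (the callback name is fixed by the code above; the rest is illustrative):

```js
// Register before creating the database so onInit can find it.
globalThis.fuzzyPhrasePluginReady = () => {
  console.log("fuzzy-phrase plugin initialized; safe to serve searches");
};
```
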
@@ -464,13 +530,34 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     state.config.enableSynonyms ? state.synonymMap : void 0,
     state.config.synonymMatchScore
   );
-  const filteredCandidates = filterCandidatesByScore(
-    candidatesMap,
-    state.config.minScore
-  );
+  const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
   const documentMatches = [];
-  const docs = orama.data?.docs || {};
+  console.log("\u{1F50D} DEBUG orama.data structure:", {
+    dataKeys: Object.keys(orama.data || {}),
+    hasDocs: !!orama.data?.docs,
+    docsType: orama.data?.docs ? typeof orama.data.docs : "undefined"
+  });
+  let docs = {};
+  if (orama.data?.docs?.docs) {
+    docs = orama.data.docs.docs;
+    console.log("\u2705 Found docs at orama.data.docs.docs");
+  } else if (orama.data?.docs && typeof orama.data.docs === "object") {
+    const firstKey = Object.keys(orama.data.docs)[0];
+    if (firstKey && firstKey !== "sharedInternalDocumentStore" && firstKey !== "count") {
+      docs = orama.data.docs;
+      console.log("\u2705 Found docs at orama.data.docs (direct)");
+    }
+  }
+  if (Object.keys(docs).length === 0) {
+    console.log("\u274C Could not find documents - available structure:", {
+      hasDataDocs: !!orama.data?.docs,
+      dataDocsKeys: orama.data?.docs ? Object.keys(orama.data.docs) : "none",
+      hasDataDocsDocs: !!orama.data?.docs?.docs,
+      dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
+    });
+  }
+  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
   for (const [docId, doc] of Object.entries(docs)) {
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
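
The lookup chain above encodes where this Orama fork keeps its documents: the raw id-to-document map sits at `orama.data.docs.docs`, while the outer `docs` object also carries bookkeeping keys such as `sharedInternalDocumentStore` and `count` (hence the first-key check in the fallback). Reduced to its essentials, without the diagnostics:

```js
// Same resolution order as the hunk above.
function resolveDocs(orama) {
  if (orama.data?.docs?.docs) return orama.data.docs.docs; // nested store
  const outer = orama.data?.docs;
  if (outer && typeof outer === "object") {
    const firstKey = Object.keys(outer)[0];
    if (firstKey && firstKey !== "sharedInternalDocumentStore" && firstKey !== "count") {
      return outer; // flat store
    }
  }
  return {};
}
```
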
@@ -482,7 +569,9 @@ async function searchWithFuzzyPhrase(orama, params, language) {
       filteredCandidates,
       {
         weights: state.config.weights,
-        maxGap: state.config.maxGap
+        maxGap: state.config.maxGap,
+        proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+        tolerance
       },
       state.documentFrequency,
       state.totalDocuments
@@ -498,7 +587,16 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     }
   }
   documentMatches.sort((a, b) => b.score - a.score);
-  const hits = documentMatches.map((match) => ({
+  let filteredMatches = documentMatches;
+  if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+    const threshold = state.config.finalScoreMinimum;
+    const beforeCount = filteredMatches.length;
+    filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? filteredMatches.length;
+  const limitedMatches = filteredMatches.slice(0, limit);
+  const hits = limitedMatches.map((match) => ({
     id: match.id,
     score: match.score,
     document: match.document,
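
The post-ranking pipeline is new in this version: sort, optionally apply the score floor, then honor `params.limit` (previously every match was returned regardless of the requested limit). As a standalone sketch:

```js
// Mirrors the hunk above: sort → optional score floor → limit.
function finalizeMatches(documentMatches, config, limit) {
  const sorted = [...documentMatches].sort((a, b) => b.score - a.score);
  const filtered = config.enableFinalScoreMinimum && config.finalScoreMinimum > 0
    ? sorted.filter((m) => m.score >= config.finalScoreMinimum)
    : sorted;
  return filtered.slice(0, limit ?? filtered.length);
}

finalizeMatches(
  [{ id: "a", score: 0.9 }, { id: "b", score: 0.2 }, { id: "c", score: 0.6 }],
  { enableFinalScoreMinimum: true, finalScoreMinimum: 0.3 },
  1
); // → [{ id: "a", score: 0.9 }]
```
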
@@ -506,7 +604,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     _phrases: match.phrases
   }));
   const elapsed = performance.now() - startTime;
-  console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms`);
+  console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms (limit: ${limit})`);
   return {
     elapsed: {
       formatted: `${elapsed.toFixed(2)}ms`,
@@ -519,15 +617,25 @@ async function searchWithFuzzyPhrase(orama, params, language) {
 }
 async function loadSynonymsFromSupabase(supabaseConfig) {
   try {
+    console.log("\u{1F50D} DEBUG: Calling Supabase RPC get_synonym_map...");
     const { createClient } = await import('@supabase/supabase-js');
     const supabase = createClient(supabaseConfig.url, supabaseConfig.serviceKey);
     const { data, error } = await supabase.rpc("get_synonym_map");
+    console.log("\u{1F50D} DEBUG: Supabase RPC response:", {
+      hasError: !!error,
+      errorMessage: error?.message,
+      hasData: !!data,
+      dataType: typeof data,
+      dataKeys: data ? Object.keys(data).length : 0
+    });
     if (error) {
       throw new Error(`Supabase error: ${error.message}`);
     }
-    return data || {};
+    const synonymMap = data || {};
+    console.log(`\u{1F4DA} Loaded ${Object.keys(synonymMap).length} synonym entries from Supabase`);
+    return synonymMap;
   } catch (error) {
-    console.error("Failed to load synonyms from Supabase:", error);
+    console.error("\u274C Failed to load synonyms from Supabase:", error);
     throw error;
   }
 }
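
The RPC contract itself is not visible in this diff; from the way the result is keyed with `Object.keys` and fed to the synonym map, `get_synonym_map` presumably returns a plain term-keyed object such as `{ "car": ["automobile", "vehicle"] }` (an assumption, not a documented shape). A minimal standalone call using the real `@supabase/supabase-js` API:

```js
// Assumed return shape: { term: [synonym, ...], ... }
async function fetchSynonymMap(url, serviceKey) {
  const { createClient } = await import("@supabase/supabase-js");
  const supabase = createClient(url, serviceKey);
  const { data, error } = await supabase.rpc("get_synonym_map");
  if (error) throw new Error(`Supabase error: ${error.message}`);
  return data ?? {};
}
```
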
@@ -545,8 +653,11 @@ function calculateDocumentFrequencies(docs, textProperty) {
   }
   return df;
 }
+function normalizeText(text) {
+  return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
+}
 function tokenize(text) {
-  return text.toLowerCase().split(/\s+/).filter((token) => token.length > 0);
+  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
 }
 
 exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
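
What the new `normalizeText` buys over the old bare `toLowerCase().split(/\s+/)`: NFD diacritic stripping, elision handling (an apostrophe after l/d/c/j/m/n/s/t followed by a letter becomes a word break, as in French "l'été"), smart-quote normalization, and punctuation-to-space replacement. Neither helper is exported, so this assumes the two functions copied verbatim from the hunk above:

```js
tokenize("L'été, c'est génial !"); // → [ 'ete', 'est', 'genial' ]
tokenize("«Naïve» résumé—test");   // → [ 'naive', 'resume', 'test' ]
```
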