@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.2 → 3.1.16-custom.newbase.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
@@ -182,24 +179,32 @@ function filterCandidatesByScore(candidatesMap, minScore) {
 }
 
 // src/scoring.ts
-function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, docPositions) {
   const phrases = [];
-  const queryTokens = Array.from(candidatesMap.keys());
+  const queryTokens = originalQueryTokens;
   const wordMatches = [];
+  const candidateLookup = /* @__PURE__ */ new Map();
+  for (const [queryToken, candidates] of candidatesMap.entries()) {
+    for (const candidate of candidates) {
+      if (!candidateLookup.has(candidate.word)) {
+        candidateLookup.set(candidate.word, []);
+      }
+      candidateLookup.get(candidate.word).push({ queryToken, candidate });
+    }
+  }
   for (let i = 0; i < documentTokens.length; i++) {
     const docWord = documentTokens[i];
-    for (const [queryToken, candidates] of candidatesMap.entries()) {
-      for (const candidate of candidates) {
-        if (candidate.word === docWord) {
-          wordMatches.push({
-            word: docWord,
-            queryToken,
-            position: i,
-            type: candidate.type,
-            distance: candidate.distance,
-            score: candidate.score
-          });
-        }
+    const matches = candidateLookup.get(docWord);
+    if (matches) {
+      for (const { queryToken, candidate } of matches) {
+        wordMatches.push({
+          word: docWord,
+          queryToken,
+          position: i,
+          type: candidate.type,
+          distance: candidate.distance,
+          score: candidate.score
+        });
       }
     }
   }
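
Annotation: the hunk above swaps the nested scan over every candidate list for a reverse map keyed by candidate word, so each document token costs one Map lookup instead of a pass over all candidates. A minimal standalone sketch of the same idea (sample data is hypothetical, not from the package):

```js
// Invert queryToken -> candidates into word -> [{ queryToken, candidate }].
const candidatesMap = new Map([
  ["serch", [{ word: "search", type: "fuzzy", distance: 1, score: 0.8 }]],
  ["engine", [{ word: "engine", type: "exact", distance: 0, score: 1 }]]
]);
const candidateLookup = new Map();
for (const [queryToken, candidates] of candidatesMap.entries()) {
  for (const candidate of candidates) {
    if (!candidateLookup.has(candidate.word)) candidateLookup.set(candidate.word, []);
    candidateLookup.get(candidate.word).push({ queryToken, candidate });
  }
}
// One Map.get per document token instead of a loop over every candidate list:
console.log(candidateLookup.get("search")); // [ { queryToken: 'serch', candidate: {...} } ]
```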
@@ -211,47 +216,76 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       config,
       documentFrequency,
       totalDocuments,
-      wordMatches
-      // Pass all word matches for density calculation
+      wordMatches,
+      documentTokens
+      // Pass document tokens to extract gap words
     );
     if (phrase && phrase.words.length > 0) {
       phrases.push(phrase);
     }
   }
-  return deduplicatePhrases(phrases);
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
-function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
+  const gapWords = [];
+  let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
-    const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
+    const lastPos = phraseWords[phraseWords.length - 1].position;
+    const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-    if (!coveredTokens.has(match.queryToken)) {
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-      coveredTokens.add(match.queryToken);
-    }
-    if (coveredTokens.size === queryTokens.length) {
-      break;
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
+    const coverage = phraseWords.length / queryTokens.length;
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
     const { score, breakdown } = calculatePhraseScore(
       phraseWords,
       queryTokens,
       config,
       documentFrequency,
       totalDocuments,
-      allWordMatches
+      allWordMatches,
+      coverage
     );
     return {
       words: phraseWords,
+      gapWords,
+      gapUsed: totalGapUsed,
+      coverage,
       startPosition: phraseWords[0].position,
       endPosition: phraseWords[phraseWords.length - 1].position,
-      gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
+      span,
       inOrder: isInOrder(phraseWords, queryTokens),
       score,
       scoreBreakdown: breakdown
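
Annotation: two behavioral changes in this hunk are easy to miss: repeated query tokens can now match repeatedly (per-token counts instead of a covered set), and queries of three or more tokens discard single-word phrases before deduplication. A small illustration of the counting, with hypothetical data:

```js
// For the query ["new", "york", "new"], "new" may be matched twice.
const queryTokens = ["new", "york", "new"];
const queryTokenCounts = new Map();
for (const token of queryTokens) {
  queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
}
console.log(queryTokenCounts); // Map(2) { 'new' => 2, 'york' => 1 }

// With 3+ query tokens, one-word phrases are filtered out:
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
console.log(minTokensRequired); // 2
```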
@@ -259,7 +293,7 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
   }
   return null;
 }
-function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
   let baseScore = 0;
   for (const word of phraseWords) {
     const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
@@ -268,14 +302,16 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
-  const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
   let densityScore = 0;
   if (queryTokens.length === 1) {
     const totalOccurrences = allWordMatches.length;
-    densityScore = totalOccurrences / queryTokens.length;
-  } else {
-    densityScore = phraseWords.length / queryTokens.length;
+    densityScore = Math.min(1, totalOccurrences / 10);
   }
   const semanticScore = calculateSemanticScore(
     phraseWords,
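
Annotation: the proximity term now only applies to multi-token queries with gaps enabled, and its window is configurable. A worked example using the default `proximitySpanMultiplier` of 5 from the config hunk further down (the span value is hypothetical):

```js
const queryTokenCount = 3;         // tokens in the query
const proximitySpanMultiplier = 5; // default from DEFAULT_CONFIG below
const span = 4;                    // endPosition - startPosition + 1, hypothetical
const proximityWindow = queryTokenCount * proximitySpanMultiplier; // 15
const proximityScore = Math.max(0, 1 - span / proximityWindow);
console.log(proximityScore.toFixed(3)); // "0.733" - tighter phrases score closer to 1
```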
@@ -289,8 +325,13 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   const weightedDensity = densityScore * weights.density;
   const weightedSemantic = semanticScore * weights.semantic;
   const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
-  const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
-  const score = totalScore / maxPossibleScore;
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
+  const normalizedScore = totalScore / maxPossibleScore;
+  const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
+  const score = normalizedScore * coverageMultiplier;
   const base = weightedBase / maxPossibleScore;
   const order = weightedOrder / maxPossibleScore;
   const proximity = weightedProximity / maxPossibleScore;
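
Annotation: the old denominator `1 + order + proximity + density + semantic` assumed a perfect base score of 1 was always reachable; the new one caps the base term at the best attainable weight, zeroes proximity when it cannot apply, and multiplies by coverage. A worked example with assumed weights (only `semantic: 0.15`, `maxGap: 5`, and `tolerance: 1` appear in this diff; the rest are illustrative):

```js
const weights = { exact: 1, fuzzy: 0.8, order: 0.3, proximity: 0.2, density: 0.1, semantic: 0.15 };
const config = { tolerance: 1, maxGap: 5 };
const queryTokenCount = 2;
const coverage = 1 / 2; // matched 1 of 2 query tokens

const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact; // 1
const effectiveProximityWeight =
  config.maxGap > 0 && weights.proximity > 0 && queryTokenCount > 1 ? weights.proximity : 0; // 0.2
const maxPossibleScore =
  maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic; // 1.75
const totalScore = 1.2; // hypothetical weighted sum
console.log(((totalScore / maxPossibleScore) * coverage).toFixed(3)); // "0.343"
```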
@@ -303,18 +344,27 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
       order,
       proximity,
       density,
-      semantic
+      semantic,
+      coverage: coverageMultiplier
+      // Show coverage multiplier in breakdown
     }
   };
 }
 function isInOrder(phraseWords, queryTokens) {
-  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
-  for (let i = 1; i < phraseWords.length; i++) {
-    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
-    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
-    if (currOrder < prevOrder) {
+  const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+  let lastMatchedIndex = -1;
+  for (const phraseWord of phraseWords) {
+    let foundIndex = -1;
+    for (const pos of tokenPositions) {
+      if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+        foundIndex = pos.index;
+        break;
+      }
+    }
+    if (foundIndex === -1) {
       return false;
     }
+    lastMatchedIndex = foundIndex;
   }
   return true;
 }
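
Annotation: the rewritten `isInOrder` fixes duplicate query tokens. The old `new Map(queryTokens.map(...))` kept only the last index per token, so a query like `["big", "data", "big"]` mapped both `big`s to index 2 and rejected a correctly ordered match. The greedy version consumes query positions left to right; a condensed check with hypothetical data:

```js
const queryTokens = ["big", "data", "big"];
const phraseWords = [{ queryToken: "big" }, { queryToken: "data" }, { queryToken: "big" }];

const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
let lastMatchedIndex = -1;
let inOrder = true;
for (const phraseWord of phraseWords) {
  const pos = tokenPositions.find((p) => p.token === phraseWord.queryToken && p.index > lastMatchedIndex);
  if (!pos) { inOrder = false; break; }
  lastMatchedIndex = pos.index;
}
console.log(inOrder); // true - the old Map-based version returned false for this input
```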
@@ -357,7 +407,8 @@ function deduplicatePhrases(phrases) {
 
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "content",
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
@@ -372,7 +423,10 @@ var DEFAULT_CONFIG = {
     semantic: 0.15
   },
   maxGap: 5,
-  minScore: 0.1
+  minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
+  proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
 function pluginFuzzyPhrase(userConfig = {}) {
  function pluginFuzzyPhrase(userConfig = {}) {
@@ -392,7 +446,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
392
446
  semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
393
447
  },
394
448
  maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
395
- minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
449
+ minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
450
+ enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
451
+ finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
452
+ proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
396
453
  };
397
454
  const plugin = {
398
455
  name: "fuzzy-phrase",
@@ -405,7 +462,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
       synonymMap: {},
       config,
       documentFrequency: /* @__PURE__ */ new Map(),
-      totalDocuments: 0
+      totalDocuments: 0,
+      vocabulary: /* @__PURE__ */ new Set()
     };
     if (config.enableSynonyms && config.supabase) {
       try {
@@ -422,6 +480,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
       state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
       console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
     }
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[config.textProperty]?.node) {
+        radixNode = indexData.indexes[config.textProperty].node;
+      } else if (indexData?.[config.textProperty]?.node) {
+        radixNode = indexData[config.textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+      } else {
+        console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
+      }
+    } catch (error) {
+      console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
+    }
     pluginStates.set(orama, state);
     console.log("\u2705 Fuzzy Phrase Plugin initialized");
     setImmediate(() => {
@@ -443,7 +518,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     console.error("\u274C Plugin state not initialized");
     throw new Error("Fuzzy Phrase Plugin not properly initialized");
   }
-  const { term, properties } = params;
+  const { term, properties, tokenCache } = params;
   if (!term || typeof term !== "string") {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
@@ -454,32 +529,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
   }
   const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
   console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
-  let vocabulary;
-  try {
-    const indexData = orama.data?.index;
-    if (!indexData) {
-      console.error("\u274C No index data found in orama.data.index");
-      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
-    }
-    console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
-    let radixNode = null;
-    if (indexData.indexes?.[textProperty]?.node) {
-      radixNode = indexData.indexes[textProperty].node;
-      console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
-    } else if (indexData[textProperty]?.node) {
-      radixNode = indexData[textProperty].node;
-      console.log("\u2705 Found radix via standard path (data.index[property])");
-    }
-    if (!radixNode) {
-      console.error("\u274C Radix tree not found for property:", textProperty);
-      console.error(" Available properties in index:", Object.keys(indexData));
+  let vocabulary = state.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = state.vocabulary;
+        console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
       return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
     }
-    vocabulary = extractVocabularyFromRadixTree(radixNode);
-    console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
-  } catch (error) {
-    console.error("\u274C Failed to extract vocabulary:", error);
-    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  } else {
+    console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
   }
   const candidatesMap = findAllCandidates(
     queryTokens,
  const candidatesMap = findAllCandidates(
485
559
  queryTokens,
@@ -488,10 +562,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     state.config.enableSynonyms ? state.synonymMap : void 0,
     state.config.synonymMatchScore
   );
-  const filteredCandidates = filterCandidatesByScore(
-    candidatesMap,
-    state.config.minScore
-  );
+  const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
   const documentMatches = [];
   console.log("\u{1F50D} DEBUG orama.data structure:", {
@@ -518,23 +589,44 @@ async function searchWithFuzzyPhrase(orama, params, language) {
       dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
     });
   }
-  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
+  const cacheHits = tokenCache ? tokenCache.size : 0;
+  let hasPositionalIndex = false;
+  if (tokenCache && tokenCache.size > 0) {
+    const firstEntry = tokenCache.values().next().value;
+    hasPositionalIndex = !!(firstEntry && !Array.isArray(firstEntry) && firstEntry.positions);
+  }
+  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${hasPositionalIndex ? "\u26A1 positional index" : cacheHits > 0 ? "tokens cached" : "no cache"})`);
   for (const [docId, doc] of Object.entries(docs)) {
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
       continue;
     }
-    const docTokens = tokenize(text);
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      const cached = tokenCache.get(docId);
+      if (Array.isArray(cached)) {
+        docTokens = cached;
+      } else if (cached.tokens && cached.positions) {
+        docTokens = cached.tokens;
+        cached.positions;
+      } else {
+        docTokens = tokenize(text);
+      }
+    } else {
+      docTokens = tokenize(text);
+    }
     const phrases = findPhrasesInDocument(
       docTokens,
       filteredCandidates,
       {
         weights: state.config.weights,
-        maxGap: state.config.maxGap
+        maxGap: state.config.maxGap,
+        proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+        tolerance
       },
       state.documentFrequency,
-      state.totalDocuments
-    );
+      state.totalDocuments,
+      queryTokens);
     if (phrases.length > 0) {
       const docScore = Math.max(...phrases.map((p) => p.score));
       documentMatches.push({
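
Annotation: the loop above accepts an optional `tokenCache` in two shapes: a plain token array, or `{ tokens, positions }` (the positional shape is detected for logging, but in this build `cached.positions` is evaluated and discarded). A sketch of pre-building such a cache; `tokenize` here is a hypothetical stand-in for the plugin's own tokenizer:

```js
const docs = { d1: { normalized_content: "fuzzy phrase search engine" } }; // hypothetical
const tokenize = (text) => text.toLowerCase().split(/\W+/).filter(Boolean);

const tokenCache = new Map();
for (const [docId, doc] of Object.entries(docs)) {
  tokenCache.set(docId, tokenize(doc.normalized_content ?? "")); // plain-array shape
  // Positional shape (also accepted): tokenCache.set(docId, { tokens, positions });
}
// Then pass it through the search params alongside `term`:
// search(orama, { term: "fuzzy phrase", tokenCache });
```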
@@ -546,8 +638,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     }
   }
   documentMatches.sort((a, b) => b.score - a.score);
-  const limit = params.limit ?? documentMatches.length;
-  const limitedMatches = documentMatches.slice(0, limit);
+  let filteredMatches = documentMatches;
+  if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+    const threshold = state.config.finalScoreMinimum;
+    const beforeCount = filteredMatches.length;
+    filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? filteredMatches.length;
+  const limitedMatches = filteredMatches.slice(0, limit);
   const hits = limitedMatches.map((match) => ({
     id: match.id,
     score: match.score,