npm - @wcs-colab/plugin-fuzzy-phrase - Versions diffs - 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.32 - Mend

@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.cjs CHANGED

@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
@@ -182,24 +179,51 @@ function filterCandidatesByScore(candidatesMap, minScore) {
 }
 // src/scoring.ts
-function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+function buildCandidateLookup(candidatesMap) {
+  const candidateLookup = /* @__PURE__ */ new Map();
+  for (const [queryToken, candidates] of candidatesMap.entries()) {
+    for (const candidate of candidates) {
+      if (!candidateLookup.has(candidate.word)) {
+        candidateLookup.set(candidate.word, []);
+      }
+      candidateLookup.get(candidate.word).push({ queryToken, candidate });
+    }
+  }
+  for (const entries of candidateLookup.values()) {
+    entries.sort((a, b) => {
+      if (a.candidate.type === "exact" && b.candidate.type !== "exact")
+        return -1;
+      if (b.candidate.type === "exact" && a.candidate.type !== "exact")
+        return 1;
+      return b.candidate.score - a.candidate.score;
+    });
+  }
+  return candidateLookup;
+}
+function buildQueryTokenCounts(queryTokens) {
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  return queryTokenCounts;
+}
+function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, candidateLookup, queryTokenCounts) {
   const phrases = [];
-  const queryTokens = Array.from(candidatesMap.keys());
+  const queryTokens = originalQueryTokens;
   const wordMatches = [];
   for (let i = 0; i < documentTokens.length; i++) {
     const docWord = documentTokens[i];
-    for (const [queryToken, candidates] of candidatesMap.entries()) {
-      for (const candidate of candidates) {
-        if (candidate.word === docWord) {
-          wordMatches.push({
-            word: docWord,
-            queryToken,
-            position: i,
-            type: candidate.type,
-            distance: candidate.distance,
-            score: candidate.score
-          });
-        }
+    const matches = candidateLookup.get(docWord);
+    if (matches) {
+      for (const { queryToken, candidate } of matches) {
+        wordMatches.push({
+          word: docWord,
+          queryToken,
+          position: i,
+          type: candidate.type,
+          distance: candidate.distance,
+          score: candidate.score
+        });
       }
     }
   }
@@ -212,42 +236,56 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       documentFrequency,
       totalDocuments,
       wordMatches,
-      documentTokens
-      // Pass document tokens to extract gap words
+      documentTokens,
+      queryTokenCounts
+      // OPTIMIZATION B: Pass pre-built queryTokenCounts
     );
     if (phrase && phrase.words.length > 0) {
       phrases.push(phrase);
     }
   }
-  return deduplicatePhrases(phrases);
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
-function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
+function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens, queryTokenCounts) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
   const gapWords = [];
   let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
     const lastPos = phraseWords[phraseWords.length - 1].position;
+    if (match.position <= lastPos) {
+      continue;
+    }
     const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-    for (let pos = lastPos + 1; pos < match.position; pos++) {
-      totalGapUsed++;
-      gapWords.push({
-        word: documentTokens[pos],
-        position: pos,
-        gapIndex: totalGapUsed
-      });
+    if (totalGapUsed + gap > config.maxGap) {
+      break;
     }
-    if (!coveredTokens.has(match.queryToken)) {
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-      coveredTokens.add(match.queryToken);
-    }
-    if (coveredTokens.size === queryTokens.length) {
-      break;
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
@@ -286,9 +324,12 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
-  const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
-  const proximityScore = Math.max(0, 1 - span / proximityWindow);
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
   let densityScore = 0;
   if (queryTokens.length === 1) {
     const totalOccurrences = allWordMatches.length;
@@ -306,8 +347,10 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   const weightedDensity = densityScore * weights.density;
   const weightedSemantic = semanticScore * weights.semantic;
   const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
-  const maxBaseWeight = Math.max(weights.exact, weights.fuzzy);
-  const maxPossibleScore = maxBaseWeight + weights.order + weights.proximity + weights.density + weights.semantic;
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
   const normalizedScore = totalScore / maxPossibleScore;
   const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
   const score = normalizedScore * coverageMultiplier;
@@ -330,13 +373,20 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   };
 }
 function isInOrder(phraseWords, queryTokens) {
-  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
-  for (let i = 1; i < phraseWords.length; i++) {
-    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
-    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
-    if (currOrder < prevOrder) {
+  const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+  let lastMatchedIndex = -1;
+  for (const phraseWord of phraseWords) {
+    let foundIndex = -1;
+    for (const pos of tokenPositions) {
+      if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+        foundIndex = pos.index;
+        break;
+      }
+    }
+    if (foundIndex === -1) {
       return false;
     }
+    lastMatchedIndex = foundIndex;
   }
   return true;
 }
@@ -377,9 +427,259 @@ function deduplicatePhrases(phrases) {
   return result.sort((a, b) => b.score - a.score);
 }
+// src/optimized.ts
+var DEFAULT_OPTIMIZED_CONFIG = {
+  maxQPSCandidates: 100,
+  // Limit phrase scoring to top 100 candidates
+  minQPSScore: 0.1,
+  // Include candidates with 10%+ of best score
+  qpsExact: false,
+  // Use fuzzy matching by default
+  qpsTolerance: 1
+  // Default tolerance of 1 edit distance
+};
+function normalizeText(text) {
+  return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
+}
+function tokenize(text) {
+  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function buildReducedVocabularyFromDocs(candidateDocIds, docs) {
+  const reducedVocab = /* @__PURE__ */ new Set();
+  for (const docId of candidateDocIds) {
+    const doc = docs[docId];
+    if (!doc?.normalized_content)
+      continue;
+    const tokens = doc.normalized_content.split(/\s+/).filter((token) => token.length > 0);
+    for (const token of tokens) {
+      reducedVocab.add(token);
+    }
+  }
+  return reducedVocab;
+}
+function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
+  const tokens = tokenizer.tokenize(term, language);
+  if (tokens.length === 0) {
+    return [];
+  }
+  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
+  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
+  const boostPerProp = config.qpsBoostPerProp ?? {};
+  const resultMap = /* @__PURE__ */ new Map();
+  for (const prop of properties) {
+    const indexEntry = qpsIndex.indexes[prop];
+    if (!indexEntry || indexEntry.type !== "Radix") {
+      continue;
+    }
+    const radixNode = indexEntry.node;
+    const stats = qpsIndex.stats[prop];
+    if (!radixNode || !stats) {
+      continue;
+    }
+    const boost = boostPerProp[prop] ?? 1;
+    for (let i = 0; i < tokens.length; i++) {
+      const token = tokens[i];
+      const matches = radixNode.find({
+        term: token,
+        exact,
+        tolerance: exact ? 0 : tolerance
+      });
+      for (const [matchedWord, docIds] of Object.entries(matches)) {
+        if (!Array.isArray(docIds))
+          continue;
+        const isExactMatch = matchedWord === token;
+        for (const docId of docIds) {
+          const tokensLength = stats.tokensLength.get(docId) || 1;
+          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
+          const occurrences = quantum ? quantum >> 20 : 1;
+          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
+          if (!resultMap.has(docId)) {
+            resultMap.set(docId, [scoreContrib, 1 << i]);
+          } else {
+            const [prevScore, prevMask] = resultMap.get(docId);
+            const adjacencyBonus = countSetBits(prevMask >> 1 & 1 << i) * 2;
+            resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | 1 << i]);
+          }
+        }
+      }
+    }
+  }
+  const results = Array.from(resultMap.entries()).map(([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
+  return results;
+}
+function countSetBits(n) {
+  let count = 0;
+  while (n) {
+    count += n & 1;
+    n >>= 1;
+  }
+  return count;
+}
+async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
+  const startTime = performance.now();
+  const { term, properties, tokenCache } = params;
+  if (!term || typeof term !== "string") {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const textProperty = properties && properties[0] || pluginState.config.textProperty;
+  const searchProperties = properties || [textProperty];
+  const queryTokens = tokenize(term);
+  if (queryTokens.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
+  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
+  const qpsStartTime = performance.now();
+  const tokenizer = orama.tokenizer;
+  const qpsCandidates = searchQPS(
+    term,
+    qpsIndex,
+    tokenizer,
+    searchProperties,
+    config,
+    language
+  );
+  const qpsTime = performance.now() - qpsStartTime;
+  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
+  if (qpsCandidates.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
+  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
+  const bestScore = qpsCandidates[0][1];
+  const minScore = bestScore * minScoreRatio;
+  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
+  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
+  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));
+  let vocabulary = pluginState.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = pluginState.vocabulary;
+        console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
+      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+    }
+  }
+  let docs = {};
+  if (orama.data?.docs?.docs) {
+    docs = orama.data.docs.docs;
+  }
+  const vocabStartTime = performance.now();
+  const reducedVocabulary = buildReducedVocabularyFromDocs(candidateDocIds, docs);
+  const vocabTime = performance.now() - vocabStartTime;
+  console.log(`\u{1F4DA} Reduced vocabulary: ${reducedVocabulary.size} words (full: ${vocabulary.size}, reduction: ${(100 * (1 - reducedVocabulary.size / vocabulary.size)).toFixed(1)}%, built in ${vocabTime.toFixed(2)}ms)`);
+  const candidatesMap = findAllCandidates(
+    queryTokens,
+    reducedVocabulary,
+    tolerance,
+    pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
+    pluginState.config.synonymMatchScore
+  );
+  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
+  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
+  const phraseStartTime = performance.now();
+  const candidateLookup = buildCandidateLookup(filteredFuzzyCandidates);
+  const queryTokenCounts = buildQueryTokenCounts(queryTokens);
+  const documentMatches = [];
+  let docsScored = 0;
+  for (const [docId, doc] of Object.entries(docs)) {
+    if (!candidateDocIds.has(docId)) {
+      continue;
+    }
+    docsScored++;
+    const text = doc[textProperty];
+    if (!text || typeof text !== "string") {
+      continue;
+    }
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      docTokens = tokenCache.get(docId);
+    } else {
+      docTokens = text.split(/\s+/).filter((token) => token.length > 0);
+    }
+    const phrases = findPhrasesInDocument(
+      docTokens,
+      filteredFuzzyCandidates,
+      {
+        weights: pluginState.config.weights,
+        maxGap: pluginState.config.maxGap,
+        proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
+        tolerance
+      },
+      pluginState.documentFrequency,
+      pluginState.totalDocuments,
+      queryTokens,
+      candidateLookup,
+      // PHASE 1 OPTIMIZATION A: Pre-built candidate lookup
+      queryTokenCounts
+      // PHASE 1 OPTIMIZATION B: Pre-built query token counts
+    );
+    if (phrases.length > 0) {
+      const docScore = Math.max(...phrases.map((p) => p.score));
+      documentMatches.push({
+        id: docId,
+        phrases,
+        score: docScore,
+        document: doc
+      });
+    }
+  }
+  const phraseTime = performance.now() - phraseStartTime;
+  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
+  documentMatches.sort((a, b) => b.score - a.score);
+  let finalMatches = documentMatches;
+  if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
+    const threshold = pluginState.config.finalScoreMinimum;
+    const beforeCount = finalMatches.length;
+    finalMatches = finalMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? finalMatches.length;
+  const limitedMatches = finalMatches.slice(0, limit);
+  const hits = limitedMatches.map((match) => ({
+    id: match.id,
+    score: match.score,
+    document: match.document,
+    _phrases: match.phrases
+  }));
+  const elapsed = performance.now() - startTime;
+  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
+  return {
+    elapsed: {
+      formatted: `${elapsed.toFixed(2)}ms`,
+      raw: Math.floor(elapsed * 1e6),
+      qpsTime,
+      phraseTime
+    },
+    hits,
+    count: hits.length
+  };
+}
+function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
+  return async (params, language = "french") => {
+    return searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
+  };
+}
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "content",
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
@@ -395,6 +695,8 @@ var DEFAULT_CONFIG = {
   },
   maxGap: 5,
   minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
   proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
@@ -416,6 +718,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
     },
     maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
     minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+    enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+    finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
     proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
   };
   const plugin = {
@@ -429,7 +733,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
         synonymMap: {},
         config,
         documentFrequency: /* @__PURE__ */ new Map(),
-        totalDocuments: 0
+        totalDocuments: 0,
+        vocabulary: /* @__PURE__ */ new Set()
       };
       if (config.enableSynonyms && config.supabase) {
         try {
@@ -446,6 +751,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
         state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
         console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
       }
+      try {
+        const indexData = orama.data?.index;
+        let radixNode = null;
+        if (indexData?.indexes?.[config.textProperty]?.node) {
+          radixNode = indexData.indexes[config.textProperty].node;
+        } else if (indexData?.[config.textProperty]?.node) {
+          radixNode = indexData[config.textProperty].node;
+        }
+        if (radixNode) {
+          state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+          console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+        } else {
+          console.warn("\u26A0\uFE0F  Could not find radix tree for vocabulary caching");
+        }
+      } catch (error) {
+        console.error("\u26A0\uFE0F  Failed to cache vocabulary:", error);
+      }
       pluginStates.set(orama, state);
       console.log("\u2705 Fuzzy Phrase Plugin initialized");
       setImmediate(() => {
@@ -467,43 +789,43 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     console.error("\u274C Plugin state not initialized");
     throw new Error("Fuzzy Phrase Plugin not properly initialized");
   }
-  const { term, properties } = params;
+  const { term, properties, tokenCache, candidateIds } = params;
+  const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
   if (!term || typeof term !== "string") {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
   const textProperty = properties && properties[0] || state.config.textProperty;
-  const queryTokens = tokenize(term);
+  const queryTokens = tokenize2(term);
   if (queryTokens.length === 0) {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
   const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
   console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
-  let vocabulary;
-  try {
-    const indexData = orama.data?.index;
-    if (!indexData) {
-      console.error("\u274C No index data found in orama.data.index");
-      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
-    }
-    console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
-    let radixNode = null;
-    if (indexData.indexes?.[textProperty]?.node) {
-      radixNode = indexData.indexes[textProperty].node;
-      console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
-    } else if (indexData[textProperty]?.node) {
-      radixNode = indexData[textProperty].node;
-      console.log("\u2705 Found radix via standard path (data.index[property])");
-    }
-    if (!radixNode) {
-      console.error("\u274C Radix tree not found for property:", textProperty);
-      console.error("   Available properties in index:", Object.keys(indexData));
+  let vocabulary = state.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = state.vocabulary;
+        console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
       return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
     }
-    vocabulary = extractVocabularyFromRadixTree(radixNode);
-    console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
-  } catch (error) {
-    console.error("\u274C Failed to extract vocabulary:", error);
-    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  } else {
+    console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
   }
   const candidatesMap = findAllCandidates(
     queryTokens,
@@ -512,11 +834,10 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     state.config.enableSynonyms ? state.synonymMap : void 0,
     state.config.synonymMatchScore
   );
-  const filteredCandidates = filterCandidatesByScore(
-    candidatesMap,
-    state.config.minScore
-  );
+  const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
+  const candidateLookup = buildCandidateLookup(filteredCandidates);
+  const queryTokenCounts = buildQueryTokenCounts(queryTokens);
   const documentMatches = [];
   console.log("\u{1F50D} DEBUG orama.data structure:", {
     dataKeys: Object.keys(orama.data || {}),
@@ -542,23 +863,42 @@ async function searchWithFuzzyPhrase(orama, params, language) {
       dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
     });
   }
-  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
+  const cacheHits = tokenCache ? tokenCache.size : 0;
+  const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
+  console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
   for (const [docId, doc] of Object.entries(docs)) {
+    if (candidateIdSet) {
+      const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
+      if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
+        continue;
+      }
+    }
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
       continue;
     }
-    const docTokens = tokenize(text);
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      docTokens = tokenCache.get(docId);
+    } else {
+      docTokens = tokenize2(text);
+    }
     const phrases = findPhrasesInDocument(
       docTokens,
       filteredCandidates,
       {
         weights: state.config.weights,
         maxGap: state.config.maxGap,
-        proximitySpanMultiplier: state.config.proximitySpanMultiplier
+        proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+        tolerance
       },
       state.documentFrequency,
-      state.totalDocuments
+      state.totalDocuments,
+      queryTokens,
+      candidateLookup,
+      // PHASE 1 OPTIMIZATION A: Pre-built candidate lookup
+      queryTokenCounts
+      // PHASE 1 OPTIMIZATION B: Pre-built query token counts
     );
     if (phrases.length > 0) {
       const docScore = Math.max(...phrases.map((p) => p.score));
@@ -571,8 +911,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     }
   }
   documentMatches.sort((a, b) => b.score - a.score);
-  const limit = params.limit ?? documentMatches.length;
-  const limitedMatches = documentMatches.slice(0, limit);
+  let filteredMatches = documentMatches;
+  if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+    const threshold = state.config.finalScoreMinimum;
+    const beforeCount = filteredMatches.length;
+    filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? filteredMatches.length;
+  const limitedMatches = filteredMatches.slice(0, limit);
   const hits = limitedMatches.map((match) => ({
     id: match.id,
     score: match.score,
@@ -623,21 +970,29 @@ function calculateDocumentFrequencies(docs, textProperty) {
     if (!text || typeof text !== "string") {
       continue;
     }
-    const words = new Set(tokenize(text));
+    const words = new Set(tokenize2(text));
     for (const word of words) {
       df.set(word, (df.get(word) || 0) + 1);
     }
   }
   return df;
 }
-function normalizeText(text) {
+function normalizeText2(text) {
   return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
 }
-function tokenize(text) {
-  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+function tokenize2(text) {
+  return normalizeText2(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function getPluginState(orama) {
+  return pluginStates.get(orama);
 }
+exports.createOptimizedSearch = createOptimizedSearch;
+exports.getPluginState = getPluginState;
+exports.normalizeTextOptimized = normalizeText;
 exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
 exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
+exports.searchWithQPSPruning = searchWithQPSPruning;
+exports.tokenizeOptimized = tokenize;
 //# sourceMappingURL=out.js.map
 //# sourceMappingURL=index.cjs.map