@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.25 → 3.1.16-custom.newbase.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -417,6 +417,232 @@ function deduplicatePhrases(phrases) {
417
417
  return result.sort((a, b) => b.score - a.score);
418
418
  }
419
419
 
420
// src/optimized.ts

/**
 * Default tuning knobs for the QPS-pruned "optimized" search path.
 * Any of these can be overridden per call via the `config` argument
 * of searchWithQPSPruning / createOptimizedSearch.
 */
var DEFAULT_OPTIMIZED_CONFIG = {
  // Cap the number of QPS candidates forwarded to phrase scoring.
  maxQPSCandidates: 100,
  // Keep only candidates scoring at least 10% of the best QPS score.
  minQPSScore: 0.1,
  // Fuzzy (tolerant) radix matching unless explicitly set to exact.
  qpsExact: false,
  // Allow one edit of distance when fuzzy matching tokens.
  qpsTolerance: 1
};
431
/**
 * Normalize raw text for matching: lowercase, strip diacritics, drop
 * French elisions (l', d', c', ...), unify quote characters, turn
 * punctuation into spaces and collapse whitespace runs.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Normalized, trimmed text.
 */
function normalizeText(text) {
  const lowered = text.toLowerCase().normalize("NFD");
  const stripped = lowered
    // remove combining diacritical marks left by NFD decomposition
    .replace(/[\u0300-\u036f]/g, "")
    // French elision: a single consonant + apostrophe before a word becomes a space
    .replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ")
    // drop any remaining apostrophe-like characters
    .replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "")
    // curly double quotes -> straight quote
    .replace(/[\u201c\u201d]/g, '"')
    // punctuation and dashes become token separators
    .replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ");
  return stripped.replace(/\s+/g, " ").trim();
}
434
/**
 * Split normalized text into non-empty whitespace-delimited tokens.
 *
 * @param {string} text - Raw input text.
 * @returns {string[]} Normalized tokens (possibly empty array).
 */
function tokenize(text) {
  const tokens = [];
  for (const piece of normalizeText(text).split(/\s+/)) {
    if (piece.length > 0) {
      tokens.push(piece);
    }
  }
  return tokens;
}
437
/**
 * Cheap first-pass candidate search over the QPS radix indexes.
 *
 * For each query token, looks up (exactly or fuzzily) matching words in the
 * radix tree of every requested property and accumulates a per-document
 * score plus a bitmask of which query-token positions matched. Adjacent
 * matched positions earn a small phrase-proximity bonus.
 *
 * @param {string} term - Raw search term.
 * @param {object} qpsIndex - QPS index: `{ indexes: {prop: {type, node}}, stats: {prop: {tokensLength, tokenQuantums}} }`.
 * @param {object} tokenizer - Orama tokenizer exposing `tokenize(term, language)`.
 * @param {string[]} properties - Properties whose indexes are searched.
 * @param {object} config - Overrides for DEFAULT_OPTIMIZED_CONFIG (qpsExact, qpsTolerance, qpsBoostPerProp).
 * @param {string} language - Language passed through to the tokenizer.
 * @returns {Array<[string, number]>} `[docId, score]` pairs sorted by descending score.
 */
function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
  const tokens = tokenizer.tokenize(term, language);
  if (tokens.length === 0) {
    return [];
  }
  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
  const boostPerProp = config.qpsBoostPerProp ?? {};
  // docId -> [accumulated score, bitmask of matched query-token positions]
  const resultMap = /* @__PURE__ */ new Map();
  for (const prop of properties) {
    const indexEntry = qpsIndex.indexes[prop];
    if (!indexEntry || indexEntry.type !== "Radix") {
      continue;
    }
    const radixNode = indexEntry.node;
    const stats = qpsIndex.stats[prop];
    if (!radixNode || !stats) {
      continue;
    }
    const boost = boostPerProp[prop] ?? 1;
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      const matches = radixNode.find({
        term: token,
        exact,
        tolerance: exact ? 0 : tolerance
      });
      for (const [matchedWord, docIds] of Object.entries(matches)) {
        if (!Array.isArray(docIds))
          continue;
        const isExactMatch = matchedWord === token;
        for (const docId of docIds) {
          const tokensLength = stats.tokensLength.get(docId) || 1;
          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
          // occurrence count is packed in the high bits of the quantum (>> 20)
          const occurrences = quantum ? quantum >> 20 : 1;
          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
          if (!resultMap.has(docId)) {
            resultMap.set(docId, [scoreContrib, 1 << i]);
          } else {
            const [prevScore, prevMask] = resultMap.get(docId);
            // FIX: the original used `prevMask >> 1 & 1 << i`, which tests
            // query position i + 1 — a bit that can never be set yet while
            // position i is being processed, so the bonus was always 0.
            // Shifting left tests position i - 1: the document also matched
            // the previous query token, i.e. the two tokens are adjacent.
            const adjacencyBonus = countSetBits(prevMask << 1 & 1 << i) * 2;
            resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | 1 << i]);
          }
        }
      }
    }
  }
  const results = Array.from(resultMap.entries()).map(([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
  return results;
}

/**
 * Count the number of set bits in a 32-bit integer.
 *
 * @param {number} n - Integer bitmask (treated as 32-bit).
 * @returns {number} Population count of `n`.
 */
function countSetBits(n) {
  let count = 0;
  while (n) {
    count += n & 1;
    // FIX: `>>=` sign-extends and never terminates for negative input
    // (possible once bit 31 is used in a mask); `>>>=` shifts in zeros.
    n >>>= 1;
  }
  return count;
}
495
/**
 * Two-stage optimized search: a cheap QPS pass prunes the document set,
 * then full fuzzy phrase scoring runs only over the surviving candidates.
 *
 * Stage 1: searchQPS ranks documents from the radix indexes; results are
 * cut to `maxQPSCandidates` and to scores >= `minQPSScore` * bestScore.
 * Stage 2: findPhrasesInDocument scores phrases in each surviving doc.
 *
 * @param {object} orama - Orama database instance (provides tokenizer, data.index, data.docs).
 * @param {object} qpsIndex - QPS radix index consumed by searchQPS.
 * @param {object} pluginState - Fuzzy-phrase plugin state (config, vocabulary, synonymMap, documentFrequency, totalDocuments).
 * @param {object} params - Search params: `term`, optional `properties`, `tokenCache`, `limit`.
 * @param {object} [config] - Optimized-search overrides (see DEFAULT_OPTIMIZED_CONFIG).
 * @param {string} [language] - Tokenizer language, defaults to "french".
 * @returns {Promise<object>} `{ elapsed, hits, count }`; `elapsed` also carries qpsTime/phraseTime on success.
 */
async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
  const startTime = performance.now();
  const { term, properties, tokenCache } = params;
  // Guard: a missing or non-string term short-circuits with an empty result.
  if (!term || typeof term !== "string") {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const textProperty = properties && properties[0] || pluginState.config.textProperty;
  const searchProperties = properties || [textProperty];
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  // Tolerance may scale with the query (calculateAdaptiveTolerance is defined elsewhere in this bundle).
  const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
  // --- Stage 1: QPS candidate generation ---
  const qpsStartTime = performance.now();
  const tokenizer = orama.tokenizer;
  const qpsCandidates = searchQPS(
    term,
    qpsIndex,
    tokenizer,
    searchProperties,
    config,
    language
  );
  const qpsTime = performance.now() - qpsStartTime;
  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
  if (qpsCandidates.length === 0) {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  // Prune candidates: relative score floor, then hard cap.
  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
  const bestScore = qpsCandidates[0][1];
  const minScore = bestScore * minScoreRatio;
  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));
  // Lazily build the vocabulary from the radix tree on first use.
  let vocabulary = pluginState.vocabulary;
  if (vocabulary.size === 0) {
    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
    try {
      const indexData = orama.data?.index;
      let radixNode = null;
      // Two index layouts are probed here — presumably to cover different
      // Orama versions; confirm against the Orama release in use.
      if (indexData?.indexes?.[textProperty]?.node) {
        radixNode = indexData.indexes[textProperty].node;
      } else if (indexData?.[textProperty]?.node) {
        radixNode = indexData[textProperty].node;
      }
      if (radixNode) {
        pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
        vocabulary = pluginState.vocabulary;
        console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
      } else {
        console.error("\u274C Radix tree not found for vocabulary extraction");
        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
      }
    } catch (error) {
      console.error("\u274C Failed to extract vocabulary:", error);
      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
    }
  }
  // Fuzzy expansion of the query tokens against the vocabulary (plus synonyms when enabled).
  const candidatesMap = findAllCandidates(
    queryTokens,
    vocabulary,
    tolerance,
    pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
    pluginState.config.synonymMatchScore
  );
  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
  // --- Stage 2: phrase scoring restricted to the QPS candidate set ---
  const phraseStartTime = performance.now();
  const documentMatches = [];
  let docs = {};
  if (orama.data?.docs?.docs) {
    docs = orama.data.docs.docs;
  }
  let docsScored = 0;
  for (const [docId, doc] of Object.entries(docs)) {
    // Skip any document the QPS pass did not nominate.
    if (!candidateDocIds.has(docId)) {
      continue;
    }
    docsScored++;
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    // Reuse pre-tokenized documents from the caller-supplied cache when available.
    let docTokens;
    if (tokenCache && tokenCache.has(docId)) {
      docTokens = tokenCache.get(docId);
    } else {
      docTokens = tokenize(text);
    }
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredFuzzyCandidates,
      {
        weights: pluginState.config.weights,
        maxGap: pluginState.config.maxGap,
        proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
        tolerance
      },
      pluginState.documentFrequency,
      pluginState.totalDocuments,
      queryTokens
    );
    if (phrases.length > 0) {
      // Document score = best phrase score found within it.
      const docScore = Math.max(...phrases.map((p) => p.score));
      documentMatches.push({
        id: docId,
        phrases,
        score: docScore,
        document: doc
      });
    }
  }
  const phraseTime = performance.now() - phraseStartTime;
  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
  documentMatches.sort((a, b) => b.score - a.score);
  // Optional absolute score floor applied after sorting.
  let finalMatches = documentMatches;
  if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
    const threshold = pluginState.config.finalScoreMinimum;
    const beforeCount = finalMatches.length;
    finalMatches = finalMatches.filter((m) => m.score >= threshold);
    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
  }
  const limit = params.limit ?? finalMatches.length;
  const limitedMatches = finalMatches.slice(0, limit);
  const hits = limitedMatches.map((match) => ({
    id: match.id,
    score: match.score,
    document: match.document,
    _phrases: match.phrases
  }));
  const elapsed = performance.now() - startTime;
  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      // raw elapsed is reported in nanoseconds (ms * 1e6), matching Orama's convention
      raw: Math.floor(elapsed * 1e6),
      qpsTime,
      phraseTime
    },
    hits,
    count: hits.length
  };
}
640
/**
 * Build a reusable search closure bound to one Orama instance, its QPS
 * index, the plugin state and a fixed optimized-search configuration.
 *
 * @param {object} orama - Orama database instance.
 * @param {object} qpsIndex - QPS radix index with per-property stats.
 * @param {object} pluginState - Internal fuzzy-phrase plugin state.
 * @param {object} [config] - Optimized-search overrides (see DEFAULT_OPTIMIZED_CONFIG).
 * @returns {(params: object, language?: string) => Promise<object>} Bound search function.
 */
function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
  const boundSearch = async (params, language = "french") =>
    searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
  return boundSearch;
}
645
+
420
646
  // src/index.ts
421
647
  var DEFAULT_CONFIG = {
422
648
  textProperty: "normalized_content",
@@ -530,12 +756,13 @@ async function searchWithFuzzyPhrase(orama, params, language) {
530
756
  console.error("\u274C Plugin state not initialized");
531
757
  throw new Error("Fuzzy Phrase Plugin not properly initialized");
532
758
  }
533
- const { term, properties, tokenCache } = params;
759
+ const { term, properties, tokenCache, candidateIds } = params;
760
+ const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
534
761
  if (!term || typeof term !== "string") {
535
762
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
536
763
  }
537
764
  const textProperty = properties && properties[0] || state.config.textProperty;
538
- const queryTokens = tokenize(term);
765
+ const queryTokens = tokenize2(term);
539
766
  if (queryTokens.length === 0) {
540
767
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
541
768
  }
@@ -602,8 +829,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
602
829
  });
603
830
  }
604
831
  const cacheHits = tokenCache ? tokenCache.size : 0;
605
- console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
832
+ const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
833
+ console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
606
834
  for (const [docId, doc] of Object.entries(docs)) {
835
+ if (candidateIdSet) {
836
+ const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
837
+ if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
838
+ continue;
839
+ }
840
+ }
607
841
  const text = doc[textProperty];
608
842
  if (!text || typeof text !== "string") {
609
843
  continue;
@@ -612,7 +846,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
612
846
  if (tokenCache && tokenCache.has(docId)) {
613
847
  docTokens = tokenCache.get(docId);
614
848
  } else {
615
- docTokens = tokenize(text);
849
+ docTokens = tokenize2(text);
616
850
  }
617
851
  const phrases = findPhrasesInDocument(
618
852
  docTokens,
@@ -698,21 +932,29 @@ function calculateDocumentFrequencies(docs, textProperty) {
698
932
  if (!text || typeof text !== "string") {
699
933
  continue;
700
934
  }
701
- const words = new Set(tokenize(text));
935
+ const words = new Set(tokenize2(text));
702
936
  for (const word of words) {
703
937
  df.set(word, (df.get(word) || 0) + 1);
704
938
  }
705
939
  }
706
940
  return df;
707
941
  }
708
/**
 * Normalize raw text for matching: lowercase, strip diacritics, drop
 * French elisions, unify quotes, replace punctuation with spaces and
 * collapse whitespace. (Same pipeline as the optimized variant.)
 *
 * @param {string} text - Raw input text.
 * @returns {string} Normalized, trimmed text.
 */
function normalizeText2(text) {
  // Ordered replacement pipeline applied after lowercasing + NFD decomposition.
  const steps = [
    [/[\u0300-\u036f]/g, ""], // combining diacritical marks
    [/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " "], // French elisions
    [/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, ""], // leftover apostrophes
    [/[\u201c\u201d]/g, '"'], // curly double quotes -> straight
    [/[.,;:!?()[\]{}\-—–«»""]/g, " "], // punctuation -> separators
    [/\s+/g, " "] // collapse whitespace runs
  ];
  let out = text.toLowerCase().normalize("NFD");
  for (const [pattern, replacement] of steps) {
    out = out.replace(pattern, replacement);
  }
  return out.trim();
}
711
/**
 * Split normalized text into non-empty whitespace-delimited tokens.
 *
 * @param {string} text - Raw input text.
 * @returns {string[]} Normalized tokens (possibly empty array).
 */
function tokenize2(text) {
  const tokens = [];
  for (const piece of normalizeText2(text).split(/\s+/)) {
    if (piece.length > 0) {
      tokens.push(piece);
    }
  }
  return tokens;
}
948
/**
 * Look up the internal plugin state registered for a given Orama instance.
 * `pluginStates` is a module-level map keyed by the Orama instance —
 * defined elsewhere in this bundle (presumably populated by
 * pluginFuzzyPhrase during initialization; verify against that code).
 *
 * @param {object} orama - The Orama database instance used as the key.
 * @returns {object|undefined} The plugin state, or undefined if the plugin
 *   was never initialized for this instance.
 */
function getPluginState(orama) {
  return pluginStates.get(orama);
}
714
951
 
952
+ exports.createOptimizedSearch = createOptimizedSearch;
953
+ exports.getPluginState = getPluginState;
954
+ exports.normalizeTextOptimized = normalizeText;
715
955
  exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
716
956
  exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
957
+ exports.searchWithQPSPruning = searchWithQPSPruning;
958
+ exports.tokenizeOptimized = tokenize;
717
959
  //# sourceMappingURL=out.js.map
718
960
  //# sourceMappingURL=index.cjs.map