npm - @wcs-colab/plugin-fuzzy-phrase - Versions diffs - 3.1.16-custom.8 → 3.1.16-custom.newbase.1 - Mend

@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.8 → 3.1.16-custom.newbase.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/dist/index.d.cts CHANGED Viewed

@@ -1,108 +1,62 @@
-import { SearchableValue, OramaPlugin, AnyOrama, Results, TypedDocument } from '@wcs-colab/orama';
+import { OramaPlugin, AnyOrama, Results, TypedDocument } from '@wcs-colab/orama';
 /**
- * TypeScript type definitions for Fuzzy Phrase Plugin
- */
-/**
- * Configuration for the Fuzzy Phrase Plugin
+ * Shared types for the fuzzy phrase plugin.
  */
 interface FuzzyPhraseConfig {
-    /**
-     * Text property to search in
-     * @default 'content'
-     */
+    /** Text property to search in (defaults to `content`) */
     textProperty?: string;
-    /**
-     * Base fuzzy matching tolerance (edit distance)
-     * @default 1
-     */
+    /** Base fuzzy matching tolerance (edit distance) */
     tolerance?: number;
-    /**
-     * Enable adaptive tolerance (scales with query length)
-     * @default true
-     */
+    /** Enable adaptive tolerance that scales with query length */
     adaptiveTolerance?: boolean;
-    /**
-     * Enable synonym expansion
-     * @default false
-     */
+    /** Enable synonym expansion using Supabase-backed synonym map */
     enableSynonyms?: boolean;
-    /**
-     * Supabase configuration for loading synonyms
-     */
+    /** Supabase configuration for loading synonyms */
     supabase?: {
         url: string;
         serviceKey: string;
     };
-    /**
-     * Scoring weight for synonym matches (0-1)
-     * @default 0.8
-     */
+    /** Scoring weight for synonym matches (0-1, default ~0.8) */
     synonymMatchScore?: number;
-    /**
-     * Scoring weights for different components
-     */
+    /** Scoring weights for different components */
     weights?: {
-        /** Weight for exact matches */
         exact?: number;
-        /** Weight for fuzzy matches */
         fuzzy?: number;
-        /** Weight for phrase order */
         order?: number;
-        /** Weight for proximity bonus */
         proximity?: number;
-        /** Weight for density bonus */
         density?: number;
-        /** Weight for TF-IDF semantic score */
         semantic?: number;
     };
-    /**
-     * Maximum gap between words in a phrase
-     * @default 5
-     */
+    /** Maximum gap between words in a phrase */
     maxGap?: number;
-    /**
-     * Minimum phrase score to include in results
-     * @default 0.1
-     */
+    /** Minimum phrase score to include in results */
     minScore?: number;
 }
-/**
- * Match information for a single word
- */
+type SynonymMap = Record<string, string[]>;
+interface Candidate {
+    word: string;
+    type: 'exact' | 'fuzzy' | 'synonym';
+    queryToken: string;
+    distance: number;
+    score: number;
+}
 interface WordMatch {
-    /** The matched word from the document */
     word: string;
-    /** The query token that matched */
     queryToken: string;
-    /** Position of the word in the document */
     position: number;
-    /** Type of match */
     type: 'exact' | 'fuzzy' | 'synonym';
-    /** Edit distance for fuzzy matches */
-    distance?: number;
-    /** Match score (0-1) */
+    distance: number;
     score: number;
 }
-/**
- * Phrase match information
- */
 interface PhraseMatch {
-    /** All word matches in this phrase */
     words: WordMatch[];
-    /** Start position in document */
     startPosition: number;
-    /** End position in document */
     endPosition: number;
-    /** Gap between words */
     gap: number;
-    /** Whether words are in correct order */
     inOrder: boolean;
-    /** Overall phrase score */
     score: number;
-    /** Score breakdown by component */
-    scoreBreakdown?: {
+    scoreBreakdown: {
         base: number;
         order: number;
         proximity: number;
@@ -110,34 +64,11 @@ interface PhraseMatch {
         semantic: number;
     };
 }
-/**
- * Document match with all phrase matches
- */
 interface DocumentMatch {
-    /** Document ID */
     id: string;
-    /** All phrase matches found in this document */
     phrases: PhraseMatch[];
-    /** Overall document score */
-    score: number;
-    /** Document data */
-    document: Record<string, SearchableValue>;
-}
-/**
- * Synonym map structure
- */
-interface SynonymMap {
-    [word: string]: string[];
-}
-/**
- * Candidate word for matching
- */
-interface Candidate {
-    word: string;
-    type: 'exact' | 'fuzzy' | 'synonym';
-    queryToken: string;
-    distance?: number;
     score: number;
+    document: any;
 }
 /**

package/dist/index.d.ts CHANGED Viewed

@@ -1,108 +1,62 @@
-import { SearchableValue, OramaPlugin, AnyOrama, Results, TypedDocument } from '@wcs-colab/orama';
+import { OramaPlugin, AnyOrama, Results, TypedDocument } from '@wcs-colab/orama';
 /**
- * TypeScript type definitions for Fuzzy Phrase Plugin
- */
-/**
- * Configuration for the Fuzzy Phrase Plugin
+ * Shared types for the fuzzy phrase plugin.
  */
 interface FuzzyPhraseConfig {
-    /**
-     * Text property to search in
-     * @default 'content'
-     */
+    /** Text property to search in (defaults to `content`) */
     textProperty?: string;
-    /**
-     * Base fuzzy matching tolerance (edit distance)
-     * @default 1
-     */
+    /** Base fuzzy matching tolerance (edit distance) */
     tolerance?: number;
-    /**
-     * Enable adaptive tolerance (scales with query length)
-     * @default true
-     */
+    /** Enable adaptive tolerance that scales with query length */
     adaptiveTolerance?: boolean;
-    /**
-     * Enable synonym expansion
-     * @default false
-     */
+    /** Enable synonym expansion using Supabase-backed synonym map */
     enableSynonyms?: boolean;
-    /**
-     * Supabase configuration for loading synonyms
-     */
+    /** Supabase configuration for loading synonyms */
     supabase?: {
         url: string;
         serviceKey: string;
     };
-    /**
-     * Scoring weight for synonym matches (0-1)
-     * @default 0.8
-     */
+    /** Scoring weight for synonym matches (0-1, default ~0.8) */
     synonymMatchScore?: number;
-    /**
-     * Scoring weights for different components
-     */
+    /** Scoring weights for different components */
     weights?: {
-        /** Weight for exact matches */
         exact?: number;
-        /** Weight for fuzzy matches */
         fuzzy?: number;
-        /** Weight for phrase order */
         order?: number;
-        /** Weight for proximity bonus */
         proximity?: number;
-        /** Weight for density bonus */
         density?: number;
-        /** Weight for TF-IDF semantic score */
         semantic?: number;
     };
-    /**
-     * Maximum gap between words in a phrase
-     * @default 5
-     */
+    /** Maximum gap between words in a phrase */
     maxGap?: number;
-    /**
-     * Minimum phrase score to include in results
-     * @default 0.1
-     */
+    /** Minimum phrase score to include in results */
     minScore?: number;
 }
-/**
- * Match information for a single word
- */
+type SynonymMap = Record<string, string[]>;
+interface Candidate {
+    word: string;
+    type: 'exact' | 'fuzzy' | 'synonym';
+    queryToken: string;
+    distance: number;
+    score: number;
+}
 interface WordMatch {
-    /** The matched word from the document */
     word: string;
-    /** The query token that matched */
     queryToken: string;
-    /** Position of the word in the document */
     position: number;
-    /** Type of match */
     type: 'exact' | 'fuzzy' | 'synonym';
-    /** Edit distance for fuzzy matches */
-    distance?: number;
-    /** Match score (0-1) */
+    distance: number;
     score: number;
 }
-/**
- * Phrase match information
- */
 interface PhraseMatch {
-    /** All word matches in this phrase */
     words: WordMatch[];
-    /** Start position in document */
     startPosition: number;
-    /** End position in document */
     endPosition: number;
-    /** Gap between words */
     gap: number;
-    /** Whether words are in correct order */
     inOrder: boolean;
-    /** Overall phrase score */
     score: number;
-    /** Score breakdown by component */
-    scoreBreakdown?: {
+    scoreBreakdown: {
         base: number;
         order: number;
         proximity: number;
@@ -110,34 +64,11 @@ interface PhraseMatch {
         semantic: number;
     };
 }
-/**
- * Document match with all phrase matches
- */
 interface DocumentMatch {
-    /** Document ID */
     id: string;
-    /** All phrase matches found in this document */
     phrases: PhraseMatch[];
-    /** Overall document score */
-    score: number;
-    /** Document data */
-    document: Record<string, SearchableValue>;
-}
-/**
- * Synonym map structure
- */
-interface SynonymMap {
-    [word: string]: string[];
-}
-/**
- * Candidate word for matching
- */
-interface Candidate {
-    word: string;
-    type: 'exact' | 'fuzzy' | 'synonym';
-    queryToken: string;
-    distance?: number;
     score: number;
+    document: any;
 }
 /**

package/dist/index.js CHANGED Viewed

@@ -88,7 +88,15 @@ function extractVocabularyFromRadixTree(radixNode) {
     }
     nodesVisited++;
     if (nodesVisited <= 3) {
-      console.log(`\u{1F50D} Node ${nodesVisited}: w="${node.w}", e=${node.e}, has_c=${!!node.c}, c_is_array=${Array.isArray(node.c)}, c_length=${node.c?.length || 0}`);
+      const cInfo = node.c ? {
+        isArray: Array.isArray(node.c),
+        isMap: node.c instanceof Map,
+        type: typeof node.c,
+        constructor: node.c.constructor?.name,
+        keys: node.c instanceof Map ? Array.from(node.c.keys()).slice(0, 3) : Object.keys(node.c).slice(0, 3),
+        valuesCount: node.c instanceof Map ? node.c.size : Array.isArray(node.c) ? node.c.length : Object.keys(node.c).length
+      } : "null";
+      console.log(`\u{1F50D} Node ${nodesVisited}:`, { w: node.w, e: node.e, has_c: !!node.c, c_info: cInfo });
     }
     if (node.e && node.w && typeof node.w === "string" && node.w.length > 0) {
       vocabulary.add(node.w);
@@ -98,7 +106,11 @@ function extractVocabularyFromRadixTree(radixNode) {
       }
     }
     if (node.c) {
-      if (Array.isArray(node.c)) {
+      if (node.c instanceof Map) {
+        for (const [_key, childNode] of node.c) {
+          traverse(childNode, depth + 1);
+        }
+      } else if (Array.isArray(node.c)) {
         for (const [_key, childNode] of node.c) {
           traverse(childNode, depth + 1);
         }
@@ -213,7 +225,9 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       queryTokens,
       config,
       documentFrequency,
-      totalDocuments
+      totalDocuments,
+      wordMatches
+      // Pass all word matches for density calculation
     );
     if (phrase && phrase.words.length > 0) {
       phrases.push(phrase);
@@ -221,7 +235,7 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
   }
   return deduplicatePhrases(phrases);
 }
-function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments) {
+function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
   const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
@@ -240,12 +254,13 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
     }
   }
   if (phraseWords.length > 0) {
-    const score = calculatePhraseScore(
+    const { score, breakdown } = calculatePhraseScore(
       phraseWords,
       queryTokens,
       config,
       documentFrequency,
-      totalDocuments
+      totalDocuments,
+      allWordMatches
     );
     return {
       words: phraseWords,
@@ -253,12 +268,13 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
       endPosition: phraseWords[phraseWords.length - 1].position,
       gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
       inOrder: isInOrder(phraseWords, queryTokens),
-      score
+      score,
+      scoreBreakdown: breakdown
     };
   }
   return null;
 }
-function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments) {
+function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
   let baseScore = 0;
   for (const word of phraseWords) {
     const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
@@ -269,16 +285,42 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   const orderScore = inOrder ? 1 : 0.5;
   const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
   const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
-  const densityScore = phraseWords.length / queryTokens.length;
+  let densityScore = 0;
+  if (queryTokens.length === 1) {
+    const totalOccurrences = allWordMatches.length;
+    densityScore = totalOccurrences / queryTokens.length;
+  } else {
+    densityScore = phraseWords.length / queryTokens.length;
+  }
   const semanticScore = calculateSemanticScore(
     phraseWords,
     documentFrequency,
     totalDocuments
   );
   const weights = config.weights;
-  const totalScore = baseScore + orderScore * weights.order + proximityScore * weights.proximity + densityScore * weights.density + semanticScore * weights.semantic;
+  const weightedBase = baseScore;
+  const weightedOrder = orderScore * weights.order;
+  const weightedProximity = proximityScore * weights.proximity;
+  const weightedDensity = densityScore * weights.density;
+  const weightedSemantic = semanticScore * weights.semantic;
+  const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
   const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
-  return Math.min(1, totalScore / maxPossibleScore);
+  const score = totalScore / maxPossibleScore;
+  const base = weightedBase / maxPossibleScore;
+  const order = weightedOrder / maxPossibleScore;
+  const proximity = weightedProximity / maxPossibleScore;
+  const density = weightedDensity / maxPossibleScore;
+  const semantic = weightedSemantic / maxPossibleScore;
+  return {
+    score,
+    breakdown: {
+      base,
+      order,
+      proximity,
+      density,
+      semantic
+    }
+  };
 }
 function isInOrder(phraseWords, queryTokens) {
   const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
@@ -292,6 +334,9 @@ function isInOrder(phraseWords, queryTokens) {
   return true;
 }
 function calculateSemanticScore(phraseWords, documentFrequency, totalDocuments) {
+  if (totalDocuments === 0) {
+    return 0;
+  }
   let tfidfSum = 0;
   for (const word of phraseWords) {
     const df = documentFrequency.get(word.word) || 1;
@@ -386,14 +431,22 @@ function pluginFuzzyPhrase(userConfig = {}) {
           console.error("\u26A0\uFE0F  Failed to load synonyms:", error);
         }
       }
-      if (orama.data && typeof orama.data === "object") {
-        const docs = orama.data.docs || {};
+      const docs = orama.data?.docs?.docs;
+      if (docs) {
         state.totalDocuments = Object.keys(docs).length;
         state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
         console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
       }
       pluginStates.set(orama, state);
       console.log("\u2705 Fuzzy Phrase Plugin initialized");
+      setImmediate(() => {
+        if (typeof globalThis.fuzzyPhrasePluginReady === "function") {
+          console.log("\u{1F4E1} Signaling plugin ready...");
+          globalThis.fuzzyPhrasePluginReady();
+        } else {
+          console.warn("\u26A0\uFE0F  fuzzyPhrasePluginReady callback not found");
+        }
+      });
     }
   };
   return plugin;
@@ -456,7 +509,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
   );
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
   const documentMatches = [];
-  const docs = orama.data?.docs || {};
+  console.log("\u{1F50D} DEBUG orama.data structure:", {
+    dataKeys: Object.keys(orama.data || {}),
+    hasDocs: !!orama.data?.docs,
+    docsType: orama.data?.docs ? typeof orama.data.docs : "undefined"
+  });
+  let docs = {};
+  if (orama.data?.docs?.docs) {
+    docs = orama.data.docs.docs;
+    console.log("\u2705 Found docs at orama.data.docs.docs");
+  } else if (orama.data?.docs && typeof orama.data.docs === "object") {
+    const firstKey = Object.keys(orama.data.docs)[0];
+    if (firstKey && firstKey !== "sharedInternalDocumentStore" && firstKey !== "count") {
+      docs = orama.data.docs;
+      console.log("\u2705 Found docs at orama.data.docs (direct)");
+    }
+  }
+  if (Object.keys(docs).length === 0) {
+    console.log("\u274C Could not find documents - available structure:", {
+      hasDataDocs: !!orama.data?.docs,
+      dataDocsKeys: orama.data?.docs ? Object.keys(orama.data.docs) : "none",
+      hasDataDocsDocs: !!orama.data?.docs?.docs,
+      dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
+    });
+  }
+  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
   for (const [docId, doc] of Object.entries(docs)) {
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
@@ -484,7 +561,9 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     }
   }
   documentMatches.sort((a, b) => b.score - a.score);
-  const hits = documentMatches.map((match) => ({
+  const limit = params.limit ?? documentMatches.length;
+  const limitedMatches = documentMatches.slice(0, limit);
+  const hits = limitedMatches.map((match) => ({
     id: match.id,
     score: match.score,
     document: match.document,
@@ -492,7 +571,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     _phrases: match.phrases
   }));
   const elapsed = performance.now() - startTime;
-  console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms`);
+  console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms (limit: ${limit})`);
   return {
     elapsed: {
       formatted: `${elapsed.toFixed(2)}ms`,
@@ -505,15 +584,25 @@ async function searchWithFuzzyPhrase(orama, params, language) {
 }
 async function loadSynonymsFromSupabase(supabaseConfig) {
   try {
+    console.log("\u{1F50D} DEBUG: Calling Supabase RPC get_synonym_map...");
     const { createClient } = await import('@supabase/supabase-js');
     const supabase = createClient(supabaseConfig.url, supabaseConfig.serviceKey);
     const { data, error } = await supabase.rpc("get_synonym_map");
+    console.log("\u{1F50D} DEBUG: Supabase RPC response:", {
+      hasError: !!error,
+      errorMessage: error?.message,
+      hasData: !!data,
+      dataType: typeof data,
+      dataKeys: data ? Object.keys(data).length : 0
+    });
     if (error) {
       throw new Error(`Supabase error: ${error.message}`);
     }
-    return data || {};
+    const synonymMap = data || {};
+    console.log(`\u{1F4DA} Loaded ${Object.keys(synonymMap).length} synonym entries from Supabase`);
+    return synonymMap;
   } catch (error) {
-    console.error("Failed to load synonyms from Supabase:", error);
+    console.error("\u274C Failed to load synonyms from Supabase:", error);
     throw error;
   }
 }
@@ -531,8 +620,11 @@ function calculateDocumentFrequencies(docs, textProperty) {
   }
   return df;
 }
+function normalizeText(text) {
+  return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
+}
 function tokenize(text) {
-  return text.toLowerCase().split(/\s+/).filter((token) => token.length > 0);
+  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
 }
 export { pluginFuzzyPhrase, searchWithFuzzyPhrase };