@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.9 → 3.1.16-custom.newbase.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE.md ADDED
@@ -0,0 +1,13 @@
+ Copyright 2023 OramaSearch Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
package/README.md CHANGED
@@ -159,6 +159,6 @@ Apache-2.0
 
 ## Version
 
- 3.1.16-custom.1
+ 3.1.16-custom.newbase.1
 
 Compatible with `@wcs-colab/orama@3.1.16-custom.9`
package/dist/index.cjs CHANGED
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
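
Note: removing the `startsWith` shortcut means a prefix match like "testing" against query token "test" no longer gets an automatic 0.95; it must now fit within the Levenshtein tolerance like any other candidate. A minimal sketch of the distance involved, using a plain dynamic-programming edit distance for illustration (the package itself uses `boundedLevenshtein`, which bails out early once the bound is exceeded):

```js
// Classic edit distance; illustrative only, not the package's helper.
function levenshtein(a, b) {
  const dp = Array.from({ length: a.length + 1 }, (_, i) =>
    Array.from({ length: b.length + 1 }, (_, j) => (i === 0 ? j : j === 0 ? i : 0))
  );
  for (let i = 1; i <= a.length; i++) {
    for (let j = 1; j <= b.length; j++) {
      dp[i][j] = Math.min(
        dp[i - 1][j] + 1,                                   // deletion
        dp[i][j - 1] + 1,                                   // insertion
        dp[i - 1][j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1)  // substitution
      );
    }
  }
  return dp[a.length][b.length];
}

levenshtein("testing", "test"); // 3: rejected at tolerance 1, though the old prefix rule matched it
// A candidate within tolerance scores 1 - distance * 0.2, as in the hunk above.
```
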
@@ -82,30 +79,13 @@ function calculateAdaptiveTolerance(queryTokens, baseTolerance) {
 function extractVocabularyFromRadixTree(radixNode) {
   const vocabulary = /* @__PURE__ */ new Set();
   let nodesVisited = 0;
-  let wordsFound = 0;
   function traverse(node, depth = 0) {
     if (!node) {
-      console.log(`\u26A0\uFE0F Null node at depth ${depth}`);
       return;
     }
     nodesVisited++;
-    if (nodesVisited <= 3) {
-      const cInfo = node.c ? {
-        isArray: Array.isArray(node.c),
-        isMap: node.c instanceof Map,
-        type: typeof node.c,
-        constructor: node.c.constructor?.name,
-        keys: node.c instanceof Map ? Array.from(node.c.keys()).slice(0, 3) : Object.keys(node.c).slice(0, 3),
-        valuesCount: node.c instanceof Map ? node.c.size : Array.isArray(node.c) ? node.c.length : Object.keys(node.c).length
-      } : "null";
-      console.log(`\u{1F50D} Node ${nodesVisited}:`, { w: node.w, e: node.e, has_c: !!node.c, c_info: cInfo });
-    }
     if (node.e && node.w && typeof node.w === "string" && node.w.length > 0) {
       vocabulary.add(node.w);
-      wordsFound++;
-      if (wordsFound <= 5) {
-        console.log(`\u2705 Found word ${wordsFound}: "${node.w}"`);
-      }
     }
     if (node.c) {
       if (node.c instanceof Map) {
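
The traversal implies a node shape of `w` (the accumulated word), `e` (end-of-word flag), and `c` (children, either a `Map` or a plain object). A standalone sketch of the same walk with the debug logging gone, over a hand-built tree; the node shape is inferred from this hunk, not taken from Orama's documentation:

```js
// Two-word radix tree in the inferred { w, e, c } shape.
const root = {
  w: "", e: false,
  c: new Map([
    ["test", { w: "test", e: true, c: new Map([["ing", { w: "testing", e: true, c: null }]]) }]
  ])
};

function collectWords(node, out = new Set()) {
  if (!node) return out;
  if (node.e && typeof node.w === "string" && node.w.length > 0) out.add(node.w);
  if (node.c) {
    const children = node.c instanceof Map ? node.c.values() : Object.values(node.c);
    for (const child of children) collectWords(child, out);
  }
  return out;
}

console.log([...collectWords(root)]); // [ 'test', 'testing' ]
```
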
@@ -227,52 +207,85 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       queryTokens,
       config,
       documentFrequency,
-      totalDocuments
+      totalDocuments,
+      wordMatches,
+      documentTokens
+      // Pass document tokens to extract gap words
     );
     if (phrase && phrase.words.length > 0) {
       phrases.push(phrase);
     }
   }
-  return deduplicatePhrases(phrases);
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
-function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments) {
+function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
+  const gapWords = [];
+  let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
-    const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
+    const lastPos = phraseWords[phraseWords.length - 1].position;
+    const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-    if (!coveredTokens.has(match.queryToken)) {
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-      coveredTokens.add(match.queryToken);
-    }
-    if (coveredTokens.size === queryTokens.length) {
-      break;
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
-    const score = calculatePhraseScore(
+    const coverage = phraseWords.length / queryTokens.length;
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const { score, breakdown } = calculatePhraseScore(
       phraseWords,
       queryTokens,
       config,
       documentFrequency,
-      totalDocuments
+      totalDocuments,
+      allWordMatches,
+      coverage
     );
     return {
       words: phraseWords,
+      gapWords,
+      gapUsed: totalGapUsed,
+      coverage,
       startPosition: phraseWords[0].position,
       endPosition: phraseWords[phraseWords.length - 1].position,
-      gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
+      span,
       inOrder: isInOrder(phraseWords, queryTokens),
-      score
+      score,
+      scoreBreakdown: breakdown
     };
   }
   return null;
 }
-function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments) {
+function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
   let baseScore = 0;
   for (const word of phraseWords) {
     const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
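
The core change in `buildPhraseFromPosition`: the old `coveredTokens` `Set` could consume each distinct query token only once, so a query like "never say never" was satisfied by a single "never". The `Map`-based counters treat the query as a multiset. The pattern in isolation:

```js
// Same multiset counting as queryTokenCounts above.
function countTokens(tokens) {
  const counts = new Map();
  for (const t of tokens) counts.set(t, (counts.get(t) || 0) + 1);
  return counts;
}

const need = countTokens(["never", "say", "never"]);
need.get("never"); // 2, so a phrase must supply "never" twice to reach full coverage
// A match is consumed only while matchedCounts.get(token) < queryTokenCounts.get(token).
```
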
@@ -281,18 +294,53 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
-  const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
-  const densityScore = phraseWords.length / queryTokens.length;
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
+  let densityScore = 0;
+  if (queryTokens.length === 1) {
+    const totalOccurrences = allWordMatches.length;
+    densityScore = Math.min(1, totalOccurrences / 10);
+  }
   const semanticScore = calculateSemanticScore(
     phraseWords,
     documentFrequency,
     totalDocuments
   );
   const weights = config.weights;
-  const totalScore = baseScore + orderScore * weights.order + proximityScore * weights.proximity + densityScore * weights.density + semanticScore * weights.semantic;
-  const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
-  return Math.min(1, totalScore / maxPossibleScore);
+  const weightedBase = baseScore;
+  const weightedOrder = orderScore * weights.order;
+  const weightedProximity = proximityScore * weights.proximity;
+  const weightedDensity = densityScore * weights.density;
+  const weightedSemantic = semanticScore * weights.semantic;
+  const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
+  const normalizedScore = totalScore / maxPossibleScore;
+  const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
+  const score = normalizedScore * coverageMultiplier;
+  const base = weightedBase / maxPossibleScore;
+  const order = weightedOrder / maxPossibleScore;
+  const proximity = weightedProximity / maxPossibleScore;
+  const density = weightedDensity / maxPossibleScore;
+  const semantic = weightedSemantic / maxPossibleScore;
+  return {
+    score,
+    breakdown: {
+      base,
+      order,
+      proximity,
+      density,
+      semantic,
+      coverage: coverageMultiplier
+      // Show coverage multiplier in breakdown
+    }
+  };
 }
 function isInOrder(phraseWords, queryTokens) {
   const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
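
The rewritten normalization is worth a worked example. `maxPossibleScore` now includes only components that are attainable under the current config: the base ceiling is `max(exact, fuzzy)` only when fuzzy matches are possible, and proximity counts only for multi-token queries with `maxGap > 0`. The normalized score is then scaled by coverage, so partial matches can no longer outrank complete ones. With illustrative weights (only `semantic: 0.15` is visible in this diff; the other numbers are made up for the arithmetic):

```js
// Illustrative numbers, not the package defaults.
const weights = { exact: 1, fuzzy: 0.8, order: 0.3, proximity: 0.2, density: 0.1, semantic: 0.15 };
const canHaveFuzzyMatches = true; // tolerance > 0 && weights.fuzzy > 0
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
const maxPossibleScore =
  maxBaseWeight + weights.order + weights.proximity + weights.density + weights.semantic; // 1.75

// A phrase with totalScore 1.4 covering 2 of 3 query tokens:
const normalized = 1.4 / maxPossibleScore; // 0.8
const score = normalized * (2 / 3);        // ≈ 0.533; the coverage multiplier does the demotion
```
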
@@ -306,6 +354,9 @@ function isInOrder(phraseWords, queryTokens) {
   return true;
 }
 function calculateSemanticScore(phraseWords, documentFrequency, totalDocuments) {
+  if (totalDocuments === 0) {
+    return 0;
+  }
   let tfidfSum = 0;
   for (const word of phraseWords) {
     const df = documentFrequency.get(word.word) || 1;
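
The new `totalDocuments === 0` guard protects the TF-IDF sum below it: with an empty index, any IDF term derived from `totalDocuments` would be degenerate (zero, or a log of zero, depending on the exact formula, which is not visible in this hunk), so the semantic component is simply zeroed out.
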
@@ -341,7 +392,8 @@ function deduplicatePhrases(phrases) {
 
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "content",
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
@@ -356,7 +408,10 @@ var DEFAULT_CONFIG = {
     semantic: 0.15
   },
   maxGap: 5,
-  minScore: 0.1
+  minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
+  proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
 function pluginFuzzyPhrase(userConfig = {}) {
@@ -376,7 +431,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
       semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
     },
     maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
-    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
+    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+    enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+    finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
+    proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
   };
   const plugin = {
     name: "fuzzy-phrase",
@@ -400,14 +458,22 @@ function pluginFuzzyPhrase(userConfig = {}) {
         console.error("\u26A0\uFE0F Failed to load synonyms:", error);
       }
     }
-    if (orama.data && typeof orama.data === "object") {
-      const docs = orama.data.docs || {};
+    const docs = orama.data?.docs?.docs;
+    if (docs) {
       state.totalDocuments = Object.keys(docs).length;
       state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
       console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
     }
     pluginStates.set(orama, state);
     console.log("\u2705 Fuzzy Phrase Plugin initialized");
+    setImmediate(() => {
+      if (typeof globalThis.fuzzyPhrasePluginReady === "function") {
+        console.log("\u{1F4E1} Signaling plugin ready...");
+        globalThis.fuzzyPhrasePluginReady();
+      } else {
+        console.warn("\u26A0\uFE0F fuzzyPhrasePluginReady callback not found");
+      }
+    });
   }
 };
 return plugin;
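
The `setImmediate` block gives the host process a post-initialization hook (Node-only: `setImmediate` is not a browser API). A host that must not serve queries before the plugin is ready could register the callback before the Orama instance is created (the callback name is fixed by the code above; the rest is illustrative):

```js
// Register before creating the database so onInit can find it.
globalThis.fuzzyPhrasePluginReady = () => {
  console.log("fuzzy-phrase plugin initialized; safe to serve searches");
};
```
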
@@ -464,13 +530,34 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     state.config.enableSynonyms ? state.synonymMap : void 0,
     state.config.synonymMatchScore
   );
-  const filteredCandidates = filterCandidatesByScore(
-    candidatesMap,
-    state.config.minScore
-  );
+  const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
   const documentMatches = [];
-  const docs = orama.data?.docs || {};
+  console.log("\u{1F50D} DEBUG orama.data structure:", {
+    dataKeys: Object.keys(orama.data || {}),
+    hasDocs: !!orama.data?.docs,
+    docsType: orama.data?.docs ? typeof orama.data.docs : "undefined"
+  });
+  let docs = {};
+  if (orama.data?.docs?.docs) {
+    docs = orama.data.docs.docs;
+    console.log("\u2705 Found docs at orama.data.docs.docs");
+  } else if (orama.data?.docs && typeof orama.data.docs === "object") {
+    const firstKey = Object.keys(orama.data.docs)[0];
+    if (firstKey && firstKey !== "sharedInternalDocumentStore" && firstKey !== "count") {
+      docs = orama.data.docs;
+      console.log("\u2705 Found docs at orama.data.docs (direct)");
+    }
+  }
+  if (Object.keys(docs).length === 0) {
+    console.log("\u274C Could not find documents - available structure:", {
+      hasDataDocs: !!orama.data?.docs,
+      dataDocsKeys: orama.data?.docs ? Object.keys(orama.data.docs) : "none",
+      hasDataDocsDocs: !!orama.data?.docs?.docs,
+      dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
+    });
+  }
+  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
   for (const [docId, doc] of Object.entries(docs)) {
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
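
The lookup chain above encodes where this Orama fork keeps its documents: the raw id-to-document map sits at `orama.data.docs.docs`, while the outer `docs` object also carries bookkeeping keys such as `sharedInternalDocumentStore` and `count` (hence the first-key check in the fallback). Reduced to its essentials, without the diagnostics:

```js
// Same resolution order as the hunk above.
function resolveDocs(orama) {
  if (orama.data?.docs?.docs) return orama.data.docs.docs; // nested store
  const outer = orama.data?.docs;
  if (outer && typeof outer === "object") {
    const firstKey = Object.keys(outer)[0];
    if (firstKey && firstKey !== "sharedInternalDocumentStore" && firstKey !== "count") {
      return outer; // flat store
    }
  }
  return {};
}
```
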
@@ -482,7 +569,9 @@ async function searchWithFuzzyPhrase(orama, params, language) {
       filteredCandidates,
       {
         weights: state.config.weights,
-        maxGap: state.config.maxGap
+        maxGap: state.config.maxGap,
+        proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+        tolerance
       },
       state.documentFrequency,
       state.totalDocuments
@@ -498,7 +587,16 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     }
   }
   documentMatches.sort((a, b) => b.score - a.score);
-  const hits = documentMatches.map((match) => ({
+  let filteredMatches = documentMatches;
+  if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+    const threshold = state.config.finalScoreMinimum;
+    const beforeCount = filteredMatches.length;
+    filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? filteredMatches.length;
+  const limitedMatches = filteredMatches.slice(0, limit);
+  const hits = limitedMatches.map((match) => ({
     id: match.id,
     score: match.score,
     document: match.document,
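
The post-ranking pipeline is new in this version: sort, optionally apply the score floor, then honor `params.limit` (previously every match was returned regardless of the requested limit). As a standalone sketch:

```js
// Mirrors the hunk above: sort → optional score floor → limit.
function finalizeMatches(documentMatches, config, limit) {
  const sorted = [...documentMatches].sort((a, b) => b.score - a.score);
  const filtered = config.enableFinalScoreMinimum && config.finalScoreMinimum > 0
    ? sorted.filter((m) => m.score >= config.finalScoreMinimum)
    : sorted;
  return filtered.slice(0, limit ?? filtered.length);
}

finalizeMatches(
  [{ id: "a", score: 0.9 }, { id: "b", score: 0.2 }, { id: "c", score: 0.6 }],
  { enableFinalScoreMinimum: true, finalScoreMinimum: 0.3 },
  1
); // → [{ id: "a", score: 0.9 }]
```
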
@@ -506,7 +604,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     _phrases: match.phrases
   }));
   const elapsed = performance.now() - startTime;
-  console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms`);
+  console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms (limit: ${limit})`);
   return {
     elapsed: {
       formatted: `${elapsed.toFixed(2)}ms`,
@@ -519,15 +617,25 @@ async function searchWithFuzzyPhrase(orama, params, language) {
 }
 async function loadSynonymsFromSupabase(supabaseConfig) {
   try {
+    console.log("\u{1F50D} DEBUG: Calling Supabase RPC get_synonym_map...");
     const { createClient } = await import('@supabase/supabase-js');
     const supabase = createClient(supabaseConfig.url, supabaseConfig.serviceKey);
     const { data, error } = await supabase.rpc("get_synonym_map");
+    console.log("\u{1F50D} DEBUG: Supabase RPC response:", {
+      hasError: !!error,
+      errorMessage: error?.message,
+      hasData: !!data,
+      dataType: typeof data,
+      dataKeys: data ? Object.keys(data).length : 0
+    });
     if (error) {
       throw new Error(`Supabase error: ${error.message}`);
     }
-    return data || {};
+    const synonymMap = data || {};
+    console.log(`\u{1F4DA} Loaded ${Object.keys(synonymMap).length} synonym entries from Supabase`);
+    return synonymMap;
   } catch (error) {
-    console.error("Failed to load synonyms from Supabase:", error);
+    console.error("\u274C Failed to load synonyms from Supabase:", error);
     throw error;
   }
 }
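
The RPC contract itself is not visible in this diff; from the way the result is keyed with `Object.keys` and fed to the synonym map, `get_synonym_map` presumably returns a plain term-keyed object such as `{ "car": ["automobile", "vehicle"] }` (an assumption, not a documented shape). A minimal standalone call using the real `@supabase/supabase-js` API:

```js
// Assumed return shape: { term: [synonym, ...], ... }
async function fetchSynonymMap(url, serviceKey) {
  const { createClient } = await import("@supabase/supabase-js");
  const supabase = createClient(url, serviceKey);
  const { data, error } = await supabase.rpc("get_synonym_map");
  if (error) throw new Error(`Supabase error: ${error.message}`);
  return data ?? {};
}
```
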
@@ -545,8 +653,11 @@ function calculateDocumentFrequencies(docs, textProperty) {
   }
   return df;
 }
+function normalizeText(text) {
+  return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
+}
 function tokenize(text) {
-  return text.toLowerCase().split(/\s+/).filter((token) => token.length > 0);
+  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
 }
 
 exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
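
What the new `normalizeText` buys over the old bare `toLowerCase().split(/\s+/)`: NFD diacritic stripping, elision handling (an apostrophe after l/d/c/j/m/n/s/t followed by a letter becomes a word break, as in French "l'été"), smart-quote normalization, and punctuation-to-space replacement. Neither helper is exported, so this assumes the two functions copied verbatim from the hunk above:

```js
tokenize("L'été, c'est génial !"); // → [ 'ete', 'est', 'genial' ]
tokenize("«Naïve» résumé—test");   // → [ 'naive', 'resume', 'test' ]
```
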