@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.2 → 3.1.16-custom.newbase.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
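
Note: removing the prefix shortcut means a query token that merely prefixes a longer word is no longer auto-accepted at score 0.95; it must now pass the edit-distance bound like any other candidate. A hand-computed illustration (not code from the package):

  // Before: fuzzyMatch("searching", "sea", 1) short-circuited to
  //         { matches: true, distance: 0, score: 0.95 } via startsWith.
  // After:  boundedLevenshtein("searching", "sea", 1) sees distance 6,
  //         which exceeds the tolerance, so the pair no longer matches.
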
@@ -182,23 +179,53 @@ function filterCandidatesByScore(candidatesMap, minScore) {
 }
 
 // src/scoring.ts
-function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, docPositions) {
   const phrases = [];
-  const queryTokens = Array.from(candidatesMap.keys());
+  const queryTokens = originalQueryTokens;
   const wordMatches = [];
-  for (let i = 0; i < documentTokens.length; i++) {
-    const docWord = documentTokens[i];
+  if (docPositions) {
+    const candidateLookup = /* @__PURE__ */ new Map();
     for (const [queryToken, candidates] of candidatesMap.entries()) {
       for (const candidate of candidates) {
-        if (candidate.word === docWord) {
-          wordMatches.push({
-            word: docWord,
-            queryToken,
-            position: i,
-            type: candidate.type,
-            distance: candidate.distance,
-            score: candidate.score
-          });
+        if (!candidateLookup.has(candidate.word)) {
+          candidateLookup.set(candidate.word, []);
+        }
+        candidateLookup.get(candidate.word).push({ queryToken, candidate });
+      }
+    }
+    for (const [docWord, positions] of Object.entries(docPositions)) {
+      const matches = candidateLookup.get(docWord);
+      if (matches) {
+        for (const { queryToken, candidate } of matches) {
+          for (const position of positions) {
+            wordMatches.push({
+              word: docWord,
+              queryToken,
+              position,
+              type: candidate.type,
+              distance: candidate.distance,
+              score: candidate.score
+            });
+          }
+        }
+      }
+    }
+    wordMatches.sort((a, b) => a.position - b.position);
+  } else {
+    for (let i = 0; i < documentTokens.length; i++) {
+      const docWord = documentTokens[i];
+      for (const [queryToken, candidates] of candidatesMap.entries()) {
+        for (const candidate of candidates) {
+          if (candidate.word === docWord) {
+            wordMatches.push({
+              word: docWord,
+              queryToken,
+              position: i,
+              type: candidate.type,
+              distance: candidate.distance,
+              score: candidate.score
+            });
+          }
         }
       }
     }
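
Note: with a positional index supplied, matching becomes one lookup per distinct document word instead of scanning every token against every candidate, and matches are re-sorted by position so phrase building sees the same ordering as the fallback path. The docPositions shape, as consumed by Object.entries above, is a plain object mapping each token to its positions — a sketch with example data (not from the package):

  // Document: "the quick brown fox jumps over the lazy dog"
  const docPositions = {
    the: [0, 6],
    quick: [1],
    brown: [2],
    fox: [3],
    jumps: [4],
    over: [5],
    lazy: [7],
    dog: [8]
  };
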
@@ -211,47 +238,76 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       config,
       documentFrequency,
       totalDocuments,
-      wordMatches
-      // Pass all word matches for density calculation
+      wordMatches,
+      documentTokens
+      // Pass document tokens to extract gap words
     );
     if (phrase && phrase.words.length > 0) {
       phrases.push(phrase);
     }
   }
-  return deduplicatePhrases(phrases);
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
-function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
+  const gapWords = [];
+  let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
-    const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
+    const lastPos = phraseWords[phraseWords.length - 1].position;
+    const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-    if (!coveredTokens.has(match.queryToken)) {
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-      coveredTokens.add(match.queryToken);
-    }
-    if (coveredTokens.size === queryTokens.length) {
-      break;
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
+    const coverage = phraseWords.length / queryTokens.length;
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
     const { score, breakdown } = calculatePhraseScore(
       phraseWords,
       queryTokens,
       config,
       documentFrequency,
       totalDocuments,
-      allWordMatches
+      allWordMatches,
+      coverage
     );
     return {
       words: phraseWords,
+      gapWords,
+      gapUsed: totalGapUsed,
+      coverage,
       startPosition: phraseWords[0].position,
       endPosition: phraseWords[phraseWords.length - 1].position,
-      gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
+      span,
       inOrder: isInOrder(phraseWords, queryTokens),
       score,
       scoreBreakdown: breakdown
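
Note: buildPhraseFromPosition now treats the query as a multiset (so "new york new" can legitimately consume "new" twice) and records the skipped words inside each gap. A hand-traced sketch with example data (not from the package):

  // Query tokens: ["new", "york", "new"] -> queryTokenCounts { new: 2, york: 1 }
  // Document:     "new life in york brings new hope"
  // Matches at positions 0 ("new"), 3 ("york"), 5 ("new"):
  //   matchedCounts -> { new: 2, york: 1 }, totalMatchedTokens === 3 -> break
  //   gapWords -> [{ word: "life", position: 1, gapIndex: 1 },
  //                { word: "in", position: 2, gapIndex: 2 },
  //                { word: "brings", position: 4, gapIndex: 3 }]

The companion change in findPhrasesInDocument drops single-word phrases for queries of three or more tokens (minTokensRequired = 2), so stray one-token hits no longer survive into deduplication.
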
@@ -259,7 +315,7 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
   }
   return null;
 }
-function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
   let baseScore = 0;
   for (const word of phraseWords) {
     const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
@@ -268,14 +324,16 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
-  const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
   let densityScore = 0;
   if (queryTokens.length === 1) {
     const totalOccurrences = allWordMatches.length;
-    densityScore = totalOccurrences / queryTokens.length;
-  } else {
-    densityScore = phraseWords.length / queryTokens.length;
+    densityScore = Math.min(1, totalOccurrences / 10);
   }
   const semanticScore = calculateSemanticScore(
     phraseWords,
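
Note: proximity is now skipped entirely when it cannot apply (no gap allowance, zero weight, or a single-token query), and single-token density is clamped to 0..1 instead of growing with raw occurrence counts. Worked numbers (illustrative, not package defaults):

  // 3-token query, proximitySpanMultiplier = 5, phrase spanning positions 10..13:
  const proximityWindow = 3 * 5;                  // 15
  const proximityScore = Math.max(0, 1 - 4 / 15); // ~0.733

  // Single-token query appearing 25 times: previously densityScore = 25, now clamped:
  const densityScore = Math.min(1, 25 / 10);      // 1

For multi-token queries density now stays 0; partial matches are penalized through the coverage multiplier instead (see the scoring hunks below).
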
@@ -289,8 +347,13 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   const weightedDensity = densityScore * weights.density;
   const weightedSemantic = semanticScore * weights.semantic;
   const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
-  const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
-  const score = totalScore / maxPossibleScore;
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
+  const normalizedScore = totalScore / maxPossibleScore;
+  const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
+  const score = normalizedScore * coverageMultiplier;
   const base = weightedBase / maxPossibleScore;
   const order = weightedOrder / maxPossibleScore;
   const proximity = weightedProximity / maxPossibleScore;
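
Note: the old denominator hard-coded a base weight of 1 and always charged for proximity, so configurations where proximity could never contribute (single-token queries, maxGap 0) had an unreachable ceiling. The new denominator counts only attainable components, and the result is scaled by coverage. Worked numbers under assumed weights (not the package defaults):

  // weights = { exact: 1, fuzzy: 0.8, order: 0.2, proximity: 0.2, density: 0.1, semantic: 0.15 }
  // tolerance = 1, maxGap = 5, 3-token query:
  const maxBaseWeight = Math.max(1, 0.8);              // 1 (fuzzy matches possible)
  const maxPossibleScore = 1 + 0.2 + 0.2 + 0.1 + 0.15; // 1.65
  // A phrase with totalScore 1.2 matching 2 of 3 tokens:
  const score = (1.2 / 1.65) * (2 / 3);                // ~0.485
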
@@ -303,18 +366,27 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
       order,
       proximity,
       density,
-      semantic
+      semantic,
+      coverage: coverageMultiplier
+      // Show coverage multiplier in breakdown
     }
   };
 }
 function isInOrder(phraseWords, queryTokens) {
-  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
-  for (let i = 1; i < phraseWords.length; i++) {
-    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
-    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
-    if (currOrder < prevOrder) {
+  const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+  let lastMatchedIndex = -1;
+  for (const phraseWord of phraseWords) {
+    let foundIndex = -1;
+    for (const pos of tokenPositions) {
+      if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+        foundIndex = pos.index;
+        break;
+      }
+    }
+    if (foundIndex === -1) {
       return false;
     }
+    lastMatchedIndex = foundIndex;
   }
   return true;
 }
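
Note: the old isInOrder collapsed duplicate query tokens to a single index, so a repeated token always compared equal and order checks misfired. The rewrite greedily assigns each phrase word to the next unused query position. Hand-traced example:

  // Query: ["new", "york", "new"]
  // Phrase words in document order "new", "york", "new":
  //   "new" -> index 0, "york" -> index 1, "new" -> index 2        => true
  // Phrase words "york", "new", "new":
  //   "york" -> index 1, "new" -> index 2, no index left after 2   => false
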
@@ -357,7 +429,8 @@ function deduplicatePhrases(phrases) {
 
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "content",
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
@@ -372,7 +445,10 @@ var DEFAULT_CONFIG = {
     semantic: 0.15
   },
   maxGap: 5,
-  minScore: 0.1
+  minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
+  proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
 function pluginFuzzyPhrase(userConfig = {}) {
@@ -392,7 +468,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
       semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
     },
     maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
-    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
+    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+    enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+    finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
+    proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
   };
   const plugin = {
     name: "fuzzy-phrase",
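
Note: the three new options (and the changed textProperty default, which must match the field name the server indexes) flow through both DEFAULT_CONFIG and the userConfig merge. A minimal usage sketch — pluginFuzzyPhrase is the function defined in this file, but how the package exports it is not shown in this diff:

  const plugin = pluginFuzzyPhrase({
    textProperty: "normalized_content", // must match the indexed field
    enableFinalScoreMinimum: true,      // opt in to the post-sort score floor
    finalScoreMinimum: 0.3,
    proximitySpanMultiplier: 5          // proximity window = query tokens x multiplier
  });
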
@@ -405,7 +484,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
         synonymMap: {},
         config,
         documentFrequency: /* @__PURE__ */ new Map(),
-        totalDocuments: 0
+        totalDocuments: 0,
+        vocabulary: /* @__PURE__ */ new Set()
       };
       if (config.enableSynonyms && config.supabase) {
         try {
@@ -422,6 +502,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
         state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
         console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
       }
+      try {
+        const indexData = orama.data?.index;
+        let radixNode = null;
+        if (indexData?.indexes?.[config.textProperty]?.node) {
+          radixNode = indexData.indexes[config.textProperty].node;
+        } else if (indexData?.[config.textProperty]?.node) {
+          radixNode = indexData[config.textProperty].node;
+        }
+        if (radixNode) {
+          state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+          console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+        } else {
+          console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
+        }
+      } catch (error) {
+        console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
+      }
       pluginStates.set(orama, state);
       console.log("\u2705 Fuzzy Phrase Plugin initialized");
       setImmediate(() => {
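
Note: vocabulary extraction moves from every search call into plugin initialization, probing the same two radix-tree paths the search path used before. A sketch of the resulting state (example words, not real data):

  // After init: state.vocabulary === Set { "quick", "brown", "fox", ... }
  // built once from the radix tree; searchWithFuzzyPhrase only re-extracts
  // lazily if this Set is still empty on the first query (see the hunks below).
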
@@ -443,7 +540,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     console.error("\u274C Plugin state not initialized");
     throw new Error("Fuzzy Phrase Plugin not properly initialized");
   }
-  const { term, properties } = params;
+  const { term, properties, tokenCache } = params;
   if (!term || typeof term !== "string") {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
@@ -454,32 +551,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
   }
   const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
   console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
-  let vocabulary;
-  try {
-    const indexData = orama.data?.index;
-    if (!indexData) {
-      console.error("\u274C No index data found in orama.data.index");
-      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
-    }
-    console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
-    let radixNode = null;
-    if (indexData.indexes?.[textProperty]?.node) {
-      radixNode = indexData.indexes[textProperty].node;
-      console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
-    } else if (indexData[textProperty]?.node) {
-      radixNode = indexData[textProperty].node;
-      console.log("\u2705 Found radix via standard path (data.index[property])");
-    }
-    if (!radixNode) {
-      console.error("\u274C Radix tree not found for property:", textProperty);
-      console.error("   Available properties in index:", Object.keys(indexData));
+  let vocabulary = state.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = state.vocabulary;
+        console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
       return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
     }
-    vocabulary = extractVocabularyFromRadixTree(radixNode);
-    console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
-  } catch (error) {
-    console.error("\u274C Failed to extract vocabulary:", error);
-    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  } else {
+    console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
   }
   const candidatesMap = findAllCandidates(
     queryTokens,
@@ -488,10 +584,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     state.config.enableSynonyms ? state.synonymMap : void 0,
     state.config.synonymMatchScore
   );
-  const filteredCandidates = filterCandidatesByScore(
-    candidatesMap,
-    state.config.minScore
-  );
+  const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
   const documentMatches = [];
   console.log("\u{1F50D} DEBUG orama.data structure:", {
@@ -518,22 +611,48 @@ async function searchWithFuzzyPhrase(orama, params, language) {
       dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
     });
   }
-  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
+  const cacheHits = tokenCache ? tokenCache.size : 0;
+  let hasPositionalIndex = false;
+  if (tokenCache && tokenCache.size > 0) {
+    const firstEntry = tokenCache.values().next().value;
+    hasPositionalIndex = !!(firstEntry && !Array.isArray(firstEntry) && firstEntry.positions);
+  }
+  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${hasPositionalIndex ? "\u26A1 positional index" : cacheHits > 0 ? "tokens cached" : "no cache"})`);
   for (const [docId, doc] of Object.entries(docs)) {
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
       continue;
     }
-    const docTokens = tokenize(text);
+    let docTokens;
+    let docPositions;
+    if (tokenCache && tokenCache.has(docId)) {
+      const cached = tokenCache.get(docId);
+      if (Array.isArray(cached)) {
+        docTokens = cached;
+      } else if (cached.tokens && cached.positions) {
+        docTokens = cached.tokens;
+        docPositions = cached.positions;
+      } else {
+        docTokens = tokenize(text);
+      }
+    } else {
+      docTokens = tokenize(text);
+    }
     const phrases = findPhrasesInDocument(
       docTokens,
       filteredCandidates,
       {
         weights: state.config.weights,
-        maxGap: state.config.maxGap
+        maxGap: state.config.maxGap,
+        proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+        tolerance
       },
       state.documentFrequency,
-      state.totalDocuments
+      state.totalDocuments,
+      queryTokens,
+      // Original tokens with duplicates preserved
+      docPositions
+      // Positional index for O(matches) lookup
     );
     if (phrases.length > 0) {
       const docScore = Math.max(...phrases.map((p) => p.score));
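
Note: per-document tokenization now honors an optional params.tokenCache. Two entry shapes are accepted, inferred from the branch above; anything else falls back to tokenize(text):

  // Legacy shape: a plain token array.
  tokenCache.set(docId, ["quick", "brown", "fox"]);
  // Positional shape: tokens plus a token -> positions map, which enables the
  // fast path (and exact positions) in findPhrasesInDocument.
  tokenCache.set(docId, {
    tokens: ["quick", "brown", "fox"],
    positions: { quick: [0], brown: [1], fox: [2] }
  });
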
@@ -546,8 +665,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     }
   }
   documentMatches.sort((a, b) => b.score - a.score);
-  const limit = params.limit ?? documentMatches.length;
-  const limitedMatches = documentMatches.slice(0, limit);
+  let filteredMatches = documentMatches;
+  if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+    const threshold = state.config.finalScoreMinimum;
+    const beforeCount = filteredMatches.length;
+    filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? filteredMatches.length;
+  const limitedMatches = filteredMatches.slice(0, limit);
   const hits = limitedMatches.map((match) => ({
     id: match.id,
     score: match.score,