@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.2 → 3.1.16-custom.newbase.20

This diff shows the content changes between publicly released versions of this package on a supported registry. The information in this diff is provided for informational purposes only and reflects the package versions as published to their public registries.
package/dist/index.js CHANGED
@@ -48,9 +48,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
48
48
  if (word === queryToken) {
49
49
  return { matches: true, distance: 0, score: 1 };
50
50
  }
51
- if (word.startsWith(queryToken)) {
52
- return { matches: true, distance: 0, score: 0.95 };
53
- }
54
51
  const result = boundedLevenshtein(word, queryToken, tolerance);
55
52
  if (result.isBounded) {
56
53
  const score = 1 - result.distance * 0.2;
@@ -180,23 +177,53 @@ function filterCandidatesByScore(candidatesMap, minScore) {
180
177
  }
181
178
 
182
179
  // src/scoring.ts
183
- function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
180
+ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, docPositions) {
184
181
  const phrases = [];
185
- const queryTokens = Array.from(candidatesMap.keys());
182
+ const queryTokens = originalQueryTokens;
186
183
  const wordMatches = [];
187
- for (let i = 0; i < documentTokens.length; i++) {
188
- const docWord = documentTokens[i];
184
+ if (docPositions) {
185
+ const candidateLookup = /* @__PURE__ */ new Map();
189
186
  for (const [queryToken, candidates] of candidatesMap.entries()) {
190
187
  for (const candidate of candidates) {
191
- if (candidate.word === docWord) {
192
- wordMatches.push({
193
- word: docWord,
194
- queryToken,
195
- position: i,
196
- type: candidate.type,
197
- distance: candidate.distance,
198
- score: candidate.score
199
- });
188
+ if (!candidateLookup.has(candidate.word)) {
189
+ candidateLookup.set(candidate.word, []);
190
+ }
191
+ candidateLookup.get(candidate.word).push({ queryToken, candidate });
192
+ }
193
+ }
194
+ for (const [docWord, positions] of Object.entries(docPositions)) {
195
+ const matches = candidateLookup.get(docWord);
196
+ if (matches) {
197
+ for (const { queryToken, candidate } of matches) {
198
+ for (const position of positions) {
199
+ wordMatches.push({
200
+ word: docWord,
201
+ queryToken,
202
+ position,
203
+ type: candidate.type,
204
+ distance: candidate.distance,
205
+ score: candidate.score
206
+ });
207
+ }
208
+ }
209
+ }
210
+ }
211
+ wordMatches.sort((a, b) => a.position - b.position);
212
+ } else {
213
+ for (let i = 0; i < documentTokens.length; i++) {
214
+ const docWord = documentTokens[i];
215
+ for (const [queryToken, candidates] of candidatesMap.entries()) {
216
+ for (const candidate of candidates) {
217
+ if (candidate.word === docWord) {
218
+ wordMatches.push({
219
+ word: docWord,
220
+ queryToken,
221
+ position: i,
222
+ type: candidate.type,
223
+ distance: candidate.distance,
224
+ score: candidate.score
225
+ });
226
+ }
200
227
  }
201
228
  }
202
229
  }
@@ -209,47 +236,76 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
209
236
  config,
210
237
  documentFrequency,
211
238
  totalDocuments,
212
- wordMatches
213
- // Pass all word matches for density calculation
239
+ wordMatches,
240
+ documentTokens
241
+ // Pass document tokens to extract gap words
214
242
  );
215
243
  if (phrase && phrase.words.length > 0) {
216
244
  phrases.push(phrase);
217
245
  }
218
246
  }
219
- return deduplicatePhrases(phrases);
247
+ const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
248
+ const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
249
+ return deduplicatePhrases(filteredPhrases);
220
250
  }
221
- function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
251
+ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
222
252
  const startMatch = wordMatches[startIndex];
223
253
  const phraseWords = [startMatch];
224
- const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
254
+ const queryTokenCounts = /* @__PURE__ */ new Map();
255
+ for (const token of queryTokens) {
256
+ queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
257
+ }
258
+ const matchedCounts = /* @__PURE__ */ new Map();
259
+ matchedCounts.set(startMatch.queryToken, 1);
260
+ const gapWords = [];
261
+ let totalGapUsed = 0;
262
+ let totalMatchedTokens = 1;
225
263
  for (let i = startIndex + 1; i < wordMatches.length; i++) {
226
264
  const match = wordMatches[i];
227
- const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
265
+ const lastPos = phraseWords[phraseWords.length - 1].position;
266
+ const gap = match.position - lastPos - 1;
228
267
  if (gap > config.maxGap) {
229
268
  break;
230
269
  }
231
- if (!coveredTokens.has(match.queryToken)) {
270
+ const neededCount = queryTokenCounts.get(match.queryToken) || 0;
271
+ const currentCount = matchedCounts.get(match.queryToken) || 0;
272
+ if (currentCount < neededCount) {
273
+ for (let pos = lastPos + 1; pos < match.position; pos++) {
274
+ totalGapUsed++;
275
+ gapWords.push({
276
+ word: documentTokens[pos],
277
+ position: pos,
278
+ gapIndex: totalGapUsed
279
+ });
280
+ }
232
281
  phraseWords.push(match);
233
- coveredTokens.add(match.queryToken);
234
- }
235
- if (coveredTokens.size === queryTokens.length) {
236
- break;
282
+ matchedCounts.set(match.queryToken, currentCount + 1);
283
+ totalMatchedTokens++;
284
+ if (totalMatchedTokens === queryTokens.length) {
285
+ break;
286
+ }
237
287
  }
238
288
  }
239
289
  if (phraseWords.length > 0) {
290
+ const coverage = phraseWords.length / queryTokens.length;
291
+ const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
240
292
  const { score, breakdown } = calculatePhraseScore(
241
293
  phraseWords,
242
294
  queryTokens,
243
295
  config,
244
296
  documentFrequency,
245
297
  totalDocuments,
246
- allWordMatches
298
+ allWordMatches,
299
+ coverage
247
300
  );
248
301
  return {
249
302
  words: phraseWords,
303
+ gapWords,
304
+ gapUsed: totalGapUsed,
305
+ coverage,
250
306
  startPosition: phraseWords[0].position,
251
307
  endPosition: phraseWords[phraseWords.length - 1].position,
252
- gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
308
+ span,
253
309
  inOrder: isInOrder(phraseWords, queryTokens),
254
310
  score,
255
311
  scoreBreakdown: breakdown
@@ -257,7 +313,7 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
257
313
  }
258
314
  return null;
259
315
  }
260
- function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
316
+ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
261
317
  let baseScore = 0;
262
318
  for (const word of phraseWords) {
263
319
  const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
@@ -266,14 +322,16 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
266
322
  baseScore /= phraseWords.length;
267
323
  const inOrder = isInOrder(phraseWords, queryTokens);
268
324
  const orderScore = inOrder ? 1 : 0.5;
269
- const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
270
- const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
325
+ let proximityScore = 0;
326
+ if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
327
+ const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
328
+ const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
329
+ proximityScore = Math.max(0, 1 - span / proximityWindow);
330
+ }
271
331
  let densityScore = 0;
272
332
  if (queryTokens.length === 1) {
273
333
  const totalOccurrences = allWordMatches.length;
274
- densityScore = totalOccurrences / queryTokens.length;
275
- } else {
276
- densityScore = phraseWords.length / queryTokens.length;
334
+ densityScore = Math.min(1, totalOccurrences / 10);
277
335
  }
278
336
  const semanticScore = calculateSemanticScore(
279
337
  phraseWords,
@@ -287,8 +345,13 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
287
345
  const weightedDensity = densityScore * weights.density;
288
346
  const weightedSemantic = semanticScore * weights.semantic;
289
347
  const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
290
- const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
291
- const score = totalScore / maxPossibleScore;
348
+ const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
349
+ const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
350
+ const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
351
+ const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
352
+ const normalizedScore = totalScore / maxPossibleScore;
353
+ const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
354
+ const score = normalizedScore * coverageMultiplier;
292
355
  const base = weightedBase / maxPossibleScore;
293
356
  const order = weightedOrder / maxPossibleScore;
294
357
  const proximity = weightedProximity / maxPossibleScore;
@@ -301,18 +364,27 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
301
364
  order,
302
365
  proximity,
303
366
  density,
304
- semantic
367
+ semantic,
368
+ coverage: coverageMultiplier
369
+ // Show coverage multiplier in breakdown
305
370
  }
306
371
  };
307
372
  }
308
373
  function isInOrder(phraseWords, queryTokens) {
309
- const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
310
- for (let i = 1; i < phraseWords.length; i++) {
311
- const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
312
- const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
313
- if (currOrder < prevOrder) {
374
+ const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
375
+ let lastMatchedIndex = -1;
376
+ for (const phraseWord of phraseWords) {
377
+ let foundIndex = -1;
378
+ for (const pos of tokenPositions) {
379
+ if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
380
+ foundIndex = pos.index;
381
+ break;
382
+ }
383
+ }
384
+ if (foundIndex === -1) {
314
385
  return false;
315
386
  }
387
+ lastMatchedIndex = foundIndex;
316
388
  }
317
389
  return true;
318
390
  }
@@ -355,7 +427,8 @@ function deduplicatePhrases(phrases) {
355
427
 
356
428
  // src/index.ts
357
429
  var DEFAULT_CONFIG = {
358
- textProperty: "content",
430
+ textProperty: "normalized_content",
431
+ // Must match server's field name
359
432
  tolerance: 1,
360
433
  adaptiveTolerance: true,
361
434
  enableSynonyms: false,
@@ -370,7 +443,10 @@ var DEFAULT_CONFIG = {
370
443
  semantic: 0.15
371
444
  },
372
445
  maxGap: 5,
373
- minScore: 0.1
446
+ minScore: 0.1,
447
+ enableFinalScoreMinimum: false,
448
+ finalScoreMinimum: 0.3,
449
+ proximitySpanMultiplier: 5
374
450
  };
375
451
  var pluginStates = /* @__PURE__ */ new WeakMap();
376
452
  function pluginFuzzyPhrase(userConfig = {}) {
@@ -390,7 +466,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
390
466
  semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
391
467
  },
392
468
  maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
393
- minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
469
+ minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
470
+ enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
471
+ finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
472
+ proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
394
473
  };
395
474
  const plugin = {
396
475
  name: "fuzzy-phrase",
@@ -403,7 +482,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
403
482
  synonymMap: {},
404
483
  config,
405
484
  documentFrequency: /* @__PURE__ */ new Map(),
406
- totalDocuments: 0
485
+ totalDocuments: 0,
486
+ vocabulary: /* @__PURE__ */ new Set()
407
487
  };
408
488
  if (config.enableSynonyms && config.supabase) {
409
489
  try {
@@ -420,6 +500,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
420
500
  state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
421
501
  console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
422
502
  }
503
+ try {
504
+ const indexData = orama.data?.index;
505
+ let radixNode = null;
506
+ if (indexData?.indexes?.[config.textProperty]?.node) {
507
+ radixNode = indexData.indexes[config.textProperty].node;
508
+ } else if (indexData?.[config.textProperty]?.node) {
509
+ radixNode = indexData[config.textProperty].node;
510
+ }
511
+ if (radixNode) {
512
+ state.vocabulary = extractVocabularyFromRadixTree(radixNode);
513
+ console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
514
+ } else {
515
+ console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
516
+ }
517
+ } catch (error) {
518
+ console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
519
+ }
423
520
  pluginStates.set(orama, state);
424
521
  console.log("\u2705 Fuzzy Phrase Plugin initialized");
425
522
  setImmediate(() => {
@@ -441,7 +538,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
441
538
  console.error("\u274C Plugin state not initialized");
442
539
  throw new Error("Fuzzy Phrase Plugin not properly initialized");
443
540
  }
444
- const { term, properties } = params;
541
+ const { term, properties, tokenCache } = params;
445
542
  if (!term || typeof term !== "string") {
446
543
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
447
544
  }
@@ -452,32 +549,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
452
549
  }
453
550
  const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
454
551
  console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
455
- let vocabulary;
456
- try {
457
- const indexData = orama.data?.index;
458
- if (!indexData) {
459
- console.error("\u274C No index data found in orama.data.index");
460
- return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
461
- }
462
- console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
463
- let radixNode = null;
464
- if (indexData.indexes?.[textProperty]?.node) {
465
- radixNode = indexData.indexes[textProperty].node;
466
- console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
467
- } else if (indexData[textProperty]?.node) {
468
- radixNode = indexData[textProperty].node;
469
- console.log("\u2705 Found radix via standard path (data.index[property])");
470
- }
471
- if (!radixNode) {
472
- console.error("\u274C Radix tree not found for property:", textProperty);
473
- console.error(" Available properties in index:", Object.keys(indexData));
552
+ let vocabulary = state.vocabulary;
553
+ if (vocabulary.size === 0) {
554
+ console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
555
+ try {
556
+ const indexData = orama.data?.index;
557
+ let radixNode = null;
558
+ if (indexData?.indexes?.[textProperty]?.node) {
559
+ radixNode = indexData.indexes[textProperty].node;
560
+ } else if (indexData?.[textProperty]?.node) {
561
+ radixNode = indexData[textProperty].node;
562
+ }
563
+ if (radixNode) {
564
+ state.vocabulary = extractVocabularyFromRadixTree(radixNode);
565
+ vocabulary = state.vocabulary;
566
+ console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
567
+ } else {
568
+ console.error("\u274C Radix tree not found for vocabulary extraction");
569
+ return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
570
+ }
571
+ } catch (error) {
572
+ console.error("\u274C Failed to extract vocabulary:", error);
474
573
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
475
574
  }
476
- vocabulary = extractVocabularyFromRadixTree(radixNode);
477
- console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
478
- } catch (error) {
479
- console.error("\u274C Failed to extract vocabulary:", error);
480
- return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
575
+ } else {
576
+ console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
481
577
  }
482
578
  const candidatesMap = findAllCandidates(
483
579
  queryTokens,
@@ -486,10 +582,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
486
582
  state.config.enableSynonyms ? state.synonymMap : void 0,
487
583
  state.config.synonymMatchScore
488
584
  );
489
- const filteredCandidates = filterCandidatesByScore(
490
- candidatesMap,
491
- state.config.minScore
492
- );
585
+ const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
493
586
  console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
494
587
  const documentMatches = [];
495
588
  console.log("\u{1F50D} DEBUG orama.data structure:", {
@@ -516,22 +609,48 @@ async function searchWithFuzzyPhrase(orama, params, language) {
516
609
  dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
517
610
  });
518
611
  }
519
- console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
612
+ const cacheHits = tokenCache ? tokenCache.size : 0;
613
+ let hasPositionalIndex = false;
614
+ if (tokenCache && tokenCache.size > 0) {
615
+ const firstEntry = tokenCache.values().next().value;
616
+ hasPositionalIndex = !!(firstEntry && !Array.isArray(firstEntry) && firstEntry.positions);
617
+ }
618
+ console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${hasPositionalIndex ? "\u26A1 positional index" : cacheHits > 0 ? "tokens cached" : "no cache"})`);
520
619
  for (const [docId, doc] of Object.entries(docs)) {
521
620
  const text = doc[textProperty];
522
621
  if (!text || typeof text !== "string") {
523
622
  continue;
524
623
  }
525
- const docTokens = tokenize(text);
624
+ let docTokens;
625
+ let docPositions;
626
+ if (tokenCache && tokenCache.has(docId)) {
627
+ const cached = tokenCache.get(docId);
628
+ if (Array.isArray(cached)) {
629
+ docTokens = cached;
630
+ } else if (cached.tokens && cached.positions) {
631
+ docTokens = cached.tokens;
632
+ docPositions = cached.positions;
633
+ } else {
634
+ docTokens = tokenize(text);
635
+ }
636
+ } else {
637
+ docTokens = tokenize(text);
638
+ }
526
639
  const phrases = findPhrasesInDocument(
527
640
  docTokens,
528
641
  filteredCandidates,
529
642
  {
530
643
  weights: state.config.weights,
531
- maxGap: state.config.maxGap
644
+ maxGap: state.config.maxGap,
645
+ proximitySpanMultiplier: state.config.proximitySpanMultiplier,
646
+ tolerance
532
647
  },
533
648
  state.documentFrequency,
534
- state.totalDocuments
649
+ state.totalDocuments,
650
+ queryTokens,
651
+ // Original tokens with duplicates preserved
652
+ docPositions
653
+ // Positional index for O(matches) lookup
535
654
  );
536
655
  if (phrases.length > 0) {
537
656
  const docScore = Math.max(...phrases.map((p) => p.score));
@@ -544,8 +663,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
544
663
  }
545
664
  }
546
665
  documentMatches.sort((a, b) => b.score - a.score);
547
- const limit = params.limit ?? documentMatches.length;
548
- const limitedMatches = documentMatches.slice(0, limit);
666
+ let filteredMatches = documentMatches;
667
+ if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
668
+ const threshold = state.config.finalScoreMinimum;
669
+ const beforeCount = filteredMatches.length;
670
+ filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
671
+ console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
672
+ }
673
+ const limit = params.limit ?? filteredMatches.length;
674
+ const limitedMatches = filteredMatches.slice(0, limit);
549
675
  const hits = limitedMatches.map((match) => ({
550
676
  id: match.id,
551
677
  score: match.score,