@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.2 → 3.1.16-custom.newbase.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
@@ -182,24 +179,32 @@ function filterCandidatesByScore(candidatesMap, minScore) {
 }
 
 // src/scoring.ts
-function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, docPositions) {
   const phrases = [];
-  const queryTokens = Array.from(candidatesMap.keys());
+  const queryTokens = originalQueryTokens;
   const wordMatches = [];
+  const candidateLookup = /* @__PURE__ */ new Map();
+  for (const [queryToken, candidates] of candidatesMap.entries()) {
+    for (const candidate of candidates) {
+      if (!candidateLookup.has(candidate.word)) {
+        candidateLookup.set(candidate.word, []);
+      }
+      candidateLookup.get(candidate.word).push({ queryToken, candidate });
+    }
+  }
   for (let i = 0; i < documentTokens.length; i++) {
     const docWord = documentTokens[i];
-    for (const [queryToken, candidates] of candidatesMap.entries()) {
-      for (const candidate of candidates) {
-        if (candidate.word === docWord) {
-          wordMatches.push({
-            word: docWord,
-            queryToken,
-            position: i,
-            type: candidate.type,
-            distance: candidate.distance,
-            score: candidate.score
-          });
-        }
+    const matches = candidateLookup.get(docWord);
+    if (matches) {
+      for (const { queryToken, candidate } of matches) {
+        wordMatches.push({
+          word: docWord,
+          queryToken,
+          position: i,
+          type: candidate.type,
+          distance: candidate.distance,
+          score: candidate.score
+        });
       }
     }
   }
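
Annotation: the hunk above swaps the nested scan over every candidate list for a reverse map keyed by candidate word, so each document token costs one Map lookup instead of a pass over all candidates. A minimal standalone sketch of the same idea (sample data is hypothetical, not from the package):

```js
// Invert queryToken -> candidates into word -> [{ queryToken, candidate }].
const candidatesMap = new Map([
  ["serch", [{ word: "search", type: "fuzzy", distance: 1, score: 0.8 }]],
  ["engine", [{ word: "engine", type: "exact", distance: 0, score: 1 }]]
]);
const candidateLookup = new Map();
for (const [queryToken, candidates] of candidatesMap.entries()) {
  for (const candidate of candidates) {
    if (!candidateLookup.has(candidate.word)) candidateLookup.set(candidate.word, []);
    candidateLookup.get(candidate.word).push({ queryToken, candidate });
  }
}
// One Map.get per document token instead of a loop over every candidate list:
console.log(candidateLookup.get("search")); // [ { queryToken: 'serch', candidate: {...} } ]
```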
@@ -211,47 +216,76 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       config,
       documentFrequency,
       totalDocuments,
-      wordMatches
-      // Pass all word matches for density calculation
+      wordMatches,
+      documentTokens
+      // Pass document tokens to extract gap words
     );
     if (phrase && phrase.words.length > 0) {
       phrases.push(phrase);
     }
   }
-  return deduplicatePhrases(phrases);
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
-function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
+  const gapWords = [];
+  let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
-    const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
+    const lastPos = phraseWords[phraseWords.length - 1].position;
+    const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-    if (!coveredTokens.has(match.queryToken)) {
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-      coveredTokens.add(match.queryToken);
-    }
-    if (coveredTokens.size === queryTokens.length) {
-      break;
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
+    const coverage = phraseWords.length / queryTokens.length;
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
     const { score, breakdown } = calculatePhraseScore(
       phraseWords,
       queryTokens,
       config,
       documentFrequency,
       totalDocuments,
-      allWordMatches
+      allWordMatches,
+      coverage
     );
     return {
       words: phraseWords,
+      gapWords,
+      gapUsed: totalGapUsed,
+      coverage,
       startPosition: phraseWords[0].position,
       endPosition: phraseWords[phraseWords.length - 1].position,
-      gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
+      span,
       inOrder: isInOrder(phraseWords, queryTokens),
       score,
       scoreBreakdown: breakdown
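
Annotation: two behavioral changes in this hunk are easy to miss: repeated query tokens can now match repeatedly (per-token counts instead of a covered set), and queries of three or more tokens discard single-word phrases before deduplication. A small illustration of the counting, with hypothetical data:

```js
// For the query ["new", "york", "new"], "new" may be matched twice.
const queryTokens = ["new", "york", "new"];
const queryTokenCounts = new Map();
for (const token of queryTokens) {
  queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
}
console.log(queryTokenCounts); // Map(2) { 'new' => 2, 'york' => 1 }

// With 3+ query tokens, one-word phrases are filtered out:
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
console.log(minTokensRequired); // 2
```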
@@ -259,7 +293,7 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
   }
   return null;
 }
-function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
   let baseScore = 0;
   for (const word of phraseWords) {
     const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
@@ -268,14 +302,16 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
-  const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
   let densityScore = 0;
   if (queryTokens.length === 1) {
     const totalOccurrences = allWordMatches.length;
-    densityScore = totalOccurrences / queryTokens.length;
-  } else {
-    densityScore = phraseWords.length / queryTokens.length;
+    densityScore = Math.min(1, totalOccurrences / 10);
   }
   const semanticScore = calculateSemanticScore(
     phraseWords,
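
Annotation: the proximity term now only applies to multi-token queries with gaps enabled, and its window is configurable. A worked example using the default `proximitySpanMultiplier` of 5 from the config hunk further down (the span value is hypothetical):

```js
const queryTokenCount = 3;         // tokens in the query
const proximitySpanMultiplier = 5; // default from DEFAULT_CONFIG below
const span = 4;                    // endPosition - startPosition + 1, hypothetical
const proximityWindow = queryTokenCount * proximitySpanMultiplier; // 15
const proximityScore = Math.max(0, 1 - span / proximityWindow);
console.log(proximityScore.toFixed(3)); // "0.733" - tighter phrases score closer to 1
```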
@@ -289,8 +325,13 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   const weightedDensity = densityScore * weights.density;
   const weightedSemantic = semanticScore * weights.semantic;
   const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
-  const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
-  const score = totalScore / maxPossibleScore;
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
+  const normalizedScore = totalScore / maxPossibleScore;
+  const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
+  const score = normalizedScore * coverageMultiplier;
   const base = weightedBase / maxPossibleScore;
   const order = weightedOrder / maxPossibleScore;
   const proximity = weightedProximity / maxPossibleScore;
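
Annotation: the old denominator `1 + order + proximity + density + semantic` assumed a perfect base score of 1 was always reachable; the new one caps the base term at the best attainable weight, zeroes proximity when it cannot apply, and multiplies by coverage. A worked example with assumed weights (only `semantic: 0.15`, `maxGap: 5`, and `tolerance: 1` appear in this diff; the rest are illustrative):

```js
const weights = { exact: 1, fuzzy: 0.8, order: 0.3, proximity: 0.2, density: 0.1, semantic: 0.15 };
const config = { tolerance: 1, maxGap: 5 };
const queryTokenCount = 2;
const coverage = 1 / 2; // matched 1 of 2 query tokens

const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact; // 1
const effectiveProximityWeight =
  config.maxGap > 0 && weights.proximity > 0 && queryTokenCount > 1 ? weights.proximity : 0; // 0.2
const maxPossibleScore =
  maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic; // 1.75
const totalScore = 1.2; // hypothetical weighted sum
console.log(((totalScore / maxPossibleScore) * coverage).toFixed(3)); // "0.343"
```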
@@ -303,18 +344,27 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
       order,
       proximity,
       density,
-      semantic
+      semantic,
+      coverage: coverageMultiplier
+      // Show coverage multiplier in breakdown
     }
   };
 }
 function isInOrder(phraseWords, queryTokens) {
-  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
-  for (let i = 1; i < phraseWords.length; i++) {
-    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
-    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
-    if (currOrder < prevOrder) {
+  const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+  let lastMatchedIndex = -1;
+  for (const phraseWord of phraseWords) {
+    let foundIndex = -1;
+    for (const pos of tokenPositions) {
+      if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+        foundIndex = pos.index;
+        break;
+      }
+    }
+    if (foundIndex === -1) {
       return false;
     }
+    lastMatchedIndex = foundIndex;
   }
   return true;
 }
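
Annotation: the rewritten `isInOrder` fixes duplicate query tokens. The old `new Map(queryTokens.map(...))` kept only the last index per token, so a query like `["big", "data", "big"]` mapped both `big`s to index 2 and rejected a correctly ordered match. The greedy version consumes query positions left to right; a condensed check with hypothetical data:

```js
const queryTokens = ["big", "data", "big"];
const phraseWords = [{ queryToken: "big" }, { queryToken: "data" }, { queryToken: "big" }];

const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
let lastMatchedIndex = -1;
let inOrder = true;
for (const phraseWord of phraseWords) {
  const pos = tokenPositions.find((p) => p.token === phraseWord.queryToken && p.index > lastMatchedIndex);
  if (!pos) { inOrder = false; break; }
  lastMatchedIndex = pos.index;
}
console.log(inOrder); // true - the old Map-based version returned false for this input
```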
@@ -357,7 +407,8 @@ function deduplicatePhrases(phrases) {
 
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "content",
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
@@ -372,7 +423,10 @@ var DEFAULT_CONFIG = {
     semantic: 0.15
   },
   maxGap: 5,
-  minScore: 0.1
+  minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
+  proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
 function pluginFuzzyPhrase(userConfig = {}) {
  function pluginFuzzyPhrase(userConfig = {}) {
@@ -392,7 +446,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
392
446
  semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
393
447
  },
394
448
  maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
395
- minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
449
+ minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
450
+ enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
451
+ finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
452
+ proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
396
453
  };
397
454
  const plugin = {
398
455
  name: "fuzzy-phrase",
@@ -405,7 +462,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
       synonymMap: {},
       config,
       documentFrequency: /* @__PURE__ */ new Map(),
-      totalDocuments: 0
+      totalDocuments: 0,
+      vocabulary: /* @__PURE__ */ new Set()
     };
     if (config.enableSynonyms && config.supabase) {
       try {
@@ -422,6 +480,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
       state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
       console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
     }
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[config.textProperty]?.node) {
+        radixNode = indexData.indexes[config.textProperty].node;
+      } else if (indexData?.[config.textProperty]?.node) {
+        radixNode = indexData[config.textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+      } else {
+        console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
+      }
+    } catch (error) {
+      console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
+    }
     pluginStates.set(orama, state);
     console.log("\u2705 Fuzzy Phrase Plugin initialized");
     setImmediate(() => {
@@ -443,7 +518,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     console.error("\u274C Plugin state not initialized");
     throw new Error("Fuzzy Phrase Plugin not properly initialized");
   }
-  const { term, properties } = params;
+  const { term, properties, tokenCache } = params;
   if (!term || typeof term !== "string") {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
@@ -454,32 +529,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
   }
   const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
   console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
-  let vocabulary;
-  try {
-    const indexData = orama.data?.index;
-    if (!indexData) {
-      console.error("\u274C No index data found in orama.data.index");
-      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
-    }
-    console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
-    let radixNode = null;
-    if (indexData.indexes?.[textProperty]?.node) {
-      radixNode = indexData.indexes[textProperty].node;
-      console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
-    } else if (indexData[textProperty]?.node) {
-      radixNode = indexData[textProperty].node;
-      console.log("\u2705 Found radix via standard path (data.index[property])");
-    }
-    if (!radixNode) {
-      console.error("\u274C Radix tree not found for property:", textProperty);
-      console.error(" Available properties in index:", Object.keys(indexData));
+  let vocabulary = state.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = state.vocabulary;
+        console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
       return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
     }
-    vocabulary = extractVocabularyFromRadixTree(radixNode);
-    console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
-  } catch (error) {
-    console.error("\u274C Failed to extract vocabulary:", error);
-    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  } else {
+    console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
   }
   const candidatesMap = findAllCandidates(
     queryTokens,
  const candidatesMap = findAllCandidates(
485
559
  queryTokens,
@@ -488,10 +562,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     state.config.enableSynonyms ? state.synonymMap : void 0,
     state.config.synonymMatchScore
   );
-  const filteredCandidates = filterCandidatesByScore(
-    candidatesMap,
-    state.config.minScore
-  );
+  const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
   const documentMatches = [];
   console.log("\u{1F50D} DEBUG orama.data structure:", {
@@ -518,23 +589,44 @@ async function searchWithFuzzyPhrase(orama, params, language) {
       dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
     });
   }
-  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
+  const cacheHits = tokenCache ? tokenCache.size : 0;
+  let hasPositionalIndex = false;
+  if (tokenCache && tokenCache.size > 0) {
+    const firstEntry = tokenCache.values().next().value;
+    hasPositionalIndex = !!(firstEntry && !Array.isArray(firstEntry) && firstEntry.positions);
+  }
+  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${hasPositionalIndex ? "\u26A1 positional index" : cacheHits > 0 ? "tokens cached" : "no cache"})`);
   for (const [docId, doc] of Object.entries(docs)) {
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
       continue;
     }
-    const docTokens = tokenize(text);
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      const cached = tokenCache.get(docId);
+      if (Array.isArray(cached)) {
+        docTokens = cached;
+      } else if (cached.tokens && cached.positions) {
+        docTokens = cached.tokens;
+        cached.positions;
+      } else {
+        docTokens = tokenize(text);
+      }
+    } else {
+      docTokens = tokenize(text);
+    }
     const phrases = findPhrasesInDocument(
       docTokens,
       filteredCandidates,
       {
         weights: state.config.weights,
-        maxGap: state.config.maxGap
+        maxGap: state.config.maxGap,
+        proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+        tolerance
       },
       state.documentFrequency,
-      state.totalDocuments
-    );
+      state.totalDocuments,
+      queryTokens);
     if (phrases.length > 0) {
       const docScore = Math.max(...phrases.map((p) => p.score));
       documentMatches.push({
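
Annotation: the loop above accepts an optional `tokenCache` in two shapes: a plain token array, or `{ tokens, positions }` (the positional shape is detected for logging, but in this build `cached.positions` is evaluated and discarded). A sketch of pre-building such a cache; `tokenize` here is a hypothetical stand-in for the plugin's own tokenizer:

```js
const docs = { d1: { normalized_content: "fuzzy phrase search engine" } }; // hypothetical
const tokenize = (text) => text.toLowerCase().split(/\W+/).filter(Boolean);

const tokenCache = new Map();
for (const [docId, doc] of Object.entries(docs)) {
  tokenCache.set(docId, tokenize(doc.normalized_content ?? "")); // plain-array shape
  // Positional shape (also accepted): tokenCache.set(docId, { tokens, positions });
}
// Then pass it through the search params alongside `term`:
// search(orama, { term: "fuzzy phrase", tokenCache });
```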
@@ -546,8 +638,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     }
   }
   documentMatches.sort((a, b) => b.score - a.score);
-  const limit = params.limit ?? documentMatches.length;
-  const limitedMatches = documentMatches.slice(0, limit);
+  let filteredMatches = documentMatches;
+  if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+    const threshold = state.config.finalScoreMinimum;
+    const beforeCount = filteredMatches.length;
+    filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? filteredMatches.length;
+  const limitedMatches = filteredMatches.slice(0, limit);
   const hits = limitedMatches.map((match) => ({
     id: match.id,
     score: match.score,