@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.2 → 3.1.16-custom.newbase.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -48,9 +48,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
  if (word === queryToken) {
  return { matches: true, distance: 0, score: 1 };
  }
- if (word.startsWith(queryToken)) {
- return { matches: true, distance: 0, score: 0.95 };
- }
  const result = boundedLevenshtein(word, queryToken, tolerance);
  if (result.isBounded) {
  const score = 1 - result.distance * 0.2;
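Note: the removed `startsWith` branch gave any query token that was a prefix of an indexed word a flat 0.95, no matter how much longer the word was; matching now falls through to the bounded edit-distance check for everything except exact hits. A minimal sketch of the resulting behavior — `editDistance` below is an illustrative stand-in, since the plugin's actual `boundedLevenshtein` is not shown in this diff:

```ts
// Illustrative only: plain Levenshtein distance via dynamic programming.
function editDistance(a: string, b: string): number {
  const dp = Array.from({ length: a.length + 1 }, (_, i) =>
    Array.from({ length: b.length + 1 }, (_, j) => (i === 0 ? j : j === 0 ? i : 0))
  );
  for (let i = 1; i <= a.length; i++)
    for (let j = 1; j <= b.length; j++)
      dp[i][j] = Math.min(
        dp[i - 1][j] + 1,      // deletion
        dp[i][j - 1] + 1,      // insertion
        dp[i - 1][j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1) // substitution
      );
  return dp[a.length][b.length];
}

// Sketch of the post-change rule: exact match scores 1, everything else
// must pass the tolerance check and scores 1 - distance * 0.2.
function sketchFuzzyMatch(word: string, queryToken: string, tolerance: number) {
  if (word === queryToken) return { matches: true, distance: 0, score: 1 };
  const distance = editDistance(word, queryToken);
  if (distance <= tolerance) return { matches: true, distance, score: 1 - distance * 0.2 };
  return { matches: false }; // "catalog" vs "cat" (distance 4) no longer matches via prefix
}
```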
@@ -180,24 +177,32 @@ function filterCandidatesByScore(candidatesMap, minScore) {
  }
 
  // src/scoring.ts
- function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, docPositions) {
  const phrases = [];
- const queryTokens = Array.from(candidatesMap.keys());
+ const queryTokens = originalQueryTokens;
  const wordMatches = [];
+ const candidateLookup = /* @__PURE__ */ new Map();
+ for (const [queryToken, candidates] of candidatesMap.entries()) {
+ for (const candidate of candidates) {
+ if (!candidateLookup.has(candidate.word)) {
+ candidateLookup.set(candidate.word, []);
+ }
+ candidateLookup.get(candidate.word).push({ queryToken, candidate });
+ }
+ }
  for (let i = 0; i < documentTokens.length; i++) {
  const docWord = documentTokens[i];
- for (const [queryToken, candidates] of candidatesMap.entries()) {
- for (const candidate of candidates) {
- if (candidate.word === docWord) {
- wordMatches.push({
- word: docWord,
- queryToken,
- position: i,
- type: candidate.type,
- distance: candidate.distance,
- score: candidate.score
- });
- }
+ const matches = candidateLookup.get(docWord);
+ if (matches) {
+ for (const { queryToken, candidate } of matches) {
+ wordMatches.push({
+ word: docWord,
+ queryToken,
+ position: i,
+ type: candidate.type,
+ distance: candidate.distance,
+ score: candidate.score
+ });
  }
  }
  }
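Note: the nested scan over every candidate list for every document token becomes a single `Map` lookup per token; the inverted table costs one pass over the candidates per document. A self-contained sketch of the same idea:

```ts
// Build once per document: word -> all (queryToken, candidate) pairs that hit it.
type Candidate = { word: string; type: string; distance: number; score: number };

function invertCandidates(candidatesMap: Map<string, Candidate[]>) {
  const lookup = new Map<string, { queryToken: string; candidate: Candidate }[]>();
  for (const [queryToken, candidates] of candidatesMap) {
    for (const candidate of candidates) {
      const bucket = lookup.get(candidate.word) ?? [];
      bucket.push({ queryToken, candidate });
      lookup.set(candidate.word, bucket);
    }
  }
  return lookup; // per-token cost drops from O(Q * C) to O(1) after O(Q * C) setup
}
```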
@@ -209,47 +214,76 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
  config,
  documentFrequency,
  totalDocuments,
- wordMatches
- // Pass all word matches for density calculation
+ wordMatches,
+ documentTokens
+ // Pass document tokens to extract gap words
  );
  if (phrase && phrase.words.length > 0) {
  phrases.push(phrase);
  }
  }
- return deduplicatePhrases(phrases);
+ const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+ const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+ return deduplicatePhrases(filteredPhrases);
  }
- function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
  const startMatch = wordMatches[startIndex];
  const phraseWords = [startMatch];
- const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+ const queryTokenCounts = /* @__PURE__ */ new Map();
+ for (const token of queryTokens) {
+ queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+ }
+ const matchedCounts = /* @__PURE__ */ new Map();
+ matchedCounts.set(startMatch.queryToken, 1);
+ const gapWords = [];
+ let totalGapUsed = 0;
+ let totalMatchedTokens = 1;
  for (let i = startIndex + 1; i < wordMatches.length; i++) {
  const match = wordMatches[i];
- const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
+ const lastPos = phraseWords[phraseWords.length - 1].position;
+ const gap = match.position - lastPos - 1;
  if (gap > config.maxGap) {
  break;
  }
- if (!coveredTokens.has(match.queryToken)) {
+ const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+ const currentCount = matchedCounts.get(match.queryToken) || 0;
+ if (currentCount < neededCount) {
+ for (let pos = lastPos + 1; pos < match.position; pos++) {
+ totalGapUsed++;
+ gapWords.push({
+ word: documentTokens[pos],
+ position: pos,
+ gapIndex: totalGapUsed
+ });
+ }
  phraseWords.push(match);
- coveredTokens.add(match.queryToken);
- }
- if (coveredTokens.size === queryTokens.length) {
- break;
+ matchedCounts.set(match.queryToken, currentCount + 1);
+ totalMatchedTokens++;
+ if (totalMatchedTokens === queryTokens.length) {
+ break;
+ }
  }
  }
  if (phraseWords.length > 0) {
+ const coverage = phraseWords.length / queryTokens.length;
+ const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
  const { score, breakdown } = calculatePhraseScore(
  phraseWords,
  queryTokens,
  config,
  documentFrequency,
  totalDocuments,
- allWordMatches
+ allWordMatches,
+ coverage
  );
  return {
  words: phraseWords,
+ gapWords,
+ gapUsed: totalGapUsed,
+ coverage,
  startPosition: phraseWords[0].position,
  endPosition: phraseWords[phraseWords.length - 1].position,
- gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
+ span,
  inOrder: isInOrder(phraseWords, queryTokens),
  score,
  scoreBreakdown: breakdown
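Note: phrase building previously tracked covered query tokens in a `Set`, so a query with repeated tokens ("to be or not to be") could consume each distinct token only once. The `queryTokenCounts`/`matchedCounts` pair gives each occurrence its own budget, and the words skipped inside gaps are now recorded (`gapWords`, `gapUsed`) alongside a `coverage` ratio. A sketch of the budget construction:

```ts
// Per-token match budget: repeated query tokens can each be consumed
// once per occurrence instead of once overall.
function tokenBudget(queryTokens: string[]): Map<string, number> {
  const counts = new Map<string, number>();
  for (const t of queryTokens) counts.set(t, (counts.get(t) ?? 0) + 1);
  return counts;
}
// tokenBudget(["to","be","or","not","to","be"]) -> Map { to: 2, be: 2, or: 1, not: 1 }
```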
@@ -257,7 +291,7 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
  }
  return null;
  }
- function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
  let baseScore = 0;
  for (const word of phraseWords) {
  const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
@@ -266,14 +300,16 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
  baseScore /= phraseWords.length;
  const inOrder = isInOrder(phraseWords, queryTokens);
  const orderScore = inOrder ? 1 : 0.5;
- const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
- const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
+ let proximityScore = 0;
+ if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+ const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+ const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+ proximityScore = Math.max(0, 1 - span / proximityWindow);
+ }
  let densityScore = 0;
  if (queryTokens.length === 1) {
  const totalOccurrences = allWordMatches.length;
- densityScore = totalOccurrences / queryTokens.length;
- } else {
- densityScore = phraseWords.length / queryTokens.length;
+ densityScore = Math.min(1, totalOccurrences / 10);
  }
  const semanticScore = calculateSemanticScore(
  phraseWords,
@@ -287,8 +323,13 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
  const weightedDensity = densityScore * weights.density;
  const weightedSemantic = semanticScore * weights.semantic;
  const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
- const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
- const score = totalScore / maxPossibleScore;
+ const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+ const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+ const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+ const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
+ const normalizedScore = totalScore / maxPossibleScore;
+ const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
+ const score = normalizedScore * coverageMultiplier;
  const base = weightedBase / maxPossibleScore;
  const order = weightedOrder / maxPossibleScore;
  const proximity = weightedProximity / maxPossibleScore;
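Note: normalization now divides by a ceiling that reflects what is actually attainable under the active config (`maxBaseWeight`, proximity weight only when proximity can contribute), then multiplies by coverage for multi-token queries, so partial matches can no longer score as highly as full ones. Also, with the `else` branch removed, density contributes nothing for multi-token queries. A worked example with assumed weights — only `semantic: 0.15` is confirmed elsewhere in this diff; the other values are hypothetical:

```ts
// Assumed weights for illustration (not the package's confirmed defaults).
const w = { exact: 1, fuzzy: 0.7, order: 0.3, proximity: 0.2, density: 0.2, semantic: 0.15 };
const coverage = 2 / 3;                                        // matched 2 of 3 query tokens
const total = 1 * w.exact + 1 * w.order + 0.9 * w.proximity;   // base + order + tight-span proximity
const max = w.exact + w.order + w.proximity + w.density + w.semantic; // 1.85
console.log((total / max) * coverage);                         // 1.48 / 1.85 * 0.667 ≈ 0.53
```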
@@ -301,18 +342,27 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
  order,
  proximity,
  density,
- semantic
+ semantic,
+ coverage: coverageMultiplier
+ // Show coverage multiplier in breakdown
  }
  };
  }
  function isInOrder(phraseWords, queryTokens) {
- const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
- for (let i = 1; i < phraseWords.length; i++) {
- const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
- const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
- if (currOrder < prevOrder) {
+ const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+ let lastMatchedIndex = -1;
+ for (const phraseWord of phraseWords) {
+ let foundIndex = -1;
+ for (const pos of tokenPositions) {
+ if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+ foundIndex = pos.index;
+ break;
+ }
+ }
+ if (foundIndex === -1) {
  return false;
  }
+ lastMatchedIndex = foundIndex;
  }
  return true;
  }
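Note: the old `isInOrder` mapped each token to a single index, and `new Map(...)` keeps only the last index for duplicate tokens, so queries with repeated tokens could be misjudged; the rewrite greedily consumes query positions left to right. A compact equivalent sketch:

```ts
// Greedy in-order check that tolerates repeated query tokens.
function inOrderSketch(matchedTokens: string[], queryTokens: string[]): boolean {
  let last = -1;
  for (const t of matchedTokens) {
    const idx = queryTokens.findIndex((q, i) => q === t && i > last);
    if (idx === -1) return false; // no unconsumed query position left for this token
    last = idx;
  }
  return true;
}
// inOrderSketch(["to","be","to"], ["to","be","to"]) -> true
// (the old first-index map compared "be" (1) against "to" (2) and reported out of order)
```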
@@ -355,7 +405,8 @@ function deduplicatePhrases(phrases) {
 
  // src/index.ts
  var DEFAULT_CONFIG = {
- textProperty: "content",
+ textProperty: "normalized_content",
+ // Must match server's field name
  tolerance: 1,
  adaptiveTolerance: true,
  enableSynonyms: false,
@@ -370,7 +421,10 @@ var DEFAULT_CONFIG = {
  semantic: 0.15
  },
  maxGap: 5,
- minScore: 0.1
+ minScore: 0.1,
+ enableFinalScoreMinimum: false,
+ finalScoreMinimum: 0.3,
+ proximitySpanMultiplier: 5
  };
  var pluginStates = /* @__PURE__ */ new WeakMap();
  function pluginFuzzyPhrase(userConfig = {}) {
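Note: three new options land in `DEFAULT_CONFIG`: `enableFinalScoreMinimum` (off by default) with `finalScoreMinimum` gating final document scores, and `proximitySpanMultiplier` replacing the hard-coded `* 5` in the proximity window. They pass through `pluginFuzzyPhrase` like any other setting; the values below are illustrative:

```ts
// Hypothetical usage of the new config keys (defaults shown in the hunk above).
const plugin = pluginFuzzyPhrase({
  enableFinalScoreMinimum: true, // opt in to post-scoring document filtering
  finalScoreMinimum: 0.3,        // documents scoring below 0.3 are dropped before `limit`
  proximitySpanMultiplier: 5,    // proximity window = queryTokens.length * this value
});
```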
@@ -390,7 +444,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
  semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
  },
  maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
- minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
+ minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+ enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+ finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
+ proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
  };
  const plugin = {
  name: "fuzzy-phrase",
@@ -403,7 +460,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
  synonymMap: {},
  config,
  documentFrequency: /* @__PURE__ */ new Map(),
- totalDocuments: 0
+ totalDocuments: 0,
+ vocabulary: /* @__PURE__ */ new Set()
  };
  if (config.enableSynonyms && config.supabase) {
  try {
@@ -420,6 +478,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
  state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
  console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
  }
+ try {
+ const indexData = orama.data?.index;
+ let radixNode = null;
+ if (indexData?.indexes?.[config.textProperty]?.node) {
+ radixNode = indexData.indexes[config.textProperty].node;
+ } else if (indexData?.[config.textProperty]?.node) {
+ radixNode = indexData[config.textProperty].node;
+ }
+ if (radixNode) {
+ state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+ console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+ } else {
+ console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
+ }
+ } catch (error) {
+ console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
+ }
  pluginStates.set(orama, state);
  console.log("\u2705 Fuzzy Phrase Plugin initialized");
  setImmediate(() => {
@@ -441,7 +516,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
  console.error("\u274C Plugin state not initialized");
  throw new Error("Fuzzy Phrase Plugin not properly initialized");
  }
- const { term, properties } = params;
+ const { term, properties, tokenCache } = params;
  if (!term || typeof term !== "string") {
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
@@ -452,32 +527,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
  }
  const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
  console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
- let vocabulary;
- try {
- const indexData = orama.data?.index;
- if (!indexData) {
- console.error("\u274C No index data found in orama.data.index");
- return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
- }
- console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
- let radixNode = null;
- if (indexData.indexes?.[textProperty]?.node) {
- radixNode = indexData.indexes[textProperty].node;
- console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
- } else if (indexData[textProperty]?.node) {
- radixNode = indexData[textProperty].node;
- console.log("\u2705 Found radix via standard path (data.index[property])");
- }
- if (!radixNode) {
- console.error("\u274C Radix tree not found for property:", textProperty);
- console.error(" Available properties in index:", Object.keys(indexData));
+ let vocabulary = state.vocabulary;
+ if (vocabulary.size === 0) {
+ console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+ try {
+ const indexData = orama.data?.index;
+ let radixNode = null;
+ if (indexData?.indexes?.[textProperty]?.node) {
+ radixNode = indexData.indexes[textProperty].node;
+ } else if (indexData?.[textProperty]?.node) {
+ radixNode = indexData[textProperty].node;
+ }
+ if (radixNode) {
+ state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+ vocabulary = state.vocabulary;
+ console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+ } else {
+ console.error("\u274C Radix tree not found for vocabulary extraction");
+ return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+ }
+ } catch (error) {
+ console.error("\u274C Failed to extract vocabulary:", error);
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
- vocabulary = extractVocabularyFromRadixTree(radixNode);
- console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
- } catch (error) {
- console.error("\u274C Failed to extract vocabulary:", error);
- return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+ } else {
+ console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
  }
  const candidatesMap = findAllCandidates(
  queryTokens,
@@ -486,10 +560,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
  state.config.enableSynonyms ? state.synonymMap : void 0,
  state.config.synonymMatchScore
  );
- const filteredCandidates = filterCandidatesByScore(
- candidatesMap,
- state.config.minScore
- );
+ const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
  console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
  const documentMatches = [];
  console.log("\u{1F50D} DEBUG orama.data structure:", {
@@ -516,23 +587,44 @@ async function searchWithFuzzyPhrase(orama, params, language) {
  dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
  });
  }
- console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
+ const cacheHits = tokenCache ? tokenCache.size : 0;
+ let hasPositionalIndex = false;
+ if (tokenCache && tokenCache.size > 0) {
+ const firstEntry = tokenCache.values().next().value;
+ hasPositionalIndex = !!(firstEntry && !Array.isArray(firstEntry) && firstEntry.positions);
+ }
+ console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${hasPositionalIndex ? "\u26A1 positional index" : cacheHits > 0 ? "tokens cached" : "no cache"})`);
  for (const [docId, doc] of Object.entries(docs)) {
  const text = doc[textProperty];
  if (!text || typeof text !== "string") {
  continue;
  }
- const docTokens = tokenize(text);
+ let docTokens;
+ if (tokenCache && tokenCache.has(docId)) {
+ const cached = tokenCache.get(docId);
+ if (Array.isArray(cached)) {
+ docTokens = cached;
+ } else if (cached.tokens && cached.positions) {
+ docTokens = cached.tokens;
+ cached.positions;
+ } else {
+ docTokens = tokenize(text);
+ }
+ } else {
+ docTokens = tokenize(text);
+ }
  const phrases = findPhrasesInDocument(
  docTokens,
  filteredCandidates,
  {
  weights: state.config.weights,
- maxGap: state.config.maxGap
+ maxGap: state.config.maxGap,
+ proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+ tolerance
  },
  state.documentFrequency,
- state.totalDocuments
- );
+ state.totalDocuments,
+ queryTokens);
  if (phrases.length > 0) {
  const docScore = Math.max(...phrases.map((p) => p.score));
  documentMatches.push({
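Note: per-query tokenization can now be skipped by handing the search a `tokenCache`; entries may be a plain token array or a positional record. The bare `cached.positions;` expression above is a no-op — positions are detected (for the log line, which samples only the first cache entry) but not yet consumed. A sketch of the accepted shapes; the concrete type of `positions` is not pinned down by this diff, so a per-token position list is assumed here:

```ts
// Hypothetical cache, keyed by document id as in the loop above.
type TokenCacheEntry = string[] | { tokens: string[]; positions: Map<string, number[]> };
const tokenCache = new Map<string, TokenCacheEntry>();
tokenCache.set("doc-1", ["quick", "brown", "fox"]); // plain pre-tokenized form
tokenCache.set("doc-2", {
  tokens: ["lazy", "dog"],
  positions: new Map([["lazy", [0]], ["dog", [1]]]), // positional form (positions unused so far)
});
await searchWithFuzzyPhrase(orama, { term: "quick fox", tokenCache });
```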
@@ -544,8 +636,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
  }
  }
  documentMatches.sort((a, b) => b.score - a.score);
- const limit = params.limit ?? documentMatches.length;
- const limitedMatches = documentMatches.slice(0, limit);
+ let filteredMatches = documentMatches;
+ if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+ const threshold = state.config.finalScoreMinimum;
+ const beforeCount = filteredMatches.length;
+ filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+ console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+ }
+ const limit = params.limit ?? filteredMatches.length;
+ const limitedMatches = filteredMatches.slice(0, limit);
  const hits = limitedMatches.map((match) => ({
  id: match.id,
  score: match.score,
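Note: the final-score gate runs after sorting and before the limit, so `limit` now counts only documents that clear the threshold rather than slicing the raw match list. An equivalent self-contained sketch, with the feature enabled:

```ts
// Threshold 0.3 and limit 10 assumed for illustration.
type DocMatch = { id: string; score: number };
const matches: DocMatch[] = [
  { id: "a", score: 0.82 }, { id: "b", score: 0.41 }, { id: "c", score: 0.12 },
];
const hits = matches
  .sort((x, y) => y.score - x.score)
  .filter((m) => m.score >= 0.3) // final score gate drops "c" first
  .slice(0, 10);                 // limit applies to the filtered list -> [a, b]
```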