@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -48,9 +48,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
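
Removing the `startsWith` shortcut changes both recall and scoring: a document word that merely begins with the query token no longer matches with a flat 0.95, it must now pass `boundedLevenshtein` within the configured tolerance like any other candidate. A sketch of the difference, with hypothetical French inputs and scores following the `1 - distance * 0.2` formula above (the return shape for non-matches is not shown in this hunk):

    // Before this change: any prefix match returned a flat 0.95.
    fuzzyMatch("marchepied", "marche", 1);
    // old: { matches: true, distance: 0, score: 0.95 }
    // new: no match, since edit distance 4 exceeds tolerance 1

    // Words within tolerance still match, now scored by edit distance:
    fuzzyMatch("marches", "marche", 1);
    // old: { matches: true, distance: 0, score: 0.95 } (prefix shortcut)
    // new: { matches: true, distance: 1, score: 0.8 }
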
@@ -180,24 +177,41 @@ function filterCandidatesByScore(candidatesMap, minScore) {
 }
 
 // src/scoring.ts
-function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens) {
   const phrases = [];
-  const queryTokens = Array.from(candidatesMap.keys());
+  const queryTokens = originalQueryTokens;
   const wordMatches = [];
+  const candidateLookup = /* @__PURE__ */ new Map();
+  for (const [queryToken, candidates] of candidatesMap.entries()) {
+    for (const candidate of candidates) {
+      if (!candidateLookup.has(candidate.word)) {
+        candidateLookup.set(candidate.word, []);
+      }
+      candidateLookup.get(candidate.word).push({ queryToken, candidate });
+    }
+  }
+  for (const entries of candidateLookup.values()) {
+    entries.sort((a, b) => {
+      if (a.candidate.type === "exact" && b.candidate.type !== "exact")
+        return -1;
+      if (b.candidate.type === "exact" && a.candidate.type !== "exact")
+        return 1;
+      return b.candidate.score - a.candidate.score;
+    });
+  }
   for (let i = 0; i < documentTokens.length; i++) {
     const docWord = documentTokens[i];
-    for (const [queryToken, candidates] of candidatesMap.entries()) {
-      for (const candidate of candidates) {
-        if (candidate.word === docWord) {
-          wordMatches.push({
-            word: docWord,
-            queryToken,
-            position: i,
-            type: candidate.type,
-            distance: candidate.distance,
-            score: candidate.score
-          });
-        }
+    const matches = candidateLookup.get(docWord);
+    if (matches) {
+      for (const { queryToken, candidate } of matches) {
+        wordMatches.push({
+          word: docWord,
+          queryToken,
+          position: i,
+          type: candidate.type,
+          distance: candidate.distance,
+          score: candidate.score
+        });
       }
     }
   }
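
The rewritten matching loop is an inversion of the data structure: instead of scanning every (queryToken, candidate) pair for each document token, the candidates are flattened once into `candidateLookup`, keyed by candidate word, so the inner loop becomes a single Map lookup per document token. With hypothetical candidates for the query "chat noir":

    // candidatesMap (keyed by query token):
    //   "chat" => [{ word: "chat", type: "exact", score: 1 },
    //              { word: "chats", type: "fuzzy", distance: 1, score: 0.8 }]
    //   "noir" => [{ word: "noir", type: "exact", score: 1 }]
    //
    // candidateLookup (keyed by document word, exact entries sorted first):
    //   "chat"  => [{ queryToken: "chat", candidate: { type: "exact", score: 1 } }]
    //   "chats" => [{ queryToken: "chat", candidate: { type: "fuzzy", score: 0.8 } }]
    //   "noir"  => [{ queryToken: "noir", candidate: { type: "exact", score: 1 } }]

The per-document-token cost drops from O(total candidates) to an amortized O(1) lookup, which matters when fuzzy matching expands each query token into many vocabulary words.
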
@@ -217,35 +231,52 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       phrases.push(phrase);
     }
   }
-  return deduplicatePhrases(phrases);
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
 function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
   const gapWords = [];
   let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
     const lastPos = phraseWords[phraseWords.length - 1].position;
+    if (match.position <= lastPos) {
+      continue;
+    }
     const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-    for (let pos = lastPos + 1; pos < match.position; pos++) {
-      totalGapUsed++;
-      gapWords.push({
-        word: documentTokens[pos],
-        position: pos,
-        gapIndex: totalGapUsed
-      });
+    if (totalGapUsed + gap > config.maxGap) {
+      break;
     }
-    if (!coveredTokens.has(match.queryToken)) {
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-      coveredTokens.add(match.queryToken);
-    }
-    if (coveredTokens.size === queryTokens.length) {
-      break;
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
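
The Set-to-Map change fixes repeated query tokens: `coveredTokens` could record each distinct token only once, so a query with duplicates could never reach full coverage. The hunk also adds a guard skipping matches at or before the last accepted position, charges gap words only for matches that are actually accepted, and enforces `maxGap` as a cumulative budget rather than per-gap only. A hypothetical example of the duplicate handling:

    // Query "petit à petit" normalizes to ["petit", "a", "petit"]:
    // queryTokenCounts: Map { "petit" => 2, "a" => 1 }
    //
    // Old: coveredTokens was a Set, so the second "petit" was ignored and
    // coveredTokens.size (at most 2) could never reach queryTokens.length (3).
    // New: matchedCounts lets "petit" be consumed twice, and the loop stops
    // once totalMatchedTokens reaches 3.
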
@@ -284,9 +315,12 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
-  const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
-  const proximityScore = Math.max(0, 1 - span / proximityWindow);
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
   let densityScore = 0;
   if (queryTokens.length === 1) {
     const totalOccurrences = allWordMatches.length;
@@ -304,8 +338,10 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   const weightedDensity = densityScore * weights.density;
   const weightedSemantic = semanticScore * weights.semantic;
   const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
-  const maxBaseWeight = Math.max(weights.exact, weights.fuzzy);
-  const maxPossibleScore = maxBaseWeight + weights.order + weights.proximity + weights.density + weights.semantic;
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
   const normalizedScore = totalScore / maxPossibleScore;
   const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
   const score = normalizedScore * coverageMultiplier;
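
Together, these two hunks keep `maxPossibleScore` honest: score components a given query can never earn (proximity for single-token queries, fuzzy credit at tolerance 0) no longer inflate the denominator. A worked example with illustrative weights (the plugin's actual defaults are not shown in this diff):

    // weights: { exact: 1.0, fuzzy: 0.8, order: 0.5, proximity: 0.3,
    //            density: 0.2, semantic: 0 }
    // Query: one exact token, tolerance 0.
    //
    // Old denominator: max(1.0, 0.8) + 0.5 + 0.3 + 0.2 + 0 = 2.0,
    //   even though proximity is meaningless for a single token.
    // New denominator: 1.0 + 0.5 + 0 + 0.2 + 0 = 1.7, so a perfect hit can
    //   actually reach normalizedScore 1. (The fuzzy exclusion only changes
    //   the result when weights.fuzzy > weights.exact.)
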
@@ -328,13 +364,20 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   };
 }
 function isInOrder(phraseWords, queryTokens) {
-  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
-  for (let i = 1; i < phraseWords.length; i++) {
-    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
-    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
-    if (currOrder < prevOrder) {
+  const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+  let lastMatchedIndex = -1;
+  for (const phraseWord of phraseWords) {
+    let foundIndex = -1;
+    for (const pos of tokenPositions) {
+      if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+        foundIndex = pos.index;
+        break;
+      }
+    }
+    if (foundIndex === -1) {
       return false;
     }
+    lastMatchedIndex = foundIndex;
   }
   return true;
 }
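
The old implementation built `tokenOrder` with `new Map(queryTokens.map(...))`, so a duplicated query token kept only its last index and could make a correctly ordered phrase look out of order. The rewrite greedily assigns each phrase word the earliest unused query position. For example:

    // Query tokens: ["temps", "en", "temps"]; phrase words appear in
    // document order as "temps", "en", "temps".
    //
    // Old: tokenOrder = Map { "temps" => 2, "en" => 1 } (index 0 was
    //   overwritten), so the pair ("temps" -> 2, "en" -> 1) read as out of
    //   order and the phrase took the 0.5 order penalty despite matching
    //   the query exactly.
    // New: greedy assignment yields positions 0, 1, 2 and isInOrder
    //   returns true.
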
@@ -375,9 +418,236 @@ function deduplicatePhrases(phrases) {
   return result.sort((a, b) => b.score - a.score);
 }
 
+// src/optimized.ts
+var DEFAULT_OPTIMIZED_CONFIG = {
+  maxQPSCandidates: 100,
+  // Limit phrase scoring to top 100 candidates
+  minQPSScore: 0.1,
+  // Include candidates with 10%+ of best score
+  qpsExact: false,
+  // Use fuzzy matching by default
+  qpsTolerance: 1
+  // Default tolerance of 1 edit distance
+};
+function normalizeText(text) {
+  return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
+}
+function tokenize(text) {
+  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
+  const tokens = tokenizer.tokenize(term, language);
+  if (tokens.length === 0) {
+    return [];
+  }
+  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
+  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
+  const boostPerProp = config.qpsBoostPerProp ?? {};
+  const resultMap = /* @__PURE__ */ new Map();
+  for (const prop of properties) {
+    const indexEntry = qpsIndex.indexes[prop];
+    if (!indexEntry || indexEntry.type !== "Radix") {
+      continue;
+    }
+    const radixNode = indexEntry.node;
+    const stats = qpsIndex.stats[prop];
+    if (!radixNode || !stats) {
+      continue;
+    }
+    const boost = boostPerProp[prop] ?? 1;
+    for (let i = 0; i < tokens.length; i++) {
+      const token = tokens[i];
+      const matches = radixNode.find({
+        term: token,
+        exact,
+        tolerance: exact ? 0 : tolerance
+      });
+      for (const [matchedWord, docIds] of Object.entries(matches)) {
+        if (!Array.isArray(docIds))
+          continue;
+        const isExactMatch = matchedWord === token;
+        for (const docId of docIds) {
+          const tokensLength = stats.tokensLength.get(docId) || 1;
+          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
+          const occurrences = quantum ? quantum >> 20 : 1;
+          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
+          if (!resultMap.has(docId)) {
+            resultMap.set(docId, [scoreContrib, 1 << i]);
+          } else {
+            const [prevScore, prevMask] = resultMap.get(docId);
+            const adjacencyBonus = countSetBits(prevMask >> 1 & 1 << i) * 2;
+            resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | 1 << i]);
+          }
+        }
+      }
+    }
+  }
+  const results = Array.from(resultMap.entries()).map(([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
+  return results;
+}
+function countSetBits(n) {
+  let count = 0;
+  while (n) {
+    count += n & 1;
+    n >>= 1;
+  }
+  return count;
+}
+async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
+  const startTime = performance.now();
+  const { term, properties, tokenCache } = params;
+  if (!term || typeof term !== "string") {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const textProperty = properties && properties[0] || pluginState.config.textProperty;
+  const searchProperties = properties || [textProperty];
+  const queryTokens = tokenize(term);
+  if (queryTokens.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
+  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
+  const qpsStartTime = performance.now();
+  const tokenizer = orama.tokenizer;
+  const qpsCandidates = searchQPS(
+    term,
+    qpsIndex,
+    tokenizer,
+    searchProperties,
+    config,
+    language
+  );
+  const qpsTime = performance.now() - qpsStartTime;
+  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
+  if (qpsCandidates.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
+  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
+  const bestScore = qpsCandidates[0][1];
+  const minScore = bestScore * minScoreRatio;
+  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
+  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
+  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));
+  let vocabulary = pluginState.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = pluginState.vocabulary;
+        console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
+      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+    }
+  }
+  const candidatesMap = findAllCandidates(
+    queryTokens,
+    vocabulary,
+    tolerance,
+    pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
+    pluginState.config.synonymMatchScore
+  );
+  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
+  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
+  const phraseStartTime = performance.now();
+  const documentMatches = [];
+  let docs = {};
+  if (orama.data?.docs?.docs) {
+    docs = orama.data.docs.docs;
+  }
+  let docsScored = 0;
+  for (const [docId, doc] of Object.entries(docs)) {
+    if (!candidateDocIds.has(docId)) {
+      continue;
+    }
+    docsScored++;
+    const text = doc[textProperty];
+    if (!text || typeof text !== "string") {
+      continue;
+    }
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      docTokens = tokenCache.get(docId);
+    } else {
+      docTokens = tokenize(text);
+    }
+    const phrases = findPhrasesInDocument(
+      docTokens,
+      filteredFuzzyCandidates,
+      {
+        weights: pluginState.config.weights,
+        maxGap: pluginState.config.maxGap,
+        proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
+        tolerance
+      },
+      pluginState.documentFrequency,
+      pluginState.totalDocuments,
+      queryTokens
+    );
+    if (phrases.length > 0) {
+      const docScore = Math.max(...phrases.map((p) => p.score));
+      documentMatches.push({
+        id: docId,
+        phrases,
+        score: docScore,
+        document: doc
+      });
+    }
+  }
+  const phraseTime = performance.now() - phraseStartTime;
+  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
+  documentMatches.sort((a, b) => b.score - a.score);
+  let finalMatches = documentMatches;
+  if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
+    const threshold = pluginState.config.finalScoreMinimum;
+    const beforeCount = finalMatches.length;
+    finalMatches = finalMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? finalMatches.length;
+  const limitedMatches = finalMatches.slice(0, limit);
+  const hits = limitedMatches.map((match) => ({
+    id: match.id,
+    score: match.score,
+    document: match.document,
+    _phrases: match.phrases
+  }));
+  const elapsed = performance.now() - startTime;
+  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
+  return {
+    elapsed: {
+      formatted: `${elapsed.toFixed(2)}ms`,
+      raw: Math.floor(elapsed * 1e6),
+      qpsTime,
+      phraseTime
+    },
+    hits,
+    count: hits.length
+  };
+}
+function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
+  return async (params, language = "french") => {
+    return searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
+  };
+}
+
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "content",
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
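
In `searchQPS`, each document accumulates a `[score, mask]` pair where bit i of the mask records that query token i matched; `(prevMask >> 1) & (1 << i)` is nonzero exactly when bit i + 1 is already set, i.e. the document previously matched the next query position (possible when several properties are searched), and each such adjacency earns a +2 bonus. The occurrence count is unpacked from the high bits of the token quantum via `quantum >> 20`. A worked bit-level example for a three-token query:

    // Suppose doc 7 already matched tokens 1 and 2 via another property:
    // prevMask = 0b110.
    //
    // Current token i = 0: (0b110 >> 1) & (1 << 0) = 0b011 & 0b001 = 0b001
    //   -> countSetBits(...) = 1 -> adjacencyBonus = 2
    // Current token i = 1: (0b110 >> 1) & (1 << 1) = 0b011 & 0b010 = 0b010
    //   -> adjacencyBonus = 2
    //
    // Afterwards the mask is 0b111: all three query positions matched.
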
@@ -393,6 +663,8 @@ var DEFAULT_CONFIG = {
   },
   maxGap: 5,
   minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
   proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
@@ -414,6 +686,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
     },
     maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
     minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+    enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+    finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
     proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
   };
   const plugin = {
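
The two new options flow straight through from `userConfig` into the merged config. A hypothetical registration sketch (the schema and option values are illustrative; passing plugins at `create` time follows Orama's plugin convention):

    import { create } from "@orama/orama";
    import { pluginFuzzyPhrase } from "@wcs-colab/plugin-fuzzy-phrase";

    const db = await create({
      schema: { normalized_content: "string" },
      plugins: [
        pluginFuzzyPhrase({
          textProperty: "normalized_content",
          enableFinalScoreMinimum: true, // opt in to the post-scoring cutoff
          finalScoreMinimum: 0.3 // drop documents scoring below 0.3
        })
      ]
    });
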
@@ -427,7 +701,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
     synonymMap: {},
     config,
     documentFrequency: /* @__PURE__ */ new Map(),
-    totalDocuments: 0
+    totalDocuments: 0,
+    vocabulary: /* @__PURE__ */ new Set()
   };
   if (config.enableSynonyms && config.supabase) {
     try {
@@ -444,6 +719,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
     state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
     console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
   }
+  try {
+    const indexData = orama.data?.index;
+    let radixNode = null;
+    if (indexData?.indexes?.[config.textProperty]?.node) {
+      radixNode = indexData.indexes[config.textProperty].node;
+    } else if (indexData?.[config.textProperty]?.node) {
+      radixNode = indexData[config.textProperty].node;
+    }
+    if (radixNode) {
+      state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+      console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+    } else {
+      console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
+    }
+  } catch (error) {
+    console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
+  }
   pluginStates.set(orama, state);
   console.log("\u2705 Fuzzy Phrase Plugin initialized");
   setImmediate(() => {
@@ -465,43 +757,43 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     console.error("\u274C Plugin state not initialized");
     throw new Error("Fuzzy Phrase Plugin not properly initialized");
   }
-  const { term, properties } = params;
+  const { term, properties, tokenCache, candidateIds } = params;
+  const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
   if (!term || typeof term !== "string") {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
   const textProperty = properties && properties[0] || state.config.textProperty;
-  const queryTokens = tokenize(term);
+  const queryTokens = tokenize2(term);
   if (queryTokens.length === 0) {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
   const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
   console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
-  let vocabulary;
-  try {
-    const indexData = orama.data?.index;
-    if (!indexData) {
-      console.error("\u274C No index data found in orama.data.index");
-      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
-    }
-    console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
-    let radixNode = null;
-    if (indexData.indexes?.[textProperty]?.node) {
-      radixNode = indexData.indexes[textProperty].node;
-      console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
-    } else if (indexData[textProperty]?.node) {
-      radixNode = indexData[textProperty].node;
-      console.log("\u2705 Found radix via standard path (data.index[property])");
-    }
-    if (!radixNode) {
-      console.error("\u274C Radix tree not found for property:", textProperty);
-      console.error(" Available properties in index:", Object.keys(indexData));
+  let vocabulary = state.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = state.vocabulary;
+        console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
       return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
     }
-    vocabulary = extractVocabularyFromRadixTree(radixNode);
-    console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
-  } catch (error) {
-    console.error("\u274C Failed to extract vocabulary:", error);
-    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  } else {
+    console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
   }
   const candidatesMap = findAllCandidates(
     queryTokens,
@@ -510,10 +802,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     state.config.enableSynonyms ? state.synonymMap : void 0,
     state.config.synonymMatchScore
   );
-  const filteredCandidates = filterCandidatesByScore(
-    candidatesMap,
-    state.config.minScore
-  );
+  const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
   const documentMatches = [];
   console.log("\u{1F50D} DEBUG orama.data structure:", {
@@ -540,23 +829,39 @@ async function searchWithFuzzyPhrase(orama, params, language) {
       dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
     });
   }
-  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
+  const cacheHits = tokenCache ? tokenCache.size : 0;
+  const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
+  console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
   for (const [docId, doc] of Object.entries(docs)) {
+    if (candidateIdSet) {
+      const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
+      if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
+        continue;
+      }
+    }
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
       continue;
    }
-    const docTokens = tokenize(text);
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      docTokens = tokenCache.get(docId);
+    } else {
+      docTokens = tokenize2(text);
+    }
     const phrases = findPhrasesInDocument(
       docTokens,
       filteredCandidates,
       {
         weights: state.config.weights,
         maxGap: state.config.maxGap,
-        proximitySpanMultiplier: state.config.proximitySpanMultiplier
+        proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+        tolerance
       },
       state.documentFrequency,
-      state.totalDocuments
+      state.totalDocuments,
+      queryTokens
+      // Original tokens with duplicates preserved
     );
     if (phrases.length > 0) {
       const docScore = Math.max(...phrases.map((p) => p.score));
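
`searchWithFuzzyPhrase` now reads two optional params: `tokenCache`, a Map from doc id to pre-tokenized content that skips re-tokenizing document text on every query, and `candidateIds`, a Set or array that restricts phrase scoring to a pre-filtered subset, matched against either the document's own `id` field or the internal id. A hypothetical call (ids and texts are illustrative; `tokenizeOptimized` is the exported tokenizer, which applies the same normalization as the internal `tokenize2`):

    import { searchWithFuzzyPhrase, tokenizeOptimized } from "@wcs-colab/plugin-fuzzy-phrase";

    // Pre-tokenize once and reuse the cache across queries.
    const tokenCache = new Map([["42", tokenizeOptimized("le chat noir dort")]]);

    const results = await searchWithFuzzyPhrase(db, {
      term: "chat noir",
      tokenCache,
      candidateIds: ["42", "57"], // only these documents are phrase-scored
      limit: 10
    });
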
@@ -569,8 +874,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     }
   }
   documentMatches.sort((a, b) => b.score - a.score);
-  const limit = params.limit ?? documentMatches.length;
-  const limitedMatches = documentMatches.slice(0, limit);
+  let filteredMatches = documentMatches;
+  if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+    const threshold = state.config.finalScoreMinimum;
+    const beforeCount = filteredMatches.length;
+    filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? filteredMatches.length;
+  const limitedMatches = filteredMatches.slice(0, limit);
   const hits = limitedMatches.map((match) => ({
     id: match.id,
     score: match.score,
@@ -621,20 +933,23 @@ function calculateDocumentFrequencies(docs, textProperty) {
     if (!text || typeof text !== "string") {
       continue;
     }
-    const words = new Set(tokenize(text));
+    const words = new Set(tokenize2(text));
     for (const word of words) {
      df.set(word, (df.get(word) || 0) + 1);
     }
   }
   return df;
 }
-function normalizeText(text) {
+function normalizeText2(text) {
   return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
 }
-function tokenize(text) {
-  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+function tokenize2(text) {
+  return normalizeText2(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function getPluginState(orama) {
+  return pluginStates.get(orama);
 }
 
-export { pluginFuzzyPhrase, searchWithFuzzyPhrase };
+export { createOptimizedSearch, getPluginState, normalizeText as normalizeTextOptimized, pluginFuzzyPhrase, searchWithFuzzyPhrase, searchWithQPSPruning, tokenize as tokenizeOptimized };
 //# sourceMappingURL=out.js.map
 //# sourceMappingURL=index.js.map
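
The expanded export list wires up the QPS-pruned path: `getPluginState` exposes the plugin state registered at init, and `createOptimizedSearch` binds an Orama instance, a QPS index, and that state into a reusable search function. A hypothetical wiring sketch; `qpsIndex` stands for whatever structure the host application maintains with the `{ indexes, stats }` shape that `searchQPS` reads (Radix nodes plus `tokensLength`/`tokenQuantums` stats):

    import { createOptimizedSearch, getPluginState } from "@wcs-colab/plugin-fuzzy-phrase";

    const pluginState = getPluginState(db); // state registered by pluginFuzzyPhrase
    const search = createOptimizedSearch(db, qpsIndex, pluginState, {
      maxQPSCandidates: 100, // phrase-score at most the top 100 QPS hits
      minQPSScore: 0.1 // keep candidates within 10% of the best QPS score
    });

    const { hits, elapsed } = await search({ term: "chat noir", limit: 10 }, "french");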