@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -48,9 +48,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
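Note: the removed branch gave every prefix match a near-perfect 0.95 score regardless of how much longer the word was; matching now falls through to the bounded Levenshtein check, so a query token only matches a longer word when the full edit distance stays within tolerance. A minimal sketch of the behavioral difference, with hypothetical inputs (boundedLevenshtein is assumed to return { isBounded, distance } as the surrounding code implies):

// Before: fuzzyMatch("chateaux", "chat", 1) → { matches: true, score: 0.95 } via the prefix branch.
// After:  "chateaux" is 4 edits away from "chat" → not bounded at tolerance 1 → no match.
// Words genuinely within tolerance still match and are scored by distance:
// fuzzyMatch("chats", "chat", 1) → distance 1 → { matches: true, score: 1 - 1 * 0.2 } = 0.8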
@@ -180,24 +177,51 @@ function filterCandidatesByScore(candidatesMap, minScore) {
 }
 
 // src/scoring.ts
-function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+function buildCandidateLookup(candidatesMap) {
+  const candidateLookup = /* @__PURE__ */ new Map();
+  for (const [queryToken, candidates] of candidatesMap.entries()) {
+    for (const candidate of candidates) {
+      if (!candidateLookup.has(candidate.word)) {
+        candidateLookup.set(candidate.word, []);
+      }
+      candidateLookup.get(candidate.word).push({ queryToken, candidate });
+    }
+  }
+  for (const entries of candidateLookup.values()) {
+    entries.sort((a, b) => {
+      if (a.candidate.type === "exact" && b.candidate.type !== "exact")
+        return -1;
+      if (b.candidate.type === "exact" && a.candidate.type !== "exact")
+        return 1;
+      return b.candidate.score - a.candidate.score;
+    });
+  }
+  return candidateLookup;
+}
+function buildQueryTokenCounts(queryTokens) {
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  return queryTokenCounts;
+}
+function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, candidateLookup, queryTokenCounts) {
   const phrases = [];
-  const queryTokens = Array.from(candidatesMap.keys());
+  const queryTokens = originalQueryTokens;
   const wordMatches = [];
   for (let i = 0; i < documentTokens.length; i++) {
     const docWord = documentTokens[i];
-    for (const [queryToken, candidates] of candidatesMap.entries()) {
-      for (const candidate of candidates) {
-        if (candidate.word === docWord) {
-          wordMatches.push({
-            word: docWord,
-            queryToken,
-            position: i,
-            type: candidate.type,
-            distance: candidate.distance,
-            score: candidate.score
-          });
-        }
+    const matches = candidateLookup.get(docWord);
+    if (matches) {
+      for (const { queryToken, candidate } of matches) {
+        wordMatches.push({
+          word: docWord,
+          queryToken,
+          position: i,
+          type: candidate.type,
+          distance: candidate.distance,
+          score: candidate.score
+        });
       }
     }
   }
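Note: buildCandidateLookup inverts the query-token → candidates map into a word-keyed map, so the per-document scan does one Map.get per document token instead of looping over every candidate of every query token; exact matches are sorted ahead of fuzzy ones per word. A sketch of the resulting shape for a hypothetical query ["chat", "noir"] with tolerance 1:

// candidatesMap (query token → candidates):
//   "chat" → [{ word: "chat", type: "exact", score: 1 }, { word: "chats", type: "fuzzy", score: 0.8 }]
//   "noir" → [{ word: "noir", type: "exact", score: 1 }]
// candidateLookup (document word → matching query tokens):
//   "chat"  → [{ queryToken: "chat", candidate: { word: "chat", type: "exact", score: 1 } }]
//   "chats" → [{ queryToken: "chat", candidate: { word: "chats", type: "fuzzy", score: 0.8 } }]
//   "noir"  → [{ queryToken: "noir", candidate: { word: "noir", type: "exact", score: 1 } }]
// Scanning a document drops from O(tokens × query tokens × candidates) comparisons
// to O(tokens) map lookups.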
@@ -210,42 +234,56 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       documentFrequency,
       totalDocuments,
       wordMatches,
-      documentTokens
-      // Pass document tokens to extract gap words
+      documentTokens,
+      queryTokenCounts
+      // OPTIMIZATION B: Pass pre-built queryTokenCounts
     );
     if (phrase && phrase.words.length > 0) {
      phrases.push(phrase);
    }
  }
-  return deduplicatePhrases(phrases);
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
-function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
+function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens, queryTokenCounts) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
   const gapWords = [];
   let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
     const lastPos = phraseWords[phraseWords.length - 1].position;
+    if (match.position <= lastPos) {
+      continue;
+    }
     const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-    for (let pos = lastPos + 1; pos < match.position; pos++) {
-      totalGapUsed++;
-      gapWords.push({
-        word: documentTokens[pos],
-        position: pos,
-        gapIndex: totalGapUsed
-      });
+    if (totalGapUsed + gap > config.maxGap) {
+      break;
     }
-    if (!coveredTokens.has(match.queryToken)) {
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-      coveredTokens.add(match.queryToken);
-    }
-    if (coveredTokens.size === queryTokens.length) {
-      break;
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
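Note: replacing the coveredTokens Set with per-token counts lets a phrase consume a query token as many times as it appears in the query, and gap words are now only charged against the gap budget when a match is actually accepted. A hypothetical query illustrating the difference:

// Query: "new york new year" → buildQueryTokenCounts → Map { "new" → 2, "york" → 1, "year" → 1 }
// Old Set-based logic: a second document occurrence of "new" was skipped ("new" already
// covered), so the full four-token phrase could never be assembled.
// New logic: matchedCounts lets "new" match twice; the phrase completes when
// totalMatchedTokens === queryTokens.length (4), provided totalGapUsed stays within config.maxGap.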
@@ -284,9 +322,12 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
-  const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
-  const proximityScore = Math.max(0, 1 - span / proximityWindow);
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
   let densityScore = 0;
   if (queryTokens.length === 1) {
     const totalOccurrences = allWordMatches.length;
@@ -304,8 +345,10 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   const weightedDensity = densityScore * weights.density;
   const weightedSemantic = semanticScore * weights.semantic;
   const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
-  const maxBaseWeight = Math.max(weights.exact, weights.fuzzy);
-  const maxPossibleScore = maxBaseWeight + weights.order + weights.proximity + weights.density + weights.semantic;
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
   const normalizedScore = totalScore / maxPossibleScore;
   const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
   const score = normalizedScore * coverageMultiplier;
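Note: these two changes keep the numerator and denominator of the normalization consistent: a component that cannot contribute to totalScore (proximity when maxGap is 0 or the query has one token, fuzzy weight when tolerance is 0) is now also excluded from maxPossibleScore, so such configurations are no longer penalized. A worked example with hypothetical weights:

// Assume weights = { exact: 1, fuzzy: 0.8, order: 0.5, proximity: 0.5, density: 0.5, semantic: 0 },
// a single-token query, and tolerance 0.
// maxBaseWeight: fuzzy matches are impossible at tolerance 0, so weights.exact (1) is used.
// effectiveProximityWeight: 0, because queryTokens.length === 1 forces proximityScore to 0.
// maxPossibleScore = 1 + 0.5 + 0 + 0.5 + 0 = 2.0, instead of the old
// 1 + 0.5 + 0.5 + 0.5 + 0 = 2.5 — a ceiling such a query could never actually reach.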
@@ -328,13 +371,20 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   };
 }
 function isInOrder(phraseWords, queryTokens) {
-  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
-  for (let i = 1; i < phraseWords.length; i++) {
-    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
-    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
-    if (currOrder < prevOrder) {
+  const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+  let lastMatchedIndex = -1;
+  for (const phraseWord of phraseWords) {
+    let foundIndex = -1;
+    for (const pos of tokenPositions) {
+      if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+        foundIndex = pos.index;
+        break;
+      }
+    }
+    if (foundIndex === -1) {
       return false;
     }
+    lastMatchedIndex = foundIndex;
   }
   return true;
 }
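Note: the rewritten isInOrder checks that the matched words form a subsequence of the query, greedily consuming the earliest unused occurrence of each token; the old Map collapsed duplicate query tokens to a single (last) index, which mis-ordered queries with repeated words. A hypothetical trace:

// Query: ["new", "york", "new", "year"] → tokenPositions with indices 0..3.
// Phrase words in document order: "new", "york", "new", "year".
// Greedy scan: "new" → 0, "york" → 1, "new" → 2 (> lastMatchedIndex 1), "year" → 3 → in order.
// Old version: the Map kept only index 2 for "new", so the pair ("new"@2, "york"@1)
// compared as out of order even though the phrase follows the query exactly.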
@@ -375,9 +425,259 @@ function deduplicatePhrases(phrases) {
   return result.sort((a, b) => b.score - a.score);
 }
 
+// src/optimized.ts
+var DEFAULT_OPTIMIZED_CONFIG = {
+  maxQPSCandidates: 100,
+  // Limit phrase scoring to top 100 candidates
+  minQPSScore: 0.1,
+  // Include candidates with 10%+ of best score
+  qpsExact: false,
+  // Use fuzzy matching by default
+  qpsTolerance: 1
+  // Default tolerance of 1 edit distance
+};
+function normalizeText(text) {
+  return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
+}
+function tokenize(text) {
+  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function buildReducedVocabularyFromDocs(candidateDocIds, docs) {
+  const reducedVocab = /* @__PURE__ */ new Set();
+  for (const docId of candidateDocIds) {
+    const doc = docs[docId];
+    if (!doc?.normalized_content)
+      continue;
+    const tokens = doc.normalized_content.split(/\s+/).filter((token) => token.length > 0);
+    for (const token of tokens) {
+      reducedVocab.add(token);
+    }
+  }
+  return reducedVocab;
+}
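// Note: this copy of normalizeText/tokenize lowercases, strips diacritics via NFD
// decomposition, removes French elisions (l', d', c', j', m', n', s', t') and
// punctuation, then collapses whitespace. A hypothetical input/output pair:
//   tokenize("L'église, c'était GRAND !")
//   1. lowercase + NFD accent strip → "l'eglise, c'etait grand !"
//   2. elision rule → " eglise,  etait grand !"
//   3. punctuation to spaces, whitespace collapsed → "eglise etait grand"
//   → ["eglise", "etait", "grand"]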
+function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
+  const tokens = tokenizer.tokenize(term, language);
+  if (tokens.length === 0) {
+    return [];
+  }
+  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
+  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
+  const boostPerProp = config.qpsBoostPerProp ?? {};
+  const resultMap = /* @__PURE__ */ new Map();
+  for (const prop of properties) {
+    const indexEntry = qpsIndex.indexes[prop];
+    if (!indexEntry || indexEntry.type !== "Radix") {
+      continue;
+    }
+    const radixNode = indexEntry.node;
+    const stats = qpsIndex.stats[prop];
+    if (!radixNode || !stats) {
+      continue;
+    }
+    const boost = boostPerProp[prop] ?? 1;
+    for (let i = 0; i < tokens.length; i++) {
+      const token = tokens[i];
+      const matches = radixNode.find({
+        term: token,
+        exact,
+        tolerance: exact ? 0 : tolerance
+      });
+      for (const [matchedWord, docIds] of Object.entries(matches)) {
+        if (!Array.isArray(docIds))
+          continue;
+        const isExactMatch = matchedWord === token;
+        for (const docId of docIds) {
+          const tokensLength = stats.tokensLength.get(docId) || 1;
+          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
+          const occurrences = quantum ? quantum >> 20 : 1;
+          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
+          if (!resultMap.has(docId)) {
+            resultMap.set(docId, [scoreContrib, 1 << i]);
+          } else {
+            const [prevScore, prevMask] = resultMap.get(docId);
+            const adjacencyBonus = countSetBits(prevMask >> 1 & 1 << i) * 2;
+            resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | 1 << i]);
+          }
+        }
+      }
+    }
+  }
+  const results = Array.from(resultMap.entries()).map(([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
+  return results;
+}
+function countSetBits(n) {
+  let count = 0;
+  while (n) {
+    count += n & 1;
+    n >>= 1;
+  }
+  return count;
+}
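// Note: searchQPS keeps, per document, an accumulated score plus a bitmask in which
// bit i means "query token i matched this document". countSetBits(prevMask >> 1 & 1 << i)
// is 1 exactly when bit i + 1 is already set, which triggers the adjacency bonus.
// A worked bit-level example with hypothetical masks:
//   prevMask = 0b110 (tokens 1 and 2 already matched); token i = 0 now matches:
//     prevMask >> 1 = 0b011;  1 << i = 0b001;  AND = 0b001 → countSetBits = 1 → bonus = 2
//     updated mask: prevMask | 1 << i = 0b111
//   A match whose neighbouring bit is unset earns no bonus.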
+async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
+  const startTime = performance.now();
+  const { term, properties, tokenCache } = params;
+  if (!term || typeof term !== "string") {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const textProperty = properties && properties[0] || pluginState.config.textProperty;
+  const searchProperties = properties || [textProperty];
+  const queryTokens = tokenize(term);
+  if (queryTokens.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
+  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
+  const qpsStartTime = performance.now();
+  const tokenizer = orama.tokenizer;
+  const qpsCandidates = searchQPS(
+    term,
+    qpsIndex,
+    tokenizer,
+    searchProperties,
+    config,
+    language
+  );
+  const qpsTime = performance.now() - qpsStartTime;
+  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
+  if (qpsCandidates.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
+  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
+  const bestScore = qpsCandidates[0][1];
+  const minScore = bestScore * minScoreRatio;
+  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
+  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
+  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));
+  let vocabulary = pluginState.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = pluginState.vocabulary;
+        console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
+      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+    }
+  }
+  let docs = {};
+  if (orama.data?.docs?.docs) {
+    docs = orama.data.docs.docs;
+  }
+  const vocabStartTime = performance.now();
+  const reducedVocabulary = buildReducedVocabularyFromDocs(candidateDocIds, docs);
+  const vocabTime = performance.now() - vocabStartTime;
+  console.log(`\u{1F4DA} Reduced vocabulary: ${reducedVocabulary.size} words (full: ${vocabulary.size}, reduction: ${(100 * (1 - reducedVocabulary.size / vocabulary.size)).toFixed(1)}%, built in ${vocabTime.toFixed(2)}ms)`);
+  const candidatesMap = findAllCandidates(
+    queryTokens,
+    reducedVocabulary,
+    tolerance,
+    pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
+    pluginState.config.synonymMatchScore
+  );
+  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
+  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
+  const phraseStartTime = performance.now();
+  const candidateLookup = buildCandidateLookup(filteredFuzzyCandidates);
+  const queryTokenCounts = buildQueryTokenCounts(queryTokens);
+  const documentMatches = [];
+  let docsScored = 0;
+  for (const [docId, doc] of Object.entries(docs)) {
+    if (!candidateDocIds.has(docId)) {
+      continue;
+    }
+    docsScored++;
+    const text = doc[textProperty];
+    if (!text || typeof text !== "string") {
+      continue;
+    }
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      docTokens = tokenCache.get(docId);
+    } else {
+      docTokens = text.split(/\s+/).filter((token) => token.length > 0);
+    }
+    const phrases = findPhrasesInDocument(
+      docTokens,
+      filteredFuzzyCandidates,
+      {
+        weights: pluginState.config.weights,
+        maxGap: pluginState.config.maxGap,
+        proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
+        tolerance
+      },
+      pluginState.documentFrequency,
+      pluginState.totalDocuments,
+      queryTokens,
+      candidateLookup,
+      // PHASE 1 OPTIMIZATION A: Pre-built candidate lookup
+      queryTokenCounts
+      // PHASE 1 OPTIMIZATION B: Pre-built query token counts
+    );
+    if (phrases.length > 0) {
+      const docScore = Math.max(...phrases.map((p) => p.score));
+      documentMatches.push({
+        id: docId,
+        phrases,
+        score: docScore,
+        document: doc
+      });
+    }
+  }
+  const phraseTime = performance.now() - phraseStartTime;
+  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
+  documentMatches.sort((a, b) => b.score - a.score);
+  let finalMatches = documentMatches;
+  if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
+    const threshold = pluginState.config.finalScoreMinimum;
+    const beforeCount = finalMatches.length;
+    finalMatches = finalMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? finalMatches.length;
+  const limitedMatches = finalMatches.slice(0, limit);
+  const hits = limitedMatches.map((match) => ({
+    id: match.id,
+    score: match.score,
+    document: match.document,
+    _phrases: match.phrases
+  }));
+  const elapsed = performance.now() - startTime;
+  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
+  return {
+    elapsed: {
+      formatted: `${elapsed.toFixed(2)}ms`,
+      raw: Math.floor(elapsed * 1e6),
+      qpsTime,
+      phraseTime
+    },
+    hits,
+    count: hits.length
+  };
+}
+function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
+  return async (params, language = "french") => {
+    return searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
+  };
+}
+
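// Note: createOptimizedSearch closes over the Orama instance, a QPS index, and the plugin
// state, returning an async function with the same result shape as searchWithFuzzyPhrase.
// A hypothetical wiring sketch (the qpsIndex object must expose indexes[prop].node and
// stats[prop] as read by searchQPS above; getPluginState is exported at the end of this file):
//   const search = createOptimizedSearch(orama, qpsIndex, getPluginState(orama), {
//     maxQPSCandidates: 50, // prune harder than the default 100
//     minQPSScore: 0.2      // keep only candidates within 20% of the best QPS score
//   });
//   const results = await search({ term: "recherche floue", limit: 10 }, "french");
//   // results.elapsed.qpsTime and results.elapsed.phraseTime break down the two stages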
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "content",
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
@@ -393,6 +693,8 @@ var DEFAULT_CONFIG = {
   },
   maxGap: 5,
   minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
   proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
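Note: the two new options gate a post-scoring cutoff: when enableFinalScoreMinimum is true, document matches scoring below finalScoreMinimum are dropped before the limit is applied (see the filter blocks added to both search paths in this diff). A hypothetical configuration sketch:

// const plugin = pluginFuzzyPhrase({
//   textProperty: "normalized_content",
//   enableFinalScoreMinimum: true,
//   finalScoreMinimum: 0.3 // drop matches below 0.3 after normalization
// });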
@@ -414,6 +716,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
    },
    maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+   enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+   finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
    proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
  };
  const plugin = {
@@ -427,7 +731,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
      synonymMap: {},
      config,
      documentFrequency: /* @__PURE__ */ new Map(),
-     totalDocuments: 0
+     totalDocuments: 0,
+     vocabulary: /* @__PURE__ */ new Set()
    };
    if (config.enableSynonyms && config.supabase) {
      try {
@@ -444,6 +749,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
      state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
      console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
    }
+   try {
+     const indexData = orama.data?.index;
+     let radixNode = null;
+     if (indexData?.indexes?.[config.textProperty]?.node) {
+       radixNode = indexData.indexes[config.textProperty].node;
+     } else if (indexData?.[config.textProperty]?.node) {
+       radixNode = indexData[config.textProperty].node;
+     }
+     if (radixNode) {
+       state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+       console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+     } else {
+       console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
+     }
+   } catch (error) {
+     console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
+   }
    pluginStates.set(orama, state);
    console.log("\u2705 Fuzzy Phrase Plugin initialized");
    setImmediate(() => {
@@ -465,43 +787,43 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     console.error("\u274C Plugin state not initialized");
     throw new Error("Fuzzy Phrase Plugin not properly initialized");
   }
-  const { term, properties } = params;
+  const { term, properties, tokenCache, candidateIds } = params;
+  const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
   if (!term || typeof term !== "string") {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
   const textProperty = properties && properties[0] || state.config.textProperty;
-  const queryTokens = tokenize(term);
+  const queryTokens = tokenize2(term);
   if (queryTokens.length === 0) {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
   const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
   console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
-  let vocabulary;
-  try {
-    const indexData = orama.data?.index;
-    if (!indexData) {
-      console.error("\u274C No index data found in orama.data.index");
-      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
-    }
-    console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
-    let radixNode = null;
-    if (indexData.indexes?.[textProperty]?.node) {
-      radixNode = indexData.indexes[textProperty].node;
-      console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
-    } else if (indexData[textProperty]?.node) {
-      radixNode = indexData[textProperty].node;
-      console.log("\u2705 Found radix via standard path (data.index[property])");
-    }
-    if (!radixNode) {
-      console.error("\u274C Radix tree not found for property:", textProperty);
-      console.error(" Available properties in index:", Object.keys(indexData));
+  let vocabulary = state.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = state.vocabulary;
+        console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
    }
-    vocabulary = extractVocabularyFromRadixTree(radixNode);
-    console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
-  } catch (error) {
-    console.error("\u274C Failed to extract vocabulary:", error);
-    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  } else {
+    console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
  }
  const candidatesMap = findAllCandidates(
    queryTokens,
@@ -510,11 +832,10 @@ async function searchWithFuzzyPhrase(orama, params, language) {
    state.config.enableSynonyms ? state.synonymMap : void 0,
    state.config.synonymMatchScore
  );
- const filteredCandidates = filterCandidatesByScore(
-   candidatesMap,
-   state.config.minScore
- );
+ const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
  console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
+ const candidateLookup = buildCandidateLookup(filteredCandidates);
+ const queryTokenCounts = buildQueryTokenCounts(queryTokens);
  const documentMatches = [];
  console.log("\u{1F50D} DEBUG orama.data structure:", {
    dataKeys: Object.keys(orama.data || {}),
@@ -540,23 +861,42 @@ async function searchWithFuzzyPhrase(orama, params, language) {
      dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
    });
  }
- console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
+ const cacheHits = tokenCache ? tokenCache.size : 0;
+ const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
+ console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
  for (const [docId, doc] of Object.entries(docs)) {
+   if (candidateIdSet) {
+     const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
+     if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
+       continue;
+     }
+   }
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
-   const docTokens = tokenize(text);
+   let docTokens;
+   if (tokenCache && tokenCache.has(docId)) {
+     docTokens = tokenCache.get(docId);
+   } else {
+     docTokens = tokenize2(text);
+   }
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredCandidates,
      {
        weights: state.config.weights,
        maxGap: state.config.maxGap,
-       proximitySpanMultiplier: state.config.proximitySpanMultiplier
+       proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+       tolerance
      },
      state.documentFrequency,
-     state.totalDocuments
+     state.totalDocuments,
+     queryTokens,
+     candidateLookup,
+     // PHASE 1 OPTIMIZATION A: Pre-built candidate lookup
+     queryTokenCounts
+     // PHASE 1 OPTIMIZATION B: Pre-built query token counts
    );
    if (phrases.length > 0) {
      const docScore = Math.max(...phrases.map((p) => p.score));
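Note: both new params are optional fast paths supplied by the caller: tokenCache maps an internal document id to pre-tokenized content, and candidateIds (a Set or array, checked against the stored doc.id and the internal id) restricts scoring to a pre-filtered subset. A hypothetical call sketch, built once at startup:

// const tokenCache = new Map(); // internal docId → string[] of tokens
// for (const [docId, doc] of Object.entries(orama.data.docs.docs)) {
//   tokenCache.set(docId, tokenizeOptimized(doc.normalized_content));
// }
// await searchWithFuzzyPhrase(orama, {
//   term: "chat noir",
//   tokenCache,
//   candidateIds: new Set(["42", "57"]) // only score these documents
// });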
@@ -569,8 +909,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
    }
  }
  documentMatches.sort((a, b) => b.score - a.score);
- const limit = params.limit ?? documentMatches.length;
- const limitedMatches = documentMatches.slice(0, limit);
+ let filteredMatches = documentMatches;
+ if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+   const threshold = state.config.finalScoreMinimum;
+   const beforeCount = filteredMatches.length;
+   filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+   console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+ }
+ const limit = params.limit ?? filteredMatches.length;
+ const limitedMatches = filteredMatches.slice(0, limit);
  const hits = limitedMatches.map((match) => ({
    id: match.id,
    score: match.score,
@@ -621,20 +968,23 @@ function calculateDocumentFrequencies(docs, textProperty) {
    if (!text || typeof text !== "string") {
      continue;
    }
-   const words = new Set(tokenize(text));
+   const words = new Set(tokenize2(text));
    for (const word of words) {
      df.set(word, (df.get(word) || 0) + 1);
    }
  }
  return df;
 }
-function normalizeText(text) {
+function normalizeText2(text) {
   return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
 }
-function tokenize(text) {
-  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+function tokenize2(text) {
+  return normalizeText2(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function getPluginState(orama) {
+  return pluginStates.get(orama);
 }
 
-export { pluginFuzzyPhrase, searchWithFuzzyPhrase };
+export { createOptimizedSearch, getPluginState, normalizeText as normalizeTextOptimized, pluginFuzzyPhrase, searchWithFuzzyPhrase, searchWithQPSPruning, tokenize as tokenizeOptimized };
 //# sourceMappingURL=out.js.map
 //# sourceMappingURL=index.js.map
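Note: the export list grows from two symbols to seven; the duplicated normalizeText/tokenize pair from optimized.ts is published under *Optimized aliases so it does not collide with the index.ts copies (renamed normalizeText2/tokenize2 internally by the bundler). A hypothetical import of the new surface:

// import {
//   pluginFuzzyPhrase, searchWithFuzzyPhrase,     // existing API
//   createOptimizedSearch, searchWithQPSPruning,  // QPS-pruned path
//   getPluginState, normalizeTextOptimized, tokenizeOptimized
// } from "@wcs-colab/plugin-fuzzy-phrase";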