@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.32

This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
package/dist/index.cjs CHANGED
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
  if (word === queryToken) {
  return { matches: true, distance: 0, score: 1 };
  }
- if (word.startsWith(queryToken)) {
- return { matches: true, distance: 0, score: 0.95 };
- }
  const result = boundedLevenshtein(word, queryToken, tolerance);
  if (result.isBounded) {
  const score = 1 - result.distance * 0.2;
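
With the startsWith shortcut removed, prefix matches no longer get a flat 0.95; they are scored by bounded edit distance like any other fuzzy match. A minimal sketch of the resulting behavior (scoreAfterChange is an illustrative name; boundedLevenshtein is the helper already used above):

    function scoreAfterChange(word, queryToken, tolerance) {
      if (word === queryToken) return 1; // exact match still short-circuits
      // prefix matches now fall through to edit distance like everything else
      const result = boundedLevenshtein(word, queryToken, tolerance);
      return result.isBounded ? 1 - result.distance * 0.2 : 0;
    }
    // "cats" vs query "cat" with tolerance 1: distance 1 -> score 0.8 (was 0.95 via the shortcut)
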
@@ -182,24 +179,51 @@ function filterCandidatesByScore(candidatesMap, minScore) {
  }
 
  // src/scoring.ts
- function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+ function buildCandidateLookup(candidatesMap) {
+ const candidateLookup = /* @__PURE__ */ new Map();
+ for (const [queryToken, candidates] of candidatesMap.entries()) {
+ for (const candidate of candidates) {
+ if (!candidateLookup.has(candidate.word)) {
+ candidateLookup.set(candidate.word, []);
+ }
+ candidateLookup.get(candidate.word).push({ queryToken, candidate });
+ }
+ }
+ for (const entries of candidateLookup.values()) {
+ entries.sort((a, b) => {
+ if (a.candidate.type === "exact" && b.candidate.type !== "exact")
+ return -1;
+ if (b.candidate.type === "exact" && a.candidate.type !== "exact")
+ return 1;
+ return b.candidate.score - a.candidate.score;
+ });
+ }
+ return candidateLookup;
+ }
+ function buildQueryTokenCounts(queryTokens) {
+ const queryTokenCounts = /* @__PURE__ */ new Map();
+ for (const token of queryTokens) {
+ queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+ }
+ return queryTokenCounts;
+ }
+ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, candidateLookup, queryTokenCounts) {
  const phrases = [];
- const queryTokens = Array.from(candidatesMap.keys());
+ const queryTokens = originalQueryTokens;
  const wordMatches = [];
  for (let i = 0; i < documentTokens.length; i++) {
  const docWord = documentTokens[i];
- for (const [queryToken, candidates] of candidatesMap.entries()) {
- for (const candidate of candidates) {
- if (candidate.word === docWord) {
- wordMatches.push({
- word: docWord,
- queryToken,
- position: i,
- type: candidate.type,
- distance: candidate.distance,
- score: candidate.score
- });
- }
+ const matches = candidateLookup.get(docWord);
+ if (matches) {
+ for (const { queryToken, candidate } of matches) {
+ wordMatches.push({
+ word: docWord,
+ queryToken,
+ position: i,
+ type: candidate.type,
+ distance: candidate.distance,
+ score: candidate.score
+ });
  }
  }
  }
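
buildCandidateLookup inverts the per-query-token candidate lists into a word-keyed Map, so the document scan above costs one Map lookup per document token instead of iterating every candidate of every query token. An illustrative shape (the candidate objects mirror those produced by findAllCandidates):

    // candidatesMap: "fuzzy" -> [{ word: "fuzzy", type: "exact", distance: 0, score: 1 },
    //                            { word: "fuzy", type: "fuzzy", distance: 1, score: 0.8 }]
    const lookup = buildCandidateLookup(candidatesMap);
    // lookup.get("fuzy") -> [{ queryToken: "fuzzy", candidate: { word: "fuzy", ... } }]
    // entries for each word are sorted exact-first, then by descending score
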
@@ -212,42 +236,56 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
  documentFrequency,
  totalDocuments,
  wordMatches,
- documentTokens
- // Pass document tokens to extract gap words
+ documentTokens,
+ queryTokenCounts
+ // OPTIMIZATION B: Pass pre-built queryTokenCounts
  );
  if (phrase && phrase.words.length > 0) {
  phrases.push(phrase);
  }
  }
- return deduplicatePhrases(phrases);
+ const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+ const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+ return deduplicatePhrases(filteredPhrases);
  }
- function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
+ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens, queryTokenCounts) {
  const startMatch = wordMatches[startIndex];
  const phraseWords = [startMatch];
- const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+ const matchedCounts = /* @__PURE__ */ new Map();
+ matchedCounts.set(startMatch.queryToken, 1);
  const gapWords = [];
  let totalGapUsed = 0;
+ let totalMatchedTokens = 1;
  for (let i = startIndex + 1; i < wordMatches.length; i++) {
  const match = wordMatches[i];
  const lastPos = phraseWords[phraseWords.length - 1].position;
+ if (match.position <= lastPos) {
+ continue;
+ }
  const gap = match.position - lastPos - 1;
  if (gap > config.maxGap) {
  break;
  }
- for (let pos = lastPos + 1; pos < match.position; pos++) {
- totalGapUsed++;
- gapWords.push({
- word: documentTokens[pos],
- position: pos,
- gapIndex: totalGapUsed
- });
+ if (totalGapUsed + gap > config.maxGap) {
+ break;
  }
- if (!coveredTokens.has(match.queryToken)) {
+ const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+ const currentCount = matchedCounts.get(match.queryToken) || 0;
+ if (currentCount < neededCount) {
+ for (let pos = lastPos + 1; pos < match.position; pos++) {
+ totalGapUsed++;
+ gapWords.push({
+ word: documentTokens[pos],
+ position: pos,
+ gapIndex: totalGapUsed
+ });
+ }
  phraseWords.push(match);
- coveredTokens.add(match.queryToken);
- }
- if (coveredTokens.size === queryTokens.length) {
- break;
+ matchedCounts.set(match.queryToken, currentCount + 1);
+ totalMatchedTokens++;
+ if (totalMatchedTokens === queryTokens.length) {
+ break;
+ }
  }
  }
  if (phraseWords.length > 0) {
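
Replacing the coveredTokens Set with matchedCounts lets queries with repeated tokens consume each occurrence separately. For example:

    const counts = buildQueryTokenCounts(["the", "cat", "and", "the", "dog"]);
    // Map { "the" => 2, "cat" => 1, "and" => 1, "dog" => 1 }
    // The old Set marked "the" covered after one match; matchedCounts keeps
    // accepting it until currentCount reaches neededCount (2 here), so the
    // phrase only completes once both occurrences of "the" are matched.
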
@@ -286,9 +324,12 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
  baseScore /= phraseWords.length;
  const inOrder = isInOrder(phraseWords, queryTokens);
  const orderScore = inOrder ? 1 : 0.5;
- const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
- const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
- const proximityScore = Math.max(0, 1 - span / proximityWindow);
+ let proximityScore = 0;
+ if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+ const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+ const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+ proximityScore = Math.max(0, 1 - span / proximityWindow);
+ }
  let densityScore = 0;
  if (queryTokens.length === 1) {
  const totalOccurrences = allWordMatches.length;
@@ -306,8 +347,10 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
  const weightedDensity = densityScore * weights.density;
  const weightedSemantic = semanticScore * weights.semantic;
  const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
- const maxBaseWeight = Math.max(weights.exact, weights.fuzzy);
- const maxPossibleScore = maxBaseWeight + weights.order + weights.proximity + weights.density + weights.semantic;
+ const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+ const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+ const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+ const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
  const normalizedScore = totalScore / maxPossibleScore;
  const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
  const score = normalizedScore * coverageMultiplier;
@@ -330,13 +373,20 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
  };
  }
  function isInOrder(phraseWords, queryTokens) {
- const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
- for (let i = 1; i < phraseWords.length; i++) {
- const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
- const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
- if (currOrder < prevOrder) {
+ const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+ let lastMatchedIndex = -1;
+ for (const phraseWord of phraseWords) {
+ let foundIndex = -1;
+ for (const pos of tokenPositions) {
+ if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+ foundIndex = pos.index;
+ break;
+ }
+ }
+ if (foundIndex === -1) {
  return false;
  }
+ lastMatchedIndex = foundIndex;
  }
  return true;
  }
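
The rewritten isInOrder does a greedy left-to-right scan, so duplicate query tokens map to distinct positions; the old token-to-index Map collapsed duplicates to their last index. An illustrative call (the function only reads the queryToken field):

    isInOrder(
      [{ queryToken: "a" }, { queryToken: "b" }, { queryToken: "a" }],
      ["a", "b", "a"]
    );
    // old version: tokenOrder = Map { "a" => 2, "b" => 1 }, so a(2) followed by b(1)
    // looked out of order and returned false; the greedy scan returns true
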
@@ -377,9 +427,259 @@ function deduplicatePhrases(phrases) {
  return result.sort((a, b) => b.score - a.score);
  }
 
+ // src/optimized.ts
+ var DEFAULT_OPTIMIZED_CONFIG = {
+ maxQPSCandidates: 100,
+ // Limit phrase scoring to top 100 candidates
+ minQPSScore: 0.1,
+ // Include candidates with 10%+ of best score
+ qpsExact: false,
+ // Use fuzzy matching by default
+ qpsTolerance: 1
+ // Default tolerance of 1 edit distance
+ };
+ function normalizeText(text) {
+ return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
+ }
+ function tokenize(text) {
+ return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+ }
+ function buildReducedVocabularyFromDocs(candidateDocIds, docs) {
+ const reducedVocab = /* @__PURE__ */ new Set();
+ for (const docId of candidateDocIds) {
+ const doc = docs[docId];
+ if (!doc?.normalized_content)
+ continue;
+ const tokens = doc.normalized_content.split(/\s+/).filter((token) => token.length > 0);
+ for (const token of tokens) {
+ reducedVocab.add(token);
+ }
+ }
+ return reducedVocab;
+ }
+ function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
+ const tokens = tokenizer.tokenize(term, language);
+ if (tokens.length === 0) {
+ return [];
+ }
+ const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
+ const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
+ const boostPerProp = config.qpsBoostPerProp ?? {};
+ const resultMap = /* @__PURE__ */ new Map();
+ for (const prop of properties) {
+ const indexEntry = qpsIndex.indexes[prop];
+ if (!indexEntry || indexEntry.type !== "Radix") {
+ continue;
+ }
+ const radixNode = indexEntry.node;
+ const stats = qpsIndex.stats[prop];
+ if (!radixNode || !stats) {
+ continue;
+ }
+ const boost = boostPerProp[prop] ?? 1;
+ for (let i = 0; i < tokens.length; i++) {
+ const token = tokens[i];
+ const matches = radixNode.find({
+ term: token,
+ exact,
+ tolerance: exact ? 0 : tolerance
+ });
+ for (const [matchedWord, docIds] of Object.entries(matches)) {
+ if (!Array.isArray(docIds))
+ continue;
+ const isExactMatch = matchedWord === token;
+ for (const docId of docIds) {
+ const tokensLength = stats.tokensLength.get(docId) || 1;
+ const quantum = stats.tokenQuantums[docId]?.[matchedWord];
+ const occurrences = quantum ? quantum >> 20 : 1;
+ const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
+ if (!resultMap.has(docId)) {
+ resultMap.set(docId, [scoreContrib, 1 << i]);
+ } else {
+ const [prevScore, prevMask] = resultMap.get(docId);
+ const adjacencyBonus = countSetBits(prevMask >> 1 & 1 << i) * 2;
+ resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | 1 << i]);
+ }
+ }
+ }
+ }
+ }
+ const results = Array.from(resultMap.entries()).map(([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
+ return results;
+ }
+ function countSetBits(n) {
+ let count = 0;
+ while (n) {
+ count += n & 1;
+ n >>= 1;
+ }
+ return count;
+ }
+ async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
+ const startTime = performance.now();
+ const { term, properties, tokenCache } = params;
+ if (!term || typeof term !== "string") {
+ return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+ }
+ const textProperty = properties && properties[0] || pluginState.config.textProperty;
+ const searchProperties = properties || [textProperty];
+ const queryTokens = tokenize(term);
+ if (queryTokens.length === 0) {
+ return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+ }
+ const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
+ console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
+ const qpsStartTime = performance.now();
+ const tokenizer = orama.tokenizer;
+ const qpsCandidates = searchQPS(
+ term,
+ qpsIndex,
+ tokenizer,
+ searchProperties,
+ config,
+ language
+ );
+ const qpsTime = performance.now() - qpsStartTime;
+ console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
+ if (qpsCandidates.length === 0) {
+ return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+ }
+ const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
+ const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
+ const bestScore = qpsCandidates[0][1];
+ const minScore = bestScore * minScoreRatio;
+ const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
+ console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
+ const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));
+ let vocabulary = pluginState.vocabulary;
+ if (vocabulary.size === 0) {
+ console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
+ try {
+ const indexData = orama.data?.index;
+ let radixNode = null;
+ if (indexData?.indexes?.[textProperty]?.node) {
+ radixNode = indexData.indexes[textProperty].node;
+ } else if (indexData?.[textProperty]?.node) {
+ radixNode = indexData[textProperty].node;
+ }
+ if (radixNode) {
+ pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
+ vocabulary = pluginState.vocabulary;
+ console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
+ } else {
+ console.error("\u274C Radix tree not found for vocabulary extraction");
+ return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+ }
+ } catch (error) {
+ console.error("\u274C Failed to extract vocabulary:", error);
+ return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+ }
+ }
+ let docs = {};
+ if (orama.data?.docs?.docs) {
+ docs = orama.data.docs.docs;
+ }
+ const vocabStartTime = performance.now();
+ const reducedVocabulary = buildReducedVocabularyFromDocs(candidateDocIds, docs);
+ const vocabTime = performance.now() - vocabStartTime;
+ console.log(`\u{1F4DA} Reduced vocabulary: ${reducedVocabulary.size} words (full: ${vocabulary.size}, reduction: ${(100 * (1 - reducedVocabulary.size / vocabulary.size)).toFixed(1)}%, built in ${vocabTime.toFixed(2)}ms)`);
+ const candidatesMap = findAllCandidates(
+ queryTokens,
+ reducedVocabulary,
+ tolerance,
+ pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
+ pluginState.config.synonymMatchScore
+ );
+ const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
+ console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
+ const phraseStartTime = performance.now();
+ const candidateLookup = buildCandidateLookup(filteredFuzzyCandidates);
+ const queryTokenCounts = buildQueryTokenCounts(queryTokens);
+ const documentMatches = [];
+ let docsScored = 0;
+ for (const [docId, doc] of Object.entries(docs)) {
+ if (!candidateDocIds.has(docId)) {
+ continue;
+ }
+ docsScored++;
+ const text = doc[textProperty];
+ if (!text || typeof text !== "string") {
+ continue;
+ }
+ let docTokens;
+ if (tokenCache && tokenCache.has(docId)) {
+ docTokens = tokenCache.get(docId);
+ } else {
+ docTokens = text.split(/\s+/).filter((token) => token.length > 0);
+ }
+ const phrases = findPhrasesInDocument(
+ docTokens,
+ filteredFuzzyCandidates,
+ {
+ weights: pluginState.config.weights,
+ maxGap: pluginState.config.maxGap,
+ proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
+ tolerance
+ },
+ pluginState.documentFrequency,
+ pluginState.totalDocuments,
+ queryTokens,
+ candidateLookup,
+ // PHASE 1 OPTIMIZATION A: Pre-built candidate lookup
+ queryTokenCounts
+ // PHASE 1 OPTIMIZATION B: Pre-built query token counts
+ );
+ if (phrases.length > 0) {
+ const docScore = Math.max(...phrases.map((p) => p.score));
+ documentMatches.push({
+ id: docId,
+ phrases,
+ score: docScore,
+ document: doc
+ });
+ }
+ }
+ const phraseTime = performance.now() - phraseStartTime;
+ console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
+ documentMatches.sort((a, b) => b.score - a.score);
+ let finalMatches = documentMatches;
+ if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
+ const threshold = pluginState.config.finalScoreMinimum;
+ const beforeCount = finalMatches.length;
+ finalMatches = finalMatches.filter((m) => m.score >= threshold);
+ console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
+ }
+ const limit = params.limit ?? finalMatches.length;
+ const limitedMatches = finalMatches.slice(0, limit);
+ const hits = limitedMatches.map((match) => ({
+ id: match.id,
+ score: match.score,
+ document: match.document,
+ _phrases: match.phrases
+ }));
+ const elapsed = performance.now() - startTime;
+ console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
+ return {
+ elapsed: {
+ formatted: `${elapsed.toFixed(2)}ms`,
+ raw: Math.floor(elapsed * 1e6),
+ qpsTime,
+ phraseTime
+ },
+ hits,
+ count: hits.length
+ };
+ }
+ function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
+ return async (params, language = "french") => {
+ return searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
+ };
+ }
+
  // src/index.ts
  var DEFAULT_CONFIG = {
- textProperty: "content",
+ textProperty: "normalized_content",
+ // Must match server's field name
  tolerance: 1,
  adaptiveTolerance: true,
  enableSynonyms: false,
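
In searchQPS above, each per-document entry carries a bitmask of which query-token positions have matched (bit i for token i); the shifted AND rewards matches at adjacent query positions. A worked example using the expressions from the code:

    // two-token query; token 1 already matched a doc -> mask 0b10
    // token 0 now matches the same doc:
    const prevMask = 0b10, i = 0;
    const adjacencyBonus = countSetBits(prevMask >> 1 & 1 << i) * 2;
    // (0b10 >> 1) & (1 << 0) === 1 -> one set bit -> bonus of 2
    // the stored mask becomes prevMask | 1 << i === 0b11
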
@@ -395,6 +695,8 @@ var DEFAULT_CONFIG = {
  },
  maxGap: 5,
  minScore: 0.1,
+ enableFinalScoreMinimum: false,
+ finalScoreMinimum: 0.3,
  proximitySpanMultiplier: 5
  };
  var pluginStates = /* @__PURE__ */ new WeakMap();
@@ -416,6 +718,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
  },
  maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
  minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+ enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+ finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
  proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
  };
  const plugin = {
@@ -429,7 +733,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
  synonymMap: {},
  config,
  documentFrequency: /* @__PURE__ */ new Map(),
- totalDocuments: 0
+ totalDocuments: 0,
+ vocabulary: /* @__PURE__ */ new Set()
  };
  if (config.enableSynonyms && config.supabase) {
  try {
@@ -446,6 +751,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
  state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
  console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
  }
+ try {
+ const indexData = orama.data?.index;
+ let radixNode = null;
+ if (indexData?.indexes?.[config.textProperty]?.node) {
+ radixNode = indexData.indexes[config.textProperty].node;
+ } else if (indexData?.[config.textProperty]?.node) {
+ radixNode = indexData[config.textProperty].node;
+ }
+ if (radixNode) {
+ state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+ console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+ } else {
+ console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
+ }
+ } catch (error) {
+ console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
+ }
  pluginStates.set(orama, state);
  console.log("\u2705 Fuzzy Phrase Plugin initialized");
  setImmediate(() => {
@@ -467,43 +789,43 @@ async function searchWithFuzzyPhrase(orama, params, language) {
  console.error("\u274C Plugin state not initialized");
  throw new Error("Fuzzy Phrase Plugin not properly initialized");
  }
- const { term, properties } = params;
+ const { term, properties, tokenCache, candidateIds } = params;
+ const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
  if (!term || typeof term !== "string") {
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const textProperty = properties && properties[0] || state.config.textProperty;
- const queryTokens = tokenize(term);
+ const queryTokens = tokenize2(term);
  if (queryTokens.length === 0) {
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
  console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
- let vocabulary;
- try {
- const indexData = orama.data?.index;
- if (!indexData) {
- console.error("\u274C No index data found in orama.data.index");
- return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
- }
- console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
- let radixNode = null;
- if (indexData.indexes?.[textProperty]?.node) {
- radixNode = indexData.indexes[textProperty].node;
- console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
- } else if (indexData[textProperty]?.node) {
- radixNode = indexData[textProperty].node;
- console.log("\u2705 Found radix via standard path (data.index[property])");
- }
- if (!radixNode) {
- console.error("\u274C Radix tree not found for property:", textProperty);
- console.error(" Available properties in index:", Object.keys(indexData));
+ let vocabulary = state.vocabulary;
+ if (vocabulary.size === 0) {
+ console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+ try {
+ const indexData = orama.data?.index;
+ let radixNode = null;
+ if (indexData?.indexes?.[textProperty]?.node) {
+ radixNode = indexData.indexes[textProperty].node;
+ } else if (indexData?.[textProperty]?.node) {
+ radixNode = indexData[textProperty].node;
+ }
+ if (radixNode) {
+ state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+ vocabulary = state.vocabulary;
+ console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+ } else {
+ console.error("\u274C Radix tree not found for vocabulary extraction");
+ return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+ }
+ } catch (error) {
+ console.error("\u274C Failed to extract vocabulary:", error);
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
- vocabulary = extractVocabularyFromRadixTree(radixNode);
- console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
- } catch (error) {
- console.error("\u274C Failed to extract vocabulary:", error);
- return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+ } else {
+ console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
  }
  const candidatesMap = findAllCandidates(
  queryTokens,
@@ -512,11 +834,10 @@ async function searchWithFuzzyPhrase(orama, params, language) {
  state.config.enableSynonyms ? state.synonymMap : void 0,
  state.config.synonymMatchScore
  );
- const filteredCandidates = filterCandidatesByScore(
- candidatesMap,
- state.config.minScore
- );
+ const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
  console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
+ const candidateLookup = buildCandidateLookup(filteredCandidates);
+ const queryTokenCounts = buildQueryTokenCounts(queryTokens);
  const documentMatches = [];
  console.log("\u{1F50D} DEBUG orama.data structure:", {
  dataKeys: Object.keys(orama.data || {}),
@@ -542,23 +863,42 @@ async function searchWithFuzzyPhrase(orama, params, language) {
  dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
  });
  }
- console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
+ const cacheHits = tokenCache ? tokenCache.size : 0;
+ const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
+ console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
  for (const [docId, doc] of Object.entries(docs)) {
+ if (candidateIdSet) {
+ const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
+ if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
+ continue;
+ }
+ }
  const text = doc[textProperty];
  if (!text || typeof text !== "string") {
  continue;
  }
- const docTokens = tokenize(text);
+ let docTokens;
+ if (tokenCache && tokenCache.has(docId)) {
+ docTokens = tokenCache.get(docId);
+ } else {
+ docTokens = tokenize2(text);
+ }
  const phrases = findPhrasesInDocument(
  docTokens,
  filteredCandidates,
  {
  weights: state.config.weights,
  maxGap: state.config.maxGap,
- proximitySpanMultiplier: state.config.proximitySpanMultiplier
+ proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+ tolerance
  },
  state.documentFrequency,
- state.totalDocuments
+ state.totalDocuments,
+ queryTokens,
+ candidateLookup,
+ // PHASE 1 OPTIMIZATION A: Pre-built candidate lookup
+ queryTokenCounts
+ // PHASE 1 OPTIMIZATION B: Pre-built query token counts
  );
  if (phrases.length > 0) {
  const docScore = Math.max(...phrases.map((p) => p.score));
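
For callers, the new tokenCache and candidateIds params threaded through this hunk take the following shape (a hypothetical invocation; only term is required):

    const results = await searchWithFuzzyPhrase(orama, {
      term: "recherche floue",
      // optional: docId -> pre-split tokens, skips per-document tokenize2 calls
      tokenCache: new Map([["doc1", ["recherche", "floue", "rapide"]]]),
      // optional: Set (or array) of ids; documents outside it are skipped entirely
      candidateIds: new Set(["doc1", "doc42"]),
      limit: 10
    });
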
@@ -571,8 +911,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
  }
  }
  documentMatches.sort((a, b) => b.score - a.score);
- const limit = params.limit ?? documentMatches.length;
- const limitedMatches = documentMatches.slice(0, limit);
+ let filteredMatches = documentMatches;
+ if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+ const threshold = state.config.finalScoreMinimum;
+ const beforeCount = filteredMatches.length;
+ filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+ console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+ }
+ const limit = params.limit ?? filteredMatches.length;
+ const limitedMatches = filteredMatches.slice(0, limit);
  const hits = limitedMatches.map((match) => ({
  id: match.id,
  score: match.score,
@@ -623,21 +970,29 @@ function calculateDocumentFrequencies(docs, textProperty) {
  if (!text || typeof text !== "string") {
  continue;
  }
- const words = new Set(tokenize(text));
+ const words = new Set(tokenize2(text));
  for (const word of words) {
  df.set(word, (df.get(word) || 0) + 1);
  }
  }
  return df;
  }
- function normalizeText(text) {
+ function normalizeText2(text) {
  return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
  }
- function tokenize(text) {
- return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+ function tokenize2(text) {
+ return normalizeText2(text).split(/\s+/).filter((token) => token.length > 0);
+ }
+ function getPluginState(orama) {
+ return pluginStates.get(orama);
  }
 
+ exports.createOptimizedSearch = createOptimizedSearch;
+ exports.getPluginState = getPluginState;
+ exports.normalizeTextOptimized = normalizeText;
  exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
  exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
+ exports.searchWithQPSPruning = searchWithQPSPruning;
+ exports.tokenizeOptimized = tokenize;
  //# sourceMappingURL=out.js.map
  //# sourceMappingURL=index.cjs.map
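
Taken together, a minimal consumer of the APIs this version adds might look like the sketch below; the require usage, the orama instance, and the qpsIndex shape ({ indexes, stats }, as read by searchQPS) are assumptions, while the config flags and exports come straight from this diff:

    const { pluginFuzzyPhrase, createOptimizedSearch, getPluginState } = require("@wcs-colab/plugin-fuzzy-phrase");

    const plugin = pluginFuzzyPhrase({
      textProperty: "normalized_content",
      enableFinalScoreMinimum: true, // drop matches scoring below finalScoreMinimum
      finalScoreMinimum: 0.3
    });
    // ...create the Orama instance with this plugin and index documents...

    // qpsIndex is assumed to expose the { indexes, stats } shape read by searchQPS
    const optimizedSearch = createOptimizedSearch(orama, qpsIndex, getPluginState(orama), {
      maxQPSCandidates: 100,
      minQPSScore: 0.1
    });
    const { hits } = await optimizedSearch({ term: "recherche floue", limit: 10 });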