@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
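
Note on the hunk above: the removed branch treated any document word that merely started with the query token as a near-perfect match (score 0.95, distance 0). With it gone, such pairs must pass boundedLevenshtein within the configured tolerance. An illustrative before/after (the call is hypothetical, shown only to contrast behavior):

fuzzyMatch("categories", "cat", 1);
// before: { matches: true, distance: 0, score: 0.95 } via the prefix shortcut
// after:  edit distance 7 > tolerance 1, so no match; short prefixes no longer match long words
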
@@ -182,24 +179,41 @@ function filterCandidatesByScore(candidatesMap, minScore) {
 }

 // src/scoring.ts
-function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens) {
   const phrases = [];
-  const queryTokens = Array.from(candidatesMap.keys());
+  const queryTokens = originalQueryTokens;
   const wordMatches = [];
+  const candidateLookup = /* @__PURE__ */ new Map();
+  for (const [queryToken, candidates] of candidatesMap.entries()) {
+    for (const candidate of candidates) {
+      if (!candidateLookup.has(candidate.word)) {
+        candidateLookup.set(candidate.word, []);
+      }
+      candidateLookup.get(candidate.word).push({ queryToken, candidate });
+    }
+  }
+  for (const entries of candidateLookup.values()) {
+    entries.sort((a, b) => {
+      if (a.candidate.type === "exact" && b.candidate.type !== "exact")
+        return -1;
+      if (b.candidate.type === "exact" && a.candidate.type !== "exact")
+        return 1;
+      return b.candidate.score - a.candidate.score;
+    });
+  }
   for (let i = 0; i < documentTokens.length; i++) {
     const docWord = documentTokens[i];
-    for (const [queryToken, candidates] of candidatesMap.entries()) {
-      for (const candidate of candidates) {
-        if (candidate.word === docWord) {
-          wordMatches.push({
-            word: docWord,
-            queryToken,
-            position: i,
-            type: candidate.type,
-            distance: candidate.distance,
-            score: candidate.score
-          });
-        }
+    const matches = candidateLookup.get(docWord);
+    if (matches) {
+      for (const { queryToken, candidate } of matches) {
+        wordMatches.push({
+          word: docWord,
+          queryToken,
+          position: i,
+          type: candidate.type,
+          distance: candidate.distance,
+          score: candidate.score
+        });
       }
     }
   }
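
The hunk above replaces the per-token scan of every candidate list with a one-time inverted index from candidate word to its (queryToken, candidate) pairs, so each document token costs a single Map lookup instead of a nested loop over all candidates. A minimal standalone sketch of the idea (the sample data is invented):

const candidatesMap = new Map([
  ["serch", [{ word: "search", type: "fuzzy", distance: 1, score: 0.8 }]],
  ["engine", [{ word: "engine", type: "exact", distance: 0, score: 1 }]]
]);
const candidateLookup = new Map();
for (const [queryToken, candidates] of candidatesMap.entries()) {
  for (const candidate of candidates) {
    if (!candidateLookup.has(candidate.word)) candidateLookup.set(candidate.word, []);
    candidateLookup.get(candidate.word).push({ queryToken, candidate });
  }
}
// Scanning a document is now O(tokens) map lookups instead of O(tokens × candidates):
console.log(candidateLookup.get("engine")); // [{ queryToken: "engine", candidate: { ... } }]
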
@@ -219,35 +233,52 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       phrases.push(phrase);
     }
   }
-  return deduplicatePhrases(phrases);
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
 function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
   const gapWords = [];
   let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
     const lastPos = phraseWords[phraseWords.length - 1].position;
+    if (match.position <= lastPos) {
+      continue;
+    }
     const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-    for (let pos = lastPos + 1; pos < match.position; pos++) {
-      totalGapUsed++;
-      gapWords.push({
-        word: documentTokens[pos],
-        position: pos,
-        gapIndex: totalGapUsed
-      });
+    if (totalGapUsed + gap > config.maxGap) {
+      break;
     }
-    if (!coveredTokens.has(match.queryToken)) {
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-      coveredTokens.add(match.queryToken);
-    }
-    if (coveredTokens.size === queryTokens.length) {
-      break;
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
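
buildPhraseFromPosition now tracks how many times each query token is still needed (a multiset of counts) rather than a Set of covered tokens, and only spends gap budget on matches it actually keeps. A small sketch of why counts matter for repeated tokens (the example query is invented):

// Query "step by step" needs "step" twice; a Set would saturate after the first hit.
const queryTokens = ["step", "by", "step"];
const queryTokenCounts = new Map();
for (const token of queryTokens) {
  queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
}
console.log(queryTokenCounts.get("step")); // 2, so two document positions of "step" can join the phrase
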
@@ -286,9 +317,12 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
-  const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
-  const proximityScore = Math.max(0, 1 - span / proximityWindow);
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
   let densityScore = 0;
   if (queryTokens.length === 1) {
     const totalOccurrences = allWordMatches.length;
@@ -306,8 +340,10 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   const weightedDensity = densityScore * weights.density;
   const weightedSemantic = semanticScore * weights.semantic;
   const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
-  const maxBaseWeight = Math.max(weights.exact, weights.fuzzy);
-  const maxPossibleScore = maxBaseWeight + weights.order + weights.proximity + weights.density + weights.semantic;
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
   const normalizedScore = totalScore / maxPossibleScore;
   const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
   const score = normalizedScore * coverageMultiplier;
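
The normalization change keeps the denominator honest: weights that cannot be earned under the current configuration (the fuzzy weight when tolerance is 0, the proximity weight when maxGap is 0 or the query has a single token) no longer deflate the final score. A worked comparison with assumed weights (illustrative numbers, not the plugin defaults):

// weights = { exact: 1, fuzzy: 0.8, order: 0.2, proximity: 0.2, density: 0.1, semantic: 0 }
// single-token query, tolerance 0:
//   old maxPossibleScore: max(1, 0.8) + 0.2 + 0.2 + 0.1 + 0 = 1.5
//   new maxPossibleScore: 1 + 0.2 + 0 + 0.1 + 0 = 1.3
// The same totalScore now divides by 1.3 instead of 1.5, so attainable scores sit closer to 1.
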
@@ -330,13 +366,20 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   };
 }
 function isInOrder(phraseWords, queryTokens) {
-  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
-  for (let i = 1; i < phraseWords.length; i++) {
-    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
-    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
-    if (currOrder < prevOrder) {
+  const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+  let lastMatchedIndex = -1;
+  for (const phraseWord of phraseWords) {
+    let foundIndex = -1;
+    for (const pos of tokenPositions) {
+      if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+        foundIndex = pos.index;
+        break;
+      }
+    }
+    if (foundIndex === -1) {
       return false;
     }
+    lastMatchedIndex = foundIndex;
   }
   return true;
 }
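
The rewritten isInOrder greedily maps each matched word to the earliest not-yet-used query position, which is what makes duplicated query tokens orderable. An illustrative call (objects reduced to the single field isInOrder reads):

isInOrder(
  [{ queryToken: "step" }, { queryToken: "by" }, { queryToken: "step" }],
  ["step", "by", "step"]
);
// true: greedy mapping to query indices 0 → 1 → 2.
// The old Map-based version kept only the last index of "step" (2), saw
// ("step" → 2) followed by ("by" → 1) as a regression, and returned false.
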
@@ -377,9 +420,236 @@ function deduplicatePhrases(phrases) {
   return result.sort((a, b) => b.score - a.score);
 }

+// src/optimized.ts
+var DEFAULT_OPTIMIZED_CONFIG = {
+  maxQPSCandidates: 100,
+  // Limit phrase scoring to top 100 candidates
+  minQPSScore: 0.1,
+  // Include candidates with 10%+ of best score
+  qpsExact: false,
+  // Use fuzzy matching by default
+  qpsTolerance: 1
+  // Default tolerance of 1 edit distance
+};
+function normalizeText(text) {
+  return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
+}
+function tokenize(text) {
+  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
+  const tokens = tokenizer.tokenize(term, language);
+  if (tokens.length === 0) {
+    return [];
+  }
+  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
+  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
+  const boostPerProp = config.qpsBoostPerProp ?? {};
+  const resultMap = /* @__PURE__ */ new Map();
+  for (const prop of properties) {
+    const indexEntry = qpsIndex.indexes[prop];
+    if (!indexEntry || indexEntry.type !== "Radix") {
+      continue;
+    }
+    const radixNode = indexEntry.node;
+    const stats = qpsIndex.stats[prop];
+    if (!radixNode || !stats) {
+      continue;
+    }
+    const boost = boostPerProp[prop] ?? 1;
+    for (let i = 0; i < tokens.length; i++) {
+      const token = tokens[i];
+      const matches = radixNode.find({
+        term: token,
+        exact,
+        tolerance: exact ? 0 : tolerance
+      });
+      for (const [matchedWord, docIds] of Object.entries(matches)) {
+        if (!Array.isArray(docIds))
+          continue;
+        const isExactMatch = matchedWord === token;
+        for (const docId of docIds) {
+          const tokensLength = stats.tokensLength.get(docId) || 1;
+          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
+          const occurrences = quantum ? quantum >> 20 : 1;
+          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
+          if (!resultMap.has(docId)) {
+            resultMap.set(docId, [scoreContrib, 1 << i]);
+          } else {
+            const [prevScore, prevMask] = resultMap.get(docId);
+            const adjacencyBonus = countSetBits(prevMask >> 1 & 1 << i) * 2;
+            resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | 1 << i]);
+          }
+        }
+      }
+    }
+  }
+  const results = Array.from(resultMap.entries()).map(([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
+  return results;
+}
+function countSetBits(n) {
+  let count = 0;
+  while (n) {
+    count += n & 1;
+    n >>= 1;
+  }
+  return count;
+}
+async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
+  const startTime = performance.now();
+  const { term, properties, tokenCache } = params;
+  if (!term || typeof term !== "string") {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const textProperty = properties && properties[0] || pluginState.config.textProperty;
+  const searchProperties = properties || [textProperty];
+  const queryTokens = tokenize(term);
+  if (queryTokens.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
+  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
+  const qpsStartTime = performance.now();
+  const tokenizer = orama.tokenizer;
+  const qpsCandidates = searchQPS(
+    term,
+    qpsIndex,
+    tokenizer,
+    searchProperties,
+    config,
+    language
+  );
+  const qpsTime = performance.now() - qpsStartTime;
+  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
+  if (qpsCandidates.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
+  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
+  const bestScore = qpsCandidates[0][1];
+  const minScore = bestScore * minScoreRatio;
+  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
+  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
+  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));
+  let vocabulary = pluginState.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = pluginState.vocabulary;
+        console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
+      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+    }
+  }
+  const candidatesMap = findAllCandidates(
+    queryTokens,
+    vocabulary,
+    tolerance,
+    pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
+    pluginState.config.synonymMatchScore
+  );
+  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
+  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
+  const phraseStartTime = performance.now();
+  const documentMatches = [];
+  let docs = {};
+  if (orama.data?.docs?.docs) {
+    docs = orama.data.docs.docs;
+  }
+  let docsScored = 0;
+  for (const [docId, doc] of Object.entries(docs)) {
+    if (!candidateDocIds.has(docId)) {
+      continue;
+    }
+    docsScored++;
+    const text = doc[textProperty];
+    if (!text || typeof text !== "string") {
+      continue;
+    }
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      docTokens = tokenCache.get(docId);
+    } else {
+      docTokens = tokenize(text);
+    }
+    const phrases = findPhrasesInDocument(
+      docTokens,
+      filteredFuzzyCandidates,
+      {
+        weights: pluginState.config.weights,
+        maxGap: pluginState.config.maxGap,
+        proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
+        tolerance
+      },
+      pluginState.documentFrequency,
+      pluginState.totalDocuments,
+      queryTokens
+    );
+    if (phrases.length > 0) {
+      const docScore = Math.max(...phrases.map((p) => p.score));
+      documentMatches.push({
+        id: docId,
+        phrases,
+        score: docScore,
+        document: doc
+      });
+    }
+  }
+  const phraseTime = performance.now() - phraseStartTime;
+  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
+  documentMatches.sort((a, b) => b.score - a.score);
+  let finalMatches = documentMatches;
+  if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
+    const threshold = pluginState.config.finalScoreMinimum;
+    const beforeCount = finalMatches.length;
+    finalMatches = finalMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? finalMatches.length;
+  const limitedMatches = finalMatches.slice(0, limit);
+  const hits = limitedMatches.map((match) => ({
+    id: match.id,
+    score: match.score,
+    document: match.document,
+    _phrases: match.phrases
+  }));
+  const elapsed = performance.now() - startTime;
+  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
+  return {
+    elapsed: {
+      formatted: `${elapsed.toFixed(2)}ms`,
+      raw: Math.floor(elapsed * 1e6),
+      qpsTime,
+      phraseTime
+    },
+    hits,
+    count: hits.length
+  };
+}
+function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
+  return async (params, language = "french") => {
+    return searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
+  };
+}
+
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "content",
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
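
In searchQPS above, each document accumulates a bitmask recording which query-token positions have matched it. Since >> and << bind tighter than &, the bonus expression reads (prevMask >> 1) & (1 << i), which is non-zero exactly when bit i + 1 of prevMask is already set, i.e. when the document previously matched the query token adjacent to token i (possible once masks accumulate across properties or matched words). A worked trace with invented values:

const prevMask = 0b100;                       // query token 2 matched this doc earlier (e.g. on another property)
const i = 1;                                  // token 1 matches the same document now
const hit = prevMask >> 1 & 1 << i;           // 0b010 & 0b010 = 0b010
const adjacencyBonus = countSetBits(hit) * 2; // 1 set bit × 2 = +2 on top of scoreContrib
const newMask = prevMask | 1 << i;            // 0b110, both positions now recorded
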
@@ -395,6 +665,8 @@ var DEFAULT_CONFIG = {
   },
   maxGap: 5,
   minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
   proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
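
Usage sketch for the two new options (the option names come from this diff; the rest of the configuration is assumed):

const plugin = pluginFuzzyPhrase({
  textProperty: "normalized_content",
  enableFinalScoreMinimum: true, // opt in to post-scoring filtering
  finalScoreMinimum: 0.3         // documents scoring below 0.3 are dropped before the limit applies
});
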
@@ -416,6 +688,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
     },
     maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
     minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+    enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+    finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
     proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
   };
   const plugin = {
@@ -429,7 +703,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
       synonymMap: {},
       config,
       documentFrequency: /* @__PURE__ */ new Map(),
-      totalDocuments: 0
+      totalDocuments: 0,
+      vocabulary: /* @__PURE__ */ new Set()
     };
     if (config.enableSynonyms && config.supabase) {
       try {
@@ -446,6 +721,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
       state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
       console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
     }
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[config.textProperty]?.node) {
+        radixNode = indexData.indexes[config.textProperty].node;
+      } else if (indexData?.[config.textProperty]?.node) {
+        radixNode = indexData[config.textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+      } else {
+        console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
+      }
+    } catch (error) {
+      console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
+    }
     pluginStates.set(orama, state);
     console.log("\u2705 Fuzzy Phrase Plugin initialized");
     setImmediate(() => {
@@ -467,43 +759,43 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     console.error("\u274C Plugin state not initialized");
     throw new Error("Fuzzy Phrase Plugin not properly initialized");
   }
-  const { term, properties } = params;
+  const { term, properties, tokenCache, candidateIds } = params;
+  const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
   if (!term || typeof term !== "string") {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
   const textProperty = properties && properties[0] || state.config.textProperty;
-  const queryTokens = tokenize(term);
+  const queryTokens = tokenize2(term);
   if (queryTokens.length === 0) {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
   const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
   console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
-  let vocabulary;
-  try {
-    const indexData = orama.data?.index;
-    if (!indexData) {
-      console.error("\u274C No index data found in orama.data.index");
-      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
-    }
-    console.log("\u{1F50D} DEBUG: Index data keys:", Object.keys(indexData || {}));
-    let radixNode = null;
-    if (indexData.indexes?.[textProperty]?.node) {
-      radixNode = indexData.indexes[textProperty].node;
-      console.log("\u2705 Found radix via QPS-style path (data.index.indexes)");
-    } else if (indexData[textProperty]?.node) {
-      radixNode = indexData[textProperty].node;
-      console.log("\u2705 Found radix via standard path (data.index[property])");
-    }
-    if (!radixNode) {
-      console.error("\u274C Radix tree not found for property:", textProperty);
-      console.error(" Available properties in index:", Object.keys(indexData));
+  let vocabulary = state.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = state.vocabulary;
+        console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
       return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
     }
-    vocabulary = extractVocabularyFromRadixTree(radixNode);
-    console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
-  } catch (error) {
-    console.error("\u274C Failed to extract vocabulary:", error);
-    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  } else {
+    console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
   }
   const candidatesMap = findAllCandidates(
     queryTokens,
@@ -512,10 +804,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     state.config.enableSynonyms ? state.synonymMap : void 0,
     state.config.synonymMatchScore
   );
-  const filteredCandidates = filterCandidatesByScore(
-    candidatesMap,
-    state.config.minScore
-  );
+  const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
   const documentMatches = [];
   console.log("\u{1F50D} DEBUG orama.data structure:", {
@@ -542,23 +831,39 @@ async function searchWithFuzzyPhrase(orama, params, language) {
       dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
     });
   }
-  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
+  const cacheHits = tokenCache ? tokenCache.size : 0;
+  const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
+  console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
   for (const [docId, doc] of Object.entries(docs)) {
+    if (candidateIdSet) {
+      const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
+      if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
+        continue;
+      }
+    }
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
       continue;
     }
-    const docTokens = tokenize(text);
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      docTokens = tokenCache.get(docId);
+    } else {
+      docTokens = tokenize2(text);
+    }
     const phrases = findPhrasesInDocument(
       docTokens,
       filteredCandidates,
       {
         weights: state.config.weights,
         maxGap: state.config.maxGap,
-        proximitySpanMultiplier: state.config.proximitySpanMultiplier
+        proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+        tolerance
       },
       state.documentFrequency,
-      state.totalDocuments
+      state.totalDocuments,
+      queryTokens
+      // Original tokens with duplicates preserved
     );
     if (phrases.length > 0) {
       const docScore = Math.max(...phrases.map((p) => p.score));
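
Caller-side sketch (inside an async function) of the two new search params; the param names and semantics come from this diff, while the ids and cached tokens are invented:

const tokenCache = new Map([["1", ["fuzzy", "phrase", "search"]]]); // internal docId → pre-tokenized content
const results = await searchWithFuzzyPhrase(orama, {
  term: "fuzzy phrase",
  candidateIds: ["1", "42"], // Set or array; checked against both doc.id and the internal docId
  tokenCache,                // cached docs skip tokenize2(text)
  limit: 10
});
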
@@ -571,8 +876,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     }
   }
   documentMatches.sort((a, b) => b.score - a.score);
-  const limit = params.limit ?? documentMatches.length;
-  const limitedMatches = documentMatches.slice(0, limit);
+  let filteredMatches = documentMatches;
+  if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+    const threshold = state.config.finalScoreMinimum;
+    const beforeCount = filteredMatches.length;
+    filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? filteredMatches.length;
+  const limitedMatches = filteredMatches.slice(0, limit);
   const hits = limitedMatches.map((match) => ({
     id: match.id,
     score: match.score,
@@ -623,21 +935,29 @@ function calculateDocumentFrequencies(docs, textProperty) {
     if (!text || typeof text !== "string") {
       continue;
     }
-    const words = new Set(tokenize(text));
+    const words = new Set(tokenize2(text));
     for (const word of words) {
       df.set(word, (df.get(word) || 0) + 1);
     }
   }
   return df;
 }
-function normalizeText(text) {
+function normalizeText2(text) {
   return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
 }
-function tokenize(text) {
-  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+function tokenize2(text) {
+  return normalizeText2(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function getPluginState(orama) {
+  return pluginStates.get(orama);
 }

+exports.createOptimizedSearch = createOptimizedSearch;
+exports.getPluginState = getPluginState;
+exports.normalizeTextOptimized = normalizeText;
 exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
 exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
+exports.searchWithQPSPruning = searchWithQPSPruning;
+exports.tokenizeOptimized = tokenize;
 //# sourceMappingURL=out.js.map
 //# sourceMappingURL=index.cjs.map
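
Consumer sketch of the new CommonJS exports (the export names come from this diff; the orama instance and qpsIndex are assumed to exist):

const {
  pluginFuzzyPhrase,
  createOptimizedSearch,
  getPluginState,
  tokenizeOptimized
} = require("@wcs-colab/plugin-fuzzy-phrase");

const state = getPluginState(orama); // per-instance plugin state
const optimizedSearch = createOptimizedSearch(orama, qpsIndex, state, { maxQPSCandidates: 50 });
optimizedSearch({ term: "fuzzy phrase", limit: 10 }).then((res) => {
  console.log(res.count, res.elapsed.formatted);
});
console.log(tokenizeOptimized("L'exemple, accentué !")); // ["exemple", "accentue"]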