@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.24 → 3.1.16-custom.newbase.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -190,6 +190,15 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       candidateLookup.get(candidate.word).push({ queryToken, candidate });
     }
   }
+  for (const entries of candidateLookup.values()) {
+    entries.sort((a, b) => {
+      if (a.candidate.type === "exact" && b.candidate.type !== "exact")
+        return -1;
+      if (b.candidate.type === "exact" && a.candidate.type !== "exact")
+        return 1;
+      return b.candidate.score - a.candidate.score;
+    });
+  }
   for (let i = 0; i < documentTokens.length; i++) {
     const docWord = documentTokens[i];
     const matches = candidateLookup.get(docWord);
@@ -206,15 +215,6 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       }
     }
   }
-  wordMatches.sort((a, b) => {
-    if (a.position !== b.position)
-      return a.position - b.position;
-    if (a.type === "exact" && b.type !== "exact")
-      return -1;
-    if (b.type === "exact" && a.type !== "exact")
-      return 1;
-    return b.score - a.score;
-  });
   for (let i = 0; i < wordMatches.length; i++) {
     const phrase = buildPhraseFromPosition(
       wordMatches,
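
Note: taken together, the two hunks above hoist the exact-before-fuzzy ordering out of the per-document hot path. Candidate lists are now sorted once, when candidateLookup is built, and since the document scan visits positions in ascending order, the old per-call wordMatches sort, position tiebreak included, becomes redundant. A minimal sketch of the hoisted comparator on hypothetical data (the { word, type, score } candidate shape is inferred from the fields the diff reads):

type Candidate = { word: string; type: "exact" | "fuzzy"; score: number };
type Entry = { queryToken: string; candidate: Candidate };

const entries: Entry[] = [
  { queryToken: "eglise", candidate: { word: "eglises", type: "fuzzy", score: 0.8 } },
  { queryToken: "eglise", candidate: { word: "eglise", type: "exact", score: 1.0 } },
  { queryToken: "eglise", candidate: { word: "elise", type: "fuzzy", score: 0.9 } },
];

// Same comparator as the added hunk: exact matches first, then descending score.
entries.sort((a, b) => {
  if (a.candidate.type === "exact" && b.candidate.type !== "exact") return -1;
  if (b.candidate.type === "exact" && a.candidate.type !== "exact") return 1;
  return b.candidate.score - a.candidate.score;
});
// -> "eglise" (exact), "elise" (0.9), "eglises" (0.8)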
@@ -415,6 +415,232 @@ function deduplicatePhrases(phrases) {
   return result.sort((a, b) => b.score - a.score);
 }
 
+// src/optimized.ts
+var DEFAULT_OPTIMIZED_CONFIG = {
+  maxQPSCandidates: 100,
+  // Limit phrase scoring to top 100 candidates
+  minQPSScore: 0.1,
+  // Include candidates with 10%+ of best score
+  qpsExact: false,
+  // Use fuzzy matching by default
+  qpsTolerance: 1
+  // Default tolerance of 1 edit distance
+};
+function normalizeText(text) {
+  return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
+}
+function tokenize(text) {
+  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
+  const tokens = tokenizer.tokenize(term, language);
+  if (tokens.length === 0) {
+    return [];
+  }
+  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
+  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
+  const boostPerProp = config.qpsBoostPerProp ?? {};
+  const resultMap = /* @__PURE__ */ new Map();
+  for (const prop of properties) {
+    const indexEntry = qpsIndex.indexes[prop];
+    if (!indexEntry || indexEntry.type !== "Radix") {
+      continue;
+    }
+    const radixNode = indexEntry.node;
+    const stats = qpsIndex.stats[prop];
+    if (!radixNode || !stats) {
+      continue;
+    }
+    const boost = boostPerProp[prop] ?? 1;
+    for (let i = 0; i < tokens.length; i++) {
+      const token = tokens[i];
+      const matches = radixNode.find({
+        term: token,
+        exact,
+        tolerance: exact ? 0 : tolerance
+      });
+      for (const [matchedWord, docIds] of Object.entries(matches)) {
+        if (!Array.isArray(docIds))
+          continue;
+        const isExactMatch = matchedWord === token;
+        for (const docId of docIds) {
+          const tokensLength = stats.tokensLength.get(docId) || 1;
+          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
+          const occurrences = quantum ? quantum >> 20 : 1;
+          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
+          if (!resultMap.has(docId)) {
+            resultMap.set(docId, [scoreContrib, 1 << i]);
+          } else {
+            const [prevScore, prevMask] = resultMap.get(docId);
+            const adjacencyBonus = countSetBits(prevMask >> 1 & 1 << i) * 2;
+            resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | 1 << i]);
+          }
+        }
+      }
+    }
+  }
+  const results = Array.from(resultMap.entries()).map(([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
+  return results;
+}
+function countSetBits(n) {
+  let count = 0;
+  while (n) {
+    count += n & 1;
+    n >>= 1;
+  }
+  return count;
+}
+async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
+  const startTime = performance.now();
+  const { term, properties, tokenCache } = params;
+  if (!term || typeof term !== "string") {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const textProperty = properties && properties[0] || pluginState.config.textProperty;
+  const searchProperties = properties || [textProperty];
+  const queryTokens = tokenize(term);
+  if (queryTokens.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
+  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
+  const qpsStartTime = performance.now();
+  const tokenizer = orama.tokenizer;
+  const qpsCandidates = searchQPS(
+    term,
+    qpsIndex,
+    tokenizer,
+    searchProperties,
+    config,
+    language
+  );
+  const qpsTime = performance.now() - qpsStartTime;
+  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
+  if (qpsCandidates.length === 0) {
+    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  }
+  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
+  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
+  const bestScore = qpsCandidates[0][1];
+  const minScore = bestScore * minScoreRatio;
+  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
+  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
+  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));
+  let vocabulary = pluginState.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = pluginState.vocabulary;
+        console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
+      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+    }
+  }
+  const candidatesMap = findAllCandidates(
+    queryTokens,
+    vocabulary,
+    tolerance,
+    pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
+    pluginState.config.synonymMatchScore
+  );
+  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
+  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
+  const phraseStartTime = performance.now();
+  const documentMatches = [];
+  let docs = {};
+  if (orama.data?.docs?.docs) {
+    docs = orama.data.docs.docs;
+  }
+  let docsScored = 0;
+  for (const [docId, doc] of Object.entries(docs)) {
+    if (!candidateDocIds.has(docId)) {
+      continue;
+    }
+    docsScored++;
+    const text = doc[textProperty];
+    if (!text || typeof text !== "string") {
+      continue;
+    }
+    let docTokens;
+    if (tokenCache && tokenCache.has(docId)) {
+      docTokens = tokenCache.get(docId);
+    } else {
+      docTokens = tokenize(text);
+    }
+    const phrases = findPhrasesInDocument(
+      docTokens,
+      filteredFuzzyCandidates,
+      {
+        weights: pluginState.config.weights,
+        maxGap: pluginState.config.maxGap,
+        proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
+        tolerance
+      },
+      pluginState.documentFrequency,
+      pluginState.totalDocuments,
+      queryTokens
+    );
+    if (phrases.length > 0) {
+      const docScore = Math.max(...phrases.map((p) => p.score));
+      documentMatches.push({
+        id: docId,
+        phrases,
+        score: docScore,
+        document: doc
+      });
+    }
+  }
+  const phraseTime = performance.now() - phraseStartTime;
+  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
+  documentMatches.sort((a, b) => b.score - a.score);
+  let finalMatches = documentMatches;
+  if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
+    const threshold = pluginState.config.finalScoreMinimum;
+    const beforeCount = finalMatches.length;
+    finalMatches = finalMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? finalMatches.length;
+  const limitedMatches = finalMatches.slice(0, limit);
+  const hits = limitedMatches.map((match) => ({
+    id: match.id,
+    score: match.score,
+    document: match.document,
+    _phrases: match.phrases
+  }));
+  const elapsed = performance.now() - startTime;
+  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
+  return {
+    elapsed: {
+      formatted: `${elapsed.toFixed(2)}ms`,
+      raw: Math.floor(elapsed * 1e6),
+      qpsTime,
+      phraseTime
+    },
+    hits,
+    count: hits.length
+  };
+}
+function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
+  return async (params, language = "french") => {
+    return searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
+  };
+}
+
 // src/index.ts
 var DEFAULT_CONFIG = {
   textProperty: "normalized_content",
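
The new src/optimized.ts module implements a two-stage search: a cheap QPS pass over the radix index nominates documents (capped at maxQPSCandidates, discarding anything below minQPSScore times the best score), and only those survivors go through the expensive fuzzy phrase scoring. Per document, searchQPS accumulates a [score, bitmask] pair in which bit i records that query token i matched; (prevMask >> 1) & (1 << i) is non-zero exactly when bit i + 1 was already set, so the +2 adjacency bonus fires only when a neighbouring query token has already hit the document in an earlier pass, for example via a second indexed property. A self-contained sketch of that accumulation, with invented document IDs and score contributions:

// Mirrors the accumulation in searchQPS above; the hit sequence is hypothetical.
function countSetBits(n: number): number {
  let count = 0;
  while (n) {
    count += n & 1;
    n >>= 1;
  }
  return count;
}

// Per document: [running score, bitmask of matched query-token indexes].
const resultMap = new Map<string, [number, number]>();

// Doc "42" matches tokens 0 and 1 via a first property, then token 0 again
// via a second property, which is when the adjacency bonus can fire.
const qpsHits: Array<{ docId: string; i: number; scoreContrib: number }> = [
  { docId: "42", i: 0, scoreContrib: 0.5 },
  { docId: "42", i: 1, scoreContrib: 0.25 },
  { docId: "42", i: 0, scoreContrib: 0.125 },
];

for (const { docId, i, scoreContrib } of qpsHits) {
  const prev = resultMap.get(docId);
  if (!prev) {
    resultMap.set(docId, [scoreContrib, 1 << i]);
  } else {
    const [prevScore, prevMask] = prev;
    // Non-zero only when bit i + 1 of prevMask is set, i.e. the neighbouring
    // query token has already matched this document.
    const adjacencyBonus = countSetBits(prevMask >> 1 & 1 << i) * 2;
    resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | 1 << i]);
  }
}

console.log(resultMap.get("42")); // [2.875, 3]: 0.5 + 0.25 + (0.125 + 2.0 bonus)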
@@ -528,12 +754,13 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     console.error("\u274C Plugin state not initialized");
     throw new Error("Fuzzy Phrase Plugin not properly initialized");
   }
-  const { term, properties, tokenCache } = params;
+  const { term, properties, tokenCache, candidateIds } = params;
+  const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
   if (!term || typeof term !== "string") {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
   const textProperty = properties && properties[0] || state.config.textProperty;
-  const queryTokens = tokenize(term);
+  const queryTokens = tokenize2(term);
   if (queryTokens.length === 0) {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
@@ -600,8 +827,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     });
   }
   const cacheHits = tokenCache ? tokenCache.size : 0;
-  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
+  const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
+  console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
   for (const [docId, doc] of Object.entries(docs)) {
+    if (candidateIdSet) {
+      const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
+      if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
+        continue;
+      }
+    }
     const text = doc[textProperty];
     if (!text || typeof text !== "string") {
       continue;
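
These hunks give searchWithFuzzyPhrase its own pruning hook: an optional candidateIds array or Set restricts the document loop, and a document is admitted when either its user-facing doc.id or Orama's internal key is in the set. A hypothetical call (db and the ID values are illustrative; only the candidateIds parameter itself comes from this diff):

import { searchWithFuzzyPhrase } from "@wcs-colab/plugin-fuzzy-phrase";

declare const db: any; // an Orama instance with the plugin installed (assumed)

const results = await searchWithFuzzyPhrase(db, {
  term: "cathedrale gothique",
  // Array or Set; matched against doc.id when present, else the internal ID.
  candidateIds: ["42", "107", "311"],
});
console.log(results.count, results.hits.map((h: any) => h.id));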
@@ -610,7 +844,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     if (tokenCache && tokenCache.has(docId)) {
       docTokens = tokenCache.get(docId);
     } else {
-      docTokens = tokenize(text);
+      docTokens = tokenize2(text);
     }
     const phrases = findPhrasesInDocument(
       docTokens,
@@ -696,20 +930,23 @@ function calculateDocumentFrequencies(docs, textProperty) {
     if (!text || typeof text !== "string") {
       continue;
     }
-    const words = new Set(tokenize(text));
+    const words = new Set(tokenize2(text));
     for (const word of words) {
       df.set(word, (df.get(word) || 0) + 1);
     }
   }
   return df;
 }
-function normalizeText(text) {
+function normalizeText2(text) {
   return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
 }
-function tokenize(text) {
-  return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
+function tokenize2(text) {
+  return normalizeText2(text).split(/\s+/).filter((token) => token.length > 0);
+}
+function getPluginState(orama) {
+  return pluginStates.get(orama);
 }
 
-export { pluginFuzzyPhrase, searchWithFuzzyPhrase };
+export { createOptimizedSearch, getPluginState, normalizeText as normalizeTextOptimized, pluginFuzzyPhrase, searchWithFuzzyPhrase, searchWithQPSPruning, tokenize as tokenizeOptimized };
 //# sourceMappingURL=out.js.map
 //# sourceMappingURL=index.js.map
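
The widened export list is what makes the optimized path usable from outside. A hypothetical wiring sketch (db and qpsIndex are assumed to exist; nothing in this file builds the QPS index, it only reads qpsIndex.indexes and qpsIndex.stats):

import {
  createOptimizedSearch,
  getPluginState,
  tokenizeOptimized,
} from "@wcs-colab/plugin-fuzzy-phrase";

declare const db: any;       // Orama instance with pluginFuzzyPhrase installed (assumed)
declare const qpsIndex: any; // QPS radix index built elsewhere (assumed)

const pluginState = getPluginState(db);
if (!pluginState) throw new Error("Fuzzy Phrase Plugin not initialized");

const search = createOptimizedSearch(db, qpsIndex, pluginState, {
  maxQPSCandidates: 100, // same values as DEFAULT_OPTIMIZED_CONFIG above
  minQPSScore: 0.1,
});
const { hits, elapsed } = await search({ term: "l'eglise saint-pierre" }, "french");
console.log(`${hits.length} hits in ${elapsed.formatted}`);

// tokenizeOptimized applies the normalization used throughout this file:
// lowercasing, NFD accent stripping, French elision, punctuation to spaces.
console.log(tokenizeOptimized("L'église Saint-Pierre"));
// -> ["eglise", "saint", "pierre"]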