@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.25 → 3.1.16-custom.newbase.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -417,6 +417,232 @@ function deduplicatePhrases(phrases) {
417
417
  return result.sort((a, b) => b.score - a.score);
418
418
  }
419
419
 
420
// src/optimized.ts

/**
 * Default tuning knobs for the QPS-pruned "optimized" search path.
 * Any of these can be overridden per call via the `config` argument
 * of searchWithQPSPruning / createOptimizedSearch.
 */
var DEFAULT_OPTIMIZED_CONFIG = {
  // Cap the number of QPS candidates forwarded to phrase scoring.
  maxQPSCandidates: 100,
  // Keep only candidates scoring at least 10% of the best QPS score.
  minQPSScore: 0.1,
  // Fuzzy (tolerant) radix matching unless explicitly set to exact.
  qpsExact: false,
  // Allow one edit of distance when fuzzy matching tokens.
  qpsTolerance: 1
};
431
/**
 * Normalize raw text for matching: lowercase, strip diacritics, drop
 * French elisions (l', d', c', ...), unify quote characters, turn
 * punctuation into spaces and collapse whitespace runs.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Normalized, trimmed text.
 */
function normalizeText(text) {
  const lowered = text.toLowerCase().normalize("NFD");
  const stripped = lowered
    // remove combining diacritical marks left by NFD decomposition
    .replace(/[\u0300-\u036f]/g, "")
    // French elision: a single consonant + apostrophe before a word becomes a space
    .replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ")
    // drop any remaining apostrophe-like characters
    .replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "")
    // curly double quotes -> straight quote
    .replace(/[\u201c\u201d]/g, '"')
    // punctuation and dashes become token separators
    .replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ");
  return stripped.replace(/\s+/g, " ").trim();
}
434
/**
 * Split normalized text into non-empty whitespace-delimited tokens.
 *
 * @param {string} text - Raw input text.
 * @returns {string[]} Normalized tokens (possibly empty array).
 */
function tokenize(text) {
  const tokens = [];
  for (const piece of normalizeText(text).split(/\s+/)) {
    if (piece.length > 0) {
      tokens.push(piece);
    }
  }
  return tokens;
}
437
/**
 * Cheap first-pass candidate search over the QPS radix indexes.
 *
 * For each query token, looks up (exactly or fuzzily) matching words in the
 * radix tree of every requested property and accumulates a per-document
 * score plus a bitmask of which query-token positions matched. Adjacent
 * matched positions earn a small phrase-proximity bonus.
 *
 * @param {string} term - Raw search term.
 * @param {object} qpsIndex - QPS index: `{ indexes: {prop: {type, node}}, stats: {prop: {tokensLength, tokenQuantums}} }`.
 * @param {object} tokenizer - Orama tokenizer exposing `tokenize(term, language)`.
 * @param {string[]} properties - Properties whose indexes are searched.
 * @param {object} config - Overrides for DEFAULT_OPTIMIZED_CONFIG (qpsExact, qpsTolerance, qpsBoostPerProp).
 * @param {string} language - Language passed through to the tokenizer.
 * @returns {Array<[string, number]>} `[docId, score]` pairs sorted by descending score.
 */
function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
  const tokens = tokenizer.tokenize(term, language);
  if (tokens.length === 0) {
    return [];
  }
  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
  const boostPerProp = config.qpsBoostPerProp ?? {};
  // docId -> [accumulated score, bitmask of matched query-token positions]
  const resultMap = /* @__PURE__ */ new Map();
  for (const prop of properties) {
    const indexEntry = qpsIndex.indexes[prop];
    if (!indexEntry || indexEntry.type !== "Radix") {
      continue;
    }
    const radixNode = indexEntry.node;
    const stats = qpsIndex.stats[prop];
    if (!radixNode || !stats) {
      continue;
    }
    const boost = boostPerProp[prop] ?? 1;
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      const matches = radixNode.find({
        term: token,
        exact,
        tolerance: exact ? 0 : tolerance
      });
      for (const [matchedWord, docIds] of Object.entries(matches)) {
        if (!Array.isArray(docIds))
          continue;
        const isExactMatch = matchedWord === token;
        for (const docId of docIds) {
          const tokensLength = stats.tokensLength.get(docId) || 1;
          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
          // occurrence count is packed in the high bits of the quantum (>> 20)
          const occurrences = quantum ? quantum >> 20 : 1;
          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
          if (!resultMap.has(docId)) {
            resultMap.set(docId, [scoreContrib, 1 << i]);
          } else {
            const [prevScore, prevMask] = resultMap.get(docId);
            // FIX: the original used `prevMask >> 1 & 1 << i`, which tests
            // query position i + 1 — a bit that can never be set yet while
            // position i is being processed, so the bonus was always 0.
            // Shifting left tests position i - 1: the document also matched
            // the previous query token, i.e. the two tokens are adjacent.
            const adjacencyBonus = countSetBits(prevMask << 1 & 1 << i) * 2;
            resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | 1 << i]);
          }
        }
      }
    }
  }
  const results = Array.from(resultMap.entries()).map(([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
  return results;
}

/**
 * Count the number of set bits in a 32-bit integer.
 *
 * @param {number} n - Integer bitmask (treated as 32-bit).
 * @returns {number} Population count of `n`.
 */
function countSetBits(n) {
  let count = 0;
  while (n) {
    count += n & 1;
    // FIX: `>>=` sign-extends and never terminates for negative input
    // (possible once bit 31 is used in a mask); `>>>=` shifts in zeros.
    n >>>= 1;
  }
  return count;
}
495
/**
 * Two-stage optimized search: a cheap QPS pass prunes the document set,
 * then full fuzzy phrase scoring runs only over the surviving candidates.
 *
 * Stage 1: searchQPS ranks documents from the radix indexes; results are
 * cut to `maxQPSCandidates` and to scores >= `minQPSScore` * bestScore.
 * Stage 2: findPhrasesInDocument scores phrases in each surviving doc.
 *
 * @param {object} orama - Orama database instance (provides tokenizer, data.index, data.docs).
 * @param {object} qpsIndex - QPS radix index consumed by searchQPS.
 * @param {object} pluginState - Fuzzy-phrase plugin state (config, vocabulary, synonymMap, documentFrequency, totalDocuments).
 * @param {object} params - Search params: `term`, optional `properties`, `tokenCache`, `limit`.
 * @param {object} [config] - Optimized-search overrides (see DEFAULT_OPTIMIZED_CONFIG).
 * @param {string} [language] - Tokenizer language, defaults to "french".
 * @returns {Promise<object>} `{ elapsed, hits, count }`; `elapsed` also carries qpsTime/phraseTime on success.
 */
async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
  const startTime = performance.now();
  const { term, properties, tokenCache } = params;
  // Guard: a missing or non-string term short-circuits with an empty result.
  if (!term || typeof term !== "string") {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const textProperty = properties && properties[0] || pluginState.config.textProperty;
  const searchProperties = properties || [textProperty];
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  // Tolerance may scale with the query (calculateAdaptiveTolerance is defined elsewhere in this bundle).
  const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
  // --- Stage 1: QPS candidate generation ---
  const qpsStartTime = performance.now();
  const tokenizer = orama.tokenizer;
  const qpsCandidates = searchQPS(
    term,
    qpsIndex,
    tokenizer,
    searchProperties,
    config,
    language
  );
  const qpsTime = performance.now() - qpsStartTime;
  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
  if (qpsCandidates.length === 0) {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  // Prune candidates: relative score floor, then hard cap.
  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
  const bestScore = qpsCandidates[0][1];
  const minScore = bestScore * minScoreRatio;
  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));
  // Lazily build the vocabulary from the radix tree on first use.
  let vocabulary = pluginState.vocabulary;
  if (vocabulary.size === 0) {
    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
    try {
      const indexData = orama.data?.index;
      let radixNode = null;
      // Two index layouts are probed here — presumably to cover different
      // Orama versions; confirm against the Orama release in use.
      if (indexData?.indexes?.[textProperty]?.node) {
        radixNode = indexData.indexes[textProperty].node;
      } else if (indexData?.[textProperty]?.node) {
        radixNode = indexData[textProperty].node;
      }
      if (radixNode) {
        pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
        vocabulary = pluginState.vocabulary;
        console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
      } else {
        console.error("\u274C Radix tree not found for vocabulary extraction");
        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
      }
    } catch (error) {
      console.error("\u274C Failed to extract vocabulary:", error);
      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
    }
  }
  // Fuzzy expansion of the query tokens against the vocabulary (plus synonyms when enabled).
  const candidatesMap = findAllCandidates(
    queryTokens,
    vocabulary,
    tolerance,
    pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
    pluginState.config.synonymMatchScore
  );
  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
  // --- Stage 2: phrase scoring restricted to the QPS candidate set ---
  const phraseStartTime = performance.now();
  const documentMatches = [];
  let docs = {};
  if (orama.data?.docs?.docs) {
    docs = orama.data.docs.docs;
  }
  let docsScored = 0;
  for (const [docId, doc] of Object.entries(docs)) {
    // Skip any document the QPS pass did not nominate.
    if (!candidateDocIds.has(docId)) {
      continue;
    }
    docsScored++;
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    // Reuse pre-tokenized documents from the caller-supplied cache when available.
    let docTokens;
    if (tokenCache && tokenCache.has(docId)) {
      docTokens = tokenCache.get(docId);
    } else {
      docTokens = tokenize(text);
    }
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredFuzzyCandidates,
      {
        weights: pluginState.config.weights,
        maxGap: pluginState.config.maxGap,
        proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
        tolerance
      },
      pluginState.documentFrequency,
      pluginState.totalDocuments,
      queryTokens
    );
    if (phrases.length > 0) {
      // Document score = best phrase score found within it.
      const docScore = Math.max(...phrases.map((p) => p.score));
      documentMatches.push({
        id: docId,
        phrases,
        score: docScore,
        document: doc
      });
    }
  }
  const phraseTime = performance.now() - phraseStartTime;
  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
  documentMatches.sort((a, b) => b.score - a.score);
  // Optional absolute score floor applied after sorting.
  let finalMatches = documentMatches;
  if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
    const threshold = pluginState.config.finalScoreMinimum;
    const beforeCount = finalMatches.length;
    finalMatches = finalMatches.filter((m) => m.score >= threshold);
    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
  }
  const limit = params.limit ?? finalMatches.length;
  const limitedMatches = finalMatches.slice(0, limit);
  const hits = limitedMatches.map((match) => ({
    id: match.id,
    score: match.score,
    document: match.document,
    _phrases: match.phrases
  }));
  const elapsed = performance.now() - startTime;
  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      // raw elapsed is reported in nanoseconds (ms * 1e6), matching Orama's convention
      raw: Math.floor(elapsed * 1e6),
      qpsTime,
      phraseTime
    },
    hits,
    count: hits.length
  };
}
640
/**
 * Build a reusable search closure bound to one Orama instance, its QPS
 * index, the plugin state and a fixed optimized-search configuration.
 *
 * @param {object} orama - Orama database instance.
 * @param {object} qpsIndex - QPS radix index with per-property stats.
 * @param {object} pluginState - Internal fuzzy-phrase plugin state.
 * @param {object} [config] - Optimized-search overrides (see DEFAULT_OPTIMIZED_CONFIG).
 * @returns {(params: object, language?: string) => Promise<object>} Bound search function.
 */
function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
  const boundSearch = async (params, language = "french") =>
    searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
  return boundSearch;
}
645
+
420
646
  // src/index.ts
421
647
  var DEFAULT_CONFIG = {
422
648
  textProperty: "normalized_content",
@@ -530,12 +756,13 @@ async function searchWithFuzzyPhrase(orama, params, language) {
530
756
  console.error("\u274C Plugin state not initialized");
531
757
  throw new Error("Fuzzy Phrase Plugin not properly initialized");
532
758
  }
533
- const { term, properties, tokenCache } = params;
759
+ const { term, properties, tokenCache, candidateIds } = params;
760
+ const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
534
761
  if (!term || typeof term !== "string") {
535
762
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
536
763
  }
537
764
  const textProperty = properties && properties[0] || state.config.textProperty;
538
- const queryTokens = tokenize(term);
765
+ const queryTokens = tokenize2(term);
539
766
  if (queryTokens.length === 0) {
540
767
  return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
541
768
  }
@@ -602,8 +829,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
602
829
  });
603
830
  }
604
831
  const cacheHits = tokenCache ? tokenCache.size : 0;
605
- console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
832
+ const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
833
+ console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
606
834
  for (const [docId, doc] of Object.entries(docs)) {
835
+ if (candidateIdSet) {
836
+ const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
837
+ if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
838
+ continue;
839
+ }
840
+ }
607
841
  const text = doc[textProperty];
608
842
  if (!text || typeof text !== "string") {
609
843
  continue;
@@ -612,7 +846,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
612
846
  if (tokenCache && tokenCache.has(docId)) {
613
847
  docTokens = tokenCache.get(docId);
614
848
  } else {
615
- docTokens = tokenize(text);
849
+ docTokens = tokenize2(text);
616
850
  }
617
851
  const phrases = findPhrasesInDocument(
618
852
  docTokens,
@@ -698,21 +932,29 @@ function calculateDocumentFrequencies(docs, textProperty) {
698
932
  if (!text || typeof text !== "string") {
699
933
  continue;
700
934
  }
701
- const words = new Set(tokenize(text));
935
+ const words = new Set(tokenize2(text));
702
936
  for (const word of words) {
703
937
  df.set(word, (df.get(word) || 0) + 1);
704
938
  }
705
939
  }
706
940
  return df;
707
941
  }
708
/**
 * Normalize raw text for matching: lowercase, strip diacritics, drop
 * French elisions, unify quotes, replace punctuation with spaces and
 * collapse whitespace. (Same pipeline as the optimized variant.)
 *
 * @param {string} text - Raw input text.
 * @returns {string} Normalized, trimmed text.
 */
function normalizeText2(text) {
  // Ordered replacement pipeline applied after lowercasing + NFD decomposition.
  const steps = [
    [/[\u0300-\u036f]/g, ""], // combining diacritical marks
    [/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " "], // French elisions
    [/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, ""], // leftover apostrophes
    [/[\u201c\u201d]/g, '"'], // curly double quotes -> straight
    [/[.,;:!?()[\]{}\-—–«»""]/g, " "], // punctuation -> separators
    [/\s+/g, " "] // collapse whitespace runs
  ];
  let out = text.toLowerCase().normalize("NFD");
  for (const [pattern, replacement] of steps) {
    out = out.replace(pattern, replacement);
  }
  return out.trim();
}
711
/**
 * Split normalized text into non-empty whitespace-delimited tokens.
 *
 * @param {string} text - Raw input text.
 * @returns {string[]} Normalized tokens (possibly empty array).
 */
function tokenize2(text) {
  const tokens = [];
  for (const piece of normalizeText2(text).split(/\s+/)) {
    if (piece.length > 0) {
      tokens.push(piece);
    }
  }
  return tokens;
}
948
/**
 * Look up the internal plugin state registered for a given Orama instance.
 * `pluginStates` is a module-level map keyed by the Orama instance —
 * defined elsewhere in this bundle (presumably populated by
 * pluginFuzzyPhrase during initialization; verify against that code).
 *
 * @param {object} orama - The Orama database instance used as the key.
 * @returns {object|undefined} The plugin state, or undefined if the plugin
 *   was never initialized for this instance.
 */
function getPluginState(orama) {
  return pluginStates.get(orama);
}
714
951
 
952
+ exports.createOptimizedSearch = createOptimizedSearch;
953
+ exports.getPluginState = getPluginState;
954
+ exports.normalizeTextOptimized = normalizeText;
715
955
  exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
716
956
  exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
957
+ exports.searchWithQPSPruning = searchWithQPSPruning;
958
+ exports.tokenizeOptimized = tokenize;
717
959
  //# sourceMappingURL=out.js.map
718
960
  //# sourceMappingURL=index.cjs.map