@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.25 → 3.1.16-custom.newbase.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +250 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +110 -2
- package/dist/index.d.ts +110 -2
- package/dist/index.js +246 -9
- package/dist/index.js.map +1 -1
- package/package.json +64 -62
package/dist/index.cjs
CHANGED
|
@@ -417,6 +417,232 @@ function deduplicatePhrases(phrases) {
|
|
|
417
417
|
return result.sort((a, b) => b.score - a.score);
|
|
418
418
|
}
|
|
419
419
|
|
|
420
|
+
// src/optimized.ts
|
|
421
|
+
/**
 * Fallback settings for the QPS-pruned ("optimized") search path.
 * A value here is used whenever the caller's config leaves the matching
 * key null or undefined.
 */
var DEFAULT_OPTIMIZED_CONFIG = {
  // Limit phrase scoring to top 100 candidates
  maxQPSCandidates: 100,
  // Include candidates with 10%+ of best score
  minQPSScore: 0.1,
  // Use fuzzy matching by default
  qpsExact: false,
  // Default tolerance of 1 edit distance
  qpsTolerance: 1
};
|
|
431
|
+
/**
 * Reduces text to a lowercase, accent-free, punctuation-free form.
 *
 * Steps, in order: lowercase, NFD-decompose, then apply the replacement
 * passes listed below, and finally trim the ends.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Normalized text with single spaces between words.
 */
function normalizeText(text) {
  // Ordered [pattern, replacement] passes. Order matters: the elision pass
  // must see apostrophes before the next pass deletes them, and curly double
  // quotes are straightened before the punctuation pass blanks them out.
  const passes = [
    // Combining diacritical marks left behind by NFD decomposition.
    [/[\u0300-\u036f]/g, ""],
    // One of l/d/c/j/m/n/s/t at a word start + apostrophe-like char + word char
    // (e.g. the "l'" in "l'ecole") becomes a space.
    [/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " "],
    // Any remaining apostrophe-like character is dropped.
    [/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, ""],
    // Curly double quotes become straight ones.
    [/[\u201c\u201d]/g, '"'],
    // Punctuation, brackets, dashes and quotes become spaces.
    [/[.,;:!?()[\]{}\-—–«»""]/g, " "],
    // Collapse whitespace runs.
    [/\s+/g, " "]
  ];
  let normalized = text.toLowerCase().normalize("NFD");
  for (const [pattern, replacement] of passes) {
    normalized = normalized.replace(pattern, replacement);
  }
  return normalized.trim();
}
|
|
434
|
+
/**
 * Splits text into normalized word tokens.
 *
 * @param {string} text - Raw input text; it is passed through normalizeText first.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize(text) {
  const pieces = normalizeText(text).split(/\s+/);
  const tokens = [];
  for (const piece of pieces) {
    // Splitting an empty string yields one empty piece; drop it.
    if (piece.length > 0) {
      tokens.push(piece);
    }
  }
  return tokens;
}
|
|
437
|
+
/**
 * Scores documents against a query using the per-property radix indexes
 * held in `qpsIndex`, and returns them best-first.
 *
 * For every searched property and every query token, the radix node is asked
 * for matching words. Each (word, document) hit contributes
 * `(occurrences^2 / tokensLength + (exact word match ? 1 : 0)) * boost`.
 *
 * @param {string} term - Raw query string.
 * @param {object} qpsIndex - Object exposing `indexes[prop]` ({ type, node }) and `stats[prop]`.
 * @param {object} tokenizer - Object exposing `tokenize(term, language)`.
 * @param {Iterable<string>} properties - Property names to search.
 * @param {object} config - May set `qpsExact`, `qpsTolerance`, `qpsBoostPerProp`.
 * @param {string} language - Forwarded to the tokenizer.
 * @returns {Array<[*, number]>} `[docId, score]` pairs sorted by descending score.
 */
function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
  const tokens = tokenizer.tokenize(term, language);
  if (tokens.length === 0) {
    return [];
  }
  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
  const boostPerProp = config.qpsBoostPerProp ?? {};
  // docId -> [accumulated score, bitmask of query-token positions matched so far]
  const resultMap = new Map();
  for (const prop of properties) {
    const indexEntry = qpsIndex.indexes[prop];
    // Only radix-indexed properties can be searched here.
    if (!indexEntry || indexEntry.type !== "Radix") {
      continue;
    }
    const radixNode = indexEntry.node;
    const stats = qpsIndex.stats[prop];
    if (!radixNode || !stats) {
      continue;
    }
    const boost = boostPerProp[prop] ?? 1;
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      // JS shift counts wrap modulo 32, so tokens past the 32nd share a bit
      // with an earlier token.
      const tokenBit = 1 << i;
      const matches = radixNode.find({
        term: token,
        exact,
        tolerance: exact ? 0 : tolerance
      });
      for (const [matchedWord, docIds] of Object.entries(matches)) {
        if (!Array.isArray(docIds)) {
          continue;
        }
        const isExactMatch = matchedWord === token;
        for (const docId of docIds) {
          const tokensLength = stats.tokensLength.get(docId) || 1;
          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
          // The occurrence count is read from the bits above bit 20 of the quantum.
          const occurrences = quantum ? quantum >> 20 : 1;
          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
          const previous = resultMap.get(docId);
          if (previous === undefined) {
            resultMap.set(docId, [scoreContrib, tokenBit]);
            continue;
          }
          const [prevScore, prevMask] = previous;
          // Unsigned shift: once bit 31 is set the mask is a negative int32, and
          // a sign-propagating shift would produce a negative test value (the
          // previous bit-counting helper never terminated on such a value).
          // A single-bit AND can only yield 0 or one bit, so test it directly.
          // NOTE(review): this checks the bit of the FOLLOWING token (i + 1),
          // which is only ever set by an earlier property pass - confirm that
          // direction is intended.
          const adjacencyBonus = ((prevMask >>> 1) & tokenBit) !== 0 ? 2 : 0;
          resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | tokenBit]);
        }
      }
    }
  }
  return Array.from(resultMap.entries())
    .map(([docId, [score]]) => [docId, score])
    .sort((a, b) => b[1] - a[1]);
}
|
|
487
|
+
/**
 * Counts the 1-bits in the 32-bit representation of `n`.
 *
 * The input is reinterpreted as an unsigned 32-bit integer first, so negative
 * values (sign bit set) are counted correctly. A sign-propagating shift would
 * keep refilling the top bit and never reach zero.
 *
 * @param {number} n - Value whose low 32 bits are inspected.
 * @returns {number} Number of set bits, from 0 to 32.
 */
function countSetBits(n) {
  let bits = n >>> 0;
  let count = 0;
  while (bits !== 0) {
    count += bits & 1;
    bits >>>= 1;
  }
  return count;
}
|
|
495
|
+
async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
|
|
496
|
+
const startTime = performance.now();
|
|
497
|
+
const { term, properties, tokenCache } = params;
|
|
498
|
+
if (!term || typeof term !== "string") {
|
|
499
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
500
|
+
}
|
|
501
|
+
const textProperty = properties && properties[0] || pluginState.config.textProperty;
|
|
502
|
+
const searchProperties = properties || [textProperty];
|
|
503
|
+
const queryTokens = tokenize(term);
|
|
504
|
+
if (queryTokens.length === 0) {
|
|
505
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
506
|
+
}
|
|
507
|
+
const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
|
|
508
|
+
console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
|
|
509
|
+
const qpsStartTime = performance.now();
|
|
510
|
+
const tokenizer = orama.tokenizer;
|
|
511
|
+
const qpsCandidates = searchQPS(
|
|
512
|
+
term,
|
|
513
|
+
qpsIndex,
|
|
514
|
+
tokenizer,
|
|
515
|
+
searchProperties,
|
|
516
|
+
config,
|
|
517
|
+
language
|
|
518
|
+
);
|
|
519
|
+
const qpsTime = performance.now() - qpsStartTime;
|
|
520
|
+
console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
|
|
521
|
+
if (qpsCandidates.length === 0) {
|
|
522
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
523
|
+
}
|
|
524
|
+
const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
|
|
525
|
+
const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
|
|
526
|
+
const bestScore = qpsCandidates[0][1];
|
|
527
|
+
const minScore = bestScore * minScoreRatio;
|
|
528
|
+
const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
|
|
529
|
+
console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
|
|
530
|
+
const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));
|
|
531
|
+
let vocabulary = pluginState.vocabulary;
|
|
532
|
+
if (vocabulary.size === 0) {
|
|
533
|
+
console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
|
|
534
|
+
try {
|
|
535
|
+
const indexData = orama.data?.index;
|
|
536
|
+
let radixNode = null;
|
|
537
|
+
if (indexData?.indexes?.[textProperty]?.node) {
|
|
538
|
+
radixNode = indexData.indexes[textProperty].node;
|
|
539
|
+
} else if (indexData?.[textProperty]?.node) {
|
|
540
|
+
radixNode = indexData[textProperty].node;
|
|
541
|
+
}
|
|
542
|
+
if (radixNode) {
|
|
543
|
+
pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
544
|
+
vocabulary = pluginState.vocabulary;
|
|
545
|
+
console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
|
|
546
|
+
} else {
|
|
547
|
+
console.error("\u274C Radix tree not found for vocabulary extraction");
|
|
548
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
549
|
+
}
|
|
550
|
+
} catch (error) {
|
|
551
|
+
console.error("\u274C Failed to extract vocabulary:", error);
|
|
552
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
553
|
+
}
|
|
554
|
+
}
|
|
555
|
+
const candidatesMap = findAllCandidates(
|
|
556
|
+
queryTokens,
|
|
557
|
+
vocabulary,
|
|
558
|
+
tolerance,
|
|
559
|
+
pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
|
|
560
|
+
pluginState.config.synonymMatchScore
|
|
561
|
+
);
|
|
562
|
+
const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
|
|
563
|
+
console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
|
|
564
|
+
const phraseStartTime = performance.now();
|
|
565
|
+
const documentMatches = [];
|
|
566
|
+
let docs = {};
|
|
567
|
+
if (orama.data?.docs?.docs) {
|
|
568
|
+
docs = orama.data.docs.docs;
|
|
569
|
+
}
|
|
570
|
+
let docsScored = 0;
|
|
571
|
+
for (const [docId, doc] of Object.entries(docs)) {
|
|
572
|
+
if (!candidateDocIds.has(docId)) {
|
|
573
|
+
continue;
|
|
574
|
+
}
|
|
575
|
+
docsScored++;
|
|
576
|
+
const text = doc[textProperty];
|
|
577
|
+
if (!text || typeof text !== "string") {
|
|
578
|
+
continue;
|
|
579
|
+
}
|
|
580
|
+
let docTokens;
|
|
581
|
+
if (tokenCache && tokenCache.has(docId)) {
|
|
582
|
+
docTokens = tokenCache.get(docId);
|
|
583
|
+
} else {
|
|
584
|
+
docTokens = tokenize(text);
|
|
585
|
+
}
|
|
586
|
+
const phrases = findPhrasesInDocument(
|
|
587
|
+
docTokens,
|
|
588
|
+
filteredFuzzyCandidates,
|
|
589
|
+
{
|
|
590
|
+
weights: pluginState.config.weights,
|
|
591
|
+
maxGap: pluginState.config.maxGap,
|
|
592
|
+
proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
|
|
593
|
+
tolerance
|
|
594
|
+
},
|
|
595
|
+
pluginState.documentFrequency,
|
|
596
|
+
pluginState.totalDocuments,
|
|
597
|
+
queryTokens
|
|
598
|
+
);
|
|
599
|
+
if (phrases.length > 0) {
|
|
600
|
+
const docScore = Math.max(...phrases.map((p) => p.score));
|
|
601
|
+
documentMatches.push({
|
|
602
|
+
id: docId,
|
|
603
|
+
phrases,
|
|
604
|
+
score: docScore,
|
|
605
|
+
document: doc
|
|
606
|
+
});
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
const phraseTime = performance.now() - phraseStartTime;
|
|
610
|
+
console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
|
|
611
|
+
documentMatches.sort((a, b) => b.score - a.score);
|
|
612
|
+
let finalMatches = documentMatches;
|
|
613
|
+
if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
|
|
614
|
+
const threshold = pluginState.config.finalScoreMinimum;
|
|
615
|
+
const beforeCount = finalMatches.length;
|
|
616
|
+
finalMatches = finalMatches.filter((m) => m.score >= threshold);
|
|
617
|
+
console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
|
|
618
|
+
}
|
|
619
|
+
const limit = params.limit ?? finalMatches.length;
|
|
620
|
+
const limitedMatches = finalMatches.slice(0, limit);
|
|
621
|
+
const hits = limitedMatches.map((match) => ({
|
|
622
|
+
id: match.id,
|
|
623
|
+
score: match.score,
|
|
624
|
+
document: match.document,
|
|
625
|
+
_phrases: match.phrases
|
|
626
|
+
}));
|
|
627
|
+
const elapsed = performance.now() - startTime;
|
|
628
|
+
console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
|
|
629
|
+
return {
|
|
630
|
+
elapsed: {
|
|
631
|
+
formatted: `${elapsed.toFixed(2)}ms`,
|
|
632
|
+
raw: Math.floor(elapsed * 1e6),
|
|
633
|
+
qpsTime,
|
|
634
|
+
phraseTime
|
|
635
|
+
},
|
|
636
|
+
hits,
|
|
637
|
+
count: hits.length
|
|
638
|
+
};
|
|
639
|
+
}
|
|
640
|
+
/**
 * Builds a search function bound to one Orama instance, QPS index, plugin
 * state and config.
 *
 * @param {object} orama - Forwarded to searchWithQPSPruning.
 * @param {object} qpsIndex - Forwarded to searchWithQPSPruning.
 * @param {object} pluginState - Forwarded to searchWithQPSPruning.
 * @param {object} [config={}] - Forwarded to searchWithQPSPruning.
 * @returns {Function} Async `(params, language = "french")` function that
 *   resolves to the result of searchWithQPSPruning.
 */
function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
  return async (params, language = "french") =>
    searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
}
|
|
645
|
+
|
|
420
646
|
// src/index.ts
|
|
421
647
|
var DEFAULT_CONFIG = {
|
|
422
648
|
textProperty: "normalized_content",
|
|
@@ -530,12 +756,13 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
530
756
|
console.error("\u274C Plugin state not initialized");
|
|
531
757
|
throw new Error("Fuzzy Phrase Plugin not properly initialized");
|
|
532
758
|
}
|
|
533
|
-
const { term, properties, tokenCache } = params;
|
|
759
|
+
const { term, properties, tokenCache, candidateIds } = params;
|
|
760
|
+
const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
|
|
534
761
|
if (!term || typeof term !== "string") {
|
|
535
762
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
536
763
|
}
|
|
537
764
|
const textProperty = properties && properties[0] || state.config.textProperty;
|
|
538
|
-
const queryTokens =
|
|
765
|
+
const queryTokens = tokenize2(term);
|
|
539
766
|
if (queryTokens.length === 0) {
|
|
540
767
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
541
768
|
}
|
|
@@ -602,8 +829,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
602
829
|
});
|
|
603
830
|
}
|
|
604
831
|
const cacheHits = tokenCache ? tokenCache.size : 0;
|
|
605
|
-
|
|
832
|
+
const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
|
|
833
|
+
console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
|
|
606
834
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
835
|
+
if (candidateIdSet) {
|
|
836
|
+
const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
|
|
837
|
+
if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
|
|
838
|
+
continue;
|
|
839
|
+
}
|
|
840
|
+
}
|
|
607
841
|
const text = doc[textProperty];
|
|
608
842
|
if (!text || typeof text !== "string") {
|
|
609
843
|
continue;
|
|
@@ -612,7 +846,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
612
846
|
if (tokenCache && tokenCache.has(docId)) {
|
|
613
847
|
docTokens = tokenCache.get(docId);
|
|
614
848
|
} else {
|
|
615
|
-
docTokens =
|
|
849
|
+
docTokens = tokenize2(text);
|
|
616
850
|
}
|
|
617
851
|
const phrases = findPhrasesInDocument(
|
|
618
852
|
docTokens,
|
|
@@ -698,21 +932,29 @@ function calculateDocumentFrequencies(docs, textProperty) {
|
|
|
698
932
|
if (!text || typeof text !== "string") {
|
|
699
933
|
continue;
|
|
700
934
|
}
|
|
701
|
-
const words = new Set(
|
|
935
|
+
const words = new Set(tokenize2(text));
|
|
702
936
|
for (const word of words) {
|
|
703
937
|
df.set(word, (df.get(word) || 0) + 1);
|
|
704
938
|
}
|
|
705
939
|
}
|
|
706
940
|
return df;
|
|
707
941
|
}
|
|
708
|
-
function
|
|
942
|
+
/**
 * Reduces text to a lowercase, accent-free, punctuation-free form.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Normalized text with single spaces between words.
 */
function normalizeText2(text) {
  // Lowercase and decompose so accents become separate combining marks.
  let result = text.toLowerCase().normalize("NFD");
  // Drop the combining marks.
  result = result.replace(/[\u0300-\u036f]/g, "");
  // One of l/d/c/j/m/n/s/t at a word start + apostrophe-like char + word char
  // (e.g. the "l'" in "l'ecole") becomes a space.
  result = result.replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ");
  // Remaining apostrophe-like characters are removed outright.
  result = result.replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "");
  // Straighten curly double quotes, then blank out punctuation.
  result = result.replace(/[\u201c\u201d]/g, '"');
  result = result.replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ");
  // Collapse whitespace runs and trim the ends.
  result = result.replace(/\s+/g, " ");
  return result.trim();
}
|
|
711
|
-
function
|
|
712
|
-
return
|
|
945
|
+
/**
 * Splits text into normalized word tokens.
 *
 * @param {string} text - Raw input text; it is passed through normalizeText2 first.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize2(text) {
  const normalized = normalizeText2(text);
  const parts = normalized.split(/\s+/);
  // Splitting an empty string yields one empty part; keep only real tokens.
  return parts.filter((part) => part.length !== 0);
}
|
|
948
|
+
/**
 * Looks up the plugin state stored for an Orama instance.
 *
 * @param {object} orama - Key used for the lookup in the module-level `pluginStates`.
 * @returns {*} Whatever `pluginStates.get(orama)` returns
 *   (presumably undefined when nothing is registered - verify against `pluginStates`).
 */
function getPluginState(orama) {
  const state = pluginStates.get(orama);
  return state;
}
|
|
714
951
|
|
|
952
|
+
exports.createOptimizedSearch = createOptimizedSearch;
|
|
953
|
+
exports.getPluginState = getPluginState;
|
|
954
|
+
exports.normalizeTextOptimized = normalizeText;
|
|
715
955
|
exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
|
|
716
956
|
exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
|
|
957
|
+
exports.searchWithQPSPruning = searchWithQPSPruning;
|
|
958
|
+
exports.tokenizeOptimized = tokenize;
|
|
717
959
|
//# sourceMappingURL=out.js.map
|
|
718
960
|
//# sourceMappingURL=index.cjs.map
|