@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.24 → 3.1.16-custom.newbase.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +259 -17
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +110 -2
- package/dist/index.d.ts +110 -2
- package/dist/index.js +255 -18
- package/dist/index.js.map +1 -1
- package/package.json +64 -62
package/dist/index.js
CHANGED
|
@@ -190,6 +190,15 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
190
190
|
candidateLookup.get(candidate.word).push({ queryToken, candidate });
|
|
191
191
|
}
|
|
192
192
|
}
|
|
193
|
+
for (const entries of candidateLookup.values()) {
|
|
194
|
+
entries.sort((a, b) => {
|
|
195
|
+
if (a.candidate.type === "exact" && b.candidate.type !== "exact")
|
|
196
|
+
return -1;
|
|
197
|
+
if (b.candidate.type === "exact" && a.candidate.type !== "exact")
|
|
198
|
+
return 1;
|
|
199
|
+
return b.candidate.score - a.candidate.score;
|
|
200
|
+
});
|
|
201
|
+
}
|
|
193
202
|
for (let i = 0; i < documentTokens.length; i++) {
|
|
194
203
|
const docWord = documentTokens[i];
|
|
195
204
|
const matches = candidateLookup.get(docWord);
|
|
@@ -206,15 +215,6 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
206
215
|
}
|
|
207
216
|
}
|
|
208
217
|
}
|
|
209
|
-
wordMatches.sort((a, b) => {
|
|
210
|
-
if (a.position !== b.position)
|
|
211
|
-
return a.position - b.position;
|
|
212
|
-
if (a.type === "exact" && b.type !== "exact")
|
|
213
|
-
return -1;
|
|
214
|
-
if (b.type === "exact" && a.type !== "exact")
|
|
215
|
-
return 1;
|
|
216
|
-
return b.score - a.score;
|
|
217
|
-
});
|
|
218
218
|
for (let i = 0; i < wordMatches.length; i++) {
|
|
219
219
|
const phrase = buildPhraseFromPosition(
|
|
220
220
|
wordMatches,
|
|
@@ -415,6 +415,232 @@ function deduplicatePhrases(phrases) {
|
|
|
415
415
|
return result.sort((a, b) => b.score - a.score);
|
|
416
416
|
}
|
|
417
417
|
|
|
418
|
+
// src/optimized.ts
|
|
419
|
+
/**
 * Fallback values for the QPS-pruned search path. Each entry is used when the
 * caller-supplied config leaves the same key null or undefined.
 */
var DEFAULT_OPTIMIZED_CONFIG = {
  // Limit phrase scoring to top 100 candidates
  maxQPSCandidates: 100,

  // Include candidates with 10%+ of best score
  minQPSScore: 0.1,

  // Use fuzzy matching by default
  qpsExact: false,

  // Default tolerance of 1 edit distance
  qpsTolerance: 1
};
|
|
429
|
+
/**
 * Canonicalizes text for matching.
 *
 * Steps, in order: lowercase, NFD-decompose and drop combining diacritics,
 * replace an elided single-letter prefix plus apostrophe (l', d', c', j', m',
 * n', s', t') with a space, delete remaining apostrophe-like characters,
 * straighten curly double quotes, turn punctuation into spaces, then collapse
 * whitespace runs and trim.
 *
 * @param {string} text - Raw input text.
 * @returns {string} The normalized text, words separated by single spaces.
 */
function normalizeText(text) {
  // Decompose so that accents become separate combining marks we can strip.
  let out = text.toLowerCase().normalize("NFD");
  out = out.replace(/[\u0300-\u036f]/g, "");

  // Elisions first (so "l'ecole" keeps only "ecole"), then any leftover apostrophes.
  out = out.replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ");
  out = out.replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "");

  // Curly double quotes become straight ones, which the punctuation pass then blanks out.
  out = out.replace(/[\u201c\u201d]/g, '"');
  out = out.replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ");

  return out.replace(/\s+/g, " ").trim();
}
|
|
432
|
+
/**
 * Splits text into normalized word tokens.
 *
 * @param {string} text - Raw input text; it is passed through normalizeText first.
 * @returns {string[]} The non-empty whitespace-separated pieces of the normalized text.
 */
function tokenize(text) {
  const words = [];
  for (const piece of normalizeText(text).split(/\s+/)) {
    // Splitting an empty string yields one empty piece; drop it.
    if (piece.length > 0) {
      words.push(piece);
    }
  }
  return words;
}
|
|
435
|
+
/**
 * Scores documents against a query using the QPS index and returns them
 * ranked best-first.
 *
 * For every searched property that has a "Radix" index entry and stats, each
 * query token is looked up in the radix node. Every (matched word, document)
 * pair contributes `(occurrences^2 / tokensLength + 1 if the matched word
 * equals the token) * boost`. A document that already matched the preceding
 * query token gets an extra +2 adjacency bonus.
 *
 * @param {string} term - Raw query string, tokenized with `tokenizer`.
 * @param {object} qpsIndex - Object exposing `indexes[prop]` ({ type, node }) and
 *   `stats[prop]` ({ tokensLength: Map, tokenQuantums }).
 * @param {object} tokenizer - Object with a `tokenize(term, language)` method.
 * @param {Iterable<string>} properties - Property names to search.
 * @param {object} config - May set `qpsExact`, `qpsTolerance`, `qpsBoostPerProp`.
 * @param {string} language - Forwarded to the tokenizer.
 * @returns {Array<[*, number]>} `[docId, score]` pairs sorted by descending score.
 */
function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
  const tokens = tokenizer.tokenize(term, language);
  if (tokens.length === 0) {
    return [];
  }
  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
  const boostPerProp = config.qpsBoostPerProp ?? {};
  // docId -> [accumulated score, bitmask of query-token indexes matched so far]
  const resultMap = new Map();
  for (const prop of properties) {
    const indexEntry = qpsIndex.indexes[prop];
    if (!indexEntry || indexEntry.type !== "Radix") {
      continue;
    }
    const radixNode = indexEntry.node;
    const stats = qpsIndex.stats[prop];
    if (!radixNode || !stats) {
      continue;
    }
    const boost = boostPerProp[prop] ?? 1;
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      // JS shifts are 32-bit, so bit positions wrap for queries longer than 32 tokens.
      const tokenBit = 1 << i;
      // Bit of the query token immediately before this one; the first token has none.
      // The previous expression `prevMask >> 1 & 1 << i` tested bit i+1 instead,
      // which is never set while tokens are processed in ascending order.
      const previousTokenBit = i > 0 ? 1 << (i - 1) : 0;
      const matches = radixNode.find({
        term: token,
        exact,
        tolerance: exact ? 0 : tolerance
      });
      for (const [matchedWord, docIds] of Object.entries(matches)) {
        if (!Array.isArray(docIds)) {
          continue;
        }
        const isExactMatch = matchedWord === token;
        for (const docId of docIds) {
          const tokensLength = stats.tokensLength.get(docId) || 1;
          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
          // The occurrence count is read from the bits above the low 20 of the quantum.
          const occurrences = quantum ? quantum >> 20 : 1;
          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
          const previous = resultMap.get(docId);
          if (previous === undefined) {
            resultMap.set(docId, [scoreContrib, tokenBit]);
            continue;
          }
          const [prevScore, prevMask] = previous;
          const adjacencyBonus = (prevMask & previousTokenBit) !== 0 ? 2 : 0;
          resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | tokenBit]);
        }
      }
    }
  }
  return Array.from(resultMap, ([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
}
|
|
485
|
+
/**
 * Counts the 1-bits in the 32-bit representation of `n`.
 *
 * The value is coerced to an unsigned 32-bit integer first and shifted with
 * `>>>`. The sign-propagating `>>` never reaches zero for inputs with bit 31
 * set (e.g. `1 << 31` or `-1`), which made the loop run forever.
 *
 * @param {number} n - Value whose low 32 bits are inspected.
 * @returns {number} Number of set bits, between 0 and 32.
 */
function countSetBits(n) {
  let remaining = n >>> 0;
  let count = 0;
  while (remaining !== 0) {
    count += remaining & 1;
    remaining >>>= 1;
  }
  return count;
}
|
|
493
|
+
/**
 * Two-stage search: QPS scoring selects a bounded set of candidate documents,
 * then fuzzy phrase matching is run only on those candidates.
 *
 * Flow: tokenize the query, score documents with searchQPS, keep candidates
 * scoring at least `minQPSScore * bestScore` (capped at `maxQPSCandidates`),
 * lazily extract the vocabulary from the index if the plugin state has none,
 * build fuzzy candidates, phrase-score each candidate document, optionally
 * apply the final score minimum, then apply `params.limit`.
 *
 * @param {object} orama - Orama instance; `tokenizer`, `data.index` and `data.docs.docs` are read.
 * @param {object} qpsIndex - QPS index passed through to searchQPS.
 * @param {object} pluginState - Plugin state (`config`, `vocabulary`, `synonymMap`,
 *   `documentFrequency`, `totalDocuments`); `vocabulary` is written when extracted here.
 * @param {object} params - `{ term, properties?, tokenCache?, limit? }`.
 * @param {object} [config={}] - Overrides for `maxQPSCandidates`, `minQPSScore` and the searchQPS options.
 * @param {string} [language="french"] - Forwarded to the QPS tokenizer.
 * @returns {Promise<object>} `{ elapsed, hits, count }`; every early exit reports zero timing and no hits.
 */
async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
  const startTime = performance.now();
  // All bail-out paths return the same shape; build a fresh object per call site.
  const emptyResult = () => ({ elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 });
  const { term, properties, tokenCache } = params;
  if (!term || typeof term !== "string") {
    return emptyResult();
  }
  const pluginConfig = pluginState.config;
  const textProperty = properties?.[0] || pluginConfig.textProperty;
  // An empty `properties` array must fall back like a missing one does; otherwise
  // QPS is asked to search zero properties and the whole search returns nothing.
  const searchProperties = properties && properties.length !== 0 ? properties : [textProperty];
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return emptyResult();
  }
  const tolerance = pluginConfig.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginConfig.tolerance) : pluginConfig.tolerance;
  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);

  // Stage 1: QPS candidate selection.
  const qpsStartTime = performance.now();
  const qpsCandidates = searchQPS(term, qpsIndex, orama.tokenizer, searchProperties, config, language);
  const qpsTime = performance.now() - qpsStartTime;
  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
  if (qpsCandidates.length === 0) {
    return emptyResult();
  }
  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
  // searchQPS returns results sorted best-first, so the head holds the top score.
  const bestScore = qpsCandidates[0][1];
  const minScore = bestScore * minScoreRatio;
  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
  // Stringified because the document store is walked via Object.entries (string keys).
  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));

  // Lazily build the vocabulary; a missing vocabulary is treated like an empty one.
  let vocabulary = pluginState.vocabulary;
  if (!vocabulary || vocabulary.size === 0) {
    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
    try {
      const indexData = orama.data?.index;
      // Two index layouts are probed: `index.indexes[prop].node`, then `index[prop].node`.
      const radixNode = indexData?.indexes?.[textProperty]?.node || indexData?.[textProperty]?.node || null;
      if (!radixNode) {
        console.error("\u274C Radix tree not found for vocabulary extraction");
        return emptyResult();
      }
      pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
      vocabulary = pluginState.vocabulary;
      console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
    } catch (error) {
      console.error("\u274C Failed to extract vocabulary:", error);
      return emptyResult();
    }
  }

  const candidatesMap = findAllCandidates(
    queryTokens,
    vocabulary,
    tolerance,
    pluginConfig.enableSynonyms ? pluginState.synonymMap : void 0,
    pluginConfig.synonymMatchScore
  );
  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginConfig.minScore);
  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);

  // Stage 2: phrase scoring restricted to the QPS candidates.
  const phraseStartTime = performance.now();
  const documentMatches = [];
  const docs = orama.data?.docs?.docs || {};
  const phraseConfig = {
    weights: pluginConfig.weights,
    maxGap: pluginConfig.maxGap,
    proximitySpanMultiplier: pluginConfig.proximitySpanMultiplier,
    tolerance
  };
  let docsScored = 0;
  for (const [docId, doc] of Object.entries(docs)) {
    if (!candidateDocIds.has(docId)) {
      continue;
    }
    docsScored++;
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    const docTokens = tokenCache && tokenCache.has(docId) ? tokenCache.get(docId) : tokenize(text);
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredFuzzyCandidates,
      phraseConfig,
      pluginState.documentFrequency,
      pluginState.totalDocuments,
      queryTokens
    );
    if (phrases.length > 0) {
      // A document scores as its best phrase; reduce avoids spreading a large array into Math.max.
      const docScore = phrases.reduce((best, p) => Math.max(best, p.score), -Infinity);
      documentMatches.push({
        id: docId,
        phrases,
        score: docScore,
        document: doc
      });
    }
  }
  const phraseTime = performance.now() - phraseStartTime;
  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);

  documentMatches.sort((a, b) => b.score - a.score);
  let finalMatches = documentMatches;
  if (pluginConfig.enableFinalScoreMinimum && pluginConfig.finalScoreMinimum > 0) {
    const threshold = pluginConfig.finalScoreMinimum;
    const beforeCount = finalMatches.length;
    finalMatches = finalMatches.filter((m) => m.score >= threshold);
    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
  }
  const limit = params.limit ?? finalMatches.length;
  const hits = finalMatches.slice(0, limit).map((match) => ({
    id: match.id,
    score: match.score,
    document: match.document,
    _phrases: match.phrases
  }));
  const elapsed = performance.now() - startTime;
  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      // Milliseconds scaled by 1e6 (i.e. nanoseconds), floored to an integer.
      raw: Math.floor(elapsed * 1e6),
      qpsTime,
      phraseTime
    },
    hits,
    count: hits.length
  };
}
|
|
638
|
+
/**
 * Binds an Orama instance, QPS index, plugin state and config into a reusable
 * search function.
 *
 * @param {object} orama - Orama instance forwarded to searchWithQPSPruning.
 * @param {object} qpsIndex - QPS index forwarded to searchWithQPSPruning.
 * @param {object} pluginState - Plugin state forwarded to searchWithQPSPruning.
 * @param {object} [config={}] - Optimized-search config forwarded unchanged.
 * @returns {function(object, string=): Promise<object>} Async `(params, language = "french")`
 *   function resolving to the searchWithQPSPruning result.
 */
function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
  return async (params, language = "french") =>
    searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
}
|
|
643
|
+
|
|
418
644
|
// src/index.ts
|
|
419
645
|
var DEFAULT_CONFIG = {
|
|
420
646
|
textProperty: "normalized_content",
|
|
@@ -528,12 +754,13 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
528
754
|
console.error("\u274C Plugin state not initialized");
|
|
529
755
|
throw new Error("Fuzzy Phrase Plugin not properly initialized");
|
|
530
756
|
}
|
|
531
|
-
const { term, properties, tokenCache } = params;
|
|
757
|
+
const { term, properties, tokenCache, candidateIds } = params;
|
|
758
|
+
const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
|
|
532
759
|
if (!term || typeof term !== "string") {
|
|
533
760
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
534
761
|
}
|
|
535
762
|
const textProperty = properties && properties[0] || state.config.textProperty;
|
|
536
|
-
const queryTokens =
|
|
763
|
+
const queryTokens = tokenize2(term);
|
|
537
764
|
if (queryTokens.length === 0) {
|
|
538
765
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
539
766
|
}
|
|
@@ -600,8 +827,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
600
827
|
});
|
|
601
828
|
}
|
|
602
829
|
const cacheHits = tokenCache ? tokenCache.size : 0;
|
|
603
|
-
|
|
830
|
+
const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
|
|
831
|
+
console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
|
|
604
832
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
833
|
+
if (candidateIdSet) {
|
|
834
|
+
const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
|
|
835
|
+
if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
|
|
836
|
+
continue;
|
|
837
|
+
}
|
|
838
|
+
}
|
|
605
839
|
const text = doc[textProperty];
|
|
606
840
|
if (!text || typeof text !== "string") {
|
|
607
841
|
continue;
|
|
@@ -610,7 +844,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
610
844
|
if (tokenCache && tokenCache.has(docId)) {
|
|
611
845
|
docTokens = tokenCache.get(docId);
|
|
612
846
|
} else {
|
|
613
|
-
docTokens =
|
|
847
|
+
docTokens = tokenize2(text);
|
|
614
848
|
}
|
|
615
849
|
const phrases = findPhrasesInDocument(
|
|
616
850
|
docTokens,
|
|
@@ -696,20 +930,23 @@ function calculateDocumentFrequencies(docs, textProperty) {
|
|
|
696
930
|
if (!text || typeof text !== "string") {
|
|
697
931
|
continue;
|
|
698
932
|
}
|
|
699
|
-
const words = new Set(
|
|
933
|
+
const words = new Set(tokenize2(text));
|
|
700
934
|
for (const word of words) {
|
|
701
935
|
df.set(word, (df.get(word) || 0) + 1);
|
|
702
936
|
}
|
|
703
937
|
}
|
|
704
938
|
return df;
|
|
705
939
|
}
|
|
706
|
-
function
|
|
940
|
+
/**
 * Canonicalizes text for matching: lowercases, strips diacritics, removes
 * elided prefixes and apostrophes, blanks out punctuation and collapses
 * whitespace.
 *
 * @param {string} text - Raw input text.
 * @returns {string} The normalized text, words separated by single spaces.
 */
function normalizeText2(text) {
  // Ordered [pattern, replacement] passes; order matters (elisions before bare apostrophes,
  // curly quotes straightened before the punctuation pass removes straight ones).
  const passes = [
    [/[\u0300-\u036f]/g, ""],
    [/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " "],
    [/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, ""],
    [/[\u201c\u201d]/g, '"'],
    [/[.,;:!?()[\]{}\-—–«»""]/g, " "],
    [/\s+/g, " "]
  ];
  // NFD splits accented letters into base letter + combining mark so the first pass can drop the mark.
  const decomposed = text.toLowerCase().normalize("NFD");
  const rewritten = passes.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    decomposed
  );
  return rewritten.trim();
}
|
|
709
|
-
function
|
|
710
|
-
return
|
|
943
|
+
/**
 * Splits text into normalized word tokens.
 *
 * @param {string} text - Raw input text; it is passed through normalizeText2 first.
 * @returns {string[]} The non-empty whitespace-separated pieces of the normalized text.
 */
function tokenize2(text) {
  const normalized = normalizeText2(text);
  const pieces = normalized.split(/\s+/);
  // An empty normalized string splits into [""]; keep only real tokens.
  return pieces.filter((piece) => piece.length > 0);
}
|
|
946
|
+
/**
 * Looks up the plugin state registered for an Orama instance.
 *
 * @param {object} orama - Orama instance used as the lookup key in `pluginStates`.
 * @returns {*} Whatever `pluginStates.get(orama)` yields (undefined when nothing is stored).
 */
function getPluginState(orama) {
  const state = pluginStates.get(orama);
  return state;
}
|
|
712
949
|
|
|
713
|
-
export { pluginFuzzyPhrase, searchWithFuzzyPhrase };
|
|
950
|
+
export { createOptimizedSearch, getPluginState, normalizeText as normalizeTextOptimized, pluginFuzzyPhrase, searchWithFuzzyPhrase, searchWithQPSPruning, tokenize as tokenizeOptimized };
|
|
714
951
|
//# sourceMappingURL=out.js.map
|
|
715
952
|
//# sourceMappingURL=index.js.map
|