@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +403 -83
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +116 -3
- package/dist/index.d.ts +116 -3
- package/dist/index.js +399 -84
- package/dist/index.js.map +1 -1
- package/package.json +4 -2
package/dist/index.cjs
CHANGED
|
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
|
|
|
50
50
|
if (word === queryToken) {
|
|
51
51
|
return { matches: true, distance: 0, score: 1 };
|
|
52
52
|
}
|
|
53
|
-
if (word.startsWith(queryToken)) {
|
|
54
|
-
return { matches: true, distance: 0, score: 0.95 };
|
|
55
|
-
}
|
|
56
53
|
const result = boundedLevenshtein(word, queryToken, tolerance);
|
|
57
54
|
if (result.isBounded) {
|
|
58
55
|
const score = 1 - result.distance * 0.2;
|
|
@@ -182,24 +179,41 @@ function filterCandidatesByScore(candidatesMap, minScore) {
|
|
|
182
179
|
}
|
|
183
180
|
|
|
184
181
|
// src/scoring.ts
|
|
185
|
-
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
|
|
182
|
+
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens) {
|
|
186
183
|
const phrases = [];
|
|
187
|
-
const queryTokens =
|
|
184
|
+
const queryTokens = originalQueryTokens;
|
|
188
185
|
const wordMatches = [];
|
|
186
|
+
const candidateLookup = /* @__PURE__ */ new Map();
|
|
187
|
+
for (const [queryToken, candidates] of candidatesMap.entries()) {
|
|
188
|
+
for (const candidate of candidates) {
|
|
189
|
+
if (!candidateLookup.has(candidate.word)) {
|
|
190
|
+
candidateLookup.set(candidate.word, []);
|
|
191
|
+
}
|
|
192
|
+
candidateLookup.get(candidate.word).push({ queryToken, candidate });
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
for (const entries of candidateLookup.values()) {
|
|
196
|
+
entries.sort((a, b) => {
|
|
197
|
+
if (a.candidate.type === "exact" && b.candidate.type !== "exact")
|
|
198
|
+
return -1;
|
|
199
|
+
if (b.candidate.type === "exact" && a.candidate.type !== "exact")
|
|
200
|
+
return 1;
|
|
201
|
+
return b.candidate.score - a.candidate.score;
|
|
202
|
+
});
|
|
203
|
+
}
|
|
189
204
|
for (let i = 0; i < documentTokens.length; i++) {
|
|
190
205
|
const docWord = documentTokens[i];
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
}
|
|
206
|
+
const matches = candidateLookup.get(docWord);
|
|
207
|
+
if (matches) {
|
|
208
|
+
for (const { queryToken, candidate } of matches) {
|
|
209
|
+
wordMatches.push({
|
|
210
|
+
word: docWord,
|
|
211
|
+
queryToken,
|
|
212
|
+
position: i,
|
|
213
|
+
type: candidate.type,
|
|
214
|
+
distance: candidate.distance,
|
|
215
|
+
score: candidate.score
|
|
216
|
+
});
|
|
203
217
|
}
|
|
204
218
|
}
|
|
205
219
|
}
|
|
@@ -219,35 +233,52 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
219
233
|
phrases.push(phrase);
|
|
220
234
|
}
|
|
221
235
|
}
|
|
222
|
-
|
|
236
|
+
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
|
|
237
|
+
const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
|
|
238
|
+
return deduplicatePhrases(filteredPhrases);
|
|
223
239
|
}
|
|
224
240
|
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
|
|
225
241
|
const startMatch = wordMatches[startIndex];
|
|
226
242
|
const phraseWords = [startMatch];
|
|
227
|
-
const
|
|
243
|
+
const queryTokenCounts = /* @__PURE__ */ new Map();
|
|
244
|
+
for (const token of queryTokens) {
|
|
245
|
+
queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
|
|
246
|
+
}
|
|
247
|
+
const matchedCounts = /* @__PURE__ */ new Map();
|
|
248
|
+
matchedCounts.set(startMatch.queryToken, 1);
|
|
228
249
|
const gapWords = [];
|
|
229
250
|
let totalGapUsed = 0;
|
|
251
|
+
let totalMatchedTokens = 1;
|
|
230
252
|
for (let i = startIndex + 1; i < wordMatches.length; i++) {
|
|
231
253
|
const match = wordMatches[i];
|
|
232
254
|
const lastPos = phraseWords[phraseWords.length - 1].position;
|
|
255
|
+
if (match.position <= lastPos) {
|
|
256
|
+
continue;
|
|
257
|
+
}
|
|
233
258
|
const gap = match.position - lastPos - 1;
|
|
234
259
|
if (gap > config.maxGap) {
|
|
235
260
|
break;
|
|
236
261
|
}
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
gapWords.push({
|
|
240
|
-
word: documentTokens[pos],
|
|
241
|
-
position: pos,
|
|
242
|
-
gapIndex: totalGapUsed
|
|
243
|
-
});
|
|
262
|
+
if (totalGapUsed + gap > config.maxGap) {
|
|
263
|
+
break;
|
|
244
264
|
}
|
|
245
|
-
|
|
265
|
+
const neededCount = queryTokenCounts.get(match.queryToken) || 0;
|
|
266
|
+
const currentCount = matchedCounts.get(match.queryToken) || 0;
|
|
267
|
+
if (currentCount < neededCount) {
|
|
268
|
+
for (let pos = lastPos + 1; pos < match.position; pos++) {
|
|
269
|
+
totalGapUsed++;
|
|
270
|
+
gapWords.push({
|
|
271
|
+
word: documentTokens[pos],
|
|
272
|
+
position: pos,
|
|
273
|
+
gapIndex: totalGapUsed
|
|
274
|
+
});
|
|
275
|
+
}
|
|
246
276
|
phraseWords.push(match);
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
277
|
+
matchedCounts.set(match.queryToken, currentCount + 1);
|
|
278
|
+
totalMatchedTokens++;
|
|
279
|
+
if (totalMatchedTokens === queryTokens.length) {
|
|
280
|
+
break;
|
|
281
|
+
}
|
|
251
282
|
}
|
|
252
283
|
}
|
|
253
284
|
if (phraseWords.length > 0) {
|
|
@@ -286,9 +317,12 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
286
317
|
baseScore /= phraseWords.length;
|
|
287
318
|
const inOrder = isInOrder(phraseWords, queryTokens);
|
|
288
319
|
const orderScore = inOrder ? 1 : 0.5;
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
320
|
+
let proximityScore = 0;
|
|
321
|
+
if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
|
|
322
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
323
|
+
const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
|
|
324
|
+
proximityScore = Math.max(0, 1 - span / proximityWindow);
|
|
325
|
+
}
|
|
292
326
|
let densityScore = 0;
|
|
293
327
|
if (queryTokens.length === 1) {
|
|
294
328
|
const totalOccurrences = allWordMatches.length;
|
|
@@ -306,8 +340,10 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
306
340
|
const weightedDensity = densityScore * weights.density;
|
|
307
341
|
const weightedSemantic = semanticScore * weights.semantic;
|
|
308
342
|
const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
|
|
309
|
-
const
|
|
310
|
-
const
|
|
343
|
+
const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
|
|
344
|
+
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
|
|
345
|
+
const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
|
|
346
|
+
const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
|
|
311
347
|
const normalizedScore = totalScore / maxPossibleScore;
|
|
312
348
|
const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
|
|
313
349
|
const score = normalizedScore * coverageMultiplier;
|
|
@@ -330,13 +366,20 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
330
366
|
};
|
|
331
367
|
}
|
|
332
368
|
/**
 * Reports whether the matched words appear in the same relative order as the
 * query tokens they were matched against.
 *
 * Each phrase word claims the earliest query position, strictly after the
 * position claimed by the previous phrase word, whose token equals the word's
 * `queryToken`. Repeated query tokens are therefore consumed one occurrence at
 * a time.
 *
 * @param {Array<{queryToken: string}>} phraseWords - Matched words, in document order.
 * @param {string[]} queryTokens - Query tokens, duplicates preserved.
 * @returns {boolean} `true` when every phrase word could claim a position, `false` otherwise.
 */
function isInOrder(phraseWords, queryTokens) {
  // First query index that is still available to the next phrase word.
  let searchFrom = 0;
  for (const matched of phraseWords) {
    const claimed = queryTokens.indexOf(matched.queryToken, searchFrom);
    if (claimed === -1) {
      return false;
    }
    searchFrom = claimed + 1;
  }
  return true;
}
|
|
@@ -377,9 +420,236 @@ function deduplicatePhrases(phrases) {
|
|
|
377
420
|
return result.sort((a, b) => b.score - a.score);
|
|
378
421
|
}
|
|
379
422
|
|
|
423
|
+
// src/optimized.ts
|
|
424
|
+
/**
 * Fallback values for the QPS pre-filter options. Each one is used only when
 * the caller's config leaves the corresponding key null or undefined.
 */
var DEFAULT_OPTIMIZED_CONFIG = {
  // Limit phrase scoring to top 100 candidates
  maxQPSCandidates: 100,
  // Include candidates with 10%+ of best score
  minQPSScore: 0.1,
  // Use fuzzy matching by default
  qpsExact: false,
  // Default tolerance of 1 edit distance
  qpsTolerance: 1
};
|
|
434
|
+
/**
 * Reduces free text to a lowercase, accent-free, punctuation-free form with
 * single spaces between words.
 *
 * @param {string} text - Text to normalise; must be a string.
 * @returns {string} The normalised text, trimmed at both ends.
 */
function normalizeText(text) {
  // Decompose accented letters, then drop the combining marks (U+0300-U+036F).
  const unaccented = text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  // One of l/d/c/j/m/n/s/t at a word start, followed by an apostrophe-like
  // character and a word character, is replaced by a space ("l'eau" -> " eau").
  const withoutElisions = unaccented.replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ");
  // Any apostrophe-like character that is left is removed without a space,
  // so the letters on either side are joined.
  const withoutApostrophes = withoutElisions.replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "");
  // Curly double quotes become straight ones; the next step then strips them.
  const straightQuoted = withoutApostrophes.replace(/[\u201c\u201d]/g, '"');
  const withoutPunctuation = straightQuoted.replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ");
  return withoutPunctuation.replace(/\s+/g, " ").trim();
}
|
|
437
|
+
/**
 * Splits text into tokens: the text is passed through `normalizeText`, cut on
 * runs of whitespace, and empty pieces are discarded.
 *
 * @param {string} text - Text to tokenise.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize(text) {
  const tokens = [];
  for (const piece of normalizeText(text).split(/\s+/)) {
    // Splitting an empty string yields one empty piece; skip it.
    if (piece.length > 0) {
      tokens.push(piece);
    }
  }
  return tokens;
}
|
|
440
|
+
/**
 * Scores documents against a query using the per-property radix indexes and
 * statistics held in `qpsIndex`.
 *
 * For every searched property and every query token, each (matched word,
 * document) pair returned by the radix lookup contributes
 * `(occurrences² / tokensLength + (exact word ? 1 : 0)) * boost`.
 * A further bonus of 2 is added when the document has already matched the
 * immediately preceding query token.
 *
 * Matched token positions are tracked in a per-document `Set`, so the
 * adjacency test is correct for queries of any length. The earlier 32-bit
 * mask wrapped for token indexes of 32 or more, and it tested the following
 * token rather than the preceding one.
 *
 * @param {string} term - Raw query text.
 * @param {{indexes: Object, stats: Object}} qpsIndex - Per-property index entries and stats.
 * @param {{tokenize: Function}} tokenizer - Called as `tokenize(term, language)`.
 * @param {Iterable<string>} properties - Property names to search.
 * @param {{qpsExact?: boolean, qpsTolerance?: number, qpsBoostPerProp?: Object}} config - Options; missing values fall back to DEFAULT_OPTIMIZED_CONFIG.
 * @param {string} language - Passed through to the tokenizer.
 * @returns {Array<[*, number]>} `[docId, score]` pairs sorted by descending score.
 */
function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
  const tokens = tokenizer.tokenize(term, language);
  if (tokens.length === 0) {
    return [];
  }
  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
  const boostPerProp = config.qpsBoostPerProp ?? {};
  // docId -> { score, matched } where `matched` holds the query-token indexes seen so far.
  const resultMap = new Map();
  for (const prop of properties) {
    const indexEntry = qpsIndex.indexes[prop];
    // Only radix-indexed properties can be searched here.
    if (!indexEntry || indexEntry.type !== "Radix") {
      continue;
    }
    const radixNode = indexEntry.node;
    const stats = qpsIndex.stats[prop];
    if (!radixNode || !stats) {
      continue;
    }
    const boost = boostPerProp[prop] ?? 1;
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      const matches = radixNode.find({
        term: token,
        exact,
        tolerance: exact ? 0 : tolerance
      });
      for (const [matchedWord, docIds] of Object.entries(matches)) {
        if (!Array.isArray(docIds)) {
          continue;
        }
        const isExactMatch = matchedWord === token;
        for (const docId of docIds) {
          const tokensLength = stats.tokensLength.get(docId) || 1;
          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
          // The occurrence count is read from the bits above bit 20 of the quantum.
          const occurrences = quantum ? quantum >> 20 : 1;
          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
          const entry = resultMap.get(docId);
          if (entry === undefined) {
            resultMap.set(docId, { score: scoreContrib, matched: new Set([i]) });
            continue;
          }
          // Reward documents that also matched the token just before this one.
          const adjacencyBonus = entry.matched.has(i - 1) ? 2 : 0;
          entry.score += scoreContrib + adjacencyBonus;
          entry.matched.add(i);
        }
      }
    }
  }
  return Array.from(resultMap, ([docId, entry]) => [docId, entry.score]).sort((a, b) => b[1] - a[1]);
}
|
|
490
|
+
/**
 * Counts the 1-bits in the 32-bit integer representation of `n`.
 *
 * The value is reinterpreted as unsigned and shifted with `>>>`. A signed
 * shift (`>>`) keeps the sign bit, so a negative input such as a mask with
 * bit 31 set would settle at -1 and the loop would never finish.
 *
 * @param {number} n - Value whose low 32 bits are inspected.
 * @returns {number} Number of set bits, from 0 to 32.
 */
function countSetBits(n) {
  let bits = n >>> 0;
  let count = 0;
  while (bits !== 0) {
    count += bits & 1;
    bits >>>= 1;
  }
  return count;
}
|
|
498
|
+
/**
 * Two-stage search: `searchQPS` first selects candidate documents, then only
 * those candidates are phrase-scored with `findPhrasesInDocument`.
 *
 * Steps:
 *  1. Tokenise the term and resolve the edit-distance tolerance.
 *  2. Run the QPS pass, keep candidates scoring at least
 *     `bestScore * minQPSScore`, capped at `maxQPSCandidates`.
 *  3. Make sure a vocabulary is available. When `pluginState.vocabulary` is
 *     empty it is extracted from the radix tree and stored back on `pluginState`.
 *  4. Build fuzzy candidates for the query tokens and phrase-score each
 *     candidate document.
 *  5. Sort by score, optionally apply the final score minimum, apply `params.limit`.
 *
 * @param {Object} orama - Database instance; `tokenizer` and `data` are read from it.
 * @param {Object} qpsIndex - Index passed to `searchQPS`.
 * @param {Object} pluginState - Plugin state (`config`, `vocabulary`, `synonymMap`, `documentFrequency`, `totalDocuments`).
 * @param {{term: string, properties?: string[], tokenCache?: Map, limit?: number}} params - Search parameters.
 * @param {Object} [config={}] - QPS options; missing values fall back to DEFAULT_OPTIMIZED_CONFIG.
 * @param {string} [language="french"] - Passed to the tokenizer used by the QPS pass.
 * @returns {Promise<{elapsed: Object, hits: Array, count: number}>} Search result; an empty
 *   result with a zero `elapsed` is returned on every early exit.
 */
async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
  const startTime = performance.now();
  // Every early exit returns this same shape.
  const emptyResult = () => ({ elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 });
  const { term, properties, tokenCache } = params;
  if (!term || typeof term !== "string") {
    return emptyResult();
  }
  const pluginConfig = pluginState.config;
  const textProperty = properties && properties[0] || pluginConfig.textProperty;
  // An empty `properties` list must fall back to the text property as well;
  // otherwise the QPS pass would scan nothing and the search would silently
  // return zero hits.
  const searchProperties = properties && properties.length !== 0 ? properties : [textProperty];
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return emptyResult();
  }
  const tolerance = pluginConfig.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginConfig.tolerance) : pluginConfig.tolerance;
  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);

  // Stage 1: QPS candidate selection.
  const qpsStartTime = performance.now();
  const qpsCandidates = searchQPS(term, qpsIndex, orama.tokenizer, searchProperties, config, language);
  const qpsTime = performance.now() - qpsStartTime;
  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
  if (qpsCandidates.length === 0) {
    return emptyResult();
  }
  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
  // Candidates arrive sorted by descending score, so the first one is the best.
  const bestScore = qpsCandidates[0][1];
  const minScore = bestScore * minScoreRatio;
  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
  // Document-store keys are strings, so compare on the string form of each id.
  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));

  // Make sure a vocabulary exists, extracting it lazily if needed.
  let vocabulary = pluginState.vocabulary;
  if (vocabulary.size === 0) {
    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
    try {
      const indexData = orama.data?.index;
      // The radix node is looked up in two possible index layouts.
      const radixNode = indexData?.indexes?.[textProperty]?.node || indexData?.[textProperty]?.node || null;
      if (!radixNode) {
        console.error("\u274C Radix tree not found for vocabulary extraction");
        return emptyResult();
      }
      pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
      vocabulary = pluginState.vocabulary;
      console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
    } catch (error) {
      console.error("\u274C Failed to extract vocabulary:", error);
      return emptyResult();
    }
  }

  const candidatesMap = findAllCandidates(
    queryTokens,
    vocabulary,
    tolerance,
    pluginConfig.enableSynonyms ? pluginState.synonymMap : void 0,
    pluginConfig.synonymMatchScore
  );
  // With zero tolerance the candidate map is used as-is, without score filtering.
  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginConfig.minScore);
  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);

  // Stage 2: phrase scoring restricted to the QPS candidates.
  const phraseStartTime = performance.now();
  const documentMatches = [];
  const docs = orama.data?.docs?.docs || {};
  let docsScored = 0;
  for (const [docId, doc] of Object.entries(docs)) {
    if (!candidateDocIds.has(docId)) {
      continue;
    }
    docsScored++;
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    // Prefer pre-tokenised text when the caller supplied a cache entry.
    const docTokens = tokenCache && tokenCache.has(docId) ? tokenCache.get(docId) : tokenize(text);
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredFuzzyCandidates,
      {
        weights: pluginConfig.weights,
        maxGap: pluginConfig.maxGap,
        proximitySpanMultiplier: pluginConfig.proximitySpanMultiplier,
        tolerance
      },
      pluginState.documentFrequency,
      pluginState.totalDocuments,
      queryTokens
    );
    if (phrases.length > 0) {
      // A document is ranked by its best phrase.
      const docScore = phrases.reduce((best, p) => Math.max(best, p.score), -Infinity);
      documentMatches.push({
        id: docId,
        phrases,
        score: docScore,
        document: doc
      });
    }
  }
  const phraseTime = performance.now() - phraseStartTime;
  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);

  documentMatches.sort((a, b) => b.score - a.score);
  let finalMatches = documentMatches;
  if (pluginConfig.enableFinalScoreMinimum && pluginConfig.finalScoreMinimum > 0) {
    const threshold = pluginConfig.finalScoreMinimum;
    const beforeCount = finalMatches.length;
    finalMatches = finalMatches.filter((m) => m.score >= threshold);
    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
  }
  const limit = params.limit ?? finalMatches.length;
  const hits = finalMatches.slice(0, limit).map((match) => ({
    id: match.id,
    score: match.score,
    document: match.document,
    _phrases: match.phrases
  }));
  const elapsed = performance.now() - startTime;
  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      // `elapsed` is in milliseconds; `raw` is that value scaled by 1e6.
      raw: Math.floor(elapsed * 1e6),
      qpsTime,
      phraseTime
    },
    hits,
    count: hits.length
  };
}
|
|
643
|
+
/**
 * Builds a search function with the database, QPS index, plugin state and
 * QPS options already bound.
 *
 * @param {Object} orama - Database instance forwarded to `searchWithQPSPruning`.
 * @param {Object} qpsIndex - QPS index forwarded to `searchWithQPSPruning`.
 * @param {Object} pluginState - Plugin state forwarded to `searchWithQPSPruning`.
 * @param {Object} [config={}] - QPS options forwarded to `searchWithQPSPruning`.
 * @returns {function(Object, string=): Promise<Object>} Async `(params, language = "french")`
 *   function resolving to the result of `searchWithQPSPruning`.
 */
function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
  return async (params, language = "french") =>
    searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
}
|
|
648
|
+
|
|
380
649
|
// src/index.ts
|
|
381
650
|
var DEFAULT_CONFIG = {
|
|
382
|
-
textProperty: "
|
|
651
|
+
textProperty: "normalized_content",
|
|
652
|
+
// Must match server's field name
|
|
383
653
|
tolerance: 1,
|
|
384
654
|
adaptiveTolerance: true,
|
|
385
655
|
enableSynonyms: false,
|
|
@@ -395,6 +665,8 @@ var DEFAULT_CONFIG = {
|
|
|
395
665
|
},
|
|
396
666
|
maxGap: 5,
|
|
397
667
|
minScore: 0.1,
|
|
668
|
+
enableFinalScoreMinimum: false,
|
|
669
|
+
finalScoreMinimum: 0.3,
|
|
398
670
|
proximitySpanMultiplier: 5
|
|
399
671
|
};
|
|
400
672
|
var pluginStates = /* @__PURE__ */ new WeakMap();
|
|
@@ -416,6 +688,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
416
688
|
},
|
|
417
689
|
maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
|
|
418
690
|
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
|
|
691
|
+
enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
|
|
692
|
+
finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
|
|
419
693
|
proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
|
|
420
694
|
};
|
|
421
695
|
const plugin = {
|
|
@@ -429,7 +703,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
429
703
|
synonymMap: {},
|
|
430
704
|
config,
|
|
431
705
|
documentFrequency: /* @__PURE__ */ new Map(),
|
|
432
|
-
totalDocuments: 0
|
|
706
|
+
totalDocuments: 0,
|
|
707
|
+
vocabulary: /* @__PURE__ */ new Set()
|
|
433
708
|
};
|
|
434
709
|
if (config.enableSynonyms && config.supabase) {
|
|
435
710
|
try {
|
|
@@ -446,6 +721,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
446
721
|
state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
|
|
447
722
|
console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
|
|
448
723
|
}
|
|
724
|
+
try {
|
|
725
|
+
const indexData = orama.data?.index;
|
|
726
|
+
let radixNode = null;
|
|
727
|
+
if (indexData?.indexes?.[config.textProperty]?.node) {
|
|
728
|
+
radixNode = indexData.indexes[config.textProperty].node;
|
|
729
|
+
} else if (indexData?.[config.textProperty]?.node) {
|
|
730
|
+
radixNode = indexData[config.textProperty].node;
|
|
731
|
+
}
|
|
732
|
+
if (radixNode) {
|
|
733
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
734
|
+
console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
|
|
735
|
+
} else {
|
|
736
|
+
console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
|
|
737
|
+
}
|
|
738
|
+
} catch (error) {
|
|
739
|
+
console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
|
|
740
|
+
}
|
|
449
741
|
pluginStates.set(orama, state);
|
|
450
742
|
console.log("\u2705 Fuzzy Phrase Plugin initialized");
|
|
451
743
|
setImmediate(() => {
|
|
@@ -467,43 +759,43 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
467
759
|
console.error("\u274C Plugin state not initialized");
|
|
468
760
|
throw new Error("Fuzzy Phrase Plugin not properly initialized");
|
|
469
761
|
}
|
|
470
|
-
const { term, properties } = params;
|
|
762
|
+
const { term, properties, tokenCache, candidateIds } = params;
|
|
763
|
+
const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
|
|
471
764
|
if (!term || typeof term !== "string") {
|
|
472
765
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
473
766
|
}
|
|
474
767
|
const textProperty = properties && properties[0] || state.config.textProperty;
|
|
475
|
-
const queryTokens =
|
|
768
|
+
const queryTokens = tokenize2(term);
|
|
476
769
|
if (queryTokens.length === 0) {
|
|
477
770
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
478
771
|
}
|
|
479
772
|
const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
|
|
480
773
|
console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
|
|
481
|
-
let vocabulary;
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
774
|
+
let vocabulary = state.vocabulary;
|
|
775
|
+
if (vocabulary.size === 0) {
|
|
776
|
+
console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
|
|
777
|
+
try {
|
|
778
|
+
const indexData = orama.data?.index;
|
|
779
|
+
let radixNode = null;
|
|
780
|
+
if (indexData?.indexes?.[textProperty]?.node) {
|
|
781
|
+
radixNode = indexData.indexes[textProperty].node;
|
|
782
|
+
} else if (indexData?.[textProperty]?.node) {
|
|
783
|
+
radixNode = indexData[textProperty].node;
|
|
784
|
+
}
|
|
785
|
+
if (radixNode) {
|
|
786
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
787
|
+
vocabulary = state.vocabulary;
|
|
788
|
+
console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
|
|
789
|
+
} else {
|
|
790
|
+
console.error("\u274C Radix tree not found for vocabulary extraction");
|
|
791
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
792
|
+
}
|
|
793
|
+
} catch (error) {
|
|
794
|
+
console.error("\u274C Failed to extract vocabulary:", error);
|
|
500
795
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
501
796
|
}
|
|
502
|
-
|
|
503
|
-
console.log(`\u{1F4DA}
|
|
504
|
-
} catch (error) {
|
|
505
|
-
console.error("\u274C Failed to extract vocabulary:", error);
|
|
506
|
-
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
797
|
+
} else {
|
|
798
|
+
console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
|
|
507
799
|
}
|
|
508
800
|
const candidatesMap = findAllCandidates(
|
|
509
801
|
queryTokens,
|
|
@@ -512,10 +804,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
512
804
|
state.config.enableSynonyms ? state.synonymMap : void 0,
|
|
513
805
|
state.config.synonymMatchScore
|
|
514
806
|
);
|
|
515
|
-
const filteredCandidates = filterCandidatesByScore(
|
|
516
|
-
candidatesMap,
|
|
517
|
-
state.config.minScore
|
|
518
|
-
);
|
|
807
|
+
const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
|
|
519
808
|
console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
|
|
520
809
|
const documentMatches = [];
|
|
521
810
|
console.log("\u{1F50D} DEBUG orama.data structure:", {
|
|
@@ -542,23 +831,39 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
542
831
|
dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
|
|
543
832
|
});
|
|
544
833
|
}
|
|
545
|
-
|
|
834
|
+
const cacheHits = tokenCache ? tokenCache.size : 0;
|
|
835
|
+
const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
|
|
836
|
+
console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
|
|
546
837
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
838
|
+
if (candidateIdSet) {
|
|
839
|
+
const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
|
|
840
|
+
if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
|
|
841
|
+
continue;
|
|
842
|
+
}
|
|
843
|
+
}
|
|
547
844
|
const text = doc[textProperty];
|
|
548
845
|
if (!text || typeof text !== "string") {
|
|
549
846
|
continue;
|
|
550
847
|
}
|
|
551
|
-
|
|
848
|
+
let docTokens;
|
|
849
|
+
if (tokenCache && tokenCache.has(docId)) {
|
|
850
|
+
docTokens = tokenCache.get(docId);
|
|
851
|
+
} else {
|
|
852
|
+
docTokens = tokenize2(text);
|
|
853
|
+
}
|
|
552
854
|
const phrases = findPhrasesInDocument(
|
|
553
855
|
docTokens,
|
|
554
856
|
filteredCandidates,
|
|
555
857
|
{
|
|
556
858
|
weights: state.config.weights,
|
|
557
859
|
maxGap: state.config.maxGap,
|
|
558
|
-
proximitySpanMultiplier: state.config.proximitySpanMultiplier
|
|
860
|
+
proximitySpanMultiplier: state.config.proximitySpanMultiplier,
|
|
861
|
+
tolerance
|
|
559
862
|
},
|
|
560
863
|
state.documentFrequency,
|
|
561
|
-
state.totalDocuments
|
|
864
|
+
state.totalDocuments,
|
|
865
|
+
queryTokens
|
|
866
|
+
// Original tokens with duplicates preserved
|
|
562
867
|
);
|
|
563
868
|
if (phrases.length > 0) {
|
|
564
869
|
const docScore = Math.max(...phrases.map((p) => p.score));
|
|
@@ -571,8 +876,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
571
876
|
}
|
|
572
877
|
}
|
|
573
878
|
documentMatches.sort((a, b) => b.score - a.score);
|
|
574
|
-
|
|
575
|
-
|
|
879
|
+
let filteredMatches = documentMatches;
|
|
880
|
+
if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
|
|
881
|
+
const threshold = state.config.finalScoreMinimum;
|
|
882
|
+
const beforeCount = filteredMatches.length;
|
|
883
|
+
filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
|
|
884
|
+
console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
|
|
885
|
+
}
|
|
886
|
+
const limit = params.limit ?? filteredMatches.length;
|
|
887
|
+
const limitedMatches = filteredMatches.slice(0, limit);
|
|
576
888
|
const hits = limitedMatches.map((match) => ({
|
|
577
889
|
id: match.id,
|
|
578
890
|
score: match.score,
|
|
@@ -623,21 +935,29 @@ function calculateDocumentFrequencies(docs, textProperty) {
|
|
|
623
935
|
if (!text || typeof text !== "string") {
|
|
624
936
|
continue;
|
|
625
937
|
}
|
|
626
|
-
const words = new Set(
|
|
938
|
+
const words = new Set(tokenize2(text));
|
|
627
939
|
for (const word of words) {
|
|
628
940
|
df.set(word, (df.get(word) || 0) + 1);
|
|
629
941
|
}
|
|
630
942
|
}
|
|
631
943
|
return df;
|
|
632
944
|
}
|
|
633
|
-
function
|
|
945
|
+
/**
 * Reduces free text to a lowercase, accent-free, punctuation-free form with
 * single spaces between words.
 *
 * @param {string} text - Text to normalise; must be a string.
 * @returns {string} The normalised text, trimmed at both ends.
 */
function normalizeText2(text) {
  // Ordered [pattern, replacement] rewrites; later steps rely on earlier ones.
  const rewrites = [
    // Combining marks left behind by the NFD decomposition.
    [/[\u0300-\u036f]/g, ""],
    // One of l/d/c/j/m/n/s/t at a word start + apostrophe-like char + word char.
    [/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " "],
    // Remaining apostrophe-like characters are removed without a space.
    [/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, ""],
    // Curly double quotes become straight ones, which the next step strips.
    [/[\u201c\u201d]/g, '"'],
    [/[.,;:!?()[\]{}\-—–«»""]/g, " "],
    [/\s+/g, " "]
  ];
  let result = text.toLowerCase().normalize("NFD");
  for (const [pattern, replacement] of rewrites) {
    result = result.replace(pattern, replacement);
  }
  return result.trim();
}
|
|
636
|
-
function
|
|
637
|
-
return
|
|
948
|
+
/**
 * Splits text into tokens: the text is passed through `normalizeText2`, cut on
 * runs of whitespace, and empty pieces are discarded.
 *
 * @param {string} text - Text to tokenise.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize2(text) {
  const normalized = normalizeText2(text);
  const pieces = normalized.split(/\s+/);
  // Splitting an empty string yields one empty piece; drop it.
  return pieces.filter((piece) => piece.length > 0);
}
|
|
951
|
+
/**
 * Looks up the plugin state stored for a database instance in `pluginStates`.
 *
 * @param {Object} orama - Database instance used as the lookup key.
 * @returns {Object|undefined} The stored state, or `undefined` when none is registered.
 */
function getPluginState(orama) {
  const state = pluginStates.get(orama);
  return state;
}
|
|
639
954
|
|
|
955
|
+
exports.createOptimizedSearch = createOptimizedSearch;
|
|
956
|
+
exports.getPluginState = getPluginState;
|
|
957
|
+
exports.normalizeTextOptimized = normalizeText;
|
|
640
958
|
exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
|
|
641
959
|
exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
|
|
960
|
+
exports.searchWithQPSPruning = searchWithQPSPruning;
|
|
961
|
+
exports.tokenizeOptimized = tokenize;
|
|
642
962
|
//# sourceMappingURL=out.js.map
|
|
643
963
|
//# sourceMappingURL=index.cjs.map
|