@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +403 -83
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +116 -3
- package/dist/index.d.ts +116 -3
- package/dist/index.js +399 -84
- package/dist/index.js.map +1 -1
- package/package.json +4 -2
package/dist/index.js
CHANGED
|
@@ -48,9 +48,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
|
|
|
48
48
|
if (word === queryToken) {
|
|
49
49
|
return { matches: true, distance: 0, score: 1 };
|
|
50
50
|
}
|
|
51
|
-
if (word.startsWith(queryToken)) {
|
|
52
|
-
return { matches: true, distance: 0, score: 0.95 };
|
|
53
|
-
}
|
|
54
51
|
const result = boundedLevenshtein(word, queryToken, tolerance);
|
|
55
52
|
if (result.isBounded) {
|
|
56
53
|
const score = 1 - result.distance * 0.2;
|
|
@@ -180,24 +177,41 @@ function filterCandidatesByScore(candidatesMap, minScore) {
|
|
|
180
177
|
}
|
|
181
178
|
|
|
182
179
|
// src/scoring.ts
|
|
183
|
-
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
|
|
180
|
+
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens) {
|
|
184
181
|
const phrases = [];
|
|
185
|
-
const queryTokens =
|
|
182
|
+
const queryTokens = originalQueryTokens;
|
|
186
183
|
const wordMatches = [];
|
|
184
|
+
const candidateLookup = /* @__PURE__ */ new Map();
|
|
185
|
+
for (const [queryToken, candidates] of candidatesMap.entries()) {
|
|
186
|
+
for (const candidate of candidates) {
|
|
187
|
+
if (!candidateLookup.has(candidate.word)) {
|
|
188
|
+
candidateLookup.set(candidate.word, []);
|
|
189
|
+
}
|
|
190
|
+
candidateLookup.get(candidate.word).push({ queryToken, candidate });
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
for (const entries of candidateLookup.values()) {
|
|
194
|
+
entries.sort((a, b) => {
|
|
195
|
+
if (a.candidate.type === "exact" && b.candidate.type !== "exact")
|
|
196
|
+
return -1;
|
|
197
|
+
if (b.candidate.type === "exact" && a.candidate.type !== "exact")
|
|
198
|
+
return 1;
|
|
199
|
+
return b.candidate.score - a.candidate.score;
|
|
200
|
+
});
|
|
201
|
+
}
|
|
187
202
|
for (let i = 0; i < documentTokens.length; i++) {
|
|
188
203
|
const docWord = documentTokens[i];
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
}
|
|
204
|
+
const matches = candidateLookup.get(docWord);
|
|
205
|
+
if (matches) {
|
|
206
|
+
for (const { queryToken, candidate } of matches) {
|
|
207
|
+
wordMatches.push({
|
|
208
|
+
word: docWord,
|
|
209
|
+
queryToken,
|
|
210
|
+
position: i,
|
|
211
|
+
type: candidate.type,
|
|
212
|
+
distance: candidate.distance,
|
|
213
|
+
score: candidate.score
|
|
214
|
+
});
|
|
201
215
|
}
|
|
202
216
|
}
|
|
203
217
|
}
|
|
@@ -217,35 +231,52 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
217
231
|
phrases.push(phrase);
|
|
218
232
|
}
|
|
219
233
|
}
|
|
220
|
-
|
|
234
|
+
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
|
|
235
|
+
const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
|
|
236
|
+
return deduplicatePhrases(filteredPhrases);
|
|
221
237
|
}
|
|
222
238
|
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
|
|
223
239
|
const startMatch = wordMatches[startIndex];
|
|
224
240
|
const phraseWords = [startMatch];
|
|
225
|
-
const
|
|
241
|
+
const queryTokenCounts = /* @__PURE__ */ new Map();
|
|
242
|
+
for (const token of queryTokens) {
|
|
243
|
+
queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
|
|
244
|
+
}
|
|
245
|
+
const matchedCounts = /* @__PURE__ */ new Map();
|
|
246
|
+
matchedCounts.set(startMatch.queryToken, 1);
|
|
226
247
|
const gapWords = [];
|
|
227
248
|
let totalGapUsed = 0;
|
|
249
|
+
let totalMatchedTokens = 1;
|
|
228
250
|
for (let i = startIndex + 1; i < wordMatches.length; i++) {
|
|
229
251
|
const match = wordMatches[i];
|
|
230
252
|
const lastPos = phraseWords[phraseWords.length - 1].position;
|
|
253
|
+
if (match.position <= lastPos) {
|
|
254
|
+
continue;
|
|
255
|
+
}
|
|
231
256
|
const gap = match.position - lastPos - 1;
|
|
232
257
|
if (gap > config.maxGap) {
|
|
233
258
|
break;
|
|
234
259
|
}
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
gapWords.push({
|
|
238
|
-
word: documentTokens[pos],
|
|
239
|
-
position: pos,
|
|
240
|
-
gapIndex: totalGapUsed
|
|
241
|
-
});
|
|
260
|
+
if (totalGapUsed + gap > config.maxGap) {
|
|
261
|
+
break;
|
|
242
262
|
}
|
|
243
|
-
|
|
263
|
+
const neededCount = queryTokenCounts.get(match.queryToken) || 0;
|
|
264
|
+
const currentCount = matchedCounts.get(match.queryToken) || 0;
|
|
265
|
+
if (currentCount < neededCount) {
|
|
266
|
+
for (let pos = lastPos + 1; pos < match.position; pos++) {
|
|
267
|
+
totalGapUsed++;
|
|
268
|
+
gapWords.push({
|
|
269
|
+
word: documentTokens[pos],
|
|
270
|
+
position: pos,
|
|
271
|
+
gapIndex: totalGapUsed
|
|
272
|
+
});
|
|
273
|
+
}
|
|
244
274
|
phraseWords.push(match);
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
275
|
+
matchedCounts.set(match.queryToken, currentCount + 1);
|
|
276
|
+
totalMatchedTokens++;
|
|
277
|
+
if (totalMatchedTokens === queryTokens.length) {
|
|
278
|
+
break;
|
|
279
|
+
}
|
|
249
280
|
}
|
|
250
281
|
}
|
|
251
282
|
if (phraseWords.length > 0) {
|
|
@@ -284,9 +315,12 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
284
315
|
baseScore /= phraseWords.length;
|
|
285
316
|
const inOrder = isInOrder(phraseWords, queryTokens);
|
|
286
317
|
const orderScore = inOrder ? 1 : 0.5;
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
318
|
+
let proximityScore = 0;
|
|
319
|
+
if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
|
|
320
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
321
|
+
const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
|
|
322
|
+
proximityScore = Math.max(0, 1 - span / proximityWindow);
|
|
323
|
+
}
|
|
290
324
|
let densityScore = 0;
|
|
291
325
|
if (queryTokens.length === 1) {
|
|
292
326
|
const totalOccurrences = allWordMatches.length;
|
|
@@ -304,8 +338,10 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
304
338
|
const weightedDensity = densityScore * weights.density;
|
|
305
339
|
const weightedSemantic = semanticScore * weights.semantic;
|
|
306
340
|
const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
|
|
307
|
-
const
|
|
308
|
-
const
|
|
341
|
+
const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
|
|
342
|
+
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
|
|
343
|
+
const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
|
|
344
|
+
const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
|
|
309
345
|
const normalizedScore = totalScore / maxPossibleScore;
|
|
310
346
|
const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
|
|
311
347
|
const score = normalizedScore * coverageMultiplier;
|
|
@@ -328,13 +364,20 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
328
364
|
};
|
|
329
365
|
}
|
|
330
366
|
/**
 * Reports whether the matched phrase words follow the query's token order.
 *
 * Each phrase word must map to an occurrence of its `queryToken` in
 * `queryTokens` that lies strictly after the occurrence used by the previous
 * phrase word, so repeated query tokens are consumed left to right.
 *
 * @param {Array<{queryToken: string}>} phraseWords - Matched words, in phrase order.
 * @param {string[]} queryTokens - Query tokens, duplicates preserved.
 * @returns {boolean} `true` when every phrase word can be placed in order.
 */
function isInOrder(phraseWords, queryTokens) {
  // Index in queryTokens from which the next phrase word may be matched.
  let searchFrom = 0;
  for (const { queryToken } of phraseWords) {
    const matchedAt = queryTokens.indexOf(queryToken, searchFrom);
    if (matchedAt === -1) {
      return false;
    }
    searchFrom = matchedAt + 1;
  }
  return true;
}
|
|
@@ -375,9 +418,236 @@ function deduplicatePhrases(phrases) {
|
|
|
375
418
|
return result.sort((a, b) => b.score - a.score);
|
|
376
419
|
}
|
|
377
420
|
|
|
421
|
+
// src/optimized.ts
|
|
422
|
+
/**
 * Fallback values for the QPS-pruned search path. Each entry is used when the
 * caller-supplied config leaves the corresponding key null or undefined.
 */
var DEFAULT_OPTIMIZED_CONFIG = {
  // Limit phrase scoring to top 100 candidates
  maxQPSCandidates: 100,
  // Include candidates with 10%+ of best score
  minQPSScore: 0.1,
  // Use fuzzy matching by default
  qpsExact: false,
  // Default tolerance of 1 edit distance
  qpsTolerance: 1
};
|
|
432
|
+
/**
 * Canonicalises text for matching.
 *
 * Steps, in order: lower-case; NFD-decompose and drop combining marks
 * (U+0300-U+036F); replace a single-letter elision prefix such as "l'" or
 * "d'" (letter at a word boundary followed by an apostrophe-like character
 * and a word character) with a space; delete remaining apostrophe-like
 * characters; turn curly double quotes into straight ones; turn listed
 * punctuation into spaces; collapse whitespace runs and trim.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Normalised, single-spaced, trimmed text.
 */
function normalizeText(text) {
  const lowered = text.toLowerCase();
  const withoutAccents = lowered.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  const withoutElisions = withoutAccents.replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ");
  const withoutApostrophes = withoutElisions.replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "");
  const straightQuoted = withoutApostrophes.replace(/[\u201c\u201d]/g, '"');
  const withoutPunctuation = straightQuoted.replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ");
  return withoutPunctuation.replace(/\s+/g, " ").trim();
}
|
|
435
|
+
/**
 * Splits text into tokens: the input is passed through `normalizeText`, split
 * on whitespace, and empty pieces are discarded.
 *
 * @param {string} text - Raw input text.
 * @returns {string[]} Non-empty normalised tokens, in input order.
 */
function tokenize(text) {
  const tokens = [];
  for (const piece of normalizeText(text).split(/\s+/)) {
    if (piece.length > 0) {
      tokens.push(piece);
    }
  }
  return tokens;
}
|
|
438
|
+
/**
 * Scores documents against the query using the QPS index and returns them
 * ranked by score.
 *
 * For every property whose index entry has type "Radix" (and has both a node
 * and stats), each query token is looked up with `node.find`. Every
 * (matched word, document) pair contributes
 * `(occurrences² / tokensLength + (exact ? 1 : 0)) * boost`, where
 * `occurrences` is `quantum >> 20` (or 1 when no quantum is stored),
 * `tokensLength` falls back to 1, and `boost` comes from
 * `config.qpsBoostPerProp[prop]` (default 1). A per-document bitmask records
 * which token positions have matched.
 *
 * @param {string} term - Raw search term, handed to `tokenizer.tokenize`.
 * @param {object} qpsIndex - Object exposing `indexes[prop]` and `stats[prop]`.
 * @param {object} tokenizer - Object exposing `tokenize(term, language)`.
 * @param {string[]} properties - Property names to search.
 * @param {object} config - May set `qpsExact`, `qpsTolerance`, `qpsBoostPerProp`.
 * @param {string} language - Forwarded to the tokenizer.
 * @returns {Array<[*, number]>} `[docId, score]` pairs, highest score first.
 */
function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
  const tokens = tokenizer.tokenize(term, language);
  if (tokens.length === 0) {
    return [];
  }
  const exact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
  const tolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
  const boostPerProp = config.qpsBoostPerProp ?? {};
  // docId -> [accumulated score, bitmask of matched token positions]
  const scoreboard = /* @__PURE__ */ new Map();
  for (const prop of properties) {
    const indexEntry = qpsIndex.indexes[prop];
    if (!indexEntry || indexEntry.type !== "Radix") {
      continue;
    }
    const radixNode = indexEntry.node;
    const stats = qpsIndex.stats[prop];
    if (!radixNode || !stats) {
      continue;
    }
    const boost = boostPerProp[prop] ?? 1;
    tokens.forEach((token, i) => {
      const tokenBit = 1 << i;
      const found = radixNode.find({
        term: token,
        exact,
        tolerance: exact ? 0 : tolerance
      });
      for (const [matchedWord, docIds] of Object.entries(found)) {
        if (!Array.isArray(docIds)) {
          continue;
        }
        const exactBonus = matchedWord === token ? 1 : 0;
        for (const docId of docIds) {
          const tokensLength = stats.tokensLength.get(docId) || 1;
          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
          const occurrences = quantum ? quantum >> 20 : 1;
          const scoreContrib = (occurrences * occurrences / tokensLength + exactBonus) * boost;
          if (!scoreboard.has(docId)) {
            scoreboard.set(docId, [scoreContrib, tokenBit]);
            continue;
          }
          const [prevScore, prevMask] = scoreboard.get(docId);
          // NOTE(review): (prevMask >> 1) & tokenBit is non-zero when bit i + 1
          // of the existing mask is set, i.e. the *following* token position -
          // confirm this is the intended adjacency test.
          const adjacencyBonus = countSetBits(prevMask >> 1 & tokenBit) * 2;
          scoreboard.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | tokenBit]);
        }
      }
    });
  }
  const ranked = [];
  for (const [docId, [score]] of scoreboard) {
    ranked.push([docId, score]);
  }
  ranked.sort((left, right) => right[1] - left[1]);
  return ranked;
}
|
|
488
|
+
/**
 * Counts the 1-bits in the 32-bit integer representation of `n`.
 *
 * The value is first coerced to an unsigned 32-bit integer and then shifted
 * with the zero-filling `>>>` operator. A sign-propagating `>>` would never
 * reach 0 for a negative input (for example a mask with bit 31 set, as
 * produced by `1 << 31`), which would make the loop spin forever.
 *
 * @param {number} n - Integer bitmask; negative values are treated as their
 *   32-bit two's-complement pattern.
 * @returns {number} Number of set bits, from 0 to 32.
 */
function countSetBits(n) {
  let count = 0;
  let bits = n >>> 0;
  while (bits !== 0) {
    count += bits & 1;
    bits >>>= 1;
  }
  return count;
}
|
|
496
|
+
/**
 * Two-stage search: `searchQPS` first ranks documents, then only the
 * best-scoring candidates are phrase-scored with `findPhrasesInDocument`.
 *
 * Flow:
 *  1. Reject a missing/non-string term or a term that tokenizes to nothing.
 *  2. Run `searchQPS`; keep candidates scoring at least
 *     `bestScore * minQPSScore`, capped at `maxQPSCandidates`.
 *  3. Make sure `pluginState.vocabulary` is populated, extracting it from the
 *     radix tree of the text property when it is empty.
 *  4. Build fuzzy candidates for the query tokens and phrase-score every
 *     stored document whose id is in the candidate set.
 *  5. Sort by score, optionally apply `finalScoreMinimum`, apply
 *     `params.limit`, and return hits with timing information.
 *
 * Every early exit returns an empty result with a zeroed `elapsed`.
 *
 * @param {object} orama - Instance exposing `tokenizer` and `data`.
 * @param {object} qpsIndex - QPS index passed through to `searchQPS`.
 * @param {object} pluginState - Plugin state (`config`, `vocabulary`,
 *   `synonymMap`, `documentFrequency`, `totalDocuments`).
 * @param {object} params - `term`, optional `properties`, `tokenCache`, `limit`.
 * @param {object} [config={}] - Optional `maxQPSCandidates`, `minQPSScore`,
 *   plus the keys read by `searchQPS`.
 * @param {string} [language="french"] - Forwarded to `searchQPS`.
 * @returns {Promise<{elapsed: object, hits: object[], count: number}>}
 */
async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
  const searchStartedAt = performance.now();
  const { term, properties, tokenCache } = params;
  // Builds a fresh empty result for each early exit.
  const emptyResult = () => ({ elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 });
  if (!term || typeof term !== "string") {
    return emptyResult();
  }
  const textProperty = properties?.[0] || pluginState.config.textProperty;
  const searchProperties = properties || [textProperty];
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return emptyResult();
  }
  const pluginConfig = pluginState.config;
  let tolerance = pluginConfig.tolerance;
  if (pluginConfig.adaptiveTolerance) {
    tolerance = calculateAdaptiveTolerance(queryTokens, pluginConfig.tolerance);
  }
  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);

  // Stage 1: QPS candidate ranking.
  const qpsStartedAt = performance.now();
  const qpsCandidates = searchQPS(term, qpsIndex, orama.tokenizer, searchProperties, config, language);
  const qpsTime = performance.now() - qpsStartedAt;
  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
  if (qpsCandidates.length === 0) {
    return emptyResult();
  }
  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
  // searchQPS returns candidates sorted best-first, so index 0 holds the top score.
  const minScore = qpsCandidates[0][1] * minScoreRatio;
  const aboveThreshold = qpsCandidates.filter(([, score]) => score >= minScore);
  const filteredCandidates = aboveThreshold.slice(0, maxCandidates);
  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
  const candidateDocIds = /* @__PURE__ */ new Set();
  for (const [candidateId] of filteredCandidates) {
    candidateDocIds.add(String(candidateId));
  }

  // Vocabulary is extracted on demand and stored on the plugin state.
  let vocabulary = pluginState.vocabulary;
  if (vocabulary.size === 0) {
    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
    try {
      const indexData = orama.data?.index;
      const radixNode = indexData?.indexes?.[textProperty]?.node || indexData?.[textProperty]?.node || null;
      if (!radixNode) {
        console.error("\u274C Radix tree not found for vocabulary extraction");
        return emptyResult();
      }
      pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
      vocabulary = pluginState.vocabulary;
      console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
    } catch (error) {
      console.error("\u274C Failed to extract vocabulary:", error);
      return emptyResult();
    }
  }

  const candidatesMap = findAllCandidates(
    queryTokens,
    vocabulary,
    tolerance,
    pluginConfig.enableSynonyms ? pluginState.synonymMap : void 0,
    pluginConfig.synonymMatchScore
  );
  // With zero tolerance the candidate map is used without score filtering.
  let filteredFuzzyCandidates = candidatesMap;
  if (tolerance !== 0) {
    filteredFuzzyCandidates = filterCandidatesByScore(candidatesMap, pluginConfig.minScore);
  }
  let fuzzyCandidateCount = 0;
  for (const candidateList of filteredFuzzyCandidates.values()) {
    fuzzyCandidateCount += candidateList.length;
  }
  console.log(`\u{1F3AF} Fuzzy candidates: ${fuzzyCandidateCount} total`);

  // Stage 2: phrase scoring, restricted to the QPS candidates.
  const phraseStartedAt = performance.now();
  const documentMatches = [];
  const docs = orama.data?.docs?.docs ? orama.data.docs.docs : {};
  let docsScored = 0;
  for (const [docId, doc] of Object.entries(docs)) {
    if (!candidateDocIds.has(docId)) {
      continue;
    }
    docsScored++;
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    const docTokens = tokenCache && tokenCache.has(docId) ? tokenCache.get(docId) : tokenize(text);
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredFuzzyCandidates,
      {
        weights: pluginConfig.weights,
        maxGap: pluginConfig.maxGap,
        proximitySpanMultiplier: pluginConfig.proximitySpanMultiplier,
        tolerance
      },
      pluginState.documentFrequency,
      pluginState.totalDocuments,
      queryTokens
    );
    if (phrases.length === 0) {
      continue;
    }
    // A document's score is the score of its best phrase.
    documentMatches.push({
      id: docId,
      phrases,
      score: Math.max(...phrases.map((p) => p.score)),
      document: doc
    });
  }
  const phraseTime = performance.now() - phraseStartedAt;
  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
  documentMatches.sort((a, b) => b.score - a.score);

  let finalMatches = documentMatches;
  if (pluginConfig.enableFinalScoreMinimum && pluginConfig.finalScoreMinimum > 0) {
    const threshold = pluginConfig.finalScoreMinimum;
    const beforeCount = finalMatches.length;
    finalMatches = finalMatches.filter((m) => m.score >= threshold);
    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
  }
  const limit = params.limit ?? finalMatches.length;
  const hits = finalMatches.slice(0, limit).map(({ id, score, document, phrases }) => ({
    id,
    score,
    document,
    _phrases: phrases
  }));
  const elapsed = performance.now() - searchStartedAt;
  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      raw: Math.floor(elapsed * 1e6),
      qpsTime,
      phraseTime
    },
    hits,
    count: hits.length
  };
}
|
|
641
|
+
/**
 * Binds an Orama instance, QPS index, plugin state and config into a reusable
 * search function.
 *
 * @param {object} orama - Forwarded to `searchWithQPSPruning`.
 * @param {object} qpsIndex - Forwarded to `searchWithQPSPruning`.
 * @param {object} pluginState - Forwarded to `searchWithQPSPruning`.
 * @param {object} [config={}] - Forwarded to `searchWithQPSPruning`.
 * @returns {(params: object, language?: string) => Promise<object>} Async
 *   function that runs `searchWithQPSPruning` with the bound arguments;
 *   `language` defaults to "french".
 */
function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
  const optimizedSearch = async (params, language = "french") => {
    const result = await searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
    return result;
  };
  return optimizedSearch;
}
|
|
646
|
+
|
|
378
647
|
// src/index.ts
|
|
379
648
|
var DEFAULT_CONFIG = {
|
|
380
|
-
textProperty: "
|
|
649
|
+
textProperty: "normalized_content",
|
|
650
|
+
// Must match server's field name
|
|
381
651
|
tolerance: 1,
|
|
382
652
|
adaptiveTolerance: true,
|
|
383
653
|
enableSynonyms: false,
|
|
@@ -393,6 +663,8 @@ var DEFAULT_CONFIG = {
|
|
|
393
663
|
},
|
|
394
664
|
maxGap: 5,
|
|
395
665
|
minScore: 0.1,
|
|
666
|
+
enableFinalScoreMinimum: false,
|
|
667
|
+
finalScoreMinimum: 0.3,
|
|
396
668
|
proximitySpanMultiplier: 5
|
|
397
669
|
};
|
|
398
670
|
var pluginStates = /* @__PURE__ */ new WeakMap();
|
|
@@ -414,6 +686,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
414
686
|
},
|
|
415
687
|
maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
|
|
416
688
|
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
|
|
689
|
+
enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
|
|
690
|
+
finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
|
|
417
691
|
proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
|
|
418
692
|
};
|
|
419
693
|
const plugin = {
|
|
@@ -427,7 +701,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
427
701
|
synonymMap: {},
|
|
428
702
|
config,
|
|
429
703
|
documentFrequency: /* @__PURE__ */ new Map(),
|
|
430
|
-
totalDocuments: 0
|
|
704
|
+
totalDocuments: 0,
|
|
705
|
+
vocabulary: /* @__PURE__ */ new Set()
|
|
431
706
|
};
|
|
432
707
|
if (config.enableSynonyms && config.supabase) {
|
|
433
708
|
try {
|
|
@@ -444,6 +719,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
444
719
|
state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
|
|
445
720
|
console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
|
|
446
721
|
}
|
|
722
|
+
try {
|
|
723
|
+
const indexData = orama.data?.index;
|
|
724
|
+
let radixNode = null;
|
|
725
|
+
if (indexData?.indexes?.[config.textProperty]?.node) {
|
|
726
|
+
radixNode = indexData.indexes[config.textProperty].node;
|
|
727
|
+
} else if (indexData?.[config.textProperty]?.node) {
|
|
728
|
+
radixNode = indexData[config.textProperty].node;
|
|
729
|
+
}
|
|
730
|
+
if (radixNode) {
|
|
731
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
732
|
+
console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
|
|
733
|
+
} else {
|
|
734
|
+
console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
|
|
735
|
+
}
|
|
736
|
+
} catch (error) {
|
|
737
|
+
console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
|
|
738
|
+
}
|
|
447
739
|
pluginStates.set(orama, state);
|
|
448
740
|
console.log("\u2705 Fuzzy Phrase Plugin initialized");
|
|
449
741
|
setImmediate(() => {
|
|
@@ -465,43 +757,43 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
465
757
|
console.error("\u274C Plugin state not initialized");
|
|
466
758
|
throw new Error("Fuzzy Phrase Plugin not properly initialized");
|
|
467
759
|
}
|
|
468
|
-
const { term, properties } = params;
|
|
760
|
+
const { term, properties, tokenCache, candidateIds } = params;
|
|
761
|
+
const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
|
|
469
762
|
if (!term || typeof term !== "string") {
|
|
470
763
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
471
764
|
}
|
|
472
765
|
const textProperty = properties && properties[0] || state.config.textProperty;
|
|
473
|
-
const queryTokens =
|
|
766
|
+
const queryTokens = tokenize2(term);
|
|
474
767
|
if (queryTokens.length === 0) {
|
|
475
768
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
476
769
|
}
|
|
477
770
|
const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
|
|
478
771
|
console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
|
|
479
|
-
let vocabulary;
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
772
|
+
let vocabulary = state.vocabulary;
|
|
773
|
+
if (vocabulary.size === 0) {
|
|
774
|
+
console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
|
|
775
|
+
try {
|
|
776
|
+
const indexData = orama.data?.index;
|
|
777
|
+
let radixNode = null;
|
|
778
|
+
if (indexData?.indexes?.[textProperty]?.node) {
|
|
779
|
+
radixNode = indexData.indexes[textProperty].node;
|
|
780
|
+
} else if (indexData?.[textProperty]?.node) {
|
|
781
|
+
radixNode = indexData[textProperty].node;
|
|
782
|
+
}
|
|
783
|
+
if (radixNode) {
|
|
784
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
785
|
+
vocabulary = state.vocabulary;
|
|
786
|
+
console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
|
|
787
|
+
} else {
|
|
788
|
+
console.error("\u274C Radix tree not found for vocabulary extraction");
|
|
789
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
790
|
+
}
|
|
791
|
+
} catch (error) {
|
|
792
|
+
console.error("\u274C Failed to extract vocabulary:", error);
|
|
498
793
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
499
794
|
}
|
|
500
|
-
|
|
501
|
-
console.log(`\u{1F4DA}
|
|
502
|
-
} catch (error) {
|
|
503
|
-
console.error("\u274C Failed to extract vocabulary:", error);
|
|
504
|
-
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
795
|
+
} else {
|
|
796
|
+
console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
|
|
505
797
|
}
|
|
506
798
|
const candidatesMap = findAllCandidates(
|
|
507
799
|
queryTokens,
|
|
@@ -510,10 +802,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
510
802
|
state.config.enableSynonyms ? state.synonymMap : void 0,
|
|
511
803
|
state.config.synonymMatchScore
|
|
512
804
|
);
|
|
513
|
-
const filteredCandidates = filterCandidatesByScore(
|
|
514
|
-
candidatesMap,
|
|
515
|
-
state.config.minScore
|
|
516
|
-
);
|
|
805
|
+
const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
|
|
517
806
|
console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
|
|
518
807
|
const documentMatches = [];
|
|
519
808
|
console.log("\u{1F50D} DEBUG orama.data structure:", {
|
|
@@ -540,23 +829,39 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
540
829
|
dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
|
|
541
830
|
});
|
|
542
831
|
}
|
|
543
|
-
|
|
832
|
+
const cacheHits = tokenCache ? tokenCache.size : 0;
|
|
833
|
+
const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
|
|
834
|
+
console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
|
|
544
835
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
836
|
+
if (candidateIdSet) {
|
|
837
|
+
const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
|
|
838
|
+
if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
|
|
839
|
+
continue;
|
|
840
|
+
}
|
|
841
|
+
}
|
|
545
842
|
const text = doc[textProperty];
|
|
546
843
|
if (!text || typeof text !== "string") {
|
|
547
844
|
continue;
|
|
548
845
|
}
|
|
549
|
-
|
|
846
|
+
let docTokens;
|
|
847
|
+
if (tokenCache && tokenCache.has(docId)) {
|
|
848
|
+
docTokens = tokenCache.get(docId);
|
|
849
|
+
} else {
|
|
850
|
+
docTokens = tokenize2(text);
|
|
851
|
+
}
|
|
550
852
|
const phrases = findPhrasesInDocument(
|
|
551
853
|
docTokens,
|
|
552
854
|
filteredCandidates,
|
|
553
855
|
{
|
|
554
856
|
weights: state.config.weights,
|
|
555
857
|
maxGap: state.config.maxGap,
|
|
556
|
-
proximitySpanMultiplier: state.config.proximitySpanMultiplier
|
|
858
|
+
proximitySpanMultiplier: state.config.proximitySpanMultiplier,
|
|
859
|
+
tolerance
|
|
557
860
|
},
|
|
558
861
|
state.documentFrequency,
|
|
559
|
-
state.totalDocuments
|
|
862
|
+
state.totalDocuments,
|
|
863
|
+
queryTokens
|
|
864
|
+
// Original tokens with duplicates preserved
|
|
560
865
|
);
|
|
561
866
|
if (phrases.length > 0) {
|
|
562
867
|
const docScore = Math.max(...phrases.map((p) => p.score));
|
|
@@ -569,8 +874,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
569
874
|
}
|
|
570
875
|
}
|
|
571
876
|
documentMatches.sort((a, b) => b.score - a.score);
|
|
572
|
-
|
|
573
|
-
|
|
877
|
+
let filteredMatches = documentMatches;
|
|
878
|
+
if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
|
|
879
|
+
const threshold = state.config.finalScoreMinimum;
|
|
880
|
+
const beforeCount = filteredMatches.length;
|
|
881
|
+
filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
|
|
882
|
+
console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
|
|
883
|
+
}
|
|
884
|
+
const limit = params.limit ?? filteredMatches.length;
|
|
885
|
+
const limitedMatches = filteredMatches.slice(0, limit);
|
|
574
886
|
const hits = limitedMatches.map((match) => ({
|
|
575
887
|
id: match.id,
|
|
576
888
|
score: match.score,
|
|
@@ -621,20 +933,23 @@ function calculateDocumentFrequencies(docs, textProperty) {
|
|
|
621
933
|
if (!text || typeof text !== "string") {
|
|
622
934
|
continue;
|
|
623
935
|
}
|
|
624
|
-
const words = new Set(
|
|
936
|
+
const words = new Set(tokenize2(text));
|
|
625
937
|
for (const word of words) {
|
|
626
938
|
df.set(word, (df.get(word) || 0) + 1);
|
|
627
939
|
}
|
|
628
940
|
}
|
|
629
941
|
return df;
|
|
630
942
|
}
|
|
631
|
-
function
|
|
943
|
+
/**
 * Canonicalises text for matching: lower-cases and NFD-decomposes the input,
 * then applies a fixed sequence of substitutions and trims the result.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Normalised, single-spaced, trimmed text.
 */
function normalizeText2(text) {
  // Applied strictly in this order; later steps rely on earlier ones.
  const substitutions = [
    // Combining diacritical marks left behind by NFD decomposition.
    [/[\u0300-\u036f]/g, ""],
    // Single-letter elision prefix ("l'", "d'", ...) at a word boundary.
    [/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " "],
    // Any remaining apostrophe-like character.
    [/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, ""],
    // Curly double quotes become straight ones.
    [/[\u201c\u201d]/g, '"'],
    // Listed punctuation becomes a space.
    [/[.,;:!?()[\]{}\-—–«»""]/g, " "],
    // Collapse whitespace runs.
    [/\s+/g, " "]
  ];
  let result = text.toLowerCase().normalize("NFD");
  for (const [pattern, replacement] of substitutions) {
    result = result.replace(pattern, replacement);
  }
  return result.trim();
}
|
|
634
|
-
function
|
|
635
|
-
return
|
|
946
|
+
/**
 * Splits text into tokens: the input is passed through `normalizeText2`,
 * split on whitespace, and empty pieces are dropped.
 *
 * @param {string} text - Raw input text.
 * @returns {string[]} Non-empty normalised tokens, in input order.
 */
function tokenize2(text) {
  const normalized = normalizeText2(text);
  const pieces = normalized.split(/\s+/);
  return pieces.filter((piece) => piece.length > 0);
}
|
|
949
|
+
/**
 * Looks up the plugin state stored for an Orama instance in `pluginStates`.
 *
 * @param {object} orama - Instance used as the lookup key.
 * @returns {object|undefined} The stored state, or `undefined` when no entry
 *   exists for this instance.
 */
function getPluginState(orama) {
  const state = pluginStates.get(orama);
  return state;
}
|
|
637
952
|
|
|
638
|
-
export { pluginFuzzyPhrase, searchWithFuzzyPhrase };
|
|
953
|
+
export { createOptimizedSearch, getPluginState, normalizeText as normalizeTextOptimized, pluginFuzzyPhrase, searchWithFuzzyPhrase, searchWithQPSPruning, tokenize as tokenizeOptimized };
|
|
639
954
|
//# sourceMappingURL=out.js.map
|
|
640
955
|
//# sourceMappingURL=index.js.map
|