@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.2 → 3.1.16-custom.newbase.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +183 -84
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +42 -2
- package/dist/index.d.ts +42 -2
- package/dist/index.js +183 -84
- package/dist/index.js.map +1 -1
- package/package.json +62 -62
package/dist/index.cjs
CHANGED
|
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
|
|
|
50
50
|
if (word === queryToken) {
|
|
51
51
|
return { matches: true, distance: 0, score: 1 };
|
|
52
52
|
}
|
|
53
|
-
if (word.startsWith(queryToken)) {
|
|
54
|
-
return { matches: true, distance: 0, score: 0.95 };
|
|
55
|
-
}
|
|
56
53
|
const result = boundedLevenshtein(word, queryToken, tolerance);
|
|
57
54
|
if (result.isBounded) {
|
|
58
55
|
const score = 1 - result.distance * 0.2;
|
|
@@ -182,24 +179,32 @@ function filterCandidatesByScore(candidatesMap, minScore) {
|
|
|
182
179
|
}
|
|
183
180
|
|
|
184
181
|
// src/scoring.ts
|
|
185
|
-
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
|
|
182
|
+
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, docPositions) {
|
|
186
183
|
const phrases = [];
|
|
187
|
-
const queryTokens =
|
|
184
|
+
const queryTokens = originalQueryTokens;
|
|
188
185
|
const wordMatches = [];
|
|
186
|
+
const candidateLookup = /* @__PURE__ */ new Map();
|
|
187
|
+
for (const [queryToken, candidates] of candidatesMap.entries()) {
|
|
188
|
+
for (const candidate of candidates) {
|
|
189
|
+
if (!candidateLookup.has(candidate.word)) {
|
|
190
|
+
candidateLookup.set(candidate.word, []);
|
|
191
|
+
}
|
|
192
|
+
candidateLookup.get(candidate.word).push({ queryToken, candidate });
|
|
193
|
+
}
|
|
194
|
+
}
|
|
189
195
|
for (let i = 0; i < documentTokens.length; i++) {
|
|
190
196
|
const docWord = documentTokens[i];
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
}
|
|
197
|
+
const matches = candidateLookup.get(docWord);
|
|
198
|
+
if (matches) {
|
|
199
|
+
for (const { queryToken, candidate } of matches) {
|
|
200
|
+
wordMatches.push({
|
|
201
|
+
word: docWord,
|
|
202
|
+
queryToken,
|
|
203
|
+
position: i,
|
|
204
|
+
type: candidate.type,
|
|
205
|
+
distance: candidate.distance,
|
|
206
|
+
score: candidate.score
|
|
207
|
+
});
|
|
203
208
|
}
|
|
204
209
|
}
|
|
205
210
|
}
|
|
@@ -211,47 +216,76 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
211
216
|
config,
|
|
212
217
|
documentFrequency,
|
|
213
218
|
totalDocuments,
|
|
214
|
-
wordMatches
|
|
215
|
-
|
|
219
|
+
wordMatches,
|
|
220
|
+
documentTokens
|
|
221
|
+
// Pass document tokens to extract gap words
|
|
216
222
|
);
|
|
217
223
|
if (phrase && phrase.words.length > 0) {
|
|
218
224
|
phrases.push(phrase);
|
|
219
225
|
}
|
|
220
226
|
}
|
|
221
|
-
|
|
227
|
+
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
|
|
228
|
+
const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
|
|
229
|
+
return deduplicatePhrases(filteredPhrases);
|
|
222
230
|
}
|
|
223
|
-
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
|
|
231
|
+
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
|
|
224
232
|
const startMatch = wordMatches[startIndex];
|
|
225
233
|
const phraseWords = [startMatch];
|
|
226
|
-
const
|
|
234
|
+
const queryTokenCounts = /* @__PURE__ */ new Map();
|
|
235
|
+
for (const token of queryTokens) {
|
|
236
|
+
queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
|
|
237
|
+
}
|
|
238
|
+
const matchedCounts = /* @__PURE__ */ new Map();
|
|
239
|
+
matchedCounts.set(startMatch.queryToken, 1);
|
|
240
|
+
const gapWords = [];
|
|
241
|
+
let totalGapUsed = 0;
|
|
242
|
+
let totalMatchedTokens = 1;
|
|
227
243
|
for (let i = startIndex + 1; i < wordMatches.length; i++) {
|
|
228
244
|
const match = wordMatches[i];
|
|
229
|
-
const
|
|
245
|
+
const lastPos = phraseWords[phraseWords.length - 1].position;
|
|
246
|
+
const gap = match.position - lastPos - 1;
|
|
230
247
|
if (gap > config.maxGap) {
|
|
231
248
|
break;
|
|
232
249
|
}
|
|
233
|
-
|
|
250
|
+
const neededCount = queryTokenCounts.get(match.queryToken) || 0;
|
|
251
|
+
const currentCount = matchedCounts.get(match.queryToken) || 0;
|
|
252
|
+
if (currentCount < neededCount) {
|
|
253
|
+
for (let pos = lastPos + 1; pos < match.position; pos++) {
|
|
254
|
+
totalGapUsed++;
|
|
255
|
+
gapWords.push({
|
|
256
|
+
word: documentTokens[pos],
|
|
257
|
+
position: pos,
|
|
258
|
+
gapIndex: totalGapUsed
|
|
259
|
+
});
|
|
260
|
+
}
|
|
234
261
|
phraseWords.push(match);
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
262
|
+
matchedCounts.set(match.queryToken, currentCount + 1);
|
|
263
|
+
totalMatchedTokens++;
|
|
264
|
+
if (totalMatchedTokens === queryTokens.length) {
|
|
265
|
+
break;
|
|
266
|
+
}
|
|
239
267
|
}
|
|
240
268
|
}
|
|
241
269
|
if (phraseWords.length > 0) {
|
|
270
|
+
const coverage = phraseWords.length / queryTokens.length;
|
|
271
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
242
272
|
const { score, breakdown } = calculatePhraseScore(
|
|
243
273
|
phraseWords,
|
|
244
274
|
queryTokens,
|
|
245
275
|
config,
|
|
246
276
|
documentFrequency,
|
|
247
277
|
totalDocuments,
|
|
248
|
-
allWordMatches
|
|
278
|
+
allWordMatches,
|
|
279
|
+
coverage
|
|
249
280
|
);
|
|
250
281
|
return {
|
|
251
282
|
words: phraseWords,
|
|
283
|
+
gapWords,
|
|
284
|
+
gapUsed: totalGapUsed,
|
|
285
|
+
coverage,
|
|
252
286
|
startPosition: phraseWords[0].position,
|
|
253
287
|
endPosition: phraseWords[phraseWords.length - 1].position,
|
|
254
|
-
|
|
288
|
+
span,
|
|
255
289
|
inOrder: isInOrder(phraseWords, queryTokens),
|
|
256
290
|
score,
|
|
257
291
|
scoreBreakdown: breakdown
|
|
@@ -259,7 +293,7 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
|
|
|
259
293
|
}
|
|
260
294
|
return null;
|
|
261
295
|
}
|
|
262
|
-
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
|
|
296
|
+
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
|
|
263
297
|
let baseScore = 0;
|
|
264
298
|
for (const word of phraseWords) {
|
|
265
299
|
const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
|
|
@@ -268,14 +302,16 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
268
302
|
baseScore /= phraseWords.length;
|
|
269
303
|
const inOrder = isInOrder(phraseWords, queryTokens);
|
|
270
304
|
const orderScore = inOrder ? 1 : 0.5;
|
|
271
|
-
|
|
272
|
-
|
|
305
|
+
let proximityScore = 0;
|
|
306
|
+
if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
|
|
307
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
308
|
+
const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
|
|
309
|
+
proximityScore = Math.max(0, 1 - span / proximityWindow);
|
|
310
|
+
}
|
|
273
311
|
let densityScore = 0;
|
|
274
312
|
if (queryTokens.length === 1) {
|
|
275
313
|
const totalOccurrences = allWordMatches.length;
|
|
276
|
-
densityScore = totalOccurrences /
|
|
277
|
-
} else {
|
|
278
|
-
densityScore = phraseWords.length / queryTokens.length;
|
|
314
|
+
densityScore = Math.min(1, totalOccurrences / 10);
|
|
279
315
|
}
|
|
280
316
|
const semanticScore = calculateSemanticScore(
|
|
281
317
|
phraseWords,
|
|
@@ -289,8 +325,13 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
289
325
|
const weightedDensity = densityScore * weights.density;
|
|
290
326
|
const weightedSemantic = semanticScore * weights.semantic;
|
|
291
327
|
const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
|
|
292
|
-
const
|
|
293
|
-
const
|
|
328
|
+
const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
|
|
329
|
+
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
|
|
330
|
+
const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
|
|
331
|
+
const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
|
|
332
|
+
const normalizedScore = totalScore / maxPossibleScore;
|
|
333
|
+
const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
|
|
334
|
+
const score = normalizedScore * coverageMultiplier;
|
|
294
335
|
const base = weightedBase / maxPossibleScore;
|
|
295
336
|
const order = weightedOrder / maxPossibleScore;
|
|
296
337
|
const proximity = weightedProximity / maxPossibleScore;
|
|
@@ -303,18 +344,27 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
303
344
|
order,
|
|
304
345
|
proximity,
|
|
305
346
|
density,
|
|
306
|
-
semantic
|
|
347
|
+
semantic,
|
|
348
|
+
coverage: coverageMultiplier
|
|
349
|
+
// Show coverage multiplier in breakdown
|
|
307
350
|
}
|
|
308
351
|
};
|
|
309
352
|
}
|
|
310
353
|
function isInOrder(phraseWords, queryTokens) {
|
|
311
|
-
const
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
354
|
+
const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
|
|
355
|
+
let lastMatchedIndex = -1;
|
|
356
|
+
for (const phraseWord of phraseWords) {
|
|
357
|
+
let foundIndex = -1;
|
|
358
|
+
for (const pos of tokenPositions) {
|
|
359
|
+
if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
|
|
360
|
+
foundIndex = pos.index;
|
|
361
|
+
break;
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
if (foundIndex === -1) {
|
|
316
365
|
return false;
|
|
317
366
|
}
|
|
367
|
+
lastMatchedIndex = foundIndex;
|
|
318
368
|
}
|
|
319
369
|
return true;
|
|
320
370
|
}
|
|
@@ -357,7 +407,8 @@ function deduplicatePhrases(phrases) {
|
|
|
357
407
|
|
|
358
408
|
// src/index.ts
|
|
359
409
|
var DEFAULT_CONFIG = {
|
|
360
|
-
textProperty: "
|
|
410
|
+
textProperty: "normalized_content",
|
|
411
|
+
// Must match server's field name
|
|
361
412
|
tolerance: 1,
|
|
362
413
|
adaptiveTolerance: true,
|
|
363
414
|
enableSynonyms: false,
|
|
@@ -372,7 +423,10 @@ var DEFAULT_CONFIG = {
|
|
|
372
423
|
semantic: 0.15
|
|
373
424
|
},
|
|
374
425
|
maxGap: 5,
|
|
375
|
-
minScore: 0.1
|
|
426
|
+
minScore: 0.1,
|
|
427
|
+
enableFinalScoreMinimum: false,
|
|
428
|
+
finalScoreMinimum: 0.3,
|
|
429
|
+
proximitySpanMultiplier: 5
|
|
376
430
|
};
|
|
377
431
|
var pluginStates = /* @__PURE__ */ new WeakMap();
|
|
378
432
|
function pluginFuzzyPhrase(userConfig = {}) {
|
|
@@ -392,7 +446,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
392
446
|
semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
|
|
393
447
|
},
|
|
394
448
|
maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
|
|
395
|
-
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
|
|
449
|
+
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
|
|
450
|
+
enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
|
|
451
|
+
finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
|
|
452
|
+
proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
|
|
396
453
|
};
|
|
397
454
|
const plugin = {
|
|
398
455
|
name: "fuzzy-phrase",
|
|
@@ -405,7 +462,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
405
462
|
synonymMap: {},
|
|
406
463
|
config,
|
|
407
464
|
documentFrequency: /* @__PURE__ */ new Map(),
|
|
408
|
-
totalDocuments: 0
|
|
465
|
+
totalDocuments: 0,
|
|
466
|
+
vocabulary: /* @__PURE__ */ new Set()
|
|
409
467
|
};
|
|
410
468
|
if (config.enableSynonyms && config.supabase) {
|
|
411
469
|
try {
|
|
@@ -422,6 +480,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
422
480
|
state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
|
|
423
481
|
console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
|
|
424
482
|
}
|
|
483
|
+
try {
|
|
484
|
+
const indexData = orama.data?.index;
|
|
485
|
+
let radixNode = null;
|
|
486
|
+
if (indexData?.indexes?.[config.textProperty]?.node) {
|
|
487
|
+
radixNode = indexData.indexes[config.textProperty].node;
|
|
488
|
+
} else if (indexData?.[config.textProperty]?.node) {
|
|
489
|
+
radixNode = indexData[config.textProperty].node;
|
|
490
|
+
}
|
|
491
|
+
if (radixNode) {
|
|
492
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
493
|
+
console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
|
|
494
|
+
} else {
|
|
495
|
+
console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
|
|
496
|
+
}
|
|
497
|
+
} catch (error) {
|
|
498
|
+
console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
|
|
499
|
+
}
|
|
425
500
|
pluginStates.set(orama, state);
|
|
426
501
|
console.log("\u2705 Fuzzy Phrase Plugin initialized");
|
|
427
502
|
setImmediate(() => {
|
|
@@ -443,7 +518,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
443
518
|
console.error("\u274C Plugin state not initialized");
|
|
444
519
|
throw new Error("Fuzzy Phrase Plugin not properly initialized");
|
|
445
520
|
}
|
|
446
|
-
const { term, properties } = params;
|
|
521
|
+
const { term, properties, tokenCache } = params;
|
|
447
522
|
if (!term || typeof term !== "string") {
|
|
448
523
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
449
524
|
}
|
|
@@ -454,32 +529,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
454
529
|
}
|
|
455
530
|
const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
|
|
456
531
|
console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
|
|
457
|
-
let vocabulary;
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
532
|
+
let vocabulary = state.vocabulary;
|
|
533
|
+
if (vocabulary.size === 0) {
|
|
534
|
+
console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
|
|
535
|
+
try {
|
|
536
|
+
const indexData = orama.data?.index;
|
|
537
|
+
let radixNode = null;
|
|
538
|
+
if (indexData?.indexes?.[textProperty]?.node) {
|
|
539
|
+
radixNode = indexData.indexes[textProperty].node;
|
|
540
|
+
} else if (indexData?.[textProperty]?.node) {
|
|
541
|
+
radixNode = indexData[textProperty].node;
|
|
542
|
+
}
|
|
543
|
+
if (radixNode) {
|
|
544
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
545
|
+
vocabulary = state.vocabulary;
|
|
546
|
+
console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
|
|
547
|
+
} else {
|
|
548
|
+
console.error("\u274C Radix tree not found for vocabulary extraction");
|
|
549
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
550
|
+
}
|
|
551
|
+
} catch (error) {
|
|
552
|
+
console.error("\u274C Failed to extract vocabulary:", error);
|
|
476
553
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
477
554
|
}
|
|
478
|
-
|
|
479
|
-
console.log(`\u{1F4DA}
|
|
480
|
-
} catch (error) {
|
|
481
|
-
console.error("\u274C Failed to extract vocabulary:", error);
|
|
482
|
-
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
555
|
+
} else {
|
|
556
|
+
console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
|
|
483
557
|
}
|
|
484
558
|
const candidatesMap = findAllCandidates(
|
|
485
559
|
queryTokens,
|
|
@@ -488,10 +562,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
488
562
|
state.config.enableSynonyms ? state.synonymMap : void 0,
|
|
489
563
|
state.config.synonymMatchScore
|
|
490
564
|
);
|
|
491
|
-
const filteredCandidates = filterCandidatesByScore(
|
|
492
|
-
candidatesMap,
|
|
493
|
-
state.config.minScore
|
|
494
|
-
);
|
|
565
|
+
const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
|
|
495
566
|
console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
|
|
496
567
|
const documentMatches = [];
|
|
497
568
|
console.log("\u{1F50D} DEBUG orama.data structure:", {
|
|
@@ -518,23 +589,44 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
518
589
|
dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
|
|
519
590
|
});
|
|
520
591
|
}
|
|
521
|
-
|
|
592
|
+
const cacheHits = tokenCache ? tokenCache.size : 0;
|
|
593
|
+
let hasPositionalIndex = false;
|
|
594
|
+
if (tokenCache && tokenCache.size > 0) {
|
|
595
|
+
const firstEntry = tokenCache.values().next().value;
|
|
596
|
+
hasPositionalIndex = !!(firstEntry && !Array.isArray(firstEntry) && firstEntry.positions);
|
|
597
|
+
}
|
|
598
|
+
console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${hasPositionalIndex ? "\u26A1 positional index" : cacheHits > 0 ? "tokens cached" : "no cache"})`);
|
|
522
599
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
523
600
|
const text = doc[textProperty];
|
|
524
601
|
if (!text || typeof text !== "string") {
|
|
525
602
|
continue;
|
|
526
603
|
}
|
|
527
|
-
|
|
604
|
+
let docTokens;
|
|
605
|
+
if (tokenCache && tokenCache.has(docId)) {
|
|
606
|
+
const cached = tokenCache.get(docId);
|
|
607
|
+
if (Array.isArray(cached)) {
|
|
608
|
+
docTokens = cached;
|
|
609
|
+
} else if (cached.tokens && cached.positions) {
|
|
610
|
+
docTokens = cached.tokens;
|
|
611
|
+
cached.positions;
|
|
612
|
+
} else {
|
|
613
|
+
docTokens = tokenize(text);
|
|
614
|
+
}
|
|
615
|
+
} else {
|
|
616
|
+
docTokens = tokenize(text);
|
|
617
|
+
}
|
|
528
618
|
const phrases = findPhrasesInDocument(
|
|
529
619
|
docTokens,
|
|
530
620
|
filteredCandidates,
|
|
531
621
|
{
|
|
532
622
|
weights: state.config.weights,
|
|
533
|
-
maxGap: state.config.maxGap
|
|
623
|
+
maxGap: state.config.maxGap,
|
|
624
|
+
proximitySpanMultiplier: state.config.proximitySpanMultiplier,
|
|
625
|
+
tolerance
|
|
534
626
|
},
|
|
535
627
|
state.documentFrequency,
|
|
536
|
-
state.totalDocuments
|
|
537
|
-
|
|
628
|
+
state.totalDocuments,
|
|
629
|
+
queryTokens);
|
|
538
630
|
if (phrases.length > 0) {
|
|
539
631
|
const docScore = Math.max(...phrases.map((p) => p.score));
|
|
540
632
|
documentMatches.push({
|
|
@@ -546,8 +638,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
546
638
|
}
|
|
547
639
|
}
|
|
548
640
|
documentMatches.sort((a, b) => b.score - a.score);
|
|
549
|
-
|
|
550
|
-
|
|
641
|
+
let filteredMatches = documentMatches;
|
|
642
|
+
if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
|
|
643
|
+
const threshold = state.config.finalScoreMinimum;
|
|
644
|
+
const beforeCount = filteredMatches.length;
|
|
645
|
+
filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
|
|
646
|
+
console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
|
|
647
|
+
}
|
|
648
|
+
const limit = params.limit ?? filteredMatches.length;
|
|
649
|
+
const limitedMatches = filteredMatches.slice(0, limit);
|
|
551
650
|
const hits = limitedMatches.map((match) => ({
|
|
552
651
|
id: match.id,
|
|
553
652
|
score: match.score,
|