@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.2 → 3.1.16-custom.newbase.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +183 -84
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +42 -2
- package/dist/index.d.ts +42 -2
- package/dist/index.js +183 -84
- package/dist/index.js.map +1 -1
- package/package.json +62 -62
package/dist/index.js
CHANGED
|
@@ -48,9 +48,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
|
|
|
48
48
|
if (word === queryToken) {
|
|
49
49
|
return { matches: true, distance: 0, score: 1 };
|
|
50
50
|
}
|
|
51
|
-
if (word.startsWith(queryToken)) {
|
|
52
|
-
return { matches: true, distance: 0, score: 0.95 };
|
|
53
|
-
}
|
|
54
51
|
const result = boundedLevenshtein(word, queryToken, tolerance);
|
|
55
52
|
if (result.isBounded) {
|
|
56
53
|
const score = 1 - result.distance * 0.2;
|
|
@@ -180,24 +177,32 @@ function filterCandidatesByScore(candidatesMap, minScore) {
|
|
|
180
177
|
}
|
|
181
178
|
|
|
182
179
|
// src/scoring.ts
|
|
183
|
-
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
|
|
180
|
+
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, docPositions) {
|
|
184
181
|
const phrases = [];
|
|
185
|
-
const queryTokens =
|
|
182
|
+
const queryTokens = originalQueryTokens;
|
|
186
183
|
const wordMatches = [];
|
|
184
|
+
const candidateLookup = /* @__PURE__ */ new Map();
|
|
185
|
+
for (const [queryToken, candidates] of candidatesMap.entries()) {
|
|
186
|
+
for (const candidate of candidates) {
|
|
187
|
+
if (!candidateLookup.has(candidate.word)) {
|
|
188
|
+
candidateLookup.set(candidate.word, []);
|
|
189
|
+
}
|
|
190
|
+
candidateLookup.get(candidate.word).push({ queryToken, candidate });
|
|
191
|
+
}
|
|
192
|
+
}
|
|
187
193
|
for (let i = 0; i < documentTokens.length; i++) {
|
|
188
194
|
const docWord = documentTokens[i];
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
}
|
|
195
|
+
const matches = candidateLookup.get(docWord);
|
|
196
|
+
if (matches) {
|
|
197
|
+
for (const { queryToken, candidate } of matches) {
|
|
198
|
+
wordMatches.push({
|
|
199
|
+
word: docWord,
|
|
200
|
+
queryToken,
|
|
201
|
+
position: i,
|
|
202
|
+
type: candidate.type,
|
|
203
|
+
distance: candidate.distance,
|
|
204
|
+
score: candidate.score
|
|
205
|
+
});
|
|
201
206
|
}
|
|
202
207
|
}
|
|
203
208
|
}
|
|
@@ -209,47 +214,76 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
209
214
|
config,
|
|
210
215
|
documentFrequency,
|
|
211
216
|
totalDocuments,
|
|
212
|
-
wordMatches
|
|
213
|
-
|
|
217
|
+
wordMatches,
|
|
218
|
+
documentTokens
|
|
219
|
+
// Pass document tokens to extract gap words
|
|
214
220
|
);
|
|
215
221
|
if (phrase && phrase.words.length > 0) {
|
|
216
222
|
phrases.push(phrase);
|
|
217
223
|
}
|
|
218
224
|
}
|
|
219
|
-
|
|
225
|
+
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
|
|
226
|
+
const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
|
|
227
|
+
return deduplicatePhrases(filteredPhrases);
|
|
220
228
|
}
|
|
221
|
-
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
|
|
229
|
+
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
|
|
222
230
|
const startMatch = wordMatches[startIndex];
|
|
223
231
|
const phraseWords = [startMatch];
|
|
224
|
-
const
|
|
232
|
+
const queryTokenCounts = /* @__PURE__ */ new Map();
|
|
233
|
+
for (const token of queryTokens) {
|
|
234
|
+
queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
|
|
235
|
+
}
|
|
236
|
+
const matchedCounts = /* @__PURE__ */ new Map();
|
|
237
|
+
matchedCounts.set(startMatch.queryToken, 1);
|
|
238
|
+
const gapWords = [];
|
|
239
|
+
let totalGapUsed = 0;
|
|
240
|
+
let totalMatchedTokens = 1;
|
|
225
241
|
for (let i = startIndex + 1; i < wordMatches.length; i++) {
|
|
226
242
|
const match = wordMatches[i];
|
|
227
|
-
const
|
|
243
|
+
const lastPos = phraseWords[phraseWords.length - 1].position;
|
|
244
|
+
const gap = match.position - lastPos - 1;
|
|
228
245
|
if (gap > config.maxGap) {
|
|
229
246
|
break;
|
|
230
247
|
}
|
|
231
|
-
|
|
248
|
+
const neededCount = queryTokenCounts.get(match.queryToken) || 0;
|
|
249
|
+
const currentCount = matchedCounts.get(match.queryToken) || 0;
|
|
250
|
+
if (currentCount < neededCount) {
|
|
251
|
+
for (let pos = lastPos + 1; pos < match.position; pos++) {
|
|
252
|
+
totalGapUsed++;
|
|
253
|
+
gapWords.push({
|
|
254
|
+
word: documentTokens[pos],
|
|
255
|
+
position: pos,
|
|
256
|
+
gapIndex: totalGapUsed
|
|
257
|
+
});
|
|
258
|
+
}
|
|
232
259
|
phraseWords.push(match);
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
260
|
+
matchedCounts.set(match.queryToken, currentCount + 1);
|
|
261
|
+
totalMatchedTokens++;
|
|
262
|
+
if (totalMatchedTokens === queryTokens.length) {
|
|
263
|
+
break;
|
|
264
|
+
}
|
|
237
265
|
}
|
|
238
266
|
}
|
|
239
267
|
if (phraseWords.length > 0) {
|
|
268
|
+
const coverage = phraseWords.length / queryTokens.length;
|
|
269
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
240
270
|
const { score, breakdown } = calculatePhraseScore(
|
|
241
271
|
phraseWords,
|
|
242
272
|
queryTokens,
|
|
243
273
|
config,
|
|
244
274
|
documentFrequency,
|
|
245
275
|
totalDocuments,
|
|
246
|
-
allWordMatches
|
|
276
|
+
allWordMatches,
|
|
277
|
+
coverage
|
|
247
278
|
);
|
|
248
279
|
return {
|
|
249
280
|
words: phraseWords,
|
|
281
|
+
gapWords,
|
|
282
|
+
gapUsed: totalGapUsed,
|
|
283
|
+
coverage,
|
|
250
284
|
startPosition: phraseWords[0].position,
|
|
251
285
|
endPosition: phraseWords[phraseWords.length - 1].position,
|
|
252
|
-
|
|
286
|
+
span,
|
|
253
287
|
inOrder: isInOrder(phraseWords, queryTokens),
|
|
254
288
|
score,
|
|
255
289
|
scoreBreakdown: breakdown
|
|
@@ -257,7 +291,7 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
|
|
|
257
291
|
}
|
|
258
292
|
return null;
|
|
259
293
|
}
|
|
260
|
-
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
|
|
294
|
+
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
|
|
261
295
|
let baseScore = 0;
|
|
262
296
|
for (const word of phraseWords) {
|
|
263
297
|
const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
|
|
@@ -266,14 +300,16 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
266
300
|
baseScore /= phraseWords.length;
|
|
267
301
|
const inOrder = isInOrder(phraseWords, queryTokens);
|
|
268
302
|
const orderScore = inOrder ? 1 : 0.5;
|
|
269
|
-
|
|
270
|
-
|
|
303
|
+
let proximityScore = 0;
|
|
304
|
+
if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
|
|
305
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
306
|
+
const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
|
|
307
|
+
proximityScore = Math.max(0, 1 - span / proximityWindow);
|
|
308
|
+
}
|
|
271
309
|
let densityScore = 0;
|
|
272
310
|
if (queryTokens.length === 1) {
|
|
273
311
|
const totalOccurrences = allWordMatches.length;
|
|
274
|
-
densityScore = totalOccurrences /
|
|
275
|
-
} else {
|
|
276
|
-
densityScore = phraseWords.length / queryTokens.length;
|
|
312
|
+
densityScore = Math.min(1, totalOccurrences / 10);
|
|
277
313
|
}
|
|
278
314
|
const semanticScore = calculateSemanticScore(
|
|
279
315
|
phraseWords,
|
|
@@ -287,8 +323,13 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
287
323
|
const weightedDensity = densityScore * weights.density;
|
|
288
324
|
const weightedSemantic = semanticScore * weights.semantic;
|
|
289
325
|
const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
|
|
290
|
-
const
|
|
291
|
-
const
|
|
326
|
+
const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
|
|
327
|
+
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
|
|
328
|
+
const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
|
|
329
|
+
const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
|
|
330
|
+
const normalizedScore = totalScore / maxPossibleScore;
|
|
331
|
+
const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
|
|
332
|
+
const score = normalizedScore * coverageMultiplier;
|
|
292
333
|
const base = weightedBase / maxPossibleScore;
|
|
293
334
|
const order = weightedOrder / maxPossibleScore;
|
|
294
335
|
const proximity = weightedProximity / maxPossibleScore;
|
|
@@ -301,18 +342,27 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
301
342
|
order,
|
|
302
343
|
proximity,
|
|
303
344
|
density,
|
|
304
|
-
semantic
|
|
345
|
+
semantic,
|
|
346
|
+
coverage: coverageMultiplier
|
|
347
|
+
// Show coverage multiplier in breakdown
|
|
305
348
|
}
|
|
306
349
|
};
|
|
307
350
|
}
|
|
308
351
|
function isInOrder(phraseWords, queryTokens) {
|
|
309
|
-
const
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
352
|
+
const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
|
|
353
|
+
let lastMatchedIndex = -1;
|
|
354
|
+
for (const phraseWord of phraseWords) {
|
|
355
|
+
let foundIndex = -1;
|
|
356
|
+
for (const pos of tokenPositions) {
|
|
357
|
+
if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
|
|
358
|
+
foundIndex = pos.index;
|
|
359
|
+
break;
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
if (foundIndex === -1) {
|
|
314
363
|
return false;
|
|
315
364
|
}
|
|
365
|
+
lastMatchedIndex = foundIndex;
|
|
316
366
|
}
|
|
317
367
|
return true;
|
|
318
368
|
}
|
|
@@ -355,7 +405,8 @@ function deduplicatePhrases(phrases) {
|
|
|
355
405
|
|
|
356
406
|
// src/index.ts
|
|
357
407
|
var DEFAULT_CONFIG = {
|
|
358
|
-
textProperty: "
|
|
408
|
+
textProperty: "normalized_content",
|
|
409
|
+
// Must match server's field name
|
|
359
410
|
tolerance: 1,
|
|
360
411
|
adaptiveTolerance: true,
|
|
361
412
|
enableSynonyms: false,
|
|
@@ -370,7 +421,10 @@ var DEFAULT_CONFIG = {
|
|
|
370
421
|
semantic: 0.15
|
|
371
422
|
},
|
|
372
423
|
maxGap: 5,
|
|
373
|
-
minScore: 0.1
|
|
424
|
+
minScore: 0.1,
|
|
425
|
+
enableFinalScoreMinimum: false,
|
|
426
|
+
finalScoreMinimum: 0.3,
|
|
427
|
+
proximitySpanMultiplier: 5
|
|
374
428
|
};
|
|
375
429
|
var pluginStates = /* @__PURE__ */ new WeakMap();
|
|
376
430
|
function pluginFuzzyPhrase(userConfig = {}) {
|
|
@@ -390,7 +444,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
390
444
|
semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
|
|
391
445
|
},
|
|
392
446
|
maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
|
|
393
|
-
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
|
|
447
|
+
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
|
|
448
|
+
enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
|
|
449
|
+
finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
|
|
450
|
+
proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
|
|
394
451
|
};
|
|
395
452
|
const plugin = {
|
|
396
453
|
name: "fuzzy-phrase",
|
|
@@ -403,7 +460,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
403
460
|
synonymMap: {},
|
|
404
461
|
config,
|
|
405
462
|
documentFrequency: /* @__PURE__ */ new Map(),
|
|
406
|
-
totalDocuments: 0
|
|
463
|
+
totalDocuments: 0,
|
|
464
|
+
vocabulary: /* @__PURE__ */ new Set()
|
|
407
465
|
};
|
|
408
466
|
if (config.enableSynonyms && config.supabase) {
|
|
409
467
|
try {
|
|
@@ -420,6 +478,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
420
478
|
state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
|
|
421
479
|
console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
|
|
422
480
|
}
|
|
481
|
+
try {
|
|
482
|
+
const indexData = orama.data?.index;
|
|
483
|
+
let radixNode = null;
|
|
484
|
+
if (indexData?.indexes?.[config.textProperty]?.node) {
|
|
485
|
+
radixNode = indexData.indexes[config.textProperty].node;
|
|
486
|
+
} else if (indexData?.[config.textProperty]?.node) {
|
|
487
|
+
radixNode = indexData[config.textProperty].node;
|
|
488
|
+
}
|
|
489
|
+
if (radixNode) {
|
|
490
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
491
|
+
console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
|
|
492
|
+
} else {
|
|
493
|
+
console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
|
|
494
|
+
}
|
|
495
|
+
} catch (error) {
|
|
496
|
+
console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
|
|
497
|
+
}
|
|
423
498
|
pluginStates.set(orama, state);
|
|
424
499
|
console.log("\u2705 Fuzzy Phrase Plugin initialized");
|
|
425
500
|
setImmediate(() => {
|
|
@@ -441,7 +516,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
441
516
|
console.error("\u274C Plugin state not initialized");
|
|
442
517
|
throw new Error("Fuzzy Phrase Plugin not properly initialized");
|
|
443
518
|
}
|
|
444
|
-
const { term, properties } = params;
|
|
519
|
+
const { term, properties, tokenCache } = params;
|
|
445
520
|
if (!term || typeof term !== "string") {
|
|
446
521
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
447
522
|
}
|
|
@@ -452,32 +527,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
452
527
|
}
|
|
453
528
|
const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
|
|
454
529
|
console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
|
|
455
|
-
let vocabulary;
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
530
|
+
let vocabulary = state.vocabulary;
|
|
531
|
+
if (vocabulary.size === 0) {
|
|
532
|
+
console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
|
|
533
|
+
try {
|
|
534
|
+
const indexData = orama.data?.index;
|
|
535
|
+
let radixNode = null;
|
|
536
|
+
if (indexData?.indexes?.[textProperty]?.node) {
|
|
537
|
+
radixNode = indexData.indexes[textProperty].node;
|
|
538
|
+
} else if (indexData?.[textProperty]?.node) {
|
|
539
|
+
radixNode = indexData[textProperty].node;
|
|
540
|
+
}
|
|
541
|
+
if (radixNode) {
|
|
542
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
543
|
+
vocabulary = state.vocabulary;
|
|
544
|
+
console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
|
|
545
|
+
} else {
|
|
546
|
+
console.error("\u274C Radix tree not found for vocabulary extraction");
|
|
547
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
548
|
+
}
|
|
549
|
+
} catch (error) {
|
|
550
|
+
console.error("\u274C Failed to extract vocabulary:", error);
|
|
474
551
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
475
552
|
}
|
|
476
|
-
|
|
477
|
-
console.log(`\u{1F4DA}
|
|
478
|
-
} catch (error) {
|
|
479
|
-
console.error("\u274C Failed to extract vocabulary:", error);
|
|
480
|
-
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
553
|
+
} else {
|
|
554
|
+
console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
|
|
481
555
|
}
|
|
482
556
|
const candidatesMap = findAllCandidates(
|
|
483
557
|
queryTokens,
|
|
@@ -486,10 +560,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
486
560
|
state.config.enableSynonyms ? state.synonymMap : void 0,
|
|
487
561
|
state.config.synonymMatchScore
|
|
488
562
|
);
|
|
489
|
-
const filteredCandidates = filterCandidatesByScore(
|
|
490
|
-
candidatesMap,
|
|
491
|
-
state.config.minScore
|
|
492
|
-
);
|
|
563
|
+
const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
|
|
493
564
|
console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
|
|
494
565
|
const documentMatches = [];
|
|
495
566
|
console.log("\u{1F50D} DEBUG orama.data structure:", {
|
|
@@ -516,23 +587,44 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
516
587
|
dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
|
|
517
588
|
});
|
|
518
589
|
}
|
|
519
|
-
|
|
590
|
+
const cacheHits = tokenCache ? tokenCache.size : 0;
|
|
591
|
+
let hasPositionalIndex = false;
|
|
592
|
+
if (tokenCache && tokenCache.size > 0) {
|
|
593
|
+
const firstEntry = tokenCache.values().next().value;
|
|
594
|
+
hasPositionalIndex = !!(firstEntry && !Array.isArray(firstEntry) && firstEntry.positions);
|
|
595
|
+
}
|
|
596
|
+
console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${hasPositionalIndex ? "\u26A1 positional index" : cacheHits > 0 ? "tokens cached" : "no cache"})`);
|
|
520
597
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
521
598
|
const text = doc[textProperty];
|
|
522
599
|
if (!text || typeof text !== "string") {
|
|
523
600
|
continue;
|
|
524
601
|
}
|
|
525
|
-
|
|
602
|
+
let docTokens;
|
|
603
|
+
if (tokenCache && tokenCache.has(docId)) {
|
|
604
|
+
const cached = tokenCache.get(docId);
|
|
605
|
+
if (Array.isArray(cached)) {
|
|
606
|
+
docTokens = cached;
|
|
607
|
+
} else if (cached.tokens && cached.positions) {
|
|
608
|
+
docTokens = cached.tokens;
|
|
609
|
+
cached.positions;
|
|
610
|
+
} else {
|
|
611
|
+
docTokens = tokenize(text);
|
|
612
|
+
}
|
|
613
|
+
} else {
|
|
614
|
+
docTokens = tokenize(text);
|
|
615
|
+
}
|
|
526
616
|
const phrases = findPhrasesInDocument(
|
|
527
617
|
docTokens,
|
|
528
618
|
filteredCandidates,
|
|
529
619
|
{
|
|
530
620
|
weights: state.config.weights,
|
|
531
|
-
maxGap: state.config.maxGap
|
|
621
|
+
maxGap: state.config.maxGap,
|
|
622
|
+
proximitySpanMultiplier: state.config.proximitySpanMultiplier,
|
|
623
|
+
tolerance
|
|
532
624
|
},
|
|
533
625
|
state.documentFrequency,
|
|
534
|
-
state.totalDocuments
|
|
535
|
-
|
|
626
|
+
state.totalDocuments,
|
|
627
|
+
queryTokens);
|
|
536
628
|
if (phrases.length > 0) {
|
|
537
629
|
const docScore = Math.max(...phrases.map((p) => p.score));
|
|
538
630
|
documentMatches.push({
|
|
@@ -544,8 +636,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
544
636
|
}
|
|
545
637
|
}
|
|
546
638
|
documentMatches.sort((a, b) => b.score - a.score);
|
|
547
|
-
|
|
548
|
-
|
|
639
|
+
let filteredMatches = documentMatches;
|
|
640
|
+
if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
|
|
641
|
+
const threshold = state.config.finalScoreMinimum;
|
|
642
|
+
const beforeCount = filteredMatches.length;
|
|
643
|
+
filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
|
|
644
|
+
console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
|
|
645
|
+
}
|
|
646
|
+
const limit = params.limit ?? filteredMatches.length;
|
|
647
|
+
const limitedMatches = filteredMatches.slice(0, limit);
|
|
549
648
|
const hits = limitedMatches.map((match) => ({
|
|
550
649
|
id: match.id,
|
|
551
650
|
score: match.score,
|