@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.2 → 3.1.16-custom.newbase.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +208 -82
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +42 -2
- package/dist/index.d.ts +42 -2
- package/dist/index.js +208 -82
- package/dist/index.js.map +1 -1
- package/package.json +62 -62
package/dist/index.cjs
CHANGED
|
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
|
|
|
50
50
|
if (word === queryToken) {
|
|
51
51
|
return { matches: true, distance: 0, score: 1 };
|
|
52
52
|
}
|
|
53
|
-
if (word.startsWith(queryToken)) {
|
|
54
|
-
return { matches: true, distance: 0, score: 0.95 };
|
|
55
|
-
}
|
|
56
53
|
const result = boundedLevenshtein(word, queryToken, tolerance);
|
|
57
54
|
if (result.isBounded) {
|
|
58
55
|
const score = 1 - result.distance * 0.2;
|
|
@@ -182,23 +179,53 @@ function filterCandidatesByScore(candidatesMap, minScore) {
|
|
|
182
179
|
}
|
|
183
180
|
|
|
184
181
|
// src/scoring.ts
|
|
185
|
-
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
|
|
182
|
+
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, docPositions) {
|
|
186
183
|
const phrases = [];
|
|
187
|
-
const queryTokens =
|
|
184
|
+
const queryTokens = originalQueryTokens;
|
|
188
185
|
const wordMatches = [];
|
|
189
|
-
|
|
190
|
-
const
|
|
186
|
+
if (docPositions) {
|
|
187
|
+
const candidateLookup = /* @__PURE__ */ new Map();
|
|
191
188
|
for (const [queryToken, candidates] of candidatesMap.entries()) {
|
|
192
189
|
for (const candidate of candidates) {
|
|
193
|
-
if (candidate.word
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
190
|
+
if (!candidateLookup.has(candidate.word)) {
|
|
191
|
+
candidateLookup.set(candidate.word, []);
|
|
192
|
+
}
|
|
193
|
+
candidateLookup.get(candidate.word).push({ queryToken, candidate });
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
for (const [docWord, positions] of Object.entries(docPositions)) {
|
|
197
|
+
const matches = candidateLookup.get(docWord);
|
|
198
|
+
if (matches) {
|
|
199
|
+
for (const { queryToken, candidate } of matches) {
|
|
200
|
+
for (const position of positions) {
|
|
201
|
+
wordMatches.push({
|
|
202
|
+
word: docWord,
|
|
203
|
+
queryToken,
|
|
204
|
+
position,
|
|
205
|
+
type: candidate.type,
|
|
206
|
+
distance: candidate.distance,
|
|
207
|
+
score: candidate.score
|
|
208
|
+
});
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
wordMatches.sort((a, b) => a.position - b.position);
|
|
214
|
+
} else {
|
|
215
|
+
for (let i = 0; i < documentTokens.length; i++) {
|
|
216
|
+
const docWord = documentTokens[i];
|
|
217
|
+
for (const [queryToken, candidates] of candidatesMap.entries()) {
|
|
218
|
+
for (const candidate of candidates) {
|
|
219
|
+
if (candidate.word === docWord) {
|
|
220
|
+
wordMatches.push({
|
|
221
|
+
word: docWord,
|
|
222
|
+
queryToken,
|
|
223
|
+
position: i,
|
|
224
|
+
type: candidate.type,
|
|
225
|
+
distance: candidate.distance,
|
|
226
|
+
score: candidate.score
|
|
227
|
+
});
|
|
228
|
+
}
|
|
202
229
|
}
|
|
203
230
|
}
|
|
204
231
|
}
|
|
@@ -211,47 +238,76 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
211
238
|
config,
|
|
212
239
|
documentFrequency,
|
|
213
240
|
totalDocuments,
|
|
214
|
-
wordMatches
|
|
215
|
-
|
|
241
|
+
wordMatches,
|
|
242
|
+
documentTokens
|
|
243
|
+
// Pass document tokens to extract gap words
|
|
216
244
|
);
|
|
217
245
|
if (phrase && phrase.words.length > 0) {
|
|
218
246
|
phrases.push(phrase);
|
|
219
247
|
}
|
|
220
248
|
}
|
|
221
|
-
|
|
249
|
+
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
|
|
250
|
+
const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
|
|
251
|
+
return deduplicatePhrases(filteredPhrases);
|
|
222
252
|
}
|
|
223
|
-
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
|
|
253
|
+
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
|
|
224
254
|
const startMatch = wordMatches[startIndex];
|
|
225
255
|
const phraseWords = [startMatch];
|
|
226
|
-
const
|
|
256
|
+
const queryTokenCounts = /* @__PURE__ */ new Map();
|
|
257
|
+
for (const token of queryTokens) {
|
|
258
|
+
queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
|
|
259
|
+
}
|
|
260
|
+
const matchedCounts = /* @__PURE__ */ new Map();
|
|
261
|
+
matchedCounts.set(startMatch.queryToken, 1);
|
|
262
|
+
const gapWords = [];
|
|
263
|
+
let totalGapUsed = 0;
|
|
264
|
+
let totalMatchedTokens = 1;
|
|
227
265
|
for (let i = startIndex + 1; i < wordMatches.length; i++) {
|
|
228
266
|
const match = wordMatches[i];
|
|
229
|
-
const
|
|
267
|
+
const lastPos = phraseWords[phraseWords.length - 1].position;
|
|
268
|
+
const gap = match.position - lastPos - 1;
|
|
230
269
|
if (gap > config.maxGap) {
|
|
231
270
|
break;
|
|
232
271
|
}
|
|
233
|
-
|
|
272
|
+
const neededCount = queryTokenCounts.get(match.queryToken) || 0;
|
|
273
|
+
const currentCount = matchedCounts.get(match.queryToken) || 0;
|
|
274
|
+
if (currentCount < neededCount) {
|
|
275
|
+
for (let pos = lastPos + 1; pos < match.position; pos++) {
|
|
276
|
+
totalGapUsed++;
|
|
277
|
+
gapWords.push({
|
|
278
|
+
word: documentTokens[pos],
|
|
279
|
+
position: pos,
|
|
280
|
+
gapIndex: totalGapUsed
|
|
281
|
+
});
|
|
282
|
+
}
|
|
234
283
|
phraseWords.push(match);
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
284
|
+
matchedCounts.set(match.queryToken, currentCount + 1);
|
|
285
|
+
totalMatchedTokens++;
|
|
286
|
+
if (totalMatchedTokens === queryTokens.length) {
|
|
287
|
+
break;
|
|
288
|
+
}
|
|
239
289
|
}
|
|
240
290
|
}
|
|
241
291
|
if (phraseWords.length > 0) {
|
|
292
|
+
const coverage = phraseWords.length / queryTokens.length;
|
|
293
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
242
294
|
const { score, breakdown } = calculatePhraseScore(
|
|
243
295
|
phraseWords,
|
|
244
296
|
queryTokens,
|
|
245
297
|
config,
|
|
246
298
|
documentFrequency,
|
|
247
299
|
totalDocuments,
|
|
248
|
-
allWordMatches
|
|
300
|
+
allWordMatches,
|
|
301
|
+
coverage
|
|
249
302
|
);
|
|
250
303
|
return {
|
|
251
304
|
words: phraseWords,
|
|
305
|
+
gapWords,
|
|
306
|
+
gapUsed: totalGapUsed,
|
|
307
|
+
coverage,
|
|
252
308
|
startPosition: phraseWords[0].position,
|
|
253
309
|
endPosition: phraseWords[phraseWords.length - 1].position,
|
|
254
|
-
|
|
310
|
+
span,
|
|
255
311
|
inOrder: isInOrder(phraseWords, queryTokens),
|
|
256
312
|
score,
|
|
257
313
|
scoreBreakdown: breakdown
|
|
@@ -259,7 +315,7 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
|
|
|
259
315
|
}
|
|
260
316
|
return null;
|
|
261
317
|
}
|
|
262
|
-
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
|
|
318
|
+
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
|
|
263
319
|
let baseScore = 0;
|
|
264
320
|
for (const word of phraseWords) {
|
|
265
321
|
const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
|
|
@@ -268,14 +324,16 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
268
324
|
baseScore /= phraseWords.length;
|
|
269
325
|
const inOrder = isInOrder(phraseWords, queryTokens);
|
|
270
326
|
const orderScore = inOrder ? 1 : 0.5;
|
|
271
|
-
|
|
272
|
-
|
|
327
|
+
let proximityScore = 0;
|
|
328
|
+
if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
|
|
329
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
330
|
+
const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
|
|
331
|
+
proximityScore = Math.max(0, 1 - span / proximityWindow);
|
|
332
|
+
}
|
|
273
333
|
let densityScore = 0;
|
|
274
334
|
if (queryTokens.length === 1) {
|
|
275
335
|
const totalOccurrences = allWordMatches.length;
|
|
276
|
-
densityScore = totalOccurrences /
|
|
277
|
-
} else {
|
|
278
|
-
densityScore = phraseWords.length / queryTokens.length;
|
|
336
|
+
densityScore = Math.min(1, totalOccurrences / 10);
|
|
279
337
|
}
|
|
280
338
|
const semanticScore = calculateSemanticScore(
|
|
281
339
|
phraseWords,
|
|
@@ -289,8 +347,13 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
289
347
|
const weightedDensity = densityScore * weights.density;
|
|
290
348
|
const weightedSemantic = semanticScore * weights.semantic;
|
|
291
349
|
const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
|
|
292
|
-
const
|
|
293
|
-
const
|
|
350
|
+
const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
|
|
351
|
+
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
|
|
352
|
+
const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
|
|
353
|
+
const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
|
|
354
|
+
const normalizedScore = totalScore / maxPossibleScore;
|
|
355
|
+
const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
|
|
356
|
+
const score = normalizedScore * coverageMultiplier;
|
|
294
357
|
const base = weightedBase / maxPossibleScore;
|
|
295
358
|
const order = weightedOrder / maxPossibleScore;
|
|
296
359
|
const proximity = weightedProximity / maxPossibleScore;
|
|
@@ -303,18 +366,27 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
303
366
|
order,
|
|
304
367
|
proximity,
|
|
305
368
|
density,
|
|
306
|
-
semantic
|
|
369
|
+
semantic,
|
|
370
|
+
coverage: coverageMultiplier
|
|
371
|
+
// Show coverage multiplier in breakdown
|
|
307
372
|
}
|
|
308
373
|
};
|
|
309
374
|
}
|
|
310
375
|
function isInOrder(phraseWords, queryTokens) {
|
|
311
|
-
const
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
376
|
+
const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
|
|
377
|
+
let lastMatchedIndex = -1;
|
|
378
|
+
for (const phraseWord of phraseWords) {
|
|
379
|
+
let foundIndex = -1;
|
|
380
|
+
for (const pos of tokenPositions) {
|
|
381
|
+
if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
|
|
382
|
+
foundIndex = pos.index;
|
|
383
|
+
break;
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
if (foundIndex === -1) {
|
|
316
387
|
return false;
|
|
317
388
|
}
|
|
389
|
+
lastMatchedIndex = foundIndex;
|
|
318
390
|
}
|
|
319
391
|
return true;
|
|
320
392
|
}
|
|
@@ -357,7 +429,8 @@ function deduplicatePhrases(phrases) {
|
|
|
357
429
|
|
|
358
430
|
// src/index.ts
|
|
359
431
|
var DEFAULT_CONFIG = {
|
|
360
|
-
textProperty: "
|
|
432
|
+
textProperty: "normalized_content",
|
|
433
|
+
// Must match server's field name
|
|
361
434
|
tolerance: 1,
|
|
362
435
|
adaptiveTolerance: true,
|
|
363
436
|
enableSynonyms: false,
|
|
@@ -372,7 +445,10 @@ var DEFAULT_CONFIG = {
|
|
|
372
445
|
semantic: 0.15
|
|
373
446
|
},
|
|
374
447
|
maxGap: 5,
|
|
375
|
-
minScore: 0.1
|
|
448
|
+
minScore: 0.1,
|
|
449
|
+
enableFinalScoreMinimum: false,
|
|
450
|
+
finalScoreMinimum: 0.3,
|
|
451
|
+
proximitySpanMultiplier: 5
|
|
376
452
|
};
|
|
377
453
|
var pluginStates = /* @__PURE__ */ new WeakMap();
|
|
378
454
|
function pluginFuzzyPhrase(userConfig = {}) {
|
|
@@ -392,7 +468,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
392
468
|
semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
|
|
393
469
|
},
|
|
394
470
|
maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
|
|
395
|
-
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
|
|
471
|
+
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
|
|
472
|
+
enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
|
|
473
|
+
finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
|
|
474
|
+
proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
|
|
396
475
|
};
|
|
397
476
|
const plugin = {
|
|
398
477
|
name: "fuzzy-phrase",
|
|
@@ -405,7 +484,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
405
484
|
synonymMap: {},
|
|
406
485
|
config,
|
|
407
486
|
documentFrequency: /* @__PURE__ */ new Map(),
|
|
408
|
-
totalDocuments: 0
|
|
487
|
+
totalDocuments: 0,
|
|
488
|
+
vocabulary: /* @__PURE__ */ new Set()
|
|
409
489
|
};
|
|
410
490
|
if (config.enableSynonyms && config.supabase) {
|
|
411
491
|
try {
|
|
@@ -422,6 +502,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
422
502
|
state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
|
|
423
503
|
console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
|
|
424
504
|
}
|
|
505
|
+
try {
|
|
506
|
+
const indexData = orama.data?.index;
|
|
507
|
+
let radixNode = null;
|
|
508
|
+
if (indexData?.indexes?.[config.textProperty]?.node) {
|
|
509
|
+
radixNode = indexData.indexes[config.textProperty].node;
|
|
510
|
+
} else if (indexData?.[config.textProperty]?.node) {
|
|
511
|
+
radixNode = indexData[config.textProperty].node;
|
|
512
|
+
}
|
|
513
|
+
if (radixNode) {
|
|
514
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
515
|
+
console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
|
|
516
|
+
} else {
|
|
517
|
+
console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
|
|
518
|
+
}
|
|
519
|
+
} catch (error) {
|
|
520
|
+
console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
|
|
521
|
+
}
|
|
425
522
|
pluginStates.set(orama, state);
|
|
426
523
|
console.log("\u2705 Fuzzy Phrase Plugin initialized");
|
|
427
524
|
setImmediate(() => {
|
|
@@ -443,7 +540,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
443
540
|
console.error("\u274C Plugin state not initialized");
|
|
444
541
|
throw new Error("Fuzzy Phrase Plugin not properly initialized");
|
|
445
542
|
}
|
|
446
|
-
const { term, properties } = params;
|
|
543
|
+
const { term, properties, tokenCache } = params;
|
|
447
544
|
if (!term || typeof term !== "string") {
|
|
448
545
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
449
546
|
}
|
|
@@ -454,32 +551,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
454
551
|
}
|
|
455
552
|
const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
|
|
456
553
|
console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
|
|
457
|
-
let vocabulary;
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
554
|
+
let vocabulary = state.vocabulary;
|
|
555
|
+
if (vocabulary.size === 0) {
|
|
556
|
+
console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
|
|
557
|
+
try {
|
|
558
|
+
const indexData = orama.data?.index;
|
|
559
|
+
let radixNode = null;
|
|
560
|
+
if (indexData?.indexes?.[textProperty]?.node) {
|
|
561
|
+
radixNode = indexData.indexes[textProperty].node;
|
|
562
|
+
} else if (indexData?.[textProperty]?.node) {
|
|
563
|
+
radixNode = indexData[textProperty].node;
|
|
564
|
+
}
|
|
565
|
+
if (radixNode) {
|
|
566
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
567
|
+
vocabulary = state.vocabulary;
|
|
568
|
+
console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
|
|
569
|
+
} else {
|
|
570
|
+
console.error("\u274C Radix tree not found for vocabulary extraction");
|
|
571
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
572
|
+
}
|
|
573
|
+
} catch (error) {
|
|
574
|
+
console.error("\u274C Failed to extract vocabulary:", error);
|
|
476
575
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
477
576
|
}
|
|
478
|
-
|
|
479
|
-
console.log(`\u{1F4DA}
|
|
480
|
-
} catch (error) {
|
|
481
|
-
console.error("\u274C Failed to extract vocabulary:", error);
|
|
482
|
-
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
577
|
+
} else {
|
|
578
|
+
console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
|
|
483
579
|
}
|
|
484
580
|
const candidatesMap = findAllCandidates(
|
|
485
581
|
queryTokens,
|
|
@@ -488,10 +584,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
488
584
|
state.config.enableSynonyms ? state.synonymMap : void 0,
|
|
489
585
|
state.config.synonymMatchScore
|
|
490
586
|
);
|
|
491
|
-
const filteredCandidates = filterCandidatesByScore(
|
|
492
|
-
candidatesMap,
|
|
493
|
-
state.config.minScore
|
|
494
|
-
);
|
|
587
|
+
const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
|
|
495
588
|
console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
|
|
496
589
|
const documentMatches = [];
|
|
497
590
|
console.log("\u{1F50D} DEBUG orama.data structure:", {
|
|
@@ -518,22 +611,48 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
518
611
|
dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
|
|
519
612
|
});
|
|
520
613
|
}
|
|
521
|
-
|
|
614
|
+
const cacheHits = tokenCache ? tokenCache.size : 0;
|
|
615
|
+
let hasPositionalIndex = false;
|
|
616
|
+
if (tokenCache && tokenCache.size > 0) {
|
|
617
|
+
const firstEntry = tokenCache.values().next().value;
|
|
618
|
+
hasPositionalIndex = !!(firstEntry && !Array.isArray(firstEntry) && firstEntry.positions);
|
|
619
|
+
}
|
|
620
|
+
console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${hasPositionalIndex ? "\u26A1 positional index" : cacheHits > 0 ? "tokens cached" : "no cache"})`);
|
|
522
621
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
523
622
|
const text = doc[textProperty];
|
|
524
623
|
if (!text || typeof text !== "string") {
|
|
525
624
|
continue;
|
|
526
625
|
}
|
|
527
|
-
|
|
626
|
+
let docTokens;
|
|
627
|
+
let docPositions;
|
|
628
|
+
if (tokenCache && tokenCache.has(docId)) {
|
|
629
|
+
const cached = tokenCache.get(docId);
|
|
630
|
+
if (Array.isArray(cached)) {
|
|
631
|
+
docTokens = cached;
|
|
632
|
+
} else if (cached.tokens && cached.positions) {
|
|
633
|
+
docTokens = cached.tokens;
|
|
634
|
+
docPositions = cached.positions;
|
|
635
|
+
} else {
|
|
636
|
+
docTokens = tokenize(text);
|
|
637
|
+
}
|
|
638
|
+
} else {
|
|
639
|
+
docTokens = tokenize(text);
|
|
640
|
+
}
|
|
528
641
|
const phrases = findPhrasesInDocument(
|
|
529
642
|
docTokens,
|
|
530
643
|
filteredCandidates,
|
|
531
644
|
{
|
|
532
645
|
weights: state.config.weights,
|
|
533
|
-
maxGap: state.config.maxGap
|
|
646
|
+
maxGap: state.config.maxGap,
|
|
647
|
+
proximitySpanMultiplier: state.config.proximitySpanMultiplier,
|
|
648
|
+
tolerance
|
|
534
649
|
},
|
|
535
650
|
state.documentFrequency,
|
|
536
|
-
state.totalDocuments
|
|
651
|
+
state.totalDocuments,
|
|
652
|
+
queryTokens,
|
|
653
|
+
// Original tokens with duplicates preserved
|
|
654
|
+
docPositions
|
|
655
|
+
// Positional index for O(matches) lookup
|
|
537
656
|
);
|
|
538
657
|
if (phrases.length > 0) {
|
|
539
658
|
const docScore = Math.max(...phrases.map((p) => p.score));
|
|
@@ -546,8 +665,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
546
665
|
}
|
|
547
666
|
}
|
|
548
667
|
documentMatches.sort((a, b) => b.score - a.score);
|
|
549
|
-
|
|
550
|
-
|
|
668
|
+
let filteredMatches = documentMatches;
|
|
669
|
+
if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
|
|
670
|
+
const threshold = state.config.finalScoreMinimum;
|
|
671
|
+
const beforeCount = filteredMatches.length;
|
|
672
|
+
filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
|
|
673
|
+
console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
|
|
674
|
+
}
|
|
675
|
+
const limit = params.limit ?? filteredMatches.length;
|
|
676
|
+
const limitedMatches = filteredMatches.slice(0, limit);
|
|
551
677
|
const hits = limitedMatches.map((match) => ({
|
|
552
678
|
id: match.id,
|
|
553
679
|
score: match.score,
|