@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.2 → 3.1.16-custom.newbase.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +208 -82
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +42 -2
- package/dist/index.d.ts +42 -2
- package/dist/index.js +208 -82
- package/dist/index.js.map +1 -1
- package/package.json +62 -62
package/dist/index.js
CHANGED
@@ -48,9 +48,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
   if (word === queryToken) {
     return { matches: true, distance: 0, score: 1 };
   }
-  if (word.startsWith(queryToken)) {
-    return { matches: true, distance: 0, score: 0.95 };
-  }
   const result = boundedLevenshtein(word, queryToken, tolerance);
   if (result.isBounded) {
     const score = 1 - result.distance * 0.2;
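
Note: the deleted branch was the prefix shortcut, so a document word that merely starts with the query token no longer gets a flat 0.95; it has to pass boundedLevenshtein within the tolerance. A rough TypeScript illustration of the behavioral difference, assuming (as the surrounding code suggests) that isBounded means the edit distance stayed within tolerance:

    // Hypothetical probe, not the package's API.
    const tolerance = 1;
    // "searching" starts with "search", so the old shortcut returned score 0.95.
    // Its edit distance from "search" is 3 trailing insertions, which exceeds tolerance 1,
    // so after this change the pair should no longer match.
    const editDistance = "searching".length - "search".length; // 3
    const stillMatches = editDistance <= tolerance;            // false
    console.log({ stillMatches });
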
@@ -180,23 +177,53 @@ function filterCandidatesByScore(candidatesMap, minScore) {
 }
 
 // src/scoring.ts
-function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
+function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, docPositions) {
   const phrases = [];
-  const queryTokens =
+  const queryTokens = originalQueryTokens;
   const wordMatches = [];
-
-  const
+  if (docPositions) {
+    const candidateLookup = /* @__PURE__ */ new Map();
     for (const [queryToken, candidates] of candidatesMap.entries()) {
       for (const candidate of candidates) {
-        if (candidate.word
-
-
-
-
-
-
-
+        if (!candidateLookup.has(candidate.word)) {
+          candidateLookup.set(candidate.word, []);
+        }
+        candidateLookup.get(candidate.word).push({ queryToken, candidate });
+      }
+    }
+    for (const [docWord, positions] of Object.entries(docPositions)) {
+      const matches = candidateLookup.get(docWord);
+      if (matches) {
+        for (const { queryToken, candidate } of matches) {
+          for (const position of positions) {
+            wordMatches.push({
+              word: docWord,
+              queryToken,
+              position,
+              type: candidate.type,
+              distance: candidate.distance,
+              score: candidate.score
+            });
+          }
+        }
+      }
+    }
+    wordMatches.sort((a, b) => a.position - b.position);
+  } else {
+    for (let i = 0; i < documentTokens.length; i++) {
+      const docWord = documentTokens[i];
+      for (const [queryToken, candidates] of candidatesMap.entries()) {
+        for (const candidate of candidates) {
+          if (candidate.word === docWord) {
+            wordMatches.push({
+              word: docWord,
+              queryToken,
+              position: i,
+              type: candidate.type,
+              distance: candidate.distance,
+              score: candidate.score
+            });
+          }
         }
       }
     }
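
Note: the new docPositions fast path iterates Object.entries(docPositions) and then each listed position, so the expected shape is a plain map from document token to the positions where it occurs. A minimal TypeScript sketch of that shape (token values are made up):

    // Illustrative only: the shape consumed by the docPositions branch above.
    const docPositions: Record<string, number[]> = {
      quick: [1],
      fox: [3, 17] // "fox" occurs at document positions 3 and 17
    };
    // Each candidate that matches a key yields one wordMatch per listed position,
    // so the work is proportional to the number of matches rather than document length.
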
@@ -209,47 +236,76 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
       config,
       documentFrequency,
       totalDocuments,
-      wordMatches
-
+      wordMatches,
+      documentTokens
+      // Pass document tokens to extract gap words
     );
     if (phrase && phrase.words.length > 0) {
       phrases.push(phrase);
     }
   }
-
+  const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
+  const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
+  return deduplicatePhrases(filteredPhrases);
 }
-function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
   const startMatch = wordMatches[startIndex];
   const phraseWords = [startMatch];
-  const
+  const queryTokenCounts = /* @__PURE__ */ new Map();
+  for (const token of queryTokens) {
+    queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
+  }
+  const matchedCounts = /* @__PURE__ */ new Map();
+  matchedCounts.set(startMatch.queryToken, 1);
+  const gapWords = [];
+  let totalGapUsed = 0;
+  let totalMatchedTokens = 1;
   for (let i = startIndex + 1; i < wordMatches.length; i++) {
     const match = wordMatches[i];
-    const
+    const lastPos = phraseWords[phraseWords.length - 1].position;
+    const gap = match.position - lastPos - 1;
     if (gap > config.maxGap) {
       break;
     }
-
+    const neededCount = queryTokenCounts.get(match.queryToken) || 0;
+    const currentCount = matchedCounts.get(match.queryToken) || 0;
+    if (currentCount < neededCount) {
+      for (let pos = lastPos + 1; pos < match.position; pos++) {
+        totalGapUsed++;
+        gapWords.push({
+          word: documentTokens[pos],
+          position: pos,
+          gapIndex: totalGapUsed
+        });
+      }
       phraseWords.push(match);
-
-
-
-
+      matchedCounts.set(match.queryToken, currentCount + 1);
+      totalMatchedTokens++;
+      if (totalMatchedTokens === queryTokens.length) {
+        break;
+      }
     }
   }
   if (phraseWords.length > 0) {
+    const coverage = phraseWords.length / queryTokens.length;
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
     const { score, breakdown } = calculatePhraseScore(
       phraseWords,
       queryTokens,
       config,
       documentFrequency,
       totalDocuments,
-      allWordMatches
+      allWordMatches,
+      coverage
     );
     return {
       words: phraseWords,
+      gapWords,
+      gapUsed: totalGapUsed,
+      coverage,
       startPosition: phraseWords[0].position,
       endPosition: phraseWords[phraseWords.length - 1].position,
-
+      span,
       inOrder: isInOrder(phraseWords, queryTokens),
       score,
       scoreBreakdown: breakdown
@@ -257,7 +313,7 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
   }
   return null;
 }
-function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
+function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
   let baseScore = 0;
   for (const word of phraseWords) {
     const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
@@ -266,14 +322,16 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   baseScore /= phraseWords.length;
   const inOrder = isInOrder(phraseWords, queryTokens);
   const orderScore = inOrder ? 1 : 0.5;
-
-
+  let proximityScore = 0;
+  if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
+    const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
+    const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
+    proximityScore = Math.max(0, 1 - span / proximityWindow);
+  }
   let densityScore = 0;
   if (queryTokens.length === 1) {
     const totalOccurrences = allWordMatches.length;
-    densityScore = totalOccurrences /
-  } else {
-    densityScore = phraseWords.length / queryTokens.length;
+    densityScore = Math.min(1, totalOccurrences / 10);
   }
   const semanticScore = calculateSemanticScore(
     phraseWords,
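
Note: the new proximity term rewards phrases whose matches sit close together. A worked TypeScript example with the default proximitySpanMultiplier of 5 introduced later in this diff (the match positions are hypothetical):

    // A 3-token query whose matches land at document positions 10, 12 and 15.
    const queryTokenCount = 3;
    const proximitySpanMultiplier = 5; // default added later in this diff
    const span = 15 - 10 + 1;          // 6
    const proximityWindow = queryTokenCount * proximitySpanMultiplier; // 15
    const proximityScore = Math.max(0, 1 - span / proximityWindow);    // 1 - 6/15 = 0.6
    console.log(proximityScore);
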
@@ -287,8 +345,13 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
   const weightedDensity = densityScore * weights.density;
   const weightedSemantic = semanticScore * weights.semantic;
   const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
-  const
-  const
+  const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
+  const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
+  const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
+  const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
+  const normalizedScore = totalScore / maxPossibleScore;
+  const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
+  const score = normalizedScore * coverageMultiplier;
   const base = weightedBase / maxPossibleScore;
   const order = weightedOrder / maxPossibleScore;
   const proximity = weightedProximity / maxPossibleScore;
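
Note: the score is now normalized against the best score the active configuration could produce, then scaled by coverage for multi-token queries. A TypeScript sketch for a multi-token query with maxGap > 0, using hypothetical weights (only semantic: 0.15 is visible in this diff):

    // Hypothetical weights and totals, for arithmetic only.
    const weights = { exact: 1, fuzzy: 0.8, order: 0.2, proximity: 0.2, density: 0.1, semantic: 0.15 };
    const totalScore = 0.9;  // weighted sum of the five components (made up)
    const coverage = 2 / 3;  // the phrase matched 2 of 3 query tokens
    const maxPossibleScore =
      Math.max(weights.exact, weights.fuzzy) + weights.order + weights.proximity + weights.density + weights.semantic; // 1.65
    const score = (totalScore / maxPossibleScore) * coverage; // ~0.364
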
@@ -301,18 +364,27 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
       order,
       proximity,
       density,
-      semantic
+      semantic,
+      coverage: coverageMultiplier
+      // Show coverage multiplier in breakdown
     }
   };
 }
 function isInOrder(phraseWords, queryTokens) {
-  const
-
-
-
-
+  const tokenPositions = queryTokens.map((token, index) => ({ token, index }));
+  let lastMatchedIndex = -1;
+  for (const phraseWord of phraseWords) {
+    let foundIndex = -1;
+    for (const pos of tokenPositions) {
+      if (pos.token === phraseWord.queryToken && pos.index > lastMatchedIndex) {
+        foundIndex = pos.index;
+        break;
+      }
+    }
+    if (foundIndex === -1) {
       return false;
     }
+    lastMatchedIndex = foundIndex;
   }
   return true;
 }
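
Note: isInOrder now consumes query-token indices left to right, i.e. the matched words must form a subsequence of the query, which also handles repeated query tokens. A small TypeScript restatement of the check (not the package's export):

    function matchesQueryOrder(matchedTokens: string[], queryTokens: string[]): boolean {
      let last = -1;
      for (const token of matchedTokens) {
        // Find the next unused query position for this token.
        const idx = queryTokens.findIndex((q, i) => q === token && i > last);
        if (idx === -1) return false;
        last = idx;
      }
      return true;
    }
    console.log(matchesQueryOrder(["big", "wolf"], ["big", "bad", "wolf"])); // true
    console.log(matchesQueryOrder(["wolf", "big"], ["big", "bad", "wolf"])); // false
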
@@ -355,7 +427,8 @@ function deduplicatePhrases(phrases) {
 
 // src/index.ts
 var DEFAULT_CONFIG = {
-  textProperty: "
+  textProperty: "normalized_content",
+  // Must match server's field name
   tolerance: 1,
   adaptiveTolerance: true,
   enableSynonyms: false,
@@ -370,7 +443,10 @@ var DEFAULT_CONFIG = {
     semantic: 0.15
   },
   maxGap: 5,
-  minScore: 0.1
+  minScore: 0.1,
+  enableFinalScoreMinimum: false,
+  finalScoreMinimum: 0.3,
+  proximitySpanMultiplier: 5
 };
 var pluginStates = /* @__PURE__ */ new WeakMap();
 function pluginFuzzyPhrase(userConfig = {}) {
@@ -390,7 +466,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
       semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
     },
     maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
-    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
+    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
+    enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
+    finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
+    proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
   };
   const plugin = {
     name: "fuzzy-phrase",
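
Note: the three new options are merged exactly like the existing ones, so callers can opt in per plugin instance. A usage sketch in TypeScript, assuming the package exports the pluginFuzzyPhrase function shown above; the values are illustrative:

    import { pluginFuzzyPhrase } from "@wcs-colab/plugin-fuzzy-phrase"; // assumed export name

    const fuzzyPhrase = pluginFuzzyPhrase({
      textProperty: "normalized_content",
      maxGap: 5,
      minScore: 0.1,
      enableFinalScoreMinimum: true, // drop documents whose best phrase scores below the threshold
      finalScoreMinimum: 0.3,
      proximitySpanMultiplier: 5     // widens or narrows the proximity scoring window
    });
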
@@ -403,7 +482,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
         synonymMap: {},
         config,
         documentFrequency: /* @__PURE__ */ new Map(),
-        totalDocuments: 0
+        totalDocuments: 0,
+        vocabulary: /* @__PURE__ */ new Set()
       };
       if (config.enableSynonyms && config.supabase) {
         try {
@@ -420,6 +500,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
        state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
        console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
      }
+      try {
+        const indexData = orama.data?.index;
+        let radixNode = null;
+        if (indexData?.indexes?.[config.textProperty]?.node) {
+          radixNode = indexData.indexes[config.textProperty].node;
+        } else if (indexData?.[config.textProperty]?.node) {
+          radixNode = indexData[config.textProperty].node;
+        }
+        if (radixNode) {
+          state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+          console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
+        } else {
+          console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
+        }
+      } catch (error) {
+        console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
+      }
      pluginStates.set(orama, state);
      console.log("\u2705 Fuzzy Phrase Plugin initialized");
      setImmediate(() => {
@@ -441,7 +538,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     console.error("\u274C Plugin state not initialized");
     throw new Error("Fuzzy Phrase Plugin not properly initialized");
   }
-  const { term, properties } = params;
+  const { term, properties, tokenCache } = params;
   if (!term || typeof term !== "string") {
     return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
   }
@@ -452,32 +549,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
   }
   const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
   console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
-  let vocabulary;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  let vocabulary = state.vocabulary;
+  if (vocabulary.size === 0) {
+    console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
+    try {
+      const indexData = orama.data?.index;
+      let radixNode = null;
+      if (indexData?.indexes?.[textProperty]?.node) {
+        radixNode = indexData.indexes[textProperty].node;
+      } else if (indexData?.[textProperty]?.node) {
+        radixNode = indexData[textProperty].node;
+      }
+      if (radixNode) {
+        state.vocabulary = extractVocabularyFromRadixTree(radixNode);
+        vocabulary = state.vocabulary;
+        console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
+      } else {
+        console.error("\u274C Radix tree not found for vocabulary extraction");
+        return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+      }
+    } catch (error) {
+      console.error("\u274C Failed to extract vocabulary:", error);
      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
    }
-
-  console.log(`\u{1F4DA}
-  } catch (error) {
-    console.error("\u274C Failed to extract vocabulary:", error);
-    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
+  } else {
+    console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
   }
   const candidatesMap = findAllCandidates(
     queryTokens,
@@ -486,10 +582,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
     state.config.enableSynonyms ? state.synonymMap : void 0,
     state.config.synonymMatchScore
   );
-  const filteredCandidates = filterCandidatesByScore(
-    candidatesMap,
-    state.config.minScore
-  );
+  const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
   console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
   const documentMatches = [];
   console.log("\u{1F50D} DEBUG orama.data structure:", {
@@ -516,22 +609,48 @@ async function searchWithFuzzyPhrase(orama, params, language) {
      dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
    });
  }
-
+  const cacheHits = tokenCache ? tokenCache.size : 0;
+  let hasPositionalIndex = false;
+  if (tokenCache && tokenCache.size > 0) {
+    const firstEntry = tokenCache.values().next().value;
+    hasPositionalIndex = !!(firstEntry && !Array.isArray(firstEntry) && firstEntry.positions);
+  }
+  console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents (${hasPositionalIndex ? "\u26A1 positional index" : cacheHits > 0 ? "tokens cached" : "no cache"})`);
  for (const [docId, doc] of Object.entries(docs)) {
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
-
+    let docTokens;
+    let docPositions;
+    if (tokenCache && tokenCache.has(docId)) {
+      const cached = tokenCache.get(docId);
+      if (Array.isArray(cached)) {
+        docTokens = cached;
+      } else if (cached.tokens && cached.positions) {
+        docTokens = cached.tokens;
+        docPositions = cached.positions;
+      } else {
+        docTokens = tokenize(text);
+      }
+    } else {
+      docTokens = tokenize(text);
+    }
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredCandidates,
      {
        weights: state.config.weights,
-        maxGap: state.config.maxGap
+        maxGap: state.config.maxGap,
+        proximitySpanMultiplier: state.config.proximitySpanMultiplier,
+        tolerance
      },
      state.documentFrequency,
-      state.totalDocuments
+      state.totalDocuments,
+      queryTokens,
+      // Original tokens with duplicates preserved
+      docPositions
+      // Positional index for O(matches) lookup
    );
    if (phrases.length > 0) {
      const docScore = Math.max(...phrases.map((p) => p.score));
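
Note: the per-document lookup above accepts two tokenCache entry shapes, so existing callers keep working while new callers can supply a positional index. A TypeScript sketch of both shapes (document ids and tokens are made up):

    // Legacy shape: a plain token array.
    const legacyEntry: string[] = ["the", "quick", "brown", "fox"];

    // New shape: tokens plus a positional index, which enables the docPositions fast path.
    const positionalEntry = {
      tokens: ["the", "quick", "brown", "fox"],
      positions: { the: [0], quick: [1], brown: [2], fox: [3] } as Record<string, number[]>
    };

    // tokenCache maps a document id to either shape and arrives via the search params.
    const tokenCache = new Map<string, string[] | typeof positionalEntry>([
      ["doc-1", positionalEntry],
      ["doc-2", legacyEntry]
    ]);
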
@@ -544,8 +663,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
    }
  }
  documentMatches.sort((a, b) => b.score - a.score);
-
-
+  let filteredMatches = documentMatches;
+  if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
+    const threshold = state.config.finalScoreMinimum;
+    const beforeCount = filteredMatches.length;
+    filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
+    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
+  }
+  const limit = params.limit ?? filteredMatches.length;
+  const limitedMatches = filteredMatches.slice(0, limit);
  const hits = limitedMatches.map((match) => ({
    id: match.id,
    score: match.score,