@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +441 -86
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +116 -3
- package/dist/index.d.ts +116 -3
- package/dist/index.js +437 -87
- package/dist/index.js.map +1 -1
- package/package.json +5 -3
package/dist/index.cjs
CHANGED
|
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
|
|
|
50
50
|
if (word === queryToken) {
|
|
51
51
|
return { matches: true, distance: 0, score: 1 };
|
|
52
52
|
}
|
|
53
|
-
if (word.startsWith(queryToken)) {
|
|
54
|
-
return { matches: true, distance: 0, score: 0.95 };
|
|
55
|
-
}
|
|
56
53
|
const result = boundedLevenshtein(word, queryToken, tolerance);
|
|
57
54
|
if (result.isBounded) {
|
|
58
55
|
const score = 1 - result.distance * 0.2;
|
|
@@ -182,24 +179,51 @@ function filterCandidatesByScore(candidatesMap, minScore) {
|
|
|
182
179
|
}
|
|
183
180
|
|
|
184
181
|
// src/scoring.ts
|
|
185
|
-
function
|
|
182
|
+
/**
 * Inverts the per-query-token candidate map into a lookup keyed by candidate word.
 *
 * @param {Map<string, Array<{word: string, type: string, score: number}>>} candidatesMap
 *   Candidates grouped by the query token that produced them.
 * @returns {Map<string, Array<{queryToken: string, candidate: object}>>}
 *   For every candidate word, the (queryToken, candidate) pairs that reference it,
 *   ordered with "exact" candidates first and then by descending candidate score.
 */
function buildCandidateLookup(candidatesMap) {
  const byWord = /* @__PURE__ */ new Map();
  for (const [queryToken, candidates] of candidatesMap.entries()) {
    for (const candidate of candidates) {
      let bucket = byWord.get(candidate.word);
      if (bucket === undefined) {
        bucket = [];
        byWord.set(candidate.word, bucket);
      }
      bucket.push({ queryToken, candidate });
    }
  }
  // 0 for exact candidates, 1 for everything else, so exact matches sort to the front.
  const rank = (entry) => (entry.candidate.type === "exact" ? 0 : 1);
  for (const bucket of byWord.values()) {
    bucket.sort((left, right) => {
      const rankDelta = rank(left) - rank(right);
      if (rankDelta !== 0) {
        return rankDelta;
      }
      return right.candidate.score - left.candidate.score;
    });
  }
  return byWord;
}
|
|
203
|
+
/**
 * Counts how many times each token occurs in the query.
 *
 * @param {Iterable<string>} queryTokens Query tokens, duplicates allowed.
 * @returns {Map<string, number>} Token -> number of occurrences, in first-seen order.
 */
function buildQueryTokenCounts(queryTokens) {
  const tally = /* @__PURE__ */ new Map();
  for (const token of queryTokens) {
    const seenSoFar = tally.has(token) ? tally.get(token) : 0;
    tally.set(token, seenSoFar + 1);
  }
  return tally;
}
|
|
210
|
+
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, candidateLookup, queryTokenCounts) {
|
|
186
211
|
const phrases = [];
|
|
187
|
-
const queryTokens =
|
|
212
|
+
const queryTokens = originalQueryTokens;
|
|
188
213
|
const wordMatches = [];
|
|
189
214
|
for (let i = 0; i < documentTokens.length; i++) {
|
|
190
215
|
const docWord = documentTokens[i];
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
}
|
|
216
|
+
const matches = candidateLookup.get(docWord);
|
|
217
|
+
if (matches) {
|
|
218
|
+
for (const { queryToken, candidate } of matches) {
|
|
219
|
+
wordMatches.push({
|
|
220
|
+
word: docWord,
|
|
221
|
+
queryToken,
|
|
222
|
+
position: i,
|
|
223
|
+
type: candidate.type,
|
|
224
|
+
distance: candidate.distance,
|
|
225
|
+
score: candidate.score
|
|
226
|
+
});
|
|
203
227
|
}
|
|
204
228
|
}
|
|
205
229
|
}
|
|
@@ -212,42 +236,56 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
212
236
|
documentFrequency,
|
|
213
237
|
totalDocuments,
|
|
214
238
|
wordMatches,
|
|
215
|
-
documentTokens
|
|
216
|
-
|
|
239
|
+
documentTokens,
|
|
240
|
+
queryTokenCounts
|
|
241
|
+
// OPTIMIZATION B: Pass pre-built queryTokenCounts
|
|
217
242
|
);
|
|
218
243
|
if (phrase && phrase.words.length > 0) {
|
|
219
244
|
phrases.push(phrase);
|
|
220
245
|
}
|
|
221
246
|
}
|
|
222
|
-
|
|
247
|
+
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
|
|
248
|
+
const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
|
|
249
|
+
return deduplicatePhrases(filteredPhrases);
|
|
223
250
|
}
|
|
224
|
-
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
|
|
251
|
+
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens, queryTokenCounts) {
|
|
225
252
|
const startMatch = wordMatches[startIndex];
|
|
226
253
|
const phraseWords = [startMatch];
|
|
227
|
-
const
|
|
254
|
+
const matchedCounts = /* @__PURE__ */ new Map();
|
|
255
|
+
matchedCounts.set(startMatch.queryToken, 1);
|
|
228
256
|
const gapWords = [];
|
|
229
257
|
let totalGapUsed = 0;
|
|
258
|
+
let totalMatchedTokens = 1;
|
|
230
259
|
for (let i = startIndex + 1; i < wordMatches.length; i++) {
|
|
231
260
|
const match = wordMatches[i];
|
|
232
261
|
const lastPos = phraseWords[phraseWords.length - 1].position;
|
|
262
|
+
if (match.position <= lastPos) {
|
|
263
|
+
continue;
|
|
264
|
+
}
|
|
233
265
|
const gap = match.position - lastPos - 1;
|
|
234
266
|
if (gap > config.maxGap) {
|
|
235
267
|
break;
|
|
236
268
|
}
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
gapWords.push({
|
|
240
|
-
word: documentTokens[pos],
|
|
241
|
-
position: pos,
|
|
242
|
-
gapIndex: totalGapUsed
|
|
243
|
-
});
|
|
269
|
+
if (totalGapUsed + gap > config.maxGap) {
|
|
270
|
+
break;
|
|
244
271
|
}
|
|
245
|
-
|
|
272
|
+
const neededCount = queryTokenCounts.get(match.queryToken) || 0;
|
|
273
|
+
const currentCount = matchedCounts.get(match.queryToken) || 0;
|
|
274
|
+
if (currentCount < neededCount) {
|
|
275
|
+
for (let pos = lastPos + 1; pos < match.position; pos++) {
|
|
276
|
+
totalGapUsed++;
|
|
277
|
+
gapWords.push({
|
|
278
|
+
word: documentTokens[pos],
|
|
279
|
+
position: pos,
|
|
280
|
+
gapIndex: totalGapUsed
|
|
281
|
+
});
|
|
282
|
+
}
|
|
246
283
|
phraseWords.push(match);
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
284
|
+
matchedCounts.set(match.queryToken, currentCount + 1);
|
|
285
|
+
totalMatchedTokens++;
|
|
286
|
+
if (totalMatchedTokens === queryTokens.length) {
|
|
287
|
+
break;
|
|
288
|
+
}
|
|
251
289
|
}
|
|
252
290
|
}
|
|
253
291
|
if (phraseWords.length > 0) {
|
|
@@ -286,9 +324,12 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
286
324
|
baseScore /= phraseWords.length;
|
|
287
325
|
const inOrder = isInOrder(phraseWords, queryTokens);
|
|
288
326
|
const orderScore = inOrder ? 1 : 0.5;
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
327
|
+
let proximityScore = 0;
|
|
328
|
+
if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
|
|
329
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
330
|
+
const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
|
|
331
|
+
proximityScore = Math.max(0, 1 - span / proximityWindow);
|
|
332
|
+
}
|
|
292
333
|
let densityScore = 0;
|
|
293
334
|
if (queryTokens.length === 1) {
|
|
294
335
|
const totalOccurrences = allWordMatches.length;
|
|
@@ -306,8 +347,10 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
306
347
|
const weightedDensity = densityScore * weights.density;
|
|
307
348
|
const weightedSemantic = semanticScore * weights.semantic;
|
|
308
349
|
const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
|
|
309
|
-
const
|
|
310
|
-
const
|
|
350
|
+
const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
|
|
351
|
+
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
|
|
352
|
+
const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
|
|
353
|
+
const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
|
|
311
354
|
const normalizedScore = totalScore / maxPossibleScore;
|
|
312
355
|
const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
|
|
313
356
|
const score = normalizedScore * coverageMultiplier;
|
|
@@ -330,13 +373,20 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
330
373
|
};
|
|
331
374
|
}
|
|
332
375
|
/**
 * Checks whether the matched phrase words follow the order of the query tokens.
 *
 * Each phrase word must map to an occurrence of its `queryToken` that lies strictly
 * after the occurrence used by the previous phrase word, so repeated query tokens
 * are consumed one occurrence at a time.
 *
 * @param {Array<{queryToken: string}>} phraseWords Matched words in document order.
 * @param {string[]} queryTokens Tokens of the query in their original order.
 * @returns {boolean} `true` when every phrase word can be placed in query order.
 */
function isInOrder(phraseWords, queryTokens) {
  // Index of the first query position that is still available for matching.
  let nextFree = 0;
  for (const { queryToken } of phraseWords) {
    const hit = queryTokens.indexOf(queryToken, nextFree);
    if (hit === -1) {
      return false;
    }
    nextFree = hit + 1;
  }
  return true;
}
|
|
@@ -377,9 +427,259 @@ function deduplicatePhrases(phrases) {
|
|
|
377
427
|
return result.sort((a, b) => b.score - a.score);
|
|
378
428
|
}
|
|
379
429
|
|
|
430
|
+
// src/optimized.ts
|
|
431
|
+
// Fallback values for the QPS pre-filtering stage; each one is used only when the
// caller's config leaves the corresponding option null/undefined.
var DEFAULT_OPTIMIZED_CONFIG = {
  // Upper bound on how many QPS candidates go on to phrase scoring.
  maxQPSCandidates: 100,
  // Candidates must score at least this fraction of the best QPS score.
  minQPSScore: 0.1,
  // false => the QPS lookup uses fuzzy matching.
  qpsExact: false,
  // Edit-distance tolerance handed to the QPS lookup when it is not exact.
  qpsTolerance: 1
};
|
|
441
|
+
/**
 * Normalizes free text for matching: lowercases, strips combining accents,
 * drops single-letter elisions (l', d', c', j', m', n', s', t') and remaining
 * apostrophes, turns punctuation into spaces and collapses whitespace.
 *
 * @param {string} text Raw input text.
 * @returns {string} Normalized, single-space-separated, trimmed text.
 */
function normalizeText(text) {
  const lowered = text.toLowerCase();
  // Decompose accented characters, then remove the combining marks.
  const unaccented = lowered.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  // An elided article/pronoun before a word becomes a separator.
  const withoutElisions = unaccented.replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ");
  // Any apostrophe variant still present is removed outright.
  const withoutApostrophes = withoutElisions.replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "");
  const straightQuotes = withoutApostrophes.replace(/[\u201c\u201d]/g, '"');
  const withoutPunctuation = straightQuotes.replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ");
  return withoutPunctuation.replace(/\s+/g, " ").trim();
}
|
|
444
|
+
/**
 * Splits text into normalized tokens.
 *
 * @param {string} text Raw input text.
 * @returns {string[]} Non-empty tokens of the normalized text, in order.
 */
function tokenize(text) {
  const pieces = normalizeText(text).split(/\s+/);
  // Splitting an empty normalized string yields [""], so drop empty pieces.
  return pieces.filter((piece) => piece.length > 0);
}
|
|
447
|
+
/**
 * Collects the distinct whitespace-separated tokens found in the
 * `normalized_content` field of the given candidate documents.
 *
 * @param {Iterable<string>} candidateDocIds Ids of the documents to scan.
 * @param {Object<string, {normalized_content?: string}>} docs Document store keyed by id.
 * @returns {Set<string>} Every non-empty token seen in the candidates' content.
 */
function buildReducedVocabularyFromDocs(candidateDocIds, docs) {
  const vocab = /* @__PURE__ */ new Set();
  for (const docId of candidateDocIds) {
    const content = docs[docId]?.normalized_content;
    // Unknown ids and documents without content contribute nothing.
    if (!content) {
      continue;
    }
    for (const word of content.split(/\s+/)) {
      if (word.length > 0) {
        vocab.add(word);
      }
    }
  }
  return vocab;
}
|
|
460
|
+
/**
 * Scores documents against the query using the QPS index and returns them ranked.
 *
 * For every searched property backed by a "Radix" index, each query token is
 * looked up in the radix tree; every (matched word, document) pair adds
 * `(occurrences^2 / documentTokenCount + exactBonus) * boost` to the document's score.
 * A per-document bitmask records which query-token positions have matched.
 *
 * @param {string} term Raw search term.
 * @param {object} qpsIndex Object exposing `indexes[prop]` and `stats[prop]`.
 * @param {{tokenize: Function}} tokenizer Tokenizer called as `tokenize(term, language)`.
 * @param {string[]} properties Properties to search.
 * @param {object} config Optional `qpsExact`, `qpsTolerance`, `qpsBoostPerProp`.
 * @param {string} language Language forwarded to the tokenizer.
 * @returns {Array<[*, number]>} `[docId, score]` pairs sorted by descending score.
 */
function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
  const tokens = tokenizer.tokenize(term, language);
  if (tokens.length === 0) {
    return [];
  }
  const useExact = config.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
  const fuzzyTolerance = config.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
  const boosts = config.qpsBoostPerProp ?? {};
  // docId -> [accumulated score, bitmask of matched query-token positions]
  const perDoc = /* @__PURE__ */ new Map();
  for (const prop of properties) {
    const entry = qpsIndex.indexes[prop];
    if (!entry || entry.type !== "Radix") {
      continue;
    }
    const tree = entry.node;
    const stats = qpsIndex.stats[prop];
    if (!tree || !stats) {
      continue;
    }
    const boost = boosts[prop] ?? 1;
    for (let tokenIdx = 0; tokenIdx < tokens.length; tokenIdx++) {
      const token = tokens[tokenIdx];
      const tokenBit = 1 << tokenIdx;
      const found = tree.find({
        term: token,
        exact: useExact,
        tolerance: useExact ? 0 : fuzzyTolerance
      });
      for (const [matchedWord, docIds] of Object.entries(found)) {
        if (!Array.isArray(docIds)) {
          continue;
        }
        const exactBonus = matchedWord === token ? 1 : 0;
        for (const docId of docIds) {
          const docLength = stats.tokensLength.get(docId) || 1;
          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
          // The occurrence count is taken from the quantum's bits above the low 20.
          const occurrences = quantum ? quantum >> 20 : 1;
          const contribution = (occurrences * occurrences / docLength + exactBonus) * boost;
          const previous = perDoc.get(docId);
          if (previous === undefined) {
            perDoc.set(docId, [contribution, tokenBit]);
            continue;
          }
          const [prevScore, prevMask] = previous;
          // Worth 2 extra points when bit (tokenIdx + 1) is already set in the mask.
          const adjacencyBonus = countSetBits((prevMask >> 1) & tokenBit) * 2;
          perDoc.set(docId, [prevScore + contribution + adjacencyBonus, prevMask | tokenBit]);
        }
      }
    }
  }
  const ranked = [];
  for (const [docId, [score]] of perDoc.entries()) {
    ranked.push([docId, score]);
  }
  ranked.sort((a, b) => b[1] - a[1]);
  return ranked;
}
|
|
510
|
+
/**
 * Counts the 1-bits in the 32-bit representation of `n`.
 *
 * The value is first coerced to an unsigned 32-bit integer and shifted with the
 * zero-filling `>>>` operator. A sign-propagating `>>` would never reach zero for a
 * negative input (for example a mask containing `1 << 31`) and the loop would not
 * terminate.
 *
 * @param {number} n Integer bitmask; negative values are read as two's complement.
 * @returns {number} Number of set bits, from 0 to 32.
 */
function countSetBits(n) {
  let remaining = n >>> 0;
  let count = 0;
  while (remaining !== 0) {
    count += remaining & 1;
    remaining >>>= 1;
  }
  return count;
}
|
|
518
|
+
/**
 * Two-stage search: a QPS lookup selects candidate documents, then only those
 * candidates are phrase-scored.
 *
 * Steps, in order:
 *  1. Tokenize the term and resolve the tolerance (adaptive when configured).
 *  2. Rank documents with `searchQPS`; keep those scoring at least
 *     `bestScore * minQPSScore`, capped at `maxQPSCandidates`.
 *  3. If `pluginState.vocabulary` is empty, fill it from the radix tree of the text property.
 *  4. Build a reduced vocabulary from the candidate documents and find fuzzy candidates in it.
 *  5. Phrase-score each candidate document, optionally drop results below
 *     `finalScoreMinimum`, and apply `params.limit`.
 *
 * @param {object} orama Orama instance (`tokenizer`, `data.index`, `data.docs.docs` are read).
 * @param {object} qpsIndex QPS index passed through to `searchQPS`.
 * @param {object} pluginState Plugin state (`config`, `vocabulary`, `synonymMap`,
 *   `documentFrequency`, `totalDocuments`); `vocabulary` may be populated as a side effect.
 * @param {{term: string, properties?: string[], tokenCache?: Map, limit?: number}} params
 * @param {object} [config={}] QPS options (`maxQPSCandidates`, `minQPSScore`, plus `searchQPS` options).
 * @param {string} [language="french"] Language forwarded to the tokenizer.
 * @returns {Promise<{elapsed: object, hits: object[], count: number}>} Search result;
 *   `elapsed.raw` is the elapsed milliseconds multiplied by 1e6 and floored.
 */
async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
  const startTime = performance.now();
  // Every bail-out path returns its own fresh zero-hit payload.
  const noResults = () => ({ elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 });
  const { term, properties, tokenCache } = params;
  if (!term || typeof term !== "string") {
    return noResults();
  }
  let textProperty = properties && properties[0];
  if (!textProperty) {
    textProperty = pluginState.config.textProperty;
  }
  const searchProperties = properties || [textProperty];
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return noResults();
  }
  const settings = pluginState.config;
  let tolerance = settings.tolerance;
  if (settings.adaptiveTolerance) {
    tolerance = calculateAdaptiveTolerance(queryTokens, settings.tolerance);
  }
  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);

  // Stage 1: rank documents with the QPS index.
  const qpsStartTime = performance.now();
  const qpsCandidates = searchQPS(term, qpsIndex, orama.tokenizer, searchProperties, config, language);
  const qpsTime = performance.now() - qpsStartTime;
  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
  if (qpsCandidates.length === 0) {
    return noResults();
  }
  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
  // Results arrive sorted, so the first entry carries the best score.
  const minScore = qpsCandidates[0][1] * minScoreRatio;
  const filteredCandidates = qpsCandidates
    .filter(([, score]) => score >= minScore)
    .slice(0, maxCandidates);
  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));

  // Make sure the full vocabulary is available, extracting it on demand.
  let vocabulary = pluginState.vocabulary;
  if (vocabulary.size === 0) {
    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
    try {
      const indexData = orama.data?.index;
      const radixNode = indexData?.indexes?.[textProperty]?.node || indexData?.[textProperty]?.node || null;
      if (!radixNode) {
        console.error("\u274C Radix tree not found for vocabulary extraction");
        return noResults();
      }
      pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
      vocabulary = pluginState.vocabulary;
      console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
    } catch (error) {
      console.error("\u274C Failed to extract vocabulary:", error);
      return noResults();
    }
  }

  const docs = orama.data?.docs?.docs || {};
  const vocabStartTime = performance.now();
  const reducedVocabulary = buildReducedVocabularyFromDocs(candidateDocIds, docs);
  const vocabTime = performance.now() - vocabStartTime;
  const reductionPercent = (100 * (1 - reducedVocabulary.size / vocabulary.size)).toFixed(1);
  console.log(`\u{1F4DA} Reduced vocabulary: ${reducedVocabulary.size} words (full: ${vocabulary.size}, reduction: ${reductionPercent}%, built in ${vocabTime.toFixed(2)}ms)`);

  const candidatesMap = findAllCandidates(
    queryTokens,
    reducedVocabulary,
    tolerance,
    settings.enableSynonyms ? pluginState.synonymMap : void 0,
    settings.synonymMatchScore
  );
  // With zero tolerance the candidate map is used unfiltered.
  const filteredFuzzyCandidates = tolerance === 0
    ? candidatesMap
    : filterCandidatesByScore(candidatesMap, settings.minScore);
  const fuzzyTotal = Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0);
  console.log(`\u{1F3AF} Fuzzy candidates: ${fuzzyTotal} total`);

  // Stage 2: phrase-score only the documents that survived QPS filtering.
  const phraseStartTime = performance.now();
  const candidateLookup = buildCandidateLookup(filteredFuzzyCandidates);
  const queryTokenCounts = buildQueryTokenCounts(queryTokens);
  const documentMatches = [];
  let docsScored = 0;
  for (const [docId, doc] of Object.entries(docs)) {
    if (!candidateDocIds.has(docId)) {
      continue;
    }
    docsScored++;
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    // Use cached tokens when the caller supplied them; otherwise split on whitespace.
    const docTokens = tokenCache && tokenCache.has(docId)
      ? tokenCache.get(docId)
      : text.split(/\s+/).filter((token) => token.length > 0);
    const scoringConfig = {
      weights: pluginState.config.weights,
      maxGap: pluginState.config.maxGap,
      proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
      tolerance
    };
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredFuzzyCandidates,
      scoringConfig,
      pluginState.documentFrequency,
      pluginState.totalDocuments,
      queryTokens,
      candidateLookup,
      queryTokenCounts
    );
    if (phrases.length === 0) {
      continue;
    }
    // A document is worth its best phrase.
    const docScore = Math.max(...phrases.map((p) => p.score));
    documentMatches.push({
      id: docId,
      phrases,
      score: docScore,
      document: doc
    });
  }
  const phraseTime = performance.now() - phraseStartTime;
  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);
  documentMatches.sort((a, b) => b.score - a.score);

  let finalMatches = documentMatches;
  if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
    const threshold = pluginState.config.finalScoreMinimum;
    const beforeCount = finalMatches.length;
    finalMatches = finalMatches.filter((m) => m.score >= threshold);
    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
  }
  const limit = params.limit ?? finalMatches.length;
  const hits = finalMatches.slice(0, limit).map((match) => ({
    id: match.id,
    score: match.score,
    document: match.document,
    _phrases: match.phrases
  }));
  const elapsed = performance.now() - startTime;
  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      raw: Math.floor(elapsed * 1e6),
      qpsTime,
      phraseTime
    },
    hits,
    count: hits.length
  };
}
|
|
673
|
+
/**
 * Builds a search function bound to a specific Orama instance, QPS index,
 * plugin state and QPS configuration.
 *
 * @param {object} orama Orama instance.
 * @param {object} qpsIndex QPS index.
 * @param {object} pluginState Plugin state.
 * @param {object} [config={}] QPS options forwarded on every call.
 * @returns {(params: object, language?: string) => Promise<object>} Async function that
 *   runs `searchWithQPSPruning` with the bound arguments; `language` defaults to "french".
 */
function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
  return async function (params, language = "french") {
    return searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
  };
}
|
|
678
|
+
|
|
380
679
|
// src/index.ts
|
|
381
680
|
var DEFAULT_CONFIG = {
|
|
382
|
-
textProperty: "
|
|
681
|
+
textProperty: "normalized_content",
|
|
682
|
+
// Must match server's field name
|
|
383
683
|
tolerance: 1,
|
|
384
684
|
adaptiveTolerance: true,
|
|
385
685
|
enableSynonyms: false,
|
|
@@ -395,6 +695,8 @@ var DEFAULT_CONFIG = {
|
|
|
395
695
|
},
|
|
396
696
|
maxGap: 5,
|
|
397
697
|
minScore: 0.1,
|
|
698
|
+
enableFinalScoreMinimum: false,
|
|
699
|
+
finalScoreMinimum: 0.3,
|
|
398
700
|
proximitySpanMultiplier: 5
|
|
399
701
|
};
|
|
400
702
|
var pluginStates = /* @__PURE__ */ new WeakMap();
|
|
@@ -416,6 +718,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
416
718
|
},
|
|
417
719
|
maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
|
|
418
720
|
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
|
|
721
|
+
enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
|
|
722
|
+
finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
|
|
419
723
|
proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
|
|
420
724
|
};
|
|
421
725
|
const plugin = {
|
|
@@ -429,7 +733,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
429
733
|
synonymMap: {},
|
|
430
734
|
config,
|
|
431
735
|
documentFrequency: /* @__PURE__ */ new Map(),
|
|
432
|
-
totalDocuments: 0
|
|
736
|
+
totalDocuments: 0,
|
|
737
|
+
vocabulary: /* @__PURE__ */ new Set()
|
|
433
738
|
};
|
|
434
739
|
if (config.enableSynonyms && config.supabase) {
|
|
435
740
|
try {
|
|
@@ -446,6 +751,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
446
751
|
state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
|
|
447
752
|
console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
|
|
448
753
|
}
|
|
754
|
+
try {
|
|
755
|
+
const indexData = orama.data?.index;
|
|
756
|
+
let radixNode = null;
|
|
757
|
+
if (indexData?.indexes?.[config.textProperty]?.node) {
|
|
758
|
+
radixNode = indexData.indexes[config.textProperty].node;
|
|
759
|
+
} else if (indexData?.[config.textProperty]?.node) {
|
|
760
|
+
radixNode = indexData[config.textProperty].node;
|
|
761
|
+
}
|
|
762
|
+
if (radixNode) {
|
|
763
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
764
|
+
console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
|
|
765
|
+
} else {
|
|
766
|
+
console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
|
|
767
|
+
}
|
|
768
|
+
} catch (error) {
|
|
769
|
+
console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
|
|
770
|
+
}
|
|
449
771
|
pluginStates.set(orama, state);
|
|
450
772
|
console.log("\u2705 Fuzzy Phrase Plugin initialized");
|
|
451
773
|
setImmediate(() => {
|
|
@@ -467,43 +789,43 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
467
789
|
console.error("\u274C Plugin state not initialized");
|
|
468
790
|
throw new Error("Fuzzy Phrase Plugin not properly initialized");
|
|
469
791
|
}
|
|
470
|
-
const { term, properties } = params;
|
|
792
|
+
const { term, properties, tokenCache, candidateIds } = params;
|
|
793
|
+
const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
|
|
471
794
|
if (!term || typeof term !== "string") {
|
|
472
795
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
473
796
|
}
|
|
474
797
|
const textProperty = properties && properties[0] || state.config.textProperty;
|
|
475
|
-
const queryTokens =
|
|
798
|
+
const queryTokens = tokenize2(term);
|
|
476
799
|
if (queryTokens.length === 0) {
|
|
477
800
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
478
801
|
}
|
|
479
802
|
const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
|
|
480
803
|
console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
|
|
481
|
-
let vocabulary;
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
804
|
+
let vocabulary = state.vocabulary;
|
|
805
|
+
if (vocabulary.size === 0) {
|
|
806
|
+
console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
|
|
807
|
+
try {
|
|
808
|
+
const indexData = orama.data?.index;
|
|
809
|
+
let radixNode = null;
|
|
810
|
+
if (indexData?.indexes?.[textProperty]?.node) {
|
|
811
|
+
radixNode = indexData.indexes[textProperty].node;
|
|
812
|
+
} else if (indexData?.[textProperty]?.node) {
|
|
813
|
+
radixNode = indexData[textProperty].node;
|
|
814
|
+
}
|
|
815
|
+
if (radixNode) {
|
|
816
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
817
|
+
vocabulary = state.vocabulary;
|
|
818
|
+
console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
|
|
819
|
+
} else {
|
|
820
|
+
console.error("\u274C Radix tree not found for vocabulary extraction");
|
|
821
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
822
|
+
}
|
|
823
|
+
} catch (error) {
|
|
824
|
+
console.error("\u274C Failed to extract vocabulary:", error);
|
|
500
825
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
501
826
|
}
|
|
502
|
-
|
|
503
|
-
console.log(`\u{1F4DA}
|
|
504
|
-
} catch (error) {
|
|
505
|
-
console.error("\u274C Failed to extract vocabulary:", error);
|
|
506
|
-
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
827
|
+
} else {
|
|
828
|
+
console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
|
|
507
829
|
}
|
|
508
830
|
const candidatesMap = findAllCandidates(
|
|
509
831
|
queryTokens,
|
|
@@ -512,11 +834,10 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
512
834
|
state.config.enableSynonyms ? state.synonymMap : void 0,
|
|
513
835
|
state.config.synonymMatchScore
|
|
514
836
|
);
|
|
515
|
-
const filteredCandidates = filterCandidatesByScore(
|
|
516
|
-
candidatesMap,
|
|
517
|
-
state.config.minScore
|
|
518
|
-
);
|
|
837
|
+
const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
|
|
519
838
|
console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
|
|
839
|
+
const candidateLookup = buildCandidateLookup(filteredCandidates);
|
|
840
|
+
const queryTokenCounts = buildQueryTokenCounts(queryTokens);
|
|
520
841
|
const documentMatches = [];
|
|
521
842
|
console.log("\u{1F50D} DEBUG orama.data structure:", {
|
|
522
843
|
dataKeys: Object.keys(orama.data || {}),
|
|
@@ -542,23 +863,42 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
542
863
|
dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
|
|
543
864
|
});
|
|
544
865
|
}
|
|
545
|
-
|
|
866
|
+
const cacheHits = tokenCache ? tokenCache.size : 0;
|
|
867
|
+
const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
|
|
868
|
+
console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
|
|
546
869
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
870
|
+
if (candidateIdSet) {
|
|
871
|
+
const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
|
|
872
|
+
if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
|
|
873
|
+
continue;
|
|
874
|
+
}
|
|
875
|
+
}
|
|
547
876
|
const text = doc[textProperty];
|
|
548
877
|
if (!text || typeof text !== "string") {
|
|
549
878
|
continue;
|
|
550
879
|
}
|
|
551
|
-
|
|
880
|
+
let docTokens;
|
|
881
|
+
if (tokenCache && tokenCache.has(docId)) {
|
|
882
|
+
docTokens = tokenCache.get(docId);
|
|
883
|
+
} else {
|
|
884
|
+
docTokens = tokenize2(text);
|
|
885
|
+
}
|
|
552
886
|
const phrases = findPhrasesInDocument(
|
|
553
887
|
docTokens,
|
|
554
888
|
filteredCandidates,
|
|
555
889
|
{
|
|
556
890
|
weights: state.config.weights,
|
|
557
891
|
maxGap: state.config.maxGap,
|
|
558
|
-
proximitySpanMultiplier: state.config.proximitySpanMultiplier
|
|
892
|
+
proximitySpanMultiplier: state.config.proximitySpanMultiplier,
|
|
893
|
+
tolerance
|
|
559
894
|
},
|
|
560
895
|
state.documentFrequency,
|
|
561
|
-
state.totalDocuments
|
|
896
|
+
state.totalDocuments,
|
|
897
|
+
queryTokens,
|
|
898
|
+
candidateLookup,
|
|
899
|
+
// PHASE 1 OPTIMIZATION A: Pre-built candidate lookup
|
|
900
|
+
queryTokenCounts
|
|
901
|
+
// PHASE 1 OPTIMIZATION B: Pre-built query token counts
|
|
562
902
|
);
|
|
563
903
|
if (phrases.length > 0) {
|
|
564
904
|
const docScore = Math.max(...phrases.map((p) => p.score));
|
|
@@ -571,8 +911,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
571
911
|
}
|
|
572
912
|
}
|
|
573
913
|
documentMatches.sort((a, b) => b.score - a.score);
|
|
574
|
-
|
|
575
|
-
|
|
914
|
+
let filteredMatches = documentMatches;
|
|
915
|
+
if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
|
|
916
|
+
const threshold = state.config.finalScoreMinimum;
|
|
917
|
+
const beforeCount = filteredMatches.length;
|
|
918
|
+
filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
|
|
919
|
+
console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
|
|
920
|
+
}
|
|
921
|
+
const limit = params.limit ?? filteredMatches.length;
|
|
922
|
+
const limitedMatches = filteredMatches.slice(0, limit);
|
|
576
923
|
const hits = limitedMatches.map((match) => ({
|
|
577
924
|
id: match.id,
|
|
578
925
|
score: match.score,
|
|
@@ -623,21 +970,29 @@ function calculateDocumentFrequencies(docs, textProperty) {
|
|
|
623
970
|
if (!text || typeof text !== "string") {
|
|
624
971
|
continue;
|
|
625
972
|
}
|
|
626
|
-
const words = new Set(
|
|
973
|
+
const words = new Set(tokenize2(text));
|
|
627
974
|
for (const word of words) {
|
|
628
975
|
df.set(word, (df.get(word) || 0) + 1);
|
|
629
976
|
}
|
|
630
977
|
}
|
|
631
978
|
return df;
|
|
632
979
|
}
|
|
633
|
-
function
|
|
980
|
+
/**
 * Normalizes free text for matching: lowercases, strips combining accents,
 * removes single-letter elisions and other apostrophes, replaces punctuation
 * with spaces and collapses runs of whitespace.
 *
 * @param {string} text Raw input text.
 * @returns {string} Normalized, trimmed text with single spaces between words.
 */
function normalizeText2(text) {
  // Ordered [pattern, replacement] rewrites; the order is significant.
  const rewrites = [
    // Combining marks left over after NFD decomposition.
    [/[\u0300-\u036f]/g, ""],
    // Elided letter + apostrophe directly before a word.
    [/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " "],
    // Any apostrophe variant that remains.
    [/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, ""],
    // Curly double quotes become straight ones.
    [/[\u201c\u201d]/g, '"'],
    // Punctuation becomes a separator.
    [/[.,;:!?()[\]{}\-—–«»""]/g, " "],
    // Collapse whitespace runs.
    [/\s+/g, " "]
  ];
  let result = text.toLowerCase().normalize("NFD");
  for (const [pattern, replacement] of rewrites) {
    result = result.replace(pattern, replacement);
  }
  return result.trim();
}
|
|
636
|
-
function
|
|
637
|
-
return
|
|
983
|
+
/**
 * Splits text into normalized tokens.
 *
 * @param {string} text Raw input text.
 * @returns {string[]} Non-empty tokens of the normalized text, in order.
 */
function tokenize2(text) {
  const normalized = normalizeText2(text);
  // An empty normalized string splits to [""], so empty pieces are filtered out.
  return normalized.split(/\s+/).filter((piece) => piece.length > 0);
}
|
|
986
|
+
/**
 * Looks up the plugin state stored for an Orama instance.
 *
 * @param {object} orama Orama instance used as the lookup key.
 * @returns {object|undefined} The stored state, or `undefined` when none is registered.
 */
function getPluginState(orama) {
  const state = pluginStates.get(orama);
  return state;
}
|
|
639
989
|
|
|
990
|
+
exports.createOptimizedSearch = createOptimizedSearch;
|
|
991
|
+
exports.getPluginState = getPluginState;
|
|
992
|
+
exports.normalizeTextOptimized = normalizeText;
|
|
640
993
|
exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
|
|
641
994
|
exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
|
|
995
|
+
exports.searchWithQPSPruning = searchWithQPSPruning;
|
|
996
|
+
exports.tokenizeOptimized = tokenize;
|
|
642
997
|
//# sourceMappingURL=out.js.map
|
|
643
998
|
//# sourceMappingURL=index.cjs.map
|