@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.newbase.3 → 3.1.16-custom.newbase.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +441 -86
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +116 -3
- package/dist/index.d.ts +116 -3
- package/dist/index.js +437 -87
- package/dist/index.js.map +1 -1
- package/package.json +5 -3
package/dist/index.js
CHANGED
|
@@ -48,9 +48,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
|
|
|
48
48
|
if (word === queryToken) {
|
|
49
49
|
return { matches: true, distance: 0, score: 1 };
|
|
50
50
|
}
|
|
51
|
-
if (word.startsWith(queryToken)) {
|
|
52
|
-
return { matches: true, distance: 0, score: 0.95 };
|
|
53
|
-
}
|
|
54
51
|
const result = boundedLevenshtein(word, queryToken, tolerance);
|
|
55
52
|
if (result.isBounded) {
|
|
56
53
|
const score = 1 - result.distance * 0.2;
|
|
@@ -180,24 +177,51 @@ function filterCandidatesByScore(candidatesMap, minScore) {
|
|
|
180
177
|
}
|
|
181
178
|
|
|
182
179
|
// src/scoring.ts
|
|
183
|
-
function
|
|
180
|
+
/**
 * Inverts the per-query-token candidate map into a lookup keyed by candidate word.
 *
 * @param {Map<string, Array<object>>} candidatesMap - query token -> candidates; each
 *   candidate is read for its `word`, `type` and `score` fields.
 * @returns {Map<string, Array<{queryToken: string, candidate: object}>>} candidate word ->
 *   entries, each list ordered with "exact" candidates first and then by descending score.
 */
function buildCandidateLookup(candidatesMap) {
  const byWord = /* @__PURE__ */ new Map();
  candidatesMap.forEach((candidates, queryToken) => {
    for (const candidate of candidates) {
      let bucket = byWord.get(candidate.word);
      if (bucket === undefined) {
        bucket = [];
        byWord.set(candidate.word, bucket);
      }
      bucket.push({ queryToken, candidate });
    }
  });
  // 0 for exact candidates, 1 for everything else, so exact entries sort to the front.
  const rank = (entry) => (entry.candidate.type === "exact" ? 0 : 1);
  byWord.forEach((bucket) => {
    bucket.sort((left, right) => {
      const rankDelta = rank(left) - rank(right);
      return rankDelta !== 0 ? rankDelta : right.candidate.score - left.candidate.score;
    });
  });
  return byWord;
}
|
|
201
|
+
/**
 * Counts how many times each token occurs in the query.
 *
 * @param {Iterable<string>} queryTokens - query tokens, duplicates allowed.
 * @returns {Map<string, number>} token -> number of occurrences, in first-seen order.
 */
function buildQueryTokenCounts(queryTokens) {
  const tally = /* @__PURE__ */ new Map();
  for (const token of queryTokens) {
    if (tally.has(token)) {
      tally.set(token, tally.get(token) + 1);
    } else {
      tally.set(token, 1);
    }
  }
  return tally;
}
|
|
208
|
+
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments, originalQueryTokens, candidateLookup, queryTokenCounts) {
|
|
184
209
|
const phrases = [];
|
|
185
|
-
const queryTokens =
|
|
210
|
+
const queryTokens = originalQueryTokens;
|
|
186
211
|
const wordMatches = [];
|
|
187
212
|
for (let i = 0; i < documentTokens.length; i++) {
|
|
188
213
|
const docWord = documentTokens[i];
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
}
|
|
214
|
+
const matches = candidateLookup.get(docWord);
|
|
215
|
+
if (matches) {
|
|
216
|
+
for (const { queryToken, candidate } of matches) {
|
|
217
|
+
wordMatches.push({
|
|
218
|
+
word: docWord,
|
|
219
|
+
queryToken,
|
|
220
|
+
position: i,
|
|
221
|
+
type: candidate.type,
|
|
222
|
+
distance: candidate.distance,
|
|
223
|
+
score: candidate.score
|
|
224
|
+
});
|
|
201
225
|
}
|
|
202
226
|
}
|
|
203
227
|
}
|
|
@@ -210,42 +234,56 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
210
234
|
documentFrequency,
|
|
211
235
|
totalDocuments,
|
|
212
236
|
wordMatches,
|
|
213
|
-
documentTokens
|
|
214
|
-
|
|
237
|
+
documentTokens,
|
|
238
|
+
queryTokenCounts
|
|
239
|
+
// OPTIMIZATION B: Pass pre-built queryTokenCounts
|
|
215
240
|
);
|
|
216
241
|
if (phrase && phrase.words.length > 0) {
|
|
217
242
|
phrases.push(phrase);
|
|
218
243
|
}
|
|
219
244
|
}
|
|
220
|
-
|
|
245
|
+
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
|
|
246
|
+
const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
|
|
247
|
+
return deduplicatePhrases(filteredPhrases);
|
|
221
248
|
}
|
|
222
|
-
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
|
|
249
|
+
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens, queryTokenCounts) {
|
|
223
250
|
const startMatch = wordMatches[startIndex];
|
|
224
251
|
const phraseWords = [startMatch];
|
|
225
|
-
const
|
|
252
|
+
const matchedCounts = /* @__PURE__ */ new Map();
|
|
253
|
+
matchedCounts.set(startMatch.queryToken, 1);
|
|
226
254
|
const gapWords = [];
|
|
227
255
|
let totalGapUsed = 0;
|
|
256
|
+
let totalMatchedTokens = 1;
|
|
228
257
|
for (let i = startIndex + 1; i < wordMatches.length; i++) {
|
|
229
258
|
const match = wordMatches[i];
|
|
230
259
|
const lastPos = phraseWords[phraseWords.length - 1].position;
|
|
260
|
+
if (match.position <= lastPos) {
|
|
261
|
+
continue;
|
|
262
|
+
}
|
|
231
263
|
const gap = match.position - lastPos - 1;
|
|
232
264
|
if (gap > config.maxGap) {
|
|
233
265
|
break;
|
|
234
266
|
}
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
gapWords.push({
|
|
238
|
-
word: documentTokens[pos],
|
|
239
|
-
position: pos,
|
|
240
|
-
gapIndex: totalGapUsed
|
|
241
|
-
});
|
|
267
|
+
if (totalGapUsed + gap > config.maxGap) {
|
|
268
|
+
break;
|
|
242
269
|
}
|
|
243
|
-
|
|
270
|
+
const neededCount = queryTokenCounts.get(match.queryToken) || 0;
|
|
271
|
+
const currentCount = matchedCounts.get(match.queryToken) || 0;
|
|
272
|
+
if (currentCount < neededCount) {
|
|
273
|
+
for (let pos = lastPos + 1; pos < match.position; pos++) {
|
|
274
|
+
totalGapUsed++;
|
|
275
|
+
gapWords.push({
|
|
276
|
+
word: documentTokens[pos],
|
|
277
|
+
position: pos,
|
|
278
|
+
gapIndex: totalGapUsed
|
|
279
|
+
});
|
|
280
|
+
}
|
|
244
281
|
phraseWords.push(match);
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
282
|
+
matchedCounts.set(match.queryToken, currentCount + 1);
|
|
283
|
+
totalMatchedTokens++;
|
|
284
|
+
if (totalMatchedTokens === queryTokens.length) {
|
|
285
|
+
break;
|
|
286
|
+
}
|
|
249
287
|
}
|
|
250
288
|
}
|
|
251
289
|
if (phraseWords.length > 0) {
|
|
@@ -284,9 +322,12 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
284
322
|
baseScore /= phraseWords.length;
|
|
285
323
|
const inOrder = isInOrder(phraseWords, queryTokens);
|
|
286
324
|
const orderScore = inOrder ? 1 : 0.5;
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
325
|
+
let proximityScore = 0;
|
|
326
|
+
if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
|
|
327
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
328
|
+
const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
|
|
329
|
+
proximityScore = Math.max(0, 1 - span / proximityWindow);
|
|
330
|
+
}
|
|
290
331
|
let densityScore = 0;
|
|
291
332
|
if (queryTokens.length === 1) {
|
|
292
333
|
const totalOccurrences = allWordMatches.length;
|
|
@@ -304,8 +345,10 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
304
345
|
const weightedDensity = densityScore * weights.density;
|
|
305
346
|
const weightedSemantic = semanticScore * weights.semantic;
|
|
306
347
|
const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
|
|
307
|
-
const
|
|
308
|
-
const
|
|
348
|
+
const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
|
|
349
|
+
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
|
|
350
|
+
const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
|
|
351
|
+
const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
|
|
309
352
|
const normalizedScore = totalScore / maxPossibleScore;
|
|
310
353
|
const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
|
|
311
354
|
const score = normalizedScore * coverageMultiplier;
|
|
@@ -328,13 +371,20 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
328
371
|
};
|
|
329
372
|
}
|
|
330
373
|
/**
 * Checks whether the phrase words follow the order of the query tokens.
 *
 * Each phrase word is greedily assigned the first query position that carries its
 * `queryToken` and lies strictly after the position assigned to the previous word.
 *
 * @param {Array<{queryToken: string}>} phraseWords - matched words in phrase order.
 * @param {string[]} queryTokens - query tokens in query order.
 * @returns {boolean} false as soon as a word cannot be assigned a later position,
 *   true otherwise (including for an empty phrase).
 */
function isInOrder(phraseWords, queryTokens) {
  let cursor = -1;
  for (const word of phraseWords) {
    const next = queryTokens.findIndex(
      (token, index) => index > cursor && token === word.queryToken
    );
    if (next === -1) {
      return false;
    }
    cursor = next;
  }
  return true;
}
|
|
@@ -375,9 +425,259 @@ function deduplicatePhrases(phrases) {
|
|
|
375
425
|
return result.sort((a, b) => b.score - a.score);
|
|
376
426
|
}
|
|
377
427
|
|
|
428
|
+
// src/optimized.ts
|
|
429
|
+
/**
 * Fallback settings for the QPS-pruned search path. Each value is read through `??`,
 * so it applies only when the caller's config leaves that key null or undefined.
 */
var DEFAULT_OPTIMIZED_CONFIG = {
  // Upper bound on how many QPS candidates go on to phrase scoring.
  maxQPSCandidates: 100,
  // A candidate must reach at least this fraction of the best QPS score.
  minQPSScore: 0.1,
  // false => the QPS radix lookup is fuzzy rather than exact.
  qpsExact: false,
  // Edit distance passed to the radix lookup when it is fuzzy.
  qpsTolerance: 1
};
|
|
439
|
+
/**
 * Lower-cases and canonicalizes text so that it can be split on whitespace.
 *
 * @param {string} text - raw text.
 * @returns {string} normalized text: lower case, combining marks removed, apostrophes and
 *   punctuation stripped, whitespace collapsed to single spaces and trimmed.
 */
function normalizeText(text) {
  // Applied strictly in this order; later steps rely on the output of earlier ones.
  const steps = [
    // Drop combining marks exposed by the NFD decomposition (accent folding).
    [/[\u0300-\u036f]/g, ""],
    // One of l/d/c/j/m/n/s/t at a word start, followed by an apostrophe-like character
    // and a word character, becomes a space (e.g. "l'avion" -> " avion").
    [/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " "],
    // Any remaining apostrophe-like character is removed without leaving a gap.
    [/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, ""],
    // Curly double quotes become straight ones (which the next step turns into spaces).
    [/[\u201c\u201d]/g, '"'],
    // Punctuation, brackets, dashes and quote marks become spaces.
    [/[.,;:!?()[\]{}\-—–«»""]/g, " "],
    // Collapse whitespace runs.
    [/\s+/g, " "]
  ];
  const folded = text.toLowerCase().normalize("NFD");
  return steps
    .reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), folded)
    .trim();
}
|
|
442
|
+
/**
 * Normalizes text and splits it into non-empty whitespace-separated tokens.
 *
 * @param {string} text - raw text.
 * @returns {string[]} tokens in order of appearance; empty when nothing remains
 *   after normalization.
 */
function tokenize(text) {
  const normalized = normalizeText(text);
  const tokens = [];
  for (const piece of normalized.split(/\s+/)) {
    // Splitting an empty string yields one empty piece; skip it.
    if (piece.length > 0) {
      tokens.push(piece);
    }
  }
  return tokens;
}
|
|
445
|
+
/**
 * Collects the distinct whitespace-separated tokens found in the candidate documents.
 *
 * @param {Iterable<string|number>} candidateDocIds - keys into `docs` to scan.
 * @param {Object<string, object>} docs - document store keyed by document id.
 * @param {string} [textProperty="normalized_content"] - document field to read; the
 *   default keeps the field name this function originally hard-coded.
 * @returns {Set<string>} distinct tokens. Documents that are missing, or whose field is
 *   not a non-empty string, are skipped instead of throwing.
 */
function buildReducedVocabularyFromDocs(candidateDocIds, docs, textProperty = "normalized_content") {
  const reducedVocab = /* @__PURE__ */ new Set();
  if (!docs) {
    return reducedVocab;
  }
  for (const docId of candidateDocIds) {
    const content = docs[docId]?.[textProperty];
    // Only strings can be split; a truthy non-string used to raise a TypeError here.
    if (typeof content !== "string" || content.length === 0) {
      continue;
    }
    for (const token of content.split(/\s+/)) {
      if (token.length > 0) {
        reducedVocab.add(token);
      }
    }
  }
  return reducedVocab;
}
|
|
458
|
+
/**
 * Scores documents against the query tokens using the QPS radix indexes.
 *
 * @param {string} term - raw search term, handed to `tokenizer.tokenize`.
 * @param {{indexes: object, stats: object}} qpsIndex - per-property index entries and stats.
 * @param {{tokenize: Function}} tokenizer - called as `tokenize(term, language)`.
 * @param {Iterable<string>} properties - property names to search; entries whose index
 *   is missing, is not of type "Radix", or has no stats are skipped.
 * @param {object} config - optional `qpsExact`, `qpsTolerance`, `qpsBoostPerProp`.
 * @param {string} language - forwarded to the tokenizer.
 * @returns {Array<[*, number]>} `[docId, score]` pairs sorted by descending score.
 */
function searchQPS(term, qpsIndex, tokenizer, properties, config, language) {
  const tokens = tokenizer.tokenize(term, language);
  if (tokens.length === 0) {
    return [];
  }
  const exact = config?.qpsExact ?? DEFAULT_OPTIMIZED_CONFIG.qpsExact;
  const tolerance = config?.qpsTolerance ?? DEFAULT_OPTIMIZED_CONFIG.qpsTolerance;
  const boostPerProp = config?.qpsBoostPerProp ?? {};
  // docId -> [accumulated score, bitmask of query-token indexes that matched]
  const resultMap = /* @__PURE__ */ new Map();
  for (const prop of properties) {
    const indexEntry = qpsIndex.indexes[prop];
    if (!indexEntry || indexEntry.type !== "Radix") {
      continue;
    }
    const radixNode = indexEntry.node;
    const stats = qpsIndex.stats[prop];
    if (!radixNode || !stats) {
      continue;
    }
    const boost = boostPerProp[prop] ?? 1;
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      // Shift counts are taken modulo 32, so `1 << 32` would alias token 0. Tokens
      // past the 32-bit mask therefore carry no bit and never earn an adjacency bonus.
      const tokenBit = i < 32 ? 1 << i : 0;
      const matches = radixNode.find({
        term: token,
        exact,
        tolerance: exact ? 0 : tolerance
      }) ?? {};
      for (const [matchedWord, docIds] of Object.entries(matches)) {
        if (!Array.isArray(docIds)) {
          continue;
        }
        const isExactMatch = matchedWord === token;
        for (const docId of docIds) {
          const tokensLength = stats.tokensLength.get(docId) || 1;
          const quantum = stats.tokenQuantums[docId]?.[matchedWord];
          // The occurrence count is read from the bits above bit 20 of the quantum.
          const occurrences = quantum ? quantum >> 20 : 1;
          const scoreContrib = (occurrences * occurrences / tokensLength + (isExactMatch ? 1 : 0)) * boost;
          const previous = resultMap.get(docId);
          if (previous === undefined) {
            resultMap.set(docId, [scoreContrib, tokenBit]);
          } else {
            const [prevScore, prevMask] = previous;
            // Unsigned shift: once bit 31 is set the mask is negative, and a signed
            // shift would smear the sign bit into the test and yield a negative value.
            // NOTE(review): this tests bit i+1 of the previous mask, i.e. whether the
            // *following* query token already matched this document - confirm that is
            // the intended notion of adjacency.
            const adjacencyBonus = ((prevMask >>> 1) & tokenBit) !== 0 ? 2 : 0;
            resultMap.set(docId, [prevScore + scoreContrib + adjacencyBonus, prevMask | tokenBit]);
          }
        }
      }
    }
  }
  const results = Array.from(resultMap.entries()).map(([docId, [score]]) => [docId, score]).sort((a, b) => b[1] - a[1]);
  return results;
}
|
|
508
|
+
/**
 * Counts the 1-bits in the 32-bit integer representation of `n`.
 *
 * @param {number} n - value to inspect; it is treated as a 32-bit integer.
 * @returns {number} number of set bits, between 0 and 32.
 */
function countSetBits(n) {
  let count = 0;
  while (n) {
    count += n & 1;
    // Unsigned shift: a signed `>>` keeps refilling the sign bit, so a negative
    // input settles at -1 and the loop never ends.
    n >>>= 1;
  }
  return count;
}
|
|
516
|
+
/**
 * Two-stage search: QPS scoring selects a bounded set of candidate documents, then
 * fuzzy phrase matching is run on those candidates only.
 *
 * Stages:
 *  1. `searchQPS` ranks documents; candidates below `minQPSScore` x best score are
 *     dropped and the rest capped at `maxQPSCandidates`.
 *  2. A reduced vocabulary is built from the candidate documents and fuzzy candidates
 *     are generated against it.
 *  3. Each candidate document is phrase-scored; its score is the best phrase score.
 *  4. Optional final-score threshold, then `params.limit`.
 *
 * @param {object} orama - Orama instance; `tokenizer`, `data.index` and `data.docs.docs` are read.
 * @param {object} qpsIndex - QPS index passed through to `searchQPS`.
 * @param {object} pluginState - plugin state (`config`, `vocabulary`, `synonymMap`,
 *   `documentFrequency`, `totalDocuments`); `vocabulary` is filled in lazily when empty.
 * @param {{term: string, properties?: string[], tokenCache?: Map<string, string[]>, limit?: number}} params
 * @param {object} [config={}] - optional `maxQPSCandidates`, `minQPSScore` and QPS settings.
 * @param {string} [language="french"] - tokenizer language for the QPS stage.
 * @returns {Promise<{elapsed: object, hits: object[], count: number}>} ranked hits; an
 *   empty result when the term is unusable, nothing matches, or the vocabulary cannot
 *   be obtained. Equal-scoring hits keep QPS rank order.
 */
async function searchWithQPSPruning(orama, qpsIndex, pluginState, params, config = {}, language = "french") {
  const startTime = performance.now();
  const emptyResult = () => ({ elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 });
  const { term, properties, tokenCache } = params;
  if (!term || typeof term !== "string") {
    return emptyResult();
  }
  const textProperty = properties && properties[0] || pluginState.config.textProperty;
  // An empty `properties` array used to be taken as-is, leaving nothing to search.
  const searchProperties = properties && properties.length > 0 ? properties : [textProperty];
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return emptyResult();
  }
  const tolerance = pluginState.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, pluginState.config.tolerance) : pluginState.config.tolerance;
  console.log(`\u{1F680} Optimized search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);

  // Stage 1: QPS candidate selection.
  const qpsStartTime = performance.now();
  const qpsCandidates = searchQPS(term, qpsIndex, orama.tokenizer, searchProperties, config, language);
  const qpsTime = performance.now() - qpsStartTime;
  console.log(`\u26A1 QPS found ${qpsCandidates.length} candidates in ${qpsTime.toFixed(2)}ms`);
  if (qpsCandidates.length === 0) {
    return emptyResult();
  }
  const maxCandidates = config.maxQPSCandidates ?? DEFAULT_OPTIMIZED_CONFIG.maxQPSCandidates;
  const minScoreRatio = config.minQPSScore ?? DEFAULT_OPTIMIZED_CONFIG.minQPSScore;
  const bestScore = qpsCandidates[0][1];
  const minScore = bestScore * minScoreRatio;
  const filteredCandidates = qpsCandidates.filter(([, score]) => score >= minScore).slice(0, maxCandidates);
  console.log(`\u{1F4CB} Filtered to ${filteredCandidates.length} candidates (min score: ${minScore.toFixed(2)})`);
  // Document store keys are strings, so ids are stringified for lookup.
  const candidateDocIds = new Set(filteredCandidates.map(([docId]) => String(docId)));

  // The full vocabulary is extracted lazily from the radix tree when not cached yet.
  let vocabulary = pluginState.vocabulary;
  if (!vocabulary || vocabulary.size === 0) {
    console.log("\u{1F4DA} Vocabulary not initialized - extracting from index...");
    try {
      const indexData = orama.data?.index;
      let radixNode = null;
      if (indexData?.indexes?.[textProperty]?.node) {
        radixNode = indexData.indexes[textProperty].node;
      } else if (indexData?.[textProperty]?.node) {
        radixNode = indexData[textProperty].node;
      }
      if (radixNode) {
        pluginState.vocabulary = extractVocabularyFromRadixTree(radixNode);
        vocabulary = pluginState.vocabulary;
        console.log(`\u{1F4DA} Extracted ${vocabulary.size} vocabulary words`);
      } else {
        console.error("\u274C Radix tree not found for vocabulary extraction");
        return emptyResult();
      }
    } catch (error) {
      console.error("\u274C Failed to extract vocabulary:", error);
      return emptyResult();
    }
  }
  let docs = {};
  if (orama.data?.docs?.docs) {
    docs = orama.data.docs.docs;
  }

  // Stage 2: fuzzy candidates against the vocabulary of the candidate documents only.
  const vocabStartTime = performance.now();
  const reducedVocabulary = buildReducedVocabularyFromDocs(candidateDocIds, docs);
  const vocabTime = performance.now() - vocabStartTime;
  // Guard the ratio so an empty full vocabulary does not print NaN.
  const reductionPercent = vocabulary.size > 0 ? 100 * (1 - reducedVocabulary.size / vocabulary.size) : 0;
  console.log(`\u{1F4DA} Reduced vocabulary: ${reducedVocabulary.size} words (full: ${vocabulary.size}, reduction: ${reductionPercent.toFixed(1)}%, built in ${vocabTime.toFixed(2)}ms)`);
  const candidatesMap = findAllCandidates(
    queryTokens,
    reducedVocabulary,
    tolerance,
    pluginState.config.enableSynonyms ? pluginState.synonymMap : void 0,
    pluginState.config.synonymMatchScore
  );
  const filteredFuzzyCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, pluginState.config.minScore);
  console.log(`\u{1F3AF} Fuzzy candidates: ${Array.from(filteredFuzzyCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);

  // Stage 3: phrase scoring. Only the candidate ids are visited; walking every
  // document in the store would defeat the pruning.
  const phraseStartTime = performance.now();
  const candidateLookup = buildCandidateLookup(filteredFuzzyCandidates);
  const queryTokenCounts = buildQueryTokenCounts(queryTokens);
  const documentMatches = [];
  let docsScored = 0;
  for (const docId of candidateDocIds) {
    if (!Object.prototype.hasOwnProperty.call(docs, docId)) {
      continue;
    }
    const doc = docs[docId];
    docsScored++;
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    let docTokens;
    if (tokenCache && tokenCache.has(docId)) {
      docTokens = tokenCache.get(docId);
    } else {
      docTokens = text.split(/\s+/).filter((token) => token.length > 0);
    }
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredFuzzyCandidates,
      {
        weights: pluginState.config.weights,
        maxGap: pluginState.config.maxGap,
        proximitySpanMultiplier: pluginState.config.proximitySpanMultiplier,
        tolerance
      },
      pluginState.documentFrequency,
      pluginState.totalDocuments,
      queryTokens,
      candidateLookup,
      queryTokenCounts
    );
    if (phrases.length > 0) {
      // Folded rather than spread into Math.max, so the argument count stays bounded.
      const docScore = phrases.reduce((best, p) => Math.max(best, p.score), -Infinity);
      documentMatches.push({
        id: docId,
        phrases,
        score: docScore,
        document: doc
      });
    }
  }
  const phraseTime = performance.now() - phraseStartTime;
  console.log(`\u{1F4CA} Phrase scored ${docsScored} documents in ${phraseTime.toFixed(2)}ms`);

  // Stage 4: rank, optional threshold, limit.
  documentMatches.sort((a, b) => b.score - a.score);
  let finalMatches = documentMatches;
  if (pluginState.config.enableFinalScoreMinimum && pluginState.config.finalScoreMinimum > 0) {
    const threshold = pluginState.config.finalScoreMinimum;
    const beforeCount = finalMatches.length;
    finalMatches = finalMatches.filter((m) => m.score >= threshold);
    console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${finalMatches.length} (threshold: ${threshold})`);
  }
  const limit = params.limit ?? finalMatches.length;
  const limitedMatches = finalMatches.slice(0, limit);
  const hits = limitedMatches.map((match) => ({
    id: match.id,
    score: match.score,
    document: match.document,
    _phrases: match.phrases
  }));
  const elapsed = performance.now() - startTime;
  console.log(`\u2705 Optimized search: ${hits.length} results in ${elapsed.toFixed(2)}ms (QPS: ${qpsTime.toFixed(2)}ms, Phrase: ${phraseTime.toFixed(2)}ms)`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      raw: Math.floor(elapsed * 1e6),
      qpsTime,
      phraseTime
    },
    hits,
    count: hits.length
  };
}
|
|
671
|
+
/**
 * Binds an Orama instance, QPS index, plugin state and config into a search function.
 *
 * @param {object} orama - Orama instance forwarded to `searchWithQPSPruning`.
 * @param {object} qpsIndex - QPS index forwarded to `searchWithQPSPruning`.
 * @param {object} pluginState - plugin state forwarded to `searchWithQPSPruning`.
 * @param {object} [config={}] - optimized-search options forwarded unchanged.
 * @returns {(params: object, language?: string) => Promise<object>} async search function;
 *   `language` defaults to "french".
 */
function createOptimizedSearch(orama, qpsIndex, pluginState, config = {}) {
  return async (params, language = "french") =>
    searchWithQPSPruning(orama, qpsIndex, pluginState, params, config, language);
}
|
|
676
|
+
|
|
378
677
|
// src/index.ts
|
|
379
678
|
var DEFAULT_CONFIG = {
|
|
380
|
-
textProperty: "
|
|
679
|
+
textProperty: "normalized_content",
|
|
680
|
+
// Must match server's field name
|
|
381
681
|
tolerance: 1,
|
|
382
682
|
adaptiveTolerance: true,
|
|
383
683
|
enableSynonyms: false,
|
|
@@ -393,6 +693,8 @@ var DEFAULT_CONFIG = {
|
|
|
393
693
|
},
|
|
394
694
|
maxGap: 5,
|
|
395
695
|
minScore: 0.1,
|
|
696
|
+
enableFinalScoreMinimum: false,
|
|
697
|
+
finalScoreMinimum: 0.3,
|
|
396
698
|
proximitySpanMultiplier: 5
|
|
397
699
|
};
|
|
398
700
|
var pluginStates = /* @__PURE__ */ new WeakMap();
|
|
@@ -414,6 +716,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
414
716
|
},
|
|
415
717
|
maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
|
|
416
718
|
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
|
|
719
|
+
enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
|
|
720
|
+
finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
|
|
417
721
|
proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
|
|
418
722
|
};
|
|
419
723
|
const plugin = {
|
|
@@ -427,7 +731,8 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
427
731
|
synonymMap: {},
|
|
428
732
|
config,
|
|
429
733
|
documentFrequency: /* @__PURE__ */ new Map(),
|
|
430
|
-
totalDocuments: 0
|
|
734
|
+
totalDocuments: 0,
|
|
735
|
+
vocabulary: /* @__PURE__ */ new Set()
|
|
431
736
|
};
|
|
432
737
|
if (config.enableSynonyms && config.supabase) {
|
|
433
738
|
try {
|
|
@@ -444,6 +749,23 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
444
749
|
state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
|
|
445
750
|
console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
|
|
446
751
|
}
|
|
752
|
+
try {
|
|
753
|
+
const indexData = orama.data?.index;
|
|
754
|
+
let radixNode = null;
|
|
755
|
+
if (indexData?.indexes?.[config.textProperty]?.node) {
|
|
756
|
+
radixNode = indexData.indexes[config.textProperty].node;
|
|
757
|
+
} else if (indexData?.[config.textProperty]?.node) {
|
|
758
|
+
radixNode = indexData[config.textProperty].node;
|
|
759
|
+
}
|
|
760
|
+
if (radixNode) {
|
|
761
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
762
|
+
console.log(`\u{1F4DA} Cached ${state.vocabulary.size} vocabulary words (eliminates per-query radix traversal)`);
|
|
763
|
+
} else {
|
|
764
|
+
console.warn("\u26A0\uFE0F Could not find radix tree for vocabulary caching");
|
|
765
|
+
}
|
|
766
|
+
} catch (error) {
|
|
767
|
+
console.error("\u26A0\uFE0F Failed to cache vocabulary:", error);
|
|
768
|
+
}
|
|
447
769
|
pluginStates.set(orama, state);
|
|
448
770
|
console.log("\u2705 Fuzzy Phrase Plugin initialized");
|
|
449
771
|
setImmediate(() => {
|
|
@@ -465,43 +787,43 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
465
787
|
console.error("\u274C Plugin state not initialized");
|
|
466
788
|
throw new Error("Fuzzy Phrase Plugin not properly initialized");
|
|
467
789
|
}
|
|
468
|
-
const { term, properties } = params;
|
|
790
|
+
const { term, properties, tokenCache, candidateIds } = params;
|
|
791
|
+
const candidateIdSet = candidateIds ? candidateIds instanceof Set ? candidateIds : new Set(candidateIds) : null;
|
|
469
792
|
if (!term || typeof term !== "string") {
|
|
470
793
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
471
794
|
}
|
|
472
795
|
const textProperty = properties && properties[0] || state.config.textProperty;
|
|
473
|
-
const queryTokens =
|
|
796
|
+
const queryTokens = tokenize2(term);
|
|
474
797
|
if (queryTokens.length === 0) {
|
|
475
798
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
476
799
|
}
|
|
477
800
|
const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
|
|
478
801
|
console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
|
|
479
|
-
let vocabulary;
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
802
|
+
let vocabulary = state.vocabulary;
|
|
803
|
+
if (vocabulary.size === 0) {
|
|
804
|
+
console.log("\u{1F4DA} Vocabulary cache empty - extracting on first search...");
|
|
805
|
+
try {
|
|
806
|
+
const indexData = orama.data?.index;
|
|
807
|
+
let radixNode = null;
|
|
808
|
+
if (indexData?.indexes?.[textProperty]?.node) {
|
|
809
|
+
radixNode = indexData.indexes[textProperty].node;
|
|
810
|
+
} else if (indexData?.[textProperty]?.node) {
|
|
811
|
+
radixNode = indexData[textProperty].node;
|
|
812
|
+
}
|
|
813
|
+
if (radixNode) {
|
|
814
|
+
state.vocabulary = extractVocabularyFromRadixTree(radixNode);
|
|
815
|
+
vocabulary = state.vocabulary;
|
|
816
|
+
console.log(`\u{1F4DA} Cached ${vocabulary.size} vocabulary words (will be reused for subsequent queries)`);
|
|
817
|
+
} else {
|
|
818
|
+
console.error("\u274C Radix tree not found for vocabulary extraction");
|
|
819
|
+
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
820
|
+
}
|
|
821
|
+
} catch (error) {
|
|
822
|
+
console.error("\u274C Failed to extract vocabulary:", error);
|
|
498
823
|
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
499
824
|
}
|
|
500
|
-
|
|
501
|
-
console.log(`\u{1F4DA}
|
|
502
|
-
} catch (error) {
|
|
503
|
-
console.error("\u274C Failed to extract vocabulary:", error);
|
|
504
|
-
return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
|
|
825
|
+
} else {
|
|
826
|
+
console.log(`\u{1F4DA} Using cached vocabulary (${vocabulary.size} words)`);
|
|
505
827
|
}
|
|
506
828
|
const candidatesMap = findAllCandidates(
|
|
507
829
|
queryTokens,
|
|
@@ -510,11 +832,10 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
510
832
|
state.config.enableSynonyms ? state.synonymMap : void 0,
|
|
511
833
|
state.config.synonymMatchScore
|
|
512
834
|
);
|
|
513
|
-
const filteredCandidates = filterCandidatesByScore(
|
|
514
|
-
candidatesMap,
|
|
515
|
-
state.config.minScore
|
|
516
|
-
);
|
|
835
|
+
const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
|
|
517
836
|
console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
|
|
837
|
+
const candidateLookup = buildCandidateLookup(filteredCandidates);
|
|
838
|
+
const queryTokenCounts = buildQueryTokenCounts(queryTokens);
|
|
518
839
|
const documentMatches = [];
|
|
519
840
|
console.log("\u{1F50D} DEBUG orama.data structure:", {
|
|
520
841
|
dataKeys: Object.keys(orama.data || {}),
|
|
@@ -540,23 +861,42 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
540
861
|
dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
|
|
541
862
|
});
|
|
542
863
|
}
|
|
543
|
-
|
|
864
|
+
const cacheHits = tokenCache ? tokenCache.size : 0;
|
|
865
|
+
const docsToSearch = candidateIdSet ? candidateIdSet.size : Object.keys(docs).length;
|
|
866
|
+
console.log(`\u{1F4C4} Searching through ${docsToSearch} documents${candidateIdSet ? " (pruned by candidateIds)" : ""} (${cacheHits > 0 ? `${cacheHits} tokens cached` : "no cache"})`);
|
|
544
867
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
868
|
+
if (candidateIdSet) {
|
|
869
|
+
const userDocId = doc.id !== void 0 ? String(doc.id) : docId;
|
|
870
|
+
if (!candidateIdSet.has(userDocId) && !candidateIdSet.has(docId)) {
|
|
871
|
+
continue;
|
|
872
|
+
}
|
|
873
|
+
}
|
|
545
874
|
const text = doc[textProperty];
|
|
546
875
|
if (!text || typeof text !== "string") {
|
|
547
876
|
continue;
|
|
548
877
|
}
|
|
549
|
-
|
|
878
|
+
let docTokens;
|
|
879
|
+
if (tokenCache && tokenCache.has(docId)) {
|
|
880
|
+
docTokens = tokenCache.get(docId);
|
|
881
|
+
} else {
|
|
882
|
+
docTokens = tokenize2(text);
|
|
883
|
+
}
|
|
550
884
|
const phrases = findPhrasesInDocument(
|
|
551
885
|
docTokens,
|
|
552
886
|
filteredCandidates,
|
|
553
887
|
{
|
|
554
888
|
weights: state.config.weights,
|
|
555
889
|
maxGap: state.config.maxGap,
|
|
556
|
-
proximitySpanMultiplier: state.config.proximitySpanMultiplier
|
|
890
|
+
proximitySpanMultiplier: state.config.proximitySpanMultiplier,
|
|
891
|
+
tolerance
|
|
557
892
|
},
|
|
558
893
|
state.documentFrequency,
|
|
559
|
-
state.totalDocuments
|
|
894
|
+
state.totalDocuments,
|
|
895
|
+
queryTokens,
|
|
896
|
+
candidateLookup,
|
|
897
|
+
// PHASE 1 OPTIMIZATION A: Pre-built candidate lookup
|
|
898
|
+
queryTokenCounts
|
|
899
|
+
// PHASE 1 OPTIMIZATION B: Pre-built query token counts
|
|
560
900
|
);
|
|
561
901
|
if (phrases.length > 0) {
|
|
562
902
|
const docScore = Math.max(...phrases.map((p) => p.score));
|
|
@@ -569,8 +909,15 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
569
909
|
}
|
|
570
910
|
}
|
|
571
911
|
documentMatches.sort((a, b) => b.score - a.score);
|
|
572
|
-
|
|
573
|
-
|
|
912
|
+
let filteredMatches = documentMatches;
|
|
913
|
+
if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
|
|
914
|
+
const threshold = state.config.finalScoreMinimum;
|
|
915
|
+
const beforeCount = filteredMatches.length;
|
|
916
|
+
filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
|
|
917
|
+
console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
|
|
918
|
+
}
|
|
919
|
+
const limit = params.limit ?? filteredMatches.length;
|
|
920
|
+
const limitedMatches = filteredMatches.slice(0, limit);
|
|
574
921
|
const hits = limitedMatches.map((match) => ({
|
|
575
922
|
id: match.id,
|
|
576
923
|
score: match.score,
|
|
@@ -621,20 +968,23 @@ function calculateDocumentFrequencies(docs, textProperty) {
|
|
|
621
968
|
if (!text || typeof text !== "string") {
|
|
622
969
|
continue;
|
|
623
970
|
}
|
|
624
|
-
const words = new Set(
|
|
971
|
+
const words = new Set(tokenize2(text));
|
|
625
972
|
for (const word of words) {
|
|
626
973
|
df.set(word, (df.get(word) || 0) + 1);
|
|
627
974
|
}
|
|
628
975
|
}
|
|
629
976
|
return df;
|
|
630
977
|
}
|
|
631
|
-
function
|
|
978
|
+
/**
 * Normalizes text for the standard (non-pruned) search path.
 *
 * This used to be a byte-for-byte copy of `normalizeText`. It now delegates to it,
 * so index-time and query-time normalization cannot drift apart.
 *
 * @param {string} text - raw text.
 * @returns {string} the result of `normalizeText(text)`.
 */
function normalizeText2(text) {
  return normalizeText(text);
}
|
|
634
|
-
function
|
|
635
|
-
return
|
|
981
|
+
/**
 * Tokenizes text for the standard (non-pruned) search path.
 *
 * This used to repeat the normalize-split-filter steps of `tokenize` over an identical
 * normalizer. It now delegates to `tokenize`, so both paths share one implementation.
 *
 * @param {string} text - raw text.
 * @returns {string[]} the result of `tokenize(text)`.
 */
function tokenize2(text) {
  return tokenize(text);
}
|
|
984
|
+
/**
 * Looks up the plugin state registered for an Orama instance.
 *
 * @param {object} orama - Orama instance used as the key into `pluginStates`.
 * @returns {object|undefined} the stored state, or undefined when none was registered.
 */
function getPluginState(orama) {
  const state = pluginStates.get(orama);
  return state;
}
|
|
637
987
|
|
|
638
|
-
export { pluginFuzzyPhrase, searchWithFuzzyPhrase };
|
|
988
|
+
export { createOptimizedSearch, getPluginState, normalizeText as normalizeTextOptimized, pluginFuzzyPhrase, searchWithFuzzyPhrase, searchWithQPSPruning, tokenize as tokenizeOptimized };
|
|
639
989
|
//# sourceMappingURL=out.js.map
|
|
640
990
|
//# sourceMappingURL=index.js.map
|