@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.9 → 3.1.16-custom.newbase.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +13 -0
- package/README.md +1 -1
- package/dist/index.cjs +168 -57
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +54 -89
- package/dist/index.d.ts +54 -89
- package/dist/index.js +168 -57
- package/dist/index.js.map +1 -1
- package/package.json +62 -54
package/LICENSE.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Copyright 2023 OramaSearch Inc.
|
|
2
|
+
|
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
you may not use this file except in compliance with the License.
|
|
5
|
+
You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
See the License for the specific language governing permissions and
|
|
13
|
+
limitations under the License.
|
package/README.md
CHANGED
package/dist/index.cjs
CHANGED
|
@@ -50,9 +50,6 @@ function fuzzyMatch(word, queryToken, tolerance) {
|
|
|
50
50
|
if (word === queryToken) {
|
|
51
51
|
return { matches: true, distance: 0, score: 1 };
|
|
52
52
|
}
|
|
53
|
-
if (word.startsWith(queryToken)) {
|
|
54
|
-
return { matches: true, distance: 0, score: 0.95 };
|
|
55
|
-
}
|
|
56
53
|
const result = boundedLevenshtein(word, queryToken, tolerance);
|
|
57
54
|
if (result.isBounded) {
|
|
58
55
|
const score = 1 - result.distance * 0.2;
|
|
@@ -82,30 +79,13 @@ function calculateAdaptiveTolerance(queryTokens, baseTolerance) {
|
|
|
82
79
|
function extractVocabularyFromRadixTree(radixNode) {
|
|
83
80
|
const vocabulary = /* @__PURE__ */ new Set();
|
|
84
81
|
let nodesVisited = 0;
|
|
85
|
-
let wordsFound = 0;
|
|
86
82
|
function traverse(node, depth = 0) {
|
|
87
83
|
if (!node) {
|
|
88
|
-
console.log(`\u26A0\uFE0F Null node at depth ${depth}`);
|
|
89
84
|
return;
|
|
90
85
|
}
|
|
91
86
|
nodesVisited++;
|
|
92
|
-
if (nodesVisited <= 3) {
|
|
93
|
-
const cInfo = node.c ? {
|
|
94
|
-
isArray: Array.isArray(node.c),
|
|
95
|
-
isMap: node.c instanceof Map,
|
|
96
|
-
type: typeof node.c,
|
|
97
|
-
constructor: node.c.constructor?.name,
|
|
98
|
-
keys: node.c instanceof Map ? Array.from(node.c.keys()).slice(0, 3) : Object.keys(node.c).slice(0, 3),
|
|
99
|
-
valuesCount: node.c instanceof Map ? node.c.size : Array.isArray(node.c) ? node.c.length : Object.keys(node.c).length
|
|
100
|
-
} : "null";
|
|
101
|
-
console.log(`\u{1F50D} Node ${nodesVisited}:`, { w: node.w, e: node.e, has_c: !!node.c, c_info: cInfo });
|
|
102
|
-
}
|
|
103
87
|
if (node.e && node.w && typeof node.w === "string" && node.w.length > 0) {
|
|
104
88
|
vocabulary.add(node.w);
|
|
105
|
-
wordsFound++;
|
|
106
|
-
if (wordsFound <= 5) {
|
|
107
|
-
console.log(`\u2705 Found word ${wordsFound}: "${node.w}"`);
|
|
108
|
-
}
|
|
109
89
|
}
|
|
110
90
|
if (node.c) {
|
|
111
91
|
if (node.c instanceof Map) {
|
|
@@ -227,52 +207,85 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
|
|
|
227
207
|
queryTokens,
|
|
228
208
|
config,
|
|
229
209
|
documentFrequency,
|
|
230
|
-
totalDocuments
|
|
210
|
+
totalDocuments,
|
|
211
|
+
wordMatches,
|
|
212
|
+
documentTokens
|
|
213
|
+
// Pass document tokens to extract gap words
|
|
231
214
|
);
|
|
232
215
|
if (phrase && phrase.words.length > 0) {
|
|
233
216
|
phrases.push(phrase);
|
|
234
217
|
}
|
|
235
218
|
}
|
|
236
|
-
|
|
219
|
+
const minTokensRequired = queryTokens.length >= 3 ? 2 : 1;
|
|
220
|
+
const filteredPhrases = phrases.filter((p) => p.words.length >= minTokensRequired);
|
|
221
|
+
return deduplicatePhrases(filteredPhrases);
|
|
237
222
|
}
|
|
238
|
-
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments) {
|
|
223
|
+
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, documentTokens) {
|
|
239
224
|
const startMatch = wordMatches[startIndex];
|
|
240
225
|
const phraseWords = [startMatch];
|
|
241
|
-
const
|
|
226
|
+
const queryTokenCounts = /* @__PURE__ */ new Map();
|
|
227
|
+
for (const token of queryTokens) {
|
|
228
|
+
queryTokenCounts.set(token, (queryTokenCounts.get(token) || 0) + 1);
|
|
229
|
+
}
|
|
230
|
+
const matchedCounts = /* @__PURE__ */ new Map();
|
|
231
|
+
matchedCounts.set(startMatch.queryToken, 1);
|
|
232
|
+
const gapWords = [];
|
|
233
|
+
let totalGapUsed = 0;
|
|
234
|
+
let totalMatchedTokens = 1;
|
|
242
235
|
for (let i = startIndex + 1; i < wordMatches.length; i++) {
|
|
243
236
|
const match = wordMatches[i];
|
|
244
|
-
const
|
|
237
|
+
const lastPos = phraseWords[phraseWords.length - 1].position;
|
|
238
|
+
const gap = match.position - lastPos - 1;
|
|
245
239
|
if (gap > config.maxGap) {
|
|
246
240
|
break;
|
|
247
241
|
}
|
|
248
|
-
|
|
242
|
+
const neededCount = queryTokenCounts.get(match.queryToken) || 0;
|
|
243
|
+
const currentCount = matchedCounts.get(match.queryToken) || 0;
|
|
244
|
+
if (currentCount < neededCount) {
|
|
245
|
+
for (let pos = lastPos + 1; pos < match.position; pos++) {
|
|
246
|
+
totalGapUsed++;
|
|
247
|
+
gapWords.push({
|
|
248
|
+
word: documentTokens[pos],
|
|
249
|
+
position: pos,
|
|
250
|
+
gapIndex: totalGapUsed
|
|
251
|
+
});
|
|
252
|
+
}
|
|
249
253
|
phraseWords.push(match);
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
+
matchedCounts.set(match.queryToken, currentCount + 1);
|
|
255
|
+
totalMatchedTokens++;
|
|
256
|
+
if (totalMatchedTokens === queryTokens.length) {
|
|
257
|
+
break;
|
|
258
|
+
}
|
|
254
259
|
}
|
|
255
260
|
}
|
|
256
261
|
if (phraseWords.length > 0) {
|
|
257
|
-
const
|
|
262
|
+
const coverage = phraseWords.length / queryTokens.length;
|
|
263
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
264
|
+
const { score, breakdown } = calculatePhraseScore(
|
|
258
265
|
phraseWords,
|
|
259
266
|
queryTokens,
|
|
260
267
|
config,
|
|
261
268
|
documentFrequency,
|
|
262
|
-
totalDocuments
|
|
269
|
+
totalDocuments,
|
|
270
|
+
allWordMatches,
|
|
271
|
+
coverage
|
|
263
272
|
);
|
|
264
273
|
return {
|
|
265
274
|
words: phraseWords,
|
|
275
|
+
gapWords,
|
|
276
|
+
gapUsed: totalGapUsed,
|
|
277
|
+
coverage,
|
|
266
278
|
startPosition: phraseWords[0].position,
|
|
267
279
|
endPosition: phraseWords[phraseWords.length - 1].position,
|
|
268
|
-
|
|
280
|
+
span,
|
|
269
281
|
inOrder: isInOrder(phraseWords, queryTokens),
|
|
270
|
-
score
|
|
282
|
+
score,
|
|
283
|
+
scoreBreakdown: breakdown
|
|
271
284
|
};
|
|
272
285
|
}
|
|
273
286
|
return null;
|
|
274
287
|
}
|
|
275
|
-
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments) {
|
|
288
|
+
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches, coverage) {
|
|
276
289
|
let baseScore = 0;
|
|
277
290
|
for (const word of phraseWords) {
|
|
278
291
|
const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
|
|
@@ -281,18 +294,53 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
|
|
|
281
294
|
baseScore /= phraseWords.length;
|
|
282
295
|
const inOrder = isInOrder(phraseWords, queryTokens);
|
|
283
296
|
const orderScore = inOrder ? 1 : 0.5;
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
297
|
+
let proximityScore = 0;
|
|
298
|
+
if (config.maxGap > 0 && config.weights.proximity > 0 && queryTokens.length > 1) {
|
|
299
|
+
const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
|
|
300
|
+
const proximityWindow = queryTokens.length * config.proximitySpanMultiplier;
|
|
301
|
+
proximityScore = Math.max(0, 1 - span / proximityWindow);
|
|
302
|
+
}
|
|
303
|
+
let densityScore = 0;
|
|
304
|
+
if (queryTokens.length === 1) {
|
|
305
|
+
const totalOccurrences = allWordMatches.length;
|
|
306
|
+
densityScore = Math.min(1, totalOccurrences / 10);
|
|
307
|
+
}
|
|
287
308
|
const semanticScore = calculateSemanticScore(
|
|
288
309
|
phraseWords,
|
|
289
310
|
documentFrequency,
|
|
290
311
|
totalDocuments
|
|
291
312
|
);
|
|
292
313
|
const weights = config.weights;
|
|
293
|
-
const
|
|
294
|
-
const
|
|
295
|
-
|
|
314
|
+
const weightedBase = baseScore;
|
|
315
|
+
const weightedOrder = orderScore * weights.order;
|
|
316
|
+
const weightedProximity = proximityScore * weights.proximity;
|
|
317
|
+
const weightedDensity = densityScore * weights.density;
|
|
318
|
+
const weightedSemantic = semanticScore * weights.semantic;
|
|
319
|
+
const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
|
|
320
|
+
const canHaveFuzzyMatches = config.tolerance > 0 && weights.fuzzy > 0;
|
|
321
|
+
const maxBaseWeight = canHaveFuzzyMatches ? Math.max(weights.exact, weights.fuzzy) : weights.exact;
|
|
322
|
+
const effectiveProximityWeight = config.maxGap > 0 && weights.proximity > 0 && queryTokens.length > 1 ? weights.proximity : 0;
|
|
323
|
+
const maxPossibleScore = maxBaseWeight + weights.order + effectiveProximityWeight + weights.density + weights.semantic;
|
|
324
|
+
const normalizedScore = totalScore / maxPossibleScore;
|
|
325
|
+
const coverageMultiplier = queryTokens.length > 1 ? coverage : 1;
|
|
326
|
+
const score = normalizedScore * coverageMultiplier;
|
|
327
|
+
const base = weightedBase / maxPossibleScore;
|
|
328
|
+
const order = weightedOrder / maxPossibleScore;
|
|
329
|
+
const proximity = weightedProximity / maxPossibleScore;
|
|
330
|
+
const density = weightedDensity / maxPossibleScore;
|
|
331
|
+
const semantic = weightedSemantic / maxPossibleScore;
|
|
332
|
+
return {
|
|
333
|
+
score,
|
|
334
|
+
breakdown: {
|
|
335
|
+
base,
|
|
336
|
+
order,
|
|
337
|
+
proximity,
|
|
338
|
+
density,
|
|
339
|
+
semantic,
|
|
340
|
+
coverage: coverageMultiplier
|
|
341
|
+
// Show coverage multiplier in breakdown
|
|
342
|
+
}
|
|
343
|
+
};
|
|
296
344
|
}
|
|
297
345
|
function isInOrder(phraseWords, queryTokens) {
|
|
298
346
|
const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
|
|
@@ -306,6 +354,9 @@ function isInOrder(phraseWords, queryTokens) {
|
|
|
306
354
|
return true;
|
|
307
355
|
}
|
|
308
356
|
function calculateSemanticScore(phraseWords, documentFrequency, totalDocuments) {
|
|
357
|
+
if (totalDocuments === 0) {
|
|
358
|
+
return 0;
|
|
359
|
+
}
|
|
309
360
|
let tfidfSum = 0;
|
|
310
361
|
for (const word of phraseWords) {
|
|
311
362
|
const df = documentFrequency.get(word.word) || 1;
|
|
@@ -341,7 +392,8 @@ function deduplicatePhrases(phrases) {
|
|
|
341
392
|
|
|
342
393
|
// src/index.ts
|
|
343
394
|
var DEFAULT_CONFIG = {
|
|
344
|
-
textProperty: "
|
|
395
|
+
textProperty: "normalized_content",
|
|
396
|
+
// Must match server's field name
|
|
345
397
|
tolerance: 1,
|
|
346
398
|
adaptiveTolerance: true,
|
|
347
399
|
enableSynonyms: false,
|
|
@@ -356,7 +408,10 @@ var DEFAULT_CONFIG = {
|
|
|
356
408
|
semantic: 0.15
|
|
357
409
|
},
|
|
358
410
|
maxGap: 5,
|
|
359
|
-
minScore: 0.1
|
|
411
|
+
minScore: 0.1,
|
|
412
|
+
enableFinalScoreMinimum: false,
|
|
413
|
+
finalScoreMinimum: 0.3,
|
|
414
|
+
proximitySpanMultiplier: 5
|
|
360
415
|
};
|
|
361
416
|
var pluginStates = /* @__PURE__ */ new WeakMap();
|
|
362
417
|
function pluginFuzzyPhrase(userConfig = {}) {
|
|
@@ -376,7 +431,10 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
376
431
|
semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
|
|
377
432
|
},
|
|
378
433
|
maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
|
|
379
|
-
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
|
|
434
|
+
minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore,
|
|
435
|
+
enableFinalScoreMinimum: userConfig.enableFinalScoreMinimum ?? DEFAULT_CONFIG.enableFinalScoreMinimum,
|
|
436
|
+
finalScoreMinimum: userConfig.finalScoreMinimum ?? DEFAULT_CONFIG.finalScoreMinimum,
|
|
437
|
+
proximitySpanMultiplier: userConfig.proximitySpanMultiplier ?? DEFAULT_CONFIG.proximitySpanMultiplier
|
|
380
438
|
};
|
|
381
439
|
const plugin = {
|
|
382
440
|
name: "fuzzy-phrase",
|
|
@@ -400,14 +458,22 @@ function pluginFuzzyPhrase(userConfig = {}) {
|
|
|
400
458
|
console.error("\u26A0\uFE0F Failed to load synonyms:", error);
|
|
401
459
|
}
|
|
402
460
|
}
|
|
403
|
-
|
|
404
|
-
|
|
461
|
+
const docs = orama.data?.docs?.docs;
|
|
462
|
+
if (docs) {
|
|
405
463
|
state.totalDocuments = Object.keys(docs).length;
|
|
406
464
|
state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
|
|
407
465
|
console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
|
|
408
466
|
}
|
|
409
467
|
pluginStates.set(orama, state);
|
|
410
468
|
console.log("\u2705 Fuzzy Phrase Plugin initialized");
|
|
469
|
+
setImmediate(() => {
|
|
470
|
+
if (typeof globalThis.fuzzyPhrasePluginReady === "function") {
|
|
471
|
+
console.log("\u{1F4E1} Signaling plugin ready...");
|
|
472
|
+
globalThis.fuzzyPhrasePluginReady();
|
|
473
|
+
} else {
|
|
474
|
+
console.warn("\u26A0\uFE0F fuzzyPhrasePluginReady callback not found");
|
|
475
|
+
}
|
|
476
|
+
});
|
|
411
477
|
}
|
|
412
478
|
};
|
|
413
479
|
return plugin;
|
|
@@ -464,13 +530,34 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
464
530
|
state.config.enableSynonyms ? state.synonymMap : void 0,
|
|
465
531
|
state.config.synonymMatchScore
|
|
466
532
|
);
|
|
467
|
-
const filteredCandidates = filterCandidatesByScore(
|
|
468
|
-
candidatesMap,
|
|
469
|
-
state.config.minScore
|
|
470
|
-
);
|
|
533
|
+
const filteredCandidates = tolerance === 0 ? candidatesMap : filterCandidatesByScore(candidatesMap, state.config.minScore);
|
|
471
534
|
console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
|
|
472
535
|
const documentMatches = [];
|
|
473
|
-
|
|
536
|
+
console.log("\u{1F50D} DEBUG orama.data structure:", {
|
|
537
|
+
dataKeys: Object.keys(orama.data || {}),
|
|
538
|
+
hasDocs: !!orama.data?.docs,
|
|
539
|
+
docsType: orama.data?.docs ? typeof orama.data.docs : "undefined"
|
|
540
|
+
});
|
|
541
|
+
let docs = {};
|
|
542
|
+
if (orama.data?.docs?.docs) {
|
|
543
|
+
docs = orama.data.docs.docs;
|
|
544
|
+
console.log("\u2705 Found docs at orama.data.docs.docs");
|
|
545
|
+
} else if (orama.data?.docs && typeof orama.data.docs === "object") {
|
|
546
|
+
const firstKey = Object.keys(orama.data.docs)[0];
|
|
547
|
+
if (firstKey && firstKey !== "sharedInternalDocumentStore" && firstKey !== "count") {
|
|
548
|
+
docs = orama.data.docs;
|
|
549
|
+
console.log("\u2705 Found docs at orama.data.docs (direct)");
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
if (Object.keys(docs).length === 0) {
|
|
553
|
+
console.log("\u274C Could not find documents - available structure:", {
|
|
554
|
+
hasDataDocs: !!orama.data?.docs,
|
|
555
|
+
dataDocsKeys: orama.data?.docs ? Object.keys(orama.data.docs) : "none",
|
|
556
|
+
hasDataDocsDocs: !!orama.data?.docs?.docs,
|
|
557
|
+
dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
|
|
558
|
+
});
|
|
559
|
+
}
|
|
560
|
+
console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
|
|
474
561
|
for (const [docId, doc] of Object.entries(docs)) {
|
|
475
562
|
const text = doc[textProperty];
|
|
476
563
|
if (!text || typeof text !== "string") {
|
|
@@ -482,7 +569,9 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
482
569
|
filteredCandidates,
|
|
483
570
|
{
|
|
484
571
|
weights: state.config.weights,
|
|
485
|
-
maxGap: state.config.maxGap
|
|
572
|
+
maxGap: state.config.maxGap,
|
|
573
|
+
proximitySpanMultiplier: state.config.proximitySpanMultiplier,
|
|
574
|
+
tolerance
|
|
486
575
|
},
|
|
487
576
|
state.documentFrequency,
|
|
488
577
|
state.totalDocuments
|
|
@@ -498,7 +587,16 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
498
587
|
}
|
|
499
588
|
}
|
|
500
589
|
documentMatches.sort((a, b) => b.score - a.score);
|
|
501
|
-
|
|
590
|
+
let filteredMatches = documentMatches;
|
|
591
|
+
if (state.config.enableFinalScoreMinimum && state.config.finalScoreMinimum > 0) {
|
|
592
|
+
const threshold = state.config.finalScoreMinimum;
|
|
593
|
+
const beforeCount = filteredMatches.length;
|
|
594
|
+
filteredMatches = filteredMatches.filter((m) => m.score >= threshold);
|
|
595
|
+
console.log(`\u{1F39A}\uFE0F Final score filter: ${beforeCount} \u2192 ${filteredMatches.length} (threshold: ${threshold})`);
|
|
596
|
+
}
|
|
597
|
+
const limit = params.limit ?? filteredMatches.length;
|
|
598
|
+
const limitedMatches = filteredMatches.slice(0, limit);
|
|
599
|
+
const hits = limitedMatches.map((match) => ({
|
|
502
600
|
id: match.id,
|
|
503
601
|
score: match.score,
|
|
504
602
|
document: match.document,
|
|
@@ -506,7 +604,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
506
604
|
_phrases: match.phrases
|
|
507
605
|
}));
|
|
508
606
|
const elapsed = performance.now() - startTime;
|
|
509
|
-
console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms`);
|
|
607
|
+
console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms (limit: ${limit})`);
|
|
510
608
|
return {
|
|
511
609
|
elapsed: {
|
|
512
610
|
formatted: `${elapsed.toFixed(2)}ms`,
|
|
@@ -519,15 +617,25 @@ async function searchWithFuzzyPhrase(orama, params, language) {
|
|
|
519
617
|
}
|
|
520
618
|
async function loadSynonymsFromSupabase(supabaseConfig) {
|
|
521
619
|
try {
|
|
620
|
+
console.log("\u{1F50D} DEBUG: Calling Supabase RPC get_synonym_map...");
|
|
522
621
|
const { createClient } = await import('@supabase/supabase-js');
|
|
523
622
|
const supabase = createClient(supabaseConfig.url, supabaseConfig.serviceKey);
|
|
524
623
|
const { data, error } = await supabase.rpc("get_synonym_map");
|
|
624
|
+
console.log("\u{1F50D} DEBUG: Supabase RPC response:", {
|
|
625
|
+
hasError: !!error,
|
|
626
|
+
errorMessage: error?.message,
|
|
627
|
+
hasData: !!data,
|
|
628
|
+
dataType: typeof data,
|
|
629
|
+
dataKeys: data ? Object.keys(data).length : 0
|
|
630
|
+
});
|
|
525
631
|
if (error) {
|
|
526
632
|
throw new Error(`Supabase error: ${error.message}`);
|
|
527
633
|
}
|
|
528
|
-
|
|
634
|
+
const synonymMap = data || {};
|
|
635
|
+
console.log(`\u{1F4DA} Loaded ${Object.keys(synonymMap).length} synonym entries from Supabase`);
|
|
636
|
+
return synonymMap;
|
|
529
637
|
} catch (error) {
|
|
530
|
-
console.error("Failed to load synonyms from Supabase:", error);
|
|
638
|
+
console.error("\u274C Failed to load synonyms from Supabase:", error);
|
|
531
639
|
throw error;
|
|
532
640
|
}
|
|
533
641
|
}
|
|
@@ -545,8 +653,11 @@ function calculateDocumentFrequencies(docs, textProperty) {
|
|
|
545
653
|
}
|
|
546
654
|
return df;
|
|
547
655
|
}
|
|
656
|
+
function normalizeText(text) {
|
|
657
|
+
return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
|
|
658
|
+
}
|
|
548
659
|
function tokenize(text) {
|
|
549
|
-
return text
|
|
660
|
+
return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
|
|
550
661
|
}
|
|
551
662
|
|
|
552
663
|
exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
|