@wcs-colab/plugin-fuzzy-phrase 3.1.16-custom.9 → 3.1.16-custom.newbase.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,108 +1,62 @@
1
- import { SearchableValue, OramaPlugin, AnyOrama, Results, TypedDocument } from '@wcs-colab/orama';
1
+ import { OramaPlugin, AnyOrama, Results, TypedDocument } from '@wcs-colab/orama';
2
2
 
3
3
  /**
4
- * TypeScript type definitions for Fuzzy Phrase Plugin
5
- */
6
-
7
- /**
8
- * Configuration for the Fuzzy Phrase Plugin
4
+ * Shared types for the fuzzy phrase plugin.
9
5
  */
10
6
  interface FuzzyPhraseConfig {
11
- /**
12
- * Text property to search in
13
- * @default 'content'
14
- */
7
+ /** Text property to search in (defaults to `content`) */
15
8
  textProperty?: string;
16
- /**
17
- * Base fuzzy matching tolerance (edit distance)
18
- * @default 1
19
- */
9
+ /** Base fuzzy matching tolerance (edit distance) */
20
10
  tolerance?: number;
21
- /**
22
- * Enable adaptive tolerance (scales with query length)
23
- * @default true
24
- */
11
+ /** Enable adaptive tolerance that scales with query length */
25
12
  adaptiveTolerance?: boolean;
26
- /**
27
- * Enable synonym expansion
28
- * @default false
29
- */
13
+ /** Enable synonym expansion using Supabase-backed synonym map */
30
14
  enableSynonyms?: boolean;
31
- /**
32
- * Supabase configuration for loading synonyms
33
- */
15
+ /** Supabase configuration for loading synonyms */
34
16
  supabase?: {
35
17
  url: string;
36
18
  serviceKey: string;
37
19
  };
38
- /**
39
- * Scoring weight for synonym matches (0-1)
40
- * @default 0.8
41
- */
20
+ /** Scoring weight for synonym matches (0-1, default ~0.8) */
42
21
  synonymMatchScore?: number;
43
- /**
44
- * Scoring weights for different components
45
- */
22
+ /** Scoring weights for different components */
46
23
  weights?: {
47
- /** Weight for exact matches */
48
24
  exact?: number;
49
- /** Weight for fuzzy matches */
50
25
  fuzzy?: number;
51
- /** Weight for phrase order */
52
26
  order?: number;
53
- /** Weight for proximity bonus */
54
27
  proximity?: number;
55
- /** Weight for density bonus */
56
28
  density?: number;
57
- /** Weight for TF-IDF semantic score */
58
29
  semantic?: number;
59
30
  };
60
- /**
61
- * Maximum gap between words in a phrase
62
- * @default 5
63
- */
31
+ /** Maximum gap between words in a phrase */
64
32
  maxGap?: number;
65
- /**
66
- * Minimum phrase score to include in results
67
- * @default 0.1
68
- */
33
+ /** Minimum phrase score to include in results */
69
34
  minScore?: number;
70
35
  }
71
- /**
72
- * Match information for a single word
73
- */
36
+ type SynonymMap = Record<string, string[]>;
37
+ interface Candidate {
38
+ word: string;
39
+ type: 'exact' | 'fuzzy' | 'synonym';
40
+ queryToken: string;
41
+ distance: number;
42
+ score: number;
43
+ }
74
44
  interface WordMatch {
75
- /** The matched word from the document */
76
45
  word: string;
77
- /** The query token that matched */
78
46
  queryToken: string;
79
- /** Position of the word in the document */
80
47
  position: number;
81
- /** Type of match */
82
48
  type: 'exact' | 'fuzzy' | 'synonym';
83
- /** Edit distance for fuzzy matches */
84
- distance?: number;
85
- /** Match score (0-1) */
49
+ distance: number;
86
50
  score: number;
87
51
  }
88
- /**
89
- * Phrase match information
90
- */
91
52
  interface PhraseMatch {
92
- /** All word matches in this phrase */
93
53
  words: WordMatch[];
94
- /** Start position in document */
95
54
  startPosition: number;
96
- /** End position in document */
97
55
  endPosition: number;
98
- /** Gap between words */
99
56
  gap: number;
100
- /** Whether words are in correct order */
101
57
  inOrder: boolean;
102
- /** Overall phrase score */
103
58
  score: number;
104
- /** Score breakdown by component */
105
- scoreBreakdown?: {
59
+ scoreBreakdown: {
106
60
  base: number;
107
61
  order: number;
108
62
  proximity: number;
@@ -110,34 +64,11 @@ interface PhraseMatch {
110
64
  semantic: number;
111
65
  };
112
66
  }
113
- /**
114
- * Document match with all phrase matches
115
- */
116
67
  interface DocumentMatch {
117
- /** Document ID */
118
68
  id: string;
119
- /** All phrase matches found in this document */
120
69
  phrases: PhraseMatch[];
121
- /** Overall document score */
122
- score: number;
123
- /** Document data */
124
- document: Record<string, SearchableValue>;
125
- }
126
- /**
127
- * Synonym map structure
128
- */
129
- interface SynonymMap {
130
- [word: string]: string[];
131
- }
132
- /**
133
- * Candidate word for matching
134
- */
135
- interface Candidate {
136
- word: string;
137
- type: 'exact' | 'fuzzy' | 'synonym';
138
- queryToken: string;
139
- distance?: number;
140
70
  score: number;
71
+ document: any;
141
72
  }
142
73
 
143
74
  /**
package/dist/index.d.ts CHANGED
@@ -1,108 +1,62 @@
1
- import { SearchableValue, OramaPlugin, AnyOrama, Results, TypedDocument } from '@wcs-colab/orama';
1
+ import { OramaPlugin, AnyOrama, Results, TypedDocument } from '@wcs-colab/orama';
2
2
 
3
3
  /**
4
- * TypeScript type definitions for Fuzzy Phrase Plugin
5
- */
6
-
7
- /**
8
- * Configuration for the Fuzzy Phrase Plugin
4
+ * Shared types for the fuzzy phrase plugin.
9
5
  */
10
6
  interface FuzzyPhraseConfig {
11
- /**
12
- * Text property to search in
13
- * @default 'content'
14
- */
7
+ /** Text property to search in (defaults to `content`) */
15
8
  textProperty?: string;
16
- /**
17
- * Base fuzzy matching tolerance (edit distance)
18
- * @default 1
19
- */
9
+ /** Base fuzzy matching tolerance (edit distance) */
20
10
  tolerance?: number;
21
- /**
22
- * Enable adaptive tolerance (scales with query length)
23
- * @default true
24
- */
11
+ /** Enable adaptive tolerance that scales with query length */
25
12
  adaptiveTolerance?: boolean;
26
- /**
27
- * Enable synonym expansion
28
- * @default false
29
- */
13
+ /** Enable synonym expansion using Supabase-backed synonym map */
30
14
  enableSynonyms?: boolean;
31
- /**
32
- * Supabase configuration for loading synonyms
33
- */
15
+ /** Supabase configuration for loading synonyms */
34
16
  supabase?: {
35
17
  url: string;
36
18
  serviceKey: string;
37
19
  };
38
- /**
39
- * Scoring weight for synonym matches (0-1)
40
- * @default 0.8
41
- */
20
+ /** Scoring weight for synonym matches (0-1, default ~0.8) */
42
21
  synonymMatchScore?: number;
43
- /**
44
- * Scoring weights for different components
45
- */
22
+ /** Scoring weights for different components */
46
23
  weights?: {
47
- /** Weight for exact matches */
48
24
  exact?: number;
49
- /** Weight for fuzzy matches */
50
25
  fuzzy?: number;
51
- /** Weight for phrase order */
52
26
  order?: number;
53
- /** Weight for proximity bonus */
54
27
  proximity?: number;
55
- /** Weight for density bonus */
56
28
  density?: number;
57
- /** Weight for TF-IDF semantic score */
58
29
  semantic?: number;
59
30
  };
60
- /**
61
- * Maximum gap between words in a phrase
62
- * @default 5
63
- */
31
+ /** Maximum gap between words in a phrase */
64
32
  maxGap?: number;
65
- /**
66
- * Minimum phrase score to include in results
67
- * @default 0.1
68
- */
33
+ /** Minimum phrase score to include in results */
69
34
  minScore?: number;
70
35
  }
71
- /**
72
- * Match information for a single word
73
- */
36
+ type SynonymMap = Record<string, string[]>;
37
+ interface Candidate {
38
+ word: string;
39
+ type: 'exact' | 'fuzzy' | 'synonym';
40
+ queryToken: string;
41
+ distance: number;
42
+ score: number;
43
+ }
74
44
  interface WordMatch {
75
- /** The matched word from the document */
76
45
  word: string;
77
- /** The query token that matched */
78
46
  queryToken: string;
79
- /** Position of the word in the document */
80
47
  position: number;
81
- /** Type of match */
82
48
  type: 'exact' | 'fuzzy' | 'synonym';
83
- /** Edit distance for fuzzy matches */
84
- distance?: number;
85
- /** Match score (0-1) */
49
+ distance: number;
86
50
  score: number;
87
51
  }
88
- /**
89
- * Phrase match information
90
- */
91
52
  interface PhraseMatch {
92
- /** All word matches in this phrase */
93
53
  words: WordMatch[];
94
- /** Start position in document */
95
54
  startPosition: number;
96
- /** End position in document */
97
55
  endPosition: number;
98
- /** Gap between words */
99
56
  gap: number;
100
- /** Whether words are in correct order */
101
57
  inOrder: boolean;
102
- /** Overall phrase score */
103
58
  score: number;
104
- /** Score breakdown by component */
105
- scoreBreakdown?: {
59
+ scoreBreakdown: {
106
60
  base: number;
107
61
  order: number;
108
62
  proximity: number;
@@ -110,34 +64,11 @@ interface PhraseMatch {
110
64
  semantic: number;
111
65
  };
112
66
  }
113
- /**
114
- * Document match with all phrase matches
115
- */
116
67
  interface DocumentMatch {
117
- /** Document ID */
118
68
  id: string;
119
- /** All phrase matches found in this document */
120
69
  phrases: PhraseMatch[];
121
- /** Overall document score */
122
- score: number;
123
- /** Document data */
124
- document: Record<string, SearchableValue>;
125
- }
126
- /**
127
- * Synonym map structure
128
- */
129
- interface SynonymMap {
130
- [word: string]: string[];
131
- }
132
- /**
133
- * Candidate word for matching
134
- */
135
- interface Candidate {
136
- word: string;
137
- type: 'exact' | 'fuzzy' | 'synonym';
138
- queryToken: string;
139
- distance?: number;
140
70
  score: number;
71
+ document: any;
141
72
  }
142
73
 
143
74
  /**
package/dist/index.js CHANGED
@@ -80,30 +80,13 @@ function calculateAdaptiveTolerance(queryTokens, baseTolerance) {
80
80
  function extractVocabularyFromRadixTree(radixNode) {
81
81
  const vocabulary = /* @__PURE__ */ new Set();
82
82
  let nodesVisited = 0;
83
- let wordsFound = 0;
84
83
  function traverse(node, depth = 0) {
85
84
  if (!node) {
86
- console.log(`\u26A0\uFE0F Null node at depth ${depth}`);
87
85
  return;
88
86
  }
89
87
  nodesVisited++;
90
- if (nodesVisited <= 3) {
91
- const cInfo = node.c ? {
92
- isArray: Array.isArray(node.c),
93
- isMap: node.c instanceof Map,
94
- type: typeof node.c,
95
- constructor: node.c.constructor?.name,
96
- keys: node.c instanceof Map ? Array.from(node.c.keys()).slice(0, 3) : Object.keys(node.c).slice(0, 3),
97
- valuesCount: node.c instanceof Map ? node.c.size : Array.isArray(node.c) ? node.c.length : Object.keys(node.c).length
98
- } : "null";
99
- console.log(`\u{1F50D} Node ${nodesVisited}:`, { w: node.w, e: node.e, has_c: !!node.c, c_info: cInfo });
100
- }
101
88
  if (node.e && node.w && typeof node.w === "string" && node.w.length > 0) {
102
89
  vocabulary.add(node.w);
103
- wordsFound++;
104
- if (wordsFound <= 5) {
105
- console.log(`\u2705 Found word ${wordsFound}: "${node.w}"`);
106
- }
107
90
  }
108
91
  if (node.c) {
109
92
  if (node.c instanceof Map) {
@@ -225,7 +208,9 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
225
208
  queryTokens,
226
209
  config,
227
210
  documentFrequency,
228
- totalDocuments
211
+ totalDocuments,
212
+ wordMatches
213
+ // Pass all word matches for density calculation
229
214
  );
230
215
  if (phrase && phrase.words.length > 0) {
231
216
  phrases.push(phrase);
@@ -233,7 +218,7 @@ function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFr
233
218
  }
234
219
  return deduplicatePhrases(phrases);
235
220
  }
236
- function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments) {
221
+ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
237
222
  const startMatch = wordMatches[startIndex];
238
223
  const phraseWords = [startMatch];
239
224
  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
@@ -252,12 +237,13 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
252
237
  }
253
238
  }
254
239
  if (phraseWords.length > 0) {
255
- const score = calculatePhraseScore(
240
+ const { score, breakdown } = calculatePhraseScore(
256
241
  phraseWords,
257
242
  queryTokens,
258
243
  config,
259
244
  documentFrequency,
260
- totalDocuments
245
+ totalDocuments,
246
+ allWordMatches
261
247
  );
262
248
  return {
263
249
  words: phraseWords,
@@ -265,12 +251,13 @@ function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, d
265
251
  endPosition: phraseWords[phraseWords.length - 1].position,
266
252
  gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
267
253
  inOrder: isInOrder(phraseWords, queryTokens),
268
- score
254
+ score,
255
+ scoreBreakdown: breakdown
269
256
  };
270
257
  }
271
258
  return null;
272
259
  }
273
- function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments) {
260
+ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments, allWordMatches) {
274
261
  let baseScore = 0;
275
262
  for (const word of phraseWords) {
276
263
  const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
@@ -281,16 +268,42 @@ function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequenc
281
268
  const orderScore = inOrder ? 1 : 0.5;
282
269
  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
283
270
  const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
284
- const densityScore = phraseWords.length / queryTokens.length;
271
+ let densityScore = 0;
272
+ if (queryTokens.length === 1) {
273
+ const totalOccurrences = allWordMatches.length;
274
+ densityScore = totalOccurrences / queryTokens.length;
275
+ } else {
276
+ densityScore = phraseWords.length / queryTokens.length;
277
+ }
285
278
  const semanticScore = calculateSemanticScore(
286
279
  phraseWords,
287
280
  documentFrequency,
288
281
  totalDocuments
289
282
  );
290
283
  const weights = config.weights;
291
- const totalScore = baseScore + orderScore * weights.order + proximityScore * weights.proximity + densityScore * weights.density + semanticScore * weights.semantic;
284
+ const weightedBase = baseScore;
285
+ const weightedOrder = orderScore * weights.order;
286
+ const weightedProximity = proximityScore * weights.proximity;
287
+ const weightedDensity = densityScore * weights.density;
288
+ const weightedSemantic = semanticScore * weights.semantic;
289
+ const totalScore = weightedBase + weightedOrder + weightedProximity + weightedDensity + weightedSemantic;
292
290
  const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
293
- return Math.min(1, totalScore / maxPossibleScore);
291
+ const score = totalScore / maxPossibleScore;
292
+ const base = weightedBase / maxPossibleScore;
293
+ const order = weightedOrder / maxPossibleScore;
294
+ const proximity = weightedProximity / maxPossibleScore;
295
+ const density = weightedDensity / maxPossibleScore;
296
+ const semantic = weightedSemantic / maxPossibleScore;
297
+ return {
298
+ score,
299
+ breakdown: {
300
+ base,
301
+ order,
302
+ proximity,
303
+ density,
304
+ semantic
305
+ }
306
+ };
294
307
  }
295
308
  function isInOrder(phraseWords, queryTokens) {
296
309
  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
@@ -304,6 +317,9 @@ function isInOrder(phraseWords, queryTokens) {
304
317
  return true;
305
318
  }
306
319
  function calculateSemanticScore(phraseWords, documentFrequency, totalDocuments) {
320
+ if (totalDocuments === 0) {
321
+ return 0;
322
+ }
307
323
  let tfidfSum = 0;
308
324
  for (const word of phraseWords) {
309
325
  const df = documentFrequency.get(word.word) || 1;
@@ -398,14 +414,22 @@ function pluginFuzzyPhrase(userConfig = {}) {
398
414
  console.error("\u26A0\uFE0F Failed to load synonyms:", error);
399
415
  }
400
416
  }
401
- if (orama.data && typeof orama.data === "object") {
402
- const docs = orama.data.docs || {};
417
+ const docs = orama.data?.docs?.docs;
418
+ if (docs) {
403
419
  state.totalDocuments = Object.keys(docs).length;
404
420
  state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
405
421
  console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
406
422
  }
407
423
  pluginStates.set(orama, state);
408
424
  console.log("\u2705 Fuzzy Phrase Plugin initialized");
425
+ setImmediate(() => {
426
+ if (typeof globalThis.fuzzyPhrasePluginReady === "function") {
427
+ console.log("\u{1F4E1} Signaling plugin ready...");
428
+ globalThis.fuzzyPhrasePluginReady();
429
+ } else {
430
+ console.warn("\u26A0\uFE0F fuzzyPhrasePluginReady callback not found");
431
+ }
432
+ });
409
433
  }
410
434
  };
411
435
  return plugin;
@@ -468,7 +492,31 @@ async function searchWithFuzzyPhrase(orama, params, language) {
468
492
  );
469
493
  console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
470
494
  const documentMatches = [];
471
- const docs = orama.data?.docs || {};
495
+ console.log("\u{1F50D} DEBUG orama.data structure:", {
496
+ dataKeys: Object.keys(orama.data || {}),
497
+ hasDocs: !!orama.data?.docs,
498
+ docsType: orama.data?.docs ? typeof orama.data.docs : "undefined"
499
+ });
500
+ let docs = {};
501
+ if (orama.data?.docs?.docs) {
502
+ docs = orama.data.docs.docs;
503
+ console.log("\u2705 Found docs at orama.data.docs.docs");
504
+ } else if (orama.data?.docs && typeof orama.data.docs === "object") {
505
+ const firstKey = Object.keys(orama.data.docs)[0];
506
+ if (firstKey && firstKey !== "sharedInternalDocumentStore" && firstKey !== "count") {
507
+ docs = orama.data.docs;
508
+ console.log("\u2705 Found docs at orama.data.docs (direct)");
509
+ }
510
+ }
511
+ if (Object.keys(docs).length === 0) {
512
+ console.log("\u274C Could not find documents - available structure:", {
513
+ hasDataDocs: !!orama.data?.docs,
514
+ dataDocsKeys: orama.data?.docs ? Object.keys(orama.data.docs) : "none",
515
+ hasDataDocsDocs: !!orama.data?.docs?.docs,
516
+ dataDocsDocsCount: orama.data?.docs?.docs ? Object.keys(orama.data.docs.docs).length : 0
517
+ });
518
+ }
519
+ console.log(`\u{1F4C4} Searching through ${Object.keys(docs).length} documents`);
472
520
  for (const [docId, doc] of Object.entries(docs)) {
473
521
  const text = doc[textProperty];
474
522
  if (!text || typeof text !== "string") {
@@ -496,7 +544,9 @@ async function searchWithFuzzyPhrase(orama, params, language) {
496
544
  }
497
545
  }
498
546
  documentMatches.sort((a, b) => b.score - a.score);
499
- const hits = documentMatches.map((match) => ({
547
+ const limit = params.limit ?? documentMatches.length;
548
+ const limitedMatches = documentMatches.slice(0, limit);
549
+ const hits = limitedMatches.map((match) => ({
500
550
  id: match.id,
501
551
  score: match.score,
502
552
  document: match.document,
@@ -504,7 +554,7 @@ async function searchWithFuzzyPhrase(orama, params, language) {
504
554
  _phrases: match.phrases
505
555
  }));
506
556
  const elapsed = performance.now() - startTime;
507
- console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms`);
557
+ console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms (limit: ${limit})`);
508
558
  return {
509
559
  elapsed: {
510
560
  formatted: `${elapsed.toFixed(2)}ms`,
@@ -517,15 +567,25 @@ async function searchWithFuzzyPhrase(orama, params, language) {
517
567
  }
518
568
  async function loadSynonymsFromSupabase(supabaseConfig) {
519
569
  try {
570
+ console.log("\u{1F50D} DEBUG: Calling Supabase RPC get_synonym_map...");
520
571
  const { createClient } = await import('@supabase/supabase-js');
521
572
  const supabase = createClient(supabaseConfig.url, supabaseConfig.serviceKey);
522
573
  const { data, error } = await supabase.rpc("get_synonym_map");
574
+ console.log("\u{1F50D} DEBUG: Supabase RPC response:", {
575
+ hasError: !!error,
576
+ errorMessage: error?.message,
577
+ hasData: !!data,
578
+ dataType: typeof data,
579
+ dataKeys: data ? Object.keys(data).length : 0
580
+ });
523
581
  if (error) {
524
582
  throw new Error(`Supabase error: ${error.message}`);
525
583
  }
526
- return data || {};
584
+ const synonymMap = data || {};
585
+ console.log(`\u{1F4DA} Loaded ${Object.keys(synonymMap).length} synonym entries from Supabase`);
586
+ return synonymMap;
527
587
  } catch (error) {
528
- console.error("Failed to load synonyms from Supabase:", error);
588
+ console.error("\u274C Failed to load synonyms from Supabase:", error);
529
589
  throw error;
530
590
  }
531
591
  }
@@ -543,8 +603,11 @@ function calculateDocumentFrequencies(docs, textProperty) {
543
603
  }
544
604
  return df;
545
605
  }
606
+ function normalizeText(text) {
607
+ return text.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\b[ldcjmnst][\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4](?=\w)/gi, " ").replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0027\u0060\u00B4]/g, "").replace(/[\u201c\u201d]/g, '"').replace(/[.,;:!?()[\]{}\-—–«»""]/g, " ").replace(/\s+/g, " ").trim();
608
+ }
546
609
  function tokenize(text) {
547
- return text.toLowerCase().split(/\s+/).filter((token) => token.length > 0);
610
+ return normalizeText(text).split(/\s+/).filter((token) => token.length > 0);
548
611
  }
549
612
 
550
613
  export { pluginFuzzyPhrase, searchWithFuzzyPhrase };