@nlptools/distance 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -31
- package/dist/index.d.mts +707 -4
- package/dist/index.mjs +959 -54
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -65,6 +65,260 @@ declare function lcsLength(a: string, b: string, algorithm?: "myers" | "dp"): nu
|
|
|
65
65
|
*/
|
|
66
66
|
declare function lcsPairs(a: string, b: string, algorithm?: "myers" | "dp"): Array<[number, number]>;
|
|
67
67
|
//#endregion
|
|
68
|
+
//#region src/edit/jaro.d.ts
|
|
69
|
+
/**
|
|
70
|
+
* Jaro and Jaro-Winkler similarity algorithms.
|
|
71
|
+
*
|
|
72
|
+
* Jaro measures similarity between two strings by considering matching characters
|
|
73
|
+
* and transpositions. Jaro-Winkler extends Jaro with a prefix bonus.
|
|
74
|
+
*
|
|
75
|
+
* Time: O(m * n)
|
|
76
|
+
*/
|
|
77
|
+
/**
|
|
78
|
+
* Compute Jaro similarity between two strings.
|
|
79
|
+
*
|
|
80
|
+
* J(S1, S2) = (1/3) * (m/|S1| + m/|S2| + (m - t/2) / m)
|
|
81
|
+
*
|
|
82
|
+
* where m = number of matching characters (within window),
|
|
83
|
+
* t = number of matching characters that are out of order (so t/2 is the number of transpositions).
|
|
84
|
+
*
|
|
85
|
+
* @param a - First string
|
|
86
|
+
* @param b - Second string
|
|
87
|
+
* @returns Jaro similarity in [0, 1]
|
|
88
|
+
*/
|
|
89
|
+
declare function jaro(a: string, b: string): number;
|
|
90
|
+
/**
|
|
91
|
+
* Options for Jaro-Winkler similarity.
|
|
92
|
+
*/
|
|
93
|
+
interface IJaroWinklerOptions {
|
|
94
|
+
/**
|
|
95
|
+
* Weight applied to the common prefix bonus.
|
|
96
|
+
* @default 0.1
|
|
97
|
+
*/
|
|
98
|
+
prefixWeight?: number;
|
|
99
|
+
/**
|
|
100
|
+
* Maximum length of common prefix to consider.
|
|
101
|
+
* @default 4
|
|
102
|
+
*/
|
|
103
|
+
maxPrefix?: number;
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Compute Jaro-Winkler similarity between two strings.
|
|
107
|
+
*
|
|
108
|
+
* JW(S1, S2) = Jaro(S1, S2) + l * p * (1 - Jaro(S1, S2))
|
|
109
|
+
*
|
|
110
|
+
* where l = length of common prefix (up to maxPrefix),
|
|
111
|
+
* p = prefixWeight.
|
|
112
|
+
*
|
|
113
|
+
* @param a - First string
|
|
114
|
+
* @param b - Second string
|
|
115
|
+
* @param options - Configuration
|
|
116
|
+
* @returns Jaro-Winkler similarity in [0, 1]
|
|
117
|
+
*/
|
|
118
|
+
declare function jaroWinkler(a: string, b: string, options?: IJaroWinklerOptions): number;
|
|
119
|
+
//#endregion
|
|
120
|
+
//#region src/edit/damerau.d.ts
|
|
121
|
+
/**
|
|
122
|
+
* Damerau-Levenshtein distance (unrestricted variant).
|
|
123
|
+
*
|
|
124
|
+
* Extension of Levenshtein that allows transpositions of adjacent characters,
|
|
125
|
+
* even when substrings are edited multiple times.
|
|
126
|
+
*
|
|
127
|
+
* Matches the default behavior of textdistance.rs (restricted = false).
|
|
128
|
+
*
|
|
129
|
+
* Time: O(m * n), Space: O(m * n)
|
|
130
|
+
*/
|
|
131
|
+
/**
|
|
132
|
+
* Compute the Damerau-Levenshtein distance between two strings.
|
|
133
|
+
*
|
|
134
|
+
* Allows insertions, deletions, substitutions, and transpositions of
|
|
135
|
+
* adjacent characters. This is the unrestricted variant, which correctly
|
|
136
|
+
* handles cases where a substring is edited more than once.
|
|
137
|
+
*
|
|
138
|
+
* @param a - First string
|
|
139
|
+
* @param b - Second string
|
|
140
|
+
* @returns Edit distance (non-negative integer)
|
|
141
|
+
*/
|
|
142
|
+
declare function damerauLevenshtein(a: string, b: string): number;
|
|
143
|
+
/**
|
|
144
|
+
* Compute the normalized Damerau-Levenshtein similarity in [0, 1].
|
|
145
|
+
*
|
|
146
|
+
* @param a - First string
|
|
147
|
+
* @param b - Second string
|
|
148
|
+
* @returns Similarity score where 1 means identical
|
|
149
|
+
*/
|
|
150
|
+
declare function damerauLevenshteinNormalized(a: string, b: string): number;
|
|
151
|
+
//#endregion
|
|
152
|
+
//#region src/edit/hamming.d.ts
|
|
153
|
+
/**
|
|
154
|
+
* Hamming distance — counts position-wise character mismatches; for strings of unequal length, the length difference is counted as additional mismatches.
|
|
155
|
+
*
|
|
156
|
+
* Time: O(min(m, n))
|
|
157
|
+
*/
|
|
158
|
+
/**
|
|
159
|
+
* Compute the Hamming distance between two strings.
|
|
160
|
+
*
|
|
161
|
+
* If strings have different lengths, only compares up to the shorter length
|
|
162
|
+
* and adds the length difference as additional mismatches.
|
|
163
|
+
*
|
|
164
|
+
* @param a - First string
|
|
165
|
+
* @param b - Second string
|
|
166
|
+
* @returns Number of mismatching characters
|
|
167
|
+
*/
|
|
168
|
+
declare function hamming(a: string, b: string): number;
|
|
169
|
+
/**
|
|
170
|
+
* Compute the normalized Hamming similarity in [0, 1].
|
|
171
|
+
*
|
|
172
|
+
* @param a - First string
|
|
173
|
+
* @param b - Second string
|
|
174
|
+
* @returns Similarity score where 1 means identical
|
|
175
|
+
*/
|
|
176
|
+
declare function hammingNormalized(a: string, b: string): number;
|
|
177
|
+
//#endregion
|
|
178
|
+
//#region src/edit/lcs-str.d.ts
|
|
179
|
+
/**
|
|
180
|
+
* Longest Common Substring (contiguous) algorithms.
|
|
181
|
+
*
|
|
182
|
+
* Unlike LCS (subsequence), this requires the matching characters to be contiguous.
|
|
183
|
+
*
|
|
184
|
+
* Time: O(m * n), Space: O(min(m, n))
|
|
185
|
+
*/
|
|
186
|
+
/**
|
|
187
|
+
* Compute the length of the Longest Common Substring.
|
|
188
|
+
*
|
|
189
|
+
* @param a - First string
|
|
190
|
+
* @param b - Second string
|
|
191
|
+
* @returns Length of the longest common substring
|
|
192
|
+
*/
|
|
193
|
+
declare function lcsSubstringLength(a: string, b: string): number;
|
|
194
|
+
/**
|
|
195
|
+
* Compute the LCS substring distance: len(a) + len(b) - 2 * lcsSubstringLength.
|
|
196
|
+
*
|
|
197
|
+
* @param a - First string
|
|
198
|
+
* @param b - Second string
|
|
199
|
+
* @returns LCS substring distance (non-negative integer)
|
|
200
|
+
*/
|
|
201
|
+
declare function lcsSubstringDistance(a: string, b: string): number;
|
|
202
|
+
/**
|
|
203
|
+
* Compute the normalized LCS substring similarity in [0, 1].
|
|
204
|
+
*
|
|
205
|
+
* Normalized by max(len(a), len(b)) to match textdistance.rs convention.
|
|
206
|
+
*
|
|
207
|
+
* @param a - First string
|
|
208
|
+
* @param b - Second string
|
|
209
|
+
* @returns Similarity score where 1 means identical
|
|
210
|
+
*/
|
|
211
|
+
declare function lcsSubstringNormalized(a: string, b: string): number;
|
|
212
|
+
//#endregion
|
|
213
|
+
//#region src/edit/sift4.d.ts
|
|
214
|
+
/**
|
|
215
|
+
* SIFT4 simple — fast approximate string distance.
|
|
216
|
+
*
|
|
217
|
+
* A fast algorithm for approximate string matching with O(n) complexity
|
|
218
|
+
* in typical cases. Returns a distance value (lower = more similar).
|
|
219
|
+
*
|
|
220
|
+
* Matches the textdistance.rs implementation exactly.
|
|
221
|
+
*
|
|
222
|
+
* Time: O(n * maxOffset)
|
|
223
|
+
*/
|
|
224
|
+
/**
|
|
225
|
+
* Options for SIFT4.
|
|
226
|
+
*/
|
|
227
|
+
interface ISift4Options {
|
|
228
|
+
/**
|
|
229
|
+
* Maximum offset for character matching.
|
|
230
|
+
* @default 5
|
|
231
|
+
*/
|
|
232
|
+
maxOffset?: number;
|
|
233
|
+
}
|
|
234
|
+
/**
|
|
235
|
+
* Compute the SIFT4 simple distance between two strings.
|
|
236
|
+
*
|
|
237
|
+
* @param a - First string
|
|
238
|
+
* @param b - Second string
|
|
239
|
+
* @param options - Configuration
|
|
240
|
+
* @returns Distance (non-negative integer)
|
|
241
|
+
*/
|
|
242
|
+
declare function sift4(a: string, b: string, options?: ISift4Options): number;
|
|
243
|
+
/**
|
|
244
|
+
* Compute the normalized SIFT4 similarity in [0, 1].
|
|
245
|
+
*
|
|
246
|
+
* @param a - First string
|
|
247
|
+
* @param b - Second string
|
|
248
|
+
* @param options - Configuration
|
|
249
|
+
* @returns Similarity score where 1 means identical
|
|
250
|
+
*/
|
|
251
|
+
declare function sift4Normalized(a: string, b: string, options?: ISift4Options): number;
|
|
252
|
+
//#endregion
|
|
253
|
+
//#region src/edit/ratcliff.d.ts
|
|
254
|
+
/**
|
|
255
|
+
* Ratcliff-Obershelp algorithm — Gestalt pattern matching.
|
|
256
|
+
*
|
|
257
|
+
* Iteratively finds the longest common substring using a stack-based approach,
|
|
258
|
+
* combining scores from both sides. Returns a similarity in [0, 1].
|
|
259
|
+
*
|
|
260
|
+
* Based on the textdistance.rs implementation.
|
|
261
|
+
*
|
|
262
|
+
* Time: O(n * m * log(n * m)) worst case, O(n + m) average
|
|
263
|
+
*/
|
|
264
|
+
/**
|
|
265
|
+
* Compute Ratcliff-Obershelp similarity between two strings.
|
|
266
|
+
*
|
|
267
|
+
* Uses an iterative stack-based approach to avoid stack overflow on
|
|
268
|
+
* very different strings. The algorithm recursively finds the longest
|
|
269
|
+
* common substring and combines similarity scores from both sides.
|
|
270
|
+
*
|
|
271
|
+
* similarity = 2 * M / T, where M = total matched characters, T = total characters
|
|
272
|
+
*
|
|
273
|
+
* @param a - First string
|
|
274
|
+
* @param b - Second string
|
|
275
|
+
* @returns Ratcliff-Obershelp similarity in [0, 1]
|
|
276
|
+
*/
|
|
277
|
+
declare function ratcliff(a: string, b: string): number;
|
|
278
|
+
//#endregion
|
|
279
|
+
//#region src/edit/smith-waterman.d.ts
|
|
280
|
+
/**
|
|
281
|
+
* Smith-Waterman local sequence alignment algorithm.
|
|
282
|
+
*
|
|
283
|
+
* Designed for biological sequence alignment, it finds the best
|
|
284
|
+
* local alignment between two sequences.
|
|
285
|
+
*
|
|
286
|
+
* Default scoring: match=1, mismatch=0, gap=-1 (matches textdistance.rs)
|
|
287
|
+
*
|
|
288
|
+
* Time: O(m * n), Space: O(m * n)
|
|
289
|
+
*/
|
|
290
|
+
/**
|
|
291
|
+
* Options for Smith-Waterman alignment.
|
|
292
|
+
*/
|
|
293
|
+
interface ISmithWatermanOptions {
|
|
294
|
+
/** Score for matching characters. @default 1 */
|
|
295
|
+
matchScore?: number;
|
|
296
|
+
/** Score for mismatching characters. @default 0 */
|
|
297
|
+
mismatchScore?: number;
|
|
298
|
+
/** Score penalty for a gap. @default -1 */
|
|
299
|
+
gapScore?: number;
|
|
300
|
+
}
|
|
301
|
+
/**
|
|
302
|
+
* Compute the raw Smith-Waterman alignment score.
|
|
303
|
+
*
|
|
304
|
+
* @param a - First string
|
|
305
|
+
* @param b - Second string
|
|
306
|
+
* @param options - Scoring parameters
|
|
307
|
+
* @returns Raw alignment score (non-negative)
|
|
308
|
+
*/
|
|
309
|
+
declare function smithWaterman(a: string, b: string, options?: ISmithWatermanOptions): number;
|
|
310
|
+
/**
|
|
311
|
+
* Compute the normalized Smith-Waterman similarity in [0, 1].
|
|
312
|
+
*
|
|
313
|
+
* Normalized by matchScore * max(len(a), len(b)), matching textdistance.rs convention.
|
|
314
|
+
*
|
|
315
|
+
* @param a - First string
|
|
316
|
+
* @param b - Second string
|
|
317
|
+
* @param options - Scoring parameters
|
|
318
|
+
* @returns Normalized similarity in [0, 1]
|
|
319
|
+
*/
|
|
320
|
+
declare function smithWatermanNormalized(a: string, b: string, options?: ISmithWatermanOptions): number;
|
|
321
|
+
//#endregion
|
|
68
322
|
//#region src/token/jaccard.d.ts
|
|
69
323
|
/**
|
|
70
324
|
* Jaccard similarity between two strings based on character-level multiset.
|
|
@@ -95,10 +349,10 @@ declare function jaccardNgram(a: string, b: string, n?: number): number;
|
|
|
95
349
|
/**
|
|
96
350
|
* Cosine similarity between two strings based on character-level multiset.
|
|
97
351
|
*
|
|
98
|
-
*
|
|
352
|
+
* Uses textdistance.rs convention:
|
|
353
|
+
* cosine(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
|
|
99
354
|
*
|
|
100
|
-
*
|
|
101
|
-
* matching the textdistance crate behavior.
|
|
355
|
+
* Where intersect_count = sum(min(freqA[c], freqB[c])) and count = sum of frequencies.
|
|
102
356
|
*
|
|
103
357
|
* Time: O(m + n)
|
|
104
358
|
*
|
|
@@ -110,6 +364,9 @@ declare function cosine(a: string, b: string): number;
|
|
|
110
364
|
/**
|
|
111
365
|
* Cosine similarity based on character n-grams.
|
|
112
366
|
*
|
|
367
|
+
* Uses textdistance.rs convention (same as character-level cosine but on n-grams):
|
|
368
|
+
* cosine_ngram(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
|
|
369
|
+
*
|
|
113
370
|
* @param a - First string
|
|
114
371
|
* @param b - Second string
|
|
115
372
|
* @param n - N-gram size (default: 2)
|
|
@@ -143,6 +400,96 @@ declare function sorensen(a: string, b: string): number;
|
|
|
143
400
|
*/
|
|
144
401
|
declare function sorensenNgram(a: string, b: string, n?: number): number;
|
|
145
402
|
//#endregion
|
|
403
|
+
//#region src/token/tversky.d.ts
|
|
404
|
+
/**
|
|
405
|
+
* Tversky index — asymmetric set similarity measure.
|
|
406
|
+
*
|
|
407
|
+
* Reduces to Jaccard when alpha = beta = 1.
|
|
408
|
+
* Reduces to Sorensen-Dice when alpha = beta = 0.5.
|
|
409
|
+
*
|
|
410
|
+
* Time: O(m + n)
|
|
411
|
+
*/
|
|
412
|
+
/**
|
|
413
|
+
* Options for Tversky index.
|
|
414
|
+
*/
|
|
415
|
+
interface ITverskyOptions {
|
|
416
|
+
/**
|
|
417
|
+
* Weight for elements unique to the first set (a).
|
|
418
|
+
* @default 1
|
|
419
|
+
*/
|
|
420
|
+
alpha?: number;
|
|
421
|
+
/**
|
|
422
|
+
* Weight for elements unique to the second set (b).
|
|
423
|
+
* @default 1
|
|
424
|
+
*/
|
|
425
|
+
beta?: number;
|
|
426
|
+
}
|
|
427
|
+
/**
|
|
428
|
+
* Compute the Tversky index between two strings based on character multiset.
|
|
429
|
+
*
|
|
430
|
+
* T(A, B; α, β) = |A ∩ B| / (|A ∩ B| + α|A \ B| + β|B \ A|)
|
|
431
|
+
*
|
|
432
|
+
* @param a - First string
|
|
433
|
+
* @param b - Second string
|
|
434
|
+
* @param options - alpha and beta weights
|
|
435
|
+
* @returns Tversky index in [0, 1]
|
|
436
|
+
*/
|
|
437
|
+
declare function tversky(a: string, b: string, options?: ITverskyOptions): number;
|
|
438
|
+
//#endregion
|
|
439
|
+
//#region src/token/overlap.d.ts
|
|
440
|
+
/**
|
|
441
|
+
* Overlap coefficient — set similarity normalized by the smaller set.
|
|
442
|
+
*
|
|
443
|
+
* overlap(A, B) = |A ∩ B| / min(|A|, |B|)
|
|
444
|
+
*
|
|
445
|
+
* Time: O(m + n)
|
|
446
|
+
*/
|
|
447
|
+
/**
|
|
448
|
+
* Compute the overlap coefficient between two strings based on character multiset.
|
|
449
|
+
*
|
|
450
|
+
* @param a - First string
|
|
451
|
+
* @param b - Second string
|
|
452
|
+
* @returns Overlap coefficient in [0, 1]
|
|
453
|
+
*/
|
|
454
|
+
declare function overlap(a: string, b: string): number;
|
|
455
|
+
//#endregion
|
|
456
|
+
//#region src/token/naive.d.ts
|
|
457
|
+
/**
|
|
458
|
+
* Naive string similarity measures: prefix, suffix, length.
|
|
459
|
+
*
|
|
460
|
+
* Time: O(min(m, n)) for prefix/suffix, O(1) for length
|
|
461
|
+
*/
|
|
462
|
+
/**
|
|
463
|
+
* Compute prefix similarity between two strings.
|
|
464
|
+
*
|
|
465
|
+
* prefix(a, b) = commonPrefixLength / max(|a|, |b|)
|
|
466
|
+
*
|
|
467
|
+
* @param a - First string
|
|
468
|
+
* @param b - Second string
|
|
469
|
+
* @returns Prefix similarity in [0, 1]
|
|
470
|
+
*/
|
|
471
|
+
declare function prefix(a: string, b: string): number;
|
|
472
|
+
/**
|
|
473
|
+
* Compute suffix similarity between two strings.
|
|
474
|
+
*
|
|
475
|
+
* suffix(a, b) = commonSuffixLength / max(|a|, |b|)
|
|
476
|
+
*
|
|
477
|
+
* @param a - First string
|
|
478
|
+
* @param b - Second string
|
|
479
|
+
* @returns Suffix similarity in [0, 1]
|
|
480
|
+
*/
|
|
481
|
+
declare function suffix(a: string, b: string): number;
|
|
482
|
+
/**
|
|
483
|
+
* Compute length-based similarity between two strings.
|
|
484
|
+
*
|
|
485
|
+
* length(a, b) = 1 - |len(a) - len(b)| / max(len(a), len(b))
|
|
486
|
+
*
|
|
487
|
+
* @param a - First string
|
|
488
|
+
* @param b - Second string
|
|
489
|
+
* @returns Normalized length similarity in [0, 1]
|
|
490
|
+
*/
|
|
491
|
+
declare function length(a: string, b: string): number;
|
|
492
|
+
//#endregion
|
|
146
493
|
//#region src/hash/simhash.d.ts
|
|
147
494
|
interface ISimHashOptions {
|
|
148
495
|
/**
|
|
@@ -389,4 +736,360 @@ declare class LSH {
|
|
|
389
736
|
get size(): number;
|
|
390
737
|
}
|
|
391
738
|
//#endregion
|
|
392
|
-
|
|
739
|
+
//#region src/search.d.ts
|
|
740
|
+
/**
|
|
741
|
+
* A function that computes similarity between two strings, returning a value
|
|
742
|
+
* in [0, 1] where 1 means identical.
|
|
743
|
+
*/
|
|
744
|
+
type SimilarityFn = (a: string, b: string) => number;
|
|
745
|
+
/**
|
|
746
|
+
* Built-in similarity algorithms. Each maps to a normalized similarity
|
|
747
|
+
* function from @nlptools/distance.
|
|
748
|
+
*/
|
|
749
|
+
type BuiltinAlgorithm = "levenshtein" | "lcs" | "jaccard" | "jaccardNgram" | "cosine" | "cosineNgram" | "sorensen" | "sorensenNgram";
|
|
750
|
+
/**
|
|
751
|
+
* Configuration for a searchable key on an object item.
|
|
752
|
+
*
|
|
753
|
+
* @example
|
|
754
|
+
* ```ts
|
|
755
|
+
* const keys = [
|
|
756
|
+
* { name: "title", weight: 0.7 },
|
|
757
|
+
* { name: "author", weight: 0.3 },
|
|
758
|
+
* ];
|
|
759
|
+
* ```
|
|
760
|
+
*/
|
|
761
|
+
interface ISearchKey {
|
|
762
|
+
/** Property name to search on. */
|
|
763
|
+
name: string;
|
|
764
|
+
/**
|
|
765
|
+
* Weight of this key in the final score.
|
|
766
|
+
* Weights are normalized to sum to 1.0 internally.
|
|
767
|
+
* @default 1
|
|
768
|
+
*/
|
|
769
|
+
weight?: number;
|
|
770
|
+
/**
|
|
771
|
+
* Optional custom getter function. If provided, used instead of
|
|
772
|
+
* reading `item[name]`. Must return a string.
|
|
773
|
+
*/
|
|
774
|
+
getter?: (item: any) => string;
|
|
775
|
+
}
|
|
776
|
+
/**
|
|
777
|
+
* A single search result, containing the matched item, its score, and
|
|
778
|
+
* its position in the original collection.
|
|
779
|
+
*
|
|
780
|
+
* Results are sorted by score descending (best match first).
|
|
781
|
+
*/
|
|
782
|
+
interface ISearchResult<T> {
|
|
783
|
+
/** The matched item from the collection. */
|
|
784
|
+
item: T;
|
|
785
|
+
/**
|
|
786
|
+
* Similarity score in [0, 1], where 1 means identical.
|
|
787
|
+
* For multi-key search, this is the weighted sum of per-key scores.
|
|
788
|
+
*/
|
|
789
|
+
score: number;
|
|
790
|
+
/** Index of the item in the original collection array. */
|
|
791
|
+
index: number;
|
|
792
|
+
}
|
|
793
|
+
/**
|
|
794
|
+
* Extended search result including per-key match details.
|
|
795
|
+
* Only produced when `includeMatchDetails` is true.
|
|
796
|
+
*/
|
|
797
|
+
interface ISearchResultWithDetails<T> extends ISearchResult<T> {
|
|
798
|
+
/**
|
|
799
|
+
* Per-key similarity scores.
|
|
800
|
+
* Keys are the key names from the ISearchKey configuration.
|
|
801
|
+
* Values are individual similarity scores in [0, 1].
|
|
802
|
+
*/
|
|
803
|
+
matches: Record<string, number>;
|
|
804
|
+
}
|
|
805
|
+
/**
|
|
806
|
+
* Options for the {@link FuzzySearch} constructor.
|
|
807
|
+
*
|
|
808
|
+
* @example
|
|
809
|
+
* ```ts
|
|
810
|
+
* // String array
|
|
811
|
+
* const search = new FuzzySearch(["apple", "banana", "cherry"]);
|
|
812
|
+
*
|
|
813
|
+
* // Object array with weighted keys
|
|
814
|
+
* const search = new FuzzySearch(books, {
|
|
815
|
+
* keys: [
|
|
816
|
+
* { name: "title", weight: 0.7 },
|
|
817
|
+
* { name: "author", weight: 0.3 },
|
|
818
|
+
* ],
|
|
819
|
+
* algorithm: "cosine",
|
|
820
|
+
* threshold: 0.4,
|
|
821
|
+
* });
|
|
822
|
+
* ```
|
|
823
|
+
*/
|
|
824
|
+
interface IFuzzySearchOptions {
|
|
825
|
+
/**
|
|
826
|
+
* Similarity algorithm to use for comparing strings.
|
|
827
|
+
*
|
|
828
|
+
* - A string from {@link BuiltinAlgorithm} selects a built-in function.
|
|
829
|
+
* - A custom {@link SimilarityFn} can be provided for full control.
|
|
830
|
+
*
|
|
831
|
+
* @default "levenshtein"
|
|
832
|
+
*/
|
|
833
|
+
algorithm?: BuiltinAlgorithm | SimilarityFn;
|
|
834
|
+
/**
|
|
835
|
+
* Keys to search on when the collection contains objects.
|
|
836
|
+
* Ignored for string arrays.
|
|
837
|
+
*/
|
|
838
|
+
keys?: ISearchKey[];
|
|
839
|
+
/**
|
|
840
|
+
* Minimum similarity score (0-1) for a result to be included.
|
|
841
|
+
* Results scoring below this threshold are excluded.
|
|
842
|
+
* @default 0
|
|
843
|
+
*/
|
|
844
|
+
threshold?: number;
|
|
845
|
+
/**
|
|
846
|
+
* Maximum number of results to return.
|
|
847
|
+
* @default Infinity
|
|
848
|
+
*/
|
|
849
|
+
limit?: number;
|
|
850
|
+
/**
|
|
851
|
+
* Whether search should be case-sensitive.
|
|
852
|
+
* When false (default), both the query and the item strings are lowercased
|
|
853
|
+
* before comparison (case-insensitive search).
|
|
854
|
+
* @default false
|
|
855
|
+
*/
|
|
856
|
+
caseSensitive?: boolean;
|
|
857
|
+
/**
|
|
858
|
+
* Include per-key match details in results.
|
|
859
|
+
* When true, results include a `matches` field with individual
|
|
860
|
+
* similarity scores per key.
|
|
861
|
+
* @default false
|
|
862
|
+
*/
|
|
863
|
+
includeMatchDetails?: boolean;
|
|
864
|
+
/**
|
|
865
|
+
* Enable LSH-accelerated search for large collections (>1000 items).
|
|
866
|
+
* Uses MinHash + banding as a candidate filter, then re-scores with
|
|
867
|
+
* the exact algorithm. Provides sub-linear query time at the cost of
|
|
868
|
+
* approximate results (some true matches may be missed).
|
|
869
|
+
*/
|
|
870
|
+
lsh?: {
|
|
871
|
+
/** Number of hash functions for MinHash signature size. @default 128 */numHashes?: number;
|
|
872
|
+
/**
|
|
873
|
+
* Number of bands for LSH banding.
|
|
874
|
+
* More bands = higher recall, lower precision.
|
|
875
|
+
* @default 16
|
|
876
|
+
*/
|
|
877
|
+
numBands?: number;
|
|
878
|
+
};
|
|
879
|
+
}
|
|
880
|
+
/**
|
|
881
|
+
* Options for the {@link findBestMatch} function.
|
|
882
|
+
*
|
|
883
|
+
* @example
|
|
884
|
+
* ```ts
|
|
885
|
+
* const result = findBestMatch("kitten", ["sitting", "kit", "mitten"], {
|
|
886
|
+
* algorithm: "levenshtein",
|
|
887
|
+
* threshold: 0.3,
|
|
888
|
+
* });
|
|
889
|
+
* ```
|
|
890
|
+
*/
|
|
891
|
+
interface IFindBestMatchOptions {
|
|
892
|
+
/** Similarity algorithm. @default "levenshtein" */
|
|
893
|
+
algorithm?: BuiltinAlgorithm | SimilarityFn;
|
|
894
|
+
/** Keys for object-array search. */
|
|
895
|
+
keys?: ISearchKey[];
|
|
896
|
+
/** Minimum similarity score. @default 0 */
|
|
897
|
+
threshold?: number;
|
|
898
|
+
/** Whether search is case-sensitive. @default false (case-insensitive) */
|
|
899
|
+
caseSensitive?: boolean;
|
|
900
|
+
}
|
|
901
|
+
/**
|
|
902
|
+
* Fuzzy search engine for finding similar items in a collection.
|
|
903
|
+
*
|
|
904
|
+
* Supports both string arrays and object arrays with weighted multi-key search.
|
|
905
|
+
* Uses any similarity algorithm from @nlptools/distance, with optional LSH
|
|
906
|
+
* acceleration for large datasets.
|
|
907
|
+
*
|
|
908
|
+
* @example
|
|
909
|
+
* ```ts
|
|
910
|
+
* // String array search
|
|
911
|
+
* const search = new FuzzySearch(["apple", "banana", "cherry"]);
|
|
912
|
+
* const results = search.search("aple"); // best match first: [{ item: "apple", score: 0.8, index: 0 }, ...]
|
|
913
|
+
*
|
|
914
|
+
* // Object array with weighted keys
|
|
915
|
+
* const books = [
|
|
916
|
+
* { title: "Old Man's War", author: "John Scalzi" },
|
|
917
|
+
* { title: "The Lock Artist", author: "Steve Hamilton" },
|
|
918
|
+
* ];
|
|
919
|
+
* const bookSearch = new FuzzySearch(books, {
|
|
920
|
+
* keys: [
|
|
921
|
+
* { name: "title", weight: 0.7 },
|
|
922
|
+
* { name: "author", weight: 0.3 },
|
|
923
|
+
* ],
|
|
924
|
+
* algorithm: "cosine",
|
|
925
|
+
* });
|
|
926
|
+
* const results = bookSearch.search("old man"); // finds "Old Man's War"
|
|
927
|
+
* ```
|
|
928
|
+
*/
|
|
929
|
+
declare class FuzzySearch<T> {
|
|
930
|
+
private readonly similarityFn;
|
|
931
|
+
private readonly keys;
|
|
932
|
+
private readonly threshold;
|
|
933
|
+
private readonly limit;
|
|
934
|
+
private readonly caseSensitive;
|
|
935
|
+
private readonly includeMatchDetails;
|
|
936
|
+
private readonly isObjectArray;
|
|
937
|
+
private collection;
|
|
938
|
+
private readonly useLSH;
|
|
939
|
+
private readonly lshNumHashes;
|
|
940
|
+
private readonly lshNumBands;
|
|
941
|
+
private lshIndex;
|
|
942
|
+
private minHashSignatures;
|
|
943
|
+
constructor(collection: ReadonlyArray<T>, options?: IFuzzySearchOptions);
|
|
944
|
+
/**
|
|
945
|
+
* Search the collection for items similar to the query.
|
|
946
|
+
*
|
|
947
|
+
* @param query - The search query string
|
|
948
|
+
* @param limit - Optional per-query limit override
|
|
949
|
+
* @returns Array of results sorted by score descending
|
|
950
|
+
*/
|
|
951
|
+
search(query: string, limit?: number): ISearchResult<T>[];
|
|
952
|
+
/**
|
|
953
|
+
* Add an item to the collection.
|
|
954
|
+
* If LSH is enabled, the index is updated incrementally.
|
|
955
|
+
*/
|
|
956
|
+
add(item: T): void;
|
|
957
|
+
/**
|
|
958
|
+
* Remove an item from the collection by index.
|
|
959
|
+
* If LSH is enabled, the index is rebuilt (O(n)).
|
|
960
|
+
*
|
|
961
|
+
* @returns true if the item was found and removed
|
|
962
|
+
*/
|
|
963
|
+
remove(index: number): boolean;
|
|
964
|
+
/**
|
|
965
|
+
* Replace the entire collection.
|
|
966
|
+
* If LSH is enabled, the index is rebuilt.
|
|
967
|
+
*/
|
|
968
|
+
setCollection(collection: ReadonlyArray<T>): void;
|
|
969
|
+
/**
|
|
970
|
+
* Get the current collection.
|
|
971
|
+
*/
|
|
972
|
+
getCollection(): ReadonlyArray<T>;
|
|
973
|
+
/**
|
|
974
|
+
* Get the number of items in the collection.
|
|
975
|
+
*/
|
|
976
|
+
get size(): number;
|
|
977
|
+
/**
|
|
978
|
+
* Clear the collection and any LSH index.
|
|
979
|
+
*/
|
|
980
|
+
clear(): void;
|
|
981
|
+
private searchLinear;
|
|
982
|
+
private searchWithLSH;
|
|
983
|
+
private buildLSHIndex;
|
|
984
|
+
private buildMinHashSignature;
|
|
985
|
+
private computeItemScore;
|
|
986
|
+
private computeDetailedScore;
|
|
987
|
+
private extractSearchText;
|
|
988
|
+
private extractKeyValue;
|
|
989
|
+
private normalizeString;
|
|
990
|
+
}
|
|
991
|
+
/**
|
|
992
|
+
* Find the single best match for a query against a collection.
|
|
993
|
+
*
|
|
994
|
+
* This is a convenience wrapper around {@link FuzzySearch} for one-shot queries.
|
|
995
|
+
* For repeated searches against the same collection, prefer creating a
|
|
996
|
+
* {@link FuzzySearch} instance directly.
|
|
997
|
+
*
|
|
998
|
+
* Time: O(n * k) where n = collection size, k = number of keys
|
|
999
|
+
*
|
|
1000
|
+
* @param query - The search query string
|
|
1001
|
+
* @param collection - Array of strings or objects to search
|
|
1002
|
+
* @param options - Search configuration
|
|
1003
|
+
* @returns The best matching result, or null if nothing meets the threshold
|
|
1004
|
+
*
|
|
1005
|
+
* @example
|
|
1006
|
+
* ```ts
|
|
1007
|
+
* // String array
|
|
1008
|
+
* const result = findBestMatch("kitten", ["sitting", "kit", "mitten"]);
|
|
1009
|
+
* console.log(result?.item); // "mitten"
|
|
1010
|
+
* console.log(result?.score); // ~0.83 (one edit over six characters)
|
|
1011
|
+
*
|
|
1012
|
+
* // Object array with weighted keys
|
|
1013
|
+
* const books = [
|
|
1014
|
+
* { title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
|
|
1015
|
+
* { title: "Great Expectations", author: "Charles Dickens" },
|
|
1016
|
+
* ];
|
|
1017
|
+
* const result = findBestMatch("grate gatsbi", books, {
|
|
1018
|
+
* keys: [
|
|
1019
|
+
* { name: "title", weight: 0.7 },
|
|
1020
|
+
* { name: "author", weight: 0.3 },
|
|
1021
|
+
* ],
|
|
1022
|
+
* });
|
|
1023
|
+
* ```
|
|
1024
|
+
*/
|
|
1025
|
+
declare function findBestMatch<T>(query: string, collection: ReadonlyArray<T>, options?: IFindBestMatchOptions): ISearchResult<T> | null;
|
|
1026
|
+
//#endregion
|
|
1027
|
+
//#region src/utils.d.ts
|
|
1028
|
+
/**
|
|
1029
|
+
* Generate character n-grams from a string.
|
|
1030
|
+
*
|
|
1031
|
+
* @param str - Input string
|
|
1032
|
+
* @param n - N-gram size (default: 2 for bigrams)
|
|
1033
|
+
*/
|
|
1034
|
+
declare function ngrams(str: string, n?: number): string[];
|
|
1035
|
+
/**
|
|
1036
|
+
* Build an n-gram frequency map using integer-encoded keys.
|
|
1037
|
+
* Encodes n characters into a single number to avoid string allocation
|
|
1038
|
+
* and speed up Map hashing.
|
|
1039
|
+
*
|
|
1040
|
+
* For ASCII bigrams: key = (c1 << 8) | c2 (fits in 16 bits).
|
|
1041
|
+
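* For non-ASCII input or n > 2 no integer-keyed map is produced and callers fall back to string keys (presumably signalled by the `null` return value — confirm against the implementation).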
|
|
1042
|
+
*/
|
|
1043
|
+
declare function ngramFrequencyMap(str: string, n?: number): Map<number, number> | null;
|
|
1044
|
+
/**
|
|
1045
|
+
* Build a frequency map (Counter/multiset) from an iterable of tokens.
|
|
1046
|
+
* Matches the behavior of Rust's textdistance Counter.
|
|
1047
|
+
*/
|
|
1048
|
+
declare function frequencyMap(tokens: Iterable<string>): Map<string, number>;
|
|
1049
|
+
/**
|
|
1050
|
+
* Build a character-level frequency map from a string.
|
|
1051
|
+
* This is the default tokenization strategy used by textdistance.
|
|
1052
|
+
*/
|
|
1053
|
+
declare function charFrequencyMap(str: string): Map<string, number>;
|
|
1054
|
+
/** Size of the ASCII frequency array (covers charCode 0-127). */
|
|
1055
|
+
declare const CHAR_FREQ_SIZE = 128;
|
|
1056
|
+
/**
|
|
1057
|
+
* Build a character frequency array from a string.
|
|
1058
|
+
* Returns false if any character is non-ASCII (charCode >= 128).
|
|
1059
|
+
* The caller must zero the array before use.
|
|
1060
|
+
*/
|
|
1061
|
+
declare function buildCharFreqArray(arr: Int32Array, str: string): boolean;
|
|
1062
|
+
/**
|
|
1063
|
+
* Compute the intersection size of two frequency maps.
|
|
1064
|
+
* For each key, takes the minimum count (multiset intersection).
|
|
1065
|
+
*/
|
|
1066
|
+
declare function intersectCount(a: Map<string, number>, b: Map<string, number>): number;
|
|
1067
|
+
/**
|
|
1068
|
+
* Compute the union size of two frequency maps.
|
|
1069
|
+
* For each key, takes the maximum count (multiset union).
|
|
1070
|
+
*/
|
|
1071
|
+
declare function unionCount(a: Map<string, number>, b: Map<string, number>): number;
|
|
1072
|
+
/**
|
|
1073
|
+
* Get total token count from a frequency map.
|
|
1074
|
+
*/
|
|
1075
|
+
declare function totalCount(map: Map<string, number>): number;
|
|
1076
|
+
declare function intersectCountInt(a: Map<number, number>, b: Map<number, number>): number;
|
|
1077
|
+
declare function unionCountInt(a: Map<number, number>, b: Map<number, number>): number;
|
|
1078
|
+
declare function totalCountInt(map: Map<number, number>): number;
|
|
1079
|
+
/**
|
|
1080
|
+
* Normalize a raw distance to a similarity score in [0, 1].
|
|
1081
|
+
*
|
|
1082
|
+
* @param distance - Raw distance value
|
|
1083
|
+
* @param maxDistance - Maximum possible distance (usually max(len(a), len(b)))
|
|
1084
|
+
*/
|
|
1085
|
+
declare function normalize(distance: number, maxDistance: number): number;
|
|
1086
|
+
/**
|
|
1087
|
+
* FNV-1a hash for strings. Fast, good distribution for hash-based algorithms.
|
|
1088
|
+
*/
|
|
1089
|
+
declare function fnv1a(str: string): number;
|
|
1090
|
+
/**
|
|
1091
|
+
* Combine two hashes into one (for generating multiple independent hash values).
|
|
1092
|
+
*/
|
|
1093
|
+
declare function combineHash(a: number, b: number): number;
|
|
1094
|
+
//#endregion
|
|
1095
|
+
export { BuiltinAlgorithm, CHAR_FREQ_SIZE, DiffType, FuzzySearch, type IDiffItem, type IDiffOptions, IFindBestMatchOptions, IFuzzySearchOptions, IJaroWinklerOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISearchKey, ISearchResult, ISearchResultWithDetails, ISift4Options, ISimHashOptions, ISmithWatermanOptions, ITverskyOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, SimilarityFn, buildCharFreqArray, charFrequencyMap, combineHash, cosine, cosineNgram, damerauLevenshtein, damerauLevenshteinNormalized, diff, findBestMatch, fnv1a, frequencyMap, hamming, hammingDistance, hammingNormalized, hammingSimilarity, intersectCount, intersectCountInt, jaccard, jaccardNgram, jaro, jaroWinkler, lcsDistance, lcsLength, lcsNormalized, lcsPairs, lcsSubstringDistance, lcsSubstringLength, lcsSubstringNormalized, length, levenshtein, levenshteinNormalized, ngramFrequencyMap, ngrams, normalize, overlap, prefix, ratcliff, sift4, sift4Normalized, simhash, smithWaterman, smithWatermanNormalized, sorensen, sorensenNgram, stringEquals, suffix, totalCount, totalCountInt, tversky, unionCount, unionCountInt };
|