@nlptools/distance 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -257,46 +257,46 @@ const result = diff("abc", "ac");
257
257
 
258
258
  ## Performance
259
259
 
260
- Benchmark: 1000 iterations per pair, same test data across all runtimes.
260
+ Benchmark: same test data across all runtimes. TS/WASM via `vitest bench` (V8 JIT), Rust via `cargo test --release`.
261
261
  Unit: microseconds per operation (us/op).
262
262
 
263
263
  ### Edit Distance
264
264
 
265
265
  | Algorithm | Size | TS (V8 JIT) | WASM (via JS) | Rust (native) |
266
266
  | --------------- | --------------- | ----------- | ------------- | ------------- |
267
- | levenshtein | Short (<10) | 0.3 | 7.9 | 0.11 |
268
- | levenshtein | Medium (10-100) | 1.3 | 116.2 | 0.98 |
269
- | levenshtein | Long (>200) | 15.2 | 2,877.5 | 39.68 |
270
- | levenshteinNorm | Short | 0.3 | 7.9 | 0.11 |
271
- | lcs | Short (<10) | 1.6 | 16.5 | 0.41 |
272
- | lcs | Medium (10-100) | 6.8 | 272.6 | 3.22 |
273
- | lcs | Long (>200) | 217.8 | 6,574.1 | 122.63 |
274
- | lcsNorm | Short | 1.7 | 16.2 | 0.48 |
267
+ | levenshtein | Short (<10) | 0.3 | 1.0 | 0.24 |
268
+ | levenshtein | Medium (10-100) | 1.3 | 4.8 | 2.00 |
269
+ | levenshtein | Long (>200) | 13.9 | 102.3 | 61.77 |
270
+ | levenshteinNorm | Short | 0.3 | 1.0 | 0.19 |
271
+ | lcs | Short (<10) | 1.7 | 1.9 | 0.69 |
272
+ | lcs | Medium (10-100) | 6.8 | 10.1 | 7.70 |
273
+ | lcs | Long (>200) | 216.0 | 161.8 | 151.84 |
274
+ | lcsNorm | Short | 1.7 | 1.9 | 0.42 |
275
275
 
276
276
  ### Token Similarity (Character Multiset)
277
277
 
278
278
  | Algorithm | Size | TS (V8 JIT) | WASM (via JS) | Rust (native) |
279
279
  | --------- | --------------- | ----------- | ------------- | ------------- |
280
- | jaccard | Short (<10) | 0.8 | 25.2 | 0.42 |
281
- | jaccard | Medium (10-100) | 0.8 | 74.3 | 1.55 |
282
- | jaccard | Long (>200) | 1.6 | 171.5 | 5.54 |
283
- | cosine | Short (<10) | 0.8 | 19.3 | 0.32 |
284
- | cosine | Medium (10-100) | 0.8 | 61.4 | 1.35 |
285
- | cosine | Long (>200) | 1.5 | 158.5 | 4.77 |
286
- | sorensen | Short (<10) | 0.7 | 19.3 | 0.33 |
287
- | sorensen | Medium (10-100) | 0.7 | 61.0 | 1.33 |
288
- | sorensen | Long (>200) | 1.5 | 160.0 | 4.46 |
280
+ | jaccard | Short (<10) | 0.8 | 3.4 | 0.63 |
281
+ | jaccard | Medium (10-100) | 0.8 | 8.6 | 2.67 |
282
+ | jaccard | Long (>200) | 1.5 | 18.9 | 7.25 |
283
+ | cosine | Short (<10) | 1.0 | 2.6 | 0.43 |
284
+ | cosine | Medium (10-100) | 0.8 | 7.0 | 1.56 |
285
+ | cosine | Long (>200) | 1.7 | 17.2 | 6.23 |
286
+ | sorensen | Short (<10) | 0.7 | 2.6 | 0.56 |
287
+ | sorensen | Medium (10-100) | 0.7 | 7.0 | 2.27 |
288
+ | sorensen | Long (>200) | 1.4 | 17.4 | 6.48 |
289
289
 
290
290
  ### Bigram Variants
291
291
 
292
292
  | Algorithm | Size | TS (V8 JIT) | WASM (via JS) | Rust (native) |
293
293
  | ------------- | --------------- | ----------- | ------------- | ------------- |
294
- | jaccardBigram | Short (<10) | 1.1 | 27.4 | 0.45 |
295
- | jaccardBigram | Medium (10-100) | 7.7 | 160.4 | 3.86 |
296
- | cosineBigram | Short (<10) | 0.8 | 21.2 | 0.36 |
297
- | cosineBigram | Medium (10-100) | 5.9 | 127.0 | 3.12 |
294
+ | jaccardBigram | Short (<10) | 1.1 | 3.5 | 0.67 |
295
+ | jaccardBigram | Medium (10-100) | 7.5 | 18.1 | 4.80 |
296
+ | cosineBigram | Short (<10) | 0.7 | 2.8 | 0.43 |
297
+ | cosineBigram | Medium (10-100) | 5.4 | 14.0 | 4.04 |
298
298
 
299
- TS implementations use V8 JIT optimization + `Int32Array` ASCII fast path + integer-encoded bigrams, avoiding JS-WASM boundary overhead entirely.
299
+ TS implementations use `Int32Array` ASCII fast path + integer-encoded bigrams, avoiding JS-WASM boundary overhead. For compute-heavy algorithms on long strings (e.g. LCS), WASM (via JS) and native Rust can outperform TS because the native computation advantage outweighs the boundary cost.
300
300
 
301
301
  ### Fuzzy Search: NLPTools vs Fuse.js
302
302
 
package/dist/index.d.mts CHANGED
@@ -65,6 +65,260 @@ declare function lcsLength(a: string, b: string, algorithm?: "myers" | "dp"): nu
65
65
  */
66
66
  declare function lcsPairs(a: string, b: string, algorithm?: "myers" | "dp"): Array<[number, number]>;
67
67
  //#endregion
68
+ //#region src/edit/jaro.d.ts
69
+ /**
70
+ * Jaro and Jaro-Winkler similarity algorithms.
71
+ *
72
+ * Jaro measures similarity between two strings by considering matching characters
73
+ * and transpositions. Jaro-Winkler extends Jaro with a prefix bonus.
74
+ *
75
+ * Time: O(m * n)
76
+ */
77
+ /**
78
+ * Compute Jaro similarity between two strings.
79
+ *
80
+ * J(S1, S2) = (1/3) * (m/|S1| + m/|S2| + (m - t/2) / m)
81
+ *
82
+ * where m = number of matching characters (within window),
83
+ * t = number of transpositions among matching characters.
84
+ *
85
+ * @param a - First string
86
+ * @param b - Second string
87
+ * @returns Jaro similarity in [0, 1]
88
+ */
89
+ declare function jaro(a: string, b: string): number;
90
+ /**
91
+ * Options for Jaro-Winkler similarity.
92
+ */
93
+ interface IJaroWinklerOptions {
94
+ /**
95
+ * Weight applied to the common prefix bonus.
96
+ * @default 0.1
97
+ */
98
+ prefixWeight?: number;
99
+ /**
100
+ * Maximum length of common prefix to consider.
101
+ * @default 4
102
+ */
103
+ maxPrefix?: number;
104
+ }
105
+ /**
106
+ * Compute Jaro-Winkler similarity between two strings.
107
+ *
108
+ * JW(S1, S2) = Jaro(S1, S2) + l * p * (1 - Jaro(S1, S2))
109
+ *
110
+ * where l = length of common prefix (up to maxPrefix),
111
+ * p = prefixWeight.
112
+ *
113
+ * @param a - First string
114
+ * @param b - Second string
115
+ * @param options - Configuration
116
+ * @returns Jaro-Winkler similarity in [0, 1]
117
+ */
118
+ declare function jaroWinkler(a: string, b: string, options?: IJaroWinklerOptions): number;
119
+ //#endregion
120
+ //#region src/edit/damerau.d.ts
121
+ /**
122
+ * Damerau-Levenshtein distance (unrestricted variant).
123
+ *
124
+ * Extension of Levenshtein that allows transpositions of adjacent characters,
125
+ * even when substrings are edited multiple times.
126
+ *
127
+ * Matches the default behavior of textdistance.rs (restricted = false).
128
+ *
129
+ * Time: O(m * n), Space: O(m * n)
130
+ */
131
+ /**
132
+ * Compute the Damerau-Levenshtein distance between two strings.
133
+ *
134
+ * Allows insertions, deletions, substitutions, and transpositions of
135
+ * adjacent characters. This is the unrestricted variant, which correctly
136
+ * handles cases where a substring is edited more than once.
137
+ *
138
+ * @param a - First string
139
+ * @param b - Second string
140
+ * @returns Edit distance (non-negative integer)
141
+ */
142
+ declare function damerauLevenshtein(a: string, b: string): number;
143
+ /**
144
+ * Compute the normalized Damerau-Levenshtein similarity in [0, 1].
145
+ *
146
+ * @param a - First string
147
+ * @param b - Second string
148
+ * @returns Similarity score where 1 means identical
149
+ */
150
+ declare function damerauLevenshteinNormalized(a: string, b: string): number;
151
+ //#endregion
152
+ //#region src/edit/hamming.d.ts
153
+ /**
154
+ * Hamming distance — counts positions at which two strings differ; a length difference counts as additional mismatches.
155
+ *
156
+ * Time: O(min(m, n))
157
+ */
158
+ /**
159
+ * Compute the Hamming distance between two strings.
160
+ *
161
+ * If strings have different lengths, only compares up to the shorter length
162
+ * and adds the length difference as additional mismatches.
163
+ *
164
+ * @param a - First string
165
+ * @param b - Second string
166
+ * @returns Number of mismatching characters
167
+ */
168
+ declare function hamming(a: string, b: string): number;
169
+ /**
170
+ * Compute the normalized Hamming similarity in [0, 1].
171
+ *
172
+ * @param a - First string
173
+ * @param b - Second string
174
+ * @returns Similarity score where 1 means identical
175
+ */
176
+ declare function hammingNormalized(a: string, b: string): number;
177
+ //#endregion
178
+ //#region src/edit/lcs-str.d.ts
179
+ /**
180
+ * Longest Common Substring (contiguous) algorithms.
181
+ *
182
+ * Unlike LCS (subsequence), this requires the matching characters to be contiguous.
183
+ *
184
+ * Time: O(m * n), Space: O(min(m, n))
185
+ */
186
+ /**
187
+ * Compute the length of the Longest Common Substring.
188
+ *
189
+ * @param a - First string
190
+ * @param b - Second string
191
+ * @returns Length of the longest common substring
192
+ */
193
+ declare function lcsSubstringLength(a: string, b: string): number;
194
+ /**
195
+ * Compute the LCS substring distance: len(a) + len(b) - 2 * lcsSubstringLength.
196
+ *
197
+ * @param a - First string
198
+ * @param b - Second string
199
+ * @returns LCS substring distance (non-negative integer)
200
+ */
201
+ declare function lcsSubstringDistance(a: string, b: string): number;
202
+ /**
203
+ * Compute the normalized LCS substring similarity in [0, 1].
204
+ *
205
+ * Normalized by max(len(a), len(b)) to match textdistance.rs convention.
206
+ *
207
+ * @param a - First string
208
+ * @param b - Second string
209
+ * @returns Similarity score where 1 means identical
210
+ */
211
+ declare function lcsSubstringNormalized(a: string, b: string): number;
212
+ //#endregion
213
+ //#region src/edit/sift4.d.ts
214
+ /**
215
+ * SIFT4 simple — fast approximate string distance.
216
+ *
217
+ * A fast algorithm for approximate string matching with O(n) complexity
218
+ * in typical cases. Returns a distance value (lower = more similar).
219
+ *
220
+ * Matches the textdistance.rs implementation exactly.
221
+ *
222
+ * Time: O(n * maxOffset)
223
+ */
224
+ /**
225
+ * Options for SIFT4.
226
+ */
227
+ interface ISift4Options {
228
+ /**
229
+ * Maximum offset for character matching.
230
+ * @default 5
231
+ */
232
+ maxOffset?: number;
233
+ }
234
+ /**
235
+ * Compute the SIFT4 simple distance between two strings.
236
+ *
237
+ * @param a - First string
238
+ * @param b - Second string
239
+ * @param options - Configuration
240
+ * @returns Distance (non-negative integer)
241
+ */
242
+ declare function sift4(a: string, b: string, options?: ISift4Options): number;
243
+ /**
244
+ * Compute the normalized SIFT4 similarity in [0, 1].
245
+ *
246
+ * @param a - First string
247
+ * @param b - Second string
248
+ * @param options - Configuration
249
+ * @returns Similarity score where 1 means identical
250
+ */
251
+ declare function sift4Normalized(a: string, b: string, options?: ISift4Options): number;
252
+ //#endregion
253
+ //#region src/edit/ratcliff.d.ts
254
+ /**
255
+ * Ratcliff-Obershelp algorithm — Gestalt pattern matching.
256
+ *
257
+ * Iteratively finds the longest common substring using a stack-based approach,
258
+ * combining scores from both sides. Returns a similarity in [0, 1].
259
+ *
260
+ * Based on the textdistance.rs implementation.
261
+ *
262
+ * Time: O(n * m * log(n * m)) worst case, O(n + m) average
263
+ */
264
+ /**
265
+ * Compute Ratcliff-Obershelp similarity between two strings.
266
+ *
267
+ * Uses an iterative stack-based approach to avoid stack overflow on
268
+ * very different strings. The algorithm recursively finds the longest
269
+ * common substring and combines similarity scores from both sides.
270
+ *
271
+ * similarity = 2 * M / T, where M = total matched characters, T = total characters
272
+ *
273
+ * @param a - First string
274
+ * @param b - Second string
275
+ * @returns Ratcliff-Obershelp similarity in [0, 1]
276
+ */
277
+ declare function ratcliff(a: string, b: string): number;
278
+ //#endregion
279
+ //#region src/edit/smith-waterman.d.ts
280
+ /**
281
+ * Smith-Waterman local sequence alignment algorithm.
282
+ *
283
+ * Designed for biological sequence alignment, it finds the best
284
+ * local alignment between two sequences.
285
+ *
286
+ * Default scoring: match=1, mismatch=0, gap=-1 (matches textdistance.rs)
287
+ *
288
+ * Time: O(m * n), Space: O(m * n)
289
+ */
290
+ /**
291
+ * Options for Smith-Waterman alignment.
292
+ */
293
+ interface ISmithWatermanOptions {
294
+ /** Score for matching characters. @default 1 */
295
+ matchScore?: number;
296
+ /** Score for mismatching characters. @default 0 */
297
+ mismatchScore?: number;
298
+ /** Score penalty for a gap. @default -1 */
299
+ gapScore?: number;
300
+ }
301
+ /**
302
+ * Compute the raw Smith-Waterman alignment score.
303
+ *
304
+ * @param a - First string
305
+ * @param b - Second string
306
+ * @param options - Scoring parameters
307
+ * @returns Raw alignment score (non-negative)
308
+ */
309
+ declare function smithWaterman(a: string, b: string, options?: ISmithWatermanOptions): number;
310
+ /**
311
+ * Compute the normalized Smith-Waterman similarity in [0, 1].
312
+ *
313
+ * Normalized by matchScore * max(len(a), len(b)), matching textdistance.rs convention.
314
+ *
315
+ * @param a - First string
316
+ * @param b - Second string
317
+ * @param options - Scoring parameters
318
+ * @returns Normalized similarity in [0, 1]
319
+ */
320
+ declare function smithWatermanNormalized(a: string, b: string, options?: ISmithWatermanOptions): number;
321
+ //#endregion
68
322
  //#region src/token/jaccard.d.ts
69
323
  /**
70
324
  * Jaccard similarity between two strings based on character-level multiset.
@@ -95,10 +349,10 @@ declare function jaccardNgram(a: string, b: string, n?: number): number;
95
349
  /**
96
350
  * Cosine similarity between two strings based on character-level multiset.
97
351
  *
98
- * cos(A, B) = (A · B) / (|A| * |B|)
352
+ * Uses textdistance.rs convention:
353
+ * cosine(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
99
354
  *
100
- * Uses Counter (frequency map) for multiset semantics,
101
- * matching the textdistance crate behavior.
355
+ * Where intersect_count = sum(min(freqA[c], freqB[c])) and count = sum of frequencies.
102
356
  *
103
357
  * Time: O(m + n)
104
358
  *
@@ -110,6 +364,9 @@ declare function cosine(a: string, b: string): number;
110
364
  /**
111
365
  * Cosine similarity based on character n-grams.
112
366
  *
367
+ * Uses textdistance.rs convention (same as character-level cosine but on n-grams):
368
+ * cosine_ngram(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
369
+ *
113
370
  * @param a - First string
114
371
  * @param b - Second string
115
372
  * @param n - N-gram size (default: 2)
@@ -143,6 +400,96 @@ declare function sorensen(a: string, b: string): number;
143
400
  */
144
401
  declare function sorensenNgram(a: string, b: string, n?: number): number;
145
402
  //#endregion
403
+ //#region src/token/tversky.d.ts
404
+ /**
405
+ * Tversky index — asymmetric set similarity measure.
406
+ *
407
+ * Reduces to Jaccard when alpha = beta = 1.
408
+ * Reduces to Sorensen-Dice when alpha = beta = 0.5.
409
+ *
410
+ * Time: O(m + n)
411
+ */
412
+ /**
413
+ * Options for Tversky index.
414
+ */
415
+ interface ITverskyOptions {
416
+ /**
417
+ * Weight for elements unique to the first set (a).
418
+ * @default 1
419
+ */
420
+ alpha?: number;
421
+ /**
422
+ * Weight for elements unique to the second set (b).
423
+ * @default 1
424
+ */
425
+ beta?: number;
426
+ }
427
+ /**
428
+ * Compute the Tversky index between two strings based on character multiset.
429
+ *
430
+ * T(A, B; α, β) = |A ∩ B| / (|A ∩ B| + α|A \ B| + β|B \ A|)
431
+ *
432
+ * @param a - First string
433
+ * @param b - Second string
434
+ * @param options - alpha and beta weights
435
+ * @returns Tversky index in [0, 1]
436
+ */
437
+ declare function tversky(a: string, b: string, options?: ITverskyOptions): number;
438
+ //#endregion
439
+ //#region src/token/overlap.d.ts
440
+ /**
441
+ * Overlap coefficient — set similarity normalized by the smaller set.
442
+ *
443
+ * overlap(A, B) = |A ∩ B| / min(|A|, |B|)
444
+ *
445
+ * Time: O(m + n)
446
+ */
447
+ /**
448
+ * Compute the overlap coefficient between two strings based on character multiset.
449
+ *
450
+ * @param a - First string
451
+ * @param b - Second string
452
+ * @returns Overlap coefficient in [0, 1]
453
+ */
454
+ declare function overlap(a: string, b: string): number;
455
+ //#endregion
456
+ //#region src/token/naive.d.ts
457
+ /**
458
+ * Naive string similarity measures: prefix, suffix, length.
459
+ *
460
+ * Time: O(min(m, n)) for prefix/suffix, O(1) for length
461
+ */
462
+ /**
463
+ * Compute prefix similarity between two strings.
464
+ *
465
+ * prefix(a, b) = commonPrefixLength / max(|a|, |b|)
466
+ *
467
+ * @param a - First string
468
+ * @param b - Second string
469
+ * @returns Prefix similarity in [0, 1]
470
+ */
471
+ declare function prefix(a: string, b: string): number;
472
+ /**
473
+ * Compute suffix similarity between two strings.
474
+ *
475
+ * suffix(a, b) = commonSuffixLength / max(|a|, |b|)
476
+ *
477
+ * @param a - First string
478
+ * @param b - Second string
479
+ * @returns Suffix similarity in [0, 1]
480
+ */
481
+ declare function suffix(a: string, b: string): number;
482
+ /**
483
+ * Compute length-based similarity between two strings.
484
+ *
485
+ * length(a, b) = 1 - |len(a) - len(b)| / max(len(a), len(b))
486
+ *
487
+ * @param a - First string
488
+ * @param b - Second string
489
+ * @returns Normalized length similarity in [0, 1]
490
+ */
491
+ declare function length(a: string, b: string): number;
492
+ //#endregion
146
493
  //#region src/hash/simhash.d.ts
147
494
  interface ISimHashOptions {
148
495
  /**
@@ -501,10 +848,10 @@ interface IFuzzySearchOptions {
501
848
  */
502
849
  limit?: number;
503
850
  /**
504
- * Whether search should be case-insensitive.
505
- * When true, both the query and the item strings are lowercased
506
- * before comparison.
507
- * @default false (case-insensitive by default)
851
+ * Whether search should be case-sensitive.
852
+ * When false (default), both the query and the item strings are lowercased
853
+ * before comparison (case-insensitive search).
854
+ * @default false
508
855
  */
509
856
  caseSensitive?: boolean;
510
857
  /**
@@ -677,4 +1024,72 @@ declare class FuzzySearch<T> {
677
1024
  */
678
1025
  declare function findBestMatch<T>(query: string, collection: ReadonlyArray<T>, options?: IFindBestMatchOptions): ISearchResult<T> | null;
679
1026
  //#endregion
680
- export { BuiltinAlgorithm, DiffType, FuzzySearch, type IDiffItem, type IDiffOptions, IFindBestMatchOptions, IFuzzySearchOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISearchKey, ISearchResult, ISearchResultWithDetails, ISimHashOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, SimilarityFn, cosine, cosineNgram, diff, findBestMatch, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
1027
+ //#region src/utils.d.ts
1028
+ /**
1029
+ * Generate character n-grams from a string.
1030
+ *
1031
+ * @param str - Input string
1032
+ * @param n - N-gram size (default: 2 for bigrams)
1033
+ */
1034
+ declare function ngrams(str: string, n?: number): string[];
1035
+ /**
1036
+ * Build an n-gram frequency map using integer-encoded keys.
1037
+ * Encodes n characters into a single number to avoid string allocation
1038
+ * and speed up Map hashing.
1039
+ *
1040
+ * For ASCII bigrams: key = (c1 << 8) | c2 (fits in 16 bits).
1041
+ * For non-ASCII or n > 2: falls back to string keys.
1042
+ */
1043
+ declare function ngramFrequencyMap(str: string, n?: number): Map<number, number> | null;
1044
+ /**
1045
+ * Build a frequency map (Counter/multiset) from an iterable of tokens.
1046
+ * Matches the behavior of Rust's textdistance Counter.
1047
+ */
1048
+ declare function frequencyMap(tokens: Iterable<string>): Map<string, number>;
1049
+ /**
1050
+ * Build a character-level frequency map from a string.
1051
+ * This is the default tokenization strategy used by textdistance.
1052
+ */
1053
+ declare function charFrequencyMap(str: string): Map<string, number>;
1054
+ /** Size of the ASCII frequency array (covers charCode 0-127). */
1055
+ declare const CHAR_FREQ_SIZE = 128;
1056
+ /**
1057
+ * Build a character frequency array from a string.
1058
+ * Returns false if any character is non-ASCII (charCode >= 128).
1059
+ * The caller must zero the array before use.
1060
+ */
1061
+ declare function buildCharFreqArray(arr: Int32Array, str: string): boolean;
1062
+ /**
1063
+ * Count intersect size between two frequency maps.
1064
+ * For each key, takes the minimum count (multiset intersection).
1065
+ */
1066
+ declare function intersectCount(a: Map<string, number>, b: Map<string, number>): number;
1067
+ /**
1068
+ * Count union size between two frequency maps.
1069
+ * For each key, takes the maximum count (multiset union).
1070
+ */
1071
+ declare function unionCount(a: Map<string, number>, b: Map<string, number>): number;
1072
+ /**
1073
+ * Get total token count from a frequency map.
1074
+ */
1075
+ declare function totalCount(map: Map<string, number>): number;
1076
+ declare function intersectCountInt(a: Map<number, number>, b: Map<number, number>): number;
1077
+ declare function unionCountInt(a: Map<number, number>, b: Map<number, number>): number;
1078
+ declare function totalCountInt(map: Map<number, number>): number;
1079
+ /**
1080
+ * Normalize a raw distance to a similarity score in [0, 1].
1081
+ *
1082
+ * @param distance - Raw distance value
1083
+ * @param maxDistance - Maximum possible distance (usually max(len(a), len(b)))
1084
+ */
1085
+ declare function normalize(distance: number, maxDistance: number): number;
1086
+ /**
1087
+ * FNV-1a hash for strings. Fast, good distribution for hash-based algorithms.
1088
+ */
1089
+ declare function fnv1a(str: string): number;
1090
+ /**
1091
+ * Combine two hashes into one (for generating multiple independent hash values).
1092
+ */
1093
+ declare function combineHash(a: number, b: number): number;
1094
+ //#endregion
1095
+ export { BuiltinAlgorithm, CHAR_FREQ_SIZE, DiffType, FuzzySearch, type IDiffItem, type IDiffOptions, IFindBestMatchOptions, IFuzzySearchOptions, IJaroWinklerOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISearchKey, ISearchResult, ISearchResultWithDetails, ISift4Options, ISimHashOptions, ISmithWatermanOptions, ITverskyOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, SimilarityFn, buildCharFreqArray, charFrequencyMap, combineHash, cosine, cosineNgram, damerauLevenshtein, damerauLevenshteinNormalized, diff, findBestMatch, fnv1a, frequencyMap, hamming, hammingDistance, hammingNormalized, hammingSimilarity, intersectCount, intersectCountInt, jaccard, jaccardNgram, jaro, jaroWinkler, lcsDistance, lcsLength, lcsNormalized, lcsPairs, lcsSubstringDistance, lcsSubstringLength, lcsSubstringNormalized, length, levenshtein, levenshteinNormalized, ngramFrequencyMap, ngrams, normalize, overlap, prefix, ratcliff, sift4, sift4Normalized, simhash, smithWaterman, smithWatermanNormalized, sorensen, sorensenNgram, stringEquals, suffix, totalCount, totalCountInt, tversky, unionCount, unionCountInt };
package/dist/index.mjs CHANGED
@@ -53,6 +53,8 @@ function frequencyMap(tokens) {
53
53
  function charFrequencyMap(str) {
54
54
  return frequencyMap(str);
55
55
  }
56
+ /** Size of the ASCII frequency array (covers charCode 0-127). */
57
+ const CHAR_FREQ_SIZE = 128;
56
58
  /**
57
59
  * Build a character frequency array from a string.
58
60
  * Returns false if any character is non-ASCII (charCode >= 128).
@@ -148,6 +150,13 @@ function fnv1a(str) {
148
150
  }
149
151
  return hash >>> 0;
150
152
  }
153
/**
 * Mix hash `b` into hash `a` and return the result as an unsigned 32-bit integer.
 *
 * The mixing step XORs `a` with `b + 0x9e3779b9 + (a << 6) + (a >>> 2)`.
 *
 * @param a - Hash value being updated
 * @param b - Hash value folded into `a`
 * @returns Combined hash in the unsigned 32-bit range
 */
function combineHash(a, b) {
	// 0x9e3779b9 === 2654435769
	const mixed = a ^ (b + 0x9e3779b9 + (a << 6) + (a >>> 2));
	// `>>> 0` reinterprets the signed 32-bit XOR result as unsigned.
	return mixed >>> 0;
}
151
160
  //#endregion
152
161
  //#region src/edit/levenshtein.ts
153
162
  /**
@@ -206,7 +215,9 @@ function lcsDistance(a, b, algorithm = "myers") {
206
215
  * @returns Similarity score where 1 means identical
207
216
  */
208
217
  function lcsNormalized(a, b, algorithm = "myers") {
209
- return normalize(lcsDistance(a, b, algorithm), a.length + b.length);
218
+ const maxLen = Math.max(a.length, b.length);
219
+ if (maxLen === 0) return 1;
220
+ return lcsLength(a, b, algorithm) / maxLen;
210
221
  }
211
222
  /**
212
223
  * Get the length of the Longest Common Subsequence.
@@ -231,6 +242,458 @@ function lcsPairs(a, b, algorithm = "myers") {
231
242
  return (algorithm === "dp" ? lcs_dp : lcs_myers_linear_space)(a.length, b.length, stringEquals(a, b));
232
243
  }
233
244
  //#endregion
245
+ //#region src/edit/jaro.ts
246
+ /**
247
+ * Jaro and Jaro-Winkler similarity algorithms.
248
+ *
249
+ * Jaro measures similarity between two strings by considering matching characters
250
+ * and transpositions. Jaro-Winkler extends Jaro with a prefix bonus.
251
+ *
252
+ * Time: O(m * n)
253
+ */
254
/**
 * Compute Jaro similarity between two strings.
 *
 * J(S1, S2) = (1/3) * (m/|S1| + m/|S2| + (m - t/2) / m)
 *
 * where m = number of matching characters (within the match window),
 * t = number of matched characters that are out of order.
 *
 * The match window is floor(max(|S1|, |S2|) / 2) - 1, clamped to 0 so that
 * two single-character strings are still compared position-to-position
 * (without the clamp the window would be -1 and `jaro("a", "a")` would be 0).
 *
 * Characters are compared as UTF-16 code units.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Jaro similarity in [0, 1]; 1 for two empty strings, 0 if exactly one is empty
 */
function jaro(a, b) {
	const aLen = a.length;
	const bLen = b.length;
	if (aLen === 0 && bLen === 0) return 1;
	if (aLen === 0 || bLen === 0) return 0;
	// Clamp: for max length 1 the raw window is -1, which must behave like 0.
	const matchDistance = Math.max(0, Math.floor(Math.max(aLen, bLen) / 2) - 1);
	const aMatches = new Uint8Array(aLen);
	const bMatches = new Uint8Array(bLen);
	let matches = 0;
	for (let i = 0; i < aLen; i++) {
		const aCode = a.charCodeAt(i);
		const start = Math.max(0, i - matchDistance);
		const end = Math.min(i + matchDistance + 1, bLen);
		for (let j = start; j < end; j++) {
			if (bMatches[j] || aCode !== b.charCodeAt(j)) continue;
			// Greedily pair a[i] with the first unused equal character in the window.
			aMatches[i] = 1;
			bMatches[j] = 1;
			matches++;
			break;
		}
	}
	if (matches === 0) return 0;
	// Walk both match lists in order; every position where they disagree
	// counts as half a transposition.
	let transpositions = 0;
	let k = 0;
	for (let i = 0; i < aLen; i++) {
		if (!aMatches[i]) continue;
		while (!bMatches[k]) k++;
		if (a.charCodeAt(i) !== b.charCodeAt(k)) transpositions++;
		k++;
	}
	return (matches / aLen + matches / bLen + (matches - transpositions / 2) / matches) / 3;
}
298
/**
 * Compute Jaro-Winkler similarity between two strings.
 *
 * JW(S1, S2) = Jaro(S1, S2) + l * p * (1 - Jaro(S1, S2))
 *
 * where l = length of the common prefix, capped at `maxPrefix` and at the
 * length of the shorter string, and p = `prefixWeight`.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - `prefixWeight` (default 0.1) and `maxPrefix` (default 4)
 * @returns Jaro-Winkler similarity
 */
function jaroWinkler(a, b, options = {}) {
	const scaling = options.prefixWeight ?? 0.1;
	const prefixLimit = options.maxPrefix ?? 4;
	const base = jaro(a, b);
	const cap = Math.min(a.length, b.length, prefixLimit);
	// Count how many leading code units the two strings share, up to the cap.
	let shared = 0;
	for (; shared < cap; shared++) {
		if (a.charCodeAt(shared) !== b.charCodeAt(shared)) break;
	}
	return base + shared * scaling * (1 - base);
}
320
+ //#endregion
321
+ //#region src/edit/damerau.ts
322
+ /**
323
+ * Damerau-Levenshtein distance (unrestricted variant).
324
+ *
325
+ * Extension of Levenshtein that allows transpositions of adjacent characters,
326
+ * even when substrings are edited multiple times.
327
+ *
328
+ * Matches the default behavior of textdistance.rs (restricted = false).
329
+ *
330
+ * Time: O(m * n), Space: O(m * n)
331
+ */
332
/**
 * Compute the Damerau-Levenshtein distance between two strings.
 *
 * Counts insertions, deletions, substitutions and transpositions of adjacent
 * characters (unrestricted variant). Characters are compared as UTF-16 code units.
 *
 * The DP table is stored flat with one extra sentinel row and column:
 * conceptual cell d[r][c] (r, c >= -1) lives at index (r + 1) * stride + (c + 1).
 *
 * @param a - First string
 * @param b - Second string
 * @returns Edit distance (non-negative integer)
 */
function damerauLevenshtein(a, b) {
	const rows = a.length;
	const cols = b.length;
	if (rows === 0) return cols;
	if (cols === 0) return rows;
	// Upper bound on any distance; fills the sentinel row/column so that a
	// transposition with no earlier occurrence can never win the minimum.
	const sentinel = rows + cols;
	const stride = cols + 2;
	const table = new Uint32Array((rows + 2) * stride);
	table[0] = sentinel;
	for (let r = 0; r <= rows; r++) {
		const rowStart = (r + 1) * stride;
		table[rowStart] = sentinel;
		table[rowStart + 1] = r;
	}
	for (let c = 0; c <= cols; c++) {
		table[c + 1] = sentinel;
		table[stride + c + 1] = c;
	}
	// Maps a code unit to the last (1-based) row of `a` in which it occurred.
	const lastRowOf = /* @__PURE__ */ new Map();
	for (let r = 1; r <= rows; r++) {
		const codeA = a.charCodeAt(r - 1);
		const prevRow = r * stride;
		const currRow = prevRow + stride;
		// Last (1-based) column in this row where a[r-1] matched b.
		let lastMatchCol = 0;
		for (let c = 1; c <= cols; c++) {
			const codeB = b.charCodeAt(c - 1);
			const same = codeA === codeB;
			const k = lastRowOf.get(codeB) ?? 0;
			const substitution = table[prevRow + c] + (same ? 0 : 1);
			const deletion = table[currRow + c] + 1;
			const insertion = table[prevRow + c + 1] + 1;
			// Cost of reaching the earlier pair, plus the characters skipped on
			// both sides, plus one for the swap itself.
			const transposition = table[k * stride + lastMatchCol] + (r - k - 1) + 1 + (c - lastMatchCol - 1);
			table[currRow + c + 1] = Math.min(substitution, deletion, insertion, transposition);
			if (same) lastMatchCol = c;
		}
		lastRowOf.set(codeA, r);
	}
	return table[(rows + 1) * stride + cols + 1];
}
382
/**
 * Compute the normalized Damerau-Levenshtein similarity.
 *
 * Passes the raw distance to `normalize` together with the length of the
 * longer string as the maximum possible distance.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Similarity score where 1 means identical
 */
function damerauLevenshteinNormalized(a, b) {
	const distance = damerauLevenshtein(a, b);
	const longest = Math.max(a.length, b.length);
	return normalize(distance, longest);
}
392
+ //#endregion
393
+ //#region src/edit/hamming.ts
394
+ /**
395
+ * Hamming distance — counts positions at which two strings differ; a length difference counts as additional mismatches.
396
+ *
397
+ * Time: O(min(m, n))
398
+ */
399
/**
 * Compute the Hamming distance between two strings.
 *
 * Positions are compared up to the length of the shorter string (as UTF-16
 * code units); the length difference is added as extra mismatches.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Number of mismatching characters
 */
function hamming(a, b) {
	const shorter = Math.min(a.length, b.length);
	let mismatches = 0;
	for (let pos = 0; pos < shorter; pos++) {
		if (a.charCodeAt(pos) !== b.charCodeAt(pos)) mismatches += 1;
	}
	// Every position that exists in only one string is a mismatch.
	return mismatches + Math.abs(a.length - b.length);
}
415
/**
 * Compute the normalized Hamming similarity in [0, 1].
 *
 * Defined as 1 - hamming(a, b) / max(len(a), len(b)); two empty strings
 * are treated as identical.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Similarity score where 1 means identical
 */
function hammingNormalized(a, b) {
	const longest = Math.max(a.length, b.length);
	if (longest === 0) return 1;
	return 1 - hamming(a, b) / longest;
}
426
+ //#endregion
427
+ //#region src/edit/lcs-str.ts
428
+ /**
429
+ * Longest Common Substring (contiguous) algorithms.
430
+ *
431
+ * Unlike LCS (subsequence), this requires the matching characters to be contiguous.
432
+ *
433
+ * Time: O(m * n), Space: O(min(m, n))
434
+ */
435
/**
 * Compute the length of the Longest Common Substring (contiguous run).
 *
 * Uses a single rolling DP row. Because the result is symmetric in its
 * arguments, the row is sized by the shorter string, giving
 * Time: O(m * n), Space: O(min(m, n)).
 *
 * Characters are compared as UTF-16 code units.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Length of the longest common substring (0 if either string is empty)
 */
function lcsSubstringLength(a, b) {
	// Iterate the longer string in the outer loop so the DP row tracks the shorter one.
	const [outer, inner] = a.length >= b.length ? [a, b] : [b, a];
	const outerLen = outer.length;
	const innerLen = inner.length;
	if (innerLen === 0) return 0;
	// row[j] = length of the common run ending at outer[i] and inner[j - 1].
	const row = new Uint32Array(innerLen + 1);
	let best = 0;
	for (let i = 0; i < outerLen; i++) {
		const outerCode = outer.charCodeAt(i);
		// Value of row[j - 1] from the previous outer iteration (the DP diagonal).
		let diagonal = 0;
		for (let j = 1; j <= innerLen; j++) {
			const above = row[j];
			if (outerCode === inner.charCodeAt(j - 1)) {
				const run = diagonal + 1;
				row[j] = run;
				if (run > best) best = run;
			} else {
				row[j] = 0;
			}
			diagonal = above;
		}
	}
	return best;
}
461
/**
 * LCS-substring distance: the number of characters in both strings that
 * are not part of their longest common substring, i.e.
 * len(a) + len(b) - 2 * lcsSubstringLength(a, b).
 *
 * @param a - First string
 * @param b - Second string
 * @returns LCS substring distance (non-negative integer)
 */
function lcsSubstringDistance(a, b) {
  const shared = lcsSubstringLength(a, b);
  const leftoverA = a.length - shared;
  const leftoverB = b.length - shared;
  return leftoverA + leftoverB;
}
471
/**
 * LCS-substring similarity scaled to [0, 1].
 *
 * The longest common substring length is divided by the length of the
 * longer input. Two empty strings are treated as identical.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Similarity score where 1 means identical
 */
function lcsSubstringNormalized(a, b) {
  const longest = Math.max(a.length, b.length);
  return longest === 0 ? 1 : lcsSubstringLength(a, b) / longest;
}
485
+ //#endregion
486
+ //#region src/edit/sift4.ts
487
+ /**
488
+ * SIFT4 simple — fast approximate string distance.
489
+ *
490
+ * A fast algorithm for approximate string matching with O(n) complexity
491
+ * in typical cases. Returns a distance value (lower = more similar).
492
+ *
493
+ * Matches the textdistance.rs implementation exactly.
494
+ *
495
+ * Time: O(n * maxOffset)
496
+ */
497
/**
 * SIFT4 (simple variant) distance between two strings.
 *
 * Two cursors walk the strings in lock-step. On a mismatch the cursors are
 * re-aligned to the smaller index and a window of at most `maxOffset`
 * positions is probed in either string for the next matching code unit.
 * The distance is the longer length minus the number of matches found.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - Configuration; `maxOffset` is the probe window (default 5)
 * @returns Distance (lower means more similar)
 */
function sift4(a, b, options = {}) {
  const window = options.maxOffset ?? 5;
  const lenA = a.length;
  const lenB = b.length;
  let i = 0; // cursor into a
  let j = 0; // cursor into b
  // Total matched code units (finished runs plus the current run).
  let matched = 0;

  while (i < lenA && j < lenB) {
    if (a.charCodeAt(i) === b.charCodeAt(j)) {
      matched += 1;
    } else {
      // Re-synchronise the cursors before probing ahead.
      if (i !== j) {
        i = j = Math.min(i, j);
      }
      for (let shift = 0; shift < window; shift += 1) {
        // Stop probing once neither side has anything left to look at.
        if (i + 1 >= lenA && j + shift >= lenB) break;
        if (i + shift < lenA && a.charCodeAt(i + shift) === b.charCodeAt(j)) {
          i += shift;
          matched += 1;
          break;
        }
        if (j + shift < lenB && a.charCodeAt(i) === b.charCodeAt(j + shift)) {
          j += shift;
          matched += 1;
          break;
        }
      }
    }
    i += 1;
    j += 1;
  }
  return Math.max(lenA, lenB) - matched;
}
541
/**
 * SIFT4 similarity: the SIFT4 distance passed through `normalize` together
 * with the length of the longer string.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - Configuration forwarded to `sift4`
 * @returns Similarity score where 1 means identical
 */
function sift4Normalized(a, b, options = {}) {
  const distance = sift4(a, b, options);
  const longest = Math.max(a.length, b.length);
  return normalize(distance, longest);
}
552
+ //#endregion
553
+ //#region src/edit/ratcliff.ts
554
+ /**
555
+ * Ratcliff-Obershelp algorithm — Gestalt pattern matching.
556
+ *
557
+ * Iteratively finds the longest common substring using a stack-based approach,
558
+ * combining scores from both sides. Returns a similarity in [0, 1].
559
+ *
560
+ * Based on the textdistance.rs implementation.
561
+ *
562
+ * Time: O(n * m * log(n * m)) worst case, O(n + m) average
563
+ */
564
/**
 * Internal: locate the longest common substring of `a` and `b`.
 *
 * When several substrings share the maximum length, the one found first
 * while scanning `a` left to right (and, within that, `b` left to right)
 * wins, because only a strictly longer run replaces the current best.
 *
 * @param a - First string
 * @param b - Second string
 * @returns `len` of the match plus its start offsets `aIdx` / `bIdx`
 *          (all zero when nothing matches or an input is empty)
 */
function findLCS(a, b) {
  const rows = a.length;
  const cols = b.length;
  let bestLen = 0;
  let bestRowEnd = 0;
  let bestColEnd = 0;

  if (rows > 0 && cols > 0) {
    // run[c + 1] = length of the common run ending at the previous row and b[c].
    const run = new Uint32Array(cols + 1);
    for (let r = 0; r < rows; r++) {
      const code = a.charCodeAt(r);
      let diag = 0;
      for (let c = 0; c < cols; c++) {
        const above = run[c + 1];
        const len = code === b.charCodeAt(c) ? diag + 1 : 0;
        run[c + 1] = len;
        if (len > bestLen) {
          bestLen = len;
          bestRowEnd = r + 1;
          bestColEnd = c + 1;
        }
        diag = above;
      }
    }
  }

  // End positions are exclusive, so subtracting the length gives the start.
  return {
    len: bestLen,
    aIdx: bestRowEnd - bestLen,
    bIdx: bestColEnd - bestLen
  };
}
600
/**
 * Ratcliff-Obershelp (Gestalt pattern matching) similarity.
 *
 * The longest common substring of the two inputs is found, its length is
 * added to the match total, and the pieces to its left and to its right
 * are then processed the same way. An explicit work list replaces
 * recursion.
 *
 * similarity = 2 * M / T, with M = matched characters and
 * T = len(a) + len(b).
 *
 * @param a - First string
 * @param b - Second string
 * @returns Ratcliff-Obershelp similarity in [0, 1]
 */
function ratcliff(a, b) {
  if (a === b) return 1;
  const combined = a.length + b.length;
  if (combined === 0) return 1;

  let matched = 0;
  // Each entry is a pair of half-open ranges still to be compared.
  const pending = [{ aLo: 0, aHi: a.length, bLo: 0, bHi: b.length }];

  for (let span = pending.pop(); span !== undefined; span = pending.pop()) {
    const { aLo, aHi, bLo, bHi } = span;
    if (aHi === aLo || bHi === bLo) continue;

    const hit = findLCS(a.slice(aLo, aHi), b.slice(bLo, bHi));
    if (hit.len === 0) continue;
    matched += hit.len;

    // Offsets from findLCS are relative to the slices; shift them back.
    const aCut = aLo + hit.aIdx;
    const bCut = bLo + hit.bIdx;
    const aResume = aCut + hit.len;
    const bResume = bCut + hit.len;

    // Right-hand remainder, only when both sides still have characters.
    if (aResume < aHi && bResume < bHi) {
      pending.push({ aLo: aResume, aHi, bLo: bResume, bHi });
    }
    // Left-hand remainder, only when both sides still have characters.
    if (aCut > aLo && bCut > bLo) {
      pending.push({ aLo, aHi: aCut, bLo, bHi: bCut });
    }
  }

  return (2 * matched) / combined;
}
649
+ //#endregion
650
+ //#region src/edit/smith-waterman.ts
651
/**
 * Compute the raw Smith-Waterman alignment score.
 *
 * Each cell is max(0, diagonal + match/mismatch score, up + gap, left + gap).
 * The value returned is the cell for the complete inputs (last row, last
 * column).
 * NOTE(review): classic Smith-Waterman reports the maximum over the whole
 * matrix; the final-cell result is kept as-is here — presumably for parity
 * with the textdistance.rs convention referenced by the normalized variant;
 * confirm before changing.
 *
 * Only two rows are kept in memory (O(len(b)) space), stored as
 * Float64Array so that fractional scoring parameters are not truncated and
 * totals above the 16-bit integer range do not wrap.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - Scoring parameters: `matchScore` (default 1),
 *                  `mismatchScore` (default 0), `gapScore` (default -1)
 * @returns Raw alignment score (non-negative)
 */
function smithWaterman(a, b, options = {}) {
  const matchScore = options.matchScore ?? 1;
  const mismatchScore = options.mismatchScore ?? 0;
  const gapScore = options.gapScore ?? -1;
  const aLen = a.length;
  const bLen = b.length;
  const width = bLen + 1;

  // Typed arrays start zero-filled; index 0 (the empty-prefix column) is
  // never written, so it stays 0 for every row.
  let prevRow = new Float64Array(width);
  let currRow = new Float64Array(width);

  for (let i = 1; i <= aLen; i++) {
    const aCode = a.charCodeAt(i - 1);
    for (let j = 1; j <= bLen; j++) {
      const cost = aCode === b.charCodeAt(j - 1) ? matchScore : mismatchScore;
      const diag = prevRow[j - 1] + cost;
      const up = prevRow[j] + gapScore;
      const left = currRow[j - 1] + gapScore;
      currRow[j] = Math.max(0, diag, up, left);
    }
    // The row just filled becomes the "previous" row of the next iteration.
    const finished = currRow;
    currRow = prevRow;
    prevRow = finished;
  }

  // After the final swap prevRow holds the last computed row (or the
  // all-zero row when `a` is empty).
  return prevRow[bLen];
}
681
/**
 * Smith-Waterman score divided by matchScore * max(len(a), len(b)).
 *
 * When that divisor is 0 (for example two empty strings) the function
 * returns 1 without computing an alignment.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - Scoring parameters forwarded to `smithWaterman`
 * @returns Normalized similarity
 */
function smithWatermanNormalized(a, b, options = {}) {
  const perMatch = options.matchScore ?? 1;
  const ceiling = perMatch * Math.max(a.length, b.length);
  return ceiling === 0 ? 1 : smithWaterman(a, b, options) / ceiling;
}
696
+ //#endregion
234
697
  //#region src/token/jaccard.ts
235
698
  const _freqA$2 = new Int32Array(128);
236
699
  const _freqB$2 = new Int32Array(128);
@@ -299,10 +762,10 @@ const _freqB$1 = new Int32Array(128);
299
762
  /**
300
763
  * Cosine similarity between two strings based on character-level multiset.
301
764
  *
302
- * cos(A, B) = (A · B) / (|A| * |B|)
765
+ * Uses textdistance.rs convention:
766
+ * cosine(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
303
767
  *
304
- * Uses Counter (frequency map) for multiset semantics,
305
- * matching the textdistance crate behavior.
768
+ * Where intersect_count = sum(min(freqA[c], freqB[c])) and count = sum of frequencies.
306
769
  *
307
770
  * Time: O(m + n)
308
771
  *
@@ -314,43 +777,49 @@ function cosine(a, b) {
314
777
  _freqA$1.fill(0);
315
778
  _freqB$1.fill(0);
316
779
  if (buildCharFreqArray(_freqA$1, a) && buildCharFreqArray(_freqB$1, b)) {
317
- let dot = 0;
318
- let normA = 0;
319
- let normB = 0;
780
+ let intersection = 0;
781
+ let totalA = 0;
782
+ let totalB = 0;
320
783
  for (let i = 0; i < 128; i++) {
321
784
  const va = _freqA$1[i];
322
785
  const vb = _freqB$1[i];
323
- dot += va * vb;
324
- normA += va * va;
325
- normB += vb * vb;
786
+ intersection += va < vb ? va : vb;
787
+ totalA += va;
788
+ totalB += vb;
326
789
  }
327
- const denominator = Math.sqrt(normA) * Math.sqrt(normB);
328
- return denominator === 0 ? 1 : dot / denominator;
790
+ if (totalA === 0 && totalB === 0) return 1;
791
+ if (totalA === 0 || totalB === 0) return 0;
792
+ return intersection / Math.sqrt(totalA * totalB);
329
793
  }
330
794
  const freqAMap = charFrequencyMap(a);
331
795
  const freqBMap = charFrequencyMap(b);
332
- let dotProduct = 0;
333
- let normA = 0;
334
- let normB = 0;
335
- const [smaller, larger] = freqAMap.size <= freqBMap.size ? [freqAMap, freqBMap] : [freqBMap, freqAMap];
336
- for (const [char, countA] of smaller) {
337
- const countB = larger.get(char) ?? 0;
338
- dotProduct += countA * countB;
339
- normA += countA * countA;
340
- }
341
- for (const [, count] of larger) normB += count * count;
342
- if (freqAMap.size > freqBMap.size) {
343
- const tmp = normA;
344
- normA = normB;
345
- normB = tmp;
796
+ const intersection = intersectCount$1(freqAMap, freqBMap);
797
+ const totalA = totalCount$1(freqAMap);
798
+ const totalB = totalCount$1(freqBMap);
799
+ if (totalA === 0 && totalB === 0) return 1;
800
+ if (totalA === 0 || totalB === 0) return 0;
801
+ return intersection / Math.sqrt(totalA * totalB);
802
+ }
803
/**
 * Multiset intersection size of two frequency maps: for every key present
 * in both maps, the smaller of the two counts is added to the total.
 *
 * @param a - First frequency map (key -> count)
 * @param b - Second frequency map (key -> count)
 * @returns Sum of the per-key minimum counts
 */
function intersectCount$1(a, b) {
  // Walk the map with fewer entries and probe the other one.
  const outer = a.size <= b.size ? a : b;
  const inner = outer === a ? b : a;
  let shared = 0;
  outer.forEach((outerCount, key) => {
    const innerCount = inner.get(key);
    if (innerCount === undefined) return;
    shared += outerCount < innerCount ? outerCount : innerCount;
  });
  return shared;
}
812
/**
 * Sum of all counts stored in a frequency map.
 *
 * @param map - Frequency map (key -> count)
 * @returns Total of the map's values (0 for an empty map)
 */
function totalCount$1(map) {
  let sum = 0;
  const counts = map.values();
  for (let step = counts.next(); !step.done; step = counts.next()) {
    sum += step.value;
  }
  return sum;
}
351
817
  /**
352
818
  * Cosine similarity based on character n-grams.
353
819
  *
820
+ * Uses textdistance.rs convention (same as character-level cosine but on n-grams):
821
+ * cosine_ngram(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
822
+ *
354
823
  * @param a - First string
355
824
  * @param b - Second string
356
825
  * @param n - N-gram size (default: 2)
@@ -360,32 +829,21 @@ function cosineNgram(a, b, n = 2) {
360
829
  const freqAInt = ngramFrequencyMap(a, n);
361
830
  const freqBInt = ngramFrequencyMap(b, n);
362
831
  if (freqAInt !== null && freqBInt !== null) {
363
- let dotProduct = 0;
364
- let normA = 0;
365
- let normB = 0;
366
- for (const [id, countA] of freqAInt) {
367
- const countB = freqBInt.get(id) ?? 0;
368
- dotProduct += countA * countB;
369
- normA += countA * countA;
370
- }
371
- for (const [, count] of freqBInt) normB += count * count;
372
- const denominator = Math.sqrt(normA) * Math.sqrt(normB);
373
- return denominator === 0 ? 1 : dotProduct / denominator;
832
+ const intersection = intersectCountInt(freqAInt, freqBInt);
833
+ const totalA = totalCountInt(freqAInt);
834
+ const totalB = totalCountInt(freqBInt);
835
+ if (totalA === 0 && totalB === 0) return 1;
836
+ if (totalA === 0 || totalB === 0) return 0;
837
+ return intersection / Math.sqrt(totalA * totalB);
374
838
  }
375
839
  const freqA = frequencyMap(ngrams(a, n));
376
840
  const freqB = frequencyMap(ngrams(b, n));
377
- let dotProduct = 0;
378
- let normA = 0;
379
- let normB = 0;
380
- for (const [token, countA] of freqA) {
381
- const countB = freqB.get(token) ?? 0;
382
- dotProduct += countA * countB;
383
- normA += countA * countA;
384
- }
385
- for (const [, count] of freqB) normB += count * count;
386
- const denominator = Math.sqrt(normA) * Math.sqrt(normB);
387
- if (denominator === 0) return 1;
388
- return dotProduct / denominator;
841
+ const intersection = intersectCount$1(freqA, freqB);
842
+ const totalA = totalCount$1(freqA);
843
+ const totalB = totalCount$1(freqB);
844
+ if (totalA === 0 && totalB === 0) return 1;
845
+ if (totalA === 0 || totalB === 0) return 0;
846
+ return intersection / Math.sqrt(totalA * totalB);
389
847
  }
390
848
  //#endregion
391
849
  //#region src/token/sorensen.ts
@@ -445,6 +903,122 @@ function sorensenNgram(a, b, n = 2) {
445
903
  return 2 * ic / total;
446
904
  }
447
905
  //#endregion
906
+ //#region src/token/tversky.ts
907
+ /**
908
+ * Tversky index — asymmetric set similarity measure.
909
+ *
910
+ * Reduces to Jaccard when alpha = beta = 1.
911
+ * Reduces to Sorensen-Dice when alpha = beta = 0.5.
912
+ *
913
+ * Time: O(m + n)
914
+ */
915
/**
 * Tversky index of two strings, computed on their character multisets.
 *
 * T(A, B; α, β) = |A ∩ B| / (|A ∩ B| + α|A \ B| + β|B \ A|)
 *
 * A zero denominator (for example two empty strings) yields 1.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - `alpha` and `beta` weights (each defaults to 1)
 * @returns Tversky index
 */
function tversky(a, b, options = {}) {
  const weightA = options.alpha ?? 1;
  const weightB = options.beta ?? 1;
  const countsA = charFrequencyMap(a);
  const countsB = charFrequencyMap(b);
  const shared = intersectCount(countsA, countsB);
  // Characters that belong to exactly one of the two multisets.
  const exclusiveA = totalCount(countsA) - shared;
  const exclusiveB = totalCount(countsB) - shared;
  const denominator = shared + weightA * exclusiveA + weightB * exclusiveB;
  return denominator === 0 ? 1 : shared / denominator;
}
939
+ //#endregion
940
+ //#region src/token/overlap.ts
941
+ /**
942
+ * Overlap coefficient — set similarity normalized by the smaller set.
943
+ *
944
+ * overlap(A, B) = |A ∩ B| / min(|A|, |B|)
945
+ *
946
+ * Time: O(m + n)
947
+ */
948
/**
 * Overlap coefficient of two strings, computed on their character
 * multisets: |A ∩ B| / min(|A|, |B|).
 *
 * Two empty inputs give 1; exactly one empty input gives 0.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Overlap coefficient in [0, 1]
 */
function overlap(a, b) {
  const countsA = charFrequencyMap(a);
  const countsB = charFrequencyMap(b);
  const shared = intersectCount(countsA, countsB);
  const sizeA = totalCount(countsA);
  const sizeB = totalCount(countsB);
  if (sizeA === 0) {
    return sizeB === 0 ? 1 : 0;
  }
  if (sizeB === 0) {
    return 0;
  }
  return shared / Math.min(sizeA, sizeB);
}
966
+ //#endregion
967
+ //#region src/token/naive.ts
968
+ /**
969
+ * Naive string similarity measures: prefix, suffix, length.
970
+ *
971
+ * Time: O(min(m, n)) for prefix/suffix, O(1) for length
972
+ */
973
/**
 * Prefix similarity: length of the common leading run of UTF-16 code units
 * divided by the length of the longer string.
 *
 * Two empty strings are treated as identical.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Prefix similarity in [0, 1]
 */
function prefix(a, b) {
  const longest = Math.max(a.length, b.length);
  if (longest === 0) return 1;
  const limit = Math.min(a.length, b.length);
  let shared = 0;
  for (; shared < limit; shared += 1) {
    if (a.charCodeAt(shared) !== b.charCodeAt(shared)) break;
  }
  return shared / longest;
}
989
/**
 * Suffix similarity: length of the common trailing run of UTF-16 code
 * units divided by the length of the longer string.
 *
 * Two empty strings are treated as identical.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Suffix similarity in [0, 1]
 */
function suffix(a, b) {
  const longest = Math.max(a.length, b.length);
  if (longest === 0) return 1;
  // Walk both strings backwards from their last code unit.
  let i = a.length - 1;
  let j = b.length - 1;
  let shared = 0;
  while (i >= 0 && j >= 0 && a.charCodeAt(i) === b.charCodeAt(j)) {
    shared += 1;
    i -= 1;
    j -= 1;
  }
  return shared / longest;
}
1007
/**
 * Length-based similarity:
 * 1 - |len(a) - len(b)| / max(len(a), len(b)).
 *
 * Only the lengths are inspected, never the contents. Two empty strings
 * are treated as identical.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Normalized length similarity in [0, 1]
 */
function length(a, b) {
  const [shorter, longer] =
    a.length <= b.length ? [a.length, b.length] : [b.length, a.length];
  if (longer === 0) return 1;
  return 1 - (longer - shorter) / longer;
}
1021
+ //#endregion
448
1022
  //#region src/hash/simhash.ts
449
1023
  /**
450
1024
  * Generate a 64-bit fingerprint for a collection of features.
@@ -1131,4 +1705,4 @@ function findBestMatch(query, collection, options = {}) {
1131
1705
  return results.length > 0 ? results[0] : null;
1132
1706
  }
1133
1707
  //#endregion
1134
- export { DiffType, FuzzySearch, LSH, MinHash, SimHasher, cosine, cosineNgram, diff, findBestMatch, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
1708
+ export { CHAR_FREQ_SIZE, DiffType, FuzzySearch, LSH, MinHash, SimHasher, buildCharFreqArray, charFrequencyMap, combineHash, cosine, cosineNgram, damerauLevenshtein, damerauLevenshteinNormalized, diff, findBestMatch, fnv1a, frequencyMap, hamming, hammingDistance, hammingNormalized, hammingSimilarity, intersectCount, intersectCountInt, jaccard, jaccardNgram, jaro, jaroWinkler, lcsDistance, lcsLength, lcsNormalized, lcsPairs, lcsSubstringDistance, lcsSubstringLength, lcsSubstringNormalized, length, levenshtein, levenshteinNormalized, ngramFrequencyMap, ngrams, normalize, overlap, prefix, ratcliff, sift4, sift4Normalized, simhash, smithWaterman, smithWatermanNormalized, sorensen, sorensenNgram, stringEquals, suffix, totalCount, totalCountInt, tversky, unionCount, unionCountInt };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nlptools/distance",
3
- "version": "0.0.4",
3
+ "version": "0.0.5",
4
4
  "description": "Complete string distance and similarity algorithms package with WebAssembly and JavaScript implementations",
5
5
  "keywords": [
6
6
  "algorithms",