@nlptools/distance 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,6 +1,1134 @@
1
- export * from '@nlptools/distance-wasm';
2
- import { distance } from 'fastest-levenshtein';
3
-
4
- const fastest_levenshtein = distance;
5
-
6
- export { fastest_levenshtein };
1
+ import { distance } from "fastest-levenshtein";
2
+ import { lcs_dp, lcs_myers_linear_space, lcs_size_dp, lcs_size_myers_linear_space } from "@algorithm.ts/lcs";
3
+ import { DiffType, diff } from "@algorithm.ts/diff";
4
+ //#region src/utils.ts
5
/**
 * Split a string into overlapping character n-grams.
 *
 * @param str - Source text
 * @param n - Window size (default: 2, i.e. bigrams)
 * @returns Every length-n substring in order; empty when str.length < n
 */
function ngrams(str, n = 2) {
  const out = [];
  const last = str.length - n;
  let i = -1;
  while (++i <= last) out.push(str.substring(i, i + n));
  return out;
}
16
/**
 * Build an n-gram frequency map keyed by packed integers.
 * Packing two ASCII char codes into one number avoids substring
 * allocation and speeds up Map hashing: key = (c1 << 8) | c2.
 *
 * Only the bigram/ASCII fast path exists: returns null when n !== 2 or
 * any character is non-ASCII, signalling the caller to fall back to
 * string-keyed maps. Returns an empty map when the input is shorter
 * than n.
 */
function ngramFrequencyMap(str, n = 2) {
  const len = str.length;
  if (len < n) return new Map();
  if (n !== 2) return null;
  const counts = new Map();
  let i = 0;
  while (i <= len - 2) {
    const hi = str.charCodeAt(i);
    const lo = str.charCodeAt(i + 1);
    if (hi > 127 || lo > 127) return null;
    const packed = (hi << 8) | lo;
    counts.set(packed, (counts.get(packed) ?? 0) + 1);
    i++;
  }
  return counts;
}
40
/**
 * Count occurrences of each token in an iterable (a multiset/Counter),
 * mirroring the behavior of Rust textdistance's Counter.
 */
function frequencyMap(tokens) {
  const counts = new Map();
  for (const token of tokens) {
    const prev = counts.get(token);
    counts.set(token, prev === undefined ? 1 : prev + 1);
  }
  return counts;
}
49
/**
 * Character-level frequency map (iterates Unicode code points) —
 * the default tokenization strategy used by textdistance.
 */
function charFrequencyMap(str) {
  const counts = new Map();
  for (const ch of str) counts.set(ch, (counts.get(ch) ?? 0) + 1);
  return counts;
}
56
/**
 * Accumulate ASCII char-code counts of `str` into `arr` (indexed by
 * char code). Bails out with false on the first non-ASCII character
 * (code >= 128), possibly leaving the array partially filled.
 * Callers must zero `arr` beforehand.
 */
function buildCharFreqArray(arr, str) {
  const n = str.length;
  for (let idx = 0; idx < n; idx++) {
    const c = str.charCodeAt(idx);
    if (c > 127) return false;
    arr[c] += 1;
  }
  return true;
}
69
/**
 * Multiset intersection size of two frequency maps:
 * the sum over shared keys of min(countA, countB).
 */
function intersectCount(a, b) {
  let small = a;
  let big = b;
  if (small.size > big.size) {
    small = b;
    big = a;
  }
  let total = 0;
  for (const [key, n] of small) {
    const m = big.get(key);
    if (m !== undefined) total += Math.min(n, m);
  }
  return total;
}
82
/**
 * Multiset union size of two frequency maps:
 * the sum over all keys of max(countA, countB).
 */
function unionCount(a, b) {
  let small = a;
  let big = b;
  if (small.size > big.size) {
    small = b;
    big = a;
  }
  let total = 0;
  for (const [key, n] of small) {
    const m = big.get(key);
    total += m === undefined ? n : Math.max(n, m);
  }
  for (const [key, m] of big) {
    if (!small.has(key)) total += m;
  }
  return total;
}
97
/**
 * Total token count of a frequency map (sum of all counts).
 */
function totalCount(map) {
  let sum = 0;
  map.forEach((n) => {
    sum += n;
  });
  return sum;
}
105
/**
 * Multiset intersection size for integer-keyed frequency maps
 * (same contract as intersectCount).
 */
function intersectCountInt(a, b) {
  const [small, big] = a.size > b.size ? [b, a] : [a, b];
  let total = 0;
  for (const [key, n] of small) {
    const m = big.get(key);
    if (m !== undefined) total += n < m ? n : m;
  }
  return total;
}
114
/**
 * Multiset union size for integer-keyed frequency maps
 * (same contract as unionCount).
 */
function unionCountInt(a, b) {
  const [small, big] = a.size > b.size ? [b, a] : [a, b];
  let total = 0;
  for (const [key, n] of small) {
    const m = big.get(key);
    if (m === undefined) total += n;
    else total += n > m ? n : m;
  }
  for (const [key, m] of big) if (!small.has(key)) total += m;
  return total;
}
125
/**
 * Total token count of an integer-keyed frequency map.
 */
function totalCountInt(map) {
  let sum = 0;
  for (const n of map.values()) sum = sum + n;
  return sum;
}
130
/**
 * Convert a raw distance into a similarity score in [0, 1].
 * A zero maximum (e.g. two empty inputs) counts as a perfect match.
 *
 * @param distance - Raw distance value
 * @param maxDistance - Largest possible distance (usually max(len(a), len(b)))
 */
function normalize(distance, maxDistance) {
  if (maxDistance === 0) return 1;
  const raw = 1 - distance / maxDistance;
  return Math.max(0, raw);
}
140
/**
 * 32-bit FNV-1a string hash (offset basis 2166136261, prime 16777619).
 * Fast with good dispersion; used as the default feature hash.
 */
function fnv1a(str) {
  let h = 0x811c9dc5;
  for (let i = 0; i < str.length; i++) {
    h = Math.imul(h ^ str.charCodeAt(i), 0x01000193);
  }
  return h >>> 0;
}
151
+ //#endregion
152
+ //#region src/edit/levenshtein.ts
153
/**
 * Compute the Levenshtein edit distance between two strings.
 *
 * Thin wrapper that delegates to `distance` from the
 * `fastest-levenshtein` package.
 *
 * Time: O(m * n), Space: O(min(m, n))
 *
 * @param a - First string
 * @param b - Second string
 * @returns Edit distance (non-negative integer)
 */
function levenshtein(a, b) {
  return distance(a, b);
}
165
/**
 * Levenshtein similarity scaled into [0, 1] by dividing the edit
 * distance by the longer input's length; two empty strings score 1.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Similarity score where 1 means identical
 */
function levenshteinNormalized(a, b) {
  const longest = a.length > b.length ? a.length : b.length;
  return normalize(levenshtein(a, b), longest);
}
175
+ //#endregion
176
+ //#region src/edit/lcs.ts
177
/**
 * Internal helper: create an index-based equality callback over two
 * strings using pre-extracted char-code arrays, avoiding repeated
 * string indexing inside the hot LCS loop.
 *
 * Fix: the buffers are Uint16Array, not Uint8Array. charCodeAt()
 * returns UTF-16 code units in [0, 65535]; storing them in a
 * Uint8Array truncated to the low byte, so distinct characters such
 * as "\u0101" (0x101) and "\u0201" (0x201) compared as equal and the
 * LCS results were wrong for non-Latin1 text.
 */
function stringEquals(a, b) {
  const ca = new Uint16Array(a.length);
  const cb = new Uint16Array(b.length);
  for (let i = 0; i < a.length; i++) ca[i] = a.charCodeAt(i);
  for (let i = 0; i < b.length; i++) cb[i] = b.charCodeAt(i);
  return (x, y) => ca[x] === cb[y];
}
188
/**
 * LCS distance: the number of insertions/deletions needed to turn `a`
 * into `b`, i.e. len(a) + len(b) - 2 * |LCS(a, b)|.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' (default, better for sparse diffs) | 'dp'
 * @returns LCS distance (non-negative integer)
 */
function lcsDistance(a, b, algorithm = "myers") {
  const sizeOf = algorithm === "dp" ? lcs_size_dp : lcs_size_myers_linear_space;
  const common = sizeOf(a.length, b.length, stringEquals(a, b));
  return a.length + b.length - 2 * common;
}
200
/**
 * LCS similarity scaled into [0, 1] by dividing the LCS distance by
 * the combined length of both inputs.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' | 'dp'
 * @returns Similarity score where 1 means identical
 */
function lcsNormalized(a, b, algorithm = "myers") {
  const total = a.length + b.length;
  return normalize(lcsDistance(a, b, algorithm), total);
}
211
/**
 * Length of the Longest Common Subsequence of `a` and `b`.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' | 'dp'
 * @returns LCS length
 */
function lcsLength(a, b, algorithm = "myers") {
  const impl = algorithm === "dp" ? lcs_size_dp : lcs_size_myers_linear_space;
  return impl(a.length, b.length, stringEquals(a, b));
}
222
/**
 * Matching index pairs of the Longest Common Subsequence of `a` and `b`.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' | 'dp'
 * @returns Array of [indexInA, indexInB] pairs
 */
function lcsPairs(a, b, algorithm = "myers") {
  const impl = algorithm === "dp" ? lcs_dp : lcs_myers_linear_space;
  return impl(a.length, b.length, stringEquals(a, b));
}
233
+ //#endregion
234
+ //#region src/token/jaccard.ts
235
const _freqA$2 = new Int32Array(128);
const _freqB$2 = new Int32Array(128);
/**
 * Jaccard similarity over character multisets:
 * J(A, B) = |A ∩ B| / |A ∪ B|, using Counter (frequency map)
 * semantics to match the textdistance crate. Two empty strings
 * score 1.
 *
 * ASCII-only inputs take an allocation-free array fast path; any
 * non-ASCII character falls back to Map-based counters.
 *
 * Time: O(m + n)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Jaccard similarity in [0, 1]
 */
function jaccard(a, b) {
  _freqA$2.fill(0);
  _freqB$2.fill(0);
  const asciiA = buildCharFreqArray(_freqA$2, a);
  if (asciiA && buildCharFreqArray(_freqB$2, b)) {
    let inter = 0;
    let union = 0;
    for (let code = 0; code < 128; code++) {
      const x = _freqA$2[code];
      const y = _freqB$2[code];
      inter += x < y ? x : y;
      union += x > y ? x : y;
    }
    return union === 0 ? 1 : inter / union;
  }
  const countsA = charFrequencyMap(a);
  const countsB = charFrequencyMap(b);
  const inter = intersectCount(countsA, countsB);
  const union = unionCount(countsA, countsB);
  return union === 0 ? 1 : inter / union;
}
272
/**
 * Jaccard similarity over character n-grams (default bigrams).
 * Uses the packed-integer n-gram fast path when both inputs are
 * ASCII bigrams; otherwise falls back to string-keyed counters.
 *
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
 * @returns N-gram Jaccard similarity in [0, 1]
 */
function jaccardNgram(a, b, n = 2) {
  const fastA = ngramFrequencyMap(a, n);
  const fastB = ngramFrequencyMap(b, n);
  if (fastA !== null && fastB !== null) {
    const inter = intersectCountInt(fastA, fastB);
    const union = unionCountInt(fastA, fastB);
    return union === 0 ? 1 : inter / union;
  }
  const countsA = frequencyMap(ngrams(a, n));
  const countsB = frequencyMap(ngrams(b, n));
  const union = unionCount(countsA, countsB);
  return union === 0 ? 1 : intersectCount(countsA, countsB) / union;
}
295
+ //#endregion
296
+ //#region src/token/cosine.ts
297
const _freqA$1 = new Int32Array(128);
const _freqB$1 = new Int32Array(128);
/**
 * Cosine similarity over character multisets:
 * cos(A, B) = (A · B) / (|A| * |B|), using Counter (frequency map)
 * semantics.
 *
 * ASCII-only inputs take an allocation-free array fast path; any
 * non-ASCII character falls back to Map-based counters.
 *
 * Edge cases: both strings empty -> 1; exactly one empty -> 0.
 * (Previously a zero denominator always returned 1, so cosine("", "x")
 * was 1 while jaccard("", "x") and sorensen("", "x") were 0; the
 * functions now agree.)
 *
 * Time: O(m + n)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Cosine similarity in [0, 1]
 */
function cosine(a, b) {
  _freqA$1.fill(0);
  _freqB$1.fill(0);
  if (buildCharFreqArray(_freqA$1, a) && buildCharFreqArray(_freqB$1, b)) {
    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < 128; i++) {
      const va = _freqA$1[i];
      const vb = _freqB$1[i];
      dot += va * vb;
      normA += va * va;
      normB += vb * vb;
    }
    // Zero norm means an empty side: identical only when both are empty.
    if (normA === 0 || normB === 0) return normA === normB ? 1 : 0;
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }
  const freqAMap = charFrequencyMap(a);
  const freqBMap = charFrequencyMap(b);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  // Iterate the smaller map for the dot product; each norm is computed
  // over its own map, so no post-hoc swapping is needed (cosine is
  // symmetric in the two norms anyway).
  const [smaller, larger] = freqAMap.size <= freqBMap.size ? [freqAMap, freqBMap] : [freqBMap, freqAMap];
  for (const [token, countS] of smaller) {
    dot += countS * (larger.get(token) ?? 0);
    normA += countS * countS;
  }
  for (const [, countL] of larger) normB += countL * countL;
  if (normA === 0 || normB === 0) return normA === normB ? 1 : 0;
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
351
/**
 * Cosine similarity over character n-grams (default bigrams).
 * Uses the packed-integer n-gram fast path when both inputs are
 * ASCII bigrams; otherwise falls back to string-keyed counters.
 *
 * Edge cases: neither string has any n-gram -> 1; exactly one side has
 * none -> 0. (Previously a zero denominator always returned 1, which
 * was inconsistent with jaccardNgram() returning 0 for the same inputs.)
 *
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
 * @returns N-gram cosine similarity in [0, 1]
 */
function cosineNgram(a, b, n = 2) {
  const fastA = ngramFrequencyMap(a, n);
  const fastB = ngramFrequencyMap(b, n);
  if (fastA !== null && fastB !== null) {
    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (const [id, countA] of fastA) {
      dot += countA * (fastB.get(id) ?? 0);
      normA += countA * countA;
    }
    for (const [, countB] of fastB) normB += countB * countB;
    if (normA === 0 || normB === 0) return normA === normB ? 1 : 0;
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }
  const freqA = frequencyMap(ngrams(a, n));
  const freqB = frequencyMap(ngrams(b, n));
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (const [token, countA] of freqA) {
    dot += countA * (freqB.get(token) ?? 0);
    normA += countA * countA;
  }
  for (const [, countB] of freqB) normB += countB * countB;
  if (normA === 0 || normB === 0) return normA === normB ? 1 : 0;
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
390
+ //#endregion
391
+ //#region src/token/sorensen.ts
392
const _freqA = new Int32Array(128);
const _freqB = new Int32Array(128);
/**
 * Sørensen–Dice coefficient over character multisets:
 * DSC(A, B) = 2 * |A ∩ B| / (|A| + |B|), using Counter (frequency map)
 * semantics to match the textdistance crate. Two empty strings score 1.
 *
 * ASCII-only inputs take an allocation-free array fast path; any
 * non-ASCII character falls back to Map-based counters.
 *
 * Time: O(m + n)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Sørensen-Dice coefficient in [0, 1]
 */
function sorensen(a, b) {
  _freqA.fill(0);
  _freqB.fill(0);
  if (buildCharFreqArray(_freqA, a) && buildCharFreqArray(_freqB, b)) {
    let inter = 0;
    for (let code = 0; code < 128; code++) {
      const x = _freqA[code];
      const y = _freqB[code];
      inter += x < y ? x : y;
    }
    const combined = a.length + b.length;
    return combined === 0 ? 1 : (2 * inter) / combined;
  }
  const countsA = charFrequencyMap(a);
  const countsB = charFrequencyMap(b);
  const combined = totalCount(countsA) + totalCount(countsB);
  return combined === 0 ? 1 : (2 * intersectCount(countsA, countsB)) / combined;
}
424
/**
 * Sørensen–Dice coefficient over character n-grams (default bigrams).
 * Uses the packed-integer n-gram fast path when both inputs are
 * ASCII bigrams; otherwise falls back to string-keyed counters.
 *
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
 * @returns N-gram Sørensen-Dice coefficient in [0, 1]
 */
function sorensenNgram(a, b, n = 2) {
  const fastA = ngramFrequencyMap(a, n);
  const fastB = ngramFrequencyMap(b, n);
  if (fastA !== null && fastB !== null) {
    const inter = intersectCountInt(fastA, fastB);
    const combined = totalCountInt(fastA) + totalCountInt(fastB);
    return combined === 0 ? 1 : (2 * inter) / combined;
  }
  const countsA = frequencyMap(ngrams(a, n));
  const countsB = frequencyMap(ngrams(b, n));
  const combined = totalCount(countsA) + totalCount(countsB);
  return combined === 0 ? 1 : (2 * intersectCount(countsA, countsB)) / combined;
}
447
+ //#endregion
448
+ //#region src/hash/simhash.ts
449
/**
 * Generate a `bits`-wide SimHash fingerprint for a collection of features.
 *
 * SimHash maps a set of features to a fixed-length binary fingerprint such
 * that similar feature sets produce fingerprints with small Hamming
 * distance. Each feature contributes +1 to vector position i when bit i of
 * its hash is set, otherwise -1; fingerprint bit i is 1 iff the total is
 * positive. All features are weighted equally (weight 1); there is no
 * per-feature weight parameter.
 *
 * Fix: `hashFn` produces a 32-bit value, and JavaScript shift counts are
 * taken modulo 32, so the previous `h & (1 << i)` made fingerprint bits
 * 32..63 exact copies of bits 0..31 for the default 64-bit width. Bit
 * positions >= 32 now draw their bits from successive deterministic
 * remixes (murmur3-finalizer style) of the hash, so the two halves of a
 * 64-bit fingerprint are independent. Fingerprints for bits <= 32 are
 * unchanged.
 *
 * Time: O(features * bits)
 *
 * @param features - Array of feature strings (e.g., words, n-grams, shingles)
 * @param options - { bits?: number (default 64), hashFn?: (s) => 32-bit int }
 * @returns Fingerprint as a bigint
 */
function simhash(features, options = {}) {
  const bits = options.bits ?? 64;
  const hashFn = options.hashFn ?? fnv1a;
  const v = new Float64Array(bits);
  for (const feature of features) {
    let word = hashFn(feature) >>> 0;
    for (let i = 0; i < bits; i++) {
      const r = i & 31;
      if (r === 0 && i !== 0) {
        // Derive the next 32-bit word deterministically from the previous one.
        word = Math.imul(word ^ (word >>> 16), 2246822507) >>> 0;
        word = Math.imul(word ^ (word >>> 13), 3266489909) >>> 0;
        word = (word ^ (word >>> 16)) >>> 0;
      }
      v[i] += (word >>> r) & 1 ? 1 : -1;
    }
  }
  let fingerprint = 0n;
  for (let i = 0; i < bits; i++) if (v[i] > 0) fingerprint |= 1n << BigInt(i);
  return fingerprint;
}
481
/**
 * Hamming distance between two SimHash fingerprints: the number of bit
 * positions where they differ. For 64-bit fingerprints, a distance <= 3
 * typically indicates near-duplicate content.
 *
 * Time: O(bits)
 *
 * @param a - First fingerprint (bigint)
 * @param b - Second fingerprint (bigint)
 * @returns Hamming distance (non-negative integer)
 */
function hammingDistance(a, b) {
  return bitCount(a ^ b);
}
/**
 * Normalized Hamming similarity in [0, 1].
 *
 * @param a - First fingerprint
 * @param b - Second fingerprint
 * @param bits - Bit length of the fingerprints (default: 64)
 */
function hammingSimilarity(a, b, bits = 64) {
  return 1 - hammingDistance(a, b) / bits;
}
/**
 * Byte-wise popcount lookup table: POPCOUNT_TABLE[x] is the number of
 * set bits in byte x. Lets bitCount consume a bigint 8 bits per step
 * instead of 1, reducing iterations from 64 to 8.
 */
const POPCOUNT_TABLE = new Uint8Array(256);
for (let i = 1; i < 256; i++) POPCOUNT_TABLE[i] = POPCOUNT_TABLE[i >> 1] + (i & 1);
/**
 * Number of set bits in a non-negative bigint.
 */
function bitCount(n) {
  let total = 0;
  for (let rest = n; rest > 0n; rest >>= 8n) {
    total += POPCOUNT_TABLE[Number(rest & 255n)];
  }
  return total;
}
520
/**
 * Convenience wrapper around simhash() for document fingerprinting.
 *
 * @example
 * ```ts
 * const hasher = new SimHasher();
 * const fp1 = hasher.hash(['hello', 'world']);
 * const fp2 = hasher.hash(['hello', 'earth']);
 * console.log(hasher.distance(fp1, fp2)); // small number = similar
 * ```
 */
var SimHasher = class {
  bits;
  hashFn;
  constructor(options = {}) {
    this.bits = options.bits ?? 64;
    this.hashFn = options.hashFn ?? fnv1a;
  }
  /** Generate a fingerprint from features. */
  hash(features) {
    const { bits, hashFn } = this;
    return simhash(features, { bits, hashFn });
  }
  /** Hamming distance between two fingerprints. */
  distance(a, b) {
    return hammingDistance(a, b);
  }
  /** Normalized similarity between two fingerprints. */
  similarity(a, b) {
    return hammingSimilarity(a, b, this.bits);
  }
  /**
   * Whether two fingerprints are likely near-duplicates.
   *
   * @param threshold - Maximum Hamming distance counted as duplicate (default: 3)
   */
  isDuplicate(a, b, threshold = 3) {
    return this.distance(a, b) <= threshold;
  }
};
568
+ //#endregion
569
+ //#region src/hash/minhash.ts
570
+ /**
571
+ * MinHash estimator for Jaccard similarity.
572
+ *
573
+ * Instead of computing the exact Jaccard index (which requires set intersection/union
574
+ * on potentially large sets), MinHash generates a fixed-size signature for each set.
575
+ * The Jaccard similarity is then estimated by comparing the fraction of matching
576
+ * positions in the signatures.
577
+ *
578
+ * Time:
579
+ * - Update: O(k) per element, where k = numHashes
580
+ * - Estimate: O(k)
581
+ *
582
+ * @example
583
+ * ```ts
584
+ * const mh = new MinHash({ numHashes: 128 });
585
+ * mh.update('hello');
586
+ * mh.update('world');
587
+ * const sig1 = mh.digest();
588
+ *
589
+ * const mh2 = new MinHash({ numHashes: 128 });
590
+ * mh2.update('hello');
591
+ * mh2.update('earth');
592
+ * const sig2 = mh2.digest();
593
+ *
594
+ * console.log(MinHash.estimate(sig1, sig2)); // ~0.67
595
+ * ```
596
+ */
597
+ var MinHash = class MinHash {
598
+ numHashes;
599
+ hashParams;
600
+ maxHash;
601
+ signature;
602
+ dirty;
603
+ constructor(options = {}) {
604
+ this.numHashes = options.numHashes ?? 128;
605
+ const seed = options.seed ?? 42;
606
+ const p = 4294967311;
607
+ this.maxHash = p - 1;
608
+ this.hashParams = [];
609
+ let rng = seed;
610
+ for (let i = 0; i < this.numHashes; i++) {
611
+ rng = rng * 1103515245 + 12345 & 2147483647;
612
+ const a = rng % (p - 1) + 1;
613
+ rng = rng * 1103515245 + 12345 & 2147483647;
614
+ const b = rng % p;
615
+ this.hashParams.push({
616
+ a,
617
+ b,
618
+ p
619
+ });
620
+ }
621
+ this.signature = new Uint32Array(this.numHashes).fill(4294967295);
622
+ this.dirty = false;
623
+ }
624
+ /**
625
+ * Add a feature to the set.
626
+ */
627
+ update(feature) {
628
+ const h = fnv1a(feature);
629
+ for (let i = 0; i < this.numHashes; i++) {
630
+ const { a, b, p } = this.hashParams[i];
631
+ const hash = ((a * h + b) % p + p) % p;
632
+ if (hash < this.signature[i]) this.signature[i] = hash;
633
+ }
634
+ this.dirty = true;
635
+ }
636
+ /**
637
+ * Get the MinHash signature.
638
+ * The signature is a fixed-size array that represents the set.
639
+ */
640
+ digest() {
641
+ return new Uint32Array(this.signature);
642
+ }
643
+ /**
644
+ * Estimate Jaccard similarity between two MinHash signatures.
645
+ *
646
+ * @param sig1 - First MinHash signature
647
+ * @param sig2 - Second MinHash signature
648
+ * @returns Estimated Jaccard similarity in [0, 1]
649
+ */
650
+ static estimate(sig1, sig2) {
651
+ if (sig1.length !== sig2.length) throw new Error("Signature lengths must match");
652
+ let matches = 0;
653
+ for (let i = 0; i < sig1.length; i++) if (sig1[i] === sig2[i]) matches++;
654
+ return matches / sig1.length;
655
+ }
656
+ /**
657
+ * Estimate Jaccard similarity between this and another MinHash instance.
658
+ */
659
+ estimate(other) {
660
+ return MinHash.estimate(this.digest(), other.digest());
661
+ }
662
+ };
663
+ //#endregion
664
+ //#region src/hash/lsh.ts
665
/**
 * LSH (Locality-Sensitive Hashing) index for fast approximate nearest neighbor search.
 *
 * Uses the MinHash + banding technique:
 * 1. Divide each MinHash signature into `numBands` bands
 * 2. Hash each band to a bucket
 * 3. Items sharing at least one bucket are candidates for similarity
 *
 * The probability of two items with Jaccard similarity `s` being compared is:
 * P = 1 - (1 - s^r)^b
 * where r = rows per band, b = numBands.
 *
 * @example
 * ```ts
 * const lsh = new LSH({ numBands: 16, numHashes: 128 });
 *
 * // Index documents
 * const mh1 = new MinHash({ numHashes: 128 });
 * mh1.update('hello');
 * mh1.update('world');
 * lsh.insert('doc1', mh1.digest());
 *
 * // Query for similar documents
 * const mh3 = new MinHash({ numHashes: 128 });
 * mh3.update('hello');
 * const candidates = lsh.query(mh3.digest());
 * ```
 */
var LSH = class {
  // Number of bands each signature is divided into.
  numBands;
  // Hash positions per band = floor(numHashes / numBands).
  // NOTE(review): when numHashes % numBands !== 0, the trailing
  // signature positions belong to no band and never influence bucketing.
  rowsPerBand;
  // Expected signature length; validated on insert/query.
  numHashes;
  /**
   * Map from band index → bucket hash → set of document IDs
   */
  bands;
  /**
   * All indexed document signatures for exact similarity estimation.
   */
  signatures;
  constructor(options = {}) {
    this.numHashes = options.numHashes ?? 128;
    this.numBands = options.numBands ?? 16;
    this.rowsPerBand = Math.floor(this.numHashes / this.numBands);
    if (this.numBands > this.numHashes) throw new Error("numBands must be <= numHashes");
    this.bands = [];
    for (let i = 0; i < this.numBands; i++) this.bands.push(/* @__PURE__ */ new Map());
    this.signatures = /* @__PURE__ */ new Map();
  }
  /**
   * Insert a document into the index.
   *
   * NOTE(review): re-inserting an existing id overwrites the stored
   * signature but leaves the id in buckets derived from the old
   * signature; call remove(id) first when re-indexing.
   *
   * @param id - Document identifier
   * @param signature - MinHash signature (from MinHash.digest())
   * @throws {Error} when signature.length !== numHashes
   */
  insert(id, signature) {
    if (signature.length !== this.numHashes) throw new Error(`Signature length ${signature.length} does not match numHashes ${this.numHashes}`);
    this.signatures.set(id, signature);
    for (let band = 0; band < this.numBands; band++) {
      const start = band * this.rowsPerBand;
      const end = start + this.rowsPerBand;
      const bucketKey = bandHash(signature.slice(start, end));
      let bucket = this.bands[band].get(bucketKey);
      if (!bucket) {
        bucket = /* @__PURE__ */ new Set();
        this.bands[band].set(bucketKey, bucket);
      }
      bucket.add(id);
    }
  }
  /**
   * Query for candidate documents similar to the given signature.
   * Candidates are gathered from every band bucket the query hits, then
   * ranked by the exact MinHash estimate against the stored signatures.
   *
   * @param signature - Query MinHash signature
   * @param threshold - Optional: minimum Jaccard similarity to return (default: return all candidates)
   * @returns Array of [docId, estimatedJaccard] pairs, sorted by similarity descending
   * @throws {Error} when signature.length !== numHashes
   */
  query(signature, threshold) {
    if (signature.length !== this.numHashes) throw new Error(`Signature length ${signature.length} does not match numHashes ${this.numHashes}`);
    const candidates = /* @__PURE__ */ new Set();
    for (let band = 0; band < this.numBands; band++) {
      const start = band * this.rowsPerBand;
      const end = start + this.rowsPerBand;
      const bucketKey = bandHash(signature.slice(start, end));
      const bucket = this.bands[band].get(bucketKey);
      if (bucket) for (const id of bucket) candidates.add(id);
    }
    const results = [];
    for (const id of candidates) {
      const sig = this.signatures.get(id);
      const similarity = MinHash.estimate(signature, sig);
      if (threshold === void 0 || similarity >= threshold) results.push([id, similarity]);
    }
    results.sort((a, b) => b[1] - a[1]);
    return results;
  }
  /**
   * Remove a document from the index.
   * Recomputes the document's band buckets from its stored signature and
   * deletes any bucket left empty.
   *
   * @returns true if the document was present and removed
   */
  remove(id) {
    const sig = this.signatures.get(id);
    if (!sig) return false;
    this.signatures.delete(id);
    for (let band = 0; band < this.numBands; band++) {
      const start = band * this.rowsPerBand;
      const end = start + this.rowsPerBand;
      const bucketKey = bandHash(sig.slice(start, end));
      const bucket = this.bands[band].get(bucketKey);
      if (bucket) {
        bucket.delete(id);
        if (bucket.size === 0) this.bands[band].delete(bucketKey);
      }
    }
    return true;
  }
  /**
   * Get the number of indexed documents.
   */
  get size() {
    return this.signatures.size;
  }
};
793
/**
 * Hash one band slice of a signature into a compact bucket key.
 * Classic 31-multiplier polynomial rolling hash over the band values,
 * truncated to int32 each step and rendered in base 36.
 */
function bandHash(slice) {
  let acc = 0;
  for (const value of slice) acc = (acc * 31 + value) | 0;
  return acc.toString(36);
}
802
+ //#endregion
803
+ //#region src/search.ts
804
// Name → similarity-function registry consumed by resolveAlgorithm().
// Every entry maps two strings to a score in [0, 1]; the *Ngram variants
// additionally accept an optional n-gram size.
const BUILTIN_ALGORITHMS = {
  levenshtein: levenshteinNormalized,
  lcs: lcsNormalized,
  jaccard,
  jaccardNgram,
  cosine,
  cosineNgram,
  sorensen,
  sorensenNgram
};
814
/**
 * Attach a normalizedWeight to each search key so all weights sum to 1.
 * Missing weights default to 1; if the weights sum to 0, every key
 * receives an equal share.
 */
function resolveKeys(rawKeys) {
  if (rawKeys.length === 0) return [];
  let totalWeight = 0;
  for (const k of rawKeys) totalWeight += k.weight ?? 1;
  const equalShare = 1 / rawKeys.length;
  return rawKeys.map((k) => ({
    ...k,
    normalizedWeight: totalWeight > 0 ? (k.weight ?? 1) / totalWeight : equalShare
  }));
}
822
/**
 * Resolve an algorithm option to a similarity function.
 * Accepts a builtin algorithm name, a custom function, or undefined
 * (defaults to normalized Levenshtein).
 *
 * @throws {Error} when a name does not match a builtin algorithm.
 *   Previously an unknown name silently returned undefined (causing an
 *   opaque TypeError at first search), and inherited Object.prototype
 *   keys such as "constructor" resolved to non-algorithm functions;
 *   Object.hasOwn guards against both.
 */
function resolveAlgorithm(algo) {
  if (algo === void 0) return BUILTIN_ALGORITHMS.levenshtein;
  if (typeof algo === "function") return algo;
  if (!Object.hasOwn(BUILTIN_ALGORITHMS, algo)) throw new Error(`Unknown algorithm: ${String(algo)}`);
  return BUILTIN_ALGORITHMS[algo];
}
827
+ /**
828
+ * Fuzzy search engine for finding similar items in a collection.
829
+ *
830
+ * Supports both string arrays and object arrays with weighted multi-key search.
831
+ * Uses any similarity algorithm from @nlptools/distance, with optional LSH
832
+ * acceleration for large datasets.
833
+ *
834
+ * @example
835
+ * ```ts
836
+ * // String array search
837
+ * const search = new FuzzySearch(["apple", "banana", "cherry"]);
838
+ * const results = search.search("aple"); // [{ item: "apple", score: 0.75, index: 0 }]
839
+ *
840
+ * // Object array with weighted keys
841
+ * const books = [
842
+ * { title: "Old Man's War", author: "John Scalzi" },
843
+ * { title: "The Lock Artist", author: "Steve Hamilton" },
844
+ * ];
845
+ * const bookSearch = new FuzzySearch(books, {
846
+ * keys: [
847
+ * { name: "title", weight: 0.7 },
848
+ * { name: "author", weight: 0.3 },
849
+ * ],
850
+ * algorithm: "cosine",
851
+ * });
852
+ * const results = bookSearch.search("old man"); // finds "Old Man's War"
853
+ * ```
854
+ */
855
/**
 * Fuzzy search engine over a collection of strings or objects.
 *
 * Each item is scored against a query with a pluggable string-similarity
 * function (resolved via `resolveAlgorithm`). For object collections,
 * weighted keys (resolved via `resolveKeys`) contribute to a combined
 * score. When the `lsh` option is supplied, a MinHash + LSH index is used
 * to pre-filter candidates instead of scanning the whole collection.
 */
var FuzzySearch = class {
  similarityFn;        // (a, b) => similarity score in the algorithm's range
  keys;                // resolved key descriptors (name, optional getter, normalizedWeight)
  threshold;           // minimum score for a result to be kept
  limit;               // default maximum number of results (Infinity = unbounded)
  caseSensitive;       // when false, strings are lower-cased before comparison
  includeMatchDetails; // when true, per-key scores are attached to results
  isObjectArray;       // true when searching objects via `keys`
  collection;          // defensive copy of the searched items
  useLSH;              // true when an `lsh` options object was provided
  lshNumHashes;        // MinHash signature length
  lshNumBands;         // LSH banding parameter
  lshIndex;            // LSH instance, or null when not (yet) built
  minHashSignatures;   // collection index -> MinHash signature
  /**
   * @param collection - Array of strings or objects to search
   * @param options - Search configuration: `algorithm`, `keys`, `threshold`,
   *   `limit`, `caseSensitive`, `includeMatchDetails`, `lsh`
   */
  constructor(collection, options = {}) {
    this.similarityFn = resolveAlgorithm(options.algorithm);
    this.keys = resolveKeys(options.keys ?? []);
    this.isObjectArray = this.keys.length > 0;
    this.threshold = options.threshold ?? 0;
    this.limit = options.limit ?? Infinity;
    this.caseSensitive = options.caseSensitive ?? false;
    this.includeMatchDetails = options.includeMatchDetails ?? false;
    this.collection = [...collection];
    const lshOpts = options.lsh;
    this.useLSH = lshOpts !== void 0;
    this.lshNumHashes = lshOpts?.numHashes ?? 128;
    this.lshNumBands = lshOpts?.numBands ?? 16;
    this.lshIndex = null;
    this.minHashSignatures = /* @__PURE__ */ new Map();
    if (this.useLSH && this.collection.length > 0) this.buildLSHIndex();
  }
  /**
   * Search the collection for items similar to the query.
   *
   * @param query - The search query string
   * @param limit - Optional per-query limit override
   * @returns Array of results sorted by score descending
   */
  search(query, limit) {
    const effectiveLimit = limit ?? this.limit;
    if (effectiveLimit === 0 || this.collection.length === 0) return [];
    const normalizedQuery = this.normalizeString(query);
    if (this.useLSH && this.lshIndex !== null) return this.searchWithLSH(normalizedQuery, effectiveLimit);
    return this.searchLinear(normalizedQuery, effectiveLimit);
  }
  /**
   * Add an item to the collection.
   * If LSH is enabled, the index is updated incrementally.
   */
  add(item) {
    const index = this.collection.length;
    this.collection.push(item);
    if (!this.useLSH) return;
    // FIX: when constructed with an empty collection, lshIndex is still null
    // here; previously the item was never indexed and LSH silently never
    // activated. Build the index lazily on first insertion instead.
    if (this.lshIndex === null) {
      this.buildLSHIndex();
      return;
    }
    const sig = this.buildMinHashSignature(this.extractSearchText(item));
    this.minHashSignatures.set(index, sig);
    this.lshIndex.insert(String(index), sig);
  }
  /**
   * Remove an item from the collection by index.
   * If LSH is enabled, the index is rebuilt (O(n)) because all later
   * items shift down by one position.
   *
   * @returns true if the item was found and removed
   */
  remove(index) {
    if (index < 0 || index >= this.collection.length) return false;
    this.collection.splice(index, 1);
    if (this.useLSH) this.buildLSHIndex();
    return true;
  }
  /**
   * Replace the entire collection.
   * If LSH is enabled, the index is rebuilt (or dropped when empty).
   */
  setCollection(collection) {
    this.collection = [...collection];
    if (this.useLSH && this.collection.length > 0) this.buildLSHIndex();
    else if (this.useLSH) {
      this.lshIndex = null;
      this.minHashSignatures.clear();
    }
  }
  /**
   * Get the current collection.
   */
  getCollection() {
    return this.collection;
  }
  /**
   * Get the number of items in the collection.
   */
  get size() {
    return this.collection.length;
  }
  /**
   * Clear the collection and any LSH index.
   */
  clear() {
    this.collection = [];
    this.lshIndex = null;
    this.minHashSignatures.clear();
  }
  // Full scan: score every item and keep those meeting the threshold.
  searchLinear(normalizedQuery, limit) {
    const candidates = [];
    for (let i = 0; i < this.collection.length; i++) this.collectCandidate(normalizedQuery, this.collection[i], i, candidates);
    return this.rankCandidates(candidates, limit);
  }
  // LSH path: only score items the index returns as probable neighbors.
  searchWithLSH(normalizedQuery, limit) {
    // FIX: the original computed `this.isObjectArray ? normalizedQuery :
    // normalizedQuery` — a dead ternary whose branches were identical.
    const querySig = this.buildMinHashSignature(normalizedQuery);
    const candidateIds = this.lshIndex.query(querySig, this.threshold);
    const candidates = [];
    for (const [id] of candidateIds) {
      const idx = parseInt(id, 10);
      if (idx < 0 || idx >= this.collection.length) continue;
      this.collectCandidate(normalizedQuery, this.collection[idx], idx, candidates);
    }
    return this.rankCandidates(candidates, limit);
  }
  // Shared scoring logic for both search paths (was duplicated verbatim).
  // Pushes a result object onto `candidates` when the score meets the threshold.
  collectCandidate(normalizedQuery, item, index, candidates) {
    if (this.isObjectArray) {
      if (this.includeMatchDetails) {
        const { score, matches } = this.computeDetailedScore(normalizedQuery, item);
        if (score >= this.threshold) candidates.push({
          item,
          score,
          index,
          matches
        });
      } else {
        const score = this.computeItemScore(normalizedQuery, item);
        if (score >= this.threshold) candidates.push({
          item,
          score,
          index
        });
      }
    } else {
      const itemStr = this.normalizeString(item);
      const score = this.similarityFn(normalizedQuery, itemStr);
      if (score >= this.threshold) candidates.push({
        item,
        score,
        index
      });
    }
  }
  // Sort by score descending and apply the result limit.
  rankCandidates(candidates, limit) {
    candidates.sort((a, b) => b.score - a.score);
    if (candidates.length <= limit) return candidates;
    return candidates.slice(0, limit);
  }
  // Rebuild the LSH index and all MinHash signatures from scratch.
  buildLSHIndex() {
    this.lshIndex = new LSH({
      numBands: this.lshNumBands,
      numHashes: this.lshNumHashes
    });
    this.minHashSignatures.clear();
    for (let i = 0; i < this.collection.length; i++) {
      const text = this.extractSearchText(this.collection[i]);
      const sig = this.buildMinHashSignature(text);
      this.minHashSignatures.set(i, sig);
      this.lshIndex.insert(String(i), sig);
    }
  }
  // MinHash signature over character bigrams of `text`.
  buildMinHashSignature(text) {
    const mh = new MinHash({ numHashes: this.lshNumHashes });
    const grams = ngrams(text, 2);
    for (const g of grams) mh.update(g);
    return mh.digest();
  }
  // Weighted sum of per-key similarities for an object item.
  computeItemScore(normalizedQuery, item) {
    let score = 0;
    for (const key of this.keys) {
      const value = this.extractKeyValue(item, key);
      const normalizedValue = this.normalizeString(value);
      score += key.normalizedWeight * this.similarityFn(normalizedQuery, normalizedValue);
    }
    return score;
  }
  // Same as computeItemScore, but also records each key's raw similarity.
  computeDetailedScore(normalizedQuery, item) {
    let score = 0;
    const matches = {};
    for (const key of this.keys) {
      const value = this.extractKeyValue(item, key);
      const normalizedValue = this.normalizeString(value);
      const s = this.similarityFn(normalizedQuery, normalizedValue);
      matches[key.name] = s;
      score += key.normalizedWeight * s;
    }
    return {
      score,
      matches
    };
  }
  // Text used to build an item's MinHash signature.
  extractSearchText(item) {
    // FIX: object items previously joined *raw* key values while the query
    // side is normalized, so with caseSensitive=false the signatures were
    // built from differently-cased text than the query, hurting LSH recall.
    // Normalize the joined text the same way the query is normalized.
    if (this.isObjectArray) return this.normalizeString(this.keys.map((k) => this.extractKeyValue(item, k)).join(" "));
    return this.normalizeString(item);
  }
  // Resolve a key's string value from an item; non-strings become "".
  extractKeyValue(item, key) {
    if (key.getter) {
      const value = key.getter(item);
      return typeof value === "string" ? value : "";
    }
    const value = item[key.name];
    return typeof value === "string" ? value : "";
  }
  // Lower-case unless the search is case-sensitive.
  normalizeString(str) {
    return this.caseSensitive ? str : str.toLowerCase();
  }
};
1090
+ /**
1091
+ * Find the single best match for a query against a collection.
1092
+ *
1093
+ * This is a convenience wrapper around {@link FuzzySearch} for one-shot queries.
1094
+ * For repeated searches against the same collection, prefer creating a
1095
+ * {@link FuzzySearch} instance directly.
1096
+ *
1097
+ * Time: O(n * k) where n = collection size, k = number of keys
1098
+ *
1099
+ * @param query - The search query string
1100
+ * @param collection - Array of strings or objects to search
1101
+ * @param options - Search configuration
1102
+ * @returns The best matching result, or null if nothing meets the threshold
1103
+ *
1104
+ * @example
1105
+ * ```ts
1106
+ * // String array
1107
+ * const result = findBestMatch("kitten", ["sitting", "kit", "mitten"]);
1108
+ * console.log(result?.item); // "kit"
1109
+ * console.log(result?.score); // 0.5
1110
+ *
1111
+ * // Object array with weighted keys
1112
+ * const books = [
1113
+ * { title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
1114
+ * { title: "Great Expectations", author: "Charles Dickens" },
1115
+ * ];
1116
+ * const result = findBestMatch("grate gatsbi", books, {
1117
+ * keys: [
1118
+ * { name: "title", weight: 0.7 },
1119
+ * { name: "author", weight: 0.3 },
1120
+ * ],
1121
+ * });
1122
+ * ```
1123
+ */
1124
+ function findBestMatch(query, collection, options = {}) {
1125
+ const results = new FuzzySearch(collection, {
1126
+ algorithm: options.algorithm,
1127
+ keys: options.keys,
1128
+ threshold: options.threshold,
1129
+ caseSensitive: options.caseSensitive
1130
+ }).search(query, 1);
1131
+ return results.length > 0 ? results[0] : null;
1132
+ }
1133
//#endregion
// Public API surface: string-similarity metrics (cosine/jaccard/sorensen and
// their n-gram variants, levenshtein, hamming, LCS), hashing primitives
// (MinHash, SimHash, LSH), diff utilities, and the FuzzySearch engine.
export { DiffType, FuzzySearch, LSH, MinHash, SimHasher, cosine, cosineNgram, diff, findBestMatch, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };