@nlptools/distance 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,6 +1,803 @@
1
- export * from '@nlptools/distance-wasm';
2
- import { distance } from 'fastest-levenshtein';
3
-
4
- const fastest_levenshtein = distance;
5
-
6
- export { fastest_levenshtein };
1
+ import { distance } from "fastest-levenshtein";
2
+ import { lcs_dp, lcs_myers_linear_space, lcs_size_dp, lcs_size_myers_linear_space } from "@algorithm.ts/lcs";
3
+ import { DiffType, diff } from "@algorithm.ts/diff";
4
+ //#region src/utils.ts
5
/**
 * Generate the list of character n-grams of a string.
 *
 * Returns an empty array when the string is shorter than `n`.
 *
 * @param str - Input string
 * @param n - N-gram size (default: 2 for bigrams)
 * @returns Array of overlapping substrings of length `n`
 */
function ngrams(str, n = 2) {
  const total = Math.max(0, str.length - n + 1);
  return Array.from({ length: total }, (_, i) => str.slice(i, i + n));
}
16
/**
 * Build an n-gram frequency map using integer-encoded keys.
 *
 * Packs two ASCII char codes into one number (`c1 << 8 | c2`) so the Map
 * hashes small integers instead of freshly allocated strings.
 *
 * @param str - Input string
 * @param n - N-gram size
 * @returns Empty Map when `str` is shorter than `n`; an integer-keyed
 *   frequency Map for ASCII bigrams; `null` when n !== 2 or any character
 *   is non-ASCII (caller falls back to string keys).
 */
function ngramFrequencyMap(str, n = 2) {
  const len = str.length;
  if (len < n) return new Map();
  if (n !== 2) return null;
  const counts = new Map();
  const last = len - 2;
  for (let i = 0; i <= last; i++) {
    const hi = str.charCodeAt(i);
    const lo = str.charCodeAt(i + 1);
    // Fast path is ASCII-only; bail out so the caller can use string keys.
    if (hi > 127 || lo > 127) return null;
    const key = (hi << 8) | lo;
    counts.set(key, (counts.get(key) ?? 0) + 1);
  }
  return counts;
}
40
/**
 * Build a frequency map (Counter/multiset) from an iterable of tokens.
 * Matches the behavior of Rust's textdistance Counter.
 *
 * @param tokens - Any iterable (string, array, …)
 * @returns Map from token to occurrence count
 */
function frequencyMap(tokens) {
  const counts = new Map();
  for (const token of tokens) {
    const seen = counts.get(token);
    counts.set(token, seen === undefined ? 1 : seen + 1);
  }
  return counts;
}
49
/**
 * Build a character-level frequency map from a string.
 * This is the default tokenization strategy used by textdistance.
 *
 * Delegates to `frequencyMap`, which iterates via for...of — i.e. by
 * Unicode code point, so a surrogate pair counts as a single token.
 *
 * @param str - Input string
 * @returns Map from character to occurrence count
 */
function charFrequencyMap(str) {
  return frequencyMap(str);
}
56
/**
 * Accumulate per-character counts of an ASCII string into a 128-slot array.
 *
 * Returns false as soon as a non-ASCII character (charCode >= 128) is seen;
 * counts written before the bail-out remain in `arr`. The caller must zero
 * the array before use.
 *
 * @param arr - Int32Array(128) to accumulate into
 * @param str - Input string
 * @returns true when the whole string was ASCII and counted
 */
function buildCharFreqArray(arr, str) {
  const len = str.length;
  for (let i = 0; i < len; i++) {
    const code = str.charCodeAt(i);
    if (code > 127) return false;
    arr[code] += 1;
  }
  return true;
}
69
/**
 * Multiset intersection size of two frequency maps: for each shared key,
 * the minimum of the two counts is added.
 *
 * Iterates the smaller map and probes the larger, so cost is
 * O(min(|a|, |b|)).
 */
function intersectCount(a, b) {
  let smaller = a;
  let larger = b;
  if (a.size > b.size) {
    smaller = b;
    larger = a;
  }
  let total = 0;
  for (const [key, n] of smaller) {
    const m = larger.get(key);
    if (m !== undefined) total += n < m ? n : m;
  }
  return total;
}
82
/**
 * Multiset union size of two frequency maps: for each key, the maximum of
 * the two counts is added (a key present in only one map contributes its
 * full count).
 */
function unionCount(a, b) {
  let smaller = a;
  let larger = b;
  if (a.size > b.size) {
    smaller = b;
    larger = a;
  }
  let total = 0;
  // Keys of the smaller map: max of both counts (or just its own count).
  for (const [key, n] of smaller) {
    const m = larger.get(key);
    total += m === undefined ? n : (n > m ? n : m);
  }
  // Keys that only the larger map has.
  for (const [key, m] of larger) {
    if (!smaller.has(key)) total += m;
  }
  return total;
}
97
/**
 * Total number of tokens in a frequency map (sum of all counts).
 */
function totalCount(map) {
  let sum = 0;
  for (const n of map.values()) {
    sum += n;
  }
  return sum;
}
105
/**
 * Multiset intersection size for integer-keyed frequency maps.
 * Identical semantics to intersectCount; kept separate so the fast
 * integer-key path stays monomorphic.
 */
function intersectCountInt(a, b) {
  let smaller = a;
  let larger = b;
  if (a.size > b.size) {
    smaller = b;
    larger = a;
  }
  let total = 0;
  for (const [key, n] of smaller) {
    const m = larger.get(key);
    if (m !== undefined) total += n < m ? n : m;
  }
  return total;
}
114
/**
 * Multiset union size for integer-keyed frequency maps.
 * Identical semantics to unionCount; kept separate so the fast
 * integer-key path stays monomorphic.
 */
function unionCountInt(a, b) {
  let smaller = a;
  let larger = b;
  if (a.size > b.size) {
    smaller = b;
    larger = a;
  }
  let total = 0;
  for (const [key, n] of smaller) {
    const m = larger.get(key);
    total += m === undefined ? n : (n > m ? n : m);
  }
  for (const [key, m] of larger) {
    if (!smaller.has(key)) total += m;
  }
  return total;
}
125
/**
 * Total number of tokens in an integer-keyed frequency map
 * (sum of all counts).
 */
function totalCountInt(map) {
  let sum = 0;
  for (const n of map.values()) {
    sum += n;
  }
  return sum;
}
130
/**
 * Normalize a raw distance to a similarity score in [0, 1].
 * Two identical inputs (maxDistance 0) are fully similar.
 *
 * @param distance - Raw distance value
 * @param maxDistance - Maximum possible distance (usually max(len(a), len(b)))
 * @returns 1 - distance/maxDistance, clamped below at 0
 */
function normalize(distance, maxDistance) {
  if (maxDistance === 0) return 1;
  const similarity = 1 - distance / maxDistance;
  return similarity < 0 ? 0 : similarity;
}
140
/**
 * 32-bit FNV-1a hash of a string's UTF-16 code units.
 * Fast, good distribution for hash-based algorithms.
 *
 * @param str - Input string
 * @returns Unsigned 32-bit hash
 */
function fnv1a(str) {
  const FNV_OFFSET_BASIS = 0x811c9dc5; // 2166136261
  const FNV_PRIME = 0x01000193;        // 16777619
  let hash = FNV_OFFSET_BASIS;
  for (let i = 0, len = str.length; i < len; i++) {
    hash ^= str.charCodeAt(i);
    // Math.imul keeps the multiply in 32-bit integer arithmetic.
    hash = Math.imul(hash, FNV_PRIME);
  }
  return hash >>> 0;
}
151
+ //#endregion
152
+ //#region src/edit/levenshtein.ts
153
/**
 * Compute the Levenshtein edit distance between two strings.
 *
 * Thin wrapper over the `fastest-levenshtein` implementation, re-exported
 * under this package's name so callers do not depend on the underlying
 * library directly.
 *
 * Time: O(m * n), Space: O(min(m, n))
 *
 * @param a - First string
 * @param b - Second string
 * @returns Edit distance (non-negative integer, at most max(m, n))
 */
function levenshtein(a, b) {
  return distance(a, b);
}
165
/**
 * Normalized Levenshtein similarity in [0, 1].
 *
 * Divides the raw edit distance by the length of the longer string,
 * then flips it into a similarity.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Similarity score where 1 means identical
 */
function levenshteinNormalized(a, b) {
  const maxLen = Math.max(a.length, b.length);
  return normalize(levenshtein(a, b), maxLen);
}
175
+ //#endregion
176
+ //#region src/edit/lcs.ts
177
/**
 * Internal helper: create an equals callback using pre-built char-code arrays.
 * Avoids repeated string indexing inside the hot LCS loop.
 *
 * Uses Uint16Array because JS char codes are UTF-16 code units in
 * [0, 0xFFFF]. The previous Uint8Array truncated codes >= 256 modulo 256,
 * so distinct non-ASCII characters (e.g. "Ā" (0x100) and "\0") compared
 * equal, corrupting every LCS-based result on non-Latin-1 input.
 *
 * @param a - First string
 * @param b - Second string
 * @returns (x, y) => whether a[x] and b[y] are the same code unit
 */
function stringEquals(a, b) {
  const ca = new Uint16Array(a.length);
  const cb = new Uint16Array(b.length);
  for (let i = 0; i < a.length; i++) ca[i] = a.charCodeAt(i);
  for (let i = 0; i < b.length; i++) cb[i] = b.charCodeAt(i);
  return (x, y) => ca[x] === cb[y];
}
188
/**
 * LCS distance: the number of insertions plus deletions needed to turn
 * `a` into `b`, i.e. len(a) + len(b) - 2 * |LCS(a, b)|.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' (default, linear space, good for sparse diffs) | 'dp'
 * @returns LCS distance (non-negative integer)
 */
function lcsDistance(a, b, algorithm = "myers") {
  const solve = algorithm === "dp" ? lcs_size_dp : lcs_size_myers_linear_space;
  const common = solve(a.length, b.length, stringEquals(a, b));
  return a.length + b.length - 2 * common;
}
200
/**
 * Normalized LCS similarity in [0, 1].
 *
 * The maximum possible LCS distance is len(a) + len(b) (no common
 * subsequence at all), which is used as the normalization denominator.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' | 'dp'
 * @returns Similarity score where 1 means identical
 */
function lcsNormalized(a, b, algorithm = "myers") {
  const maxDistance = a.length + b.length;
  return normalize(lcsDistance(a, b, algorithm), maxDistance);
}
211
/**
 * Length of the Longest Common Subsequence of two strings.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' | 'dp'
 * @returns LCS length
 */
function lcsLength(a, b, algorithm = "myers") {
  const solve = algorithm === "dp" ? lcs_size_dp : lcs_size_myers_linear_space;
  return solve(a.length, b.length, stringEquals(a, b));
}
222
/**
 * Matching index pairs of the Longest Common Subsequence.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' | 'dp'
 * @returns Array of [indexInA, indexInB] pairs
 */
function lcsPairs(a, b, algorithm = "myers") {
  const solve = algorithm === "dp" ? lcs_dp : lcs_myers_linear_space;
  return solve(a.length, b.length, stringEquals(a, b));
}
233
+ //#endregion
234
+ //#region src/token/jaccard.ts
235
// Module-level scratch buffers for the ASCII fast path (reused per call;
// jaccard is therefore not reentrant, matching the other token metrics).
const _freqA$2 = new Int32Array(128);
const _freqB$2 = new Int32Array(128);
/**
 * Jaccard similarity between two strings based on character-level multisets.
 *
 * J(A, B) = |A ∩ B| / |A ∪ B|
 *
 * Uses Counter (frequency map) semantics, matching the textdistance crate.
 * ASCII-only inputs take an array-based fast path; anything else falls
 * back to Map-based counting.
 *
 * Time: O(m + n)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Jaccard similarity in [0, 1] (1 when both strings are empty)
 */
function jaccard(a, b) {
  _freqA$2.fill(0);
  _freqB$2.fill(0);
  const asciiOnly = buildCharFreqArray(_freqA$2, a) && buildCharFreqArray(_freqB$2, b);
  if (asciiOnly) {
    let inter = 0;
    let union = 0;
    for (let code = 0; code < 128; code++) {
      const na = _freqA$2[code];
      const nb = _freqB$2[code];
      inter += na < nb ? na : nb;
      union += na > nb ? na : nb;
    }
    return union === 0 ? 1 : inter / union;
  }
  const fa = charFrequencyMap(a);
  const fb = charFrequencyMap(b);
  const union = unionCount(fa, fb);
  if (union === 0) return 1;
  return intersectCount(fa, fb) / union;
}
272
/**
 * Jaccard similarity based on character n-grams.
 *
 * Tries the integer-keyed n-gram maps first (ASCII bigrams); if either
 * string cannot use them, both sides fall back to string-keyed maps.
 *
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
 * @returns N-gram Jaccard similarity in [0, 1]
 */
function jaccardNgram(a, b, n = 2) {
  const ia = ngramFrequencyMap(a, n);
  const ib = ngramFrequencyMap(b, n);
  if (ia !== null && ib !== null) {
    const union = unionCountInt(ia, ib);
    return union === 0 ? 1 : intersectCountInt(ia, ib) / union;
  }
  const fa = frequencyMap(ngrams(a, n));
  const fb = frequencyMap(ngrams(b, n));
  const union = unionCount(fa, fb);
  if (union === 0) return 1;
  return intersectCount(fa, fb) / union;
}
295
+ //#endregion
296
+ //#region src/token/cosine.ts
297
// Module-level scratch buffers for the ASCII fast path (reused per call).
const _freqA$1 = new Int32Array(128);
const _freqB$1 = new Int32Array(128);
/**
 * Cosine similarity between two strings based on character-level multisets.
 *
 * cos(A, B) = (A · B) / (|A| * |B|)
 *
 * Uses Counter (frequency map) semantics, matching the textdistance crate.
 * ASCII-only inputs take an array-based fast path.
 *
 * Time: O(m + n)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Cosine similarity in [0, 1] (1 when either string is empty)
 */
function cosine(a, b) {
  _freqA$1.fill(0);
  _freqB$1.fill(0);
  if (buildCharFreqArray(_freqA$1, a) && buildCharFreqArray(_freqB$1, b)) {
    let dot = 0;
    let sqA = 0;
    let sqB = 0;
    for (let code = 0; code < 128; code++) {
      const na = _freqA$1[code];
      const nb = _freqB$1[code];
      dot += na * nb;
      sqA += na * na;
      sqB += nb * nb;
    }
    const denom = Math.sqrt(sqA) * Math.sqrt(sqB);
    return denom === 0 ? 1 : dot / denom;
  }
  const fa = charFrequencyMap(a);
  const fb = charFrequencyMap(b);
  // Dot product only needs the intersection, so probe from the smaller map.
  const small = fa.size <= fb.size ? fa : fb;
  const big = small === fa ? fb : fa;
  let dot = 0;
  for (const [ch, n] of small) {
    dot += n * (big.get(ch) ?? 0);
  }
  const sumSquares = (m) => {
    let s = 0;
    for (const c of m.values()) s += c * c;
    return s;
  };
  const denom = Math.sqrt(sumSquares(small)) * Math.sqrt(sumSquares(big));
  if (denom === 0) return 1;
  return dot / denom;
}
351
/**
 * Cosine similarity based on character n-grams.
 *
 * Tries the integer-keyed n-gram maps first (ASCII bigrams); if either
 * string cannot use them, both sides fall back to string-keyed maps.
 * The numeric result is identical either way — only the key type differs.
 *
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
 * @returns N-gram cosine similarity in [0, 1]
 */
function cosineNgram(a, b, n = 2) {
  const ia = ngramFrequencyMap(a, n);
  const ib = ngramFrequencyMap(b, n);
  const useInts = ia !== null && ib !== null;
  const fa = useInts ? ia : frequencyMap(ngrams(a, n));
  const fb = useInts ? ib : frequencyMap(ngrams(b, n));
  let dot = 0;
  let sqA = 0;
  let sqB = 0;
  for (const [token, na] of fa) {
    dot += na * (fb.get(token) ?? 0);
    sqA += na * na;
  }
  for (const nb of fb.values()) {
    sqB += nb * nb;
  }
  const denom = Math.sqrt(sqA) * Math.sqrt(sqB);
  if (denom === 0) return 1;
  return dot / denom;
}
390
+ //#endregion
391
+ //#region src/token/sorensen.ts
392
// Module-level scratch buffers for the ASCII fast path (reused per call).
const _freqA = new Int32Array(128);
const _freqB = new Int32Array(128);
/**
 * Sørensen-Dice coefficient between two strings based on character-level
 * multisets.
 *
 * DSC(A, B) = 2 * |A ∩ B| / (|A| + |B|)
 *
 * Uses Counter (frequency map) semantics, matching the textdistance crate.
 * ASCII-only inputs take an array-based fast path.
 *
 * Time: O(m + n)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Sørensen-Dice coefficient in [0, 1] (1 when both are empty)
 */
function sorensen(a, b) {
  _freqA.fill(0);
  _freqB.fill(0);
  if (buildCharFreqArray(_freqA, a) && buildCharFreqArray(_freqB, b)) {
    let inter = 0;
    for (let code = 0; code < 128; code++) {
      inter += Math.min(_freqA[code], _freqB[code]);
    }
    const total = a.length + b.length;
    return total === 0 ? 1 : 2 * inter / total;
  }
  const fa = charFrequencyMap(a);
  const fb = charFrequencyMap(b);
  const total = totalCount(fa) + totalCount(fb);
  if (total === 0) return 1;
  return 2 * intersectCount(fa, fb) / total;
}
424
/**
 * Sørensen-Dice coefficient based on character n-grams.
 *
 * Tries the integer-keyed n-gram maps first (ASCII bigrams); if either
 * string cannot use them, both sides fall back to string-keyed maps.
 *
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
 * @returns N-gram Sørensen-Dice coefficient in [0, 1]
 */
function sorensenNgram(a, b, n = 2) {
  const ia = ngramFrequencyMap(a, n);
  const ib = ngramFrequencyMap(b, n);
  if (ia !== null && ib !== null) {
    const total = totalCountInt(ia) + totalCountInt(ib);
    return total === 0 ? 1 : 2 * intersectCountInt(ia, ib) / total;
  }
  const fa = frequencyMap(ngrams(a, n));
  const fb = frequencyMap(ngrams(b, n));
  const total = totalCount(fa) + totalCount(fb);
  if (total === 0) return 1;
  return 2 * intersectCount(fa, fb) / total;
}
447
+ //#endregion
448
+ //#region src/hash/simhash.ts
449
/**
 * Generate a fingerprint for a collection of features (SimHash).
 *
 * SimHash maps a set of features to a fixed-length binary fingerprint such that
 * similar documents produce similar fingerprints. The similarity between two
 * fingerprints is measured by Hamming distance.
 *
 * Algorithm:
 * 1. Initialize a vector V of length `bits` to all zeros
 * 2. For each feature, compute its 32-bit hash
 * 3. For each bit position i: if hash bit i is 1, V[i] += 1; else V[i] -= 1
 * 4. The final fingerprint: bit i = 1 if V[i] > 0, else 0
 *
 * Because `hashFn` yields only 32 bits, each additional 32-bit word of a
 * wider fingerprint is derived from a re-mixed copy of the feature hash.
 * (Previously `1 << i` wrapped modulo 32, so for the default 64-bit width,
 * bits 32..63 exactly duplicated bits 0..31 — halving the effective
 * fingerprint entropy. Fingerprints for bits <= 32 are unchanged; wider
 * fingerprints differ from those produced by earlier versions.)
 *
 * Time: O(features * bits)
 *
 * @param features - Array of feature strings (e.g., words, n-grams, shingles)
 * @param options - Configuration ({ bits = 64, hashFn = fnv1a })
 * @returns Fingerprint as a bigint (0n for an empty feature list)
 */
function simhash(features, options = {}) {
  const bits = options.bits ?? 64;
  const hashFn = options.hashFn ?? fnv1a;
  const v = new Float64Array(bits);
  for (const feature of features) {
    let word = hashFn(feature) >>> 0;
    for (let i = 0; i < bits; i++) {
      const offset = i & 31;
      // Re-mix at each 32-bit word boundary so wide fingerprints do not
      // repeat the low 32 bits (xorshift-style avalanche constants).
      if (i > 0 && offset === 0) word = Math.imul(word ^ 0x9e3779b9, 0x85ebca6b) >>> 0;
      v[i] += (word >>> offset) & 1 ? 1 : -1;
    }
  }
  let fingerprint = 0n;
  for (let i = 0; i < bits; i++) {
    if (v[i] > 0) fingerprint |= 1n << BigInt(i);
  }
  return fingerprint;
}
481
/**
 * Compute the Hamming distance between two SimHash fingerprints.
 *
 * XORs the fingerprints (differing bits become 1) and counts the set bits
 * with the byte-wise popcount in `bitCount`.
 *
 * The Hamming distance is the number of differing bits.
 * For 64-bit fingerprints, a distance ≤ 3 typically indicates near-duplicate content.
 *
 * Time: O(bits)
 *
 * @param a - First fingerprint (bigint)
 * @param b - Second fingerprint (bigint)
 * @returns Hamming distance (non-negative integer)
 */
function hammingDistance(a, b) {
  return bitCount(a ^ b);
}
496
/**
 * Normalized Hamming similarity in [0, 1]: the fraction of bit positions
 * on which the two fingerprints agree.
 *
 * @param a - First fingerprint
 * @param b - Second fingerprint
 * @param bits - Bit length of the fingerprints (default: 64)
 * @returns 1 - hammingDistance / bits
 */
function hammingSimilarity(a, b, bits = 64) {
  const differing = hammingDistance(a, b);
  return 1 - differing / bits;
}
506
/**
 * 8-bit popcount lookup table: POPCOUNT_TABLE[i] = number of set bits in i.
 * Built via the recurrence popcount(i) = popcount(i >> 1) + (i & 1).
 */
const POPCOUNT_TABLE = new Uint8Array(256);
for (let i = 1; i < 256; i++) POPCOUNT_TABLE[i] = POPCOUNT_TABLE[i >> 1] + (i & 1);
/**
 * Count the set bits of a non-negative bigint, consuming one byte per
 * iteration (8 table lookups for a 64-bit fingerprint instead of 64
 * single-bit steps).
 */
function bitCount(n) {
  let total = 0;
  while (n > 0n) {
    total += POPCOUNT_TABLE[Number(n & 0xffn)];
    n >>= 8n;
  }
  return total;
}
520
/**
 * SimHasher class for convenient document fingerprinting.
 *
 * A thin stateful wrapper around `simhash` / `hammingDistance` /
 * `hammingSimilarity` that pins the bit width and hash function once at
 * construction time.
 *
 * @example
 * ```ts
 * const hasher = new SimHasher();
 * const fp1 = hasher.hash(['hello', 'world']);
 * const fp2 = hasher.hash(['hello', 'earth']);
 * console.log(hasher.distance(fp1, fp2)); // small number = similar
 * ```
 */
var SimHasher = class {
  // Fingerprint width in bits (default 64).
  bits;
  // Feature hash function (default fnv1a).
  hashFn;
  constructor(options = {}) {
    this.bits = options.bits ?? 64;
    this.hashFn = options.hashFn ?? fnv1a;
  }
  /**
   * Generate a fingerprint from features using this instance's settings.
   */
  hash(features) {
    return simhash(features, {
      bits: this.bits,
      hashFn: this.hashFn
    });
  }
  /**
   * Compute Hamming distance (number of differing bits) between two fingerprints.
   */
  distance(a, b) {
    return hammingDistance(a, b);
  }
  /**
   * Compute normalized similarity in [0, 1] between two fingerprints,
   * using this instance's bit width.
   */
  similarity(a, b) {
    return hammingSimilarity(a, b, this.bits);
  }
  /**
   * Check if two fingerprints are likely near-duplicates.
   *
   * @param threshold - Maximum Hamming distance to consider as duplicate (default: 3)
   */
  isDuplicate(a, b, threshold = 3) {
    return this.distance(a, b) <= threshold;
  }
};
568
+ //#endregion
569
+ //#region src/hash/minhash.ts
570
/**
 * MinHash estimator for Jaccard similarity.
 *
 * Instead of computing the exact Jaccard index (which requires set intersection/union
 * on potentially large sets), MinHash generates a fixed-size signature for each set.
 * The Jaccard similarity is then estimated by comparing the fraction of matching
 * positions in the signatures.
 *
 * Time:
 * - Update: O(k) per element, where k = numHashes
 * - Estimate: O(k)
 *
 * @example
 * ```ts
 * const mh = new MinHash({ numHashes: 128 });
 * mh.update('hello');
 * mh.update('world');
 * const sig1 = mh.digest();
 *
 * const mh2 = new MinHash({ numHashes: 128 });
 * mh2.update('hello');
 * mh2.update('earth');
 * const sig2 = mh2.digest();
 *
 * console.log(MinHash.estimate(sig1, sig2)); // ~0.67
 * ```
 */
var MinHash = class MinHash {
  // Signature length k: number of independent hash functions.
  numHashes;
  // Per-hash parameters {a, b, p} for h_i(x) = (a*x + b) mod p.
  hashParams;
  // Largest value a hash can take (p - 1).
  maxHash;
  // Current per-position minima; starts at the sentinel 0xFFFFFFFF.
  signature;
  // Set once update() has been called.
  // NOTE(review): written but never read inside this class — confirm it is
  // intended for external consumers before removing.
  dirty;
  constructor(options = {}) {
    this.numHashes = options.numHashes ?? 128;
    const seed = options.seed ?? 42;
    // Prime slightly larger than 2^32, for the (a*x + b) mod p hash family.
    const p = 4294967311;
    this.maxHash = p - 1;
    this.hashParams = [];
    // Deterministic LCG seeded by `seed`, so two instances built with the
    // same options generate the same hash family — required for signatures
    // to be comparable across instances.
    // NOTE(review): rng * 1103515245 can exceed 2^53, so the recurrence
    // loses low-bit precision; it stays deterministic (same FP ops every
    // run) but is not the exact textbook LCG sequence — confirm acceptable.
    let rng = seed;
    for (let i = 0; i < this.numHashes; i++) {
      rng = rng * 1103515245 + 12345 & 2147483647;
      const a = rng % (p - 1) + 1; // a in [1, p-1]: must be non-zero
      rng = rng * 1103515245 + 12345 & 2147483647;
      const b = rng % p; // b in [0, p-1]
      this.hashParams.push({
        a,
        b,
        p
      });
    }
    // 4294967295 (2^32 - 1) acts as "no element seen yet"; any real hash
    // that replaces it is strictly smaller and fits in a Uint32.
    this.signature = new Uint32Array(this.numHashes).fill(4294967295);
    this.dirty = false;
  }
  /**
   * Add a feature to the set: for each hash function, keep the minimum
   * hash value observed so far.
   *
   * NOTE(review): a * h can exceed 2^53, so `% p` operates on a rounded
   * product — deterministic, but not an exact mod-p universal hash.
   * Confirm the resulting bias is acceptable for the estimator.
   */
  update(feature) {
    const h = fnv1a(feature);
    for (let i = 0; i < this.numHashes; i++) {
      const { a, b, p } = this.hashParams[i];
      // `+ p) % p` guards against a negative intermediate remainder.
      const hash = ((a * h + b) % p + p) % p;
      if (hash < this.signature[i]) this.signature[i] = hash;
    }
    this.dirty = true;
  }
  /**
   * Get a defensive copy of the MinHash signature.
   * The signature is a fixed-size array that represents the set.
   */
  digest() {
    return new Uint32Array(this.signature);
  }
  /**
   * Estimate Jaccard similarity between two MinHash signatures as the
   * fraction of positions where they agree.
   *
   * @param sig1 - First MinHash signature
   * @param sig2 - Second MinHash signature
   * @returns Estimated Jaccard similarity in [0, 1]
   * @throws Error when the signatures have different lengths
   */
  static estimate(sig1, sig2) {
    if (sig1.length !== sig2.length) throw new Error("Signature lengths must match");
    let matches = 0;
    for (let i = 0; i < sig1.length; i++) if (sig1[i] === sig2[i]) matches++;
    return matches / sig1.length;
  }
  /**
   * Estimate Jaccard similarity between this and another MinHash instance.
   */
  estimate(other) {
    return MinHash.estimate(this.digest(), other.digest());
  }
};
663
+ //#endregion
664
+ //#region src/hash/lsh.ts
665
/**
 * LSH (Locality-Sensitive Hashing) index for fast approximate nearest neighbor search.
 *
 * Uses the MinHash + banding technique:
 * 1. Divide each MinHash signature into `numBands` bands
 * 2. Hash each band to a bucket
 * 3. Items sharing at least one bucket are candidates for similarity
 *
 * The probability of two items with Jaccard similarity `s` being compared is:
 * P = 1 - (1 - s^r)^b
 * where r = rows per band, b = numBands.
 *
 * @example
 * ```ts
 * const lsh = new LSH({ numBands: 16, numHashes: 128 });
 *
 * // Index documents
 * const mh1 = new MinHash({ numHashes: 128 });
 * mh1.update('hello');
 * mh1.update('world');
 * lsh.insert('doc1', mh1.digest());
 *
 * const mh2 = new MinHash({ numHashes: 128 });
 * mh2.update('hello');
 * mh2.update('earth');
 * lsh.insert('doc2', mh2.digest());
 *
 * // Query for similar documents
 * const mh3 = new MinHash({ numHashes: 128 });
 * mh3.update('hello');
 * mh3.update('earth');
 * const candidates = lsh.query(mh3.digest());
 * ```
 */
var LSH = class {
  // Number of bands (b) the signature is split into.
  numBands;
  // Signature positions per band: r = floor(numHashes / numBands).
  rowsPerBand;
  // Expected signature length; enforced on insert/query.
  numHashes;
  /**
   * Map from band index → bucket hash → set of document IDs
   */
  bands;
  /**
   * All indexed document signatures for exact similarity estimation.
   */
  signatures;
  constructor(options = {}) {
    this.numHashes = options.numHashes ?? 128;
    this.numBands = options.numBands ?? 16;
    // NOTE(review): when numHashes is not a multiple of numBands, the last
    // numHashes % numBands signature positions are never examined by any
    // band — confirm callers always pass a divisible pair.
    this.rowsPerBand = Math.floor(this.numHashes / this.numBands);
    if (this.numBands > this.numHashes) throw new Error("numBands must be <= numHashes");
    this.bands = [];
    for (let i = 0; i < this.numBands; i++) this.bands.push(/* @__PURE__ */ new Map());
    this.signatures = /* @__PURE__ */ new Map();
  }
  /**
   * Insert a document into the index: store its signature and add its ID
   * to one bucket per band.
   *
   * NOTE(review): re-inserting an existing ID with a different signature
   * overwrites the stored signature but leaves the old bucket entries in
   * place — call remove(id) first; confirm intended.
   *
   * @param id - Document identifier
   * @param signature - MinHash signature (from MinHash.digest())
   * @throws Error when the signature length does not match numHashes
   */
  insert(id, signature) {
    if (signature.length !== this.numHashes) throw new Error(`Signature length ${signature.length} does not match numHashes ${this.numHashes}`);
    this.signatures.set(id, signature);
    for (let band = 0; band < this.numBands; band++) {
      const start = band * this.rowsPerBand;
      const end = start + this.rowsPerBand;
      const bucketKey = bandHash(signature.slice(start, end));
      let bucket = this.bands[band].get(bucketKey);
      if (!bucket) {
        bucket = /* @__PURE__ */ new Set();
        this.bands[band].set(bucketKey, bucket);
      }
      bucket.add(id);
    }
  }
  /**
   * Query for candidate documents similar to the given signature.
   *
   * Collects every document sharing at least one band bucket, then ranks
   * the candidates by their exact MinHash similarity estimate.
   *
   * @param signature - Query MinHash signature
   * @param threshold - Optional: minimum Jaccard similarity to return (default: return all candidates)
   * @returns Array of [docId, estimatedJaccard] pairs, sorted by similarity descending
   * @throws Error when the signature length does not match numHashes
   */
  query(signature, threshold) {
    if (signature.length !== this.numHashes) throw new Error(`Signature length ${signature.length} does not match numHashes ${this.numHashes}`);
    const candidates = /* @__PURE__ */ new Set();
    for (let band = 0; band < this.numBands; band++) {
      const start = band * this.rowsPerBand;
      const end = start + this.rowsPerBand;
      const bucketKey = bandHash(signature.slice(start, end));
      const bucket = this.bands[band].get(bucketKey);
      if (bucket) for (const id of bucket) candidates.add(id);
    }
    const results = [];
    for (const id of candidates) {
      // Candidates were added via bucket membership, so the signature is
      // guaranteed to be present here.
      const sig = this.signatures.get(id);
      const similarity = MinHash.estimate(signature, sig);
      if (threshold === void 0 || similarity >= threshold) results.push([id, similarity]);
    }
    results.sort((a, b) => b[1] - a[1]);
    return results;
  }
  /**
   * Remove a document from the index.
   *
   * Recomputes each band's bucket key from the stored signature, deletes
   * the ID from those buckets, and drops buckets that become empty.
   *
   * @returns true when the document existed and was removed
   */
  remove(id) {
    const sig = this.signatures.get(id);
    if (!sig) return false;
    this.signatures.delete(id);
    for (let band = 0; band < this.numBands; band++) {
      const start = band * this.rowsPerBand;
      const end = start + this.rowsPerBand;
      const bucketKey = bandHash(sig.slice(start, end));
      const bucket = this.bands[band].get(bucketKey);
      if (bucket) {
        bucket.delete(id);
        if (bucket.size === 0) this.bands[band].delete(bucketKey);
      }
    }
    return true;
  }
  /**
   * Get the number of indexed documents.
   */
  get size() {
    return this.signatures.size;
  }
};
793
/**
 * Hash a band slice of a signature to a compact bucket key.
 *
 * Folds the values with the classic polynomial (x31) combiner, truncated
 * to 32-bit via `| 0`, and renders the result in base 36.
 *
 * @param slice - Band slice of a Uint32Array signature
 * @returns Bucket key string
 */
function bandHash(slice) {
  let acc = 0;
  for (const value of slice) {
    acc = (acc * 31 + value) | 0;
  }
  return acc.toString(36);
}
802
+ //#endregion
803
+ export { DiffType, LSH, MinHash, SimHasher, cosine, cosineNgram, diff, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };