@nlptools/distance 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +185 -69
- package/dist/index.d.mts +391 -4
- package/dist/index.mjs +803 -6
- package/package.json +30 -27
- package/dist/index.d.ts +0 -5
package/dist/index.mjs
CHANGED
|
@@ -1,6 +1,803 @@
|
|
|
1
|
-
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
1
|
+
import { distance } from "fastest-levenshtein";
|
|
2
|
+
import { lcs_dp, lcs_myers_linear_space, lcs_size_dp, lcs_size_myers_linear_space } from "@algorithm.ts/lcs";
|
|
3
|
+
import { DiffType, diff } from "@algorithm.ts/diff";
|
|
4
|
+
//#region src/utils.ts
|
|
5
|
+
/**
 * Split a string into overlapping character n-grams.
 *
 * @param str - Input string
 * @param n - Size of each gram (default: 2 for bigrams)
 * @returns Array of substrings of length n; empty when str.length < n
 */
function ngrams(str, n = 2) {
	const grams = [];
	const last = str.length - n;
	for (let start = 0; start <= last; start++) {
		grams.push(str.slice(start, start + n));
	}
	return grams;
}
|
|
16
|
+
/**
 * Build an n-gram frequency map using integer-encoded keys.
 * Packs two characters into one number to avoid string allocation
 * and speed up Map hashing.
 *
 * For ASCII bigrams: key = (c1 << 8) | c2 (fits in 16 bits).
 *
 * @param str - Input string
 * @param n - N-gram size
 * @returns Map keyed by packed integers; empty Map when str.length < n;
 *          null when n !== 2 or any character is non-ASCII (caller must
 *          fall back to string-keyed counters)
 */
function ngramFrequencyMap(str, n = 2) {
	if (str.length < n) return new Map();
	if (n !== 2) return null;
	const freq = new Map();
	const last = str.length - 2;
	for (let i = 0; i <= last; i++) {
		const hi = str.charCodeAt(i);
		const lo = str.charCodeAt(i + 1);
		// Integer packing only works for 7-bit codes; bail out otherwise.
		if (hi >= 128 || lo >= 128) return null;
		const packed = (hi << 8) | lo;
		freq.set(packed, (freq.get(packed) ?? 0) + 1);
	}
	return freq;
}
|
|
40
|
+
/**
 * Build a frequency map (Counter/multiset) from an iterable of tokens.
 * Matches the behavior of Rust's textdistance Counter.
 *
 * @param tokens - Any iterable (a string iterates by code point)
 * @returns Map from token to occurrence count
 */
function frequencyMap(tokens) {
	const counts = new Map();
	for (const token of tokens) {
		counts.set(token, (counts.get(token) ?? 0) + 1);
	}
	return counts;
}
|
|
49
|
+
/**
 * Build a character-level frequency map from a string.
 * This is the default tokenization strategy used by textdistance.
 *
 * Delegates to frequencyMap; iterating a string with for...of yields
 * Unicode code points, so surrogate pairs count as single characters.
 *
 * @param str - Input string
 * @returns Map from character to occurrence count
 */
function charFrequencyMap(str) {
	return frequencyMap(str);
}
|
|
56
|
+
/**
 * Fill a character frequency array from a string (ASCII fast path).
 *
 * The caller must zero the array before use; on a non-ASCII character
 * this aborts immediately, leaving the array partially filled.
 *
 * @param arr - Int32Array(128) accumulator, pre-zeroed by the caller
 * @param str - Input string
 * @returns false if any character has charCode >= 128, else true
 */
function buildCharFreqArray(arr, str) {
	const len = str.length;
	for (let i = 0; i < len; i++) {
		const code = str.charCodeAt(i);
		if (code >= 128) return false;
		arr[code] += 1;
	}
	return true;
}
|
|
69
|
+
/**
 * Multiset intersection size of two frequency maps:
 * for each shared key, the minimum of the two counts.
 *
 * Iterates the smaller map and probes the larger, so cost is
 * O(min(|a|, |b|)) lookups.
 */
function intersectCount(a, b) {
	const iterate = a.size <= b.size ? a : b;
	const lookup = iterate === a ? b : a;
	let shared = 0;
	for (const [key, countI] of iterate) {
		const countL = lookup.get(key);
		if (countL !== undefined) shared += countI < countL ? countI : countL;
	}
	return shared;
}
|
|
82
|
+
/**
 * Multiset union size of two frequency maps:
 * for each key, the maximum of the two counts.
 *
 * Keys present only in one map contribute their full count.
 */
function unionCount(a, b) {
	const iterate = a.size <= b.size ? a : b;
	const lookup = iterate === a ? b : a;
	let total = 0;
	for (const [key, countI] of iterate) {
		const countL = lookup.get(key);
		total += countL === undefined ? countI : (countI > countL ? countI : countL);
	}
	// Keys that exist only in the larger map.
	for (const [key, countL] of lookup) {
		if (!iterate.has(key)) total += countL;
	}
	return total;
}
|
|
97
|
+
/**
 * Total number of tokens in a frequency map (sum of all counts).
 */
function totalCount(map) {
	let sum = 0;
	map.forEach((count) => {
		sum += count;
	});
	return sum;
}
|
|
105
|
+
/**
 * Multiset intersection size for integer-keyed frequency maps
 * (counterpart of intersectCount for the packed-n-gram fast path).
 */
function intersectCountInt(a, b) {
	const iterate = a.size <= b.size ? a : b;
	const lookup = iterate === a ? b : a;
	let shared = 0;
	for (const [key, countI] of iterate) {
		const countL = lookup.get(key);
		if (countL !== undefined) shared += Math.min(countI, countL);
	}
	return shared;
}
|
|
114
|
+
/**
 * Multiset union size for integer-keyed frequency maps
 * (counterpart of unionCount for the packed-n-gram fast path).
 */
function unionCountInt(a, b) {
	const iterate = a.size <= b.size ? a : b;
	const lookup = iterate === a ? b : a;
	let total = 0;
	for (const [key, countI] of iterate) {
		const countL = lookup.get(key);
		total += countL === undefined ? countI : Math.max(countI, countL);
	}
	// Keys that exist only in the larger map.
	for (const [key, countL] of lookup) {
		if (!iterate.has(key)) total += countL;
	}
	return total;
}
|
|
125
|
+
/**
 * Total token count of an integer-keyed frequency map
 * (sum of all counts).
 */
function totalCountInt(map) {
	let sum = 0;
	for (const value of map.values()) {
		sum += value;
	}
	return sum;
}
|
|
130
|
+
/**
 * Normalize a raw distance to a similarity score in [0, 1].
 *
 * @param distance - Raw distance value
 * @param maxDistance - Maximum possible distance (usually max(len(a), len(b)));
 *                      0 means both inputs are empty, treated as identical
 * @returns 1 - distance / maxDistance, clamped below at 0
 */
function normalize(distance, maxDistance) {
	if (maxDistance === 0) return 1;
	const score = 1 - distance / maxDistance;
	return Math.max(0, score);
}
|
|
140
|
+
/**
 * 32-bit FNV-1a hash of a string's UTF-16 code units.
 * Fast, with good distribution for hash-based algorithms.
 *
 * @param str - Input string
 * @returns Unsigned 32-bit hash
 */
function fnv1a(str) {
	const FNV_PRIME = 16777619;
	let hash = 2166136261;
	const len = str.length;
	for (let i = 0; i < len; i++) {
		// xor-then-multiply (the "1a" variant); imul keeps 32-bit semantics.
		hash = Math.imul(hash ^ str.charCodeAt(i), FNV_PRIME);
	}
	return hash >>> 0;
}
|
|
151
|
+
//#endregion
|
|
152
|
+
//#region src/edit/levenshtein.ts
|
|
153
|
+
/**
 * Compute the Levenshtein edit distance between two strings.
 *
 * Thin wrapper over `distance` from the "fastest-levenshtein" package.
 *
 * Time: O(m * n), Space: O(min(m, n))
 *
 * @param a - First string
 * @param b - Second string
 * @returns Edit distance (non-negative integer)
 */
function levenshtein(a, b) {
	return distance(a, b);
}
|
|
165
|
+
/**
 * Compute the normalized Levenshtein similarity in [0, 1].
 *
 * @param a - First string
 * @param b - Second string
 * @returns Similarity score where 1 means identical
 */
function levenshteinNormalized(a, b) {
	const maxLen = Math.max(a.length, b.length);
	return normalize(levenshtein(a, b), maxLen);
}
|
|
175
|
+
//#endregion
|
|
176
|
+
//#region src/edit/lcs.ts
|
|
177
|
+
/**
 * Internal helper: create an equals callback using pre-built char-code arrays.
 * Avoids repeated string indexing inside the hot LCS loop.
 *
 * Uses Uint16Array so the full UTF-16 code-unit range (0..65535) is
 * preserved. The previous Uint8Array truncated codes to 8 bits, which made
 * distinct characters whose codes agree modulo 256 (e.g. U+0100 vs U+0200)
 * compare equal and corrupted LCS results on non-Latin-1 text.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Callback (x, y) => whether a[x] and b[y] are the same code unit
 */
function stringEquals(a, b) {
	const ca = new Uint16Array(a.length);
	const cb = new Uint16Array(b.length);
	for (let i = 0; i < a.length; i++) ca[i] = a.charCodeAt(i);
	for (let i = 0; i < b.length; i++) cb[i] = b.charCodeAt(i);
	return (x, y) => ca[x] === cb[y];
}
|
|
188
|
+
/**
 * Compute the LCS distance: len(a) + len(b) - 2 * lcsLength.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' (default, better for sparse diffs) | 'dp'
 * @returns LCS distance (non-negative integer)
 */
function lcsDistance(a, b, algorithm = "myers") {
	const sizeFn = algorithm === "dp" ? lcs_size_dp : lcs_size_myers_linear_space;
	const common = sizeFn(a.length, b.length, stringEquals(a, b));
	return a.length + b.length - 2 * common;
}
|
|
200
|
+
/**
 * Compute the normalized LCS similarity in [0, 1].
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' | 'dp'
 * @returns Similarity score where 1 means identical
 */
function lcsNormalized(a, b, algorithm = "myers") {
	const maxDist = a.length + b.length;
	return normalize(lcsDistance(a, b, algorithm), maxDist);
}
|
|
211
|
+
/**
 * Get the length of the Longest Common Subsequence.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' | 'dp'
 * @returns LCS length
 */
function lcsLength(a, b, algorithm = "myers") {
	const sizeFn = algorithm === "dp" ? lcs_size_dp : lcs_size_myers_linear_space;
	return sizeFn(a.length, b.length, stringEquals(a, b));
}
|
|
222
|
+
/**
 * Get the matching index pairs of the Longest Common Subsequence.
 *
 * @param a - First string
 * @param b - Second string
 * @param algorithm - 'myers' | 'dp'
 * @returns Array of [indexInA, indexInB] pairs
 */
function lcsPairs(a, b, algorithm = "myers") {
	const pairsFn = algorithm === "dp" ? lcs_dp : lcs_myers_linear_space;
	return pairsFn(a.length, b.length, stringEquals(a, b));
}
|
|
233
|
+
//#endregion
|
|
234
|
+
//#region src/token/jaccard.ts
|
|
235
|
+
const _freqA$2 = new Int32Array(128);
const _freqB$2 = new Int32Array(128);
/**
 * Jaccard similarity between two strings based on character-level multiset.
 *
 * J(A, B) = |A ∩ B| / |A ∪ B|
 *
 * Uses Counter (frequency map) semantics, matching the textdistance crate.
 * Pure-ASCII inputs take a fast path over shared 128-slot arrays
 * (not reentrancy-safe across concurrent calls in the same realm, but JS
 * execution is single-threaded per module instance).
 *
 * Time: O(m + n)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Jaccard similarity in [0, 1]; 1 when both strings are empty
 */
function jaccard(a, b) {
	_freqA$2.fill(0);
	_freqB$2.fill(0);
	const asciiOnly = buildCharFreqArray(_freqA$2, a) && buildCharFreqArray(_freqB$2, b);
	if (asciiOnly) {
		let inter = 0;
		let union = 0;
		for (let code = 0; code < 128; code++) {
			const va = _freqA$2[code];
			const vb = _freqB$2[code];
			inter += va < vb ? va : vb;
			union += va > vb ? va : vb;
		}
		return union === 0 ? 1 : inter / union;
	}
	// Fallback for non-ASCII input: Map-based counters.
	const mapA = charFrequencyMap(a);
	const mapB = charFrequencyMap(b);
	const inter = intersectCount(mapA, mapB);
	const union = unionCount(mapA, mapB);
	return union === 0 ? 1 : inter / union;
}
|
|
272
|
+
/**
 * Jaccard similarity based on character n-grams.
 *
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
 * @returns N-gram Jaccard similarity in [0, 1]
 */
function jaccardNgram(a, b, n = 2) {
	const intA = ngramFrequencyMap(a, n);
	const intB = ngramFrequencyMap(b, n);
	// Fast path: both sides encoded as packed-integer bigram counters.
	if (intA !== null && intB !== null) {
		const inter = intersectCountInt(intA, intB);
		const union = unionCountInt(intA, intB);
		return union === 0 ? 1 : inter / union;
	}
	// Fallback: string-keyed n-gram counters (non-ASCII input or n !== 2).
	const mapA = frequencyMap(ngrams(a, n));
	const mapB = frequencyMap(ngrams(b, n));
	const inter = intersectCount(mapA, mapB);
	const union = unionCount(mapA, mapB);
	return union === 0 ? 1 : inter / union;
}
|
|
295
|
+
//#endregion
|
|
296
|
+
//#region src/token/cosine.ts
|
|
297
|
+
const _freqA$1 = new Int32Array(128);
const _freqB$1 = new Int32Array(128);
/**
 * Cosine similarity between two strings based on character-level multiset.
 *
 * cos(A, B) = (A · B) / (|A| * |B|)
 *
 * Uses Counter (frequency map) semantics, matching the textdistance crate.
 * Pure-ASCII inputs take a fast path over shared 128-slot arrays.
 *
 * Time: O(m + n)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Cosine similarity in [0, 1]; 1 when either vector is empty
 */
function cosine(a, b) {
	_freqA$1.fill(0);
	_freqB$1.fill(0);
	if (buildCharFreqArray(_freqA$1, a) && buildCharFreqArray(_freqB$1, b)) {
		let dot = 0;
		let normA = 0;
		let normB = 0;
		for (let i = 0; i < 128; i++) {
			const va = _freqA$1[i];
			const vb = _freqB$1[i];
			dot += va * vb;
			normA += va * va;
			normB += vb * vb;
		}
		const denominator = Math.sqrt(normA) * Math.sqrt(normB);
		return denominator === 0 ? 1 : dot / denominator;
	}
	// Fallback for non-ASCII input: Map-based counters.
	const freqAMap = charFrequencyMap(a);
	const freqBMap = charFrequencyMap(b);
	let dotProduct = 0;
	let normSmall = 0;
	let normLarge = 0;
	// Iterate the smaller map for the dot product; keys missing from it
	// would contribute 0 anyway.
	const [smaller, larger] = freqAMap.size <= freqBMap.size ? [freqAMap, freqBMap] : [freqBMap, freqAMap];
	for (const [char, countS] of smaller) {
		const countL = larger.get(char) ?? 0;
		dotProduct += countS * countL;
		normSmall += countS * countS;
	}
	for (const [, count] of larger) normLarge += count * count;
	// sqrt(x) * sqrt(y) is symmetric, so it does not matter which of the two
	// norms came from `a`. (The previous version swapped them back here,
	// which had no observable effect — dead code, removed.)
	const denominator = Math.sqrt(normSmall) * Math.sqrt(normLarge);
	if (denominator === 0) return 1;
	return dotProduct / denominator;
}
|
|
351
|
+
/**
 * Cosine similarity based on character n-grams.
 *
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
 * @returns N-gram cosine similarity in [0, 1]
 */
function cosineNgram(a, b, n = 2) {
	const intA = ngramFrequencyMap(a, n);
	const intB = ngramFrequencyMap(b, n);
	let counterA;
	let counterB;
	if (intA !== null && intB !== null) {
		// Fast path: packed-integer bigram counters.
		counterA = intA;
		counterB = intB;
	} else {
		// Fallback: string-keyed n-gram counters (non-ASCII input or n !== 2).
		counterA = frequencyMap(ngrams(a, n));
		counterB = frequencyMap(ngrams(b, n));
	}
	let dot = 0;
	let normA = 0;
	let normB = 0;
	for (const [token, countA] of counterA) {
		const countB = counterB.get(token) ?? 0;
		dot += countA * countB;
		normA += countA * countA;
	}
	for (const count of counterB.values()) normB += count * count;
	const denominator = Math.sqrt(normA) * Math.sqrt(normB);
	return denominator === 0 ? 1 : dot / denominator;
}
|
|
390
|
+
//#endregion
|
|
391
|
+
//#region src/token/sorensen.ts
|
|
392
|
+
const _freqA = new Int32Array(128);
const _freqB = new Int32Array(128);
/**
 * Sørensen-Dice coefficient between two strings based on character-level
 * multiset.
 *
 * DSC(A, B) = 2 * |A ∩ B| / (|A| + |B|)
 *
 * Uses Counter (frequency map) semantics, matching the textdistance crate.
 * Pure-ASCII inputs take a fast path over shared 128-slot arrays.
 *
 * Time: O(m + n)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Sørensen-Dice coefficient in [0, 1]; 1 when both are empty
 */
function sorensen(a, b) {
	_freqA.fill(0);
	_freqB.fill(0);
	const asciiOnly = buildCharFreqArray(_freqA, a) && buildCharFreqArray(_freqB, b);
	if (asciiOnly) {
		let overlap = 0;
		for (let code = 0; code < 128; code++) {
			overlap += Math.min(_freqA[code], _freqB[code]);
		}
		// All characters were counted, so |A| + |B| is just the total length.
		const combined = a.length + b.length;
		return combined === 0 ? 1 : 2 * overlap / combined;
	}
	// Fallback for non-ASCII input: Map-based counters.
	const mapA = charFrequencyMap(a);
	const mapB = charFrequencyMap(b);
	const overlap = intersectCount(mapA, mapB);
	const combined = totalCount(mapA) + totalCount(mapB);
	return combined === 0 ? 1 : 2 * overlap / combined;
}
|
|
424
|
+
/**
 * Sørensen-Dice coefficient based on character n-grams.
 *
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
 * @returns N-gram Sørensen-Dice coefficient in [0, 1]
 */
function sorensenNgram(a, b, n = 2) {
	const intA = ngramFrequencyMap(a, n);
	const intB = ngramFrequencyMap(b, n);
	// Fast path: packed-integer bigram counters.
	if (intA !== null && intB !== null) {
		const overlap = intersectCountInt(intA, intB);
		const combined = totalCountInt(intA) + totalCountInt(intB);
		return combined === 0 ? 1 : 2 * overlap / combined;
	}
	// Fallback: string-keyed n-gram counters (non-ASCII input or n !== 2).
	const mapA = frequencyMap(ngrams(a, n));
	const mapB = frequencyMap(ngrams(b, n));
	const overlap = intersectCount(mapA, mapB);
	const combined = totalCount(mapA) + totalCount(mapB);
	return combined === 0 ? 1 : 2 * overlap / combined;
}
|
|
447
|
+
//#endregion
|
|
448
|
+
//#region src/hash/simhash.ts
|
|
449
|
+
/**
 * Generate a fingerprint for a collection of features.
 *
 * SimHash maps a set of features to a fixed-length binary fingerprint such
 * that similar documents produce similar fingerprints; compare fingerprints
 * with Hamming distance.
 *
 * Algorithm:
 * 1. Initialize a weight vector V of length `bits` to zero
 * 2. Hash each feature; for each bit position i of the hash,
 *    V[i] += 1 if the bit is set, else V[i] -= 1
 * 3. Fingerprint bit i = 1 iff V[i] > 0
 *
 * NOTE: JS shifts wrap modulo 32, so with the default 32-bit hash function
 * bit positions >= 32 mirror positions 0-31.
 *
 * Time: O(features * bits)
 *
 * @param features - Array of feature strings (e.g., words, n-grams, shingles)
 * @param options - { bits?: number (default 64), hashFn?: (s) => number }
 * @returns Fingerprint as a bigint
 */
function simhash(features, options = {}) {
	const bits = options.bits ?? 64;
	const hashFn = options.hashFn ?? fnv1a;
	const weights = new Float64Array(bits);
	for (const feature of features) {
		const h = hashFn(feature);
		for (let bit = 0; bit < bits; bit++) {
			weights[bit] += (h & (1 << bit)) ? 1 : -1;
		}
	}
	let fingerprint = 0n;
	for (let bit = 0; bit < bits; bit++) {
		if (weights[bit] > 0) fingerprint |= 1n << BigInt(bit);
	}
	return fingerprint;
}
|
|
481
|
+
/**
 * Compute the Hamming distance between two SimHash fingerprints.
 *
 * The Hamming distance is the number of differing bits (popcount of XOR).
 * For 64-bit fingerprints, a distance ≤ 3 typically indicates
 * near-duplicate content.
 *
 * Time: O(bits)
 *
 * @param a - First fingerprint (bigint)
 * @param b - Second fingerprint (bigint)
 * @returns Hamming distance (non-negative integer)
 */
function hammingDistance(a, b) {
	return bitCount(a ^ b);
}
|
|
496
|
+
/**
 * Compute normalized Hamming similarity in [0, 1].
 *
 * @param a - First fingerprint
 * @param b - Second fingerprint
 * @param bits - Bit length of the fingerprints (default: 64)
 * @returns 1 - hammingDistance / bits
 */
function hammingSimilarity(a, b, bits = 64) {
	const differing = hammingDistance(a, b);
	return 1 - differing / bits;
}
|
|
506
|
+
/**
 * Popcount lookup table: POPCOUNT_TABLE[i] is the number of set bits in the
 * byte value i. Lets bitCount consume a bigint 8 bits per iteration instead
 * of 1 (8 iterations for a 64-bit fingerprint rather than 64).
 */
const POPCOUNT_TABLE = new Uint8Array(256);
// Incremental fill: popcount(i) = popcount(i >> 1) + lowest bit of i.
for (let i = 1; i < 256; i++) POPCOUNT_TABLE[i] = POPCOUNT_TABLE[i >> 1] + (i & 1);
/**
 * Count the number of set bits in a non-negative bigint.
 */
function bitCount(n) {
	let total = 0;
	for (let rest = n; rest > 0n; rest >>= 8n) {
		total += POPCOUNT_TABLE[Number(rest & 255n)];
	}
	return total;
}
|
|
520
|
+
/**
 * SimHasher: convenience wrapper around simhash / hammingDistance for
 * document fingerprinting with a fixed configuration.
 *
 * @example
 * ```ts
 * const hasher = new SimHasher();
 * const fp1 = hasher.hash(['hello', 'world']);
 * const fp2 = hasher.hash(['hello', 'earth']);
 * console.log(hasher.distance(fp1, fp2)); // small number = similar
 * ```
 */
var SimHasher = class {
	bits;
	hashFn;
	/**
	 * @param options - { bits?: number (default 64), hashFn?: feature hash (default fnv1a) }
	 */
	constructor(options = {}) {
		this.bits = options.bits ?? 64;
		this.hashFn = options.hashFn ?? fnv1a;
	}
	/** Generate a fingerprint from features. */
	hash(features) {
		const { bits, hashFn } = this;
		return simhash(features, { bits, hashFn });
	}
	/** Hamming distance between two fingerprints. */
	distance(a, b) {
		return hammingDistance(a, b);
	}
	/** Normalized similarity between two fingerprints, in [0, 1]. */
	similarity(a, b) {
		return hammingSimilarity(a, b, this.bits);
	}
	/**
	 * True when the fingerprints differ in at most `threshold` bits.
	 *
	 * @param threshold - Maximum Hamming distance to count as duplicate (default: 3)
	 */
	isDuplicate(a, b, threshold = 3) {
		return this.distance(a, b) <= threshold;
	}
};
|
|
568
|
+
//#endregion
|
|
569
|
+
//#region src/hash/minhash.ts
|
|
570
|
+
/**
 * MinHash estimator for Jaccard similarity.
 *
 * Instead of computing the exact Jaccard index (which requires set
 * intersection/union on potentially large sets), MinHash generates a
 * fixed-size signature for each set. The Jaccard similarity is then
 * estimated by comparing the fraction of matching positions in the
 * signatures.
 *
 * Time:
 * - Update: O(k) per element, where k = numHashes
 * - Estimate: O(k)
 *
 * @example
 * ```ts
 * const mh = new MinHash({ numHashes: 128 });
 * mh.update('hello');
 * mh.update('world');
 * const sig1 = mh.digest();
 *
 * const mh2 = new MinHash({ numHashes: 128 });
 * mh2.update('hello');
 * mh2.update('earth');
 * const sig2 = mh2.digest();
 *
 * console.log(MinHash.estimate(sig1, sig2)); // ~0.67
 * ```
 */
var MinHash = class MinHash {
	// Number of hash functions / signature length.
	numHashes;
	// Per-hash-function coefficients {a, b, p} for (a*h + b) mod p.
	hashParams;
	// p - 1; assigned in the constructor but not read elsewhere in this class.
	maxHash;
	// Current signature: element i holds the minimum hash seen under function i.
	signature;
	// Set true by update(); never read inside this class.
	// NOTE(review): looks like a leftover cache-invalidation flag — confirm
	// nothing outside this file relies on it before removing.
	dirty;
	constructor(options = {}) {
		this.numHashes = options.numHashes ?? 128;
		const seed = options.seed ?? 42;
		// Prime slightly above 2^32, a common choice for MinHash families.
		const p = 4294967311;
		this.maxHash = p - 1;
		this.hashParams = [];
		// Deterministic LCG (glibc-style constants) seeded by `seed`, so the
		// same seed always yields the same hash family.
		let rng = seed;
		for (let i = 0; i < this.numHashes; i++) {
			rng = rng * 1103515245 + 12345 & 2147483647;
			const a = rng % (p - 1) + 1;
			rng = rng * 1103515245 + 12345 & 2147483647;
			const b = rng % p;
			this.hashParams.push({
				a,
				b,
				p
			});
		}
		// 4294967295 (2^32 - 1) acts as "no element seen yet".
		this.signature = new Uint32Array(this.numHashes).fill(4294967295);
		this.dirty = false;
	}
	/**
	 * Add a feature to the set, updating every signature slot.
	 *
	 * NOTE(review): `a * h + b` with a, h up to ~2^32 can exceed 2^53, so
	 * the product loses precision in double arithmetic. The result is still
	 * deterministic (same inputs give the same signature), but the hash
	 * family is not exactly the intended (a*h + b) mod p — confirm whether
	 * exact modular arithmetic (BigInt) is worth the cost.
	 */
	update(feature) {
		const h = fnv1a(feature);
		for (let i = 0; i < this.numHashes; i++) {
			const { a, b, p } = this.hashParams[i];
			const hash = ((a * h + b) % p + p) % p;
			if (hash < this.signature[i]) this.signature[i] = hash;
		}
		this.dirty = true;
	}
	/**
	 * Get the MinHash signature.
	 * Returns a defensive copy so later update() calls do not mutate it.
	 */
	digest() {
		return new Uint32Array(this.signature);
	}
	/**
	 * Estimate Jaccard similarity between two MinHash signatures as the
	 * fraction of positions where the signatures agree.
	 *
	 * @param sig1 - First MinHash signature
	 * @param sig2 - Second MinHash signature
	 * @returns Estimated Jaccard similarity in [0, 1]
	 * @throws Error when the signatures have different lengths
	 */
	static estimate(sig1, sig2) {
		if (sig1.length !== sig2.length) throw new Error("Signature lengths must match");
		let matches = 0;
		for (let i = 0; i < sig1.length; i++) if (sig1[i] === sig2[i]) matches++;
		return matches / sig1.length;
	}
	/**
	 * Estimate Jaccard similarity between this and another MinHash instance.
	 */
	estimate(other) {
		return MinHash.estimate(this.digest(), other.digest());
	}
};
|
|
663
|
+
//#endregion
|
|
664
|
+
//#region src/hash/lsh.ts
|
|
665
|
+
/**
 * LSH (Locality-Sensitive Hashing) index for fast approximate nearest
 * neighbor search.
 *
 * Uses the MinHash + banding technique:
 * 1. Divide each MinHash signature into `numBands` bands
 * 2. Hash each band to a bucket
 * 3. Items sharing at least one bucket are candidates for similarity
 *
 * The probability of two items with Jaccard similarity `s` being compared is:
 * P = 1 - (1 - s^r)^b
 * where r = rows per band, b = numBands.
 *
 * @example
 * ```ts
 * const lsh = new LSH({ numBands: 16, numHashes: 128 });
 *
 * // Index documents
 * const mh1 = new MinHash({ numHashes: 128 });
 * mh1.update('hello');
 * mh1.update('world');
 * lsh.insert('doc1', mh1.digest());
 *
 * const mh2 = new MinHash({ numHashes: 128 });
 * mh2.update('hello');
 * mh2.update('earth');
 * lsh.insert('doc2', mh2.digest());
 *
 * // Query for similar documents
 * const mh3 = new MinHash({ numHashes: 128 });
 * mh3.update('hello');
 * mh3.update('earth');
 * const candidates = lsh.query(mh3.digest());
 * ```
 */
var LSH = class {
	// Number of bands the signature is split into.
	numBands;
	// Hash positions per band; floor(numHashes / numBands). When numHashes is
	// not divisible by numBands, the trailing positions are ignored by
	// banding (they still participate in MinHash.estimate).
	rowsPerBand;
	// Expected signature length for insert/query.
	numHashes;
	/**
	 * Map from band index → bucket hash → set of document IDs
	 */
	bands;
	/**
	 * All indexed document signatures for exact similarity estimation.
	 */
	signatures;
	constructor(options = {}) {
		this.numHashes = options.numHashes ?? 128;
		this.numBands = options.numBands ?? 16;
		this.rowsPerBand = Math.floor(this.numHashes / this.numBands);
		if (this.numBands > this.numHashes) throw new Error("numBands must be <= numHashes");
		this.bands = [];
		for (let i = 0; i < this.numBands; i++) this.bands.push(/* @__PURE__ */ new Map());
		this.signatures = /* @__PURE__ */ new Map();
	}
	/**
	 * Insert a document into the index.
	 *
	 * Re-inserting an existing id overwrites its stored signature but adds
	 * to (rather than replaces) its band buckets.
	 *
	 * @param id - Document identifier
	 * @param signature - MinHash signature (from MinHash.digest())
	 * @throws Error when the signature length does not match numHashes
	 */
	insert(id, signature) {
		if (signature.length !== this.numHashes) throw new Error(`Signature length ${signature.length} does not match numHashes ${this.numHashes}`);
		this.signatures.set(id, signature);
		for (let band = 0; band < this.numBands; band++) {
			const start = band * this.rowsPerBand;
			const end = start + this.rowsPerBand;
			const bucketKey = bandHash(signature.slice(start, end));
			let bucket = this.bands[band].get(bucketKey);
			if (!bucket) {
				bucket = /* @__PURE__ */ new Set();
				this.bands[band].set(bucketKey, bucket);
			}
			bucket.add(id);
		}
	}
	/**
	 * Query for candidate documents similar to the given signature.
	 *
	 * Candidates are gathered from every band bucket the query signature
	 * falls into, then ranked by MinHash.estimate against their stored
	 * signatures (present for every candidate, since buckets only ever
	 * contain inserted ids and remove() cleans them up).
	 *
	 * @param signature - Query MinHash signature
	 * @param threshold - Optional: minimum estimated Jaccard similarity to return (default: return all candidates)
	 * @returns Array of [docId, estimatedJaccard] pairs, sorted by similarity descending
	 * @throws Error when the signature length does not match numHashes
	 */
	query(signature, threshold) {
		if (signature.length !== this.numHashes) throw new Error(`Signature length ${signature.length} does not match numHashes ${this.numHashes}`);
		const candidates = /* @__PURE__ */ new Set();
		for (let band = 0; band < this.numBands; band++) {
			const start = band * this.rowsPerBand;
			const end = start + this.rowsPerBand;
			const bucketKey = bandHash(signature.slice(start, end));
			const bucket = this.bands[band].get(bucketKey);
			if (bucket) for (const id of bucket) candidates.add(id);
		}
		const results = [];
		for (const id of candidates) {
			const sig = this.signatures.get(id);
			const similarity = MinHash.estimate(signature, sig);
			if (threshold === void 0 || similarity >= threshold) results.push([id, similarity]);
		}
		results.sort((a, b) => b[1] - a[1]);
		return results;
	}
	/**
	 * Remove a document from the index, including its band bucket entries.
	 *
	 * @returns true when the id was present, false otherwise
	 */
	remove(id) {
		const sig = this.signatures.get(id);
		if (!sig) return false;
		this.signatures.delete(id);
		for (let band = 0; band < this.numBands; band++) {
			const start = band * this.rowsPerBand;
			const end = start + this.rowsPerBand;
			const bucketKey = bandHash(sig.slice(start, end));
			const bucket = this.bands[band].get(bucketKey);
			if (bucket) {
				bucket.delete(id);
				// Drop empty buckets so the maps do not grow unboundedly.
				if (bucket.size === 0) this.bands[band].delete(bucketKey);
			}
		}
		return true;
	}
	/**
	 * Get the number of indexed documents.
	 */
	get size() {
		return this.signatures.size;
	}
};
|
|
793
|
+
/**
 * Hash a band slice to a bucket key string.
 *
 * A 31x + y polynomial rolling hash over the slice values, coerced to
 * int32 each step, rendered in base 36 for compact Map keys.
 */
function bandHash(slice) {
	let acc = 0;
	for (const value of slice) {
		acc = (acc * 31 + value) | 0;
	}
	return acc.toString(36);
}
|
|
802
|
+
//#endregion
|
|
803
|
+
// Public API surface: all local primitives plus the DiffType/diff
// re-exports from @algorithm.ts/diff.
export { DiffType, LSH, MinHash, SimHasher, cosine, cosineNgram, diff, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
|