@nlptools/distance 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +185 -69
- package/dist/index.d.mts +391 -4
- package/dist/index.mjs +803 -6
- package/package.json +30 -27
- package/dist/index.d.ts +0 -5
package/dist/index.d.mts
CHANGED
|
@@ -1,5 +1,392 @@
|
|
|
1
|
-
|
|
1
|
+
import { DiffType, IDiffItem, IDiffOptions, ILcs, ILcsAlgorithm, diff } from "@algorithm.ts/diff";
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
//#region src/edit/levenshtein.d.ts
|
|
4
|
+
/**
|
|
5
|
+
* Compute the Levenshtein edit distance between two strings.
|
|
6
|
+
*
|
|
7
|
+
* Time: O(m * n), Space: O(min(m, n))
|
|
8
|
+
*
|
|
9
|
+
* @param a - First string
|
|
10
|
+
* @param b - Second string
|
|
11
|
+
* @returns Edit distance (non-negative integer)
|
|
12
|
+
*/
|
|
13
|
+
declare function levenshtein(a: string, b: string): number;
|
|
14
|
+
/**
|
|
15
|
+
* Compute the normalized Levenshtein similarity in [0, 1].
|
|
16
|
+
*
|
|
17
|
+
* @param a - First string
|
|
18
|
+
* @param b - Second string
|
|
19
|
+
* @returns Similarity score where 1 means identical
|
|
20
|
+
*/
|
|
21
|
+
declare function levenshteinNormalized(a: string, b: string): number;
|
|
22
|
+
//#endregion
|
|
23
|
+
//#region src/edit/lcs.d.ts
|
|
24
|
+
type LcsSizeFunc = (N1: number, N2: number, equals: (x: number, y: number) => boolean) => number;
|
|
25
|
+
type LcsPairsFunc = (N1: number, N2: number, equals: (x: number, y: number) => boolean) => Array<[number, number]>;
|
|
26
|
+
/**
|
|
27
|
+
* Internal helper: create an equals callback using pre-built CharCode arrays.
|
|
28
|
+
* Avoids repeated string indexing inside the hot LCS loop.
|
|
29
|
+
*/
|
|
30
|
+
declare function stringEquals(a: string, b: string): (x: number, y: number) => boolean;
|
|
31
|
+
/**
|
|
32
|
+
* Compute the LCS distance: len(a) + len(b) - 2 * lcsLength.
|
|
33
|
+
*
|
|
34
|
+
* @param a - First string
|
|
35
|
+
* @param b - Second string
|
|
36
|
+
* @param algorithm - 'myers' (default, better for sparse diffs) | 'dp'
|
|
37
|
+
* @returns LCS distance (non-negative integer)
|
|
38
|
+
*/
|
|
39
|
+
declare function lcsDistance(a: string, b: string, algorithm?: "myers" | "dp"): number;
|
|
40
|
+
/**
|
|
41
|
+
* Compute the normalized LCS similarity in [0, 1].
|
|
42
|
+
*
|
|
43
|
+
* @param a - First string
|
|
44
|
+
* @param b - Second string
|
|
45
|
+
* @param algorithm - 'myers' | 'dp'
|
|
46
|
+
* @returns Similarity score where 1 means identical
|
|
47
|
+
*/
|
|
48
|
+
declare function lcsNormalized(a: string, b: string, algorithm?: "myers" | "dp"): number;
|
|
49
|
+
/**
|
|
50
|
+
* Get the length of the Longest Common Subsequence.
|
|
51
|
+
*
|
|
52
|
+
* @param a - First string
|
|
53
|
+
* @param b - Second string
|
|
54
|
+
* @param algorithm - 'myers' | 'dp'
|
|
55
|
+
* @returns LCS length
|
|
56
|
+
*/
|
|
57
|
+
declare function lcsLength(a: string, b: string, algorithm?: "myers" | "dp"): number;
|
|
58
|
+
/**
|
|
59
|
+
* Get the matching index pairs of the Longest Common Subsequence.
|
|
60
|
+
*
|
|
61
|
+
* @param a - First string
|
|
62
|
+
* @param b - Second string
|
|
63
|
+
* @param algorithm - 'myers' | 'dp'
|
|
64
|
+
* @returns Array of [indexInA, indexInB] pairs
|
|
65
|
+
*/
|
|
66
|
+
declare function lcsPairs(a: string, b: string, algorithm?: "myers" | "dp"): Array<[number, number]>;
|
|
67
|
+
//#endregion
|
|
68
|
+
//#region src/token/jaccard.d.ts
|
|
69
|
+
/**
|
|
70
|
+
* Jaccard similarity between two strings based on character-level multiset.
|
|
71
|
+
*
|
|
72
|
+
* J(A, B) = |A ∩ B| / |A ∪ B|
|
|
73
|
+
*
|
|
74
|
+
* Uses Counter (frequency map) for multiset semantics,
|
|
75
|
+
* matching the textdistance crate behavior.
|
|
76
|
+
*
|
|
77
|
+
* Time: O(m + n)
|
|
78
|
+
*
|
|
79
|
+
* @param a - First string
|
|
80
|
+
* @param b - Second string
|
|
81
|
+
* @returns Jaccard similarity in [0, 1]
|
|
82
|
+
*/
|
|
83
|
+
declare function jaccard(a: string, b: string): number;
|
|
84
|
+
/**
|
|
85
|
+
* Jaccard similarity based on character n-grams (bigrams by default).
|
|
86
|
+
*
|
|
87
|
+
* @param a - First string
|
|
88
|
+
* @param b - Second string
|
|
89
|
+
* @param n - N-gram size (default: 2)
|
|
90
|
+
* @returns N-gram Jaccard similarity in [0, 1]
|
|
91
|
+
*/
|
|
92
|
+
declare function jaccardNgram(a: string, b: string, n?: number): number;
|
|
93
|
+
//#endregion
|
|
94
|
+
//#region src/token/cosine.d.ts
|
|
95
|
+
/**
|
|
96
|
+
* Cosine similarity between two strings based on character-level multiset.
|
|
97
|
+
*
|
|
98
|
+
* cos(A, B) = (A · B) / (|A| * |B|)
|
|
99
|
+
*
|
|
100
|
+
* Uses Counter (frequency map) for multiset semantics,
|
|
101
|
+
* matching the textdistance crate behavior.
|
|
102
|
+
*
|
|
103
|
+
* Time: O(m + n)
|
|
104
|
+
*
|
|
105
|
+
* @param a - First string
|
|
106
|
+
* @param b - Second string
|
|
107
|
+
* @returns Cosine similarity in [0, 1]
|
|
108
|
+
*/
|
|
109
|
+
declare function cosine(a: string, b: string): number;
|
|
110
|
+
/**
|
|
111
|
+
* Cosine similarity based on character n-grams.
|
|
112
|
+
*
|
|
113
|
+
* @param a - First string
|
|
114
|
+
* @param b - Second string
|
|
115
|
+
* @param n - N-gram size (default: 2)
|
|
116
|
+
* @returns N-gram Cosine similarity in [0, 1]
|
|
117
|
+
*/
|
|
118
|
+
declare function cosineNgram(a: string, b: string, n?: number): number;
|
|
119
|
+
//#endregion
|
|
120
|
+
//#region src/token/sorensen.d.ts
|
|
121
|
+
/**
|
|
122
|
+
* Sørensen-Dice coefficient between two strings based on character-level multiset.
|
|
123
|
+
*
|
|
124
|
+
* DSC(A, B) = 2 * |A ∩ B| / (|A| + |B|)
|
|
125
|
+
*
|
|
126
|
+
* Uses Counter (frequency map) for multiset semantics,
|
|
127
|
+
* matching the textdistance crate behavior.
|
|
128
|
+
*
|
|
129
|
+
* Time: O(m + n)
|
|
130
|
+
*
|
|
131
|
+
* @param a - First string
|
|
132
|
+
* @param b - Second string
|
|
133
|
+
* @returns Sørensen-Dice coefficient in [0, 1]
|
|
134
|
+
*/
|
|
135
|
+
declare function sorensen(a: string, b: string): number;
|
|
136
|
+
/**
|
|
137
|
+
* Sørensen-Dice coefficient based on character n-grams.
|
|
138
|
+
*
|
|
139
|
+
* @param a - First string
|
|
140
|
+
* @param b - Second string
|
|
141
|
+
* @param n - N-gram size (default: 2)
|
|
142
|
+
* @returns N-gram Sørensen-Dice coefficient in [0, 1]
|
|
143
|
+
*/
|
|
144
|
+
declare function sorensenNgram(a: string, b: string, n?: number): number;
|
|
145
|
+
//#endregion
|
|
146
|
+
//#region src/hash/simhash.d.ts
|
|
147
|
+
interface ISimHashOptions {
|
|
148
|
+
/**
|
|
149
|
+
* Bit length of the fingerprint.
|
|
150
|
+
* @default 64
|
|
151
|
+
*/
|
|
152
|
+
bits?: number;
|
|
153
|
+
/**
|
|
154
|
+
* Hash function to use for feature hashing.
|
|
155
|
+
* Defaults to a built-in FNV-1a implementation.
|
|
156
|
+
*/
|
|
157
|
+
hashFn?: (feature: string) => number;
|
|
158
|
+
}
|
|
159
|
+
/**
|
|
160
|
+
* Generate a SimHash fingerprint (64-bit by default; see `options.bits`) for a collection of features.
|
|
161
|
+
*
|
|
162
|
+
* SimHash maps a set of features to a fixed-length binary fingerprint such that
|
|
163
|
+
* similar documents produce similar fingerprints. The similarity between two
|
|
164
|
+
* fingerprints is measured by Hamming distance.
|
|
165
|
+
*
|
|
166
|
+
* Algorithm:
|
|
167
|
+
* 1. Initialize a vector V of length `bits` to all zeros
|
|
168
|
+
* 2. For each feature, compute its hash and examine each of its bits
|
|
169
|
+
* 3. For each bit position i: if hash[i] = 1, V[i] += weight; else V[i] -= weight
|
|
170
|
+
* 4. The final fingerprint: bit i = 1 if V[i] > 0, else 0
|
|
171
|
+
*
|
|
172
|
+
* Time: O(features * bits)
|
|
173
|
+
*
|
|
174
|
+
* @param features - Array of feature strings (e.g., words, n-grams, shingles)
|
|
175
|
+
* @param options - Configuration
|
|
176
|
+
* @returns Fingerprint as a bigint (`bits` wide; 64 bits by default)
|
|
177
|
+
*/
|
|
178
|
+
declare function simhash(features: string[], options?: ISimHashOptions): bigint;
|
|
179
|
+
/**
|
|
180
|
+
* Compute the Hamming distance between two SimHash fingerprints.
|
|
181
|
+
*
|
|
182
|
+
* The Hamming distance is the number of differing bits.
|
|
183
|
+
* For 64-bit fingerprints, a distance ≤ 3 typically indicates near-duplicate content.
|
|
184
|
+
*
|
|
185
|
+
* Time: O(bits)
|
|
186
|
+
*
|
|
187
|
+
* @param a - First fingerprint
|
|
188
|
+
* @param b - Second fingerprint
|
|
189
|
+
* @returns Hamming distance (non-negative integer)
|
|
190
|
+
*/
|
|
191
|
+
declare function hammingDistance(a: bigint, b: bigint): number;
|
|
192
|
+
/**
|
|
193
|
+
* Compute normalized Hamming similarity in [0, 1].
|
|
194
|
+
*
|
|
195
|
+
* @param a - First fingerprint
|
|
196
|
+
* @param b - Second fingerprint
|
|
197
|
+
* @param bits - Bit length of the fingerprints (default: 64)
|
|
198
|
+
*/
|
|
199
|
+
declare function hammingSimilarity(a: bigint, b: bigint, bits?: number): number;
|
|
200
|
+
/**
|
|
201
|
+
* SimHasher class for convenient document fingerprinting.
|
|
202
|
+
*
|
|
203
|
+
* @example
|
|
204
|
+
* ```ts
|
|
205
|
+
* const hasher = new SimHasher();
|
|
206
|
+
* const fp1 = hasher.hash(['hello', 'world']);
|
|
207
|
+
* const fp2 = hasher.hash(['hello', 'earth']);
|
|
208
|
+
* console.log(hasher.distance(fp1, fp2)); // small number = similar
|
|
209
|
+
* ```
|
|
210
|
+
*/
|
|
211
|
+
declare class SimHasher {
|
|
212
|
+
private readonly bits;
|
|
213
|
+
private readonly hashFn;
|
|
214
|
+
constructor(options?: ISimHashOptions);
|
|
215
|
+
/**
|
|
216
|
+
* Generate a fingerprint from features.
|
|
217
|
+
*/
|
|
218
|
+
hash(features: string[]): bigint;
|
|
219
|
+
/**
|
|
220
|
+
* Compute Hamming distance between two fingerprints.
|
|
221
|
+
*/
|
|
222
|
+
distance(a: bigint, b: bigint): number;
|
|
223
|
+
/**
|
|
224
|
+
* Compute similarity between two fingerprints.
|
|
225
|
+
*/
|
|
226
|
+
similarity(a: bigint, b: bigint): number;
|
|
227
|
+
/**
|
|
228
|
+
* Check if two fingerprints are likely near-duplicates.
|
|
229
|
+
*
|
|
230
|
+
* @param threshold - Maximum Hamming distance to consider as duplicate (default: 3)
|
|
231
|
+
*/
|
|
232
|
+
isDuplicate(a: bigint, b: bigint, threshold?: number): boolean;
|
|
233
|
+
}
|
|
234
|
+
//#endregion
|
|
235
|
+
//#region src/hash/minhash.d.ts
|
|
236
|
+
interface IMinHashOptions {
|
|
237
|
+
/**
|
|
238
|
+
* Number of hash functions / signature size.
|
|
239
|
+
* Larger values give more accurate Jaccard estimates.
|
|
240
|
+
* @default 128
|
|
241
|
+
*/
|
|
242
|
+
numHashes?: number;
|
|
243
|
+
/**
|
|
244
|
+
* Seed for the random number generator.
|
|
245
|
+
* @default 42
|
|
246
|
+
*/
|
|
247
|
+
seed?: number;
|
|
248
|
+
}
|
|
249
|
+
/**
|
|
250
|
+
* MinHash estimator for Jaccard similarity.
|
|
251
|
+
*
|
|
252
|
+
* Instead of computing the exact Jaccard index (which requires set intersection/union
|
|
253
|
+
* on potentially large sets), MinHash generates a fixed-size signature for each set.
|
|
254
|
+
* The Jaccard similarity is then estimated by comparing the fraction of matching
|
|
255
|
+
* positions in the signatures.
|
|
256
|
+
*
|
|
257
|
+
* Time:
|
|
258
|
+
* - Update: O(k) per element, where k = numHashes
|
|
259
|
+
* - Estimate: O(k)
|
|
260
|
+
*
|
|
261
|
+
* @example
|
|
262
|
+
* ```ts
|
|
263
|
+
* const mh = new MinHash({ numHashes: 128 });
|
|
264
|
+
* mh.update('hello');
|
|
265
|
+
* mh.update('world');
|
|
266
|
+
* const sig1 = mh.digest();
|
|
267
|
+
*
|
|
268
|
+
* const mh2 = new MinHash({ numHashes: 128 });
|
|
269
|
+
* mh2.update('hello');
|
|
270
|
+
* mh2.update('earth');
|
|
271
|
+
* const sig2 = mh2.digest();
|
|
272
|
+
*
|
|
273
|
+
* console.log(MinHash.estimate(sig1, sig2)); // ~0.33 (true Jaccard of {hello, world} vs {hello, earth} is 1/3)
|
|
274
|
+
* ```
|
|
275
|
+
*/
|
|
276
|
+
declare class MinHash {
|
|
277
|
+
private readonly numHashes;
|
|
278
|
+
private readonly hashParams;
|
|
279
|
+
private readonly maxHash;
|
|
280
|
+
private signature;
|
|
281
|
+
private dirty;
|
|
282
|
+
constructor(options?: IMinHashOptions);
|
|
283
|
+
/**
|
|
284
|
+
* Add a feature to the set.
|
|
285
|
+
*/
|
|
286
|
+
update(feature: string): void;
|
|
287
|
+
/**
|
|
288
|
+
* Get the MinHash signature.
|
|
289
|
+
* The signature is a fixed-size array that represents the set.
|
|
290
|
+
*/
|
|
291
|
+
digest(): Uint32Array;
|
|
292
|
+
/**
|
|
293
|
+
* Estimate Jaccard similarity between two MinHash signatures.
|
|
294
|
+
*
|
|
295
|
+
* @param sig1 - First MinHash signature
|
|
296
|
+
* @param sig2 - Second MinHash signature
|
|
297
|
+
* @returns Estimated Jaccard similarity in [0, 1]
|
|
298
|
+
*/
|
|
299
|
+
static estimate(sig1: Uint32Array, sig2: Uint32Array): number;
|
|
300
|
+
/**
|
|
301
|
+
* Estimate Jaccard similarity between this and another MinHash instance.
|
|
302
|
+
*/
|
|
303
|
+
estimate(other: MinHash): number;
|
|
304
|
+
}
|
|
305
|
+
//#endregion
|
|
306
|
+
//#region src/hash/lsh.d.ts
|
|
307
|
+
interface ILSHOptions {
|
|
308
|
+
/**
|
|
309
|
+
* Number of bands (rows per band = numHashes / numBands).
|
|
310
|
+
* More bands → higher recall, lower precision.
|
|
311
|
+
* @default 16
|
|
312
|
+
*/
|
|
313
|
+
numBands?: number;
|
|
314
|
+
/**
|
|
315
|
+
* Number of hash functions (must match MinHash signature size).
|
|
316
|
+
* @default 128
|
|
317
|
+
*/
|
|
318
|
+
numHashes?: number;
|
|
319
|
+
}
|
|
320
|
+
/**
|
|
321
|
+
* LSH (Locality-Sensitive Hashing) index for fast approximate nearest neighbor search.
|
|
322
|
+
*
|
|
323
|
+
* Uses the MinHash + banding technique:
|
|
324
|
+
* 1. Divide each MinHash signature into `numBands` bands
|
|
325
|
+
* 2. Hash each band to a bucket
|
|
326
|
+
* 3. Items sharing at least one bucket are candidates for similarity
|
|
327
|
+
*
|
|
328
|
+
* The probability of two items with Jaccard similarity `s` being compared is:
|
|
329
|
+
* P = 1 - (1 - s^r)^b
|
|
330
|
+
* where r = rows per band, b = numBands.
|
|
331
|
+
*
|
|
332
|
+
* @example
|
|
333
|
+
* ```ts
|
|
334
|
+
* const lsh = new LSH({ numBands: 16, numHashes: 128 });
|
|
335
|
+
*
|
|
336
|
+
* // Index documents
|
|
337
|
+
* const mh1 = new MinHash({ numHashes: 128 });
|
|
338
|
+
* mh1.update('hello');
|
|
339
|
+
* mh1.update('world');
|
|
340
|
+
* lsh.insert('doc1', mh1.digest());
|
|
341
|
+
*
|
|
342
|
+
* const mh2 = new MinHash({ numHashes: 128 });
|
|
343
|
+
* mh2.update('hello');
|
|
344
|
+
* mh2.update('earth');
|
|
345
|
+
* lsh.insert('doc2', mh2.digest());
|
|
346
|
+
*
|
|
347
|
+
* // Query for similar documents
|
|
348
|
+
* const mh3 = new MinHash({ numHashes: 128 });
|
|
349
|
+
* mh3.update('hello');
|
|
350
|
+
* mh3.update('earth');
|
|
351
|
+
* const candidates = lsh.query(mh3.digest());
|
|
352
|
+
* ```
|
|
353
|
+
*/
|
|
354
|
+
declare class LSH {
|
|
355
|
+
private readonly numBands;
|
|
356
|
+
private readonly rowsPerBand;
|
|
357
|
+
private readonly numHashes;
|
|
358
|
+
/**
|
|
359
|
+
* Map from band index → bucket hash → set of document IDs
|
|
360
|
+
*/
|
|
361
|
+
private readonly bands;
|
|
362
|
+
/**
|
|
363
|
+
* All indexed document signatures for exact similarity estimation.
|
|
364
|
+
*/
|
|
365
|
+
private readonly signatures;
|
|
366
|
+
constructor(options?: ILSHOptions);
|
|
367
|
+
/**
|
|
368
|
+
* Insert a document into the index.
|
|
369
|
+
*
|
|
370
|
+
* @param id - Document identifier
|
|
371
|
+
* @param signature - MinHash signature (from MinHash.digest())
|
|
372
|
+
*/
|
|
373
|
+
insert(id: string, signature: Uint32Array): void;
|
|
374
|
+
/**
|
|
375
|
+
* Query for candidate documents similar to the given signature.
|
|
376
|
+
*
|
|
377
|
+
* @param signature - Query MinHash signature
|
|
378
|
+
* @param threshold - Optional: minimum Jaccard similarity to return (default: return all candidates)
|
|
379
|
+
* @returns Array of [docId, estimatedJaccard] pairs, sorted by similarity descending
|
|
380
|
+
*/
|
|
381
|
+
query(signature: Uint32Array, threshold?: number): Array<[string, number]>;
|
|
382
|
+
/**
|
|
383
|
+
* Remove a document from the index.
|
|
384
|
+
*/
|
|
385
|
+
remove(id: string): boolean;
|
|
386
|
+
/**
|
|
387
|
+
* Get the number of indexed documents.
|
|
388
|
+
*/
|
|
389
|
+
get size(): number;
|
|
390
|
+
}
|
|
391
|
+
//#endregion
|
|
392
|
+
export { DiffType, type IDiffItem, type IDiffOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISimHashOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, cosine, cosineNgram, diff, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
|