@nlptools/distance 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,5 +1,392 @@
1
- export * from '@nlptools/distance-wasm';
1
+ import { DiffType, IDiffItem, IDiffOptions, ILcs, ILcsAlgorithm, diff } from "@algorithm.ts/diff";
2
2
 
3
- declare const fastest_levenshtein: (a: string, b: string) => number;
4
-
5
- export { fastest_levenshtein };
3
+ //#region src/edit/levenshtein.d.ts
4
+ /**
5
+ * Compute the Levenshtein edit distance between two strings.
6
+ *
7
+ * Time: O(m * n), Space: O(min(m, n))
8
+ *
9
+ * @param a - First string
10
+ * @param b - Second string
11
+ * @returns Edit distance (non-negative integer)
12
+ */
13
+ declare function levenshtein(a: string, b: string): number;
14
+ /**
15
+ * Compute the normalized Levenshtein similarity in [0, 1].
16
+ *
17
+ * @param a - First string
18
+ * @param b - Second string
19
+ * @returns Similarity score where 1 means identical
20
+ */
21
+ declare function levenshteinNormalized(a: string, b: string): number;
22
+ //#endregion
23
+ //#region src/edit/lcs.d.ts
24
+ type LcsSizeFunc = (N1: number, N2: number, equals: (x: number, y: number) => boolean) => number;
25
+ type LcsPairsFunc = (N1: number, N2: number, equals: (x: number, y: number) => boolean) => Array<[number, number]>;
26
+ /**
27
+ * Internal helper: create an equals callback using pre-built CharCode arrays.
28
+ * Avoids repeated string indexing inside the hot LCS loop.
29
+ */
30
+ declare function stringEquals(a: string, b: string): (x: number, y: number) => boolean;
31
+ /**
32
+ * Compute the LCS distance: len(a) + len(b) - 2 * lcsLength.
33
+ *
34
+ * @param a - First string
35
+ * @param b - Second string
36
+ * @param algorithm - 'myers' (default, better for sparse diffs) | 'dp'
37
+ * @returns LCS distance (non-negative integer)
38
+ */
39
+ declare function lcsDistance(a: string, b: string, algorithm?: "myers" | "dp"): number;
40
+ /**
41
+ * Compute the normalized LCS similarity in [0, 1].
42
+ *
43
+ * @param a - First string
44
+ * @param b - Second string
45
+ * @param algorithm - 'myers' | 'dp'
46
+ * @returns Similarity score where 1 means identical
47
+ */
48
+ declare function lcsNormalized(a: string, b: string, algorithm?: "myers" | "dp"): number;
49
+ /**
50
+ * Get the length of the Longest Common Subsequence.
51
+ *
52
+ * @param a - First string
53
+ * @param b - Second string
54
+ * @param algorithm - 'myers' | 'dp'
55
+ * @returns LCS length
56
+ */
57
+ declare function lcsLength(a: string, b: string, algorithm?: "myers" | "dp"): number;
58
+ /**
59
+ * Get the matching index pairs of the Longest Common Subsequence.
60
+ *
61
+ * @param a - First string
62
+ * @param b - Second string
63
+ * @param algorithm - 'myers' | 'dp'
64
+ * @returns Array of [indexInA, indexInB] pairs
65
+ */
66
+ declare function lcsPairs(a: string, b: string, algorithm?: "myers" | "dp"): Array<[number, number]>;
67
+ //#endregion
68
+ //#region src/token/jaccard.d.ts
69
+ /**
70
+ * Jaccard similarity between two strings based on character-level multiset.
71
+ *
72
+ * J(A, B) = |A ∩ B| / |A ∪ B|
73
+ *
74
+ * Uses Counter (frequency map) for multiset semantics,
75
+ * matching the textdistance crate behavior.
76
+ *
77
+ * Time: O(m + n)
78
+ *
79
+ * @param a - First string
80
+ * @param b - Second string
81
+ * @returns Jaccard similarity in [0, 1]
82
+ */
83
+ declare function jaccard(a: string, b: string): number;
84
+ /**
85
+ * Jaccard similarity based on character n-grams (bigrams by default).
86
+ *
87
+ * @param a - First string
88
+ * @param b - Second string
89
+ * @param n - N-gram size (default: 2)
90
+ * @returns N-gram Jaccard similarity in [0, 1]
91
+ */
92
+ declare function jaccardNgram(a: string, b: string, n?: number): number;
93
+ //#endregion
94
+ //#region src/token/cosine.d.ts
95
+ /**
96
+ * Cosine similarity between two strings based on character-level multiset.
97
+ *
98
+ * cos(A, B) = (A · B) / (|A| * |B|)
99
+ *
100
+ * Uses Counter (frequency map) for multiset semantics,
101
+ * matching the textdistance crate behavior.
102
+ *
103
+ * Time: O(m + n)
104
+ *
105
+ * @param a - First string
106
+ * @param b - Second string
107
+ * @returns Cosine similarity in [0, 1]
108
+ */
109
+ declare function cosine(a: string, b: string): number;
110
+ /**
111
+ * Cosine similarity based on character n-grams.
112
+ *
113
+ * @param a - First string
114
+ * @param b - Second string
115
+ * @param n - N-gram size (default: 2)
116
+ * @returns N-gram Cosine similarity in [0, 1]
117
+ */
118
+ declare function cosineNgram(a: string, b: string, n?: number): number;
119
+ //#endregion
120
+ //#region src/token/sorensen.d.ts
121
+ /**
122
+ * Sørensen-Dice coefficient between two strings based on character-level multiset.
123
+ *
124
+ * DSC(A, B) = 2 * |A ∩ B| / (|A| + |B|)
125
+ *
126
+ * Uses Counter (frequency map) for multiset semantics,
127
+ * matching the textdistance crate behavior.
128
+ *
129
+ * Time: O(m + n)
130
+ *
131
+ * @param a - First string
132
+ * @param b - Second string
133
+ * @returns Sørensen-Dice coefficient in [0, 1]
134
+ */
135
+ declare function sorensen(a: string, b: string): number;
136
+ /**
137
+ * Sørensen-Dice coefficient based on character n-grams.
138
+ *
139
+ * @param a - First string
140
+ * @param b - Second string
141
+ * @param n - N-gram size (default: 2)
142
+ * @returns N-gram Sørensen-Dice coefficient in [0, 1]
143
+ */
144
+ declare function sorensenNgram(a: string, b: string, n?: number): number;
145
+ //#endregion
146
+ //#region src/hash/simhash.d.ts
147
+ interface ISimHashOptions {
148
+ /**
149
+ * Bit length of the fingerprint.
150
+ * @default 64
151
+ */
152
+ bits?: number;
153
+ /**
154
+ * Hash function to use for feature hashing.
155
+ * Defaults to a built-in FNV-1a implementation.
156
+ */
157
+ hashFn?: (feature: string) => number;
158
+ }
159
+ /**
160
+ * Generate a SimHash fingerprint (64 bits by default) for a collection of features.
161
+ *
162
+ * SimHash maps a set of features to a fixed-length binary fingerprint such that
163
+ * similar documents produce similar fingerprints. The similarity between two
164
+ * fingerprints is measured by Hamming distance.
165
+ *
166
+ * Algorithm:
167
+ * 1. Initialize a vector V of length `bits` to all zeros
168
+ * 2. For each feature, compute its hash and inspect each bit i of that hash
169
+ * 3. For each bit position i: if hash[i] = 1, V[i] += weight; else V[i] -= weight
170
+ * 4. The final fingerprint: bit i = 1 if V[i] > 0, else 0
171
+ *
172
+ * Time: O(features * bits)
173
+ *
174
+ * @param features - Array of feature strings (e.g., words, n-grams, shingles)
175
+ * @param options - Configuration
176
+ * @returns Fingerprint of `bits` bits (64 by default) as a bigint
177
+ */
178
+ declare function simhash(features: string[], options?: ISimHashOptions): bigint;
179
+ /**
180
+ * Compute the Hamming distance between two SimHash fingerprints.
181
+ *
182
+ * The Hamming distance is the number of differing bits.
183
+ * For 64-bit fingerprints, a distance ≤ 3 typically indicates near-duplicate content.
184
+ *
185
+ * Time: O(bits)
186
+ *
187
+ * @param a - First fingerprint
188
+ * @param b - Second fingerprint
189
+ * @returns Hamming distance (non-negative integer)
190
+ */
191
+ declare function hammingDistance(a: bigint, b: bigint): number;
192
+ /**
193
+ * Compute normalized Hamming similarity in [0, 1].
194
+ *
195
+ * @param a - First fingerprint
196
+ * @param b - Second fingerprint
197
+ * @param bits - Bit length of the fingerprints (default: 64)
198
+ */
199
+ declare function hammingSimilarity(a: bigint, b: bigint, bits?: number): number;
200
+ /**
201
+ * SimHasher class for convenient document fingerprinting.
202
+ *
203
+ * @example
204
+ * ```ts
205
+ * const hasher = new SimHasher();
206
+ * const fp1 = hasher.hash(['hello', 'world']);
207
+ * const fp2 = hasher.hash(['hello', 'earth']);
208
+ * console.log(hasher.distance(fp1, fp2)); // small number = similar
209
+ * ```
210
+ */
211
+ declare class SimHasher {
212
+ private readonly bits;
213
+ private readonly hashFn;
214
+ constructor(options?: ISimHashOptions);
215
+ /**
216
+ * Generate a fingerprint from features.
217
+ */
218
+ hash(features: string[]): bigint;
219
+ /**
220
+ * Compute Hamming distance between two fingerprints.
221
+ */
222
+ distance(a: bigint, b: bigint): number;
223
+ /**
224
+ * Compute similarity between two fingerprints.
225
+ */
226
+ similarity(a: bigint, b: bigint): number;
227
+ /**
228
+ * Check if two fingerprints are likely near-duplicates.
229
+ *
230
+ * @param threshold - Maximum Hamming distance to consider as duplicate (default: 3)
231
+ */
232
+ isDuplicate(a: bigint, b: bigint, threshold?: number): boolean;
233
+ }
234
+ //#endregion
235
+ //#region src/hash/minhash.d.ts
236
+ interface IMinHashOptions {
237
+ /**
238
+ * Number of hash functions / signature size.
239
+ * Larger values give more accurate Jaccard estimates.
240
+ * @default 128
241
+ */
242
+ numHashes?: number;
243
+ /**
244
+ * Seed for the random number generator.
245
+ * @default 42
246
+ */
247
+ seed?: number;
248
+ }
249
+ /**
250
+ * MinHash estimator for Jaccard similarity.
251
+ *
252
+ * Instead of computing the exact Jaccard index (which requires set intersection/union
253
+ * on potentially large sets), MinHash generates a fixed-size signature for each set.
254
+ * The Jaccard similarity is then estimated by comparing the fraction of matching
255
+ * positions in the signatures.
256
+ *
257
+ * Time:
258
+ * - Update: O(k) per element, where k = numHashes
259
+ * - Estimate: O(k)
260
+ *
261
+ * @example
262
+ * ```ts
263
+ * const mh = new MinHash({ numHashes: 128 });
264
+ * mh.update('hello');
265
+ * mh.update('world');
266
+ * const sig1 = mh.digest();
267
+ *
268
+ * const mh2 = new MinHash({ numHashes: 128 });
269
+ * mh2.update('hello');
270
+ * mh2.update('earth');
271
+ * const sig2 = mh2.digest();
272
+ *
273
+ * console.log(MinHash.estimate(sig1, sig2)); // ~0.33 (exact Jaccard of {hello, world} vs {hello, earth} is 1/3)
274
+ * ```
275
+ */
276
+ declare class MinHash {
277
+ private readonly numHashes;
278
+ private readonly hashParams;
279
+ private readonly maxHash;
280
+ private signature;
281
+ private dirty;
282
+ constructor(options?: IMinHashOptions);
283
+ /**
284
+ * Add a feature to the set.
285
+ */
286
+ update(feature: string): void;
287
+ /**
288
+ * Get the MinHash signature.
289
+ * The signature is a fixed-size array that represents the set.
290
+ */
291
+ digest(): Uint32Array;
292
+ /**
293
+ * Estimate Jaccard similarity between two MinHash signatures.
294
+ *
295
+ * @param sig1 - First MinHash signature
296
+ * @param sig2 - Second MinHash signature
297
+ * @returns Estimated Jaccard similarity in [0, 1]
298
+ */
299
+ static estimate(sig1: Uint32Array, sig2: Uint32Array): number;
300
+ /**
301
+ * Estimate Jaccard similarity between this and another MinHash instance.
302
+ */
303
+ estimate(other: MinHash): number;
304
+ }
305
+ //#endregion
306
+ //#region src/hash/lsh.d.ts
307
+ interface ILSHOptions {
308
+ /**
309
+ * Number of bands (rows per band = numHashes / numBands).
310
+ * More bands → higher recall, lower precision.
311
+ * @default 16
312
+ */
313
+ numBands?: number;
314
+ /**
315
+ * Number of hash functions (must match MinHash signature size).
316
+ * @default 128
317
+ */
318
+ numHashes?: number;
319
+ }
320
+ /**
321
+ * LSH (Locality-Sensitive Hashing) index for fast approximate nearest neighbor search.
322
+ *
323
+ * Uses the MinHash + banding technique:
324
+ * 1. Divide each MinHash signature into `numBands` bands
325
+ * 2. Hash each band to a bucket
326
+ * 3. Items sharing at least one bucket are candidates for similarity
327
+ *
328
+ * The probability of two items with Jaccard similarity `s` being compared is:
329
+ * P = 1 - (1 - s^r)^b
330
+ * where r = rows per band, b = numBands.
331
+ *
332
+ * @example
333
+ * ```ts
334
+ * const lsh = new LSH({ numBands: 16, numHashes: 128 });
335
+ *
336
+ * // Index documents
337
+ * const mh1 = new MinHash({ numHashes: 128 });
338
+ * mh1.update('hello');
339
+ * mh1.update('world');
340
+ * lsh.insert('doc1', mh1.digest());
341
+ *
342
+ * const mh2 = new MinHash({ numHashes: 128 });
343
+ * mh2.update('hello');
344
+ * mh2.update('earth');
345
+ * lsh.insert('doc2', mh2.digest());
346
+ *
347
+ * // Query for similar documents
348
+ * const mh3 = new MinHash({ numHashes: 128 });
349
+ * mh3.update('hello');
350
+ * mh3.update('earth');
351
+ * const candidates = lsh.query(mh3.digest());
352
+ * ```
353
+ */
354
+ declare class LSH {
355
+ private readonly numBands;
356
+ private readonly rowsPerBand;
357
+ private readonly numHashes;
358
+ /**
359
+ * Map from band index → bucket hash → set of document IDs
360
+ */
361
+ private readonly bands;
362
+ /**
363
+ * All indexed document signatures, retained for similarity estimation.
364
+ */
365
+ private readonly signatures;
366
+ constructor(options?: ILSHOptions);
367
+ /**
368
+ * Insert a document into the index.
369
+ *
370
+ * @param id - Document identifier
371
+ * @param signature - MinHash signature (from MinHash.digest())
372
+ */
373
+ insert(id: string, signature: Uint32Array): void;
374
+ /**
375
+ * Query for candidate documents similar to the given signature.
376
+ *
377
+ * @param signature - Query MinHash signature
378
+ * @param threshold - Optional: minimum Jaccard similarity to return (default: return all candidates)
379
+ * @returns Array of [docId, estimatedJaccard] pairs, sorted by similarity descending
380
+ */
381
+ query(signature: Uint32Array, threshold?: number): Array<[string, number]>;
382
+ /**
383
+ * Remove a document from the index.
384
+ */
385
+ remove(id: string): boolean;
386
+ /**
387
+ * Get the number of indexed documents.
388
+ */
389
+ get size(): number;
390
+ }
391
+ //#endregion
392
+ export { DiffType, type IDiffItem, type IDiffOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISimHashOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, cosine, cosineNgram, diff, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };