@nlptools/distance 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -31
- package/dist/index.d.mts +707 -4
- package/dist/index.mjs +959 -54
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -53,6 +53,8 @@ function frequencyMap(tokens) {
|
|
|
53
53
|
function charFrequencyMap(str) {
|
|
54
54
|
return frequencyMap(str);
|
|
55
55
|
}
|
|
56
|
+
/** Size of the ASCII frequency array (covers charCode 0-127). */
|
|
57
|
+
const CHAR_FREQ_SIZE = 128;
|
|
56
58
|
/**
|
|
57
59
|
* Build a character frequency array from a string.
|
|
58
60
|
* Returns false if any character is non-ASCII (charCode >= 128).
|
|
@@ -148,6 +150,13 @@ function fnv1a(str) {
|
|
|
148
150
|
}
|
|
149
151
|
return hash >>> 0;
|
|
150
152
|
}
|
|
/**
 * Combine two hashes into one (for generating multiple independent hash values).
 */
function combineHash(a, b) {
	// 2654435769 === 0x9E3779B9, the golden-ratio mixing constant used by
	// hash_combine-style mixers; shifts spread the bits of `a` before XOR.
	const mixed = a ^ (b + 2654435769 + (a << 6) + (a >>> 2));
	return mixed >>> 0;
}
|
|
151
160
|
//#endregion
|
|
152
161
|
//#region src/edit/levenshtein.ts
|
|
153
162
|
/**
|
|
@@ -206,7 +215,9 @@ function lcsDistance(a, b, algorithm = "myers") {
|
|
|
206
215
|
* @returns Similarity score where 1 means identical
|
|
207
216
|
*/
|
|
208
217
|
function lcsNormalized(a, b, algorithm = "myers") {
	// Identical empty strings are defined as fully similar.
	const longest = Math.max(a.length, b.length);
	return longest === 0 ? 1 : lcsLength(a, b, algorithm) / longest;
}
|
|
211
222
|
/**
|
|
212
223
|
* Get the length of the Longest Common Subsequence.
|
|
@@ -231,6 +242,458 @@ function lcsPairs(a, b, algorithm = "myers") {
|
|
|
231
242
|
return (algorithm === "dp" ? lcs_dp : lcs_myers_linear_space)(a.length, b.length, stringEquals(a, b));
|
|
232
243
|
}
|
|
233
244
|
//#endregion
|
|
245
|
+
//#region src/edit/jaro.ts
/**
 * Jaro and Jaro-Winkler similarity algorithms.
 *
 * Jaro measures similarity between two strings by considering matching characters
 * and transpositions. Jaro-Winkler extends Jaro with a prefix bonus.
 *
 * Time: O(m * n)
 */
/**
 * Compute Jaro similarity between two strings.
 *
 * J(S1, S2) = (1/3) * (m/|S1| + m/|S2| + (m - t/2) / m)
 *
 * where m = number of matching characters (within window),
 * t = number of transpositions among matching characters.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Jaro similarity in [0, 1]
 */
function jaro(a, b) {
	const aLen = a.length;
	const bLen = b.length;
	if (aLen === 0 && bLen === 0) return 1;
	if (aLen === 0 || bLen === 0) return 0;
	// Fixed: clamp the match window at 0. The unclamped value is -1 when the
	// longer string has length 1, which made jaro("a", "a") return 0 instead
	// of 1; a window of 0 still allows a match at the same index.
	const matchDistance = Math.max(0, Math.floor(Math.max(aLen, bLen) / 2) - 1);
	const aMatches = new Uint8Array(aLen);
	const bMatches = new Uint8Array(bLen);
	let matches = 0;
	let transpositions = 0;
	// Pass 1: greedily pair each character of `a` with the first unclaimed
	// equal character of `b` inside the window.
	for (let i = 0; i < aLen; i++) {
		const start = Math.max(0, i - matchDistance);
		const end = Math.min(i + matchDistance + 1, bLen);
		for (let j = start; j < end; j++) {
			if (bMatches[j] || a.charCodeAt(i) !== b.charCodeAt(j)) continue;
			aMatches[i] = 1;
			bMatches[j] = 1;
			matches++;
			break;
		}
	}
	if (matches === 0) return 0;
	// Pass 2: walk the matched characters in order on both sides; each
	// out-of-order pair counts toward transpositions (halved in the formula).
	let k = 0;
	for (let i = 0; i < aLen; i++) {
		if (!aMatches[i]) continue;
		while (!bMatches[k]) k++;
		if (a.charCodeAt(i) !== b.charCodeAt(k)) transpositions++;
		k++;
	}
	return (matches / aLen + matches / bLen + (matches - transpositions / 2) / matches) / 3;
}
|
|
298
|
+
/**
 * Compute Jaro-Winkler similarity between two strings.
 *
 * JW(S1, S2) = Jaro(S1, S2) + l * p * (1 - Jaro(S1, S2))
 *
 * where l = length of common prefix (up to maxPrefix),
 * p = prefixWeight.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - Configuration
 * @returns Jaro-Winkler similarity in [0, 1]
 */
function jaroWinkler(a, b, options = {}) {
	const weight = options.prefixWeight ?? 0.1;
	const cap = options.maxPrefix ?? 4;
	const base = jaro(a, b);
	// Count the shared leading characters, capped at `maxPrefix`.
	const limit = Math.min(a.length, b.length, cap);
	let prefixLen = 0;
	while (prefixLen < limit && a.charCodeAt(prefixLen) === b.charCodeAt(prefixLen)) {
		prefixLen++;
	}
	return base + prefixLen * weight * (1 - base);
}
|
|
320
|
+
//#endregion
|
|
321
|
+
//#region src/edit/damerau.ts
/**
 * Damerau-Levenshtein distance (unrestricted variant).
 *
 * Extension of Levenshtein that allows transpositions of adjacent characters,
 * even when substrings are edited multiple times.
 *
 * Matches the default behavior of textdistance.rs (restricted = false).
 *
 * Time: O(m * n), Space: O(m * n)
 */
/**
 * Compute the Damerau-Levenshtein distance between two strings.
 *
 * Allows insertions, deletions, substitutions, and transpositions of
 * adjacent characters. This is the unrestricted variant, which correctly
 * handles cases where a substring is edited more than once.
 *
 * Implementation notes: uses the classic (aLen+2) x (bLen+2) matrix,
 * flattened into a single Uint32Array with row stride `w`. Row/column 0 are
 * a sentinel border filled with `maxDist` so transposition lookups before
 * any occurrence of a character resolve to a value that never wins the min.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Edit distance (non-negative integer)
 */
function damerauLevenshtein(a, b) {
	const aLen = a.length;
	const bLen = b.length;
	if (aLen === 0) return bLen;
	if (bLen === 0) return aLen;
	// Upper bound on any edit distance; used as the sentinel border value.
	const maxDist = aLen + bLen;
	// Row stride of the flattened (aLen+2) x (bLen+2) matrix.
	const w = bLen + 2;
	const mat = new Uint32Array((aLen + 2) * w);
	mat[0] = maxDist;
	// Sentinel column (index 0) plus the standard first column 0..aLen.
	for (let i = 0; i <= aLen; i++) {
		mat[(i + 1) * w] = maxDist;
		mat[(i + 1) * w + 1] = i;
	}
	// Sentinel row (index 0) plus the standard first row 0..bLen.
	for (let j = 0; j <= bLen; j++) {
		mat[j + 1] = maxDist;
		mat[w + j + 1] = j;
	}
	// lastSeen: char code -> 1-based row index of its last occurrence in `a`.
	const lastSeen = /* @__PURE__ */ new Map();
	for (let i = 0; i < aLen; i++) {
		// db: 1-based column of the last position in `b` (within this row)
		// where b matched the current `a` character.
		let db = 0;
		const aChar = a.charCodeAt(i);
		const i1 = i + 1;
		for (let j = 0; j < bLen; j++) {
			const j1 = j + 1;
			const bChar = b.charCodeAt(j);
			// Last row where `a` contained the current `b` character (0 = never).
			const last = lastSeen.get(bChar) ?? 0;
			const subCost = aChar === bChar ? 0 : 1;
			const base = (i1 + 1) * w + j1 + 1;
			const sub = mat[i1 * w + j1] + subCost;
			const del = mat[(i1 + 1) * w + j1] + 1;
			const ins = mat[i1 * w + j1 + 1] + 1;
			// Transposition: cost from mat[last][db] plus the characters skipped
			// on each side ((i1 - last - 1) + 1 + (j1 - db - 1)), folded into one
			// expression. The sentinel border makes this harmless when last/db = 0.
			const trans = mat[last * w + db] + i1 + j1 - 2 + 1 - last - db;
			mat[base] = Math.min(sub, del, ins, trans);
			if (aChar === bChar) db = j1;
		}
		lastSeen.set(aChar, i1);
	}
	return mat[(aLen + 1) * w + bLen + 1];
}
|
|
382
|
+
/**
 * Compute the normalized Damerau-Levenshtein similarity in [0, 1].
 *
 * @param a - First string
 * @param b - Second string
 * @returns Similarity score where 1 means identical
 */
function damerauLevenshteinNormalized(a, b) {
	const longest = Math.max(a.length, b.length);
	return normalize(damerauLevenshtein(a, b), longest);
}
|
|
392
|
+
//#endregion
|
|
393
|
+
//#region src/edit/hamming.ts
/**
 * Hamming distance — counts character mismatches between equal-length strings.
 *
 * Time: O(min(m, n))
 */
/**
 * Compute the Hamming distance between two strings.
 *
 * If strings have different lengths, only compares up to the shorter length
 * and adds the length difference as additional mismatches.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Number of mismatching characters
 */
function hamming(a, b) {
	const shorter = Math.min(a.length, b.length);
	// Every character beyond the shorter string counts as a mismatch.
	let mismatches = Math.abs(a.length - b.length);
	for (let i = 0; i < shorter; i++) {
		if (a.charCodeAt(i) !== b.charCodeAt(i)) mismatches++;
	}
	return mismatches;
}
|
|
415
|
+
/**
 * Compute the normalized Hamming similarity in [0, 1].
 *
 * @param a - First string
 * @param b - Second string
 * @returns Similarity score where 1 means identical
 */
function hammingNormalized(a, b) {
	const longest = Math.max(a.length, b.length);
	if (longest === 0) return 1;
	return 1 - hamming(a, b) / longest;
}
|
|
426
|
+
//#endregion
|
|
427
|
+
//#region src/edit/lcs-str.ts
/**
 * Longest Common Substring (contiguous) algorithms.
 *
 * Unlike LCS (subsequence), this requires the matching characters to be contiguous.
 *
 * Time: O(m * n), Space: O(min(m, n))
 */
/**
 * Compute the length of the Longest Common Substring.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Length of the longest common substring
 */
function lcsSubstringLength(a, b) {
	const m = a.length;
	const n = b.length;
	if (m === 0 || n === 0) return 0;
	// Single-row DP: row[j] = length of the common substring ending at
	// a[i-1] / b[j-1]; `diag` carries the previous row's row[j-1].
	const row = new Uint32Array(n + 1);
	let best = 0;
	for (let i = 1; i <= m; i++) {
		let diag = 0;
		for (let j = 1; j <= n; j++) {
			const saved = row[j];
			row[j] = a.charCodeAt(i - 1) === b.charCodeAt(j - 1) ? diag + 1 : 0;
			if (row[j] > best) best = row[j];
			diag = saved;
		}
	}
	return best;
}
|
|
461
|
+
/**
 * Compute the LCS substring distance: len(a) + len(b) - 2 * lcsSubstringLength.
 *
 * @param a - First string
 * @param b - Second string
 * @returns LCS substring distance (non-negative integer)
 */
function lcsSubstringDistance(a, b) {
	const common = lcsSubstringLength(a, b);
	return a.length + b.length - 2 * common;
}
|
|
471
|
+
/**
 * Compute the normalized LCS substring similarity in [0, 1].
 *
 * Normalized by max(len(a), len(b)) to match textdistance.rs convention.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Similarity score where 1 means identical
 */
function lcsSubstringNormalized(a, b) {
	const longest = Math.max(a.length, b.length);
	return longest === 0 ? 1 : lcsSubstringLength(a, b) / longest;
}
|
|
485
|
+
//#endregion
|
|
486
|
+
//#region src/edit/sift4.ts
/**
 * SIFT4 simple — fast approximate string distance.
 *
 * A fast algorithm for approximate string matching with O(n) complexity
 * in typical cases. Returns a distance value (lower = more similar).
 *
 * Matches the textdistance.rs implementation exactly.
 *
 * Time: O(n * maxOffset)
 */
/**
 * Compute the SIFT4 simple distance between two strings.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - Configuration
 * @returns Distance (non-negative integer)
 */
function sift4(a, b, options = {}) {
	const maxOffset = options.maxOffset ?? 5;
	const aLen = a.length;
	const bLen = b.length;
	let c1 = 0;
	let c2 = 0;
	let lcss = 0;
	let localCs = 0;
	while (c1 < aLen && c2 < bLen) {
		if (a.charCodeAt(c1) === b.charCodeAt(c2)) localCs++;
		else {
			// Mismatch: bank the current common-substring run and look ahead.
			lcss += localCs;
			localCs = 0;
			if (c1 !== c2) {
				c1 = Math.min(c1, c2);
				c2 = c1;
			}
			for (let offset = 0; offset < maxOffset; offset++) {
				// Fixed: the break test previously read `c1 + 1 < aLen` instead of
				// the canonical `c1 + offset < aLen`; results were unchanged (the
				// guarded lookups below can never match once both cursors are out
				// of range), but the loop kept spinning through dead iterations.
				if (!(c1 + offset < aLen || c2 + offset < bLen)) break;
				if (c1 + offset < aLen && a.charCodeAt(c1 + offset) === b.charCodeAt(c2)) {
					c1 += offset;
					localCs++;
					break;
				}
				if (c2 + offset < bLen && a.charCodeAt(c1) === b.charCodeAt(c2 + offset)) {
					c2 += offset;
					localCs++;
					break;
				}
			}
		}
		c1++;
		c2++;
	}
	return Math.max(aLen, bLen) - lcss - localCs;
}
|
|
541
|
+
/**
 * Compute the normalized SIFT4 similarity in [0, 1].
 *
 * @param a - First string
 * @param b - Second string
 * @param options - Configuration
 * @returns Similarity score where 1 means identical
 */
function sift4Normalized(a, b, options = {}) {
	const longest = Math.max(a.length, b.length);
	return normalize(sift4(a, b, options), longest);
}
|
|
552
|
+
//#endregion
|
|
553
|
+
//#region src/edit/ratcliff.ts
/**
 * Ratcliff-Obershelp algorithm — Gestalt pattern matching.
 *
 * Iteratively finds the longest common substring using a stack-based approach,
 * combining scores from both sides. Returns a similarity in [0, 1].
 *
 * Based on the textdistance.rs implementation.
 *
 * Time: O(n * m * log(n * m)) worst case, O(n + m) average
 */
/**
 * Internal: find the longest common substring and return its length and positions.
 */
function findLCS(a, b) {
	const m = a.length;
	const n = b.length;
	if (m === 0 || n === 0) {
		return {
			len: 0,
			aIdx: 0,
			bIdx: 0
		};
	}
	// Single-row DP over b; track where the best run ends in both strings.
	const row = new Uint32Array(n + 1);
	let best = 0;
	let bestEndA = 0;
	let bestEndB = 0;
	for (let i = 1; i <= m; i++) {
		let diag = 0;
		for (let j = 1; j <= n; j++) {
			const saved = row[j];
			if (a.charCodeAt(i - 1) === b.charCodeAt(j - 1)) {
				row[j] = diag + 1;
				if (row[j] > best) {
					best = row[j];
					bestEndA = i;
					bestEndB = j;
				}
			} else {
				row[j] = 0;
			}
			diag = saved;
		}
	}
	return {
		len: best,
		aIdx: bestEndA - best,
		bIdx: bestEndB - best
	};
}
|
|
600
|
+
/**
 * Compute Ratcliff-Obershelp similarity between two strings.
 *
 * Uses an iterative stack-based approach to avoid stack overflow on
 * very different strings. The algorithm recursively finds the longest
 * common substring and combines similarity scores from both sides.
 *
 * similarity = 2 * M / T, where M = total matched characters, T = total characters
 *
 * @param a - First string
 * @param b - Second string
 * @returns Ratcliff-Obershelp similarity in [0, 1]
 */
function ratcliff(a, b) {
	if (a === b) return 1;
	const combined = a.length + b.length;
	if (combined === 0) return 1;
	let matched = 0;
	// Each entry is a pair of half-open ranges [aLo, aHi) / [bLo, bHi).
	const pending = [[0, a.length, 0, b.length]];
	while (pending.length > 0) {
		const [aLo, aHi, bLo, bHi] = pending.pop();
		if (aHi === aLo || bHi === bLo) continue;
		const common = findLCS(a.slice(aLo, aHi), b.slice(bLo, bHi));
		if (common.len === 0) continue;
		matched += common.len;
		// Recurse (iteratively) into the regions right and left of the match.
		const aMid = aLo + common.aIdx;
		const bMid = bLo + common.bIdx;
		const aAfter = aMid + common.len;
		const bAfter = bMid + common.len;
		if (aHi > aAfter && bHi > bAfter) {
			pending.push([aAfter, aHi, bAfter, bHi]);
		}
		if (aMid > aLo && bMid > bLo) {
			pending.push([aLo, aMid, bLo, bMid]);
		}
	}
	return 2 * matched / combined;
}
|
|
649
|
+
//#endregion
|
|
650
|
+
//#region src/edit/smith-waterman.ts
/**
 * Compute the raw Smith-Waterman alignment score.
 *
 * NOTE(review): this returns the score of the bottom-right cell, not the
 * matrix maximum as classic Smith-Waterman does, and scores are stored in an
 * Int16Array — confirm both choices match the intended textdistance.rs parity
 * before changing.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - Scoring parameters
 * @returns Raw alignment score (non-negative)
 */
function smithWaterman(a, b, options = {}) {
	const match = options.matchScore ?? 1;
	const mismatch = options.mismatchScore ?? 0;
	const gap = options.gapScore ?? -1;
	const rows = a.length;
	const cols = b.length;
	const stride = cols + 1;
	// Typed arrays are zero-initialized, which doubles as the DP base case.
	const dp = new Int16Array((rows + 1) * stride);
	for (let i = 1; i <= rows; i++) {
		const cur = i * stride;
		const prev = cur - stride;
		const codeA = a.charCodeAt(i - 1);
		for (let j = 1; j <= cols; j++) {
			const pair = codeA === b.charCodeAt(j - 1) ? match : mismatch;
			dp[cur + j] = Math.max(
				0,
				dp[prev + j - 1] + pair,
				dp[prev + j] + gap,
				dp[cur + j - 1] + gap
			);
		}
	}
	return dp[rows * stride + cols];
}
|
|
681
|
+
/**
 * Compute the normalized Smith-Waterman similarity in [0, 1].
 *
 * Normalized by matchScore * max(len(a), len(b)), matching textdistance.rs convention.
 *
 * @param a - First string
 * @param b - Second string
 * @param options - Scoring parameters
 * @returns Normalized similarity in [0, 1]
 */
function smithWatermanNormalized(a, b, options = {}) {
	const match = options.matchScore ?? 1;
	const ceiling = match * Math.max(a.length, b.length);
	// Two empty strings (or a zero match score) are treated as identical.
	if (ceiling === 0) return 1;
	return smithWaterman(a, b, options) / ceiling;
}
|
|
696
|
+
//#endregion
|
|
234
697
|
//#region src/token/jaccard.ts
|
|
235
698
|
const _freqA$2 = new Int32Array(128);
|
|
236
699
|
const _freqB$2 = new Int32Array(128);
|
|
@@ -299,10 +762,10 @@ const _freqB$1 = new Int32Array(128);
|
|
|
299
762
|
/**
|
|
300
763
|
* Cosine similarity between two strings based on character-level multiset.
|
|
301
764
|
*
|
|
302
|
-
*
|
|
765
|
+
* Uses textdistance.rs convention:
|
|
766
|
+
* cosine(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
|
|
303
767
|
*
|
|
304
|
-
*
|
|
305
|
-
* matching the textdistance crate behavior.
|
|
768
|
+
* Where intersect_count = sum(min(freqA[c], freqB[c])) and count = sum of frequencies.
|
|
306
769
|
*
|
|
307
770
|
* Time: O(m + n)
|
|
308
771
|
*
|
|
@@ -314,43 +777,49 @@ function cosine(a, b) {
|
|
|
314
777
|
_freqA$1.fill(0);
|
|
315
778
|
_freqB$1.fill(0);
|
|
316
779
|
if (buildCharFreqArray(_freqA$1, a) && buildCharFreqArray(_freqB$1, b)) {
|
|
317
|
-
let
|
|
318
|
-
let
|
|
319
|
-
let
|
|
780
|
+
let intersection = 0;
|
|
781
|
+
let totalA = 0;
|
|
782
|
+
let totalB = 0;
|
|
320
783
|
for (let i = 0; i < 128; i++) {
|
|
321
784
|
const va = _freqA$1[i];
|
|
322
785
|
const vb = _freqB$1[i];
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
786
|
+
intersection += va < vb ? va : vb;
|
|
787
|
+
totalA += va;
|
|
788
|
+
totalB += vb;
|
|
326
789
|
}
|
|
327
|
-
|
|
328
|
-
|
|
790
|
+
if (totalA === 0 && totalB === 0) return 1;
|
|
791
|
+
if (totalA === 0 || totalB === 0) return 0;
|
|
792
|
+
return intersection / Math.sqrt(totalA * totalB);
|
|
329
793
|
}
|
|
330
794
|
const freqAMap = charFrequencyMap(a);
|
|
331
795
|
const freqBMap = charFrequencyMap(b);
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
const
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
796
|
+
const intersection = intersectCount$1(freqAMap, freqBMap);
|
|
797
|
+
const totalA = totalCount$1(freqAMap);
|
|
798
|
+
const totalB = totalCount$1(freqBMap);
|
|
799
|
+
if (totalA === 0 && totalB === 0) return 1;
|
|
800
|
+
if (totalA === 0 || totalB === 0) return 0;
|
|
801
|
+
return intersection / Math.sqrt(totalA * totalB);
|
|
802
|
+
}
|
|
803
|
+
// Multiset intersection size: sum of min(countA, countB) over shared keys.
function intersectCount$1(a, b) {
	// Iterate the smaller map and probe the larger one.
	const smaller = a.size <= b.size ? a : b;
	const larger = smaller === a ? b : a;
	let total = 0;
	for (const [key, n] of smaller) {
		const m = larger.get(key);
		if (m !== void 0) total += n < m ? n : m;
	}
	return total;
}
|
|
812
|
+
// Multiset cardinality: sum of all frequencies in the map.
function totalCount$1(map) {
	let sum = 0;
	map.forEach((n) => {
		sum += n;
	});
	return sum;
}
|
|
351
817
|
/**
|
|
352
818
|
* Cosine similarity based on character n-grams.
|
|
353
819
|
*
|
|
820
|
+
* Uses textdistance.rs convention (same as character-level cosine but on n-grams):
|
|
821
|
+
* cosine_ngram(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
|
|
822
|
+
*
|
|
354
823
|
* @param a - First string
|
|
355
824
|
* @param b - Second string
|
|
356
825
|
* @param n - N-gram size (default: 2)
|
|
@@ -360,32 +829,21 @@ function cosineNgram(a, b, n = 2) {
|
|
|
360
829
|
const freqAInt = ngramFrequencyMap(a, n);
|
|
361
830
|
const freqBInt = ngramFrequencyMap(b, n);
|
|
362
831
|
if (freqAInt !== null && freqBInt !== null) {
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
normA += countA * countA;
|
|
370
|
-
}
|
|
371
|
-
for (const [, count] of freqBInt) normB += count * count;
|
|
372
|
-
const denominator = Math.sqrt(normA) * Math.sqrt(normB);
|
|
373
|
-
return denominator === 0 ? 1 : dotProduct / denominator;
|
|
832
|
+
const intersection = intersectCountInt(freqAInt, freqBInt);
|
|
833
|
+
const totalA = totalCountInt(freqAInt);
|
|
834
|
+
const totalB = totalCountInt(freqBInt);
|
|
835
|
+
if (totalA === 0 && totalB === 0) return 1;
|
|
836
|
+
if (totalA === 0 || totalB === 0) return 0;
|
|
837
|
+
return intersection / Math.sqrt(totalA * totalB);
|
|
374
838
|
}
|
|
375
839
|
const freqA = frequencyMap(ngrams(a, n));
|
|
376
840
|
const freqB = frequencyMap(ngrams(b, n));
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
normA += countA * countA;
|
|
384
|
-
}
|
|
385
|
-
for (const [, count] of freqB) normB += count * count;
|
|
386
|
-
const denominator = Math.sqrt(normA) * Math.sqrt(normB);
|
|
387
|
-
if (denominator === 0) return 1;
|
|
388
|
-
return dotProduct / denominator;
|
|
841
|
+
const intersection = intersectCount$1(freqA, freqB);
|
|
842
|
+
const totalA = totalCount$1(freqA);
|
|
843
|
+
const totalB = totalCount$1(freqB);
|
|
844
|
+
if (totalA === 0 && totalB === 0) return 1;
|
|
845
|
+
if (totalA === 0 || totalB === 0) return 0;
|
|
846
|
+
return intersection / Math.sqrt(totalA * totalB);
|
|
389
847
|
}
|
|
390
848
|
//#endregion
|
|
391
849
|
//#region src/token/sorensen.ts
|
|
@@ -445,6 +903,122 @@ function sorensenNgram(a, b, n = 2) {
|
|
|
445
903
|
return 2 * ic / total;
|
|
446
904
|
}
|
|
447
905
|
//#endregion
|
|
906
|
+
//#region src/token/tversky.ts
/**
 * Tversky index — asymmetric set similarity measure.
 *
 * Reduces to Jaccard when alpha = beta = 1.
 * Reduces to Sorensen-Dice when alpha = beta = 0.5.
 *
 * Time: O(m + n)
 */
/**
 * Compute the Tversky index between two strings based on character multiset.
 *
 * T(A, B; α, β) = |A ∩ B| / (|A ∩ B| + α|A \ B| + β|B \ A|)
 *
 * @param a - First string
 * @param b - Second string
 * @param options - alpha and beta weights
 * @returns Tversky index in [0, 1]
 */
function tversky(a, b, options = {}) {
	const { alpha = 1, beta = 1 } = options;
	const histA = charFrequencyMap(a);
	const histB = charFrequencyMap(b);
	const shared = intersectCount(histA, histB);
	const sizeA = totalCount(histA);
	const sizeB = totalCount(histB);
	// |A \ B| and |B \ A| in multiset terms.
	const denom = shared + alpha * (sizeA - shared) + beta * (sizeB - shared);
	// Both sides empty -> identical by convention.
	return denom === 0 ? 1 : shared / denom;
}
|
|
939
|
+
//#endregion
|
|
940
|
+
//#region src/token/overlap.ts
/**
 * Overlap coefficient — set similarity normalized by the smaller set.
 *
 * overlap(A, B) = |A ∩ B| / min(|A|, |B|)
 *
 * Time: O(m + n)
 */
/**
 * Compute the overlap coefficient between two strings based on character multiset.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Overlap coefficient in [0, 1]
 */
function overlap(a, b) {
	const histA = charFrequencyMap(a);
	const histB = charFrequencyMap(b);
	const sizeA = totalCount(histA);
	const sizeB = totalCount(histB);
	// Both empty -> identical; one empty -> no overlap possible.
	if (sizeA === 0 && sizeB === 0) return 1;
	if (sizeA === 0 || sizeB === 0) return 0;
	return intersectCount(histA, histB) / Math.min(sizeA, sizeB);
}
|
|
966
|
+
//#endregion
|
|
967
|
+
//#region src/token/naive.ts
/**
 * Naive string similarity measures: prefix, suffix, length.
 *
 * Time: O(min(m, n)) for prefix/suffix, O(1) for length
 */
/**
 * Compute prefix similarity between two strings.
 *
 * prefix(a, b) = commonPrefixLength / max(|a|, |b|)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Prefix similarity in [0, 1]
 */
function prefix(a, b) {
	const longest = Math.max(a.length, b.length);
	if (longest === 0) return 1;
	const limit = Math.min(a.length, b.length);
	let i = 0;
	while (i < limit && a.charCodeAt(i) === b.charCodeAt(i)) i++;
	return i / longest;
}
|
|
989
|
+
/**
 * Compute suffix similarity between two strings.
 *
 * suffix(a, b) = commonSuffixLength / max(|a|, |b|)
 *
 * @param a - First string
 * @param b - Second string
 * @returns Suffix similarity in [0, 1]
 */
function suffix(a, b) {
	const longest = Math.max(a.length, b.length);
	if (longest === 0) return 1;
	const limit = Math.min(a.length, b.length);
	// Walk backwards from both ends counting equal characters.
	let count = 0;
	while (
		count < limit &&
		a.charCodeAt(a.length - 1 - count) === b.charCodeAt(b.length - 1 - count)
	) {
		count++;
	}
	return count / longest;
}
|
|
1007
|
+
/**
 * Compute length-based similarity between two strings.
 *
 * length(a, b) = 1 - |len(a) - len(b)| / max(len(a), len(b))
 *
 * @param a - First string
 * @param b - Second string
 * @returns Normalized length similarity in [0, 1]
 */
function length(a, b) {
	const longest = Math.max(a.length, b.length);
	return longest === 0 ? 1 : 1 - Math.abs(a.length - b.length) / longest;
}
|
|
1021
|
+
//#endregion
|
|
448
1022
|
//#region src/hash/simhash.ts
|
|
449
1023
|
/**
|
|
450
1024
|
* Generate a 64-bit fingerprint for a collection of features.
|
|
@@ -800,4 +1374,335 @@ function bandHash(slice) {
|
|
|
800
1374
|
return hash.toString(36);
|
|
801
1375
|
}
|
|
802
1376
|
//#endregion
|
|
803
|
-
|
|
1377
|
+
//#region src/search.ts
// Lookup table mapping algorithm names (accepted as the `algorithm` option of
// FuzzySearch) to the normalized similarity functions defined earlier in this
// bundle. Used by resolveAlgorithm below.
const BUILTIN_ALGORITHMS = {
	levenshtein: levenshteinNormalized,
	lcs: lcsNormalized,
	jaccard,
	jaccardNgram,
	cosine,
	cosineNgram,
	sorensen,
	sorensenNgram
};
|
|
1388
|
+
// Normalize the user-supplied key descriptors: attach a `normalizedWeight`
// to each so all weights sum to 1 (equal shares when every weight is 0).
function resolveKeys(rawKeys) {
	if (rawKeys.length === 0) return [];
	let totalWeight = 0;
	for (const key of rawKeys) totalWeight += key.weight ?? 1;
	const equalShare = 1 / rawKeys.length;
	return rawKeys.map((key) => ({
		...key,
		normalizedWeight: totalWeight > 0 ? (key.weight ?? 1) / totalWeight : equalShare
	}));
}
|
|
1396
|
+
// Resolve the `algorithm` option: a custom function is used as-is, a name is
// looked up in BUILTIN_ALGORITHMS, and undefined falls back to levenshtein.
function resolveAlgorithm(algo) {
	if (typeof algo === "function") return algo;
	if (algo === void 0) return BUILTIN_ALGORITHMS.levenshtein;
	return BUILTIN_ALGORITHMS[algo];
}
|
|
1401
|
+
/**
 * Fuzzy search engine for finding similar items in a collection.
 *
 * Supports both string arrays and object arrays with weighted multi-key search.
 * Uses any similarity algorithm from @nlptools/distance, with optional LSH
 * acceleration for large datasets.
 *
 * @example
 * ```ts
 * // String array search
 * const search = new FuzzySearch(["apple", "banana", "cherry"]);
 * const results = search.search("aple"); // [{ item: "apple", score: 0.75, index: 0 }]
 *
 * // Object array with weighted keys
 * const books = [
 *   { title: "Old Man's War", author: "John Scalzi" },
 *   { title: "The Lock Artist", author: "Steve Hamilton" },
 * ];
 * const bookSearch = new FuzzySearch(books, {
 *   keys: [
 *     { name: "title", weight: 0.7 },
 *     { name: "author", weight: 0.3 },
 *   ],
 *   algorithm: "cosine",
 * });
 * const results = bookSearch.search("old man"); // finds "Old Man's War"
 * ```
 */
var FuzzySearch = class {
	similarityFn;
	keys;
	threshold;
	limit;
	caseSensitive;
	includeMatchDetails;
	isObjectArray;
	collection;
	useLSH;
	lshNumHashes;
	lshNumBands;
	lshIndex;
	minHashSignatures;
	constructor(collection, options = {}) {
		this.similarityFn = resolveAlgorithm(options.algorithm);
		this.keys = resolveKeys(options.keys ?? []);
		// Keys present => object mode; otherwise items are treated as strings.
		this.isObjectArray = this.keys.length > 0;
		this.threshold = options.threshold ?? 0;
		this.limit = options.limit ?? Infinity;
		this.caseSensitive = options.caseSensitive ?? false;
		this.includeMatchDetails = options.includeMatchDetails ?? false;
		// Copy so external mutation of the caller's array cannot desync the LSH index.
		this.collection = [...collection];
		const lshOpts = options.lsh;
		this.useLSH = lshOpts !== void 0;
		this.lshNumHashes = lshOpts?.numHashes ?? 128;
		this.lshNumBands = lshOpts?.numBands ?? 16;
		this.lshIndex = null;
		this.minHashSignatures = /* @__PURE__ */ new Map();
		if (this.useLSH && this.collection.length > 0) this.buildLSHIndex();
	}
	/**
	 * Search the collection for items similar to the query.
	 *
	 * @param query - The search query string
	 * @param limit - Optional per-query limit override
	 * @returns Array of results sorted by score descending
	 */
	search(query, limit) {
		const effectiveLimit = limit ?? this.limit;
		if (effectiveLimit === 0 || this.collection.length === 0) return [];
		const normalizedQuery = this.normalizeString(query);
		if (this.useLSH && this.lshIndex !== null) return this.searchWithLSH(normalizedQuery, effectiveLimit);
		return this.searchLinear(normalizedQuery, effectiveLimit);
	}
	/**
	 * Add an item to the collection.
	 * If LSH is enabled, the index is updated incrementally.
	 */
	add(item) {
		const index = this.collection.length;
		this.collection.push(item);
		if (this.useLSH && this.lshIndex !== null) {
			const text = this.extractSearchText(item);
			const sig = this.buildMinHashSignature(text);
			this.minHashSignatures.set(index, sig);
			this.lshIndex.insert(String(index), sig);
		}
	}
	/**
	 * Remove an item from the collection by index.
	 * If LSH is enabled, the index is rebuilt (O(n)) because removal
	 * shifts the indices of all subsequent items.
	 *
	 * @returns true if the item was found and removed
	 */
	remove(index) {
		if (index < 0 || index >= this.collection.length) return false;
		this.collection.splice(index, 1);
		if (this.useLSH) this.buildLSHIndex();
		return true;
	}
	/**
	 * Replace the entire collection.
	 * If LSH is enabled, the index is rebuilt.
	 */
	setCollection(collection) {
		this.collection = [...collection];
		if (this.useLSH && this.collection.length > 0) this.buildLSHIndex();
		else if (this.useLSH) {
			this.lshIndex = null;
			this.minHashSignatures.clear();
		}
	}
	/**
	 * Get the current collection.
	 */
	getCollection() {
		return this.collection;
	}
	/**
	 * Get the number of items in the collection.
	 */
	get size() {
		return this.collection.length;
	}
	/**
	 * Clear the collection and any LSH index.
	 */
	clear() {
		this.collection = [];
		this.lshIndex = null;
		this.minHashSignatures.clear();
	}
	/** Brute-force scan: score every item in the collection. */
	searchLinear(normalizedQuery, limit) {
		const candidates = [];
		for (let i = 0; i < this.collection.length; i++) {
			const candidate = this.scoreCandidate(normalizedQuery, this.collection[i], i);
			if (candidate !== null) candidates.push(candidate);
		}
		return this.rankCandidates(candidates, limit);
	}
	/** LSH-accelerated scan: score only items whose MinHash bands collide with the query. */
	searchWithLSH(normalizedQuery, limit) {
		// BUG FIX: removed a dead ternary (`this.isObjectArray ? normalizedQuery : normalizedQuery`)
		// whose branches were identical; the query text is the normalized query either way.
		const querySig = this.buildMinHashSignature(normalizedQuery);
		const candidateIds = this.lshIndex.query(querySig, this.threshold);
		const candidates = [];
		for (const [id] of candidateIds) {
			const idx = parseInt(id, 10);
			// Guard against stale ids (e.g. after concurrent mutation).
			if (idx < 0 || idx >= this.collection.length) continue;
			const candidate = this.scoreCandidate(normalizedQuery, this.collection[idx], idx);
			if (candidate !== null) candidates.push(candidate);
		}
		return this.rankCandidates(candidates, limit);
	}
	/**
	 * Score one collection item against the normalized query.
	 * Shared by linear and LSH search paths (previously duplicated in both).
	 *
	 * @returns A result entry, or null if the score is below the threshold
	 */
	scoreCandidate(normalizedQuery, item, index) {
		if (this.isObjectArray) {
			if (this.includeMatchDetails) {
				const { score, matches } = this.computeDetailedScore(normalizedQuery, item);
				return score >= this.threshold ? {
					item,
					score,
					index,
					matches
				} : null;
			}
			const score = this.computeItemScore(normalizedQuery, item);
			return score >= this.threshold ? {
				item,
				score,
				index
			} : null;
		}
		const score = this.similarityFn(normalizedQuery, this.normalizeString(item));
		return score >= this.threshold ? {
			item,
			score,
			index
		} : null;
	}
	/** Sort candidates by score descending and apply the result limit. */
	rankCandidates(candidates, limit) {
		candidates.sort((a, b) => b.score - a.score);
		if (candidates.length <= limit) return candidates;
		return candidates.slice(0, limit);
	}
	/** Rebuild the LSH index and MinHash signatures from scratch. */
	buildLSHIndex() {
		this.lshIndex = new LSH({
			numBands: this.lshNumBands,
			numHashes: this.lshNumHashes
		});
		this.minHashSignatures.clear();
		for (let i = 0; i < this.collection.length; i++) {
			const text = this.extractSearchText(this.collection[i]);
			const sig = this.buildMinHashSignature(text);
			this.minHashSignatures.set(i, sig);
			this.lshIndex.insert(String(i), sig);
		}
	}
	/** Build a MinHash signature over the character bigrams of `text`. */
	buildMinHashSignature(text) {
		const mh = new MinHash({ numHashes: this.lshNumHashes });
		const grams = ngrams(text, 2);
		for (const g of grams) mh.update(g);
		return mh.digest();
	}
	/** Weighted sum of per-key similarities (weights are pre-normalized to sum to 1). */
	computeItemScore(normalizedQuery, item) {
		let score = 0;
		for (const key of this.keys) {
			const value = this.extractKeyValue(item, key);
			const normalizedValue = this.normalizeString(value);
			score += key.normalizedWeight * this.similarityFn(normalizedQuery, normalizedValue);
		}
		return score;
	}
	/** Like computeItemScore, but also reports each key's individual similarity. */
	computeDetailedScore(normalizedQuery, item) {
		let score = 0;
		const matches = {};
		for (const key of this.keys) {
			const value = this.extractKeyValue(item, key);
			const normalizedValue = this.normalizeString(value);
			const s = this.similarityFn(normalizedQuery, normalizedValue);
			matches[key.name] = s;
			score += key.normalizedWeight * s;
		}
		return {
			score,
			matches
		};
	}
	/** Text used for LSH signatures: joined key values for objects, the item itself for strings. */
	extractSearchText(item) {
		// BUG FIX: the object branch previously skipped normalizeString, so with
		// caseSensitive=false the LSH index was built from un-lowercased text while
		// query signatures used lowercased text, silently reducing recall.
		if (this.isObjectArray) return this.normalizeString(this.keys.map((k) => this.extractKeyValue(item, k)).join(" "));
		return this.normalizeString(item);
	}
	/** Read a key's value via its getter or property name; non-strings become "". */
	extractKeyValue(item, key) {
		if (key.getter) {
			const value = key.getter(item);
			return typeof value === "string" ? value : "";
		}
		const value = item[key.name];
		return typeof value === "string" ? value : "";
	}
	/** Lowercase unless the search is case-sensitive. */
	normalizeString(str) {
		return this.caseSensitive ? str : str.toLowerCase();
	}
};
|
|
1664
|
+
/**
 * Find the single best match for a query against a collection.
 *
 * This is a convenience wrapper around {@link FuzzySearch} for one-shot queries.
 * For repeated searches against the same collection, prefer creating a
 * {@link FuzzySearch} instance directly.
 *
 * Time: O(n * k) where n = collection size, k = number of keys
 *
 * @param query - The search query string
 * @param collection - Array of strings or objects to search
 * @param options - Search configuration
 * @returns The best matching result, or null if nothing meets the threshold
 *
 * @example
 * ```ts
 * // String array
 * const result = findBestMatch("kitten", ["sitting", "kit", "mitten"]);
 * console.log(result?.item); // "kit"
 * console.log(result?.score); // 0.5
 *
 * // Object array with weighted keys
 * const books = [
 *   { title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
 *   { title: "Great Expectations", author: "Charles Dickens" },
 * ];
 * const result = findBestMatch("grate gatsbi", books, {
 *   keys: [
 *     { name: "title", weight: 0.7 },
 *     { name: "author", weight: 0.3 },
 *   ],
 * });
 * ```
 */
function findBestMatch(query, collection, options = {}) {
	const { algorithm, keys, threshold, caseSensitive } = options;
	const searcher = new FuzzySearch(collection, {
		algorithm,
		keys,
		threshold,
		caseSensitive
	});
	// Ask for at most one result; an empty array means nothing met the threshold.
	const [best] = searcher.search(query, 1);
	return best ?? null;
}
|
|
1707
|
+
//#endregion
|
|
1708
|
+
export { CHAR_FREQ_SIZE, DiffType, FuzzySearch, LSH, MinHash, SimHasher, buildCharFreqArray, charFrequencyMap, combineHash, cosine, cosineNgram, damerauLevenshtein, damerauLevenshteinNormalized, diff, findBestMatch, fnv1a, frequencyMap, hamming, hammingDistance, hammingNormalized, hammingSimilarity, intersectCount, intersectCountInt, jaccard, jaccardNgram, jaro, jaroWinkler, lcsDistance, lcsLength, lcsNormalized, lcsPairs, lcsSubstringDistance, lcsSubstringLength, lcsSubstringNormalized, length, levenshtein, levenshteinNormalized, ngramFrequencyMap, ngrams, normalize, overlap, prefix, ratcliff, sift4, sift4Normalized, simhash, smithWaterman, smithWatermanNormalized, sorensen, sorensenNgram, stringEquals, suffix, totalCount, totalCountInt, tversky, unionCount, unionCountInt };
|