@nlptools/distance 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +88 -8
- package/dist/index.d.mts +289 -1
- package/dist/index.mjs +332 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
- Edit distance: Levenshtein, LCS (Myers O(ND) and DP)
|
|
12
12
|
- Token similarity: Jaccard, Cosine, Sorensen-Dice (character multiset and n-gram variants)
|
|
13
13
|
- Hash-based deduplication: SimHash, MinHash, LSH
|
|
14
|
+
- Fuzzy search: `FuzzySearch` class and `findBestMatch` with multi-algorithm support
|
|
14
15
|
- Diff: based on `@algorithm.ts/diff` (Myers and DP backends)
|
|
15
16
|
- All distance algorithms include normalized similarity variants (0-1 range)
|
|
16
17
|
|
|
@@ -123,6 +124,45 @@ const query = lsh.query(mh.digest(), 0.5);
|
|
|
123
124
|
// => [["doc1", 0.67]]
|
|
124
125
|
```
|
|
125
126
|
|
|
127
|
+
### Fuzzy Search
|
|
128
|
+
|
|
129
|
+
```typescript
|
|
130
|
+
import { FuzzySearch, findBestMatch } from "@nlptools/distance";
|
|
131
|
+
|
|
132
|
+
// String array search
|
|
133
|
+
const search = new FuzzySearch(["apple", "banana", "cherry"]);
|
|
134
|
+
search.search("aple");
|
|
135
|
+
// => [{ item: "apple", score: 0.8, index: 0 }, ...] (best match first; the default threshold of 0 keeps every item)
|
|
136
|
+
|
|
137
|
+
// Object array with weighted keys
|
|
138
|
+
const books = [
|
|
139
|
+
{ title: "Old Man's War", author: "John Scalzi" },
|
|
140
|
+
{ title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
|
|
141
|
+
];
|
|
142
|
+
const bookSearch = new FuzzySearch(books, {
|
|
143
|
+
keys: [
|
|
144
|
+
{ name: "title", weight: 0.7 },
|
|
145
|
+
{ name: "author", weight: 0.3 },
|
|
146
|
+
],
|
|
147
|
+
algorithm: "cosine",
|
|
148
|
+
threshold: 0.3,
|
|
149
|
+
});
|
|
150
|
+
bookSearch.search("old man");
|
|
151
|
+
// => [{ item: { title: "Old Man's War", ... }, score: 0.52, index: 0 }]
|
|
152
|
+
|
|
153
|
+
// One-shot best match
|
|
154
|
+
findBestMatch("kitten", ["sitting", "kit", "mitten"]);
|
|
155
|
+
// => { item: "mitten", score: 0.83, index: 2 }
|
|
156
|
+
|
|
157
|
+
// With per-key details
|
|
158
|
+
const detailed = new FuzzySearch(books, {
|
|
159
|
+
keys: [{ name: "title" }, { name: "author" }],
|
|
160
|
+
includeMatchDetails: true,
|
|
161
|
+
});
|
|
162
|
+
detailed.search("gatsby");
|
|
163
|
+
// => [{ item: ..., score: 0.35, index: 1, matches: { title: 0.6, author: 0.1 } }]
|
|
164
|
+
```
|
|
165
|
+
|
|
126
166
|
### Diff
|
|
127
167
|
|
|
128
168
|
```typescript
|
|
@@ -172,6 +212,27 @@ const result = diff("abc", "ac");
|
|
|
172
212
|
| `MinHash.estimate(sig1, sig2)` | Static: estimate Jaccard from signatures |
|
|
173
213
|
| `LSH` | Class with `insert()`, `query()`, `remove()` |
|
|
174
214
|
|
|
215
|
+
### Fuzzy Search
|
|
216
|
+
|
|
217
|
+
| Function / Class | Description |
|
|
218
|
+
| -------------------------------------------- | -------------------------------------------------- |
|
|
219
|
+
| `FuzzySearch<T>(collection, options?)` | Search engine with dynamic collection management |
|
|
220
|
+
| `findBestMatch(query, collection, options?)` | One-shot convenience: returns best match or `null` |
|
|
221
|
+
|
|
222
|
+
**FuzzySearch options:**
|
|
223
|
+
|
|
224
|
+
| Option | Type | Default | Description |
|
|
225
|
+
| --------------------- | ---------------------------------- | --------------- | ----------------------------- |
|
|
226
|
+
| `algorithm` | `BuiltinAlgorithm \| SimilarityFn` | `"levenshtein"` | Similarity algorithm to use |
|
|
227
|
+
| `keys` | `ISearchKey[]` | `[]` | Object fields to search on |
|
|
228
|
+
| `threshold` | `number` | `0` | Min similarity score (0-1) |
|
|
229
|
+
| `limit` | `number` | `Infinity` | Max results to return |
|
|
230
|
+
| `caseSensitive` | `boolean` | `false` | Case-insensitive by default |
|
|
231
|
+
| `includeMatchDetails` | `boolean` | `false` | Include per-key scores |
|
|
232
|
+
| `lsh` | `{ numHashes?, numBands? }` | — | Enable LSH for large datasets |
|
|
233
|
+
|
|
234
|
+
**Built-in algorithms:** `"levenshtein"`, `"lcs"`, `"jaccard"`, `"jaccardNgram"`, `"cosine"`, `"cosineNgram"`, `"sorensen"`, `"sorensenNgram"`
|
|
235
|
+
|
|
175
236
|
### Diff
|
|
176
237
|
|
|
177
238
|
| Function | Description | Returns |
|
|
@@ -180,14 +241,19 @@ const result = diff("abc", "ac");
|
|
|
180
241
|
|
|
181
242
|
### Types
|
|
182
243
|
|
|
183
|
-
| Type
|
|
184
|
-
|
|
|
185
|
-
| `DiffType`
|
|
186
|
-
| `IDiffItem<T>`
|
|
187
|
-
| `IDiffOptions<T>`
|
|
188
|
-
| `ISimHashOptions`
|
|
189
|
-
| `IMinHashOptions`
|
|
190
|
-
| `ILSHOptions`
|
|
244
|
+
| Type | Description |
|
|
245
|
+
| ----------------------- | -------------------------------------------- |
|
|
246
|
+
| `DiffType` | Enum: `ADDED`, `REMOVED`, `COMMON` |
|
|
247
|
+
| `IDiffItem<T>` | Diff result item with type and tokens |
|
|
248
|
+
| `IDiffOptions<T>` | Options for diff (equals, lcs algorithm) |
|
|
249
|
+
| `ISimHashOptions` | Options for SimHash (bits, hashFn) |
|
|
250
|
+
| `IMinHashOptions` | Options for MinHash (numHashes, seed) |
|
|
251
|
+
| `ILSHOptions` | Options for LSH (numBands, numHashes) |
|
|
252
|
+
| `IFuzzySearchOptions` | Options for FuzzySearch constructor |
|
|
253
|
+
| `IFindBestMatchOptions` | Options for findBestMatch function |
|
|
254
|
+
| `ISearchKey` | Searchable key config (name, weight, getter) |
|
|
255
|
+
| `ISearchResult<T>` | Search result with item, score, index |
|
|
256
|
+
| `SimilarityFn` | `(a: string, b: string) => number` in [0,1] |
|
|
191
257
|
|
|
192
258
|
## Performance
|
|
193
259
|
|
|
@@ -232,6 +298,20 @@ Unit: microseconds per operation (us/op).
|
|
|
232
298
|
|
|
233
299
|
TS implementations use V8 JIT optimization + `Int32Array` ASCII fast path + integer-encoded bigrams, avoiding JS-WASM boundary overhead entirely.
|
|
234
300
|
|
|
301
|
+
### Fuzzy Search: NLPTools vs Fuse.js
|
|
302
|
+
|
|
303
|
+
Benchmark: 20 items in collection, 6 queries per iteration, 1000 iterations.
|
|
304
|
+
Unit: milliseconds per operation (ms/op). Algorithm: levenshtein (default).
|
|
305
|
+
|
|
306
|
+
| Scenario | NLPTools | Fuse.js |
|
|
307
|
+
| ----------------------- | -------- | ------- |
|
|
308
|
+
| Setup (constructor) | 0.0002 | 0.0050 |
|
|
309
|
+
| Search (string array) | 0.0114 | 0.1077 |
|
|
310
|
+
| Search (object, 1 key) | 0.0176 | 0.3308 |
|
|
311
|
+
| Search (object, 2 keys) | 0.0289 | 0.6445 |
|
|
312
|
+
|
|
313
|
+
Both libraries return identical top-1 results for all test queries. NLPTools scores are normalized similarity (0-1, higher is better); Fuse.js uses Bitap error scores (0 = perfect, lower is better).
|
|
314
|
+
|
|
235
315
|
## Dependencies
|
|
236
316
|
|
|
237
317
|
- `fastest-levenshtein` — fastest JS Levenshtein implementation
|
package/dist/index.d.mts
CHANGED
|
@@ -389,4 +389,292 @@ declare class LSH {
|
|
|
389
389
|
get size(): number;
|
|
390
390
|
}
|
|
391
391
|
//#endregion
|
|
392
|
-
|
|
392
|
+
//#region src/search.d.ts
|
|
393
|
+
/**
 * Signature of a string-similarity function.
 *
 * Takes two strings and yields a score in [0, 1]; a score of 1 means the
 * two strings are identical.
 */
type SimilarityFn = (a: string, b: string) => number;
|
|
398
|
+
/**
 * Names of the similarity algorithms that ship with the package.
 * Every name selects one normalized similarity function from
 * @nlptools/distance.
 */
type BuiltinAlgorithm =
  | "levenshtein"
  | "lcs"
  | "jaccard"
  | "jaccardNgram"
  | "cosine"
  | "cosineNgram"
  | "sorensen"
  | "sorensenNgram";
|
|
403
|
+
/**
 * Describes one searchable field of an object item.
 *
 * @example
 * ```ts
 * const keys = [
 *   { name: "title", weight: 0.7 },
 *   { name: "author", weight: 0.3 },
 * ];
 * ```
 */
interface ISearchKey {
  /** Name of the property whose value is searched. */
  name: string;
  /**
   * Relative importance of this key in the combined score.
   * All weights are rescaled internally so that they add up to 1.0.
   * @default 1
   */
  weight?: number;
  /**
   * Custom accessor. When supplied it replaces the plain `item[name]`
   * lookup and must return a string.
   */
  getter?: (item: any) => string;
}
|
|
429
|
+
/**
 * One entry of a search result list: the matched item, how well it
 * matched, and where it sits in the source collection.
 *
 * Result lists are ordered by score, highest first.
 */
interface ISearchResult<T> {
  /** The collection item that matched. */
  item: T;
  /**
   * Similarity in [0, 1]; 1 means identical.
   * With several keys this is the weighted sum of the per-key scores.
   */
  score: number;
  /** Position of the item in the original collection array. */
  index: number;
}
|
|
446
|
+
/**
 * Search result that additionally carries the score of every key.
 * Emitted only when `includeMatchDetails` is enabled.
 */
interface ISearchResultWithDetails<T> extends ISearchResult<T> {
  /**
   * Similarity per key, indexed by the `name` of each configured
   * ISearchKey. Each value lies in [0, 1].
   */
  matches: Record<string, number>;
}
|
|
458
|
+
/**
 * Configuration accepted by the {@link FuzzySearch} constructor.
 *
 * @example
 * ```ts
 * // String array
 * const search = new FuzzySearch(["apple", "banana", "cherry"]);
 *
 * // Object array with weighted keys
 * const search = new FuzzySearch(books, {
 *   keys: [
 *     { name: "title", weight: 0.7 },
 *     { name: "author", weight: 0.3 },
 *   ],
 *   algorithm: "cosine",
 *   threshold: 0.4,
 * });
 * ```
 */
interface IFuzzySearchOptions {
  /**
   * How two strings are compared.
   *
   * - Pass a {@link BuiltinAlgorithm} name to pick a bundled function.
   * - Pass a {@link SimilarityFn} to supply your own comparison.
   *
   * @default "levenshtein"
   */
  algorithm?: BuiltinAlgorithm | SimilarityFn;
  /**
   * Fields to search when the collection holds objects.
   * A non-empty list switches the engine to per-key scoring, so leave it
   * empty (the default) for plain string collections.
   */
  keys?: ISearchKey[];
  /**
   * Lowest similarity score (0-1) a result may have.
   * Anything scoring below it is dropped.
   * @default 0
   */
  threshold?: number;
  /**
   * Upper bound on the number of results returned.
   * @default Infinity
   */
  limit?: number;
  /**
   * Whether letter case is significant.
   * When false, the query and the item strings are lowercased before
   * they are compared; when true, they are compared as given.
   * @default false (case-insensitive by default)
   */
  caseSensitive?: boolean;
  /**
   * Attach per-key scores to every result.
   * When true, each result carries a `matches` field holding the
   * individual similarity of every key.
   * @default false
   */
  includeMatchDetails?: boolean;
  /**
   * Turn on LSH-accelerated search, intended for large collections
   * (>1000 items). MinHash + banding pre-selects candidates, which are
   * then re-scored with the exact algorithm. Query time becomes
   * sub-linear, but results are approximate (some true matches may be
   * missed).
   */
  lsh?: {
    /** Number of hash functions, i.e. the MinHash signature size. @default 128 */
    numHashes?: number;
    /**
     * Number of bands used for LSH banding.
     * More bands = higher recall, lower precision.
     * @default 16
     */
    numBands?: number;
  };
}
|
|
533
|
+
/**
 * Configuration accepted by {@link findBestMatch}.
 *
 * @example
 * ```ts
 * const result = findBestMatch("kitten", ["sitting", "kit", "mitten"], {
 *   algorithm: "levenshtein",
 *   threshold: 0.3,
 * });
 * ```
 */
interface IFindBestMatchOptions {
  /** How two strings are compared. @default "levenshtein" */
  algorithm?: BuiltinAlgorithm | SimilarityFn;
  /** Fields to search when the collection holds objects. */
  keys?: ISearchKey[];
  /** Lowest similarity score a match may have. @default 0 */
  threshold?: number;
  /** Whether letter case is significant. @default false (case-insensitive) */
  caseSensitive?: boolean;
}
|
|
554
|
+
/**
 * Engine that finds the items of a collection most similar to a query.
 *
 * Works on string arrays as well as on object arrays with weighted
 * multi-key scoring. Any similarity algorithm from @nlptools/distance can
 * be plugged in, and LSH acceleration is available for large datasets.
 *
 * @example
 * ```ts
 * // String array search
 * const search = new FuzzySearch(["apple", "banana", "cherry"]);
 * const results = search.search("aple"); // best hit: { item: "apple", score: 0.8, index: 0 }
 *
 * // Object array with weighted keys
 * const books = [
 *   { title: "Old Man's War", author: "John Scalzi" },
 *   { title: "The Lock Artist", author: "Steve Hamilton" },
 * ];
 * const bookSearch = new FuzzySearch(books, {
 *   keys: [
 *     { name: "title", weight: 0.7 },
 *     { name: "author", weight: 0.3 },
 *   ],
 *   algorithm: "cosine",
 * });
 * const results = bookSearch.search("old man"); // finds "Old Man's War"
 * ```
 */
declare class FuzzySearch<T> {
  private readonly similarityFn;
  private readonly keys;
  private readonly threshold;
  private readonly limit;
  private readonly caseSensitive;
  private readonly includeMatchDetails;
  private readonly isObjectArray;
  private collection;
  private readonly useLSH;
  private readonly lshNumHashes;
  private readonly lshNumBands;
  private lshIndex;
  private minHashSignatures;
  constructor(collection: ReadonlyArray<T>, options?: IFuzzySearchOptions);
  /**
   * Look up the items that resemble the query.
   *
   * @param query - Text to search for
   * @param limit - Per-call override of the configured result limit
   * @returns Matching results, highest score first
   */
  search(query: string, limit?: number): ISearchResult<T>[];
  /**
   * Append an item to the collection.
   * When an LSH index exists, the item is inserted into it incrementally.
   */
  add(item: T): void;
  /**
   * Delete the item at the given index.
   * With LSH enabled the index is rebuilt afterwards (O(n)).
   *
   * @returns true if an item was removed, false if the index was out of range
   */
  remove(index: number): boolean;
  /**
   * Swap the whole collection for a new one.
   * With LSH enabled the index is rebuilt.
   */
  setCollection(collection: ReadonlyArray<T>): void;
  /**
   * Read access to the current collection.
   */
  getCollection(): ReadonlyArray<T>;
  /**
   * Number of items currently held.
   */
  get size(): number;
  /**
   * Drop every item together with any LSH index.
   */
  clear(): void;
  private searchLinear;
  private searchWithLSH;
  private buildLSHIndex;
  private buildMinHashSignature;
  private computeItemScore;
  private computeDetailedScore;
  private extractSearchText;
  private extractKeyValue;
  private normalizeString;
}
|
|
644
|
+
/**
 * Return the one item of a collection that best matches a query.
 *
 * A thin convenience layer over {@link FuzzySearch} meant for one-off
 * lookups. When the same collection is queried repeatedly, build a
 * {@link FuzzySearch} instance once and reuse it.
 *
 * Time: O(n * k) where n = collection size, k = number of keys
 *
 * @param query - Text to search for
 * @param collection - Strings or objects to search through
 * @param options - Search configuration
 * @returns The top result, or null when no item reaches the threshold
 *
 * @example
 * ```ts
 * // String array
 * const result = findBestMatch("kitten", ["sitting", "kit", "mitten"]);
 * console.log(result?.item); // "mitten" (one edit away from "kitten")
 * console.log(result?.score); // ~0.83
 *
 * // Object array with weighted keys
 * const books = [
 *   { title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
 *   { title: "Great Expectations", author: "Charles Dickens" },
 * ];
 * const result = findBestMatch("grate gatsbi", books, {
 *   keys: [
 *     { name: "title", weight: 0.7 },
 *     { name: "author", weight: 0.3 },
 *   ],
 * });
 * ```
 */
declare function findBestMatch<T>(
  query: string,
  collection: ReadonlyArray<T>,
  options?: IFindBestMatchOptions,
): ISearchResult<T> | null;
|
|
679
|
+
//#endregion
|
|
680
|
+
export { BuiltinAlgorithm, DiffType, FuzzySearch, type IDiffItem, type IDiffOptions, IFindBestMatchOptions, IFuzzySearchOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISearchKey, ISearchResult, ISearchResultWithDetails, ISimHashOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, SimilarityFn, cosine, cosineNgram, diff, findBestMatch, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
|
package/dist/index.mjs
CHANGED
|
@@ -800,4 +800,335 @@ function bandHash(slice) {
|
|
|
800
800
|
return hash.toString(36);
|
|
801
801
|
}
|
|
802
802
|
//#endregion
|
|
803
|
-
|
|
803
|
+
//#region src/search.ts
|
|
804
|
+
// Lookup table from each built-in algorithm name to the similarity
// function that implements it.
const BUILTIN_ALGORITHMS = {
  levenshtein: levenshteinNormalized,
  lcs: lcsNormalized,
  jaccard: jaccard,
  jaccardNgram: jaccardNgram,
  cosine: cosine,
  cosineNgram: cosineNgram,
  sorensen: sorensen,
  sorensenNgram: sorensenNgram
};
|
|
814
|
+
/**
 * Attach a `normalizedWeight` to every key so that the weights sum to 1.
 *
 * A key without an explicit weight counts as weight 1. If the weights do
 * not add up to a positive total, every key receives the same share
 * (1 / number of keys).
 *
 * @param rawKeys - Key configurations as supplied by the caller
 * @returns Copies of the keys, each extended with `normalizedWeight`
 */
function resolveKeys(rawKeys) {
  const count = rawKeys.length;
  if (count === 0) return [];
  let totalWeight = 0;
  for (const key of rawKeys) totalWeight += key.weight ?? 1;
  const hasPositiveTotal = totalWeight > 0;
  return rawKeys.map((key) => {
    const normalizedWeight = hasPositiveTotal ? (key.weight ?? 1) / totalWeight : 1 / count;
    return {
      ...key,
      normalizedWeight
    };
  });
}
|
|
822
|
+
/**
 * Turn the `algorithm` option into a concrete similarity function.
 *
 * - `undefined` / `null` selects the default, normalized Levenshtein.
 * - A function is returned unchanged.
 * - A string must be an own key of `BUILTIN_ALGORITHMS`.
 *
 * @param algo - Built-in algorithm name, custom similarity function, or nothing
 * @returns The similarity function to use
 * @throws {TypeError} If `algo` is neither a function nor a known built-in name
 */
function resolveAlgorithm(algo) {
  if (algo === void 0 || algo === null) return BUILTIN_ALGORITHMS.levenshtein;
  if (typeof algo === "function") return algo;
  // Own-property check: a plain `BUILTIN_ALGORITHMS[algo]` lookup would hand
  // back inherited members for names such as "toString" or "constructor",
  // and `undefined` for typos, which only fails later inside search().
  if (typeof algo === "string" && Object.prototype.hasOwnProperty.call(BUILTIN_ALGORITHMS, algo)) return BUILTIN_ALGORITHMS[algo];
  throw new TypeError(`Unknown similarity algorithm: ${String(algo)}`);
}
|
|
827
|
+
/**
 * Fuzzy search engine for finding similar items in a collection.
 *
 * Supports both string arrays and object arrays with weighted multi-key search.
 * Uses any similarity algorithm from @nlptools/distance, with optional LSH
 * acceleration for large datasets.
 *
 * @example
 * ```ts
 * // String array search
 * const search = new FuzzySearch(["apple", "banana", "cherry"]);
 * const results = search.search("aple"); // best hit: { item: "apple", score: 0.8, index: 0 }
 *
 * // Object array with weighted keys
 * const books = [
 *   { title: "Old Man's War", author: "John Scalzi" },
 *   { title: "The Lock Artist", author: "Steve Hamilton" },
 * ];
 * const bookSearch = new FuzzySearch(books, {
 *   keys: [
 *     { name: "title", weight: 0.7 },
 *     { name: "author", weight: 0.3 },
 *   ],
 *   algorithm: "cosine",
 * });
 * const results = bookSearch.search("old man"); // finds "Old Man's War"
 * ```
 */
var FuzzySearch = class {
  similarityFn;
  keys;
  threshold;
  limit;
  caseSensitive;
  includeMatchDetails;
  isObjectArray;
  collection;
  useLSH;
  lshNumHashes;
  lshNumBands;
  lshIndex;
  minHashSignatures;
  /**
   * @param collection - Items to search; copied, so later changes to the
   *   caller's array do not affect the engine
   * @param options - Search configuration (algorithm, keys, threshold, ...)
   */
  constructor(collection, options = {}) {
    this.similarityFn = resolveAlgorithm(options.algorithm);
    this.keys = resolveKeys(options.keys ?? []);
    // A non-empty key list switches the engine to per-key object scoring.
    this.isObjectArray = this.keys.length > 0;
    this.threshold = options.threshold ?? 0;
    this.limit = options.limit ?? Infinity;
    this.caseSensitive = options.caseSensitive ?? false;
    this.includeMatchDetails = options.includeMatchDetails ?? false;
    this.collection = [...collection];
    const lshOpts = options.lsh;
    // Merely passing an `lsh` object (even an empty one) enables LSH.
    this.useLSH = lshOpts !== void 0;
    this.lshNumHashes = lshOpts?.numHashes ?? 128;
    this.lshNumBands = lshOpts?.numBands ?? 16;
    this.lshIndex = null;
    this.minHashSignatures = /* @__PURE__ */ new Map();
    if (this.useLSH && this.collection.length > 0) this.buildLSHIndex();
  }
  /**
   * Search the collection for items similar to the query.
   *
   * @param query - The search query string
   * @param limit - Optional per-query limit override; a limit that is not
   *   greater than zero yields no results
   * @returns Array of results sorted by score descending
   */
  search(query, limit) {
    const effectiveLimit = limit ?? this.limit;
    // `!(x > 0)` also rejects NaN and negative limits, which would otherwise
    // reach `slice(0, limit)` and silently drop results from the tail.
    if (!(effectiveLimit > 0) || this.collection.length === 0) return [];
    const normalizedQuery = this.normalizeString(query);
    if (this.useLSH && this.lshIndex !== null) return this.searchWithLSH(normalizedQuery, effectiveLimit);
    return this.searchLinear(normalizedQuery, effectiveLimit);
  }
  /**
   * Add an item to the collection.
   * If LSH is enabled, the index is updated incrementally; when no index
   * exists yet (empty construction or after `clear()`), it is built now.
   */
  add(item) {
    const index = this.collection.length;
    this.collection.push(item);
    if (!this.useLSH) return;
    if (this.lshIndex === null) {
      // Without this, an engine that started empty would never use LSH.
      this.buildLSHIndex();
      return;
    }
    const sig = this.buildMinHashSignature(this.extractSearchText(item));
    this.minHashSignatures.set(index, sig);
    this.lshIndex.insert(String(index), sig);
  }
  /**
   * Remove an item from the collection by index.
   * If LSH is enabled, the index is rebuilt (O(n)).
   *
   * @returns true if the item was found and removed
   */
  remove(index) {
    // Reject NaN and fractional indices explicitly: they pass plain range
    // comparisons and `splice` would then coerce them to some other slot.
    if (!Number.isInteger(index) || index < 0 || index >= this.collection.length) return false;
    this.collection.splice(index, 1);
    // Indices after the removed slot shift, so every LSH id must be reissued.
    if (this.useLSH) this.buildLSHIndex();
    return true;
  }
  /**
   * Replace the entire collection.
   * If LSH is enabled, the index is rebuilt (or dropped when the new
   * collection is empty).
   */
  setCollection(collection) {
    this.collection = [...collection];
    if (!this.useLSH) return;
    if (this.collection.length > 0) {
      this.buildLSHIndex();
    } else {
      this.lshIndex = null;
      this.minHashSignatures.clear();
    }
  }
  /**
   * Get the current collection. This is the engine's internal array, not a copy.
   */
  getCollection() {
    return this.collection;
  }
  /**
   * Get the number of items in the collection.
   */
  get size() {
    return this.collection.length;
  }
  /**
   * Clear the collection and any LSH index.
   */
  clear() {
    this.collection = [];
    this.lshIndex = null;
    this.minHashSignatures.clear();
  }
  /** Exhaustive search: score every item and rank the survivors. */
  searchLinear(normalizedQuery, limit) {
    const candidates = [];
    for (let i = 0; i < this.collection.length; i++) {
      const hit = this.scoreItem(normalizedQuery, this.collection[i], i);
      if (hit !== null) candidates.push(hit);
    }
    return this.rankCandidates(candidates, limit);
  }
  /**
   * LSH-assisted search: ask the index for candidate ids, then re-score
   * only those candidates with the exact similarity function.
   */
  searchWithLSH(normalizedQuery, limit) {
    const querySig = this.buildMinHashSignature(normalizedQuery);
    // NOTE(review): the threshold is forwarded to the LSH query as its
    // candidate cutoff; presumably an estimated-Jaccard bound, confirm
    // against the LSH implementation.
    const candidateIds = this.lshIndex.query(querySig, this.threshold);
    const candidates = [];
    for (const [id] of candidateIds) {
      const idx = Number.parseInt(id, 10);
      // Negated range test so that a NaN id is skipped as well.
      if (!(idx >= 0 && idx < this.collection.length)) continue;
      const hit = this.scoreItem(normalizedQuery, this.collection[idx], idx);
      if (hit !== null) candidates.push(hit);
    }
    return this.rankCandidates(candidates, limit);
  }
  /**
   * Score one item against the query.
   *
   * @returns A result object (with `matches` when match details are
   *   enabled), or null if the score is below the threshold
   */
  scoreItem(normalizedQuery, item, index) {
    let result;
    if (!this.isObjectArray) {
      const score = this.similarityFn(normalizedQuery, this.normalizeString(item));
      result = {
        item,
        score,
        index
      };
    } else if (this.includeMatchDetails) {
      const { score, matches } = this.computeDetailedScore(normalizedQuery, item);
      result = {
        item,
        score,
        index,
        matches
      };
    } else {
      result = {
        item,
        score: this.computeItemScore(normalizedQuery, item),
        index
      };
    }
    return result.score >= this.threshold ? result : null;
  }
  /** Sort candidates by score (highest first) and cut the list to `limit`. */
  rankCandidates(candidates, limit) {
    candidates.sort((a, b) => b.score - a.score);
    return candidates.length <= limit ? candidates : candidates.slice(0, limit);
  }
  /** (Re)build the LSH index and signature cache from the whole collection. */
  buildLSHIndex() {
    this.lshIndex = new LSH({
      numBands: this.lshNumBands,
      numHashes: this.lshNumHashes
    });
    this.minHashSignatures.clear();
    for (let i = 0; i < this.collection.length; i++) {
      const sig = this.buildMinHashSignature(this.extractSearchText(this.collection[i]));
      this.minHashSignatures.set(i, sig);
      // The collection index, as a string, serves as the LSH document id.
      this.lshIndex.insert(String(i), sig);
    }
  }
  /** Compute the MinHash signature of a text from the 2-grams produced by `ngrams`. */
  buildMinHashSignature(text) {
    const mh = new MinHash({ numHashes: this.lshNumHashes });
    for (const gram of ngrams(text, 2)) mh.update(gram);
    return mh.digest();
  }
  /** Weighted sum of the per-key similarities between the query and an object item. */
  computeItemScore(normalizedQuery, item) {
    let score = 0;
    for (const key of this.keys) {
      const normalizedValue = this.normalizeString(this.extractKeyValue(item, key));
      score += key.normalizedWeight * this.similarityFn(normalizedQuery, normalizedValue);
    }
    return score;
  }
  /** Like `computeItemScore`, but also reports each key's own similarity. */
  computeDetailedScore(normalizedQuery, item) {
    let score = 0;
    const matches = {};
    for (const key of this.keys) {
      const normalizedValue = this.normalizeString(this.extractKeyValue(item, key));
      const s = this.similarityFn(normalizedQuery, normalizedValue);
      matches[key.name] = s;
      score += key.normalizedWeight * s;
    }
    return {
      score,
      matches
    };
  }
  /**
   * Text used to build an item's MinHash signature: the item itself for
   * string collections, or all key values joined by a space for objects.
   * Both are case-normalized so that signatures are comparable with the
   * (already normalized) query.
   */
  extractSearchText(item) {
    if (!this.isObjectArray) return this.normalizeString(item);
    const joined = this.keys.map((k) => this.extractKeyValue(item, k)).join(" ");
    return this.normalizeString(joined);
  }
  /** Read a key's value from an item; anything that is not a string becomes "". */
  extractKeyValue(item, key) {
    const value = key.getter ? key.getter(item) : item[key.name];
    return typeof value === "string" ? value : "";
  }
  /** Lowercase the string unless the engine is case-sensitive. */
  normalizeString(str) {
    return this.caseSensitive ? str : str.toLowerCase();
  }
};
|
|
1090
|
+
/**
 * Return the one item of a collection that best matches a query.
 *
 * A thin convenience layer over {@link FuzzySearch} meant for one-off
 * lookups. When the same collection is queried repeatedly, build a
 * {@link FuzzySearch} instance once and reuse it.
 *
 * Time: O(n * k) where n = collection size, k = number of keys
 *
 * @param query - Text to search for
 * @param collection - Strings or objects to search through
 * @param options - Search configuration
 * @returns The top result, or null when no item reaches the threshold
 *
 * @example
 * ```ts
 * // String array
 * const result = findBestMatch("kitten", ["sitting", "kit", "mitten"]);
 * console.log(result?.item); // "mitten" (one edit away from "kitten")
 * console.log(result?.score); // ~0.83
 *
 * // Object array with weighted keys
 * const books = [
 *   { title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
 *   { title: "Great Expectations", author: "Charles Dickens" },
 * ];
 * const result = findBestMatch("grate gatsbi", books, {
 *   keys: [
 *     { name: "title", weight: 0.7 },
 *     { name: "author", weight: 0.3 },
 *   ],
 * });
 * ```
 */
function findBestMatch(query, collection, options = {}) {
  const { algorithm, keys, threshold, caseSensitive } = options;
  const engine = new FuzzySearch(collection, {
    algorithm,
    keys,
    threshold,
    caseSensitive
  });
  // Ask for a single result; an empty list means nothing met the threshold.
  const [best] = engine.search(query, 1);
  return best ?? null;
}
|
|
1133
|
+
//#endregion
|
|
1134
|
+
export { DiffType, FuzzySearch, LSH, MinHash, SimHasher, cosine, cosineNgram, diff, findBestMatch, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
|