@nlptools/distance 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,6 +11,7 @@
11
11
  - Edit distance: Levenshtein, LCS (Myers O(ND) and DP)
12
12
  - Token similarity: Jaccard, Cosine, Sorensen-Dice (character multiset and n-gram variants)
13
13
  - Hash-based deduplication: SimHash, MinHash, LSH
14
+ - Fuzzy search: `FuzzySearch` class and `findBestMatch` with multi-algorithm support
14
15
  - Diff: based on `@algorithm.ts/diff` (Myers and DP backends)
15
16
  - All distance algorithms include normalized similarity variants (0-1 range)
16
17
 
@@ -123,6 +124,45 @@ const query = lsh.query(mh.digest(), 0.5);
123
124
  // => [["doc1", 0.67]]
124
125
  ```
125
126
 
127
+ ### Fuzzy Search
128
+
129
+ ```typescript
130
+ import { FuzzySearch, findBestMatch } from "@nlptools/distance";
131
+
132
+ // String array search
133
+ const search = new FuzzySearch(["apple", "banana", "cherry"]);
134
+ search.search("aple");
135
+ // => [{ item: "apple", score: 0.8, index: 0 }, ...] (every item scoring >= threshold, best match first)
136
+
137
+ // Object array with weighted keys
138
+ const books = [
139
+ { title: "Old Man's War", author: "John Scalzi" },
140
+ { title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
141
+ ];
142
+ const bookSearch = new FuzzySearch(books, {
143
+ keys: [
144
+ { name: "title", weight: 0.7 },
145
+ { name: "author", weight: 0.3 },
146
+ ],
147
+ algorithm: "cosine",
148
+ threshold: 0.3,
149
+ });
150
+ bookSearch.search("old man");
151
+ // => [{ item: { title: "Old Man's War", ... }, score: 0.52, index: 0 }]
152
+
153
+ // One-shot best match
154
+ findBestMatch("kitten", ["sitting", "kit", "mitten"]);
155
+ // => { item: "mitten", score: 0.83, index: 2 }
156
+
157
+ // With per-key details
158
+ const detailed = new FuzzySearch(books, {
159
+ keys: [{ name: "title" }, { name: "author" }],
160
+ includeMatchDetails: true,
161
+ });
162
+ detailed.search("gatsby");
163
+ // => [{ item: ..., score: 0.35, index: 1, matches: { title: 0.6, author: 0.1 } }, ...]
164
+ ```
165
+
126
166
  ### Diff
127
167
 
128
168
  ```typescript
@@ -172,6 +212,27 @@ const result = diff("abc", "ac");
172
212
  | `MinHash.estimate(sig1, sig2)` | Static: estimate Jaccard from signatures |
173
213
  | `LSH` | Class with `insert()`, `query()`, `remove()` |
174
214
 
215
+ ### Fuzzy Search
216
+
217
+ | Function / Class | Description |
218
+ | -------------------------------------------- | -------------------------------------------------- |
219
+ | `FuzzySearch<T>(collection, options?)` | Search engine with dynamic collection management |
220
+ | `findBestMatch(query, collection, options?)` | One-shot convenience: returns best match or `null` |
221
+
222
+ **FuzzySearch options:**
223
+
224
+ | Option | Type | Default | Description |
225
+ | --------------------- | ---------------------------------- | --------------- | ----------------------------- |
226
+ | `algorithm` | `BuiltinAlgorithm \| SimilarityFn` | `"levenshtein"` | Similarity algorithm to use |
227
+ | `keys` | `ISearchKey[]` | `[]` | Object fields to search on |
228
+ | `threshold` | `number` | `0` | Min similarity score (0-1) |
229
+ | `limit` | `number` | `Infinity` | Max results to return |
230
+ | `caseSensitive` | `boolean` | `false` | Case-insensitive by default |
231
+ | `includeMatchDetails` | `boolean` | `false` | Include per-key scores |
232
+ | `lsh` | `{ numHashes?, numBands? }` | — | Enable LSH for large datasets |
233
+
234
+ **Built-in algorithms:** `"levenshtein"`, `"lcs"`, `"jaccard"`, `"jaccardNgram"`, `"cosine"`, `"cosineNgram"`, `"sorensen"`, `"sorensenNgram"`
235
+
175
236
  ### Diff
176
237
 
177
238
  | Function | Description | Returns |
@@ -180,14 +241,19 @@ const result = diff("abc", "ac");
180
241
 
181
242
  ### Types
182
243
 
183
- | Type | Description |
184
- | ----------------- | ---------------------------------------- |
185
- | `DiffType` | Enum: `ADDED`, `REMOVED`, `COMMON` |
186
- | `IDiffItem<T>` | Diff result item with type and tokens |
187
- | `IDiffOptions<T>` | Options for diff (equals, lcs algorithm) |
188
- | `ISimHashOptions` | Options for SimHash (bits, hashFn) |
189
- | `IMinHashOptions` | Options for MinHash (numHashes, seed) |
190
- | `ILSHOptions` | Options for LSH (numBands, numHashes) |
244
+ | Type | Description |
245
+ | ----------------------- | -------------------------------------------- |
246
+ | `DiffType` | Enum: `ADDED`, `REMOVED`, `COMMON` |
247
+ | `IDiffItem<T>` | Diff result item with type and tokens |
248
+ | `IDiffOptions<T>` | Options for diff (equals, lcs algorithm) |
249
+ | `ISimHashOptions` | Options for SimHash (bits, hashFn) |
250
+ | `IMinHashOptions` | Options for MinHash (numHashes, seed) |
251
+ | `ILSHOptions` | Options for LSH (numBands, numHashes) |
252
+ | `IFuzzySearchOptions` | Options for FuzzySearch constructor |
253
+ | `IFindBestMatchOptions` | Options for findBestMatch function |
254
+ | `ISearchKey` | Searchable key config (name, weight, getter) |
255
+ | `ISearchResult<T>` | Search result with item, score, index |
256
+ | `SimilarityFn` | `(a: string, b: string) => number` in [0,1] |
191
257
 
192
258
  ## Performance
193
259
 
@@ -232,6 +298,20 @@ Unit: microseconds per operation (us/op).
232
298
 
233
299
  TS implementations use V8 JIT optimization + `Int32Array` ASCII fast path + integer-encoded bigrams, avoiding JS-WASM boundary overhead entirely.
234
300
 
301
+ ### Fuzzy Search: NLPTools vs Fuse.js
302
+
303
+ Benchmark: 20 items in collection, 6 queries per iteration, 1000 iterations.
304
+ Unit: milliseconds per operation (ms/op). Algorithm: levenshtein (default).
305
+
306
+ | Scenario | NLPTools | Fuse.js |
307
+ | ----------------------- | -------- | ------- |
308
+ | Setup (constructor) | 0.0002 | 0.0050 |
309
+ | Search (string array) | 0.0114 | 0.1077 |
310
+ | Search (object, 1 key) | 0.0176 | 0.3308 |
311
+ | Search (object, 2 keys) | 0.0289 | 0.6445 |
312
+
313
+ Both libraries return identical top-1 results for all test queries. NLPTools scores are normalized similarity (0-1, higher is better); Fuse.js uses Bitap error scores (0 = perfect, lower is better).
314
+
235
315
  ## Dependencies
236
316
 
237
317
  - `fastest-levenshtein` — fastest JS Levenshtein implementation
package/dist/index.d.mts CHANGED
@@ -389,4 +389,292 @@ declare class LSH {
389
389
  get size(): number;
390
390
  }
391
391
  //#endregion
392
- export { DiffType, type IDiffItem, type IDiffOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISimHashOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, cosine, cosineNgram, diff, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
392
+ //#region src/search.d.ts
393
+ /**
394
+ * A function that computes similarity between two strings, returning a value
395
+ * in [0, 1] where 1 means identical.
396
+ */
397
+ type SimilarityFn = (a: string, b: string) => number;
398
+ /**
399
+ * Built-in similarity algorithms. Each maps to a normalized similarity
400
+ * function from @nlptools/distance.
401
+ */
402
+ type BuiltinAlgorithm = "levenshtein" | "lcs" | "jaccard" | "jaccardNgram" | "cosine" | "cosineNgram" | "sorensen" | "sorensenNgram";
403
+ /**
404
+ * Configuration for a searchable key on an object item.
405
+ *
406
+ * @example
407
+ * ```ts
408
+ * const keys = [
409
+ * { name: "title", weight: 0.7 },
410
+ * { name: "author", weight: 0.3 },
411
+ * ];
412
+ * ```
413
+ */
414
+ interface ISearchKey {
415
+ /** Property name to search on. */
416
+ name: string;
417
+ /**
418
+ * Weight of this key in the final score.
419
+ * Weights are normalized to sum to 1.0 internally.
420
+ * @default 1
421
+ */
422
+ weight?: number;
423
+ /**
424
+ * Optional custom getter function. If provided, used instead of
425
+ * reading `item[name]`. Must return a string.
426
+ */
427
+ getter?: (item: any) => string;
428
+ }
429
+ /**
430
+ * A single search result, containing the matched item, its score, and
431
+ * its position in the original collection.
432
+ *
433
+ * Results are sorted by score descending (best match first).
434
+ */
435
+ interface ISearchResult<T> {
436
+ /** The matched item from the collection. */
437
+ item: T;
438
+ /**
439
+ * Similarity score in [0, 1], where 1 means identical.
440
+ * For multi-key search, this is the weighted sum of per-key scores.
441
+ */
442
+ score: number;
443
+ /** Index of the item in the original collection array. */
444
+ index: number;
445
+ }
446
+ /**
447
+ * Extended search result including per-key match details.
448
+ * Only produced when `includeMatchDetails` is true.
449
+ */
450
+ interface ISearchResultWithDetails<T> extends ISearchResult<T> {
451
+ /**
452
+ * Per-key similarity scores.
453
+ * Keys are the key names from the ISearchKey configuration.
454
+ * Values are individual similarity scores in [0, 1].
455
+ */
456
+ matches: Record<string, number>;
457
+ }
458
+ /**
459
+ * Options for the {@link FuzzySearch} constructor.
460
+ *
461
+ * @example
462
+ * ```ts
463
+ * // String array
464
+ * const search = new FuzzySearch(["apple", "banana", "cherry"]);
465
+ *
466
+ * // Object array with weighted keys
467
+ * const search = new FuzzySearch(books, {
468
+ * keys: [
469
+ * { name: "title", weight: 0.7 },
470
+ * { name: "author", weight: 0.3 },
471
+ * ],
472
+ * algorithm: "cosine",
473
+ * threshold: 0.4,
474
+ * });
475
+ * ```
476
+ */
477
+ interface IFuzzySearchOptions {
478
+ /**
479
+ * Similarity algorithm to use for comparing strings.
480
+ *
481
+ * - A string from {@link BuiltinAlgorithm} selects a built-in function.
482
+ * - A custom {@link SimilarityFn} can be provided for full control.
483
+ *
484
+ * @default "levenshtein"
485
+ */
486
+ algorithm?: BuiltinAlgorithm | SimilarityFn;
487
+ /**
488
+ * Keys to search on when the collection contains objects.
489
+ * Ignored for string arrays.
490
+ */
491
+ keys?: ISearchKey[];
492
+ /**
493
+ * Minimum similarity score (0-1) for a result to be included.
494
+ * Results scoring below this threshold are excluded.
495
+ * @default 0
496
+ */
497
+ threshold?: number;
498
+ /**
499
+ * Maximum number of results to return.
500
+ * @default Infinity
501
+ */
502
+ limit?: number;
503
+ /**
504
+ * Whether search should be case-sensitive.
505
+ * When false, both the query and the item strings are lowercased
506
+ * before comparison.
507
+ * @default false (case-insensitive by default)
508
+ */
509
+ caseSensitive?: boolean;
510
+ /**
511
+ * Include per-key match details in results.
512
+ * When true, results include a `matches` field with individual
513
+ * similarity scores per key.
514
+ * @default false
515
+ */
516
+ includeMatchDetails?: boolean;
517
+ /**
518
+ * Enable LSH-accelerated search for large collections (>1000 items).
519
+ * Uses MinHash + banding as a candidate filter, then re-scores with
520
+ * the exact algorithm. Provides sub-linear query time at the cost of
521
+ * approximate results (some true matches may be missed).
522
+ */
523
+ lsh?: {
524
+ /** Number of hash functions for MinHash signature size. @default 128 */numHashes?: number;
525
+ /**
526
+ * Number of bands for LSH banding.
527
+ * More bands = higher recall, lower precision.
528
+ * @default 16
529
+ */
530
+ numBands?: number;
531
+ };
532
+ }
533
+ /**
534
+ * Options for the {@link findBestMatch} function.
535
+ *
536
+ * @example
537
+ * ```ts
538
+ * const result = findBestMatch("kitten", ["sitting", "kit", "mitten"], {
539
+ * algorithm: "levenshtein",
540
+ * threshold: 0.3,
541
+ * });
542
+ * ```
543
+ */
544
+ interface IFindBestMatchOptions {
545
+ /** Similarity algorithm. @default "levenshtein" */
546
+ algorithm?: BuiltinAlgorithm | SimilarityFn;
547
+ /** Keys for object-array search. */
548
+ keys?: ISearchKey[];
549
+ /** Minimum similarity score. @default 0 */
550
+ threshold?: number;
551
+ /** Whether search is case-sensitive. @default false (case-insensitive) */
552
+ caseSensitive?: boolean;
553
+ }
554
+ /**
555
+ * Fuzzy search engine for finding similar items in a collection.
556
+ *
557
+ * Supports both string arrays and object arrays with weighted multi-key search.
558
+ * Uses any similarity algorithm from @nlptools/distance, with optional LSH
559
+ * acceleration for large datasets.
560
+ *
561
+ * @example
562
+ * ```ts
563
+ * // String array search
564
+ * const search = new FuzzySearch(["apple", "banana", "cherry"]);
565
+ * const results = search.search("aple"); // [{ item: "apple", score: 0.8, index: 0 }, ...]
566
+ *
567
+ * // Object array with weighted keys
568
+ * const books = [
569
+ * { title: "Old Man's War", author: "John Scalzi" },
570
+ * { title: "The Lock Artist", author: "Steve Hamilton" },
571
+ * ];
572
+ * const bookSearch = new FuzzySearch(books, {
573
+ * keys: [
574
+ * { name: "title", weight: 0.7 },
575
+ * { name: "author", weight: 0.3 },
576
+ * ],
577
+ * algorithm: "cosine",
578
+ * });
579
+ * const results = bookSearch.search("old man"); // finds "Old Man's War"
580
+ * ```
581
+ */
582
+ declare class FuzzySearch<T> {
583
+ private readonly similarityFn;
584
+ private readonly keys;
585
+ private readonly threshold;
586
+ private readonly limit;
587
+ private readonly caseSensitive;
588
+ private readonly includeMatchDetails;
589
+ private readonly isObjectArray;
590
+ private collection;
591
+ private readonly useLSH;
592
+ private readonly lshNumHashes;
593
+ private readonly lshNumBands;
594
+ private lshIndex;
595
+ private minHashSignatures;
596
+ constructor(collection: ReadonlyArray<T>, options?: IFuzzySearchOptions);
597
+ /**
598
+ * Search the collection for items similar to the query.
599
+ *
600
+ * @param query - The search query string
601
+ * @param limit - Optional per-query limit override
602
+ * @returns Array of results sorted by score descending
603
+ */
604
+ search(query: string, limit?: number): ISearchResult<T>[];
605
+ /**
606
+ * Add an item to the collection.
607
+ * If an LSH index has already been built, the new item is inserted into it incrementally.
608
+ */
609
+ add(item: T): void;
610
+ /**
611
+ * Remove an item from the collection by index.
612
+ * If LSH is enabled, the index is rebuilt (O(n)).
613
+ *
614
+ * @returns true if the item was found and removed
615
+ */
616
+ remove(index: number): boolean;
617
+ /**
618
+ * Replace the entire collection.
619
+ * If LSH is enabled, the index is rebuilt.
620
+ */
621
+ setCollection(collection: ReadonlyArray<T>): void;
622
+ /**
623
+ * Get the current collection.
624
+ */
625
+ getCollection(): ReadonlyArray<T>;
626
+ /**
627
+ * Get the number of items in the collection.
628
+ */
629
+ get size(): number;
630
+ /**
631
+ * Clear the collection and any LSH index.
632
+ */
633
+ clear(): void;
634
+ private searchLinear;
635
+ private searchWithLSH;
636
+ private buildLSHIndex;
637
+ private buildMinHashSignature;
638
+ private computeItemScore;
639
+ private computeDetailedScore;
640
+ private extractSearchText;
641
+ private extractKeyValue;
642
+ private normalizeString;
643
+ }
644
+ /**
645
+ * Find the single best match for a query against a collection.
646
+ *
647
+ * This is a convenience wrapper around {@link FuzzySearch} for one-shot queries.
648
+ * For repeated searches against the same collection, prefer creating a
649
+ * {@link FuzzySearch} instance directly.
650
+ *
651
+ * Time: O(n * k) where n = collection size, k = number of keys
652
+ *
653
+ * @param query - The search query string
654
+ * @param collection - Array of strings or objects to search
655
+ * @param options - Search configuration
656
+ * @returns The best matching result, or null if nothing meets the threshold
657
+ *
658
+ * @example
659
+ * ```ts
660
+ * // String array
661
+ * const result = findBestMatch("kitten", ["sitting", "kit", "mitten"]);
662
+ * console.log(result?.item); // "mitten"
663
+ * console.log(result?.score); // ~0.83 (1 - 1/6)
664
+ *
665
+ * // Object array with weighted keys
666
+ * const books = [
667
+ * { title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
668
+ * { title: "Great Expectations", author: "Charles Dickens" },
669
+ * ];
670
+ * const result = findBestMatch("grate gatsbi", books, {
671
+ * keys: [
672
+ * { name: "title", weight: 0.7 },
673
+ * { name: "author", weight: 0.3 },
674
+ * ],
675
+ * });
676
+ * ```
677
+ */
678
+ declare function findBestMatch<T>(query: string, collection: ReadonlyArray<T>, options?: IFindBestMatchOptions): ISearchResult<T> | null;
679
+ //#endregion
680
+ export { BuiltinAlgorithm, DiffType, FuzzySearch, type IDiffItem, type IDiffOptions, IFindBestMatchOptions, IFuzzySearchOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISearchKey, ISearchResult, ISearchResultWithDetails, ISimHashOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, SimilarityFn, cosine, cosineNgram, diff, findBestMatch, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
package/dist/index.mjs CHANGED
@@ -800,4 +800,335 @@ function bandHash(slice) {
800
800
  return hash.toString(36);
801
801
  }
802
802
  //#endregion
803
- export { DiffType, LSH, MinHash, SimHasher, cosine, cosineNgram, diff, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
803
+ //#region src/search.ts
804
+ const BUILTIN_ALGORITHMS = {
805
+ levenshtein: levenshteinNormalized,
806
+ lcs: lcsNormalized,
807
+ jaccard,
808
+ jaccardNgram,
809
+ cosine,
810
+ cosineNgram,
811
+ sorensen,
812
+ sorensenNgram
813
+ };
814
+ function resolveKeys(rawKeys) {
815
+ if (rawKeys.length === 0) return [];
816
+ const totalWeight = rawKeys.reduce((sum, k) => sum + (k.weight ?? 1), 0);
817
+ return rawKeys.map((k) => ({
818
+ ...k,
819
+ normalizedWeight: totalWeight > 0 ? (k.weight ?? 1) / totalWeight : 1 / rawKeys.length
820
+ }));
821
+ }
822
+ function resolveAlgorithm(algo) {
823
+ if (algo === void 0) return BUILTIN_ALGORITHMS.levenshtein;
824
+ if (typeof algo === "function") return algo;
825
+ return BUILTIN_ALGORITHMS[algo];
826
+ }
827
+ /**
828
+ * Fuzzy search engine for finding similar items in a collection.
829
+ *
830
+ * Supports both string arrays and object arrays with weighted multi-key search.
831
+ * Uses any similarity algorithm from @nlptools/distance, with optional LSH
832
+ * acceleration for large datasets.
833
+ *
834
+ * @example
835
+ * ```ts
836
+ * // String array search
837
+ * const search = new FuzzySearch(["apple", "banana", "cherry"]);
838
+ * const results = search.search("aple"); // [{ item: "apple", score: 0.8, index: 0 }, ...]
839
+ *
840
+ * // Object array with weighted keys
841
+ * const books = [
842
+ * { title: "Old Man's War", author: "John Scalzi" },
843
+ * { title: "The Lock Artist", author: "Steve Hamilton" },
844
+ * ];
845
+ * const bookSearch = new FuzzySearch(books, {
846
+ * keys: [
847
+ * { name: "title", weight: 0.7 },
848
+ * { name: "author", weight: 0.3 },
849
+ * ],
850
+ * algorithm: "cosine",
851
+ * });
852
+ * const results = bookSearch.search("old man"); // finds "Old Man's War"
853
+ * ```
854
+ */
855
+ var FuzzySearch = class {
856
+ similarityFn;
857
+ keys;
858
+ threshold;
859
+ limit;
860
+ caseSensitive;
861
+ includeMatchDetails;
862
+ isObjectArray;
863
+ collection;
864
+ useLSH;
865
+ lshNumHashes;
866
+ lshNumBands;
867
+ lshIndex;
868
+ minHashSignatures;
869
+ constructor(collection, options = {}) {
870
+ this.similarityFn = resolveAlgorithm(options.algorithm);
871
+ this.keys = resolveKeys(options.keys ?? []);
872
+ this.isObjectArray = this.keys.length > 0;
873
+ this.threshold = options.threshold ?? 0;
874
+ this.limit = options.limit ?? Infinity;
875
+ this.caseSensitive = options.caseSensitive ?? false;
876
+ this.includeMatchDetails = options.includeMatchDetails ?? false;
877
+ this.collection = [...collection];
878
+ const lshOpts = options.lsh;
879
+ this.useLSH = lshOpts !== void 0;
880
+ this.lshNumHashes = lshOpts?.numHashes ?? 128;
881
+ this.lshNumBands = lshOpts?.numBands ?? 16;
882
+ this.lshIndex = null;
883
+ this.minHashSignatures = /* @__PURE__ */ new Map();
884
+ if (this.useLSH && this.collection.length > 0) this.buildLSHIndex();
885
+ }
886
+ /**
887
+ * Search the collection for items similar to the query.
888
+ *
889
+ * @param query - The search query string
890
+ * @param limit - Optional per-query limit override
891
+ * @returns Array of results sorted by score descending
892
+ */
893
+ search(query, limit) {
894
+ const effectiveLimit = limit ?? this.limit;
895
+ if (effectiveLimit === 0 || this.collection.length === 0) return [];
896
+ const normalizedQuery = this.normalizeString(query);
897
+ if (this.useLSH && this.lshIndex !== null) return this.searchWithLSH(normalizedQuery, effectiveLimit);
898
+ return this.searchLinear(normalizedQuery, effectiveLimit);
899
+ }
900
+ /**
901
+ * Add an item to the collection.
902
+ * If an LSH index has already been built, the new item is inserted into it incrementally.
903
+ */
904
+ add(item) {
905
+ const index = this.collection.length;
906
+ this.collection.push(item);
907
+ if (this.useLSH && this.lshIndex !== null) {
908
+ const text = this.extractSearchText(item);
909
+ const sig = this.buildMinHashSignature(text);
910
+ this.minHashSignatures.set(index, sig);
911
+ this.lshIndex.insert(String(index), sig);
912
+ }
913
+ }
914
+ /**
915
+ * Remove an item from the collection by index.
916
+ * If LSH is enabled, the index is rebuilt (O(n)).
917
+ *
918
+ * @returns true if the item was found and removed
919
+ */
920
+ remove(index) {
921
+ if (index < 0 || index >= this.collection.length) return false;
922
+ this.collection.splice(index, 1);
923
+ if (this.useLSH) this.buildLSHIndex();
924
+ return true;
925
+ }
926
+ /**
927
+ * Replace the entire collection.
928
+ * If LSH is enabled, the index is rebuilt.
929
+ */
930
+ setCollection(collection) {
931
+ this.collection = [...collection];
932
+ if (this.useLSH && this.collection.length > 0) this.buildLSHIndex();
933
+ else if (this.useLSH) {
934
+ this.lshIndex = null;
935
+ this.minHashSignatures.clear();
936
+ }
937
+ }
938
+ /**
939
+ * Get the current collection.
940
+ */
941
+ getCollection() {
942
+ return this.collection;
943
+ }
944
+ /**
945
+ * Get the number of items in the collection.
946
+ */
947
+ get size() {
948
+ return this.collection.length;
949
+ }
950
+ /**
951
+ * Clear the collection and any LSH index.
952
+ */
953
+ clear() {
954
+ this.collection = [];
955
+ this.lshIndex = null;
956
+ this.minHashSignatures.clear();
957
+ }
958
+ searchLinear(normalizedQuery, limit) {
959
+ const candidates = [];
960
+ for (let i = 0; i < this.collection.length; i++) {
961
+ const item = this.collection[i];
962
+ if (this.isObjectArray) if (this.includeMatchDetails) {
963
+ const { score, matches } = this.computeDetailedScore(normalizedQuery, item);
964
+ if (score >= this.threshold) candidates.push({
965
+ item,
966
+ score,
967
+ index: i,
968
+ matches
969
+ });
970
+ } else {
971
+ const score = this.computeItemScore(normalizedQuery, item);
972
+ if (score >= this.threshold) candidates.push({
973
+ item,
974
+ score,
975
+ index: i
976
+ });
977
+ }
978
+ else {
979
+ const itemStr = this.normalizeString(item);
980
+ const score = this.similarityFn(normalizedQuery, itemStr);
981
+ if (score >= this.threshold) candidates.push({
982
+ item,
983
+ score,
984
+ index: i
985
+ });
986
+ }
987
+ }
988
+ candidates.sort((a, b) => b.score - a.score);
989
+ if (candidates.length <= limit) return candidates;
990
+ return candidates.slice(0, limit);
991
+ }
992
+ searchWithLSH(normalizedQuery, limit) {
993
+ const queryText = this.isObjectArray ? normalizedQuery : normalizedQuery;
994
+ const querySig = this.buildMinHashSignature(queryText);
995
+ const candidateIds = this.lshIndex.query(querySig, this.threshold);
996
+ const candidates = [];
997
+ for (const [id] of candidateIds) {
998
+ const idx = parseInt(id, 10);
999
+ if (idx < 0 || idx >= this.collection.length) continue;
1000
+ const item = this.collection[idx];
1001
+ if (this.isObjectArray) if (this.includeMatchDetails) {
1002
+ const { score, matches } = this.computeDetailedScore(normalizedQuery, item);
1003
+ if (score >= this.threshold) candidates.push({
1004
+ item,
1005
+ score,
1006
+ index: idx,
1007
+ matches
1008
+ });
1009
+ } else {
1010
+ const score = this.computeItemScore(normalizedQuery, item);
1011
+ if (score >= this.threshold) candidates.push({
1012
+ item,
1013
+ score,
1014
+ index: idx
1015
+ });
1016
+ }
1017
+ else {
1018
+ const itemStr = this.normalizeString(item);
1019
+ const score = this.similarityFn(normalizedQuery, itemStr);
1020
+ if (score >= this.threshold) candidates.push({
1021
+ item,
1022
+ score,
1023
+ index: idx
1024
+ });
1025
+ }
1026
+ }
1027
+ candidates.sort((a, b) => b.score - a.score);
1028
+ if (candidates.length <= limit) return candidates;
1029
+ return candidates.slice(0, limit);
1030
+ }
1031
+ buildLSHIndex() {
1032
+ this.lshIndex = new LSH({
1033
+ numBands: this.lshNumBands,
1034
+ numHashes: this.lshNumHashes
1035
+ });
1036
+ this.minHashSignatures.clear();
1037
+ for (let i = 0; i < this.collection.length; i++) {
1038
+ const text = this.extractSearchText(this.collection[i]);
1039
+ const sig = this.buildMinHashSignature(text);
1040
+ this.minHashSignatures.set(i, sig);
1041
+ this.lshIndex.insert(String(i), sig);
1042
+ }
1043
+ }
1044
+ buildMinHashSignature(text) {
1045
+ const mh = new MinHash({ numHashes: this.lshNumHashes });
1046
+ const grams = ngrams(text, 2);
1047
+ for (const g of grams) mh.update(g);
1048
+ return mh.digest();
1049
+ }
1050
+ computeItemScore(normalizedQuery, item) {
1051
+ let score = 0;
1052
+ for (const key of this.keys) {
1053
+ const value = this.extractKeyValue(item, key);
1054
+ const normalizedValue = this.normalizeString(value);
1055
+ score += key.normalizedWeight * this.similarityFn(normalizedQuery, normalizedValue);
1056
+ }
1057
+ return score;
1058
+ }
1059
+ computeDetailedScore(normalizedQuery, item) {
1060
+ let score = 0;
1061
+ const matches = {};
1062
+ for (const key of this.keys) {
1063
+ const value = this.extractKeyValue(item, key);
1064
+ const normalizedValue = this.normalizeString(value);
1065
+ const s = this.similarityFn(normalizedQuery, normalizedValue);
1066
+ matches[key.name] = s;
1067
+ score += key.normalizedWeight * s;
1068
+ }
1069
+ return {
1070
+ score,
1071
+ matches
1072
+ };
1073
+ }
1074
+ extractSearchText(item) {
1075
+ if (this.isObjectArray) return this.keys.map((k) => this.extractKeyValue(item, k)).join(" ");
1076
+ return this.normalizeString(item);
1077
+ }
1078
+ extractKeyValue(item, key) {
1079
+ if (key.getter) {
1080
+ const value = key.getter(item);
1081
+ return typeof value === "string" ? value : "";
1082
+ }
1083
+ const value = item[key.name];
1084
+ return typeof value === "string" ? value : "";
1085
+ }
1086
+ normalizeString(str) {
1087
+ return this.caseSensitive ? str : str.toLowerCase();
1088
+ }
1089
+ };
1090
+ /**
1091
+ * Find the single best match for a query against a collection.
1092
+ *
1093
+ * This is a convenience wrapper around {@link FuzzySearch} for one-shot queries.
1094
+ * For repeated searches against the same collection, prefer creating a
1095
+ * {@link FuzzySearch} instance directly.
1096
+ *
1097
+ * Time: O(n * k) where n = collection size, k = number of keys
1098
+ *
1099
+ * @param query - The search query string
1100
+ * @param collection - Array of strings or objects to search
1101
+ * @param options - Search configuration
1102
+ * @returns The best matching result, or null if nothing meets the threshold
1103
+ *
1104
+ * @example
1105
+ * ```ts
1106
+ * // String array
1107
+ * const result = findBestMatch("kitten", ["sitting", "kit", "mitten"]);
1108
+ * console.log(result?.item); // "mitten"
1108
+ * console.log(result?.score); // ~0.83 (1 - 1/6)
1110
+ *
1111
+ * // Object array with weighted keys
1112
+ * const books = [
1113
+ * { title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
1114
+ * { title: "Great Expectations", author: "Charles Dickens" },
1115
+ * ];
1116
+ * const result = findBestMatch("grate gatsbi", books, {
1117
+ * keys: [
1118
+ * { name: "title", weight: 0.7 },
1119
+ * { name: "author", weight: 0.3 },
1120
+ * ],
1121
+ * });
1122
+ * ```
1123
+ */
1124
+ function findBestMatch(query, collection, options = {}) {
1125
+ const results = new FuzzySearch(collection, {
1126
+ algorithm: options.algorithm,
1127
+ keys: options.keys,
1128
+ threshold: options.threshold,
1129
+ caseSensitive: options.caseSensitive
1130
+ }).search(query, 1);
1131
+ return results.length > 0 ? results[0] : null;
1132
+ }
1133
+ //#endregion
1134
+ export { DiffType, FuzzySearch, LSH, MinHash, SimHasher, cosine, cosineNgram, diff, findBestMatch, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nlptools/distance",
3
- "version": "0.0.3",
3
+ "version": "0.0.4",
4
4
  "description": "Complete string distance and similarity algorithms package with WebAssembly and JavaScript implementations",
5
5
  "keywords": [
6
6
  "algorithms",