@nlptools/distance 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -31
- package/dist/index.d.mts +707 -4
- package/dist/index.mjs +959 -54
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
- Edit distance: Levenshtein, LCS (Myers O(ND) and DP)
|
|
12
12
|
- Token similarity: Jaccard, Cosine, Sorensen-Dice (character multiset and n-gram variants)
|
|
13
13
|
- Hash-based deduplication: SimHash, MinHash, LSH
|
|
14
|
+
- Fuzzy search: `FuzzySearch` class and `findBestMatch` with multi-algorithm support
|
|
14
15
|
- Diff: based on `@algorithm.ts/diff` (Myers and DP backends)
|
|
15
16
|
- All distance algorithms include normalized similarity variants (0-1 range)
|
|
16
17
|
|
|
@@ -123,6 +124,45 @@ const query = lsh.query(mh.digest(), 0.5);
|
|
|
123
124
|
// => [["doc1", 0.67]]
|
|
124
125
|
```
|
|
125
126
|
|
|
127
|
+
### Fuzzy Search
|
|
128
|
+
|
|
129
|
+
```typescript
|
|
130
|
+
import { FuzzySearch, findBestMatch } from "@nlptools/distance";
|
|
131
|
+
|
|
132
|
+
// String array search
|
|
133
|
+
const search = new FuzzySearch(["apple", "banana", "cherry"]);
|
|
134
|
+
search.search("aple");
|
|
135
|
+
// => [{ item: "apple", score: 0.8, index: 0 }]
|
|
136
|
+
|
|
137
|
+
// Object array with weighted keys
|
|
138
|
+
const books = [
|
|
139
|
+
{ title: "Old Man's War", author: "John Scalzi" },
|
|
140
|
+
{ title: "The Great Gatsby", author: "F. Scott Fitzgerald" },
|
|
141
|
+
];
|
|
142
|
+
const bookSearch = new FuzzySearch(books, {
|
|
143
|
+
keys: [
|
|
144
|
+
{ name: "title", weight: 0.7 },
|
|
145
|
+
{ name: "author", weight: 0.3 },
|
|
146
|
+
],
|
|
147
|
+
algorithm: "cosine",
|
|
148
|
+
threshold: 0.3,
|
|
149
|
+
});
|
|
150
|
+
bookSearch.search("old man");
|
|
151
|
+
// => [{ item: { title: "Old Man's War", ... }, score: 0.52, index: 0 }]
|
|
152
|
+
|
|
153
|
+
// One-shot best match
|
|
154
|
+
findBestMatch("kitten", ["sitting", "kit", "mitten"]);
|
|
155
|
+
// => { item: "mitten", score: 0.83, index: 2 }
|
|
156
|
+
|
|
157
|
+
// With per-key details
|
|
158
|
+
const detailed = new FuzzySearch(books, {
|
|
159
|
+
keys: [{ name: "title" }, { name: "author" }],
|
|
160
|
+
includeMatchDetails: true,
|
|
161
|
+
});
|
|
162
|
+
detailed.search("gatsby");
|
|
163
|
+
// => [{ item: ..., score: 0.45, index: 1, matches: { title: 0.6, author: 0.1 } }]
|
|
164
|
+
```
|
|
165
|
+
|
|
126
166
|
### Diff
|
|
127
167
|
|
|
128
168
|
```typescript
|
|
@@ -172,6 +212,27 @@ const result = diff("abc", "ac");
|
|
|
172
212
|
| `MinHash.estimate(sig1, sig2)` | Static: estimate Jaccard from signatures |
|
|
173
213
|
| `LSH` | Class with `insert()`, `query()`, `remove()` |
|
|
174
214
|
|
|
215
|
+
### Fuzzy Search
|
|
216
|
+
|
|
217
|
+
| Function / Class | Description |
|
|
218
|
+
| -------------------------------------------- | -------------------------------------------------- |
|
|
219
|
+
| `FuzzySearch<T>(collection, options?)` | Search engine with dynamic collection management |
|
|
220
|
+
| `findBestMatch(query, collection, options?)` | One-shot convenience: returns best match or `null` |
|
|
221
|
+
|
|
222
|
+
**FuzzySearch options:**
|
|
223
|
+
|
|
224
|
+
| Option | Type | Default | Description |
|
|
225
|
+
| --------------------- | ---------------------------------- | --------------- | ----------------------------- |
|
|
226
|
+
| `algorithm` | `BuiltinAlgorithm \| SimilarityFn` | `"levenshtein"` | Similarity algorithm to use |
|
|
227
|
+
| `keys` | `ISearchKey[]` | `[]` | Object fields to search on |
|
|
228
|
+
| `threshold` | `number` | `0` | Min similarity score (0-1) |
|
|
229
|
+
| `limit` | `number` | `Infinity` | Max results to return |
|
|
230
|
+
| `caseSensitive` | `boolean` | `false` | Match case-sensitively (off by default) |
|
|
231
|
+
| `includeMatchDetails` | `boolean` | `false` | Include per-key scores |
|
|
232
|
+
| `lsh` | `{ numHashes?, numBands? }` | — | Enable LSH for large datasets |
|
|
233
|
+
|
|
234
|
+
**Built-in algorithms:** `"levenshtein"`, `"lcs"`, `"jaccard"`, `"jaccardNgram"`, `"cosine"`, `"cosineNgram"`, `"sorensen"`, `"sorensenNgram"`
|
|
235
|
+
|
|
175
236
|
### Diff
|
|
176
237
|
|
|
177
238
|
| Function | Description | Returns |
|
|
@@ -180,57 +241,76 @@ const result = diff("abc", "ac");
|
|
|
180
241
|
|
|
181
242
|
### Types
|
|
182
243
|
|
|
183
|
-
| Type
|
|
184
|
-
|
|
|
185
|
-
| `DiffType`
|
|
186
|
-
| `IDiffItem<T>`
|
|
187
|
-
| `IDiffOptions<T>`
|
|
188
|
-
| `ISimHashOptions`
|
|
189
|
-
| `IMinHashOptions`
|
|
190
|
-
| `ILSHOptions`
|
|
244
|
+
| Type | Description |
|
|
245
|
+
| ----------------------- | -------------------------------------------- |
|
|
246
|
+
| `DiffType` | Enum: `ADDED`, `REMOVED`, `COMMON` |
|
|
247
|
+
| `IDiffItem<T>` | Diff result item with type and tokens |
|
|
248
|
+
| `IDiffOptions<T>` | Options for diff (equals, lcs algorithm) |
|
|
249
|
+
| `ISimHashOptions` | Options for SimHash (bits, hashFn) |
|
|
250
|
+
| `IMinHashOptions` | Options for MinHash (numHashes, seed) |
|
|
251
|
+
| `ILSHOptions` | Options for LSH (numBands, numHashes) |
|
|
252
|
+
| `IFuzzySearchOptions` | Options for FuzzySearch constructor |
|
|
253
|
+
| `IFindBestMatchOptions` | Options for findBestMatch function |
|
|
254
|
+
| `ISearchKey` | Searchable key config (name, weight, getter) |
|
|
255
|
+
| `ISearchResult<T>` | Search result with item, score, index |
|
|
256
|
+
| `SimilarityFn` | `(a: string, b: string) => number` in [0,1] |
|
|
191
257
|
|
|
192
258
|
## Performance
|
|
193
259
|
|
|
194
|
-
Benchmark:
|
|
260
|
+
Benchmark: the same test data is used across all runtimes. TS and WASM are measured via `vitest bench` (V8 JIT), Rust via `cargo test --release`.
|
|
195
261
|
Unit: microseconds per operation (us/op).
|
|
196
262
|
|
|
197
263
|
### Edit Distance
|
|
198
264
|
|
|
199
265
|
| Algorithm | Size | TS (V8 JIT) | WASM (via JS) | Rust (native) |
|
|
200
266
|
| --------------- | --------------- | ----------- | ------------- | ------------- |
|
|
201
|
-
| levenshtein | Short (<10) | 0.3 |
|
|
202
|
-
| levenshtein | Medium (10-100) | 1.3 |
|
|
203
|
-
| levenshtein | Long (>200) |
|
|
204
|
-
| levenshteinNorm | Short | 0.3 |
|
|
205
|
-
| lcs | Short (<10) | 1.
|
|
206
|
-
| lcs | Medium (10-100) | 6.8 |
|
|
207
|
-
| lcs | Long (>200) |
|
|
208
|
-
| lcsNorm | Short | 1.7 |
|
|
267
|
+
| levenshtein | Short (<10) | 0.3 | 1.0 | 0.24 |
|
|
268
|
+
| levenshtein | Medium (10-100) | 1.3 | 4.8 | 2.00 |
|
|
269
|
+
| levenshtein | Long (>200) | 13.9 | 102.3 | 61.77 |
|
|
270
|
+
| levenshteinNorm | Short | 0.3 | 1.0 | 0.19 |
|
|
271
|
+
| lcs | Short (<10) | 1.7 | 1.9 | 0.69 |
|
|
272
|
+
| lcs | Medium (10-100) | 6.8 | 10.1 | 7.70 |
|
|
273
|
+
| lcs | Long (>200) | 216.0 | 161.8 | 151.84 |
|
|
274
|
+
| lcsNorm | Short | 1.7 | 1.9 | 0.42 |
|
|
209
275
|
|
|
210
276
|
### Token Similarity (Character Multiset)
|
|
211
277
|
|
|
212
278
|
| Algorithm | Size | TS (V8 JIT) | WASM (via JS) | Rust (native) |
|
|
213
279
|
| --------- | --------------- | ----------- | ------------- | ------------- |
|
|
214
|
-
| jaccard | Short (<10) | 0.8 |
|
|
215
|
-
| jaccard | Medium (10-100) | 0.8 |
|
|
216
|
-
| jaccard | Long (>200) | 1.
|
|
217
|
-
| cosine | Short (<10) | 0
|
|
218
|
-
| cosine | Medium (10-100) | 0.8 |
|
|
219
|
-
| cosine | Long (>200) | 1.
|
|
220
|
-
| sorensen | Short (<10) | 0.7 |
|
|
221
|
-
| sorensen | Medium (10-100) | 0.7 |
|
|
222
|
-
| sorensen | Long (>200) | 1.
|
|
280
|
+
| jaccard | Short (<10) | 0.8 | 3.4 | 0.63 |
|
|
281
|
+
| jaccard | Medium (10-100) | 0.8 | 8.6 | 2.67 |
|
|
282
|
+
| jaccard | Long (>200) | 1.5 | 18.9 | 7.25 |
|
|
283
|
+
| cosine | Short (<10) | 1.0 | 2.6 | 0.43 |
|
|
284
|
+
| cosine | Medium (10-100) | 0.8 | 7.0 | 1.56 |
|
|
285
|
+
| cosine | Long (>200) | 1.7 | 17.2 | 6.23 |
|
|
286
|
+
| sorensen | Short (<10) | 0.7 | 2.6 | 0.56 |
|
|
287
|
+
| sorensen | Medium (10-100) | 0.7 | 7.0 | 2.27 |
|
|
288
|
+
| sorensen | Long (>200) | 1.4 | 17.4 | 6.48 |
|
|
223
289
|
|
|
224
290
|
### Bigram Variants
|
|
225
291
|
|
|
226
292
|
| Algorithm | Size | TS (V8 JIT) | WASM (via JS) | Rust (native) |
|
|
227
293
|
| ------------- | --------------- | ----------- | ------------- | ------------- |
|
|
228
|
-
| jaccardBigram | Short (<10) | 1.1 |
|
|
229
|
-
| jaccardBigram | Medium (10-100) | 7.
|
|
230
|
-
| cosineBigram | Short (<10) | 0.
|
|
231
|
-
| cosineBigram | Medium (10-100) | 5.
|
|
294
|
+
| jaccardBigram | Short (<10) | 1.1 | 3.5 | 0.67 |
|
|
295
|
+
| jaccardBigram | Medium (10-100) | 7.5 | 18.1 | 4.80 |
|
|
296
|
+
| cosineBigram | Short (<10) | 0.7 | 2.8 | 0.43 |
|
|
297
|
+
| cosineBigram | Medium (10-100) | 5.4 | 14.0 | 4.04 |
|
|
298
|
+
|
|
299
|
+
TS implementations use an `Int32Array` ASCII fast path and integer-encoded bigrams, and they avoid the JS-WASM boundary overhead. For compute-heavy algorithms on long strings (e.g. LCS), WASM via JS and native Rust can still outperform TS, because the advantage of native computation outweighs the boundary cost.
|
|
300
|
+
|
|
301
|
+
### Fuzzy Search: NLPTools vs Fuse.js
|
|
302
|
+
|
|
303
|
+
Benchmark: 20 items in collection, 6 queries per iteration, 1000 iterations.
|
|
304
|
+
Unit: milliseconds per operation (ms/op). Algorithm: levenshtein (default).
|
|
305
|
+
|
|
306
|
+
| Scenario | NLPTools | Fuse.js |
|
|
307
|
+
| ----------------------- | -------- | ------- |
|
|
308
|
+
| Setup (constructor) | 0.0002 | 0.0050 |
|
|
309
|
+
| Search (string array) | 0.0114 | 0.1077 |
|
|
310
|
+
| Search (object, 1 key) | 0.0176 | 0.3308 |
|
|
311
|
+
| Search (object, 2 keys) | 0.0289 | 0.6445 |
|
|
232
312
|
|
|
233
|
-
|
|
313
|
+
Both libraries return identical top-1 results for all test queries. NLPTools scores are normalized similarity (0-1, higher is better); Fuse.js uses Bitap error scores (0 = perfect, lower is better).
|
|
234
314
|
|
|
235
315
|
## Dependencies
|
|
236
316
|
|