cmpstr 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +438 -79
- package/package.json +47 -25
- package/src/CmpStr.js +784 -0
- package/src/CmpStrAsync.js +191 -0
- package/src/algorithms/cosine.js +86 -0
- package/src/algorithms/damerau.js +78 -0
- package/src/algorithms/dice.js +65 -0
- package/src/algorithms/hamming.js +44 -0
- package/src/algorithms/jaccard.js +34 -0
- package/src/algorithms/jaroWinkler.js +106 -0
- package/src/algorithms/lcs.js +58 -0
- package/src/algorithms/levenshtein.js +70 -0
- package/src/algorithms/needlemanWunsch.js +72 -0
- package/src/algorithms/qGram.js +63 -0
- package/src/algorithms/smithWaterman.js +78 -0
- package/src/algorithms/soundex.js +152 -0
- package/src/index.js +47 -0
- package/index.js +0 -425
package/index.js
DELETED
|
@@ -1,425 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* cmpstr
|
|
3
|
-
* lightweight npm package to calculate string similarity
|
|
4
|
-
*
|
|
5
|
-
* @author komed3 (Paul Köhler)
|
|
6
|
-
* @version 1.0.2
|
|
7
|
-
* @license MIT
|
|
8
|
-
*/
|
|
9
|
-
|
|
10
|
-
'use strict'
|
|
11
|
-
|
|
12
|
-
/**
|
|
13
|
-
* basic functions
|
|
14
|
-
* @private
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
/**
 * normalize a value for comparison
 *
 * The input is converted with `toString()`; each character of `flags` is
 * then applied in the order given (flag letters are matched case-insensitively):
 *   i   lower-case the string (case insensitive comparison)
 *   s   strip every run of whitespace
 * Any other flag character is ignored.
 *
 * @param {String} str string
 * @param {Null|String} flags options
 * @returns {String} normalized string
 */
const normalize = ( str, flags = null ) => {

    let result = str.toString();

    /* a falsy flags value means "no options" */
    const flagChars = ( flags || '' ).toString().split( '' );

    for( const flag of flagChars ) {

        const key = flag.toLowerCase();

        if( key === 'i' ) {

            result = result.toLowerCase();

        } else if( key === 's' ) {

            /* [^\S] matches exactly the whitespace characters */
            result = result.replace( /[^\S]+/g, '' );

        }

    }

    return result;

};
|
|
56
|
-
|
|
57
|
-
/**
 * collect the distinct bigrams (adjacent two-character substrings) of a string
 *
 * Strings shorter than two characters produce an empty set.
 *
 * @param {String} str string
 * @returns {Set} set of unique bigrams, in order of first occurrence
 */
const str2bigrams = ( str ) => {

    const pairs = new Set();
    let start = 0;

    /* stop once fewer than two characters remain */
    while( start < str.length - 1 ) {

        pairs.add( str.substring( start, start + 2 ) );
        start += 1;

    }

    return pairs;

};
|
|
77
|
-
|
|
78
|
-
/**
 * dispatch a comparison to the similarity function named by `algo`
 *
 * Recognized names are 'levenshtein' and 'diceCoefficient'; any other
 * value yields 0.
 *
 * @param {String} algo algorithm to use
 * @param {String} a string 1
 * @param {String} b string 2
 * @param {Null|String} flags options
 * @returns similarity as returned by the selected function, or 0
 */
const cpmByAlgo = ( algo, a, b, flags ) => {

    if( algo === 'levenshtein' ) {

        return levenshtein( a, b, flags );

    }

    if( algo === 'diceCoefficient' ) {

        return diceCoefficient( a, b, flags );

    }

    /* unknown algorithm */

    return 0;

};
|
|
102
|
-
|
|
103
|
-
/**
 * search for the target that is most similar to the test string
 *
 * Every element of `arr` is scored with `cpmByAlgo`; the element with the
 * highest score is returned. On equal scores the earliest element wins.
 * `arr` may be any iterable (array, Set, ...): it is materialized once and
 * the result is taken from that same list, so the returned element is
 * always the one that was actually scored.
 *
 * @param {String} algo algorithm to use
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @param {Null|String} flags options
 * @returns closest target, or undefined if there are no targets
 */
const findClosest = ( algo, test, arr, flags ) => {

    /* copy once so that iteration and the final lookup agree */

    const targets = [ ...arr ];

    let best = -Infinity,
        idx = 0;

    /* search for closest element */

    targets.forEach( ( str, i ) => {

        const pct = cpmByAlgo( algo, test, str, flags );

        /* strict comparison keeps the first of several equal scores */

        if( pct > best ) {

            best = pct;
            idx = i;

        }

    } );

    /* return closest target */

    return targets[ idx ];

};
|
|
139
|
-
|
|
140
|
-
/**
 * score every target against the test string and order them best first
 *
 * Each result entry has the shape `{ target, match }`, where `match` is the
 * value returned by `cpmByAlgo` for that target.
 *
 * @param {String} algo algorithm to use
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @param {Null|String} flags options
 * @returns {Array} matches sorted by descending similarity
 */
const bestMatch = ( algo, test, arr, flags = null ) => {

    /* calculate similarity for each target */

    const scored = [ ...arr ].map( ( candidate ) => ( {
        target: candidate,
        match: cpmByAlgo( algo, test, candidate, flags )
    } ) );

    /* highest similarity first */

    scored.sort( ( x, y ) => y.match - x.match );

    return scored;

};
|
|
177
|
-
|
|
178
|
-
/**
|
|
179
|
-
* similarity calculations
|
|
180
|
-
* @public
|
|
181
|
-
*/
|
|
182
|
-
|
|
183
|
-
/**
 * calculate levenshtein similarity
 *
 * Both strings are normalized with `flags`, then the similarity is
 *   1 - distance / max( length of a, length of b )
 * Equal strings (including two empty strings) score 1. Strings of any
 * length, including single characters, are scored by the same formula,
 * e.g. 'a' vs. 'ab' gives 0.5.
 *
 * @param {String} a string 1
 * @param {String} b string 2
 * @param {Null|String} flags options
 * @returns {Number} similarity 0..1
 */
const levenshtein = ( a, b, flags = null ) => {

    /* normalize strings */

    const strA = normalize( a, flags );
    const strB = normalize( b, flags );

    /* both strings are identical or empty */

    if( strA === strB ) {

        return 1;

    }

    /* strings are already normalized, so no flags are passed on */

    const distance = levenshteinDistance( strA, strB );

    /* the strings differ, so the longer one has at least one character */

    return 1 - distance / Math.max( strA.length, strB.length );

};
|
|
227
|
-
|
|
228
|
-
/**
 * get levenshtein distance
 *
 * Both strings are normalized with `flags` first. The distance is the
 * minimum number of single-character insertions, deletions and
 * substitutions needed to turn one string into the other, computed with a
 * full ( a.length + 1 ) x ( b.length + 1 ) dynamic-programming table.
 *
 * @param {String} a string 1
 * @param {String} b string 2
 * @param {Null|String} flags options
 * @returns {Number} distance
 */
const levenshteinDistance = ( a, b, flags = null ) => {

    /* normalize strings */

    const source = normalize( a, flags );
    const target = normalize( b, flags );

    /* trivial cases: identical strings or one empty string */

    if( source === target ) {

        return 0;

    }

    if( source.length === 0 ) {

        return target.length;

    }

    if( target.length === 0 ) {

        return source.length;

    }

    const rows = source.length;
    const cols = target.length;

    /* table seeded with column indices; first column holds the row index */

    const grid = Array.from( { length: rows + 1 }, ( _, r ) =>
        Array.from( { length: cols + 1 }, ( __, c ) => ( c === 0 ? r : c ) )
    );

    /* fill the table row by row */

    for( let r = 1; r <= rows; r++ ) {

        for( let c = 1; c <= cols; c++ ) {

            const same = source[ r - 1 ] === target[ c - 1 ];

            /* equal characters cost nothing, otherwise take the cheapest edit */

            grid[ r ][ c ] = same
                ? grid[ r - 1 ][ c - 1 ]
                : 1 + Math.min(
                    grid[ r ][ c - 1 ],
                    grid[ r - 1 ][ c - 1 ],
                    grid[ r - 1 ][ c ]
                );

        }

    }

    /* bottom-right cell holds the distance */

    return grid[ rows ][ cols ];

};
|
|
313
|
-
|
|
314
|
-
/**
 * find the target closest to the test string by levenshtein similarity
 *
 * Delegates to `findClosest` with the 'levenshtein' algorithm.
 *
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @param {Null|String} flags options
 * @returns closest target
 */
const levenshteinClosest = ( test, arr, flags = null ) =>
    findClosest( 'levenshtein', test, arr, flags );
|
|
326
|
-
|
|
327
|
-
/**
 * rank targets against the test string by levenshtein similarity
 *
 * Delegates to `bestMatch` with the 'levenshtein' algorithm.
 *
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @param {Null|String} flags options
 * @returns sorted matches
 */
const levenshteinMatch = ( test, arr, flags = null ) =>
    bestMatch( 'levenshtein', test, arr, flags );
|
|
339
|
-
|
|
340
|
-
/**
 * calculate dice coefficient on the distinct bigrams of two strings
 *
 * Both strings are normalized with `flags`. Equal strings score 1; if
 * either string has fewer than two characters (and the strings differ)
 * the result is 0. Otherwise the result is
 *   2 * |shared bigrams| / ( |bigrams of a| + |bigrams of b| )
 *
 * @param {String} a string 1
 * @param {String} b string 2
 * @param {Null|String} flags options
 * @returns {Number} dice coefficient
 */
const diceCoefficient = ( a, b, flags = null ) => {

    /* normalize strings */

    const left = normalize( a, flags );
    const right = normalize( b, flags );

    /* both strings are identical or empty */

    if( left === right ) {

        return 1;

    }

    /* no bigram can be formed from 0-letter or 1-letter strings */

    if( left.length < 2 || right.length < 2 ) {

        return 0;

    }

    /* get bigrams */

    const gramsLeft = str2bigrams( left );
    const gramsRight = str2bigrams( right );

    /* count bigrams present in both sets */

    let shared = 0;

    for( const gram of gramsLeft ) {

        if( gramsRight.has( gram ) ) {

            shared += 1;

        }

    }

    /* calculate dice coefficient */

    return ( shared * 2 ) / ( gramsLeft.size + gramsRight.size );

};
|
|
387
|
-
|
|
388
|
-
/**
 * find the target closest to the test string by dice coefficient
 *
 * Delegates to `findClosest` with the 'diceCoefficient' algorithm.
 *
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @param {Null|String} flags options
 * @returns closest target
 */
const diceClosest = ( test, arr, flags = null ) =>
    findClosest( 'diceCoefficient', test, arr, flags );
|
|
400
|
-
|
|
401
|
-
/**
 * rank targets against the test string by dice coefficient
 *
 * Delegates to `bestMatch` with the 'diceCoefficient' algorithm.
 *
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @param {Null|String} flags options
 * @returns sorted matches
 */
const diceMatch = ( test, arr, flags = null ) =>
    bestMatch( 'diceCoefficient', test, arr, flags );
|
|
413
|
-
|
|
414
|
-
/**
|
|
415
|
-
* export module functions
|
|
416
|
-
*/
|
|
417
|
-
module.exports = {
|
|
418
|
-
levenshtein,
|
|
419
|
-
levenshteinDistance,
|
|
420
|
-
levenshteinClosest,
|
|
421
|
-
levenshteinMatch,
|
|
422
|
-
diceCoefficient,
|
|
423
|
-
diceClosest,
|
|
424
|
-
diceMatch
|
|
425
|
-
};
|