cmpstr 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -8
- package/index.js +152 -40
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -24,16 +24,38 @@ Sample of how to use the package in your code:
|
|
|
24
24
|
let str1 = 'kitten';
|
|
25
25
|
let str2 = 'sitting';
|
|
26
26
|
|
|
27
|
+
/**
|
|
28
|
+
* levenshteinDistance
|
|
29
|
+
* expected: 3
|
|
30
|
+
*/
|
|
27
31
|
let distance = cmpstr.levenshteinDistance( str1, str2 );
|
|
28
|
-
// expected 3
|
|
29
32
|
|
|
33
|
+
/**
|
|
34
|
+
* diceCoefficient
|
|
35
|
+
* expected: 0.3636363636363636
|
|
36
|
+
*/
|
|
30
37
|
let dice = cmpstr.diceCoefficient( str1, str2 );
|
|
31
|
-
// expected 0.3636363636363636
|
|
32
38
|
|
|
39
|
+
/**
|
|
40
|
+
* diceClosest
|
|
41
|
+
* expected: bestest
|
|
42
|
+
*/
|
|
33
43
|
let closest = cmpstr.diceClosest( 'best', [
|
|
34
44
|
'better', 'bestest', 'well', 'good'
|
|
35
45
|
] );
|
|
36
|
-
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* levenshteinMatch
|
|
49
|
+
* expected: [
|
|
50
|
+
* { target: 'bestest', match: 0.5714285714285714 },
|
|
51
|
+
* { target: 'better', match: 0.5 },
|
|
52
|
+
* { target: 'well', match: 0.25 },
|
|
53
|
+
* { target: 'good', match: 0 }
|
|
54
|
+
* ]
|
|
55
|
+
*/
|
|
56
|
+
let matches = cmpstr.levenshteinMatch( 'best', [
|
|
57
|
+
'better', 'bestest', 'well', 'good'
|
|
58
|
+
] );
|
|
37
59
|
```
|
|
38
60
|
|
|
39
61
|
## API
|
|
@@ -47,24 +69,56 @@ Learn more about both by visiting these links:
|
|
|
47
69
|
|
|
48
70
|
### Levenshtein distance
|
|
49
71
|
|
|
50
|
-
#### ``levenshteinDistance( a, b )``
|
|
72
|
+
#### ``levenshteinDistance( a, b [, flags = null ] )``
|
|
51
73
|
|
|
52
74
|
Calculates the difference between two strings ``a`` and ``b`` and returns the Levenshtein distance as an integer value.
|
|
53
75
|
|
|
54
|
-
#### ``levenshtein( a, b )``
|
|
76
|
+
#### ``levenshtein( a, b [, flags = null ] )``
|
|
55
77
|
|
|
56
78
|
Returns the Levenshtein-based similarity of two strings ``a`` and ``b`` as a floating point number in the range ``0..1``.
|
|
57
79
|
|
|
58
|
-
#### ``levenshteinClosest( str, arr )``
|
|
80
|
+
#### ``levenshteinClosest( str, arr [, flags = null ] )``
|
|
59
81
|
|
|
60
82
|
Returns the best match of the string ``str`` against the array ``arr`` of passed strings. The function returns the most closely matched string found in the array.
|
|
61
83
|
|
|
84
|
+
#### ``levenshteinMatch( str, arr [, flags = null ] )``
|
|
85
|
+
|
|
86
|
+
Calculates the Levenshtein similarity between ``str`` and every string contained in the array ``arr`` and returns an array of all samples, sorted by similarity in descending order. Each entry is an object of the form ``{ target, match }``.
|
|
87
|
+
|
|
62
88
|
### Sørensen-Dice coefficient
|
|
63
89
|
|
|
64
|
-
#### ``diceCoefficient( a, b )``
|
|
90
|
+
#### ``diceCoefficient( a, b [, flags = null ] )``
|
|
65
91
|
|
|
66
92
|
This function evaluates the similarity of two given strings ``a`` and ``b`` as a percentage value according to the Sørensen-Dice coefficient and returns the result as a floating point number.
|
|
67
93
|
|
|
68
|
-
#### ``diceClosest( str, arr )``
|
|
94
|
+
#### ``diceClosest( str, arr [, flags = null ] )``
|
|
69
95
|
|
|
70
96
|
As another way to find the best match between the string ``str`` and a given array ``arr`` of samples, this function uses the Sørensen-Dice coefficient. Like ``levenshteinClosest``, it returns the best-matching string from the array.
|
|
97
|
+
|
|
98
|
+
#### ``diceMatch( str, arr [, flags = null ] )``
|
|
99
|
+
|
|
100
|
+
Calculates the Sørensen-Dice similarity between ``str`` and every string contained in the array ``arr`` and returns an array of all samples, sorted by similarity in descending order. Each entry is an object of the form ``{ target, match }``.
|
|
101
|
+
|
|
102
|
+
### Flags
|
|
103
|
+
|
|
104
|
+
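Each method accepts an optional ``flags`` string. Every character in the string enables one of the options listed below, so flags can be combined (e.g. ``'is'``):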
|
|
105
|
+
|
|
106
|
+
| Flag | Option |
|
|
107
|
+
| ----- | ------------------------------ |
|
|
108
|
+
| ``i`` | case insensitive |
|
|
109
|
+
| ``s`` | non-whitespace characters only |
|
|
110
|
+
|
|
111
|
+
## Patch notes
|
|
112
|
+
|
|
113
|
+
### 1.0.2
|
|
114
|
+
|
|
115
|
+
* Add normalize options ``i`` and ``s``
|
|
116
|
+
* Minor fixes
|
|
117
|
+
|
|
118
|
+
### 1.0.1
|
|
119
|
+
|
|
120
|
+
* Minor fixes
|
|
121
|
+
|
|
122
|
+
### 1.0.0
|
|
123
|
+
|
|
124
|
+
* Initial release
|
package/index.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* lightweight npm package to calculate string similarity
|
|
4
4
|
*
|
|
5
5
|
* @author komed3 (Paul Köhler)
|
|
6
|
-
* @version 1.0.
|
|
6
|
+
* @version 1.0.2
|
|
7
7
|
* @license MIT
|
|
8
8
|
*/
|
|
9
9
|
|
|
@@ -17,11 +17,40 @@
|
|
|
17
17
|
/**
|
|
18
18
|
* normalize string
|
|
19
19
|
* @param {String} str string
|
|
20
|
+
* @param {Null|String} flags options
|
|
20
21
|
* @returns normalized string
|
|
21
22
|
*/
|
|
22
|
-
const normalize = ( str ) => {
|
|
23
|
+
const normalize = ( str, flags = null ) => {
|
|
23
24
|
|
|
24
|
-
|
|
25
|
+
str = str.toString();
|
|
26
|
+
|
|
27
|
+
( flags || '' ).toString().split( '' ).forEach( ( f ) => {
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* normalize options
|
|
31
|
+
* i case insensitive
|
|
32
|
+
* s non-whitespace
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
switch( f.toLowerCase() ) {
|
|
36
|
+
|
|
37
|
+
case 'i':
|
|
38
|
+
str = str.toLowerCase();
|
|
39
|
+
break;
|
|
40
|
+
|
|
41
|
+
case 's':
|
|
42
|
+
str = str.replace( /[^\S]+/g, '' );
|
|
43
|
+
break;
|
|
44
|
+
|
|
45
|
+
default:
|
|
46
|
+
/* do nothing */
|
|
47
|
+
break;
|
|
48
|
+
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
} );
|
|
52
|
+
|
|
53
|
+
return str;
|
|
25
54
|
|
|
26
55
|
};
|
|
27
56
|
|
|
@@ -30,7 +59,7 @@ const normalize = ( str ) => {
|
|
|
30
59
|
* @param {String} str string
|
|
31
60
|
* @returns bigrams
|
|
32
61
|
*/
|
|
33
|
-
const
|
|
62
|
+
const str2bigrams = ( str ) => {
|
|
34
63
|
|
|
35
64
|
let bigrams = new Set();
|
|
36
65
|
|
|
@@ -46,14 +75,40 @@ const bbigrams = ( str ) => {
|
|
|
46
75
|
|
|
47
76
|
};
|
|
48
77
|
|
|
78
|
+
/**
|
|
79
|
+
* compare strings by given algorithm
|
|
80
|
+
* @param {String} algo algorithm to use
|
|
81
|
+
* @param {String} a string 1
|
|
82
|
+
* @param {String} b string 2
|
|
83
|
+
* @param {Null|String} flags options
|
|
84
|
+
* @returns similarity
|
|
85
|
+
*/
|
|
86
|
+
const cpmByAlgo = ( algo, a, b, flags ) => {
|
|
87
|
+
|
|
88
|
+
switch( algo ) {
|
|
89
|
+
|
|
90
|
+
case 'levenshtein':
|
|
91
|
+
return levenshtein( a, b, flags );
|
|
92
|
+
|
|
93
|
+
case 'diceCoefficient':
|
|
94
|
+
return diceCoefficient( a, b, flags );
|
|
95
|
+
|
|
96
|
+
default:
|
|
97
|
+
return 0;
|
|
98
|
+
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
};
|
|
102
|
+
|
|
49
103
|
/**
|
|
50
104
|
* search for closest string
|
|
51
105
|
* @param {String} algo algorithm to use
|
|
52
106
|
* @param {String} test test string
|
|
53
|
-
* @param
|
|
107
|
+
* @param {Array} arr targets to test
|
|
108
|
+
* @param {Null|String} flags options
|
|
54
109
|
* @returns closest target
|
|
55
110
|
*/
|
|
56
|
-
const findClosest = ( algo, test, arr ) => {
|
|
111
|
+
const findClosest = ( algo, test, arr, flags ) => {
|
|
57
112
|
|
|
58
113
|
let best = -Infinity,
|
|
59
114
|
idx = 0,
|
|
@@ -61,23 +116,9 @@ const findClosest = ( algo, test, arr ) => {
|
|
|
61
116
|
|
|
62
117
|
/* search for closest element in arr */
|
|
63
118
|
|
|
64
|
-
arr.forEach( ( str, i ) => {
|
|
65
|
-
|
|
66
|
-
switch( algo ) {
|
|
67
|
-
|
|
68
|
-
case 'levenshtein':
|
|
69
|
-
pct = levenshtein( test, str );
|
|
70
|
-
break;
|
|
71
|
-
|
|
72
|
-
case 'diceCoefficient':
|
|
73
|
-
pct = diceCoefficient( test, str );
|
|
74
|
-
break;
|
|
75
|
-
|
|
76
|
-
default:
|
|
77
|
-
pct = 0;
|
|
78
|
-
break;
|
|
119
|
+
[ ...arr ].forEach( ( str, i ) => {
|
|
79
120
|
|
|
80
|
-
|
|
121
|
+
pct = cpmByAlgo( algo, test, str, flags );
|
|
81
122
|
|
|
82
123
|
if( pct > best ) {
|
|
83
124
|
|
|
@@ -96,6 +137,44 @@ const findClosest = ( algo, test, arr ) => {
|
|
|
96
137
|
|
|
97
138
|
};
|
|
98
139
|
|
|
140
|
+
/**
|
|
141
|
+
* sort best matches to test string
|
|
142
|
+
* @param {String} algo algorithm to use
|
|
143
|
+
* @param {String} test test string
|
|
144
|
+
* @param {Array} arr targets to test
|
|
145
|
+
* @param {Null|String} flags options
|
|
146
|
+
* @returns sorted matches
|
|
147
|
+
*/
|
|
148
|
+
const bestMatch = ( algo, test, arr, flags = null ) => {
|
|
149
|
+
|
|
150
|
+
let matches = [],
|
|
151
|
+
pct;
|
|
152
|
+
|
|
153
|
+
/* calculate similarity for each arr items */
|
|
154
|
+
|
|
155
|
+
[ ...arr ].forEach( ( str ) => {
|
|
156
|
+
|
|
157
|
+
pct = cpmByAlgo( algo, test, str, flags );
|
|
158
|
+
|
|
159
|
+
matches.push( {
|
|
160
|
+
target: str,
|
|
161
|
+
match: pct
|
|
162
|
+
} );
|
|
163
|
+
|
|
164
|
+
} );
|
|
165
|
+
|
|
166
|
+
/* sort by highest similarity */
|
|
167
|
+
|
|
168
|
+
let sorted = matches.sort( ( a, b ) => {
|
|
169
|
+
return b.match - a.match;
|
|
170
|
+
} );
|
|
171
|
+
|
|
172
|
+
/* return sorted matches */
|
|
173
|
+
|
|
174
|
+
return sorted;
|
|
175
|
+
|
|
176
|
+
};
|
|
177
|
+
|
|
99
178
|
/**
|
|
100
179
|
* similarity calculations
|
|
101
180
|
* @public
|
|
@@ -105,14 +184,15 @@ const findClosest = ( algo, test, arr ) => {
|
|
|
105
184
|
* calculate levenshtein similarity (in percent)
|
|
106
185
|
* @param {String} a string 1
|
|
107
186
|
* @param {String} b string 2
|
|
187
|
+
* @param {Null|String} flags options
|
|
108
188
|
* @returns similarity 0..1
|
|
109
189
|
*/
|
|
110
|
-
const levenshtein = ( a, b ) => {
|
|
190
|
+
const levenshtein = ( a, b, flags = null ) => {
|
|
111
191
|
|
|
112
192
|
/* normalize string */
|
|
113
193
|
|
|
114
|
-
a = normalize( a );
|
|
115
|
-
b = normalize( b );
|
|
194
|
+
a = normalize( a, flags );
|
|
195
|
+
b = normalize( b, flags );
|
|
116
196
|
|
|
117
197
|
if( a == b ) {
|
|
118
198
|
|
|
@@ -149,14 +229,15 @@ const levenshtein = ( a, b ) => {
|
|
|
149
229
|
* get levenshtein distance
|
|
150
230
|
* @param {String} a string 1
|
|
151
231
|
* @param {String} b string 2
|
|
232
|
+
* @param {Null|String} flags options
|
|
152
233
|
* @returns distance
|
|
153
234
|
*/
|
|
154
|
-
const levenshteinDistance = ( a, b ) => {
|
|
235
|
+
const levenshteinDistance = ( a, b, flags = null ) => {
|
|
155
236
|
|
|
156
237
|
/* normalize string */
|
|
157
238
|
|
|
158
|
-
a = normalize( a );
|
|
159
|
-
b = normalize( b );
|
|
239
|
+
a = normalize( a, flags );
|
|
240
|
+
b = normalize( b, flags );
|
|
160
241
|
|
|
161
242
|
if( a == b ) {
|
|
162
243
|
|
|
@@ -233,12 +314,26 @@ const levenshteinDistance = ( a, b ) => {
|
|
|
233
314
|
/**
|
|
234
315
|
* search for closest target to test string
|
|
235
316
|
* @param {String} test test string
|
|
236
|
-
* @param
|
|
317
|
+
* @param {Array} arr targets to test
|
|
318
|
+
* @param {Null|String} flags options
|
|
237
319
|
* @returns closest target
|
|
238
320
|
*/
|
|
239
|
-
const levenshteinClosest = ( test, arr ) => {
|
|
321
|
+
const levenshteinClosest = ( test, arr, flags = null ) => {
|
|
322
|
+
|
|
323
|
+
return findClosest( 'levenshtein', test, arr, flags );
|
|
324
|
+
|
|
325
|
+
};
|
|
326
|
+
|
|
327
|
+
/**
|
|
328
|
+
* sort best matches to test string
|
|
329
|
+
* @param {String} test test string
|
|
330
|
+
* @param {Array} arr targets to test
|
|
331
|
+
* @param {Null|String} flags options
|
|
332
|
+
* @returns sorted matches
|
|
333
|
+
*/
|
|
334
|
+
const levenshteinMatch = ( test, arr, flags = null ) => {
|
|
240
335
|
|
|
241
|
-
return
|
|
336
|
+
return bestMatch( 'levenshtein', test, arr, flags );
|
|
242
337
|
|
|
243
338
|
};
|
|
244
339
|
|
|
@@ -246,14 +341,15 @@ const levenshteinClosest = ( test, arr ) => {
|
|
|
246
341
|
* calculate dice coefficient
|
|
247
342
|
* @param {String} a string 1
|
|
248
343
|
* @param {String} b string 2
|
|
344
|
+
* @param {Null|String} flags options
|
|
249
345
|
* @returns dice coefficient
|
|
250
346
|
*/
|
|
251
|
-
const diceCoefficient = ( a, b ) => {
|
|
347
|
+
const diceCoefficient = ( a, b, flags = null ) => {
|
|
252
348
|
|
|
253
349
|
/* normalize string */
|
|
254
350
|
|
|
255
|
-
a = normalize( a );
|
|
256
|
-
b = normalize( b );
|
|
351
|
+
a = normalize( a, flags );
|
|
352
|
+
b = normalize( b, flags );
|
|
257
353
|
|
|
258
354
|
if( a == b ) {
|
|
259
355
|
|
|
@@ -271,8 +367,8 @@ const diceCoefficient = ( a, b ) => {
|
|
|
271
367
|
|
|
272
368
|
/* get bigrams */
|
|
273
369
|
|
|
274
|
-
let setA =
|
|
275
|
-
setB =
|
|
370
|
+
let setA = str2bigrams( a ),
|
|
371
|
+
setB = str2bigrams( b );
|
|
276
372
|
|
|
277
373
|
/* calculate dice coefficient */
|
|
278
374
|
|
|
@@ -292,12 +388,26 @@ const diceCoefficient = ( a, b ) => {
|
|
|
292
388
|
/**
|
|
293
389
|
* search for closest target to test string
|
|
294
390
|
* @param {String} test test string
|
|
295
|
-
* @param
|
|
391
|
+
* @param {Array} arr targets to test
|
|
392
|
+
* @param {Null|String} flags options
|
|
296
393
|
* @returns closest target
|
|
297
394
|
*/
|
|
298
|
-
const diceClosest = ( test, arr ) => {
|
|
395
|
+
const diceClosest = ( test, arr, flags = null ) => {
|
|
396
|
+
|
|
397
|
+
return findClosest( 'diceCoefficient', test, arr, flags );
|
|
398
|
+
|
|
399
|
+
};
|
|
400
|
+
|
|
401
|
+
/**
|
|
402
|
+
* sort best matches to test string
|
|
403
|
+
* @param {String} test test string
|
|
404
|
+
* @param {Array} arr targets to test
|
|
405
|
+
* @param {Null|String} flags options
|
|
406
|
+
* @returns sorted matches
|
|
407
|
+
*/
|
|
408
|
+
const diceMatch = ( test, arr, flags = null ) => {
|
|
299
409
|
|
|
300
|
-
return
|
|
410
|
+
return bestMatch( 'diceCoefficient', test, arr, flags );
|
|
301
411
|
|
|
302
412
|
};
|
|
303
413
|
|
|
@@ -308,6 +418,8 @@ module.exports = {
|
|
|
308
418
|
levenshtein,
|
|
309
419
|
levenshteinDistance,
|
|
310
420
|
levenshteinClosest,
|
|
421
|
+
levenshteinMatch,
|
|
311
422
|
diceCoefficient,
|
|
312
|
-
diceClosest
|
|
423
|
+
diceClosest,
|
|
424
|
+
diceMatch
|
|
313
425
|
};
|