cmpstr 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -10
- package/index.js +81 -35
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@ This lightweight npm package can be used to __calculate the similarity of string
|
|
|
4
4
|
|
|
5
5
|
## Install
|
|
6
6
|
|
|
7
|
-
Using
|
|
7
|
+
Using __Node.js__, install the package with the following shell command:
|
|
8
8
|
|
|
9
9
|
```sh
|
|
10
10
|
npm install cmpstr
|
|
@@ -58,6 +58,16 @@ let matches = cmpstr.levenshteinMatch( 'best', [
|
|
|
58
58
|
] );
|
|
59
59
|
```
|
|
60
60
|
|
|
61
|
+
### JavaScript
|
|
62
|
+
|
|
63
|
+
Using JavaScript, load this package by embedding this file via [jsDelivr](https://www.jsdelivr.com/package/npm/cmpstr):
|
|
64
|
+
|
|
65
|
+
```js
|
|
66
|
+
import cmpstr from "https://cdn.jsdelivr.net/npm/cmpstr@1.0.3/+esm";
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Remember: To use ``import`` you need to load your JavaScript file as ``type="module"``.
|
|
70
|
+
|
|
61
71
|
## API
|
|
62
72
|
|
|
63
73
|
The npm package ``cmpstr`` supports two different methods for determining the similarity of two strings. The __Levenshtein distance__, as the minimum number of inserting, deleting and replacing operations to convert one string into another, and the __Sørensen-Dice coefficient__ to measure the similarity of two samples.
|
|
@@ -69,32 +79,60 @@ Learn more about both by visiting these links:
|
|
|
69
79
|
|
|
70
80
|
### Levenshtein distance
|
|
71
81
|
|
|
72
|
-
#### ``levenshteinDistance( a, b )``
|
|
82
|
+
#### ``levenshteinDistance( a, b [, flags = null ] )``
|
|
73
83
|
|
|
74
84
|
Calculates the difference between two strings ``a`` and ``b`` and returns the Levenshtein distance as an integer value.
|
|
75
85
|
|
|
76
|
-
#### ``levenshtein( a, b )``
|
|
86
|
+
#### ``levenshtein( a, b [, flags = null ] )``
|
|
77
87
|
|
|
78
88
|
Returns the match percentage of two strings ``a`` and ``b``. The output value is in the range ``0..1`` as a floating point number.
|
|
79
89
|
|
|
80
|
-
#### ``levenshteinClosest( str, arr )``
|
|
90
|
+
#### ``levenshteinClosest( str, arr [, flags = null ] )``
|
|
81
91
|
|
|
82
92
|
Returns the best match of the string ``str`` against the array ``arr`` of passed strings. The function returns the most closely matched string found in the array.
|
|
83
93
|
|
|
84
|
-
#### ``levenshteinMatch( str, arr )``
|
|
94
|
+
#### ``levenshteinMatch( str, arr [, flags = null [, threshold = 0 ] ] )``
|
|
85
95
|
|
|
86
|
-
Calculates the similarity of all strings contained in the array ``arr`` according to Levenshtein compared to ``str`` and returns an array of all samples sorted by matching in descending order.
|
|
96
|
+
Calculates the similarity of all strings contained in the array ``arr`` according to Levenshtein compared to ``str`` and returns an array of all samples sorted by matching in descending order. The ``threshold`` specifies the minimum required similarity.
|
|
87
97
|
|
|
88
98
|
### Sørensen-Dice coefficient
|
|
89
99
|
|
|
90
|
-
#### ``diceCoefficient( a, b )``
|
|
100
|
+
#### ``diceCoefficient( a, b [, flags = null ] )``
|
|
91
101
|
|
|
92
102
|
This function evaluates the similarity of two given strings ``a`` and ``b`` as percentage value according to the Sørensen-Dice coefficient and returns the result as floating point number.
|
|
93
103
|
|
|
94
|
-
#### ``diceClosest( str, arr )``
|
|
104
|
+
#### ``diceClosest( str, arr [, flags = null ] )``
|
|
95
105
|
|
|
96
106
|
As another way to find the best match between the string ``str`` and a given array ``arr`` of samples, this function uses the Sørensen-Dice coefficient. It returns the most matching string as well.
|
|
97
107
|
|
|
98
|
-
#### ``diceMatch( str, arr )``
|
|
108
|
+
#### ``diceMatch( str, arr [, flags = null [, threshold = 0 ] ] )``
|
|
109
|
+
|
|
110
|
+
Calculates the similarity of all strings contained in the array ``arr`` according to Sørensen-Dice coefficient compared to ``str`` and returns an array of all samples sorted by matching in descending order. The ``threshold`` specifies the minimum required similarity.
|
|
111
|
+
|
|
112
|
+
### Flags
|
|
113
|
+
|
|
114
|
+
Each method can be passed the ``flags`` options listed below:
|
|
115
|
+
|
|
116
|
+
| Flag | Option |
|
|
117
|
+
| ----- | ------------------------------ |
|
|
118
|
+
| ``i`` | case insensitive |
|
|
119
|
+
| ``s`` | non-whitespace characters only |
|
|
120
|
+
|
|
121
|
+
## Patch notes
|
|
122
|
+
|
|
123
|
+
### 1.0.3
|
|
124
|
+
|
|
125
|
+
* Add ``threshold`` to specify the minimum required similarity
|
|
126
|
+
|
|
127
|
+
### 1.0.2
|
|
128
|
+
|
|
129
|
+
* Add normalize options ``i`` and ``s``
|
|
130
|
+
* Minor fixes
|
|
131
|
+
|
|
132
|
+
### 1.0.1
|
|
133
|
+
|
|
134
|
+
* Minor fixes
|
|
135
|
+
|
|
136
|
+
### 1.0.0
|
|
99
137
|
|
|
100
|
-
|
|
138
|
+
* Initial release
|
package/index.js
CHANGED
|
@@ -3,11 +3,11 @@
|
|
|
3
3
|
* lightweight npm package to calculate string similarity
|
|
4
4
|
*
|
|
5
5
|
* @author komed3 (Paul Köhler)
|
|
6
|
-
* @version 1.0.
|
|
6
|
+
* @version 1.0.3
|
|
7
7
|
* @license MIT
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
-
'use strict'
|
|
10
|
+
'use strict';
|
|
11
11
|
|
|
12
12
|
/**
|
|
13
13
|
* basic functions
|
|
@@ -17,11 +17,40 @@
|
|
|
17
17
|
/**
|
|
18
18
|
* normalize string
|
|
19
19
|
* @param {String} str string
|
|
20
|
+
* @param {Null|String} flags options
|
|
20
21
|
* @returns normalized string
|
|
21
22
|
*/
|
|
22
|
-
const normalize = ( str ) => {
|
|
23
|
+
const normalize = ( str, flags = null ) => {
|
|
23
24
|
|
|
24
|
-
|
|
25
|
+
str = str.toString();
|
|
26
|
+
|
|
27
|
+
( flags || '' ).toString().split( '' ).forEach( ( f ) => {
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* normalize options
|
|
31
|
+
* i case insensitive
|
|
32
|
+
* s non-whitespace
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
switch( f.toLowerCase() ) {
|
|
36
|
+
|
|
37
|
+
case 'i':
|
|
38
|
+
str = str.toLowerCase();
|
|
39
|
+
break;
|
|
40
|
+
|
|
41
|
+
case 's':
|
|
42
|
+
str = str.replace( /[^\S]+/g, '' );
|
|
43
|
+
break;
|
|
44
|
+
|
|
45
|
+
default:
|
|
46
|
+
/* do nothing */
|
|
47
|
+
break;
|
|
48
|
+
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
} );
|
|
52
|
+
|
|
53
|
+
return str;
|
|
25
54
|
|
|
26
55
|
};
|
|
27
56
|
|
|
@@ -47,21 +76,22 @@ const str2bigrams = ( str ) => {
|
|
|
47
76
|
};
|
|
48
77
|
|
|
49
78
|
/**
|
|
50
|
-
*
|
|
79
|
+
* compare strings by given algorithm
|
|
51
80
|
* @param {String} algo algorithm to use
|
|
52
81
|
* @param {String} a string 1
|
|
53
82
|
* @param {String} b string 2
|
|
83
|
+
* @param {Null|String} flags options
|
|
54
84
|
* @returns similarity
|
|
55
85
|
*/
|
|
56
|
-
const cpmByAlgo = ( algo, a, b ) => {
|
|
86
|
+
const cpmByAlgo = ( algo, a, b, flags = null ) => {
|
|
57
87
|
|
|
58
88
|
switch( algo ) {
|
|
59
89
|
|
|
60
90
|
case 'levenshtein':
|
|
61
|
-
return levenshtein( a, b );
|
|
91
|
+
return levenshtein( a, b, flags );
|
|
62
92
|
|
|
63
93
|
case 'diceCoefficient':
|
|
64
|
-
return diceCoefficient( a, b );
|
|
94
|
+
return diceCoefficient( a, b, flags );
|
|
65
95
|
|
|
66
96
|
default:
|
|
67
97
|
return 0;
|
|
@@ -75,9 +105,10 @@ const cpmByAlgo = ( algo, a, b ) => {
|
|
|
75
105
|
* @param {String} algo algorithm to use
|
|
76
106
|
* @param {String} test test string
|
|
77
107
|
* @param {Array} arr targets to test
|
|
108
|
+
* @param {Null|String} flags options
|
|
78
109
|
* @returns closest target
|
|
79
110
|
*/
|
|
80
|
-
const findClosest = ( algo, test, arr ) => {
|
|
111
|
+
const findClosest = ( algo, test, arr, flags = null ) => {
|
|
81
112
|
|
|
82
113
|
let best = -Infinity,
|
|
83
114
|
idx = 0,
|
|
@@ -85,9 +116,9 @@ const findClosest = ( algo, test, arr ) => {
|
|
|
85
116
|
|
|
86
117
|
/* search for closest element in arr */
|
|
87
118
|
|
|
88
|
-
arr.forEach( ( str, i ) => {
|
|
119
|
+
[ ...arr ].forEach( ( str, i ) => {
|
|
89
120
|
|
|
90
|
-
pct = cpmByAlgo( algo, test, str );
|
|
121
|
+
pct = cpmByAlgo( algo, test, str, flags );
|
|
91
122
|
|
|
92
123
|
if( pct > best ) {
|
|
93
124
|
|
|
@@ -111,23 +142,29 @@ const findClosest = ( algo, test, arr ) => {
|
|
|
111
142
|
* @param {String} algo algorithm to use
|
|
112
143
|
* @param {String} test test string
|
|
113
144
|
* @param {Array} arr targets to test
|
|
145
|
+
* @param {Null|String} flags options
|
|
146
|
+
* @param {Float} threshold required similarity
|
|
114
147
|
* @returns sorted matches
|
|
115
148
|
*/
|
|
116
|
-
const bestMatch = ( algo, test, arr ) => {
|
|
149
|
+
const bestMatch = ( algo, test, arr, flags = null, threshold = 0 ) => {
|
|
117
150
|
|
|
118
151
|
let matches = [],
|
|
119
152
|
pct;
|
|
120
153
|
|
|
121
154
|
/* calculate similarity for each item in arr */
|
|
122
155
|
|
|
123
|
-
arr.forEach( ( str ) => {
|
|
156
|
+
[ ...arr ].forEach( ( str ) => {
|
|
124
157
|
|
|
125
|
-
pct = cpmByAlgo( algo, test, str );
|
|
158
|
+
pct = cpmByAlgo( algo, test, str, flags );
|
|
126
159
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
160
|
+
if( pct >= threshold ) {
|
|
161
|
+
|
|
162
|
+
matches.push( {
|
|
163
|
+
target: str,
|
|
164
|
+
match: pct
|
|
165
|
+
} );
|
|
166
|
+
|
|
167
|
+
}
|
|
131
168
|
|
|
132
169
|
} );
|
|
133
170
|
|
|
@@ -152,14 +189,15 @@ const bestMatch = ( algo, test, arr ) => {
|
|
|
152
189
|
* calculate levenshtein similarity (in percent)
|
|
153
190
|
* @param {String} a string 1
|
|
154
191
|
* @param {String} b string 2
|
|
192
|
+
* @param {Null|String} flags options
|
|
155
193
|
* @returns similarity 0..1
|
|
156
194
|
*/
|
|
157
|
-
const levenshtein = ( a, b ) => {
|
|
195
|
+
const levenshtein = ( a, b, flags = null ) => {
|
|
158
196
|
|
|
159
197
|
/* normalize string */
|
|
160
198
|
|
|
161
|
-
a = normalize( a );
|
|
162
|
-
b = normalize( b );
|
|
199
|
+
a = normalize( a, flags );
|
|
200
|
+
b = normalize( b, flags );
|
|
163
201
|
|
|
164
202
|
if( a == b ) {
|
|
165
203
|
|
|
@@ -196,14 +234,15 @@ const levenshtein = ( a, b ) => {
|
|
|
196
234
|
* get levenshtein distance
|
|
197
235
|
* @param {String} a string 1
|
|
198
236
|
* @param {String} b string 2
|
|
237
|
+
* @param {Null|String} flags options
|
|
199
238
|
* @returns distance
|
|
200
239
|
*/
|
|
201
|
-
const levenshteinDistance = ( a, b ) => {
|
|
240
|
+
const levenshteinDistance = ( a, b, flags = null ) => {
|
|
202
241
|
|
|
203
242
|
/* normalize string */
|
|
204
243
|
|
|
205
|
-
a = normalize( a );
|
|
206
|
-
b = normalize( b );
|
|
244
|
+
a = normalize( a, flags );
|
|
245
|
+
b = normalize( b, flags );
|
|
207
246
|
|
|
208
247
|
if( a == b ) {
|
|
209
248
|
|
|
@@ -281,11 +320,12 @@ const levenshteinDistance = ( a, b ) => {
|
|
|
281
320
|
* search for closest target to test string
|
|
282
321
|
* @param {String} test test string
|
|
283
322
|
* @param {Array} arr targets to test
|
|
323
|
+
* @param {Null|String} flags options
|
|
284
324
|
* @returns closest target
|
|
285
325
|
*/
|
|
286
|
-
const levenshteinClosest = ( test, arr ) => {
|
|
326
|
+
const levenshteinClosest = ( test, arr, flags = null ) => {
|
|
287
327
|
|
|
288
|
-
return findClosest( 'levenshtein', test, arr );
|
|
328
|
+
return findClosest( 'levenshtein', test, arr, flags );
|
|
289
329
|
|
|
290
330
|
};
|
|
291
331
|
|
|
@@ -293,11 +333,13 @@ const levenshteinClosest = ( test, arr ) => {
|
|
|
293
333
|
* sort best matches to test string
|
|
294
334
|
* @param {String} test test string
|
|
295
335
|
* @param {Array} arr targets to test
|
|
336
|
+
* @param {Null|String} flags options
|
|
337
|
+
* @param {Float} threshold required similarity
|
|
296
338
|
* @returns sorted matches
|
|
297
339
|
*/
|
|
298
|
-
const levenshteinMatch = ( test, arr ) => {
|
|
340
|
+
const levenshteinMatch = ( test, arr, flags = null, threshold = 0 ) => {
|
|
299
341
|
|
|
300
|
-
return bestMatch( 'levenshtein', test, arr );
|
|
342
|
+
return bestMatch( 'levenshtein', test, arr, flags, threshold );
|
|
301
343
|
|
|
302
344
|
};
|
|
303
345
|
|
|
@@ -305,14 +347,15 @@ const levenshteinMatch = ( test, arr ) => {
|
|
|
305
347
|
* calculate dice coefficient
|
|
306
348
|
* @param {String} a string 1
|
|
307
349
|
* @param {String} b string 2
|
|
350
|
+
* @param {Null|String} flags options
|
|
308
351
|
* @returns dice coefficient
|
|
309
352
|
*/
|
|
310
|
-
const diceCoefficient = ( a, b ) => {
|
|
353
|
+
const diceCoefficient = ( a, b, flags = null ) => {
|
|
311
354
|
|
|
312
355
|
/* normalize string */
|
|
313
356
|
|
|
314
|
-
a = normalize( a );
|
|
315
|
-
b = normalize( b );
|
|
357
|
+
a = normalize( a, flags );
|
|
358
|
+
b = normalize( b, flags );
|
|
316
359
|
|
|
317
360
|
if( a == b ) {
|
|
318
361
|
|
|
@@ -352,11 +395,12 @@ const diceCoefficient = ( a, b ) => {
|
|
|
352
395
|
* search for closest target to test string
|
|
353
396
|
* @param {String} test test string
|
|
354
397
|
* @param {Array} arr targets to test
|
|
398
|
+
* @param {Null|String} flags options
|
|
355
399
|
* @returns closest target
|
|
356
400
|
*/
|
|
357
|
-
const diceClosest = ( test, arr ) => {
|
|
401
|
+
const diceClosest = ( test, arr, flags = null ) => {
|
|
358
402
|
|
|
359
|
-
return findClosest( 'diceCoefficient', test, arr );
|
|
403
|
+
return findClosest( 'diceCoefficient', test, arr, flags );
|
|
360
404
|
|
|
361
405
|
};
|
|
362
406
|
|
|
@@ -364,11 +408,13 @@ const diceClosest = ( test, arr ) => {
|
|
|
364
408
|
* sort best matches to test string
|
|
365
409
|
* @param {String} test test string
|
|
366
410
|
* @param {Array} arr targets to test
|
|
411
|
+
* @param {Null|String} flags options
|
|
412
|
+
* @param {Float} threshold required similarity
|
|
367
413
|
* @returns sorted matches
|
|
368
414
|
*/
|
|
369
|
-
const diceMatch = ( test, arr ) => {
|
|
415
|
+
const diceMatch = ( test, arr, flags = null, threshold = 0 ) => {
|
|
370
416
|
|
|
371
|
-
return bestMatch( 'diceCoefficient', test, arr );
|
|
417
|
+
return bestMatch( 'diceCoefficient', test, arr, flags, threshold );
|
|
372
418
|
|
|
373
419
|
};
|
|
374
420
|
|