cmpstr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +70 -0
- package/index.js +313 -0
- package/package.json +25 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 Paul Köhler
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# cmpstr
|
|
2
|
+
|
|
3
|
+
This lightweight npm package can be used to __calculate the similarity of strings__. It supports both the well-known __Levenshtein distance__ and the slightly more accurate __Sørensen-Dice coefficient__.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
Using Node.js, install the package with the following shell command:
|
|
8
|
+
|
|
9
|
+
```sh
|
|
10
|
+
npm install cmpstr
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
Load the package into your project:
|
|
16
|
+
|
|
17
|
+
```js
|
|
18
|
+
const cmpstr = require( 'cmpstr' );
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Sample of how to use the package in your code:
|
|
22
|
+
|
|
23
|
+
```js
|
|
24
|
+
let str1 = 'kitten';
|
|
25
|
+
let str2 = 'sitting';
|
|
26
|
+
|
|
27
|
+
let distance = cmpstr.levenshteinDistance( str1, str2 );
|
|
28
|
+
// expected 3
|
|
29
|
+
|
|
30
|
+
let dice = cmpstr.diceCoefficient( str1, str2 );
|
|
31
|
+
// expected 0.36363636363636365
|
|
32
|
+
|
|
33
|
+
let closest = cmpstr.diceClosest( 'best', [
|
|
34
|
+
'better', 'bestest', 'well', 'good'
|
|
35
|
+
] );
|
|
36
|
+
// expected bestest
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## API
|
|
40
|
+
|
|
41
|
+
The npm package ``cmpstr`` supports two different methods for determining the similarity of two strings: the __Levenshtein distance__, defined as the minimum number of insertions, deletions and substitutions required to convert one string into another, and the __Sørensen-Dice coefficient__, which measures the similarity of two samples.
|
|
42
|
+
|
|
43
|
+
Learn more about both by visiting these links:
|
|
44
|
+
|
|
45
|
+
* [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
|
|
46
|
+
* [Sørensen-Dice coefficient](https://en.wikipedia.org/wiki/Sørensen–Dice_coefficient)
|
|
47
|
+
|
|
48
|
+
### Levenshtein distance
|
|
49
|
+
|
|
50
|
+
#### ``levenshteinDistance( a, b )``
|
|
51
|
+
|
|
52
|
+
Calculates the difference between two strings ``a`` and ``b`` and returns the Levenshtein distance as an integer value.
|
|
53
|
+
|
|
54
|
+
#### ``levenshtein( a, b )``
|
|
55
|
+
|
|
56
|
+
Returns the match percentage of two strings ``a`` and ``b``. The output value is in the range ``0..1`` as a floating point number.
|
|
57
|
+
|
|
58
|
+
#### ``levenshteinClosest( str, arr )``
|
|
59
|
+
|
|
60
|
+
Returns the best match of the string ``str`` against the array ``arr`` of passed strings. The function returns the most closely matched string found in the array.
|
|
61
|
+
|
|
62
|
+
### Sørensen-Dice coefficient
|
|
63
|
+
|
|
64
|
+
#### ``diceCoefficient( a, b )``
|
|
65
|
+
|
|
66
|
+
This function evaluates the similarity of two given strings ``a`` and ``b`` according to the Sørensen-Dice coefficient and returns the result as a floating point number in the range ``0..1``.
|
|
67
|
+
|
|
68
|
+
#### ``diceClosest( str, arr )``
|
|
69
|
+
|
|
70
|
+
As another way to find the best match between the string ``str`` and a given array ``arr`` of samples, this function uses the Sørensen-Dice coefficient. It likewise returns the best-matching string.
|
package/index.js
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* cmpstr
|
|
3
|
+
* lightweight npm package to calculate string similarity
|
|
4
|
+
*
|
|
5
|
+
* @author komed3 (Paul Köhler)
|
|
6
|
+
* @version 1.0.0
|
|
7
|
+
* @license MIT
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
'use strict'
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* basic functions
|
|
14
|
+
* @private
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
/**
 * normalize input to a string
 * @param {*} str value to normalize (usually a string)
 * @returns {String} string form of the input, empty string for null / undefined
 */
const normalize = ( str ) => {

    /* null and undefined have no toString(), so treat them as empty input */

    if( str === null || str === undefined ) {

        return '';

    }

    return String( str );

};
|
|
27
|
+
|
|
28
|
+
/**
 * collect the distinct bigrams (two character substrings) of a string
 * @param {String} str string
 * @returns {Set} unique bigrams in order of first appearance
 */
const bbigrams = ( str ) => {

    const pairs = new Set();
    let start = 0;

    /* slide a two character window over the string */

    while( start + 1 < str.length ) {

        pairs.add( str.substring( start, start + 2 ) );
        start += 1;

    }

    return pairs;

};
|
|
48
|
+
|
|
49
|
+
/**
 * find the entry of an array that is most similar to a test string
 * @param {String} algo algorithm to use: 'levenshtein' or 'diceCoefficient'
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @returns closest target; the first entry wins a tie, undefined for an empty array
 */
const findClosest = ( algo, test, arr ) => {

    let winner = 0;
    let top = -Infinity;

    arr.forEach( ( candidate, pos ) => {

        /* unknown algorithms rate every target with 0 */

        let score = 0;

        if( algo === 'levenshtein' ) {

            score = levenshtein( test, candidate );

        } else if( algo === 'diceCoefficient' ) {

            score = diceCoefficient( test, candidate );

        }

        /* strictly greater, so the first of several equal targets is kept */

        if( score > top ) {

            top = score;
            winner = pos;

        }

    } );

    return arr[ winner ];

};
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* similarity calculations
|
|
101
|
+
* @public
|
|
102
|
+
*/
|
|
103
|
+
|
|
104
|
+
/**
 * calculate levenshtein similarity (in percent)
 *
 * The similarity is 1 - distance / length of the longer string, so
 * identical strings (including two empty ones) score 1 and strings
 * that share nothing score 0.
 *
 * @param {String} a string 1
 * @param {String} b string 2
 * @returns {Number} similarity 0..1
 */
const levenshtein = ( a, b ) => {

    /* normalize strings */

    const first = normalize( a );
    const second = normalize( b );

    if( first === second ) {

        /* both strings are identical or empty */

        return 1;

    }

    /*
     * No special case for 0-letter or 1-letter strings: the distance is
     * well defined for them ('a' vs 'ab' is 1 edit, similarity 0.5).
     * Since the strings differ, at least one is non-empty and the
     * divisor is never 0.
     */

    const longest = Math.max( first.length, second.length );

    return 1 - ( levenshteinDistance( first, second ) / longest );

};
|
|
147
|
+
|
|
148
|
+
/**
 * get levenshtein distance
 *
 * Minimum number of single character insertions, deletions and
 * substitutions needed to turn one string into the other. Only the
 * previous row of the distance table is kept, so memory use grows with
 * the shorter string instead of with the product of both lengths.
 *
 * @param {String} a string 1
 * @param {String} b string 2
 * @returns {Number} distance
 */
const levenshteinDistance = ( a, b ) => {

    /* normalize strings */

    const first = normalize( a );
    const second = normalize( b );

    if( first === second ) {

        /* both strings are identical or empty */

        return 0;

    }

    /* the distance is symmetric, so the shorter string spans the row */

    const [ shorter, longer ] = first.length <= second.length
        ? [ first, second ]
        : [ second, first ];

    if( shorter.length === 0 ) {

        /* one empty string: every character of the other is inserted */

        return longer.length;

    }

    /* row 0: distance from the empty prefix is the prefix length */

    let previous = Array.from( { length: shorter.length + 1 }, ( _, j ) => j );

    /* calculate distance row by row */

    for( let i = 1; i <= longer.length; i++ ) {

        const current = [ i ];

        for( let j = 1; j <= shorter.length; j++ ) {

            if( longer[ i - 1 ] === shorter[ j - 1 ] ) {

                current[ j ] = previous[ j - 1 ];

            } else {

                current[ j ] = 1 + Math.min(
                    current[ j - 1 ],
                    previous[ j - 1 ],
                    previous[ j ]
                );

            }

        }

        previous = current;

    }

    /* return levenshtein distance */

    return previous[ shorter.length ];

};
|
|
232
|
+
|
|
233
|
+
/**
 * pick the target that is closest to the test string,
 * rated by levenshtein similarity
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @returns closest target
 */
const levenshteinClosest = ( test, arr ) => findClosest( 'levenshtein', test, arr );
|
|
244
|
+
|
|
245
|
+
/**
 * calculate dice coefficient
 *
 * Twice the number of bigrams both strings have in common, divided by
 * the total number of distinct bigrams of both strings.
 *
 * @param {String} a string 1
 * @param {String} b string 2
 * @returns dice coefficient
 */
const diceCoefficient = ( a, b ) => {

    /* normalize strings */

    const left = normalize( a );
    const right = normalize( b );

    /* both strings are identical or empty */

    if( left === right ) {

        return 1;

    }

    /* a string with fewer than two letters has no bigrams to compare */

    if( left.length < 2 || right.length < 2 ) {

        return 0;

    }

    /* get bigrams */

    const pairsLeft = bbigrams( left );
    const pairsRight = bbigrams( right );

    /* count the bigrams that occur in both strings */

    let shared = 0;

    for( const pair of pairsLeft ) {

        if( pairsRight.has( pair ) ) {

            shared += 1;

        }

    }

    /* calculate dice coefficient */

    return ( shared * 2 ) / ( pairsLeft.size + pairsRight.size );

};
|
|
291
|
+
|
|
292
|
+
/**
 * pick the target that is closest to the test string,
 * rated by dice coefficient
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @returns closest target
 */
const diceClosest = ( test, arr ) => findClosest( 'diceCoefficient', test, arr );
|
|
303
|
+
|
|
304
|
+
/**
|
|
305
|
+
* export module functions
|
|
306
|
+
*/
|
|
307
|
+
module.exports = {
|
|
308
|
+
levenshtein,
|
|
309
|
+
levenshteinDistance,
|
|
310
|
+
levenshteinClosest,
|
|
311
|
+
diceCoefficient,
|
|
312
|
+
diceClosest
|
|
313
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "cmpstr",
|
|
3
|
+
"description": "lightweight npm package to calculate string similarity",
|
|
4
|
+
"author": {
|
|
5
|
+
"name" : "komed3 (Paul Köhler)",
|
|
6
|
+
"email" : "webmaster@komed3.de",
|
|
7
|
+
"url" : "https://komed3.de"
|
|
8
|
+
},
|
|
9
|
+
"homepage": "https://github.com/komed3/cmpstr#readme",
|
|
10
|
+
"version": "1.0.0",
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"keywords": [
|
|
13
|
+
"string",
|
|
14
|
+
"similarity",
|
|
15
|
+
"levenshtein-distance",
|
|
16
|
+
"dice-coefficient"
|
|
17
|
+
],
|
|
18
|
+
"repository": {
|
|
19
|
+
"type": "git",
|
|
20
|
+
"url": "git+https://github.com/komed3/cmpstr.git"
|
|
21
|
+
},
|
|
22
|
+
"bugs": {
|
|
23
|
+
"url": "https://github.com/komed3/cmpstr/issues"
|
|
24
|
+
}
|
|
25
|
+
}
|