cmpstr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +70 -0
  3. package/index.js +313 -0
  4. package/package.json +25 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Paul Köhler
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,70 @@
1
+ # cmpstr
2
+
3
+ This lightweight npm package can be used to __calculate the similarity of strings__. It supports both the well-known __Levenshtein distance__ and the slightly more accurate __Sørensen-Dice coefficient__.
4
+
5
+ ## Install
6
+
7
+ With Node.js installed, add the package to your project using the following shell command:
8
+
9
+ ```sh
10
+ npm install cmpstr
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ Load the package into your project:
16
+
17
+ ```js
18
+ const cmpstr = require( 'cmpstr' );
19
+ ```
20
+
21
+ Sample of how to use the package in your code:
22
+
23
+ ```js
24
+ let str1 = 'kitten';
25
+ let str2 = 'sitting';
26
+
27
+ let distance = cmpstr.levenshteinDistance( str1, str2 );
28
+ // expected 3
29
+
30
+ let dice = cmpstr.diceCoefficient( str1, str2 );
31
+ // expected 0.3636363636363636
32
+
33
+ let closest = cmpstr.diceClosest( 'best', [
34
+ 'better', 'bestest', 'well', 'good'
35
+ ] );
36
+ // expected bestest
37
+ ```
38
+
39
+ ## API
40
+
41
+ The npm package ``cmpstr`` supports two different methods for determining the similarity of two strings: the __Levenshtein distance__, which is the minimum number of insertions, deletions and substitutions required to convert one string into another, and the __Sørensen-Dice coefficient__, which measures the similarity of two samples.
42
+
43
+ Learn more about both by visiting these links:
44
+
45
+ * [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
46
+ * [Sørensen-Dice coefficient](https://en.wikipedia.org/wiki/Sørensen–Dice_coefficient)
47
+
48
+ ### Levenshtein distance
49
+
50
+ #### ``levenshteinDistance( a, b )``
51
+
52
+ Calculates the difference between two strings ``a`` and ``b`` and returns the Levenshtein distance as an integer value.
53
+
54
+ #### ``levenshtein( a, b )``
55
+
56
+ Returns the similarity of two strings ``a`` and ``b`` as a floating point number in the range ``0..1``, where ``1`` means both strings are identical.
57
+
58
+ #### ``levenshteinClosest( str, arr )``
59
+
60
+ Returns the best match of the string ``str`` against the array ``arr`` of passed strings. The function returns the most closely matched string found in the array.
61
+
62
+ ### Sørensen-Dice coefficient
63
+
64
+ #### ``diceCoefficient( a, b )``
65
+
66
+ This function evaluates the similarity of two given strings ``a`` and ``b`` as percentage value according to the Sørensen-Dice coefficient and returns the result as floating point number.
67
+
68
+ #### ``diceClosest( str, arr )``
69
+
70
+ As another way to find the best match between the string ``str`` and a given array ``arr`` of samples, this function uses the Sørensen-Dice coefficient. Like ``levenshteinClosest``, it returns the closest-matching string found in the array.
package/index.js ADDED
@@ -0,0 +1,313 @@
1
+ /**
2
+ * cmpstr
3
+ * lightweight npm package to calculate string similarity
4
+ *
5
+ * @author komed3 (Paul Köhler)
6
+ * @version 1.0.0
7
+ * @license MIT
8
+ */
9
+
10
+ 'use strict'
11
+
12
+ /**
13
+ * basic functions
14
+ * @private
15
+ */
16
+
17
/**
 * convert the input into its string representation
 * @param {String} str value to convert (must expose a toString method)
 * @returns {String} whatever str.toString() yields
 */
const normalize = ( str ) => str.toString();
27
+
28
/**
 * collect the distinct bigrams (substrings of two adjacent characters)
 * @param {String} str string to split
 * @returns {Set} unique bigrams in order of first appearance;
 *                empty for strings shorter than two characters
 */
const bbigrams = ( str ) => {

    const pairs = new Set();

    /* the last bigram starts at the second-to-last character */

    const lastStart = str.length - 1;

    let start = 0;

    while( start < lastStart ) {

        pairs.add( str.substring( start, start + 2 ) );

        start++;

    }

    return pairs;

};
48
+
49
/**
 * pick the entry of arr that scores highest against the test string
 *
 * Ties keep the earliest entry, because a later entry only wins with a
 * strictly higher score. An empty array yields undefined. An algorithm
 * name other than 'levenshtein' or 'diceCoefficient' scores every entry
 * with 0, so the first entry is returned.
 *
 * @param {String} algo algorithm to use ('levenshtein' or 'diceCoefficient')
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @returns closest target
 */
const findClosest = ( algo, test, arr ) => {

    /* score a single candidate with the requested algorithm */

    const rate = ( candidate ) => {

        if( algo === 'levenshtein' ) {

            return levenshtein( test, candidate );

        }

        if( algo === 'diceCoefficient' ) {

            return diceCoefficient( test, candidate );

        }

        return 0;

    };

    let topScore = -Infinity;
    let topIndex = 0;

    /* remember the position of the highest score seen so far */

    arr.forEach( ( candidate, position ) => {

        const score = rate( candidate );

        if( score > topScore ) {

            topScore = score;
            topIndex = position;

        }

    } );

    return arr[ topIndex ];

};
98
+
99
+ /**
100
+ * similarity calculations
101
+ * @public
102
+ */
103
+
104
/**
 * calculate levenshtein similarity (in percent)
 *
 * The similarity is 1 minus the levenshtein distance divided by the
 * length of the longer string. Strings of any length are compared,
 * including 1-letter strings: 'a' against 'ab' gives 0.5. An empty
 * string against a non-empty one gives 0.
 *
 * @param {String} a string 1
 * @param {String} b string 2
 * @returns {Number} similarity 0..1 (1 = identical)
 */
const levenshtein = ( a, b ) => {

    /* normalize strings */

    const left = normalize( a );
    const right = normalize( b );

    if( left === right ) {

        /* both strings are identical or empty */

        return 1;

    }

    /* the strings differ, so the longer one has at least one character */

    const longest = Math.max( left.length, right.length );

    /* return percentage */

    return 1 - levenshteinDistance( left, right ) / longest;

};
147
+
148
/**
 * get levenshtein distance
 *
 * Counts the minimum number of single-character insertions, deletions
 * and substitutions that turn string 1 into string 2.
 *
 * @param {String} a string 1
 * @param {String} b string 2
 * @returns {Number} distance
 */
const levenshteinDistance = ( a, b ) => {

    /* normalize strings */

    a = normalize( a );
    b = normalize( b );

    /* trivial cases: equal strings or one empty string */

    if( a == b ) {

        return 0;

    }

    if( a.length == 0 ) {

        return b.length;

    }

    if( b.length == 0 ) {

        return a.length;

    }

    /* costs of turning the empty prefix of a into each prefix of b */

    let above = Array.from( { length: b.length + 1 }, ( _, col ) => col );

    for( let row = 1; row <= a.length; row++ ) {

        /* first column: cost of deleting the first `row` characters of a */

        const current = [ row ];

        for( let col = 1; col <= b.length; col++ ) {

            current[ col ] = a[ row - 1 ] === b[ col - 1 ]
                ? above[ col - 1 ]
                : 1 + Math.min(
                    current[ col - 1 ],
                    above[ col - 1 ],
                    above[ col ]
                );

        }

        above = current;

    }

    /* last cell of the last row is the levenshtein distance */

    return above[ b.length ];

};
232
+
233
/**
 * find the target that is closest to the test string, scored by
 * levenshtein similarity (delegates to findClosest)
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @returns closest target
 */
const levenshteinClosest = ( test, arr ) => findClosest( 'levenshtein', test, arr );
244
+
245
/**
 * calculate dice coefficient
 *
 * Twice the number of distinct bigrams both strings share, divided by
 * the sum of the distinct bigram counts of each string.
 *
 * @param {String} a string 1
 * @param {String} b string 2
 * @returns {Number} dice coefficient 0..1
 */
const diceCoefficient = ( a, b ) => {

    /* normalize strings */

    a = normalize( a );
    b = normalize( b );

    if( a == b ) {

        /* both strings are identical or empty */

        return 1;

    }

    if( a.length < 2 || b.length < 2 ) {

        /* 0-letter or 1-letter strings contain no bigram */

        return 0;

    }

    /* get bigrams */

    const pairsA = bbigrams( a );
    const pairsB = bbigrams( b );

    /* count the bigrams present in both sets */

    let shared = 0;

    for( const pair of pairsA ) {

        if( pairsB.has( pair ) ) {

            shared++;

        }

    }

    /* calculate dice coefficient */

    return ( shared * 2 ) / ( pairsA.size + pairsB.size );

};
291
+
292
/**
 * find the target that is closest to the test string, scored by the
 * dice coefficient (delegates to findClosest)
 * @param {String} test test string
 * @param {Array} arr targets to test
 * @returns closest target
 */
const diceClosest = ( test, arr ) => findClosest( 'diceCoefficient', test, arr );
303
+
304
+ /**
305
+ * export module functions
306
+ */
307
+ module.exports = {
308
+ levenshtein,
309
+ levenshteinDistance,
310
+ levenshteinClosest,
311
+ diceCoefficient,
312
+ diceClosest
313
+ };
package/package.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "name": "cmpstr",
3
+ "description": "lightweight npm package to calculate string similarity",
4
+ "author": {
5
+ "name" : "komed3 (Paul Köhler)",
6
+ "email" : "webmaster@komed3.de",
7
+ "url" : "https://komed3.de"
8
+ },
9
+ "homepage": "https://github.com/komed3/cmpstr#readme",
10
+ "version": "1.0.0",
11
+ "license": "MIT",
12
+ "keywords": [
13
+ "string",
14
+ "similarity",
15
+ "levenshtein-distance",
16
+ "dice-coefficient"
17
+ ],
18
+ "repository": {
19
+ "type": "git",
20
+ "url": "git+https://github.com/komed3/cmpstr.git"
21
+ },
22
+ "bugs": {
23
+ "url": "https://github.com/komed3/cmpstr/issues"
24
+ }
25
+ }