cmpstr 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,191 @@
1
+ /**
2
+ * class CmpStrAsync
3
+ * extends CmpStr
4
+ *
5
+ * The CmpStrAsync class extends the CmpStr class and provides asynchronous
6
+ * versions of its methods. It uses Promises and setImmediate to ensure
7
+ * non-blocking execution, making it suitable for use in asynchronous workflows.
8
+ *
9
+ * @author Paul Köhler (komed3)
10
+ * @license MIT
11
+ */
12
+
13
+ 'use strict';
14
+
15
+ /**
16
+ * module dependencies
17
+ * @private
18
+ */
19
+
20
+ const CmpStr = require( './CmpStr' );
21
+
22
+ /**
23
+ * module exports
24
+ * @public
25
+ */
26
+
27
+ module.exports = class CmpStrAsync extends CmpStr {
28
+
29
+ /**
30
+ * initializes a CmpStrAsync instance
31
+ * algorithm and base string can be set by initialization
32
+ *
33
+ * @param {String} algo name of the algorithm to use for calculation
34
+ * @param {String} str string to set as the base
35
+ */
36
+ constructor ( algo = undefined, str = undefined ) {
37
+
38
+ super ( algo, str );
39
+
40
+ };
41
+
42
+ /**
43
+ * @private
44
+ * generic async wrapper for methods
45
+ *
46
+ * @param {Function} method method to call
47
+ * @param {...any} args arguments to pass to the method
48
+ * @returns {Promise} Promise resolving the result of the method
49
+ */
50
+ #asyncWrapper ( method, ...args ) {
51
+
52
+ return new Promise ( ( resolve, reject ) => {
53
+
54
+ setImmediate( () => {
55
+
56
+ try {
57
+
58
+ resolve( method.apply( this, args ) );
59
+
60
+ } catch ( err ) {
61
+
62
+ reject( err );
63
+
64
+ }
65
+
66
+ } );
67
+
68
+ } );
69
+
70
+ };
71
+
72
+ /**
73
+ * --------------------------------------------------
74
+ * Asynchronous Methods
75
+ * --------------------------------------------------
76
+ */
77
+
78
+ /**
79
+ * compares two string a and b using the passed algorithm
80
+ *
81
+ * @async
82
+ *
83
+ * @param {String} algo name of the algorithm
84
+ * @param {String} a string a
85
+ * @param {String} b string b
86
+ * @param {Object} [config={}] config (flags, args)
87
+ * @returns {Promise} Promise resolving similarity between a and b
88
+ */
89
+ compareAsync ( algo, a, b, config = {} ) {
90
+
91
+ return this.#asyncWrapper(
92
+ this.compare,
93
+ algo, a, b, config
94
+ );
95
+
96
+ };
97
+
98
+ /**
99
+ * tests the similarity between the base string and a target string
100
+ * using the current algorithm
101
+ *
102
+ * @async
103
+ *
104
+ * @param {String} str target string
105
+ * @param {Object} [config={}] config (flags, args)
106
+ * @returns {Promise} Promise resolving similarity to base string
107
+ */
108
+ testAsync ( str, config = {} ) {
109
+
110
+ return this.#asyncWrapper(
111
+ this.test,
112
+ str, config
113
+ );
114
+
115
+ };
116
+
117
+ /**
118
+ * tests the similarity of multiple strings against the base string
119
+ *
120
+ * @async
121
+ *
122
+ * @param {String[]} arr array of strings
123
+ * @param {Object} [config={}] config (flags, args)
124
+ * @returns {Promise} Promise resolving an array of objects, each containing target string and similarity score
125
+ */
126
+ batchTestAsync ( arr, config = {} ) {
127
+
128
+ return this.#asyncWrapper(
129
+ this.batchTest,
130
+ arr, config
131
+ );
132
+
133
+ };
134
+
135
+ /**
136
+ * finds strings in an array that exceed a similarity threshold
137
+ * returns the array sorted by highest similarity
138
+ *
139
+ * @async
140
+ *
141
+ * @param {String[]} arr array of strings
142
+ * @param {Object} [config={}] config (flags, threshold, args)
143
+ * @returns {Promise} Promise resolving an array of objects, sorted by highest similarity
144
+ */
145
+ async matchAsync ( arr, config = {} ) {
146
+
147
+ return this.#asyncWrapper(
148
+ this.match,
149
+ arr, config
150
+ );
151
+
152
+ };
153
+
154
+ /**
155
+ * finds the closest matching string from an array
156
+ *
157
+ * @async
158
+ *
159
+ * @param {String[]} arr array of strings
160
+ * @param {Object} [config={}] config (flags, args)
161
+ * @returns {Promise} Promise resolving the closest matching string
162
+ */
163
+ async closestAsync ( arr, config = {} ) {
164
+
165
+ return this.#asyncWrapper(
166
+ this.closest,
167
+ arr, config
168
+ );
169
+
170
+ };
171
+
172
+ /**
173
+ * generate a similarity matrix for an array of strings
174
+ *
175
+ * @async
176
+ *
177
+ * @param {String} algo name of the algorithm
178
+ * @param {String[]} arr array of strings to cross-compare
179
+ * @param {Object} [config={}] config (flags, args)
180
+ * @returns {Promise} Promise resolving an 2D array representing the similarity matrix
181
+ */
182
+ similarityMatrixAsync ( algo, arr, config = {} ) {
183
+
184
+ return this.#asyncWrapper(
185
+ this.similarityMatrix,
186
+ algo, arr, config
187
+ );
188
+
189
+ };
190
+
191
+ };
@@ -0,0 +1,86 @@
1
+ /**
2
+ * Cosine Similarity
3
+ * CmpStr module
4
+ *
5
+ * Cosine similarity is a measure of how similar two vectors are. It's often used
6
+ * in text analysis to compare texts based on the words they contain.
7
+ *
8
+ * @author Paul Köhler (komed3)
9
+ * @license MIT
10
+ */
11
+
12
+ 'use strict';
13
+
14
/**
 * private helper function
 * get term frequency from string
 * @private
 *
 * Splits the string at the delimiter and counts how often each resulting
 * term occurs.
 *
 * @param {String} str string
 * @param {String} delimiter term delimiter
 * @returns {Object} term frequency (prototype-less object: term => count)
 */
const _termFreq = ( str, delimiter ) => {

    /* an object without prototype is used on purpose: on a plain `{}` a
       term like "constructor", "toString" or "__proto__" would hit an
       inherited member instead of starting at 0 and corrupt the count */

    const freq = Object.create( null );

    for ( const term of str.split( delimiter ) ) {

        freq[ term ] = ( freq[ term ] || 0 ) + 1;

    }

    return freq;

};
36
+
37
+ /**
38
+ * module exports
39
+ * @public
40
+ *
41
+ * @param {String} a string a
42
+ * @param {String} b string b
43
+ * @param {Object} options having {
44
+ * @param {String} [delimiter=' '] term delimiter
45
+ * }
46
+ * @returns {Number} similarity score (0..1)
47
+ */
48
+
49
+ module.exports = ( a, b, { delimiter = ' ' } = {} ) => {
50
+
51
+ /* step 1: count the frequency of chars per string */
52
+
53
+ let termsA = _termFreq( a, delimiter ),
54
+ termsB = _termFreq( b, delimiter );
55
+
56
+ let allTerms = new Set ( [
57
+ ...Object.keys( termsA ),
58
+ ...Object.keys( termsB )
59
+ ] );
60
+
61
+ /* step 2: calculate the dot product */
62
+
63
+ let dotProduct = [ ...allTerms ].reduce(
64
+ ( sum, char ) => sum + ( termsA[ char ] || 0 ) * ( termsB[ char ] || 0 ),
65
+ 0
66
+ );
67
+
68
+ /* step 3: calculate the vector magnitudes */
69
+
70
+ let magnitudeA = Math.sqrt( [ ...allTerms ].reduce(
71
+ ( sum, char ) => sum + ( termsA[ char ] || 0 ) ** 2,
72
+ 0
73
+ ) );
74
+
75
+ let magnitudeB = Math.sqrt( [ ...allTerms ].reduce(
76
+ ( sum, char ) => sum + ( termsB[ char ] || 0 ) ** 2,
77
+ 0
78
+ ) );
79
+
80
+ /* step 4: calculate Cosine similarity */
81
+
82
+ return magnitudeA && magnitudeB
83
+ ? dotProduct / ( magnitudeA * magnitudeB )
84
+ : 0;
85
+
86
+ };
@@ -0,0 +1,78 @@
1
+ /**
2
+ * Damerau-Levenshtein Distance
3
+ * CmpStr module
4
+ *
5
+ * The Damerau-Levenshtein distance differs from the classical Levenshtein
6
+ * distance by including transpositions among its allowable operations in
7
+ * addition to the three classical single-character edit operations
8
+ * (insertions, deletions and substitutions). Useful for correcting typos.
9
+ *
10
+ * @author Paul Köhler (komed3)
11
+ * @license MIT
12
+ */
13
+
14
+ 'use strict';
15
+
16
+ /**
17
+ * module exports
18
+ * @public
19
+ *
20
+ * @param {String} a string a
21
+ * @param {String} b string b
22
+ * @param {Object} options having {
23
+ * @param {Boolean} [raw=false] if true the raw distance is returned
24
+ * }
25
+ * @returns {Number} similarity score (0..1) or distance
26
+ */
27
+
28
+ module.exports = ( a, b, { raw = false } = {} ) => {
29
+
30
+ /* step 1: initialize scoring matrix */
31
+
32
+ let matrix = Array.from(
33
+ { length: a.length + 1 },
34
+ ( _, i ) => Array.from(
35
+ { length: b.length + 1 },
36
+ ( _, j ) => i && j ? 0 : i || j
37
+ )
38
+ );
39
+
40
+ /* step 2: calculate Damerau-Levenshtein distance */
41
+
42
+ for ( let i = 1; i <= a.length; i++ ) {
43
+
44
+ for ( let j = 1; j <= b.length; j++ ) {
45
+
46
+ let cost = a[ i - 1 ] === b[ j - 1 ] ? 0 : 1;
47
+
48
+ matrix[ i ][ j ] = Math.min(
49
+ matrix[ i - 1 ][ j ] + 1,
50
+ matrix[ i ][ j - 1 ] + 1,
51
+ matrix[ i - 1 ][ j - 1 ] + cost
52
+ );
53
+
54
+ if (
55
+ i > 1 && j > 1 &&
56
+ a[ i - 1 ] === b[ j - 2 ] &&
57
+ a[ i - 2 ] === b[ j - 1 ]
58
+ ) {
59
+
60
+ matrix[ i ][ j ] = Math.min(
61
+ matrix[ i ][ j ],
62
+ matrix[ i - 2 ][ j - 2 ] + cost
63
+ );
64
+
65
+ }
66
+
67
+ }
68
+
69
+ }
70
+
71
+ /* step 3: get Damerau-Levenshtein distance as value between 0..1 */
72
+
73
+ return raw ? matrix[ a.length ][ b.length ] : 1 - (
74
+ matrix[ a.length ][ b.length ] /
75
+ Math.max( a.length, b.length )
76
+ );
77
+
78
+ };
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Dice-Sørensen Coefficient
3
+ * CmpStr module
4
+ *
5
+ * The Dice-Sørensen index equals twice the number of elements common to
6
+ * both sets divided by the sum of the number of elements in each set.
7
+ * Equivalently, the index is the size of the intersection as a fraction
8
+ * of the average size of the two sets.
9
+ *
10
+ * @author Paul Köhler (komed3)
11
+ * @license MIT
12
+ */
13
+
14
+ 'use strict';
15
+
16
/**
 * private helper function
 * get bigrams from string
 * @private
 *
 * A bigram is every pair of directly neighbouring characters (by string
 * index). Strings shorter than two characters produce an empty set.
 *
 * @param {String} str string
 * @returns {Set} set of bigrams
 */
const _str2bigrams = ( str ) => {

    /* a string of length n has n - 1 start positions for a pair */

    const positions = Math.max( 0, str.length - 1 );

    return new Set ( Array.from(
        { length: positions },
        ( _, start ) => str.substring( start, start + 2 )
    ) );

};
37
+
38
+ /**
39
+ * module exports
40
+ * @public
41
+ *
42
+ * @param {String} a string a
43
+ * @param {String} b string b
44
+ * @returns {Number} similarity score (0..1)
45
+ */
46
+
47
+ module.exports = ( a, b ) => {
48
+
49
+ /* step 1: generate bigrams from strings */
50
+
51
+ let setA = _str2bigrams( a ),
52
+ setB = _str2bigrams( b );
53
+
54
+ /* step 2: calculate coefficient */
55
+
56
+ return (
57
+ ( new Set ( [ ...setA ].filter( ( test ) => {
58
+ return setB.has( test );
59
+ } ) ) ).size * 2
60
+ ) / (
61
+ setA.size +
62
+ setB.size
63
+ );
64
+
65
+ };
@@ -0,0 +1,44 @@
1
+ /**
2
+ * Hamming Distance
3
+ * CmpStr module
4
+ *
5
+ * The Hamming distance between two equal-length strings of symbols is the
6
+ * number of positions at which the corresponding symbols are different.
7
+ *
8
+ * @author Paul Köhler (komed3)
9
+ * @license MIT
10
+ */
11
+
12
+ 'use strict';
13
+
14
+ /**
15
+ * module exports
16
+ * @public
17
+ *
18
+ * @param {String} a string a
19
+ * @param {String} b string b
20
+ * @returns {Number} similarity score (0..1)
21
+ * @throws {Error} if string not of equal length
22
+ */
23
+
24
+ module.exports = ( a, b ) => {
25
+
26
+ if ( a.length !== b.length ) {
27
+
28
+ /* strings must be of equal length for this calculation */
29
+
30
+ throw new Error (
31
+ `Strings must be of equal length for Hamming Distance`
32
+ );
33
+
34
+ }
35
+
36
+ return 1 - (
37
+ [ ...a ].reduce(
38
+ ( sum, char, i ) => sum + ( char !== b[ i ] ? 1 : 0 ),
39
+ 0
40
+ ) /
41
+ a.length
42
+ );
43
+
44
+ };
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Jaccard Index
3
+ * CmpStr module
4
+ *
5
+ * The Jaccard Index measures the similarity between two sets by dividing
6
+ * the size of their intersection by the size of their union.
7
+ *
8
+ * @author Paul Köhler (komed3)
9
+ * @license MIT
10
+ */
11
+
12
+ 'use strict';
13
+
14
+ /**
15
+ * module exports
16
+ * @public
17
+ *
18
+ * @param {String} a string a
19
+ * @param {String} b string b
20
+ * @returns {Number} similarity score (0..1)
21
+ */
22
+
23
+ module.exports = ( a, b ) => {
24
+
25
+ let setA = new Set ( a ),
26
+ setB = new Set ( b );
27
+
28
+ return (
29
+ new Set ( [ ...setA ].filter( x => setB.has( x ) ) )
30
+ ).size / (
31
+ new Set ( [ ...setA, ...setB ] )
32
+ ).size;
33
+
34
+ };
@@ -0,0 +1,106 @@
1
+ /**
2
+ * Jaro-Winkler Distance
3
+ * CmpStr module
4
+ *
5
+ * Jaro-Winkler is a string similarity metric that gives more weight to
6
+ * matching characters at the start of the strings.
7
+ *
8
+ * @author Paul Köhler (komed3)
9
+ * @license MIT
10
+ */
11
+
12
+ 'use strict';
13
+
14
+ /**
15
+ * module exports
16
+ * @public
17
+ *
18
+ * @param {String} a string a
19
+ * @param {String} b string b
20
+ * @param {Object} options having {
21
+ * @param {Boolean} [raw=false] if true the raw distance is returned
22
+ * }
23
+ * @returns {Number} similarity score (0..1) or distance
24
+ */
25
+
26
+ module.exports = ( a, b, { raw = false } = {} ) => {
27
+
28
+ /* step 1: check for matches between strings */
29
+
30
+ let matchWindow = Math.floor(
31
+ Math.max( a.length, b.length ) / 2
32
+ ) - 1;
33
+
34
+ let aMatches = Array( a.length ).fill( false ),
35
+ bMatches = Array( b.length ).fill( false );
36
+
37
+ let matches = 0;
38
+
39
+ for ( let i = 0; i < a.length; i++ ) {
40
+
41
+ for (
42
+ let j = Math.max( 0, i - matchWindow );
43
+ j < Math.min( i + matchWindow + 1, b.length );
44
+ j++
45
+ ) {
46
+
47
+ if ( !bMatches[ j ] && a[ i ] === b[ j ] ) {
48
+
49
+ aMatches[ i ] = true;
50
+ bMatches[ j ] = true;
51
+
52
+ matches++;
53
+
54
+ break;
55
+
56
+ }
57
+
58
+ }
59
+
60
+ }
61
+
62
+ if ( matches === 0 ) {
63
+
64
+ /* if no matches found, return 0 */
65
+
66
+ return 0;
67
+
68
+ }
69
+
70
+ /* step 2: calculate transpositions */
71
+
72
+ let transpos = 0,
73
+ k = 0;
74
+
75
+ for ( let i = 0; i < a.length; i++ ) {
76
+
77
+ if ( aMatches[ i ] ) {
78
+
79
+ while ( !bMatches[ k ] ) k++;
80
+
81
+ if ( a[ i ] !== b[ k ] ) transpos++;
82
+
83
+ k++;
84
+
85
+ }
86
+
87
+ }
88
+
89
+ /* step 3: calculate Jaro-Winkler distance */
90
+
91
+ let jaro = (
92
+ ( matches / a.length ) +
93
+ ( matches / b.length ) +
94
+ ( matches - ( transpos / 2 ) ) /
95
+ matches
96
+ ) / 3;
97
+
98
+ /* step 4: get Jaro-Winkler as value between 0..1 */
99
+
100
+ return raw ? jaro : jaro + Math.min(
101
+ 4, [ ...a ].findIndex(
102
+ ( char, i ) => char !== b[ i ]
103
+ ) || 0
104
+ ) * 0.1 * ( 1 - jaro );
105
+
106
+ };
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Longest Common Subsequence (LCS)
3
+ * CmpStr module
4
+ *
5
+ * LCS measures the length of the longest subsequence common to both strings.
6
+ *
7
+ * @author Paul Köhler (komed3)
8
+ * @license MIT
9
+ */
10
+
11
+ 'use strict';
12
+
13
+ /**
14
+ * module exports
15
+ * @public
16
+ *
17
+ * @param {String} a string a
18
+ * @param {String} b string b
19
+ * @returns {Number} similarity score (0..1)
20
+ */
21
+
22
+ module.exports = ( a, b ) => {
23
+
24
+ /* step 1: initialize scoring matrix */
25
+
26
+ let matrix = Array( a.length + 1 ).fill( null ).map(
27
+ () => Array( b.length + 1 ).fill( 0 )
28
+ );
29
+
30
+ for ( let i = 1; i <= a.length; i++ ) {
31
+
32
+ for ( let j = 1; j <= b.length; j++ ) {
33
+
34
+ if ( a[ i - 1 ] === b[ j - 1 ] ) {
35
+
36
+ matrix[ i ][ j ] = matrix[ i - 1 ][ j - 1 ] + 1;
37
+
38
+ } else {
39
+
40
+ matrix[ i ][ j ] = Math.max(
41
+ matrix[ i - 1 ][ j ],
42
+ matrix[ i ][ j - 1 ]
43
+ );
44
+
45
+ }
46
+
47
+ }
48
+
49
+ }
50
+
51
+ /* step 2: calculate LCS */
52
+
53
+ return (
54
+ matrix[ a.length ][ b.length ] /
55
+ Math.max( a.length, b.length )
56
+ );
57
+
58
+ };