cmpstr 1.0.3 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +430 -85
- package/package.json +47 -25
- package/src/CmpStr.js +784 -0
- package/src/CmpStrAsync.js +191 -0
- package/src/algorithms/cosine.js +86 -0
- package/src/algorithms/damerau.js +78 -0
- package/src/algorithms/dice.js +65 -0
- package/src/algorithms/hamming.js +44 -0
- package/src/algorithms/jaccard.js +34 -0
- package/src/algorithms/jaroWinkler.js +106 -0
- package/src/algorithms/lcs.js +58 -0
- package/src/algorithms/levenshtein.js +70 -0
- package/src/algorithms/needlemanWunsch.js +72 -0
- package/src/algorithms/qGram.js +63 -0
- package/src/algorithms/smithWaterman.js +78 -0
- package/src/algorithms/soundex.js +152 -0
- package/src/index.js +47 -0
- package/index.js +0 -432
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* class CmpStrAsync
|
|
3
|
+
* extends CmpStr
|
|
4
|
+
*
|
|
5
|
+
* The CmpStrAsync class extends the CmpStr class and provides asynchronous
|
|
6
|
+
* versions of its methods. It uses Promises and setImmediate to ensure
|
|
7
|
+
* non-blocking execution, making it suitable for use in asynchronous workflows.
|
|
8
|
+
*
|
|
9
|
+
* @author Paul Köhler (komed3)
|
|
10
|
+
* @license MIT
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
'use strict';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* module dependencies
|
|
17
|
+
* @private
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
const CmpStr = require( './CmpStr' );
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* module exports
|
|
24
|
+
* @public
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
module.exports = class CmpStrAsync extends CmpStr {
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* initializes a CmpStrAsync instance
|
|
31
|
+
* algorithm and base string can be set by initialization
|
|
32
|
+
*
|
|
33
|
+
* @param {String} algo name of the algorithm to use for calculation
|
|
34
|
+
* @param {String} str string to set as the base
|
|
35
|
+
*/
|
|
36
|
+
constructor ( algo = undefined, str = undefined ) {
|
|
37
|
+
|
|
38
|
+
super ( algo, str );
|
|
39
|
+
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* @private
|
|
44
|
+
* generic async wrapper for methods
|
|
45
|
+
*
|
|
46
|
+
* @param {Function} method method to call
|
|
47
|
+
* @param {...any} args arguments to pass to the method
|
|
48
|
+
* @returns {Promise} Promise resolving the result of the method
|
|
49
|
+
*/
|
|
50
|
+
#asyncWrapper ( method, ...args ) {
|
|
51
|
+
|
|
52
|
+
return new Promise ( ( resolve, reject ) => {
|
|
53
|
+
|
|
54
|
+
setImmediate( () => {
|
|
55
|
+
|
|
56
|
+
try {
|
|
57
|
+
|
|
58
|
+
resolve( method.apply( this, args ) );
|
|
59
|
+
|
|
60
|
+
} catch ( err ) {
|
|
61
|
+
|
|
62
|
+
reject( err );
|
|
63
|
+
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
} );
|
|
67
|
+
|
|
68
|
+
} );
|
|
69
|
+
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* --------------------------------------------------
|
|
74
|
+
* Asynchronous Methods
|
|
75
|
+
* --------------------------------------------------
|
|
76
|
+
*/
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* compares two string a and b using the passed algorithm
|
|
80
|
+
*
|
|
81
|
+
* @async
|
|
82
|
+
*
|
|
83
|
+
* @param {String} algo name of the algorithm
|
|
84
|
+
* @param {String} a string a
|
|
85
|
+
* @param {String} b string b
|
|
86
|
+
* @param {Object} [config={}] config (flags, args)
|
|
87
|
+
* @returns {Promise} Promise resolving similarity between a and b
|
|
88
|
+
*/
|
|
89
|
+
compareAsync ( algo, a, b, config = {} ) {
|
|
90
|
+
|
|
91
|
+
return this.#asyncWrapper(
|
|
92
|
+
this.compare,
|
|
93
|
+
algo, a, b, config
|
|
94
|
+
);
|
|
95
|
+
|
|
96
|
+
};
|
|
97
|
+
|
|
98
|
+
/**
|
|
99
|
+
* tests the similarity between the base string and a target string
|
|
100
|
+
* using the current algorithm
|
|
101
|
+
*
|
|
102
|
+
* @async
|
|
103
|
+
*
|
|
104
|
+
* @param {String} str target string
|
|
105
|
+
* @param {Object} [config={}] config (flags, args)
|
|
106
|
+
* @returns {Promise} Promise resolving similarity to base string
|
|
107
|
+
*/
|
|
108
|
+
testAsync ( str, config = {} ) {
|
|
109
|
+
|
|
110
|
+
return this.#asyncWrapper(
|
|
111
|
+
this.test,
|
|
112
|
+
str, config
|
|
113
|
+
);
|
|
114
|
+
|
|
115
|
+
};
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* tests the similarity of multiple strings against the base string
|
|
119
|
+
*
|
|
120
|
+
* @async
|
|
121
|
+
*
|
|
122
|
+
* @param {String[]} arr array of strings
|
|
123
|
+
* @param {Object} [config={}] config (flags, args)
|
|
124
|
+
* @returns {Promise} Promise resolving an array of objects, each containing target string and similarity score
|
|
125
|
+
*/
|
|
126
|
+
batchTestAsync ( arr, config = {} ) {
|
|
127
|
+
|
|
128
|
+
return this.#asyncWrapper(
|
|
129
|
+
this.batchTest,
|
|
130
|
+
arr, config
|
|
131
|
+
);
|
|
132
|
+
|
|
133
|
+
};
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* finds strings in an array that exceed a similarity threshold
|
|
137
|
+
* returns the array sorted by highest similarity
|
|
138
|
+
*
|
|
139
|
+
* @async
|
|
140
|
+
*
|
|
141
|
+
* @param {String[]} arr array of strings
|
|
142
|
+
* @param {Object} [config={}] config (flags, threshold, args)
|
|
143
|
+
* @returns {Promise} Promise resolving an array of objects, sorted by highest similarity
|
|
144
|
+
*/
|
|
145
|
+
async matchAsync ( arr, config = {} ) {
|
|
146
|
+
|
|
147
|
+
return this.#asyncWrapper(
|
|
148
|
+
this.match,
|
|
149
|
+
arr, config
|
|
150
|
+
);
|
|
151
|
+
|
|
152
|
+
};
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* finds the closest matching string from an array
|
|
156
|
+
*
|
|
157
|
+
* @async
|
|
158
|
+
*
|
|
159
|
+
* @param {String[]} arr array of strings
|
|
160
|
+
* @param {Object} [config={}] config (flags, args)
|
|
161
|
+
* @returns {Promise} Promise resolving the closest matching string
|
|
162
|
+
*/
|
|
163
|
+
async closestAsync ( arr, config = {} ) {
|
|
164
|
+
|
|
165
|
+
return this.#asyncWrapper(
|
|
166
|
+
this.closest,
|
|
167
|
+
arr, config
|
|
168
|
+
);
|
|
169
|
+
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
/**
|
|
173
|
+
* generate a similarity matrix for an array of strings
|
|
174
|
+
*
|
|
175
|
+
* @async
|
|
176
|
+
*
|
|
177
|
+
* @param {String} algo name of the algorithm
|
|
178
|
+
* @param {String[]} arr array of strings to cross-compare
|
|
179
|
+
* @param {Object} [config={}] config (flags, args)
|
|
180
|
+
* @returns {Promise} Promise resolving an 2D array representing the similarity matrix
|
|
181
|
+
*/
|
|
182
|
+
similarityMatrixAsync ( algo, arr, config = {} ) {
|
|
183
|
+
|
|
184
|
+
return this.#asyncWrapper(
|
|
185
|
+
this.similarityMatrix,
|
|
186
|
+
algo, arr, config
|
|
187
|
+
);
|
|
188
|
+
|
|
189
|
+
};
|
|
190
|
+
|
|
191
|
+
};
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cosine Similarity
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* Cosine similarity is a measure of how similar two vectors are. It's often used
|
|
6
|
+
* in text analysis to compare texts based on the words they contain.
|
|
7
|
+
*
|
|
8
|
+
* @author Paul Köhler (komed3)
|
|
9
|
+
* @license MIT
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
'use strict';
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* private helper function
|
|
16
|
+
* get term frequency from string
|
|
17
|
+
* @private
|
|
18
|
+
*
|
|
19
|
+
* @param {String} str string
|
|
20
|
+
* @param {String} delimiter term delimiter
|
|
21
|
+
* @returns {Object} term frequency
|
|
22
|
+
*/
|
|
23
|
+
const _termFreq = ( str, delimiter ) => {
|
|
24
|
+
|
|
25
|
+
let freq = {};
|
|
26
|
+
|
|
27
|
+
str.split( delimiter ).forEach( ( term ) => {
|
|
28
|
+
|
|
29
|
+
freq[ term ] = ( freq[ term ] || 0 ) + 1;
|
|
30
|
+
|
|
31
|
+
} );
|
|
32
|
+
|
|
33
|
+
return freq;
|
|
34
|
+
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* module exports
|
|
39
|
+
* @public
|
|
40
|
+
*
|
|
41
|
+
* @param {String} a string a
|
|
42
|
+
* @param {String} b string b
|
|
43
|
+
* @param {Object} options having {
|
|
44
|
+
* @param {String} [delimiter=' '] term delimiter
|
|
45
|
+
* }
|
|
46
|
+
* @returns {Number} similarity score (0..1)
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
module.exports = ( a, b, { delimiter = ' ' } = {} ) => {
|
|
50
|
+
|
|
51
|
+
/* step 1: count the frequency of chars per string */
|
|
52
|
+
|
|
53
|
+
let termsA = _termFreq( a, delimiter ),
|
|
54
|
+
termsB = _termFreq( b, delimiter );
|
|
55
|
+
|
|
56
|
+
let allTerms = new Set ( [
|
|
57
|
+
...Object.keys( termsA ),
|
|
58
|
+
...Object.keys( termsB )
|
|
59
|
+
] );
|
|
60
|
+
|
|
61
|
+
/* step 2: calculate the dot product */
|
|
62
|
+
|
|
63
|
+
let dotProduct = [ ...allTerms ].reduce(
|
|
64
|
+
( sum, char ) => sum + ( termsA[ char ] || 0 ) * ( termsB[ char ] || 0 ),
|
|
65
|
+
0
|
|
66
|
+
);
|
|
67
|
+
|
|
68
|
+
/* step 3: calculate the vector magnitudes */
|
|
69
|
+
|
|
70
|
+
let magnitudeA = Math.sqrt( [ ...allTerms ].reduce(
|
|
71
|
+
( sum, char ) => sum + ( termsA[ char ] || 0 ) ** 2,
|
|
72
|
+
0
|
|
73
|
+
) );
|
|
74
|
+
|
|
75
|
+
let magnitudeB = Math.sqrt( [ ...allTerms ].reduce(
|
|
76
|
+
( sum, char ) => sum + ( termsB[ char ] || 0 ) ** 2,
|
|
77
|
+
0
|
|
78
|
+
) );
|
|
79
|
+
|
|
80
|
+
/* step 4: calculate Cosine similarity */
|
|
81
|
+
|
|
82
|
+
return magnitudeA && magnitudeB
|
|
83
|
+
? dotProduct / ( magnitudeA * magnitudeB )
|
|
84
|
+
: 0;
|
|
85
|
+
|
|
86
|
+
};
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Damerau-Levenshtein Distance
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* The Damerau-Levenshtein distance differs from the classical Levenshtein
|
|
6
|
+
* distance by including transpositions among its allowable operations in
|
|
7
|
+
* addition to the three classical single-character edit operations
|
|
8
|
+
* (insertions, deletions and substitutions). Useful for correcting typos.
|
|
9
|
+
*
|
|
10
|
+
* @author Paul Köhler (komed3)
|
|
11
|
+
* @license MIT
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
'use strict';
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* module exports
|
|
18
|
+
* @public
|
|
19
|
+
*
|
|
20
|
+
* @param {String} a string a
|
|
21
|
+
* @param {String} b string b
|
|
22
|
+
* @param {Object} options having {
|
|
23
|
+
* @param {Boolean} [raw=false] if true the raw distance is returned
|
|
24
|
+
* }
|
|
25
|
+
* @returns {Number} similarity score (0..1) or distance
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
module.exports = ( a, b, { raw = false } = {} ) => {
|
|
29
|
+
|
|
30
|
+
/* step 1: initialize scoring matrix */
|
|
31
|
+
|
|
32
|
+
let matrix = Array.from(
|
|
33
|
+
{ length: a.length + 1 },
|
|
34
|
+
( _, i ) => Array.from(
|
|
35
|
+
{ length: b.length + 1 },
|
|
36
|
+
( _, j ) => i && j ? 0 : i || j
|
|
37
|
+
)
|
|
38
|
+
);
|
|
39
|
+
|
|
40
|
+
/* step 2: calculate Damerau-Levenshtein distance */
|
|
41
|
+
|
|
42
|
+
for ( let i = 1; i <= a.length; i++ ) {
|
|
43
|
+
|
|
44
|
+
for ( let j = 1; j <= b.length; j++ ) {
|
|
45
|
+
|
|
46
|
+
let cost = a[ i - 1 ] === b[ j - 1 ] ? 0 : 1;
|
|
47
|
+
|
|
48
|
+
matrix[ i ][ j ] = Math.min(
|
|
49
|
+
matrix[ i - 1 ][ j ] + 1,
|
|
50
|
+
matrix[ i ][ j - 1 ] + 1,
|
|
51
|
+
matrix[ i - 1 ][ j - 1 ] + cost
|
|
52
|
+
);
|
|
53
|
+
|
|
54
|
+
if (
|
|
55
|
+
i > 1 && j > 1 &&
|
|
56
|
+
a[ i - 1 ] === b[ j - 2 ] &&
|
|
57
|
+
a[ i - 2 ] === b[ j - 1 ]
|
|
58
|
+
) {
|
|
59
|
+
|
|
60
|
+
matrix[ i ][ j ] = Math.min(
|
|
61
|
+
matrix[ i ][ j ],
|
|
62
|
+
matrix[ i - 2 ][ j - 2 ] + cost
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
/* step 3: get Damerau-Levenshtein distance as value between 0..1 */
|
|
72
|
+
|
|
73
|
+
return raw ? matrix[ a.length ][ b.length ] : 1 - (
|
|
74
|
+
matrix[ a.length ][ b.length ] /
|
|
75
|
+
Math.max( a.length, b.length )
|
|
76
|
+
);
|
|
77
|
+
|
|
78
|
+
};
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dice-Sørensen Coefficient
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* The Dice-Sørensen index equals twice the number of elements common to
|
|
6
|
+
* both sets divided by the sum of the number of elements in each set.
|
|
7
|
+
* Equivalently, the index is the size of the intersection as a fraction
|
|
8
|
+
* of the average size of the two sets.
|
|
9
|
+
*
|
|
10
|
+
* @author Paul Köhler (komed3)
|
|
11
|
+
* @license MIT
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
'use strict';
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* private helper function
|
|
18
|
+
* get bigrams from string
|
|
19
|
+
* @private
|
|
20
|
+
*
|
|
21
|
+
* @param {String} str string
|
|
22
|
+
* @returns {Set} set of bigrams
|
|
23
|
+
*/
|
|
24
|
+
const _str2bigrams = ( str ) => {
|
|
25
|
+
|
|
26
|
+
let bigrams = new Set ();
|
|
27
|
+
|
|
28
|
+
for ( let i = 0; i < str.length - 1; i++ ) {
|
|
29
|
+
|
|
30
|
+
bigrams.add( str.substring( i, i + 2 ) );
|
|
31
|
+
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
return bigrams;
|
|
35
|
+
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* module exports
|
|
40
|
+
* @public
|
|
41
|
+
*
|
|
42
|
+
* @param {String} a string a
|
|
43
|
+
* @param {String} b string b
|
|
44
|
+
* @returns {Number} similarity score (0..1)
|
|
45
|
+
*/
|
|
46
|
+
|
|
47
|
+
module.exports = ( a, b ) => {
|
|
48
|
+
|
|
49
|
+
/* step 1: generate bigrams from strings */
|
|
50
|
+
|
|
51
|
+
let setA = _str2bigrams( a ),
|
|
52
|
+
setB = _str2bigrams( b );
|
|
53
|
+
|
|
54
|
+
/* step 2: calculate coefficient */
|
|
55
|
+
|
|
56
|
+
return (
|
|
57
|
+
( new Set ( [ ...setA ].filter( ( test ) => {
|
|
58
|
+
return setB.has( test );
|
|
59
|
+
} ) ) ).size * 2
|
|
60
|
+
) / (
|
|
61
|
+
setA.size +
|
|
62
|
+
setB.size
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
};
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hamming Distance
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* The Hamming distance between two equal-length strings of symbols is the
|
|
6
|
+
* number of positions at which the corresponding symbols are different.
|
|
7
|
+
*
|
|
8
|
+
* @author Paul Köhler (komed3)
|
|
9
|
+
* @license MIT
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
'use strict';
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* module exports
|
|
16
|
+
* @public
|
|
17
|
+
*
|
|
18
|
+
* @param {String} a string a
|
|
19
|
+
* @param {String} b string b
|
|
20
|
+
* @returns {Number} similarity score (0..1)
|
|
21
|
+
* @throws {Error} if string not of equal length
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
module.exports = ( a, b ) => {
|
|
25
|
+
|
|
26
|
+
if ( a.length !== b.length ) {
|
|
27
|
+
|
|
28
|
+
/* strings must be of equal length for this calculation */
|
|
29
|
+
|
|
30
|
+
throw new Error (
|
|
31
|
+
`Strings must be of equal length for Hamming Distance`
|
|
32
|
+
);
|
|
33
|
+
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
return 1 - (
|
|
37
|
+
[ ...a ].reduce(
|
|
38
|
+
( sum, char, i ) => sum + ( char !== b[ i ] ? 1 : 0 ),
|
|
39
|
+
0
|
|
40
|
+
) /
|
|
41
|
+
a.length
|
|
42
|
+
);
|
|
43
|
+
|
|
44
|
+
};
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Jaccard Index
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* The Jaccard Index measures the similarity between two sets by dividing
|
|
6
|
+
* the size of their intersection by the size of their union.
|
|
7
|
+
*
|
|
8
|
+
* @author Paul Köhler (komed3)
|
|
9
|
+
* @license MIT
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
'use strict';
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* module exports
|
|
16
|
+
* @public
|
|
17
|
+
*
|
|
18
|
+
* @param {String} a string a
|
|
19
|
+
* @param {String} b string b
|
|
20
|
+
* @returns {Number} similarity score (0..1)
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
module.exports = ( a, b ) => {
|
|
24
|
+
|
|
25
|
+
let setA = new Set ( a ),
|
|
26
|
+
setB = new Set ( b );
|
|
27
|
+
|
|
28
|
+
return (
|
|
29
|
+
new Set ( [ ...setA ].filter( x => setB.has( x ) ) )
|
|
30
|
+
).size / (
|
|
31
|
+
new Set ( [ ...setA, ...setB ] )
|
|
32
|
+
).size;
|
|
33
|
+
|
|
34
|
+
};
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Jaro-Winkler Distance
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* Jaro-Winkler is a string similarity metric that gives more weight to
|
|
6
|
+
* matching characters at the start of the strings.
|
|
7
|
+
*
|
|
8
|
+
* @author Paul Köhler (komed3)
|
|
9
|
+
* @license MIT
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
'use strict';
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* module exports
|
|
16
|
+
* @public
|
|
17
|
+
*
|
|
18
|
+
* @param {String} a string a
|
|
19
|
+
* @param {String} b string b
|
|
20
|
+
* @param {Object} options having {
|
|
21
|
+
* @param {Boolean} [raw=false] if true the raw distance is returned
|
|
22
|
+
* }
|
|
23
|
+
* @returns {Number} similarity score (0..1) or distance
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
module.exports = ( a, b, { raw = false } = {} ) => {
|
|
27
|
+
|
|
28
|
+
/* step 1: check for matches between strings */
|
|
29
|
+
|
|
30
|
+
let matchWindow = Math.floor(
|
|
31
|
+
Math.max( a.length, b.length ) / 2
|
|
32
|
+
) - 1;
|
|
33
|
+
|
|
34
|
+
let aMatches = Array( a.length ).fill( false ),
|
|
35
|
+
bMatches = Array( b.length ).fill( false );
|
|
36
|
+
|
|
37
|
+
let matches = 0;
|
|
38
|
+
|
|
39
|
+
for ( let i = 0; i < a.length; i++ ) {
|
|
40
|
+
|
|
41
|
+
for (
|
|
42
|
+
let j = Math.max( 0, i - matchWindow );
|
|
43
|
+
j < Math.min( i + matchWindow + 1, b.length );
|
|
44
|
+
j++
|
|
45
|
+
) {
|
|
46
|
+
|
|
47
|
+
if ( !bMatches[ j ] && a[ i ] === b[ j ] ) {
|
|
48
|
+
|
|
49
|
+
aMatches[ i ] = true;
|
|
50
|
+
bMatches[ j ] = true;
|
|
51
|
+
|
|
52
|
+
matches++;
|
|
53
|
+
|
|
54
|
+
break;
|
|
55
|
+
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
if ( matches === 0 ) {
|
|
63
|
+
|
|
64
|
+
/* if no matches found, return 0 */
|
|
65
|
+
|
|
66
|
+
return 0;
|
|
67
|
+
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/* step 2: calculate transpositions */
|
|
71
|
+
|
|
72
|
+
let transpos = 0,
|
|
73
|
+
k = 0;
|
|
74
|
+
|
|
75
|
+
for ( let i = 0; i < a.length; i++ ) {
|
|
76
|
+
|
|
77
|
+
if ( aMatches[ i ] ) {
|
|
78
|
+
|
|
79
|
+
while ( !bMatches[ k ] ) k++;
|
|
80
|
+
|
|
81
|
+
if ( a[ i ] !== b[ k ] ) transpos++;
|
|
82
|
+
|
|
83
|
+
k++;
|
|
84
|
+
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/* step 3: calculate Jaro-Winkler distance */
|
|
90
|
+
|
|
91
|
+
let jaro = (
|
|
92
|
+
( matches / a.length ) +
|
|
93
|
+
( matches / b.length ) +
|
|
94
|
+
( matches - ( transpos / 2 ) ) /
|
|
95
|
+
matches
|
|
96
|
+
) / 3;
|
|
97
|
+
|
|
98
|
+
/* step 4: get Jaro-Winkler as value between 0..1 */
|
|
99
|
+
|
|
100
|
+
return raw ? jaro : jaro + Math.min(
|
|
101
|
+
4, [ ...a ].findIndex(
|
|
102
|
+
( char, i ) => char !== b[ i ]
|
|
103
|
+
) || 0
|
|
104
|
+
) * 0.1 * ( 1 - jaro );
|
|
105
|
+
|
|
106
|
+
};
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Longest Common Subsequence (LCS)
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* LCS measures the length of the longest subsequence common to both strings.
|
|
6
|
+
*
|
|
7
|
+
* @author Paul Köhler (komed3)
|
|
8
|
+
* @license MIT
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
'use strict';
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* module exports
|
|
15
|
+
* @public
|
|
16
|
+
*
|
|
17
|
+
* @param {String} a string a
|
|
18
|
+
* @param {String} b string b
|
|
19
|
+
* @returns {Number} similarity score (0..1)
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
module.exports = ( a, b ) => {
|
|
23
|
+
|
|
24
|
+
/* step 1: initialize scoring matrix */
|
|
25
|
+
|
|
26
|
+
let matrix = Array( a.length + 1 ).fill( null ).map(
|
|
27
|
+
() => Array( b.length + 1 ).fill( 0 )
|
|
28
|
+
);
|
|
29
|
+
|
|
30
|
+
for ( let i = 1; i <= a.length; i++ ) {
|
|
31
|
+
|
|
32
|
+
for ( let j = 1; j <= b.length; j++ ) {
|
|
33
|
+
|
|
34
|
+
if ( a[ i - 1 ] === b[ j - 1 ] ) {
|
|
35
|
+
|
|
36
|
+
matrix[ i ][ j ] = matrix[ i - 1 ][ j - 1 ] + 1;
|
|
37
|
+
|
|
38
|
+
} else {
|
|
39
|
+
|
|
40
|
+
matrix[ i ][ j ] = Math.max(
|
|
41
|
+
matrix[ i - 1 ][ j ],
|
|
42
|
+
matrix[ i ][ j - 1 ]
|
|
43
|
+
);
|
|
44
|
+
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/* step 2: calculate LCS */
|
|
52
|
+
|
|
53
|
+
return (
|
|
54
|
+
matrix[ a.length ][ b.length ] /
|
|
55
|
+
Math.max( a.length, b.length )
|
|
56
|
+
);
|
|
57
|
+
|
|
58
|
+
};
|