cmpstr 1.0.3 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +430 -85
- package/package.json +47 -25
- package/src/CmpStr.js +784 -0
- package/src/CmpStrAsync.js +191 -0
- package/src/algorithms/cosine.js +86 -0
- package/src/algorithms/damerau.js +78 -0
- package/src/algorithms/dice.js +65 -0
- package/src/algorithms/hamming.js +44 -0
- package/src/algorithms/jaccard.js +34 -0
- package/src/algorithms/jaroWinkler.js +106 -0
- package/src/algorithms/lcs.js +58 -0
- package/src/algorithms/levenshtein.js +70 -0
- package/src/algorithms/needlemanWunsch.js +72 -0
- package/src/algorithms/qGram.js +63 -0
- package/src/algorithms/smithWaterman.js +78 -0
- package/src/algorithms/soundex.js +152 -0
- package/src/index.js +47 -0
- package/index.js +0 -432
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Levenshtein Distance
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* The Levenshtein distance between two strings is the minimum number of
|
|
6
|
+
* single-character edits (i.e. insertions, deletions or substitutions)
|
|
7
|
+
* required to change one word into the other.
|
|
8
|
+
*
|
|
9
|
+
* @author Paul Köhler (komed3)
|
|
10
|
+
* @license MIT
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
'use strict';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* module exports
|
|
17
|
+
* @public
|
|
18
|
+
*
|
|
19
|
+
* @param {String} a string a
|
|
20
|
+
* @param {String} b string b
|
|
21
|
+
* @param {Object} options having {
|
|
22
|
+
* @param {Boolean} [raw=false] if true the raw distance is returned
|
|
23
|
+
* }
|
|
24
|
+
* @returns {Number} similarity score (0..1) or distance
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
module.exports = ( a, b, { raw = false } = {} ) => {
|
|
28
|
+
|
|
29
|
+
/* step 1: initialize scoring matrix */
|
|
30
|
+
|
|
31
|
+
let matrix = Array.from(
|
|
32
|
+
{ length: a.length + 1 },
|
|
33
|
+
( _, i ) => Array.from(
|
|
34
|
+
{ length: b.length + 1 },
|
|
35
|
+
( _, j ) => j
|
|
36
|
+
).fill( i, 0, 1 )
|
|
37
|
+
);
|
|
38
|
+
|
|
39
|
+
/* step 2: calculate Levenshtein distance */
|
|
40
|
+
|
|
41
|
+
for ( let i = 1; i <= a.length; i++ ) {
|
|
42
|
+
|
|
43
|
+
for ( let j = 1; j <= b.length; j++ ) {
|
|
44
|
+
|
|
45
|
+
if ( a[ i - 1 ] === b[ j - 1 ] ) {
|
|
46
|
+
|
|
47
|
+
matrix[ i ][ j ] = matrix[ i - 1 ][ j - 1 ];
|
|
48
|
+
|
|
49
|
+
} else {
|
|
50
|
+
|
|
51
|
+
matrix[ i ][ j ] = 1 + Math.min(
|
|
52
|
+
matrix[ i ][ j - 1 ],
|
|
53
|
+
matrix[ i - 1 ][ j - 1 ],
|
|
54
|
+
matrix[ i - 1 ][ j ]
|
|
55
|
+
);
|
|
56
|
+
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/* step 3: get Levenshtein distance as value between 0..1 */
|
|
64
|
+
|
|
65
|
+
return raw ? matrix[ a.length ][ b.length ] : 1 - (
|
|
66
|
+
matrix[ a.length ][ b.length ] /
|
|
67
|
+
Math.max( a.length, b.length )
|
|
68
|
+
);
|
|
69
|
+
|
|
70
|
+
};
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Needleman-Wunsch Algorithm
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* The Needleman-Wunsch algorithm performs global alignment,
|
|
6
|
+
* aligning two strings entirely, including gaps. It is commonly
|
|
7
|
+
* used in bioinformatics.
|
|
8
|
+
*
|
|
9
|
+
* @author Paul Köhler
|
|
10
|
+
* @license MIT
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
'use strict';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* module exports
|
|
17
|
+
* @public
|
|
18
|
+
*
|
|
19
|
+
* @param {String} a string a
|
|
20
|
+
* @param {String} b string b
|
|
21
|
+
* @param {Object} options having {
|
|
22
|
+
* @param {Number} [match=1] score for a match
|
|
23
|
+
* @param {Number} [mismatch=-1] penalty for a mismatch
|
|
24
|
+
* @param {Number} [gap=-1] penalty for a gap
|
|
25
|
+
* }
|
|
26
|
+
* @returns {Number} similarity score (0..1)
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
module.exports = ( a, b, {
|
|
30
|
+
match = 1, mismatch = -1, gap = -1
|
|
31
|
+
} = {} ) => {
|
|
32
|
+
|
|
33
|
+
let rows = a.length + 1,
|
|
34
|
+
cols = b.length + 1;
|
|
35
|
+
|
|
36
|
+
/* step 1: initialize scoring matrix */
|
|
37
|
+
|
|
38
|
+
let matrix = Array.from(
|
|
39
|
+
{ length: rows },
|
|
40
|
+
( _, i ) => Array.from(
|
|
41
|
+
{ length: cols },
|
|
42
|
+
( _, j ) => ( i === 0 ? j * gap : j === 0 ? i * gap : 0 )
|
|
43
|
+
)
|
|
44
|
+
);
|
|
45
|
+
|
|
46
|
+
/* step 2: fill the scoring matrix */
|
|
47
|
+
|
|
48
|
+
for ( let i = 1; i < rows; i++ ) {
|
|
49
|
+
|
|
50
|
+
for ( let j = 1; j < cols; j++ ) {
|
|
51
|
+
|
|
52
|
+
let matchScore = a[ i - 1 ] === b[ j - 1 ] ? match : mismatch;
|
|
53
|
+
|
|
54
|
+
matrix[ i ][ j ] = Math.max(
|
|
55
|
+
matrix[ i - 1 ][ j - 1 ] + matchScore,
|
|
56
|
+
matrix[ i - 1 ][ j ] + gap,
|
|
57
|
+
matrix[ i ][ j - 1 ] + gap
|
|
58
|
+
);
|
|
59
|
+
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/* step 3: normalize the score to a value between 0..1 */
|
|
65
|
+
|
|
66
|
+
return Math.max( 0, Math.min( 1,
|
|
67
|
+
matrix[ a.length ][ b.length ] / (
|
|
68
|
+
Math.max( a.length, b.length ) * match
|
|
69
|
+
)
|
|
70
|
+
) );
|
|
71
|
+
|
|
72
|
+
};
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* q-Gram Similarity
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* Q-gram similarity is a string-matching algorithm that compares two
|
|
6
|
+
* strings by breaking them into substrings of length Q. It's used to
|
|
7
|
+
* determine how similar the two strings are.
|
|
8
|
+
*
|
|
9
|
+
* @author Paul Köhler (komed3)
|
|
10
|
+
* @license MIT
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
'use strict';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* private helper function
|
|
17
|
+
* convert string to array of substrings
|
|
18
|
+
* @private
|
|
19
|
+
*
|
|
20
|
+
* @param {String} str string
|
|
21
|
+
* @param {Int} q length of substrings
|
|
22
|
+
* @returns {String[]} array of substrings
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
const _qGrams = ( str, q ) => {
|
|
26
|
+
|
|
27
|
+
let grams = [];
|
|
28
|
+
|
|
29
|
+
for ( let i = 0; i <= str.length - q; i++ ) {
|
|
30
|
+
|
|
31
|
+
grams.push( str.slice( i, i + q ) );
|
|
32
|
+
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return grams;
|
|
36
|
+
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* module exports
|
|
41
|
+
* @public
|
|
42
|
+
*
|
|
43
|
+
* @param {String} a string a
|
|
44
|
+
* @param {String} b string b
|
|
45
|
+
* @param {Object} options having {
|
|
46
|
+
* @param {Int} [q=2] length of substrings
|
|
47
|
+
* }
|
|
48
|
+
* @returns {Number} similarity score (0..1)
|
|
49
|
+
*/
|
|
50
|
+
module.exports = ( a, b, { q = 2 } = {} ) => {
|
|
51
|
+
|
|
52
|
+
let setA = new Set ( _qGrams( a, q ) ),
|
|
53
|
+
setB = new Set ( _qGrams( b, q ) );
|
|
54
|
+
|
|
55
|
+
return (
|
|
56
|
+
new Set( [ ...setA ].filter(
|
|
57
|
+
test => setB.has( test )
|
|
58
|
+
) )
|
|
59
|
+
).size / Math.max(
|
|
60
|
+
setA.size, setB.size
|
|
61
|
+
);
|
|
62
|
+
|
|
63
|
+
};
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smith-Waterman Algorithm
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* The Smith-Waterman algorithm performs local alignment, finding the
|
|
6
|
+
* best matching subsequence between two strings. It is commonly used
|
|
7
|
+
* in bioinformatics.
|
|
8
|
+
*
|
|
9
|
+
* @author Paul Köhler
|
|
10
|
+
* @license MIT
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
'use strict';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* module exports
|
|
17
|
+
* @public
|
|
18
|
+
*
|
|
19
|
+
* @param {String} a string a
|
|
20
|
+
* @param {String} b string b
|
|
21
|
+
* @param {Object} options having {
|
|
22
|
+
* @param {Number} [match=2] score for a match
|
|
23
|
+
* @param {Number} [mismatch=-1] penalty for a mismatch
|
|
24
|
+
* @param {Number} [gap=-1] penalty for a gap
|
|
25
|
+
* }
|
|
26
|
+
* @returns {Number} similarity score (0..1)
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
module.exports = ( a, b, {
|
|
30
|
+
match = 2, mismatch = -1, gap = -1
|
|
31
|
+
} = {} ) => {
|
|
32
|
+
|
|
33
|
+
let rows = a.length + 1,
|
|
34
|
+
cols = b.length + 1;
|
|
35
|
+
|
|
36
|
+
/* step 1: initialize scoring matrix */
|
|
37
|
+
|
|
38
|
+
let matrix = Array.from(
|
|
39
|
+
{ length: rows },
|
|
40
|
+
() => Array( cols ).fill( 0 )
|
|
41
|
+
);
|
|
42
|
+
|
|
43
|
+
/* step 2: fill the scoring matrix */
|
|
44
|
+
|
|
45
|
+
let maxScore = 0;
|
|
46
|
+
|
|
47
|
+
for ( let i = 1; i < rows; i++ ) {
|
|
48
|
+
|
|
49
|
+
for ( let j = 1; j < cols; j++ ) {
|
|
50
|
+
|
|
51
|
+
let matchScore = a[ i - 1 ] === b[ j - 1 ] ? match : mismatch;
|
|
52
|
+
|
|
53
|
+
matrix[ i ][ j ] = Math.max(
|
|
54
|
+
0,
|
|
55
|
+
matrix[ i - 1 ][ j - 1 ] + matchScore,
|
|
56
|
+
matrix[ i - 1 ][ j ] + gap,
|
|
57
|
+
matrix[ i ][ j - 1 ] + gap
|
|
58
|
+
);
|
|
59
|
+
|
|
60
|
+
maxScore = Math.max(
|
|
61
|
+
maxScore,
|
|
62
|
+
matrix[ i ][ j ]
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
/* step 3: normalize the score to a value between 0..1 */
|
|
70
|
+
|
|
71
|
+
return Math.max( 0, Math.min( 1,
|
|
72
|
+
maxScore / Math.min(
|
|
73
|
+
a.length * match,
|
|
74
|
+
b.length * match
|
|
75
|
+
)
|
|
76
|
+
) );
|
|
77
|
+
|
|
78
|
+
};
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Soundex Algorithm
|
|
3
|
+
* CmpStr module
|
|
4
|
+
*
|
|
5
|
+
* The Soundex algorithm generates a phonetic representation of a string
|
|
6
|
+
* based on how it sounds. It supports predefined setups for English and
|
|
7
|
+
* German and allows users to provide custom options.
|
|
8
|
+
*
|
|
9
|
+
* @author Paul Köhler
|
|
10
|
+
* @license MIT
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
'use strict';
|
|
14
|
+
|
|
15
|
+
/**
 * predefined phonetic mappings / excluded chars for supported languages
 * @private
 *
 * Each language entry provides `exclude` (a string of characters) and
 * `mapping` (upper-case letter => digit string). The mapping tables are
 * expanded from letter groups: the n-th group (1-based) maps to digit n.
 */
const soundexConfig = ( () => {

    /* turn [ 'BFPV', ... ] into { B: '1', F: '1', P: '1', V: '1', ... } */

    const expand = ( groups ) => {

        const table = {};

        groups.forEach( ( letters, index ) => {

            for ( const letter of letters ) {

                table[ letter ] = String( index + 1 );

            }

        } );

        return table;

    };

    return {
        en: {
            exclude: 'AEIOUHWY',
            mapping: expand( [ 'BFPV', 'CGJKQSXZ', 'DT', 'L', 'MN', 'R' ] )
        },
        de: {
            exclude: 'AEIOUÄÖÜHWY',
            mapping: expand( [ 'BPFV', 'CGKQSXZJ', 'DT', 'L', 'MN', 'R' ] )
        }
    };

} )();
|
|
43
|
+
|
|
44
|
+
/**
 * private helper function
 * generate soundex code from string
 * @private
 *
 * The input is upper-cased and every character listed in `exclude` is
 * removed. The first remaining character starts the code; each following
 * character contributes its mapped digit unless it has no mapping or its
 * digit equals the digit of the directly preceding character. The code
 * is finally padded with '0' / truncated to `maxLength`.
 *
 * If no character survives the exclusion (empty input, or excluded
 * characters only), the code consists of padding only, e.g. '0000',
 * instead of throwing a TypeError.
 *
 * @param {String} str string to create soundex code for
 * @param {Object} mapping soundex mapping
 * @param {String} exclude characters to exclude from the input
 * @param {Number} maxLength maximum length of the phonetic code
 * @returns {String} soundex code
 */
const _generateSoundex = ( str, mapping, exclude, maxLength ) => {

    /* NOTE(review): `exclude` is interpolated into a character class */
    /* unescaped, so characters such as ']', '^', '-' or '\' keep their */
    /* regex meaning; confirm whether callers rely on that */

    const normalized = str.toUpperCase().replace(
        new RegExp( `[${exclude}]`, 'g' ), ''
    );

    /* normalized[ 0 ] is undefined when nothing is left; fall back to */
    /* an empty code so that padEnd() below still has a string to work on */

    let soundexCode = normalized[ 0 ] || '',
        prevCode = mapping[ soundexCode ] || '';

    for ( let i = 1; i < normalized.length; i++ ) {

        const code = mapping[ normalized[ i ] ] || '';

        /* skip unmapped characters and direct repetitions of a digit */

        if ( code !== prevCode && code !== '' ) {

            soundexCode += code;

        }

        prevCode = code;

    }

    /* pad or truncate the code to the desired length */

    return soundexCode
        .padEnd( maxLength, '0' )
        .slice( 0, maxLength );

};
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* module exports
|
|
88
|
+
* @public
|
|
89
|
+
*
|
|
90
|
+
* @param {String} a string a
|
|
91
|
+
* @param {String} b string b
|
|
92
|
+
* @param {Object} options having {
|
|
93
|
+
* @param {String} [lang='en'] language code for predefined setups (e.g., 'en', 'de')
|
|
94
|
+
* @param {Boolean} [raw=false] if true, returns the raw sound index codes
|
|
95
|
+
* @param {Object} [mapping] custom phonetic mapping (overrides predefined)
|
|
96
|
+
* @param {String} [exclude=''] characters to exclude from the input (overrides predefined)
|
|
97
|
+
* @param {Number} [maxLength=4] maximum length of the phonetic code
|
|
98
|
+
* }
|
|
99
|
+
* @returns {Number|Object} similarity score (0..1) or raw soundex codes
|
|
100
|
+
*/
|
|
101
|
+
|
|
102
|
+
module.exports = ( a, b, {
|
|
103
|
+
lang = 'en',
|
|
104
|
+
raw = false,
|
|
105
|
+
mapping = null,
|
|
106
|
+
exclude = null,
|
|
107
|
+
maxLength = 4
|
|
108
|
+
} = {} ) => {
|
|
109
|
+
|
|
110
|
+
/* step 1: load mapping and excluded chars or use custom data */
|
|
111
|
+
|
|
112
|
+
let pMapping = mapping || soundexConfig[ lang ].mapping || soundexConfig.en.mapping,
|
|
113
|
+
pExclude = exclude || soundexConfig[ lang ].exclude || soundexConfig.en.exclude;
|
|
114
|
+
|
|
115
|
+
/* step 2: generate soundex codes for both strings */
|
|
116
|
+
|
|
117
|
+
let soundexA = _generateSoundex( a, pMapping, pExclude, maxLength ),
|
|
118
|
+
soundexB = _generateSoundex( b, pMapping, pExclude, maxLength );
|
|
119
|
+
|
|
120
|
+
if ( raw ) {
|
|
121
|
+
|
|
122
|
+
/* return raw soundex codes */
|
|
123
|
+
|
|
124
|
+
return {
|
|
125
|
+
a: soundexA,
|
|
126
|
+
b: soundexB
|
|
127
|
+
};
|
|
128
|
+
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/* step 3: calculate similarity between soundex codes */
|
|
132
|
+
|
|
133
|
+
let maxLen = Math.max(
|
|
134
|
+
soundexA.length,
|
|
135
|
+
soundexB.length
|
|
136
|
+
);
|
|
137
|
+
|
|
138
|
+
let matches = 0;
|
|
139
|
+
|
|
140
|
+
for ( let i = 0; i < maxLen; i++ ) {
|
|
141
|
+
|
|
142
|
+
if ( soundexA[ i ] === soundexB[ i ] ) {
|
|
143
|
+
|
|
144
|
+
matches++;
|
|
145
|
+
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
return matches / maxLen;
|
|
151
|
+
|
|
152
|
+
};
|
package/src/index.js
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* npm package
|
|
3
|
+
* cmpstr
|
|
4
|
+
*
|
|
5
|
+
* The cmpstr package is a powerful and lightweight library for calculating string similarity,
|
|
6
|
+
* finding the closest matches in arrays, performing phonetic searches, and more. It supports
|
|
7
|
+
* a variety of built-in algorithms, including Levenshtein distance, Dice-Sørensen coefficient,
|
|
8
|
+
* Damerau-Levenshtein, Soundex, and many others. Users can also add custom algorithms and
|
|
9
|
+
* normalization filters to extend its functionality.
|
|
10
|
+
*
|
|
11
|
+
* key features:
|
|
12
|
+
* - built-in support for multiple similarity algorithms
|
|
13
|
+
* - phonetic search with language-specific configurations
|
|
14
|
+
* - batch operations and similarity matrices for large datasets
|
|
15
|
+
* - customizable normalization with global flags and caching
|
|
16
|
+
* - asynchronous support for non-blocking workflows
|
|
17
|
+
*
|
|
18
|
+
* usage:
|
|
19
|
+
* - compare strings for similarity using various algorithms
|
|
20
|
+
* - find the closest match from an array of strings
|
|
21
|
+
* - perform phonetic searches with raw or similarity-based results
|
|
22
|
+
* - generate similarity matrices for cross-comparisons
|
|
23
|
+
*
|
|
24
|
+
* @author Paul Köhler (komed3)
|
|
25
|
+
* @version 2.0.0
|
|
26
|
+
* @license MIT
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
'use strict';
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* module dependencies
|
|
33
|
+
* @private
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
const CmpStr = require( './CmpStr' );
|
|
37
|
+
const CmpStrAsync = require( './CmpStrAsync' );
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* module exports
|
|
41
|
+
* @public
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
module.exports = {
|
|
45
|
+
CmpStr,
|
|
46
|
+
CmpStrAsync
|
|
47
|
+
};
|