cmpstr 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,70 @@
1
+ /**
2
+ * Levenshtein Distance
3
+ * CmpStr module
4
+ *
5
+ * The Levenshtein distance between two strings is the minimum number of
6
+ * single-character edits (i.e. insertions, deletions or substitutions)
7
+ * required to change one word into the other.
8
+ *
9
+ * @author Paul Köhler (komed3)
10
+ * @license MIT
11
+ */
12
+
13
+ 'use strict';
14
+
15
+ /**
16
+ * module exports
17
+ * @public
18
+ *
19
+ * @param {String} a string a
20
+ * @param {String} b string b
21
+ * @param {Object} options having {
22
+ * @param {Boolean} [raw=false] if true the raw distance is returned
23
+ * }
24
+ * @returns {Number} similarity score (0..1) or distance
25
+ */
26
+
27
+ module.exports = ( a, b, { raw = false } = {} ) => {
28
+
29
+ /* step 1: initialize scoring matrix */
30
+
31
+ let matrix = Array.from(
32
+ { length: a.length + 1 },
33
+ ( _, i ) => Array.from(
34
+ { length: b.length + 1 },
35
+ ( _, j ) => j
36
+ ).fill( i, 0, 1 )
37
+ );
38
+
39
+ /* step 2: calculate Levenshtein distance */
40
+
41
+ for ( let i = 1; i <= a.length; i++ ) {
42
+
43
+ for ( let j = 1; j <= b.length; j++ ) {
44
+
45
+ if ( a[ i - 1 ] === b[ j - 1 ] ) {
46
+
47
+ matrix[ i ][ j ] = matrix[ i - 1 ][ j - 1 ];
48
+
49
+ } else {
50
+
51
+ matrix[ i ][ j ] = 1 + Math.min(
52
+ matrix[ i ][ j - 1 ],
53
+ matrix[ i - 1 ][ j - 1 ],
54
+ matrix[ i - 1 ][ j ]
55
+ );
56
+
57
+ }
58
+
59
+ }
60
+
61
+ }
62
+
63
+ /* step 3: get Levenshtein distance as value between 0..1 */
64
+
65
+ return raw ? matrix[ a.length ][ b.length ] : 1 - (
66
+ matrix[ a.length ][ b.length ] /
67
+ Math.max( a.length, b.length )
68
+ );
69
+
70
+ };
@@ -0,0 +1,72 @@
1
+ /**
2
+ * Needleman-Wunsch Algorithm
3
+ * CmpStr module
4
+ *
5
+ * The Needleman-Wunsch algorithm performs global alignment,
6
+ * aligning two strings entirely, including gaps. It is commonly
7
+ * used in bioinformatics.
8
+ *
9
+ * @author Paul Köhler
10
+ * @license MIT
11
+ */
12
+
13
+ 'use strict';
14
+
15
+ /**
16
+ * module exports
17
+ * @public
18
+ *
19
+ * @param {String} a string a
20
+ * @param {String} b string b
21
+ * @param {Object} options having {
22
+ * @param {Number} [match=1] score for a match
23
+ * @param {Number} [mismatch=-1] penalty for a mismatch
24
+ * @param {Number} [gap=-1] penalty for a gap
25
+ * }
26
+ * @returns {Number} similarity score (0..1)
27
+ */
28
+
29
+ module.exports = ( a, b, {
30
+ match = 1, mismatch = -1, gap = -1
31
+ } = {} ) => {
32
+
33
+ let rows = a.length + 1,
34
+ cols = b.length + 1;
35
+
36
+ /* step 1: initialize scoring matrix */
37
+
38
+ let matrix = Array.from(
39
+ { length: rows },
40
+ ( _, i ) => Array.from(
41
+ { length: cols },
42
+ ( _, j ) => ( i === 0 ? j * gap : j === 0 ? i * gap : 0 )
43
+ )
44
+ );
45
+
46
+ /* step 2: fill the scoring matrix */
47
+
48
+ for ( let i = 1; i < rows; i++ ) {
49
+
50
+ for ( let j = 1; j < cols; j++ ) {
51
+
52
+ let matchScore = a[ i - 1 ] === b[ j - 1 ] ? match : mismatch;
53
+
54
+ matrix[ i ][ j ] = Math.max(
55
+ matrix[ i - 1 ][ j - 1 ] + matchScore,
56
+ matrix[ i - 1 ][ j ] + gap,
57
+ matrix[ i ][ j - 1 ] + gap
58
+ );
59
+
60
+ }
61
+
62
+ }
63
+
64
+ /* step 3: normalize the score to a value between 0..1 */
65
+
66
+ return Math.max( 0, Math.min( 1,
67
+ matrix[ a.length ][ b.length ] / (
68
+ Math.max( a.length, b.length ) * match
69
+ )
70
+ ) );
71
+
72
+ };
@@ -0,0 +1,63 @@
1
+ /**
2
+ * q-Gram Similarity
3
+ * CmpStr module
4
+ *
5
+ * Q-gram similarity is a string-matching algorithm that compares two
6
+ * strings by breaking them into substrings of length Q. It's used to
7
+ * determine how similar the two strings are.
8
+ *
9
+ * @author Paul Köhler (komed3)
10
+ * @license MIT
11
+ */
12
+
13
+ 'use strict';
14
+
/**
 * private helper function
 * split a string into all of its overlapping substrings of length q
 * @private
 *
 * @param {String} str string
 * @param {Int} q length of substrings
 * @returns {String[]} array of substrings, in order of their start
 *                     position; empty if the string is shorter than q
 */

const _qGrams = ( str, q ) => Array.from(

    /* one q-gram per start position 0 .. str.length - q */
    { length: Math.max( 0, str.length - q + 1 ) },

    ( _, start ) => str.slice( start, start + q )

);
38
+
39
+ /**
40
+ * module exports
41
+ * @public
42
+ *
43
+ * @param {String} a string a
44
+ * @param {String} b string b
45
+ * @param {Object} options having {
46
+ * @param {Int} [q=2] length of substrings
47
+ * }
48
+ * @returns {Number} similarity score (0..1)
49
+ */
50
+ module.exports = ( a, b, { q = 2 } = {} ) => {
51
+
52
+ let setA = new Set ( _qGrams( a, q ) ),
53
+ setB = new Set ( _qGrams( b, q ) );
54
+
55
+ return (
56
+ new Set( [ ...setA ].filter(
57
+ test => setB.has( test )
58
+ ) )
59
+ ).size / Math.max(
60
+ setA.size, setB.size
61
+ );
62
+
63
+ };
@@ -0,0 +1,78 @@
1
+ /**
2
+ * Smith-Waterman Algorithm
3
+ * CmpStr module
4
+ *
5
+ * The Smith-Waterman algorithm performs local alignment, finding the
6
+ * best matching subsequence between two strings. It is commonly used
7
+ * in bioinformatics.
8
+ *
9
+ * @author Paul Köhler
10
+ * @license MIT
11
+ */
12
+
13
+ 'use strict';
14
+
15
+ /**
16
+ * module exports
17
+ * @public
18
+ *
19
+ * @param {String} a string a
20
+ * @param {String} b string b
21
+ * @param {Object} options having {
22
+ * @param {Number} [match=2] score for a match
23
+ * @param {Number} [mismatch=-1] penalty for a mismatch
24
+ * @param {Number} [gap=-1] penalty for a gap
25
+ * }
26
+ * @returns {Number} similarity score (0..1)
27
+ */
28
+
29
+ module.exports = ( a, b, {
30
+ match = 2, mismatch = -1, gap = -1
31
+ } = {} ) => {
32
+
33
+ let rows = a.length + 1,
34
+ cols = b.length + 1;
35
+
36
+ /* step 1: initialize scoring matrix */
37
+
38
+ let matrix = Array.from(
39
+ { length: rows },
40
+ () => Array( cols ).fill( 0 )
41
+ );
42
+
43
+ /* step 2: fill the scoring matrix */
44
+
45
+ let maxScore = 0;
46
+
47
+ for ( let i = 1; i < rows; i++ ) {
48
+
49
+ for ( let j = 1; j < cols; j++ ) {
50
+
51
+ let matchScore = a[ i - 1 ] === b[ j - 1 ] ? match : mismatch;
52
+
53
+ matrix[ i ][ j ] = Math.max(
54
+ 0,
55
+ matrix[ i - 1 ][ j - 1 ] + matchScore,
56
+ matrix[ i - 1 ][ j ] + gap,
57
+ matrix[ i ][ j - 1 ] + gap
58
+ );
59
+
60
+ maxScore = Math.max(
61
+ maxScore,
62
+ matrix[ i ][ j ]
63
+ );
64
+
65
+ }
66
+
67
+ }
68
+
69
+ /* step 3: normalize the score to a value between 0..1 */
70
+
71
+ return Math.max( 0, Math.min( 1,
72
+ maxScore / Math.min(
73
+ a.length * match,
74
+ b.length * match
75
+ )
76
+ ) );
77
+
78
+ };
@@ -0,0 +1,152 @@
1
+ /**
2
+ * Soundex Algorithm
3
+ * CmpStr module
4
+ *
5
+ * The Soundex algorithm generates a phonetic representation of a string
6
+ * based on how it sounds. It supports predefined setups for English and
7
+ * German and allows users to provide custom options.
8
+ *
9
+ * @author Paul Köhler
10
+ * @license MIT
11
+ */
12
+
13
+ 'use strict';
14
+
/**
 * predefined phonetic mappings / excluded chars for supported languages
 *
 * each language entry provides
 *  - exclude: characters removed from the input before encoding
 *  - mapping: letter => soundex digit
 *
 * @private
 */
const soundexConfig = ( () => {

    /* expand { digit: 'LETTERS' } groups into a { letter: digit } lookup */

    const toMapping = groups => Object.fromEntries(
        Object.entries( groups ).flatMap(
            ( [ digit, letters ] ) => Array.from(
                letters, letter => [ letter, digit ]
            )
        )
    );

    return {
        en: {
            exclude: 'AEIOUHWY',
            mapping: toMapping( {
                1: 'BFPV',
                2: 'CGJKQSXZ',
                3: 'DT',
                4: 'L',
                5: 'MN',
                6: 'R'
            } )
        },
        de: {
            exclude: 'AEIOUÄÖÜHWY',
            mapping: toMapping( {
                1: 'BPFV',
                2: 'CGKQSXZJ',
                3: 'DT',
                4: 'L',
                5: 'MN',
                6: 'R'
            } )
        }
    };

} )();
43
+
/**
 * private helper function
 * generate soundex code from string
 * @private
 *
 * The input is upper-cased and all excluded characters are removed
 * before encoding. The first remaining character is kept as is, each
 * following character contributes its mapped digit unless it repeats
 * the digit of the character directly before it. Characters without a
 * mapping add nothing to the code.
 *
 * NOTE: excluded characters are dropped before the first character is
 * picked, so the code starts with the first non-excluded character.
 *
 * @param {String} str string to create soundex code for
 * @param {Object} mapping soundex mapping
 * @param {String} exclude characters to exclude from the input
 * @param {Number} maxLength maximum length of the phonetic code
 * @returns {String} soundex code, padded with '0' / truncated to
 *                   maxLength; only zeros if no character remains
 */
const _generateSoundex = ( str, mapping, exclude, maxLength ) => {

    /* compare excluded chars literally instead of building a RegExp
       character class, so chars like "]", "^", "-" or "\" in `exclude`
       can neither break nor alter the pattern */

    const excluded = new Set( exclude );

    const normalized = Array.from( str.toUpperCase() ).filter(
        char => !excluded.has( char )
    );

    /* nothing left to encode (empty or fully excluded input):
       start from an empty code instead of failing on undefined */

    let soundexCode = normalized.length > 0 ? normalized[ 0 ] : '',
        prevCode = mapping[ soundexCode ] || '';

    for ( let i = 1; i < normalized.length; i++ ) {

        const code = mapping[ normalized[ i ] ] || '';

        /* skip unmapped chars and direct repetitions of the same digit */

        if ( code !== prevCode && code !== '' ) {

            soundexCode += code;

        }

        prevCode = code;

    }

    /* pad or truncate the code to the desired length */

    return soundexCode
        .padEnd( maxLength, '0' )
        .slice( 0, maxLength );

};
85
+
86
+ /**
87
+ * module exports
88
+ * @public
89
+ *
90
+ * @param {String} a string a
91
+ * @param {String} b string b
92
+ * @param {Object} options having {
93
+ * @param {String} [lang='en'] language code for predefined setups (e.g., 'en', 'de')
94
+ * @param {Boolean} [raw=false] if true, returns the raw sound index codes
95
+ * @param {Object} [mapping] custom phonetic mapping (overrides predefined)
96
+ * @param {String} [exclude=''] characters to exclude from the input (overrides predefined)
97
+ * @param {Number} [maxLength=4] maximum length of the phonetic code
98
+ * }
99
+ * @returns {Number|Object} similarity score (0..1) or raw soundex codes
100
+ */
101
+
102
+ module.exports = ( a, b, {
103
+ lang = 'en',
104
+ raw = false,
105
+ mapping = null,
106
+ exclude = null,
107
+ maxLength = 4
108
+ } = {} ) => {
109
+
110
+ /* step 1: load mapping and excluded chars or use custom data */
111
+
112
+ let pMapping = mapping || soundexConfig[ lang ].mapping || soundexConfig.en.mapping,
113
+ pExclude = exclude || soundexConfig[ lang ].exclude || soundexConfig.en.exclude;
114
+
115
+ /* step 2: generate soundex codes for both strings */
116
+
117
+ let soundexA = _generateSoundex( a, pMapping, pExclude, maxLength ),
118
+ soundexB = _generateSoundex( b, pMapping, pExclude, maxLength );
119
+
120
+ if ( raw ) {
121
+
122
+ /* return raw soundex codes */
123
+
124
+ return {
125
+ a: soundexA,
126
+ b: soundexB
127
+ };
128
+
129
+ }
130
+
131
+ /* step 3: calculate similarity between soundex codes */
132
+
133
+ let maxLen = Math.max(
134
+ soundexA.length,
135
+ soundexB.length
136
+ );
137
+
138
+ let matches = 0;
139
+
140
+ for ( let i = 0; i < maxLen; i++ ) {
141
+
142
+ if ( soundexA[ i ] === soundexB[ i ] ) {
143
+
144
+ matches++;
145
+
146
+ }
147
+
148
+ }
149
+
150
+ return matches / maxLen;
151
+
152
+ };
package/src/index.js ADDED
@@ -0,0 +1,47 @@
1
+ /**
2
+ * npm package
3
+ * cmpstr
4
+ *
5
+ * The cmpstr package is a powerful and lightweight library for calculating string similarity,
6
+ * finding the closest matches in arrays, performing phonetic searches, and more. It supports
7
+ * a variety of built-in algorithms, including Levenshtein distance, Dice-Sørensen coefficient,
8
+ * Damerau-Levenshtein, Soundex, and many others. Users can also add custom algorithms and
9
+ * normalization filters to extend its functionality.
10
+ *
11
+ * key features:
12
+ * - built-in support for multiple similarity algorithms
13
+ * - phonetic search with language-specific configurations
14
+ * - batch operations and similarity matrices for large datasets
15
+ * - customizable normalization with global flags and caching
16
+ * - asynchronous support for non-blocking workflows
17
+ *
18
+ * usage:
19
+ * - compare strings for similarity using various algorithms
20
+ * - find the closest match from an array of strings
21
+ * - perform phonetic searches with raw or similarity-based results
22
+ * - generate similarity matrices for cross-comparisons
23
+ *
24
+ * @author Paul Köhler (komed3)
25
+ * @version 2.0.0
26
+ * @license MIT
27
+ */
28
+
29
+ 'use strict';
30
+
31
+ /**
32
+ * module dependencies
33
+ * @private
34
+ */
35
+
36
+ const CmpStr = require( './CmpStr' );
37
+ const CmpStrAsync = require( './CmpStrAsync' );
38
+
39
+ /**
40
+ * module exports
41
+ * @public
42
+ */
43
+
44
+ module.exports = {
45
+ CmpStr,
46
+ CmpStrAsync
47
+ };