cmpstr 2.0.3 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. package/LICENSE +21 -21
  2. package/README.md +75 -503
  3. package/dist/CmpStr.esm.js +4863 -0
  4. package/dist/CmpStr.esm.js.map +1 -0
  5. package/dist/CmpStr.esm.min.js +8 -0
  6. package/dist/CmpStr.esm.min.js.map +1 -0
  7. package/dist/CmpStr.umd.js +4875 -0
  8. package/dist/CmpStr.umd.js.map +1 -0
  9. package/dist/CmpStr.umd.min.js +8 -0
  10. package/dist/CmpStr.umd.min.js.map +1 -0
  11. package/dist/cjs/CmpStr.js +663 -0
  12. package/dist/cjs/CmpStr.js.map +1 -0
  13. package/dist/cjs/CmpStrAsync.js +336 -0
  14. package/dist/cjs/CmpStrAsync.js.map +1 -0
  15. package/dist/cjs/index.js +15 -0
  16. package/dist/cjs/index.js.map +1 -0
  17. package/dist/cjs/metric/Cosine.js +101 -0
  18. package/dist/cjs/metric/Cosine.js.map +1 -0
  19. package/dist/cjs/metric/DamerauLevenshtein.js +110 -0
  20. package/dist/cjs/metric/DamerauLevenshtein.js.map +1 -0
  21. package/dist/cjs/metric/DiceSorensen.js +91 -0
  22. package/dist/cjs/metric/DiceSorensen.js.map +1 -0
  23. package/dist/cjs/metric/Hamming.js +82 -0
  24. package/dist/cjs/metric/Hamming.js.map +1 -0
  25. package/dist/cjs/metric/Jaccard.js +76 -0
  26. package/dist/cjs/metric/Jaccard.js.map +1 -0
  27. package/dist/cjs/metric/JaroWinkler.js +114 -0
  28. package/dist/cjs/metric/JaroWinkler.js.map +1 -0
  29. package/dist/cjs/metric/LCS.js +89 -0
  30. package/dist/cjs/metric/LCS.js.map +1 -0
  31. package/dist/cjs/metric/Levenshtein.js +94 -0
  32. package/dist/cjs/metric/Levenshtein.js.map +1 -0
  33. package/dist/cjs/metric/Metric.js +445 -0
  34. package/dist/cjs/metric/Metric.js.map +1 -0
  35. package/dist/cjs/metric/NeedlemanWunsch.js +95 -0
  36. package/dist/cjs/metric/NeedlemanWunsch.js.map +1 -0
  37. package/dist/cjs/metric/SmithWaterman.js +98 -0
  38. package/dist/cjs/metric/SmithWaterman.js.map +1 -0
  39. package/dist/cjs/metric/qGram.js +91 -0
  40. package/dist/cjs/metric/qGram.js.map +1 -0
  41. package/dist/cjs/phonetic/Cologne.js +112 -0
  42. package/dist/cjs/phonetic/Cologne.js.map +1 -0
  43. package/dist/cjs/phonetic/Metaphone.js +172 -0
  44. package/dist/cjs/phonetic/Metaphone.js.map +1 -0
  45. package/dist/cjs/phonetic/Phonetic.js +413 -0
  46. package/dist/cjs/phonetic/Phonetic.js.map +1 -0
  47. package/dist/cjs/phonetic/Soundex.js +135 -0
  48. package/dist/cjs/phonetic/Soundex.js.map +1 -0
  49. package/dist/cjs/utils/DeepMerge.js +144 -0
  50. package/dist/cjs/utils/DeepMerge.js.map +1 -0
  51. package/dist/cjs/utils/DiffChecker.js +500 -0
  52. package/dist/cjs/utils/DiffChecker.js.map +1 -0
  53. package/dist/cjs/utils/Filter.js +189 -0
  54. package/dist/cjs/utils/Filter.js.map +1 -0
  55. package/dist/cjs/utils/HashTable.js +175 -0
  56. package/dist/cjs/utils/HashTable.js.map +1 -0
  57. package/dist/cjs/utils/Normalizer.js +144 -0
  58. package/dist/cjs/utils/Normalizer.js.map +1 -0
  59. package/dist/cjs/utils/Pool.js +196 -0
  60. package/dist/cjs/utils/Pool.js.map +1 -0
  61. package/dist/cjs/utils/Profiler.js +229 -0
  62. package/dist/cjs/utils/Profiler.js.map +1 -0
  63. package/dist/cjs/utils/Registry.js +148 -0
  64. package/dist/cjs/utils/Registry.js.map +1 -0
  65. package/dist/cjs/utils/TextAnalyzer.js +358 -0
  66. package/dist/cjs/utils/TextAnalyzer.js.map +1 -0
  67. package/dist/esm/CmpStr.js +662 -0
  68. package/dist/esm/CmpStr.js.map +1 -0
  69. package/dist/esm/CmpStrAsync.js +331 -0
  70. package/dist/esm/CmpStrAsync.js.map +1 -0
  71. package/dist/esm/index.js +7 -0
  72. package/dist/esm/index.js.map +1 -0
  73. package/dist/esm/metric/Cosine.js +99 -0
  74. package/dist/esm/metric/Cosine.js.map +1 -0
  75. package/dist/esm/metric/DamerauLevenshtein.js +108 -0
  76. package/dist/esm/metric/DamerauLevenshtein.js.map +1 -0
  77. package/dist/esm/metric/DiceSorensen.js +89 -0
  78. package/dist/esm/metric/DiceSorensen.js.map +1 -0
  79. package/dist/esm/metric/Hamming.js +77 -0
  80. package/dist/esm/metric/Hamming.js.map +1 -0
  81. package/dist/esm/metric/Jaccard.js +74 -0
  82. package/dist/esm/metric/Jaccard.js.map +1 -0
  83. package/dist/esm/metric/JaroWinkler.js +112 -0
  84. package/dist/esm/metric/JaroWinkler.js.map +1 -0
  85. package/dist/esm/metric/LCS.js +87 -0
  86. package/dist/esm/metric/LCS.js.map +1 -0
  87. package/dist/esm/metric/Levenshtein.js +92 -0
  88. package/dist/esm/metric/Levenshtein.js.map +1 -0
  89. package/dist/esm/metric/Metric.js +442 -0
  90. package/dist/esm/metric/Metric.js.map +1 -0
  91. package/dist/esm/metric/NeedlemanWunsch.js +93 -0
  92. package/dist/esm/metric/NeedlemanWunsch.js.map +1 -0
  93. package/dist/esm/metric/SmithWaterman.js +96 -0
  94. package/dist/esm/metric/SmithWaterman.js.map +1 -0
  95. package/dist/esm/metric/qGram.js +89 -0
  96. package/dist/esm/metric/qGram.js.map +1 -0
  97. package/dist/esm/phonetic/Cologne.js +114 -0
  98. package/dist/esm/phonetic/Cologne.js.map +1 -0
  99. package/dist/esm/phonetic/Metaphone.js +174 -0
  100. package/dist/esm/phonetic/Metaphone.js.map +1 -0
  101. package/dist/esm/phonetic/Phonetic.js +409 -0
  102. package/dist/esm/phonetic/Phonetic.js.map +1 -0
  103. package/dist/esm/phonetic/Soundex.js +137 -0
  104. package/dist/esm/phonetic/Soundex.js.map +1 -0
  105. package/dist/esm/utils/DeepMerge.js +139 -0
  106. package/dist/esm/utils/DeepMerge.js.map +1 -0
  107. package/dist/esm/utils/DiffChecker.js +498 -0
  108. package/dist/esm/utils/DiffChecker.js.map +1 -0
  109. package/dist/esm/utils/Filter.js +187 -0
  110. package/dist/esm/utils/Filter.js.map +1 -0
  111. package/dist/esm/utils/HashTable.js +173 -0
  112. package/dist/esm/utils/HashTable.js.map +1 -0
  113. package/dist/esm/utils/Normalizer.js +142 -0
  114. package/dist/esm/utils/Normalizer.js.map +1 -0
  115. package/dist/esm/utils/Pool.js +194 -0
  116. package/dist/esm/utils/Pool.js.map +1 -0
  117. package/dist/esm/utils/Profiler.js +227 -0
  118. package/dist/esm/utils/Profiler.js.map +1 -0
  119. package/dist/esm/utils/Registry.js +142 -0
  120. package/dist/esm/utils/Registry.js.map +1 -0
  121. package/dist/esm/utils/TextAnalyzer.js +356 -0
  122. package/dist/esm/utils/TextAnalyzer.js.map +1 -0
  123. package/dist/types/CmpStr.d.ts +472 -0
  124. package/dist/types/CmpStrAsync.d.ts +233 -0
  125. package/dist/types/index.d.ts +51 -0
  126. package/dist/types/metric/Cosine.d.ts +57 -0
  127. package/dist/types/metric/DamerauLevenshtein.d.ts +50 -0
  128. package/dist/types/metric/DiceSorensen.d.ts +57 -0
  129. package/dist/types/metric/Hamming.d.ts +49 -0
  130. package/dist/types/metric/Jaccard.d.ts +48 -0
  131. package/dist/types/metric/JaroWinkler.d.ts +50 -0
  132. package/dist/types/metric/LCS.d.ts +50 -0
  133. package/dist/types/metric/Levenshtein.d.ts +50 -0
  134. package/dist/types/metric/Metric.d.ts +261 -0
  135. package/dist/types/metric/NeedlemanWunsch.d.ts +47 -0
  136. package/dist/types/metric/SmithWaterman.d.ts +48 -0
  137. package/dist/types/metric/index.d.ts +41 -0
  138. package/dist/types/metric/qGram.d.ts +56 -0
  139. package/dist/types/phonetic/Cologne.d.ts +46 -0
  140. package/dist/types/phonetic/Metaphone.d.ts +50 -0
  141. package/dist/types/phonetic/Phonetic.d.ts +189 -0
  142. package/dist/types/phonetic/Soundex.d.ts +49 -0
  143. package/dist/types/phonetic/index.d.ts +30 -0
  144. package/dist/types/utils/DeepMerge.d.ts +70 -0
  145. package/dist/types/utils/DiffChecker.d.ts +137 -0
  146. package/dist/types/utils/Filter.d.ts +97 -0
  147. package/dist/types/utils/HashTable.d.ts +86 -0
  148. package/dist/types/utils/Normalizer.d.ts +76 -0
  149. package/dist/types/utils/Pool.d.ts +63 -0
  150. package/dist/types/utils/Profiler.d.ts +129 -0
  151. package/dist/types/utils/Registry.d.ts +57 -0
  152. package/dist/types/utils/TextAnalyzer.d.ts +199 -0
  153. package/dist/types/utils/Types.d.ts +313 -0
  154. package/package.json +62 -49
  155. package/src/CmpStr.d.ts +0 -70
  156. package/src/CmpStr.js +0 -917
  157. package/src/CmpStrAsync.d.ts +0 -19
  158. package/src/CmpStrAsync.js +0 -197
  159. package/src/algorithms/cosine.js +0 -86
  160. package/src/algorithms/damerau.js +0 -78
  161. package/src/algorithms/dice.js +0 -65
  162. package/src/algorithms/hamming.js +0 -44
  163. package/src/algorithms/jaccard.js +0 -34
  164. package/src/algorithms/jaroWinkler.js +0 -106
  165. package/src/algorithms/lcs.js +0 -58
  166. package/src/algorithms/levenshtein.js +0 -70
  167. package/src/algorithms/needlemanWunsch.js +0 -72
  168. package/src/algorithms/qGram.js +0 -63
  169. package/src/algorithms/smithWaterman.js +0 -78
  170. package/src/algorithms/soundex.js +0 -152
  171. package/src/index.d.ts +0 -3
  172. package/src/index.js +0 -47
package/LICENSE CHANGED
@@ -1,21 +1,21 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 Paul Köhler (komed3)
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
1
+ MIT License
2
+
3
+ Copyright (c) 2023-2025 Paul Köhler (komed3)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -1,503 +1,75 @@
1
- # CmpStr `v2.0`
2
-
3
- CmpStr is a lightweight and powerful npm package for calculating string similarity, finding the closest matches in arrays, performing phonetic searches, and more. It supports a variety of built-in algorithms (e.g., Levenshtein, Dice-Sørensen, Damerau-Levenshtein, Soundex) and allows users to add custom algorithms and normalization filters.
4
-
5
- **Key Features**
6
-
7
- - Built-in support for multiple similarity algorithms.
8
- - Phonetic search with language-specific configurations (e.g., Soundex).
9
- - Batch operations and similarity matrices for large datasets.
10
- - Customizable normalization with global flags and caching.
11
- - Asynchronous support for non-blocking workflows.
12
- - Extensible with custom algorithms and filters.
13
- - TypeScript declarations for better developer experience.
14
-
15
- ## Installation
16
-
17
- Install the package via npm:
18
-
19
- ```bash
20
- npm install cmpstr
21
- ```
22
-
23
- ## Basic Usage
24
-
25
- Importing the Package:
26
-
27
- ```js
28
- const { CmpStr } = require( 'cmpstr' );
29
- ```
30
-
31
- Example 1: Basic String Similarity
32
-
33
- ```js
34
- const cmp = new CmpStr( 'levenshtein', 'hello' );
35
-
36
- console.log( cmp.test( 'Hallo', { flags: 'i' } ) );
37
- // Output: 0.8
38
- ```
39
-
40
- Example 2: Phonetic Search
41
-
42
- ```js
43
- const cmp = new CmpStr( 'soundex', 'Robert' );
44
-
45
- console.log( cmp.test( 'Rubin', { options: { raw: true } } ) );
46
- // Output: { a: 'R163', b: 'R150' }
47
- ```
48
-
49
- ## Methods
50
-
51
- Creating a new instance of `CmpStr` or `CmpStrAsync` allows passing the algorithm to be used and the base string as optional arguments. Alternatively or later in the process, the `setAlgo` and `setStr` methods can be used for this purpose.
52
-
53
- ### Basics
54
-
55
- #### `isReady()`
56
-
57
- Checks whether string and algorithm are set correctly. Returns `true` if the class is ready to perform similarity checks, `false` otherwise.
58
-
59
- #### `setStr( str )`
60
-
61
- Sets the base string for comparison.
62
-
63
- Parameters:
64
-
65
- `<String> str` – string to set as the base
66
-
67
- #### `getStr()`
68
-
69
- Gets the base string for comparison.
70
-
71
- #### `setFlags( [ flags = '' ] )`
72
-
73
- Set default normalization flags. They will be overwritten by passing `flags` through the configuration object. See description of available flags / normalization options below in the documentation.
74
-
75
- #### `getFlags()`
76
-
77
- Gets the default normalization flags.
78
-
79
- #### `clearCache()`
80
-
81
- Clears the normalization cache.
82
-
83
- ### Algorithms
84
-
85
- #### `listAlgo( [ loadedOnly = false ] )`
86
-
87
- List all registered or loaded similarity algorithms.
88
-
89
- Parameters:
90
-
91
- `<Boolean> loadedOnly` – if true, only loaded algorithm names are returned
92
-
93
- #### `isAlgo( algo )`
94
-
95
- Checks if an algorithm is registered. Returns `true` if so, `false` otherwise.
96
-
97
- Parameters:
98
-
99
- `<String> algo` – name of the algorithm
100
-
101
- #### `setAlgo( algo )`
102
-
103
- Sets the current algorithm to use for similarity calculations.
104
-
105
- Allowed options for built-in algorithms are `cosine`, `damerau`, `dice`, `hamming`, `jaccard`, `jaro`, `lcs`, `levenshtein`, `needlemanWunsch`, `qGram`, `smithWaterman` and `soundex`.
106
-
107
- Parameters:
108
-
109
- `<String> algo` – name of the algorithm
110
-
111
- #### `getAlgo()`
112
-
113
- Gets the current algorithm to use for similarity calculations.
114
-
115
- #### `addAlgo( algo, callback [, useIt = true ] )`
116
-
117
- Adds a new similarity algorithm. Pass the name and a callback function to the `addAlgo()` method; the callback must accept at least two strings and return a number. If `useIt` is `true`, the new algorithm will automatically be set as the current one.
118
-
119
- Parameters:
120
-
121
- `<String> algo` – name of the algorithm
122
- `<Function> callback` – callback function implementing the algorithm
123
- `<Boolean> useIt` – whether to set this algorithm as the current one
124
-
125
- Example:
126
-
127
- ```js
128
- const cmp = new CmpStr();
129
-
130
- cmp.addAlgo( 'customAlgo', ( a, b ) => {
131
- return a === b ? 1 : 0;
132
- } );
133
-
134
- console.log( cmp.compare( 'customAlgo', 'hello', 'hello' ) );
135
- // Output: 1
136
- ```
137
-
138
- #### `rmvAlgo( algo )`
139
-
140
- Removing a registered similarity algorithm.
141
-
142
- Parameters:
143
-
144
- `<String> algo` – name of the algorithm
145
-
146
- ### Filters
147
-
148
- #### `listFilter( [ activeOnly = false ] )`
149
-
150
- List all added or active filter names.
151
-
152
- Parameters:
153
-
154
- `<Boolean> activeOnly` – if true, only names of active filters are returned
155
-
156
- #### `addFilter( name, callback [, priority = 10 ] )`
157
-
158
- Adds a custom normalization filter. It must be given a unique name and a callback function that accepts a string and returns the normalized string. Filters can be prioritized by setting a higher priority (default is `10`).
159
-
160
- Parameters:
161
-
162
- `<String> name` – filter name
163
- `<Function> callback` – callback function implementing the filter
164
- `<Int> priority` – priority of the filter
165
-
166
- Example:
167
-
168
- ```js
169
- const cmp = new CmpStr();
170
-
171
- cmp.addFilter( 'prefix', ( str ) => `prefix_${str}` );
172
- ```
173
-
174
- #### `rmvFilter( name )`
175
-
176
- Removes a custom normalization filter.
177
-
178
- Parameters:
179
-
180
- `<String> name` – filter name
181
-
182
- #### `pauseFilter( name )`
183
-
184
- Pauses a custom normalization filter.
185
-
186
- Parameters:
187
-
188
- `<String> name` – filter name
189
-
190
- #### `resumeFilter( name )`
191
-
192
- Resumes a custom normalization filter.
193
-
194
- Parameters:
195
-
196
- `<String> name` – filter name
197
-
198
- #### `clearFilter( name )`
199
-
200
- Clears normalization filters (removing all of them).
201
-
202
- ### Similarity Comparison
203
-
204
- #### `compare( algo, a, b [, config = {} ] )`
205
-
206
- Compares two strings using the specified algorithm. The method returns either the similarity score as a floating point number between 0 and 1 or raw output, if the algorithm supports it and the user passes `raw=true` through the config options.
207
-
208
- Parameters:
209
-
210
- `<String> algo` – name of the algorithm
211
- `<String> a` – first string
212
- `<String> b` – second string
213
- `<Object> config` – configuration object
214
-
215
- Example:
216
-
217
- ```js
218
- const cmp = new CmpStr();
219
-
220
- console.log( cmp.compare( 'levenshtein', 'hello', 'hallo' ) );
221
- // Output: 0.8
222
- ```
223
-
224
- #### `test( str [, config = {} ] )`
225
-
226
- Tests the similarity between the base string and a given target string. Returns the same as ``compare``.
227
-
228
- Parameters:
229
-
230
- `<String> str` – target string
231
- `<Object> config` – configuration object
232
-
233
- Example:
234
-
235
- ```js
236
- const cmp = new CmpStr( 'levenshtein', 'hello' );
237
-
238
- console.log( cmp.test( 'hallo' ) );
239
- // Output: 0.8
240
- ```
241
-
242
- #### `batchTest( arr [, config = {} ] )`
243
-
244
- Tests the similarity of multiple strings against the base string. Returns an array of objects with the target string and either the similarity score as a floating point number between 0 and 1 or raw output, if the algorithm supports it and the user passes `raw=true` through the config options.
245
-
246
- Parameters:
247
-
248
- `<String[]> arr` – array of strings
249
- `<Object> config` – configuration object
250
-
251
- Example:
252
-
253
- ```js
254
- const cmp = new CmpStr( 'levenshtein', 'hello' );
255
-
256
- console.log( cmp.batchTest( [ 'hallo', 'hola', 'hey' ] ) );
257
- // Output: [ { target: 'hallo', match: 0.8 }, { target: 'hola', match: 0.4 }, { target: 'hey', match: 0.4 } ]
258
- ```
259
-
260
- #### `match( arr [, config = {} ] )`
261
-
262
- Finds strings in an array that exceed a similarity threshold and sorts them by highest similarity. Returns an array of objects containing the target string and the similarity score as a floating point number between 0 and 1.
263
-
264
- Parameters:
265
-
266
- `<String[]> arr` – array of strings
267
- `<Object> config` – configuration object
268
-
269
- Example:
270
-
271
- ```js
272
- const cmp = new CmpStr( 'levenshtein', 'hello' );
273
-
274
- console.log( cmp.match( [ 'hallo', 'hola', 'hey' ], {
275
- threshold: 0.5
276
- } ) );
277
- // Output: [ { target: 'hallo', match: 0.8 } ]
278
- ```
279
-
280
- #### `closest( arr [, config = {} ] )`
281
-
282
- Finds the closest matching string from an array and returns it.
283
-
284
- Parameters:
285
-
286
- `<String[]> arr` – array of strings
287
- `<Object> config` – configuration object
288
-
289
- Example:
290
-
291
- ```js
292
- const cmp = new CmpStr( 'levenshtein', 'hello' );
293
-
294
- console.log( cmp.closest( [ 'hallo', 'hola', 'hey' ] ) );
295
- // Output: 'hallo'
296
- ```
297
-
298
- #### `similarityMatrix( algo, arr [, config = {} ] )`
299
-
300
- Generates a similarity matrix for an array of strings. Returns a 2D array that represents the similarity matrix by floating point numbers between 0 and 1.
301
-
302
- Parameters:
303
-
304
- `<String> algo` – name of the algorithm
305
- `<String[]> arr` – array of strings
306
- `<Object> config` – configuration object
307
-
308
- Example:
309
-
310
- ```js
311
- const cmp = new CmpStr();
312
-
313
- console.log( cmp.similarityMatrix( 'levenshtein', [
314
- 'hello', 'hallo', 'hola'
315
- ] ) );
316
- // Output: [ [ 1, 0.8, 0.4 ], [ 0.8, 1, 0.4 ], [ 0.4, 0.4, 1 ] ]
317
- ```
318
-
319
- ## Customization
320
-
321
- ### Normalize Strings
322
-
323
- The `CmpStr` package allows strings to be normalized before the similarity comparison. Options listed below are available for this and can either be set globally via `setFlags` or passed using the config object, which will overwrite the global flags. Flags are passed as a chained string in any order. For improved performance, normalized strings are stored in the cache, which can be freed using the `clearCache` method. Modifying custom filters automatically deletes the cache.
324
-
325
- #### Supported Flags
326
-
327
- `s` – remove special chars
328
- `w` – collapse whitespaces
329
- `r` – remove repeated chars
330
- `k` – keep only letters
331
- `n` – ignore numbers
332
- `t` – trim whitespaces
333
- `i` – case insensitivity
334
- `d` – decompose unicode
335
- `u` – normalize unicode
336
-
337
- #### `normalize( input [, flags = '' ] )`
338
-
339
- The method for normalizing strings can also be called on its own, without comparing the similarity of two strings. This also applies all filters and reads or writes to the cache. This can be helpful if certain strings should be saved beforehand or if different normalization options need to be tested.
340
-
341
- Parameters:
342
-
343
- `<String|String[]> input` – single string or array of strings to normalize
344
- `<String> flags` – normalization flags
345
-
346
- Example:
347
-
348
- ```js
349
- const cmp = new CmpStr();
350
-
351
- console.log( cmp.normalize( ' he123LLo ', 'nti' ) );
352
- // Output: hello
353
-
354
- console.log( cmp.normalize( [ 'Hello World!', 'CmpStr 123' ], 'nwti' ) );
355
- // Output: [ 'hello world!', 'cmpstr' ]
356
- ```
357
-
358
- ### Configuration Object
359
-
360
- An additional object with optional parameters can be passed to all comparison methods (e.g. `test`, `match`, `closest` etc.) and their asynchronous counterparts. This object includes the ability to pass `flags` for normalization to all methods, as well as the `threshold` parameter for `match` and `matchAsync`.
361
-
362
- It also contains `options` as an object of key-value pairs that are passed to the comparison algorithm. Which additional arguments an algorithm accepts depends on the function exported from the module itself. Further down in this documentation, the various parameters for each algorithm are listed.
363
-
364
- Global config options:
365
-
366
- `<String> flags` – normalization flags
367
- `<Number> threshold` – similarity threshold between 0 and 1
368
- `<Object> options` – options passed to the algorithm
369
-
370
- Example:
371
-
372
- ```js
373
- const cmp = new CmpStr( 'smithWaterman', 'alignment' );
374
-
375
- console.log( cmp.match( [
376
- ' align ment', 'ali gnm ent ', ' alIGNMent'
377
- ], {
378
- flags: 'it',
379
- threshold: 0.8,
380
- options: {
381
- mismatch: -4,
382
- gap: -2
383
- }
384
- } ) );
385
- // Output: [ { target: ' alIGNMent', match: 1 }, { target: ' align ment', match: 0.8... }
386
- ]
387
- ```
388
-
389
- ## Asynchronous Support
390
-
391
- The `CmpStrAsync` class provides an asynchronous wrapper for all comparison methods as well as the string normalization function. It is ideal for large datasets or non-blocking workflows.
392
-
393
- The asynchronous class supports the methods `normalizeAsync`, `compareAsync`, `testAsync`, `batchTestAsync`, `matchAsync`, `closestAsync` and `similarityMatrixAsync`. Each of these methods returns a `Promise`.
394
-
395
- For options, arguments and returned values, see the documentation above.
396
-
397
- Example:
398
-
399
- ```js
400
- const { CmpStrAsync } = require( 'cmpstr' );
401
-
402
- const cmp = new CmpStrAsync( 'dice', 'best' );
403
-
404
- cmp.batchTestAsync( [
405
- 'better', 'bestest', 'the best', 'good', ...
406
- ] ).then( console.log );
407
- ```
408
-
409
- ## Supported Algorithms
410
-
411
- The following algorithms for similarity analysis are natively supported by the CmpStr package. Lazy-loading keeps memory consumption and loading time low, as only the algorithm intended to be used will be loaded as a module.
412
-
413
- ### Similarity Algorithms
414
-
415
- #### Levenshtein Distance – `levenshtein`
416
-
417
- The Levenshtein distance between two strings is the minimum number of single-character edits (i.e. insertions, deletions or substitutions) required to change one word into the other.
418
-
419
- Options:
420
-
421
- `<Boolean> raw` – if true the raw distance is returned
422
-
423
- #### Damerau-Levenshtein – `damerau`
424
-
425
- The Damerau-Levenshtein distance differs from the classical Levenshtein distance by including transpositions among its allowable operations in addition to the three classical single-character edit operations (insertions, deletions and substitutions). Useful for correcting typos.
426
-
427
- Options:
428
-
429
- `<Boolean> raw` – if true the raw distance is returned
430
-
431
- #### Jaro-Winkler – `jaro`
432
-
433
- Jaro-Winkler is a string similarity metric that gives more weight to matching characters at the start of the strings.
434
-
435
- Options:
436
-
437
- `<Boolean> raw` – if true the raw distance is returned
438
-
439
- #### Cosine Similarity – `cosine`
440
-
441
- Cosine similarity is a measure of how similar two vectors are. It's often used in text analysis to compare texts based on the words they contain.
442
-
443
- Options:
444
-
445
- `<String> delimiter` – term delimiter
446
-
447
- #### Dice Coefficient – `dice`
448
-
449
- The Dice-Sørensen index equals twice the number of elements common to both sets divided by the sum of the number of elements in each set. Equivalently the index is the size of the intersection as a fraction of the average size of the two sets.
450
-
451
- #### Jaccard Index – `jaccard`
452
-
453
- The Jaccard Index measures the similarity between two sets by dividing the size of their intersection by the size of their union.
454
-
455
- #### Hamming Distance – `hamming`
456
-
457
- The Hamming distance between two equal-length strings of symbols is the number of positions at which the corresponding symbols are different.
458
-
459
- #### Longest Common Subsequence – `lcs`
460
-
461
- LCS measures the length of the longest subsequence common to both strings.
462
-
463
- #### Needleman-Wunsch – `needlemanWunsch`
464
-
465
- The Needleman-Wunsch algorithm performs global alignment, aligning two strings entirely, including gaps. It is commonly used in bioinformatics.
466
-
467
- Options:
468
-
469
- `<Number> match` – score for a match
470
- `<Number> mismatch` – penalty for a mismatch
471
- `<Number> gap` – penalty for a gap
472
-
473
- #### Smith-Waterman – `smithWaterman`
474
-
475
- The Smith-Waterman algorithm performs local alignment, finding the best matching subsequence between two strings. It is commonly used in bioinformatics.
476
-
477
- Options:
478
-
479
- `<Number> match` – score for a match
480
- `<Number> mismatch` – penalty for a mismatch
481
- `<Number> gap` – penalty for a gap
482
-
483
- #### q-Gram – `qGram`
484
-
485
- Q-gram similarity is a string-matching algorithm that compares two strings by breaking them into substrings of length Q. It's used to determine how similar the two strings are.
486
-
487
- Options:
488
-
489
- `<Int> q` – length of substrings
490
-
491
- ### Phonetic Algorithms
492
-
493
- #### Soundex – `soundex`
494
-
495
- The Soundex algorithm generates a phonetic representation of a string based on how it sounds. It supports predefined setups for English and German and allows users to provide custom options.
496
-
497
- Options:
498
-
499
- `<String> lang` – language code for predefined setups (e.g., `en`, `de`)
500
- `<Boolean> raw` – if true, returns the raw sound index codes
501
- `<Object> mapping` – custom phonetic mapping (overrides predefined)
502
- `<String> exclude` – characters to exclude from the input (overrides predefined)
503
- `<Number> maxLength` – maximum length of the phonetic code
1
+ # CmpStr - Modern String Similarity Package
2
+
3
+ [![GitHub License](https://img.shields.io/github/license/komed3/cmpstr?style=for-the-badge&logo=unlicense&logoColor=fff)](LICENSE)
4
+ [![Static Badge](https://img.shields.io/badge/docs-docs?style=for-the-badge&logo=readthedocs&logoColor=fff&color=blue)](https://github.com/komed3/cmpstr/wiki)
5
+ [![Static Badge](https://img.shields.io/badge/Typescript-support?style=for-the-badge&logo=typescript&logoColor=fff&color=blue)]()
6
+ [![GitHub package.json version](https://img.shields.io/github/package-json/v/komed3/cmpstr?style=for-the-badge&logo=npm&logoColor=fff)](https://npmjs.com/package/cmpstr)
7
+ [![npm bundle size](https://img.shields.io/bundlephobia/min/cmpstr?style=for-the-badge&logo=gitlfs&logoColor=fff)](https://bundlephobia.com/package/cmpstr)
8
+ [![NPM Downloads](https://img.shields.io/npm/dy/cmpstr?style=for-the-badge&logo=transmission&logoColor=fff)](https://npmpackage.info/package/cmpstr?t=downloads)
9
+ [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/komed3/cmpstr/build.yml?style=for-the-badge&logo=educative&logoColor=fff)](https://github.com/komed3/cmpstr/actions/workflows/build.yml)
10
+ [![Static Badge](https://img.shields.io/badge/ESM_%26_CJS-TypeScript?style=for-the-badge&logo=nodedotjs&logoColor=fff&color=purple)]()
11
+ [![Static Badge](https://img.shields.io/badge/UMD_%26_ESM-JavaScript?style=for-the-badge&logo=javascript&logoColor=fff&color=orange)]()
12
+
13
+ **CmpStr** is a TypeScript library for advanced string comparison, similarity measurement, phonetic indexing, and text analysis. It includes implementations of several established algorithms such as Levenshtein, Dice–Sørensen, Damerau–Levenshtein and Soundex. The library has no external dependencies and allows for the integration of custom metrics, phonetic mappings, and normalization filters.
14
+
15
+ CmpStr provides a unified API for single, batch and pairwise operations. It is suitable for a range of use cases in application development and research. The package includes support for both ESM and CommonJS environments, TypeScript type declarations and a browser-compatible JavaScript bundle.
16
+
17
+ Originally launched in 2023 with a minimal feature set, the library was redesigned in 2025 to support a broader set of algorithms and processing features. The current version offers asynchronous operation, configurable normalization and filtering pipelines, phonetic search functionality, and basic tools for string differencing.
18
+
19
+ **Key Features**
20
+
21
+ - Unified API for string similarity, distance measurement and matching
22
+ - Modular metric system with support for algorithms such as Levenshtein, Jaro-Winkler, Cosine etc.
23
+ - Integrated phonetic algorithms (e.g., Soundex, Metaphone) with configurable registry
24
+ - Normalization and filtering pipeline for consistent input processing
25
+ - Single, batch and pairwise comparisons with structured, type-safe results
26
+ - Phonetic-aware search and comparison
27
+ - Utilities for text structure and readability analysis (e.g., syllables, word statistics)
28
+ - Diffing tools with CLI-friendly formatting
29
+ - TypeScript-native with full type declarations and extensibility
30
+ - Supports asynchronous workflows for scalable, non-blocking processing
31
+ - Extensible architecture for integrating custom algorithms and filters
32
+
33
+ ## Getting Started
34
+
35
+ Working with CmpStr is simple and straightforward. The package is installed just like any other using the following command:
36
+
37
+ ```sh
38
+ npm install cmpstr
39
+ ```
40
+
41
+ Minimal usage example:
42
+
43
+ ```ts
44
+ import { CmpStr } from 'cmpstr';
45
+
46
+ const cmp = CmpStr.create().setMetric( 'levenshtein' ).setFlags( 'i' );
47
+
48
+ const result = cmp.test( [ 'hello', 'hola' ], 'Hallo' );
49
+
50
+ console.log( result );
51
+ // { source: 'hello', target: 'Hallo', match: 0.8 }
52
+ ```
53
+
54
+ For asynchronous workloads:
55
+
56
+ ```ts
57
+ import { CmpStrAsync } from 'cmpstr';
58
+
59
+ const cmp = CmpStrAsync.create().setProcessors( {
60
+ phonetic: { algo: 'soundex' }
61
+ } );
62
+
63
+ const result = await cmp.searchAsync( 'Maier', [
64
+ 'Meyer', 'Müller', 'Miller', 'Meyers', 'Meier'
65
+ ] );
66
+
67
+ console.log( result );
68
+ // [ 'Meyer', 'Meier' ]
69
+ ```
70
+
71
+ ## Documentation
72
+
73
+ The full documentation, API reference and advanced usage examples are available in the [GitHub Wiki](https://github.com/komed3/cmpstr/wiki).
74
+
75
+ **LICENSE MIT © 2023-2025 PAUL KÖHLER (KOMED3)**