cmpstr 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/CmpStr.js ADDED
@@ -0,0 +1,784 @@
1
+ /**
2
+ * class CmpStr
3
+ *
4
+ * The CmpStr class is the core of the cmpstr package. It provides methods to calculate
5
+ * string similarity, find the closest matches in arrays, and generate similarity
6
+ * matrices. The class supports built-in algorithms (e.g., Levenshtein, Dice-Sørensen)
7
+ * and allows users to add custom algorithms. It also includes features like string
8
+ * normalization, caching, and extensibility.
9
+ *
10
+ * @author komed3 (Paul Köhler)
11
+ * @license MIT
12
+ */
13
+
14
+ 'use strict';
15
+
16
+ /**
17
+ * module exports
18
+ * @public
19
+ */
20
+
21
+ module.exports = class CmpStr {
22
+
23
+ /**
24
+ * all pre-defined similarity algorithms
25
+ *
26
+ * @private
27
+ * @type {Object}
28
+ */
29
+ #algorithms = {
30
+ cosine: './algorithms/cosine',
31
+ damerau: './algorithms/damerau',
32
+ dice: './algorithms/dice',
33
+ hamming: './algorithms/hamming',
34
+ jaccard: './algorithms/jaccard',
35
+ jaro: './algorithms/jaroWinkler',
36
+ lcs: './algorithms/lcs',
37
+ levenshtein: './algorithms/levenshtein',
38
+ needlemanWunsch: './algorithms/needlemanWunsch',
39
+ qGram: './algorithms/qGram',
40
+ smithWaterman: './algorithms/smithWaterman',
41
+ soundex: './algorithms/soundex'
42
+ };
43
+
44
+ /**
45
+ * normalized strings cache
46
+ *
47
+ * @private
48
+ * @type {Map<String, String>}
49
+ */
50
+ #cache = new Map ();
51
+
52
+ /**
53
+ * added filters for string normalization
54
+ *
55
+ * @private
56
+ * @type {Map<String, Object[]>}
57
+ */
58
+ #filter = new Map ();
59
+
60
+ /**
61
+ * default normalization flags
62
+ * set by setFlags()
63
+ *
64
+ * @public
65
+ * @type {String}
66
+ */
67
+ flags = '';
68
+
69
+ /**
70
+ * base string for comparison
71
+ * set by setStr or constructor()
72
+ *
73
+ * @public
74
+ * @type {String}
75
+ */
76
+ str;
77
+
78
+ /**
79
+ * current algorithm to use for similarity calculations
80
+ * set by setAlgo(), addAlgo() or constructor()
81
+ *
82
+ * @public
83
+ * @type {String}
84
+ */
85
+ algo;
86
+
87
+ /**
88
+ * initializes a CmpStr instance
89
+ * algorithm and base string can be set by initialization
90
+ *
91
+ * @param {String} algo name of the algorithm to use for calculation
92
+ * @param {String} str string to set as the base
93
+ */
94
+ constructor ( algo = undefined, str = undefined ) {
95
+
96
+ if ( algo !== undefined ) {
97
+
98
+ this.setAlgo( algo );
99
+
100
+ }
101
+
102
+ if ( str !== undefined ) {
103
+
104
+ this.setStr( str );
105
+
106
+ }
107
+
108
+ };
109
+
110
+ /**
111
+ * checks whether string and algorithm are set correctly
112
+ *
113
+ * @returns {Boolean} true if ready, false otherwise
114
+ */
115
+ isReady () {
116
+
117
+ return (
118
+ typeof this.algo === 'string' &&
119
+ this.isAlgo( this.algo ) &&
120
+ typeof this.str === 'string' &&
121
+ this.str.length != 0
122
+ );
123
+
124
+ };
125
+
126
+ /**
127
+ * checks ready state and throws an error if not
128
+ *
129
+ * @returns {Boolean} true if ready
130
+ * @throws {Error} if CmpStr is not ready
131
+ */
132
+ _checkReady () {
133
+
134
+ if ( !this.isReady() ) {
135
+
136
+ throw new Error(
137
+ `CmpStr instance is not ready. Ensure the algorithm and base string are set.`
138
+ );
139
+
140
+ }
141
+
142
+ return true;
143
+
144
+ };
145
+
146
+ /**
147
+ * sets the base string for comparison
148
+ *
149
+ * @param {String} str string to set as the base
150
+ * @returns {Boolean} always returns true
151
+ */
152
+ setStr ( str ) {
153
+
154
+ this.str = String ( str );
155
+
156
+ return true;
157
+
158
+ };
159
+
160
+ /**
161
+ * --------------------------------------------------
162
+ * Algorithms
163
+ * --------------------------------------------------
164
+ */
165
+
166
+ /**
167
+ * list all registered similarity algorithms
168
+ *
169
+ * @returns {String[]} array of algorithm names
170
+ */
171
+ listAlgo () {
172
+
173
+ return [ ...Object.keys( this.#algorithms ) ];
174
+
175
+ };
176
+
177
+ /**
178
+ * checks if an algorithm is registered
179
+ *
180
+ * @param {String} algo name of the algorithm
181
+ * @returns {Boolean} true if the algorithm is registered, false otherwise
182
+ */
183
+ isAlgo ( algo ) {
184
+
185
+ return algo in this.#algorithms;
186
+
187
+ };
188
+
189
+ /**
190
+ * sets the current algorithm to use for similarity calculations
191
+ *
192
+ * @param {String} algo name of the algorithm
193
+ * @returns {Boolean} true if the algorithm has been set
194
+ */
195
+ setAlgo ( algo ) {
196
+
197
+ if ( this._loadAlgo( algo ) ) {
198
+
199
+ this.algo = algo;
200
+
201
+ return true;
202
+
203
+ }
204
+
205
+ };
206
+
207
+ /**
208
+ * adds a new similarity algorithm
209
+ *
210
+ * @param {String} algo name of the algorithm
211
+ * @param {Function} callback function implementing the algorithm (must accept two strings and return a number)
212
+ * @param {Boolean} [useIt=true] whether to set this algorithm as the current one
213
+ * @returns {Boolean} returns true if the algorithms was added successfully
214
+ * @throws {Error} if the algorithm cannot be added
215
+ */
216
+ addAlgo ( algo, callback, useIt = true ) {
217
+
218
+ if (
219
+ !this.isAlgo( algo ) &&
220
+ typeof callback === 'function' &&
221
+ callback.length >= 2 &&
222
+ typeof callback.apply( null, [ 'abc', 'abc' ] ) === 'number'
223
+ ) {
224
+
225
+ this.#algorithms[ algo ] = callback;
226
+
227
+ if ( useIt ) {
228
+
229
+ this.setAlgo( algo );
230
+
231
+ }
232
+
233
+ return true;
234
+
235
+ } else {
236
+
237
+ throw new Error (
238
+ `Algorithm "${algo}" cannot be added.`
239
+ );
240
+
241
+ }
242
+
243
+ };
244
+
245
+ /**
246
+ * removes a registered similarity algorithm
247
+ *
248
+ * @param {String} algo name of the algorithm
249
+ * @returns {Boolean} true if the algorithm was removed successfully
250
+ * @throws {Error} if the algorithm is not defined
251
+ */
252
+ rmvAlgo ( algo ) {
253
+
254
+ if ( this.isAlgo( algo ) ) {
255
+
256
+ delete this.#algorithms[ algo ];
257
+
258
+ if ( this.algo === algo ) {
259
+
260
+ /* reset current algorithm if it was removed */
261
+
262
+ this.algo = undefined;
263
+
264
+ }
265
+
266
+ return true;
267
+
268
+ } else {
269
+
270
+ throw new Error (
271
+ `Algorithm "${algo}" is not defined.`
272
+ );
273
+
274
+ }
275
+
276
+ };
277
+
278
+ /**
279
+ * lazy-loads the specified algorithm module
280
+ *
281
+ * @param {String} algo name of the similarity algorithm
282
+ * @returns {Boolean} true if the algorithm is loaded
283
+ * @throws {Error} if the algorithm cannot be loaded or is not defined
284
+ */
285
+ _loadAlgo ( algo ) {
286
+
287
+ if ( this.isAlgo( algo ) ) {
288
+
289
+ let typeOf = typeof this.#algorithms[ algo ];
290
+
291
+ if ( typeOf === 'function' ) {
292
+
293
+ return true;
294
+
295
+ } else if ( typeOf === 'string' ) {
296
+
297
+ try {
298
+
299
+ /* lazy-load algorithm module */
300
+
301
+ this.#algorithms[ algo ] = require(
302
+ this.#algorithms[ algo ]
303
+ );
304
+
305
+ return true;
306
+
307
+ } catch ( err ) {
308
+
309
+ throw new Error (
310
+ `Failed to load algorithm "${algo}".`,
311
+ { cause: err }
312
+ );
313
+
314
+ }
315
+
316
+ } else {
317
+
318
+ throw new Error (
319
+ `Algorithm "${algo}" cannot be loaded.`
320
+ );
321
+
322
+ }
323
+
324
+ } else {
325
+
326
+ throw new Error (
327
+ `Algorithm "${algo}" is not defined.`
328
+ );
329
+
330
+ }
331
+
332
+ };
333
+
334
+ /**
335
+ * --------------------------------------------------
336
+ * Custom Filters
337
+ * --------------------------------------------------
338
+ */
339
+
340
+ /**
341
+ * list all added filters
342
+ *
343
+ * @returns {String[]} array of filter names
344
+ */
345
+ listFilter () {
346
+
347
+ return [ ...this.#filter.keys() ];
348
+
349
+ };
350
+
351
+ /**
352
+ * adds a custom normalization filter
353
+ *
354
+ * @param {String} name filter name
355
+ * @param {Function} callback function implementing the filter (must accept a string and returns a normalized one)
356
+ * @param {Int} [priority=10] priority of the filter (lower numbers are processed first)
357
+ * @returns {Boolean} returns true if the filter was added successfully
358
+ * @throws {Error} if the filter cannot be added
359
+ */
360
+ addFilter ( name, callback, priority = 10 ) {
361
+
362
+ if (
363
+ !this.#filter.has( name ) &&
364
+ typeof callback === 'function' &&
365
+ callback.length == 1 &&
366
+ typeof callback.apply( null, [ 'abc' ] ) === 'string'
367
+ ) {
368
+
369
+ this.#filter.set( name, {
370
+ callback, priority,
371
+ active: true
372
+ } );
373
+
374
+ this.clearCache();
375
+
376
+ return true;
377
+
378
+ } else {
379
+
380
+ throw new Error (
381
+ `Filter "${filter}" cannot be added.`
382
+ );
383
+
384
+ }
385
+
386
+ };
387
+
388
+ /**
389
+ * removes a custom normalization filter
390
+ *
391
+ * @param {String} name filter name
392
+ * @returns {Boolean} true if the filter was removed successfully
393
+ * @throws {Error} if the filter does not exists
394
+ */
395
+ rmvFilter ( name ) {
396
+
397
+ if ( this.#filter.delete( name ) ) {
398
+
399
+ this.clearCache();
400
+
401
+ return true;
402
+
403
+ } else {
404
+
405
+ throw new Error (
406
+ `Filter "${filter}" does not exists.`
407
+ );
408
+
409
+ }
410
+
411
+ };
412
+
413
+ /**
414
+ * pauses a custom normalization filter
415
+ *
416
+ * @param {String} name filter name
417
+ * @returns {Boolean} true if the filter was paused successfully
418
+ * @throws {Error} if the filter does not exists
419
+ */
420
+ pauseFilter ( name ) {
421
+
422
+ if ( this.#filter.has( name ) ) {
423
+
424
+ this.#filter.get( name ).active = false;
425
+
426
+ this.clearCache();
427
+
428
+ return true;
429
+
430
+ } else {
431
+
432
+ throw new Error (
433
+ `Filter "${filter}" does not exists.`
434
+ );
435
+
436
+ }
437
+
438
+ };
439
+
440
+ /**
441
+ * resumes a custom normalization filter
442
+ *
443
+ * @param {String} name filter name
444
+ * @returns {Boolean} true if the filter was resumed successfully
445
+ * @throws {Error} if the filter does not exists
446
+ */
447
+ resumeFilter ( name ) {
448
+
449
+ if ( this.#filter.has( name ) ) {
450
+
451
+ this.#filter.get( name ).active = true;
452
+
453
+ this.clearCache();
454
+
455
+ return true;
456
+
457
+ } else {
458
+
459
+ throw new Error (
460
+ `Filter "${filter}" does not exists.`
461
+ );
462
+
463
+ }
464
+
465
+ };
466
+
467
+ /**
468
+ * clears normalization filters (remove all of them)
469
+ *
470
+ * @returns {Boolean} always returns true
471
+ */
472
+ clearFilter () {
473
+
474
+ this.#filter.clear();
475
+
476
+ this.clearCache();
477
+
478
+ return true;
479
+
480
+ };
481
+
482
+ /**
483
+ * applies all active filters to a string
484
+ *
485
+ * @param {String} str string to process
486
+ * @returns {String} filtered string
487
+ * @throws {Error} if applying filters cause an error
488
+ */
489
+ _applyFilters ( str ) {
490
+
491
+ try {
492
+
493
+ return Array.from( this.#filter.values() ).flat().filter(
494
+ ( filter ) => filter.active
495
+ ).sort(
496
+ ( a, b ) => a.priority - b.priority
497
+ ).reduce(
498
+ ( res, filter ) => filter.callback.apply( null, [ res ] ),
499
+ String ( str )
500
+ );
501
+
502
+ } catch ( err ) {
503
+
504
+ throw new Error (
505
+ `Error while applying filters.`,
506
+ { cause: err }
507
+ );
508
+
509
+ }
510
+
511
+ };
512
+
513
+ /**
514
+ * --------------------------------------------------
515
+ * Normalization
516
+ * --------------------------------------------------
517
+ */
518
+
519
+ /**
520
+ * set default normalization flags
521
+ *
522
+ * @param {String} [flags=''] normalization flags
523
+ * @returns {Boolean} always returns true
524
+ */
525
+ setFlags ( flags = '' ) {
526
+
527
+ this.flags = String ( flags );
528
+
529
+ };
530
+
531
+ /**
532
+ * normalizes a string by chainable options; uses cache to increase
533
+ * performance and custom filters for advanced behavior
534
+ *
535
+ * list of all supported flags:
536
+ *
537
+ * s :: remove special chars
538
+ * w :: collapse whitespaces
539
+ * r :: remove repeated chars
540
+ * k :: keep only letters
541
+ * n :: ignore numbers
542
+ * t :: trim whitespaces
543
+ * i :: case insensitivity
544
+ * d :: decompose unicode
545
+ * u :: normalize unicode
546
+ *
547
+ * @param {String} string string to normalize
548
+ * @param {String} [flags=''] normalization flags
549
+ * @returns {String} normalized string
550
+ * @throws {Error} if normalization cause an error
551
+ */
552
+ normalize ( str, flags = '' ) {
553
+
554
+ let res = String ( str );
555
+
556
+ /* use normalized string from cache to increase performance */
557
+
558
+ let key = `${res}::${flags}`;
559
+
560
+ if ( this.#cache.has( key ) ) {
561
+
562
+ return this.#cache.get( key );
563
+
564
+ }
565
+
566
+ /* apply custom filters */
567
+
568
+ res = this._applyFilters( res );
569
+
570
+ /* normalize using flags */
571
+
572
+ try {
573
+
574
+ if ( flags.includes( 's' ) ) res = res.replace( /[^a-z0-9]/gi, '' );
575
+ if ( flags.includes( 'w' ) ) res = res.replace( /\s+/g, ' ' );
576
+ if ( flags.includes( 'r' ) ) res = res.replace( /(.)\1+/g, '$1' );
577
+ if ( flags.includes( 'k' ) ) res = res.replace( /[^a-z]/gi, '' );
578
+ if ( flags.includes( 'n' ) ) res = res.replace( /[0-9]/g, '' );
579
+ if ( flags.includes( 't' ) ) res = res.trim();
580
+ if ( flags.includes( 'i' ) ) res = res.toLowerCase();
581
+ if ( flags.includes( 'd' ) ) res = res.normalize( 'NFD' ).replace( /[\u0300-\u036f]/g, '' );
582
+ if ( flags.includes( 'u' ) ) res = res.normalize( 'NFC' );
583
+
584
+ } catch ( err ) {
585
+
586
+ throw new Error (
587
+ `Error while normalization.`,
588
+ { cause: err }
589
+ );
590
+
591
+ }
592
+
593
+ /* store the normalized string in the cache */
594
+
595
+ this.#cache.set( key, res );
596
+
597
+ return res;
598
+
599
+ };
600
+
601
+ /**
602
+ * clears the normalization cache
603
+ *
604
+ * @returns {Boolean} always returns true
605
+ */
606
+ clearCache () {
607
+
608
+ this.#cache.clear();
609
+
610
+ return true;
611
+
612
+ };
613
+
614
+ /**
615
+ * --------------------------------------------------
616
+ * Similarity Comparison
617
+ * --------------------------------------------------
618
+ */
619
+
620
+ /**
621
+ * compares two string a and b using the passed algorithm
622
+ *
623
+ * @param {String} algo name of the algorithm
624
+ * @param {String} a string a
625
+ * @param {String} b string b
626
+ * @param {Object} [config={}] config (flags, args)
627
+ * @returns {Mixed} similarity score (0..1) or raw output
628
+ * @throws {Error} if algorithm cause an error
629
+ */
630
+ compare ( algo, a, b, config = {} ) {
631
+
632
+ if ( this._loadAlgo( algo ) ) {
633
+
634
+ /* handle trivial cases */
635
+
636
+ if ( a === b ) return 1; // strings are identical
637
+ if ( a.length < 2 || b.length < 2 ) return 0; // too short to compare
638
+
639
+ /* apply similarity algorithm */
640
+
641
+ const {
642
+ flags = this.flags,
643
+ options = {}
644
+ } = config;
645
+
646
+ try {
647
+
648
+ return this.#algorithms[ algo ].apply( null, [
649
+ this.normalize( a, flags ),
650
+ this.normalize( b, flags ),
651
+ options
652
+ ] );
653
+
654
+ } catch ( err ) {
655
+
656
+ throw new Error (
657
+ `Error in algorithm "${algo}".`,
658
+ { cause: err }
659
+ );
660
+
661
+ }
662
+
663
+ }
664
+
665
+ };
666
+
667
+ /**
668
+ * tests the similarity between the base string and a target string
669
+ * using the current algorithm
670
+ *
671
+ * @param {String} str target string
672
+ * @param {Object} [config={}] config (flags, args)
673
+ * @returns {Mixed} similarity score (0..1) or raw output
674
+ */
675
+ test ( str, config = {} ) {
676
+
677
+ if ( this._checkReady() ) {
678
+
679
+ return this.compare(
680
+ this.algo,
681
+ this.str, str,
682
+ config
683
+ );
684
+
685
+ }
686
+
687
+ };
688
+
689
+ /**
690
+ * tests the similarity of multiple strings against the base string
691
+ *
692
+ * @param {String[]} arr array of strings
693
+ * @param {Object} [config={}] config (flags, args)
694
+ * @returns {Object[]} array of objects, each containing the target string and its similarity score / raw output
695
+ */
696
+ batchTest ( arr, config = {} ) {
697
+
698
+ if ( this._checkReady() ) {
699
+
700
+ return [ ...arr ].map( ( str ) => ( {
701
+ target: str,
702
+ match: this.compare(
703
+ this.algo,
704
+ this.str, str,
705
+ config
706
+ )
707
+ } ) );
708
+
709
+ }
710
+
711
+ };
712
+
713
+ /**
714
+ * finds strings in an array that exceed a similarity threshold
715
+ * returns the array sorted by highest similarity
716
+ *
717
+ * @param {String[]} arr array of strings
718
+ * @param {Object} [config={}] config (flags, threshold, args)
719
+ * @returns {Object[]} array of objects, sorted by highest similarity
720
+ */
721
+ match ( arr, config = {} ) {
722
+
723
+ const { threshold = 0 } = config;
724
+
725
+ delete config?.options?.raw;
726
+
727
+ return this.batchTest(
728
+ arr, config
729
+ ).filter(
730
+ ( r ) => r.match >= threshold
731
+ ).sort(
732
+ ( a, b ) => b.match - a.match
733
+ );
734
+
735
+ };
736
+
737
+ /**
738
+ * finds the closest matching string from an array
739
+ *
740
+ * @param {String[]} arr array of strings
741
+ * @param {Object} [config={}] config (flags, args)
742
+ * @returns {String} closest matching string
743
+ */
744
+ closest ( arr, config = {} ) {
745
+
746
+ let res = this.match(
747
+ arr, config
748
+ );
749
+
750
+ return res.length && res[ 0 ].match > 0
751
+ ? res[ 0 ].target
752
+ : undefined;
753
+
754
+ };
755
+
756
+ /**
757
+ * generate a similarity matrix for an array of strings
758
+ *
759
+ * @param {String} algo name of the algorithm
760
+ * @param {String[]} arr array of strings to cross-compare
761
+ * @param {Object} [config={}] config (flags, args)
762
+ * @returns {Number[][]} 2D array representing the similarity matrix
763
+ */
764
+ similarityMatrix ( algo, arr, config = {} ) {
765
+
766
+ if ( this._loadAlgo( algo ) ) {
767
+
768
+ delete config?.options?.raw;
769
+
770
+ return [ ...arr ].map( ( a, i ) => {
771
+
772
+ return [ ...arr ].map(
773
+ ( b, j ) => i === j ? 1 : this.compare(
774
+ algo, a, b, config
775
+ )
776
+ );
777
+
778
+ } );
779
+
780
+ }
781
+
782
+ };
783
+
784
+ };