cmpstr 3.2.2 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/dist/CmpStr.esm.js +2149 -1721
  2. package/dist/CmpStr.esm.min.js +2 -2
  3. package/dist/CmpStr.umd.js +2028 -1604
  4. package/dist/CmpStr.umd.min.js +2 -2
  5. package/dist/cjs/CmpStr.cjs +100 -51
  6. package/dist/cjs/CmpStrAsync.cjs +35 -18
  7. package/dist/cjs/index.cjs +1 -1
  8. package/dist/cjs/metric/Cosine.cjs +1 -1
  9. package/dist/cjs/metric/DamerauLevenshtein.cjs +1 -1
  10. package/dist/cjs/metric/DiceSorensen.cjs +1 -1
  11. package/dist/cjs/metric/Hamming.cjs +1 -1
  12. package/dist/cjs/metric/Jaccard.cjs +1 -1
  13. package/dist/cjs/metric/JaroWinkler.cjs +1 -1
  14. package/dist/cjs/metric/LCS.cjs +1 -1
  15. package/dist/cjs/metric/Levenshtein.cjs +1 -1
  16. package/dist/cjs/metric/Metric.cjs +40 -22
  17. package/dist/cjs/metric/NeedlemanWunsch.cjs +1 -1
  18. package/dist/cjs/metric/QGram.cjs +1 -1
  19. package/dist/cjs/metric/SmithWaterman.cjs +1 -1
  20. package/dist/cjs/phonetic/Caverphone.cjs +1 -1
  21. package/dist/cjs/phonetic/Cologne.cjs +1 -1
  22. package/dist/cjs/phonetic/Metaphone.cjs +1 -1
  23. package/dist/cjs/phonetic/Phonetic.cjs +27 -15
  24. package/dist/cjs/phonetic/Soundex.cjs +1 -1
  25. package/dist/cjs/root.cjs +4 -2
  26. package/dist/cjs/utils/DeepMerge.cjs +102 -97
  27. package/dist/cjs/utils/DiffChecker.cjs +1 -1
  28. package/dist/cjs/utils/Errors.cjs +22 -19
  29. package/dist/cjs/utils/Filter.cjs +59 -24
  30. package/dist/cjs/utils/HashTable.cjs +44 -29
  31. package/dist/cjs/utils/Normalizer.cjs +57 -28
  32. package/dist/cjs/utils/OptionsValidator.cjs +211 -0
  33. package/dist/cjs/utils/Pool.cjs +27 -13
  34. package/dist/cjs/utils/Profiler.cjs +41 -27
  35. package/dist/cjs/utils/Registry.cjs +5 -5
  36. package/dist/cjs/utils/StructuredData.cjs +83 -53
  37. package/dist/cjs/utils/TextAnalyzer.cjs +1 -1
  38. package/dist/esm/CmpStr.mjs +101 -52
  39. package/dist/esm/CmpStrAsync.mjs +35 -18
  40. package/dist/esm/index.mjs +1 -1
  41. package/dist/esm/metric/Cosine.mjs +1 -1
  42. package/dist/esm/metric/DamerauLevenshtein.mjs +1 -1
  43. package/dist/esm/metric/DiceSorensen.mjs +1 -1
  44. package/dist/esm/metric/Hamming.mjs +1 -1
  45. package/dist/esm/metric/Jaccard.mjs +1 -1
  46. package/dist/esm/metric/JaroWinkler.mjs +1 -1
  47. package/dist/esm/metric/LCS.mjs +1 -1
  48. package/dist/esm/metric/Levenshtein.mjs +1 -1
  49. package/dist/esm/metric/Metric.mjs +40 -22
  50. package/dist/esm/metric/NeedlemanWunsch.mjs +1 -1
  51. package/dist/esm/metric/QGram.mjs +1 -1
  52. package/dist/esm/metric/SmithWaterman.mjs +1 -1
  53. package/dist/esm/phonetic/Caverphone.mjs +1 -1
  54. package/dist/esm/phonetic/Cologne.mjs +1 -1
  55. package/dist/esm/phonetic/Metaphone.mjs +1 -1
  56. package/dist/esm/phonetic/Phonetic.mjs +30 -15
  57. package/dist/esm/phonetic/Soundex.mjs +1 -1
  58. package/dist/esm/root.mjs +3 -3
  59. package/dist/esm/utils/DeepMerge.mjs +103 -94
  60. package/dist/esm/utils/DiffChecker.mjs +1 -1
  61. package/dist/esm/utils/Errors.mjs +22 -19
  62. package/dist/esm/utils/Filter.mjs +59 -24
  63. package/dist/esm/utils/HashTable.mjs +44 -29
  64. package/dist/esm/utils/Normalizer.mjs +57 -28
  65. package/dist/esm/utils/OptionsValidator.mjs +210 -0
  66. package/dist/esm/utils/Pool.mjs +27 -13
  67. package/dist/esm/utils/Profiler.mjs +41 -27
  68. package/dist/esm/utils/Registry.mjs +5 -5
  69. package/dist/esm/utils/StructuredData.mjs +83 -53
  70. package/dist/esm/utils/TextAnalyzer.mjs +1 -1
  71. package/dist/types/CmpStr.d.ts +22 -15
  72. package/dist/types/CmpStrAsync.d.ts +3 -0
  73. package/dist/types/index.d.ts +3 -3
  74. package/dist/types/metric/Metric.d.ts +9 -9
  75. package/dist/types/phonetic/Phonetic.d.ts +4 -3
  76. package/dist/types/root.d.ts +3 -2
  77. package/dist/types/utils/DeepMerge.d.ts +80 -58
  78. package/dist/types/utils/Errors.d.ts +25 -8
  79. package/dist/types/utils/Filter.d.ts +4 -1
  80. package/dist/types/utils/HashTable.d.ts +12 -11
  81. package/dist/types/utils/Normalizer.d.ts +2 -1
  82. package/dist/types/utils/OptionsValidator.d.ts +193 -0
  83. package/dist/types/utils/Profiler.d.ts +9 -28
  84. package/dist/types/utils/StructuredData.d.ts +3 -0
  85. package/dist/types/utils/Types.d.ts +13 -1
  86. package/package.json +14 -5
@@ -1,5 +1,5 @@
1
1
  /**
2
- * CmpStr v3.2.2 build-bb61120-260311
2
+ * CmpStr v3.3.0 build-3699f85-260318
3
3
  * This is a lightweight, fast and well performing library for calculating string similarity.
4
4
  * (c) 2023-2026 Paul Köhler @komed3 / MIT License
5
5
  * Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
@@ -7,19 +7,32 @@
7
7
  class CmpStrError extends Error {
8
8
  code;
9
9
  meta;
10
- cause;
11
10
  when = new Date().toISOString();
12
11
  constructor(code, message, meta, cause) {
13
- super(message);
12
+ super(message, cause !== undefined ? { cause } : undefined);
14
13
  this.name = this.constructor.name;
15
14
  this.code = code;
16
15
  this.meta = meta;
17
- this.cause = cause;
18
16
  if (typeof Error.captureStackTrace === 'function') {
19
17
  Error.captureStackTrace(this, this.constructor);
20
18
  }
21
19
  }
22
- toJSON() {
20
+ format(stack = false) {
21
+ const parts = [`${this.name} [${this.code}]`, this.message];
22
+ if (this.meta)
23
+ for (const _ in this.meta) {
24
+ parts.push(JSON.stringify(this.meta));
25
+ break;
26
+ }
27
+ return (
28
+ parts.join(' - ') +
29
+ (stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
30
+ );
31
+ }
32
+ toString() {
33
+ return this.format(false);
34
+ }
35
+ toJSON(stack = false) {
23
36
  return {
24
37
  name: this.name,
25
38
  code: this.code,
@@ -31,23 +44,11 @@ class CmpStrError extends Error {
31
44
  ? {
32
45
  name: this.cause.name,
33
46
  message: this.cause.message,
34
- stack: this.cause.stack
47
+ stack: stack && this.cause.stack
35
48
  }
36
49
  : this.cause
37
50
  };
38
51
  }
39
- toString(stack = false) {
40
- const parts = [`${this.name} [${this.code}]`, this.message];
41
- if (this.meta && Object.keys(this.meta).length) {
42
- try {
43
- parts.push(JSON.stringify(this.meta));
44
- } catch {}
45
- }
46
- return (
47
- parts.join(' - ') +
48
- (stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
49
- );
50
- }
51
52
  }
52
53
  class CmpStrValidationError extends CmpStrError {
53
54
  constructor(message, meta, cause) {
@@ -73,7 +74,7 @@ class ErrorUtil {
73
74
  static assert(condition, message, meta) {
74
75
  if (!condition) throw new CmpStrUsageError(message, meta);
75
76
  }
76
- static create(err, message, meta) {
77
+ static rethrow(err, message, meta) {
77
78
  if (err instanceof CmpStrError) throw err;
78
79
  throw new CmpStrInternalError(message, meta, err);
79
80
  }
@@ -86,6 +87,7 @@ class ErrorUtil {
86
87
  try {
87
88
  return fn();
88
89
  } catch (err) {
90
+ if (err instanceof CmpStrError) throw err;
89
91
  throw new CmpStrInternalError(message, meta, err);
90
92
  }
91
93
  }
@@ -93,6 +95,7 @@ class ErrorUtil {
93
95
  try {
94
96
  return await fn();
95
97
  } catch (err) {
98
+ if (err instanceof CmpStrError) throw err;
96
99
  throw new CmpStrInternalError(message, meta, err);
97
100
  }
98
101
  }
@@ -108,118 +111,118 @@ var Errors = /*#__PURE__*/ Object.freeze({
108
111
  ErrorUtil: ErrorUtil
109
112
  });
110
113
 
111
- const BRACKET_PATTERN = /\[(\d+)]/g;
112
- const PATH_CACHE = new Map();
113
- function parse(p) {
114
- let cached = PATH_CACHE.get(p);
115
- if (cached) return cached;
116
- const parsed = p
117
- .replace(BRACKET_PATTERN, '.$1')
118
- .split('.')
119
- .map((s) => {
120
- const n = Number(s);
121
- return Number.isInteger(n) && String(n) === s ? n : s;
122
- });
123
- PATH_CACHE.set(p, parsed);
124
- return parsed;
125
- }
126
- function get(t, path, fb) {
127
- let o = t;
128
- for (const k of parse(path)) {
129
- if (o == null || !(k in o)) return fb;
130
- o = o[k];
114
+ class DeepMerge {
115
+ static BRACKET_PATTERN = /\[(\d+)]/g;
116
+ static PATH_CACHE = new Map();
117
+ static walk(obj, keys) {
118
+ let o = obj;
119
+ for (let i = 0; i < keys.length; i++) {
120
+ const k = keys[i];
121
+ if (o == null || !(k in o)) return { exists: false };
122
+ o = o[k];
123
+ }
124
+ return { exists: true, value: o };
125
+ }
126
+ static parse(p) {
127
+ const cached = DeepMerge.PATH_CACHE.get(p);
128
+ if (cached) return cached;
129
+ const parsed = p
130
+ .replace(DeepMerge.BRACKET_PATTERN, '.$1')
131
+ .split('.')
132
+ .map((s) => {
133
+ const n = Number(s);
134
+ return Number.isInteger(n) && String(n) === s ? n : s;
135
+ });
136
+ if (DeepMerge.PATH_CACHE.size > 2000) DeepMerge.PATH_CACHE.clear();
137
+ DeepMerge.PATH_CACHE.set(p, parsed);
138
+ return parsed;
131
139
  }
132
- return o;
133
- }
134
- function has(t, path) {
135
- let o = t;
136
- for (const k of parse(path)) {
137
- if (o == null || !(k in o)) return false;
138
- o = o[k];
140
+ static has(t, path) {
141
+ return DeepMerge.walk(t, DeepMerge.parse(path)).exists;
139
142
  }
140
- return true;
141
- }
142
- function set(t, path, value) {
143
- if (path === '') return value;
144
- const keys = parse(path);
145
- if (t !== undefined && (typeof t !== 'object' || t === null))
146
- throw new CmpStrUsageError(
143
+ static get(t, path, fb) {
144
+ const r = DeepMerge.walk(t, DeepMerge.parse(path));
145
+ return r.exists ? r.value : fb;
146
+ }
147
+ static set(t, path, value) {
148
+ if (path === '') return value;
149
+ const keys = DeepMerge.parse(path);
150
+ ErrorUtil.assert(
151
+ t === undefined || (typeof t === 'object' && t !== null),
147
152
  `Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`,
148
153
  { path: keys[0], target: t }
149
154
  );
150
- const root = t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
151
- let cur = root;
152
- for (let i = 0; i < keys.length - 1; i++) {
153
- const k = keys[i];
154
- let n = cur[k];
155
- if (n != null && typeof n !== 'object')
156
- throw new CmpStrUsageError(
155
+ const root = t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
156
+ let cur = root;
157
+ for (let i = 0; i < keys.length - 1; i++) {
158
+ const k = keys[i];
159
+ let n = cur[k];
160
+ ErrorUtil.assert(
161
+ n == null || typeof n === 'object',
157
162
  `Cannot set property <${keys[i + 1]}> of <${JSON.stringify(n)}>`,
158
163
  { path: keys.slice(0, i + 2), value: n }
159
164
  );
160
- if (n == null)
161
- n = cur[k] = typeof keys[i + 1] === 'number' ? [] : Object.create(null);
162
- cur = n;
163
- }
164
- cur[keys[keys.length - 1]] = value;
165
- return root;
166
- }
167
- function merge(
168
- t = Object.create(null),
169
- o = Object.create(null),
170
- mergeUndefined = false
171
- ) {
172
- const target = t ?? Object.create(null);
173
- Object.keys(o).forEach((k) => {
174
- const val = o[k];
175
- if (!mergeUndefined && val === undefined) return;
176
- if (k === '__proto__' || k === 'constructor') return;
177
- if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
178
- const existing = target[k];
179
- target[k] = merge(
180
- existing !== null &&
181
- typeof existing === 'object' &&
182
- !Array.isArray(existing)
183
- ? existing
184
- : Object.create(null),
185
- val,
186
- mergeUndefined
187
- );
188
- } else target[k] = val;
189
- });
190
- return target;
191
- }
192
- function rmv(t, path, preserveEmpty = false) {
193
- const keys = parse(path);
194
- const remove = (obj, i = 0) => {
195
- const key = keys[i];
196
- if (!obj || typeof obj !== 'object') return false;
197
- if (i === keys.length - 1) return delete obj[key];
198
- if (!remove(obj[key], i + 1)) return false;
199
- if (!preserveEmpty) {
200
- const val = obj[key];
201
- if (
202
- typeof val === 'object' &&
203
- ((Array.isArray(val) && val.every((v) => v == null)) ||
204
- (!Array.isArray(val) && Object.keys(val).length === 0))
205
- )
206
- delete obj[key];
165
+ if (n == null)
166
+ n = cur[k] = typeof keys[i + 1] === 'number' ? [] : Object.create(null);
167
+ cur = n;
207
168
  }
208
- return true;
209
- };
210
- remove(t);
211
- return t;
169
+ cur[keys[keys.length - 1]] = value;
170
+ return root;
171
+ }
172
+ static rmv(t, path, preserveEmpty = false) {
173
+ const keys = DeepMerge.parse(path);
174
+ const remove = (obj, i = 0) => {
175
+ const key = keys[i];
176
+ if (!obj || typeof obj !== 'object') return false;
177
+ if (i === keys.length - 1) return delete obj[key];
178
+ if (!remove(obj[key], i + 1)) return false;
179
+ if (!preserveEmpty) {
180
+ const val = obj[key];
181
+ let empty = true;
182
+ if (typeof val === 'object') {
183
+ if (Array.isArray(val))
184
+ for (let i = 0; i < val.length; i++) {
185
+ if (val[i] != null) {
186
+ empty = false;
187
+ break;
188
+ }
189
+ }
190
+ else empty = false;
191
+ }
192
+ if (empty) delete obj[key];
193
+ }
194
+ return true;
195
+ };
196
+ remove(t);
197
+ return t;
198
+ }
199
+ static merge(
200
+ t = Object.create(null),
201
+ o = Object.create(null),
202
+ mergeUndefined = false
203
+ ) {
204
+ const target = t ?? Object.create(null);
205
+ for (const k in o) {
206
+ const val = o[k];
207
+ if (!mergeUndefined && val === undefined) continue;
208
+ if (k === '__proto__' || k === 'constructor') continue;
209
+ if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
210
+ const existing = target[k];
211
+ target[k] = DeepMerge.merge(
212
+ existing !== null &&
213
+ typeof existing === 'object' &&
214
+ !Array.isArray(existing)
215
+ ? existing
216
+ : Object.create(null),
217
+ val,
218
+ mergeUndefined
219
+ );
220
+ } else target[k] = val;
221
+ }
222
+ return target;
223
+ }
212
224
  }
213
225
 
214
- var DeepMerge = /*#__PURE__*/ Object.freeze({
215
- __proto__: null,
216
- get: get,
217
- has: has,
218
- merge: merge,
219
- rmv: rmv,
220
- set: set
221
- });
222
-
223
226
  class DiffChecker {
224
227
  a;
225
228
  b;
@@ -516,20 +519,32 @@ class DiffChecker {
516
519
  }
517
520
 
518
521
  class Filter {
522
+ static IDENTITY = (s) => s;
519
523
  static filters = new Map();
520
524
  static pipeline = new Map();
521
- static getPipeline(hook) {
525
+ static getPipeline(hook, force = false) {
522
526
  return ErrorUtil.wrap(
523
527
  () => {
524
- const cached = Filter.pipeline.get(hook);
525
- if (cached) return cached;
528
+ if (!force) {
529
+ const cached = Filter.pipeline.get(hook);
530
+ if (cached) return cached;
531
+ }
526
532
  const filter = Filter.filters.get(hook);
527
- if (!filter) return (s) => s;
528
- const pipeline = Array.from(filter.values())
529
- .filter((f) => f.active)
530
- .sort((a, b) => a.priority - b.priority)
531
- .map((f) => f.fn);
532
- const fn = (input) => pipeline.reduce((v, f) => f(v), input);
533
+ if (!filter) {
534
+ Filter.pipeline.set(hook, Filter.IDENTITY);
535
+ return Filter.IDENTITY;
536
+ }
537
+ const pipeline = [];
538
+ for (const f of filter.values()) if (f.active) pipeline.push(f);
539
+ pipeline.sort((a, b) => a.priority - b.priority);
540
+ const fn =
541
+ pipeline.length === 0
542
+ ? Filter.IDENTITY
543
+ : (input) => {
544
+ let v = input;
545
+ for (let i = 0; i < pipeline.length; i++) v = pipeline[i].fn(v);
546
+ return v;
547
+ };
533
548
  Filter.pipeline.set(hook, fn);
534
549
  return fn;
535
550
  },
@@ -547,9 +562,16 @@ class Filter {
547
562
  const filter = Filter.filters.get(hook) ?? new Map();
548
563
  const index = filter.get(id);
549
564
  if (index && !index.overrideable) return false;
565
+ if (
566
+ index &&
567
+ index.fn === fn &&
568
+ index.priority === priority &&
569
+ index.active === active
570
+ )
571
+ return true;
550
572
  filter.set(id, { id, fn, priority, active, overrideable });
551
573
  Filter.filters.set(hook, filter);
552
- Filter.pipeline.delete(hook);
574
+ Filter.getPipeline(hook, true);
553
575
  return true;
554
576
  },
555
577
  `Error adding filter <${id}> to hook <${hook}>`,
@@ -557,19 +579,28 @@ class Filter {
557
579
  );
558
580
  }
559
581
  static remove(hook, id) {
560
- Filter.pipeline.delete(hook);
561
582
  const filter = Filter.filters.get(hook);
562
- return filter ? filter.delete(id) : false;
583
+ if (!filter || !filter.delete(id)) return false;
584
+ Filter.getPipeline(hook, true);
585
+ return true;
563
586
  }
564
587
  static pause(hook, id) {
565
- Filter.pipeline.delete(hook);
566
- const f = Filter.filters.get(hook)?.get(id);
567
- return !!(f && ((f.active = false), true));
588
+ const filter = Filter.filters.get(hook);
589
+ if (!filter) return false;
590
+ const f = filter.get(id);
591
+ if (!f || !f.active) return false;
592
+ f.active = false;
593
+ Filter.getPipeline(hook, true);
594
+ return true;
568
595
  }
569
596
  static resume(hook, id) {
570
- Filter.pipeline.delete(hook);
571
- const f = Filter.filters.get(hook)?.get(id);
572
- return !!(f && ((f.active = true), true));
597
+ const filter = Filter.filters.get(hook);
598
+ if (!filter) return false;
599
+ const f = filter.get(id);
600
+ if (!f || f.active) return false;
601
+ f.active = true;
602
+ Filter.getPipeline(hook, true);
603
+ return true;
573
604
  }
574
605
  static list(hook, active = false) {
575
606
  const filter = Filter.filters.get(hook);
@@ -582,7 +613,11 @@ class Filter {
582
613
  return ErrorUtil.wrap(
583
614
  () => {
584
615
  const fn = Filter.getPipeline(hook);
585
- return Array.isArray(input) ? input.map(fn) : fn(input);
616
+ if (typeof input === 'string') return fn(input);
617
+ const arr = input;
618
+ const out = new Array(arr.length);
619
+ for (let i = 0; i < arr.length; i++) out[i] = fn(arr[i]);
620
+ return out;
586
621
  },
587
622
  `Error applying filters for hook <${hook}>`,
588
623
  { hook, input }
@@ -592,16 +627,19 @@ class Filter {
592
627
  return ErrorUtil.wrapAsync(
593
628
  async () => {
594
629
  const fn = Filter.getPipeline(hook);
595
- return Array.isArray(input)
596
- ? Promise.all(input.map(fn))
597
- : Promise.resolve(fn(input));
630
+ if (typeof input === 'string') return Promise.resolve(fn(input));
631
+ const arr = input;
632
+ const out = new Array(arr.length);
633
+ for (let i = 0; i < arr.length; i++)
634
+ out[i] = Promise.resolve(fn(arr[i]));
635
+ return Promise.all(out);
598
636
  },
599
637
  `Error applying filters for hook <${hook}>`,
600
638
  { hook, input }
601
639
  );
602
640
  }
603
641
  static clear(hook) {
604
- Filter.pipeline.clear();
642
+ Filter.clearPipeline();
605
643
  if (hook) Filter.filters.delete(hook);
606
644
  else Filter.filters.clear();
607
645
  }
@@ -615,25 +653,21 @@ class Hasher {
615
653
  static HASH_OFFSET = 0x811c9dc5;
616
654
  static fastFNV1a(str) {
617
655
  const len = str.length;
656
+ const limit = len & -4;
618
657
  let hash = this.HASH_OFFSET;
619
- const chunks = Math.floor(len / 4);
620
- for (let i = 0; i < chunks; i++) {
621
- const pos = i * 4;
658
+ let i = 0;
659
+ for (; i < limit; i += 4) {
622
660
  const chunk =
623
- str.charCodeAt(pos) |
624
- (str.charCodeAt(pos + 1) << 8) |
625
- (str.charCodeAt(pos + 2) << 16) |
626
- (str.charCodeAt(pos + 3) << 24);
661
+ str.charCodeAt(i) |
662
+ (str.charCodeAt(i + 1) << 8) |
663
+ (str.charCodeAt(i + 2) << 16) |
664
+ (str.charCodeAt(i + 3) << 24);
627
665
  hash ^= chunk;
628
666
  hash = Math.imul(hash, this.FNV_PRIME);
629
667
  }
630
- const remaining = len % 4;
631
- if (remaining > 0) {
632
- const pos = chunks * 4;
633
- for (let i = 0; i < remaining; i++) {
634
- hash ^= str.charCodeAt(pos + i);
635
- hash = Math.imul(hash, this.FNV_PRIME);
636
- }
668
+ for (; i < len; i++) {
669
+ hash ^= str.charCodeAt(i);
670
+ hash = Math.imul(hash, this.FNV_PRIME);
637
671
  }
638
672
  hash ^= hash >>> 16;
639
673
  hash *= 0x85ebca6b;
@@ -644,32 +678,51 @@ class Hasher {
644
678
  }
645
679
  }
646
680
  class HashTable {
647
- LRU;
681
+ FIFO;
682
+ maxSize;
648
683
  static MAX_LEN = 2048;
649
- static TABLE_SIZE = 10_000;
650
684
  table = new Map();
651
- constructor(LRU = true) {
652
- this.LRU = LRU;
685
+ constructor(FIFO = true, maxSize = 10000) {
686
+ this.FIFO = FIFO;
687
+ this.maxSize = maxSize;
653
688
  }
654
689
  key(label, strs, sorted = false) {
655
- for (const str of strs) if (str.length > HashTable.MAX_LEN) return false;
656
- const hashes = strs.map((s) => Hasher.fastFNV1a(s));
657
- return [label, ...(sorted ? hashes.sort() : hashes)].join('-');
690
+ const n = strs.length;
691
+ const hashes = new Array(n);
692
+ for (let i = 0; i < n; i++) {
693
+ const s = strs[i];
694
+ if (s.length > HashTable.MAX_LEN) return false;
695
+ hashes[i] = Hasher.fastFNV1a(s);
696
+ }
697
+ if (sorted) hashes.sort((a, b) => a - b);
698
+ let key = label;
699
+ for (let i = 0; i < hashes.length; i++) key += '-' + hashes[i];
700
+ return key;
701
+ }
702
+ has(key) {
703
+ return this.table.has(key);
704
+ }
705
+ get(key) {
706
+ return this.table.get(key);
658
707
  }
659
- has = (key) => this.table.has(key);
660
- get = (key) => this.table.get(key);
661
708
  set(key, entry, update = true) {
662
709
  if (!update && this.table.has(key)) return false;
663
- while (!this.table.has(key) && this.table.size >= HashTable.TABLE_SIZE) {
664
- if (!this.LRU) return false;
710
+ if (!this.table.has(key) && this.table.size >= this.maxSize) {
711
+ if (!this.FIFO) return false;
665
712
  this.table.delete(this.table.keys().next().value);
666
713
  }
667
714
  this.table.set(key, entry);
668
715
  return true;
669
716
  }
670
- delete = (key) => this.table.delete(key);
671
- clear = () => this.table.clear();
672
- size = () => this.table.size;
717
+ delete(key) {
718
+ return this.table.delete(key);
719
+ }
720
+ clear() {
721
+ this.table.clear();
722
+ }
723
+ size() {
724
+ return this.table.size;
725
+ }
673
726
  }
674
727
 
675
728
  class Normalizer {
@@ -688,25 +741,49 @@ class Normalizer {
688
741
  static getPipeline(flags) {
689
742
  return ErrorUtil.wrap(
690
743
  () => {
691
- if (Normalizer.pipeline.has(flags))
692
- return Normalizer.pipeline.get(flags);
744
+ const cached = Normalizer.pipeline.get(flags);
745
+ if (cached) return cached;
693
746
  const { REGEX } = Normalizer;
694
- const steps = [
695
- ['d', (s) => s.normalize('NFD')],
696
- ['i', (s) => s.toLowerCase()],
697
- ['k', (s) => s.replace(REGEX.nonLetters, '')],
698
- ['n', (s) => s.replace(REGEX.nonNumbers, '')],
699
- ['r', (s) => s.replace(REGEX.doubleChars, '$1')],
700
- ['s', (s) => s.replace(REGEX.specialChars, '')],
701
- ['t', (s) => s.trim()],
702
- ['u', (s) => s.normalize('NFC')],
703
- ['w', (s) => s.replace(REGEX.whitespace, ' ')],
704
- ['x', (s) => s.normalize('NFKC')]
705
- ];
706
- const pipeline = steps
707
- .filter(([f]) => flags.includes(f))
708
- .map(([, fn]) => fn);
709
- const fn = (s) => pipeline.reduce((v, f) => f(v), s);
747
+ const steps = [];
748
+ for (let i = 0; i < flags.length; i++) {
749
+ switch (flags[i]) {
750
+ case 'd':
751
+ steps.push((s) => s.normalize('NFD'));
752
+ break;
753
+ case 'i':
754
+ steps.push((s) => s.toLowerCase());
755
+ break;
756
+ case 'k':
757
+ steps.push((s) => s.replace(REGEX.nonLetters, ''));
758
+ break;
759
+ case 'n':
760
+ steps.push((s) => s.replace(REGEX.nonNumbers, ''));
761
+ break;
762
+ case 'r':
763
+ steps.push((s) => s.replace(REGEX.doubleChars, '$1'));
764
+ break;
765
+ case 's':
766
+ steps.push((s) => s.replace(REGEX.specialChars, ''));
767
+ break;
768
+ case 't':
769
+ steps.push((s) => s.trim());
770
+ break;
771
+ case 'u':
772
+ steps.push((s) => s.normalize('NFC'));
773
+ break;
774
+ case 'w':
775
+ steps.push((s) => s.replace(REGEX.whitespace, ' '));
776
+ break;
777
+ case 'x':
778
+ steps.push((s) => s.normalize('NFKC'));
779
+ break;
780
+ }
781
+ }
782
+ const fn = (input) => {
783
+ let v = input;
784
+ for (let i = 0; i < steps.length; i++) v = steps[i](v);
785
+ return v;
786
+ };
710
787
  Normalizer.pipeline.set(flags, fn);
711
788
  return fn;
712
789
  },
@@ -714,18 +791,23 @@ class Normalizer {
714
791
  { flags }
715
792
  );
716
793
  }
717
- static normalize(input, flags) {
794
+ static normalize(input, flags, normalizedFlags) {
718
795
  return ErrorUtil.wrap(
719
796
  () => {
720
797
  if (!flags || typeof flags !== 'string' || !input) return input;
721
- flags = this.canonicalFlags(flags);
722
- if (Array.isArray(input))
723
- return input.map((s) => Normalizer.normalize(s, flags));
724
- const key = Normalizer.cache.key(flags, [input]);
725
- if (key && Normalizer.cache.has(key)) return Normalizer.cache.get(key);
726
- const res = Normalizer.getPipeline(flags)(input);
727
- if (key) Normalizer.cache.set(key, res);
728
- return res;
798
+ flags = normalizedFlags ?? this.canonicalFlags(flags);
799
+ const pipeline = Normalizer.getPipeline(flags);
800
+ const normalizeOne = (s) => {
801
+ const key = Normalizer.cache.key(flags, [s]);
802
+ if (key && Normalizer.cache.has(key))
803
+ return Normalizer.cache.get(key);
804
+ const res = pipeline(s);
805
+ if (key) Normalizer.cache.set(key, res);
806
+ return res;
807
+ };
808
+ return Array.isArray(input)
809
+ ? input.map(normalizeOne)
810
+ : normalizeOne(input);
729
811
  },
730
812
  `Failed to normalize input with flags: ${flags}`,
731
813
  { input, flags }
@@ -749,17 +831,143 @@ class Normalizer {
749
831
  }
750
832
  }
751
833
 
834
+ class RingPool {
835
+ maxSize;
836
+ buffers = [];
837
+ pointer = 0;
838
+ constructor(maxSize) {
839
+ this.maxSize = maxSize;
840
+ }
841
+ acquire(minSize, allowOversize) {
842
+ return ErrorUtil.wrap(
843
+ () => {
844
+ const buffers = this.buffers;
845
+ const len = buffers.length;
846
+ for (let i = 0; i < len; i++) {
847
+ const idx = (this.pointer + i) % len;
848
+ const item = buffers[idx];
849
+ const size = item.size;
850
+ if (size >= minSize && (allowOversize || size === minSize)) {
851
+ this.pointer = (idx + 1) % len;
852
+ return item;
853
+ }
854
+ }
855
+ return null;
856
+ },
857
+ `Failed to acquire buffer of size >= ${minSize} from pool`,
858
+ { minSize, allowOversize }
859
+ );
860
+ }
861
+ release(item) {
862
+ ErrorUtil.wrap(
863
+ () => {
864
+ const buffers = this.buffers;
865
+ if (buffers.length < this.maxSize) {
866
+ buffers.push(item);
867
+ return;
868
+ }
869
+ buffers[this.pointer] = item;
870
+ this.pointer = (this.pointer + 1) % this.maxSize;
871
+ },
872
+ `Failed to release buffer back to pool`,
873
+ { item }
874
+ );
875
+ }
876
+ clear() {
877
+ this.buffers = [];
878
+ this.pointer = 0;
879
+ }
880
+ }
881
+ class Pool {
882
+ static CONFIG = {
883
+ int32: {
884
+ type: 'int32',
885
+ maxSize: 64,
886
+ maxItemSize: 2048,
887
+ allowOversize: true
888
+ },
889
+ 'arr[]': {
890
+ type: 'arr[]',
891
+ maxSize: 4,
892
+ maxItemSize: 1024,
893
+ allowOversize: false
894
+ },
895
+ 'number[]': {
896
+ type: 'number[]',
897
+ maxSize: 16,
898
+ maxItemSize: 1024,
899
+ allowOversize: false
900
+ },
901
+ 'string[]': {
902
+ type: 'string[]',
903
+ maxSize: 2,
904
+ maxItemSize: 1024,
905
+ allowOversize: false
906
+ },
907
+ set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
908
+ map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
909
+ };
910
+ static POOLS = {
911
+ int32: new RingPool(64),
912
+ 'arr[]': new RingPool(4),
913
+ 'number[]': new RingPool(16),
914
+ 'string[]': new RingPool(2),
915
+ set: new RingPool(8),
916
+ map: new RingPool(8)
917
+ };
918
+ static allocate(type, size) {
919
+ switch (type) {
920
+ case 'int32':
921
+ return new Int32Array(size);
922
+ case 'arr[]':
923
+ return new Array(size);
924
+ case 'number[]':
925
+ return new Float64Array(size);
926
+ case 'string[]':
927
+ return new Array(size);
928
+ case 'set':
929
+ return new Set();
930
+ case 'map':
931
+ return new Map();
932
+ }
933
+ }
934
+ static acquire(type, size) {
935
+ const CONFIG = this.CONFIG[type];
936
+ if (!CONFIG)
937
+ throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
938
+ if (size > CONFIG.maxItemSize) return this.allocate(type, size);
939
+ const item = this.POOLS[type].acquire(size, CONFIG.allowOversize);
940
+ if (item)
941
+ return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
942
+ return this.allocate(type, size);
943
+ }
944
+ static acquireMany(type, sizes) {
945
+ const out = new Array(sizes.length);
946
+ for (let i = 0; i < sizes.length; i++)
947
+ out[i] = this.acquire(type, sizes[i]);
948
+ return out;
949
+ }
950
+ static release(type, buffer, size) {
951
+ const CONFIG = this.CONFIG[type];
952
+ if (!CONFIG)
953
+ throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
954
+ if (size <= CONFIG.maxItemSize) this.POOLS[type].release({ buffer, size });
955
+ }
956
+ }
957
+
752
958
  class Profiler {
753
959
  active;
754
960
  static ENV;
755
961
  static instance;
756
962
  nowFn;
757
963
  memFn;
758
- store = new Set();
964
+ store = [];
965
+ last;
759
966
  totalTime = 0;
760
967
  totalMem = 0;
761
968
  static detectEnv() {
762
- if (typeof process !== 'undefined') Profiler.ENV = 'nodejs';
969
+ if (typeof process !== 'undefined' && process.versions?.node)
970
+ Profiler.ENV = 'nodejs';
763
971
  else if (typeof performance !== 'undefined') Profiler.ENV = 'browser';
764
972
  else Profiler.ENV = 'unknown';
765
973
  }
@@ -771,7 +979,7 @@ class Profiler {
771
979
  this.active = active;
772
980
  switch (Profiler.ENV) {
773
981
  case 'nodejs':
774
- this.nowFn = () => Number(process.hrtime.bigint()) / 1e6;
982
+ this.nowFn = () => Number(process.hrtime.bigint()) * 1e-6;
775
983
  this.memFn = () => process.memoryUsage().heapUsed;
776
984
  break;
777
985
  case 'browser':
@@ -784,40 +992,52 @@ class Profiler {
784
992
  break;
785
993
  }
786
994
  }
787
- now = () => this.nowFn();
788
- mem = () => this.memFn();
789
- profile(fn, meta) {
790
- const startTime = this.now(),
791
- startMem = this.mem();
792
- const res = fn();
793
- const deltaTime = this.now() - startTime,
794
- deltaMem = this.mem() - startMem;
795
- this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
796
- ((this.totalTime += deltaTime), (this.totalMem += deltaMem));
797
- return res;
995
+ storeRes(entry) {
996
+ this.store.push((this.last = entry));
997
+ this.totalTime += entry.time;
998
+ this.totalMem += entry.mem;
798
999
  }
799
- enable = () => {
1000
+ enable() {
800
1001
  this.active = true;
801
- };
802
- disable = () => {
1002
+ }
1003
+ disable() {
803
1004
  this.active = false;
804
- };
1005
+ }
805
1006
  clear() {
806
- this.store.clear();
1007
+ this.store.length = 0;
1008
+ this.last = undefined;
807
1009
  this.totalTime = 0;
808
1010
  this.totalMem = 0;
809
1011
  }
810
1012
  run(fn, meta = {}) {
811
- return this.active ? this.profile(fn, meta) : fn();
1013
+ if (!this.active) return fn();
1014
+ const startTime = this.nowFn(),
1015
+ startMem = this.memFn();
1016
+ const res = fn();
1017
+ const deltaTime = this.nowFn() - startTime,
1018
+ deltaMem = this.memFn() - startMem;
1019
+ this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
1020
+ return res;
812
1021
  }
813
1022
  async runAsync(fn, meta = {}) {
814
- return this.active
815
- ? this.profile(async () => await fn(), meta)
816
- : await fn();
1023
+ if (!this.active) return fn();
1024
+ const startTime = this.nowFn(),
1025
+ startMem = this.memFn();
1026
+ const res = await fn();
1027
+ const deltaTime = this.nowFn() - startTime,
1028
+ deltaMem = this.memFn() - startMem;
1029
+ this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
1030
+ return res;
1031
+ }
1032
+ getAll() {
1033
+ return [...this.store];
1034
+ }
1035
+ getLast() {
1036
+ return this.last;
1037
+ }
1038
+ getTotal() {
1039
+ return { time: this.totalTime, mem: this.totalMem };
817
1040
  }
818
- getAll = () => [...this.store];
819
- getLast = () => this.getAll().pop();
820
- getTotal = () => ({ time: this.totalTime, mem: this.totalMem });
821
1041
  services = Object.freeze({
822
1042
  enable: this.enable.bind(this),
823
1043
  disable: this.disable.bind(this),
@@ -893,1368 +1113,1134 @@ function resolveCls(reg, cls) {
893
1113
  throw new CmpStrNotFoundError(`Registry <${reg}> does not exist`, {
894
1114
  registry: reg
895
1115
  });
896
- return typeof cls === 'string' ? registry[reg]?.get(cls) : cls;
1116
+ return typeof cls === 'string' ? registry[reg].get(cls) : cls;
897
1117
  }
898
1118
  function createFromRegistry(reg, cls, ...args) {
899
- cls = resolveCls(reg, cls);
1119
+ const ctor = resolveCls(reg, cls);
900
1120
  return ErrorUtil.wrap(
901
- () => new cls(...args),
902
- `Failed to create instance of class <${cls.name ?? cls}> from registry <${reg}>`,
1121
+ () => new ctor(...args),
1122
+ `Failed to create instance of class <${ctor.name ?? cls}> from registry <${reg}>`,
903
1123
  { registry: reg, class: cls, args }
904
1124
  );
905
1125
  }
906
1126
 
907
- class RingPool {
908
- maxSize;
909
- buffers = [];
910
- pointer = 0;
911
- constructor(maxSize) {
912
- this.maxSize = maxSize;
1127
+ const profiler$2 = Profiler.getInstance();
1128
+ class Metric {
1129
+ static cache = new HashTable();
1130
+ metric;
1131
+ a;
1132
+ b;
1133
+ origA = [];
1134
+ origB = [];
1135
+ options;
1136
+ optKey;
1137
+ symmetric;
1138
+ results;
1139
+ static clear() {
1140
+ this.cache.clear();
913
1141
  }
914
- acquire(minSize, allowOversize) {
915
- return ErrorUtil.wrap(
916
- () => {
917
- const len = this.buffers.length;
918
- for (let i = 0; i < len; i++) {
919
- const idx = (this.pointer + i) & (len - 1);
920
- const item = this.buffers[idx];
921
- if (
922
- item.size >= minSize &&
923
- (allowOversize || item.size === minSize)
924
- ) {
925
- this.pointer = (idx + 1) & (len - 1);
926
- return item;
927
- }
928
- }
929
- return null;
930
- },
931
- `Failed to acquire buffer of size >= ${minSize} from pool`,
932
- { minSize, allowOversize }
933
- );
1142
+ static swap(a, b, m, n) {
1143
+ return m > n ? [b, a, n, m] : [a, b, m, n];
934
1144
  }
935
- release(item) {
936
- ErrorUtil.wrap(
937
- () => {
938
- if (this.buffers.length < this.maxSize)
939
- return void [this.buffers.push(item)];
940
- this.buffers[this.pointer] = item;
941
- this.pointer = (this.pointer + 1) % this.maxSize;
942
- },
943
- `Failed to release buffer back to pool`,
944
- { item }
1145
+ static clamp(res) {
1146
+ return Math.max(0, Math.min(1, res));
1147
+ }
1148
+ constructor(metric, a, b, opt = {}, symmetric = false) {
1149
+ this.metric = metric;
1150
+ this.a = Array.isArray(a) ? a : [a];
1151
+ this.b = Array.isArray(b) ? b : [b];
1152
+ ErrorUtil.assert(
1153
+ this.a.length > 0 && this.b.length > 0,
1154
+ `Inputs <a> and <b> must not be empty`,
1155
+ { a: this.a, b: this.b }
945
1156
  );
1157
+ this.options = opt;
1158
+ this.optKey = Hasher.fastFNV1a(
1159
+ JSON.stringify(opt, Object.keys(opt).sort())
1160
+ ).toString();
1161
+ this.symmetric = symmetric;
946
1162
  }
947
- clear() {
948
- this.buffers = [];
949
- this.pointer = 0;
1163
+ preCompute(a, b, m, n) {
1164
+ if (a === b) return { res: 1 };
1165
+ if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
1166
+ return undefined;
950
1167
  }
951
- }
952
- class Pool {
953
- static CONFIG = {
954
- int32: {
955
- type: 'int32',
956
- maxSize: 64,
957
- maxItemSize: 2048,
958
- allowOversize: true
959
- },
960
- 'number[]': {
961
- type: 'number[]',
962
- maxSize: 16,
963
- maxItemSize: 1024,
964
- allowOversize: false
965
- },
966
- 'string[]': {
967
- type: 'string[]',
968
- maxSize: 2,
969
- maxItemSize: 1024,
970
- allowOversize: false
971
- },
972
- set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
973
- map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
974
- };
975
- static POOLS = {
976
- int32: new RingPool(64),
977
- 'number[]': new RingPool(16),
978
- 'string[]': new RingPool(2),
979
- set: new RingPool(8),
980
- map: new RingPool(8)
981
- };
982
- static allocate(type, size) {
983
- switch (type) {
984
- case 'int32':
985
- return new Int32Array(size);
986
- case 'number[]':
987
- return new Float64Array(size);
988
- case 'string[]':
989
- return new Array(size);
990
- case 'set':
991
- return new Set();
992
- case 'map':
993
- return new Map();
994
- }
995
- }
996
- static acquire(type, size) {
997
- const CONFIG = this.CONFIG[type];
998
- if (!CONFIG)
999
- throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
1000
- if (size > CONFIG.maxItemSize) return this.allocate(type, size);
1001
- const item = this.POOLS[type].acquire(size, CONFIG.allowOversize);
1002
- if (item)
1003
- return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
1004
- return this.allocate(type, size);
1168
+ compute(a, b, m, n, maxLen) {
1169
+ throw new CmpStrInternalError(
1170
+ `Method compute() must be overridden in a subclass`
1171
+ );
1005
1172
  }
1006
- static acquireMany(type, sizes) {
1007
- return sizes.map((size) => this.acquire(type, size));
1173
+ runSingle(i, j) {
1174
+ return ErrorUtil.wrap(
1175
+ () => {
1176
+ let a = String(this.a[i]),
1177
+ A = a;
1178
+ let b = String(this.b[j]),
1179
+ B = b;
1180
+ let m = A.length,
1181
+ n = B.length;
1182
+ let result = this.preCompute(A, B, m, n);
1183
+ if (!result) {
1184
+ result = profiler$2.run(() => {
1185
+ if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
1186
+ let key = Metric.cache.key(this.metric, [A, B], this.symmetric);
1187
+ if (key) key += this.optKey;
1188
+ return (
1189
+ Metric.cache.get(key || '') ??
1190
+ (() => {
1191
+ const maxLen = m > n ? m : n;
1192
+ const res = this.compute(A, B, m, n, maxLen);
1193
+ if (key) Metric.cache.set(key, res);
1194
+ return res;
1195
+ })()
1196
+ );
1197
+ });
1198
+ }
1199
+ return {
1200
+ metric: this.metric,
1201
+ a: this.origA.length > i ? this.origA[i] : a,
1202
+ b: this.origB.length > j ? this.origB[j] : b,
1203
+ ...result
1204
+ };
1205
+ },
1206
+ `Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
1207
+ { i, j }
1208
+ );
1008
1209
  }
1009
- static release(type, buffer, size) {
1010
- const CONFIG = this.CONFIG[type];
1011
- if (!CONFIG)
1012
- throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
1013
- if (size <= CONFIG.maxItemSize) this.POOLS[type].release({ buffer, size });
1210
+ async runSingleAsync(i, j) {
1211
+ return Promise.resolve(this.runSingle(i, j));
1014
1212
  }
1015
- }
1016
-
1017
- class StructuredData {
1018
- data;
1019
- key;
1020
- static create(data, key) {
1021
- return new StructuredData(data, key);
1213
+ runBatch() {
1214
+ const results = [];
1215
+ for (let i = 0; i < this.a.length; i++)
1216
+ for (let j = 0; j < this.b.length; j++)
1217
+ results.push(this.runSingle(i, j));
1218
+ this.results = results;
1022
1219
  }
1023
- constructor(data, key) {
1024
- this.data = data;
1025
- this.key = key;
1220
+ async runBatchAsync() {
1221
+ const tasks = [];
1222
+ for (let i = 0; i < this.a.length; i++)
1223
+ for (let j = 0; j < this.b.length; j++)
1224
+ tasks.push(this.runSingleAsync(i, j));
1225
+ this.results = await Promise.all(tasks);
1026
1226
  }
1027
- extractFrom(arr, key) {
1028
- const result = Pool.acquire('string[]', arr.length);
1029
- for (let i = 0; i < arr.length; i++) {
1030
- const val = arr[i][key];
1031
- result[i] = typeof val === 'string' ? val : String(val ?? '');
1032
- }
1033
- return result;
1227
+ runPairwise() {
1228
+ const results = [];
1229
+ for (let i = 0; i < this.a.length; i++) results.push(this.runSingle(i, i));
1230
+ this.results = results;
1034
1231
  }
1035
- extract = () => this.extractFrom(this.data, this.key);
1036
- isMetricResult(v) {
1037
- return (
1038
- typeof v === 'object' && v !== null && 'a' in v && 'b' in v && 'res' in v
1039
- );
1232
+ async runPairwiseAsync() {
1233
+ const tasks = [];
1234
+ for (let i = 0; i < this.a.length; i++)
1235
+ tasks.push(this.runSingleAsync(i, i));
1236
+ this.results = await Promise.all(tasks);
1040
1237
  }
1041
- isCmpStrResult(v) {
1042
- return (
1043
- typeof v === 'object' &&
1044
- v !== null &&
1045
- 'source' in v &&
1046
- 'target' in v &&
1047
- 'match' in v
1048
- );
1238
+ setOriginal(a, b) {
1239
+ if (a) this.origA = Array.isArray(a) ? a : [a];
1240
+ if (b) this.origB = Array.isArray(b) ? b : [b];
1241
+ return this;
1049
1242
  }
1050
- normalizeResults(results) {
1051
- if (!Array.isArray(results) || results.length === 0) return [];
1052
- const first = results[0];
1053
- let normalized = [];
1054
- if (this.isMetricResult(first)) normalized = results;
1055
- else if (this.isCmpStrResult(first))
1056
- normalized = results.map((r) => ({
1057
- metric: 'unknown',
1058
- a: r.source,
1059
- b: r.target,
1060
- res: r.match,
1061
- raw: r.raw
1062
- }));
1063
- else
1064
- throw new CmpStrValidationError(
1065
- 'Unsupported result format for StructuredData normalization.'
1066
- );
1067
- return normalized.map((r, idx) => ({ ...r, __idx: idx }));
1243
+ isBatch() {
1244
+ return this.a.length > 1 || this.b.length > 1;
1068
1245
  }
1069
- rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
1070
- const stringToIndices = new Map();
1071
- for (let i = 0; i < extractedStrings.length; i++) {
1072
- const str = extractedStrings[i];
1073
- if (!stringToIndices.has(str)) stringToIndices.set(str, []);
1074
- stringToIndices.get(str).push(i);
1075
- }
1076
- const output = new Array(results.length);
1077
- const occurrenceCount = new Map();
1078
- let out = 0;
1079
- for (let i = 0; i < results.length; i++) {
1080
- const result = results[i];
1081
- if (removeZero && result.res === 0) continue;
1082
- const targetStr = result.b || '';
1083
- const indices = stringToIndices.get(targetStr);
1084
- let dataIndex;
1085
- if (indices && indices.length > 0) {
1086
- const occurrence = occurrenceCount.get(targetStr) ?? 0;
1087
- occurrenceCount.set(targetStr, occurrence + 1);
1088
- dataIndex = indices[occurrence % indices.length];
1089
- } else {
1090
- dataIndex = result.__idx ?? i;
1091
- }
1092
- if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
1093
- const sourceObj = sourceData[dataIndex];
1094
- const mappedTarget = extractedStrings[dataIndex] || targetStr;
1095
- if (objectsOnly) output[out++] = sourceObj;
1096
- else
1097
- output[out++] = {
1098
- obj: sourceObj,
1099
- key: this.key,
1100
- result: { source: result.a, target: mappedTarget, match: result.res },
1101
- ...(result.raw ? { raw: result.raw } : null)
1102
- };
1103
- }
1104
- output.length = out;
1105
- return output;
1246
+ isSingle() {
1247
+ return !this.isBatch();
1106
1248
  }
1107
- sort(results, sort) {
1108
- if (!sort || results.length <= 1) return results;
1109
- const asc = sort === 'asc';
1110
- return results.sort((a, b) => (asc ? a.res - b.res : b.res - a.res));
1249
+ isPairwise(safe = false) {
1250
+ return this.isBatch() && this.a.length === this.b.length
1251
+ ? true
1252
+ : !safe &&
1253
+ (() => {
1254
+ throw new CmpStrUsageError(
1255
+ `Mode <pairwise> requires arrays of equal length`,
1256
+ { a: this.a, b: this.b }
1257
+ );
1258
+ })();
1111
1259
  }
1112
- finalizeLookup(results, extractedStrings, opt) {
1113
- return this.rebuild(
1114
- this.sort(this.normalizeResults(results), opt?.sort),
1115
- this.data,
1116
- extractedStrings,
1117
- opt?.removeZero,
1118
- opt?.objectsOnly
1119
- );
1260
+ isSymmetrical() {
1261
+ return this.symmetric;
1120
1262
  }
1121
- performLookup(fn, extractedStrings, opt) {
1122
- return ErrorUtil.wrap(
1123
- () => this.finalizeLookup(fn(), extractedStrings, opt),
1124
- 'StructuredData lookup failed',
1125
- { key: this.key }
1126
- );
1263
+ whichMode(mode) {
1264
+ return mode ?? this.options.mode ?? 'default';
1127
1265
  }
1128
- async performLookupAsync(fn, extractedStrings, opt) {
1129
- return await ErrorUtil.wrapAsync(
1130
- async () => this.finalizeLookup(await fn(), extractedStrings, opt),
1131
- 'StructuredData async lookup failed',
1132
- { key: this.key }
1133
- );
1266
+ clear() {
1267
+ this.results = undefined;
1134
1268
  }
1135
- lookup(fn, query, opt) {
1136
- const b = this.extract();
1137
- try {
1138
- return this.performLookup(() => fn(query, b, opt), b, opt);
1139
- } finally {
1140
- Pool.release('string[]', b, b.length);
1269
+ run(mode, clear = true) {
1270
+ if (clear) this.clear();
1271
+ switch (this.whichMode(mode)) {
1272
+ case 'default':
1273
+ if (this.isSingle()) {
1274
+ this.results = this.runSingle(0, 0);
1275
+ break;
1276
+ }
1277
+ case 'batch':
1278
+ this.runBatch();
1279
+ break;
1280
+ case 'single':
1281
+ this.results = this.runSingle(0, 0);
1282
+ break;
1283
+ case 'pairwise':
1284
+ if (this.isPairwise()) this.runPairwise();
1285
+ break;
1286
+ default:
1287
+ throw new CmpStrInternalError(`Unsupported mode <${mode}>`);
1141
1288
  }
1142
1289
  }
1143
- async lookupAsync(fn, query, opt) {
1144
- const b = this.extract();
1145
- try {
1146
- return await this.performLookupAsync(() => fn(query, b, opt), b, opt);
1147
- } finally {
1148
- Pool.release('string[]', b, b.length);
1290
+ async runAsync(mode, clear = true) {
1291
+ if (clear) this.clear();
1292
+ switch (this.whichMode(mode)) {
1293
+ case 'default':
1294
+ if (this.isSingle()) {
1295
+ this.results = await this.runSingleAsync(0, 0);
1296
+ break;
1297
+ }
1298
+ case 'batch':
1299
+ await this.runBatchAsync();
1300
+ break;
1301
+ case 'single':
1302
+ this.results = await this.runSingleAsync(0, 0);
1303
+ break;
1304
+ case 'pairwise':
1305
+ if (this.isPairwise()) await this.runPairwiseAsync();
1306
+ break;
1307
+ default:
1308
+ throw new CmpStrInternalError(`Unsupported async mode <${mode}>`);
1149
1309
  }
1150
1310
  }
1151
- lookupPairs(fn, other, otherKey, opt) {
1152
- const a = this.extract();
1153
- const b = this.extractFrom(other, otherKey);
1154
- try {
1155
- return this.performLookup(() => fn(a, b, opt), a, opt);
1156
- } finally {
1157
- Pool.release('string[]', a, a.length);
1158
- Pool.release('string[]', b, b.length);
1159
- }
1311
+ getMetricName() {
1312
+ return this.metric;
1160
1313
  }
1161
- async lookupPairsAsync(fn, other, otherKey, opt) {
1162
- const a = this.extract();
1163
- const b = this.extractFrom(other, otherKey);
1164
- try {
1165
- return await this.performLookupAsync(() => fn(a, b, opt), a, opt);
1166
- } finally {
1167
- Pool.release('string[]', a, a.length);
1168
- Pool.release('string[]', b, b.length);
1169
- }
1314
+ getResults() {
1315
+ ErrorUtil.assert(
1316
+ this.results !== undefined,
1317
+ `run() must be called before getResults()`
1318
+ );
1319
+ return this.results;
1170
1320
  }
1171
1321
  }
1322
+ const MetricRegistry = Registry('metric', Metric);
1172
1323
 
1173
- class TextAnalyzer {
1174
- static REGEX = {
1175
- number: /\d/,
1176
- sentence: /(?<=[.!?])\s+/,
1177
- word: /\p{L}+/gu,
1178
- nonWord: /[^\p{L}]/gu,
1179
- vowelGroup: /[aeiouy]+/g,
1180
- letter: /\p{L}/gu,
1181
- ucLetter: /\p{Lu}/gu
1182
- };
1183
- text;
1184
- words = [];
1185
- sentences = [];
1186
- charFrequency = new Map();
1187
- wordHistogram = new Map();
1188
- syllableCache = new Map();
1189
- syllableStats;
1190
- constructor(input) {
1191
- this.text = input.trim();
1192
- this.tokenize();
1193
- this.computeFrequencies();
1324
+ class CosineSimilarity extends Metric {
1325
+ constructor(a, b, opt = {}) {
1326
+ super('cosine', a, b, opt, true);
1194
1327
  }
1195
- tokenize() {
1196
- let match;
1197
- const lcText = this.text.toLowerCase();
1198
- while ((match = TextAnalyzer.REGEX.word.exec(lcText)) !== null)
1199
- this.words.push(match[0]);
1200
- this.sentences = this.text
1201
- .split(TextAnalyzer.REGEX.sentence)
1202
- .filter(Boolean);
1328
+ _termFreq(str, delimiter) {
1329
+ const terms = str.split(delimiter);
1330
+ const freq = Pool.acquire('map', terms.length);
1331
+ for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
1332
+ return freq;
1203
1333
  }
1204
- computeFrequencies() {
1205
- for (const char of this.text)
1206
- this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
1207
- for (const word of this.words)
1208
- this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
1334
+ compute(a, b) {
1335
+ const { delimiter = ' ' } = this.options;
1336
+ const termsA = this._termFreq(a, delimiter);
1337
+ const termsB = this._termFreq(b, delimiter);
1338
+ try {
1339
+ let dotP = 0,
1340
+ magA = 0,
1341
+ magB = 0;
1342
+ for (const [term, freqA] of termsA) {
1343
+ const freqB = termsB.get(term) || 0;
1344
+ dotP += freqA * freqB;
1345
+ magA += freqA * freqA;
1346
+ }
1347
+ for (const freqB of termsB.values()) magB += freqB * freqB;
1348
+ magA = Math.sqrt(magA);
1349
+ magB = Math.sqrt(magB);
1350
+ return {
1351
+ res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
1352
+ raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
1353
+ };
1354
+ } finally {
1355
+ Pool.release('map', termsA, termsA.size);
1356
+ Pool.release('map', termsB, termsB.size);
1357
+ }
1209
1358
  }
1210
- estimateSyllables(word) {
1211
- const clean = word
1212
- .normalize('NFC')
1213
- .toLowerCase()
1214
- .replace(TextAnalyzer.REGEX.nonWord, '');
1215
- if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
1216
- const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
1217
- const count = matches ? matches.length : 1;
1218
- this.syllableCache.set(clean, count);
1219
- return count;
1359
+ }
1360
+ MetricRegistry.add('cosine', CosineSimilarity);
1361
+
1362
+ class DamerauLevenshteinDistance extends Metric {
1363
+ constructor(a, b, opt = {}) {
1364
+ super('damerau', a, b, opt, true);
1220
1365
  }
1221
- computeSyllableStats() {
1222
- return (this.syllableStats ||= (() => {
1223
- const perWord = this.words
1224
- .map((w) => this.estimateSyllables(w))
1225
- .sort((a, b) => a - b);
1226
- const total = perWord.reduce((sum, s) => sum + s, 0);
1227
- const mono = perWord.filter((s) => s === 1).length;
1228
- const median = !perWord.length
1229
- ? 0
1230
- : perWord.length % 2 === 0
1231
- ? (perWord[perWord.length / 2 - 1] + perWord[perWord.length / 2]) / 2
1232
- : perWord[Math.floor(perWord.length / 2)];
1366
+ compute(a, b, m, n, maxLen) {
1367
+ const len = m + 1;
1368
+ const [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
1369
+ try {
1370
+ for (let i = 0; i <= m; i++) prev[i] = i;
1371
+ for (let j = 1; j <= n; j++) {
1372
+ curr[0] = j;
1373
+ const cb = b.charCodeAt(j - 1);
1374
+ for (let i = 1; i <= m; i++) {
1375
+ const ca = a.charCodeAt(i - 1);
1376
+ const cost = ca === cb ? 0 : 1;
1377
+ let val = Math.min(curr[i - 1] + 1, prev[i] + 1, prev[i - 1] + cost);
1378
+ if (
1379
+ i > 1 &&
1380
+ j > 1 &&
1381
+ ca === b.charCodeAt(j - 2) &&
1382
+ cb === a.charCodeAt(i - 2)
1383
+ )
1384
+ val = Math.min(val, test[i - 2] + cost);
1385
+ curr[i] = val;
1386
+ }
1387
+ test.set(prev);
1388
+ prev.set(curr);
1389
+ }
1390
+ const dist = prev[m];
1233
1391
  return {
1234
- total,
1235
- mono,
1236
- perWord,
1237
- avg: perWord.length ? total / perWord.length : 0,
1238
- median
1392
+ res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
1393
+ raw: { dist, maxLen }
1239
1394
  };
1240
- })());
1241
- }
1242
- getLength = () => this.text.length;
1243
- getWordCount = () => this.words.length;
1244
- getSentenceCount = () => this.sentences.length;
1245
- getAvgWordLength() {
1246
- return this.words.length
1247
- ? this.words.join('').length / this.words.length
1248
- : 0;
1395
+ } finally {
1396
+ Pool.release('int32', test, len);
1397
+ Pool.release('int32', prev, len);
1398
+ Pool.release('int32', curr, len);
1399
+ }
1249
1400
  }
1250
- getAvgSentenceLength() {
1251
- return this.sentences.length
1252
- ? this.words.length / this.sentences.length
1253
- : 0;
1401
+ }
1402
+ MetricRegistry.add('damerau', DamerauLevenshteinDistance);
1403
+
1404
+ class DiceSorensenCoefficient extends Metric {
1405
+ constructor(a, b, opt = {}) {
1406
+ super('dice', a, b, opt, true);
1254
1407
  }
1255
- getWordHistogram() {
1256
- return Object.fromEntries(this.wordHistogram);
1408
+ _bigrams(str) {
1409
+ const len = str.length - 1;
1410
+ const bigrams = Pool.acquire('set', len);
1411
+ for (let i = 0; i < len; i++) bigrams.add(str.substring(i, i + 2));
1412
+ return bigrams;
1257
1413
  }
1258
- getMostCommonWords(limit = 5) {
1259
- return [...this.wordHistogram.entries()]
1260
- .sort((a, b) => b[1] - a[1])
1261
- .slice(0, limit)
1262
- .map((e) => e[0]);
1414
+ compute(a, b) {
1415
+ const setA = this._bigrams(a),
1416
+ setB = this._bigrams(b);
1417
+ const sizeA = setA.size,
1418
+ sizeB = setB.size;
1419
+ try {
1420
+ let intersection = 0;
1421
+ for (const bigram of setA) if (setB.has(bigram)) intersection++;
1422
+ const size = sizeA + sizeB;
1423
+ return {
1424
+ res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
1425
+ raw: { intersection, size }
1426
+ };
1427
+ } finally {
1428
+ Pool.release('set', setA, sizeA);
1429
+ Pool.release('set', setB, sizeB);
1430
+ }
1263
1431
  }
1264
- getHapaxLegomena() {
1265
- return [...this.wordHistogram.entries()]
1266
- .filter(([, c]) => c === 1)
1267
- .map((e) => e[0]);
1432
+ }
1433
+ MetricRegistry.add('dice', DiceSorensenCoefficient);
1434
+
1435
+ class HammingDistance extends Metric {
1436
+ constructor(a, b, opt = {}) {
1437
+ super('hamming', a, b, opt, true);
1268
1438
  }
1269
- hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);
1270
- getUpperCaseRatio() {
1271
- const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
1272
- const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
1273
- return matches.length ? upper / matches.length : 0;
1439
+ compute(a, b, m, n, maxLen) {
1440
+ if (m !== n) {
1441
+ if (this.options.pad !== undefined) {
1442
+ if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
1443
+ if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
1444
+ m = n = maxLen;
1445
+ } else
1446
+ throw new CmpStrUsageError(
1447
+ `Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
1448
+ `use option.pad for automatic adjustment`,
1449
+ { a: m, b: n }
1450
+ );
1451
+ }
1452
+ let dist = 0;
1453
+ for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
1454
+ return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
1274
1455
  }
1275
- getCharFrequency() {
1276
- return Object.fromEntries(this.charFrequency);
1456
+ }
1457
+ MetricRegistry.add('hamming', HammingDistance);
1458
+
1459
+ class JaccardIndex extends Metric {
1460
+ constructor(a, b, opt = {}) {
1461
+ super('jaccard', a, b, opt, true);
1277
1462
  }
1278
- getUnicodeCodepoints() {
1279
- const result = {};
1280
- for (const [char, count] of this.charFrequency) {
1281
- const block = char
1282
- .charCodeAt(0)
1283
- .toString(16)
1284
- .padStart(4, '0')
1285
- .toUpperCase();
1286
- result[block] = (result[block] || 0) + count;
1463
+ compute(a, b, m, n) {
1464
+ const [setA, setB] = Pool.acquireMany('set', [m, n]);
1465
+ try {
1466
+ for (const A of a) setA.add(A);
1467
+ for (const B of b) setB.add(B);
1468
+ let intersection = 0;
1469
+ for (const c of setA) if (setB.has(c)) intersection++;
1470
+ const union = setA.size + setB.size - intersection;
1471
+ return {
1472
+ res: union === 0 ? 1 : Metric.clamp(intersection / union),
1473
+ raw: { intersection, union }
1474
+ };
1475
+ } finally {
1476
+ Pool.release('set', setA, m);
1477
+ Pool.release('set', setB, n);
1287
1478
  }
1288
- return result;
1289
1479
  }
1290
- getLongWordRatio(len = 7) {
1291
- let long = 0;
1292
- for (const w of this.words) if (w.length >= len) long++;
1293
- return this.words.length ? long / this.words.length : 0;
1480
+ }
1481
+ MetricRegistry.add('jaccard', JaccardIndex);
1482
+
1483
+ class JaroWinklerDistance extends Metric {
1484
+ constructor(a, b, opt = {}) {
1485
+ super('jaroWinkler', a, b, opt, true);
1294
1486
  }
1295
- getShortWordRatio(len = 3) {
1296
- let short = 0;
1297
- for (const w of this.words) if (w.length <= len) short++;
1298
- return this.words.length ? short / this.words.length : 0;
1299
- }
1300
- getSyllablesCount() {
1301
- return this.computeSyllableStats().total;
1302
- }
1303
- getMonosyllabicWordCount() {
1304
- return this.computeSyllableStats().mono;
1305
- }
1306
- getMinSyllablesWordCount(min) {
1307
- return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
1487
+ compute(a, b, m, n) {
1488
+ const [matchA, matchB] = Pool.acquireMany('int32', [m, n]);
1489
+ try {
1490
+ for (let i = 0; i < m; i++) matchA[i] = 0;
1491
+ for (let i = 0; i < n; i++) matchB[i] = 0;
1492
+ const matchWindow = Math.max(0, Math.floor(n / 2) - 1);
1493
+ let matches = 0;
1494
+ for (let i = 0; i < m; i++) {
1495
+ const start = Math.max(0, i - matchWindow);
1496
+ const end = Math.min(i + matchWindow + 1, n);
1497
+ for (let j = start; j < end; j++) {
1498
+ if (!matchB[j] && a[i] === b[j]) {
1499
+ matchA[i] = 1;
1500
+ matchB[j] = 1;
1501
+ matches++;
1502
+ break;
1503
+ }
1504
+ }
1505
+ }
1506
+ let transpos = 0,
1507
+ jaro = 0,
1508
+ prefix = 0,
1509
+ res = 0;
1510
+ if (matches > 0) {
1511
+ let k = 0;
1512
+ for (let i = 0; i < m; i++) {
1513
+ if (matchA[i]) {
1514
+ while (!matchB[k]) k++;
1515
+ if (a[i] !== b[k]) transpos++;
1516
+ k++;
1517
+ }
1518
+ }
1519
+ transpos /= 2;
1520
+ jaro = (matches / m + matches / n + (matches - transpos) / matches) / 3;
1521
+ for (let i = 0; i < Math.min(4, m, n); i++) {
1522
+ if (a[i] === b[i]) prefix++;
1523
+ else break;
1524
+ }
1525
+ res = jaro + prefix * 0.1 * (1 - jaro);
1526
+ }
1527
+ return {
1528
+ res: Metric.clamp(res),
1529
+ raw: { matchWindow, matches, transpos, jaro, prefix }
1530
+ };
1531
+ } finally {
1532
+ Pool.release('int32', matchA, m);
1533
+ Pool.release('int32', matchB, n);
1534
+ }
1308
1535
  }
1309
- getMaxSyllablesWordCount(max) {
1310
- return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
1536
+ }
1537
+ MetricRegistry.add('jaroWinkler', JaroWinklerDistance);
1538
+
1539
+ class LCSMetric extends Metric {
1540
+ constructor(a, b, opt = {}) {
1541
+ super('lcs', a, b, opt, true);
1311
1542
  }
1312
- getAvgSyllablesPerWord() {
1313
- return this.computeSyllableStats().avg;
1543
+ compute(a, b, m, n, maxLen) {
1544
+ const len = m + 1;
1545
+ const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1546
+ try {
1547
+ for (let i = 0; i <= m; i++) prev[i] = 0;
1548
+ for (let j = 1; j <= n; j++) {
1549
+ curr[0] = 0;
1550
+ const cb = b.charCodeAt(j - 1);
1551
+ for (let i = 1; i <= m; i++) {
1552
+ if (a.charCodeAt(i - 1) === cb) curr[i] = prev[i - 1] + 1;
1553
+ else curr[i] = Math.max(prev[i], curr[i - 1]);
1554
+ }
1555
+ prev.set(curr);
1556
+ }
1557
+ const lcs = prev[m];
1558
+ return {
1559
+ res: maxLen === 0 ? 1 : Metric.clamp(lcs / maxLen),
1560
+ raw: { lcs, maxLen }
1561
+ };
1562
+ } finally {
1563
+ Pool.release('int32', prev, len);
1564
+ Pool.release('int32', curr, len);
1565
+ }
1314
1566
  }
1315
- getMedianSyllablesPerWord() {
1316
- return this.computeSyllableStats().median;
1567
+ }
1568
+ MetricRegistry.add('lcs', LCSMetric);
1569
+
1570
+ class LevenshteinDistance extends Metric {
1571
+ constructor(a, b, opt = {}) {
1572
+ super('levenshtein', a, b, opt, true);
1317
1573
  }
1318
- getHonoresR() {
1574
+ compute(a, b, m, n, maxLen) {
1575
+ const len = m + 1;
1576
+ const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1319
1577
  try {
1320
- return (
1321
- (100 * Math.log(this.words.length)) /
1322
- (1 - this.getHapaxLegomena().length / (this.wordHistogram.size ?? 1))
1323
- );
1324
- } catch {
1325
- return 0;
1578
+ for (let i = 0; i <= m; i++) prev[i] = i;
1579
+ for (let j = 1; j <= n; j++) {
1580
+ curr[0] = j;
1581
+ const cb = b.charCodeAt(j - 1);
1582
+ for (let i = 1; i <= m; i++) {
1583
+ const cost = a.charCodeAt(i - 1) === cb ? 0 : 1;
1584
+ curr[i] = Math.min(curr[i - 1] + 1, prev[i] + 1, prev[i - 1] + cost);
1585
+ }
1586
+ prev.set(curr);
1587
+ }
1588
+ const dist = prev[m];
1589
+ return {
1590
+ res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
1591
+ raw: { dist, maxLen }
1592
+ };
1593
+ } finally {
1594
+ Pool.release('int32', prev, len);
1595
+ Pool.release('int32', curr, len);
1326
1596
  }
1327
1597
  }
1328
- getReadingTime(wpm = 200) {
1329
- return this.words.length / (wpm ?? 1);
1598
+ }
1599
+ MetricRegistry.add('levenshtein', LevenshteinDistance);
1600
+
1601
+ class NeedlemanWunschDistance extends Metric {
1602
+ constructor(a, b, opt = {}) {
1603
+ super('needlemanWunsch', a, b, opt, true);
1330
1604
  }
1331
- getReadabilityScore(metric = 'flesch') {
1332
- const w = this.words.length || 1;
1333
- const s = this.sentences.length || 1;
1334
- const y = this.getSyllablesCount() || 1;
1335
- const asl = w / s;
1336
- const asw = y / w;
1337
- switch (metric) {
1338
- case 'flesch':
1339
- return 206.835 - 1.015 * asl - 84.6 * asw;
1340
- case 'fleschde':
1341
- return 180 - asl - 58.5 * asw;
1342
- case 'kincaid':
1343
- return 0.39 * asl + 11.8 * asw - 15.59;
1605
+ compute(a, b, m, n, maxLen) {
1606
+ const { match = 1, mismatch = -1, gap = -1 } = this.options;
1607
+ const len = m + 1;
1608
+ const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1609
+ try {
1610
+ prev[0] = 0;
1611
+ for (let i = 1; i <= m; i++) prev[i] = prev[i - 1] + gap;
1612
+ for (let j = 1; j <= n; j++) {
1613
+ curr[0] = prev[0] + gap;
1614
+ const cb = b.charCodeAt(j - 1);
1615
+ for (let i = 1; i <= m; i++) {
1616
+ const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
1617
+ curr[i] = Math.max(
1618
+ prev[i - 1] + score,
1619
+ prev[i] + gap,
1620
+ curr[i - 1] + gap
1621
+ );
1622
+ }
1623
+ prev.set(curr);
1624
+ }
1625
+ const score = prev[m];
1626
+ const denum = maxLen * match;
1627
+ return {
1628
+ res: denum === 0 ? 0 : Metric.clamp(score / denum),
1629
+ raw: { score, denum }
1630
+ };
1631
+ } finally {
1632
+ Pool.release('int32', prev, len);
1633
+ Pool.release('int32', curr, len);
1344
1634
  }
1345
1635
  }
1346
- getLIXScore() {
1347
- const w = this.words.length || 1;
1348
- const s = this.sentences.length || 1;
1349
- const l = this.getLongWordRatio() * w;
1350
- return w / s + (l / w) * 100;
1636
+ }
1637
+ MetricRegistry.add('needlemanWunsch', NeedlemanWunschDistance);
1638
+
1639
+ class QGramSimilarity extends Metric {
1640
+ constructor(a, b, opt = {}) {
1641
+ super('qGram', a, b, opt, true);
1351
1642
  }
1352
- getWSTFScore() {
1353
- const w = this.words.length || 1;
1354
- const h = (this.getMinSyllablesWordCount(3) / w) * 100;
1355
- const s = this.getAvgSentenceLength();
1356
- const l = this.getLongWordRatio() * 100;
1357
- const m = (this.getMonosyllabicWordCount() / w) * 100;
1358
- return [
1359
- 0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
1360
- 0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
1361
- 0.2963 * h + 0.1905 * s - 1.1144,
1362
- 0.2744 * h + 0.2656 * s - 1.693
1363
- ];
1643
+ _qGrams(str, q) {
1644
+ const len = Math.max(0, str.length - q + 1);
1645
+ const grams = Pool.acquire('set', len);
1646
+ for (let i = 0; i < len; i++) grams.add(str.slice(i, i + q));
1647
+ return grams;
1648
+ }
1649
+ compute(a, b) {
1650
+ const { q = 2 } = this.options;
1651
+ const setA = this._qGrams(a, q),
1652
+ setB = this._qGrams(b, q);
1653
+ const sizeA = setA.size,
1654
+ sizeB = setB.size;
1655
+ try {
1656
+ let intersection = 0;
1657
+ for (const gram of setA) if (setB.has(gram)) intersection++;
1658
+ const size = Math.max(sizeA, sizeB);
1659
+ return {
1660
+ res: size === 0 ? 1 : Metric.clamp(intersection / size),
1661
+ raw: { intersection, size }
1662
+ };
1663
+ } finally {
1664
+ Pool.release('set', setA, sizeA);
1665
+ Pool.release('set', setB, sizeB);
1666
+ }
1364
1667
  }
1365
1668
  }
1669
+ MetricRegistry.add('qGram', QGramSimilarity);
1366
1670
 
1367
- const profiler$2 = Profiler.getInstance();
1368
- class Metric {
1369
- static cache = new HashTable();
1370
- metric;
1371
- a;
1372
- b;
1373
- origA = [];
1374
- origB = [];
1375
- options;
1376
- optKey;
1377
- symmetric;
1378
- results;
1379
- static clear = () => this.cache.clear();
1380
- static swap = (a, b, m, n) => (m > n ? [b, a, n, m] : [a, b, m, n]);
1381
- static clamp = (res) => Math.max(0, Math.min(1, res));
1382
- constructor(metric, a, b, opt = {}, symmetric = false) {
1383
- this.metric = metric;
1384
- this.a = Array.isArray(a) ? a : [a];
1385
- this.b = Array.isArray(b) ? b : [b];
1386
- ErrorUtil.assert(
1387
- this.a.length > 0 && this.b.length > 0,
1388
- `Inputs <a> and <b> must not be empty`,
1389
- { a: this.a, b: this.b }
1390
- );
1391
- this.options = opt;
1392
- this.optKey = Hasher.fastFNV1a(
1393
- JSON.stringify(opt, Object.keys(opt).sort())
1394
- ).toString();
1395
- this.symmetric = symmetric;
1671
+ class SmithWatermanDistance extends Metric {
1672
+ constructor(a, b, opt = {}) {
1673
+ super('smithWaterman', a, b, opt, true);
1396
1674
  }
1397
- preCompute(a, b, m, n) {
1398
- if (a === b) return { res: 1 };
1399
- if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
1400
- return undefined;
1401
- }
1402
- compute(a, b, m, n, maxLen) {
1403
- throw new CmpStrInternalError(
1404
- `Method compute() must be overridden in a subclass`
1405
- );
1406
- }
1407
- runSingle(i, j) {
1408
- return ErrorUtil.wrap(
1409
- () => {
1410
- let a = String(this.a[i]),
1411
- A = a;
1412
- let b = String(this.b[j]),
1413
- B = b;
1414
- let m = A.length,
1415
- n = B.length;
1416
- let result = this.preCompute(A, B, m, n);
1417
- if (!result) {
1418
- result = profiler$2.run(() => {
1419
- if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
1420
- const key =
1421
- Metric.cache.key(this.metric, [A, B], this.symmetric) +
1422
- this.optKey;
1423
- return (
1424
- Metric.cache.get(key || '') ??
1425
- (() => {
1426
- const res = this.compute(A, B, m, n, Math.max(m, n));
1427
- if (key) Metric.cache.set(key, res);
1428
- return res;
1429
- })()
1430
- );
1431
- });
1432
- }
1433
- return {
1434
- metric: this.metric,
1435
- a: this.origA[i] ?? a,
1436
- b: this.origB[j] ?? b,
1437
- ...result
1438
- };
1439
- },
1440
- `Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
1441
- { i, j }
1442
- );
1443
- }
1444
- async runSingleAsync(i, j) {
1445
- return Promise.resolve(this.runSingle(i, j));
1446
- }
1447
- runBatch() {
1448
- const results = [];
1449
- for (let i = 0; i < this.a.length; i++)
1450
- for (let j = 0; j < this.b.length; j++)
1451
- results.push(this.runSingle(i, j));
1452
- this.results = results;
1453
- }
1454
- async runBatchAsync() {
1455
- const results = [];
1456
- for (let i = 0; i < this.a.length; i++)
1457
- for (let j = 0; j < this.b.length; j++)
1458
- results.push(await this.runSingleAsync(i, j));
1459
- this.results = results;
1460
- }
1461
- runPairwise() {
1462
- const results = [];
1463
- for (let i = 0; i < this.a.length; i++) results.push(this.runSingle(i, i));
1464
- this.results = results;
1465
- }
1466
- async runPairwiseAsync() {
1467
- const results = [];
1468
- for (let i = 0; i < this.a.length; i++)
1469
- results.push(await this.runSingleAsync(i, i));
1470
- this.results = results;
1471
- }
1472
- setOriginal(a, b) {
1473
- if (a) this.origA = Array.isArray(a) ? a : [a];
1474
- if (b) this.origB = Array.isArray(b) ? b : [b];
1475
- return this;
1476
- }
1477
- isBatch = () => this.a.length > 1 || this.b.length > 1;
1478
- isSingle = () => !this.isBatch();
1479
- isPairwise(safe = false) {
1480
- return this.isBatch() && this.a.length === this.b.length
1481
- ? true
1482
- : !safe &&
1483
- (() => {
1484
- throw new CmpStrUsageError(
1485
- `Mode <pairwise> requires arrays of equal length`,
1486
- { a: this.a, b: this.b }
1487
- );
1488
- })();
1489
- }
1490
- isSymmetrical = () => this.symmetric;
1491
- whichMode = (mode) => mode ?? this.options?.mode ?? 'default';
1492
- clear = () => (this.results = undefined);
1493
- run(mode, clear = true) {
1494
- if (clear) this.clear();
1495
- switch (this.whichMode(mode)) {
1496
- case 'default':
1497
- if (this.isSingle()) {
1498
- this.results = this.runSingle(0, 0);
1499
- break;
1500
- }
1501
- case 'batch':
1502
- this.runBatch();
1503
- break;
1504
- case 'single':
1505
- this.results = this.runSingle(0, 0);
1506
- break;
1507
- case 'pairwise':
1508
- if (this.isPairwise()) this.runPairwise();
1509
- break;
1510
- default:
1511
- throw new CmpStrInternalError(`Unsupported mode <${mode}>`);
1512
- }
1513
- }
1514
- async runAsync(mode, clear = true) {
1515
- if (clear) this.clear();
1516
- switch (this.whichMode(mode)) {
1517
- case 'default':
1518
- if (this.isSingle()) {
1519
- this.results = await this.runSingleAsync(0, 0);
1520
- break;
1521
- }
1522
- case 'batch':
1523
- await this.runBatchAsync();
1524
- break;
1525
- case 'single':
1526
- this.results = await this.runSingleAsync(0, 0);
1527
- break;
1528
- case 'pairwise':
1529
- if (this.isPairwise()) await this.runPairwiseAsync();
1530
- break;
1531
- default:
1532
- throw new CmpStrInternalError(`Unsupported async mode <${mode}>`);
1533
- }
1534
- }
1535
- getMetricName = () => this.metric;
1536
- getResults() {
1537
- ErrorUtil.assert(
1538
- this.results !== undefined,
1539
- `run() must be called before getResults()`
1540
- );
1541
- return this.results;
1542
- }
1543
- }
1544
- const MetricRegistry = Registry('metric', Metric);
1545
-
1546
- class CosineSimilarity extends Metric {
1547
- constructor(a, b, opt = {}) {
1548
- super('cosine', a, b, opt, true);
1549
- }
1550
- _termFreq(str, delimiter) {
1551
- const terms = str.split(delimiter);
1552
- const freq = Pool.acquire('map', terms.length);
1553
- for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
1554
- return freq;
1555
- }
1556
- compute(a, b) {
1557
- const { delimiter = ' ' } = this.options;
1558
- const termsA = this._termFreq(a, delimiter);
1559
- const termsB = this._termFreq(b, delimiter);
1560
- try {
1561
- let dotP = 0,
1562
- magA = 0,
1563
- magB = 0;
1564
- for (const [term, freqA] of termsA) {
1565
- const freqB = termsB.get(term) || 0;
1566
- dotP += freqA * freqB;
1567
- magA += freqA * freqA;
1568
- }
1569
- for (const freqB of termsB.values()) magB += freqB * freqB;
1570
- magA = Math.sqrt(magA);
1571
- magB = Math.sqrt(magB);
1572
- return {
1573
- res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
1574
- raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
1575
- };
1576
- } finally {
1577
- Pool.release('map', termsA, termsA.size);
1578
- Pool.release('map', termsB, termsB.size);
1579
- }
1580
- }
1581
- }
1582
- MetricRegistry.add('cosine', CosineSimilarity);
1583
-
1584
- class DamerauLevenshteinDistance extends Metric {
1585
- constructor(a, b, opt = {}) {
1586
- super('damerau', a, b, opt, true);
1587
- }
1588
- compute(a, b, m, n, maxLen) {
1675
+ compute(a, b, m, n) {
1676
+ const { match = 2, mismatch = -1, gap = -2 } = this.options;
1589
1677
  const len = m + 1;
1590
- const [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
1678
+ const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1679
+ let maxScore = 0;
1591
1680
  try {
1592
- for (let i = 0; i <= m; i++) prev[i] = i;
1681
+ for (let i = 0; i <= m; i++) prev[i] = 0;
1593
1682
  for (let j = 1; j <= n; j++) {
1594
- curr[0] = j;
1683
+ curr[0] = 0;
1595
1684
  const cb = b.charCodeAt(j - 1);
1596
1685
  for (let i = 1; i <= m; i++) {
1597
- const ca = a.charCodeAt(i - 1);
1598
- const cost = ca === cb ? 0 : 1;
1599
- let val = Math.min(curr[i - 1] + 1, prev[i] + 1, prev[i - 1] + cost);
1600
- if (
1601
- i > 1 &&
1602
- j > 1 &&
1603
- ca === b.charCodeAt(j - 2) &&
1604
- cb === a.charCodeAt(i - 2)
1605
- )
1606
- val = Math.min(val, test[i - 2] + cost);
1607
- curr[i] = val;
1686
+ const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
1687
+ curr[i] = Math.max(
1688
+ 0,
1689
+ prev[i - 1] + score,
1690
+ prev[i] + gap,
1691
+ curr[i - 1] + gap
1692
+ );
1693
+ if (curr[i] > maxScore) maxScore = curr[i];
1608
1694
  }
1609
- test.set(prev);
1610
1695
  prev.set(curr);
1611
1696
  }
1612
- const dist = prev[m];
1697
+ const denum = Math.min(m * match, n * match);
1613
1698
  return {
1614
- res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
1615
- raw: { dist, maxLen }
1699
+ res: denum === 0 ? 0 : Metric.clamp(maxScore / denum),
1700
+ raw: { score: maxScore, denum }
1616
1701
  };
1617
1702
  } finally {
1618
- Pool.release('int32', test, len);
1619
1703
  Pool.release('int32', prev, len);
1620
1704
  Pool.release('int32', curr, len);
1621
1705
  }
1622
1706
  }
1623
1707
  }
1624
- MetricRegistry.add('damerau', DamerauLevenshteinDistance);
1708
+ MetricRegistry.add('smithWaterman', SmithWatermanDistance);
1625
1709
 
1626
- class DiceSorensenCoefficient extends Metric {
1627
- constructor(a, b, opt = {}) {
1628
- super('dice', a, b, opt, true);
1710
+ const profiler$1 = Profiler.getInstance();
1711
+ class Phonetic {
1712
+ static cache = new HashTable();
1713
+ static default;
1714
+ algo;
1715
+ options;
1716
+ optKey;
1717
+ map;
1718
+ ignoreSet;
1719
+ static clear() {
1720
+ this.cache.clear();
1629
1721
  }
1630
- _bigrams(str) {
1631
- const len = str.length - 1;
1632
- const bigrams = Pool.acquire('set', len);
1633
- for (let i = 0; i < len; i++) bigrams.add(str.substring(i, i + 2));
1634
- return bigrams;
1722
+ constructor(algo, opt = {}) {
1723
+ const defaults = this.constructor.default ?? {};
1724
+ const mapId = opt.map ?? defaults.map;
1725
+ if (!mapId)
1726
+ throw new CmpStrNotFoundError(
1727
+ `No mapping specified for phonetic algorithm`,
1728
+ { algo }
1729
+ );
1730
+ const map = PhoneticMappingRegistry.get(algo, mapId);
1731
+ if (map === undefined)
1732
+ throw new CmpStrNotFoundError(
1733
+ `Requested mapping <${mapId}> is not declared`,
1734
+ { algo, mapId }
1735
+ );
1736
+ this.options = DeepMerge.merge(
1737
+ DeepMerge.merge(defaults, map.options ?? {}),
1738
+ opt
1739
+ );
1740
+ this.optKey = Hasher.fastFNV1a(
1741
+ JSON.stringify(this.options, Object.keys(this.options).sort())
1742
+ ).toString();
1743
+ this.algo = algo;
1744
+ this.map = map;
1745
+ this.ignoreSet = new Set(map.ignore ?? []);
1635
1746
  }
1636
- compute(a, b) {
1637
- const setA = this._bigrams(a),
1638
- setB = this._bigrams(b);
1639
- const sizeA = setA.size,
1640
- sizeB = setB.size;
1641
- try {
1642
- let intersection = 0;
1643
- for (const bigram of setA) if (setB.has(bigram)) intersection++;
1644
- const size = sizeA + sizeB;
1645
- return {
1646
- res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
1647
- raw: { intersection, size }
1648
- };
1649
- } finally {
1650
- Pool.release('set', setA, sizeA);
1651
- Pool.release('set', setB, sizeB);
1747
+ applyPattern(word) {
1748
+ const { patterns = [] } = this.map;
1749
+ if (!patterns.length) return word;
1750
+ for (const { pattern, replace, all = false } of patterns) {
1751
+ word = all
1752
+ ? word.replaceAll(pattern, replace)
1753
+ : word.replace(pattern, replace);
1652
1754
  }
1755
+ return word;
1653
1756
  }
1654
- }
1655
- MetricRegistry.add('dice', DiceSorensenCoefficient);
1656
-
1657
- class HammingDistance extends Metric {
1658
- constructor(a, b, opt = {}) {
1659
- super('hamming', a, b, opt, true);
1757
+ applyRules(char, i, chars, charLen) {
1758
+ const { ruleset = [] } = this.map;
1759
+ if (!ruleset.length) return undefined;
1760
+ const prev = chars[i - 1] || '',
1761
+ prev2 = chars[i - 2] || '';
1762
+ const next = chars[i + 1] || '',
1763
+ next2 = chars[i + 2] || '';
1764
+ const str = chars.join('');
1765
+ for (const rule of ruleset) {
1766
+ if (rule.char && rule.char !== char) continue;
1767
+ if (rule.position === 'start' && i !== 0) continue;
1768
+ if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
1769
+ continue;
1770
+ if (rule.position === 'end' && i !== charLen - 1) continue;
1771
+ if (rule.prev && !rule.prev.includes(prev)) continue;
1772
+ if (rule.prevNot && rule.prevNot.includes(prev)) continue;
1773
+ if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
1774
+ if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
1775
+ if (rule.next && !rule.next.includes(next)) continue;
1776
+ if (rule.nextNot && rule.nextNot.includes(next)) continue;
1777
+ if (rule.next2 && !rule.next2.includes(next2)) continue;
1778
+ if (rule.next2Not && rule.next2Not.includes(next2)) continue;
1779
+ if (
1780
+ rule.leading &&
1781
+ !rule.leading.includes(str.slice(0, rule.leading.length))
1782
+ )
1783
+ continue;
1784
+ if (
1785
+ rule.trailing &&
1786
+ !rule.trailing.includes(str.slice(-rule.trailing.length))
1787
+ )
1788
+ continue;
1789
+ if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
1790
+ continue;
1791
+ return rule.code;
1792
+ }
1793
+ return undefined;
1660
1794
  }
1661
- compute(a, b, m, n, maxLen) {
1662
- if (m !== n) {
1663
- if (this.options.pad !== undefined) {
1664
- if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
1665
- if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
1666
- m = n = maxLen;
1667
- } else
1668
- throw new CmpStrUsageError(
1669
- `Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
1670
- `use option.pad for automatic adjustment`,
1671
- { a: m, b: n }
1672
- );
1795
+ encode(word) {
1796
+ const { map = {} } = this.map;
1797
+ word = this.applyPattern(word);
1798
+ const chars = this.word2Chars(word);
1799
+ const charLen = chars.length;
1800
+ let code = '',
1801
+ lastCode = null;
1802
+ for (let i = 0; i < charLen; i++) {
1803
+ const char = chars[i];
1804
+ if (this.ignoreSet.has(char)) continue;
1805
+ const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
1806
+ if (mapped === undefined) continue;
1807
+ ((code += mapped), (lastCode = mapped));
1808
+ if (this.exitEarly(code, i)) break;
1673
1809
  }
1674
- let dist = 0;
1675
- for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
1676
- return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
1810
+ return this.adjustCode(code, chars);
1677
1811
  }
1678
- }
1679
- MetricRegistry.add('hamming', HammingDistance);
1680
-
1681
- class JaccardIndex extends Metric {
1682
- constructor(a, b, opt = {}) {
1683
- super('jaccard', a, b, opt, true);
1812
+ mapChar(char, i, chars, charLen, lastCode, map) {
1813
+ const { dedupe = true, fallback = undefined } = this.options;
1814
+ const c = this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
1815
+ return dedupe && c === lastCode ? undefined : c;
1684
1816
  }
1685
- compute(a, b, m, n) {
1686
- const [setA, setB] = Pool.acquireMany('set', [m, n]);
1687
- try {
1688
- for (const A of a) setA.add(A);
1689
- for (const B of b) setB.add(B);
1690
- let intersection = 0;
1691
- for (const c of setA) if (setB.has(c)) intersection++;
1692
- const union = setA.size + setB.size - intersection;
1693
- return {
1694
- res: union === 0 ? 1 : Metric.clamp(intersection / union),
1695
- raw: { intersection, union }
1696
- };
1697
- } finally {
1698
- Pool.release('set', setA, m);
1699
- Pool.release('set', setB, n);
1700
- }
1817
+ equalLen(input) {
1818
+ const { length = -1, pad = '0' } = this.options;
1819
+ return length === -1
1820
+ ? input
1821
+ : (input + pad.repeat(length)).slice(0, length);
1701
1822
  }
1702
- }
1703
- MetricRegistry.add('jaccard', JaccardIndex);
1704
-
1705
- class JaroWinklerDistance extends Metric {
1706
- constructor(a, b, opt = {}) {
1707
- super('jaroWinkler', a, b, opt, true);
1823
+ word2Chars(word) {
1824
+ return Array.from(word.toLowerCase());
1708
1825
  }
1709
- compute(a, b, m, n) {
1710
- const [matchA, matchB] = Pool.acquireMany('int32', [m, n]);
1711
- try {
1712
- for (let i = 0; i < m; i++) matchA[i] = 0;
1713
- for (let i = 0; i < n; i++) matchB[i] = 0;
1714
- const matchWindow = Math.max(0, Math.floor(n / 2) - 1);
1715
- let matches = 0;
1716
- for (let i = 0; i < m; i++) {
1717
- const start = Math.max(0, i - matchWindow);
1718
- const end = Math.min(i + matchWindow + 1, n);
1719
- for (let j = start; j < end; j++) {
1720
- if (!matchB[j] && a[i] === b[j]) {
1721
- matchA[i] = 1;
1722
- matchB[j] = 1;
1723
- matches++;
1724
- break;
1725
- }
1726
- }
1727
- }
1728
- let transpos = 0,
1729
- jaro = 0,
1730
- prefix = 0,
1731
- res = 0;
1732
- if (matches > 0) {
1733
- let k = 0;
1734
- for (let i = 0; i < m; i++) {
1735
- if (matchA[i]) {
1736
- while (!matchB[k]) k++;
1737
- if (a[i] !== b[k]) transpos++;
1738
- k++;
1739
- }
1826
+ exitEarly(code, i) {
1827
+ const { length = -1 } = this.options;
1828
+ return length > 0 && code.length >= length;
1829
+ }
1830
+ adjustCode(code, chars) {
1831
+ return code;
1832
+ }
1833
+ loop(words) {
1834
+ return ErrorUtil.wrap(
1835
+ () => {
1836
+ const index = [];
1837
+ for (const word of words) {
1838
+ let key = Phonetic.cache.key(this.algo, [word]);
1839
+ if (key) key += this.optKey;
1840
+ const code =
1841
+ Phonetic.cache.get(key || '') ??
1842
+ (() => {
1843
+ const res = this.encode(word);
1844
+ if (key) Phonetic.cache.set(key, res);
1845
+ return res;
1846
+ })();
1847
+ if (code && code.length) index.push(this.equalLen(code));
1740
1848
  }
1741
- transpos /= 2;
1742
- jaro = (matches / m + matches / n + (matches - transpos) / matches) / 3;
1743
- for (let i = 0; i < Math.min(4, m, n); i++) {
1744
- if (a[i] === b[i]) prefix++;
1745
- else break;
1849
+ return index;
1850
+ },
1851
+ `Failed to generate phonetic index`,
1852
+ { algo: this.algo, words }
1853
+ );
1854
+ }
1855
+ async loopAsync(words) {
1856
+ return ErrorUtil.wrapAsync(
1857
+ async () => {
1858
+ const index = [];
1859
+ for (const word of words) {
1860
+ const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
1861
+ const code = await Promise.resolve(
1862
+ Phonetic.cache.get(key || '') ??
1863
+ (() => {
1864
+ const res = this.encode(word);
1865
+ if (key) Phonetic.cache.set(key, res);
1866
+ return res;
1867
+ })()
1868
+ );
1869
+ if (code && code.length) index.push(this.equalLen(code));
1746
1870
  }
1747
- res = jaro + prefix * 0.1 * (1 - jaro);
1748
- }
1749
- return {
1750
- res: Metric.clamp(res),
1751
- raw: { matchWindow, matches, transpos, jaro, prefix }
1752
- };
1753
- } finally {
1754
- Pool.release('int32', matchA, m);
1755
- Pool.release('int32', matchB, n);
1756
- }
1871
+ return index;
1872
+ },
1873
+ `Failed to generate phonetic index asynchronously`,
1874
+ { algo: this.algo, words }
1875
+ );
1757
1876
  }
1758
- }
1759
- MetricRegistry.add('jaroWinkler', JaroWinklerDistance);
1760
-
1761
- class LCSMetric extends Metric {
1762
- constructor(a, b, opt = {}) {
1763
- super('lcs', a, b, opt, true);
1877
+ getAlgoName() {
1878
+ return this.algo;
1764
1879
  }
1765
- compute(a, b, m, n, maxLen) {
1766
- const len = m + 1;
1767
- const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1768
- try {
1769
- for (let i = 0; i <= m; i++) prev[i] = 0;
1770
- for (let j = 1; j <= n; j++) {
1771
- curr[0] = 0;
1772
- const cb = b.charCodeAt(j - 1);
1773
- for (let i = 1; i <= m; i++) {
1774
- if (a.charCodeAt(i - 1) === cb) curr[i] = prev[i - 1] + 1;
1775
- else curr[i] = Math.max(prev[i], curr[i - 1]);
1776
- }
1777
- prev.set(curr);
1778
- }
1779
- const lcs = prev[m];
1780
- return {
1781
- res: maxLen === 0 ? 1 : Metric.clamp(lcs / maxLen),
1782
- raw: { lcs, maxLen }
1783
- };
1784
- } finally {
1785
- Pool.release('int32', prev, len);
1786
- Pool.release('int32', curr, len);
1787
- }
1880
+ getIndex(input) {
1881
+ const { delimiter = ' ' } = this.options;
1882
+ return profiler$1.run(() =>
1883
+ this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
1884
+ );
1885
+ }
1886
+ async getIndexAsync(input) {
1887
+ const { delimiter = ' ' } = this.options;
1888
+ return (
1889
+ await profiler$1.runAsync(
1890
+ async () => await this.loopAsync(input.split(delimiter).filter(Boolean))
1891
+ )
1892
+ ).filter(Boolean);
1788
1893
  }
1789
1894
  }
1790
- MetricRegistry.add('lcs', LCSMetric);
1791
-
1792
- class LevenshteinDistance extends Metric {
1793
- constructor(a, b, opt = {}) {
1794
- super('levenshtein', a, b, opt, true);
1795
- }
1796
- compute(a, b, m, n, maxLen) {
1797
- const len = m + 1;
1798
- const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1799
- try {
1800
- for (let i = 0; i <= m; i++) prev[i] = i;
1801
- for (let j = 1; j <= n; j++) {
1802
- curr[0] = j;
1803
- const cb = b.charCodeAt(j - 1);
1804
- for (let i = 1; i <= m; i++) {
1805
- const cost = a.charCodeAt(i - 1) === cb ? 0 : 1;
1806
- curr[i] = Math.min(curr[i - 1] + 1, prev[i] + 1, prev[i - 1] + cost);
1807
- }
1808
- prev.set(curr);
1809
- }
1810
- const dist = prev[m];
1811
- return {
1812
- res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
1813
- raw: { dist, maxLen }
1814
- };
1815
- } finally {
1816
- Pool.release('int32', prev, len);
1817
- Pool.release('int32', curr, len);
1895
+ const PhoneticRegistry = Registry('phonetic', Phonetic);
1896
+ const PhoneticMappingRegistry = (() => {
1897
+ const mappings = Object.create(null);
1898
+ const maps = (algo) => (mappings[algo] ||= Object.create(null));
1899
+ return Object.freeze({
1900
+ add(algo, id, map, update = false) {
1901
+ const mappings = maps(algo);
1902
+ ErrorUtil.assert(
1903
+ !(!id || id in mappings) || update,
1904
+ `Entry <${id}> already exists / use <update=true> to overwrite`,
1905
+ { algo, id }
1906
+ );
1907
+ mappings[id] = map;
1908
+ },
1909
+ remove(algo, id) {
1910
+ delete maps(algo)[id];
1911
+ },
1912
+ has(algo, id) {
1913
+ return id in maps(algo);
1914
+ },
1915
+ get(algo, id) {
1916
+ return maps(algo)[id];
1917
+ },
1918
+ list(algo) {
1919
+ return Object.keys(maps(algo));
1818
1920
  }
1819
- }
1820
- }
1821
- MetricRegistry.add('levenshtein', LevenshteinDistance);
1921
+ });
1922
+ })();
1822
1923
 
1823
- class NeedlemanWunschDistance extends Metric {
1824
- constructor(a, b, opt = {}) {
1825
- super('needlemanWunsch', a, b, opt, true);
1924
+ class Caverphone extends Phonetic {
1925
+ static REGEX = { uppercase: /[^A-Z]/gi };
1926
+ static default = {
1927
+ map: 'en2',
1928
+ delimiter: ' ',
1929
+ length: -1,
1930
+ pad: '',
1931
+ dedupe: false
1932
+ };
1933
+ constructor(opt = {}) {
1934
+ super('caverphone', opt);
1826
1935
  }
1827
- compute(a, b, m, n, maxLen) {
1828
- const { match = 1, mismatch = -1, gap = -1 } = this.options;
1829
- const len = m + 1;
1830
- const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1831
- try {
1832
- prev[0] = 0;
1833
- for (let i = 1; i <= m; i++) prev[i] = prev[i - 1] + gap;
1834
- for (let j = 1; j <= n; j++) {
1835
- curr[0] = prev[0] + gap;
1836
- const cb = b.charCodeAt(j - 1);
1837
- for (let i = 1; i <= m; i++) {
1838
- const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
1839
- curr[i] = Math.max(
1840
- prev[i - 1] + score,
1841
- prev[i] + gap,
1842
- curr[i - 1] + gap
1843
- );
1844
- }
1845
- prev.set(curr);
1846
- }
1847
- const score = prev[m];
1848
- const denum = maxLen * match;
1849
- return {
1850
- res: denum === 0 ? 0 : Metric.clamp(score / denum),
1851
- raw: { score, denum }
1852
- };
1853
- } finally {
1854
- Pool.release('int32', prev, len);
1855
- Pool.release('int32', curr, len);
1856
- }
1936
+ encode(word) {
1937
+ word = word.replace(Caverphone.REGEX.uppercase, '').toLowerCase();
1938
+ return super.encode(word);
1857
1939
  }
1940
+ mapChar = (char) => char;
1941
+ adjustCode = (code) => code.toUpperCase();
1858
1942
  }
1859
- MetricRegistry.add('needlemanWunsch', NeedlemanWunschDistance);
1943
+ PhoneticRegistry.add('caverphone', Caverphone);
1944
+ PhoneticMappingRegistry.add('caverphone', 'en1', {
1945
+ options: { length: 6, pad: '1' },
1946
+ map: {},
1947
+ patterns: [
1948
+ { pattern: /^(c|r|t|en)ough/, replace: '$1ou2f' },
1949
+ { pattern: /^gn/, replace: '2n' },
1950
+ { pattern: /mb$/, replace: 'm2' },
1951
+ { pattern: /cq/g, replace: '2q' },
1952
+ { pattern: /c(e|i|y)/g, replace: 's$1' },
1953
+ { pattern: /tch/g, replace: '2ch' },
1954
+ { pattern: /[cqx]/g, replace: 'k' },
1955
+ { pattern: /v/g, replace: 'f' },
1956
+ { pattern: /dg/g, replace: '2g' },
1957
+ { pattern: /ti(a|o)/g, replace: 'si$1' },
1958
+ { pattern: /d/g, replace: 't' },
1959
+ { pattern: /ph/g, replace: 'fh' },
1960
+ { pattern: /b/g, replace: 'p' },
1961
+ { pattern: /sh/g, replace: 's2' },
1962
+ { pattern: /z/g, replace: 's' },
1963
+ { pattern: /^[aeiou]/, replace: 'A' },
1964
+ { pattern: /[aeiou]/g, replace: '3' },
1965
+ { pattern: /3gh3/g, replace: '3kh3' },
1966
+ { pattern: /gh/g, replace: '22' },
1967
+ { pattern: /g/g, replace: 'k' },
1968
+ { pattern: /s+/g, replace: 'S' },
1969
+ { pattern: /t+/g, replace: 'T' },
1970
+ { pattern: /p+/g, replace: 'P' },
1971
+ { pattern: /k+/g, replace: 'K' },
1972
+ { pattern: /f+/g, replace: 'F' },
1973
+ { pattern: /m+/g, replace: 'M' },
1974
+ { pattern: /n+/g, replace: 'N' },
1975
+ { pattern: /j/g, replace: 'y' },
1976
+ { pattern: /l3/g, replace: 'L3' },
1977
+ { pattern: /r3/g, replace: 'R3' },
1978
+ { pattern: /w3/g, replace: 'W3' },
1979
+ { pattern: /y3/g, replace: 'Y3' },
1980
+ { pattern: /ly/g, replace: 'Ly' },
1981
+ { pattern: /ry/g, replace: 'Ry' },
1982
+ { pattern: /wy/g, replace: 'Wy' },
1983
+ { pattern: /wh3/g, replace: 'Wh3' },
1984
+ { pattern: /why/g, replace: 'Why' },
1985
+ { pattern: /^h/, replace: 'A' },
1986
+ { pattern: /[hlrwy23]/g, replace: '' }
1987
+ ]
1988
+ });
1989
+ PhoneticMappingRegistry.add('caverphone', 'en2', {
1990
+ options: { length: 10, pad: '1' },
1991
+ map: {},
1992
+ patterns: [
1993
+ { pattern: /e$/, replace: '' },
1994
+ { pattern: /^(c|r|t|en|tr)ough/, replace: '$1ou2f' },
1995
+ { pattern: /^gn/, replace: '2n' },
1996
+ { pattern: /mb$/, replace: 'm2' },
1997
+ { pattern: /cq/g, replace: '2q' },
1998
+ { pattern: /c(e|i|y)/g, replace: 's$1' },
1999
+ { pattern: /tch/g, replace: '2ch' },
2000
+ { pattern: /[cqx]/g, replace: 'k' },
2001
+ { pattern: /v/g, replace: 'f' },
2002
+ { pattern: /dg/g, replace: '2g' },
2003
+ { pattern: /ti(a|o)/g, replace: 'si$1' },
2004
+ { pattern: /d/g, replace: 't' },
2005
+ { pattern: /ph/g, replace: 'fh' },
2006
+ { pattern: /b/g, replace: 'p' },
2007
+ { pattern: /sh/g, replace: 's2' },
2008
+ { pattern: /z/g, replace: 's' },
2009
+ { pattern: /^[aeiou]/, replace: 'A' },
2010
+ { pattern: /[aeiou]/g, replace: '3' },
2011
+ { pattern: /j/g, replace: 'y' },
2012
+ { pattern: /^y3/, replace: 'Y3' },
2013
+ { pattern: /^y/, replace: 'A' },
2014
+ { pattern: /y/g, replace: '3' },
2015
+ { pattern: /3gh3/g, replace: '3kh3' },
2016
+ { pattern: /gh/g, replace: '22' },
2017
+ { pattern: /g/g, replace: 'k' },
2018
+ { pattern: /s+/g, replace: 'S' },
2019
+ { pattern: /t+/g, replace: 'T' },
2020
+ { pattern: /p+/g, replace: 'P' },
2021
+ { pattern: /k+/g, replace: 'K' },
2022
+ { pattern: /f+/g, replace: 'F' },
2023
+ { pattern: /m+/g, replace: 'M' },
2024
+ { pattern: /n+/g, replace: 'N' },
2025
+ { pattern: /l3/g, replace: 'L3' },
2026
+ { pattern: /r3/g, replace: 'R3' },
2027
+ { pattern: /w3/g, replace: 'W3' },
2028
+ { pattern: /wh3/g, replace: 'Wh3' },
2029
+ { pattern: /[lrw]$/, replace: '3' },
2030
+ { pattern: /^h/, replace: 'A' },
2031
+ { pattern: /3$/, replace: 'A' },
2032
+ { pattern: /[hlrw23]/g, replace: '' }
2033
+ ]
2034
+ });
1860
2035
 
1861
- class QGramSimilarity extends Metric {
1862
- constructor(a, b, opt = {}) {
1863
- super('qGram', a, b, opt, true);
1864
- }
1865
- _qGrams(str, q) {
1866
- const len = Math.max(0, str.length - q + 1);
1867
- const grams = Pool.acquire('set', len);
1868
- for (let i = 0; i < len; i++) grams.add(str.slice(i, i + q));
1869
- return grams;
2036
+ class Cologne extends Phonetic {
2037
+ static default = { map: 'default', delimiter: ' ', length: -1, dedupe: true };
2038
+ constructor(opt = {}) {
2039
+ super('cologne', opt);
1870
2040
  }
1871
- compute(a, b) {
1872
- const { q = 2 } = this.options;
1873
- const setA = this._qGrams(a, q),
1874
- setB = this._qGrams(b, q);
1875
- const sizeA = setA.size,
1876
- sizeB = setB.size;
1877
- try {
1878
- let intersection = 0;
1879
- for (const gram of setA) if (setB.has(gram)) intersection++;
1880
- const size = Math.max(sizeA, sizeB);
1881
- return {
1882
- res: size === 0 ? 1 : Metric.clamp(intersection / size),
1883
- raw: { intersection, size }
1884
- };
1885
- } finally {
1886
- Pool.release('set', setA, sizeA);
1887
- Pool.release('set', setB, sizeB);
1888
- }
2041
+ adjustCode(code) {
2042
+ return code.slice(0, 1) + code.slice(1).replaceAll('0', '');
1889
2043
  }
1890
2044
  }
1891
- MetricRegistry.add('qGram', QGramSimilarity);
1892
-
1893
- class SmithWatermanDistance extends Metric {
1894
- constructor(a, b, opt = {}) {
1895
- super('smithWaterman', a, b, opt, true);
1896
- }
1897
- compute(a, b, m, n) {
1898
- const { match = 2, mismatch = -1, gap = -2 } = this.options;
1899
- const len = m + 1;
1900
- const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1901
- let maxScore = 0;
1902
- try {
1903
- for (let i = 0; i <= m; i++) prev[i] = 0;
1904
- for (let j = 1; j <= n; j++) {
1905
- curr[0] = 0;
1906
- const cb = b.charCodeAt(j - 1);
1907
- for (let i = 1; i <= m; i++) {
1908
- const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
1909
- curr[i] = Math.max(
1910
- 0,
1911
- prev[i - 1] + score,
1912
- prev[i] + gap,
1913
- curr[i - 1] + gap
1914
- );
1915
- if (curr[i] > maxScore) maxScore = curr[i];
1916
- }
1917
- prev.set(curr);
1918
- }
1919
- const denum = Math.min(m * match, n * match);
1920
- return {
1921
- res: denum === 0 ? 0 : Metric.clamp(maxScore / denum),
1922
- raw: { score: maxScore, denum }
1923
- };
1924
- } finally {
1925
- Pool.release('int32', prev, len);
1926
- Pool.release('int32', curr, len);
1927
- }
1928
- }
1929
- }
1930
- MetricRegistry.add('smithWaterman', SmithWatermanDistance);
1931
-
1932
- const profiler$1 = Profiler.getInstance();
1933
- class Phonetic {
1934
- static cache = new HashTable();
1935
- static default;
1936
- algo;
1937
- options;
1938
- optKey;
1939
- map;
1940
- static clear = () => this.cache.clear();
1941
- constructor(algo, opt = {}) {
1942
- const defaults = this.constructor.default ?? {};
1943
- const mapId = opt.map ?? defaults.map;
1944
- if (!mapId)
1945
- throw new CmpStrNotFoundError(
1946
- `No mapping specified for phonetic algorithm`,
1947
- { algo }
1948
- );
1949
- const map = PhoneticMappingRegistry.get(algo, mapId);
1950
- if (map === undefined)
1951
- throw new CmpStrNotFoundError(
1952
- `Requested mapping <${mapId}> is not declared`,
1953
- { algo, mapId }
1954
- );
1955
- this.options = merge(merge(defaults, map.options ?? {}), opt);
1956
- this.optKey = Hasher.fastFNV1a(
1957
- JSON.stringify(this.options, Object.keys(this.options).sort())
1958
- ).toString();
1959
- this.algo = algo;
1960
- this.map = map;
1961
- }
1962
- applyPattern(word) {
1963
- const { patterns = [] } = this.map;
1964
- if (!patterns || !patterns.length) return word;
1965
- for (const { pattern, replace, all = false } of patterns) {
1966
- word = word[all ? 'replaceAll' : 'replace'](pattern, replace);
1967
- }
1968
- return word;
1969
- }
1970
- applyRules(char, i, chars, charLen) {
1971
- const { ruleset = [] } = this.map;
1972
- if (!ruleset || !ruleset.length) return undefined;
1973
- const prev = chars[i - 1] || '',
1974
- prev2 = chars[i - 2] || '';
1975
- const next = chars[i + 1] || '',
1976
- next2 = chars[i + 2] || '';
1977
- for (const rule of ruleset) {
1978
- if (rule.char && rule.char !== char) continue;
1979
- if (rule.position === 'start' && i !== 0) continue;
1980
- if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
1981
- continue;
1982
- if (rule.position === 'end' && i !== charLen) continue;
1983
- if (rule.prev && !rule.prev.includes(prev)) continue;
1984
- if (rule.prevNot && rule.prevNot.includes(prev)) continue;
1985
- if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
1986
- if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
1987
- if (rule.next && !rule.next.includes(next)) continue;
1988
- if (rule.nextNot && rule.nextNot.includes(next)) continue;
1989
- if (rule.next2 && !rule.next2.includes(next2)) continue;
1990
- if (rule.next2Not && rule.next2Not.includes(next2)) continue;
1991
- if (
1992
- rule.leading &&
1993
- !rule.leading.includes(chars.slice(0, rule.leading.length).join(''))
1994
- )
1995
- continue;
1996
- if (
1997
- rule.trailing &&
1998
- !rule.trailing.includes(chars.slice(-rule.trailing.length).join(''))
1999
- )
2000
- continue;
2001
- if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
2002
- continue;
2003
- return rule.code;
2004
- }
2005
- return undefined;
2006
- }
2007
- encode(word) {
2008
- const { map = {}, ignore = [] } = this.map;
2009
- word = this.applyPattern(word);
2010
- const chars = this.word2Chars(word);
2011
- const charLen = chars.length;
2012
- let code = '',
2013
- lastCode = null;
2014
- for (let i = 0; i < charLen; i++) {
2015
- const char = chars[i];
2016
- if (ignore.includes(char)) continue;
2017
- const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
2018
- if (mapped === undefined) continue;
2019
- ((code += mapped), (lastCode = mapped));
2020
- if (this.exitEarly(code, i)) break;
2021
- }
2022
- return this.adjustCode(code, chars);
2023
- }
2024
- mapChar(char, i, chars, charLen, lastCode, map) {
2025
- const { dedupe = true, fallback = undefined } = this.options;
2026
- const c = this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
2027
- return dedupe && c === lastCode ? undefined : c;
2028
- }
2029
- equalLen(input) {
2030
- const { length = -1, pad = '0' } = this.options;
2031
- return length === -1
2032
- ? input
2033
- : (input + pad.repeat(length)).slice(0, length);
2034
- }
2035
- word2Chars = (word) => word.toLowerCase().split('');
2036
- exitEarly(code, i) {
2037
- const { length = -1 } = this.options;
2038
- return length > 0 && code.length >= length;
2039
- }
2040
- adjustCode(code, chars) {
2041
- return code;
2042
- }
2043
- loop(words) {
2044
- return ErrorUtil.wrap(
2045
- () => {
2046
- const index = [];
2047
- for (const word of words) {
2048
- const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
2049
- const code =
2050
- Phonetic.cache.get(key || '') ??
2051
- (() => {
2052
- const res = this.encode(word);
2053
- if (key) Phonetic.cache.set(key, res);
2054
- return res;
2055
- })();
2056
- if (code && code.length) index.push(this.equalLen(code));
2057
- }
2058
- return index;
2059
- },
2060
- `Failed to generate phonetic index`,
2061
- { algo: this.algo, words }
2062
- );
2063
- }
2064
- async loopAsync(words) {
2065
- return ErrorUtil.wrapAsync(
2066
- async () => {
2067
- const index = [];
2068
- for (const word of words) {
2069
- const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
2070
- const code = await Promise.resolve(
2071
- Phonetic.cache.get(key || '') ??
2072
- (() => {
2073
- const res = this.encode(word);
2074
- if (key) Phonetic.cache.set(key, res);
2075
- return res;
2076
- })()
2077
- );
2078
- if (code && code.length) index.push(this.equalLen(code));
2079
- }
2080
- return index;
2081
- },
2082
- `Failed to generate phonetic index asynchronously`,
2083
- { algo: this.algo, words }
2084
- );
2085
- }
2086
- getAlgoName = () => this.algo;
2087
- getIndex(input) {
2088
- const { delimiter = ' ' } = this.options;
2089
- return profiler$1.run(() =>
2090
- this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
2091
- );
2092
- }
2093
- async getIndexAsync(input) {
2094
- const { delimiter = ' ' } = this.options;
2095
- return (
2096
- await profiler$1.runAsync(
2097
- async () => await this.loopAsync(input.split(delimiter).filter(Boolean))
2098
- )
2099
- ).filter(Boolean);
2100
- }
2101
- }
2102
- const PhoneticRegistry = Registry('phonetic', Phonetic);
2103
- const PhoneticMappingRegistry = (() => {
2104
- const mappings = Object.create(null);
2105
- const maps = (algo) => (mappings[algo] ||= Object.create(null));
2106
- return Object.freeze({
2107
- add(algo, id, map, update = false) {
2108
- const mappings = maps(algo);
2109
- ErrorUtil.assert(
2110
- !(!id || id in mappings) || update,
2111
- `Entry <${id}> already exists / use <update=true> to overwrite`,
2112
- { algo, id }
2113
- );
2114
- mappings[id] = map;
2115
- },
2116
- remove(algo, id) {
2117
- delete maps(algo)[id];
2118
- },
2119
- has(algo, id) {
2120
- return id in maps(algo);
2045
+ PhoneticRegistry.add('cologne', Cologne);
2046
+ PhoneticMappingRegistry.add('cologne', 'default', {
2047
+ map: {
2048
+ a: '0',
2049
+ ä: '0',
2050
+ e: '0',
2051
+ i: '0',
2052
+ j: '0',
2053
+ o: '0',
2054
+ ö: '0',
2055
+ u: '0',
2056
+ ü: '0',
2057
+ y: '0',
2058
+ b: '1',
2059
+ p: '1',
2060
+ d: '2',
2061
+ t: '2',
2062
+ f: '3',
2063
+ v: '3',
2064
+ w: '3',
2065
+ g: '4',
2066
+ k: '4',
2067
+ q: '4',
2068
+ l: '5',
2069
+ m: '6',
2070
+ n: '6',
2071
+ r: '7',
2072
+ c: '8',
2073
+ s: '8',
2074
+ ß: '8',
2075
+ z: '8',
2076
+ x: '48'
2077
+ },
2078
+ ignore: ['h'],
2079
+ ruleset: [
2080
+ { char: 'p', next: ['h'], code: '3' },
2081
+ {
2082
+ char: 'c',
2083
+ position: 'start',
2084
+ next: ['a', 'h', 'k', 'l', 'o', 'q', 'r', 'u', 'x'],
2085
+ code: '4'
2121
2086
  },
2122
- get(algo, id) {
2123
- return maps(algo)[id];
2087
+ {
2088
+ char: 'c',
2089
+ next: ['a', 'h', 'k', 'o', 'q', 'u', 'x'],
2090
+ prevNot: ['s', 'z'],
2091
+ code: '4'
2124
2092
  },
2125
- list(algo) {
2126
- return Object.keys(maps(algo));
2127
- }
2128
- });
2129
- })();
2093
+ { char: 'd', next: ['c', 's', 'z'], code: '8' },
2094
+ { char: 't', next: ['c', 's', 'z'], code: '8' },
2095
+ { char: 'x', prev: ['c', 'k', 'q'], code: '8' }
2096
+ ]
2097
+ });
2130
2098
 
2131
- class Caverphone extends Phonetic {
2132
- static REGEX = { uppercase: /[^A-Z]/gi };
2099
+ class Metaphone extends Phonetic {
2100
+ static REGEX = { adjacent: /([A-BD-Z])\1+/gi, vowel: /[AEIOU]/g };
2133
2101
  static default = {
2134
- map: 'en2',
2102
+ map: 'en90',
2135
2103
  delimiter: ' ',
2136
2104
  length: -1,
2137
2105
  pad: '',
2138
2106
  dedupe: false
2139
2107
  };
2140
2108
  constructor(opt = {}) {
2141
- super('caverphone', opt);
2109
+ super('metaphone', opt);
2142
2110
  }
2143
2111
  encode(word) {
2144
- word = word.replace(Caverphone.REGEX.uppercase, '').toLowerCase();
2112
+ word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
2113
+ c === 'C' ? m : c
2114
+ );
2145
2115
  return super.encode(word);
2146
2116
  }
2147
- mapChar = (char) => char;
2148
- adjustCode = (code) => code.toUpperCase();
2117
+ adjustCode(code) {
2118
+ return code.slice(0, 1) + code.slice(1).replace(Metaphone.REGEX.vowel, '');
2119
+ }
2149
2120
  }
2150
- PhoneticRegistry.add('caverphone', Caverphone);
2151
- PhoneticMappingRegistry.add('caverphone', 'en1', {
2152
- options: { length: 6, pad: '1' },
2153
- map: {},
2154
- patterns: [
2155
- { pattern: /^(c|r|t|en)ough/, replace: '$1ou2f' },
2156
- { pattern: /^gn/, replace: '2n' },
2157
- { pattern: /mb$/, replace: 'm2' },
2158
- { pattern: /cq/g, replace: '2q' },
2159
- { pattern: /c(e|i|y)/g, replace: 's$1' },
2160
- { pattern: /tch/g, replace: '2ch' },
2161
- { pattern: /[cqx]/g, replace: 'k' },
2162
- { pattern: /v/g, replace: 'f' },
2163
- { pattern: /dg/g, replace: '2g' },
2164
- { pattern: /ti(a|o)/g, replace: 'si$1' },
2165
- { pattern: /d/g, replace: 't' },
2166
- { pattern: /ph/g, replace: 'fh' },
2167
- { pattern: /b/g, replace: 'p' },
2168
- { pattern: /sh/g, replace: 's2' },
2169
- { pattern: /z/g, replace: 's' },
2170
- { pattern: /^[aeiou]/, replace: 'A' },
2171
- { pattern: /[aeiou]/g, replace: '3' },
2172
- { pattern: /3gh3/g, replace: '3kh3' },
2173
- { pattern: /gh/g, replace: '22' },
2174
- { pattern: /g/g, replace: 'k' },
2175
- { pattern: /s+/g, replace: 'S' },
2176
- { pattern: /t+/g, replace: 'T' },
2177
- { pattern: /p+/g, replace: 'P' },
2178
- { pattern: /k+/g, replace: 'K' },
2179
- { pattern: /f+/g, replace: 'F' },
2180
- { pattern: /m+/g, replace: 'M' },
2181
- { pattern: /n+/g, replace: 'N' },
2182
- { pattern: /j/g, replace: 'y' },
2183
- { pattern: /l3/g, replace: 'L3' },
2184
- { pattern: /r3/g, replace: 'R3' },
2185
- { pattern: /w3/g, replace: 'W3' },
2186
- { pattern: /y3/g, replace: 'Y3' },
2187
- { pattern: /ly/g, replace: 'Ly' },
2188
- { pattern: /ry/g, replace: 'Ry' },
2189
- { pattern: /wy/g, replace: 'Wy' },
2190
- { pattern: /wh3/g, replace: 'Wh3' },
2191
- { pattern: /why/g, replace: 'Why' },
2192
- { pattern: /^h/, replace: 'A' },
2193
- { pattern: /[hlrwy23]/g, replace: '' }
2194
- ]
2195
- });
2196
- PhoneticMappingRegistry.add('caverphone', 'en2', {
2197
- options: { length: 10, pad: '1' },
2198
- map: {},
2199
- patterns: [
2200
- { pattern: /e$/, replace: '' },
2201
- { pattern: /^(c|r|t|en|tr)ough/, replace: '$1ou2f' },
2202
- { pattern: /^gn/, replace: '2n' },
2203
- { pattern: /mb$/, replace: 'm2' },
2204
- { pattern: /cq/g, replace: '2q' },
2205
- { pattern: /c(e|i|y)/g, replace: 's$1' },
2206
- { pattern: /tch/g, replace: '2ch' },
2207
- { pattern: /[cqx]/g, replace: 'k' },
2208
- { pattern: /v/g, replace: 'f' },
2209
- { pattern: /dg/g, replace: '2g' },
2210
- { pattern: /ti(a|o)/g, replace: 'si$1' },
2211
- { pattern: /d/g, replace: 't' },
2212
- { pattern: /ph/g, replace: 'fh' },
2213
- { pattern: /b/g, replace: 'p' },
2214
- { pattern: /sh/g, replace: 's2' },
2215
- { pattern: /z/g, replace: 's' },
2216
- { pattern: /^[aeiou]/, replace: 'A' },
2217
- { pattern: /[aeiou]/g, replace: '3' },
2218
- { pattern: /j/g, replace: 'y' },
2219
- { pattern: /^y3/, replace: 'Y3' },
2220
- { pattern: /^y/, replace: 'A' },
2221
- { pattern: /y/g, replace: '3' },
2222
- { pattern: /3gh3/g, replace: '3kh3' },
2223
- { pattern: /gh/g, replace: '22' },
2224
- { pattern: /g/g, replace: 'k' },
2225
- { pattern: /s+/g, replace: 'S' },
2226
- { pattern: /t+/g, replace: 'T' },
2227
- { pattern: /p+/g, replace: 'P' },
2228
- { pattern: /k+/g, replace: 'K' },
2229
- { pattern: /f+/g, replace: 'F' },
2230
- { pattern: /m+/g, replace: 'M' },
2231
- { pattern: /n+/g, replace: 'N' },
2232
- { pattern: /l3/g, replace: 'L3' },
2233
- { pattern: /r3/g, replace: 'R3' },
2234
- { pattern: /w3/g, replace: 'W3' },
2235
- { pattern: /wh3/g, replace: 'Wh3' },
2236
- { pattern: /[lrw]$/, replace: '3' },
2237
- { pattern: /^h/, replace: 'A' },
2238
- { pattern: /3$/, replace: 'A' },
2239
- { pattern: /[hlrw23]/g, replace: '' }
2121
+ PhoneticRegistry.add('metaphone', Metaphone);
2122
+ PhoneticMappingRegistry.add('metaphone', 'en90', {
2123
+ map: {
2124
+ a: 'A',
2125
+ b: 'B',
2126
+ c: 'K',
2127
+ d: 'T',
2128
+ e: 'E',
2129
+ f: 'F',
2130
+ g: 'K',
2131
+ h: 'H',
2132
+ i: 'I',
2133
+ j: 'J',
2134
+ k: 'K',
2135
+ l: 'L',
2136
+ m: 'M',
2137
+ n: 'N',
2138
+ o: 'O',
2139
+ p: 'P',
2140
+ q: 'K',
2141
+ r: 'R',
2142
+ s: 'S',
2143
+ t: 'T',
2144
+ u: 'U',
2145
+ v: 'F',
2146
+ w: 'W',
2147
+ x: 'KS',
2148
+ y: 'Y',
2149
+ z: 'S'
2150
+ },
2151
+ ruleset: [
2152
+ { char: 'a', position: 'start', next: ['e'], code: '' },
2153
+ { char: 'g', position: 'start', next: ['n'], code: '' },
2154
+ { char: 'k', position: 'start', next: ['n'], code: '' },
2155
+ { char: 'p', position: 'start', next: ['n'], code: '' },
2156
+ { char: 'w', position: 'start', next: ['r'], code: '' },
2157
+ { char: 'b', position: 'end', prev: ['m'], code: '' },
2158
+ { char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
2159
+ { char: 'c', next: ['i'], next2: ['a'], code: 'X' },
2160
+ { char: 'c', next: ['e', 'i', 'y'], code: 'S' },
2161
+ { char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
2162
+ {
2163
+ char: 'g',
2164
+ next: ['h'],
2165
+ next2Not: ['', 'a', 'e', 'i', 'o', 'u'],
2166
+ code: ''
2167
+ },
2168
+ { char: 'g', trailing: 'n', code: '' },
2169
+ { char: 'g', trailing: 'ned', code: '' },
2170
+ { char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
2171
+ {
2172
+ char: 'h',
2173
+ prev: ['a', 'e', 'i', 'o', 'u'],
2174
+ nextNot: ['a', 'e', 'i', 'o', 'u'],
2175
+ code: ''
2176
+ },
2177
+ { char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
2178
+ { char: 'k', prev: ['c'], code: '' },
2179
+ { char: 'p', next: ['h'], code: 'F' },
2180
+ { char: 's', next: ['h'], code: 'X' },
2181
+ { char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
2182
+ { char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
2183
+ { char: 't', next: ['h'], code: '0' },
2184
+ { char: 't', next: ['c'], next2: ['h'], code: '' },
2185
+ { char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
2186
+ { char: 'h', leading: 'w', code: '' },
2187
+ { char: 'x', position: 'start', code: 'S' },
2188
+ { char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
2240
2189
  ]
2241
2190
  });
2242
2191
 
2243
- class Cologne extends Phonetic {
2244
- static default = { map: 'default', delimiter: ' ', length: -1, dedupe: true };
2192
+ class Soundex extends Phonetic {
2193
+ static default = {
2194
+ map: 'en',
2195
+ delimiter: ' ',
2196
+ length: 4,
2197
+ pad: '0',
2198
+ dedupe: true
2199
+ };
2245
2200
  constructor(opt = {}) {
2246
- super('cologne', opt);
2201
+ super('soundex', opt);
2247
2202
  }
2248
- adjustCode(code) {
2249
- return code.slice(0, 1) + code.slice(1).replaceAll('0', '');
2203
+ adjustCode(code, chars) {
2204
+ return chars[0].toUpperCase() + code.slice(1).replaceAll('0', '');
2250
2205
  }
2251
2206
  }
2252
- PhoneticRegistry.add('cologne', Cologne);
2253
- PhoneticMappingRegistry.add('cologne', 'default', {
2207
+ PhoneticRegistry.add('soundex', Soundex);
2208
+ PhoneticMappingRegistry.add('soundex', 'en', {
2209
+ map: {
2210
+ a: '0',
2211
+ e: '0',
2212
+ h: '0',
2213
+ i: '0',
2214
+ o: '0',
2215
+ u: '0',
2216
+ w: '0',
2217
+ y: '0',
2218
+ b: '1',
2219
+ f: '1',
2220
+ p: '1',
2221
+ v: '1',
2222
+ c: '2',
2223
+ g: '2',
2224
+ j: '2',
2225
+ k: '2',
2226
+ q: '2',
2227
+ s: '2',
2228
+ x: '2',
2229
+ z: '2',
2230
+ d: '3',
2231
+ t: '3',
2232
+ l: '4',
2233
+ m: '5',
2234
+ n: '5',
2235
+ r: '6'
2236
+ }
2237
+ });
2238
+ PhoneticMappingRegistry.add('soundex', 'de', {
2254
2239
  map: {
2255
2240
  a: '0',
2256
2241
  ä: '0',
2257
2242
  e: '0',
2243
+ h: '0',
2258
2244
  i: '0',
2259
2245
  j: '0',
2260
2246
  o: '0',
@@ -2263,220 +2249,596 @@ PhoneticMappingRegistry.add('cologne', 'default', {
2263
2249
  ü: '0',
2264
2250
  y: '0',
2265
2251
  b: '1',
2252
+ f: '1',
2266
2253
  p: '1',
2267
- d: '2',
2268
- t: '2',
2269
- f: '3',
2270
- v: '3',
2271
- w: '3',
2272
- g: '4',
2273
- k: '4',
2274
- q: '4',
2275
- l: '5',
2276
- m: '6',
2277
- n: '6',
2278
- r: '7',
2279
- c: '8',
2280
- s: '8',
2281
- ß: '8',
2282
- z: '8',
2283
- x: '48'
2254
+ v: '1',
2255
+ w: '1',
2256
+ c: '2',
2257
+ g: '2',
2258
+ k: '2',
2259
+ q: '2',
2260
+ s: '2',
2261
+ ß: '2',
2262
+ x: '2',
2263
+ z: '2',
2264
+ d: '3',
2265
+ t: '3',
2266
+ l: '4',
2267
+ m: '5',
2268
+ n: '5',
2269
+ r: '6'
2284
2270
  },
2285
- ignore: ['h'],
2286
- ruleset: [
2287
- { char: 'p', next: ['h'], code: '3' },
2288
- {
2289
- char: 'c',
2290
- position: 'start',
2291
- next: ['a', 'h', 'k', 'l', 'o', 'q', 'r', 'u', 'x'],
2292
- code: '4'
2293
- },
2294
- {
2295
- char: 'c',
2296
- next: ['a', 'h', 'k', 'o', 'q', 'u', 'x'],
2297
- prevNot: ['s', 'z'],
2298
- code: '4'
2299
- },
2300
- { char: 'd', next: ['c', 's', 'z'], code: '8' },
2301
- { char: 't', next: ['c', 's', 'z'], code: '8' },
2302
- { char: 'x', prev: ['c', 'k', 'q'], code: '8' }
2303
- ]
2271
+ ruleset: [{ char: 'c', next: ['h'], code: '7' }]
2304
2272
  });
2305
2273
 
2306
- class Metaphone extends Phonetic {
2307
- static REGEX = { adjacent: /([A-BD-Z])\1+/gi, vowel: /[AEIOU]/g };
2308
- static default = {
2309
- map: 'en90',
2310
- delimiter: ' ',
2311
- length: -1,
2312
- pad: '',
2313
- dedupe: false
2274
/**
 * Static validator for user-supplied CmpStr options.
 *
 * Every `validate*` method returns nothing on success and throws a
 * `CmpStrValidationError` on the first problem it finds. A value of
 * `undefined` is always accepted (the option is simply not set).
 */
class OptionsValidator {
  /** Single-character flags accepted by {@link OptionsValidator.validateFlags}. */
  static ALLOWED_FLAGS = new Set([
    'd',
    'u',
    'x',
    'w',
    't',
    'r',
    's',
    'k',
    'n',
    'i'
  ]);
  /** Values accepted for the `output` option. */
  static ALLOWED_OUTPUT = new Set(['orig', 'prep']);
  /** Values accepted for the metric `mode` option. */
  static ALLOWED_MODES = new Set(['default', 'batch', 'single', 'pairwise']);
  /** String values accepted for the `sort` option (booleans are accepted too). */
  static ALLOWED_SORT = new Set(['asc', 'desc']);
  /** Validators keyed by processor type; used by `validateProcessors`. */
  static PROCESSORS = {
    phonetic: (opt) => {
      if (!opt) return;
      OptionsValidator.validatePhoneticName(opt.algo);
      OptionsValidator.validatePhoneticOptions(opt.opt);
    }
  };
  /** Validators for the keys allowed inside the metric `opt` object. */
  static METRIC_OPT_MAP = {
    mode: (v) => OptionsValidator.validateMode(v),
    delimiter: (v) => OptionsValidator.validateString(v, 'opt.delimiter'),
    pad: (v) => OptionsValidator.validateString(v, 'opt.pad'),
    q: (v) => OptionsValidator.validateNumber(v, 'opt.q'),
    match: (v) => OptionsValidator.validateNumber(v, 'opt.match'),
    mismatch: (v) => OptionsValidator.validateNumber(v, 'opt.mismatch'),
    gap: (v) => OptionsValidator.validateNumber(v, 'opt.gap')
  };
  /** Validators for the keys allowed inside `processors.phonetic.opt`. */
  static PHONETIC_OPT_MAP = {
    map: (v) =>
      OptionsValidator.validateString(v, 'processors.phonetic.opt.map'),
    delimiter: (v) =>
      OptionsValidator.validateString(v, 'processors.phonetic.opt.delimiter'),
    length: (v) =>
      OptionsValidator.validateNumber(v, 'processors.phonetic.opt.length'),
    pad: (v) =>
      OptionsValidator.validateString(v, 'processors.phonetic.opt.pad'),
    dedupe: (v) =>
      OptionsValidator.validateBoolean(v, 'processors.phonetic.opt.dedupe'),
    fallback: (v) =>
      OptionsValidator.validateString(v, 'processors.phonetic.opt.fallback')
  };
  /** Validators for the keys allowed in the top-level options object. */
  static CMPSTR_OPT_MAP = {
    raw: (v) => OptionsValidator.validateBoolean(v, 'raw'),
    removeZero: (v) => OptionsValidator.validateBoolean(v, 'removeZero'),
    safeEmpty: (v) => OptionsValidator.validateBoolean(v, 'safeEmpty'),
    flags: (v) => OptionsValidator.validateFlags(v),
    metric: (v) => OptionsValidator.validateMetricName(v),
    output: (v) => OptionsValidator.validateOutput(v),
    opt: (v) => OptionsValidator.validateMetricOptions(v),
    processors: (v) => OptionsValidator.validateProcessors(v),
    sort: (v) => OptionsValidator.validateSort(v, 'sort'),
    objectsOnly: (v) => OptionsValidator.validateBoolean(v, 'objectsOnly')
  };

  /**
   * Formats a set of allowed values for use in an error message.
   *
   * @param {Set<string>} set - Allowed values
   * @returns {string} The values joined with ` | `
   */
  static set2string(set) {
    return Array.from(set).join(' | ');
  }

  /**
   * Checks a value against a `typeof` tag. `NaN` is rejected when a number
   * is expected.
   *
   * @param {*} value - Value to check; `undefined` is accepted
   * @param {string} name - Option name used in the error message
   * @param {string} type - Expected `typeof` result
   * @throws {CmpStrValidationError} If the type does not match
   */
  static validateType(value, name, type) {
    if (value === undefined) return;
    if (typeof value !== type || (type === 'number' && Number.isNaN(value))) {
      throw new CmpStrValidationError(
        `Invalid option <${name}>: expected ${type}`,
        { name, value }
      );
    }
  }

  /**
   * Checks that a value is a string contained in the given set.
   *
   * @param {*} value - Value to check; `undefined` is accepted
   * @param {string} name - Option name used in the error message
   * @param {Set<string>} set - Allowed values
   * @throws {CmpStrValidationError} If the value is not in the set
   */
  static validateEnum(value, name, set) {
    if (value === undefined) return;
    if (typeof value !== 'string' || !set.has(value)) {
      throw new CmpStrValidationError(
        `Invalid option <${name}>: expected ${OptionsValidator.set2string(set)}`,
        { name, value }
      );
    }
  }

  /**
   * Validates every key of an options object against a table of validators.
   *
   * @param {Object} [opt] - Options object; falsy values are accepted
   * @param {Object<string, Function>} map - Validator per allowed key
   * @throws {CmpStrValidationError} On an unknown key or an invalid value
   */
  static validateMap(opt, map) {
    if (!opt) return;
    for (const k in opt) {
      // Own-property lookup only: a plain `map[k]` would resolve names such
      // as `toString` or `constructor` to Object.prototype members and let
      // unknown options pass silently.
      const fn = Object.prototype.hasOwnProperty.call(map, k)
        ? map[k]
        : undefined;
      if (!fn)
        throw new CmpStrValidationError(`Invalid option <${k}>`, {
          option: k,
          // Report the offending value supplied by the caller.
          value: opt[k]
        });
      fn(opt[k]);
    }
  }

  /**
   * Checks that a value is a non-empty string known to a registry.
   *
   * @param {*} value - Value to check; `undefined` is accepted
   * @param {string} name - Option name used in error messages and metadata
   * @param {string} label - Human-readable registry label for the message
   * @param {Function} has - Predicate telling whether a name is registered
   * @param {Function} list - Returns the registered names (error metadata)
   * @throws {CmpStrValidationError} If the value is not a registered name
   */
  static validateRegistryName(value, name, label, has, list) {
    if (value === undefined) return;
    if (typeof value !== 'string' || value.length === 0)
      throw new CmpStrValidationError(
        `Invalid option <${name}>: expected non-empty string`,
        { name, value }
      );
    if (!has(value))
      throw new CmpStrValidationError(`${label} <${value}> is not registered`, {
        name,
        value,
        available: list()
      });
  }

  /** Validates that `value` is a boolean (or `undefined`). */
  static validateBoolean(value, name) {
    OptionsValidator.validateType(value, name, 'boolean');
  }

  /** Validates that `value` is a non-NaN number (or `undefined`). */
  static validateNumber(value, name) {
    OptionsValidator.validateType(value, name, 'number');
  }

  /** Validates that `value` is a string (or `undefined`). */
  static validateString(value, name) {
    OptionsValidator.validateType(value, name, 'string');
  }

  /**
   * Validates a flags string: every character must be in `ALLOWED_FLAGS`.
   *
   * @param {*} value - Flags string; `undefined` is accepted
   * @throws {CmpStrValidationError} If not a string or on an unknown flag
   */
  static validateFlags(value) {
    if (value === undefined) return;
    if (typeof value !== 'string')
      throw new CmpStrValidationError(
        `Invalid option <flags>: expected string`,
        { flags: value }
      );
    for (let i = 0; i < value.length; i++) {
      const ch = value[i];
      if (!OptionsValidator.ALLOWED_FLAGS.has(ch))
        throw new CmpStrValidationError(
          `Invalid normalization flag <${ch}> in <flags>: expected ${OptionsValidator.set2string(OptionsValidator.ALLOWED_FLAGS)}`,
          { flags: value, invalid: ch }
        );
    }
  }

  /** Validates the `output` option against `ALLOWED_OUTPUT`. */
  static validateOutput(value) {
    OptionsValidator.validateEnum(
      value,
      'output',
      OptionsValidator.ALLOWED_OUTPUT
    );
  }

  /** Validates the `mode` option against `ALLOWED_MODES`. */
  static validateMode(value) {
    OptionsValidator.validateEnum(
      value,
      'mode',
      OptionsValidator.ALLOWED_MODES
    );
  }

  /**
   * Validates a sort option: booleans pass as-is, anything else must be one
   * of `ALLOWED_SORT`.
   */
  static validateSort(value, name) {
    if (value === undefined || typeof value === 'boolean') return;
    OptionsValidator.validateEnum(value, name, OptionsValidator.ALLOWED_SORT);
  }

  /** Validates that `value` names a metric known to `MetricRegistry`. */
  static validateMetricName(value) {
    OptionsValidator.validateRegistryName(
      value,
      'metric',
      'Comparison metric',
      MetricRegistry.has,
      MetricRegistry.list
    );
  }

  /** Validates that `value` names an algorithm known to `PhoneticRegistry`. */
  static validatePhoneticName(value) {
    OptionsValidator.validateRegistryName(
      value,
      'phonetic',
      'Phonetic algorithm',
      PhoneticRegistry.has,
      PhoneticRegistry.list
    );
  }

  /** Validates a metric options object against `METRIC_OPT_MAP`. */
  static validateMetricOptions(opt) {
    OptionsValidator.validateMap(opt, OptionsValidator.METRIC_OPT_MAP);
  }

  /** Validates a phonetic options object against `PHONETIC_OPT_MAP`. */
  static validatePhoneticOptions(opt) {
    OptionsValidator.validateMap(opt, OptionsValidator.PHONETIC_OPT_MAP);
  }

  /**
   * Validates the `processors` object: every key must be a known processor
   * type, and its value must pass that processor's validator.
   *
   * @param {Object} [opt] - Processors object; falsy values are accepted
   * @throws {CmpStrValidationError} On an unknown processor or invalid config
   */
  static validateProcessors(opt) {
    if (!opt) return;
    for (const key in opt) {
      // Own-property lookup only, so inherited Object.prototype members are
      // never mistaken for processor validators.
      const fn = Object.prototype.hasOwnProperty.call(
        OptionsValidator.PROCESSORS,
        key
      )
        ? OptionsValidator.PROCESSORS[key]
        : undefined;
      if (!fn)
        throw new CmpStrValidationError(
          `Invalid processor type <${key}> in <processors>: expected ${Object.keys(OptionsValidator.PROCESSORS).join(' | ')}`,
          { processors: opt, invalid: key }
        );
      fn(opt[key]);
    }
  }

  /** Validates a complete top-level options object against `CMPSTR_OPT_MAP`. */
  static validateOptions(opt) {
    OptionsValidator.validateMap(opt, OptionsValidator.CMPSTR_OPT_MAP);
  }
}
2462
+
2463
/**
 * Couples an array of records with the property name whose value is compared
 * as a string, runs a caller-supplied comparison function on the extracted
 * strings, and maps the string-level results back onto the source records.
 */
class StructuredData {
  /** Source records. */
  data;
  /** Property name read from every record. */
  key;
  /** Comparator: ascending by `res`. */
  static SORT_ASC = (a, b) => a.res - b.res;
  /** Comparator: descending by `res`. */
  static SORT_DESC = (a, b) => b.res - a.res;

  /**
   * Factory equivalent to `new StructuredData(data, key)`.
   *
   * @param {Array<Object>} data - Source records
   * @param {string} key - Property name to extract
   * @returns {StructuredData}
   */
  static create(data, key) {
    return new StructuredData(data, key);
  }

  constructor(data, key) {
    this.data = data;
    this.key = key;
  }

  /**
   * Reads `key` from every entry of `arr` and stringifies it.
   * Missing values, and `null` / `undefined` entries, yield an empty string.
   *
   * @param {Array<Object>} arr - Records to read from
   * @param {string} key - Property name to extract
   * @returns {string[]} One string per entry, in the same order
   */
  extractFrom(arr, key) {
    const n = arr.length;
    const result = new Array(n);
    for (let i = 0; i < n; i++) {
      // Optional chaining: a null/undefined record is treated like a record
      // without the key instead of raising a TypeError.
      const val = arr[i]?.[key];
      result[i] = val != null ? String(val) : '';
    }
    return result;
  }

  /** Extracts the configured key from the instance's own data. */
  extract() {
    return this.extractFrom(this.data, this.key);
  }

  /** Tells whether `v` is an object carrying `a`, `b` and `res`. */
  isMetricResult(v) {
    return (
      typeof v === 'object' && v !== null && 'a' in v && 'b' in v && 'res' in v
    );
  }

  /** Tells whether `v` is an object carrying `source`, `target` and `match`. */
  isCmpStrResult(v) {
    return (
      typeof v === 'object' &&
      v !== null &&
      'source' in v &&
      'target' in v &&
      'match' in v
    );
  }

  /**
   * Converts a result list into the `{ a, b, res, ... }` shape and tags each
   * entry with its original position in `__idx`. The format is decided from
   * the first entry only.
   *
   * @param {Array<Object>} results - Results in either supported format
   * @returns {Array<Object>} New array of normalized copies (empty for
   *   non-array or empty input)
   * @throws {CmpStrValidationError} If the first entry has neither format
   */
  normalizeResults(results) {
    if (!Array.isArray(results) || results.length === 0) return [];
    const first = results[0];
    const out = new Array(results.length);
    if (this.isMetricResult(first)) {
      const src = results;
      for (let i = 0; i < src.length; i++) out[i] = { ...src[i], __idx: i };
    } else if (this.isCmpStrResult(first)) {
      const src = results;
      for (let i = 0; i < src.length; i++) {
        const r = src[i];
        out[i] = {
          metric: 'unknown',
          a: r.source,
          b: r.target,
          res: r.match,
          raw: r.raw,
          __idx: i
        };
      }
    } else
      throw new CmpStrValidationError(
        'Unsupported result format for StructuredData normalization.'
      );
    return out;
  }

  /**
   * Maps normalized results back onto source records.
   *
   * Each result is matched to a record through its target string `b`;
   * repeated target strings are assigned to the matching records in turn.
   * When the string is unknown, the result's `__idx` (or its position) is
   * used instead.
   *
   * @param {Array<Object>} results - Normalized results
   * @param {Array<Object>} sourceData - Records the strings were taken from
   * @param {string[]} extractedStrings - Strings extracted from `sourceData`
   * @param {boolean} [removeZero] - Drop results whose `res` is exactly 0
   * @param {boolean} [objectsOnly] - Return the bare records instead of
   *   `{ obj, key, result, raw? }` wrappers
   * @returns {Array<Object>} Rebuilt output, in the order of `results`
   */
  rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
    const m = extractedStrings.length,
      n = results.length;
    const stringToIndices = Pool.acquire('map', m);
    const occurrenceCount = Pool.acquire('map', n);
    const output = new Array(n);
    // Maps are cleared before use so no earlier content is read.
    stringToIndices.clear();
    occurrenceCount.clear();
    try {
      // Index: extracted string -> all record positions holding it.
      for (let i = 0; i < m; i++) {
        const str = extractedStrings[i];
        let arr = stringToIndices.get(str);
        if (!arr) {
          arr = [];
          stringToIndices.set(str, arr);
        }
        arr.push(i);
      }
      let out = 0;
      for (let i = 0; i < n; i++) {
        const result = results[i];
        if (removeZero && result.res === 0) continue;
        const targetStr = result.b || '';
        const indices = stringToIndices.get(targetStr);
        let dataIndex;
        if (indices && indices.length > 0) {
          // Cycle through the records sharing this string.
          const occurrence = occurrenceCount.get(targetStr) ?? 0;
          occurrenceCount.set(targetStr, occurrence + 1);
          dataIndex = indices[occurrence % indices.length];
        } else {
          dataIndex = result.__idx ?? i;
        }
        if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
        const sourceObj = sourceData[dataIndex];
        const mappedTarget = extractedStrings[dataIndex] || targetStr;
        if (objectsOnly) output[out++] = sourceObj;
        else
          output[out++] = {
            obj: sourceObj,
            key: this.key,
            result: {
              source: result.a,
              target: mappedTarget,
              match: result.res
            },
            ...(result.raw ? { raw: result.raw } : null)
          };
      }
      // Trim slots left unused by skipped results.
      output.length = out;
      return output;
    } finally {
      Pool.release('map', stringToIndices, m);
      Pool.release('map', occurrenceCount, n);
    }
  }

  /**
   * Sorts results in place by `res`.
   *
   * @param {Array<Object>} results - Normalized results
   * @param {boolean|string} [sort] - Falsy: no sorting; `'asc'`: ascending;
   *   any other truthy value: descending
   * @returns {Array<Object>} The same array
   */
  sort(results, sort) {
    if (!sort || results.length <= 1) return results;
    return results.sort(
      sort === 'asc' ? StructuredData.SORT_ASC : StructuredData.SORT_DESC
    );
  }

  /** Normalizes, sorts and rebuilds raw results against the instance data. */
  finalizeLookup(results, extractedStrings, opt) {
    return this.rebuild(
      this.sort(this.normalizeResults(results), opt?.sort),
      this.data,
      extractedStrings,
      opt?.removeZero,
      opt?.objectsOnly
    );
  }

  /** Runs `fn` and finalizes its result inside `ErrorUtil.wrap`. */
  performLookup(fn, extractedStrings, opt) {
    return ErrorUtil.wrap(
      () => this.finalizeLookup(fn(), extractedStrings, opt),
      'StructuredData lookup failed',
      { key: this.key }
    );
  }

  /** Async counterpart of `performLookup`, using `ErrorUtil.wrapAsync`. */
  async performLookupAsync(fn, extractedStrings, opt) {
    return await ErrorUtil.wrapAsync(
      async () => this.finalizeLookup(await fn(), extractedStrings, opt),
      'StructuredData async lookup failed',
      { key: this.key }
    );
  }

  /**
   * Compares `query` against the extracted strings of the instance data.
   *
   * @param {Function} fn - Called as `fn(query, strings, opt)`; returns results
   * @param {string} query - Value compared against every record
   * @param {Object} [opt] - Options; `sort`, `removeZero`, `objectsOnly` are
   *   read here and the object is also passed on to `fn`
   * @returns {Array<Object>} Rebuilt lookup output
   */
  lookup(fn, query, opt) {
    const b = this.extract();
    try {
      return this.performLookup(() => fn(query, b, opt), b, opt);
    } finally {
      Pool.release('string[]', b, b.length);
    }
  }

  /** Async counterpart of `lookup`. */
  async lookupAsync(fn, query, opt) {
    const b = this.extract();
    try {
      return await this.performLookupAsync(() => fn(query, b, opt), b, opt);
    } finally {
      Pool.release('string[]', b, b.length);
    }
  }

  /**
   * Compares the instance's extracted strings with those of another dataset.
   *
   * @param {Function} fn - Called as `fn(ownStrings, otherStrings, opt)`
   * @param {Array<Object>} other - Second dataset
   * @param {string} otherKey - Property name read from `other`
   * @param {Object} [opt] - Same options as `lookup`
   * @returns {Array<Object>} Rebuilt lookup output (against the own data)
   */
  lookupPairs(fn, other, otherKey, opt) {
    const a = this.extract();
    const b = this.extractFrom(other, otherKey);
    try {
      return this.performLookup(() => fn(a, b, opt), a, opt);
    } finally {
      Pool.release('string[]', a, a.length);
      Pool.release('string[]', b, b.length);
    }
  }

  /** Async counterpart of `lookupPairs`. */
  async lookupPairsAsync(fn, other, otherKey, opt) {
    const a = this.extract();
    const b = this.extractFrom(other, otherKey);
    try {
      return await this.performLookupAsync(() => fn(a, b, opt), a, opt);
    } finally {
      Pool.release('string[]', a, a.length);
      Pool.release('string[]', b, b.length);
    }
  }
}
2648
+
2649
+ class TextAnalyzer {
2650
+ static REGEX = {
2651
+ number: /\d/,
2652
+ sentence: /(?<=[.!?])\s+/,
2653
+ word: /\p{L}+/gu,
2654
+ nonWord: /[^\p{L}]/gu,
2655
+ vowelGroup: /[aeiouy]+/g,
2656
+ letter: /\p{L}/gu,
2657
+ ucLetter: /\p{Lu}/gu
2658
+ };
2659
+ text;
2660
+ words = [];
2661
+ sentences = [];
2662
+ charFrequency = new Map();
2663
+ wordHistogram = new Map();
2664
+ syllableCache = new Map();
2665
+ syllableStats;
2666
+ constructor(input) {
2667
+ this.text = input.trim();
2668
+ this.tokenize();
2669
+ this.computeFrequencies();
2670
+ }
2671
+ tokenize() {
2672
+ let match;
2673
+ const lcText = this.text.toLowerCase();
2674
+ while ((match = TextAnalyzer.REGEX.word.exec(lcText)) !== null)
2675
+ this.words.push(match[0]);
2676
+ this.sentences = this.text
2677
+ .split(TextAnalyzer.REGEX.sentence)
2678
+ .filter(Boolean);
2679
+ }
2680
+ computeFrequencies() {
2681
+ for (const char of this.text)
2682
+ this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
2683
+ for (const word of this.words)
2684
+ this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
2685
+ }
2686
+ estimateSyllables(word) {
2687
+ const clean = word
2688
+ .normalize('NFC')
2689
+ .toLowerCase()
2690
+ .replace(TextAnalyzer.REGEX.nonWord, '');
2691
+ if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
2692
+ const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
2693
+ const count = matches ? matches.length : 1;
2694
+ this.syllableCache.set(clean, count);
2695
+ return count;
2696
+ }
2697
+ computeSyllableStats() {
2698
+ return (this.syllableStats ||= (() => {
2699
+ const perWord = this.words
2700
+ .map((w) => this.estimateSyllables(w))
2701
+ .sort((a, b) => a - b);
2702
+ const total = perWord.reduce((sum, s) => sum + s, 0);
2703
+ const mono = perWord.filter((s) => s === 1).length;
2704
+ const median = !perWord.length
2705
+ ? 0
2706
+ : perWord.length % 2 === 0
2707
+ ? (perWord[perWord.length / 2 - 1] + perWord[perWord.length / 2]) / 2
2708
+ : perWord[Math.floor(perWord.length / 2)];
2709
+ return {
2710
+ total,
2711
+ mono,
2712
+ perWord,
2713
+ avg: perWord.length ? total / perWord.length : 0,
2714
+ median
2715
+ };
2716
+ })());
2717
+ }
2718
+ getLength = () => this.text.length;
2719
+ getWordCount = () => this.words.length;
2720
+ getSentenceCount = () => this.sentences.length;
2721
+ getAvgWordLength() {
2722
+ return this.words.length
2723
+ ? this.words.join('').length / this.words.length
2724
+ : 0;
2725
+ }
2726
+ getAvgSentenceLength() {
2727
+ return this.sentences.length
2728
+ ? this.words.length / this.sentences.length
2729
+ : 0;
2730
+ }
2731
+ getWordHistogram() {
2732
+ return Object.fromEntries(this.wordHistogram);
2733
+ }
2734
+ getMostCommonWords(limit = 5) {
2735
+ return [...this.wordHistogram.entries()]
2736
+ .sort((a, b) => b[1] - a[1])
2737
+ .slice(0, limit)
2738
+ .map((e) => e[0]);
2739
+ }
2740
+ getHapaxLegomena() {
2741
+ return [...this.wordHistogram.entries()]
2742
+ .filter(([, c]) => c === 1)
2743
+ .map((e) => e[0]);
2744
+ }
2745
+ hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);
2746
+ getUpperCaseRatio() {
2747
+ const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
2748
+ const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
2749
+ return matches.length ? upper / matches.length : 0;
2750
+ }
2751
+ getCharFrequency() {
2752
+ return Object.fromEntries(this.charFrequency);
2753
+ }
2754
+ getUnicodeCodepoints() {
2755
+ const result = {};
2756
+ for (const [char, count] of this.charFrequency) {
2757
+ const block = char
2758
+ .charCodeAt(0)
2759
+ .toString(16)
2760
+ .padStart(4, '0')
2761
+ .toUpperCase();
2762
+ result[block] = (result[block] || 0) + count;
2763
+ }
2764
+ return result;
2765
+ }
2766
+ getLongWordRatio(len = 7) {
2767
+ let long = 0;
2768
+ for (const w of this.words) if (w.length >= len) long++;
2769
+ return this.words.length ? long / this.words.length : 0;
2770
+ }
2771
+ getShortWordRatio(len = 3) {
2772
+ let short = 0;
2773
+ for (const w of this.words) if (w.length <= len) short++;
2774
+ return this.words.length ? short / this.words.length : 0;
2775
+ }
2776
+ getSyllablesCount() {
2777
+ return this.computeSyllableStats().total;
2778
+ }
2779
+ getMonosyllabicWordCount() {
2780
+ return this.computeSyllableStats().mono;
2781
+ }
2782
+ getMinSyllablesWordCount(min) {
2783
+ return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
2784
+ }
2785
+ getMaxSyllablesWordCount(max) {
2786
+ return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
2317
2787
  }
2318
- encode(word) {
2319
- word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
2320
- c === 'C' ? m : c
2321
- );
2322
- return super.encode(word);
2788
+ getAvgSyllablesPerWord() {
2789
+ return this.computeSyllableStats().avg;
2323
2790
  }
2324
- adjustCode(code) {
2325
- return code.slice(0, 1) + code.slice(1).replace(Metaphone.REGEX.vowel, '');
2791
+ getMedianSyllablesPerWord() {
2792
+ return this.computeSyllableStats().median;
2326
2793
  }
2327
- }
2328
- PhoneticRegistry.add('metaphone', Metaphone);
2329
- PhoneticMappingRegistry.add('metaphone', 'en90', {
2330
- map: {
2331
- a: 'A',
2332
- b: 'B',
2333
- c: 'K',
2334
- d: 'T',
2335
- e: 'E',
2336
- f: 'F',
2337
- g: 'K',
2338
- h: 'H',
2339
- i: 'I',
2340
- j: 'J',
2341
- k: 'K',
2342
- l: 'L',
2343
- m: 'M',
2344
- n: 'N',
2345
- o: 'O',
2346
- p: 'P',
2347
- q: 'K',
2348
- r: 'R',
2349
- s: 'S',
2350
- t: 'T',
2351
- u: 'U',
2352
- v: 'F',
2353
- w: 'W',
2354
- x: 'KS',
2355
- y: 'Y',
2356
- z: 'S'
2357
- },
2358
- ruleset: [
2359
- { char: 'a', position: 'start', next: ['e'], code: '' },
2360
- { char: 'g', position: 'start', next: ['n'], code: '' },
2361
- { char: 'k', position: 'start', next: ['n'], code: '' },
2362
- { char: 'p', position: 'start', next: ['n'], code: '' },
2363
- { char: 'w', position: 'start', next: ['r'], code: '' },
2364
- { char: 'b', position: 'end', prev: ['m'], code: '' },
2365
- { char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
2366
- { char: 'c', next: ['i'], next2: ['a'], code: 'X' },
2367
- { char: 'c', next: ['e', 'i', 'y'], code: 'S' },
2368
- { char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
2369
- {
2370
- char: 'g',
2371
- next: ['h'],
2372
- next2Not: ['', 'a', 'e', 'i', 'o', 'u'],
2373
- code: ''
2374
- },
2375
- { char: 'g', trailing: 'n', code: '' },
2376
- { char: 'g', trailing: 'ned', code: '' },
2377
- { char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
2378
- {
2379
- char: 'h',
2380
- prev: ['a', 'e', 'i', 'o', 'u'],
2381
- nextNot: ['a', 'e', 'i', 'o', 'u'],
2382
- code: ''
2383
- },
2384
- { char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
2385
- { char: 'k', prev: ['c'], code: '' },
2386
- { char: 'p', next: ['h'], code: 'F' },
2387
- { char: 's', next: ['h'], code: 'X' },
2388
- { char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
2389
- { char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
2390
- { char: 't', next: ['h'], code: '0' },
2391
- { char: 't', next: ['c'], next2: ['h'], code: '' },
2392
- { char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
2393
- { char: 'h', leading: 'w', code: '' },
2394
- { char: 'x', position: 'start', code: 'S' },
2395
- { char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
2396
- ]
2397
- });
2398
-
2399
- class Soundex extends Phonetic {
2400
- static default = {
2401
- map: 'en',
2402
- delimiter: ' ',
2403
- length: 4,
2404
- pad: '0',
2405
- dedupe: true
2406
- };
2407
- constructor(opt = {}) {
2408
- super('soundex', opt);
2794
+ getHonoresR() {
2795
+ try {
2796
+ return (
2797
+ (100 * Math.log(this.words.length)) /
2798
+ (1 - this.getHapaxLegomena().length / (this.wordHistogram.size ?? 1))
2799
+ );
2800
+ } catch {
2801
+ return 0;
2802
+ }
2409
2803
  }
2410
- adjustCode(code, chars) {
2411
- return chars[0].toUpperCase() + code.slice(1).replaceAll('0', '');
2804
+ getReadingTime(wpm = 200) {
2805
+ return this.words.length / (wpm ?? 1);
2412
2806
  }
2413
- }
2414
- PhoneticRegistry.add('soundex', Soundex);
2415
- PhoneticMappingRegistry.add('soundex', 'en', {
2416
- map: {
2417
- a: '0',
2418
- e: '0',
2419
- h: '0',
2420
- i: '0',
2421
- o: '0',
2422
- u: '0',
2423
- w: '0',
2424
- y: '0',
2425
- b: '1',
2426
- f: '1',
2427
- p: '1',
2428
- v: '1',
2429
- c: '2',
2430
- g: '2',
2431
- j: '2',
2432
- k: '2',
2433
- q: '2',
2434
- s: '2',
2435
- x: '2',
2436
- z: '2',
2437
- d: '3',
2438
- t: '3',
2439
- l: '4',
2440
- m: '5',
2441
- n: '5',
2442
- r: '6'
2807
+ getReadabilityScore(metric = 'flesch') {
2808
+ const w = this.words.length || 1;
2809
+ const s = this.sentences.length || 1;
2810
+ const y = this.getSyllablesCount() || 1;
2811
+ const asl = w / s;
2812
+ const asw = y / w;
2813
+ switch (metric) {
2814
+ case 'flesch':
2815
+ return 206.835 - 1.015 * asl - 84.6 * asw;
2816
+ case 'fleschde':
2817
+ return 180 - asl - 58.5 * asw;
2818
+ case 'kincaid':
2819
+ return 0.39 * asl + 11.8 * asw - 15.59;
2820
+ }
2443
2821
  }
2444
- });
2445
- PhoneticMappingRegistry.add('soundex', 'de', {
2446
- map: {
2447
- a: '0',
2448
- ä: '0',
2449
- e: '0',
2450
- h: '0',
2451
- i: '0',
2452
- j: '0',
2453
- o: '0',
2454
- ö: '0',
2455
- u: '0',
2456
- ü: '0',
2457
- y: '0',
2458
- b: '1',
2459
- f: '1',
2460
- p: '1',
2461
- v: '1',
2462
- w: '1',
2463
- c: '2',
2464
- g: '2',
2465
- k: '2',
2466
- q: '2',
2467
- s: '2',
2468
- ß: '2',
2469
- x: '2',
2470
- z: '2',
2471
- d: '3',
2472
- t: '3',
2473
- l: '4',
2474
- m: '5',
2475
- n: '5',
2476
- r: '6'
2477
- },
2478
- ruleset: [{ char: 'c', next: ['h'], code: '7' }]
2479
- });
2822
+ getLIXScore() {
2823
+ const w = this.words.length || 1;
2824
+ const s = this.sentences.length || 1;
2825
+ const l = this.getLongWordRatio() * w;
2826
+ return w / s + (l / w) * 100;
2827
+ }
2828
+ getWSTFScore() {
2829
+ const w = this.words.length || 1;
2830
+ const h = (this.getMinSyllablesWordCount(3) / w) * 100;
2831
+ const s = this.getAvgSentenceLength();
2832
+ const l = this.getLongWordRatio() * 100;
2833
+ const m = (this.getMonosyllabicWordCount() / w) * 100;
2834
+ return [
2835
+ 0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
2836
+ 0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
2837
+ 0.2963 * h + 0.1905 * s - 1.1144,
2838
+ 0.2744 * h + 0.2656 * s - 1.693
2839
+ ];
2840
+ }
2841
+ }
2480
2842
 
2481
2843
  const profiler = Profiler.getInstance();
2482
2844
  class CmpStr {
@@ -2528,31 +2890,26 @@ class CmpStr {
2528
2890
  }
2529
2891
  assert(cond, test) {
2530
2892
  switch (cond) {
2893
+ default:
2894
+ throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
2531
2895
  case 'metric':
2532
- if (!CmpStr.metric.has(test))
2533
- throw new CmpStrNotFoundError(
2534
- `CmpStr <metric> must be set, call .setMetric(), ` +
2535
- `use CmpStr.metric.list() for available metrics`,
2536
- { metric: test }
2537
- );
2896
+ OptionsValidator.validateMetricName(test);
2538
2897
  break;
2539
2898
  case 'phonetic':
2540
- if (!CmpStr.phonetic.has(test))
2541
- throw new CmpStrNotFoundError(
2542
- `CmpStr <phonetic> must be set, call .setPhonetic(), ` +
2543
- `use CmpStr.phonetic.list() for available phonetic algorithms`,
2544
- { phonetic: test }
2545
- );
2899
+ OptionsValidator.validatePhoneticName(test);
2546
2900
  break;
2547
- default:
2548
- throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
2549
2901
  }
2550
2902
  }
2551
2903
  assertMany(...cond) {
2552
2904
  for (const [c, test] of cond) this.assert(c, test);
2553
2905
  }
2554
2906
  resolveOptions(opt) {
2555
- return merge({ ...(this.options ?? Object.create(null)) }, opt);
2907
+ const merged = DeepMerge.merge(
2908
+ { ...(this.options ?? Object.create(null)) },
2909
+ opt
2910
+ );
2911
+ OptionsValidator.validateOptions(merged);
2912
+ return merged;
2556
2913
  }
2557
2914
  normalize(input, flags) {
2558
2915
  return Normalizer.normalize(input, flags ?? this.options.flags ?? '');
@@ -2568,7 +2925,7 @@ class CmpStr {
2568
2925
  return input;
2569
2926
  }
2570
2927
  postProcess(result, opt) {
2571
- if (opt?.removeZero && Array.isArray(result))
2928
+ if (Array.isArray(result) && opt?.removeZero)
2572
2929
  result = result.filter((r) => r.res > 0);
2573
2930
  return result;
2574
2931
  }
@@ -2584,10 +2941,10 @@ class CmpStr {
2584
2941
  return StructuredData.create(data, key);
2585
2942
  }
2586
2943
  compute(a, b, opt, mode, raw, skip) {
2944
+ const resolved = this.resolveOptions(opt);
2945
+ this.assert('metric', resolved.metric);
2587
2946
  return ErrorUtil.wrap(
2588
2947
  () => {
2589
- const resolved = this.resolveOptions(opt);
2590
- this.assert('metric', resolved.metric);
2591
2948
  const A = skip ? a : this.prepare(a, resolved);
2592
2949
  const B = skip ? b : this.prepare(b, resolved);
2593
2950
  if (
@@ -2605,7 +2962,7 @@ class CmpStr {
2605
2962
  const result = this.postProcess(metric.getResults(), resolved);
2606
2963
  return this.output(result, raw ?? resolved.raw);
2607
2964
  },
2608
- `Failed to compute metric <${opt?.metric ?? this.options.metric}> for the given inputs`,
2965
+ `Failed to compute metric <${resolved.metric}> for the given inputs`,
2609
2966
  { a, b, options: opt }
2610
2967
  );
2611
2968
  }
@@ -2621,46 +2978,79 @@ class CmpStr {
2621
2978
  { result, raw }
2622
2979
  );
2623
2980
  }
2624
- clone = () => Object.assign(Object.create(Object.getPrototypeOf(this)), this);
2981
+ clone() {
2982
+ const inst = Object.assign(
2983
+ Object.create(Object.getPrototypeOf(this)),
2984
+ this
2985
+ );
2986
+ inst.options = DeepMerge.merge(Object.create(null), this.options);
2987
+ return inst;
2988
+ }
2625
2989
  reset() {
2626
- for (const k in this.options) delete this.options[k];
2990
+ this.options = Object.create(null);
2627
2991
  return this;
2628
2992
  }
2629
2993
  setOptions(opt) {
2994
+ OptionsValidator.validateOptions(opt);
2630
2995
  this.options = opt;
2631
2996
  return this;
2632
2997
  }
2633
2998
  mergeOptions(opt) {
2634
- merge(this.options, opt);
2999
+ DeepMerge.merge(this.options, opt);
3000
+ OptionsValidator.validateOptions(this.options);
2635
3001
  return this;
2636
3002
  }
2637
3003
  setSerializedOptions(opt) {
2638
- return ErrorUtil.wrap(
2639
- () => {
2640
- this.options = JSON.parse(opt);
2641
- return this;
2642
- },
2643
- `Failed to parse serialized options, invalid JSON string`,
2644
- { opt }
2645
- );
3004
+ try {
3005
+ const parsed = JSON.parse(opt);
3006
+ OptionsValidator.validateOptions(parsed);
3007
+ this.options = parsed;
3008
+ return this;
3009
+ } catch (err) {
3010
+ if (err instanceof SyntaxError)
3011
+ throw new CmpStrValidationError(
3012
+ `Failed to parse serialized options, invalid JSON string`,
3013
+ { opt, error: err instanceof Error ? err.message : String(err) }
3014
+ );
3015
+ throw err;
3016
+ }
2646
3017
  }
2647
3018
  setOption(path, value) {
2648
- set(this.options, path, value);
3019
+ DeepMerge.set(this.options, path, value);
3020
+ OptionsValidator.validateOptions(this.options);
2649
3021
  return this;
2650
3022
  }
2651
3023
  rmvOption(path) {
2652
- rmv(this.options, path);
3024
+ DeepMerge.rmv(this.options, path);
2653
3025
  return this;
2654
3026
  }
2655
- setRaw = (enable) => this.setOption('raw', enable);
2656
- setMetric = (name) => this.setOption('metric', name);
2657
- setFlags = (flags) => this.setOption('flags', flags);
2658
- rmvFlags = () => this.rmvOption('flags');
2659
- setProcessors = (opt) => this.setOption('processors', opt);
2660
- rmvProcessors = () => this.rmvOption('processors');
2661
- getOptions = () => this.options;
2662
- getSerializedOptions = () => JSON.stringify(this.options);
2663
- getOption = (path) => get(this.options, path);
3027
+ setRaw(enable) {
3028
+ return this.setOption('raw', enable);
3029
+ }
3030
+ setMetric(name) {
3031
+ return this.setOption('metric', name);
3032
+ }
3033
+ setFlags(flags) {
3034
+ return this.setOption('flags', flags);
3035
+ }
3036
+ rmvFlags() {
3037
+ return this.rmvOption('flags');
3038
+ }
3039
+ setProcessors(opt) {
3040
+ return this.setOption('processors', opt);
3041
+ }
3042
+ rmvProcessors() {
3043
+ return this.rmvOption('processors');
3044
+ }
3045
+ getOptions() {
3046
+ return this.options;
3047
+ }
3048
+ getSerializedOptions() {
3049
+ return JSON.stringify(this.options);
3050
+ }
3051
+ getOption(path) {
3052
+ return DeepMerge.get(this.options, path);
3053
+ }
2664
3054
  test(a, b, opt) {
2665
3055
  return this.compute(a, b, opt, 'single');
2666
3056
  }
@@ -2699,15 +3089,35 @@ class CmpStr {
2699
3089
  const resolved = this.resolveOptions({ flags, processors });
2700
3090
  const test = this.prepare(needle, resolved);
2701
3091
  const hstk = this.prepare(haystack, resolved);
2702
- return haystack.filter((_, i) => hstk[i].includes(test));
3092
+ const out = [];
3093
+ for (let i = 0, len = hstk.length; i < len; i++) {
3094
+ if (hstk[i].includes(test)) out.push(haystack[i]);
3095
+ }
3096
+ return out;
2703
3097
  }
2704
3098
  matrix(input, opt) {
2705
- input = this.prepare(input, this.resolveOptions(opt));
2706
- return input.map((a) =>
2707
- this.compute(a, input, undefined, 'batch', true, true).map(
2708
- (b) => b.res ?? 0
2709
- )
2710
- );
3099
+ const resolved = this.resolveOptions(opt);
3100
+ const arr = this.prepare(input, resolved);
3101
+ const n = arr.length;
3102
+ const out = Array.from({ length: n }, () => new Array(n).fill(0));
3103
+ for (let i = 0; i < n; i++)
3104
+ for (let j = i; j < n; j++) {
3105
+ if (i === j) {
3106
+ out[i][j] = 1;
3107
+ } else {
3108
+ const score = this.compute(
3109
+ arr[i],
3110
+ arr[j],
3111
+ resolved,
3112
+ 'single',
3113
+ true,
3114
+ true
3115
+ ).res;
3116
+ out[i][j] = score;
3117
+ out[j][i] = score;
3118
+ }
3119
+ }
3120
+ return out;
2711
3121
  }
2712
3122
  phoneticIndex(input, algo, opt) {
2713
3123
  const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
@@ -2785,10 +3195,10 @@ class CmpStrAsync extends CmpStr {
2785
3195
  : phonetic.getIndexAsync(input).then((r) => r.join(delimiter));
2786
3196
  }
2787
3197
  async computeAsync(a, b, opt, mode, raw, skip) {
3198
+ const resolved = this.resolveOptions(opt);
3199
+ this.assert('metric', resolved.metric);
2788
3200
  return ErrorUtil.wrapAsync(
2789
3201
  async () => {
2790
- const resolved = this.resolveOptions(opt);
2791
- this.assert('metric', resolved.metric);
2792
3202
  const A = skip ? a : await this.prepareAsync(a, resolved);
2793
3203
  const B = skip ? b : await this.prepareAsync(b, resolved);
2794
3204
  if (
@@ -2846,23 +3256,40 @@ class CmpStrAsync extends CmpStr {
2846
3256
  const resolved = this.resolveOptions({ flags, processors });
2847
3257
  const test = await this.prepareAsync(needle, resolved);
2848
3258
  const hstk = await this.prepareAsync(haystack, resolved);
2849
- return haystack.filter((_, i) => hstk[i].includes(test));
3259
+ const out = [];
3260
+ for (let i = 0; i < hstk.length; i++) {
3261
+ if (hstk[i].includes(test)) out.push(haystack[i]);
3262
+ }
3263
+ return out;
2850
3264
  }
2851
3265
  async matrixAsync(input, opt) {
2852
- input = await this.prepareAsync(input, this.resolveOptions(opt));
2853
- return Promise.all(
2854
- input.map(
2855
- async (a) =>
2856
- await this.computeAsync(
2857
- a,
2858
- input,
2859
- undefined,
2860
- 'batch',
2861
- true,
2862
- true
2863
- ).then((r) => r.map((b) => b.res ?? 0))
2864
- )
2865
- );
3266
+ const resolved = this.resolveOptions(opt);
3267
+ const arr = await this.prepareAsync(input, resolved);
3268
+ const n = arr.length;
3269
+ const out = Array.from({ length: n }, () => new Array(n).fill(0));
3270
+ for (let i = 0; i < n; i++) {
3271
+ await Promise.all(
3272
+ Array.from({ length: n - i }, (_, k) => i + k).map(async (j) => {
3273
+ if (i === j) {
3274
+ out[i][j] = 1;
3275
+ } else {
3276
+ const score = (
3277
+ await this.computeAsync(
3278
+ arr[i],
3279
+ arr[j],
3280
+ resolved,
3281
+ 'single',
3282
+ true,
3283
+ true
3284
+ )
3285
+ ).res;
3286
+ out[i][j] = score;
3287
+ out[j][i] = score;
3288
+ }
3289
+ })
3290
+ );
3291
+ }
3292
+ return out;
2866
3293
  }
2867
3294
  async phoneticIndexAsync(input, algo, opt) {
2868
3295
  const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
@@ -2919,6 +3346,7 @@ export {
2919
3346
  Metric,
2920
3347
  MetricRegistry,
2921
3348
  Normalizer,
3349
+ OptionsValidator,
2922
3350
  Phonetic,
2923
3351
  PhoneticMappingRegistry,
2924
3352
  PhoneticRegistry,