cmpstr 3.2.2 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/dist/CmpStr.esm.js +2149 -1721
  2. package/dist/CmpStr.esm.min.js +2 -2
  3. package/dist/CmpStr.umd.js +2028 -1604
  4. package/dist/CmpStr.umd.min.js +2 -2
  5. package/dist/cjs/CmpStr.cjs +100 -51
  6. package/dist/cjs/CmpStrAsync.cjs +35 -18
  7. package/dist/cjs/index.cjs +1 -1
  8. package/dist/cjs/metric/Cosine.cjs +1 -1
  9. package/dist/cjs/metric/DamerauLevenshtein.cjs +1 -1
  10. package/dist/cjs/metric/DiceSorensen.cjs +1 -1
  11. package/dist/cjs/metric/Hamming.cjs +1 -1
  12. package/dist/cjs/metric/Jaccard.cjs +1 -1
  13. package/dist/cjs/metric/JaroWinkler.cjs +1 -1
  14. package/dist/cjs/metric/LCS.cjs +1 -1
  15. package/dist/cjs/metric/Levenshtein.cjs +1 -1
  16. package/dist/cjs/metric/Metric.cjs +40 -22
  17. package/dist/cjs/metric/NeedlemanWunsch.cjs +1 -1
  18. package/dist/cjs/metric/QGram.cjs +1 -1
  19. package/dist/cjs/metric/SmithWaterman.cjs +1 -1
  20. package/dist/cjs/phonetic/Caverphone.cjs +1 -1
  21. package/dist/cjs/phonetic/Cologne.cjs +1 -1
  22. package/dist/cjs/phonetic/Metaphone.cjs +1 -1
  23. package/dist/cjs/phonetic/Phonetic.cjs +27 -15
  24. package/dist/cjs/phonetic/Soundex.cjs +1 -1
  25. package/dist/cjs/root.cjs +4 -2
  26. package/dist/cjs/utils/DeepMerge.cjs +102 -97
  27. package/dist/cjs/utils/DiffChecker.cjs +1 -1
  28. package/dist/cjs/utils/Errors.cjs +22 -19
  29. package/dist/cjs/utils/Filter.cjs +59 -24
  30. package/dist/cjs/utils/HashTable.cjs +44 -29
  31. package/dist/cjs/utils/Normalizer.cjs +57 -28
  32. package/dist/cjs/utils/OptionsValidator.cjs +211 -0
  33. package/dist/cjs/utils/Pool.cjs +27 -13
  34. package/dist/cjs/utils/Profiler.cjs +41 -27
  35. package/dist/cjs/utils/Registry.cjs +5 -5
  36. package/dist/cjs/utils/StructuredData.cjs +83 -53
  37. package/dist/cjs/utils/TextAnalyzer.cjs +1 -1
  38. package/dist/esm/CmpStr.mjs +101 -52
  39. package/dist/esm/CmpStrAsync.mjs +35 -18
  40. package/dist/esm/index.mjs +1 -1
  41. package/dist/esm/metric/Cosine.mjs +1 -1
  42. package/dist/esm/metric/DamerauLevenshtein.mjs +1 -1
  43. package/dist/esm/metric/DiceSorensen.mjs +1 -1
  44. package/dist/esm/metric/Hamming.mjs +1 -1
  45. package/dist/esm/metric/Jaccard.mjs +1 -1
  46. package/dist/esm/metric/JaroWinkler.mjs +1 -1
  47. package/dist/esm/metric/LCS.mjs +1 -1
  48. package/dist/esm/metric/Levenshtein.mjs +1 -1
  49. package/dist/esm/metric/Metric.mjs +40 -22
  50. package/dist/esm/metric/NeedlemanWunsch.mjs +1 -1
  51. package/dist/esm/metric/QGram.mjs +1 -1
  52. package/dist/esm/metric/SmithWaterman.mjs +1 -1
  53. package/dist/esm/phonetic/Caverphone.mjs +1 -1
  54. package/dist/esm/phonetic/Cologne.mjs +1 -1
  55. package/dist/esm/phonetic/Metaphone.mjs +1 -1
  56. package/dist/esm/phonetic/Phonetic.mjs +30 -15
  57. package/dist/esm/phonetic/Soundex.mjs +1 -1
  58. package/dist/esm/root.mjs +3 -3
  59. package/dist/esm/utils/DeepMerge.mjs +103 -94
  60. package/dist/esm/utils/DiffChecker.mjs +1 -1
  61. package/dist/esm/utils/Errors.mjs +22 -19
  62. package/dist/esm/utils/Filter.mjs +59 -24
  63. package/dist/esm/utils/HashTable.mjs +44 -29
  64. package/dist/esm/utils/Normalizer.mjs +57 -28
  65. package/dist/esm/utils/OptionsValidator.mjs +210 -0
  66. package/dist/esm/utils/Pool.mjs +27 -13
  67. package/dist/esm/utils/Profiler.mjs +41 -27
  68. package/dist/esm/utils/Registry.mjs +5 -5
  69. package/dist/esm/utils/StructuredData.mjs +83 -53
  70. package/dist/esm/utils/TextAnalyzer.mjs +1 -1
  71. package/dist/types/CmpStr.d.ts +22 -15
  72. package/dist/types/CmpStrAsync.d.ts +3 -0
  73. package/dist/types/index.d.ts +3 -3
  74. package/dist/types/metric/Metric.d.ts +9 -9
  75. package/dist/types/phonetic/Phonetic.d.ts +4 -3
  76. package/dist/types/root.d.ts +3 -2
  77. package/dist/types/utils/DeepMerge.d.ts +80 -58
  78. package/dist/types/utils/Errors.d.ts +25 -8
  79. package/dist/types/utils/Filter.d.ts +4 -1
  80. package/dist/types/utils/HashTable.d.ts +12 -11
  81. package/dist/types/utils/Normalizer.d.ts +2 -1
  82. package/dist/types/utils/OptionsValidator.d.ts +193 -0
  83. package/dist/types/utils/Profiler.d.ts +9 -28
  84. package/dist/types/utils/StructuredData.d.ts +3 -0
  85. package/dist/types/utils/Types.d.ts +13 -1
  86. package/package.json +14 -5
@@ -1,5 +1,5 @@
1
1
  /**
2
- * CmpStr v3.2.2 build-bb61120-260311
2
+ * CmpStr v3.3.0 build-3699f85-260318
3
3
  * This is a lightweight, fast and well performing library for calculating string similarity.
4
4
  * (c) 2023-2026 Paul Köhler @komed3 / MIT License
5
5
  * Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
@@ -18,19 +18,32 @@
18
18
  class CmpStrError extends Error {
19
19
  code;
20
20
  meta;
21
- cause;
22
21
  when = new Date().toISOString();
23
22
  constructor(code, message, meta, cause) {
24
- super(message);
23
+ super(message, cause !== undefined ? { cause } : undefined);
25
24
  this.name = this.constructor.name;
26
25
  this.code = code;
27
26
  this.meta = meta;
28
- this.cause = cause;
29
27
  if (typeof Error.captureStackTrace === 'function') {
30
28
  Error.captureStackTrace(this, this.constructor);
31
29
  }
32
30
  }
33
- toJSON() {
31
+ format(stack = false) {
32
+ const parts = [`${this.name} [${this.code}]`, this.message];
33
+ if (this.meta)
34
+ for (const _ in this.meta) {
35
+ parts.push(JSON.stringify(this.meta));
36
+ break;
37
+ }
38
+ return (
39
+ parts.join(' - ') +
40
+ (stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
41
+ );
42
+ }
43
+ toString() {
44
+ return this.format(false);
45
+ }
46
+ toJSON(stack = false) {
34
47
  return {
35
48
  name: this.name,
36
49
  code: this.code,
@@ -42,23 +55,11 @@
42
55
  ? {
43
56
  name: this.cause.name,
44
57
  message: this.cause.message,
45
- stack: this.cause.stack
58
+ stack: stack && this.cause.stack
46
59
  }
47
60
  : this.cause
48
61
  };
49
62
  }
50
- toString(stack = false) {
51
- const parts = [`${this.name} [${this.code}]`, this.message];
52
- if (this.meta && Object.keys(this.meta).length) {
53
- try {
54
- parts.push(JSON.stringify(this.meta));
55
- } catch {}
56
- }
57
- return (
58
- parts.join(' - ') +
59
- (stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
60
- );
61
- }
62
63
  }
63
64
  class CmpStrValidationError extends CmpStrError {
64
65
  constructor(message, meta, cause) {
@@ -84,7 +85,7 @@
84
85
  static assert(condition, message, meta) {
85
86
  if (!condition) throw new CmpStrUsageError(message, meta);
86
87
  }
87
- static create(err, message, meta) {
88
+ static rethrow(err, message, meta) {
88
89
  if (err instanceof CmpStrError) throw err;
89
90
  throw new CmpStrInternalError(message, meta, err);
90
91
  }
@@ -97,6 +98,7 @@
97
98
  try {
98
99
  return fn();
99
100
  } catch (err) {
101
+ if (err instanceof CmpStrError) throw err;
100
102
  throw new CmpStrInternalError(message, meta, err);
101
103
  }
102
104
  }
@@ -104,6 +106,7 @@
104
106
  try {
105
107
  return await fn();
106
108
  } catch (err) {
109
+ if (err instanceof CmpStrError) throw err;
107
110
  throw new CmpStrInternalError(message, meta, err);
108
111
  }
109
112
  }
@@ -119,118 +122,120 @@
119
122
  ErrorUtil: ErrorUtil
120
123
  });
121
124
 
122
- const BRACKET_PATTERN = /\[(\d+)]/g;
123
- const PATH_CACHE = new Map();
124
- function parse(p) {
125
- let cached = PATH_CACHE.get(p);
126
- if (cached) return cached;
127
- const parsed = p
128
- .replace(BRACKET_PATTERN, '.$1')
129
- .split('.')
130
- .map((s) => {
131
- const n = Number(s);
132
- return Number.isInteger(n) && String(n) === s ? n : s;
133
- });
134
- PATH_CACHE.set(p, parsed);
135
- return parsed;
136
- }
137
- function get(t, path, fb) {
138
- let o = t;
139
- for (const k of parse(path)) {
140
- if (o == null || !(k in o)) return fb;
141
- o = o[k];
125
+ class DeepMerge {
126
+ static BRACKET_PATTERN = /\[(\d+)]/g;
127
+ static PATH_CACHE = new Map();
128
+ static walk(obj, keys) {
129
+ let o = obj;
130
+ for (let i = 0; i < keys.length; i++) {
131
+ const k = keys[i];
132
+ if (o == null || !(k in o)) return { exists: false };
133
+ o = o[k];
134
+ }
135
+ return { exists: true, value: o };
136
+ }
137
+ static parse(p) {
138
+ const cached = DeepMerge.PATH_CACHE.get(p);
139
+ if (cached) return cached;
140
+ const parsed = p
141
+ .replace(DeepMerge.BRACKET_PATTERN, '.$1')
142
+ .split('.')
143
+ .map((s) => {
144
+ const n = Number(s);
145
+ return Number.isInteger(n) && String(n) === s ? n : s;
146
+ });
147
+ if (DeepMerge.PATH_CACHE.size > 2000) DeepMerge.PATH_CACHE.clear();
148
+ DeepMerge.PATH_CACHE.set(p, parsed);
149
+ return parsed;
142
150
  }
143
- return o;
144
- }
145
- function has(t, path) {
146
- let o = t;
147
- for (const k of parse(path)) {
148
- if (o == null || !(k in o)) return false;
149
- o = o[k];
151
+ static has(t, path) {
152
+ return DeepMerge.walk(t, DeepMerge.parse(path)).exists;
150
153
  }
151
- return true;
152
- }
153
- function set(t, path, value) {
154
- if (path === '') return value;
155
- const keys = parse(path);
156
- if (t !== undefined && (typeof t !== 'object' || t === null))
157
- throw new CmpStrUsageError(
154
+ static get(t, path, fb) {
155
+ const r = DeepMerge.walk(t, DeepMerge.parse(path));
156
+ return r.exists ? r.value : fb;
157
+ }
158
+ static set(t, path, value) {
159
+ if (path === '') return value;
160
+ const keys = DeepMerge.parse(path);
161
+ ErrorUtil.assert(
162
+ t === undefined || (typeof t === 'object' && t !== null),
158
163
  `Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`,
159
164
  { path: keys[0], target: t }
160
165
  );
161
- const root = t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
162
- let cur = root;
163
- for (let i = 0; i < keys.length - 1; i++) {
164
- const k = keys[i];
165
- let n = cur[k];
166
- if (n != null && typeof n !== 'object')
167
- throw new CmpStrUsageError(
166
+ const root =
167
+ t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
168
+ let cur = root;
169
+ for (let i = 0; i < keys.length - 1; i++) {
170
+ const k = keys[i];
171
+ let n = cur[k];
172
+ ErrorUtil.assert(
173
+ n == null || typeof n === 'object',
168
174
  `Cannot set property <${keys[i + 1]}> of <${JSON.stringify(n)}>`,
169
175
  { path: keys.slice(0, i + 2), value: n }
170
176
  );
171
- if (n == null)
172
- n = cur[k] = typeof keys[i + 1] === 'number' ? [] : Object.create(null);
173
- cur = n;
174
- }
175
- cur[keys[keys.length - 1]] = value;
176
- return root;
177
- }
178
- function merge(
179
- t = Object.create(null),
180
- o = Object.create(null),
181
- mergeUndefined = false
182
- ) {
183
- const target = t ?? Object.create(null);
184
- Object.keys(o).forEach((k) => {
185
- const val = o[k];
186
- if (!mergeUndefined && val === undefined) return;
187
- if (k === '__proto__' || k === 'constructor') return;
188
- if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
189
- const existing = target[k];
190
- target[k] = merge(
191
- existing !== null &&
192
- typeof existing === 'object' &&
193
- !Array.isArray(existing)
194
- ? existing
195
- : Object.create(null),
196
- val,
197
- mergeUndefined
198
- );
199
- } else target[k] = val;
200
- });
201
- return target;
202
- }
203
- function rmv(t, path, preserveEmpty = false) {
204
- const keys = parse(path);
205
- const remove = (obj, i = 0) => {
206
- const key = keys[i];
207
- if (!obj || typeof obj !== 'object') return false;
208
- if (i === keys.length - 1) return delete obj[key];
209
- if (!remove(obj[key], i + 1)) return false;
210
- if (!preserveEmpty) {
211
- const val = obj[key];
212
- if (
213
- typeof val === 'object' &&
214
- ((Array.isArray(val) && val.every((v) => v == null)) ||
215
- (!Array.isArray(val) && Object.keys(val).length === 0))
216
- )
217
- delete obj[key];
177
+ if (n == null)
178
+ n = cur[k] =
179
+ typeof keys[i + 1] === 'number' ? [] : Object.create(null);
180
+ cur = n;
218
181
  }
219
- return true;
220
- };
221
- remove(t);
222
- return t;
182
+ cur[keys[keys.length - 1]] = value;
183
+ return root;
184
+ }
185
+ static rmv(t, path, preserveEmpty = false) {
186
+ const keys = DeepMerge.parse(path);
187
+ const remove = (obj, i = 0) => {
188
+ const key = keys[i];
189
+ if (!obj || typeof obj !== 'object') return false;
190
+ if (i === keys.length - 1) return delete obj[key];
191
+ if (!remove(obj[key], i + 1)) return false;
192
+ if (!preserveEmpty) {
193
+ const val = obj[key];
194
+ let empty = true;
195
+ if (typeof val === 'object') {
196
+ if (Array.isArray(val))
197
+ for (let i = 0; i < val.length; i++) {
198
+ if (val[i] != null) {
199
+ empty = false;
200
+ break;
201
+ }
202
+ }
203
+ else empty = false;
204
+ }
205
+ if (empty) delete obj[key];
206
+ }
207
+ return true;
208
+ };
209
+ remove(t);
210
+ return t;
211
+ }
212
+ static merge(
213
+ t = Object.create(null),
214
+ o = Object.create(null),
215
+ mergeUndefined = false
216
+ ) {
217
+ const target = t ?? Object.create(null);
218
+ for (const k in o) {
219
+ const val = o[k];
220
+ if (!mergeUndefined && val === undefined) continue;
221
+ if (k === '__proto__' || k === 'constructor') continue;
222
+ if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
223
+ const existing = target[k];
224
+ target[k] = DeepMerge.merge(
225
+ existing !== null &&
226
+ typeof existing === 'object' &&
227
+ !Array.isArray(existing)
228
+ ? existing
229
+ : Object.create(null),
230
+ val,
231
+ mergeUndefined
232
+ );
233
+ } else target[k] = val;
234
+ }
235
+ return target;
236
+ }
223
237
  }
224
238
 
225
- var DeepMerge = /*#__PURE__*/ Object.freeze({
226
- __proto__: null,
227
- get: get,
228
- has: has,
229
- merge: merge,
230
- rmv: rmv,
231
- set: set
232
- });
233
-
234
239
  class DiffChecker {
235
240
  a;
236
241
  b;
@@ -530,20 +535,33 @@
530
535
  }
531
536
 
532
537
  class Filter {
538
+ static IDENTITY = (s) => s;
533
539
  static filters = new Map();
534
540
  static pipeline = new Map();
535
- static getPipeline(hook) {
541
+ static getPipeline(hook, force = false) {
536
542
  return ErrorUtil.wrap(
537
543
  () => {
538
- const cached = Filter.pipeline.get(hook);
539
- if (cached) return cached;
544
+ if (!force) {
545
+ const cached = Filter.pipeline.get(hook);
546
+ if (cached) return cached;
547
+ }
540
548
  const filter = Filter.filters.get(hook);
541
- if (!filter) return (s) => s;
542
- const pipeline = Array.from(filter.values())
543
- .filter((f) => f.active)
544
- .sort((a, b) => a.priority - b.priority)
545
- .map((f) => f.fn);
546
- const fn = (input) => pipeline.reduce((v, f) => f(v), input);
549
+ if (!filter) {
550
+ Filter.pipeline.set(hook, Filter.IDENTITY);
551
+ return Filter.IDENTITY;
552
+ }
553
+ const pipeline = [];
554
+ for (const f of filter.values()) if (f.active) pipeline.push(f);
555
+ pipeline.sort((a, b) => a.priority - b.priority);
556
+ const fn =
557
+ pipeline.length === 0
558
+ ? Filter.IDENTITY
559
+ : (input) => {
560
+ let v = input;
561
+ for (let i = 0; i < pipeline.length; i++)
562
+ v = pipeline[i].fn(v);
563
+ return v;
564
+ };
547
565
  Filter.pipeline.set(hook, fn);
548
566
  return fn;
549
567
  },
@@ -561,9 +579,16 @@
561
579
  const filter = Filter.filters.get(hook) ?? new Map();
562
580
  const index = filter.get(id);
563
581
  if (index && !index.overrideable) return false;
582
+ if (
583
+ index &&
584
+ index.fn === fn &&
585
+ index.priority === priority &&
586
+ index.active === active
587
+ )
588
+ return true;
564
589
  filter.set(id, { id, fn, priority, active, overrideable });
565
590
  Filter.filters.set(hook, filter);
566
- Filter.pipeline.delete(hook);
591
+ Filter.getPipeline(hook, true);
567
592
  return true;
568
593
  },
569
594
  `Error adding filter <${id}> to hook <${hook}>`,
@@ -571,19 +596,28 @@
571
596
  );
572
597
  }
573
598
  static remove(hook, id) {
574
- Filter.pipeline.delete(hook);
575
599
  const filter = Filter.filters.get(hook);
576
- return filter ? filter.delete(id) : false;
600
+ if (!filter || !filter.delete(id)) return false;
601
+ Filter.getPipeline(hook, true);
602
+ return true;
577
603
  }
578
604
  static pause(hook, id) {
579
- Filter.pipeline.delete(hook);
580
- const f = Filter.filters.get(hook)?.get(id);
581
- return !!(f && ((f.active = false), true));
605
+ const filter = Filter.filters.get(hook);
606
+ if (!filter) return false;
607
+ const f = filter.get(id);
608
+ if (!f || !f.active) return false;
609
+ f.active = false;
610
+ Filter.getPipeline(hook, true);
611
+ return true;
582
612
  }
583
613
  static resume(hook, id) {
584
- Filter.pipeline.delete(hook);
585
- const f = Filter.filters.get(hook)?.get(id);
586
- return !!(f && ((f.active = true), true));
614
+ const filter = Filter.filters.get(hook);
615
+ if (!filter) return false;
616
+ const f = filter.get(id);
617
+ if (!f || f.active) return false;
618
+ f.active = true;
619
+ Filter.getPipeline(hook, true);
620
+ return true;
587
621
  }
588
622
  static list(hook, active = false) {
589
623
  const filter = Filter.filters.get(hook);
@@ -596,7 +630,11 @@
596
630
  return ErrorUtil.wrap(
597
631
  () => {
598
632
  const fn = Filter.getPipeline(hook);
599
- return Array.isArray(input) ? input.map(fn) : fn(input);
633
+ if (typeof input === 'string') return fn(input);
634
+ const arr = input;
635
+ const out = new Array(arr.length);
636
+ for (let i = 0; i < arr.length; i++) out[i] = fn(arr[i]);
637
+ return out;
600
638
  },
601
639
  `Error applying filters for hook <${hook}>`,
602
640
  { hook, input }
@@ -606,16 +644,19 @@
606
644
  return ErrorUtil.wrapAsync(
607
645
  async () => {
608
646
  const fn = Filter.getPipeline(hook);
609
- return Array.isArray(input)
610
- ? Promise.all(input.map(fn))
611
- : Promise.resolve(fn(input));
647
+ if (typeof input === 'string') return Promise.resolve(fn(input));
648
+ const arr = input;
649
+ const out = new Array(arr.length);
650
+ for (let i = 0; i < arr.length; i++)
651
+ out[i] = Promise.resolve(fn(arr[i]));
652
+ return Promise.all(out);
612
653
  },
613
654
  `Error applying filters for hook <${hook}>`,
614
655
  { hook, input }
615
656
  );
616
657
  }
617
658
  static clear(hook) {
618
- Filter.pipeline.clear();
659
+ Filter.clearPipeline();
619
660
  if (hook) Filter.filters.delete(hook);
620
661
  else Filter.filters.clear();
621
662
  }
@@ -629,25 +670,21 @@
629
670
  static HASH_OFFSET = 0x811c9dc5;
630
671
  static fastFNV1a(str) {
631
672
  const len = str.length;
673
+ const limit = len & -4;
632
674
  let hash = this.HASH_OFFSET;
633
- const chunks = Math.floor(len / 4);
634
- for (let i = 0; i < chunks; i++) {
635
- const pos = i * 4;
675
+ let i = 0;
676
+ for (; i < limit; i += 4) {
636
677
  const chunk =
637
- str.charCodeAt(pos) |
638
- (str.charCodeAt(pos + 1) << 8) |
639
- (str.charCodeAt(pos + 2) << 16) |
640
- (str.charCodeAt(pos + 3) << 24);
678
+ str.charCodeAt(i) |
679
+ (str.charCodeAt(i + 1) << 8) |
680
+ (str.charCodeAt(i + 2) << 16) |
681
+ (str.charCodeAt(i + 3) << 24);
641
682
  hash ^= chunk;
642
683
  hash = Math.imul(hash, this.FNV_PRIME);
643
684
  }
644
- const remaining = len % 4;
645
- if (remaining > 0) {
646
- const pos = chunks * 4;
647
- for (let i = 0; i < remaining; i++) {
648
- hash ^= str.charCodeAt(pos + i);
649
- hash = Math.imul(hash, this.FNV_PRIME);
650
- }
685
+ for (; i < len; i++) {
686
+ hash ^= str.charCodeAt(i);
687
+ hash = Math.imul(hash, this.FNV_PRIME);
651
688
  }
652
689
  hash ^= hash >>> 16;
653
690
  hash *= 0x85ebca6b;
@@ -658,32 +695,51 @@
658
695
  }
659
696
  }
660
697
  class HashTable {
661
- LRU;
698
+ FIFO;
699
+ maxSize;
662
700
  static MAX_LEN = 2048;
663
- static TABLE_SIZE = 10_000;
664
701
  table = new Map();
665
- constructor(LRU = true) {
666
- this.LRU = LRU;
702
+ constructor(FIFO = true, maxSize = 10000) {
703
+ this.FIFO = FIFO;
704
+ this.maxSize = maxSize;
667
705
  }
668
706
  key(label, strs, sorted = false) {
669
- for (const str of strs) if (str.length > HashTable.MAX_LEN) return false;
670
- const hashes = strs.map((s) => Hasher.fastFNV1a(s));
671
- return [label, ...(sorted ? hashes.sort() : hashes)].join('-');
707
+ const n = strs.length;
708
+ const hashes = new Array(n);
709
+ for (let i = 0; i < n; i++) {
710
+ const s = strs[i];
711
+ if (s.length > HashTable.MAX_LEN) return false;
712
+ hashes[i] = Hasher.fastFNV1a(s);
713
+ }
714
+ if (sorted) hashes.sort((a, b) => a - b);
715
+ let key = label;
716
+ for (let i = 0; i < hashes.length; i++) key += '-' + hashes[i];
717
+ return key;
718
+ }
719
+ has(key) {
720
+ return this.table.has(key);
721
+ }
722
+ get(key) {
723
+ return this.table.get(key);
672
724
  }
673
- has = (key) => this.table.has(key);
674
- get = (key) => this.table.get(key);
675
725
  set(key, entry, update = true) {
676
726
  if (!update && this.table.has(key)) return false;
677
- while (!this.table.has(key) && this.table.size >= HashTable.TABLE_SIZE) {
678
- if (!this.LRU) return false;
727
+ if (!this.table.has(key) && this.table.size >= this.maxSize) {
728
+ if (!this.FIFO) return false;
679
729
  this.table.delete(this.table.keys().next().value);
680
730
  }
681
731
  this.table.set(key, entry);
682
732
  return true;
683
733
  }
684
- delete = (key) => this.table.delete(key);
685
- clear = () => this.table.clear();
686
- size = () => this.table.size;
734
+ delete(key) {
735
+ return this.table.delete(key);
736
+ }
737
+ clear() {
738
+ this.table.clear();
739
+ }
740
+ size() {
741
+ return this.table.size;
742
+ }
687
743
  }
688
744
 
689
745
  class Normalizer {
@@ -702,25 +758,49 @@
702
758
  static getPipeline(flags) {
703
759
  return ErrorUtil.wrap(
704
760
  () => {
705
- if (Normalizer.pipeline.has(flags))
706
- return Normalizer.pipeline.get(flags);
761
+ const cached = Normalizer.pipeline.get(flags);
762
+ if (cached) return cached;
707
763
  const { REGEX } = Normalizer;
708
- const steps = [
709
- ['d', (s) => s.normalize('NFD')],
710
- ['i', (s) => s.toLowerCase()],
711
- ['k', (s) => s.replace(REGEX.nonLetters, '')],
712
- ['n', (s) => s.replace(REGEX.nonNumbers, '')],
713
- ['r', (s) => s.replace(REGEX.doubleChars, '$1')],
714
- ['s', (s) => s.replace(REGEX.specialChars, '')],
715
- ['t', (s) => s.trim()],
716
- ['u', (s) => s.normalize('NFC')],
717
- ['w', (s) => s.replace(REGEX.whitespace, ' ')],
718
- ['x', (s) => s.normalize('NFKC')]
719
- ];
720
- const pipeline = steps
721
- .filter(([f]) => flags.includes(f))
722
- .map(([, fn]) => fn);
723
- const fn = (s) => pipeline.reduce((v, f) => f(v), s);
764
+ const steps = [];
765
+ for (let i = 0; i < flags.length; i++) {
766
+ switch (flags[i]) {
767
+ case 'd':
768
+ steps.push((s) => s.normalize('NFD'));
769
+ break;
770
+ case 'i':
771
+ steps.push((s) => s.toLowerCase());
772
+ break;
773
+ case 'k':
774
+ steps.push((s) => s.replace(REGEX.nonLetters, ''));
775
+ break;
776
+ case 'n':
777
+ steps.push((s) => s.replace(REGEX.nonNumbers, ''));
778
+ break;
779
+ case 'r':
780
+ steps.push((s) => s.replace(REGEX.doubleChars, '$1'));
781
+ break;
782
+ case 's':
783
+ steps.push((s) => s.replace(REGEX.specialChars, ''));
784
+ break;
785
+ case 't':
786
+ steps.push((s) => s.trim());
787
+ break;
788
+ case 'u':
789
+ steps.push((s) => s.normalize('NFC'));
790
+ break;
791
+ case 'w':
792
+ steps.push((s) => s.replace(REGEX.whitespace, ' '));
793
+ break;
794
+ case 'x':
795
+ steps.push((s) => s.normalize('NFKC'));
796
+ break;
797
+ }
798
+ }
799
+ const fn = (input) => {
800
+ let v = input;
801
+ for (let i = 0; i < steps.length; i++) v = steps[i](v);
802
+ return v;
803
+ };
724
804
  Normalizer.pipeline.set(flags, fn);
725
805
  return fn;
726
806
  },
@@ -728,19 +808,23 @@
728
808
  { flags }
729
809
  );
730
810
  }
731
- static normalize(input, flags) {
811
+ static normalize(input, flags, normalizedFlags) {
732
812
  return ErrorUtil.wrap(
733
813
  () => {
734
814
  if (!flags || typeof flags !== 'string' || !input) return input;
735
- flags = this.canonicalFlags(flags);
736
- if (Array.isArray(input))
737
- return input.map((s) => Normalizer.normalize(s, flags));
738
- const key = Normalizer.cache.key(flags, [input]);
739
- if (key && Normalizer.cache.has(key))
740
- return Normalizer.cache.get(key);
741
- const res = Normalizer.getPipeline(flags)(input);
742
- if (key) Normalizer.cache.set(key, res);
743
- return res;
815
+ flags = normalizedFlags ?? this.canonicalFlags(flags);
816
+ const pipeline = Normalizer.getPipeline(flags);
817
+ const normalizeOne = (s) => {
818
+ const key = Normalizer.cache.key(flags, [s]);
819
+ if (key && Normalizer.cache.has(key))
820
+ return Normalizer.cache.get(key);
821
+ const res = pipeline(s);
822
+ if (key) Normalizer.cache.set(key, res);
823
+ return res;
824
+ };
825
+ return Array.isArray(input)
826
+ ? input.map(normalizeOne)
827
+ : normalizeOne(input);
744
828
  },
745
829
  `Failed to normalize input with flags: ${flags}`,
746
830
  { input, flags }
@@ -764,17 +848,144 @@
764
848
  }
765
849
  }
766
850
 
851
+ class RingPool {
852
+ maxSize;
853
+ buffers = [];
854
+ pointer = 0;
855
+ constructor(maxSize) {
856
+ this.maxSize = maxSize;
857
+ }
858
+ acquire(minSize, allowOversize) {
859
+ return ErrorUtil.wrap(
860
+ () => {
861
+ const buffers = this.buffers;
862
+ const len = buffers.length;
863
+ for (let i = 0; i < len; i++) {
864
+ const idx = (this.pointer + i) % len;
865
+ const item = buffers[idx];
866
+ const size = item.size;
867
+ if (size >= minSize && (allowOversize || size === minSize)) {
868
+ this.pointer = (idx + 1) % len;
869
+ return item;
870
+ }
871
+ }
872
+ return null;
873
+ },
874
+ `Failed to acquire buffer of size >= ${minSize} from pool`,
875
+ { minSize, allowOversize }
876
+ );
877
+ }
878
+ release(item) {
879
+ ErrorUtil.wrap(
880
+ () => {
881
+ const buffers = this.buffers;
882
+ if (buffers.length < this.maxSize) {
883
+ buffers.push(item);
884
+ return;
885
+ }
886
+ buffers[this.pointer] = item;
887
+ this.pointer = (this.pointer + 1) % this.maxSize;
888
+ },
889
+ `Failed to release buffer back to pool`,
890
+ { item }
891
+ );
892
+ }
893
+ clear() {
894
+ this.buffers = [];
895
+ this.pointer = 0;
896
+ }
897
+ }
898
+ class Pool {
899
+ static CONFIG = {
900
+ int32: {
901
+ type: 'int32',
902
+ maxSize: 64,
903
+ maxItemSize: 2048,
904
+ allowOversize: true
905
+ },
906
+ 'arr[]': {
907
+ type: 'arr[]',
908
+ maxSize: 4,
909
+ maxItemSize: 1024,
910
+ allowOversize: false
911
+ },
912
+ 'number[]': {
913
+ type: 'number[]',
914
+ maxSize: 16,
915
+ maxItemSize: 1024,
916
+ allowOversize: false
917
+ },
918
+ 'string[]': {
919
+ type: 'string[]',
920
+ maxSize: 2,
921
+ maxItemSize: 1024,
922
+ allowOversize: false
923
+ },
924
+ set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
925
+ map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
926
+ };
927
+ static POOLS = {
928
+ int32: new RingPool(64),
929
+ 'arr[]': new RingPool(4),
930
+ 'number[]': new RingPool(16),
931
+ 'string[]': new RingPool(2),
932
+ set: new RingPool(8),
933
+ map: new RingPool(8)
934
+ };
935
+ static allocate(type, size) {
936
+ switch (type) {
937
+ case 'int32':
938
+ return new Int32Array(size);
939
+ case 'arr[]':
940
+ return new Array(size);
941
+ case 'number[]':
942
+ return new Float64Array(size);
943
+ case 'string[]':
944
+ return new Array(size);
945
+ case 'set':
946
+ return new Set();
947
+ case 'map':
948
+ return new Map();
949
+ }
950
+ }
951
+ static acquire(type, size) {
952
+ const CONFIG = this.CONFIG[type];
953
+ if (!CONFIG)
954
+ throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
955
+ if (size > CONFIG.maxItemSize) return this.allocate(type, size);
956
+ const item = this.POOLS[type].acquire(size, CONFIG.allowOversize);
957
+ if (item)
958
+ return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
959
+ return this.allocate(type, size);
960
+ }
961
+ static acquireMany(type, sizes) {
962
+ const out = new Array(sizes.length);
963
+ for (let i = 0; i < sizes.length; i++)
964
+ out[i] = this.acquire(type, sizes[i]);
965
+ return out;
966
+ }
967
+ static release(type, buffer, size) {
968
+ const CONFIG = this.CONFIG[type];
969
+ if (!CONFIG)
970
+ throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
971
+ if (size <= CONFIG.maxItemSize)
972
+ this.POOLS[type].release({ buffer, size });
973
+ }
974
+ }
975
+
767
976
  class Profiler {
768
977
  active;
769
978
  static ENV;
770
979
  static instance;
771
980
  nowFn;
772
981
  memFn;
773
- store = new Set();
982
+ store = [];
983
+ last;
774
984
  totalTime = 0;
775
985
  totalMem = 0;
776
986
  static detectEnv() {
777
- if (typeof process !== 'undefined') Profiler.ENV = 'nodejs';
987
+ if (typeof process !== 'undefined' && process.versions?.node)
988
+ Profiler.ENV = 'nodejs';
778
989
  else if (typeof performance !== 'undefined') Profiler.ENV = 'browser';
779
990
  else Profiler.ENV = 'unknown';
780
991
  }
@@ -786,7 +997,7 @@
786
997
  this.active = active;
787
998
  switch (Profiler.ENV) {
788
999
  case 'nodejs':
789
- this.nowFn = () => Number(process.hrtime.bigint()) / 1e6;
1000
+ this.nowFn = () => Number(process.hrtime.bigint()) * 1e-6;
790
1001
  this.memFn = () => process.memoryUsage().heapUsed;
791
1002
  break;
792
1003
  case 'browser':
@@ -799,40 +1010,52 @@
799
1010
  break;
800
1011
  }
801
1012
  }
802
- now = () => this.nowFn();
803
- mem = () => this.memFn();
804
- profile(fn, meta) {
805
- const startTime = this.now(),
806
- startMem = this.mem();
807
- const res = fn();
808
- const deltaTime = this.now() - startTime,
809
- deltaMem = this.mem() - startMem;
810
- this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
811
- ((this.totalTime += deltaTime), (this.totalMem += deltaMem));
812
- return res;
1013
+ storeRes(entry) {
1014
+ this.store.push((this.last = entry));
1015
+ this.totalTime += entry.time;
1016
+ this.totalMem += entry.mem;
813
1017
  }
814
- enable = () => {
1018
+ enable() {
815
1019
  this.active = true;
816
- };
817
- disable = () => {
1020
+ }
1021
+ disable() {
818
1022
  this.active = false;
819
- };
1023
+ }
820
1024
  clear() {
821
- this.store.clear();
1025
+ this.store.length = 0;
1026
+ this.last = undefined;
822
1027
  this.totalTime = 0;
823
1028
  this.totalMem = 0;
824
1029
  }
825
1030
  run(fn, meta = {}) {
826
- return this.active ? this.profile(fn, meta) : fn();
1031
+ if (!this.active) return fn();
1032
+ const startTime = this.nowFn(),
1033
+ startMem = this.memFn();
1034
+ const res = fn();
1035
+ const deltaTime = this.nowFn() - startTime,
1036
+ deltaMem = this.memFn() - startMem;
1037
+ this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
1038
+ return res;
827
1039
  }
828
1040
  async runAsync(fn, meta = {}) {
829
- return this.active
830
- ? this.profile(async () => await fn(), meta)
831
- : await fn();
1041
+ if (!this.active) return fn();
1042
+ const startTime = this.nowFn(),
1043
+ startMem = this.memFn();
1044
+ const res = await fn();
1045
+ const deltaTime = this.nowFn() - startTime,
1046
+ deltaMem = this.memFn() - startMem;
1047
+ this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
1048
+ return res;
1049
+ }
1050
+ getAll() {
1051
+ return [...this.store];
1052
+ }
1053
+ getLast() {
1054
+ return this.last;
1055
+ }
1056
+ getTotal() {
1057
+ return { time: this.totalTime, mem: this.totalMem };
832
1058
  }
833
- getAll = () => [...this.store];
834
- getLast = () => this.getAll().pop();
835
- getTotal = () => ({ time: this.totalTime, mem: this.totalMem });
836
1059
  services = Object.freeze({
837
1060
  enable: this.enable.bind(this),
838
1061
  disable: this.disable.bind(this),
@@ -908,1278 +1131,841 @@
908
1131
  throw new CmpStrNotFoundError(`Registry <${reg}> does not exist`, {
909
1132
  registry: reg
910
1133
  });
911
- return typeof cls === 'string' ? registry[reg]?.get(cls) : cls;
1134
+ return typeof cls === 'string' ? registry[reg].get(cls) : cls;
912
1135
  }
913
1136
  function createFromRegistry(reg, cls, ...args) {
914
- cls = resolveCls(reg, cls);
1137
+ const ctor = resolveCls(reg, cls);
915
1138
  return ErrorUtil.wrap(
916
- () => new cls(...args),
917
- `Failed to create instance of class <${cls.name ?? cls}> from registry <${reg}>`,
1139
+ () => new ctor(...args),
1140
+ `Failed to create instance of class <${ctor.name ?? cls}> from registry <${reg}>`,
918
1141
  { registry: reg, class: cls, args }
919
1142
  );
920
1143
  }
921
1144
 
922
- class RingPool {
923
- maxSize;
924
- buffers = [];
925
- pointer = 0;
926
- constructor(maxSize) {
927
- this.maxSize = maxSize;
1145
+ const profiler$2 = Profiler.getInstance();
1146
+ class Metric {
1147
+ static cache = new HashTable();
1148
+ metric;
1149
+ a;
1150
+ b;
1151
+ origA = [];
1152
+ origB = [];
1153
+ options;
1154
+ optKey;
1155
+ symmetric;
1156
+ results;
1157
+ static clear() {
1158
+ this.cache.clear();
928
1159
  }
929
- acquire(minSize, allowOversize) {
930
- return ErrorUtil.wrap(
931
- () => {
932
- const len = this.buffers.length;
933
- for (let i = 0; i < len; i++) {
934
- const idx = (this.pointer + i) & (len - 1);
935
- const item = this.buffers[idx];
936
- if (
937
- item.size >= minSize &&
938
- (allowOversize || item.size === minSize)
939
- ) {
940
- this.pointer = (idx + 1) & (len - 1);
941
- return item;
942
- }
943
- }
944
- return null;
945
- },
946
- `Failed to acquire buffer of size >= ${minSize} from pool`,
947
- { minSize, allowOversize }
948
- );
1160
+ static swap(a, b, m, n) {
1161
+ return m > n ? [b, a, n, m] : [a, b, m, n];
949
1162
  }
950
- release(item) {
951
- ErrorUtil.wrap(
952
- () => {
953
- if (this.buffers.length < this.maxSize)
954
- return void [this.buffers.push(item)];
955
- this.buffers[this.pointer] = item;
956
- this.pointer = (this.pointer + 1) % this.maxSize;
957
- },
958
- `Failed to release buffer back to pool`,
959
- { item }
1163
+ static clamp(res) {
1164
+ return Math.max(0, Math.min(1, res));
1165
+ }
1166
+ constructor(metric, a, b, opt = {}, symmetric = false) {
1167
+ this.metric = metric;
1168
+ this.a = Array.isArray(a) ? a : [a];
1169
+ this.b = Array.isArray(b) ? b : [b];
1170
+ ErrorUtil.assert(
1171
+ this.a.length > 0 && this.b.length > 0,
1172
+ `Inputs <a> and <b> must not be empty`,
1173
+ { a: this.a, b: this.b }
960
1174
  );
1175
+ this.options = opt;
1176
+ this.optKey = Hasher.fastFNV1a(
1177
+ JSON.stringify(opt, Object.keys(opt).sort())
1178
+ ).toString();
1179
+ this.symmetric = symmetric;
961
1180
  }
962
- clear() {
963
- this.buffers = [];
964
- this.pointer = 0;
1181
+ preCompute(a, b, m, n) {
1182
+ if (a === b) return { res: 1 };
1183
+ if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
1184
+ return undefined;
965
1185
  }
966
- }
967
- class Pool {
968
- static CONFIG = {
969
- int32: {
970
- type: 'int32',
971
- maxSize: 64,
972
- maxItemSize: 2048,
973
- allowOversize: true
974
- },
975
- 'number[]': {
976
- type: 'number[]',
977
- maxSize: 16,
978
- maxItemSize: 1024,
979
- allowOversize: false
980
- },
981
- 'string[]': {
982
- type: 'string[]',
983
- maxSize: 2,
984
- maxItemSize: 1024,
985
- allowOversize: false
986
- },
987
- set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
988
- map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
989
- };
990
- static POOLS = {
991
- int32: new RingPool(64),
992
- 'number[]': new RingPool(16),
993
- 'string[]': new RingPool(2),
994
- set: new RingPool(8),
995
- map: new RingPool(8)
996
- };
997
- static allocate(type, size) {
998
- switch (type) {
999
- case 'int32':
1000
- return new Int32Array(size);
1001
- case 'number[]':
1002
- return new Float64Array(size);
1003
- case 'string[]':
1004
- return new Array(size);
1005
- case 'set':
1006
- return new Set();
1007
- case 'map':
1008
- return new Map();
1009
- }
1010
- }
1011
- static acquire(type, size) {
1012
- const CONFIG = this.CONFIG[type];
1013
- if (!CONFIG)
1014
- throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
1015
- if (size > CONFIG.maxItemSize) return this.allocate(type, size);
1016
- const item = this.POOLS[type].acquire(size, CONFIG.allowOversize);
1017
- if (item)
1018
- return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
1019
- return this.allocate(type, size);
1186
+ compute(a, b, m, n, maxLen) {
1187
+ throw new CmpStrInternalError(
1188
+ `Method compute() must be overridden in a subclass`
1189
+ );
1020
1190
  }
1021
- static acquireMany(type, sizes) {
1022
- return sizes.map((size) => this.acquire(type, size));
1191
+ runSingle(i, j) {
1192
+ return ErrorUtil.wrap(
1193
+ () => {
1194
+ let a = String(this.a[i]),
1195
+ A = a;
1196
+ let b = String(this.b[j]),
1197
+ B = b;
1198
+ let m = A.length,
1199
+ n = B.length;
1200
+ let result = this.preCompute(A, B, m, n);
1201
+ if (!result) {
1202
+ result = profiler$2.run(() => {
1203
+ if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
1204
+ let key = Metric.cache.key(this.metric, [A, B], this.symmetric);
1205
+ if (key) key += this.optKey;
1206
+ return (
1207
+ Metric.cache.get(key || '') ??
1208
+ (() => {
1209
+ const maxLen = m > n ? m : n;
1210
+ const res = this.compute(A, B, m, n, maxLen);
1211
+ if (key) Metric.cache.set(key, res);
1212
+ return res;
1213
+ })()
1214
+ );
1215
+ });
1216
+ }
1217
+ return {
1218
+ metric: this.metric,
1219
+ a: this.origA.length > i ? this.origA[i] : a,
1220
+ b: this.origB.length > j ? this.origB[j] : b,
1221
+ ...result
1222
+ };
1223
+ },
1224
+ `Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
1225
+ { i, j }
1226
+ );
1023
1227
  }
1024
- static release(type, buffer, size) {
1025
- const CONFIG = this.CONFIG[type];
1026
- if (!CONFIG)
1027
- throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
1028
- if (size <= CONFIG.maxItemSize)
1029
- this.POOLS[type].release({ buffer, size });
1228
+ async runSingleAsync(i, j) {
1229
+ return Promise.resolve(this.runSingle(i, j));
1030
1230
  }
1031
- }
1032
-
1033
- class StructuredData {
1034
- data;
1035
- key;
1036
- static create(data, key) {
1037
- return new StructuredData(data, key);
1231
+ runBatch() {
1232
+ const results = [];
1233
+ for (let i = 0; i < this.a.length; i++)
1234
+ for (let j = 0; j < this.b.length; j++)
1235
+ results.push(this.runSingle(i, j));
1236
+ this.results = results;
1038
1237
  }
1039
- constructor(data, key) {
1040
- this.data = data;
1041
- this.key = key;
1238
+ async runBatchAsync() {
1239
+ const tasks = [];
1240
+ for (let i = 0; i < this.a.length; i++)
1241
+ for (let j = 0; j < this.b.length; j++)
1242
+ tasks.push(this.runSingleAsync(i, j));
1243
+ this.results = await Promise.all(tasks);
1042
1244
  }
1043
- extractFrom(arr, key) {
1044
- const result = Pool.acquire('string[]', arr.length);
1045
- for (let i = 0; i < arr.length; i++) {
1046
- const val = arr[i][key];
1047
- result[i] = typeof val === 'string' ? val : String(val ?? '');
1048
- }
1049
- return result;
1245
+ runPairwise() {
1246
+ const results = [];
1247
+ for (let i = 0; i < this.a.length; i++)
1248
+ results.push(this.runSingle(i, i));
1249
+ this.results = results;
1050
1250
  }
1051
- extract = () => this.extractFrom(this.data, this.key);
1052
- isMetricResult(v) {
1053
- return (
1054
- typeof v === 'object' &&
1055
- v !== null &&
1056
- 'a' in v &&
1057
- 'b' in v &&
1058
- 'res' in v
1059
- );
1251
+ async runPairwiseAsync() {
1252
+ const tasks = [];
1253
+ for (let i = 0; i < this.a.length; i++)
1254
+ tasks.push(this.runSingleAsync(i, i));
1255
+ this.results = await Promise.all(tasks);
1060
1256
  }
1061
- isCmpStrResult(v) {
1062
- return (
1063
- typeof v === 'object' &&
1064
- v !== null &&
1065
- 'source' in v &&
1066
- 'target' in v &&
1067
- 'match' in v
1068
- );
1257
+ setOriginal(a, b) {
1258
+ if (a) this.origA = Array.isArray(a) ? a : [a];
1259
+ if (b) this.origB = Array.isArray(b) ? b : [b];
1260
+ return this;
1069
1261
  }
1070
- normalizeResults(results) {
1071
- if (!Array.isArray(results) || results.length === 0) return [];
1072
- const first = results[0];
1073
- let normalized = [];
1074
- if (this.isMetricResult(first)) normalized = results;
1075
- else if (this.isCmpStrResult(first))
1076
- normalized = results.map((r) => ({
1077
- metric: 'unknown',
1078
- a: r.source,
1079
- b: r.target,
1080
- res: r.match,
1081
- raw: r.raw
1082
- }));
1083
- else
1084
- throw new CmpStrValidationError(
1085
- 'Unsupported result format for StructuredData normalization.'
1086
- );
1087
- return normalized.map((r, idx) => ({ ...r, __idx: idx }));
1262
+ isBatch() {
1263
+ return this.a.length > 1 || this.b.length > 1;
1088
1264
  }
1089
- rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
1090
- const stringToIndices = new Map();
1091
- for (let i = 0; i < extractedStrings.length; i++) {
1092
- const str = extractedStrings[i];
1093
- if (!stringToIndices.has(str)) stringToIndices.set(str, []);
1094
- stringToIndices.get(str).push(i);
1095
- }
1096
- const output = new Array(results.length);
1097
- const occurrenceCount = new Map();
1098
- let out = 0;
1099
- for (let i = 0; i < results.length; i++) {
1100
- const result = results[i];
1101
- if (removeZero && result.res === 0) continue;
1102
- const targetStr = result.b || '';
1103
- const indices = stringToIndices.get(targetStr);
1104
- let dataIndex;
1105
- if (indices && indices.length > 0) {
1106
- const occurrence = occurrenceCount.get(targetStr) ?? 0;
1107
- occurrenceCount.set(targetStr, occurrence + 1);
1108
- dataIndex = indices[occurrence % indices.length];
1109
- } else {
1110
- dataIndex = result.__idx ?? i;
1111
- }
1112
- if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
1113
- const sourceObj = sourceData[dataIndex];
1114
- const mappedTarget = extractedStrings[dataIndex] || targetStr;
1115
- if (objectsOnly) output[out++] = sourceObj;
1116
- else
1117
- output[out++] = {
1118
- obj: sourceObj,
1119
- key: this.key,
1120
- result: {
1121
- source: result.a,
1122
- target: mappedTarget,
1123
- match: result.res
1124
- },
1125
- ...(result.raw ? { raw: result.raw } : null)
1126
- };
1127
- }
1128
- output.length = out;
1129
- return output;
1265
+ isSingle() {
1266
+ return !this.isBatch();
1130
1267
  }
1131
- sort(results, sort) {
1132
- if (!sort || results.length <= 1) return results;
1133
- const asc = sort === 'asc';
1134
- return results.sort((a, b) => (asc ? a.res - b.res : b.res - a.res));
1268
+ isPairwise(safe = false) {
1269
+ return this.isBatch() && this.a.length === this.b.length
1270
+ ? true
1271
+ : !safe &&
1272
+ (() => {
1273
+ throw new CmpStrUsageError(
1274
+ `Mode <pairwise> requires arrays of equal length`,
1275
+ { a: this.a, b: this.b }
1276
+ );
1277
+ })();
1135
1278
  }
1136
- finalizeLookup(results, extractedStrings, opt) {
1137
- return this.rebuild(
1138
- this.sort(this.normalizeResults(results), opt?.sort),
1139
- this.data,
1140
- extractedStrings,
1141
- opt?.removeZero,
1142
- opt?.objectsOnly
1143
- );
1279
+ isSymmetrical() {
1280
+ return this.symmetric;
1144
1281
  }
1145
- performLookup(fn, extractedStrings, opt) {
1146
- return ErrorUtil.wrap(
1147
- () => this.finalizeLookup(fn(), extractedStrings, opt),
1148
- 'StructuredData lookup failed',
1149
- { key: this.key }
1150
- );
1282
+ whichMode(mode) {
1283
+ return mode ?? this.options.mode ?? 'default';
1151
1284
  }
1152
- async performLookupAsync(fn, extractedStrings, opt) {
1153
- return await ErrorUtil.wrapAsync(
1154
- async () => this.finalizeLookup(await fn(), extractedStrings, opt),
1155
- 'StructuredData async lookup failed',
1156
- { key: this.key }
1157
- );
1285
+ clear() {
1286
+ this.results = undefined;
1158
1287
  }
1159
- lookup(fn, query, opt) {
1160
- const b = this.extract();
1161
- try {
1162
- return this.performLookup(() => fn(query, b, opt), b, opt);
1163
- } finally {
1164
- Pool.release('string[]', b, b.length);
1288
+ run(mode, clear = true) {
1289
+ if (clear) this.clear();
1290
+ switch (this.whichMode(mode)) {
1291
+ case 'default':
1292
+ if (this.isSingle()) {
1293
+ this.results = this.runSingle(0, 0);
1294
+ break;
1295
+ }
1296
+ case 'batch':
1297
+ this.runBatch();
1298
+ break;
1299
+ case 'single':
1300
+ this.results = this.runSingle(0, 0);
1301
+ break;
1302
+ case 'pairwise':
1303
+ if (this.isPairwise()) this.runPairwise();
1304
+ break;
1305
+ default:
1306
+ throw new CmpStrInternalError(`Unsupported mode <${mode}>`);
1165
1307
  }
1166
1308
  }
1167
- async lookupAsync(fn, query, opt) {
1168
- const b = this.extract();
1169
- try {
1170
- return await this.performLookupAsync(() => fn(query, b, opt), b, opt);
1171
- } finally {
1172
- Pool.release('string[]', b, b.length);
1309
+ async runAsync(mode, clear = true) {
1310
+ if (clear) this.clear();
1311
+ switch (this.whichMode(mode)) {
1312
+ case 'default':
1313
+ if (this.isSingle()) {
1314
+ this.results = await this.runSingleAsync(0, 0);
1315
+ break;
1316
+ }
1317
+ case 'batch':
1318
+ await this.runBatchAsync();
1319
+ break;
1320
+ case 'single':
1321
+ this.results = await this.runSingleAsync(0, 0);
1322
+ break;
1323
+ case 'pairwise':
1324
+ if (this.isPairwise()) await this.runPairwiseAsync();
1325
+ break;
1326
+ default:
1327
+ throw new CmpStrInternalError(`Unsupported async mode <${mode}>`);
1173
1328
  }
1174
1329
  }
1175
- lookupPairs(fn, other, otherKey, opt) {
1176
- const a = this.extract();
1177
- const b = this.extractFrom(other, otherKey);
1178
- try {
1179
- return this.performLookup(() => fn(a, b, opt), a, opt);
1180
- } finally {
1181
- Pool.release('string[]', a, a.length);
1182
- Pool.release('string[]', b, b.length);
1183
- }
1330
+ getMetricName() {
1331
+ return this.metric;
1184
1332
  }
1185
- async lookupPairsAsync(fn, other, otherKey, opt) {
1186
- const a = this.extract();
1187
- const b = this.extractFrom(other, otherKey);
1188
- try {
1189
- return await this.performLookupAsync(() => fn(a, b, opt), a, opt);
1190
- } finally {
1191
- Pool.release('string[]', a, a.length);
1192
- Pool.release('string[]', b, b.length);
1193
- }
1333
+ getResults() {
1334
+ ErrorUtil.assert(
1335
+ this.results !== undefined,
1336
+ `run() must be called before getResults()`
1337
+ );
1338
+ return this.results;
1194
1339
  }
1195
1340
  }
1341
+ const MetricRegistry = Registry('metric', Metric);
1196
1342
 
1197
- class TextAnalyzer {
1198
- static REGEX = {
1199
- number: /\d/,
1200
- sentence: /(?<=[.!?])\s+/,
1201
- word: /\p{L}+/gu,
1202
- nonWord: /[^\p{L}]/gu,
1203
- vowelGroup: /[aeiouy]+/g,
1204
- letter: /\p{L}/gu,
1205
- ucLetter: /\p{Lu}/gu
1206
- };
1207
- text;
1208
- words = [];
1209
- sentences = [];
1210
- charFrequency = new Map();
1211
- wordHistogram = new Map();
1212
- syllableCache = new Map();
1213
- syllableStats;
1214
- constructor(input) {
1215
- this.text = input.trim();
1216
- this.tokenize();
1217
- this.computeFrequencies();
1343
+ class CosineSimilarity extends Metric {
1344
+ constructor(a, b, opt = {}) {
1345
+ super('cosine', a, b, opt, true);
1218
1346
  }
1219
- tokenize() {
1220
- let match;
1221
- const lcText = this.text.toLowerCase();
1222
- while ((match = TextAnalyzer.REGEX.word.exec(lcText)) !== null)
1223
- this.words.push(match[0]);
1224
- this.sentences = this.text
1225
- .split(TextAnalyzer.REGEX.sentence)
1226
- .filter(Boolean);
1347
+ _termFreq(str, delimiter) {
1348
+ const terms = str.split(delimiter);
1349
+ const freq = Pool.acquire('map', terms.length);
1350
+ for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
1351
+ return freq;
1227
1352
  }
1228
- computeFrequencies() {
1229
- for (const char of this.text)
1230
- this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
1231
- for (const word of this.words)
1232
- this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
1353
+ compute(a, b) {
1354
+ const { delimiter = ' ' } = this.options;
1355
+ const termsA = this._termFreq(a, delimiter);
1356
+ const termsB = this._termFreq(b, delimiter);
1357
+ try {
1358
+ let dotP = 0,
1359
+ magA = 0,
1360
+ magB = 0;
1361
+ for (const [term, freqA] of termsA) {
1362
+ const freqB = termsB.get(term) || 0;
1363
+ dotP += freqA * freqB;
1364
+ magA += freqA * freqA;
1365
+ }
1366
+ for (const freqB of termsB.values()) magB += freqB * freqB;
1367
+ magA = Math.sqrt(magA);
1368
+ magB = Math.sqrt(magB);
1369
+ return {
1370
+ res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
1371
+ raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
1372
+ };
1373
+ } finally {
1374
+ Pool.release('map', termsA, termsA.size);
1375
+ Pool.release('map', termsB, termsB.size);
1376
+ }
1233
1377
  }
1234
- estimateSyllables(word) {
1235
- const clean = word
1236
- .normalize('NFC')
1237
- .toLowerCase()
1238
- .replace(TextAnalyzer.REGEX.nonWord, '');
1239
- if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
1240
- const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
1241
- const count = matches ? matches.length : 1;
1242
- this.syllableCache.set(clean, count);
1243
- return count;
1378
+ }
1379
+ MetricRegistry.add('cosine', CosineSimilarity);
1380
+
1381
+ class DamerauLevenshteinDistance extends Metric {
1382
+ constructor(a, b, opt = {}) {
1383
+ super('damerau', a, b, opt, true);
1244
1384
  }
1245
- computeSyllableStats() {
1246
- return (this.syllableStats ||= (() => {
1247
- const perWord = this.words
1248
- .map((w) => this.estimateSyllables(w))
1249
- .sort((a, b) => a - b);
1250
- const total = perWord.reduce((sum, s) => sum + s, 0);
1251
- const mono = perWord.filter((s) => s === 1).length;
1252
- const median = !perWord.length
1253
- ? 0
1254
- : perWord.length % 2 === 0
1255
- ? (perWord[perWord.length / 2 - 1] + perWord[perWord.length / 2]) /
1256
- 2
1257
- : perWord[Math.floor(perWord.length / 2)];
1385
+ compute(a, b, m, n, maxLen) {
1386
+ const len = m + 1;
1387
+ const [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
1388
+ try {
1389
+ for (let i = 0; i <= m; i++) prev[i] = i;
1390
+ for (let j = 1; j <= n; j++) {
1391
+ curr[0] = j;
1392
+ const cb = b.charCodeAt(j - 1);
1393
+ for (let i = 1; i <= m; i++) {
1394
+ const ca = a.charCodeAt(i - 1);
1395
+ const cost = ca === cb ? 0 : 1;
1396
+ let val = Math.min(
1397
+ curr[i - 1] + 1,
1398
+ prev[i] + 1,
1399
+ prev[i - 1] + cost
1400
+ );
1401
+ if (
1402
+ i > 1 &&
1403
+ j > 1 &&
1404
+ ca === b.charCodeAt(j - 2) &&
1405
+ cb === a.charCodeAt(i - 2)
1406
+ )
1407
+ val = Math.min(val, test[i - 2] + cost);
1408
+ curr[i] = val;
1409
+ }
1410
+ test.set(prev);
1411
+ prev.set(curr);
1412
+ }
1413
+ const dist = prev[m];
1258
1414
  return {
1259
- total,
1260
- mono,
1261
- perWord,
1262
- avg: perWord.length ? total / perWord.length : 0,
1263
- median
1415
+ res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
1416
+ raw: { dist, maxLen }
1264
1417
  };
1265
- })());
1266
- }
1267
- getLength = () => this.text.length;
1268
- getWordCount = () => this.words.length;
1269
- getSentenceCount = () => this.sentences.length;
1270
- getAvgWordLength() {
1271
- return this.words.length
1272
- ? this.words.join('').length / this.words.length
1273
- : 0;
1418
+ } finally {
1419
+ Pool.release('int32', test, len);
1420
+ Pool.release('int32', prev, len);
1421
+ Pool.release('int32', curr, len);
1422
+ }
1274
1423
  }
1275
- getAvgSentenceLength() {
1276
- return this.sentences.length
1277
- ? this.words.length / this.sentences.length
1278
- : 0;
1424
+ }
1425
+ MetricRegistry.add('damerau', DamerauLevenshteinDistance);
1426
+
1427
+ class DiceSorensenCoefficient extends Metric {
1428
+ constructor(a, b, opt = {}) {
1429
+ super('dice', a, b, opt, true);
1279
1430
  }
1280
- getWordHistogram() {
1281
- return Object.fromEntries(this.wordHistogram);
1431
+ _bigrams(str) {
1432
+ const len = str.length - 1;
1433
+ const bigrams = Pool.acquire('set', len);
1434
+ for (let i = 0; i < len; i++) bigrams.add(str.substring(i, i + 2));
1435
+ return bigrams;
1282
1436
  }
1283
- getMostCommonWords(limit = 5) {
1284
- return [...this.wordHistogram.entries()]
1285
- .sort((a, b) => b[1] - a[1])
1286
- .slice(0, limit)
1287
- .map((e) => e[0]);
1437
+ compute(a, b) {
1438
+ const setA = this._bigrams(a),
1439
+ setB = this._bigrams(b);
1440
+ const sizeA = setA.size,
1441
+ sizeB = setB.size;
1442
+ try {
1443
+ let intersection = 0;
1444
+ for (const bigram of setA) if (setB.has(bigram)) intersection++;
1445
+ const size = sizeA + sizeB;
1446
+ return {
1447
+ res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
1448
+ raw: { intersection, size }
1449
+ };
1450
+ } finally {
1451
+ Pool.release('set', setA, sizeA);
1452
+ Pool.release('set', setB, sizeB);
1453
+ }
1288
1454
  }
1289
- getHapaxLegomena() {
1290
- return [...this.wordHistogram.entries()]
1291
- .filter(([, c]) => c === 1)
1292
- .map((e) => e[0]);
1455
+ }
1456
+ MetricRegistry.add('dice', DiceSorensenCoefficient);
1457
+
1458
+ class HammingDistance extends Metric {
1459
+ constructor(a, b, opt = {}) {
1460
+ super('hamming', a, b, opt, true);
1293
1461
  }
1294
- hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);
1295
- getUpperCaseRatio() {
1296
- const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
1297
- const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
1298
- return matches.length ? upper / matches.length : 0;
1462
+ compute(a, b, m, n, maxLen) {
1463
+ if (m !== n) {
1464
+ if (this.options.pad !== undefined) {
1465
+ if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
1466
+ if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
1467
+ m = n = maxLen;
1468
+ } else
1469
+ throw new CmpStrUsageError(
1470
+ `Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
1471
+ `use option.pad for automatic adjustment`,
1472
+ { a: m, b: n }
1473
+ );
1474
+ }
1475
+ let dist = 0;
1476
+ for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
1477
+ return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
1299
1478
  }
1300
- getCharFrequency() {
1301
- return Object.fromEntries(this.charFrequency);
1479
+ }
1480
+ MetricRegistry.add('hamming', HammingDistance);
1481
+
1482
+ class JaccardIndex extends Metric {
1483
+ constructor(a, b, opt = {}) {
1484
+ super('jaccard', a, b, opt, true);
1302
1485
  }
1303
- getUnicodeCodepoints() {
1304
- const result = {};
1305
- for (const [char, count] of this.charFrequency) {
1306
- const block = char
1307
- .charCodeAt(0)
1308
- .toString(16)
1309
- .padStart(4, '0')
1310
- .toUpperCase();
1311
- result[block] = (result[block] || 0) + count;
1486
+ compute(a, b, m, n) {
1487
+ const [setA, setB] = Pool.acquireMany('set', [m, n]);
1488
+ try {
1489
+ for (const A of a) setA.add(A);
1490
+ for (const B of b) setB.add(B);
1491
+ let intersection = 0;
1492
+ for (const c of setA) if (setB.has(c)) intersection++;
1493
+ const union = setA.size + setB.size - intersection;
1494
+ return {
1495
+ res: union === 0 ? 1 : Metric.clamp(intersection / union),
1496
+ raw: { intersection, union }
1497
+ };
1498
+ } finally {
1499
+ Pool.release('set', setA, m);
1500
+ Pool.release('set', setB, n);
1312
1501
  }
1313
- return result;
1314
1502
  }
1315
- getLongWordRatio(len = 7) {
1316
- let long = 0;
1317
- for (const w of this.words) if (w.length >= len) long++;
1318
- return this.words.length ? long / this.words.length : 0;
1319
- }
1320
- getShortWordRatio(len = 3) {
1321
- let short = 0;
1322
- for (const w of this.words) if (w.length <= len) short++;
1323
- return this.words.length ? short / this.words.length : 0;
1324
- }
1325
- getSyllablesCount() {
1326
- return this.computeSyllableStats().total;
1327
- }
1328
- getMonosyllabicWordCount() {
1329
- return this.computeSyllableStats().mono;
1330
- }
1331
- getMinSyllablesWordCount(min) {
1332
- return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
1333
- }
1334
- getMaxSyllablesWordCount(max) {
1335
- return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
1503
+ }
1504
+ MetricRegistry.add('jaccard', JaccardIndex);
1505
+
1506
+ class JaroWinklerDistance extends Metric {
1507
+ constructor(a, b, opt = {}) {
1508
+ super('jaroWinkler', a, b, opt, true);
1336
1509
  }
1337
- getAvgSyllablesPerWord() {
1338
- return this.computeSyllableStats().avg;
1510
+ compute(a, b, m, n) {
1511
+ const [matchA, matchB] = Pool.acquireMany('int32', [m, n]);
1512
+ try {
1513
+ for (let i = 0; i < m; i++) matchA[i] = 0;
1514
+ for (let i = 0; i < n; i++) matchB[i] = 0;
1515
+ const matchWindow = Math.max(0, Math.floor(n / 2) - 1);
1516
+ let matches = 0;
1517
+ for (let i = 0; i < m; i++) {
1518
+ const start = Math.max(0, i - matchWindow);
1519
+ const end = Math.min(i + matchWindow + 1, n);
1520
+ for (let j = start; j < end; j++) {
1521
+ if (!matchB[j] && a[i] === b[j]) {
1522
+ matchA[i] = 1;
1523
+ matchB[j] = 1;
1524
+ matches++;
1525
+ break;
1526
+ }
1527
+ }
1528
+ }
1529
+ let transpos = 0,
1530
+ jaro = 0,
1531
+ prefix = 0,
1532
+ res = 0;
1533
+ if (matches > 0) {
1534
+ let k = 0;
1535
+ for (let i = 0; i < m; i++) {
1536
+ if (matchA[i]) {
1537
+ while (!matchB[k]) k++;
1538
+ if (a[i] !== b[k]) transpos++;
1539
+ k++;
1540
+ }
1541
+ }
1542
+ transpos /= 2;
1543
+ jaro =
1544
+ (matches / m + matches / n + (matches - transpos) / matches) / 3;
1545
+ for (let i = 0; i < Math.min(4, m, n); i++) {
1546
+ if (a[i] === b[i]) prefix++;
1547
+ else break;
1548
+ }
1549
+ res = jaro + prefix * 0.1 * (1 - jaro);
1550
+ }
1551
+ return {
1552
+ res: Metric.clamp(res),
1553
+ raw: { matchWindow, matches, transpos, jaro, prefix }
1554
+ };
1555
+ } finally {
1556
+ Pool.release('int32', matchA, m);
1557
+ Pool.release('int32', matchB, n);
1558
+ }
1339
1559
  }
1340
- getMedianSyllablesPerWord() {
1341
- return this.computeSyllableStats().median;
1560
+ }
1561
+ MetricRegistry.add('jaroWinkler', JaroWinklerDistance);
1562
+
1563
+ class LCSMetric extends Metric {
1564
+ constructor(a, b, opt = {}) {
1565
+ super('lcs', a, b, opt, true);
1342
1566
  }
1343
- getHonoresR() {
1567
+ compute(a, b, m, n, maxLen) {
1568
+ const len = m + 1;
1569
+ const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1344
1570
  try {
1345
- return (
1346
- (100 * Math.log(this.words.length)) /
1347
- (1 - this.getHapaxLegomena().length / (this.wordHistogram.size ?? 1))
1348
- );
1349
- } catch {
1350
- return 0;
1571
+ for (let i = 0; i <= m; i++) prev[i] = 0;
1572
+ for (let j = 1; j <= n; j++) {
1573
+ curr[0] = 0;
1574
+ const cb = b.charCodeAt(j - 1);
1575
+ for (let i = 1; i <= m; i++) {
1576
+ if (a.charCodeAt(i - 1) === cb) curr[i] = prev[i - 1] + 1;
1577
+ else curr[i] = Math.max(prev[i], curr[i - 1]);
1578
+ }
1579
+ prev.set(curr);
1580
+ }
1581
+ const lcs = prev[m];
1582
+ return {
1583
+ res: maxLen === 0 ? 1 : Metric.clamp(lcs / maxLen),
1584
+ raw: { lcs, maxLen }
1585
+ };
1586
+ } finally {
1587
+ Pool.release('int32', prev, len);
1588
+ Pool.release('int32', curr, len);
1351
1589
  }
1352
1590
  }
1353
- getReadingTime(wpm = 200) {
1354
- return this.words.length / (wpm ?? 1);
1591
+ }
1592
+ MetricRegistry.add('lcs', LCSMetric);
1593
+
1594
+ class LevenshteinDistance extends Metric {
1595
+ constructor(a, b, opt = {}) {
1596
+ super('levenshtein', a, b, opt, true);
1355
1597
  }
1356
- getReadabilityScore(metric = 'flesch') {
1357
- const w = this.words.length || 1;
1358
- const s = this.sentences.length || 1;
1359
- const y = this.getSyllablesCount() || 1;
1360
- const asl = w / s;
1361
- const asw = y / w;
1362
- switch (metric) {
1363
- case 'flesch':
1364
- return 206.835 - 1.015 * asl - 84.6 * asw;
1365
- case 'fleschde':
1366
- return 180 - asl - 58.5 * asw;
1367
- case 'kincaid':
1368
- return 0.39 * asl + 11.8 * asw - 15.59;
1598
+ compute(a, b, m, n, maxLen) {
1599
+ const len = m + 1;
1600
+ const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1601
+ try {
1602
+ for (let i = 0; i <= m; i++) prev[i] = i;
1603
+ for (let j = 1; j <= n; j++) {
1604
+ curr[0] = j;
1605
+ const cb = b.charCodeAt(j - 1);
1606
+ for (let i = 1; i <= m; i++) {
1607
+ const cost = a.charCodeAt(i - 1) === cb ? 0 : 1;
1608
+ curr[i] = Math.min(
1609
+ curr[i - 1] + 1,
1610
+ prev[i] + 1,
1611
+ prev[i - 1] + cost
1612
+ );
1613
+ }
1614
+ prev.set(curr);
1615
+ }
1616
+ const dist = prev[m];
1617
+ return {
1618
+ res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
1619
+ raw: { dist, maxLen }
1620
+ };
1621
+ } finally {
1622
+ Pool.release('int32', prev, len);
1623
+ Pool.release('int32', curr, len);
1369
1624
  }
1370
1625
  }
1371
- getLIXScore() {
1372
- const w = this.words.length || 1;
1373
- const s = this.sentences.length || 1;
1374
- const l = this.getLongWordRatio() * w;
1375
- return w / s + (l / w) * 100;
1626
+ }
1627
+ MetricRegistry.add('levenshtein', LevenshteinDistance);
1628
+
1629
+ class NeedlemanWunschDistance extends Metric {
1630
+ constructor(a, b, opt = {}) {
1631
+ super('needlemanWunsch', a, b, opt, true);
1376
1632
  }
1377
- getWSTFScore() {
1378
- const w = this.words.length || 1;
1379
- const h = (this.getMinSyllablesWordCount(3) / w) * 100;
1380
- const s = this.getAvgSentenceLength();
1381
- const l = this.getLongWordRatio() * 100;
1382
- const m = (this.getMonosyllabicWordCount() / w) * 100;
1383
- return [
1384
- 0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
1385
- 0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
1386
- 0.2963 * h + 0.1905 * s - 1.1144,
1387
- 0.2744 * h + 0.2656 * s - 1.693
1388
- ];
1633
+ compute(a, b, m, n, maxLen) {
1634
+ const { match = 1, mismatch = -1, gap = -1 } = this.options;
1635
+ const len = m + 1;
1636
+ const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1637
+ try {
1638
+ prev[0] = 0;
1639
+ for (let i = 1; i <= m; i++) prev[i] = prev[i - 1] + gap;
1640
+ for (let j = 1; j <= n; j++) {
1641
+ curr[0] = prev[0] + gap;
1642
+ const cb = b.charCodeAt(j - 1);
1643
+ for (let i = 1; i <= m; i++) {
1644
+ const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
1645
+ curr[i] = Math.max(
1646
+ prev[i - 1] + score,
1647
+ prev[i] + gap,
1648
+ curr[i - 1] + gap
1649
+ );
1650
+ }
1651
+ prev.set(curr);
1652
+ }
1653
+ const score = prev[m];
1654
+ const denum = maxLen * match;
1655
+ return {
1656
+ res: denum === 0 ? 0 : Metric.clamp(score / denum),
1657
+ raw: { score, denum }
1658
+ };
1659
+ } finally {
1660
+ Pool.release('int32', prev, len);
1661
+ Pool.release('int32', curr, len);
1662
+ }
1389
1663
  }
1390
1664
  }
1665
+ MetricRegistry.add('needlemanWunsch', NeedlemanWunschDistance);
1391
1666
 
1392
- const profiler$2 = Profiler.getInstance();
1393
- class Metric {
1394
- static cache = new HashTable();
1395
- metric;
1396
- a;
1397
- b;
1398
- origA = [];
1399
- origB = [];
1400
- options;
1401
- optKey;
1402
- symmetric;
1403
- results;
1404
- static clear = () => this.cache.clear();
1405
- static swap = (a, b, m, n) => (m > n ? [b, a, n, m] : [a, b, m, n]);
1406
- static clamp = (res) => Math.max(0, Math.min(1, res));
1407
- constructor(metric, a, b, opt = {}, symmetric = false) {
1408
- this.metric = metric;
1409
- this.a = Array.isArray(a) ? a : [a];
1410
- this.b = Array.isArray(b) ? b : [b];
1411
- ErrorUtil.assert(
1412
- this.a.length > 0 && this.b.length > 0,
1413
- `Inputs <a> and <b> must not be empty`,
1414
- { a: this.a, b: this.b }
1415
- );
1416
- this.options = opt;
1417
- this.optKey = Hasher.fastFNV1a(
1418
- JSON.stringify(opt, Object.keys(opt).sort())
1419
- ).toString();
1420
- this.symmetric = symmetric;
1667
+ class QGramSimilarity extends Metric {
1668
+ constructor(a, b, opt = {}) {
1669
+ super('qGram', a, b, opt, true);
1421
1670
  }
1422
- preCompute(a, b, m, n) {
1423
- if (a === b) return { res: 1 };
1424
- if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
1425
- return undefined;
1426
- }
1427
- compute(a, b, m, n, maxLen) {
1428
- throw new CmpStrInternalError(
1429
- `Method compute() must be overridden in a subclass`
1430
- );
1431
- }
1432
- runSingle(i, j) {
1433
- return ErrorUtil.wrap(
1434
- () => {
1435
- let a = String(this.a[i]),
1436
- A = a;
1437
- let b = String(this.b[j]),
1438
- B = b;
1439
- let m = A.length,
1440
- n = B.length;
1441
- let result = this.preCompute(A, B, m, n);
1442
- if (!result) {
1443
- result = profiler$2.run(() => {
1444
- if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
1445
- const key =
1446
- Metric.cache.key(this.metric, [A, B], this.symmetric) +
1447
- this.optKey;
1448
- return (
1449
- Metric.cache.get(key || '') ??
1450
- (() => {
1451
- const res = this.compute(A, B, m, n, Math.max(m, n));
1452
- if (key) Metric.cache.set(key, res);
1453
- return res;
1454
- })()
1455
- );
1456
- });
1457
- }
1458
- return {
1459
- metric: this.metric,
1460
- a: this.origA[i] ?? a,
1461
- b: this.origB[j] ?? b,
1462
- ...result
1463
- };
1464
- },
1465
- `Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
1466
- { i, j }
1467
- );
1468
- }
1469
- async runSingleAsync(i, j) {
1470
- return Promise.resolve(this.runSingle(i, j));
1471
- }
1472
- runBatch() {
1473
- const results = [];
1474
- for (let i = 0; i < this.a.length; i++)
1475
- for (let j = 0; j < this.b.length; j++)
1476
- results.push(this.runSingle(i, j));
1477
- this.results = results;
1478
- }
1479
- async runBatchAsync() {
1480
- const results = [];
1481
- for (let i = 0; i < this.a.length; i++)
1482
- for (let j = 0; j < this.b.length; j++)
1483
- results.push(await this.runSingleAsync(i, j));
1484
- this.results = results;
1485
- }
1486
- runPairwise() {
1487
- const results = [];
1488
- for (let i = 0; i < this.a.length; i++)
1489
- results.push(this.runSingle(i, i));
1490
- this.results = results;
1491
- }
1492
- async runPairwiseAsync() {
1493
- const results = [];
1494
- for (let i = 0; i < this.a.length; i++)
1495
- results.push(await this.runSingleAsync(i, i));
1496
- this.results = results;
1497
- }
1498
- setOriginal(a, b) {
1499
- if (a) this.origA = Array.isArray(a) ? a : [a];
1500
- if (b) this.origB = Array.isArray(b) ? b : [b];
1501
- return this;
1502
- }
1503
- isBatch = () => this.a.length > 1 || this.b.length > 1;
1504
- isSingle = () => !this.isBatch();
1505
- isPairwise(safe = false) {
1506
- return this.isBatch() && this.a.length === this.b.length
1507
- ? true
1508
- : !safe &&
1509
- (() => {
1510
- throw new CmpStrUsageError(
1511
- `Mode <pairwise> requires arrays of equal length`,
1512
- { a: this.a, b: this.b }
1513
- );
1514
- })();
1515
- }
1516
- isSymmetrical = () => this.symmetric;
1517
- whichMode = (mode) => mode ?? this.options?.mode ?? 'default';
1518
- clear = () => (this.results = undefined);
1519
- run(mode, clear = true) {
1520
- if (clear) this.clear();
1521
- switch (this.whichMode(mode)) {
1522
- case 'default':
1523
- if (this.isSingle()) {
1524
- this.results = this.runSingle(0, 0);
1525
- break;
1526
- }
1527
- case 'batch':
1528
- this.runBatch();
1529
- break;
1530
- case 'single':
1531
- this.results = this.runSingle(0, 0);
1532
- break;
1533
- case 'pairwise':
1534
- if (this.isPairwise()) this.runPairwise();
1535
- break;
1536
- default:
1537
- throw new CmpStrInternalError(`Unsupported mode <${mode}>`);
1538
- }
1539
- }
1540
- async runAsync(mode, clear = true) {
1541
- if (clear) this.clear();
1542
- switch (this.whichMode(mode)) {
1543
- case 'default':
1544
- if (this.isSingle()) {
1545
- this.results = await this.runSingleAsync(0, 0);
1546
- break;
1547
- }
1548
- case 'batch':
1549
- await this.runBatchAsync();
1550
- break;
1551
- case 'single':
1552
- this.results = await this.runSingleAsync(0, 0);
1553
- break;
1554
- case 'pairwise':
1555
- if (this.isPairwise()) await this.runPairwiseAsync();
1556
- break;
1557
- default:
1558
- throw new CmpStrInternalError(`Unsupported async mode <${mode}>`);
1559
- }
1560
- }
1561
- getMetricName = () => this.metric;
1562
- getResults() {
1563
- ErrorUtil.assert(
1564
- this.results !== undefined,
1565
- `run() must be called before getResults()`
1566
- );
1567
- return this.results;
1568
- }
1569
- }
1570
- const MetricRegistry = Registry('metric', Metric);
1571
-
1572
- class CosineSimilarity extends Metric {
1573
- constructor(a, b, opt = {}) {
1574
- super('cosine', a, b, opt, true);
1575
- }
1576
- _termFreq(str, delimiter) {
1577
- const terms = str.split(delimiter);
1578
- const freq = Pool.acquire('map', terms.length);
1579
- for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
1580
- return freq;
1671
+ _qGrams(str, q) {
1672
+ const len = Math.max(0, str.length - q + 1);
1673
+ const grams = Pool.acquire('set', len);
1674
+ for (let i = 0; i < len; i++) grams.add(str.slice(i, i + q));
1675
+ return grams;
1581
1676
  }
1582
1677
  compute(a, b) {
1583
- const { delimiter = ' ' } = this.options;
1584
- const termsA = this._termFreq(a, delimiter);
1585
- const termsB = this._termFreq(b, delimiter);
1678
+ const { q = 2 } = this.options;
1679
+ const setA = this._qGrams(a, q),
1680
+ setB = this._qGrams(b, q);
1681
+ const sizeA = setA.size,
1682
+ sizeB = setB.size;
1586
1683
  try {
1587
- let dotP = 0,
1588
- magA = 0,
1589
- magB = 0;
1590
- for (const [term, freqA] of termsA) {
1591
- const freqB = termsB.get(term) || 0;
1592
- dotP += freqA * freqB;
1593
- magA += freqA * freqA;
1594
- }
1595
- for (const freqB of termsB.values()) magB += freqB * freqB;
1596
- magA = Math.sqrt(magA);
1597
- magB = Math.sqrt(magB);
1684
+ let intersection = 0;
1685
+ for (const gram of setA) if (setB.has(gram)) intersection++;
1686
+ const size = Math.max(sizeA, sizeB);
1598
1687
  return {
1599
- res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
1600
- raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
1688
+ res: size === 0 ? 1 : Metric.clamp(intersection / size),
1689
+ raw: { intersection, size }
1601
1690
  };
1602
1691
  } finally {
1603
- Pool.release('map', termsA, termsA.size);
1604
- Pool.release('map', termsB, termsB.size);
1692
+ Pool.release('set', setA, sizeA);
1693
+ Pool.release('set', setB, sizeB);
1605
1694
  }
1606
1695
  }
1607
1696
  }
1608
- MetricRegistry.add('cosine', CosineSimilarity);
1697
+ MetricRegistry.add('qGram', QGramSimilarity);
1609
1698
 
1610
- class DamerauLevenshteinDistance extends Metric {
1699
+ class SmithWatermanDistance extends Metric {
1611
1700
  constructor(a, b, opt = {}) {
1612
- super('damerau', a, b, opt, true);
1701
+ super('smithWaterman', a, b, opt, true);
1613
1702
  }
1614
- compute(a, b, m, n, maxLen) {
1703
+ compute(a, b, m, n) {
1704
+ const { match = 2, mismatch = -1, gap = -2 } = this.options;
1615
1705
  const len = m + 1;
1616
- const [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
1706
+ const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1707
+ let maxScore = 0;
1617
1708
  try {
1618
- for (let i = 0; i <= m; i++) prev[i] = i;
1709
+ for (let i = 0; i <= m; i++) prev[i] = 0;
1619
1710
  for (let j = 1; j <= n; j++) {
1620
- curr[0] = j;
1711
+ curr[0] = 0;
1621
1712
  const cb = b.charCodeAt(j - 1);
1622
1713
  for (let i = 1; i <= m; i++) {
1623
- const ca = a.charCodeAt(i - 1);
1624
- const cost = ca === cb ? 0 : 1;
1625
- let val = Math.min(
1626
- curr[i - 1] + 1,
1627
- prev[i] + 1,
1628
- prev[i - 1] + cost
1714
+ const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
1715
+ curr[i] = Math.max(
1716
+ 0,
1717
+ prev[i - 1] + score,
1718
+ prev[i] + gap,
1719
+ curr[i - 1] + gap
1629
1720
  );
1630
- if (
1631
- i > 1 &&
1632
- j > 1 &&
1633
- ca === b.charCodeAt(j - 2) &&
1634
- cb === a.charCodeAt(i - 2)
1635
- )
1636
- val = Math.min(val, test[i - 2] + cost);
1637
- curr[i] = val;
1721
+ if (curr[i] > maxScore) maxScore = curr[i];
1638
1722
  }
1639
- test.set(prev);
1640
1723
  prev.set(curr);
1641
1724
  }
1642
- const dist = prev[m];
1725
+ const denum = Math.min(m * match, n * match);
1643
1726
  return {
1644
- res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
1645
- raw: { dist, maxLen }
1727
+ res: denum === 0 ? 0 : Metric.clamp(maxScore / denum),
1728
+ raw: { score: maxScore, denum }
1646
1729
  };
1647
1730
  } finally {
1648
- Pool.release('int32', test, len);
1649
1731
  Pool.release('int32', prev, len);
1650
1732
  Pool.release('int32', curr, len);
1651
1733
  }
1652
1734
  }
1653
1735
  }
1654
- MetricRegistry.add('damerau', DamerauLevenshteinDistance);
1736
+ MetricRegistry.add('smithWaterman', SmithWatermanDistance);
1655
1737
 
1656
- class DiceSorensenCoefficient extends Metric {
1657
- constructor(a, b, opt = {}) {
1658
- super('dice', a, b, opt, true);
1738
+ const profiler$1 = Profiler.getInstance();
1739
+ class Phonetic {
1740
+ static cache = new HashTable();
1741
+ static default;
1742
+ algo;
1743
+ options;
1744
+ optKey;
1745
+ map;
1746
+ ignoreSet;
1747
+ static clear() {
1748
+ this.cache.clear();
1659
1749
  }
1660
- _bigrams(str) {
1661
- const len = str.length - 1;
1662
- const bigrams = Pool.acquire('set', len);
1663
- for (let i = 0; i < len; i++) bigrams.add(str.substring(i, i + 2));
1664
- return bigrams;
1750
+ constructor(algo, opt = {}) {
1751
+ const defaults = this.constructor.default ?? {};
1752
+ const mapId = opt.map ?? defaults.map;
1753
+ if (!mapId)
1754
+ throw new CmpStrNotFoundError(
1755
+ `No mapping specified for phonetic algorithm`,
1756
+ { algo }
1757
+ );
1758
+ const map = PhoneticMappingRegistry.get(algo, mapId);
1759
+ if (map === undefined)
1760
+ throw new CmpStrNotFoundError(
1761
+ `Requested mapping <${mapId}> is not declared`,
1762
+ { algo, mapId }
1763
+ );
1764
+ this.options = DeepMerge.merge(
1765
+ DeepMerge.merge(defaults, map.options ?? {}),
1766
+ opt
1767
+ );
1768
+ this.optKey = Hasher.fastFNV1a(
1769
+ JSON.stringify(this.options, Object.keys(this.options).sort())
1770
+ ).toString();
1771
+ this.algo = algo;
1772
+ this.map = map;
1773
+ this.ignoreSet = new Set(map.ignore ?? []);
1665
1774
  }
1666
- compute(a, b) {
1667
- const setA = this._bigrams(a),
1668
- setB = this._bigrams(b);
1669
- const sizeA = setA.size,
1670
- sizeB = setB.size;
1671
- try {
1672
- let intersection = 0;
1673
- for (const bigram of setA) if (setB.has(bigram)) intersection++;
1674
- const size = sizeA + sizeB;
1675
- return {
1676
- res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
1677
- raw: { intersection, size }
1678
- };
1679
- } finally {
1680
- Pool.release('set', setA, sizeA);
1681
- Pool.release('set', setB, sizeB);
1775
+ applyPattern(word) {
1776
+ const { patterns = [] } = this.map;
1777
+ if (!patterns.length) return word;
1778
+ for (const { pattern, replace, all = false } of patterns) {
1779
+ word = all
1780
+ ? word.replaceAll(pattern, replace)
1781
+ : word.replace(pattern, replace);
1682
1782
  }
1783
+ return word;
1683
1784
  }
1684
- }
1685
- MetricRegistry.add('dice', DiceSorensenCoefficient);
1686
-
1687
- class HammingDistance extends Metric {
1688
- constructor(a, b, opt = {}) {
1689
- super('hamming', a, b, opt, true);
1785
+ applyRules(char, i, chars, charLen) {
1786
+ const { ruleset = [] } = this.map;
1787
+ if (!ruleset.length) return undefined;
1788
+ const prev = chars[i - 1] || '',
1789
+ prev2 = chars[i - 2] || '';
1790
+ const next = chars[i + 1] || '',
1791
+ next2 = chars[i + 2] || '';
1792
+ const str = chars.join('');
1793
+ for (const rule of ruleset) {
1794
+ if (rule.char && rule.char !== char) continue;
1795
+ if (rule.position === 'start' && i !== 0) continue;
1796
+ if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
1797
+ continue;
1798
+ if (rule.position === 'end' && i !== charLen - 1) continue;
1799
+ if (rule.prev && !rule.prev.includes(prev)) continue;
1800
+ if (rule.prevNot && rule.prevNot.includes(prev)) continue;
1801
+ if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
1802
+ if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
1803
+ if (rule.next && !rule.next.includes(next)) continue;
1804
+ if (rule.nextNot && rule.nextNot.includes(next)) continue;
1805
+ if (rule.next2 && !rule.next2.includes(next2)) continue;
1806
+ if (rule.next2Not && rule.next2Not.includes(next2)) continue;
1807
+ if (
1808
+ rule.leading &&
1809
+ !rule.leading.includes(str.slice(0, rule.leading.length))
1810
+ )
1811
+ continue;
1812
+ if (
1813
+ rule.trailing &&
1814
+ !rule.trailing.includes(str.slice(-rule.trailing.length))
1815
+ )
1816
+ continue;
1817
+ if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
1818
+ continue;
1819
+ return rule.code;
1820
+ }
1821
+ return undefined;
1690
1822
  }
1691
- compute(a, b, m, n, maxLen) {
1692
- if (m !== n) {
1693
- if (this.options.pad !== undefined) {
1694
- if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
1695
- if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
1696
- m = n = maxLen;
1697
- } else
1698
- throw new CmpStrUsageError(
1699
- `Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
1700
- `use option.pad for automatic adjustment`,
1701
- { a: m, b: n }
1702
- );
1823
+ encode(word) {
1824
+ const { map = {} } = this.map;
1825
+ word = this.applyPattern(word);
1826
+ const chars = this.word2Chars(word);
1827
+ const charLen = chars.length;
1828
+ let code = '',
1829
+ lastCode = null;
1830
+ for (let i = 0; i < charLen; i++) {
1831
+ const char = chars[i];
1832
+ if (this.ignoreSet.has(char)) continue;
1833
+ const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
1834
+ if (mapped === undefined) continue;
1835
+ ((code += mapped), (lastCode = mapped));
1836
+ if (this.exitEarly(code, i)) break;
1703
1837
  }
1704
- let dist = 0;
1705
- for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
1706
- return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
1838
+ return this.adjustCode(code, chars);
1707
1839
  }
1708
- }
1709
- MetricRegistry.add('hamming', HammingDistance);
1710
-
1711
- class JaccardIndex extends Metric {
1712
- constructor(a, b, opt = {}) {
1713
- super('jaccard', a, b, opt, true);
1840
+ mapChar(char, i, chars, charLen, lastCode, map) {
1841
+ const { dedupe = true, fallback = undefined } = this.options;
1842
+ const c =
1843
+ this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
1844
+ return dedupe && c === lastCode ? undefined : c;
1714
1845
  }
1715
- compute(a, b, m, n) {
1716
- const [setA, setB] = Pool.acquireMany('set', [m, n]);
1717
- try {
1718
- for (const A of a) setA.add(A);
1719
- for (const B of b) setB.add(B);
1720
- let intersection = 0;
1721
- for (const c of setA) if (setB.has(c)) intersection++;
1722
- const union = setA.size + setB.size - intersection;
1723
- return {
1724
- res: union === 0 ? 1 : Metric.clamp(intersection / union),
1725
- raw: { intersection, union }
1726
- };
1727
- } finally {
1728
- Pool.release('set', setA, m);
1729
- Pool.release('set', setB, n);
1730
- }
1846
+ equalLen(input) {
1847
+ const { length = -1, pad = '0' } = this.options;
1848
+ return length === -1
1849
+ ? input
1850
+ : (input + pad.repeat(length)).slice(0, length);
1731
1851
  }
1732
- }
1733
- MetricRegistry.add('jaccard', JaccardIndex);
1734
-
1735
- class JaroWinklerDistance extends Metric {
1736
- constructor(a, b, opt = {}) {
1737
- super('jaroWinkler', a, b, opt, true);
1852
+ word2Chars(word) {
1853
+ return Array.from(word.toLowerCase());
1738
1854
  }
1739
- compute(a, b, m, n) {
1740
- const [matchA, matchB] = Pool.acquireMany('int32', [m, n]);
1741
- try {
1742
- for (let i = 0; i < m; i++) matchA[i] = 0;
1743
- for (let i = 0; i < n; i++) matchB[i] = 0;
1744
- const matchWindow = Math.max(0, Math.floor(n / 2) - 1);
1745
- let matches = 0;
1746
- for (let i = 0; i < m; i++) {
1747
- const start = Math.max(0, i - matchWindow);
1748
- const end = Math.min(i + matchWindow + 1, n);
1749
- for (let j = start; j < end; j++) {
1750
- if (!matchB[j] && a[i] === b[j]) {
1751
- matchA[i] = 1;
1752
- matchB[j] = 1;
1753
- matches++;
1754
- break;
1755
- }
1756
- }
1757
- }
1758
- let transpos = 0,
1759
- jaro = 0,
1760
- prefix = 0,
1761
- res = 0;
1762
- if (matches > 0) {
1763
- let k = 0;
1764
- for (let i = 0; i < m; i++) {
1765
- if (matchA[i]) {
1766
- while (!matchB[k]) k++;
1767
- if (a[i] !== b[k]) transpos++;
1768
- k++;
1769
- }
1770
- }
1771
- transpos /= 2;
1772
- jaro =
1773
- (matches / m + matches / n + (matches - transpos) / matches) / 3;
1774
- for (let i = 0; i < Math.min(4, m, n); i++) {
1775
- if (a[i] === b[i]) prefix++;
1776
- else break;
1777
- }
1778
- res = jaro + prefix * 0.1 * (1 - jaro);
1779
- }
1780
- return {
1781
- res: Metric.clamp(res),
1782
- raw: { matchWindow, matches, transpos, jaro, prefix }
1783
- };
1784
- } finally {
1785
- Pool.release('int32', matchA, m);
1786
- Pool.release('int32', matchB, n);
1787
- }
1855
+ exitEarly(code, i) {
1856
+ const { length = -1 } = this.options;
1857
+ return length > 0 && code.length >= length;
1788
1858
  }
1789
- }
1790
- MetricRegistry.add('jaroWinkler', JaroWinklerDistance);
1791
-
1792
- class LCSMetric extends Metric {
1793
- constructor(a, b, opt = {}) {
1794
- super('lcs', a, b, opt, true);
1859
+ adjustCode(code, chars) {
1860
+ return code;
1795
1861
  }
1796
- compute(a, b, m, n, maxLen) {
1797
- const len = m + 1;
1798
- const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1799
- try {
1800
- for (let i = 0; i <= m; i++) prev[i] = 0;
1801
- for (let j = 1; j <= n; j++) {
1802
- curr[0] = 0;
1803
- const cb = b.charCodeAt(j - 1);
1804
- for (let i = 1; i <= m; i++) {
1805
- if (a.charCodeAt(i - 1) === cb) curr[i] = prev[i - 1] + 1;
1806
- else curr[i] = Math.max(prev[i], curr[i - 1]);
1862
+ loop(words) {
1863
+ return ErrorUtil.wrap(
1864
+ () => {
1865
+ const index = [];
1866
+ for (const word of words) {
1867
+ let key = Phonetic.cache.key(this.algo, [word]);
1868
+ if (key) key += this.optKey;
1869
+ const code =
1870
+ Phonetic.cache.get(key || '') ??
1871
+ (() => {
1872
+ const res = this.encode(word);
1873
+ if (key) Phonetic.cache.set(key, res);
1874
+ return res;
1875
+ })();
1876
+ if (code && code.length) index.push(this.equalLen(code));
1807
1877
  }
1808
- prev.set(curr);
1809
- }
1810
- const lcs = prev[m];
1811
- return {
1812
- res: maxLen === 0 ? 1 : Metric.clamp(lcs / maxLen),
1813
- raw: { lcs, maxLen }
1814
- };
1815
- } finally {
1816
- Pool.release('int32', prev, len);
1817
- Pool.release('int32', curr, len);
1818
- }
1819
- }
1820
- }
1821
- MetricRegistry.add('lcs', LCSMetric);
1822
-
1823
- class LevenshteinDistance extends Metric {
1824
- constructor(a, b, opt = {}) {
1825
- super('levenshtein', a, b, opt, true);
1878
+ return index;
1879
+ },
1880
+ `Failed to generate phonetic index`,
1881
+ { algo: this.algo, words }
1882
+ );
1826
1883
  }
1827
- compute(a, b, m, n, maxLen) {
1828
- const len = m + 1;
1829
- const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1830
- try {
1831
- for (let i = 0; i <= m; i++) prev[i] = i;
1832
- for (let j = 1; j <= n; j++) {
1833
- curr[0] = j;
1834
- const cb = b.charCodeAt(j - 1);
1835
- for (let i = 1; i <= m; i++) {
1836
- const cost = a.charCodeAt(i - 1) === cb ? 0 : 1;
1837
- curr[i] = Math.min(
1838
- curr[i - 1] + 1,
1839
- prev[i] + 1,
1840
- prev[i - 1] + cost
1884
+ async loopAsync(words) {
1885
+ return ErrorUtil.wrapAsync(
1886
+ async () => {
1887
+ const index = [];
1888
+ for (const word of words) {
1889
+ const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
1890
+ const code = await Promise.resolve(
1891
+ Phonetic.cache.get(key || '') ??
1892
+ (() => {
1893
+ const res = this.encode(word);
1894
+ if (key) Phonetic.cache.set(key, res);
1895
+ return res;
1896
+ })()
1841
1897
  );
1898
+ if (code && code.length) index.push(this.equalLen(code));
1842
1899
  }
1843
- prev.set(curr);
1844
- }
1845
- const dist = prev[m];
1846
- return {
1847
- res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
1848
- raw: { dist, maxLen }
1849
- };
1850
- } finally {
1851
- Pool.release('int32', prev, len);
1852
- Pool.release('int32', curr, len);
1853
- }
1854
- }
1855
- }
1856
- MetricRegistry.add('levenshtein', LevenshteinDistance);
1857
-
1858
- class NeedlemanWunschDistance extends Metric {
1859
- constructor(a, b, opt = {}) {
1860
- super('needlemanWunsch', a, b, opt, true);
1861
- }
1862
- compute(a, b, m, n, maxLen) {
1863
- const { match = 1, mismatch = -1, gap = -1 } = this.options;
1864
- const len = m + 1;
1865
- const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1866
- try {
1867
- prev[0] = 0;
1868
- for (let i = 1; i <= m; i++) prev[i] = prev[i - 1] + gap;
1869
- for (let j = 1; j <= n; j++) {
1870
- curr[0] = prev[0] + gap;
1871
- const cb = b.charCodeAt(j - 1);
1872
- for (let i = 1; i <= m; i++) {
1873
- const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
1874
- curr[i] = Math.max(
1875
- prev[i - 1] + score,
1876
- prev[i] + gap,
1877
- curr[i - 1] + gap
1878
- );
1879
- }
1880
- prev.set(curr);
1881
- }
1882
- const score = prev[m];
1883
- const denum = maxLen * match;
1884
- return {
1885
- res: denum === 0 ? 0 : Metric.clamp(score / denum),
1886
- raw: { score, denum }
1887
- };
1888
- } finally {
1889
- Pool.release('int32', prev, len);
1890
- Pool.release('int32', curr, len);
1891
- }
1892
- }
1893
- }
1894
- MetricRegistry.add('needlemanWunsch', NeedlemanWunschDistance);
1895
-
1896
- class QGramSimilarity extends Metric {
1897
- constructor(a, b, opt = {}) {
1898
- super('qGram', a, b, opt, true);
1899
- }
1900
- _qGrams(str, q) {
1901
- const len = Math.max(0, str.length - q + 1);
1902
- const grams = Pool.acquire('set', len);
1903
- for (let i = 0; i < len; i++) grams.add(str.slice(i, i + q));
1904
- return grams;
1900
+ return index;
1901
+ },
1902
+ `Failed to generate phonetic index asynchronously`,
1903
+ { algo: this.algo, words }
1904
+ );
1905
1905
  }
1906
- compute(a, b) {
1907
- const { q = 2 } = this.options;
1908
- const setA = this._qGrams(a, q),
1909
- setB = this._qGrams(b, q);
1910
- const sizeA = setA.size,
1911
- sizeB = setB.size;
1912
- try {
1913
- let intersection = 0;
1914
- for (const gram of setA) if (setB.has(gram)) intersection++;
1915
- const size = Math.max(sizeA, sizeB);
1916
- return {
1917
- res: size === 0 ? 1 : Metric.clamp(intersection / size),
1918
- raw: { intersection, size }
1919
- };
1920
- } finally {
1921
- Pool.release('set', setA, sizeA);
1922
- Pool.release('set', setB, sizeB);
1923
- }
1906
+ getAlgoName() {
1907
+ return this.algo;
1924
1908
  }
1925
- }
1926
- MetricRegistry.add('qGram', QGramSimilarity);
1927
-
1928
- class SmithWatermanDistance extends Metric {
1929
- constructor(a, b, opt = {}) {
1930
- super('smithWaterman', a, b, opt, true);
1909
+ getIndex(input) {
1910
+ const { delimiter = ' ' } = this.options;
1911
+ return profiler$1.run(() =>
1912
+ this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
1913
+ );
1931
1914
  }
1932
- compute(a, b, m, n) {
1933
- const { match = 2, mismatch = -1, gap = -2 } = this.options;
1934
- const len = m + 1;
1935
- const [prev, curr] = Pool.acquireMany('int32', [len, len]);
1936
- let maxScore = 0;
1937
- try {
1938
- for (let i = 0; i <= m; i++) prev[i] = 0;
1939
- for (let j = 1; j <= n; j++) {
1940
- curr[0] = 0;
1941
- const cb = b.charCodeAt(j - 1);
1942
- for (let i = 1; i <= m; i++) {
1943
- const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
1944
- curr[i] = Math.max(
1945
- 0,
1946
- prev[i - 1] + score,
1947
- prev[i] + gap,
1948
- curr[i - 1] + gap
1949
- );
1950
- if (curr[i] > maxScore) maxScore = curr[i];
1951
- }
1952
- prev.set(curr);
1953
- }
1954
- const denum = Math.min(m * match, n * match);
1955
- return {
1956
- res: denum === 0 ? 0 : Metric.clamp(maxScore / denum),
1957
- raw: { score: maxScore, denum }
1958
- };
1959
- } finally {
1960
- Pool.release('int32', prev, len);
1961
- Pool.release('int32', curr, len);
1962
- }
1915
+ async getIndexAsync(input) {
1916
+ const { delimiter = ' ' } = this.options;
1917
+ return (
1918
+ await profiler$1.runAsync(
1919
+ async () =>
1920
+ await this.loopAsync(input.split(delimiter).filter(Boolean))
1921
+ )
1922
+ ).filter(Boolean);
1963
1923
  }
1964
1924
  }
1965
- MetricRegistry.add('smithWaterman', SmithWatermanDistance);
1966
-
1967
- const profiler$1 = Profiler.getInstance();
1968
- class Phonetic {
1969
- static cache = new HashTable();
1970
- static default;
1971
- algo;
1972
- options;
1973
- optKey;
1974
- map;
1975
- static clear = () => this.cache.clear();
1976
- constructor(algo, opt = {}) {
1977
- const defaults = this.constructor.default ?? {};
1978
- const mapId = opt.map ?? defaults.map;
1979
- if (!mapId)
1980
- throw new CmpStrNotFoundError(
1981
- `No mapping specified for phonetic algorithm`,
1982
- { algo }
1983
- );
1984
- const map = PhoneticMappingRegistry.get(algo, mapId);
1985
- if (map === undefined)
1986
- throw new CmpStrNotFoundError(
1987
- `Requested mapping <${mapId}> is not declared`,
1988
- { algo, mapId }
1925
+ const PhoneticRegistry = Registry('phonetic', Phonetic);
1926
+ const PhoneticMappingRegistry = (() => {
1927
+ const mappings = Object.create(null);
1928
+ const maps = (algo) => (mappings[algo] ||= Object.create(null));
1929
+ return Object.freeze({
1930
+ add(algo, id, map, update = false) {
1931
+ const mappings = maps(algo);
1932
+ ErrorUtil.assert(
1933
+ !(!id || id in mappings) || update,
1934
+ `Entry <${id}> already exists / use <update=true> to overwrite`,
1935
+ { algo, id }
1989
1936
  );
1990
- this.options = merge(merge(defaults, map.options ?? {}), opt);
1991
- this.optKey = Hasher.fastFNV1a(
1992
- JSON.stringify(this.options, Object.keys(this.options).sort())
1993
- ).toString();
1994
- this.algo = algo;
1995
- this.map = map;
1996
- }
1997
- applyPattern(word) {
1998
- const { patterns = [] } = this.map;
1999
- if (!patterns || !patterns.length) return word;
2000
- for (const { pattern, replace, all = false } of patterns) {
2001
- word = word[all ? 'replaceAll' : 'replace'](pattern, replace);
2002
- }
2003
- return word;
2004
- }
2005
- applyRules(char, i, chars, charLen) {
2006
- const { ruleset = [] } = this.map;
2007
- if (!ruleset || !ruleset.length) return undefined;
2008
- const prev = chars[i - 1] || '',
2009
- prev2 = chars[i - 2] || '';
2010
- const next = chars[i + 1] || '',
2011
- next2 = chars[i + 2] || '';
2012
- for (const rule of ruleset) {
2013
- if (rule.char && rule.char !== char) continue;
2014
- if (rule.position === 'start' && i !== 0) continue;
2015
- if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
2016
- continue;
2017
- if (rule.position === 'end' && i !== charLen) continue;
2018
- if (rule.prev && !rule.prev.includes(prev)) continue;
2019
- if (rule.prevNot && rule.prevNot.includes(prev)) continue;
2020
- if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
2021
- if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
2022
- if (rule.next && !rule.next.includes(next)) continue;
2023
- if (rule.nextNot && rule.nextNot.includes(next)) continue;
2024
- if (rule.next2 && !rule.next2.includes(next2)) continue;
2025
- if (rule.next2Not && rule.next2Not.includes(next2)) continue;
2026
- if (
2027
- rule.leading &&
2028
- !rule.leading.includes(chars.slice(0, rule.leading.length).join(''))
2029
- )
2030
- continue;
2031
- if (
2032
- rule.trailing &&
2033
- !rule.trailing.includes(chars.slice(-rule.trailing.length).join(''))
2034
- )
2035
- continue;
2036
- if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
2037
- continue;
2038
- return rule.code;
1937
+ mappings[id] = map;
1938
+ },
1939
+ remove(algo, id) {
1940
+ delete maps(algo)[id];
1941
+ },
1942
+ has(algo, id) {
1943
+ return id in maps(algo);
1944
+ },
1945
+ get(algo, id) {
1946
+ return maps(algo)[id];
1947
+ },
1948
+ list(algo) {
1949
+ return Object.keys(maps(algo));
2039
1950
  }
2040
- return undefined;
1951
+ });
1952
+ })();
1953
+
1954
+ class Caverphone extends Phonetic {
1955
+ static REGEX = { uppercase: /[^A-Z]/gi };
1956
+ static default = {
1957
+ map: 'en2',
1958
+ delimiter: ' ',
1959
+ length: -1,
1960
+ pad: '',
1961
+ dedupe: false
1962
+ };
1963
+ constructor(opt = {}) {
1964
+ super('caverphone', opt);
2041
1965
  }
2042
1966
  encode(word) {
2043
- const { map = {}, ignore = [] } = this.map;
2044
- word = this.applyPattern(word);
2045
- const chars = this.word2Chars(word);
2046
- const charLen = chars.length;
2047
- let code = '',
2048
- lastCode = null;
2049
- for (let i = 0; i < charLen; i++) {
2050
- const char = chars[i];
2051
- if (ignore.includes(char)) continue;
2052
- const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
2053
- if (mapped === undefined) continue;
2054
- ((code += mapped), (lastCode = mapped));
2055
- if (this.exitEarly(code, i)) break;
2056
- }
2057
- return this.adjustCode(code, chars);
2058
- }
2059
- mapChar(char, i, chars, charLen, lastCode, map) {
2060
- const { dedupe = true, fallback = undefined } = this.options;
2061
- const c =
2062
- this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
2063
- return dedupe && c === lastCode ? undefined : c;
2064
- }
2065
- equalLen(input) {
2066
- const { length = -1, pad = '0' } = this.options;
2067
- return length === -1
2068
- ? input
2069
- : (input + pad.repeat(length)).slice(0, length);
2070
- }
2071
- word2Chars = (word) => word.toLowerCase().split('');
2072
- exitEarly(code, i) {
2073
- const { length = -1 } = this.options;
2074
- return length > 0 && code.length >= length;
2075
- }
2076
- adjustCode(code, chars) {
2077
- return code;
2078
- }
2079
- loop(words) {
2080
- return ErrorUtil.wrap(
2081
- () => {
2082
- const index = [];
2083
- for (const word of words) {
2084
- const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
2085
- const code =
2086
- Phonetic.cache.get(key || '') ??
2087
- (() => {
2088
- const res = this.encode(word);
2089
- if (key) Phonetic.cache.set(key, res);
2090
- return res;
2091
- })();
2092
- if (code && code.length) index.push(this.equalLen(code));
2093
- }
2094
- return index;
2095
- },
2096
- `Failed to generate phonetic index`,
2097
- { algo: this.algo, words }
2098
- );
2099
- }
2100
- async loopAsync(words) {
2101
- return ErrorUtil.wrapAsync(
2102
- async () => {
2103
- const index = [];
2104
- for (const word of words) {
2105
- const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
2106
- const code = await Promise.resolve(
2107
- Phonetic.cache.get(key || '') ??
2108
- (() => {
2109
- const res = this.encode(word);
2110
- if (key) Phonetic.cache.set(key, res);
2111
- return res;
2112
- })()
2113
- );
2114
- if (code && code.length) index.push(this.equalLen(code));
2115
- }
2116
- return index;
2117
- },
2118
- `Failed to generate phonetic index asynchronously`,
2119
- { algo: this.algo, words }
2120
- );
2121
- }
2122
- getAlgoName = () => this.algo;
2123
- getIndex(input) {
2124
- const { delimiter = ' ' } = this.options;
2125
- return profiler$1.run(() =>
2126
- this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
2127
- );
2128
- }
2129
- async getIndexAsync(input) {
2130
- const { delimiter = ' ' } = this.options;
2131
- return (
2132
- await profiler$1.runAsync(
2133
- async () =>
2134
- await this.loopAsync(input.split(delimiter).filter(Boolean))
2135
- )
2136
- ).filter(Boolean);
2137
- }
2138
- }
2139
- const PhoneticRegistry = Registry('phonetic', Phonetic);
2140
- const PhoneticMappingRegistry = (() => {
2141
- const mappings = Object.create(null);
2142
- const maps = (algo) => (mappings[algo] ||= Object.create(null));
2143
- return Object.freeze({
2144
- add(algo, id, map, update = false) {
2145
- const mappings = maps(algo);
2146
- ErrorUtil.assert(
2147
- !(!id || id in mappings) || update,
2148
- `Entry <${id}> already exists / use <update=true> to overwrite`,
2149
- { algo, id }
2150
- );
2151
- mappings[id] = map;
2152
- },
2153
- remove(algo, id) {
2154
- delete maps(algo)[id];
2155
- },
2156
- has(algo, id) {
2157
- return id in maps(algo);
2158
- },
2159
- get(algo, id) {
2160
- return maps(algo)[id];
2161
- },
2162
- list(algo) {
2163
- return Object.keys(maps(algo));
2164
- }
2165
- });
2166
- })();
2167
-
2168
- class Caverphone extends Phonetic {
2169
- static REGEX = { uppercase: /[^A-Z]/gi };
2170
- static default = {
2171
- map: 'en2',
2172
- delimiter: ' ',
2173
- length: -1,
2174
- pad: '',
2175
- dedupe: false
2176
- };
2177
- constructor(opt = {}) {
2178
- super('caverphone', opt);
2179
- }
2180
- encode(word) {
2181
- word = word.replace(Caverphone.REGEX.uppercase, '').toLowerCase();
2182
- return super.encode(word);
1967
+ word = word.replace(Caverphone.REGEX.uppercase, '').toLowerCase();
1968
+ return super.encode(word);
2183
1969
  }
2184
1970
  mapChar = (char) => char;
2185
1971
  adjustCode = (code) => code.toUpperCase();
@@ -2357,170 +2143,743 @@
2357
2143
  constructor(opt = {}) {
2358
2144
  super('metaphone', opt);
2359
2145
  }
2360
- encode(word) {
2361
- word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
2362
- c === 'C' ? m : c
2363
- );
2364
- return super.encode(word);
2146
+ encode(word) {
2147
+ word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
2148
+ c === 'C' ? m : c
2149
+ );
2150
+ return super.encode(word);
2151
+ }
2152
+ adjustCode(code) {
2153
+ return (
2154
+ code.slice(0, 1) + code.slice(1).replace(Metaphone.REGEX.vowel, '')
2155
+ );
2156
+ }
2157
+ }
2158
+ PhoneticRegistry.add('metaphone', Metaphone);
2159
+ PhoneticMappingRegistry.add('metaphone', 'en90', {
2160
+ map: {
2161
+ a: 'A',
2162
+ b: 'B',
2163
+ c: 'K',
2164
+ d: 'T',
2165
+ e: 'E',
2166
+ f: 'F',
2167
+ g: 'K',
2168
+ h: 'H',
2169
+ i: 'I',
2170
+ j: 'J',
2171
+ k: 'K',
2172
+ l: 'L',
2173
+ m: 'M',
2174
+ n: 'N',
2175
+ o: 'O',
2176
+ p: 'P',
2177
+ q: 'K',
2178
+ r: 'R',
2179
+ s: 'S',
2180
+ t: 'T',
2181
+ u: 'U',
2182
+ v: 'F',
2183
+ w: 'W',
2184
+ x: 'KS',
2185
+ y: 'Y',
2186
+ z: 'S'
2187
+ },
2188
+ ruleset: [
2189
+ { char: 'a', position: 'start', next: ['e'], code: '' },
2190
+ { char: 'g', position: 'start', next: ['n'], code: '' },
2191
+ { char: 'k', position: 'start', next: ['n'], code: '' },
2192
+ { char: 'p', position: 'start', next: ['n'], code: '' },
2193
+ { char: 'w', position: 'start', next: ['r'], code: '' },
2194
+ { char: 'b', position: 'end', prev: ['m'], code: '' },
2195
+ { char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
2196
+ { char: 'c', next: ['i'], next2: ['a'], code: 'X' },
2197
+ { char: 'c', next: ['e', 'i', 'y'], code: 'S' },
2198
+ { char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
2199
+ {
2200
+ char: 'g',
2201
+ next: ['h'],
2202
+ next2Not: ['', 'a', 'e', 'i', 'o', 'u'],
2203
+ code: ''
2204
+ },
2205
+ { char: 'g', trailing: 'n', code: '' },
2206
+ { char: 'g', trailing: 'ned', code: '' },
2207
+ { char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
2208
+ {
2209
+ char: 'h',
2210
+ prev: ['a', 'e', 'i', 'o', 'u'],
2211
+ nextNot: ['a', 'e', 'i', 'o', 'u'],
2212
+ code: ''
2213
+ },
2214
+ { char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
2215
+ { char: 'k', prev: ['c'], code: '' },
2216
+ { char: 'p', next: ['h'], code: 'F' },
2217
+ { char: 's', next: ['h'], code: 'X' },
2218
+ { char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
2219
+ { char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
2220
+ { char: 't', next: ['h'], code: '0' },
2221
+ { char: 't', next: ['c'], next2: ['h'], code: '' },
2222
+ { char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
2223
+ { char: 'h', leading: 'w', code: '' },
2224
+ { char: 'x', position: 'start', code: 'S' },
2225
+ { char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
2226
+ ]
2227
+ });
2228
+
2229
+ class Soundex extends Phonetic {
2230
+ static default = {
2231
+ map: 'en',
2232
+ delimiter: ' ',
2233
+ length: 4,
2234
+ pad: '0',
2235
+ dedupe: true
2236
+ };
2237
+ constructor(opt = {}) {
2238
+ super('soundex', opt);
2239
+ }
2240
+ adjustCode(code, chars) {
2241
+ return chars[0].toUpperCase() + code.slice(1).replaceAll('0', '');
2242
+ }
2243
+ }
2244
+ PhoneticRegistry.add('soundex', Soundex);
2245
+ PhoneticMappingRegistry.add('soundex', 'en', {
2246
+ map: {
2247
+ a: '0',
2248
+ e: '0',
2249
+ h: '0',
2250
+ i: '0',
2251
+ o: '0',
2252
+ u: '0',
2253
+ w: '0',
2254
+ y: '0',
2255
+ b: '1',
2256
+ f: '1',
2257
+ p: '1',
2258
+ v: '1',
2259
+ c: '2',
2260
+ g: '2',
2261
+ j: '2',
2262
+ k: '2',
2263
+ q: '2',
2264
+ s: '2',
2265
+ x: '2',
2266
+ z: '2',
2267
+ d: '3',
2268
+ t: '3',
2269
+ l: '4',
2270
+ m: '5',
2271
+ n: '5',
2272
+ r: '6'
2273
+ }
2274
+ });
2275
+ PhoneticMappingRegistry.add('soundex', 'de', {
2276
+ map: {
2277
+ a: '0',
2278
+ ä: '0',
2279
+ e: '0',
2280
+ h: '0',
2281
+ i: '0',
2282
+ j: '0',
2283
+ o: '0',
2284
+ ö: '0',
2285
+ u: '0',
2286
+ ü: '0',
2287
+ y: '0',
2288
+ b: '1',
2289
+ f: '1',
2290
+ p: '1',
2291
+ v: '1',
2292
+ w: '1',
2293
+ c: '2',
2294
+ g: '2',
2295
+ k: '2',
2296
+ q: '2',
2297
+ s: '2',
2298
+ ß: '2',
2299
+ x: '2',
2300
+ z: '2',
2301
+ d: '3',
2302
+ t: '3',
2303
+ l: '4',
2304
+ m: '5',
2305
+ n: '5',
2306
+ r: '6'
2307
+ },
2308
+ ruleset: [{ char: 'c', next: ['h'], code: '7' }]
2309
+ });
2310
+
2311
+ class OptionsValidator {
2312
+ static ALLOWED_FLAGS = new Set([
2313
+ 'd',
2314
+ 'u',
2315
+ 'x',
2316
+ 'w',
2317
+ 't',
2318
+ 'r',
2319
+ 's',
2320
+ 'k',
2321
+ 'n',
2322
+ 'i'
2323
+ ]);
2324
+ static ALLOWED_OUTPUT = new Set(['orig', 'prep']);
2325
+ static ALLOWED_MODES = new Set(['default', 'batch', 'single', 'pairwise']);
2326
+ static ALLOWED_SORT = new Set(['asc', 'desc']);
2327
+ static PROCESSORS = {
2328
+ phonetic: (opt) => {
2329
+ if (!opt) return;
2330
+ OptionsValidator.validatePhoneticName(opt.algo);
2331
+ OptionsValidator.validatePhoneticOptions(opt.opt);
2332
+ }
2333
+ };
2334
+ static METRIC_OPT_MAP = {
2335
+ mode: (v) => OptionsValidator.validateMode(v),
2336
+ delimiter: (v) => OptionsValidator.validateString(v, 'opt.delimiter'),
2337
+ pad: (v) => OptionsValidator.validateString(v, 'opt.pad'),
2338
+ q: (v) => OptionsValidator.validateNumber(v, 'opt.q'),
2339
+ match: (v) => OptionsValidator.validateNumber(v, 'opt.match'),
2340
+ mismatch: (v) => OptionsValidator.validateNumber(v, 'opt.mismatch'),
2341
+ gap: (v) => OptionsValidator.validateNumber(v, 'opt.gap')
2342
+ };
2343
+ static PHONETIC_OPT_MAP = {
2344
+ map: (v) =>
2345
+ OptionsValidator.validateString(v, 'processors.phonetic.opt.map'),
2346
+ delimiter: (v) =>
2347
+ OptionsValidator.validateString(v, 'processors.phonetic.opt.delimiter'),
2348
+ length: (v) =>
2349
+ OptionsValidator.validateNumber(v, 'processors.phonetic.opt.length'),
2350
+ pad: (v) =>
2351
+ OptionsValidator.validateString(v, 'processors.phonetic.opt.pad'),
2352
+ dedupe: (v) =>
2353
+ OptionsValidator.validateBoolean(v, 'processors.phonetic.opt.dedupe'),
2354
+ fallback: (v) =>
2355
+ OptionsValidator.validateString(v, 'processors.phonetic.opt.fallback')
2356
+ };
2357
+ static CMPSTR_OPT_MAP = {
2358
+ raw: (v) => OptionsValidator.validateBoolean(v, 'raw'),
2359
+ removeZero: (v) => OptionsValidator.validateBoolean(v, 'removeZero'),
2360
+ safeEmpty: (v) => OptionsValidator.validateBoolean(v, 'safeEmpty'),
2361
+ flags: (v) => OptionsValidator.validateFlags(v),
2362
+ metric: (v) => OptionsValidator.validateMetricName(v),
2363
+ output: (v) => OptionsValidator.validateOutput(v),
2364
+ opt: (v) => OptionsValidator.validateMetricOptions(v),
2365
+ processors: (v) => OptionsValidator.validateProcessors(v),
2366
+ sort: (v) => OptionsValidator.validateSort(v, 'sort'),
2367
+ objectsOnly: (v) => OptionsValidator.validateBoolean(v, 'objectsOnly')
2368
+ };
2369
+ static set2string(set) {
2370
+ return Array.from(set).join(' | ');
2371
+ }
2372
+ static validateType(value, name, type) {
2373
+ if (value === undefined) return;
2374
+ if (typeof value !== type || (type === 'number' && Number.isNaN(value))) {
2375
+ throw new CmpStrValidationError(
2376
+ `Invalid option <${name}>: expected ${type}`,
2377
+ { name, value }
2378
+ );
2379
+ }
2380
+ }
2381
+ static validateEnum(value, name, set) {
2382
+ if (value === undefined) return;
2383
+ if (typeof value !== 'string' || !set.has(value)) {
2384
+ throw new CmpStrValidationError(
2385
+ `Invalid option <${name}>: expected ${OptionsValidator.set2string(set)}`,
2386
+ { name, value }
2387
+ );
2388
+ }
2389
+ }
2390
+ static validateMap(opt, map) {
2391
+ if (!opt) return;
2392
+ for (const k in opt) {
2393
+ const fn = map[k];
2394
+ if (!fn)
2395
+ throw new CmpStrValidationError(`Invalid option <${k}>`, {
2396
+ option: k,
2397
+ value: map[k]
2398
+ });
2399
+ fn(opt[k]);
2400
+ }
2401
+ }
2402
+ static validateRegistryName(value, name, label, has, list) {
2403
+ if (value === undefined) return;
2404
+ if (typeof value !== 'string' || value.length === 0)
2405
+ throw new CmpStrValidationError(
2406
+ `Invalid option <${name}>: expected non-empty string`,
2407
+ { name, value }
2408
+ );
2409
+ if (!has(value))
2410
+ throw new CmpStrValidationError(
2411
+ `${label} <${value}> is not registered`,
2412
+ { name, value, available: list() }
2413
+ );
2414
+ }
2415
+ static validateBoolean(value, name) {
2416
+ OptionsValidator.validateType(value, name, 'boolean');
2417
+ }
2418
+ static validateNumber(value, name) {
2419
+ OptionsValidator.validateType(value, name, 'number');
2420
+ }
2421
+ static validateString(value, name) {
2422
+ OptionsValidator.validateType(value, name, 'string');
2423
+ }
2424
+ static validateFlags(value) {
2425
+ if (value === undefined) return;
2426
+ if (typeof value !== 'string')
2427
+ throw new CmpStrValidationError(
2428
+ `Invalid option <flags>: expected string`,
2429
+ { flags: value }
2430
+ );
2431
+ for (let i = 0; i < value.length; i++) {
2432
+ const ch = value[i];
2433
+ if (!OptionsValidator.ALLOWED_FLAGS.has(ch))
2434
+ throw new CmpStrValidationError(
2435
+ `Invalid normalization flag <${ch}> in <flags>: expected ${OptionsValidator.set2string(OptionsValidator.ALLOWED_FLAGS)}`,
2436
+ { flags: value, invalid: ch }
2437
+ );
2438
+ }
2439
+ }
2440
+ static validateOutput(value) {
2441
+ OptionsValidator.validateEnum(
2442
+ value,
2443
+ 'output',
2444
+ OptionsValidator.ALLOWED_OUTPUT
2445
+ );
2446
+ }
2447
+ static validateMode(value) {
2448
+ OptionsValidator.validateEnum(
2449
+ value,
2450
+ 'mode',
2451
+ OptionsValidator.ALLOWED_MODES
2452
+ );
2453
+ }
2454
+ static validateSort(value, name) {
2455
+ if (value === undefined || typeof value === 'boolean') return;
2456
+ OptionsValidator.validateEnum(value, name, OptionsValidator.ALLOWED_SORT);
2457
+ }
2458
+ static validateMetricName(value) {
2459
+ OptionsValidator.validateRegistryName(
2460
+ value,
2461
+ 'metric',
2462
+ 'Comparison metric',
2463
+ MetricRegistry.has,
2464
+ MetricRegistry.list
2465
+ );
2466
+ }
2467
+ static validatePhoneticName(value) {
2468
+ OptionsValidator.validateRegistryName(
2469
+ value,
2470
+ 'phonetic',
2471
+ 'Phonetic algorithm',
2472
+ PhoneticRegistry.has,
2473
+ PhoneticRegistry.list
2474
+ );
2475
+ }
2476
+ static validateMetricOptions(opt) {
2477
+ OptionsValidator.validateMap(opt, OptionsValidator.METRIC_OPT_MAP);
2478
+ }
2479
+ static validatePhoneticOptions(opt) {
2480
+ OptionsValidator.validateMap(opt, OptionsValidator.PHONETIC_OPT_MAP);
2481
+ }
2482
+ static validateProcessors(opt) {
2483
+ if (!opt) return;
2484
+ for (const key in opt) {
2485
+ const fn = OptionsValidator.PROCESSORS[key];
2486
+ if (!fn)
2487
+ throw new CmpStrValidationError(
2488
+ `Invalid processor type <${key}> in <processors>: expected ${Object.keys(OptionsValidator.PROCESSORS).join(' | ')}`,
2489
+ { processors: opt, invalid: key }
2490
+ );
2491
+ fn(opt[key]);
2492
+ }
2493
+ }
2494
+ static validateOptions(opt) {
2495
+ OptionsValidator.validateMap(opt, OptionsValidator.CMPSTR_OPT_MAP);
2496
+ }
2497
+ }
2498
+
2499
+ class StructuredData {
2500
+ data;
2501
+ key;
2502
+ static SORT_ASC = (a, b) => a.res - b.res;
2503
+ static SORT_DESC = (a, b) => b.res - a.res;
2504
+ static create(data, key) {
2505
+ return new StructuredData(data, key);
2506
+ }
2507
+ constructor(data, key) {
2508
+ this.data = data;
2509
+ this.key = key;
2510
+ }
2511
+ extractFrom(arr, key) {
2512
+ const n = arr.length;
2513
+ const result = new Array(n);
2514
+ for (let i = 0; i < n; i++) {
2515
+ const val = arr[i][key];
2516
+ result[i] = val != null ? String(val) : '';
2517
+ }
2518
+ return result;
2519
+ }
2520
+ extract() {
2521
+ return this.extractFrom(this.data, this.key);
2522
+ }
2523
+ isMetricResult(v) {
2524
+ return (
2525
+ typeof v === 'object' &&
2526
+ v !== null &&
2527
+ 'a' in v &&
2528
+ 'b' in v &&
2529
+ 'res' in v
2530
+ );
2531
+ }
2532
+ isCmpStrResult(v) {
2533
+ return (
2534
+ typeof v === 'object' &&
2535
+ v !== null &&
2536
+ 'source' in v &&
2537
+ 'target' in v &&
2538
+ 'match' in v
2539
+ );
2540
+ }
2541
+ normalizeResults(results) {
2542
+ if (!Array.isArray(results) || results.length === 0) return [];
2543
+ const first = results[0];
2544
+ let out = new Array(results.length);
2545
+ if (this.isMetricResult(first)) {
2546
+ const src = results;
2547
+ for (let i = 0; i < src.length; i++) out[i] = { ...src[i], __idx: i };
2548
+ } else if (this.isCmpStrResult(first)) {
2549
+ const src = results;
2550
+ for (let i = 0; i < src.length; i++) {
2551
+ const r = src[i];
2552
+ out[i] = {
2553
+ metric: 'unknown',
2554
+ a: r.source,
2555
+ b: r.target,
2556
+ res: r.match,
2557
+ raw: r.raw,
2558
+ __idx: i
2559
+ };
2560
+ }
2561
+ } else
2562
+ throw new CmpStrValidationError(
2563
+ 'Unsupported result format for StructuredData normalization.'
2564
+ );
2565
+ return out;
2566
+ }
2567
+ rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
2568
+ const m = extractedStrings.length,
2569
+ n = results.length;
2570
+ const stringToIndices = Pool.acquire('map', m);
2571
+ const occurrenceCount = Pool.acquire('map', n);
2572
+ const output = new Array(n);
2573
+ stringToIndices.clear();
2574
+ occurrenceCount.clear();
2575
+ try {
2576
+ for (let i = 0; i < m; i++) {
2577
+ const str = extractedStrings[i];
2578
+ let arr = stringToIndices.get(str);
2579
+ if (!arr) {
2580
+ arr = [];
2581
+ stringToIndices.set(str, arr);
2582
+ }
2583
+ arr.push(i);
2584
+ }
2585
+ let out = 0;
2586
+ for (let i = 0; i < n; i++) {
2587
+ const result = results[i];
2588
+ if (removeZero && result.res === 0) continue;
2589
+ const targetStr = result.b || '';
2590
+ const indices = stringToIndices.get(targetStr);
2591
+ let dataIndex;
2592
+ if (indices && indices.length > 0) {
2593
+ const occurrence = occurrenceCount.get(targetStr) ?? 0;
2594
+ occurrenceCount.set(targetStr, occurrence + 1);
2595
+ dataIndex = indices[occurrence % indices.length];
2596
+ } else {
2597
+ dataIndex = result.__idx ?? i;
2598
+ }
2599
+ if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
2600
+ const sourceObj = sourceData[dataIndex];
2601
+ const mappedTarget = extractedStrings[dataIndex] || targetStr;
2602
+ if (objectsOnly) output[out++] = sourceObj;
2603
+ else
2604
+ output[out++] = {
2605
+ obj: sourceObj,
2606
+ key: this.key,
2607
+ result: {
2608
+ source: result.a,
2609
+ target: mappedTarget,
2610
+ match: result.res
2611
+ },
2612
+ ...(result.raw ? { raw: result.raw } : null)
2613
+ };
2614
+ }
2615
+ output.length = out;
2616
+ return output;
2617
+ } finally {
2618
+ Pool.release('map', stringToIndices, m);
2619
+ Pool.release('map', occurrenceCount, n);
2620
+ }
2621
+ }
2622
+ sort(results, sort) {
2623
+ if (!sort || results.length <= 1) return results;
2624
+ return results.sort(
2625
+ sort === 'asc' ? StructuredData.SORT_ASC : StructuredData.SORT_DESC
2626
+ );
2627
+ }
2628
+ finalizeLookup(results, extractedStrings, opt) {
2629
+ return this.rebuild(
2630
+ this.sort(this.normalizeResults(results), opt?.sort),
2631
+ this.data,
2632
+ extractedStrings,
2633
+ opt?.removeZero,
2634
+ opt?.objectsOnly
2635
+ );
2636
+ }
2637
+ performLookup(fn, extractedStrings, opt) {
2638
+ return ErrorUtil.wrap(
2639
+ () => this.finalizeLookup(fn(), extractedStrings, opt),
2640
+ 'StructuredData lookup failed',
2641
+ { key: this.key }
2642
+ );
2643
+ }
2644
+ async performLookupAsync(fn, extractedStrings, opt) {
2645
+ return await ErrorUtil.wrapAsync(
2646
+ async () => this.finalizeLookup(await fn(), extractedStrings, opt),
2647
+ 'StructuredData async lookup failed',
2648
+ { key: this.key }
2649
+ );
2650
+ }
2651
+ lookup(fn, query, opt) {
2652
+ const b = this.extract();
2653
+ try {
2654
+ return this.performLookup(() => fn(query, b, opt), b, opt);
2655
+ } finally {
2656
+ Pool.release('string[]', b, b.length);
2657
+ }
2658
+ }
2659
+ async lookupAsync(fn, query, opt) {
2660
+ const b = this.extract();
2661
+ try {
2662
+ return await this.performLookupAsync(() => fn(query, b, opt), b, opt);
2663
+ } finally {
2664
+ Pool.release('string[]', b, b.length);
2665
+ }
2666
+ }
2667
+ lookupPairs(fn, other, otherKey, opt) {
2668
+ const a = this.extract();
2669
+ const b = this.extractFrom(other, otherKey);
2670
+ try {
2671
+ return this.performLookup(() => fn(a, b, opt), a, opt);
2672
+ } finally {
2673
+ Pool.release('string[]', a, a.length);
2674
+ Pool.release('string[]', b, b.length);
2675
+ }
2676
+ }
2677
+ async lookupPairsAsync(fn, other, otherKey, opt) {
2678
+ const a = this.extract();
2679
+ const b = this.extractFrom(other, otherKey);
2680
+ try {
2681
+ return await this.performLookupAsync(() => fn(a, b, opt), a, opt);
2682
+ } finally {
2683
+ Pool.release('string[]', a, a.length);
2684
+ Pool.release('string[]', b, b.length);
2685
+ }
2686
+ }
2687
+ }
2688
+
2689
+ class TextAnalyzer {
2690
+ static REGEX = {
2691
+ number: /\d/,
2692
+ sentence: /(?<=[.!?])\s+/,
2693
+ word: /\p{L}+/gu,
2694
+ nonWord: /[^\p{L}]/gu,
2695
+ vowelGroup: /[aeiouy]+/g,
2696
+ letter: /\p{L}/gu,
2697
+ ucLetter: /\p{Lu}/gu
2698
+ };
2699
+ text;
2700
+ words = [];
2701
+ sentences = [];
2702
+ charFrequency = new Map();
2703
+ wordHistogram = new Map();
2704
+ syllableCache = new Map();
2705
+ syllableStats;
2706
+ constructor(input) {
2707
+ this.text = input.trim();
2708
+ this.tokenize();
2709
+ this.computeFrequencies();
2710
+ }
2711
+ tokenize() {
2712
+ let match;
2713
+ const lcText = this.text.toLowerCase();
2714
+ while ((match = TextAnalyzer.REGEX.word.exec(lcText)) !== null)
2715
+ this.words.push(match[0]);
2716
+ this.sentences = this.text
2717
+ .split(TextAnalyzer.REGEX.sentence)
2718
+ .filter(Boolean);
2719
+ }
2720
+ computeFrequencies() {
2721
+ for (const char of this.text)
2722
+ this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
2723
+ for (const word of this.words)
2724
+ this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
2725
+ }
2726
+ estimateSyllables(word) {
2727
+ const clean = word
2728
+ .normalize('NFC')
2729
+ .toLowerCase()
2730
+ .replace(TextAnalyzer.REGEX.nonWord, '');
2731
+ if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
2732
+ const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
2733
+ const count = matches ? matches.length : 1;
2734
+ this.syllableCache.set(clean, count);
2735
+ return count;
2736
+ }
2737
+ computeSyllableStats() {
2738
+ return (this.syllableStats ||= (() => {
2739
+ const perWord = this.words
2740
+ .map((w) => this.estimateSyllables(w))
2741
+ .sort((a, b) => a - b);
2742
+ const total = perWord.reduce((sum, s) => sum + s, 0);
2743
+ const mono = perWord.filter((s) => s === 1).length;
2744
+ const median = !perWord.length
2745
+ ? 0
2746
+ : perWord.length % 2 === 0
2747
+ ? (perWord[perWord.length / 2 - 1] + perWord[perWord.length / 2]) /
2748
+ 2
2749
+ : perWord[Math.floor(perWord.length / 2)];
2750
+ return {
2751
+ total,
2752
+ mono,
2753
+ perWord,
2754
+ avg: perWord.length ? total / perWord.length : 0,
2755
+ median
2756
+ };
2757
+ })());
2758
+ }
2759
+ getLength = () => this.text.length;
2760
+ getWordCount = () => this.words.length;
2761
+ getSentenceCount = () => this.sentences.length;
2762
+ getAvgWordLength() {
2763
+ return this.words.length
2764
+ ? this.words.join('').length / this.words.length
2765
+ : 0;
2766
+ }
2767
+ getAvgSentenceLength() {
2768
+ return this.sentences.length
2769
+ ? this.words.length / this.sentences.length
2770
+ : 0;
2771
+ }
2772
+ getWordHistogram() {
2773
+ return Object.fromEntries(this.wordHistogram);
2774
+ }
2775
+ getMostCommonWords(limit = 5) {
2776
+ return [...this.wordHistogram.entries()]
2777
+ .sort((a, b) => b[1] - a[1])
2778
+ .slice(0, limit)
2779
+ .map((e) => e[0]);
2780
+ }
2781
+ getHapaxLegomena() {
2782
+ return [...this.wordHistogram.entries()]
2783
+ .filter(([, c]) => c === 1)
2784
+ .map((e) => e[0]);
2785
+ }
2786
+ hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);
2787
+ getUpperCaseRatio() {
2788
+ const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
2789
+ const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
2790
+ return matches.length ? upper / matches.length : 0;
2791
+ }
2792
+ getCharFrequency() {
2793
+ return Object.fromEntries(this.charFrequency);
2794
+ }
2795
+ getUnicodeCodepoints() {
2796
+ const result = {};
2797
+ for (const [char, count] of this.charFrequency) {
2798
+ const block = char
2799
+ .charCodeAt(0)
2800
+ .toString(16)
2801
+ .padStart(4, '0')
2802
+ .toUpperCase();
2803
+ result[block] = (result[block] || 0) + count;
2804
+ }
2805
+ return result;
2806
+ }
2807
+ getLongWordRatio(len = 7) {
2808
+ let long = 0;
2809
+ for (const w of this.words) if (w.length >= len) long++;
2810
+ return this.words.length ? long / this.words.length : 0;
2811
+ }
2812
+ getShortWordRatio(len = 3) {
2813
+ let short = 0;
2814
+ for (const w of this.words) if (w.length <= len) short++;
2815
+ return this.words.length ? short / this.words.length : 0;
2816
+ }
2817
+ getSyllablesCount() {
2818
+ return this.computeSyllableStats().total;
2819
+ }
2820
+ getMonosyllabicWordCount() {
2821
+ return this.computeSyllableStats().mono;
2822
+ }
2823
+ getMinSyllablesWordCount(min) {
2824
+ return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
2825
+ }
2826
+ getMaxSyllablesWordCount(max) {
2827
+ return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
2828
+ }
2829
+ getAvgSyllablesPerWord() {
2830
+ return this.computeSyllableStats().avg;
2831
+ }
2832
+ getMedianSyllablesPerWord() {
2833
+ return this.computeSyllableStats().median;
2365
2834
  }
2366
- adjustCode(code) {
2367
- return (
2368
- code.slice(0, 1) + code.slice(1).replace(Metaphone.REGEX.vowel, '')
2369
- );
2835
+ getHonoresR() {
2836
+ try {
2837
+ return (
2838
+ (100 * Math.log(this.words.length)) /
2839
+ (1 - this.getHapaxLegomena().length / (this.wordHistogram.size ?? 1))
2840
+ );
2841
+ } catch {
2842
+ return 0;
2843
+ }
2370
2844
  }
2371
- }
2372
- PhoneticRegistry.add('metaphone', Metaphone);
2373
- PhoneticMappingRegistry.add('metaphone', 'en90', {
2374
- map: {
2375
- a: 'A',
2376
- b: 'B',
2377
- c: 'K',
2378
- d: 'T',
2379
- e: 'E',
2380
- f: 'F',
2381
- g: 'K',
2382
- h: 'H',
2383
- i: 'I',
2384
- j: 'J',
2385
- k: 'K',
2386
- l: 'L',
2387
- m: 'M',
2388
- n: 'N',
2389
- o: 'O',
2390
- p: 'P',
2391
- q: 'K',
2392
- r: 'R',
2393
- s: 'S',
2394
- t: 'T',
2395
- u: 'U',
2396
- v: 'F',
2397
- w: 'W',
2398
- x: 'KS',
2399
- y: 'Y',
2400
- z: 'S'
2401
- },
2402
- ruleset: [
2403
- { char: 'a', position: 'start', next: ['e'], code: '' },
2404
- { char: 'g', position: 'start', next: ['n'], code: '' },
2405
- { char: 'k', position: 'start', next: ['n'], code: '' },
2406
- { char: 'p', position: 'start', next: ['n'], code: '' },
2407
- { char: 'w', position: 'start', next: ['r'], code: '' },
2408
- { char: 'b', position: 'end', prev: ['m'], code: '' },
2409
- { char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
2410
- { char: 'c', next: ['i'], next2: ['a'], code: 'X' },
2411
- { char: 'c', next: ['e', 'i', 'y'], code: 'S' },
2412
- { char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
2413
- {
2414
- char: 'g',
2415
- next: ['h'],
2416
- next2Not: ['', 'a', 'e', 'i', 'o', 'u'],
2417
- code: ''
2418
- },
2419
- { char: 'g', trailing: 'n', code: '' },
2420
- { char: 'g', trailing: 'ned', code: '' },
2421
- { char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
2422
- {
2423
- char: 'h',
2424
- prev: ['a', 'e', 'i', 'o', 'u'],
2425
- nextNot: ['a', 'e', 'i', 'o', 'u'],
2426
- code: ''
2427
- },
2428
- { char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
2429
- { char: 'k', prev: ['c'], code: '' },
2430
- { char: 'p', next: ['h'], code: 'F' },
2431
- { char: 's', next: ['h'], code: 'X' },
2432
- { char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
2433
- { char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
2434
- { char: 't', next: ['h'], code: '0' },
2435
- { char: 't', next: ['c'], next2: ['h'], code: '' },
2436
- { char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
2437
- { char: 'h', leading: 'w', code: '' },
2438
- { char: 'x', position: 'start', code: 'S' },
2439
- { char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
2440
- ]
2441
- });
2442
-
2443
- class Soundex extends Phonetic {
2444
- static default = {
2445
- map: 'en',
2446
- delimiter: ' ',
2447
- length: 4,
2448
- pad: '0',
2449
- dedupe: true
2450
- };
2451
- constructor(opt = {}) {
2452
- super('soundex', opt);
2845
+ getReadingTime(wpm = 200) {
2846
+ return this.words.length / (wpm ?? 1);
2453
2847
  }
2454
- adjustCode(code, chars) {
2455
- return chars[0].toUpperCase() + code.slice(1).replaceAll('0', '');
2848
+ getReadabilityScore(metric = 'flesch') {
2849
+ const w = this.words.length || 1;
2850
+ const s = this.sentences.length || 1;
2851
+ const y = this.getSyllablesCount() || 1;
2852
+ const asl = w / s;
2853
+ const asw = y / w;
2854
+ switch (metric) {
2855
+ case 'flesch':
2856
+ return 206.835 - 1.015 * asl - 84.6 * asw;
2857
+ case 'fleschde':
2858
+ return 180 - asl - 58.5 * asw;
2859
+ case 'kincaid':
2860
+ return 0.39 * asl + 11.8 * asw - 15.59;
2861
+ }
2456
2862
  }
2457
- }
2458
- PhoneticRegistry.add('soundex', Soundex);
2459
- PhoneticMappingRegistry.add('soundex', 'en', {
2460
- map: {
2461
- a: '0',
2462
- e: '0',
2463
- h: '0',
2464
- i: '0',
2465
- o: '0',
2466
- u: '0',
2467
- w: '0',
2468
- y: '0',
2469
- b: '1',
2470
- f: '1',
2471
- p: '1',
2472
- v: '1',
2473
- c: '2',
2474
- g: '2',
2475
- j: '2',
2476
- k: '2',
2477
- q: '2',
2478
- s: '2',
2479
- x: '2',
2480
- z: '2',
2481
- d: '3',
2482
- t: '3',
2483
- l: '4',
2484
- m: '5',
2485
- n: '5',
2486
- r: '6'
2863
+ getLIXScore() {
2864
+ const w = this.words.length || 1;
2865
+ const s = this.sentences.length || 1;
2866
+ const l = this.getLongWordRatio() * w;
2867
+ return w / s + (l / w) * 100;
2487
2868
  }
2488
- });
2489
- PhoneticMappingRegistry.add('soundex', 'de', {
2490
- map: {
2491
- a: '0',
2492
- ä: '0',
2493
- e: '0',
2494
- h: '0',
2495
- i: '0',
2496
- j: '0',
2497
- o: '0',
2498
- ö: '0',
2499
- u: '0',
2500
- ü: '0',
2501
- y: '0',
2502
- b: '1',
2503
- f: '1',
2504
- p: '1',
2505
- v: '1',
2506
- w: '1',
2507
- c: '2',
2508
- g: '2',
2509
- k: '2',
2510
- q: '2',
2511
- s: '2',
2512
- ß: '2',
2513
- x: '2',
2514
- z: '2',
2515
- d: '3',
2516
- t: '3',
2517
- l: '4',
2518
- m: '5',
2519
- n: '5',
2520
- r: '6'
2521
- },
2522
- ruleset: [{ char: 'c', next: ['h'], code: '7' }]
2523
- });
2869
+ getWSTFScore() {
2870
+ const w = this.words.length || 1;
2871
+ const h = (this.getMinSyllablesWordCount(3) / w) * 100;
2872
+ const s = this.getAvgSentenceLength();
2873
+ const l = this.getLongWordRatio() * 100;
2874
+ const m = (this.getMonosyllabicWordCount() / w) * 100;
2875
+ return [
2876
+ 0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
2877
+ 0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
2878
+ 0.2963 * h + 0.1905 * s - 1.1144,
2879
+ 0.2744 * h + 0.2656 * s - 1.693
2880
+ ];
2881
+ }
2882
+ }
2524
2883
 
2525
2884
  const profiler = Profiler.getInstance();
2526
2885
  class CmpStr {
@@ -2572,31 +2931,26 @@
2572
2931
  }
2573
2932
  assert(cond, test) {
2574
2933
  switch (cond) {
2934
+ default:
2935
+ throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
2575
2936
  case 'metric':
2576
- if (!CmpStr.metric.has(test))
2577
- throw new CmpStrNotFoundError(
2578
- `CmpStr <metric> must be set, call .setMetric(), ` +
2579
- `use CmpStr.metric.list() for available metrics`,
2580
- { metric: test }
2581
- );
2937
+ OptionsValidator.validateMetricName(test);
2582
2938
  break;
2583
2939
  case 'phonetic':
2584
- if (!CmpStr.phonetic.has(test))
2585
- throw new CmpStrNotFoundError(
2586
- `CmpStr <phonetic> must be set, call .setPhonetic(), ` +
2587
- `use CmpStr.phonetic.list() for available phonetic algorithms`,
2588
- { phonetic: test }
2589
- );
2940
+ OptionsValidator.validatePhoneticName(test);
2590
2941
  break;
2591
- default:
2592
- throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
2593
2942
  }
2594
2943
  }
2595
2944
  assertMany(...cond) {
2596
2945
  for (const [c, test] of cond) this.assert(c, test);
2597
2946
  }
2598
2947
  resolveOptions(opt) {
2599
- return merge({ ...(this.options ?? Object.create(null)) }, opt);
2948
+ const merged = DeepMerge.merge(
2949
+ { ...(this.options ?? Object.create(null)) },
2950
+ opt
2951
+ );
2952
+ OptionsValidator.validateOptions(merged);
2953
+ return merged;
2600
2954
  }
2601
2955
  normalize(input, flags) {
2602
2956
  return Normalizer.normalize(input, flags ?? this.options.flags ?? '');
@@ -2612,7 +2966,7 @@
2612
2966
  return input;
2613
2967
  }
2614
2968
  postProcess(result, opt) {
2615
- if (opt?.removeZero && Array.isArray(result))
2969
+ if (Array.isArray(result) && opt?.removeZero)
2616
2970
  result = result.filter((r) => r.res > 0);
2617
2971
  return result;
2618
2972
  }
@@ -2628,10 +2982,10 @@
2628
2982
  return StructuredData.create(data, key);
2629
2983
  }
2630
2984
  compute(a, b, opt, mode, raw, skip) {
2985
+ const resolved = this.resolveOptions(opt);
2986
+ this.assert('metric', resolved.metric);
2631
2987
  return ErrorUtil.wrap(
2632
2988
  () => {
2633
- const resolved = this.resolveOptions(opt);
2634
- this.assert('metric', resolved.metric);
2635
2989
  const A = skip ? a : this.prepare(a, resolved);
2636
2990
  const B = skip ? b : this.prepare(b, resolved);
2637
2991
  if (
@@ -2649,7 +3003,7 @@
2649
3003
  const result = this.postProcess(metric.getResults(), resolved);
2650
3004
  return this.output(result, raw ?? resolved.raw);
2651
3005
  },
2652
- `Failed to compute metric <${opt?.metric ?? this.options.metric}> for the given inputs`,
3006
+ `Failed to compute metric <${resolved.metric}> for the given inputs`,
2653
3007
  { a, b, options: opt }
2654
3008
  );
2655
3009
  }
@@ -2665,47 +3019,79 @@
2665
3019
  { result, raw }
2666
3020
  );
2667
3021
  }
2668
- clone = () =>
2669
- Object.assign(Object.create(Object.getPrototypeOf(this)), this);
3022
+ clone() {
3023
+ const inst = Object.assign(
3024
+ Object.create(Object.getPrototypeOf(this)),
3025
+ this
3026
+ );
3027
+ inst.options = DeepMerge.merge(Object.create(null), this.options);
3028
+ return inst;
3029
+ }
2670
3030
  reset() {
2671
- for (const k in this.options) delete this.options[k];
3031
+ this.options = Object.create(null);
2672
3032
  return this;
2673
3033
  }
2674
3034
  setOptions(opt) {
3035
+ OptionsValidator.validateOptions(opt);
2675
3036
  this.options = opt;
2676
3037
  return this;
2677
3038
  }
2678
3039
  mergeOptions(opt) {
2679
- merge(this.options, opt);
3040
+ DeepMerge.merge(this.options, opt);
3041
+ OptionsValidator.validateOptions(this.options);
2680
3042
  return this;
2681
3043
  }
2682
3044
  setSerializedOptions(opt) {
2683
- return ErrorUtil.wrap(
2684
- () => {
2685
- this.options = JSON.parse(opt);
2686
- return this;
2687
- },
2688
- `Failed to parse serialized options, invalid JSON string`,
2689
- { opt }
2690
- );
3045
+ try {
3046
+ const parsed = JSON.parse(opt);
3047
+ OptionsValidator.validateOptions(parsed);
3048
+ this.options = parsed;
3049
+ return this;
3050
+ } catch (err) {
3051
+ if (err instanceof SyntaxError)
3052
+ throw new CmpStrValidationError(
3053
+ `Failed to parse serialized options, invalid JSON string`,
3054
+ { opt, error: err instanceof Error ? err.message : String(err) }
3055
+ );
3056
+ throw err;
3057
+ }
2691
3058
  }
2692
3059
  setOption(path, value) {
2693
- set(this.options, path, value);
3060
+ DeepMerge.set(this.options, path, value);
3061
+ OptionsValidator.validateOptions(this.options);
2694
3062
  return this;
2695
3063
  }
2696
3064
  rmvOption(path) {
2697
- rmv(this.options, path);
3065
+ DeepMerge.rmv(this.options, path);
2698
3066
  return this;
2699
3067
  }
2700
- setRaw = (enable) => this.setOption('raw', enable);
2701
- setMetric = (name) => this.setOption('metric', name);
2702
- setFlags = (flags) => this.setOption('flags', flags);
2703
- rmvFlags = () => this.rmvOption('flags');
2704
- setProcessors = (opt) => this.setOption('processors', opt);
2705
- rmvProcessors = () => this.rmvOption('processors');
2706
- getOptions = () => this.options;
2707
- getSerializedOptions = () => JSON.stringify(this.options);
2708
- getOption = (path) => get(this.options, path);
3068
+ setRaw(enable) {
3069
+ return this.setOption('raw', enable);
3070
+ }
3071
+ setMetric(name) {
3072
+ return this.setOption('metric', name);
3073
+ }
3074
+ setFlags(flags) {
3075
+ return this.setOption('flags', flags);
3076
+ }
3077
+ rmvFlags() {
3078
+ return this.rmvOption('flags');
3079
+ }
3080
+ setProcessors(opt) {
3081
+ return this.setOption('processors', opt);
3082
+ }
3083
+ rmvProcessors() {
3084
+ return this.rmvOption('processors');
3085
+ }
3086
+ getOptions() {
3087
+ return this.options;
3088
+ }
3089
+ getSerializedOptions() {
3090
+ return JSON.stringify(this.options);
3091
+ }
3092
+ getOption(path) {
3093
+ return DeepMerge.get(this.options, path);
3094
+ }
2709
3095
  test(a, b, opt) {
2710
3096
  return this.compute(a, b, opt, 'single');
2711
3097
  }
@@ -2744,15 +3130,35 @@
2744
3130
  const resolved = this.resolveOptions({ flags, processors });
2745
3131
  const test = this.prepare(needle, resolved);
2746
3132
  const hstk = this.prepare(haystack, resolved);
2747
- return haystack.filter((_, i) => hstk[i].includes(test));
3133
+ const out = [];
3134
+ for (let i = 0, len = hstk.length; i < len; i++) {
3135
+ if (hstk[i].includes(test)) out.push(haystack[i]);
3136
+ }
3137
+ return out;
2748
3138
  }
2749
3139
  matrix(input, opt) {
2750
- input = this.prepare(input, this.resolveOptions(opt));
2751
- return input.map((a) =>
2752
- this.compute(a, input, undefined, 'batch', true, true).map(
2753
- (b) => b.res ?? 0
2754
- )
2755
- );
3140
+ const resolved = this.resolveOptions(opt);
3141
+ const arr = this.prepare(input, resolved);
3142
+ const n = arr.length;
3143
+ const out = Array.from({ length: n }, () => new Array(n).fill(0));
3144
+ for (let i = 0; i < n; i++)
3145
+ for (let j = i; j < n; j++) {
3146
+ if (i === j) {
3147
+ out[i][j] = 1;
3148
+ } else {
3149
+ const score = this.compute(
3150
+ arr[i],
3151
+ arr[j],
3152
+ resolved,
3153
+ 'single',
3154
+ true,
3155
+ true
3156
+ ).res;
3157
+ out[i][j] = score;
3158
+ out[j][i] = score;
3159
+ }
3160
+ }
3161
+ return out;
2756
3162
  }
2757
3163
  phoneticIndex(input, algo, opt) {
2758
3164
  const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
@@ -2833,10 +3239,10 @@
2833
3239
  : phonetic.getIndexAsync(input).then((r) => r.join(delimiter));
2834
3240
  }
2835
3241
  async computeAsync(a, b, opt, mode, raw, skip) {
3242
+ const resolved = this.resolveOptions(opt);
3243
+ this.assert('metric', resolved.metric);
2836
3244
  return ErrorUtil.wrapAsync(
2837
3245
  async () => {
2838
- const resolved = this.resolveOptions(opt);
2839
- this.assert('metric', resolved.metric);
2840
3246
  const A = skip ? a : await this.prepareAsync(a, resolved);
2841
3247
  const B = skip ? b : await this.prepareAsync(b, resolved);
2842
3248
  if (
@@ -2894,23 +3300,40 @@
2894
3300
  const resolved = this.resolveOptions({ flags, processors });
2895
3301
  const test = await this.prepareAsync(needle, resolved);
2896
3302
  const hstk = await this.prepareAsync(haystack, resolved);
2897
- return haystack.filter((_, i) => hstk[i].includes(test));
3303
+ const out = [];
3304
+ for (let i = 0; i < hstk.length; i++) {
3305
+ if (hstk[i].includes(test)) out.push(haystack[i]);
3306
+ }
3307
+ return out;
2898
3308
  }
2899
3309
  async matrixAsync(input, opt) {
2900
- input = await this.prepareAsync(input, this.resolveOptions(opt));
2901
- return Promise.all(
2902
- input.map(
2903
- async (a) =>
2904
- await this.computeAsync(
2905
- a,
2906
- input,
2907
- undefined,
2908
- 'batch',
2909
- true,
2910
- true
2911
- ).then((r) => r.map((b) => b.res ?? 0))
2912
- )
2913
- );
3310
+ const resolved = this.resolveOptions(opt);
3311
+ const arr = await this.prepareAsync(input, resolved);
3312
+ const n = arr.length;
3313
+ const out = Array.from({ length: n }, () => new Array(n).fill(0));
3314
+ for (let i = 0; i < n; i++) {
3315
+ await Promise.all(
3316
+ Array.from({ length: n - i }, (_, k) => i + k).map(async (j) => {
3317
+ if (i === j) {
3318
+ out[i][j] = 1;
3319
+ } else {
3320
+ const score = (
3321
+ await this.computeAsync(
3322
+ arr[i],
3323
+ arr[j],
3324
+ resolved,
3325
+ 'single',
3326
+ true,
3327
+ true
3328
+ )
3329
+ ).res;
3330
+ out[i][j] = score;
3331
+ out[j][i] = score;
3332
+ }
3333
+ })
3334
+ );
3335
+ }
3336
+ return out;
2914
3337
  }
2915
3338
  async phoneticIndexAsync(input, algo, opt) {
2916
3339
  const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
@@ -2966,6 +3389,7 @@
2966
3389
  exports.Metric = Metric;
2967
3390
  exports.MetricRegistry = MetricRegistry;
2968
3391
  exports.Normalizer = Normalizer;
3392
+ exports.OptionsValidator = OptionsValidator;
2969
3393
  exports.Phonetic = Phonetic;
2970
3394
  exports.PhoneticMappingRegistry = PhoneticMappingRegistry;
2971
3395
  exports.PhoneticRegistry = PhoneticRegistry;