cmpstr 3.2.2 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/CmpStr.esm.js +2149 -1721
- package/dist/CmpStr.esm.min.js +2 -2
- package/dist/CmpStr.umd.js +2028 -1604
- package/dist/CmpStr.umd.min.js +2 -2
- package/dist/cjs/CmpStr.cjs +100 -51
- package/dist/cjs/CmpStrAsync.cjs +35 -18
- package/dist/cjs/index.cjs +1 -1
- package/dist/cjs/metric/Cosine.cjs +1 -1
- package/dist/cjs/metric/DamerauLevenshtein.cjs +1 -1
- package/dist/cjs/metric/DiceSorensen.cjs +1 -1
- package/dist/cjs/metric/Hamming.cjs +1 -1
- package/dist/cjs/metric/Jaccard.cjs +1 -1
- package/dist/cjs/metric/JaroWinkler.cjs +1 -1
- package/dist/cjs/metric/LCS.cjs +1 -1
- package/dist/cjs/metric/Levenshtein.cjs +1 -1
- package/dist/cjs/metric/Metric.cjs +40 -22
- package/dist/cjs/metric/NeedlemanWunsch.cjs +1 -1
- package/dist/cjs/metric/QGram.cjs +1 -1
- package/dist/cjs/metric/SmithWaterman.cjs +1 -1
- package/dist/cjs/phonetic/Caverphone.cjs +1 -1
- package/dist/cjs/phonetic/Cologne.cjs +1 -1
- package/dist/cjs/phonetic/Metaphone.cjs +1 -1
- package/dist/cjs/phonetic/Phonetic.cjs +27 -15
- package/dist/cjs/phonetic/Soundex.cjs +1 -1
- package/dist/cjs/root.cjs +4 -2
- package/dist/cjs/utils/DeepMerge.cjs +102 -97
- package/dist/cjs/utils/DiffChecker.cjs +1 -1
- package/dist/cjs/utils/Errors.cjs +22 -19
- package/dist/cjs/utils/Filter.cjs +59 -24
- package/dist/cjs/utils/HashTable.cjs +44 -29
- package/dist/cjs/utils/Normalizer.cjs +57 -28
- package/dist/cjs/utils/OptionsValidator.cjs +211 -0
- package/dist/cjs/utils/Pool.cjs +27 -13
- package/dist/cjs/utils/Profiler.cjs +41 -27
- package/dist/cjs/utils/Registry.cjs +5 -5
- package/dist/cjs/utils/StructuredData.cjs +83 -53
- package/dist/cjs/utils/TextAnalyzer.cjs +1 -1
- package/dist/esm/CmpStr.mjs +101 -52
- package/dist/esm/CmpStrAsync.mjs +35 -18
- package/dist/esm/index.mjs +1 -1
- package/dist/esm/metric/Cosine.mjs +1 -1
- package/dist/esm/metric/DamerauLevenshtein.mjs +1 -1
- package/dist/esm/metric/DiceSorensen.mjs +1 -1
- package/dist/esm/metric/Hamming.mjs +1 -1
- package/dist/esm/metric/Jaccard.mjs +1 -1
- package/dist/esm/metric/JaroWinkler.mjs +1 -1
- package/dist/esm/metric/LCS.mjs +1 -1
- package/dist/esm/metric/Levenshtein.mjs +1 -1
- package/dist/esm/metric/Metric.mjs +40 -22
- package/dist/esm/metric/NeedlemanWunsch.mjs +1 -1
- package/dist/esm/metric/QGram.mjs +1 -1
- package/dist/esm/metric/SmithWaterman.mjs +1 -1
- package/dist/esm/phonetic/Caverphone.mjs +1 -1
- package/dist/esm/phonetic/Cologne.mjs +1 -1
- package/dist/esm/phonetic/Metaphone.mjs +1 -1
- package/dist/esm/phonetic/Phonetic.mjs +30 -15
- package/dist/esm/phonetic/Soundex.mjs +1 -1
- package/dist/esm/root.mjs +3 -3
- package/dist/esm/utils/DeepMerge.mjs +103 -94
- package/dist/esm/utils/DiffChecker.mjs +1 -1
- package/dist/esm/utils/Errors.mjs +22 -19
- package/dist/esm/utils/Filter.mjs +59 -24
- package/dist/esm/utils/HashTable.mjs +44 -29
- package/dist/esm/utils/Normalizer.mjs +57 -28
- package/dist/esm/utils/OptionsValidator.mjs +210 -0
- package/dist/esm/utils/Pool.mjs +27 -13
- package/dist/esm/utils/Profiler.mjs +41 -27
- package/dist/esm/utils/Registry.mjs +5 -5
- package/dist/esm/utils/StructuredData.mjs +83 -53
- package/dist/esm/utils/TextAnalyzer.mjs +1 -1
- package/dist/types/CmpStr.d.ts +22 -15
- package/dist/types/CmpStrAsync.d.ts +3 -0
- package/dist/types/index.d.ts +3 -3
- package/dist/types/metric/Metric.d.ts +9 -9
- package/dist/types/phonetic/Phonetic.d.ts +4 -3
- package/dist/types/root.d.ts +3 -2
- package/dist/types/utils/DeepMerge.d.ts +80 -58
- package/dist/types/utils/Errors.d.ts +25 -8
- package/dist/types/utils/Filter.d.ts +4 -1
- package/dist/types/utils/HashTable.d.ts +12 -11
- package/dist/types/utils/Normalizer.d.ts +2 -1
- package/dist/types/utils/OptionsValidator.d.ts +193 -0
- package/dist/types/utils/Profiler.d.ts +9 -28
- package/dist/types/utils/StructuredData.d.ts +3 -0
- package/dist/types/utils/Types.d.ts +13 -1
- package/package.json +14 -5
package/dist/CmpStr.umd.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* CmpStr v3.
|
|
2
|
+
* CmpStr v3.3.0 build-3699f85-260318
|
|
3
3
|
* This is a lightweight, fast and well performing library for calculating string similarity.
|
|
4
4
|
* (c) 2023-2026 Paul Köhler @komed3 / MIT License
|
|
5
5
|
* Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
|
|
@@ -18,19 +18,32 @@
|
|
|
18
18
|
class CmpStrError extends Error {
|
|
19
19
|
code;
|
|
20
20
|
meta;
|
|
21
|
-
cause;
|
|
22
21
|
when = new Date().toISOString();
|
|
23
22
|
constructor(code, message, meta, cause) {
|
|
24
|
-
super(message);
|
|
23
|
+
super(message, cause !== undefined ? { cause } : undefined);
|
|
25
24
|
this.name = this.constructor.name;
|
|
26
25
|
this.code = code;
|
|
27
26
|
this.meta = meta;
|
|
28
|
-
this.cause = cause;
|
|
29
27
|
if (typeof Error.captureStackTrace === 'function') {
|
|
30
28
|
Error.captureStackTrace(this, this.constructor);
|
|
31
29
|
}
|
|
32
30
|
}
|
|
33
|
-
|
|
31
|
+
format(stack = false) {
|
|
32
|
+
const parts = [`${this.name} [${this.code}]`, this.message];
|
|
33
|
+
if (this.meta)
|
|
34
|
+
for (const _ in this.meta) {
|
|
35
|
+
parts.push(JSON.stringify(this.meta));
|
|
36
|
+
break;
|
|
37
|
+
}
|
|
38
|
+
return (
|
|
39
|
+
parts.join(' - ') +
|
|
40
|
+
(stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
|
|
41
|
+
);
|
|
42
|
+
}
|
|
43
|
+
toString() {
|
|
44
|
+
return this.format(false);
|
|
45
|
+
}
|
|
46
|
+
toJSON(stack = false) {
|
|
34
47
|
return {
|
|
35
48
|
name: this.name,
|
|
36
49
|
code: this.code,
|
|
@@ -42,23 +55,11 @@
|
|
|
42
55
|
? {
|
|
43
56
|
name: this.cause.name,
|
|
44
57
|
message: this.cause.message,
|
|
45
|
-
stack: this.cause.stack
|
|
58
|
+
stack: stack && this.cause.stack
|
|
46
59
|
}
|
|
47
60
|
: this.cause
|
|
48
61
|
};
|
|
49
62
|
}
|
|
50
|
-
toString(stack = false) {
|
|
51
|
-
const parts = [`${this.name} [${this.code}]`, this.message];
|
|
52
|
-
if (this.meta && Object.keys(this.meta).length) {
|
|
53
|
-
try {
|
|
54
|
-
parts.push(JSON.stringify(this.meta));
|
|
55
|
-
} catch {}
|
|
56
|
-
}
|
|
57
|
-
return (
|
|
58
|
-
parts.join(' - ') +
|
|
59
|
-
(stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
|
|
60
|
-
);
|
|
61
|
-
}
|
|
62
63
|
}
|
|
63
64
|
class CmpStrValidationError extends CmpStrError {
|
|
64
65
|
constructor(message, meta, cause) {
|
|
@@ -84,7 +85,7 @@
|
|
|
84
85
|
static assert(condition, message, meta) {
|
|
85
86
|
if (!condition) throw new CmpStrUsageError(message, meta);
|
|
86
87
|
}
|
|
87
|
-
static
|
|
88
|
+
static rethrow(err, message, meta) {
|
|
88
89
|
if (err instanceof CmpStrError) throw err;
|
|
89
90
|
throw new CmpStrInternalError(message, meta, err);
|
|
90
91
|
}
|
|
@@ -97,6 +98,7 @@
|
|
|
97
98
|
try {
|
|
98
99
|
return fn();
|
|
99
100
|
} catch (err) {
|
|
101
|
+
if (err instanceof CmpStrError) throw err;
|
|
100
102
|
throw new CmpStrInternalError(message, meta, err);
|
|
101
103
|
}
|
|
102
104
|
}
|
|
@@ -104,6 +106,7 @@
|
|
|
104
106
|
try {
|
|
105
107
|
return await fn();
|
|
106
108
|
} catch (err) {
|
|
109
|
+
if (err instanceof CmpStrError) throw err;
|
|
107
110
|
throw new CmpStrInternalError(message, meta, err);
|
|
108
111
|
}
|
|
109
112
|
}
|
|
@@ -119,118 +122,120 @@
|
|
|
119
122
|
ErrorUtil: ErrorUtil
|
|
120
123
|
});
|
|
121
124
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
125
|
+
class DeepMerge {
|
|
126
|
+
static BRACKET_PATTERN = /\[(\d+)]/g;
|
|
127
|
+
static PATH_CACHE = new Map();
|
|
128
|
+
static walk(obj, keys) {
|
|
129
|
+
let o = obj;
|
|
130
|
+
for (let i = 0; i < keys.length; i++) {
|
|
131
|
+
const k = keys[i];
|
|
132
|
+
if (o == null || !(k in o)) return { exists: false };
|
|
133
|
+
o = o[k];
|
|
134
|
+
}
|
|
135
|
+
return { exists: true, value: o };
|
|
136
|
+
}
|
|
137
|
+
static parse(p) {
|
|
138
|
+
const cached = DeepMerge.PATH_CACHE.get(p);
|
|
139
|
+
if (cached) return cached;
|
|
140
|
+
const parsed = p
|
|
141
|
+
.replace(DeepMerge.BRACKET_PATTERN, '.$1')
|
|
142
|
+
.split('.')
|
|
143
|
+
.map((s) => {
|
|
144
|
+
const n = Number(s);
|
|
145
|
+
return Number.isInteger(n) && String(n) === s ? n : s;
|
|
146
|
+
});
|
|
147
|
+
if (DeepMerge.PATH_CACHE.size > 2000) DeepMerge.PATH_CACHE.clear();
|
|
148
|
+
DeepMerge.PATH_CACHE.set(p, parsed);
|
|
149
|
+
return parsed;
|
|
142
150
|
}
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
function has(t, path) {
|
|
146
|
-
let o = t;
|
|
147
|
-
for (const k of parse(path)) {
|
|
148
|
-
if (o == null || !(k in o)) return false;
|
|
149
|
-
o = o[k];
|
|
151
|
+
static has(t, path) {
|
|
152
|
+
return DeepMerge.walk(t, DeepMerge.parse(path)).exists;
|
|
150
153
|
}
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
154
|
+
static get(t, path, fb) {
|
|
155
|
+
const r = DeepMerge.walk(t, DeepMerge.parse(path));
|
|
156
|
+
return r.exists ? r.value : fb;
|
|
157
|
+
}
|
|
158
|
+
static set(t, path, value) {
|
|
159
|
+
if (path === '') return value;
|
|
160
|
+
const keys = DeepMerge.parse(path);
|
|
161
|
+
ErrorUtil.assert(
|
|
162
|
+
t === undefined || (typeof t === 'object' && t !== null),
|
|
158
163
|
`Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`,
|
|
159
164
|
{ path: keys[0], target: t }
|
|
160
165
|
);
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
166
|
+
const root =
|
|
167
|
+
t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
|
|
168
|
+
let cur = root;
|
|
169
|
+
for (let i = 0; i < keys.length - 1; i++) {
|
|
170
|
+
const k = keys[i];
|
|
171
|
+
let n = cur[k];
|
|
172
|
+
ErrorUtil.assert(
|
|
173
|
+
n == null || typeof n === 'object',
|
|
168
174
|
`Cannot set property <${keys[i + 1]}> of <${JSON.stringify(n)}>`,
|
|
169
175
|
{ path: keys.slice(0, i + 2), value: n }
|
|
170
176
|
);
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
cur[keys[keys.length - 1]] = value;
|
|
176
|
-
return root;
|
|
177
|
-
}
|
|
178
|
-
function merge(
|
|
179
|
-
t = Object.create(null),
|
|
180
|
-
o = Object.create(null),
|
|
181
|
-
mergeUndefined = false
|
|
182
|
-
) {
|
|
183
|
-
const target = t ?? Object.create(null);
|
|
184
|
-
Object.keys(o).forEach((k) => {
|
|
185
|
-
const val = o[k];
|
|
186
|
-
if (!mergeUndefined && val === undefined) return;
|
|
187
|
-
if (k === '__proto__' || k === 'constructor') return;
|
|
188
|
-
if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
|
|
189
|
-
const existing = target[k];
|
|
190
|
-
target[k] = merge(
|
|
191
|
-
existing !== null &&
|
|
192
|
-
typeof existing === 'object' &&
|
|
193
|
-
!Array.isArray(existing)
|
|
194
|
-
? existing
|
|
195
|
-
: Object.create(null),
|
|
196
|
-
val,
|
|
197
|
-
mergeUndefined
|
|
198
|
-
);
|
|
199
|
-
} else target[k] = val;
|
|
200
|
-
});
|
|
201
|
-
return target;
|
|
202
|
-
}
|
|
203
|
-
function rmv(t, path, preserveEmpty = false) {
|
|
204
|
-
const keys = parse(path);
|
|
205
|
-
const remove = (obj, i = 0) => {
|
|
206
|
-
const key = keys[i];
|
|
207
|
-
if (!obj || typeof obj !== 'object') return false;
|
|
208
|
-
if (i === keys.length - 1) return delete obj[key];
|
|
209
|
-
if (!remove(obj[key], i + 1)) return false;
|
|
210
|
-
if (!preserveEmpty) {
|
|
211
|
-
const val = obj[key];
|
|
212
|
-
if (
|
|
213
|
-
typeof val === 'object' &&
|
|
214
|
-
((Array.isArray(val) && val.every((v) => v == null)) ||
|
|
215
|
-
(!Array.isArray(val) && Object.keys(val).length === 0))
|
|
216
|
-
)
|
|
217
|
-
delete obj[key];
|
|
177
|
+
if (n == null)
|
|
178
|
+
n = cur[k] =
|
|
179
|
+
typeof keys[i + 1] === 'number' ? [] : Object.create(null);
|
|
180
|
+
cur = n;
|
|
218
181
|
}
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
182
|
+
cur[keys[keys.length - 1]] = value;
|
|
183
|
+
return root;
|
|
184
|
+
}
|
|
185
|
+
static rmv(t, path, preserveEmpty = false) {
|
|
186
|
+
const keys = DeepMerge.parse(path);
|
|
187
|
+
const remove = (obj, i = 0) => {
|
|
188
|
+
const key = keys[i];
|
|
189
|
+
if (!obj || typeof obj !== 'object') return false;
|
|
190
|
+
if (i === keys.length - 1) return delete obj[key];
|
|
191
|
+
if (!remove(obj[key], i + 1)) return false;
|
|
192
|
+
if (!preserveEmpty) {
|
|
193
|
+
const val = obj[key];
|
|
194
|
+
let empty = true;
|
|
195
|
+
if (typeof val === 'object') {
|
|
196
|
+
if (Array.isArray(val))
|
|
197
|
+
for (let i = 0; i < val.length; i++) {
|
|
198
|
+
if (val[i] != null) {
|
|
199
|
+
empty = false;
|
|
200
|
+
break;
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
else empty = false;
|
|
204
|
+
}
|
|
205
|
+
if (empty) delete obj[key];
|
|
206
|
+
}
|
|
207
|
+
return true;
|
|
208
|
+
};
|
|
209
|
+
remove(t);
|
|
210
|
+
return t;
|
|
211
|
+
}
|
|
212
|
+
static merge(
|
|
213
|
+
t = Object.create(null),
|
|
214
|
+
o = Object.create(null),
|
|
215
|
+
mergeUndefined = false
|
|
216
|
+
) {
|
|
217
|
+
const target = t ?? Object.create(null);
|
|
218
|
+
for (const k in o) {
|
|
219
|
+
const val = o[k];
|
|
220
|
+
if (!mergeUndefined && val === undefined) continue;
|
|
221
|
+
if (k === '__proto__' || k === 'constructor') continue;
|
|
222
|
+
if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
|
|
223
|
+
const existing = target[k];
|
|
224
|
+
target[k] = DeepMerge.merge(
|
|
225
|
+
existing !== null &&
|
|
226
|
+
typeof existing === 'object' &&
|
|
227
|
+
!Array.isArray(existing)
|
|
228
|
+
? existing
|
|
229
|
+
: Object.create(null),
|
|
230
|
+
val,
|
|
231
|
+
mergeUndefined
|
|
232
|
+
);
|
|
233
|
+
} else target[k] = val;
|
|
234
|
+
}
|
|
235
|
+
return target;
|
|
236
|
+
}
|
|
223
237
|
}
|
|
224
238
|
|
|
225
|
-
var DeepMerge = /*#__PURE__*/ Object.freeze({
|
|
226
|
-
__proto__: null,
|
|
227
|
-
get: get,
|
|
228
|
-
has: has,
|
|
229
|
-
merge: merge,
|
|
230
|
-
rmv: rmv,
|
|
231
|
-
set: set
|
|
232
|
-
});
|
|
233
|
-
|
|
234
239
|
class DiffChecker {
|
|
235
240
|
a;
|
|
236
241
|
b;
|
|
@@ -530,20 +535,33 @@
|
|
|
530
535
|
}
|
|
531
536
|
|
|
532
537
|
class Filter {
|
|
538
|
+
static IDENTITY = (s) => s;
|
|
533
539
|
static filters = new Map();
|
|
534
540
|
static pipeline = new Map();
|
|
535
|
-
static getPipeline(hook) {
|
|
541
|
+
static getPipeline(hook, force = false) {
|
|
536
542
|
return ErrorUtil.wrap(
|
|
537
543
|
() => {
|
|
538
|
-
|
|
539
|
-
|
|
544
|
+
if (!force) {
|
|
545
|
+
const cached = Filter.pipeline.get(hook);
|
|
546
|
+
if (cached) return cached;
|
|
547
|
+
}
|
|
540
548
|
const filter = Filter.filters.get(hook);
|
|
541
|
-
if (!filter)
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
const
|
|
549
|
+
if (!filter) {
|
|
550
|
+
Filter.pipeline.set(hook, Filter.IDENTITY);
|
|
551
|
+
return Filter.IDENTITY;
|
|
552
|
+
}
|
|
553
|
+
const pipeline = [];
|
|
554
|
+
for (const f of filter.values()) if (f.active) pipeline.push(f);
|
|
555
|
+
pipeline.sort((a, b) => a.priority - b.priority);
|
|
556
|
+
const fn =
|
|
557
|
+
pipeline.length === 0
|
|
558
|
+
? Filter.IDENTITY
|
|
559
|
+
: (input) => {
|
|
560
|
+
let v = input;
|
|
561
|
+
for (let i = 0; i < pipeline.length; i++)
|
|
562
|
+
v = pipeline[i].fn(v);
|
|
563
|
+
return v;
|
|
564
|
+
};
|
|
547
565
|
Filter.pipeline.set(hook, fn);
|
|
548
566
|
return fn;
|
|
549
567
|
},
|
|
@@ -561,9 +579,16 @@
|
|
|
561
579
|
const filter = Filter.filters.get(hook) ?? new Map();
|
|
562
580
|
const index = filter.get(id);
|
|
563
581
|
if (index && !index.overrideable) return false;
|
|
582
|
+
if (
|
|
583
|
+
index &&
|
|
584
|
+
index.fn === fn &&
|
|
585
|
+
index.priority === priority &&
|
|
586
|
+
index.active === active
|
|
587
|
+
)
|
|
588
|
+
return true;
|
|
564
589
|
filter.set(id, { id, fn, priority, active, overrideable });
|
|
565
590
|
Filter.filters.set(hook, filter);
|
|
566
|
-
Filter.
|
|
591
|
+
Filter.getPipeline(hook, true);
|
|
567
592
|
return true;
|
|
568
593
|
},
|
|
569
594
|
`Error adding filter <${id}> to hook <${hook}>`,
|
|
@@ -571,19 +596,28 @@
|
|
|
571
596
|
);
|
|
572
597
|
}
|
|
573
598
|
static remove(hook, id) {
|
|
574
|
-
Filter.pipeline.delete(hook);
|
|
575
599
|
const filter = Filter.filters.get(hook);
|
|
576
|
-
|
|
600
|
+
if (!filter || !filter.delete(id)) return false;
|
|
601
|
+
Filter.getPipeline(hook, true);
|
|
602
|
+
return true;
|
|
577
603
|
}
|
|
578
604
|
static pause(hook, id) {
|
|
579
|
-
Filter.
|
|
580
|
-
|
|
581
|
-
|
|
605
|
+
const filter = Filter.filters.get(hook);
|
|
606
|
+
if (!filter) return false;
|
|
607
|
+
const f = filter.get(id);
|
|
608
|
+
if (!f || !f.active) return false;
|
|
609
|
+
f.active = false;
|
|
610
|
+
Filter.getPipeline(hook, true);
|
|
611
|
+
return true;
|
|
582
612
|
}
|
|
583
613
|
static resume(hook, id) {
|
|
584
|
-
Filter.
|
|
585
|
-
|
|
586
|
-
|
|
614
|
+
const filter = Filter.filters.get(hook);
|
|
615
|
+
if (!filter) return false;
|
|
616
|
+
const f = filter.get(id);
|
|
617
|
+
if (!f || f.active) return false;
|
|
618
|
+
f.active = true;
|
|
619
|
+
Filter.getPipeline(hook, true);
|
|
620
|
+
return true;
|
|
587
621
|
}
|
|
588
622
|
static list(hook, active = false) {
|
|
589
623
|
const filter = Filter.filters.get(hook);
|
|
@@ -596,7 +630,11 @@
|
|
|
596
630
|
return ErrorUtil.wrap(
|
|
597
631
|
() => {
|
|
598
632
|
const fn = Filter.getPipeline(hook);
|
|
599
|
-
|
|
633
|
+
if (typeof input === 'string') return fn(input);
|
|
634
|
+
const arr = input;
|
|
635
|
+
const out = new Array(arr.length);
|
|
636
|
+
for (let i = 0; i < arr.length; i++) out[i] = fn(arr[i]);
|
|
637
|
+
return out;
|
|
600
638
|
},
|
|
601
639
|
`Error applying filters for hook <${hook}>`,
|
|
602
640
|
{ hook, input }
|
|
@@ -606,16 +644,19 @@
|
|
|
606
644
|
return ErrorUtil.wrapAsync(
|
|
607
645
|
async () => {
|
|
608
646
|
const fn = Filter.getPipeline(hook);
|
|
609
|
-
return
|
|
610
|
-
|
|
611
|
-
|
|
647
|
+
if (typeof input === 'string') return Promise.resolve(fn(input));
|
|
648
|
+
const arr = input;
|
|
649
|
+
const out = new Array(arr.length);
|
|
650
|
+
for (let i = 0; i < arr.length; i++)
|
|
651
|
+
out[i] = Promise.resolve(fn(arr[i]));
|
|
652
|
+
return Promise.all(out);
|
|
612
653
|
},
|
|
613
654
|
`Error applying filters for hook <${hook}>`,
|
|
614
655
|
{ hook, input }
|
|
615
656
|
);
|
|
616
657
|
}
|
|
617
658
|
static clear(hook) {
|
|
618
|
-
Filter.
|
|
659
|
+
Filter.clearPipeline();
|
|
619
660
|
if (hook) Filter.filters.delete(hook);
|
|
620
661
|
else Filter.filters.clear();
|
|
621
662
|
}
|
|
@@ -629,25 +670,21 @@
|
|
|
629
670
|
static HASH_OFFSET = 0x811c9dc5;
|
|
630
671
|
static fastFNV1a(str) {
|
|
631
672
|
const len = str.length;
|
|
673
|
+
const limit = len & -4;
|
|
632
674
|
let hash = this.HASH_OFFSET;
|
|
633
|
-
|
|
634
|
-
for (
|
|
635
|
-
const pos = i * 4;
|
|
675
|
+
let i = 0;
|
|
676
|
+
for (; i < limit; i += 4) {
|
|
636
677
|
const chunk =
|
|
637
|
-
str.charCodeAt(
|
|
638
|
-
(str.charCodeAt(
|
|
639
|
-
(str.charCodeAt(
|
|
640
|
-
(str.charCodeAt(
|
|
678
|
+
str.charCodeAt(i) |
|
|
679
|
+
(str.charCodeAt(i + 1) << 8) |
|
|
680
|
+
(str.charCodeAt(i + 2) << 16) |
|
|
681
|
+
(str.charCodeAt(i + 3) << 24);
|
|
641
682
|
hash ^= chunk;
|
|
642
683
|
hash = Math.imul(hash, this.FNV_PRIME);
|
|
643
684
|
}
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
for (let i = 0; i < remaining; i++) {
|
|
648
|
-
hash ^= str.charCodeAt(pos + i);
|
|
649
|
-
hash = Math.imul(hash, this.FNV_PRIME);
|
|
650
|
-
}
|
|
685
|
+
for (; i < len; i++) {
|
|
686
|
+
hash ^= str.charCodeAt(i);
|
|
687
|
+
hash = Math.imul(hash, this.FNV_PRIME);
|
|
651
688
|
}
|
|
652
689
|
hash ^= hash >>> 16;
|
|
653
690
|
hash *= 0x85ebca6b;
|
|
@@ -658,32 +695,51 @@
|
|
|
658
695
|
}
|
|
659
696
|
}
|
|
660
697
|
class HashTable {
|
|
661
|
-
|
|
698
|
+
FIFO;
|
|
699
|
+
maxSize;
|
|
662
700
|
static MAX_LEN = 2048;
|
|
663
|
-
static TABLE_SIZE = 10_000;
|
|
664
701
|
table = new Map();
|
|
665
|
-
constructor(
|
|
666
|
-
this.
|
|
702
|
+
constructor(FIFO = true, maxSize = 10000) {
|
|
703
|
+
this.FIFO = FIFO;
|
|
704
|
+
this.maxSize = maxSize;
|
|
667
705
|
}
|
|
668
706
|
key(label, strs, sorted = false) {
|
|
669
|
-
|
|
670
|
-
const hashes =
|
|
671
|
-
|
|
707
|
+
const n = strs.length;
|
|
708
|
+
const hashes = new Array(n);
|
|
709
|
+
for (let i = 0; i < n; i++) {
|
|
710
|
+
const s = strs[i];
|
|
711
|
+
if (s.length > HashTable.MAX_LEN) return false;
|
|
712
|
+
hashes[i] = Hasher.fastFNV1a(s);
|
|
713
|
+
}
|
|
714
|
+
if (sorted) hashes.sort((a, b) => a - b);
|
|
715
|
+
let key = label;
|
|
716
|
+
for (let i = 0; i < hashes.length; i++) key += '-' + hashes[i];
|
|
717
|
+
return key;
|
|
718
|
+
}
|
|
719
|
+
has(key) {
|
|
720
|
+
return this.table.has(key);
|
|
721
|
+
}
|
|
722
|
+
get(key) {
|
|
723
|
+
return this.table.get(key);
|
|
672
724
|
}
|
|
673
|
-
has = (key) => this.table.has(key);
|
|
674
|
-
get = (key) => this.table.get(key);
|
|
675
725
|
set(key, entry, update = true) {
|
|
676
726
|
if (!update && this.table.has(key)) return false;
|
|
677
|
-
|
|
678
|
-
if (!this.
|
|
727
|
+
if (!this.table.has(key) && this.table.size >= this.maxSize) {
|
|
728
|
+
if (!this.FIFO) return false;
|
|
679
729
|
this.table.delete(this.table.keys().next().value);
|
|
680
730
|
}
|
|
681
731
|
this.table.set(key, entry);
|
|
682
732
|
return true;
|
|
683
733
|
}
|
|
684
|
-
delete
|
|
685
|
-
|
|
686
|
-
|
|
734
|
+
delete(key) {
|
|
735
|
+
return this.table.delete(key);
|
|
736
|
+
}
|
|
737
|
+
clear() {
|
|
738
|
+
this.table.clear();
|
|
739
|
+
}
|
|
740
|
+
size() {
|
|
741
|
+
return this.table.size;
|
|
742
|
+
}
|
|
687
743
|
}
|
|
688
744
|
|
|
689
745
|
class Normalizer {
|
|
@@ -702,25 +758,49 @@
|
|
|
702
758
|
static getPipeline(flags) {
|
|
703
759
|
return ErrorUtil.wrap(
|
|
704
760
|
() => {
|
|
705
|
-
|
|
706
|
-
|
|
761
|
+
const cached = Normalizer.pipeline.get(flags);
|
|
762
|
+
if (cached) return cached;
|
|
707
763
|
const { REGEX } = Normalizer;
|
|
708
|
-
const steps = [
|
|
709
|
-
|
|
710
|
-
[
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
764
|
+
const steps = [];
|
|
765
|
+
for (let i = 0; i < flags.length; i++) {
|
|
766
|
+
switch (flags[i]) {
|
|
767
|
+
case 'd':
|
|
768
|
+
steps.push((s) => s.normalize('NFD'));
|
|
769
|
+
break;
|
|
770
|
+
case 'i':
|
|
771
|
+
steps.push((s) => s.toLowerCase());
|
|
772
|
+
break;
|
|
773
|
+
case 'k':
|
|
774
|
+
steps.push((s) => s.replace(REGEX.nonLetters, ''));
|
|
775
|
+
break;
|
|
776
|
+
case 'n':
|
|
777
|
+
steps.push((s) => s.replace(REGEX.nonNumbers, ''));
|
|
778
|
+
break;
|
|
779
|
+
case 'r':
|
|
780
|
+
steps.push((s) => s.replace(REGEX.doubleChars, '$1'));
|
|
781
|
+
break;
|
|
782
|
+
case 's':
|
|
783
|
+
steps.push((s) => s.replace(REGEX.specialChars, ''));
|
|
784
|
+
break;
|
|
785
|
+
case 't':
|
|
786
|
+
steps.push((s) => s.trim());
|
|
787
|
+
break;
|
|
788
|
+
case 'u':
|
|
789
|
+
steps.push((s) => s.normalize('NFC'));
|
|
790
|
+
break;
|
|
791
|
+
case 'w':
|
|
792
|
+
steps.push((s) => s.replace(REGEX.whitespace, ' '));
|
|
793
|
+
break;
|
|
794
|
+
case 'x':
|
|
795
|
+
steps.push((s) => s.normalize('NFKC'));
|
|
796
|
+
break;
|
|
797
|
+
}
|
|
798
|
+
}
|
|
799
|
+
const fn = (input) => {
|
|
800
|
+
let v = input;
|
|
801
|
+
for (let i = 0; i < steps.length; i++) v = steps[i](v);
|
|
802
|
+
return v;
|
|
803
|
+
};
|
|
724
804
|
Normalizer.pipeline.set(flags, fn);
|
|
725
805
|
return fn;
|
|
726
806
|
},
|
|
@@ -728,19 +808,23 @@
|
|
|
728
808
|
{ flags }
|
|
729
809
|
);
|
|
730
810
|
}
|
|
731
|
-
static normalize(input, flags) {
|
|
811
|
+
static normalize(input, flags, normalizedFlags) {
|
|
732
812
|
return ErrorUtil.wrap(
|
|
733
813
|
() => {
|
|
734
814
|
if (!flags || typeof flags !== 'string' || !input) return input;
|
|
735
|
-
flags = this.canonicalFlags(flags);
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
815
|
+
flags = normalizedFlags ?? this.canonicalFlags(flags);
|
|
816
|
+
const pipeline = Normalizer.getPipeline(flags);
|
|
817
|
+
const normalizeOne = (s) => {
|
|
818
|
+
const key = Normalizer.cache.key(flags, [s]);
|
|
819
|
+
if (key && Normalizer.cache.has(key))
|
|
820
|
+
return Normalizer.cache.get(key);
|
|
821
|
+
const res = pipeline(s);
|
|
822
|
+
if (key) Normalizer.cache.set(key, res);
|
|
823
|
+
return res;
|
|
824
|
+
};
|
|
825
|
+
return Array.isArray(input)
|
|
826
|
+
? input.map(normalizeOne)
|
|
827
|
+
: normalizeOne(input);
|
|
744
828
|
},
|
|
745
829
|
`Failed to normalize input with flags: ${flags}`,
|
|
746
830
|
{ input, flags }
|
|
@@ -764,17 +848,144 @@
|
|
|
764
848
|
}
|
|
765
849
|
}
|
|
766
850
|
|
|
851
|
+
class RingPool {
|
|
852
|
+
maxSize;
|
|
853
|
+
buffers = [];
|
|
854
|
+
pointer = 0;
|
|
855
|
+
constructor(maxSize) {
|
|
856
|
+
this.maxSize = maxSize;
|
|
857
|
+
}
|
|
858
|
+
acquire(minSize, allowOversize) {
|
|
859
|
+
return ErrorUtil.wrap(
|
|
860
|
+
() => {
|
|
861
|
+
const buffers = this.buffers;
|
|
862
|
+
const len = buffers.length;
|
|
863
|
+
for (let i = 0; i < len; i++) {
|
|
864
|
+
const idx = (this.pointer + i) % len;
|
|
865
|
+
const item = buffers[idx];
|
|
866
|
+
const size = item.size;
|
|
867
|
+
if (size >= minSize && (allowOversize || size === minSize)) {
|
|
868
|
+
this.pointer = (idx + 1) % len;
|
|
869
|
+
return item;
|
|
870
|
+
}
|
|
871
|
+
}
|
|
872
|
+
return null;
|
|
873
|
+
},
|
|
874
|
+
`Failed to acquire buffer of size >= ${minSize} from pool`,
|
|
875
|
+
{ minSize, allowOversize }
|
|
876
|
+
);
|
|
877
|
+
}
|
|
878
|
+
release(item) {
|
|
879
|
+
ErrorUtil.wrap(
|
|
880
|
+
() => {
|
|
881
|
+
const buffers = this.buffers;
|
|
882
|
+
if (buffers.length < this.maxSize) {
|
|
883
|
+
buffers.push(item);
|
|
884
|
+
return;
|
|
885
|
+
}
|
|
886
|
+
buffers[this.pointer] = item;
|
|
887
|
+
this.pointer = (this.pointer + 1) % this.maxSize;
|
|
888
|
+
},
|
|
889
|
+
`Failed to release buffer back to pool`,
|
|
890
|
+
{ item }
|
|
891
|
+
);
|
|
892
|
+
}
|
|
893
|
+
clear() {
|
|
894
|
+
this.buffers = [];
|
|
895
|
+
this.pointer = 0;
|
|
896
|
+
}
|
|
897
|
+
}
|
|
898
|
+
class Pool {
|
|
899
|
+
static CONFIG = {
|
|
900
|
+
int32: {
|
|
901
|
+
type: 'int32',
|
|
902
|
+
maxSize: 64,
|
|
903
|
+
maxItemSize: 2048,
|
|
904
|
+
allowOversize: true
|
|
905
|
+
},
|
|
906
|
+
'arr[]': {
|
|
907
|
+
type: 'arr[]',
|
|
908
|
+
maxSize: 4,
|
|
909
|
+
maxItemSize: 1024,
|
|
910
|
+
allowOversize: false
|
|
911
|
+
},
|
|
912
|
+
'number[]': {
|
|
913
|
+
type: 'number[]',
|
|
914
|
+
maxSize: 16,
|
|
915
|
+
maxItemSize: 1024,
|
|
916
|
+
allowOversize: false
|
|
917
|
+
},
|
|
918
|
+
'string[]': {
|
|
919
|
+
type: 'string[]',
|
|
920
|
+
maxSize: 2,
|
|
921
|
+
maxItemSize: 1024,
|
|
922
|
+
allowOversize: false
|
|
923
|
+
},
|
|
924
|
+
set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
|
|
925
|
+
map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
|
|
926
|
+
};
|
|
927
|
+
static POOLS = {
|
|
928
|
+
int32: new RingPool(64),
|
|
929
|
+
'arr[]': new RingPool(4),
|
|
930
|
+
'number[]': new RingPool(16),
|
|
931
|
+
'string[]': new RingPool(2),
|
|
932
|
+
set: new RingPool(8),
|
|
933
|
+
map: new RingPool(8)
|
|
934
|
+
};
|
|
935
|
+
static allocate(type, size) {
|
|
936
|
+
switch (type) {
|
|
937
|
+
case 'int32':
|
|
938
|
+
return new Int32Array(size);
|
|
939
|
+
case 'arr[]':
|
|
940
|
+
return new Array(size);
|
|
941
|
+
case 'number[]':
|
|
942
|
+
return new Float64Array(size);
|
|
943
|
+
case 'string[]':
|
|
944
|
+
return new Array(size);
|
|
945
|
+
case 'set':
|
|
946
|
+
return new Set();
|
|
947
|
+
case 'map':
|
|
948
|
+
return new Map();
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
static acquire(type, size) {
|
|
952
|
+
const CONFIG = this.CONFIG[type];
|
|
953
|
+
if (!CONFIG)
|
|
954
|
+
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
955
|
+
if (size > CONFIG.maxItemSize) return this.allocate(type, size);
|
|
956
|
+
const item = this.POOLS[type].acquire(size, CONFIG.allowOversize);
|
|
957
|
+
if (item)
|
|
958
|
+
return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
|
|
959
|
+
return this.allocate(type, size);
|
|
960
|
+
}
|
|
961
|
+
static acquireMany(type, sizes) {
|
|
962
|
+
const out = new Array(sizes.length);
|
|
963
|
+
for (let i = 0; i < sizes.length; i++)
|
|
964
|
+
out[i] = this.acquire(type, sizes[i]);
|
|
965
|
+
return out;
|
|
966
|
+
}
|
|
967
|
+
static release(type, buffer, size) {
|
|
968
|
+
const CONFIG = this.CONFIG[type];
|
|
969
|
+
if (!CONFIG)
|
|
970
|
+
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
971
|
+
if (size <= CONFIG.maxItemSize)
|
|
972
|
+
this.POOLS[type].release({ buffer, size });
|
|
973
|
+
}
|
|
974
|
+
}
|
|
975
|
+
|
|
767
976
|
class Profiler {
|
|
768
977
|
active;
|
|
769
978
|
static ENV;
|
|
770
979
|
static instance;
|
|
771
980
|
nowFn;
|
|
772
981
|
memFn;
|
|
773
|
-
store =
|
|
982
|
+
store = [];
|
|
983
|
+
last;
|
|
774
984
|
totalTime = 0;
|
|
775
985
|
totalMem = 0;
|
|
776
986
|
static detectEnv() {
|
|
777
|
-
if (typeof process !== 'undefined'
|
|
987
|
+
if (typeof process !== 'undefined' && process.versions?.node)
|
|
988
|
+
Profiler.ENV = 'nodejs';
|
|
778
989
|
else if (typeof performance !== 'undefined') Profiler.ENV = 'browser';
|
|
779
990
|
else Profiler.ENV = 'unknown';
|
|
780
991
|
}
|
|
@@ -786,7 +997,7 @@
|
|
|
786
997
|
this.active = active;
|
|
787
998
|
switch (Profiler.ENV) {
|
|
788
999
|
case 'nodejs':
|
|
789
|
-
this.nowFn = () => Number(process.hrtime.bigint())
|
|
1000
|
+
this.nowFn = () => Number(process.hrtime.bigint()) * 1e-6;
|
|
790
1001
|
this.memFn = () => process.memoryUsage().heapUsed;
|
|
791
1002
|
break;
|
|
792
1003
|
case 'browser':
|
|
@@ -799,40 +1010,52 @@
|
|
|
799
1010
|
break;
|
|
800
1011
|
}
|
|
801
1012
|
}
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
startMem = this.mem();
|
|
807
|
-
const res = fn();
|
|
808
|
-
const deltaTime = this.now() - startTime,
|
|
809
|
-
deltaMem = this.mem() - startMem;
|
|
810
|
-
this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
|
|
811
|
-
((this.totalTime += deltaTime), (this.totalMem += deltaMem));
|
|
812
|
-
return res;
|
|
1013
|
+
storeRes(entry) {
|
|
1014
|
+
this.store.push((this.last = entry));
|
|
1015
|
+
this.totalTime += entry.time;
|
|
1016
|
+
this.totalMem += entry.mem;
|
|
813
1017
|
}
|
|
814
|
-
enable
|
|
1018
|
+
enable() {
|
|
815
1019
|
this.active = true;
|
|
816
|
-
}
|
|
817
|
-
disable
|
|
1020
|
+
}
|
|
1021
|
+
disable() {
|
|
818
1022
|
this.active = false;
|
|
819
|
-
}
|
|
1023
|
+
}
|
|
820
1024
|
clear() {
|
|
821
|
-
this.store.
|
|
1025
|
+
this.store.length = 0;
|
|
1026
|
+
this.last = undefined;
|
|
822
1027
|
this.totalTime = 0;
|
|
823
1028
|
this.totalMem = 0;
|
|
824
1029
|
}
|
|
825
1030
|
run(fn, meta = {}) {
|
|
826
|
-
|
|
1031
|
+
if (!this.active) return fn();
|
|
1032
|
+
const startTime = this.nowFn(),
|
|
1033
|
+
startMem = this.memFn();
|
|
1034
|
+
const res = fn();
|
|
1035
|
+
const deltaTime = this.nowFn() - startTime,
|
|
1036
|
+
deltaMem = this.memFn() - startMem;
|
|
1037
|
+
this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
|
|
1038
|
+
return res;
|
|
827
1039
|
}
|
|
828
1040
|
async runAsync(fn, meta = {}) {
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
1041
|
+
if (!this.active) return fn();
|
|
1042
|
+
const startTime = this.nowFn(),
|
|
1043
|
+
startMem = this.memFn();
|
|
1044
|
+
const res = await fn();
|
|
1045
|
+
const deltaTime = this.nowFn() - startTime,
|
|
1046
|
+
deltaMem = this.memFn() - startMem;
|
|
1047
|
+
this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
|
|
1048
|
+
return res;
|
|
1049
|
+
}
|
|
1050
|
+
getAll() {
|
|
1051
|
+
return [...this.store];
|
|
1052
|
+
}
|
|
1053
|
+
getLast() {
|
|
1054
|
+
return this.last;
|
|
1055
|
+
}
|
|
1056
|
+
getTotal() {
|
|
1057
|
+
return { time: this.totalTime, mem: this.totalMem };
|
|
832
1058
|
}
|
|
833
|
-
getAll = () => [...this.store];
|
|
834
|
-
getLast = () => this.getAll().pop();
|
|
835
|
-
getTotal = () => ({ time: this.totalTime, mem: this.totalMem });
|
|
836
1059
|
services = Object.freeze({
|
|
837
1060
|
enable: this.enable.bind(this),
|
|
838
1061
|
disable: this.disable.bind(this),
|
|
@@ -908,1278 +1131,841 @@
|
|
|
908
1131
|
throw new CmpStrNotFoundError(`Registry <${reg}> does not exist`, {
|
|
909
1132
|
registry: reg
|
|
910
1133
|
});
|
|
911
|
-
return typeof cls === 'string' ? registry[reg]
|
|
1134
|
+
return typeof cls === 'string' ? registry[reg].get(cls) : cls;
|
|
912
1135
|
}
|
|
913
1136
|
function createFromRegistry(reg, cls, ...args) {
|
|
914
|
-
|
|
1137
|
+
const ctor = resolveCls(reg, cls);
|
|
915
1138
|
return ErrorUtil.wrap(
|
|
916
|
-
() => new
|
|
917
|
-
`Failed to create instance of class <${
|
|
1139
|
+
() => new ctor(...args),
|
|
1140
|
+
`Failed to create instance of class <${ctor.name ?? cls}> from registry <${reg}>`,
|
|
918
1141
|
{ registry: reg, class: cls, args }
|
|
919
1142
|
);
|
|
920
1143
|
}
|
|
921
1144
|
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
1145
|
+
const profiler$2 = Profiler.getInstance();
|
|
1146
|
+
class Metric {
|
|
1147
|
+
static cache = new HashTable();
|
|
1148
|
+
metric;
|
|
1149
|
+
a;
|
|
1150
|
+
b;
|
|
1151
|
+
origA = [];
|
|
1152
|
+
origB = [];
|
|
1153
|
+
options;
|
|
1154
|
+
optKey;
|
|
1155
|
+
symmetric;
|
|
1156
|
+
results;
|
|
1157
|
+
static clear() {
|
|
1158
|
+
this.cache.clear();
|
|
928
1159
|
}
|
|
929
|
-
|
|
930
|
-
return
|
|
931
|
-
() => {
|
|
932
|
-
const len = this.buffers.length;
|
|
933
|
-
for (let i = 0; i < len; i++) {
|
|
934
|
-
const idx = (this.pointer + i) & (len - 1);
|
|
935
|
-
const item = this.buffers[idx];
|
|
936
|
-
if (
|
|
937
|
-
item.size >= minSize &&
|
|
938
|
-
(allowOversize || item.size === minSize)
|
|
939
|
-
) {
|
|
940
|
-
this.pointer = (idx + 1) & (len - 1);
|
|
941
|
-
return item;
|
|
942
|
-
}
|
|
943
|
-
}
|
|
944
|
-
return null;
|
|
945
|
-
},
|
|
946
|
-
`Failed to acquire buffer of size >= ${minSize} from pool`,
|
|
947
|
-
{ minSize, allowOversize }
|
|
948
|
-
);
|
|
1160
|
+
static swap(a, b, m, n) {
|
|
1161
|
+
return m > n ? [b, a, n, m] : [a, b, m, n];
|
|
949
1162
|
}
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
1163
|
+
static clamp(res) {
|
|
1164
|
+
return Math.max(0, Math.min(1, res));
|
|
1165
|
+
}
|
|
1166
|
+
constructor(metric, a, b, opt = {}, symmetric = false) {
|
|
1167
|
+
this.metric = metric;
|
|
1168
|
+
this.a = Array.isArray(a) ? a : [a];
|
|
1169
|
+
this.b = Array.isArray(b) ? b : [b];
|
|
1170
|
+
ErrorUtil.assert(
|
|
1171
|
+
this.a.length > 0 && this.b.length > 0,
|
|
1172
|
+
`Inputs <a> and <b> must not be empty`,
|
|
1173
|
+
{ a: this.a, b: this.b }
|
|
960
1174
|
);
|
|
1175
|
+
this.options = opt;
|
|
1176
|
+
this.optKey = Hasher.fastFNV1a(
|
|
1177
|
+
JSON.stringify(opt, Object.keys(opt).sort())
|
|
1178
|
+
).toString();
|
|
1179
|
+
this.symmetric = symmetric;
|
|
961
1180
|
}
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
1181
|
+
preCompute(a, b, m, n) {
|
|
1182
|
+
if (a === b) return { res: 1 };
|
|
1183
|
+
if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
|
|
1184
|
+
return undefined;
|
|
965
1185
|
}
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
type: 'int32',
|
|
971
|
-
maxSize: 64,
|
|
972
|
-
maxItemSize: 2048,
|
|
973
|
-
allowOversize: true
|
|
974
|
-
},
|
|
975
|
-
'number[]': {
|
|
976
|
-
type: 'number[]',
|
|
977
|
-
maxSize: 16,
|
|
978
|
-
maxItemSize: 1024,
|
|
979
|
-
allowOversize: false
|
|
980
|
-
},
|
|
981
|
-
'string[]': {
|
|
982
|
-
type: 'string[]',
|
|
983
|
-
maxSize: 2,
|
|
984
|
-
maxItemSize: 1024,
|
|
985
|
-
allowOversize: false
|
|
986
|
-
},
|
|
987
|
-
set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
|
|
988
|
-
map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
|
|
989
|
-
};
|
|
990
|
-
static POOLS = {
|
|
991
|
-
int32: new RingPool(64),
|
|
992
|
-
'number[]': new RingPool(16),
|
|
993
|
-
'string[]': new RingPool(2),
|
|
994
|
-
set: new RingPool(8),
|
|
995
|
-
map: new RingPool(8)
|
|
996
|
-
};
|
|
997
|
-
static allocate(type, size) {
|
|
998
|
-
switch (type) {
|
|
999
|
-
case 'int32':
|
|
1000
|
-
return new Int32Array(size);
|
|
1001
|
-
case 'number[]':
|
|
1002
|
-
return new Float64Array(size);
|
|
1003
|
-
case 'string[]':
|
|
1004
|
-
return new Array(size);
|
|
1005
|
-
case 'set':
|
|
1006
|
-
return new Set();
|
|
1007
|
-
case 'map':
|
|
1008
|
-
return new Map();
|
|
1009
|
-
}
|
|
1010
|
-
}
|
|
1011
|
-
static acquire(type, size) {
|
|
1012
|
-
const CONFIG = this.CONFIG[type];
|
|
1013
|
-
if (!CONFIG)
|
|
1014
|
-
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
1015
|
-
if (size > CONFIG.maxItemSize) return this.allocate(type, size);
|
|
1016
|
-
const item = this.POOLS[type].acquire(size, CONFIG.allowOversize);
|
|
1017
|
-
if (item)
|
|
1018
|
-
return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
|
|
1019
|
-
return this.allocate(type, size);
|
|
1186
|
+
compute(a, b, m, n, maxLen) {
|
|
1187
|
+
throw new CmpStrInternalError(
|
|
1188
|
+
`Method compute() must be overridden in a subclass`
|
|
1189
|
+
);
|
|
1020
1190
|
}
|
|
1021
|
-
|
|
1022
|
-
return
|
|
1191
|
+
runSingle(i, j) {
|
|
1192
|
+
return ErrorUtil.wrap(
|
|
1193
|
+
() => {
|
|
1194
|
+
let a = String(this.a[i]),
|
|
1195
|
+
A = a;
|
|
1196
|
+
let b = String(this.b[j]),
|
|
1197
|
+
B = b;
|
|
1198
|
+
let m = A.length,
|
|
1199
|
+
n = B.length;
|
|
1200
|
+
let result = this.preCompute(A, B, m, n);
|
|
1201
|
+
if (!result) {
|
|
1202
|
+
result = profiler$2.run(() => {
|
|
1203
|
+
if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
|
|
1204
|
+
let key = Metric.cache.key(this.metric, [A, B], this.symmetric);
|
|
1205
|
+
if (key) key += this.optKey;
|
|
1206
|
+
return (
|
|
1207
|
+
Metric.cache.get(key || '') ??
|
|
1208
|
+
(() => {
|
|
1209
|
+
const maxLen = m > n ? m : n;
|
|
1210
|
+
const res = this.compute(A, B, m, n, maxLen);
|
|
1211
|
+
if (key) Metric.cache.set(key, res);
|
|
1212
|
+
return res;
|
|
1213
|
+
})()
|
|
1214
|
+
);
|
|
1215
|
+
});
|
|
1216
|
+
}
|
|
1217
|
+
return {
|
|
1218
|
+
metric: this.metric,
|
|
1219
|
+
a: this.origA.length > i ? this.origA[i] : a,
|
|
1220
|
+
b: this.origB.length > j ? this.origB[j] : b,
|
|
1221
|
+
...result
|
|
1222
|
+
};
|
|
1223
|
+
},
|
|
1224
|
+
`Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
|
|
1225
|
+
{ i, j }
|
|
1226
|
+
);
|
|
1023
1227
|
}
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
if (!CONFIG)
|
|
1027
|
-
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
1028
|
-
if (size <= CONFIG.maxItemSize)
|
|
1029
|
-
this.POOLS[type].release({ buffer, size });
|
|
1228
|
+
async runSingleAsync(i, j) {
|
|
1229
|
+
return Promise.resolve(this.runSingle(i, j));
|
|
1030
1230
|
}
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
return new StructuredData(data, key);
|
|
1231
|
+
runBatch() {
|
|
1232
|
+
const results = [];
|
|
1233
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1234
|
+
for (let j = 0; j < this.b.length; j++)
|
|
1235
|
+
results.push(this.runSingle(i, j));
|
|
1236
|
+
this.results = results;
|
|
1038
1237
|
}
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1238
|
+
async runBatchAsync() {
|
|
1239
|
+
const tasks = [];
|
|
1240
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1241
|
+
for (let j = 0; j < this.b.length; j++)
|
|
1242
|
+
tasks.push(this.runSingleAsync(i, j));
|
|
1243
|
+
this.results = await Promise.all(tasks);
|
|
1042
1244
|
}
|
|
1043
|
-
|
|
1044
|
-
const
|
|
1045
|
-
for (let i = 0; i <
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
}
|
|
1049
|
-
return result;
|
|
1245
|
+
runPairwise() {
|
|
1246
|
+
const results = [];
|
|
1247
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1248
|
+
results.push(this.runSingle(i, i));
|
|
1249
|
+
this.results = results;
|
|
1050
1250
|
}
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
'a' in v &&
|
|
1057
|
-
'b' in v &&
|
|
1058
|
-
'res' in v
|
|
1059
|
-
);
|
|
1251
|
+
async runPairwiseAsync() {
|
|
1252
|
+
const tasks = [];
|
|
1253
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1254
|
+
tasks.push(this.runSingleAsync(i, i));
|
|
1255
|
+
this.results = await Promise.all(tasks);
|
|
1060
1256
|
}
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
'source' in v &&
|
|
1066
|
-
'target' in v &&
|
|
1067
|
-
'match' in v
|
|
1068
|
-
);
|
|
1257
|
+
setOriginal(a, b) {
|
|
1258
|
+
if (a) this.origA = Array.isArray(a) ? a : [a];
|
|
1259
|
+
if (b) this.origB = Array.isArray(b) ? b : [b];
|
|
1260
|
+
return this;
|
|
1069
1261
|
}
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
const first = results[0];
|
|
1073
|
-
let normalized = [];
|
|
1074
|
-
if (this.isMetricResult(first)) normalized = results;
|
|
1075
|
-
else if (this.isCmpStrResult(first))
|
|
1076
|
-
normalized = results.map((r) => ({
|
|
1077
|
-
metric: 'unknown',
|
|
1078
|
-
a: r.source,
|
|
1079
|
-
b: r.target,
|
|
1080
|
-
res: r.match,
|
|
1081
|
-
raw: r.raw
|
|
1082
|
-
}));
|
|
1083
|
-
else
|
|
1084
|
-
throw new CmpStrValidationError(
|
|
1085
|
-
'Unsupported result format for StructuredData normalization.'
|
|
1086
|
-
);
|
|
1087
|
-
return normalized.map((r, idx) => ({ ...r, __idx: idx }));
|
|
1262
|
+
isBatch() {
|
|
1263
|
+
return this.a.length > 1 || this.b.length > 1;
|
|
1088
1264
|
}
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
for (let i = 0; i < extractedStrings.length; i++) {
|
|
1092
|
-
const str = extractedStrings[i];
|
|
1093
|
-
if (!stringToIndices.has(str)) stringToIndices.set(str, []);
|
|
1094
|
-
stringToIndices.get(str).push(i);
|
|
1095
|
-
}
|
|
1096
|
-
const output = new Array(results.length);
|
|
1097
|
-
const occurrenceCount = new Map();
|
|
1098
|
-
let out = 0;
|
|
1099
|
-
for (let i = 0; i < results.length; i++) {
|
|
1100
|
-
const result = results[i];
|
|
1101
|
-
if (removeZero && result.res === 0) continue;
|
|
1102
|
-
const targetStr = result.b || '';
|
|
1103
|
-
const indices = stringToIndices.get(targetStr);
|
|
1104
|
-
let dataIndex;
|
|
1105
|
-
if (indices && indices.length > 0) {
|
|
1106
|
-
const occurrence = occurrenceCount.get(targetStr) ?? 0;
|
|
1107
|
-
occurrenceCount.set(targetStr, occurrence + 1);
|
|
1108
|
-
dataIndex = indices[occurrence % indices.length];
|
|
1109
|
-
} else {
|
|
1110
|
-
dataIndex = result.__idx ?? i;
|
|
1111
|
-
}
|
|
1112
|
-
if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
|
|
1113
|
-
const sourceObj = sourceData[dataIndex];
|
|
1114
|
-
const mappedTarget = extractedStrings[dataIndex] || targetStr;
|
|
1115
|
-
if (objectsOnly) output[out++] = sourceObj;
|
|
1116
|
-
else
|
|
1117
|
-
output[out++] = {
|
|
1118
|
-
obj: sourceObj,
|
|
1119
|
-
key: this.key,
|
|
1120
|
-
result: {
|
|
1121
|
-
source: result.a,
|
|
1122
|
-
target: mappedTarget,
|
|
1123
|
-
match: result.res
|
|
1124
|
-
},
|
|
1125
|
-
...(result.raw ? { raw: result.raw } : null)
|
|
1126
|
-
};
|
|
1127
|
-
}
|
|
1128
|
-
output.length = out;
|
|
1129
|
-
return output;
|
|
1265
|
+
isSingle() {
|
|
1266
|
+
return !this.isBatch();
|
|
1130
1267
|
}
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1268
|
+
isPairwise(safe = false) {
|
|
1269
|
+
return this.isBatch() && this.a.length === this.b.length
|
|
1270
|
+
? true
|
|
1271
|
+
: !safe &&
|
|
1272
|
+
(() => {
|
|
1273
|
+
throw new CmpStrUsageError(
|
|
1274
|
+
`Mode <pairwise> requires arrays of equal length`,
|
|
1275
|
+
{ a: this.a, b: this.b }
|
|
1276
|
+
);
|
|
1277
|
+
})();
|
|
1135
1278
|
}
|
|
1136
|
-
|
|
1137
|
-
return this.
|
|
1138
|
-
this.sort(this.normalizeResults(results), opt?.sort),
|
|
1139
|
-
this.data,
|
|
1140
|
-
extractedStrings,
|
|
1141
|
-
opt?.removeZero,
|
|
1142
|
-
opt?.objectsOnly
|
|
1143
|
-
);
|
|
1279
|
+
isSymmetrical() {
|
|
1280
|
+
return this.symmetric;
|
|
1144
1281
|
}
|
|
1145
|
-
|
|
1146
|
-
return
|
|
1147
|
-
() => this.finalizeLookup(fn(), extractedStrings, opt),
|
|
1148
|
-
'StructuredData lookup failed',
|
|
1149
|
-
{ key: this.key }
|
|
1150
|
-
);
|
|
1282
|
+
whichMode(mode) {
|
|
1283
|
+
return mode ?? this.options.mode ?? 'default';
|
|
1151
1284
|
}
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
async () => this.finalizeLookup(await fn(), extractedStrings, opt),
|
|
1155
|
-
'StructuredData async lookup failed',
|
|
1156
|
-
{ key: this.key }
|
|
1157
|
-
);
|
|
1285
|
+
clear() {
|
|
1286
|
+
this.results = undefined;
|
|
1158
1287
|
}
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1288
|
+
run(mode, clear = true) {
|
|
1289
|
+
if (clear) this.clear();
|
|
1290
|
+
switch (this.whichMode(mode)) {
|
|
1291
|
+
case 'default':
|
|
1292
|
+
if (this.isSingle()) {
|
|
1293
|
+
this.results = this.runSingle(0, 0);
|
|
1294
|
+
break;
|
|
1295
|
+
}
|
|
1296
|
+
case 'batch':
|
|
1297
|
+
this.runBatch();
|
|
1298
|
+
break;
|
|
1299
|
+
case 'single':
|
|
1300
|
+
this.results = this.runSingle(0, 0);
|
|
1301
|
+
break;
|
|
1302
|
+
case 'pairwise':
|
|
1303
|
+
if (this.isPairwise()) this.runPairwise();
|
|
1304
|
+
break;
|
|
1305
|
+
default:
|
|
1306
|
+
throw new CmpStrInternalError(`Unsupported mode <${mode}>`);
|
|
1165
1307
|
}
|
|
1166
1308
|
}
|
|
1167
|
-
async
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1309
|
+
async runAsync(mode, clear = true) {
|
|
1310
|
+
if (clear) this.clear();
|
|
1311
|
+
switch (this.whichMode(mode)) {
|
|
1312
|
+
case 'default':
|
|
1313
|
+
if (this.isSingle()) {
|
|
1314
|
+
this.results = await this.runSingleAsync(0, 0);
|
|
1315
|
+
break;
|
|
1316
|
+
}
|
|
1317
|
+
case 'batch':
|
|
1318
|
+
await this.runBatchAsync();
|
|
1319
|
+
break;
|
|
1320
|
+
case 'single':
|
|
1321
|
+
this.results = await this.runSingleAsync(0, 0);
|
|
1322
|
+
break;
|
|
1323
|
+
case 'pairwise':
|
|
1324
|
+
if (this.isPairwise()) await this.runPairwiseAsync();
|
|
1325
|
+
break;
|
|
1326
|
+
default:
|
|
1327
|
+
throw new CmpStrInternalError(`Unsupported async mode <${mode}>`);
|
|
1173
1328
|
}
|
|
1174
1329
|
}
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
const b = this.extractFrom(other, otherKey);
|
|
1178
|
-
try {
|
|
1179
|
-
return this.performLookup(() => fn(a, b, opt), a, opt);
|
|
1180
|
-
} finally {
|
|
1181
|
-
Pool.release('string[]', a, a.length);
|
|
1182
|
-
Pool.release('string[]', b, b.length);
|
|
1183
|
-
}
|
|
1330
|
+
getMetricName() {
|
|
1331
|
+
return this.metric;
|
|
1184
1332
|
}
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
Pool.release('string[]', a, a.length);
|
|
1192
|
-
Pool.release('string[]', b, b.length);
|
|
1193
|
-
}
|
|
1333
|
+
getResults() {
|
|
1334
|
+
ErrorUtil.assert(
|
|
1335
|
+
this.results !== undefined,
|
|
1336
|
+
`run() must be called before getResults()`
|
|
1337
|
+
);
|
|
1338
|
+
return this.results;
|
|
1194
1339
|
}
|
|
1195
1340
|
}
|
|
1341
|
+
const MetricRegistry = Registry('metric', Metric);
|
|
1196
1342
|
|
|
1197
|
-
class
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
sentence: /(?<=[.!?])\s+/,
|
|
1201
|
-
word: /\p{L}+/gu,
|
|
1202
|
-
nonWord: /[^\p{L}]/gu,
|
|
1203
|
-
vowelGroup: /[aeiouy]+/g,
|
|
1204
|
-
letter: /\p{L}/gu,
|
|
1205
|
-
ucLetter: /\p{Lu}/gu
|
|
1206
|
-
};
|
|
1207
|
-
text;
|
|
1208
|
-
words = [];
|
|
1209
|
-
sentences = [];
|
|
1210
|
-
charFrequency = new Map();
|
|
1211
|
-
wordHistogram = new Map();
|
|
1212
|
-
syllableCache = new Map();
|
|
1213
|
-
syllableStats;
|
|
1214
|
-
constructor(input) {
|
|
1215
|
-
this.text = input.trim();
|
|
1216
|
-
this.tokenize();
|
|
1217
|
-
this.computeFrequencies();
|
|
1343
|
+
class CosineSimilarity extends Metric {
|
|
1344
|
+
constructor(a, b, opt = {}) {
|
|
1345
|
+
super('cosine', a, b, opt, true);
|
|
1218
1346
|
}
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
const
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
this.sentences = this.text
|
|
1225
|
-
.split(TextAnalyzer.REGEX.sentence)
|
|
1226
|
-
.filter(Boolean);
|
|
1347
|
+
_termFreq(str, delimiter) {
|
|
1348
|
+
const terms = str.split(delimiter);
|
|
1349
|
+
const freq = Pool.acquire('map', terms.length);
|
|
1350
|
+
for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
|
|
1351
|
+
return freq;
|
|
1227
1352
|
}
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1353
|
+
compute(a, b) {
|
|
1354
|
+
const { delimiter = ' ' } = this.options;
|
|
1355
|
+
const termsA = this._termFreq(a, delimiter);
|
|
1356
|
+
const termsB = this._termFreq(b, delimiter);
|
|
1357
|
+
try {
|
|
1358
|
+
let dotP = 0,
|
|
1359
|
+
magA = 0,
|
|
1360
|
+
magB = 0;
|
|
1361
|
+
for (const [term, freqA] of termsA) {
|
|
1362
|
+
const freqB = termsB.get(term) || 0;
|
|
1363
|
+
dotP += freqA * freqB;
|
|
1364
|
+
magA += freqA * freqA;
|
|
1365
|
+
}
|
|
1366
|
+
for (const freqB of termsB.values()) magB += freqB * freqB;
|
|
1367
|
+
magA = Math.sqrt(magA);
|
|
1368
|
+
magB = Math.sqrt(magB);
|
|
1369
|
+
return {
|
|
1370
|
+
res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
|
|
1371
|
+
raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
|
|
1372
|
+
};
|
|
1373
|
+
} finally {
|
|
1374
|
+
Pool.release('map', termsA, termsA.size);
|
|
1375
|
+
Pool.release('map', termsB, termsB.size);
|
|
1376
|
+
}
|
|
1233
1377
|
}
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
|
|
1241
|
-
const count = matches ? matches.length : 1;
|
|
1242
|
-
this.syllableCache.set(clean, count);
|
|
1243
|
-
return count;
|
|
1378
|
+
}
|
|
1379
|
+
MetricRegistry.add('cosine', CosineSimilarity);
|
|
1380
|
+
|
|
1381
|
+
class DamerauLevenshteinDistance extends Metric {
|
|
1382
|
+
constructor(a, b, opt = {}) {
|
|
1383
|
+
super('damerau', a, b, opt, true);
|
|
1244
1384
|
}
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1385
|
+
compute(a, b, m, n, maxLen) {
|
|
1386
|
+
const len = m + 1;
|
|
1387
|
+
const [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
|
|
1388
|
+
try {
|
|
1389
|
+
for (let i = 0; i <= m; i++) prev[i] = i;
|
|
1390
|
+
for (let j = 1; j <= n; j++) {
|
|
1391
|
+
curr[0] = j;
|
|
1392
|
+
const cb = b.charCodeAt(j - 1);
|
|
1393
|
+
for (let i = 1; i <= m; i++) {
|
|
1394
|
+
const ca = a.charCodeAt(i - 1);
|
|
1395
|
+
const cost = ca === cb ? 0 : 1;
|
|
1396
|
+
let val = Math.min(
|
|
1397
|
+
curr[i - 1] + 1,
|
|
1398
|
+
prev[i] + 1,
|
|
1399
|
+
prev[i - 1] + cost
|
|
1400
|
+
);
|
|
1401
|
+
if (
|
|
1402
|
+
i > 1 &&
|
|
1403
|
+
j > 1 &&
|
|
1404
|
+
ca === b.charCodeAt(j - 2) &&
|
|
1405
|
+
cb === a.charCodeAt(i - 2)
|
|
1406
|
+
)
|
|
1407
|
+
val = Math.min(val, test[i - 2] + cost);
|
|
1408
|
+
curr[i] = val;
|
|
1409
|
+
}
|
|
1410
|
+
test.set(prev);
|
|
1411
|
+
prev.set(curr);
|
|
1412
|
+
}
|
|
1413
|
+
const dist = prev[m];
|
|
1258
1414
|
return {
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
perWord,
|
|
1262
|
-
avg: perWord.length ? total / perWord.length : 0,
|
|
1263
|
-
median
|
|
1415
|
+
res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
|
|
1416
|
+
raw: { dist, maxLen }
|
|
1264
1417
|
};
|
|
1265
|
-
}
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
getAvgWordLength() {
|
|
1271
|
-
return this.words.length
|
|
1272
|
-
? this.words.join('').length / this.words.length
|
|
1273
|
-
: 0;
|
|
1418
|
+
} finally {
|
|
1419
|
+
Pool.release('int32', test, len);
|
|
1420
|
+
Pool.release('int32', prev, len);
|
|
1421
|
+
Pool.release('int32', curr, len);
|
|
1422
|
+
}
|
|
1274
1423
|
}
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1424
|
+
}
|
|
1425
|
+
MetricRegistry.add('damerau', DamerauLevenshteinDistance);
|
|
1426
|
+
|
|
1427
|
+
class DiceSorensenCoefficient extends Metric {
|
|
1428
|
+
constructor(a, b, opt = {}) {
|
|
1429
|
+
super('dice', a, b, opt, true);
|
|
1279
1430
|
}
|
|
1280
|
-
|
|
1281
|
-
|
|
1431
|
+
_bigrams(str) {
|
|
1432
|
+
const len = str.length - 1;
|
|
1433
|
+
const bigrams = Pool.acquire('set', len);
|
|
1434
|
+
for (let i = 0; i < len; i++) bigrams.add(str.substring(i, i + 2));
|
|
1435
|
+
return bigrams;
|
|
1282
1436
|
}
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
.
|
|
1286
|
-
|
|
1287
|
-
|
|
1437
|
+
compute(a, b) {
|
|
1438
|
+
const setA = this._bigrams(a),
|
|
1439
|
+
setB = this._bigrams(b);
|
|
1440
|
+
const sizeA = setA.size,
|
|
1441
|
+
sizeB = setB.size;
|
|
1442
|
+
try {
|
|
1443
|
+
let intersection = 0;
|
|
1444
|
+
for (const bigram of setA) if (setB.has(bigram)) intersection++;
|
|
1445
|
+
const size = sizeA + sizeB;
|
|
1446
|
+
return {
|
|
1447
|
+
res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
|
|
1448
|
+
raw: { intersection, size }
|
|
1449
|
+
};
|
|
1450
|
+
} finally {
|
|
1451
|
+
Pool.release('set', setA, sizeA);
|
|
1452
|
+
Pool.release('set', setB, sizeB);
|
|
1453
|
+
}
|
|
1288
1454
|
}
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1455
|
+
}
|
|
1456
|
+
MetricRegistry.add('dice', DiceSorensenCoefficient);
|
|
1457
|
+
|
|
1458
|
+
class HammingDistance extends Metric {
|
|
1459
|
+
constructor(a, b, opt = {}) {
|
|
1460
|
+
super('hamming', a, b, opt, true);
|
|
1293
1461
|
}
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1462
|
+
compute(a, b, m, n, maxLen) {
|
|
1463
|
+
if (m !== n) {
|
|
1464
|
+
if (this.options.pad !== undefined) {
|
|
1465
|
+
if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
|
|
1466
|
+
if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
|
|
1467
|
+
m = n = maxLen;
|
|
1468
|
+
} else
|
|
1469
|
+
throw new CmpStrUsageError(
|
|
1470
|
+
`Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
|
|
1471
|
+
`use option.pad for automatic adjustment`,
|
|
1472
|
+
{ a: m, b: n }
|
|
1473
|
+
);
|
|
1474
|
+
}
|
|
1475
|
+
let dist = 0;
|
|
1476
|
+
for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
|
|
1477
|
+
return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
|
|
1299
1478
|
}
|
|
1300
|
-
|
|
1301
|
-
|
|
1479
|
+
}
|
|
1480
|
+
MetricRegistry.add('hamming', HammingDistance);
|
|
1481
|
+
|
|
1482
|
+
class JaccardIndex extends Metric {
|
|
1483
|
+
constructor(a, b, opt = {}) {
|
|
1484
|
+
super('jaccard', a, b, opt, true);
|
|
1302
1485
|
}
|
|
1303
|
-
|
|
1304
|
-
const
|
|
1305
|
-
|
|
1306
|
-
const
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1486
|
+
compute(a, b, m, n) {
|
|
1487
|
+
const [setA, setB] = Pool.acquireMany('set', [m, n]);
|
|
1488
|
+
try {
|
|
1489
|
+
for (const A of a) setA.add(A);
|
|
1490
|
+
for (const B of b) setB.add(B);
|
|
1491
|
+
let intersection = 0;
|
|
1492
|
+
for (const c of setA) if (setB.has(c)) intersection++;
|
|
1493
|
+
const union = setA.size + setB.size - intersection;
|
|
1494
|
+
return {
|
|
1495
|
+
res: union === 0 ? 1 : Metric.clamp(intersection / union),
|
|
1496
|
+
raw: { intersection, union }
|
|
1497
|
+
};
|
|
1498
|
+
} finally {
|
|
1499
|
+
Pool.release('set', setA, m);
|
|
1500
|
+
Pool.release('set', setB, n);
|
|
1312
1501
|
}
|
|
1313
|
-
return result;
|
|
1314
1502
|
}
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
}
|
|
1320
|
-
|
|
1321
|
-
let short = 0;
|
|
1322
|
-
for (const w of this.words) if (w.length <= len) short++;
|
|
1323
|
-
return this.words.length ? short / this.words.length : 0;
|
|
1324
|
-
}
|
|
1325
|
-
getSyllablesCount() {
|
|
1326
|
-
return this.computeSyllableStats().total;
|
|
1327
|
-
}
|
|
1328
|
-
getMonosyllabicWordCount() {
|
|
1329
|
-
return this.computeSyllableStats().mono;
|
|
1330
|
-
}
|
|
1331
|
-
getMinSyllablesWordCount(min) {
|
|
1332
|
-
return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
|
|
1333
|
-
}
|
|
1334
|
-
getMaxSyllablesWordCount(max) {
|
|
1335
|
-
return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
|
|
1503
|
+
}
|
|
1504
|
+
MetricRegistry.add('jaccard', JaccardIndex);
|
|
1505
|
+
|
|
1506
|
+
class JaroWinklerDistance extends Metric {
|
|
1507
|
+
constructor(a, b, opt = {}) {
|
|
1508
|
+
super('jaroWinkler', a, b, opt, true);
|
|
1336
1509
|
}
|
|
1337
|
-
|
|
1338
|
-
|
|
1510
|
+
compute(a, b, m, n) {
|
|
1511
|
+
const [matchA, matchB] = Pool.acquireMany('int32', [m, n]);
|
|
1512
|
+
try {
|
|
1513
|
+
for (let i = 0; i < m; i++) matchA[i] = 0;
|
|
1514
|
+
for (let i = 0; i < n; i++) matchB[i] = 0;
|
|
1515
|
+
const matchWindow = Math.max(0, Math.floor(n / 2) - 1);
|
|
1516
|
+
let matches = 0;
|
|
1517
|
+
for (let i = 0; i < m; i++) {
|
|
1518
|
+
const start = Math.max(0, i - matchWindow);
|
|
1519
|
+
const end = Math.min(i + matchWindow + 1, n);
|
|
1520
|
+
for (let j = start; j < end; j++) {
|
|
1521
|
+
if (!matchB[j] && a[i] === b[j]) {
|
|
1522
|
+
matchA[i] = 1;
|
|
1523
|
+
matchB[j] = 1;
|
|
1524
|
+
matches++;
|
|
1525
|
+
break;
|
|
1526
|
+
}
|
|
1527
|
+
}
|
|
1528
|
+
}
|
|
1529
|
+
let transpos = 0,
|
|
1530
|
+
jaro = 0,
|
|
1531
|
+
prefix = 0,
|
|
1532
|
+
res = 0;
|
|
1533
|
+
if (matches > 0) {
|
|
1534
|
+
let k = 0;
|
|
1535
|
+
for (let i = 0; i < m; i++) {
|
|
1536
|
+
if (matchA[i]) {
|
|
1537
|
+
while (!matchB[k]) k++;
|
|
1538
|
+
if (a[i] !== b[k]) transpos++;
|
|
1539
|
+
k++;
|
|
1540
|
+
}
|
|
1541
|
+
}
|
|
1542
|
+
transpos /= 2;
|
|
1543
|
+
jaro =
|
|
1544
|
+
(matches / m + matches / n + (matches - transpos) / matches) / 3;
|
|
1545
|
+
for (let i = 0; i < Math.min(4, m, n); i++) {
|
|
1546
|
+
if (a[i] === b[i]) prefix++;
|
|
1547
|
+
else break;
|
|
1548
|
+
}
|
|
1549
|
+
res = jaro + prefix * 0.1 * (1 - jaro);
|
|
1550
|
+
}
|
|
1551
|
+
return {
|
|
1552
|
+
res: Metric.clamp(res),
|
|
1553
|
+
raw: { matchWindow, matches, transpos, jaro, prefix }
|
|
1554
|
+
};
|
|
1555
|
+
} finally {
|
|
1556
|
+
Pool.release('int32', matchA, m);
|
|
1557
|
+
Pool.release('int32', matchB, n);
|
|
1558
|
+
}
|
|
1339
1559
|
}
|
|
1340
|
-
|
|
1341
|
-
|
|
1560
|
+
}
|
|
1561
|
+
MetricRegistry.add('jaroWinkler', JaroWinklerDistance);
|
|
1562
|
+
|
|
1563
|
+
class LCSMetric extends Metric {
|
|
1564
|
+
constructor(a, b, opt = {}) {
|
|
1565
|
+
super('lcs', a, b, opt, true);
|
|
1342
1566
|
}
|
|
1343
|
-
|
|
1567
|
+
compute(a, b, m, n, maxLen) {
|
|
1568
|
+
const len = m + 1;
|
|
1569
|
+
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1344
1570
|
try {
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1571
|
+
for (let i = 0; i <= m; i++) prev[i] = 0;
|
|
1572
|
+
for (let j = 1; j <= n; j++) {
|
|
1573
|
+
curr[0] = 0;
|
|
1574
|
+
const cb = b.charCodeAt(j - 1);
|
|
1575
|
+
for (let i = 1; i <= m; i++) {
|
|
1576
|
+
if (a.charCodeAt(i - 1) === cb) curr[i] = prev[i - 1] + 1;
|
|
1577
|
+
else curr[i] = Math.max(prev[i], curr[i - 1]);
|
|
1578
|
+
}
|
|
1579
|
+
prev.set(curr);
|
|
1580
|
+
}
|
|
1581
|
+
const lcs = prev[m];
|
|
1582
|
+
return {
|
|
1583
|
+
res: maxLen === 0 ? 1 : Metric.clamp(lcs / maxLen),
|
|
1584
|
+
raw: { lcs, maxLen }
|
|
1585
|
+
};
|
|
1586
|
+
} finally {
|
|
1587
|
+
Pool.release('int32', prev, len);
|
|
1588
|
+
Pool.release('int32', curr, len);
|
|
1351
1589
|
}
|
|
1352
1590
|
}
|
|
1353
|
-
|
|
1354
|
-
|
|
1591
|
+
}
|
|
1592
|
+
MetricRegistry.add('lcs', LCSMetric);
|
|
1593
|
+
|
|
1594
|
+
class LevenshteinDistance extends Metric {
|
|
1595
|
+
constructor(a, b, opt = {}) {
|
|
1596
|
+
super('levenshtein', a, b, opt, true);
|
|
1355
1597
|
}
|
|
1356
|
-
|
|
1357
|
-
const
|
|
1358
|
-
const
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1598
|
+
compute(a, b, m, n, maxLen) {
|
|
1599
|
+
const len = m + 1;
|
|
1600
|
+
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1601
|
+
try {
|
|
1602
|
+
for (let i = 0; i <= m; i++) prev[i] = i;
|
|
1603
|
+
for (let j = 1; j <= n; j++) {
|
|
1604
|
+
curr[0] = j;
|
|
1605
|
+
const cb = b.charCodeAt(j - 1);
|
|
1606
|
+
for (let i = 1; i <= m; i++) {
|
|
1607
|
+
const cost = a.charCodeAt(i - 1) === cb ? 0 : 1;
|
|
1608
|
+
curr[i] = Math.min(
|
|
1609
|
+
curr[i - 1] + 1,
|
|
1610
|
+
prev[i] + 1,
|
|
1611
|
+
prev[i - 1] + cost
|
|
1612
|
+
);
|
|
1613
|
+
}
|
|
1614
|
+
prev.set(curr);
|
|
1615
|
+
}
|
|
1616
|
+
const dist = prev[m];
|
|
1617
|
+
return {
|
|
1618
|
+
res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
|
|
1619
|
+
raw: { dist, maxLen }
|
|
1620
|
+
};
|
|
1621
|
+
} finally {
|
|
1622
|
+
Pool.release('int32', prev, len);
|
|
1623
|
+
Pool.release('int32', curr, len);
|
|
1369
1624
|
}
|
|
1370
1625
|
}
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1626
|
+
}
|
|
1627
|
+
MetricRegistry.add('levenshtein', LevenshteinDistance);
|
|
1628
|
+
|
|
1629
|
+
class NeedlemanWunschDistance extends Metric {
|
|
1630
|
+
constructor(a, b, opt = {}) {
|
|
1631
|
+
super('needlemanWunsch', a, b, opt, true);
|
|
1376
1632
|
}
|
|
1377
|
-
|
|
1378
|
-
const
|
|
1379
|
-
const
|
|
1380
|
-
const
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1633
|
+
compute(a, b, m, n, maxLen) {
|
|
1634
|
+
const { match = 1, mismatch = -1, gap = -1 } = this.options;
|
|
1635
|
+
const len = m + 1;
|
|
1636
|
+
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1637
|
+
try {
|
|
1638
|
+
prev[0] = 0;
|
|
1639
|
+
for (let i = 1; i <= m; i++) prev[i] = prev[i - 1] + gap;
|
|
1640
|
+
for (let j = 1; j <= n; j++) {
|
|
1641
|
+
curr[0] = prev[0] + gap;
|
|
1642
|
+
const cb = b.charCodeAt(j - 1);
|
|
1643
|
+
for (let i = 1; i <= m; i++) {
|
|
1644
|
+
const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
|
|
1645
|
+
curr[i] = Math.max(
|
|
1646
|
+
prev[i - 1] + score,
|
|
1647
|
+
prev[i] + gap,
|
|
1648
|
+
curr[i - 1] + gap
|
|
1649
|
+
);
|
|
1650
|
+
}
|
|
1651
|
+
prev.set(curr);
|
|
1652
|
+
}
|
|
1653
|
+
const score = prev[m];
|
|
1654
|
+
const denum = maxLen * match;
|
|
1655
|
+
return {
|
|
1656
|
+
res: denum === 0 ? 0 : Metric.clamp(score / denum),
|
|
1657
|
+
raw: { score, denum }
|
|
1658
|
+
};
|
|
1659
|
+
} finally {
|
|
1660
|
+
Pool.release('int32', prev, len);
|
|
1661
|
+
Pool.release('int32', curr, len);
|
|
1662
|
+
}
|
|
1389
1663
|
}
|
|
1390
1664
|
}
|
|
1665
|
+
MetricRegistry.add('needlemanWunsch', NeedlemanWunschDistance);
|
|
1391
1666
|
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
metric;
|
|
1396
|
-
a;
|
|
1397
|
-
b;
|
|
1398
|
-
origA = [];
|
|
1399
|
-
origB = [];
|
|
1400
|
-
options;
|
|
1401
|
-
optKey;
|
|
1402
|
-
symmetric;
|
|
1403
|
-
results;
|
|
1404
|
-
static clear = () => this.cache.clear();
|
|
1405
|
-
static swap = (a, b, m, n) => (m > n ? [b, a, n, m] : [a, b, m, n]);
|
|
1406
|
-
static clamp = (res) => Math.max(0, Math.min(1, res));
|
|
1407
|
-
constructor(metric, a, b, opt = {}, symmetric = false) {
|
|
1408
|
-
this.metric = metric;
|
|
1409
|
-
this.a = Array.isArray(a) ? a : [a];
|
|
1410
|
-
this.b = Array.isArray(b) ? b : [b];
|
|
1411
|
-
ErrorUtil.assert(
|
|
1412
|
-
this.a.length > 0 && this.b.length > 0,
|
|
1413
|
-
`Inputs <a> and <b> must not be empty`,
|
|
1414
|
-
{ a: this.a, b: this.b }
|
|
1415
|
-
);
|
|
1416
|
-
this.options = opt;
|
|
1417
|
-
this.optKey = Hasher.fastFNV1a(
|
|
1418
|
-
JSON.stringify(opt, Object.keys(opt).sort())
|
|
1419
|
-
).toString();
|
|
1420
|
-
this.symmetric = symmetric;
|
|
1667
|
+
class QGramSimilarity extends Metric {
|
|
1668
|
+
constructor(a, b, opt = {}) {
|
|
1669
|
+
super('qGram', a, b, opt, true);
|
|
1421
1670
|
}
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
compute(a, b, m, n, maxLen) {
|
|
1428
|
-
throw new CmpStrInternalError(
|
|
1429
|
-
`Method compute() must be overridden in a subclass`
|
|
1430
|
-
);
|
|
1431
|
-
}
|
|
1432
|
-
runSingle(i, j) {
|
|
1433
|
-
return ErrorUtil.wrap(
|
|
1434
|
-
() => {
|
|
1435
|
-
let a = String(this.a[i]),
|
|
1436
|
-
A = a;
|
|
1437
|
-
let b = String(this.b[j]),
|
|
1438
|
-
B = b;
|
|
1439
|
-
let m = A.length,
|
|
1440
|
-
n = B.length;
|
|
1441
|
-
let result = this.preCompute(A, B, m, n);
|
|
1442
|
-
if (!result) {
|
|
1443
|
-
result = profiler$2.run(() => {
|
|
1444
|
-
if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
|
|
1445
|
-
const key =
|
|
1446
|
-
Metric.cache.key(this.metric, [A, B], this.symmetric) +
|
|
1447
|
-
this.optKey;
|
|
1448
|
-
return (
|
|
1449
|
-
Metric.cache.get(key || '') ??
|
|
1450
|
-
(() => {
|
|
1451
|
-
const res = this.compute(A, B, m, n, Math.max(m, n));
|
|
1452
|
-
if (key) Metric.cache.set(key, res);
|
|
1453
|
-
return res;
|
|
1454
|
-
})()
|
|
1455
|
-
);
|
|
1456
|
-
});
|
|
1457
|
-
}
|
|
1458
|
-
return {
|
|
1459
|
-
metric: this.metric,
|
|
1460
|
-
a: this.origA[i] ?? a,
|
|
1461
|
-
b: this.origB[j] ?? b,
|
|
1462
|
-
...result
|
|
1463
|
-
};
|
|
1464
|
-
},
|
|
1465
|
-
`Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
|
|
1466
|
-
{ i, j }
|
|
1467
|
-
);
|
|
1468
|
-
}
|
|
1469
|
-
async runSingleAsync(i, j) {
|
|
1470
|
-
return Promise.resolve(this.runSingle(i, j));
|
|
1471
|
-
}
|
|
1472
|
-
runBatch() {
|
|
1473
|
-
const results = [];
|
|
1474
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1475
|
-
for (let j = 0; j < this.b.length; j++)
|
|
1476
|
-
results.push(this.runSingle(i, j));
|
|
1477
|
-
this.results = results;
|
|
1478
|
-
}
|
|
1479
|
-
async runBatchAsync() {
|
|
1480
|
-
const results = [];
|
|
1481
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1482
|
-
for (let j = 0; j < this.b.length; j++)
|
|
1483
|
-
results.push(await this.runSingleAsync(i, j));
|
|
1484
|
-
this.results = results;
|
|
1485
|
-
}
|
|
1486
|
-
runPairwise() {
|
|
1487
|
-
const results = [];
|
|
1488
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1489
|
-
results.push(this.runSingle(i, i));
|
|
1490
|
-
this.results = results;
|
|
1491
|
-
}
|
|
1492
|
-
async runPairwiseAsync() {
|
|
1493
|
-
const results = [];
|
|
1494
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1495
|
-
results.push(await this.runSingleAsync(i, i));
|
|
1496
|
-
this.results = results;
|
|
1497
|
-
}
|
|
1498
|
-
setOriginal(a, b) {
|
|
1499
|
-
if (a) this.origA = Array.isArray(a) ? a : [a];
|
|
1500
|
-
if (b) this.origB = Array.isArray(b) ? b : [b];
|
|
1501
|
-
return this;
|
|
1502
|
-
}
|
|
1503
|
-
isBatch = () => this.a.length > 1 || this.b.length > 1;
|
|
1504
|
-
isSingle = () => !this.isBatch();
|
|
1505
|
-
isPairwise(safe = false) {
|
|
1506
|
-
return this.isBatch() && this.a.length === this.b.length
|
|
1507
|
-
? true
|
|
1508
|
-
: !safe &&
|
|
1509
|
-
(() => {
|
|
1510
|
-
throw new CmpStrUsageError(
|
|
1511
|
-
`Mode <pairwise> requires arrays of equal length`,
|
|
1512
|
-
{ a: this.a, b: this.b }
|
|
1513
|
-
);
|
|
1514
|
-
})();
|
|
1515
|
-
}
|
|
1516
|
-
isSymmetrical = () => this.symmetric;
|
|
1517
|
-
whichMode = (mode) => mode ?? this.options?.mode ?? 'default';
|
|
1518
|
-
clear = () => (this.results = undefined);
|
|
1519
|
-
run(mode, clear = true) {
|
|
1520
|
-
if (clear) this.clear();
|
|
1521
|
-
switch (this.whichMode(mode)) {
|
|
1522
|
-
case 'default':
|
|
1523
|
-
if (this.isSingle()) {
|
|
1524
|
-
this.results = this.runSingle(0, 0);
|
|
1525
|
-
break;
|
|
1526
|
-
}
|
|
1527
|
-
case 'batch':
|
|
1528
|
-
this.runBatch();
|
|
1529
|
-
break;
|
|
1530
|
-
case 'single':
|
|
1531
|
-
this.results = this.runSingle(0, 0);
|
|
1532
|
-
break;
|
|
1533
|
-
case 'pairwise':
|
|
1534
|
-
if (this.isPairwise()) this.runPairwise();
|
|
1535
|
-
break;
|
|
1536
|
-
default:
|
|
1537
|
-
throw new CmpStrInternalError(`Unsupported mode <${mode}>`);
|
|
1538
|
-
}
|
|
1539
|
-
}
|
|
1540
|
-
async runAsync(mode, clear = true) {
|
|
1541
|
-
if (clear) this.clear();
|
|
1542
|
-
switch (this.whichMode(mode)) {
|
|
1543
|
-
case 'default':
|
|
1544
|
-
if (this.isSingle()) {
|
|
1545
|
-
this.results = await this.runSingleAsync(0, 0);
|
|
1546
|
-
break;
|
|
1547
|
-
}
|
|
1548
|
-
case 'batch':
|
|
1549
|
-
await this.runBatchAsync();
|
|
1550
|
-
break;
|
|
1551
|
-
case 'single':
|
|
1552
|
-
this.results = await this.runSingleAsync(0, 0);
|
|
1553
|
-
break;
|
|
1554
|
-
case 'pairwise':
|
|
1555
|
-
if (this.isPairwise()) await this.runPairwiseAsync();
|
|
1556
|
-
break;
|
|
1557
|
-
default:
|
|
1558
|
-
throw new CmpStrInternalError(`Unsupported async mode <${mode}>`);
|
|
1559
|
-
}
|
|
1560
|
-
}
|
|
1561
|
-
getMetricName = () => this.metric;
|
|
1562
|
-
getResults() {
|
|
1563
|
-
ErrorUtil.assert(
|
|
1564
|
-
this.results !== undefined,
|
|
1565
|
-
`run() must be called before getResults()`
|
|
1566
|
-
);
|
|
1567
|
-
return this.results;
|
|
1568
|
-
}
|
|
1569
|
-
}
|
|
1570
|
-
const MetricRegistry = Registry('metric', Metric);
|
|
1571
|
-
|
|
1572
|
-
class CosineSimilarity extends Metric {
|
|
1573
|
-
constructor(a, b, opt = {}) {
|
|
1574
|
-
super('cosine', a, b, opt, true);
|
|
1575
|
-
}
|
|
1576
|
-
_termFreq(str, delimiter) {
|
|
1577
|
-
const terms = str.split(delimiter);
|
|
1578
|
-
const freq = Pool.acquire('map', terms.length);
|
|
1579
|
-
for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
|
|
1580
|
-
return freq;
|
|
1671
|
+
_qGrams(str, q) {
|
|
1672
|
+
const len = Math.max(0, str.length - q + 1);
|
|
1673
|
+
const grams = Pool.acquire('set', len);
|
|
1674
|
+
for (let i = 0; i < len; i++) grams.add(str.slice(i, i + q));
|
|
1675
|
+
return grams;
|
|
1581
1676
|
}
|
|
1582
1677
|
compute(a, b) {
|
|
1583
|
-
const {
|
|
1584
|
-
const
|
|
1585
|
-
|
|
1678
|
+
const { q = 2 } = this.options;
|
|
1679
|
+
const setA = this._qGrams(a, q),
|
|
1680
|
+
setB = this._qGrams(b, q);
|
|
1681
|
+
const sizeA = setA.size,
|
|
1682
|
+
sizeB = setB.size;
|
|
1586
1683
|
try {
|
|
1587
|
-
let
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
for (const [term, freqA] of termsA) {
|
|
1591
|
-
const freqB = termsB.get(term) || 0;
|
|
1592
|
-
dotP += freqA * freqB;
|
|
1593
|
-
magA += freqA * freqA;
|
|
1594
|
-
}
|
|
1595
|
-
for (const freqB of termsB.values()) magB += freqB * freqB;
|
|
1596
|
-
magA = Math.sqrt(magA);
|
|
1597
|
-
magB = Math.sqrt(magB);
|
|
1684
|
+
let intersection = 0;
|
|
1685
|
+
for (const gram of setA) if (setB.has(gram)) intersection++;
|
|
1686
|
+
const size = Math.max(sizeA, sizeB);
|
|
1598
1687
|
return {
|
|
1599
|
-
res:
|
|
1600
|
-
raw: {
|
|
1688
|
+
res: size === 0 ? 1 : Metric.clamp(intersection / size),
|
|
1689
|
+
raw: { intersection, size }
|
|
1601
1690
|
};
|
|
1602
1691
|
} finally {
|
|
1603
|
-
Pool.release('
|
|
1604
|
-
Pool.release('
|
|
1692
|
+
Pool.release('set', setA, sizeA);
|
|
1693
|
+
Pool.release('set', setB, sizeB);
|
|
1605
1694
|
}
|
|
1606
1695
|
}
|
|
1607
1696
|
}
|
|
1608
|
-
MetricRegistry.add('
|
|
1697
|
+
MetricRegistry.add('qGram', QGramSimilarity);
|
|
1609
1698
|
|
|
1610
|
-
class
|
|
1699
|
+
class SmithWatermanDistance extends Metric {
|
|
1611
1700
|
constructor(a, b, opt = {}) {
|
|
1612
|
-
super('
|
|
1701
|
+
super('smithWaterman', a, b, opt, true);
|
|
1613
1702
|
}
|
|
1614
|
-
compute(a, b, m, n
|
|
1703
|
+
compute(a, b, m, n) {
|
|
1704
|
+
const { match = 2, mismatch = -1, gap = -2 } = this.options;
|
|
1615
1705
|
const len = m + 1;
|
|
1616
|
-
const [
|
|
1706
|
+
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1707
|
+
let maxScore = 0;
|
|
1617
1708
|
try {
|
|
1618
|
-
for (let i = 0; i <= m; i++) prev[i] =
|
|
1709
|
+
for (let i = 0; i <= m; i++) prev[i] = 0;
|
|
1619
1710
|
for (let j = 1; j <= n; j++) {
|
|
1620
|
-
curr[0] =
|
|
1711
|
+
curr[0] = 0;
|
|
1621
1712
|
const cb = b.charCodeAt(j - 1);
|
|
1622
1713
|
for (let i = 1; i <= m; i++) {
|
|
1623
|
-
const
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
prev[i] +
|
|
1628
|
-
|
|
1714
|
+
const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
|
|
1715
|
+
curr[i] = Math.max(
|
|
1716
|
+
0,
|
|
1717
|
+
prev[i - 1] + score,
|
|
1718
|
+
prev[i] + gap,
|
|
1719
|
+
curr[i - 1] + gap
|
|
1629
1720
|
);
|
|
1630
|
-
if (
|
|
1631
|
-
i > 1 &&
|
|
1632
|
-
j > 1 &&
|
|
1633
|
-
ca === b.charCodeAt(j - 2) &&
|
|
1634
|
-
cb === a.charCodeAt(i - 2)
|
|
1635
|
-
)
|
|
1636
|
-
val = Math.min(val, test[i - 2] + cost);
|
|
1637
|
-
curr[i] = val;
|
|
1721
|
+
if (curr[i] > maxScore) maxScore = curr[i];
|
|
1638
1722
|
}
|
|
1639
|
-
test.set(prev);
|
|
1640
1723
|
prev.set(curr);
|
|
1641
1724
|
}
|
|
1642
|
-
const
|
|
1725
|
+
const denum = Math.min(m * match, n * match);
|
|
1643
1726
|
return {
|
|
1644
|
-
res:
|
|
1645
|
-
raw: {
|
|
1727
|
+
res: denum === 0 ? 0 : Metric.clamp(maxScore / denum),
|
|
1728
|
+
raw: { score: maxScore, denum }
|
|
1646
1729
|
};
|
|
1647
1730
|
} finally {
|
|
1648
|
-
Pool.release('int32', test, len);
|
|
1649
1731
|
Pool.release('int32', prev, len);
|
|
1650
1732
|
Pool.release('int32', curr, len);
|
|
1651
1733
|
}
|
|
1652
1734
|
}
|
|
1653
1735
|
}
|
|
1654
|
-
MetricRegistry.add('
|
|
1736
|
+
MetricRegistry.add('smithWaterman', SmithWatermanDistance);
|
|
1655
1737
|
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1738
|
+
const profiler$1 = Profiler.getInstance();
|
|
1739
|
+
class Phonetic {
|
|
1740
|
+
static cache = new HashTable();
|
|
1741
|
+
static default;
|
|
1742
|
+
algo;
|
|
1743
|
+
options;
|
|
1744
|
+
optKey;
|
|
1745
|
+
map;
|
|
1746
|
+
ignoreSet;
|
|
1747
|
+
static clear() {
|
|
1748
|
+
this.cache.clear();
|
|
1659
1749
|
}
|
|
1660
|
-
|
|
1661
|
-
const
|
|
1662
|
-
const
|
|
1663
|
-
|
|
1664
|
-
|
|
1750
|
+
constructor(algo, opt = {}) {
|
|
1751
|
+
const defaults = this.constructor.default ?? {};
|
|
1752
|
+
const mapId = opt.map ?? defaults.map;
|
|
1753
|
+
if (!mapId)
|
|
1754
|
+
throw new CmpStrNotFoundError(
|
|
1755
|
+
`No mapping specified for phonetic algorithm`,
|
|
1756
|
+
{ algo }
|
|
1757
|
+
);
|
|
1758
|
+
const map = PhoneticMappingRegistry.get(algo, mapId);
|
|
1759
|
+
if (map === undefined)
|
|
1760
|
+
throw new CmpStrNotFoundError(
|
|
1761
|
+
`Requested mapping <${mapId}> is not declared`,
|
|
1762
|
+
{ algo, mapId }
|
|
1763
|
+
);
|
|
1764
|
+
this.options = DeepMerge.merge(
|
|
1765
|
+
DeepMerge.merge(defaults, map.options ?? {}),
|
|
1766
|
+
opt
|
|
1767
|
+
);
|
|
1768
|
+
this.optKey = Hasher.fastFNV1a(
|
|
1769
|
+
JSON.stringify(this.options, Object.keys(this.options).sort())
|
|
1770
|
+
).toString();
|
|
1771
|
+
this.algo = algo;
|
|
1772
|
+
this.map = map;
|
|
1773
|
+
this.ignoreSet = new Set(map.ignore ?? []);
|
|
1665
1774
|
}
|
|
1666
|
-
|
|
1667
|
-
const
|
|
1668
|
-
|
|
1669
|
-
const
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
for (const bigram of setA) if (setB.has(bigram)) intersection++;
|
|
1674
|
-
const size = sizeA + sizeB;
|
|
1675
|
-
return {
|
|
1676
|
-
res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
|
|
1677
|
-
raw: { intersection, size }
|
|
1678
|
-
};
|
|
1679
|
-
} finally {
|
|
1680
|
-
Pool.release('set', setA, sizeA);
|
|
1681
|
-
Pool.release('set', setB, sizeB);
|
|
1775
|
+
applyPattern(word) {
|
|
1776
|
+
const { patterns = [] } = this.map;
|
|
1777
|
+
if (!patterns.length) return word;
|
|
1778
|
+
for (const { pattern, replace, all = false } of patterns) {
|
|
1779
|
+
word = all
|
|
1780
|
+
? word.replaceAll(pattern, replace)
|
|
1781
|
+
: word.replace(pattern, replace);
|
|
1682
1782
|
}
|
|
1783
|
+
return word;
|
|
1683
1784
|
}
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1785
|
+
applyRules(char, i, chars, charLen) {
|
|
1786
|
+
const { ruleset = [] } = this.map;
|
|
1787
|
+
if (!ruleset.length) return undefined;
|
|
1788
|
+
const prev = chars[i - 1] || '',
|
|
1789
|
+
prev2 = chars[i - 2] || '';
|
|
1790
|
+
const next = chars[i + 1] || '',
|
|
1791
|
+
next2 = chars[i + 2] || '';
|
|
1792
|
+
const str = chars.join('');
|
|
1793
|
+
for (const rule of ruleset) {
|
|
1794
|
+
if (rule.char && rule.char !== char) continue;
|
|
1795
|
+
if (rule.position === 'start' && i !== 0) continue;
|
|
1796
|
+
if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
|
|
1797
|
+
continue;
|
|
1798
|
+
if (rule.position === 'end' && i !== charLen - 1) continue;
|
|
1799
|
+
if (rule.prev && !rule.prev.includes(prev)) continue;
|
|
1800
|
+
if (rule.prevNot && rule.prevNot.includes(prev)) continue;
|
|
1801
|
+
if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
|
|
1802
|
+
if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
|
|
1803
|
+
if (rule.next && !rule.next.includes(next)) continue;
|
|
1804
|
+
if (rule.nextNot && rule.nextNot.includes(next)) continue;
|
|
1805
|
+
if (rule.next2 && !rule.next2.includes(next2)) continue;
|
|
1806
|
+
if (rule.next2Not && rule.next2Not.includes(next2)) continue;
|
|
1807
|
+
if (
|
|
1808
|
+
rule.leading &&
|
|
1809
|
+
!rule.leading.includes(str.slice(0, rule.leading.length))
|
|
1810
|
+
)
|
|
1811
|
+
continue;
|
|
1812
|
+
if (
|
|
1813
|
+
rule.trailing &&
|
|
1814
|
+
!rule.trailing.includes(str.slice(-rule.trailing.length))
|
|
1815
|
+
)
|
|
1816
|
+
continue;
|
|
1817
|
+
if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
|
|
1818
|
+
continue;
|
|
1819
|
+
return rule.code;
|
|
1820
|
+
}
|
|
1821
|
+
return undefined;
|
|
1690
1822
|
}
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1823
|
+
encode(word) {
|
|
1824
|
+
const { map = {} } = this.map;
|
|
1825
|
+
word = this.applyPattern(word);
|
|
1826
|
+
const chars = this.word2Chars(word);
|
|
1827
|
+
const charLen = chars.length;
|
|
1828
|
+
let code = '',
|
|
1829
|
+
lastCode = null;
|
|
1830
|
+
for (let i = 0; i < charLen; i++) {
|
|
1831
|
+
const char = chars[i];
|
|
1832
|
+
if (this.ignoreSet.has(char)) continue;
|
|
1833
|
+
const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
|
|
1834
|
+
if (mapped === undefined) continue;
|
|
1835
|
+
((code += mapped), (lastCode = mapped));
|
|
1836
|
+
if (this.exitEarly(code, i)) break;
|
|
1703
1837
|
}
|
|
1704
|
-
|
|
1705
|
-
for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
|
|
1706
|
-
return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
|
|
1838
|
+
return this.adjustCode(code, chars);
|
|
1707
1839
|
}
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
super('jaccard', a, b, opt, true);
|
|
1840
|
+
mapChar(char, i, chars, charLen, lastCode, map) {
|
|
1841
|
+
const { dedupe = true, fallback = undefined } = this.options;
|
|
1842
|
+
const c =
|
|
1843
|
+
this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
|
|
1844
|
+
return dedupe && c === lastCode ? undefined : c;
|
|
1714
1845
|
}
|
|
1715
|
-
|
|
1716
|
-
const
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
let intersection = 0;
|
|
1721
|
-
for (const c of setA) if (setB.has(c)) intersection++;
|
|
1722
|
-
const union = setA.size + setB.size - intersection;
|
|
1723
|
-
return {
|
|
1724
|
-
res: union === 0 ? 1 : Metric.clamp(intersection / union),
|
|
1725
|
-
raw: { intersection, union }
|
|
1726
|
-
};
|
|
1727
|
-
} finally {
|
|
1728
|
-
Pool.release('set', setA, m);
|
|
1729
|
-
Pool.release('set', setB, n);
|
|
1730
|
-
}
|
|
1846
|
+
equalLen(input) {
|
|
1847
|
+
const { length = -1, pad = '0' } = this.options;
|
|
1848
|
+
return length === -1
|
|
1849
|
+
? input
|
|
1850
|
+
: (input + pad.repeat(length)).slice(0, length);
|
|
1731
1851
|
}
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
class JaroWinklerDistance extends Metric {
|
|
1736
|
-
constructor(a, b, opt = {}) {
|
|
1737
|
-
super('jaroWinkler', a, b, opt, true);
|
|
1852
|
+
word2Chars(word) {
|
|
1853
|
+
return Array.from(word.toLowerCase());
|
|
1738
1854
|
}
|
|
1739
|
-
|
|
1740
|
-
const
|
|
1741
|
-
|
|
1742
|
-
for (let i = 0; i < m; i++) matchA[i] = 0;
|
|
1743
|
-
for (let i = 0; i < n; i++) matchB[i] = 0;
|
|
1744
|
-
const matchWindow = Math.max(0, Math.floor(n / 2) - 1);
|
|
1745
|
-
let matches = 0;
|
|
1746
|
-
for (let i = 0; i < m; i++) {
|
|
1747
|
-
const start = Math.max(0, i - matchWindow);
|
|
1748
|
-
const end = Math.min(i + matchWindow + 1, n);
|
|
1749
|
-
for (let j = start; j < end; j++) {
|
|
1750
|
-
if (!matchB[j] && a[i] === b[j]) {
|
|
1751
|
-
matchA[i] = 1;
|
|
1752
|
-
matchB[j] = 1;
|
|
1753
|
-
matches++;
|
|
1754
|
-
break;
|
|
1755
|
-
}
|
|
1756
|
-
}
|
|
1757
|
-
}
|
|
1758
|
-
let transpos = 0,
|
|
1759
|
-
jaro = 0,
|
|
1760
|
-
prefix = 0,
|
|
1761
|
-
res = 0;
|
|
1762
|
-
if (matches > 0) {
|
|
1763
|
-
let k = 0;
|
|
1764
|
-
for (let i = 0; i < m; i++) {
|
|
1765
|
-
if (matchA[i]) {
|
|
1766
|
-
while (!matchB[k]) k++;
|
|
1767
|
-
if (a[i] !== b[k]) transpos++;
|
|
1768
|
-
k++;
|
|
1769
|
-
}
|
|
1770
|
-
}
|
|
1771
|
-
transpos /= 2;
|
|
1772
|
-
jaro =
|
|
1773
|
-
(matches / m + matches / n + (matches - transpos) / matches) / 3;
|
|
1774
|
-
for (let i = 0; i < Math.min(4, m, n); i++) {
|
|
1775
|
-
if (a[i] === b[i]) prefix++;
|
|
1776
|
-
else break;
|
|
1777
|
-
}
|
|
1778
|
-
res = jaro + prefix * 0.1 * (1 - jaro);
|
|
1779
|
-
}
|
|
1780
|
-
return {
|
|
1781
|
-
res: Metric.clamp(res),
|
|
1782
|
-
raw: { matchWindow, matches, transpos, jaro, prefix }
|
|
1783
|
-
};
|
|
1784
|
-
} finally {
|
|
1785
|
-
Pool.release('int32', matchA, m);
|
|
1786
|
-
Pool.release('int32', matchB, n);
|
|
1787
|
-
}
|
|
1855
|
+
exitEarly(code, i) {
|
|
1856
|
+
const { length = -1 } = this.options;
|
|
1857
|
+
return length > 0 && code.length >= length;
|
|
1788
1858
|
}
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
class LCSMetric extends Metric {
|
|
1793
|
-
constructor(a, b, opt = {}) {
|
|
1794
|
-
super('lcs', a, b, opt, true);
|
|
1859
|
+
adjustCode(code, chars) {
|
|
1860
|
+
return code;
|
|
1795
1861
|
}
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1862
|
+
loop(words) {
|
|
1863
|
+
return ErrorUtil.wrap(
|
|
1864
|
+
() => {
|
|
1865
|
+
const index = [];
|
|
1866
|
+
for (const word of words) {
|
|
1867
|
+
let key = Phonetic.cache.key(this.algo, [word]);
|
|
1868
|
+
if (key) key += this.optKey;
|
|
1869
|
+
const code =
|
|
1870
|
+
Phonetic.cache.get(key || '') ??
|
|
1871
|
+
(() => {
|
|
1872
|
+
const res = this.encode(word);
|
|
1873
|
+
if (key) Phonetic.cache.set(key, res);
|
|
1874
|
+
return res;
|
|
1875
|
+
})();
|
|
1876
|
+
if (code && code.length) index.push(this.equalLen(code));
|
|
1807
1877
|
}
|
|
1808
|
-
|
|
1809
|
-
}
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
raw: { lcs, maxLen }
|
|
1814
|
-
};
|
|
1815
|
-
} finally {
|
|
1816
|
-
Pool.release('int32', prev, len);
|
|
1817
|
-
Pool.release('int32', curr, len);
|
|
1818
|
-
}
|
|
1819
|
-
}
|
|
1820
|
-
}
|
|
1821
|
-
MetricRegistry.add('lcs', LCSMetric);
|
|
1822
|
-
|
|
1823
|
-
class LevenshteinDistance extends Metric {
|
|
1824
|
-
constructor(a, b, opt = {}) {
|
|
1825
|
-
super('levenshtein', a, b, opt, true);
|
|
1878
|
+
return index;
|
|
1879
|
+
},
|
|
1880
|
+
`Failed to generate phonetic index`,
|
|
1881
|
+
{ algo: this.algo, words }
|
|
1882
|
+
);
|
|
1826
1883
|
}
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
prev[i - 1] + cost
|
|
1884
|
+
async loopAsync(words) {
|
|
1885
|
+
return ErrorUtil.wrapAsync(
|
|
1886
|
+
async () => {
|
|
1887
|
+
const index = [];
|
|
1888
|
+
for (const word of words) {
|
|
1889
|
+
const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
|
|
1890
|
+
const code = await Promise.resolve(
|
|
1891
|
+
Phonetic.cache.get(key || '') ??
|
|
1892
|
+
(() => {
|
|
1893
|
+
const res = this.encode(word);
|
|
1894
|
+
if (key) Phonetic.cache.set(key, res);
|
|
1895
|
+
return res;
|
|
1896
|
+
})()
|
|
1841
1897
|
);
|
|
1898
|
+
if (code && code.length) index.push(this.equalLen(code));
|
|
1842
1899
|
}
|
|
1843
|
-
|
|
1844
|
-
}
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
raw: { dist, maxLen }
|
|
1849
|
-
};
|
|
1850
|
-
} finally {
|
|
1851
|
-
Pool.release('int32', prev, len);
|
|
1852
|
-
Pool.release('int32', curr, len);
|
|
1853
|
-
}
|
|
1854
|
-
}
|
|
1855
|
-
}
|
|
1856
|
-
MetricRegistry.add('levenshtein', LevenshteinDistance);
|
|
1857
|
-
|
|
1858
|
-
class NeedlemanWunschDistance extends Metric {
|
|
1859
|
-
constructor(a, b, opt = {}) {
|
|
1860
|
-
super('needlemanWunsch', a, b, opt, true);
|
|
1861
|
-
}
|
|
1862
|
-
compute(a, b, m, n, maxLen) {
|
|
1863
|
-
const { match = 1, mismatch = -1, gap = -1 } = this.options;
|
|
1864
|
-
const len = m + 1;
|
|
1865
|
-
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1866
|
-
try {
|
|
1867
|
-
prev[0] = 0;
|
|
1868
|
-
for (let i = 1; i <= m; i++) prev[i] = prev[i - 1] + gap;
|
|
1869
|
-
for (let j = 1; j <= n; j++) {
|
|
1870
|
-
curr[0] = prev[0] + gap;
|
|
1871
|
-
const cb = b.charCodeAt(j - 1);
|
|
1872
|
-
for (let i = 1; i <= m; i++) {
|
|
1873
|
-
const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
|
|
1874
|
-
curr[i] = Math.max(
|
|
1875
|
-
prev[i - 1] + score,
|
|
1876
|
-
prev[i] + gap,
|
|
1877
|
-
curr[i - 1] + gap
|
|
1878
|
-
);
|
|
1879
|
-
}
|
|
1880
|
-
prev.set(curr);
|
|
1881
|
-
}
|
|
1882
|
-
const score = prev[m];
|
|
1883
|
-
const denum = maxLen * match;
|
|
1884
|
-
return {
|
|
1885
|
-
res: denum === 0 ? 0 : Metric.clamp(score / denum),
|
|
1886
|
-
raw: { score, denum }
|
|
1887
|
-
};
|
|
1888
|
-
} finally {
|
|
1889
|
-
Pool.release('int32', prev, len);
|
|
1890
|
-
Pool.release('int32', curr, len);
|
|
1891
|
-
}
|
|
1892
|
-
}
|
|
1893
|
-
}
|
|
1894
|
-
MetricRegistry.add('needlemanWunsch', NeedlemanWunschDistance);
|
|
1895
|
-
|
|
1896
|
-
class QGramSimilarity extends Metric {
|
|
1897
|
-
constructor(a, b, opt = {}) {
|
|
1898
|
-
super('qGram', a, b, opt, true);
|
|
1899
|
-
}
|
|
1900
|
-
_qGrams(str, q) {
|
|
1901
|
-
const len = Math.max(0, str.length - q + 1);
|
|
1902
|
-
const grams = Pool.acquire('set', len);
|
|
1903
|
-
for (let i = 0; i < len; i++) grams.add(str.slice(i, i + q));
|
|
1904
|
-
return grams;
|
|
1900
|
+
return index;
|
|
1901
|
+
},
|
|
1902
|
+
`Failed to generate phonetic index asynchronously`,
|
|
1903
|
+
{ algo: this.algo, words }
|
|
1904
|
+
);
|
|
1905
1905
|
}
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
const setA = this._qGrams(a, q),
|
|
1909
|
-
setB = this._qGrams(b, q);
|
|
1910
|
-
const sizeA = setA.size,
|
|
1911
|
-
sizeB = setB.size;
|
|
1912
|
-
try {
|
|
1913
|
-
let intersection = 0;
|
|
1914
|
-
for (const gram of setA) if (setB.has(gram)) intersection++;
|
|
1915
|
-
const size = Math.max(sizeA, sizeB);
|
|
1916
|
-
return {
|
|
1917
|
-
res: size === 0 ? 1 : Metric.clamp(intersection / size),
|
|
1918
|
-
raw: { intersection, size }
|
|
1919
|
-
};
|
|
1920
|
-
} finally {
|
|
1921
|
-
Pool.release('set', setA, sizeA);
|
|
1922
|
-
Pool.release('set', setB, sizeB);
|
|
1923
|
-
}
|
|
1906
|
+
getAlgoName() {
|
|
1907
|
+
return this.algo;
|
|
1924
1908
|
}
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
1928
|
-
|
|
1929
|
-
|
|
1930
|
-
super('smithWaterman', a, b, opt, true);
|
|
1909
|
+
getIndex(input) {
|
|
1910
|
+
const { delimiter = ' ' } = this.options;
|
|
1911
|
+
return profiler$1.run(() =>
|
|
1912
|
+
this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
|
|
1913
|
+
);
|
|
1931
1914
|
}
|
|
1932
|
-
|
|
1933
|
-
const {
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
curr[0] = 0;
|
|
1941
|
-
const cb = b.charCodeAt(j - 1);
|
|
1942
|
-
for (let i = 1; i <= m; i++) {
|
|
1943
|
-
const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
|
|
1944
|
-
curr[i] = Math.max(
|
|
1945
|
-
0,
|
|
1946
|
-
prev[i - 1] + score,
|
|
1947
|
-
prev[i] + gap,
|
|
1948
|
-
curr[i - 1] + gap
|
|
1949
|
-
);
|
|
1950
|
-
if (curr[i] > maxScore) maxScore = curr[i];
|
|
1951
|
-
}
|
|
1952
|
-
prev.set(curr);
|
|
1953
|
-
}
|
|
1954
|
-
const denum = Math.min(m * match, n * match);
|
|
1955
|
-
return {
|
|
1956
|
-
res: denum === 0 ? 0 : Metric.clamp(maxScore / denum),
|
|
1957
|
-
raw: { score: maxScore, denum }
|
|
1958
|
-
};
|
|
1959
|
-
} finally {
|
|
1960
|
-
Pool.release('int32', prev, len);
|
|
1961
|
-
Pool.release('int32', curr, len);
|
|
1962
|
-
}
|
|
1915
|
+
async getIndexAsync(input) {
|
|
1916
|
+
const { delimiter = ' ' } = this.options;
|
|
1917
|
+
return (
|
|
1918
|
+
await profiler$1.runAsync(
|
|
1919
|
+
async () =>
|
|
1920
|
+
await this.loopAsync(input.split(delimiter).filter(Boolean))
|
|
1921
|
+
)
|
|
1922
|
+
).filter(Boolean);
|
|
1963
1923
|
}
|
|
1964
1924
|
}
|
|
1965
|
-
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
constructor(algo, opt = {}) {
|
|
1977
|
-
const defaults = this.constructor.default ?? {};
|
|
1978
|
-
const mapId = opt.map ?? defaults.map;
|
|
1979
|
-
if (!mapId)
|
|
1980
|
-
throw new CmpStrNotFoundError(
|
|
1981
|
-
`No mapping specified for phonetic algorithm`,
|
|
1982
|
-
{ algo }
|
|
1983
|
-
);
|
|
1984
|
-
const map = PhoneticMappingRegistry.get(algo, mapId);
|
|
1985
|
-
if (map === undefined)
|
|
1986
|
-
throw new CmpStrNotFoundError(
|
|
1987
|
-
`Requested mapping <${mapId}> is not declared`,
|
|
1988
|
-
{ algo, mapId }
|
|
1925
|
+
const PhoneticRegistry = Registry('phonetic', Phonetic);
|
|
1926
|
+
const PhoneticMappingRegistry = (() => {
|
|
1927
|
+
const mappings = Object.create(null);
|
|
1928
|
+
const maps = (algo) => (mappings[algo] ||= Object.create(null));
|
|
1929
|
+
return Object.freeze({
|
|
1930
|
+
add(algo, id, map, update = false) {
|
|
1931
|
+
const mappings = maps(algo);
|
|
1932
|
+
ErrorUtil.assert(
|
|
1933
|
+
!(!id || id in mappings) || update,
|
|
1934
|
+
`Entry <${id}> already exists / use <update=true> to overwrite`,
|
|
1935
|
+
{ algo, id }
|
|
1989
1936
|
);
|
|
1990
|
-
|
|
1991
|
-
|
|
1992
|
-
|
|
1993
|
-
|
|
1994
|
-
|
|
1995
|
-
|
|
1996
|
-
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
-
return word;
|
|
2004
|
-
}
|
|
2005
|
-
applyRules(char, i, chars, charLen) {
|
|
2006
|
-
const { ruleset = [] } = this.map;
|
|
2007
|
-
if (!ruleset || !ruleset.length) return undefined;
|
|
2008
|
-
const prev = chars[i - 1] || '',
|
|
2009
|
-
prev2 = chars[i - 2] || '';
|
|
2010
|
-
const next = chars[i + 1] || '',
|
|
2011
|
-
next2 = chars[i + 2] || '';
|
|
2012
|
-
for (const rule of ruleset) {
|
|
2013
|
-
if (rule.char && rule.char !== char) continue;
|
|
2014
|
-
if (rule.position === 'start' && i !== 0) continue;
|
|
2015
|
-
if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
|
|
2016
|
-
continue;
|
|
2017
|
-
if (rule.position === 'end' && i !== charLen) continue;
|
|
2018
|
-
if (rule.prev && !rule.prev.includes(prev)) continue;
|
|
2019
|
-
if (rule.prevNot && rule.prevNot.includes(prev)) continue;
|
|
2020
|
-
if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
|
|
2021
|
-
if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
|
|
2022
|
-
if (rule.next && !rule.next.includes(next)) continue;
|
|
2023
|
-
if (rule.nextNot && rule.nextNot.includes(next)) continue;
|
|
2024
|
-
if (rule.next2 && !rule.next2.includes(next2)) continue;
|
|
2025
|
-
if (rule.next2Not && rule.next2Not.includes(next2)) continue;
|
|
2026
|
-
if (
|
|
2027
|
-
rule.leading &&
|
|
2028
|
-
!rule.leading.includes(chars.slice(0, rule.leading.length).join(''))
|
|
2029
|
-
)
|
|
2030
|
-
continue;
|
|
2031
|
-
if (
|
|
2032
|
-
rule.trailing &&
|
|
2033
|
-
!rule.trailing.includes(chars.slice(-rule.trailing.length).join(''))
|
|
2034
|
-
)
|
|
2035
|
-
continue;
|
|
2036
|
-
if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
|
|
2037
|
-
continue;
|
|
2038
|
-
return rule.code;
|
|
1937
|
+
mappings[id] = map;
|
|
1938
|
+
},
|
|
1939
|
+
remove(algo, id) {
|
|
1940
|
+
delete maps(algo)[id];
|
|
1941
|
+
},
|
|
1942
|
+
has(algo, id) {
|
|
1943
|
+
return id in maps(algo);
|
|
1944
|
+
},
|
|
1945
|
+
get(algo, id) {
|
|
1946
|
+
return maps(algo)[id];
|
|
1947
|
+
},
|
|
1948
|
+
list(algo) {
|
|
1949
|
+
return Object.keys(maps(algo));
|
|
2039
1950
|
}
|
|
2040
|
-
|
|
1951
|
+
});
|
|
1952
|
+
})();
|
|
1953
|
+
|
|
1954
|
+
class Caverphone extends Phonetic {
|
|
1955
|
+
static REGEX = { uppercase: /[^A-Z]/gi };
|
|
1956
|
+
static default = {
|
|
1957
|
+
map: 'en2',
|
|
1958
|
+
delimiter: ' ',
|
|
1959
|
+
length: -1,
|
|
1960
|
+
pad: '',
|
|
1961
|
+
dedupe: false
|
|
1962
|
+
};
|
|
1963
|
+
constructor(opt = {}) {
|
|
1964
|
+
super('caverphone', opt);
|
|
2041
1965
|
}
|
|
2042
1966
|
encode(word) {
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
const chars = this.word2Chars(word);
|
|
2046
|
-
const charLen = chars.length;
|
|
2047
|
-
let code = '',
|
|
2048
|
-
lastCode = null;
|
|
2049
|
-
for (let i = 0; i < charLen; i++) {
|
|
2050
|
-
const char = chars[i];
|
|
2051
|
-
if (ignore.includes(char)) continue;
|
|
2052
|
-
const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
|
|
2053
|
-
if (mapped === undefined) continue;
|
|
2054
|
-
((code += mapped), (lastCode = mapped));
|
|
2055
|
-
if (this.exitEarly(code, i)) break;
|
|
2056
|
-
}
|
|
2057
|
-
return this.adjustCode(code, chars);
|
|
2058
|
-
}
|
|
2059
|
-
mapChar(char, i, chars, charLen, lastCode, map) {
|
|
2060
|
-
const { dedupe = true, fallback = undefined } = this.options;
|
|
2061
|
-
const c =
|
|
2062
|
-
this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
|
|
2063
|
-
return dedupe && c === lastCode ? undefined : c;
|
|
2064
|
-
}
|
|
2065
|
-
equalLen(input) {
|
|
2066
|
-
const { length = -1, pad = '0' } = this.options;
|
|
2067
|
-
return length === -1
|
|
2068
|
-
? input
|
|
2069
|
-
: (input + pad.repeat(length)).slice(0, length);
|
|
2070
|
-
}
|
|
2071
|
-
word2Chars = (word) => word.toLowerCase().split('');
|
|
2072
|
-
exitEarly(code, i) {
|
|
2073
|
-
const { length = -1 } = this.options;
|
|
2074
|
-
return length > 0 && code.length >= length;
|
|
2075
|
-
}
|
|
2076
|
-
adjustCode(code, chars) {
|
|
2077
|
-
return code;
|
|
2078
|
-
}
|
|
2079
|
-
loop(words) {
|
|
2080
|
-
return ErrorUtil.wrap(
|
|
2081
|
-
() => {
|
|
2082
|
-
const index = [];
|
|
2083
|
-
for (const word of words) {
|
|
2084
|
-
const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
|
|
2085
|
-
const code =
|
|
2086
|
-
Phonetic.cache.get(key || '') ??
|
|
2087
|
-
(() => {
|
|
2088
|
-
const res = this.encode(word);
|
|
2089
|
-
if (key) Phonetic.cache.set(key, res);
|
|
2090
|
-
return res;
|
|
2091
|
-
})();
|
|
2092
|
-
if (code && code.length) index.push(this.equalLen(code));
|
|
2093
|
-
}
|
|
2094
|
-
return index;
|
|
2095
|
-
},
|
|
2096
|
-
`Failed to generate phonetic index`,
|
|
2097
|
-
{ algo: this.algo, words }
|
|
2098
|
-
);
|
|
2099
|
-
}
|
|
2100
|
-
async loopAsync(words) {
|
|
2101
|
-
return ErrorUtil.wrapAsync(
|
|
2102
|
-
async () => {
|
|
2103
|
-
const index = [];
|
|
2104
|
-
for (const word of words) {
|
|
2105
|
-
const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
|
|
2106
|
-
const code = await Promise.resolve(
|
|
2107
|
-
Phonetic.cache.get(key || '') ??
|
|
2108
|
-
(() => {
|
|
2109
|
-
const res = this.encode(word);
|
|
2110
|
-
if (key) Phonetic.cache.set(key, res);
|
|
2111
|
-
return res;
|
|
2112
|
-
})()
|
|
2113
|
-
);
|
|
2114
|
-
if (code && code.length) index.push(this.equalLen(code));
|
|
2115
|
-
}
|
|
2116
|
-
return index;
|
|
2117
|
-
},
|
|
2118
|
-
`Failed to generate phonetic index asynchronously`,
|
|
2119
|
-
{ algo: this.algo, words }
|
|
2120
|
-
);
|
|
2121
|
-
}
|
|
2122
|
-
getAlgoName = () => this.algo;
|
|
2123
|
-
getIndex(input) {
|
|
2124
|
-
const { delimiter = ' ' } = this.options;
|
|
2125
|
-
return profiler$1.run(() =>
|
|
2126
|
-
this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
|
|
2127
|
-
);
|
|
2128
|
-
}
|
|
2129
|
-
async getIndexAsync(input) {
|
|
2130
|
-
const { delimiter = ' ' } = this.options;
|
|
2131
|
-
return (
|
|
2132
|
-
await profiler$1.runAsync(
|
|
2133
|
-
async () =>
|
|
2134
|
-
await this.loopAsync(input.split(delimiter).filter(Boolean))
|
|
2135
|
-
)
|
|
2136
|
-
).filter(Boolean);
|
|
2137
|
-
}
|
|
2138
|
-
}
|
|
2139
|
-
const PhoneticRegistry = Registry('phonetic', Phonetic);
|
|
2140
|
-
const PhoneticMappingRegistry = (() => {
|
|
2141
|
-
const mappings = Object.create(null);
|
|
2142
|
-
const maps = (algo) => (mappings[algo] ||= Object.create(null));
|
|
2143
|
-
return Object.freeze({
|
|
2144
|
-
add(algo, id, map, update = false) {
|
|
2145
|
-
const mappings = maps(algo);
|
|
2146
|
-
ErrorUtil.assert(
|
|
2147
|
-
!(!id || id in mappings) || update,
|
|
2148
|
-
`Entry <${id}> already exists / use <update=true> to overwrite`,
|
|
2149
|
-
{ algo, id }
|
|
2150
|
-
);
|
|
2151
|
-
mappings[id] = map;
|
|
2152
|
-
},
|
|
2153
|
-
remove(algo, id) {
|
|
2154
|
-
delete maps(algo)[id];
|
|
2155
|
-
},
|
|
2156
|
-
has(algo, id) {
|
|
2157
|
-
return id in maps(algo);
|
|
2158
|
-
},
|
|
2159
|
-
get(algo, id) {
|
|
2160
|
-
return maps(algo)[id];
|
|
2161
|
-
},
|
|
2162
|
-
list(algo) {
|
|
2163
|
-
return Object.keys(maps(algo));
|
|
2164
|
-
}
|
|
2165
|
-
});
|
|
2166
|
-
})();
|
|
2167
|
-
|
|
2168
|
-
class Caverphone extends Phonetic {
|
|
2169
|
-
static REGEX = { uppercase: /[^A-Z]/gi };
|
|
2170
|
-
static default = {
|
|
2171
|
-
map: 'en2',
|
|
2172
|
-
delimiter: ' ',
|
|
2173
|
-
length: -1,
|
|
2174
|
-
pad: '',
|
|
2175
|
-
dedupe: false
|
|
2176
|
-
};
|
|
2177
|
-
constructor(opt = {}) {
|
|
2178
|
-
super('caverphone', opt);
|
|
2179
|
-
}
|
|
2180
|
-
encode(word) {
|
|
2181
|
-
word = word.replace(Caverphone.REGEX.uppercase, '').toLowerCase();
|
|
2182
|
-
return super.encode(word);
|
|
1967
|
+
word = word.replace(Caverphone.REGEX.uppercase, '').toLowerCase();
|
|
1968
|
+
return super.encode(word);
|
|
2183
1969
|
}
|
|
2184
1970
|
mapChar = (char) => char;
|
|
2185
1971
|
adjustCode = (code) => code.toUpperCase();
|
|
@@ -2357,170 +2143,743 @@
|
|
|
2357
2143
|
constructor(opt = {}) {
|
|
2358
2144
|
super('metaphone', opt);
|
|
2359
2145
|
}
|
|
2360
|
-
encode(word) {
|
|
2361
|
-
word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
|
|
2362
|
-
c === 'C' ? m : c
|
|
2363
|
-
);
|
|
2364
|
-
return super.encode(word);
|
|
2146
|
+
encode(word) {
|
|
2147
|
+
word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
|
|
2148
|
+
c === 'C' ? m : c
|
|
2149
|
+
);
|
|
2150
|
+
return super.encode(word);
|
|
2151
|
+
}
|
|
2152
|
+
adjustCode(code) {
|
|
2153
|
+
return (
|
|
2154
|
+
code.slice(0, 1) + code.slice(1).replace(Metaphone.REGEX.vowel, '')
|
|
2155
|
+
);
|
|
2156
|
+
}
|
|
2157
|
+
}
|
|
2158
|
+
PhoneticRegistry.add('metaphone', Metaphone);
|
|
2159
|
+
PhoneticMappingRegistry.add('metaphone', 'en90', {
|
|
2160
|
+
map: {
|
|
2161
|
+
a: 'A',
|
|
2162
|
+
b: 'B',
|
|
2163
|
+
c: 'K',
|
|
2164
|
+
d: 'T',
|
|
2165
|
+
e: 'E',
|
|
2166
|
+
f: 'F',
|
|
2167
|
+
g: 'K',
|
|
2168
|
+
h: 'H',
|
|
2169
|
+
i: 'I',
|
|
2170
|
+
j: 'J',
|
|
2171
|
+
k: 'K',
|
|
2172
|
+
l: 'L',
|
|
2173
|
+
m: 'M',
|
|
2174
|
+
n: 'N',
|
|
2175
|
+
o: 'O',
|
|
2176
|
+
p: 'P',
|
|
2177
|
+
q: 'K',
|
|
2178
|
+
r: 'R',
|
|
2179
|
+
s: 'S',
|
|
2180
|
+
t: 'T',
|
|
2181
|
+
u: 'U',
|
|
2182
|
+
v: 'F',
|
|
2183
|
+
w: 'W',
|
|
2184
|
+
x: 'KS',
|
|
2185
|
+
y: 'Y',
|
|
2186
|
+
z: 'S'
|
|
2187
|
+
},
|
|
2188
|
+
ruleset: [
|
|
2189
|
+
{ char: 'a', position: 'start', next: ['e'], code: '' },
|
|
2190
|
+
{ char: 'g', position: 'start', next: ['n'], code: '' },
|
|
2191
|
+
{ char: 'k', position: 'start', next: ['n'], code: '' },
|
|
2192
|
+
{ char: 'p', position: 'start', next: ['n'], code: '' },
|
|
2193
|
+
{ char: 'w', position: 'start', next: ['r'], code: '' },
|
|
2194
|
+
{ char: 'b', position: 'end', prev: ['m'], code: '' },
|
|
2195
|
+
{ char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
|
|
2196
|
+
{ char: 'c', next: ['i'], next2: ['a'], code: 'X' },
|
|
2197
|
+
{ char: 'c', next: ['e', 'i', 'y'], code: 'S' },
|
|
2198
|
+
{ char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
|
|
2199
|
+
{
|
|
2200
|
+
char: 'g',
|
|
2201
|
+
next: ['h'],
|
|
2202
|
+
next2Not: ['', 'a', 'e', 'i', 'o', 'u'],
|
|
2203
|
+
code: ''
|
|
2204
|
+
},
|
|
2205
|
+
{ char: 'g', trailing: 'n', code: '' },
|
|
2206
|
+
{ char: 'g', trailing: 'ned', code: '' },
|
|
2207
|
+
{ char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
|
|
2208
|
+
{
|
|
2209
|
+
char: 'h',
|
|
2210
|
+
prev: ['a', 'e', 'i', 'o', 'u'],
|
|
2211
|
+
nextNot: ['a', 'e', 'i', 'o', 'u'],
|
|
2212
|
+
code: ''
|
|
2213
|
+
},
|
|
2214
|
+
{ char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
|
|
2215
|
+
{ char: 'k', prev: ['c'], code: '' },
|
|
2216
|
+
{ char: 'p', next: ['h'], code: 'F' },
|
|
2217
|
+
{ char: 's', next: ['h'], code: 'X' },
|
|
2218
|
+
{ char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2219
|
+
{ char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2220
|
+
{ char: 't', next: ['h'], code: '0' },
|
|
2221
|
+
{ char: 't', next: ['c'], next2: ['h'], code: '' },
|
|
2222
|
+
{ char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
|
|
2223
|
+
{ char: 'h', leading: 'w', code: '' },
|
|
2224
|
+
{ char: 'x', position: 'start', code: 'S' },
|
|
2225
|
+
{ char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
|
|
2226
|
+
]
|
|
2227
|
+
});
|
|
2228
|
+
|
|
2229
|
+
class Soundex extends Phonetic {
|
|
2230
|
+
static default = {
|
|
2231
|
+
map: 'en',
|
|
2232
|
+
delimiter: ' ',
|
|
2233
|
+
length: 4,
|
|
2234
|
+
pad: '0',
|
|
2235
|
+
dedupe: true
|
|
2236
|
+
};
|
|
2237
|
+
constructor(opt = {}) {
|
|
2238
|
+
super('soundex', opt);
|
|
2239
|
+
}
|
|
2240
|
+
adjustCode(code, chars) {
|
|
2241
|
+
return chars[0].toUpperCase() + code.slice(1).replaceAll('0', '');
|
|
2242
|
+
}
|
|
2243
|
+
}
|
|
2244
|
+
PhoneticRegistry.add('soundex', Soundex);
|
|
2245
|
+
PhoneticMappingRegistry.add('soundex', 'en', {
|
|
2246
|
+
map: {
|
|
2247
|
+
a: '0',
|
|
2248
|
+
e: '0',
|
|
2249
|
+
h: '0',
|
|
2250
|
+
i: '0',
|
|
2251
|
+
o: '0',
|
|
2252
|
+
u: '0',
|
|
2253
|
+
w: '0',
|
|
2254
|
+
y: '0',
|
|
2255
|
+
b: '1',
|
|
2256
|
+
f: '1',
|
|
2257
|
+
p: '1',
|
|
2258
|
+
v: '1',
|
|
2259
|
+
c: '2',
|
|
2260
|
+
g: '2',
|
|
2261
|
+
j: '2',
|
|
2262
|
+
k: '2',
|
|
2263
|
+
q: '2',
|
|
2264
|
+
s: '2',
|
|
2265
|
+
x: '2',
|
|
2266
|
+
z: '2',
|
|
2267
|
+
d: '3',
|
|
2268
|
+
t: '3',
|
|
2269
|
+
l: '4',
|
|
2270
|
+
m: '5',
|
|
2271
|
+
n: '5',
|
|
2272
|
+
r: '6'
|
|
2273
|
+
}
|
|
2274
|
+
});
|
|
2275
|
+
PhoneticMappingRegistry.add('soundex', 'de', {
|
|
2276
|
+
map: {
|
|
2277
|
+
a: '0',
|
|
2278
|
+
ä: '0',
|
|
2279
|
+
e: '0',
|
|
2280
|
+
h: '0',
|
|
2281
|
+
i: '0',
|
|
2282
|
+
j: '0',
|
|
2283
|
+
o: '0',
|
|
2284
|
+
ö: '0',
|
|
2285
|
+
u: '0',
|
|
2286
|
+
ü: '0',
|
|
2287
|
+
y: '0',
|
|
2288
|
+
b: '1',
|
|
2289
|
+
f: '1',
|
|
2290
|
+
p: '1',
|
|
2291
|
+
v: '1',
|
|
2292
|
+
w: '1',
|
|
2293
|
+
c: '2',
|
|
2294
|
+
g: '2',
|
|
2295
|
+
k: '2',
|
|
2296
|
+
q: '2',
|
|
2297
|
+
s: '2',
|
|
2298
|
+
ß: '2',
|
|
2299
|
+
x: '2',
|
|
2300
|
+
z: '2',
|
|
2301
|
+
d: '3',
|
|
2302
|
+
t: '3',
|
|
2303
|
+
l: '4',
|
|
2304
|
+
m: '5',
|
|
2305
|
+
n: '5',
|
|
2306
|
+
r: '6'
|
|
2307
|
+
},
|
|
2308
|
+
ruleset: [{ char: 'c', next: ['h'], code: '7' }]
|
|
2309
|
+
});
|
|
2310
|
+
|
|
2311
|
+
class OptionsValidator {
|
|
2312
|
+
static ALLOWED_FLAGS = new Set([
|
|
2313
|
+
'd',
|
|
2314
|
+
'u',
|
|
2315
|
+
'x',
|
|
2316
|
+
'w',
|
|
2317
|
+
't',
|
|
2318
|
+
'r',
|
|
2319
|
+
's',
|
|
2320
|
+
'k',
|
|
2321
|
+
'n',
|
|
2322
|
+
'i'
|
|
2323
|
+
]);
|
|
2324
|
+
static ALLOWED_OUTPUT = new Set(['orig', 'prep']);
|
|
2325
|
+
static ALLOWED_MODES = new Set(['default', 'batch', 'single', 'pairwise']);
|
|
2326
|
+
static ALLOWED_SORT = new Set(['asc', 'desc']);
|
|
2327
|
+
static PROCESSORS = {
|
|
2328
|
+
phonetic: (opt) => {
|
|
2329
|
+
if (!opt) return;
|
|
2330
|
+
OptionsValidator.validatePhoneticName(opt.algo);
|
|
2331
|
+
OptionsValidator.validatePhoneticOptions(opt.opt);
|
|
2332
|
+
}
|
|
2333
|
+
};
|
|
2334
|
+
static METRIC_OPT_MAP = {
|
|
2335
|
+
mode: (v) => OptionsValidator.validateMode(v),
|
|
2336
|
+
delimiter: (v) => OptionsValidator.validateString(v, 'opt.delimiter'),
|
|
2337
|
+
pad: (v) => OptionsValidator.validateString(v, 'opt.pad'),
|
|
2338
|
+
q: (v) => OptionsValidator.validateNumber(v, 'opt.q'),
|
|
2339
|
+
match: (v) => OptionsValidator.validateNumber(v, 'opt.match'),
|
|
2340
|
+
mismatch: (v) => OptionsValidator.validateNumber(v, 'opt.mismatch'),
|
|
2341
|
+
gap: (v) => OptionsValidator.validateNumber(v, 'opt.gap')
|
|
2342
|
+
};
|
|
2343
|
+
static PHONETIC_OPT_MAP = {
|
|
2344
|
+
map: (v) =>
|
|
2345
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.map'),
|
|
2346
|
+
delimiter: (v) =>
|
|
2347
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.delimiter'),
|
|
2348
|
+
length: (v) =>
|
|
2349
|
+
OptionsValidator.validateNumber(v, 'processors.phonetic.opt.length'),
|
|
2350
|
+
pad: (v) =>
|
|
2351
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.pad'),
|
|
2352
|
+
dedupe: (v) =>
|
|
2353
|
+
OptionsValidator.validateBoolean(v, 'processors.phonetic.opt.dedupe'),
|
|
2354
|
+
fallback: (v) =>
|
|
2355
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.fallback')
|
|
2356
|
+
};
|
|
2357
|
+
static CMPSTR_OPT_MAP = {
|
|
2358
|
+
raw: (v) => OptionsValidator.validateBoolean(v, 'raw'),
|
|
2359
|
+
removeZero: (v) => OptionsValidator.validateBoolean(v, 'removeZero'),
|
|
2360
|
+
safeEmpty: (v) => OptionsValidator.validateBoolean(v, 'safeEmpty'),
|
|
2361
|
+
flags: (v) => OptionsValidator.validateFlags(v),
|
|
2362
|
+
metric: (v) => OptionsValidator.validateMetricName(v),
|
|
2363
|
+
output: (v) => OptionsValidator.validateOutput(v),
|
|
2364
|
+
opt: (v) => OptionsValidator.validateMetricOptions(v),
|
|
2365
|
+
processors: (v) => OptionsValidator.validateProcessors(v),
|
|
2366
|
+
sort: (v) => OptionsValidator.validateSort(v, 'sort'),
|
|
2367
|
+
objectsOnly: (v) => OptionsValidator.validateBoolean(v, 'objectsOnly')
|
|
2368
|
+
};
|
|
2369
|
+
static set2string(set) {
|
|
2370
|
+
return Array.from(set).join(' | ');
|
|
2371
|
+
}
|
|
2372
|
+
static validateType(value, name, type) {
|
|
2373
|
+
if (value === undefined) return;
|
|
2374
|
+
if (typeof value !== type || (type === 'number' && Number.isNaN(value))) {
|
|
2375
|
+
throw new CmpStrValidationError(
|
|
2376
|
+
`Invalid option <${name}>: expected ${type}`,
|
|
2377
|
+
{ name, value }
|
|
2378
|
+
);
|
|
2379
|
+
}
|
|
2380
|
+
}
|
|
2381
|
+
static validateEnum(value, name, set) {
|
|
2382
|
+
if (value === undefined) return;
|
|
2383
|
+
if (typeof value !== 'string' || !set.has(value)) {
|
|
2384
|
+
throw new CmpStrValidationError(
|
|
2385
|
+
`Invalid option <${name}>: expected ${OptionsValidator.set2string(set)}`,
|
|
2386
|
+
{ name, value }
|
|
2387
|
+
);
|
|
2388
|
+
}
|
|
2389
|
+
}
|
|
2390
|
+
static validateMap(opt, map) {
|
|
2391
|
+
if (!opt) return;
|
|
2392
|
+
for (const k in opt) {
|
|
2393
|
+
const fn = map[k];
|
|
2394
|
+
if (!fn)
|
|
2395
|
+
throw new CmpStrValidationError(`Invalid option <${k}>`, {
|
|
2396
|
+
option: k,
|
|
2397
|
+
value: map[k]
|
|
2398
|
+
});
|
|
2399
|
+
fn(opt[k]);
|
|
2400
|
+
}
|
|
2401
|
+
}
|
|
2402
|
+
static validateRegistryName(value, name, label, has, list) {
|
|
2403
|
+
if (value === undefined) return;
|
|
2404
|
+
if (typeof value !== 'string' || value.length === 0)
|
|
2405
|
+
throw new CmpStrValidationError(
|
|
2406
|
+
`Invalid option <${name}>: expected non-empty string`,
|
|
2407
|
+
{ name, value }
|
|
2408
|
+
);
|
|
2409
|
+
if (!has(value))
|
|
2410
|
+
throw new CmpStrValidationError(
|
|
2411
|
+
`${label} <${value}> is not registered`,
|
|
2412
|
+
{ name, value, available: list() }
|
|
2413
|
+
);
|
|
2414
|
+
}
|
|
2415
|
+
static validateBoolean(value, name) {
|
|
2416
|
+
OptionsValidator.validateType(value, name, 'boolean');
|
|
2417
|
+
}
|
|
2418
|
+
static validateNumber(value, name) {
|
|
2419
|
+
OptionsValidator.validateType(value, name, 'number');
|
|
2420
|
+
}
|
|
2421
|
+
static validateString(value, name) {
|
|
2422
|
+
OptionsValidator.validateType(value, name, 'string');
|
|
2423
|
+
}
|
|
2424
|
+
static validateFlags(value) {
|
|
2425
|
+
if (value === undefined) return;
|
|
2426
|
+
if (typeof value !== 'string')
|
|
2427
|
+
throw new CmpStrValidationError(
|
|
2428
|
+
`Invalid option <flags>: expected string`,
|
|
2429
|
+
{ flags: value }
|
|
2430
|
+
);
|
|
2431
|
+
for (let i = 0; i < value.length; i++) {
|
|
2432
|
+
const ch = value[i];
|
|
2433
|
+
if (!OptionsValidator.ALLOWED_FLAGS.has(ch))
|
|
2434
|
+
throw new CmpStrValidationError(
|
|
2435
|
+
`Invalid normalization flag <${ch}> in <flags>: expected ${OptionsValidator.set2string(OptionsValidator.ALLOWED_FLAGS)}`,
|
|
2436
|
+
{ flags: value, invalid: ch }
|
|
2437
|
+
);
|
|
2438
|
+
}
|
|
2439
|
+
}
|
|
2440
|
+
static validateOutput(value) {
|
|
2441
|
+
OptionsValidator.validateEnum(
|
|
2442
|
+
value,
|
|
2443
|
+
'output',
|
|
2444
|
+
OptionsValidator.ALLOWED_OUTPUT
|
|
2445
|
+
);
|
|
2446
|
+
}
|
|
2447
|
+
static validateMode(value) {
|
|
2448
|
+
OptionsValidator.validateEnum(
|
|
2449
|
+
value,
|
|
2450
|
+
'mode',
|
|
2451
|
+
OptionsValidator.ALLOWED_MODES
|
|
2452
|
+
);
|
|
2453
|
+
}
|
|
2454
|
+
static validateSort(value, name) {
|
|
2455
|
+
if (value === undefined || typeof value === 'boolean') return;
|
|
2456
|
+
OptionsValidator.validateEnum(value, name, OptionsValidator.ALLOWED_SORT);
|
|
2457
|
+
}
|
|
2458
|
+
static validateMetricName(value) {
|
|
2459
|
+
OptionsValidator.validateRegistryName(
|
|
2460
|
+
value,
|
|
2461
|
+
'metric',
|
|
2462
|
+
'Comparison metric',
|
|
2463
|
+
MetricRegistry.has,
|
|
2464
|
+
MetricRegistry.list
|
|
2465
|
+
);
|
|
2466
|
+
}
|
|
2467
|
+
static validatePhoneticName(value) {
|
|
2468
|
+
OptionsValidator.validateRegistryName(
|
|
2469
|
+
value,
|
|
2470
|
+
'phonetic',
|
|
2471
|
+
'Phonetic algorithm',
|
|
2472
|
+
PhoneticRegistry.has,
|
|
2473
|
+
PhoneticRegistry.list
|
|
2474
|
+
);
|
|
2475
|
+
}
|
|
2476
|
+
static validateMetricOptions(opt) {
|
|
2477
|
+
OptionsValidator.validateMap(opt, OptionsValidator.METRIC_OPT_MAP);
|
|
2478
|
+
}
|
|
2479
|
+
static validatePhoneticOptions(opt) {
|
|
2480
|
+
OptionsValidator.validateMap(opt, OptionsValidator.PHONETIC_OPT_MAP);
|
|
2481
|
+
}
|
|
2482
|
+
static validateProcessors(opt) {
|
|
2483
|
+
if (!opt) return;
|
|
2484
|
+
for (const key in opt) {
|
|
2485
|
+
const fn = OptionsValidator.PROCESSORS[key];
|
|
2486
|
+
if (!fn)
|
|
2487
|
+
throw new CmpStrValidationError(
|
|
2488
|
+
`Invalid processor type <${key}> in <processors>: expected ${Object.keys(OptionsValidator.PROCESSORS).join(' | ')}`,
|
|
2489
|
+
{ processors: opt, invalid: key }
|
|
2490
|
+
);
|
|
2491
|
+
fn(opt[key]);
|
|
2492
|
+
}
|
|
2493
|
+
}
|
|
2494
|
+
static validateOptions(opt) {
|
|
2495
|
+
OptionsValidator.validateMap(opt, OptionsValidator.CMPSTR_OPT_MAP);
|
|
2496
|
+
}
|
|
2497
|
+
}
|
|
2498
|
+
|
|
2499
|
+
class StructuredData {
|
|
2500
|
+
data;
|
|
2501
|
+
key;
|
|
2502
|
+
static SORT_ASC = (a, b) => a.res - b.res;
|
|
2503
|
+
static SORT_DESC = (a, b) => b.res - a.res;
|
|
2504
|
+
static create(data, key) {
|
|
2505
|
+
return new StructuredData(data, key);
|
|
2506
|
+
}
|
|
2507
|
+
constructor(data, key) {
|
|
2508
|
+
this.data = data;
|
|
2509
|
+
this.key = key;
|
|
2510
|
+
}
|
|
2511
|
+
extractFrom(arr, key) {
|
|
2512
|
+
const n = arr.length;
|
|
2513
|
+
const result = new Array(n);
|
|
2514
|
+
for (let i = 0; i < n; i++) {
|
|
2515
|
+
const val = arr[i][key];
|
|
2516
|
+
result[i] = val != null ? String(val) : '';
|
|
2517
|
+
}
|
|
2518
|
+
return result;
|
|
2519
|
+
}
|
|
2520
|
+
extract() {
|
|
2521
|
+
return this.extractFrom(this.data, this.key);
|
|
2522
|
+
}
|
|
2523
|
+
isMetricResult(v) {
|
|
2524
|
+
return (
|
|
2525
|
+
typeof v === 'object' &&
|
|
2526
|
+
v !== null &&
|
|
2527
|
+
'a' in v &&
|
|
2528
|
+
'b' in v &&
|
|
2529
|
+
'res' in v
|
|
2530
|
+
);
|
|
2531
|
+
}
|
|
2532
|
+
isCmpStrResult(v) {
|
|
2533
|
+
return (
|
|
2534
|
+
typeof v === 'object' &&
|
|
2535
|
+
v !== null &&
|
|
2536
|
+
'source' in v &&
|
|
2537
|
+
'target' in v &&
|
|
2538
|
+
'match' in v
|
|
2539
|
+
);
|
|
2540
|
+
}
|
|
2541
|
+
normalizeResults(results) {
|
|
2542
|
+
if (!Array.isArray(results) || results.length === 0) return [];
|
|
2543
|
+
const first = results[0];
|
|
2544
|
+
let out = new Array(results.length);
|
|
2545
|
+
if (this.isMetricResult(first)) {
|
|
2546
|
+
const src = results;
|
|
2547
|
+
for (let i = 0; i < src.length; i++) out[i] = { ...src[i], __idx: i };
|
|
2548
|
+
} else if (this.isCmpStrResult(first)) {
|
|
2549
|
+
const src = results;
|
|
2550
|
+
for (let i = 0; i < src.length; i++) {
|
|
2551
|
+
const r = src[i];
|
|
2552
|
+
out[i] = {
|
|
2553
|
+
metric: 'unknown',
|
|
2554
|
+
a: r.source,
|
|
2555
|
+
b: r.target,
|
|
2556
|
+
res: r.match,
|
|
2557
|
+
raw: r.raw,
|
|
2558
|
+
__idx: i
|
|
2559
|
+
};
|
|
2560
|
+
}
|
|
2561
|
+
} else
|
|
2562
|
+
throw new CmpStrValidationError(
|
|
2563
|
+
'Unsupported result format for StructuredData normalization.'
|
|
2564
|
+
);
|
|
2565
|
+
return out;
|
|
2566
|
+
}
|
|
2567
|
+
rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
|
|
2568
|
+
const m = extractedStrings.length,
|
|
2569
|
+
n = results.length;
|
|
2570
|
+
const stringToIndices = Pool.acquire('map', m);
|
|
2571
|
+
const occurrenceCount = Pool.acquire('map', n);
|
|
2572
|
+
const output = new Array(n);
|
|
2573
|
+
stringToIndices.clear();
|
|
2574
|
+
occurrenceCount.clear();
|
|
2575
|
+
try {
|
|
2576
|
+
for (let i = 0; i < m; i++) {
|
|
2577
|
+
const str = extractedStrings[i];
|
|
2578
|
+
let arr = stringToIndices.get(str);
|
|
2579
|
+
if (!arr) {
|
|
2580
|
+
arr = [];
|
|
2581
|
+
stringToIndices.set(str, arr);
|
|
2582
|
+
}
|
|
2583
|
+
arr.push(i);
|
|
2584
|
+
}
|
|
2585
|
+
let out = 0;
|
|
2586
|
+
for (let i = 0; i < n; i++) {
|
|
2587
|
+
const result = results[i];
|
|
2588
|
+
if (removeZero && result.res === 0) continue;
|
|
2589
|
+
const targetStr = result.b || '';
|
|
2590
|
+
const indices = stringToIndices.get(targetStr);
|
|
2591
|
+
let dataIndex;
|
|
2592
|
+
if (indices && indices.length > 0) {
|
|
2593
|
+
const occurrence = occurrenceCount.get(targetStr) ?? 0;
|
|
2594
|
+
occurrenceCount.set(targetStr, occurrence + 1);
|
|
2595
|
+
dataIndex = indices[occurrence % indices.length];
|
|
2596
|
+
} else {
|
|
2597
|
+
dataIndex = result.__idx ?? i;
|
|
2598
|
+
}
|
|
2599
|
+
if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
|
|
2600
|
+
const sourceObj = sourceData[dataIndex];
|
|
2601
|
+
const mappedTarget = extractedStrings[dataIndex] || targetStr;
|
|
2602
|
+
if (objectsOnly) output[out++] = sourceObj;
|
|
2603
|
+
else
|
|
2604
|
+
output[out++] = {
|
|
2605
|
+
obj: sourceObj,
|
|
2606
|
+
key: this.key,
|
|
2607
|
+
result: {
|
|
2608
|
+
source: result.a,
|
|
2609
|
+
target: mappedTarget,
|
|
2610
|
+
match: result.res
|
|
2611
|
+
},
|
|
2612
|
+
...(result.raw ? { raw: result.raw } : null)
|
|
2613
|
+
};
|
|
2614
|
+
}
|
|
2615
|
+
output.length = out;
|
|
2616
|
+
return output;
|
|
2617
|
+
} finally {
|
|
2618
|
+
Pool.release('map', stringToIndices, m);
|
|
2619
|
+
Pool.release('map', occurrenceCount, n);
|
|
2620
|
+
}
|
|
2621
|
+
}
|
|
2622
|
+
sort(results, sort) {
|
|
2623
|
+
if (!sort || results.length <= 1) return results;
|
|
2624
|
+
return results.sort(
|
|
2625
|
+
sort === 'asc' ? StructuredData.SORT_ASC : StructuredData.SORT_DESC
|
|
2626
|
+
);
|
|
2627
|
+
}
|
|
2628
|
+
finalizeLookup(results, extractedStrings, opt) {
|
|
2629
|
+
return this.rebuild(
|
|
2630
|
+
this.sort(this.normalizeResults(results), opt?.sort),
|
|
2631
|
+
this.data,
|
|
2632
|
+
extractedStrings,
|
|
2633
|
+
opt?.removeZero,
|
|
2634
|
+
opt?.objectsOnly
|
|
2635
|
+
);
|
|
2636
|
+
}
|
|
2637
|
+
performLookup(fn, extractedStrings, opt) {
|
|
2638
|
+
return ErrorUtil.wrap(
|
|
2639
|
+
() => this.finalizeLookup(fn(), extractedStrings, opt),
|
|
2640
|
+
'StructuredData lookup failed',
|
|
2641
|
+
{ key: this.key }
|
|
2642
|
+
);
|
|
2643
|
+
}
|
|
2644
|
+
async performLookupAsync(fn, extractedStrings, opt) {
|
|
2645
|
+
return await ErrorUtil.wrapAsync(
|
|
2646
|
+
async () => this.finalizeLookup(await fn(), extractedStrings, opt),
|
|
2647
|
+
'StructuredData async lookup failed',
|
|
2648
|
+
{ key: this.key }
|
|
2649
|
+
);
|
|
2650
|
+
}
|
|
2651
|
+
lookup(fn, query, opt) {
|
|
2652
|
+
const b = this.extract();
|
|
2653
|
+
try {
|
|
2654
|
+
return this.performLookup(() => fn(query, b, opt), b, opt);
|
|
2655
|
+
} finally {
|
|
2656
|
+
Pool.release('string[]', b, b.length);
|
|
2657
|
+
}
|
|
2658
|
+
}
|
|
2659
|
+
async lookupAsync(fn, query, opt) {
|
|
2660
|
+
const b = this.extract();
|
|
2661
|
+
try {
|
|
2662
|
+
return await this.performLookupAsync(() => fn(query, b, opt), b, opt);
|
|
2663
|
+
} finally {
|
|
2664
|
+
Pool.release('string[]', b, b.length);
|
|
2665
|
+
}
|
|
2666
|
+
}
|
|
2667
|
+
lookupPairs(fn, other, otherKey, opt) {
|
|
2668
|
+
const a = this.extract();
|
|
2669
|
+
const b = this.extractFrom(other, otherKey);
|
|
2670
|
+
try {
|
|
2671
|
+
return this.performLookup(() => fn(a, b, opt), a, opt);
|
|
2672
|
+
} finally {
|
|
2673
|
+
Pool.release('string[]', a, a.length);
|
|
2674
|
+
Pool.release('string[]', b, b.length);
|
|
2675
|
+
}
|
|
2676
|
+
}
|
|
2677
|
+
async lookupPairsAsync(fn, other, otherKey, opt) {
|
|
2678
|
+
const a = this.extract();
|
|
2679
|
+
const b = this.extractFrom(other, otherKey);
|
|
2680
|
+
try {
|
|
2681
|
+
return await this.performLookupAsync(() => fn(a, b, opt), a, opt);
|
|
2682
|
+
} finally {
|
|
2683
|
+
Pool.release('string[]', a, a.length);
|
|
2684
|
+
Pool.release('string[]', b, b.length);
|
|
2685
|
+
}
|
|
2686
|
+
}
|
|
2687
|
+
}
|
|
2688
|
+
|
|
2689
|
+
class TextAnalyzer {
|
|
2690
|
+
static REGEX = {
|
|
2691
|
+
number: /\d/,
|
|
2692
|
+
sentence: /(?<=[.!?])\s+/,
|
|
2693
|
+
word: /\p{L}+/gu,
|
|
2694
|
+
nonWord: /[^\p{L}]/gu,
|
|
2695
|
+
vowelGroup: /[aeiouy]+/g,
|
|
2696
|
+
letter: /\p{L}/gu,
|
|
2697
|
+
ucLetter: /\p{Lu}/gu
|
|
2698
|
+
};
|
|
2699
|
+
text;
|
|
2700
|
+
words = [];
|
|
2701
|
+
sentences = [];
|
|
2702
|
+
charFrequency = new Map();
|
|
2703
|
+
wordHistogram = new Map();
|
|
2704
|
+
syllableCache = new Map();
|
|
2705
|
+
syllableStats;
|
|
2706
|
+
constructor(input) {
|
|
2707
|
+
this.text = input.trim();
|
|
2708
|
+
this.tokenize();
|
|
2709
|
+
this.computeFrequencies();
|
|
2710
|
+
}
|
|
2711
|
+
tokenize() {
|
|
2712
|
+
let match;
|
|
2713
|
+
const lcText = this.text.toLowerCase();
|
|
2714
|
+
while ((match = TextAnalyzer.REGEX.word.exec(lcText)) !== null)
|
|
2715
|
+
this.words.push(match[0]);
|
|
2716
|
+
this.sentences = this.text
|
|
2717
|
+
.split(TextAnalyzer.REGEX.sentence)
|
|
2718
|
+
.filter(Boolean);
|
|
2719
|
+
}
|
|
2720
|
+
computeFrequencies() {
|
|
2721
|
+
for (const char of this.text)
|
|
2722
|
+
this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
|
|
2723
|
+
for (const word of this.words)
|
|
2724
|
+
this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
|
|
2725
|
+
}
|
|
2726
|
+
estimateSyllables(word) {
|
|
2727
|
+
const clean = word
|
|
2728
|
+
.normalize('NFC')
|
|
2729
|
+
.toLowerCase()
|
|
2730
|
+
.replace(TextAnalyzer.REGEX.nonWord, '');
|
|
2731
|
+
if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
|
|
2732
|
+
const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
|
|
2733
|
+
const count = matches ? matches.length : 1;
|
|
2734
|
+
this.syllableCache.set(clean, count);
|
|
2735
|
+
return count;
|
|
2736
|
+
}
|
|
2737
|
+
computeSyllableStats() {
|
|
2738
|
+
return (this.syllableStats ||= (() => {
|
|
2739
|
+
const perWord = this.words
|
|
2740
|
+
.map((w) => this.estimateSyllables(w))
|
|
2741
|
+
.sort((a, b) => a - b);
|
|
2742
|
+
const total = perWord.reduce((sum, s) => sum + s, 0);
|
|
2743
|
+
const mono = perWord.filter((s) => s === 1).length;
|
|
2744
|
+
const median = !perWord.length
|
|
2745
|
+
? 0
|
|
2746
|
+
: perWord.length % 2 === 0
|
|
2747
|
+
? (perWord[perWord.length / 2 - 1] + perWord[perWord.length / 2]) /
|
|
2748
|
+
2
|
|
2749
|
+
: perWord[Math.floor(perWord.length / 2)];
|
|
2750
|
+
return {
|
|
2751
|
+
total,
|
|
2752
|
+
mono,
|
|
2753
|
+
perWord,
|
|
2754
|
+
avg: perWord.length ? total / perWord.length : 0,
|
|
2755
|
+
median
|
|
2756
|
+
};
|
|
2757
|
+
})());
|
|
2758
|
+
}
|
|
2759
|
+
getLength = () => this.text.length;
|
|
2760
|
+
getWordCount = () => this.words.length;
|
|
2761
|
+
getSentenceCount = () => this.sentences.length;
|
|
2762
|
+
getAvgWordLength() {
|
|
2763
|
+
return this.words.length
|
|
2764
|
+
? this.words.join('').length / this.words.length
|
|
2765
|
+
: 0;
|
|
2766
|
+
}
|
|
2767
|
+
getAvgSentenceLength() {
|
|
2768
|
+
return this.sentences.length
|
|
2769
|
+
? this.words.length / this.sentences.length
|
|
2770
|
+
: 0;
|
|
2771
|
+
}
|
|
2772
|
+
getWordHistogram() {
|
|
2773
|
+
return Object.fromEntries(this.wordHistogram);
|
|
2774
|
+
}
|
|
2775
|
+
getMostCommonWords(limit = 5) {
|
|
2776
|
+
return [...this.wordHistogram.entries()]
|
|
2777
|
+
.sort((a, b) => b[1] - a[1])
|
|
2778
|
+
.slice(0, limit)
|
|
2779
|
+
.map((e) => e[0]);
|
|
2780
|
+
}
|
|
2781
|
+
getHapaxLegomena() {
|
|
2782
|
+
return [...this.wordHistogram.entries()]
|
|
2783
|
+
.filter(([, c]) => c === 1)
|
|
2784
|
+
.map((e) => e[0]);
|
|
2785
|
+
}
|
|
2786
|
+
hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);
|
|
2787
|
+
getUpperCaseRatio() {
|
|
2788
|
+
const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
|
|
2789
|
+
const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
|
|
2790
|
+
return matches.length ? upper / matches.length : 0;
|
|
2791
|
+
}
|
|
2792
|
+
getCharFrequency() {
|
|
2793
|
+
return Object.fromEntries(this.charFrequency);
|
|
2794
|
+
}
|
|
2795
|
+
getUnicodeCodepoints() {
|
|
2796
|
+
const result = {};
|
|
2797
|
+
for (const [char, count] of this.charFrequency) {
|
|
2798
|
+
const block = char
|
|
2799
|
+
.charCodeAt(0)
|
|
2800
|
+
.toString(16)
|
|
2801
|
+
.padStart(4, '0')
|
|
2802
|
+
.toUpperCase();
|
|
2803
|
+
result[block] = (result[block] || 0) + count;
|
|
2804
|
+
}
|
|
2805
|
+
return result;
|
|
2806
|
+
}
|
|
2807
|
+
getLongWordRatio(len = 7) {
|
|
2808
|
+
let long = 0;
|
|
2809
|
+
for (const w of this.words) if (w.length >= len) long++;
|
|
2810
|
+
return this.words.length ? long / this.words.length : 0;
|
|
2811
|
+
}
|
|
2812
|
+
getShortWordRatio(len = 3) {
|
|
2813
|
+
let short = 0;
|
|
2814
|
+
for (const w of this.words) if (w.length <= len) short++;
|
|
2815
|
+
return this.words.length ? short / this.words.length : 0;
|
|
2816
|
+
}
|
|
2817
|
+
getSyllablesCount() {
|
|
2818
|
+
return this.computeSyllableStats().total;
|
|
2819
|
+
}
|
|
2820
|
+
getMonosyllabicWordCount() {
|
|
2821
|
+
return this.computeSyllableStats().mono;
|
|
2822
|
+
}
|
|
2823
|
+
getMinSyllablesWordCount(min) {
|
|
2824
|
+
return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
|
|
2825
|
+
}
|
|
2826
|
+
getMaxSyllablesWordCount(max) {
|
|
2827
|
+
return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
|
|
2828
|
+
}
|
|
2829
|
+
getAvgSyllablesPerWord() {
|
|
2830
|
+
return this.computeSyllableStats().avg;
|
|
2831
|
+
}
|
|
2832
|
+
getMedianSyllablesPerWord() {
|
|
2833
|
+
return this.computeSyllableStats().median;
|
|
2365
2834
|
}
|
|
2366
|
-
|
|
2367
|
-
|
|
2368
|
-
|
|
2369
|
-
|
|
2835
|
+
getHonoresR() {
|
|
2836
|
+
try {
|
|
2837
|
+
return (
|
|
2838
|
+
(100 * Math.log(this.words.length)) /
|
|
2839
|
+
(1 - this.getHapaxLegomena().length / (this.wordHistogram.size ?? 1))
|
|
2840
|
+
);
|
|
2841
|
+
} catch {
|
|
2842
|
+
return 0;
|
|
2843
|
+
}
|
|
2370
2844
|
}
|
|
2371
|
-
|
|
2372
|
-
|
|
2373
|
-
PhoneticMappingRegistry.add('metaphone', 'en90', {
|
|
2374
|
-
map: {
|
|
2375
|
-
a: 'A',
|
|
2376
|
-
b: 'B',
|
|
2377
|
-
c: 'K',
|
|
2378
|
-
d: 'T',
|
|
2379
|
-
e: 'E',
|
|
2380
|
-
f: 'F',
|
|
2381
|
-
g: 'K',
|
|
2382
|
-
h: 'H',
|
|
2383
|
-
i: 'I',
|
|
2384
|
-
j: 'J',
|
|
2385
|
-
k: 'K',
|
|
2386
|
-
l: 'L',
|
|
2387
|
-
m: 'M',
|
|
2388
|
-
n: 'N',
|
|
2389
|
-
o: 'O',
|
|
2390
|
-
p: 'P',
|
|
2391
|
-
q: 'K',
|
|
2392
|
-
r: 'R',
|
|
2393
|
-
s: 'S',
|
|
2394
|
-
t: 'T',
|
|
2395
|
-
u: 'U',
|
|
2396
|
-
v: 'F',
|
|
2397
|
-
w: 'W',
|
|
2398
|
-
x: 'KS',
|
|
2399
|
-
y: 'Y',
|
|
2400
|
-
z: 'S'
|
|
2401
|
-
},
|
|
2402
|
-
ruleset: [
|
|
2403
|
-
{ char: 'a', position: 'start', next: ['e'], code: '' },
|
|
2404
|
-
{ char: 'g', position: 'start', next: ['n'], code: '' },
|
|
2405
|
-
{ char: 'k', position: 'start', next: ['n'], code: '' },
|
|
2406
|
-
{ char: 'p', position: 'start', next: ['n'], code: '' },
|
|
2407
|
-
{ char: 'w', position: 'start', next: ['r'], code: '' },
|
|
2408
|
-
{ char: 'b', position: 'end', prev: ['m'], code: '' },
|
|
2409
|
-
{ char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
|
|
2410
|
-
{ char: 'c', next: ['i'], next2: ['a'], code: 'X' },
|
|
2411
|
-
{ char: 'c', next: ['e', 'i', 'y'], code: 'S' },
|
|
2412
|
-
{ char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
|
|
2413
|
-
{
|
|
2414
|
-
char: 'g',
|
|
2415
|
-
next: ['h'],
|
|
2416
|
-
next2Not: ['', 'a', 'e', 'i', 'o', 'u'],
|
|
2417
|
-
code: ''
|
|
2418
|
-
},
|
|
2419
|
-
{ char: 'g', trailing: 'n', code: '' },
|
|
2420
|
-
{ char: 'g', trailing: 'ned', code: '' },
|
|
2421
|
-
{ char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
|
|
2422
|
-
{
|
|
2423
|
-
char: 'h',
|
|
2424
|
-
prev: ['a', 'e', 'i', 'o', 'u'],
|
|
2425
|
-
nextNot: ['a', 'e', 'i', 'o', 'u'],
|
|
2426
|
-
code: ''
|
|
2427
|
-
},
|
|
2428
|
-
{ char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
|
|
2429
|
-
{ char: 'k', prev: ['c'], code: '' },
|
|
2430
|
-
{ char: 'p', next: ['h'], code: 'F' },
|
|
2431
|
-
{ char: 's', next: ['h'], code: 'X' },
|
|
2432
|
-
{ char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2433
|
-
{ char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2434
|
-
{ char: 't', next: ['h'], code: '0' },
|
|
2435
|
-
{ char: 't', next: ['c'], next2: ['h'], code: '' },
|
|
2436
|
-
{ char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
|
|
2437
|
-
{ char: 'h', leading: 'w', code: '' },
|
|
2438
|
-
{ char: 'x', position: 'start', code: 'S' },
|
|
2439
|
-
{ char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
|
|
2440
|
-
]
|
|
2441
|
-
});
|
|
2442
|
-
|
|
2443
|
-
class Soundex extends Phonetic {
|
|
2444
|
-
static default = {
|
|
2445
|
-
map: 'en',
|
|
2446
|
-
delimiter: ' ',
|
|
2447
|
-
length: 4,
|
|
2448
|
-
pad: '0',
|
|
2449
|
-
dedupe: true
|
|
2450
|
-
};
|
|
2451
|
-
constructor(opt = {}) {
|
|
2452
|
-
super('soundex', opt);
|
|
2845
|
+
getReadingTime(wpm = 200) {
|
|
2846
|
+
return this.words.length / (wpm ?? 1);
|
|
2453
2847
|
}
|
|
2454
|
-
|
|
2455
|
-
|
|
2848
|
+
getReadabilityScore(metric = 'flesch') {
|
|
2849
|
+
const w = this.words.length || 1;
|
|
2850
|
+
const s = this.sentences.length || 1;
|
|
2851
|
+
const y = this.getSyllablesCount() || 1;
|
|
2852
|
+
const asl = w / s;
|
|
2853
|
+
const asw = y / w;
|
|
2854
|
+
switch (metric) {
|
|
2855
|
+
case 'flesch':
|
|
2856
|
+
return 206.835 - 1.015 * asl - 84.6 * asw;
|
|
2857
|
+
case 'fleschde':
|
|
2858
|
+
return 180 - asl - 58.5 * asw;
|
|
2859
|
+
case 'kincaid':
|
|
2860
|
+
return 0.39 * asl + 11.8 * asw - 15.59;
|
|
2861
|
+
}
|
|
2456
2862
|
}
|
|
2457
|
-
|
|
2458
|
-
|
|
2459
|
-
|
|
2460
|
-
|
|
2461
|
-
|
|
2462
|
-
e: '0',
|
|
2463
|
-
h: '0',
|
|
2464
|
-
i: '0',
|
|
2465
|
-
o: '0',
|
|
2466
|
-
u: '0',
|
|
2467
|
-
w: '0',
|
|
2468
|
-
y: '0',
|
|
2469
|
-
b: '1',
|
|
2470
|
-
f: '1',
|
|
2471
|
-
p: '1',
|
|
2472
|
-
v: '1',
|
|
2473
|
-
c: '2',
|
|
2474
|
-
g: '2',
|
|
2475
|
-
j: '2',
|
|
2476
|
-
k: '2',
|
|
2477
|
-
q: '2',
|
|
2478
|
-
s: '2',
|
|
2479
|
-
x: '2',
|
|
2480
|
-
z: '2',
|
|
2481
|
-
d: '3',
|
|
2482
|
-
t: '3',
|
|
2483
|
-
l: '4',
|
|
2484
|
-
m: '5',
|
|
2485
|
-
n: '5',
|
|
2486
|
-
r: '6'
|
|
2863
|
+
getLIXScore() {
|
|
2864
|
+
const w = this.words.length || 1;
|
|
2865
|
+
const s = this.sentences.length || 1;
|
|
2866
|
+
const l = this.getLongWordRatio() * w;
|
|
2867
|
+
return w / s + (l / w) * 100;
|
|
2487
2868
|
}
|
|
2488
|
-
|
|
2489
|
-
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
2493
|
-
|
|
2494
|
-
|
|
2495
|
-
|
|
2496
|
-
|
|
2497
|
-
|
|
2498
|
-
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
b: '1',
|
|
2503
|
-
f: '1',
|
|
2504
|
-
p: '1',
|
|
2505
|
-
v: '1',
|
|
2506
|
-
w: '1',
|
|
2507
|
-
c: '2',
|
|
2508
|
-
g: '2',
|
|
2509
|
-
k: '2',
|
|
2510
|
-
q: '2',
|
|
2511
|
-
s: '2',
|
|
2512
|
-
ß: '2',
|
|
2513
|
-
x: '2',
|
|
2514
|
-
z: '2',
|
|
2515
|
-
d: '3',
|
|
2516
|
-
t: '3',
|
|
2517
|
-
l: '4',
|
|
2518
|
-
m: '5',
|
|
2519
|
-
n: '5',
|
|
2520
|
-
r: '6'
|
|
2521
|
-
},
|
|
2522
|
-
ruleset: [{ char: 'c', next: ['h'], code: '7' }]
|
|
2523
|
-
});
|
|
2869
|
+
getWSTFScore() {
|
|
2870
|
+
const w = this.words.length || 1;
|
|
2871
|
+
const h = (this.getMinSyllablesWordCount(3) / w) * 100;
|
|
2872
|
+
const s = this.getAvgSentenceLength();
|
|
2873
|
+
const l = this.getLongWordRatio() * 100;
|
|
2874
|
+
const m = (this.getMonosyllabicWordCount() / w) * 100;
|
|
2875
|
+
return [
|
|
2876
|
+
0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
|
|
2877
|
+
0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
|
|
2878
|
+
0.2963 * h + 0.1905 * s - 1.1144,
|
|
2879
|
+
0.2744 * h + 0.2656 * s - 1.693
|
|
2880
|
+
];
|
|
2881
|
+
}
|
|
2882
|
+
}
|
|
2524
2883
|
|
|
2525
2884
|
const profiler = Profiler.getInstance();
|
|
2526
2885
|
class CmpStr {
|
|
@@ -2572,31 +2931,26 @@
|
|
|
2572
2931
|
}
|
|
2573
2932
|
assert(cond, test) {
|
|
2574
2933
|
switch (cond) {
|
|
2934
|
+
default:
|
|
2935
|
+
throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
|
|
2575
2936
|
case 'metric':
|
|
2576
|
-
|
|
2577
|
-
throw new CmpStrNotFoundError(
|
|
2578
|
-
`CmpStr <metric> must be set, call .setMetric(), ` +
|
|
2579
|
-
`use CmpStr.metric.list() for available metrics`,
|
|
2580
|
-
{ metric: test }
|
|
2581
|
-
);
|
|
2937
|
+
OptionsValidator.validateMetricName(test);
|
|
2582
2938
|
break;
|
|
2583
2939
|
case 'phonetic':
|
|
2584
|
-
|
|
2585
|
-
throw new CmpStrNotFoundError(
|
|
2586
|
-
`CmpStr <phonetic> must be set, call .setPhonetic(), ` +
|
|
2587
|
-
`use CmpStr.phonetic.list() for available phonetic algorithms`,
|
|
2588
|
-
{ phonetic: test }
|
|
2589
|
-
);
|
|
2940
|
+
OptionsValidator.validatePhoneticName(test);
|
|
2590
2941
|
break;
|
|
2591
|
-
default:
|
|
2592
|
-
throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
|
|
2593
2942
|
}
|
|
2594
2943
|
}
|
|
2595
2944
|
assertMany(...cond) {
|
|
2596
2945
|
for (const [c, test] of cond) this.assert(c, test);
|
|
2597
2946
|
}
|
|
2598
2947
|
resolveOptions(opt) {
|
|
2599
|
-
|
|
2948
|
+
const merged = DeepMerge.merge(
|
|
2949
|
+
{ ...(this.options ?? Object.create(null)) },
|
|
2950
|
+
opt
|
|
2951
|
+
);
|
|
2952
|
+
OptionsValidator.validateOptions(merged);
|
|
2953
|
+
return merged;
|
|
2600
2954
|
}
|
|
2601
2955
|
normalize(input, flags) {
|
|
2602
2956
|
return Normalizer.normalize(input, flags ?? this.options.flags ?? '');
|
|
@@ -2612,7 +2966,7 @@
|
|
|
2612
2966
|
return input;
|
|
2613
2967
|
}
|
|
2614
2968
|
postProcess(result, opt) {
|
|
2615
|
-
if (
|
|
2969
|
+
if (Array.isArray(result) && opt?.removeZero)
|
|
2616
2970
|
result = result.filter((r) => r.res > 0);
|
|
2617
2971
|
return result;
|
|
2618
2972
|
}
|
|
@@ -2628,10 +2982,10 @@
|
|
|
2628
2982
|
return StructuredData.create(data, key);
|
|
2629
2983
|
}
|
|
2630
2984
|
compute(a, b, opt, mode, raw, skip) {
|
|
2985
|
+
const resolved = this.resolveOptions(opt);
|
|
2986
|
+
this.assert('metric', resolved.metric);
|
|
2631
2987
|
return ErrorUtil.wrap(
|
|
2632
2988
|
() => {
|
|
2633
|
-
const resolved = this.resolveOptions(opt);
|
|
2634
|
-
this.assert('metric', resolved.metric);
|
|
2635
2989
|
const A = skip ? a : this.prepare(a, resolved);
|
|
2636
2990
|
const B = skip ? b : this.prepare(b, resolved);
|
|
2637
2991
|
if (
|
|
@@ -2649,7 +3003,7 @@
|
|
|
2649
3003
|
const result = this.postProcess(metric.getResults(), resolved);
|
|
2650
3004
|
return this.output(result, raw ?? resolved.raw);
|
|
2651
3005
|
},
|
|
2652
|
-
`Failed to compute metric <${
|
|
3006
|
+
`Failed to compute metric <${resolved.metric}> for the given inputs`,
|
|
2653
3007
|
{ a, b, options: opt }
|
|
2654
3008
|
);
|
|
2655
3009
|
}
|
|
@@ -2665,47 +3019,79 @@
|
|
|
2665
3019
|
{ result, raw }
|
|
2666
3020
|
);
|
|
2667
3021
|
}
|
|
2668
|
-
clone
|
|
2669
|
-
Object.assign(
|
|
3022
|
+
clone() {
|
|
3023
|
+
const inst = Object.assign(
|
|
3024
|
+
Object.create(Object.getPrototypeOf(this)),
|
|
3025
|
+
this
|
|
3026
|
+
);
|
|
3027
|
+
inst.options = DeepMerge.merge(Object.create(null), this.options);
|
|
3028
|
+
return inst;
|
|
3029
|
+
}
|
|
2670
3030
|
reset() {
|
|
2671
|
-
|
|
3031
|
+
this.options = Object.create(null);
|
|
2672
3032
|
return this;
|
|
2673
3033
|
}
|
|
2674
3034
|
setOptions(opt) {
|
|
3035
|
+
OptionsValidator.validateOptions(opt);
|
|
2675
3036
|
this.options = opt;
|
|
2676
3037
|
return this;
|
|
2677
3038
|
}
|
|
2678
3039
|
mergeOptions(opt) {
|
|
2679
|
-
merge(this.options, opt);
|
|
3040
|
+
DeepMerge.merge(this.options, opt);
|
|
3041
|
+
OptionsValidator.validateOptions(this.options);
|
|
2680
3042
|
return this;
|
|
2681
3043
|
}
|
|
2682
3044
|
setSerializedOptions(opt) {
|
|
2683
|
-
|
|
2684
|
-
()
|
|
2685
|
-
|
|
2686
|
-
|
|
2687
|
-
|
|
2688
|
-
|
|
2689
|
-
|
|
2690
|
-
|
|
3045
|
+
try {
|
|
3046
|
+
const parsed = JSON.parse(opt);
|
|
3047
|
+
OptionsValidator.validateOptions(parsed);
|
|
3048
|
+
this.options = parsed;
|
|
3049
|
+
return this;
|
|
3050
|
+
} catch (err) {
|
|
3051
|
+
if (err instanceof SyntaxError)
|
|
3052
|
+
throw new CmpStrValidationError(
|
|
3053
|
+
`Failed to parse serialized options, invalid JSON string`,
|
|
3054
|
+
{ opt, error: err instanceof Error ? err.message : String(err) }
|
|
3055
|
+
);
|
|
3056
|
+
throw err;
|
|
3057
|
+
}
|
|
2691
3058
|
}
|
|
2692
3059
|
setOption(path, value) {
|
|
2693
|
-
set(this.options, path, value);
|
|
3060
|
+
DeepMerge.set(this.options, path, value);
|
|
3061
|
+
OptionsValidator.validateOptions(this.options);
|
|
2694
3062
|
return this;
|
|
2695
3063
|
}
|
|
2696
3064
|
rmvOption(path) {
|
|
2697
|
-
rmv(this.options, path);
|
|
3065
|
+
DeepMerge.rmv(this.options, path);
|
|
2698
3066
|
return this;
|
|
2699
3067
|
}
|
|
2700
|
-
setRaw
|
|
2701
|
-
|
|
2702
|
-
|
|
2703
|
-
|
|
2704
|
-
|
|
2705
|
-
|
|
2706
|
-
|
|
2707
|
-
|
|
2708
|
-
|
|
3068
|
+
setRaw(enable) {
|
|
3069
|
+
return this.setOption('raw', enable);
|
|
3070
|
+
}
|
|
3071
|
+
setMetric(name) {
|
|
3072
|
+
return this.setOption('metric', name);
|
|
3073
|
+
}
|
|
3074
|
+
setFlags(flags) {
|
|
3075
|
+
return this.setOption('flags', flags);
|
|
3076
|
+
}
|
|
3077
|
+
rmvFlags() {
|
|
3078
|
+
return this.rmvOption('flags');
|
|
3079
|
+
}
|
|
3080
|
+
setProcessors(opt) {
|
|
3081
|
+
return this.setOption('processors', opt);
|
|
3082
|
+
}
|
|
3083
|
+
rmvProcessors() {
|
|
3084
|
+
return this.rmvOption('processors');
|
|
3085
|
+
}
|
|
3086
|
+
getOptions() {
|
|
3087
|
+
return this.options;
|
|
3088
|
+
}
|
|
3089
|
+
getSerializedOptions() {
|
|
3090
|
+
return JSON.stringify(this.options);
|
|
3091
|
+
}
|
|
3092
|
+
getOption(path) {
|
|
3093
|
+
return DeepMerge.get(this.options, path);
|
|
3094
|
+
}
|
|
2709
3095
|
test(a, b, opt) {
|
|
2710
3096
|
return this.compute(a, b, opt, 'single');
|
|
2711
3097
|
}
|
|
@@ -2744,15 +3130,35 @@
|
|
|
2744
3130
|
const resolved = this.resolveOptions({ flags, processors });
|
|
2745
3131
|
const test = this.prepare(needle, resolved);
|
|
2746
3132
|
const hstk = this.prepare(haystack, resolved);
|
|
2747
|
-
|
|
3133
|
+
const out = [];
|
|
3134
|
+
for (let i = 0, len = hstk.length; i < len; i++) {
|
|
3135
|
+
if (hstk[i].includes(test)) out.push(haystack[i]);
|
|
3136
|
+
}
|
|
3137
|
+
return out;
|
|
2748
3138
|
}
|
|
2749
3139
|
matrix(input, opt) {
|
|
2750
|
-
|
|
2751
|
-
|
|
2752
|
-
|
|
2753
|
-
|
|
2754
|
-
|
|
2755
|
-
|
|
3140
|
+
const resolved = this.resolveOptions(opt);
|
|
3141
|
+
const arr = this.prepare(input, resolved);
|
|
3142
|
+
const n = arr.length;
|
|
3143
|
+
const out = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
3144
|
+
for (let i = 0; i < n; i++)
|
|
3145
|
+
for (let j = i; j < n; j++) {
|
|
3146
|
+
if (i === j) {
|
|
3147
|
+
out[i][j] = 1;
|
|
3148
|
+
} else {
|
|
3149
|
+
const score = this.compute(
|
|
3150
|
+
arr[i],
|
|
3151
|
+
arr[j],
|
|
3152
|
+
resolved,
|
|
3153
|
+
'single',
|
|
3154
|
+
true,
|
|
3155
|
+
true
|
|
3156
|
+
).res;
|
|
3157
|
+
out[i][j] = score;
|
|
3158
|
+
out[j][i] = score;
|
|
3159
|
+
}
|
|
3160
|
+
}
|
|
3161
|
+
return out;
|
|
2756
3162
|
}
|
|
2757
3163
|
phoneticIndex(input, algo, opt) {
|
|
2758
3164
|
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
@@ -2833,10 +3239,10 @@
|
|
|
2833
3239
|
: phonetic.getIndexAsync(input).then((r) => r.join(delimiter));
|
|
2834
3240
|
}
|
|
2835
3241
|
async computeAsync(a, b, opt, mode, raw, skip) {
|
|
3242
|
+
const resolved = this.resolveOptions(opt);
|
|
3243
|
+
this.assert('metric', resolved.metric);
|
|
2836
3244
|
return ErrorUtil.wrapAsync(
|
|
2837
3245
|
async () => {
|
|
2838
|
-
const resolved = this.resolveOptions(opt);
|
|
2839
|
-
this.assert('metric', resolved.metric);
|
|
2840
3246
|
const A = skip ? a : await this.prepareAsync(a, resolved);
|
|
2841
3247
|
const B = skip ? b : await this.prepareAsync(b, resolved);
|
|
2842
3248
|
if (
|
|
@@ -2894,23 +3300,40 @@
|
|
|
2894
3300
|
const resolved = this.resolveOptions({ flags, processors });
|
|
2895
3301
|
const test = await this.prepareAsync(needle, resolved);
|
|
2896
3302
|
const hstk = await this.prepareAsync(haystack, resolved);
|
|
2897
|
-
|
|
3303
|
+
const out = [];
|
|
3304
|
+
for (let i = 0; i < hstk.length; i++) {
|
|
3305
|
+
if (hstk[i].includes(test)) out.push(haystack[i]);
|
|
3306
|
+
}
|
|
3307
|
+
return out;
|
|
2898
3308
|
}
|
|
2899
3309
|
async matrixAsync(input, opt) {
|
|
2900
|
-
|
|
2901
|
-
|
|
2902
|
-
|
|
2903
|
-
|
|
2904
|
-
|
|
2905
|
-
|
|
2906
|
-
|
|
2907
|
-
|
|
2908
|
-
|
|
2909
|
-
|
|
2910
|
-
|
|
2911
|
-
|
|
2912
|
-
|
|
2913
|
-
|
|
3310
|
+
const resolved = this.resolveOptions(opt);
|
|
3311
|
+
const arr = await this.prepareAsync(input, resolved);
|
|
3312
|
+
const n = arr.length;
|
|
3313
|
+
const out = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
3314
|
+
for (let i = 0; i < n; i++) {
|
|
3315
|
+
await Promise.all(
|
|
3316
|
+
Array.from({ length: n - i }, (_, k) => i + k).map(async (j) => {
|
|
3317
|
+
if (i === j) {
|
|
3318
|
+
out[i][j] = 1;
|
|
3319
|
+
} else {
|
|
3320
|
+
const score = (
|
|
3321
|
+
await this.computeAsync(
|
|
3322
|
+
arr[i],
|
|
3323
|
+
arr[j],
|
|
3324
|
+
resolved,
|
|
3325
|
+
'single',
|
|
3326
|
+
true,
|
|
3327
|
+
true
|
|
3328
|
+
)
|
|
3329
|
+
).res;
|
|
3330
|
+
out[i][j] = score;
|
|
3331
|
+
out[j][i] = score;
|
|
3332
|
+
}
|
|
3333
|
+
})
|
|
3334
|
+
);
|
|
3335
|
+
}
|
|
3336
|
+
return out;
|
|
2914
3337
|
}
|
|
2915
3338
|
async phoneticIndexAsync(input, algo, opt) {
|
|
2916
3339
|
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
@@ -2966,6 +3389,7 @@
|
|
|
2966
3389
|
exports.Metric = Metric;
|
|
2967
3390
|
exports.MetricRegistry = MetricRegistry;
|
|
2968
3391
|
exports.Normalizer = Normalizer;
|
|
3392
|
+
exports.OptionsValidator = OptionsValidator;
|
|
2969
3393
|
exports.Phonetic = Phonetic;
|
|
2970
3394
|
exports.PhoneticMappingRegistry = PhoneticMappingRegistry;
|
|
2971
3395
|
exports.PhoneticRegistry = PhoneticRegistry;
|