cmpstr 3.2.2 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/CmpStr.esm.js +2149 -1721
- package/dist/CmpStr.esm.min.js +2 -2
- package/dist/CmpStr.umd.js +2028 -1604
- package/dist/CmpStr.umd.min.js +2 -2
- package/dist/cjs/CmpStr.cjs +100 -51
- package/dist/cjs/CmpStrAsync.cjs +35 -18
- package/dist/cjs/index.cjs +1 -1
- package/dist/cjs/metric/Cosine.cjs +1 -1
- package/dist/cjs/metric/DamerauLevenshtein.cjs +1 -1
- package/dist/cjs/metric/DiceSorensen.cjs +1 -1
- package/dist/cjs/metric/Hamming.cjs +1 -1
- package/dist/cjs/metric/Jaccard.cjs +1 -1
- package/dist/cjs/metric/JaroWinkler.cjs +1 -1
- package/dist/cjs/metric/LCS.cjs +1 -1
- package/dist/cjs/metric/Levenshtein.cjs +1 -1
- package/dist/cjs/metric/Metric.cjs +40 -22
- package/dist/cjs/metric/NeedlemanWunsch.cjs +1 -1
- package/dist/cjs/metric/QGram.cjs +1 -1
- package/dist/cjs/metric/SmithWaterman.cjs +1 -1
- package/dist/cjs/phonetic/Caverphone.cjs +1 -1
- package/dist/cjs/phonetic/Cologne.cjs +1 -1
- package/dist/cjs/phonetic/Metaphone.cjs +1 -1
- package/dist/cjs/phonetic/Phonetic.cjs +27 -15
- package/dist/cjs/phonetic/Soundex.cjs +1 -1
- package/dist/cjs/root.cjs +4 -2
- package/dist/cjs/utils/DeepMerge.cjs +102 -97
- package/dist/cjs/utils/DiffChecker.cjs +1 -1
- package/dist/cjs/utils/Errors.cjs +22 -19
- package/dist/cjs/utils/Filter.cjs +59 -24
- package/dist/cjs/utils/HashTable.cjs +44 -29
- package/dist/cjs/utils/Normalizer.cjs +57 -28
- package/dist/cjs/utils/OptionsValidator.cjs +211 -0
- package/dist/cjs/utils/Pool.cjs +27 -13
- package/dist/cjs/utils/Profiler.cjs +41 -27
- package/dist/cjs/utils/Registry.cjs +5 -5
- package/dist/cjs/utils/StructuredData.cjs +83 -53
- package/dist/cjs/utils/TextAnalyzer.cjs +1 -1
- package/dist/esm/CmpStr.mjs +101 -52
- package/dist/esm/CmpStrAsync.mjs +35 -18
- package/dist/esm/index.mjs +1 -1
- package/dist/esm/metric/Cosine.mjs +1 -1
- package/dist/esm/metric/DamerauLevenshtein.mjs +1 -1
- package/dist/esm/metric/DiceSorensen.mjs +1 -1
- package/dist/esm/metric/Hamming.mjs +1 -1
- package/dist/esm/metric/Jaccard.mjs +1 -1
- package/dist/esm/metric/JaroWinkler.mjs +1 -1
- package/dist/esm/metric/LCS.mjs +1 -1
- package/dist/esm/metric/Levenshtein.mjs +1 -1
- package/dist/esm/metric/Metric.mjs +40 -22
- package/dist/esm/metric/NeedlemanWunsch.mjs +1 -1
- package/dist/esm/metric/QGram.mjs +1 -1
- package/dist/esm/metric/SmithWaterman.mjs +1 -1
- package/dist/esm/phonetic/Caverphone.mjs +1 -1
- package/dist/esm/phonetic/Cologne.mjs +1 -1
- package/dist/esm/phonetic/Metaphone.mjs +1 -1
- package/dist/esm/phonetic/Phonetic.mjs +30 -15
- package/dist/esm/phonetic/Soundex.mjs +1 -1
- package/dist/esm/root.mjs +3 -3
- package/dist/esm/utils/DeepMerge.mjs +103 -94
- package/dist/esm/utils/DiffChecker.mjs +1 -1
- package/dist/esm/utils/Errors.mjs +22 -19
- package/dist/esm/utils/Filter.mjs +59 -24
- package/dist/esm/utils/HashTable.mjs +44 -29
- package/dist/esm/utils/Normalizer.mjs +57 -28
- package/dist/esm/utils/OptionsValidator.mjs +210 -0
- package/dist/esm/utils/Pool.mjs +27 -13
- package/dist/esm/utils/Profiler.mjs +41 -27
- package/dist/esm/utils/Registry.mjs +5 -5
- package/dist/esm/utils/StructuredData.mjs +83 -53
- package/dist/esm/utils/TextAnalyzer.mjs +1 -1
- package/dist/types/CmpStr.d.ts +22 -15
- package/dist/types/CmpStrAsync.d.ts +3 -0
- package/dist/types/index.d.ts +3 -3
- package/dist/types/metric/Metric.d.ts +9 -9
- package/dist/types/phonetic/Phonetic.d.ts +4 -3
- package/dist/types/root.d.ts +3 -2
- package/dist/types/utils/DeepMerge.d.ts +80 -58
- package/dist/types/utils/Errors.d.ts +25 -8
- package/dist/types/utils/Filter.d.ts +4 -1
- package/dist/types/utils/HashTable.d.ts +12 -11
- package/dist/types/utils/Normalizer.d.ts +2 -1
- package/dist/types/utils/OptionsValidator.d.ts +193 -0
- package/dist/types/utils/Profiler.d.ts +9 -28
- package/dist/types/utils/StructuredData.d.ts +3 -0
- package/dist/types/utils/Types.d.ts +13 -1
- package/package.json +14 -5
package/dist/CmpStr.esm.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* CmpStr v3.
|
|
2
|
+
* CmpStr v3.3.0 build-3699f85-260318
|
|
3
3
|
* This is a lightweight, fast and well performing library for calculating string similarity.
|
|
4
4
|
* (c) 2023-2026 Paul Köhler @komed3 / MIT License
|
|
5
5
|
* Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
|
|
@@ -7,19 +7,32 @@
|
|
|
7
7
|
class CmpStrError extends Error {
|
|
8
8
|
code;
|
|
9
9
|
meta;
|
|
10
|
-
cause;
|
|
11
10
|
when = new Date().toISOString();
|
|
12
11
|
constructor(code, message, meta, cause) {
|
|
13
|
-
super(message);
|
|
12
|
+
super(message, cause !== undefined ? { cause } : undefined);
|
|
14
13
|
this.name = this.constructor.name;
|
|
15
14
|
this.code = code;
|
|
16
15
|
this.meta = meta;
|
|
17
|
-
this.cause = cause;
|
|
18
16
|
if (typeof Error.captureStackTrace === 'function') {
|
|
19
17
|
Error.captureStackTrace(this, this.constructor);
|
|
20
18
|
}
|
|
21
19
|
}
|
|
22
|
-
|
|
20
|
+
format(stack = false) {
|
|
21
|
+
const parts = [`${this.name} [${this.code}]`, this.message];
|
|
22
|
+
if (this.meta)
|
|
23
|
+
for (const _ in this.meta) {
|
|
24
|
+
parts.push(JSON.stringify(this.meta));
|
|
25
|
+
break;
|
|
26
|
+
}
|
|
27
|
+
return (
|
|
28
|
+
parts.join(' - ') +
|
|
29
|
+
(stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
|
|
30
|
+
);
|
|
31
|
+
}
|
|
32
|
+
toString() {
|
|
33
|
+
return this.format(false);
|
|
34
|
+
}
|
|
35
|
+
toJSON(stack = false) {
|
|
23
36
|
return {
|
|
24
37
|
name: this.name,
|
|
25
38
|
code: this.code,
|
|
@@ -31,23 +44,11 @@ class CmpStrError extends Error {
|
|
|
31
44
|
? {
|
|
32
45
|
name: this.cause.name,
|
|
33
46
|
message: this.cause.message,
|
|
34
|
-
stack: this.cause.stack
|
|
47
|
+
stack: stack && this.cause.stack
|
|
35
48
|
}
|
|
36
49
|
: this.cause
|
|
37
50
|
};
|
|
38
51
|
}
|
|
39
|
-
toString(stack = false) {
|
|
40
|
-
const parts = [`${this.name} [${this.code}]`, this.message];
|
|
41
|
-
if (this.meta && Object.keys(this.meta).length) {
|
|
42
|
-
try {
|
|
43
|
-
parts.push(JSON.stringify(this.meta));
|
|
44
|
-
} catch {}
|
|
45
|
-
}
|
|
46
|
-
return (
|
|
47
|
-
parts.join(' - ') +
|
|
48
|
-
(stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
|
|
49
|
-
);
|
|
50
|
-
}
|
|
51
52
|
}
|
|
52
53
|
class CmpStrValidationError extends CmpStrError {
|
|
53
54
|
constructor(message, meta, cause) {
|
|
@@ -73,7 +74,7 @@ class ErrorUtil {
|
|
|
73
74
|
static assert(condition, message, meta) {
|
|
74
75
|
if (!condition) throw new CmpStrUsageError(message, meta);
|
|
75
76
|
}
|
|
76
|
-
static
|
|
77
|
+
static rethrow(err, message, meta) {
|
|
77
78
|
if (err instanceof CmpStrError) throw err;
|
|
78
79
|
throw new CmpStrInternalError(message, meta, err);
|
|
79
80
|
}
|
|
@@ -86,6 +87,7 @@ class ErrorUtil {
|
|
|
86
87
|
try {
|
|
87
88
|
return fn();
|
|
88
89
|
} catch (err) {
|
|
90
|
+
if (err instanceof CmpStrError) throw err;
|
|
89
91
|
throw new CmpStrInternalError(message, meta, err);
|
|
90
92
|
}
|
|
91
93
|
}
|
|
@@ -93,6 +95,7 @@ class ErrorUtil {
|
|
|
93
95
|
try {
|
|
94
96
|
return await fn();
|
|
95
97
|
} catch (err) {
|
|
98
|
+
if (err instanceof CmpStrError) throw err;
|
|
96
99
|
throw new CmpStrInternalError(message, meta, err);
|
|
97
100
|
}
|
|
98
101
|
}
|
|
@@ -108,118 +111,118 @@ var Errors = /*#__PURE__*/ Object.freeze({
|
|
|
108
111
|
ErrorUtil: ErrorUtil
|
|
109
112
|
});
|
|
110
113
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
114
|
+
class DeepMerge {
|
|
115
|
+
static BRACKET_PATTERN = /\[(\d+)]/g;
|
|
116
|
+
static PATH_CACHE = new Map();
|
|
117
|
+
static walk(obj, keys) {
|
|
118
|
+
let o = obj;
|
|
119
|
+
for (let i = 0; i < keys.length; i++) {
|
|
120
|
+
const k = keys[i];
|
|
121
|
+
if (o == null || !(k in o)) return { exists: false };
|
|
122
|
+
o = o[k];
|
|
123
|
+
}
|
|
124
|
+
return { exists: true, value: o };
|
|
125
|
+
}
|
|
126
|
+
static parse(p) {
|
|
127
|
+
const cached = DeepMerge.PATH_CACHE.get(p);
|
|
128
|
+
if (cached) return cached;
|
|
129
|
+
const parsed = p
|
|
130
|
+
.replace(DeepMerge.BRACKET_PATTERN, '.$1')
|
|
131
|
+
.split('.')
|
|
132
|
+
.map((s) => {
|
|
133
|
+
const n = Number(s);
|
|
134
|
+
return Number.isInteger(n) && String(n) === s ? n : s;
|
|
135
|
+
});
|
|
136
|
+
if (DeepMerge.PATH_CACHE.size > 2000) DeepMerge.PATH_CACHE.clear();
|
|
137
|
+
DeepMerge.PATH_CACHE.set(p, parsed);
|
|
138
|
+
return parsed;
|
|
131
139
|
}
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
function has(t, path) {
|
|
135
|
-
let o = t;
|
|
136
|
-
for (const k of parse(path)) {
|
|
137
|
-
if (o == null || !(k in o)) return false;
|
|
138
|
-
o = o[k];
|
|
140
|
+
static has(t, path) {
|
|
141
|
+
return DeepMerge.walk(t, DeepMerge.parse(path)).exists;
|
|
139
142
|
}
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
143
|
+
static get(t, path, fb) {
|
|
144
|
+
const r = DeepMerge.walk(t, DeepMerge.parse(path));
|
|
145
|
+
return r.exists ? r.value : fb;
|
|
146
|
+
}
|
|
147
|
+
static set(t, path, value) {
|
|
148
|
+
if (path === '') return value;
|
|
149
|
+
const keys = DeepMerge.parse(path);
|
|
150
|
+
ErrorUtil.assert(
|
|
151
|
+
t === undefined || (typeof t === 'object' && t !== null),
|
|
147
152
|
`Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`,
|
|
148
153
|
{ path: keys[0], target: t }
|
|
149
154
|
);
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
155
|
+
const root = t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
|
|
156
|
+
let cur = root;
|
|
157
|
+
for (let i = 0; i < keys.length - 1; i++) {
|
|
158
|
+
const k = keys[i];
|
|
159
|
+
let n = cur[k];
|
|
160
|
+
ErrorUtil.assert(
|
|
161
|
+
n == null || typeof n === 'object',
|
|
157
162
|
`Cannot set property <${keys[i + 1]}> of <${JSON.stringify(n)}>`,
|
|
158
163
|
{ path: keys.slice(0, i + 2), value: n }
|
|
159
164
|
);
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
}
|
|
164
|
-
cur[keys[keys.length - 1]] = value;
|
|
165
|
-
return root;
|
|
166
|
-
}
|
|
167
|
-
function merge(
|
|
168
|
-
t = Object.create(null),
|
|
169
|
-
o = Object.create(null),
|
|
170
|
-
mergeUndefined = false
|
|
171
|
-
) {
|
|
172
|
-
const target = t ?? Object.create(null);
|
|
173
|
-
Object.keys(o).forEach((k) => {
|
|
174
|
-
const val = o[k];
|
|
175
|
-
if (!mergeUndefined && val === undefined) return;
|
|
176
|
-
if (k === '__proto__' || k === 'constructor') return;
|
|
177
|
-
if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
|
|
178
|
-
const existing = target[k];
|
|
179
|
-
target[k] = merge(
|
|
180
|
-
existing !== null &&
|
|
181
|
-
typeof existing === 'object' &&
|
|
182
|
-
!Array.isArray(existing)
|
|
183
|
-
? existing
|
|
184
|
-
: Object.create(null),
|
|
185
|
-
val,
|
|
186
|
-
mergeUndefined
|
|
187
|
-
);
|
|
188
|
-
} else target[k] = val;
|
|
189
|
-
});
|
|
190
|
-
return target;
|
|
191
|
-
}
|
|
192
|
-
function rmv(t, path, preserveEmpty = false) {
|
|
193
|
-
const keys = parse(path);
|
|
194
|
-
const remove = (obj, i = 0) => {
|
|
195
|
-
const key = keys[i];
|
|
196
|
-
if (!obj || typeof obj !== 'object') return false;
|
|
197
|
-
if (i === keys.length - 1) return delete obj[key];
|
|
198
|
-
if (!remove(obj[key], i + 1)) return false;
|
|
199
|
-
if (!preserveEmpty) {
|
|
200
|
-
const val = obj[key];
|
|
201
|
-
if (
|
|
202
|
-
typeof val === 'object' &&
|
|
203
|
-
((Array.isArray(val) && val.every((v) => v == null)) ||
|
|
204
|
-
(!Array.isArray(val) && Object.keys(val).length === 0))
|
|
205
|
-
)
|
|
206
|
-
delete obj[key];
|
|
165
|
+
if (n == null)
|
|
166
|
+
n = cur[k] = typeof keys[i + 1] === 'number' ? [] : Object.create(null);
|
|
167
|
+
cur = n;
|
|
207
168
|
}
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
169
|
+
cur[keys[keys.length - 1]] = value;
|
|
170
|
+
return root;
|
|
171
|
+
}
|
|
172
|
+
static rmv(t, path, preserveEmpty = false) {
|
|
173
|
+
const keys = DeepMerge.parse(path);
|
|
174
|
+
const remove = (obj, i = 0) => {
|
|
175
|
+
const key = keys[i];
|
|
176
|
+
if (!obj || typeof obj !== 'object') return false;
|
|
177
|
+
if (i === keys.length - 1) return delete obj[key];
|
|
178
|
+
if (!remove(obj[key], i + 1)) return false;
|
|
179
|
+
if (!preserveEmpty) {
|
|
180
|
+
const val = obj[key];
|
|
181
|
+
let empty = true;
|
|
182
|
+
if (typeof val === 'object') {
|
|
183
|
+
if (Array.isArray(val))
|
|
184
|
+
for (let i = 0; i < val.length; i++) {
|
|
185
|
+
if (val[i] != null) {
|
|
186
|
+
empty = false;
|
|
187
|
+
break;
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
else empty = false;
|
|
191
|
+
}
|
|
192
|
+
if (empty) delete obj[key];
|
|
193
|
+
}
|
|
194
|
+
return true;
|
|
195
|
+
};
|
|
196
|
+
remove(t);
|
|
197
|
+
return t;
|
|
198
|
+
}
|
|
199
|
+
static merge(
|
|
200
|
+
t = Object.create(null),
|
|
201
|
+
o = Object.create(null),
|
|
202
|
+
mergeUndefined = false
|
|
203
|
+
) {
|
|
204
|
+
const target = t ?? Object.create(null);
|
|
205
|
+
for (const k in o) {
|
|
206
|
+
const val = o[k];
|
|
207
|
+
if (!mergeUndefined && val === undefined) continue;
|
|
208
|
+
if (k === '__proto__' || k === 'constructor') continue;
|
|
209
|
+
if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
|
|
210
|
+
const existing = target[k];
|
|
211
|
+
target[k] = DeepMerge.merge(
|
|
212
|
+
existing !== null &&
|
|
213
|
+
typeof existing === 'object' &&
|
|
214
|
+
!Array.isArray(existing)
|
|
215
|
+
? existing
|
|
216
|
+
: Object.create(null),
|
|
217
|
+
val,
|
|
218
|
+
mergeUndefined
|
|
219
|
+
);
|
|
220
|
+
} else target[k] = val;
|
|
221
|
+
}
|
|
222
|
+
return target;
|
|
223
|
+
}
|
|
212
224
|
}
|
|
213
225
|
|
|
214
|
-
var DeepMerge = /*#__PURE__*/ Object.freeze({
|
|
215
|
-
__proto__: null,
|
|
216
|
-
get: get,
|
|
217
|
-
has: has,
|
|
218
|
-
merge: merge,
|
|
219
|
-
rmv: rmv,
|
|
220
|
-
set: set
|
|
221
|
-
});
|
|
222
|
-
|
|
223
226
|
class DiffChecker {
|
|
224
227
|
a;
|
|
225
228
|
b;
|
|
@@ -516,20 +519,32 @@ class DiffChecker {
|
|
|
516
519
|
}
|
|
517
520
|
|
|
518
521
|
class Filter {
|
|
522
|
+
static IDENTITY = (s) => s;
|
|
519
523
|
static filters = new Map();
|
|
520
524
|
static pipeline = new Map();
|
|
521
|
-
static getPipeline(hook) {
|
|
525
|
+
static getPipeline(hook, force = false) {
|
|
522
526
|
return ErrorUtil.wrap(
|
|
523
527
|
() => {
|
|
524
|
-
|
|
525
|
-
|
|
528
|
+
if (!force) {
|
|
529
|
+
const cached = Filter.pipeline.get(hook);
|
|
530
|
+
if (cached) return cached;
|
|
531
|
+
}
|
|
526
532
|
const filter = Filter.filters.get(hook);
|
|
527
|
-
if (!filter)
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
const
|
|
533
|
+
if (!filter) {
|
|
534
|
+
Filter.pipeline.set(hook, Filter.IDENTITY);
|
|
535
|
+
return Filter.IDENTITY;
|
|
536
|
+
}
|
|
537
|
+
const pipeline = [];
|
|
538
|
+
for (const f of filter.values()) if (f.active) pipeline.push(f);
|
|
539
|
+
pipeline.sort((a, b) => a.priority - b.priority);
|
|
540
|
+
const fn =
|
|
541
|
+
pipeline.length === 0
|
|
542
|
+
? Filter.IDENTITY
|
|
543
|
+
: (input) => {
|
|
544
|
+
let v = input;
|
|
545
|
+
for (let i = 0; i < pipeline.length; i++) v = pipeline[i].fn(v);
|
|
546
|
+
return v;
|
|
547
|
+
};
|
|
533
548
|
Filter.pipeline.set(hook, fn);
|
|
534
549
|
return fn;
|
|
535
550
|
},
|
|
@@ -547,9 +562,16 @@ class Filter {
|
|
|
547
562
|
const filter = Filter.filters.get(hook) ?? new Map();
|
|
548
563
|
const index = filter.get(id);
|
|
549
564
|
if (index && !index.overrideable) return false;
|
|
565
|
+
if (
|
|
566
|
+
index &&
|
|
567
|
+
index.fn === fn &&
|
|
568
|
+
index.priority === priority &&
|
|
569
|
+
index.active === active
|
|
570
|
+
)
|
|
571
|
+
return true;
|
|
550
572
|
filter.set(id, { id, fn, priority, active, overrideable });
|
|
551
573
|
Filter.filters.set(hook, filter);
|
|
552
|
-
Filter.
|
|
574
|
+
Filter.getPipeline(hook, true);
|
|
553
575
|
return true;
|
|
554
576
|
},
|
|
555
577
|
`Error adding filter <${id}> to hook <${hook}>`,
|
|
@@ -557,19 +579,28 @@ class Filter {
|
|
|
557
579
|
);
|
|
558
580
|
}
|
|
559
581
|
static remove(hook, id) {
|
|
560
|
-
Filter.pipeline.delete(hook);
|
|
561
582
|
const filter = Filter.filters.get(hook);
|
|
562
|
-
|
|
583
|
+
if (!filter || !filter.delete(id)) return false;
|
|
584
|
+
Filter.getPipeline(hook, true);
|
|
585
|
+
return true;
|
|
563
586
|
}
|
|
564
587
|
static pause(hook, id) {
|
|
565
|
-
Filter.
|
|
566
|
-
|
|
567
|
-
|
|
588
|
+
const filter = Filter.filters.get(hook);
|
|
589
|
+
if (!filter) return false;
|
|
590
|
+
const f = filter.get(id);
|
|
591
|
+
if (!f || !f.active) return false;
|
|
592
|
+
f.active = false;
|
|
593
|
+
Filter.getPipeline(hook, true);
|
|
594
|
+
return true;
|
|
568
595
|
}
|
|
569
596
|
static resume(hook, id) {
|
|
570
|
-
Filter.
|
|
571
|
-
|
|
572
|
-
|
|
597
|
+
const filter = Filter.filters.get(hook);
|
|
598
|
+
if (!filter) return false;
|
|
599
|
+
const f = filter.get(id);
|
|
600
|
+
if (!f || f.active) return false;
|
|
601
|
+
f.active = true;
|
|
602
|
+
Filter.getPipeline(hook, true);
|
|
603
|
+
return true;
|
|
573
604
|
}
|
|
574
605
|
static list(hook, active = false) {
|
|
575
606
|
const filter = Filter.filters.get(hook);
|
|
@@ -582,7 +613,11 @@ class Filter {
|
|
|
582
613
|
return ErrorUtil.wrap(
|
|
583
614
|
() => {
|
|
584
615
|
const fn = Filter.getPipeline(hook);
|
|
585
|
-
|
|
616
|
+
if (typeof input === 'string') return fn(input);
|
|
617
|
+
const arr = input;
|
|
618
|
+
const out = new Array(arr.length);
|
|
619
|
+
for (let i = 0; i < arr.length; i++) out[i] = fn(arr[i]);
|
|
620
|
+
return out;
|
|
586
621
|
},
|
|
587
622
|
`Error applying filters for hook <${hook}>`,
|
|
588
623
|
{ hook, input }
|
|
@@ -592,16 +627,19 @@ class Filter {
|
|
|
592
627
|
return ErrorUtil.wrapAsync(
|
|
593
628
|
async () => {
|
|
594
629
|
const fn = Filter.getPipeline(hook);
|
|
595
|
-
return
|
|
596
|
-
|
|
597
|
-
|
|
630
|
+
if (typeof input === 'string') return Promise.resolve(fn(input));
|
|
631
|
+
const arr = input;
|
|
632
|
+
const out = new Array(arr.length);
|
|
633
|
+
for (let i = 0; i < arr.length; i++)
|
|
634
|
+
out[i] = Promise.resolve(fn(arr[i]));
|
|
635
|
+
return Promise.all(out);
|
|
598
636
|
},
|
|
599
637
|
`Error applying filters for hook <${hook}>`,
|
|
600
638
|
{ hook, input }
|
|
601
639
|
);
|
|
602
640
|
}
|
|
603
641
|
static clear(hook) {
|
|
604
|
-
Filter.
|
|
642
|
+
Filter.clearPipeline();
|
|
605
643
|
if (hook) Filter.filters.delete(hook);
|
|
606
644
|
else Filter.filters.clear();
|
|
607
645
|
}
|
|
@@ -615,25 +653,21 @@ class Hasher {
|
|
|
615
653
|
static HASH_OFFSET = 0x811c9dc5;
|
|
616
654
|
static fastFNV1a(str) {
|
|
617
655
|
const len = str.length;
|
|
656
|
+
const limit = len & -4;
|
|
618
657
|
let hash = this.HASH_OFFSET;
|
|
619
|
-
|
|
620
|
-
for (
|
|
621
|
-
const pos = i * 4;
|
|
658
|
+
let i = 0;
|
|
659
|
+
for (; i < limit; i += 4) {
|
|
622
660
|
const chunk =
|
|
623
|
-
str.charCodeAt(
|
|
624
|
-
(str.charCodeAt(
|
|
625
|
-
(str.charCodeAt(
|
|
626
|
-
(str.charCodeAt(
|
|
661
|
+
str.charCodeAt(i) |
|
|
662
|
+
(str.charCodeAt(i + 1) << 8) |
|
|
663
|
+
(str.charCodeAt(i + 2) << 16) |
|
|
664
|
+
(str.charCodeAt(i + 3) << 24);
|
|
627
665
|
hash ^= chunk;
|
|
628
666
|
hash = Math.imul(hash, this.FNV_PRIME);
|
|
629
667
|
}
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
for (let i = 0; i < remaining; i++) {
|
|
634
|
-
hash ^= str.charCodeAt(pos + i);
|
|
635
|
-
hash = Math.imul(hash, this.FNV_PRIME);
|
|
636
|
-
}
|
|
668
|
+
for (; i < len; i++) {
|
|
669
|
+
hash ^= str.charCodeAt(i);
|
|
670
|
+
hash = Math.imul(hash, this.FNV_PRIME);
|
|
637
671
|
}
|
|
638
672
|
hash ^= hash >>> 16;
|
|
639
673
|
hash *= 0x85ebca6b;
|
|
@@ -644,32 +678,51 @@ class Hasher {
|
|
|
644
678
|
}
|
|
645
679
|
}
|
|
646
680
|
class HashTable {
|
|
647
|
-
|
|
681
|
+
FIFO;
|
|
682
|
+
maxSize;
|
|
648
683
|
static MAX_LEN = 2048;
|
|
649
|
-
static TABLE_SIZE = 10_000;
|
|
650
684
|
table = new Map();
|
|
651
|
-
constructor(
|
|
652
|
-
this.
|
|
685
|
+
constructor(FIFO = true, maxSize = 10000) {
|
|
686
|
+
this.FIFO = FIFO;
|
|
687
|
+
this.maxSize = maxSize;
|
|
653
688
|
}
|
|
654
689
|
key(label, strs, sorted = false) {
|
|
655
|
-
|
|
656
|
-
const hashes =
|
|
657
|
-
|
|
690
|
+
const n = strs.length;
|
|
691
|
+
const hashes = new Array(n);
|
|
692
|
+
for (let i = 0; i < n; i++) {
|
|
693
|
+
const s = strs[i];
|
|
694
|
+
if (s.length > HashTable.MAX_LEN) return false;
|
|
695
|
+
hashes[i] = Hasher.fastFNV1a(s);
|
|
696
|
+
}
|
|
697
|
+
if (sorted) hashes.sort((a, b) => a - b);
|
|
698
|
+
let key = label;
|
|
699
|
+
for (let i = 0; i < hashes.length; i++) key += '-' + hashes[i];
|
|
700
|
+
return key;
|
|
701
|
+
}
|
|
702
|
+
has(key) {
|
|
703
|
+
return this.table.has(key);
|
|
704
|
+
}
|
|
705
|
+
get(key) {
|
|
706
|
+
return this.table.get(key);
|
|
658
707
|
}
|
|
659
|
-
has = (key) => this.table.has(key);
|
|
660
|
-
get = (key) => this.table.get(key);
|
|
661
708
|
set(key, entry, update = true) {
|
|
662
709
|
if (!update && this.table.has(key)) return false;
|
|
663
|
-
|
|
664
|
-
if (!this.
|
|
710
|
+
if (!this.table.has(key) && this.table.size >= this.maxSize) {
|
|
711
|
+
if (!this.FIFO) return false;
|
|
665
712
|
this.table.delete(this.table.keys().next().value);
|
|
666
713
|
}
|
|
667
714
|
this.table.set(key, entry);
|
|
668
715
|
return true;
|
|
669
716
|
}
|
|
670
|
-
delete
|
|
671
|
-
|
|
672
|
-
|
|
717
|
+
delete(key) {
|
|
718
|
+
return this.table.delete(key);
|
|
719
|
+
}
|
|
720
|
+
clear() {
|
|
721
|
+
this.table.clear();
|
|
722
|
+
}
|
|
723
|
+
size() {
|
|
724
|
+
return this.table.size;
|
|
725
|
+
}
|
|
673
726
|
}
|
|
674
727
|
|
|
675
728
|
class Normalizer {
|
|
@@ -688,25 +741,49 @@ class Normalizer {
|
|
|
688
741
|
static getPipeline(flags) {
|
|
689
742
|
return ErrorUtil.wrap(
|
|
690
743
|
() => {
|
|
691
|
-
|
|
692
|
-
|
|
744
|
+
const cached = Normalizer.pipeline.get(flags);
|
|
745
|
+
if (cached) return cached;
|
|
693
746
|
const { REGEX } = Normalizer;
|
|
694
|
-
const steps = [
|
|
695
|
-
|
|
696
|
-
[
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
747
|
+
const steps = [];
|
|
748
|
+
for (let i = 0; i < flags.length; i++) {
|
|
749
|
+
switch (flags[i]) {
|
|
750
|
+
case 'd':
|
|
751
|
+
steps.push((s) => s.normalize('NFD'));
|
|
752
|
+
break;
|
|
753
|
+
case 'i':
|
|
754
|
+
steps.push((s) => s.toLowerCase());
|
|
755
|
+
break;
|
|
756
|
+
case 'k':
|
|
757
|
+
steps.push((s) => s.replace(REGEX.nonLetters, ''));
|
|
758
|
+
break;
|
|
759
|
+
case 'n':
|
|
760
|
+
steps.push((s) => s.replace(REGEX.nonNumbers, ''));
|
|
761
|
+
break;
|
|
762
|
+
case 'r':
|
|
763
|
+
steps.push((s) => s.replace(REGEX.doubleChars, '$1'));
|
|
764
|
+
break;
|
|
765
|
+
case 's':
|
|
766
|
+
steps.push((s) => s.replace(REGEX.specialChars, ''));
|
|
767
|
+
break;
|
|
768
|
+
case 't':
|
|
769
|
+
steps.push((s) => s.trim());
|
|
770
|
+
break;
|
|
771
|
+
case 'u':
|
|
772
|
+
steps.push((s) => s.normalize('NFC'));
|
|
773
|
+
break;
|
|
774
|
+
case 'w':
|
|
775
|
+
steps.push((s) => s.replace(REGEX.whitespace, ' '));
|
|
776
|
+
break;
|
|
777
|
+
case 'x':
|
|
778
|
+
steps.push((s) => s.normalize('NFKC'));
|
|
779
|
+
break;
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
const fn = (input) => {
|
|
783
|
+
let v = input;
|
|
784
|
+
for (let i = 0; i < steps.length; i++) v = steps[i](v);
|
|
785
|
+
return v;
|
|
786
|
+
};
|
|
710
787
|
Normalizer.pipeline.set(flags, fn);
|
|
711
788
|
return fn;
|
|
712
789
|
},
|
|
@@ -714,18 +791,23 @@ class Normalizer {
|
|
|
714
791
|
{ flags }
|
|
715
792
|
);
|
|
716
793
|
}
|
|
717
|
-
static normalize(input, flags) {
|
|
794
|
+
static normalize(input, flags, normalizedFlags) {
|
|
718
795
|
return ErrorUtil.wrap(
|
|
719
796
|
() => {
|
|
720
797
|
if (!flags || typeof flags !== 'string' || !input) return input;
|
|
721
|
-
flags = this.canonicalFlags(flags);
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
798
|
+
flags = normalizedFlags ?? this.canonicalFlags(flags);
|
|
799
|
+
const pipeline = Normalizer.getPipeline(flags);
|
|
800
|
+
const normalizeOne = (s) => {
|
|
801
|
+
const key = Normalizer.cache.key(flags, [s]);
|
|
802
|
+
if (key && Normalizer.cache.has(key))
|
|
803
|
+
return Normalizer.cache.get(key);
|
|
804
|
+
const res = pipeline(s);
|
|
805
|
+
if (key) Normalizer.cache.set(key, res);
|
|
806
|
+
return res;
|
|
807
|
+
};
|
|
808
|
+
return Array.isArray(input)
|
|
809
|
+
? input.map(normalizeOne)
|
|
810
|
+
: normalizeOne(input);
|
|
729
811
|
},
|
|
730
812
|
`Failed to normalize input with flags: ${flags}`,
|
|
731
813
|
{ input, flags }
|
|
@@ -749,17 +831,143 @@ class Normalizer {
|
|
|
749
831
|
}
|
|
750
832
|
}
|
|
751
833
|
|
|
834
|
+
class RingPool {
|
|
835
|
+
maxSize;
|
|
836
|
+
buffers = [];
|
|
837
|
+
pointer = 0;
|
|
838
|
+
constructor(maxSize) {
|
|
839
|
+
this.maxSize = maxSize;
|
|
840
|
+
}
|
|
841
|
+
acquire(minSize, allowOversize) {
|
|
842
|
+
return ErrorUtil.wrap(
|
|
843
|
+
() => {
|
|
844
|
+
const buffers = this.buffers;
|
|
845
|
+
const len = buffers.length;
|
|
846
|
+
for (let i = 0; i < len; i++) {
|
|
847
|
+
const idx = (this.pointer + i) % len;
|
|
848
|
+
const item = buffers[idx];
|
|
849
|
+
const size = item.size;
|
|
850
|
+
if (size >= minSize && (allowOversize || size === minSize)) {
|
|
851
|
+
this.pointer = (idx + 1) % len;
|
|
852
|
+
return item;
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
return null;
|
|
856
|
+
},
|
|
857
|
+
`Failed to acquire buffer of size >= ${minSize} from pool`,
|
|
858
|
+
{ minSize, allowOversize }
|
|
859
|
+
);
|
|
860
|
+
}
|
|
861
|
+
release(item) {
|
|
862
|
+
ErrorUtil.wrap(
|
|
863
|
+
() => {
|
|
864
|
+
const buffers = this.buffers;
|
|
865
|
+
if (buffers.length < this.maxSize) {
|
|
866
|
+
buffers.push(item);
|
|
867
|
+
return;
|
|
868
|
+
}
|
|
869
|
+
buffers[this.pointer] = item;
|
|
870
|
+
this.pointer = (this.pointer + 1) % this.maxSize;
|
|
871
|
+
},
|
|
872
|
+
`Failed to release buffer back to pool`,
|
|
873
|
+
{ item }
|
|
874
|
+
);
|
|
875
|
+
}
|
|
876
|
+
clear() {
|
|
877
|
+
this.buffers = [];
|
|
878
|
+
this.pointer = 0;
|
|
879
|
+
}
|
|
880
|
+
}
|
|
881
|
+
class Pool {
|
|
882
|
+
static CONFIG = {
|
|
883
|
+
int32: {
|
|
884
|
+
type: 'int32',
|
|
885
|
+
maxSize: 64,
|
|
886
|
+
maxItemSize: 2048,
|
|
887
|
+
allowOversize: true
|
|
888
|
+
},
|
|
889
|
+
'arr[]': {
|
|
890
|
+
type: 'arr[]',
|
|
891
|
+
maxSize: 4,
|
|
892
|
+
maxItemSize: 1024,
|
|
893
|
+
allowOversize: false
|
|
894
|
+
},
|
|
895
|
+
'number[]': {
|
|
896
|
+
type: 'number[]',
|
|
897
|
+
maxSize: 16,
|
|
898
|
+
maxItemSize: 1024,
|
|
899
|
+
allowOversize: false
|
|
900
|
+
},
|
|
901
|
+
'string[]': {
|
|
902
|
+
type: 'string[]',
|
|
903
|
+
maxSize: 2,
|
|
904
|
+
maxItemSize: 1024,
|
|
905
|
+
allowOversize: false
|
|
906
|
+
},
|
|
907
|
+
set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
|
|
908
|
+
map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
|
|
909
|
+
};
|
|
910
|
+
static POOLS = {
|
|
911
|
+
int32: new RingPool(64),
|
|
912
|
+
'arr[]': new RingPool(4),
|
|
913
|
+
'number[]': new RingPool(16),
|
|
914
|
+
'string[]': new RingPool(2),
|
|
915
|
+
set: new RingPool(8),
|
|
916
|
+
map: new RingPool(8)
|
|
917
|
+
};
|
|
918
|
+
static allocate(type, size) {
|
|
919
|
+
switch (type) {
|
|
920
|
+
case 'int32':
|
|
921
|
+
return new Int32Array(size);
|
|
922
|
+
case 'arr[]':
|
|
923
|
+
return new Array(size);
|
|
924
|
+
case 'number[]':
|
|
925
|
+
return new Float64Array(size);
|
|
926
|
+
case 'string[]':
|
|
927
|
+
return new Array(size);
|
|
928
|
+
case 'set':
|
|
929
|
+
return new Set();
|
|
930
|
+
case 'map':
|
|
931
|
+
return new Map();
|
|
932
|
+
}
|
|
933
|
+
}
|
|
934
|
+
static acquire(type, size) {
|
|
935
|
+
const CONFIG = this.CONFIG[type];
|
|
936
|
+
if (!CONFIG)
|
|
937
|
+
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
938
|
+
if (size > CONFIG.maxItemSize) return this.allocate(type, size);
|
|
939
|
+
const item = this.POOLS[type].acquire(size, CONFIG.allowOversize);
|
|
940
|
+
if (item)
|
|
941
|
+
return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
|
|
942
|
+
return this.allocate(type, size);
|
|
943
|
+
}
|
|
944
|
+
static acquireMany(type, sizes) {
|
|
945
|
+
const out = new Array(sizes.length);
|
|
946
|
+
for (let i = 0; i < sizes.length; i++)
|
|
947
|
+
out[i] = this.acquire(type, sizes[i]);
|
|
948
|
+
return out;
|
|
949
|
+
}
|
|
950
|
+
static release(type, buffer, size) {
|
|
951
|
+
const CONFIG = this.CONFIG[type];
|
|
952
|
+
if (!CONFIG)
|
|
953
|
+
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
954
|
+
if (size <= CONFIG.maxItemSize) this.POOLS[type].release({ buffer, size });
|
|
955
|
+
}
|
|
956
|
+
}
|
|
957
|
+
|
|
752
958
|
class Profiler {
|
|
753
959
|
active;
|
|
754
960
|
static ENV;
|
|
755
961
|
static instance;
|
|
756
962
|
nowFn;
|
|
757
963
|
memFn;
|
|
758
|
-
store =
|
|
964
|
+
store = [];
|
|
965
|
+
last;
|
|
759
966
|
totalTime = 0;
|
|
760
967
|
totalMem = 0;
|
|
761
968
|
static detectEnv() {
|
|
762
|
-
if (typeof process !== 'undefined'
|
|
969
|
+
if (typeof process !== 'undefined' && process.versions?.node)
|
|
970
|
+
Profiler.ENV = 'nodejs';
|
|
763
971
|
else if (typeof performance !== 'undefined') Profiler.ENV = 'browser';
|
|
764
972
|
else Profiler.ENV = 'unknown';
|
|
765
973
|
}
|
|
@@ -771,7 +979,7 @@ class Profiler {
|
|
|
771
979
|
this.active = active;
|
|
772
980
|
switch (Profiler.ENV) {
|
|
773
981
|
case 'nodejs':
|
|
774
|
-
this.nowFn = () => Number(process.hrtime.bigint())
|
|
982
|
+
this.nowFn = () => Number(process.hrtime.bigint()) * 1e-6;
|
|
775
983
|
this.memFn = () => process.memoryUsage().heapUsed;
|
|
776
984
|
break;
|
|
777
985
|
case 'browser':
|
|
@@ -784,40 +992,52 @@ class Profiler {
|
|
|
784
992
|
break;
|
|
785
993
|
}
|
|
786
994
|
}
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
startMem = this.mem();
|
|
792
|
-
const res = fn();
|
|
793
|
-
const deltaTime = this.now() - startTime,
|
|
794
|
-
deltaMem = this.mem() - startMem;
|
|
795
|
-
this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
|
|
796
|
-
((this.totalTime += deltaTime), (this.totalMem += deltaMem));
|
|
797
|
-
return res;
|
|
995
|
+
storeRes(entry) {
|
|
996
|
+
this.store.push((this.last = entry));
|
|
997
|
+
this.totalTime += entry.time;
|
|
998
|
+
this.totalMem += entry.mem;
|
|
798
999
|
}
|
|
799
|
-
enable
|
|
1000
|
+
enable() {
|
|
800
1001
|
this.active = true;
|
|
801
|
-
}
|
|
802
|
-
disable
|
|
1002
|
+
}
|
|
1003
|
+
disable() {
|
|
803
1004
|
this.active = false;
|
|
804
|
-
}
|
|
1005
|
+
}
|
|
805
1006
|
clear() {
|
|
806
|
-
this.store.
|
|
1007
|
+
this.store.length = 0;
|
|
1008
|
+
this.last = undefined;
|
|
807
1009
|
this.totalTime = 0;
|
|
808
1010
|
this.totalMem = 0;
|
|
809
1011
|
}
|
|
810
1012
|
run(fn, meta = {}) {
|
|
811
|
-
|
|
1013
|
+
if (!this.active) return fn();
|
|
1014
|
+
const startTime = this.nowFn(),
|
|
1015
|
+
startMem = this.memFn();
|
|
1016
|
+
const res = fn();
|
|
1017
|
+
const deltaTime = this.nowFn() - startTime,
|
|
1018
|
+
deltaMem = this.memFn() - startMem;
|
|
1019
|
+
this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
|
|
1020
|
+
return res;
|
|
812
1021
|
}
|
|
813
1022
|
async runAsync(fn, meta = {}) {
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
1023
|
+
if (!this.active) return fn();
|
|
1024
|
+
const startTime = this.nowFn(),
|
|
1025
|
+
startMem = this.memFn();
|
|
1026
|
+
const res = await fn();
|
|
1027
|
+
const deltaTime = this.nowFn() - startTime,
|
|
1028
|
+
deltaMem = this.memFn() - startMem;
|
|
1029
|
+
this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
|
|
1030
|
+
return res;
|
|
1031
|
+
}
|
|
1032
|
+
getAll() {
|
|
1033
|
+
return [...this.store];
|
|
1034
|
+
}
|
|
1035
|
+
getLast() {
|
|
1036
|
+
return this.last;
|
|
1037
|
+
}
|
|
1038
|
+
getTotal() {
|
|
1039
|
+
return { time: this.totalTime, mem: this.totalMem };
|
|
817
1040
|
}
|
|
818
|
-
getAll = () => [...this.store];
|
|
819
|
-
getLast = () => this.getAll().pop();
|
|
820
|
-
getTotal = () => ({ time: this.totalTime, mem: this.totalMem });
|
|
821
1041
|
services = Object.freeze({
|
|
822
1042
|
enable: this.enable.bind(this),
|
|
823
1043
|
disable: this.disable.bind(this),
|
|
@@ -893,1368 +1113,1134 @@ function resolveCls(reg, cls) {
|
|
|
893
1113
|
throw new CmpStrNotFoundError(`Registry <${reg}> does not exist`, {
|
|
894
1114
|
registry: reg
|
|
895
1115
|
});
|
|
896
|
-
return typeof cls === 'string' ? registry[reg]
|
|
1116
|
+
return typeof cls === 'string' ? registry[reg].get(cls) : cls;
|
|
897
1117
|
}
|
|
898
1118
|
function createFromRegistry(reg, cls, ...args) {
|
|
899
|
-
|
|
1119
|
+
const ctor = resolveCls(reg, cls);
|
|
900
1120
|
return ErrorUtil.wrap(
|
|
901
|
-
() => new
|
|
902
|
-
`Failed to create instance of class <${
|
|
1121
|
+
() => new ctor(...args),
|
|
1122
|
+
`Failed to create instance of class <${ctor.name ?? cls}> from registry <${reg}>`,
|
|
903
1123
|
{ registry: reg, class: cls, args }
|
|
904
1124
|
);
|
|
905
1125
|
}
|
|
906
1126
|
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
1127
|
+
const profiler$2 = Profiler.getInstance();
|
|
1128
|
+
class Metric {
|
|
1129
|
+
static cache = new HashTable();
|
|
1130
|
+
metric;
|
|
1131
|
+
a;
|
|
1132
|
+
b;
|
|
1133
|
+
origA = [];
|
|
1134
|
+
origB = [];
|
|
1135
|
+
options;
|
|
1136
|
+
optKey;
|
|
1137
|
+
symmetric;
|
|
1138
|
+
results;
|
|
1139
|
+
static clear() {
|
|
1140
|
+
this.cache.clear();
|
|
913
1141
|
}
|
|
914
|
-
|
|
915
|
-
return
|
|
916
|
-
() => {
|
|
917
|
-
const len = this.buffers.length;
|
|
918
|
-
for (let i = 0; i < len; i++) {
|
|
919
|
-
const idx = (this.pointer + i) & (len - 1);
|
|
920
|
-
const item = this.buffers[idx];
|
|
921
|
-
if (
|
|
922
|
-
item.size >= minSize &&
|
|
923
|
-
(allowOversize || item.size === minSize)
|
|
924
|
-
) {
|
|
925
|
-
this.pointer = (idx + 1) & (len - 1);
|
|
926
|
-
return item;
|
|
927
|
-
}
|
|
928
|
-
}
|
|
929
|
-
return null;
|
|
930
|
-
},
|
|
931
|
-
`Failed to acquire buffer of size >= ${minSize} from pool`,
|
|
932
|
-
{ minSize, allowOversize }
|
|
933
|
-
);
|
|
1142
|
+
static swap(a, b, m, n) {
|
|
1143
|
+
return m > n ? [b, a, n, m] : [a, b, m, n];
|
|
934
1144
|
}
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
1145
|
+
static clamp(res) {
|
|
1146
|
+
return Math.max(0, Math.min(1, res));
|
|
1147
|
+
}
|
|
1148
|
+
constructor(metric, a, b, opt = {}, symmetric = false) {
|
|
1149
|
+
this.metric = metric;
|
|
1150
|
+
this.a = Array.isArray(a) ? a : [a];
|
|
1151
|
+
this.b = Array.isArray(b) ? b : [b];
|
|
1152
|
+
ErrorUtil.assert(
|
|
1153
|
+
this.a.length > 0 && this.b.length > 0,
|
|
1154
|
+
`Inputs <a> and <b> must not be empty`,
|
|
1155
|
+
{ a: this.a, b: this.b }
|
|
945
1156
|
);
|
|
1157
|
+
this.options = opt;
|
|
1158
|
+
this.optKey = Hasher.fastFNV1a(
|
|
1159
|
+
JSON.stringify(opt, Object.keys(opt).sort())
|
|
1160
|
+
).toString();
|
|
1161
|
+
this.symmetric = symmetric;
|
|
946
1162
|
}
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
1163
|
+
preCompute(a, b, m, n) {
|
|
1164
|
+
if (a === b) return { res: 1 };
|
|
1165
|
+
if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
|
|
1166
|
+
return undefined;
|
|
950
1167
|
}
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
type: 'int32',
|
|
956
|
-
maxSize: 64,
|
|
957
|
-
maxItemSize: 2048,
|
|
958
|
-
allowOversize: true
|
|
959
|
-
},
|
|
960
|
-
'number[]': {
|
|
961
|
-
type: 'number[]',
|
|
962
|
-
maxSize: 16,
|
|
963
|
-
maxItemSize: 1024,
|
|
964
|
-
allowOversize: false
|
|
965
|
-
},
|
|
966
|
-
'string[]': {
|
|
967
|
-
type: 'string[]',
|
|
968
|
-
maxSize: 2,
|
|
969
|
-
maxItemSize: 1024,
|
|
970
|
-
allowOversize: false
|
|
971
|
-
},
|
|
972
|
-
set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
|
|
973
|
-
map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
|
|
974
|
-
};
|
|
975
|
-
static POOLS = {
|
|
976
|
-
int32: new RingPool(64),
|
|
977
|
-
'number[]': new RingPool(16),
|
|
978
|
-
'string[]': new RingPool(2),
|
|
979
|
-
set: new RingPool(8),
|
|
980
|
-
map: new RingPool(8)
|
|
981
|
-
};
|
|
982
|
-
static allocate(type, size) {
|
|
983
|
-
switch (type) {
|
|
984
|
-
case 'int32':
|
|
985
|
-
return new Int32Array(size);
|
|
986
|
-
case 'number[]':
|
|
987
|
-
return new Float64Array(size);
|
|
988
|
-
case 'string[]':
|
|
989
|
-
return new Array(size);
|
|
990
|
-
case 'set':
|
|
991
|
-
return new Set();
|
|
992
|
-
case 'map':
|
|
993
|
-
return new Map();
|
|
994
|
-
}
|
|
995
|
-
}
|
|
996
|
-
static acquire(type, size) {
|
|
997
|
-
const CONFIG = this.CONFIG[type];
|
|
998
|
-
if (!CONFIG)
|
|
999
|
-
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
1000
|
-
if (size > CONFIG.maxItemSize) return this.allocate(type, size);
|
|
1001
|
-
const item = this.POOLS[type].acquire(size, CONFIG.allowOversize);
|
|
1002
|
-
if (item)
|
|
1003
|
-
return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
|
|
1004
|
-
return this.allocate(type, size);
|
|
1168
|
+
compute(a, b, m, n, maxLen) {
|
|
1169
|
+
throw new CmpStrInternalError(
|
|
1170
|
+
`Method compute() must be overridden in a subclass`
|
|
1171
|
+
);
|
|
1005
1172
|
}
|
|
1006
|
-
|
|
1007
|
-
return
|
|
1173
|
+
runSingle(i, j) {
|
|
1174
|
+
return ErrorUtil.wrap(
|
|
1175
|
+
() => {
|
|
1176
|
+
let a = String(this.a[i]),
|
|
1177
|
+
A = a;
|
|
1178
|
+
let b = String(this.b[j]),
|
|
1179
|
+
B = b;
|
|
1180
|
+
let m = A.length,
|
|
1181
|
+
n = B.length;
|
|
1182
|
+
let result = this.preCompute(A, B, m, n);
|
|
1183
|
+
if (!result) {
|
|
1184
|
+
result = profiler$2.run(() => {
|
|
1185
|
+
if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
|
|
1186
|
+
let key = Metric.cache.key(this.metric, [A, B], this.symmetric);
|
|
1187
|
+
if (key) key += this.optKey;
|
|
1188
|
+
return (
|
|
1189
|
+
Metric.cache.get(key || '') ??
|
|
1190
|
+
(() => {
|
|
1191
|
+
const maxLen = m > n ? m : n;
|
|
1192
|
+
const res = this.compute(A, B, m, n, maxLen);
|
|
1193
|
+
if (key) Metric.cache.set(key, res);
|
|
1194
|
+
return res;
|
|
1195
|
+
})()
|
|
1196
|
+
);
|
|
1197
|
+
});
|
|
1198
|
+
}
|
|
1199
|
+
return {
|
|
1200
|
+
metric: this.metric,
|
|
1201
|
+
a: this.origA.length > i ? this.origA[i] : a,
|
|
1202
|
+
b: this.origB.length > j ? this.origB[j] : b,
|
|
1203
|
+
...result
|
|
1204
|
+
};
|
|
1205
|
+
},
|
|
1206
|
+
`Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
|
|
1207
|
+
{ i, j }
|
|
1208
|
+
);
|
|
1008
1209
|
}
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
if (!CONFIG)
|
|
1012
|
-
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
1013
|
-
if (size <= CONFIG.maxItemSize) this.POOLS[type].release({ buffer, size });
|
|
1210
|
+
async runSingleAsync(i, j) {
|
|
1211
|
+
return Promise.resolve(this.runSingle(i, j));
|
|
1014
1212
|
}
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
return new StructuredData(data, key);
|
|
1213
|
+
runBatch() {
|
|
1214
|
+
const results = [];
|
|
1215
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1216
|
+
for (let j = 0; j < this.b.length; j++)
|
|
1217
|
+
results.push(this.runSingle(i, j));
|
|
1218
|
+
this.results = results;
|
|
1022
1219
|
}
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1220
|
+
async runBatchAsync() {
|
|
1221
|
+
const tasks = [];
|
|
1222
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1223
|
+
for (let j = 0; j < this.b.length; j++)
|
|
1224
|
+
tasks.push(this.runSingleAsync(i, j));
|
|
1225
|
+
this.results = await Promise.all(tasks);
|
|
1026
1226
|
}
|
|
1027
|
-
|
|
1028
|
-
const
|
|
1029
|
-
for (let i = 0; i <
|
|
1030
|
-
|
|
1031
|
-
result[i] = typeof val === 'string' ? val : String(val ?? '');
|
|
1032
|
-
}
|
|
1033
|
-
return result;
|
|
1227
|
+
runPairwise() {
|
|
1228
|
+
const results = [];
|
|
1229
|
+
for (let i = 0; i < this.a.length; i++) results.push(this.runSingle(i, i));
|
|
1230
|
+
this.results = results;
|
|
1034
1231
|
}
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
);
|
|
1232
|
+
async runPairwiseAsync() {
|
|
1233
|
+
const tasks = [];
|
|
1234
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1235
|
+
tasks.push(this.runSingleAsync(i, i));
|
|
1236
|
+
this.results = await Promise.all(tasks);
|
|
1040
1237
|
}
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
'source' in v &&
|
|
1046
|
-
'target' in v &&
|
|
1047
|
-
'match' in v
|
|
1048
|
-
);
|
|
1238
|
+
setOriginal(a, b) {
|
|
1239
|
+
if (a) this.origA = Array.isArray(a) ? a : [a];
|
|
1240
|
+
if (b) this.origB = Array.isArray(b) ? b : [b];
|
|
1241
|
+
return this;
|
|
1049
1242
|
}
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
const first = results[0];
|
|
1053
|
-
let normalized = [];
|
|
1054
|
-
if (this.isMetricResult(first)) normalized = results;
|
|
1055
|
-
else if (this.isCmpStrResult(first))
|
|
1056
|
-
normalized = results.map((r) => ({
|
|
1057
|
-
metric: 'unknown',
|
|
1058
|
-
a: r.source,
|
|
1059
|
-
b: r.target,
|
|
1060
|
-
res: r.match,
|
|
1061
|
-
raw: r.raw
|
|
1062
|
-
}));
|
|
1063
|
-
else
|
|
1064
|
-
throw new CmpStrValidationError(
|
|
1065
|
-
'Unsupported result format for StructuredData normalization.'
|
|
1066
|
-
);
|
|
1067
|
-
return normalized.map((r, idx) => ({ ...r, __idx: idx }));
|
|
1243
|
+
isBatch() {
|
|
1244
|
+
return this.a.length > 1 || this.b.length > 1;
|
|
1068
1245
|
}
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
for (let i = 0; i < extractedStrings.length; i++) {
|
|
1072
|
-
const str = extractedStrings[i];
|
|
1073
|
-
if (!stringToIndices.has(str)) stringToIndices.set(str, []);
|
|
1074
|
-
stringToIndices.get(str).push(i);
|
|
1075
|
-
}
|
|
1076
|
-
const output = new Array(results.length);
|
|
1077
|
-
const occurrenceCount = new Map();
|
|
1078
|
-
let out = 0;
|
|
1079
|
-
for (let i = 0; i < results.length; i++) {
|
|
1080
|
-
const result = results[i];
|
|
1081
|
-
if (removeZero && result.res === 0) continue;
|
|
1082
|
-
const targetStr = result.b || '';
|
|
1083
|
-
const indices = stringToIndices.get(targetStr);
|
|
1084
|
-
let dataIndex;
|
|
1085
|
-
if (indices && indices.length > 0) {
|
|
1086
|
-
const occurrence = occurrenceCount.get(targetStr) ?? 0;
|
|
1087
|
-
occurrenceCount.set(targetStr, occurrence + 1);
|
|
1088
|
-
dataIndex = indices[occurrence % indices.length];
|
|
1089
|
-
} else {
|
|
1090
|
-
dataIndex = result.__idx ?? i;
|
|
1091
|
-
}
|
|
1092
|
-
if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
|
|
1093
|
-
const sourceObj = sourceData[dataIndex];
|
|
1094
|
-
const mappedTarget = extractedStrings[dataIndex] || targetStr;
|
|
1095
|
-
if (objectsOnly) output[out++] = sourceObj;
|
|
1096
|
-
else
|
|
1097
|
-
output[out++] = {
|
|
1098
|
-
obj: sourceObj,
|
|
1099
|
-
key: this.key,
|
|
1100
|
-
result: { source: result.a, target: mappedTarget, match: result.res },
|
|
1101
|
-
...(result.raw ? { raw: result.raw } : null)
|
|
1102
|
-
};
|
|
1103
|
-
}
|
|
1104
|
-
output.length = out;
|
|
1105
|
-
return output;
|
|
1246
|
+
isSingle() {
|
|
1247
|
+
return !this.isBatch();
|
|
1106
1248
|
}
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1249
|
+
isPairwise(safe = false) {
|
|
1250
|
+
return this.isBatch() && this.a.length === this.b.length
|
|
1251
|
+
? true
|
|
1252
|
+
: !safe &&
|
|
1253
|
+
(() => {
|
|
1254
|
+
throw new CmpStrUsageError(
|
|
1255
|
+
`Mode <pairwise> requires arrays of equal length`,
|
|
1256
|
+
{ a: this.a, b: this.b }
|
|
1257
|
+
);
|
|
1258
|
+
})();
|
|
1111
1259
|
}
|
|
1112
|
-
|
|
1113
|
-
return this.
|
|
1114
|
-
this.sort(this.normalizeResults(results), opt?.sort),
|
|
1115
|
-
this.data,
|
|
1116
|
-
extractedStrings,
|
|
1117
|
-
opt?.removeZero,
|
|
1118
|
-
opt?.objectsOnly
|
|
1119
|
-
);
|
|
1260
|
+
isSymmetrical() {
|
|
1261
|
+
return this.symmetric;
|
|
1120
1262
|
}
|
|
1121
|
-
|
|
1122
|
-
return
|
|
1123
|
-
() => this.finalizeLookup(fn(), extractedStrings, opt),
|
|
1124
|
-
'StructuredData lookup failed',
|
|
1125
|
-
{ key: this.key }
|
|
1126
|
-
);
|
|
1263
|
+
whichMode(mode) {
|
|
1264
|
+
return mode ?? this.options.mode ?? 'default';
|
|
1127
1265
|
}
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
async () => this.finalizeLookup(await fn(), extractedStrings, opt),
|
|
1131
|
-
'StructuredData async lookup failed',
|
|
1132
|
-
{ key: this.key }
|
|
1133
|
-
);
|
|
1266
|
+
clear() {
|
|
1267
|
+
this.results = undefined;
|
|
1134
1268
|
}
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1269
|
+
run(mode, clear = true) {
|
|
1270
|
+
if (clear) this.clear();
|
|
1271
|
+
switch (this.whichMode(mode)) {
|
|
1272
|
+
case 'default':
|
|
1273
|
+
if (this.isSingle()) {
|
|
1274
|
+
this.results = this.runSingle(0, 0);
|
|
1275
|
+
break;
|
|
1276
|
+
}
|
|
1277
|
+
case 'batch':
|
|
1278
|
+
this.runBatch();
|
|
1279
|
+
break;
|
|
1280
|
+
case 'single':
|
|
1281
|
+
this.results = this.runSingle(0, 0);
|
|
1282
|
+
break;
|
|
1283
|
+
case 'pairwise':
|
|
1284
|
+
if (this.isPairwise()) this.runPairwise();
|
|
1285
|
+
break;
|
|
1286
|
+
default:
|
|
1287
|
+
throw new CmpStrInternalError(`Unsupported mode <${mode}>`);
|
|
1141
1288
|
}
|
|
1142
1289
|
}
|
|
1143
|
-
async
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1290
|
+
async runAsync(mode, clear = true) {
|
|
1291
|
+
if (clear) this.clear();
|
|
1292
|
+
switch (this.whichMode(mode)) {
|
|
1293
|
+
case 'default':
|
|
1294
|
+
if (this.isSingle()) {
|
|
1295
|
+
this.results = await this.runSingleAsync(0, 0);
|
|
1296
|
+
break;
|
|
1297
|
+
}
|
|
1298
|
+
case 'batch':
|
|
1299
|
+
await this.runBatchAsync();
|
|
1300
|
+
break;
|
|
1301
|
+
case 'single':
|
|
1302
|
+
this.results = await this.runSingleAsync(0, 0);
|
|
1303
|
+
break;
|
|
1304
|
+
case 'pairwise':
|
|
1305
|
+
if (this.isPairwise()) await this.runPairwiseAsync();
|
|
1306
|
+
break;
|
|
1307
|
+
default:
|
|
1308
|
+
throw new CmpStrInternalError(`Unsupported async mode <${mode}>`);
|
|
1149
1309
|
}
|
|
1150
1310
|
}
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
const b = this.extractFrom(other, otherKey);
|
|
1154
|
-
try {
|
|
1155
|
-
return this.performLookup(() => fn(a, b, opt), a, opt);
|
|
1156
|
-
} finally {
|
|
1157
|
-
Pool.release('string[]', a, a.length);
|
|
1158
|
-
Pool.release('string[]', b, b.length);
|
|
1159
|
-
}
|
|
1311
|
+
getMetricName() {
|
|
1312
|
+
return this.metric;
|
|
1160
1313
|
}
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
Pool.release('string[]', a, a.length);
|
|
1168
|
-
Pool.release('string[]', b, b.length);
|
|
1169
|
-
}
|
|
1314
|
+
getResults() {
|
|
1315
|
+
ErrorUtil.assert(
|
|
1316
|
+
this.results !== undefined,
|
|
1317
|
+
`run() must be called before getResults()`
|
|
1318
|
+
);
|
|
1319
|
+
return this.results;
|
|
1170
1320
|
}
|
|
1171
1321
|
}
|
|
1322
|
+
const MetricRegistry = Registry('metric', Metric);
|
|
1172
1323
|
|
|
1173
|
-
class
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
sentence: /(?<=[.!?])\s+/,
|
|
1177
|
-
word: /\p{L}+/gu,
|
|
1178
|
-
nonWord: /[^\p{L}]/gu,
|
|
1179
|
-
vowelGroup: /[aeiouy]+/g,
|
|
1180
|
-
letter: /\p{L}/gu,
|
|
1181
|
-
ucLetter: /\p{Lu}/gu
|
|
1182
|
-
};
|
|
1183
|
-
text;
|
|
1184
|
-
words = [];
|
|
1185
|
-
sentences = [];
|
|
1186
|
-
charFrequency = new Map();
|
|
1187
|
-
wordHistogram = new Map();
|
|
1188
|
-
syllableCache = new Map();
|
|
1189
|
-
syllableStats;
|
|
1190
|
-
constructor(input) {
|
|
1191
|
-
this.text = input.trim();
|
|
1192
|
-
this.tokenize();
|
|
1193
|
-
this.computeFrequencies();
|
|
1324
|
+
class CosineSimilarity extends Metric {
|
|
1325
|
+
constructor(a, b, opt = {}) {
|
|
1326
|
+
super('cosine', a, b, opt, true);
|
|
1194
1327
|
}
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
const
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
this.sentences = this.text
|
|
1201
|
-
.split(TextAnalyzer.REGEX.sentence)
|
|
1202
|
-
.filter(Boolean);
|
|
1328
|
+
_termFreq(str, delimiter) {
|
|
1329
|
+
const terms = str.split(delimiter);
|
|
1330
|
+
const freq = Pool.acquire('map', terms.length);
|
|
1331
|
+
for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
|
|
1332
|
+
return freq;
|
|
1203
1333
|
}
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1334
|
+
compute(a, b) {
|
|
1335
|
+
const { delimiter = ' ' } = this.options;
|
|
1336
|
+
const termsA = this._termFreq(a, delimiter);
|
|
1337
|
+
const termsB = this._termFreq(b, delimiter);
|
|
1338
|
+
try {
|
|
1339
|
+
let dotP = 0,
|
|
1340
|
+
magA = 0,
|
|
1341
|
+
magB = 0;
|
|
1342
|
+
for (const [term, freqA] of termsA) {
|
|
1343
|
+
const freqB = termsB.get(term) || 0;
|
|
1344
|
+
dotP += freqA * freqB;
|
|
1345
|
+
magA += freqA * freqA;
|
|
1346
|
+
}
|
|
1347
|
+
for (const freqB of termsB.values()) magB += freqB * freqB;
|
|
1348
|
+
magA = Math.sqrt(magA);
|
|
1349
|
+
magB = Math.sqrt(magB);
|
|
1350
|
+
return {
|
|
1351
|
+
res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
|
|
1352
|
+
raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
|
|
1353
|
+
};
|
|
1354
|
+
} finally {
|
|
1355
|
+
Pool.release('map', termsA, termsA.size);
|
|
1356
|
+
Pool.release('map', termsB, termsB.size);
|
|
1357
|
+
}
|
|
1209
1358
|
}
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
|
|
1217
|
-
const count = matches ? matches.length : 1;
|
|
1218
|
-
this.syllableCache.set(clean, count);
|
|
1219
|
-
return count;
|
|
1359
|
+
}
|
|
1360
|
+
MetricRegistry.add('cosine', CosineSimilarity);
|
|
1361
|
+
|
|
1362
|
+
class DamerauLevenshteinDistance extends Metric {
|
|
1363
|
+
constructor(a, b, opt = {}) {
|
|
1364
|
+
super('damerau', a, b, opt, true);
|
|
1220
1365
|
}
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1366
|
+
compute(a, b, m, n, maxLen) {
|
|
1367
|
+
const len = m + 1;
|
|
1368
|
+
const [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
|
|
1369
|
+
try {
|
|
1370
|
+
for (let i = 0; i <= m; i++) prev[i] = i;
|
|
1371
|
+
for (let j = 1; j <= n; j++) {
|
|
1372
|
+
curr[0] = j;
|
|
1373
|
+
const cb = b.charCodeAt(j - 1);
|
|
1374
|
+
for (let i = 1; i <= m; i++) {
|
|
1375
|
+
const ca = a.charCodeAt(i - 1);
|
|
1376
|
+
const cost = ca === cb ? 0 : 1;
|
|
1377
|
+
let val = Math.min(curr[i - 1] + 1, prev[i] + 1, prev[i - 1] + cost);
|
|
1378
|
+
if (
|
|
1379
|
+
i > 1 &&
|
|
1380
|
+
j > 1 &&
|
|
1381
|
+
ca === b.charCodeAt(j - 2) &&
|
|
1382
|
+
cb === a.charCodeAt(i - 2)
|
|
1383
|
+
)
|
|
1384
|
+
val = Math.min(val, test[i - 2] + cost);
|
|
1385
|
+
curr[i] = val;
|
|
1386
|
+
}
|
|
1387
|
+
test.set(prev);
|
|
1388
|
+
prev.set(curr);
|
|
1389
|
+
}
|
|
1390
|
+
const dist = prev[m];
|
|
1233
1391
|
return {
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
perWord,
|
|
1237
|
-
avg: perWord.length ? total / perWord.length : 0,
|
|
1238
|
-
median
|
|
1392
|
+
res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
|
|
1393
|
+
raw: { dist, maxLen }
|
|
1239
1394
|
};
|
|
1240
|
-
}
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
getAvgWordLength() {
|
|
1246
|
-
return this.words.length
|
|
1247
|
-
? this.words.join('').length / this.words.length
|
|
1248
|
-
: 0;
|
|
1395
|
+
} finally {
|
|
1396
|
+
Pool.release('int32', test, len);
|
|
1397
|
+
Pool.release('int32', prev, len);
|
|
1398
|
+
Pool.release('int32', curr, len);
|
|
1399
|
+
}
|
|
1249
1400
|
}
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1401
|
+
}
|
|
1402
|
+
MetricRegistry.add('damerau', DamerauLevenshteinDistance);
|
|
1403
|
+
|
|
1404
|
+
class DiceSorensenCoefficient extends Metric {
|
|
1405
|
+
constructor(a, b, opt = {}) {
|
|
1406
|
+
super('dice', a, b, opt, true);
|
|
1254
1407
|
}
|
|
1255
|
-
|
|
1256
|
-
|
|
1408
|
+
_bigrams(str) {
|
|
1409
|
+
const len = str.length - 1;
|
|
1410
|
+
const bigrams = Pool.acquire('set', len);
|
|
1411
|
+
for (let i = 0; i < len; i++) bigrams.add(str.substring(i, i + 2));
|
|
1412
|
+
return bigrams;
|
|
1257
1413
|
}
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
.
|
|
1261
|
-
|
|
1262
|
-
|
|
1414
|
+
compute(a, b) {
|
|
1415
|
+
const setA = this._bigrams(a),
|
|
1416
|
+
setB = this._bigrams(b);
|
|
1417
|
+
const sizeA = setA.size,
|
|
1418
|
+
sizeB = setB.size;
|
|
1419
|
+
try {
|
|
1420
|
+
let intersection = 0;
|
|
1421
|
+
for (const bigram of setA) if (setB.has(bigram)) intersection++;
|
|
1422
|
+
const size = sizeA + sizeB;
|
|
1423
|
+
return {
|
|
1424
|
+
res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
|
|
1425
|
+
raw: { intersection, size }
|
|
1426
|
+
};
|
|
1427
|
+
} finally {
|
|
1428
|
+
Pool.release('set', setA, sizeA);
|
|
1429
|
+
Pool.release('set', setB, sizeB);
|
|
1430
|
+
}
|
|
1263
1431
|
}
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1432
|
+
}
|
|
1433
|
+
MetricRegistry.add('dice', DiceSorensenCoefficient);
|
|
1434
|
+
|
|
1435
|
+
class HammingDistance extends Metric {
|
|
1436
|
+
constructor(a, b, opt = {}) {
|
|
1437
|
+
super('hamming', a, b, opt, true);
|
|
1268
1438
|
}
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1439
|
+
compute(a, b, m, n, maxLen) {
|
|
1440
|
+
if (m !== n) {
|
|
1441
|
+
if (this.options.pad !== undefined) {
|
|
1442
|
+
if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
|
|
1443
|
+
if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
|
|
1444
|
+
m = n = maxLen;
|
|
1445
|
+
} else
|
|
1446
|
+
throw new CmpStrUsageError(
|
|
1447
|
+
`Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
|
|
1448
|
+
`use option.pad for automatic adjustment`,
|
|
1449
|
+
{ a: m, b: n }
|
|
1450
|
+
);
|
|
1451
|
+
}
|
|
1452
|
+
let dist = 0;
|
|
1453
|
+
for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
|
|
1454
|
+
return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
|
|
1274
1455
|
}
|
|
1275
|
-
|
|
1276
|
-
|
|
1456
|
+
}
|
|
1457
|
+
MetricRegistry.add('hamming', HammingDistance);
|
|
1458
|
+
|
|
1459
|
+
class JaccardIndex extends Metric {
|
|
1460
|
+
constructor(a, b, opt = {}) {
|
|
1461
|
+
super('jaccard', a, b, opt, true);
|
|
1277
1462
|
}
|
|
1278
|
-
|
|
1279
|
-
const
|
|
1280
|
-
|
|
1281
|
-
const
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1463
|
+
compute(a, b, m, n) {
|
|
1464
|
+
const [setA, setB] = Pool.acquireMany('set', [m, n]);
|
|
1465
|
+
try {
|
|
1466
|
+
for (const A of a) setA.add(A);
|
|
1467
|
+
for (const B of b) setB.add(B);
|
|
1468
|
+
let intersection = 0;
|
|
1469
|
+
for (const c of setA) if (setB.has(c)) intersection++;
|
|
1470
|
+
const union = setA.size + setB.size - intersection;
|
|
1471
|
+
return {
|
|
1472
|
+
res: union === 0 ? 1 : Metric.clamp(intersection / union),
|
|
1473
|
+
raw: { intersection, union }
|
|
1474
|
+
};
|
|
1475
|
+
} finally {
|
|
1476
|
+
Pool.release('set', setA, m);
|
|
1477
|
+
Pool.release('set', setB, n);
|
|
1287
1478
|
}
|
|
1288
|
-
return result;
|
|
1289
1479
|
}
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1480
|
+
}
|
|
1481
|
+
MetricRegistry.add('jaccard', JaccardIndex);
|
|
1482
|
+
|
|
1483
|
+
class JaroWinklerDistance extends Metric {
|
|
1484
|
+
constructor(a, b, opt = {}) {
|
|
1485
|
+
super('jaroWinkler', a, b, opt, true);
|
|
1294
1486
|
}
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1487
|
+
compute(a, b, m, n) {
|
|
1488
|
+
const [matchA, matchB] = Pool.acquireMany('int32', [m, n]);
|
|
1489
|
+
try {
|
|
1490
|
+
for (let i = 0; i < m; i++) matchA[i] = 0;
|
|
1491
|
+
for (let i = 0; i < n; i++) matchB[i] = 0;
|
|
1492
|
+
const matchWindow = Math.max(0, Math.floor(n / 2) - 1);
|
|
1493
|
+
let matches = 0;
|
|
1494
|
+
for (let i = 0; i < m; i++) {
|
|
1495
|
+
const start = Math.max(0, i - matchWindow);
|
|
1496
|
+
const end = Math.min(i + matchWindow + 1, n);
|
|
1497
|
+
for (let j = start; j < end; j++) {
|
|
1498
|
+
if (!matchB[j] && a[i] === b[j]) {
|
|
1499
|
+
matchA[i] = 1;
|
|
1500
|
+
matchB[j] = 1;
|
|
1501
|
+
matches++;
|
|
1502
|
+
break;
|
|
1503
|
+
}
|
|
1504
|
+
}
|
|
1505
|
+
}
|
|
1506
|
+
let transpos = 0,
|
|
1507
|
+
jaro = 0,
|
|
1508
|
+
prefix = 0,
|
|
1509
|
+
res = 0;
|
|
1510
|
+
if (matches > 0) {
|
|
1511
|
+
let k = 0;
|
|
1512
|
+
for (let i = 0; i < m; i++) {
|
|
1513
|
+
if (matchA[i]) {
|
|
1514
|
+
while (!matchB[k]) k++;
|
|
1515
|
+
if (a[i] !== b[k]) transpos++;
|
|
1516
|
+
k++;
|
|
1517
|
+
}
|
|
1518
|
+
}
|
|
1519
|
+
transpos /= 2;
|
|
1520
|
+
jaro = (matches / m + matches / n + (matches - transpos) / matches) / 3;
|
|
1521
|
+
for (let i = 0; i < Math.min(4, m, n); i++) {
|
|
1522
|
+
if (a[i] === b[i]) prefix++;
|
|
1523
|
+
else break;
|
|
1524
|
+
}
|
|
1525
|
+
res = jaro + prefix * 0.1 * (1 - jaro);
|
|
1526
|
+
}
|
|
1527
|
+
return {
|
|
1528
|
+
res: Metric.clamp(res),
|
|
1529
|
+
raw: { matchWindow, matches, transpos, jaro, prefix }
|
|
1530
|
+
};
|
|
1531
|
+
} finally {
|
|
1532
|
+
Pool.release('int32', matchA, m);
|
|
1533
|
+
Pool.release('int32', matchB, n);
|
|
1534
|
+
}
|
|
1308
1535
|
}
|
|
1309
|
-
|
|
1310
|
-
|
|
1536
|
+
}
|
|
1537
|
+
MetricRegistry.add('jaroWinkler', JaroWinklerDistance);
|
|
1538
|
+
|
|
1539
|
+
class LCSMetric extends Metric {
|
|
1540
|
+
constructor(a, b, opt = {}) {
|
|
1541
|
+
super('lcs', a, b, opt, true);
|
|
1311
1542
|
}
|
|
1312
|
-
|
|
1313
|
-
|
|
1543
|
+
compute(a, b, m, n, maxLen) {
|
|
1544
|
+
const len = m + 1;
|
|
1545
|
+
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1546
|
+
try {
|
|
1547
|
+
for (let i = 0; i <= m; i++) prev[i] = 0;
|
|
1548
|
+
for (let j = 1; j <= n; j++) {
|
|
1549
|
+
curr[0] = 0;
|
|
1550
|
+
const cb = b.charCodeAt(j - 1);
|
|
1551
|
+
for (let i = 1; i <= m; i++) {
|
|
1552
|
+
if (a.charCodeAt(i - 1) === cb) curr[i] = prev[i - 1] + 1;
|
|
1553
|
+
else curr[i] = Math.max(prev[i], curr[i - 1]);
|
|
1554
|
+
}
|
|
1555
|
+
prev.set(curr);
|
|
1556
|
+
}
|
|
1557
|
+
const lcs = prev[m];
|
|
1558
|
+
return {
|
|
1559
|
+
res: maxLen === 0 ? 1 : Metric.clamp(lcs / maxLen),
|
|
1560
|
+
raw: { lcs, maxLen }
|
|
1561
|
+
};
|
|
1562
|
+
} finally {
|
|
1563
|
+
Pool.release('int32', prev, len);
|
|
1564
|
+
Pool.release('int32', curr, len);
|
|
1565
|
+
}
|
|
1314
1566
|
}
|
|
1315
|
-
|
|
1316
|
-
|
|
1567
|
+
}
|
|
1568
|
+
MetricRegistry.add('lcs', LCSMetric);
|
|
1569
|
+
|
|
1570
|
+
class LevenshteinDistance extends Metric {
|
|
1571
|
+
constructor(a, b, opt = {}) {
|
|
1572
|
+
super('levenshtein', a, b, opt, true);
|
|
1317
1573
|
}
|
|
1318
|
-
|
|
1574
|
+
compute(a, b, m, n, maxLen) {
|
|
1575
|
+
const len = m + 1;
|
|
1576
|
+
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1319
1577
|
try {
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1578
|
+
for (let i = 0; i <= m; i++) prev[i] = i;
|
|
1579
|
+
for (let j = 1; j <= n; j++) {
|
|
1580
|
+
curr[0] = j;
|
|
1581
|
+
const cb = b.charCodeAt(j - 1);
|
|
1582
|
+
for (let i = 1; i <= m; i++) {
|
|
1583
|
+
const cost = a.charCodeAt(i - 1) === cb ? 0 : 1;
|
|
1584
|
+
curr[i] = Math.min(curr[i - 1] + 1, prev[i] + 1, prev[i - 1] + cost);
|
|
1585
|
+
}
|
|
1586
|
+
prev.set(curr);
|
|
1587
|
+
}
|
|
1588
|
+
const dist = prev[m];
|
|
1589
|
+
return {
|
|
1590
|
+
res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
|
|
1591
|
+
raw: { dist, maxLen }
|
|
1592
|
+
};
|
|
1593
|
+
} finally {
|
|
1594
|
+
Pool.release('int32', prev, len);
|
|
1595
|
+
Pool.release('int32', curr, len);
|
|
1326
1596
|
}
|
|
1327
1597
|
}
|
|
1328
|
-
|
|
1329
|
-
|
|
1598
|
+
}
|
|
1599
|
+
MetricRegistry.add('levenshtein', LevenshteinDistance);
|
|
1600
|
+
|
|
1601
|
+
class NeedlemanWunschDistance extends Metric {
|
|
1602
|
+
constructor(a, b, opt = {}) {
|
|
1603
|
+
super('needlemanWunsch', a, b, opt, true);
|
|
1330
1604
|
}
|
|
1331
|
-
|
|
1332
|
-
const
|
|
1333
|
-
const
|
|
1334
|
-
const
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1605
|
+
compute(a, b, m, n, maxLen) {
|
|
1606
|
+
const { match = 1, mismatch = -1, gap = -1 } = this.options;
|
|
1607
|
+
const len = m + 1;
|
|
1608
|
+
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1609
|
+
try {
|
|
1610
|
+
prev[0] = 0;
|
|
1611
|
+
for (let i = 1; i <= m; i++) prev[i] = prev[i - 1] + gap;
|
|
1612
|
+
for (let j = 1; j <= n; j++) {
|
|
1613
|
+
curr[0] = prev[0] + gap;
|
|
1614
|
+
const cb = b.charCodeAt(j - 1);
|
|
1615
|
+
for (let i = 1; i <= m; i++) {
|
|
1616
|
+
const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
|
|
1617
|
+
curr[i] = Math.max(
|
|
1618
|
+
prev[i - 1] + score,
|
|
1619
|
+
prev[i] + gap,
|
|
1620
|
+
curr[i - 1] + gap
|
|
1621
|
+
);
|
|
1622
|
+
}
|
|
1623
|
+
prev.set(curr);
|
|
1624
|
+
}
|
|
1625
|
+
const score = prev[m];
|
|
1626
|
+
const denum = maxLen * match;
|
|
1627
|
+
return {
|
|
1628
|
+
res: denum === 0 ? 0 : Metric.clamp(score / denum),
|
|
1629
|
+
raw: { score, denum }
|
|
1630
|
+
};
|
|
1631
|
+
} finally {
|
|
1632
|
+
Pool.release('int32', prev, len);
|
|
1633
|
+
Pool.release('int32', curr, len);
|
|
1344
1634
|
}
|
|
1345
1635
|
}
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1636
|
+
}
|
|
1637
|
+
MetricRegistry.add('needlemanWunsch', NeedlemanWunschDistance);
|
|
1638
|
+
|
|
1639
|
+
class QGramSimilarity extends Metric {
|
|
1640
|
+
constructor(a, b, opt = {}) {
|
|
1641
|
+
super('qGram', a, b, opt, true);
|
|
1351
1642
|
}
|
|
1352
|
-
|
|
1353
|
-
const
|
|
1354
|
-
const
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1643
|
+
_qGrams(str, q) {
|
|
1644
|
+
const len = Math.max(0, str.length - q + 1);
|
|
1645
|
+
const grams = Pool.acquire('set', len);
|
|
1646
|
+
for (let i = 0; i < len; i++) grams.add(str.slice(i, i + q));
|
|
1647
|
+
return grams;
|
|
1648
|
+
}
|
|
1649
|
+
compute(a, b) {
|
|
1650
|
+
const { q = 2 } = this.options;
|
|
1651
|
+
const setA = this._qGrams(a, q),
|
|
1652
|
+
setB = this._qGrams(b, q);
|
|
1653
|
+
const sizeA = setA.size,
|
|
1654
|
+
sizeB = setB.size;
|
|
1655
|
+
try {
|
|
1656
|
+
let intersection = 0;
|
|
1657
|
+
for (const gram of setA) if (setB.has(gram)) intersection++;
|
|
1658
|
+
const size = Math.max(sizeA, sizeB);
|
|
1659
|
+
return {
|
|
1660
|
+
res: size === 0 ? 1 : Metric.clamp(intersection / size),
|
|
1661
|
+
raw: { intersection, size }
|
|
1662
|
+
};
|
|
1663
|
+
} finally {
|
|
1664
|
+
Pool.release('set', setA, sizeA);
|
|
1665
|
+
Pool.release('set', setB, sizeB);
|
|
1666
|
+
}
|
|
1364
1667
|
}
|
|
1365
1668
|
}
|
|
1669
|
+
MetricRegistry.add('qGram', QGramSimilarity);
|
|
1366
1670
|
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
metric;
|
|
1371
|
-
a;
|
|
1372
|
-
b;
|
|
1373
|
-
origA = [];
|
|
1374
|
-
origB = [];
|
|
1375
|
-
options;
|
|
1376
|
-
optKey;
|
|
1377
|
-
symmetric;
|
|
1378
|
-
results;
|
|
1379
|
-
static clear = () => this.cache.clear();
|
|
1380
|
-
static swap = (a, b, m, n) => (m > n ? [b, a, n, m] : [a, b, m, n]);
|
|
1381
|
-
static clamp = (res) => Math.max(0, Math.min(1, res));
|
|
1382
|
-
constructor(metric, a, b, opt = {}, symmetric = false) {
|
|
1383
|
-
this.metric = metric;
|
|
1384
|
-
this.a = Array.isArray(a) ? a : [a];
|
|
1385
|
-
this.b = Array.isArray(b) ? b : [b];
|
|
1386
|
-
ErrorUtil.assert(
|
|
1387
|
-
this.a.length > 0 && this.b.length > 0,
|
|
1388
|
-
`Inputs <a> and <b> must not be empty`,
|
|
1389
|
-
{ a: this.a, b: this.b }
|
|
1390
|
-
);
|
|
1391
|
-
this.options = opt;
|
|
1392
|
-
this.optKey = Hasher.fastFNV1a(
|
|
1393
|
-
JSON.stringify(opt, Object.keys(opt).sort())
|
|
1394
|
-
).toString();
|
|
1395
|
-
this.symmetric = symmetric;
|
|
1671
|
+
class SmithWatermanDistance extends Metric {
|
|
1672
|
+
constructor(a, b, opt = {}) {
|
|
1673
|
+
super('smithWaterman', a, b, opt, true);
|
|
1396
1674
|
}
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
|
|
1400
|
-
return undefined;
|
|
1401
|
-
}
|
|
1402
|
-
compute(a, b, m, n, maxLen) {
|
|
1403
|
-
throw new CmpStrInternalError(
|
|
1404
|
-
`Method compute() must be overridden in a subclass`
|
|
1405
|
-
);
|
|
1406
|
-
}
|
|
1407
|
-
runSingle(i, j) {
|
|
1408
|
-
return ErrorUtil.wrap(
|
|
1409
|
-
() => {
|
|
1410
|
-
let a = String(this.a[i]),
|
|
1411
|
-
A = a;
|
|
1412
|
-
let b = String(this.b[j]),
|
|
1413
|
-
B = b;
|
|
1414
|
-
let m = A.length,
|
|
1415
|
-
n = B.length;
|
|
1416
|
-
let result = this.preCompute(A, B, m, n);
|
|
1417
|
-
if (!result) {
|
|
1418
|
-
result = profiler$2.run(() => {
|
|
1419
|
-
if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
|
|
1420
|
-
const key =
|
|
1421
|
-
Metric.cache.key(this.metric, [A, B], this.symmetric) +
|
|
1422
|
-
this.optKey;
|
|
1423
|
-
return (
|
|
1424
|
-
Metric.cache.get(key || '') ??
|
|
1425
|
-
(() => {
|
|
1426
|
-
const res = this.compute(A, B, m, n, Math.max(m, n));
|
|
1427
|
-
if (key) Metric.cache.set(key, res);
|
|
1428
|
-
return res;
|
|
1429
|
-
})()
|
|
1430
|
-
);
|
|
1431
|
-
});
|
|
1432
|
-
}
|
|
1433
|
-
return {
|
|
1434
|
-
metric: this.metric,
|
|
1435
|
-
a: this.origA[i] ?? a,
|
|
1436
|
-
b: this.origB[j] ?? b,
|
|
1437
|
-
...result
|
|
1438
|
-
};
|
|
1439
|
-
},
|
|
1440
|
-
`Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
|
|
1441
|
-
{ i, j }
|
|
1442
|
-
);
|
|
1443
|
-
}
|
|
1444
|
-
async runSingleAsync(i, j) {
|
|
1445
|
-
return Promise.resolve(this.runSingle(i, j));
|
|
1446
|
-
}
|
|
1447
|
-
runBatch() {
|
|
1448
|
-
const results = [];
|
|
1449
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1450
|
-
for (let j = 0; j < this.b.length; j++)
|
|
1451
|
-
results.push(this.runSingle(i, j));
|
|
1452
|
-
this.results = results;
|
|
1453
|
-
}
|
|
1454
|
-
async runBatchAsync() {
|
|
1455
|
-
const results = [];
|
|
1456
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1457
|
-
for (let j = 0; j < this.b.length; j++)
|
|
1458
|
-
results.push(await this.runSingleAsync(i, j));
|
|
1459
|
-
this.results = results;
|
|
1460
|
-
}
|
|
1461
|
-
runPairwise() {
|
|
1462
|
-
const results = [];
|
|
1463
|
-
for (let i = 0; i < this.a.length; i++) results.push(this.runSingle(i, i));
|
|
1464
|
-
this.results = results;
|
|
1465
|
-
}
|
|
1466
|
-
async runPairwiseAsync() {
|
|
1467
|
-
const results = [];
|
|
1468
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1469
|
-
results.push(await this.runSingleAsync(i, i));
|
|
1470
|
-
this.results = results;
|
|
1471
|
-
}
|
|
1472
|
-
setOriginal(a, b) {
|
|
1473
|
-
if (a) this.origA = Array.isArray(a) ? a : [a];
|
|
1474
|
-
if (b) this.origB = Array.isArray(b) ? b : [b];
|
|
1475
|
-
return this;
|
|
1476
|
-
}
|
|
1477
|
-
isBatch = () => this.a.length > 1 || this.b.length > 1;
|
|
1478
|
-
isSingle = () => !this.isBatch();
|
|
1479
|
-
isPairwise(safe = false) {
|
|
1480
|
-
return this.isBatch() && this.a.length === this.b.length
|
|
1481
|
-
? true
|
|
1482
|
-
: !safe &&
|
|
1483
|
-
(() => {
|
|
1484
|
-
throw new CmpStrUsageError(
|
|
1485
|
-
`Mode <pairwise> requires arrays of equal length`,
|
|
1486
|
-
{ a: this.a, b: this.b }
|
|
1487
|
-
);
|
|
1488
|
-
})();
|
|
1489
|
-
}
|
|
1490
|
-
isSymmetrical = () => this.symmetric;
|
|
1491
|
-
whichMode = (mode) => mode ?? this.options?.mode ?? 'default';
|
|
1492
|
-
clear = () => (this.results = undefined);
|
|
1493
|
-
run(mode, clear = true) {
|
|
1494
|
-
if (clear) this.clear();
|
|
1495
|
-
switch (this.whichMode(mode)) {
|
|
1496
|
-
case 'default':
|
|
1497
|
-
if (this.isSingle()) {
|
|
1498
|
-
this.results = this.runSingle(0, 0);
|
|
1499
|
-
break;
|
|
1500
|
-
}
|
|
1501
|
-
case 'batch':
|
|
1502
|
-
this.runBatch();
|
|
1503
|
-
break;
|
|
1504
|
-
case 'single':
|
|
1505
|
-
this.results = this.runSingle(0, 0);
|
|
1506
|
-
break;
|
|
1507
|
-
case 'pairwise':
|
|
1508
|
-
if (this.isPairwise()) this.runPairwise();
|
|
1509
|
-
break;
|
|
1510
|
-
default:
|
|
1511
|
-
throw new CmpStrInternalError(`Unsupported mode <${mode}>`);
|
|
1512
|
-
}
|
|
1513
|
-
}
|
|
1514
|
-
async runAsync(mode, clear = true) {
|
|
1515
|
-
if (clear) this.clear();
|
|
1516
|
-
switch (this.whichMode(mode)) {
|
|
1517
|
-
case 'default':
|
|
1518
|
-
if (this.isSingle()) {
|
|
1519
|
-
this.results = await this.runSingleAsync(0, 0);
|
|
1520
|
-
break;
|
|
1521
|
-
}
|
|
1522
|
-
case 'batch':
|
|
1523
|
-
await this.runBatchAsync();
|
|
1524
|
-
break;
|
|
1525
|
-
case 'single':
|
|
1526
|
-
this.results = await this.runSingleAsync(0, 0);
|
|
1527
|
-
break;
|
|
1528
|
-
case 'pairwise':
|
|
1529
|
-
if (this.isPairwise()) await this.runPairwiseAsync();
|
|
1530
|
-
break;
|
|
1531
|
-
default:
|
|
1532
|
-
throw new CmpStrInternalError(`Unsupported async mode <${mode}>`);
|
|
1533
|
-
}
|
|
1534
|
-
}
|
|
1535
|
-
getMetricName = () => this.metric;
|
|
1536
|
-
getResults() {
|
|
1537
|
-
ErrorUtil.assert(
|
|
1538
|
-
this.results !== undefined,
|
|
1539
|
-
`run() must be called before getResults()`
|
|
1540
|
-
);
|
|
1541
|
-
return this.results;
|
|
1542
|
-
}
|
|
1543
|
-
}
|
|
1544
|
-
const MetricRegistry = Registry('metric', Metric);
|
|
1545
|
-
|
|
1546
|
-
class CosineSimilarity extends Metric {
|
|
1547
|
-
constructor(a, b, opt = {}) {
|
|
1548
|
-
super('cosine', a, b, opt, true);
|
|
1549
|
-
}
|
|
1550
|
-
_termFreq(str, delimiter) {
|
|
1551
|
-
const terms = str.split(delimiter);
|
|
1552
|
-
const freq = Pool.acquire('map', terms.length);
|
|
1553
|
-
for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
|
|
1554
|
-
return freq;
|
|
1555
|
-
}
|
|
1556
|
-
compute(a, b) {
|
|
1557
|
-
const { delimiter = ' ' } = this.options;
|
|
1558
|
-
const termsA = this._termFreq(a, delimiter);
|
|
1559
|
-
const termsB = this._termFreq(b, delimiter);
|
|
1560
|
-
try {
|
|
1561
|
-
let dotP = 0,
|
|
1562
|
-
magA = 0,
|
|
1563
|
-
magB = 0;
|
|
1564
|
-
for (const [term, freqA] of termsA) {
|
|
1565
|
-
const freqB = termsB.get(term) || 0;
|
|
1566
|
-
dotP += freqA * freqB;
|
|
1567
|
-
magA += freqA * freqA;
|
|
1568
|
-
}
|
|
1569
|
-
for (const freqB of termsB.values()) magB += freqB * freqB;
|
|
1570
|
-
magA = Math.sqrt(magA);
|
|
1571
|
-
magB = Math.sqrt(magB);
|
|
1572
|
-
return {
|
|
1573
|
-
res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
|
|
1574
|
-
raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
|
|
1575
|
-
};
|
|
1576
|
-
} finally {
|
|
1577
|
-
Pool.release('map', termsA, termsA.size);
|
|
1578
|
-
Pool.release('map', termsB, termsB.size);
|
|
1579
|
-
}
|
|
1580
|
-
}
|
|
1581
|
-
}
|
|
1582
|
-
MetricRegistry.add('cosine', CosineSimilarity);
|
|
1583
|
-
|
|
1584
|
-
class DamerauLevenshteinDistance extends Metric {
|
|
1585
|
-
constructor(a, b, opt = {}) {
|
|
1586
|
-
super('damerau', a, b, opt, true);
|
|
1587
|
-
}
|
|
1588
|
-
compute(a, b, m, n, maxLen) {
|
|
1675
|
+
compute(a, b, m, n) {
|
|
1676
|
+
const { match = 2, mismatch = -1, gap = -2 } = this.options;
|
|
1589
1677
|
const len = m + 1;
|
|
1590
|
-
const [
|
|
1678
|
+
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1679
|
+
let maxScore = 0;
|
|
1591
1680
|
try {
|
|
1592
|
-
for (let i = 0; i <= m; i++) prev[i] =
|
|
1681
|
+
for (let i = 0; i <= m; i++) prev[i] = 0;
|
|
1593
1682
|
for (let j = 1; j <= n; j++) {
|
|
1594
|
-
curr[0] =
|
|
1683
|
+
curr[0] = 0;
|
|
1595
1684
|
const cb = b.charCodeAt(j - 1);
|
|
1596
1685
|
for (let i = 1; i <= m; i++) {
|
|
1597
|
-
const
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
i
|
|
1602
|
-
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
)
|
|
1606
|
-
val = Math.min(val, test[i - 2] + cost);
|
|
1607
|
-
curr[i] = val;
|
|
1686
|
+
const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
|
|
1687
|
+
curr[i] = Math.max(
|
|
1688
|
+
0,
|
|
1689
|
+
prev[i - 1] + score,
|
|
1690
|
+
prev[i] + gap,
|
|
1691
|
+
curr[i - 1] + gap
|
|
1692
|
+
);
|
|
1693
|
+
if (curr[i] > maxScore) maxScore = curr[i];
|
|
1608
1694
|
}
|
|
1609
|
-
test.set(prev);
|
|
1610
1695
|
prev.set(curr);
|
|
1611
1696
|
}
|
|
1612
|
-
const
|
|
1697
|
+
const denum = Math.min(m * match, n * match);
|
|
1613
1698
|
return {
|
|
1614
|
-
res:
|
|
1615
|
-
raw: {
|
|
1699
|
+
res: denum === 0 ? 0 : Metric.clamp(maxScore / denum),
|
|
1700
|
+
raw: { score: maxScore, denum }
|
|
1616
1701
|
};
|
|
1617
1702
|
} finally {
|
|
1618
|
-
Pool.release('int32', test, len);
|
|
1619
1703
|
Pool.release('int32', prev, len);
|
|
1620
1704
|
Pool.release('int32', curr, len);
|
|
1621
1705
|
}
|
|
1622
1706
|
}
|
|
1623
1707
|
}
|
|
1624
|
-
MetricRegistry.add('
|
|
1708
|
+
MetricRegistry.add('smithWaterman', SmithWatermanDistance);
|
|
1625
1709
|
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1710
|
+
const profiler$1 = Profiler.getInstance();
|
|
1711
|
+
class Phonetic {
|
|
1712
|
+
static cache = new HashTable();
|
|
1713
|
+
static default;
|
|
1714
|
+
algo;
|
|
1715
|
+
options;
|
|
1716
|
+
optKey;
|
|
1717
|
+
map;
|
|
1718
|
+
ignoreSet;
|
|
1719
|
+
static clear() {
|
|
1720
|
+
this.cache.clear();
|
|
1629
1721
|
}
|
|
1630
|
-
|
|
1631
|
-
const
|
|
1632
|
-
const
|
|
1633
|
-
|
|
1634
|
-
|
|
1722
|
+
constructor(algo, opt = {}) {
|
|
1723
|
+
const defaults = this.constructor.default ?? {};
|
|
1724
|
+
const mapId = opt.map ?? defaults.map;
|
|
1725
|
+
if (!mapId)
|
|
1726
|
+
throw new CmpStrNotFoundError(
|
|
1727
|
+
`No mapping specified for phonetic algorithm`,
|
|
1728
|
+
{ algo }
|
|
1729
|
+
);
|
|
1730
|
+
const map = PhoneticMappingRegistry.get(algo, mapId);
|
|
1731
|
+
if (map === undefined)
|
|
1732
|
+
throw new CmpStrNotFoundError(
|
|
1733
|
+
`Requested mapping <${mapId}> is not declared`,
|
|
1734
|
+
{ algo, mapId }
|
|
1735
|
+
);
|
|
1736
|
+
this.options = DeepMerge.merge(
|
|
1737
|
+
DeepMerge.merge(defaults, map.options ?? {}),
|
|
1738
|
+
opt
|
|
1739
|
+
);
|
|
1740
|
+
this.optKey = Hasher.fastFNV1a(
|
|
1741
|
+
JSON.stringify(this.options, Object.keys(this.options).sort())
|
|
1742
|
+
).toString();
|
|
1743
|
+
this.algo = algo;
|
|
1744
|
+
this.map = map;
|
|
1745
|
+
this.ignoreSet = new Set(map.ignore ?? []);
|
|
1635
1746
|
}
|
|
1636
|
-
|
|
1637
|
-
const
|
|
1638
|
-
|
|
1639
|
-
const
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
for (const bigram of setA) if (setB.has(bigram)) intersection++;
|
|
1644
|
-
const size = sizeA + sizeB;
|
|
1645
|
-
return {
|
|
1646
|
-
res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
|
|
1647
|
-
raw: { intersection, size }
|
|
1648
|
-
};
|
|
1649
|
-
} finally {
|
|
1650
|
-
Pool.release('set', setA, sizeA);
|
|
1651
|
-
Pool.release('set', setB, sizeB);
|
|
1747
|
+
applyPattern(word) {
|
|
1748
|
+
const { patterns = [] } = this.map;
|
|
1749
|
+
if (!patterns.length) return word;
|
|
1750
|
+
for (const { pattern, replace, all = false } of patterns) {
|
|
1751
|
+
word = all
|
|
1752
|
+
? word.replaceAll(pattern, replace)
|
|
1753
|
+
: word.replace(pattern, replace);
|
|
1652
1754
|
}
|
|
1755
|
+
return word;
|
|
1653
1756
|
}
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1757
|
+
applyRules(char, i, chars, charLen) {
|
|
1758
|
+
const { ruleset = [] } = this.map;
|
|
1759
|
+
if (!ruleset.length) return undefined;
|
|
1760
|
+
const prev = chars[i - 1] || '',
|
|
1761
|
+
prev2 = chars[i - 2] || '';
|
|
1762
|
+
const next = chars[i + 1] || '',
|
|
1763
|
+
next2 = chars[i + 2] || '';
|
|
1764
|
+
const str = chars.join('');
|
|
1765
|
+
for (const rule of ruleset) {
|
|
1766
|
+
if (rule.char && rule.char !== char) continue;
|
|
1767
|
+
if (rule.position === 'start' && i !== 0) continue;
|
|
1768
|
+
if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
|
|
1769
|
+
continue;
|
|
1770
|
+
if (rule.position === 'end' && i !== charLen - 1) continue;
|
|
1771
|
+
if (rule.prev && !rule.prev.includes(prev)) continue;
|
|
1772
|
+
if (rule.prevNot && rule.prevNot.includes(prev)) continue;
|
|
1773
|
+
if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
|
|
1774
|
+
if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
|
|
1775
|
+
if (rule.next && !rule.next.includes(next)) continue;
|
|
1776
|
+
if (rule.nextNot && rule.nextNot.includes(next)) continue;
|
|
1777
|
+
if (rule.next2 && !rule.next2.includes(next2)) continue;
|
|
1778
|
+
if (rule.next2Not && rule.next2Not.includes(next2)) continue;
|
|
1779
|
+
if (
|
|
1780
|
+
rule.leading &&
|
|
1781
|
+
!rule.leading.includes(str.slice(0, rule.leading.length))
|
|
1782
|
+
)
|
|
1783
|
+
continue;
|
|
1784
|
+
if (
|
|
1785
|
+
rule.trailing &&
|
|
1786
|
+
!rule.trailing.includes(str.slice(-rule.trailing.length))
|
|
1787
|
+
)
|
|
1788
|
+
continue;
|
|
1789
|
+
if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
|
|
1790
|
+
continue;
|
|
1791
|
+
return rule.code;
|
|
1792
|
+
}
|
|
1793
|
+
return undefined;
|
|
1660
1794
|
}
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1795
|
+
encode(word) {
|
|
1796
|
+
const { map = {} } = this.map;
|
|
1797
|
+
word = this.applyPattern(word);
|
|
1798
|
+
const chars = this.word2Chars(word);
|
|
1799
|
+
const charLen = chars.length;
|
|
1800
|
+
let code = '',
|
|
1801
|
+
lastCode = null;
|
|
1802
|
+
for (let i = 0; i < charLen; i++) {
|
|
1803
|
+
const char = chars[i];
|
|
1804
|
+
if (this.ignoreSet.has(char)) continue;
|
|
1805
|
+
const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
|
|
1806
|
+
if (mapped === undefined) continue;
|
|
1807
|
+
((code += mapped), (lastCode = mapped));
|
|
1808
|
+
if (this.exitEarly(code, i)) break;
|
|
1673
1809
|
}
|
|
1674
|
-
|
|
1675
|
-
for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
|
|
1676
|
-
return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
|
|
1810
|
+
return this.adjustCode(code, chars);
|
|
1677
1811
|
}
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
constructor(a, b, opt = {}) {
|
|
1683
|
-
super('jaccard', a, b, opt, true);
|
|
1812
|
+
mapChar(char, i, chars, charLen, lastCode, map) {
|
|
1813
|
+
const { dedupe = true, fallback = undefined } = this.options;
|
|
1814
|
+
const c = this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
|
|
1815
|
+
return dedupe && c === lastCode ? undefined : c;
|
|
1684
1816
|
}
|
|
1685
|
-
|
|
1686
|
-
const
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
let intersection = 0;
|
|
1691
|
-
for (const c of setA) if (setB.has(c)) intersection++;
|
|
1692
|
-
const union = setA.size + setB.size - intersection;
|
|
1693
|
-
return {
|
|
1694
|
-
res: union === 0 ? 1 : Metric.clamp(intersection / union),
|
|
1695
|
-
raw: { intersection, union }
|
|
1696
|
-
};
|
|
1697
|
-
} finally {
|
|
1698
|
-
Pool.release('set', setA, m);
|
|
1699
|
-
Pool.release('set', setB, n);
|
|
1700
|
-
}
|
|
1817
|
+
equalLen(input) {
|
|
1818
|
+
const { length = -1, pad = '0' } = this.options;
|
|
1819
|
+
return length === -1
|
|
1820
|
+
? input
|
|
1821
|
+
: (input + pad.repeat(length)).slice(0, length);
|
|
1701
1822
|
}
|
|
1702
|
-
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
class JaroWinklerDistance extends Metric {
|
|
1706
|
-
constructor(a, b, opt = {}) {
|
|
1707
|
-
super('jaroWinkler', a, b, opt, true);
|
|
1823
|
+
word2Chars(word) {
|
|
1824
|
+
return Array.from(word.toLowerCase());
|
|
1708
1825
|
}
|
|
1709
|
-
|
|
1710
|
-
const
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
res = 0;
|
|
1732
|
-
if (matches > 0) {
|
|
1733
|
-
let k = 0;
|
|
1734
|
-
for (let i = 0; i < m; i++) {
|
|
1735
|
-
if (matchA[i]) {
|
|
1736
|
-
while (!matchB[k]) k++;
|
|
1737
|
-
if (a[i] !== b[k]) transpos++;
|
|
1738
|
-
k++;
|
|
1739
|
-
}
|
|
1826
|
+
exitEarly(code, i) {
|
|
1827
|
+
const { length = -1 } = this.options;
|
|
1828
|
+
return length > 0 && code.length >= length;
|
|
1829
|
+
}
|
|
1830
|
+
adjustCode(code, chars) {
|
|
1831
|
+
return code;
|
|
1832
|
+
}
|
|
1833
|
+
loop(words) {
|
|
1834
|
+
return ErrorUtil.wrap(
|
|
1835
|
+
() => {
|
|
1836
|
+
const index = [];
|
|
1837
|
+
for (const word of words) {
|
|
1838
|
+
let key = Phonetic.cache.key(this.algo, [word]);
|
|
1839
|
+
if (key) key += this.optKey;
|
|
1840
|
+
const code =
|
|
1841
|
+
Phonetic.cache.get(key || '') ??
|
|
1842
|
+
(() => {
|
|
1843
|
+
const res = this.encode(word);
|
|
1844
|
+
if (key) Phonetic.cache.set(key, res);
|
|
1845
|
+
return res;
|
|
1846
|
+
})();
|
|
1847
|
+
if (code && code.length) index.push(this.equalLen(code));
|
|
1740
1848
|
}
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1849
|
+
return index;
|
|
1850
|
+
},
|
|
1851
|
+
`Failed to generate phonetic index`,
|
|
1852
|
+
{ algo: this.algo, words }
|
|
1853
|
+
);
|
|
1854
|
+
}
|
|
1855
|
+
async loopAsync(words) {
|
|
1856
|
+
return ErrorUtil.wrapAsync(
|
|
1857
|
+
async () => {
|
|
1858
|
+
const index = [];
|
|
1859
|
+
for (const word of words) {
|
|
1860
|
+
const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
|
|
1861
|
+
const code = await Promise.resolve(
|
|
1862
|
+
Phonetic.cache.get(key || '') ??
|
|
1863
|
+
(() => {
|
|
1864
|
+
const res = this.encode(word);
|
|
1865
|
+
if (key) Phonetic.cache.set(key, res);
|
|
1866
|
+
return res;
|
|
1867
|
+
})()
|
|
1868
|
+
);
|
|
1869
|
+
if (code && code.length) index.push(this.equalLen(code));
|
|
1746
1870
|
}
|
|
1747
|
-
|
|
1748
|
-
}
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
};
|
|
1753
|
-
} finally {
|
|
1754
|
-
Pool.release('int32', matchA, m);
|
|
1755
|
-
Pool.release('int32', matchB, n);
|
|
1756
|
-
}
|
|
1871
|
+
return index;
|
|
1872
|
+
},
|
|
1873
|
+
`Failed to generate phonetic index asynchronously`,
|
|
1874
|
+
{ algo: this.algo, words }
|
|
1875
|
+
);
|
|
1757
1876
|
}
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
class LCSMetric extends Metric {
|
|
1762
|
-
constructor(a, b, opt = {}) {
|
|
1763
|
-
super('lcs', a, b, opt, true);
|
|
1877
|
+
getAlgoName() {
|
|
1878
|
+
return this.algo;
|
|
1764
1879
|
}
|
|
1765
|
-
|
|
1766
|
-
const
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
|
|
1778
|
-
}
|
|
1779
|
-
const lcs = prev[m];
|
|
1780
|
-
return {
|
|
1781
|
-
res: maxLen === 0 ? 1 : Metric.clamp(lcs / maxLen),
|
|
1782
|
-
raw: { lcs, maxLen }
|
|
1783
|
-
};
|
|
1784
|
-
} finally {
|
|
1785
|
-
Pool.release('int32', prev, len);
|
|
1786
|
-
Pool.release('int32', curr, len);
|
|
1787
|
-
}
|
|
1880
|
+
getIndex(input) {
|
|
1881
|
+
const { delimiter = ' ' } = this.options;
|
|
1882
|
+
return profiler$1.run(() =>
|
|
1883
|
+
this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
|
|
1884
|
+
);
|
|
1885
|
+
}
|
|
1886
|
+
async getIndexAsync(input) {
|
|
1887
|
+
const { delimiter = ' ' } = this.options;
|
|
1888
|
+
return (
|
|
1889
|
+
await profiler$1.runAsync(
|
|
1890
|
+
async () => await this.loopAsync(input.split(delimiter).filter(Boolean))
|
|
1891
|
+
)
|
|
1892
|
+
).filter(Boolean);
|
|
1788
1893
|
}
|
|
1789
1894
|
}
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
return
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
} finally {
|
|
1816
|
-
Pool.release('int32', prev, len);
|
|
1817
|
-
Pool.release('int32', curr, len);
|
|
1895
|
+
const PhoneticRegistry = Registry('phonetic', Phonetic);
|
|
1896
|
+
const PhoneticMappingRegistry = (() => {
|
|
1897
|
+
const mappings = Object.create(null);
|
|
1898
|
+
const maps = (algo) => (mappings[algo] ||= Object.create(null));
|
|
1899
|
+
return Object.freeze({
|
|
1900
|
+
add(algo, id, map, update = false) {
|
|
1901
|
+
const mappings = maps(algo);
|
|
1902
|
+
ErrorUtil.assert(
|
|
1903
|
+
!(!id || id in mappings) || update,
|
|
1904
|
+
`Entry <${id}> already exists / use <update=true> to overwrite`,
|
|
1905
|
+
{ algo, id }
|
|
1906
|
+
);
|
|
1907
|
+
mappings[id] = map;
|
|
1908
|
+
},
|
|
1909
|
+
remove(algo, id) {
|
|
1910
|
+
delete maps(algo)[id];
|
|
1911
|
+
},
|
|
1912
|
+
has(algo, id) {
|
|
1913
|
+
return id in maps(algo);
|
|
1914
|
+
},
|
|
1915
|
+
get(algo, id) {
|
|
1916
|
+
return maps(algo)[id];
|
|
1917
|
+
},
|
|
1918
|
+
list(algo) {
|
|
1919
|
+
return Object.keys(maps(algo));
|
|
1818
1920
|
}
|
|
1819
|
-
}
|
|
1820
|
-
}
|
|
1821
|
-
MetricRegistry.add('levenshtein', LevenshteinDistance);
|
|
1921
|
+
});
|
|
1922
|
+
})();
|
|
1822
1923
|
|
|
1823
|
-
class
|
|
1824
|
-
|
|
1825
|
-
|
|
1924
|
+
class Caverphone extends Phonetic {
|
|
1925
|
+
static REGEX = { uppercase: /[^A-Z]/gi };
|
|
1926
|
+
static default = {
|
|
1927
|
+
map: 'en2',
|
|
1928
|
+
delimiter: ' ',
|
|
1929
|
+
length: -1,
|
|
1930
|
+
pad: '',
|
|
1931
|
+
dedupe: false
|
|
1932
|
+
};
|
|
1933
|
+
constructor(opt = {}) {
|
|
1934
|
+
super('caverphone', opt);
|
|
1826
1935
|
}
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
const [prev, curr] = Pool.acquireMany('int32', [len, len]);
|
|
1831
|
-
try {
|
|
1832
|
-
prev[0] = 0;
|
|
1833
|
-
for (let i = 1; i <= m; i++) prev[i] = prev[i - 1] + gap;
|
|
1834
|
-
for (let j = 1; j <= n; j++) {
|
|
1835
|
-
curr[0] = prev[0] + gap;
|
|
1836
|
-
const cb = b.charCodeAt(j - 1);
|
|
1837
|
-
for (let i = 1; i <= m; i++) {
|
|
1838
|
-
const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
|
|
1839
|
-
curr[i] = Math.max(
|
|
1840
|
-
prev[i - 1] + score,
|
|
1841
|
-
prev[i] + gap,
|
|
1842
|
-
curr[i - 1] + gap
|
|
1843
|
-
);
|
|
1844
|
-
}
|
|
1845
|
-
prev.set(curr);
|
|
1846
|
-
}
|
|
1847
|
-
const score = prev[m];
|
|
1848
|
-
const denum = maxLen * match;
|
|
1849
|
-
return {
|
|
1850
|
-
res: denum === 0 ? 0 : Metric.clamp(score / denum),
|
|
1851
|
-
raw: { score, denum }
|
|
1852
|
-
};
|
|
1853
|
-
} finally {
|
|
1854
|
-
Pool.release('int32', prev, len);
|
|
1855
|
-
Pool.release('int32', curr, len);
|
|
1856
|
-
}
|
|
1936
|
+
encode(word) {
|
|
1937
|
+
word = word.replace(Caverphone.REGEX.uppercase, '').toLowerCase();
|
|
1938
|
+
return super.encode(word);
|
|
1857
1939
|
}
|
|
1940
|
+
mapChar = (char) => char;
|
|
1941
|
+
adjustCode = (code) => code.toUpperCase();
|
|
1858
1942
|
}
|
|
1859
|
-
|
|
1943
|
+
PhoneticRegistry.add('caverphone', Caverphone);
|
|
1944
|
+
PhoneticMappingRegistry.add('caverphone', 'en1', {
|
|
1945
|
+
options: { length: 6, pad: '1' },
|
|
1946
|
+
map: {},
|
|
1947
|
+
patterns: [
|
|
1948
|
+
{ pattern: /^(c|r|t|en)ough/, replace: '$1ou2f' },
|
|
1949
|
+
{ pattern: /^gn/, replace: '2n' },
|
|
1950
|
+
{ pattern: /mb$/, replace: 'm2' },
|
|
1951
|
+
{ pattern: /cq/g, replace: '2q' },
|
|
1952
|
+
{ pattern: /c(e|i|y)/g, replace: 's$1' },
|
|
1953
|
+
{ pattern: /tch/g, replace: '2ch' },
|
|
1954
|
+
{ pattern: /[cqx]/g, replace: 'k' },
|
|
1955
|
+
{ pattern: /v/g, replace: 'f' },
|
|
1956
|
+
{ pattern: /dg/g, replace: '2g' },
|
|
1957
|
+
{ pattern: /ti(a|o)/g, replace: 'si$1' },
|
|
1958
|
+
{ pattern: /d/g, replace: 't' },
|
|
1959
|
+
{ pattern: /ph/g, replace: 'fh' },
|
|
1960
|
+
{ pattern: /b/g, replace: 'p' },
|
|
1961
|
+
{ pattern: /sh/g, replace: 's2' },
|
|
1962
|
+
{ pattern: /z/g, replace: 's' },
|
|
1963
|
+
{ pattern: /^[aeiou]/, replace: 'A' },
|
|
1964
|
+
{ pattern: /[aeiou]/g, replace: '3' },
|
|
1965
|
+
{ pattern: /3gh3/g, replace: '3kh3' },
|
|
1966
|
+
{ pattern: /gh/g, replace: '22' },
|
|
1967
|
+
{ pattern: /g/g, replace: 'k' },
|
|
1968
|
+
{ pattern: /s+/g, replace: 'S' },
|
|
1969
|
+
{ pattern: /t+/g, replace: 'T' },
|
|
1970
|
+
{ pattern: /p+/g, replace: 'P' },
|
|
1971
|
+
{ pattern: /k+/g, replace: 'K' },
|
|
1972
|
+
{ pattern: /f+/g, replace: 'F' },
|
|
1973
|
+
{ pattern: /m+/g, replace: 'M' },
|
|
1974
|
+
{ pattern: /n+/g, replace: 'N' },
|
|
1975
|
+
{ pattern: /j/g, replace: 'y' },
|
|
1976
|
+
{ pattern: /l3/g, replace: 'L3' },
|
|
1977
|
+
{ pattern: /r3/g, replace: 'R3' },
|
|
1978
|
+
{ pattern: /w3/g, replace: 'W3' },
|
|
1979
|
+
{ pattern: /y3/g, replace: 'Y3' },
|
|
1980
|
+
{ pattern: /ly/g, replace: 'Ly' },
|
|
1981
|
+
{ pattern: /ry/g, replace: 'Ry' },
|
|
1982
|
+
{ pattern: /wy/g, replace: 'Wy' },
|
|
1983
|
+
{ pattern: /wh3/g, replace: 'Wh3' },
|
|
1984
|
+
{ pattern: /why/g, replace: 'Why' },
|
|
1985
|
+
{ pattern: /^h/, replace: 'A' },
|
|
1986
|
+
{ pattern: /[hlrwy23]/g, replace: '' }
|
|
1987
|
+
]
|
|
1988
|
+
});
|
|
1989
|
+
PhoneticMappingRegistry.add('caverphone', 'en2', {
|
|
1990
|
+
options: { length: 10, pad: '1' },
|
|
1991
|
+
map: {},
|
|
1992
|
+
patterns: [
|
|
1993
|
+
{ pattern: /e$/, replace: '' },
|
|
1994
|
+
{ pattern: /^(c|r|t|en|tr)ough/, replace: '$1ou2f' },
|
|
1995
|
+
{ pattern: /^gn/, replace: '2n' },
|
|
1996
|
+
{ pattern: /mb$/, replace: 'm2' },
|
|
1997
|
+
{ pattern: /cq/g, replace: '2q' },
|
|
1998
|
+
{ pattern: /c(e|i|y)/g, replace: 's$1' },
|
|
1999
|
+
{ pattern: /tch/g, replace: '2ch' },
|
|
2000
|
+
{ pattern: /[cqx]/g, replace: 'k' },
|
|
2001
|
+
{ pattern: /v/g, replace: 'f' },
|
|
2002
|
+
{ pattern: /dg/g, replace: '2g' },
|
|
2003
|
+
{ pattern: /ti(a|o)/g, replace: 'si$1' },
|
|
2004
|
+
{ pattern: /d/g, replace: 't' },
|
|
2005
|
+
{ pattern: /ph/g, replace: 'fh' },
|
|
2006
|
+
{ pattern: /b/g, replace: 'p' },
|
|
2007
|
+
{ pattern: /sh/g, replace: 's2' },
|
|
2008
|
+
{ pattern: /z/g, replace: 's' },
|
|
2009
|
+
{ pattern: /^[aeiou]/, replace: 'A' },
|
|
2010
|
+
{ pattern: /[aeiou]/g, replace: '3' },
|
|
2011
|
+
{ pattern: /j/g, replace: 'y' },
|
|
2012
|
+
{ pattern: /^y3/, replace: 'Y3' },
|
|
2013
|
+
{ pattern: /^y/, replace: 'A' },
|
|
2014
|
+
{ pattern: /y/g, replace: '3' },
|
|
2015
|
+
{ pattern: /3gh3/g, replace: '3kh3' },
|
|
2016
|
+
{ pattern: /gh/g, replace: '22' },
|
|
2017
|
+
{ pattern: /g/g, replace: 'k' },
|
|
2018
|
+
{ pattern: /s+/g, replace: 'S' },
|
|
2019
|
+
{ pattern: /t+/g, replace: 'T' },
|
|
2020
|
+
{ pattern: /p+/g, replace: 'P' },
|
|
2021
|
+
{ pattern: /k+/g, replace: 'K' },
|
|
2022
|
+
{ pattern: /f+/g, replace: 'F' },
|
|
2023
|
+
{ pattern: /m+/g, replace: 'M' },
|
|
2024
|
+
{ pattern: /n+/g, replace: 'N' },
|
|
2025
|
+
{ pattern: /l3/g, replace: 'L3' },
|
|
2026
|
+
{ pattern: /r3/g, replace: 'R3' },
|
|
2027
|
+
{ pattern: /w3/g, replace: 'W3' },
|
|
2028
|
+
{ pattern: /wh3/g, replace: 'Wh3' },
|
|
2029
|
+
{ pattern: /[lrw]$/, replace: '3' },
|
|
2030
|
+
{ pattern: /^h/, replace: 'A' },
|
|
2031
|
+
{ pattern: /3$/, replace: 'A' },
|
|
2032
|
+
{ pattern: /[hlrw23]/g, replace: '' }
|
|
2033
|
+
]
|
|
2034
|
+
});
|
|
1860
2035
|
|
|
1861
|
-
class
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
_qGrams(str, q) {
|
|
1866
|
-
const len = Math.max(0, str.length - q + 1);
|
|
1867
|
-
const grams = Pool.acquire('set', len);
|
|
1868
|
-
for (let i = 0; i < len; i++) grams.add(str.slice(i, i + q));
|
|
1869
|
-
return grams;
|
|
2036
|
+
class Cologne extends Phonetic {
|
|
2037
|
+
static default = { map: 'default', delimiter: ' ', length: -1, dedupe: true };
|
|
2038
|
+
constructor(opt = {}) {
|
|
2039
|
+
super('cologne', opt);
|
|
1870
2040
|
}
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
const setA = this._qGrams(a, q),
|
|
1874
|
-
setB = this._qGrams(b, q);
|
|
1875
|
-
const sizeA = setA.size,
|
|
1876
|
-
sizeB = setB.size;
|
|
1877
|
-
try {
|
|
1878
|
-
let intersection = 0;
|
|
1879
|
-
for (const gram of setA) if (setB.has(gram)) intersection++;
|
|
1880
|
-
const size = Math.max(sizeA, sizeB);
|
|
1881
|
-
return {
|
|
1882
|
-
res: size === 0 ? 1 : Metric.clamp(intersection / size),
|
|
1883
|
-
raw: { intersection, size }
|
|
1884
|
-
};
|
|
1885
|
-
} finally {
|
|
1886
|
-
Pool.release('set', setA, sizeA);
|
|
1887
|
-
Pool.release('set', setB, sizeB);
|
|
1888
|
-
}
|
|
2041
|
+
adjustCode(code) {
|
|
2042
|
+
return code.slice(0, 1) + code.slice(1).replaceAll('0', '');
|
|
1889
2043
|
}
|
|
1890
2044
|
}
|
|
1891
|
-
|
|
1892
|
-
|
|
1893
|
-
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
1928
|
-
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
const profiler$1 = Profiler.getInstance();
|
|
1933
|
-
class Phonetic {
|
|
1934
|
-
static cache = new HashTable();
|
|
1935
|
-
static default;
|
|
1936
|
-
algo;
|
|
1937
|
-
options;
|
|
1938
|
-
optKey;
|
|
1939
|
-
map;
|
|
1940
|
-
static clear = () => this.cache.clear();
|
|
1941
|
-
constructor(algo, opt = {}) {
|
|
1942
|
-
const defaults = this.constructor.default ?? {};
|
|
1943
|
-
const mapId = opt.map ?? defaults.map;
|
|
1944
|
-
if (!mapId)
|
|
1945
|
-
throw new CmpStrNotFoundError(
|
|
1946
|
-
`No mapping specified for phonetic algorithm`,
|
|
1947
|
-
{ algo }
|
|
1948
|
-
);
|
|
1949
|
-
const map = PhoneticMappingRegistry.get(algo, mapId);
|
|
1950
|
-
if (map === undefined)
|
|
1951
|
-
throw new CmpStrNotFoundError(
|
|
1952
|
-
`Requested mapping <${mapId}> is not declared`,
|
|
1953
|
-
{ algo, mapId }
|
|
1954
|
-
);
|
|
1955
|
-
this.options = merge(merge(defaults, map.options ?? {}), opt);
|
|
1956
|
-
this.optKey = Hasher.fastFNV1a(
|
|
1957
|
-
JSON.stringify(this.options, Object.keys(this.options).sort())
|
|
1958
|
-
).toString();
|
|
1959
|
-
this.algo = algo;
|
|
1960
|
-
this.map = map;
|
|
1961
|
-
}
|
|
1962
|
-
applyPattern(word) {
|
|
1963
|
-
const { patterns = [] } = this.map;
|
|
1964
|
-
if (!patterns || !patterns.length) return word;
|
|
1965
|
-
for (const { pattern, replace, all = false } of patterns) {
|
|
1966
|
-
word = word[all ? 'replaceAll' : 'replace'](pattern, replace);
|
|
1967
|
-
}
|
|
1968
|
-
return word;
|
|
1969
|
-
}
|
|
1970
|
-
applyRules(char, i, chars, charLen) {
|
|
1971
|
-
const { ruleset = [] } = this.map;
|
|
1972
|
-
if (!ruleset || !ruleset.length) return undefined;
|
|
1973
|
-
const prev = chars[i - 1] || '',
|
|
1974
|
-
prev2 = chars[i - 2] || '';
|
|
1975
|
-
const next = chars[i + 1] || '',
|
|
1976
|
-
next2 = chars[i + 2] || '';
|
|
1977
|
-
for (const rule of ruleset) {
|
|
1978
|
-
if (rule.char && rule.char !== char) continue;
|
|
1979
|
-
if (rule.position === 'start' && i !== 0) continue;
|
|
1980
|
-
if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
|
|
1981
|
-
continue;
|
|
1982
|
-
if (rule.position === 'end' && i !== charLen) continue;
|
|
1983
|
-
if (rule.prev && !rule.prev.includes(prev)) continue;
|
|
1984
|
-
if (rule.prevNot && rule.prevNot.includes(prev)) continue;
|
|
1985
|
-
if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
|
|
1986
|
-
if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
|
|
1987
|
-
if (rule.next && !rule.next.includes(next)) continue;
|
|
1988
|
-
if (rule.nextNot && rule.nextNot.includes(next)) continue;
|
|
1989
|
-
if (rule.next2 && !rule.next2.includes(next2)) continue;
|
|
1990
|
-
if (rule.next2Not && rule.next2Not.includes(next2)) continue;
|
|
1991
|
-
if (
|
|
1992
|
-
rule.leading &&
|
|
1993
|
-
!rule.leading.includes(chars.slice(0, rule.leading.length).join(''))
|
|
1994
|
-
)
|
|
1995
|
-
continue;
|
|
1996
|
-
if (
|
|
1997
|
-
rule.trailing &&
|
|
1998
|
-
!rule.trailing.includes(chars.slice(-rule.trailing.length).join(''))
|
|
1999
|
-
)
|
|
2000
|
-
continue;
|
|
2001
|
-
if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
|
|
2002
|
-
continue;
|
|
2003
|
-
return rule.code;
|
|
2004
|
-
}
|
|
2005
|
-
return undefined;
|
|
2006
|
-
}
|
|
2007
|
-
encode(word) {
|
|
2008
|
-
const { map = {}, ignore = [] } = this.map;
|
|
2009
|
-
word = this.applyPattern(word);
|
|
2010
|
-
const chars = this.word2Chars(word);
|
|
2011
|
-
const charLen = chars.length;
|
|
2012
|
-
let code = '',
|
|
2013
|
-
lastCode = null;
|
|
2014
|
-
for (let i = 0; i < charLen; i++) {
|
|
2015
|
-
const char = chars[i];
|
|
2016
|
-
if (ignore.includes(char)) continue;
|
|
2017
|
-
const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
|
|
2018
|
-
if (mapped === undefined) continue;
|
|
2019
|
-
((code += mapped), (lastCode = mapped));
|
|
2020
|
-
if (this.exitEarly(code, i)) break;
|
|
2021
|
-
}
|
|
2022
|
-
return this.adjustCode(code, chars);
|
|
2023
|
-
}
|
|
2024
|
-
mapChar(char, i, chars, charLen, lastCode, map) {
|
|
2025
|
-
const { dedupe = true, fallback = undefined } = this.options;
|
|
2026
|
-
const c = this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
|
|
2027
|
-
return dedupe && c === lastCode ? undefined : c;
|
|
2028
|
-
}
|
|
2029
|
-
equalLen(input) {
|
|
2030
|
-
const { length = -1, pad = '0' } = this.options;
|
|
2031
|
-
return length === -1
|
|
2032
|
-
? input
|
|
2033
|
-
: (input + pad.repeat(length)).slice(0, length);
|
|
2034
|
-
}
|
|
2035
|
-
word2Chars = (word) => word.toLowerCase().split('');
|
|
2036
|
-
exitEarly(code, i) {
|
|
2037
|
-
const { length = -1 } = this.options;
|
|
2038
|
-
return length > 0 && code.length >= length;
|
|
2039
|
-
}
|
|
2040
|
-
adjustCode(code, chars) {
|
|
2041
|
-
return code;
|
|
2042
|
-
}
|
|
2043
|
-
loop(words) {
|
|
2044
|
-
return ErrorUtil.wrap(
|
|
2045
|
-
() => {
|
|
2046
|
-
const index = [];
|
|
2047
|
-
for (const word of words) {
|
|
2048
|
-
const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
|
|
2049
|
-
const code =
|
|
2050
|
-
Phonetic.cache.get(key || '') ??
|
|
2051
|
-
(() => {
|
|
2052
|
-
const res = this.encode(word);
|
|
2053
|
-
if (key) Phonetic.cache.set(key, res);
|
|
2054
|
-
return res;
|
|
2055
|
-
})();
|
|
2056
|
-
if (code && code.length) index.push(this.equalLen(code));
|
|
2057
|
-
}
|
|
2058
|
-
return index;
|
|
2059
|
-
},
|
|
2060
|
-
`Failed to generate phonetic index`,
|
|
2061
|
-
{ algo: this.algo, words }
|
|
2062
|
-
);
|
|
2063
|
-
}
|
|
2064
|
-
async loopAsync(words) {
|
|
2065
|
-
return ErrorUtil.wrapAsync(
|
|
2066
|
-
async () => {
|
|
2067
|
-
const index = [];
|
|
2068
|
-
for (const word of words) {
|
|
2069
|
-
const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
|
|
2070
|
-
const code = await Promise.resolve(
|
|
2071
|
-
Phonetic.cache.get(key || '') ??
|
|
2072
|
-
(() => {
|
|
2073
|
-
const res = this.encode(word);
|
|
2074
|
-
if (key) Phonetic.cache.set(key, res);
|
|
2075
|
-
return res;
|
|
2076
|
-
})()
|
|
2077
|
-
);
|
|
2078
|
-
if (code && code.length) index.push(this.equalLen(code));
|
|
2079
|
-
}
|
|
2080
|
-
return index;
|
|
2081
|
-
},
|
|
2082
|
-
`Failed to generate phonetic index asynchronously`,
|
|
2083
|
-
{ algo: this.algo, words }
|
|
2084
|
-
);
|
|
2085
|
-
}
|
|
2086
|
-
getAlgoName = () => this.algo;
|
|
2087
|
-
getIndex(input) {
|
|
2088
|
-
const { delimiter = ' ' } = this.options;
|
|
2089
|
-
return profiler$1.run(() =>
|
|
2090
|
-
this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
|
|
2091
|
-
);
|
|
2092
|
-
}
|
|
2093
|
-
async getIndexAsync(input) {
|
|
2094
|
-
const { delimiter = ' ' } = this.options;
|
|
2095
|
-
return (
|
|
2096
|
-
await profiler$1.runAsync(
|
|
2097
|
-
async () => await this.loopAsync(input.split(delimiter).filter(Boolean))
|
|
2098
|
-
)
|
|
2099
|
-
).filter(Boolean);
|
|
2100
|
-
}
|
|
2101
|
-
}
|
|
2102
|
-
const PhoneticRegistry = Registry('phonetic', Phonetic);
|
|
2103
|
-
const PhoneticMappingRegistry = (() => {
|
|
2104
|
-
const mappings = Object.create(null);
|
|
2105
|
-
const maps = (algo) => (mappings[algo] ||= Object.create(null));
|
|
2106
|
-
return Object.freeze({
|
|
2107
|
-
add(algo, id, map, update = false) {
|
|
2108
|
-
const mappings = maps(algo);
|
|
2109
|
-
ErrorUtil.assert(
|
|
2110
|
-
!(!id || id in mappings) || update,
|
|
2111
|
-
`Entry <${id}> already exists / use <update=true> to overwrite`,
|
|
2112
|
-
{ algo, id }
|
|
2113
|
-
);
|
|
2114
|
-
mappings[id] = map;
|
|
2115
|
-
},
|
|
2116
|
-
remove(algo, id) {
|
|
2117
|
-
delete maps(algo)[id];
|
|
2118
|
-
},
|
|
2119
|
-
has(algo, id) {
|
|
2120
|
-
return id in maps(algo);
|
|
2045
|
+
PhoneticRegistry.add('cologne', Cologne);
|
|
2046
|
+
PhoneticMappingRegistry.add('cologne', 'default', {
|
|
2047
|
+
map: {
|
|
2048
|
+
a: '0',
|
|
2049
|
+
ä: '0',
|
|
2050
|
+
e: '0',
|
|
2051
|
+
i: '0',
|
|
2052
|
+
j: '0',
|
|
2053
|
+
o: '0',
|
|
2054
|
+
ö: '0',
|
|
2055
|
+
u: '0',
|
|
2056
|
+
ü: '0',
|
|
2057
|
+
y: '0',
|
|
2058
|
+
b: '1',
|
|
2059
|
+
p: '1',
|
|
2060
|
+
d: '2',
|
|
2061
|
+
t: '2',
|
|
2062
|
+
f: '3',
|
|
2063
|
+
v: '3',
|
|
2064
|
+
w: '3',
|
|
2065
|
+
g: '4',
|
|
2066
|
+
k: '4',
|
|
2067
|
+
q: '4',
|
|
2068
|
+
l: '5',
|
|
2069
|
+
m: '6',
|
|
2070
|
+
n: '6',
|
|
2071
|
+
r: '7',
|
|
2072
|
+
c: '8',
|
|
2073
|
+
s: '8',
|
|
2074
|
+
ß: '8',
|
|
2075
|
+
z: '8',
|
|
2076
|
+
x: '48'
|
|
2077
|
+
},
|
|
2078
|
+
ignore: ['h'],
|
|
2079
|
+
ruleset: [
|
|
2080
|
+
{ char: 'p', next: ['h'], code: '3' },
|
|
2081
|
+
{
|
|
2082
|
+
char: 'c',
|
|
2083
|
+
position: 'start',
|
|
2084
|
+
next: ['a', 'h', 'k', 'l', 'o', 'q', 'r', 'u', 'x'],
|
|
2085
|
+
code: '4'
|
|
2121
2086
|
},
|
|
2122
|
-
|
|
2123
|
-
|
|
2087
|
+
{
|
|
2088
|
+
char: 'c',
|
|
2089
|
+
next: ['a', 'h', 'k', 'o', 'q', 'u', 'x'],
|
|
2090
|
+
prevNot: ['s', 'z'],
|
|
2091
|
+
code: '4'
|
|
2124
2092
|
},
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
}
|
|
2128
|
-
|
|
2129
|
-
})
|
|
2093
|
+
{ char: 'd', next: ['c', 's', 'z'], code: '8' },
|
|
2094
|
+
{ char: 't', next: ['c', 's', 'z'], code: '8' },
|
|
2095
|
+
{ char: 'x', prev: ['c', 'k', 'q'], code: '8' }
|
|
2096
|
+
]
|
|
2097
|
+
});
|
|
2130
2098
|
|
|
2131
|
-
class
|
|
2132
|
-
static REGEX = {
|
|
2099
|
+
class Metaphone extends Phonetic {
|
|
2100
|
+
static REGEX = { adjacent: /([A-BD-Z])\1+/gi, vowel: /[AEIOU]/g };
|
|
2133
2101
|
static default = {
|
|
2134
|
-
map: '
|
|
2102
|
+
map: 'en90',
|
|
2135
2103
|
delimiter: ' ',
|
|
2136
2104
|
length: -1,
|
|
2137
2105
|
pad: '',
|
|
2138
2106
|
dedupe: false
|
|
2139
2107
|
};
|
|
2140
2108
|
constructor(opt = {}) {
|
|
2141
|
-
super('
|
|
2109
|
+
super('metaphone', opt);
|
|
2142
2110
|
}
|
|
2143
2111
|
encode(word) {
|
|
2144
|
-
word = word.replace(
|
|
2112
|
+
word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
|
|
2113
|
+
c === 'C' ? m : c
|
|
2114
|
+
);
|
|
2145
2115
|
return super.encode(word);
|
|
2146
2116
|
}
|
|
2147
|
-
|
|
2148
|
-
|
|
2117
|
+
adjustCode(code) {
|
|
2118
|
+
return code.slice(0, 1) + code.slice(1).replace(Metaphone.REGEX.vowel, '');
|
|
2119
|
+
}
|
|
2149
2120
|
}
|
|
2150
|
-
PhoneticRegistry.add('
|
|
2151
|
-
PhoneticMappingRegistry.add('
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2166
|
-
|
|
2167
|
-
|
|
2168
|
-
|
|
2169
|
-
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
{
|
|
2182
|
-
{
|
|
2183
|
-
{
|
|
2184
|
-
{
|
|
2185
|
-
{
|
|
2186
|
-
{
|
|
2187
|
-
{
|
|
2188
|
-
{
|
|
2189
|
-
{
|
|
2190
|
-
{
|
|
2191
|
-
{
|
|
2192
|
-
|
|
2193
|
-
|
|
2194
|
-
|
|
2195
|
-
|
|
2196
|
-
|
|
2197
|
-
|
|
2198
|
-
|
|
2199
|
-
|
|
2200
|
-
{
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
{
|
|
2207
|
-
{
|
|
2208
|
-
{
|
|
2209
|
-
{
|
|
2210
|
-
{
|
|
2211
|
-
{
|
|
2212
|
-
{
|
|
2213
|
-
{
|
|
2214
|
-
{
|
|
2215
|
-
{
|
|
2216
|
-
{
|
|
2217
|
-
{
|
|
2218
|
-
{ pattern: /j/g, replace: 'y' },
|
|
2219
|
-
{ pattern: /^y3/, replace: 'Y3' },
|
|
2220
|
-
{ pattern: /^y/, replace: 'A' },
|
|
2221
|
-
{ pattern: /y/g, replace: '3' },
|
|
2222
|
-
{ pattern: /3gh3/g, replace: '3kh3' },
|
|
2223
|
-
{ pattern: /gh/g, replace: '22' },
|
|
2224
|
-
{ pattern: /g/g, replace: 'k' },
|
|
2225
|
-
{ pattern: /s+/g, replace: 'S' },
|
|
2226
|
-
{ pattern: /t+/g, replace: 'T' },
|
|
2227
|
-
{ pattern: /p+/g, replace: 'P' },
|
|
2228
|
-
{ pattern: /k+/g, replace: 'K' },
|
|
2229
|
-
{ pattern: /f+/g, replace: 'F' },
|
|
2230
|
-
{ pattern: /m+/g, replace: 'M' },
|
|
2231
|
-
{ pattern: /n+/g, replace: 'N' },
|
|
2232
|
-
{ pattern: /l3/g, replace: 'L3' },
|
|
2233
|
-
{ pattern: /r3/g, replace: 'R3' },
|
|
2234
|
-
{ pattern: /w3/g, replace: 'W3' },
|
|
2235
|
-
{ pattern: /wh3/g, replace: 'Wh3' },
|
|
2236
|
-
{ pattern: /[lrw]$/, replace: '3' },
|
|
2237
|
-
{ pattern: /^h/, replace: 'A' },
|
|
2238
|
-
{ pattern: /3$/, replace: 'A' },
|
|
2239
|
-
{ pattern: /[hlrw23]/g, replace: '' }
|
|
2121
|
+
PhoneticRegistry.add('metaphone', Metaphone);
|
|
2122
|
+
PhoneticMappingRegistry.add('metaphone', 'en90', {
|
|
2123
|
+
map: {
|
|
2124
|
+
a: 'A',
|
|
2125
|
+
b: 'B',
|
|
2126
|
+
c: 'K',
|
|
2127
|
+
d: 'T',
|
|
2128
|
+
e: 'E',
|
|
2129
|
+
f: 'F',
|
|
2130
|
+
g: 'K',
|
|
2131
|
+
h: 'H',
|
|
2132
|
+
i: 'I',
|
|
2133
|
+
j: 'J',
|
|
2134
|
+
k: 'K',
|
|
2135
|
+
l: 'L',
|
|
2136
|
+
m: 'M',
|
|
2137
|
+
n: 'N',
|
|
2138
|
+
o: 'O',
|
|
2139
|
+
p: 'P',
|
|
2140
|
+
q: 'K',
|
|
2141
|
+
r: 'R',
|
|
2142
|
+
s: 'S',
|
|
2143
|
+
t: 'T',
|
|
2144
|
+
u: 'U',
|
|
2145
|
+
v: 'F',
|
|
2146
|
+
w: 'W',
|
|
2147
|
+
x: 'KS',
|
|
2148
|
+
y: 'Y',
|
|
2149
|
+
z: 'S'
|
|
2150
|
+
},
|
|
2151
|
+
ruleset: [
|
|
2152
|
+
{ char: 'a', position: 'start', next: ['e'], code: '' },
|
|
2153
|
+
{ char: 'g', position: 'start', next: ['n'], code: '' },
|
|
2154
|
+
{ char: 'k', position: 'start', next: ['n'], code: '' },
|
|
2155
|
+
{ char: 'p', position: 'start', next: ['n'], code: '' },
|
|
2156
|
+
{ char: 'w', position: 'start', next: ['r'], code: '' },
|
|
2157
|
+
{ char: 'b', position: 'end', prev: ['m'], code: '' },
|
|
2158
|
+
{ char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
|
|
2159
|
+
{ char: 'c', next: ['i'], next2: ['a'], code: 'X' },
|
|
2160
|
+
{ char: 'c', next: ['e', 'i', 'y'], code: 'S' },
|
|
2161
|
+
{ char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
|
|
2162
|
+
{
|
|
2163
|
+
char: 'g',
|
|
2164
|
+
next: ['h'],
|
|
2165
|
+
next2Not: ['', 'a', 'e', 'i', 'o', 'u'],
|
|
2166
|
+
code: ''
|
|
2167
|
+
},
|
|
2168
|
+
{ char: 'g', trailing: 'n', code: '' },
|
|
2169
|
+
{ char: 'g', trailing: 'ned', code: '' },
|
|
2170
|
+
{ char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
|
|
2171
|
+
{
|
|
2172
|
+
char: 'h',
|
|
2173
|
+
prev: ['a', 'e', 'i', 'o', 'u'],
|
|
2174
|
+
nextNot: ['a', 'e', 'i', 'o', 'u'],
|
|
2175
|
+
code: ''
|
|
2176
|
+
},
|
|
2177
|
+
{ char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
|
|
2178
|
+
{ char: 'k', prev: ['c'], code: '' },
|
|
2179
|
+
{ char: 'p', next: ['h'], code: 'F' },
|
|
2180
|
+
{ char: 's', next: ['h'], code: 'X' },
|
|
2181
|
+
{ char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2182
|
+
{ char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2183
|
+
{ char: 't', next: ['h'], code: '0' },
|
|
2184
|
+
{ char: 't', next: ['c'], next2: ['h'], code: '' },
|
|
2185
|
+
{ char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
|
|
2186
|
+
{ char: 'h', leading: 'w', code: '' },
|
|
2187
|
+
{ char: 'x', position: 'start', code: 'S' },
|
|
2188
|
+
{ char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
|
|
2240
2189
|
]
|
|
2241
2190
|
});
|
|
2242
2191
|
|
|
2243
|
-
class
|
|
2244
|
-
static default = {
|
|
2192
|
+
class Soundex extends Phonetic {
|
|
2193
|
+
static default = {
|
|
2194
|
+
map: 'en',
|
|
2195
|
+
delimiter: ' ',
|
|
2196
|
+
length: 4,
|
|
2197
|
+
pad: '0',
|
|
2198
|
+
dedupe: true
|
|
2199
|
+
};
|
|
2245
2200
|
constructor(opt = {}) {
|
|
2246
|
-
super('
|
|
2201
|
+
super('soundex', opt);
|
|
2247
2202
|
}
|
|
2248
|
-
adjustCode(code) {
|
|
2249
|
-
return
|
|
2203
|
+
adjustCode(code, chars) {
|
|
2204
|
+
return chars[0].toUpperCase() + code.slice(1).replaceAll('0', '');
|
|
2250
2205
|
}
|
|
2251
2206
|
}
|
|
2252
|
-
PhoneticRegistry.add('
|
|
2253
|
-
PhoneticMappingRegistry.add('
|
|
2207
|
+
PhoneticRegistry.add('soundex', Soundex);
|
|
2208
|
+
PhoneticMappingRegistry.add('soundex', 'en', {
|
|
2209
|
+
map: {
|
|
2210
|
+
a: '0',
|
|
2211
|
+
e: '0',
|
|
2212
|
+
h: '0',
|
|
2213
|
+
i: '0',
|
|
2214
|
+
o: '0',
|
|
2215
|
+
u: '0',
|
|
2216
|
+
w: '0',
|
|
2217
|
+
y: '0',
|
|
2218
|
+
b: '1',
|
|
2219
|
+
f: '1',
|
|
2220
|
+
p: '1',
|
|
2221
|
+
v: '1',
|
|
2222
|
+
c: '2',
|
|
2223
|
+
g: '2',
|
|
2224
|
+
j: '2',
|
|
2225
|
+
k: '2',
|
|
2226
|
+
q: '2',
|
|
2227
|
+
s: '2',
|
|
2228
|
+
x: '2',
|
|
2229
|
+
z: '2',
|
|
2230
|
+
d: '3',
|
|
2231
|
+
t: '3',
|
|
2232
|
+
l: '4',
|
|
2233
|
+
m: '5',
|
|
2234
|
+
n: '5',
|
|
2235
|
+
r: '6'
|
|
2236
|
+
}
|
|
2237
|
+
});
|
|
2238
|
+
PhoneticMappingRegistry.add('soundex', 'de', {
|
|
2254
2239
|
map: {
|
|
2255
2240
|
a: '0',
|
|
2256
2241
|
ä: '0',
|
|
2257
2242
|
e: '0',
|
|
2243
|
+
h: '0',
|
|
2258
2244
|
i: '0',
|
|
2259
2245
|
j: '0',
|
|
2260
2246
|
o: '0',
|
|
@@ -2263,220 +2249,596 @@ PhoneticMappingRegistry.add('cologne', 'default', {
|
|
|
2263
2249
|
ü: '0',
|
|
2264
2250
|
y: '0',
|
|
2265
2251
|
b: '1',
|
|
2252
|
+
f: '1',
|
|
2266
2253
|
p: '1',
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
|
|
2272
|
-
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
2278
|
-
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
x: '48'
|
|
2254
|
+
v: '1',
|
|
2255
|
+
w: '1',
|
|
2256
|
+
c: '2',
|
|
2257
|
+
g: '2',
|
|
2258
|
+
k: '2',
|
|
2259
|
+
q: '2',
|
|
2260
|
+
s: '2',
|
|
2261
|
+
ß: '2',
|
|
2262
|
+
x: '2',
|
|
2263
|
+
z: '2',
|
|
2264
|
+
d: '3',
|
|
2265
|
+
t: '3',
|
|
2266
|
+
l: '4',
|
|
2267
|
+
m: '5',
|
|
2268
|
+
n: '5',
|
|
2269
|
+
r: '6'
|
|
2284
2270
|
},
|
|
2285
|
-
|
|
2286
|
-
ruleset: [
|
|
2287
|
-
{ char: 'p', next: ['h'], code: '3' },
|
|
2288
|
-
{
|
|
2289
|
-
char: 'c',
|
|
2290
|
-
position: 'start',
|
|
2291
|
-
next: ['a', 'h', 'k', 'l', 'o', 'q', 'r', 'u', 'x'],
|
|
2292
|
-
code: '4'
|
|
2293
|
-
},
|
|
2294
|
-
{
|
|
2295
|
-
char: 'c',
|
|
2296
|
-
next: ['a', 'h', 'k', 'o', 'q', 'u', 'x'],
|
|
2297
|
-
prevNot: ['s', 'z'],
|
|
2298
|
-
code: '4'
|
|
2299
|
-
},
|
|
2300
|
-
{ char: 'd', next: ['c', 's', 'z'], code: '8' },
|
|
2301
|
-
{ char: 't', next: ['c', 's', 'z'], code: '8' },
|
|
2302
|
-
{ char: 'x', prev: ['c', 'k', 'q'], code: '8' }
|
|
2303
|
-
]
|
|
2271
|
+
ruleset: [{ char: 'c', next: ['h'], code: '7' }]
|
|
2304
2272
|
});
|
|
2305
2273
|
|
|
2306
|
-
class
|
|
2307
|
-
static
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
|
|
2312
|
-
|
|
2313
|
-
|
|
2274
|
+
class OptionsValidator {
|
|
2275
|
+
static ALLOWED_FLAGS = new Set([
|
|
2276
|
+
'd',
|
|
2277
|
+
'u',
|
|
2278
|
+
'x',
|
|
2279
|
+
'w',
|
|
2280
|
+
't',
|
|
2281
|
+
'r',
|
|
2282
|
+
's',
|
|
2283
|
+
'k',
|
|
2284
|
+
'n',
|
|
2285
|
+
'i'
|
|
2286
|
+
]);
|
|
2287
|
+
static ALLOWED_OUTPUT = new Set(['orig', 'prep']);
|
|
2288
|
+
static ALLOWED_MODES = new Set(['default', 'batch', 'single', 'pairwise']);
|
|
2289
|
+
static ALLOWED_SORT = new Set(['asc', 'desc']);
|
|
2290
|
+
static PROCESSORS = {
|
|
2291
|
+
phonetic: (opt) => {
|
|
2292
|
+
if (!opt) return;
|
|
2293
|
+
OptionsValidator.validatePhoneticName(opt.algo);
|
|
2294
|
+
OptionsValidator.validatePhoneticOptions(opt.opt);
|
|
2295
|
+
}
|
|
2314
2296
|
};
|
|
2315
|
-
|
|
2316
|
-
|
|
2297
|
+
static METRIC_OPT_MAP = {
|
|
2298
|
+
mode: (v) => OptionsValidator.validateMode(v),
|
|
2299
|
+
delimiter: (v) => OptionsValidator.validateString(v, 'opt.delimiter'),
|
|
2300
|
+
pad: (v) => OptionsValidator.validateString(v, 'opt.pad'),
|
|
2301
|
+
q: (v) => OptionsValidator.validateNumber(v, 'opt.q'),
|
|
2302
|
+
match: (v) => OptionsValidator.validateNumber(v, 'opt.match'),
|
|
2303
|
+
mismatch: (v) => OptionsValidator.validateNumber(v, 'opt.mismatch'),
|
|
2304
|
+
gap: (v) => OptionsValidator.validateNumber(v, 'opt.gap')
|
|
2305
|
+
};
|
|
2306
|
+
static PHONETIC_OPT_MAP = {
|
|
2307
|
+
map: (v) =>
|
|
2308
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.map'),
|
|
2309
|
+
delimiter: (v) =>
|
|
2310
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.delimiter'),
|
|
2311
|
+
length: (v) =>
|
|
2312
|
+
OptionsValidator.validateNumber(v, 'processors.phonetic.opt.length'),
|
|
2313
|
+
pad: (v) =>
|
|
2314
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.pad'),
|
|
2315
|
+
dedupe: (v) =>
|
|
2316
|
+
OptionsValidator.validateBoolean(v, 'processors.phonetic.opt.dedupe'),
|
|
2317
|
+
fallback: (v) =>
|
|
2318
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.fallback')
|
|
2319
|
+
};
|
|
2320
|
+
static CMPSTR_OPT_MAP = {
|
|
2321
|
+
raw: (v) => OptionsValidator.validateBoolean(v, 'raw'),
|
|
2322
|
+
removeZero: (v) => OptionsValidator.validateBoolean(v, 'removeZero'),
|
|
2323
|
+
safeEmpty: (v) => OptionsValidator.validateBoolean(v, 'safeEmpty'),
|
|
2324
|
+
flags: (v) => OptionsValidator.validateFlags(v),
|
|
2325
|
+
metric: (v) => OptionsValidator.validateMetricName(v),
|
|
2326
|
+
output: (v) => OptionsValidator.validateOutput(v),
|
|
2327
|
+
opt: (v) => OptionsValidator.validateMetricOptions(v),
|
|
2328
|
+
processors: (v) => OptionsValidator.validateProcessors(v),
|
|
2329
|
+
sort: (v) => OptionsValidator.validateSort(v, 'sort'),
|
|
2330
|
+
objectsOnly: (v) => OptionsValidator.validateBoolean(v, 'objectsOnly')
|
|
2331
|
+
};
|
|
2332
|
+
static set2string(set) {
|
|
2333
|
+
return Array.from(set).join(' | ');
|
|
2334
|
+
}
|
|
2335
|
+
static validateType(value, name, type) {
|
|
2336
|
+
if (value === undefined) return;
|
|
2337
|
+
if (typeof value !== type || (type === 'number' && Number.isNaN(value))) {
|
|
2338
|
+
throw new CmpStrValidationError(
|
|
2339
|
+
`Invalid option <${name}>: expected ${type}`,
|
|
2340
|
+
{ name, value }
|
|
2341
|
+
);
|
|
2342
|
+
}
|
|
2343
|
+
}
|
|
2344
|
+
static validateEnum(value, name, set) {
|
|
2345
|
+
if (value === undefined) return;
|
|
2346
|
+
if (typeof value !== 'string' || !set.has(value)) {
|
|
2347
|
+
throw new CmpStrValidationError(
|
|
2348
|
+
`Invalid option <${name}>: expected ${OptionsValidator.set2string(set)}`,
|
|
2349
|
+
{ name, value }
|
|
2350
|
+
);
|
|
2351
|
+
}
|
|
2352
|
+
}
|
|
2353
|
+
static validateMap(opt, map) {
|
|
2354
|
+
if (!opt) return;
|
|
2355
|
+
for (const k in opt) {
|
|
2356
|
+
const fn = map[k];
|
|
2357
|
+
if (!fn)
|
|
2358
|
+
throw new CmpStrValidationError(`Invalid option <${k}>`, {
|
|
2359
|
+
option: k,
|
|
2360
|
+
value: map[k]
|
|
2361
|
+
});
|
|
2362
|
+
fn(opt[k]);
|
|
2363
|
+
}
|
|
2364
|
+
}
|
|
2365
|
+
static validateRegistryName(value, name, label, has, list) {
|
|
2366
|
+
if (value === undefined) return;
|
|
2367
|
+
if (typeof value !== 'string' || value.length === 0)
|
|
2368
|
+
throw new CmpStrValidationError(
|
|
2369
|
+
`Invalid option <${name}>: expected non-empty string`,
|
|
2370
|
+
{ name, value }
|
|
2371
|
+
);
|
|
2372
|
+
if (!has(value))
|
|
2373
|
+
throw new CmpStrValidationError(`${label} <${value}> is not registered`, {
|
|
2374
|
+
name,
|
|
2375
|
+
value,
|
|
2376
|
+
available: list()
|
|
2377
|
+
});
|
|
2378
|
+
}
|
|
2379
|
+
static validateBoolean(value, name) {
|
|
2380
|
+
OptionsValidator.validateType(value, name, 'boolean');
|
|
2381
|
+
}
|
|
2382
|
+
static validateNumber(value, name) {
|
|
2383
|
+
OptionsValidator.validateType(value, name, 'number');
|
|
2384
|
+
}
|
|
2385
|
+
static validateString(value, name) {
|
|
2386
|
+
OptionsValidator.validateType(value, name, 'string');
|
|
2387
|
+
}
|
|
2388
|
+
static validateFlags(value) {
|
|
2389
|
+
if (value === undefined) return;
|
|
2390
|
+
if (typeof value !== 'string')
|
|
2391
|
+
throw new CmpStrValidationError(
|
|
2392
|
+
`Invalid option <flags>: expected string`,
|
|
2393
|
+
{ flags: value }
|
|
2394
|
+
);
|
|
2395
|
+
for (let i = 0; i < value.length; i++) {
|
|
2396
|
+
const ch = value[i];
|
|
2397
|
+
if (!OptionsValidator.ALLOWED_FLAGS.has(ch))
|
|
2398
|
+
throw new CmpStrValidationError(
|
|
2399
|
+
`Invalid normalization flag <${ch}> in <flags>: expected ${OptionsValidator.set2string(OptionsValidator.ALLOWED_FLAGS)}`,
|
|
2400
|
+
{ flags: value, invalid: ch }
|
|
2401
|
+
);
|
|
2402
|
+
}
|
|
2403
|
+
}
|
|
2404
|
+
static validateOutput(value) {
|
|
2405
|
+
OptionsValidator.validateEnum(
|
|
2406
|
+
value,
|
|
2407
|
+
'output',
|
|
2408
|
+
OptionsValidator.ALLOWED_OUTPUT
|
|
2409
|
+
);
|
|
2410
|
+
}
|
|
2411
|
+
static validateMode(value) {
|
|
2412
|
+
OptionsValidator.validateEnum(
|
|
2413
|
+
value,
|
|
2414
|
+
'mode',
|
|
2415
|
+
OptionsValidator.ALLOWED_MODES
|
|
2416
|
+
);
|
|
2417
|
+
}
|
|
2418
|
+
static validateSort(value, name) {
|
|
2419
|
+
if (value === undefined || typeof value === 'boolean') return;
|
|
2420
|
+
OptionsValidator.validateEnum(value, name, OptionsValidator.ALLOWED_SORT);
|
|
2421
|
+
}
|
|
2422
|
+
static validateMetricName(value) {
|
|
2423
|
+
OptionsValidator.validateRegistryName(
|
|
2424
|
+
value,
|
|
2425
|
+
'metric',
|
|
2426
|
+
'Comparison metric',
|
|
2427
|
+
MetricRegistry.has,
|
|
2428
|
+
MetricRegistry.list
|
|
2429
|
+
);
|
|
2430
|
+
}
|
|
2431
|
+
static validatePhoneticName(value) {
|
|
2432
|
+
OptionsValidator.validateRegistryName(
|
|
2433
|
+
value,
|
|
2434
|
+
'phonetic',
|
|
2435
|
+
'Phonetic algorithm',
|
|
2436
|
+
PhoneticRegistry.has,
|
|
2437
|
+
PhoneticRegistry.list
|
|
2438
|
+
);
|
|
2439
|
+
}
|
|
2440
|
+
static validateMetricOptions(opt) {
|
|
2441
|
+
OptionsValidator.validateMap(opt, OptionsValidator.METRIC_OPT_MAP);
|
|
2442
|
+
}
|
|
2443
|
+
static validatePhoneticOptions(opt) {
|
|
2444
|
+
OptionsValidator.validateMap(opt, OptionsValidator.PHONETIC_OPT_MAP);
|
|
2445
|
+
}
|
|
2446
|
+
static validateProcessors(opt) {
|
|
2447
|
+
if (!opt) return;
|
|
2448
|
+
for (const key in opt) {
|
|
2449
|
+
const fn = OptionsValidator.PROCESSORS[key];
|
|
2450
|
+
if (!fn)
|
|
2451
|
+
throw new CmpStrValidationError(
|
|
2452
|
+
`Invalid processor type <${key}> in <processors>: expected ${Object.keys(OptionsValidator.PROCESSORS).join(' | ')}`,
|
|
2453
|
+
{ processors: opt, invalid: key }
|
|
2454
|
+
);
|
|
2455
|
+
fn(opt[key]);
|
|
2456
|
+
}
|
|
2457
|
+
}
|
|
2458
|
+
static validateOptions(opt) {
|
|
2459
|
+
OptionsValidator.validateMap(opt, OptionsValidator.CMPSTR_OPT_MAP);
|
|
2460
|
+
}
|
|
2461
|
+
}
|
|
2462
|
+
|
|
2463
|
+
class StructuredData {
|
|
2464
|
+
data;
|
|
2465
|
+
key;
|
|
2466
|
+
static SORT_ASC = (a, b) => a.res - b.res;
|
|
2467
|
+
static SORT_DESC = (a, b) => b.res - a.res;
|
|
2468
|
+
static create(data, key) {
|
|
2469
|
+
return new StructuredData(data, key);
|
|
2470
|
+
}
|
|
2471
|
+
constructor(data, key) {
|
|
2472
|
+
this.data = data;
|
|
2473
|
+
this.key = key;
|
|
2474
|
+
}
|
|
2475
|
+
extractFrom(arr, key) {
|
|
2476
|
+
const n = arr.length;
|
|
2477
|
+
const result = new Array(n);
|
|
2478
|
+
for (let i = 0; i < n; i++) {
|
|
2479
|
+
const val = arr[i][key];
|
|
2480
|
+
result[i] = val != null ? String(val) : '';
|
|
2481
|
+
}
|
|
2482
|
+
return result;
|
|
2483
|
+
}
|
|
2484
|
+
extract() {
|
|
2485
|
+
return this.extractFrom(this.data, this.key);
|
|
2486
|
+
}
|
|
2487
|
+
isMetricResult(v) {
|
|
2488
|
+
return (
|
|
2489
|
+
typeof v === 'object' && v !== null && 'a' in v && 'b' in v && 'res' in v
|
|
2490
|
+
);
|
|
2491
|
+
}
|
|
2492
|
+
isCmpStrResult(v) {
|
|
2493
|
+
return (
|
|
2494
|
+
typeof v === 'object' &&
|
|
2495
|
+
v !== null &&
|
|
2496
|
+
'source' in v &&
|
|
2497
|
+
'target' in v &&
|
|
2498
|
+
'match' in v
|
|
2499
|
+
);
|
|
2500
|
+
}
|
|
2501
|
+
normalizeResults(results) {
|
|
2502
|
+
if (!Array.isArray(results) || results.length === 0) return [];
|
|
2503
|
+
const first = results[0];
|
|
2504
|
+
let out = new Array(results.length);
|
|
2505
|
+
if (this.isMetricResult(first)) {
|
|
2506
|
+
const src = results;
|
|
2507
|
+
for (let i = 0; i < src.length; i++) out[i] = { ...src[i], __idx: i };
|
|
2508
|
+
} else if (this.isCmpStrResult(first)) {
|
|
2509
|
+
const src = results;
|
|
2510
|
+
for (let i = 0; i < src.length; i++) {
|
|
2511
|
+
const r = src[i];
|
|
2512
|
+
out[i] = {
|
|
2513
|
+
metric: 'unknown',
|
|
2514
|
+
a: r.source,
|
|
2515
|
+
b: r.target,
|
|
2516
|
+
res: r.match,
|
|
2517
|
+
raw: r.raw,
|
|
2518
|
+
__idx: i
|
|
2519
|
+
};
|
|
2520
|
+
}
|
|
2521
|
+
} else
|
|
2522
|
+
throw new CmpStrValidationError(
|
|
2523
|
+
'Unsupported result format for StructuredData normalization.'
|
|
2524
|
+
);
|
|
2525
|
+
return out;
|
|
2526
|
+
}
|
|
2527
|
+
rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
|
|
2528
|
+
const m = extractedStrings.length,
|
|
2529
|
+
n = results.length;
|
|
2530
|
+
const stringToIndices = Pool.acquire('map', m);
|
|
2531
|
+
const occurrenceCount = Pool.acquire('map', n);
|
|
2532
|
+
const output = new Array(n);
|
|
2533
|
+
stringToIndices.clear();
|
|
2534
|
+
occurrenceCount.clear();
|
|
2535
|
+
try {
|
|
2536
|
+
for (let i = 0; i < m; i++) {
|
|
2537
|
+
const str = extractedStrings[i];
|
|
2538
|
+
let arr = stringToIndices.get(str);
|
|
2539
|
+
if (!arr) {
|
|
2540
|
+
arr = [];
|
|
2541
|
+
stringToIndices.set(str, arr);
|
|
2542
|
+
}
|
|
2543
|
+
arr.push(i);
|
|
2544
|
+
}
|
|
2545
|
+
let out = 0;
|
|
2546
|
+
for (let i = 0; i < n; i++) {
|
|
2547
|
+
const result = results[i];
|
|
2548
|
+
if (removeZero && result.res === 0) continue;
|
|
2549
|
+
const targetStr = result.b || '';
|
|
2550
|
+
const indices = stringToIndices.get(targetStr);
|
|
2551
|
+
let dataIndex;
|
|
2552
|
+
if (indices && indices.length > 0) {
|
|
2553
|
+
const occurrence = occurrenceCount.get(targetStr) ?? 0;
|
|
2554
|
+
occurrenceCount.set(targetStr, occurrence + 1);
|
|
2555
|
+
dataIndex = indices[occurrence % indices.length];
|
|
2556
|
+
} else {
|
|
2557
|
+
dataIndex = result.__idx ?? i;
|
|
2558
|
+
}
|
|
2559
|
+
if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
|
|
2560
|
+
const sourceObj = sourceData[dataIndex];
|
|
2561
|
+
const mappedTarget = extractedStrings[dataIndex] || targetStr;
|
|
2562
|
+
if (objectsOnly) output[out++] = sourceObj;
|
|
2563
|
+
else
|
|
2564
|
+
output[out++] = {
|
|
2565
|
+
obj: sourceObj,
|
|
2566
|
+
key: this.key,
|
|
2567
|
+
result: {
|
|
2568
|
+
source: result.a,
|
|
2569
|
+
target: mappedTarget,
|
|
2570
|
+
match: result.res
|
|
2571
|
+
},
|
|
2572
|
+
...(result.raw ? { raw: result.raw } : null)
|
|
2573
|
+
};
|
|
2574
|
+
}
|
|
2575
|
+
output.length = out;
|
|
2576
|
+
return output;
|
|
2577
|
+
} finally {
|
|
2578
|
+
Pool.release('map', stringToIndices, m);
|
|
2579
|
+
Pool.release('map', occurrenceCount, n);
|
|
2580
|
+
}
|
|
2581
|
+
}
|
|
2582
|
+
sort(results, sort) {
|
|
2583
|
+
if (!sort || results.length <= 1) return results;
|
|
2584
|
+
return results.sort(
|
|
2585
|
+
sort === 'asc' ? StructuredData.SORT_ASC : StructuredData.SORT_DESC
|
|
2586
|
+
);
|
|
2587
|
+
}
|
|
2588
|
+
finalizeLookup(results, extractedStrings, opt) {
|
|
2589
|
+
return this.rebuild(
|
|
2590
|
+
this.sort(this.normalizeResults(results), opt?.sort),
|
|
2591
|
+
this.data,
|
|
2592
|
+
extractedStrings,
|
|
2593
|
+
opt?.removeZero,
|
|
2594
|
+
opt?.objectsOnly
|
|
2595
|
+
);
|
|
2596
|
+
}
|
|
2597
|
+
performLookup(fn, extractedStrings, opt) {
|
|
2598
|
+
return ErrorUtil.wrap(
|
|
2599
|
+
() => this.finalizeLookup(fn(), extractedStrings, opt),
|
|
2600
|
+
'StructuredData lookup failed',
|
|
2601
|
+
{ key: this.key }
|
|
2602
|
+
);
|
|
2603
|
+
}
|
|
2604
|
+
async performLookupAsync(fn, extractedStrings, opt) {
|
|
2605
|
+
return await ErrorUtil.wrapAsync(
|
|
2606
|
+
async () => this.finalizeLookup(await fn(), extractedStrings, opt),
|
|
2607
|
+
'StructuredData async lookup failed',
|
|
2608
|
+
{ key: this.key }
|
|
2609
|
+
);
|
|
2610
|
+
}
|
|
2611
|
+
lookup(fn, query, opt) {
|
|
2612
|
+
const b = this.extract();
|
|
2613
|
+
try {
|
|
2614
|
+
return this.performLookup(() => fn(query, b, opt), b, opt);
|
|
2615
|
+
} finally {
|
|
2616
|
+
Pool.release('string[]', b, b.length);
|
|
2617
|
+
}
|
|
2618
|
+
}
|
|
2619
|
+
async lookupAsync(fn, query, opt) {
|
|
2620
|
+
const b = this.extract();
|
|
2621
|
+
try {
|
|
2622
|
+
return await this.performLookupAsync(() => fn(query, b, opt), b, opt);
|
|
2623
|
+
} finally {
|
|
2624
|
+
Pool.release('string[]', b, b.length);
|
|
2625
|
+
}
|
|
2626
|
+
}
|
|
2627
|
+
lookupPairs(fn, other, otherKey, opt) {
|
|
2628
|
+
const a = this.extract();
|
|
2629
|
+
const b = this.extractFrom(other, otherKey);
|
|
2630
|
+
try {
|
|
2631
|
+
return this.performLookup(() => fn(a, b, opt), a, opt);
|
|
2632
|
+
} finally {
|
|
2633
|
+
Pool.release('string[]', a, a.length);
|
|
2634
|
+
Pool.release('string[]', b, b.length);
|
|
2635
|
+
}
|
|
2636
|
+
}
|
|
2637
|
+
async lookupPairsAsync(fn, other, otherKey, opt) {
|
|
2638
|
+
const a = this.extract();
|
|
2639
|
+
const b = this.extractFrom(other, otherKey);
|
|
2640
|
+
try {
|
|
2641
|
+
return await this.performLookupAsync(() => fn(a, b, opt), a, opt);
|
|
2642
|
+
} finally {
|
|
2643
|
+
Pool.release('string[]', a, a.length);
|
|
2644
|
+
Pool.release('string[]', b, b.length);
|
|
2645
|
+
}
|
|
2646
|
+
}
|
|
2647
|
+
}
|
|
2648
|
+
|
|
2649
|
+
/**
 * Computes descriptive statistics for a piece of text: word and sentence
 * counts, character and word frequencies, estimated syllables and several
 * readability scores.
 *
 * The text is trimmed, tokenized and counted once in the constructor;
 * syllable statistics are computed lazily on first use and then cached.
 */
class TextAnalyzer {
  /**
   * Shared patterns. The `g`-flagged ones are only used through `match`,
   * `matchAll` and `replace`, none of which leave `lastIndex` state behind
   * on the shared instance.
   */
  static REGEX = {
    number: /\d/,
    sentence: /(?<=[.!?])\s+/,
    word: /\p{L}+/gu,
    nonWord: /[^\p{L}]/gu,
    vowelGroup: /[aeiouy]+/g,
    letter: /\p{L}/gu,
    ucLetter: /\p{Lu}/gu
  };

  /** Trimmed input text. */
  text;
  /** Lower-cased runs of Unicode letters, in order of appearance. */
  words = [];
  /** Non-empty pieces of the text split after `.`, `!` or `?` plus whitespace. */
  sentences = [];
  /** Character (code point) -> number of occurrences in the text. */
  charFrequency = new Map();
  /** Word -> number of occurrences. */
  wordHistogram = new Map();
  /** Cleaned word -> estimated syllable count. */
  syllableCache = new Map();
  /** Lazily computed result of `computeSyllableStats`. */
  syllableStats;

  /**
   * @param {string} input - Text to analyze; surrounding whitespace is trimmed
   */
  constructor(input) {
    this.text = input.trim();
    this.tokenize();
    this.computeFrequencies();
  }

  /**
   * Fills `words` with the lower-cased letter runs of the text and sets
   * `sentences` to the non-empty sentence fragments.
   */
  tokenize() {
    const lcText = this.text.toLowerCase();
    // matchAll iterates on a private copy of the regex, so the shared static
    // pattern is never advanced (unlike a manual exec loop).
    for (const [token] of lcText.matchAll(TextAnalyzer.REGEX.word))
      this.words.push(token);
    this.sentences = this.text
      .split(TextAnalyzer.REGEX.sentence)
      .filter(Boolean);
  }

  /** Counts every character of the text and every tokenized word. */
  computeFrequencies() {
    for (const char of this.text)
      this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
    for (const word of this.words)
      this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
  }

  /**
   * Estimates the syllables of a word as its number of `[aeiouy]+` groups,
   * with a minimum of 1. Results are cached per cleaned word.
   *
   * @param {string} word - Word to inspect
   * @returns {number} Estimated syllable count (at least 1)
   */
  estimateSyllables(word) {
    const clean = word
      .normalize('NFC')
      .toLowerCase()
      .replace(TextAnalyzer.REGEX.nonWord, '');
    if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
    const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
    const count = matches ? matches.length : 1;
    this.syllableCache.set(clean, count);
    return count;
  }

  /**
   * Computes (once) the syllable statistics over all words.
   *
   * @returns {{total: number, mono: number, perWord: number[], avg: number, median: number}}
   *   `perWord` holds the per-word estimates sorted ascending
   */
  computeSyllableStats() {
    return (this.syllableStats ||= (() => {
      const perWord = this.words
        .map((w) => this.estimateSyllables(w))
        .sort((a, b) => a - b);
      const total = perWord.reduce((sum, s) => sum + s, 0);
      const mono = perWord.filter((s) => s === 1).length;
      const median = !perWord.length
        ? 0
        : perWord.length % 2 === 0
          ? (perWord[perWord.length / 2 - 1] + perWord[perWord.length / 2]) / 2
          : perWord[Math.floor(perWord.length / 2)];
      return {
        total,
        mono,
        perWord,
        avg: perWord.length ? total / perWord.length : 0,
        median
      };
    })());
  }

  /** @returns {number} Length of the trimmed text in UTF-16 code units */
  getLength = () => this.text.length;

  /** @returns {number} Number of tokenized words */
  getWordCount = () => this.words.length;

  /** @returns {number} Number of sentence fragments */
  getSentenceCount = () => this.sentences.length;

  /** @returns {number} Mean word length, or 0 when there are no words */
  getAvgWordLength() {
    return this.words.length
      ? this.words.join('').length / this.words.length
      : 0;
  }

  /** @returns {number} Mean words per sentence, or 0 when there are no sentences */
  getAvgSentenceLength() {
    return this.sentences.length
      ? this.words.length / this.sentences.length
      : 0;
  }

  /** @returns {Object<string, number>} Word -> occurrence count */
  getWordHistogram() {
    return Object.fromEntries(this.wordHistogram);
  }

  /**
   * @param {number} [limit=5] - Maximum number of words to return
   * @returns {string[]} Words ordered by descending frequency
   */
  getMostCommonWords(limit = 5) {
    return [...this.wordHistogram.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, limit)
      .map((e) => e[0]);
  }

  /** @returns {string[]} Words that occur exactly once */
  getHapaxLegomena() {
    return [...this.wordHistogram.entries()]
      .filter(([, c]) => c === 1)
      .map((e) => e[0]);
  }

  /** @returns {boolean} Whether the text contains at least one digit */
  hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);

  /** @returns {number} Share of upper-case letters among all letters (0 if none) */
  getUpperCaseRatio() {
    const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
    const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
    return matches.length ? upper / matches.length : 0;
  }

  /** @returns {Object<string, number>} Character -> occurrence count */
  getCharFrequency() {
    return Object.fromEntries(this.charFrequency);
  }

  /**
   * Maps each character's code point (upper-case hex, padded to at least
   * four digits) to its occurrence count.
   *
   * @returns {Object<string, number>} Hex code point -> occurrence count
   */
  getUnicodeCodepoints() {
    const result = {};
    for (const [char, count] of this.charFrequency) {
      // The map is keyed by full code points (string iteration), so
      // codePointAt is required: charCodeAt would report only the high
      // surrogate of characters outside the Basic Multilingual Plane.
      const block = char
        .codePointAt(0)
        .toString(16)
        .padStart(4, '0')
        .toUpperCase();
      result[block] = (result[block] || 0) + count;
    }
    return result;
  }

  /**
   * @param {number} [len=7] - Minimum length for a word to count as long
   * @returns {number} Share of words with at least `len` characters
   */
  getLongWordRatio(len = 7) {
    let long = 0;
    for (const w of this.words) if (w.length >= len) long++;
    return this.words.length ? long / this.words.length : 0;
  }

  /**
   * @param {number} [len=3] - Maximum length for a word to count as short
   * @returns {number} Share of words with at most `len` characters
   */
  getShortWordRatio(len = 3) {
    let short = 0;
    for (const w of this.words) if (w.length <= len) short++;
    return this.words.length ? short / this.words.length : 0;
  }

  /** @returns {number} Total estimated syllables over all words */
  getSyllablesCount() {
    return this.computeSyllableStats().total;
  }

  /** @returns {number} Number of words estimated at exactly one syllable */
  getMonosyllabicWordCount() {
    return this.computeSyllableStats().mono;
  }

  /**
   * @param {number} min - Lower syllable bound (inclusive)
   * @returns {number} Number of words with at least `min` syllables
   */
  getMinSyllablesWordCount(min) {
    return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
  }

  /**
   * @param {number} max - Upper syllable bound (inclusive)
   * @returns {number} Number of words with at most `max` syllables
   */
  getMaxSyllablesWordCount(max) {
    return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
  }

  /** @returns {number} Mean estimated syllables per word (0 if no words) */
  getAvgSyllablesPerWord() {
    return this.computeSyllableStats().avg;
  }

  /** @returns {number} Median estimated syllables per word (0 if no words) */
  getMedianSyllablesPerWord() {
    return this.computeSyllableStats().median;
  }

  /**
   * Honoré's R: `100 * ln(N) / (1 - V1 / V)` with N tokens, V distinct words
   * and V1 words occurring once.
   *
   * Division by zero does not throw in JavaScript, so the degenerate cases
   * (no words, or every distinct word occurring exactly once) are handled
   * explicitly and yield 0 instead of NaN or an infinity.
   *
   * @returns {number} Honoré's R, or 0 when it is undefined for the text
   */
  getHonoresR() {
    const tokens = this.words.length;
    const types = this.wordHistogram.size;
    if (!tokens || !types) return 0;
    const hapaxShare = this.getHapaxLegomena().length / types;
    if (hapaxShare >= 1) return 0;
    return (100 * Math.log(tokens)) / (1 - hapaxShare);
  }

  /**
   * @param {number} [wpm=200] - Reading speed in words per minute; falsy
   *   values (0, NaN, null) fall back to 1
   * @returns {number} Estimated reading time in minutes
   */
  getReadingTime(wpm = 200) {
    return this.words.length / (wpm || 1);
  }

  /**
   * Computes a readability score from average sentence length and average
   * syllables per word.
   *
   * @param {'flesch'|'fleschde'|'kincaid'} [metric='flesch'] - Formula to use
   * @returns {number|undefined} The score, or `undefined` for an unknown metric
   */
  getReadabilityScore(metric = 'flesch') {
    // Zero counts are replaced by 1 so the ratios below stay finite.
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const y = this.getSyllablesCount() || 1;
    const asl = w / s;
    const asw = y / w;
    switch (metric) {
      case 'flesch':
        return 206.835 - 1.015 * asl - 84.6 * asw;
      case 'fleschde':
        return 180 - asl - 58.5 * asw;
      case 'kincaid':
        return 0.39 * asl + 11.8 * asw - 15.59;
    }
  }

  /**
   * LIX readability index: words per sentence plus the percentage of long
   * words (default threshold of `getLongWordRatio`).
   *
   * @returns {number} LIX score
   */
  getLIXScore() {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const l = this.getLongWordRatio() * w;
    return w / s + (l / w) * 100;
  }

  /**
   * Wiener Sachtextformel: returns the four variants of the formula.
   *
   * @returns {number[]} Scores of variants 1 to 4, in that order
   */
  getWSTFScore() {
    const w = this.words.length || 1;
    const h = (this.getMinSyllablesWordCount(3) / w) * 100;
    const s = this.getAvgSentenceLength();
    const l = this.getLongWordRatio() * 100;
    const m = (this.getMonosyllabicWordCount() / w) * 100;
    return [
      0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
      0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
      0.2963 * h + 0.1905 * s - 1.1144,
      0.2744 * h + 0.2656 * s - 1.693
    ];
  }
}
|
|
2480
2842
|
|
|
2481
2843
|
// Module-level Profiler instance obtained from Profiler.getInstance().
const profiler = Profiler.getInstance();
|
|
2482
2844
|
class CmpStr {
|
|
@@ -2528,31 +2890,26 @@ class CmpStr {
|
|
|
2528
2890
|
}
|
|
2529
2891
|
assert(cond, test) {
|
|
2530
2892
|
switch (cond) {
|
|
2893
|
+
default:
|
|
2894
|
+
throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
|
|
2531
2895
|
case 'metric':
|
|
2532
|
-
|
|
2533
|
-
throw new CmpStrNotFoundError(
|
|
2534
|
-
`CmpStr <metric> must be set, call .setMetric(), ` +
|
|
2535
|
-
`use CmpStr.metric.list() for available metrics`,
|
|
2536
|
-
{ metric: test }
|
|
2537
|
-
);
|
|
2896
|
+
OptionsValidator.validateMetricName(test);
|
|
2538
2897
|
break;
|
|
2539
2898
|
case 'phonetic':
|
|
2540
|
-
|
|
2541
|
-
throw new CmpStrNotFoundError(
|
|
2542
|
-
`CmpStr <phonetic> must be set, call .setPhonetic(), ` +
|
|
2543
|
-
`use CmpStr.phonetic.list() for available phonetic algorithms`,
|
|
2544
|
-
{ phonetic: test }
|
|
2545
|
-
);
|
|
2899
|
+
OptionsValidator.validatePhoneticName(test);
|
|
2546
2900
|
break;
|
|
2547
|
-
default:
|
|
2548
|
-
throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
|
|
2549
2901
|
}
|
|
2550
2902
|
}
|
|
2551
2903
|
assertMany(...cond) {
|
|
2552
2904
|
for (const [c, test] of cond) this.assert(c, test);
|
|
2553
2905
|
}
|
|
2554
2906
|
resolveOptions(opt) {
|
|
2555
|
-
|
|
2907
|
+
const merged = DeepMerge.merge(
|
|
2908
|
+
{ ...(this.options ?? Object.create(null)) },
|
|
2909
|
+
opt
|
|
2910
|
+
);
|
|
2911
|
+
OptionsValidator.validateOptions(merged);
|
|
2912
|
+
return merged;
|
|
2556
2913
|
}
|
|
2557
2914
|
normalize(input, flags) {
|
|
2558
2915
|
return Normalizer.normalize(input, flags ?? this.options.flags ?? '');
|
|
@@ -2568,7 +2925,7 @@ class CmpStr {
|
|
|
2568
2925
|
return input;
|
|
2569
2926
|
}
|
|
2570
2927
|
postProcess(result, opt) {
|
|
2571
|
-
if (
|
|
2928
|
+
if (Array.isArray(result) && opt?.removeZero)
|
|
2572
2929
|
result = result.filter((r) => r.res > 0);
|
|
2573
2930
|
return result;
|
|
2574
2931
|
}
|
|
@@ -2584,10 +2941,10 @@ class CmpStr {
|
|
|
2584
2941
|
return StructuredData.create(data, key);
|
|
2585
2942
|
}
|
|
2586
2943
|
compute(a, b, opt, mode, raw, skip) {
|
|
2944
|
+
const resolved = this.resolveOptions(opt);
|
|
2945
|
+
this.assert('metric', resolved.metric);
|
|
2587
2946
|
return ErrorUtil.wrap(
|
|
2588
2947
|
() => {
|
|
2589
|
-
const resolved = this.resolveOptions(opt);
|
|
2590
|
-
this.assert('metric', resolved.metric);
|
|
2591
2948
|
const A = skip ? a : this.prepare(a, resolved);
|
|
2592
2949
|
const B = skip ? b : this.prepare(b, resolved);
|
|
2593
2950
|
if (
|
|
@@ -2605,7 +2962,7 @@ class CmpStr {
|
|
|
2605
2962
|
const result = this.postProcess(metric.getResults(), resolved);
|
|
2606
2963
|
return this.output(result, raw ?? resolved.raw);
|
|
2607
2964
|
},
|
|
2608
|
-
`Failed to compute metric <${
|
|
2965
|
+
`Failed to compute metric <${resolved.metric}> for the given inputs`,
|
|
2609
2966
|
{ a, b, options: opt }
|
|
2610
2967
|
);
|
|
2611
2968
|
}
|
|
@@ -2621,46 +2978,79 @@ class CmpStr {
|
|
|
2621
2978
|
{ result, raw }
|
|
2622
2979
|
);
|
|
2623
2980
|
}
|
|
2624
|
-
clone
|
|
2981
|
+
clone() {
|
|
2982
|
+
const inst = Object.assign(
|
|
2983
|
+
Object.create(Object.getPrototypeOf(this)),
|
|
2984
|
+
this
|
|
2985
|
+
);
|
|
2986
|
+
inst.options = DeepMerge.merge(Object.create(null), this.options);
|
|
2987
|
+
return inst;
|
|
2988
|
+
}
|
|
2625
2989
|
reset() {
|
|
2626
|
-
|
|
2990
|
+
this.options = Object.create(null);
|
|
2627
2991
|
return this;
|
|
2628
2992
|
}
|
|
2629
2993
|
setOptions(opt) {
|
|
2994
|
+
OptionsValidator.validateOptions(opt);
|
|
2630
2995
|
this.options = opt;
|
|
2631
2996
|
return this;
|
|
2632
2997
|
}
|
|
2633
2998
|
mergeOptions(opt) {
|
|
2634
|
-
merge(this.options, opt);
|
|
2999
|
+
DeepMerge.merge(this.options, opt);
|
|
3000
|
+
OptionsValidator.validateOptions(this.options);
|
|
2635
3001
|
return this;
|
|
2636
3002
|
}
|
|
2637
3003
|
setSerializedOptions(opt) {
|
|
2638
|
-
|
|
2639
|
-
()
|
|
2640
|
-
|
|
2641
|
-
|
|
2642
|
-
|
|
2643
|
-
|
|
2644
|
-
|
|
2645
|
-
|
|
3004
|
+
try {
|
|
3005
|
+
const parsed = JSON.parse(opt);
|
|
3006
|
+
OptionsValidator.validateOptions(parsed);
|
|
3007
|
+
this.options = parsed;
|
|
3008
|
+
return this;
|
|
3009
|
+
} catch (err) {
|
|
3010
|
+
if (err instanceof SyntaxError)
|
|
3011
|
+
throw new CmpStrValidationError(
|
|
3012
|
+
`Failed to parse serialized options, invalid JSON string`,
|
|
3013
|
+
{ opt, error: err instanceof Error ? err.message : String(err) }
|
|
3014
|
+
);
|
|
3015
|
+
throw err;
|
|
3016
|
+
}
|
|
2646
3017
|
}
|
|
2647
3018
|
setOption(path, value) {
|
|
2648
|
-
set(this.options, path, value);
|
|
3019
|
+
DeepMerge.set(this.options, path, value);
|
|
3020
|
+
OptionsValidator.validateOptions(this.options);
|
|
2649
3021
|
return this;
|
|
2650
3022
|
}
|
|
2651
3023
|
rmvOption(path) {
|
|
2652
|
-
rmv(this.options, path);
|
|
3024
|
+
DeepMerge.rmv(this.options, path);
|
|
2653
3025
|
return this;
|
|
2654
3026
|
}
|
|
2655
|
-
setRaw
|
|
2656
|
-
|
|
2657
|
-
|
|
2658
|
-
|
|
2659
|
-
|
|
2660
|
-
|
|
2661
|
-
|
|
2662
|
-
|
|
2663
|
-
|
|
3027
|
+
setRaw(enable) {
|
|
3028
|
+
return this.setOption('raw', enable);
|
|
3029
|
+
}
|
|
3030
|
+
setMetric(name) {
|
|
3031
|
+
return this.setOption('metric', name);
|
|
3032
|
+
}
|
|
3033
|
+
setFlags(flags) {
|
|
3034
|
+
return this.setOption('flags', flags);
|
|
3035
|
+
}
|
|
3036
|
+
rmvFlags() {
|
|
3037
|
+
return this.rmvOption('flags');
|
|
3038
|
+
}
|
|
3039
|
+
setProcessors(opt) {
|
|
3040
|
+
return this.setOption('processors', opt);
|
|
3041
|
+
}
|
|
3042
|
+
rmvProcessors() {
|
|
3043
|
+
return this.rmvOption('processors');
|
|
3044
|
+
}
|
|
3045
|
+
getOptions() {
|
|
3046
|
+
return this.options;
|
|
3047
|
+
}
|
|
3048
|
+
getSerializedOptions() {
|
|
3049
|
+
return JSON.stringify(this.options);
|
|
3050
|
+
}
|
|
3051
|
+
getOption(path) {
|
|
3052
|
+
return DeepMerge.get(this.options, path);
|
|
3053
|
+
}
|
|
2664
3054
|
test(a, b, opt) {
|
|
2665
3055
|
return this.compute(a, b, opt, 'single');
|
|
2666
3056
|
}
|
|
@@ -2699,15 +3089,35 @@ class CmpStr {
|
|
|
2699
3089
|
const resolved = this.resolveOptions({ flags, processors });
|
|
2700
3090
|
const test = this.prepare(needle, resolved);
|
|
2701
3091
|
const hstk = this.prepare(haystack, resolved);
|
|
2702
|
-
|
|
3092
|
+
const out = [];
|
|
3093
|
+
for (let i = 0, len = hstk.length; i < len; i++) {
|
|
3094
|
+
if (hstk[i].includes(test)) out.push(haystack[i]);
|
|
3095
|
+
}
|
|
3096
|
+
return out;
|
|
2703
3097
|
}
|
|
2704
3098
|
matrix(input, opt) {
|
|
2705
|
-
|
|
2706
|
-
|
|
2707
|
-
|
|
2708
|
-
|
|
2709
|
-
|
|
2710
|
-
|
|
3099
|
+
const resolved = this.resolveOptions(opt);
|
|
3100
|
+
const arr = this.prepare(input, resolved);
|
|
3101
|
+
const n = arr.length;
|
|
3102
|
+
const out = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
3103
|
+
for (let i = 0; i < n; i++)
|
|
3104
|
+
for (let j = i; j < n; j++) {
|
|
3105
|
+
if (i === j) {
|
|
3106
|
+
out[i][j] = 1;
|
|
3107
|
+
} else {
|
|
3108
|
+
const score = this.compute(
|
|
3109
|
+
arr[i],
|
|
3110
|
+
arr[j],
|
|
3111
|
+
resolved,
|
|
3112
|
+
'single',
|
|
3113
|
+
true,
|
|
3114
|
+
true
|
|
3115
|
+
).res;
|
|
3116
|
+
out[i][j] = score;
|
|
3117
|
+
out[j][i] = score;
|
|
3118
|
+
}
|
|
3119
|
+
}
|
|
3120
|
+
return out;
|
|
2711
3121
|
}
|
|
2712
3122
|
phoneticIndex(input, algo, opt) {
|
|
2713
3123
|
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
@@ -2785,10 +3195,10 @@ class CmpStrAsync extends CmpStr {
|
|
|
2785
3195
|
: phonetic.getIndexAsync(input).then((r) => r.join(delimiter));
|
|
2786
3196
|
}
|
|
2787
3197
|
async computeAsync(a, b, opt, mode, raw, skip) {
|
|
3198
|
+
const resolved = this.resolveOptions(opt);
|
|
3199
|
+
this.assert('metric', resolved.metric);
|
|
2788
3200
|
return ErrorUtil.wrapAsync(
|
|
2789
3201
|
async () => {
|
|
2790
|
-
const resolved = this.resolveOptions(opt);
|
|
2791
|
-
this.assert('metric', resolved.metric);
|
|
2792
3202
|
const A = skip ? a : await this.prepareAsync(a, resolved);
|
|
2793
3203
|
const B = skip ? b : await this.prepareAsync(b, resolved);
|
|
2794
3204
|
if (
|
|
@@ -2846,23 +3256,40 @@ class CmpStrAsync extends CmpStr {
|
|
|
2846
3256
|
const resolved = this.resolveOptions({ flags, processors });
|
|
2847
3257
|
const test = await this.prepareAsync(needle, resolved);
|
|
2848
3258
|
const hstk = await this.prepareAsync(haystack, resolved);
|
|
2849
|
-
|
|
3259
|
+
const out = [];
|
|
3260
|
+
for (let i = 0; i < hstk.length; i++) {
|
|
3261
|
+
if (hstk[i].includes(test)) out.push(haystack[i]);
|
|
3262
|
+
}
|
|
3263
|
+
return out;
|
|
2850
3264
|
}
|
|
2851
3265
|
async matrixAsync(input, opt) {
|
|
2852
|
-
|
|
2853
|
-
|
|
2854
|
-
|
|
2855
|
-
|
|
2856
|
-
|
|
2857
|
-
|
|
2858
|
-
|
|
2859
|
-
|
|
2860
|
-
|
|
2861
|
-
|
|
2862
|
-
|
|
2863
|
-
|
|
2864
|
-
|
|
2865
|
-
|
|
3266
|
+
const resolved = this.resolveOptions(opt);
|
|
3267
|
+
const arr = await this.prepareAsync(input, resolved);
|
|
3268
|
+
const n = arr.length;
|
|
3269
|
+
const out = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
3270
|
+
for (let i = 0; i < n; i++) {
|
|
3271
|
+
await Promise.all(
|
|
3272
|
+
Array.from({ length: n - i }, (_, k) => i + k).map(async (j) => {
|
|
3273
|
+
if (i === j) {
|
|
3274
|
+
out[i][j] = 1;
|
|
3275
|
+
} else {
|
|
3276
|
+
const score = (
|
|
3277
|
+
await this.computeAsync(
|
|
3278
|
+
arr[i],
|
|
3279
|
+
arr[j],
|
|
3280
|
+
resolved,
|
|
3281
|
+
'single',
|
|
3282
|
+
true,
|
|
3283
|
+
true
|
|
3284
|
+
)
|
|
3285
|
+
).res;
|
|
3286
|
+
out[i][j] = score;
|
|
3287
|
+
out[j][i] = score;
|
|
3288
|
+
}
|
|
3289
|
+
})
|
|
3290
|
+
);
|
|
3291
|
+
}
|
|
3292
|
+
return out;
|
|
2866
3293
|
}
|
|
2867
3294
|
async phoneticIndexAsync(input, algo, opt) {
|
|
2868
3295
|
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
@@ -2919,6 +3346,7 @@ export {
|
|
|
2919
3346
|
Metric,
|
|
2920
3347
|
MetricRegistry,
|
|
2921
3348
|
Normalizer,
|
|
3349
|
+
OptionsValidator,
|
|
2922
3350
|
Phonetic,
|
|
2923
3351
|
PhoneticMappingRegistry,
|
|
2924
3352
|
PhoneticRegistry,
|