cmpstr 3.2.1 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -18
- package/dist/CmpStr.esm.js +1904 -1211
- package/dist/CmpStr.esm.min.js +2 -3
- package/dist/CmpStr.umd.js +1924 -1236
- package/dist/CmpStr.umd.min.js +2 -3
- package/dist/cjs/CmpStr.cjs +134 -64
- package/dist/cjs/CmpStrAsync.cjs +60 -37
- package/dist/cjs/index.cjs +1 -2
- package/dist/cjs/metric/Cosine.cjs +1 -2
- package/dist/cjs/metric/DamerauLevenshtein.cjs +1 -2
- package/dist/cjs/metric/DiceSorensen.cjs +1 -2
- package/dist/cjs/metric/Hamming.cjs +5 -4
- package/dist/cjs/metric/Jaccard.cjs +1 -2
- package/dist/cjs/metric/JaroWinkler.cjs +1 -2
- package/dist/cjs/metric/LCS.cjs +1 -2
- package/dist/cjs/metric/Levenshtein.cjs +1 -2
- package/dist/cjs/metric/Metric.cjs +90 -53
- package/dist/cjs/metric/NeedlemanWunsch.cjs +1 -2
- package/dist/cjs/metric/QGram.cjs +1 -2
- package/dist/cjs/metric/SmithWaterman.cjs +1 -2
- package/dist/cjs/phonetic/Caverphone.cjs +1 -2
- package/dist/cjs/phonetic/Cologne.cjs +1 -2
- package/dist/cjs/phonetic/Metaphone.cjs +1 -2
- package/dist/cjs/phonetic/Phonetic.cjs +80 -48
- package/dist/cjs/phonetic/Soundex.cjs +1 -2
- package/dist/cjs/root.cjs +6 -3
- package/dist/cjs/utils/DeepMerge.cjs +109 -99
- package/dist/cjs/utils/DiffChecker.cjs +1 -2
- package/dist/cjs/utils/Errors.cjs +106 -0
- package/dist/cjs/utils/Filter.cjs +97 -37
- package/dist/cjs/utils/HashTable.cjs +44 -30
- package/dist/cjs/utils/Normalizer.cjs +84 -35
- package/dist/cjs/utils/OptionsValidator.cjs +211 -0
- package/dist/cjs/utils/Pool.cjs +57 -19
- package/dist/cjs/utils/Profiler.cjs +41 -28
- package/dist/cjs/utils/Registry.cjs +48 -24
- package/dist/cjs/utils/StructuredData.cjs +95 -57
- package/dist/cjs/utils/TextAnalyzer.cjs +1 -2
- package/dist/esm/CmpStr.mjs +133 -61
- package/dist/esm/CmpStrAsync.mjs +56 -33
- package/dist/esm/index.mjs +1 -2
- package/dist/esm/metric/Cosine.mjs +1 -2
- package/dist/esm/metric/DamerauLevenshtein.mjs +1 -2
- package/dist/esm/metric/DiceSorensen.mjs +1 -2
- package/dist/esm/metric/Hamming.mjs +5 -4
- package/dist/esm/metric/Jaccard.mjs +1 -2
- package/dist/esm/metric/JaroWinkler.mjs +1 -2
- package/dist/esm/metric/LCS.mjs +1 -2
- package/dist/esm/metric/Levenshtein.mjs +1 -2
- package/dist/esm/metric/Metric.mjs +92 -53
- package/dist/esm/metric/NeedlemanWunsch.mjs +1 -2
- package/dist/esm/metric/QGram.mjs +1 -2
- package/dist/esm/metric/SmithWaterman.mjs +1 -2
- package/dist/esm/phonetic/Caverphone.mjs +1 -2
- package/dist/esm/phonetic/Cologne.mjs +1 -2
- package/dist/esm/phonetic/Metaphone.mjs +1 -2
- package/dist/esm/phonetic/Phonetic.mjs +83 -48
- package/dist/esm/phonetic/Soundex.mjs +1 -2
- package/dist/esm/root.mjs +5 -4
- package/dist/esm/utils/DeepMerge.mjs +109 -95
- package/dist/esm/utils/DiffChecker.mjs +1 -2
- package/dist/esm/utils/Errors.mjs +106 -0
- package/dist/esm/utils/Filter.mjs +97 -37
- package/dist/esm/utils/HashTable.mjs +44 -30
- package/dist/esm/utils/Normalizer.mjs +84 -35
- package/dist/esm/utils/OptionsValidator.mjs +210 -0
- package/dist/esm/utils/Pool.mjs +53 -19
- package/dist/esm/utils/Profiler.mjs +41 -28
- package/dist/esm/utils/Registry.mjs +48 -24
- package/dist/esm/utils/StructuredData.mjs +95 -57
- package/dist/esm/utils/TextAnalyzer.mjs +1 -2
- package/dist/types/CmpStr.d.ts +25 -14
- package/dist/types/CmpStrAsync.d.ts +4 -0
- package/dist/types/index.d.ts +3 -2
- package/dist/types/metric/Metric.d.ts +15 -14
- package/dist/types/phonetic/Phonetic.d.ts +7 -4
- package/dist/types/root.d.ts +4 -2
- package/dist/types/utils/DeepMerge.d.ts +80 -58
- package/dist/types/utils/Errors.d.ts +154 -0
- package/dist/types/utils/Filter.d.ts +8 -1
- package/dist/types/utils/HashTable.d.ts +12 -11
- package/dist/types/utils/Normalizer.d.ts +5 -1
- package/dist/types/utils/OptionsValidator.d.ts +193 -0
- package/dist/types/utils/Pool.d.ts +2 -0
- package/dist/types/utils/Profiler.d.ts +9 -28
- package/dist/types/utils/Registry.d.ts +3 -3
- package/dist/types/utils/StructuredData.d.ts +6 -1
- package/dist/types/utils/Types.d.ts +39 -1
- package/package.json +20 -11
- package/dist/CmpStr.esm.js.map +0 -1
- package/dist/CmpStr.esm.min.js.map +0 -1
- package/dist/CmpStr.umd.js.map +0 -1
- package/dist/CmpStr.umd.min.js.map +0 -1
- package/dist/cjs/CmpStr.cjs.map +0 -1
- package/dist/cjs/CmpStrAsync.cjs.map +0 -1
- package/dist/cjs/index.cjs.map +0 -1
- package/dist/cjs/metric/Cosine.cjs.map +0 -1
- package/dist/cjs/metric/DamerauLevenshtein.cjs.map +0 -1
- package/dist/cjs/metric/DiceSorensen.cjs.map +0 -1
- package/dist/cjs/metric/Hamming.cjs.map +0 -1
- package/dist/cjs/metric/Jaccard.cjs.map +0 -1
- package/dist/cjs/metric/JaroWinkler.cjs.map +0 -1
- package/dist/cjs/metric/LCS.cjs.map +0 -1
- package/dist/cjs/metric/Levenshtein.cjs.map +0 -1
- package/dist/cjs/metric/Metric.cjs.map +0 -1
- package/dist/cjs/metric/NeedlemanWunsch.cjs.map +0 -1
- package/dist/cjs/metric/QGram.cjs.map +0 -1
- package/dist/cjs/metric/SmithWaterman.cjs.map +0 -1
- package/dist/cjs/phonetic/Caverphone.cjs.map +0 -1
- package/dist/cjs/phonetic/Cologne.cjs.map +0 -1
- package/dist/cjs/phonetic/Metaphone.cjs.map +0 -1
- package/dist/cjs/phonetic/Phonetic.cjs.map +0 -1
- package/dist/cjs/phonetic/Soundex.cjs.map +0 -1
- package/dist/cjs/root.cjs.map +0 -1
- package/dist/cjs/utils/DeepMerge.cjs.map +0 -1
- package/dist/cjs/utils/DiffChecker.cjs.map +0 -1
- package/dist/cjs/utils/Filter.cjs.map +0 -1
- package/dist/cjs/utils/HashTable.cjs.map +0 -1
- package/dist/cjs/utils/Normalizer.cjs.map +0 -1
- package/dist/cjs/utils/Pool.cjs.map +0 -1
- package/dist/cjs/utils/Profiler.cjs.map +0 -1
- package/dist/cjs/utils/Registry.cjs.map +0 -1
- package/dist/cjs/utils/StructuredData.cjs.map +0 -1
- package/dist/cjs/utils/TextAnalyzer.cjs.map +0 -1
- package/dist/esm/CmpStr.mjs.map +0 -1
- package/dist/esm/CmpStrAsync.mjs.map +0 -1
- package/dist/esm/index.mjs.map +0 -1
- package/dist/esm/metric/Cosine.mjs.map +0 -1
- package/dist/esm/metric/DamerauLevenshtein.mjs.map +0 -1
- package/dist/esm/metric/DiceSorensen.mjs.map +0 -1
- package/dist/esm/metric/Hamming.mjs.map +0 -1
- package/dist/esm/metric/Jaccard.mjs.map +0 -1
- package/dist/esm/metric/JaroWinkler.mjs.map +0 -1
- package/dist/esm/metric/LCS.mjs.map +0 -1
- package/dist/esm/metric/Levenshtein.mjs.map +0 -1
- package/dist/esm/metric/Metric.mjs.map +0 -1
- package/dist/esm/metric/NeedlemanWunsch.mjs.map +0 -1
- package/dist/esm/metric/QGram.mjs.map +0 -1
- package/dist/esm/metric/SmithWaterman.mjs.map +0 -1
- package/dist/esm/phonetic/Caverphone.mjs.map +0 -1
- package/dist/esm/phonetic/Cologne.mjs.map +0 -1
- package/dist/esm/phonetic/Metaphone.mjs.map +0 -1
- package/dist/esm/phonetic/Phonetic.mjs.map +0 -1
- package/dist/esm/phonetic/Soundex.mjs.map +0 -1
- package/dist/esm/root.mjs.map +0 -1
- package/dist/esm/utils/DeepMerge.mjs.map +0 -1
- package/dist/esm/utils/DiffChecker.mjs.map +0 -1
- package/dist/esm/utils/Filter.mjs.map +0 -1
- package/dist/esm/utils/HashTable.mjs.map +0 -1
- package/dist/esm/utils/Normalizer.mjs.map +0 -1
- package/dist/esm/utils/Pool.mjs.map +0 -1
- package/dist/esm/utils/Profiler.mjs.map +0 -1
- package/dist/esm/utils/Registry.mjs.map +0 -1
- package/dist/esm/utils/StructuredData.mjs.map +0 -1
- package/dist/esm/utils/TextAnalyzer.mjs.map +0 -1
package/dist/CmpStr.esm.js
CHANGED
|
@@ -1,117 +1,228 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* CmpStr v3.
|
|
2
|
+
* CmpStr v3.3.0 build-3699f85-260318
|
|
3
3
|
* This is a lightweight, fast and well performing library for calculating string similarity.
|
|
4
4
|
* (c) 2023-2026 Paul Köhler @komed3 / MIT License
|
|
5
5
|
* Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
|
|
6
6
|
*/
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
.
|
|
14
|
-
.
|
|
15
|
-
.
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
|
|
7
|
+
class CmpStrError extends Error {
|
|
8
|
+
code;
|
|
9
|
+
meta;
|
|
10
|
+
when = new Date().toISOString();
|
|
11
|
+
constructor(code, message, meta, cause) {
|
|
12
|
+
super(message, cause !== undefined ? { cause } : undefined);
|
|
13
|
+
this.name = this.constructor.name;
|
|
14
|
+
this.code = code;
|
|
15
|
+
this.meta = meta;
|
|
16
|
+
if (typeof Error.captureStackTrace === 'function') {
|
|
17
|
+
Error.captureStackTrace(this, this.constructor);
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
format(stack = false) {
|
|
21
|
+
const parts = [`${this.name} [${this.code}]`, this.message];
|
|
22
|
+
if (this.meta)
|
|
23
|
+
for (const _ in this.meta) {
|
|
24
|
+
parts.push(JSON.stringify(this.meta));
|
|
25
|
+
break;
|
|
26
|
+
}
|
|
27
|
+
return (
|
|
28
|
+
parts.join(' - ') +
|
|
29
|
+
(stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
|
|
30
|
+
);
|
|
31
|
+
}
|
|
32
|
+
toString() {
|
|
33
|
+
return this.format(false);
|
|
34
|
+
}
|
|
35
|
+
toJSON(stack = false) {
|
|
36
|
+
return {
|
|
37
|
+
name: this.name,
|
|
38
|
+
code: this.code,
|
|
39
|
+
message: this.message,
|
|
40
|
+
meta: this.meta,
|
|
41
|
+
when: this.when,
|
|
42
|
+
cause:
|
|
43
|
+
this.cause instanceof Error
|
|
44
|
+
? {
|
|
45
|
+
name: this.cause.name,
|
|
46
|
+
message: this.cause.message,
|
|
47
|
+
stack: stack && this.cause.stack
|
|
48
|
+
}
|
|
49
|
+
: this.cause
|
|
50
|
+
};
|
|
51
|
+
}
|
|
21
52
|
}
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
if (o == null || !(k in o)) return fb;
|
|
26
|
-
o = o[k];
|
|
53
|
+
class CmpStrValidationError extends CmpStrError {
|
|
54
|
+
constructor(message, meta, cause) {
|
|
55
|
+
super('E_VALIDATION', message, meta, cause);
|
|
27
56
|
}
|
|
28
|
-
return o;
|
|
29
57
|
}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
if (o == null || !(k in o)) return false;
|
|
34
|
-
o = o[k];
|
|
58
|
+
class CmpStrNotFoundError extends CmpStrError {
|
|
59
|
+
constructor(message, meta, cause) {
|
|
60
|
+
super('E_NOT_FOUND', message, meta, cause);
|
|
35
61
|
}
|
|
36
|
-
return true;
|
|
37
62
|
}
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
if (t !== undefined && (typeof t !== 'object' || t === null))
|
|
42
|
-
throw Error(`Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`);
|
|
43
|
-
const root = t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
|
|
44
|
-
let cur = root;
|
|
45
|
-
for (let i = 0; i < keys.length - 1; i++) {
|
|
46
|
-
const k = keys[i];
|
|
47
|
-
let n = cur[k];
|
|
48
|
-
if (n != null && typeof n !== 'object')
|
|
49
|
-
throw Error(
|
|
50
|
-
`Cannot set property <${keys[i + 1]}> of <${JSON.stringify(n)}>`
|
|
51
|
-
);
|
|
52
|
-
if (n == null)
|
|
53
|
-
n = cur[k] = typeof keys[i + 1] === 'number' ? [] : Object.create(null);
|
|
54
|
-
cur = n;
|
|
63
|
+
class CmpStrUsageError extends CmpStrError {
|
|
64
|
+
constructor(message, meta, cause) {
|
|
65
|
+
super('E_USAGE', message, meta, cause);
|
|
55
66
|
}
|
|
56
|
-
cur[keys[keys.length - 1]] = value;
|
|
57
|
-
return root;
|
|
58
67
|
}
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
) {
|
|
64
|
-
const target = t ?? Object.create(null);
|
|
65
|
-
Object.keys(o).forEach((k) => {
|
|
66
|
-
const val = o[k];
|
|
67
|
-
if (!mergeUndefined && val === undefined) return;
|
|
68
|
-
if (k === '__proto__' || k === 'constructor') return;
|
|
69
|
-
if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
|
|
70
|
-
const existing = target[k];
|
|
71
|
-
target[k] = merge(
|
|
72
|
-
existing !== null &&
|
|
73
|
-
typeof existing === 'object' &&
|
|
74
|
-
!Array.isArray(existing)
|
|
75
|
-
? existing
|
|
76
|
-
: Object.create(null),
|
|
77
|
-
val,
|
|
78
|
-
mergeUndefined
|
|
79
|
-
);
|
|
80
|
-
} else target[k] = val;
|
|
81
|
-
});
|
|
82
|
-
return target;
|
|
68
|
+
class CmpStrInternalError extends CmpStrError {
|
|
69
|
+
constructor(message, meta, cause) {
|
|
70
|
+
super('E_INTERNAL', message, meta, cause);
|
|
71
|
+
}
|
|
83
72
|
}
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
if (
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
73
|
+
class ErrorUtil {
|
|
74
|
+
static assert(condition, message, meta) {
|
|
75
|
+
if (!condition) throw new CmpStrUsageError(message, meta);
|
|
76
|
+
}
|
|
77
|
+
static rethrow(err, message, meta) {
|
|
78
|
+
if (err instanceof CmpStrError) throw err;
|
|
79
|
+
throw new CmpStrInternalError(message, meta, err);
|
|
80
|
+
}
|
|
81
|
+
static format(err) {
|
|
82
|
+
if (err instanceof CmpStrError) return err.toString();
|
|
83
|
+
if (err instanceof Error) return `${err.name}: ${err.message}`;
|
|
84
|
+
return String(err);
|
|
85
|
+
}
|
|
86
|
+
static wrap(fn, message, meta) {
|
|
87
|
+
try {
|
|
88
|
+
return fn();
|
|
89
|
+
} catch (err) {
|
|
90
|
+
if (err instanceof CmpStrError) throw err;
|
|
91
|
+
throw new CmpStrInternalError(message, meta, err);
|
|
99
92
|
}
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
93
|
+
}
|
|
94
|
+
static async wrapAsync(fn, message, meta) {
|
|
95
|
+
try {
|
|
96
|
+
return await fn();
|
|
97
|
+
} catch (err) {
|
|
98
|
+
if (err instanceof CmpStrError) throw err;
|
|
99
|
+
throw new CmpStrInternalError(message, meta, err);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
104
102
|
}
|
|
105
103
|
|
|
106
|
-
var
|
|
104
|
+
var Errors = /*#__PURE__*/ Object.freeze({
|
|
107
105
|
__proto__: null,
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
106
|
+
CmpStrError: CmpStrError,
|
|
107
|
+
CmpStrInternalError: CmpStrInternalError,
|
|
108
|
+
CmpStrNotFoundError: CmpStrNotFoundError,
|
|
109
|
+
CmpStrUsageError: CmpStrUsageError,
|
|
110
|
+
CmpStrValidationError: CmpStrValidationError,
|
|
111
|
+
ErrorUtil: ErrorUtil
|
|
113
112
|
});
|
|
114
113
|
|
|
114
|
+
class DeepMerge {
|
|
115
|
+
static BRACKET_PATTERN = /\[(\d+)]/g;
|
|
116
|
+
static PATH_CACHE = new Map();
|
|
117
|
+
static walk(obj, keys) {
|
|
118
|
+
let o = obj;
|
|
119
|
+
for (let i = 0; i < keys.length; i++) {
|
|
120
|
+
const k = keys[i];
|
|
121
|
+
if (o == null || !(k in o)) return { exists: false };
|
|
122
|
+
o = o[k];
|
|
123
|
+
}
|
|
124
|
+
return { exists: true, value: o };
|
|
125
|
+
}
|
|
126
|
+
static parse(p) {
|
|
127
|
+
const cached = DeepMerge.PATH_CACHE.get(p);
|
|
128
|
+
if (cached) return cached;
|
|
129
|
+
const parsed = p
|
|
130
|
+
.replace(DeepMerge.BRACKET_PATTERN, '.$1')
|
|
131
|
+
.split('.')
|
|
132
|
+
.map((s) => {
|
|
133
|
+
const n = Number(s);
|
|
134
|
+
return Number.isInteger(n) && String(n) === s ? n : s;
|
|
135
|
+
});
|
|
136
|
+
if (DeepMerge.PATH_CACHE.size > 2000) DeepMerge.PATH_CACHE.clear();
|
|
137
|
+
DeepMerge.PATH_CACHE.set(p, parsed);
|
|
138
|
+
return parsed;
|
|
139
|
+
}
|
|
140
|
+
static has(t, path) {
|
|
141
|
+
return DeepMerge.walk(t, DeepMerge.parse(path)).exists;
|
|
142
|
+
}
|
|
143
|
+
static get(t, path, fb) {
|
|
144
|
+
const r = DeepMerge.walk(t, DeepMerge.parse(path));
|
|
145
|
+
return r.exists ? r.value : fb;
|
|
146
|
+
}
|
|
147
|
+
static set(t, path, value) {
|
|
148
|
+
if (path === '') return value;
|
|
149
|
+
const keys = DeepMerge.parse(path);
|
|
150
|
+
ErrorUtil.assert(
|
|
151
|
+
t === undefined || (typeof t === 'object' && t !== null),
|
|
152
|
+
`Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`,
|
|
153
|
+
{ path: keys[0], target: t }
|
|
154
|
+
);
|
|
155
|
+
const root = t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
|
|
156
|
+
let cur = root;
|
|
157
|
+
for (let i = 0; i < keys.length - 1; i++) {
|
|
158
|
+
const k = keys[i];
|
|
159
|
+
let n = cur[k];
|
|
160
|
+
ErrorUtil.assert(
|
|
161
|
+
n == null || typeof n === 'object',
|
|
162
|
+
`Cannot set property <${keys[i + 1]}> of <${JSON.stringify(n)}>`,
|
|
163
|
+
{ path: keys.slice(0, i + 2), value: n }
|
|
164
|
+
);
|
|
165
|
+
if (n == null)
|
|
166
|
+
n = cur[k] = typeof keys[i + 1] === 'number' ? [] : Object.create(null);
|
|
167
|
+
cur = n;
|
|
168
|
+
}
|
|
169
|
+
cur[keys[keys.length - 1]] = value;
|
|
170
|
+
return root;
|
|
171
|
+
}
|
|
172
|
+
static rmv(t, path, preserveEmpty = false) {
|
|
173
|
+
const keys = DeepMerge.parse(path);
|
|
174
|
+
const remove = (obj, i = 0) => {
|
|
175
|
+
const key = keys[i];
|
|
176
|
+
if (!obj || typeof obj !== 'object') return false;
|
|
177
|
+
if (i === keys.length - 1) return delete obj[key];
|
|
178
|
+
if (!remove(obj[key], i + 1)) return false;
|
|
179
|
+
if (!preserveEmpty) {
|
|
180
|
+
const val = obj[key];
|
|
181
|
+
let empty = true;
|
|
182
|
+
if (typeof val === 'object') {
|
|
183
|
+
if (Array.isArray(val))
|
|
184
|
+
for (let i = 0; i < val.length; i++) {
|
|
185
|
+
if (val[i] != null) {
|
|
186
|
+
empty = false;
|
|
187
|
+
break;
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
else empty = false;
|
|
191
|
+
}
|
|
192
|
+
if (empty) delete obj[key];
|
|
193
|
+
}
|
|
194
|
+
return true;
|
|
195
|
+
};
|
|
196
|
+
remove(t);
|
|
197
|
+
return t;
|
|
198
|
+
}
|
|
199
|
+
static merge(
|
|
200
|
+
t = Object.create(null),
|
|
201
|
+
o = Object.create(null),
|
|
202
|
+
mergeUndefined = false
|
|
203
|
+
) {
|
|
204
|
+
const target = t ?? Object.create(null);
|
|
205
|
+
for (const k in o) {
|
|
206
|
+
const val = o[k];
|
|
207
|
+
if (!mergeUndefined && val === undefined) continue;
|
|
208
|
+
if (k === '__proto__' || k === 'constructor') continue;
|
|
209
|
+
if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
|
|
210
|
+
const existing = target[k];
|
|
211
|
+
target[k] = DeepMerge.merge(
|
|
212
|
+
existing !== null &&
|
|
213
|
+
typeof existing === 'object' &&
|
|
214
|
+
!Array.isArray(existing)
|
|
215
|
+
? existing
|
|
216
|
+
: Object.create(null),
|
|
217
|
+
val,
|
|
218
|
+
mergeUndefined
|
|
219
|
+
);
|
|
220
|
+
} else target[k] = val;
|
|
221
|
+
}
|
|
222
|
+
return target;
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
115
226
|
class DiffChecker {
|
|
116
227
|
a;
|
|
117
228
|
b;
|
|
@@ -408,48 +519,88 @@ class DiffChecker {
|
|
|
408
519
|
}
|
|
409
520
|
|
|
410
521
|
class Filter {
|
|
522
|
+
static IDENTITY = (s) => s;
|
|
411
523
|
static filters = new Map();
|
|
412
524
|
static pipeline = new Map();
|
|
413
|
-
static getPipeline(hook) {
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
525
|
+
static getPipeline(hook, force = false) {
|
|
526
|
+
return ErrorUtil.wrap(
|
|
527
|
+
() => {
|
|
528
|
+
if (!force) {
|
|
529
|
+
const cached = Filter.pipeline.get(hook);
|
|
530
|
+
if (cached) return cached;
|
|
531
|
+
}
|
|
532
|
+
const filter = Filter.filters.get(hook);
|
|
533
|
+
if (!filter) {
|
|
534
|
+
Filter.pipeline.set(hook, Filter.IDENTITY);
|
|
535
|
+
return Filter.IDENTITY;
|
|
536
|
+
}
|
|
537
|
+
const pipeline = [];
|
|
538
|
+
for (const f of filter.values()) if (f.active) pipeline.push(f);
|
|
539
|
+
pipeline.sort((a, b) => a.priority - b.priority);
|
|
540
|
+
const fn =
|
|
541
|
+
pipeline.length === 0
|
|
542
|
+
? Filter.IDENTITY
|
|
543
|
+
: (input) => {
|
|
544
|
+
let v = input;
|
|
545
|
+
for (let i = 0; i < pipeline.length; i++) v = pipeline[i].fn(v);
|
|
546
|
+
return v;
|
|
547
|
+
};
|
|
548
|
+
Filter.pipeline.set(hook, fn);
|
|
549
|
+
return fn;
|
|
550
|
+
},
|
|
551
|
+
`Error compiling filter pipeline for hook <${hook}>`,
|
|
552
|
+
{ hook }
|
|
553
|
+
);
|
|
425
554
|
}
|
|
426
555
|
static has(hook, id) {
|
|
427
556
|
return !!Filter.filters.get(hook)?.has(id);
|
|
428
557
|
}
|
|
429
558
|
static add(hook, id, fn, opt = {}) {
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
559
|
+
return ErrorUtil.wrap(
|
|
560
|
+
() => {
|
|
561
|
+
const { priority = 10, active = true, overrideable = true } = opt;
|
|
562
|
+
const filter = Filter.filters.get(hook) ?? new Map();
|
|
563
|
+
const index = filter.get(id);
|
|
564
|
+
if (index && !index.overrideable) return false;
|
|
565
|
+
if (
|
|
566
|
+
index &&
|
|
567
|
+
index.fn === fn &&
|
|
568
|
+
index.priority === priority &&
|
|
569
|
+
index.active === active
|
|
570
|
+
)
|
|
571
|
+
return true;
|
|
572
|
+
filter.set(id, { id, fn, priority, active, overrideable });
|
|
573
|
+
Filter.filters.set(hook, filter);
|
|
574
|
+
Filter.getPipeline(hook, true);
|
|
575
|
+
return true;
|
|
576
|
+
},
|
|
577
|
+
`Error adding filter <${id}> to hook <${hook}>`,
|
|
578
|
+
{ hook, id, opt }
|
|
579
|
+
);
|
|
438
580
|
}
|
|
439
581
|
static remove(hook, id) {
|
|
440
|
-
Filter.pipeline.delete(hook);
|
|
441
582
|
const filter = Filter.filters.get(hook);
|
|
442
|
-
|
|
583
|
+
if (!filter || !filter.delete(id)) return false;
|
|
584
|
+
Filter.getPipeline(hook, true);
|
|
585
|
+
return true;
|
|
443
586
|
}
|
|
444
587
|
static pause(hook, id) {
|
|
445
|
-
Filter.
|
|
446
|
-
|
|
447
|
-
|
|
588
|
+
const filter = Filter.filters.get(hook);
|
|
589
|
+
if (!filter) return false;
|
|
590
|
+
const f = filter.get(id);
|
|
591
|
+
if (!f || !f.active) return false;
|
|
592
|
+
f.active = false;
|
|
593
|
+
Filter.getPipeline(hook, true);
|
|
594
|
+
return true;
|
|
448
595
|
}
|
|
449
596
|
static resume(hook, id) {
|
|
450
|
-
Filter.
|
|
451
|
-
|
|
452
|
-
|
|
597
|
+
const filter = Filter.filters.get(hook);
|
|
598
|
+
if (!filter) return false;
|
|
599
|
+
const f = filter.get(id);
|
|
600
|
+
if (!f || f.active) return false;
|
|
601
|
+
f.active = true;
|
|
602
|
+
Filter.getPipeline(hook, true);
|
|
603
|
+
return true;
|
|
453
604
|
}
|
|
454
605
|
static list(hook, active = false) {
|
|
455
606
|
const filter = Filter.filters.get(hook);
|
|
@@ -459,17 +610,36 @@ class Filter {
|
|
|
459
610
|
return out;
|
|
460
611
|
}
|
|
461
612
|
static apply(hook, input) {
|
|
462
|
-
|
|
463
|
-
|
|
613
|
+
return ErrorUtil.wrap(
|
|
614
|
+
() => {
|
|
615
|
+
const fn = Filter.getPipeline(hook);
|
|
616
|
+
if (typeof input === 'string') return fn(input);
|
|
617
|
+
const arr = input;
|
|
618
|
+
const out = new Array(arr.length);
|
|
619
|
+
for (let i = 0; i < arr.length; i++) out[i] = fn(arr[i]);
|
|
620
|
+
return out;
|
|
621
|
+
},
|
|
622
|
+
`Error applying filters for hook <${hook}>`,
|
|
623
|
+
{ hook, input }
|
|
624
|
+
);
|
|
464
625
|
}
|
|
465
626
|
static async applyAsync(hook, input) {
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
627
|
+
return ErrorUtil.wrapAsync(
|
|
628
|
+
async () => {
|
|
629
|
+
const fn = Filter.getPipeline(hook);
|
|
630
|
+
if (typeof input === 'string') return Promise.resolve(fn(input));
|
|
631
|
+
const arr = input;
|
|
632
|
+
const out = new Array(arr.length);
|
|
633
|
+
for (let i = 0; i < arr.length; i++)
|
|
634
|
+
out[i] = Promise.resolve(fn(arr[i]));
|
|
635
|
+
return Promise.all(out);
|
|
636
|
+
},
|
|
637
|
+
`Error applying filters for hook <${hook}>`,
|
|
638
|
+
{ hook, input }
|
|
639
|
+
);
|
|
470
640
|
}
|
|
471
641
|
static clear(hook) {
|
|
472
|
-
Filter.
|
|
642
|
+
Filter.clearPipeline();
|
|
473
643
|
if (hook) Filter.filters.delete(hook);
|
|
474
644
|
else Filter.filters.clear();
|
|
475
645
|
}
|
|
@@ -483,25 +653,21 @@ class Hasher {
|
|
|
483
653
|
static HASH_OFFSET = 0x811c9dc5;
|
|
484
654
|
static fastFNV1a(str) {
|
|
485
655
|
const len = str.length;
|
|
656
|
+
const limit = len & -4;
|
|
486
657
|
let hash = this.HASH_OFFSET;
|
|
487
|
-
|
|
488
|
-
for (
|
|
489
|
-
const pos = i * 4;
|
|
658
|
+
let i = 0;
|
|
659
|
+
for (; i < limit; i += 4) {
|
|
490
660
|
const chunk =
|
|
491
|
-
str.charCodeAt(
|
|
492
|
-
(str.charCodeAt(
|
|
493
|
-
(str.charCodeAt(
|
|
494
|
-
(str.charCodeAt(
|
|
661
|
+
str.charCodeAt(i) |
|
|
662
|
+
(str.charCodeAt(i + 1) << 8) |
|
|
663
|
+
(str.charCodeAt(i + 2) << 16) |
|
|
664
|
+
(str.charCodeAt(i + 3) << 24);
|
|
495
665
|
hash ^= chunk;
|
|
496
666
|
hash = Math.imul(hash, this.FNV_PRIME);
|
|
497
667
|
}
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
for (let i = 0; i < remaining; i++) {
|
|
502
|
-
hash ^= str.charCodeAt(pos + i);
|
|
503
|
-
hash = Math.imul(hash, this.FNV_PRIME);
|
|
504
|
-
}
|
|
668
|
+
for (; i < len; i++) {
|
|
669
|
+
hash ^= str.charCodeAt(i);
|
|
670
|
+
hash = Math.imul(hash, this.FNV_PRIME);
|
|
505
671
|
}
|
|
506
672
|
hash ^= hash >>> 16;
|
|
507
673
|
hash *= 0x85ebca6b;
|
|
@@ -512,32 +678,51 @@ class Hasher {
|
|
|
512
678
|
}
|
|
513
679
|
}
|
|
514
680
|
class HashTable {
|
|
515
|
-
|
|
681
|
+
FIFO;
|
|
682
|
+
maxSize;
|
|
516
683
|
static MAX_LEN = 2048;
|
|
517
|
-
static TABLE_SIZE = 10_000;
|
|
518
684
|
table = new Map();
|
|
519
|
-
constructor(
|
|
520
|
-
this.
|
|
685
|
+
constructor(FIFO = true, maxSize = 10000) {
|
|
686
|
+
this.FIFO = FIFO;
|
|
687
|
+
this.maxSize = maxSize;
|
|
521
688
|
}
|
|
522
689
|
key(label, strs, sorted = false) {
|
|
523
|
-
|
|
524
|
-
const hashes =
|
|
525
|
-
|
|
690
|
+
const n = strs.length;
|
|
691
|
+
const hashes = new Array(n);
|
|
692
|
+
for (let i = 0; i < n; i++) {
|
|
693
|
+
const s = strs[i];
|
|
694
|
+
if (s.length > HashTable.MAX_LEN) return false;
|
|
695
|
+
hashes[i] = Hasher.fastFNV1a(s);
|
|
696
|
+
}
|
|
697
|
+
if (sorted) hashes.sort((a, b) => a - b);
|
|
698
|
+
let key = label;
|
|
699
|
+
for (let i = 0; i < hashes.length; i++) key += '-' + hashes[i];
|
|
700
|
+
return key;
|
|
701
|
+
}
|
|
702
|
+
has(key) {
|
|
703
|
+
return this.table.has(key);
|
|
704
|
+
}
|
|
705
|
+
get(key) {
|
|
706
|
+
return this.table.get(key);
|
|
526
707
|
}
|
|
527
|
-
has = (key) => this.table.has(key);
|
|
528
|
-
get = (key) => this.table.get(key);
|
|
529
708
|
set(key, entry, update = true) {
|
|
530
709
|
if (!update && this.table.has(key)) return false;
|
|
531
|
-
|
|
532
|
-
if (!this.
|
|
710
|
+
if (!this.table.has(key) && this.table.size >= this.maxSize) {
|
|
711
|
+
if (!this.FIFO) return false;
|
|
533
712
|
this.table.delete(this.table.keys().next().value);
|
|
534
713
|
}
|
|
535
714
|
this.table.set(key, entry);
|
|
536
715
|
return true;
|
|
537
716
|
}
|
|
538
|
-
delete
|
|
539
|
-
|
|
540
|
-
|
|
717
|
+
delete(key) {
|
|
718
|
+
return this.table.delete(key);
|
|
719
|
+
}
|
|
720
|
+
clear() {
|
|
721
|
+
this.table.clear();
|
|
722
|
+
}
|
|
723
|
+
size() {
|
|
724
|
+
return this.table.size;
|
|
725
|
+
}
|
|
541
726
|
}
|
|
542
727
|
|
|
543
728
|
class Normalizer {
|
|
@@ -554,42 +739,91 @@ class Normalizer {
|
|
|
554
739
|
return Array.from(new Set(flags)).sort().join('');
|
|
555
740
|
}
|
|
556
741
|
static getPipeline(flags) {
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
742
|
+
return ErrorUtil.wrap(
|
|
743
|
+
() => {
|
|
744
|
+
const cached = Normalizer.pipeline.get(flags);
|
|
745
|
+
if (cached) return cached;
|
|
746
|
+
const { REGEX } = Normalizer;
|
|
747
|
+
const steps = [];
|
|
748
|
+
for (let i = 0; i < flags.length; i++) {
|
|
749
|
+
switch (flags[i]) {
|
|
750
|
+
case 'd':
|
|
751
|
+
steps.push((s) => s.normalize('NFD'));
|
|
752
|
+
break;
|
|
753
|
+
case 'i':
|
|
754
|
+
steps.push((s) => s.toLowerCase());
|
|
755
|
+
break;
|
|
756
|
+
case 'k':
|
|
757
|
+
steps.push((s) => s.replace(REGEX.nonLetters, ''));
|
|
758
|
+
break;
|
|
759
|
+
case 'n':
|
|
760
|
+
steps.push((s) => s.replace(REGEX.nonNumbers, ''));
|
|
761
|
+
break;
|
|
762
|
+
case 'r':
|
|
763
|
+
steps.push((s) => s.replace(REGEX.doubleChars, '$1'));
|
|
764
|
+
break;
|
|
765
|
+
case 's':
|
|
766
|
+
steps.push((s) => s.replace(REGEX.specialChars, ''));
|
|
767
|
+
break;
|
|
768
|
+
case 't':
|
|
769
|
+
steps.push((s) => s.trim());
|
|
770
|
+
break;
|
|
771
|
+
case 'u':
|
|
772
|
+
steps.push((s) => s.normalize('NFC'));
|
|
773
|
+
break;
|
|
774
|
+
case 'w':
|
|
775
|
+
steps.push((s) => s.replace(REGEX.whitespace, ' '));
|
|
776
|
+
break;
|
|
777
|
+
case 'x':
|
|
778
|
+
steps.push((s) => s.normalize('NFKC'));
|
|
779
|
+
break;
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
const fn = (input) => {
|
|
783
|
+
let v = input;
|
|
784
|
+
for (let i = 0; i < steps.length; i++) v = steps[i](v);
|
|
785
|
+
return v;
|
|
786
|
+
};
|
|
787
|
+
Normalizer.pipeline.set(flags, fn);
|
|
788
|
+
return fn;
|
|
789
|
+
},
|
|
790
|
+
`Failed to create normalization pipeline for flags: ${flags}`,
|
|
791
|
+
{ flags }
|
|
792
|
+
);
|
|
793
|
+
}
|
|
794
|
+
static normalize(input, flags, normalizedFlags) {
|
|
795
|
+
return ErrorUtil.wrap(
|
|
796
|
+
() => {
|
|
797
|
+
if (!flags || typeof flags !== 'string' || !input) return input;
|
|
798
|
+
flags = normalizedFlags ?? this.canonicalFlags(flags);
|
|
799
|
+
const pipeline = Normalizer.getPipeline(flags);
|
|
800
|
+
const normalizeOne = (s) => {
|
|
801
|
+
const key = Normalizer.cache.key(flags, [s]);
|
|
802
|
+
if (key && Normalizer.cache.has(key))
|
|
803
|
+
return Normalizer.cache.get(key);
|
|
804
|
+
const res = pipeline(s);
|
|
805
|
+
if (key) Normalizer.cache.set(key, res);
|
|
806
|
+
return res;
|
|
807
|
+
};
|
|
808
|
+
return Array.isArray(input)
|
|
809
|
+
? input.map(normalizeOne)
|
|
810
|
+
: normalizeOne(input);
|
|
811
|
+
},
|
|
812
|
+
`Failed to normalize input with flags: ${flags}`,
|
|
813
|
+
{ input, flags }
|
|
814
|
+
);
|
|
588
815
|
}
|
|
589
816
|
static async normalizeAsync(input, flags) {
|
|
590
|
-
return await
|
|
591
|
-
|
|
592
|
-
|
|
817
|
+
return await ErrorUtil.wrapAsync(
|
|
818
|
+
async () => {
|
|
819
|
+
if (!flags || typeof flags !== 'string' || !input) return input;
|
|
820
|
+
return await (Array.isArray(input)
|
|
821
|
+
? Promise.all(input.map((s) => Normalizer.normalize(s, flags)))
|
|
822
|
+
: Promise.resolve(Normalizer.normalize(input, flags)));
|
|
823
|
+
},
|
|
824
|
+
`Failed to asynchronously normalize input with flags: ${flags}`,
|
|
825
|
+
{ input, flags }
|
|
826
|
+
);
|
|
593
827
|
}
|
|
594
828
|
static clear() {
|
|
595
829
|
Normalizer.pipeline.clear();
|
|
@@ -597,17 +831,143 @@ class Normalizer {
|
|
|
597
831
|
}
|
|
598
832
|
}
|
|
599
833
|
|
|
834
|
+
class RingPool {
|
|
835
|
+
maxSize;
|
|
836
|
+
buffers = [];
|
|
837
|
+
pointer = 0;
|
|
838
|
+
constructor(maxSize) {
|
|
839
|
+
this.maxSize = maxSize;
|
|
840
|
+
}
|
|
841
|
+
acquire(minSize, allowOversize) {
|
|
842
|
+
return ErrorUtil.wrap(
|
|
843
|
+
() => {
|
|
844
|
+
const buffers = this.buffers;
|
|
845
|
+
const len = buffers.length;
|
|
846
|
+
for (let i = 0; i < len; i++) {
|
|
847
|
+
const idx = (this.pointer + i) % len;
|
|
848
|
+
const item = buffers[idx];
|
|
849
|
+
const size = item.size;
|
|
850
|
+
if (size >= minSize && (allowOversize || size === minSize)) {
|
|
851
|
+
this.pointer = (idx + 1) % len;
|
|
852
|
+
return item;
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
return null;
|
|
856
|
+
},
|
|
857
|
+
`Failed to acquire buffer of size >= ${minSize} from pool`,
|
|
858
|
+
{ minSize, allowOversize }
|
|
859
|
+
);
|
|
860
|
+
}
|
|
861
|
+
release(item) {
|
|
862
|
+
ErrorUtil.wrap(
|
|
863
|
+
() => {
|
|
864
|
+
const buffers = this.buffers;
|
|
865
|
+
if (buffers.length < this.maxSize) {
|
|
866
|
+
buffers.push(item);
|
|
867
|
+
return;
|
|
868
|
+
}
|
|
869
|
+
buffers[this.pointer] = item;
|
|
870
|
+
this.pointer = (this.pointer + 1) % this.maxSize;
|
|
871
|
+
},
|
|
872
|
+
`Failed to release buffer back to pool`,
|
|
873
|
+
{ item }
|
|
874
|
+
);
|
|
875
|
+
}
|
|
876
|
+
clear() {
|
|
877
|
+
this.buffers = [];
|
|
878
|
+
this.pointer = 0;
|
|
879
|
+
}
|
|
880
|
+
}
|
|
881
|
+
/**
 * Static facade over one RingPool per supported buffer type.
 *
 * Requests larger than a type's `maxItemSize` bypass the pool entirely:
 * they are freshly allocated on acquire and discarded on release.
 */
class Pool {
  /** Per-type settings: pool capacity, largest pooled item, oversize policy. */
  static CONFIG = {
    int32: { type: 'int32', maxSize: 64, maxItemSize: 2048, allowOversize: true },
    'arr[]': { type: 'arr[]', maxSize: 4, maxItemSize: 1024, allowOversize: false },
    'number[]': { type: 'number[]', maxSize: 16, maxItemSize: 1024, allowOversize: false },
    'string[]': { type: 'string[]', maxSize: 2, maxItemSize: 1024, allowOversize: false },
    set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
    map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
  };

  /** One RingPool per configured type, sized by that type's `maxSize`. */
  static POOLS = Object.fromEntries(
    Object.entries(Pool.CONFIG).map(([type, { maxSize }]) => [
      type,
      new RingPool(maxSize)
    ])
  );

  /**
   * Creates a fresh container for the given type.
   *
   * @param {string} type - One of the keys of `Pool.CONFIG`.
   * @param {number} size - Length for array-like containers (ignored for set/map).
   * @returns {*} The new container, or `undefined` for an unknown type.
   */
  static allocate(type, size) {
    if (type === 'int32') return new Int32Array(size);
    if (type === 'number[]') return new Float64Array(size);
    if (type === 'arr[]' || type === 'string[]') return new Array(size);
    if (type === 'set') return new Set();
    if (type === 'map') return new Map();
    return undefined;
  }

  /**
   * Returns a pooled container when one fits, otherwise a new one.
   *
   * @param {string} type - Pool type.
   * @param {number} size - Requested size.
   * @returns {*} A container of at least the requested size; for `int32`
   *   a view trimmed to exactly `size` elements.
   * @throws {CmpStrUsageError} If `type` is not configured.
   */
  static acquire(type, size) {
    const settings = this.CONFIG[type];
    if (!settings) {
      throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
    }
    // Too large to pool: hand out a one-off allocation.
    if (size > settings.maxItemSize) return this.allocate(type, size);
    const pooled = this.POOLS[type].acquire(size, settings.allowOversize);
    if (!pooled) return this.allocate(type, size);
    // Oversized int32 buffers are trimmed to the requested length.
    if (type === 'int32') return pooled.buffer.subarray(0, size);
    return pooled.buffer;
  }

  /**
   * Acquires one container per entry of `sizes`.
   *
   * @param {string} type - Pool type.
   * @param {number[]} sizes - Requested sizes.
   * @returns {Array<*>} Containers in the same order as `sizes`.
   */
  static acquireMany(type, sizes) {
    const acquired = new Array(sizes.length);
    let index = 0;
    while (index < sizes.length) {
      acquired[index] = this.acquire(type, sizes[index]);
      index += 1;
    }
    return acquired;
  }

  /**
   * Gives a container back to its pool; containers above the type's
   * `maxItemSize` are not retained.
   *
   * @param {string} type - Pool type.
   * @param {*} buffer - Container to recycle.
   * @param {number} size - Size under which the container is registered.
   * @throws {CmpStrUsageError} If `type` is not configured.
   */
  static release(type, buffer, size) {
    const settings = this.CONFIG[type];
    if (!settings) {
      throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
    }
    if (size <= settings.maxItemSize) {
      this.POOLS[type].release({ buffer, size });
    }
  }
}
|
|
957
|
+
|
|
600
958
|
class Profiler {
|
|
601
959
|
active;
|
|
602
960
|
static ENV;
|
|
603
961
|
static instance;
|
|
604
962
|
nowFn;
|
|
605
963
|
memFn;
|
|
606
|
-
store =
|
|
964
|
+
store = [];
|
|
965
|
+
last;
|
|
607
966
|
totalTime = 0;
|
|
608
967
|
totalMem = 0;
|
|
609
968
|
static detectEnv() {
|
|
610
|
-
if (typeof process !== 'undefined'
|
|
969
|
+
if (typeof process !== 'undefined' && process.versions?.node)
|
|
970
|
+
Profiler.ENV = 'nodejs';
|
|
611
971
|
else if (typeof performance !== 'undefined') Profiler.ENV = 'browser';
|
|
612
972
|
else Profiler.ENV = 'unknown';
|
|
613
973
|
}
|
|
@@ -619,7 +979,7 @@ class Profiler {
|
|
|
619
979
|
this.active = active;
|
|
620
980
|
switch (Profiler.ENV) {
|
|
621
981
|
case 'nodejs':
|
|
622
|
-
this.nowFn = () => Number(process.hrtime.bigint())
|
|
982
|
+
this.nowFn = () => Number(process.hrtime.bigint()) * 1e-6;
|
|
623
983
|
this.memFn = () => process.memoryUsage().heapUsed;
|
|
624
984
|
break;
|
|
625
985
|
case 'browser':
|
|
@@ -632,40 +992,52 @@ class Profiler {
|
|
|
632
992
|
break;
|
|
633
993
|
}
|
|
634
994
|
}
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
startMem = this.mem();
|
|
640
|
-
const res = fn();
|
|
641
|
-
const deltaTime = this.now() - startTime,
|
|
642
|
-
deltaMem = this.mem() - startMem;
|
|
643
|
-
this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
|
|
644
|
-
((this.totalTime += deltaTime), (this.totalMem += deltaMem));
|
|
645
|
-
return res;
|
|
995
|
+
storeRes(entry) {
|
|
996
|
+
this.store.push((this.last = entry));
|
|
997
|
+
this.totalTime += entry.time;
|
|
998
|
+
this.totalMem += entry.mem;
|
|
646
999
|
}
|
|
647
|
-
enable
|
|
1000
|
+
enable() {
|
|
648
1001
|
this.active = true;
|
|
649
|
-
}
|
|
650
|
-
disable
|
|
1002
|
+
}
|
|
1003
|
+
disable() {
|
|
651
1004
|
this.active = false;
|
|
652
|
-
}
|
|
1005
|
+
}
|
|
653
1006
|
clear() {
|
|
654
|
-
this.store.
|
|
1007
|
+
this.store.length = 0;
|
|
1008
|
+
this.last = undefined;
|
|
655
1009
|
this.totalTime = 0;
|
|
656
1010
|
this.totalMem = 0;
|
|
657
1011
|
}
|
|
658
1012
|
run(fn, meta = {}) {
|
|
659
|
-
|
|
1013
|
+
if (!this.active) return fn();
|
|
1014
|
+
const startTime = this.nowFn(),
|
|
1015
|
+
startMem = this.memFn();
|
|
1016
|
+
const res = fn();
|
|
1017
|
+
const deltaTime = this.nowFn() - startTime,
|
|
1018
|
+
deltaMem = this.memFn() - startMem;
|
|
1019
|
+
this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
|
|
1020
|
+
return res;
|
|
660
1021
|
}
|
|
661
1022
|
async runAsync(fn, meta = {}) {
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
1023
|
+
if (!this.active) return fn();
|
|
1024
|
+
const startTime = this.nowFn(),
|
|
1025
|
+
startMem = this.memFn();
|
|
1026
|
+
const res = await fn();
|
|
1027
|
+
const deltaTime = this.nowFn() - startTime,
|
|
1028
|
+
deltaMem = this.memFn() - startMem;
|
|
1029
|
+
this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
|
|
1030
|
+
return res;
|
|
1031
|
+
}
|
|
1032
|
+
getAll() {
|
|
1033
|
+
return [...this.store];
|
|
1034
|
+
}
|
|
1035
|
+
getLast() {
|
|
1036
|
+
return this.last;
|
|
1037
|
+
}
|
|
1038
|
+
getTotal() {
|
|
1039
|
+
return { time: this.totalTime, mem: this.totalMem };
|
|
665
1040
|
}
|
|
666
|
-
getAll = () => [...this.store];
|
|
667
|
-
getLast = () => this.getAll().pop();
|
|
668
|
-
getTotal = () => ({ time: this.totalTime, mem: this.totalMem });
|
|
669
1041
|
services = Object.freeze({
|
|
670
1042
|
enable: this.enable.bind(this),
|
|
671
1043
|
disable: this.disable.bind(this),
|
|
@@ -679,19 +1051,34 @@ class Profiler {
|
|
|
679
1051
|
const registry = Object.create(null);
|
|
680
1052
|
const factory = Object.create(null);
|
|
681
1053
|
function Registry(reg, ctor) {
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
1054
|
+
ErrorUtil.assert(
|
|
1055
|
+
!(reg in registry || reg in factory),
|
|
1056
|
+
`Registry <${reg}> already exists / overwriting is forbidden`,
|
|
1057
|
+
{ registry: reg }
|
|
1058
|
+
);
|
|
686
1059
|
const classes = Object.create(null);
|
|
687
1060
|
const service = Object.freeze({
|
|
688
1061
|
add(name, cls, update = false) {
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
1062
|
+
ErrorUtil.assert(
|
|
1063
|
+
typeof name === 'string' && name.length > 0,
|
|
1064
|
+
`Class name must be a non-empty string`,
|
|
1065
|
+
{ registry: reg, name }
|
|
1066
|
+
);
|
|
1067
|
+
ErrorUtil.assert(
|
|
1068
|
+
typeof cls === 'function',
|
|
1069
|
+
`Class must be a constructor function`,
|
|
1070
|
+
{ registry: reg, class: cls }
|
|
1071
|
+
);
|
|
1072
|
+
ErrorUtil.assert(
|
|
1073
|
+
cls.prototype instanceof ctor,
|
|
1074
|
+
`Class must extend <${reg}>`,
|
|
1075
|
+
{ registry: reg, class: cls }
|
|
1076
|
+
);
|
|
1077
|
+
ErrorUtil.assert(
|
|
1078
|
+
update || !(name in classes),
|
|
1079
|
+
`Class <${name}> already exists / use <update=true> to overwrite`,
|
|
1080
|
+
{ registry: reg, name }
|
|
1081
|
+
);
|
|
695
1082
|
classes[name] = cls;
|
|
696
1083
|
},
|
|
697
1084
|
remove(name) {
|
|
@@ -704,8 +1091,16 @@ function Registry(reg, ctor) {
|
|
|
704
1091
|
return Object.keys(classes);
|
|
705
1092
|
},
|
|
706
1093
|
get(name) {
|
|
707
|
-
|
|
708
|
-
|
|
1094
|
+
ErrorUtil.assert(
|
|
1095
|
+
typeof name === 'string' && name.length > 0,
|
|
1096
|
+
`Class name must be a non-empty string`,
|
|
1097
|
+
{ registry: reg, name }
|
|
1098
|
+
);
|
|
1099
|
+
ErrorUtil.assert(
|
|
1100
|
+
name in classes,
|
|
1101
|
+
`Class <${name}> not registered for <${reg}>`,
|
|
1102
|
+
{ registry: reg, name }
|
|
1103
|
+
);
|
|
709
1104
|
return classes[name];
|
|
710
1105
|
}
|
|
711
1106
|
});
|
|
@@ -715,745 +1110,348 @@ function Registry(reg, ctor) {
|
|
|
715
1110
|
}
|
|
716
1111
|
/**
 * Resolves a class reference against a named registry.
 *
 * @param {string} reg - Registry name.
 * @param {string|Function} cls - Registered class name, or a class itself.
 * @returns {Function} The class looked up by name, or `cls` unchanged when it
 *   is not a string.
 * @throws {CmpStrNotFoundError} If no registry named `reg` exists.
 */
function resolveCls(reg, cls) {
  if (reg in registry) {
    if (typeof cls === 'string') {
      return registry[reg].get(cls);
    }
    return cls;
  }
  throw new CmpStrNotFoundError(`Registry <${reg}> does not exist`, {
    registry: reg
  });
}
|
|
721
1118
|
/**
 * Instantiates a class taken from a registry.
 *
 * @param {string} reg - Registry name.
 * @param {string|Function} cls - Registered class name, or a class itself.
 * @param {...*} args - Arguments forwarded to the constructor.
 * @returns {object} The new instance.
 * @throws Whatever `resolveCls` throws for an unknown registry or class name;
 *   construction failures are reported through `ErrorUtil.wrap`.
 */
function createFromRegistry(reg, cls, ...args) {
  const ctor = resolveCls(reg, cls);
  const isCallable = typeof ctor === 'function';
  // Build the label defensively: this message is evaluated before the wrapped
  // call runs, so reading `ctor.name` on a missing ctor would otherwise throw
  // a raw TypeError outside of the error wrapper. An empty `name` (anonymous
  // class) also falls back to something readable.
  const label =
    (isCallable && ctor.name) ||
    (typeof cls === 'string' ? cls : isCallable ? 'anonymous' : String(ctor));
  return ErrorUtil.wrap(
    () => new ctor(...args),
    `Failed to create instance of class <${label}> from registry <${reg}>`,
    { registry: reg, class: cls, args }
  );
}
|
|
731
1126
|
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
1127
|
+
// Profiler handle obtained from Profiler.getInstance(); used below to wrap metric computations.
const profiler$2 = Profiler.getInstance();
|
|
1128
|
+
/**
 * Base class shared by all similarity metrics.
 *
 * Inputs are always stored as arrays. Subclasses override `compute()`; this
 * class provides the trivial-case shortcuts, result caching, profiling and the
 * single / batch / pairwise execution modes.
 */
class Metric {
  /** Result cache shared by every metric instance. */
  static cache = new HashTable();

  metric;
  a;
  b;
  origA = [];
  origB = [];
  options;
  optKey;
  symmetric;
  results;

  /** Empties the shared result cache. */
  static clear() {
    this.cache.clear();
  }

  /**
   * Orders a pair so that the shorter input comes first.
   *
   * @returns {Array} `[a, b, m, n]`, swapped when `m > n`.
   */
  static swap(a, b, m, n) {
    return m > n ? [b, a, n, m] : [a, b, m, n];
  }

  /**
   * Limits a score to the range 0..1.
   *
   * @param {number} res - Raw score.
   * @returns {number} The clamped score.
   */
  static clamp(res) {
    return Math.max(0, Math.min(1, res));
  }

  /**
   * @param {string} metric - Metric name, used in results and cache keys.
   * @param {string|string[]} a - First input(s).
   * @param {string|string[]} b - Second input(s).
   * @param {object} [opt={}] - Metric options.
   * @param {boolean} [symmetric=false] - Whether argument order is irrelevant.
   */
  constructor(metric, a, b, opt = {}, symmetric = false) {
    this.metric = metric;
    this.a = Array.isArray(a) ? a : [a];
    this.b = Array.isArray(b) ? b : [b];
    ErrorUtil.assert(
      this.a.length > 0 && this.b.length > 0,
      `Inputs <a> and <b> must not be empty`,
      { a: this.a, b: this.b }
    );
    this.options = opt;
    // Options are hashed with sorted top-level keys so key order is irrelevant.
    // NOTE(review): an array replacer acts as an allow-list at every nesting
    // level, so nested option keys that are not also top-level keys are left
    // out of the hash - confirm options are flat.
    this.optKey = Hasher.fastFNV1a(
      JSON.stringify(opt, Object.keys(opt).sort())
    ).toString();
    this.symmetric = symmetric;
  }

  /**
   * Shortcut for trivial pairs.
   *
   * @returns {{res: number}|undefined} `{ res: 1 }` for identical inputs,
   *   `{ res: 0 }` when either input is empty or both are shorter than two
   *   characters, otherwise `undefined`.
   */
  preCompute(a, b, m, n) {
    if (a === b) return { res: 1 };
    if (m === 0 || n === 0 || (m < 2 && n < 2)) return { res: 0 };
    return undefined;
  }

  /**
   * Metric-specific computation; must be overridden.
   *
   * @throws {CmpStrInternalError} Always, in the base class.
   */
  compute(a, b, m, n, maxLen) {
    throw new CmpStrInternalError(
      `Method compute() must be overridden in a subclass`
    );
  }

  /**
   * Computes the result for the pair `a[i]` / `b[j]`.
   *
   * @param {number} i - Index into `a`.
   * @param {number} j - Index into `b`.
   * @returns {object} `{ metric, a, b, ...result }`; `a` / `b` are the
   *   original inputs when set via `setOriginal()`.
   */
  runSingle(i, j) {
    return ErrorUtil.wrap(
      () => {
        const a = String(this.a[i]);
        const b = String(this.b[j]);
        let A = a;
        let B = b;
        let m = A.length;
        let n = B.length;
        let result = this.preCompute(A, B, m, n);
        if (!result) {
          result = profiler$2.run(() => {
            if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
            let key = Metric.cache.key(this.metric, [A, B], this.symmetric);
            // The options hash is part of the key so differing options never collide.
            if (key) key += this.optKey;
            const cached = Metric.cache.get(key || '');
            if (cached !== undefined && cached !== null) return cached;
            const maxLen = m > n ? m : n;
            const res = this.compute(A, B, m, n, maxLen);
            if (key) Metric.cache.set(key, res);
            return res;
          });
        }
        return {
          metric: this.metric,
          a: this.origA.length > i ? this.origA[i] : a,
          b: this.origB.length > j ? this.origB[j] : b,
          ...result
        };
      },
      `Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
      { i, j }
    );
  }

  /** Promise-returning variant of `runSingle()`. */
  async runSingleAsync(i, j) {
    return this.runSingle(i, j);
  }

  /** Compares every element of `a` with every element of `b`. */
  runBatch() {
    const results = [];
    for (let i = 0; i < this.a.length; i++)
      for (let j = 0; j < this.b.length; j++)
        results.push(this.runSingle(i, j));
    this.results = results;
  }

  /** Asynchronous variant of `runBatch()`. */
  async runBatchAsync() {
    const tasks = [];
    for (let i = 0; i < this.a.length; i++)
      for (let j = 0; j < this.b.length; j++)
        tasks.push(this.runSingleAsync(i, j));
    this.results = await Promise.all(tasks);
  }

  /** Compares `a[i]` with `b[i]` for every index of `a`. */
  runPairwise() {
    const results = [];
    for (let i = 0; i < this.a.length; i++) results.push(this.runSingle(i, i));
    this.results = results;
  }

  /** Asynchronous variant of `runPairwise()`. */
  async runPairwiseAsync() {
    const tasks = [];
    for (let i = 0; i < this.a.length; i++)
      tasks.push(this.runSingleAsync(i, i));
    this.results = await Promise.all(tasks);
  }

  /**
   * Stores the original inputs that should be reported in results.
   *
   * @returns {this} The instance, for chaining.
   */
  setOriginal(a, b) {
    if (a) this.origA = Array.isArray(a) ? a : [a];
    if (b) this.origB = Array.isArray(b) ? b : [b];
    return this;
  }

  /** @returns {boolean} `true` if either input holds more than one element. */
  isBatch() {
    return this.a.length > 1 || this.b.length > 1;
  }

  /** @returns {boolean} `true` if both inputs hold exactly one element. */
  isSingle() {
    return !this.isBatch();
  }

  /**
   * Checks whether the inputs can be compared pairwise.
   *
   * @param {boolean} [safe=false] - Return `false` instead of throwing.
   * @returns {boolean}
   * @throws {CmpStrUsageError} If the check fails and `safe` is falsy.
   */
  isPairwise(safe = false) {
    if (this.isBatch() && this.a.length === this.b.length) return true;
    if (safe) return false;
    throw new CmpStrUsageError(
      `Mode <pairwise> requires arrays of equal length`,
      { a: this.a, b: this.b }
    );
  }

  /** @returns {boolean} Whether the metric was declared symmetric. */
  isSymmetrical() {
    return this.symmetric;
  }

  /**
   * @param {string} [mode] - Explicit mode.
   * @returns {string} `mode`, else `options.mode`, else `'default'`.
   */
  whichMode(mode) {
    return mode ?? this.options.mode ?? 'default';
  }

  /** Discards the results of the previous run. */
  clear() {
    this.results = undefined;
  }

  /**
   * Runs the metric synchronously.
   *
   * @param {string} [mode] - 'default', 'batch', 'single' or 'pairwise'.
   * @param {boolean} [clear=true] - Drop previous results first.
   * @throws {CmpStrInternalError} For an unknown mode.
   */
  run(mode, clear = true) {
    if (clear) this.clear();
    const resolved = this.whichMode(mode);
    switch (resolved) {
      case 'default':
        if (this.isSingle()) {
          this.results = this.runSingle(0, 0);
          break;
        }
      // 'default' with more than one input deliberately falls through to 'batch'.
      case 'batch':
        this.runBatch();
        break;
      case 'single':
        this.results = this.runSingle(0, 0);
        break;
      case 'pairwise':
        if (this.isPairwise()) this.runPairwise();
        break;
      default:
        // Report the resolved mode: the argument is undefined when the
        // invalid value came from `options.mode`.
        throw new CmpStrInternalError(`Unsupported mode <${resolved}>`);
    }
  }

  /**
   * Asynchronous variant of `run()`.
   *
   * @param {string} [mode] - 'default', 'batch', 'single' or 'pairwise'.
   * @param {boolean} [clear=true] - Drop previous results first.
   * @throws {CmpStrInternalError} For an unknown mode (as a rejection).
   */
  async runAsync(mode, clear = true) {
    if (clear) this.clear();
    const resolved = this.whichMode(mode);
    switch (resolved) {
      case 'default':
        if (this.isSingle()) {
          this.results = await this.runSingleAsync(0, 0);
          break;
        }
      // 'default' with more than one input deliberately falls through to 'batch'.
      case 'batch':
        await this.runBatchAsync();
        break;
      case 'single':
        this.results = await this.runSingleAsync(0, 0);
        break;
      case 'pairwise':
        if (this.isPairwise()) await this.runPairwiseAsync();
        break;
      default:
        throw new CmpStrInternalError(`Unsupported async mode <${resolved}>`);
    }
  }

  /** @returns {string} The metric name passed to the constructor. */
  getMetricName() {
    return this.metric;
  }

  /**
   * @returns {object|object[]} Results of the last run.
   * @throws If no run has produced results yet.
   */
  getResults() {
    ErrorUtil.assert(
      this.results !== undefined,
      `run() must be called before getResults()`
    );
    return this.results;
  }
}
|
|
1322
|
+
// Registry named 'metric', created with Metric as the base constructor.
const MetricRegistry = Registry('metric', Metric);
|
|
1323
|
+
|
|
1324
|
+
/**
 * Cosine similarity over term-frequency vectors.
 *
 * Each input is split on `options.delimiter` (a single space by default) and
 * the cosine of the angle between the two frequency vectors is the score.
 */
class CosineSimilarity extends Metric {
  constructor(a, b, opt = {}) {
    super('cosine', a, b, opt, true);
  }

  /**
   * Counts how often each term occurs in `str`.
   *
   * @param {string} str - Text to split.
   * @param {string} delimiter - Separator passed to `String.prototype.split`.
   * @returns {Map<string, number>} Term -> occurrence count. The map is taken
   *   from the pool; the caller is expected to release it.
   */
  _termFreq(str, delimiter) {
    const tokens = str.split(delimiter);
    const counts = Pool.acquire('map', tokens.length);
    tokens.forEach((token) => {
      counts.set(token, (counts.get(token) || 0) + 1);
    });
    return counts;
  }

  /**
   * @param {string} a - First text.
   * @param {string} b - Second text.
   * @returns {{res: number, raw: {dotProduct: number, magnitudeA: number, magnitudeB: number}}}
   *   `res` is 0 when either vector has zero magnitude.
   */
  compute(a, b) {
    const { delimiter = ' ' } = this.options;
    const termsA = this._termFreq(a, delimiter);
    const termsB = this._termFreq(b, delimiter);
    try {
      let dotProduct = 0;
      let squaresA = 0;
      let squaresB = 0;
      termsA.forEach((countA, term) => {
        const countB = termsB.get(term) || 0;
        dotProduct += countA * countB;
        squaresA += countA * countA;
      });
      termsB.forEach((countB) => {
        squaresB += countB * countB;
      });
      const magnitudeA = Math.sqrt(squaresA);
      const magnitudeB = Math.sqrt(squaresB);
      let res = 0;
      if (magnitudeA && magnitudeB) {
        res = Metric.clamp(dotProduct / (magnitudeA * magnitudeB));
      }
      return { res, raw: { dotProduct, magnitudeA, magnitudeB } };
    } finally {
      Pool.release('map', termsA, termsA.size);
      Pool.release('map', termsB, termsB.size);
    }
  }
}
|
|
1360
|
+
// Make CosineSimilarity available from the metric registry under the name 'cosine'.
MetricRegistry.add('cosine', CosineSimilarity);
|
|
970
1361
|
|
|
971
|
-
class
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
sentence: /(?<=[.!?])\s+/,
|
|
975
|
-
word: /\p{L}+/gu,
|
|
976
|
-
nonWord: /[^\p{L}]/gu,
|
|
977
|
-
vowelGroup: /[aeiouy]+/g,
|
|
978
|
-
letter: /\p{L}/gu,
|
|
979
|
-
ucLetter: /\p{Lu}/gu
|
|
980
|
-
};
|
|
981
|
-
text;
|
|
982
|
-
words = [];
|
|
983
|
-
sentences = [];
|
|
984
|
-
charFrequency = new Map();
|
|
985
|
-
wordHistogram = new Map();
|
|
986
|
-
syllableCache = new Map();
|
|
987
|
-
syllableStats;
|
|
988
|
-
constructor(input) {
|
|
989
|
-
this.text = input.trim();
|
|
990
|
-
this.tokenize();
|
|
991
|
-
this.computeFrequencies();
|
|
992
|
-
}
|
|
993
|
-
tokenize() {
|
|
994
|
-
let match;
|
|
995
|
-
const lcText = this.text.toLowerCase();
|
|
996
|
-
while ((match = TextAnalyzer.REGEX.word.exec(lcText)) !== null)
|
|
997
|
-
this.words.push(match[0]);
|
|
998
|
-
this.sentences = this.text
|
|
999
|
-
.split(TextAnalyzer.REGEX.sentence)
|
|
1000
|
-
.filter(Boolean);
|
|
1362
|
+
/**
 * Damerau-Levenshtein similarity (edit distance with insertions, deletions,
 * substitutions and swaps of two adjacent characters), normalised to 0..1.
 */
class DamerauLevenshteinDistance extends Metric {
  constructor(a, b, opt = {}) {
    super('damerau', a, b, opt, true);
  }

  /**
   * @param {string} a - First string; the row buffers have `m + 1` cells.
   * @param {string} b - Second string.
   * @param {number} m - Length of `a`.
   * @param {number} n - Length of `b`.
   * @param {number} maxLen - Larger of `m` and `n`; normalises the distance.
   * @returns {{res: number, raw: {dist: number, maxLen: number}}}
   */
  compute(a, b, m, n, maxLen) {
    const len = m + 1;
    // Three rolling rows: `test` = row j-2, `prev` = row j-1, `curr` = row j.
    let [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
    try {
      for (let i = 0; i <= m; i++) prev[i] = i;
      for (let j = 1; j <= n; j++) {
        curr[0] = j;
        const cb = b.charCodeAt(j - 1);
        for (let i = 1; i <= m; i++) {
          const ca = a.charCodeAt(i - 1);
          const cost = ca === cb ? 0 : 1;
          let val = Math.min(curr[i - 1] + 1, prev[i] + 1, prev[i - 1] + cost);
          // Adjacent transposition: the last two characters are swapped.
          if (
            i > 1 &&
            j > 1 &&
            ca === b.charCodeAt(j - 2) &&
            cb === a.charCodeAt(i - 2)
          )
            val = Math.min(val, test[i - 2] + cost);
          curr[i] = val;
        }
        // Rotate the row references instead of copying row contents; the
        // recycled row is fully overwritten during the next iteration.
        const recycled = test;
        test = prev;
        prev = curr;
        curr = recycled;
      }
      const dist = prev[m];
      return {
        res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
        raw: { dist, maxLen }
      };
    } finally {
      Pool.release('int32', test, len);
      Pool.release('int32', prev, len);
      Pool.release('int32', curr, len);
    }
  }
}
|
|
1402
|
+
// Make DamerauLevenshteinDistance available from the metric registry under the name 'damerau'.
MetricRegistry.add('damerau', DamerauLevenshteinDistance);
|
|
1403
|
+
|
|
1404
|
+
/**
 * Dice-Sorensen coefficient over the sets of character bigrams of two strings.
 */
class DiceSorensenCoefficient extends Metric {
  constructor(a, b, opt = {}) {
    super('dice', a, b, opt, true);
  }

  /**
   * Collects the distinct two-character substrings of `str`.
   *
   * @param {string} str - Input string.
   * @returns {Set<string>} Bigram set taken from the pool; the caller is
   *   expected to release it.
   */
  _bigrams(str) {
    const count = str.length - 1;
    const grams = Pool.acquire('set', count);
    let start = 0;
    while (start < count) {
      grams.add(str.substring(start, start + 2));
      start += 1;
    }
    return grams;
  }

  /**
   * @param {string} a - First string.
   * @param {string} b - Second string.
   * @returns {{res: number, raw: {intersection: number, size: number}}}
   *   `size` is the sum of both set sizes; `res` is 1 when it is 0.
   */
  compute(a, b) {
    const setA = this._bigrams(a);
    const setB = this._bigrams(b);
    // Sizes are captured up front and reused for the release calls.
    const sizeA = setA.size;
    const sizeB = setB.size;
    try {
      let intersection = 0;
      setA.forEach((gram) => {
        if (setB.has(gram)) intersection += 1;
      });
      const size = sizeA + sizeB;
      const res = size === 0 ? 1 : Metric.clamp((2 * intersection) / size);
      return { res, raw: { intersection, size } };
    } finally {
      Pool.release('set', setA, sizeA);
      Pool.release('set', setB, sizeB);
    }
  }
}
|
|
1433
|
+
MetricRegistry.add('dice', DiceSorensenCoefficient);
|
|
1164
1434
|
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
metric;
|
|
1169
|
-
a;
|
|
1170
|
-
b;
|
|
1171
|
-
origA = [];
|
|
1172
|
-
origB = [];
|
|
1173
|
-
options;
|
|
1174
|
-
optKey;
|
|
1175
|
-
symmetric;
|
|
1176
|
-
results;
|
|
1177
|
-
static clear = () => this.cache.clear();
|
|
1178
|
-
static swap = (a, b, m, n) => (m > n ? [b, a, n, m] : [a, b, m, n]);
|
|
1179
|
-
static clamp = (res) => Math.max(0, Math.min(1, res));
|
|
1180
|
-
constructor(metric, a, b, opt = {}, symmetric = false) {
|
|
1181
|
-
this.metric = metric;
|
|
1182
|
-
this.a = Array.isArray(a) ? a : [a];
|
|
1183
|
-
this.b = Array.isArray(b) ? b : [b];
|
|
1184
|
-
if (this.a.length === 0 || this.b.length === 0)
|
|
1185
|
-
throw new Error(`Inputs <a> and <b> must not be empty`);
|
|
1186
|
-
this.options = opt;
|
|
1187
|
-
this.optKey = Hasher.fastFNV1a(
|
|
1188
|
-
JSON.stringify(opt, Object.keys(opt).sort())
|
|
1189
|
-
).toString();
|
|
1190
|
-
this.symmetric = symmetric;
|
|
1191
|
-
}
|
|
1192
|
-
preCompute(a, b, m, n) {
|
|
1193
|
-
if (a === b) return { res: 1 };
|
|
1194
|
-
if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
|
|
1195
|
-
return undefined;
|
|
1435
|
+
class HammingDistance extends Metric {
|
|
1436
|
+
constructor(a, b, opt = {}) {
|
|
1437
|
+
super('hamming', a, b, opt, true);
|
|
1196
1438
|
}
|
|
1197
1439
|
compute(a, b, m, n, maxLen) {
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
if (!result) {
|
|
1209
|
-
result = profiler$2.run(() => {
|
|
1210
|
-
if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
|
|
1211
|
-
const key =
|
|
1212
|
-
Metric.cache.key(this.metric, [A, B], this.symmetric) + this.optKey;
|
|
1213
|
-
return (
|
|
1214
|
-
Metric.cache.get(key || '') ??
|
|
1215
|
-
(() => {
|
|
1216
|
-
const res = this.compute(A, B, m, n, Math.max(m, n));
|
|
1217
|
-
if (key) Metric.cache.set(key, res);
|
|
1218
|
-
return res;
|
|
1219
|
-
})()
|
|
1440
|
+
if (m !== n) {
|
|
1441
|
+
if (this.options.pad !== undefined) {
|
|
1442
|
+
if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
|
|
1443
|
+
if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
|
|
1444
|
+
m = n = maxLen;
|
|
1445
|
+
} else
|
|
1446
|
+
throw new CmpStrUsageError(
|
|
1447
|
+
`Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
|
|
1448
|
+
`use option.pad for automatic adjustment`,
|
|
1449
|
+
{ a: m, b: n }
|
|
1220
1450
|
);
|
|
1221
|
-
});
|
|
1222
|
-
}
|
|
1223
|
-
return {
|
|
1224
|
-
metric: this.metric,
|
|
1225
|
-
a: this.origA[i] ?? a,
|
|
1226
|
-
b: this.origB[j] ?? b,
|
|
1227
|
-
...result
|
|
1228
|
-
};
|
|
1229
|
-
}
|
|
1230
|
-
async runSingleAsync(i, j) {
|
|
1231
|
-
return Promise.resolve(this.runSingle(i, j));
|
|
1232
|
-
}
|
|
1233
|
-
runBatch() {
|
|
1234
|
-
const results = [];
|
|
1235
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1236
|
-
for (let j = 0; j < this.b.length; j++)
|
|
1237
|
-
results.push(this.runSingle(i, j));
|
|
1238
|
-
this.results = results;
|
|
1239
|
-
}
|
|
1240
|
-
async runBatchAsync() {
|
|
1241
|
-
const results = [];
|
|
1242
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1243
|
-
for (let j = 0; j < this.b.length; j++)
|
|
1244
|
-
results.push(await this.runSingleAsync(i, j));
|
|
1245
|
-
this.results = results;
|
|
1246
|
-
}
|
|
1247
|
-
runPairwise() {
|
|
1248
|
-
const results = [];
|
|
1249
|
-
for (let i = 0; i < this.a.length; i++) results.push(this.runSingle(i, i));
|
|
1250
|
-
this.results = results;
|
|
1251
|
-
}
|
|
1252
|
-
async runPairwiseAsync() {
|
|
1253
|
-
const results = [];
|
|
1254
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1255
|
-
results.push(await this.runSingleAsync(i, i));
|
|
1256
|
-
this.results = results;
|
|
1257
|
-
}
|
|
1258
|
-
setOriginal(a, b) {
|
|
1259
|
-
if (a) this.origA = Array.isArray(a) ? a : [a];
|
|
1260
|
-
if (b) this.origB = Array.isArray(b) ? b : [b];
|
|
1261
|
-
return this;
|
|
1262
|
-
}
|
|
1263
|
-
isBatch = () => this.a.length > 1 || this.b.length > 1;
|
|
1264
|
-
isSingle = () => !this.isBatch();
|
|
1265
|
-
isPairwise(safe = false) {
|
|
1266
|
-
return this.isBatch() && this.a.length === this.b.length
|
|
1267
|
-
? true
|
|
1268
|
-
: !safe &&
|
|
1269
|
-
(() => {
|
|
1270
|
-
throw new Error(`Mode <pairwise> requires arrays of equal length`);
|
|
1271
|
-
})();
|
|
1272
|
-
}
|
|
1273
|
-
isSymmetrical = () => this.symmetric;
|
|
1274
|
-
whichMode = (mode) => mode ?? this.options?.mode ?? 'default';
|
|
1275
|
-
clear = () => (this.results = undefined);
|
|
1276
|
-
run(mode, clear = true) {
|
|
1277
|
-
if (clear) this.clear();
|
|
1278
|
-
switch (this.whichMode(mode)) {
|
|
1279
|
-
case 'default':
|
|
1280
|
-
if (this.isSingle()) {
|
|
1281
|
-
this.results = this.runSingle(0, 0);
|
|
1282
|
-
break;
|
|
1283
|
-
}
|
|
1284
|
-
case 'batch':
|
|
1285
|
-
this.runBatch();
|
|
1286
|
-
break;
|
|
1287
|
-
case 'single':
|
|
1288
|
-
this.results = this.runSingle(0, 0);
|
|
1289
|
-
break;
|
|
1290
|
-
case 'pairwise':
|
|
1291
|
-
if (this.isPairwise()) this.runPairwise();
|
|
1292
|
-
break;
|
|
1293
|
-
default:
|
|
1294
|
-
throw new Error(`Unsupported mode <${mode}>`);
|
|
1295
|
-
}
|
|
1296
|
-
}
|
|
1297
|
-
async runAsync(mode, clear = true) {
|
|
1298
|
-
if (clear) this.clear();
|
|
1299
|
-
switch (this.whichMode(mode)) {
|
|
1300
|
-
case 'default':
|
|
1301
|
-
if (this.isSingle()) {
|
|
1302
|
-
this.results = await this.runSingleAsync(0, 0);
|
|
1303
|
-
break;
|
|
1304
|
-
}
|
|
1305
|
-
case 'batch':
|
|
1306
|
-
await this.runBatchAsync();
|
|
1307
|
-
break;
|
|
1308
|
-
case 'single':
|
|
1309
|
-
this.results = await this.runSingleAsync(0, 0);
|
|
1310
|
-
break;
|
|
1311
|
-
case 'pairwise':
|
|
1312
|
-
if (this.isPairwise()) await this.runPairwiseAsync();
|
|
1313
|
-
break;
|
|
1314
|
-
default:
|
|
1315
|
-
throw new Error(`Unsupported async mode <${mode}>`);
|
|
1316
|
-
}
|
|
1317
|
-
}
|
|
1318
|
-
getMetricName = () => this.metric;
|
|
1319
|
-
getResults() {
|
|
1320
|
-
if (this.results === undefined)
|
|
1321
|
-
throw new Error(`run() must be called before getResult()`);
|
|
1322
|
-
return this.results;
|
|
1323
|
-
}
|
|
1324
|
-
}
|
|
1325
|
-
const MetricRegistry = Registry('metric', Metric);
|
|
1326
|
-
|
|
1327
|
-
class CosineSimilarity extends Metric {
|
|
1328
|
-
constructor(a, b, opt = {}) {
|
|
1329
|
-
super('cosine', a, b, opt, true);
|
|
1330
|
-
}
|
|
1331
|
-
_termFreq(str, delimiter) {
|
|
1332
|
-
const terms = str.split(delimiter);
|
|
1333
|
-
const freq = Pool.acquire('map', terms.length);
|
|
1334
|
-
for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
|
|
1335
|
-
return freq;
|
|
1336
|
-
}
|
|
1337
|
-
compute(a, b) {
|
|
1338
|
-
const { delimiter = ' ' } = this.options;
|
|
1339
|
-
const termsA = this._termFreq(a, delimiter);
|
|
1340
|
-
const termsB = this._termFreq(b, delimiter);
|
|
1341
|
-
try {
|
|
1342
|
-
let dotP = 0,
|
|
1343
|
-
magA = 0,
|
|
1344
|
-
magB = 0;
|
|
1345
|
-
for (const [term, freqA] of termsA) {
|
|
1346
|
-
const freqB = termsB.get(term) || 0;
|
|
1347
|
-
dotP += freqA * freqB;
|
|
1348
|
-
magA += freqA * freqA;
|
|
1349
|
-
}
|
|
1350
|
-
for (const freqB of termsB.values()) magB += freqB * freqB;
|
|
1351
|
-
magA = Math.sqrt(magA);
|
|
1352
|
-
magB = Math.sqrt(magB);
|
|
1353
|
-
return {
|
|
1354
|
-
res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
|
|
1355
|
-
raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
|
|
1356
|
-
};
|
|
1357
|
-
} finally {
|
|
1358
|
-
Pool.release('map', termsA, termsA.size);
|
|
1359
|
-
Pool.release('map', termsB, termsB.size);
|
|
1360
|
-
}
|
|
1361
|
-
}
|
|
1362
|
-
}
|
|
1363
|
-
MetricRegistry.add('cosine', CosineSimilarity);
|
|
1364
|
-
|
|
1365
|
-
class DamerauLevenshteinDistance extends Metric {
|
|
1366
|
-
constructor(a, b, opt = {}) {
|
|
1367
|
-
super('damerau', a, b, opt, true);
|
|
1368
|
-
}
|
|
1369
|
-
compute(a, b, m, n, maxLen) {
|
|
1370
|
-
const len = m + 1;
|
|
1371
|
-
const [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
|
|
1372
|
-
try {
|
|
1373
|
-
for (let i = 0; i <= m; i++) prev[i] = i;
|
|
1374
|
-
for (let j = 1; j <= n; j++) {
|
|
1375
|
-
curr[0] = j;
|
|
1376
|
-
const cb = b.charCodeAt(j - 1);
|
|
1377
|
-
for (let i = 1; i <= m; i++) {
|
|
1378
|
-
const ca = a.charCodeAt(i - 1);
|
|
1379
|
-
const cost = ca === cb ? 0 : 1;
|
|
1380
|
-
let val = Math.min(curr[i - 1] + 1, prev[i] + 1, prev[i - 1] + cost);
|
|
1381
|
-
if (
|
|
1382
|
-
i > 1 &&
|
|
1383
|
-
j > 1 &&
|
|
1384
|
-
ca === b.charCodeAt(j - 2) &&
|
|
1385
|
-
cb === a.charCodeAt(i - 2)
|
|
1386
|
-
)
|
|
1387
|
-
val = Math.min(val, test[i - 2] + cost);
|
|
1388
|
-
curr[i] = val;
|
|
1389
|
-
}
|
|
1390
|
-
test.set(prev);
|
|
1391
|
-
prev.set(curr);
|
|
1392
|
-
}
|
|
1393
|
-
const dist = prev[m];
|
|
1394
|
-
return {
|
|
1395
|
-
res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
|
|
1396
|
-
raw: { dist, maxLen }
|
|
1397
|
-
};
|
|
1398
|
-
} finally {
|
|
1399
|
-
Pool.release('int32', test, len);
|
|
1400
|
-
Pool.release('int32', prev, len);
|
|
1401
|
-
Pool.release('int32', curr, len);
|
|
1402
|
-
}
|
|
1403
|
-
}
|
|
1404
|
-
}
|
|
1405
|
-
MetricRegistry.add('damerau', DamerauLevenshteinDistance);
|
|
1406
|
-
|
|
1407
|
-
class DiceSorensenCoefficient extends Metric {
|
|
1408
|
-
constructor(a, b, opt = {}) {
|
|
1409
|
-
super('dice', a, b, opt, true);
|
|
1410
|
-
}
|
|
1411
|
-
_bigrams(str) {
|
|
1412
|
-
const len = str.length - 1;
|
|
1413
|
-
const bigrams = Pool.acquire('set', len);
|
|
1414
|
-
for (let i = 0; i < len; i++) bigrams.add(str.substring(i, i + 2));
|
|
1415
|
-
return bigrams;
|
|
1416
|
-
}
|
|
1417
|
-
compute(a, b) {
|
|
1418
|
-
const setA = this._bigrams(a),
|
|
1419
|
-
setB = this._bigrams(b);
|
|
1420
|
-
const sizeA = setA.size,
|
|
1421
|
-
sizeB = setB.size;
|
|
1422
|
-
try {
|
|
1423
|
-
let intersection = 0;
|
|
1424
|
-
for (const bigram of setA) if (setB.has(bigram)) intersection++;
|
|
1425
|
-
const size = sizeA + sizeB;
|
|
1426
|
-
return {
|
|
1427
|
-
res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
|
|
1428
|
-
raw: { intersection, size }
|
|
1429
|
-
};
|
|
1430
|
-
} finally {
|
|
1431
|
-
Pool.release('set', setA, sizeA);
|
|
1432
|
-
Pool.release('set', setB, sizeB);
|
|
1433
1451
|
}
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
class HammingDistance extends Metric {
|
|
1439
|
-
constructor(a, b, opt = {}) {
|
|
1440
|
-
super('hamming', a, b, opt, true);
|
|
1441
|
-
}
|
|
1442
|
-
compute(a, b, m, n, maxLen) {
|
|
1443
|
-
if (m !== n) {
|
|
1444
|
-
if (this.options.pad !== undefined) {
|
|
1445
|
-
if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
|
|
1446
|
-
if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
|
|
1447
|
-
m = n = maxLen;
|
|
1448
|
-
} else
|
|
1449
|
-
throw new Error(
|
|
1450
|
-
`Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
|
|
1451
|
-
`use option.pad for automatic adjustment`
|
|
1452
|
-
);
|
|
1453
|
-
}
|
|
1454
|
-
let dist = 0;
|
|
1455
|
-
for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
|
|
1456
|
-
return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
|
|
1452
|
+
let dist = 0;
|
|
1453
|
+
for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
|
|
1454
|
+
return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
|
|
1457
1455
|
}
|
|
1458
1456
|
}
|
|
1459
1457
|
MetricRegistry.add('hamming', HammingDistance);
|
|
@@ -1717,42 +1715,59 @@ class Phonetic {
|
|
|
1717
1715
|
options;
|
|
1718
1716
|
optKey;
|
|
1719
1717
|
map;
|
|
1720
|
-
|
|
1718
|
+
ignoreSet;
|
|
1719
|
+
static clear() {
|
|
1720
|
+
this.cache.clear();
|
|
1721
|
+
}
|
|
1721
1722
|
constructor(algo, opt = {}) {
|
|
1722
1723
|
const defaults = this.constructor.default ?? {};
|
|
1723
1724
|
const mapId = opt.map ?? defaults.map;
|
|
1724
|
-
if (!mapId)
|
|
1725
|
+
if (!mapId)
|
|
1726
|
+
throw new CmpStrNotFoundError(
|
|
1727
|
+
`No mapping specified for phonetic algorithm`,
|
|
1728
|
+
{ algo }
|
|
1729
|
+
);
|
|
1725
1730
|
const map = PhoneticMappingRegistry.get(algo, mapId);
|
|
1726
1731
|
if (map === undefined)
|
|
1727
|
-
throw new
|
|
1728
|
-
|
|
1732
|
+
throw new CmpStrNotFoundError(
|
|
1733
|
+
`Requested mapping <${mapId}> is not declared`,
|
|
1734
|
+
{ algo, mapId }
|
|
1735
|
+
);
|
|
1736
|
+
this.options = DeepMerge.merge(
|
|
1737
|
+
DeepMerge.merge(defaults, map.options ?? {}),
|
|
1738
|
+
opt
|
|
1739
|
+
);
|
|
1729
1740
|
this.optKey = Hasher.fastFNV1a(
|
|
1730
1741
|
JSON.stringify(this.options, Object.keys(this.options).sort())
|
|
1731
1742
|
).toString();
|
|
1732
1743
|
this.algo = algo;
|
|
1733
1744
|
this.map = map;
|
|
1745
|
+
this.ignoreSet = new Set(map.ignore ?? []);
|
|
1734
1746
|
}
|
|
1735
1747
|
applyPattern(word) {
|
|
1736
1748
|
const { patterns = [] } = this.map;
|
|
1737
|
-
if (!patterns
|
|
1749
|
+
if (!patterns.length) return word;
|
|
1738
1750
|
for (const { pattern, replace, all = false } of patterns) {
|
|
1739
|
-
word =
|
|
1751
|
+
word = all
|
|
1752
|
+
? word.replaceAll(pattern, replace)
|
|
1753
|
+
: word.replace(pattern, replace);
|
|
1740
1754
|
}
|
|
1741
1755
|
return word;
|
|
1742
1756
|
}
|
|
1743
1757
|
applyRules(char, i, chars, charLen) {
|
|
1744
1758
|
const { ruleset = [] } = this.map;
|
|
1745
|
-
if (!ruleset
|
|
1759
|
+
if (!ruleset.length) return undefined;
|
|
1746
1760
|
const prev = chars[i - 1] || '',
|
|
1747
1761
|
prev2 = chars[i - 2] || '';
|
|
1748
1762
|
const next = chars[i + 1] || '',
|
|
1749
1763
|
next2 = chars[i + 2] || '';
|
|
1764
|
+
const str = chars.join('');
|
|
1750
1765
|
for (const rule of ruleset) {
|
|
1751
1766
|
if (rule.char && rule.char !== char) continue;
|
|
1752
1767
|
if (rule.position === 'start' && i !== 0) continue;
|
|
1753
1768
|
if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
|
|
1754
1769
|
continue;
|
|
1755
|
-
if (rule.position === 'end' && i !== charLen) continue;
|
|
1770
|
+
if (rule.position === 'end' && i !== charLen - 1) continue;
|
|
1756
1771
|
if (rule.prev && !rule.prev.includes(prev)) continue;
|
|
1757
1772
|
if (rule.prevNot && rule.prevNot.includes(prev)) continue;
|
|
1758
1773
|
if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
|
|
@@ -1763,12 +1778,12 @@ class Phonetic {
|
|
|
1763
1778
|
if (rule.next2Not && rule.next2Not.includes(next2)) continue;
|
|
1764
1779
|
if (
|
|
1765
1780
|
rule.leading &&
|
|
1766
|
-
!rule.leading.includes(
|
|
1781
|
+
!rule.leading.includes(str.slice(0, rule.leading.length))
|
|
1767
1782
|
)
|
|
1768
1783
|
continue;
|
|
1769
1784
|
if (
|
|
1770
1785
|
rule.trailing &&
|
|
1771
|
-
!rule.trailing.includes(
|
|
1786
|
+
!rule.trailing.includes(str.slice(-rule.trailing.length))
|
|
1772
1787
|
)
|
|
1773
1788
|
continue;
|
|
1774
1789
|
if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
|
|
@@ -1778,7 +1793,7 @@ class Phonetic {
|
|
|
1778
1793
|
return undefined;
|
|
1779
1794
|
}
|
|
1780
1795
|
encode(word) {
|
|
1781
|
-
const { map = {}
|
|
1796
|
+
const { map = {} } = this.map;
|
|
1782
1797
|
word = this.applyPattern(word);
|
|
1783
1798
|
const chars = this.word2Chars(word);
|
|
1784
1799
|
const charLen = chars.length;
|
|
@@ -1786,7 +1801,7 @@ class Phonetic {
|
|
|
1786
1801
|
lastCode = null;
|
|
1787
1802
|
for (let i = 0; i < charLen; i++) {
|
|
1788
1803
|
const char = chars[i];
|
|
1789
|
-
if (
|
|
1804
|
+
if (this.ignoreSet.has(char)) continue;
|
|
1790
1805
|
const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
|
|
1791
1806
|
if (mapped === undefined) continue;
|
|
1792
1807
|
((code += mapped), (lastCode = mapped));
|
|
@@ -1805,7 +1820,9 @@ class Phonetic {
|
|
|
1805
1820
|
? input
|
|
1806
1821
|
: (input + pad.repeat(length)).slice(0, length);
|
|
1807
1822
|
}
|
|
1808
|
-
word2Chars
|
|
1823
|
+
word2Chars(word) {
|
|
1824
|
+
return Array.from(word.toLowerCase());
|
|
1825
|
+
}
|
|
1809
1826
|
exitEarly(code, i) {
|
|
1810
1827
|
const { length = -1 } = this.options;
|
|
1811
1828
|
return length > 0 && code.length >= length;
|
|
@@ -1814,37 +1831,52 @@ class Phonetic {
|
|
|
1814
1831
|
return code;
|
|
1815
1832
|
}
|
|
1816
1833
|
loop(words) {
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
const
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1834
|
+
return ErrorUtil.wrap(
|
|
1835
|
+
() => {
|
|
1836
|
+
const index = [];
|
|
1837
|
+
for (const word of words) {
|
|
1838
|
+
let key = Phonetic.cache.key(this.algo, [word]);
|
|
1839
|
+
if (key) key += this.optKey;
|
|
1840
|
+
const code =
|
|
1841
|
+
Phonetic.cache.get(key || '') ??
|
|
1842
|
+
(() => {
|
|
1843
|
+
const res = this.encode(word);
|
|
1844
|
+
if (key) Phonetic.cache.set(key, res);
|
|
1845
|
+
return res;
|
|
1846
|
+
})();
|
|
1847
|
+
if (code && code.length) index.push(this.equalLen(code));
|
|
1848
|
+
}
|
|
1849
|
+
return index;
|
|
1850
|
+
},
|
|
1851
|
+
`Failed to generate phonetic index`,
|
|
1852
|
+
{ algo: this.algo, words }
|
|
1853
|
+
);
|
|
1830
1854
|
}
|
|
1831
1855
|
async loopAsync(words) {
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1856
|
+
return ErrorUtil.wrapAsync(
|
|
1857
|
+
async () => {
|
|
1858
|
+
const index = [];
|
|
1859
|
+
for (const word of words) {
|
|
1860
|
+
const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
|
|
1861
|
+
const code = await Promise.resolve(
|
|
1862
|
+
Phonetic.cache.get(key || '') ??
|
|
1863
|
+
(() => {
|
|
1864
|
+
const res = this.encode(word);
|
|
1865
|
+
if (key) Phonetic.cache.set(key, res);
|
|
1866
|
+
return res;
|
|
1867
|
+
})()
|
|
1868
|
+
);
|
|
1869
|
+
if (code && code.length) index.push(this.equalLen(code));
|
|
1870
|
+
}
|
|
1871
|
+
return index;
|
|
1872
|
+
},
|
|
1873
|
+
`Failed to generate phonetic index asynchronously`,
|
|
1874
|
+
{ algo: this.algo, words }
|
|
1875
|
+
);
|
|
1876
|
+
}
|
|
1877
|
+
getAlgoName() {
|
|
1878
|
+
return this.algo;
|
|
1846
1879
|
}
|
|
1847
|
-
getAlgoName = () => this.algo;
|
|
1848
1880
|
getIndex(input) {
|
|
1849
1881
|
const { delimiter = ' ' } = this.options;
|
|
1850
1882
|
return profiler$1.run(() =>
|
|
@@ -1867,10 +1899,11 @@ const PhoneticMappingRegistry = (() => {
|
|
|
1867
1899
|
return Object.freeze({
|
|
1868
1900
|
add(algo, id, map, update = false) {
|
|
1869
1901
|
const mappings = maps(algo);
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1902
|
+
ErrorUtil.assert(
|
|
1903
|
+
!(!id || id in mappings) || update,
|
|
1904
|
+
`Entry <${id}> already exists / use <update=true> to overwrite`,
|
|
1905
|
+
{ algo, id }
|
|
1906
|
+
);
|
|
1874
1907
|
mappings[id] = map;
|
|
1875
1908
|
},
|
|
1876
1909
|
remove(algo, id) {
|
|
@@ -2075,168 +2108,737 @@ class Metaphone extends Phonetic {
|
|
|
2075
2108
|
constructor(opt = {}) {
|
|
2076
2109
|
super('metaphone', opt);
|
|
2077
2110
|
}
|
|
2078
|
-
encode(word) {
|
|
2079
|
-
word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
|
|
2080
|
-
c === 'C' ? m : c
|
|
2081
|
-
);
|
|
2082
|
-
return super.encode(word);
|
|
2111
|
+
encode(word) {
|
|
2112
|
+
word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
|
|
2113
|
+
c === 'C' ? m : c
|
|
2114
|
+
);
|
|
2115
|
+
return super.encode(word);
|
|
2116
|
+
}
|
|
2117
|
+
adjustCode(code) {
|
|
2118
|
+
return code.slice(0, 1) + code.slice(1).replace(Metaphone.REGEX.vowel, '');
|
|
2119
|
+
}
|
|
2120
|
+
}
|
|
2121
|
+
PhoneticRegistry.add('metaphone', Metaphone);
|
|
2122
|
+
PhoneticMappingRegistry.add('metaphone', 'en90', {
|
|
2123
|
+
map: {
|
|
2124
|
+
a: 'A',
|
|
2125
|
+
b: 'B',
|
|
2126
|
+
c: 'K',
|
|
2127
|
+
d: 'T',
|
|
2128
|
+
e: 'E',
|
|
2129
|
+
f: 'F',
|
|
2130
|
+
g: 'K',
|
|
2131
|
+
h: 'H',
|
|
2132
|
+
i: 'I',
|
|
2133
|
+
j: 'J',
|
|
2134
|
+
k: 'K',
|
|
2135
|
+
l: 'L',
|
|
2136
|
+
m: 'M',
|
|
2137
|
+
n: 'N',
|
|
2138
|
+
o: 'O',
|
|
2139
|
+
p: 'P',
|
|
2140
|
+
q: 'K',
|
|
2141
|
+
r: 'R',
|
|
2142
|
+
s: 'S',
|
|
2143
|
+
t: 'T',
|
|
2144
|
+
u: 'U',
|
|
2145
|
+
v: 'F',
|
|
2146
|
+
w: 'W',
|
|
2147
|
+
x: 'KS',
|
|
2148
|
+
y: 'Y',
|
|
2149
|
+
z: 'S'
|
|
2150
|
+
},
|
|
2151
|
+
ruleset: [
|
|
2152
|
+
{ char: 'a', position: 'start', next: ['e'], code: '' },
|
|
2153
|
+
{ char: 'g', position: 'start', next: ['n'], code: '' },
|
|
2154
|
+
{ char: 'k', position: 'start', next: ['n'], code: '' },
|
|
2155
|
+
{ char: 'p', position: 'start', next: ['n'], code: '' },
|
|
2156
|
+
{ char: 'w', position: 'start', next: ['r'], code: '' },
|
|
2157
|
+
{ char: 'b', position: 'end', prev: ['m'], code: '' },
|
|
2158
|
+
{ char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
|
|
2159
|
+
{ char: 'c', next: ['i'], next2: ['a'], code: 'X' },
|
|
2160
|
+
{ char: 'c', next: ['e', 'i', 'y'], code: 'S' },
|
|
2161
|
+
{ char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
|
|
2162
|
+
{
|
|
2163
|
+
char: 'g',
|
|
2164
|
+
next: ['h'],
|
|
2165
|
+
next2Not: ['', 'a', 'e', 'i', 'o', 'u'],
|
|
2166
|
+
code: ''
|
|
2167
|
+
},
|
|
2168
|
+
{ char: 'g', trailing: 'n', code: '' },
|
|
2169
|
+
{ char: 'g', trailing: 'ned', code: '' },
|
|
2170
|
+
{ char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
|
|
2171
|
+
{
|
|
2172
|
+
char: 'h',
|
|
2173
|
+
prev: ['a', 'e', 'i', 'o', 'u'],
|
|
2174
|
+
nextNot: ['a', 'e', 'i', 'o', 'u'],
|
|
2175
|
+
code: ''
|
|
2176
|
+
},
|
|
2177
|
+
{ char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
|
|
2178
|
+
{ char: 'k', prev: ['c'], code: '' },
|
|
2179
|
+
{ char: 'p', next: ['h'], code: 'F' },
|
|
2180
|
+
{ char: 's', next: ['h'], code: 'X' },
|
|
2181
|
+
{ char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2182
|
+
{ char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2183
|
+
{ char: 't', next: ['h'], code: '0' },
|
|
2184
|
+
{ char: 't', next: ['c'], next2: ['h'], code: '' },
|
|
2185
|
+
{ char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
|
|
2186
|
+
{ char: 'h', leading: 'w', code: '' },
|
|
2187
|
+
{ char: 'x', position: 'start', code: 'S' },
|
|
2188
|
+
{ char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
|
|
2189
|
+
]
|
|
2190
|
+
});
|
|
2191
|
+
|
|
2192
|
+
class Soundex extends Phonetic {
|
|
2193
|
+
static default = {
|
|
2194
|
+
map: 'en',
|
|
2195
|
+
delimiter: ' ',
|
|
2196
|
+
length: 4,
|
|
2197
|
+
pad: '0',
|
|
2198
|
+
dedupe: true
|
|
2199
|
+
};
|
|
2200
|
+
constructor(opt = {}) {
|
|
2201
|
+
super('soundex', opt);
|
|
2202
|
+
}
|
|
2203
|
+
adjustCode(code, chars) {
|
|
2204
|
+
return chars[0].toUpperCase() + code.slice(1).replaceAll('0', '');
|
|
2205
|
+
}
|
|
2206
|
+
}
|
|
2207
|
+
PhoneticRegistry.add('soundex', Soundex);
|
|
2208
|
+
PhoneticMappingRegistry.add('soundex', 'en', {
|
|
2209
|
+
map: {
|
|
2210
|
+
a: '0',
|
|
2211
|
+
e: '0',
|
|
2212
|
+
h: '0',
|
|
2213
|
+
i: '0',
|
|
2214
|
+
o: '0',
|
|
2215
|
+
u: '0',
|
|
2216
|
+
w: '0',
|
|
2217
|
+
y: '0',
|
|
2218
|
+
b: '1',
|
|
2219
|
+
f: '1',
|
|
2220
|
+
p: '1',
|
|
2221
|
+
v: '1',
|
|
2222
|
+
c: '2',
|
|
2223
|
+
g: '2',
|
|
2224
|
+
j: '2',
|
|
2225
|
+
k: '2',
|
|
2226
|
+
q: '2',
|
|
2227
|
+
s: '2',
|
|
2228
|
+
x: '2',
|
|
2229
|
+
z: '2',
|
|
2230
|
+
d: '3',
|
|
2231
|
+
t: '3',
|
|
2232
|
+
l: '4',
|
|
2233
|
+
m: '5',
|
|
2234
|
+
n: '5',
|
|
2235
|
+
r: '6'
|
|
2236
|
+
}
|
|
2237
|
+
});
|
|
2238
|
+
PhoneticMappingRegistry.add('soundex', 'de', {
|
|
2239
|
+
map: {
|
|
2240
|
+
a: '0',
|
|
2241
|
+
ä: '0',
|
|
2242
|
+
e: '0',
|
|
2243
|
+
h: '0',
|
|
2244
|
+
i: '0',
|
|
2245
|
+
j: '0',
|
|
2246
|
+
o: '0',
|
|
2247
|
+
ö: '0',
|
|
2248
|
+
u: '0',
|
|
2249
|
+
ü: '0',
|
|
2250
|
+
y: '0',
|
|
2251
|
+
b: '1',
|
|
2252
|
+
f: '1',
|
|
2253
|
+
p: '1',
|
|
2254
|
+
v: '1',
|
|
2255
|
+
w: '1',
|
|
2256
|
+
c: '2',
|
|
2257
|
+
g: '2',
|
|
2258
|
+
k: '2',
|
|
2259
|
+
q: '2',
|
|
2260
|
+
s: '2',
|
|
2261
|
+
ß: '2',
|
|
2262
|
+
x: '2',
|
|
2263
|
+
z: '2',
|
|
2264
|
+
d: '3',
|
|
2265
|
+
t: '3',
|
|
2266
|
+
l: '4',
|
|
2267
|
+
m: '5',
|
|
2268
|
+
n: '5',
|
|
2269
|
+
r: '6'
|
|
2270
|
+
},
|
|
2271
|
+
ruleset: [{ char: 'c', next: ['h'], code: '7' }]
|
|
2272
|
+
});
|
|
2273
|
+
|
|
2274
|
+
class OptionsValidator {
|
|
2275
|
+
static ALLOWED_FLAGS = new Set([
|
|
2276
|
+
'd',
|
|
2277
|
+
'u',
|
|
2278
|
+
'x',
|
|
2279
|
+
'w',
|
|
2280
|
+
't',
|
|
2281
|
+
'r',
|
|
2282
|
+
's',
|
|
2283
|
+
'k',
|
|
2284
|
+
'n',
|
|
2285
|
+
'i'
|
|
2286
|
+
]);
|
|
2287
|
+
static ALLOWED_OUTPUT = new Set(['orig', 'prep']);
|
|
2288
|
+
static ALLOWED_MODES = new Set(['default', 'batch', 'single', 'pairwise']);
|
|
2289
|
+
static ALLOWED_SORT = new Set(['asc', 'desc']);
|
|
2290
|
+
static PROCESSORS = {
|
|
2291
|
+
phonetic: (opt) => {
|
|
2292
|
+
if (!opt) return;
|
|
2293
|
+
OptionsValidator.validatePhoneticName(opt.algo);
|
|
2294
|
+
OptionsValidator.validatePhoneticOptions(opt.opt);
|
|
2295
|
+
}
|
|
2296
|
+
};
|
|
2297
|
+
static METRIC_OPT_MAP = {
|
|
2298
|
+
mode: (v) => OptionsValidator.validateMode(v),
|
|
2299
|
+
delimiter: (v) => OptionsValidator.validateString(v, 'opt.delimiter'),
|
|
2300
|
+
pad: (v) => OptionsValidator.validateString(v, 'opt.pad'),
|
|
2301
|
+
q: (v) => OptionsValidator.validateNumber(v, 'opt.q'),
|
|
2302
|
+
match: (v) => OptionsValidator.validateNumber(v, 'opt.match'),
|
|
2303
|
+
mismatch: (v) => OptionsValidator.validateNumber(v, 'opt.mismatch'),
|
|
2304
|
+
gap: (v) => OptionsValidator.validateNumber(v, 'opt.gap')
|
|
2305
|
+
};
|
|
2306
|
+
static PHONETIC_OPT_MAP = {
|
|
2307
|
+
map: (v) =>
|
|
2308
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.map'),
|
|
2309
|
+
delimiter: (v) =>
|
|
2310
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.delimiter'),
|
|
2311
|
+
length: (v) =>
|
|
2312
|
+
OptionsValidator.validateNumber(v, 'processors.phonetic.opt.length'),
|
|
2313
|
+
pad: (v) =>
|
|
2314
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.pad'),
|
|
2315
|
+
dedupe: (v) =>
|
|
2316
|
+
OptionsValidator.validateBoolean(v, 'processors.phonetic.opt.dedupe'),
|
|
2317
|
+
fallback: (v) =>
|
|
2318
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.fallback')
|
|
2319
|
+
};
|
|
2320
|
+
static CMPSTR_OPT_MAP = {
|
|
2321
|
+
raw: (v) => OptionsValidator.validateBoolean(v, 'raw'),
|
|
2322
|
+
removeZero: (v) => OptionsValidator.validateBoolean(v, 'removeZero'),
|
|
2323
|
+
safeEmpty: (v) => OptionsValidator.validateBoolean(v, 'safeEmpty'),
|
|
2324
|
+
flags: (v) => OptionsValidator.validateFlags(v),
|
|
2325
|
+
metric: (v) => OptionsValidator.validateMetricName(v),
|
|
2326
|
+
output: (v) => OptionsValidator.validateOutput(v),
|
|
2327
|
+
opt: (v) => OptionsValidator.validateMetricOptions(v),
|
|
2328
|
+
processors: (v) => OptionsValidator.validateProcessors(v),
|
|
2329
|
+
sort: (v) => OptionsValidator.validateSort(v, 'sort'),
|
|
2330
|
+
objectsOnly: (v) => OptionsValidator.validateBoolean(v, 'objectsOnly')
|
|
2331
|
+
};
|
|
2332
|
+
static set2string(set) {
|
|
2333
|
+
return Array.from(set).join(' | ');
|
|
2334
|
+
}
|
|
2335
|
+
static validateType(value, name, type) {
|
|
2336
|
+
if (value === undefined) return;
|
|
2337
|
+
if (typeof value !== type || (type === 'number' && Number.isNaN(value))) {
|
|
2338
|
+
throw new CmpStrValidationError(
|
|
2339
|
+
`Invalid option <${name}>: expected ${type}`,
|
|
2340
|
+
{ name, value }
|
|
2341
|
+
);
|
|
2342
|
+
}
|
|
2343
|
+
}
|
|
2344
|
+
static validateEnum(value, name, set) {
|
|
2345
|
+
if (value === undefined) return;
|
|
2346
|
+
if (typeof value !== 'string' || !set.has(value)) {
|
|
2347
|
+
throw new CmpStrValidationError(
|
|
2348
|
+
`Invalid option <${name}>: expected ${OptionsValidator.set2string(set)}`,
|
|
2349
|
+
{ name, value }
|
|
2350
|
+
);
|
|
2351
|
+
}
|
|
2352
|
+
}
|
|
2353
|
+
static validateMap(opt, map) {
|
|
2354
|
+
if (!opt) return;
|
|
2355
|
+
for (const k in opt) {
|
|
2356
|
+
const fn = map[k];
|
|
2357
|
+
if (!fn)
|
|
2358
|
+
throw new CmpStrValidationError(`Invalid option <${k}>`, {
|
|
2359
|
+
option: k,
|
|
2360
|
+
value: map[k]
|
|
2361
|
+
});
|
|
2362
|
+
fn(opt[k]);
|
|
2363
|
+
}
|
|
2364
|
+
}
|
|
2365
|
+
static validateRegistryName(value, name, label, has, list) {
|
|
2366
|
+
if (value === undefined) return;
|
|
2367
|
+
if (typeof value !== 'string' || value.length === 0)
|
|
2368
|
+
throw new CmpStrValidationError(
|
|
2369
|
+
`Invalid option <${name}>: expected non-empty string`,
|
|
2370
|
+
{ name, value }
|
|
2371
|
+
);
|
|
2372
|
+
if (!has(value))
|
|
2373
|
+
throw new CmpStrValidationError(`${label} <${value}> is not registered`, {
|
|
2374
|
+
name,
|
|
2375
|
+
value,
|
|
2376
|
+
available: list()
|
|
2377
|
+
});
|
|
2378
|
+
}
|
|
2379
|
+
static validateBoolean(value, name) {
|
|
2380
|
+
OptionsValidator.validateType(value, name, 'boolean');
|
|
2381
|
+
}
|
|
2382
|
+
static validateNumber(value, name) {
|
|
2383
|
+
OptionsValidator.validateType(value, name, 'number');
|
|
2384
|
+
}
|
|
2385
|
+
static validateString(value, name) {
|
|
2386
|
+
OptionsValidator.validateType(value, name, 'string');
|
|
2387
|
+
}
|
|
2388
|
+
static validateFlags(value) {
|
|
2389
|
+
if (value === undefined) return;
|
|
2390
|
+
if (typeof value !== 'string')
|
|
2391
|
+
throw new CmpStrValidationError(
|
|
2392
|
+
`Invalid option <flags>: expected string`,
|
|
2393
|
+
{ flags: value }
|
|
2394
|
+
);
|
|
2395
|
+
for (let i = 0; i < value.length; i++) {
|
|
2396
|
+
const ch = value[i];
|
|
2397
|
+
if (!OptionsValidator.ALLOWED_FLAGS.has(ch))
|
|
2398
|
+
throw new CmpStrValidationError(
|
|
2399
|
+
`Invalid normalization flag <${ch}> in <flags>: expected ${OptionsValidator.set2string(OptionsValidator.ALLOWED_FLAGS)}`,
|
|
2400
|
+
{ flags: value, invalid: ch }
|
|
2401
|
+
);
|
|
2402
|
+
}
|
|
2403
|
+
}
|
|
2404
|
+
static validateOutput(value) {
|
|
2405
|
+
OptionsValidator.validateEnum(
|
|
2406
|
+
value,
|
|
2407
|
+
'output',
|
|
2408
|
+
OptionsValidator.ALLOWED_OUTPUT
|
|
2409
|
+
);
|
|
2410
|
+
}
|
|
2411
|
+
static validateMode(value) {
|
|
2412
|
+
OptionsValidator.validateEnum(
|
|
2413
|
+
value,
|
|
2414
|
+
'mode',
|
|
2415
|
+
OptionsValidator.ALLOWED_MODES
|
|
2416
|
+
);
|
|
2417
|
+
}
|
|
2418
|
+
static validateSort(value, name) {
|
|
2419
|
+
if (value === undefined || typeof value === 'boolean') return;
|
|
2420
|
+
OptionsValidator.validateEnum(value, name, OptionsValidator.ALLOWED_SORT);
|
|
2421
|
+
}
|
|
2422
|
+
static validateMetricName(value) {
|
|
2423
|
+
OptionsValidator.validateRegistryName(
|
|
2424
|
+
value,
|
|
2425
|
+
'metric',
|
|
2426
|
+
'Comparison metric',
|
|
2427
|
+
MetricRegistry.has,
|
|
2428
|
+
MetricRegistry.list
|
|
2429
|
+
);
|
|
2430
|
+
}
|
|
2431
|
+
static validatePhoneticName(value) {
|
|
2432
|
+
OptionsValidator.validateRegistryName(
|
|
2433
|
+
value,
|
|
2434
|
+
'phonetic',
|
|
2435
|
+
'Phonetic algorithm',
|
|
2436
|
+
PhoneticRegistry.has,
|
|
2437
|
+
PhoneticRegistry.list
|
|
2438
|
+
);
|
|
2439
|
+
}
|
|
2440
|
+
static validateMetricOptions(opt) {
|
|
2441
|
+
OptionsValidator.validateMap(opt, OptionsValidator.METRIC_OPT_MAP);
|
|
2442
|
+
}
|
|
2443
|
+
static validatePhoneticOptions(opt) {
|
|
2444
|
+
OptionsValidator.validateMap(opt, OptionsValidator.PHONETIC_OPT_MAP);
|
|
2445
|
+
}
|
|
2446
|
+
static validateProcessors(opt) {
|
|
2447
|
+
if (!opt) return;
|
|
2448
|
+
for (const key in opt) {
|
|
2449
|
+
const fn = OptionsValidator.PROCESSORS[key];
|
|
2450
|
+
if (!fn)
|
|
2451
|
+
throw new CmpStrValidationError(
|
|
2452
|
+
`Invalid processor type <${key}> in <processors>: expected ${Object.keys(OptionsValidator.PROCESSORS).join(' | ')}`,
|
|
2453
|
+
{ processors: opt, invalid: key }
|
|
2454
|
+
);
|
|
2455
|
+
fn(opt[key]);
|
|
2456
|
+
}
|
|
2457
|
+
}
|
|
2458
|
+
static validateOptions(opt) {
|
|
2459
|
+
OptionsValidator.validateMap(opt, OptionsValidator.CMPSTR_OPT_MAP);
|
|
2460
|
+
}
|
|
2461
|
+
}
|
|
2462
|
+
|
|
2463
|
+
/**
 * Adapter between an array of records and string-based comparison
 * functions: it pulls one property out of every record as a string, runs a
 * caller-supplied comparison on those strings and maps the results back to
 * the records they came from.
 */
class StructuredData {
  /** Records handed to the constructor. */
  data;
  /** Name of the record property that is compared. */
  key;

  /** Comparator ordering results by ascending `res`. */
  static SORT_ASC = (a, b) => a.res - b.res;
  /** Comparator ordering results by descending `res`. */
  static SORT_DESC = (a, b) => b.res - a.res;

  /**
   * Factory equivalent of `new StructuredData(data, key)`.
   *
   * @param {object[]} data - records
   * @param {string} key - property to compare
   * @returns {StructuredData}
   */
  static create(data, key) {
    return new StructuredData(data, key);
  }

  /**
   * @param {object[]} data - records
   * @param {string} key - property to compare
   */
  constructor(data, key) {
    this.data = data;
    this.key = key;
  }

  /**
   * Reads `key` from every element of `arr` and stringifies it.
   * `null` and `undefined` values become the empty string.
   *
   * @param {object[]} arr - records to read from
   * @param {string} key - property name
   * @returns {string[]} one string per record, in record order
   */
  extractFrom(arr, key) {
    return Array.from({ length: arr.length }, (_, idx) => {
      const raw = arr[idx][key];
      return raw == null ? '' : String(raw);
    });
  }

  /**
   * Extracts the configured key from the instance's own records.
   *
   * @returns {string[]}
   */
  extract() {
    return this.extractFrom(this.data, this.key);
  }

  /**
   * @param {unknown} v
   * @returns {boolean} true when `v` is a non-null object with `a`, `b` and `res`
   */
  isMetricResult(v) {
    if (typeof v !== 'object' || v === null) return false;
    return 'a' in v && 'b' in v && 'res' in v;
  }

  /**
   * @param {unknown} v
   * @returns {boolean} true when `v` is a non-null object with `source`,
   *   `target` and `match`
   */
  isCmpStrResult(v) {
    if (typeof v !== 'object' || v === null) return false;
    return 'source' in v && 'target' in v && 'match' in v;
  }

  /**
   * Converts a result list into `{ a, b, res, ... , __idx }` entries, where
   * `__idx` is the entry's position in the input. The shape is decided from
   * the first element only.
   *
   * @param {unknown} results - metric-style or source/target/match-style results
   * @returns {object[]} normalized entries; `[]` for non-arrays and empty arrays
   * @throws {CmpStrValidationError} when the first element has neither shape
   */
  normalizeResults(results) {
    if (!Array.isArray(results) || results.length === 0) return [];
    const [head] = results;
    if (this.isMetricResult(head)) {
      return Array.from(results, (entry, i) => ({ ...entry, __idx: i }));
    }
    if (this.isCmpStrResult(head)) {
      return Array.from(results, (entry, i) => ({
        metric: 'unknown',
        a: entry.source,
        b: entry.target,
        res: entry.match,
        raw: entry.raw,
        __idx: i
      }));
    }
    throw new CmpStrValidationError(
      'Unsupported result format for StructuredData normalization.'
    );
  }

  /**
   * Maps normalized results back onto records.
   *
   * Each result is matched to a record through its `b` string: all
   * positions of that string in `extractedStrings` are collected, and
   * repeated occurrences of the same string walk through those positions in
   * order (wrapping around). When the string is unknown, the result's
   * `__idx` (or its loop position) is used instead. Results whose index
   * falls outside `sourceData` are dropped.
   *
   * @param {object[]} results - normalized results
   * @param {object[]} sourceData - records to map onto
   * @param {string[]} extractedStrings - strings extracted from `sourceData`
   * @param {boolean} [removeZero] - drop results whose `res` is exactly 0
   * @param {boolean} [objectsOnly] - return the bare records instead of wrappers
   * @returns {object[]} records, or `{ obj, key, result, raw? }` wrappers
   */
  rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
    const m = extractedStrings.length;
    const n = results.length;
    // Both maps come from Pool and are handed back in the finally block below.
    const positionsByString = Pool.acquire('map', m);
    const seenCount = Pool.acquire('map', n);
    positionsByString.clear();
    seenCount.clear();
    try {
      for (const [pos, str] of extractedStrings.entries()) {
        const bucket = positionsByString.get(str);
        if (bucket) bucket.push(pos);
        else positionsByString.set(str, [pos]);
      }
      const output = [];
      for (const [i, result] of results.entries()) {
        if (removeZero && result.res === 0) continue;
        const targetStr = result.b || '';
        const bucket = positionsByString.get(targetStr);
        let dataIndex;
        if (bucket && bucket.length > 0) {
          // n-th occurrence of a string maps to its n-th position (modulo).
          const seen = seenCount.get(targetStr) ?? 0;
          seenCount.set(targetStr, seen + 1);
          dataIndex = bucket[seen % bucket.length];
        } else {
          dataIndex = result.__idx ?? i;
        }
        if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
        const sourceObj = sourceData[dataIndex];
        if (objectsOnly) {
          output.push(sourceObj);
          continue;
        }
        const entry = {
          obj: sourceObj,
          key: this.key,
          result: {
            source: result.a,
            target: extractedStrings[dataIndex] || targetStr,
            match: result.res
          }
        };
        if (result.raw) entry.raw = result.raw;
        output.push(entry);
      }
      return output;
    } finally {
      Pool.release('map', positionsByString, m);
      Pool.release('map', seenCount, n);
    }
  }

  /**
   * Sorts `results` in place by `res`. A falsy `sort` or a list of at most
   * one entry is returned untouched; 'asc' sorts ascending, any other truthy
   * value sorts descending.
   *
   * @param {object[]} results
   * @param {boolean|string} [sort]
   * @returns {object[]} the same array instance
   */
  sort(results, sort) {
    if (!sort || results.length <= 1) return results;
    const comparator =
      sort === 'asc' ? StructuredData.SORT_ASC : StructuredData.SORT_DESC;
    return results.sort(comparator);
  }

  /**
   * Normalizes, sorts and rebuilds raw comparison results using
   * `opt.sort`, `opt.removeZero` and `opt.objectsOnly`.
   *
   * @param {unknown} results - raw comparison results
   * @param {string[]} extractedStrings - strings extracted from `this.data`
   * @param {object} [opt]
   * @returns {object[]}
   */
  finalizeLookup(results, extractedStrings, opt) {
    const normalized = this.normalizeResults(results);
    const ordered = this.sort(normalized, opt?.sort);
    return this.rebuild(
      ordered,
      this.data,
      extractedStrings,
      opt?.removeZero,
      opt?.objectsOnly
    );
  }

  /**
   * Runs `fn` and finalizes its result inside `ErrorUtil.wrap`.
   *
   * @param {Function} fn - zero-argument function producing raw results
   * @param {string[]} extractedStrings
   * @param {object} [opt]
   * @returns {object[]}
   */
  performLookup(fn, extractedStrings, opt) {
    const task = () => this.finalizeLookup(fn(), extractedStrings, opt);
    return ErrorUtil.wrap(task, 'StructuredData lookup failed', {
      key: this.key
    });
  }

  /**
   * Async counterpart of `performLookup`, using `ErrorUtil.wrapAsync`.
   *
   * @param {Function} fn - zero-argument function producing (a promise of) raw results
   * @param {string[]} extractedStrings
   * @param {object} [opt]
   * @returns {Promise<object[]>}
   */
  async performLookupAsync(fn, extractedStrings, opt) {
    const task = async () =>
      this.finalizeLookup(await fn(), extractedStrings, opt);
    return await ErrorUtil.wrapAsync(
      task,
      'StructuredData async lookup failed',
      { key: this.key }
    );
  }

  /**
   * Compares `query` against the extracted strings of all records.
   *
   * @param {Function} fn - called as `fn(query, strings, opt)`
   * @param {string} query
   * @param {object} [opt]
   * @returns {object[]}
   */
  lookup(fn, query, opt) {
    const targets = this.extract();
    try {
      return this.performLookup(() => fn(query, targets, opt), targets, opt);
    } finally {
      // The extracted array is handed to the pool whether or not the lookup threw.
      Pool.release('string[]', targets, targets.length);
    }
  }

  /**
   * Async counterpart of `lookup`.
   *
   * @param {Function} fn - called as `fn(query, strings, opt)`
   * @param {string} query
   * @param {object} [opt]
   * @returns {Promise<object[]>}
   */
  async lookupAsync(fn, query, opt) {
    const targets = this.extract();
    try {
      return await this.performLookupAsync(
        () => fn(query, targets, opt),
        targets,
        opt
      );
    } finally {
      Pool.release('string[]', targets, targets.length);
    }
  }

  /**
   * Compares the extracted strings of this instance's records with those of
   * another record list; results are mapped back onto this instance's records.
   *
   * @param {Function} fn - called as `fn(ownStrings, otherStrings, opt)`
   * @param {object[]} other - second record list
   * @param {string} otherKey - property to read from `other`
   * @param {object} [opt]
   * @returns {object[]}
   */
  lookupPairs(fn, other, otherKey, opt) {
    const own = this.extract();
    const theirs = this.extractFrom(other, otherKey);
    try {
      return this.performLookup(() => fn(own, theirs, opt), own, opt);
    } finally {
      Pool.release('string[]', own, own.length);
      Pool.release('string[]', theirs, theirs.length);
    }
  }

  /**
   * Async counterpart of `lookupPairs`.
   *
   * @param {Function} fn - called as `fn(ownStrings, otherStrings, opt)`
   * @param {object[]} other - second record list
   * @param {string} otherKey - property to read from `other`
   * @param {object} [opt]
   * @returns {Promise<object[]>}
   */
  async lookupPairsAsync(fn, other, otherKey, opt) {
    const own = this.extract();
    const theirs = this.extractFrom(other, otherKey);
    try {
      return await this.performLookupAsync(
        () => fn(own, theirs, opt),
        own,
        opt
      );
    } finally {
      Pool.release('string[]', own, own.length);
      Pool.release('string[]', theirs, theirs.length);
    }
  }
}
|
|
2648
|
+
|
|
2649
|
+
/**
 * Collects word, sentence, character and syllable statistics for a piece
 * of text and derives several readability scores from them.
 *
 * The text is trimmed, tokenized and counted once in the constructor;
 * syllable statistics are computed lazily on first use.
 */
class TextAnalyzer {
  /** Regular expressions shared by all instances. */
  static REGEX = {
    number: /\d/,
    sentence: /(?<=[.!?])\s+/,
    word: /\p{L}+/gu,
    nonWord: /[^\p{L}]/gu,
    vowelGroup: /[aeiouy]+/g,
    letter: /\p{L}/gu,
    ucLetter: /\p{Lu}/gu
  };

  /** Trimmed input text. */
  text;
  /** Lower-cased words (runs of Unicode letters) in order of appearance. */
  words = [];
  /** Sentences, split on whitespace that follows `.`, `!` or `?`. */
  sentences = [];
  /** Character (code point) -> number of occurrences in `text`. */
  charFrequency = new Map();
  /** Word -> number of occurrences. */
  wordHistogram = new Map();
  /** Cleaned word -> estimated syllable count. */
  syllableCache = new Map();
  /** Lazily computed result of `computeSyllableStats()`. */
  syllableStats;

  /**
   * @param {string} input - text to analyze; surrounding whitespace is removed
   */
  constructor(input) {
    this.text = input.trim();
    this.tokenize();
    this.computeFrequencies();
  }

  /**
   * Fills `words` (appending) and replaces `sentences`.
   */
  tokenize() {
    const lcText = this.text.toLowerCase();
    // matchAll() works on a copy of the regex, so the shared static /g
    // pattern's lastIndex is never touched (the old exec() loop mutated it).
    for (const match of lcText.matchAll(TextAnalyzer.REGEX.word)) {
      this.words.push(match[0]);
    }
    this.sentences = this.text
      .split(TextAnalyzer.REGEX.sentence)
      .filter(Boolean);
  }

  /**
   * Counts every character of `text` and every entry of `words`.
   */
  computeFrequencies() {
    // Iterating a string yields whole code points, not UTF-16 units.
    for (const char of this.text) {
      this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
    }
    for (const word of this.words) {
      this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
    }
  }

  /**
   * Estimates the syllables of a word as the number of `[aeiouy]+` groups
   * in its NFC-normalized, lower-cased, letters-only form. Words without
   * such a group count as one syllable. Results are cached per cleaned word.
   *
   * @param {string} word
   * @returns {number}
   */
  estimateSyllables(word) {
    const clean = word
      .normalize('NFC')
      .toLowerCase()
      .replace(TextAnalyzer.REGEX.nonWord, '');
    if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
    const groups = clean.match(TextAnalyzer.REGEX.vowelGroup);
    const count = groups ? groups.length : 1;
    this.syllableCache.set(clean, count);
    return count;
  }

  /**
   * Computes (once) and returns the syllable statistics of all words.
   *
   * @returns {{ total: number, mono: number, perWord: number[], avg: number, median: number }}
   *   `perWord` is sorted ascending; `avg` and `median` are 0 without words
   */
  computeSyllableStats() {
    return (this.syllableStats ||= (() => {
      const perWord = this.words
        .map((w) => this.estimateSyllables(w))
        .sort((a, b) => a - b);
      const size = perWord.length;
      const total = perWord.reduce((sum, s) => sum + s, 0);
      const mono = perWord.filter((s) => s === 1).length;
      let median = 0;
      if (size > 0) {
        const mid = Math.floor(size / 2);
        median =
          size % 2 === 0 ? (perWord[mid - 1] + perWord[mid]) / 2 : perWord[mid];
      }
      return {
        total,
        mono,
        perWord,
        avg: size ? total / size : 0,
        median
      };
    })());
  }

  /** @returns {number} length of the trimmed text in UTF-16 code units */
  getLength = () => this.text.length;

  /** @returns {number} number of words */
  getWordCount = () => this.words.length;

  /** @returns {number} number of sentences */
  getSentenceCount = () => this.sentences.length;

  /** @returns {number} mean word length, 0 without words */
  getAvgWordLength() {
    return this.words.length
      ? this.words.join('').length / this.words.length
      : 0;
  }

  /** @returns {number} mean number of words per sentence, 0 without sentences */
  getAvgSentenceLength() {
    return this.sentences.length
      ? this.words.length / this.sentences.length
      : 0;
  }

  /** @returns {Object<string, number>} word -> occurrences */
  getWordHistogram() {
    return Object.fromEntries(this.wordHistogram);
  }

  /**
   * @param {number} [limit=5] - maximum number of words to return
   * @returns {string[]} words ordered by descending frequency
   */
  getMostCommonWords(limit = 5) {
    return [...this.wordHistogram.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, limit)
      .map((e) => e[0]);
  }

  /** @returns {string[]} words that occur exactly once */
  getHapaxLegomena() {
    return [...this.wordHistogram.entries()]
      .filter(([, c]) => c === 1)
      .map((e) => e[0]);
  }

  /** @returns {boolean} true when the text contains a digit */
  hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);

  /** @returns {number} upper-case letters divided by all letters, 0 without letters */
  getUpperCaseRatio() {
    const letters = this.text.match(TextAnalyzer.REGEX.letter) || [];
    const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
    return letters.length ? upper / letters.length : 0;
  }

  /** @returns {Object<string, number>} character -> occurrences */
  getCharFrequency() {
    return Object.fromEntries(this.charFrequency);
  }

  /**
   * Returns character counts keyed by upper-case hexadecimal code point,
   * zero-padded to at least four digits (e.g. "0041", "1F600").
   *
   * @returns {Object<string, number>}
   */
  getUnicodeCodepoints() {
    const result = {};
    for (const [char, count] of this.charFrequency) {
      // charFrequency keys are whole code points, so codePointAt() is needed:
      // charCodeAt() would report only the high surrogate of astral characters.
      const hex = char
        .codePointAt(0)
        .toString(16)
        .padStart(4, '0')
        .toUpperCase();
      result[hex] = (result[hex] || 0) + count;
    }
    return result;
  }

  /**
   * @param {number} [len=7] - minimum length of a "long" word
   * @returns {number} share of words with at least `len` characters, 0 without words
   */
  getLongWordRatio(len = 7) {
    const long = this.words.filter((w) => w.length >= len).length;
    return this.words.length ? long / this.words.length : 0;
  }

  /**
   * @param {number} [len=3] - maximum length of a "short" word
   * @returns {number} share of words with at most `len` characters, 0 without words
   */
  getShortWordRatio(len = 3) {
    const short = this.words.filter((w) => w.length <= len).length;
    return this.words.length ? short / this.words.length : 0;
  }

  /** @returns {number} estimated syllables of all words */
  getSyllablesCount() {
    return this.computeSyllableStats().total;
  }

  /** @returns {number} number of words estimated at one syllable */
  getMonosyllabicWordCount() {
    return this.computeSyllableStats().mono;
  }

  /**
   * @param {number} min
   * @returns {number} number of words with at least `min` estimated syllables
   */
  getMinSyllablesWordCount(min) {
    return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
  }

  /**
   * @param {number} max
   * @returns {number} number of words with at most `max` estimated syllables
   */
  getMaxSyllablesWordCount(max) {
    return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
  }

  /** @returns {number} mean estimated syllables per word */
  getAvgSyllablesPerWord() {
    return this.computeSyllableStats().avg;
  }

  /** @returns {number} median estimated syllables per word */
  getMedianSyllablesPerWord() {
    return this.computeSyllableStats().median;
  }

  /**
   * Computes `100 * ln(N) / (1 - V1 / V)` with N = word count, V = distinct
   * words and V1 = words occurring once.
   *
   * @returns {number} the statistic, or 0 when it is not a finite number
   *   (no words, or every distinct word occurs exactly once)
   */
  getHonoresR() {
    const distinct = this.wordHistogram.size;
    if (distinct === 0) return 0;
    const hapaxShare = this.getHapaxLegomena().length / distinct;
    const r = (100 * Math.log(this.words.length)) / (1 - hapaxShare);
    // Division never throws in JavaScript, so the fallback has to be an
    // explicit finiteness check rather than a try/catch.
    return Number.isFinite(r) ? r : 0;
  }

  /**
   * @param {number} [wpm=200] - reading speed in words per minute
   * @returns {number} word count divided by `wpm`; a `wpm` that is not a
   *   positive finite number is treated as 1
   */
  getReadingTime(wpm = 200) {
    const rate = Number.isFinite(wpm) && wpm > 0 ? wpm : 1;
    return this.words.length / rate;
  }

  /**
   * Computes a readability score from the average sentence length (words per
   * sentence) and the average syllables per word. Counts of zero are
   * replaced by 1 before dividing.
   *
   * @param {string} [metric='flesch'] - 'flesch', 'fleschde' or 'kincaid'
   * @returns {number|undefined} the score; `undefined` for any other metric name
   */
  getReadabilityScore(metric = 'flesch') {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const y = this.getSyllablesCount() || 1;
    const asl = w / s;
    const asw = y / w;
    switch (metric) {
      case 'flesch':
        return 206.835 - 1.015 * asl - 84.6 * asw;
      case 'fleschde':
        return 180 - asl - 58.5 * asw;
      case 'kincaid':
        return 0.39 * asl + 11.8 * asw - 15.59;
    }
  }

  /**
   * @returns {number} words per sentence plus the percentage of long words
   *   (default threshold of `getLongWordRatio`)
   */
  getLIXScore() {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const l = this.getLongWordRatio() * w;
    return w / s + (l / w) * 100;
  }

  /**
   * Computes four scores from the percentage of words with three or more
   * syllables, the average sentence length, the percentage of long words and
   * the percentage of one-syllable words.
   * NOTE(review): presumably the four "Wiener Sachtextformel" variants — confirm.
   *
   * @returns {number[]} four scores
   */
  getWSTFScore() {
    const w = this.words.length || 1;
    const h = (this.getMinSyllablesWordCount(3) / w) * 100;
    const s = this.getAvgSentenceLength();
    const l = this.getLongWordRatio() * 100;
    const m = (this.getMonosyllabicWordCount() / w) * 100;
    return [
      0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
      0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
      0.2963 * h + 0.1905 * s - 1.1144,
      0.2744 * h + 0.2656 * s - 1.693
    ];
  }
}
|
|
2240
2842
|
|
|
2241
2843
|
const profiler = Profiler.getInstance();
|
|
2242
2844
|
class CmpStr {
|
|
@@ -2288,29 +2890,26 @@ class CmpStr {
|
|
|
2288
2890
|
}
|
|
2289
2891
|
assert(cond, test) {
|
|
2290
2892
|
switch (cond) {
|
|
2893
|
+
default:
|
|
2894
|
+
throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
|
|
2291
2895
|
case 'metric':
|
|
2292
|
-
|
|
2293
|
-
throw new Error(
|
|
2294
|
-
`CmpStr <metric> must be set, call .setMetric(), ` +
|
|
2295
|
-
`use CmpStr.metric.list() for available metrics`
|
|
2296
|
-
);
|
|
2896
|
+
OptionsValidator.validateMetricName(test);
|
|
2297
2897
|
break;
|
|
2298
2898
|
case 'phonetic':
|
|
2299
|
-
|
|
2300
|
-
throw new Error(
|
|
2301
|
-
`CmpStr <phonetic> must be set, call .setPhonetic(), ` +
|
|
2302
|
-
`use CmpStr.phonetic.list() for available phonetic algorithms`
|
|
2303
|
-
);
|
|
2899
|
+
OptionsValidator.validatePhoneticName(test);
|
|
2304
2900
|
break;
|
|
2305
|
-
default:
|
|
2306
|
-
throw new Error(`Cmpstr condition <${cond}> unknown`);
|
|
2307
2901
|
}
|
|
2308
2902
|
}
|
|
2309
2903
|
assertMany(...cond) {
|
|
2310
2904
|
for (const [c, test] of cond) this.assert(c, test);
|
|
2311
2905
|
}
|
|
2312
2906
|
resolveOptions(opt) {
|
|
2313
|
-
|
|
2907
|
+
const merged = DeepMerge.merge(
|
|
2908
|
+
{ ...(this.options ?? Object.create(null)) },
|
|
2909
|
+
opt
|
|
2910
|
+
);
|
|
2911
|
+
OptionsValidator.validateOptions(merged);
|
|
2912
|
+
return merged;
|
|
2314
2913
|
}
|
|
2315
2914
|
normalize(input, flags) {
|
|
2316
2915
|
return Normalizer.normalize(input, flags ?? this.options.flags ?? '');
|
|
@@ -2326,7 +2925,7 @@ class CmpStr {
|
|
|
2326
2925
|
return input;
|
|
2327
2926
|
}
|
|
2328
2927
|
postProcess(result, opt) {
|
|
2329
|
-
if (
|
|
2928
|
+
if (Array.isArray(result) && opt?.removeZero)
|
|
2330
2929
|
result = result.filter((r) => r.res > 0);
|
|
2331
2930
|
return result;
|
|
2332
2931
|
}
|
|
@@ -2344,64 +2943,114 @@ class CmpStr {
|
|
|
2344
2943
|
compute(a, b, opt, mode, raw, skip) {
|
|
2345
2944
|
const resolved = this.resolveOptions(opt);
|
|
2346
2945
|
this.assert('metric', resolved.metric);
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
|
|
2357
|
-
|
|
2358
|
-
|
|
2359
|
-
|
|
2360
|
-
|
|
2361
|
-
|
|
2362
|
-
|
|
2946
|
+
return ErrorUtil.wrap(
|
|
2947
|
+
() => {
|
|
2948
|
+
const A = skip ? a : this.prepare(a, resolved);
|
|
2949
|
+
const B = skip ? b : this.prepare(b, resolved);
|
|
2950
|
+
if (
|
|
2951
|
+
resolved.safeEmpty &&
|
|
2952
|
+
((Array.isArray(A) && A.length === 0) ||
|
|
2953
|
+
(Array.isArray(B) && B.length === 0) ||
|
|
2954
|
+
A === '' ||
|
|
2955
|
+
B === '')
|
|
2956
|
+
) {
|
|
2957
|
+
return [];
|
|
2958
|
+
}
|
|
2959
|
+
const metric = factory['metric'](resolved.metric, A, B, resolved.opt);
|
|
2960
|
+
if (resolved.output !== 'prep') metric.setOriginal(a, b);
|
|
2961
|
+
metric.run(mode);
|
|
2962
|
+
const result = this.postProcess(metric.getResults(), resolved);
|
|
2963
|
+
return this.output(result, raw ?? resolved.raw);
|
|
2964
|
+
},
|
|
2965
|
+
`Failed to compute metric <${resolved.metric}> for the given inputs`,
|
|
2966
|
+
{ a, b, options: opt }
|
|
2967
|
+
);
|
|
2363
2968
|
}
|
|
2364
2969
|
output(result, raw) {
|
|
2365
|
-
return (
|
|
2366
|
-
|
|
2367
|
-
|
|
2368
|
-
|
|
2369
|
-
|
|
2970
|
+
return ErrorUtil.wrap(
|
|
2971
|
+
() =>
|
|
2972
|
+
(raw ?? this.options.raw)
|
|
2973
|
+
? result
|
|
2974
|
+
: Array.isArray(result)
|
|
2975
|
+
? result.map((r) => ({ source: r.a, target: r.b, match: r.res }))
|
|
2976
|
+
: { source: result.a, target: result.b, match: result.res },
|
|
2977
|
+
`Failed to resolve output format for the metric result`,
|
|
2978
|
+
{ result, raw }
|
|
2979
|
+
);
|
|
2980
|
+
}
|
|
2981
|
+
clone() {
|
|
2982
|
+
const inst = Object.assign(
|
|
2983
|
+
Object.create(Object.getPrototypeOf(this)),
|
|
2984
|
+
this
|
|
2985
|
+
);
|
|
2986
|
+
inst.options = DeepMerge.merge(Object.create(null), this.options);
|
|
2987
|
+
return inst;
|
|
2370
2988
|
}
|
|
2371
|
-
clone = () => Object.assign(Object.create(Object.getPrototypeOf(this)), this);
|
|
2372
2989
|
reset() {
|
|
2373
|
-
|
|
2990
|
+
this.options = Object.create(null);
|
|
2374
2991
|
return this;
|
|
2375
2992
|
}
|
|
2376
2993
|
setOptions(opt) {
|
|
2994
|
+
OptionsValidator.validateOptions(opt);
|
|
2377
2995
|
this.options = opt;
|
|
2378
2996
|
return this;
|
|
2379
2997
|
}
|
|
2380
2998
|
mergeOptions(opt) {
|
|
2381
|
-
merge(this.options, opt);
|
|
2999
|
+
DeepMerge.merge(this.options, opt);
|
|
3000
|
+
OptionsValidator.validateOptions(this.options);
|
|
2382
3001
|
return this;
|
|
2383
3002
|
}
|
|
2384
3003
|
setSerializedOptions(opt) {
|
|
2385
|
-
|
|
2386
|
-
|
|
3004
|
+
try {
|
|
3005
|
+
const parsed = JSON.parse(opt);
|
|
3006
|
+
OptionsValidator.validateOptions(parsed);
|
|
3007
|
+
this.options = parsed;
|
|
3008
|
+
return this;
|
|
3009
|
+
} catch (err) {
|
|
3010
|
+
if (err instanceof SyntaxError)
|
|
3011
|
+
throw new CmpStrValidationError(
|
|
3012
|
+
`Failed to parse serialized options, invalid JSON string`,
|
|
3013
|
+
{ opt, error: err instanceof Error ? err.message : String(err) }
|
|
3014
|
+
);
|
|
3015
|
+
throw err;
|
|
3016
|
+
}
|
|
2387
3017
|
}
|
|
2388
3018
|
setOption(path, value) {
|
|
2389
|
-
set(this.options, path, value);
|
|
3019
|
+
DeepMerge.set(this.options, path, value);
|
|
3020
|
+
OptionsValidator.validateOptions(this.options);
|
|
2390
3021
|
return this;
|
|
2391
3022
|
}
|
|
2392
3023
|
rmvOption(path) {
|
|
2393
|
-
rmv(this.options, path);
|
|
3024
|
+
DeepMerge.rmv(this.options, path);
|
|
2394
3025
|
return this;
|
|
2395
3026
|
}
|
|
2396
|
-
setRaw
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
3027
|
+
setRaw(enable) {
|
|
3028
|
+
return this.setOption('raw', enable);
|
|
3029
|
+
}
|
|
3030
|
+
setMetric(name) {
|
|
3031
|
+
return this.setOption('metric', name);
|
|
3032
|
+
}
|
|
3033
|
+
setFlags(flags) {
|
|
3034
|
+
return this.setOption('flags', flags);
|
|
3035
|
+
}
|
|
3036
|
+
rmvFlags() {
|
|
3037
|
+
return this.rmvOption('flags');
|
|
3038
|
+
}
|
|
3039
|
+
setProcessors(opt) {
|
|
3040
|
+
return this.setOption('processors', opt);
|
|
3041
|
+
}
|
|
3042
|
+
rmvProcessors() {
|
|
3043
|
+
return this.rmvOption('processors');
|
|
3044
|
+
}
|
|
3045
|
+
getOptions() {
|
|
3046
|
+
return this.options;
|
|
3047
|
+
}
|
|
3048
|
+
getSerializedOptions() {
|
|
3049
|
+
return JSON.stringify(this.options);
|
|
3050
|
+
}
|
|
3051
|
+
getOption(path) {
|
|
3052
|
+
return DeepMerge.get(this.options, path);
|
|
3053
|
+
}
|
|
2405
3054
|
test(a, b, opt) {
|
|
2406
3055
|
return this.compute(a, b, opt, 'single');
|
|
2407
3056
|
}
|
|
@@ -2440,15 +3089,35 @@ class CmpStr {
|
|
|
2440
3089
|
const resolved = this.resolveOptions({ flags, processors });
|
|
2441
3090
|
const test = this.prepare(needle, resolved);
|
|
2442
3091
|
const hstk = this.prepare(haystack, resolved);
|
|
2443
|
-
|
|
3092
|
+
const out = [];
|
|
3093
|
+
for (let i = 0, len = hstk.length; i < len; i++) {
|
|
3094
|
+
if (hstk[i].includes(test)) out.push(haystack[i]);
|
|
3095
|
+
}
|
|
3096
|
+
return out;
|
|
2444
3097
|
}
|
|
2445
3098
|
matrix(input, opt) {
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
3099
|
+
const resolved = this.resolveOptions(opt);
|
|
3100
|
+
const arr = this.prepare(input, resolved);
|
|
3101
|
+
const n = arr.length;
|
|
3102
|
+
const out = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
3103
|
+
for (let i = 0; i < n; i++)
|
|
3104
|
+
for (let j = i; j < n; j++) {
|
|
3105
|
+
if (i === j) {
|
|
3106
|
+
out[i][j] = 1;
|
|
3107
|
+
} else {
|
|
3108
|
+
const score = this.compute(
|
|
3109
|
+
arr[i],
|
|
3110
|
+
arr[j],
|
|
3111
|
+
resolved,
|
|
3112
|
+
'single',
|
|
3113
|
+
true,
|
|
3114
|
+
true
|
|
3115
|
+
).res;
|
|
3116
|
+
out[i][j] = score;
|
|
3117
|
+
out[j][i] = score;
|
|
3118
|
+
}
|
|
3119
|
+
}
|
|
3120
|
+
return out;
|
|
2452
3121
|
}
|
|
2453
3122
|
phoneticIndex(input, algo, opt) {
|
|
2454
3123
|
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
@@ -2528,22 +3197,28 @@ class CmpStrAsync extends CmpStr {
|
|
|
2528
3197
|
async computeAsync(a, b, opt, mode, raw, skip) {
|
|
2529
3198
|
const resolved = this.resolveOptions(opt);
|
|
2530
3199
|
this.assert('metric', resolved.metric);
|
|
2531
|
-
|
|
2532
|
-
|
|
2533
|
-
|
|
2534
|
-
|
|
2535
|
-
|
|
2536
|
-
|
|
2537
|
-
|
|
2538
|
-
|
|
2539
|
-
|
|
2540
|
-
|
|
2541
|
-
|
|
2542
|
-
|
|
2543
|
-
|
|
2544
|
-
|
|
2545
|
-
|
|
2546
|
-
|
|
3200
|
+
return ErrorUtil.wrapAsync(
|
|
3201
|
+
async () => {
|
|
3202
|
+
const A = skip ? a : await this.prepareAsync(a, resolved);
|
|
3203
|
+
const B = skip ? b : await this.prepareAsync(b, resolved);
|
|
3204
|
+
if (
|
|
3205
|
+
resolved.safeEmpty &&
|
|
3206
|
+
((Array.isArray(A) && A.length === 0) ||
|
|
3207
|
+
(Array.isArray(B) && B.length === 0) ||
|
|
3208
|
+
A === '' ||
|
|
3209
|
+
B === '')
|
|
3210
|
+
) {
|
|
3211
|
+
return [];
|
|
3212
|
+
}
|
|
3213
|
+
const metric = factory['metric'](resolved.metric, A, B, resolved.opt);
|
|
3214
|
+
if (resolved.output !== 'prep') metric.setOriginal(a, b);
|
|
3215
|
+
await metric.runAsync(mode);
|
|
3216
|
+
const result = this.postProcess(metric.getResults(), resolved);
|
|
3217
|
+
return this.output(result, raw ?? resolved.raw);
|
|
3218
|
+
},
|
|
3219
|
+
`Failed to compute metric <${opt?.metric ?? this.options.metric}> for the given inputs`,
|
|
3220
|
+
{ a, b, opt }
|
|
3221
|
+
);
|
|
2547
3222
|
}
|
|
2548
3223
|
async testAsync(a, b, opt) {
|
|
2549
3224
|
return this.computeAsync(a, b, opt, 'single');
|
|
@@ -2581,23 +3256,40 @@ class CmpStrAsync extends CmpStr {
|
|
|
2581
3256
|
const resolved = this.resolveOptions({ flags, processors });
|
|
2582
3257
|
const test = await this.prepareAsync(needle, resolved);
|
|
2583
3258
|
const hstk = await this.prepareAsync(haystack, resolved);
|
|
2584
|
-
|
|
3259
|
+
const out = [];
|
|
3260
|
+
for (let i = 0; i < hstk.length; i++) {
|
|
3261
|
+
if (hstk[i].includes(test)) out.push(haystack[i]);
|
|
3262
|
+
}
|
|
3263
|
+
return out;
|
|
2585
3264
|
}
|
|
2586
3265
|
async matrixAsync(input, opt) {
|
|
2587
|
-
|
|
2588
|
-
|
|
2589
|
-
|
|
2590
|
-
|
|
2591
|
-
|
|
2592
|
-
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
|
|
2596
|
-
|
|
2597
|
-
|
|
2598
|
-
|
|
2599
|
-
|
|
2600
|
-
|
|
3266
|
+
const resolved = this.resolveOptions(opt);
|
|
3267
|
+
const arr = await this.prepareAsync(input, resolved);
|
|
3268
|
+
const n = arr.length;
|
|
3269
|
+
const out = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
3270
|
+
for (let i = 0; i < n; i++) {
|
|
3271
|
+
await Promise.all(
|
|
3272
|
+
Array.from({ length: n - i }, (_, k) => i + k).map(async (j) => {
|
|
3273
|
+
if (i === j) {
|
|
3274
|
+
out[i][j] = 1;
|
|
3275
|
+
} else {
|
|
3276
|
+
const score = (
|
|
3277
|
+
await this.computeAsync(
|
|
3278
|
+
arr[i],
|
|
3279
|
+
arr[j],
|
|
3280
|
+
resolved,
|
|
3281
|
+
'single',
|
|
3282
|
+
true,
|
|
3283
|
+
true
|
|
3284
|
+
)
|
|
3285
|
+
).res;
|
|
3286
|
+
out[i][j] = score;
|
|
3287
|
+
out[j][i] = score;
|
|
3288
|
+
}
|
|
3289
|
+
})
|
|
3290
|
+
);
|
|
3291
|
+
}
|
|
3292
|
+
return out;
|
|
2601
3293
|
}
|
|
2602
3294
|
async phoneticIndexAsync(input, algo, opt) {
|
|
2603
3295
|
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
@@ -2645,6 +3337,7 @@ class CmpStrAsync extends CmpStr {
|
|
|
2645
3337
|
export {
|
|
2646
3338
|
CmpStr,
|
|
2647
3339
|
CmpStrAsync,
|
|
3340
|
+
Errors as CmpStrError,
|
|
2648
3341
|
DeepMerge,
|
|
2649
3342
|
DiffChecker,
|
|
2650
3343
|
Filter,
|
|
@@ -2653,6 +3346,7 @@ export {
|
|
|
2653
3346
|
Metric,
|
|
2654
3347
|
MetricRegistry,
|
|
2655
3348
|
Normalizer,
|
|
3349
|
+
OptionsValidator,
|
|
2656
3350
|
Phonetic,
|
|
2657
3351
|
PhoneticMappingRegistry,
|
|
2658
3352
|
PhoneticRegistry,
|
|
@@ -2661,4 +3355,3 @@ export {
|
|
|
2661
3355
|
StructuredData,
|
|
2662
3356
|
TextAnalyzer
|
|
2663
3357
|
};
|
|
2664
|
-
//# sourceMappingURL=CmpStr.esm.js.map
|