cmpstr 3.2.1 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -18
- package/dist/CmpStr.esm.js +1904 -1211
- package/dist/CmpStr.esm.min.js +2 -3
- package/dist/CmpStr.umd.js +1924 -1236
- package/dist/CmpStr.umd.min.js +2 -3
- package/dist/cjs/CmpStr.cjs +134 -64
- package/dist/cjs/CmpStrAsync.cjs +60 -37
- package/dist/cjs/index.cjs +1 -2
- package/dist/cjs/metric/Cosine.cjs +1 -2
- package/dist/cjs/metric/DamerauLevenshtein.cjs +1 -2
- package/dist/cjs/metric/DiceSorensen.cjs +1 -2
- package/dist/cjs/metric/Hamming.cjs +5 -4
- package/dist/cjs/metric/Jaccard.cjs +1 -2
- package/dist/cjs/metric/JaroWinkler.cjs +1 -2
- package/dist/cjs/metric/LCS.cjs +1 -2
- package/dist/cjs/metric/Levenshtein.cjs +1 -2
- package/dist/cjs/metric/Metric.cjs +90 -53
- package/dist/cjs/metric/NeedlemanWunsch.cjs +1 -2
- package/dist/cjs/metric/QGram.cjs +1 -2
- package/dist/cjs/metric/SmithWaterman.cjs +1 -2
- package/dist/cjs/phonetic/Caverphone.cjs +1 -2
- package/dist/cjs/phonetic/Cologne.cjs +1 -2
- package/dist/cjs/phonetic/Metaphone.cjs +1 -2
- package/dist/cjs/phonetic/Phonetic.cjs +80 -48
- package/dist/cjs/phonetic/Soundex.cjs +1 -2
- package/dist/cjs/root.cjs +6 -3
- package/dist/cjs/utils/DeepMerge.cjs +109 -99
- package/dist/cjs/utils/DiffChecker.cjs +1 -2
- package/dist/cjs/utils/Errors.cjs +106 -0
- package/dist/cjs/utils/Filter.cjs +97 -37
- package/dist/cjs/utils/HashTable.cjs +44 -30
- package/dist/cjs/utils/Normalizer.cjs +84 -35
- package/dist/cjs/utils/OptionsValidator.cjs +211 -0
- package/dist/cjs/utils/Pool.cjs +57 -19
- package/dist/cjs/utils/Profiler.cjs +41 -28
- package/dist/cjs/utils/Registry.cjs +48 -24
- package/dist/cjs/utils/StructuredData.cjs +95 -57
- package/dist/cjs/utils/TextAnalyzer.cjs +1 -2
- package/dist/esm/CmpStr.mjs +133 -61
- package/dist/esm/CmpStrAsync.mjs +56 -33
- package/dist/esm/index.mjs +1 -2
- package/dist/esm/metric/Cosine.mjs +1 -2
- package/dist/esm/metric/DamerauLevenshtein.mjs +1 -2
- package/dist/esm/metric/DiceSorensen.mjs +1 -2
- package/dist/esm/metric/Hamming.mjs +5 -4
- package/dist/esm/metric/Jaccard.mjs +1 -2
- package/dist/esm/metric/JaroWinkler.mjs +1 -2
- package/dist/esm/metric/LCS.mjs +1 -2
- package/dist/esm/metric/Levenshtein.mjs +1 -2
- package/dist/esm/metric/Metric.mjs +92 -53
- package/dist/esm/metric/NeedlemanWunsch.mjs +1 -2
- package/dist/esm/metric/QGram.mjs +1 -2
- package/dist/esm/metric/SmithWaterman.mjs +1 -2
- package/dist/esm/phonetic/Caverphone.mjs +1 -2
- package/dist/esm/phonetic/Cologne.mjs +1 -2
- package/dist/esm/phonetic/Metaphone.mjs +1 -2
- package/dist/esm/phonetic/Phonetic.mjs +83 -48
- package/dist/esm/phonetic/Soundex.mjs +1 -2
- package/dist/esm/root.mjs +5 -4
- package/dist/esm/utils/DeepMerge.mjs +109 -95
- package/dist/esm/utils/DiffChecker.mjs +1 -2
- package/dist/esm/utils/Errors.mjs +106 -0
- package/dist/esm/utils/Filter.mjs +97 -37
- package/dist/esm/utils/HashTable.mjs +44 -30
- package/dist/esm/utils/Normalizer.mjs +84 -35
- package/dist/esm/utils/OptionsValidator.mjs +210 -0
- package/dist/esm/utils/Pool.mjs +53 -19
- package/dist/esm/utils/Profiler.mjs +41 -28
- package/dist/esm/utils/Registry.mjs +48 -24
- package/dist/esm/utils/StructuredData.mjs +95 -57
- package/dist/esm/utils/TextAnalyzer.mjs +1 -2
- package/dist/types/CmpStr.d.ts +25 -14
- package/dist/types/CmpStrAsync.d.ts +4 -0
- package/dist/types/index.d.ts +3 -2
- package/dist/types/metric/Metric.d.ts +15 -14
- package/dist/types/phonetic/Phonetic.d.ts +7 -4
- package/dist/types/root.d.ts +4 -2
- package/dist/types/utils/DeepMerge.d.ts +80 -58
- package/dist/types/utils/Errors.d.ts +154 -0
- package/dist/types/utils/Filter.d.ts +8 -1
- package/dist/types/utils/HashTable.d.ts +12 -11
- package/dist/types/utils/Normalizer.d.ts +5 -1
- package/dist/types/utils/OptionsValidator.d.ts +193 -0
- package/dist/types/utils/Pool.d.ts +2 -0
- package/dist/types/utils/Profiler.d.ts +9 -28
- package/dist/types/utils/Registry.d.ts +3 -3
- package/dist/types/utils/StructuredData.d.ts +6 -1
- package/dist/types/utils/Types.d.ts +39 -1
- package/package.json +20 -11
- package/dist/CmpStr.esm.js.map +0 -1
- package/dist/CmpStr.esm.min.js.map +0 -1
- package/dist/CmpStr.umd.js.map +0 -1
- package/dist/CmpStr.umd.min.js.map +0 -1
- package/dist/cjs/CmpStr.cjs.map +0 -1
- package/dist/cjs/CmpStrAsync.cjs.map +0 -1
- package/dist/cjs/index.cjs.map +0 -1
- package/dist/cjs/metric/Cosine.cjs.map +0 -1
- package/dist/cjs/metric/DamerauLevenshtein.cjs.map +0 -1
- package/dist/cjs/metric/DiceSorensen.cjs.map +0 -1
- package/dist/cjs/metric/Hamming.cjs.map +0 -1
- package/dist/cjs/metric/Jaccard.cjs.map +0 -1
- package/dist/cjs/metric/JaroWinkler.cjs.map +0 -1
- package/dist/cjs/metric/LCS.cjs.map +0 -1
- package/dist/cjs/metric/Levenshtein.cjs.map +0 -1
- package/dist/cjs/metric/Metric.cjs.map +0 -1
- package/dist/cjs/metric/NeedlemanWunsch.cjs.map +0 -1
- package/dist/cjs/metric/QGram.cjs.map +0 -1
- package/dist/cjs/metric/SmithWaterman.cjs.map +0 -1
- package/dist/cjs/phonetic/Caverphone.cjs.map +0 -1
- package/dist/cjs/phonetic/Cologne.cjs.map +0 -1
- package/dist/cjs/phonetic/Metaphone.cjs.map +0 -1
- package/dist/cjs/phonetic/Phonetic.cjs.map +0 -1
- package/dist/cjs/phonetic/Soundex.cjs.map +0 -1
- package/dist/cjs/root.cjs.map +0 -1
- package/dist/cjs/utils/DeepMerge.cjs.map +0 -1
- package/dist/cjs/utils/DiffChecker.cjs.map +0 -1
- package/dist/cjs/utils/Filter.cjs.map +0 -1
- package/dist/cjs/utils/HashTable.cjs.map +0 -1
- package/dist/cjs/utils/Normalizer.cjs.map +0 -1
- package/dist/cjs/utils/Pool.cjs.map +0 -1
- package/dist/cjs/utils/Profiler.cjs.map +0 -1
- package/dist/cjs/utils/Registry.cjs.map +0 -1
- package/dist/cjs/utils/StructuredData.cjs.map +0 -1
- package/dist/cjs/utils/TextAnalyzer.cjs.map +0 -1
- package/dist/esm/CmpStr.mjs.map +0 -1
- package/dist/esm/CmpStrAsync.mjs.map +0 -1
- package/dist/esm/index.mjs.map +0 -1
- package/dist/esm/metric/Cosine.mjs.map +0 -1
- package/dist/esm/metric/DamerauLevenshtein.mjs.map +0 -1
- package/dist/esm/metric/DiceSorensen.mjs.map +0 -1
- package/dist/esm/metric/Hamming.mjs.map +0 -1
- package/dist/esm/metric/Jaccard.mjs.map +0 -1
- package/dist/esm/metric/JaroWinkler.mjs.map +0 -1
- package/dist/esm/metric/LCS.mjs.map +0 -1
- package/dist/esm/metric/Levenshtein.mjs.map +0 -1
- package/dist/esm/metric/Metric.mjs.map +0 -1
- package/dist/esm/metric/NeedlemanWunsch.mjs.map +0 -1
- package/dist/esm/metric/QGram.mjs.map +0 -1
- package/dist/esm/metric/SmithWaterman.mjs.map +0 -1
- package/dist/esm/phonetic/Caverphone.mjs.map +0 -1
- package/dist/esm/phonetic/Cologne.mjs.map +0 -1
- package/dist/esm/phonetic/Metaphone.mjs.map +0 -1
- package/dist/esm/phonetic/Phonetic.mjs.map +0 -1
- package/dist/esm/phonetic/Soundex.mjs.map +0 -1
- package/dist/esm/root.mjs.map +0 -1
- package/dist/esm/utils/DeepMerge.mjs.map +0 -1
- package/dist/esm/utils/DiffChecker.mjs.map +0 -1
- package/dist/esm/utils/Filter.mjs.map +0 -1
- package/dist/esm/utils/HashTable.mjs.map +0 -1
- package/dist/esm/utils/Normalizer.mjs.map +0 -1
- package/dist/esm/utils/Pool.mjs.map +0 -1
- package/dist/esm/utils/Profiler.mjs.map +0 -1
- package/dist/esm/utils/Registry.mjs.map +0 -1
- package/dist/esm/utils/StructuredData.mjs.map +0 -1
- package/dist/esm/utils/TextAnalyzer.mjs.map +0 -1
package/dist/CmpStr.umd.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* CmpStr v3.
|
|
2
|
+
* CmpStr v3.3.0 build-3699f85-260318
|
|
3
3
|
* This is a lightweight, fast and well performing library for calculating string similarity.
|
|
4
4
|
* (c) 2023-2026 Paul Köhler @komed3 / MIT License
|
|
5
5
|
* Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
|
|
@@ -15,114 +15,227 @@
|
|
|
15
15
|
})(this, function (exports) {
|
|
16
16
|
'use strict';
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
.
|
|
25
|
-
.
|
|
26
|
-
.
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
|
|
18
|
+
class CmpStrError extends Error {
|
|
19
|
+
code;
|
|
20
|
+
meta;
|
|
21
|
+
when = new Date().toISOString();
|
|
22
|
+
constructor(code, message, meta, cause) {
|
|
23
|
+
super(message, cause !== undefined ? { cause } : undefined);
|
|
24
|
+
this.name = this.constructor.name;
|
|
25
|
+
this.code = code;
|
|
26
|
+
this.meta = meta;
|
|
27
|
+
if (typeof Error.captureStackTrace === 'function') {
|
|
28
|
+
Error.captureStackTrace(this, this.constructor);
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
format(stack = false) {
|
|
32
|
+
const parts = [`${this.name} [${this.code}]`, this.message];
|
|
33
|
+
if (this.meta)
|
|
34
|
+
for (const _ in this.meta) {
|
|
35
|
+
parts.push(JSON.stringify(this.meta));
|
|
36
|
+
break;
|
|
37
|
+
}
|
|
38
|
+
return (
|
|
39
|
+
parts.join(' - ') +
|
|
40
|
+
(stack && this.stack ? `\nStack Trace:\n${this.stack}` : '')
|
|
41
|
+
);
|
|
42
|
+
}
|
|
43
|
+
toString() {
|
|
44
|
+
return this.format(false);
|
|
45
|
+
}
|
|
46
|
+
toJSON(stack = false) {
|
|
47
|
+
return {
|
|
48
|
+
name: this.name,
|
|
49
|
+
code: this.code,
|
|
50
|
+
message: this.message,
|
|
51
|
+
meta: this.meta,
|
|
52
|
+
when: this.when,
|
|
53
|
+
cause:
|
|
54
|
+
this.cause instanceof Error
|
|
55
|
+
? {
|
|
56
|
+
name: this.cause.name,
|
|
57
|
+
message: this.cause.message,
|
|
58
|
+
stack: stack && this.cause.stack
|
|
59
|
+
}
|
|
60
|
+
: this.cause
|
|
61
|
+
};
|
|
62
|
+
}
|
|
32
63
|
}
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
if (o == null || !(k in o)) return fb;
|
|
37
|
-
o = o[k];
|
|
64
|
+
class CmpStrValidationError extends CmpStrError {
|
|
65
|
+
constructor(message, meta, cause) {
|
|
66
|
+
super('E_VALIDATION', message, meta, cause);
|
|
38
67
|
}
|
|
39
|
-
return o;
|
|
40
68
|
}
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
if (o == null || !(k in o)) return false;
|
|
45
|
-
o = o[k];
|
|
69
|
+
class CmpStrNotFoundError extends CmpStrError {
|
|
70
|
+
constructor(message, meta, cause) {
|
|
71
|
+
super('E_NOT_FOUND', message, meta, cause);
|
|
46
72
|
}
|
|
47
|
-
return true;
|
|
48
73
|
}
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
if (t !== undefined && (typeof t !== 'object' || t === null))
|
|
53
|
-
throw Error(`Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`);
|
|
54
|
-
const root = t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
|
|
55
|
-
let cur = root;
|
|
56
|
-
for (let i = 0; i < keys.length - 1; i++) {
|
|
57
|
-
const k = keys[i];
|
|
58
|
-
let n = cur[k];
|
|
59
|
-
if (n != null && typeof n !== 'object')
|
|
60
|
-
throw Error(
|
|
61
|
-
`Cannot set property <${keys[i + 1]}> of <${JSON.stringify(n)}>`
|
|
62
|
-
);
|
|
63
|
-
if (n == null)
|
|
64
|
-
n = cur[k] = typeof keys[i + 1] === 'number' ? [] : Object.create(null);
|
|
65
|
-
cur = n;
|
|
74
|
+
class CmpStrUsageError extends CmpStrError {
|
|
75
|
+
constructor(message, meta, cause) {
|
|
76
|
+
super('E_USAGE', message, meta, cause);
|
|
66
77
|
}
|
|
67
|
-
cur[keys[keys.length - 1]] = value;
|
|
68
|
-
return root;
|
|
69
78
|
}
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
) {
|
|
75
|
-
const target = t ?? Object.create(null);
|
|
76
|
-
Object.keys(o).forEach((k) => {
|
|
77
|
-
const val = o[k];
|
|
78
|
-
if (!mergeUndefined && val === undefined) return;
|
|
79
|
-
if (k === '__proto__' || k === 'constructor') return;
|
|
80
|
-
if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
|
|
81
|
-
const existing = target[k];
|
|
82
|
-
target[k] = merge(
|
|
83
|
-
existing !== null &&
|
|
84
|
-
typeof existing === 'object' &&
|
|
85
|
-
!Array.isArray(existing)
|
|
86
|
-
? existing
|
|
87
|
-
: Object.create(null),
|
|
88
|
-
val,
|
|
89
|
-
mergeUndefined
|
|
90
|
-
);
|
|
91
|
-
} else target[k] = val;
|
|
92
|
-
});
|
|
93
|
-
return target;
|
|
79
|
+
class CmpStrInternalError extends CmpStrError {
|
|
80
|
+
constructor(message, meta, cause) {
|
|
81
|
+
super('E_INTERNAL', message, meta, cause);
|
|
82
|
+
}
|
|
94
83
|
}
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
if (
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
84
|
+
class ErrorUtil {
|
|
85
|
+
static assert(condition, message, meta) {
|
|
86
|
+
if (!condition) throw new CmpStrUsageError(message, meta);
|
|
87
|
+
}
|
|
88
|
+
static rethrow(err, message, meta) {
|
|
89
|
+
if (err instanceof CmpStrError) throw err;
|
|
90
|
+
throw new CmpStrInternalError(message, meta, err);
|
|
91
|
+
}
|
|
92
|
+
static format(err) {
|
|
93
|
+
if (err instanceof CmpStrError) return err.toString();
|
|
94
|
+
if (err instanceof Error) return `${err.name}: ${err.message}`;
|
|
95
|
+
return String(err);
|
|
96
|
+
}
|
|
97
|
+
static wrap(fn, message, meta) {
|
|
98
|
+
try {
|
|
99
|
+
return fn();
|
|
100
|
+
} catch (err) {
|
|
101
|
+
if (err instanceof CmpStrError) throw err;
|
|
102
|
+
throw new CmpStrInternalError(message, meta, err);
|
|
110
103
|
}
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
104
|
+
}
|
|
105
|
+
static async wrapAsync(fn, message, meta) {
|
|
106
|
+
try {
|
|
107
|
+
return await fn();
|
|
108
|
+
} catch (err) {
|
|
109
|
+
if (err instanceof CmpStrError) throw err;
|
|
110
|
+
throw new CmpStrInternalError(message, meta, err);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
115
113
|
}
|
|
116
114
|
|
|
117
|
-
var
|
|
115
|
+
var Errors = /*#__PURE__*/ Object.freeze({
|
|
118
116
|
__proto__: null,
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
117
|
+
CmpStrError: CmpStrError,
|
|
118
|
+
CmpStrInternalError: CmpStrInternalError,
|
|
119
|
+
CmpStrNotFoundError: CmpStrNotFoundError,
|
|
120
|
+
CmpStrUsageError: CmpStrUsageError,
|
|
121
|
+
CmpStrValidationError: CmpStrValidationError,
|
|
122
|
+
ErrorUtil: ErrorUtil
|
|
124
123
|
});
|
|
125
124
|
|
|
125
|
+
class DeepMerge {
|
|
126
|
+
static BRACKET_PATTERN = /\[(\d+)]/g;
|
|
127
|
+
static PATH_CACHE = new Map();
|
|
128
|
+
static walk(obj, keys) {
|
|
129
|
+
let o = obj;
|
|
130
|
+
for (let i = 0; i < keys.length; i++) {
|
|
131
|
+
const k = keys[i];
|
|
132
|
+
if (o == null || !(k in o)) return { exists: false };
|
|
133
|
+
o = o[k];
|
|
134
|
+
}
|
|
135
|
+
return { exists: true, value: o };
|
|
136
|
+
}
|
|
137
|
+
static parse(p) {
|
|
138
|
+
const cached = DeepMerge.PATH_CACHE.get(p);
|
|
139
|
+
if (cached) return cached;
|
|
140
|
+
const parsed = p
|
|
141
|
+
.replace(DeepMerge.BRACKET_PATTERN, '.$1')
|
|
142
|
+
.split('.')
|
|
143
|
+
.map((s) => {
|
|
144
|
+
const n = Number(s);
|
|
145
|
+
return Number.isInteger(n) && String(n) === s ? n : s;
|
|
146
|
+
});
|
|
147
|
+
if (DeepMerge.PATH_CACHE.size > 2000) DeepMerge.PATH_CACHE.clear();
|
|
148
|
+
DeepMerge.PATH_CACHE.set(p, parsed);
|
|
149
|
+
return parsed;
|
|
150
|
+
}
|
|
151
|
+
static has(t, path) {
|
|
152
|
+
return DeepMerge.walk(t, DeepMerge.parse(path)).exists;
|
|
153
|
+
}
|
|
154
|
+
static get(t, path, fb) {
|
|
155
|
+
const r = DeepMerge.walk(t, DeepMerge.parse(path));
|
|
156
|
+
return r.exists ? r.value : fb;
|
|
157
|
+
}
|
|
158
|
+
static set(t, path, value) {
|
|
159
|
+
if (path === '') return value;
|
|
160
|
+
const keys = DeepMerge.parse(path);
|
|
161
|
+
ErrorUtil.assert(
|
|
162
|
+
t === undefined || (typeof t === 'object' && t !== null),
|
|
163
|
+
`Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`,
|
|
164
|
+
{ path: keys[0], target: t }
|
|
165
|
+
);
|
|
166
|
+
const root =
|
|
167
|
+
t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
|
|
168
|
+
let cur = root;
|
|
169
|
+
for (let i = 0; i < keys.length - 1; i++) {
|
|
170
|
+
const k = keys[i];
|
|
171
|
+
let n = cur[k];
|
|
172
|
+
ErrorUtil.assert(
|
|
173
|
+
n == null || typeof n === 'object',
|
|
174
|
+
`Cannot set property <${keys[i + 1]}> of <${JSON.stringify(n)}>`,
|
|
175
|
+
{ path: keys.slice(0, i + 2), value: n }
|
|
176
|
+
);
|
|
177
|
+
if (n == null)
|
|
178
|
+
n = cur[k] =
|
|
179
|
+
typeof keys[i + 1] === 'number' ? [] : Object.create(null);
|
|
180
|
+
cur = n;
|
|
181
|
+
}
|
|
182
|
+
cur[keys[keys.length - 1]] = value;
|
|
183
|
+
return root;
|
|
184
|
+
}
|
|
185
|
+
static rmv(t, path, preserveEmpty = false) {
|
|
186
|
+
const keys = DeepMerge.parse(path);
|
|
187
|
+
const remove = (obj, i = 0) => {
|
|
188
|
+
const key = keys[i];
|
|
189
|
+
if (!obj || typeof obj !== 'object') return false;
|
|
190
|
+
if (i === keys.length - 1) return delete obj[key];
|
|
191
|
+
if (!remove(obj[key], i + 1)) return false;
|
|
192
|
+
if (!preserveEmpty) {
|
|
193
|
+
const val = obj[key];
|
|
194
|
+
let empty = true;
|
|
195
|
+
if (typeof val === 'object') {
|
|
196
|
+
if (Array.isArray(val))
|
|
197
|
+
for (let i = 0; i < val.length; i++) {
|
|
198
|
+
if (val[i] != null) {
|
|
199
|
+
empty = false;
|
|
200
|
+
break;
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
else empty = false;
|
|
204
|
+
}
|
|
205
|
+
if (empty) delete obj[key];
|
|
206
|
+
}
|
|
207
|
+
return true;
|
|
208
|
+
};
|
|
209
|
+
remove(t);
|
|
210
|
+
return t;
|
|
211
|
+
}
|
|
212
|
+
static merge(
|
|
213
|
+
t = Object.create(null),
|
|
214
|
+
o = Object.create(null),
|
|
215
|
+
mergeUndefined = false
|
|
216
|
+
) {
|
|
217
|
+
const target = t ?? Object.create(null);
|
|
218
|
+
for (const k in o) {
|
|
219
|
+
const val = o[k];
|
|
220
|
+
if (!mergeUndefined && val === undefined) continue;
|
|
221
|
+
if (k === '__proto__' || k === 'constructor') continue;
|
|
222
|
+
if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
|
|
223
|
+
const existing = target[k];
|
|
224
|
+
target[k] = DeepMerge.merge(
|
|
225
|
+
existing !== null &&
|
|
226
|
+
typeof existing === 'object' &&
|
|
227
|
+
!Array.isArray(existing)
|
|
228
|
+
? existing
|
|
229
|
+
: Object.create(null),
|
|
230
|
+
val,
|
|
231
|
+
mergeUndefined
|
|
232
|
+
);
|
|
233
|
+
} else target[k] = val;
|
|
234
|
+
}
|
|
235
|
+
return target;
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
|
|
126
239
|
class DiffChecker {
|
|
127
240
|
a;
|
|
128
241
|
b;
|
|
@@ -422,48 +535,89 @@
|
|
|
422
535
|
}
|
|
423
536
|
|
|
424
537
|
class Filter {
|
|
538
|
+
static IDENTITY = (s) => s;
|
|
425
539
|
static filters = new Map();
|
|
426
540
|
static pipeline = new Map();
|
|
427
|
-
static getPipeline(hook) {
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
541
|
+
static getPipeline(hook, force = false) {
|
|
542
|
+
return ErrorUtil.wrap(
|
|
543
|
+
() => {
|
|
544
|
+
if (!force) {
|
|
545
|
+
const cached = Filter.pipeline.get(hook);
|
|
546
|
+
if (cached) return cached;
|
|
547
|
+
}
|
|
548
|
+
const filter = Filter.filters.get(hook);
|
|
549
|
+
if (!filter) {
|
|
550
|
+
Filter.pipeline.set(hook, Filter.IDENTITY);
|
|
551
|
+
return Filter.IDENTITY;
|
|
552
|
+
}
|
|
553
|
+
const pipeline = [];
|
|
554
|
+
for (const f of filter.values()) if (f.active) pipeline.push(f);
|
|
555
|
+
pipeline.sort((a, b) => a.priority - b.priority);
|
|
556
|
+
const fn =
|
|
557
|
+
pipeline.length === 0
|
|
558
|
+
? Filter.IDENTITY
|
|
559
|
+
: (input) => {
|
|
560
|
+
let v = input;
|
|
561
|
+
for (let i = 0; i < pipeline.length; i++)
|
|
562
|
+
v = pipeline[i].fn(v);
|
|
563
|
+
return v;
|
|
564
|
+
};
|
|
565
|
+
Filter.pipeline.set(hook, fn);
|
|
566
|
+
return fn;
|
|
567
|
+
},
|
|
568
|
+
`Error compiling filter pipeline for hook <${hook}>`,
|
|
569
|
+
{ hook }
|
|
570
|
+
);
|
|
439
571
|
}
|
|
440
572
|
static has(hook, id) {
|
|
441
573
|
return !!Filter.filters.get(hook)?.has(id);
|
|
442
574
|
}
|
|
443
575
|
static add(hook, id, fn, opt = {}) {
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
576
|
+
return ErrorUtil.wrap(
|
|
577
|
+
() => {
|
|
578
|
+
const { priority = 10, active = true, overrideable = true } = opt;
|
|
579
|
+
const filter = Filter.filters.get(hook) ?? new Map();
|
|
580
|
+
const index = filter.get(id);
|
|
581
|
+
if (index && !index.overrideable) return false;
|
|
582
|
+
if (
|
|
583
|
+
index &&
|
|
584
|
+
index.fn === fn &&
|
|
585
|
+
index.priority === priority &&
|
|
586
|
+
index.active === active
|
|
587
|
+
)
|
|
588
|
+
return true;
|
|
589
|
+
filter.set(id, { id, fn, priority, active, overrideable });
|
|
590
|
+
Filter.filters.set(hook, filter);
|
|
591
|
+
Filter.getPipeline(hook, true);
|
|
592
|
+
return true;
|
|
593
|
+
},
|
|
594
|
+
`Error adding filter <${id}> to hook <${hook}>`,
|
|
595
|
+
{ hook, id, opt }
|
|
596
|
+
);
|
|
452
597
|
}
|
|
453
598
|
static remove(hook, id) {
|
|
454
|
-
Filter.pipeline.delete(hook);
|
|
455
599
|
const filter = Filter.filters.get(hook);
|
|
456
|
-
|
|
600
|
+
if (!filter || !filter.delete(id)) return false;
|
|
601
|
+
Filter.getPipeline(hook, true);
|
|
602
|
+
return true;
|
|
457
603
|
}
|
|
458
604
|
static pause(hook, id) {
|
|
459
|
-
Filter.
|
|
460
|
-
|
|
461
|
-
|
|
605
|
+
const filter = Filter.filters.get(hook);
|
|
606
|
+
if (!filter) return false;
|
|
607
|
+
const f = filter.get(id);
|
|
608
|
+
if (!f || !f.active) return false;
|
|
609
|
+
f.active = false;
|
|
610
|
+
Filter.getPipeline(hook, true);
|
|
611
|
+
return true;
|
|
462
612
|
}
|
|
463
613
|
static resume(hook, id) {
|
|
464
|
-
Filter.
|
|
465
|
-
|
|
466
|
-
|
|
614
|
+
const filter = Filter.filters.get(hook);
|
|
615
|
+
if (!filter) return false;
|
|
616
|
+
const f = filter.get(id);
|
|
617
|
+
if (!f || f.active) return false;
|
|
618
|
+
f.active = true;
|
|
619
|
+
Filter.getPipeline(hook, true);
|
|
620
|
+
return true;
|
|
467
621
|
}
|
|
468
622
|
static list(hook, active = false) {
|
|
469
623
|
const filter = Filter.filters.get(hook);
|
|
@@ -473,17 +627,36 @@
|
|
|
473
627
|
return out;
|
|
474
628
|
}
|
|
475
629
|
static apply(hook, input) {
|
|
476
|
-
|
|
477
|
-
|
|
630
|
+
return ErrorUtil.wrap(
|
|
631
|
+
() => {
|
|
632
|
+
const fn = Filter.getPipeline(hook);
|
|
633
|
+
if (typeof input === 'string') return fn(input);
|
|
634
|
+
const arr = input;
|
|
635
|
+
const out = new Array(arr.length);
|
|
636
|
+
for (let i = 0; i < arr.length; i++) out[i] = fn(arr[i]);
|
|
637
|
+
return out;
|
|
638
|
+
},
|
|
639
|
+
`Error applying filters for hook <${hook}>`,
|
|
640
|
+
{ hook, input }
|
|
641
|
+
);
|
|
478
642
|
}
|
|
479
643
|
static async applyAsync(hook, input) {
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
644
|
+
return ErrorUtil.wrapAsync(
|
|
645
|
+
async () => {
|
|
646
|
+
const fn = Filter.getPipeline(hook);
|
|
647
|
+
if (typeof input === 'string') return Promise.resolve(fn(input));
|
|
648
|
+
const arr = input;
|
|
649
|
+
const out = new Array(arr.length);
|
|
650
|
+
for (let i = 0; i < arr.length; i++)
|
|
651
|
+
out[i] = Promise.resolve(fn(arr[i]));
|
|
652
|
+
return Promise.all(out);
|
|
653
|
+
},
|
|
654
|
+
`Error applying filters for hook <${hook}>`,
|
|
655
|
+
{ hook, input }
|
|
656
|
+
);
|
|
484
657
|
}
|
|
485
658
|
static clear(hook) {
|
|
486
|
-
Filter.
|
|
659
|
+
Filter.clearPipeline();
|
|
487
660
|
if (hook) Filter.filters.delete(hook);
|
|
488
661
|
else Filter.filters.clear();
|
|
489
662
|
}
|
|
@@ -497,25 +670,21 @@
|
|
|
497
670
|
static HASH_OFFSET = 0x811c9dc5;
|
|
498
671
|
static fastFNV1a(str) {
|
|
499
672
|
const len = str.length;
|
|
673
|
+
const limit = len & -4;
|
|
500
674
|
let hash = this.HASH_OFFSET;
|
|
501
|
-
|
|
502
|
-
for (
|
|
503
|
-
const pos = i * 4;
|
|
675
|
+
let i = 0;
|
|
676
|
+
for (; i < limit; i += 4) {
|
|
504
677
|
const chunk =
|
|
505
|
-
str.charCodeAt(
|
|
506
|
-
(str.charCodeAt(
|
|
507
|
-
(str.charCodeAt(
|
|
508
|
-
(str.charCodeAt(
|
|
678
|
+
str.charCodeAt(i) |
|
|
679
|
+
(str.charCodeAt(i + 1) << 8) |
|
|
680
|
+
(str.charCodeAt(i + 2) << 16) |
|
|
681
|
+
(str.charCodeAt(i + 3) << 24);
|
|
509
682
|
hash ^= chunk;
|
|
510
683
|
hash = Math.imul(hash, this.FNV_PRIME);
|
|
511
684
|
}
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
for (let i = 0; i < remaining; i++) {
|
|
516
|
-
hash ^= str.charCodeAt(pos + i);
|
|
517
|
-
hash = Math.imul(hash, this.FNV_PRIME);
|
|
518
|
-
}
|
|
685
|
+
for (; i < len; i++) {
|
|
686
|
+
hash ^= str.charCodeAt(i);
|
|
687
|
+
hash = Math.imul(hash, this.FNV_PRIME);
|
|
519
688
|
}
|
|
520
689
|
hash ^= hash >>> 16;
|
|
521
690
|
hash *= 0x85ebca6b;
|
|
@@ -526,32 +695,51 @@
|
|
|
526
695
|
}
|
|
527
696
|
}
|
|
528
697
|
class HashTable {
|
|
529
|
-
|
|
698
|
+
FIFO;
|
|
699
|
+
maxSize;
|
|
530
700
|
static MAX_LEN = 2048;
|
|
531
|
-
static TABLE_SIZE = 10_000;
|
|
532
701
|
table = new Map();
|
|
533
|
-
constructor(
|
|
534
|
-
this.
|
|
702
|
+
constructor(FIFO = true, maxSize = 10000) {
|
|
703
|
+
this.FIFO = FIFO;
|
|
704
|
+
this.maxSize = maxSize;
|
|
535
705
|
}
|
|
536
706
|
key(label, strs, sorted = false) {
|
|
537
|
-
|
|
538
|
-
const hashes =
|
|
539
|
-
|
|
707
|
+
const n = strs.length;
|
|
708
|
+
const hashes = new Array(n);
|
|
709
|
+
for (let i = 0; i < n; i++) {
|
|
710
|
+
const s = strs[i];
|
|
711
|
+
if (s.length > HashTable.MAX_LEN) return false;
|
|
712
|
+
hashes[i] = Hasher.fastFNV1a(s);
|
|
713
|
+
}
|
|
714
|
+
if (sorted) hashes.sort((a, b) => a - b);
|
|
715
|
+
let key = label;
|
|
716
|
+
for (let i = 0; i < hashes.length; i++) key += '-' + hashes[i];
|
|
717
|
+
return key;
|
|
718
|
+
}
|
|
719
|
+
has(key) {
|
|
720
|
+
return this.table.has(key);
|
|
721
|
+
}
|
|
722
|
+
get(key) {
|
|
723
|
+
return this.table.get(key);
|
|
540
724
|
}
|
|
541
|
-
has = (key) => this.table.has(key);
|
|
542
|
-
get = (key) => this.table.get(key);
|
|
543
725
|
set(key, entry, update = true) {
|
|
544
726
|
if (!update && this.table.has(key)) return false;
|
|
545
|
-
|
|
546
|
-
if (!this.
|
|
727
|
+
if (!this.table.has(key) && this.table.size >= this.maxSize) {
|
|
728
|
+
if (!this.FIFO) return false;
|
|
547
729
|
this.table.delete(this.table.keys().next().value);
|
|
548
730
|
}
|
|
549
731
|
this.table.set(key, entry);
|
|
550
732
|
return true;
|
|
551
733
|
}
|
|
552
|
-
delete
|
|
553
|
-
|
|
554
|
-
|
|
734
|
+
delete(key) {
|
|
735
|
+
return this.table.delete(key);
|
|
736
|
+
}
|
|
737
|
+
clear() {
|
|
738
|
+
this.table.clear();
|
|
739
|
+
}
|
|
740
|
+
size() {
|
|
741
|
+
return this.table.size;
|
|
742
|
+
}
|
|
555
743
|
}
|
|
556
744
|
|
|
557
745
|
class Normalizer {
|
|
@@ -568,42 +756,91 @@
|
|
|
568
756
|
return Array.from(new Set(flags)).sort().join('');
|
|
569
757
|
}
|
|
570
758
|
static getPipeline(flags) {
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
759
|
+
return ErrorUtil.wrap(
|
|
760
|
+
() => {
|
|
761
|
+
const cached = Normalizer.pipeline.get(flags);
|
|
762
|
+
if (cached) return cached;
|
|
763
|
+
const { REGEX } = Normalizer;
|
|
764
|
+
const steps = [];
|
|
765
|
+
for (let i = 0; i < flags.length; i++) {
|
|
766
|
+
switch (flags[i]) {
|
|
767
|
+
case 'd':
|
|
768
|
+
steps.push((s) => s.normalize('NFD'));
|
|
769
|
+
break;
|
|
770
|
+
case 'i':
|
|
771
|
+
steps.push((s) => s.toLowerCase());
|
|
772
|
+
break;
|
|
773
|
+
case 'k':
|
|
774
|
+
steps.push((s) => s.replace(REGEX.nonLetters, ''));
|
|
775
|
+
break;
|
|
776
|
+
case 'n':
|
|
777
|
+
steps.push((s) => s.replace(REGEX.nonNumbers, ''));
|
|
778
|
+
break;
|
|
779
|
+
case 'r':
|
|
780
|
+
steps.push((s) => s.replace(REGEX.doubleChars, '$1'));
|
|
781
|
+
break;
|
|
782
|
+
case 's':
|
|
783
|
+
steps.push((s) => s.replace(REGEX.specialChars, ''));
|
|
784
|
+
break;
|
|
785
|
+
case 't':
|
|
786
|
+
steps.push((s) => s.trim());
|
|
787
|
+
break;
|
|
788
|
+
case 'u':
|
|
789
|
+
steps.push((s) => s.normalize('NFC'));
|
|
790
|
+
break;
|
|
791
|
+
case 'w':
|
|
792
|
+
steps.push((s) => s.replace(REGEX.whitespace, ' '));
|
|
793
|
+
break;
|
|
794
|
+
case 'x':
|
|
795
|
+
steps.push((s) => s.normalize('NFKC'));
|
|
796
|
+
break;
|
|
797
|
+
}
|
|
798
|
+
}
|
|
799
|
+
const fn = (input) => {
|
|
800
|
+
let v = input;
|
|
801
|
+
for (let i = 0; i < steps.length; i++) v = steps[i](v);
|
|
802
|
+
return v;
|
|
803
|
+
};
|
|
804
|
+
Normalizer.pipeline.set(flags, fn);
|
|
805
|
+
return fn;
|
|
806
|
+
},
|
|
807
|
+
`Failed to create normalization pipeline for flags: ${flags}`,
|
|
808
|
+
{ flags }
|
|
809
|
+
);
|
|
810
|
+
}
|
|
811
|
+
static normalize(input, flags, normalizedFlags) {
|
|
812
|
+
return ErrorUtil.wrap(
|
|
813
|
+
() => {
|
|
814
|
+
if (!flags || typeof flags !== 'string' || !input) return input;
|
|
815
|
+
flags = normalizedFlags ?? this.canonicalFlags(flags);
|
|
816
|
+
const pipeline = Normalizer.getPipeline(flags);
|
|
817
|
+
const normalizeOne = (s) => {
|
|
818
|
+
const key = Normalizer.cache.key(flags, [s]);
|
|
819
|
+
if (key && Normalizer.cache.has(key))
|
|
820
|
+
return Normalizer.cache.get(key);
|
|
821
|
+
const res = pipeline(s);
|
|
822
|
+
if (key) Normalizer.cache.set(key, res);
|
|
823
|
+
return res;
|
|
824
|
+
};
|
|
825
|
+
return Array.isArray(input)
|
|
826
|
+
? input.map(normalizeOne)
|
|
827
|
+
: normalizeOne(input);
|
|
828
|
+
},
|
|
829
|
+
`Failed to normalize input with flags: ${flags}`,
|
|
830
|
+
{ input, flags }
|
|
831
|
+
);
|
|
602
832
|
}
|
|
603
833
|
static async normalizeAsync(input, flags) {
|
|
604
|
-
return await
|
|
605
|
-
|
|
606
|
-
|
|
834
|
+
return await ErrorUtil.wrapAsync(
|
|
835
|
+
async () => {
|
|
836
|
+
if (!flags || typeof flags !== 'string' || !input) return input;
|
|
837
|
+
return await (Array.isArray(input)
|
|
838
|
+
? Promise.all(input.map((s) => Normalizer.normalize(s, flags)))
|
|
839
|
+
: Promise.resolve(Normalizer.normalize(input, flags)));
|
|
840
|
+
},
|
|
841
|
+
`Failed to asynchronously normalize input with flags: ${flags}`,
|
|
842
|
+
{ input, flags }
|
|
843
|
+
);
|
|
607
844
|
}
|
|
608
845
|
static clear() {
|
|
609
846
|
Normalizer.pipeline.clear();
|
|
@@ -611,17 +848,144 @@
|
|
|
611
848
|
}
|
|
612
849
|
}
|
|
613
850
|
|
|
851
|
+
class RingPool {
|
|
852
|
+
maxSize;
|
|
853
|
+
buffers = [];
|
|
854
|
+
pointer = 0;
|
|
855
|
+
constructor(maxSize) {
|
|
856
|
+
this.maxSize = maxSize;
|
|
857
|
+
}
|
|
858
|
+
acquire(minSize, allowOversize) {
|
|
859
|
+
return ErrorUtil.wrap(
|
|
860
|
+
() => {
|
|
861
|
+
const buffers = this.buffers;
|
|
862
|
+
const len = buffers.length;
|
|
863
|
+
for (let i = 0; i < len; i++) {
|
|
864
|
+
const idx = (this.pointer + i) % len;
|
|
865
|
+
const item = buffers[idx];
|
|
866
|
+
const size = item.size;
|
|
867
|
+
if (size >= minSize && (allowOversize || size === minSize)) {
|
|
868
|
+
this.pointer = (idx + 1) % len;
|
|
869
|
+
return item;
|
|
870
|
+
}
|
|
871
|
+
}
|
|
872
|
+
return null;
|
|
873
|
+
},
|
|
874
|
+
`Failed to acquire buffer of size >= ${minSize} from pool`,
|
|
875
|
+
{ minSize, allowOversize }
|
|
876
|
+
);
|
|
877
|
+
}
|
|
878
|
+
release(item) {
|
|
879
|
+
ErrorUtil.wrap(
|
|
880
|
+
() => {
|
|
881
|
+
const buffers = this.buffers;
|
|
882
|
+
if (buffers.length < this.maxSize) {
|
|
883
|
+
buffers.push(item);
|
|
884
|
+
return;
|
|
885
|
+
}
|
|
886
|
+
buffers[this.pointer] = item;
|
|
887
|
+
this.pointer = (this.pointer + 1) % this.maxSize;
|
|
888
|
+
},
|
|
889
|
+
`Failed to release buffer back to pool`,
|
|
890
|
+
{ item }
|
|
891
|
+
);
|
|
892
|
+
}
|
|
893
|
+
clear() {
|
|
894
|
+
this.buffers = [];
|
|
895
|
+
this.pointer = 0;
|
|
896
|
+
}
|
|
897
|
+
}
|
|
898
|
+
class Pool {
|
|
899
|
+
static CONFIG = {
|
|
900
|
+
int32: {
|
|
901
|
+
type: 'int32',
|
|
902
|
+
maxSize: 64,
|
|
903
|
+
maxItemSize: 2048,
|
|
904
|
+
allowOversize: true
|
|
905
|
+
},
|
|
906
|
+
'arr[]': {
|
|
907
|
+
type: 'arr[]',
|
|
908
|
+
maxSize: 4,
|
|
909
|
+
maxItemSize: 1024,
|
|
910
|
+
allowOversize: false
|
|
911
|
+
},
|
|
912
|
+
'number[]': {
|
|
913
|
+
type: 'number[]',
|
|
914
|
+
maxSize: 16,
|
|
915
|
+
maxItemSize: 1024,
|
|
916
|
+
allowOversize: false
|
|
917
|
+
},
|
|
918
|
+
'string[]': {
|
|
919
|
+
type: 'string[]',
|
|
920
|
+
maxSize: 2,
|
|
921
|
+
maxItemSize: 1024,
|
|
922
|
+
allowOversize: false
|
|
923
|
+
},
|
|
924
|
+
set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
|
|
925
|
+
map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
|
|
926
|
+
};
|
|
927
|
+
static POOLS = {
|
|
928
|
+
int32: new RingPool(64),
|
|
929
|
+
'arr[]': new RingPool(4),
|
|
930
|
+
'number[]': new RingPool(16),
|
|
931
|
+
'string[]': new RingPool(2),
|
|
932
|
+
set: new RingPool(8),
|
|
933
|
+
map: new RingPool(8)
|
|
934
|
+
};
|
|
935
|
+
static allocate(type, size) {
|
|
936
|
+
switch (type) {
|
|
937
|
+
case 'int32':
|
|
938
|
+
return new Int32Array(size);
|
|
939
|
+
case 'arr[]':
|
|
940
|
+
return new Array(size);
|
|
941
|
+
case 'number[]':
|
|
942
|
+
return new Float64Array(size);
|
|
943
|
+
case 'string[]':
|
|
944
|
+
return new Array(size);
|
|
945
|
+
case 'set':
|
|
946
|
+
return new Set();
|
|
947
|
+
case 'map':
|
|
948
|
+
return new Map();
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
static acquire(type, size) {
|
|
952
|
+
const CONFIG = this.CONFIG[type];
|
|
953
|
+
if (!CONFIG)
|
|
954
|
+
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
955
|
+
if (size > CONFIG.maxItemSize) return this.allocate(type, size);
|
|
956
|
+
const item = this.POOLS[type].acquire(size, CONFIG.allowOversize);
|
|
957
|
+
if (item)
|
|
958
|
+
return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
|
|
959
|
+
return this.allocate(type, size);
|
|
960
|
+
}
|
|
961
|
+
static acquireMany(type, sizes) {
|
|
962
|
+
const out = new Array(sizes.length);
|
|
963
|
+
for (let i = 0; i < sizes.length; i++)
|
|
964
|
+
out[i] = this.acquire(type, sizes[i]);
|
|
965
|
+
return out;
|
|
966
|
+
}
|
|
967
|
+
static release(type, buffer, size) {
|
|
968
|
+
const CONFIG = this.CONFIG[type];
|
|
969
|
+
if (!CONFIG)
|
|
970
|
+
throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
|
|
971
|
+
if (size <= CONFIG.maxItemSize)
|
|
972
|
+
this.POOLS[type].release({ buffer, size });
|
|
973
|
+
}
|
|
974
|
+
}
|
|
975
|
+
|
|
614
976
|
class Profiler {
|
|
615
977
|
active;
|
|
616
978
|
static ENV;
|
|
617
979
|
static instance;
|
|
618
980
|
nowFn;
|
|
619
981
|
memFn;
|
|
620
|
-
store =
|
|
982
|
+
store = [];
|
|
983
|
+
last;
|
|
621
984
|
totalTime = 0;
|
|
622
985
|
totalMem = 0;
|
|
623
986
|
static detectEnv() {
|
|
624
|
-
if (typeof process !== 'undefined'
|
|
987
|
+
if (typeof process !== 'undefined' && process.versions?.node)
|
|
988
|
+
Profiler.ENV = 'nodejs';
|
|
625
989
|
else if (typeof performance !== 'undefined') Profiler.ENV = 'browser';
|
|
626
990
|
else Profiler.ENV = 'unknown';
|
|
627
991
|
}
|
|
@@ -633,7 +997,7 @@
|
|
|
633
997
|
this.active = active;
|
|
634
998
|
switch (Profiler.ENV) {
|
|
635
999
|
case 'nodejs':
|
|
636
|
-
this.nowFn = () => Number(process.hrtime.bigint())
|
|
1000
|
+
this.nowFn = () => Number(process.hrtime.bigint()) * 1e-6;
|
|
637
1001
|
this.memFn = () => process.memoryUsage().heapUsed;
|
|
638
1002
|
break;
|
|
639
1003
|
case 'browser':
|
|
@@ -646,40 +1010,52 @@
|
|
|
646
1010
|
break;
|
|
647
1011
|
}
|
|
648
1012
|
}
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
startMem = this.mem();
|
|
654
|
-
const res = fn();
|
|
655
|
-
const deltaTime = this.now() - startTime,
|
|
656
|
-
deltaMem = this.mem() - startMem;
|
|
657
|
-
this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
|
|
658
|
-
((this.totalTime += deltaTime), (this.totalMem += deltaMem));
|
|
659
|
-
return res;
|
|
1013
|
+
storeRes(entry) {
|
|
1014
|
+
this.store.push((this.last = entry));
|
|
1015
|
+
this.totalTime += entry.time;
|
|
1016
|
+
this.totalMem += entry.mem;
|
|
660
1017
|
}
|
|
661
|
-
enable
|
|
1018
|
+
enable() {
|
|
662
1019
|
this.active = true;
|
|
663
|
-
}
|
|
664
|
-
disable
|
|
1020
|
+
}
|
|
1021
|
+
disable() {
|
|
665
1022
|
this.active = false;
|
|
666
|
-
}
|
|
1023
|
+
}
|
|
667
1024
|
clear() {
|
|
668
|
-
this.store.
|
|
1025
|
+
this.store.length = 0;
|
|
1026
|
+
this.last = undefined;
|
|
669
1027
|
this.totalTime = 0;
|
|
670
1028
|
this.totalMem = 0;
|
|
671
1029
|
}
|
|
672
1030
|
run(fn, meta = {}) {
|
|
673
|
-
|
|
1031
|
+
if (!this.active) return fn();
|
|
1032
|
+
const startTime = this.nowFn(),
|
|
1033
|
+
startMem = this.memFn();
|
|
1034
|
+
const res = fn();
|
|
1035
|
+
const deltaTime = this.nowFn() - startTime,
|
|
1036
|
+
deltaMem = this.memFn() - startMem;
|
|
1037
|
+
this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
|
|
1038
|
+
return res;
|
|
674
1039
|
}
|
|
675
1040
|
async runAsync(fn, meta = {}) {
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
1041
|
+
if (!this.active) return fn();
|
|
1042
|
+
const startTime = this.nowFn(),
|
|
1043
|
+
startMem = this.memFn();
|
|
1044
|
+
const res = await fn();
|
|
1045
|
+
const deltaTime = this.nowFn() - startTime,
|
|
1046
|
+
deltaMem = this.memFn() - startMem;
|
|
1047
|
+
this.storeRes({ time: deltaTime, mem: deltaMem, res, meta });
|
|
1048
|
+
return res;
|
|
1049
|
+
}
|
|
1050
|
+
getAll() {
|
|
1051
|
+
return [...this.store];
|
|
1052
|
+
}
|
|
1053
|
+
getLast() {
|
|
1054
|
+
return this.last;
|
|
1055
|
+
}
|
|
1056
|
+
getTotal() {
|
|
1057
|
+
return { time: this.totalTime, mem: this.totalMem };
|
|
679
1058
|
}
|
|
680
|
-
getAll = () => [...this.store];
|
|
681
|
-
getLast = () => this.getAll().pop();
|
|
682
|
-
getTotal = () => ({ time: this.totalTime, mem: this.totalMem });
|
|
683
1059
|
services = Object.freeze({
|
|
684
1060
|
enable: this.enable.bind(this),
|
|
685
1061
|
disable: this.disable.bind(this),
|
|
@@ -693,19 +1069,34 @@
|
|
|
693
1069
|
const registry = Object.create(null);
|
|
694
1070
|
const factory = Object.create(null);
|
|
695
1071
|
function Registry(reg, ctor) {
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
1072
|
+
ErrorUtil.assert(
|
|
1073
|
+
!(reg in registry || reg in factory),
|
|
1074
|
+
`Registry <${reg}> already exists / overwriting is forbidden`,
|
|
1075
|
+
{ registry: reg }
|
|
1076
|
+
);
|
|
700
1077
|
const classes = Object.create(null);
|
|
701
1078
|
const service = Object.freeze({
|
|
702
1079
|
add(name, cls, update = false) {
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
1080
|
+
ErrorUtil.assert(
|
|
1081
|
+
typeof name === 'string' && name.length > 0,
|
|
1082
|
+
`Class name must be a non-empty string`,
|
|
1083
|
+
{ registry: reg, name }
|
|
1084
|
+
);
|
|
1085
|
+
ErrorUtil.assert(
|
|
1086
|
+
typeof cls === 'function',
|
|
1087
|
+
`Class must be a constructor function`,
|
|
1088
|
+
{ registry: reg, class: cls }
|
|
1089
|
+
);
|
|
1090
|
+
ErrorUtil.assert(
|
|
1091
|
+
cls.prototype instanceof ctor,
|
|
1092
|
+
`Class must extend <${reg}>`,
|
|
1093
|
+
{ registry: reg, class: cls }
|
|
1094
|
+
);
|
|
1095
|
+
ErrorUtil.assert(
|
|
1096
|
+
update || !(name in classes),
|
|
1097
|
+
`Class <${name}> already exists / use <update=true> to overwrite`,
|
|
1098
|
+
{ registry: reg, name }
|
|
1099
|
+
);
|
|
709
1100
|
classes[name] = cls;
|
|
710
1101
|
},
|
|
711
1102
|
remove(name) {
|
|
@@ -718,8 +1109,16 @@
|
|
|
718
1109
|
return Object.keys(classes);
|
|
719
1110
|
},
|
|
720
1111
|
get(name) {
|
|
721
|
-
|
|
722
|
-
|
|
1112
|
+
ErrorUtil.assert(
|
|
1113
|
+
typeof name === 'string' && name.length > 0,
|
|
1114
|
+
`Class name must be a non-empty string`,
|
|
1115
|
+
{ registry: reg, name }
|
|
1116
|
+
);
|
|
1117
|
+
ErrorUtil.assert(
|
|
1118
|
+
name in classes,
|
|
1119
|
+
`Class <${name}> not registered for <${reg}>`,
|
|
1120
|
+
{ registry: reg, name }
|
|
1121
|
+
);
|
|
723
1122
|
return classes[name];
|
|
724
1123
|
}
|
|
725
1124
|
});
|
|
@@ -729,764 +1128,356 @@
|
|
|
729
1128
|
}
|
|
730
1129
|
function resolveCls(reg, cls) {
|
|
731
1130
|
if (!(reg in registry))
|
|
732
|
-
throw new
|
|
733
|
-
|
|
1131
|
+
throw new CmpStrNotFoundError(`Registry <${reg}> does not exist`, {
|
|
1132
|
+
registry: reg
|
|
1133
|
+
});
|
|
1134
|
+
return typeof cls === 'string' ? registry[reg].get(cls) : cls;
|
|
734
1135
|
}
|
|
735
1136
|
function createFromRegistry(reg, cls, ...args) {
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
});
|
|
743
|
-
}
|
|
1137
|
+
const ctor = resolveCls(reg, cls);
|
|
1138
|
+
return ErrorUtil.wrap(
|
|
1139
|
+
() => new ctor(...args),
|
|
1140
|
+
`Failed to create instance of class <${ctor.name ?? cls}> from registry <${reg}>`,
|
|
1141
|
+
{ registry: reg, class: cls, args }
|
|
1142
|
+
);
|
|
744
1143
|
}
|
|
745
1144
|
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
1145
|
+
const profiler$2 = Profiler.getInstance();
|
|
1146
|
+
class Metric {
|
|
1147
|
+
static cache = new HashTable();
|
|
1148
|
+
metric;
|
|
1149
|
+
a;
|
|
1150
|
+
b;
|
|
1151
|
+
origA = [];
|
|
1152
|
+
origB = [];
|
|
1153
|
+
options;
|
|
1154
|
+
optKey;
|
|
1155
|
+
symmetric;
|
|
1156
|
+
results;
|
|
1157
|
+
static clear() {
|
|
1158
|
+
this.cache.clear();
|
|
752
1159
|
}
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
for (let i = 0; i < len; i++) {
|
|
756
|
-
const idx = (this.pointer + i) & (len - 1);
|
|
757
|
-
const item = this.buffers[idx];
|
|
758
|
-
if (item.size >= minSize && (allowOversize || item.size === minSize)) {
|
|
759
|
-
this.pointer = (idx + 1) & (len - 1);
|
|
760
|
-
return item;
|
|
761
|
-
}
|
|
762
|
-
}
|
|
763
|
-
return null;
|
|
1160
|
+
static swap(a, b, m, n) {
|
|
1161
|
+
return m > n ? [b, a, n, m] : [a, b, m, n];
|
|
764
1162
|
}
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
return void [this.buffers.push(item)];
|
|
768
|
-
this.buffers[this.pointer] = item;
|
|
769
|
-
this.pointer = (this.pointer + 1) % this.maxSize;
|
|
1163
|
+
static clamp(res) {
|
|
1164
|
+
return Math.max(0, Math.min(1, res));
|
|
770
1165
|
}
|
|
771
|
-
|
|
772
|
-
this.
|
|
773
|
-
this.
|
|
1166
|
+
constructor(metric, a, b, opt = {}, symmetric = false) {
|
|
1167
|
+
this.metric = metric;
|
|
1168
|
+
this.a = Array.isArray(a) ? a : [a];
|
|
1169
|
+
this.b = Array.isArray(b) ? b : [b];
|
|
1170
|
+
ErrorUtil.assert(
|
|
1171
|
+
this.a.length > 0 && this.b.length > 0,
|
|
1172
|
+
`Inputs <a> and <b> must not be empty`,
|
|
1173
|
+
{ a: this.a, b: this.b }
|
|
1174
|
+
);
|
|
1175
|
+
this.options = opt;
|
|
1176
|
+
this.optKey = Hasher.fastFNV1a(
|
|
1177
|
+
JSON.stringify(opt, Object.keys(opt).sort())
|
|
1178
|
+
).toString();
|
|
1179
|
+
this.symmetric = symmetric;
|
|
774
1180
|
}
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
type: 'int32',
|
|
780
|
-
maxSize: 64,
|
|
781
|
-
maxItemSize: 2048,
|
|
782
|
-
allowOversize: true
|
|
783
|
-
},
|
|
784
|
-
'number[]': {
|
|
785
|
-
type: 'number[]',
|
|
786
|
-
maxSize: 16,
|
|
787
|
-
maxItemSize: 1024,
|
|
788
|
-
allowOversize: false
|
|
789
|
-
},
|
|
790
|
-
'string[]': {
|
|
791
|
-
type: 'string[]',
|
|
792
|
-
maxSize: 2,
|
|
793
|
-
maxItemSize: 1024,
|
|
794
|
-
allowOversize: false
|
|
795
|
-
},
|
|
796
|
-
set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
|
|
797
|
-
map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
|
|
798
|
-
};
|
|
799
|
-
static POOLS = {
|
|
800
|
-
int32: new RingPool(64),
|
|
801
|
-
'number[]': new RingPool(16),
|
|
802
|
-
'string[]': new RingPool(2),
|
|
803
|
-
set: new RingPool(8),
|
|
804
|
-
map: new RingPool(8)
|
|
805
|
-
};
|
|
806
|
-
static allocate(type, size) {
|
|
807
|
-
switch (type) {
|
|
808
|
-
case 'int32':
|
|
809
|
-
return new Int32Array(size);
|
|
810
|
-
case 'number[]':
|
|
811
|
-
return new Float64Array(size);
|
|
812
|
-
case 'string[]':
|
|
813
|
-
return new Array(size);
|
|
814
|
-
case 'set':
|
|
815
|
-
return new Set();
|
|
816
|
-
case 'map':
|
|
817
|
-
return new Map();
|
|
818
|
-
}
|
|
1181
|
+
preCompute(a, b, m, n) {
|
|
1182
|
+
if (a === b) return { res: 1 };
|
|
1183
|
+
if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
|
|
1184
|
+
return undefined;
|
|
819
1185
|
}
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
if (item)
|
|
825
|
-
return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer;
|
|
826
|
-
return this.allocate(type, size);
|
|
1186
|
+
compute(a, b, m, n, maxLen) {
|
|
1187
|
+
throw new CmpStrInternalError(
|
|
1188
|
+
`Method compute() must be overridden in a subclass`
|
|
1189
|
+
);
|
|
827
1190
|
}
|
|
828
|
-
|
|
829
|
-
return
|
|
1191
|
+
runSingle(i, j) {
|
|
1192
|
+
return ErrorUtil.wrap(
|
|
1193
|
+
() => {
|
|
1194
|
+
let a = String(this.a[i]),
|
|
1195
|
+
A = a;
|
|
1196
|
+
let b = String(this.b[j]),
|
|
1197
|
+
B = b;
|
|
1198
|
+
let m = A.length,
|
|
1199
|
+
n = B.length;
|
|
1200
|
+
let result = this.preCompute(A, B, m, n);
|
|
1201
|
+
if (!result) {
|
|
1202
|
+
result = profiler$2.run(() => {
|
|
1203
|
+
if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
|
|
1204
|
+
let key = Metric.cache.key(this.metric, [A, B], this.symmetric);
|
|
1205
|
+
if (key) key += this.optKey;
|
|
1206
|
+
return (
|
|
1207
|
+
Metric.cache.get(key || '') ??
|
|
1208
|
+
(() => {
|
|
1209
|
+
const maxLen = m > n ? m : n;
|
|
1210
|
+
const res = this.compute(A, B, m, n, maxLen);
|
|
1211
|
+
if (key) Metric.cache.set(key, res);
|
|
1212
|
+
return res;
|
|
1213
|
+
})()
|
|
1214
|
+
);
|
|
1215
|
+
});
|
|
1216
|
+
}
|
|
1217
|
+
return {
|
|
1218
|
+
metric: this.metric,
|
|
1219
|
+
a: this.origA.length > i ? this.origA[i] : a,
|
|
1220
|
+
b: this.origB.length > j ? this.origB[j] : b,
|
|
1221
|
+
...result
|
|
1222
|
+
};
|
|
1223
|
+
},
|
|
1224
|
+
`Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
|
|
1225
|
+
{ i, j }
|
|
1226
|
+
);
|
|
830
1227
|
}
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
this.POOLS[type].release({ buffer, size });
|
|
1228
|
+
async runSingleAsync(i, j) {
|
|
1229
|
+
return Promise.resolve(this.runSingle(i, j));
|
|
834
1230
|
}
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
return new StructuredData(data, key);
|
|
1231
|
+
runBatch() {
|
|
1232
|
+
const results = [];
|
|
1233
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1234
|
+
for (let j = 0; j < this.b.length; j++)
|
|
1235
|
+
results.push(this.runSingle(i, j));
|
|
1236
|
+
this.results = results;
|
|
842
1237
|
}
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
1238
|
+
async runBatchAsync() {
|
|
1239
|
+
const tasks = [];
|
|
1240
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1241
|
+
for (let j = 0; j < this.b.length; j++)
|
|
1242
|
+
tasks.push(this.runSingleAsync(i, j));
|
|
1243
|
+
this.results = await Promise.all(tasks);
|
|
846
1244
|
}
|
|
847
|
-
|
|
848
|
-
const
|
|
849
|
-
for (let i = 0; i <
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
}
|
|
853
|
-
return result;
|
|
1245
|
+
runPairwise() {
|
|
1246
|
+
const results = [];
|
|
1247
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1248
|
+
results.push(this.runSingle(i, i));
|
|
1249
|
+
this.results = results;
|
|
854
1250
|
}
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
'a' in v &&
|
|
861
|
-
'b' in v &&
|
|
862
|
-
'res' in v
|
|
863
|
-
);
|
|
1251
|
+
async runPairwiseAsync() {
|
|
1252
|
+
const tasks = [];
|
|
1253
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1254
|
+
tasks.push(this.runSingleAsync(i, i));
|
|
1255
|
+
this.results = await Promise.all(tasks);
|
|
864
1256
|
}
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
'source' in v &&
|
|
870
|
-
'target' in v &&
|
|
871
|
-
'match' in v
|
|
872
|
-
);
|
|
1257
|
+
setOriginal(a, b) {
|
|
1258
|
+
if (a) this.origA = Array.isArray(a) ? a : [a];
|
|
1259
|
+
if (b) this.origB = Array.isArray(b) ? b : [b];
|
|
1260
|
+
return this;
|
|
873
1261
|
}
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
const first = results[0];
|
|
877
|
-
let normalized = [];
|
|
878
|
-
if (this.isMetricResult(first)) normalized = results;
|
|
879
|
-
else if (this.isCmpStrResult(first))
|
|
880
|
-
normalized = results.map((r) => ({
|
|
881
|
-
metric: 'unknown',
|
|
882
|
-
a: r.source,
|
|
883
|
-
b: r.target,
|
|
884
|
-
res: r.match,
|
|
885
|
-
raw: r.raw
|
|
886
|
-
}));
|
|
887
|
-
else
|
|
888
|
-
throw new TypeError(
|
|
889
|
-
'Unsupported result format for StructuredData normalization.'
|
|
890
|
-
);
|
|
891
|
-
return normalized.map((r, idx) => ({ ...r, __idx: idx }));
|
|
1262
|
+
isBatch() {
|
|
1263
|
+
return this.a.length > 1 || this.b.length > 1;
|
|
892
1264
|
}
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
for (let i = 0; i < extractedStrings.length; i++) {
|
|
896
|
-
const str = extractedStrings[i];
|
|
897
|
-
if (!stringToIndices.has(str)) stringToIndices.set(str, []);
|
|
898
|
-
stringToIndices.get(str).push(i);
|
|
899
|
-
}
|
|
900
|
-
const output = new Array(results.length);
|
|
901
|
-
const occurrenceCount = new Map();
|
|
902
|
-
let out = 0;
|
|
903
|
-
for (let i = 0; i < results.length; i++) {
|
|
904
|
-
const result = results[i];
|
|
905
|
-
if (removeZero && result.res === 0) continue;
|
|
906
|
-
const targetStr = result.b || '';
|
|
907
|
-
const indices = stringToIndices.get(targetStr);
|
|
908
|
-
let dataIndex;
|
|
909
|
-
if (indices && indices.length > 0) {
|
|
910
|
-
const occurrence = occurrenceCount.get(targetStr) ?? 0;
|
|
911
|
-
occurrenceCount.set(targetStr, occurrence + 1);
|
|
912
|
-
dataIndex = indices[occurrence % indices.length];
|
|
913
|
-
} else {
|
|
914
|
-
dataIndex = result.__idx ?? i;
|
|
915
|
-
}
|
|
916
|
-
if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
|
|
917
|
-
const sourceObj = sourceData[dataIndex];
|
|
918
|
-
const mappedTarget = extractedStrings[dataIndex] || targetStr;
|
|
919
|
-
if (objectsOnly) output[out++] = sourceObj;
|
|
920
|
-
else
|
|
921
|
-
output[out++] = {
|
|
922
|
-
obj: sourceObj,
|
|
923
|
-
key: this.key,
|
|
924
|
-
result: {
|
|
925
|
-
source: result.a,
|
|
926
|
-
target: mappedTarget,
|
|
927
|
-
match: result.res
|
|
928
|
-
},
|
|
929
|
-
...(result.raw ? { raw: result.raw } : null)
|
|
930
|
-
};
|
|
931
|
-
}
|
|
932
|
-
output.length = out;
|
|
933
|
-
return output;
|
|
1265
|
+
isSingle() {
|
|
1266
|
+
return !this.isBatch();
|
|
934
1267
|
}
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
1268
|
+
isPairwise(safe = false) {
|
|
1269
|
+
return this.isBatch() && this.a.length === this.b.length
|
|
1270
|
+
? true
|
|
1271
|
+
: !safe &&
|
|
1272
|
+
(() => {
|
|
1273
|
+
throw new CmpStrUsageError(
|
|
1274
|
+
`Mode <pairwise> requires arrays of equal length`,
|
|
1275
|
+
{ a: this.a, b: this.b }
|
|
1276
|
+
);
|
|
1277
|
+
})();
|
|
939
1278
|
}
|
|
940
|
-
|
|
941
|
-
return this.
|
|
942
|
-
this.sort(this.normalizeResults(results), opt?.sort),
|
|
943
|
-
this.data,
|
|
944
|
-
extractedStrings,
|
|
945
|
-
opt?.removeZero,
|
|
946
|
-
opt?.objectsOnly
|
|
947
|
-
);
|
|
1279
|
+
isSymmetrical() {
|
|
1280
|
+
return this.symmetric;
|
|
948
1281
|
}
|
|
949
|
-
|
|
950
|
-
return this.
|
|
1282
|
+
whichMode(mode) {
|
|
1283
|
+
return mode ?? this.options.mode ?? 'default';
|
|
951
1284
|
}
|
|
952
|
-
|
|
953
|
-
|
|
1285
|
+
clear() {
|
|
1286
|
+
this.results = undefined;
|
|
954
1287
|
}
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
1288
|
+
run(mode, clear = true) {
|
|
1289
|
+
if (clear) this.clear();
|
|
1290
|
+
switch (this.whichMode(mode)) {
|
|
1291
|
+
case 'default':
|
|
1292
|
+
if (this.isSingle()) {
|
|
1293
|
+
this.results = this.runSingle(0, 0);
|
|
1294
|
+
break;
|
|
1295
|
+
}
|
|
1296
|
+
case 'batch':
|
|
1297
|
+
this.runBatch();
|
|
1298
|
+
break;
|
|
1299
|
+
case 'single':
|
|
1300
|
+
this.results = this.runSingle(0, 0);
|
|
1301
|
+
break;
|
|
1302
|
+
case 'pairwise':
|
|
1303
|
+
if (this.isPairwise()) this.runPairwise();
|
|
1304
|
+
break;
|
|
1305
|
+
default:
|
|
1306
|
+
throw new CmpStrInternalError(`Unsupported mode <${mode}>`);
|
|
961
1307
|
}
|
|
962
1308
|
}
|
|
963
|
-
async
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
1309
|
+
async runAsync(mode, clear = true) {
|
|
1310
|
+
if (clear) this.clear();
|
|
1311
|
+
switch (this.whichMode(mode)) {
|
|
1312
|
+
case 'default':
|
|
1313
|
+
if (this.isSingle()) {
|
|
1314
|
+
this.results = await this.runSingleAsync(0, 0);
|
|
1315
|
+
break;
|
|
1316
|
+
}
|
|
1317
|
+
case 'batch':
|
|
1318
|
+
await this.runBatchAsync();
|
|
1319
|
+
break;
|
|
1320
|
+
case 'single':
|
|
1321
|
+
this.results = await this.runSingleAsync(0, 0);
|
|
1322
|
+
break;
|
|
1323
|
+
case 'pairwise':
|
|
1324
|
+
if (this.isPairwise()) await this.runPairwiseAsync();
|
|
1325
|
+
break;
|
|
1326
|
+
default:
|
|
1327
|
+
throw new CmpStrInternalError(`Unsupported async mode <${mode}>`);
|
|
969
1328
|
}
|
|
970
1329
|
}
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
const b = this.extractFrom(other, otherKey);
|
|
974
|
-
try {
|
|
975
|
-
return this.performLookup(() => fn(a, b, opt), a, opt);
|
|
976
|
-
} finally {
|
|
977
|
-
Pool.release('string[]', a, a.length);
|
|
978
|
-
Pool.release('string[]', b, b.length);
|
|
979
|
-
}
|
|
1330
|
+
getMetricName() {
|
|
1331
|
+
return this.metric;
|
|
980
1332
|
}
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
1333
|
+
getResults() {
|
|
1334
|
+
ErrorUtil.assert(
|
|
1335
|
+
this.results !== undefined,
|
|
1336
|
+
`run() must be called before getResults()`
|
|
1337
|
+
);
|
|
1338
|
+
return this.results;
|
|
1339
|
+
}
|
|
1340
|
+
}
|
|
1341
|
+
const MetricRegistry = Registry('metric', Metric);
|
|
1342
|
+
|
|
1343
|
+
class CosineSimilarity extends Metric {
|
|
1344
|
+
constructor(a, b, opt = {}) {
|
|
1345
|
+
super('cosine', a, b, opt, true);
|
|
1346
|
+
}
|
|
1347
|
+
_termFreq(str, delimiter) {
|
|
1348
|
+
const terms = str.split(delimiter);
|
|
1349
|
+
const freq = Pool.acquire('map', terms.length);
|
|
1350
|
+
for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
|
|
1351
|
+
return freq;
|
|
1352
|
+
}
|
|
1353
|
+
compute(a, b) {
|
|
1354
|
+
const { delimiter = ' ' } = this.options;
|
|
1355
|
+
const termsA = this._termFreq(a, delimiter);
|
|
1356
|
+
const termsB = this._termFreq(b, delimiter);
|
|
984
1357
|
try {
|
|
985
|
-
|
|
1358
|
+
let dotP = 0,
|
|
1359
|
+
magA = 0,
|
|
1360
|
+
magB = 0;
|
|
1361
|
+
for (const [term, freqA] of termsA) {
|
|
1362
|
+
const freqB = termsB.get(term) || 0;
|
|
1363
|
+
dotP += freqA * freqB;
|
|
1364
|
+
magA += freqA * freqA;
|
|
1365
|
+
}
|
|
1366
|
+
for (const freqB of termsB.values()) magB += freqB * freqB;
|
|
1367
|
+
magA = Math.sqrt(magA);
|
|
1368
|
+
magB = Math.sqrt(magB);
|
|
1369
|
+
return {
|
|
1370
|
+
res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
|
|
1371
|
+
raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
|
|
1372
|
+
};
|
|
986
1373
|
} finally {
|
|
987
|
-
Pool.release('
|
|
988
|
-
Pool.release('
|
|
1374
|
+
Pool.release('map', termsA, termsA.size);
|
|
1375
|
+
Pool.release('map', termsB, termsB.size);
|
|
989
1376
|
}
|
|
990
1377
|
}
|
|
991
1378
|
}
|
|
1379
|
+
MetricRegistry.add('cosine', CosineSimilarity);
|
|
992
1380
|
|
|
993
|
-
class
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
sentence: /(?<=[.!?])\s+/,
|
|
997
|
-
word: /\p{L}+/gu,
|
|
998
|
-
nonWord: /[^\p{L}]/gu,
|
|
999
|
-
vowelGroup: /[aeiouy]+/g,
|
|
1000
|
-
letter: /\p{L}/gu,
|
|
1001
|
-
ucLetter: /\p{Lu}/gu
|
|
1002
|
-
};
|
|
1003
|
-
text;
|
|
1004
|
-
words = [];
|
|
1005
|
-
sentences = [];
|
|
1006
|
-
charFrequency = new Map();
|
|
1007
|
-
wordHistogram = new Map();
|
|
1008
|
-
syllableCache = new Map();
|
|
1009
|
-
syllableStats;
|
|
1010
|
-
constructor(input) {
|
|
1011
|
-
this.text = input.trim();
|
|
1012
|
-
this.tokenize();
|
|
1013
|
-
this.computeFrequencies();
|
|
1381
|
+
class DamerauLevenshteinDistance extends Metric {
|
|
1382
|
+
constructor(a, b, opt = {}) {
|
|
1383
|
+
super('damerau', a, b, opt, true);
|
|
1014
1384
|
}
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
const
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
const
|
|
1044
|
-
.map((w) => this.estimateSyllables(w))
|
|
1045
|
-
.sort((a, b) => a - b);
|
|
1046
|
-
const total = perWord.reduce((sum, s) => sum + s, 0);
|
|
1047
|
-
const mono = perWord.filter((s) => s === 1).length;
|
|
1048
|
-
const median = !perWord.length
|
|
1049
|
-
? 0
|
|
1050
|
-
: perWord.length % 2 === 0
|
|
1051
|
-
? (perWord[perWord.length / 2 - 1] + perWord[perWord.length / 2]) /
|
|
1052
|
-
2
|
|
1053
|
-
: perWord[Math.floor(perWord.length / 2)];
|
|
1385
|
+
compute(a, b, m, n, maxLen) {
|
|
1386
|
+
const len = m + 1;
|
|
1387
|
+
const [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
|
|
1388
|
+
try {
|
|
1389
|
+
for (let i = 0; i <= m; i++) prev[i] = i;
|
|
1390
|
+
for (let j = 1; j <= n; j++) {
|
|
1391
|
+
curr[0] = j;
|
|
1392
|
+
const cb = b.charCodeAt(j - 1);
|
|
1393
|
+
for (let i = 1; i <= m; i++) {
|
|
1394
|
+
const ca = a.charCodeAt(i - 1);
|
|
1395
|
+
const cost = ca === cb ? 0 : 1;
|
|
1396
|
+
let val = Math.min(
|
|
1397
|
+
curr[i - 1] + 1,
|
|
1398
|
+
prev[i] + 1,
|
|
1399
|
+
prev[i - 1] + cost
|
|
1400
|
+
);
|
|
1401
|
+
if (
|
|
1402
|
+
i > 1 &&
|
|
1403
|
+
j > 1 &&
|
|
1404
|
+
ca === b.charCodeAt(j - 2) &&
|
|
1405
|
+
cb === a.charCodeAt(i - 2)
|
|
1406
|
+
)
|
|
1407
|
+
val = Math.min(val, test[i - 2] + cost);
|
|
1408
|
+
curr[i] = val;
|
|
1409
|
+
}
|
|
1410
|
+
test.set(prev);
|
|
1411
|
+
prev.set(curr);
|
|
1412
|
+
}
|
|
1413
|
+
const dist = prev[m];
|
|
1054
1414
|
return {
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
perWord,
|
|
1058
|
-
avg: perWord.length ? total / perWord.length : 0,
|
|
1059
|
-
median
|
|
1415
|
+
res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
|
|
1416
|
+
raw: { dist, maxLen }
|
|
1060
1417
|
};
|
|
1061
|
-
}
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
getSentenceCount = () => this.sentences.length;
|
|
1066
|
-
getAvgWordLength() {
|
|
1067
|
-
return this.words.length
|
|
1068
|
-
? this.words.join('').length / this.words.length
|
|
1069
|
-
: 0;
|
|
1070
|
-
}
|
|
1071
|
-
getAvgSentenceLength() {
|
|
1072
|
-
return this.sentences.length
|
|
1073
|
-
? this.words.length / this.sentences.length
|
|
1074
|
-
: 0;
|
|
1075
|
-
}
|
|
1076
|
-
getWordHistogram() {
|
|
1077
|
-
return Object.fromEntries(this.wordHistogram);
|
|
1078
|
-
}
|
|
1079
|
-
getMostCommonWords(limit = 5) {
|
|
1080
|
-
return [...this.wordHistogram.entries()]
|
|
1081
|
-
.sort((a, b) => b[1] - a[1])
|
|
1082
|
-
.slice(0, limit)
|
|
1083
|
-
.map((e) => e[0]);
|
|
1084
|
-
}
|
|
1085
|
-
getHapaxLegomena() {
|
|
1086
|
-
return [...this.wordHistogram.entries()]
|
|
1087
|
-
.filter(([, c]) => c === 1)
|
|
1088
|
-
.map((e) => e[0]);
|
|
1089
|
-
}
|
|
1090
|
-
hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);
|
|
1091
|
-
getUpperCaseRatio() {
|
|
1092
|
-
const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
|
|
1093
|
-
const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
|
|
1094
|
-
return matches.length ? upper / matches.length : 0;
|
|
1095
|
-
}
|
|
1096
|
-
getCharFrequency() {
|
|
1097
|
-
return Object.fromEntries(this.charFrequency);
|
|
1098
|
-
}
|
|
1099
|
-
getUnicodeCodepoints() {
|
|
1100
|
-
const result = {};
|
|
1101
|
-
for (const [char, count] of this.charFrequency) {
|
|
1102
|
-
const block = char
|
|
1103
|
-
.charCodeAt(0)
|
|
1104
|
-
.toString(16)
|
|
1105
|
-
.padStart(4, '0')
|
|
1106
|
-
.toUpperCase();
|
|
1107
|
-
result[block] = (result[block] || 0) + count;
|
|
1418
|
+
} finally {
|
|
1419
|
+
Pool.release('int32', test, len);
|
|
1420
|
+
Pool.release('int32', prev, len);
|
|
1421
|
+
Pool.release('int32', curr, len);
|
|
1108
1422
|
}
|
|
1109
|
-
return result;
|
|
1110
|
-
}
|
|
1111
|
-
getLongWordRatio(len = 7) {
|
|
1112
|
-
let long = 0;
|
|
1113
|
-
for (const w of this.words) if (w.length >= len) long++;
|
|
1114
|
-
return this.words.length ? long / this.words.length : 0;
|
|
1115
|
-
}
|
|
1116
|
-
getShortWordRatio(len = 3) {
|
|
1117
|
-
let short = 0;
|
|
1118
|
-
for (const w of this.words) if (w.length <= len) short++;
|
|
1119
|
-
return this.words.length ? short / this.words.length : 0;
|
|
1120
|
-
}
|
|
1121
|
-
getSyllablesCount() {
|
|
1122
|
-
return this.computeSyllableStats().total;
|
|
1123
|
-
}
|
|
1124
|
-
getMonosyllabicWordCount() {
|
|
1125
|
-
return this.computeSyllableStats().mono;
|
|
1126
|
-
}
|
|
1127
|
-
getMinSyllablesWordCount(min) {
|
|
1128
|
-
return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
|
|
1129
|
-
}
|
|
1130
|
-
getMaxSyllablesWordCount(max) {
|
|
1131
|
-
return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
|
|
1132
1423
|
}
|
|
1133
|
-
|
|
1134
|
-
|
|
1424
|
+
}
|
|
1425
|
+
MetricRegistry.add('damerau', DamerauLevenshteinDistance);
|
|
1426
|
+
|
|
1427
|
+
class DiceSorensenCoefficient extends Metric {
|
|
1428
|
+
constructor(a, b, opt = {}) {
|
|
1429
|
+
super('dice', a, b, opt, true);
|
|
1135
1430
|
}
|
|
1136
|
-
|
|
1137
|
-
|
|
1431
|
+
_bigrams(str) {
|
|
1432
|
+
const len = str.length - 1;
|
|
1433
|
+
const bigrams = Pool.acquire('set', len);
|
|
1434
|
+
for (let i = 0; i < len; i++) bigrams.add(str.substring(i, i + 2));
|
|
1435
|
+
return bigrams;
|
|
1138
1436
|
}
|
|
1139
|
-
|
|
1437
|
+
compute(a, b) {
|
|
1438
|
+
const setA = this._bigrams(a),
|
|
1439
|
+
setB = this._bigrams(b);
|
|
1440
|
+
const sizeA = setA.size,
|
|
1441
|
+
sizeB = setB.size;
|
|
1140
1442
|
try {
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1443
|
+
let intersection = 0;
|
|
1444
|
+
for (const bigram of setA) if (setB.has(bigram)) intersection++;
|
|
1445
|
+
const size = sizeA + sizeB;
|
|
1446
|
+
return {
|
|
1447
|
+
res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
|
|
1448
|
+
raw: { intersection, size }
|
|
1449
|
+
};
|
|
1450
|
+
} finally {
|
|
1451
|
+
Pool.release('set', setA, sizeA);
|
|
1452
|
+
Pool.release('set', setB, sizeB);
|
|
1147
1453
|
}
|
|
1148
1454
|
}
|
|
1149
|
-
|
|
1150
|
-
|
|
1455
|
+
}
|
|
1456
|
+
MetricRegistry.add('dice', DiceSorensenCoefficient);
|
|
1457
|
+
|
|
1458
|
+
class HammingDistance extends Metric {
|
|
1459
|
+
constructor(a, b, opt = {}) {
|
|
1460
|
+
super('hamming', a, b, opt, true);
|
|
1151
1461
|
}
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
return 0.39 * asl + 11.8 * asw - 15.59;
|
|
1462
|
+
compute(a, b, m, n, maxLen) {
|
|
1463
|
+
if (m !== n) {
|
|
1464
|
+
if (this.options.pad !== undefined) {
|
|
1465
|
+
if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
|
|
1466
|
+
if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
|
|
1467
|
+
m = n = maxLen;
|
|
1468
|
+
} else
|
|
1469
|
+
throw new CmpStrUsageError(
|
|
1470
|
+
`Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
|
|
1471
|
+
`use option.pad for automatic adjustment`,
|
|
1472
|
+
{ a: m, b: n }
|
|
1473
|
+
);
|
|
1165
1474
|
}
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
const s = this.sentences.length || 1;
|
|
1170
|
-
const l = this.getLongWordRatio() * w;
|
|
1171
|
-
return w / s + (l / w) * 100;
|
|
1172
|
-
}
|
|
1173
|
-
getWSTFScore() {
|
|
1174
|
-
const w = this.words.length || 1;
|
|
1175
|
-
const h = (this.getMinSyllablesWordCount(3) / w) * 100;
|
|
1176
|
-
const s = this.getAvgSentenceLength();
|
|
1177
|
-
const l = this.getLongWordRatio() * 100;
|
|
1178
|
-
const m = (this.getMonosyllabicWordCount() / w) * 100;
|
|
1179
|
-
return [
|
|
1180
|
-
0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
|
|
1181
|
-
0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
|
|
1182
|
-
0.2963 * h + 0.1905 * s - 1.1144,
|
|
1183
|
-
0.2744 * h + 0.2656 * s - 1.693
|
|
1184
|
-
];
|
|
1475
|
+
let dist = 0;
|
|
1476
|
+
for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
|
|
1477
|
+
return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
|
|
1185
1478
|
}
|
|
1186
1479
|
}
|
|
1187
|
-
|
|
1188
|
-
const profiler$2 = Profiler.getInstance();
|
|
1189
|
-
class Metric {
|
|
1190
|
-
static cache = new HashTable();
|
|
1191
|
-
metric;
|
|
1192
|
-
a;
|
|
1193
|
-
b;
|
|
1194
|
-
origA = [];
|
|
1195
|
-
origB = [];
|
|
1196
|
-
options;
|
|
1197
|
-
optKey;
|
|
1198
|
-
symmetric;
|
|
1199
|
-
results;
|
|
1200
|
-
static clear = () => this.cache.clear();
|
|
1201
|
-
static swap = (a, b, m, n) => (m > n ? [b, a, n, m] : [a, b, m, n]);
|
|
1202
|
-
static clamp = (res) => Math.max(0, Math.min(1, res));
|
|
1203
|
-
constructor(metric, a, b, opt = {}, symmetric = false) {
|
|
1204
|
-
this.metric = metric;
|
|
1205
|
-
this.a = Array.isArray(a) ? a : [a];
|
|
1206
|
-
this.b = Array.isArray(b) ? b : [b];
|
|
1207
|
-
if (this.a.length === 0 || this.b.length === 0)
|
|
1208
|
-
throw new Error(`Inputs <a> and <b> must not be empty`);
|
|
1209
|
-
this.options = opt;
|
|
1210
|
-
this.optKey = Hasher.fastFNV1a(
|
|
1211
|
-
JSON.stringify(opt, Object.keys(opt).sort())
|
|
1212
|
-
).toString();
|
|
1213
|
-
this.symmetric = symmetric;
|
|
1214
|
-
}
|
|
1215
|
-
preCompute(a, b, m, n) {
|
|
1216
|
-
if (a === b) return { res: 1 };
|
|
1217
|
-
if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 };
|
|
1218
|
-
return undefined;
|
|
1219
|
-
}
|
|
1220
|
-
compute(a, b, m, n, maxLen) {
|
|
1221
|
-
throw new Error(`Method compute() must be overridden in a subclass`);
|
|
1222
|
-
}
|
|
1223
|
-
runSingle(i, j) {
|
|
1224
|
-
let a = String(this.a[i]),
|
|
1225
|
-
A = a;
|
|
1226
|
-
let b = String(this.b[j]),
|
|
1227
|
-
B = b;
|
|
1228
|
-
let m = A.length,
|
|
1229
|
-
n = B.length;
|
|
1230
|
-
let result = this.preCompute(A, B, m, n);
|
|
1231
|
-
if (!result) {
|
|
1232
|
-
result = profiler$2.run(() => {
|
|
1233
|
-
if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
|
|
1234
|
-
const key =
|
|
1235
|
-
Metric.cache.key(this.metric, [A, B], this.symmetric) + this.optKey;
|
|
1236
|
-
return (
|
|
1237
|
-
Metric.cache.get(key || '') ??
|
|
1238
|
-
(() => {
|
|
1239
|
-
const res = this.compute(A, B, m, n, Math.max(m, n));
|
|
1240
|
-
if (key) Metric.cache.set(key, res);
|
|
1241
|
-
return res;
|
|
1242
|
-
})()
|
|
1243
|
-
);
|
|
1244
|
-
});
|
|
1245
|
-
}
|
|
1246
|
-
return {
|
|
1247
|
-
metric: this.metric,
|
|
1248
|
-
a: this.origA[i] ?? a,
|
|
1249
|
-
b: this.origB[j] ?? b,
|
|
1250
|
-
...result
|
|
1251
|
-
};
|
|
1252
|
-
}
|
|
1253
|
-
async runSingleAsync(i, j) {
|
|
1254
|
-
return Promise.resolve(this.runSingle(i, j));
|
|
1255
|
-
}
|
|
1256
|
-
runBatch() {
|
|
1257
|
-
const results = [];
|
|
1258
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1259
|
-
for (let j = 0; j < this.b.length; j++)
|
|
1260
|
-
results.push(this.runSingle(i, j));
|
|
1261
|
-
this.results = results;
|
|
1262
|
-
}
|
|
1263
|
-
async runBatchAsync() {
|
|
1264
|
-
const results = [];
|
|
1265
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1266
|
-
for (let j = 0; j < this.b.length; j++)
|
|
1267
|
-
results.push(await this.runSingleAsync(i, j));
|
|
1268
|
-
this.results = results;
|
|
1269
|
-
}
|
|
1270
|
-
runPairwise() {
|
|
1271
|
-
const results = [];
|
|
1272
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1273
|
-
results.push(this.runSingle(i, i));
|
|
1274
|
-
this.results = results;
|
|
1275
|
-
}
|
|
1276
|
-
async runPairwiseAsync() {
|
|
1277
|
-
const results = [];
|
|
1278
|
-
for (let i = 0; i < this.a.length; i++)
|
|
1279
|
-
results.push(await this.runSingleAsync(i, i));
|
|
1280
|
-
this.results = results;
|
|
1281
|
-
}
|
|
1282
|
-
setOriginal(a, b) {
|
|
1283
|
-
if (a) this.origA = Array.isArray(a) ? a : [a];
|
|
1284
|
-
if (b) this.origB = Array.isArray(b) ? b : [b];
|
|
1285
|
-
return this;
|
|
1286
|
-
}
|
|
1287
|
-
isBatch = () => this.a.length > 1 || this.b.length > 1;
|
|
1288
|
-
isSingle = () => !this.isBatch();
|
|
1289
|
-
isPairwise(safe = false) {
|
|
1290
|
-
return this.isBatch() && this.a.length === this.b.length
|
|
1291
|
-
? true
|
|
1292
|
-
: !safe &&
|
|
1293
|
-
(() => {
|
|
1294
|
-
throw new Error(
|
|
1295
|
-
`Mode <pairwise> requires arrays of equal length`
|
|
1296
|
-
);
|
|
1297
|
-
})();
|
|
1298
|
-
}
|
|
1299
|
-
isSymmetrical = () => this.symmetric;
|
|
1300
|
-
whichMode = (mode) => mode ?? this.options?.mode ?? 'default';
|
|
1301
|
-
clear = () => (this.results = undefined);
|
|
1302
|
-
run(mode, clear = true) {
|
|
1303
|
-
if (clear) this.clear();
|
|
1304
|
-
switch (this.whichMode(mode)) {
|
|
1305
|
-
case 'default':
|
|
1306
|
-
if (this.isSingle()) {
|
|
1307
|
-
this.results = this.runSingle(0, 0);
|
|
1308
|
-
break;
|
|
1309
|
-
}
|
|
1310
|
-
case 'batch':
|
|
1311
|
-
this.runBatch();
|
|
1312
|
-
break;
|
|
1313
|
-
case 'single':
|
|
1314
|
-
this.results = this.runSingle(0, 0);
|
|
1315
|
-
break;
|
|
1316
|
-
case 'pairwise':
|
|
1317
|
-
if (this.isPairwise()) this.runPairwise();
|
|
1318
|
-
break;
|
|
1319
|
-
default:
|
|
1320
|
-
throw new Error(`Unsupported mode <${mode}>`);
|
|
1321
|
-
}
|
|
1322
|
-
}
|
|
1323
|
-
async runAsync(mode, clear = true) {
|
|
1324
|
-
if (clear) this.clear();
|
|
1325
|
-
switch (this.whichMode(mode)) {
|
|
1326
|
-
case 'default':
|
|
1327
|
-
if (this.isSingle()) {
|
|
1328
|
-
this.results = await this.runSingleAsync(0, 0);
|
|
1329
|
-
break;
|
|
1330
|
-
}
|
|
1331
|
-
case 'batch':
|
|
1332
|
-
await this.runBatchAsync();
|
|
1333
|
-
break;
|
|
1334
|
-
case 'single':
|
|
1335
|
-
this.results = await this.runSingleAsync(0, 0);
|
|
1336
|
-
break;
|
|
1337
|
-
case 'pairwise':
|
|
1338
|
-
if (this.isPairwise()) await this.runPairwiseAsync();
|
|
1339
|
-
break;
|
|
1340
|
-
default:
|
|
1341
|
-
throw new Error(`Unsupported async mode <${mode}>`);
|
|
1342
|
-
}
|
|
1343
|
-
}
|
|
1344
|
-
getMetricName = () => this.metric;
|
|
1345
|
-
getResults() {
|
|
1346
|
-
if (this.results === undefined)
|
|
1347
|
-
throw new Error(`run() must be called before getResult()`);
|
|
1348
|
-
return this.results;
|
|
1349
|
-
}
|
|
1350
|
-
}
|
|
1351
|
-
const MetricRegistry = Registry('metric', Metric);
|
|
1352
|
-
|
|
1353
|
-
class CosineSimilarity extends Metric {
|
|
1354
|
-
constructor(a, b, opt = {}) {
|
|
1355
|
-
super('cosine', a, b, opt, true);
|
|
1356
|
-
}
|
|
1357
|
-
_termFreq(str, delimiter) {
|
|
1358
|
-
const terms = str.split(delimiter);
|
|
1359
|
-
const freq = Pool.acquire('map', terms.length);
|
|
1360
|
-
for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
|
|
1361
|
-
return freq;
|
|
1362
|
-
}
|
|
1363
|
-
compute(a, b) {
|
|
1364
|
-
const { delimiter = ' ' } = this.options;
|
|
1365
|
-
const termsA = this._termFreq(a, delimiter);
|
|
1366
|
-
const termsB = this._termFreq(b, delimiter);
|
|
1367
|
-
try {
|
|
1368
|
-
let dotP = 0,
|
|
1369
|
-
magA = 0,
|
|
1370
|
-
magB = 0;
|
|
1371
|
-
for (const [term, freqA] of termsA) {
|
|
1372
|
-
const freqB = termsB.get(term) || 0;
|
|
1373
|
-
dotP += freqA * freqB;
|
|
1374
|
-
magA += freqA * freqA;
|
|
1375
|
-
}
|
|
1376
|
-
for (const freqB of termsB.values()) magB += freqB * freqB;
|
|
1377
|
-
magA = Math.sqrt(magA);
|
|
1378
|
-
magB = Math.sqrt(magB);
|
|
1379
|
-
return {
|
|
1380
|
-
res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
|
|
1381
|
-
raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
|
|
1382
|
-
};
|
|
1383
|
-
} finally {
|
|
1384
|
-
Pool.release('map', termsA, termsA.size);
|
|
1385
|
-
Pool.release('map', termsB, termsB.size);
|
|
1386
|
-
}
|
|
1387
|
-
}
|
|
1388
|
-
}
|
|
1389
|
-
MetricRegistry.add('cosine', CosineSimilarity);
|
|
1390
|
-
|
|
1391
|
-
class DamerauLevenshteinDistance extends Metric {
|
|
1392
|
-
constructor(a, b, opt = {}) {
|
|
1393
|
-
super('damerau', a, b, opt, true);
|
|
1394
|
-
}
|
|
1395
|
-
compute(a, b, m, n, maxLen) {
|
|
1396
|
-
const len = m + 1;
|
|
1397
|
-
const [test, prev, curr] = Pool.acquireMany('int32', [len, len, len]);
|
|
1398
|
-
try {
|
|
1399
|
-
for (let i = 0; i <= m; i++) prev[i] = i;
|
|
1400
|
-
for (let j = 1; j <= n; j++) {
|
|
1401
|
-
curr[0] = j;
|
|
1402
|
-
const cb = b.charCodeAt(j - 1);
|
|
1403
|
-
for (let i = 1; i <= m; i++) {
|
|
1404
|
-
const ca = a.charCodeAt(i - 1);
|
|
1405
|
-
const cost = ca === cb ? 0 : 1;
|
|
1406
|
-
let val = Math.min(
|
|
1407
|
-
curr[i - 1] + 1,
|
|
1408
|
-
prev[i] + 1,
|
|
1409
|
-
prev[i - 1] + cost
|
|
1410
|
-
);
|
|
1411
|
-
if (
|
|
1412
|
-
i > 1 &&
|
|
1413
|
-
j > 1 &&
|
|
1414
|
-
ca === b.charCodeAt(j - 2) &&
|
|
1415
|
-
cb === a.charCodeAt(i - 2)
|
|
1416
|
-
)
|
|
1417
|
-
val = Math.min(val, test[i - 2] + cost);
|
|
1418
|
-
curr[i] = val;
|
|
1419
|
-
}
|
|
1420
|
-
test.set(prev);
|
|
1421
|
-
prev.set(curr);
|
|
1422
|
-
}
|
|
1423
|
-
const dist = prev[m];
|
|
1424
|
-
return {
|
|
1425
|
-
res: maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen),
|
|
1426
|
-
raw: { dist, maxLen }
|
|
1427
|
-
};
|
|
1428
|
-
} finally {
|
|
1429
|
-
Pool.release('int32', test, len);
|
|
1430
|
-
Pool.release('int32', prev, len);
|
|
1431
|
-
Pool.release('int32', curr, len);
|
|
1432
|
-
}
|
|
1433
|
-
}
|
|
1434
|
-
}
|
|
1435
|
-
MetricRegistry.add('damerau', DamerauLevenshteinDistance);
|
|
1436
|
-
|
|
1437
|
-
class DiceSorensenCoefficient extends Metric {
|
|
1438
|
-
constructor(a, b, opt = {}) {
|
|
1439
|
-
super('dice', a, b, opt, true);
|
|
1440
|
-
}
|
|
1441
|
-
_bigrams(str) {
|
|
1442
|
-
const len = str.length - 1;
|
|
1443
|
-
const bigrams = Pool.acquire('set', len);
|
|
1444
|
-
for (let i = 0; i < len; i++) bigrams.add(str.substring(i, i + 2));
|
|
1445
|
-
return bigrams;
|
|
1446
|
-
}
|
|
1447
|
-
compute(a, b) {
|
|
1448
|
-
const setA = this._bigrams(a),
|
|
1449
|
-
setB = this._bigrams(b);
|
|
1450
|
-
const sizeA = setA.size,
|
|
1451
|
-
sizeB = setB.size;
|
|
1452
|
-
try {
|
|
1453
|
-
let intersection = 0;
|
|
1454
|
-
for (const bigram of setA) if (setB.has(bigram)) intersection++;
|
|
1455
|
-
const size = sizeA + sizeB;
|
|
1456
|
-
return {
|
|
1457
|
-
res: size === 0 ? 1 : Metric.clamp((2 * intersection) / size),
|
|
1458
|
-
raw: { intersection, size }
|
|
1459
|
-
};
|
|
1460
|
-
} finally {
|
|
1461
|
-
Pool.release('set', setA, sizeA);
|
|
1462
|
-
Pool.release('set', setB, sizeB);
|
|
1463
|
-
}
|
|
1464
|
-
}
|
|
1465
|
-
}
|
|
1466
|
-
MetricRegistry.add('dice', DiceSorensenCoefficient);
|
|
1467
|
-
|
|
1468
|
-
class HammingDistance extends Metric {
|
|
1469
|
-
constructor(a, b, opt = {}) {
|
|
1470
|
-
super('hamming', a, b, opt, true);
|
|
1471
|
-
}
|
|
1472
|
-
compute(a, b, m, n, maxLen) {
|
|
1473
|
-
if (m !== n) {
|
|
1474
|
-
if (this.options.pad !== undefined) {
|
|
1475
|
-
if (m < maxLen) a = a.padEnd(maxLen, this.options.pad);
|
|
1476
|
-
if (n < maxLen) b = b.padEnd(maxLen, this.options.pad);
|
|
1477
|
-
m = n = maxLen;
|
|
1478
|
-
} else
|
|
1479
|
-
throw new Error(
|
|
1480
|
-
`Strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
|
|
1481
|
-
`use option.pad for automatic adjustment`
|
|
1482
|
-
);
|
|
1483
|
-
}
|
|
1484
|
-
let dist = 0;
|
|
1485
|
-
for (let i = 0; i < m; i++) if (a[i] !== b[i]) dist++;
|
|
1486
|
-
return { res: m === 0 ? 1 : Metric.clamp(1 - dist / m), raw: { dist } };
|
|
1487
|
-
}
|
|
1488
|
-
}
|
|
1489
|
-
MetricRegistry.add('hamming', HammingDistance);
|
|
1480
|
+
MetricRegistry.add('hamming', HammingDistance);
|
|
1490
1481
|
|
|
1491
1482
|
class JaccardIndex extends Metric {
|
|
1492
1483
|
constructor(a, b, opt = {}) {
|
|
@@ -1752,43 +1743,59 @@
|
|
|
1752
1743
|
options;
|
|
1753
1744
|
optKey;
|
|
1754
1745
|
map;
|
|
1755
|
-
|
|
1746
|
+
ignoreSet;
|
|
1747
|
+
static clear() {
|
|
1748
|
+
this.cache.clear();
|
|
1749
|
+
}
|
|
1756
1750
|
constructor(algo, opt = {}) {
|
|
1757
1751
|
const defaults = this.constructor.default ?? {};
|
|
1758
1752
|
const mapId = opt.map ?? defaults.map;
|
|
1759
1753
|
if (!mapId)
|
|
1760
|
-
throw new
|
|
1754
|
+
throw new CmpStrNotFoundError(
|
|
1755
|
+
`No mapping specified for phonetic algorithm`,
|
|
1756
|
+
{ algo }
|
|
1757
|
+
);
|
|
1761
1758
|
const map = PhoneticMappingRegistry.get(algo, mapId);
|
|
1762
1759
|
if (map === undefined)
|
|
1763
|
-
throw new
|
|
1764
|
-
|
|
1760
|
+
throw new CmpStrNotFoundError(
|
|
1761
|
+
`Requested mapping <${mapId}> is not declared`,
|
|
1762
|
+
{ algo, mapId }
|
|
1763
|
+
);
|
|
1764
|
+
this.options = DeepMerge.merge(
|
|
1765
|
+
DeepMerge.merge(defaults, map.options ?? {}),
|
|
1766
|
+
opt
|
|
1767
|
+
);
|
|
1765
1768
|
this.optKey = Hasher.fastFNV1a(
|
|
1766
1769
|
JSON.stringify(this.options, Object.keys(this.options).sort())
|
|
1767
1770
|
).toString();
|
|
1768
1771
|
this.algo = algo;
|
|
1769
1772
|
this.map = map;
|
|
1773
|
+
this.ignoreSet = new Set(map.ignore ?? []);
|
|
1770
1774
|
}
|
|
1771
1775
|
applyPattern(word) {
|
|
1772
1776
|
const { patterns = [] } = this.map;
|
|
1773
|
-
if (!patterns
|
|
1777
|
+
if (!patterns.length) return word;
|
|
1774
1778
|
for (const { pattern, replace, all = false } of patterns) {
|
|
1775
|
-
word =
|
|
1779
|
+
word = all
|
|
1780
|
+
? word.replaceAll(pattern, replace)
|
|
1781
|
+
: word.replace(pattern, replace);
|
|
1776
1782
|
}
|
|
1777
1783
|
return word;
|
|
1778
1784
|
}
|
|
1779
1785
|
applyRules(char, i, chars, charLen) {
|
|
1780
1786
|
const { ruleset = [] } = this.map;
|
|
1781
|
-
if (!ruleset
|
|
1787
|
+
if (!ruleset.length) return undefined;
|
|
1782
1788
|
const prev = chars[i - 1] || '',
|
|
1783
1789
|
prev2 = chars[i - 2] || '';
|
|
1784
1790
|
const next = chars[i + 1] || '',
|
|
1785
1791
|
next2 = chars[i + 2] || '';
|
|
1792
|
+
const str = chars.join('');
|
|
1786
1793
|
for (const rule of ruleset) {
|
|
1787
1794
|
if (rule.char && rule.char !== char) continue;
|
|
1788
1795
|
if (rule.position === 'start' && i !== 0) continue;
|
|
1789
1796
|
if (rule.position === 'middle' && (i === 0 || i === charLen - 1))
|
|
1790
1797
|
continue;
|
|
1791
|
-
if (rule.position === 'end' && i !== charLen) continue;
|
|
1798
|
+
if (rule.position === 'end' && i !== charLen - 1) continue;
|
|
1792
1799
|
if (rule.prev && !rule.prev.includes(prev)) continue;
|
|
1793
1800
|
if (rule.prevNot && rule.prevNot.includes(prev)) continue;
|
|
1794
1801
|
if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
|
|
@@ -1799,12 +1806,12 @@
|
|
|
1799
1806
|
if (rule.next2Not && rule.next2Not.includes(next2)) continue;
|
|
1800
1807
|
if (
|
|
1801
1808
|
rule.leading &&
|
|
1802
|
-
!rule.leading.includes(
|
|
1809
|
+
!rule.leading.includes(str.slice(0, rule.leading.length))
|
|
1803
1810
|
)
|
|
1804
1811
|
continue;
|
|
1805
1812
|
if (
|
|
1806
1813
|
rule.trailing &&
|
|
1807
|
-
!rule.trailing.includes(
|
|
1814
|
+
!rule.trailing.includes(str.slice(-rule.trailing.length))
|
|
1808
1815
|
)
|
|
1809
1816
|
continue;
|
|
1810
1817
|
if (rule.match && !rule.match.every((c, j) => chars[i + j] === c))
|
|
@@ -1814,7 +1821,7 @@
|
|
|
1814
1821
|
return undefined;
|
|
1815
1822
|
}
|
|
1816
1823
|
encode(word) {
|
|
1817
|
-
const { map = {}
|
|
1824
|
+
const { map = {} } = this.map;
|
|
1818
1825
|
word = this.applyPattern(word);
|
|
1819
1826
|
const chars = this.word2Chars(word);
|
|
1820
1827
|
const charLen = chars.length;
|
|
@@ -1822,7 +1829,7 @@
|
|
|
1822
1829
|
lastCode = null;
|
|
1823
1830
|
for (let i = 0; i < charLen; i++) {
|
|
1824
1831
|
const char = chars[i];
|
|
1825
|
-
if (
|
|
1832
|
+
if (this.ignoreSet.has(char)) continue;
|
|
1826
1833
|
const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
|
|
1827
1834
|
if (mapped === undefined) continue;
|
|
1828
1835
|
((code += mapped), (lastCode = mapped));
|
|
@@ -1842,7 +1849,9 @@
|
|
|
1842
1849
|
? input
|
|
1843
1850
|
: (input + pad.repeat(length)).slice(0, length);
|
|
1844
1851
|
}
|
|
1845
|
-
word2Chars
|
|
1852
|
+
word2Chars(word) {
|
|
1853
|
+
return Array.from(word.toLowerCase());
|
|
1854
|
+
}
|
|
1846
1855
|
exitEarly(code, i) {
|
|
1847
1856
|
const { length = -1 } = this.options;
|
|
1848
1857
|
return length > 0 && code.length >= length;
|
|
@@ -1851,37 +1860,52 @@
|
|
|
1851
1860
|
return code;
|
|
1852
1861
|
}
|
|
1853
1862
|
loop(words) {
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
const
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1863
|
+
return ErrorUtil.wrap(
|
|
1864
|
+
() => {
|
|
1865
|
+
const index = [];
|
|
1866
|
+
for (const word of words) {
|
|
1867
|
+
let key = Phonetic.cache.key(this.algo, [word]);
|
|
1868
|
+
if (key) key += this.optKey;
|
|
1869
|
+
const code =
|
|
1870
|
+
Phonetic.cache.get(key || '') ??
|
|
1871
|
+
(() => {
|
|
1872
|
+
const res = this.encode(word);
|
|
1873
|
+
if (key) Phonetic.cache.set(key, res);
|
|
1874
|
+
return res;
|
|
1875
|
+
})();
|
|
1876
|
+
if (code && code.length) index.push(this.equalLen(code));
|
|
1877
|
+
}
|
|
1878
|
+
return index;
|
|
1879
|
+
},
|
|
1880
|
+
`Failed to generate phonetic index`,
|
|
1881
|
+
{ algo: this.algo, words }
|
|
1882
|
+
);
|
|
1867
1883
|
}
|
|
1868
1884
|
async loopAsync(words) {
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
|
|
1882
|
-
|
|
1885
|
+
return ErrorUtil.wrapAsync(
|
|
1886
|
+
async () => {
|
|
1887
|
+
const index = [];
|
|
1888
|
+
for (const word of words) {
|
|
1889
|
+
const key = Phonetic.cache.key(this.algo, [word]) + this.optKey;
|
|
1890
|
+
const code = await Promise.resolve(
|
|
1891
|
+
Phonetic.cache.get(key || '') ??
|
|
1892
|
+
(() => {
|
|
1893
|
+
const res = this.encode(word);
|
|
1894
|
+
if (key) Phonetic.cache.set(key, res);
|
|
1895
|
+
return res;
|
|
1896
|
+
})()
|
|
1897
|
+
);
|
|
1898
|
+
if (code && code.length) index.push(this.equalLen(code));
|
|
1899
|
+
}
|
|
1900
|
+
return index;
|
|
1901
|
+
},
|
|
1902
|
+
`Failed to generate phonetic index asynchronously`,
|
|
1903
|
+
{ algo: this.algo, words }
|
|
1904
|
+
);
|
|
1905
|
+
}
|
|
1906
|
+
getAlgoName() {
|
|
1907
|
+
return this.algo;
|
|
1883
1908
|
}
|
|
1884
|
-
getAlgoName = () => this.algo;
|
|
1885
1909
|
getIndex(input) {
|
|
1886
1910
|
const { delimiter = ' ' } = this.options;
|
|
1887
1911
|
return profiler$1.run(() =>
|
|
@@ -1905,10 +1929,11 @@
|
|
|
1905
1929
|
return Object.freeze({
|
|
1906
1930
|
add(algo, id, map, update = false) {
|
|
1907
1931
|
const mappings = maps(algo);
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
|
|
1932
|
+
ErrorUtil.assert(
|
|
1933
|
+
!(!id || id in mappings) || update,
|
|
1934
|
+
`Entry <${id}> already exists / use <update=true> to overwrite`,
|
|
1935
|
+
{ algo, id }
|
|
1936
|
+
);
|
|
1912
1937
|
mappings[id] = map;
|
|
1913
1938
|
},
|
|
1914
1939
|
remove(algo, id) {
|
|
@@ -2118,170 +2143,743 @@
|
|
|
2118
2143
|
constructor(opt = {}) {
|
|
2119
2144
|
super('metaphone', opt);
|
|
2120
2145
|
}
|
|
2121
|
-
encode(word) {
|
|
2122
|
-
word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
|
|
2123
|
-
c === 'C' ? m : c
|
|
2124
|
-
);
|
|
2125
|
-
return super.encode(word);
|
|
2146
|
+
encode(word) {
|
|
2147
|
+
word = word.replace(Metaphone.REGEX.adjacent, (m, c) =>
|
|
2148
|
+
c === 'C' ? m : c
|
|
2149
|
+
);
|
|
2150
|
+
return super.encode(word);
|
|
2151
|
+
}
|
|
2152
|
+
adjustCode(code) {
|
|
2153
|
+
return (
|
|
2154
|
+
code.slice(0, 1) + code.slice(1).replace(Metaphone.REGEX.vowel, '')
|
|
2155
|
+
);
|
|
2156
|
+
}
|
|
2157
|
+
}
|
|
2158
|
+
PhoneticRegistry.add('metaphone', Metaphone);
|
|
2159
|
+
PhoneticMappingRegistry.add('metaphone', 'en90', {
|
|
2160
|
+
map: {
|
|
2161
|
+
a: 'A',
|
|
2162
|
+
b: 'B',
|
|
2163
|
+
c: 'K',
|
|
2164
|
+
d: 'T',
|
|
2165
|
+
e: 'E',
|
|
2166
|
+
f: 'F',
|
|
2167
|
+
g: 'K',
|
|
2168
|
+
h: 'H',
|
|
2169
|
+
i: 'I',
|
|
2170
|
+
j: 'J',
|
|
2171
|
+
k: 'K',
|
|
2172
|
+
l: 'L',
|
|
2173
|
+
m: 'M',
|
|
2174
|
+
n: 'N',
|
|
2175
|
+
o: 'O',
|
|
2176
|
+
p: 'P',
|
|
2177
|
+
q: 'K',
|
|
2178
|
+
r: 'R',
|
|
2179
|
+
s: 'S',
|
|
2180
|
+
t: 'T',
|
|
2181
|
+
u: 'U',
|
|
2182
|
+
v: 'F',
|
|
2183
|
+
w: 'W',
|
|
2184
|
+
x: 'KS',
|
|
2185
|
+
y: 'Y',
|
|
2186
|
+
z: 'S'
|
|
2187
|
+
},
|
|
2188
|
+
ruleset: [
|
|
2189
|
+
{ char: 'a', position: 'start', next: ['e'], code: '' },
|
|
2190
|
+
{ char: 'g', position: 'start', next: ['n'], code: '' },
|
|
2191
|
+
{ char: 'k', position: 'start', next: ['n'], code: '' },
|
|
2192
|
+
{ char: 'p', position: 'start', next: ['n'], code: '' },
|
|
2193
|
+
{ char: 'w', position: 'start', next: ['r'], code: '' },
|
|
2194
|
+
{ char: 'b', position: 'end', prev: ['m'], code: '' },
|
|
2195
|
+
{ char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
|
|
2196
|
+
{ char: 'c', next: ['i'], next2: ['a'], code: 'X' },
|
|
2197
|
+
{ char: 'c', next: ['e', 'i', 'y'], code: 'S' },
|
|
2198
|
+
{ char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
|
|
2199
|
+
{
|
|
2200
|
+
char: 'g',
|
|
2201
|
+
next: ['h'],
|
|
2202
|
+
next2Not: ['', 'a', 'e', 'i', 'o', 'u'],
|
|
2203
|
+
code: ''
|
|
2204
|
+
},
|
|
2205
|
+
{ char: 'g', trailing: 'n', code: '' },
|
|
2206
|
+
{ char: 'g', trailing: 'ned', code: '' },
|
|
2207
|
+
{ char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
|
|
2208
|
+
{
|
|
2209
|
+
char: 'h',
|
|
2210
|
+
prev: ['a', 'e', 'i', 'o', 'u'],
|
|
2211
|
+
nextNot: ['a', 'e', 'i', 'o', 'u'],
|
|
2212
|
+
code: ''
|
|
2213
|
+
},
|
|
2214
|
+
{ char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
|
|
2215
|
+
{ char: 'k', prev: ['c'], code: '' },
|
|
2216
|
+
{ char: 'p', next: ['h'], code: 'F' },
|
|
2217
|
+
{ char: 's', next: ['h'], code: 'X' },
|
|
2218
|
+
{ char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2219
|
+
{ char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
|
|
2220
|
+
{ char: 't', next: ['h'], code: '0' },
|
|
2221
|
+
{ char: 't', next: ['c'], next2: ['h'], code: '' },
|
|
2222
|
+
{ char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
|
|
2223
|
+
{ char: 'h', leading: 'w', code: '' },
|
|
2224
|
+
{ char: 'x', position: 'start', code: 'S' },
|
|
2225
|
+
{ char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
|
|
2226
|
+
]
|
|
2227
|
+
});
|
|
2228
|
+
|
|
2229
|
+
class Soundex extends Phonetic {
|
|
2230
|
+
static default = {
|
|
2231
|
+
map: 'en',
|
|
2232
|
+
delimiter: ' ',
|
|
2233
|
+
length: 4,
|
|
2234
|
+
pad: '0',
|
|
2235
|
+
dedupe: true
|
|
2236
|
+
};
|
|
2237
|
+
constructor(opt = {}) {
|
|
2238
|
+
super('soundex', opt);
|
|
2239
|
+
}
|
|
2240
|
+
adjustCode(code, chars) {
|
|
2241
|
+
return chars[0].toUpperCase() + code.slice(1).replaceAll('0', '');
|
|
2242
|
+
}
|
|
2243
|
+
}
|
|
2244
|
+
PhoneticRegistry.add('soundex', Soundex);
|
|
2245
|
+
PhoneticMappingRegistry.add('soundex', 'en', {
|
|
2246
|
+
map: {
|
|
2247
|
+
a: '0',
|
|
2248
|
+
e: '0',
|
|
2249
|
+
h: '0',
|
|
2250
|
+
i: '0',
|
|
2251
|
+
o: '0',
|
|
2252
|
+
u: '0',
|
|
2253
|
+
w: '0',
|
|
2254
|
+
y: '0',
|
|
2255
|
+
b: '1',
|
|
2256
|
+
f: '1',
|
|
2257
|
+
p: '1',
|
|
2258
|
+
v: '1',
|
|
2259
|
+
c: '2',
|
|
2260
|
+
g: '2',
|
|
2261
|
+
j: '2',
|
|
2262
|
+
k: '2',
|
|
2263
|
+
q: '2',
|
|
2264
|
+
s: '2',
|
|
2265
|
+
x: '2',
|
|
2266
|
+
z: '2',
|
|
2267
|
+
d: '3',
|
|
2268
|
+
t: '3',
|
|
2269
|
+
l: '4',
|
|
2270
|
+
m: '5',
|
|
2271
|
+
n: '5',
|
|
2272
|
+
r: '6'
|
|
2273
|
+
}
|
|
2274
|
+
});
|
|
2275
|
+
PhoneticMappingRegistry.add('soundex', 'de', {
|
|
2276
|
+
map: {
|
|
2277
|
+
a: '0',
|
|
2278
|
+
ä: '0',
|
|
2279
|
+
e: '0',
|
|
2280
|
+
h: '0',
|
|
2281
|
+
i: '0',
|
|
2282
|
+
j: '0',
|
|
2283
|
+
o: '0',
|
|
2284
|
+
ö: '0',
|
|
2285
|
+
u: '0',
|
|
2286
|
+
ü: '0',
|
|
2287
|
+
y: '0',
|
|
2288
|
+
b: '1',
|
|
2289
|
+
f: '1',
|
|
2290
|
+
p: '1',
|
|
2291
|
+
v: '1',
|
|
2292
|
+
w: '1',
|
|
2293
|
+
c: '2',
|
|
2294
|
+
g: '2',
|
|
2295
|
+
k: '2',
|
|
2296
|
+
q: '2',
|
|
2297
|
+
s: '2',
|
|
2298
|
+
ß: '2',
|
|
2299
|
+
x: '2',
|
|
2300
|
+
z: '2',
|
|
2301
|
+
d: '3',
|
|
2302
|
+
t: '3',
|
|
2303
|
+
l: '4',
|
|
2304
|
+
m: '5',
|
|
2305
|
+
n: '5',
|
|
2306
|
+
r: '6'
|
|
2307
|
+
},
|
|
2308
|
+
ruleset: [{ char: 'c', next: ['h'], code: '7' }]
|
|
2309
|
+
});
|
|
2310
|
+
|
|
2311
|
+
class OptionsValidator {
|
|
2312
|
+
static ALLOWED_FLAGS = new Set([
|
|
2313
|
+
'd',
|
|
2314
|
+
'u',
|
|
2315
|
+
'x',
|
|
2316
|
+
'w',
|
|
2317
|
+
't',
|
|
2318
|
+
'r',
|
|
2319
|
+
's',
|
|
2320
|
+
'k',
|
|
2321
|
+
'n',
|
|
2322
|
+
'i'
|
|
2323
|
+
]);
|
|
2324
|
+
static ALLOWED_OUTPUT = new Set(['orig', 'prep']);
|
|
2325
|
+
static ALLOWED_MODES = new Set(['default', 'batch', 'single', 'pairwise']);
|
|
2326
|
+
static ALLOWED_SORT = new Set(['asc', 'desc']);
|
|
2327
|
+
static PROCESSORS = {
|
|
2328
|
+
phonetic: (opt) => {
|
|
2329
|
+
if (!opt) return;
|
|
2330
|
+
OptionsValidator.validatePhoneticName(opt.algo);
|
|
2331
|
+
OptionsValidator.validatePhoneticOptions(opt.opt);
|
|
2332
|
+
}
|
|
2333
|
+
};
|
|
2334
|
+
static METRIC_OPT_MAP = {
|
|
2335
|
+
mode: (v) => OptionsValidator.validateMode(v),
|
|
2336
|
+
delimiter: (v) => OptionsValidator.validateString(v, 'opt.delimiter'),
|
|
2337
|
+
pad: (v) => OptionsValidator.validateString(v, 'opt.pad'),
|
|
2338
|
+
q: (v) => OptionsValidator.validateNumber(v, 'opt.q'),
|
|
2339
|
+
match: (v) => OptionsValidator.validateNumber(v, 'opt.match'),
|
|
2340
|
+
mismatch: (v) => OptionsValidator.validateNumber(v, 'opt.mismatch'),
|
|
2341
|
+
gap: (v) => OptionsValidator.validateNumber(v, 'opt.gap')
|
|
2342
|
+
};
|
|
2343
|
+
static PHONETIC_OPT_MAP = {
|
|
2344
|
+
map: (v) =>
|
|
2345
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.map'),
|
|
2346
|
+
delimiter: (v) =>
|
|
2347
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.delimiter'),
|
|
2348
|
+
length: (v) =>
|
|
2349
|
+
OptionsValidator.validateNumber(v, 'processors.phonetic.opt.length'),
|
|
2350
|
+
pad: (v) =>
|
|
2351
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.pad'),
|
|
2352
|
+
dedupe: (v) =>
|
|
2353
|
+
OptionsValidator.validateBoolean(v, 'processors.phonetic.opt.dedupe'),
|
|
2354
|
+
fallback: (v) =>
|
|
2355
|
+
OptionsValidator.validateString(v, 'processors.phonetic.opt.fallback')
|
|
2356
|
+
};
|
|
2357
|
+
static CMPSTR_OPT_MAP = {
|
|
2358
|
+
raw: (v) => OptionsValidator.validateBoolean(v, 'raw'),
|
|
2359
|
+
removeZero: (v) => OptionsValidator.validateBoolean(v, 'removeZero'),
|
|
2360
|
+
safeEmpty: (v) => OptionsValidator.validateBoolean(v, 'safeEmpty'),
|
|
2361
|
+
flags: (v) => OptionsValidator.validateFlags(v),
|
|
2362
|
+
metric: (v) => OptionsValidator.validateMetricName(v),
|
|
2363
|
+
output: (v) => OptionsValidator.validateOutput(v),
|
|
2364
|
+
opt: (v) => OptionsValidator.validateMetricOptions(v),
|
|
2365
|
+
processors: (v) => OptionsValidator.validateProcessors(v),
|
|
2366
|
+
sort: (v) => OptionsValidator.validateSort(v, 'sort'),
|
|
2367
|
+
objectsOnly: (v) => OptionsValidator.validateBoolean(v, 'objectsOnly')
|
|
2368
|
+
};
|
|
2369
|
+
static set2string(set) {
|
|
2370
|
+
return Array.from(set).join(' | ');
|
|
2371
|
+
}
|
|
2372
|
+
static validateType(value, name, type) {
|
|
2373
|
+
if (value === undefined) return;
|
|
2374
|
+
if (typeof value !== type || (type === 'number' && Number.isNaN(value))) {
|
|
2375
|
+
throw new CmpStrValidationError(
|
|
2376
|
+
`Invalid option <${name}>: expected ${type}`,
|
|
2377
|
+
{ name, value }
|
|
2378
|
+
);
|
|
2379
|
+
}
|
|
2380
|
+
}
|
|
2381
|
+
static validateEnum(value, name, set) {
|
|
2382
|
+
if (value === undefined) return;
|
|
2383
|
+
if (typeof value !== 'string' || !set.has(value)) {
|
|
2384
|
+
throw new CmpStrValidationError(
|
|
2385
|
+
`Invalid option <${name}>: expected ${OptionsValidator.set2string(set)}`,
|
|
2386
|
+
{ name, value }
|
|
2387
|
+
);
|
|
2388
|
+
}
|
|
2389
|
+
}
|
|
2390
|
+
static validateMap(opt, map) {
|
|
2391
|
+
if (!opt) return;
|
|
2392
|
+
for (const k in opt) {
|
|
2393
|
+
const fn = map[k];
|
|
2394
|
+
if (!fn)
|
|
2395
|
+
throw new CmpStrValidationError(`Invalid option <${k}>`, {
|
|
2396
|
+
option: k,
|
|
2397
|
+
value: map[k]
|
|
2398
|
+
});
|
|
2399
|
+
fn(opt[k]);
|
|
2400
|
+
}
|
|
2401
|
+
}
|
|
2402
|
+
static validateRegistryName(value, name, label, has, list) {
|
|
2403
|
+
if (value === undefined) return;
|
|
2404
|
+
if (typeof value !== 'string' || value.length === 0)
|
|
2405
|
+
throw new CmpStrValidationError(
|
|
2406
|
+
`Invalid option <${name}>: expected non-empty string`,
|
|
2407
|
+
{ name, value }
|
|
2408
|
+
);
|
|
2409
|
+
if (!has(value))
|
|
2410
|
+
throw new CmpStrValidationError(
|
|
2411
|
+
`${label} <${value}> is not registered`,
|
|
2412
|
+
{ name, value, available: list() }
|
|
2413
|
+
);
|
|
2414
|
+
}
|
|
2415
|
+
static validateBoolean(value, name) {
|
|
2416
|
+
OptionsValidator.validateType(value, name, 'boolean');
|
|
2417
|
+
}
|
|
2418
|
+
static validateNumber(value, name) {
|
|
2419
|
+
OptionsValidator.validateType(value, name, 'number');
|
|
2420
|
+
}
|
|
2421
|
+
static validateString(value, name) {
|
|
2422
|
+
OptionsValidator.validateType(value, name, 'string');
|
|
2423
|
+
}
|
|
2424
|
+
static validateFlags(value) {
|
|
2425
|
+
if (value === undefined) return;
|
|
2426
|
+
if (typeof value !== 'string')
|
|
2427
|
+
throw new CmpStrValidationError(
|
|
2428
|
+
`Invalid option <flags>: expected string`,
|
|
2429
|
+
{ flags: value }
|
|
2430
|
+
);
|
|
2431
|
+
for (let i = 0; i < value.length; i++) {
|
|
2432
|
+
const ch = value[i];
|
|
2433
|
+
if (!OptionsValidator.ALLOWED_FLAGS.has(ch))
|
|
2434
|
+
throw new CmpStrValidationError(
|
|
2435
|
+
`Invalid normalization flag <${ch}> in <flags>: expected ${OptionsValidator.set2string(OptionsValidator.ALLOWED_FLAGS)}`,
|
|
2436
|
+
{ flags: value, invalid: ch }
|
|
2437
|
+
);
|
|
2438
|
+
}
|
|
2439
|
+
}
|
|
2440
|
+
static validateOutput(value) {
|
|
2441
|
+
OptionsValidator.validateEnum(
|
|
2442
|
+
value,
|
|
2443
|
+
'output',
|
|
2444
|
+
OptionsValidator.ALLOWED_OUTPUT
|
|
2445
|
+
);
|
|
2446
|
+
}
|
|
2447
|
+
static validateMode(value) {
|
|
2448
|
+
OptionsValidator.validateEnum(
|
|
2449
|
+
value,
|
|
2450
|
+
'mode',
|
|
2451
|
+
OptionsValidator.ALLOWED_MODES
|
|
2452
|
+
);
|
|
2453
|
+
}
|
|
2454
|
+
static validateSort(value, name) {
|
|
2455
|
+
if (value === undefined || typeof value === 'boolean') return;
|
|
2456
|
+
OptionsValidator.validateEnum(value, name, OptionsValidator.ALLOWED_SORT);
|
|
2457
|
+
}
|
|
2458
|
+
static validateMetricName(value) {
|
|
2459
|
+
OptionsValidator.validateRegistryName(
|
|
2460
|
+
value,
|
|
2461
|
+
'metric',
|
|
2462
|
+
'Comparison metric',
|
|
2463
|
+
MetricRegistry.has,
|
|
2464
|
+
MetricRegistry.list
|
|
2465
|
+
);
|
|
2466
|
+
}
|
|
2467
|
+
static validatePhoneticName(value) {
|
|
2468
|
+
OptionsValidator.validateRegistryName(
|
|
2469
|
+
value,
|
|
2470
|
+
'phonetic',
|
|
2471
|
+
'Phonetic algorithm',
|
|
2472
|
+
PhoneticRegistry.has,
|
|
2473
|
+
PhoneticRegistry.list
|
|
2474
|
+
);
|
|
2475
|
+
}
|
|
2476
|
+
static validateMetricOptions(opt) {
|
|
2477
|
+
OptionsValidator.validateMap(opt, OptionsValidator.METRIC_OPT_MAP);
|
|
2478
|
+
}
|
|
2479
|
+
static validatePhoneticOptions(opt) {
|
|
2480
|
+
OptionsValidator.validateMap(opt, OptionsValidator.PHONETIC_OPT_MAP);
|
|
2481
|
+
}
|
|
2482
|
+
static validateProcessors(opt) {
|
|
2483
|
+
if (!opt) return;
|
|
2484
|
+
for (const key in opt) {
|
|
2485
|
+
const fn = OptionsValidator.PROCESSORS[key];
|
|
2486
|
+
if (!fn)
|
|
2487
|
+
throw new CmpStrValidationError(
|
|
2488
|
+
`Invalid processor type <${key}> in <processors>: expected ${Object.keys(OptionsValidator.PROCESSORS).join(' | ')}`,
|
|
2489
|
+
{ processors: opt, invalid: key }
|
|
2490
|
+
);
|
|
2491
|
+
fn(opt[key]);
|
|
2492
|
+
}
|
|
2493
|
+
}
|
|
2494
|
+
static validateOptions(opt) {
|
|
2495
|
+
OptionsValidator.validateMap(opt, OptionsValidator.CMPSTR_OPT_MAP);
|
|
2496
|
+
}
|
|
2497
|
+
}
|
|
2498
|
+
|
|
2499
|
+
/**
 * StructuredData
 *
 * Wraps an array of objects together with the property key whose values are
 * compared. It extracts those values as strings, runs a caller-supplied
 * comparison function on them and maps the comparison results back onto the
 * original objects.
 */
class StructuredData {
  /** The wrapped array of objects. */
  data;
  /** The property key read from each object. */
  key;

  /** Comparator ordering results by ascending `res`. */
  static SORT_ASC = (a, b) => a.res - b.res;
  /** Comparator ordering results by descending `res`. */
  static SORT_DESC = (a, b) => b.res - a.res;

  /**
   * Factory shorthand for `new StructuredData(data, key)`.
   *
   * @param {object[]} data - The array of objects
   * @param {string} key - The property key to extract
   * @returns {StructuredData}
   */
  static create(data, key) {
    return new StructuredData(data, key);
  }

  /**
   * @param {object[]} data - The array of objects
   * @param {string} key - The property key to extract
   */
  constructor(data, key) {
    this.data = data;
    this.key = key;
  }

  /**
   * Reads `key` from every element of `arr` and converts it to a string.
   * `null` / `undefined` values become the empty string.
   *
   * @param {object[]} arr - The objects to read from
   * @param {string} key - The property key to read
   * @returns {string[]} One string per element of `arr`
   */
  extractFrom(arr, key) {
    const count = arr.length;
    const strings = new Array(count);
    for (let idx = 0; idx < count; idx++) {
      const raw = arr[idx][key];
      if (raw != null) strings[idx] = String(raw);
      else strings[idx] = '';
    }
    return strings;
  }

  /**
   * Extracts the configured key from the wrapped data.
   *
   * @returns {string[]}
   */
  extract() {
    return this.extractFrom(this.data, this.key);
  }

  /**
   * Checks whether `v` is a non-null object carrying `a`, `b` and `res`.
   *
   * @param {unknown} v - The value to inspect
   * @returns {boolean}
   */
  isMetricResult(v) {
    if (typeof v !== 'object' || v === null) return false;
    return 'a' in v && 'b' in v && 'res' in v;
  }

  /**
   * Checks whether `v` is a non-null object carrying `source`, `target`
   * and `match`.
   *
   * @param {unknown} v - The value to inspect
   * @returns {boolean}
   */
  isCmpStrResult(v) {
    if (typeof v !== 'object' || v === null) return false;
    return 'source' in v && 'target' in v && 'match' in v;
  }

  /**
   * Brings a result list into the `{ a, b, res, … }` shape and tags every
   * entry with its original position as `__idx`. The format is decided by
   * inspecting the first element only.
   *
   * @param {unknown} results - The raw results of a comparison
   * @returns {object[]} Normalized results; empty for non-arrays / empty input
   * @throws {CmpStrValidationError} If the first element has neither shape
   */
  normalizeResults(results) {
    if (!Array.isArray(results) || results.length === 0) return [];
    const head = results[0];
    if (this.isMetricResult(head)) {
      return Array.from(results, (entry, pos) => ({ ...entry, __idx: pos }));
    }
    if (this.isCmpStrResult(head)) {
      return Array.from(results, (entry, pos) => ({
        metric: 'unknown',
        a: entry.source,
        b: entry.target,
        res: entry.match,
        raw: entry.raw,
        __idx: pos
      }));
    }
    throw new CmpStrValidationError(
      'Unsupported result format for StructuredData normalization.'
    );
  }

  /**
   * Maps normalized results back onto the source objects.
   *
   * Each result is matched to a source index through its target string
   * (`b`); repeated target strings cycle through the indices at which that
   * string occurs. If the string is unknown, the result's `__idx` (or its
   * position) is used instead. Results whose index is out of range are
   * dropped.
   *
   * @param {object[]} results - Normalized results
   * @param {object[]} sourceData - The original objects
   * @param {string[]} extractedStrings - The strings extracted from the data
   * @param {boolean} [removeZero] - Skip results whose `res` is exactly 0
   * @param {boolean} [objectsOnly] - Emit the source objects only
   * @returns {object[]} Source objects, or `{ obj, key, result[, raw] }` records
   */
  rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
    const stringCount = extractedStrings.length;
    const resultCount = results.length;
    const positionsByString = Pool.acquire('map', stringCount);
    const seenByString = Pool.acquire('map', resultCount);
    const rebuilt = new Array(resultCount);
    positionsByString.clear();
    seenByString.clear();
    try {
      // Index: extracted string -> every position it occurs at.
      for (let pos = 0; pos < stringCount; pos++) {
        const text = extractedStrings[pos];
        const bucket = positionsByString.get(text);
        if (bucket) bucket.push(pos);
        else positionsByString.set(text, [pos]);
      }
      let written = 0;
      for (let i = 0; i < resultCount; i++) {
        const entry = results[i];
        if (removeZero && entry.res === 0) continue;
        const targetStr = entry.b || '';
        const bucket = positionsByString.get(targetStr);
        let dataIndex;
        if (!bucket || bucket.length === 0) {
          dataIndex = entry.__idx ?? i;
        } else {
          // Cycle through the positions of a repeated target string.
          const seen = seenByString.get(targetStr) ?? 0;
          seenByString.set(targetStr, seen + 1);
          dataIndex = bucket[seen % bucket.length];
        }
        if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
        const sourceObj = sourceData[dataIndex];
        const mappedTarget = extractedStrings[dataIndex] || targetStr;
        rebuilt[written++] = objectsOnly
          ? sourceObj
          : {
              obj: sourceObj,
              key: this.key,
              result: {
                source: entry.a,
                target: mappedTarget,
                match: entry.res
              },
              ...(entry.raw ? { raw: entry.raw } : null)
            };
      }
      // Trim the slots left unused by skipped results.
      rebuilt.length = written;
      return rebuilt;
    } finally {
      Pool.release('map', positionsByString, stringCount);
      Pool.release('map', seenByString, resultCount);
    }
  }

  /**
   * Sorts results in place by `res`. `'asc'` sorts ascending, any other
   * truthy value descending; a falsy value leaves the order untouched.
   *
   * @param {object[]} results - The results to sort
   * @param {boolean|string} [sort] - The sort directive
   * @returns {object[]} The same array
   */
  sort(results, sort) {
    const nothingToDo = !sort || results.length <= 1;
    if (nothingToDo) return results;
    const comparator =
      sort === 'asc' ? StructuredData.SORT_ASC : StructuredData.SORT_DESC;
    return results.sort(comparator);
  }

  /**
   * Normalizes, sorts and rebuilds raw lookup results.
   *
   * @param {unknown} results - The raw comparison results
   * @param {string[]} extractedStrings - The strings extracted from the data
   * @param {object} [opt] - Options; reads `sort`, `removeZero`, `objectsOnly`
   * @returns {object[]}
   */
  finalizeLookup(results, extractedStrings, opt) {
    const normalized = this.normalizeResults(results);
    const ordered = this.sort(normalized, opt?.sort);
    return this.rebuild(
      ordered,
      this.data,
      extractedStrings,
      opt?.removeZero,
      opt?.objectsOnly
    );
  }

  /**
   * Runs `fn` and finalizes its results inside `ErrorUtil.wrap`.
   *
   * @param {Function} fn - Produces the raw comparison results
   * @param {string[]} extractedStrings - The strings extracted from the data
   * @param {object} [opt] - Lookup options
   * @returns {object[]}
   */
  performLookup(fn, extractedStrings, opt) {
    const task = () => this.finalizeLookup(fn(), extractedStrings, opt);
    return ErrorUtil.wrap(task, 'StructuredData lookup failed', {
      key: this.key
    });
  }

  /**
   * Asynchronous counterpart of `performLookup`, using `ErrorUtil.wrapAsync`.
   *
   * @param {Function} fn - Produces (a promise of) the raw comparison results
   * @param {string[]} extractedStrings - The strings extracted from the data
   * @param {object} [opt] - Lookup options
   * @returns {Promise<object[]>}
   */
  async performLookupAsync(fn, extractedStrings, opt) {
    const task = async () =>
      this.finalizeLookup(await fn(), extractedStrings, opt);
    const outcome = await ErrorUtil.wrapAsync(
      task,
      'StructuredData async lookup failed',
      { key: this.key }
    );
    return outcome;
  }

  /**
   * Compares `query` against the extracted strings using `fn`.
   *
   * @param {Function} fn - Called as `fn(query, strings, opt)`
   * @param {string} query - The query to compare
   * @param {object} [opt] - Lookup options
   * @returns {object[]}
   */
  lookup(fn, query, opt) {
    const targets = this.extract();
    try {
      return this.performLookup(() => fn(query, targets, opt), targets, opt);
    } finally {
      // The extracted array is handed to the pool once the lookup is done.
      Pool.release('string[]', targets, targets.length);
    }
  }

  /**
   * Asynchronous counterpart of `lookup`.
   *
   * @param {Function} fn - Called as `fn(query, strings, opt)`
   * @param {string} query - The query to compare
   * @param {object} [opt] - Lookup options
   * @returns {Promise<object[]>}
   */
  async lookupAsync(fn, query, opt) {
    const targets = this.extract();
    try {
      return await this.performLookupAsync(
        () => fn(query, targets, opt),
        targets,
        opt
      );
    } finally {
      Pool.release('string[]', targets, targets.length);
    }
  }

  /**
   * Compares the extracted strings of this instance with those extracted
   * from `other` under `otherKey`, using `fn`.
   *
   * @param {Function} fn - Called as `fn(ownStrings, otherStrings, opt)`
   * @param {object[]} other - The second array of objects
   * @param {string} otherKey - The property key to read from `other`
   * @param {object} [opt] - Lookup options
   * @returns {object[]}
   */
  lookupPairs(fn, other, otherKey, opt) {
    const own = this.extract();
    const theirs = this.extractFrom(other, otherKey);
    try {
      return this.performLookup(() => fn(own, theirs, opt), own, opt);
    } finally {
      Pool.release('string[]', own, own.length);
      Pool.release('string[]', theirs, theirs.length);
    }
  }

  /**
   * Asynchronous counterpart of `lookupPairs`.
   *
   * @param {Function} fn - Called as `fn(ownStrings, otherStrings, opt)`
   * @param {object[]} other - The second array of objects
   * @param {string} otherKey - The property key to read from `other`
   * @param {object} [opt] - Lookup options
   * @returns {Promise<object[]>}
   */
  async lookupPairsAsync(fn, other, otherKey, opt) {
    const own = this.extract();
    const theirs = this.extractFrom(other, otherKey);
    try {
      return await this.performLookupAsync(
        () => fn(own, theirs, opt),
        own,
        opt
      );
    } finally {
      Pool.release('string[]', own, own.length);
      Pool.release('string[]', theirs, theirs.length);
    }
  }
}
|
|
2688
|
+
|
|
2689
|
+
/**
 * TextAnalyzer
 *
 * Computes word, sentence, character and syllable statistics as well as
 * several readability scores for a single text. Tokenization and the
 * frequency tables are built once in the constructor; syllable statistics
 * are computed lazily on first use.
 */
class TextAnalyzer {
  /** Regular expressions shared by the analysis methods. */
  static REGEX = {
    number: /\d/,
    sentence: /(?<=[.!?])\s+/,
    word: /\p{L}+/gu,
    nonWord: /[^\p{L}]/gu,
    vowelGroup: /[aeiouy]+/g,
    letter: /\p{L}/gu,
    ucLetter: /\p{Lu}/gu
  };

  /** The trimmed input text. */
  text;
  /** Lower-cased words (runs of letters) in order of appearance. */
  words = [];
  /** Sentences, split after `.`, `!` or `?` followed by whitespace. */
  sentences = [];
  /** Character (code point) -> number of occurrences. */
  charFrequency = new Map();
  /** Word -> number of occurrences. */
  wordHistogram = new Map();
  /** Cleaned word -> estimated syllable count. */
  syllableCache = new Map();
  /** Lazily computed syllable statistics. */
  syllableStats;

  /**
   * @param {string} input - The text to analyze (trimmed before analysis)
   */
  constructor(input) {
    this.text = input.trim();
    this.tokenize();
    this.computeFrequencies();
  }

  /**
   * Fills `words` with the lower-cased letter runs of the text and
   * `sentences` with the non-empty sentence fragments.
   */
  tokenize() {
    let match;
    const lcText = this.text.toLowerCase();
    // The loop always runs until exec() returns null, which resets the
    // lastIndex of the shared /g regex for the next instance.
    while ((match = TextAnalyzer.REGEX.word.exec(lcText)) !== null)
      this.words.push(match[0]);
    this.sentences = this.text
      .split(TextAnalyzer.REGEX.sentence)
      .filter(Boolean);
  }

  /**
   * Builds the character and word frequency tables.
   */
  computeFrequencies() {
    for (const char of this.text)
      this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
    for (const word of this.words)
      this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
  }

  /**
   * Estimates the syllable count of a word as the number of vowel groups
   * (`aeiouy`) in its cleaned form; words without any vowel group count as
   * one syllable. Results are cached per cleaned word.
   *
   * @param {string} word - The word to inspect
   * @returns {number} The estimated number of syllables
   */
  estimateSyllables(word) {
    const clean = word
      .normalize('NFC')
      .toLowerCase()
      .replace(TextAnalyzer.REGEX.nonWord, '');
    if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
    const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
    const count = matches ? matches.length : 1;
    this.syllableCache.set(clean, count);
    return count;
  }

  /**
   * Computes (once) and returns the syllable statistics of all words.
   *
   * @returns {{ total: number, mono: number, perWord: number[], avg: number, median: number }}
   *   `perWord` is sorted ascending
   */
  computeSyllableStats() {
    return (this.syllableStats ||= (() => {
      const perWord = this.words
        .map((w) => this.estimateSyllables(w))
        .sort((a, b) => a - b);
      const total = perWord.reduce((sum, s) => sum + s, 0);
      const mono = perWord.filter((s) => s === 1).length;
      const mid = perWord.length / 2;
      let median = 0;
      if (perWord.length) {
        median =
          perWord.length % 2 === 0
            ? (perWord[mid - 1] + perWord[mid]) / 2
            : perWord[Math.floor(mid)];
      }
      return {
        total,
        mono,
        perWord,
        avg: perWord.length ? total / perWord.length : 0,
        median
      };
    })());
  }

  /** @returns {number} Length of the trimmed text in UTF-16 code units */
  getLength = () => this.text.length;
  /** @returns {number} Number of words */
  getWordCount = () => this.words.length;
  /** @returns {number} Number of sentences */
  getSentenceCount = () => this.sentences.length;

  /**
   * @returns {number} Average word length, 0 for a text without words
   */
  getAvgWordLength() {
    return this.words.length
      ? this.words.join('').length / this.words.length
      : 0;
  }

  /**
   * @returns {number} Average number of words per sentence, 0 without sentences
   */
  getAvgSentenceLength() {
    return this.sentences.length
      ? this.words.length / this.sentences.length
      : 0;
  }

  /**
   * @returns {Record<string, number>} Word -> occurrence count
   */
  getWordHistogram() {
    return Object.fromEntries(this.wordHistogram);
  }

  /**
   * @param {number} [limit=5] - Maximum number of words to return
   * @returns {string[]} The most frequent words, most frequent first
   */
  getMostCommonWords(limit = 5) {
    return [...this.wordHistogram.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, limit)
      .map((e) => e[0]);
  }

  /**
   * @returns {string[]} Words that occur exactly once
   */
  getHapaxLegomena() {
    return [...this.wordHistogram.entries()]
      .filter(([, c]) => c === 1)
      .map((e) => e[0]);
  }

  /** @returns {boolean} Whether the text contains at least one digit */
  hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);

  /**
   * @returns {number} Share of upper-case letters among all letters (0..1)
   */
  getUpperCaseRatio() {
    const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
    const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
    return matches.length ? upper / matches.length : 0;
  }

  /**
   * @returns {Record<string, number>} Character -> occurrence count
   */
  getCharFrequency() {
    return Object.fromEntries(this.charFrequency);
  }

  /**
   * Returns the occurrence count per Unicode code point, keyed by the
   * upper-case hexadecimal code point padded to at least four digits.
   *
   * @returns {Record<string, number>}
   */
  getUnicodeCodepoints() {
    const result = {};
    for (const [char, count] of this.charFrequency) {
      // charFrequency is keyed by whole code points, so codePointAt() is
      // required: charCodeAt() would only see the high surrogate of
      // characters outside the BMP.
      const block = char
        .codePointAt(0)
        .toString(16)
        .padStart(4, '0')
        .toUpperCase();
      result[block] = (result[block] || 0) + count;
    }
    return result;
  }

  /**
   * @param {number} [len=7] - Minimum length of a "long" word
   * @returns {number} Share of words with at least `len` characters (0..1)
   */
  getLongWordRatio(len = 7) {
    let long = 0;
    for (const w of this.words) if (w.length >= len) long++;
    return this.words.length ? long / this.words.length : 0;
  }

  /**
   * @param {number} [len=3] - Maximum length of a "short" word
   * @returns {number} Share of words with at most `len` characters (0..1)
   */
  getShortWordRatio(len = 3) {
    let short = 0;
    for (const w of this.words) if (w.length <= len) short++;
    return this.words.length ? short / this.words.length : 0;
  }

  /** @returns {number} Estimated total number of syllables */
  getSyllablesCount() {
    return this.computeSyllableStats().total;
  }

  /** @returns {number} Number of words estimated to have one syllable */
  getMonosyllabicWordCount() {
    return this.computeSyllableStats().mono;
  }

  /**
   * @param {number} min - Minimum syllable count
   * @returns {number} Number of words with at least `min` syllables
   */
  getMinSyllablesWordCount(min) {
    return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
  }

  /**
   * @param {number} max - Maximum syllable count
   * @returns {number} Number of words with at most `max` syllables
   */
  getMaxSyllablesWordCount(max) {
    return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
  }

  /** @returns {number} Average estimated syllables per word */
  getAvgSyllablesPerWord() {
    return this.computeSyllableStats().avg;
  }

  /** @returns {number} Median estimated syllables per word */
  getMedianSyllablesPerWord() {
    return this.computeSyllableStats().median;
  }

  /**
   * Computes `100 * ln(N) / (1 - V1 / V)` with N = number of words,
   * V = number of distinct words and V1 = number of words occurring once.
   *
   * @returns {number} The score, or 0 when it is undefined (no words, or
   *   every distinct word occurs exactly once)
   */
  getHonoresR() {
    const tokens = this.words.length;
    const types = this.wordHistogram.size;
    // Arithmetic never throws in JavaScript, so the degenerate cases have
    // to be detected explicitly instead of via try/catch.
    if (tokens === 0 || types === 0) return 0;
    const score =
      (100 * Math.log(tokens)) /
      (1 - this.getHapaxLegomena().length / types);
    return Number.isFinite(score) ? score : 0;
  }

  /**
   * @param {number} [wpm=200] - Reading speed in words per minute; a falsy
   *   value (0, null, NaN) is treated as 1 to avoid a division by zero
   * @returns {number} Word count divided by `wpm`
   */
  getReadingTime(wpm = 200) {
    return this.words.length / (wpm || 1);
  }

  /**
   * Computes a readability score from the average sentence length (words
   * per sentence) and the average syllables per word. Empty counts are
   * treated as 1.
   *
   * @param {string} [metric='flesch'] - One of 'flesch', 'fleschde', 'kincaid'
   * @returns {number|undefined} The score; `undefined` for any other metric name
   */
  getReadabilityScore(metric = 'flesch') {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const y = this.getSyllablesCount() || 1;
    const asl = w / s;
    const asw = y / w;
    switch (metric) {
      case 'flesch':
        return 206.835 - 1.015 * asl - 84.6 * asw;
      case 'fleschde':
        return 180 - asl - 58.5 * asw;
      case 'kincaid':
        return 0.39 * asl + 11.8 * asw - 15.59;
    }
  }

  /**
   * Computes words per sentence plus the percentage of long words
   * (default threshold of `getLongWordRatio`).
   *
   * @returns {number}
   */
  getLIXScore() {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const l = this.getLongWordRatio() * w;
    return w / s + (l / w) * 100;
  }

  /**
   * Computes four scores from the percentage of words with three or more
   * syllables, the average sentence length, the percentage of long words
   * and the percentage of monosyllabic words.
   *
   * @returns {[number, number, number, number]}
   */
  getWSTFScore() {
    const w = this.words.length || 1;
    const h = (this.getMinSyllablesWordCount(3) / w) * 100;
    const s = this.getAvgSentenceLength();
    const l = this.getLongWordRatio() * 100;
    const m = (this.getMonosyllabicWordCount() / w) * 100;
    return [
      0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
      0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
      0.2963 * h + 0.1905 * s - 1.1144,
      0.2744 * h + 0.2656 * s - 1.693
    ];
  }
}
|
|
2219
|
-
PhoneticRegistry.add('soundex', Soundex);
|
|
2220
|
-
PhoneticMappingRegistry.add('soundex', 'en', {
|
|
2221
|
-
map: {
|
|
2222
|
-
a: '0',
|
|
2223
|
-
e: '0',
|
|
2224
|
-
h: '0',
|
|
2225
|
-
i: '0',
|
|
2226
|
-
o: '0',
|
|
2227
|
-
u: '0',
|
|
2228
|
-
w: '0',
|
|
2229
|
-
y: '0',
|
|
2230
|
-
b: '1',
|
|
2231
|
-
f: '1',
|
|
2232
|
-
p: '1',
|
|
2233
|
-
v: '1',
|
|
2234
|
-
c: '2',
|
|
2235
|
-
g: '2',
|
|
2236
|
-
j: '2',
|
|
2237
|
-
k: '2',
|
|
2238
|
-
q: '2',
|
|
2239
|
-
s: '2',
|
|
2240
|
-
x: '2',
|
|
2241
|
-
z: '2',
|
|
2242
|
-
d: '3',
|
|
2243
|
-
t: '3',
|
|
2244
|
-
l: '4',
|
|
2245
|
-
m: '5',
|
|
2246
|
-
n: '5',
|
|
2247
|
-
r: '6'
|
|
2248
|
-
}
|
|
2249
|
-
});
|
|
2250
|
-
PhoneticMappingRegistry.add('soundex', 'de', {
|
|
2251
|
-
map: {
|
|
2252
|
-
a: '0',
|
|
2253
|
-
ä: '0',
|
|
2254
|
-
e: '0',
|
|
2255
|
-
h: '0',
|
|
2256
|
-
i: '0',
|
|
2257
|
-
j: '0',
|
|
2258
|
-
o: '0',
|
|
2259
|
-
ö: '0',
|
|
2260
|
-
u: '0',
|
|
2261
|
-
ü: '0',
|
|
2262
|
-
y: '0',
|
|
2263
|
-
b: '1',
|
|
2264
|
-
f: '1',
|
|
2265
|
-
p: '1',
|
|
2266
|
-
v: '1',
|
|
2267
|
-
w: '1',
|
|
2268
|
-
c: '2',
|
|
2269
|
-
g: '2',
|
|
2270
|
-
k: '2',
|
|
2271
|
-
q: '2',
|
|
2272
|
-
s: '2',
|
|
2273
|
-
ß: '2',
|
|
2274
|
-
x: '2',
|
|
2275
|
-
z: '2',
|
|
2276
|
-
d: '3',
|
|
2277
|
-
t: '3',
|
|
2278
|
-
l: '4',
|
|
2279
|
-
m: '5',
|
|
2280
|
-
n: '5',
|
|
2281
|
-
r: '6'
|
|
2282
|
-
},
|
|
2283
|
-
ruleset: [{ char: 'c', next: ['h'], code: '7' }]
|
|
2284
|
-
});
|
|
2285
2883
|
|
|
2286
2884
|
const profiler = Profiler.getInstance();
|
|
2287
2885
|
class CmpStr {
|
|
@@ -2333,29 +2931,26 @@
|
|
|
2333
2931
|
}
|
|
2334
2932
|
assert(cond, test) {
|
|
2335
2933
|
switch (cond) {
|
|
2934
|
+
default:
|
|
2935
|
+
throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
|
|
2336
2936
|
case 'metric':
|
|
2337
|
-
|
|
2338
|
-
throw new Error(
|
|
2339
|
-
`CmpStr <metric> must be set, call .setMetric(), ` +
|
|
2340
|
-
`use CmpStr.metric.list() for available metrics`
|
|
2341
|
-
);
|
|
2937
|
+
OptionsValidator.validateMetricName(test);
|
|
2342
2938
|
break;
|
|
2343
2939
|
case 'phonetic':
|
|
2344
|
-
|
|
2345
|
-
throw new Error(
|
|
2346
|
-
`CmpStr <phonetic> must be set, call .setPhonetic(), ` +
|
|
2347
|
-
`use CmpStr.phonetic.list() for available phonetic algorithms`
|
|
2348
|
-
);
|
|
2940
|
+
OptionsValidator.validatePhoneticName(test);
|
|
2349
2941
|
break;
|
|
2350
|
-
default:
|
|
2351
|
-
throw new Error(`Cmpstr condition <${cond}> unknown`);
|
|
2352
2942
|
}
|
|
2353
2943
|
}
|
|
2354
2944
|
assertMany(...cond) {
|
|
2355
2945
|
for (const [c, test] of cond) this.assert(c, test);
|
|
2356
2946
|
}
|
|
2357
2947
|
resolveOptions(opt) {
|
|
2358
|
-
|
|
2948
|
+
const merged = DeepMerge.merge(
|
|
2949
|
+
{ ...(this.options ?? Object.create(null)) },
|
|
2950
|
+
opt
|
|
2951
|
+
);
|
|
2952
|
+
OptionsValidator.validateOptions(merged);
|
|
2953
|
+
return merged;
|
|
2359
2954
|
}
|
|
2360
2955
|
normalize(input, flags) {
|
|
2361
2956
|
return Normalizer.normalize(input, flags ?? this.options.flags ?? '');
|
|
@@ -2371,7 +2966,7 @@
|
|
|
2371
2966
|
return input;
|
|
2372
2967
|
}
|
|
2373
2968
|
postProcess(result, opt) {
|
|
2374
|
-
if (
|
|
2969
|
+
if (Array.isArray(result) && opt?.removeZero)
|
|
2375
2970
|
result = result.filter((r) => r.res > 0);
|
|
2376
2971
|
return result;
|
|
2377
2972
|
}
|
|
@@ -2389,65 +2984,114 @@
|
|
|
2389
2984
|
compute(a, b, opt, mode, raw, skip) {
|
|
2390
2985
|
const resolved = this.resolveOptions(opt);
|
|
2391
2986
|
this.assert('metric', resolved.metric);
|
|
2392
|
-
|
|
2393
|
-
|
|
2394
|
-
|
|
2395
|
-
|
|
2396
|
-
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2987
|
+
return ErrorUtil.wrap(
|
|
2988
|
+
() => {
|
|
2989
|
+
const A = skip ? a : this.prepare(a, resolved);
|
|
2990
|
+
const B = skip ? b : this.prepare(b, resolved);
|
|
2991
|
+
if (
|
|
2992
|
+
resolved.safeEmpty &&
|
|
2993
|
+
((Array.isArray(A) && A.length === 0) ||
|
|
2994
|
+
(Array.isArray(B) && B.length === 0) ||
|
|
2995
|
+
A === '' ||
|
|
2996
|
+
B === '')
|
|
2997
|
+
) {
|
|
2998
|
+
return [];
|
|
2999
|
+
}
|
|
3000
|
+
const metric = factory['metric'](resolved.metric, A, B, resolved.opt);
|
|
3001
|
+
if (resolved.output !== 'prep') metric.setOriginal(a, b);
|
|
3002
|
+
metric.run(mode);
|
|
3003
|
+
const result = this.postProcess(metric.getResults(), resolved);
|
|
3004
|
+
return this.output(result, raw ?? resolved.raw);
|
|
3005
|
+
},
|
|
3006
|
+
`Failed to compute metric <${resolved.metric}> for the given inputs`,
|
|
3007
|
+
{ a, b, options: opt }
|
|
3008
|
+
);
|
|
2408
3009
|
}
|
|
2409
3010
|
output(result, raw) {
|
|
2410
|
-
return (
|
|
2411
|
-
|
|
2412
|
-
|
|
2413
|
-
|
|
2414
|
-
|
|
2415
|
-
|
|
2416
|
-
|
|
2417
|
-
|
|
3011
|
+
return ErrorUtil.wrap(
|
|
3012
|
+
() =>
|
|
3013
|
+
(raw ?? this.options.raw)
|
|
3014
|
+
? result
|
|
3015
|
+
: Array.isArray(result)
|
|
3016
|
+
? result.map((r) => ({ source: r.a, target: r.b, match: r.res }))
|
|
3017
|
+
: { source: result.a, target: result.b, match: result.res },
|
|
3018
|
+
`Failed to resolve output format for the metric result`,
|
|
3019
|
+
{ result, raw }
|
|
3020
|
+
);
|
|
3021
|
+
}
|
|
3022
|
+
clone() {
|
|
3023
|
+
const inst = Object.assign(
|
|
3024
|
+
Object.create(Object.getPrototypeOf(this)),
|
|
3025
|
+
this
|
|
3026
|
+
);
|
|
3027
|
+
inst.options = DeepMerge.merge(Object.create(null), this.options);
|
|
3028
|
+
return inst;
|
|
3029
|
+
}
|
|
2418
3030
|
reset() {
|
|
2419
|
-
|
|
3031
|
+
this.options = Object.create(null);
|
|
2420
3032
|
return this;
|
|
2421
3033
|
}
|
|
2422
3034
|
setOptions(opt) {
|
|
3035
|
+
OptionsValidator.validateOptions(opt);
|
|
2423
3036
|
this.options = opt;
|
|
2424
3037
|
return this;
|
|
2425
3038
|
}
|
|
2426
3039
|
mergeOptions(opt) {
|
|
2427
|
-
merge(this.options, opt);
|
|
3040
|
+
DeepMerge.merge(this.options, opt);
|
|
3041
|
+
OptionsValidator.validateOptions(this.options);
|
|
2428
3042
|
return this;
|
|
2429
3043
|
}
|
|
2430
3044
|
setSerializedOptions(opt) {
|
|
2431
|
-
|
|
2432
|
-
|
|
3045
|
+
try {
|
|
3046
|
+
const parsed = JSON.parse(opt);
|
|
3047
|
+
OptionsValidator.validateOptions(parsed);
|
|
3048
|
+
this.options = parsed;
|
|
3049
|
+
return this;
|
|
3050
|
+
} catch (err) {
|
|
3051
|
+
if (err instanceof SyntaxError)
|
|
3052
|
+
throw new CmpStrValidationError(
|
|
3053
|
+
`Failed to parse serialized options, invalid JSON string`,
|
|
3054
|
+
{ opt, error: err instanceof Error ? err.message : String(err) }
|
|
3055
|
+
);
|
|
3056
|
+
throw err;
|
|
3057
|
+
}
|
|
2433
3058
|
}
|
|
2434
3059
|
setOption(path, value) {
|
|
2435
|
-
set(this.options, path, value);
|
|
3060
|
+
DeepMerge.set(this.options, path, value);
|
|
3061
|
+
OptionsValidator.validateOptions(this.options);
|
|
2436
3062
|
return this;
|
|
2437
3063
|
}
|
|
2438
3064
|
rmvOption(path) {
|
|
2439
|
-
rmv(this.options, path);
|
|
3065
|
+
DeepMerge.rmv(this.options, path);
|
|
2440
3066
|
return this;
|
|
2441
3067
|
}
|
|
2442
|
-
setRaw
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
3068
|
+
setRaw(enable) {
|
|
3069
|
+
return this.setOption('raw', enable);
|
|
3070
|
+
}
|
|
3071
|
+
setMetric(name) {
|
|
3072
|
+
return this.setOption('metric', name);
|
|
3073
|
+
}
|
|
3074
|
+
setFlags(flags) {
|
|
3075
|
+
return this.setOption('flags', flags);
|
|
3076
|
+
}
|
|
3077
|
+
rmvFlags() {
|
|
3078
|
+
return this.rmvOption('flags');
|
|
3079
|
+
}
|
|
3080
|
+
setProcessors(opt) {
|
|
3081
|
+
return this.setOption('processors', opt);
|
|
3082
|
+
}
|
|
3083
|
+
rmvProcessors() {
|
|
3084
|
+
return this.rmvOption('processors');
|
|
3085
|
+
}
|
|
3086
|
+
getOptions() {
|
|
3087
|
+
return this.options;
|
|
3088
|
+
}
|
|
3089
|
+
getSerializedOptions() {
|
|
3090
|
+
return JSON.stringify(this.options);
|
|
3091
|
+
}
|
|
3092
|
+
getOption(path) {
|
|
3093
|
+
return DeepMerge.get(this.options, path);
|
|
3094
|
+
}
|
|
2451
3095
|
test(a, b, opt) {
|
|
2452
3096
|
return this.compute(a, b, opt, 'single');
|
|
2453
3097
|
}
|
|
@@ -2486,15 +3130,35 @@
|
|
|
2486
3130
|
const resolved = this.resolveOptions({ flags, processors });
|
|
2487
3131
|
const test = this.prepare(needle, resolved);
|
|
2488
3132
|
const hstk = this.prepare(haystack, resolved);
|
|
2489
|
-
|
|
3133
|
+
const out = [];
|
|
3134
|
+
for (let i = 0, len = hstk.length; i < len; i++) {
|
|
3135
|
+
if (hstk[i].includes(test)) out.push(haystack[i]);
|
|
3136
|
+
}
|
|
3137
|
+
return out;
|
|
2490
3138
|
}
|
|
2491
3139
|
matrix(input, opt) {
|
|
2492
|
-
|
|
2493
|
-
|
|
2494
|
-
|
|
2495
|
-
|
|
2496
|
-
|
|
2497
|
-
|
|
3140
|
+
const resolved = this.resolveOptions(opt);
|
|
3141
|
+
const arr = this.prepare(input, resolved);
|
|
3142
|
+
const n = arr.length;
|
|
3143
|
+
const out = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
3144
|
+
for (let i = 0; i < n; i++)
|
|
3145
|
+
for (let j = i; j < n; j++) {
|
|
3146
|
+
if (i === j) {
|
|
3147
|
+
out[i][j] = 1;
|
|
3148
|
+
} else {
|
|
3149
|
+
const score = this.compute(
|
|
3150
|
+
arr[i],
|
|
3151
|
+
arr[j],
|
|
3152
|
+
resolved,
|
|
3153
|
+
'single',
|
|
3154
|
+
true,
|
|
3155
|
+
true
|
|
3156
|
+
).res;
|
|
3157
|
+
out[i][j] = score;
|
|
3158
|
+
out[j][i] = score;
|
|
3159
|
+
}
|
|
3160
|
+
}
|
|
3161
|
+
return out;
|
|
2498
3162
|
}
|
|
2499
3163
|
phoneticIndex(input, algo, opt) {
|
|
2500
3164
|
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
@@ -2577,22 +3241,28 @@
|
|
|
2577
3241
|
async computeAsync(a, b, opt, mode, raw, skip) {
|
|
2578
3242
|
const resolved = this.resolveOptions(opt);
|
|
2579
3243
|
this.assert('metric', resolved.metric);
|
|
2580
|
-
|
|
2581
|
-
|
|
2582
|
-
|
|
2583
|
-
|
|
2584
|
-
|
|
2585
|
-
|
|
2586
|
-
|
|
2587
|
-
|
|
2588
|
-
|
|
2589
|
-
|
|
2590
|
-
|
|
2591
|
-
|
|
2592
|
-
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
|
|
3244
|
+
return ErrorUtil.wrapAsync(
|
|
3245
|
+
async () => {
|
|
3246
|
+
const A = skip ? a : await this.prepareAsync(a, resolved);
|
|
3247
|
+
const B = skip ? b : await this.prepareAsync(b, resolved);
|
|
3248
|
+
if (
|
|
3249
|
+
resolved.safeEmpty &&
|
|
3250
|
+
((Array.isArray(A) && A.length === 0) ||
|
|
3251
|
+
(Array.isArray(B) && B.length === 0) ||
|
|
3252
|
+
A === '' ||
|
|
3253
|
+
B === '')
|
|
3254
|
+
) {
|
|
3255
|
+
return [];
|
|
3256
|
+
}
|
|
3257
|
+
const metric = factory['metric'](resolved.metric, A, B, resolved.opt);
|
|
3258
|
+
if (resolved.output !== 'prep') metric.setOriginal(a, b);
|
|
3259
|
+
await metric.runAsync(mode);
|
|
3260
|
+
const result = this.postProcess(metric.getResults(), resolved);
|
|
3261
|
+
return this.output(result, raw ?? resolved.raw);
|
|
3262
|
+
},
|
|
3263
|
+
`Failed to compute metric <${opt?.metric ?? this.options.metric}> for the given inputs`,
|
|
3264
|
+
{ a, b, opt }
|
|
3265
|
+
);
|
|
2596
3266
|
}
|
|
2597
3267
|
async testAsync(a, b, opt) {
|
|
2598
3268
|
return this.computeAsync(a, b, opt, 'single');
|
|
@@ -2630,23 +3300,40 @@
|
|
|
2630
3300
|
const resolved = this.resolveOptions({ flags, processors });
|
|
2631
3301
|
const test = await this.prepareAsync(needle, resolved);
|
|
2632
3302
|
const hstk = await this.prepareAsync(haystack, resolved);
|
|
2633
|
-
|
|
3303
|
+
const out = [];
|
|
3304
|
+
for (let i = 0; i < hstk.length; i++) {
|
|
3305
|
+
if (hstk[i].includes(test)) out.push(haystack[i]);
|
|
3306
|
+
}
|
|
3307
|
+
return out;
|
|
2634
3308
|
}
|
|
2635
3309
|
async matrixAsync(input, opt) {
|
|
2636
|
-
|
|
2637
|
-
|
|
2638
|
-
|
|
2639
|
-
|
|
2640
|
-
|
|
2641
|
-
|
|
2642
|
-
|
|
2643
|
-
|
|
2644
|
-
|
|
2645
|
-
|
|
2646
|
-
|
|
2647
|
-
|
|
2648
|
-
|
|
2649
|
-
|
|
3310
|
+
const resolved = this.resolveOptions(opt);
|
|
3311
|
+
const arr = await this.prepareAsync(input, resolved);
|
|
3312
|
+
const n = arr.length;
|
|
3313
|
+
const out = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
3314
|
+
for (let i = 0; i < n; i++) {
|
|
3315
|
+
await Promise.all(
|
|
3316
|
+
Array.from({ length: n - i }, (_, k) => i + k).map(async (j) => {
|
|
3317
|
+
if (i === j) {
|
|
3318
|
+
out[i][j] = 1;
|
|
3319
|
+
} else {
|
|
3320
|
+
const score = (
|
|
3321
|
+
await this.computeAsync(
|
|
3322
|
+
arr[i],
|
|
3323
|
+
arr[j],
|
|
3324
|
+
resolved,
|
|
3325
|
+
'single',
|
|
3326
|
+
true,
|
|
3327
|
+
true
|
|
3328
|
+
)
|
|
3329
|
+
).res;
|
|
3330
|
+
out[i][j] = score;
|
|
3331
|
+
out[j][i] = score;
|
|
3332
|
+
}
|
|
3333
|
+
})
|
|
3334
|
+
);
|
|
3335
|
+
}
|
|
3336
|
+
return out;
|
|
2650
3337
|
}
|
|
2651
3338
|
async phoneticIndexAsync(input, algo, opt) {
|
|
2652
3339
|
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
@@ -2693,6 +3380,7 @@
|
|
|
2693
3380
|
|
|
2694
3381
|
exports.CmpStr = CmpStr;
|
|
2695
3382
|
exports.CmpStrAsync = CmpStrAsync;
|
|
3383
|
+
exports.CmpStrError = Errors;
|
|
2696
3384
|
exports.DeepMerge = DeepMerge;
|
|
2697
3385
|
exports.DiffChecker = DiffChecker;
|
|
2698
3386
|
exports.Filter = Filter;
|
|
@@ -2701,6 +3389,7 @@
|
|
|
2701
3389
|
exports.Metric = Metric;
|
|
2702
3390
|
exports.MetricRegistry = MetricRegistry;
|
|
2703
3391
|
exports.Normalizer = Normalizer;
|
|
3392
|
+
exports.OptionsValidator = OptionsValidator;
|
|
2704
3393
|
exports.Phonetic = Phonetic;
|
|
2705
3394
|
exports.PhoneticMappingRegistry = PhoneticMappingRegistry;
|
|
2706
3395
|
exports.PhoneticRegistry = PhoneticRegistry;
|
|
@@ -2709,4 +3398,3 @@
|
|
|
2709
3398
|
exports.StructuredData = StructuredData;
|
|
2710
3399
|
exports.TextAnalyzer = TextAnalyzer;
|
|
2711
3400
|
});
|
|
2712
|
-
//# sourceMappingURL=CmpStr.umd.js.map
|