cmpstr 2.0.2 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +75 -499
- package/dist/CmpStr.esm.js +4863 -0
- package/dist/CmpStr.esm.js.map +1 -0
- package/dist/CmpStr.esm.min.js +8 -0
- package/dist/CmpStr.esm.min.js.map +1 -0
- package/dist/CmpStr.umd.js +4875 -0
- package/dist/CmpStr.umd.js.map +1 -0
- package/dist/CmpStr.umd.min.js +8 -0
- package/dist/CmpStr.umd.min.js.map +1 -0
- package/dist/cjs/CmpStr.js +663 -0
- package/dist/cjs/CmpStr.js.map +1 -0
- package/dist/cjs/CmpStrAsync.js +336 -0
- package/dist/cjs/CmpStrAsync.js.map +1 -0
- package/dist/cjs/index.js +15 -0
- package/dist/cjs/index.js.map +1 -0
- package/dist/cjs/metric/Cosine.js +101 -0
- package/dist/cjs/metric/Cosine.js.map +1 -0
- package/dist/cjs/metric/DamerauLevenshtein.js +110 -0
- package/dist/cjs/metric/DamerauLevenshtein.js.map +1 -0
- package/dist/cjs/metric/DiceSorensen.js +91 -0
- package/dist/cjs/metric/DiceSorensen.js.map +1 -0
- package/dist/cjs/metric/Hamming.js +82 -0
- package/dist/cjs/metric/Hamming.js.map +1 -0
- package/dist/cjs/metric/Jaccard.js +76 -0
- package/dist/cjs/metric/Jaccard.js.map +1 -0
- package/dist/cjs/metric/JaroWinkler.js +114 -0
- package/dist/cjs/metric/JaroWinkler.js.map +1 -0
- package/dist/cjs/metric/LCS.js +89 -0
- package/dist/cjs/metric/LCS.js.map +1 -0
- package/dist/cjs/metric/Levenshtein.js +94 -0
- package/dist/cjs/metric/Levenshtein.js.map +1 -0
- package/dist/cjs/metric/Metric.js +445 -0
- package/dist/cjs/metric/Metric.js.map +1 -0
- package/dist/cjs/metric/NeedlemanWunsch.js +95 -0
- package/dist/cjs/metric/NeedlemanWunsch.js.map +1 -0
- package/dist/cjs/metric/SmithWaterman.js +98 -0
- package/dist/cjs/metric/SmithWaterman.js.map +1 -0
- package/dist/cjs/metric/qGram.js +91 -0
- package/dist/cjs/metric/qGram.js.map +1 -0
- package/dist/cjs/phonetic/Cologne.js +112 -0
- package/dist/cjs/phonetic/Cologne.js.map +1 -0
- package/dist/cjs/phonetic/Metaphone.js +172 -0
- package/dist/cjs/phonetic/Metaphone.js.map +1 -0
- package/dist/cjs/phonetic/Phonetic.js +413 -0
- package/dist/cjs/phonetic/Phonetic.js.map +1 -0
- package/dist/cjs/phonetic/Soundex.js +135 -0
- package/dist/cjs/phonetic/Soundex.js.map +1 -0
- package/dist/cjs/utils/DeepMerge.js +144 -0
- package/dist/cjs/utils/DeepMerge.js.map +1 -0
- package/dist/cjs/utils/DiffChecker.js +500 -0
- package/dist/cjs/utils/DiffChecker.js.map +1 -0
- package/dist/cjs/utils/Filter.js +189 -0
- package/dist/cjs/utils/Filter.js.map +1 -0
- package/dist/cjs/utils/HashTable.js +175 -0
- package/dist/cjs/utils/HashTable.js.map +1 -0
- package/dist/cjs/utils/Normalizer.js +144 -0
- package/dist/cjs/utils/Normalizer.js.map +1 -0
- package/dist/cjs/utils/Pool.js +196 -0
- package/dist/cjs/utils/Pool.js.map +1 -0
- package/dist/cjs/utils/Profiler.js +229 -0
- package/dist/cjs/utils/Profiler.js.map +1 -0
- package/dist/cjs/utils/Registry.js +148 -0
- package/dist/cjs/utils/Registry.js.map +1 -0
- package/dist/cjs/utils/TextAnalyzer.js +358 -0
- package/dist/cjs/utils/TextAnalyzer.js.map +1 -0
- package/dist/esm/CmpStr.js +662 -0
- package/dist/esm/CmpStr.js.map +1 -0
- package/dist/esm/CmpStrAsync.js +331 -0
- package/dist/esm/CmpStrAsync.js.map +1 -0
- package/dist/esm/index.js +7 -0
- package/dist/esm/index.js.map +1 -0
- package/dist/esm/metric/Cosine.js +99 -0
- package/dist/esm/metric/Cosine.js.map +1 -0
- package/dist/esm/metric/DamerauLevenshtein.js +108 -0
- package/dist/esm/metric/DamerauLevenshtein.js.map +1 -0
- package/dist/esm/metric/DiceSorensen.js +89 -0
- package/dist/esm/metric/DiceSorensen.js.map +1 -0
- package/dist/esm/metric/Hamming.js +77 -0
- package/dist/esm/metric/Hamming.js.map +1 -0
- package/dist/esm/metric/Jaccard.js +74 -0
- package/dist/esm/metric/Jaccard.js.map +1 -0
- package/dist/esm/metric/JaroWinkler.js +112 -0
- package/dist/esm/metric/JaroWinkler.js.map +1 -0
- package/dist/esm/metric/LCS.js +87 -0
- package/dist/esm/metric/LCS.js.map +1 -0
- package/dist/esm/metric/Levenshtein.js +92 -0
- package/dist/esm/metric/Levenshtein.js.map +1 -0
- package/dist/esm/metric/Metric.js +442 -0
- package/dist/esm/metric/Metric.js.map +1 -0
- package/dist/esm/metric/NeedlemanWunsch.js +93 -0
- package/dist/esm/metric/NeedlemanWunsch.js.map +1 -0
- package/dist/esm/metric/SmithWaterman.js +96 -0
- package/dist/esm/metric/SmithWaterman.js.map +1 -0
- package/dist/esm/metric/qGram.js +89 -0
- package/dist/esm/metric/qGram.js.map +1 -0
- package/dist/esm/phonetic/Cologne.js +114 -0
- package/dist/esm/phonetic/Cologne.js.map +1 -0
- package/dist/esm/phonetic/Metaphone.js +174 -0
- package/dist/esm/phonetic/Metaphone.js.map +1 -0
- package/dist/esm/phonetic/Phonetic.js +409 -0
- package/dist/esm/phonetic/Phonetic.js.map +1 -0
- package/dist/esm/phonetic/Soundex.js +137 -0
- package/dist/esm/phonetic/Soundex.js.map +1 -0
- package/dist/esm/utils/DeepMerge.js +139 -0
- package/dist/esm/utils/DeepMerge.js.map +1 -0
- package/dist/esm/utils/DiffChecker.js +498 -0
- package/dist/esm/utils/DiffChecker.js.map +1 -0
- package/dist/esm/utils/Filter.js +187 -0
- package/dist/esm/utils/Filter.js.map +1 -0
- package/dist/esm/utils/HashTable.js +173 -0
- package/dist/esm/utils/HashTable.js.map +1 -0
- package/dist/esm/utils/Normalizer.js +142 -0
- package/dist/esm/utils/Normalizer.js.map +1 -0
- package/dist/esm/utils/Pool.js +194 -0
- package/dist/esm/utils/Pool.js.map +1 -0
- package/dist/esm/utils/Profiler.js +227 -0
- package/dist/esm/utils/Profiler.js.map +1 -0
- package/dist/esm/utils/Registry.js +142 -0
- package/dist/esm/utils/Registry.js.map +1 -0
- package/dist/esm/utils/TextAnalyzer.js +356 -0
- package/dist/esm/utils/TextAnalyzer.js.map +1 -0
- package/dist/types/CmpStr.d.ts +472 -0
- package/dist/types/CmpStrAsync.d.ts +233 -0
- package/dist/types/index.d.ts +51 -0
- package/dist/types/metric/Cosine.d.ts +57 -0
- package/dist/types/metric/DamerauLevenshtein.d.ts +50 -0
- package/dist/types/metric/DiceSorensen.d.ts +57 -0
- package/dist/types/metric/Hamming.d.ts +49 -0
- package/dist/types/metric/Jaccard.d.ts +48 -0
- package/dist/types/metric/JaroWinkler.d.ts +50 -0
- package/dist/types/metric/LCS.d.ts +50 -0
- package/dist/types/metric/Levenshtein.d.ts +50 -0
- package/dist/types/metric/Metric.d.ts +261 -0
- package/dist/types/metric/NeedlemanWunsch.d.ts +47 -0
- package/dist/types/metric/SmithWaterman.d.ts +48 -0
- package/dist/types/metric/index.d.ts +41 -0
- package/dist/types/metric/qGram.d.ts +56 -0
- package/dist/types/phonetic/Cologne.d.ts +46 -0
- package/dist/types/phonetic/Metaphone.d.ts +50 -0
- package/dist/types/phonetic/Phonetic.d.ts +189 -0
- package/dist/types/phonetic/Soundex.d.ts +49 -0
- package/dist/types/phonetic/index.d.ts +30 -0
- package/dist/types/utils/DeepMerge.d.ts +70 -0
- package/dist/types/utils/DiffChecker.d.ts +137 -0
- package/dist/types/utils/Filter.d.ts +97 -0
- package/dist/types/utils/HashTable.d.ts +86 -0
- package/dist/types/utils/Normalizer.d.ts +76 -0
- package/dist/types/utils/Pool.d.ts +63 -0
- package/dist/types/utils/Profiler.d.ts +129 -0
- package/dist/types/utils/Registry.d.ts +57 -0
- package/dist/types/utils/TextAnalyzer.d.ts +199 -0
- package/dist/types/utils/Types.d.ts +313 -0
- package/package.json +62 -49
- package/src/CmpStr.d.ts +0 -70
- package/src/CmpStr.js +0 -912
- package/src/CmpStrAsync.d.ts +0 -19
- package/src/CmpStrAsync.js +0 -204
- package/src/algorithms/cosine.js +0 -86
- package/src/algorithms/damerau.js +0 -78
- package/src/algorithms/dice.js +0 -65
- package/src/algorithms/hamming.js +0 -44
- package/src/algorithms/jaccard.js +0 -34
- package/src/algorithms/jaroWinkler.js +0 -106
- package/src/algorithms/lcs.js +0 -58
- package/src/algorithms/levenshtein.js +0 -70
- package/src/algorithms/needlemanWunsch.js +0 -72
- package/src/algorithms/qGram.js +0 -63
- package/src/algorithms/smithWaterman.js +0 -78
- package/src/algorithms/soundex.js +0 -152
- package/src/index.d.ts +0 -3
- package/src/index.js +0 -47
|
@@ -0,0 +1,4863 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CmpStr v3.0.0 dev-1a82e20-250612
|
|
3
|
+
* This is a lightweight, fast and well performing library for calculating string similarity.
|
|
4
|
+
* (c) 2023-2025 Paul Köhler @komed3 / MIT License
|
|
5
|
+
* Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
|
|
6
|
+
*/
|
|
7
|
+
/**
|
|
8
|
+
* Deep Merge Utility
|
|
9
|
+
* src/utils/DeepMerge.ts
|
|
10
|
+
*
|
|
11
|
+
* This module provides utility functions for deep merging objects, getting values by path,
|
|
12
|
+
* and setting values by path in a deeply nested object structure.
|
|
13
|
+
*
|
|
14
|
+
* It supports dot and bracket notation (e.g. `a.b[0].c`). Paths are split on every `.`, so keys that themselves contain a dot cannot be addressed.
|
|
15
|
+
*
|
|
16
|
+
* Included functions:
|
|
17
|
+
* - `get`: Retrieve a deeply nested value by path
|
|
18
|
+
* - `set`: Assign a value to a nested path
|
|
19
|
+
* - `merge`: Deeply merge two objects
|
|
20
|
+
* - `has`: Check whether a path exists
|
|
21
|
+
* - `rmv`: Delete a value at a path
|
|
22
|
+
*
|
|
23
|
+
* @module Utils/DeepMerge
|
|
24
|
+
* @author Paul Köhler
|
|
25
|
+
* @license MIT
|
|
26
|
+
*/
|
|
27
|
+
/**
 * Parse a path string into an array of keys.
 *
 * Bracket indices are rewritten to dot notation before the path is split on `.`,
 * and purely numeric segments are converted to numbers so they can address array
 * slots. A path that starts with a bracket index (e.g. `[0].a`) addresses the root
 * value directly and therefore does not produce a leading empty key.
 *
 * @param {string} p - The path string, e.g. `a.b.c` or `a[0].b`
 * @returns {(string|number)[]} - An array of keys, e.g. `['a', 'b', 'c']` or `['a', 0, 'b']`
 */
const parse = (p) => {
    const dotted = p.replace(/\[(\d+)]/g, '.$1');
    // `[0].a` becomes `.0.a`; drop that artificial leading dot so the first key is `0`, not `''`
    const normalized = /^\[\d+]/.test(p) ? dotted.slice(1) : dotted;
    return normalized.split('.').map((s) => (/^\d+$/.test(s) ? Number(s) : s));
};
|
|
34
|
+
/**
 * Deeply get a value from an object by a path string.
 *
 * The path is walked key by key. As soon as the chain breaks (the current value
 * is `null` or `undefined`) the fallback is returned as-is; the remaining keys
 * are never applied to the fallback itself. A final value of `null` or
 * `undefined` is also replaced by the fallback.
 *
 * @template T - The type of the object to get the value from
 * @param {T} t - The object to get the value from
 * @param {string} path - The path string, e.g. `a.b.c`
 * @param {any} fallback - The default value to return if the path does not exist
 * @returns {T|R|undefined} - The value at the specified path, otherwise the default value
 */
function get(t, path, fallback) {
    let current = t;
    for (const key of parse(path)) {
        // Broken chain: hand back the fallback untouched instead of indexing into it
        if (current == null)
            return fallback;
        current = current[key];
    }
    return current ?? fallback;
}
|
|
46
|
+
/**
 * Deeply set a value in an object by a path string.
 *
 * Missing intermediate containers are created on the way: an array when the
 * next key is numeric, otherwise a prototype-less object. The target object is
 * modified in place and returned.
 *
 * @template T - The type of the object to set the value in
 * @param {T} t - The object to set the value in
 * @param {string} path - The path string, e.g. `a.b.c`
 * @param {any} value - The value to set at the specified path
 * @returns {T} - The modified object with the value set at the specified path
 * @throws {Error} - If a path segment is `__proto__` or `constructor`, or if a
 *                   value on the path is neither an object nor undefined
 */
function set(t, path, value) {
    // An empty path means every key has been consumed: the value itself is the result
    if (path === '')
        return value;
    // Split the path into the first key and the rest of the path
    const [k, ...r] = parse(path);
    // Refuse keys that reach the prototype chain (same keys that `merge` skips)
    if (k === '__proto__' || k === 'constructor')
        throw new Error(`cannot set unsafe property <${k}>`);
    // Only objects/arrays, or a slot that does not exist yet, can receive properties
    if (t !== undefined && (typeof t !== 'object' || t === null))
        throw new Error(`cannot set property <${k}> of <${JSON.stringify(t)}>`);
    // Create the container on demand, shaped by the kind of key it will hold
    const target = t ?? (typeof k === 'number' ? [] : Object.create(null));
    target[k] = set(target[k], r.join('.'), value);
    return target;
}
|
|
70
|
+
/**
 * Deeply merge two objects, where the second object overrides the first.
 *
 * Nested non-array objects are merged recursively; every other value (primitives,
 * arrays and `null`) replaces the target value as-is. The keys `__proto__` and
 * `constructor` are never copied. The target is modified in place.
 *
 * @template T - The type of the objects to merge
 * @param {T} t - The target object to merge into
 * @param {T} o - The source object to merge from
 * @param {boolean} [mergeUndefined=false] - Whether to merge undefined values (applies to nested objects too)
 * @returns {T} - The merged object
 */
function merge(t = Object.create(null), o = Object.create(null), mergeUndefined = false) {
    // `typeof null` is 'object', so null has to be excluded explicitly
    const isMergeable = (v) => typeof v === 'object' && v !== null && !Array.isArray(v);
    for (const k of Object.keys(o)) {
        const val = o[k];
        // If the value is undefined and mergeUndefined is false, skip it
        if (!mergeUndefined && val === undefined)
            continue;
        // Skip dangerous property names to prevent prototype pollution
        if (k === '__proto__' || k === 'constructor')
            continue;
        // Merge into the existing nested object, or into a fresh one if there is none
        t[k] = isMergeable(val)
            ? merge(isMergeable(t[k]) ? t[k] : Object.create(null), val, mergeUndefined)
            : val;
    }
    return t;
}
|
|
96
|
+
/**
 * Delete a value at a specified path in an object.
 *
 * The object is modified in place. Unless `preserveEmpty` is set, containers on
 * the path that end up empty (objects without keys, arrays holding only
 * null/undefined/holes) are removed as well. Paths that do not exist, or that
 * pass through `__proto__` / `constructor`, leave the object untouched.
 *
 * @template T - The type of the object to delete the value from
 * @param {T} t - The object to delete the value from
 * @param {string} path - The path string, e.g. `a.b.c`
 * @param {boolean} [preserveEmpty=false] - Whether to preserve empty objects/arrays
 * @returns {T} - The modified object with the value deleted at the specified path
 */
function rmv(t, path, preserveEmpty = false) {
    const keys = parse(path);
    const isEmpty = (v) => (Array.isArray(v)
        ? v.every((e) => e == null)
        : Object.keys(v).length === 0);
    // Returns true when the deletion happened somewhere below `node`
    const remove = (node, depth) => {
        // Nothing to delete from a primitive or a missing value
        if (!node || typeof node !== 'object')
            return false;
        const key = keys[depth];
        // Never walk into the prototype chain: it is shared with other objects
        if (key === '__proto__' || key === 'constructor')
            return false;
        // Last key of the path: this is the property to remove
        if (depth === keys.length - 1)
            return delete node[key];
        if (!remove(node[key], depth + 1))
            return false;
        // Clean up the container we just descended into if it is now empty
        if (!preserveEmpty) {
            const child = node[key];
            if (typeof child === 'object' && child !== null && isEmpty(child))
                delete node[key];
        }
        return true;
    };
    remove(t, 0);
    return t;
}
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Profiler Utility
|
|
131
|
+
* src/utils/profiler.ts
|
|
132
|
+
*
|
|
133
|
+
* @see https://en.wikipedia.org/wiki/Profiling_(computer_programming)
|
|
134
|
+
*
|
|
135
|
+
* This class provides methods to run synchronous and asynchronous functions, capturing
|
|
136
|
+
* their execution time and memory usage, and storing the results in a set of profiler
|
|
137
|
+
* entries. It supports both Node.js and browser environments, detecting the environment
|
|
138
|
+
* automatically.
|
|
139
|
+
*
|
|
140
|
+
* The class is optimized for minimal overhead and can be used for fine-grained
|
|
141
|
+
* performance profiling.
|
|
142
|
+
*
|
|
143
|
+
* @module Utils/Profiler
|
|
144
|
+
* @author Paul Köhler (komed3)
|
|
145
|
+
* @license MIT
|
|
146
|
+
*/
|
|
147
|
+
/**
 * Profiler class for measuring execution time and memory usage of functions.
 *
 * A single shared instance is obtained through `Profiler.getInstance()`. While
 * the profiler is inactive, `run` / `runAsync` simply call the given function.
 */
class Profiler {
    // Detected runtime: 'nodejs', 'browser' or 'unknown' (set by detectEnv)
    static ENV;
    // Singleton instance
    static instance;
    // Store for profiler entries
    store = new Set();
    // Total time and memory consumption
    totalTime = 0;
    totalMem = 0;
    // The Profiler active state
    active;

    /**
     * Sets the environment based on the available global objects.
     *
     * The runtime counts as Node.js only when the `process` APIs used by `now()`
     * and `mem()` actually exist, so a partial `process` shim (as injected by
     * some bundlers) falls through to the browser / unknown branches instead of
     * failing later on a missing function.
     */
    static detectEnv() {
        if (typeof process !== 'undefined'
            && typeof process?.hrtime?.bigint === 'function'
            && typeof process.memoryUsage === 'function')
            Profiler.ENV = 'nodejs';
        else if (typeof performance !== 'undefined' && typeof performance?.now === 'function')
            Profiler.ENV = 'browser';
        else
            Profiler.ENV = 'unknown';
    }

    /**
     * Returns the singleton instance of the Profiler class.
     * If the instance does not exist, it creates a new one.
     *
     * @param {boolean} [enable=false] - Enables the profiler when the instance is created;
     *                                   ignored if the instance already exists
     * @returns {Profiler} - Singleton Profiler instance
     */
    static getInstance(enable) {
        // Ensure the environment is detected
        if (!Profiler.ENV)
            Profiler.detectEnv();
        // If instance does not exist, create a new one
        if (!Profiler.instance)
            Profiler.instance = new Profiler(enable);
        return Profiler.instance;
    }

    /**
     * Creates a profiler. Intended to be called through `getInstance()`.
     *
     * @param {boolean} [enable=false] - Optional parameter to enable the profiler
     */
    constructor(enable) {
        this.active = enable ?? false;
    }

    /**
     * Gets the current time based on the environment.
     *
     * Uses process.hrtime.bigint() for Node.js, performance.now() for browsers,
     * and Date.now() as a fallback.
     *
     * @returns {number} - Current time in milliseconds
     */
    now() {
        switch (Profiler.ENV) {
            // hrtime is in nanoseconds; convert to milliseconds
            case 'nodejs': return Number(process.hrtime.bigint()) / 1e6;
            case 'browser': return performance.now();
            default: return Date.now();
        }
    }

    /**
     * Gets the current memory usage based on the environment.
     *
     * Uses process.memoryUsage().heapUsed for Node.js, performance.memory.usedJSHeapSize
     * for browsers (0 where `performance.memory` is unavailable), and 0 as a fallback.
     *
     * @returns {number} - Current memory usage in bytes
     */
    mem() {
        switch (Profiler.ENV) {
            case 'nodejs': return process.memoryUsage().heapUsed;
            case 'browser': return performance.memory?.usedJSHeapSize ?? 0;
            default: return 0;
        }
    }

    /**
     * Enables the profiler.
     * Sets the active state to true, allowing profiling to occur.
     */
    enable() {
        this.active = true;
    }

    /**
     * Disables the profiler.
     * Sets the active state to false, preventing further profiling.
     */
    disable() {
        this.active = false;
    }

    /**
     * Resets the profiler by clearing the store, total time and memory consumption.
     * This method is useful for starting a new profiling session.
     */
    clear() {
        this.store.clear();
        this.totalTime = 0;
        this.totalMem = 0;
    }

    /**
     * Stores one measurement and updates the running totals.
     * Shared by `run` and `runAsync` so both record entries the same way.
     *
     * @param {number} startTime - Value of `now()` taken before the call
     * @param {number} startMem - Value of `mem()` taken before the call
     * @param {any} res - Result produced by the profiled function
     * @param {Record<string, any>} meta - Metadata to attach to the entry
     */
    #record(startTime, startMem, res, meta) {
        const time = this.now() - startTime;
        const mem = this.mem() - startMem;
        this.store.add({ time, mem, res, meta });
        this.totalTime += time;
        this.totalMem += mem;
    }

    /**
     * Runs a synchronous function and profiles its execution time and memory usage.
     * If the profiler is not active, it simply executes the function without profiling.
     * No entry is recorded when the function throws.
     *
     * @param {() => T} fn - Function to be executed and profiled
     * @param {Record<string, any>} meta - Metadata to be associated with the profiling entry
     * @returns {T} - The result of the executed function
     */
    run(fn, meta = {}) {
        if (!this.active)
            return fn();
        // Capture the start time and memory usage
        const startTime = this.now();
        const startMem = this.mem();
        const res = fn();
        this.#record(startTime, startMem, res, meta);
        return res;
    }

    /**
     * Runs an asynchronous function and profiles its execution time and memory usage.
     * If the profiler is not active, it simply executes the function without profiling.
     * No entry is recorded when the returned promise rejects.
     *
     * @param {() => Promise<T>} fn - Asynchronous function to be executed and profiled
     * @param {Record<string, any>} meta - Metadata to be associated with the profiling entry
     * @returns {Promise<T>} - A promise that resolves to the result of the executed function
     */
    async runAsync(fn, meta = {}) {
        if (!this.active)
            return await fn();
        // Capture the start time and memory usage
        const startTime = this.now();
        const startMem = this.mem();
        const res = await fn();
        this.#record(startTime, startMem, res, meta);
        return res;
    }

    /**
     * Retrieves all profiler entries stored in the profiler.
     *
     * @returns {ProfilerEntry<any>[]} - An array of profiler entries, oldest first
     */
    getAll() {
        return [...this.store];
    }

    /**
     * Retrieves the last profiler entry stored in the profiler.
     *
     * @returns {ProfilerEntry<any> | undefined} - The last profiler entry or undefined if no entries exist
     */
    getLast() {
        // A Set iterates in insertion order; walk it instead of copying it into an array
        let last;
        for (const entry of this.store)
            last = entry;
        return last;
    }

    /**
     * Retrieves the total time and memory consumption recorded by the profiler.
     *
     * @returns {{ time: number, mem: number }} - An object containing total time and memory usage
     */
    getTotal() {
        return { time: this.totalTime, mem: this.totalMem };
    }

    /**
     * Bound shortcuts to the profiler's control and reporting methods, so they
     * can be handed out without exposing the instance itself.
     *
     * @type {ProfilerService<any>}
     */
    services = {
        enable: this.enable.bind(this),
        disable: this.disable.bind(this),
        clear: this.clear.bind(this),
        report: this.getAll.bind(this),
        last: this.getLast.bind(this),
        total: this.getTotal.bind(this)
    };
}
|
|
343
|
+
|
|
344
|
+
/**
|
|
345
|
+
* TextAnalyzer Utility
|
|
346
|
+
* src/utils/TextAnalyzer.ts
|
|
347
|
+
*
|
|
348
|
+
* The TextAnalyzer class provides a comprehensive set of methods for analyzing and
|
|
349
|
+
* extracting statistics from a given text. It supports word and sentence tokenization,
|
|
350
|
+
* character and word frequency analysis, syllable estimation, readability metrics
|
|
351
|
+
* (Flesch, Kincaid, LIX, WSTF), and various ratios and histograms. Designed for
|
|
352
|
+
* efficiency and flexibility, it is suitable for linguistic research, readability
|
|
353
|
+
* scoring, and text preprocessing tasks.
|
|
354
|
+
*
|
|
355
|
+
* @module Utils/TextAnalyzer
|
|
356
|
+
* @author Paul Köhler (komed3)
|
|
357
|
+
* @license MIT
|
|
358
|
+
*/
|
|
359
|
+
class TextAnalyzer {
|
|
360
|
+
// The original text to analyze
|
|
361
|
+
text;
|
|
362
|
+
// Tokenized words and sentences
|
|
363
|
+
words = [];
|
|
364
|
+
sentences = [];
|
|
365
|
+
// Frequency maps for characters and words
|
|
366
|
+
charFrequency = new Map();
|
|
367
|
+
wordHistogram = new Map();
|
|
368
|
+
syllableCache = new Map();
|
|
369
|
+
/**
|
|
370
|
+
* Constructs a new TextAnalyzer instance with the provided input text.
|
|
371
|
+
*
|
|
372
|
+
* @param {string} input - The text to analyze
|
|
373
|
+
*/
|
|
374
|
+
constructor(input) {
|
|
375
|
+
this.text = input.trim();
|
|
376
|
+
this.tokenize();
|
|
377
|
+
this.computeFrequencies();
|
|
378
|
+
}
|
|
379
|
+
/**
|
|
380
|
+
* Tokenizes the input text into words and sentences.
|
|
381
|
+
*/
|
|
382
|
+
tokenize() {
|
|
383
|
+
this.words = [], this.sentences = [];
|
|
384
|
+
const text = this.text;
|
|
385
|
+
const wordRegex = /\p{L}+/gu;
|
|
386
|
+
let match;
|
|
387
|
+
// Tokenize words using Unicode property escapes for letters
|
|
388
|
+
while ((match = wordRegex.exec(text)) !== null) {
|
|
389
|
+
this.words.push(match[0].toLowerCase());
|
|
390
|
+
}
|
|
391
|
+
// Tokenize sentences using punctuation marks as delimiters
|
|
392
|
+
this.sentences = text.split(/(?<=[.!?])\s+/).filter(Boolean);
|
|
393
|
+
}
|
|
394
|
+
/**
|
|
395
|
+
* Computes character and word frequencies from the tokenized text.
|
|
396
|
+
*/
|
|
397
|
+
computeFrequencies() {
|
|
398
|
+
// Compute character frequencies
|
|
399
|
+
for (const char of this.text)
|
|
400
|
+
this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
|
|
401
|
+
// Compute word frequencies
|
|
402
|
+
for (const word of this.words)
|
|
403
|
+
this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
|
|
404
|
+
}
|
|
405
|
+
/**
|
|
406
|
+
* Estimates the number of syllables in a word using a simple heuristic.
|
|
407
|
+
*
|
|
408
|
+
* @param {string} word - The word to estimate syllables for
|
|
409
|
+
* @returns {number} - Estimated syllable count
|
|
410
|
+
*/
|
|
411
|
+
estimateSyllables(word) {
|
|
412
|
+
// Check cache first to avoid redundant calculations
|
|
413
|
+
if (this.syllableCache.has(word))
|
|
414
|
+
return this.syllableCache.get(word);
|
|
415
|
+
// Normalize the word: lowercase and remove non-letter characters
|
|
416
|
+
const clean = word.toLowerCase().replace(/[^a-zäöüß]/g, '');
|
|
417
|
+
const matches = clean.match(/[aeiouyäöü]+/g);
|
|
418
|
+
// Count syllables based on vowel groups
|
|
419
|
+
const count = matches ? matches.length : 1;
|
|
420
|
+
this.syllableCache.set(word, count);
|
|
421
|
+
return count;
|
|
422
|
+
}
|
|
423
|
+
/**
|
|
424
|
+
* Gets the original text length in characters.
|
|
425
|
+
*
|
|
426
|
+
* @return {number} - Length of the text
|
|
427
|
+
*/
|
|
428
|
+
getLength() { return this.text.length; }
|
|
429
|
+
/**
|
|
430
|
+
* Gets the number of words in the text.
|
|
431
|
+
*
|
|
432
|
+
* @return {number} - Count of words
|
|
433
|
+
*/
|
|
434
|
+
getWordCount() { return this.words.length; }
|
|
435
|
+
/**
|
|
436
|
+
* Gets the number of sentences in the text.
|
|
437
|
+
*
|
|
438
|
+
* @return {number} - Count of sentences
|
|
439
|
+
*/
|
|
440
|
+
getSentenceCount() { return this.sentences.length; }
|
|
441
|
+
/**
|
|
442
|
+
* Gets the average word length in the text.
|
|
443
|
+
*
|
|
444
|
+
* @return {number} - Average length of words
|
|
445
|
+
*/
|
|
446
|
+
getAvgWordLength() {
|
|
447
|
+
let totalLen = 0;
|
|
448
|
+
for (const w of this.words)
|
|
449
|
+
totalLen += w.length;
|
|
450
|
+
return this.words.length ? totalLen / this.words.length : 0;
|
|
451
|
+
}
|
|
452
|
+
/**
|
|
453
|
+
* Gets the average sentence length in words.
|
|
454
|
+
*
|
|
455
|
+
* @return {number} - Average length of sentences
|
|
456
|
+
*/
|
|
457
|
+
getAvgSentenceLength() {
|
|
458
|
+
return this.sentences.length ? this.words.length / this.sentences.length : 0;
|
|
459
|
+
}
|
|
460
|
+
/**
|
|
461
|
+
* Gets a histogram of word frequencies in the text.
|
|
462
|
+
*
|
|
463
|
+
* @returns {Record<string, number>} - A histogram of word frequencies
|
|
464
|
+
*/
|
|
465
|
+
getWordHistogram() {
|
|
466
|
+
return Object.fromEntries(this.wordHistogram);
|
|
467
|
+
}
|
|
468
|
+
/**
|
|
469
|
+
* Gets the most common words in the text, limited to a specified number.
|
|
470
|
+
*
|
|
471
|
+
* @param {number} [limit=5] - Maximum number of common words to return
|
|
472
|
+
* @returns {string[]} - Array of the most common words
|
|
473
|
+
*/
|
|
474
|
+
getMostCommonWords(limit = 5) {
|
|
475
|
+
return [...this.wordHistogram.entries()]
|
|
476
|
+
.sort((a, b) => b[1] - a[1])
|
|
477
|
+
.slice(0, limit).map(e => e[0]);
|
|
478
|
+
}
|
|
479
|
+
/**
|
|
480
|
+
* Gets the least common words (hapax legomena) in the text.
|
|
481
|
+
*
|
|
482
|
+
* Hapax legomena are words that occur only once in the text.
|
|
483
|
+
*
|
|
484
|
+
* @returns {string[]} - Array of hapax legomena
|
|
485
|
+
*/
|
|
486
|
+
getHapaxLegomena() {
|
|
487
|
+
return [...this.wordHistogram.entries()]
|
|
488
|
+
.filter(([, c]) => c === 1)
|
|
489
|
+
.map(e => e[0]);
|
|
490
|
+
}
|
|
491
|
+
/**
|
|
492
|
+
* Checks if the text contains any numbers.
|
|
493
|
+
*
|
|
494
|
+
* @returns {boolean} - True if numbers are present, false otherwise
|
|
495
|
+
*/
|
|
496
|
+
hasNumbers() { return /\d/.test(this.text); }
|
|
497
|
+
/**
|
|
498
|
+
* Calculates the ratio of uppercase letters to total letters in the text.
|
|
499
|
+
*
|
|
500
|
+
* @return {number} - Ratio of uppercase letters to total letters
|
|
501
|
+
*/
|
|
502
|
+
getUpperCaseRatio() {
|
|
503
|
+
let upper = 0, letters = 0;
|
|
504
|
+
for (let i = 0, len = this.text.length; i < len; i++) {
|
|
505
|
+
const c = this.text[i];
|
|
506
|
+
if (/[A-Za-zÄÖÜäöüß]/.test(c)) {
|
|
507
|
+
letters++;
|
|
508
|
+
if (/[A-ZÄÖÜ]/.test(c))
|
|
509
|
+
upper++;
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
return letters ? upper / letters : 0;
|
|
513
|
+
}
|
|
514
|
+
/**
|
|
515
|
+
* Gets the frequency of each character in the text.
|
|
516
|
+
*
|
|
517
|
+
* @returns {Record<string, number>} - A record of character frequencies
|
|
518
|
+
*/
|
|
519
|
+
getCharFrequency() {
|
|
520
|
+
return Object.fromEntries(this.charFrequency);
|
|
521
|
+
}
|
|
522
|
+
/**
|
|
523
|
+
* Gets the frequency of each Unicode block in the text.
|
|
524
|
+
*
|
|
525
|
+
* @returns {Record<string, number>} - A record of Unicode block frequencies
|
|
526
|
+
*/
|
|
527
|
+
getUnicodeStats() {
|
|
528
|
+
const result = {};
|
|
529
|
+
for (const [char, count] of this.charFrequency) {
|
|
530
|
+
// Get the Unicode block for the character
|
|
531
|
+
const block = char
|
|
532
|
+
.charCodeAt(0).toString(16)
|
|
533
|
+
.padStart(4, '0').toUpperCase();
|
|
534
|
+
// Increment the count for the block
|
|
535
|
+
result[block] = (result[block] ?? 0) + count;
|
|
536
|
+
}
|
|
537
|
+
return result;
|
|
538
|
+
}
|
|
539
|
+
/**
|
|
540
|
+
* Gets the ratio of long words (words with length >= len) to total words.
|
|
541
|
+
*
|
|
542
|
+
* @param {number} [len=7] - Minimum length for a word to be considered long
|
|
543
|
+
* @returns {number} - Ratio of long words to total words
|
|
544
|
+
*/
|
|
545
|
+
getLongWordRatio(len = 7) {
|
|
546
|
+
let long = 0;
|
|
547
|
+
for (const w of this.words)
|
|
548
|
+
if (w.length >= len)
|
|
549
|
+
long++;
|
|
550
|
+
return this.words.length ? long / this.words.length : 0;
|
|
551
|
+
}
|
|
552
|
+
/**
|
|
553
|
+
* Gets the ratio of short words (words with length <= len) to total words.
|
|
554
|
+
*
|
|
555
|
+
* @param {number} [len=3] - Maximum length for a word to be considered short
|
|
556
|
+
* @returns {number} - Ratio of short words to total words
|
|
557
|
+
*/
|
|
558
|
+
getShortWordRatio(len = 3) {
|
|
559
|
+
let short = 0;
|
|
560
|
+
for (const w of this.words)
|
|
561
|
+
if (w.length <= len)
|
|
562
|
+
short++;
|
|
563
|
+
return this.words.length ? short / this.words.length : 0;
|
|
564
|
+
}
|
|
565
|
+
/**
|
|
566
|
+
* Estimates the number of syllables in the text.
|
|
567
|
+
*
|
|
568
|
+
* @returns {number} - Total estimated syllable count
|
|
569
|
+
*/
|
|
570
|
+
getSyllablesCount() {
|
|
571
|
+
let count = 0;
|
|
572
|
+
for (const w of this.words)
|
|
573
|
+
count += this.estimateSyllables(w);
|
|
574
|
+
return count;
|
|
575
|
+
}
|
|
576
|
+
/**
|
|
577
|
+
* Gets the number of monosyllabic words (words with exactly one syllable).
|
|
578
|
+
*
|
|
579
|
+
* @returns {number} - Count of monosyllabic words
|
|
580
|
+
*/
|
|
581
|
+
getMonosyllabicWordCount() {
|
|
582
|
+
let count = 0;
|
|
583
|
+
for (const w of this.words)
|
|
584
|
+
if (this.estimateSyllables(w) === 1)
|
|
585
|
+
count++;
|
|
586
|
+
return count;
|
|
587
|
+
}
|
|
588
|
+
/**
|
|
589
|
+
* Gets the number of words with at least a specified minimum syllable count.
|
|
590
|
+
*
|
|
591
|
+
* @param {number} min - Minimum syllable count for a word to be included
|
|
592
|
+
* @returns {number} - Count of words meeting the syllable criteria
|
|
593
|
+
*/
|
|
594
|
+
getMinSyllablesWordCount(min) {
|
|
595
|
+
let count = 0;
|
|
596
|
+
for (const w of this.words)
|
|
597
|
+
if (this.estimateSyllables(w) >= min)
|
|
598
|
+
count++;
|
|
599
|
+
return count;
|
|
600
|
+
}
|
|
601
|
+
/**
|
|
602
|
+
* Gets the number of words with at most a specified maximum syllable count.
|
|
603
|
+
*
|
|
604
|
+
* @param {number} max - Maximum syllable count for a word to be included
|
|
605
|
+
* @returns {number} - Count of words meeting the syllable criteria
|
|
606
|
+
*/
|
|
607
|
+
getMaxSyllablesWordCount(max) {
|
|
608
|
+
let count = 0;
|
|
609
|
+
for (const w of this.words)
|
|
610
|
+
if (this.estimateSyllables(w) <= max)
|
|
611
|
+
count++;
|
|
612
|
+
return count;
|
|
613
|
+
}
|
|
614
|
+
/**
|
|
615
|
+
* Calculates the Honore's R statistic for the text as a measure of lexical richness.
|
|
616
|
+
*
|
|
617
|
+
* @returns {number} - The Honore's R statistic
|
|
618
|
+
*/
|
|
619
|
+
getHonoresR() {
|
|
620
|
+
return (100 * Math.log(this.words.length)) / (1 - (this.getHapaxLegomena().length / (this.wordHistogram.size ?? 1)));
|
|
621
|
+
}
|
|
622
|
+
/**
|
|
623
|
+
* Estimates the reading time for the text based on words per minute (WPM).
|
|
624
|
+
*
|
|
625
|
+
* @param {number} [wpm=200] - Words per minute for the calculation
|
|
626
|
+
* @returns {number} - Estimated reading time in minutes
|
|
627
|
+
*/
|
|
628
|
+
getReadingTime(wpm = 200) {
|
|
629
|
+
return Math.max(1, this.words.length / (wpm ?? 1));
|
|
630
|
+
}
|
|
631
|
+
/**
|
|
632
|
+
* Calculates various readability scores based on the text.
|
|
633
|
+
*
|
|
634
|
+
* This method supports multiple readability metrics:
|
|
635
|
+
* - Flesch Reading Ease
|
|
636
|
+
* - Flesch-Kincaid Grade Level
|
|
637
|
+
*
|
|
638
|
+
* @param {'flesch'|'fleschde'|'kincaid'} [metric='flesch'] - The readability metric to calculate
|
|
639
|
+
* @returns {number} - The calculated readability score
|
|
640
|
+
*/
|
|
641
|
+
getReadabilityScore(metric = 'flesch') {
|
|
642
|
+
const w = this.words.length || 1;
|
|
643
|
+
const s = this.sentences.length || 1;
|
|
644
|
+
const y = this.getSyllablesCount() || 1;
|
|
645
|
+
const asl = w / s;
|
|
646
|
+
const asw = y / w;
|
|
647
|
+
switch (metric) {
|
|
648
|
+
// Flesch Reading Ease formula
|
|
649
|
+
case 'flesch': return 206.835 - (1.015 * asl) - (84.6 * asw);
|
|
650
|
+
// Flesch Reading Ease formula for German texts
|
|
651
|
+
case 'fleschde': return 180 - asl - (58.5 * asw);
|
|
652
|
+
// Flesch-Kincaid Grade Level formula
|
|
653
|
+
case 'kincaid': return (0.39 * asl) + (11.8 * asw) - 15.59;
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
/**
|
|
657
|
+
* Calculates the LIX (Lesbarhetsindex) score for the text.
|
|
658
|
+
*
|
|
659
|
+
* The LIX score is a readability index that combines average word length and sentence length.
|
|
660
|
+
*
|
|
661
|
+
* @returns {number} - The LIX score
|
|
662
|
+
*/
|
|
663
|
+
getLIXScore() {
|
|
664
|
+
const w = this.words.length || 1;
|
|
665
|
+
const s = this.sentences.length || 1;
|
|
666
|
+
const l = this.getLongWordRatio() * w;
|
|
667
|
+
return (w / s) + (l / w * 100);
|
|
668
|
+
}
|
|
669
|
+
/**
|
|
670
|
+
* Calculates the Wiener Sachtextformel (WSTF) scores for the text.
|
|
671
|
+
*
|
|
672
|
+
* The WSTF scores are a set of readability metrics based on word and sentence characteristics.
|
|
673
|
+
*
|
|
674
|
+
* @returns {[number, number, number, number]} - An array of WSTF scores
|
|
675
|
+
*/
|
|
676
|
+
getWSTFScore() {
|
|
677
|
+
const w = this.words.length || 1;
|
|
678
|
+
const h = this.getMinSyllablesWordCount(3) / w * 100;
|
|
679
|
+
const s = this.getAvgSentenceLength();
|
|
680
|
+
const l = this.getLongWordRatio() * 100;
|
|
681
|
+
const m = this.getMonosyllabicWordCount() / w * 100;
|
|
682
|
+
return [
|
|
683
|
+
0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.8750,
|
|
684
|
+
0.2007 * h + 0.1682 * s + 0.1373 * l - 2.7790,
|
|
685
|
+
0.2963 * h + 0.1905 * s - 1.1144,
|
|
686
|
+
0.2744 * h + 0.2656 * s - 1.6930
|
|
687
|
+
];
|
|
688
|
+
}
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
/**
|
|
692
|
+
* DiffChecker Utility
|
|
693
|
+
* src/utils/DiffChecker.ts
|
|
694
|
+
*
|
|
695
|
+
* The DiffChecker class provides a robust and efficient utility for comparing two
|
|
696
|
+
* texts and extracting their differences (full lines or word mode). It supports
|
|
697
|
+
* context-aware grouping of changes, unified diff output (with CLI color or ASCII
|
|
698
|
+
* markup), and detailed change magnitude metrics. The class is highly configurable,
|
|
699
|
+
* allowing users to choose the diff granularity, case sensitivity, context lines,
|
|
700
|
+
* grouping, and output style. It is suitable for text comparison, code review
|
|
701
|
+
* tools, document versioning, and any application requiring precise and human-
|
|
702
|
+
* readable difference reporting.
|
|
703
|
+
*
|
|
704
|
+
* Features:
|
|
705
|
+
* - Line and word-based diffing
|
|
706
|
+
* - Case-insensitive comparison option
|
|
707
|
+
* - Context lines and grouping of adjacent changes
|
|
708
|
+
* - Unified diff output (ASCII or colored CLI)
|
|
709
|
+
* - Highlighting of changed segments within lines
|
|
710
|
+
* - Change magnitude calculation (relative to group or line)
|
|
711
|
+
* - Expand-all mode for full file context
|
|
712
|
+
*
|
|
713
|
+
* @module Utils/DiffChecker
|
|
714
|
+
* @author Paul Köhler (komed3)
|
|
715
|
+
* @license MIT
|
|
716
|
+
*/
|
|
717
|
+
/**
 * The DiffChecker class compares two texts line by line and exposes the result
 * as structured per-line entries, grouped entries and a unified-diff style string.
 *
 * The diff is computed once in the constructor; the getters only read the result.
 */
class DiffChecker {
    // Original input texts and the merged options
    a;
    b;
    options;
    // Computed diff entries (one per changed line) and groups of nearby entries
    entries = [];
    grouped = [];
    // Set after the first computeDiff() run so the work is not repeated
    diffRun = false;
    /**
     * Constructs a new DiffChecker instance and immediately computes the diff.
     *
     * @param {string} a - The first (original) text
     * @param {string} b - The second (modified) text
     * @param {DiffOptions} [opt] - Optional diff configuration, merged over the defaults
     */
    constructor(a, b, opt = {}) {
        this.a = a;
        this.b = b;
        // User-provided options override the defaults
        this.options = {
            mode: 'word',
            caseInsensitive: false,
            contextLines: 1,
            groupedLines: true,
            expandLines: false,
            showChangeMagnitude: true,
            maxMagnitudeSymbols: 5,
            lineBreak: '\n',
            ...opt
        };
        this.computeDiff();
    }
    /**
     * Trims both input texts, splits them at LF / CRLF line breaks and returns
     * the line arrays together with the larger of the two line counts.
     *
     * @returns { linesA: string[], linesB: string[], maxLen: number }
     */
    text2lines() {
        const linesA = this.a.trim().split(/\r?\n/);
        const linesB = this.b.trim().split(/\r?\n/);
        return { linesA, linesB, maxLen: Math.max(linesA.length, linesB.length) };
    }
    /**
     * Tokenizes a string according to the current diff mode.
     *
     * @param {string} input - The string to tokenize
     * @returns {string[]} - The whole input as one token ('line') or the input
     *   split at whitespace runs ('word'); undefined for any other mode
     */
    tokenize(input) {
        const { mode } = this.options;
        switch (mode) {
            case 'line': return [input];
            case 'word': return input.split(/\s+/);
        }
    }
    /**
     * Concatenates an array of tokens back into a string, respecting the diff mode.
     *
     * @param {string[]} input - Array of tokens
     * @returns {string} - Tokens joined by a single space in word mode, otherwise without separator
     */
    concat(input) {
        const { mode } = this.options;
        return input.join(mode === 'word' ? ' ' : '');
    }
    /**
     * Computes the diff between the two input texts and populates the
     * entries and grouped arrays. Subsequent calls are no-ops.
     */
    computeDiff() {
        if (this.diffRun) {
            return;
        }
        const { linesA, linesB, maxLen } = this.text2lines();
        // Lines are compared pairwise by index; a missing line counts as empty
        for (let i = 0; i < maxLen; i++) {
            this.lineDiff(linesA[i] || '', linesB[i] || '', i);
        }
        this.findGroups();
        this.diffRun = true;
    }
    /**
     * Compares two lines and records their differences at the configured granularity.
     * An entry is only added to `entries` when at least one difference was found.
     *
     * @param {string} a - Line from the first text
     * @param {string} b - Line from the second text
     * @param {number} line - Zero-based line index
     */
    lineDiff(a, b, line) {
        const { mode, caseInsensitive } = this.options;
        const baseLen = Math.max(a.length, b.length);
        // The comparison may run on lowercased copies; the reported text keeps its case
        const A = caseInsensitive ? a.toLowerCase() : a;
        const B = caseInsensitive ? b.toLowerCase() : b;
        let diffs = [];
        let delSize = 0;
        let insSize = 0;
        if (mode === 'line') {
            // Line mode: the whole line is either equal or replaced
            if (A !== B) {
                diffs.push({
                    posA: 0, posB: 0,
                    del: a, ins: b,
                    size: b.length - a.length
                });
                delSize = a.length;
                insSize = b.length;
            }
        }
        else {
            // Word mode: collect token-level differences and sum their sizes
            diffs = this.preciseDiff(a, A, b, B);
            for (const d of diffs) {
                delSize += d.del.length;
                insSize += d.ins.length;
            }
        }
        if (diffs.length) {
            this.entries.push({
                line, diffs, delSize, insSize, baseLen,
                totalSize: insSize - delSize,
                magnitude: this.magnitude(delSize, insSize, baseLen)
            });
        }
    }
    /**
     * Finds the differing token runs between two tokenized lines, returning the
     * original (case preserved) text and its position within the line.
     *
     * Matching is a greedy left-to-right scan with a look-ahead of up to three
     * tokens, so the result is not guaranteed to be minimal. Positions are derived
     * from token lengths plus one separator character per token.
     *
     * @param {string} a - Original line (case preserved)
     * @param {string} A - Original line (possibly lowercased)
     * @param {string} b - Modified line (case preserved)
     * @param {string} B - Modified line (possibly lowercased)
     * @returns {DiffEntry[]} - Array of diff entries for this line
     */
    preciseDiff(a, A, b, B) {
        // Start offset of every token, assuming one separator character between tokens
        const posIndex = (tokens) => {
            const positions = [];
            let offset = 0;
            for (const token of tokens) {
                positions.push(offset);
                offset += token.length + 1;
            }
            return positions;
        };
        // Case-preserved tokens are reported, comparison tokens are matched
        const origA = this.tokenize(a);
        const origB = this.tokenize(b);
        const tokenA = this.tokenize(A);
        const tokenB = this.tokenize(B);
        const lenA = tokenA.length;
        const lenB = tokenB.length;
        const posArrA = posIndex(origA);
        const posArrB = posIndex(origB);
        // Collect runs of matching tokens
        const matches = [];
        let ai = 0;
        let bi = 0;
        while (ai < lenA && bi < lenB) {
            if (tokenA[ai] === tokenB[bi]) {
                // Extend the match as long as the tokens keep matching
                let len = 1;
                while (ai + len < lenA && bi + len < lenB &&
                    tokenA[ai + len] === tokenB[bi + len]) {
                    len++;
                }
                matches.push({ ai, bi, len });
                ai += len;
                bi += len;
            }
            else {
                let found = false;
                // Look ahead for the next sync point (greedy, but avoids long tails)
                for (let offset = 1; offset <= 3 && !found; offset++) {
                    if (ai + offset < lenA && tokenA[ai + offset] === tokenB[bi]) {
                        // A later token of A matches the current token of B
                        matches.push({ ai: ai + offset, bi, len: 1 });
                        ai += offset + 1;
                        bi += 1;
                        found = true;
                    }
                    else if (bi + offset < lenB && tokenA[ai] === tokenB[bi + offset]) {
                        // A later token of B matches the current token of A
                        matches.push({ ai, bi: bi + offset, len: 1 });
                        ai += 1;
                        bi += offset + 1;
                        found = true;
                    }
                }
                // No sync point nearby: treat both tokens as changed
                if (!found) {
                    ai++;
                    bi++;
                }
            }
        }
        const diffs = [];
        // Records the unmatched tokens origA[fromA, toA) and origB[fromB, toB)
        const pushDiff = (fromA, toA, fromB, toB) => {
            const delArr = origA.slice(fromA, toA);
            const insArr = origB.slice(fromB, toB);
            diffs.push({
                posA: posArrA[fromA] ?? 0,
                posB: posArrB[fromB] ?? 0,
                del: this.concat(delArr),
                ins: this.concat(insArr),
                size: insArr.join('').length - delArr.join('').length
            });
        };
        // Emit the gaps between consecutive matches
        let i = 0;
        let j = 0;
        for (const m of matches) {
            if (i < m.ai || j < m.bi) {
                pushDiff(i, m.ai, j, m.bi);
            }
            i = m.ai + m.len;
            j = m.bi + m.len;
        }
        // Emit whatever is left after the last match
        if (i < lenA || j < lenB) {
            pushDiff(i, undefined, j, undefined);
        }
        // Drop diffs that neither delete nor insert any text
        return diffs.filter(d => d.del.length > 0 || d.ins.length > 0);
    }
    /**
     * Groups changed lines whose context ranges touch or overlap and
     * calculates the summed metrics of each group.
     */
    findGroups() {
        const { contextLines } = this.options;
        // Sums the metrics of a finished group and appends it to `grouped`
        const addGroup = (group, start, end) => {
            const [delSize, insSize, totalSize, baseLen] = [
                'delSize', 'insSize', 'totalSize', 'baseLen'
            ].map(k => group.reduce((sum, e) => sum + e[k], 0));
            this.grouped.push({
                start, end, delSize, insSize, totalSize,
                line: group[0].line, entries: group,
                magnitude: this.magnitude(delSize, insSize, baseLen)
            });
        };
        let group = [];
        let start = 0;
        let end = 0;
        for (const entry of this.entries) {
            // Context range of this entry, clamped at the first line
            const s = Math.max(0, entry.line - contextLines);
            const e = entry.line + contextLines;
            if (!group.length || s <= end + 1) {
                // First entry of a group or adjacent to the current range: extend it
                if (!group.length) {
                    start = s;
                }
                end = Math.max(end, e);
                group.push(entry);
            }
            else {
                // Gap to the current range: close the group and start a new one
                addGroup(group, start, end);
                group = [entry];
                start = s;
                end = e;
            }
        }
        if (group.length) {
            addGroup(group, start, end);
        }
    }
    /**
     * Calculates the change magnitude string for a group or line.
     *
     * @param {number} del - Number of deleted characters
     * @param {number} ins - Number of inserted characters
     * @param {number} baseLen - Base length for normalization
     * @returns {string} - Magnitude string (e.g. "++-"), empty if nothing changed or baseLen is 0
     */
    magnitude(del, ins, baseLen) {
        const { maxMagnitudeSymbols } = this.options;
        const total = del + ins;
        if (total === 0 || baseLen === 0) {
            return '';
        }
        // Between 1 and maxMagnitudeSymbols symbols, proportional to the changed share
        const magLen = Math.min(maxMagnitudeSymbols, Math.max(Math.round(total / baseLen * maxMagnitudeSymbols), 1));
        // Split the symbols between insertions (+) and deletions (-)
        const plus = Math.round((ins / total) * magLen);
        const minus = magLen - plus;
        return '+'.repeat(plus) + '-'.repeat(minus);
    }
    /**
     * Generates a unified diff output as a string, with optional CLI coloring.
     *
     * @param {boolean} cli - If true, use ANSI colors; otherwise, ASCII markup
     * @returns {string} - Unified diff output
     */
    output(cli) {
        const { mode, contextLines, groupedLines, expandLines, showChangeMagnitude, lineBreak } = this.options;
        const { linesA, linesB, maxLen } = this.text2lines();
        const linePad = Math.max(4, maxLen.toString().length);
        // Index the entries by line once instead of searching them for every printed line
        const entryByLine = new Map();
        for (const e of this.entries) {
            if (!entryByLine.has(e.line)) {
                entryByLine.set(e.line, e);
            }
        }
        const out = [''];
        // Helper functions for coloring and formatting (ASCII or CLI colored)
        const highlight = (s, ansi) => cli ? `\x1b[${ansi}m${s}\x1b[0m` : s;
        const cy = (s) => highlight(s, '36');
        const gy = (s) => highlight(s, '90');
        const gn = (s) => highlight(s, '32');
        const rd = (s) => highlight(s, '31');
        const ye = (s) => highlight(s, '33');
        const del = (s) => cli ? `\x1b[37;41m${s}\x1b[31;49m` : `-[${s}]`;
        const ins = (s) => cli ? `\x1b[37;42m${s}\x1b[32;49m` : `+[${s}]`;
        // Marks the changed segments of a line based on its diffs
        const mark = (line, diffs, type) => {
            // Line mode replaces whole lines, so there is nothing to highlight inside
            if (!diffs.length || mode === 'line') {
                return line;
            }
            let res = '';
            let idx = 0;
            for (const d of diffs) {
                const pos = type === 'del' ? d.posA : d.posB;
                const val = type === 'del' ? d.del : d.ins;
                if (!val) {
                    continue;
                }
                // Unchanged text before the change
                if (pos > idx) {
                    res += line.slice(idx, pos);
                }
                res += (type === 'del' ? del(val) : ins(val));
                idx = pos + val.length;
            }
            // Unchanged text after the last change
            return res + line.slice(idx);
        };
        // Outputs the "@@ … @@" header for a group or line entry
        const header = (e) => {
            out.push(`${(' '.repeat(linePad))} ${(cy(`@@ -${(e.line + 1)},${e.delSize} +${(e.line + 1)},${e.insSize} @@`))} ${(showChangeMagnitude ? ye(e.magnitude) : '')}`);
        };
        // Outputs a single line; it is rendered as a change only if `forced` equals its index
        const line = (i, forced) => {
            // One text may have fewer lines than the other: treat a missing line as
            // empty instead of passing `undefined` on to mark() or the template
            const textA = linesA[i] ?? '';
            const textB = linesB[i] ?? '';
            if (!textA && !textB) {
                return;
            }
            const entry = entryByLine.get(i);
            const lineNo = (i + 1).toString().padStart(linePad, ' ');
            if (entry && forced === i) {
                out.push(`${lineNo} ${rd(`- ${mark(textA, entry.diffs, 'del')}`)}`);
                out.push(`${' '.repeat(linePad)} ${gn(`+ ${mark(textB, entry.diffs, 'ins')}`)}`);
            }
            else {
                // Context line without diff highlighting
                out.push(`${lineNo} ${gy(textA)}`);
            }
        };
        // Outputs the lines start..end (inclusive) with an optional header and a trailing blank line
        const block = (start, end, forced, headerEntry) => {
            if (headerEntry) {
                header(headerEntry);
            }
            for (let i = start; i <= end; i++) {
                line(i, forced ?? i);
            }
            out.push('');
        };
        if (expandLines === true) {
            // Expand-all mode: the entire text in a single block
            block(0, maxLen);
        }
        else if (groupedLines === true) {
            // Grouped mode: one block per group
            for (const group of this.grouped) {
                block(group.start, group.end, undefined, group);
            }
        }
        else {
            // Individual mode: one block per changed line with its context lines
            for (const entry of this.entries) {
                block(entry.line - contextLines, entry.line + contextLines, entry.line, entry);
            }
        }
        return out.join(lineBreak);
    }
    /**
     * Returns the structured diff as an array of DiffLine objects.
     *
     * @returns {DiffLine[]} - Array of line-level diffs
     */
    getStructuredDiff() { return this.entries; }
    /**
     * Returns the grouped diff as an array of DiffGroup objects.
     *
     * @returns {DiffGroup[]} - Array of grouped diffs
     */
    getGroupedDiff() { return this.grouped; }
    /**
     * Returns the unified diff as a plain ASCII string.
     *
     * @returns {string} - Unified diff (ASCII)
     */
    getASCIIDiff() { return this.output(false); }
    /**
     * Returns the unified diff as a CLI-colored string.
     *
     * @returns {string} - Unified diff (CLI colors)
     */
    getCLIDiff() { return this.output(true); }
}
|
|
1138
|
+
|
|
1139
|
+
/**
|
|
1140
|
+
* Hash Table Utility
|
|
1141
|
+
* src/utils/HashTable.ts
|
|
1142
|
+
*
|
|
1143
|
+
* @see https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function
|
|
1144
|
+
* @see https://en.wikipedia.org/wiki/Hash_table
|
|
1145
|
+
*
|
|
1146
|
+
* This module implements an instantiable hash table/cache using the FNV-1a hash algorithm.
|
|
1147
|
+
* It allows for multiple independent caches (e.g. for metrics, normalization, etc.) with
|
|
1148
|
+
* type safety and high performance. The FNV-1a algorithm is factored out into its own
|
|
1149
|
+
* static utility class to avoid code duplication and memory overhead.
|
|
1150
|
+
*
|
|
1151
|
+
* The key() method supports any number of string arguments, enabling flexible cache keys
|
|
1152
|
+
* for different use cases (e.g. normalization, metrics, etc.).
|
|
1153
|
+
*
|
|
1154
|
+
* @module Utils/HashTable
|
|
1155
|
+
* @author Paul Köhler (komed3)
|
|
1156
|
+
* @license MIT
|
|
1157
|
+
*/
|
|
1158
|
+
/**
 * Hasher Utility
 * Static helper that derives a 32-bit hash from a string, based on the FNV-1a scheme.
 */
class Hasher {
    // FNV prime and offset basis (32-bit variant)
    static FNV_PRIME = 0x01000193;
    static HASH_OFFSET = 0x811c9dc5;
    /**
     * Computes a hash value for a given string.
     *
     * The string is consumed in groups of four UTF-16 code units that are packed
     * into one number per XOR/multiply round; the up to three leftover code units
     * are hashed one by one. Two final shift/multiply rounds mix the bits.
     *
     * @param {string} str - The string to hash
     * @return {number} - The computed hash value as an unsigned 32-bit integer
     */
    static fnv1a(str) {
        const len = str.length;
        let hash = this.HASH_OFFSET;
        // Index where the last complete group of four code units ends
        const bulkEnd = len - (len % 4);
        let pos = 0;
        while (pos < bulkEnd) {
            const packed = str.charCodeAt(pos)
                | (str.charCodeAt(pos + 1) << 8)
                | (str.charCodeAt(pos + 2) << 16)
                | (str.charCodeAt(pos + 3) << 24);
            hash ^= packed;
            hash *= this.FNV_PRIME;
            pos += 4;
        }
        // Leftover code units that did not fill a complete group
        for (; pos < len; pos++) {
            hash ^= str.charCodeAt(pos);
            hash *= this.FNV_PRIME;
        }
        // Final mixing to improve distribution
        hash = (hash ^ (hash >>> 16)) * 0x85ebca6b;
        hash = (hash ^ (hash >>> 13)) * 0xc2b2ae35;
        // The unsigned shift converts the result to an unsigned 32-bit integer
        return (hash ^ (hash >>> 16)) >>> 0;
    }
}
|
|
1207
|
+
/**
 * HashTable class implements an instantiable, size-limited hash table/cache
 * on top of a Map. Several independent instances can be used side by side.
 *
 * @template K - The type of the label for the key (e.g. string, MetricName, …)
 * @template T - The type of value to be stored in the hash table (e.g. MetricCompute, string, …)
 */
class HashTable {
    // The max. length of a string to hash, which is set to 2048 characters.
    static MAX_LEN = 2048;
    // The max. number of entries in the hash table, which is set to 10,000.
    static TABLE_SIZE = 10_000;
    /**
     * The internal map to store entries.
     * The key is a string generated from the label and any number of hashed strings.
     * The value is of type T.
     */
    table = new Map();
    /**
     * Generates a hash key for any number of string arguments.
     * The key is in the format "label-H1-H2-H3-..."
     *
     * Each H is a 32-bit hash, so different inputs can produce the same key.
     *
     * @param {K} label - Label for this key (e.g. metric name, normalization flags, …)
     * @param {string[]} strs - Array of strings to hash (e.g. input, params, …)
     * @param {boolean} [sorted=false] - Whether to sort the hashes before creating the key,
     *   which makes the key independent of the order of `strs`
     * @returns {string|false} - The hash key or false if any string is too long
     */
    key(label, strs, sorted = false) {
        // Return false if any string exceeds the maximum length
        for (const str of strs) {
            if (str.length > HashTable.MAX_LEN) {
                return false;
            }
        }
        // Hash all strings
        const hashes = strs.map(s => Hasher.fnv1a(s));
        // The default sort compares the hashes as strings, not numerically. It is kept
        // because any deterministic order makes the key order-independent and a
        // different order would change the generated keys.
        if (sorted) {
            hashes.sort();
        }
        // Build key: label-H1-H2-H3-...
        return [label, ...hashes].join('-');
    }
    /**
     * Checks if a key exists in the hash table.
     *
     * @param {string} key - The key to check
     * @returns {boolean} - True if the key exists, false otherwise
     */
    has(key) { return this.table.has(key); }
    /**
     * Retrieves the entry from the hash table by its key.
     *
     * @param {string} key - The key to look up
     * @returns {T|undefined} - The entry if found, undefined otherwise
     */
    get(key) { return this.table.get(key); }
    /**
     * Adds an entry to the hash table or updates an existing one.
     *
     * @param {string} key - The hashed key for the entry
     * @param {T} entry - The entry itself to add
     * @param {boolean} [update=true] - Whether to update the entry if it already exists
     * @returns {boolean} - True if the entry was stored; false if the key exists and
     *   `update` is false, or if the key is new and the table is full
     */
    set(key, entry, update = true) {
        if (this.table.has(key)) {
            // Replacing a value does not grow the table, so the size limit does not apply
            if (!update) {
                return false;
            }
            this.table.set(key, entry);
            return true;
        }
        // New keys are only accepted while there is room left
        if (this.table.size >= HashTable.TABLE_SIZE) {
            return false;
        }
        this.table.set(key, entry);
        return true;
    }
    /**
     * Deletes an entry from the hash table by its key.
     *
     * @param {string} key - The key of the entry to delete
     */
    delete(key) { this.table.delete(key); }
    /**
     * Clears the hash table.
     * This method removes all entries from the hash table.
     */
    clear() { this.table.clear(); }
    /**
     * Returns the current size of the hash table.
     *
     * @returns {number} - The number of entries in the hash table
     */
    size() { return this.table.size; }
}
|
|
1296
|
+
|
|
1297
|
+
/**
|
|
1298
|
+
* Normalizer Utility
|
|
1299
|
+
* src/utils/Normalizer.ts
|
|
1300
|
+
*
|
|
1301
|
+
* @see https://en.wikipedia.org/wiki/Text_normalization
|
|
1302
|
+
* @see https://en.wikipedia.org/wiki/Unicode_equivalence
|
|
1303
|
+
*
|
|
1304
|
+
* This module provides a Normalizer class that allows for string normalization based
|
|
1305
|
+
* on various flags. It uses a pipeline of normalization functions that can be reused
|
|
1306
|
+
* and cached for efficiency. The Normalizer can handle both single strings and arrays
|
|
1307
|
+
* of strings, and supports synchronous and asynchronous normalization.
|
|
1308
|
+
*
|
|
1309
|
+
* Supported flags:
|
|
1310
|
+
* 'd' :: Normalize to NFD (Normalization Form Decomposed)
|
|
1311
|
+
* 'u' :: Normalize to NFC (Normalization Form Composed)
|
|
1312
|
+
* 'x' :: Normalize to NFKC (Normalization Form Compatibility Composed)
|
|
1313
|
+
* 'w' :: Collapse whitespace
|
|
1314
|
+
* 't' :: Remove leading and trailing whitespace
|
|
1315
|
+
* 'r' :: Remove double characters
|
|
1316
|
+
* 's' :: Remove punctuation / special characters
|
|
1317
|
+
* 'k' :: Remove non-letter characters
|
|
1318
|
+
* 'n' :: Remove non-number characters
|
|
1319
|
+
* 'i' :: Case insensitive (convert to lowercase)
|
|
1320
|
+
*
|
|
1321
|
+
* @module Utils/Normalizer
|
|
1322
|
+
* @author Paul Köhler (komed3)
|
|
1323
|
+
* @license MIT
|
|
1324
|
+
*/
|
|
1325
|
+
/**
|
|
1326
|
+
* The Normalizer class providing methods to normalize strings based on various flags.
|
|
1327
|
+
*/
|
|
1328
|
+
class Normalizer {
    /**
     * Compiled normalization functions, keyed by their flag string, so that
     * every flag combination is assembled only once.
     */
    static pipeline = new Map();
    /**
     * Cache of already normalized strings; keys are derived from the flags
     * and the input string.
     */
    static cache = new HashTable();
    /**
     * Builds (or fetches) the normalization function for the given flags.
     * The steps always run in the fixed order of the table below, regardless
     * of the order in which the characters appear in `flags`.
     *
     * @param {NormalizeFlags} flags - Characters selecting the normalization steps
     * @returns {NormalizerFn} - Function applying the selected steps to a string
     */
    static getPipeline(flags) {
        const compiledByFlags = Normalizer.pipeline;
        // Reuse a previously compiled pipeline
        if (compiledByFlags.has(flags))
            return compiledByFlags.get(flags);
        // All known steps as [flag, transformation] pairs, in execution order
        const available = [
            // Unicode NFD (decomposed)
            ['d', str => str.normalize('NFD')],
            // Unicode NFC (composed)
            ['u', str => str.normalize('NFC')],
            // Unicode NFKC (compatibility composed)
            ['x', str => str.normalize('NFKC')],
            // Collapse whitespace runs into a single space
            ['w', str => str.replace(/\s+/g, ' ')],
            // Strip leading and trailing whitespace
            ['t', str => str.trim()],
            // Collapse runs of the same character into one
            ['r', str => str.replace(/(.)\1+/g, '$1')],
            // Drop everything that is not a letter, number or whitespace
            ['s', str => str.replace(/[^\p{L}\p{N}\s]/gu, '')],
            // Drop everything that is not a letter
            ['k', str => str.replace(/[^\p{L}]/gu, '')],
            // NOTE(review): documented as "remove non-number characters", but this
            // pattern removes the number characters themselves - confirm the intent
            ['n', str => str.replace(/\p{N}/gu, '')],
            // Lowercase for case-insensitive comparison
            ['i', str => str.toLowerCase()]
        ];
        // Keep only the steps whose flag was requested
        const steps = [];
        for (const [flag, transform] of available) {
            if (flags.includes(flag))
                steps.push(transform);
        }
        // Chain the selected steps, feeding each result into the next one
        const compiled = (input) => steps.reduce((res, step) => step(res), input);
        // Remember the compiled function for this flag string
        compiledByFlags.set(flags, compiled);
        return compiled;
    }
    /**
     * Normalizes a string, or every string of an array, according to the flags.
     * Empty input, missing flags or non-string flags leave the input untouched.
     *
     * @param {string|string[]} input - The string or array of strings to normalize
     * @param {NormalizeFlags} flags - Characters selecting the normalization steps
     * @returns {string|string[]} - The normalized string(s)
     */
    static normalize(input, flags) {
        // Arrays are handled element by element
        if (Array.isArray(input))
            return input.map(item => Normalizer.normalize(item, flags));
        // Nothing to do without usable flags or without input
        const usable = Boolean(flags) && typeof flags === 'string' && Boolean(input);
        if (!usable)
            return input;
        // The cache may refuse to build a key (it then returns false)
        const key = Normalizer.cache.key(flags, [input]);
        // Without a key the cache is bypassed entirely
        if (!key)
            return Normalizer.getPipeline(flags)(input);
        // Serve a cached result when there is one
        if (Normalizer.cache.has(key))
            return Normalizer.cache.get(key);
        // Otherwise run the pipeline and remember the outcome
        const res = Normalizer.getPipeline(flags)(input);
        Normalizer.cache.set(key, res);
        return res;
    }
    /**
     * Promise-based variant of normalize(). The normalization itself is done
     * by the synchronous normalize(); only the result is delivered via a promise.
     *
     * @param {string|string[]} input - The string or array of strings to normalize
     * @param {NormalizeFlags} flags - Characters selecting the normalization steps
     * @returns {Promise<string|string[]>} - Resolves to the normalized string(s)
     */
    static async normalizeAsync(input, flags) {
        if (Array.isArray(input)) {
            // One normalize() call per element, collected with Promise.all
            return await Promise.all(input.map(s => Normalizer.normalize(s, flags)));
        }
        // Single value: wrap the synchronous result
        return await Promise.resolve(Normalizer.normalize(input, flags));
    }
    /**
     * Resets the Normalizer by emptying both the compiled pipelines and the
     * result cache.
     */
    static clear() {
        Normalizer.pipeline.clear();
        Normalizer.cache.clear();
    }
}
|
|
1447
|
+
|
|
1448
|
+
/**
|
|
1449
|
+
* Filter Utility
|
|
1450
|
+
* src/utils/Filter.ts
|
|
1451
|
+
*
|
|
1452
|
+
* This module provides a Filter class that allows for the management and application of
|
|
1453
|
+
* filters to strings based on hooks. Filters can be added, removed, paused, resumed, and
|
|
1454
|
+
* applied to input strings. Each filter has an id, a function, a priority, and options
|
|
1455
|
+
* for activation and overrideability.
|
|
1456
|
+
*
|
|
1457
|
+
* @module Utils/Filter
|
|
1458
|
+
* @author Paul Köhler (komed3)
|
|
1459
|
+
* @license MIT
|
|
1460
|
+
*/
|
|
1461
|
+
/**
|
|
1462
|
+
* The Filter class provides a way to manage and apply filters to strings based on hooks.
|
|
1463
|
+
*/
|
|
1464
|
+
class Filter {
    /**
     * All registered filters: hook name -> array of filter entries
     * ({ id, fn, priority, active, overrideable }), sorted by ascending priority.
     */
    static filters = new Map();
    /**
     * Looks up a single filter entry.
     *
     * @param {string} hook - The name of the hook
     * @param {string} id - The id of the filter
     * @returns {FilterEntry|undefined} - The entry, or undefined if hook or id is unknown
     */
    static find(hook, id) {
        const entries = Filter.filters.get(hook);
        return entries?.find(entry => entry.id === id);
    }
    /**
     * Registers a filter for a hook. An existing filter with the same id is
     * replaced, unless it was registered as not overrideable.
     *
     * @param {string} hook - The name of the hook
     * @param {string} id - The id of the filter
     * @param {FilterFn} fn - The filter function
     * @param {FilterOptions} [opt] - priority (default 10), active (default true), overrideable (default true)
     * @returns {boolean} - True if the filter was added, false if an existing filter blocked the override
     */
    static add(hook, id, fn, opt = {}) {
        const { priority = 10, active = true, overrideable = true } = opt;
        // Work on the hook's own array, or start a new one
        const entries = Filter.filters.get(hook) ?? [];
        const pos = entries.findIndex(entry => entry.id === id);
        if (pos >= 0) {
            // A protected filter stays in place
            if (!entries[pos].overrideable)
                return false;
            // Otherwise the old entry makes room for the new one
            entries.splice(pos, 1);
        }
        entries.push({ id, fn, priority, active, overrideable });
        // Lower priority values run first
        entries.sort((left, right) => left.priority - right.priority);
        Filter.filters.set(hook, entries);
        return true;
    }
    /**
     * Unregisters a filter.
     *
     * @param {string} hook - The name of the hook
     * @param {string} id - The id of the filter
     * @returns {boolean} - True if the filter was removed, false if it was not found
     */
    static remove(hook, id) {
        const entries = Filter.filters.get(hook);
        // Unknown hook and unknown id are treated the same way
        const pos = entries ? entries.findIndex(entry => entry.id === id) : -1;
        if (pos < 0)
            return false;
        entries.splice(pos, 1);
        return true;
    }
    /**
     * Deactivates a filter without removing it.
     *
     * @param {string} hook - The name of the hook
     * @param {string} id - The id of the filter
     * @returns {boolean} - True if the filter was paused, false if it was not found
     */
    static pause(hook, id) {
        const entry = Filter.find(hook, id);
        if (entry) {
            entry.active = false;
            return true;
        }
        return false;
    }
    /**
     * Reactivates a previously paused filter.
     *
     * @param {string} hook - The name of the hook
     * @param {string} id - The id of the filter
     * @returns {boolean} - True if the filter was resumed, false if it was not found
     */
    static resume(hook, id) {
        const entry = Filter.find(hook, id);
        if (entry) {
            entry.active = true;
            return true;
        }
        return false;
    }
    /**
     * Returns the ids of the filters registered for a hook, in execution order.
     *
     * @param {string} hook - The name of the hook
     * @param {boolean} active - If true, only ids of active filters are returned
     * @returns {string[]} - An array of filter ids
     */
    static list(hook, active = false) {
        const entries = Filter.filters.get(hook) ?? [];
        return entries
            .filter(entry => !active || entry.active)
            .map(entry => entry.id);
    }
    /**
     * Runs every active filter of a hook over the input, in priority order.
     *
     * @param {string} hook - The name of the hook
     * @param {string|string[]} input - The input string(s) to be filtered
     * @returns {string|string[]} - The filtered string(s)
     */
    static apply(hook, input) {
        const entries = Filter.filters.get(hook);
        // Unknown hook or nothing active: hand the input back untouched
        const anyActive = entries !== undefined && entries.some(entry => entry.active);
        if (!anyActive)
            return input;
        // Pipes one string through all active filters
        const applyOne = (value) => {
            let current = value;
            for (const entry of entries) {
                if (!entry.active)
                    continue;
                current = entry.fn(current);
            }
            return current;
        };
        if (Array.isArray(input))
            return input.map(item => applyOne(item));
        return applyOne(input);
    }
    /**
     * Promise-based variant of apply(). Filter functions may return a plain
     * value or a promise; each one is awaited before the next filter runs.
     *
     * @param {string} hook - The name of the hook
     * @param {string|string[]} input - The input string(s) to be filtered
     * @returns {Promise<string|string[]>} - The filtered string(s)
     */
    static async applyAsync(hook, input) {
        const entries = Filter.filters.get(hook);
        // Unknown hook or nothing active: hand the input back untouched
        const anyActive = entries !== undefined && entries.some(entry => entry.active);
        if (!anyActive)
            return input;
        // Pipes one string through all active filters, awaiting each result
        const applyOne = async (value) => {
            let current = value;
            for (const entry of entries) {
                if (!entry.active)
                    continue;
                current = await Promise.resolve(entry.fn(current));
            }
            return current;
        };
        // Array elements are processed concurrently and collected with Promise.all
        if (Array.isArray(input))
            return Promise.all(input.map(item => applyOne(item)));
        return applyOne(input);
    }
    /**
     * Removes the filters of one hook, or all filters when no hook is given.
     *
     * @param {string} [hook] - Optional name of the hook to clear filters for
     */
    static clear(hook) {
        if (!hook) {
            // No hook given: wipe everything
            Filter.filters.clear();
            return;
        }
        Filter.filters.delete(hook);
    }
}
|
|
1643
|
+
|
|
1644
|
+
/**
|
|
1645
|
+
* Registry Utility
|
|
1646
|
+
* src/utils/Registry.ts
|
|
1647
|
+
*
|
|
1648
|
+
* This module provides a Registry function that allows for registering,
|
|
1649
|
+
* removing, checking, getting, and listing class constructors.
|
|
1650
|
+
*
|
|
1651
|
+
* It is designed to manage class extensions, ensuring that all registered
|
|
1652
|
+
* classes extend a specified base constructor.
|
|
1653
|
+
*
|
|
1654
|
+
* @module Utils/Registry
|
|
1655
|
+
* @author Paul Köhler (komed3)
|
|
1656
|
+
* @license MIT
|
|
1657
|
+
*/
|
|
1658
|
+
/**
|
|
1659
|
+
* Global registry object to hold multiple registries.
|
|
1660
|
+
* Each registry is keyed by a string identifier.
|
|
1661
|
+
*
|
|
1662
|
+
* @type {Record<string, RegistryService<any>>}
|
|
1663
|
+
*/
|
|
1664
|
+
// Created without a prototype, so `reg in registry` only matches names that were actually registered
const registry = Object.create(null);
|
|
1665
|
+
/**
|
|
1666
|
+
* Factory object to hold factory functions for creating instances.
|
|
1667
|
+
* This is used to create instances of registered classes.
|
|
1668
|
+
*
|
|
1669
|
+
* @type {Record<string, ( cls: string, ...args: any[] ) => InstanceType<any>>}
|
|
1670
|
+
*/
|
|
1671
|
+
// Created without a prototype, so `reg in factory` only matches names that were actually registered
const factory = Object.create(null);
|
|
1672
|
+
/**
|
|
1673
|
+
* Registry function to create a service for managing class constructors.
|
|
1674
|
+
*
|
|
1675
|
+
* @param {string} reg - The name of the registry
|
|
1676
|
+
* @param {RegistryConstructor<T>} ctor - The base constructor that all registered classes must extend
|
|
1677
|
+
* @returns {RegistryService<T>} - An object with methods to register, remove, check, get, and list classes
|
|
1678
|
+
* @throws {Error} If the registry already exists (overwriting is forbidden)
|
|
1679
|
+
*/
|
|
1680
|
+
function Registry(reg, ctor) {
    // A registry name can be claimed only once, in either global table
    const taken = reg in registry || reg in factory;
    if (taken) {
        throw new Error(`registry <${reg}> already exists / overwriting is forbidden`);
    }
    // Private name -> constructor table (no prototype, so `in` sees own entries only)
    const classes = Object.create(null);
    const service = {
        /**
         * Registers a class that extends the base constructor.
         *
         * @param {string} name - The name to register the class under
         * @param {RegistryConstructor<T>} cls - The class constructor
         * @param {boolean} [update=false] - Allow replacing an existing entry
         * @throws {TypeError} If the class does not extend the base constructor
         * @throws {Error} If the name is taken and update is false
         */
        add(name, cls, update = false) {
            const extendsBase = cls.prototype instanceof ctor;
            if (!extendsBase) {
                throw new TypeError(`class must extend <${reg}>`);
            }
            if (!update && name in classes) {
                throw new Error(`entry <${name}> already exists / use <update=true> to overwrite`);
            }
            classes[name] = cls;
        },
        /**
         * Unregisters a class; unknown names are silently ignored.
         *
         * @param {string} name - The name of the class to remove
         */
        remove(name) {
            delete classes[name];
        },
        /**
         * Tells whether a class is registered under the given name.
         *
         * @param {string} name - The name of the class to check
         * @returns {boolean} - True if the class is registered, false otherwise
         */
        has(name) {
            return name in classes;
        },
        /**
         * Lists the names of all registered classes.
         *
         * @returns {string[]} - An array of registered class names
         */
        list() {
            return Object.keys(classes);
        },
        /**
         * Returns the constructor registered under the given name.
         *
         * @param {string} name - The name of the class to retrieve
         * @returns {RegistryConstructor<T>} - The class constructor
         * @throws {Error} If the class is not registered
         */
        get(name) {
            if (name in classes) {
                return classes[name];
            }
            throw new Error(`class <${name}> not registered for <${reg}>`);
        }
    };
    // Publish the service under its name
    registry[reg] = service;
    // Companion factory: instantiates a class of this registry by name or constructor
    factory[reg] = (cls, ...args) => createFromRegistry(reg, cls, ...args);
    return service;
}
|
|
1742
|
+
/**
|
|
1743
|
+
* Resolve a class constructor from a specific registry.
|
|
1744
|
+
*
|
|
1745
|
+
* @param {string} reg - The name of the registry
|
|
1746
|
+
* @param {T|string} cls - The class itself or name of the class to resolve
|
|
1747
|
+
* @returns {T|undefined} - The given class itself, or the constructor registered under the given name (an unregistered name makes the registry's get() throw)
|
|
1748
|
+
* @throws {ReferenceError} If the registry does not exist
|
|
1749
|
+
*/
|
|
1750
|
+
function resolveCls(reg, cls) {
    // The registry itself has to exist, no matter how the class is given
    const known = reg in registry;
    if (!known) {
        throw new ReferenceError(`registry <${reg}> does not exist`);
    }
    // Anything that is not a name is handed back untouched
    if (typeof cls !== 'string') {
        return cls;
    }
    // Names are resolved by the registry service
    return registry[reg]?.get(cls);
}
|
|
1755
|
+
/**
|
|
1756
|
+
* Create an instance of a class from a specific registry.
|
|
1757
|
+
*
|
|
1758
|
+
* @param {string} reg - The name of the registry
|
|
1759
|
+
* @param {T|string} cls - The class itself or name of the class to instantiate
|
|
1760
|
+
* @param {...any} args - Arguments to pass to the class constructor
|
|
1761
|
+
* @returns {T} - An instance of the class
|
|
1762
|
+
* @throws {Error} If the class cannot be instantiated
|
|
1763
|
+
*/
|
|
1764
|
+
function createFromRegistry(reg, cls, ...args) {
    // Creates an instance of a class taken from the registry `reg`.
    //
    // reg  - the name of the registry
    // cls  - the class itself or the name it is registered under
    // args - arguments forwarded to the class constructor
    //
    // Returns the new instance. Errors raised while resolving the class
    // propagate unchanged; a failing constructor call is reported as an
    // Error whose `cause` is the original error.
    const ctor = resolveCls(reg, cls);
    try {
        return new ctor(...args);
    }
    catch (err) {
        // Name the class instead of interpolating the constructor itself,
        // which would dump its whole source text into the message
        const label = typeof ctor === 'function'
            ? (ctor.name || '(anonymous)')
            : String(ctor);
        // Keep the underlying failure reachable through `cause`
        throw new Error(`cannot instantiate class <${label}>`, { cause: err });
    }
}
|
|
1773
|
+
|
|
1774
|
+
/**
|
|
1775
|
+
* Abstract Metric
|
|
1776
|
+
* src/metric/Metric.ts
|
|
1777
|
+
*
|
|
1778
|
+
* This module defines an abstract class for string metrics, providing a framework for
|
|
1779
|
+
* computing various string similarity metrics. It includes methods for running metrics
|
|
1780
|
+
* in different modes (single, batch, pairwise) synchronous or asynchronous and caching
|
|
1781
|
+
* results to optimize performance. The class is designed to be extended by specific
|
|
1782
|
+
* metric implementations like the Levenshtein distance or Jaro-Winkler similarity.
|
|
1783
|
+
*
|
|
1784
|
+
* It provides:
|
|
1785
|
+
* - A base class for string metrics with common functionality
|
|
1786
|
+
* - Methods for running metrics in different modes
|
|
1787
|
+
* - Pre-computation for trivial cases to optimize performance
|
|
1788
|
+
* - Caching of metric computations to avoid redundant calculations
|
|
1789
|
+
* - Support for symmetrical metrics (same result for inputs in any order)
|
|
1790
|
+
* - Performance tracking capabilities (Profiler)
|
|
1791
|
+
* - Asynchronous execution support for metrics
|
|
1792
|
+
*
|
|
1793
|
+
* This class is intended to be extended by specific metric implementations that will
|
|
1794
|
+
* implement the `compute` method to define the specific metric computation logic.
|
|
1795
|
+
*
|
|
1796
|
+
* @module Metric
|
|
1797
|
+
* @author Paul Köhler (komed3)
|
|
1798
|
+
* @license MIT
|
|
1799
|
+
*/
|
|
1800
|
+
// Get the singleton profiler instance for performance monitoring
|
|
1801
|
+
// Module-level handle; Metric.runSingle() routes every non-trivial computation through profiler$2.run()
const profiler$2 = Profiler.getInstance();
|
|
1802
|
+
/**
|
|
1803
|
+
* Abstract class representing a generic string metric.
|
|
1804
|
+
*
|
|
1805
|
+
* @abstract
|
|
1806
|
+
* @template R - The type of the raw result, defaulting to `MetricRaw`.
|
|
1807
|
+
*/
|
|
1808
|
+
class Metric {
|
|
1809
|
+
// Cache for metric computations to avoid redundant calculations
|
|
1810
|
+
static cache = new HashTable();
|
|
1811
|
+
// Metric name for identification
|
|
1812
|
+
metric;
|
|
1813
|
+
// Inputs for the metric computation, transformed into arrays
|
|
1814
|
+
a;
|
|
1815
|
+
b;
|
|
1816
|
+
// Store original inputs for result mapping
|
|
1817
|
+
origA = [];
|
|
1818
|
+
origB = [];
|
|
1819
|
+
// Options for the metric computation, such as performance tracking
|
|
1820
|
+
options;
|
|
1821
|
+
// Indicates whether the metric is symmetric (same result for inputs in any order)
|
|
1822
|
+
symmetric;
|
|
1823
|
+
/**
|
|
1824
|
+
* Result of the metric computation, which can be a single result or an array of results.
|
|
1825
|
+
* This will be populated after running the metric.
|
|
1826
|
+
*/
|
|
1827
|
+
results;
|
|
1828
|
+
/**
|
|
1829
|
+
* Static method to clear the cache of metric computations.
|
|
1830
|
+
*/
|
|
1831
|
+
static clear() { this.cache.clear(); }
|
|
1832
|
+
/**
|
|
1833
|
+
* Swaps two strings and their lengths if the first is longer than the second.
|
|
1834
|
+
*
|
|
1835
|
+
* @param {string} a - First string
|
|
1836
|
+
* @param {string} b - Second string
|
|
1837
|
+
* @param {number} m - Length of the first string
|
|
1838
|
+
* @param {number} n - Length of the second string
|
|
1839
|
+
* @returns {[string, string, number, number]} - Swapped strings and lengths
|
|
1840
|
+
*/
|
|
1841
|
+
static swap(a, b, m, n) { return m > n ? [b, a, n, m] : [a, b, m, n]; }
|
|
1842
|
+
/**
|
|
1843
|
+
* Clamps the similarity result between 0 and 1.
|
|
1844
|
+
*
|
|
1845
|
+
* @param {number} res - The input similarity to clamp
|
|
1846
|
+
* @returns {number} - The clamped similarity (0 to 1)
|
|
1847
|
+
*/
|
|
1848
|
+
static clamp(res) { return Math.max(0, Math.min(1, res)); }
|
|
1849
|
+
/**
|
|
1850
|
+
* Constructor for the Metric class.
|
|
1851
|
+
* Initializes the metric with two inputs (strings or arrays of strings) and options.
|
|
1852
|
+
*
|
|
1853
|
+
* @param {string} metric - The name of the metric (e.g. 'levenshtein')
|
|
1854
|
+
* @param {MetricInput} a - First input string or array of strings
|
|
1855
|
+
* @param {MetricInput} b - Second input string or array of strings
|
|
1856
|
+
* @param {MetricOptions} [opt] - Options for the metric computation
|
|
1857
|
+
* @param {boolean} [symmetric=false] - Whether the metric is symmetric (same result for inputs in any order)
|
|
1858
|
+
* @throws {Error} - If inputs `a` or `b` are empty
|
|
1859
|
+
*/
|
|
1860
|
+
constructor(metric, a, b, opt = {}, symmetric = false) {
|
|
1861
|
+
// Set the metric name
|
|
1862
|
+
this.metric = metric;
|
|
1863
|
+
// Set the inputs
|
|
1864
|
+
this.a = Array.isArray(a) ? a : [a];
|
|
1865
|
+
this.b = Array.isArray(b) ? b : [b];
|
|
1866
|
+
// Validate inputs: ensure they are not empty
|
|
1867
|
+
if (this.a.length === 0 || this.b.length === 0)
|
|
1868
|
+
throw new Error(`inputs <a> and <b> must not be empty`);
|
|
1869
|
+
// Set options
|
|
1870
|
+
this.options = opt;
|
|
1871
|
+
this.symmetric = symmetric;
|
|
1872
|
+
}
|
|
1873
|
+
/**
|
|
1874
|
+
* Pre-compute the metric for two strings.
|
|
1875
|
+
* This method is called before the actual computation to handle trivial cases.
|
|
1876
|
+
*
|
|
1877
|
+
* @param {string} a - First string
|
|
1878
|
+
* @param {string} b - Second string
|
|
1879
|
+
* @param {number} m - Length of the first string
|
|
1880
|
+
* @param {number} n - Length of the second string
|
|
1881
|
+
* @returns {MetricCompute<R>|undefined} - Pre-computed result or undefined if not applicable
|
|
1882
|
+
*/
|
|
1883
|
+
preCompute(a, b, m, n) {
|
|
1884
|
+
// If strings are identical, return a similarity of 1
|
|
1885
|
+
if (a === b)
|
|
1886
|
+
return { res: 1 };
|
|
1887
|
+
// If the lengths of both strings is less than 2, return a similarity of 0
|
|
1888
|
+
if (m == 0 || n == 0 || (m < 2 && n < 2))
|
|
1889
|
+
return { res: 0 };
|
|
1890
|
+
return undefined;
|
|
1891
|
+
}
|
|
1892
|
+
/**
|
|
1893
|
+
* Abstract method to be implemented by subclasses to perform the metric computation.
|
|
1894
|
+
* This method should contain the logic for computing the metric between two strings.
|
|
1895
|
+
*
|
|
1896
|
+
* @param {string} a - First string
|
|
1897
|
+
* @param {string} b - Second string
|
|
1898
|
+
* @param {number} m - Length of the first string
|
|
1899
|
+
* @param {number} n - Length of the second string
|
|
1900
|
+
* @param {number} maxLen - Maximum length of the strings
|
|
1901
|
+
* @returns {MetricCompute<R>} - The result of the metric computation
|
|
1902
|
+
* @throws {Error} - If not overridden in a subclass
|
|
1903
|
+
*/
|
|
1904
|
+
compute(a, b, m, n, maxLen) {
|
|
1905
|
+
throw new Error(`method compute() must be overridden in a subclass`);
|
|
1906
|
+
}
|
|
1907
|
+
/**
|
|
1908
|
+
* Run the metric computation for single inputs (two strings).
|
|
1909
|
+
* Applies preCompute for trivial cases before cache lookup and computation.
|
|
1910
|
+
*
|
|
1911
|
+
* If the profiler is active, it will measure time and memory usage.
|
|
1912
|
+
*
|
|
1913
|
+
* @param {number} i - Pointer to the first string
|
|
1914
|
+
* @param {number} j - Pointer to the second string
|
|
1915
|
+
* @returns {MetricResultSingle<R>} - The result of the metric computation
|
|
1916
|
+
*/
|
|
1917
|
+
runSingle(i, j) {
|
|
1918
|
+
// Type safety: convert inputs to strings
|
|
1919
|
+
let a = String(this.a[i]), A = a;
|
|
1920
|
+
let b = String(this.b[j]), B = b;
|
|
1921
|
+
// Get lengths
|
|
1922
|
+
let m = A.length, n = B.length;
|
|
1923
|
+
// Pre-compute trivial cases (identical, empty, etc.)
|
|
1924
|
+
let result = this.preCompute(A, B, m, n);
|
|
1925
|
+
if (!result) {
|
|
1926
|
+
// If the profiler is enabled, measure; else, just run
|
|
1927
|
+
result = profiler$2.run(() => {
|
|
1928
|
+
// Generate a cache key based on the metric and pair of strings `a` and `b`
|
|
1929
|
+
const key = Metric.cache.key(this.metric, [A, B], this.symmetric);
|
|
1930
|
+
// If the key exists in the cache, return the cached result
|
|
1931
|
+
// Otherwise, compute the metric using the algorithm
|
|
1932
|
+
return Metric.cache.get(key || '') ?? (() => {
|
|
1933
|
+
// If the metric is symmetrical, swap `a` and `b` (shorter string first)
|
|
1934
|
+
if (this.symmetric)
|
|
1935
|
+
[A, B, m, n] = Metric.swap(A, B, m, n);
|
|
1936
|
+
// Compute the similarity using the algorithm
|
|
1937
|
+
const res = this.compute(A, B, m, n, Math.max(m, n));
|
|
1938
|
+
// If a key was generated, store the result in the cache
|
|
1939
|
+
if (key)
|
|
1940
|
+
Metric.cache.set(key, res);
|
|
1941
|
+
return res;
|
|
1942
|
+
})();
|
|
1943
|
+
});
|
|
1944
|
+
}
|
|
1945
|
+
// Build metric result object
|
|
1946
|
+
return {
|
|
1947
|
+
metric: this.metric,
|
|
1948
|
+
a: this.origA[i] ?? a,
|
|
1949
|
+
b: this.origB[j] ?? b,
|
|
1950
|
+
...result
|
|
1951
|
+
};
|
|
1952
|
+
}
|
|
1953
|
+
/**
|
|
1954
|
+
* Run the metric computation for single inputs (two strings) asynchronously.
|
|
1955
|
+
*
|
|
1956
|
+
* @param {number} i - Pointer to the first string
|
|
1957
|
+
* @param {number} j - Pointer to the second string
|
|
1958
|
+
* @returns {Promise<MetricResultSingle<R>>} - Promise resolving the result of the metric computation
|
|
1959
|
+
*/
|
|
1960
|
+
async runSingleAsync(i, j) {
|
|
1961
|
+
return Promise.resolve(this.runSingle(i, j));
|
|
1962
|
+
}
|
|
1963
|
+
/**
|
|
1964
|
+
* Run the metric computation for batch inputs (arrays of strings).
|
|
1965
|
+
*
|
|
1966
|
+
* It iterates through each string in the first array and computes the metric
|
|
1967
|
+
* against each string in the second array.
|
|
1968
|
+
*/
|
|
1969
|
+
runBatch() {
|
|
1970
|
+
const results = [];
|
|
1971
|
+
// Loop through each combination of strings in a[] and b[]
|
|
1972
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1973
|
+
for (let j = 0; j < this.b.length; j++)
|
|
1974
|
+
results.push(this.runSingle(i, j));
|
|
1975
|
+
// Populate the results
|
|
1976
|
+
// `this.results` will be an array of MetricResultSingle
|
|
1977
|
+
this.results = results;
|
|
1978
|
+
}
|
|
1979
|
+
/**
|
|
1980
|
+
* Run the metric computation for batch inputs (arrays of strings) asynchronously.
|
|
1981
|
+
*/
|
|
1982
|
+
async runBatchAsync() {
|
|
1983
|
+
const results = [];
|
|
1984
|
+
// Loop through each combination of strings in a[] and b[]
|
|
1985
|
+
for (let i = 0; i < this.a.length; i++)
|
|
1986
|
+
for (let j = 0; j < this.b.length; j++)
|
|
1987
|
+
results.push(await this.runSingleAsync(i, j));
|
|
1988
|
+
// Populate the results
|
|
1989
|
+
// `this.results` will be an array of MetricResultSingle
|
|
1990
|
+
this.results = results;
|
|
1991
|
+
}
|
|
1992
|
+
/**
|
|
1993
|
+
* Run the metric computation for pairwise inputs (A[i] vs B[i]).
|
|
1994
|
+
*
|
|
1995
|
+
* This method assumes that both `a` and `b` are arrays of equal length
|
|
1996
|
+
* and computes the metric only for corresponding index pairs.
|
|
1997
|
+
*/
|
|
1998
|
+
runPairwise() {
|
|
1999
|
+
const results = [];
|
|
2000
|
+
// Compute metric for each corresponding pair
|
|
2001
|
+
for (let i = 0; i < this.a.length; i++)
|
|
2002
|
+
results.push(this.runSingle(i, i));
|
|
2003
|
+
// Populate the results
|
|
2004
|
+
// `this.results` will be an array of MetricResultSingle
|
|
2005
|
+
this.results = results;
|
|
2006
|
+
}
|
|
2007
|
+
/**
|
|
2008
|
+
* Run the metric computation for pairwise inputs (A[i] vs B[i]) asynchronously.
|
|
2009
|
+
*/
|
|
2010
|
+
async runPairwiseAsync() {
|
|
2011
|
+
const results = [];
|
|
2012
|
+
// Compute metric for each corresponding pair
|
|
2013
|
+
for (let i = 0; i < this.a.length; i++)
|
|
2014
|
+
results.push(await this.runSingleAsync(i, i));
|
|
2015
|
+
// Populate the results
|
|
2016
|
+
// `this.results` will be an array of MetricResultSingle
|
|
2017
|
+
this.results = results;
|
|
2018
|
+
}
|
|
2019
|
+
/**
|
|
2020
|
+
* Set the original inputs to which the results of the metric calculation will refer.
|
|
2021
|
+
*
|
|
2022
|
+
* @param {MetricInput} [a] - original input(s) for a
|
|
2023
|
+
* @param {MetricInput} [b] - original input(s) for b
|
|
2024
|
+
*/
|
|
2025
|
+
setOriginal(a, b) {
|
|
2026
|
+
if (a)
|
|
2027
|
+
this.origA = Array.isArray(a) ? a : [a];
|
|
2028
|
+
if (b)
|
|
2029
|
+
this.origB = Array.isArray(b) ? b : [b];
|
|
2030
|
+
return this;
|
|
2031
|
+
}
|
|
2032
|
+
/**
|
|
2033
|
+
* Check if the inputs are in batch mode.
|
|
2034
|
+
*
|
|
2035
|
+
* This method checks if either `a` or `b` contains more than one string,
|
|
2036
|
+
* indicating that the metric is being run in batch mode.
|
|
2037
|
+
*
|
|
2038
|
+
* @returns {boolean} - True if either input is an array with more than one element
|
|
2039
|
+
*/
|
|
2040
|
+
isBatch() { return this.a.length > 1 || this.b.length > 1; }
|
|
2041
|
+
/**
|
|
2042
|
+
* Check if the inputs are in single mode.
|
|
2043
|
+
*
|
|
2044
|
+
* This method checks if both `a` and `b` are single strings (not arrays),
|
|
2045
|
+
* indicating that the metric is being run on a single pair of strings.
|
|
2046
|
+
*
|
|
2047
|
+
* @returns {boolean} - True if both inputs are single strings
|
|
2048
|
+
*/
|
|
2049
|
+
isSingle() { return !this.isBatch(); }
|
|
2050
|
+
/**
|
|
2051
|
+
* Check if the inputs are in pairwise mode.
|
|
2052
|
+
*
|
|
2053
|
+
* This method checks if both `a` and `b` are arrays of the same length,
|
|
2054
|
+
* indicating that the metric is being run on corresponding pairs of strings.
|
|
2055
|
+
*
|
|
2056
|
+
* @returns {boolean} - True if both inputs are arrays of equal length
|
|
2057
|
+
* @param {boolean} [safe=false] - If true, does not throw an error if lengths are not equal
|
|
2058
|
+
* @throws {Error} - If `safe` is false and the lengths of `a` and `b` are not equal
|
|
2059
|
+
*/
|
|
2060
|
+
isPairwise(safe = false) {
|
|
2061
|
+
return this.isBatch() && this.a.length === this.b.length ? true : !safe && (() => {
|
|
2062
|
+
throw new Error(`mode <pairwise> requires arrays of equal length`);
|
|
2063
|
+
})();
|
|
2064
|
+
}
|
|
2065
|
+
/**
|
|
2066
|
+
* Check if the metric is symmetrical.
|
|
2067
|
+
*
|
|
2068
|
+
* This method returns whether the metric is symmetric, meaning it produces the same
|
|
2069
|
+
* result regardless of the order of inputs (e.g., Levenshtein distance).
|
|
2070
|
+
*
|
|
2071
|
+
* @returns {boolean} - True if the metric is symmetric
|
|
2072
|
+
*/
|
|
2073
|
+
isSymmetrical() { return this.symmetric; }
|
|
2074
|
+
/**
|
|
2075
|
+
* Determine which mode to run the metric in.
|
|
2076
|
+
*
|
|
2077
|
+
* This method checks the provided mode or defaults to the mode specified in options.
|
|
2078
|
+
* If no mode is specified, it defaults to 'default'.
|
|
2079
|
+
*
|
|
2080
|
+
* @param {MetricMode} [mode] - The mode to run the metric in (optional)
|
|
2081
|
+
* @returns {MetricMode} - The determined mode
|
|
2082
|
+
*/
|
|
2083
|
+
whichMode(mode) { return mode ?? this.options?.mode ?? 'default'; }
|
|
2084
|
+
/**
|
|
2085
|
+
* Clear the cached results of the metric.
|
|
2086
|
+
*
|
|
2087
|
+
* This method resets the `results` property to `undefined`, effectively clearing
|
|
2088
|
+
* any previously computed results. It can be useful for re-running the metric
|
|
2089
|
+
* with new inputs or options.
|
|
2090
|
+
*/
|
|
2091
|
+
clear() { this.results = undefined; }
|
|
2092
|
+
/**
|
|
2093
|
+
* Run the metric computation based on the specified mode.
|
|
2094
|
+
*
|
|
2095
|
+
* @param {MetricMode} [mode] - The mode to run the metric in (optional)
|
|
2096
|
+
* @param {boolean} [clear=true] - Whether to clear previous results before running
|
|
2097
|
+
* @throws {Error} - If an unsupported mode is specified
|
|
2098
|
+
*/
|
|
2099
|
+
run(mode, clear = true) {
|
|
2100
|
+
// Clear previous results if requested
|
|
2101
|
+
if (clear)
|
|
2102
|
+
this.clear();
|
|
2103
|
+
switch (this.whichMode(mode)) {
|
|
2104
|
+
// Default mode runs the metric on single inputs or falls back to batch mode
|
|
2105
|
+
case 'default': if (this.isSingle()) {
|
|
2106
|
+
this.results = this.runSingle(0, 0);
|
|
2107
|
+
break;
|
|
2108
|
+
}
|
|
2109
|
+
// Batch mode runs the metric on all combinations of a[] and b[]
|
|
2110
|
+
case 'batch':
|
|
2111
|
+
this.runBatch();
|
|
2112
|
+
break;
|
|
2113
|
+
// Single mode runs the metric on the first elements of a[] and b[]
|
|
2114
|
+
case 'single':
|
|
2115
|
+
this.results = this.runSingle(0, 0);
|
|
2116
|
+
break;
|
|
2117
|
+
// Pairwise mode runs the metric on corresponding pairs of a[] and b[]
|
|
2118
|
+
case 'pairwise':
|
|
2119
|
+
if (this.isPairwise())
|
|
2120
|
+
this.runPairwise();
|
|
2121
|
+
break;
|
|
2122
|
+
// Unsupported mode
|
|
2123
|
+
default: throw new Error(`unsupported mode <${mode}>`);
|
|
2124
|
+
}
|
|
2125
|
+
}
|
|
2126
|
+
/**
|
|
2127
|
+
* Run the metric computation based on the specified mode asynchronously.
|
|
2128
|
+
*
|
|
2129
|
+
* @param {MetricMode} [mode] - The mode to run the metric in (optional)
|
|
2130
|
+
* @param {boolean} [clear=true] - Whether to clear previous results before running
|
|
2131
|
+
* @returns {Promise<void>} - A promise that resolves when the metric computation is complete
|
|
2132
|
+
* @throws {Error} - If an unsupported mode is specified
|
|
2133
|
+
*/
|
|
2134
|
+
async runAsync(mode, clear = true) {
|
|
2135
|
+
// Clear previous results if requested
|
|
2136
|
+
if (clear)
|
|
2137
|
+
this.clear();
|
|
2138
|
+
switch (this.whichMode(mode)) {
|
|
2139
|
+
// Default mode runs the metric on single inputs or falls back to batch mode
|
|
2140
|
+
case 'default': if (this.isSingle()) {
|
|
2141
|
+
this.results = await this.runSingleAsync(0, 0);
|
|
2142
|
+
break;
|
|
2143
|
+
}
|
|
2144
|
+
// Batch mode runs the metric on all combinations of a[] and b[]
|
|
2145
|
+
case 'batch':
|
|
2146
|
+
await this.runBatchAsync();
|
|
2147
|
+
break;
|
|
2148
|
+
// Single mode runs the metric on the first elements of a[] and b[]
|
|
2149
|
+
case 'single':
|
|
2150
|
+
this.results = await this.runSingleAsync(0, 0);
|
|
2151
|
+
break;
|
|
2152
|
+
// Pairwise mode runs the metric on corresponding pairs of a[] and b[]
|
|
2153
|
+
case 'pairwise':
|
|
2154
|
+
if (this.isPairwise())
|
|
2155
|
+
await this.runPairwiseAsync();
|
|
2156
|
+
break;
|
|
2157
|
+
// Unsupported mode
|
|
2158
|
+
default: throw new Error(`unsupported async mode <${mode}>`);
|
|
2159
|
+
}
|
|
2160
|
+
}
|
|
2161
|
+
/**
|
|
2162
|
+
* Get the name of the metric.
|
|
2163
|
+
*
|
|
2164
|
+
* @returns {string} - The name of the metric
|
|
2165
|
+
*/
|
|
2166
|
+
getMetricName() { return this.metric; }
|
|
2167
|
+
/**
|
|
2168
|
+
* Get the result of the metric computation.
|
|
2169
|
+
*
|
|
2170
|
+
* @returns {MetricResult<R>} - The result of the metric computation
|
|
2171
|
+
* @throws {Error} - If `run()` has not been called before this method
|
|
2172
|
+
*/
|
|
2173
|
+
getResults() {
|
|
2174
|
+
// Ensure that the metric has been run before getting the result
|
|
2175
|
+
if (this.results === undefined)
|
|
2176
|
+
throw new Error(`run() must be called before getResult()`);
|
|
2177
|
+
// Return the result(s)
|
|
2178
|
+
return this.results;
|
|
2179
|
+
}
|
|
2180
|
+
}
|
|
2181
|
+
/**
|
|
2182
|
+
* Metric registry service for managing metric implementations.
|
|
2183
|
+
*
|
|
2184
|
+
* This registry allows for dynamic registration and retrieval of metric classes,
|
|
2185
|
+
* enabling the use of various string similarity metrics in a consistent manner.
|
|
2186
|
+
*/
|
|
2187
|
+
const MetricRegistry = Registry('metric', Metric);
|
|
2188
|
+
|
|
2189
|
+
/**
|
|
2190
|
+
* Pool Utility
|
|
2191
|
+
* src/utils/Pool.ts
|
|
2192
|
+
*
|
|
2193
|
+
* @see https://en.wikipedia.org/wiki/Circular_buffer
|
|
2194
|
+
*
|
|
2195
|
+
* The Pool class provides a simple and efficient buffer pool for dynamic programming
|
|
2196
|
+
* algorithms that require temporary arrays (such as Levenshtein, LCS, etc.).
|
|
2197
|
+
* By reusing pre-allocated typed arrays, it reduces memory allocations and garbage
|
|
2198
|
+
* collection overhead, especially for repeated or batch computations.
|
|
2199
|
+
*
|
|
2200
|
+
* It supports different types of buffers (Uint16Array, number[], Set, Map) and allows
|
|
2201
|
+
* for acquiring buffers of specific sizes while managing a maximum pool size.
|
|
2202
|
+
*
|
|
2203
|
+
* @module Utils/Pool
|
|
2204
|
+
* @author Paul Köhler (komed3)
|
|
2205
|
+
* @license MIT
|
|
2206
|
+
*/
|
|
2207
|
+
/**
 * RingPool is a bounded pool of reusable buffers, scanned in ring order.
 *
 * Entries are `{ buffer, size }` records. A buffer handed out by `acquire()`
 * is removed from the pool and only becomes available again once it is passed
 * back through `release()`. This guarantees that two consecutive acquisitions
 * can never return the same storage.
 *
 * @template T - The type of buffers managed by the pool
 */
class RingPool {
    // The maximum number of buffers kept in the pool
    maxSize;
    // The buffers currently available for acquisition
    buffers = [];
    // Ring position: where the next scan starts / which slot is overwritten when full
    pointer = 0;

    /**
     * Creates a new RingPool with a specified maximum size.
     *
     * @param {number} maxSize - The maximum number of buffers that can be stored in the pool
     */
    constructor(maxSize) {
        this.maxSize = maxSize;
    }

    /**
     * Acquires a buffer from the pool and removes it from the pool.
     *
     * An entry matches when its size equals `minSize`, or when it is larger
     * and `allowOversize` is set. The scan starts at the ring pointer and
     * continues over all entries, so an exact-size entry is still found when
     * larger entries precede it.
     *
     * @param {number} minSize - The minimum size of the buffer to acquire
     * @param {boolean} allowOversize - Whether to allow buffers larger than minSize
     * @return {PoolBuffer<T>|null} - The acquired entry or null if no suitable buffer is found
     */
    acquire(minSize, allowOversize) {
        const len = this.buffers.length;
        for (let i = 0; i < len; i++) {
            const idx = (this.pointer + i) % len;
            const item = this.buffers[idx];
            const fits = item.size === minSize || (allowOversize && item.size > minSize);
            if (!fits) {
                continue;
            }
            // Take the entry out so it cannot be handed out again while in use
            this.buffers.splice(idx, 1);
            // The former successor now sits at `idx`; wrap if we removed the tail
            const remaining = len - 1;
            this.pointer = remaining === 0 ? 0 : idx % remaining;
            return item;
        }
        // No suitable buffer is available
        return null;
    }

    /**
     * Releases a buffer back to the pool.
     *
     * While the pool has room the entry is appended; once full, the entry at
     * the ring pointer is overwritten and the pointer advances. A pool with
     * a non-positive `maxSize` stores nothing.
     *
     * @param {PoolBuffer<T>} item - The buffer to release back to the pool
     */
    release(item) {
        // A zero-capacity pool cannot hold anything (and must not corrupt the pointer)
        if (!(this.maxSize > 0)) {
            return;
        }
        if (this.buffers.length < this.maxSize) {
            this.buffers.push(item);
            return;
        }
        // Pool is full: overwrite the slot at the ring pointer
        this.buffers[this.pointer] = item;
        this.pointer = (this.pointer + 1) % this.maxSize;
    }

    /**
     * Clears the pool, removing all buffers and resetting the ring pointer.
     */
    clear() {
        this.buffers = [];
        this.pointer = 0;
    }
}
|
|
2279
|
+
/**
 * Static facade over one RingPool per buffer type.
 *
 * Supported types are 'uint16' (Uint16Array), 'number[]', 'set' and 'map'.
 * Requests larger than a type's `maxItemSize` bypass the ring entirely and
 * always get a fresh allocation; with `maxItemSize: 0`, 'set' and 'map' are
 * therefore only pooled for sizes that are not greater than zero.
 */
class Pool {
    // Per-type settings: ring capacity, largest poolable size, oversize policy
    static CONFIG = {
        'uint16': { type: 'uint16', maxSize: 32, maxItemSize: 2048, allowOversize: true },
        'number[]': { type: 'number[]', maxSize: 16, maxItemSize: 1024, allowOversize: false },
        'set': { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
        'map': { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
    };
    // One ring per type, sized like the matching CONFIG entry
    static POOLS = {
        'uint16': new RingPool(32),
        'number[]': new RingPool(16),
        'set': new RingPool(8),
        'map': new RingPool(8)
    };

    /**
     * Creates a brand-new buffer of the given type.
     *
     * `size` only matters for 'uint16' and 'number[]' (the latter is
     * zero-filled); sets and maps start empty. Unknown types yield `undefined`.
     *
     * @param {PoolType} type - The type of buffer to allocate
     * @param {number} size - The size of the buffer to allocate
     * @return {any} - The newly allocated buffer
     */
    static allocate(type, size) {
        if (type === 'uint16') {
            return new Uint16Array(size);
        }
        if (type === 'number[]') {
            return new Array(size).fill(0);
        }
        if (type === 'set') {
            return new Set();
        }
        if (type === 'map') {
            return new Map();
        }
    }

    /**
     * Hands out a buffer of the given type, reusing a pooled one when possible.
     *
     * Pooled 'uint16' buffers are returned as a `subarray(0, size)` view; other
     * types are returned as stored. Reused buffers are not reset here.
     *
     * @param {PoolType} type - The type of buffer to acquire (e.g., 'uint16', 'number[]', 'set', 'map')
     * @param {number} size - The size of the buffer to acquire
     * @return {T} - The acquired buffer of the specified type
     */
    static acquire(type, size) {
        const { maxItemSize, allowOversize } = this.CONFIG[type];
        // Only sizes within the poolable range are looked up in the ring
        if (!(size > maxItemSize)) {
            const pooled = this.POOLS[type].acquire(size, allowOversize);
            if (pooled) {
                if (type === 'uint16') {
                    // Trim the (possibly larger) typed array to the requested length
                    return pooled.buffer.subarray(0, size);
                }
                return pooled.buffer;
            }
        }
        // Too large for the pool, or nothing suitable pooled: allocate fresh
        return this.allocate(type, size);
    }

    /**
     * Acquires one buffer per requested size, in order.
     *
     * @param {PoolType} type - The type of buffers to acquire
     * @param {number[]} sizes - An array of sizes for each buffer to acquire
     * @return {T[]} - An array of acquired buffers of the specified type
     */
    static acquireMany(type, sizes) {
        return sizes.map((size) => {
            return this.acquire(type, size);
        });
    }

    /**
     * Offers a buffer back to the pool.
     *
     * Buffers whose `size` exceeds the type's `maxItemSize` are silently dropped.
     *
     * @param {PoolType} type - The type of buffer to release
     * @param {T} buffer - The buffer to release
     * @param {number} size - The size of the buffer
     */
    static release(type, buffer, size) {
        const { maxItemSize } = this.CONFIG[type];
        // Not poolable: leave the buffer to the garbage collector
        if (!(size <= maxItemSize)) {
            return;
        }
        this.POOLS[type].release({ buffer, size });
    }
}
|
|
2367
|
+
|
|
2368
|
+
/**
|
|
2369
|
+
* Cosine Similarity
|
|
2370
|
+
* src/metric/Cosine.ts
|
|
2371
|
+
*
|
|
2372
|
+
* @see https://en.wikipedia.org/wiki/Cosine_similarity
|
|
2373
|
+
*
|
|
2374
|
+
* Cosine similarity is a metric used to measure how similar two vectors are, regardless
|
|
2375
|
+
* of their magnitude. In text analysis, it is commonly used to compare documents or
|
|
2376
|
+
* strings by representing them as term frequency vectors and computing the cosine of
|
|
2377
|
+
* the angle between these vectors.
|
|
2378
|
+
*
|
|
2379
|
+
* The result is a value between 0 and 1, where 1 means the vectors are identical and
|
|
2380
|
+
* 0 means they are orthogonal (no similarity).
|
|
2381
|
+
*
|
|
2382
|
+
* @module Metric/CosineSimilarity
|
|
2383
|
+
* @author Paul Köhler (komed3)
|
|
2384
|
+
* @license MIT
|
|
2385
|
+
*/
|
|
2386
|
+
/**
 * CosineSimilarity implements the cosine similarity on term-frequency vectors
 * as a Metric subclass registered under the name 'cosine'.
 */
class CosineSimilarity extends Metric {
    /**
     * Creates the metric for the given inputs.
     *
     * The metric is registered with the parent as symmetric.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        super('cosine', a, b, opt, true);
    }

    /**
     * Counts how often each term occurs in a string.
     *
     * @param {string} str - The input string
     * @param {string} delimiter - The delimiter used to split the string into terms
     * @return {Map<string, number>} - Map from term to occurrence count
     */
    _termFreq(str, delimiter) {
        const tokens = str.split(delimiter);
        const counts = Pool.acquire('map', tokens.length);
        for (let k = 0; k < tokens.length; k++) {
            const token = tokens[k];
            const seen = counts.get(token) || 0;
            counts.set(token, seen + 1);
        }
        return counts;
    }

    /**
     * Calculates the cosine similarity between two strings.
     *
     * Terms are obtained by splitting on `options.delimiter` (a single space
     * when the option is undefined).
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @return {MetricCompute<CosineRaw>} - `res` is the clamped similarity (0 when
     *         either magnitude is 0); `raw` holds dot product and both magnitudes
     */
    compute(a, b) {
        // Fall back to a space only when no delimiter is configured at all
        const configured = this.options.delimiter;
        const separator = configured === undefined ? ' ' : configured;
        const vectorA = this._termFreq(a, separator);
        const vectorB = this._termFreq(b, separator);
        let dotProduct = 0;
        let squaresA = 0;
        let squaresB = 0;
        // One pass over A yields the dot product and A's squared length
        vectorA.forEach((countA, term) => {
            const countB = vectorB.get(term) || 0;
            dotProduct += countA * countB;
            squaresA += countA * countA;
        });
        // B's squared length needs its own pass
        vectorB.forEach((countB) => {
            squaresB += countB * countB;
        });
        const magnitudeA = Math.sqrt(squaresA);
        const magnitudeB = Math.sqrt(squaresB);
        // Hand both maps back to the pool
        Pool.release('map', vectorA, vectorA.size);
        Pool.release('map', vectorB, vectorB.size);
        // A zero magnitude would divide by zero; report 0 similarity instead
        let res = 0;
        if (magnitudeA && magnitudeB) {
            res = Metric.clamp(dotProduct / (magnitudeA * magnitudeB));
        }
        return {
            res,
            raw: { dotProduct, magnitudeA, magnitudeB }
        };
    }
}
|
|
2455
|
+
// Register the Cosine similarity in the metric registry
|
|
2456
|
+
MetricRegistry.add('cosine', CosineSimilarity);
|
|
2457
|
+
|
|
2458
|
+
/**
|
|
2459
|
+
* Damerau-Levenshtein Distance
|
|
2460
|
+
* src/metric/DamerauLevenshtein.ts
|
|
2461
|
+
*
|
|
2462
|
+
* @see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
|
|
2463
|
+
*
|
|
2464
|
+
* The Damerau-Levenshtein distance extends the classical Levenshtein algorithm by
|
|
2465
|
+
* including transpositions (swapping of two adjacent characters) as a single edit
|
|
2466
|
+
* operation, in addition to insertions, deletions, and substitutions.
|
|
2467
|
+
*
|
|
2468
|
+
* This metric is particularly useful for detecting and correcting common
|
|
2469
|
+
* typographical errors.
|
|
2470
|
+
*
|
|
2471
|
+
* @module Metric/DamerauLevenshtein
|
|
2472
|
+
* @author Paul Köhler (komed3)
|
|
2473
|
+
* @license MIT
|
|
2474
|
+
*/
|
|
2475
|
+
/**
 * DamerauLevenshteinDistance implements an edit distance with adjacent
 * transpositions as a Metric subclass registered under the name 'damerau'.
 */
class DamerauLevenshteinDistance extends Metric {
    /**
     * Creates the metric for the given inputs.
     *
     * The metric is registered with the parent as symmetric.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        super('damerau', a, b, opt, true);
    }

    /**
     * Calculates the normalized Damerau-Levenshtein similarity of two strings.
     *
     * Three rolling rows of length `m + 1` are used instead of a full matrix:
     * the row being filled, the previous row, and the row before that (needed
     * for the transposition lookup).
     *
     * @param {string} a - First string (the rows are sized by its length)
     * @param {string} b - Second string (one row is computed per character)
     * @param {number} m - Length of the first string (a)
     * @param {number} n - Length of the second string (b)
     * @param {number} maxLen - Maximum length of the strings
     * @return {MetricCompute<DamerauRaw>} - `res` is `1 - dist / maxLen`, clamped
     *         (1 when `maxLen` is 0); `raw` holds `dist` and `maxLen`
     */
    compute(a, b, m, n, maxLen) {
        const width = m + 1;
        const [beforeLast, last, active] = Pool.acquireMany('uint16', [width, width, width]);
        // Row 0: distance from the empty prefix of b to every prefix of a
        for (let col = 0; col < width; col++) {
            last[col] = col;
        }
        for (let row = 1; row <= n; row++) {
            // Column 0: distance from the empty prefix of a to b[0..row]
            active[0] = row;
            // Char code of the current character in b
            const codeB = b.charCodeAt(row - 1);
            for (let col = 1; col <= m; col++) {
                // Char code of the current character in a
                const codeA = a.charCodeAt(col - 1);
                const cost = codeA === codeB ? 0 : 1;
                let cell = Math.min(
                    active[col - 1] + 1,   // insertion
                    last[col] + 1,         // deletion
                    last[col - 1] + cost   // substitution
                );
                // Adjacent characters swapped between the two strings?
                const swappedPair = col > 1 && row > 1 &&
                    codeA === b.charCodeAt(row - 2) &&
                    codeB === a.charCodeAt(col - 2);
                if (swappedPair) {
                    const transposed = beforeLast[col - 2] + cost;
                    if (transposed < cell) {
                        cell = transposed;
                    }
                }
                active[col] = cell;
            }
            // Shift the rows by copying their contents: beforeLast <- last <- active
            beforeLast.set(last);
            last.set(active);
        }
        // After the final shift the bottom-right cell lives in `last`
        const dist = last[m];
        // Hand the three rows back to the pool
        Pool.release('uint16', beforeLast, width);
        Pool.release('uint16', last, width);
        Pool.release('uint16', active, width);
        // Normalize by the length of the longer string
        const res = maxLen === 0 ? 1 : Metric.clamp(1 - (dist / maxLen));
        return {
            res,
            raw: { dist, maxLen }
        };
    }
}
|
|
2554
|
+
// Register the Damerau-Levenshtein distance in the metric registry
|
|
2555
|
+
MetricRegistry.add('damerau', DamerauLevenshteinDistance);
|
|
2556
|
+
|
|
2557
|
+
/**
|
|
2558
|
+
* Dice-Sørensen Coefficient
|
|
2559
|
+
* src/metric/DiceSorensen.ts
|
|
2560
|
+
*
|
|
2561
|
+
* @see https://en.wikipedia.org/wiki/Dice-S%C3%B8rensen_coefficient
|
|
2562
|
+
*
|
|
2563
|
+
* This module implements the Dice-Sørensen coefficient, a statistic used to gauge
|
|
2564
|
+
* the similarity of two samples. It is commonly used in natural language processing
|
|
2565
|
+
* and information retrieval to compare the similarity between two sets of data,
|
|
2566
|
+
* such as text documents. The coefficient is defined as twice the size of the
|
|
2567
|
+
* intersection divided by the sum of the sizes of the two sets.
|
|
2568
|
+
*
|
|
2569
|
+
* The implementation includes methods to compute bigrams from strings and calculate
|
|
2570
|
+
* the coefficient based on these bigrams. It handles edge cases, such as empty
|
|
2571
|
+
* strings and identical strings, to ensure accurate results.
|
|
2572
|
+
*
|
|
2573
|
+
* @module Metric/DiceSorensenCoefficient
|
|
2574
|
+
* @author Paul Köhler (komed3)
|
|
2575
|
+
* @license MIT
|
|
2576
|
+
*/
|
|
2577
|
+
/**
 * DiceSorensenCoefficient implements the Dice-Sørensen coefficient on
 * character bigrams as a Metric subclass registered under the name 'dice'.
 */
class DiceSorensenCoefficient extends Metric {
    /**
     * Creates the metric for the given inputs.
     *
     * The metric is registered with the parent as symmetric.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        super('dice', a, b, opt, true);
    }

    /**
     * Collects the distinct bigrams of a string.
     *
     * Strings shorter than two characters produce no bigrams.
     *
     * @param {string} str - The input string
     * @return {Set<string>} - The set of two-character substrings of `str`
     */
    _bigrams(str) {
        const count = str.length - 1;
        const grams = Pool.acquire('set', count);
        let start = 0;
        while (start < count) {
            grams.add(str.substring(start, start + 2));
            start++;
        }
        return grams;
    }

    /**
     * Calculates the Dice-Sørensen coefficient between two strings.
     *
     * NOTE(review): when neither string yields a bigram the result is 1,
     * regardless of the strings' content — confirm that trivial/unequal
     * short inputs are handled before this method is reached.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @return {MetricCompute<DiceRaw>} - `res` is `2 * intersection / size`, clamped
     *         (1 when `size` is 0); `raw` holds `intersection` and `size`, where
     *         `size` is the sum of both bigram-set sizes
     */
    compute(a, b) {
        const gramsA = this._bigrams(a);
        const gramsB = this._bigrams(b);
        // Count the bigrams present in both sets
        let intersection = 0;
        gramsA.forEach((gram) => {
            if (gramsB.has(gram)) {
                intersection += 1;
            }
        });
        // Denominator: combined number of distinct bigrams of both strings
        const sizeA = gramsA.size;
        const sizeB = gramsB.size;
        const size = sizeA + sizeB;
        // Hand both sets back to the pool
        Pool.release('set', gramsA, sizeA);
        Pool.release('set', gramsB, sizeB);
        const res = size === 0 ? 1 : Metric.clamp((2 * intersection) / size);
        return {
            res,
            raw: { intersection, size }
        };
    }
}
|
|
2639
|
+
// Register the Dice-Sørensen coefficient in the metric registry
|
|
2640
|
+
// Adds the DiceSorensenCoefficient class to MetricRegistry under the key 'dice'
MetricRegistry.add('dice', DiceSorensenCoefficient);
|
|
2641
|
+
|
|
2642
|
+
/**
|
|
2643
|
+
* Hamming Distance
|
|
2644
|
+
* src/metric/Hamming.ts
|
|
2645
|
+
*
|
|
2646
|
+
* @see https://en.wikipedia.org/wiki/Hamming_distance
|
|
2647
|
+
*
|
|
2648
|
+
* The Hamming distance is a metric for comparing two strings of equal length. It
|
|
2649
|
+
* measures the number of positions at which the corresponding symbols are different.
|
|
2650
|
+
*
|
|
2651
|
+
* This implementation allows for optional padding of the shorter string to equalize
|
|
2652
|
+
* lengths, otherwise it throws an error if the strings are of unequal length.
|
|
2653
|
+
*
|
|
2654
|
+
* @module Metric/HammingDistance
|
|
2655
|
+
* @author Paul Köhler (komed3)
|
|
2656
|
+
* @license MIT
|
|
2657
|
+
*/
|
|
2658
|
+
/**
|
|
2659
|
+
* HammingDistance class extends the Metric class to implement the Hamming distance.
|
|
2660
|
+
*/
|
|
2661
|
+
class HammingDistance extends Metric {
    /**
     * Creates a Hamming distance metric for the given inputs.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // The last argument marks the metric as symmetrical
        super('hamming', a, b, opt, true);
    }
    /**
     * Counts the positions at which two strings differ and turns that
     * count into a similarity score.
     *
     * When the lengths differ, the shorter string is padded at its end with
     * `options.pad` up to `maxLen`; without that option an error is thrown.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @param {number} maxLen - Maximum length of the strings
     * @return {MetricCompute<HammingRaw>} - Similarity in `res`, mismatch count in `raw.dist`
     * @throws {Error} - If the lengths differ and `options.pad` is undefined
     */
    compute(a, b, m, n, maxLen) {
        if (m !== n) {
            const { pad } = this.options;
            // Without a padding string unequal lengths cannot be compared
            if (pad === undefined) {
                throw new Error(`strings must be of equal length for Hamming Distance, a=${m} and b=${n} given, ` +
                    `use option.pad for automatic adjustment`);
            }
            // Extend whichever input is shorter than the common target length
            if (m < maxLen) {
                a = a.padEnd(maxLen, pad);
            }
            if (n < maxLen) {
                b = b.padEnd(maxLen, pad);
            }
            m = n = maxLen;
        }
        // Walk both strings in lockstep and count differing positions
        let dist = 0;
        for (let pos = 0; pos < a.length; pos++) {
            dist += a[pos] !== b[pos] ? 1 : 0;
        }
        // Zero-length inputs are treated as a perfect match
        const res = m === 0 ? 1 : Metric.clamp(1 - dist / m);
        return { res, raw: { dist } };
    }
}
|
|
2716
|
+
// Register the Hamming distance in the metric registry
|
|
2717
|
+
// Adds the HammingDistance class to MetricRegistry under the key 'hamming'
MetricRegistry.add('hamming', HammingDistance);
|
|
2718
|
+
|
|
2719
|
+
/**
|
|
2720
|
+
* Jaccard Index
|
|
2721
|
+
* src/metric/Jaccard.ts
|
|
2722
|
+
*
|
|
2723
|
+
* @see https://en.wikipedia.org/wiki/Jaccard_index
|
|
2724
|
+
*
|
|
2725
|
+
* The Jaccard Index (or Jaccard similarity coefficient) measures the similarity
|
|
2726
|
+
* between two sets by dividing the size of their intersection by the size of
|
|
2727
|
+
* their union. In string similarity, it is often used to compare sets of characters,
|
|
2728
|
+
* tokens, or n-grams. The result is a value between 0 and 1, where 1 means the
|
|
2729
|
+
* sets are identical and 0 means they have no elements in common.
|
|
2730
|
+
*
|
|
2731
|
+
* @module Metric/JaccardIndex
|
|
2732
|
+
* @author Paul Köhler (komed3)
|
|
2733
|
+
* @license MIT
|
|
2734
|
+
*/
|
|
2735
|
+
/**
|
|
2736
|
+
* JaccardIndex class extends the Metric class to implement the Jaccard Index algorithm.
|
|
2737
|
+
*/
|
|
2738
|
+
class JaccardIndex extends Metric {
    /**
     * Creates a Jaccard index metric for the given inputs.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // The last argument marks the metric as symmetrical
        super('jaccard', a, b, opt, true);
    }
    /**
     * Compares the sets of characters of two strings: the number of shared
     * characters divided by the number of distinct characters overall.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @return {MetricCompute<JaccardRaw>} - Similarity in `res`, set sizes in `raw`
     */
    compute(a, b, m, n) {
        // Borrow one set per input, sized by the string lengths
        const [charsA, charsB] = Pool.acquireMany('set', [m, n]);
        // Collect the distinct characters of each string
        for (const ch of a) {
            charsA.add(ch);
        }
        for (const ch of b) {
            charsB.add(ch);
        }
        // Count characters present on both sides
        let intersection = 0;
        charsA.forEach((ch) => {
            if (charsB.has(ch)) {
                intersection++;
            }
        });
        // |A ∪ B| = |A| + |B| - |A ∩ B|
        const union = charsA.size + charsB.size - intersection;
        // Hand the sets back to the pool before building the result
        Pool.release('set', charsA, m);
        Pool.release('set', charsB, n);
        // Two empty character sets count as identical
        if (union === 0) {
            return { res: 1, raw: { intersection, union } };
        }
        return {
            res: Metric.clamp(intersection / union),
            raw: { intersection, union }
        };
    }
}
|
|
2788
|
+
// Register the Jaccard index in the metric registry
|
|
2789
|
+
// Adds the JaccardIndex class to MetricRegistry under the key 'jaccard'
MetricRegistry.add('jaccard', JaccardIndex);
|
|
2790
|
+
|
|
2791
|
+
/**
|
|
2792
|
+
* Jaro-Winkler Distance
|
|
2793
|
+
* src/metric/JaroWinkler.ts
|
|
2794
|
+
*
|
|
2795
|
+
* @see https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
|
|
2796
|
+
*
|
|
2797
|
+
* The Jaro-Winkler distance is a string similarity metric that gives more weight
|
|
2798
|
+
* to matching characters at the start of the strings. It is especially effective
|
|
2799
|
+
* for short strings and typographical errors, and is widely used in record linkage
|
|
2800
|
+
* and duplicate detection.
|
|
2801
|
+
*
|
|
2802
|
+
* @module Metric/JaroWinkler
|
|
2803
|
+
* @author Paul Köhler (komed3)
|
|
2804
|
+
* @license MIT
|
|
2805
|
+
*/
|
|
2806
|
+
/**
|
|
2807
|
+
* JaroWinklerDistance class extends the Metric class to implement the Jaro-Winkler algorithm.
|
|
2808
|
+
*/
|
|
2809
|
+
class JaroWinklerDistance extends Metric {
    /**
     * Creates a Jaro-Winkler metric for the given inputs.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // The last argument marks the metric as symmetrical
        super('jaro-winkler', a, b, opt, true);
    }
    /**
     * Computes the Jaro similarity of two strings and boosts it by the
     * length of their common prefix (at most four characters).
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @return {MetricCompute<JaroWinklerRaw>} - Similarity in `res`, intermediate values in `raw`
     */
    compute(a, b, m, n) {
        // Characters only count as matching when they are at most this far apart
        const matchWindow = Math.max(0, Math.floor(n / 2) - 1);
        // Flag arrays (0 / 1) remembering which positions are already matched
        const matchA = Pool.acquire('uint16', m);
        const matchB = Pool.acquire('uint16', n);
        // Reset the flags for the part of each array that is used below
        matchA.fill(0, 0, m);
        matchB.fill(0, 0, n);
        // Pair every character of a with the first free, equal character
        // of b that lies inside the match window
        let matches = 0;
        for (let i = 0; i < m; i++) {
            const lo = Math.max(0, i - matchWindow);
            const hi = Math.min(i + matchWindow + 1, n);
            for (let j = lo; j < hi; j++) {
                if (matchB[j] || a[i] !== b[j]) {
                    continue;
                }
                matchA[i] = 1;
                matchB[j] = 1;
                matches++;
                break;
            }
        }
        // Defaults that are reported when nothing matched at all
        let transpos = 0, jaro = 0, prefix = 0, res = 0;
        if (matches > 0) {
            // Walk the matched characters of both strings in order; every
            // pair that disagrees is half a transposition
            let k = 0;
            for (let i = 0; i < m; i++) {
                if (!matchA[i]) {
                    continue;
                }
                while (!matchB[k]) {
                    k++;
                }
                if (a[i] !== b[k]) {
                    transpos++;
                }
                k++;
            }
            transpos /= 2;
            // Jaro similarity
            jaro = ((matches / m) + (matches / n) +
                (matches - transpos) / matches) / 3;
            // Length of the common prefix, capped at four characters
            const prefixLimit = Math.min(4, m, n);
            while (prefix < prefixLimit && a[prefix] === b[prefix]) {
                prefix++;
            }
            // Winkler adjustment with a fixed scaling factor of 0.1
            res = jaro + prefix * 0.1 * (1 - jaro);
        }
        // Return the flag arrays to the pool
        Pool.release('uint16', matchA, m);
        Pool.release('uint16', matchB, n);
        return {
            res: Metric.clamp(res),
            raw: { matchWindow, matches, transpos, jaro, prefix }
        };
    }
}
|
|
2898
|
+
// Register the Jaro-Winkler distance in the metric registry
|
|
2899
|
+
// Adds the JaroWinklerDistance class to MetricRegistry under the key 'jaroWinkler'.
// NOTE(review): the class constructor passes the name 'jaro-winkler' to Metric,
// which differs from this registry key; confirm the mismatch is intended.
MetricRegistry.add('jaroWinkler', JaroWinklerDistance);
|
|
2900
|
+
|
|
2901
|
+
/**
|
|
2902
|
+
* Longest Common Subsequence (LCS)
|
|
2903
|
+
* src/metric/LCS.ts
|
|
2904
|
+
*
|
|
2905
|
+
* @see https://en.wikipedia.org/wiki/Longest_common_subsequence
|
|
2906
|
+
*
|
|
2907
|
+
* The Longest Common Subsequence (LCS) metric measures the length of the longest
|
|
2908
|
+
* subsequence common to both strings. Unlike substrings, the characters of a
|
|
2909
|
+
* subsequence do not need to be contiguous, but must appear in the same order.
|
|
2910
|
+
*
|
|
2911
|
+
* The LCS is widely used in diff tools, bioinformatics, and approximate string
|
|
2912
|
+
* matching.
|
|
2913
|
+
*
|
|
2914
|
+
* @module Metric/LCS
|
|
2915
|
+
* @author Paul Köhler (komed3)
|
|
2916
|
+
* @license MIT
|
|
2917
|
+
*/
|
|
2918
|
+
/**
|
|
2919
|
+
* LCSMetric class extends the Metric class to implement the Longest Common Subsequence algorithm.
|
|
2920
|
+
*/
|
|
2921
|
+
class LCSMetric extends Metric {
    /**
     * Creates a Longest Common Subsequence metric for the given inputs.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // The last argument marks the metric as symmetrical
        super('lcs', a, b, opt, true);
    }
    /**
     * Determines the length of the longest common subsequence of two
     * strings and normalizes it by the length of the longer string.
     *
     * Only two rows of the dynamic-programming table are kept at a time.
     * NOTE(review): the rows are requested from the 'uint16' pool, so
     * presumably lengths beyond 65535 cannot be represented; confirm.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @param {number} maxLen - Maximum length of the strings
     * @return {MetricCompute<LCSRaw>} - Similarity in `res`, LCS length in `raw.lcs`
     */
    compute(a, b, m, n, maxLen) {
        // One cell per prefix of a, including the empty prefix
        const len = m + 1;
        const [prev, curr] = Pool.acquireMany('uint16', [len, len]);
        // Row for the empty prefix of b: nothing in common yet
        prev.fill(0, 0, len);
        // Process b one character (= one table row) at a time
        for (let row = 1; row <= n; row++) {
            const codeB = b.charCodeAt(row - 1);
            curr[0] = 0;
            for (let col = 1; col <= m; col++) {
                // Equal characters extend the diagonal; otherwise carry
                // over the better of the cell above and the cell to the left
                curr[col] = a.charCodeAt(col - 1) === codeB
                    ? prev[col - 1] + 1
                    : Math.max(prev[col], curr[col - 1]);
            }
            // The finished row becomes the reference for the next one
            prev.set(curr);
        }
        // Bottom-right cell of the table
        const lcs = prev[m];
        // Return both rows to the pool
        Pool.release('uint16', prev, len);
        Pool.release('uint16', curr, len);
        // Two empty strings count as identical
        const res = maxLen === 0 ? 1 : Metric.clamp(lcs / maxLen);
        return { res, raw: { lcs, maxLen } };
    }
}
|
|
2982
|
+
// Register the Longest Common Subsequence (LCS) in the metric registry
|
|
2983
|
+
// Adds the LCSMetric class to MetricRegistry under the key 'lcs'
MetricRegistry.add('lcs', LCSMetric);
|
|
2984
|
+
|
|
2985
|
+
/**
|
|
2986
|
+
* Levenshtein Distance
|
|
2987
|
+
* src/metric/Levenshtein.ts
|
|
2988
|
+
*
|
|
2989
|
+
* @see https://en.wikipedia.org/wiki/Levenshtein_distance
|
|
2990
|
+
*
|
|
2991
|
+
* The Levenshtein distance is a classic metric for measuring the minimum number
|
|
2992
|
+
* of single-character edits (insertions, deletions, or substitutions) required
|
|
2993
|
+
* to change one string into another.
|
|
2994
|
+
*
|
|
2995
|
+
* It is widely used in approximate string matching, spell checking, and natural
|
|
2996
|
+
* language processing.
|
|
2997
|
+
*
|
|
2998
|
+
* @module Metric/LevenshteinDistance
|
|
2999
|
+
* @author Paul Köhler (komed3)
|
|
3000
|
+
* @license MIT
|
|
3001
|
+
*/
|
|
3002
|
+
/**
|
|
3003
|
+
* LevenshteinDistance class extends the Metric class to implement the Levenshtein distance algorithm.
|
|
3004
|
+
*/
|
|
3005
|
+
class LevenshteinDistance extends Metric {
    /**
     * Creates a Levenshtein distance metric for the given inputs.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // The last argument marks the metric as symmetrical
        super('levenshtein', a, b, opt, true);
    }
    /**
     * Determines the minimum number of single-character insertions,
     * deletions and substitutions that turn one string into the other,
     * and normalizes it by the length of the longer string.
     *
     * Only two rows of the dynamic-programming table are kept at a time.
     * NOTE(review): the rows are requested from the 'uint16' pool, so
     * presumably distances beyond 65535 cannot be represented; confirm.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @param {number} maxLen - Maximum length of the strings
     * @return {MetricCompute<LevenshteinRaw>} - Similarity in `res`, edit distance in `raw.dist`
     */
    compute(a, b, m, n, maxLen) {
        // One cell per prefix of a, including the empty prefix
        const len = m + 1;
        const [prev, curr] = Pool.acquireMany('uint16', [len, len]);
        // Against the empty prefix of b, each prefix of a costs its own length
        for (let col = 0; col < len; col++) {
            prev[col] = col;
        }
        // Process b one character (= one table row) at a time
        for (let row = 1; row <= n; row++) {
            const codeB = b.charCodeAt(row - 1);
            // Against the empty prefix of a, the cost is the prefix length of b
            curr[0] = row;
            for (let col = 1; col <= m; col++) {
                const insertion = curr[col - 1] + 1;
                const deletion = prev[col] + 1;
                // Substituting is free when both characters are equal
                const substitution = prev[col - 1] +
                    (a.charCodeAt(col - 1) === codeB ? 0 : 1);
                curr[col] = Math.min(insertion, deletion, substitution);
            }
            // The finished row becomes the reference for the next one
            prev.set(curr);
        }
        // Bottom-right cell of the table
        const dist = prev[m];
        // Return both rows to the pool
        Pool.release('uint16', prev, len);
        Pool.release('uint16', curr, len);
        // Two empty strings count as identical
        const res = maxLen === 0 ? 1 : Metric.clamp(1 - dist / maxLen);
        return { res, raw: { dist, maxLen } };
    }
}
|
|
3068
|
+
// Register the Levenshtein distance in the metric registry
|
|
3069
|
+
// Adds the LevenshteinDistance class to MetricRegistry under the key 'levenshtein'
MetricRegistry.add('levenshtein', LevenshteinDistance);
|
|
3070
|
+
|
|
3071
|
+
/**
|
|
3072
|
+
* Needleman-Wunsch Algorithm
|
|
3073
|
+
* src/metric/NeedlemanWunsch.ts
|
|
3074
|
+
*
|
|
3075
|
+
* @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
|
|
3076
|
+
*
|
|
3077
|
+
* The Needleman-Wunsch algorithm performs global alignment, aligning two strings
|
|
3078
|
+
* entirely, including gaps. It is commonly used in bioinformatics for sequence
|
|
3079
|
+
* alignment.
|
|
3080
|
+
*
|
|
3081
|
+
* @module Metric/NeedlemanWunsch
|
|
3082
|
+
* @author Paul Köhler (komed3)
|
|
3083
|
+
* @license MIT
|
|
3084
|
+
*/
|
|
3085
|
+
/**
|
|
3086
|
+
* NeedlemanWunschDistance class extends the Metric class to implement the Needleman-Wunsch algorithm.
|
|
3087
|
+
*/
|
|
3088
|
+
class NeedlemanWunschDistance extends Metric {
    /**
     * Constructor for the NeedlemanWunsch class.
     *
     * Initializes the Needleman-Wunsch metric with two input strings or
     * arrays of strings and optional options.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // Call the parent Metric constructor with the metric name and inputs
        // Metric is symmetrical
        super('needlemanWunsch', a, b, opt, true);
    }
    /**
     * Calculates the Needleman-Wunsch global alignment score between two strings.
     *
     * The scoring parameters are read from the options (`match`, default 1;
     * `mismatch`, default -1; `gap`, default -1). The score is normalized by
     * `maxLen * match`, the score of a gap-free alignment of equal characters.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @param {number} maxLen - Maximum length of the strings
     * @return {MetricCompute<NeedlemanRaw>} - Object containing the similarity result and raw score
     */
    compute(a, b, m, n, maxLen) {
        // Scoring parameters (can be customized via options if needed)
        const { match = 1, mismatch = -1, gap = -1 } = this.options;
        // Two rows of the DP table are enough. The rows must be able to hold
        // NEGATIVE values: gap and mismatch penalties are below zero, so an
        // unsigned 16-bit buffer would wrap them around (e.g. -1 -> 65535)
        // and corrupt every comparison in the recurrence. Signed
        // floating-point rows store the scores unchanged.
        const len = m + 1;
        let prev = new Float64Array(len);
        let curr = new Float64Array(len);
        // Initialize the first row: aligning a prefix of a against the empty
        // string costs one gap per character (prev[0] is already 0)
        for (let i = 1; i <= m; i++) {
            prev[i] = prev[i - 1] + gap;
        }
        // Fill the DP matrix row by row, one character of b at a time
        for (let j = 1; j <= n; j++) {
            // Aligning a prefix of b against the empty string: one more gap
            curr[0] = prev[0] + gap;
            // Get the character code of the current character in b
            const cb = b.charCodeAt(j - 1);
            for (let i = 1; i <= m; i++) {
                // Score for match / mismatch
                const score = a.charCodeAt(i - 1) === cb ? match : mismatch;
                // Calculate the maximum score for current cell
                curr[i] = Math.max(
                    prev[i - 1] + score, // Diagonal (match/mismatch)
                    prev[i] + gap,       // Up (gap)
                    curr[i - 1] + gap    // Left (gap)
                );
            }
            // The finished row becomes the previous row; the old previous
            // row is recycled and fully overwritten in the next iteration
            [prev, curr] = [curr, prev];
        }
        // The last value in prev is the Needleman-Wunsch score
        const score = prev[m];
        // Use the maximum possible score for the longer string (global alignment)
        const denum = maxLen * match;
        // Return the result as a MetricCompute object
        return {
            res: denum === 0 ? 0 : Metric.clamp(score / denum),
            raw: { score, denum }
        };
    }
}
|
|
3155
|
+
// Register the Needleman-Wunsch algorithm in the metric registry
|
|
3156
|
+
// Adds the NeedlemanWunschDistance class to MetricRegistry under the key 'needlemanWunsch'
MetricRegistry.add('needlemanWunsch', NeedlemanWunschDistance);
|
|
3157
|
+
|
|
3158
|
+
/**
|
|
3159
|
+
* q-Gram Similarity
|
|
3160
|
+
* src/metric/QGram.ts
|
|
3161
|
+
*
|
|
3162
|
+
* @see https://en.wikipedia.org/wiki/Q-gram
|
|
3163
|
+
*
|
|
3164
|
+
* Q-gram similarity is a string-matching algorithm that compares two strings by
|
|
3165
|
+
* breaking them into substrings (q-grams) of length Q. The similarity is computed
|
|
3166
|
+
* as the size of the intersection of q-gram sets divided by the size of the larger
|
|
3167
|
+
* set.
|
|
3168
|
+
*
|
|
3169
|
+
* This metric is widely used in approximate string matching, information retrieval,
|
|
3170
|
+
* and computational linguistics.
|
|
3171
|
+
*
|
|
3172
|
+
* @module Metric/QGramSimilarity
|
|
3173
|
+
* @author Paul Köhler (komed3)
|
|
3174
|
+
* @license MIT
|
|
3175
|
+
*/
|
|
3176
|
+
/**
|
|
3177
|
+
* QGramSimilarity class extends the Metric class to implement the q-Gram similarity algorithm.
|
|
3178
|
+
*/
|
|
3179
|
+
class QGramSimilarity extends Metric {
    /**
     * Constructor for the QGramSimilarity class.
     *
     * Initializes the q-Gram similarity metric with two input strings or
     * arrays of strings and optional options.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // Call the parent Metric constructor with the metric name and inputs
        // Metric is symmetrical
        super('qgram', a, b, opt, true);
    }
    /**
     * Converts a string into a set of q-grams (substrings of length q).
     *
     * A string shorter than q yields an empty set.
     *
     * @param {string} str - The input string
     * @param {number} q - The length of each q-gram
     * @return {Set<string>} - Set of q-grams
     */
    _qGrams(str, q) {
        // Number of windows of length q; never negative
        const len = Math.max(0, str.length - q + 1);
        const grams = Pool.acquire('set', len);
        for (let i = 0; i < len; i++) {
            grams.add(str.slice(i, i + q));
        }
        return grams;
    }
    /**
     * Calculates the q-Gram similarity between two strings: the number of
     * shared q-grams divided by the size of the larger q-gram set.
     *
     * The q-gram length is read from `options.q` (default 2).
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @return {MetricCompute<QGramRaw>} - Object containing the similarity result and raw values
     */
    compute(a, b) {
        // Get q from options or use default "2"
        const { q = 2 } = this.options;
        // Generate q-gram sets for both strings
        const setA = this._qGrams(a, q);
        const setB = this._qGrams(b, q);
        // Calculate intersection size
        let intersection = 0;
        for (const gram of setA) {
            if (setB.has(gram)) {
                intersection++;
            }
        }
        // Size of the larger set; read before the sets go back to the pool
        const sizeA = setA.size;
        const sizeB = setB.size;
        const size = Math.max(sizeA, sizeB);
        // Release sets back to the pool
        Pool.release('set', setA, sizeA);
        Pool.release('set', setB, sizeB);
        // When both inputs are shorter than q there are no q-grams at all
        // and the ratio is undefined (0 / 0). Only report a perfect score
        // when the inputs really are equal; two different short strings
        // must not be rated as identical.
        let res;
        if (size === 0) {
            res = a === b ? 1 : 0;
        } else {
            res = Metric.clamp(intersection / size);
        }
        // Return the result as a MetricCompute object
        return {
            res,
            raw: { intersection, size }
        };
    }
}
|
|
3240
|
+
// Register the q-Gram similarity in the metric registry
|
|
3241
|
+
// Adds the QGramSimilarity class to MetricRegistry under the key 'qGram'.
// NOTE(review): the class constructor passes the name 'qgram' to Metric,
// which differs in case from this registry key; confirm the mismatch is intended.
MetricRegistry.add('qGram', QGramSimilarity);
|
|
3242
|
+
|
|
3243
|
+
/**
|
|
3244
|
+
* Smith-Waterman Algorithm
|
|
3245
|
+
* src/metric/SmithWaterman.ts
|
|
3246
|
+
*
|
|
3247
|
+
* @see https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
|
|
3248
|
+
*
|
|
3249
|
+
* The Smith-Waterman algorithm performs local alignment, finding the best matching
|
|
3250
|
+
* subsequence between two strings. It is commonly used in bioinformatics for local
|
|
3251
|
+
* sequence alignment. Instead of looking at the entire sequence, the Smith–Waterman
|
|
3252
|
+
* algorithm compares segments of all possible lengths and optimizes the similarity
|
|
3253
|
+
* measure.
|
|
3254
|
+
*
|
|
3255
|
+
* @module Metric/SmithWatermanDistance
|
|
3256
|
+
* @author Paul Köhler (komed3)
|
|
3257
|
+
* @license MIT
|
|
3258
|
+
*/
|
|
3259
|
+
/**
|
|
3260
|
+
* SmithWatermanDistance class extends the Metric class to implement the Smith-Waterman algorithm.
|
|
3261
|
+
*/
|
|
3262
|
+
class SmithWatermanDistance extends Metric {
    /**
     * Creates a Smith-Waterman metric for the given inputs.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // The last argument marks the metric as symmetrical
        super('smithWaterman', a, b, opt, true);
    }
    /**
     * Determines the best local alignment score of two strings and
     * normalizes it by the best score the shorter string could reach.
     *
     * The scoring parameters are read from the options (`match`, default 2;
     * `mismatch`, default -1; `gap`, default -2). Only two rows of the
     * dynamic-programming table are kept at a time.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @return {MetricCompute<SmithWatermanRaw>} - Similarity in `res`, best score in `raw.score`
     */
    compute(a, b, m, n) {
        const { match = 2, mismatch = -1, gap = -2 } = this.options;
        // One cell per prefix of a, including the empty prefix
        const len = m + 1;
        const [prev, curr] = Pool.acquireMany('uint16', [len, len]);
        // A local alignment may start anywhere, so the first row is all zeros
        prev.fill(0, 0, len);
        // Highest cell value seen anywhere in the table
        let maxScore = 0;
        // Process b one character (= one table row) at a time
        for (let row = 1; row <= n; row++) {
            const codeB = b.charCodeAt(row - 1);
            // The first column is always zero as well
            curr[0] = 0;
            for (let col = 1; col <= m; col++) {
                const diagonal = prev[col - 1] +
                    (a.charCodeAt(col - 1) === codeB ? match : mismatch);
                const up = prev[col] + gap;
                const left = curr[col - 1] + gap;
                // A cell never drops below zero: a bad alignment is restarted
                curr[col] = Math.max(0, diagonal, up, left);
                maxScore = Math.max(maxScore, curr[col]);
            }
            // The finished row becomes the reference for the next one
            prev.set(curr);
        }
        // Return both rows to the pool
        Pool.release('uint16', prev, len);
        Pool.release('uint16', curr, len);
        // Best score reachable by the shorter of the two strings
        const denum = Math.min(m * match, n * match);
        const res = denum === 0 ? 0 : Metric.clamp(maxScore / denum);
        return { res, raw: { score: maxScore, denum } };
    }
}
|
|
3330
|
+
// Register the Smith-Waterman algorithm in the metric registry
|
|
3331
|
+
// Top-level statement: runs once when this module is evaluated. 'smithWaterman' is the name passed to the registry.
MetricRegistry.add('smithWaterman', SmithWatermanDistance);
|
|
3332
|
+
|
|
3333
|
+
/**
|
|
3334
|
+
* Abstract Phonetic
|
|
3335
|
+
* src/phonetic/Phonetic.ts
|
|
3336
|
+
*
|
|
3337
|
+
* @see https://en.wikipedia.org/wiki/Phonetic_algorithm
|
|
3338
|
+
*
|
|
3339
|
+
* A phonetic algorithm refers to a method for indexing words according to their
|
|
3340
|
+
* pronunciation. When the algorithm relies on orthography, it is significantly
|
|
3341
|
+
* influenced by the spelling conventions of the language for which it is intended:
|
|
3342
|
+
* since the majority of phonetic algorithms were created for English, they tend
|
|
3343
|
+
* to be less effective for indexing words in other languages.
|
|
3344
|
+
*
|
|
3345
|
+
* Phonetic search has numerous applications, and one of the initial use cases has
|
|
3346
|
+
* been in trademark searches to verify that newly registered trademarks do not
|
|
3347
|
+
* pose a risk of infringing upon existing trademarks due to their pronunciation.
|
|
3348
|
+
*
|
|
3349
|
+
* This module provides an abstract class for generating phonetic indices based
|
|
3350
|
+
* on mappings and rules. It allows for the implementation of various phonetic
|
|
3351
|
+
* algorithms by extending the abstract class.
|
|
3352
|
+
*
|
|
3353
|
+
* @module Phonetic
|
|
3354
|
+
* @author Paul Köhler (komed3)
|
|
3355
|
+
* @license MIT
|
|
3356
|
+
*/
|
|
3357
|
+
// Get the singleton profiler instance for performance monitoring
|
|
3358
|
+
// Used by Phonetic#getIndex (via run) and Phonetic#getIndexAsync (via runAsync) to wrap each indexing call.
const profiler$1 = Profiler.getInstance();
|
|
3359
|
+
/**
|
|
3360
|
+
* Abstract class representing a phonetic algorithm.
|
|
3361
|
+
*
|
|
3362
|
+
* The protected methods `applyRules`, `encode`, `mapChar`, `equalLen`, `word2Chars`,
|
|
3363
|
+
* `exitEarly`, `adjustCode`, `loop` and `loopAsync` can be overridden in subclasses
|
|
3364
|
+
* to implement specific phonetic algorithms.
|
|
3365
|
+
*
|
|
3366
|
+
* @abstract
|
|
3367
|
+
*/
|
|
3368
|
+
class Phonetic {
    /**
     * Cache for encoded words, shared by all instances and subclasses.
     *
     * NOTE(review): `loop` derives the cache key from the algorithm name and the
     * word only. Instances created with different options (another `map`,
     * `length`, `dedupe`, ...) therefore appear to share entries — confirm that
     * this is intended.
     */
    static cache = new HashTable();
    /**
     * Default phonetic options.
     *
     * Left undefined here; each subclass assigns its own defaults, which the
     * constructor merges with the options passed by the caller.
     */
    static default;
    // Phonetic algorithm name for identification
    algo;
    // Effective options (subclass defaults merged with constructor options)
    options;
    // Mapping object resolved from the PhoneticMappingRegistry
    map;
    /**
     * Static method to clear the cache of indexed words.
     */
    static clear() { this.cache.clear(); }
    /**
     * Constructor for the Phonetic class.
     *
     * Merges the subclass defaults with the given options and resolves the
     * mapping named by `options.map` for the given algorithm.
     *
     * @param {string} algo - The name of the algorithm (e.g. 'soundex')
     * @param {PhoneticOptions} [opt] - Options for the phonetic algorithm
     * @throws {Error} - If the requested mapping is not declared
     */
    constructor(algo, opt = {}) {
        // Set the options by merging the default options with the provided ones
        this.options = merge(this.constructor.default ?? {}, opt);
        // Get the mapping based on the provided options
        const map = PhoneticMappingRegistry.get(algo, this.options.map);
        // If the mapping is not defined, throw an error
        if (map === undefined) {
            throw new Error(`requested mapping <${this.options.map}> is not declared`);
        }
        this.algo = algo;
        this.map = map;
    }
    /**
     * Applies phonetic rules to a character in a word context.
     *
     * Rules are tested in declaration order; the first rule whose conditions
     * all hold wins and its `code` is returned (this may be the empty string,
     * which means "drop this character").
     *
     * Position semantics:
     *  - `start`  : the character is the first one (index 0)
     *  - `end`    : the character is the last one (index `charLen - 1`)
     *  - `middle` : the character is neither the first nor the last one
     *
     * @param {string} char - The current character
     * @param {number} i - The current position within the word
     * @param {string[]} chars - The word as an array of characters
     * @param {number} charLen - The total length of the word
     * @returns {string|undefined} - The rule code or undefined if no rule applies
     */
    applyRules(char, i, chars, charLen) {
        const { ruleset = [] } = this.map;
        // If no rules are provided, return undefined
        if (!ruleset || !ruleset.length) {
            return undefined;
        }
        // Index of the last character; `i` runs from 0 to `last` inclusive
        const last = charLen - 1;
        // Get the surrounding characters ('' when outside of the word)
        const prev = chars[i - 1] || '';
        const prev2 = chars[i - 2] || '';
        const next = chars[i + 1] || '';
        const next2 = chars[i + 2] || '';
        // Iterate over the rules to find a matching rule for the current character
        for (const rule of ruleset) {
            // Skip if the rule does not match the current character
            if (rule.char && rule.char !== char) continue;
            // Position in the word (start, middle, end)
            if (rule.position === 'start' && i !== 0) continue;
            if (rule.position === 'middle' && (i === 0 || i === last)) continue;
            if (rule.position === 'end' && i !== last) continue;
            // Previous character(s)
            if (rule.prev && !rule.prev.includes(prev)) continue;
            if (rule.prevNot && rule.prevNot.includes(prev)) continue;
            if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
            if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
            // Following character(s)
            if (rule.next && !rule.next.includes(next)) continue;
            if (rule.nextNot && rule.nextNot.includes(next)) continue;
            if (rule.next2 && !rule.next2.includes(next2)) continue;
            if (rule.next2Not && rule.next2Not.includes(next2)) continue;
            // Special case: beginning of the word (compares the first n characters)
            if (rule.leading && !rule.leading.includes(chars.slice(0, rule.leading.length).join(''))) continue;
            // Special case: end of the word (compares the last n characters)
            if (rule.trailing && !rule.trailing.includes(chars.slice(-rule.trailing.length).join(''))) continue;
            // Check multiple characters (e.g. bigram/trigram) starting at position i
            if (rule.match && !rule.match.every((c, j) => chars[i + j] === c)) continue;
            // If all conditions met, return the rule code
            return rule.code;
        }
        // If no rule matched, return undefined
        return undefined;
    }
    /**
     * Generates the phonetic code for a given word.
     *
     * The word is processed character by character: ignored characters are
     * skipped, every other character is translated by `mapChar`, and the
     * collected code is finally passed through `adjustCode`.
     *
     * @param {string} word - The input word to be converted into a phonetic code
     * @returns {string} - The generated phonetic code
     */
    encode(word) {
        const { map = {}, ignore = [] } = this.map;
        // Get the characters of the word and its length
        const chars = this.word2Chars(word);
        const charLen = chars.length;
        let code = '';
        let lastCode = null;
        // Iterate over each character in the word
        for (let i = 0; i < charLen; i++) {
            const char = chars[i];
            // Skip characters that are in the ignore list
            if (ignore.includes(char)) continue;
            // Convert the character to its phonetic code
            const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
            // If no code is generated, skip to the next character
            if (mapped === undefined) continue;
            // Append the generated code and remember it for de-duplication
            code += mapped;
            lastCode = mapped;
            // If the code length reaches the specified limit, exit early
            if (this.exitEarly(code, i)) break;
        }
        // Return the adjusted phonetic code
        return this.adjustCode(code, chars);
    }
    /**
     * Converts a character to its phonetic code based on the mapping and rules.
     *
     * A matching rule takes precedence over the plain mapping. When `dedupe` is
     * enabled, a code equal to the previously emitted one is suppressed.
     *
     * @param {string} char - The current character
     * @param {number} i - The current position within the word
     * @param {string[]} chars - The word as an array of characters
     * @param {number} charLen - The total length of the word
     * @param {string|null} lastCode - The last code generated (to avoid duplicates)
     * @param {Record<string, string>} map - The phonetic mapping
     * @returns {string|undefined} - The phonetic code or undefined if no code applies
     */
    mapChar(char, i, chars, charLen, lastCode, map) {
        const { dedupe = true } = this.options;
        // Rule code first, then the mapping; undefined if neither applies
        const c = this.applyRules(char, i, chars, charLen) ?? map[char] ?? undefined;
        // De-duplicate the code if necessary
        return dedupe && c === lastCode ? undefined : c;
    }
    /**
     * Ensures the phonetic code has a fixed length by padding or truncating.
     *
     * A `length` of -1 (the default) leaves the input untouched.
     *
     * @param {string} input - The input string to be adjusted
     * @returns {string} - The adjusted string with fixed length
     */
    equalLen(input) {
        const { length = -1, pad = '0' } = this.options;
        return length === -1 ? input : (input + pad.repeat(length)).slice(0, length);
    }
    /**
     * Converts a word into an array of lower-cased characters.
     *
     * @param {string} word - The input word to be converted
     * @returns {string[]} - An array of characters from the input word
     */
    word2Chars(word) { return word.toLowerCase().split(''); }
    /**
     * Determines whether to exit early based on the current phonetic code length.
     *
     * @param {string} code - The current phonetic code
     * @param {number} i - The current index in the word (unused here, available to overrides)
     * @returns {boolean} - True if a positive length limit is set and has been reached
     */
    exitEarly(code, i) {
        const { length = -1 } = this.options;
        return length > 0 && code.length >= length;
    }
    /**
     * Adjusts the phonetic code. The base implementation returns it unchanged.
     *
     * @param {string} code - The phonetic code to be adjusted
     * @param {string[]} chars - The characters of the encoded word
     * @returns {string} - The adjusted phonetic code
     */
    adjustCode(code, chars) { return code; }
    /**
     * Processes an array of words to generate their phonetic indices.
     *
     * Each word is encoded (or taken from the shared cache), brought to equal
     * length and collected; words yielding an empty code are left out.
     *
     * @param {string[]} words - An array of words to be processed
     * @returns {string[]} - An array of phonetic indices for the input words
     */
    loop(words) {
        const index = [];
        // Loop over each word in the input array
        for (const word of words) {
            // Generate a cache key based on the algorithm and word
            const key = Phonetic.cache.key(this.algo, [word]);
            // Use the cached result if there is one, otherwise encode the word
            let code = Phonetic.cache.get(key || '');
            if (code === undefined || code === null) {
                code = this.encode(word);
                // Only store the result when a usable key was generated
                if (key) Phonetic.cache.set(key, code);
            }
            // If a code is generated, add it to the index
            if (code && code.length) index.push(this.equalLen(code));
        }
        return index;
    }
    /**
     * Asynchronously processes an array of words to generate their phonetic indices.
     *
     * Unlike `loop`, this variant does not consult the cache.
     *
     * @param {string[]} words - An array of words to be processed
     * @returns {Promise<string[]>} - A promise that resolves to an array of phonetic indices for the input words
     */
    async loopAsync(words) {
        const index = [];
        // Loop over each word in the input array
        for (const word of words) {
            // Get the phonetic code for the word asynchronously
            const code = await Promise.resolve(this.encode(word));
            // If a code is generated, add it to the index
            if (code && code.length) index.push(this.equalLen(code));
        }
        return index;
    }
    /**
     * Get the name of the phonetic algorithm.
     *
     * @returns {string} - The name of the algorithm
     */
    getAlgoName() { return this.algo; }
    /**
     * Generates a phonetic index for the given input string.
     *
     * @param {string} input - The input string to be indexed
     * @returns {string[]} - An array of phonetic indices for the input words
     */
    getIndex(input) {
        const { delimiter = ' ' } = this.options;
        // Split the input string by the specified delimiter and loop over it
        return profiler$1.run(() => this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean));
    }
    /**
     * Asynchronously generates a phonetic index for the given input string.
     *
     * @param {string} input - The input string to be indexed
     * @returns {Promise<string[]>} - A promise that resolves to an array of phonetic indices for the input words
     */
    async getIndexAsync(input) {
        const { delimiter = ' ' } = this.options;
        // Split the input string by the specified delimiter and loop over it asynchronously
        return (await profiler$1.runAsync(async () => await this.loopAsync(input.split(delimiter).filter(Boolean)))).filter(Boolean);
    }
}
|
|
3645
|
+
/**
|
|
3646
|
+
* Phonetic registry service for managing phonetic implementations.
|
|
3647
|
+
*
|
|
3648
|
+
* This registry allows for dynamic registration and retrieval of phonetic classes,
|
|
3649
|
+
* enabling the use of various phonetic algorithms in a consistent manner.
|
|
3650
|
+
*/
|
|
3651
|
+
// The Cologne, Metaphone and Soundex classes are added to this registry further down in this file.
const PhoneticRegistry = Registry('phonetic', Phonetic);
|
|
3652
|
+
/**
|
|
3653
|
+
* Phonetic Mapping Service
|
|
3654
|
+
*
|
|
3655
|
+
* This service provides a simple interface to manage phonetic mappings across
|
|
3656
|
+
* different phonetic algorithms. It allows adding, removing, checking existence,
|
|
3657
|
+
* retrieving, and listing phonetic mappings for specified algorithms.
|
|
3658
|
+
*/
|
|
3659
|
+
const PhoneticMappingRegistry = (() => {
    // algo -> (id -> PhoneticMap); prototype-less objects so that keys such as
    // 'toString' or '__proto__' can never collide with inherited members
    const registry = Object.create(null);
    // Returns the bucket of an algorithm, creating it on first use.
    // Only `add` needs this; all read operations must not allocate anything.
    const ensureBucket = (algo) => (registry[algo] ||= Object.create(null));
    return {
        /**
         * Adds a phonetic mapping for a specific algorithm and ID.
         *
         * @param {string} algo - The phonetic algorithm identifier (e.g., 'soundex', 'metaphone')
         * @param {string} id - The unique identifier for the mapping
         * @param {PhoneticMap} map - The phonetic map to be added, containing rules and mappings
         * @param {boolean} [update=false] - Whether to allow overwriting an existing entry
         * @throws {Error} If the mapping name already exists and update is false
         */
        add(algo, id, map, update = false) {
            const bucket = ensureBucket(algo);
            if (!update && id in bucket) {
                throw new Error(`entry <${id}> already exists / use <update=true> to overwrite`);
            }
            bucket[id] = map;
        },
        /**
         * Removes a phonetic mapping for a specific algorithm and ID.
         * Unknown algorithms or IDs are silently ignored.
         *
         * @param {string} algo - The phonetic algorithm identifier
         * @param {string} id - The unique identifier for the mapping to be removed
         */
        remove(algo, id) {
            const bucket = registry[algo];
            if (bucket !== undefined) delete bucket[id];
        },
        /**
         * Checks if a phonetic mapping exists for a specific algorithm and ID.
         *
         * @param {string} algo - The phonetic algorithm identifier
         * @param {string} id - The unique identifier for the mapping to check
         * @returns {boolean} - Returns true if the mapping exists, false otherwise
         */
        has(algo, id) {
            const bucket = registry[algo];
            return bucket !== undefined && id in bucket;
        },
        /**
         * Retrieves a phonetic mapping for a specific algorithm and ID.
         *
         * @param {string} algo - The phonetic algorithm identifier
         * @param {string} id - The unique identifier for the mapping to retrieve
         * @returns {PhoneticMap | undefined} - Returns the phonetic map if found, otherwise undefined
         */
        get(algo, id) { return registry[algo]?.[id]; },
        /**
         * Lists all phonetic mappings for a specific algorithm.
         *
         * @param {string} algo - The phonetic algorithm identifier
         * @returns {string[]} - Returns an array of mapping IDs (empty for unknown algorithms)
         */
        list(algo) {
            const bucket = registry[algo];
            return bucket === undefined ? [] : Object.keys(bucket);
        }
    };
})();
|
|
3712
|
+
|
|
3713
|
+
/**
|
|
3714
|
+
* Cologne Phonetic Algorithm
|
|
3715
|
+
* src/phonetic/Cologne.ts
|
|
3716
|
+
*
|
|
3717
|
+
* @see https://en.wikipedia.org/wiki/Cologne_phonetics
|
|
3718
|
+
*
|
|
3719
|
+
* Cologne phonetics, also known as `Kölner Phonetik` or the `Cologne process`,
|
|
3720
|
+
* is a phonetic algorithm that assigns a sequence of digits, referred to as the
|
|
3721
|
+
* phonetic code, to words. The purpose of this method is to ensure that words
|
|
3722
|
+
* with identical sounds receive the same code. This algorithm can facilitate a
|
|
3723
|
+
* similarity search among words.
|
|
3724
|
+
*
|
|
3725
|
+
* Cologne phonetics is associated with the well-known Soundex phonetic algorithm,
|
|
3726
|
+
* yet it is specifically optimized for the German language. This algorithm was
|
|
3727
|
+
* introduced by Hans Joachim Postel in 1969.
|
|
3728
|
+
*
|
|
3729
|
+
* The Cologne phonetic algorithm works by mapping letters to digits, ignoring
|
|
3730
|
+
* certain letters, and applying specific rules to handle character combinations.
|
|
3731
|
+
*
|
|
3732
|
+
* @module Phonetic/Cologne
|
|
3733
|
+
* @author Paul Köhler (komed3)
|
|
3734
|
+
* @license MIT
|
|
3735
|
+
*/
|
|
3736
|
+
/**
|
|
3737
|
+
* Cologne class extends the Phonetic class to implement the Cologne phonetic algorithm.
|
|
3738
|
+
*/
|
|
3739
|
+
class Cologne extends Phonetic {
    /**
     * Option defaults of the Cologne algorithm: the 'default' mapping, words
     * separated by a blank, no fixed code length and de-duplication enabled.
     */
    static default = {
        map: 'default',
        delimiter: ' ',
        length: -1,
        dedupe: true
    };
    /**
     * Creates a Cologne encoder.
     *
     * @param {PhoneticOptions} [opt] - Options overriding the defaults above
     */
    constructor(opt = {}) {
        super('cologne', opt);
    }
    /**
     * Strips every '0' from the code except one in the leading position.
     *
     * @param {string} code - The raw phonetic code
     * @returns {string} - The code with all non-leading zeros removed
     */
    adjustCode(code) {
        const head = code.slice(0, 1);
        const tail = code.slice(1).replaceAll('0', '');
        return head + tail;
    }
}
|
|
3762
|
+
// Register the Cologne algorithm in the phonetic registry
|
|
3763
|
+
PhoneticRegistry.add('cologne', Cologne);
// Default mapping of the Cologne algorithm: letter -> digit table, letters
// that are skipped entirely, and context rules that take precedence over
// the table.
PhoneticMappingRegistry.add('cologne', 'default', {
    map: {
        // vowels (and j, y)
        a: '0', ä: '0', e: '0', i: '0', j: '0',
        o: '0', ö: '0', u: '0', ü: '0', y: '0',
        // labials
        b: '1', p: '1',
        // dentals
        d: '2', t: '2',
        // f, v, w
        f: '3', v: '3', w: '3',
        // g, k, q
        g: '4', k: '4', q: '4',
        // l
        l: '5',
        // nasals
        m: '6', n: '6',
        // r
        r: '7',
        // sibilants
        c: '8', s: '8', ß: '8', z: '8',
        // x yields two digits
        x: '48'
    },
    ignore: ['h'],
    ruleset: [
        // `P` before `H`
        { char: 'p', next: ['h'], code: '3' },
        // `C` at the beginning of the word before A, H, K, L, O, Q, R, U, X
        { char: 'c', position: 'start', next: ['a', 'h', 'k', 'l', 'o', 'q', 'r', 'u', 'x'], code: '4' },
        // `C` before A, H, K, O, Q, U, X, unless preceded by S or Z
        { char: 'c', next: ['a', 'h', 'k', 'o', 'q', 'u', 'x'], prevNot: ['s', 'z'], code: '4' },
        // `D` / `T` before C, S, Z
        { char: 'd', next: ['c', 's', 'z'], code: '8' },
        { char: 't', next: ['c', 's', 'z'], code: '8' },
        // `X` after C, K, Q
        { char: 'x', prev: ['c', 'k', 'q'], code: '8' }
    ]
});
|
|
3781
|
+
|
|
3782
|
+
/**
|
|
3783
|
+
* Metaphone Phonetic Algorithm
|
|
3784
|
+
* src/phonetic/Metaphone.ts
|
|
3785
|
+
*
|
|
3786
|
+
* @see https://en.wikipedia.org/wiki/Metaphone
|
|
3787
|
+
*
|
|
3788
|
+
* Metaphone is a phonetic algorithm for indexing words by their English pronunciation.
|
|
3789
|
+
* It encodes words into a string of consonant symbols, allowing for the comparison of
|
|
3790
|
+
* words based on their pronunciation rather than their spelling. Metaphone is more
|
|
3791
|
+
* accurate than Soundex for English and is widely used in search, spell-checking,
|
|
3792
|
+
* and fuzzy matching.
|
|
3793
|
+
*
|
|
3794
|
+
* This implementation uses a mapping and a comprehensive ruleset to efficiently
|
|
3795
|
+
* transform input words into their Metaphone code. The algorithm drops or transforms
|
|
3796
|
+
* letters according to context-sensitive rules, and only retains vowels at the start.
|
|
3797
|
+
*
|
|
3798
|
+
* @module Phonetic/Metaphone
|
|
3799
|
+
* @author Paul Köhler (komed3)
|
|
3800
|
+
* @license MIT
|
|
3801
|
+
*/
|
|
3802
|
+
/**
|
|
3803
|
+
* Metaphone class extends the Phonetic class to implement the Metaphone phonetic algorithm.
|
|
3804
|
+
*/
|
|
3805
|
+
class Metaphone extends Phonetic {
    /**
     * Option defaults of the Metaphone algorithm: the 'en90' mapping, words
     * separated by a blank, no fixed length, no padding and no de-duplication.
     */
    static default = {
        map: 'en90',
        delimiter: ' ',
        length: -1,
        pad: '',
        dedupe: false
    };
    /**
     * Creates a Metaphone encoder.
     *
     * @param {PhoneticOptions} [opt] - Options overriding the defaults above
     */
    constructor(opt = {}) {
        super('metaphone', opt);
    }
    /**
     * Generates the Metaphone code for a word.
     *
     * Runs of one repeated letter are first reduced to a single letter; the
     * character class leaves out `C`, so a doubled `C` stays as it is. The
     * result is then encoded by the base class.
     *
     * @param {string} word - The input word to be converted into a Metaphone code
     * @returns {string} - The generated Metaphone code
     */
    encode(word) {
        const collapsed = word.replace(/([A-BD-Z])\1+/gi, (run, letter) => letter);
        return super.encode(collapsed);
    }
    /**
     * Drops the vowels A, E, I, O, U from the code, keeping only a leading one.
     *
     * @param {string} code - The Metaphone code to be adjusted
     * @returns {string} - The adjusted Metaphone code
     */
    adjustCode(code) {
        const first = code.slice(0, 1);
        const rest = code.slice(1);
        return first + rest.replace(/[AEIOU]/g, '');
    }
}
|
|
3841
|
+
// Register the Metaphone algorithm in the phonetic registry
|
|
3842
|
+
PhoneticRegistry.add('metaphone', Metaphone);
/**
 * Register the Metaphone phonetic mapping for English.
 *
 * This version is based on the original BASIC implementation from 1990,
 * written by Lawrence Philips.
 *
 * Rules are evaluated in order and the first matching rule wins; a rule with
 * `code: ''` drops the character. An empty string inside `next2` / `prev2`
 * stands for "no such character", i.e. the word boundary.
 *
 * @see https://gist.github.com/Rostepher/b688f709587ac145a0b3
 */
PhoneticMappingRegistry.add('metaphone', 'en90', {
    map: {
        a: 'A', b: 'B', c: 'K', d: 'T', e: 'E', f: 'F',
        g: 'K', h: 'H', i: 'I', j: 'J', k: 'K',
        l: 'L', m: 'M', n: 'N', o: 'O', p: 'P', q: 'K',
        r: 'R', s: 'S', t: 'T', u: 'U', v: 'F', w: 'W',
        x: 'KS', y: 'Y', z: 'S'
    },
    ruleset: [
        // Drop the first letter if the string begins with `AE`, `GN`, `KN`, `PN` or `WR`
        { char: 'a', position: 'start', next: ['e'], code: '' },
        { char: 'g', position: 'start', next: ['n'], code: '' },
        { char: 'k', position: 'start', next: ['n'], code: '' },
        { char: 'p', position: 'start', next: ['n'], code: '' },
        { char: 'w', position: 'start', next: ['r'], code: '' },
        // Drop `B` if after `M` at the end of the string
        { char: 'b', position: 'end', prev: ['m'], code: '' },
        // `C` transforms into `X` if followed by `H` or `IA`
        { char: 'c', next: ['h'], prevNot: ['s'], code: 'X' },
        { char: 'c', next: ['i'], next2: ['a'], code: 'X' },
        // `C` transforms into `S` if followed by `E`, `I` or `Y`
        { char: 'c', next: ['e', 'i', 'y'], code: 'S' },
        // `D` transforms into `J` if followed by `GE`, `GI` or `GY`
        { char: 'd', next: ['g'], next2: ['e', 'i', 'y'], code: 'J' },
        // Drop `G` if followed by `H` and `H` is not at the end or before a vowel
        { char: 'g', next: ['h'], next2Not: ['', 'a', 'e', 'i', 'o', 'u'], code: '' },
        // Drop `G` if it is directly followed by a word-final `N` ("sign") or
        // `NED` ("signed"). The G itself must be the one in front of the N:
        // checking only the word ending would drop every G of e.g. "garden".
        { char: 'g', next: ['n'], next2: [''], code: '' },
        { char: 'g', next: ['n'], next2: ['e'], trailing: 'gned', code: '' },
        // `G` transforms into `J` if before `E`, `I` or `Y` and is not a `GG`
        { char: 'g', next: ['e', 'i', 'y'], prevNot: ['g'], code: 'J' },
        // Drop `H` if after a vowel and not before a vowel
        { char: 'h', prev: ['a', 'e', 'i', 'o', 'u'], nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
        // Drop `H` if after `C`, `G`, `P`, `S` or `T`
        { char: 'h', prev: ['c', 'g', 'p', 's', 't'], code: '' },
        // Drop `K` if after `C`
        { char: 'k', prev: ['c'], code: '' },
        // `PH` transforms into `F`
        { char: 'p', next: ['h'], code: 'F' },
        // `S` transforms into `X` if followed by `H`, `IA` or `IO`
        { char: 's', next: ['h'], code: 'X' },
        { char: 's', next: ['i'], next2: ['a', 'o'], code: 'X' },
        // `T` transforms into `X` if followed by `IA` or `IO`
        { char: 't', next: ['i'], next2: ['a', 'o'], code: 'X' },
        // `TH` transforms into `0` (zero)
        { char: 't', next: ['h'], code: '0' },
        // Drop `T` if followed by `CH`
        { char: 't', next: ['c'], next2: ['h'], code: '' },
        // `WH` transforms into `W` if at the beginning of the string:
        // keep the leading `W` (must precede the generic drop rule below) ...
        { char: 'w', position: 'start', next: ['h'], code: 'W' },
        // Drop `W` if not followed by a vowel
        { char: 'w', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' },
        // ... and drop only the `H` that directly follows that leading `W`
        // (second character of the word), not every H of a word starting with W
        { char: 'h', prev: ['w'], prev2: [''], code: '' },
        // `X` transforms into `S` if at the beginning
        { char: 'x', position: 'start', code: 'S' },
        // Drop `Y` if not followed by a vowel
        { char: 'y', nextNot: ['a', 'e', 'i', 'o', 'u'], code: '' }
    ]
});
|
|
3909
|
+
|
|
3910
|
+
/**
|
|
3911
|
+
* Soundex Phonetic Algorithm
|
|
3912
|
+
* src/phonetic/Soundex.ts
|
|
3913
|
+
*
|
|
3914
|
+
* @see https://en.wikipedia.org/wiki/Soundex
|
|
3915
|
+
*
|
|
3916
|
+
* Soundex is a phonetic algorithm for indexing names by sound. It is used to
|
|
3917
|
+
* encode words into a phonetic representation, allowing for the comparison of
|
|
3918
|
+
* words based on their pronunciation rather than their spelling. This works
|
|
3919
|
+
* by mapping letters to digits, ignoring certain letters, and applying specific
|
|
3920
|
+
* rules to handle character combinations.
|
|
3921
|
+
*
|
|
3922
|
+
* It is particularly useful for matching names that may be spelled differently
|
|
3923
|
+
* but sound similar and commonly used in genealogical research and databases
|
|
3924
|
+
* to find similar-sounding names.
|
|
3925
|
+
*
|
|
3926
|
+
* The Soundex algorithm is not case-sensitive and ignores vowels and certain
|
|
3927
|
+
* consonants. It outputs an array of strings that represents the phonetic code
|
|
3928
|
+
* of the input, typically limited to the length of four characters.
|
|
3929
|
+
*
|
|
3930
|
+
* @module Phonetic/Soundex
|
|
3931
|
+
* @author Paul Köhler (komed3)
|
|
3932
|
+
* @license MIT
|
|
3933
|
+
*/
|
|
3934
|
+
/**
|
|
3935
|
+
* Soundex class extends the Phonetic class to implement the Soundex phonetic algorithm.
|
|
3936
|
+
*/
|
|
3937
|
+
class Soundex extends Phonetic {
    // Default options for the Soundex phonetic algorithm
    static default = {
        map: 'en', delimiter: ' ', length: 4, pad: '0', dedupe: true
    };
    /**
     * Constructor for the Soundex class.
     *
     * Initializes the Soundex phonetic algorithm with the mapping and options.
     *
     * @param {PhoneticOptions} [opt] - Options for the Soundex phonetic algorithm
     */
    constructor(opt = {}) { super('soundex', opt); }
    /**
     * Determines whether enough code has been collected to stop encoding.
     *
     * The raw code still contains '0' placeholders, which `adjustCode` removes
     * from every position after the first. Counting them towards the length
     * limit would stop too early (e.g. "robert" -> raw "6010" -> "R1" instead
     * of "R163"), so only the first position plus the non-zero digits after it
     * are counted, i.e. the length the adjusted code will have.
     *
     * @param {string} code - The raw phonetic code collected so far
     * @param {number} i - The current index in the word (unused)
     * @returns {boolean} - True if a positive length limit is set and the adjusted code has reached it
     */
    exitEarly(code, i) {
        const { length = -1 } = this.options;
        if (length <= 0 || code.length === 0) {
            return false;
        }
        // The first position always survives (it is replaced by the initial letter)
        let significant = 1;
        for (let k = 1; k < code.length; k++) {
            if (code[k] !== '0') significant++;
        }
        return significant >= length;
    }
    /**
     * Builds the final Soundex code: the first character of the word in upper
     * case, followed by the raw code without its first position and without
     * any zeros.
     *
     * @param {string} code - The raw phonetic code to adjust
     * @param {string[]} chars - The characters of the encoded word
     * @returns {string} - The adjusted phonetic code ('' for an empty word)
     */
    adjustCode(code, chars) {
        // An empty word has no initial letter; return an empty code instead of throwing
        if (chars.length === 0) {
            return '';
        }
        return chars[0].toUpperCase() + code.slice(1).replaceAll('0', '');
    }
}
|
|
3962
|
+
// Register the Soundex algorithm in the phonetic registry
|
|
3963
|
+
PhoneticRegistry.add('soundex', Soundex);
// Soundex mapping for English: plain letter -> digit table, no context rules.
PhoneticMappingRegistry.add('soundex', 'en', {
    map: {
        // vowels and h, w, y
        a: '0', e: '0', h: '0', i: '0', o: '0', u: '0', w: '0', y: '0',
        // labials
        b: '1', f: '1', p: '1', v: '1',
        // gutturals and sibilants
        c: '2', g: '2', j: '2', k: '2', q: '2', s: '2', x: '2', z: '2',
        // dentals
        d: '3', t: '3',
        // l
        l: '4',
        // nasals
        m: '5', n: '5',
        // r
        r: '6'
    }
});
// Soundex mapping for German: adds the umlauts and `ß`, maps `j` to '0' and
// `w` to '1', and encodes `C` before `H` as '7'.
PhoneticMappingRegistry.add('soundex', 'de', {
    map: {
        // vowels, umlauts and h, j, y
        a: '0', ä: '0', e: '0', h: '0', i: '0', j: '0',
        o: '0', ö: '0', u: '0', ü: '0', y: '0',
        // labials
        b: '1', f: '1', p: '1', v: '1', w: '1',
        // gutturals and sibilants
        c: '2', g: '2', k: '2', q: '2', s: '2', ß: '2', x: '2', z: '2',
        // dentals
        d: '3', t: '3',
        // l
        l: '4',
        // nasals
        m: '5', n: '5',
        // r
        r: '6'
    },
    ruleset: [
        // `C` before `H`
        { char: 'c', next: ['h'], code: '7' }
    ]
});
|
|
3985
|
+
|
|
3986
|
+
/**
|
|
3987
|
+
* CmpStr Main API
|
|
3988
|
+
* src/CmpStr.ts
|
|
3989
|
+
*
|
|
3990
|
+
* The CmpStr class provides a comprehensive, highly abstracted, and type-safe interface
|
|
3991
|
+
* for string comparison, similarity measurement, phonetic indexing, filtering, normalization,
|
|
3992
|
+
* and text analysis. It unifies all core features of the CmpStr package and exposes a
|
|
3993
|
+
* consistent, user-friendly API for both single and batch operations.
|
|
3994
|
+
*
|
|
3995
|
+
* Features:
|
|
3996
|
+
* - Centralized management of metrics, phonetic algorithms, and filters
|
|
3997
|
+
* - Flexible normalization and filtering pipeline for all inputs
|
|
3998
|
+
* - Batch, pairwise, and single string comparison with detailed results
|
|
3999
|
+
* - Phonetic indexing and phonetic-aware search and comparison
|
|
4000
|
+
* - Text analysis and unified diff utilities
|
|
4001
|
+
* - Full TypeScript type safety and extensibility
|
|
4002
|
+
*
|
|
4003
|
+
* @module CmpStr
|
|
4004
|
+
* @author Paul Köhler (komed3)
|
|
4005
|
+
* @license MIT
|
|
4006
|
+
*/
|
|
4007
|
+
// Obtain the shared Profiler instance used for global profiling
|
|
4008
|
+
const profiler = Profiler.getInstance(); // its `services` are exposed below as the static CmpStr.profiler
|
|
4009
|
+
/**
|
|
4010
|
+
* The main CmpStr class that provides a unified interface for string comparison,
|
|
4011
|
+
* phonetic indexing, filtering, and text analysis.
|
|
4012
|
+
*
|
|
4013
|
+
* @template R - The type of the metric result, defaults to MetricRaw
|
|
4014
|
+
*/
|
|
4015
|
+
class CmpStr {
|
|
4016
|
+
/**
|
|
4017
|
+
* --------------------------------------------------------------------------------
|
|
4018
|
+
* Static methods and properties for global access to CmpStr features
|
|
4019
|
+
* --------------------------------------------------------------------------------
|
|
4020
|
+
*
|
|
4021
|
+
* These static methods provide a convenient way to access the core features of
|
|
4022
|
+
* the CmpStr package without needing to instantiate a CmpStr object.
|
|
4023
|
+
*/
|
|
4024
|
+
/**
|
|
4025
|
+
* Adds, removes, pauses, resumes, lists, or clears global filters.
|
|
4026
|
+
*
|
|
4027
|
+
* @see Filter
|
|
4028
|
+
*/
|
|
4029
|
+
static filter = {
|
|
4030
|
+
add: Filter.add,
|
|
4031
|
+
remove: Filter.remove,
|
|
4032
|
+
pause: Filter.pause,
|
|
4033
|
+
resume: Filter.resume,
|
|
4034
|
+
list: Filter.list,
|
|
4035
|
+
clear: Filter.clear
|
|
4036
|
+
};
|
|
4037
|
+
/**
|
|
4038
|
+
* Adds, removes, checks, or lists available metrics.
|
|
4039
|
+
*
|
|
4040
|
+
* @see MetricRegistry
|
|
4041
|
+
*/
|
|
4042
|
+
static metric = {
|
|
4043
|
+
add: MetricRegistry.add,
|
|
4044
|
+
remove: MetricRegistry.remove,
|
|
4045
|
+
has: MetricRegistry.has,
|
|
4046
|
+
list: MetricRegistry.list
|
|
4047
|
+
};
|
|
4048
|
+
/**
|
|
4049
|
+
* Adds, removes, checks, or lists available phonetic algorithms and mappings.
|
|
4050
|
+
*
|
|
4051
|
+
* @see PhoneticRegistry
|
|
4052
|
+
*/
|
|
4053
|
+
static phonetic = {
|
|
4054
|
+
add: PhoneticRegistry.add,
|
|
4055
|
+
remove: PhoneticRegistry.remove,
|
|
4056
|
+
has: PhoneticRegistry.has,
|
|
4057
|
+
list: PhoneticRegistry.list,
|
|
4058
|
+
map: {
|
|
4059
|
+
add: PhoneticMappingRegistry.add,
|
|
4060
|
+
remove: PhoneticMappingRegistry.remove,
|
|
4061
|
+
has: PhoneticMappingRegistry.has,
|
|
4062
|
+
list: PhoneticMappingRegistry.list
|
|
4063
|
+
}
|
|
4064
|
+
};
|
|
4065
|
+
/**
|
|
4066
|
+
* Provides access to the global profiler services.
|
|
4067
|
+
*
|
|
4068
|
+
* @see Profiler
|
|
4069
|
+
*/
|
|
4070
|
+
static profiler = profiler.services;
|
|
4071
|
+
/**
|
|
4072
|
+
* Clears the caches for normalizer, metric, and phonetic modules.
|
|
4073
|
+
*/
|
|
4074
|
+
static clearCache = {
|
|
4075
|
+
normalizer: Normalizer.clear,
|
|
4076
|
+
metric: Metric.clear,
|
|
4077
|
+
phonetic: Phonetic.clear
|
|
4078
|
+
};
|
|
4079
|
+
/**
|
|
4080
|
+
* Returns a TextAnalyzer instance for the given input string.
|
|
4081
|
+
*
|
|
4082
|
+
* @param {string} [input] - The input string
|
|
4083
|
+
* @returns {TextAnalyzer} - The text analyzer
|
|
4084
|
+
*/
|
|
4085
|
+
static analyze(input) { return new TextAnalyzer(input); }
|
|
4086
|
+
/**
|
|
4087
|
+
* Returns a DiffChecker instance for computing the unified diff between two texts.
|
|
4088
|
+
*
|
|
4089
|
+
* @param {string} a - The first (original) text
|
|
4090
|
+
* @param {string} b - The second (modified) text
|
|
4091
|
+
* @param {DiffOptions} [opt] - Optional diff configuration
|
|
4092
|
+
* @returns {DiffChecker} - The diff checker instance
|
|
4093
|
+
*/
|
|
4094
|
+
static diff(a, b, opt) { return new DiffChecker(a, b, opt); }
|
|
4095
|
+
/**
|
|
4096
|
+
* --------------------------------------------------------------------------------
|
|
4097
|
+
* Instantiate the CmpStr class
|
|
4098
|
+
* --------------------------------------------------------------------------------
|
|
4099
|
+
*
|
|
4100
|
+
* Methods to create a new CmpStr instance with the given options.
|
|
4101
|
+
* Using the static `create` method is recommended to ensure proper instantiation.
|
|
4102
|
+
*/
|
|
4103
|
+
/**
|
|
4104
|
+
* Creates a new CmpStr instance with the given options.
|
|
4105
|
+
*
|
|
4106
|
+
* @param {string|CmpStrOptions} [opt] - Optional serialized or options object
|
|
4107
|
+
* @returns {CmpStr<R>} - A new CmpStr instance
|
|
4108
|
+
*/
|
|
4109
|
+
static create(opt) { return new CmpStr(opt); }
|
|
4110
|
+
// The options object that holds the configuration for this CmpStr instance
|
|
4111
|
+
options = Object.create(null); // starts as a prototype-less object, so it carries no inherited keys
|
|
4112
|
+
/**
|
|
4113
|
+
* Creates a new CmpStr instance with the given options.
|
|
4114
|
+
* The constructor is protected to enforce the use of the static `create` method.
|
|
4115
|
+
*
|
|
4116
|
+
* @param {string|CmpStrOptions} [opt] - Optional serialized or options object
|
|
4117
|
+
*/
|
|
4118
|
+
constructor(opt) {
|
|
4119
|
+
if (opt)
|
|
4120
|
+
typeof opt === 'string'
|
|
4121
|
+
? this.setSerializedOptions(opt)
|
|
4122
|
+
: this.setOptions(opt);
|
|
4123
|
+
}
|
|
4124
|
+
/**
|
|
4125
|
+
* ---------------------------------------------------------------------------------
|
|
4126
|
+
* Protected utility methods for internal use
|
|
4127
|
+
* ---------------------------------------------------------------------------------
|
|
4128
|
+
*
|
|
4129
|
+
* These methods provide utility functions for converting inputs, merging options,
|
|
4130
|
+
* normalizing inputs, filtering, and preparing inputs for comparison.
|
|
4131
|
+
*/
|
|
4132
|
+
/**
|
|
4133
|
+
* Asserts a condition and throws if the condition is not met.
|
|
4134
|
+
*
|
|
4135
|
+
* @param {string} cond - The name of the condition to check ('metric' or 'phonetic')
|
|
4136
|
+
* @param {any} [test] - Value to test for
|
|
4137
|
+
* @throws {Error} If the condition is not met
|
|
4138
|
+
*/
|
|
4139
|
+
assert(cond, test) {
|
|
4140
|
+
switch (cond) {
|
|
4141
|
+
// Check if the metric exists
|
|
4142
|
+
case 'metric':
|
|
4143
|
+
if (!CmpStr.metric.has(test))
|
|
4144
|
+
throw new Error(`CmpStr <metric> must be set, call .setMetric(), ` +
|
|
4145
|
+
`use CmpStr.metric.list() for available metrics`);
|
|
4146
|
+
break;
|
|
4147
|
+
// Check if the phonetic algorithm exists
|
|
4148
|
+
case 'phonetic':
|
|
4149
|
+
if (!CmpStr.phonetic.has(test))
|
|
4150
|
+
throw new Error(`CmpStr <phonetic> must be set, call .setPhonetic(), ` +
|
|
4151
|
+
`use CmpStr.phonetic.list() for available phonetic algorithms`);
|
|
4152
|
+
break;
|
|
4153
|
+
// Throw an error for unknown conditions
|
|
4154
|
+
default: throw new Error(`Cmpstr condition <${cond}> unknown`);
|
|
4155
|
+
}
|
|
4156
|
+
}
|
|
4157
|
+
/**
|
|
4158
|
+
* Assert multiple conditions.
|
|
4159
|
+
*
|
|
4160
|
+
* @param {[ string, any? ][]} cond - Array of [ condition, value ] pairs
|
|
4161
|
+
*/
|
|
4162
|
+
assertMany(...cond) {
|
|
4163
|
+
for (const [c, test] of cond)
|
|
4164
|
+
this.assert(c, test);
|
|
4165
|
+
}
|
|
4166
|
+
/**
|
|
4167
|
+
* Resolves the options for the CmpStr instance, merging the provided options with
|
|
4168
|
+
* the existing options.
|
|
4169
|
+
*
|
|
4170
|
+
* @param {CmpStrOptions} [opt] - Optional options to merge
|
|
4171
|
+
* @returns {CmpStrOptions} - The resolved options
|
|
4172
|
+
*/
|
|
4173
|
+
resolveOptions(opt) {
|
|
4174
|
+
return merge({ ...(this.options ?? Object.create(null)) }, opt);
|
|
4175
|
+
}
|
|
4176
|
+
/**
|
|
4177
|
+
* Normalizes the input string or array using the configured or provided flags.
|
|
4178
|
+
*
|
|
4179
|
+
* @param {MetricInput} input - The input string or array
|
|
4180
|
+
* @param {NormalizeFlags} [flags] - Normalization flags
|
|
4181
|
+
* @returns {MetricInput} - The normalized input
|
|
4182
|
+
*/
|
|
4183
|
+
normalize(input, flags) {
|
|
4184
|
+
return Normalizer.normalize(input, flags ?? this.options.flags ?? '');
|
|
4185
|
+
}
|
|
4186
|
+
/**
|
|
4187
|
+
* Applies all active filters to the input string or array.
|
|
4188
|
+
*
|
|
4189
|
+
* @param {MetricInput} input - The input string or array
|
|
4190
|
+
* @param {string} [hook='input'] - The filter hook
|
|
4191
|
+
* @returns {MetricInput} - The filtered string(s)
|
|
4192
|
+
*/
|
|
4193
|
+
filter(input, hook) {
|
|
4194
|
+
return Filter.apply(hook, input);
|
|
4195
|
+
}
|
|
4196
|
+
/**
|
|
4197
|
+
* Prepares the input by normalizing and filtering.
|
|
4198
|
+
*
|
|
4199
|
+
* @param {MetricInput} [input] - The input string or array
|
|
4200
|
+
* @param {CmpStrOptions} [opt] - Optional options to use
|
|
4201
|
+
* @returns {MetricInput} - The prepared input
|
|
4202
|
+
*/
|
|
4203
|
+
prepare(input, opt) {
|
|
4204
|
+
const { flags, processors } = opt ?? this.options;
|
|
4205
|
+
// Normalize the input using flags (i.e., 'itw')
|
|
4206
|
+
if (flags?.length)
|
|
4207
|
+
input = this.normalize(input, flags);
|
|
4208
|
+
// Filter the input using hooked up filters
|
|
4209
|
+
input = this.filter(input, 'input');
|
|
4210
|
+
// Apply phonetic processors if configured
|
|
4211
|
+
if (processors?.phonetic)
|
|
4212
|
+
input = this.index(input, processors.phonetic);
|
|
4213
|
+
return input;
|
|
4214
|
+
}
|
|
4215
|
+
/**
|
|
4216
|
+
* Post-process the results of the metric computation.
|
|
4217
|
+
*
|
|
4218
|
+
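* @param {MetricResult<R>} result - The metric result
* @param {CmpStrOptions} [opt] - Options; with `removeZero` set, batch entries whose `res` is not greater than 0 are dropped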
|
|
4219
|
+
* @returns {MetricResult<R>} - The post-processed results
|
|
4220
|
+
*/
|
|
4221
|
+
postProcess(result, opt) {
|
|
4222
|
+
// Remove "zero similarity" from batch results if configured
|
|
4223
|
+
if (opt?.removeZero && Array.isArray(result))
|
|
4224
|
+
result = result.filter(r => r.res > 0);
|
|
4225
|
+
return result;
|
|
4226
|
+
}
|
|
4227
|
+
/**
|
|
4228
|
+
* Computes the phonetic index for the given input using the specified phonetic algorithm.
|
|
4229
|
+
*
|
|
4230
|
+
* @param {MetricInput} input - The input string or array
|
|
4231
|
+
* @param {{ algo: string, opt?: PhoneticOptions }} options - The phonetic algorithm and options
|
|
4232
|
+
* @returns {MetricInput} - The phonetic index for the given input
|
|
4233
|
+
*/
|
|
4234
|
+
index(input, { algo, opt }) {
|
|
4235
|
+
this.assert('phonetic', algo);
|
|
4236
|
+
const phonetic = factory.phonetic(algo, opt);
|
|
4237
|
+
const delimiter = opt?.delimiter ?? ' ';
|
|
4238
|
+
return Array.isArray(input)
|
|
4239
|
+
? input.map(s => phonetic.getIndex(s).join(delimiter))
|
|
4240
|
+
: phonetic.getIndex(input).join(delimiter);
|
|
4241
|
+
}
|
|
4242
|
+
/**
|
|
4243
|
+
* Computes the metric result for the given inputs, applying normalization and
|
|
4244
|
+
* filtering as configured.
|
|
4245
|
+
*
|
|
4246
|
+
* @template T - The type of the metric result
|
|
4247
|
+
* @param {MetricInput} a - The first input string or array
|
|
4248
|
+
* @param {MetricInput} b - The second input string or array
|
|
4249
|
+
* @param {CmpStrOptions} [opt] - Optional options to use
|
|
4250
|
+
* @param {MetricMode} [mode='single'] - The metric mode to use
|
|
4251
|
+
* @param {boolean} [raw] - Whether to return raw results; falls back to the resolved `raw` option when omitted
|
|
4252
|
+
* @param {boolean} [skip=false] - Whether to skip normalization and filtering
|
|
4253
|
+
* @returns {T} - The computed metric result
|
|
4254
|
+
*/
|
|
4255
|
+
compute(a, b, opt, mode, raw, skip) {
|
|
4256
|
+
const resolved = this.resolveOptions(opt);
|
|
4257
|
+
this.assert('metric', resolved.metric);
|
|
4258
|
+
// Prepare the input
|
|
4259
|
+
const A = skip ? a : this.prepare(a, resolved);
|
|
4260
|
+
const B = skip ? b : this.prepare(b, resolved);
|
|
4261
|
+
// Get the metric class
|
|
4262
|
+
const metric = factory.metric(resolved.metric, A, B, resolved.opt);
|
|
4263
|
+
// Pass the original inputs to the metric
|
|
4264
|
+
if (resolved.output !== 'prep')
|
|
4265
|
+
metric.setOriginal(a, b);
|
|
4266
|
+
// Compute the metric result
|
|
4267
|
+
metric.run(mode);
|
|
4268
|
+
// Post-process the results
|
|
4269
|
+
const result = this.postProcess(metric.getResults(), resolved);
|
|
4270
|
+
// Resolve and return the result based on the raw flag
|
|
4271
|
+
return this.output(result, raw ?? resolved.raw);
|
|
4272
|
+
}
|
|
4273
|
+
/**
|
|
4274
|
+
* Resolves the result format (raw or formatted).
|
|
4275
|
+
*
|
|
4276
|
+
* @template T - The type of the metric result
|
|
4277
|
+
* @param {MetricResult<R>} result - The metric result
|
|
4278
|
+
* @param {boolean} [raw] - Whether to return raw results
|
|
4279
|
+
* @returns {T} - The resolved result
|
|
4280
|
+
*/
|
|
4281
|
+
output(result, raw) {
|
|
4282
|
+
return (raw ?? this.options.raw ? result : Array.isArray(result)
|
|
4283
|
+
? result.map(r => ({ source: r.a, target: r.b, match: r.res }))
|
|
4284
|
+
: { source: result.a, target: result.b, match: result.res });
|
|
4285
|
+
}
|
|
4286
|
+
/**
|
|
4287
|
+
* ---------------------------------------------------------------------------------
|
|
4288
|
+
* Managing methods for CmpStr
|
|
4289
|
+
* ---------------------------------------------------------------------------------
|
|
4290
|
+
*
|
|
4291
|
+
* These methods provide an interface to set and get properties of the CmpStr
|
|
4292
|
+
* instance, such as options, metric, phonetic algorithm, and more.
|
|
4293
|
+
*/
|
|
4294
|
+
/**
|
|
4295
|
+
* Creates a shallow clone of the current instance.
|
|
4296
|
+
*
|
|
4297
|
+
* @returns {CmpStr<R>} - The cloned instance
|
|
4298
|
+
*/
|
|
4299
|
+
clone() { return Object.assign(Object.create(Object.getPrototypeOf(this)), this); }
|
|
4300
|
+
/**
|
|
4301
|
+
* Resets the instance, clearing all data and options.
|
|
4302
|
+
*
|
|
4303
|
+
* @returns {this}
|
|
4304
|
+
*/
|
|
4305
|
+
reset() { for (const k in this.options)
|
|
4306
|
+
delete this.options[k]; return this; }
|
|
4307
|
+
/**
|
|
4308
|
+
* Sets / replaces the full options object.
|
|
4309
|
+
*
|
|
4310
|
+
* @param {CmpStrOptions} opt - The options
|
|
4311
|
+
* @returns {this}
|
|
4312
|
+
*/
|
|
4313
|
+
setOptions(opt) { this.options = opt; return this; }
|
|
4314
|
+
/**
|
|
4315
|
+
* Deep merges and sets new options.
|
|
4316
|
+
*
|
|
4317
|
+
* @param {CmpStrOptions} opt - The options to merge
|
|
4318
|
+
* @returns {this}
|
|
4319
|
+
*/
|
|
4320
|
+
mergeOptions(opt) { merge(this.options, opt); return this; }
|
|
4321
|
+
/**
|
|
4322
|
+
* Sets the serialized options from a JSON string.
|
|
4323
|
+
*
|
|
4324
|
+
* @param {string} opt - The serialized options
|
|
4325
|
+
* @returns {this}
|
|
4326
|
+
*/
|
|
4327
|
+
setSerializedOptions(opt) { this.options = JSON.parse(opt); return this; }
|
|
4328
|
+
/**
|
|
4329
|
+
* Sets a specific option at the given path.
|
|
4330
|
+
*
|
|
4331
|
+
* @param {string} path - The path to the option
|
|
4332
|
+
* @param {any} value - The value to set
|
|
4333
|
+
* @returns {this}
|
|
4334
|
+
*/
|
|
4335
|
+
setOption(path, value) { set(this.options, path, value); return this; }
|
|
4336
|
+
/**
|
|
4337
|
+
* Removes an option at the given path.
|
|
4338
|
+
*
|
|
4339
|
+
* @param {string} path - The path to the option
|
|
4340
|
+
* @returns {this}
|
|
4341
|
+
*/
|
|
4342
|
+
rmvOption(path) { rmv(this.options, path); return this; }
|
|
4343
|
+
/**
|
|
4344
|
+
* Enable or disable raw output.
|
|
4345
|
+
*
|
|
4346
|
+
* @param {boolean} enable - Whether to enable or disable raw output
|
|
4347
|
+
* @returns {this}
|
|
4348
|
+
*/
|
|
4349
|
+
setRaw(enable) { return this.setOption('raw', enable); }
|
|
4350
|
+
/**
|
|
4351
|
+
* Sets the similarity metric to use (e.g., 'levenshtein', 'dice').
|
|
4352
|
+
*
|
|
4353
|
+
* @param {string} name - The metric name
|
|
4354
|
+
* @returns {this}
|
|
4355
|
+
*/
|
|
4356
|
+
setMetric(name) { return this.setOption('metric', name); }
|
|
4357
|
+
/**
|
|
4358
|
+
* Sets the normalization flags (e.g., 'itw', 'nfc').
|
|
4359
|
+
*
|
|
4360
|
+
* @param {NormalizeFlags} flags - The normalization flags
|
|
4361
|
+
* @returns {this}
|
|
4362
|
+
*/
|
|
4363
|
+
setFlags(flags) { return this.setOption('flags', flags); }
|
|
4364
|
+
/**
|
|
4365
|
+
* Removes the normalization flags entirely.
|
|
4366
|
+
*
|
|
4367
|
+
* @returns {this}
|
|
4368
|
+
*/
|
|
4369
|
+
rmvFlags() { return this.rmvOption('flags'); }
|
|
4370
|
+
/**
|
|
4371
|
+
* Sets the pre-processors to use for preparing the input.
|
|
4372
|
+
*
|
|
4373
|
+
* @param {CmpStrProcessors} opt - The processors to set
|
|
4374
|
+
* @returns {this}
|
|
4375
|
+
*/
|
|
4376
|
+
setProcessors(opt) { return this.setOption('processors', opt); }
|
|
4377
|
+
/**
|
|
4378
|
+
* Removes the processors entirely.
|
|
4379
|
+
*
|
|
4380
|
+
* @returns {this}
|
|
4381
|
+
*/
|
|
4382
|
+
rmvProcessors() { return this.rmvOption('processors'); }
|
|
4383
|
+
/**
|
|
4384
|
+
* Returns the current options object.
|
|
4385
|
+
*
|
|
4386
|
+
* @returns {CmpStrOptions} - The options
|
|
4387
|
+
*/
|
|
4388
|
+
getOptions() { return this.options; }
|
|
4389
|
+
/**
|
|
4390
|
+
* Returns the options as a JSON string.
|
|
4391
|
+
*
|
|
4392
|
+
* @returns {string} - The serialized options
|
|
4393
|
+
*/
|
|
4394
|
+
getSerializedOptions() { return JSON.stringify(this.options); }
|
|
4395
|
+
/**
|
|
4396
|
+
* Returns a specific option value by path.
|
|
4397
|
+
*
|
|
4398
|
+
* @param {string} path - The path to the option
|
|
4399
|
+
* @returns {any} - The option value
|
|
4400
|
+
*/
|
|
4401
|
+
getOption(path) { return get(this.options, path); }
|
|
4402
|
+
/**
|
|
4403
|
+
* ---------------------------------------------------------------------------------
|
|
4404
|
+
* Public core methods for string comparison
|
|
4405
|
+
* ---------------------------------------------------------------------------------
|
|
4406
|
+
*
|
|
4407
|
+
* These methods provide the core functionality of the CmpStr class, allowing for
|
|
4408
|
+
* string comparison, phonetic indexing, filtering, and text search.
|
|
4409
|
+
*/
|
|
4410
|
+
/**
|
|
4411
|
+
* Performs a single metric comparison between the source and target.
|
|
4412
|
+
*
|
|
4413
|
+
* @template T - The type of the metric result
|
|
4414
|
+
* @param {string} a - The source string
|
|
4415
|
+
* @param {string} b - The target string
|
|
4416
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4417
|
+
* @returns {T} - The metric result
|
|
4418
|
+
*/
|
|
4419
|
+
test(a, b, opt) {
|
|
4420
|
+
return this.compute(a, b, opt, 'single');
|
|
4421
|
+
}
|
|
4422
|
+
/**
|
|
4423
|
+
* Performs a single metric comparison and returns only the numeric score.
|
|
4424
|
+
*
|
|
4425
|
+
* @param {string} a - The source string
|
|
4426
|
+
* @param {string} b - The target string
|
|
4427
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4428
|
+
* @returns {number} - The similarity score (0..1)
|
|
4429
|
+
*/
|
|
4430
|
+
compare(a, b, opt) {
|
|
4431
|
+
return this.compute(a, b, opt, 'single', true).res;
|
|
4432
|
+
}
|
|
4433
|
+
/**
|
|
4434
|
+
* Performs a batch metric comparison between source and target strings
|
|
4435
|
+
* or array of strings.
|
|
4436
|
+
*
|
|
4437
|
+
* @template T - The type of the metric result
|
|
4438
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4439
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4440
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4441
|
+
* @returns {T} - The batch metric results
|
|
4442
|
+
*/
|
|
4443
|
+
batchTest(a, b, opt) {
|
|
4444
|
+
return this.compute(a, b, opt, 'batch');
|
|
4445
|
+
}
|
|
4446
|
+
/**
|
|
4447
|
+
* Performs a batch metric comparison and returns results sorted by score.
|
|
4448
|
+
*
|
|
4449
|
+
* @template T - The type of the metric result
|
|
4450
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4451
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4452
|
+
* @param {'desc'|'asc'} [dir='desc'] - Sort direction (desc, asc)
|
|
4453
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4454
|
+
* @returns {T} - The sorted batch results
|
|
4455
|
+
*/
|
|
4456
|
+
batchSorted(a, b, dir = 'desc', opt) {
|
|
4457
|
+
return this.output(this.compute(a, b, opt, 'batch', true)
|
|
4458
|
+
.sort((a, b) => dir === 'asc' ? a.res - b.res : b.res - a.res), opt?.raw ?? this.options.raw);
|
|
4459
|
+
}
|
|
4460
|
+
/**
|
|
4461
|
+
* Performs a pairwise metric comparison between source and target strings
|
|
4462
|
+
* or array of strings.
|
|
4463
|
+
*
|
|
4464
|
+
* Input arrays need to be of the same length to perform pairwise comparison,
|
|
4465
|
+
* otherwise the method will throw an error.
|
|
4466
|
+
*
|
|
4467
|
+
* @template T - The type of the metric result
|
|
4468
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4469
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4470
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4471
|
+
* @returns {T} - The pairwise metric results
|
|
4472
|
+
*/
|
|
4473
|
+
pairs(a, b, opt) {
|
|
4474
|
+
return this.compute(a, b, opt, 'pairwise');
|
|
4475
|
+
}
|
|
4476
|
+
/**
|
|
4477
|
+
* Performs a batch comparison and returns only results above the threshold.
|
|
4478
|
+
*
|
|
4479
|
+
* @template T - The type of the metric result
|
|
4480
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4481
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4482
|
+
* @param {number} threshold - The similarity threshold (0..1)
|
|
4483
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4484
|
+
* @returns {T} - The filtered batch results
|
|
4485
|
+
*/
|
|
4486
|
+
match(a, b, threshold, opt) {
|
|
4487
|
+
return this.output(this.compute(a, b, opt, 'batch', true)
|
|
4488
|
+
.filter(r => r.res >= threshold).sort((a, b) => b.res - a.res), opt?.raw ?? this.options.raw);
|
|
4489
|
+
}
|
|
4490
|
+
/**
|
|
4491
|
+
* Returns the n closest matches from a batch comparison.
|
|
4492
|
+
*
|
|
4493
|
+
* @template T - The type of the metric result
|
|
4494
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4495
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4496
|
+
* @param {number} [n=1] - Number of closest matches
|
|
4497
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4498
|
+
* @returns {T} - The closest matches
|
|
4499
|
+
*/
|
|
4500
|
+
closest(a, b, n = 1, opt) {
|
|
4501
|
+
return this.batchSorted(a, b, 'desc', opt).slice(0, n);
|
|
4502
|
+
}
|
|
4503
|
+
/**
|
|
4504
|
+
* Returns the n furthest matches from a batch comparison.
|
|
4505
|
+
*
|
|
4506
|
+
* @template T - The type of the metric result
|
|
4507
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4508
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4509
|
+
* @param {number} [n=1] - Number of furthest matches
|
|
4510
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4511
|
+
* @returns {T} - The furthest matches
|
|
4512
|
+
*/
|
|
4513
|
+
furthest(a, b, n = 1, opt) {
|
|
4514
|
+
return this.batchSorted(a, b, 'asc', opt).slice(0, n);
|
|
4515
|
+
}
|
|
4516
|
+
/**
|
|
4517
|
+
* Performs a normalized and filtered substring search.
|
|
4518
|
+
*
|
|
4519
|
+
* @param {string} needle - The search string
|
|
4520
|
+
* @param {string[]} haystack - The array to search in
|
|
4521
|
+
* @param {NormalizeFlags} [flags] - Normalization flags
|
|
4522
|
+
* @param {CmpStrProcessors} [processors] - Pre-processors to apply
|
|
4523
|
+
* @returns {string[]} - Array of matching entries
|
|
4524
|
+
*/
|
|
4525
|
+
search(needle, haystack, flags, processors) {
|
|
4526
|
+
const resolved = this.resolveOptions({ flags, processors });
|
|
4527
|
+
// Prepare the needle and haystack, normalizing and filtering them
|
|
4528
|
+
const test = this.prepare(needle, resolved);
|
|
4529
|
+
const hstk = this.prepare(haystack, resolved);
|
|
4530
|
+
// Filter the haystack based on the normalized test string
|
|
4531
|
+
return haystack.filter((_, i) => hstk[i].includes(test));
|
|
4532
|
+
}
|
|
4533
|
+
/**
|
|
4534
|
+
* Computes a similarity matrix for the given input array.
|
|
4535
|
+
*
|
|
4536
|
+
* @param {string[]} input - The input array
|
|
4537
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4538
|
+
* @returns {number[][]} - The similarity matrix
|
|
4539
|
+
*/
|
|
4540
|
+
matrix(input, opt) {
|
|
4541
|
+
input = this.prepare(input, this.resolveOptions(opt));
|
|
4542
|
+
return input.map(a => this.compute(a, input, undefined, 'batch', true, true).map(b => b.res ?? 0));
|
|
4543
|
+
}
|
|
4544
|
+
/**
|
|
4545
|
+
* Computes the phonetic index for a string using the configured
|
|
4546
|
+
* or given algorithm.
|
|
4547
|
+
*
|
|
4548
|
+
* @param {string} [input] - The input string
|
|
4549
|
+
* @param {string} [algo] - The phonetic algorithm to use
|
|
4550
|
+
* @param {PhoneticOptions} [opt] - Optional phonetic options
|
|
4551
|
+
* @returns {string} - The phonetic index as a string
|
|
4552
|
+
*/
|
|
4553
|
+
phoneticIndex(input, algo, opt) {
|
|
4554
|
+
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
4555
|
+
return this.index(input, { algo: (algo ?? a), opt: opt ?? o });
|
|
4556
|
+
}
|
|
4557
|
+
}
|
|
4558
|
+
|
|
4559
|
+
/**
|
|
4560
|
+
* CmpStrAsync Asynchronous API
|
|
4561
|
+
* src/CmpStrAsync.ts
|
|
4562
|
+
*
|
|
4563
|
+
* The CmpStrAsync class provides a fully asynchronous, Promise-based interface for
|
|
4564
|
+
* advanced string comparison, similarity measurement, phonetic indexing, filtering
|
|
4565
|
+
* and normalization. It extends the CmpStr class and overrides all relevant methods
|
|
4566
|
+
* to support non-blocking, scalable, and I/O-friendly workloads.
|
|
4567
|
+
*
|
|
4568
|
+
* Features:
|
|
4569
|
+
* - Asynchronous normalization, filtering, and metric computation
|
|
4570
|
+
* - Async batch, pairwise, and single string comparison with detailed results
|
|
4571
|
+
* - Async phonetic indexing and phonetic-aware search and comparison
|
|
4572
|
+
* - Full compatibility with the synchronous CmpStr API
|
|
4573
|
+
* - Designed for large-scale, high-performance, and server-side applications
|
|
4574
|
+
*
|
|
4575
|
+
* @module CmpStrAsync
|
|
4576
|
+
* @author Paul Köhler (komed3)
|
|
4577
|
+
* @license MIT
|
|
4578
|
+
*/
|
|
4579
|
+
/**
|
|
4580
|
+
* The CmpStrAsync class provides a fully asynchronous API for string comparison,
|
|
4581
|
+
* phonetic indexing, filtering and normalization.
|
|
4582
|
+
*
|
|
4583
|
+
* @template R - The type of the metric result, defaults to MetricRaw
|
|
4584
|
+
*/
|
|
4585
|
+
class CmpStrAsync extends CmpStr {
|
|
4586
|
+
/**
|
|
4587
|
+
* --------------------------------------------------------------------------------
|
|
4588
|
+
* Instantiate the CmpStrAsync class
|
|
4589
|
+
* --------------------------------------------------------------------------------
|
|
4590
|
+
*
|
|
4591
|
+
* Methods to create a new CmpStrAsync instance with the given options.
|
|
4592
|
+
* Using the static `create` method is recommended to ensure proper instantiation.
|
|
4593
|
+
*/
|
|
4594
|
+
/**
|
|
4595
|
+
* Creates a new CmpStrAsync instance with the given options.
|
|
4596
|
+
*
|
|
4597
|
+
* @param {string|CmpStrOptions} [opt] - Optional serialized or options object
|
|
4598
|
+
* @returns {CmpStrAsync<R>} - A new CmpStrAsync instance
|
|
4599
|
+
*/
|
|
4600
|
+
static create(opt) {
|
|
4601
|
+
return new CmpStrAsync(opt);
|
|
4602
|
+
}
|
|
4603
|
+
/**
|
|
4604
|
+
* Creates a new CmpStrAsync instance by calling the super constructor.
|
|
4605
|
+
*
|
|
4606
|
+
* @param {string|CmpStrOptions} [opt] - Optional serialized or options object
|
|
4607
|
+
*/
|
|
4608
|
+
constructor(opt) { super(opt); }
|
|
4609
|
+
/**
|
|
4610
|
+
* ---------------------------------------------------------------------------------
|
|
4611
|
+
* Protected asynchronous utility methods for internal use
|
|
4612
|
+
* ---------------------------------------------------------------------------------
|
|
4613
|
+
*
|
|
4614
|
+
* These methods provide asynchronous normalization, filtering, and metric
|
|
4615
|
+
* computation capabilities, allowing for non-blocking operations.
|
|
4616
|
+
*/
|
|
4617
|
+
/**
|
|
4618
|
+
* Asynchronously normalizes the input string or array using the configured or provided flags.
|
|
4619
|
+
*
|
|
4620
|
+
* @param {MetricInput} input - The input string or array
|
|
4621
|
+
* @param {NormalizeFlags} [flags] - Normalization flags
|
|
4622
|
+
* @returns {Promise<MetricInput>} - The normalized input
|
|
4623
|
+
*/
|
|
4624
|
+
async normalizeAsync(input, flags) {
|
|
4625
|
+
return Normalizer.normalizeAsync(input, flags ?? this.options.flags ?? '');
|
|
4626
|
+
}
|
|
4627
|
+
/**
|
|
4628
|
+
* Asynchronously applies all active filters to the input string or array.
|
|
4629
|
+
*
|
|
4630
|
+
* @param {MetricInput} input - The input string or array
|
|
4631
|
+
* @param {string} [hook='input'] - The filter hook
|
|
4632
|
+
* @returns {Promise<MetricInput>} - The filtered string(s)
|
|
4633
|
+
*/
|
|
4634
|
+
async filterAsync(input, hook) {
|
|
4635
|
+
return Filter.applyAsync(hook, input);
|
|
4636
|
+
}
|
|
4637
|
+
/**
|
|
4638
|
+
* Asynchronously prepares the input by normalizing and filtering.
|
|
4639
|
+
*
|
|
4640
|
+
* @param {MetricInput} [input] - The input string or array
|
|
4641
|
+
* @param {CmpStrOptions} [opt] - Optional options to use
|
|
4642
|
+
* @returns {Promise<MetricInput>} - The prepared input
|
|
4643
|
+
*/
|
|
4644
|
+
async prepareAsync(input, opt) {
|
|
4645
|
+
const { flags, processors } = opt ?? this.options;
|
|
4646
|
+
// Normalize the input using flags (i.e., 'itw')
|
|
4647
|
+
if (flags?.length)
|
|
4648
|
+
input = await this.normalizeAsync(input, flags);
|
|
4649
|
+
// Filter the input using hooked up filters
|
|
4650
|
+
input = await this.filterAsync(input, 'input');
|
|
4651
|
+
// Apply phonetic processors if configured
|
|
4652
|
+
if (processors?.phonetic)
|
|
4653
|
+
input = await this.indexAsync(input, processors.phonetic);
|
|
4654
|
+
return input;
|
|
4655
|
+
}
|
|
4656
|
+
/**
|
|
4657
|
+
* Asynchronously computes the phonetic index for the given input using
|
|
4658
|
+
* the specified phonetic algorithm.
|
|
4659
|
+
*
|
|
4660
|
+
* @param {MetricInput} input - The input string or array
|
|
4661
|
+
* @param {{ algo: string, opt?: PhoneticOptions }} options - The phonetic algorithm and options
|
|
4662
|
+
* @returns {Promise<MetricInput>} - The phonetic index for the given input
|
|
4663
|
+
*/
|
|
4664
|
+
async indexAsync(input, { algo, opt }) {
|
|
4665
|
+
this.assert('phonetic', algo);
|
|
4666
|
+
const phonetic = factory.phonetic(algo, opt);
|
|
4667
|
+
const delimiter = opt?.delimiter ?? ' ';
|
|
4668
|
+
return Array.isArray(input)
|
|
4669
|
+
? Promise.all(input.map(s => phonetic.getIndexAsync(s).then(r => r.join(delimiter))))
|
|
4670
|
+
: phonetic.getIndexAsync(input).then(r => r.join(delimiter));
|
|
4671
|
+
}
|
|
4672
|
+
/**
|
|
4673
|
+
* Asynchronously computes the metric result for the given inputs, applying
|
|
4674
|
+
* normalization and filtering as configured.
|
|
4675
|
+
*
|
|
4676
|
+
* @template T - The type of the metric result
|
|
4677
|
+
* @param {MetricInput} a - The first input string or array
|
|
4678
|
+
* @param {MetricInput} b - The second input string or array
|
|
4679
|
+
* @param {CmpStrOptions} [opt] - Optional options to use
|
|
4680
|
+
* @param {MetricMode} [mode='single'] - The metric mode to use
|
|
4681
|
+
* @param {boolean} [raw=false] - Whether to return raw results
|
|
4682
|
+
* @param {boolean} [skip=false] - Whether to skip normalization and filtering
|
|
4683
|
+
* @returns {Promise<T>} - The computed metric result
|
|
4684
|
+
*/
|
|
4685
|
+
async computeAsync(a, b, opt, mode, raw, skip) {
|
|
4686
|
+
const resolved = this.resolveOptions(opt);
|
|
4687
|
+
this.assert('metric', resolved.metric);
|
|
4688
|
+
// Prepare the input
|
|
4689
|
+
const A = skip ? a : await this.prepareAsync(a, resolved);
|
|
4690
|
+
const B = skip ? b : await this.prepareAsync(b, resolved);
|
|
4691
|
+
// Get the metric class
|
|
4692
|
+
const metric = factory.metric(resolved.metric, A, B, resolved.opt);
|
|
4693
|
+
// Pass the original inputs to the metric
|
|
4694
|
+
if (resolved.output !== 'prep')
|
|
4695
|
+
metric.setOriginal(a, b);
|
|
4696
|
+
// Compute the metric result
|
|
4697
|
+
await metric.runAsync(mode);
|
|
4698
|
+
// Post-process the results and concat the original inputs
|
|
4699
|
+
const result = this.postProcess(metric.getResults(), resolved);
|
|
4700
|
+
// Resolve and return the result based on the raw flag
|
|
4701
|
+
return this.output(result, raw ?? resolved.raw);
|
|
4702
|
+
}
|
|
4703
|
+
/**
|
|
4704
|
+
* ---------------------------------------------------------------------------------
|
|
4705
|
+
* Public asynchronous core methods for string comparison
|
|
4706
|
+
* ---------------------------------------------------------------------------------
|
|
4707
|
+
*
|
|
4708
|
+
* These methods provide the asynchronous core functionality for string comparison,
|
|
4709
|
+
* phonetic indexing and text search, allowing for non-blocking operations.
|
|
4710
|
+
*/
|
|
4711
|
+
/**
|
|
4712
|
+
* Asynchronously performs a single metric comparison.
|
|
4713
|
+
*
|
|
4714
|
+
* @template T - The type of the metric result
|
|
4715
|
+
* @param {string} a - The source string
|
|
4716
|
+
* @param {string} b - The target string
|
|
4717
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4718
|
+
* @returns {Promise<T>} - The metric result
|
|
4719
|
+
*/
|
|
4720
|
+
async testAsync(a, b, opt) {
|
|
4721
|
+
return this.computeAsync(a, b, opt, 'single');
|
|
4722
|
+
}
|
|
4723
|
+
/**
|
|
4724
|
+
* Asynchronously performs a single metric comparison returning the numeric score.
|
|
4725
|
+
*
|
|
4726
|
+
* @param {string} a - The source string
|
|
4727
|
+
* @param {string} b - The target string
|
|
4728
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4729
|
+
* @returns {Promise<number>} - The similarity score (0..1)
|
|
4730
|
+
*/
|
|
4731
|
+
async compareAsync(a, b, opt) {
|
|
4732
|
+
return (await this.computeAsync(a, b, opt, 'single', true)).res;
|
|
4733
|
+
}
|
|
4734
|
+
/**
|
|
4735
|
+
* Asynchronously performs a batch metric comparison between source and target
|
|
4736
|
+
* strings or array of strings.
|
|
4737
|
+
*
|
|
4738
|
+
* @template T - The type of the metric result
|
|
4739
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4740
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4741
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4742
|
+
* @returns {Promise<T>} - The batch metric results
|
|
4743
|
+
*/
|
|
4744
|
+
async batchTestAsync(a, b, opt) {
|
|
4745
|
+
return this.computeAsync(a, b, opt, 'batch');
|
|
4746
|
+
}
|
|
4747
|
+
/**
|
|
4748
|
+
* Asynchronously performs a batch metric comparison and returns results sorted by score.
|
|
4749
|
+
*
|
|
4750
|
+
* @template T - The type of the metric result
|
|
4751
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4752
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4753
|
+
* @param {'desc'|'asc'} [dir='desc'] - Sort direction (desc, asc)
|
|
4754
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4755
|
+
* @returns {Promise<T>} - The sorted batch results
|
|
4756
|
+
*/
|
|
4757
|
+
async batchSortedAsync(a, b, dir = 'desc', opt) {
|
|
4758
|
+
const res = await this.computeAsync(a, b, opt, 'batch', true);
|
|
4759
|
+
return this.output(res.sort((a, b) => dir === 'asc' ? a.res - b.res : b.res - a.res), opt?.raw ?? this.options.raw);
|
|
4760
|
+
}
|
|
4761
|
+
/**
|
|
4762
|
+
* Asynchronously performs a pairwise metric comparison between source and target
|
|
4763
|
+
* strings or array of strings.
|
|
4764
|
+
*
|
|
4765
|
+
* @template T - The type of the metric result
|
|
4766
|
+
* Input arrays need to be of the same length to perform pairwise comparison,
|
|
4767
|
+
* otherwise the method will throw an error.
|
|
4768
|
+
*
|
|
4769
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4770
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4771
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4772
|
+
* @returns {Promise<T>} - The pairwise metric results
|
|
4773
|
+
*/
|
|
4774
|
+
async pairsAsync(a, b, opt) {
|
|
4775
|
+
return this.computeAsync(a, b, opt, 'pairwise');
|
|
4776
|
+
}
|
|
4777
|
+
/**
|
|
4778
|
+
* Asynchronously performs a batch comparison and returns only results above the threshold.
|
|
4779
|
+
*
|
|
4780
|
+
* @template T - The type of the metric result
|
|
4781
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4782
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4783
|
+
* @param {number} threshold - The similarity threshold (0..1)
|
|
4784
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4785
|
+
* @returns {Promise<T>} - The filtered batch results
|
|
4786
|
+
*/
|
|
4787
|
+
async matchAsync(a, b, threshold, opt) {
|
|
4788
|
+
const res = await this.computeAsync(a, b, opt, 'batch', true);
|
|
4789
|
+
return this.output(res.filter(r => r.res >= threshold).sort((a, b) => b.res - a.res), opt?.raw ?? this.options.raw);
|
|
4790
|
+
}
|
|
4791
|
+
/**
|
|
4792
|
+
* Asynchronously returns the n closest matches from a batch comparison.
|
|
4793
|
+
*
|
|
4794
|
+
* @template T - The type of the metric result
|
|
4795
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4796
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4797
|
+
* @param {number} [n=1] - Number of closest matches
|
|
4798
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4799
|
+
* @returns {Promise<T>} - The closest matches
|
|
4800
|
+
*/
|
|
4801
|
+
async closestAsync(a, b, n = 1, opt) {
|
|
4802
|
+
return (await this.batchSortedAsync(a, b, 'desc', opt)).slice(0, n);
|
|
4803
|
+
}
|
|
4804
|
+
/**
|
|
4805
|
+
* Asynchronously returns the n furthest matches from a batch comparison.
|
|
4806
|
+
*
|
|
4807
|
+
* @template T - The type of the metric result
|
|
4808
|
+
* @param {MetricInput} a - The source string or array of strings
|
|
4809
|
+
* @param {MetricInput} b - The target string or array of strings
|
|
4810
|
+
* @param {number} [n=1] - Number of furthest matches
|
|
4811
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4812
|
+
* @returns {Promise<T>} - The furthest matches
|
|
4813
|
+
*/
|
|
4814
|
+
async furthestAsync(a, b, n = 1, opt) {
|
|
4815
|
+
return (await this.batchSortedAsync(a, b, 'asc', opt)).slice(0, n);
|
|
4816
|
+
}
|
|
4817
|
+
/**
|
|
4818
|
+
* Asynchronously performs a normalized and filtered substring search.
|
|
4819
|
+
*
|
|
4820
|
+
* @param {string} needle - The search string
|
|
4821
|
+
* @param {string[]} haystack - The array to search in
|
|
4822
|
+
* @param {NormalizeFlags} [flags] - Normalization flags
|
|
4823
|
+
* @param {CmpStrProcessors} [processors] - Pre-processors to apply
|
|
4824
|
+
* @returns {Promise<string[]>} - Array of matching entries
|
|
4825
|
+
*/
|
|
4826
|
+
async searchAsync(needle, haystack, flags, processors) {
|
|
4827
|
+
const resolved = this.resolveOptions({ flags, processors });
|
|
4828
|
+
// Prepare the needle and haystack, normalizing and filtering them
|
|
4829
|
+
const test = await this.prepareAsync(needle, resolved);
|
|
4830
|
+
const hstk = await this.prepareAsync(haystack, resolved);
|
|
4831
|
+
// Filter the haystack based on the normalized test string
|
|
4832
|
+
return haystack.filter((_, i) => hstk[i].includes(test));
|
|
4833
|
+
}
|
|
4834
|
+
/**
|
|
4835
|
+
* Asynchronously computes a similarity matrix for the given input array.
|
|
4836
|
+
*
|
|
4837
|
+
* @param {string[]} input - The input array
|
|
4838
|
+
* @param {CmpStrOptions} [opt] - Optional options
|
|
4839
|
+
* @returns {Promise<number[][]>} - The similarity matrix
|
|
4840
|
+
*/
|
|
4841
|
+
async matrixAsync(input, opt) {
|
|
4842
|
+
input = await this.prepareAsync(input, this.resolveOptions(opt));
|
|
4843
|
+
return Promise.all(input.map(async (a) => (await this.computeAsync(a, input, undefined, 'batch', true, true).then(r => r.map(b => b.res ?? 0)))));
|
|
4844
|
+
}
|
|
4845
|
+
/**
|
|
4846
|
+
* Asynchronously computes the phonetic index for a string using the
|
|
4847
|
+
* configured or given algorithm.
|
|
4848
|
+
*
|
|
4849
|
+
* @param {string} [input] - The input string
|
|
4850
|
+
* @param {string} [algo] - The phonetic algorithm to use
|
|
4851
|
+
* @param {PhoneticOptions} [opt] - Optional phonetic options
|
|
4852
|
+
* @returns {Promise<string>} - The phonetic index as a string
|
|
4853
|
+
*/
|
|
4854
|
+
async phoneticIndexAsync(input, algo, opt) {
|
|
4855
|
+
const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
|
|
4856
|
+
return this.indexAsync(input, {
|
|
4857
|
+
algo: (algo ?? a), opt: opt ?? o
|
|
4858
|
+
});
|
|
4859
|
+
}
|
|
4860
|
+
}
|
|
4861
|
+
|
|
4862
|
+
export { CmpStr, CmpStrAsync, DiffChecker, Normalizer, TextAnalyzer };
|
|
4863
|
+
//# sourceMappingURL=CmpStr.esm.js.map
|