wingbot 3.67.8 → 3.67.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +8 -0
- package/package.json +6 -6
- package/src/Ai.js +6 -4
- package/src/fuzzy/factoryFuzzySearch.js +243 -0
- package/src/fuzzy/fuzzyUtils.js +91 -0
- package/src/fuzzy/index.js +40 -0
- package/src/fuzzy/levenshtein.js +228 -0
- package/src/fuzzy/normalize.js +62 -0
- package/src/fuzzy/prepareFuzzyIndex.js +198 -0
- package/src/utils/tokenizer.js +64 -7
- package/src/wingbot/CustomEntityDetectionModel.js +3 -3
package/index.js
CHANGED
|
@@ -64,6 +64,9 @@ const {
|
|
|
64
64
|
} = require('./src/analytics/consts');
|
|
65
65
|
|
|
66
66
|
const { version: wingbotVersion } = require('./package.json');
|
|
67
|
+
const { fuzzy } = require('./src/fuzzy');
|
|
68
|
+
const prepareFuzzyIndex = require('./src/fuzzy/prepareFuzzyIndex');
|
|
69
|
+
const factoryFuzzySearch = require('./src/fuzzy/factoryFuzzySearch');
|
|
67
70
|
|
|
68
71
|
module.exports = {
|
|
69
72
|
|
|
@@ -109,6 +112,11 @@ module.exports = {
|
|
|
109
112
|
plugins,
|
|
110
113
|
vars,
|
|
111
114
|
|
|
115
|
+
// FUZZY
|
|
116
|
+
fuzzy,
|
|
117
|
+
prepareFuzzyIndex,
|
|
118
|
+
factoryFuzzySearch,
|
|
119
|
+
|
|
112
120
|
// Notifications
|
|
113
121
|
Notifications,
|
|
114
122
|
NotificationsStorage,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "wingbot",
|
|
3
|
-
"version": "3.67.8",
|
|
3
|
+
"version": "3.67.10",
|
|
4
4
|
"description": "Enterprise Messaging Bot Conversation Engine",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"scripts": {
|
|
@@ -54,18 +54,18 @@
|
|
|
54
54
|
},
|
|
55
55
|
"dependencies": {
|
|
56
56
|
"@amplitude/ua-parser-js": "^0.7.33",
|
|
57
|
-
"compress-json": "^
|
|
57
|
+
"compress-json": "^3.0.0",
|
|
58
58
|
"deep-extend": "^0.6.0",
|
|
59
59
|
"form-data": "^4.0.0",
|
|
60
|
-
"graphql": "^16.8.
|
|
61
|
-
"jsonwebtoken": "^9.0.
|
|
60
|
+
"graphql": "^16.8.1",
|
|
61
|
+
"jsonwebtoken": "^9.0.2",
|
|
62
62
|
"node-fetch": "^2.6.7",
|
|
63
63
|
"path-to-regexp": "^6.2.1",
|
|
64
|
-
"uuid": "^9.0.
|
|
64
|
+
"uuid": "^9.0.1",
|
|
65
65
|
"webalize": "^0.1.0"
|
|
66
66
|
},
|
|
67
67
|
"optionalDependencies": {
|
|
68
|
-
"axios": "^
|
|
68
|
+
"axios": "^1.6.4",
|
|
69
69
|
"handlebars": "^4.0.0"
|
|
70
70
|
}
|
|
71
71
|
}
|
package/src/Ai.js
CHANGED
|
@@ -59,7 +59,7 @@ let uq = 1;
|
|
|
59
59
|
|
|
60
60
|
/**
|
|
61
61
|
* @callback WordEntityDetectorFactory
|
|
62
|
-
* @returns {Promise<
|
|
62
|
+
* @returns {Promise<WordDetectorData>}
|
|
63
63
|
*/
|
|
64
64
|
|
|
65
65
|
/** @typedef {[string,EntityDetector|RegExp,DetectorOptions]} DetectorArgs */
|
|
@@ -241,11 +241,14 @@ class Ai {
|
|
|
241
241
|
* @returns {T}
|
|
242
242
|
* @memberOf Ai
|
|
243
243
|
*/
|
|
244
|
-
register (model, prefix = this.DEFAULT_PREFIX) {
|
|
244
|
+
register (model = null, prefix = this.DEFAULT_PREFIX) {
|
|
245
245
|
/** @type {T} */
|
|
246
246
|
let modelObj;
|
|
247
247
|
|
|
248
|
-
if (
|
|
248
|
+
if (!model) {
|
|
249
|
+
// @ts-ignore
|
|
250
|
+
modelObj = new CustomEntityDetectionModel({ prefix });
|
|
251
|
+
} else if (typeof model === 'string') {
|
|
249
252
|
// @ts-ignore
|
|
250
253
|
modelObj = new WingbotModel({
|
|
251
254
|
model,
|
|
@@ -803,7 +806,6 @@ class Ai {
|
|
|
803
806
|
if (!req.isText()) {
|
|
804
807
|
return;
|
|
805
808
|
}
|
|
806
|
-
|
|
807
809
|
if (this._keyworders.size !== 0) {
|
|
808
810
|
const model = this._getModelForRequest(req);
|
|
809
811
|
if (!model) {
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @author David Menger
|
|
3
|
+
*/
|
|
4
|
+
'use strict';
|
|
5
|
+
|
|
6
|
+
const { shortArrayIndex, splitToNgrams, cleanup } = require('./fuzzyUtils');
|
|
7
|
+
const {
|
|
8
|
+
relativeLevenshtein, SEED_FUZZY, SEED_FUZZY_MULTIPLICATOR, WORD_HANDICAP_K_FUZZY
|
|
9
|
+
} = require('./levenshtein');
|
|
10
|
+
|
|
11
|
+
const LOWER_DUPLICATES = 0.9;
|
|
12
|
+
|
|
13
|
+
/**
 * Picks the inclusive range of id-list slots of an index entry to scan.
 *
 * An entry with exactly two elements carries a single id list at slot 1,
 * so only that slot is scanned. Otherwise the slots are derived from
 * 60 % (rounded up) and 150 % (rounded down) of the query n-gram count.
 *
 * @param {number} ngrams - number of n-grams in the query
 * @param {Array} tfEntry - index entry (idf followed by id lists)
 * @returns {number[]} [firstSlot, lastSlot]
 */
function getIndexesToIterate (ngrams, tfEntry) {
    const hasSingleList = tfEntry.length === 2;
    if (hasSingleList) {
        return [1, 1];
    }
    const lowerBound = shortArrayIndex(Math.ceil(ngrams * 0.6));
    const upperBound = shortArrayIndex(Math.floor(ngrams * 1.5));
    return [lowerBound, upperBound];
}
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* @typedef {object} FuzzySearchOptions
|
|
24
|
+
* @prop {boolean} [keepMultipleValues]
|
|
25
|
+
* @prop {Stemmer} [stemmer]
|
|
26
|
+
* @prop {number} [threshold]
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
/** @typedef {import('./prepareFuzzyIndex').FuzzyIndexData} FuzzyIndexData */
|
|
30
|
+
/** @typedef {import('./prepareFuzzyIndex').Stemmer} Stemmer */
|
|
31
|
+
/** @typedef {import('../Ai').WordEntityDetector} WordEntityDetector */
|
|
32
|
+
/** @typedef {import('../Ai').WordDetectorData} WordDetectorData */
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* @typedef {object} Entity
|
|
36
|
+
* @prop {string} entity
|
|
37
|
+
* @prop {string} value
|
|
38
|
+
* @prop {string[]} [synonyms]
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
/**
 * Creates the detector function working over a prepared fuzzy index.
 *
 * @param {Map<string, Array>} indexMap - n-gram => [idf, ...idLists]
 * @param {Array} ngramCounts - per indexed text: [ngramCount, entityIndex, cleanText]
 * @param {Array} entities - per entity: [entity, value]
 * @param {number} maxIdf - highest idf in the index
 * @param {object} options
 * @param {Function} [options.stemmer]
 * @param {boolean} [options.keepMultipleValues] - keep one result per entity+value
 *   instead of one per entity
 * @param {number} [options.threshold] - results scoring lower are dropped
 * @param {number} [options.limit] - max number of returned results
 * @param {boolean} [hasFuzzyMultiplier] - selects the seed and the n-gram match ratio
 * @returns {WordEntityDetector}
 */
function searchFnFactory (indexMap, ngramCounts, entities, maxIdf, {
    stemmer = null,
    keepMultipleValues = false,
    threshold = 0.5,
    limit = undefined
}, hasFuzzyMultiplier = false) {
    // both depend only on the index, not on the query
    const levenshteinSeed = hasFuzzyMultiplier
        ? SEED_FUZZY_MULTIPLICATOR
        : SEED_FUZZY;
    const percentage = hasFuzzyMultiplier
        ? 0.6
        : 0.5;

    /** @type {WordEntityDetector} */
    const searchFn = (search) => {
        const cleanQuery = cleanup(search, stemmer);
        const tokens = splitToNgrams(cleanQuery);
        const results = new Map();

        // collect candidates: how many query n-grams each indexed text shares
        tokens.forEach((token) => {
            const entry = indexMap.get(token);
            if (!entry) {
                return;
            }
            const [idf] = entry;
            const [startIndex, endIndex] = getIndexesToIterate(tokens.length, entry);

            const maxIndex = Math.min(endIndex, entry.length - 1);
            for (let i = startIndex; i <= maxIndex; i++) {
                for (const id of entry[i]) {
                    let res = results.get(id);
                    if (!res) {
                        res = { cnt: 0, idf: 0 };
                        results.set(id, res);
                    }
                    res.cnt++;
                    res.idf += idf;
                }
            }
        });

        // query with a leading word of up to three characters removed;
        // it is the same for every candidate, so prepare it once
        const without = cleanQuery.match(/^[^\s]{1,3}\s+.{6,}$/)
            ? cleanQuery.replace(/^[^\s]{1,3}\s+/, '')
            : null;

        let maxScore = 0;
        let maxRelIdf = 0;

        const preprocessed = Array.from(results.entries())
            .filter(([id, { cnt }]) => {
                const [ngramCount] = ngramCounts[id];
                const percentageOfMatchedNgrams = (cnt * 2) / (ngramCount + tokens.length);
                return percentageOfMatchedNgrams >= percentage;
            })
            .map(([id, { cnt, idf }]) => {
                const [, entityIndex, cleanText] = ngramCounts[id];
                const [entity, value] = entities[entityIndex];
                // maxIdf is 0 for an index where no n-gram has a positive idf;
                // dividing by it would make the score NaN and NaN passes
                // the `score < threshold` filter below
                const relIdf = maxIdf > 0 ? (idf / cnt) / maxIdf : 0;
                let score = relativeLevenshtein(
                    cleanText,
                    cleanQuery,
                    levenshteinSeed,
                    WORD_HANDICAP_K_FUZZY
                );
                let start = 0;

                if (without !== null) {
                    const altScore = relativeLevenshtein(
                        cleanText,
                        without,
                        levenshteinSeed,
                        WORD_HANDICAP_K_FUZZY
                    );

                    if (altScore > score) {
                        score = altScore;
                        start = cleanQuery.length - without.length;
                    }
                }

                if (maxScore < score) maxScore = score;
                if (maxRelIdf < relIdf) maxRelIdf = relIdf;

                return {
                    relIdf,
                    result: {
                        entity,
                        value,
                        score,
                        ...(start ? { start } : {})
                    }
                };
            });

        // add a bonus (at most half of the gap between the best score and 1)
        // proportional to the candidate's relative idf
        const found = preprocessed.map(({ relIdf, result }) => {
            const koef = maxRelIdf <= 0 ? relIdf : (relIdf / maxRelIdf);
            const addToScore = ((1 - maxScore) / 2) * koef;

            return Object.assign(result, {
                score: Math.round((result.score + addToScore) * 10000) / 10000
            });
        });

        found.sort((a, z) => z.score - a.score);

        const known = new Map();
        const res = found
            .filter((result) => {
                const key = keepMultipleValues ? `${result.entity}|${result.value}` : result.entity;
                if (result.score < threshold) {
                    return false;
                }
                if (known.has(key)) {
                    const { result: origResult, score, alts } = known.get(key);
                    if (!keepMultipleValues
                        && Math.abs(score - result.score) < (1 - LOWER_DUPLICATES)
                        && origResult.value !== result.value) {

                        if (!alts.some((a) => a.value === result.value)) {
                            // lower the winner by (1 - LOWER_DUPLICATES) for each
                            // distinct colliding value
                            origResult.score *= LOWER_DUPLICATES;
                        }

                        alts.push(result);

                        Object.assign(origResult, {
                            alternatives: alts
                        });
                    }
                    return false;
                }
                known.set(key, { result, score: result.score, alts: [] });
                return true;
            })
            .slice(0, limit);

        res.forEach((entity) => {
            if ('alternatives' in entity) {
                // @ts-ignore
                let { alternatives } = entity;

                // values already listed (starting with the winner's own value)
                const kn = new Set([entity.value]);
                alternatives = alternatives
                    // @ts-ignore
                    .sort((a, z) => z.score - a.score)
                    // keep only the best scoring alternative of each value
                    .filter((e) => {
                        if (kn.has(e.value)) {
                            return false;
                        }
                        kn.add(e.value);
                        return true;
                    });

                // @ts-ignore
                for (let i = 0; i < alternatives.length; i++) {
                    const alt = alternatives[i];
                    // @ts-ignore
                    Object.assign(alt, {
                        // @ts-ignore
                        score: alt.score * (LOWER_DUPLICATES ** alternatives.length)
                    });
                }

                Object.assign(entity, { alternatives });
            }
        });

        return res;
    };

    return searchFn;
}
|
|
209
|
+
|
|
210
|
+
/**
 * Turns prepared index data into a word entity detector.
 *
 * The serialized `indexArray` is converted back to a Map and handed,
 * together with the rest of the index, to the search function factory.
 *
 * @param {FuzzyIndexData} data
 * @param {FuzzySearchOptions} [options]
 * @returns {WordDetectorData}
 */
function factoryFuzzySearch (data, options = {}) {
    const indexMap = new Map(data.indexArray);

    const detector = searchFnFactory(
        indexMap,
        data.ngramCounts,
        data.entities,
        data.maxIdf,
        options,
        data.hasFuzzyMultiplier
    );

    return { detector, maxWordCount: data.maxWordCount };
}
|
|
242
|
+
|
|
243
|
+
module.exports = factoryFuzzySearch;
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @author David Menger
|
|
3
|
+
*/
|
|
4
|
+
'use strict';
|
|
5
|
+
|
|
6
|
+
const { normalize } = require('./normalize');
|
|
7
|
+
|
|
8
|
+
const SHORTEN_BY = 2;
|
|
9
|
+
const NGRAMS = 3;
|
|
10
|
+
|
|
11
|
+
/**
 * Passes the input through `normalize()` and then replaces every run
 * of characters other than a-z and 0-9 with a single space.
 *
 * @param {string|number} word
 * @returns {string} trimmed text
 */
function preNormalize (word) {
    const spaced = normalize(word).replace(/[^a-z0-9]+/g, ' ');
    return spaced.trim();
}
|
|
21
|
+
|
|
22
|
+
/**
 * Appends the stemmed form of every word to the normalized text.
 *
 * Without a stemmer the text is returned untouched. An empty text
 * is returned untouched as well, so the result is the same with or without
 * a stemmer (joining an empty text with its empty stem would otherwise
 * produce a single space).
 *
 * @param {string} normalized - text with words separated by whitespace
 * @param {Function} [stemmer] - word => stem; a falsy stem keeps the word
 * @returns {string} `"<normalized> <stems joined by space>"`
 */
function stem (normalized, stemmer) {
    if (!stemmer || !normalized) {
        return normalized;
    }

    const stems = normalized
        .split(/\s+/g)
        .map((w) => stemmer(w) || w);

    return `${normalized} ${stems.join(' ')}`;
}
|
|
33
|
+
|
|
34
|
+
/** @typedef {{ (word: string): string}} Stemmer */
|
|
35
|
+
|
|
36
|
+
/**
 * Full cleanup of a raw text: pre-normalization followed by stemming.
 *
 * @param {string|number} word
 * @param {Stemmer} stemmer
 * @returns {string}
 */
function cleanup (word, stemmer) {
    return stem(preNormalize(word), stemmer);
}
|
|
46
|
+
|
|
47
|
+
/**
 * Cleanup for a text which has already been pre-normalized:
 * only the stemming step is applied.
 *
 * @param {string} normalized
 * @param {Stemmer} stemmer
 * @returns {string}
 */
function cleanupPreNormalized (normalized, stemmer) {
    const stemmed = stem(normalized, stemmer);
    return stemmed;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Maps an n-gram count to a slot number: the count divided by
 * SHORTEN_BY, rounded down, plus one (so the lowest slot is 1).
 *
 * @param {number} ngramCount
 * @returns {number}
 */
function shortArrayIndex (ngramCount) {
    const shortened = Math.floor(ngramCount / SHORTEN_BY);
    return shortened + 1;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Splits a text into overlapping n-grams of NGRAMS characters.
 * The text is padded with one space on each side first.
 *
 * @param {string} word
 * @returns {string[]} n-grams in order of appearance; when the padded text
 *   is shorter than one n-gram, the padded text itself (or nothing
 *   for an empty input)
 */
function splitToNgrams (word) {
    const padded = ` ${word} `;
    const count = padded.length - NGRAMS + 1;

    if (count <= 0) {
        return word.length > 0 ? [padded] : [];
    }

    return Array.from(
        { length: count },
        (_, position) => padded.substring(position, position + NGRAMS)
    );
}
|
|
84
|
+
|
|
85
|
+
module.exports = {
|
|
86
|
+
cleanup,
|
|
87
|
+
shortArrayIndex,
|
|
88
|
+
splitToNgrams,
|
|
89
|
+
cleanupPreNormalized,
|
|
90
|
+
preNormalize
|
|
91
|
+
};
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @author David Menger
|
|
3
|
+
*/
|
|
4
|
+
'use strict';
|
|
5
|
+
|
|
6
|
+
const factoryFuzzySearch = require('./factoryFuzzySearch');
|
|
7
|
+
const prepareFuzzyIndex = require('./prepareFuzzyIndex');
|
|
8
|
+
|
|
9
|
+
/** @typedef {import('./factoryFuzzySearch').Entity} Entity */
|
|
10
|
+
/** @typedef {import('./factoryFuzzySearch').FuzzySearchOptions} FuzzySearchOptions */
|
|
11
|
+
/** @typedef {import('../Ai').WordEntityDetectorFactory} WordEntityDetectorFactory */
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* @callback EntityFactory
|
|
15
|
+
* @returns {Promise<Entity[]>}
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
/**
 * Creates a detector factory from a list of entities or from
 * an async function providing them.
 *
 * The index is built each time the returned factory is called;
 * an entity function is awaited on every such call.
 *
 * @param {Entity[]|EntityFactory} entities
 * @param {FuzzySearchOptions} options
 * @returns {WordEntityDetectorFactory}
 */
function fuzzy (entities, options = {}) {

    return async () => {
        let data = entities;
        if (typeof entities === 'function') {
            data = await entities();
        }

        return factoryFuzzySearch(prepareFuzzyIndex(data, options), options);
    };
}
|
|
35
|
+
|
|
36
|
+
module.exports = {
|
|
37
|
+
fuzzy,
|
|
38
|
+
prepareFuzzyIndex,
|
|
39
|
+
factoryFuzzySearch
|
|
40
|
+
};
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @author David Menger
|
|
3
|
+
*/
|
|
4
|
+
'use strict';
|
|
5
|
+
|
|
6
|
+
const NUMERIC_KOEF = 4;
|
|
7
|
+
const SUFFIX_WEIGHT = 0.055;
|
|
8
|
+
|
|
9
|
+
const SEED_DEFAULT = 0.5;
|
|
10
|
+
const SEED_FUZZY = 0.25;
|
|
11
|
+
const SEED_FUZZY_MULTIPLICATOR = -0.25;
|
|
12
|
+
|
|
13
|
+
const WORD_HANDICAP_K_DEFAULT = 0.9;
|
|
14
|
+
const WORD_HANDICAP_K_FUZZY = 0.6;
|
|
15
|
+
|
|
16
|
+
/**
 * Single cell step of the edit distance computation.
 *
 * When `d1` is not greater than both neighbours, the result is `d1`
 * (plus one when the compared char codes differ). Otherwise it is
 * the smaller of `d0` and `d2` plus one.
 *
 * @param {number} d0
 * @param {number} d1
 * @param {number} d2
 * @param {number} bx - char code from one string
 * @param {number} ay - char code from the other string
 * @returns {number}
 */
function _min (d0, d1, d2, bx, ay) {
    const neighbourIsSmaller = d0 < d1 || d2 < d1;

    if (!neighbourIsSmaller) {
        return bx === ay ? d1 : d1 + 1;
    }

    const smaller = d0 > d2 ? d2 : d0;
    return smaller + 1;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Edit distance of two strings.
 *
 * The common suffix and prefix are skipped first. The rest is processed
 * with a single working vector, which interleaves the running distance
 * with the char code of the shorter string; the longer string is consumed
 * four characters at a time, the remainder one by one.
 *
 * @param {string} left
 * @param {string} right
 * @returns {number}
 */
function levenshtein (left, right) {
    if (left === right) {
        return 0;
    }

    // `shorter` must not be longer than `longer`
    const swap = left.length > right.length;
    const shorter = swap ? right : left;
    const longer = swap ? left : right;

    let shortLen = shorter.length;
    let longLen = longer.length;

    // skip the common suffix
    while (shortLen > 0
        && shorter.charCodeAt(shortLen - 1) === longer.charCodeAt(longLen - 1)) {
        shortLen -= 1;
        longLen -= 1;
    }

    // skip the common prefix
    let offset = 0;
    while (offset < shortLen
        && shorter.charCodeAt(offset) === longer.charCodeAt(offset)) {
        offset += 1;
    }

    shortLen -= offset;
    longLen -= offset;

    if (shortLen === 0 || longLen < 3) {
        return longLen;
    }

    // even slots: distances, odd slots: char codes of the shorter string
    const vector = [];
    for (let y = 0; y < shortLen; y++) {
        vector.push(y + 1, shorter.charCodeAt(offset + y));
    }

    const len = vector.length - 1;

    let x = 0;
    let dd;

    // four characters of the longer string per pass
    while (x < longLen - 3) {
        let d0 = x;
        let d1 = x + 1;
        let d2 = x + 2;
        let d3 = x + 3;
        const bx0 = longer.charCodeAt(offset + d0);
        const bx1 = longer.charCodeAt(offset + d1);
        const bx2 = longer.charCodeAt(offset + d2);
        const bx3 = longer.charCodeAt(offset + d3);
        x += 4;
        dd = x;
        for (let y = 0; y < len; y += 2) {
            const dy = vector[y];
            const ay = vector[y + 1];
            d0 = _min(dy, d0, d1, bx0, ay);
            d1 = _min(d0, d1, d2, bx1, ay);
            d2 = _min(d1, d2, d3, bx2, ay);
            dd = _min(d2, d3, dd, bx3, ay);
            vector[y] = dd;
            d3 = d2;
            d2 = d1;
            d1 = d0;
            d0 = dy;
        }
    }

    // remaining characters one at a time
    while (x < longLen) {
        let d0 = x;
        const bx0 = longer.charCodeAt(offset + d0);
        x += 1;
        dd = x;
        for (let y = 0; y < len; y += 2) {
            const dy = vector[y];
            dd = _min(dy, d0, dd, bx0, vector[y + 1]);
            vector[y] = dd;
            d0 = dy;
        }
    }

    return dd;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Converts a distance into a score: the share of `len` left after
 * subtracting `value`, scaled by `1 - seed` and added to `base`.
 *
 * @param {number} seed
 * @param {number} len - length the distance is related to
 * @param {number} value - distance
 * @param {number} [base] - defaults to the seed
 * @returns {number}
 */
function addSeed (seed, len, value, base = seed) {
    const matchedShare = (len - value) / len;
    const scaled = matchedShare * (1 - seed);
    return base + scaled;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Similarity score of a query against a training text, derived
 * from edit distances.
 *
 * - every extra word of the query multiplies the score by `wordKoef`
 * - digits are compared once more on their own; when the training text
 *   contains digits, that distance is weighted by NUMERIC_KOEF
 * - texts where the shorter side has at least three characters are split
 *   at a common position into a "stem" and a "suffix" which are scored
 *   separately
 *
 * @param {string} left - training data
 * @param {string} right - query
 * @param {number} [seed]
 * @param {number} [wordKoef]
 * @returns {number} 0 when both texts are empty
 */
function relativeLevenshtein (
    left,
    right,
    seed = SEED_DEFAULT,
    wordKoef = WORD_HANDICAP_K_DEFAULT
) {
    const longest = Math.max(left.length, right.length);
    if (!longest) {
        return 0;
    }

    const countWords = (text) => (text.match(/[^\s]+/g) || ['']).length;
    const onlyDigits = (text) => text.replace(/[^0-9]+/g, '');

    // only words the query has on top of the training text are penalized
    const extraWords = Math.max(0, countWords(right) - countWords(left));
    const wordHandicap = (wordKoef ** extraWords);

    const leftNum = onlyDigits(left);
    const rightNum = onlyDigits(right);
    const numLen = leftNum.length ? leftNum.length * NUMERIC_KOEF : rightNum.length;
    const useNumK = leftNum.length ? NUMERIC_KOEF : 1;
    const numLev = numLen ? levenshtein(leftNum, rightNum) * useNumK : 0;

    let stemLen = Math.min(left.length, right.length);

    if (stemLen < 3) {
        const distance = levenshtein(left, right) + numLev;
        return addSeed(seed, longest + numLen, distance) * wordHandicap;
    }

    let diff = longest - stemLen;

    // keep the suffix at least two characters long
    if (diff <= 2) {
        diff += 2;
        stemLen -= 2;
    }

    const stemLev = levenshtein(left.substring(0, stemLen), right.substring(0, stemLen));
    const suffLev = levenshtein(left.substring(stemLen), right.substring(stemLen));

    // a single edit in the suffix of an otherwise equal stem weighs less
    const weightedDiff = (suffLev === 1 && stemLev === 0) ? diff - 1 : diff;
    const diffWeight = weightedDiff * SUFFIX_WEIGHT;

    const vStem = addSeed(seed, stemLen + numLen, stemLev + numLev, seed - diffWeight);
    const vSuffix = addSeed(1 - diffWeight, diff, suffLev, 0);

    return (vStem + vSuffix) * wordHandicap;
}
|
|
196
|
+
|
|
197
|
+
/**
 * Compares two texts word by word (words at the same position)
 * and returns the average of the per-word scores. A word missing
 * on one side is compared against an empty string.
 *
 * @param {string} left
 * @param {string} right
 * @param {number} seed
 * @param {number} [wordKoef]
 * @returns {number}
 */
function multiwordLevenshtein (left, right, seed, wordKoef = undefined) {
    const leftWords = `${left}`.split(/\s+/g);
    const rightWords = `${right}`.split(/\s+/g);
    const positions = Math.max(leftWords.length, rightWords.length, 1);

    let total = 0;
    let position = 0;

    while (position < positions) {
        const leftWord = leftWords[position] || '';
        const rightWord = rightWords[position] || '';
        total += relativeLevenshtein(leftWord, rightWord, seed, wordKoef);
        position += 1;
    }

    return total / positions;
}
|
|
218
|
+
|
|
219
|
+
module.exports = {
|
|
220
|
+
levenshtein,
|
|
221
|
+
multiwordLevenshtein,
|
|
222
|
+
relativeLevenshtein,
|
|
223
|
+
SEED_DEFAULT,
|
|
224
|
+
SEED_FUZZY,
|
|
225
|
+
SEED_FUZZY_MULTIPLICATOR,
|
|
226
|
+
WORD_HANDICAP_K_FUZZY,
|
|
227
|
+
WORD_HANDICAP_K_DEFAULT
|
|
228
|
+
};
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* @author David Menger
|
|
3
|
+
*/
|
|
4
|
+
'use strict';
|
|
5
|
+
|
|
6
|
+
const { normalize } = require('../utils/tokenizer');
|
|
7
|
+
|
|
8
|
+
/**
 * Removes apostrophes and backticks standing at the beginning or at the end
 * of a word (those inside a word, like in "it's", are kept), collapses
 * whitespace into single spaces and trims the result.
 *
 * Trailing and leading quotes are stripped in two separate passes: a single
 * combined pattern consumes the whitespace between two quoted words
 * ("'foo' 'bar'"), so the opening quote of the second word is not recognized.
 *
 * @param {string} str - input string
 * @returns {string}
 */
function cleanup (str) {
    return str
        // quotes followed by whitespace or by the end of the text
        .replace(/[`']+(\s|$)/g, ' ')
        // quotes preceded by whitespace or by the start of the text
        .replace(/(\s|^)['`]+/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
}
|
|
20
|
+
|
|
21
|
+
/**
 * Normalizes an entity text.
 *
 * In strict mode the text is only lowercased (locale aware), its whitespace
 * is collapsed and the result is trimmed. Otherwise it goes through
 * `normalize()` and `cleanup()`.
 *
 * @param {string} str
 * @param {boolean} strict
 * @returns {string}
 */
function normalizeEntity (str, strict) {
    if (!strict) {
        return cleanup(normalize(str));
    }

    const lowered = `${str}`.toLocaleLowerCase();
    return lowered
        .replace(/\s+/g, ' ')
        .trim();
}
|
|
35
|
+
|
|
36
|
+
/**
 * Normalizes a text, but puts every `@ENTITY` placeholder (uppercase
 * letters, digits and dashes after the `@`) back in its original form.
 *
 * Placeholders are located in the input and written to the same offsets
 * of the normalized text.
 * NOTE(review): this presumes `normalize()` keeps character positions - confirm.
 *
 * @param {string} str
 * @returns {string}
 */
function normalizePreserveEntities (str) {
    let normalized = normalize(str);

    for (const found of str.matchAll(/@[A-Z0-9-]+/g)) {
        const [placeholder] = found;
        const from = found.index;
        const head = normalized.substring(0, from);
        const tail = normalized.substring(from + placeholder.length);

        normalized = `${head}${placeholder}${tail}`;
    }

    return normalized;
}
|
|
56
|
+
|
|
57
|
+
module.exports = {
|
|
58
|
+
normalize,
|
|
59
|
+
cleanup,
|
|
60
|
+
normalizePreserveEntities,
|
|
61
|
+
normalizeEntity
|
|
62
|
+
};
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @author David Menger
|
|
3
|
+
*/
|
|
4
|
+
'use strict';
|
|
5
|
+
|
|
6
|
+
const {
|
|
7
|
+
shortArrayIndex,
|
|
8
|
+
splitToNgrams,
|
|
9
|
+
cleanupPreNormalized,
|
|
10
|
+
preNormalize
|
|
11
|
+
} = require('./fuzzyUtils');
|
|
12
|
+
|
|
13
|
+
const SHORTEN_MIN = 5000;
|
|
14
|
+
|
|
15
|
+
/**
 * Builds the index entry of a single n-gram.
 *
 * The first element of the entry is always the idf. Lists shorter than
 * SHORTEN_MIN are stored as one id list right behind it. Longer lists are
 * divided into slots by the n-gram count of each indexed text
 * (see `shortArrayIndex`); slots which received no id are filled
 * with empty arrays.
 *
 * @param {number} idf
 * @param {*} tfArray - ids of the texts containing the n-gram
 * @param {NgramCount[]} ngramCounts
 * @returns {IndexMapTuple}
 */
function divideTfArray (idf, tfArray, ngramCounts) {
    const isShort = tfArray.length < SHORTEN_MIN;
    if (isShort) {
        return [idf, tfArray];
    }

    /** @type {IndexMapTuple} */
    const entry = [idf];

    tfArray.forEach((id) => {
        const [ngramCount] = ngramCounts[id];
        const slot = shortArrayIndex(ngramCount);
        if (!entry[slot]) {
            entry[slot] = [];
        }
        // @ts-ignore
        entry[slot].push(id);
    });

    // no holes between the used slots
    for (let slot = 1; slot < entry.length; slot += 1) {
        if (!entry[slot]) {
            entry[slot] = [];
        }
    }

    return entry;
}
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* @typedef {object} Entity
|
|
49
|
+
* @prop {boolean} [id]
|
|
50
|
+
* @prop {string} entity
|
|
51
|
+
* @prop {string|number} value
|
|
52
|
+
* @prop {string[]} [synonyms]
|
|
53
|
+
*/
|
|
54
|
+
|
|
55
|
+
/** @typedef {[idf: number, ...index: number[][]]} IndexMapTuple */
|
|
56
|
+
/** @typedef {[entity: string, value: string|number]} EntityIndex */
|
|
57
|
+
/** @typedef {[ngramCount: number, index: number, cleanText: string]} NgramCount */
|
|
58
|
+
/** @typedef {[ngram: string, index: IndexMapTuple]} IndexMapEntry */
|
|
59
|
+
|
|
60
|
+
/** @typedef {Map<string, [number, Set<number>]>} IndexMap */
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* @typedef {object} FuzzyIndexData
|
|
64
|
+
* @prop {NgramCount[]} ngramCounts,
|
|
65
|
+
* @prop {EntityIndex[]} entities,
|
|
66
|
+
* @prop {IndexMapEntry[]} indexArray,
|
|
67
|
+
* @prop {number} maxIdf,
|
|
68
|
+
* @prop {number} tfEntryMaxLen,
|
|
69
|
+
* @prop {number} tfTotal,
|
|
70
|
+
* @prop {number} avgIdf
|
|
71
|
+
* @prop {boolean} hasFuzzyMultiplier
|
|
72
|
+
* @prop {number} maxWordCount
|
|
73
|
+
*/
|
|
74
|
+
|
|
75
|
+
/** @typedef {import('./fuzzyUtils').Stemmer} Stemmer */
|
|
76
|
+
|
|
77
|
+
const DEFAULT_MULTIPLIER = (w) => [w];
|
|
78
|
+
|
|
79
|
+
/**
 * Builds the n-gram index for the fuzzy search.
 *
 * Every entity contributes its value and synonyms (only the synonyms when
 * `id === true` and there are some). Each text is lowercased, optionally
 * multiplied by the `multiplier`, deduplicated within the entity,
 * normalized, stemmed and split into n-grams.
 *
 * @param {Entity[]} data
 * @param {Object} [options]
 * @param {Stemmer} [options.stemmer]
 * @param {Function} [options.multiplier] - (text, firstText) => variants of the text
 * @returns {FuzzyIndexData}
 */
function prepareFuzzyIndex (data, {
    stemmer = null,
    multiplier = DEFAULT_MULTIPLIER
} = {}) {

    /** @type {IndexMap} */
    const indexMap = new Map();

    // registers all n-grams of the text under the given id
    function addItemToIndex (cleanText, id) {
        const tokens = splitToNgrams(cleanText);

        for (const token of tokens) {
            let entry = indexMap.get(token);
            if (!entry) {
                entry = [null, new Set()];
                indexMap.set(token, entry);
            }
            entry[1].add(id);
        }

        return tokens.length;
    }

    function cleanForMultiples (text) {
        // the value of an entity can be a number
        return `${text}`.toLocaleLowerCase().replace(/[^a-z0-9\u00C0-\u017F]+/g, ' ');
    }

    let maxWordCount = 0;
    const entities = new Array(data.length);
    /** @type {NgramCount[]} */
    const ngramCounts = [];

    data.forEach(({
        entity, value, synonyms = [], id = null
    }, index) => {
        const synonymList = Array.isArray(synonyms) ? synonyms : [];

        // flatten synonyms
        const rawTexts = synonymList.length && id === true
            ? synonymList
            : [value, ...synonymList];

        const cleaned = rawTexts
            // skip empty texts, but keep a numeric zero
            .filter((t) => t || t === 0)
            .map((text) => cleanForMultiples(text));

        // each variant only once per entity
        const known = new Set();
        const texts = [];
        for (const text of cleaned) {
            for (const word of multiplier(text, cleaned[0])) {
                if (!known.has(word)) {
                    known.add(word);
                    texts.push(word);
                }
            }
        }

        entities[index] = [entity, value];

        for (const text of texts) {
            const normalized = preNormalize(text);
            const wordCount = normalized.split(/\s+/g).length;
            if (wordCount > maxWordCount) maxWordCount = wordCount;
            const cleanText = cleanupPreNormalized(normalized, stemmer);
            // the id of a text is its position in ngramCounts
            const ngramCount = addItemToIndex(cleanText, ngramCounts.length);
            ngramCounts.push([ngramCount, index, cleanText]);
        }
    });

    let totIdf = 0;
    let maxIdf = 0;
    let tfEntryMaxLen = 0;
    let tfTotal = 0;
    for (const [key, entry] of indexMap.entries()) {
        // NOTE(review): relates to the number of distinct n-grams,
        // not to the number of indexed texts - confirm it's intended
        const idf = Math.log10((indexMap.size / entry[1].size));
        const tfArray = Array.from(entry[1].values());
        const tfEntry = divideTfArray(idf, tfArray, ngramCounts);
        // @ts-ignore
        indexMap.set(key, tfEntry);

        // stats
        tfTotal++;
        totIdf += idf;
        if (maxIdf < idf) maxIdf = idf;
        if (tfEntryMaxLen < tfEntry.length) tfEntryMaxLen = tfEntry.length;
    }
    const indexArray = Array.from(indexMap.entries());
    // an empty index has no average (avoid 0 / 0)
    const avgIdf = indexArray.length ? totIdf / indexArray.length : 0;

    return {
        ngramCounts,
        entities,
        // @ts-ignore
        indexArray,
        maxIdf,
        tfEntryMaxLen,
        tfTotal,
        avgIdf,
        hasFuzzyMultiplier: multiplier !== DEFAULT_MULTIPLIER,
        maxWordCount
    };
}
|
|
197
|
+
|
|
198
|
+
module.exports = prepareFuzzyIndex;
|
package/src/utils/tokenizer.js
CHANGED
|
@@ -87,14 +87,51 @@ const DEFAULT_REMOVAL_MAP = [
|
|
|
87
87
|
{ base: 'w', letters: '\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73' },
|
|
88
88
|
{ base: 'x', letters: '\u0078\u24E7\uFF58\u1E8B\u1E8D' },
|
|
89
89
|
{ base: 'y', letters: '\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF' },
|
|
90
|
-
{ base: 'z', letters: '\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763' }
|
|
90
|
+
{ base: 'z', letters: '\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763' },
|
|
91
|
+
|
|
92
|
+
{ base: '\u0433', letters: '\u0403\u0490\u0491\u0492\u0493\u0413\u0494\u0495\u04F6\u04F7' }, // Г
|
|
93
|
+
{ base: 'i', letters: '\u0406\u0456\u04c0\u0407\u0457\u04CF' },
|
|
94
|
+
{ base: 'j', letters: '\u0408\u0458' },
|
|
95
|
+
{ base: '\u0438', letters: '\u040d\u0419\u0439\u0418\u045D\u048B\u04E2\u04E3\u04E4\u04E5' }, // И
|
|
96
|
+
{ base: 'a', letters: '\u0410\u0430\u04D0\u04D1\u04D2\u04D3' },
|
|
97
|
+
{ base: 'b', letters: '\u0412\u0432' },
|
|
98
|
+
{ base: 'e', letters: '\u0400\u0401\u0415\u0435\u0450\u0451\u0454\u04BC\u04BD\u04BE\u04BF\u04D6\u04D7' },
|
|
99
|
+
{ base: 'h', letters: '\u04BA\u04BB\u04C7\u04C8\u04C9\u04CA' },
|
|
100
|
+
{ base: 'k', letters: '\u040c\u041a\u043A\u045C\u049A\u049B\u049C\u049D\u049E\u049F\u04A0\u04A1\u04C3\u04C4' },
|
|
101
|
+
{ base: 'm', letters: '\u041c\u043C\u04CD\u04CE' },
|
|
102
|
+
{ base: 'h', letters: '\u041d\u043D\u045B\u04A2\u04A3\u04A4\u04A5' },
|
|
103
|
+
{ base: 'o', letters: '\u041e\u043E\u04E6\u04E7\u04E8\u04E9\u04EA\u04EB' },
|
|
104
|
+
{ base: 'p', letters: '\u0420\u0440\u048E\u048F' },
|
|
105
|
+
{ base: 's', letters: '\u0405\u0455' },
|
|
106
|
+
{ base: 'c', letters: '\u0421\u0441\u04AA\u04AB' },
|
|
107
|
+
{ base: 't', letters: '\u0422\u0442\u04AC\u04AD' },
|
|
108
|
+
{ base: 'y', letters: '\u0423\u040E\u0478\u04ee\u04f0\u04ef\u0443\u04f1\u04f2\u04f3\u045E\u04AE\u04AF\u04B0\u04B1' },
|
|
109
|
+
{ base: 'x', letters: '\u0425\u0445\u04A8\u04A9\u04B2\u04B3' }, // Х (H)
|
|
110
|
+
{ base: '\u044C', letters: '\u042C\u048C\u048D' }, // ь
|
|
111
|
+
{ base: '\u0436', letters: '\u0496\u0497\u0416\u04C1\u04C2\u04DC\u04DD' }, // Ж (ZH)
|
|
112
|
+
{ base: '\u0437', letters: '\u0417\u0498\u0499\u04DE\u04DF\u04E0\u04E1' }, // З (ZE)
|
|
113
|
+
{ base: '\u043f', letters: '\u041f\u04A6\u04A7' }, // П (P)
|
|
114
|
+
{ base: '\u0446', letters: '\u0426\u04B4\u04B5' }, // Ц (TSE)
|
|
115
|
+
{ base: '\u0447', letters: '\u0427\u04B6\u04B7\u04B8\u04B9\u04CB\u04CC\u04F4\u04F5' }, // Ч (CHE)
|
|
116
|
+
{ base: '\u0434', letters: '\u041B\u04C5\u04C6' }, // Л (L)
|
|
117
|
+
{ base: '\u044D', letters: '\u042D\u04ED\u04EC' }, // Э (E)
|
|
118
|
+
{ base: '\u044b', letters: '\u042b\u04F8\u04F9' }, // Ы (YER)
|
|
119
|
+
|
|
120
|
+
{ base: 'nj', letters: '\u045A\u040A' }, // њ
|
|
121
|
+
{ base: 'lj', letters: '\u0409\u0459' }, // Љ
|
|
122
|
+
{ base: 'dz', letters: '\u045F\u040F' } // џ
|
|
123
|
+
|
|
91
124
|
];
|
|
92
125
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
126
|
+
/**
 * Lazily created lookup table (single character -> replacement string).
 * Remains `null` until buildDiacriticsMap() has been called.
 *
 * @type {Object<string,string>|null}
 */
let diacriticsMap = null;

/**
 * Fills `diacriticsMap` from DEFAULT_REMOVAL_MAP: every UTF-16 unit listed
 * in an entry's `letters` becomes a key pointing at that entry's `base`.
 * Any previously built table is thrown away and replaced.
 *
 * @returns {void}
 */
function buildDiacriticsMap () {
    diacriticsMap = {};
    DEFAULT_REMOVAL_MAP.forEach(({ base, letters }) => {
        // index access on purpose - keys are single UTF-16 units
        for (let pos = 0; pos < letters.length; pos += 1) {
            diacriticsMap[letters[pos]] = base;
        }
    });
}
|
|
100
137
|
|
|
@@ -106,9 +143,28 @@ for (let i = 0; i < DEFAULT_REMOVAL_MAP.length; i++) {
|
|
|
106
143
|
* @returns {string}
|
|
107
144
|
*/
|
|
108
145
|
function replaceDiacritics (str) {
    // the lookup table is created on first use only
    if (!diacriticsMap) buildDiacriticsMap();

    // characters without a known replacement are kept untouched
    const swapChar = (char) => diacriticsMap[char] || char;

    // only characters outside of U+0000 - U+007E are looked up
    return str.replace(/[^\u0000-\u007E]/g, swapChar); // eslint-disable-line no-control-regex
}
|
|
111
151
|
|
|
152
|
+
/**
 * Turns the input into a lowercased string made of ASCII letters, digits,
 * whitespace, apostrophes, backticks and Cyrillic characters only.
 *
 * - Cyrillic characters (U+0400 - U+04FF) which have an entry in the lookup
 *   table are replaced by it, the remaining ones are kept
 * - any other disallowed character is replaced by its lookup table entry,
 *   or by a single space, when there is none
 *
 * @param {string|number} str
 * @returns {string}
 */
function normalize (str) {
    // the lookup table is created on first use only
    if (!diacriticsMap) buildDiacriticsMap();

    const text = `${str}`;

    // step 1: unify known Cyrillic variants, unknown Cyrillic stays as is
    const cyrillicUnified = text
        .replace(/[\u0400-\u04ff]/g, (char) => diacriticsMap[char] || char);

    // step 2: everything else outside the whitelist is mapped or blanked
    const cleaned = cyrillicUnified
        .replace(/[^A-Za-z0-9\s'`\u0400-\u04ff]/g, (char) => diacriticsMap[char] || ' ');

    return cleaned.toLowerCase();
}
|
|
167
|
+
|
|
112
168
|
/**
|
|
113
169
|
*
|
|
114
170
|
* @param {string} string
|
|
@@ -124,5 +180,6 @@ function tokenize (string) {
|
|
|
124
180
|
|
|
125
181
|
module.exports = {
|
|
126
182
|
replaceDiacritics,
|
|
127
|
-
tokenize
|
|
183
|
+
tokenize,
|
|
184
|
+
normalize
|
|
128
185
|
};
|
|
@@ -65,9 +65,9 @@ const { iterateThroughWords } = require('../utils/ai');
|
|
|
65
65
|
/**
|
|
66
66
|
* @callback WordEntityDetector
|
|
67
67
|
* @param {string} text
|
|
68
|
-
* @param {DetectedEntity[]} entities
|
|
69
|
-
* @param {number} startIndex
|
|
70
|
-
* @param {string} prefix
|
|
68
|
+
* @param {DetectedEntity[]} [entities]
|
|
69
|
+
* @param {number} [startIndex]
|
|
70
|
+
* @param {string} [prefix]
|
|
71
71
|
* @returns {DetectedEntity[]}
|
|
72
72
|
*/
|
|
73
73
|
|