wingbot 3.67.8 → 3.67.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -64,6 +64,9 @@ const {
64
64
  } = require('./src/analytics/consts');
65
65
 
66
66
  const { version: wingbotVersion } = require('./package.json');
67
+ const { fuzzy } = require('./src/fuzzy');
68
+ const prepareFuzzyIndex = require('./src/fuzzy/prepareFuzzyIndex');
69
+ const factoryFuzzySearch = require('./src/fuzzy/factoryFuzzySearch');
67
70
 
68
71
  module.exports = {
69
72
 
@@ -109,6 +112,11 @@ module.exports = {
109
112
  plugins,
110
113
  vars,
111
114
 
115
+ // FUZZY
116
+ fuzzy,
117
+ prepareFuzzyIndex,
118
+ factoryFuzzySearch,
119
+
112
120
  // Notifications
113
121
  Notifications,
114
122
  NotificationsStorage,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "wingbot",
3
- "version": "3.67.8",
3
+ "version": "3.67.9",
4
4
  "description": "Enterprise Messaging Bot Conversation Engine",
5
5
  "main": "index.js",
6
6
  "scripts": {
package/src/Ai.js CHANGED
@@ -59,7 +59,7 @@ let uq = 1;
59
59
 
60
60
  /**
61
61
  * @callback WordEntityDetectorFactory
62
- * @returns {Promise<WordEntityDetector|WordDetectorData>}
62
+ * @returns {Promise<WordDetectorData>}
63
63
  */
64
64
 
65
65
  /** @typedef {[string,EntityDetector|RegExp,DetectorOptions]} DetectorArgs */
@@ -241,11 +241,14 @@ class Ai {
241
241
  * @returns {T}
242
242
  * @memberOf Ai
243
243
  */
244
- register (model, prefix = this.DEFAULT_PREFIX) {
244
+ register (model = null, prefix = this.DEFAULT_PREFIX) {
245
245
  /** @type {T} */
246
246
  let modelObj;
247
247
 
248
- if (typeof model === 'string') {
248
+ if (!model) {
249
+ // @ts-ignore
250
+ modelObj = new CustomEntityDetectionModel({ prefix });
251
+ } else if (typeof model === 'string') {
249
252
  // @ts-ignore
250
253
  modelObj = new WingbotModel({
251
254
  model,
@@ -803,7 +806,6 @@ class Ai {
803
806
  if (!req.isText()) {
804
807
  return;
805
808
  }
806
-
807
809
  if (this._keyworders.size !== 0) {
808
810
  const model = this._getModelForRequest(req);
809
811
  if (!model) {
@@ -0,0 +1,243 @@
1
+ /**
2
+ * @author David Menger
3
+ */
4
+ 'use strict';
5
+
6
+ const { shortArrayIndex, splitToNgrams, cleanup } = require('./fuzzyUtils');
7
+ const {
8
+ relativeLevenshtein, SEED_FUZZY, SEED_FUZZY_MULTIPLICATOR, WORD_HANDICAP_K_FUZZY
9
+ } = require('./levenshtein');
10
+
11
+ const LOWER_DUPLICATES = 0.9;
12
+
13
/**
 * Selects which buckets of an index entry should be scanned for a query.
 *
 * @param {number} ngrams - number of n-grams in the query
 * @param {Array} tfEntry - index entry; item 0 is the idf, the rest are buckets
 * @returns {number[]} `[firstBucket, lastBucket]`, both inclusive
 */
function getIndexesToIterate (ngrams, tfEntry) {
    // a two-item entry has exactly one bucket, so there is nothing to narrow
    const hasSingleBucket = tfEntry.length === 2;
    if (hasSingleBucket) {
        return [1, 1];
    }

    // accept texts having roughly 60% - 150% of the query's n-gram count
    const bounds = [Math.ceil(ngrams * 0.6), Math.floor(ngrams * 1.5)];
    return bounds.map((count) => shortArrayIndex(count));
}
21
+
22
+ /**
23
+ * @typedef {object} FuzzySearchOptions
24
+ * @prop {boolean} [keepMultipleValues]
25
+ * @prop {Stemmer} [stemmer]
26
+ * @prop {number} [threshold]
27
+ */
28
+
29
+ /** @typedef {import('./prepareFuzzyIndex').FuzzyIndexData} FuzzyIndexData */
30
+ /** @typedef {import('./prepareFuzzyIndex').Stemmer} Stemmer */
31
+ /** @typedef {import('../Ai').WordEntityDetector} WordEntityDetector */
32
+ /** @typedef {import('../Ai').WordDetectorData} WordDetectorData */
33
+
34
+ /**
35
+ * @typedef {object} Entity
36
+ * @prop {string} entity
37
+ * @prop {string} value
38
+ * @prop {string[]} [synonyms]
39
+ */
40
+
41
/**
 * Creates the fuzzy search function over a prepared n-gram index.
 *
 * The returned function
 *  1. collects candidate texts sharing n-grams with the query,
 *  2. drops candidates with too few shared n-grams,
 *  3. scores the rest with `relativeLevenshtein` plus an idf based bonus,
 *  4. keeps the best hit per entity (or per entity+value) and attaches
 *     close competitors with a different value as `alternatives`.
 *
 * @param {Map<string, Array>} indexMap - n-gram => `[idf, ...buckets of text ids]`
 * @param {Array} ngramCounts - per text id: `[ngramCount, entityIndex, cleanText]`
 * @param {Array} entities - per entity index: `[entity, value]`
 * @param {number} maxIdf - highest idf found in the index
 * @param {object} options
 * @param {Stemmer} [options.stemmer]
 * @param {boolean} [options.keepMultipleValues] - deduplicate by entity+value instead of entity
 * @param {number} [options.threshold] - minimal score of a returned result
 * @param {number} [options.limit] - maximal number of returned results
 * @param {boolean} [hasFuzzyMultiplier] - selects the stricter seed and n-gram ratio
 * @returns {WordEntityDetector}
 */
function searchFnFactory (indexMap, ngramCounts, entities, maxIdf, {
    stemmer = null,
    keepMultipleValues = false,
    threshold = 0.5,
    limit = undefined
}, hasFuzzyMultiplier = false) {
    // these two depend on the index only, not on the query
    const levenshteinSeed = hasFuzzyMultiplier
        ? SEED_FUZZY_MULTIPLICATOR
        : SEED_FUZZY;

    const percentage = hasFuzzyMultiplier
        ? 0.6
        : 0.5;

    /** @type {WordEntityDetector} */
    const searchFn = (search) => {
        const cleanQuery = cleanup(search, stemmer);
        const tokens = splitToNgrams(cleanQuery);
        const results = new Map();

        tokens.forEach((token) => {
            const entry = indexMap.get(token);
            if (!entry) {
                return;
            }
            const [idf] = entry;
            const [startIndex, endIndex] = getIndexesToIterate(tokens.length, entry);

            const maxIndex = Math.min(endIndex, entry.length - 1);
            for (let i = startIndex; i <= maxIndex; i++) {
                for (const id of entry[i]) {
                    let res = results.get(id);
                    if (!res) {
                        res = { cnt: 0, idf: 0 };
                        results.set(id, res);
                    }
                    res.cnt++;
                    res.idf += idf;
                }
            }
        });

        // Query with a short (1-3 chars) leading word removed, when the rest
        // is long enough. It is the same for every candidate, so compute it once.
        const without = /^[^\s]{1,3}\s+.{6,}$/.test(cleanQuery)
            ? cleanQuery.replace(/^[^\s]{1,3}\s+/, '')
            : null;

        let maxScore = 0;
        let maxRelIdf = 0;

        const preprocessed = Array.from(results.entries())
            .filter(([id, { cnt }]) => {
                const [ngramCount] = ngramCounts[id];
                const percentageOfMatchedNgrams = (cnt * 2) / (ngramCount + tokens.length);
                return percentageOfMatchedNgrams >= percentage;
            })
            .map(([id, { cnt, idf }]) => {
                const [, entityIndex, cleanText] = ngramCounts[id];
                const [entity, value] = entities[entityIndex];
                // a non-positive maxIdf would turn the score into NaN/Infinity
                const relIdf = maxIdf > 0 ? (idf / cnt) / maxIdf : 0;
                let score = relativeLevenshtein(
                    cleanText,
                    cleanQuery,
                    levenshteinSeed,
                    WORD_HANDICAP_K_FUZZY
                );
                let start = 0;

                if (without !== null) {
                    const altScore = relativeLevenshtein(
                        cleanText,
                        without,
                        levenshteinSeed,
                        WORD_HANDICAP_K_FUZZY
                    );

                    if (altScore > score) {
                        score = altScore;
                        start = cleanQuery.length - without.length;
                    }
                }

                if (maxScore < score) maxScore = score;
                if (maxRelIdf < relIdf) maxRelIdf = relIdf;

                return {
                    relIdf,
                    item: {
                        entity,
                        value,
                        score,
                        ...(start ? { start } : {})
                    }
                };
            });

        const found = preprocessed.map(({ relIdf, item }) => {
            // rarer n-grams (higher idf) receive a bigger share of the free score range
            const koef = maxRelIdf <= 0 ? relIdf : (relIdf / maxRelIdf);
            const addToScore = ((1 - maxScore) / 2) * koef;

            return Object.assign(item, {
                score: Math.round((item.score + addToScore) * 10000) / 10000
            });
        });

        found.sort((a, z) => z.score - a.score);

        const known = new Map();
        const res = found
            .filter((result) => {
                const key = keepMultipleValues ? `${result.entity}|${result.value}` : result.entity;
                if (result.score < threshold) {
                    return false;
                }
                if (known.has(key)) {
                    const { result: origResult, score, alts } = known.get(key);
                    if (!keepMultipleValues
                        && Math.abs(score - result.score) < (1 - LOWER_DUPLICATES)
                        && origResult.value !== result.value) {

                        if (!alts.some((a) => a.value === result.value)) {
                            // lower the winner once per distinct colliding value
                            origResult.score *= LOWER_DUPLICATES;
                        }

                        alts.push(result);

                        Object.assign(origResult, {
                            alternatives: alts
                        });
                    }
                    return false;
                }
                known.set(key, { result, score: result.score, alts: [] });
                return true;
            })
            .slice(0, limit);

        res.forEach((entity) => {
            if ('alternatives' in entity) {
                // @ts-ignore
                let { alternatives } = entity;

                // values already listed (the winner's own value included)
                const kn = new Set([entity.value]);
                alternatives = alternatives
                    // @ts-ignore
                    .sort((a, z) => z.score - a.score)
                    // keep only the best scored alternative of each value
                    .filter((e) => {
                        if (kn.has(e.value)) {
                            return false;
                        }
                        kn.add(e.value);
                        return true;
                    });

                // @ts-ignore
                for (let i = 0; i < alternatives.length; i++) {
                    const alt = alternatives[i];
                    // @ts-ignore
                    Object.assign(alt, {
                        // @ts-ignore
                        score: alt.score * (LOWER_DUPLICATES ** alternatives.length)
                    });
                }

                Object.assign(entity, { alternatives });
            }
        });

        return res;
    };

    return searchFn;
}
209
+
210
/**
 * Turns serializable index data into a ready-to-use word detector.
 *
 * @param {FuzzyIndexData} data - output of `prepareFuzzyIndex`
 * @param {FuzzySearchOptions} [options]
 * @returns {WordDetectorData} the detector together with `maxWordCount` taken from the index
 */
function factoryFuzzySearch (data, options = {}) {
    // the index travels as an array of entries; lookups need a Map
    const indexMap = new Map(data.indexArray);

    return {
        detector: searchFnFactory(
            indexMap,
            data.ngramCounts,
            data.entities,
            data.maxIdf,
            options,
            data.hasFuzzyMultiplier
        ),
        maxWordCount: data.maxWordCount
    };
}
242
+
243
+ module.exports = factoryFuzzySearch;
@@ -0,0 +1,91 @@
1
+ /**
2
+ * @author David Menger
3
+ */
4
+ 'use strict';
5
+
6
+ const { normalize } = require('./normalize');
7
+
8
+ const SHORTEN_BY = 2;
9
+ const NGRAMS = 3;
10
+
11
/**
 * Runs the shared `normalize` helper and then reduces the text to the
 * characters a-z and 0-9 separated by single spaces.
 *
 * @param {string|number} word
 * @returns {string}
 */
function preNormalize (word) {
    const normalized = normalize(word);
    // every run of other characters becomes one space
    const alnumOnly = normalized.replace(/[^a-z0-9]+/g, ' ');
    return alnumOnly.trim();
}
21
+
22
/**
 * Appends the stemmed form of every word to the text.
 * Without a stemmer the text is returned untouched.
 *
 * @param {string} normalized - whitespace separated words
 * @param {Function|null} stemmer - `(word) => stem`; a falsy result keeps the word as is
 * @returns {string}
 */
function stem (normalized, stemmer) {
    if (stemmer) {
        const stemmed = normalized
            .split(/\s+/g)
            .map((w) => stemmer(w) || w)
            .join(' ');

        return `${normalized} ${stemmed}`;
    }

    return normalized;
}
33
+
34
+ /** @typedef {{ (word: string): string}} Stemmer */
35
+
36
/**
 * Pre-normalizes raw text and enriches it with word stems.
 *
 * @param {string|number} word
 * @param {Stemmer} stemmer
 * @returns {string}
 */
function cleanup (word, stemmer) {
    return stem(preNormalize(word), stemmer);
}
46
+
47
/**
 * Same as `cleanup`, but for text which has been pre-normalized already:
 * only the stemming step is applied.
 *
 * @param {string} normalized
 * @param {Stemmer} stemmer
 * @returns {string}
 */
function cleanupPreNormalized (normalized, stemmer) {
    const withStems = stem(normalized, stemmer);
    return withStems;
}
56
+
57
/**
 * Maps an n-gram count to the position of its bucket in an index entry.
 * Position 0 of an entry holds the idf, so buckets start at 1.
 *
 * @param {number} ngramCount
 * @returns {number}
 */
function shortArrayIndex (ngramCount) {
    const bucket = Math.floor(ngramCount / SHORTEN_BY);
    return bucket + 1;
}
65
+
66
/**
 * Splits a text into overlapping n-grams of `NGRAMS` characters.
 * The text is padded with one space on each side first, so word edges
 * produce their own n-grams.
 *
 * @param {string} word
 * @returns {string[]}
 */
function splitToNgrams (word) {
    const padded = ` ${word} `;
    const count = padded.length - NGRAMS + 1;

    if (count <= 0) {
        // too short for a single n-gram
        return word.length > 0 ? [padded] : [];
    }

    return Array.from(
        { length: count },
        (_, start) => padded.substring(start, start + NGRAMS)
    );
}
84
+
85
+ module.exports = {
86
+ cleanup,
87
+ shortArrayIndex,
88
+ splitToNgrams,
89
+ cleanupPreNormalized,
90
+ preNormalize
91
+ };
@@ -0,0 +1,40 @@
1
+ /**
2
+ * @author David Menger
3
+ */
4
+ 'use strict';
5
+
6
+ const factoryFuzzySearch = require('./factoryFuzzySearch');
7
+ const prepareFuzzyIndex = require('./prepareFuzzyIndex');
8
+
9
+ /** @typedef {import('./factoryFuzzySearch').Entity} Entity */
10
+ /** @typedef {import('./factoryFuzzySearch').FuzzySearchOptions} FuzzySearchOptions */
11
+ /** @typedef {import('../Ai').WordEntityDetectorFactory} WordEntityDetectorFactory */
12
+
13
+ /**
14
+ * @callback EntityFactory
15
+ * @returns {Promise<Entity[]>}
16
+ */
17
+
18
/**
 * Creates a lazy detector factory: the index is built when the returned
 * function is called, not when `fuzzy` is called.
 *
 * @param {Entity[]|EntityFactory} entities - the entities, or an async function providing them
 * @param {FuzzySearchOptions} options - passed to both the index builder and the search factory
 * @returns {WordEntityDetectorFactory}
 */
function fuzzy (entities, options = {}) {

    return async () => {
        let data = entities;

        if (typeof entities === 'function') {
            data = await entities();
        }

        return factoryFuzzySearch(prepareFuzzyIndex(data, options), options);
    };
}
35
+
36
+ module.exports = {
37
+ fuzzy,
38
+ prepareFuzzyIndex,
39
+ factoryFuzzySearch
40
+ };
@@ -0,0 +1,228 @@
1
+ /**
2
+ * @author David Menger
3
+ */
4
+ 'use strict';
5
+
6
+ const NUMERIC_KOEF = 4;
7
+ const SUFFIX_WEIGHT = 0.055;
8
+
9
+ const SEED_DEFAULT = 0.5;
10
+ const SEED_FUZZY = 0.25;
11
+ const SEED_FUZZY_MULTIPLICATOR = -0.25;
12
+
13
+ const WORD_HANDICAP_K_DEFAULT = 0.9;
14
+ const WORD_HANDICAP_K_FUZZY = 0.6;
15
+
16
/**
 * Single cell step of the edit distance computation.
 *
 * @param {number} d0 - first neighbouring distance
 * @param {number} d1 - distance which is reused for free when the characters match
 * @param {number} d2 - second neighbouring distance
 * @param {number} bx - char code from one string
 * @param {number} ay - char code from the other string
 * @returns {number}
 */
function _min (d0, d1, d2, bx, ay) {
    const neighbourIsCheaper = d0 < d1 || d2 < d1;

    if (!neighbourIsCheaper) {
        // continue from d1; equal characters cost nothing
        return bx === ay
            ? d1
            : d1 + 1;
    }

    const cheapest = d0 > d2 ? d2 : d0;
    return cheapest + 1;
}
26
+
27
/**
 * Levenshtein edit distance of two strings.
 *
 * A common prefix and suffix are stripped first, then the shorter string is
 * kept in a work vector while the longer one is consumed four characters
 * at a time (and one at a time for the remainder).
 *
 * @param {string} left
 * @param {string} right
 * @returns {number}
 */
function levenshtein (left, right) {
    if (left === right) {
        return 0;
    }

    // `shorter` never has more characters than `longer`
    const swap = left.length > right.length;
    const shorter = swap ? right : left;
    const longer = swap ? left : right;

    let shortLen = shorter.length;
    let longLen = longer.length;

    // strip the common suffix
    while (shortLen > 0
        && shorter.charCodeAt(shortLen - 1) === longer.charCodeAt(longLen - 1)) {
        shortLen -= 1;
        longLen -= 1;
    }

    // strip the common prefix
    let start = 0;
    while (start < shortLen
        && shorter.charCodeAt(start) === longer.charCodeAt(start)) {
        start += 1;
    }

    shortLen -= start;
    longLen -= start;

    if (shortLen === 0 || longLen < 3) {
        return longLen;
    }

    // interleaved pairs: [distance, charCode, distance, charCode, ...]
    const vector = [];
    for (let i = 0; i < shortLen; i++) {
        vector.push(i + 1);
        vector.push(shorter.charCodeAt(start + i));
    }
    const lastIndex = vector.length - 1;

    let x = 0;
    let dd;

    // four characters of the longer string per pass
    while (x < longLen - 3) {
        let d0 = x;
        let d1 = x + 1;
        let d2 = x + 2;
        let d3 = x + 3;
        const bx0 = longer.charCodeAt(start + d0);
        const bx1 = longer.charCodeAt(start + d1);
        const bx2 = longer.charCodeAt(start + d2);
        const bx3 = longer.charCodeAt(start + d3);
        x += 4;
        dd = x;

        for (let y = 0; y < lastIndex; y += 2) {
            const dy = vector[y];
            const ay = vector[y + 1];
            d0 = _min(dy, d0, d1, bx0, ay);
            d1 = _min(d0, d1, d2, bx1, ay);
            d2 = _min(d1, d2, d3, bx2, ay);
            dd = _min(d2, d3, dd, bx3, ay);
            vector[y] = dd;
            d3 = d2;
            d2 = d1;
            d1 = d0;
            d0 = dy;
        }
    }

    // the remaining (up to three) characters, one per pass
    while (x < longLen) {
        let d0 = x;
        const bx0 = longer.charCodeAt(start + d0);
        x += 1;
        dd = x;

        for (let y = 0; y < lastIndex; y += 2) {
            const dy = vector[y];
            dd = _min(dy, d0, dd, bx0, vector[y + 1]);
            vector[y] = dd;
            d0 = dy;
        }
    }

    return dd;
}
126
+
127
/**
 * Converts an edit distance into a similarity and squeezes it into the
 * range left above the seed.
 *
 * @param {number} seed - portion of the range reserved below the similarity
 * @param {number} len - length the distance relates to
 * @param {number} value - edit distance
 * @param {number} [base] - value the result starts from (the seed by default)
 * @returns {number}
 */
function addSeed (seed, len, value, base = seed) {
    const matchedRatio = (len - value) / len;
    const scaled = matchedRatio * (1 - seed);
    return base + scaled;
}
130
+
131
/**
 * Similarity of a query to a training text based on the edit distance.
 *
 * Differences in digits weigh more, differences near the end of the text
 * weigh less, and a query having more words than the training text is
 * penalized by `wordKoef` for each extra word.
 *
 * @param {string} left - training data
 * @param {string} right - query
 * @param {number} [seed]
 * @param {number} [wordKoef]
 * @returns {number}
 */
function relativeLevenshtein (
    left,
    right,
    seed = SEED_DEFAULT,
    wordKoef = WORD_HANDICAP_K_DEFAULT
) {
    const longest = Math.max(left.length, right.length);
    if (!longest) {
        return 0;
    }

    const countWords = (text) => (text.match(/[^\s]+/g) || ['']).length;
    const digitsOf = (text) => text.replace(/[^0-9]+/g, '');

    // only words the query has on top of the training text are penalized
    const extraWords = Math.max(0, countWords(right) - countWords(left));
    const wordHandicap = (wordKoef ** extraWords);

    // digits are compared once more on their own
    const leftNum = digitsOf(left);
    const rightNum = digitsOf(right);
    const useNumK = leftNum.length ? NUMERIC_KOEF : 1;
    const numLen = leftNum.length ? leftNum.length * NUMERIC_KOEF : rightNum.length;
    const numLev = numLen ? levenshtein(leftNum, rightNum) * useNumK : 0;

    let stemLen = Math.min(left.length, right.length);

    if (stemLen < 3) {
        // too short to be split into a stem and a suffix
        const shortScore = addSeed(seed, longest + numLen, levenshtein(left, right) + numLev);
        return shortScore * wordHandicap;
    }

    let diff = longest - stemLen;

    if (diff <= 2) {
        // treat at least the last two characters as the suffix
        diff += 2;
        stemLen -= 2;
    }

    const stemLev = levenshtein(left.substring(0, stemLen), right.substring(0, stemLen));
    const suffLev = levenshtein(left.substring(stemLen), right.substring(stemLen));

    // a single edit in the suffix of an otherwise matching text is cheaper
    const isSingleSuffixEdit = suffLev === 1 && stemLev === 0;
    const diffWeight = (isSingleSuffixEdit ? diff - 1 : diff) * SUFFIX_WEIGHT;

    const vStem = addSeed(seed, stemLen + numLen, stemLev + numLev, seed - diffWeight);
    const vSuffix = addSeed(1 - diffWeight, diff, suffLev, 0);

    return (vStem + vSuffix) * wordHandicap;
}
196
+
197
/**
 * Compares two texts word by word (first with first, second with second...)
 * and returns the average `relativeLevenshtein` score. A word without
 * a counterpart is compared with an empty string.
 *
 * @param {string} left
 * @param {string} right
 * @param {number} seed
 * @param {number} [wordKoef]
 * @returns {number}
 */
function multiwordLevenshtein (left, right, seed, wordKoef = undefined) {
    const leftWords = `${left}`.split(/\s+/g);
    const rightWords = `${right}`.split(/\s+/g);

    const pairs = Math.max(leftWords.length, rightWords.length, 1);

    const total = Array.from(
        { length: pairs },
        (_, i) => relativeLevenshtein(leftWords[i] || '', rightWords[i] || '', seed, wordKoef)
    ).reduce((acc, score) => acc + score, 0);

    return total / pairs;
}
218
+
219
+ module.exports = {
220
+ levenshtein,
221
+ multiwordLevenshtein,
222
+ relativeLevenshtein,
223
+ SEED_DEFAULT,
224
+ SEED_FUZZY,
225
+ SEED_FUZZY_MULTIPLICATOR,
226
+ WORD_HANDICAP_K_FUZZY,
227
+ WORD_HANDICAP_K_DEFAULT
228
+ };
@@ -0,0 +1,62 @@
1
+ /*
2
+ * @author David Menger
3
+ */
4
+ 'use strict';
5
+
6
+ const { normalize } = require('../utils/tokenizer');
7
+
8
/**
 * Removes apostrophes and backticks standing at the beginning or at the end
 * of a word, collapses whitespace into single spaces and trims the text.
 * Quotes inside a word (like in "don't") stay.
 *
 * @param {string} str - input string
 * @returns {string}
 */
function cleanup (str) {
    const withoutEdgeQuotes = str.replace(/[`']+(\s|$)|(\s|^)['`]+/g, ' ');
    const singleSpaced = withoutEdgeQuotes.replace(/\s+/g, ' ');
    return singleSpaced.trim();
}
20
+
21
/**
 * Normalizes an entity value.
 *
 * @param {string} str
 * @param {boolean} strict - when set, the text is only lowercased and its
 *   whitespace collapsed; otherwise it goes through `normalize` and `cleanup`
 * @returns {string}
 */
function normalizeEntity (str, strict) {
    if (!strict) {
        return cleanup(normalize(str));
    }

    const lowered = `${str}`.toLocaleLowerCase();
    return lowered
        .replace(/\s+/g, ' ')
        .trim();
}
35
+
36
/**
 * Normalizes a text, but keeps `@ENTITY` placeholders in their original
 * (uppercase) form.
 *
 * NOTE(review): the placeholders are written back at the positions they have
 * in the input, so this relies on `normalize` keeping the positions of
 * characters - confirm for inputs where it does not.
 *
 * @param {string} str
 * @returns {string}
 */
function normalizePreserveEntities (str) {

    let ret = normalize(str);

    // `replace` serves as an iterator over the placeholders only,
    // its result is thrown away
    str.replace(/@[A-Z0-9-]+/g, (entity, start) => {
        const head = ret.slice(0, start);
        const tail = ret.slice(start + entity.length);

        ret = `${head}${entity}${tail}`;

        return entity;
    });

    return ret;
}
56
+
57
+ module.exports = {
58
+ normalize,
59
+ cleanup,
60
+ normalizePreserveEntities,
61
+ normalizeEntity
62
+ };
@@ -0,0 +1,196 @@
1
+ /**
2
+ * @author David Menger
3
+ */
4
+ 'use strict';
5
+
6
+ const {
7
+ shortArrayIndex,
8
+ splitToNgrams,
9
+ cleanupPreNormalized,
10
+ preNormalize
11
+ } = require('./fuzzyUtils');
12
+
13
+ const SHORTEN_MIN = 5000;
14
+
15
/**
 * Builds the index entry of one n-gram.
 *
 * Short lists of text ids are stored as a single bucket: `[idf, ids]`.
 * Lists of `SHORTEN_MIN` and more ids are split into buckets by the n-gram
 * count of each text, so a search can skip texts of a very different length.
 *
 * @param {number} idf
 * @param {*} tfArray - ids of the texts containing the n-gram
 * @param {NgramCount[]} ngramCounts
 * @returns {IndexMapTuple}
 */
function divideTfArray (idf, tfArray, ngramCounts) {
    const fitsSingleBucket = tfArray.length < SHORTEN_MIN;
    if (fitsSingleBucket) {
        return [idf, tfArray];
    }

    /** @type {IndexMapTuple} */
    const buckets = [idf];

    tfArray.forEach((id) => {
        const slot = shortArrayIndex(ngramCounts[id][0]);
        if (!buckets[slot]) {
            buckets[slot] = [];
        }
        // @ts-ignore
        buckets[slot].push(id);
    });

    // no holes: every bucket up to the last used one has to be an array
    let slot = 1;
    while (slot < buckets.length) {
        if (!buckets[slot]) {
            buckets[slot] = [];
        }
        slot += 1;
    }

    return buckets;
}
46
+
47
+ /**
48
+ * @typedef {object} Entity
49
+ * @prop {boolean} [id]
50
+ * @prop {string} entity
51
+ * @prop {string|number} value
52
+ * @prop {string[]} [synonyms]
53
+ */
54
+
55
+ /** @typedef {[idf: number, ...index: number[][]]} IndexMapTuple */
56
+ /** @typedef {[entity: string, value: string|number]} EntityIndex */
57
+ /** @typedef {[ngramCount: number, index: number, cleanText: string]} NgramCount */
58
+ /** @typedef {[ngram: string, index: IndexMapTuple]} IndexMapEntry */
59
+
60
+ /** @typedef {Map<string, [number, Set<number>]>} IndexMap */
61
+
62
+ /**
63
+ * @typedef {object} FuzzyIndexData
64
+ * @prop {NgramCount[]} ngramCounts,
65
+ * @prop {EntityIndex[]} entities,
66
+ * @prop {IndexMapEntry[]} indexArray,
67
+ * @prop {number} maxIdf,
68
+ * @prop {number} tfEntryMaxLen,
69
+ * @prop {number} tfTotal,
70
+ * @prop {number} avgIdf
71
+ * @prop {boolean} hasFuzzyMultiplier
72
+ * @prop {number} maxWordCount
73
+ */
74
+
75
+ /** @typedef {import('./fuzzyUtils').Stemmer} Stemmer */
76
+
77
+ const DEFAULT_MULTIPLIER = (w) => [w];
78
+
79
/**
 * Builds a serializable n-gram index of entity values and their synonyms.
 *
 * Every text (value or synonym, optionally expanded by `multiplier`) gets
 * its own id, which is its position in `ngramCounts`. `indexArray` maps
 * each n-gram to its idf and to the ids of texts containing it.
 *
 * @param {Entity[]} data
 * @param {Object} [options]
 * @param {Stemmer} [options.stemmer]
 * @param {Function} [options.multiplier] - `(text, firstText) => string[]`,
 *   produces variants of a text to be indexed
 * @returns {FuzzyIndexData}
 */
function prepareFuzzyIndex (data, {
    stemmer = null,
    multiplier = DEFAULT_MULTIPLIER
} = {}) {

    /** @type {IndexMap} */
    const indexMap = new Map();

    function addToIndex (token, id) {
        let entry = indexMap.get(token);
        if (!entry) {
            // the idf (first item) is filled in when the whole index is known
            entry = [null, new Set()];
            indexMap.set(token, entry);
        }
        entry[1].add(id);
    }

    function addItemToIndex (cleanText, id) {
        const tokens = splitToNgrams(cleanText);

        tokens
            .forEach((token) => {
                addToIndex(token, id);
            });

        return tokens.length;
    }

    function cleanForMultiples (text) {
        // an entity value may be a number, so make it a string first
        return `${text}`.toLocaleLowerCase().replace(/[^a-z0-9\u00C0-\u017F]+/g, ' ');
    }

    let maxWordCount = 0;
    const entities = new Array(data.length);
    let overAllIndex = 0;
    const ngramCounts = data
        // flatten synonyms
        .reduce((arr, {
            entity, value, synonyms = [], id = null
        }, index) => {
            const known = new Set();
            // when `id` is true, the value is not searchable - only the synonyms are
            let texts = Array.isArray(synonyms) && synonyms.length && id === true
                ? synonyms
                : [value, ...synonyms];

            texts = texts.map((text) => cleanForMultiples(text));

            texts = texts
                .map((text) => multiplier(text, texts[0]))
                .reduce((a, multiplied) => [
                    ...a,
                    // each distinct text of an entity is indexed just once
                    ...multiplied.filter((word) => {
                        if (known.has(word)) {
                            return false;
                        }
                        known.add(word);
                        return true;
                    })
                ], []);

            entities[index] = [entity, value];
            const ngramsData = texts
                .map((text, i) => {
                    const normalized = preNormalize(text);
                    const wordCount = normalized.split(/\s+/g).length;
                    if (wordCount > maxWordCount) maxWordCount = wordCount;
                    const cleanText = cleanupPreNormalized(normalized, stemmer);
                    const ngramCount = addItemToIndex(cleanText, i + overAllIndex);
                    return [ngramCount, index, cleanText];
                });
            overAllIndex += ngramsData.length;
            arr.push(...ngramsData);
            return arr;
        }, []);

    let totIdf = 0;
    let maxIdf = 0;
    let tfEntryMaxLen = 0;
    let tfTotal = 0;
    for (const [key, entry] of indexMap.entries()) {
        const idf = Math.log10((indexMap.size / entry[1].size));
        const tfArray = Array.from(entry[1].values());
        const tfEntry = divideTfArray(idf, tfArray, ngramCounts);
        // replaces the temporary [null, Set] entry with the final one
        // @ts-ignore
        indexMap.set(key, tfEntry);

        // stats
        tfTotal++;
        totIdf += idf;
        if (maxIdf < idf) maxIdf = idf;
        if (tfEntryMaxLen < tfEntry.length) tfEntryMaxLen = tfEntry.length;
    }
    const indexArray = Array.from(indexMap.entries());
    const avgIdf = totIdf / indexArray.length;

    return {
        ngramCounts,
        entities,
        // @ts-ignore
        indexArray,
        maxIdf,
        tfEntryMaxLen,
        tfTotal,
        avgIdf,
        hasFuzzyMultiplier: multiplier !== DEFAULT_MULTIPLIER,
        maxWordCount
    };
}
195
+
196
+ module.exports = prepareFuzzyIndex;
@@ -87,14 +87,51 @@ const DEFAULT_REMOVAL_MAP = [
87
87
  { base: 'w', letters: '\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73' },
88
88
  { base: 'x', letters: '\u0078\u24E7\uFF58\u1E8B\u1E8D' },
89
89
  { base: 'y', letters: '\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF' },
90
- { base: 'z', letters: '\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763' }
90
+ { base: 'z', letters: '\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763' },
91
+
92
+ { base: '\u0433', letters: '\u0403\u0490\u0491\u0492\u0493\u0413\u0494\u0495\u04F6\u04F7' }, // Г
93
+ { base: 'i', letters: '\u0406\u0456\u04c0\u0407\u0457\u04CF' },
94
+ { base: 'j', letters: '\u0408\u0458' },
95
+ { base: '\u0438', letters: '\u040d\u0419\u0439\u0418\u045D\u048B\u04E2\u04E3\u04E4\u04E5' }, // И
96
+ { base: 'a', letters: '\u0410\u0430\u04D0\u04D1\u04D2\u04D3' },
97
+ { base: 'b', letters: '\u0412\u0432' },
98
+ { base: 'e', letters: '\u0400\u0401\u0415\u0435\u0450\u0451\u0454\u04BC\u04BD\u04BE\u04BF\u04D6\u04D7' },
99
+ { base: 'h', letters: '\u04BA\u04BB\u04C7\u04C8\u04C9\u04CA' },
100
+ { base: 'k', letters: '\u040c\u041a\u043A\u045C\u049A\u049B\u049C\u049D\u049E\u049F\u04A0\u04A1\u04C3\u04C4' },
101
+ { base: 'm', letters: '\u041c\u043C\u04CD\u04CE' },
102
+ { base: 'h', letters: '\u041d\u043D\u045B\u04A2\u04A3\u04A4\u04A5' },
103
+ { base: 'o', letters: '\u041e\u043E\u04E6\u04E7\u04E8\u04E9\u04EA\u04EB' },
104
+ { base: 'p', letters: '\u0420\u0440\u048E\u048F' },
105
+ { base: 's', letters: '\u0405\u0455' },
106
+ { base: 'c', letters: '\u0421\u0441\u04AA\u04AB' },
107
+ { base: 't', letters: '\u0422\u0442\u04AC\u04AD' },
108
+ { base: 'y', letters: '\u0423\u040E\u0478\u04ee\u04f0\u04ef\u0443\u04f1\u04f2\u04f3\u045E\u04AE\u04AF\u04B0\u04B1' },
109
+ { base: 'x', letters: '\u0425\u0445\u04A8\u04A9\u04B2\u04B3' }, // Х (H)
110
+ { base: '\u044C', letters: '\u042C\u048C\u048D' }, // ь
111
+ { base: '\u0436', letters: '\u0496\u0497\u0416\u04C1\u04C2\u04DC\u04DD' }, // Ж (ZH)
112
+ { base: '\u0437', letters: '\u0417\u0498\u0499\u04DE\u04DF\u04E0\u04E1' }, // З (ZE)
113
+ { base: '\u043f', letters: '\u041f\u04A6\u04A7' }, // П (P)
114
+ { base: '\u0446', letters: '\u0426\u04B4\u04B5' }, // Ц (TSE)
115
+ { base: '\u0447', letters: '\u0427\u04B6\u04B7\u04B8\u04B9\u04CB\u04CC\u04F4\u04F5' }, // Ч (CHE)
116
+ { base: '\u0434', letters: '\u041B\u04C5\u04C6' }, // Л (L)
117
+ { base: '\u044D', letters: '\u042D\u04ED\u04EC' }, // Э (E)
118
+ { base: '\u044b', letters: '\u042b\u04F8\u04F9' }, // Ы (YER)
119
+
120
+ { base: 'nj', letters: '\u045A\u040A' }, // њ
121
+ { base: 'lj', letters: '\u0409\u0459' }, // Љ
122
+ { base: 'dz', letters: '\u045F\u040F' } // џ
123
+
91
124
  ];
92
125
 
93
- const diacriticsMap = {};
94
- for (let i = 0; i < DEFAULT_REMOVAL_MAP.length; i++) {
95
- const { letters } = DEFAULT_REMOVAL_MAP[i];
96
- for (let j = 0; j < letters.length; j++) {
97
- diacriticsMap[letters[j]] = DEFAULT_REMOVAL_MAP[i].base;
126
+ let diacriticsMap = null;
127
+
128
/**
 * Fills the module level `diacriticsMap` lookup (character => base) from
 * `DEFAULT_REMOVAL_MAP`. When a character is listed more than once, the
 * last listed base wins.
 */
function buildDiacriticsMap () {
    diacriticsMap = {};

    DEFAULT_REMOVAL_MAP.forEach(({ base, letters }) => {
        letters.split('').forEach((letter) => {
            diacriticsMap[letter] = base;
        });
    });
}
100
137
 
@@ -106,9 +143,28 @@ for (let i = 0; i < DEFAULT_REMOVAL_MAP.length; i++) {
106
143
  * @returns {string}
107
144
  */
108
145
function replaceDiacritics (str) {
    // the lookup is built on first use
    if (!diacriticsMap) {
        buildDiacriticsMap();
    }

    // characters without a known base stay as they are
    const toBase = (ch) => diacriticsMap[ch] || ch;

    return str.replace(/[^\u0000-\u007E]/g, toBase); // eslint-disable-line no-control-regex
}
111
151
 
152
/**
 * Lowercases a text and replaces characters by their base from the
 * diacritics map.
 *
 * Cyrillic characters (U+0400 - U+04FF) without a base are kept. Any other
 * character which is not a latin letter, digit, whitespace, apostrophe or
 * backtick is replaced by its base, or by a space when it has none.
 *
 * @param {string|number} str
 * @returns {string}
 */
function normalize (str) {
    // the lookup is built on first use
    if (!diacriticsMap) {
        buildDiacriticsMap();
    }

    const baseOrSame = (ch) => diacriticsMap[ch] || ch;
    const baseOrSpace = (ch) => diacriticsMap[ch] || ' ';

    const cyrillicMapped = `${str}`.replace(/[\u0400-\u04ff]/g, baseOrSame);
    const restMapped = cyrillicMapped.replace(/[^A-Za-z0-9\s'`\u0400-\u04ff]/g, baseOrSpace);

    return restMapped.toLowerCase();
}
167
+
112
168
  /**
113
169
  *
114
170
  * @param {string} string
@@ -124,5 +180,6 @@ function tokenize (string) {
124
180
 
125
181
  module.exports = {
126
182
  replaceDiacritics,
127
- tokenize
183
+ tokenize,
184
+ normalize
128
185
  };
@@ -65,9 +65,9 @@ const { iterateThroughWords } = require('../utils/ai');
65
65
  /**
66
66
  * @callback WordEntityDetector
67
67
  * @param {string} text
68
- * @param {DetectedEntity[]} entities
69
- * @param {number} startIndex
70
- * @param {string} prefix
68
+ * @param {DetectedEntity[]} [entities]
69
+ * @param {number} [startIndex]
70
+ * @param {string} [prefix]
71
71
  * @returns {DetectedEntity[]}
72
72
  */
73
73