@sc-voice/tools 2.1.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/package.json +1 -1
- package/src/text/logger.mjs +0 -1
- package/src/text/word-space.mjs +107 -6
package/README.md
CHANGED
|
@@ -23,6 +23,7 @@ console.log(f.add(new Fraction(1,80)); // new Fraction(4,80)
|
|
|
23
23
|
Used for text similarity comparison, WordSpace creates and compares
|
|
24
24
|
Vectors of words weighted by normalized frequency of occurrence.
|
|
25
25
|
Weights and scores are normalized to the interval [0..1].
|
|
26
|
+
(NOTE: Oddly, bag-of-words seems to work better here than fastText.)
|
|
26
27
|
|
|
27
28
|
## Text.MerkleJson
|
|
28
29
|
Computing the hash of JSON objects can be tricky because JSON.stringify()
|
package/package.json
CHANGED
package/src/text/logger.mjs
CHANGED
package/src/text/word-space.mjs
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
import { DBG } from '../defines.mjs';
|
|
2
2
|
|
|
3
|
+
// The golden ratio is pretty.
|
|
4
|
+
// 1.6180339887498948482045868343656381177203091798057628621354;
|
|
5
|
+
const GOLDEN_FUDGE = 1.618033988749895;
|
|
6
|
+
|
|
3
7
|
class Vector extends Object {
|
|
4
8
|
constructor(props) {
|
|
5
9
|
super();
|
|
@@ -49,6 +53,17 @@ class Vector extends Object {
|
|
|
49
53
|
}, new Vector(this));
|
|
50
54
|
}
|
|
51
55
|
|
|
56
|
+
increment(vec2) {
|
|
57
|
+
let keys = Object.keys(vec2);
|
|
58
|
+
return keys.reduce((a, k) => {
|
|
59
|
+
let v2 = vec2[k];
|
|
60
|
+
if (v2) {
|
|
61
|
+
a[k] = (a[k] || 0) + v2;
|
|
62
|
+
}
|
|
63
|
+
return a;
|
|
64
|
+
}, this);
|
|
65
|
+
}
|
|
66
|
+
|
|
52
67
|
dot(vec2) {
|
|
53
68
|
const msg = 'V4r.dot:';
|
|
54
69
|
if (vec2 == null) {
|
|
@@ -118,9 +133,9 @@ export class WordMapTransformer {
|
|
|
118
133
|
static normalizeFR(s) {
|
|
119
134
|
return s
|
|
120
135
|
.replace(/[«»]/gi, '')
|
|
121
|
-
.replace(/\bd
|
|
122
|
-
.replace(/\bl
|
|
123
|
-
.replace(/\bs
|
|
136
|
+
.replace(/\bd[’']/gi, 'de ')
|
|
137
|
+
.replace(/\bl[’']/gi, 'le ')
|
|
138
|
+
.replace(/\bs[’']/gi, 'se ')
|
|
124
139
|
.replace('?', '$QUESTION')
|
|
125
140
|
.replace('!', '$EXCLAMATION')
|
|
126
141
|
.trim();
|
|
@@ -165,6 +180,9 @@ export class WordSpace {
|
|
|
165
180
|
constructor(opts = {}) {
|
|
166
181
|
let {
|
|
167
182
|
lang, // 2-letter code: fr, en, es, pt
|
|
183
|
+
corpusBow = new Vector(), // corpus bag of words
|
|
184
|
+
corpusSize = 0, // number of retrieval units (docs, segments, etc.)
|
|
185
|
+
idfWeight = GOLDEN_FUDGE, // IDF dampening
|
|
168
186
|
minWord = 4, // minimum word length
|
|
169
187
|
normalize,
|
|
170
188
|
normalizeVector = WordSpace.normalizeVector,
|
|
@@ -189,6 +207,9 @@ export class WordSpace {
|
|
|
189
207
|
|
|
190
208
|
Object.assign(this, {
|
|
191
209
|
lang,
|
|
210
|
+
corpusBow,
|
|
211
|
+
corpusSize,
|
|
212
|
+
idfWeight,
|
|
192
213
|
minWord,
|
|
193
214
|
normalizeVector,
|
|
194
215
|
reWordMap,
|
|
@@ -197,6 +218,12 @@ export class WordSpace {
|
|
|
197
218
|
});
|
|
198
219
|
}
|
|
199
220
|
|
|
221
|
+
static createTfIdf() {
|
|
222
|
+
let minWord = 1;
|
|
223
|
+
let normalizeVector = (v) => v;
|
|
224
|
+
return new WordSpace({ minWord, normalizeVector });
|
|
225
|
+
}
|
|
226
|
+
|
|
200
227
|
static get WordMapTransformer() {
|
|
201
228
|
return WordMapTransformer;
|
|
202
229
|
}
|
|
@@ -206,8 +233,7 @@ export class WordSpace {
|
|
|
206
233
|
}
|
|
207
234
|
|
|
208
235
|
// Golden Ratio fudge factor scales a count of 1 to ~0.8
|
|
209
|
-
|
|
210
|
-
static normalizeVector(v, scale = 1.618033988749895) {
|
|
236
|
+
static normalizeVector(v, scale = GOLDEN_FUDGE) {
|
|
211
237
|
let vNew = new Vector(v);
|
|
212
238
|
Object.entries(v).forEach((e) => {
|
|
213
239
|
let [key, value] = e;
|
|
@@ -217,8 +243,83 @@ export class WordSpace {
|
|
|
217
243
|
return vNew;
|
|
218
244
|
}
|
|
219
245
|
|
|
246
|
+
addDocument(doc) {
|
|
247
|
+
let { corpusBow } = this;
|
|
248
|
+
this.corpusSize += 1;
|
|
249
|
+
let { bow } = this.countWords(doc, 1); // one-hot
|
|
250
|
+
corpusBow.increment(bow);
|
|
251
|
+
|
|
252
|
+
return this;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
inverseDocumentFrequency(word, idfWeight) {
|
|
256
|
+
return this.idf(word, idfWeight);
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
idf(word, idfWeight = this.idfWeight) {
|
|
260
|
+
const msg = 'w7e.idf:';
|
|
261
|
+
let { corpusBow, corpusSize } = this;
|
|
262
|
+
let wCount = corpusBow[word] || 0;
|
|
263
|
+
// Map to [0:ignore..1:important]
|
|
264
|
+
return corpusSize
|
|
265
|
+
? 1 - Math.exp(((wCount - corpusSize) / wCount) * idfWeight)
|
|
266
|
+
: 1;
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
termFrequency(word, document) {
|
|
270
|
+
return this.tf(word, document);
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
tf(word, doc) {
|
|
274
|
+
let { bow, words } = this.countWords(doc);
|
|
275
|
+
let count = bow[word] || 0;
|
|
276
|
+
return count ? count / words.length : 0;
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
tfidf(doc) {
|
|
280
|
+
const msg = 'w7e.tfidf:';
|
|
281
|
+
let { corpusBow, corpusSize, idfWeight } = this;
|
|
282
|
+
|
|
283
|
+
// More efficient implementation of tf * idf
|
|
284
|
+
let { bow, words } = this.countWords(doc);
|
|
285
|
+
let nWords = words.length;
|
|
286
|
+
|
|
287
|
+
let vTfIdf = words.reduce((a, word) => {
|
|
288
|
+
let wd = bow[word] || 0;
|
|
289
|
+
let tf = wd ? wd / nWords : 0;
|
|
290
|
+
let wc = corpusBow[word] || 0;
|
|
291
|
+
let idf = corpusSize
|
|
292
|
+
? 1 - Math.exp(((wc - corpusSize) / wc) * idfWeight)
|
|
293
|
+
: 1;
|
|
294
|
+
let tfidf = tf * idf;
|
|
295
|
+
if (tfidf) {
|
|
296
|
+
a[word] = tfidf;
|
|
297
|
+
}
|
|
298
|
+
return a;
|
|
299
|
+
}, new Vector());
|
|
300
|
+
|
|
301
|
+
return vTfIdf;
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
countWords(str, maxCount) {
|
|
305
|
+
const msg = 'w7e.countWords:';
|
|
306
|
+
if (str == null) {
|
|
307
|
+
throw new Error(`${msg} str?`);
|
|
308
|
+
}
|
|
309
|
+
let dbg = 0;
|
|
310
|
+
let sNorm = this.transformText(str);
|
|
311
|
+
let words = sNorm.split(' ');
|
|
312
|
+
let bow = words.reduce((a, w) => {
|
|
313
|
+
let count = (a[w] || 0) + 1;
|
|
314
|
+
a[w] = maxCount ? Math.min(maxCount, count) : count;
|
|
315
|
+
return a;
|
|
316
|
+
}, new Vector());
|
|
317
|
+
|
|
318
|
+
return { bow, words };
|
|
319
|
+
}
|
|
320
|
+
|
|
220
321
|
string2Vector(str, scale = 1) {
|
|
221
|
-
const msg = '
|
|
322
|
+
const msg = 'w7e.string2Vector:';
|
|
222
323
|
if (str == null) {
|
|
223
324
|
throw new Error(`${msg} str?`);
|
|
224
325
|
}
|