@sc-voice/tools 2.3.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.mjs +4 -0
- package/package.json +1 -1
- package/src/text/tfidf-space.mjs +168 -0
- package/src/text/word-vector.mjs +112 -0
package/index.mjs
CHANGED
|
@@ -12,6 +12,8 @@ import { MerkleJson } from './src/text/merkle-json.mjs';
|
|
|
12
12
|
import { SuttaCentralId } from './src/text/sutta-central-id.mjs';
|
|
13
13
|
import { Unicode } from './src/text/unicode.mjs';
|
|
14
14
|
import { WordSpace } from './src/text/word-space.mjs';
|
|
15
|
+
import { WordVector } from './src/text/word-vector.mjs';
|
|
16
|
+
import { TfidfSpace } from './src/text/tfidf-space.mjs';
|
|
15
17
|
import { LogEntry, Logger } from './src/text/logger.mjs';
|
|
16
18
|
|
|
17
19
|
export const Text = {
|
|
@@ -24,6 +26,8 @@ export const Text = {
|
|
|
24
26
|
SuttaCentralId,
|
|
25
27
|
Unicode,
|
|
26
28
|
WordSpace,
|
|
29
|
+
WordVector,
|
|
30
|
+
TfidfSpace,
|
|
27
31
|
};
|
|
28
32
|
|
|
29
33
|
import { default as Sankey } from './src/graph/sankey.mjs';
|
package/src/text/tfidf-space.mjs
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import { DBG } from '../defines.mjs';
|
|
2
|
+
import { WordVector } from './word-vector.mjs';
|
|
3
|
+
|
|
4
|
+
// The golden ratio is pretty.
// 1.6180339887498948482045868343656381177203091798057628621354;
const GOLDEN_FUDGE = 1.618033988749895;

/**
 * TF-IDF vector space over a corpus of retrieval units (documents,
 * segments, etc.). The corpus is summarized by corpusBow, a bag of
 * words counting how many retrieval units contain each word.
 */
export class TfidfSpace {
  /**
   * @param {object} [opts]
   * @param {string} [opts.lang='en'] 2-letter code: fr, en, es, pt
   * @param {WordVector} [opts.corpusBow] corpus bag of words (document frequencies)
   * @param {number} [opts.corpusSize=0] number of retrieval units
   * @param {number} [opts.idfWeight=GOLDEN_FUDGE] IDF dampening
   * @param {function} [opts.idfFunction=TfidfSpace.idfTunable] (space, word, idfWeight) => number
   * @param {function} [opts.normalizeText] text normalizer; defaulted from lang
   * @throws {Error} when lang is null or has no default normalizer
   */
  constructor(opts = {}) {
    const msg = 't8e.ctor:';
    let {
      lang = 'en', // 2-letter code: fr, en, es, pt
      corpusBow = new WordVector(), // corpus bag of words
      corpusSize = 0, // number of retrieval units (docs, segments, etc.)
      idfWeight = GOLDEN_FUDGE, // IDF dampening
      idfFunction = TfidfSpace.idfTunable,
      normalizeText,
    } = opts;
    if (lang == null) {
      throw new Error(`${msg} lang?`);
    }
    if (normalizeText == null) {
      switch (lang) {
        case 'fr':
          normalizeText = TfidfSpace.normalizeFR;
          break;
        case 'en':
          normalizeText = TfidfSpace.normalizeEN;
          break;
        default:
          throw new Error(`${msg} normalizeText?`);
      }
    }
    // Functions are defined non-enumerable so that serialization
    // captures only the plain data properties assigned below.
    Object.defineProperty(this, 'normalizeText', {
      value: normalizeText,
    });
    Object.defineProperty(this, 'idfFunction', {
      value: idfFunction,
    });

    // Serializable properties
    Object.assign(this, {
      lang,
      corpusBow,
      corpusSize,
      idfWeight,
    });
  }

  /**
   * Lowercase English text, strip punctuation, and collapse runs of
   * spaces to a single space.
   * @param {string} s
   * @returns {string} normalized text
   */
  static normalizeEN(s) {
    return s
      .toLowerCase()
      .replace(/[-.,_:;"'“”‘’!?]/g, '')
      .replace(/ {2,}/g, ' ')
      .trim();
  }

  /**
   * Lowercase French text, strip guillemets and punctuation, and
   * expand elisions (d'/l'/s') to their full forms.
   * @param {string} s
   * @returns {string} normalized text
   */
  static normalizeFR(s) {
    return s
      .toLowerCase()
      .replace(/[«»]/gi, '')
      .replace(/\bd[’']/gi, 'de ')
      .replace(/\bl[’']/gi, 'le ')
      .replace(/\bs[’']/gi, 'se ')
      .replace(/[-.,_:;"'“”‘’!?]/g, '')
      .replace(/[ ]+/g, ' ')
      .trim();
  }

  /**
   * Standard smoothed IDF: log((N+1)/(df+1)).
   * @param {TfidfSpace} space
   * @param {string} word
   * @returns {number}
   */
  static idfStandard(space, word) {
    let { corpusBow, corpusSize } = space;
    let wordDocs = corpusBow[word] || 0;
    return Math.log((corpusSize + 1) / (wordDocs + 1));
  }

  /**
   * Tunable IDF mapped to [0:ignore..1:important].
   * NOTE: This is NOT the usual formula.
   * Fixed: the default idfWeight now comes from `space`, not `this`,
   * so a direct static call TfidfSpace.idfTunable(space, word) no
   * longer yields NaN (the instance-dispatch path is unchanged).
   * @param {TfidfSpace} space
   * @param {string} word
   * @param {number} [idfWeight=space.idfWeight] dampening exponent
   * @returns {number}
   */
  static idfTunable(space, word, idfWeight = space.idfWeight) {
    let { corpusBow, corpusSize } = space;
    let wordDocs = corpusBow[word] || 0;
    // Map to [0:ignore..1:important]; unseen words (wordDocs=0) map
    // to 1 because the exponent becomes -Infinity.
    return corpusSize
      ? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
      : 1;
  }

  /**
   * Inverse document frequency of word via this space's idfFunction.
   * @param {string} word
   * @param {number} [idfWeight] overrides this.idfWeight
   * @returns {number}
   */
  idf(word, idfWeight) {
    return this.idfFunction(this, word, idfWeight);
  }

  /**
   * Add one document to the corpus: bump corpusSize and increment
   * one-hot document frequencies into corpusBow.
   * @param {string} doc
   * @returns {TfidfSpace} this (chainable)
   */
  addDocument(doc) {
    let { corpusBow } = this;
    this.corpusSize += 1;
    let { bow } = this.countWords(doc, 1); // one-hot
    corpusBow.increment(bow);

    return this;
  }

  /** Alias for tf(). */
  termFrequency(word, document) {
    return this.tf(word, document);
  }

  /**
   * Term frequency of word in doc (count / total words).
   * @param {string} word
   * @param {string} doc
   * @returns {number}
   */
  tf(word, doc) {
    let { bow, words } = this.countWords(doc);
    let count = bow[word] || 0;
    return count ? count / words.length : 0;
  }

  /**
   * TF-IDF vector of doc; zero entries are omitted.
   * Fixed: delegates IDF to this.idfFunction (the original inlined
   * the idfTunable formula here, silently ignoring a custom
   * idfFunction passed to the constructor). Numerically identical
   * for the default idfFunction.
   * @param {string} doc
   * @returns {WordVector} word => tf*idf
   */
  tfidf(doc) {
    let { idfWeight } = this;

    let { bow, words } = this.countWords(doc);
    let nWords = words.length;

    let vTfIdf = words.reduce((a, word) => {
      let wd = bow[word] || 0;
      let tf = wd ? wd / nWords : 0;
      let idf = this.idfFunction(this, word, idfWeight);
      let tfidf = tf * idf;
      if (tfidf) {
        a[word] = tfidf;
      }
      return a;
    }, new WordVector());

    return vTfIdf;
  }

  /**
   * Count normalized words in str.
   * @param {string} str
   * @param {number} [maxCount] cap per-word counts (1 => one-hot)
   * @returns {{bow: WordVector, words: string[]}}
   * @throws {Error} when str is null
   */
  countWords(str, maxCount) {
    const msg = 'w7e.countWords:';
    if (str == null) {
      throw new Error(`${msg} str?`);
    }
    let sNorm = this.normalizeText(str);
    let words = sNorm.split(' ');
    let bow = words.reduce((a, w) => {
      let count = (a[w] || 0) + 1;
      a[w] = maxCount ? Math.min(maxCount, count) : count;
      return a;
    }, new WordVector());

    return { bow, words };
  }

  /**
   * Bag of words (raw counts) for text.
   * @param {string} text
   * @returns {WordVector}
   * @throws {Error} when text is null
   */
  bowOfText(text) {
    const msg = 'w7e.bowOfText:';
    if (text == null) {
      throw new Error(`${msg} text?`);
    }
    let sNorm = this.normalizeText(text);
    let words = sNorm.split(' ');
    let bow = words.reduce((a, w) => {
      a[w] = (a[w] || 0) + 1;
      return a;
    }, new WordVector());

    return bow;
  }
} // TfidfSpace
|
|
package/src/text/word-vector.mjs
ADDED
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import { DBG } from '../defines.mjs';
|
|
2
|
+
|
|
3
|
+
// The golden ratio is pretty.
// 1.6180339887498948482045868343656381177203091798057628621354;
const GOLDEN_FUDGE = 1.618033988749895;

/**
 * Sparse vector of word weights. Entries are plain enumerable
 * properties (word => number), so instances serialize naturally as
 * JSON objects.
 */
export class WordVector extends Object {
  constructor(props) {
    super();
    Object.assign(this, props);
    // Cached key count; non-enumerable so it is never serialized.
    Object.defineProperty(this, '$length', {
      writable: true,
    });
  }

  /**
   * Number of entries, cached after first access. Mutators that can
   * add keys (increment) invalidate the cache.
   * @returns {number}
   */
  get length() {
    if (this.$length == null) {
      this.$length = Object.keys(this).length;
    }
    return this.$length;
  }

  /**
   * Comma-separated "word:weight" pairs with weights to 2 decimals.
   * @returns {string}
   */
  toString() {
    let sv = Object.entries(this).reduce((a, e) => {
      let [k, v] = e;
      let vf = v.toFixed(2);
      a.push(`${k}:${vf}`);
      return a;
    }, []);
    return sv.join(',');
  }

  /**
   * Euclidean (L2) norm; 0 for an empty vector.
   * @returns {number}
   */
  norm() {
    let keys = Object.keys(this);
    if (keys.length === 0) {
      return 0;
    }
    let sumSqr = keys.reduce((a, k) => {
      let v = this[k];
      return a + v * v;
    }, 0);
    return Math.sqrt(sumSqr);
  }

  /**
   * Vector sum as a NEW WordVector; this vector is not mutated.
   * Zero/falsy entries of vec2 are skipped.
   * @param {object} vec2
   * @returns {WordVector}
   */
  add(vec2) {
    let keys = Object.keys(vec2);
    return keys.reduce((a, k) => {
      let v2 = vec2[k];
      if (v2) {
        a[k] = (a[k] || 0) + v2;
      }
      return a;
    }, new WordVector(this));
  }

  /**
   * Add vec2 into this vector IN PLACE; zero/falsy entries skipped.
   * Fixed: invalidates the cached $length, which previously went
   * stale when increment() added new keys after length was read.
   * @param {object} vec2
   * @returns {WordVector} this
   */
  increment(vec2) {
    this.$length = undefined; // new keys may be added below
    let keys = Object.keys(vec2);
    return keys.reduce((a, k) => {
      let v2 = vec2[k];
      if (v2) {
        a[k] = (a[k] || 0) + v2;
      }
      return a;
    }, this);
  }

  /**
   * Dot product with vec2 (missing entries treated as 0).
   * @param {object} vec2
   * @returns {number}
   * @throws {Error} when vec2 is null
   */
  dot(vec2) {
    const msg = 'w8r.dot:';
    if (vec2 == null) {
      throw new Error(`${msg} vec2?`);
    }
    let keys = Object.keys(this);
    return keys.reduce((a, k) => {
      let v1 = this[k];
      let v2 = vec2[k] || 0;

      return a + v1 * v2;
    }, 0);
  }

  /**
   * Multiply every entry by scalar c IN PLACE.
   * @param {number} c
   * @returns {WordVector} this
   */
  scale(c) {
    return Object.keys(this).reduce((a, k) => {
      a[k] *= c;
      return a;
    }, this);
  }

  /**
   * Element-wise product as a NEW WordVector, keeping only keys with
   * nonzero values in both vectors.
   * @param {object} [vec2={}]
   * @returns {WordVector}
   */
  intersect(vec2 = {}) {
    let keys = Object.keys(this);
    return keys.reduce((a, k) => {
      let v1 = this[k];
      let v2 = vec2[k] || 0;
      if (v1 && v2) {
        a[k] = v1 * v2;
      }

      return a;
    }, new WordVector());
  }

  /**
   * Cosine similarity with vec2; 0 when either norm is 0.
   * @param {WordVector} vec2
   * @returns {number}
   * @throws {Error} when vec2 is null
   */
  similar(vec2) {
    const msg = 'w8r.similar:';
    if (vec2 == null) {
      throw new Error(`${msg} vec2?`);
    }
    let d = this.dot(vec2);
    let norm1 = this.norm();
    let norm2 = vec2.norm();
    let den = norm1 * norm2;
    return den ? d / den : 0;
  }
} // WordVector
|