@sc-voice/tools 2.4.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/text/tfidf-space.mjs +34 -22
- package/src/text/word-vector.mjs +7 -0
package/package.json
CHANGED
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -13,6 +13,7 @@ export class TfidfSpace {
|
|
|
13
13
|
corpusBow = new WordVector(), // corpus bag of words
|
|
14
14
|
corpusSize = 0, // number of retrieval units (docs, segments, etc.)
|
|
15
15
|
idfWeight = GOLDEN_FUDGE, // IDF dampening
|
|
16
|
+
idfFunction = TfidfSpace.idfTunable,
|
|
16
17
|
normalizeText,
|
|
17
18
|
} = opts;
|
|
18
19
|
if (lang == null) {
|
|
@@ -33,6 +34,9 @@ export class TfidfSpace {
|
|
|
33
34
|
Object.defineProperty(this, 'normalizeText', {
|
|
34
35
|
value: normalizeText,
|
|
35
36
|
});
|
|
37
|
+
Object.defineProperty(this, 'idfFunction', {
|
|
38
|
+
value: idfFunction,
|
|
39
|
+
});
|
|
36
40
|
|
|
37
41
|
// Serializable properties
|
|
38
42
|
Object.assign(this, {
|
|
@@ -63,6 +67,28 @@ export class TfidfSpace {
|
|
|
63
67
|
.trim();
|
|
64
68
|
}
|
|
65
69
|
|
|
70
|
+
static idfStandard(space, word) {
|
|
71
|
+
const msg = 'w7e.idfStandard:';
|
|
72
|
+
let { corpusBow, corpusSize } = space;
|
|
73
|
+
let wordDocs = corpusBow[word] || 0;
|
|
74
|
+
return Math.log((corpusSize + 1) / (wordDocs+1));
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
static idfTunable(space, word, idfWeight = this.idfWeight) {
|
|
78
|
+
const msg = 'w7e.idf:';
|
|
79
|
+
let { corpusBow, corpusSize } = space;
|
|
80
|
+
let wordDocs = corpusBow[word] || 0;
|
|
81
|
+
// NOTE: This is NOT the usual formula
|
|
82
|
+
// Map to [0:ignore..1:important]
|
|
83
|
+
return corpusSize
|
|
84
|
+
? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
|
|
85
|
+
: 1;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
idf(word, idfWeight) {
|
|
89
|
+
return this.idfFunction(this, word, idfWeight);
|
|
90
|
+
}
|
|
91
|
+
|
|
66
92
|
addDocument(doc) {
|
|
67
93
|
let { corpusBow } = this;
|
|
68
94
|
this.corpusSize += 1;
|
|
@@ -72,20 +98,6 @@ export class TfidfSpace {
|
|
|
72
98
|
return this;
|
|
73
99
|
}
|
|
74
100
|
|
|
75
|
-
inverseDocumentFrequency(word, idfWeight) {
|
|
76
|
-
return this.idf(word, idfWeight);
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
idf(word, idfWeight = this.idfWeight) {
|
|
80
|
-
const msg = 'w7e.idf:';
|
|
81
|
-
let { corpusBow, corpusSize } = this;
|
|
82
|
-
let wCount = corpusBow[word] || 0;
|
|
83
|
-
// Map to [0:ignore..1:important]
|
|
84
|
-
return corpusSize
|
|
85
|
-
? 1 - Math.exp(((wCount - corpusSize) / wCount) * idfWeight)
|
|
86
|
-
: 1;
|
|
87
|
-
}
|
|
88
|
-
|
|
89
101
|
termFrequency(word, document) {
|
|
90
102
|
return this.tf(word, document);
|
|
91
103
|
}
|
|
@@ -138,19 +150,19 @@ export class TfidfSpace {
|
|
|
138
150
|
return { bow, words };
|
|
139
151
|
}
|
|
140
152
|
|
|
141
|
-
|
|
142
|
-
const msg = 'w7e.
|
|
143
|
-
if (
|
|
144
|
-
throw new Error(`${msg}
|
|
153
|
+
bowOfText(text) {
|
|
154
|
+
const msg = 'w7e.bowOfText:';
|
|
155
|
+
if (text == null) {
|
|
156
|
+
throw new Error(`${msg} text?`);
|
|
145
157
|
}
|
|
146
158
|
let dbg = 0;
|
|
147
|
-
let sNorm = this.normalizeText(
|
|
159
|
+
let sNorm = this.normalizeText(text);
|
|
148
160
|
let words = sNorm.split(' ');
|
|
149
|
-
let
|
|
150
|
-
a[w] = (a[w] || 0) +
|
|
161
|
+
let bow = words.reduce((a, w) => {
|
|
162
|
+
a[w] = (a[w] || 0) + 1;
|
|
151
163
|
return a;
|
|
152
164
|
}, new WordVector());
|
|
153
165
|
|
|
154
|
-
return
|
|
166
|
+
return bow;
|
|
155
167
|
}
|
|
156
168
|
} // TfidfSpace
|
package/src/text/word-vector.mjs
CHANGED
|
@@ -78,6 +78,13 @@ export class WordVector extends Object {
|
|
|
78
78
|
}, 0);
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
+
scale(c) {
|
|
82
|
+
return Object.keys(this).reduce((a,k)=>{
|
|
83
|
+
a[k] *= c;
|
|
84
|
+
return a;
|
|
85
|
+
}, this);
|
|
86
|
+
}
|
|
87
|
+
|
|
81
88
|
intersect(vec2 = {}) {
|
|
82
89
|
let keys = Object.keys(this);
|
|
83
90
|
return keys.reduce((a, k) => {
|