@sc-voice/tools 2.11.0 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/text/tfidf-space.mjs +24 -23
package/package.json
CHANGED
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -70,41 +70,37 @@ export class TfidfSpace {
|
|
|
70
70
|
return TfidfSpace.removeNonWords(sAbbr);
|
|
71
71
|
}
|
|
72
72
|
|
|
73
|
-
static idfStandard(
|
|
74
|
-
|
|
75
|
-
let { corpus } = space;
|
|
76
|
-
let wordDocs = corpus.wordDocCount[word] || 0;
|
|
77
|
-
return Math.log((corpus.size + 1) / (wordDocs + 1));
|
|
73
|
+
static idfStandard(nDocs, wdc, idfWeight) {
|
|
74
|
+
return Math.log((nDocs + 1) / (wdc + 1));
|
|
78
75
|
}
|
|
79
76
|
|
|
80
|
-
static idfTunable(
|
|
81
|
-
const msg = 'w7e.
|
|
82
|
-
let { corpus } = space;
|
|
83
|
-
let wordDocs = corpus.wordDocCount[word] || 0;
|
|
77
|
+
static idfTunable(nDocs, wdc, idfWeight) {
|
|
78
|
+
const msg = 'w7e.idfTunable:';
|
|
84
79
|
// NOTE: This is NOT the usual formula
|
|
85
80
|
// Map to [0:ignore..1:important]
|
|
86
|
-
return
|
|
87
|
-
? 1 -
|
|
88
|
-
Math.exp(((wordDocs - corpus.size) / wordDocs) * idfWeight)
|
|
81
|
+
return nDocs
|
|
82
|
+
? 1 - Math.exp(((wdc - nDocs) / wdc) * idfWeight)
|
|
89
83
|
: 1;
|
|
90
84
|
}
|
|
91
85
|
|
|
92
86
|
idf(word, idfWeight = this.idfWeight) {
|
|
93
|
-
|
|
87
|
+
let { corpus } = this;
|
|
88
|
+
let wdc = corpus.wordDocCount[word] || 0;
|
|
89
|
+
let nDocs = corpus.size;
|
|
90
|
+
return this.idfFunction(nDocs, wdc, idfWeight);
|
|
94
91
|
}
|
|
95
92
|
|
|
96
|
-
addCorpusDocument(id, bow
|
|
93
|
+
addCorpusDocument(id, bow) {
|
|
97
94
|
const msg = 't8w.addCorpusDocument:';
|
|
98
95
|
let { corpus } = this;
|
|
99
96
|
if (id == null) {
|
|
100
97
|
throw new Error(`${msg} id?`);
|
|
101
98
|
}
|
|
102
99
|
if (bow == null) {
|
|
100
|
+
// Bag-of-words maps word to wordCount(word,doc)
|
|
103
101
|
throw new Error(`${msg} bow?`);
|
|
104
102
|
}
|
|
105
|
-
|
|
106
|
-
throw new Error(`${msg} nWords?`);
|
|
107
|
-
}
|
|
103
|
+
let nWords = Object.values(bow).reduce((a,v)=>a+v);
|
|
108
104
|
let docInfo = { id, bow, nWords };
|
|
109
105
|
corpus.wordDocCount.increment(bow.oneHot());
|
|
110
106
|
corpus.addDocument(id, docInfo);
|
|
@@ -129,20 +125,20 @@ export class TfidfSpace {
|
|
|
129
125
|
return count ? count / words.length : 0;
|
|
130
126
|
}
|
|
131
127
|
|
|
132
|
-
|
|
133
|
-
const msg = 'w7e.
|
|
128
|
+
tfidfOfBow(bow) {
|
|
129
|
+
const msg = 'w7e.tfidfOfBow:';
|
|
134
130
|
let { corpus, idfWeight } = this;
|
|
135
131
|
|
|
136
132
|
// More efficient implementation of tf * idf
|
|
137
|
-
let
|
|
138
|
-
let nWords = words.
|
|
133
|
+
let words = Object.keys(bow);
|
|
134
|
+
let nWords = words.reduce((a,w)=>a+bow[w],0);
|
|
139
135
|
|
|
140
136
|
let vTfIdf = words.reduce((a, word) => {
|
|
141
137
|
let wd = bow[word] || 0;
|
|
142
138
|
let tf = wd ? wd / nWords : 0;
|
|
143
|
-
let
|
|
139
|
+
let wdc = corpus.wordDocCount[word] || 0;
|
|
144
140
|
let idf = corpus.size
|
|
145
|
-
? 1 - Math.exp(((
|
|
141
|
+
? 1 - Math.exp(((wdc - corpus.size) / wdc) * idfWeight)
|
|
146
142
|
: 1;
|
|
147
143
|
let tfidf = tf * idf;
|
|
148
144
|
if (tfidf) {
|
|
@@ -154,6 +150,11 @@ export class TfidfSpace {
|
|
|
154
150
|
return vTfIdf;
|
|
155
151
|
}
|
|
156
152
|
|
|
153
|
+
tfidf(text) { // TfIdf of words in text w/r to corpus
|
|
154
|
+
let { bow } = this.countWords(text);
|
|
155
|
+
return this.tfidfOfBow(bow);
|
|
156
|
+
}
|
|
157
|
+
|
|
157
158
|
countWords(str) {
|
|
158
159
|
const msg = 'w7e.countWords:';
|
|
159
160
|
if (str == null) {
|