@sc-voice/tools 2.8.0 → 2.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.mjs +2 -0
- package/package.json +1 -1
- package/src/text/corpus.mjs +39 -0
- package/src/text/tfidf-space.mjs +45 -28
- package/src/text/word-space.mjs +0 -1
- package/src/text/word-vector.mjs +2 -2
package/index.mjs
CHANGED
|
@@ -6,6 +6,7 @@ export const ScvMath = {
|
|
|
6
6
|
};
|
|
7
7
|
|
|
8
8
|
import { BilaraPath } from './src/text/bilara-path.mjs';
|
|
9
|
+
import { Corpus } from './src/text/corpus.mjs';
|
|
9
10
|
import { EbtDoc } from './src/text/ebt-doc.mjs';
|
|
10
11
|
import { LegacyDoc } from './src/text/legacy-doc.mjs';
|
|
11
12
|
import { MerkleJson } from './src/text/merkle-json.mjs';
|
|
@@ -18,6 +19,7 @@ import { LogEntry, Logger } from './src/text/logger.mjs';
|
|
|
18
19
|
|
|
19
20
|
export const Text = {
|
|
20
21
|
BilaraPath,
|
|
22
|
+
Corpus,
|
|
21
23
|
EbtDoc,
|
|
22
24
|
LegacyDoc,
|
|
23
25
|
LogEntry,
|
package/src/text/corpus.mjs
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { WordVector } from './word-vector.mjs';
|
|
2
|
+
|
|
3
|
+
export class Corpus {
  /**
   * A keyed collection of documents plus a per-word document-frequency
   * vector used for IDF computations.
   * @param {object} opts
   * @param {WordVector} [opts.wordDocCount] - word => number of documents containing it
   * @param {object} [opts.docMap] - document id => document
   */
  constructor(opts = {}) {
    let { wordDocCount = new WordVector(), docMap = {} } = opts;

    // Cache the count so size is O(1) instead of O(n) per access
    this._size = Object.keys(docMap).length;

    Object.assign(this, {
      wordDocCount,
      docMap,
    });
  }

  // Number of documents currently stored
  get size() {
    return this._size;
  }

  // Insert or replace the document stored under id.
  // Delete-then-add keeps _size correct when id already exists.
  addDocument(id, doc) {
    this.deleteDocument(id);
    this.docMap[id] = doc;
    this._size++;
  }

  // Returns the document for id, or undefined if absent
  getDocument(id) {
    return this.docMap[id];
  }

  // Remove and return the document stored under id (undefined if absent).
  // Presence is tested by key, not by truthiness, so falsy documents
  // (0, '', false, null) are still deleted and _size stays consistent
  // with Object.keys(docMap).length.
  deleteDocument(id) {
    let { docMap } = this;
    let doc = docMap[id];
    if (Object.prototype.hasOwnProperty.call(docMap, id)) {
      delete docMap[id];
      this._size--;
    }

    return doc;
  }
}
|
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { DBG } from '../defines.mjs';
|
|
2
|
+
import { Corpus } from './corpus.mjs';
|
|
2
3
|
import { WordVector } from './word-vector.mjs';
|
|
3
4
|
|
|
4
5
|
// The golden ratio is pretty.
|
|
@@ -10,8 +11,7 @@ export class TfidfSpace {
|
|
|
10
11
|
const msg = 't8e.ctor:';
|
|
11
12
|
let {
|
|
12
13
|
lang = 'en', // 2-letter code: fr, en, es, pt
|
|
13
|
-
|
|
14
|
-
corpusSize = 0, // number of retrieval units (docs, segments, etc.)
|
|
14
|
+
corpus = new Corpus(),
|
|
15
15
|
idfWeight = GOLDEN_FUDGE, // IDF dampening
|
|
16
16
|
idfFunction = TfidfSpace.idfTunable,
|
|
17
17
|
normalizeText,
|
|
@@ -41,8 +41,7 @@ export class TfidfSpace {
|
|
|
41
41
|
// Serializable properties
|
|
42
42
|
Object.assign(this, {
|
|
43
43
|
lang,
|
|
44
|
-
|
|
45
|
-
corpusSize,
|
|
44
|
+
corpus,
|
|
46
45
|
idfWeight,
|
|
47
46
|
});
|
|
48
47
|
}
|
|
@@ -59,46 +58,65 @@ export class TfidfSpace {
|
|
|
59
58
|
}
|
|
60
59
|
|
|
61
60
|
static normalizeFR(s) {
|
|
62
|
-
let sAbbr = s
|
|
61
|
+
let sAbbr = s
|
|
62
|
+
.toLowerCase()
|
|
63
63
|
.replace(/\bd[’']/gi, 'de ')
|
|
64
64
|
.replace(/\bl[’']/gi, 'le ')
|
|
65
65
|
.replace(/\bs[’']/gi, 's_')
|
|
66
66
|
.replace(/\bj[’']/gi, 'j_')
|
|
67
67
|
.replace(/\bm[’']/gi, 'm_')
|
|
68
68
|
.replace(/\bn[’']/gi, 'n_')
|
|
69
|
-
.replace(/\bc[’']/gi, 'c_')
|
|
69
|
+
.replace(/\bc[’']/gi, 'c_');
|
|
70
70
|
return TfidfSpace.removeNonWords(sAbbr);
|
|
71
71
|
}
|
|
72
72
|
|
|
73
73
|
static idfStandard(space, word) {
|
|
74
74
|
const msg = 'w7e.idfStandard:';
|
|
75
|
-
let {
|
|
76
|
-
let wordDocs =
|
|
77
|
-
return Math.log((
|
|
75
|
+
let { corpus } = space;
|
|
76
|
+
let wordDocs = corpus.wordDocCount[word] || 0;
|
|
77
|
+
return Math.log((corpus.size + 1) / (wordDocs + 1));
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
-
  // Non-standard, tunable IDF that maps a word's document frequency to
  // [0..1]: 0 means "ignore" (word appears in every document), 1 means
  // "important" (word appears in no document). idfWeight dampens the
  // exponential falloff.
  static idfTunable(space, word, idfWeight) {
    const msg = 'w7e.idf:';
    let { corpus } = space;
    let wordDocs = corpus.wordDocCount[word] || 0;
    // NOTE: This is NOT the usual formula
    // Map to [0:ignore..1:important]
    // When wordDocs is 0 the division yields -Infinity, Math.exp gives
    // 0, and the result is 1 (maximally important) — this relies on
    // IEEE-754 semantics, not a guard clause.
    // An empty corpus (size 0) also returns 1.
    return corpus.size
      ? 1 -
        Math.exp(((wordDocs - corpus.size) / wordDocs) * idfWeight)
      : 1;
  }
|
|
90
91
|
|
|
91
|
-
  // Inverse document frequency of word, delegating to the IDF function
  // chosen at construction (idfTunable by default). idfWeight defaults
  // to this space's configured weight.
  idf(word, idfWeight = this.idfWeight) {
    return this.idfFunction(this, word, idfWeight);
  }
|
|
94
95
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
96
|
+
addCorpusDocument(id, bow, nWords) {
|
|
97
|
+
const msg = 't8w.addCorpusDocument:';
|
|
98
|
+
let { corpus } = this;
|
|
99
|
+
if (id == null) {
|
|
100
|
+
throw new Error(`${msg} id?`);
|
|
101
|
+
}
|
|
102
|
+
if (bow == null) {
|
|
103
|
+
throw new Error(`${msg} bow?`);
|
|
104
|
+
}
|
|
105
|
+
if (nWords == null) {
|
|
106
|
+
throw new Error(`${msg} nWords?`);
|
|
107
|
+
}
|
|
108
|
+
let docInfo = { id, bow, nWords };
|
|
109
|
+
corpus.addDocument(id, docInfo);
|
|
100
110
|
|
|
101
|
-
return
|
|
111
|
+
return docInfo;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
  // Tokenize doc, fold its distinct words into the corpus-wide
  // document-frequency vector, and register it under id.
  // NOTE(review): re-adding an existing id increments wordDocCount
  // again — Corpus.deleteDocument does not decrement it — so repeated
  // adds inflate document frequencies; confirm whether callers rely on
  // add-once semantics.
  addDocument(id, doc) {
    let { corpus } = this;
    let { bow, words } = this.countWords(doc);
    corpus.wordDocCount.increment(bow.oneHot());

    return this.addCorpusDocument(id, bow, words.length);
  }
|
|
103
121
|
|
|
104
122
|
termFrequency(word, document) {
|
|
@@ -113,7 +131,7 @@ export class TfidfSpace {
|
|
|
113
131
|
|
|
114
132
|
tfidf(doc) {
|
|
115
133
|
const msg = 'w7e.tfidf:';
|
|
116
|
-
let {
|
|
134
|
+
let { corpus, idfWeight } = this;
|
|
117
135
|
|
|
118
136
|
// More efficient implementation of tf * idf
|
|
119
137
|
let { bow, words } = this.countWords(doc);
|
|
@@ -122,9 +140,9 @@ export class TfidfSpace {
|
|
|
122
140
|
let vTfIdf = words.reduce((a, word) => {
|
|
123
141
|
let wd = bow[word] || 0;
|
|
124
142
|
let tf = wd ? wd / nWords : 0;
|
|
125
|
-
let wc =
|
|
126
|
-
let idf =
|
|
127
|
-
? 1 - Math.exp(((wc -
|
|
143
|
+
let wc = corpus.wordDocCount[word] || 0;
|
|
144
|
+
let idf = corpus.size
|
|
145
|
+
? 1 - Math.exp(((wc - corpus.size) / wc) * idfWeight)
|
|
128
146
|
: 1;
|
|
129
147
|
let tfidf = tf * idf;
|
|
130
148
|
if (tfidf) {
|
|
@@ -136,7 +154,7 @@ export class TfidfSpace {
|
|
|
136
154
|
return vTfIdf;
|
|
137
155
|
}
|
|
138
156
|
|
|
139
|
-
countWords(str
|
|
157
|
+
countWords(str) {
|
|
140
158
|
const msg = 'w7e.countWords:';
|
|
141
159
|
if (str == null) {
|
|
142
160
|
throw new Error(`${msg} str?`);
|
|
@@ -145,8 +163,7 @@ export class TfidfSpace {
|
|
|
145
163
|
let sNorm = this.normalizeText(str);
|
|
146
164
|
let words = sNorm.split(' ');
|
|
147
165
|
let bow = words.reduce((a, w) => {
|
|
148
|
-
|
|
149
|
-
a[w] = maxCount ? Math.min(maxCount, count) : count;
|
|
166
|
+
a[w] = (a[w] || 0) + 1;
|
|
150
167
|
return a;
|
|
151
168
|
}, new WordVector());
|
|
152
169
|
|
package/src/text/word-space.mjs
CHANGED
package/src/text/word-vector.mjs
CHANGED
|
@@ -79,7 +79,7 @@ export class WordVector extends Object {
|
|
|
79
79
|
}
|
|
80
80
|
|
|
81
81
|
scale(c) {
|
|
82
|
-
return Object.keys(this).reduce((a,k)=>{
|
|
82
|
+
return Object.keys(this).reduce((a, k) => {
|
|
83
83
|
a[k] *= c;
|
|
84
84
|
return a;
|
|
85
85
|
}, this);
|
|
@@ -111,7 +111,7 @@ export class WordVector extends Object {
|
|
|
111
111
|
}
|
|
112
112
|
|
|
113
113
|
oneHot() {
|
|
114
|
-
return Object.keys(this).reduce((a,k)=>{
|
|
114
|
+
return Object.keys(this).reduce((a, k) => {
|
|
115
115
|
if (this[k] > 0) {
|
|
116
116
|
a[k] = 1;
|
|
117
117
|
}
|