@sc-voice/tools 2.7.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.mjs +2 -0
- package/package.json +1 -1
- package/src/text/corpus.mjs +39 -0
- package/src/text/tfidf-space.mjs +33 -27
- package/src/text/word-vector.mjs +10 -1
package/index.mjs
CHANGED
|
@@ -6,6 +6,7 @@ export const ScvMath = {
|
|
|
6
6
|
};
|
|
7
7
|
|
|
8
8
|
import { BilaraPath } from './src/text/bilara-path.mjs';
|
|
9
|
+
import { Corpus } from './src/text/corpus.mjs';
|
|
9
10
|
import { EbtDoc } from './src/text/ebt-doc.mjs';
|
|
10
11
|
import { LegacyDoc } from './src/text/legacy-doc.mjs';
|
|
11
12
|
import { MerkleJson } from './src/text/merkle-json.mjs';
|
|
@@ -18,6 +19,7 @@ import { LogEntry, Logger } from './src/text/logger.mjs';
|
|
|
18
19
|
|
|
19
20
|
export const Text = {
|
|
20
21
|
BilaraPath,
|
|
22
|
+
Corpus,
|
|
21
23
|
EbtDoc,
|
|
22
24
|
LegacyDoc,
|
|
23
25
|
LogEntry,
|
package/src/text/corpus.mjs
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { WordVector } from './word-vector.mjs';
|
|
2
|
+
|
|
3
|
+
export class Corpus {
  /**
   * A corpus stores documents by id together with a word→document-count
   * vector (how many documents contain each word) for IDF computation.
   * @param {object} [opts]
   * @param {object} [opts.wordDocCount] - word => number of documents containing that word
   * @param {object} [opts.docMap] - id => stored document info
   */
  constructor(opts = {}) {
    let { wordDocCount = new WordVector(), docMap = {} } = opts;

    // Cache the document count so size is O(1) instead of
    // re-scanning docMap keys on every access.
    this._size = Object.keys(docMap).length;

    Object.assign(this, {
      wordDocCount,
      docMap,
    });
  }

  // Number of documents currently in the corpus
  get size() {
    return this._size;
  }

  /**
   * Add (or replace) the document stored under id.
   * Replacing an existing document leaves size unchanged.
   */
  addDocument(id, doc) {
    this.deleteDocument(id);
    this.docMap[id] = doc;
    this._size++;
  }

  // Return the document stored under id, or undefined if absent
  getDocument(id) {
    return this.docMap[id];
  }

  /**
   * Remove and return the document stored under id
   * (undefined when id is absent).
   *
   * BUGFIX: test presence with an own-property check rather than
   * truthiness of the stored value; otherwise a falsy document
   * (0, '', false, null) would never be deleted and _size would
   * drift out of sync with docMap.
   */
  deleteDocument(id) {
    let { docMap } = this;
    let doc = docMap[id];
    if (Object.hasOwn(docMap, id)) {
      delete docMap[id];
      this._size--;
    }

    return doc;
  }
}
|
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { DBG } from '../defines.mjs';
|
|
2
|
+
import { Corpus } from './corpus.mjs';
|
|
2
3
|
import { WordVector } from './word-vector.mjs';
|
|
3
4
|
|
|
4
5
|
// The golden ratio is pretty.
|
|
@@ -10,8 +11,7 @@ export class TfidfSpace {
|
|
|
10
11
|
const msg = 't8e.ctor:';
|
|
11
12
|
let {
|
|
12
13
|
lang = 'en', // 2-letter code: fr, en, es, pt
|
|
13
|
-
|
|
14
|
-
corpusSize = 0, // number of retrieval units (docs, segments, etc.)
|
|
14
|
+
corpus = new Corpus(),
|
|
15
15
|
idfWeight = GOLDEN_FUDGE, // IDF dampening
|
|
16
16
|
idfFunction = TfidfSpace.idfTunable,
|
|
17
17
|
normalizeText,
|
|
@@ -41,8 +41,7 @@ export class TfidfSpace {
|
|
|
41
41
|
// Serializable properties
|
|
42
42
|
Object.assign(this, {
|
|
43
43
|
lang,
|
|
44
|
-
|
|
45
|
-
corpusSize,
|
|
44
|
+
corpus,
|
|
46
45
|
idfWeight,
|
|
47
46
|
});
|
|
48
47
|
}
|
|
@@ -59,45 +58,53 @@ export class TfidfSpace {
|
|
|
59
58
|
}
|
|
60
59
|
|
|
61
60
|
static normalizeFR(s) {
|
|
62
|
-
let sAbbr = s
|
|
61
|
+
let sAbbr = s
|
|
62
|
+
.toLowerCase()
|
|
63
63
|
.replace(/\bd[’']/gi, 'de ')
|
|
64
64
|
.replace(/\bl[’']/gi, 'le ')
|
|
65
65
|
.replace(/\bs[’']/gi, 's_')
|
|
66
66
|
.replace(/\bj[’']/gi, 'j_')
|
|
67
67
|
.replace(/\bm[’']/gi, 'm_')
|
|
68
68
|
.replace(/\bn[’']/gi, 'n_')
|
|
69
|
+
.replace(/\bc[’']/gi, 'c_');
|
|
69
70
|
return TfidfSpace.removeNonWords(sAbbr);
|
|
70
71
|
}
|
|
71
72
|
|
|
72
73
|
static idfStandard(space, word) {
|
|
73
74
|
const msg = 'w7e.idfStandard:';
|
|
74
|
-
let {
|
|
75
|
-
let wordDocs =
|
|
76
|
-
return Math.log((
|
|
75
|
+
let { corpus } = space;
|
|
76
|
+
let wordDocs = corpus.wordDocCount[word] || 0;
|
|
77
|
+
return Math.log((corpus.size + 1) / (wordDocs + 1));
|
|
77
78
|
}
|
|
78
79
|
|
|
79
|
-
static idfTunable(space, word, idfWeight
|
|
80
|
+
static idfTunable(space, word, idfWeight) {
|
|
80
81
|
const msg = 'w7e.idf:';
|
|
81
|
-
let {
|
|
82
|
-
let wordDocs =
|
|
82
|
+
let { corpus } = space;
|
|
83
|
+
let wordDocs = corpus.wordDocCount[word] || 0;
|
|
83
84
|
// NOTE: This is NOT the usual formula
|
|
84
85
|
// Map to [0:ignore..1:important]
|
|
85
|
-
return
|
|
86
|
-
? 1 -
|
|
86
|
+
return corpus.size
|
|
87
|
+
? 1 -
|
|
88
|
+
Math.exp(((wordDocs - corpus.size) / wordDocs) * idfWeight)
|
|
87
89
|
: 1;
|
|
88
90
|
}
|
|
89
91
|
|
|
90
|
-
idf(word, idfWeight) {
|
|
92
|
+
idf(word, idfWeight = this.idfWeight) {
|
|
91
93
|
return this.idfFunction(this, word, idfWeight);
|
|
92
94
|
}
|
|
93
95
|
|
|
94
|
-
addDocument(doc) {
|
|
95
|
-
let {
|
|
96
|
-
this.
|
|
97
|
-
|
|
98
|
-
|
|
96
|
+
addDocument(id, doc) {
|
|
97
|
+
let { corpus } = this;
|
|
98
|
+
let { bow, words } = this.countWords(doc);
|
|
99
|
+
corpus.wordDocCount.increment(bow.oneHot());
|
|
100
|
+
|
|
101
|
+
let docInfo = {
|
|
102
|
+
bow,
|
|
103
|
+
nWords: words.length,
|
|
104
|
+
};
|
|
105
|
+
corpus.addDocument(id, docInfo);
|
|
99
106
|
|
|
100
|
-
return
|
|
107
|
+
return docInfo;
|
|
101
108
|
}
|
|
102
109
|
|
|
103
110
|
termFrequency(word, document) {
|
|
@@ -112,7 +119,7 @@ export class TfidfSpace {
|
|
|
112
119
|
|
|
113
120
|
tfidf(doc) {
|
|
114
121
|
const msg = 'w7e.tfidf:';
|
|
115
|
-
let {
|
|
122
|
+
let { corpus, idfWeight } = this;
|
|
116
123
|
|
|
117
124
|
// More efficient implementation of tf * idf
|
|
118
125
|
let { bow, words } = this.countWords(doc);
|
|
@@ -121,9 +128,9 @@ export class TfidfSpace {
|
|
|
121
128
|
let vTfIdf = words.reduce((a, word) => {
|
|
122
129
|
let wd = bow[word] || 0;
|
|
123
130
|
let tf = wd ? wd / nWords : 0;
|
|
124
|
-
let wc =
|
|
125
|
-
let idf =
|
|
126
|
-
? 1 - Math.exp(((wc -
|
|
131
|
+
let wc = corpus.wordDocCount[word] || 0;
|
|
132
|
+
let idf = corpus.size
|
|
133
|
+
? 1 - Math.exp(((wc - corpus.size) / wc) * idfWeight)
|
|
127
134
|
: 1;
|
|
128
135
|
let tfidf = tf * idf;
|
|
129
136
|
if (tfidf) {
|
|
@@ -135,7 +142,7 @@ export class TfidfSpace {
|
|
|
135
142
|
return vTfIdf;
|
|
136
143
|
}
|
|
137
144
|
|
|
138
|
-
countWords(str
|
|
145
|
+
countWords(str) {
|
|
139
146
|
const msg = 'w7e.countWords:';
|
|
140
147
|
if (str == null) {
|
|
141
148
|
throw new Error(`${msg} str?`);
|
|
@@ -144,8 +151,7 @@ export class TfidfSpace {
|
|
|
144
151
|
let sNorm = this.normalizeText(str);
|
|
145
152
|
let words = sNorm.split(' ');
|
|
146
153
|
let bow = words.reduce((a, w) => {
|
|
147
|
-
|
|
148
|
-
a[w] = maxCount ? Math.min(maxCount, count) : count;
|
|
154
|
+
a[w] = (a[w] || 0) + 1;
|
|
149
155
|
return a;
|
|
150
156
|
}, new WordVector());
|
|
151
157
|
|
package/src/text/word-vector.mjs
CHANGED
|
@@ -79,7 +79,7 @@ export class WordVector extends Object {
|
|
|
79
79
|
}
|
|
80
80
|
|
|
81
81
|
scale(c) {
|
|
82
|
-
return Object.keys(this).reduce((a,k)=>{
|
|
82
|
+
return Object.keys(this).reduce((a, k) => {
|
|
83
83
|
a[k] *= c;
|
|
84
84
|
return a;
|
|
85
85
|
}, this);
|
|
@@ -109,4 +109,13 @@ export class WordVector extends Object {
|
|
|
109
109
|
let den = norm1 * norm2;
|
|
110
110
|
return den ? d / den : 0;
|
|
111
111
|
}
|
|
112
|
+
|
|
113
|
+
oneHot() {
|
|
114
|
+
return Object.keys(this).reduce((a, k) => {
|
|
115
|
+
if (this[k] > 0) {
|
|
116
|
+
a[k] = 1;
|
|
117
|
+
}
|
|
118
|
+
return a;
|
|
119
|
+
}, new WordVector());
|
|
120
|
+
}
|
|
112
121
|
} // WordVector
|