@sc-voice/tools 2.8.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.mjs +2 -0
- package/package.json +1 -1
- package/src/text/corpus.mjs +39 -0
- package/src/text/tfidf-space.mjs +33 -28
- package/src/text/word-space.mjs +0 -1
- package/src/text/word-vector.mjs +2 -2
package/index.mjs
CHANGED
|
@@ -6,6 +6,7 @@ export const ScvMath = {
|
|
|
6
6
|
};
|
|
7
7
|
|
|
8
8
|
import { BilaraPath } from './src/text/bilara-path.mjs';
|
|
9
|
+
import { Corpus } from './src/text/corpus.mjs';
|
|
9
10
|
import { EbtDoc } from './src/text/ebt-doc.mjs';
|
|
10
11
|
import { LegacyDoc } from './src/text/legacy-doc.mjs';
|
|
11
12
|
import { MerkleJson } from './src/text/merkle-json.mjs';
|
|
@@ -18,6 +19,7 @@ import { LogEntry, Logger } from './src/text/logger.mjs';
|
|
|
18
19
|
|
|
19
20
|
export const Text = {
|
|
20
21
|
BilaraPath,
|
|
22
|
+
Corpus,
|
|
21
23
|
EbtDoc,
|
|
22
24
|
LegacyDoc,
|
|
23
25
|
LogEntry,
|
package/package.json
CHANGED
package/src/text/corpus.mjs
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { WordVector } from './word-vector.mjs';
|
|
2
|
+
|
|
3
|
+
/**
 * In-memory collection of documents keyed by id, together with a
 * word -> document-count vector.
 *
 * Corpus only stores `wordDocCount`; none of the methods below update it.
 * Whoever adds or deletes documents is responsible for keeping it in sync.
 */
export class Corpus {
  /**
   * @param {object} [opts]
   * @param {object} [opts.wordDocCount] - per-word document counts
   *   (defaults to an empty WordVector)
   * @param {object} [opts.docMap] - existing id -> document map to adopt;
   *   the object is used as-is (not copied)
   */
  constructor(opts = {}) {
    const { wordDocCount = new WordVector(), docMap = {} } = opts;

    // Cached document count; initialised from the own enumerable keys of
    // the adopted map and maintained by addDocument/deleteDocument.
    this._size = Object.keys(docMap).length;

    Object.assign(this, {
      wordDocCount,
      docMap,
    });
  }

  /** @returns {number} number of documents currently stored */
  get size() {
    return this._size;
  }

  /**
   * Store `doc` under `id`, replacing any document already stored there.
   *
   * @param {string} id - document identifier
   * @param {*} doc - document payload (any value, including falsy ones)
   */
  addDocument(id, doc) {
    this.deleteDocument(id);
    // defineProperty (rather than plain assignment) guarantees an own data
    // property even for ids such as "__proto__", where assignment would
    // silently change the map's prototype instead of storing an entry.
    Object.defineProperty(this.docMap, id, {
      value: doc,
      writable: true,
      enumerable: true,
      configurable: true,
    });
    this._size++;
  }

  /**
   * @param {string} id - document identifier
   * @returns {*} the stored document, or undefined if `id` is not stored
   */
  getDocument(id) {
    const { docMap } = this;
    // Own-property check so that inherited members ("constructor",
    // "toString", ...) are never reported as documents.
    return Object.prototype.hasOwnProperty.call(docMap, id)
      ? docMap[id]
      : undefined;
  }

  /**
   * Remove the document stored under `id`, if any.
   * Does not modify `wordDocCount`.
   *
   * @param {string} id - document identifier
   * @returns {*} the removed document, or undefined if `id` was not stored
   */
  deleteDocument(id) {
    const { docMap } = this;
    // Presence is decided by key ownership, not by truthiness of the value:
    // falsy documents must still be removable, and inherited keys must not
    // decrement the size.
    if (!Object.prototype.hasOwnProperty.call(docMap, id)) {
      return undefined;
    }

    const doc = docMap[id];
    delete docMap[id];
    this._size--;

    return doc;
  }
}
|
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { DBG } from '../defines.mjs';
|
|
2
|
+
import { Corpus } from './corpus.mjs';
|
|
2
3
|
import { WordVector } from './word-vector.mjs';
|
|
3
4
|
|
|
4
5
|
// The golden ratio is pretty.
|
|
@@ -10,8 +11,7 @@ export class TfidfSpace {
|
|
|
10
11
|
const msg = 't8e.ctor:';
|
|
11
12
|
let {
|
|
12
13
|
lang = 'en', // 2-letter code: fr, en, es, pt
|
|
13
|
-
|
|
14
|
-
corpusSize = 0, // number of retrieval units (docs, segments, etc.)
|
|
14
|
+
corpus = new Corpus(),
|
|
15
15
|
idfWeight = GOLDEN_FUDGE, // IDF dampening
|
|
16
16
|
idfFunction = TfidfSpace.idfTunable,
|
|
17
17
|
normalizeText,
|
|
@@ -41,8 +41,7 @@ export class TfidfSpace {
|
|
|
41
41
|
// Serializable properties
|
|
42
42
|
Object.assign(this, {
|
|
43
43
|
lang,
|
|
44
|
-
|
|
45
|
-
corpusSize,
|
|
44
|
+
corpus,
|
|
46
45
|
idfWeight,
|
|
47
46
|
});
|
|
48
47
|
}
|
|
@@ -59,46 +58,53 @@ export class TfidfSpace {
|
|
|
59
58
|
}
|
|
60
59
|
|
|
61
60
|
static normalizeFR(s) {
|
|
62
|
-
let sAbbr = s
|
|
61
|
+
let sAbbr = s
|
|
62
|
+
.toLowerCase()
|
|
63
63
|
.replace(/\bd[’']/gi, 'de ')
|
|
64
64
|
.replace(/\bl[’']/gi, 'le ')
|
|
65
65
|
.replace(/\bs[’']/gi, 's_')
|
|
66
66
|
.replace(/\bj[’']/gi, 'j_')
|
|
67
67
|
.replace(/\bm[’']/gi, 'm_')
|
|
68
68
|
.replace(/\bn[’']/gi, 'n_')
|
|
69
|
-
.replace(/\bc[’']/gi, 'c_')
|
|
69
|
+
.replace(/\bc[’']/gi, 'c_');
|
|
70
70
|
return TfidfSpace.removeNonWords(sAbbr);
|
|
71
71
|
}
|
|
72
72
|
|
|
73
73
|
static idfStandard(space, word) {
|
|
74
74
|
const msg = 'w7e.idfStandard:';
|
|
75
|
-
let {
|
|
76
|
-
let wordDocs =
|
|
77
|
-
return Math.log((
|
|
75
|
+
let { corpus } = space;
|
|
76
|
+
let wordDocs = corpus.wordDocCount[word] || 0;
|
|
77
|
+
return Math.log((corpus.size + 1) / (wordDocs + 1));
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
-
static idfTunable(space, word, idfWeight
|
|
80
|
+
static idfTunable(space, word, idfWeight) {
|
|
81
81
|
const msg = 'w7e.idf:';
|
|
82
|
-
let {
|
|
83
|
-
let wordDocs =
|
|
82
|
+
let { corpus } = space;
|
|
83
|
+
let wordDocs = corpus.wordDocCount[word] || 0;
|
|
84
84
|
// NOTE: This is NOT the usual formula
|
|
85
85
|
// Map to [0:ignore..1:important]
|
|
86
|
-
return
|
|
87
|
-
? 1 -
|
|
86
|
+
return corpus.size
|
|
87
|
+
? 1 -
|
|
88
|
+
Math.exp(((wordDocs - corpus.size) / wordDocs) * idfWeight)
|
|
88
89
|
: 1;
|
|
89
90
|
}
|
|
90
91
|
|
|
91
|
-
idf(word, idfWeight) {
|
|
92
|
+
idf(word, idfWeight = this.idfWeight) {
|
|
92
93
|
return this.idfFunction(this, word, idfWeight);
|
|
93
94
|
}
|
|
94
95
|
|
|
95
|
-
addDocument(doc) {
|
|
96
|
-
let {
|
|
97
|
-
this.
|
|
98
|
-
|
|
99
|
-
|
|
96
|
+
addDocument(id, doc) {
|
|
97
|
+
let { corpus } = this;
|
|
98
|
+
let { bow, words } = this.countWords(doc);
|
|
99
|
+
corpus.wordDocCount.increment(bow.oneHot());
|
|
100
|
+
|
|
101
|
+
let docInfo = {
|
|
102
|
+
bow,
|
|
103
|
+
nWords: words.length,
|
|
104
|
+
};
|
|
105
|
+
corpus.addDocument(id, docInfo);
|
|
100
106
|
|
|
101
|
-
return
|
|
107
|
+
return docInfo;
|
|
102
108
|
}
|
|
103
109
|
|
|
104
110
|
termFrequency(word, document) {
|
|
@@ -113,7 +119,7 @@ export class TfidfSpace {
|
|
|
113
119
|
|
|
114
120
|
tfidf(doc) {
|
|
115
121
|
const msg = 'w7e.tfidf:';
|
|
116
|
-
let {
|
|
122
|
+
let { corpus, idfWeight } = this;
|
|
117
123
|
|
|
118
124
|
// More efficient implementation of tf * idf
|
|
119
125
|
let { bow, words } = this.countWords(doc);
|
|
@@ -122,9 +128,9 @@ export class TfidfSpace {
|
|
|
122
128
|
let vTfIdf = words.reduce((a, word) => {
|
|
123
129
|
let wd = bow[word] || 0;
|
|
124
130
|
let tf = wd ? wd / nWords : 0;
|
|
125
|
-
let wc =
|
|
126
|
-
let idf =
|
|
127
|
-
? 1 - Math.exp(((wc -
|
|
131
|
+
let wc = corpus.wordDocCount[word] || 0;
|
|
132
|
+
let idf = corpus.size
|
|
133
|
+
? 1 - Math.exp(((wc - corpus.size) / wc) * idfWeight)
|
|
128
134
|
: 1;
|
|
129
135
|
let tfidf = tf * idf;
|
|
130
136
|
if (tfidf) {
|
|
@@ -136,7 +142,7 @@ export class TfidfSpace {
|
|
|
136
142
|
return vTfIdf;
|
|
137
143
|
}
|
|
138
144
|
|
|
139
|
-
countWords(str
|
|
145
|
+
countWords(str) {
|
|
140
146
|
const msg = 'w7e.countWords:';
|
|
141
147
|
if (str == null) {
|
|
142
148
|
throw new Error(`${msg} str?`);
|
|
@@ -145,8 +151,7 @@ export class TfidfSpace {
|
|
|
145
151
|
let sNorm = this.normalizeText(str);
|
|
146
152
|
let words = sNorm.split(' ');
|
|
147
153
|
let bow = words.reduce((a, w) => {
|
|
148
|
-
|
|
149
|
-
a[w] = maxCount ? Math.min(maxCount, count) : count;
|
|
154
|
+
a[w] = (a[w] || 0) + 1;
|
|
150
155
|
return a;
|
|
151
156
|
}, new WordVector());
|
|
152
157
|
|
package/src/text/word-space.mjs
CHANGED
package/src/text/word-vector.mjs
CHANGED
|
@@ -79,7 +79,7 @@ export class WordVector extends Object {
|
|
|
79
79
|
}
|
|
80
80
|
|
|
81
81
|
scale(c) {
|
|
82
|
-
return Object.keys(this).reduce((a,k)=>{
|
|
82
|
+
return Object.keys(this).reduce((a, k) => {
|
|
83
83
|
a[k] *= c;
|
|
84
84
|
return a;
|
|
85
85
|
}, this);
|
|
@@ -111,7 +111,7 @@ export class WordVector extends Object {
|
|
|
111
111
|
}
|
|
112
112
|
|
|
113
113
|
oneHot() {
|
|
114
|
-
return Object.keys(this).reduce((a,k)=>{
|
|
114
|
+
return Object.keys(this).reduce((a, k) => {
|
|
115
115
|
if (this[k] > 0) {
|
|
116
116
|
a[k] = 1;
|
|
117
117
|
}
|