@sc-voice/tools 2.16.0 → 2.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/text/tfidf-space.mjs +16 -6
- package/src/text/word-vector.mjs +6 -6
package/package.json
CHANGED
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -46,11 +46,20 @@ export class TfidfSpace {
|
|
|
46
46
|
});
|
|
47
47
|
}
|
|
48
48
|
|
|
49
|
+
static removeHtml(s) {
|
|
50
|
+
return s.replace(/<[^>]*>/gi, '');
|
|
51
|
+
}
|
|
52
|
+
|
|
49
53
|
static removeNonWords(s) {
|
|
50
54
|
const RE_RESERVED = /[_-]/g; // allowed in bow words
|
|
51
|
-
const
|
|
55
|
+
const RE_LQUOTE = /[“‘«]/g;
|
|
56
|
+
const RE_PUNCT = /[.,:;$"'“”‘’!?«»\[\]]/g;
|
|
52
57
|
const RE_SPACE = /\s+/g;
|
|
53
|
-
return
|
|
58
|
+
return TfidfSpace.removeHtml(s)
|
|
59
|
+
.replace(RE_LQUOTE, '__LQUOTE ')
|
|
60
|
+
.replace(RE_PUNCT, '')
|
|
61
|
+
.replace(RE_SPACE, ' ')
|
|
62
|
+
.trim();
|
|
54
63
|
}
|
|
55
64
|
|
|
56
65
|
static normalizeEN(s) {
|
|
@@ -100,7 +109,7 @@ export class TfidfSpace {
|
|
|
100
109
|
// Bag-of-words maps word to wordCount(word,doc)
|
|
101
110
|
throw new Error(`${msg} bow?`);
|
|
102
111
|
}
|
|
103
|
-
let nWords = Object.values(bow).reduce((a,v)=>a+v);
|
|
112
|
+
let nWords = Object.values(bow).reduce((a, v) => a + v);
|
|
104
113
|
let docInfo = { id, bow, nWords };
|
|
105
114
|
corpus.wordDocCount.increment(bow.oneHot());
|
|
106
115
|
corpus.addDocument(id, docInfo);
|
|
@@ -111,7 +120,7 @@ export class TfidfSpace {
|
|
|
111
120
|
addDocument(id, doc) {
|
|
112
121
|
let { corpus } = this;
|
|
113
122
|
let { bow, words } = this.countWords(doc);
|
|
114
|
-
|
|
123
|
+
|
|
115
124
|
return this.addCorpusDocument(id, bow, words.length);
|
|
116
125
|
}
|
|
117
126
|
|
|
@@ -131,7 +140,7 @@ export class TfidfSpace {
|
|
|
131
140
|
|
|
132
141
|
// More efficient implementation of tf * idf
|
|
133
142
|
let words = Object.keys(bow);
|
|
134
|
-
let nWords = words.reduce((a,w)=>a+bow[w],0);
|
|
143
|
+
let nWords = words.reduce((a, w) => a + bow[w], 0);
|
|
135
144
|
|
|
136
145
|
let vTfIdf = words.reduce((a, word) => {
|
|
137
146
|
let wd = bow[word] || 0;
|
|
@@ -150,7 +159,8 @@ export class TfidfSpace {
|
|
|
150
159
|
return vTfIdf;
|
|
151
160
|
}
|
|
152
161
|
|
|
153
|
-
tfidf(text) {
|
|
162
|
+
tfidf(text) {
|
|
163
|
+
// TfIdf of words in text w/r to corpus
|
|
154
164
|
let { bow } = this.countWords(text);
|
|
155
165
|
return this.tfidfOfBow(bow);
|
|
156
166
|
}
|
package/src/text/word-vector.mjs
CHANGED
|
@@ -20,12 +20,12 @@ export class WordVector extends Object {
|
|
|
20
20
|
return this.$length;
|
|
21
21
|
}
|
|
22
22
|
|
|
23
|
-
toString(opts={}) {
|
|
24
|
-
let { order='value', precision=2 } = opts;
|
|
23
|
+
toString(opts = {}) {
|
|
24
|
+
let { order = 'value', precision = 2 } = opts;
|
|
25
25
|
let entries = Object.entries(this);
|
|
26
26
|
switch (order) {
|
|
27
27
|
case 'key':
|
|
28
|
-
entries.sort((a,b)=>{
|
|
28
|
+
entries.sort((a, b) => {
|
|
29
29
|
let [ka] = a;
|
|
30
30
|
let [kb] = b;
|
|
31
31
|
return ka.localeCompare(kb);
|
|
@@ -33,16 +33,16 @@ export class WordVector extends Object {
|
|
|
33
33
|
break;
|
|
34
34
|
case 'value':
|
|
35
35
|
default:
|
|
36
|
-
entries.sort((a,b)=>{
|
|
36
|
+
entries.sort((a, b) => {
|
|
37
37
|
let [ka, va] = a;
|
|
38
38
|
let [kb, vb] = b;
|
|
39
|
-
return
|
|
39
|
+
return vb - va || ka.localeCompare(kb);
|
|
40
40
|
});
|
|
41
41
|
break;
|
|
42
42
|
}
|
|
43
43
|
let sv = entries.reduce((a, e) => {
|
|
44
44
|
let [k, v] = e;
|
|
45
|
-
let vf = v.toFixed(precision).replace(/\.0*$/,'');
|
|
45
|
+
let vf = v.toFixed(precision).replace(/\.0*$/, '');
|
|
46
46
|
a.push(`${k}:${vf}`);
|
|
47
47
|
return a;
|
|
48
48
|
}, []);
|