@sc-voice/tools 2.4.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/text/tfidf-space.mjs +44 -33
- package/src/text/word-vector.mjs +7 -0
package/package.json
CHANGED
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -13,6 +13,7 @@ export class TfidfSpace {
|
|
|
13
13
|
corpusBow = new WordVector(), // corpus bag of words
|
|
14
14
|
corpusSize = 0, // number of retrieval units (docs, segments, etc.)
|
|
15
15
|
idfWeight = GOLDEN_FUDGE, // IDF dampening
|
|
16
|
+
idfFunction = TfidfSpace.idfTunable,
|
|
16
17
|
normalizeText,
|
|
17
18
|
} = opts;
|
|
18
19
|
if (lang == null) {
|
|
@@ -33,6 +34,9 @@ export class TfidfSpace {
|
|
|
33
34
|
Object.defineProperty(this, 'normalizeText', {
|
|
34
35
|
value: normalizeText,
|
|
35
36
|
});
|
|
37
|
+
Object.defineProperty(this, 'idfFunction', {
|
|
38
|
+
value: idfFunction,
|
|
39
|
+
});
|
|
36
40
|
|
|
37
41
|
// Serializable properties
|
|
38
42
|
Object.assign(this, {
|
|
@@ -43,24 +47,45 @@ export class TfidfSpace {
|
|
|
43
47
|
});
|
|
44
48
|
}
|
|
45
49
|
|
|
50
|
+
static removeNonWords(s) {
|
|
51
|
+
const RE_RESERVED = /[_-]/g; // allowed in bow words
|
|
52
|
+
const RE_PUNCT = /[.,:;$"'“”‘’!?«»]/g;
|
|
53
|
+
const RE_SPACE = /\s+/g;
|
|
54
|
+
return s.replace(RE_PUNCT, '').replace(RE_SPACE, ' ').trim();
|
|
55
|
+
}
|
|
56
|
+
|
|
46
57
|
static normalizeEN(s) {
|
|
47
|
-
return s
|
|
48
|
-
.toLowerCase()
|
|
49
|
-
.replace(/[-.,_:;"'“”‘’!?]/g, '')
|
|
50
|
-
.replace(/ {2,}/g, ' ')
|
|
51
|
-
.trim();
|
|
58
|
+
return TfidfSpace.removeNonWords(s.toLowerCase());
|
|
52
59
|
}
|
|
53
60
|
|
|
54
61
|
static normalizeFR(s) {
|
|
55
|
-
|
|
56
|
-
.toLowerCase()
|
|
57
|
-
.replace(/[«»]/gi, '')
|
|
62
|
+
let sAbbr = s.toLowerCase()
|
|
58
63
|
.replace(/\bd[’']/gi, 'de ')
|
|
59
64
|
.replace(/\bl[’']/gi, 'le ')
|
|
60
65
|
.replace(/\bs[’']/gi, 'se ')
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
66
|
+
return TfidfSpace.removeNonWords(sAbbr);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
static idfStandard(space, word) {
|
|
70
|
+
const msg = 'w7e.idfStandard:';
|
|
71
|
+
let { corpusBow, corpusSize } = space;
|
|
72
|
+
let wordDocs = corpusBow[word] || 0;
|
|
73
|
+
return Math.log((corpusSize + 1) / (wordDocs+1));
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
static idfTunable(space, word, idfWeight = this.idfWeight) {
|
|
77
|
+
const msg = 'w7e.idf:';
|
|
78
|
+
let { corpusBow, corpusSize } = space;
|
|
79
|
+
let wordDocs = corpusBow[word] || 0;
|
|
80
|
+
// NOTE: This is NOT the usual formula
|
|
81
|
+
// Map to [0:ignore..1:important]
|
|
82
|
+
return corpusSize
|
|
83
|
+
? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
|
|
84
|
+
: 1;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
idf(word, idfWeight) {
|
|
88
|
+
return this.idfFunction(this, word, idfWeight);
|
|
64
89
|
}
|
|
65
90
|
|
|
66
91
|
addDocument(doc) {
|
|
@@ -72,20 +97,6 @@ export class TfidfSpace {
|
|
|
72
97
|
return this;
|
|
73
98
|
}
|
|
74
99
|
|
|
75
|
-
inverseDocumentFrequency(word, idfWeight) {
|
|
76
|
-
return this.idf(word, idfWeight);
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
idf(word, idfWeight = this.idfWeight) {
|
|
80
|
-
const msg = 'w7e.idf:';
|
|
81
|
-
let { corpusBow, corpusSize } = this;
|
|
82
|
-
let wCount = corpusBow[word] || 0;
|
|
83
|
-
// Map to [0:ignore..1:important]
|
|
84
|
-
return corpusSize
|
|
85
|
-
? 1 - Math.exp(((wCount - corpusSize) / wCount) * idfWeight)
|
|
86
|
-
: 1;
|
|
87
|
-
}
|
|
88
|
-
|
|
89
100
|
termFrequency(word, document) {
|
|
90
101
|
return this.tf(word, document);
|
|
91
102
|
}
|
|
@@ -138,19 +149,19 @@ export class TfidfSpace {
|
|
|
138
149
|
return { bow, words };
|
|
139
150
|
}
|
|
140
151
|
|
|
141
|
-
|
|
142
|
-
const msg = 'w7e.
|
|
143
|
-
if (
|
|
144
|
-
throw new Error(`${msg}
|
|
152
|
+
bowOfText(text) {
|
|
153
|
+
const msg = 'w7e.bowOfText:';
|
|
154
|
+
if (text == null) {
|
|
155
|
+
throw new Error(`${msg} text?`);
|
|
145
156
|
}
|
|
146
157
|
let dbg = 0;
|
|
147
|
-
let sNorm = this.normalizeText(
|
|
158
|
+
let sNorm = this.normalizeText(text);
|
|
148
159
|
let words = sNorm.split(' ');
|
|
149
|
-
let
|
|
150
|
-
a[w] = (a[w] || 0) +
|
|
160
|
+
let bow = words.reduce((a, w) => {
|
|
161
|
+
a[w] = (a[w] || 0) + 1;
|
|
151
162
|
return a;
|
|
152
163
|
}, new WordVector());
|
|
153
164
|
|
|
154
|
-
return
|
|
165
|
+
return bow;
|
|
155
166
|
}
|
|
156
167
|
} // TfidfSpace
|
package/src/text/word-vector.mjs
CHANGED
|
@@ -78,6 +78,13 @@ export class WordVector extends Object {
|
|
|
78
78
|
}, 0);
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
+
scale(c) {
|
|
82
|
+
return Object.keys(this).reduce((a,k)=>{
|
|
83
|
+
a[k] *= c;
|
|
84
|
+
return a;
|
|
85
|
+
}, this);
|
|
86
|
+
}
|
|
87
|
+
|
|
81
88
|
intersect(vec2 = {}) {
|
|
82
89
|
let keys = Object.keys(this);
|
|
83
90
|
return keys.reduce((a, k) => {
|