@sc-voice/tools 2.20.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/text/tfidf-space.mjs +16 -7
- package/src/text/word-vector.mjs +28 -8
package/package.json
CHANGED
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -15,6 +15,7 @@ export class TfidfSpace {
|
|
|
15
15
|
idfWeight = GOLDEN_FUDGE, // IDF dampening
|
|
16
16
|
idfFunction = TfidfSpace.idfTunable,
|
|
17
17
|
normalizeText,
|
|
18
|
+
leftQuoteToken,
|
|
18
19
|
} = opts;
|
|
19
20
|
if (lang == null) {
|
|
20
21
|
throw new Error(`${msg} lang?`);
|
|
@@ -31,7 +32,7 @@ export class TfidfSpace {
|
|
|
31
32
|
throw new Error(`${msg} normalizeText?`);
|
|
32
33
|
}
|
|
33
34
|
}
|
|
34
|
-
Object.defineProperty(this, '
|
|
35
|
+
Object.defineProperty(this, '_normalizeText', {
|
|
35
36
|
value: normalizeText,
|
|
36
37
|
});
|
|
37
38
|
Object.defineProperty(this, 'idfFunction', {
|
|
@@ -43,6 +44,7 @@ export class TfidfSpace {
|
|
|
43
44
|
lang,
|
|
44
45
|
corpus,
|
|
45
46
|
idfWeight,
|
|
47
|
+
leftQuoteToken,
|
|
46
48
|
});
|
|
47
49
|
}
|
|
48
50
|
|
|
@@ -50,23 +52,26 @@ export class TfidfSpace {
|
|
|
50
52
|
return s.replace(/<[^>]*>/gi, '');
|
|
51
53
|
}
|
|
52
54
|
|
|
53
|
-
static removeNonWords(s) {
|
|
55
|
+
static removeNonWords(s, opts={}) {
|
|
54
56
|
const RE_RESERVED = /[_-]/g; // allowed in bow words
|
|
55
57
|
const RE_LQUOTE = /[“‘«]/g;
|
|
56
58
|
const RE_PUNCT = /[.,:;$"'“”‘’!?«»\[\]]/g;
|
|
57
59
|
const RE_SPACE = /\s+/g;
|
|
60
|
+
let {
|
|
61
|
+
leftQuoteToken = '', // TBD: is this useful?
|
|
62
|
+
} = opts;
|
|
58
63
|
return TfidfSpace.removeHtml(s)
|
|
59
|
-
.replace(RE_LQUOTE,
|
|
64
|
+
.replace(RE_LQUOTE, leftQuoteToken)
|
|
60
65
|
.replace(RE_PUNCT, '')
|
|
61
66
|
.replace(RE_SPACE, ' ')
|
|
62
67
|
.trim();
|
|
63
68
|
}
|
|
64
69
|
|
|
65
|
-
static normalizeEN(s) {
|
|
66
|
-
return TfidfSpace.removeNonWords(s.toLowerCase());
|
|
70
|
+
static normalizeEN(s, opts={}) {
|
|
71
|
+
return TfidfSpace.removeNonWords(s.toLowerCase(), opts);
|
|
67
72
|
}
|
|
68
73
|
|
|
69
|
-
static normalizeFR(s) {
|
|
74
|
+
static normalizeFR(s, opts={}) {
|
|
70
75
|
let sAbbr = s
|
|
71
76
|
.toLowerCase()
|
|
72
77
|
.replace(/\bd[’']/gi, 'de ')
|
|
@@ -76,7 +81,7 @@ export class TfidfSpace {
|
|
|
76
81
|
.replace(/\bm[’']/gi, 'm_')
|
|
77
82
|
.replace(/\bn[’']/gi, 'n_')
|
|
78
83
|
.replace(/\bc[’']/gi, 'c_');
|
|
79
|
-
return TfidfSpace.removeNonWords(sAbbr);
|
|
84
|
+
return TfidfSpace.removeNonWords(sAbbr, opts);
|
|
80
85
|
}
|
|
81
86
|
|
|
82
87
|
static idfStandard(nDocs, wdc, idfWeight) {
|
|
@@ -165,6 +170,10 @@ export class TfidfSpace {
|
|
|
165
170
|
return this.tfidfOfBow(bow);
|
|
166
171
|
}
|
|
167
172
|
|
|
173
|
+
normalizeText(str) {
|
|
174
|
+
return this._normalizeText(str, this);
|
|
175
|
+
}
|
|
176
|
+
|
|
168
177
|
countWords(str) {
|
|
169
178
|
const msg = 'w7e.countWords:';
|
|
170
179
|
if (str == null) {
|
package/src/text/word-vector.mjs
CHANGED
|
@@ -21,7 +21,13 @@ export class WordVector extends Object {
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
toString(opts = {}) {
|
|
24
|
-
|
|
24
|
+
const msg = 'w10r.toString:';
|
|
25
|
+
let { order = 'value', minValue, precision = 2 } = opts;
|
|
26
|
+
|
|
27
|
+
if (minValue == null) {
|
|
28
|
+
minValue = Math.pow(10, -precision) / 2;
|
|
29
|
+
}
|
|
30
|
+
|
|
25
31
|
let entries = Object.entries(this);
|
|
26
32
|
switch (order) {
|
|
27
33
|
case 'key':
|
|
@@ -42,14 +48,18 @@ export class WordVector extends Object {
|
|
|
42
48
|
}
|
|
43
49
|
let sv = entries.reduce((a, e) => {
|
|
44
50
|
let [k, v] = e;
|
|
45
|
-
|
|
46
|
-
|
|
51
|
+
if (minValue <= v) {
|
|
52
|
+
let vf = v.toFixed(precision)
|
|
53
|
+
.replace(/\.0*$/, '')
|
|
54
|
+
.replace(/0\./,'.');
|
|
55
|
+
a.push(`${k}:${vf}`);
|
|
56
|
+
}
|
|
47
57
|
return a;
|
|
48
58
|
}, []);
|
|
49
59
|
return sv.join(',');
|
|
50
60
|
}
|
|
51
61
|
|
|
52
|
-
norm() {
|
|
62
|
+
norm() { // L2 norm
|
|
53
63
|
let keys = Object.keys(this);
|
|
54
64
|
if (keys.length === 0) {
|
|
55
65
|
return 0;
|
|
@@ -104,17 +114,27 @@ export class WordVector extends Object {
|
|
|
104
114
|
}, this);
|
|
105
115
|
}
|
|
106
116
|
|
|
107
|
-
|
|
117
|
+
hadamardL1(vec2 = {}) {
|
|
118
|
+
// L1-norm of Hadamard product shows how
|
|
119
|
+
// the cosine similarity score is apportioned
|
|
108
120
|
let keys = Object.keys(this);
|
|
109
|
-
|
|
121
|
+
let n = 0;
|
|
122
|
+
let hadamard = keys.reduce((a, k) => {
|
|
110
123
|
let v1 = this[k];
|
|
111
124
|
let v2 = vec2[k] || 0;
|
|
112
125
|
if (v1 && v2) {
|
|
113
126
|
a[k] = v1 * v2;
|
|
127
|
+
n++;
|
|
114
128
|
}
|
|
115
129
|
|
|
116
130
|
return a;
|
|
117
131
|
}, new WordVector());
|
|
132
|
+
|
|
133
|
+
if (n === 0) {
|
|
134
|
+
return hadamard; // empty vector
|
|
135
|
+
}
|
|
136
|
+
let n12 = this.norm() * vec2.norm();
|
|
137
|
+
return hadamard.scale(1/n12);
|
|
118
138
|
}
|
|
119
139
|
|
|
120
140
|
similar(vec2) {
|
|
@@ -125,8 +145,8 @@ export class WordVector extends Object {
|
|
|
125
145
|
let d = this.dot(vec2);
|
|
126
146
|
let norm1 = this.norm();
|
|
127
147
|
let norm2 = vec2.norm();
|
|
128
|
-
let
|
|
129
|
-
return
|
|
148
|
+
let n12 = norm1 * norm2;
|
|
149
|
+
return n12 ? d / n12 : 0;
|
|
130
150
|
}
|
|
131
151
|
|
|
132
152
|
oneHot() {
|