@sc-voice/tools 2.19.0 → 2.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/text/legacy-doc.mjs +2 -2
- package/src/text/tfidf-space.mjs +16 -7
- package/src/text/word-vector.mjs +13 -3
package/package.json
CHANGED
package/src/text/legacy-doc.mjs
CHANGED
|
@@ -39,7 +39,7 @@ export class LegacyDoc {
|
|
|
39
39
|
|
|
40
40
|
static legacyUrl(opts={}) {
|
|
41
41
|
let {
|
|
42
|
-
endPoint = 'https://
|
|
42
|
+
endPoint = 'https://suttacentral.net/api/suttas',
|
|
43
43
|
sutta_uid,
|
|
44
44
|
lang,
|
|
45
45
|
author,
|
|
@@ -65,7 +65,7 @@ export class LegacyDoc {
|
|
|
65
65
|
dbg && console.log(msg, '[2]scapi', res.ok);
|
|
66
66
|
}
|
|
67
67
|
if (!res.ok) {
|
|
68
|
-
throw new Error(`${msg} {res.status} ${url}`);
|
|
68
|
+
throw new Error(`${msg} ${res.status} ${url}`);
|
|
69
69
|
}
|
|
70
70
|
let json = await res.json();
|
|
71
71
|
let { translation } = json;
|
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -15,6 +15,7 @@ export class TfidfSpace {
|
|
|
15
15
|
idfWeight = GOLDEN_FUDGE, // IDF dampening
|
|
16
16
|
idfFunction = TfidfSpace.idfTunable,
|
|
17
17
|
normalizeText,
|
|
18
|
+
leftQuoteToken,
|
|
18
19
|
} = opts;
|
|
19
20
|
if (lang == null) {
|
|
20
21
|
throw new Error(`${msg} lang?`);
|
|
@@ -31,7 +32,7 @@ export class TfidfSpace {
|
|
|
31
32
|
throw new Error(`${msg} normalizeText?`);
|
|
32
33
|
}
|
|
33
34
|
}
|
|
34
|
-
Object.defineProperty(this, '
|
|
35
|
+
Object.defineProperty(this, '_normalizeText', {
|
|
35
36
|
value: normalizeText,
|
|
36
37
|
});
|
|
37
38
|
Object.defineProperty(this, 'idfFunction', {
|
|
@@ -43,6 +44,7 @@ export class TfidfSpace {
|
|
|
43
44
|
lang,
|
|
44
45
|
corpus,
|
|
45
46
|
idfWeight,
|
|
47
|
+
leftQuoteToken,
|
|
46
48
|
});
|
|
47
49
|
}
|
|
48
50
|
|
|
@@ -50,23 +52,26 @@ export class TfidfSpace {
|
|
|
50
52
|
return s.replace(/<[^>]*>/gi, '');
|
|
51
53
|
}
|
|
52
54
|
|
|
53
|
-
static removeNonWords(s) {
|
|
55
|
+
static removeNonWords(s, opts={}) {
|
|
54
56
|
const RE_RESERVED = /[_-]/g; // allowed in bow words
|
|
55
57
|
const RE_LQUOTE = /[“‘«]/g;
|
|
56
58
|
const RE_PUNCT = /[.,:;$"'“”‘’!?«»\[\]]/g;
|
|
57
59
|
const RE_SPACE = /\s+/g;
|
|
60
|
+
let {
|
|
61
|
+
leftQuoteToken = '', // TBD: is this useful?
|
|
62
|
+
} = opts;
|
|
58
63
|
return TfidfSpace.removeHtml(s)
|
|
59
|
-
.replace(RE_LQUOTE,
|
|
64
|
+
.replace(RE_LQUOTE, leftQuoteToken)
|
|
60
65
|
.replace(RE_PUNCT, '')
|
|
61
66
|
.replace(RE_SPACE, ' ')
|
|
62
67
|
.trim();
|
|
63
68
|
}
|
|
64
69
|
|
|
65
|
-
static normalizeEN(s) {
|
|
66
|
-
return TfidfSpace.removeNonWords(s.toLowerCase());
|
|
70
|
+
static normalizeEN(s, opts={}) {
|
|
71
|
+
return TfidfSpace.removeNonWords(s.toLowerCase(), opts);
|
|
67
72
|
}
|
|
68
73
|
|
|
69
|
-
static normalizeFR(s) {
|
|
74
|
+
static normalizeFR(s, opts={}) {
|
|
70
75
|
let sAbbr = s
|
|
71
76
|
.toLowerCase()
|
|
72
77
|
.replace(/\bd[’']/gi, 'de ')
|
|
@@ -76,7 +81,7 @@ export class TfidfSpace {
|
|
|
76
81
|
.replace(/\bm[’']/gi, 'm_')
|
|
77
82
|
.replace(/\bn[’']/gi, 'n_')
|
|
78
83
|
.replace(/\bc[’']/gi, 'c_');
|
|
79
|
-
return TfidfSpace.removeNonWords(sAbbr);
|
|
84
|
+
return TfidfSpace.removeNonWords(sAbbr, opts);
|
|
80
85
|
}
|
|
81
86
|
|
|
82
87
|
static idfStandard(nDocs, wdc, idfWeight) {
|
|
@@ -165,6 +170,10 @@ export class TfidfSpace {
|
|
|
165
170
|
return this.tfidfOfBow(bow);
|
|
166
171
|
}
|
|
167
172
|
|
|
173
|
+
normalizeText(str) {
|
|
174
|
+
return this._normalizeText(str, this);
|
|
175
|
+
}
|
|
176
|
+
|
|
168
177
|
countWords(str) {
|
|
169
178
|
const msg = 'w7e.countWords:';
|
|
170
179
|
if (str == null) {
|
package/src/text/word-vector.mjs
CHANGED
|
@@ -21,7 +21,13 @@ export class WordVector extends Object {
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
toString(opts = {}) {
|
|
24
|
-
|
|
24
|
+
const msg = 'w10r.toString:';
|
|
25
|
+
let { order = 'value', minValue, precision = 2 } = opts;
|
|
26
|
+
|
|
27
|
+
if (minValue == null) {
|
|
28
|
+
minValue = Math.pow(10, -precision) / 2;
|
|
29
|
+
}
|
|
30
|
+
|
|
25
31
|
let entries = Object.entries(this);
|
|
26
32
|
switch (order) {
|
|
27
33
|
case 'key':
|
|
@@ -42,8 +48,12 @@ export class WordVector extends Object {
|
|
|
42
48
|
}
|
|
43
49
|
let sv = entries.reduce((a, e) => {
|
|
44
50
|
let [k, v] = e;
|
|
45
|
-
|
|
46
|
-
|
|
51
|
+
if (minValue <= v) {
|
|
52
|
+
let vf = v.toFixed(precision)
|
|
53
|
+
.replace(/\.0*$/, '')
|
|
54
|
+
.replace(/0\./,'.');
|
|
55
|
+
a.push(`${k}:${vf}`);
|
|
56
|
+
}
|
|
47
57
|
return a;
|
|
48
58
|
}, []);
|
|
49
59
|
return sv.join(',');
|