@sc-voice/tools 2.20.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.20.0",
3
+ "version": "3.0.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -15,6 +15,7 @@ export class TfidfSpace {
15
15
  idfWeight = GOLDEN_FUDGE, // IDF dampening
16
16
  idfFunction = TfidfSpace.idfTunable,
17
17
  normalizeText,
18
+ leftQuoteToken,
18
19
  } = opts;
19
20
  if (lang == null) {
20
21
  throw new Error(`${msg} lang?`);
@@ -31,7 +32,7 @@ export class TfidfSpace {
31
32
  throw new Error(`${msg} normalizeText?`);
32
33
  }
33
34
  }
34
- Object.defineProperty(this, 'normalizeText', {
35
+ Object.defineProperty(this, '_normalizeText', {
35
36
  value: normalizeText,
36
37
  });
37
38
  Object.defineProperty(this, 'idfFunction', {
@@ -43,6 +44,7 @@ export class TfidfSpace {
43
44
  lang,
44
45
  corpus,
45
46
  idfWeight,
47
+ leftQuoteToken,
46
48
  });
47
49
  }
48
50
 
@@ -50,23 +52,26 @@ export class TfidfSpace {
50
52
  return s.replace(/<[^>]*>/gi, '');
51
53
  }
52
54
 
53
- static removeNonWords(s) {
55
+ static removeNonWords(s, opts={}) {
54
56
  const RE_RESERVED = /[_-]/g; // allowed in bow words
55
57
  const RE_LQUOTE = /[“‘«]/g;
56
58
  const RE_PUNCT = /[.,:;$"'“”‘’!?«»\[\]]/g;
57
59
  const RE_SPACE = /\s+/g;
60
+ let {
61
+ leftQuoteToken = '', // TBD: is this useful?
62
+ } = opts;
58
63
  return TfidfSpace.removeHtml(s)
59
- .replace(RE_LQUOTE, '__LQUOTE ')
64
+ .replace(RE_LQUOTE, leftQuoteToken)
60
65
  .replace(RE_PUNCT, '')
61
66
  .replace(RE_SPACE, ' ')
62
67
  .trim();
63
68
  }
64
69
 
65
- static normalizeEN(s) {
66
- return TfidfSpace.removeNonWords(s.toLowerCase());
70
+ static normalizeEN(s, opts={}) {
71
+ return TfidfSpace.removeNonWords(s.toLowerCase(), opts);
67
72
  }
68
73
 
69
- static normalizeFR(s) {
74
+ static normalizeFR(s, opts={}) {
70
75
  let sAbbr = s
71
76
  .toLowerCase()
72
77
  .replace(/\bd[’']/gi, 'de ')
@@ -76,7 +81,7 @@ export class TfidfSpace {
76
81
  .replace(/\bm[’']/gi, 'm_')
77
82
  .replace(/\bn[’']/gi, 'n_')
78
83
  .replace(/\bc[’']/gi, 'c_');
79
- return TfidfSpace.removeNonWords(sAbbr);
84
+ return TfidfSpace.removeNonWords(sAbbr, opts);
80
85
  }
81
86
 
82
87
  static idfStandard(nDocs, wdc, idfWeight) {
@@ -165,6 +170,10 @@ export class TfidfSpace {
165
170
  return this.tfidfOfBow(bow);
166
171
  }
167
172
 
173
+ normalizeText(str) {
174
+ return this._normalizeText(str, this);
175
+ }
176
+
168
177
  countWords(str) {
169
178
  const msg = 'w7e.countWords:';
170
179
  if (str == null) {
@@ -21,7 +21,13 @@ export class WordVector extends Object {
21
21
  }
22
22
 
23
23
  toString(opts = {}) {
24
- let { order = 'value', precision = 2 } = opts;
24
+ const msg = 'w10r.toString:';
25
+ let { order = 'value', minValue, precision = 2 } = opts;
26
+
27
+ if (minValue == null) {
28
+ minValue = Math.pow(10, -precision) / 2;
29
+ }
30
+
25
31
  let entries = Object.entries(this);
26
32
  switch (order) {
27
33
  case 'key':
@@ -42,14 +48,18 @@ export class WordVector extends Object {
42
48
  }
43
49
  let sv = entries.reduce((a, e) => {
44
50
  let [k, v] = e;
45
- let vf = v.toFixed(precision).replace(/\.0*$/, '');
46
- a.push(`${k}:${vf}`);
51
+ if (minValue <= v) {
52
+ let vf = v.toFixed(precision)
53
+ .replace(/\.0*$/, '')
54
+ .replace(/0\./,'.');
55
+ a.push(`${k}:${vf}`);
56
+ }
47
57
  return a;
48
58
  }, []);
49
59
  return sv.join(',');
50
60
  }
51
61
 
52
- norm() {
62
+ norm() { // L2 norm
53
63
  let keys = Object.keys(this);
54
64
  if (keys.length === 0) {
55
65
  return 0;
@@ -104,17 +114,27 @@ export class WordVector extends Object {
104
114
  }, this);
105
115
  }
106
116
 
107
- intersect(vec2 = {}) {
117
+ hadamardL1(vec2 = {}) {
118
+ // L1-norm of Hadamard product shows how
119
+ // the cosine similarity score is apportioned
108
120
  let keys = Object.keys(this);
109
- return keys.reduce((a, k) => {
121
+ let n = 0;
122
+ let hadamard = keys.reduce((a, k) => {
110
123
  let v1 = this[k];
111
124
  let v2 = vec2[k] || 0;
112
125
  if (v1 && v2) {
113
126
  a[k] = v1 * v2;
127
+ n++;
114
128
  }
115
129
 
116
130
  return a;
117
131
  }, new WordVector());
132
+
133
+ if (n === 0) {
134
+ return hadamard; // empty vector
135
+ }
136
+ let n12 = this.norm() * vec2.norm();
137
+ return hadamard.scale(1/n12);
118
138
  }
119
139
 
120
140
  similar(vec2) {
@@ -125,8 +145,8 @@ export class WordVector extends Object {
125
145
  let d = this.dot(vec2);
126
146
  let norm1 = this.norm();
127
147
  let norm2 = vec2.norm();
128
- let den = norm1 * norm2;
129
- return den ? d / den : 0;
148
+ let n12 = norm1 * norm2;
149
+ return n12 ? d / n12 : 0;
130
150
  }
131
151
 
132
152
  oneHot() {