@sc-voice/tools 2.16.0 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.16.0",
3
+ "version": "2.18.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -46,11 +46,20 @@ export class TfidfSpace {
46
46
  });
47
47
  }
48
48
 
49
+ static removeHtml(s) {
50
+ return s.replace(/<[^>]*>/gi, '');
51
+ }
52
+
49
53
  static removeNonWords(s) {
50
54
  const RE_RESERVED = /[_-]/g; // allowed in bow words
51
- const RE_PUNCT = /[.,:;$"'“”‘’!?«»]/g;
55
+ const RE_LQUOTE = /[“‘«]/g;
56
+ const RE_PUNCT = /[.,:;$"'“”‘’!?«»\[\]]/g;
52
57
  const RE_SPACE = /\s+/g;
53
- return s.replace(RE_PUNCT, '').replace(RE_SPACE, ' ').trim();
58
+ return TfidfSpace.removeHtml(s)
59
+ .replace(RE_LQUOTE, '__LQUOTE ')
60
+ .replace(RE_PUNCT, '')
61
+ .replace(RE_SPACE, ' ')
62
+ .trim();
54
63
  }
55
64
 
56
65
  static normalizeEN(s) {
@@ -100,7 +109,7 @@ export class TfidfSpace {
100
109
  // Bag-of-words maps word to wordCount(word,doc)
101
110
  throw new Error(`${msg} bow?`);
102
111
  }
103
- let nWords = Object.values(bow).reduce((a,v)=>a+v);
112
+ let nWords = Object.values(bow).reduce((a, v) => a + v);
104
113
  let docInfo = { id, bow, nWords };
105
114
  corpus.wordDocCount.increment(bow.oneHot());
106
115
  corpus.addDocument(id, docInfo);
@@ -111,7 +120,7 @@ export class TfidfSpace {
111
120
  addDocument(id, doc) {
112
121
  let { corpus } = this;
113
122
  let { bow, words } = this.countWords(doc);
114
-
123
+
115
124
  return this.addCorpusDocument(id, bow, words.length);
116
125
  }
117
126
 
@@ -131,7 +140,7 @@ export class TfidfSpace {
131
140
 
132
141
  // More efficient implementation of tf * idf
133
142
  let words = Object.keys(bow);
134
- let nWords = words.reduce((a,w)=>a+bow[w],0);
143
+ let nWords = words.reduce((a, w) => a + bow[w], 0);
135
144
 
136
145
  let vTfIdf = words.reduce((a, word) => {
137
146
  let wd = bow[word] || 0;
@@ -150,7 +159,8 @@ export class TfidfSpace {
150
159
  return vTfIdf;
151
160
  }
152
161
 
153
- tfidf(text) { // TfIdf of words in text w/r to corpus
162
+ tfidf(text) {
163
+ // TfIdf of words in text w/r to corpus
154
164
  let { bow } = this.countWords(text);
155
165
  return this.tfidfOfBow(bow);
156
166
  }
@@ -20,12 +20,12 @@ export class WordVector extends Object {
20
20
  return this.$length;
21
21
  }
22
22
 
23
- toString(opts={}) {
24
- let { order='value', precision=2 } = opts;
23
+ toString(opts = {}) {
24
+ let { order = 'value', precision = 2 } = opts;
25
25
  let entries = Object.entries(this);
26
26
  switch (order) {
27
27
  case 'key':
28
- entries.sort((a,b)=>{
28
+ entries.sort((a, b) => {
29
29
  let [ka] = a;
30
30
  let [kb] = b;
31
31
  return ka.localeCompare(kb);
@@ -33,16 +33,16 @@ export class WordVector extends Object {
33
33
  break;
34
34
  case 'value':
35
35
  default:
36
- entries.sort((a,b)=>{
36
+ entries.sort((a, b) => {
37
37
  let [ka, va] = a;
38
38
  let [kb, vb] = b;
39
- return (vb-va) || ka.localeCompare(kb);
39
+ return vb - va || ka.localeCompare(kb);
40
40
  });
41
41
  break;
42
42
  }
43
43
  let sv = entries.reduce((a, e) => {
44
44
  let [k, v] = e;
45
- let vf = v.toFixed(precision).replace(/\.0*$/,'');
45
+ let vf = v.toFixed(precision).replace(/\.0*$/, '');
46
46
  a.push(`${k}:${vf}`);
47
47
  return a;
48
48
  }, []);