@sc-voice/tools 2.4.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.4.0",
3
+ "version": "2.6.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -13,6 +13,7 @@ export class TfidfSpace {
13
13
  corpusBow = new WordVector(), // corpus bag of words
14
14
  corpusSize = 0, // number of retrieval units (docs, segments, etc.)
15
15
  idfWeight = GOLDEN_FUDGE, // IDF dampening
16
+ idfFunction = TfidfSpace.idfTunable,
16
17
  normalizeText,
17
18
  } = opts;
18
19
  if (lang == null) {
@@ -33,6 +34,9 @@ export class TfidfSpace {
33
34
  Object.defineProperty(this, 'normalizeText', {
34
35
  value: normalizeText,
35
36
  });
37
+ Object.defineProperty(this, 'idfFunction', {
38
+ value: idfFunction,
39
+ });
36
40
 
37
41
  // Serializable properties
38
42
  Object.assign(this, {
@@ -43,24 +47,45 @@ export class TfidfSpace {
43
47
  });
44
48
  }
45
49
 
50
+ static removeNonWords(s) {
51
+ const RE_RESERVED = /[_-]/g; // allowed in bow words
52
+ const RE_PUNCT = /[.,:;$"'“”‘’!?«»]/g;
53
+ const RE_SPACE = /\s+/g;
54
+ return s.replace(RE_PUNCT, '').replace(RE_SPACE, ' ').trim();
55
+ }
56
+
46
57
  static normalizeEN(s) {
47
- return s
48
- .toLowerCase()
49
- .replace(/[-.,_:;"'“”‘’!?]/g, '')
50
- .replace(/ {2,}/g, ' ')
51
- .trim();
58
+ return TfidfSpace.removeNonWords(s.toLowerCase());
52
59
  }
53
60
 
54
61
  static normalizeFR(s) {
55
- return s
56
- .toLowerCase()
57
- .replace(/[«»]/gi, '')
62
+ let sAbbr = s.toLowerCase()
58
63
  .replace(/\bd[’']/gi, 'de ')
59
64
  .replace(/\bl[’']/gi, 'le ')
60
65
  .replace(/\bs[’']/gi, 'se ')
61
- .replace(/[-.,_:;"'“”‘’!?]/g, '')
62
- .replace(/[  ]+/g, ' ')
63
- .trim();
66
+ return TfidfSpace.removeNonWords(sAbbr);
67
+ }
68
+
69
+ static idfStandard(space, word) {
70
+ const msg = 'w7e.idfStandard:';
71
+ let { corpusBow, corpusSize } = space;
72
+ let wordDocs = corpusBow[word] || 0;
73
+ return Math.log((corpusSize + 1) / (wordDocs+1));
74
+ }
75
+
76
+ static idfTunable(space, word, idfWeight = this.idfWeight) {
77
+ const msg = 'w7e.idf:';
78
+ let { corpusBow, corpusSize } = space;
79
+ let wordDocs = corpusBow[word] || 0;
80
+ // NOTE: This is NOT the usual formula
81
+ // Map to [0:ignore..1:important]
82
+ return corpusSize
83
+ ? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
84
+ : 1;
85
+ }
86
+
87
+ idf(word, idfWeight) {
88
+ return this.idfFunction(this, word, idfWeight);
64
89
  }
65
90
 
66
91
  addDocument(doc) {
@@ -72,20 +97,6 @@ export class TfidfSpace {
72
97
  return this;
73
98
  }
74
99
 
75
- inverseDocumentFrequency(word, idfWeight) {
76
- return this.idf(word, idfWeight);
77
- }
78
-
79
- idf(word, idfWeight = this.idfWeight) {
80
- const msg = 'w7e.idf:';
81
- let { corpusBow, corpusSize } = this;
82
- let wCount = corpusBow[word] || 0;
83
- // Map to [0:ignore..1:important]
84
- return corpusSize
85
- ? 1 - Math.exp(((wCount - corpusSize) / wCount) * idfWeight)
86
- : 1;
87
- }
88
-
89
100
  termFrequency(word, document) {
90
101
  return this.tf(word, document);
91
102
  }
@@ -138,19 +149,19 @@ export class TfidfSpace {
138
149
  return { bow, words };
139
150
  }
140
151
 
141
- string2Vector(str, scale = 1) {
142
- const msg = 'w7e.string2Vector:';
143
- if (str == null) {
144
- throw new Error(`${msg} str?`);
152
+ bowOfText(text) {
153
+ const msg = 'w7e.bowOfText:';
154
+ if (text == null) {
155
+ throw new Error(`${msg} text?`);
145
156
  }
146
157
  let dbg = 0;
147
- let sNorm = this.normalizeText(str);
158
+ let sNorm = this.normalizeText(text);
148
159
  let words = sNorm.split(' ');
149
- let v = words.reduce((a, w) => {
150
- a[w] = (a[w] || 0) + scale;
160
+ let bow = words.reduce((a, w) => {
161
+ a[w] = (a[w] || 0) + 1;
151
162
  return a;
152
163
  }, new WordVector());
153
164
 
154
- return v;
165
+ return bow;
155
166
  }
156
167
  } // TfidfSpace
@@ -78,6 +78,13 @@ export class WordVector extends Object {
78
78
  }, 0);
79
79
  }
80
80
 
81
+ scale(c) {
82
+ return Object.keys(this).reduce((a,k)=>{
83
+ a[k] *= c;
84
+ return a;
85
+ }, this);
86
+ }
87
+
81
88
  intersect(vec2 = {}) {
82
89
  let keys = Object.keys(this);
83
90
  return keys.reduce((a, k) => {