@sc-voice/tools 2.4.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.4.0",
3
+ "version": "2.5.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -13,6 +13,7 @@ export class TfidfSpace {
13
13
  corpusBow = new WordVector(), // corpus bag of words
14
14
  corpusSize = 0, // number of retrieval units (docs, segments, etc.)
15
15
  idfWeight = GOLDEN_FUDGE, // IDF dampening
16
+ idfFunction = TfidfSpace.idfTunable,
16
17
  normalizeText,
17
18
  } = opts;
18
19
  if (lang == null) {
@@ -33,6 +34,9 @@ export class TfidfSpace {
33
34
  Object.defineProperty(this, 'normalizeText', {
34
35
  value: normalizeText,
35
36
  });
37
+ Object.defineProperty(this, 'idfFunction', {
38
+ value: idfFunction,
39
+ });
36
40
 
37
41
  // Serializable properties
38
42
  Object.assign(this, {
@@ -63,6 +67,28 @@ export class TfidfSpace {
63
67
  .trim();
64
68
  }
65
69
 
70
+ static idfStandard(space, word) {
71
+ const msg = 'w7e.idfStandard:';
72
+ let { corpusBow, corpusSize } = space;
73
+ let wordDocs = corpusBow[word] || 0;
74
+ return Math.log((corpusSize + 1) / (wordDocs+1));
75
+ }
76
+
77
+ static idfTunable(space, word, idfWeight = this.idfWeight) {
78
+ const msg = 'w7e.idf:';
79
+ let { corpusBow, corpusSize } = space;
80
+ let wordDocs = corpusBow[word] || 0;
81
+ // NOTE: This is NOT the usual formula
82
+ // Map to [0:ignore..1:important]
83
+ return corpusSize
84
+ ? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
85
+ : 1;
86
+ }
87
+
88
+ idf(word, idfWeight) {
89
+ return this.idfFunction(this, word, idfWeight);
90
+ }
91
+
66
92
  addDocument(doc) {
67
93
  let { corpusBow } = this;
68
94
  this.corpusSize += 1;
@@ -72,20 +98,6 @@ export class TfidfSpace {
72
98
  return this;
73
99
  }
74
100
 
75
- inverseDocumentFrequency(word, idfWeight) {
76
- return this.idf(word, idfWeight);
77
- }
78
-
79
- idf(word, idfWeight = this.idfWeight) {
80
- const msg = 'w7e.idf:';
81
- let { corpusBow, corpusSize } = this;
82
- let wCount = corpusBow[word] || 0;
83
- // Map to [0:ignore..1:important]
84
- return corpusSize
85
- ? 1 - Math.exp(((wCount - corpusSize) / wCount) * idfWeight)
86
- : 1;
87
- }
88
-
89
101
  termFrequency(word, document) {
90
102
  return this.tf(word, document);
91
103
  }
@@ -138,19 +150,19 @@ export class TfidfSpace {
138
150
  return { bow, words };
139
151
  }
140
152
 
141
- string2Vector(str, scale = 1) {
142
- const msg = 'w7e.string2Vector:';
143
- if (str == null) {
144
- throw new Error(`${msg} str?`);
153
+ bowOfText(text) {
154
+ const msg = 'w7e.bowOfText:';
155
+ if (text == null) {
156
+ throw new Error(`${msg} text?`);
145
157
  }
146
158
  let dbg = 0;
147
- let sNorm = this.normalizeText(str);
159
+ let sNorm = this.normalizeText(text);
148
160
  let words = sNorm.split(' ');
149
- let v = words.reduce((a, w) => {
150
- a[w] = (a[w] || 0) + scale;
161
+ let bow = words.reduce((a, w) => {
162
+ a[w] = (a[w] || 0) + 1;
151
163
  return a;
152
164
  }, new WordVector());
153
165
 
154
- return v;
166
+ return bow;
155
167
  }
156
168
  } // TfidfSpace
@@ -78,6 +78,13 @@ export class WordVector extends Object {
78
78
  }, 0);
79
79
  }
80
80
 
81
+ scale(c) {
82
+ return Object.keys(this).reduce((a,k)=>{
83
+ a[k] *= c;
84
+ return a;
85
+ }, this);
86
+ }
87
+
81
88
  intersect(vec2 = {}) {
82
89
  let keys = Object.keys(this);
83
90
  return keys.reduce((a, k) => {