npm - @sc-voice/tools - Versions diffs - 2.4.0 → 2.6.0 - Mend

@sc-voice/tools 2.4.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/package.json +1 -1
package/src/text/tfidf-space.mjs +44 -33
package/src/text/word-vector.mjs +7 -0

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@sc-voice/tools",
-  "version": "2.4.0",
+  "version": "2.6.0",
   "description": "Utilities for SC-Voice",
   "main": "index.mjs",
   "files": [

package/src/text/tfidf-space.mjs CHANGED

@@ -13,6 +13,7 @@ export class TfidfSpace {
       corpusBow = new WordVector(), // corpus bag of words
       corpusSize = 0, // number of retrieval units (docs, segments, etc.)
       idfWeight = GOLDEN_FUDGE, // IDF dampening
+      idfFunction = TfidfSpace.idfTunable,
       normalizeText,
     } = opts;
     if (lang == null) {
@@ -33,6 +34,9 @@ export class TfidfSpace {
     Object.defineProperty(this, 'normalizeText', {
       value: normalizeText,
     });
+    Object.defineProperty(this, 'idfFunction', {
+      value: idfFunction,
+    });
     // Serializable properties
     Object.assign(this, {
@@ -43,24 +47,45 @@ export class TfidfSpace {
     });
   }
+  static removeNonWords(s) {
+    const RE_RESERVED = /[_-]/g; // allowed in bow words
+    const RE_PUNCT = /[.,:;$"'“”‘’!?«»]/g;
+    const RE_SPACE = /\s+/g;
+    return s.replace(RE_PUNCT, '').replace(RE_SPACE, ' ').trim();
+  }
   static normalizeEN(s) {
-    return s
-      .toLowerCase()
-      .replace(/[-.,_:;"'“”‘’!?]/g, '')
-      .replace(/ {2,}/g, ' ')
-      .trim();
+    return TfidfSpace.removeNonWords(s.toLowerCase());
   }
   static normalizeFR(s) {
-    return s
-      .toLowerCase()
-      .replace(/[«»]/gi, '')
+    let sAbbr = s.toLowerCase()
       .replace(/\bd[’']/gi, 'de ')
       .replace(/\bl[’']/gi, 'le ')
       .replace(/\bs[’']/gi, 'se ')
-      .replace(/[-.,_:;"'“”‘’!?]/g, '')
-      .replace(/[  ]+/g, ' ')
-      .trim();
+    return TfidfSpace.removeNonWords(sAbbr);
+  }
+  static idfStandard(space, word) {
+    const msg = 'w7e.idfStandard:';
+    let { corpusBow, corpusSize } = space;
+    let wordDocs = corpusBow[word] || 0;
+    return Math.log((corpusSize + 1) / (wordDocs+1));
+  }
+  static idfTunable(space, word, idfWeight = this.idfWeight) {
+    const msg = 'w7e.idf:';
+    let { corpusBow, corpusSize } = space;
+    let wordDocs = corpusBow[word] || 0;
+    // NOTE: This is NOT the usual formula
+    // Map to [0:ignore..1:important]
+    return corpusSize
+      ? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
+      : 1;
+  }
+  idf(word, idfWeight) {
+    return this.idfFunction(this, word, idfWeight);
   }
   addDocument(doc) {
@@ -72,20 +97,6 @@ export class TfidfSpace {
     return this;
   }
-  inverseDocumentFrequency(word, idfWeight) {
-    return this.idf(word, idfWeight);
-  }
-  idf(word, idfWeight = this.idfWeight) {
-    const msg = 'w7e.idf:';
-    let { corpusBow, corpusSize } = this;
-    let wCount = corpusBow[word] || 0;
-    // Map to [0:ignore..1:important]
-    return corpusSize
-      ? 1 - Math.exp(((wCount - corpusSize) / wCount) * idfWeight)
-      : 1;
-  }
   termFrequency(word, document) {
     return this.tf(word, document);
   }
@@ -138,19 +149,19 @@ export class TfidfSpace {
     return { bow, words };
   }
-  string2Vector(str, scale = 1) {
-    const msg = 'w7e.string2Vector:';
-    if (str == null) {
-      throw new Error(`${msg} str?`);
+  bowOfText(text) {
+    const msg = 'w7e.bowOfText:';
+    if (text == null) {
+      throw new Error(`${msg} text?`);
     }
     let dbg = 0;
-    let sNorm = this.normalizeText(str);
+    let sNorm = this.normalizeText(text);
     let words = sNorm.split(' ');
-    let v = words.reduce((a, w) => {
-      a[w] = (a[w] || 0) + scale;
+    let bow = words.reduce((a, w) => {
+      a[w] = (a[w] || 0) + 1;
       return a;
     }, new WordVector());
-    return v;
+    return bow;
   }
 } // TfidfSpace

package/src/text/word-vector.mjs CHANGED

@@ -78,6 +78,13 @@ export class WordVector extends Object {
     }, 0);
   }
+  scale(c) {
+    return Object.keys(this).reduce((a,k)=>{
+      a[k] *= c;
+      return a;
+    }, this);
+  }
   intersect(vec2 = {}) {
     let keys = Object.keys(this);
     return keys.reduce((a, k) => {