npm - @sc-voice/tools - Versions diffs - 2.20.0 → 3.0.0 - Mend

@sc-voice/tools 2.20.0 → 3.0.0

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (3)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sc-voice/tools",
-  "version": "2.20.0",
+  "version": "3.0.0",
   "description": "Utilities for SC-Voice",
   "main": "index.mjs",
   "files": [

package/src/text/tfidf-space.mjs CHANGED Viewed

@@ -15,6 +15,7 @@ export class TfidfSpace {
       idfWeight = GOLDEN_FUDGE, // IDF dampening
       idfFunction = TfidfSpace.idfTunable,
       normalizeText,
+      leftQuoteToken,
     } = opts;
     if (lang == null) {
       throw new Error(`${msg} lang?`);
@@ -31,7 +32,7 @@ export class TfidfSpace {
           throw new Error(`${msg} normalizeText?`);
       }
     }
-    Object.defineProperty(this, 'normalizeText', {
+    Object.defineProperty(this, '_normalizeText', {
       value: normalizeText,
     });
     Object.defineProperty(this, 'idfFunction', {
@@ -43,6 +44,7 @@ export class TfidfSpace {
       lang,
       corpus,
       idfWeight,
+      leftQuoteToken,
     });
   }
@@ -50,23 +52,26 @@ export class TfidfSpace {
     return s.replace(/<[^>]*>/gi, '');
   }
-  static removeNonWords(s) {
+  static removeNonWords(s, opts={}) {
     const RE_RESERVED = /[_-]/g; // allowed in bow words
     const RE_LQUOTE = /[“‘«]/g;
     const RE_PUNCT = /[.,:;$"'“”‘’!?«»\[\]]/g;
     const RE_SPACE = /\s+/g;
+    let {
+      leftQuoteToken = '', // TBD: is this useful?
+    } = opts;
     return TfidfSpace.removeHtml(s)
-      .replace(RE_LQUOTE, '__LQUOTE ')
+      .replace(RE_LQUOTE, leftQuoteToken)
       .replace(RE_PUNCT, '')
       .replace(RE_SPACE, ' ')
       .trim();
   }
-  static normalizeEN(s) {
-    return TfidfSpace.removeNonWords(s.toLowerCase());
+  static normalizeEN(s, opts={}) {
+    return TfidfSpace.removeNonWords(s.toLowerCase(), opts);
   }
-  static normalizeFR(s) {
+  static normalizeFR(s, opts={}) {
     let sAbbr = s
       .toLowerCase()
       .replace(/\bd[’']/gi, 'de ')
@@ -76,7 +81,7 @@ export class TfidfSpace {
       .replace(/\bm[’']/gi, 'm_')
       .replace(/\bn[’']/gi, 'n_')
       .replace(/\bc[’']/gi, 'c_');
-    return TfidfSpace.removeNonWords(sAbbr);
+    return TfidfSpace.removeNonWords(sAbbr, opts);
   }
   static idfStandard(nDocs, wdc, idfWeight) {
@@ -165,6 +170,10 @@ export class TfidfSpace {
     return this.tfidfOfBow(bow);
   }
+  normalizeText(str) {
+    return this._normalizeText(str, this);
+  }
   countWords(str) {
     const msg = 'w7e.countWords:';
     if (str == null) {

package/src/text/word-vector.mjs CHANGED Viewed

@@ -21,7 +21,13 @@ export class WordVector extends Object {
   }
   toString(opts = {}) {
-    let { order = 'value', precision = 2 } = opts;
+    const msg = 'w10r.toString:';
+    let { order = 'value', minValue, precision = 2 } = opts;
+    if (minValue == null) {
+      minValue = Math.pow(10, -precision) / 2;
+    }
     let entries = Object.entries(this);
     switch (order) {
       case 'key':
@@ -42,14 +48,18 @@ export class WordVector extends Object {
     }
     let sv = entries.reduce((a, e) => {
       let [k, v] = e;
-      let vf = v.toFixed(precision).replace(/\.0*$/, '');
-      a.push(`${k}:${vf}`);
+      if (minValue <= v) {
+        let vf = v.toFixed(precision)
+          .replace(/\.0*$/, '')
+          .replace(/0\./,'.');
+        a.push(`${k}:${vf}`);
+      }
       return a;
     }, []);
     return sv.join(',');
   }
-  norm() {
+  norm() { // L2 norm
     let keys = Object.keys(this);
     if (keys.length === 0) {
       return 0;
@@ -104,17 +114,27 @@ export class WordVector extends Object {
     }, this);
   }
-  intersect(vec2 = {}) {
+  hadamardL1(vec2 = {}) {
+    // L1-norm of Hadamard product shows how
+    // the cosine similarity score is apportioned
     let keys = Object.keys(this);
-    return keys.reduce((a, k) => {
+    let n = 0;
+    let hadamard = keys.reduce((a, k) => {
       let v1 = this[k];
       let v2 = vec2[k] || 0;
       if (v1 && v2) {
         a[k] = v1 * v2;
+        n++;
       }
       return a;
     }, new WordVector());
+    if (n === 0) {
+      return hadamard; // empty vector
+    }
+    let n12 = this.norm() * vec2.norm();
+    return hadamard.scale(1/n12);
   }
   similar(vec2) {
@@ -125,8 +145,8 @@ export class WordVector extends Object {
     let d = this.dot(vec2);
     let norm1 = this.norm();
     let norm2 = vec2.norm();
-    let den = norm1 * norm2;
-    return den ? d / den : 0;
+    let n12 = norm1 * norm2;
+    return n12 ? d / n12 : 0;
   }
   oneHot() {