@sc-voice/tools 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.mjs CHANGED
@@ -12,6 +12,8 @@ import { MerkleJson } from './src/text/merkle-json.mjs';
12
12
  import { SuttaCentralId } from './src/text/sutta-central-id.mjs';
13
13
  import { Unicode } from './src/text/unicode.mjs';
14
14
  import { WordSpace } from './src/text/word-space.mjs';
15
+ import { WordVector } from './src/text/word-vector.mjs';
16
+ import { TfidfSpace } from './src/text/tfidf-space.mjs';
15
17
  import { LogEntry, Logger } from './src/text/logger.mjs';
16
18
 
17
19
  export const Text = {
@@ -24,6 +26,8 @@ export const Text = {
24
26
  SuttaCentralId,
25
27
  Unicode,
26
28
  WordSpace,
29
+ WordVector,
30
+ TfidfSpace,
27
31
  };
28
32
 
29
33
  import { default as Sankey } from './src/graph/sankey.mjs';
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.3.0",
3
+ "version": "2.4.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -0,0 +1,156 @@
1
+ import { DBG } from '../defines.mjs';
2
+ import { WordVector } from './word-vector.mjs';
3
+
4
+ // The golden ratio is pretty.
5
+ // 1.6180339887498948482045868343656381177203091798057628621354;
6
+ const GOLDEN_FUDGE = 1.618033988749895;
7
+
8
// Term-frequency / inverse-document-frequency vector space over a
// corpus of retrieval units (documents, segments, etc.).
// All own enumerable properties are serializable; normalizeText is not.
export class TfidfSpace {
  /**
   * @param {object} [opts]
   * @param {string} [opts.lang='en'] - 2-letter code: fr, en, es, pt
   * @param {WordVector} [opts.corpusBow] - corpus bag of words
   * @param {number} [opts.corpusSize=0] - number of retrieval units
   * @param {number} [opts.idfWeight=GOLDEN_FUDGE] - IDF dampening
   * @param {function} [opts.normalizeText] - string normalizer;
   *   defaults to the built-in normalizer for lang
   * @throws {Error} when lang is null or no default normalizer exists
   */
  constructor(opts = {}) {
    const msg = 't8e.ctor:';
    let {
      lang = 'en', // 2-letter code: fr, en, es, pt
      corpusBow = new WordVector(), // corpus bag of words
      corpusSize = 0, // number of retrieval units (docs, segments, etc.)
      idfWeight = GOLDEN_FUDGE, // IDF dampening
      normalizeText,
    } = opts;
    if (lang == null) {
      throw new Error(`${msg} lang?`);
    }
    if (normalizeText == null) {
      switch (lang) {
        case 'fr':
          normalizeText = TfidfSpace.normalizeFR;
          break;
        case 'en':
          normalizeText = TfidfSpace.normalizeEN;
          break;
        default:
          throw new Error(`${msg} normalizeText?`);
      }
    }
    // Non-enumerable so the function is excluded from serialization
    Object.defineProperty(this, 'normalizeText', {
      value: normalizeText,
    });

    // Serializable properties
    Object.assign(this, {
      lang,
      corpusBow,
      corpusSize,
      idfWeight,
    });
  }

  // Lowercase, strip punctuation/quotes, collapse runs of spaces
  static normalizeEN(s) {
    return s
      .toLowerCase()
      .replace(/[-.,_:;"'“”‘’!?]/g, '')
      .replace(/ {2,}/g, ' ')
      .trim();
  }

  // Lowercase, strip guillemets, expand elisions (d'/l'/s'),
  // strip punctuation/quotes, collapse spaces (incl. no-break space)
  static normalizeFR(s) {
    return s
      .toLowerCase()
      .replace(/[«»]/gi, '')
      .replace(/\bd[’']/gi, 'de ')
      .replace(/\bl[’']/gi, 'le ')
      .replace(/\bs[’']/gi, 'se ')
      .replace(/[-.,_:;"'“”‘’!?]/g, '')
      .replace(/[  ]+/g, ' ')
      .trim();
  }

  /**
   * Add one retrieval unit (document) to the corpus statistics.
   * Each distinct word counts at most once per document (one-hot).
   * @param {string} doc
   * @returns {TfidfSpace} this
   */
  addDocument(doc) {
    this.corpusSize += 1;
    let { bow } = this.countWords(doc, 1); // one-hot
    this.corpusBow.increment(bow);

    return this;
  }

  /** Long-form alias for idf() */
  inverseDocumentFrequency(word, idfWeight) {
    return this.idf(word, idfWeight);
  }

  /**
   * Inverse document frequency, mapped to [0:ignore..1:important].
   * @param {string} word - normalized word
   * @param {number} [idfWeight=this.idfWeight] - dampening factor
   * @returns {number}
   */
  idf(word, idfWeight = this.idfWeight) {
    let { corpusBow, corpusSize } = this;
    if (corpusSize === 0) {
      return 1; // empty corpus: nothing is common yet
    }
    let wCount = corpusBow[word] || 0;
    if (wCount === 0) {
      // Word never seen in corpus: maximally informative.
      // (Original formula reached the same value via exp(-Infinity) === 0.)
      return 1;
    }
    return 1 - Math.exp(((wCount - corpusSize) / wCount) * idfWeight);
  }

  /** Long-form alias for tf() */
  termFrequency(word, document) {
    return this.tf(word, document);
  }

  /**
   * Term frequency of word within doc.
   * @param {string} word - normalized word
   * @param {string} doc
   * @returns {number} count(word)/totalTokens, or 0 if absent
   */
  tf(word, doc) {
    let { bow, words } = this.countWords(doc);
    let count = bow[word] || 0;
    return count ? count / words.length : 0;
  }

  /**
   * TF-IDF vector for a document.
   * @param {string} doc
   * @returns {WordVector} word => tf*idf for words with non-zero weight
   */
  tfidf(doc) {
    let { bow, words } = this.countWords(doc);
    let nWords = words.length; // total tokens, including duplicates

    // Iterate distinct words once rather than once per token
    return Object.keys(bow).reduce((a, word) => {
      let wTfidf = (bow[word] / nWords) * this.idf(word);
      if (wTfidf) {
        a[word] = wTfidf;
      }
      return a;
    }, new WordVector());
  }

  /**
   * Normalize str and count its words.
   * @param {string} str
   * @param {number} [maxCount] - cap per-word counts (1 => one-hot)
   * @returns {{bow:WordVector, words:string[]}} bag of words + token list
   * @throws {Error} when str is null
   */
  countWords(str, maxCount) {
    const msg = 't8e.countWords:';
    if (str == null) {
      throw new Error(`${msg} str?`);
    }
    let words = this.normalizeText(str).split(' ');
    let bow = words.reduce((a, w) => {
      let count = (a[w] || 0) + 1;
      a[w] = maxCount ? Math.min(maxCount, count) : count;
      return a;
    }, new WordVector());

    return { bow, words };
  }

  /**
   * Map str to a scaled word-count vector.
   * @param {string} str
   * @param {number} [scale=1] - weight added per occurrence
   * @returns {WordVector}
   * @throws {Error} when str is null
   */
  string2Vector(str, scale = 1) {
    const msg = 't8e.string2Vector:';
    if (str == null) {
      throw new Error(`${msg} str?`);
    }
    let words = this.normalizeText(str).split(' ');
    return words.reduce((a, w) => {
      a[w] = (a[w] || 0) + scale;
      return a;
    }, new WordVector());
  }
} // TfidfSpace
@@ -0,0 +1,105 @@
1
+ import { DBG } from '../defines.mjs';
2
+
3
+ // The golden ratio is pretty.
4
+ // 1.6180339887498948482045868343656381177203091798057628621354;
5
+ const GOLDEN_FUDGE = 1.618033988749895;
6
+
7
// Sparse word vector: own enumerable properties map word => number.
// Extends Object so instances serialize as plain key/value maps.
export class WordVector extends Object {
  /**
   * @param {object} [props] - initial word:value entries
   */
  constructor(props) {
    super();
    Object.assign(this, props);
    // Non-enumerable scratch slot (kept for backward compatibility);
    // excluded from serialization and key iteration.
    Object.defineProperty(this, '$length', {
      writable: true,
    });
  }

  /**
   * Number of distinct words.
   * Recomputed on every access: mutators such as increment() and direct
   * property writes would otherwise leave a cached value stale.
   */
  get length() {
    this.$length = Object.keys(this).length;
    return this.$length;
  }

  /** @returns {string} comma-joined "word:value" with 2-decimal values */
  toString() {
    return Object.entries(this)
      .map(([k, v]) => `${k}:${v.toFixed(2)}`)
      .join(',');
  }

  /** @returns {number} Euclidean (L2) norm; 0 for an empty vector */
  norm() {
    let sumSqr = Object.keys(this).reduce((a, k) => a + this[k] * this[k], 0);
    return Math.sqrt(sumSqr);
  }

  /**
   * Vector sum without mutating this.
   * @param {object} vec2
   * @returns {WordVector} new vector equal to this + vec2
   */
  add(vec2) {
    return new WordVector(this).increment(vec2);
  }

  /**
   * Vector sum, mutating this in place.
   * Zero/falsy entries in vec2 are skipped.
   * @param {object} vec2
   * @returns {WordVector} this
   */
  increment(vec2) {
    for (let [k, v2] of Object.entries(vec2)) {
      if (v2) {
        this[k] = (this[k] || 0) + v2;
      }
    }
    return this;
  }

  /**
   * Dot product.
   * @param {object} vec2
   * @returns {number}
   * @throws {Error} when vec2 is null
   */
  dot(vec2) {
    const msg = 'w8r.dot:';
    if (vec2 == null) {
      throw new Error(`${msg} vec2?`);
    }
    return Object.keys(this).reduce((a, k) => a + this[k] * (vec2[k] || 0), 0);
  }

  /**
   * Element-wise product of words non-zero in both vectors.
   * @param {object} [vec2]
   * @returns {WordVector}
   */
  intersect(vec2 = {}) {
    return Object.keys(this).reduce((a, k) => {
      let v1 = this[k];
      let v2 = vec2[k] || 0;
      if (v1 && v2) {
        a[k] = v1 * v2;
      }
      return a;
    }, new WordVector());
  }

  /**
   * Cosine similarity.
   * @param {WordVector} vec2
   * @returns {number} dot/(|this|*|vec2|), or 0 when either norm is 0
   * @throws {Error} when vec2 is null
   */
  similar(vec2) {
    const msg = 'w8r.similar:';
    if (vec2 == null) {
      throw new Error(`${msg} vec2?`);
    }
    let den = this.norm() * vec2.norm();
    return den ? this.dot(vec2) / den : 0;
  }
} // WordVector