@sc-voice/tools 2.7.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.mjs +2 -0
- package/package.json +1 -1
- package/src/text/corpus.mjs +39 -0
- package/src/text/tfidf-space.mjs +33 -27
- package/src/text/word-vector.mjs +10 -1
package/index.mjs
CHANGED
|
@@ -6,6 +6,7 @@ export const ScvMath = {
|
|
|
6
6
|
};
|
|
7
7
|
|
|
8
8
|
import { BilaraPath } from './src/text/bilara-path.mjs';
|
|
9
|
+
import { Corpus } from './src/text/corpus.mjs';
|
|
9
10
|
import { EbtDoc } from './src/text/ebt-doc.mjs';
|
|
10
11
|
import { LegacyDoc } from './src/text/legacy-doc.mjs';
|
|
11
12
|
import { MerkleJson } from './src/text/merkle-json.mjs';
|
|
@@ -18,6 +19,7 @@ import { LogEntry, Logger } from './src/text/logger.mjs';
|
|
|
18
19
|
|
|
19
20
|
export const Text = {
|
|
20
21
|
BilaraPath,
|
|
22
|
+
Corpus,
|
|
21
23
|
EbtDoc,
|
|
22
24
|
LegacyDoc,
|
|
23
25
|
LogEntry,
|
package/src/text/corpus.mjs
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { WordVector } from './word-vector.mjs';
|
|
2
|
+
|
|
3
|
+
export class Corpus {
  /**
   * A corpus stores documents by id together with a word→document-count
   * vector (how many documents contain each word) for IDF computation.
   * @param {object} [opts]
   * @param {object} [opts.wordDocCount] - word => number of documents containing that word
   * @param {object} [opts.docMap] - id => stored document info
   */
  constructor(opts = {}) {
    let { wordDocCount = new WordVector(), docMap = {} } = opts;

    // Cache the document count so size is O(1) instead of
    // re-scanning docMap keys on every access.
    this._size = Object.keys(docMap).length;

    Object.assign(this, {
      wordDocCount,
      docMap,
    });
  }

  // Number of documents currently in the corpus
  get size() {
    return this._size;
  }

  /**
   * Add (or replace) the document stored under id.
   * Replacing an existing document leaves size unchanged.
   */
  addDocument(id, doc) {
    this.deleteDocument(id);
    this.docMap[id] = doc;
    this._size++;
  }

  // Return the document stored under id, or undefined if absent
  getDocument(id) {
    return this.docMap[id];
  }

  /**
   * Remove and return the document stored under id
   * (undefined when id is absent).
   *
   * BUGFIX: test presence with an own-property check rather than
   * truthiness of the stored value; otherwise a falsy document
   * (0, '', false, null) would never be deleted and _size would
   * drift out of sync with docMap.
   */
  deleteDocument(id) {
    let { docMap } = this;
    let doc = docMap[id];
    if (Object.hasOwn(docMap, id)) {
      delete docMap[id];
      this._size--;
    }

    return doc;
  }
}
|
package/src/text/tfidf-space.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { DBG } from '../defines.mjs';
|
|
2
|
+
import { Corpus } from './corpus.mjs';
|
|
2
3
|
import { WordVector } from './word-vector.mjs';
|
|
3
4
|
|
|
4
5
|
// The golden ratio is pretty.
|
|
@@ -10,8 +11,7 @@ export class TfidfSpace {
|
|
|
10
11
|
const msg = 't8e.ctor:';
|
|
11
12
|
let {
|
|
12
13
|
lang = 'en', // 2-letter code: fr, en, es, pt
|
|
13
|
-
|
|
14
|
-
corpusSize = 0, // number of retrieval units (docs, segments, etc.)
|
|
14
|
+
corpus = new Corpus(),
|
|
15
15
|
idfWeight = GOLDEN_FUDGE, // IDF dampening
|
|
16
16
|
idfFunction = TfidfSpace.idfTunable,
|
|
17
17
|
normalizeText,
|
|
@@ -41,8 +41,7 @@ export class TfidfSpace {
|
|
|
41
41
|
// Serializable properties
|
|
42
42
|
Object.assign(this, {
|
|
43
43
|
lang,
|
|
44
|
-
|
|
45
|
-
corpusSize,
|
|
44
|
+
corpus,
|
|
46
45
|
idfWeight,
|
|
47
46
|
});
|
|
48
47
|
}
|
|
@@ -59,45 +58,53 @@ export class TfidfSpace {
|
|
|
59
58
|
}
|
|
60
59
|
|
|
61
60
|
static normalizeFR(s) {
|
|
62
|
-
let sAbbr = s
|
|
61
|
+
let sAbbr = s
|
|
62
|
+
.toLowerCase()
|
|
63
63
|
.replace(/\bd[’']/gi, 'de ')
|
|
64
64
|
.replace(/\bl[’']/gi, 'le ')
|
|
65
65
|
.replace(/\bs[’']/gi, 's_')
|
|
66
66
|
.replace(/\bj[’']/gi, 'j_')
|
|
67
67
|
.replace(/\bm[’']/gi, 'm_')
|
|
68
68
|
.replace(/\bn[’']/gi, 'n_')
|
|
69
|
+
.replace(/\bc[’']/gi, 'c_');
|
|
69
70
|
return TfidfSpace.removeNonWords(sAbbr);
|
|
70
71
|
}
|
|
71
72
|
|
|
72
73
|
static idfStandard(space, word) {
|
|
73
74
|
const msg = 'w7e.idfStandard:';
|
|
74
|
-
let {
|
|
75
|
-
let wordDocs =
|
|
76
|
-
return Math.log((
|
|
75
|
+
let { corpus } = space;
|
|
76
|
+
let wordDocs = corpus.wordDocCount[word] || 0;
|
|
77
|
+
return Math.log((corpus.size + 1) / (wordDocs + 1));
|
|
77
78
|
}
|
|
78
79
|
|
|
79
|
-
static idfTunable(space, word, idfWeight
|
|
80
|
+
static idfTunable(space, word, idfWeight) {
|
|
80
81
|
const msg = 'w7e.idf:';
|
|
81
|
-
let {
|
|
82
|
-
let wordDocs =
|
|
82
|
+
let { corpus } = space;
|
|
83
|
+
let wordDocs = corpus.wordDocCount[word] || 0;
|
|
83
84
|
// NOTE: This is NOT the usual formula
|
|
84
85
|
// Map to [0:ignore..1:important]
|
|
85
|
-
return
|
|
86
|
-
? 1 -
|
|
86
|
+
return corpus.size
|
|
87
|
+
? 1 -
|
|
88
|
+
Math.exp(((wordDocs - corpus.size) / wordDocs) * idfWeight)
|
|
87
89
|
: 1;
|
|
88
90
|
}
|
|
89
91
|
|
|
90
|
-
idf(word, idfWeight) {
|
|
92
|
+
idf(word, idfWeight = this.idfWeight) {
|
|
91
93
|
return this.idfFunction(this, word, idfWeight);
|
|
92
94
|
}
|
|
93
95
|
|
|
94
|
-
addDocument(doc) {
|
|
95
|
-
let {
|
|
96
|
-
this.
|
|
97
|
-
|
|
98
|
-
|
|
96
|
+
addDocument(id, doc) {
|
|
97
|
+
let { corpus } = this;
|
|
98
|
+
let { bow, words } = this.countWords(doc);
|
|
99
|
+
corpus.wordDocCount.increment(bow.oneHot());
|
|
100
|
+
|
|
101
|
+
let docInfo = {
|
|
102
|
+
bow,
|
|
103
|
+
nWords: words.length,
|
|
104
|
+
};
|
|
105
|
+
corpus.addDocument(id, docInfo);
|
|
99
106
|
|
|
100
|
-
return
|
|
107
|
+
return docInfo;
|
|
101
108
|
}
|
|
102
109
|
|
|
103
110
|
termFrequency(word, document) {
|
|
@@ -112,7 +119,7 @@ export class TfidfSpace {
|
|
|
112
119
|
|
|
113
120
|
tfidf(doc) {
|
|
114
121
|
const msg = 'w7e.tfidf:';
|
|
115
|
-
let {
|
|
122
|
+
let { corpus, idfWeight } = this;
|
|
116
123
|
|
|
117
124
|
// More efficient implementation of tf * idf
|
|
118
125
|
let { bow, words } = this.countWords(doc);
|
|
@@ -121,9 +128,9 @@ export class TfidfSpace {
|
|
|
121
128
|
let vTfIdf = words.reduce((a, word) => {
|
|
122
129
|
let wd = bow[word] || 0;
|
|
123
130
|
let tf = wd ? wd / nWords : 0;
|
|
124
|
-
let wc =
|
|
125
|
-
let idf =
|
|
126
|
-
? 1 - Math.exp(((wc -
|
|
131
|
+
let wc = corpus.wordDocCount[word] || 0;
|
|
132
|
+
let idf = corpus.size
|
|
133
|
+
? 1 - Math.exp(((wc - corpus.size) / wc) * idfWeight)
|
|
127
134
|
: 1;
|
|
128
135
|
let tfidf = tf * idf;
|
|
129
136
|
if (tfidf) {
|
|
@@ -135,7 +142,7 @@ export class TfidfSpace {
|
|
|
135
142
|
return vTfIdf;
|
|
136
143
|
}
|
|
137
144
|
|
|
138
|
-
countWords(str
|
|
145
|
+
countWords(str) {
|
|
139
146
|
const msg = 'w7e.countWords:';
|
|
140
147
|
if (str == null) {
|
|
141
148
|
throw new Error(`${msg} str?`);
|
|
@@ -144,8 +151,7 @@ export class TfidfSpace {
|
|
|
144
151
|
let sNorm = this.normalizeText(str);
|
|
145
152
|
let words = sNorm.split(' ');
|
|
146
153
|
let bow = words.reduce((a, w) => {
|
|
147
|
-
|
|
148
|
-
a[w] = maxCount ? Math.min(maxCount, count) : count;
|
|
154
|
+
a[w] = (a[w] || 0) + 1;
|
|
149
155
|
return a;
|
|
150
156
|
}, new WordVector());
|
|
151
157
|
|
package/src/text/word-vector.mjs
CHANGED
|
@@ -79,7 +79,7 @@ export class WordVector extends Object {
|
|
|
79
79
|
}
|
|
80
80
|
|
|
81
81
|
scale(c) {
|
|
82
|
-
return Object.keys(this).reduce((a,k)=>{
|
|
82
|
+
return Object.keys(this).reduce((a, k) => {
|
|
83
83
|
a[k] *= c;
|
|
84
84
|
return a;
|
|
85
85
|
}, this);
|
|
@@ -109,4 +109,13 @@ export class WordVector extends Object {
|
|
|
109
109
|
let den = norm1 * norm2;
|
|
110
110
|
return den ? d / den : 0;
|
|
111
111
|
}
|
|
112
|
+
|
|
113
|
+
oneHot() {
|
|
114
|
+
return Object.keys(this).reduce((a, k) => {
|
|
115
|
+
if (this[k] > 0) {
|
|
116
|
+
a[k] = 1;
|
|
117
|
+
}
|
|
118
|
+
return a;
|
|
119
|
+
}, new WordVector());
|
|
120
|
+
}
|
|
112
121
|
} // WordVector
|