@sc-voice/tools 2.8.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.mjs CHANGED
@@ -6,6 +6,7 @@ export const ScvMath = {
6
6
  };
7
7
 
8
8
  import { BilaraPath } from './src/text/bilara-path.mjs';
9
+ import { Corpus } from './src/text/corpus.mjs';
9
10
  import { EbtDoc } from './src/text/ebt-doc.mjs';
10
11
  import { LegacyDoc } from './src/text/legacy-doc.mjs';
11
12
  import { MerkleJson } from './src/text/merkle-json.mjs';
@@ -18,6 +19,7 @@ import { LogEntry, Logger } from './src/text/logger.mjs';
18
19
 
19
20
  export const Text = {
20
21
  BilaraPath,
22
+ Corpus,
21
23
  EbtDoc,
22
24
  LegacyDoc,
23
25
  LogEntry,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.8.0",
3
+ "version": "2.9.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -0,0 +1,39 @@
1
+ import { WordVector } from './word-vector.mjs';
2
+
3
/**
 * In-memory collection of documents keyed by id, together with a
 * per-word document-count vector for the collection.
 *
 * `size` always equals the number of own keys in `docMap`.
 */
export class Corpus {
  /**
   * @param {object} [opts]
   * @param {object} [opts.wordDocCount] - word => document-count vector;
   *   defaults to a new, empty WordVector.
   * @param {object} [opts.docMap] - id => document map; existing entries
   *   are counted to initialize `size`.
   */
  constructor(opts = {}) {
    const { wordDocCount = new WordVector(), docMap = {} } = opts;

    // Derive the count from docMap so a restored corpus starts consistent.
    this._size = Object.keys(docMap).length;

    Object.assign(this, {
      wordDocCount,
      docMap,
    });
  }

  /** @returns {number} number of documents currently stored. */
  get size() {
    return this._size;
  }

  /**
   * Store `doc` under `id`, replacing any document already stored there.
   * Replacing an existing id leaves `size` unchanged.
   *
   * @param {string} id - document identifier.
   * @param {*} doc - document to store (any value, including falsy ones).
   */
  addDocument(id, doc) {
    // Remove first so that the increment below never double-counts an id.
    this.deleteDocument(id);
    this.docMap[id] = doc;
    this._size++;
  }

  /**
   * @param {string} id - document identifier.
   * @returns {*} the stored document, or undefined if `id` is not stored.
   */
  getDocument(id) {
    const { docMap } = this;

    // Own-property check: ids such as "constructor" or "toString" must not
    // resolve to members inherited from Object.prototype.
    return Object.prototype.hasOwnProperty.call(docMap, id)
      ? docMap[id]
      : undefined;
  }

  /**
   * Remove the document stored under `id`, if any.
   *
   * @param {string} id - document identifier.
   * @returns {*} the removed document, or undefined if `id` was not stored.
   */
  deleteDocument(id) {
    const { docMap } = this;

    // Test for presence of the key rather than truthiness of the value, so
    // that falsy documents ('', 0, null) are removed and counted correctly
    // and inherited properties are never mistaken for stored documents.
    if (!Object.prototype.hasOwnProperty.call(docMap, id)) {
      return undefined;
    }

    const doc = docMap[id];
    delete docMap[id];
    this._size--;

    return doc;
  }
}
@@ -1,4 +1,5 @@
1
1
  import { DBG } from '../defines.mjs';
2
+ import { Corpus } from './corpus.mjs';
2
3
  import { WordVector } from './word-vector.mjs';
3
4
 
4
5
  // The golden ratio is pretty.
@@ -10,8 +11,7 @@ export class TfidfSpace {
10
11
  const msg = 't8e.ctor:';
11
12
  let {
12
13
  lang = 'en', // 2-letter code: fr, en, es, pt
13
- corpusBow = new WordVector(), // corpus bag of words
14
- corpusSize = 0, // number of retrieval units (docs, segments, etc.)
14
+ corpus = new Corpus(),
15
15
  idfWeight = GOLDEN_FUDGE, // IDF dampening
16
16
  idfFunction = TfidfSpace.idfTunable,
17
17
  normalizeText,
@@ -41,8 +41,7 @@ export class TfidfSpace {
41
41
  // Serializable properties
42
42
  Object.assign(this, {
43
43
  lang,
44
- corpusBow,
45
- corpusSize,
44
+ corpus,
46
45
  idfWeight,
47
46
  });
48
47
  }
@@ -59,46 +58,53 @@ export class TfidfSpace {
59
58
  }
60
59
 
61
60
  static normalizeFR(s) {
62
- let sAbbr = s.toLowerCase()
61
+ let sAbbr = s
62
+ .toLowerCase()
63
63
  .replace(/\bd[’']/gi, 'de ')
64
64
  .replace(/\bl[’']/gi, 'le ')
65
65
  .replace(/\bs[’']/gi, 's_')
66
66
  .replace(/\bj[’']/gi, 'j_')
67
67
  .replace(/\bm[’']/gi, 'm_')
68
68
  .replace(/\bn[’']/gi, 'n_')
69
- .replace(/\bc[’']/gi, 'c_')
69
+ .replace(/\bc[’']/gi, 'c_');
70
70
  return TfidfSpace.removeNonWords(sAbbr);
71
71
  }
72
72
 
73
73
  static idfStandard(space, word) {
74
74
  const msg = 'w7e.idfStandard:';
75
- let { corpusBow, corpusSize } = space;
76
- let wordDocs = corpusBow[word] || 0;
77
- return Math.log((corpusSize + 1) / (wordDocs+1));
75
+ let { corpus } = space;
76
+ let wordDocs = corpus.wordDocCount[word] || 0;
77
+ return Math.log((corpus.size + 1) / (wordDocs + 1));
78
78
  }
79
79
 
80
- static idfTunable(space, word, idfWeight = this.idfWeight) {
80
+ static idfTunable(space, word, idfWeight) {
81
81
  const msg = 'w7e.idf:';
82
- let { corpusBow, corpusSize } = space;
83
- let wordDocs = corpusBow[word] || 0;
82
+ let { corpus } = space;
83
+ let wordDocs = corpus.wordDocCount[word] || 0;
84
84
  // NOTE: This is NOT the usual formula
85
85
  // Map to [0:ignore..1:important]
86
- return corpusSize
87
- ? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
86
+ return corpus.size
87
+ ? 1 -
88
+ Math.exp(((wordDocs - corpus.size) / wordDocs) * idfWeight)
88
89
  : 1;
89
90
  }
90
91
 
91
- idf(word, idfWeight) {
92
+ idf(word, idfWeight = this.idfWeight) {
92
93
  return this.idfFunction(this, word, idfWeight);
93
94
  }
94
95
 
95
- addDocument(doc) {
96
- let { corpusBow } = this;
97
- this.corpusSize += 1;
98
- let { bow } = this.countWords(doc, 1); // one-hot
99
- corpusBow.increment(bow);
96
+ addDocument(id, doc) {
97
+ let { corpus } = this;
98
+ let { bow, words } = this.countWords(doc);
99
+ corpus.wordDocCount.increment(bow.oneHot());
100
+
101
+ let docInfo = {
102
+ bow,
103
+ nWords: words.length,
104
+ };
105
+ corpus.addDocument(id, docInfo);
100
106
 
101
- return this;
107
+ return docInfo;
102
108
  }
103
109
 
104
110
  termFrequency(word, document) {
@@ -113,7 +119,7 @@ export class TfidfSpace {
113
119
 
114
120
  tfidf(doc) {
115
121
  const msg = 'w7e.tfidf:';
116
- let { corpusBow, corpusSize, idfWeight } = this;
122
+ let { corpus, idfWeight } = this;
117
123
 
118
124
  // More efficient implementation of tf * idf
119
125
  let { bow, words } = this.countWords(doc);
@@ -122,9 +128,9 @@ export class TfidfSpace {
122
128
  let vTfIdf = words.reduce((a, word) => {
123
129
  let wd = bow[word] || 0;
124
130
  let tf = wd ? wd / nWords : 0;
125
- let wc = corpusBow[word] || 0;
126
- let idf = corpusSize
127
- ? 1 - Math.exp(((wc - corpusSize) / wc) * idfWeight)
131
+ let wc = corpus.wordDocCount[word] || 0;
132
+ let idf = corpus.size
133
+ ? 1 - Math.exp(((wc - corpus.size) / wc) * idfWeight)
128
134
  : 1;
129
135
  let tfidf = tf * idf;
130
136
  if (tfidf) {
@@ -136,7 +142,7 @@ export class TfidfSpace {
136
142
  return vTfIdf;
137
143
  }
138
144
 
139
- countWords(str, maxCount) {
145
+ countWords(str) {
140
146
  const msg = 'w7e.countWords:';
141
147
  if (str == null) {
142
148
  throw new Error(`${msg} str?`);
@@ -145,8 +151,7 @@ export class TfidfSpace {
145
151
  let sNorm = this.normalizeText(str);
146
152
  let words = sNorm.split(' ');
147
153
  let bow = words.reduce((a, w) => {
148
- let count = (a[w] || 0) + 1;
149
- a[w] = maxCount ? Math.min(maxCount, count) : count;
154
+ a[w] = (a[w] || 0) + 1;
150
155
  return a;
151
156
  }, new WordVector());
152
157
 
@@ -102,7 +102,6 @@ class Vector extends Object {
102
102
  let den = norm1 * norm2;
103
103
  return den ? d / den : 0;
104
104
  }
105
-
106
105
  } // Vector
107
106
 
108
107
  export class WordMapTransformer {
@@ -79,7 +79,7 @@ export class WordVector extends Object {
79
79
  }
80
80
 
81
81
  scale(c) {
82
- return Object.keys(this).reduce((a,k)=>{
82
+ return Object.keys(this).reduce((a, k) => {
83
83
  a[k] *= c;
84
84
  return a;
85
85
  }, this);
@@ -111,7 +111,7 @@ export class WordVector extends Object {
111
111
  }
112
112
 
113
113
  oneHot() {
114
- return Object.keys(this).reduce((a,k)=>{
114
+ return Object.keys(this).reduce((a, k) => {
115
115
  if (this[k] > 0) {
116
116
  a[k] = 1;
117
117
  }