@sc-voice/tools 2.8.0 → 2.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.mjs CHANGED
@@ -6,6 +6,7 @@ export const ScvMath = {
6
6
  };
7
7
 
8
8
  import { BilaraPath } from './src/text/bilara-path.mjs';
9
+ import { Corpus } from './src/text/corpus.mjs';
9
10
  import { EbtDoc } from './src/text/ebt-doc.mjs';
10
11
  import { LegacyDoc } from './src/text/legacy-doc.mjs';
11
12
  import { MerkleJson } from './src/text/merkle-json.mjs';
@@ -18,6 +19,7 @@ import { LogEntry, Logger } from './src/text/logger.mjs';
18
19
 
19
20
  export const Text = {
20
21
  BilaraPath,
22
+ Corpus,
21
23
  EbtDoc,
22
24
  LegacyDoc,
23
25
  LogEntry,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.8.0",
3
+ "version": "2.10.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -0,0 +1,39 @@
1
+ import { WordVector } from './word-vector.mjs';
2
+
3
/**
 * In-memory collection of documents keyed by id.
 *
 * Properties:
 * - `wordDocCount`: vector supplied by the caller (defaults to a new
 *   WordVector). This class only stores it; adding or deleting a
 *   document here does NOT update it.
 * - `docMap`: plain object mapping document id to document.
 * - `_size`: cached number of documents in `docMap`, exposed via `size`.
 */
export class Corpus {
  /**
   * @param {object} [opts]
   * @param {object} [opts.wordDocCount] - defaults to `new WordVector()`
   * @param {object} [opts.docMap] - initial id => document map
   */
  constructor(opts = {}) {
    const { wordDocCount = new WordVector(), docMap = {} } = opts;

    // Initial size is the number of own enumerable keys already present.
    this._size = Object.keys(docMap).length;

    Object.assign(this, {
      wordDocCount,
      docMap,
    });
  }

  /** @returns {number} number of documents currently stored */
  get size() {
    return this._size;
  }

  /**
   * Store `doc` under `id`, replacing any document already stored there.
   *
   * @param {string} id - document key
   * @param {*} doc - document to store
   */
  addDocument(id, doc) {
    // Remove first so that a replacement does not change the size.
    this.deleteDocument(id);
    this.docMap[id] = doc;
    this._size++;
  }

  /**
   * @param {string} id - document key
   * @returns {*} the stored document, or undefined if `id` is not stored
   */
  getDocument(id) {
    const { docMap } = this;

    // Own-property check: ids such as "constructor" or "toString" must not
    // resolve to members inherited from Object.prototype.
    return Object.prototype.hasOwnProperty.call(docMap, id)
      ? docMap[id]
      : undefined;
  }

  /**
   * Remove the document stored under `id`, if any.
   *
   * @param {string} id - document key
   * @returns {*} the removed document, or undefined if `id` was not stored
   */
  deleteDocument(id) {
    const { docMap } = this;

    // Presence is decided by key ownership, not by the truthiness of the
    // value: falsy documents (0, '') are still stored documents, and
    // inherited Object.prototype members are not.
    if (!Object.prototype.hasOwnProperty.call(docMap, id)) {
      return undefined;
    }

    const doc = docMap[id];
    delete docMap[id];
    this._size--;

    return doc;
  }
}
@@ -1,4 +1,5 @@
1
1
  import { DBG } from '../defines.mjs';
2
+ import { Corpus } from './corpus.mjs';
2
3
  import { WordVector } from './word-vector.mjs';
3
4
 
4
5
  // The golden ratio is pretty.
@@ -10,8 +11,7 @@ export class TfidfSpace {
10
11
  const msg = 't8e.ctor:';
11
12
  let {
12
13
  lang = 'en', // 2-letter code: fr, en, es, pt
13
- corpusBow = new WordVector(), // corpus bag of words
14
- corpusSize = 0, // number of retrieval units (docs, segments, etc.)
14
+ corpus = new Corpus(),
15
15
  idfWeight = GOLDEN_FUDGE, // IDF dampening
16
16
  idfFunction = TfidfSpace.idfTunable,
17
17
  normalizeText,
@@ -41,8 +41,7 @@ export class TfidfSpace {
41
41
  // Serializable properties
42
42
  Object.assign(this, {
43
43
  lang,
44
- corpusBow,
45
- corpusSize,
44
+ corpus,
46
45
  idfWeight,
47
46
  });
48
47
  }
@@ -59,46 +58,65 @@ export class TfidfSpace {
59
58
  }
60
59
 
61
60
  static normalizeFR(s) {
62
- let sAbbr = s.toLowerCase()
61
+ let sAbbr = s
62
+ .toLowerCase()
63
63
  .replace(/\bd[’']/gi, 'de ')
64
64
  .replace(/\bl[’']/gi, 'le ')
65
65
  .replace(/\bs[’']/gi, 's_')
66
66
  .replace(/\bj[’']/gi, 'j_')
67
67
  .replace(/\bm[’']/gi, 'm_')
68
68
  .replace(/\bn[’']/gi, 'n_')
69
- .replace(/\bc[’']/gi, 'c_')
69
+ .replace(/\bc[’']/gi, 'c_');
70
70
  return TfidfSpace.removeNonWords(sAbbr);
71
71
  }
72
72
 
73
73
  static idfStandard(space, word) {
74
74
  const msg = 'w7e.idfStandard:';
75
- let { corpusBow, corpusSize } = space;
76
- let wordDocs = corpusBow[word] || 0;
77
- return Math.log((corpusSize + 1) / (wordDocs+1));
75
+ let { corpus } = space;
76
+ let wordDocs = corpus.wordDocCount[word] || 0;
77
+ return Math.log((corpus.size + 1) / (wordDocs + 1));
78
78
  }
79
79
 
80
- static idfTunable(space, word, idfWeight = this.idfWeight) {
80
+ static idfTunable(space, word, idfWeight) {
81
81
  const msg = 'w7e.idf:';
82
- let { corpusBow, corpusSize } = space;
83
- let wordDocs = corpusBow[word] || 0;
82
+ let { corpus } = space;
83
+ let wordDocs = corpus.wordDocCount[word] || 0;
84
84
  // NOTE: This is NOT the usual formula
85
85
  // Map to [0:ignore..1:important]
86
- return corpusSize
87
- ? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
86
+ return corpus.size
87
+ ? 1 -
88
+ Math.exp(((wordDocs - corpus.size) / wordDocs) * idfWeight)
88
89
  : 1;
89
90
  }
90
91
 
91
- idf(word, idfWeight) {
92
+ idf(word, idfWeight = this.idfWeight) {
92
93
  return this.idfFunction(this, word, idfWeight);
93
94
  }
94
95
 
95
- addDocument(doc) {
96
- let { corpusBow } = this;
97
- this.corpusSize += 1;
98
- let { bow } = this.countWords(doc, 1); // one-hot
99
- corpusBow.increment(bow);
96
+ addCorpusDocument(id, bow, nWords) {
97
+ const msg = 't8w.addCorpusDocument:';
98
+ let { corpus } = this;
99
+ if (id == null) {
100
+ throw new Error(`${msg} id?`);
101
+ }
102
+ if (bow == null) {
103
+ throw new Error(`${msg} bow?`);
104
+ }
105
+ if (nWords == null) {
106
+ throw new Error(`${msg} nWords?`);
107
+ }
108
+ let docInfo = { id, bow, nWords };
109
+ corpus.addDocument(id, docInfo);
100
110
 
101
- return this;
111
+ return docInfo;
112
+ }
113
+
114
+ addDocument(id, doc) {
115
+ let { corpus } = this;
116
+ let { bow, words } = this.countWords(doc);
117
+ corpus.wordDocCount.increment(bow.oneHot());
118
+
119
+ return this.addCorpusDocument(id, bow, words.length);
102
120
  }
103
121
 
104
122
  termFrequency(word, document) {
@@ -113,7 +131,7 @@ export class TfidfSpace {
113
131
 
114
132
  tfidf(doc) {
115
133
  const msg = 'w7e.tfidf:';
116
- let { corpusBow, corpusSize, idfWeight } = this;
134
+ let { corpus, idfWeight } = this;
117
135
 
118
136
  // More efficient implementation of tf * idf
119
137
  let { bow, words } = this.countWords(doc);
@@ -122,9 +140,9 @@ export class TfidfSpace {
122
140
  let vTfIdf = words.reduce((a, word) => {
123
141
  let wd = bow[word] || 0;
124
142
  let tf = wd ? wd / nWords : 0;
125
- let wc = corpusBow[word] || 0;
126
- let idf = corpusSize
127
- ? 1 - Math.exp(((wc - corpusSize) / wc) * idfWeight)
143
+ let wc = corpus.wordDocCount[word] || 0;
144
+ let idf = corpus.size
145
+ ? 1 - Math.exp(((wc - corpus.size) / wc) * idfWeight)
128
146
  : 1;
129
147
  let tfidf = tf * idf;
130
148
  if (tfidf) {
@@ -136,7 +154,7 @@ export class TfidfSpace {
136
154
  return vTfIdf;
137
155
  }
138
156
 
139
- countWords(str, maxCount) {
157
+ countWords(str) {
140
158
  const msg = 'w7e.countWords:';
141
159
  if (str == null) {
142
160
  throw new Error(`${msg} str?`);
@@ -145,8 +163,7 @@ export class TfidfSpace {
145
163
  let sNorm = this.normalizeText(str);
146
164
  let words = sNorm.split(' ');
147
165
  let bow = words.reduce((a, w) => {
148
- let count = (a[w] || 0) + 1;
149
- a[w] = maxCount ? Math.min(maxCount, count) : count;
166
+ a[w] = (a[w] || 0) + 1;
150
167
  return a;
151
168
  }, new WordVector());
152
169
 
@@ -102,7 +102,6 @@ class Vector extends Object {
102
102
  let den = norm1 * norm2;
103
103
  return den ? d / den : 0;
104
104
  }
105
-
106
105
  } // Vector
107
106
 
108
107
  export class WordMapTransformer {
@@ -79,7 +79,7 @@ export class WordVector extends Object {
79
79
  }
80
80
 
81
81
  scale(c) {
82
- return Object.keys(this).reduce((a,k)=>{
82
+ return Object.keys(this).reduce((a, k) => {
83
83
  a[k] *= c;
84
84
  return a;
85
85
  }, this);
@@ -111,7 +111,7 @@ export class WordVector extends Object {
111
111
  }
112
112
 
113
113
  oneHot() {
114
- return Object.keys(this).reduce((a,k)=>{
114
+ return Object.keys(this).reduce((a, k) => {
115
115
  if (this[k] > 0) {
116
116
  a[k] = 1;
117
117
  }