@sc-voice/tools 2.7.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.mjs CHANGED
@@ -6,6 +6,7 @@ export const ScvMath = {
6
6
  };
7
7
 
8
8
  import { BilaraPath } from './src/text/bilara-path.mjs';
9
+ import { Corpus } from './src/text/corpus.mjs';
9
10
  import { EbtDoc } from './src/text/ebt-doc.mjs';
10
11
  import { LegacyDoc } from './src/text/legacy-doc.mjs';
11
12
  import { MerkleJson } from './src/text/merkle-json.mjs';
@@ -18,6 +19,7 @@ import { LogEntry, Logger } from './src/text/logger.mjs';
18
19
 
19
20
  export const Text = {
20
21
  BilaraPath,
22
+ Corpus,
21
23
  EbtDoc,
22
24
  LegacyDoc,
23
25
  LogEntry,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.7.0",
3
+ "version": "2.9.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -0,0 +1,39 @@
1
+ import { WordVector } from './word-vector.mjs';
2
+
3
+ export class Corpus {
4
+ constructor(opts = {}) {
5
+ let { wordDocCount = new WordVector(), docMap = {} } = opts;
6
+
7
+ this._size = Object.keys(docMap).length;
8
+
9
+ Object.assign(this, {
10
+ wordDocCount,
11
+ docMap,
12
+ });
13
+ }
14
+
15
+ get size() {
16
+ return this._size;
17
+ }
18
+
19
+ addDocument(id, doc) {
20
+ this.deleteDocument(id);
21
+ this.docMap[id] = doc;
22
+ this._size++;
23
+ }
24
+
25
+ getDocument(id) {
26
+ return this.docMap[id];
27
+ }
28
+
29
+ deleteDocument(id) {
30
+ let { docMap } = this;
31
+ let doc = docMap[id];
32
+ if (doc) {
33
+ delete docMap[id];
34
+ this._size--;
35
+ }
36
+
37
+ return doc;
38
+ }
39
+ }
@@ -1,4 +1,5 @@
1
1
  import { DBG } from '../defines.mjs';
2
+ import { Corpus } from './corpus.mjs';
2
3
  import { WordVector } from './word-vector.mjs';
3
4
 
4
5
  // The golden ratio is pretty.
@@ -10,8 +11,7 @@ export class TfidfSpace {
10
11
  const msg = 't8e.ctor:';
11
12
  let {
12
13
  lang = 'en', // 2-letter code: fr, en, es, pt
13
- corpusBow = new WordVector(), // corpus bag of words
14
- corpusSize = 0, // number of retrieval units (docs, segments, etc.)
14
+ corpus = new Corpus(),
15
15
  idfWeight = GOLDEN_FUDGE, // IDF dampening
16
16
  idfFunction = TfidfSpace.idfTunable,
17
17
  normalizeText,
@@ -41,8 +41,7 @@ export class TfidfSpace {
41
41
  // Serializable properties
42
42
  Object.assign(this, {
43
43
  lang,
44
- corpusBow,
45
- corpusSize,
44
+ corpus,
46
45
  idfWeight,
47
46
  });
48
47
  }
@@ -59,45 +58,53 @@ export class TfidfSpace {
59
58
  }
60
59
 
61
60
  static normalizeFR(s) {
62
- let sAbbr = s.toLowerCase()
61
+ let sAbbr = s
62
+ .toLowerCase()
63
63
  .replace(/\bd[’']/gi, 'de ')
64
64
  .replace(/\bl[’']/gi, 'le ')
65
65
  .replace(/\bs[’']/gi, 's_')
66
66
  .replace(/\bj[’']/gi, 'j_')
67
67
  .replace(/\bm[’']/gi, 'm_')
68
68
  .replace(/\bn[’']/gi, 'n_')
69
+ .replace(/\bc[’']/gi, 'c_');
69
70
  return TfidfSpace.removeNonWords(sAbbr);
70
71
  }
71
72
 
72
73
  static idfStandard(space, word) {
73
74
  const msg = 'w7e.idfStandard:';
74
- let { corpusBow, corpusSize } = space;
75
- let wordDocs = corpusBow[word] || 0;
76
- return Math.log((corpusSize + 1) / (wordDocs+1));
75
+ let { corpus } = space;
76
+ let wordDocs = corpus.wordDocCount[word] || 0;
77
+ return Math.log((corpus.size + 1) / (wordDocs + 1));
77
78
  }
78
79
 
79
- static idfTunable(space, word, idfWeight = this.idfWeight) {
80
+ static idfTunable(space, word, idfWeight) {
80
81
  const msg = 'w7e.idf:';
81
- let { corpusBow, corpusSize } = space;
82
- let wordDocs = corpusBow[word] || 0;
82
+ let { corpus } = space;
83
+ let wordDocs = corpus.wordDocCount[word] || 0;
83
84
  // NOTE: This is NOT the usual formula
84
85
  // Map to [0:ignore..1:important]
85
- return corpusSize
86
- ? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
86
+ return corpus.size
87
+ ? 1 -
88
+ Math.exp(((wordDocs - corpus.size) / wordDocs) * idfWeight)
87
89
  : 1;
88
90
  }
89
91
 
90
- idf(word, idfWeight) {
92
+ idf(word, idfWeight = this.idfWeight) {
91
93
  return this.idfFunction(this, word, idfWeight);
92
94
  }
93
95
 
94
- addDocument(doc) {
95
- let { corpusBow } = this;
96
- this.corpusSize += 1;
97
- let { bow } = this.countWords(doc, 1); // one-hot
98
- corpusBow.increment(bow);
96
+ addDocument(id, doc) {
97
+ let { corpus } = this;
98
+ let { bow, words } = this.countWords(doc);
99
+ corpus.wordDocCount.increment(bow.oneHot());
100
+
101
+ let docInfo = {
102
+ bow,
103
+ nWords: words.length,
104
+ };
105
+ corpus.addDocument(id, docInfo);
99
106
 
100
- return this;
107
+ return docInfo;
101
108
  }
102
109
 
103
110
  termFrequency(word, document) {
@@ -112,7 +119,7 @@ export class TfidfSpace {
112
119
 
113
120
  tfidf(doc) {
114
121
  const msg = 'w7e.tfidf:';
115
- let { corpusBow, corpusSize, idfWeight } = this;
122
+ let { corpus, idfWeight } = this;
116
123
 
117
124
  // More efficient implementation of tf * idf
118
125
  let { bow, words } = this.countWords(doc);
@@ -121,9 +128,9 @@ export class TfidfSpace {
121
128
  let vTfIdf = words.reduce((a, word) => {
122
129
  let wd = bow[word] || 0;
123
130
  let tf = wd ? wd / nWords : 0;
124
- let wc = corpusBow[word] || 0;
125
- let idf = corpusSize
126
- ? 1 - Math.exp(((wc - corpusSize) / wc) * idfWeight)
131
+ let wc = corpus.wordDocCount[word] || 0;
132
+ let idf = corpus.size
133
+ ? 1 - Math.exp(((wc - corpus.size) / wc) * idfWeight)
127
134
  : 1;
128
135
  let tfidf = tf * idf;
129
136
  if (tfidf) {
@@ -135,7 +142,7 @@ export class TfidfSpace {
135
142
  return vTfIdf;
136
143
  }
137
144
 
138
- countWords(str, maxCount) {
145
+ countWords(str) {
139
146
  const msg = 'w7e.countWords:';
140
147
  if (str == null) {
141
148
  throw new Error(`${msg} str?`);
@@ -144,8 +151,7 @@ export class TfidfSpace {
144
151
  let sNorm = this.normalizeText(str);
145
152
  let words = sNorm.split(' ');
146
153
  let bow = words.reduce((a, w) => {
147
- let count = (a[w] || 0) + 1;
148
- a[w] = maxCount ? Math.min(maxCount, count) : count;
154
+ a[w] = (a[w] || 0) + 1;
149
155
  return a;
150
156
  }, new WordVector());
151
157
 
@@ -79,7 +79,7 @@ export class WordVector extends Object {
79
79
  }
80
80
 
81
81
  scale(c) {
82
- return Object.keys(this).reduce((a,k)=>{
82
+ return Object.keys(this).reduce((a, k) => {
83
83
  a[k] *= c;
84
84
  return a;
85
85
  }, this);
@@ -109,4 +109,13 @@ export class WordVector extends Object {
109
109
  let den = norm1 * norm2;
110
110
  return den ? d / den : 0;
111
111
  }
112
+
113
+ oneHot() {
114
+ return Object.keys(this).reduce((a, k) => {
115
+ if (this[k] > 0) {
116
+ a[k] = 1;
117
+ }
118
+ return a;
119
+ }, new WordVector());
120
+ }
112
121
  } // WordVector