@sc-voice/tools 2.3.0 → 2.5.0

This diff shows the changes between publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
package/index.mjs CHANGED
@@ -12,6 +12,8 @@ import { MerkleJson } from './src/text/merkle-json.mjs';
12
12
  import { SuttaCentralId } from './src/text/sutta-central-id.mjs';
13
13
  import { Unicode } from './src/text/unicode.mjs';
14
14
  import { WordSpace } from './src/text/word-space.mjs';
15
+ import { WordVector } from './src/text/word-vector.mjs';
16
+ import { TfidfSpace } from './src/text/tfidf-space.mjs';
15
17
  import { LogEntry, Logger } from './src/text/logger.mjs';
16
18
 
17
19
  export const Text = {
@@ -24,6 +26,8 @@ export const Text = {
24
26
  SuttaCentralId,
25
27
  Unicode,
26
28
  WordSpace,
29
+ WordVector,
30
+ TfidfSpace,
27
31
  };
28
32
 
29
33
  import { default as Sankey } from './src/graph/sankey.mjs';
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.3.0",
3
+ "version": "2.5.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -0,0 +1,168 @@
1
+ import { DBG } from '../defines.mjs';
2
+ import { WordVector } from './word-vector.mjs';
3
+
4
+ // The golden ratio is pretty.
5
+ // 1.6180339887498948482045868343656381177203091798057628621354;
6
+ const GOLDEN_FUDGE = 1.618033988749895;
7
+
8
export class TfidfSpace {
  /**
   * A TF-IDF vector space over an incrementally built corpus.
   * @param {object} [opts]
   * @param {string} [opts.lang='en'] - 2-letter language code (en, fr)
   * @param {WordVector} [opts.corpusBow] - corpus bag of words
   *   (per-word document counts)
   * @param {number} [opts.corpusSize=0] - number of retrieval units
   *   (docs, segments, etc.)
   * @param {number} [opts.idfWeight=GOLDEN_FUDGE] - IDF dampening
   * @param {function} [opts.idfFunction=TfidfSpace.idfTunable] -
   *   (space, word, idfWeight) => idf
   * @param {function} [opts.normalizeText] - text normalizer; defaults
   *   by lang for 'en'/'fr'
   * @throws {Error} when lang is null or no normalizer exists for lang
   */
  constructor(opts = {}) {
    const msg = 't8e.ctor:';
    let {
      lang = 'en', // 2-letter code: fr, en, es, pt
      corpusBow = new WordVector(), // corpus bag of words
      corpusSize = 0, // number of retrieval units (docs, segments, etc.)
      idfWeight = GOLDEN_FUDGE, // IDF dampening
      idfFunction = TfidfSpace.idfTunable,
      normalizeText,
    } = opts;
    if (lang == null) {
      throw new Error(`${msg} lang?`);
    }
    if (normalizeText == null) {
      switch (lang) {
        case 'fr':
          normalizeText = TfidfSpace.normalizeFR;
          break;
        case 'en':
          normalizeText = TfidfSpace.normalizeEN;
          break;
        default:
          throw new Error(`${msg} normalizeText?`);
      }
    }
    // Non-enumerable so functions are excluded from serialization
    Object.defineProperty(this, 'normalizeText', {
      value: normalizeText,
    });
    Object.defineProperty(this, 'idfFunction', {
      value: idfFunction,
    });

    // Serializable properties
    Object.assign(this, {
      lang,
      corpusBow,
      corpusSize,
      idfWeight,
    });
  }

  /**
   * Normalize English text: lowercase, strip punctuation,
   * collapse repeated spaces, trim.
   */
  static normalizeEN(s) {
    return s
      .toLowerCase()
      .replace(/[-.,_:;"'“”‘’!?]/g, '')
      .replace(/ {2,}/g, ' ')
      .trim();
  }

  /**
   * Normalize French text: lowercase, expand elisions (d'/l'/s'),
   * strip punctuation and guillemets, collapse spaces (incl. NBSP), trim.
   */
  static normalizeFR(s) {
    return s
      .toLowerCase()
      .replace(/[«»]/gi, '')
      .replace(/\bd[’']/gi, 'de ')
      .replace(/\bl[’']/gi, 'le ')
      .replace(/\bs[’']/gi, 'se ')
      .replace(/[-.,_:;"'“”‘’!?]/g, '')
      .replace(/[  ]+/g, ' ')
      .trim();
  }

  /**
   * Standard smoothed IDF: log((N+1)/(df+1)).
   * @param {TfidfSpace} space
   * @param {string} word
   * @returns {number}
   */
  static idfStandard(space, word) {
    let { corpusBow, corpusSize } = space;
    let wordDocs = corpusBow[word] || 0;
    return Math.log((corpusSize + 1) / (wordDocs + 1));
  }

  /**
   * Tunable IDF mapped to [0:ignore .. 1:important].
   * NOTE: This is NOT the usual IDF formula.
   * Bug fix: the default idfWeight now comes from `space` rather than
   * `this`, which is unreliable in a static method (when called unbound,
   * `this.idfWeight` is undefined and the result becomes NaN). For the
   * instance path via idf(), `this === space`, so behavior is unchanged.
   * @param {TfidfSpace} space
   * @param {string} word
   * @param {number} [idfWeight=space.idfWeight]
   * @returns {number}
   */
  static idfTunable(space, word, idfWeight = space.idfWeight) {
    let { corpusBow, corpusSize } = space;
    let wordDocs = corpusBow[word] || 0;
    // wordDocs===0 gives exponent -Infinity, hence idf 1 (most important)
    return corpusSize
      ? 1 - Math.exp(((wordDocs - corpusSize) / wordDocs) * idfWeight)
      : 1;
  }

  // Inverse document frequency of word per the configured idfFunction
  idf(word, idfWeight) {
    return this.idfFunction(this, word, idfWeight);
  }

  /**
   * Add a document to the corpus: bump corpusSize and, for each distinct
   * word, that word's document count in corpusBow (one-hot per document).
   * @param {string} doc
   * @returns {TfidfSpace} this (chainable)
   */
  addDocument(doc) {
    let { corpusBow } = this;
    this.corpusSize += 1;
    let { bow } = this.countWords(doc, 1); // one-hot
    corpusBow.increment(bow);

    return this;
  }

  // Long-form alias for tf()
  termFrequency(word, document) {
    return this.tf(word, document);
  }

  /**
   * Term frequency: occurrences of word in doc divided by doc token count.
   * @param {string} word
   * @param {string} doc
   * @returns {number}
   */
  tf(word, doc) {
    let { bow, words } = this.countWords(doc);
    let count = bow[word] || 0;
    return count ? count / words.length : 0;
  }

  /**
   * TF-IDF vector of doc.
   * Bug fix: uses the configured idfFunction (via this.idf) instead of a
   * hard-coded copy of idfTunable, so a space constructed with
   * idfStandard (or any custom idfFunction) is honored here too.
   * Also iterates distinct words rather than every token, which avoids
   * recomputing identical entries for repeated words.
   * @param {string} doc
   * @returns {WordVector} sparse vector of nonzero tf*idf weights
   */
  tfidf(doc) {
    let { bow, words } = this.countWords(doc);
    let nWords = words.length;

    return Object.keys(bow).reduce((a, word) => {
      let tf = bow[word] / nWords;
      let tfidf = tf * this.idf(word);
      if (tfidf) {
        a[word] = tfidf;
      }
      return a;
    }, new WordVector());
  }

  /**
   * Tokenize str (after normalizeText) on single spaces and count words.
   * @param {string} str
   * @param {number} [maxCount] - cap per-word count (1 => one-hot)
   * @returns {{bow: WordVector, words: string[]}}
   * @throws {Error} when str is null/undefined
   */
  countWords(str, maxCount) {
    const msg = 'w7e.countWords:';
    if (str == null) {
      throw new Error(`${msg} str?`);
    }
    let sNorm = this.normalizeText(str);
    let words = sNorm.split(' ');
    let bow = words.reduce((a, w) => {
      let count = (a[w] || 0) + 1;
      a[w] = maxCount ? Math.min(maxCount, count) : count;
      return a;
    }, new WordVector());

    return { bow, words };
  }

  /**
   * Bag of words of text with uncapped counts.
   * Delegates to countWords() to avoid duplicating tokenization logic.
   * @param {string} text
   * @returns {WordVector}
   * @throws {Error} when text is null/undefined
   */
  bowOfText(text) {
    const msg = 'w7e.bowOfText:';
    if (text == null) {
      throw new Error(`${msg} text?`);
    }
    return this.countWords(text).bow;
  }
} // TfidfSpace
@@ -0,0 +1,112 @@
1
+ import { DBG } from '../defines.mjs';
2
+
3
+ // The golden ratio is pretty.
4
+ // 1.6180339887498948482045868343656381177203091798057628621354;
5
+ const GOLDEN_FUDGE = 1.618033988749895;
6
+
7
export class WordVector extends Object {
  /**
   * A sparse word=>number vector whose coordinates are own enumerable
   * properties, with vector operations (norm, dot, cosine similarity, ...).
   * @param {object} [props] - initial word:value coordinates
   */
  constructor(props) {
    super();
    Object.assign(this, props);
    // Cached key count. Non-enumerable (defineProperty default), so it is
    // excluded from Object.keys/entries and from Object.assign copies.
    Object.defineProperty(this, '$length', {
      writable: true,
    });
  }

  /**
   * Number of distinct words (dimensions) in the vector.
   * Lazily computed and cached in $length; mutators that can add
   * keys (increment) invalidate the cache.
   * @returns {number}
   */
  get length() {
    if (this.$length == null) {
      this.$length = Object.keys(this).length;
    }
    return this.$length;
  }

  // Comma-separated "word:value" pairs, values fixed to 2 decimals
  toString() {
    let sv = Object.entries(this).reduce((a, e) => {
      let [k, v] = e;
      let vf = v.toFixed(2);
      a.push(`${k}:${vf}`);
      return a;
    }, []);
    return sv.join(',');
  }

  /**
   * Euclidean (L2) norm; 0 for an empty vector.
   * @returns {number}
   */
  norm() {
    let keys = Object.keys(this);
    if (keys.length === 0) {
      return 0;
    }
    let sumSqr = keys.reduce((a, k) => {
      let v = this[k];
      return a + v * v;
    }, 0);
    return Math.sqrt(sumSqr);
  }

  /**
   * Vector sum as a NEW WordVector; this is not mutated.
   * Zero/falsy coordinates of vec2 are skipped.
   * @param {object} vec2
   * @returns {WordVector}
   */
  add(vec2) {
    let keys = Object.keys(vec2);
    return keys.reduce((a, k) => {
      let v2 = vec2[k];
      if (v2) {
        a[k] = (a[k] || 0) + v2;
      }
      return a;
    }, new WordVector(this));
  }

  /**
   * In-place vector sum; mutates and returns this.
   * Zero/falsy coordinates of vec2 are skipped.
   * Bug fix: invalidates the cached $length, since new keys may be
   * added; previously `length` went stale after increment().
   * @param {object} vec2
   * @returns {WordVector} this
   */
  increment(vec2) {
    this.$length = undefined; // key set may grow; recompute lazily
    let keys = Object.keys(vec2);
    return keys.reduce((a, k) => {
      let v2 = vec2[k];
      if (v2) {
        a[k] = (a[k] || 0) + v2;
      }
      return a;
    }, this);
  }

  /**
   * Dot product over the keys of this; missing vec2 coordinates count as 0.
   * @param {object} vec2
   * @returns {number}
   * @throws {Error} when vec2 is null/undefined
   */
  dot(vec2) {
    const msg = 'w8r.dot:';
    if (vec2 == null) {
      throw new Error(`${msg} vec2?`);
    }
    let keys = Object.keys(this);
    return keys.reduce((a, k) => {
      let v1 = this[k];
      let v2 = vec2[k] || 0;

      return a + v1 * v2;
    }, 0);
  }

  /**
   * In-place scalar multiplication; mutates and returns this.
   * (Key set is unchanged, so the length cache stays valid.)
   * @param {number} c
   * @returns {WordVector} this
   */
  scale(c) {
    return Object.keys(this).reduce((a, k) => {
      a[k] *= c;
      return a;
    }, this);
  }

  /**
   * Element-wise product as a NEW WordVector, keeping only keys where
   * both coordinates are nonzero/truthy.
   * @param {object} [vec2={}]
   * @returns {WordVector}
   */
  intersect(vec2 = {}) {
    let keys = Object.keys(this);
    return keys.reduce((a, k) => {
      let v1 = this[k];
      let v2 = vec2[k] || 0;
      if (v1 && v2) {
        a[k] = v1 * v2;
      }

      return a;
    }, new WordVector());
  }

  /**
   * Cosine similarity with vec2 (dot / (|this| * |vec2|));
   * 0 when either norm is 0.
   * @param {WordVector} vec2
   * @returns {number}
   * @throws {Error} when vec2 is null/undefined
   */
  similar(vec2) {
    const msg = 'w8r.similar:';
    if (vec2 == null) {
      throw new Error(`${msg} vec2?`);
    }
    let d = this.dot(vec2);
    let norm1 = this.norm();
    let norm2 = vec2.norm();
    let den = norm1 * norm2;
    return den ? d / den : 0;
  }
} // WordVector