@sc-voice/tools 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.mjs CHANGED
@@ -12,6 +12,8 @@ import { MerkleJson } from './src/text/merkle-json.mjs';
12
12
  import { SuttaCentralId } from './src/text/sutta-central-id.mjs';
13
13
  import { Unicode } from './src/text/unicode.mjs';
14
14
  import { WordSpace } from './src/text/word-space.mjs';
15
+ import { WordVector } from './src/text/word-vector.mjs';
16
+ import { TfidfSpace } from './src/text/tfidf-space.mjs';
15
17
  import { LogEntry, Logger } from './src/text/logger.mjs';
16
18
 
17
19
  export const Text = {
@@ -24,6 +26,8 @@ export const Text = {
24
26
  SuttaCentralId,
25
27
  Unicode,
26
28
  WordSpace,
29
+ WordVector,
30
+ TfidfSpace,
27
31
  };
28
32
 
29
33
  import { default as Sankey } from './src/graph/sankey.mjs';
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sc-voice/tools",
3
- "version": "2.3.0",
3
+ "version": "2.4.0",
4
4
  "description": "Utilities for SC-Voice",
5
5
  "main": "index.mjs",
6
6
  "files": [
@@ -0,0 +1,156 @@
1
+ import { DBG } from '../defines.mjs';
2
+ import { WordVector } from './word-vector.mjs';
3
+
4
+ // The golden ratio is pretty.
5
+ // 1.6180339887498948482045868343656381177203091798057628621354;
6
+ const GOLDEN_FUDGE = 1.618033988749895;
7
+
8
// Term-frequency / inverse-document-frequency vector space over a
// corpus of retrieval units (documents, segments, etc.).
// All own enumerable properties are serializable; normalizeText is not.
export class TfidfSpace {
  /**
   * @param {object} [opts]
   * @param {string} [opts.lang='en'] - 2-letter code: fr, en, es, pt
   * @param {WordVector} [opts.corpusBow] - corpus bag of words
   * @param {number} [opts.corpusSize=0] - number of retrieval units
   * @param {number} [opts.idfWeight=GOLDEN_FUDGE] - IDF dampening
   * @param {function} [opts.normalizeText] - string normalizer;
   *   defaults to the built-in normalizer for lang
   * @throws {Error} when lang is null or no default normalizer exists
   */
  constructor(opts = {}) {
    const msg = 't8e.ctor:';
    let {
      lang = 'en', // 2-letter code: fr, en, es, pt
      corpusBow = new WordVector(), // corpus bag of words
      corpusSize = 0, // number of retrieval units (docs, segments, etc.)
      idfWeight = GOLDEN_FUDGE, // IDF dampening
      normalizeText,
    } = opts;
    if (lang == null) {
      throw new Error(`${msg} lang?`);
    }
    if (normalizeText == null) {
      switch (lang) {
        case 'fr':
          normalizeText = TfidfSpace.normalizeFR;
          break;
        case 'en':
          normalizeText = TfidfSpace.normalizeEN;
          break;
        default:
          throw new Error(`${msg} normalizeText?`);
      }
    }
    // Non-enumerable so the function is excluded from serialization
    Object.defineProperty(this, 'normalizeText', {
      value: normalizeText,
    });

    // Serializable properties
    Object.assign(this, {
      lang,
      corpusBow,
      corpusSize,
      idfWeight,
    });
  }

  // Lowercase, strip punctuation/quotes, collapse runs of spaces
  static normalizeEN(s) {
    return s
      .toLowerCase()
      .replace(/[-.,_:;"'“”‘’!?]/g, '')
      .replace(/ {2,}/g, ' ')
      .trim();
  }

  // Lowercase, strip guillemets, expand elisions (d'/l'/s'),
  // strip punctuation/quotes, collapse spaces (incl. no-break space)
  static normalizeFR(s) {
    return s
      .toLowerCase()
      .replace(/[«»]/gi, '')
      .replace(/\bd[’']/gi, 'de ')
      .replace(/\bl[’']/gi, 'le ')
      .replace(/\bs[’']/gi, 'se ')
      .replace(/[-.,_:;"'“”‘’!?]/g, '')
      .replace(/[  ]+/g, ' ')
      .trim();
  }

  /**
   * Add one retrieval unit (document) to the corpus statistics.
   * Each distinct word counts at most once per document (one-hot).
   * @param {string} doc
   * @returns {TfidfSpace} this
   */
  addDocument(doc) {
    this.corpusSize += 1;
    let { bow } = this.countWords(doc, 1); // one-hot
    this.corpusBow.increment(bow);

    return this;
  }

  /** Long-form alias for idf() */
  inverseDocumentFrequency(word, idfWeight) {
    return this.idf(word, idfWeight);
  }

  /**
   * Inverse document frequency, mapped to [0:ignore..1:important].
   * @param {string} word - normalized word
   * @param {number} [idfWeight=this.idfWeight] - dampening factor
   * @returns {number}
   */
  idf(word, idfWeight = this.idfWeight) {
    let { corpusBow, corpusSize } = this;
    if (corpusSize === 0) {
      return 1; // empty corpus: nothing is common yet
    }
    let wCount = corpusBow[word] || 0;
    if (wCount === 0) {
      // Word never seen in corpus: maximally informative.
      // (Original formula reached the same value via exp(-Infinity) === 0.)
      return 1;
    }
    return 1 - Math.exp(((wCount - corpusSize) / wCount) * idfWeight);
  }

  /** Long-form alias for tf() */
  termFrequency(word, document) {
    return this.tf(word, document);
  }

  /**
   * Term frequency of word within doc.
   * @param {string} word - normalized word
   * @param {string} doc
   * @returns {number} count(word)/totalTokens, or 0 if absent
   */
  tf(word, doc) {
    let { bow, words } = this.countWords(doc);
    let count = bow[word] || 0;
    return count ? count / words.length : 0;
  }

  /**
   * TF-IDF vector for a document.
   * @param {string} doc
   * @returns {WordVector} word => tf*idf for words with non-zero weight
   */
  tfidf(doc) {
    let { bow, words } = this.countWords(doc);
    let nWords = words.length; // total tokens, including duplicates

    // Iterate distinct words once rather than once per token
    return Object.keys(bow).reduce((a, word) => {
      let wTfidf = (bow[word] / nWords) * this.idf(word);
      if (wTfidf) {
        a[word] = wTfidf;
      }
      return a;
    }, new WordVector());
  }

  /**
   * Normalize str and count its words.
   * @param {string} str
   * @param {number} [maxCount] - cap per-word counts (1 => one-hot)
   * @returns {{bow:WordVector, words:string[]}} bag of words + token list
   * @throws {Error} when str is null
   */
  countWords(str, maxCount) {
    const msg = 't8e.countWords:';
    if (str == null) {
      throw new Error(`${msg} str?`);
    }
    let words = this.normalizeText(str).split(' ');
    let bow = words.reduce((a, w) => {
      let count = (a[w] || 0) + 1;
      a[w] = maxCount ? Math.min(maxCount, count) : count;
      return a;
    }, new WordVector());

    return { bow, words };
  }

  /**
   * Map str to a scaled word-count vector.
   * @param {string} str
   * @param {number} [scale=1] - weight added per occurrence
   * @returns {WordVector}
   * @throws {Error} when str is null
   */
  string2Vector(str, scale = 1) {
    const msg = 't8e.string2Vector:';
    if (str == null) {
      throw new Error(`${msg} str?`);
    }
    let words = this.normalizeText(str).split(' ');
    return words.reduce((a, w) => {
      a[w] = (a[w] || 0) + scale;
      return a;
    }, new WordVector());
  }
} // TfidfSpace
@@ -0,0 +1,105 @@
1
+ import { DBG } from '../defines.mjs';
2
+
3
+ // The golden ratio is pretty.
4
+ // 1.6180339887498948482045868343656381177203091798057628621354;
5
+ const GOLDEN_FUDGE = 1.618033988749895;
6
+
7
// Sparse word vector: own enumerable properties map word => number.
// Extends Object so instances serialize as plain key/value maps.
export class WordVector extends Object {
  /**
   * @param {object} [props] - initial word:value entries
   */
  constructor(props) {
    super();
    Object.assign(this, props);
    // Non-enumerable scratch slot (kept for backward compatibility);
    // excluded from serialization and key iteration.
    Object.defineProperty(this, '$length', {
      writable: true,
    });
  }

  /**
   * Number of distinct words.
   * Recomputed on every access: mutators such as increment() and direct
   * property writes would otherwise leave a cached value stale.
   */
  get length() {
    this.$length = Object.keys(this).length;
    return this.$length;
  }

  /** @returns {string} comma-joined "word:value" with 2-decimal values */
  toString() {
    return Object.entries(this)
      .map(([k, v]) => `${k}:${v.toFixed(2)}`)
      .join(',');
  }

  /** @returns {number} Euclidean (L2) norm; 0 for an empty vector */
  norm() {
    let sumSqr = Object.keys(this).reduce((a, k) => a + this[k] * this[k], 0);
    return Math.sqrt(sumSqr);
  }

  /**
   * Vector sum without mutating this.
   * @param {object} vec2
   * @returns {WordVector} new vector equal to this + vec2
   */
  add(vec2) {
    return new WordVector(this).increment(vec2);
  }

  /**
   * Vector sum, mutating this in place.
   * Zero/falsy entries in vec2 are skipped.
   * @param {object} vec2
   * @returns {WordVector} this
   */
  increment(vec2) {
    for (let [k, v2] of Object.entries(vec2)) {
      if (v2) {
        this[k] = (this[k] || 0) + v2;
      }
    }
    return this;
  }

  /**
   * Dot product.
   * @param {object} vec2
   * @returns {number}
   * @throws {Error} when vec2 is null
   */
  dot(vec2) {
    const msg = 'w8r.dot:';
    if (vec2 == null) {
      throw new Error(`${msg} vec2?`);
    }
    return Object.keys(this).reduce((a, k) => a + this[k] * (vec2[k] || 0), 0);
  }

  /**
   * Element-wise product of words non-zero in both vectors.
   * @param {object} [vec2]
   * @returns {WordVector}
   */
  intersect(vec2 = {}) {
    return Object.keys(this).reduce((a, k) => {
      let v1 = this[k];
      let v2 = vec2[k] || 0;
      if (v1 && v2) {
        a[k] = v1 * v2;
      }
      return a;
    }, new WordVector());
  }

  /**
   * Cosine similarity.
   * @param {WordVector} vec2
   * @returns {number} dot/(|this|*|vec2|), or 0 when either norm is 0
   * @throws {Error} when vec2 is null
   */
  similar(vec2) {
    const msg = 'w8r.similar:';
    if (vec2 == null) {
      throw new Error(`${msg} vec2?`);
    }
    let den = this.norm() * vec2.norm();
    return den ? this.dot(vec2) / den : 0;
  }
} // WordVector