@sc-voice/tools 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.mjs +21 -6
- package/package.json +2 -1
- package/src/defines.mjs +6 -1
- package/src/graph/sankey.mjs +56 -0
- package/src/math/fraction.mjs +8 -0
- package/src/text/ebt-doc.mjs +6 -2
- package/src/text/legacy-doc.mjs +29 -2
- package/src/text/word-space.mjs +83 -54
- package/src/{text → translate}/aligner.mjs +127 -38
- package/src/translate/deepl-adapter.mjs +353 -0
- package/src/translate/dpd-transformer.mjs +17 -0
- package/src/translate/mock-deepl.mjs +351 -0
- package/src/translate/quote-parser.mjs +681 -0
package/index.mjs
CHANGED
|
@@ -5,9 +5,6 @@ export const ScvMath = {
|
|
|
5
5
|
Fraction,
|
|
6
6
|
};
|
|
7
7
|
|
|
8
|
-
import {
|
|
9
|
-
Aligner, Alignment, AlignmentStatus
|
|
10
|
-
} from './src/text/aligner.mjs';
|
|
11
8
|
import { BilaraPath } from './src/text/bilara-path.mjs';
|
|
12
9
|
import { EbtDoc } from './src/text/ebt-doc.mjs';
|
|
13
10
|
import { LegacyDoc } from './src/text/legacy-doc.mjs';
|
|
@@ -17,9 +14,6 @@ import { Unicode } from './src/text/unicode.mjs';
|
|
|
17
14
|
import { WordSpace } from './src/text/word-space.mjs';
|
|
18
15
|
|
|
19
16
|
export const Text = {
|
|
20
|
-
Aligner,
|
|
21
|
-
Alignment,
|
|
22
|
-
AlignmentStatus,
|
|
23
17
|
BilaraPath,
|
|
24
18
|
EbtDoc,
|
|
25
19
|
LegacyDoc,
|
|
@@ -29,3 +23,24 @@ export const Text = {
|
|
|
29
23
|
WordSpace,
|
|
30
24
|
};
|
|
31
25
|
|
|
26
|
+
import { default as Sankey } from './src/graph/sankey.mjs';
|
|
27
|
+
// Namespace object grouping the graph utilities exported by this package.
// Terminated with an explicit semicolon, matching the other export objects
// in this module, instead of relying on automatic semicolon insertion.
export const Graph = {
  Sankey,
};
|
|
30
|
+
|
|
31
|
+
import {
|
|
32
|
+
Aligner, Alignment, AlignmentStatus
|
|
33
|
+
} from './src/translate/aligner.mjs';
|
|
34
|
+
import { DpdTransformer } from './src/translate/dpd-transformer.mjs';
|
|
35
|
+
import { MockDeepL } from './src/translate/mock-deepl.mjs';
|
|
36
|
+
import { DeepLAdapter } from './src/translate/deepl-adapter.mjs';
|
|
37
|
+
import { QuoteParser } from './src/translate/quote-parser.mjs';
|
|
38
|
+
// Namespace object grouping the translation utilities exported by this
// package. Terminated with an explicit semicolon, matching the other export
// objects in this module, instead of relying on automatic semicolon insertion.
export const Translate = {
  Aligner,
  Alignment,
  AlignmentStatus,
  DeepLAdapter,
  DpdTransformer,
  MockDeepL,
  QuoteParser,
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sc-voice/tools",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "Utilities for SC-Voice",
|
|
5
5
|
"main": "index.mjs",
|
|
6
6
|
"files": [
|
|
@@ -31,6 +31,7 @@
|
|
|
31
31
|
"homepage": "https://github.com/sc-voice/tools#readme",
|
|
32
32
|
"devDependencies": {
|
|
33
33
|
"@biomejs/biome": "1.9.4",
|
|
34
|
+
"deepl-node": "^1.15.0",
|
|
34
35
|
"eslint": "^9.17.0",
|
|
35
36
|
"mocha": "^11.0.1",
|
|
36
37
|
"should": "^13.2.3"
|
package/src/defines.mjs
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
|
|
2
2
|
// Debug flags. A falsy value (0) leaves the corresponding diagnostics off.
// Flags are read as `DBG.<NAME>`, so a name that is not declared here
// evaluates to undefined and can never be switched on from this table.
export const DBG = {
  ALIGN_ALL: 0,
  ALIGN_LINE: 0,
  ML_DOC_VECTORS: 0, // 'mn8:3.4',
  MN8_MOHAN: 0,
  DEEPL_ADAPTER: 0,
  DEEPL_MOCK: 0, // use mock-deepl
  DEEPL_MOCK_XLT: 0, // use mock translation
  DEEPL_TEST_API: 0, // test with live DeepL API ($$$)
  DEEPL_XLT: 0, // test live translation
  FETCH_LEGACY: 0,
  SANKEY: 0,
  WORD_MAP_TRANSFORMER: 0,
  WORD_MAP_TRANFORMER: 0, // misspelled name, retained for backward compatibility
};
|
|
9
14
|
|
|
package/src/graph/sankey.mjs
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { DBG } from '../defines.mjs';
|
|
2
|
+
|
|
3
|
+
/**
 * Accumulates weighted source -> target links for a Sankey diagram and
 * counts how often each node id is referenced.
 *
 * Enumerable state:
 *   links   - array of { source, target, value } link objects
 * Non-enumerable state (defined with Object.defineProperty):
 *   nodeMap - node id -> number of link references
 *   linkMap - `${source}|${target}` -> link object
 */
export default class Sankey {
  /**
   * @param {object} [opts]
   * @param {Array<object>} [opts.links] initial links. The array is
   *   shallow-copied; the link objects themselves are shared with the caller.
   */
  constructor(opts = {}) {
    const { links = [] } = opts;

    Object.assign(this, {
      links: [...links],
    });

    const nodeMap = {};
    Object.defineProperty(this, 'nodeMap', { value: nodeMap });

    // organize links by source/target
    const linkMap = {};
    Object.defineProperty(this, 'linkMap', { value: linkMap });

    for (const link of this.links) {
      const { source, target } = link;
      linkMap[`${source}|${target}`] = link;
      Sankey.#countNode(nodeMap, source);
      Sankey.#countNode(nodeMap, target);
    }
  }

  /**
   * Increment the reference count of a node id.
   * Only own properties are consulted so that ids that coincide with
   * inherited Object members (e.g. "constructor") start from zero.
   */
  static #countNode(nodeMap, id) {
    const isKnown = Object.prototype.hasOwnProperty.call(nodeMap, id);
    nodeMap[id] = (isKnown ? nodeMap[id] : 0) + 1;
  }

  /**
   * @returns {Array<{id: string}>} one entry per node id seen so far,
   *   in nodeMap key order.
   */
  get nodes() {
    return Object.keys(this.nodeMap).map((id) => ({ id }));
  }

  /**
   * Add `value` to the link from `source` to `target`, creating the link
   * (with value 0) when it does not exist yet, and count both end points.
   *
   * @param {object} link
   * @param {*} link.source required source node id
   * @param {*} link.target required target node id
   * @param {number} [link.value=0] amount added to the link value
   * @throws {Error} if source or target is null/undefined, or value is NaN.
   *   All arguments are validated before any state is changed, so a
   *   rejected call leaves the instance untouched.
   */
  addLink({ source, target, value = 0 }) {
    const msg = 'Sankey.addLink()';
    if (source == null) {
      throw new Error(`${msg} source? [${source}]`);
    }
    if (target == null) {
      throw new Error(`${msg} target? [${target}]`);
    }
    if (Number.isNaN(value)) {
      throw new Error(`${msg} value? [${value}]`);
    }

    const { links, linkMap, nodeMap } = this;
    Sankey.#countNode(nodeMap, source);
    Sankey.#countNode(nodeMap, target);

    const key = `${source}|${target}`;
    let link = linkMap[key];
    if (link == null) {
      link = linkMap[key] = { source, target, value: 0 };
      links.push(link);
    }
    link.value += value;
  }
}
|
package/src/math/fraction.mjs
CHANGED
|
@@ -41,10 +41,18 @@ export class Fraction {
|
|
|
41
41
|
return this.numerator;
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
+
set n(value) {
|
|
45
|
+
return this.numerator = Number(value);
|
|
46
|
+
}
|
|
47
|
+
|
|
44
48
|
get d() {
|
|
45
49
|
return this.denominator;
|
|
46
50
|
}
|
|
47
51
|
|
|
52
|
+
set d(value) {
|
|
53
|
+
return this.denominator = Number(value);
|
|
54
|
+
}
|
|
55
|
+
|
|
48
56
|
get value() {
|
|
49
57
|
let { numerator, denominator } = this;
|
|
50
58
|
return numerator / denominator;
|
package/src/text/ebt-doc.mjs
CHANGED
|
@@ -3,7 +3,11 @@ import { SuttaCentralId } from './sutta-central-id.mjs';
|
|
|
3
3
|
let privateCtor = false;
|
|
4
4
|
|
|
5
5
|
const INHERITED_KEYS = [
|
|
6
|
-
'lang',
|
|
6
|
+
'lang',
|
|
7
|
+
'author',
|
|
8
|
+
'author_uid',
|
|
9
|
+
'wordSpace',
|
|
10
|
+
'footer',
|
|
7
11
|
];
|
|
8
12
|
|
|
9
13
|
const HDR_KEY = '__header__';
|
|
@@ -27,7 +31,7 @@ export class EbtDoc {
|
|
|
27
31
|
|
|
28
32
|
static create(opts = {}) {
|
|
29
33
|
const msg = 'E4c.create:';
|
|
30
|
-
let {
|
|
34
|
+
let { segMap = {}, parent = {}, suid, bilaraPath } = opts;
|
|
31
35
|
if (segMap == null) {
|
|
32
36
|
throw new Error(`${msg} segMap?`);
|
|
33
37
|
}
|
package/src/text/legacy-doc.mjs
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { DBG } from '../defines.mjs';
|
|
2
|
+
|
|
1
3
|
let privateCtor = false;
|
|
2
4
|
|
|
3
5
|
const HTML_FILTER = (() => {
|
|
@@ -35,13 +37,38 @@ export class LegacyDoc {
|
|
|
35
37
|
return true;
|
|
36
38
|
}
|
|
37
39
|
|
|
38
|
-
static
|
|
40
|
+
static async fetchLegacy(opts = {}) {
|
|
41
|
+
const msg = 'L7c.fetch:';
|
|
42
|
+
const dbg = DBG.FETCH_LEGACY;
|
|
43
|
+
let {
|
|
44
|
+
endPoint = 'https://suttacentral.net/api/suttas',
|
|
45
|
+
sutta_uid,
|
|
46
|
+
lang,
|
|
47
|
+
author,
|
|
48
|
+
maxBuffer = 10 * 1024 * 1024,
|
|
49
|
+
} = opts;
|
|
50
|
+
let url = [endPoint, sutta_uid, `${author}?lang=${lang}`].join(
|
|
51
|
+
'/',
|
|
52
|
+
);
|
|
53
|
+
let res = await fetch(url);
|
|
54
|
+
if (!res.ok) {
|
|
55
|
+
throw new Error(`${msg} {res.status} ${url}`);
|
|
56
|
+
}
|
|
57
|
+
let json = await res.json();
|
|
58
|
+
let { translation } = json;
|
|
59
|
+
return LegacyDoc.create(translation);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
static create(translation) {
|
|
39
63
|
const msg = 'LegacyDoc.create:';
|
|
40
64
|
if (typeof legacy === 'string') {
|
|
41
65
|
legacy = JSON.parse(legacy);
|
|
42
66
|
}
|
|
43
67
|
|
|
44
|
-
let { uid, lang, title, author, author_uid, text } =
|
|
68
|
+
let { uid, lang, title, author, author_uid, text } = translation;
|
|
69
|
+
if (typeof text === 'string') {
|
|
70
|
+
text = text.split('\n');
|
|
71
|
+
}
|
|
45
72
|
|
|
46
73
|
let para;
|
|
47
74
|
let lines = text.filter((line) => !HTML_FILTER.test(line));
|
package/src/text/word-space.mjs
CHANGED
|
@@ -63,7 +63,7 @@ class Vector extends Object {
|
|
|
63
63
|
}, 0);
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
-
intersect(vec2) {
|
|
66
|
+
intersect(vec2 = {}) {
|
|
67
67
|
let keys = Object.keys(this);
|
|
68
68
|
return keys.reduce((a, k) => {
|
|
69
69
|
let v1 = this[k];
|
|
@@ -89,26 +89,21 @@ class Vector extends Object {
|
|
|
89
89
|
}
|
|
90
90
|
} // Vector
|
|
91
91
|
|
|
92
|
-
export class
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
minWord = 4, // minimum word length
|
|
97
|
-
normalize,
|
|
98
|
-
normalizeVector = WordSpace.normalizeVector,
|
|
99
|
-
wordMap = {}, // word replacement map
|
|
100
|
-
reWordMap,
|
|
101
|
-
} = opts;
|
|
102
|
-
|
|
103
|
-
wordMap = Object.keys(wordMap).reduce((a, w) => {
|
|
92
|
+
export class WordMapTransformer {
|
|
93
|
+
// DEPRECATED
|
|
94
|
+
constructor(oWordMap = {}, opts = {}) {
|
|
95
|
+
let wordMap = Object.keys(oWordMap).reduce((a, w) => {
|
|
104
96
|
let wLow = w.toLowerCase();
|
|
105
|
-
a[wLow] =
|
|
97
|
+
a[wLow] = oWordMap[w].toLowerCase();
|
|
106
98
|
return a;
|
|
107
99
|
}, {});
|
|
100
|
+
|
|
101
|
+
let { lang = 'en', normalize } = opts;
|
|
102
|
+
|
|
108
103
|
if (!normalize) {
|
|
109
104
|
switch (lang) {
|
|
110
105
|
case 'fr':
|
|
111
|
-
normalize =
|
|
106
|
+
normalize = WordMapTransformer.normalizeFR;
|
|
112
107
|
break;
|
|
113
108
|
default:
|
|
114
109
|
normalize = (s) => s;
|
|
@@ -116,27 +111,8 @@ export class WordSpace {
|
|
|
116
111
|
}
|
|
117
112
|
}
|
|
118
113
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
minWord,
|
|
122
|
-
normalize,
|
|
123
|
-
normalizeVector,
|
|
124
|
-
reWordMap,
|
|
125
|
-
wordMap,
|
|
126
|
-
});
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
static compileWordMap(wordMap) {
|
|
130
|
-
return (
|
|
131
|
-
wordMap &&
|
|
132
|
-
Object.keys(wordMap).map((pat) => {
|
|
133
|
-
let rep = wordMap[pat];
|
|
134
|
-
return {
|
|
135
|
-
re: new RegExp(pat, 'iugm'),
|
|
136
|
-
rep,
|
|
137
|
-
};
|
|
138
|
-
})
|
|
139
|
-
);
|
|
114
|
+
this.wordMap = wordMap;
|
|
115
|
+
this.normalize = normalize;
|
|
140
116
|
}
|
|
141
117
|
|
|
142
118
|
static normalizeFR(s) {
|
|
@@ -150,34 +126,92 @@ export class WordSpace {
|
|
|
150
126
|
.trim();
|
|
151
127
|
}
|
|
152
128
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
129
|
+
#compileWordMap() {
|
|
130
|
+
let { wordMap } = this;
|
|
131
|
+
return Object.keys(wordMap).map((pat) => {
|
|
132
|
+
let rep = wordMap[pat];
|
|
133
|
+
return {
|
|
134
|
+
re: new RegExp(pat, 'iugm'),
|
|
135
|
+
rep,
|
|
136
|
+
};
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
transform(text) {
|
|
141
|
+
const msg = 'W16r.transform:';
|
|
142
|
+
const dbg = DBG.WORD_MAP_TRANSFORMER;
|
|
143
|
+
let { wordMap, reWordMap, normalize } = this;
|
|
157
144
|
if (reWordMap == null) {
|
|
158
|
-
reWordMap =
|
|
145
|
+
reWordMap = this.#compileWordMap();
|
|
159
146
|
this.reWordMap = reWordMap;
|
|
160
147
|
}
|
|
161
148
|
dbg && console.log(msg, { text });
|
|
162
|
-
let
|
|
149
|
+
let textMapped = text;
|
|
163
150
|
for (let i = 0; i < reWordMap.length; i++) {
|
|
164
151
|
let { re, rep } = reWordMap[i];
|
|
165
|
-
|
|
166
|
-
dbg && console.log(msg, { i,
|
|
152
|
+
textMapped = textMapped.replaceAll(re, rep);
|
|
153
|
+
dbg && console.log(msg, { i, textMapped, re });
|
|
167
154
|
}
|
|
155
|
+
let rslt = normalize(textMapped)
|
|
156
|
+
.toLowerCase()
|
|
157
|
+
.trim()
|
|
158
|
+
.replace(/[-]/g, ' ')
|
|
159
|
+
.replace(/[.,_:;"'“”‘’!?]/g, '');
|
|
168
160
|
return rslt;
|
|
169
161
|
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
export class WordSpace {
|
|
165
|
+
constructor(opts = {}) {
|
|
166
|
+
let {
|
|
167
|
+
lang, // 2-letter code: fr, en, es, pt
|
|
168
|
+
minWord = 4, // minimum word length
|
|
169
|
+
normalize,
|
|
170
|
+
normalizeVector = WordSpace.normalizeVector,
|
|
171
|
+
transformText,
|
|
172
|
+
transformer,
|
|
173
|
+
reWordMap,
|
|
174
|
+
} = opts;
|
|
175
|
+
|
|
176
|
+
if (transformer == null) {
|
|
177
|
+
let wordMap = opts.wordMap;
|
|
178
|
+
transformer = new WordMapTransformer(wordMap, {
|
|
179
|
+
lang,
|
|
180
|
+
normalize,
|
|
181
|
+
});
|
|
182
|
+
if (transformText == null) {
|
|
183
|
+
transformText = (text) => transformer.transform(text);
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
Object.defineProperty(this, 'transformText', {
|
|
187
|
+
value: transformText,
|
|
188
|
+
});
|
|
189
|
+
|
|
190
|
+
Object.assign(this, {
|
|
191
|
+
lang,
|
|
192
|
+
minWord,
|
|
193
|
+
normalizeVector,
|
|
194
|
+
reWordMap,
|
|
195
|
+
transformer,
|
|
196
|
+
wordMap: opts.wordMap, // DEPRECATED
|
|
197
|
+
});
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
static get WordMapTransformer() {
|
|
201
|
+
return WordMapTransformer;
|
|
202
|
+
}
|
|
170
203
|
|
|
171
204
|
static get Vector() {
|
|
172
205
|
return Vector;
|
|
173
206
|
}
|
|
174
207
|
|
|
175
|
-
|
|
176
|
-
|
|
208
|
+
// Golden Ratio fudge factor scales a count of 1 to ~0.8
|
|
209
|
+
// 1.6180339887498948482045868343656381177203091798057628621354
|
|
210
|
+
static normalizeVector(v, scale=1.618033988749895) {
|
|
177
211
|
let vNew = new Vector(v);
|
|
178
212
|
Object.entries(v).forEach((e) => {
|
|
179
213
|
let [key, value] = e;
|
|
180
|
-
vNew[key] = 1 - Math.exp(-value
|
|
214
|
+
vNew[key] = 1 - Math.exp(-value * scale);
|
|
181
215
|
});
|
|
182
216
|
|
|
183
217
|
return vNew;
|
|
@@ -189,13 +223,8 @@ export class WordSpace {
|
|
|
189
223
|
throw new Error(`${msg} str?`);
|
|
190
224
|
}
|
|
191
225
|
let dbg = 0;
|
|
192
|
-
let { normalize, normalizeVector, minWord
|
|
193
|
-
let
|
|
194
|
-
let sNorm = normalize(sWordMap)
|
|
195
|
-
.toLowerCase()
|
|
196
|
-
.trim()
|
|
197
|
-
.replace(/[-]/g, ' ')
|
|
198
|
-
.replace(/[.,_:;"'“”‘’!?]/g, '');
|
|
226
|
+
let { normalize, normalizeVector, minWord } = this;
|
|
227
|
+
let sNorm = this.transformText(str);
|
|
199
228
|
let words = sNorm.split(' ');
|
|
200
229
|
let v = words.reduce((a, w) => {
|
|
201
230
|
if (w.length >= minWord) {
|