@sc-voice/tools 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.mjs +21 -6
- package/package.json +2 -1
- package/src/defines.mjs +6 -1
- package/src/graph/sankey.mjs +56 -0
- package/src/text/ebt-doc.mjs +6 -2
- package/src/text/legacy-doc.mjs +29 -2
- package/src/text/word-space.mjs +83 -54
- package/src/{text → translate}/aligner.mjs +127 -38
- package/src/translate/deepl-adapter.mjs +353 -0
- package/src/translate/dpd-transformer.mjs +17 -0
- package/src/translate/mock-deepl.mjs +351 -0
- package/src/translate/quote-parser.mjs +681 -0
package/index.mjs
CHANGED
|
@@ -5,9 +5,6 @@ export const ScvMath = {
|
|
|
5
5
|
Fraction,
|
|
6
6
|
};
|
|
7
7
|
|
|
8
|
-
import {
|
|
9
|
-
Aligner, Alignment, AlignmentStatus
|
|
10
|
-
} from './src/text/aligner.mjs';
|
|
11
8
|
import { BilaraPath } from './src/text/bilara-path.mjs';
|
|
12
9
|
import { EbtDoc } from './src/text/ebt-doc.mjs';
|
|
13
10
|
import { LegacyDoc } from './src/text/legacy-doc.mjs';
|
|
@@ -17,9 +14,6 @@ import { Unicode } from './src/text/unicode.mjs';
|
|
|
17
14
|
import { WordSpace } from './src/text/word-space.mjs';
|
|
18
15
|
|
|
19
16
|
export const Text = {
|
|
20
|
-
Aligner,
|
|
21
|
-
Alignment,
|
|
22
|
-
AlignmentStatus,
|
|
23
17
|
BilaraPath,
|
|
24
18
|
EbtDoc,
|
|
25
19
|
LegacyDoc,
|
|
@@ -29,3 +23,24 @@ export const Text = {
|
|
|
29
23
|
WordSpace,
|
|
30
24
|
};
|
|
31
25
|
|
|
26
|
+
import { default as Sankey } from './src/graph/sankey.mjs';
|
|
27
|
+
export const Graph = {
|
|
28
|
+
Sankey,
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
import {
|
|
32
|
+
Aligner, Alignment, AlignmentStatus
|
|
33
|
+
} from './src/translate/aligner.mjs';
|
|
34
|
+
import { DpdTransformer } from './src/translate/dpd-transformer.mjs';
|
|
35
|
+
import { MockDeepL } from './src/translate/mock-deepl.mjs';
|
|
36
|
+
import { DeepLAdapter } from './src/translate/deepl-adapter.mjs';
|
|
37
|
+
import { QuoteParser } from './src/translate/quote-parser.mjs';
|
|
38
|
+
export const Translate = {
|
|
39
|
+
Aligner,
|
|
40
|
+
Alignment,
|
|
41
|
+
AlignmentStatus,
|
|
42
|
+
DeepLAdapter,
|
|
43
|
+
DpdTransformer,
|
|
44
|
+
MockDeepL,
|
|
45
|
+
QuoteParser,
|
|
46
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sc-voice/tools",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "Utilities for SC-Voice",
|
|
5
5
|
"main": "index.mjs",
|
|
6
6
|
"files": [
|
|
@@ -31,6 +31,7 @@
|
|
|
31
31
|
"homepage": "https://github.com/sc-voice/tools#readme",
|
|
32
32
|
"devDependencies": {
|
|
33
33
|
"@biomejs/biome": "1.9.4",
|
|
34
|
+
"deepl-node": "^1.15.0",
|
|
34
35
|
"eslint": "^9.17.0",
|
|
35
36
|
"mocha": "^11.0.1",
|
|
36
37
|
"should": "^13.2.3"
|
package/src/defines.mjs
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
|
|
2
2
|
/**
 * Debug switches read by the library as `DBG.<NAME>`.
 * A falsy value (0) disables the corresponding console output.
 * Some switches are also compared against a segment id (e.g.
 * ML_DOC_VECTORS: 'mn8:3.4') to log a single segment only.
 */
export const DBG = {
  ALIGN_ALL: 0,
  ALIGN_LINE: 0,
  CREATE_ALIGNMENT: 0,
  ML_DOC_VECTORS: 0, // 'mn8:3.4',
  MN8_MOHAN: 0,
  DEEPL_ADAPTER: 0,
  DEEPL_MOCK: 0, // use mock-deepl
  DEEPL_MOCK_XLT: 0, // use mock translation
  DEEPL_TEST_API: 0, // test with live DeepL API ($$$)
  DEEPL_XLT: 0, // test live translation
  PALI_TRANSFORMER: 0,
  WORD_MAP_TRANSFORMER: 0, // spelling read by WordMapTransformer.transform()
  WORD_MAP_TRANFORMER: 0, // DEPRECATED: misspelled key kept for compatibility
};
|
|
9
14
|
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { DBG } from '../defines.mjs';
|
|
2
|
+
|
|
3
|
+
/**
 * Accumulates weighted source→target links for a Sankey diagram and
 * counts how many link endpoints reference each node id.
 */
export default class Sankey {
  /**
   * @param {object} [opts]
   * @param {Array<{source, target, value}>} [opts.links] initial links.
   *   The array is shallow-copied, so the link objects themselves are
   *   shared with the caller and are mutated by addLink().
   */
  constructor(opts = {}) {
    let { links = [] } = opts;

    Object.assign(this, {
      links: [...links],
    });

    // nodeMap and linkMap are bookkeeping: defined via defineProperty
    // they are non-enumerable, so only `links` shows up when the
    // instance is enumerated or serialized.
    let nodeMap = {};
    Object.defineProperty(this, 'nodeMap', { value: nodeMap });

    // organize links by source/target
    let linkMap = {};
    Object.defineProperty(this, 'linkMap', { value: linkMap });
    this.links.forEach((link) => {
      let { source, target } = link;
      let key = `${source}|${target}`;
      linkMap[key] = link;
      nodeMap[source] = (nodeMap[source] || 0) + 1;
      nodeMap[target] = (nodeMap[target] || 0) + 1;
    });
  }

  /**
   * @returns {Array<{id: string}>} one entry per node id seen so far,
   *   in the order the ids were first recorded in nodeMap.
   */
  get nodes() {
    let { nodeMap } = this;
    return Object.keys(nodeMap).map((n) => ({ id: n }));
  }

  /**
   * Add `value` to the link from `source` to `target`, creating the
   * link (with value 0) if it does not exist yet.
   *
   * @param {object} link
   * @param {*} link.source source node id (required)
   * @param {*} link.target target node id (required)
   * @param {number} [link.value=0] amount added to the link value
   * @throws {Error} if source or target is null/undefined, or value is NaN
   */
  addLink({ source, target, value = 0 }) {
    const msg = 'Sankey.addLink()';
    let { links, linkMap, nodeMap } = this;

    // Validate everything before touching any state, so a rejected
    // call cannot leave a phantom node behind in nodeMap.
    if (source == null) {
      throw new Error(`${msg} source? [${source}]`);
    }
    if (target == null) {
      throw new Error(`${msg} target? [${target}]`);
    }
    if (Number.isNaN(value)) {
      throw new Error(`${msg} value? [${value}]`);
    }

    nodeMap[source] = (nodeMap[source] || 0) + 1;
    nodeMap[target] = (nodeMap[target] || 0) + 1;

    let key = `${source}|${target}`;
    let link = linkMap[key];
    if (link == null) {
      link = linkMap[key] = { source, target, value: 0 };
      links.push(link);
    }
    link.value += value;
  }
}
|
package/src/text/ebt-doc.mjs
CHANGED
|
@@ -3,7 +3,11 @@ import { SuttaCentralId } from './sutta-central-id.mjs';
|
|
|
3
3
|
let privateCtor = false;
|
|
4
4
|
|
|
5
5
|
const INHERITED_KEYS = [
|
|
6
|
-
'lang',
|
|
6
|
+
'lang',
|
|
7
|
+
'author',
|
|
8
|
+
'author_uid',
|
|
9
|
+
'wordSpace',
|
|
10
|
+
'footer',
|
|
7
11
|
];
|
|
8
12
|
|
|
9
13
|
const HDR_KEY = '__header__';
|
|
@@ -27,7 +31,7 @@ export class EbtDoc {
|
|
|
27
31
|
|
|
28
32
|
static create(opts = {}) {
|
|
29
33
|
const msg = 'E4c.create:';
|
|
30
|
-
let {
|
|
34
|
+
let { segMap = {}, parent = {}, suid, bilaraPath } = opts;
|
|
31
35
|
if (segMap == null) {
|
|
32
36
|
throw new Error(`${msg} segMap?`);
|
|
33
37
|
}
|
package/src/text/legacy-doc.mjs
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { DBG } from '../defines.mjs';
|
|
2
|
+
|
|
1
3
|
let privateCtor = false;
|
|
2
4
|
|
|
3
5
|
const HTML_FILTER = (() => {
|
|
@@ -35,13 +37,38 @@ export class LegacyDoc {
|
|
|
35
37
|
return true;
|
|
36
38
|
}
|
|
37
39
|
|
|
38
|
-
static
|
|
40
|
+
static async fetchLegacy(opts = {}) {
|
|
41
|
+
const msg = 'L7c.fetch:';
|
|
42
|
+
const dbg = DBG.FETCH_LEGACY;
|
|
43
|
+
let {
|
|
44
|
+
endPoint = 'https://suttacentral.net/api/suttas',
|
|
45
|
+
sutta_uid,
|
|
46
|
+
lang,
|
|
47
|
+
author,
|
|
48
|
+
maxBuffer = 10 * 1024 * 1024,
|
|
49
|
+
} = opts;
|
|
50
|
+
let url = [endPoint, sutta_uid, `${author}?lang=${lang}`].join(
|
|
51
|
+
'/',
|
|
52
|
+
);
|
|
53
|
+
let res = await fetch(url);
|
|
54
|
+
if (!res.ok) {
|
|
55
|
+
throw new Error(`${msg} {res.status} ${url}`);
|
|
56
|
+
}
|
|
57
|
+
let json = await res.json();
|
|
58
|
+
let { translation } = json;
|
|
59
|
+
return LegacyDoc.create(translation);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
static create(translation) {
|
|
39
63
|
const msg = 'LegacyDoc.create:';
|
|
40
64
|
if (typeof legacy === 'string') {
|
|
41
65
|
legacy = JSON.parse(legacy);
|
|
42
66
|
}
|
|
43
67
|
|
|
44
|
-
let { uid, lang, title, author, author_uid, text } =
|
|
68
|
+
let { uid, lang, title, author, author_uid, text } = translation;
|
|
69
|
+
if (typeof text === 'string') {
|
|
70
|
+
text = text.split('\n');
|
|
71
|
+
}
|
|
45
72
|
|
|
46
73
|
let para;
|
|
47
74
|
let lines = text.filter((line) => !HTML_FILTER.test(line));
|
package/src/text/word-space.mjs
CHANGED
|
@@ -63,7 +63,7 @@ class Vector extends Object {
|
|
|
63
63
|
}, 0);
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
-
intersect(vec2) {
|
|
66
|
+
intersect(vec2 = {}) {
|
|
67
67
|
let keys = Object.keys(this);
|
|
68
68
|
return keys.reduce((a, k) => {
|
|
69
69
|
let v1 = this[k];
|
|
@@ -89,26 +89,21 @@ class Vector extends Object {
|
|
|
89
89
|
}
|
|
90
90
|
} // Vector
|
|
91
91
|
|
|
92
|
-
export class
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
minWord = 4, // minimum word length
|
|
97
|
-
normalize,
|
|
98
|
-
normalizeVector = WordSpace.normalizeVector,
|
|
99
|
-
wordMap = {}, // word replacement map
|
|
100
|
-
reWordMap,
|
|
101
|
-
} = opts;
|
|
102
|
-
|
|
103
|
-
wordMap = Object.keys(wordMap).reduce((a, w) => {
|
|
92
|
+
export class WordMapTransformer {
|
|
93
|
+
// DEPRECATED
|
|
94
|
+
constructor(oWordMap = {}, opts = {}) {
|
|
95
|
+
let wordMap = Object.keys(oWordMap).reduce((a, w) => {
|
|
104
96
|
let wLow = w.toLowerCase();
|
|
105
|
-
a[wLow] =
|
|
97
|
+
a[wLow] = oWordMap[w].toLowerCase();
|
|
106
98
|
return a;
|
|
107
99
|
}, {});
|
|
100
|
+
|
|
101
|
+
let { lang = 'en', normalize } = opts;
|
|
102
|
+
|
|
108
103
|
if (!normalize) {
|
|
109
104
|
switch (lang) {
|
|
110
105
|
case 'fr':
|
|
111
|
-
normalize =
|
|
106
|
+
normalize = WordMapTransformer.normalizeFR;
|
|
112
107
|
break;
|
|
113
108
|
default:
|
|
114
109
|
normalize = (s) => s;
|
|
@@ -116,27 +111,8 @@ export class WordSpace {
|
|
|
116
111
|
}
|
|
117
112
|
}
|
|
118
113
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
minWord,
|
|
122
|
-
normalize,
|
|
123
|
-
normalizeVector,
|
|
124
|
-
reWordMap,
|
|
125
|
-
wordMap,
|
|
126
|
-
});
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
static compileWordMap(wordMap) {
|
|
130
|
-
return (
|
|
131
|
-
wordMap &&
|
|
132
|
-
Object.keys(wordMap).map((pat) => {
|
|
133
|
-
let rep = wordMap[pat];
|
|
134
|
-
return {
|
|
135
|
-
re: new RegExp(pat, 'iugm'),
|
|
136
|
-
rep,
|
|
137
|
-
};
|
|
138
|
-
})
|
|
139
|
-
);
|
|
114
|
+
this.wordMap = wordMap;
|
|
115
|
+
this.normalize = normalize;
|
|
140
116
|
}
|
|
141
117
|
|
|
142
118
|
static normalizeFR(s) {
|
|
@@ -150,34 +126,92 @@ export class WordSpace {
|
|
|
150
126
|
.trim();
|
|
151
127
|
}
|
|
152
128
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
129
|
+
#compileWordMap() {
|
|
130
|
+
let { wordMap } = this;
|
|
131
|
+
return Object.keys(wordMap).map((pat) => {
|
|
132
|
+
let rep = wordMap[pat];
|
|
133
|
+
return {
|
|
134
|
+
re: new RegExp(pat, 'iugm'),
|
|
135
|
+
rep,
|
|
136
|
+
};
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
transform(text) {
|
|
141
|
+
const msg = 'W16r.transform:';
|
|
142
|
+
const dbg = DBG.WORD_MAP_TRANSFORMER;
|
|
143
|
+
let { wordMap, reWordMap, normalize } = this;
|
|
157
144
|
if (reWordMap == null) {
|
|
158
|
-
reWordMap =
|
|
145
|
+
reWordMap = this.#compileWordMap();
|
|
159
146
|
this.reWordMap = reWordMap;
|
|
160
147
|
}
|
|
161
148
|
dbg && console.log(msg, { text });
|
|
162
|
-
let
|
|
149
|
+
let textMapped = text;
|
|
163
150
|
for (let i = 0; i < reWordMap.length; i++) {
|
|
164
151
|
let { re, rep } = reWordMap[i];
|
|
165
|
-
|
|
166
|
-
dbg && console.log(msg, { i,
|
|
152
|
+
textMapped = textMapped.replaceAll(re, rep);
|
|
153
|
+
dbg && console.log(msg, { i, textMapped, re });
|
|
167
154
|
}
|
|
155
|
+
let rslt = normalize(textMapped)
|
|
156
|
+
.toLowerCase()
|
|
157
|
+
.trim()
|
|
158
|
+
.replace(/[-]/g, ' ')
|
|
159
|
+
.replace(/[.,_:;"'“”‘’!?]/g, '');
|
|
168
160
|
return rslt;
|
|
169
161
|
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
export class WordSpace {
|
|
165
|
+
constructor(opts = {}) {
|
|
166
|
+
let {
|
|
167
|
+
lang, // 2-letter code: fr, en, es, pt
|
|
168
|
+
minWord = 4, // minimum word length
|
|
169
|
+
normalize,
|
|
170
|
+
normalizeVector = WordSpace.normalizeVector,
|
|
171
|
+
transformText,
|
|
172
|
+
transformer,
|
|
173
|
+
reWordMap,
|
|
174
|
+
} = opts;
|
|
175
|
+
|
|
176
|
+
if (transformer == null) {
|
|
177
|
+
let wordMap = opts.wordMap;
|
|
178
|
+
transformer = new WordMapTransformer(wordMap, {
|
|
179
|
+
lang,
|
|
180
|
+
normalize,
|
|
181
|
+
});
|
|
182
|
+
if (transformText == null) {
|
|
183
|
+
transformText = (text) => transformer.transform(text);
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
Object.defineProperty(this, 'transformText', {
|
|
187
|
+
value: transformText,
|
|
188
|
+
});
|
|
189
|
+
|
|
190
|
+
Object.assign(this, {
|
|
191
|
+
lang,
|
|
192
|
+
minWord,
|
|
193
|
+
normalizeVector,
|
|
194
|
+
reWordMap,
|
|
195
|
+
transformer,
|
|
196
|
+
wordMap: opts.wordMap, // DEPRECATED
|
|
197
|
+
});
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
static get WordMapTransformer() {
|
|
201
|
+
return WordMapTransformer;
|
|
202
|
+
}
|
|
170
203
|
|
|
171
204
|
static get Vector() {
|
|
172
205
|
return Vector;
|
|
173
206
|
}
|
|
174
207
|
|
|
175
|
-
|
|
176
|
-
|
|
208
|
+
// Golden Ratio fudge factor scales a count of 1 to ~0.8
|
|
209
|
+
// 1.6180339887498948482045868343656381177203091798057628621354
|
|
210
|
+
static normalizeVector(v, scale=1.618033988749895) {
|
|
177
211
|
let vNew = new Vector(v);
|
|
178
212
|
Object.entries(v).forEach((e) => {
|
|
179
213
|
let [key, value] = e;
|
|
180
|
-
vNew[key] = 1 - Math.exp(-value
|
|
214
|
+
vNew[key] = 1 - Math.exp(-value * scale);
|
|
181
215
|
});
|
|
182
216
|
|
|
183
217
|
return vNew;
|
|
@@ -189,13 +223,8 @@ export class WordSpace {
|
|
|
189
223
|
throw new Error(`${msg} str?`);
|
|
190
224
|
}
|
|
191
225
|
let dbg = 0;
|
|
192
|
-
let { normalize, normalizeVector, minWord
|
|
193
|
-
let
|
|
194
|
-
let sNorm = normalize(sWordMap)
|
|
195
|
-
.toLowerCase()
|
|
196
|
-
.trim()
|
|
197
|
-
.replace(/[-]/g, ' ')
|
|
198
|
-
.replace(/[.,_:;"'“”‘’!?]/g, '');
|
|
226
|
+
let { normalize, normalizeVector, minWord } = this;
|
|
227
|
+
let sNorm = this.transformText(str);
|
|
199
228
|
let words = sNorm.split(' ');
|
|
200
229
|
let v = words.reduce((a, w) => {
|
|
201
230
|
if (w.length >= minWord) {
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import { DBG } from '../defines.mjs';
|
|
2
2
|
import { Fraction } from '../math/fraction.mjs';
|
|
3
|
-
import { EbtDoc } from '
|
|
4
|
-
import { LegacyDoc } from '
|
|
5
|
-
import { SuttaCentralId } from '
|
|
6
|
-
import { Unicode } from '
|
|
7
|
-
import {
|
|
3
|
+
import { EbtDoc } from '../text/ebt-doc.mjs';
|
|
4
|
+
import { LegacyDoc } from '../text/legacy-doc.mjs';
|
|
5
|
+
import { SuttaCentralId } from '../text/sutta-central-id.mjs';
|
|
6
|
+
import { Unicode } from '../text/unicode.mjs';
|
|
7
|
+
import {
|
|
8
|
+
WordMapTransformer,
|
|
9
|
+
WordSpace,
|
|
10
|
+
} from '../text/word-space.mjs';
|
|
8
11
|
|
|
9
12
|
const STATE_OK = 'ok';
|
|
10
13
|
const STATE_WARN = 'warn';
|
|
@@ -33,13 +36,72 @@ const {
|
|
|
33
36
|
|
|
34
37
|
let alignmentCtor = false;
|
|
35
38
|
|
|
39
|
+
/**
 * Decorator around another transformer. Besides delegating
 * transform()/normalize(), it precompiles one RegExp per non-empty
 * wordMap value (named paliText by this code) into `reList`.
 */
class PaliTransformer {
  /**
   * @param {object} transformer wrapped transformer; must expose
   *   `wordMap`, `transform(text)` and `normalize(text)`
   */
  constructor(transformer) {
    const { wordMap } = transformer;
    this.transformer = transformer;

    // Map: paliText => RegExp anchored at a leading word boundary,
    // global and case-insensitive. Empty/falsy values are skipped.
    const compiled = new Map();
    for (const paliText of Object.values(wordMap)) {
      if (!paliText) {
        continue;
      }
      compiled.set(paliText, new RegExp(`\\b${paliText}`, 'gi'));
    }
    this.reList = compiled;
  }

  /** The wrapped transformer's wordMap. */
  get wordMap() {
    return this.transformer.wordMap;
  }

  /** Delegate to the wrapped transformer, logging when debugging is on. */
  transform(text) {
    if (DBG.PALI_TRANSFORMER) {
      console.log('P14r.transform', text);
    }
    return this.transformer.transform(text);
  }

  /** Delegate to the wrapped transformer, logging when debugging is on. */
  normalize(text) {
    if (DBG.PALI_TRANSFORMER) {
      console.log('P14r.normalize', text);
    }
    return this.transformer.normalize(text);
  }
}
|
|
76
|
+
|
|
77
|
+
/**
 * Transformer that holds a dictionary. In this version transform() and
 * normalize() are pass-through: they return their input unchanged.
 */
export class DpdTransformer {
  /**
   * @param {object} [opts]
   * @param {*} opts.dictionary required; stored as `this.dictionary`
   * @throws {Error} if dictionary is null or undefined
   */
  constructor(opts = {}) {
    const label = 'D12r.ctor:';
    const dictionary = opts.dictionary;
    if (dictionary === undefined || dictionary === null) {
      throw new Error(`${label} dictionary?`);
    }

    this.dictionary = dictionary;
  }

  /** @returns the given text, unchanged. */
  transform(text) {
    const unchanged = text;
    return unchanged;
  }

  /** @returns the given text, unchanged. */
  normalize(text) {
    const unchanged = text;
    return unchanged;
  }
}
|
|
96
|
+
|
|
36
97
|
export class Aligner {
|
|
37
98
|
constructor(opts = {}) {
|
|
38
|
-
const msg = '
|
|
99
|
+
const msg = 'A5r.ctor:';
|
|
39
100
|
let {
|
|
40
|
-
|
|
101
|
+
alignMethod = 'alignPali',
|
|
41
102
|
authorAligned, // author of segment aligned document
|
|
42
103
|
authorLegacy, // author of legacy document
|
|
104
|
+
dbgScid,
|
|
43
105
|
groupDecay = 0.5, // group exponential decay
|
|
44
106
|
groupSize = 1, // comparison group size
|
|
45
107
|
lang, // 2-letter ISO language (en, fr, es, pt)
|
|
@@ -52,16 +114,26 @@ export class Aligner {
|
|
|
52
114
|
wordSpace,
|
|
53
115
|
} = opts;
|
|
54
116
|
if (wordSpace == null) {
|
|
55
|
-
wordSpace = new WordSpace({
|
|
117
|
+
wordSpace = new WordSpace({
|
|
118
|
+
lang,
|
|
119
|
+
minWord,
|
|
120
|
+
normalizeVector,
|
|
121
|
+
});
|
|
122
|
+
}
|
|
123
|
+
if (alignMethod === 'alignPali') {
|
|
124
|
+
wordSpace.transformer = new PaliTransformer(
|
|
125
|
+
wordSpace.transformer,
|
|
126
|
+
);
|
|
56
127
|
}
|
|
57
128
|
if (lang == null) {
|
|
58
129
|
lang = wordSpace.lang;
|
|
59
130
|
}
|
|
60
131
|
|
|
61
132
|
Object.assign(this, {
|
|
62
|
-
|
|
133
|
+
alignMethod,
|
|
63
134
|
authorAligned,
|
|
64
135
|
authorLegacy,
|
|
136
|
+
dbgScid,
|
|
65
137
|
groupSize,
|
|
66
138
|
groupDecay,
|
|
67
139
|
lang,
|
|
@@ -97,6 +169,7 @@ export class Aligner {
|
|
|
97
169
|
const msg = 'A7t.createAlignment:';
|
|
98
170
|
const dbg = DBG.CREATE_ALIGNMENT;
|
|
99
171
|
let {
|
|
172
|
+
dbgScid = this.dbgScid,
|
|
100
173
|
legacyDoc,
|
|
101
174
|
mlDoc,
|
|
102
175
|
minScore = this.minScore,
|
|
@@ -112,7 +185,8 @@ export class Aligner {
|
|
|
112
185
|
throw new Error(`${msg} mlDoc?`);
|
|
113
186
|
}
|
|
114
187
|
|
|
115
|
-
let
|
|
188
|
+
let { author, author_uid, lines, footer } = legacyDoc;
|
|
189
|
+
let nLines = lines.length;
|
|
116
190
|
let lineCursor = new Fraction(0, nLines, 'lines');
|
|
117
191
|
let scids = Object.keys(mlDoc.segMap);
|
|
118
192
|
let nSegs = scids.length;
|
|
@@ -128,18 +202,25 @@ export class Aligner {
|
|
|
128
202
|
throw new Error(`${msg} minScanSize? ${minScanSize} `);
|
|
129
203
|
}
|
|
130
204
|
|
|
131
|
-
let { sutta_uid:suid, docAuthor, bilaraPaths } = mlDoc;
|
|
132
|
-
let
|
|
133
|
-
let bilaraPath = bilaraPaths.reduce((a,p)=>{
|
|
205
|
+
let { sutta_uid: suid, docAuthor, bilaraPaths } = mlDoc;
|
|
206
|
+
let bilaraPath = bilaraPaths.reduce((a, p) => {
|
|
134
207
|
if (p.includes(docAuthor)) {
|
|
135
208
|
a = p.replaceAll(docAuthor, author_uid);
|
|
136
209
|
}
|
|
137
210
|
return a;
|
|
138
211
|
});
|
|
139
|
-
let docOpts = {
|
|
212
|
+
let docOpts = {
|
|
213
|
+
suid,
|
|
214
|
+
lang,
|
|
215
|
+
author,
|
|
216
|
+
author_uid,
|
|
217
|
+
bilaraPath,
|
|
218
|
+
footer,
|
|
219
|
+
};
|
|
140
220
|
|
|
141
221
|
const optsAlignment = {
|
|
142
222
|
aligner: this,
|
|
223
|
+
dbgScid,
|
|
143
224
|
ebtDoc: EbtDoc.create(docOpts),
|
|
144
225
|
legacyDoc,
|
|
145
226
|
lineCursor,
|
|
@@ -162,14 +243,14 @@ export class Aligner {
|
|
|
162
243
|
mlDocVectors(mld) {
|
|
163
244
|
const msg = 'Aligner.mlDocVectors';
|
|
164
245
|
const dbg = DBG.ML_DOC_VECTORS;
|
|
165
|
-
let {
|
|
166
|
-
let { wordMap } = wordSpace;
|
|
246
|
+
let { alignMethod, groupDecay, groupSize, wordSpace } = this;
|
|
247
|
+
let { wordMap } = wordSpace.transformer;
|
|
167
248
|
let { segMap, lang } = mld;
|
|
168
249
|
let segs = Object.entries(segMap);
|
|
169
250
|
let iLastSeg = segs.length - 1;
|
|
170
251
|
let reList;
|
|
171
252
|
|
|
172
|
-
if (alignPali) {
|
|
253
|
+
if (alignMethod === 'alignPali') {
|
|
173
254
|
let entries = Object.entries(wordMap);
|
|
174
255
|
reList = entries.reduce((a, e) => {
|
|
175
256
|
let [legacyText, paliText] = e;
|
|
@@ -184,26 +265,34 @@ export class Aligner {
|
|
|
184
265
|
let segGroup = [];
|
|
185
266
|
for (let i = segs.length; i-- > 0; ) {
|
|
186
267
|
let [scid, seg] = segs[i];
|
|
268
|
+
let vGroup = new WordSpace.Vector();
|
|
269
|
+
|
|
187
270
|
let { pli } = seg;
|
|
188
271
|
let segData = seg[lang] || '';
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
272
|
+
switch (alignMethod) {
|
|
273
|
+
case 'alignPali':
|
|
274
|
+
{
|
|
275
|
+
// for aligning Pali, we add all Pali words that
|
|
276
|
+
// occur in the Pali for a segment to the
|
|
277
|
+
// vector input text
|
|
278
|
+
let pliWords = [];
|
|
279
|
+
reList.forEach((re, paliText, map) => {
|
|
280
|
+
let nMatch = pli.match(re)?.length || 0;
|
|
281
|
+
if (nMatch) {
|
|
282
|
+
for (let i = 0; i < nMatch; i++) {
|
|
283
|
+
pliWords.push(paliText);
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
});
|
|
287
|
+
if (pliWords.length) {
|
|
288
|
+
segData += ' ' + pliWords.join(' ');
|
|
289
|
+
dbg === scid &&
|
|
290
|
+
console.log(msg, 'segData', scid, segData);
|
|
200
291
|
}
|
|
201
292
|
}
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
dbg === scid && console.log(msg, 'segData', scid, segData);
|
|
206
|
-
}
|
|
293
|
+
break;
|
|
294
|
+
case 'DPD':
|
|
295
|
+
break;
|
|
207
296
|
}
|
|
208
297
|
segGroup.unshift(segData);
|
|
209
298
|
if (segGroup.length > groupSize) {
|
|
@@ -267,7 +356,7 @@ export class Alignment {
|
|
|
267
356
|
if (typeof opts !== 'object') {
|
|
268
357
|
throw new Error(`${msg} opts?`);
|
|
269
358
|
}
|
|
270
|
-
let { dbgScid } = opts;
|
|
359
|
+
let { dbgScid = this.dbgScid } = opts;
|
|
271
360
|
// biome-ignore format:
|
|
272
361
|
let { ebtDoc, legacyDoc, lineCursor, maxScanSize, minScanSize,
|
|
273
362
|
minScore, mlDoc, scids, segCursor, vMLDoc, wordSpace,
|
|
@@ -281,6 +370,7 @@ export class Alignment {
|
|
|
281
370
|
for (let i = 0; scanning(i); i++) {
|
|
282
371
|
let scid = scids[segCursor.numerator + i];
|
|
283
372
|
if (scid == null) {
|
|
373
|
+
console.log(error, '[1]scid?', segCursor.toString());
|
|
284
374
|
break;
|
|
285
375
|
}
|
|
286
376
|
let vSeg = vMLDoc[scid];
|
|
@@ -407,7 +497,7 @@ export class Alignment {
|
|
|
407
497
|
aligner, ebtDoc, legacyDoc, lineCursor, maxScanSize, minScanSize,
|
|
408
498
|
mlDoc, scidsExp, segCursor, vMLDoc,
|
|
409
499
|
} = this;
|
|
410
|
-
let { lang,
|
|
500
|
+
let { lang, alignMethod, wordSpace } = aligner;
|
|
411
501
|
let { segMap } = mlDoc;
|
|
412
502
|
let scids = Object.keys(segMap);
|
|
413
503
|
scids.sort(SuttaCentralId.compareLow);
|
|
@@ -417,7 +507,6 @@ export class Alignment {
|
|
|
417
507
|
|
|
418
508
|
while (lineCursor.difference < 0) {
|
|
419
509
|
let line = lines[lineCursor.numerator];
|
|
420
|
-
dbg > 1 && console.log(msg, lineCursor.toString(), line);
|
|
421
510
|
let curScid = scids[segCursor.numerator];
|
|
422
511
|
let dbgScid = scidsExp?.[lineCursor.numerator];
|
|
423
512
|
let r = this.alignLine(line, { dbgScid });
|
|
@@ -425,12 +514,12 @@ export class Alignment {
|
|
|
425
514
|
// biome-ignore format:
|
|
426
515
|
if (r == null) {
|
|
427
516
|
let { vSeg, vLegacy, intersection } = this.status;
|
|
428
|
-
dbg && console.log(
|
|
517
|
+
dbg && console.log(msg, 'UNMATCHED',
|
|
429
518
|
lineCursor.toString(),
|
|
430
519
|
segCursor.toString(),
|
|
431
520
|
{ curScid, line, minScanSize, maxScanSize, vSeg, vLegacy, intersection },
|
|
432
521
|
);
|
|
433
|
-
|
|
522
|
+
return null;
|
|
434
523
|
}
|
|
435
524
|
}
|
|
436
525
|
|