@sc-voice/tools 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,13 @@
1
1
  import { DBG } from '../defines.mjs';
2
2
  import { Fraction } from '../math/fraction.mjs';
3
- import { EbtDoc } from './ebt-doc.mjs';
4
- import { LegacyDoc } from './legacy-doc.mjs';
5
- import { SuttaCentralId } from './sutta-central-id.mjs';
6
- import { Unicode } from './unicode.mjs';
7
- import { WordSpace } from './word-space.mjs';
3
+ import { EbtDoc } from '../text/ebt-doc.mjs';
4
+ import { LegacyDoc } from '../text/legacy-doc.mjs';
5
+ import { SuttaCentralId } from '../text/sutta-central-id.mjs';
6
+ import { Unicode } from '../text/unicode.mjs';
7
+ import {
8
+ WordMapTransformer,
9
+ WordSpace,
10
+ } from '../text/word-space.mjs';
8
11
 
9
12
  const STATE_OK = 'ok';
10
13
  const STATE_WARN = 'warn';
@@ -33,13 +36,72 @@ const {
33
36
 
34
37
  let alignmentCtor = false;
35
38
 
/**
 * Wraps another transformer and precomputes one RegExp per non-empty
 * value found in that transformer's wordMap.
 *
 * transform() and normalize() delegate to the wrapped transformer;
 * the only added behavior is optional console logging controlled by
 * DBG.PALI_TRANSFORMER.
 */
class PaliTransformer {
  /**
   * @param {object} transformer - wrapped transformer; must expose
   *   `wordMap` and is expected to provide transform() and normalize().
   */
  constructor(transformer) {
    const { wordMap } = transformer;
    this.transformer = transformer;

    // Keyed by the wordMap value; empty values get no pattern.
    const patterns = new Map();
    for (const paliText of Object.values(wordMap)) {
      if (!paliText) {
        continue;
      }
      patterns.set(paliText, new RegExp(`\\b${paliText}`, 'gi'));
    }
    this.reList = patterns;
  }

  /** The wrapped transformer's wordMap (read through on every access). */
  get wordMap() {
    return this.transformer.wordMap;
  }

  /**
   * Delegates to the wrapped transformer's transform().
   * @param {string} text
   * @returns {*} whatever the wrapped transformer returns
   */
  transform(text) {
    const msg = 'P14r.transform';
    if (DBG.PALI_TRANSFORMER) {
      console.log(msg, text);
    }
    return this.transformer.transform(text);
  }

  /**
   * Delegates to the wrapped transformer's normalize().
   * @param {string} text
   * @returns {*} whatever the wrapped transformer returns
   */
  normalize(text) {
    const msg = 'P14r.normalize';
    if (DBG.PALI_TRANSFORMER) {
      console.log(msg, text);
    }
    return this.transformer.normalize(text);
  }
}
76
+
/**
 * Transformer holding a dictionary reference. Both transform() and
 * normalize() currently return their input unchanged.
 */
export class DpdTransformer {
  /**
   * @param {object} [opts]
   * @param {*} opts.dictionary - required; stored as `this.dictionary`
   * @throws {Error} when `dictionary` is null or undefined
   */
  constructor(opts = {}) {
    const msg = 'D12r.ctor:';
    const { dictionary } = opts;
    if (dictionary === null || dictionary === undefined) {
      throw new Error(`${msg} dictionary?`);
    }

    Object.assign(this, { dictionary });
  }

  /**
   * @param {string} text
   * @returns {string} the same text, untouched
   */
  transform(text) {
    return text;
  }

  /**
   * @param {string} text
   * @returns {string} the same text, untouched
   */
  normalize(text) {
    return text;
  }
}
96
+
36
97
  export class Aligner {
37
98
  constructor(opts = {}) {
38
- const msg = 'Aligner.ctor:';
99
+ const msg = 'A5r.ctor:';
39
100
  let {
40
- alignPali = true,
101
+ alignMethod = 'alignPali',
41
102
  authorAligned, // author of segment aligned document
42
103
  authorLegacy, // author of legacy document
104
+ dbgScid,
43
105
  groupDecay = 0.5, // group exponential decay
44
106
  groupSize = 1, // comparison group size
45
107
  lang, // 2-letter ISO language (en, fr, es, pt)
@@ -52,16 +114,26 @@ export class Aligner {
52
114
  wordSpace,
53
115
  } = opts;
54
116
  if (wordSpace == null) {
55
- wordSpace = new WordSpace({ lang, minWord, normalizeVector });
117
+ wordSpace = new WordSpace({
118
+ lang,
119
+ minWord,
120
+ normalizeVector,
121
+ });
122
+ }
123
+ if (alignMethod === 'alignPali') {
124
+ wordSpace.transformer = new PaliTransformer(
125
+ wordSpace.transformer,
126
+ );
56
127
  }
57
128
  if (lang == null) {
58
129
  lang = wordSpace.lang;
59
130
  }
60
131
 
61
132
  Object.assign(this, {
62
- alignPali,
133
+ alignMethod,
63
134
  authorAligned,
64
135
  authorLegacy,
136
+ dbgScid,
65
137
  groupSize,
66
138
  groupDecay,
67
139
  lang,
@@ -97,6 +169,7 @@ export class Aligner {
97
169
  const msg = 'A7t.createAlignment:';
98
170
  const dbg = DBG.CREATE_ALIGNMENT;
99
171
  let {
172
+ dbgScid = this.dbgScid,
100
173
  legacyDoc,
101
174
  mlDoc,
102
175
  minScore = this.minScore,
@@ -112,7 +185,8 @@ export class Aligner {
112
185
  throw new Error(`${msg} mlDoc?`);
113
186
  }
114
187
 
115
- let nLines = legacyDoc.lines.length;
188
+ let { author, author_uid, lines, footer } = legacyDoc;
189
+ let nLines = lines.length;
116
190
  let lineCursor = new Fraction(0, nLines, 'lines');
117
191
  let scids = Object.keys(mlDoc.segMap);
118
192
  let nSegs = scids.length;
@@ -128,18 +202,25 @@ export class Aligner {
128
202
  throw new Error(`${msg} minScanSize? ${minScanSize} `);
129
203
  }
130
204
 
131
- let { sutta_uid:suid, docAuthor, bilaraPaths } = mlDoc;
132
- let { author, author_uid } = legacyDoc;
133
- let bilaraPath = bilaraPaths.reduce((a,p)=>{
205
+ let { sutta_uid: suid, docAuthor, bilaraPaths } = mlDoc;
206
+ let bilaraPath = bilaraPaths.reduce((a, p) => {
134
207
  if (p.includes(docAuthor)) {
135
208
  a = p.replaceAll(docAuthor, author_uid);
136
209
  }
137
210
  return a;
138
211
  });
139
- let docOpts = { suid, lang, author, author_uid, bilaraPath };
212
+ let docOpts = {
213
+ suid,
214
+ lang,
215
+ author,
216
+ author_uid,
217
+ bilaraPath,
218
+ footer,
219
+ };
140
220
 
141
221
  const optsAlignment = {
142
222
  aligner: this,
223
+ dbgScid,
143
224
  ebtDoc: EbtDoc.create(docOpts),
144
225
  legacyDoc,
145
226
  lineCursor,
@@ -162,14 +243,14 @@ export class Aligner {
162
243
  mlDocVectors(mld) {
163
244
  const msg = 'Aligner.mlDocVectors';
164
245
  const dbg = DBG.ML_DOC_VECTORS;
165
- let { alignPali, groupDecay, groupSize, wordSpace } = this;
166
- let { wordMap } = wordSpace;
246
+ let { alignMethod, groupDecay, groupSize, wordSpace } = this;
247
+ let { wordMap } = wordSpace.transformer;
167
248
  let { segMap, lang } = mld;
168
249
  let segs = Object.entries(segMap);
169
250
  let iLastSeg = segs.length - 1;
170
251
  let reList;
171
252
 
172
- if (alignPali) {
253
+ if (alignMethod === 'alignPali') {
173
254
  let entries = Object.entries(wordMap);
174
255
  reList = entries.reduce((a, e) => {
175
256
  let [legacyText, paliText] = e;
@@ -184,26 +265,34 @@ export class Aligner {
184
265
  let segGroup = [];
185
266
  for (let i = segs.length; i-- > 0; ) {
186
267
  let [scid, seg] = segs[i];
268
+ let vGroup = new WordSpace.Vector();
269
+
187
270
  let { pli } = seg;
188
271
  let segData = seg[lang] || '';
189
- let vGroup = new WordSpace.Vector();
190
- if (alignPali) {
191
- // for aligning Pali, we add all Pali words that
192
- // occur in the Pali for a segment to the
193
- // vector input text
194
- let pliWords = [];
195
- reList.forEach((re, paliText, map) => {
196
- let nMatch = pli.match(re)?.length || 0;
197
- if (nMatch) {
198
- for (let i = 0; i < nMatch; i++) {
199
- pliWords.push(paliText);
272
+ switch (alignMethod) {
273
+ case 'alignPali':
274
+ {
275
+ // for aligning Pali, we add all Pali words that
276
+ // occur in the Pali for a segment to the
277
+ // vector input text
278
+ let pliWords = [];
279
+ reList.forEach((re, paliText, map) => {
280
+ let nMatch = pli.match(re)?.length || 0;
281
+ if (nMatch) {
282
+ for (let i = 0; i < nMatch; i++) {
283
+ pliWords.push(paliText);
284
+ }
285
+ }
286
+ });
287
+ if (pliWords.length) {
288
+ segData += ' ' + pliWords.join(' ');
289
+ dbg === scid &&
290
+ console.log(msg, 'segData', scid, segData);
200
291
  }
201
292
  }
202
- });
203
- if (pliWords.length) {
204
- segData += ' ' + pliWords.join(' ');
205
- dbg === scid && console.log(msg, 'segData', scid, segData);
206
- }
293
+ break;
294
+ case 'DPD':
295
+ break;
207
296
  }
208
297
  segGroup.unshift(segData);
209
298
  if (segGroup.length > groupSize) {
@@ -267,7 +356,7 @@ export class Alignment {
267
356
  if (typeof opts !== 'object') {
268
357
  throw new Error(`${msg} opts?`);
269
358
  }
270
- let { dbgScid } = opts;
359
+ let { dbgScid = this.dbgScid } = opts;
271
360
  // biome-ignore format:
272
361
  let { ebtDoc, legacyDoc, lineCursor, maxScanSize, minScanSize,
273
362
  minScore, mlDoc, scids, segCursor, vMLDoc, wordSpace,
@@ -281,6 +370,7 @@ export class Alignment {
281
370
  for (let i = 0; scanning(i); i++) {
282
371
  let scid = scids[segCursor.numerator + i];
283
372
  if (scid == null) {
373
+ console.log(error, '[1]scid?', segCursor.toString());
284
374
  break;
285
375
  }
286
376
  let vSeg = vMLDoc[scid];
@@ -407,7 +497,7 @@ export class Alignment {
407
497
  aligner, ebtDoc, legacyDoc, lineCursor, maxScanSize, minScanSize,
408
498
  mlDoc, scidsExp, segCursor, vMLDoc,
409
499
  } = this;
410
- let { lang, alignPali, wordSpace } = aligner;
500
+ let { lang, alignMethod, wordSpace } = aligner;
411
501
  let { segMap } = mlDoc;
412
502
  let scids = Object.keys(segMap);
413
503
  scids.sort(SuttaCentralId.compareLow);
@@ -417,7 +507,6 @@ export class Alignment {
417
507
 
418
508
  while (lineCursor.difference < 0) {
419
509
  let line = lines[lineCursor.numerator];
420
- dbg > 1 && console.log(msg, lineCursor.toString(), line);
421
510
  let curScid = scids[segCursor.numerator];
422
511
  let dbgScid = scidsExp?.[lineCursor.numerator];
423
512
  let r = this.alignLine(line, { dbgScid });
@@ -425,12 +514,12 @@ export class Alignment {
425
514
  // biome-ignore format:
426
515
  if (r == null) {
427
516
  let { vSeg, vLegacy, intersection } = this.status;
428
- dbg && console.log( msg, 'UNMATCHED',
517
+ dbg && console.log(msg, 'UNMATCHED',
429
518
  lineCursor.toString(),
430
519
  segCursor.toString(),
431
520
  { curScid, line, minScanSize, maxScanSize, vSeg, vLegacy, intersection },
432
521
  );
433
- throw new Error(`${msg} unmatched`);
522
+ return null;
434
523
  }
435
524
  }
436
525
 
@@ -0,0 +1,353 @@
1
+ import * as deepl from 'deepl-node';
2
+ import { DBG } from '../defines.mjs';
3
+ import { MockDeepL } from './mock-deepl.mjs';
4
+
5
+ const EMPTY_TEXT = '911911911';
6
+ const TRANSLATE_OPTS = {
7
+ tag_handling: 'xml',
8
+ formality: 'more',
9
+ };
10
+ const DST_AUTHOR = 'no-author';
11
+
12
+ let mockApi = DBG.MOCK_DEEPL;
13
+
/**
 * Adapter around a DeepL-style translator object (anything providing
 * listGlossaries, deleteGlossary, createGlossary and translateText).
 *
 * Instances are meant to be built with the async factory
 * DeepLAdapter.create(); the constructor only validates and stores
 * options that the factory has already resolved.
 */
export class DeepLAdapter {
  #authKey;

  /**
   * @param {object} [opts] - fully resolved options as produced by
   *   DeepLAdapter.create()
   * @throws {Error} `use DeepLAdapter.create() N` when the N-th
   *   required option (see `required` below) is null/undefined
   */
  constructor(opts = {}) {
    const {
      authKey,
      glossary,
      glossaryName,
      initialized,
      sourceLang, // deepl lang
      targetLang, // deepl lang
      translateOpts,
      translator,
      dstLang,
      dstLang2, // bilara-data lang
      srcLang2, // bilara-data lang
      srcLang,
    } = DeepLAdapter.srcDstLangs(opts);

    // Order matters: the 1-based position is reported in the error
    // message so callers can tell which option was missing.
    const required = [
      authKey,
      dstLang2,
      glossaryName,
      initialized,
      sourceLang,
      targetLang,
      srcLang2,
      translateOpts,
      translator,
    ];
    required.forEach((value, i) => {
      if (value == null) {
        throw new Error(`use DeepLAdapter.create() ${i + 1}`);
      }
    });

    this.#authKey = authKey;

    Object.assign(this, {
      dstLang,
      dstLang2,
      glossary,
      glossaryName,
      initialized,
      srcLang,
      srcLang2,
      sourceLang,
      targetLang,
      // JSON round-trip gives this instance its own copy
      translateOpts: JSON.parse(JSON.stringify(translateOpts)),
      translator,
    });
  }

  /**
   * Lower-cases srcLang/dstLang (defaults 'en' / 'pt-pt') and derives
   * the part before the first '-' as srcLang2/dstLang2.
   *
   * @param {object} [opts]
   * @returns {object} shallow copy of opts with srcLang, srcLang2,
   *   dstLang and dstLang2 set
   */
  static srcDstLangs(opts = {}) {
    let { srcLang = 'en', dstLang = 'pt-pt' } = opts;
    srcLang = srcLang.toLowerCase();
    dstLang = dstLang.toLowerCase();
    const [srcLang2] = srcLang.split('-');
    const [dstLang2] = dstLang.split('-');

    return { ...opts, srcLang, srcLang2, dstLang, dstLang2 };
  }

  /**
   * Maps a language code to the code handed to the translator.
   * Only 'pt' is rewritten (to 'pt-PT'); anything else passes through.
   *
   * @param {string} lang
   * @returns {string}
   */
  static deeplLang(lang) {
    return lang === 'pt' ? 'pt-PT' : lang;
  }

  /**
   * Builds the lower-cased glossary name
   * `d10r_<srcLang2>_<dstLang2>_<dstAuthor>`.
   *
   * @param {object} [opts] - srcLang, dstLang, dstAuthor
   * @returns {string}
   */
  static glossaryName(opts = {}) {
    const msg = 'D10r.glossaryName()';
    const dbg = DBG.GLOSSARY;
    const { dstAuthor = DST_AUTHOR } = opts;
    const { dstLang2, srcLang2 } = DeepLAdapter.srcDstLangs(opts);
    const name =
      `D10r_${srcLang2}_${dstLang2}_${dstAuthor}`.toLowerCase();
    dbg && console.log(msg, name);
    return name;
  }

  /**
   * Async factory: resolves languages, translator and glossary, then
   * constructs the adapter.
   *
   * @param {object} [opts]
   * @param {string} opts.authKey - required
   * @param {string} [opts.srcLang='en']
   * @param {string} [opts.dstLang='pt-pt']
   * @param {string} [opts.dstAuthor]
   * @param {string} [opts.sourceLang] - overrides deeplLang(srcLang)
   * @param {string} [opts.targetLang] - overrides deeplLang(dstLang)
   * @param {object} [opts.translateOpts] - copied, never mutated
   * @param {boolean} [opts.updateGlossary=false] - re-upload glossary
   * @param {*} [opts.glossaryEntries] - forwarded to uploadGlossary()
   *   when updateGlossary is set (uploadGlossary requires it)
   * @param {object} [opts.translator] - injected translator; when
   *   omitted a MockDeepL or deepl Translator is created
   * @returns {Promise<DeepLAdapter>}
   * @throws {Error} when authKey is missing, or when updateGlossary
   *   is set without glossaryEntries
   */
  static async create(opts = {}) {
    const msg = 'D10r.create()';
    const dbg = DBG.GLOSSARY;
    let {
      authKey,
      srcLang,
      srcLang2,
      dstLang,
      dstLang2,
      dstAuthor = DST_AUTHOR,
      glossaryEntries,
      sourceLang,
      targetLang,
      translateOpts = TRANSLATE_OPTS,
      updateGlossary = false,
      translator,
    } = DeepLAdapter.srcDstLangs(opts);
    dbg && console.log(msg, '[1]opts', opts);
    if (authKey == null) {
      throw new Error(`${msg} authKey?`);
    }
    sourceLang = sourceLang || DeepLAdapter.deeplLang(srcLang);
    targetLang = targetLang || DeepLAdapter.deeplLang(dstLang);
    if (translator == null) {
      dbg && console.log(msg, '[2]new deepl.Translator()');
      translator = mockApi
        ? new MockDeepL.Translator(authKey)
        : new deepl.Translator(authKey);
    }

    const glossaryName = DeepLAdapter.glossaryName({
      srcLang,
      dstLang,
      dstAuthor,
    });
    const glossaries = await translator.listGlossaries();
    // last glossary with a matching name wins
    let glossary = glossaries.reduce(
      (a, g) => (g.name === glossaryName ? g : a),
      null,
    );
    if (updateGlossary) {
      console.warn(msg, '[3]updateGlossary', glossaryName);
      dbg && console.log(msg, '[4]uploadGlossary');
      // glossaryEntries must be forwarded: uploadGlossary() rejects
      // a call without it.
      glossary = await DeepLAdapter.uploadGlossary({
        srcLang,
        dstLang,
        dstAuthor,
        translator,
        glossaries,
        glossaryEntries,
      });
    }
    if (glossary) {
      const { glossaryId, name } = glossary;
      dbg &&
        console.warn(
          msg,
          '[5]using glossary',
          name,
          glossaryId && glossaryId.substring(0, 8),
        );
    } else {
      dbg && console.log(msg, '[6]no glossary');
    }
    // Always work on a copy so that attaching the glossary below can
    // never leak into the caller's object or the shared defaults.
    translateOpts = JSON.parse(
      JSON.stringify(translateOpts || TRANSLATE_OPTS),
    );
    if (glossary) {
      translateOpts.glossary = glossary;
    }
    const initialized = true;

    const ctorOpts = {
      authKey,
      dstLang,
      dstLang2,
      glossary,
      glossaryName,
      initialized,
      srcLang,
      srcLang2,
      sourceLang,
      targetLang,
      translateOpts,
      translator,
    };
    dbg &&
      console.log(msg, '[7]ctor', {
        sourceLang,
        targetLang,
        glossaryName,
      });
    return new DeepLAdapter(ctorOpts);
  }

  /**
   * Selects whether create() builds a MockDeepL translator (truthy)
   * or a real deepl translator (falsy) when none is injected.
   *
   * @param {*} value
   */
  static setMockApi(value) {
    mockApi = value;
  }

  /**
   * Converts glossary input into a deepl.GlossaryEntries instance.
   *
   * @param {string|object} strObj - an existing deepl.GlossaryEntries
   *   (returned as is), a newline-separated string of `key|value`
   *   lines, or an object used directly as the entries map
   * @returns {object} deepl.GlossaryEntries
   * @throws {Error} for a line with only a key or only a value, or
   *   when strObj is neither string nor object
   */
  static asGlossaryEntries(strObj) {
    const msg = 'd12r.asToGlossaryEntries:';
    const dbg = DBG.KVG_TO_GLOSSARY_ENTRIES;

    if (strObj instanceof deepl.GlossaryEntries) {
      return strObj;
    }
    let entries;

    if (typeof strObj === 'string') {
      // assume kvg string
      // Accumulate into a plain object: with an Array accumulator a
      // key such as "length" would be treated as the array length.
      entries = {};
      for (const kv of strObj.split('\n')) {
        let [key, value] = kv.split(/\|/);
        if (key && !value) {
          throw new Error(`${msg} [1]no value for key:${key}`);
        }
        if (!key && value) {
          throw new Error(`${msg} [2]no key for value:${value}`);
        }
        if (!key && !value) {
          continue; // blank line
        }
        key = key.trim();
        value = value.trim();
        entries[key] = value;
        dbg > 1 && console.log(msg, '[3]', { key, value });
      }
    } else if (typeof strObj === 'object') {
      entries = strObj;
    } else {
      throw new Error(`${msg} string or object?`);
    }

    return new deepl.GlossaryEntries({ entries });
  }

  /**
   * Deletes every existing glossary carrying the computed glossary
   * name, then creates a new one from glossaryEntries.
   *
   * @param {object} [opts]
   * @param {string} [opts.srcLang]
   * @param {string} [opts.dstLang]
   * @param {string} [opts.dstAuthor]
   * @param {object} opts.translator
   * @param {Array} [opts.glossaries] - fetched from the translator
   *   when omitted
   * @param {*} opts.glossaryEntries - required; passed unchanged to
   *   translator.createGlossary()
   * @returns {Promise<object>} result of translator.createGlossary()
   * @throws {Error} when glossaryEntries is missing
   */
  static async uploadGlossary(opts = {}) {
    const msg = 'D10r.uploadGlossary()';
    const dbg = DBG.GLOSSARY;
    let {
      srcLang,
      dstLang,
      dstAuthor,
      translator,
      glossaries,
      glossaryEntries,
    } = DeepLAdapter.srcDstLangs(opts);
    if (glossaryEntries == null) {
      throw new Error(`${msg} glossaryEntries?`);
    }
    const nEntries = Object.keys(glossaryEntries).length;
    const glossaryName = DeepLAdapter.glossaryName({
      srcLang,
      dstLang,
      dstAuthor,
    });

    if (glossaries == null) {
      glossaries = await translator.listGlossaries();
    }
    // sequential on purpose: same order of deletions as listed
    for (const g of glossaries) {
      if (g.name === glossaryName) {
        dbg && console.log(msg, '[1]deleting', g.glossaryId);
        await translator.deleteGlossary(g.glossaryId);
      }
    }

    const sourceLang = DeepLAdapter.deeplLang(srcLang);
    const targetLang = DeepLAdapter.deeplLang(dstLang);
    const glossary = await translator.createGlossary(
      glossaryName,
      sourceLang,
      targetLang,
      glossaryEntries,
    );
    const { glossaryId } = glossary;
    dbg &&
      console.log(msg, '[6]createGlossary', {
        glossaryName,
        sourceLang,
        targetLang,
        glossaryId,
        nEntries,
      });

    return glossary;
  }

  /**
   * Deletes a glossary through the wrapped translator.
   *
   * @param {string} id - glossary id handed to translator.deleteGlossary
   * @returns {Promise<void>}
   */
  async deleteGlossary(id) {
    const msg = 'd12r.deleteGlossary:';
    const dbg = DBG.GLOSSARY;
    const { translator } = this;
    dbg && console.log(msg, '[1]deleting', id);
    await translator.deleteGlossary(id);
    dbg > 1 && console.log(msg, '[2]deleted', id);
  }

  /**
   * @returns {Promise<Array>} result of translator.listGlossaries()
   */
  async listGlossaries() {
    return await this.translator.listGlossaries();
  }

  /**
   * Translates an array of texts. Falsy entries are sent as the
   * EMPTY_TEXT placeholder and mapped back to '' in the result.
   *
   * @param {string[]} texts
   * @returns {Promise<string[]>} translated texts, same order
   */
  async translate(texts) {
    const msg = 'D10r.translate()';
    const dbg = DBG.DEEPL_XLT;
    const dbgv = dbg && DBG.VERBOSE;
    const { translator, srcLang, dstLang, translateOpts } = this;

    // NOTE(review): languages are re-derived from srcLang/dstLang, so
    // a sourceLang/targetLang override given to create() is not used
    // here — confirm whether that is intended.
    const sourceLang = DeepLAdapter.deeplLang(srcLang);
    const targetLang = DeepLAdapter.deeplLang(dstLang);
    texts = texts.map((t) => t || EMPTY_TEXT);
    dbgv && console.log(msg, '[1]translateOpts', translateOpts);
    const results = await translator.translateText(
      texts,
      sourceLang,
      targetLang,
      translateOpts,
    );
    if (dbg) {
      results.forEach((result, i) => {
        console.log(
          msg,
          `\n[${i}<] `,
          `${texts[i]}$`,
          `\n[${i}>] `,
          `${results[i]?.text}$`,
        );
      });
    }

    return results.map((r) => (r.text === EMPTY_TEXT ? '' : r.text));
  }
} // DeepLAdapter
@@ -0,0 +1,17 @@
/**
 * Transformer holding a dictionary reference. transform() and
 * normalize() currently return their input unchanged.
 */
export class DpdTransformer {
  /**
   * @param {object} [opts]
   * @param {*} opts.dictionary - required; stored as `this.dictionary`
   * @throws {Error} when `dictionary` is null or undefined
   */
  constructor(opts = {}) {
    const msg = 'D14r.ctor:';
    const { dictionary } = opts;
    if (dictionary == null) {
      throw new Error(`${msg} dictionary?`);
    }

    Object.assign(this, {
      dictionary,
    });
  }

  /**
   * @param {string} text
   * @returns {string} the same text, untouched
   */
  transform(text) {
    return text;
  }

  /**
   * Pass-through counterpart of transform(), provided so this class
   * can be used wherever a transformer is asked to normalize text.
   *
   * @param {string} text
   * @returns {string} the same text, untouched
   */
  normalize(text) {
    return text;
  }
}