@sc-voice/tools 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,562 @@
1
+ import { DBG } from '../defines.mjs';
2
+ import { Fraction } from '../math/fraction.mjs';
3
+ import { EbtDoc } from './ebt-doc.mjs';
4
+ import { LegacyDoc } from './legacy-doc.mjs';
5
+ import { SuttaCentralId } from './sutta-central-id.mjs';
6
+ import { Unicode } from './unicode.mjs';
7
+ import { WordSpace } from './word-space.mjs';
8
+
9
// Alignment states recorded on each AlignmentStatus.
const STATE_OK = 'ok';
const STATE_WARN = 'warn';
const STATE_ERROR = 'error';
const STATE_DONE = 'done';

// Symbols and terminal color codes used to render status summaries.
// Symbols are read from Unicode first, then the colors from
// Unicode.LINUX_COLOR.
const {
  GREEN_CHECKBOX,
  LEFT_ARROW,
  RIGHT_ARROW,
  CHECKMARK,
  ELLIPSIS,
  WARNING,
  RED_X,
  LINUX_COLOR: {
    BLACK,
    WHITE,
    RED,
    GREEN,
    BLUE,
    CYAN,
    MAGENTA,
    YELLOW,
    NO_COLOR,
  },
} = Unicode;

// Construction guard: Aligner.createAlignment() raises this flag around
// `new Alignment(...)`, and the Alignment constructor throws when it is
// not set.
let alignmentCtor = false;
35
+
36
/**
 * Holds the configuration used to align the lines of a legacy document
 * with the segments of a segmented multilingual document (mlDoc) by
 * comparing word vectors. Use createAlignment() to obtain an Alignment
 * for a specific document pair.
 */
export class Aligner {
  /**
   * @param {object} [opts]
   * @param {boolean} [opts.alignPali=true] add matched Pali words to the
   *   text used for each segment vector (see mlDocVectors)
   * @param {string} [opts.authorAligned] author of segment aligned document
   * @param {string} [opts.authorLegacy] author of legacy document
   * @param {number} [opts.groupDecay=0.5] group exponential decay
   * @param {number} [opts.groupSize=1] comparison group size
   * @param {string} [opts.lang] 2-letter ISO language (en, fr, es, pt);
   *   defaults to wordSpace.lang
   * @param {number} [opts.maxScanSize] maximum segments to scan for alignment
   * @param {number} [opts.minScanSize=5] minimum number of segments to scan
   * @param {number} [opts.minScore=0.1] minimum alignment score
   * @param {*} [opts.minWord] forwarded to WordSpace when wordSpace is omitted
   * @param {*} [opts.normalizeVector] forwarded to WordSpace when wordSpace
   *   is omitted
   * @param {string} [opts.scvEndpoint] base URL used by fetchMLDoc
   * @param {object} [opts.wordSpace] WordSpace to use; one is created from
   *   lang/minWord/normalizeVector when omitted
   */
  constructor(opts = {}) {
    let {
      alignPali = true,
      authorAligned,
      authorLegacy,
      groupDecay = 0.5,
      groupSize = 1,
      lang,
      maxScanSize,
      minScanSize = 5,
      minScore = 0.1,
      minWord,
      normalizeVector,
      scvEndpoint = 'https://www.api.sc-voice.net/scv',
      wordSpace,
    } = opts;
    if (wordSpace == null) {
      wordSpace = new WordSpace({ lang, minWord, normalizeVector });
    }
    if (lang == null) {
      lang = wordSpace.lang;
    }

    Object.assign(this, {
      alignPali,
      authorAligned,
      authorLegacy,
      groupSize,
      groupDecay,
      lang,
      minScore,
      minScanSize,
      maxScanSize,
      scvEndpoint,
      wordSpace,
    });
  }

  /**
   * Fetch the first mlDoc returned by the search endpoint for a document.
   * The request URL is
   * `${scvEndpoint}/search/${scid}%20-da%20${authorAligned}%20-ml1/${lang}`.
   *
   * @param {string} scid document id placed in the search pattern
   * @returns {Promise<object>} `mlDocs[0]` of the JSON response
   * @throws rethrows (after logging) any fetch or JSON decoding error
   */
  async fetchMLDoc(scid) {
    const msg = 'Aligner.fetchMLDoc:';
    let { lang, scvEndpoint, authorAligned } = this;
    let url = [
      scvEndpoint,
      'search',
      `${scid}%20-da%20${authorAligned}%20-ml1`,
      lang,
    ].join('/');
    try {
      let res = await fetch(url);
      let json = await res.json();
      let mld = json.mlDocs[0];
      return mld;
    } catch (e) {
      console.error(msg, e);
      throw e;
    }
  }

  /**
   * Create the Alignment that will map legacyDoc lines onto mlDoc segments.
   *
   * @param {object} [opts]
   * @param {LegacyDoc} opts.legacyDoc document whose lines will be aligned
   * @param {object} opts.mlDoc segmented document providing segMap,
   *   sutta_uid, docAuthor and bilaraPaths
   * @param {number} [opts.minScore=this.minScore]
   * @param {number} [opts.minScanSize=this.minScanSize]
   * @param {number} [opts.maxScanSize=this.maxScanSize] when still
   *   undefined it becomes ceil(max(1, 0.8 * (nSegs - nLines)))
   * @param {string[]} [opts.scidsExp] expected segment id per line
   *   (only used for debug logging by Alignment)
   * @returns {Alignment}
   * @throws {Error} if legacyDoc is not a LegacyDoc, mlDoc is missing,
   *   there are fewer segments than lines, minScanSize < 1, or mlDoc has
   *   no bilaraPaths
   */
  createAlignment(opts = {}) {
    const msg = 'A7t.createAlignment:';
    let {
      legacyDoc,
      mlDoc,
      minScore = this.minScore,
      minScanSize = this.minScanSize,
      maxScanSize = this.maxScanSize,
      scidsExp,
    } = opts;
    let { lang } = this;
    if (!(legacyDoc instanceof LegacyDoc)) {
      throw new Error(`${msg} legacyDoc?`);
    }
    if (mlDoc == null) {
      throw new Error(`${msg} mlDoc?`);
    }

    let nLines = legacyDoc.lines.length;
    let lineCursor = new Fraction(0, nLines, 'lines');
    let scids = Object.keys(mlDoc.segMap);
    let nSegs = scids.length;
    scids.sort(SuttaCentralId.compareLow);
    let segCursor = new Fraction(0, nSegs, 'segs');
    if (nSegs < nLines) {
      throw new Error(`${msg} nSegs:${nSegs} < nLines:${nLines}?`);
    }
    if (maxScanSize == null) {
      maxScanSize = Math.ceil(Math.max(1, (nSegs - nLines) * 0.8));
    }
    if (minScanSize < 1) {
      throw new Error(`${msg} minScanSize? ${minScanSize} `);
    }

    let { sutta_uid: suid, docAuthor, bilaraPaths } = mlDoc;
    let { author, author_uid } = legacyDoc;
    if (!Array.isArray(bilaraPaths) || bilaraPaths.length === 0) {
      throw new Error(`${msg} bilaraPaths?`);
    }
    // Derive the legacy author's path from the last path that mentions
    // docAuthor. Seeding the fold with the first path (instead of letting
    // reduce() consume it as the accumulator) ensures that the first path
    // is examined too; it remains the fallback when nothing matches.
    let bilaraPath = bilaraPaths.reduce(
      (found, path) =>
        path.includes(docAuthor)
          ? path.replaceAll(docAuthor, author_uid)
          : found,
      bilaraPaths[0],
    );
    let docOpts = { suid, lang, author, author_uid, bilaraPath };

    const optsAlignment = {
      aligner: this,
      ebtDoc: EbtDoc.create(docOpts),
      legacyDoc,
      lineCursor,
      mlDoc,
      minScore,
      minScanSize,
      maxScanSize,
      scids,
      scidsExp,
      segCursor,
      vMLDoc: this.mlDocVectors(mlDoc),
    };

    // The flag is lowered in `finally` so that a throwing Alignment
    // constructor cannot leave the construction guard permanently open.
    alignmentCtor = true;
    try {
      return new Alignment(optsAlignment);
    } finally {
      alignmentCtor = false;
    }
  }

  /**
   * Build one word vector per mlDoc segment.
   *
   * Segments are visited last to first. The vector for a segment is the
   * sum of the vectors of that segment and of up to (groupSize - 1)
   * following segments, the k-th following segment being scaled by
   * groupDecay**k. When alignPali is set, each Pali word listed as a value
   * of wordSpace.wordMap is appended to the segment text once per match in
   * the segment's `pli` text before vectorizing.
   *
   * @param {object} mld mlDoc with `segMap` and `lang`
   * @returns {Object<string, object>} vectors keyed by segment id
   */
  mlDocVectors(mld) {
    const msg = 'Aligner.mlDocVectors';
    const dbg = DBG.ML_DOC_VECTORS;
    let { alignPali, groupDecay, groupSize, wordSpace } = this;
    let { wordMap } = wordSpace;
    let { segMap, lang } = mld;
    let segs = Object.entries(segMap);
    let reList;

    if (alignPali) {
      reList = new Map();
      for (const [, paliText] of Object.entries(wordMap)) {
        if (paliText) {
          reList.set(paliText, new RegExp(`\\b${paliText}`, 'gi'));
        }
      }
    }

    let vectorMap = {};
    let segGroup = []; // texts of current segment followed by later ones
    for (let i = segs.length; i-- > 0; ) {
      let [scid, seg] = segs[i];
      let { pli } = seg;
      let segData = seg[lang] || '';
      if (alignPali) {
        // for aligning Pali, we add all Pali words that
        // occur in the Pali for a segment to the
        // vector input text
        let pliWords = [];
        reList.forEach((re, paliText) => {
          // segments without Pali text simply contribute no Pali words
          let nMatch = pli?.match(re)?.length || 0;
          for (let k = 0; k < nMatch; k++) {
            pliWords.push(paliText);
          }
        });
        if (pliWords.length) {
          segData += ' ' + pliWords.join(' ');
          dbg === scid && console.log(msg, 'segData', scid, segData);
        }
      }
      segGroup.unshift(segData);
      if (segGroup.length > groupSize) {
        segGroup.pop();
      }
      let scale = 1;
      let vGroup = new WordSpace.Vector();
      for (const groupText of segGroup) {
        // each group member contributes its OWN text, decayed by distance
        vGroup = vGroup.add(wordSpace.string2Vector(groupText, scale));
        scale *= groupDecay;
      }
      vectorMap[scid] = vGroup;
    }
    return vectorMap;
  }
}
223
+
224
/**
 * Progress of aligning one legacy document against one mlDoc.
 * Instances must be created through Aligner.createAlignment(), which
 * supplies the cursors, segment ids, segment vectors and target ebtDoc.
 */
export class Alignment {
  /**
   * @param {object} [opts] properties copied onto the instance (aligner,
   *   ebtDoc, legacyDoc, lineCursor, mlDoc, minScore, minScanSize,
   *   maxScanSize, scids, scidsExp, segCursor, vMLDoc)
   * @throws {Error} when not invoked via createAlignment()
   */
  constructor(opts = {}) {
    const msg = 'A7t.ctor:';
    if (!alignmentCtor) {
      throw new Error(`${msg} createAlignment()?`);
    }

    Object.assign(this, opts);

    // Read-only views delegated to the aligner and the status history
    Object.defineProperty(this, 'lang', {
      get: () => this.aligner.lang,
    });
    Object.defineProperty(this, 'state', {
      get: () => this.status.state,
    });
    Object.defineProperty(this, 'wordSpace', {
      get: () => this.aligner.wordSpace,
    });
    Object.defineProperty(this, 'status', {
      // most recent status, or a synthetic "unaligned" status
      get: () => {
        let { legacyDoc, history } = this;
        if (history.length === 0) {
          let { uid, lang, author_uid } = legacyDoc;
          let text = `${uid}/${lang}/${author_uid} unaligned`;
          return new AlignmentStatus(this, { text });
        }
        return history.at(-1);
      },
    });

    this.history = [];
  }

  /**
   * Append a new AlignmentStatus built from opts to the history.
   * @param {object} opts AlignmentStatus options
   * @returns {AlignmentStatus} the status that was appended
   */
  pushStatus(opts) {
    let status = new AlignmentStatus(this, opts);
    this.history.push(status);
    return status;
  }

  /**
   * Align one legacy line with the best scoring segment at or after the
   * segment cursor.
   *
   * Scanning always covers minScanSize segments and continues up to
   * maxScanSize segments while the best score is below minScore. On
   * success the line is stored in ebtDoc.segMap under the matched segment
   * id, the line cursor advances by one and the segment cursor moves just
   * past the match.
   *
   * @param {string} legacyText line to align
   * @param {object} [opts]
   * @param {string} [opts.dbgScid] expected segment id (debug logging only)
   * @returns {AlignmentStatus|undefined} the status pushed for the line
   *   (the STATE_DONE status when this was the last line), or undefined
   *   when no segment reached minScore (a STATE_ERROR status is pushed)
   * @throws {Error} if opts is not an object or a scanned segment has no
   *   vector in vMLDoc
   */
  alignLine(legacyText, opts = {}) {
    const msg = 'A7t.alignLine:';
    const dbg = DBG.ALIGN_LINE;
    if (typeof opts !== 'object') {
      throw new Error(`${msg} opts?`);
    }
    let { dbgScid } = opts;
    // biome-ignore format:
    let { ebtDoc, lineCursor, maxScanSize, minScanSize,
      minScore, mlDoc, scids, segCursor, vMLDoc, wordSpace,
    } = this;
    let vLegacy = wordSpace.string2Vector(legacyText);
    let scoreMax = 0;
    let segMap = mlDoc.segMap;
    let scoreId;
    let scanning = (i) =>
      i < maxScanSize && (i < minScanSize || scoreMax < minScore);
    for (let i = 0; scanning(i); i++) {
      let scid = scids[segCursor.numerator + i];
      if (scid == null) {
        break; // ran out of segments
      }
      let vSeg = vMLDoc[scid];
      if (vSeg == null) {
        // vMLDoc is a plain object keyed by scid, so report its key count
        let nVectors = Object.keys(vMLDoc).length;
        throw new Error(`${msg}scid[${scid}]? ${nVectors}`);
      }
      let score = vLegacy.similar(vSeg);
      if (minScanSize <= i && score) {
        // Scan exceeded minScanSize. We might be lost.
        // Or maybe we got lucky and translator omitted many segments.
        // For example, MN8 42 segments are skipped for Môhan
        this.pushStatus({
          state: STATE_WARN,
          text: `SCAN+${i}`,
          score,
          scid,
          legacyText,
        });
        dbg && console.log(msg, this.status.summary);
      }
      // biome-ignore format:
      if (dbg > 1 && scid === dbgScid) {
        let seg = segMap[scid] || {};
        let intersection = vLegacy.intersect(vSeg).toString();
        console.log(msg, 'dbgScid', {
          legacyText, vLegacy: vLegacy.toString(),
          seg, vSeg: vSeg.toString(),
          score, intersection,
        });
      }
      if (scoreMax < score) {
        scoreMax = score;
        scoreId = scid;
        if (dbg > 1 && dbgScid) {
          let cmp = SuttaCentralId.compareLow(scoreId, dbgScid);
          let intersection = vLegacy.intersect(vSeg).toString();
          // biome-ignore format:
          if (cmp <= 0) {
            console.log(msg, `scoreMax-${dbgScid}`,
              { scoreId, scoreMax, intersection, });
          } else {
            let segExp = segMap && segMap[dbgScid];
            console.log( msg, `scoreMax-${dbgScid}-MISMATCH?`,
              segCursor.toString(),
              lineCursor.toString(),
              { scoreId, segExp, legacyText, scoreMax, intersection},
            );
          }
        }
      }
    } // for

    let vSeg = vMLDoc[scoreId];
    let intersection = vLegacy.intersect(vSeg);

    if (scoreId == null || scoreMax < minScore) {
      // biome-ignore format:
      this.pushStatus({
        state: STATE_ERROR,
        text: `${maxScanSize} UNMATCHED`,
        legacyText,
        score: scoreMax,
        scid: scoreId,
        intersection,
        vLegacy,
        vSeg,
      });
      dbg && console.log(msg, this.status.summary);
      return undefined;
    }

    // STATE_OK: Current line matches current segment
    ebtDoc.segMap[scoreId] = legacyText;

    lineCursor.increment();
    let iFound = scids.indexOf(scoreId);
    if (iFound >= 0) {
      segCursor.numerator = iFound + 1;
    } else {
      dbg &&
        console.error(msg, `${STATE_ERROR} iFound?`, {
          lineCursor,
          scoreId,
        });
    }
    let status = this.pushStatus({
      score: scoreMax,
      scid: scoreId,
      intersection,
      legacyText,
      vLegacy,
      vSeg,
      iLine: lineCursor.n,
    });
    dbg && console.log(msg, status.summary);
    if (lineCursor.value === 1) {
      // every line has been consumed
      let { uid, lang, author_uid } = this.legacyDoc;
      let lineCur = lineCursor.toString();
      status = this.pushStatus({
        state: STATE_DONE,
        text: `${uid}/${lang}/${author_uid} aligned ${lineCur}`,
        context: lineCursor.toString(),
      });
      dbg && console.log(msg, this.status.summary);
    }

    return status;
  } // alignLine

  /**
   * Align all remaining legacy lines in order, starting at the line
   * cursor.
   *
   * @returns {object} the ebtDoc populated by alignLine()
   * @throws {Error} as soon as a line cannot be matched
   */
  alignAll() {
    const msg = 'A7t.alignAll:';
    let dbg = DBG.ALIGN_ALL;
    //biome-ignore format:
    let {
      ebtDoc, legacyDoc, lineCursor, maxScanSize, minScanSize,
      mlDoc, scidsExp, segCursor,
    } = this;
    let { segMap } = mlDoc;
    let scids = Object.keys(segMap);
    scids.sort(SuttaCentralId.compareLow);
    let { lines } = legacyDoc;

    while (lineCursor.difference < 0) {
      let line = lines[lineCursor.numerator];
      dbg > 1 && console.log(msg, lineCursor.toString(), line);
      let curScid = scids[segCursor.numerator];
      let dbgScid = scidsExp?.[lineCursor.numerator];
      let r = this.alignLine(line, { dbgScid });
      // biome-ignore format:
      if (r == null) {
        let { vSeg, vLegacy, intersection } = this.status;
        dbg && console.log( msg, 'UNMATCHED',
          lineCursor.toString(),
          segCursor.toString(),
          { curScid, line, minScanSize, maxScanSize, vSeg, vLegacy, intersection },
        );
        throw new Error(`${msg} unmatched`);
      }
    }

    return ebtDoc;
  } // alignAll
} // class Alignment
440
+
441
/**
 * One entry of an Alignment's history: the outcome of a scan or alignment
 * step together with the cursor positions at the time it was recorded.
 */
export class AlignmentStatus {
  /**
   * @param {object} alignment owning alignment; its lineCursor and
   *   segCursor (when present) are recorded via `new Fraction(cursor)`,
   *   presumably a copy so later cursor movement does not alter this status
   * @param {object} [opts]
   * @param {string} [opts.text] status message
   * @param {string} [opts.scid] segment id the status refers to
   * @param {string} [opts.state=STATE_OK] one of the STATE_* values
   * @param {number} [opts.score] similarity score
   * @param {object} [opts.intersection] stored as its toString() value
   * @param {string} [opts.legacyText] legacy line being aligned
   * @param {object} [opts.vLegacy] stored as its toString() value
   * @param {object} [opts.vSeg] stored as its toString() value
   * @param {number} [opts.iLine] line number; defaults to lineCursor.n + 1
   *   and stays undefined when the alignment has no lineCursor
   */
  constructor(alignment, opts = {}) {
    let { lineCursor, segCursor } = alignment;
    let {
      text,
      scid,
      state = STATE_OK,
      score,
      intersection,
      legacyText,
      vLegacy,
      vSeg,
      // guarded like the cursor copies below: a missing cursor must not throw
      iLine = lineCursor == null ? undefined : lineCursor.n + 1,
    } = opts;

    Object.assign(this, {
      iLine,
      intersection: intersection?.toString(),
      legacyText,
      lineCursor: lineCursor && new Fraction(lineCursor),
      text,
      scid,
      score,
      segCursor: segCursor && new Fraction(segCursor),
      state,
      vLegacy: vLegacy?.toString(),
      vSeg: vSeg?.toString(),
    });

    // Non-enumerable back reference and derived display values
    Object.defineProperty(this, 'alignment', {
      value: alignment,
    });
    Object.defineProperty(this, 'scorePercent', {
      get: () =>
        this.score == null ? '--%' : `${(100 * this.score).toFixed(0)}%`,
    });
    Object.defineProperty(this, 'lineCur', {
      get: () => this.lineCursor?.toString(),
    });
    Object.defineProperty(this, 'segCur', {
      get: () => this.segCursor?.toString(),
    });
  }

  static get STATE_ERROR() {
    return STATE_ERROR;
  }
  static get STATE_DONE() {
    return STATE_DONE;
  }
  static get STATE_OK() {
    return STATE_OK;
  }
  static get STATE_WARN() {
    return STATE_WARN;
  }

  /**
   * One-line colored description of this status: state symbol, text and,
   * when the score is non-zero, the segment id, segment cursor position
   * and score (red below the alignment's minScore, otherwise green),
   * followed by the legacy text. For STATE_OK and STATE_WARN the legacy
   * text is cut to 25 characters and followed by an ellipsis.
   *
   * @returns {string}
   */
  get summary() {
    let {
      alignment,
      state,
      text,
      scid,
      scorePercent,
      segCursor,
      score,
      legacyText = '',
      iLine,
    } = this;
    const CTX_LEN = 25;
    let { minScore } = alignment;

    let linePrefix = iLine == null ? '' : `${iLine}:`;
    let context = legacyText ? linePrefix + legacyText : '';
    let symbol;
    let color = NO_COLOR;
    switch (state) {
      case STATE_ERROR:
        symbol = RED_X;
        color = RED;
        break;
      case STATE_WARN:
        color = YELLOW;
        symbol = WARNING + ' ';
        context = context.substring(0, CTX_LEN) + ELLIPSIS;
        break;
      case STATE_DONE:
        symbol = CHECKMARK + ' ';
        color = WHITE;
        break;
      case STATE_OK:
        symbol = CHECKMARK;
        context = context.substring(0, CTX_LEN) + ELLIPSIS;
        color = NO_COLOR;
        break;
      default:
        symbol = RED_X;
        text = `UNKNOWN STATE ${state}`;
        color = RED;
        break;
    }

    let status = [color + symbol, text];
    if (score) {
      status.push(scid);
      if (segCursor != null) {
        status.push(`segs[${segCursor.n}]`);
      }
      let scoreColor = score < minScore ? RED : GREEN;
      status.push(scoreColor + LEFT_ARROW + scorePercent + RIGHT_ARROW + color);
    }
    context && status.push(context + NO_COLOR);

    return status.join(' ');
  }
} // AlignmentStatus
@@ -0,0 +1,84 @@
1
/**
 * Parsed form of a bilara file path such as
 * `translation/en/sujato/sutta/mn/mn1_translation-en-sujato.json`,
 * plus static builders for the path of each document type.
 */
export class BilaraPath {
  /**
   * @param {string} bilaraPath path to parse; the fields returned by
   *   pathParts() become instance properties
   */
  constructor(bilaraPath) {
    const parts = BilaraPath.pathParts(bilaraPath);
    Object.assign(this, parts);
  }

  /** Join `<type>/<lang>/<auth>/sutta/<fileName>`. */
  static #typedPath(type, lang, auth, fileName) {
    return [type, lang, `${auth}/sutta`, fileName].join('/');
  }

  /** @returns {string} path of the pli/ms HTML file for document `mid` */
  static htmlPath(mid) {
    return BilaraPath.#typedPath('html', 'pli', 'ms', `${mid}_html.json`);
  }

  /** @returns {string} path of the pli/ms variant file for document `mid` */
  static variantPath(mid) {
    const lang = 'pli';
    const auth = 'ms';
    const fileName = `${mid}_variant-${lang}-${auth}.json`;
    return BilaraPath.#typedPath('variant', lang, auth, fileName);
  }

  /** @returns {string} path of the pli/ms reference file for document `mid` */
  static referencePath(mid) {
    return BilaraPath.#typedPath(
      'reference',
      'pli',
      'ms',
      `${mid}_reference.json`,
    );
  }

  /**
   * @param {string} mid document id
   * @param {string} [lang='pli']
   * @param {string} [auth='ms']
   * @returns {string} path of the root text file
   */
  static rootPath(mid, lang = 'pli', auth = 'ms') {
    const fileName = `${mid}_root-${lang}-${auth}.json`;
    return BilaraPath.#typedPath('root', lang, auth, fileName);
  }

  /** @returns {string} bare file name of a legacy document (no folders) */
  static legacyPath(mid, lang, auth) {
    return `${mid}_legacy-${lang}-${auth}.json`;
  }

  /** @returns {string} path of the translation file for mid/lang/auth */
  static translationPath(mid, lang, auth) {
    const fileName = `${mid}_translation-${lang}-${auth}.json`;
    return BilaraPath.#typedPath('translation', lang, auth, fileName);
  }

  /** @returns {string} path of the comment file for mid/lang/auth */
  static commentPath(mid, lang, auth) {
    const fileName = `${mid}_comment-${lang}-${auth}.json`;
    return BilaraPath.#typedPath('comment', lang, auth, fileName);
  }

  /**
   * Split a bilara path into its components. The first five folders are
   * read as type, lang, author_uid, category and collection; suid is the
   * file name up to its first underscore.
   *
   * @param {string} bilaraPath
   * @returns {{suid: string, type: string, category: string,
   *   collection: string, lang: string, author_uid: string,
   *   suttaRef: string, bilaraPath: string}}
   */
  static pathParts(bilaraPath) {
    const folders = bilaraPath.split('/');
    const fileName = folders.pop();
    const suid = fileName.replace(/_.*$/, '');
    const [type, lang, author_uid, category, collection] = folders;
    return {
      suid,
      type,
      category,
      collection,
      lang,
      author_uid,
      suttaRef: `${suid}/${lang}/${author_uid}`,
      bilaraPath,
    };
  }
}