@sc-voice/tools 1.5.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,681 +0,0 @@
1
- import fs from 'node:fs';
2
- import path from 'node:path';
3
- import { fileURLToPath } from 'node:url';
4
// Location of this module on disk (ES modules have no built-in __dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Straight marks
const QUOTE = '“'; // Quotation mark
const APOS = "'"; // Apostrophe/single-quote

// Curly single and double quotes
const LSQUOT = '‘'; // Left single quote
const RSQUOT = '’'; // \u2019 Right single quote, curly apostrophe
const LDQUOT = '“'; // Left double quote
const RDQUOT = '”'; // Right double quote

// Guillemets
const LDGUIL = '«'; // Left double guillemet
const RDGUIL = '»'; // Right double guillemet
const LGUIL = '\u2039'; // Left guillemet
const RGUIL = '\u203a'; // Right guillemet

// Spacing and punctuation
const NBSP = '\u00a0'; // non-breaking space
const THNSP = '\u2009'; // thin space
const ELLIPSIS = '…';

// Alternation of the quote characters listed for French
const FR_QUOTES = [LDGUIL, RDGUIL, LDQUOT, RDQUOT, LSQUOT, RSQUOT].join('|');
// A word character immediately at the start of the text
const RE_POST_APOS = /^\w/;

// Deepl: XML placeholder elements, one open/close pair per nesting depth 1..4
const DEEPL_DEPTHS = [1, 2, 3, 4];
const [LQ1, LQ2, LQ3, LQ4] = DEEPL_DEPTHS.map((depth) => `<l${depth}/>`);
// The closing placeholders carry a leading space because
// DeepL deletes trailing XML elements
const [RQ1, RQ2, RQ3, RQ4] = DEEPL_DEPTHS.map((depth) => ` <r${depth}/>`);
const ELL = '<ell/>';
34
-
35
- import { DBG } from '../defines.mjs';
36
-
37
/**
 * Tracks and converts nested quotation marks for one language.
 *
 * Each nesting level has its own opening and closing mark
 * (`openQuotes[level]` / `closeQuotes[level]`). An instance remembers
 * the current nesting `level` and the running count of `quotes` seen,
 * so text can be processed one segment at a time.
 */
export class QuoteParser {
  /**
   * @param {object} [opts]
   * @param {string} [opts.lang='en'] Language code (case-insensitive).
   *   Supported: en, en-us, en-uk, pt-br, nl, it, es, pt, pt-pt, fr,
   *   fr-eu, or anything ending in "-deepl" (XML placeholder quotes).
   * @param {Iterable<string>} [opts.openQuotes] Opening marks by level;
   *   overrides the language default. Copied, never mutated.
   * @param {Iterable<string>} [opts.closeQuotes] Closing marks by level;
   *   overrides the language default. Copied, never mutated.
   * @param {string} [opts.apostrophe] Apostrophe character; defaults to
   *   the right single quote. "-deepl" languages always use the
   *   straight apostrophe.
   * @param {number} [opts.level=0] Initial nesting level.
   * @param {number} [opts.maxLevel=4] Deepest nesting level allowed.
   * @param {number} [opts.quotes=0] Initial quote counter.
   * @throws {Error} if the language is not supported.
   */
  constructor(opts = {}) {
    const msg = 'QuoteParser.ctor()';
    const dbg = DBG.QUOTE;
    let {
      lang = 'en',
      openQuotes,
      closeQuotes,
      apostrophe = RSQUOT,
      level = 0,
      maxLevel = 4,
      quotes = 0,
    } = opts;

    lang = lang.toLowerCase();
    // Copy caller-owned arrays: they are padded in place below.
    openQuotes = openQuotes && [...openQuotes];
    closeQuotes = closeQuotes && [...closeQuotes];

    switch (lang) {
      case 'en-uk': // UK quote nesting
        openQuotes = openQuotes || [LSQUOT, LDQUOT, LSQUOT, LDQUOT];
        closeQuotes = closeQuotes || [RSQUOT, RDQUOT, RSQUOT, RDQUOT];
        break;
      case 'pt-br':
      case 'en-us':
      case 'en': // American quote nesting
        openQuotes = openQuotes || [LDQUOT, LSQUOT, LDQUOT, LSQUOT];
        closeQuotes = closeQuotes || [RDQUOT, RSQUOT, RDQUOT, RSQUOT];
        break;
      case 'nl':
        openQuotes = openQuotes || [LDQUOT, LDQUOT, LDQUOT, LDQUOT];
        closeQuotes = closeQuotes || [RDQUOT, RDQUOT, RDQUOT, RDQUOT];
        break;
      case 'it':
      case 'es':
      case 'pt':
      case 'pt-pt':
        openQuotes = openQuotes || [LDGUIL, LDQUOT, LSQUOT, LDQUOT];
        closeQuotes = closeQuotes || [RDGUIL, RDQUOT, RSQUOT, RDQUOT];
        break;
      case 'fr-eu':
        openQuotes = openQuotes || [LDGUIL + THNSP, LDQUOT, LSQUOT];
        closeQuotes = closeQuotes || [THNSP + RDGUIL, RDQUOT, RSQUOT];
        break;
      case 'fr':
        openQuotes = openQuotes || [
          LDGUIL + THNSP,
          LGUIL + THNSP,
          LDQUOT,
          LSQUOT,
        ];
        closeQuotes = closeQuotes || [
          THNSP + RDGUIL,
          THNSP + RGUIL,
          RDQUOT,
          RSQUOT,
        ];
        break;
      default: {
        if (!lang.endsWith('-deepl')) {
          const emsg = `${msg} unsupported language:${lang}`;
          throw new Error(emsg);
        }
        openQuotes = openQuotes || [LQ1, LQ2, LQ3, LQ4];
        closeQuotes = closeQuotes || [RQ1, RQ2, RQ3, RQ4];
        apostrophe = APOS;
        break;
      }
    }

    // Distinct quote marks in first-seen order. Each one is escaped so
    // that caller-supplied marks containing regex metacharacters
    // (e.g. "(" or "[") are matched literally.
    const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const allQuotes = [...new Set([...openQuotes, ...closeQuotes])];
    const quotePattern = `(${allQuotes.map(escapeRegExp).join('|')})`;
    const rexSplit = new RegExp(quotePattern);
    const rexQuotes = new RegExp(quotePattern, 'g');

    // Pad both lists up to maxLevel by repeating the previous level's mark.
    for (let i = 1; i < maxLevel; i++) {
      openQuotes[i] = openQuotes[i] || openQuotes[i - 1];
      closeQuotes[i] = closeQuotes[i] || closeQuotes[i - 1];
    }

    const rexPreApos = QuoteParser.loadApostrophe(lang);
    if (rexPreApos) {
      this.rexPreApos = rexPreApos;
    }

    Object.assign(this, {
      apostrophe,
      closeQuotes,
      lang,
      level,
      openQuotes,
      rexQuotes,
      rexSplit,
      maxLevel,
      quotes,
    });
    dbg && console.log(msg, lang);
  }

  /**
   * Build a regex matching text that ends with a word which takes an
   * apostrophe, from `glossary/apostrophe_<majorLang>.txt` next to this
   * module. The file is read as one entry per line, each written with
   * a trailing right single quote.
   *
   * @param {string} lang Language code; only the part before the first
   *   "-" selects the file.
   * @returns {RegExp|undefined} A case-insensitive global regex, or
   *   undefined when the file is missing, unreadable, empty, or does
   *   not yield a valid pattern.
   */
  static loadApostrophe(lang) {
    const msg = 'QuoteParser.loadApostrophe()';
    const dbg = 0;
    const [majorLang] = lang.split('-');
    const fname = `apostrophe_${majorLang}.txt`;
    const fpath = path.join(__dirname, './glossary', fname);
    let rex;
    try {
      const text = fs.readFileSync(fpath).toString().trim();
      // Split on line breaks and strip the trailing apostrophe from each
      // entry. Splitting on "’\n" after trim() would leave the apostrophe
      // on the last entry, which could then never match. Blank entries
      // are dropped because an empty alternative matches every string.
      const entries = text
        .split(/\r?\n/)
        .map((line) => (line.endsWith('’') ? line.slice(0, -1) : line))
        .filter((line) => line !== '');
      if (entries.length > 0) {
        const alternatives = entries.map((entry) => `\\b${entry}$`);
        rex = new RegExp(alternatives.join('|'), 'ig');
      }
      dbg && console.log(msg, `[1]${lang}`, rex);
    } catch (e) {
      // Best effort: a language without a glossary simply has no
      // pre-apostrophe regex.
      dbg && console.warn(msg, '[2]No apostrophe info:', fname);
    }
    return rex;
  }

  /**
   * Fixture text nested from level 2 to level 4 using the DeepL
   * placeholders; the level-1 opening mark is assumed to be in a
   * preceding segment and only the level-1 closing mark is emitted.
   *
   * @param {string} lang Word interpolated into the innermost quote.
   * @returns {string}
   */
  static testcaseQ2EN(lang) {
    const { LQ2, LQ3, LQ4, RQ1, RQ2, RQ3, RQ4 } = QuoteParser;
    return [
      // LQ1 in preceding segment
      `${LQ2}I say, `,
      `${LQ3}You say, `,
      `${LQ4}I said ${lang}!${RQ4}`,
      `?${RQ3}.`,
      `${RQ2}`,
      `${RQ1}`, // closing
    ].join('');
  }

  /**
   * Fixture text nested four levels deep using the DeepL placeholders.
   *
   * @param {string} lang Word interpolated into the innermost quote.
   * @returns {string}
   */
  static testcaseDepthEN(lang) {
    const { LQ1, LQ2, LQ3, LQ4, RQ1, RQ2, RQ3, RQ4 } = QuoteParser;
    return [
      `${LQ1}`,
      `${LQ2}I say, `,
      `${LQ3}You say, `,
      `${LQ4}I said ${lang}!${RQ4}`,
      `?${RQ3}.`,
      `${RQ2}`,
      `${RQ1}`,
    ].join('');
  }

  /**
   * Fixture sentence wrapped in the given quote marks.
   *
   * @param {object} [opts]
   * @param {string} [opts.lQuote=''] Text placed before the sentence.
   * @param {string} [opts.rQuote=''] Text placed after the sentence.
   * @returns {string}
   */
  static testcaseElderlyEN(opts = {}) {
    const { lQuote = '', rQuote = '' } = opts;
    return [
      lQuote,
      `Mister, did you not see among human beings an `,
      `elderly woman or a man—eighty, ninety, or a hundred `,
      `years old—bent double, crooked, leaning on a staff, `,
      `trembling as they walk, ailing, past their prime, `,
      `with teeth broken, hair grey and scanty or bald, `,
      `skin wrinkled, and limbs blotchy?`,
      rQuote,
    ].join('');
  }

  /**
   * Fixture text with two apostrophes, one opening mark and two
   * closing marks separated by "?".
   *
   * @param {object} [opts]
   * @param {string} [opts.lang='sickness'] Interpolated word.
   * @param {string} [opts.lQuote1=''] Opening mark.
   * @param {string} [opts.rQuote1=''] Closing mark placed before "?".
   * @param {string} [opts.rQuote2=''] Closing mark placed after "?".
   * @param {string} [opts.apos="'"] Apostrophe character.
   * @returns {string}
   */
  static testcaseSickEN(opts = {}) {
    const {
      lang = 'sickness',
      lQuote1 = '',
      rQuote1 = '',
      rQuote2 = '',
      apos = "'",
    } = opts;
    return [
      lQuote1,
      `I, too, am liable to become sick. I${apos}m not exempt `,
      `from ${lang}. I${apos}d better do good by way of body, `,
      `speech, and mind`,
      rQuote1,
      '?',
      rQuote2,
    ].join('');
  }

  /**
   * Fixture question wrapped in the given quote marks.
   *
   * @param {object} [opts]
   * @param {string} [opts.lang='messenger'] Interpolated word.
   * @param {string} [opts.lQuote=''] Opening mark.
   * @param {string} [opts.rQuote=''] Closing mark.
   * @param {string} [opts.gods='gods'] Interpolated word.
   * @returns {string}
   */
  static testcaseMisterEN(opts = {}) {
    const {
      lang = 'messenger',
      lQuote = '',
      rQuote = '',
      gods = 'gods',
    } = opts;
    return [
      lQuote,
      `Mister, did you not see the first ${lang} of the ${gods} that appeared among human beings?`,
      rQuote,
    ].join('');
  }

  /**
   * Fixture sentence wrapped in the given quote marks.
   *
   * @param {object} [opts]
   * @param {string} [opts.lang='mind'] Interpolated word.
   * @param {string} [opts.lQuote=''] Opening mark.
   * @param {string} [opts.rQuote=''] Closing mark.
   * @returns {string}
   */
  static testcaseQuotesEN(opts = {}) {
    const { lang = 'mind', lQuote = '', rQuote = '' } = opts;
    return `${lQuote}Listen and apply your ${lang} well, I will speak.${rQuote}`;
  }

  /**
   * Fixture text with one apostrophe, wrapped in the given quote marks.
   * The three lines are joined with single spaces and the result ends
   * with a space.
   *
   * @param {object} [opts]
   * @param {string} [opts.lang='religious'] Interpolated word.
   * @param {string} [opts.people='kinds of people'] Interpolated phrase.
   * @param {string} [opts.lQuote=''] Opening mark.
   * @param {string} [opts.rQuote=''] Closing mark.
   * @param {string} [opts.apos="'"] Apostrophe character.
   * @returns {string}
   */
  static testcaseDonationEN(opts = {}) {
    const {
      lang = 'religious',
      people = 'kinds of people',
      lQuote = '',
      rQuote = '',
      apos = "'",
    } = opts;
    return [
      `${lQuote}These are two ${people} in the world`,
      `who are worthy of a ${lang} donation,`,
      `and that${apos}s where you should give a gift.${rQuote} `,
    ].join(' ');
  }

  /**
   * Fixture text with three quoted phrases separated by ellipses.
   *
   * @param {string} lang Word interpolated into the first phrase.
   * @param {object} [opts] Options; when omitted the class itself is
   *   used as the options object, so every option takes its default.
   * @param {string} [opts.prefix='They understand: ']
   * @param {string} [opts.lQuote] Opening mark (default left double quote).
   * @param {string} [opts.rQuote] Closing mark (default right double quote).
   * @param {string} [opts.ellipsis] Separator (default space-padded ellipsis).
   * @returns {string}
   */
  static testcaseEllipsisEN(lang, opts = QuoteParser) {
    const {
      prefix = 'They understand: ',
      lQuote = LDQUOT,
      rQuote = RDQUOT,
      ellipsis = ` ${ELLIPSIS} `,
    } = opts;
    return [
      prefix,
      lQuote,
      `This is ${lang}`,
      rQuote,
      ellipsis,
      lQuote,
      'This is suffering',
      rQuote,
      ellipsis,
      lQuote,
      'This is the origin',
      rQuote,
      '.',
    ].join('');
  }

  /**
   * Fixture text with a quoted phrase and two straight apostrophes.
   *
   * @param {string} lang Interpolated word.
   * @param {object} [opts]
   * @param {string} [opts.lQuote] Opening mark (default DeepL level 1).
   * @param {string} [opts.rQuote] Closing mark (default DeepL level 1).
   * @returns {string}
   */
  static testcaseThinking_EN(lang, opts = {}) {
    const { lQuote = QuoteParser.LQ1, rQuote = QuoteParser.RQ1 } = opts;

    return [
      `Thinking, `,
      `${lQuote}I${APOS}ve done ${lang} `,
      `things by way of body, speech, and mind`,
      `${rQuote}, they${APOS}re mortified.`,
    ].join('');
  }

  // Read-only access to the module's character constants.
  static get APOS() {
    return APOS;
  }
  static get ELLIPSIS() {
    return ELLIPSIS;
  }
  static get ELL() {
    return ELL;
  }
  static get LDQUOT() {
    return LDQUOT;
  }
  static get RDQUOT() {
    return RDQUOT;
  }
  static get LSQUOT() {
    return LSQUOT;
  }
  static get RSQUOT() {
    return RSQUOT;
  }
  static get LGUIL() {
    return LGUIL;
  }
  static get RGUIL() {
    return RGUIL;
  }
  static get LDGUIL() {
    return LDGUIL;
  }
  static get RDGUIL() {
    return RDGUIL;
  }
  static get NBSP() {
    return NBSP;
  }
  static get THNSP() {
    return THNSP;
  }
  static get QUOTE() {
    return QUOTE;
  }

  // Read-only access to the DeepL placeholder quotes.
  static get LQ1() {
    return LQ1;
  }
  static get LQ2() {
    return LQ2;
  }
  static get LQ3() {
    return LQ3;
  }
  static get LQ4() {
    return LQ4;
  }
  static get RQ1() {
    return RQ1;
  }
  static get RQ2() {
    return RQ2;
  }
  static get RQ3() {
    return RQ3;
  }
  static get RQ4() {
    return RQ4;
  }

  /**
   * Fixture: ...APOS... (starts with a literal left single quote).
   *
   * @param {string} lang Interpolated word.
   * @returns {string}
   */
  testcaseGratificationEN(lang) {
    const apos = this.apostrophe;

    return (
      `‘But reverends, what${apos}s the gratification, ` +
      `the drawback, and the escape when it comes to ${lang}`
    );
  }

  /**
   * Fixture: ...APOS... (plural possessive).
   *
   * @param {string} lang Interpolated word.
   * @returns {string}
   */
  testcasePleasuresEN(lang) {
    const apos = this.apostrophe;

    return (
      `understand ${lang} pleasures${apos} ` +
      `gratification, drawback, and escape`
    );
  }

  /**
   * Fixture: ...APOS...RQ2 RQ1
   *
   * @param {string} lang Interpolated word.
   * @returns {string}
   */
  testcaseSquirrelsEN(lang) {
    const apos = this.apostrophe;
    const [RQ1, RQ2] = this.closeQuotes;

    return `the ${lang} squirrels${apos} feeding ground${RQ2}${RQ1}`;
  }

  /**
   * Fixture: LQ2 LQ3 ... RQ3 RQ2 ? RQ1 (no level-1 opening mark).
   *
   * @param {string} lang Interpolated word.
   * @returns {string}
   */
  testcaseRebirthEN(lang) {
    const [, LQ2, LQ3] = this.openQuotes;
    const [RQ1, RQ2, RQ3] = this.closeQuotes;
    return [
      `${LQ2}I understand: `,
      `${LQ3}`,
      `Rebirth is ended in ${lang}`,
      `${RQ3}`,
      `${RQ2}`,
      '?',
      `${RQ1}`,
    ].join('');
  }

  /**
   * Fixture: ...APOS... RQ2
   *
   * @param {string} lang Interpolated word.
   * @returns {string}
   */
  testcaseFeelingsEN(lang) {
    const apos = this.apostrophe;
    const [, RQ2] = this.closeQuotes;
    return `what${apos}s the escape from that ${lang} feeling?${RQ2}`;
  }

  /**
   * Fixture: ... RQ1 followed by a space. The apostrophe in "don't" is
   * always the straight one, independent of this.apostrophe.
   *
   * @param {string} lang Interpolated word.
   * @returns {string}
   */
  testcaseReligionsEN(lang) {
    const [RQ1] = this.closeQuotes;
    return `Why don't we visit ${lang} religions?${RQ1} `;
  }

  /**
   * Fixture: English possessive using this parser's apostrophe.
   *
   * @param {string} lang Interpolated word.
   * @returns {string}
   */
  testcaseApostropheEN(lang) {
    const { apostrophe } = this;
    return `The ${lang} child${apostrophe}s toy`;
  }

  /**
   * Fixture: French elision using this parser's apostrophe.
   *
   * @param {string} lang Interpolated word.
   * @returns {string}
   */
  testcaseApostropheFR(lang) {
    const { apostrophe } = this;
    return `Le jouet de l${apostrophe}enfant ${lang}`;
  }

  /**
   * Walk every quote mark in the text and compute the resulting
   * nesting level. Does not modify the parser's own level/quotes, and
   * does not treat any mark as an apostrophe.
   *
   * @param {string} text Text to scan.
   * @param {number} [level=this.level] Nesting level at the start.
   * @returns {{level: number, quotes: number}} Final nesting level and
   *   number of quote marks found.
   * @throws {Error} on a close quote below level 0, nesting beyond
   *   maxLevel, or a mark that is neither the expected close nor the
   *   expected open quote for the current level.
   */
  scan(text, level = this.level) {
    const msg = 'QuoteParser.scan()';
    const dbg = DBG.QUOTE;
    const dbgv = DBG.VERBOSE && dbg;
    const { rexQuotes, openQuotes, closeQuotes, maxLevel } = this;
    let quotes = 0;

    // rexQuotes is a shared /g regex. A previous scan that threw midway
    // leaves lastIndex inside its text, so always start from the top.
    rexQuotes.lastIndex = 0;
    for (
      let execRes = rexQuotes.exec(text);
      execRes != null;
      execRes = rexQuotes.exec(text)
    ) {
      const match = execRes[0];
      dbgv && console.log(msg, match);
      quotes++;
      if (match === closeQuotes[level - 1]) {
        level--;
        if (level < 0) {
          const emsg = `${msg} unmatched close quote: ${text}`;
          console.warn(msg, emsg);
          throw new Error(emsg);
        }
      } else if (match === openQuotes[level]) {
        level++;
        if (maxLevel < level) {
          const emsg = `${msg} quote nesting exceeded: ${text}`;
          console.warn(msg, emsg);
          throw new Error(emsg);
        }
      } else {
        const emsg = `${msg} invalid quote [${match}] for level:${level}`;
        console.warn(msg, emsg);
        throw new Error(emsg);
      }
    }

    return {
      level,
      quotes,
    };
  }

  /**
   * Scan the text and fold the outcome into this parser's state:
   * `level` becomes the final level and `quotes` is incremented.
   *
   * @param {string} text Text to scan.
   * @param {number} [level] Starting level (defaults as in scan()).
   * @returns {{level: number, quotes: number}} The scan() result.
   * @throws {Error} whatever scan() throws; state is left unchanged.
   */
  parse(text, level) {
    const msg = 'QuoteParser.parse()';
    const dbg = DBG.QUOTE;
    const dState = this.scan(text, level);
    if (dbg) {
      console.log(msg, dState);
    }

    this.level = dState.level;
    this.quotes += dState.quotes;

    return dState;
  }

  /**
   * Decide whether a right single quote is an apostrophe rather than a
   * closing quote.
   *
   * @param {Array<string>} context `[before, quote, after]`: the text
   *   preceding the mark, the mark itself, and the text following it.
   * @returns {boolean} false if the mark is not a right single quote;
   *   true if `before` matches the language's pre-apostrophe regex;
   *   false if nothing follows the mark; otherwise true exactly when
   *   `after` starts with a word character.
   */
  isApostrophe(context) {
    const msg = 'QuoteParser.isApostrophe()';
    const dbg = 0;
    const [before, quote, after] = context;
    const { rexPreApos } = this;

    if (quote !== RSQUOT) {
      dbg && console.log(msg, '[1]!RSQUOT', context);
      return false;
    }
    if (rexPreApos) {
      // rexPreApos carries the /g flag, so test() resumes from
      // lastIndex. Reset it, or a match on one string makes the next
      // call start past the end of a shorter string and fail.
      rexPreApos.lastIndex = 0;
      if (rexPreApos.test(before)) {
        dbg && console.log(msg, '[2]rexPreApos', before);
        return true;
      }
    }
    if (after === '') {
      dbg && console.log(msg, '[3]$', context);
      return false;
    }
    if (RE_POST_APOS.test(after)) {
      dbg && console.log(msg, '[4]RE_POST_APOS', context);
      return true;
    }

    dbg && console.log(msg, '[5]RSQUOT', context);
    return false;
  }

  /**
   * Rewrite this parser's quote marks and apostrophes in `text` into
   * those of another parser, level by level. Updates `this.level` to
   * the level reached at the end of the text.
   *
   * NOTE(review): the default for qpSwap is `text`, which has no
   * openQuotes/closeQuotes/apostrophe; callers are expected to pass a
   * QuoteParser. The default is kept for interface stability.
   *
   * @param {string} [text=''] Source text.
   * @param {QuoteParser} [qpSwap] Parser supplying the target marks.
   *   When null, `text` is returned unchanged.
   * @param {number} [level=this.level] Nesting level at the start.
   * @returns {string} Converted text.
   * @throws {Error} on an unmatched close quote or nesting beyond
   *   maxLevel.
   */
  convertQuotes(text = '', qpSwap = text, level = this.level) {
    const msg = 'QuoteParser.convertQuotes()';
    const dbg = 0 || DBG.VERBOSE;
    const {
      openQuotes: srcOpen,
      closeQuotes: srcClose,
      apostrophe: srcApos,
      rexSplit,
      maxLevel,
    } = this;
    if (qpSwap == null || text == '') {
      return text;
    }
    const {
      openQuotes: swapOpen,
      closeQuotes: swapClose,
      apostrophe: swapApos,
    } = qpSwap;

    // Splitting on a capturing regex puts plain text at even indices
    // and quote marks at odd indices.
    const srcParts = text.split(rexSplit);
    dbg && console.log(msg, '[1]srcParts', srcParts);
    const dstParts = [];
    for (let i = 0; i < srcParts.length; i++) {
      let part = srcParts[i];
      const srcOpenQuote = srcOpen[level];
      const srcCloseQuote = srcClose[level - 1];

      if (i % 2 === 0) {
        dbg && console.log(msg, `[2]text@${i}`, part);
      } else if (part === srcCloseQuote) {
        const following = srcParts[i + 1];
        const context = [srcParts[i - 1], part, following];
        if (this.isApostrophe(context)) {
          // Left as-is here; apostrophes are swapped after the loop.
          dbg &&
            console.log(msg, `[3]apos@${i}`, { part, nextPart: following });
        } else {
          dbg &&
            console.log(msg, `[4]close@${i}`, { part, nextPart: following });
          level--;
          part = swapClose[level];
          if (level < 0) {
            const emsg = `${msg} unmatched close quote: ${text}`;
            console.warn(msg, emsg);
            throw new Error(emsg);
          }
        }
      } else if (part === srcOpenQuote) {
        dbg && console.log(msg, `[5]open@${i}`, part);
        part = swapOpen[level];
        level++;
        if (maxLevel < level) {
          const emsg = `${msg} quote nesting exceeded: ${text}`;
          console.warn(msg, emsg);
          throw new Error(emsg);
        }
      } else {
        dbg && console.log(msg, `[6]skip@${i}`, level, `"${part}"`);
        // not a quote
      }
      dstParts.push(part);
    }
    this.level = level;

    // Replace every remaining source apostrophe with the target one.
    const aposParts = dstParts.join('').split(srcApos);
    dbg &&
      console.log(
        msg,
        '[7]aposParts',
        aposParts,
        swapApos.charCodeAt(0),
      );

    return aposParts.join(swapApos);
  }

  /**
   * Check whether the quote marks in `text` are consistent with the
   * given starting level. Stops at the first mark that is neither the
   * expected open quote, the expected close quote, nor an apostrophe.
   *
   * @param {string} [text='']
   * @param {number} [startLevel=0]
   * @returns {{error: (Error|undefined), startLevel: number,
   *   endLevel: number}} `error` is set when the marks do not fit.
   */
  #checkQuoteLevel(text = '', startLevel = 0) {
    const msg = `qp-${this.lang}.#checkQuoteLevel()`;
    const dbg = DBG.QUOTE;
    const dbgv = dbg && DBG.VERBOSE;
    const { rexSplit, openQuotes, closeQuotes } = this;
    const parts = text.split(rexSplit);
    let endLevel = startLevel;
    let error;

    if (parts.length === 1) {
      dbg &&
        console.log(
          msg,
          `[1]no-quotes${startLevel}`,
          text.substring(0, 50),
          '...',
        );
    }
    for (let i = 1; !error && i < parts.length; i += 2) {
      const part = parts[i]; // parts with odd indices are quotes
      const context = [parts[i - 1], part, parts[i + 1]];

      if (part === openQuotes[endLevel]) {
        // sync ok
        endLevel++;
        dbg &&
          console.log(msg, `[2]open${endLevel}`, context.join('|'));
      } else if (part === closeQuotes[endLevel - 1]) {
        if (this.isApostrophe(context)) {
          dbgv && console.log(msg, `[3]apos`, context.join('|'));
        } else {
          dbg &&
            console.log(
              msg,
              `[4]close${endLevel}`,
              context.join('|'),
            );
          endLevel--;
        }
      } else if (this.isApostrophe(context)) {
        dbgv && console.log(msg, `[5]apos`, context.join('|'));
      } else {
        // sync fail
        const emsg = `${msg} ERROR [${startLevel}?${text}]`;
        dbg &&
          console.log(msg, `[6]SYNC?`, { startLevel, i, context });
        error = new Error(emsg);
      }
    }
    return {
      error,
      startLevel,
      endLevel,
    };
  }

  /**
   * Find a starting level at which the quote marks in `text` are
   * consistent. `startLevel` is tried first, then the other levels
   * below maxLevel in wrap-around order. A log line is written
   * whenever the first attempt fails.
   *
   * @param {string} [text='']
   * @param {number} [startLevel=0] Preferred starting level.
   * @returns {{error: undefined, startLevel: number, endLevel: number}}
   *   The first consistent check; `startLevel` is the level that worked.
   * @throws {Error} when no starting level fits the text.
   */
  syncQuoteLevel(text = '', startLevel = 0) {
    const msg = `qp-${this.lang}.syncQuoteLevel()`;
    const { maxLevel } = this;
    let check = this.#checkQuoteLevel(text, startLevel);
    if (!check.error) {
      return check;
    }

    for (let i = 1; check.error && i < maxLevel; i++) {
      const tryLevel = (startLevel + i) % maxLevel;
      check = this.#checkQuoteLevel(text, tryLevel);
    }
    if (check.error) {
      // Could not synchronize quotes. Source document error
      console.log(msg, '[1]SYNC?!', `level ${startLevel}=>ERROR`);
      throw check.error;
    }

    // Synchronized, but source document might be in error
    console.log(
      msg,
      '[2]SYNC?',
      `level ${startLevel}=>${check.startLevel}`,
      `\n  |${text}|`,
    );
    return check;
  }
}