@sc-voice/tools 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.mjs +21 -6
- package/package.json +2 -1
- package/src/defines.mjs +6 -1
- package/src/graph/sankey.mjs +56 -0
- package/src/text/ebt-doc.mjs +6 -2
- package/src/text/legacy-doc.mjs +29 -2
- package/src/text/word-space.mjs +83 -54
- package/src/{text → translate}/aligner.mjs +127 -38
- package/src/translate/deepl-adapter.mjs +353 -0
- package/src/translate/dpd-transformer.mjs +17 -0
- package/src/translate/mock-deepl.mjs +351 -0
- package/src/translate/quote-parser.mjs +681 -0
|
@@ -0,0 +1,681 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
// ES modules have no built-in __filename/__dirname; derive them from the
// module URL so files can be located relative to this source file.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Typographic characters, spelled as escapes so they survive any editor
// or transport encoding. The glyph is shown in each trailing comment.
const QUOTE = '\u201c'; // “ Quotation mark
const APOS = "'"; // Apostrophe/single-quote
const LSQUOT = '\u2018'; // ‘ Left single quote
const RSQUOT = '\u2019'; // ’ Right single quote, curly apostrophe
const LDGUIL = '\u00ab'; // « Left double guillemet
const RDGUIL = '\u00bb'; // » Right double guillemet
const LGUIL = '\u2039'; // ‹ Left guillemet
const RGUIL = '\u203a'; // › Right guillemet
const NBSP = '\u00a0'; // non-breaking space
const THNSP = '\u2009'; // thin space
const LDQUOT = '\u201c'; // “ Left double quote
const RDQUOT = '\u201d'; // ” Right double quote
const ELLIPSIS = '\u2026'; // … Horizontal ellipsis

// Regex alternation of the quote characters: «|»|“|”|‘|’
const FR_QUOTES = [LDGUIL, RDGUIL, LDQUOT, RDQUOT, LSQUOT, RSQUOT].join('|');
// A word character immediately after a right single quote
const RE_POST_APOS = /^\w/;

// Deepl
// XML placeholders used for quotes at nesting levels 1-4.
const LQ1 = '<l1/>';
const LQ2 = '<l2/>';
const LQ3 = '<l3/>';
const LQ4 = '<l4/>';
// Closing placeholders carry a leading space:
// DeepL deletes trailing XML elements
const RQ1 = ' <r1/>';
const RQ2 = ' <r2/>';
const RQ3 = ' <r3/>';
const RQ4 = ' <r4/>';
const ELL = '<ell/>';
|
|
34
|
+
|
|
35
|
+
import { DBG } from '../defines.mjs';
|
|
36
|
+
|
|
37
|
+
/**
 * Tracks nested quotation marks for one language convention and converts
 * quoted text to the convention of another QuoteParser.
 *
 * Each nesting level has its own open/close quote (`openQuotes[level]`,
 * `closeQuotes[level]`). The instance remembers the current nesting
 * `level` and the number of `quotes` parsed so far, so consecutive text
 * segments can be processed one after another.
 */
export class QuoteParser {
  /**
   * @param {object} [opts]
   * @param {string} [opts.lang='en'] - language code (case-insensitive).
   *   Supported: en, en-us, en-uk, pt-br, nl, it, es, pt, pt-pt, fr,
   *   fr-eu, and any code ending in `-deepl` (XML placeholder quotes).
   * @param {Iterable<string>} [opts.openQuotes] - opening quote per
   *   level; overrides the language default.
   * @param {Iterable<string>} [opts.closeQuotes] - closing quote per
   *   level; overrides the language default.
   * @param {string} [opts.apostrophe] - apostrophe character; defaults to
   *   the right single quote. `-deepl` languages always use the ASCII
   *   apostrophe.
   * @param {number} [opts.level=0] - initial nesting level.
   * @param {number} [opts.maxLevel=4] - deepest permitted nesting.
   * @param {number} [opts.quotes=0] - initial parsed-quote counter.
   * @throws {Error} if `lang` is not supported.
   */
  constructor(opts = {}) {
    const msg = 'QuoteParser.ctor()';
    const dbg = DBG.QUOTE;
    let {
      lang = 'en',
      openQuotes,
      closeQuotes,
      apostrophe = RSQUOT,
      level = 0,
      maxLevel = 4,
      quotes = 0,
    } = opts;

    lang = lang.toLowerCase();
    // Copy caller-supplied lists: they are padded in place below.
    openQuotes = openQuotes && [...openQuotes];
    closeQuotes = closeQuotes && [...closeQuotes];

    switch (lang) {
      case 'en-uk': // UK quote nesting
        openQuotes = openQuotes || [LSQUOT, LDQUOT, LSQUOT, LDQUOT];
        closeQuotes = closeQuotes || [RSQUOT, RDQUOT, RSQUOT, RDQUOT];
        break;
      case 'pt-br':
      case 'en-us':
      case 'en': // American quote nesting
        openQuotes = openQuotes || [LDQUOT, LSQUOT, LDQUOT, LSQUOT];
        closeQuotes = closeQuotes || [RDQUOT, RSQUOT, RDQUOT, RSQUOT];
        break;
      case 'nl':
        openQuotes = openQuotes || [LDQUOT, LDQUOT, LDQUOT, LDQUOT];
        closeQuotes = closeQuotes || [RDQUOT, RDQUOT, RDQUOT, RDQUOT];
        break;
      case 'it':
      case 'es':
      case 'pt':
      case 'pt-pt':
        openQuotes = openQuotes || [LDGUIL, LDQUOT, LSQUOT, LDQUOT];
        closeQuotes = closeQuotes || [RDGUIL, RDQUOT, RSQUOT, RDQUOT];
        break;
      case 'fr-eu':
        openQuotes = openQuotes || [LDGUIL + THNSP, LDQUOT, LSQUOT];
        closeQuotes = closeQuotes || [THNSP + RDGUIL, RDQUOT, RSQUOT];
        break;
      case 'fr':
        openQuotes = openQuotes || [
          LDGUIL + THNSP,
          LGUIL + THNSP,
          LDQUOT,
          LSQUOT,
        ];
        closeQuotes = closeQuotes || [
          THNSP + RDGUIL,
          THNSP + RGUIL,
          RDQUOT,
          RSQUOT,
        ];
        break;
      default:
        {
          if (lang.endsWith('-deepl')) {
            openQuotes = openQuotes || [LQ1, LQ2, LQ3, LQ4];
            closeQuotes = closeQuotes || [RQ1, RQ2, RQ3, RQ4];
            apostrophe = APOS;
          } else {
            let emsg = `${msg} unsupported language:${lang}`;
            throw new Error(emsg);
          }
        }
        break;
    }

    // Distinct quotes in first-seen order. A Set (rather than a plain
    // object used as a map) cannot collide with Object.prototype keys.
    const allQuotes = [...new Set([...openQuotes, ...closeQuotes])];
    // Quotes are literal text: escape them so that custom quotes such as
    // "(" or "[" cannot break or alter the regular expression.
    const pattern = allQuotes
      .map((q) => QuoteParser.#escapeRegExp(q))
      .join('|');
    const rexSplit = new RegExp(`(${pattern})`);
    const rexQuotes = new RegExp(`(${pattern})`, 'g');

    // Pad shorter lists up to maxLevel by repeating the previous level.
    for (let i = 0; i < maxLevel; i++) {
      openQuotes[i] = openQuotes[i] || openQuotes[i - 1];
      closeQuotes[i] = closeQuotes[i] || closeQuotes[i - 1];
    }

    const rexPreApos = QuoteParser.loadApostrophe(lang);
    if (rexPreApos) {
      this.rexPreApos = rexPreApos;
    }

    Object.assign(this, {
      apostrophe,
      closeQuotes,
      lang,
      level,
      openQuotes,
      rexQuotes,
      rexSplit,
      maxLevel,
      quotes,
    });
    dbg && console.log(msg, lang);
  }

  /**
   * Escapes regular-expression metacharacters so that the given text is
   * matched literally.
   * @param {string} literal
   * @returns {string}
   */
  static #escapeRegExp(literal) {
    return String(literal).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Loads `glossary/apostrophe_<majorLang>.txt` (next to this module) and
   * builds a regular expression that matches text ending in one of the
   * listed words, i.e. text after which a right single quote is an
   * apostrophe.
   *
   * NOTE(review): assumes the glossary lists one word per line, each
   * optionally followed by a right single quote -- confirm against the
   * glossary files. Entries are inserted into the pattern unescaped.
   *
   * Blank lines are ignored, CRLF line endings are accepted, and the
   * trailing quote is stripped from every entry including the last one.
   *
   * @param {string} lang - language code; only the part before the first
   *   `-` selects the file.
   * @returns {RegExp|undefined} case-insensitive global regex, or
   *   undefined when the file cannot be read, has no entries, or does not
   *   compile to a valid pattern.
   */
  static loadApostrophe(lang) {
    const msg = 'QuoteParser.loadApostrophe()';
    const dbg = 0;
    const majorLang = lang.split('-')[0];
    const fname = `apostrophe_${majorLang}.txt`;
    const fpath = path.join(__dirname, './glossary', fname);
    let rex;
    try {
      const words = fs
        .readFileSync(fpath)
        .toString()
        .split(/\r?\n/)
        .map((line) => {
          const word = line.trim();
          return word.endsWith(RSQUOT)
            ? word.slice(0, -RSQUOT.length)
            : word;
        })
        // An empty alternative would match every string.
        .filter((word) => word.length > 0);
      if (words.length > 0) {
        rex = new RegExp(
          words.map((word) => `\\b${word}$`).join('|'),
          'ig',
        );
      }
      dbg && console.log(msg, `[1]${lang}`, rex);
    } catch (e) {
      // Best effort: apostrophe glossaries are optional per language.
      dbg && console.warn(msg, '[2]No apostrophe info:', fname);
    }
    return rex;
  }

  /**
   * Sample text in DeepL placeholder form that starts at level 2 (the
   * level-1 opening quote belongs to a preceding segment) and closes all
   * the way out through level 1.
   * @param {string} lang - word embedded in the innermost quote.
   * @returns {string}
   */
  static testcaseQ2EN(lang) {
    const { LQ2, LQ3, LQ4, RQ1, RQ2, RQ3, RQ4 } = QuoteParser;
    return [
      // LQ1 in preceding segment
      `${LQ2}I say, `,
      `${LQ3}You say, `,
      `${LQ4}I said ${lang}!${RQ4}`,
      `?${RQ3}.`,
      `${RQ2}`,
      `${RQ1}`, // closing
    ].join('');
  }

  /**
   * Sample text in DeepL placeholder form that opens and closes all four
   * nesting levels.
   * @param {string} lang - word embedded in the innermost quote.
   * @returns {string}
   */
  static testcaseDepthEN(lang) {
    const { LQ1, LQ2, LQ3, LQ4, RQ1, RQ2, RQ3, RQ4 } = QuoteParser;
    return [
      `${LQ1}`,
      `${LQ2}I say, `,
      `${LQ3}You say, `,
      `${LQ4}I said ${lang}!${RQ4}`,
      `?${RQ3}.`,
      `${RQ2}`,
      `${RQ1}`,
    ].join('');
  }

  /**
   * Sample sentence wrapped in the given quotes.
   * @param {object} [opts]
   * @param {string} [opts.lQuote=''] - text placed before the sentence.
   * @param {string} [opts.rQuote=''] - text placed after the sentence.
   * @returns {string}
   */
  static testcaseElderlyEN(opts = {}) {
    const { lQuote = '', rQuote = '' } = opts;
    return [
      lQuote,
      `Mister, did you not see among human beings an `,
      `elderly woman or a man—eighty, ninety, or a hundred `,
      `years old—bent double, crooked, leaning on a staff, `,
      `trembling as they walk, ailing, past their prime, `,
      `with teeth broken, hair grey and scanty or bald, `,
      `skin wrinkled, and limbs blotchy?`,
      rQuote,
    ].join('');
  }

  /**
   * Sample text with two contractions, one opening quote and two closing
   * quotes separated by a question mark.
   * @param {object} [opts]
   * @param {string} [opts.lang='sickness'] - embedded word.
   * @param {string} [opts.lQuote1=''] - opening quote.
   * @param {string} [opts.rQuote1=''] - closing quote before the `?`.
   * @param {string} [opts.rQuote2=''] - closing quote after the `?`.
   * @param {string} [opts.apos="'"] - apostrophe used in contractions.
   * @returns {string}
   */
  static testcaseSickEN(opts = {}) {
    const {
      lang = 'sickness',
      lQuote1 = '',
      rQuote1 = '',
      rQuote2 = '',
      apos = "'",
    } = opts;
    return [
      lQuote1,
      `I, too, am liable to become sick. I${apos}m not exempt `,
      `from ${lang}. I${apos}d better do good by way of body, `,
      `speech, and mind`,
      rQuote1,
      '?',
      rQuote2,
    ].join('');
  }

  /**
   * Sample question wrapped in the given quotes.
   * @param {object} [opts]
   * @param {string} [opts.lang='messenger'] - first embedded word.
   * @param {string} [opts.gods='gods'] - second embedded word.
   * @param {string} [opts.lQuote=''] - opening quote.
   * @param {string} [opts.rQuote=''] - closing quote.
   * @returns {string}
   */
  static testcaseMisterEN(opts = {}) {
    const {
      lang = 'messenger',
      lQuote = '',
      rQuote = '',
      gods = 'gods',
    } = opts;
    return [
      lQuote,
      `Mister, did you not see the first ${lang} of the ${gods} that appeared among human beings?`,
      rQuote,
    ].join('');
  }

  /**
   * Sample one-sentence quotation.
   * @param {object} [opts]
   * @param {string} [opts.lang='mind'] - embedded word.
   * @param {string} [opts.lQuote=''] - opening quote.
   * @param {string} [opts.rQuote=''] - closing quote.
   * @returns {string}
   */
  static testcaseQuotesEN(opts = {}) {
    const { lang = 'mind', lQuote = '', rQuote = '' } = opts;
    return `${lQuote}Listen and apply your ${lang} well, I will speak.${rQuote}`;
  }

  /**
   * Sample quotation with a contraction. The result ends with a space.
   * @param {object} [opts]
   * @param {string} [opts.lang='religious'] - embedded word.
   * @param {string} [opts.people='kinds of people'] - embedded phrase.
   * @param {string} [opts.lQuote=''] - opening quote.
   * @param {string} [opts.rQuote=''] - closing quote.
   * @param {string} [opts.apos="'"] - apostrophe used in the contraction.
   * @returns {string}
   */
  static testcaseDonationEN(opts = {}) {
    const {
      lang = 'religious',
      people = 'kinds of people',
      lQuote = '',
      rQuote = '',
      apos = "'",
    } = opts;
    return [
      `${lQuote}These are two ${people} in the world`,
      `who are worthy of a ${lang} donation,`,
      `and that${apos}s where you should give a gift.${rQuote} `,
    ].join(' ');
  }

  /**
   * Sample text with three quotations separated by ellipses.
   * @param {string} lang - word embedded in the first quotation.
   * @param {object} [opts]
   * @param {string} [opts.prefix='They understand: ']
   * @param {string} [opts.lQuote] - defaults to the left double quote.
   * @param {string} [opts.rQuote] - defaults to the right double quote.
   * @param {string} [opts.ellipsis] - separator; defaults to an ellipsis
   *   with a space on each side.
   * @returns {string}
   */
  static testcaseEllipsisEN(lang, opts = {}) {
    const {
      prefix = 'They understand: ',
      lQuote = LDQUOT,
      rQuote = RDQUOT,
      ellipsis = ` ${ELLIPSIS} `,
    } = opts;
    return [
      prefix,
      lQuote,
      `This is ${lang}`,
      rQuote,
      ellipsis,
      lQuote,
      'This is suffering',
      rQuote,
      ellipsis,
      lQuote,
      'This is the origin',
      rQuote,
      '.',
    ].join('');
  }

  /**
   * Sample text with ASCII-apostrophe contractions inside and outside a
   * quotation.
   * @param {string} lang - embedded word.
   * @param {object} [opts]
   * @param {string} [opts.lQuote] - defaults to the level-1 DeepL opener.
   * @param {string} [opts.rQuote] - defaults to the level-1 DeepL closer.
   * @returns {string}
   */
  static testcaseThinking_EN(lang, opts = {}) {
    const { lQuote = QuoteParser.LQ1, rQuote = QuoteParser.RQ1 } = opts;

    return [
      `Thinking, `,
      `${lQuote}I${APOS}ve done ${lang} `,
      `things by way of body, speech, and mind`,
      `${rQuote}, they${APOS}re mortified.`,
    ].join('');
  }

  // Read-only access to the module's character constants.
  static get APOS() { return APOS; }
  static get ELLIPSIS() { return ELLIPSIS; }
  static get ELL() { return ELL; }
  static get LDQUOT() { return LDQUOT; }
  static get RDQUOT() { return RDQUOT; }
  static get LSQUOT() { return LSQUOT; }
  static get RSQUOT() { return RSQUOT; }
  static get LGUIL() { return LGUIL; }
  static get RGUIL() { return RGUIL; }
  static get LDGUIL() { return LDGUIL; }
  static get RDGUIL() { return RDGUIL; }
  static get NBSP() { return NBSP; }
  static get THNSP() { return THNSP; }
  static get QUOTE() { return QUOTE; }
  static get LQ1() { return LQ1; }
  static get LQ2() { return LQ2; }
  static get LQ3() { return LQ3; }
  static get LQ4() { return LQ4; }
  static get RQ1() { return RQ1; }
  static get RQ2() { return RQ2; }
  static get RQ3() { return RQ3; }
  static get RQ4() { return RQ4; }

  /**
   * Sample text: ...APOS...
   * @param {string} lang - embedded word.
   * @returns {string}
   */
  testcaseGratificationEN(lang) {
    const apos = this.apostrophe;

    return (
      `‘But reverends, what${apos}s the gratification, ` +
      `the drawback, and the escape when it comes to ${lang}`
    );
  }

  /**
   * Sample text: ...APOS...
   * @param {string} lang - embedded word.
   * @returns {string}
   */
  testcasePleasuresEN(lang) {
    const apos = this.apostrophe;

    return (
      `understand ${lang} pleasures${apos} ` +
      `gratification, drawback, and escape`
    );
  }

  /**
   * Sample text: ...APOS...RQ2 RQ1
   * @param {string} lang - embedded word.
   * @returns {string}
   */
  testcaseSquirrelsEN(lang) {
    const apos = this.apostrophe;
    const [RQ1, RQ2] = this.closeQuotes;

    return `the ${lang} squirrels${apos} feeding ground${RQ2}${RQ1}`;
  }

  /**
   * Sample text: LQ2...LQ3...RQ3 RQ2 ? RQ1
   * @param {string} lang - embedded word.
   * @returns {string}
   */
  testcaseRebirthEN(lang) {
    const [, LQ2, LQ3] = this.openQuotes;
    const [RQ1, RQ2, RQ3] = this.closeQuotes;
    return [
      `${LQ2}I understand: `,
      `${LQ3}`,
      `Rebirth is ended in ${lang}`,
      `${RQ3}`,
      `${RQ2}`,
      '?',
      `${RQ1}`,
    ].join('');
  }

  /**
   * Sample text: ... RQ2
   * @param {string} lang - embedded word.
   * @returns {string}
   */
  testcaseFeelingsEN(lang) {
    const apos = this.apostrophe;
    const [, RQ2] = this.closeQuotes;
    return `what${apos}s the escape from that ${lang} feeling?${RQ2}`;
  }

  /**
   * Sample text: ... RQ1 (the result ends with a space; the contraction
   * always uses the ASCII apostrophe).
   * @param {string} lang - embedded word.
   * @returns {string}
   */
  testcaseReligionsEN(lang) {
    const [RQ1] = this.closeQuotes;
    return `Why don't we visit ${lang} religions?${RQ1} `;
  }

  /**
   * Sample English possessive using this parser's apostrophe.
   * @param {string} lang - embedded word.
   * @returns {string}
   */
  testcaseApostropheEN(lang) {
    const { apostrophe } = this;
    return `The ${lang} child${apostrophe}s toy`;
  }

  /**
   * Sample French elision using this parser's apostrophe.
   * @param {string} lang - embedded word.
   * @returns {string}
   */
  testcaseApostropheFR(lang) {
    const { apostrophe } = this;
    return `Le jouet de l${apostrophe}enfant ${lang}`;
  }

  /**
   * Walks every quote in `text` and computes the resulting nesting level
   * without changing the parser state. Scanning is strict: each quote
   * must be either the closing quote of the current level or the opening
   * quote of the next level. No apostrophe detection is performed here.
   *
   * @param {string} text
   * @param {number} [level=this.level] - nesting level at start of text.
   * @returns {{level: number, quotes: number}} level at end of text and
   *   number of quotes found.
   * @throws {Error} on an unexpected quote or when nesting exceeds
   *   `maxLevel`.
   */
  scan(text, level = this.level) {
    const msg = 'QuoteParser.scan()';
    const dbg = DBG.QUOTE;
    const dbgv = DBG.VERBOSE && dbg;
    const { rexQuotes, openQuotes, closeQuotes, maxLevel } = this;
    let quotes = 0;

    // rexQuotes is a shared /g regex: a previous scan that threw leaves
    // lastIndex mid-string, so always restart from the beginning.
    rexQuotes.lastIndex = 0;
    for (
      let execRes = rexQuotes.exec(text);
      execRes != null;
      execRes = rexQuotes.exec(text)
    ) {
      const match = execRes[0];
      dbgv && console.log(msg, match);
      quotes++;
      if (match === closeQuotes[level - 1]) {
        level--;
        // Defensive: at level 0 closeQuotes[-1] is undefined, so a close
        // quote at level 0 is reported as "invalid quote" below instead.
        if (level < 0) {
          let emsg = `${msg} unmatched close quote: ${text}`;
          console.warn(msg, emsg);
          throw new Error(emsg);
        }
      } else if (match === openQuotes[level]) {
        level++;
        if (maxLevel < level) {
          let emsg = `${msg} quote nesting exceeded: ${text}`;
          console.warn(msg, emsg);
          throw new Error(emsg);
        }
      } else {
        let emsg = `${msg} invalid quote [${match}] for level:${level}`;
        console.warn(msg, emsg);
        throw new Error(emsg);
      }
    }

    return {
      level,
      quotes,
    };
  }

  /**
   * Scans `text` and commits the result to the parser state: `level`
   * becomes the level at end of text and `quotes` is incremented.
   *
   * @param {string} text
   * @param {number} [level] - starting level; defaults to `this.level`.
   * @returns {{level: number, quotes: number}} the scan result.
   * @throws {Error} as for `scan()`; state is unchanged in that case.
   */
  parse(text, level) {
    const msg = 'QuoteParser.parse()';
    const dbg = DBG.QUOTE;
    const dState = this.scan(text, level);
    if (dbg) {
      console.log(msg, dState);
    }

    this.level = dState.level;
    this.quotes += dState.quotes;

    return dState;
  }

  /**
   * Decides whether a quote character is an apostrophe rather than a
   * closing quote. Checks, in order:
   * 1. anything other than the right single quote is not an apostrophe;
   * 2. text before the quote matching the language glossary is;
   * 3. a quote at the very end of the text is not;
   * 4. a quote directly followed by a word character is.
   *
   * @param {string[]} context - `[before, quote, after]`.
   * @returns {boolean}
   */
  isApostrophe(context) {
    const msg = 'QuoteParser.isApostrophe()';
    const dbg = 0;
    const [before, quote, after] = context;
    const { rexPreApos } = this;

    if (quote !== RSQUOT) {
      dbg && console.log(msg, '[1]!RSQUOT', context);
      return false;
    }
    if (rexPreApos) {
      // rexPreApos has the /g flag, which makes test() resume from the
      // end of the previous match. Restart so each call is independent.
      rexPreApos.lastIndex = 0;
      if (rexPreApos.test(before)) {
        dbg && console.log(msg, '[2]rexPreApos', before);
        return true;
      }
    }
    if (after === '') {
      dbg && console.log(msg, '[3]$', context);
      return false;
    }
    if (RE_POST_APOS.test(after)) {
      dbg && console.log(msg, '[4]RE_POST_APOS', context);
      return true;
    }

    dbg && console.log(msg, '[5]RSQUOT', context);
    return false;
  }

  /**
   * Rewrites the quotes and apostrophes of `text` from this parser's
   * convention to that of `qpSwap`, and stores the nesting level reached
   * at the end of the text in `this.level`.
   *
   * Quotes that do not fit the current nesting level are copied through
   * unchanged. After quote conversion every occurrence of this parser's
   * apostrophe in the result is replaced by the target's apostrophe.
   *
   * @param {string} [text='']
   * @param {QuoteParser} [qpSwap=this] - target convention. The default
   *   is an identity conversion; `null` returns `text` unchanged.
   * @param {number} [level=this.level] - nesting level at start of text.
   * @returns {string} converted text.
   * @throws {Error} when nesting exceeds `maxLevel`.
   */
  convertQuotes(text = '', qpSwap = this, level = this.level) {
    const msg = 'QuoteParser.convertQuotes()';
    const dbg = 0 || DBG.VERBOSE;
    const {
      openQuotes: srcOpen,
      closeQuotes: srcClose,
      apostrophe: srcApos,
      rexSplit,
      maxLevel,
    } = this;
    if (qpSwap == null || text == null || text === '') {
      return text;
    }
    const {
      openQuotes: swapOpen,
      closeQuotes: swapClose,
      // Without a target apostrophe, keep the source apostrophe rather
      // than joining on undefined (which would insert commas).
      apostrophe: swapApos = srcApos,
    } = qpSwap;

    // Splitting on a capturing regex puts plain text at even indices and
    // the matched quotes at odd indices.
    const srcParts = text.split(rexSplit);
    const dstParts = [];
    dbg && console.log(msg, '[1]srcParts', srcParts);
    for (let i = 0; i < srcParts.length; i++) {
      let part = srcParts[i];

      if (i % 2 === 0) {
        dbg && console.log(msg, `[2]text@${i}`, part);
      } else if (part === srcClose[level - 1]) {
        const nextPart = srcParts[i + 1];
        const context = [srcParts[i - 1], part, nextPart];
        if (this.isApostrophe(context)) {
          dbg && console.log(msg, `[3]apos@${i}`, { part, nextPart });
        } else {
          dbg &&
            console.log(msg, `[4]close@${i}`, { part, nextPart });
          level--;
          part = swapClose[level];
          // Defensive: srcClose[-1] is undefined, so this branch is not
          // entered at level 0.
          if (level < 0) {
            let emsg = `${msg} unmatched close quote: ${text}`;
            console.warn(msg, emsg);
            throw new Error(emsg);
          }
        }
      } else if (part === srcOpen[level]) {
        dbg && console.log(msg, `[5]open@${i}`, part);
        part = swapOpen[level];
        level++;
        if (maxLevel < level) {
          let emsg = `${msg} quote nesting exceeded: ${text}`;
          console.warn(msg, emsg);
          throw new Error(emsg);
        }
      } else {
        dbg && console.log(msg, `[6]skip@${i}`, level, `"${part}"`);
        // not a quote
      }
      dstParts.push(part);
    }
    this.level = level;

    const aposParts = dstParts.join('').split(srcApos);
    dbg &&
      console.log(
        msg,
        '[7]aposParts',
        aposParts,
        swapApos.charCodeAt(0),
      );

    return aposParts.join(swapApos);
  }

  /**
   * Checks whether the quotes in `text` are consistent with a given
   * starting level. Unlike `scan()`, apostrophes are recognized and
   * skipped, and a failure is returned rather than thrown.
   *
   * @param {string} [text='']
   * @param {number} [startLevel=0]
   * @returns {{error: (Error|undefined), startLevel: number,
   *   endLevel: number}} `error` is set at the first quote that fits
   *   neither the current level nor an apostrophe.
   */
  #checkQuoteLevel(text = '', startLevel = 0) {
    const msg = `qp-${this.lang}.#checkQuoteLevel()`;
    const dbg = DBG.QUOTE;
    const dbgv = dbg && DBG.VERBOSE;
    const { rexSplit, openQuotes, closeQuotes } = this;
    const parts = text.split(rexSplit);
    let endLevel = startLevel;
    let error;

    if (parts.length === 1) {
      dbg &&
        console.log(
          msg,
          `[1]no-quotes${startLevel}`,
          text.substring(0, 50),
          '...',
        );
    }
    for (let i = 1; !error && i < parts.length; i += 2) {
      const part = parts[i]; // parts with odd indices are quotes
      const context = [parts[i - 1], part, parts[i + 1]];

      if (part === openQuotes[endLevel]) {
        // sync ok
        endLevel++;
        dbg &&
          console.log(msg, `[2]open${endLevel}`, context.join('|'));
      } else if (part === closeQuotes[endLevel - 1]) {
        if (this.isApostrophe(context)) {
          dbgv && console.log(msg, `[3]apos`, context.join('|'));
        } else {
          dbg &&
            console.log(
              msg,
              `[4]close${endLevel}`,
              context.join('|'),
            );
          endLevel--;
        }
      } else if (this.isApostrophe(context)) {
        dbgv && console.log(msg, `[5]apos`, context.join('|'));
      } else {
        // sync fail
        let emsg = `${msg} ERROR [${startLevel}?${text}]`;
        dbg &&
          console.log(msg, `[6]SYNC?`, { startLevel, i, context });
        error = new Error(emsg);
      }
    }
    return {
      error,
      startLevel,
      endLevel,
    };
  }

  /**
   * Finds a starting level for which the quotes in `text` are
   * consistent. `startLevel` is tried first; if it fails, the remaining
   * levels below `maxLevel` are tried in cyclic order and a message is
   * logged when one of them succeeds.
   *
   * @param {string} [text='']
   * @param {number} [startLevel=0] - preferred starting level.
   * @returns {{error: undefined, startLevel: number, endLevel: number}}
   *   the starting level that worked and the level at end of text.
   * @throws {Error} when no starting level is consistent with the text.
   */
  syncQuoteLevel(text = '', startLevel = 0) {
    const msg = `qp-${this.lang}.syncQuoteLevel()`;
    const { maxLevel } = this;
    let check = this.#checkQuoteLevel(text, startLevel);
    if (check.error) {
      for (let i = 1; check.error && i < maxLevel; i++) {
        const tryLevel = (startLevel + i) % maxLevel;
        check = this.#checkQuoteLevel(text, tryLevel);
      }
      if (check.error) {
        // Could not synchronize quotes. Source document error
        console.log(msg, '[1]SYNC?!', `level ${startLevel}=>ERROR`);
        throw check.error;
      }

      // Synchronized, but source document might be in error
      console.log(
        msg,
        '[2]SYNC?',
        `level ${startLevel}=>${check.startLevel}`,
        `\n  |${text}|`,
      );
    }
    return check;
  }
}
|