tex2typst 0.0.19 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parser.ts CHANGED
@@ -1,98 +1,681 @@
1
- // @ts-ignore
2
- import katex from 'katex';
3
- import { TexNode, KatexParseNode, TexSupsubData } from './types';
1
+ import { TexNode, LatexParseNode, TexSupsubData } from "./types";
4
2
 
3
/**
 * Command names (without the leading backslash) for which
 * `get_command_param_num` reports exactly one argument.
 */
const UNARY_COMMANDS: string[] = [
    'sqrt', 'text',

    'arccos', 'arcsin', 'arctan', 'arg',
    'bar', 'bold', 'boldsymbol',
    'ddot', 'det', 'dim', 'dot',
    'exp',
    'gcd',
    'hat',
    'ker',
    'mathbb', 'mathbf', 'mathcal', 'mathscr', 'mathsf', 'mathtt', 'mathrm',
    'max', 'min', 'mod',
    'operatorname', 'overbrace', 'overline',
    'pmb',
    'sup',
    'rm',
    'tilde',
    'underbrace', 'underline',
    'vec',
    'widehat', 'widetilde',
];
7
45
 
8
- export class KatexNodeToTexNodeError extends Error {
9
- node: KatexParseNode;
46
/**
 * Command names (without the leading backslash) for which
 * `get_command_param_num` reports exactly two arguments.
 */
const BINARY_COMMANDS: string[] = ['frac', 'tfrac', 'binom', 'dbinom', 'dfrac', 'tbinom'];
10
54
 
11
- constructor(message: string, node: KatexParseNode) {
55
// Placeholder node: returned for empty input and used as the base of a
// script (`_x` / `^x`) that has nothing in front of it.
const EMPTY_NODE = { type: 'empty', content: '' };
56
+
57
/**
 * Internal invariant check.
 *
 * @param condition - Expression that must hold.
 * @param message - Text carried by the thrown error (empty by default).
 * @throws LatexParserError when `condition` is false.
 */
function assert(condition: boolean, message: string = ''): void {
    if (condition) {
        return;
    }
    throw new LatexParserError(message);
}
62
+
63
+
64
/**
 * Number of arguments a command consumes.
 *
 * @param command - Command name without the leading backslash.
 * @returns 1 for names in `UNARY_COMMANDS`, 2 for names in
 *          `BINARY_COMMANDS`, 0 for everything else.
 */
function get_command_param_num(command: string): number {
    if (UNARY_COMMANDS.includes(command)) {
        return 1;
    }
    return BINARY_COMMANDS.includes(command) ? 2 : 0;
}
73
+
74
/**
 * Locate the `}` that closes the `{` at index `start`.
 *
 * Nested groups are tracked by depth. A backslash always consumes the
 * character that follows it, so `\{`, `\}` and `\\` never affect the depth.
 * Treating `\\` as a unit matters: in `\\{` the brace is a real group opener,
 * not an escaped one.
 *
 * @param latex - Source text.
 * @param start - Index of the opening `{`.
 * @returns Index of the matching `}`.
 * @throws LatexParserError if `latex[start]` is not `{` or no match exists.
 */
function find_closing_curly_bracket(latex: string, start: number): number {
    assert(latex[start] === '{');
    let depth = 1;
    let pos = start + 1;

    while (depth > 0) {
        if (pos >= latex.length) {
            throw new LatexParserError('Unmatched curly brackets');
        }
        const ch = latex[pos];
        if (ch === '\\') {
            // Skip the backslash together with the character it escapes.
            pos += 2;
            continue;
        }
        if (ch === '{') {
            depth += 1;
        } else if (ch === '}') {
            depth -= 1;
        }
        pos += 1;
    }

    // `pos` is one past the brace that brought the depth back to zero.
    return pos - 1;
}
97
+
98
/**
 * Locate the `]` that closes the `[` at index `start`, honouring nesting.
 *
 * @param latex - Source text.
 * @param start - Index of the opening `[`.
 * @returns Index of the matching `]`.
 * @throws LatexParserError if `latex[start]` is not `[` or no match exists.
 */
function find_closing_square_bracket(latex: string, start: number): number {
    assert(latex[start] === '[');

    let depth = 1;
    for (let i = start + 1; i < latex.length; i++) {
        const ch = latex[i];
        if (ch === '[') {
            depth += 1;
        } else if (ch === ']') {
            depth -= 1;
            if (depth === 0) {
                return i;
            }
        }
    }

    throw new LatexParserError('Unmatched square brackets');
}
117
+
118
+
119
/**
 * Whether `char` is exactly one ASCII letter (`a`-`z` or `A`-`Z`).
 *
 * The empty string and multi-character strings are rejected; a substring
 * test against the alphabet would wrongly accept `''` and runs like `'ab'`.
 */
function isalpha(char: string): boolean {
    return /^[A-Za-z]$/.test(char);
}
122
+
123
/**
 * Whether `char` is exactly one ASCII digit (`0`-`9`).
 *
 * The empty string and multi-character strings are rejected; a substring
 * test against `'0123456789'` would wrongly accept `''` and runs like `'12'`.
 */
function isdigit(char: string): boolean {
    return /^[0-9]$/.test(char);
}
126
+
127
+
128
+
129
/**
 * Find the next occurrence of the control word `\command_name` at or after
 * `start`.
 *
 * A candidate is rejected when
 *  - it is immediately followed by a letter (it is then a prefix of a longer
 *    command, e.g. `\left` inside `\leftarrow`), or
 *  - its backslash is preceded by an odd number of backslashes, i.e. it is
 *    the second half of a `\\` line break followed by plain letters.
 *
 * @param latex - Source text.
 * @param start - Index at which the search begins.
 * @param command_name - Command name without the leading backslash.
 * @returns Index of the backslash of the match, or -1 if there is none.
 */
function find_command(latex: string, start: number, command_name: string): number {
    const needle = '\\' + command_name;
    let pos = start;

    while (pos < latex.length) {
        pos = latex.indexOf(needle, pos);
        if (pos === -1) {
            return -1;
        }

        // Count the run of backslashes directly before the candidate.
        let preceding = 0;
        while (pos - 1 - preceding >= 0 && latex[pos - 1 - preceding] === '\\') {
            preceding += 1;
        }
        const is_escaped = preceding % 2 === 1;

        const after = pos + needle.length;
        const is_whole_word = after >= latex.length || !isalpha(latex[after]);

        if (!is_escaped && is_whole_word) {
            return pos;
        }
        // Resume just after the rejected candidate.
        pos += is_escaped ? 1 : needle.length;
    }

    return -1;
}
147
+
148
/**
 * Find the `\right` that balances a `\left` which has already been consumed.
 *
 * Scanning starts at `start` with a nesting depth of one; each further
 * `\left` raises the depth and each `\right` lowers it.
 *
 * @param latex - Source text.
 * @param start - Index just inside the already-opened `\left` group.
 * @returns Index of the backslash of the balancing `\right`, or -1 when the
 *          input ends before the depth returns to zero.
 */
function find_closing_right_command(latex: string, start: number): number {
    const LEFT = '\\left';
    const RIGHT = '\\right';
    let depth = 1;
    let cursor = start;

    for (;;) {
        if (cursor >= latex.length) {
            return -1;
        }
        const next_right = find_command(latex, cursor, 'right');
        if (next_right === -1) {
            return -1;
        }
        const next_left = find_command(latex, cursor, 'left');

        if (next_left !== -1 && next_left <= next_right) {
            // a nested \left opens before the next \right
            depth += 1;
            cursor = next_left + LEFT.length;
            continue;
        }

        // the next relevant command is a \right
        depth -= 1;
        if (depth === 0) {
            return next_right;
        }
        cursor = next_right + RIGHT.length;
    }
}
176
+
177
/**
 * Find the `\end` that balances a `\begin` which has already been consumed.
 *
 * Scanning starts at `start` with a nesting depth of one; each further
 * `\begin` raises the depth and each `\end` lowers it.
 *
 * @param latex - Source text.
 * @param start - Index just inside the already-opened environment.
 * @returns Index of the backslash of the balancing `\end`, or -1 when the
 *          input ends before the depth returns to zero.
 */
function find_closing_end_command(latex: string, start: number): number {
    const BEGIN = '\\begin';
    const END = '\\end';
    let depth = 1;
    let cursor = start;

    for (;;) {
        if (cursor >= latex.length) {
            return -1;
        }
        const next_end = find_command(latex, cursor, 'end');
        if (next_end === -1) {
            return -1;
        }
        const next_begin = find_command(latex, cursor, 'begin');

        if (next_begin !== -1 && next_begin <= next_end) {
            // a nested \begin opens before the next \end
            depth += 1;
            cursor = next_begin + BEGIN.length;
            continue;
        }

        // the next relevant command is an \end
        depth -= 1;
        if (depth === 0) {
            return next_end;
        }
        cursor = next_end + END.length;
    }
}
205
+
206
/**
 * Return the run of whitespace (space, tab, line feed, carriage return)
 * that begins at `start`.
 *
 * Carriage returns are included so that CRLF input is skipped exactly like
 * LF input.
 *
 * @param latex - Source text.
 * @param start - Index at which the run may begin.
 * @returns The consumed whitespace; empty if `latex[start]` is not whitespace.
 */
function eat_whitespaces(latex: string, start: number): string {
    let pos = start;
    while (pos < latex.length && [' ', '\t', '\n', '\r'].includes(latex[pos])) {
        pos += 1;
    }
    return latex.substring(start, pos);
}
213
+
214
/**
 * Return the run of plain space characters that begins at `start`.
 * Tabs and newlines end the run.
 */
function eat_spaces(latex: string, start: number): string {
    let end = start;
    for (; end < latex.length; end++) {
        if (latex[end] !== ' ') {
            break;
        }
    }
    return latex.substring(start, end);
}
221
+
222
/**
 * Return the run of characters accepted by `isalpha` that begins at `start`.
 * The parser calls this just after a backslash to read a command name.
 */
function eat_command_name(latex: string, start: number): string {
    let end = start;
    for (; end < latex.length; end++) {
        if (!isalpha(latex[end])) {
            break;
        }
    }
    return latex.substring(start, end);
}
229
+
230
/**
 * Read a delimiter starting at `start`.
 *
 * Recognised delimiters are the single characters `(`, `)`, `[`, `]`, `|`
 * and the sequences `\{`, `\}`, `\lfloor`, `\rfloor`, `\lceil`, `\rceil`,
 * `\langle`, `\rangle`.
 *
 * @returns The delimiter text, or `null` if none starts at `start`.
 */
function eat_parenthesis(latex: string, start: number): string | null {
    const head = latex[start];
    if ('()[]|'.includes(head)) {
        return head;
    }

    const escaped_delimiters = [
        '\\{', '\\}',
        '\\lfloor', '\\rfloor',
        '\\lceil', '\\rceil',
        '\\langle', '\\rangle',
    ];
    for (const delimiter of escaped_delimiters) {
        if (latex.startsWith(delimiter, start)) {
            return delimiter;
        }
    }
    return null;
}
245
+
246
/**
 * Count the consecutive `'` characters that begin at `start`.
 *
 * @returns The number of primes; 0 if `latex[start]` is not `'`.
 */
function eat_primes(latex: string, start: number): number {
    let count = 0;
    // Reading past the end yields `undefined`, which ends the run.
    while (latex[start + count] === "'") {
        count += 1;
    }
    return count;
}
253
+
254
+
255
/**
 * Error raised by the parser and its helper functions.
 * The `name` property is set to `'LatexParserError'`.
 */
class LatexParserError extends Error {
    constructor(message: string) {
        super(message);
        // Replace the default 'Error' label.
        this.name = 'LatexParserError';
    }
}
261
+
262
+
263
// Result of one parsing step: the node that was produced, and the index in
// the input at which parsing should resume.
type ParseResult = [LatexParseNode, number];
264
+
265
/**
 * Hand-written parser that turns a LaTeX math string into a tree of
 * `LatexParseNode`s.
 *
 * Every `parse*Expr` method takes the full input and a start index and
 * returns a `[node, nextIndex]` pair.
 */
export class LatexParser {
    /** When false, `parse` drops `whitespace` nodes. */
    space_sensitive: boolean;
    /** When false, `parse` and `parseAligned` drop `newline` nodes. */
    newline_sensitive: boolean;

    constructor(space_sensitive: boolean = false, newline_sensitive: boolean = true) {
        this.space_sensitive = space_sensitive;
        this.newline_sensitive = newline_sensitive;
    }

    /**
     * Parse a complete LaTeX fragment.
     *
     * @returns `EMPTY_NODE` when nothing was produced, the single node when
     *          exactly one was produced, otherwise an `ordgroup` of all nodes.
     * @throws LatexParserError on malformed input, including an `&` that is
     *         not inside an environment body.
     */
    parse(latex: string): LatexParseNode {
        const results: LatexParseNode[] = [];
        let pos = 0;

        while (pos < latex.length) {
            const [res, newPos] = this.parseNextExpr(latex, pos);
            pos = newPos;
            if (!this.space_sensitive && res.type === 'whitespace') {
                continue;
            }
            if (!this.newline_sensitive && res.type === 'newline') {
                continue;
            }
            if (res.type === 'control' && res.content === '&') {
                throw new LatexParserError('Unexpected & outside of an alignment');
            }
            results.push(res);
        }

        if (results.length === 0) {
            return EMPTY_NODE;
        } else if (results.length === 1) {
            return results[0];
        } else {
            return { type: 'ordgroup', args: results };
        }
    }

    /**
     * Parse one expression together with any primes, subscript and
     * superscript attached to it.
     *
     * Primes are turned into `prime` command nodes placed in the superscript,
     * ahead of an explicit `^` argument if there is one.
     *
     * @throws LatexParserError('Double superscript') when a prime follows an
     *         explicit superscript.
     */
    parseNextExpr(latex: string, start: number): ParseResult {
        let [base, pos] = this.parseNextExprWithoutSupSub(latex, start);
        let sub: LatexParseNode | null = null;
        let sup: LatexParseNode | null = null;

        let num_prime = eat_primes(latex, pos);
        pos += num_prime;
        if (pos < latex.length && latex[pos] === '_') {
            [sub, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
            // Advance only by the primes found here; the ones before the
            // subscript have already been consumed.
            const trailing_primes = eat_primes(latex, pos);
            num_prime += trailing_primes;
            pos += trailing_primes;
            if (pos < latex.length && latex[pos] === '^') {
                [sup, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
                if (eat_primes(latex, pos) > 0) {
                    throw new LatexParserError('Double superscript');
                }
            }
        } else if (pos < latex.length && latex[pos] === '^') {
            [sup, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
            if (eat_primes(latex, pos) > 0) {
                throw new LatexParserError('Double superscript');
            }
            if (pos < latex.length && latex[pos] === '_') {
                [sub, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
                if (eat_primes(latex, pos) > 0) {
                    throw new LatexParserError('Double superscript');
                }
            }
        }

        if (sub !== null || sup !== null || num_prime > 0) {
            const res = { type: 'supsub', base } as LatexParseNode;
            if (sub) {
                res.sub = sub;
            }
            if (num_prime > 0) {
                res.sup = { type: 'ordgroup', args: [] };
                for (let i = 0; i < num_prime; i++) {
                    res.sup.args!.push({ type: 'command', content: 'prime' });
                }
                if (sup) {
                    res.sup.args!.push(sup);
                }
                // A lone prime does not need a wrapping group.
                if (res.sup.args!.length === 1) {
                    res.sup = res.sup.args![0];
                }
            } else if (sup) {
                res.sup = sup;
            }
            return [res, pos];
        } else {
            return [base, pos];
        }
    }

    /**
     * Parse one expression, not looking at scripts that may follow it.
     *
     * Dispatches on the character at `start`: brace group, backslash
     * construct, `%` comment, number, letter, operator, punctuation,
     * bracket, leading `_`/`^`, spaces, newline, `&`; anything else becomes
     * an `unknown` node.
     *
     * @throws LatexParserError when `start` is at or past the end of input
     *         (e.g. a trailing `^`, `_` or a command missing its argument).
     */
    parseNextExprWithoutSupSub(latex: string, start: number): ParseResult {
        if (start >= latex.length) {
            throw new LatexParserError('Unexpected end of input');
        }
        const firstChar = latex[start];
        if (firstChar === '{') {
            const posClosingBracket = find_closing_curly_bracket(latex, start);
            const exprInside = latex.slice(start + 1, posClosingBracket);
            return [this.parse(exprInside), posClosingBracket + 1];
        } else if (firstChar === '\\') {
            if (start + 1 >= latex.length) {
                throw new LatexParserError('Expecting command name after \\');
            }
            const firstTwoChars = latex.slice(start, start + 2);
            if (firstTwoChars === '\\\\') {
                return [{ type: 'control', content: '\\\\' }, start + 2];
            } else if (firstTwoChars === '\\{' || firstTwoChars === '\\}') {
                return [{ type: 'token-parenthesis', content: firstTwoChars }, start + 2];
            } else if (['\\%', '\\$', '\\&', '\\#', '\\_'].includes(firstTwoChars)) {
                return [{ type: 'token', content: firstTwoChars }, start + 2];
            } else if (latex.startsWith('\\begin{', start)) {
                return this.parseBeginEndExpr(latex, start);
            } else if (latex.startsWith('\\left', start) && (start + 5 >= latex.length || !isalpha(latex[start + 5]))) {
                // `\left` proper, not a longer command such as `\leftarrow`.
                return this.parseLeftRightExpr(latex, start);
            } else {
                return this.parseCommandExpr(latex, start);
            }
        } else if (firstChar === '%') {
            // The comment runs up to, but not including, the next '\n'.
            let pos = start + 1;
            while (pos < latex.length && latex[pos] !== '\n') {
                pos += 1;
            }
            return [{ type: 'comment', content: latex.slice(start + 1, pos) }, pos];
        } else if (isdigit(firstChar)) {
            let pos = start;
            while (pos < latex.length && isdigit(latex[pos])) {
                pos += 1;
            }
            return [{ type: 'token-number', content: latex.slice(start, pos) }, pos];
        } else if (isalpha(firstChar)) {
            return [{ type: 'token-letter-var', content: firstChar }, start + 1];
        } else if ('+-*/=<>!'.includes(firstChar)) {
            return [{ type: 'token-operator', content: firstChar }, start + 1];
        } else if ('.,;?'.includes(firstChar)) {
            return [{ type: 'atom', content: firstChar }, start + 1];
        } else if ('()[]'.includes(firstChar)) {
            return [{ type: 'token-parenthesis', content: firstChar }, start + 1];
        } else if (firstChar === '_') {
            // A subscript with nothing in front of it gets an empty base.
            let [sub, pos] = this.parseNextExpr(latex, start + 1);
            let sup: LatexParseNode | undefined = undefined;
            if (pos < latex.length && latex[pos] === '^') {
                [sup, pos] = this.parseNextExpr(latex, pos + 1);
            }
            return [{ type: 'supsub', base: EMPTY_NODE, sub, sup }, pos];
        } else if (firstChar === '^') {
            // A superscript with nothing in front of it gets an empty base.
            let [sup, pos] = this.parseNextExpr(latex, start + 1);
            let sub: LatexParseNode | undefined = undefined;
            if (pos < latex.length && latex[pos] === '_') {
                [sub, pos] = this.parseNextExpr(latex, pos + 1);
            }
            return [{ type: 'supsub', base: EMPTY_NODE, sub, sup }, pos];
        } else if (firstChar === ' ') {
            let pos = start;
            while (pos < latex.length && latex[pos] === ' ') {
                pos += 1;
            }
            return [{ type: 'whitespace', content: latex.slice(start, pos) }, pos];
        } else if (firstChar === '\n') {
            return [{ type: 'newline', content: '\n' }, start + 1];
        } else if (firstChar === '\r') {
            // "\r\n" and a lone "\r" both become a single '\n' newline node.
            if (start + 1 < latex.length && latex[start + 1] === '\n') {
                return [{ type: 'newline', content: '\n' }, start + 2];
            } else {
                return [{ type: 'newline', content: '\n' }, start + 1];
            }
        } else if (firstChar === '&') {
            return [{ type: 'control', content: '&' }, start + 1];
        } else {
            return [{ type: 'unknown', content: firstChar }, start + 1];
        }
    }

    /**
     * Parse a backslash command and as many arguments as
     * `get_command_param_num` reports for it.
     *
     * `\sqrt[n]{x}` stores the bracketed part in `exponent`; `\text{...}`
     * yields a `text` node whose content is the raw text between the braces.
     */
    parseCommandExpr(latex: string, start: number): ParseResult {
        assert(latex[start] === '\\');
        let pos = start + 1;
        const command = eat_command_name(latex, pos);
        pos += command.length;
        const paramNum = get_command_param_num(command);
        if (paramNum === 0) {
            return [{ type: 'command', content: command }, pos];
        } else if (paramNum === 1) {
            if (command === 'sqrt' && pos < latex.length && latex[pos] === '[') {
                const posLeftSquareBracket = pos;
                const posRightSquareBracket = find_closing_square_bracket(latex, pos);
                const exprInside = latex.slice(posLeftSquareBracket + 1, posRightSquareBracket);
                const exponent = this.parse(exprInside);
                const [arg1, newPos] = this.parseNextExprWithoutSupSub(latex, posRightSquareBracket + 1);
                return [{ type: 'command', content: command, arg1, exponent }, newPos];
            } else if (command === 'text') {
                assert(latex[pos] === '{');
                const posClosingBracket = find_closing_curly_bracket(latex, pos);
                const text = latex.slice(pos + 1, posClosingBracket);
                return [{ type: 'text', content: text }, posClosingBracket + 1];
            } else {
                const [arg1, newPos] = this.parseNextExprWithoutSupSub(latex, pos);
                return [{ type: 'command', content: command, arg1 }, newPos];
            }
        } else if (paramNum === 2) {
            const [arg1, pos1] = this.parseNextExprWithoutSupSub(latex, pos);
            const [arg2, pos2] = this.parseNextExprWithoutSupSub(latex, pos1);
            return [{ type: 'command', content: command, arg1, arg2 }, pos2];
        } else {
            // Not reachable while get_command_param_num only returns 0, 1 or 2.
            throw new LatexParserError('Invalid number of parameters');
        }
    }

    /**
     * Parse `\left<delim> ... \right<delim>` into a `leftright` node whose
     * `body` is the parsed interior.
     *
     * @throws LatexParserError when a delimiter is missing or not recognised
     *         by `eat_parenthesis`, or when no balancing `\right` exists.
     */
    parseLeftRightExpr(latex: string, start: number): ParseResult {
        assert(latex.slice(start, start + 5) === '\\left');
        let pos = start + '\\left'.length;
        pos += eat_whitespaces(latex, pos).length;
        if (pos >= latex.length) {
            throw new LatexParserError('Expecting delimiter after \\left');
        }
        const leftDelimiter = eat_parenthesis(latex, pos);
        if (leftDelimiter === null) {
            throw new LatexParserError('Invalid delimiter after \\left');
        }
        pos += leftDelimiter.length;
        const exprInsideStart = pos;
        const idx = find_closing_right_command(latex, pos);
        if (idx === -1) {
            throw new LatexParserError('No matching \\right');
        }
        const exprInsideEnd = idx;
        pos = idx + '\\right'.length;
        pos += eat_whitespaces(latex, pos).length;
        if (pos >= latex.length) {
            throw new LatexParserError('Expecting delimiter after \\right');
        }
        const rightDelimiter = eat_parenthesis(latex, pos);
        if (rightDelimiter === null) {
            throw new LatexParserError('Invalid delimiter after \\right');
        }
        pos += rightDelimiter.length;
        const exprInside = latex.slice(exprInsideStart, exprInsideEnd);
        const body = this.parse(exprInside);
        const res = { type: 'leftright', left: leftDelimiter, right: rightDelimiter, body };
        return [res, pos];
    }

    /**
     * Parse `\begin{env} ... \end{env}` into a `beginend` node whose
     * `content` is the environment name and whose `body` is the grid
     * produced by `parseAligned`.
     *
     * @throws LatexParserError when `\end` is missing, is not followed by
     *         `{name}`, or names a different environment.
     */
    parseBeginEndExpr(latex: string, start: number): ParseResult {
        assert(latex.slice(start, start + 7) === '\\begin{');
        let pos = start + '\\begin'.length;
        // find_closing_curly_bracket throws on an unmatched brace.
        const idx = find_closing_curly_bracket(latex, pos);
        const envName = latex.slice(pos + 1, idx);
        pos = idx + 1;
        pos += eat_whitespaces(latex, pos).length; // ignore whitespaces and '\n' after \begin{envName}
        const exprInsideStart = pos;
        const endIdx = find_closing_end_command(latex, pos);
        if (endIdx === -1) {
            throw new LatexParserError('No matching \\end');
        }
        const exprInsideEnd = endIdx;
        pos = endIdx + '\\end'.length;
        if (latex[pos] !== '{') {
            throw new LatexParserError('Expecting { after \\end');
        }
        const closingIdx = find_closing_curly_bracket(latex, pos);
        if (latex.slice(pos + 1, closingIdx) !== envName) {
            throw new LatexParserError('Mismatched \\begin and \\end environments');
        }
        let exprInside = latex.slice(exprInsideStart, exprInsideEnd);
        exprInside = exprInside.trimEnd(); // ignore whitespaces and '\n' before \end{envName}
        const body = this.parseAligned(exprInside);
        const res = { type: 'beginend', content: envName, body };
        return [res, closingIdx + 1];
    }

    /**
     * Parse the body of an environment into rows and cells.
     *
     * `\\` starts a new row and `&` starts a new cell; every cell is an
     * `ordgroup`. Whitespace nodes are always dropped; newline nodes are
     * dropped unless `newline_sensitive` is set.
     */
    parseAligned(latex: string): LatexParseNode[][] {
        let pos = 0;
        const allRows: LatexParseNode[][] = [];
        let row: LatexParseNode[] = [];
        allRows.push(row);
        let group: LatexParseNode = { type: 'ordgroup', args: [] };
        row.push(group);

        while (pos < latex.length) {
            const [res, newPos] = this.parseNextExpr(latex, pos);
            pos = newPos;
            if (res.type === 'whitespace') {
                continue;
            } else if (res.type === 'newline' && !this.newline_sensitive) {
                continue;
            } else if (res.type === 'control' && res.content === '\\\\') {
                row = [];
                group = { type: 'ordgroup', args: [] };
                row.push(group);
                allRows.push(row);
            } else if (res.type === 'control' && res.content === '&') {
                group = { type: 'ordgroup', args: [] };
                row.push(group);
            } else {
                group.args!.push(res);
            }
        }

        return allRows;
    }
}
568
+
569
// Return the index of the first '%' in `line` that starts a comment, or -1.
// A '%' starts a comment unless it is escaped, i.e. preceded by an odd
// number of backslashes ("\%" is a literal percent sign, "\\%" is a line
// break followed by a comment).
function find_comment_start(line: string): number {
    let from = 0;
    while (from < line.length) {
        const index = line.indexOf('%', from);
        if (index === -1) {
            return -1;
        }
        let backslashes = 0;
        while (index - 1 - backslashes >= 0 && line[index - 1 - backslashes] === '\\') {
            backslashes += 1;
        }
        if (backslashes % 2 === 0) {
            return index;
        }
        from = index + 1;
    }
    return -1;
}

// Split tex into a list of tex strings and comments.
// Each item in the returned list is either a tex snippet or a comment.
// Each comment item is a string starting with '%' and runs to the end of
// its line. A tex snippet may span several lines; the '\n' between two
// lines is dropped when the first contains '\begin{' or the second
// contains '\end{'.
function splitTex(tex: string): string[] {
    const lines = tex.split("\n");
    const out_tex_list: string[] = [];
    let current_tex = "";

    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];

        const index = find_comment_start(line);
        if (index !== -1) {
            // Flush the tex collected so far, then emit the comment itself.
            current_tex += line.substring(0, index);
            out_tex_list.push(current_tex);
            current_tex = "";
            out_tex_list.push(line.substring(index));
        } else {
            current_tex += line;
        }

        if (i < lines.length - 1) {
            const has_begin_command = line.includes('\\begin{');
            const followed_by_end_command = lines[i + 1].includes('\\end{');
            if (!has_begin_command && !followed_by_end_command) {
                current_tex += '\n';
            }
        }
    }

    if (current_tex.length > 0) {
        out_tex_list.push(current_tex);
    }

    return out_tex_list;
}
623
+
624
/**
 * Error raised while converting a parse node into a `TexNode`.
 * The offending node is kept on the `node` property.
 */
export class LatexNodeToTexNodeError extends Error {
    constructor(message: string, public node: LatexParseNode) {
        super(message);
        this.name = "LatexNodeToTexNodeError";
    }
}
17
633
 
18
- export function katexNodeToTexNode(node: KatexParseNode): TexNode {
634
+ function latexNodeToTexNode(node: LatexParseNode): TexNode {
19
635
  try {
20
- if (node.loc) {
21
- delete node.loc;
22
- }
23
636
  let res = {} as TexNode;
24
637
  switch (node.type) {
638
+ case 'ordgroup':
639
+ res.type = 'ordgroup';
640
+ res.args = (node.args as LatexParseNode[]).map((n: LatexParseNode) => latexNodeToTexNode(n));
641
+ if (res.args!.length === 1) {
642
+ res = res.args![0] as TexNode;
643
+ }
644
+ break;
645
+ case 'empty':
646
+ res.type = 'empty';
647
+ res.content = '';
648
+ break;
25
649
  case 'atom':
26
- // basic symbol like +, -, =, '(', ')', '\{', '\}'
27
- // other punctuation-like macro such as \cdot, \to, \pm
28
650
  res.type = 'atom';
29
- res.content = node.text!;
30
- if (node.text === '\\{' || node.text === '\\}') {
31
- res.content = node.text.substring(1); // '{' or '}'
32
- } else if (node.text!.startsWith('\\')) {
33
- res.type = 'symbol';
34
- }
651
+ res.content = node.content!;
35
652
  break;
36
- case 'mathord':
37
- // basic variable like a, b, c
38
- // macro variable like \alpha, \beta, \gamma
39
- case 'textord':
40
- // - constant number like 1, 2, 3
41
- // - operator symbol like \nabla, \partial
42
- case 'op':
43
- // \lim, \sum
44
- case 'cr':
45
- // new line symbol '\\'
653
+ case 'token':
654
+ case 'token-letter-var':
655
+ case 'token-number':
656
+ case 'token-operator':
657
+ case 'token-parenthesis':
46
658
  res.type = 'symbol';
47
- res.content = node.text!;
48
- if (node.type === 'op') {
49
- res.content = node['name']!;
50
- } else if (node.type === 'cr') {
51
- res.content = '\\\\';
52
- }
53
- break;
54
- case 'genfrac':
55
- res.type = 'binaryFunc';
56
- if (node['leftDelim'] === '(' && node['rightDelim'] === ')') {
57
- // This occurs for \binom \tbinom
58
- res.content = '\\binom';
59
- } else {
60
- res.content = '\\frac';
61
- }
62
- res.args = [
63
- katexNodeToTexNode(node['numer']),
64
- katexNodeToTexNode(node['denom'])
65
- ];
659
+ res.content = node.content!;
66
660
  break;
67
661
  case 'supsub':
68
662
  res.type = 'supsub';
69
663
  res.irregularData = {} as TexSupsubData;
70
664
  if (node['base']) {
71
- res.irregularData.base = katexNodeToTexNode(node['base']);
665
+ res.irregularData.base = latexNodeToTexNode(node['base']);
72
666
  }
73
667
  if (node['sup']) {
74
- res.irregularData.sup = katexNodeToTexNode(node['sup']);
668
+ res.irregularData.sup = latexNodeToTexNode(node['sup']);
75
669
  }
76
670
  if (node['sub']) {
77
- res.irregularData.sub = katexNodeToTexNode(node['sub']);
78
- }
79
- break;
80
- case 'mclass':
81
- case 'ordgroup':
82
- res.type = 'ordgroup';
83
- res.args = (node.body as KatexParseNode[]).map((n: KatexParseNode) => katexNodeToTexNode(n));
84
- if (res.args!.length === 1) {
85
- res = res.args![0] as TexNode;
671
+ res.irregularData.sub = latexNodeToTexNode(node['sub']);
86
672
  }
87
673
  break;
88
- case 'leftright': {
89
- const body = katexNodeToTexNode({
90
- type: 'ordgroup',
91
- mode: 'math',
92
- body: node.body
93
- });
94
-
674
+ case 'leftright':
95
675
  res.type = 'leftright';
676
+
677
+ const body = latexNodeToTexNode(node.body as LatexParseNode);
678
+
96
679
  let left: string = node['left']!;
97
680
  if (left === "\\{") {
98
681
  left = "{";
@@ -108,129 +691,69 @@ export function katexNodeToTexNode(node: KatexParseNode): TexNode {
108
691
  { type: is_atom(right)? 'atom': 'symbol', content: right}
109
692
  ];
110
693
  break;
111
- }
112
- case 'underline':
113
- case 'overline':
114
- res.type = 'unaryFunc';
115
- res.content = '\\' + node.type;
116
- res.args = [
117
- katexNodeToTexNode(node['body'] as KatexParseNode)
118
- ];
119
- break;
120
- case 'accent': {
121
- res.type = 'unaryFunc';
122
- res.content = node['label']!;
123
- res.args = [
124
- katexNodeToTexNode(node['base'])
125
- ];
126
- break;
127
- }
128
- case 'sqrt':
129
- if (node['index']) {
130
- // There is a [] after \sqrt
131
- // \sqrt[some thing]{}
132
- res.irregularData = katexNodeToTexNode(node['index']);
133
- }
134
- // Fall through
135
- case 'font':
136
- case 'operatorname':
137
- res.type = 'unaryFunc';
138
- res.content = ('\\' + node.type!) as string;
139
- if (node.type === 'font') {
140
- res.content = '\\' + node['font']; // e.g. \mathbf, \mathrm
694
+ case 'beginend':
695
+ if (node.content?.startsWith('align')) {
696
+ // align, align*, alignat, alignat*, aligned, etc.
697
+ res.type = 'align';
698
+ } else {
699
+ res.type = 'matrix';
141
700
  }
142
- if(Array.isArray(node.body)) {
143
- const obj = {
144
- type: 'ordgroup',
145
- mode: 'math',
146
- body: node.body as KatexParseNode[]
147
- } as KatexParseNode;
701
+ res.content = node.content!;
702
+ res.irregularData = (node.body as LatexParseNode[][]).map((row: LatexParseNode[]) => {
703
+ return row.map((n: LatexParseNode) => latexNodeToTexNode(n));
704
+ });
705
+ break;
706
+ case 'command':
707
+ const num_args = get_command_param_num(node.content!);
708
+ res.content = '\\' + node.content!;
709
+ if (num_args === 0) {
710
+ res.type = 'symbol';
711
+ } else if (num_args === 1) {
712
+ res.type = 'unaryFunc';
148
713
  res.args = [
149
- katexNodeToTexNode(obj)
714
+ latexNodeToTexNode(node.arg1 as LatexParseNode)
150
715
  ]
151
- } else {
716
+ if (node.content === 'sqrt') {
717
+ if (node.exponent) {
718
+ res.irregularData = latexNodeToTexNode(node.exponent) as TexNode;
719
+ }
720
+ }
721
+ } else if (num_args === 2) {
722
+ res.type = 'binaryFunc';
152
723
  res.args = [
153
- katexNodeToTexNode(node.body as KatexParseNode)
724
+ latexNodeToTexNode(node.arg1 as LatexParseNode),
725
+ latexNodeToTexNode(node.arg2 as LatexParseNode)
154
726
  ]
155
- }
156
- break;
157
- case 'horizBrace':
158
- res.type = 'unaryFunc';
159
- res.content = node['label']!; // '\\overbrace' or '\\unerbrace'
160
- res.args = [
161
- katexNodeToTexNode(node['base']),
162
- ];
163
- break;
164
- case 'array':
165
- if (node['colSeparationType'] === 'align') {
166
- // align environment
167
- res.type = 'align';
168
727
  } else {
169
- res.type = 'matrix'
728
+ throw new LatexNodeToTexNodeError('Invalid number of arguments', node);
170
729
  }
171
- res.irregularData = (node.body! as KatexParseNode[][]).map((row: KatexParseNode[]) => {
172
- return row.map((cell: KatexParseNode) => {
173
- if (cell.type !== 'styling' || (cell.body as KatexParseNode[]).length !== 1) {
174
- throw new KatexNodeToTexNodeError("Expecting cell.type==='\\styling' and cell.body.length===1", cell);
175
- }
176
- return katexNodeToTexNode((cell.body as KatexParseNode[])[0]);
177
- });
178
- });
179
730
  break;
180
-
181
- case 'text': {
731
+ case 'text':
182
732
  res.type = 'text';
183
- let str = "";
184
- (node.body as KatexParseNode[]).forEach((n) => {
185
- if(n.mode !== 'text') {
186
- throw new KatexNodeToTexNodeError("Expecting node.mode==='text'", node)
187
- }
188
- str += n.text;
189
- });
190
- res.content = str;
733
+ res.content = node.content!;
191
734
  break;
192
- }
193
- case 'spacing':
194
- // res.type = 'spacing';
195
- // res.content = node.text! as string;
196
- // break;
197
- case 'kern':
198
- // This can occur for \implies, \iff.
199
- // e.g. \implies is parsed as [{type:'kern'}, {type:'atom', text:'\\Longrightarrow'}, {type:'kern'}]
200
- // TODO: Ideally, we should output a single symbol \implies.
201
- // But for now, we simply let the output be \Longrightarrow
735
+ case 'comment':
736
+ res.type = 'comment';
737
+ res.content = node.content!;
738
+ break;
739
+ case 'whitespace':
202
740
  res.type = 'empty';
203
- res.content = ' ';
204
741
  break;
205
-
206
- case 'htmlmathml': {
207
- // This can occur for \neq.
208
- const element = (node['mathml'] as KatexParseNode[])[0]!['body']![0];
209
- if (element && element.type === 'textord' && element.text === '≠') {
742
+ case 'newline':
743
+ res.type = 'newline';
744
+ res.content = '\n';
745
+ break;
746
+ case 'control':
747
+ if (node.content === '\\\\') {
210
748
  res.type = 'symbol';
211
- res.content = '\\neq';
749
+ res.content = node.content!;
212
750
  break;
213
751
  } else {
214
- // Fall through to throw error
752
+ throw new LatexNodeToTexNodeError(`Unknown control sequence: ${node.content}`, node);
215
753
  }
216
- }
217
- case 'color':
218
- // KaTeX encounters an unrecognized macro.
219
- if (Array.isArray(node.body) && node.body.length === 1) {
220
- const sub_body = node.body[0] as KatexParseNode;
221
- if (sub_body.type === 'text') {
222
- res.type = 'unknownMacro';
223
- const joined = (sub_body.body as KatexParseNode[]).map((n) => n.text).join('');
224
- if (/^\\[a-zA-Z]+$/.test(joined)){
225
- res.content = joined.substring(1);
226
- break;
227
- }
228
- }
229
- }
230
- throw new KatexNodeToTexNodeError(`Unknown error type in parsed result:`, node);
231
- default:
232
- throw new KatexNodeToTexNodeError(`Unknown node type: ${node.type}`, node);
233
754
  break;
755
+ default:
756
+ throw new LatexNodeToTexNodeError(`Unknown node type: ${node.type}`, node);
234
757
  }
235
758
  return res as TexNode;
236
759
  } catch (e) {
@@ -239,38 +762,10 @@ export function katexNodeToTexNode(node: KatexParseNode): TexNode {
239
762
  }
240
763
 
241
764
// Expand user-supplied macros by textual substitution.
//  - A key that is a control word (backslash followed by letters) only
//    matches when it is not followed by another letter, so `\R` does not
//    rewrite the start of `\Rightarrow`.
//  - The replacement is inserted literally; `$&`, `$1`, `$$` and similar
//    sequences have no special meaning.
//  - Empty keys are ignored.
function expand_custom_macros(tex: string, customTexMacros: {[key: string]: string}): string {
    let expanded = tex;
    for (const [macro, replacement] of Object.entries(customTexMacros)) {
        if (macro.length === 0) {
            continue;
        }
        const escaped = macro.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const boundary = /^\\[a-zA-Z]+$/.test(macro) ? '(?![a-zA-Z])' : '';
        // A function replacer prevents `$` patterns from being interpreted.
        expanded = expanded.replace(new RegExp(escaped + boundary, 'g'), () => replacement);
    }
    return expanded;
}

/**
 * Parse a LaTeX math string into a `TexNode` tree.
 *
 * @param tex - LaTeX source.
 * @param customTexMacros - Map from macro text (e.g. `\R`) to the LaTeX it
 *        expands to; applied textually, in `Object.entries` order, before
 *        parsing.
 * @returns The result of converting the parsed tree with `latexNodeToTexNode`.
 */
export function parseTex(tex: string, customTexMacros: {[key: string]: string}): TexNode {
    const parser = new LatexParser();
    const expanded = expand_custom_macros(tex, customTexMacros);
    const node = parser.parse(expanded);
    return latexNodeToTexNode(node);
}