tex2typst 0.1.20 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parser.ts CHANGED
@@ -1,244 +1,568 @@
1
- // @ts-ignore
2
- import katex from 'katex';
3
- import { TexNode, KatexParseNode, TexSupsubData } from './types';
1
+ import { TexNode, LatexParseNode, TexSupsubData } from "./types";
4
2
 
3
+ const UNARY_COMMANDS = [
4
+ 'sqrt',
5
+ 'text',
5
6
 
6
- const generateParseTree = katex.__parse;
7
+ 'arccos',
8
+ 'arcsin',
9
+ 'arctan',
10
+ 'arg',
11
+ 'bar',
12
+ 'bold',
13
+ 'boldsymbol',
14
+ 'ddot',
15
+ 'det',
16
+ 'dim',
17
+ 'dot',
18
+ 'exp',
19
+ 'gcd',
20
+ 'hat',
21
+ 'ker',
22
+ 'mathbb',
23
+ 'mathbf',
24
+ 'mathcal',
25
+ 'mathscr',
26
+ 'mathsf',
27
+ 'mathtt',
28
+ 'mathrm',
29
+ 'max',
30
+ 'min',
31
+ 'mod',
32
+ 'operatorname',
33
+ 'overbrace',
34
+ 'overline',
35
+ 'pmb',
36
+ 'sup',
37
+ 'rm',
38
+ 'tilde',
39
+ 'underbrace',
40
+ 'underline',
41
+ 'vec',
42
+ 'widehat',
43
+ 'widetilde',
44
+ ]
7
45
 
8
- export class KatexNodeToTexNodeError extends Error {
9
- node: KatexParseNode;
46
+ const BINARY_COMMANDS = [
47
+ 'frac',
48
+ 'tfrac',
49
+ 'binom',
50
+ 'dbinom',
51
+ 'dfrac',
52
+ 'tbinom',
53
+ ]
10
54
 
11
- constructor(message: string, node: KatexParseNode) {
55
+ const EMPTY_NODE = { 'type': 'empty', 'content': '' }
56
+
57
+ function assert(condition: boolean, message: string = ''): void {
58
+ if (!condition) {
59
+ throw new LatexParserError(message);
60
+ }
61
+ }
62
+
63
+
64
+ function get_command_param_num(command: string): number {
65
+ if (UNARY_COMMANDS.includes(command)) {
66
+ return 1;
67
+ } else if (BINARY_COMMANDS.includes(command)) {
68
+ return 2;
69
+ } else {
70
+ return 0;
71
+ }
72
+ }
73
+
74
+ function find_closing_curly_bracket(latex: string, start: number): number {
75
+ assert(latex[start] === '{');
76
+ let count = 1;
77
+ let pos = start + 1;
78
+
79
+ while (count > 0) {
80
+ if (pos >= latex.length) {
81
+ throw new LatexParserError('Unmatched curly brackets');
82
+ }
83
+ if(pos + 1 < latex.length && (['\\{', '\\}'].includes(latex.substring(pos, pos + 2)))) {
84
+ pos += 2;
85
+ continue;
86
+ }
87
+ if (latex[pos] === '{') {
88
+ count += 1;
89
+ } else if (latex[pos] === '}') {
90
+ count -= 1;
91
+ }
92
+ pos += 1;
93
+ }
94
+
95
+ return pos - 1;
96
+ }
97
+
98
+ function find_closing_square_bracket(latex: string, start: number): number {
99
+ assert(latex[start] === '[');
100
+ let count = 1;
101
+ let pos = start + 1;
102
+
103
+ while (count > 0) {
104
+ if (pos >= latex.length) {
105
+ throw new LatexParserError('Unmatched square brackets');
106
+ }
107
+ if (latex[pos] === '[') {
108
+ count += 1;
109
+ } else if (latex[pos] === ']') {
110
+ count -= 1;
111
+ }
112
+ pos += 1;
113
+ }
114
+
115
+ return pos - 1;
116
+ }
117
+
118
+
119
+ function isalpha(char: string): boolean {
120
+ return 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'.includes(char);
121
+ }
122
+
123
+ function isdigit(char: string): boolean {
124
+ return '0123456789'.includes(char);
125
+ }
126
+
127
+
128
+
129
+ function find_command(latex: string, start: number, command_name: string): number {
130
+ const len_slash_command = 1 + command_name.length;
131
+ let pos = start;
132
+
133
+ while (pos < latex.length) {
134
+ pos = latex.indexOf('\\' + command_name, pos);
135
+ if (pos === -1) {
136
+ return -1;
137
+ }
138
+ if (pos + len_slash_command >= latex.length || !isalpha(latex[pos + len_slash_command])) {
139
+ return pos;
140
+ } else {
141
+ pos += len_slash_command;
142
+ }
143
+ }
144
+
145
+ return -1;
146
+ }
147
+
148
+ function find_closing_right_command(latex: string, start: number): number {
149
+ let count = 1;
150
+ let pos = start;
151
+
152
+ while (count > 0) {
153
+ if (pos >= latex.length) {
154
+ return -1;
155
+ }
156
+ const left_idx = find_command(latex, pos, 'left');
157
+ const right_idx = find_command(latex, pos, 'right');
158
+
159
+ if (right_idx === -1) {
160
+ return -1;
161
+ }
162
+
163
+ if (left_idx === -1 || left_idx > right_idx) {
164
+ // a \right is ahead
165
+ count -= 1;
166
+ pos = right_idx + '\\right'.length;
167
+ } else {
168
+ // a \left is ahead
169
+ count += 1;
170
+ pos = left_idx + '\\left'.length;
171
+ }
172
+ }
173
+
174
+ return pos - '\\right'.length;
175
+ }
176
+
177
+ function find_closing_end_command(latex: string, start: number): number {
178
+ let count = 1;
179
+ let pos = start;
180
+
181
+ while (count > 0) {
182
+ if (pos >= latex.length) {
183
+ return -1;
184
+ }
185
+ const begin_idx = find_command(latex, pos, 'begin');
186
+ const end_idx = find_command(latex, pos, 'end');
187
+
188
+ if (end_idx === -1) {
189
+ return -1;
190
+ }
191
+
192
+ if (begin_idx === -1 || begin_idx > end_idx) {
193
+ // an \end is ahead
194
+ count -= 1;
195
+ pos = end_idx + '\\end'.length;
196
+ } else {
197
+ // a \begin is ahead
198
+ count += 1;
199
+ pos = begin_idx + '\\begin'.length;
200
+ }
201
+ }
202
+
203
+ return pos - '\\end'.length;
204
+ }
205
+
206
+ function eat_whitespaces(latex: string, start: number): string {
207
+ let pos = start;
208
+ while (pos < latex.length && [' ', '\t', '\n'].includes(latex[pos])) {
209
+ pos += 1;
210
+ }
211
+ return latex.substring(start, pos);
212
+ }
213
+
214
+ function eat_spaces(latex: string, start: number): string {
215
+ let pos = start;
216
+ while (pos < latex.length && latex[pos] === ' ') {
217
+ pos += 1;
218
+ }
219
+ return latex.substring(start, pos);
220
+ }
221
+
222
+ function eat_command_name(latex: string, start: number): string {
223
+ let pos = start;
224
+ while (pos < latex.length && isalpha(latex[pos])) {
225
+ pos += 1;
226
+ }
227
+ return latex.substring(start, pos);
228
+ }
229
+
230
+ function eat_parenthesis(latex: string, start: number): string | null {
231
+ if ('()[]|'.includes(latex[start])) {
232
+ return latex[start];
233
+ } else if (start + 1 < latex.length && ['\\{', '\\}'].includes(latex.substring(start, start + 2))) {
234
+ return latex.substring(start, start + 2);
235
+ } else if (start + 6 < latex.length && ['\\lfloor', '\\rfloor'].includes(latex.substring(start, start + 7))) {
236
+ return latex.substring(start, start + 7);
237
+ } else if (start + 5 < latex.length && ['\\lceil', '\\rceil'].includes(latex.substring(start, start + 6))) {
238
+ return latex.substring(start, start + 6);
239
+ } else if (start + 6 < latex.length && ['\\langle', '\\rangle'].includes(latex.substring(start, start + 7))) {
240
+ return latex.substring(start, start + 7);
241
+ } else {
242
+ return null;
243
+ }
244
+ }
245
+
246
+ function eat_primes(latex: string, start: number): number {
247
+ let pos = start;
248
+ while (pos < latex.length && latex[pos] === "'") {
249
+ pos += 1;
250
+ }
251
+ return pos - start;
252
+ }
253
+
254
+
255
+ class LatexParserError extends Error {
256
+ constructor(message: string) {
12
257
  super(message);
13
- this.name = "KatexNodeToTexNodeError";
14
- this.node = node;
258
+ this.name = 'LatexParserError';
15
259
  }
16
260
  }
17
261
 
18
- export function katexNodeToTexNode(node: KatexParseNode): TexNode {
19
- try {
20
- if (node.loc) {
21
- delete node.loc;
262
+
263
+ type ParseResult = [LatexParseNode, number];
264
+
265
+ export class LatexParser {
266
+ space_sensitive: boolean;
267
+ newline_sensitive: boolean;
268
+
269
+ constructor(space_sensitive: boolean = false, newline_sensitive: boolean = true) {
270
+ this.space_sensitive = space_sensitive;
271
+ this.newline_sensitive = newline_sensitive;
272
+ }
273
+
274
+ parse(latex: string): LatexParseNode {
275
+ const results: LatexParseNode[] = [];
276
+ let pos = 0;
277
+
278
+ while (pos < latex.length) {
279
+ const [res, newPos] = this.parseNextExpr(latex, pos);
280
+ pos = newPos;
281
+ if (!this.space_sensitive && res.type === 'whitespace') {
282
+ continue;
283
+ }
284
+ if (!this.newline_sensitive && res.type === 'newline') {
285
+ continue;
286
+ }
287
+ if (res.type === 'control' && res.content === '&') {
288
+ throw new LatexParserError('Unexpected & outside of an alignment');
289
+ }
290
+ results.push(res);
22
291
  }
23
- let res = {} as TexNode;
24
- switch (node.type) {
25
- case 'atom':
26
- // basic symbol like +, -, =, '(', ')', '\{', '\}'
27
- // other punctuation-like macro such as \cdot, \to, \pm
28
- res.type = 'atom';
29
- res.content = node.text!;
30
- if (node.text === '\\{' || node.text === '\\}') {
31
- res.content = node.text.substring(1); // '{' or '}'
32
- } else if (node.text!.startsWith('\\')) {
33
- res.type = 'symbol';
34
- }
35
- break;
36
- case 'mathord':
37
- // basic variable like a, b, c
38
- // macro variable like \alpha, \beta, \gamma
39
- case 'textord':
40
- // - constant number like 1, 2, 3
41
- // - operator symbol like \nabla, \partial
42
- case 'op':
43
- // \lim, \sum
44
- case 'cr':
45
- // new line symbol '\\'
46
- res.type = 'symbol';
47
- res.content = node.text!;
48
- if (node.type === 'op') {
49
- res.content = node['name']!;
50
- } else if (node.type === 'cr') {
51
- res.content = '\\\\';
52
- }
53
- break;
54
- case 'genfrac':
55
- res.type = 'binaryFunc';
56
- if (node['leftDelim'] === '(' && node['rightDelim'] === ')') {
57
- // This occurs for \binom \tbinom
58
- res.content = '\\binom';
59
- } else {
60
- res.content = '\\frac';
61
- }
62
- res.args = [
63
- katexNodeToTexNode(node['numer']),
64
- katexNodeToTexNode(node['denom'])
65
- ];
66
- break;
67
- case 'supsub':
68
- res.type = 'supsub';
69
- res.irregularData = {} as TexSupsubData;
70
- if (node['base']) {
71
- res.irregularData.base = katexNodeToTexNode(node['base']);
72
- }
73
- if (node['sup']) {
74
- res.irregularData.sup = katexNodeToTexNode(node['sup']);
75
- }
76
- if (node['sub']) {
77
- res.irregularData.sub = katexNodeToTexNode(node['sub']);
78
- }
79
- break;
80
- case 'mclass':
81
- case 'ordgroup':
82
- res.type = 'ordgroup';
83
- res.args = (node.body as KatexParseNode[]).map((n: KatexParseNode) => katexNodeToTexNode(n));
84
- if (res.args!.length === 1) {
85
- res = res.args![0] as TexNode;
86
- }
87
- break;
88
- case 'leftright': {
89
- const body = katexNodeToTexNode({
90
- type: 'ordgroup',
91
- mode: 'math',
92
- body: node.body
93
- });
94
292
 
95
- res.type = 'leftright';
96
- let left: string = node['left']!;
97
- if (left === "\\{") {
98
- left = "{";
293
+ if (results.length === 0) {
294
+ return EMPTY_NODE;
295
+ } else if (results.length === 1) {
296
+ return results[0];
297
+ } else {
298
+ return { type: 'ordgroup', args: results };
299
+ }
300
+ }
301
+
302
+ parseNextExpr(latex: string, start: number): ParseResult {
303
+ let [base, pos] = this.parseNextExprWithoutSupSub(latex, start);
304
+ let sub: LatexParseNode | null = null;
305
+ let sup: LatexParseNode | null = null;
306
+ let num_prime = 0;
307
+
308
+ num_prime += eat_primes(latex, pos);
309
+ pos += num_prime;
310
+ if (pos < latex.length && latex[pos] === '_') {
311
+ [sub, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
312
+ num_prime += eat_primes(latex, pos);
313
+ pos += num_prime;
314
+ if (pos < latex.length && latex[pos] === '^') {
315
+ [sup, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
316
+ if (eat_primes(latex, pos) > 0) {
317
+ throw new LatexParserError('Double superscript');
99
318
  }
100
- let right: string = node['right']!;
101
- if (right === "\\}") {
102
- right = "}";
319
+ }
320
+ } else if (pos < latex.length && latex[pos] === '^') {
321
+ [sup, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
322
+ if (eat_primes(latex, pos) > 0) {
323
+ throw new LatexParserError('Double superscript');
324
+ }
325
+ if (pos < latex.length && latex[pos] === '_') {
326
+ [sub, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
327
+ if (eat_primes(latex, pos) > 0) {
328
+ throw new LatexParserError('Double superscript');
103
329
  }
104
- const is_atom = (str:string) => (['(', ')', '[', ']', '{', '}'].includes(str));
105
- res.args = [
106
- { type: is_atom(left)? 'atom': 'symbol', content: left },
107
- body,
108
- { type: is_atom(right)? 'atom': 'symbol', content: right}
109
- ];
110
- break;
111
330
  }
112
- case 'underline':
113
- case 'overline':
114
- res.type = 'unaryFunc';
115
- res.content = '\\' + node.type;
116
- res.args = [
117
- katexNodeToTexNode(node['body'] as KatexParseNode)
118
- ];
119
- break;
120
- case 'accent': {
121
- res.type = 'unaryFunc';
122
- res.content = node['label']!;
123
- res.args = [
124
- katexNodeToTexNode(node['base'])
125
- ];
126
- break;
331
+ }
332
+
333
+ if (sub !== null || sup !== null || num_prime > 0) {
334
+ const res = { type: 'supsub', base } as LatexParseNode;
335
+ if (sub) {
336
+ res.sub = sub;
127
337
  }
128
- case 'sqrt':
129
- if (node['index']) {
130
- // There is a [] after \sqrt
131
- // \sqrt[some thing]{}
132
- res.irregularData = katexNodeToTexNode(node['index']);
338
+ if (num_prime > 0) {
339
+ res.sup = { type: 'ordgroup', args: [] };
340
+ for (let i = 0; i < num_prime; i++) {
341
+ res.sup.args!.push({ type: 'command', content: 'prime' });
133
342
  }
134
- // Fall through
135
- case 'font':
136
- case 'operatorname':
137
- res.type = 'unaryFunc';
138
- res.content = ('\\' + node.type!) as string;
139
- if (node.type === 'font') {
140
- res.content = '\\' + node['font']; // e.g. \mathbf, \mathrm
343
+ if (sup) {
344
+ res.sup.args!.push(sup);
141
345
  }
142
- if(Array.isArray(node.body)) {
143
- const obj = {
144
- type: 'ordgroup',
145
- mode: 'math',
146
- body: node.body as KatexParseNode[]
147
- } as KatexParseNode;
148
- res.args = [
149
- katexNodeToTexNode(obj)
150
- ]
151
- } else {
152
- res.args = [
153
- katexNodeToTexNode(node.body as KatexParseNode)
154
- ]
346
+ if (res.sup.args!.length === 1) {
347
+ res.sup = res.sup.args![0];
155
348
  }
156
- break;
157
- case 'horizBrace':
158
- res.type = 'unaryFunc';
159
- res.content = node['label']!; // '\\overbrace' or '\\unerbrace'
160
- res.args = [
161
- katexNodeToTexNode(node['base']),
162
- ];
163
- break;
164
- case 'array':
165
- if (node['colSeparationType'] === 'align') {
166
- // align environment
167
- res.type = 'align';
168
- } else {
169
- res.type = 'matrix'
170
- }
171
- res.irregularData = (node.body! as KatexParseNode[][]).map((row: KatexParseNode[]) => {
172
- return row.map((cell: KatexParseNode) => {
173
- if (cell.type !== 'styling' || (cell.body as KatexParseNode[]).length !== 1) {
174
- throw new KatexNodeToTexNodeError("Expecting cell.type==='\\styling' and cell.body.length===1", cell);
175
- }
176
- return katexNodeToTexNode((cell.body as KatexParseNode[])[0]);
177
- });
178
- });
179
- break;
349
+ } else if (sup) {
350
+ res.sup = sup;
351
+ }
352
+ return [res, pos];
353
+ } else {
354
+ return [base, pos];
355
+ }
356
+ }
180
357
 
181
- case 'text': {
182
- res.type = 'text';
183
- let str = "";
184
- (node.body as KatexParseNode[]).forEach((n) => {
185
- if(n.mode !== 'text') {
186
- throw new KatexNodeToTexNodeError("Expecting node.mode==='text'", node)
187
- }
188
- str += n.text;
189
- });
190
- res.content = str;
191
- break;
358
+ parseNextExprWithoutSupSub(latex: string, start: number): ParseResult {
359
+ const firstChar = latex[start];
360
+ if (firstChar === '{') {
361
+ const posClosingBracket = find_closing_curly_bracket(latex, start);
362
+ const exprInside = latex.slice(start + 1, posClosingBracket);
363
+ return [this.parse(exprInside), posClosingBracket + 1];
364
+ } else if (firstChar === '\\') {
365
+ if (start + 1 >= latex.length) {
366
+ throw new LatexParserError('Expecting command name after \\');
192
367
  }
193
- case 'spacing':
194
- // res.type = 'spacing';
195
- // res.content = node.text! as string;
196
- // break;
197
- case 'kern':
198
- // This can occur for \implies, \iff.
199
- // e.g. \implies is parsed as [{type:'kern'}, {type:'atom', text:'\\Longrightarrow'}, {type:'kern'}]
200
- // TODO: Ideally, we should output a single symbol \implies.
201
- // But for now, we simply let the output be \Longrightarrow
202
- res.type = 'empty';
203
- res.content = ' ';
204
- break;
205
-
206
- case 'htmlmathml': {
207
- // This can occur for \neq.
208
- const element = (node['mathml'] as KatexParseNode[])[0]!['body']![0];
209
- if (element && element.type === 'textord' && element.text === '≠') {
210
- res.type = 'symbol';
211
- res.content = '\\neq';
212
- break;
213
- } else {
214
- // Fall through to throw error
215
- }
368
+ const firstTwoChars = latex.slice(start, start + 2);
369
+ if (firstTwoChars === '\\\\') {
370
+ return [{ type: 'control', content: '\\\\' }, start + 2];
371
+ } else if (firstTwoChars === '\\{' || firstTwoChars === '\\}') {
372
+ return [{ type: 'token-parenthesis', content: firstTwoChars }, start + 2];
373
+ } else if (['\\%', '\\$', '\\&', '\\#', '\\_'].includes(firstTwoChars)) {
374
+ return [{ type: 'token', content: firstTwoChars }, start + 2];
375
+ } else if (latex.slice(start).startsWith('\\begin{')) {
376
+ return this.parseBeginEndExpr(latex, start);
377
+ } else if (latex.slice(start).startsWith('\\left') && (start + 5 >= latex.length || !isalpha(latex[start + 5]))) {
378
+ return this.parseLeftRightExpr(latex, start);
379
+ } else {
380
+ return this.parseCommandExpr(latex, start);
216
381
  }
217
- case 'color':
218
- // KaTeX encounters an unrecognized macro.
219
- if (Array.isArray(node.body) && node.body.length === 1) {
220
- const sub_body = node.body[0] as KatexParseNode;
221
- if (sub_body.type === 'text') {
222
- res.type = 'unknownMacro';
223
- const joined = (sub_body.body as KatexParseNode[]).map((n) => n.text).join('');
224
- if (/^\\[a-zA-Z]+$/.test(joined)){
225
- res.content = joined.substring(1);
226
- break;
227
- }
228
- }
229
- }
230
- throw new KatexNodeToTexNodeError(`Unknown error type in parsed result:`, node);
231
- case 'comment':
232
- res.type = 'comment';
233
- res.content = node.text!;
234
- break;
235
- default:
236
- throw new KatexNodeToTexNodeError(`Unknown node type: ${node.type}`, node);
237
- break;
382
+ } else if (firstChar === '%') {
383
+ let pos = start + 1;
384
+ while (pos < latex.length && latex[pos] !== '\n') {
385
+ pos += 1;
386
+ }
387
+ return [{ type: 'comment', content: latex.slice(start + 1, pos) }, pos];
388
+ } else if (isdigit(firstChar)) {
389
+ let pos = start;
390
+ while (pos < latex.length && isdigit(latex[pos])) {
391
+ pos += 1;
392
+ }
393
+ return [{ type: 'token-number', content: latex.slice(start, pos) }, pos];
394
+ } else if (isalpha(firstChar)) {
395
+ return [{ type: 'token-letter-var', content: firstChar }, start + 1];
396
+ } else if ('+-*/=<>!'.includes(firstChar)) {
397
+ return [{ type: 'token-operator', content: firstChar }, start + 1];
398
+ } else if ('.,;?'.includes(firstChar)) {
399
+ return [{ type: 'atom', content: firstChar }, start + 1];
400
+ } else if ('()[]'.includes(firstChar)) {
401
+ return [{ type: 'token-parenthesis', content: firstChar }, start + 1];
402
+ } else if (firstChar === '_') {
403
+ let [sub, pos] = this.parseNextExpr(latex, start + 1);
404
+ let sup: LatexParseNode | undefined = undefined;
405
+ if (pos < latex.length && latex[pos] === '^') {
406
+ [sup, pos] = this.parseNextExpr(latex, pos + 1);
407
+ }
408
+ return [{ type: 'supsub', base: EMPTY_NODE, sub, sup }, pos];
409
+ } else if (firstChar === '^') {
410
+ let [sup, pos] = this.parseNextExpr(latex, start + 1);
411
+ let sub: LatexParseNode | undefined = undefined;
412
+ if (pos < latex.length && latex[pos] === '_') {
413
+ [sub, pos] = this.parseNextExpr(latex, pos + 1);
414
+ }
415
+ return [{ type: 'supsub', base: EMPTY_NODE, sub, sup }, pos];
416
+ } else if (firstChar === ' ') {
417
+ let pos = start;
418
+ while (pos < latex.length && latex[pos] === ' ') {
419
+ pos += 1;
420
+ }
421
+ return [{ type: 'whitespace', content: latex.slice(start, pos) }, pos];
422
+ } else if (firstChar === '\n') {
423
+ return [{ type: 'newline', content: '\n' }, start + 1];
424
+ } else if (firstChar === '\r') {
425
+ if (start + 1 < latex.length && latex[start + 1] === '\n') {
426
+ return [{ type: 'newline', content: '\n' }, start + 2];
427
+ } else {
428
+ return [{ type: 'newline', content: '\n' }, start + 1];
429
+ }
430
+ } else if (firstChar === '&') {
431
+ return [{ type: 'control', content: '&' }, start + 1];
432
+ } else {
433
+ return [{ type: 'unknown', content: firstChar }, start + 1];
238
434
  }
239
- return res as TexNode;
240
- } catch (e) {
241
- throw e;
435
+ }
436
+
437
+ parseCommandExpr(latex: string, start: number): ParseResult {
438
+ assert(latex[start] === '\\');
439
+ let pos = start + 1;
440
+ const command = eat_command_name(latex, pos);
441
+ pos += command.length;
442
+ const paramNum = get_command_param_num(command);
443
+ if (paramNum === 0) {
444
+ return [{ type: 'command', content: command }, pos];
445
+ } else if (paramNum === 1) {
446
+ if (command === 'sqrt' && pos < latex.length && latex[pos] === '[') {
447
+ const posLeftSquareBracket = pos;
448
+ const posRightSquareBracket = find_closing_square_bracket(latex, pos);
449
+ const exprInside = latex.slice(posLeftSquareBracket + 1, posRightSquareBracket);
450
+ const exponent = this.parse(exprInside);
451
+ const [arg1, newPos] = this.parseNextExprWithoutSupSub(latex, posRightSquareBracket + 1);
452
+ return [{ type: 'command', content: command, arg1, exponent }, newPos];
453
+ } else if (command === 'text') {
454
+ assert(latex[pos] === '{');
455
+ const posClosingBracket = find_closing_curly_bracket(latex, pos);
456
+ const text = latex.slice(pos + 1, posClosingBracket);
457
+ return [{ type: 'text', content: text }, posClosingBracket + 1];
458
+ } else {
459
+ let [arg1, newPos] = this.parseNextExprWithoutSupSub(latex, pos);
460
+ return [{ type: 'command', content: command, arg1 }, newPos];
461
+ }
462
+ } else if (paramNum === 2) {
463
+ const [arg1, pos1] = this.parseNextExprWithoutSupSub(latex, pos);
464
+ const [arg2, pos2] = this.parseNextExprWithoutSupSub(latex, pos1);
465
+ return [{ type: 'command', content: command, arg1, arg2 }, pos2];
466
+ } else {
467
+ throw new Error( 'Invalid number of parameters');
468
+ }
469
+ }
470
+
471
+ parseLeftRightExpr(latex: string, start: number): ParseResult {
472
+ assert(latex.slice(start, start + 5) === '\\left');
473
+ let pos = start + '\\left'.length;
474
+ pos += eat_whitespaces(latex, pos).length;
475
+ if (pos >= latex.length) {
476
+ throw new LatexParserError('Expecting delimiter after \\left');
477
+ }
478
+ const leftDelimiter = eat_parenthesis(latex, pos);
479
+ if (leftDelimiter === null) {
480
+ throw new LatexParserError('Invalid delimiter after \\left');
481
+ }
482
+ pos += leftDelimiter.length;
483
+ const exprInsideStart = pos;
484
+ const idx = find_closing_right_command(latex, pos);
485
+ if (idx === -1) {
486
+ throw new LatexParserError('No matching \\right');
487
+ }
488
+ const exprInsideEnd = idx;
489
+ pos = idx + '\\right'.length;
490
+ pos += eat_whitespaces(latex, pos).length;
491
+ if (pos >= latex.length) {
492
+ throw new LatexParserError('Expecting delimiter after \\right');
493
+ }
494
+ const rightDelimiter = eat_parenthesis(latex, pos);
495
+ if (rightDelimiter === null) {
496
+ throw new LatexParserError('Invalid delimiter after \\right');
497
+ }
498
+ pos += rightDelimiter.length;
499
+ const exprInside = latex.slice(exprInsideStart, exprInsideEnd);
500
+ const body = this.parse(exprInside);
501
+ const res = { type: 'leftright', left: leftDelimiter, right: rightDelimiter, body };
502
+ return [res, pos];
503
+ }
504
+
505
+
506
+ parseBeginEndExpr(latex: string, start: number): ParseResult {
507
+ assert(latex.slice(start, start + 7) === '\\begin{');
508
+ let pos = start + '\\begin'.length;
509
+ const idx = find_closing_curly_bracket(latex, pos);
510
+ if (idx === -1) {
511
+ throw new LatexParserError('No matching } after \\begin{');
512
+ }
513
+ const envName = latex.slice(pos + 1, idx);
514
+ pos = idx + 1;
515
+ pos += eat_whitespaces(latex, pos).length; // ignore whitespaces and '\n' after \begin{envName}
516
+ const exprInsideStart = pos;
517
+ const endIdx = find_closing_end_command(latex, pos);
518
+ if (endIdx === -1) {
519
+ throw new LatexParserError('No matching \\end');
520
+ }
521
+ const exprInsideEnd = endIdx;
522
+ pos = endIdx + '\\end'.length;
523
+ const closingIdx = find_closing_curly_bracket(latex, pos);
524
+ if (closingIdx === -1) {
525
+ throw new LatexParserError('No matching } after \\end{');
526
+ }
527
+ if (latex.slice(pos + 1, closingIdx) !== envName) {
528
+ throw new LatexParserError('Mismatched \\begin and \\end environments');
529
+ }
530
+ let exprInside = latex.slice(exprInsideStart, exprInsideEnd);
531
+ exprInside = exprInside.trimEnd(); // ignore whitespaces and '\n' before \end{envName}
532
+ const body = this.parseAligned(exprInside);
533
+ const res = { type: 'beginend', content: envName, body };
534
+ return [res, closingIdx + 1];
535
+ }
536
+
537
+ parseAligned(latex: string): LatexParseNode[][] {
538
+ let pos = 0;
539
+ const allRows: LatexParseNode[][] = [];
540
+ let row: LatexParseNode[] = [];
541
+ allRows.push(row);
542
+ let group: LatexParseNode = { type: 'ordgroup', args: [] };
543
+ row.push(group);
544
+
545
+ while (pos < latex.length) {
546
+ const [res, newPos] = this.parseNextExpr(latex, pos);
547
+ pos = newPos;
548
+ if (res.type === 'whitespace') {
549
+ continue;
550
+ } else if (res.type === 'newline' && !this.newline_sensitive) {
551
+ continue;
552
+ } else if (res.type === 'control' && res.content === '\\\\') {
553
+ row = [];
554
+ group = { type: 'ordgroup', args: [] };
555
+ row.push(group);
556
+ allRows.push(row);
557
+ } else if (res.type === 'control' && res.content === '&') {
558
+ group = { type: 'ordgroup', args: [] };
559
+ row.push(group);
560
+ } else {
561
+ group.args!.push(res);
562
+ }
563
+ }
564
+
565
+ return allRows;
242
566
  }
243
567
  }
244
568
 
@@ -281,7 +605,7 @@ function splitTex(tex: string): string[] {
281
605
  const has_begin_command = line.includes('\\begin{');
282
606
  const followed_by_end_command = lines[i + 1].includes('\\end{');
283
607
  if(!has_begin_command && !followed_by_end_command) {
284
- current_tex += "\\SyMbOlNeWlInE ";
608
+ current_tex += '\n';
285
609
  }
286
610
  }
287
611
 
@@ -297,58 +621,151 @@ function splitTex(tex: string): string[] {
297
621
  return out_tex_list;
298
622
  }
299
623
 
300
- export function parseTex(tex: string, customTexMacros: {[key: string]: string}): TexNode {
301
- // displayMode=true. Otherwise, "KaTeX parse error: {align*} can be used only in display mode."
302
- const macros = {
303
- // KaTeX parse these commands so complicatedly that we need some hacks to keep things simple.
304
- '\\mod': '\\operatorname{SyMb01-mod}',
305
- '\\liminf': '\\operatorname{SyMb01-liminf}',
306
- '\\limsup': '\\operatorname{SyMb01-limsup}',
307
- '\\qquad': '\\operatorname{SyMb01-qquad}',
308
- '\\quad': '\\operatorname{SyMb01-quad}',
309
- '\\cdots': '\\operatorname{SyMb01-cdots}',
310
- '\\colon': '\\operatorname{SyMb01-colon}',
311
- '\\imath': '\\operatorname{SyMb01-imath}',
312
- '\\\iiiint': '\\operatorname{SyMb01-iiiint}', // \iiint is valid in LaTeX but not supported in KaTeX
313
- '\\jmath': '\\operatorname{SyMb01-jmath}',
314
- '\\vdots': '\\operatorname{SyMb01-vdots}',
315
- '\\notin': '\\operatorname{SyMb01-notin}',
316
- '\\slash': '\\operatorname{SyMb01-slash}',
317
- '\\LaTeX': '\\operatorname{SyMb01-LaTeX}',
318
- '\\TeX': '\\operatorname{SyMb01-TeX}',
319
- '\\SyMbOlNeWlInE': '\\operatorname{SyMb01-newline}',
320
- ...customTexMacros
321
- };
322
- const options = {
323
- macros: macros,
324
- displayMode: true,
325
- strict: "ignore",
326
- throwOnError: false
327
- };
328
-
329
- const tex_list = splitTex(tex);
330
-
331
- let treeArray: KatexParseNode[] = [];
332
-
333
- for (const tex_item of tex_list) {
334
- if (tex_item.startsWith('%')) {
335
- const tex_node: KatexParseNode = {
336
- type: 'comment',
337
- mode: 'math',
338
- text: tex_item.substring(1),
339
- };
340
- treeArray.push(tex_node);
341
- continue;
624
+ export class LatexNodeToTexNodeError extends Error {
625
+ node: LatexParseNode;
626
+
627
+ constructor(message: string, node: LatexParseNode) {
628
+ super(message);
629
+ this.name = "LatexNodeToTexNodeError";
630
+ this.node = node;
631
+ }
632
+ }
633
+
634
+ function latexNodeToTexNode(node: LatexParseNode): TexNode {
635
+ try {
636
+ let res = {} as TexNode;
637
+ switch (node.type) {
638
+ case 'ordgroup':
639
+ res.type = 'ordgroup';
640
+ res.args = (node.args as LatexParseNode[]).map((n: LatexParseNode) => latexNodeToTexNode(n));
641
+ if (res.args!.length === 1) {
642
+ res = res.args![0] as TexNode;
643
+ }
644
+ break;
645
+ case 'empty':
646
+ res.type = 'empty';
647
+ res.content = '';
648
+ break;
649
+ case 'atom':
650
+ res.type = 'atom';
651
+ res.content = node.content!;
652
+ break;
653
+ case 'token':
654
+ case 'token-letter-var':
655
+ case 'token-number':
656
+ case 'token-operator':
657
+ case 'token-parenthesis':
658
+ res.type = 'symbol';
659
+ res.content = node.content!;
660
+ break;
661
+ case 'supsub':
662
+ res.type = 'supsub';
663
+ res.irregularData = {} as TexSupsubData;
664
+ if (node['base']) {
665
+ res.irregularData.base = latexNodeToTexNode(node['base']);
666
+ }
667
+ if (node['sup']) {
668
+ res.irregularData.sup = latexNodeToTexNode(node['sup']);
669
+ }
670
+ if (node['sub']) {
671
+ res.irregularData.sub = latexNodeToTexNode(node['sub']);
672
+ }
673
+ break;
674
+ case 'leftright':
675
+ res.type = 'leftright';
676
+
677
+ const body = latexNodeToTexNode(node.body as LatexParseNode);
678
+
679
+ let left: string = node['left']!;
680
+ if (left === "\\{") {
681
+ left = "{";
682
+ }
683
+ let right: string = node['right']!;
684
+ if (right === "\\}") {
685
+ right = "}";
686
+ }
687
+ const is_atom = (str:string) => (['(', ')', '[', ']', '{', '}'].includes(str));
688
+ res.args = [
689
+ { type: is_atom(left)? 'atom': 'symbol', content: left },
690
+ body,
691
+ { type: is_atom(right)? 'atom': 'symbol', content: right}
692
+ ];
693
+ break;
694
+ case 'beginend':
695
+ if (node.content?.startsWith('align')) {
696
+ // align, align*, alignat, alignat*, aligned, etc.
697
+ res.type = 'align';
698
+ } else {
699
+ res.type = 'matrix';
700
+ }
701
+ res.content = node.content!;
702
+ res.irregularData = (node.body as LatexParseNode[][]).map((row: LatexParseNode[]) => {
703
+ return row.map((n: LatexParseNode) => latexNodeToTexNode(n));
704
+ });
705
+ break;
706
+ case 'command':
707
+ const num_args = get_command_param_num(node.content!);
708
+ res.content = '\\' + node.content!;
709
+ if (num_args === 0) {
710
+ res.type = 'symbol';
711
+ } else if (num_args === 1) {
712
+ res.type = 'unaryFunc';
713
+ res.args = [
714
+ latexNodeToTexNode(node.arg1 as LatexParseNode)
715
+ ]
716
+ if (node.content === 'sqrt') {
717
+ if (node.exponent) {
718
+ res.irregularData = latexNodeToTexNode(node.exponent) as TexNode;
719
+ }
720
+ }
721
+ } else if (num_args === 2) {
722
+ res.type = 'binaryFunc';
723
+ res.args = [
724
+ latexNodeToTexNode(node.arg1 as LatexParseNode),
725
+ latexNodeToTexNode(node.arg2 as LatexParseNode)
726
+ ]
727
+ } else {
728
+ throw new LatexNodeToTexNodeError('Invalid number of arguments', node);
729
+ }
730
+ break;
731
+ case 'text':
732
+ res.type = 'text';
733
+ res.content = node.content!;
734
+ break;
735
+ case 'comment':
736
+ res.type = 'comment';
737
+ res.content = node.content!;
738
+ break;
739
+ case 'whitespace':
740
+ res.type = 'empty';
741
+ break;
742
+ case 'newline':
743
+ res.type = 'newline';
744
+ res.content = '\n';
745
+ break;
746
+ case 'control':
747
+ if (node.content === '\\\\') {
748
+ res.type = 'symbol';
749
+ res.content = node.content!;
750
+ break;
751
+ } else {
752
+ throw new LatexNodeToTexNodeError(`Unknown control sequence: ${node.content}`, node);
753
+ }
754
+ break;
755
+ default:
756
+ throw new LatexNodeToTexNodeError(`Unknown node type: ${node.type}`, node);
342
757
  }
343
- const trees = generateParseTree(tex_item, options);
344
- treeArray = treeArray.concat(trees);
758
+ return res as TexNode;
759
+ } catch (e) {
760
+ throw e;
345
761
  }
762
+ }
346
763
 
347
- let t = {
348
- type: 'ordgroup',
349
- mode: 'math',
350
- body: treeArray as KatexParseNode[],
351
- loc: {}
352
- } as KatexParseNode;
353
- return katexNodeToTexNode(t);
764
+ export function parseTex(tex: string, customTexMacros: {[key: string]: string}): TexNode {
765
+ const parser = new LatexParser();
766
+ for (const [macro, replacement] of Object.entries(customTexMacros)) {
767
+ tex = tex.replaceAll(macro, replacement);
768
+ }
769
+ const node = parser.parse(tex);
770
+ return latexNodeToTexNode(node);
354
771
  }