tex2typst 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parser.ts CHANGED
@@ -1,4 +1,5 @@
1
- import { TexNode, LatexParseNode, TexSupsubData } from "./types";
1
+ import { TexNode, TexSupsubData } from "./types";
2
+
2
3
 
3
4
  const UNARY_COMMANDS = [
4
5
  'sqrt',
@@ -60,7 +61,6 @@ function assert(condition: boolean, message: string = ''): void {
60
61
  }
61
62
  }
62
63
 
63
-
64
64
  function get_command_param_num(command: string): number {
65
65
  if (UNARY_COMMANDS.includes(command)) {
66
66
  return 1;
@@ -71,22 +71,21 @@ function get_command_param_num(command: string): number {
71
71
  }
72
72
  }
73
73
 
74
- function find_closing_curly_bracket(latex: string, start: number): number {
75
- assert(latex[start] === '{');
74
+ const LEFT_CURLY_BRACKET: Token = {type: 'control', value: '{'};
75
+ const RIGHT_CURLY_BRACKET: Token = {type: 'control', value: '}'};
76
+
77
+ function find_closing_curly_bracket(tokens: Token[], start: number): number {
78
+ assert(token_eq(tokens[start], LEFT_CURLY_BRACKET));
76
79
  let count = 1;
77
80
  let pos = start + 1;
78
81
 
79
82
  while (count > 0) {
80
- if (pos >= latex.length) {
83
+ if (pos >= tokens.length) {
81
84
  throw new LatexParserError('Unmatched curly brackets');
82
85
  }
83
- if(pos + 1 < latex.length && (['\\{', '\\}'].includes(latex.substring(pos, pos + 2)))) {
84
- pos += 2;
85
- continue;
86
- }
87
- if (latex[pos] === '{') {
86
+ if (token_eq(tokens[pos], LEFT_CURLY_BRACKET)) {
88
87
  count += 1;
89
- } else if (latex[pos] === '}') {
88
+ } else if (token_eq(tokens[pos], RIGHT_CURLY_BRACKET)) {
90
89
  count -= 1;
91
90
  }
92
91
  pos += 1;
@@ -95,18 +94,21 @@ function find_closing_curly_bracket(latex: string, start: number): number {
95
94
  return pos - 1;
96
95
  }
97
96
 
98
- function find_closing_square_bracket(latex: string, start: number): number {
99
- assert(latex[start] === '[');
97
+ const LEFT_SQUARE_BRACKET: Token = {type: 'element', value: '['};
98
+ const RIGHT_SQUARE_BRACKET: Token = {type: 'element', value: ']'};
99
+
100
+ function find_closing_square_bracket(tokens: Token[], start: number): number {
101
+ assert(token_eq(tokens[start], LEFT_SQUARE_BRACKET));
100
102
  let count = 1;
101
103
  let pos = start + 1;
102
104
 
103
105
  while (count > 0) {
104
- if (pos >= latex.length) {
106
+ if (pos >= tokens.length) {
105
107
  throw new LatexParserError('Unmatched square brackets');
106
108
  }
107
- if (latex[pos] === '[') {
109
+ if (token_eq(tokens[pos], LEFT_SQUARE_BRACKET)) {
108
110
  count += 1;
109
- } else if (latex[pos] === ']') {
111
+ } else if (token_eq(tokens[pos], RIGHT_SQUARE_BRACKET)) {
110
112
  count -= 1;
111
113
  }
112
114
  pos += 1;
@@ -124,135 +126,235 @@ function isdigit(char: string): boolean {
124
126
  return '0123456789'.includes(char);
125
127
  }
126
128
 
129
+ function eat_whitespaces(tokens: Token[], start: number): Token[] {
130
+ let pos = start;
131
+ while (pos < tokens.length && ['whitespace', 'newline'].includes(tokens[pos].type)) {
132
+ pos++;
133
+ }
134
+ return tokens.slice(start, pos);
135
+ }
127
136
 
128
137
 
129
- function find_command(latex: string, start: number, command_name: string): number {
130
- const len_slash_command = 1 + command_name.length;
138
+ function eat_parenthesis(tokens: Token[], start: number): Token | null {
139
+ const firstToken = tokens[start];
140
+ if (firstToken.type === 'element' && ['(', ')', '[', ']', '|', '\\{', '\\}'].includes(firstToken.value)) {
141
+ return firstToken;
142
+ } else if (firstToken.type === 'command' && ['lfloor', 'rfloor', 'lceil', 'rceil', 'langle', 'rangle'].includes(firstToken.value.slice(1))) {
143
+ return firstToken;
144
+ } else {
145
+ return null;
146
+ }
147
+ }
148
+
149
+ function eat_primes(tokens: Token[], start: number): number {
131
150
  let pos = start;
151
+ while (pos < tokens.length && token_eq(tokens[pos], { type: 'element', value: "'" })) {
152
+ pos += 1;
153
+ }
154
+ return pos - start;
155
+ }
132
156
 
133
- while (pos < latex.length) {
134
- pos = latex.indexOf('\\' + command_name, pos);
135
- if (pos === -1) {
157
+
158
+ function eat_command_name(latex: string, start: number): string {
159
+ let pos = start;
160
+ while (pos < latex.length && isalpha(latex[pos])) {
161
+ pos += 1;
162
+ }
163
+ return latex.substring(start, pos);
164
+ }
165
+
166
+
167
+
168
+
169
+ const LEFT_COMMAND: Token = { type: 'command', value: '\\left' };
170
+ const RIGHT_COMMAND: Token = { type: 'command', value: '\\right' };
171
+
172
+ function find_closing_right_command(tokens: Token[], start: number): number {
173
+ let count = 1;
174
+ let pos = start;
175
+
176
+ while (count > 0) {
177
+ if (pos >= tokens.length) {
136
178
  return -1;
137
179
  }
138
- if (pos + len_slash_command >= latex.length || !isalpha(latex[pos + len_slash_command])) {
139
- return pos;
140
- } else {
141
- pos += len_slash_command;
180
+ if (token_eq(tokens[pos], LEFT_COMMAND)) {
181
+ count += 1;
182
+ } else if (token_eq(tokens[pos], RIGHT_COMMAND)) {
183
+ count -= 1;
142
184
  }
185
+ pos += 1;
143
186
  }
144
187
 
145
- return -1;
188
+ return pos - 1;
146
189
  }
147
190
 
148
- function find_closing_right_command(latex: string, start: number): number {
191
+
192
+ const BEGIN_COMMAND: Token = { type: 'command', value: '\\begin' };
193
+ const END_COMMAND: Token = { type: 'command', value: '\\end' };
194
+
195
+
196
+ function find_closing_end_command(tokens: Token[], start: number): number {
149
197
  let count = 1;
150
198
  let pos = start;
151
199
 
152
200
  while (count > 0) {
153
- if (pos >= latex.length) {
154
- return -1;
155
- }
156
- const left_idx = find_command(latex, pos, 'left');
157
- const right_idx = find_command(latex, pos, 'right');
158
-
159
- if (right_idx === -1) {
201
+ if (pos >= tokens.length) {
160
202
  return -1;
161
203
  }
162
-
163
- if (left_idx === -1 || left_idx > right_idx) {
164
- // a \right is ahead
165
- count -= 1;
166
- pos = right_idx + '\\right'.length;
167
- } else {
168
- // a \left is ahead
204
+ if (token_eq(tokens[pos], BEGIN_COMMAND)) {
169
205
  count += 1;
170
- pos = left_idx + '\\left'.length;
206
+ } else if (token_eq(tokens[pos], END_COMMAND)) {
207
+ count -= 1;
171
208
  }
209
+ pos += 1;
172
210
  }
173
211
 
174
- return pos - '\\right'.length;
212
+ return pos - 1;
175
213
  }
176
214
 
177
- function find_closing_end_command(latex: string, start: number): number {
215
+ function find_closing_curly_bracket_char(latex: string, start: number): number {
216
+ assert(latex[start] === '{');
178
217
  let count = 1;
179
- let pos = start;
218
+ let pos = start + 1;
180
219
 
181
220
  while (count > 0) {
182
221
  if (pos >= latex.length) {
183
- return -1;
222
+ throw new LatexParserError('Unmatched curly brackets');
184
223
  }
185
- const begin_idx = find_command(latex, pos, 'begin');
186
- const end_idx = find_command(latex, pos, 'end');
187
-
188
- if (end_idx === -1) {
189
- return -1;
224
+ if(pos + 1 < latex.length && (['\\{', '\\}'].includes(latex.substring(pos, pos + 2)))) {
225
+ pos += 2;
226
+ continue;
190
227
  }
191
-
192
- if (begin_idx === -1 || begin_idx > end_idx) {
193
- // an \end is ahead
194
- count -= 1;
195
- pos = end_idx + '\\end'.length;
196
- } else {
197
- // a \begin is ahead
228
+ if (latex[pos] === '{') {
198
229
  count += 1;
199
- pos = begin_idx + '\\begin'.length;
230
+ } else if (latex[pos] === '}') {
231
+ count -= 1;
200
232
  }
233
+ pos += 1;
201
234
  }
202
235
 
203
- return pos - '\\end'.length;
236
+ return pos - 1;
204
237
  }
205
238
 
206
- function eat_whitespaces(latex: string, start: number): string {
207
- let pos = start;
208
- while (pos < latex.length && [' ', '\t', '\n'].includes(latex[pos])) {
209
- pos += 1;
210
- }
211
- return latex.substring(start, pos);
212
- }
213
239
 
214
- function eat_spaces(latex: string, start: number): string {
215
- let pos = start;
216
- while (pos < latex.length && latex[pos] === ' ') {
217
- pos += 1;
218
- }
219
- return latex.substring(start, pos);
240
+ interface Token {
241
+ type: 'element' | 'command' | 'text' | 'comment' | 'whitespace' | 'newline' | 'control' | 'unknown';
242
+ value: string;
220
243
  }
221
244
 
222
- function eat_command_name(latex: string, start: number): string {
223
- let pos = start;
224
- while (pos < latex.length && isalpha(latex[pos])) {
225
- pos += 1;
226
- }
227
- return latex.substring(start, pos);
228
- }
245
+ function tokenize(latex: string): Token[] {
246
+ const tokens: Token[] = [];
247
+ let pos = 0;
229
248
 
230
- function eat_parenthesis(latex: string, start: number): string | null {
231
- if ('()[]|'.includes(latex[start])) {
232
- return latex[start];
233
- } else if (start + 1 < latex.length && ['\\{', '\\}'].includes(latex.substring(start, start + 2))) {
234
- return latex.substring(start, start + 2);
235
- } else if (start + 6 < latex.length && ['\\lfloor', '\\rfloor'].includes(latex.substring(start, start + 7))) {
236
- return latex.substring(start, start + 7);
237
- } else if (start + 5 < latex.length && ['\\lceil', '\\rceil'].includes(latex.substring(start, start + 6))) {
238
- return latex.substring(start, start + 6);
239
- } else if (start + 6 < latex.length && ['\\langle', '\\rangle'].includes(latex.substring(start, start + 7))) {
240
- return latex.substring(start, start + 7);
241
- } else {
242
- return null;
249
+ while (pos < latex.length) {
250
+ const firstChar = latex[pos];
251
+ let token: Token;
252
+ switch (firstChar) {
253
+ case '%': {
254
+ let newPos = pos + 1;
255
+ while (newPos < latex.length && latex[newPos] !== '\n') {
256
+ newPos += 1;
257
+ }
258
+ token = { type: 'comment', value: latex.slice(pos + 1, newPos) };
259
+ pos = newPos;
260
+ break;
261
+ }
262
+ case '{':
263
+ case '}':
264
+ case '_':
265
+ case '^':
266
+ case '&':
267
+ token = { type: 'control', value: firstChar};
268
+ pos++;
269
+ break;
270
+ case '\n':
271
+ token = { type: 'newline', value: firstChar};
272
+ pos++;
273
+ break;
274
+ case '\r': {
275
+ if (pos + 1 < latex.length && latex[pos + 1] === '\n') {
276
+ token = { type: 'newline', value: '\n' };
277
+ pos += 2;
278
+ } else {
279
+ token = { type: 'newline', value: '\n' };
280
+ pos ++;
281
+ }
282
+ break;
283
+ }
284
+ case ' ': {
285
+ let newPos = pos;
286
+ while (newPos < latex.length && latex[newPos] === ' ') {
287
+ newPos += 1;
288
+ }
289
+ token = {type: 'whitespace', value: latex.slice(pos, newPos)};
290
+ pos = newPos;
291
+ break;
292
+ }
293
+ case '\\': {
294
+ if (pos + 1 >= latex.length) {
295
+ throw new LatexParserError('Expecting command name after \\');
296
+ }
297
+ const firstTwoChars = latex.slice(pos, pos + 2);
298
+ if (firstTwoChars === '\\\\') {
299
+ token = { type: 'control', value: '\\\\' };
300
+ pos += 2;
301
+ } else if (['\\{','\\}', '\\%', '\\$', '\\&', '\\#', '\\_'].includes(firstTwoChars)) {
302
+ token = { type: 'element', value: firstTwoChars };
303
+ pos += 2;
304
+ } else {
305
+ const command = eat_command_name(latex, pos + 1);
306
+ token = { type: 'command', value: '\\' + command};
307
+ pos += 1 + command.length;
308
+ }
309
+ break;
310
+ }
311
+ default: {
312
+ if (isdigit(firstChar)) {
313
+ let newPos = pos;
314
+ while (newPos < latex.length && isdigit(latex[newPos])) {
315
+ newPos += 1;
316
+ }
317
+ token = { type: 'element', value: latex.slice(pos, newPos) }
318
+ } else if (isalpha(firstChar)) {
319
+ token = { type: 'element', value: firstChar };
320
+ } else if ('+-*/=\'<>!.,;?()[]|'.includes(firstChar)) {
321
+ token = { type: 'element', value: firstChar }
322
+ } else {
323
+ token = { type: 'unknown', value: firstChar };
324
+ }
325
+ pos += token.value.length;
326
+ }
327
+ }
328
+
329
+ tokens.push(token);
330
+
331
+ if (token.type === 'command' && ['\\text', '\\begin', '\\end'].includes(token.value)) {
332
+ if (pos >= latex.length || latex[pos] !== '{') {
333
+ throw new LatexParserError(`No content for ${token.value} command`);
334
+ }
335
+ tokens.push({ type: 'control', value: '{' });
336
+ const posClosingBracket = find_closing_curly_bracket_char(latex, pos);
337
+ pos++;
338
+ let textInside = latex.slice(pos, posClosingBracket);
339
+ // replace all escape characters with their actual characters
340
+ const chars = ['{', '}', '\\', '$', '&', '#', '_', '%'];
341
+ for (const char of chars) {
342
+ textInside = textInside.replaceAll('\\' + char, char);
343
+ }
344
+ tokens.push({ type: 'text', value: textInside });
345
+ tokens.push({ type: 'control', value: '}' });
346
+ pos = posClosingBracket + 1;
347
+ }
243
348
  }
349
+ return tokens;
244
350
  }
245
351
 
246
- function eat_primes(latex: string, start: number): number {
247
- let pos = start;
248
- while (pos < latex.length && latex[pos] === "'") {
249
- pos += 1;
250
- }
251
- return pos - start;
352
+ function token_eq(token1: Token, token2: Token) {
353
+ return token1.type == token2.type && token1.value == token2.value;
252
354
  }
253
355
 
254
356
 
255
- class LatexParserError extends Error {
357
+ export class LatexParserError extends Error {
256
358
  constructor(message: string) {
257
359
  super(message);
258
360
  this.name = 'LatexParserError';
@@ -260,7 +362,10 @@ class LatexParserError extends Error {
260
362
  }
261
363
 
262
364
 
263
- type ParseResult = [LatexParseNode, number];
365
+ type ParseResult = [TexNode, number];
366
+
367
+ const SUB_SYMBOL:Token = { type: 'control', value: '_' };
368
+ const SUP_SYMBOL:Token = { type: 'control', value: '^' };
264
369
 
265
370
  export class LatexParser {
266
371
  space_sensitive: boolean;
@@ -271,74 +376,87 @@ export class LatexParser {
271
376
  this.newline_sensitive = newline_sensitive;
272
377
  }
273
378
 
274
- parse(latex: string): LatexParseNode {
275
- const results: LatexParseNode[] = [];
379
+ parse(tokens: Token[]): TexNode {
380
+ const results: TexNode[] = [];
276
381
  let pos = 0;
277
-
278
- while (pos < latex.length) {
279
- const [res, newPos] = this.parseNextExpr(latex, pos);
280
- pos = newPos;
281
- if (!this.space_sensitive && res.type === 'whitespace') {
282
- continue;
283
- }
284
- if (!this.newline_sensitive && res.type === 'newline') {
285
- continue;
382
+ while (pos < tokens.length) {
383
+ const results: TexNode[] = [];
384
+ let pos = 0;
385
+
386
+ while (pos < tokens.length) {
387
+ const [res, newPos] = this.parseNextExpr(tokens, pos);
388
+ pos = newPos;
389
+ if (!this.space_sensitive && res.type === 'whitespace') {
390
+ continue;
391
+ }
392
+ if (!this.newline_sensitive && res.type === 'newline') {
393
+ continue;
394
+ }
395
+ if (res.type === 'control' && res.content === '&') {
396
+ throw new LatexParserError('Unexpected & outside of an alignment');
397
+ }
398
+ results.push(res);
286
399
  }
287
- if (res.type === 'control' && res.content === '&') {
288
- throw new LatexParserError('Unexpected & outside of an alignment');
400
+
401
+ if (results.length === 0) {
402
+ return EMPTY_NODE;
403
+ } else if (results.length === 1) {
404
+ return results[0];
405
+ } else {
406
+ return { type: 'ordgroup', content: '', args: results };
289
407
  }
290
- results.push(res);
291
408
  }
292
409
 
410
+
293
411
  if (results.length === 0) {
294
412
  return EMPTY_NODE;
295
413
  } else if (results.length === 1) {
296
414
  return results[0];
297
415
  } else {
298
- return { type: 'ordgroup', args: results };
416
+ return { type: 'ordgroup', content: '', args: results };
299
417
  }
300
418
  }
301
419
 
302
- parseNextExpr(latex: string, start: number): ParseResult {
303
- let [base, pos] = this.parseNextExprWithoutSupSub(latex, start);
304
- let sub: LatexParseNode | null = null;
305
- let sup: LatexParseNode | null = null;
420
+ parseNextExpr(tokens: Token[], start: number): ParseResult {
421
+ let [base, pos] = this.parseNextExprWithoutSupSub(tokens, start);
422
+ let sub: TexNode | null = null;
423
+ let sup: TexNode | null = null;
306
424
  let num_prime = 0;
307
425
 
308
- num_prime += eat_primes(latex, pos);
426
+ num_prime += eat_primes(tokens, pos);
309
427
  pos += num_prime;
310
- if (pos < latex.length && latex[pos] === '_') {
311
- [sub, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
312
- num_prime += eat_primes(latex, pos);
428
+ if (pos < tokens.length && token_eq(tokens[pos], SUB_SYMBOL)) {
429
+ [sub, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
430
+ num_prime += eat_primes(tokens, pos);
313
431
  pos += num_prime;
314
- if (pos < latex.length && latex[pos] === '^') {
315
- [sup, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
316
- if (eat_primes(latex, pos) > 0) {
432
+ if (pos < tokens.length && token_eq(tokens[pos], SUP_SYMBOL)) {
433
+ [sup, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
434
+ if (eat_primes(tokens, pos) > 0) {
317
435
  throw new LatexParserError('Double superscript');
318
436
  }
319
437
  }
320
- } else if (pos < latex.length && latex[pos] === '^') {
321
- [sup, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
322
- if (eat_primes(latex, pos) > 0) {
438
+ } else if (pos < tokens.length && token_eq(tokens[pos], SUP_SYMBOL)) {
439
+ [sup, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
440
+ if (eat_primes(tokens, pos) > 0) {
323
441
  throw new LatexParserError('Double superscript');
324
442
  }
325
- if (pos < latex.length && latex[pos] === '_') {
326
- [sub, pos] = this.parseNextExprWithoutSupSub(latex, pos + 1);
327
- if (eat_primes(latex, pos) > 0) {
443
+ if (pos < tokens.length && token_eq(tokens[pos], SUB_SYMBOL)) {
444
+ [sub, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
445
+ if (eat_primes(tokens, pos) > 0) {
328
446
  throw new LatexParserError('Double superscript');
329
447
  }
330
448
  }
331
449
  }
332
450
 
333
451
  if (sub !== null || sup !== null || num_prime > 0) {
334
- const res = { type: 'supsub', base } as LatexParseNode;
452
+ const res: TexSupsubData = { base };
335
453
  if (sub) {
336
454
  res.sub = sub;
337
455
  }
338
456
  if (num_prime > 0) {
339
- res.sup = { type: 'ordgroup', args: [] };
457
+ res.sup = { type: 'ordgroup', content: '', args: [] };
340
458
  for (let i = 0; i < num_prime; i++) {
341
- res.sup.args!.push({ type: 'command', content: 'prime' });
459
+ res.sup.args!.push({ type: 'symbol', content: '\\prime' });
342
460
  }
343
461
  if (sup) {
344
462
  res.sup.args!.push(sup);
@@ -349,201 +467,206 @@ export class LatexParser {
349
467
  } else if (sup) {
350
468
  res.sup = sup;
351
469
  }
352
- return [res, pos];
470
+ return [{type: 'supsub', content: '', data: res }, pos];
353
471
  } else {
354
472
  return [base, pos];
355
473
  }
356
474
  }
357
475
 
358
- parseNextExprWithoutSupSub(latex: string, start: number): ParseResult {
359
- const firstChar = latex[start];
360
- if (firstChar === '{') {
361
- const posClosingBracket = find_closing_curly_bracket(latex, start);
362
- const exprInside = latex.slice(start + 1, posClosingBracket);
363
- return [this.parse(exprInside), posClosingBracket + 1];
364
- } else if (firstChar === '\\') {
365
- if (start + 1 >= latex.length) {
366
- throw new LatexParserError('Expecting command name after \\');
367
- }
368
- const firstTwoChars = latex.slice(start, start + 2);
369
- if (firstTwoChars === '\\\\') {
370
- return [{ type: 'control', content: '\\\\' }, start + 2];
371
- } else if (firstTwoChars === '\\{' || firstTwoChars === '\\}') {
372
- return [{ type: 'token-parenthesis', content: firstTwoChars }, start + 2];
373
- } else if (['\\%', '\\$', '\\&', '\\#', '\\_'].includes(firstTwoChars)) {
374
- return [{ type: 'token', content: firstTwoChars }, start + 2];
375
- } else if (latex.slice(start).startsWith('\\begin{')) {
376
- return this.parseBeginEndExpr(latex, start);
377
- } else if (latex.slice(start).startsWith('\\left') && (start + 5 >= latex.length || !isalpha(latex[start + 5]))) {
378
- return this.parseLeftRightExpr(latex, start);
379
- } else {
380
- return this.parseCommandExpr(latex, start);
381
- }
382
- } else if (firstChar === '%') {
383
- let pos = start + 1;
384
- while (pos < latex.length && latex[pos] !== '\n') {
385
- pos += 1;
386
- }
387
- return [{ type: 'comment', content: latex.slice(start + 1, pos) }, pos];
388
- } else if (isdigit(firstChar)) {
389
- let pos = start;
390
- while (pos < latex.length && isdigit(latex[pos])) {
391
- pos += 1;
392
- }
393
- return [{ type: 'token-number', content: latex.slice(start, pos) }, pos];
394
- } else if (isalpha(firstChar)) {
395
- return [{ type: 'token-letter-var', content: firstChar }, start + 1];
396
- } else if ('+-*/=<>!'.includes(firstChar)) {
397
- return [{ type: 'token-operator', content: firstChar }, start + 1];
398
- } else if ('.,;?'.includes(firstChar)) {
399
- return [{ type: 'atom', content: firstChar }, start + 1];
400
- } else if ('()[]'.includes(firstChar)) {
401
- return [{ type: 'token-parenthesis', content: firstChar }, start + 1];
402
- } else if (firstChar === '_') {
403
- let [sub, pos] = this.parseNextExpr(latex, start + 1);
404
- let sup: LatexParseNode | undefined = undefined;
405
- if (pos < latex.length && latex[pos] === '^') {
406
- [sup, pos] = this.parseNextExpr(latex, pos + 1);
407
- }
408
- return [{ type: 'supsub', base: EMPTY_NODE, sub, sup }, pos];
409
- } else if (firstChar === '^') {
410
- let [sup, pos] = this.parseNextExpr(latex, start + 1);
411
- let sub: LatexParseNode | undefined = undefined;
412
- if (pos < latex.length && latex[pos] === '_') {
413
- [sub, pos] = this.parseNextExpr(latex, pos + 1);
414
- }
415
- return [{ type: 'supsub', base: EMPTY_NODE, sub, sup }, pos];
416
- } else if (firstChar === ' ') {
417
- let pos = start;
418
- while (pos < latex.length && latex[pos] === ' ') {
419
- pos += 1;
420
- }
421
- return [{ type: 'whitespace', content: latex.slice(start, pos) }, pos];
422
- } else if (firstChar === '\n') {
423
- return [{ type: 'newline', content: '\n' }, start + 1];
424
- } else if (firstChar === '\r') {
425
- if (start + 1 < latex.length && latex[start + 1] === '\n') {
426
- return [{ type: 'newline', content: '\n' }, start + 2];
427
- } else {
428
- return [{ type: 'newline', content: '\n' }, start + 1];
429
- }
430
- } else if (firstChar === '&') {
431
- return [{ type: 'control', content: '&' }, start + 1];
432
- } else {
433
- return [{ type: 'unknown', content: firstChar }, start + 1];
476
+ parseNextExprWithoutSupSub(tokens: Token[], start: number): ParseResult {
477
+ const firstToken = tokens[start];
478
+ const tokenType = firstToken.type;
479
+ switch (tokenType) {
480
+ case 'element':
481
+ case 'text':
482
+ case 'comment':
483
+ case 'whitespace':
484
+ case 'newline':
485
+ return [{ type: tokenType, content: firstToken.value }, start + 1];
486
+ case 'command':
487
+ if (token_eq(firstToken, BEGIN_COMMAND)) {
488
+ return this.parseBeginEndExpr(tokens, start);
489
+ } else if (token_eq(firstToken, LEFT_COMMAND)) {
490
+ return this.parseLeftRightExpr(tokens, start);
491
+ } else {
492
+ return this.parseCommandExpr(tokens, start);
493
+ }
494
+ case 'control':
495
+ const controlChar = firstToken.value;
496
+ switch (controlChar) {
497
+ case '{':
498
+ const posClosingBracket = find_closing_curly_bracket(tokens, start);
499
+ const exprInside = tokens.slice(start + 1, posClosingBracket);
500
+ return [this.parse(exprInside), posClosingBracket + 1];
501
+ case '}':
502
+ throw new LatexParserError("Unmatched '}'");
503
+ case '\\\\':
504
+ return [{ type: 'control', content: '\\\\' }, start + 1];
505
+ case '_': {
506
+ let [sub, pos] = this.parseNextExpr(tokens, start + 1);
507
+ let sup: TexNode | undefined = undefined;
508
+ if (pos < tokens.length && token_eq(tokens[pos], SUP_SYMBOL)) {
509
+ [sup, pos] = this.parseNextExpr(tokens, pos + 1);
510
+ }
511
+ const subData = { base: EMPTY_NODE, sub, sup };
512
+ return [{ type: 'supsub', content: '', data: subData }, pos];
513
+ }
514
+ case '^': {
515
+ let [sup, pos] = this.parseNextExpr(tokens, start + 1);
516
+ let sub: TexNode | undefined = undefined;
517
+ if (pos < tokens.length && token_eq(tokens[pos], SUB_SYMBOL)) {
518
+ [sub, pos] = this.parseNextExpr(tokens, pos + 1);
519
+ }
520
+ const supData = { base: EMPTY_NODE, sub, sup };
521
+ return [{ type: 'supsub', content: '', data: supData }, pos];
522
+ }
523
+ case '&':
524
+ return [{ type: 'control', content: '&' }, start + 1];
525
+ default:
526
+ throw new LatexParserError('Unknown control sequence');
527
+ }
528
+ default:
529
+ throw new LatexParserError('Unknown token type');
434
530
  }
435
531
  }
436
532
 
437
- parseCommandExpr(latex: string, start: number): ParseResult {
438
- assert(latex[start] === '\\');
533
+ parseCommandExpr(tokens: Token[], start: number): ParseResult {
534
+ assert(tokens[start].type === 'command');
535
+
536
+ const command = tokens[start].value; // command name starts with a \
537
+
439
538
  let pos = start + 1;
440
- const command = eat_command_name(latex, pos);
441
- pos += command.length;
442
- const paramNum = get_command_param_num(command);
539
+
540
+ if (['left', 'right', 'begin', 'end'].includes(command.slice(1))) {
541
+ throw new LatexParserError('Unexpected command: ' + command);
542
+ }
543
+
544
+ const paramNum = get_command_param_num(command.slice(1));
443
545
  if (paramNum === 0) {
444
- return [{ type: 'command', content: command }, pos];
546
+ return [{ type: 'symbol', content: command }, pos];
445
547
  } else if (paramNum === 1) {
446
- if (command === 'sqrt' && pos < latex.length && latex[pos] === '[') {
548
+ if (command === '\\sqrt' && pos < tokens.length && token_eq(tokens[pos], LEFT_SQUARE_BRACKET)) {
447
549
  const posLeftSquareBracket = pos;
448
- const posRightSquareBracket = find_closing_square_bracket(latex, pos);
449
- const exprInside = latex.slice(posLeftSquareBracket + 1, posRightSquareBracket);
550
+ const posRightSquareBracket = find_closing_square_bracket(tokens, pos);
551
+ const exprInside = tokens.slice(posLeftSquareBracket + 1, posRightSquareBracket);
450
552
  const exponent = this.parse(exprInside);
451
- const [arg1, newPos] = this.parseNextExprWithoutSupSub(latex, posRightSquareBracket + 1);
452
- return [{ type: 'command', content: command, arg1, exponent }, newPos];
453
- } else if (command === 'text') {
454
- assert(latex[pos] === '{');
455
- const posClosingBracket = find_closing_curly_bracket(latex, pos);
456
- const text = latex.slice(pos + 1, posClosingBracket);
457
- return [{ type: 'text', content: text }, posClosingBracket + 1];
458
- } else {
459
- let [arg1, newPos] = this.parseNextExprWithoutSupSub(latex, pos);
460
- return [{ type: 'command', content: command, arg1 }, newPos];
553
+ const [arg1, newPos] = this.parseNextExprWithoutSupSub(tokens, posRightSquareBracket + 1);
554
+ return [{ type: 'unaryFunc', content: command, args: [arg1], data: exponent }, newPos];
555
+ } else if (command === '\\text') {
556
+ if (pos + 2 >= tokens.length) {
557
+ throw new LatexParserError('Expecting content for \\text command');
558
+ }
559
+ assert(token_eq(tokens[pos], LEFT_CURLY_BRACKET));
560
+ assert(tokens[pos + 1].type === 'text');
561
+ assert(token_eq(tokens[pos + 2], RIGHT_CURLY_BRACKET));
562
+ const text = tokens[pos + 1].value;
563
+ return [{ type: 'text', content: text }, pos + 3];
461
564
  }
565
+ let [arg1, newPos] = this.parseNextExprWithoutSupSub(tokens, pos);
566
+ return [{ type: 'unaryFunc', content: command, args: [arg1] }, newPos];
462
567
  } else if (paramNum === 2) {
463
- const [arg1, pos1] = this.parseNextExprWithoutSupSub(latex, pos);
464
- const [arg2, pos2] = this.parseNextExprWithoutSupSub(latex, pos1);
465
- return [{ type: 'command', content: command, arg1, arg2 }, pos2];
568
+ const [arg1, pos1] = this.parseNextExprWithoutSupSub(tokens, pos);
569
+ const [arg2, pos2] = this.parseNextExprWithoutSupSub(tokens, pos1);
570
+ return [{ type: 'binaryFunc', content: command, args: [arg1, arg2] }, pos2];
466
571
  } else {
467
572
  throw new Error( 'Invalid number of parameters');
468
573
  }
469
574
  }
470
575
 
471
- parseLeftRightExpr(latex: string, start: number): ParseResult {
472
- assert(latex.slice(start, start + 5) === '\\left');
473
- let pos = start + '\\left'.length;
474
- pos += eat_whitespaces(latex, pos).length;
475
- if (pos >= latex.length) {
576
+ parseLeftRightExpr(tokens: Token[], start: number): ParseResult {
577
+ assert(token_eq(tokens[start], LEFT_COMMAND));
578
+
579
+ let pos = start + 1;
580
+ pos += eat_whitespaces(tokens, pos).length;
581
+
582
+ if (pos >= tokens.length) {
476
583
  throw new LatexParserError('Expecting delimiter after \\left');
477
584
  }
478
- const leftDelimiter = eat_parenthesis(latex, pos);
585
+
586
+ const leftDelimiter = eat_parenthesis(tokens, pos);
479
587
  if (leftDelimiter === null) {
480
588
  throw new LatexParserError('Invalid delimiter after \\left');
481
589
  }
482
- pos += leftDelimiter.length;
590
+ pos++;
483
591
  const exprInsideStart = pos;
484
- const idx = find_closing_right_command(latex, pos);
592
+ const idx = find_closing_right_command(tokens, pos);
485
593
  if (idx === -1) {
486
594
  throw new LatexParserError('No matching \\right');
487
595
  }
488
596
  const exprInsideEnd = idx;
489
- pos = idx + '\\right'.length;
490
- pos += eat_whitespaces(latex, pos).length;
491
- if (pos >= latex.length) {
492
- throw new LatexParserError('Expecting delimiter after \\right');
597
+ pos = idx + 1;
598
+
599
+ pos += eat_whitespaces(tokens, pos).length;
600
+ if (pos >= tokens.length) {
601
+ throw new LatexParserError('Expecting \\right after \\left');
493
602
  }
494
- const rightDelimiter = eat_parenthesis(latex, pos);
603
+
604
+ const rightDelimiter = eat_parenthesis(tokens, pos);
495
605
  if (rightDelimiter === null) {
496
606
  throw new LatexParserError('Invalid delimiter after \\right');
497
607
  }
498
- pos += rightDelimiter.length;
499
- const exprInside = latex.slice(exprInsideStart, exprInsideEnd);
608
+ pos++;
609
+
610
+ const exprInside = tokens.slice(exprInsideStart, exprInsideEnd);
500
611
  const body = this.parse(exprInside);
501
- const res = { type: 'leftright', left: leftDelimiter, right: rightDelimiter, body };
612
+ const args = [
613
+ { type: 'element', content: leftDelimiter.value },
614
+ body,
615
+ { type: 'element', content: rightDelimiter.value }
616
+ ]
617
+ const res = { type: 'leftright', content: '', args: args };
502
618
  return [res, pos];
503
619
  }
504
620
 
621
+ parseBeginEndExpr(tokens: Token[], start: number): ParseResult {
622
+ assert(token_eq(tokens[start], BEGIN_COMMAND));
505
623
 
506
- parseBeginEndExpr(latex: string, start: number): ParseResult {
507
- assert(latex.slice(start, start + 7) === '\\begin{');
508
- let pos = start + '\\begin'.length;
509
- const idx = find_closing_curly_bracket(latex, pos);
510
- if (idx === -1) {
511
- throw new LatexParserError('No matching } after \\begin{');
512
- }
513
- const envName = latex.slice(pos + 1, idx);
514
- pos = idx + 1;
515
- pos += eat_whitespaces(latex, pos).length; // ignore whitespaces and '\n' after \begin{envName}
624
+ let pos = start + 1;
625
+ assert(token_eq(tokens[pos], LEFT_CURLY_BRACKET));
626
+ assert(tokens[pos + 1].type === 'text');
627
+ assert(token_eq(tokens[pos + 2], RIGHT_CURLY_BRACKET));
628
+ const envName = tokens[pos + 1].value;
629
+ pos += 3;
630
+
631
+ pos += eat_whitespaces(tokens, pos).length; // ignore whitespaces and '\n' after \begin{envName}
632
+
516
633
  const exprInsideStart = pos;
517
- const endIdx = find_closing_end_command(latex, pos);
634
+
635
+ const endIdx = find_closing_end_command(tokens, pos);
518
636
  if (endIdx === -1) {
519
637
  throw new LatexParserError('No matching \\end');
520
638
  }
521
639
  const exprInsideEnd = endIdx;
522
- pos = endIdx + '\\end'.length;
523
- const closingIdx = find_closing_curly_bracket(latex, pos);
524
- if (closingIdx === -1) {
525
- throw new LatexParserError('No matching } after \\end{');
526
- }
527
- if (latex.slice(pos + 1, closingIdx) !== envName) {
640
+ pos = endIdx + 1;
641
+
642
+ assert(token_eq(tokens[pos], LEFT_CURLY_BRACKET));
643
+ assert(tokens[pos + 1].type === 'text');
644
+ assert(token_eq(tokens[pos + 2], RIGHT_CURLY_BRACKET));
645
+ if (tokens[pos + 1].value !== envName) {
528
646
  throw new LatexParserError('Mismatched \\begin and \\end environments');
529
647
  }
530
- let exprInside = latex.slice(exprInsideStart, exprInsideEnd);
531
- exprInside = exprInside.trimEnd(); // ignore whitespaces and '\n' before \end{envName}
648
+ pos += 3;
649
+
650
+ const exprInside = tokens.slice(exprInsideStart, exprInsideEnd);
651
+ // ignore whitespaces and '\n' before \end{envName}
652
+ while(exprInside.length > 0 && ['whitespace', 'newline'].includes(exprInside[exprInside.length - 1].type)) {
653
+ exprInside.pop();
654
+ }
532
655
  const body = this.parseAligned(exprInside);
533
- const res = { type: 'beginend', content: envName, body };
534
- return [res, closingIdx + 1];
656
+ const res = { type: 'beginend', content: envName, data: body };
657
+ return [res, pos];
535
658
  }
536
659
 
537
- parseAligned(latex: string): LatexParseNode[][] {
660
+ parseAligned(tokens: Token[]): TexNode[][] {
538
661
  let pos = 0;
539
- const allRows: LatexParseNode[][] = [];
540
- let row: LatexParseNode[] = [];
662
+ const allRows: TexNode[][] = [];
663
+ let row: TexNode[] = [];
541
664
  allRows.push(row);
542
- let group: LatexParseNode = { type: 'ordgroup', args: [] };
665
+ let group: TexNode = { type: 'ordgroup', content: '', args: [] };
543
666
  row.push(group);
544
667
 
545
- while (pos < latex.length) {
546
- const [res, newPos] = this.parseNextExpr(latex, pos);
668
+ while (pos < tokens.length) {
669
+ const [res, newPos] = this.parseNextExpr(tokens, pos);
547
670
  pos = newPos;
548
671
  if (res.type === 'whitespace') {
549
672
  continue;
@@ -551,221 +674,31 @@ export class LatexParser {
551
674
  continue;
552
675
  } else if (res.type === 'control' && res.content === '\\\\') {
553
676
  row = [];
554
- group = { type: 'ordgroup', args: [] };
677
+ group = { type: 'ordgroup', content: '', args: [] };
555
678
  row.push(group);
556
679
  allRows.push(row);
557
680
  } else if (res.type === 'control' && res.content === '&') {
558
- group = { type: 'ordgroup', args: [] };
681
+ group = { type: 'ordgroup', content: '', args: [] };
559
682
  row.push(group);
560
683
  } else {
561
684
  group.args!.push(res);
562
685
  }
563
686
  }
564
-
565
687
  return allRows;
566
688
  }
567
689
  }
568
690
 
569
- // Split tex into a list of tex strings and comments.
570
- // Each item in the returned list is either a tex snippet or a comment.
571
- // Each comment item is a string starting with '%'.
572
- function splitTex(tex: string): string[] {
573
- const lines = tex.split("\n");
574
- const out_tex_list: string[] = [];
575
- let current_tex = "";
576
- // let inside_begin_depth = 0;
577
- for (let i = 0; i < lines.length; i++) {
578
- const line = lines[i];
579
- // if (line.includes('\\begin{')) {
580
- // inside_begin_depth += line.split('\\begin{').length - 1;
581
- // }
582
-
583
- let index = -1;
584
- while (index + 1 < line.length) {
585
- index = line.indexOf('%', index + 1);
586
- if (index === -1) {
587
- // No comment in this line
588
- break;
589
- }
590
- if (index === 0 || line[index - 1] !== '\\') {
591
- // Found a comment
592
- break;
593
- }
594
- }
595
- if (index !== -1) {
596
- current_tex += line.substring(0, index);
597
- const comment = line.substring(index);
598
- out_tex_list.push(current_tex);
599
- current_tex = "";
600
- out_tex_list.push(comment);
601
- } else {
602
- current_tex += line;
603
- }
604
- if (i < lines.length - 1) {
605
- const has_begin_command = line.includes('\\begin{');
606
- const followed_by_end_command = lines[i + 1].includes('\\end{');
607
- if(!has_begin_command && !followed_by_end_command) {
608
- current_tex += '\n';
609
- }
610
- }
611
-
612
- // if (line.includes('\\end{')) {
613
- // inside_begin_depth -= line.split('\\end{').length - 1;
614
- // }
615
- }
616
-
617
- if (current_tex.length > 0) {
618
- out_tex_list.push(current_tex);
619
- }
620
-
621
- return out_tex_list;
622
- }
623
-
624
- export class LatexNodeToTexNodeError extends Error {
625
- node: LatexParseNode;
626
-
627
- constructor(message: string, node: LatexParseNode) {
628
- super(message);
629
- this.name = "LatexNodeToTexNodeError";
630
- this.node = node;
631
- }
632
- }
633
-
634
- function latexNodeToTexNode(node: LatexParseNode): TexNode {
635
- try {
636
- let res = {} as TexNode;
637
- switch (node.type) {
638
- case 'ordgroup':
639
- res.type = 'ordgroup';
640
- res.args = (node.args as LatexParseNode[]).map((n: LatexParseNode) => latexNodeToTexNode(n));
641
- if (res.args!.length === 1) {
642
- res = res.args![0] as TexNode;
643
- }
644
- break;
645
- case 'empty':
646
- res.type = 'empty';
647
- res.content = '';
648
- break;
649
- case 'atom':
650
- res.type = 'atom';
651
- res.content = node.content!;
652
- break;
653
- case 'token':
654
- case 'token-letter-var':
655
- case 'token-number':
656
- case 'token-operator':
657
- case 'token-parenthesis':
658
- res.type = 'symbol';
659
- res.content = node.content!;
660
- break;
661
- case 'supsub':
662
- res.type = 'supsub';
663
- res.irregularData = {} as TexSupsubData;
664
- if (node['base']) {
665
- res.irregularData.base = latexNodeToTexNode(node['base']);
666
- }
667
- if (node['sup']) {
668
- res.irregularData.sup = latexNodeToTexNode(node['sup']);
669
- }
670
- if (node['sub']) {
671
- res.irregularData.sub = latexNodeToTexNode(node['sub']);
672
- }
673
- break;
674
- case 'leftright':
675
- res.type = 'leftright';
676
-
677
- const body = latexNodeToTexNode(node.body as LatexParseNode);
678
-
679
- let left: string = node['left']!;
680
- if (left === "\\{") {
681
- left = "{";
682
- }
683
- let right: string = node['right']!;
684
- if (right === "\\}") {
685
- right = "}";
686
- }
687
- const is_atom = (str:string) => (['(', ')', '[', ']', '{', '}'].includes(str));
688
- res.args = [
689
- { type: is_atom(left)? 'atom': 'symbol', content: left },
690
- body,
691
- { type: is_atom(right)? 'atom': 'symbol', content: right}
692
- ];
693
- break;
694
- case 'beginend':
695
- if (node.content?.startsWith('align')) {
696
- // align, align*, alignat, alignat*, aligned, etc.
697
- res.type = 'align';
698
- } else {
699
- res.type = 'matrix';
700
- }
701
- res.content = node.content!;
702
- res.irregularData = (node.body as LatexParseNode[][]).map((row: LatexParseNode[]) => {
703
- return row.map((n: LatexParseNode) => latexNodeToTexNode(n));
704
- });
705
- break;
706
- case 'command':
707
- const num_args = get_command_param_num(node.content!);
708
- res.content = '\\' + node.content!;
709
- if (num_args === 0) {
710
- res.type = 'symbol';
711
- } else if (num_args === 1) {
712
- res.type = 'unaryFunc';
713
- res.args = [
714
- latexNodeToTexNode(node.arg1 as LatexParseNode)
715
- ]
716
- if (node.content === 'sqrt') {
717
- if (node.exponent) {
718
- res.irregularData = latexNodeToTexNode(node.exponent) as TexNode;
719
- }
720
- }
721
- } else if (num_args === 2) {
722
- res.type = 'binaryFunc';
723
- res.args = [
724
- latexNodeToTexNode(node.arg1 as LatexParseNode),
725
- latexNodeToTexNode(node.arg2 as LatexParseNode)
726
- ]
727
- } else {
728
- throw new LatexNodeToTexNodeError('Invalid number of arguments', node);
729
- }
730
- break;
731
- case 'text':
732
- res.type = 'text';
733
- res.content = node.content!;
734
- break;
735
- case 'comment':
736
- res.type = 'comment';
737
- res.content = node.content!;
738
- break;
739
- case 'whitespace':
740
- res.type = 'empty';
741
- break;
742
- case 'newline':
743
- res.type = 'newline';
744
- res.content = '\n';
745
- break;
746
- case 'control':
747
- if (node.content === '\\\\') {
748
- res.type = 'symbol';
749
- res.content = node.content!;
750
- break;
751
- } else {
752
- throw new LatexNodeToTexNodeError(`Unknown control sequence: ${node.content}`, node);
753
- }
754
- break;
755
- default:
756
- throw new LatexNodeToTexNodeError(`Unknown node type: ${node.type}`, node);
757
- }
758
- return res as TexNode;
759
- } catch (e) {
760
- throw e;
761
- }
762
- }
763
-
764
691
  export function parseTex(tex: string, customTexMacros: {[key: string]: string}): TexNode {
765
692
  const parser = new LatexParser();
766
- for (const [macro, replacement] of Object.entries(customTexMacros)) {
767
- tex = tex.replaceAll(macro, replacement);
693
+ const original_tokens = tokenize(tex);
694
+ let processed_tokens: Token[] = [];
695
+ for (const token of original_tokens) {
696
+ if (token.type === 'command' && customTexMacros[token.value]) {
697
+ const expanded_tokens = tokenize(customTexMacros[token.value]);
698
+ processed_tokens = processed_tokens.concat(expanded_tokens);
699
+ } else {
700
+ processed_tokens.push(token);
701
+ }
768
702
  }
769
- const node = parser.parse(tex);
770
- return latexNodeToTexNode(node);
703
+ return parser.parse(processed_tokens);
771
704
  }