tex2typst 0.3.17 → 0.3.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/convert.ts CHANGED
@@ -22,9 +22,6 @@ function tex_token_to_typst(token: string): string {
22
22
  return token;
23
23
  } else if (token === '/') {
24
24
  return '\\/';
25
- } else if (token === '\\|') {
26
- // \| in LaTeX is double vertical bar looks like ||
27
- return 'parallel';
28
25
  } else if (token === '\\\\') {
29
26
  return '\\';
30
27
  } else if (['\\$', '\\#', '\\&', '\\_'].includes(token)) {
@@ -49,40 +46,52 @@ function tex_token_to_typst(token: string): string {
49
46
  function convert_overset(node: TexNode, options: Tex2TypstOptions): TypstNode {
50
47
  const [sup, base] = node.args!;
51
48
 
52
- const is_def = (n: TexNode): boolean => {
53
- if (n.eq(new TexNode('text', 'def'))) {
54
- return true;
55
- }
56
- // \overset{def}{=} is also considered as eq.def
57
- if (n.type === 'ordgroup' && n.args!.length === 3) {
58
- const [a1, a2, a3] = n.args!;
59
- const d = new TexNode('element', 'd');
60
- const e = new TexNode('element', 'e');
61
- const f = new TexNode('element', 'f');
62
- if (a1.eq(d) && a2.eq(e) && a3.eq(f)) {
49
+ if (options.optimize) {
50
+ const is_def = (n: TexNode): boolean => {
51
+ if (n.eq(new TexNode('text', 'def'))) {
63
52
  return true;
64
53
  }
54
+ // \overset{def}{=} (d, e, f parsed as three separate elements) is also treated as eq.def
55
+ if (n.type === 'ordgroup' && n.args!.length === 3) {
56
+ const [a1, a2, a3] = n.args!;
57
+ const d = new TexNode('element', 'd');
58
+ const e = new TexNode('element', 'e');
59
+ const f = new TexNode('element', 'f');
60
+ if (a1.eq(d) && a2.eq(e) && a3.eq(f)) {
61
+ return true;
62
+ }
63
+ }
64
+ return false;
65
+ };
66
+ const is_eq = (n: TexNode): boolean => n.eq(new TexNode('element', '='));
67
+ if (is_def(sup) && is_eq(base)) {
68
+ return new TypstNode('symbol', 'eq.def');
65
69
  }
66
- return false;
67
- };
68
- const is_eq = (n: TexNode): boolean => n.eq(new TexNode('element', '='));
69
- if (is_def(sup) && is_eq(base)) {
70
- return new TypstNode('symbol', 'eq.def');
71
70
  }
72
71
  const limits_call = new TypstNode(
73
72
  'funcCall',
74
73
  'limits',
75
74
  [convert_tex_node_to_typst(base, options)]
76
75
  );
77
- return new TypstNode(
78
- 'supsub',
79
- '',
80
- [],
81
- {
76
+ return new TypstNode('supsub', '', [], {
82
77
  base: limits_call,
83
78
  sup: convert_tex_node_to_typst(sup, options),
84
- }
79
+ });
80
+ }
81
+
82
+ // \underset{X}{Y} -> limits(Y)_X
83
+ function convert_underset(node: TexNode, options: Tex2TypstOptions): TypstNode {
84
+ const [sub, base] = node.args!;
85
+
86
+ const limits_call = new TypstNode(
87
+ 'funcCall',
88
+ 'limits',
89
+ [convert_tex_node_to_typst(base, options)]
85
90
  );
91
+ return new TypstNode('supsub', '', [], {
92
+ base: limits_call,
93
+ sub: convert_tex_node_to_typst(sub, options),
94
+ });
86
95
  }
87
96
 
88
97
 
@@ -117,7 +126,7 @@ export function convert_tex_node_to_typst(node: TexNode, options: Tex2TypstOptio
117
126
  case 'supsub': {
118
127
  let { base, sup, sub } = node.data as TexSupsubData;
119
128
 
120
- // Special logic for overbrace
129
+ // special hook for overbrace
121
130
  if (base && base.type === 'unaryFunc' && base.content === '\\overbrace' && sup) {
122
131
  return new TypstNode(
123
132
  'funcCall',
@@ -150,40 +159,61 @@ export function convert_tex_node_to_typst(node: TexNode, options: Tex2TypstOptio
150
159
  return new TypstNode('supsub', '', [], data);
151
160
  }
152
161
  case 'leftright': {
153
- const [left, body, right] = node.args!;
154
- // These pairs will be handled by Typst compiler by default. No need to add lr()
155
- const group: TypstNode = new TypstNode(
162
+ const [left, _body, right] = node.args!;
163
+ const [typ_left, typ_body, typ_right] = node.args!.map((n) => convert_tex_node_to_typst(n, options));
164
+
165
+ if (options.optimize) {
166
+ // optimization off: "lr(bar.v.double a + 1/2 bar.v.double)"
167
+ // optimization on : "norm(a + 1/2)"
168
+ if (left.content === '\\|' && right.content === '\\|') {
169
+ return new TypstNode('funcCall', 'norm', [typ_body]);
170
+ }
171
+
172
+ // These pairs will be handled by Typst compiler by default. No need to add lr()
173
+ if ([
174
+ "[]", "()", "\\{\\}",
175
+ "\\lfloor\\rfloor",
176
+ "\\lceil\\rceil",
177
+ "\\lfloor\\rceil",
178
+ ].includes(left.content + right.content)) {
179
+ return new TypstNode('group', '', [typ_left, typ_body, typ_right]);
180
+ }
181
+ }
182
+
183
+ const group = new TypstNode(
156
184
  'group',
157
185
  '',
158
- node.args!.map((n) => convert_tex_node_to_typst(n, options))
186
+ [typ_left, typ_body, typ_right]
159
187
  );
160
- if ([
161
- "[]", "()", "\\{\\}",
162
- "\\lfloor\\rfloor",
163
- "\\lceil\\rceil",
164
- "\\lfloor\\rceil",
165
- ].includes(left.content + right.content)) {
166
- return group;
167
- }
168
- // "\left\{ A \right." -> "{A"
169
- // "\left. A \right\}" -> "lr( A} )"
188
+
189
+ // "\left\{ a + \frac{1}{3} \right." -> "lr(\{ a + 1/3)"
190
+ // "\left. a + \frac{1}{3} \right\}" -> "lr( a + 1/3 \})"
191
+ // Note: in lr(), when the delimiter on one side is absent (i.e. it came from "\\left." or "\\right."),
192
+ // a remaining "(", ")", "{" or "[" must be escaped with "\" so that it is treated as a plain delimiter.
193
+ // An unescaped "lr({ a+1/3)" doesn't compile in Typst.
194
+ const escape_curly_or_paren = function(s: string): string {
195
+ if (["(", ")", "{", "["].includes(s)) {
196
+ return "\\" + s;
197
+ } else {
198
+ return s;
199
+ }
200
+ };
170
201
  if (right.content === '.') {
171
- group.args!.pop();
172
- return group;
202
+ typ_left.content = escape_curly_or_paren(typ_left.content);
203
+ group.args = [typ_left, typ_body];
173
204
  } else if (left.content === '.') {
174
- group.args!.shift();
175
- return new TypstNode('funcCall', 'lr', [group]);
205
+ typ_right.content = escape_curly_or_paren(typ_right.content);
206
+ group.args = [typ_body, typ_right];
176
207
  }
177
- return new TypstNode(
178
- 'funcCall',
179
- 'lr',
180
- [group]
181
- );
208
+ return new TypstNode('funcCall', 'lr', [group]);
182
209
  }
183
210
  case 'binaryFunc': {
184
211
  if (node.content === '\\overset') {
185
212
  return convert_overset(node, options);
186
213
  }
214
+ if (node.content === '\\underset') {
215
+ return convert_underset(node, options);
216
+ }
187
217
  // \frac{a}{b} -> a / b
188
218
  if (node.content === '\\frac') {
189
219
  if (options.fracToSlash) {
@@ -246,17 +276,14 @@ export function convert_tex_node_to_typst(node: TexNode, options: Tex2TypstOptio
246
276
  }
247
277
  // \operatorname{opname} -> op("opname")
248
278
  if (node.content === '\\operatorname') {
249
- const text = arg0.content;
250
279
 
251
- if (TYPST_INTRINSIC_SYMBOLS.includes(text)) {
252
- return new TypstNode('symbol', text);
253
- } else {
254
- return new TypstNode(
255
- 'funcCall',
256
- 'op',
257
- [arg0]
258
- );
280
+ if (options.optimize) {
281
+ const text = arg0.content;
282
+ if (TYPST_INTRINSIC_SYMBOLS.includes(text)) {
283
+ return new TypstNode('symbol', text);
284
+ }
259
285
  }
286
+ return new TypstNode('funcCall', 'op', [arg0]);
260
287
  }
261
288
  // \hspace{1cm} -> #h(1cm)
262
289
  // TODO: reverse conversion support for this
@@ -409,6 +436,8 @@ const TYPST_UNARY_FUNCTIONS: string[] = [
409
436
  'frak',
410
437
  'floor',
411
438
  'ceil',
439
+ 'norm',
440
+ 'limits',
412
441
  ];
413
442
 
414
443
  const TYPST_BINARY_FUNCTIONS: string[] = [
@@ -428,8 +457,6 @@ function apply_escape_if_needed(c: string) {
428
457
  function typst_token_to_tex(token: string): string {
429
458
  if (/^[a-zA-Z0-9]$/.test(token)) {
430
459
  return token;
431
- } else if (token === 'thin') {
432
- return '\\,';
433
460
  } else if (reverseSymbolMap.has(token)) {
434
461
  return '\\' + reverseSymbolMap.get(token)!;
435
462
  }
@@ -491,6 +518,8 @@ export function convert_typst_node_to_tex(node: TypstNode): TexNode {
491
518
  let left_delim = apply_escape_if_needed(data.leftDelim);
492
519
  assert(data.rightDelim !== null, "leftDelim has value but rightDelim not");
493
520
  let right_delim = apply_escape_if_needed(data.rightDelim!);
521
+ // TODO: should be TexNode('leftright', ...)
522
+ // But currently the writer would output `\left |`, while people commonly prefer `\left|`.
494
523
  return new TexNode('ordgroup', '', [
495
524
  new TexNode('element', '\\left' + left_delim),
496
525
  ...node.args!.map(convert_typst_node_to_tex),
@@ -500,17 +529,29 @@ export function convert_typst_node_to_tex(node: TypstNode): TexNode {
500
529
  return new TexNode('ordgroup', '', node.args!.map(convert_typst_node_to_tex));
501
530
  }
502
531
  }
532
+ // special hook for norm
533
+ // `\| a \|` <- `norm(a)`
534
+ // `\left\| a + \frac{1}{3} \right\|` <- `norm(a + 1/3)`
535
+ if (node.content === 'norm') {
536
+ const arg0 = node.args![0];
537
+ const tex_node_type = node.isOverHigh() ? 'leftright' : 'ordgroup';
538
+ return new TexNode(tex_node_type, '', [
539
+ new TexNode('symbol', "\\|"),
540
+ convert_typst_node_to_tex(arg0),
541
+ new TexNode('symbol', "\\|")
542
+ ]);
543
+ }
503
544
  // special hook for floor, ceil
504
- // Typst "floor(a) + ceil(b)" should converts to Tex "\lfloor a \rfloor + \lceil b \rceil"
545
+ // `\lfloor a \rfloor` <- `floor(a)`
546
+ // `\lceil a \rceil` <- `ceil(a)`
547
+ // `\left\lfloor a \right\rfloor` <- `floor(a)`
548
+ // `\left\lceil a \right\rceil` <- `ceil(a)`
505
549
  if (node.content === 'floor' || node.content === 'ceil') {
506
- let left = "\\l" + node.content;
507
- let right = "\\r" + node.content;
550
+ const left = "\\l" + node.content;
551
+ const right = "\\r" + node.content;
508
552
  const arg0 = node.args![0];
509
- if (arg0.isOverHigh()) {
510
- left = "\\left" + left;
511
- right = "\\right" + right;
512
- }
513
- return new TexNode('ordgroup', '', [
553
+ const tex_node_type = node.isOverHigh() ? 'leftright' : 'ordgroup';
554
+ return new TexNode(tex_node_type, '', [
514
555
  new TexNode('symbol', left),
515
556
  convert_typst_node_to_tex(arg0),
516
557
  new TexNode('symbol', right)
@@ -552,15 +593,34 @@ export function convert_typst_node_to_tex(node: TypstNode): TexNode {
552
593
  }
553
594
  case 'supsub': {
554
595
  const { base, sup, sub } = node.data as TypstSupsubData;
555
- const base_tex = convert_typst_node_to_tex(base);
556
596
  let sup_tex: TexNode | undefined;
557
597
  let sub_tex: TexNode | undefined;
598
+
558
599
  if (sup) {
559
600
  sup_tex = convert_typst_node_to_tex(sup);
560
601
  }
561
602
  if (sub) {
562
603
  sub_tex = convert_typst_node_to_tex(sub);
563
604
  }
605
+
606
+ // special hook for limits
607
+ // `limits(+)^a` -> `\overset{a}{+}`
608
+ // `limits(+)_a` -> `\underset{a}{+}`
609
+ // `limits(+)_a^b` -> `\overset{b}{\underset{a}{+}}`
610
+ if (base.eq(new TypstNode('funcCall', 'limits'))) {
611
+ const body_in_limits = convert_typst_node_to_tex(base.args![0]);
612
+ if (sup_tex !== undefined && sub_tex === undefined) {
613
+ return new TexNode('binaryFunc', '\\overset', [sup_tex, body_in_limits]);
614
+ } else if (sup_tex === undefined && sub_tex !== undefined) {
615
+ return new TexNode('binaryFunc', '\\underset', [sub_tex, body_in_limits]);
616
+ } else {
617
+ const underset_call = new TexNode('binaryFunc', '\\underset', [sub_tex!, body_in_limits]);
618
+ return new TexNode('binaryFunc', '\\overset', [sup_tex!, underset_call]);
619
+ }
620
+ }
621
+
622
+ const base_tex = convert_typst_node_to_tex(base);
623
+
564
624
  const res = new TexNode('supsub', '', [], {
565
625
  base: base_tex,
566
626
  sup: sup_tex,
package/src/index.ts CHANGED
@@ -11,11 +11,11 @@ import { shorthandMap } from "./typst-shorthands";
11
11
  export function tex2typst(tex: string, options?: Tex2TypstOptions): string {
12
12
  const opt: Tex2TypstOptions = {
13
13
  nonStrict: true,
14
- preferTypstIntrinsic: true,
15
14
  preferShorthands: true,
16
15
  keepSpaces: false,
17
16
  fracToSlash: true,
18
17
  inftyToOo: false,
18
+ optimize: true,
19
19
  nonAsciiWrapper: "",
20
20
  customTexMacros: {}
21
21
  };
package/src/map.ts CHANGED
@@ -1,6 +1,8 @@
1
1
  const symbolMap = new Map<string, string>([
2
2
  ['displaystyle', 'display'],
3
3
 
4
+ ['|', 'bar.v.double'],
5
+ ['!', '#h(-math.thin.amount)'],
4
6
  [',', 'thin'],
5
7
  [':', 'med'],
6
8
  [';', 'thick'],
package/src/tex-parser.ts CHANGED
@@ -1,53 +1,8 @@
1
1
  import { symbolMap } from "./map";
2
2
  import { TexNode, TexSupsubData, TexToken, TexTokenType } from "./types";
3
3
  import { assert } from "./util";
4
- import { JSLex, Scanner } from "./jslex";
5
4
  import { array_find } from "./generic";
6
-
7
- const UNARY_COMMANDS = [
8
- 'sqrt',
9
- 'text',
10
-
11
- 'bar',
12
- 'bold',
13
- 'boldsymbol',
14
- 'ddot',
15
- 'dot',
16
- 'hat',
17
- 'mathbb',
18
- 'mathbf',
19
- 'mathcal',
20
- 'mathfrak',
21
- 'mathit',
22
- 'mathrm',
23
- 'mathscr',
24
- 'mathsf',
25
- 'mathtt',
26
- 'operatorname',
27
- 'overbrace',
28
- 'overline',
29
- 'pmb',
30
- 'rm',
31
- 'tilde',
32
- 'underbrace',
33
- 'underline',
34
- 'vec',
35
- 'widehat',
36
- 'widetilde',
37
- 'overleftarrow',
38
- 'overrightarrow',
39
- 'hspace',
40
- ]
41
-
42
- const BINARY_COMMANDS = [
43
- 'frac',
44
- 'tfrac',
45
- 'binom',
46
- 'dbinom',
47
- 'dfrac',
48
- 'tbinom',
49
- 'overset',
50
- ]
5
+ import { TEX_BINARY_COMMANDS, TEX_UNARY_COMMANDS, tokenize_tex } from "./tex-tokenizer";
51
6
 
52
7
  const IGNORED_COMMANDS = [
53
8
  'bigl', 'bigr',
@@ -59,9 +14,9 @@ const IGNORED_COMMANDS = [
59
14
  const EMPTY_NODE: TexNode = new TexNode('empty', '');
60
15
 
61
16
  function get_command_param_num(command: string): number {
62
- if (UNARY_COMMANDS.includes(command)) {
17
+ if (TEX_UNARY_COMMANDS.includes(command)) {
63
18
  return 1;
64
- } else if (BINARY_COMMANDS.includes(command)) {
19
+ } else if (TEX_BINARY_COMMANDS.includes(command)) {
65
20
  return 2;
66
21
  } else {
67
22
  return 0;
@@ -86,7 +41,7 @@ function eat_whitespaces(tokens: TexToken[], start: number): TexToken[] {
86
41
 
87
42
  function eat_parenthesis(tokens: TexToken[], start: number): TexToken | null {
88
43
  const firstToken = tokens[start];
89
- if (firstToken.type === TexTokenType.ELEMENT && ['(', ')', '[', ']', '|', '\\{', '\\}', '.'].includes(firstToken.value)) {
44
+ if (firstToken.type === TexTokenType.ELEMENT && ['(', ')', '[', ']', '|', '\\{', '\\}', '.', '\\|'].includes(firstToken.value)) {
90
45
  return firstToken;
91
46
  } else if (firstToken.type === TexTokenType.COMMAND && ['lfloor', 'rfloor', 'lceil', 'rceil', 'langle', 'rangle'].includes(firstToken.value.slice(1))) {
92
47
  return firstToken;
@@ -142,93 +97,6 @@ function find_closing_end_command(tokens: TexToken[], start: number): number {
142
97
  }
143
98
 
144
99
 
145
- function unescape(str: string): string {
146
- const chars = ['{', '}', '\\', '$', '&', '#', '_', '%'];
147
- for (const char of chars) {
148
- str = str.replaceAll('\\' + char, char);
149
- }
150
- return str;
151
- }
152
-
153
- const rules_map = new Map<string, (a: Scanner<TexToken>) => TexToken | TexToken[]>([
154
- [
155
- String.raw`\\(text|operatorname|begin|end|hspace){.+?}`, (s) => {
156
- const text = s.text()!;
157
- const command = text.substring(0, text.indexOf('{'));
158
- const text_inside = text.substring(text.indexOf('{') + 1, text.lastIndexOf('}'));
159
- return [
160
- new TexToken(TexTokenType.COMMAND, command),
161
- new TexToken(TexTokenType.CONTROL, '{'),
162
- new TexToken(TexTokenType.TEXT, unescape(text_inside)),
163
- new TexToken(TexTokenType.CONTROL, '}')
164
- ]
165
- }
166
- ],
167
- [String.raw`%[^\n]*`, (s) => new TexToken(TexTokenType.COMMENT, s.text()!.substring(1))],
168
- [String.raw`[{}_^&]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
169
- [String.raw`\\[\\,:; ]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
170
- [String.raw`\r?\n`, (_s) => new TexToken(TexTokenType.NEWLINE, "\n")],
171
- [String.raw`\s+`, (s) => new TexToken(TexTokenType.SPACE, s.text()!)],
172
- [String.raw`\\[{}%$&#_|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
173
- [String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])\s*([0-9a-zA-Z])`, (s) => {
174
- const text = s.text()!;
175
- const regex = RegExp(String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])\s*([0-9a-zA-Z])`);
176
- const match = text.match(regex);
177
- assert(match !== null);
178
- const command = match![1];
179
- if (BINARY_COMMANDS.includes(command.substring(1))) {
180
- const arg1 = match![2].trimStart();
181
- const arg2 = match![3];
182
- return [
183
- new TexToken(TexTokenType.COMMAND, command),
184
- new TexToken(TexTokenType.ELEMENT, arg1),
185
- new TexToken(TexTokenType.ELEMENT, arg2),
186
- ];
187
- } else {
188
- s.reject();
189
- return [];
190
- }
191
- }],
192
- [String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])`, (s) => {
193
- const text = s.text()!;
194
- const regex = RegExp(String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])`);
195
- const match = text.match(regex);
196
- assert(match !== null);
197
- const command = match![1];
198
- if (UNARY_COMMANDS.includes(command.substring(1))) {
199
- const arg1 = match![2].trimStart();
200
- return [
201
- new TexToken(TexTokenType.COMMAND, command),
202
- new TexToken(TexTokenType.ELEMENT, arg1),
203
- ];
204
- } else {
205
- s.reject();
206
- return [];
207
- }
208
- }],
209
- [String.raw`\\[a-zA-Z]+`, (s) => {
210
- const command = s.text()!;
211
- return [ new TexToken(TexTokenType.COMMAND, command), ];
212
- }],
213
- // Numbers like "123", "3.14"
214
- [String.raw`[0-9]+(\.[0-9]+)?`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
215
- [String.raw`[a-zA-Z]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
216
- [String.raw`[+\-*/='<>!.,;:?()\[\]|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
217
- // non-ASCII characters
218
- [String.raw`[^\x00-\x7F]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
219
- [String.raw`.`, (s) => new TexToken(TexTokenType.UNKNOWN, s.text()!)],
220
- ]);
221
-
222
- const spec = {
223
- "start": rules_map
224
- };
225
-
226
- export function tokenize_tex(input: string): TexToken[] {
227
- const lexer = new JSLex<TexToken>(spec);
228
- return lexer.collect(input);
229
- }
230
-
231
-
232
100
  export class LatexParserError extends Error {
233
101
  constructor(message: string) {
234
102
  super(message);
@@ -394,6 +262,7 @@ export class LatexParser {
394
262
  case '}':
395
263
  throw new LatexParserError("Unmatched '}'");
396
264
  case '\\\\':
265
+ case '\\!':
397
266
  case '\\,':
398
267
  case '\\:':
399
268
  case '\\;':
@@ -552,7 +421,7 @@ export class LatexParser {
552
421
  while (pos < tokens.length) {
553
422
  const whitespaceCount = eat_whitespaces(tokens, pos).length;
554
423
  pos += whitespaceCount;
555
-
424
+
556
425
  if (pos >= tokens.length || !tokens[pos].eq(LEFT_CURLY_BRACKET)) {
557
426
  break;
558
427
  }
@@ -0,0 +1,138 @@
1
+ import { TexToken, TexTokenType } from "./types";
2
+ import { assert } from "./util";
3
+ import { JSLex, Scanner } from "./jslex";
4
+
5
+ export const TEX_UNARY_COMMANDS = [
6
+ 'sqrt',
7
+ 'text',
8
+
9
+ 'bar',
10
+ 'bold',
11
+ 'boldsymbol',
12
+ 'ddot',
13
+ 'dot',
14
+ 'hat',
15
+ 'mathbb',
16
+ 'mathbf',
17
+ 'mathcal',
18
+ 'mathfrak',
19
+ 'mathit',
20
+ 'mathrm',
21
+ 'mathscr',
22
+ 'mathsf',
23
+ 'mathtt',
24
+ 'operatorname',
25
+ 'overbrace',
26
+ 'overline',
27
+ 'pmb',
28
+ 'rm',
29
+ 'tilde',
30
+ 'underbrace',
31
+ 'underline',
32
+ 'vec',
33
+ 'widehat',
34
+ 'widetilde',
35
+ 'overleftarrow',
36
+ 'overrightarrow',
37
+ 'hspace',
38
+ ]
39
+
40
+ export const TEX_BINARY_COMMANDS = [
41
+ 'frac',
42
+ 'tfrac',
43
+ 'binom',
44
+ 'dbinom',
45
+ 'dfrac',
46
+ 'tbinom',
47
+ 'overset',
48
+ 'underset',
49
+ ]
50
+
51
+
52
/**
 * Strip the escaping backslash from TeX-escaped special characters.
 *
 * Recognised escapes are `\{`, `\}`, `\\`, `\$`, `\&`, `\#`, `\_` and `\%`;
 * each is replaced by the bare character. Any other backslash sequence is
 * left untouched.
 *
 * The input is scanned once, left to right, so the output of one replacement
 * is never re-interpreted as the start of another escape. For example the
 * three characters `\\$` (an escaped backslash followed by a literal `$`)
 * become `\$`, not `$`.
 *
 * @param str - raw text taken from inside a TeX group such as `\text{...}`
 * @returns the text with the recognised escapes resolved
 */
function unescape(str: string): string {
    // One capture group holding the escaped character; `$1` re-emits it bare.
    return str.replace(/\\([{}\\$&#_%])/g, '$1');
}
59
+
60
+ const rules_map = new Map<string, (a: Scanner<TexToken>) => TexToken | TexToken[]>([
61
+ [
62
+ String.raw`\\(text|operatorname|begin|end|hspace){.+?}`, (s) => {
63
+ const text = s.text()!;
64
+ const command = text.substring(0, text.indexOf('{'));
65
+ const text_inside = text.substring(text.indexOf('{') + 1, text.lastIndexOf('}'));
66
+ return [
67
+ new TexToken(TexTokenType.COMMAND, command),
68
+ new TexToken(TexTokenType.CONTROL, '{'),
69
+ new TexToken(TexTokenType.TEXT, unescape(text_inside)),
70
+ new TexToken(TexTokenType.CONTROL, '}')
71
+ ]
72
+ }
73
+ ],
74
+ [String.raw`%[^\n]*`, (s) => new TexToken(TexTokenType.COMMENT, s.text()!.substring(1))],
75
+ [String.raw`[{}_^&]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
76
+ [String.raw`\\[\\,:;! ]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
77
+ [String.raw`\r?\n`, (_s) => new TexToken(TexTokenType.NEWLINE, "\n")],
78
+ [String.raw`\s+`, (s) => new TexToken(TexTokenType.SPACE, s.text()!)],
79
+ [String.raw`\\[{}%$&#_|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
80
+ // e.g. match `\frac13`, `\frac1 b`, `\frac a b`
81
+ [String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])\s*([0-9a-zA-Z])`, (s) => {
82
+ const text = s.text()!;
83
+ const regex = RegExp(String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])\s*([0-9a-zA-Z])`);
84
+ const match = text.match(regex);
85
+ assert(match !== null);
86
+ const command = match![1];
87
+ if (TEX_BINARY_COMMANDS.includes(command.substring(1))) {
88
+ const arg1 = match![2].trimStart();
89
+ const arg2 = match![3];
90
+ return [
91
+ new TexToken(TexTokenType.COMMAND, command),
92
+ new TexToken(TexTokenType.ELEMENT, arg1),
93
+ new TexToken(TexTokenType.ELEMENT, arg2),
94
+ ];
95
+ } else {
96
+ s.reject();
97
+ return [];
98
+ }
99
+ }],
100
+ // e.g. match `\sqrt3`, `\sqrt a`
101
+ [String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])`, (s) => {
102
+ const text = s.text()!;
103
+ const regex = RegExp(String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])`);
104
+ const match = text.match(regex);
105
+ assert(match !== null);
106
+ const command = match![1];
107
+ if (TEX_UNARY_COMMANDS.includes(command.substring(1))) {
108
+ const arg1 = match![2].trimStart();
109
+ return [
110
+ new TexToken(TexTokenType.COMMAND, command),
111
+ new TexToken(TexTokenType.ELEMENT, arg1),
112
+ ];
113
+ } else {
114
+ s.reject();
115
+ return [];
116
+ }
117
+ }],
118
+ [String.raw`\\[a-zA-Z]+`, (s) => {
119
+ const command = s.text()!;
120
+ return [ new TexToken(TexTokenType.COMMAND, command), ];
121
+ }],
122
+ // Numbers like "123", "3.14"
123
+ [String.raw`[0-9]+(\.[0-9]+)?`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
124
+ [String.raw`[a-zA-Z]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
125
+ [String.raw`[+\-*/='<>!.,;:?()\[\]|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
126
+ // non-ASCII characters
127
+ [String.raw`[^\x00-\x7F]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
128
+ [String.raw`.`, (s) => new TexToken(TexTokenType.UNKNOWN, s.text()!)],
129
+ ]);
130
+
131
+ const spec = {
132
+ "start": rules_map
133
+ };
134
+
135
+ export function tokenize_tex(input: string): TexToken[] {
136
+ const lexer = new JSLex<TexToken>(spec);
137
+ return lexer.collect(input);
138
+ }