tex2typst 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/jslex.ts ADDED
@@ -0,0 +1,304 @@
1
+ /**
2
+ * Adapted from jslex - A lexer in JavaScript. https://github.com/jimbojw/jslex
3
+ * Licensed under MIT license
4
+ */
5
+
6
+
7
// User-facing lexer specification: each key is a state name mapping a
// regex-source string to the action invoked when that pattern matches.
// NOTE(review): only "start" is declared here, but JSLex's constructor
// iterates Object.keys(spec) and indexes arbitrary state names — confirm
// whether multi-state specs are meant to be supported by this type.
interface ILexSpec<T> {
    start: Map<string, (arg0: Scanner<T>) => T | T[]>;
}

// One compiled rule: the pattern (anchored with '^' by JSLex) plus the
// user callback that turns the matched text into token(s).
interface IRule<T> {
    re: RegExp;
    action: (a: Scanner<T>) => T | T[];
}

// A candidate match produced during one scan step.
interface IMatch<T> {
    index: number;  // position of the rule in its state's rule list (priority tiebreak)
    text: string;   // matched text
    len: number;    // length of the matched text
    rule: IRule<T>; // rule that produced this match
}


// End of File marker — compared by identity (===) in JSLex.lex()
const EOF = {};
26
+
27
+ /**
28
+ * Utility function for comparing two matches.
29
+ * @param {object} m1 Left-hand side match.
30
+ * @param {object} m2 Right-hand side match.
31
+ * @return {int} Difference between the matches.
32
+ */
33
+ function matchcompare<T>(m1: IMatch<T>, m2: IMatch<T>): number {
34
+ if(m2.len !== m1.len) {
35
+ return m2.len - m1.len;
36
+ } else {
37
+ return m1.index - m2.index;
38
+ }
39
+ }
40
+
41
+ export class Scanner<T> {
42
+ private _input: string;
43
+ private _lexer: JSLex<T>;
44
+
45
+ // position within input stream
46
+ private _pos: number = 0;
47
+
48
+ // current line number
49
+ private _line: number = 0;
50
+
51
+ // current column number
52
+ private _col: number = 0;
53
+
54
+ private _offset: number = 0;
55
+ private _less: number | null = null;
56
+ private _go: boolean = false;
57
+ private _newstate: string | null = null;
58
+ private _state: string;
59
+
60
+ private _text: string | null = null;
61
+ private _leng: number | null = null;
62
+
63
+ constructor(input: string, lexer: JSLex<T>) {
64
+ this._input = input;
65
+ this._lexer = lexer;
66
+ this._state = lexer.states[0];
67
+ }
68
+
69
+ /**
70
+ * Analogous to yytext and yyleng in lex - will be set during scan.
71
+ */
72
+ public text(): string | null {
73
+ return this._text;
74
+ }
75
+
76
+ public leng(): number | null {
77
+ return this._leng;
78
+ }
79
+
80
+ /**
81
+ * Position of in stream, line number and column number of match.
82
+ */
83
+ public pos(): number {
84
+ return this._pos;
85
+ }
86
+
87
+ public line(): number {
88
+ return this._line;
89
+ }
90
+
91
+ public column(): number {
92
+ return this._col;
93
+ }
94
+
95
+ /**
96
+ * Analogous to input() in lex.
97
+ * @return {string} The next character in the stream.
98
+ */
99
+ public input(): string {
100
+ return this._input.charAt(this._pos + this._leng! + this._offset++);
101
+ }
102
+
103
+ /**
104
+ * Similar to unput() in lex, but does not allow modifying the stream.
105
+ * @return {int} The offset position after the operation.
106
+ */
107
+ public unput(): number {
108
+ return this._offset = this._offset > 0 ? this._offset-- : 0;
109
+ }
110
+
111
+ /**
112
+ * Analogous to yyless(n) in lex - retains the first n characters from this pattern, and returns
113
+ * the rest to the input stream, such that they will be used in the next pattern-matching operation.
114
+ * @param {int} n Number of characters to retain.
115
+ * @return {int} Length of the stream after the operation has completed.
116
+ */
117
+ public less(n: number): number {
118
+ this._less = n;
119
+ this._offset = 0;
120
+ this._text = this._text!.substring(0, n);
121
+ return this._leng = this._text.length;
122
+ }
123
+
124
+ /**
125
+ * Like less(), but instead of retaining the first n characters, it chops off the last n.
126
+ * @param {int} n Number of characters to chop.
127
+ * @return {int} Length of the stream after the operation has completed.
128
+ */
129
+ public pushback(n: number): number {
130
+ return this.less(this._leng! - n);
131
+ }
132
+
133
+ /**
134
+ * Similar to REJECT in lex, except it doesn't break the current execution context.
135
+ * TIP: reject() should be the last instruction in a spec callback.
136
+ */
137
+ public reject(): void {
138
+ this._go = true;
139
+ }
140
+
141
+ /**
142
+ * Analogous to BEGIN in lex - sets the named state (start condition).
143
+ * @param {string|int} state Name of state to switch to, or ordinal number (0 is first, etc).
144
+ * @return {string} The new state on successful switch, throws exception on failure.
145
+ */
146
+ public begin(state: string | number): string {
147
+ if (this._lexer.specification[state]) {
148
+ return this._newstate = state as string;
149
+ }
150
+ const s = this._lexer.states[parseInt(state as string)];
151
+ if (s) {
152
+ return this._newstate = s;
153
+ }
154
+ throw "Unknown state '" + state + "' requested";
155
+ }
156
+
157
+ /**
158
+ * Simple accessor for reading in the current state.
159
+ * @return {string} The current state.
160
+ */
161
+ public state(): string {
162
+ return this._state;
163
+ }
164
+
165
+ /**
166
+ * Scan method to be returned to caller - grabs the next token and fires appropriate calback.
167
+ * @return {T} The next token extracted from the stream.
168
+ */
169
+ public scan(): T | T[] {
170
+ if(this._pos >= this._input.length) {
171
+ return EOF as T;
172
+ }
173
+
174
+ const str = this._input.substring(this._pos);
175
+ const rules = this._lexer.specification[this._state];
176
+ const matches: IMatch<T>[] = [];
177
+ for (let i = 0; i < rules.length; i++) {
178
+ const rule = rules[i];
179
+ const mt = str.match(rule.re);
180
+ if (mt !== null && mt[0].length > 0) {
181
+ matches.push({
182
+ index: i,
183
+ text: mt[0],
184
+ len: mt[0].length,
185
+ rule: rule
186
+ });
187
+ }
188
+ }
189
+ if (matches.length === 0) {
190
+ throw new Error("No match found for input '" + str + "'");
191
+ }
192
+ matches.sort(matchcompare);
193
+ this._go = true;
194
+
195
+ let result: T | T[];
196
+ let m: IMatch<T>;
197
+ for (let j = 0, n = matches.length; j < n && this._go; j++) {
198
+ this._offset = 0;
199
+ this._less = null;
200
+ this._go = false;
201
+ this._newstate = null;
202
+ m = matches[j];
203
+ this._text = m.text;
204
+ this._leng = m.len;
205
+ result = m.rule.action(this);
206
+ if (this._newstate && this._newstate != this._state) {
207
+ this._state = this._newstate;
208
+ break;
209
+ }
210
+ }
211
+ const text = this._less === null ? m!.text : m!.text.substring(0, this._less);
212
+ const len = text.length;
213
+ this._pos += len + this._offset;
214
+
215
+ const nlm = text.match(/\n/g);
216
+ if (nlm !== null) {
217
+ this._line += nlm.length;
218
+ this._col = len - text.lastIndexOf("\n") - 1;
219
+ } else {
220
+ this._col += len;
221
+ }
222
+ return result!;
223
+ }
224
+ }
225
+
226
+ export class JSLex<T> {
227
+ public states: string[];
228
+ public specification: Record<string, IRule<T>[]>;
229
+
230
+ constructor(spec: ILexSpec<T>) {
231
+ this.states = Object.keys(spec);
232
+ this.specification = {};
233
+
234
+ // build out internal representation of the provided spec
235
+ for (const s of this.states) {
236
+ // e.g. s = "start"
237
+ const rule_map = spec[s] as Map<string, (arg0: Scanner<T>) => T | T[]>;
238
+
239
+ if (s in this.specification) {
240
+ throw "Duplicate state declaration encountered for state '" + s + "'";
241
+ }
242
+
243
+ this.specification[s] = [] as IRule<T>[];
244
+
245
+ for (const [k,v] of rule_map.entries()) {
246
+ let re: RegExp;
247
+ try {
248
+ re = new RegExp('^' + k);
249
+ } catch (err) {
250
+ throw "Invalid regexp '" + k + "' in state '" + s + "' (" + (err as Error).message + ")";
251
+ }
252
+ this.specification[s].push({
253
+ re: re,
254
+ action: v
255
+ });
256
+ }
257
+ }
258
+ }
259
+
260
+ /**
261
+ * Scanner function - makes a new scanner object which is used to get tokens one at a time.
262
+ * @param {string} input Input text to tokenize.
263
+ * @return {function} Scanner function.
264
+ */
265
+ public scanner(input: string): Scanner<T> {
266
+ return new Scanner(input, this);
267
+ }
268
+
269
+ /**
270
+ * Similar to lex's yylex() function, consumes all input, calling calback for each token.
271
+ * @param {string} input Text to lex.
272
+ * @param {function} callback Function to execute for each token.
273
+ */
274
+ public lex(input: string, callback: (arg0: T | T[]) => void) {
275
+ const scanner = this.scanner(input);
276
+ while (true) {
277
+ const token = scanner.scan();
278
+ if (token === EOF) {
279
+ return;
280
+ }
281
+ if (token !== undefined) {
282
+ callback(token);
283
+ }
284
+ }
285
+ }
286
+
287
+ /**
288
+ * Consumes all input, collecting tokens along the way.
289
+ * @param {string} input Text to lex.
290
+ * @return {array} List of tokens, may contain an Error at the end.
291
+ */
292
+ public collect(input: string): T[] {
293
+ const tokens: T[] = [];
294
+ const callback = function(item: T | T[]) {
295
+ if (Array.isArray(item)) {
296
+ tokens.push(...item);
297
+ } else {
298
+ tokens.push(item);
299
+ }
300
+ };
301
+ this.lex(input, callback);
302
+ return tokens;
303
+ }
304
+ };
package/src/tex-parser.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { symbolMap } from "./map";
2
2
  import { TexNode, TexSupsubData, TexToken, TexTokenType } from "./types";
3
- import { isalpha, isdigit, assert } from "./util";
4
-
3
+ import { assert } from "./util";
4
+ import { JSLex, Scanner } from "./jslex";
5
5
 
6
6
  const UNARY_COMMANDS = [
7
7
  'sqrt',
@@ -33,6 +33,8 @@ const UNARY_COMMANDS = [
33
33
  'vec',
34
34
  'widehat',
35
35
  'widetilde',
36
+ 'overleftarrow',
37
+ 'overrightarrow',
36
38
  ]
37
39
 
38
40
  const BINARY_COMMANDS = [
@@ -95,15 +97,6 @@ function eat_primes(tokens: TexToken[], start: number): number {
95
97
  }
96
98
 
97
99
 
98
- function eat_command_name(latex: string, start: number): string {
99
- let pos = start;
100
- while (pos < latex.length && isalpha(latex[pos])) {
101
- pos += 1;
102
- }
103
- return latex.substring(start, pos);
104
- }
105
-
106
-
107
100
  function find_closing_match(tokens: TexToken[], start: number, leftToken: TexToken, rightToken: TexToken): number {
108
101
  assert(tokens[start].eq(leftToken));
109
102
  let count = 1;
@@ -141,135 +134,49 @@ function find_closing_end_command(tokens: TexToken[], start: number): number {
141
134
  return find_closing_match(tokens, start, BEGIN_COMMAND, END_COMMAND);
142
135
  }
143
136
 
144
- function find_closing_curly_bracket_char(latex: string, start: number): number {
145
- assert(latex[start] === '{');
146
- let count = 1;
147
- let pos = start + 1;
148
137
 
149
- while (count > 0) {
150
- if (pos >= latex.length) {
151
- throw new LatexParserError('Unmatched curly brackets');
152
- }
153
- if(pos + 1 < latex.length && (['\\{', '\\}'].includes(latex.substring(pos, pos + 2)))) {
154
- pos += 2;
155
- continue;
156
- }
157
- if (latex[pos] === '{') {
158
- count += 1;
159
- } else if (latex[pos] === '}') {
160
- count -= 1;
161
- }
162
- pos += 1;
138
+ function unescape(str: string): string {
139
+ const chars = ['{', '}', '\\', '$', '&', '#', '_', '%'];
140
+ for (const char of chars) {
141
+ str = str.replaceAll('\\' + char, char);
163
142
  }
164
-
165
- return pos - 1;
143
+ return str;
166
144
  }
167
145
 
168
-
169
- export function tokenize(latex: string): TexToken[] {
170
- const tokens: TexToken[] = [];
171
- let pos = 0;
172
-
173
- while (pos < latex.length) {
174
- const firstChar = latex[pos];
175
- let token: TexToken;
176
- switch (firstChar) {
177
- case '%': {
178
- let newPos = pos + 1;
179
- while (newPos < latex.length && latex[newPos] !== '\n') {
180
- newPos += 1;
181
- }
182
- token = new TexToken(TexTokenType.COMMENT, latex.slice(pos + 1, newPos));
183
- pos = newPos;
184
- break;
185
- }
186
- case '{':
187
- case '}':
188
- case '_':
189
- case '^':
190
- case '&':
191
- token = new TexToken(TexTokenType.CONTROL, firstChar);
192
- pos++;
193
- break;
194
- case '\n':
195
- token = new TexToken(TexTokenType.NEWLINE, firstChar);
196
- pos++;
197
- break;
198
- case '\r': {
199
- if (pos + 1 < latex.length && latex[pos + 1] === '\n') {
200
- token = new TexToken(TexTokenType.NEWLINE, '\n');
201
- pos += 2;
202
- } else {
203
- token = new TexToken(TexTokenType.NEWLINE, '\n');
204
- pos ++;
205
- }
206
- break;
207
- }
208
- case ' ': {
209
- let newPos = pos;
210
- while (newPos < latex.length && latex[newPos] === ' ') {
211
- newPos += 1;
212
- }
213
- token = new TexToken(TexTokenType.SPACE, latex.slice(pos, newPos));
214
- pos = newPos;
215
- break;
216
- }
217
- case '\\': {
218
- if (pos + 1 >= latex.length) {
219
- throw new LatexParserError('Expecting command name after \\');
220
- }
221
- const firstTwoChars = latex.slice(pos, pos + 2);
222
- if (['\\\\', '\\,'].includes(firstTwoChars)) {
223
- token = new TexToken(TexTokenType.CONTROL, firstTwoChars);
224
- } else if (['\\{','\\}', '\\%', '\\$', '\\&', '\\#', '\\_', '\\|'].includes(firstTwoChars)) {
225
- // \| is double vertical bar, not the same as just |
226
- token = new TexToken(TexTokenType.ELEMENT, firstTwoChars);
227
- } else {
228
- const command = eat_command_name(latex, pos + 1);
229
- token = new TexToken(TexTokenType.COMMAND, '\\' + command);
230
- }
231
- pos += token.value.length;
232
- break;
233
- }
234
- default: {
235
- if (isdigit(firstChar)) {
236
- let newPos = pos;
237
- while (newPos < latex.length && isdigit(latex[newPos])) {
238
- newPos += 1;
239
- }
240
- token = new TexToken(TexTokenType.ELEMENT, latex.slice(pos, newPos));
241
- } else if (isalpha(firstChar)) {
242
- token = new TexToken(TexTokenType.ELEMENT, firstChar);
243
- } else if ('+-*/=\'<>!.,;:?()[]|'.includes(firstChar)) {
244
- token = new TexToken(TexTokenType.ELEMENT, firstChar)
245
- } else {
246
- token = new TexToken(TexTokenType.UNKNOWN, firstChar);
247
- }
248
- pos += token.value.length;
249
- }
250
- }
251
-
252
- tokens.push(token);
253
-
254
- if (token.type === TexTokenType.COMMAND && ['\\text', '\\operatorname', '\\begin', '\\end'].includes(token.value)) {
255
- if (pos >= latex.length || latex[pos] !== '{') {
256
- throw new LatexParserError(`No content for ${token.value} command`);
257
- }
258
- tokens.push(new TexToken(TexTokenType.CONTROL, '{'));
259
- const posClosingBracket = find_closing_curly_bracket_char(latex, pos);
260
- pos++;
261
- let textInside = latex.slice(pos, posClosingBracket);
262
- // replace all escape characters with their actual characters
263
- const chars = ['{', '}', '\\', '$', '&', '#', '_', '%'];
264
- for (const char of chars) {
265
- textInside = textInside.replaceAll('\\' + char, char);
266
- }
267
- tokens.push(new TexToken(TexTokenType.TEXT, textInside));
268
- tokens.push(new TexToken(TexTokenType.CONTROL, '}'));
269
- pos = posClosingBracket + 1;
146
+ const rules_map = new Map<string, (a: Scanner<TexToken>) => TexToken | TexToken[]>([
147
+ [
148
+ String.raw`\\(text|operatorname|begin|end){.+?}`, (s) => {
149
+ const text = s.text()!;
150
+ const command = text.substring(0, text.indexOf('{'));
151
+ const text_inside = text.substring(text.indexOf('{') + 1, text.lastIndexOf('}'));
152
+ return [
153
+ new TexToken(TexTokenType.COMMAND, command),
154
+ new TexToken(TexTokenType.CONTROL, '{'),
155
+ new TexToken(TexTokenType.TEXT, unescape(text_inside)),
156
+ new TexToken(TexTokenType.CONTROL, '}')
157
+ ]
270
158
  }
271
- }
272
- return tokens;
159
+ ],
160
+ [String.raw`%[^\n]*`, (s) => new TexToken(TexTokenType.COMMENT, s.text()!.substring(1))],
161
+ [String.raw`[{}_^&]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
162
+ [String.raw`\r?\n`, (_s) => new TexToken(TexTokenType.NEWLINE, "\n")],
163
+ [String.raw`\s+`, (s) => new TexToken(TexTokenType.SPACE, s.text()!)],
164
+ [String.raw`\\[\\,]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
165
+ [String.raw`\\[{}%$&#_|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
166
+ [String.raw`\\[a-zA-Z]+`, (s) => new TexToken(TexTokenType.COMMAND, s.text()!)],
167
+ [String.raw`[0-9]+`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
168
+ [String.raw`[a-zA-Z]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
169
+ [String.raw`[+\-*/='<>!.,;:?()\[\]|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
170
+ [String.raw`.`, (s) => new TexToken(TexTokenType.UNKNOWN, s.text()!)],
171
+ ]);
172
+
173
+ const spec = {
174
+ "start": rules_map
175
+ };
176
+
177
+ export function tokenize_tex(input: string): TexToken[] {
178
+ const lexer = new JSLex<TexToken>(spec);
179
+ return lexer.collect(input);
273
180
  }
274
181
 
275
182
 
@@ -633,7 +540,7 @@ function passExpandCustomTexMacros(tokens: TexToken[], customTexMacros: {[key: s
633
540
  let out_tokens: TexToken[] = [];
634
541
  for (const token of tokens) {
635
542
  if (token.type === TexTokenType.COMMAND && customTexMacros[token.value]) {
636
- const expanded_tokens = tokenize(customTexMacros[token.value]);
543
+ const expanded_tokens = tokenize_tex(customTexMacros[token.value]);
637
544
  out_tokens = out_tokens.concat(expanded_tokens);
638
545
  } else {
639
546
  out_tokens.push(token);
@@ -644,7 +551,7 @@ function passExpandCustomTexMacros(tokens: TexToken[], customTexMacros: {[key: s
644
551
 
645
552
  export function parseTex(tex: string, customTexMacros: {[key: string]: string}): TexNode {
646
553
  const parser = new LatexParser();
647
- let tokens = tokenize(tex);
554
+ let tokens = tokenize_tex(tex);
648
555
  tokens = passIgnoreWhitespaceBeforeScriptMark(tokens);
649
556
  tokens = passExpandCustomTexMacros(tokens, customTexMacros);
650
557
  return parser.parse(tokens);