tex2typst 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/jslex.ts ADDED
@@ -0,0 +1,304 @@
1
+ /**
2
+ * Adapted from jslex - A lexer in JavaScript. https://github.com/jimbojw/jslex
3
+ * Licensed under MIT license
4
+ */
5
+
6
+
7
// User-facing lexer specification: each key is a state name mapping a
// regex-source string to the action invoked when that pattern matches.
// NOTE(review): only "start" is declared here, but JSLex's constructor
// iterates Object.keys(spec) and indexes arbitrary state names — confirm
// whether multi-state specs are meant to be supported by this type.
interface ILexSpec<T> {
    start: Map<string, (arg0: Scanner<T>) => T | T[]>;
}

// One compiled rule: the pattern (anchored with '^' by JSLex) plus the
// user callback that turns the matched text into token(s).
interface IRule<T> {
    re: RegExp;
    action: (a: Scanner<T>) => T | T[];
}

// A candidate match produced during one scan step.
interface IMatch<T> {
    index: number;  // position of the rule in its state's rule list (priority tiebreak)
    text: string;   // matched text
    len: number;    // length of the matched text
    rule: IRule<T>; // rule that produced this match
}


// End of File marker — compared by identity (===) in JSLex.lex()
const EOF = {};
26
+
27
+ /**
28
+ * Utility function for comparing two matches.
29
+ * @param {object} m1 Left-hand side match.
30
+ * @param {object} m2 Right-hand side match.
31
+ * @return {int} Difference between the matches.
32
+ */
33
+ function matchcompare<T>(m1: IMatch<T>, m2: IMatch<T>): number {
34
+ if(m2.len !== m1.len) {
35
+ return m2.len - m1.len;
36
+ } else {
37
+ return m1.index - m2.index;
38
+ }
39
+ }
40
+
41
+ export class Scanner<T> {
42
+ private _input: string;
43
+ private _lexer: JSLex<T>;
44
+
45
+ // position within input stream
46
+ private _pos: number = 0;
47
+
48
+ // current line number
49
+ private _line: number = 0;
50
+
51
+ // current column number
52
+ private _col: number = 0;
53
+
54
+ private _offset: number = 0;
55
+ private _less: number | null = null;
56
+ private _go: boolean = false;
57
+ private _newstate: string | null = null;
58
+ private _state: string;
59
+
60
+ private _text: string | null = null;
61
+ private _leng: number | null = null;
62
+
63
+ constructor(input: string, lexer: JSLex<T>) {
64
+ this._input = input;
65
+ this._lexer = lexer;
66
+ this._state = lexer.states[0];
67
+ }
68
+
69
+ /**
70
+ * Analogous to yytext and yyleng in lex - will be set during scan.
71
+ */
72
+ public text(): string | null {
73
+ return this._text;
74
+ }
75
+
76
+ public leng(): number | null {
77
+ return this._leng;
78
+ }
79
+
80
+ /**
81
+ * Position of in stream, line number and column number of match.
82
+ */
83
+ public pos(): number {
84
+ return this._pos;
85
+ }
86
+
87
+ public line(): number {
88
+ return this._line;
89
+ }
90
+
91
+ public column(): number {
92
+ return this._col;
93
+ }
94
+
95
+ /**
96
+ * Analogous to input() in lex.
97
+ * @return {string} The next character in the stream.
98
+ */
99
+ public input(): string {
100
+ return this._input.charAt(this._pos + this._leng! + this._offset++);
101
+ }
102
+
103
+ /**
104
+ * Similar to unput() in lex, but does not allow modifying the stream.
105
+ * @return {int} The offset position after the operation.
106
+ */
107
+ public unput(): number {
108
+ return this._offset = this._offset > 0 ? this._offset-- : 0;
109
+ }
110
+
111
+ /**
112
+ * Analogous to yyless(n) in lex - retains the first n characters from this pattern, and returns
113
+ * the rest to the input stream, such that they will be used in the next pattern-matching operation.
114
+ * @param {int} n Number of characters to retain.
115
+ * @return {int} Length of the stream after the operation has completed.
116
+ */
117
+ public less(n: number): number {
118
+ this._less = n;
119
+ this._offset = 0;
120
+ this._text = this._text!.substring(0, n);
121
+ return this._leng = this._text.length;
122
+ }
123
+
124
+ /**
125
+ * Like less(), but instead of retaining the first n characters, it chops off the last n.
126
+ * @param {int} n Number of characters to chop.
127
+ * @return {int} Length of the stream after the operation has completed.
128
+ */
129
+ public pushback(n: number): number {
130
+ return this.less(this._leng! - n);
131
+ }
132
+
133
+ /**
134
+ * Similar to REJECT in lex, except it doesn't break the current execution context.
135
+ * TIP: reject() should be the last instruction in a spec callback.
136
+ */
137
+ public reject(): void {
138
+ this._go = true;
139
+ }
140
+
141
+ /**
142
+ * Analogous to BEGIN in lex - sets the named state (start condition).
143
+ * @param {string|int} state Name of state to switch to, or ordinal number (0 is first, etc).
144
+ * @return {string} The new state on successful switch, throws exception on failure.
145
+ */
146
+ public begin(state: string | number): string {
147
+ if (this._lexer.specification[state]) {
148
+ return this._newstate = state as string;
149
+ }
150
+ const s = this._lexer.states[parseInt(state as string)];
151
+ if (s) {
152
+ return this._newstate = s;
153
+ }
154
+ throw "Unknown state '" + state + "' requested";
155
+ }
156
+
157
+ /**
158
+ * Simple accessor for reading in the current state.
159
+ * @return {string} The current state.
160
+ */
161
+ public state(): string {
162
+ return this._state;
163
+ }
164
+
165
+ /**
166
+ * Scan method to be returned to caller - grabs the next token and fires appropriate calback.
167
+ * @return {T} The next token extracted from the stream.
168
+ */
169
+ public scan(): T | T[] {
170
+ if(this._pos >= this._input.length) {
171
+ return EOF as T;
172
+ }
173
+
174
+ const str = this._input.substring(this._pos);
175
+ const rules = this._lexer.specification[this._state];
176
+ const matches: IMatch<T>[] = [];
177
+ for (let i = 0; i < rules.length; i++) {
178
+ const rule = rules[i];
179
+ const mt = str.match(rule.re);
180
+ if (mt !== null && mt[0].length > 0) {
181
+ matches.push({
182
+ index: i,
183
+ text: mt[0],
184
+ len: mt[0].length,
185
+ rule: rule
186
+ });
187
+ }
188
+ }
189
+ if (matches.length === 0) {
190
+ throw new Error("No match found for input '" + str + "'");
191
+ }
192
+ matches.sort(matchcompare);
193
+ this._go = true;
194
+
195
+ let result: T | T[];
196
+ let m: IMatch<T>;
197
+ for (let j = 0, n = matches.length; j < n && this._go; j++) {
198
+ this._offset = 0;
199
+ this._less = null;
200
+ this._go = false;
201
+ this._newstate = null;
202
+ m = matches[j];
203
+ this._text = m.text;
204
+ this._leng = m.len;
205
+ result = m.rule.action(this);
206
+ if (this._newstate && this._newstate != this._state) {
207
+ this._state = this._newstate;
208
+ break;
209
+ }
210
+ }
211
+ const text = this._less === null ? m!.text : m!.text.substring(0, this._less);
212
+ const len = text.length;
213
+ this._pos += len + this._offset;
214
+
215
+ const nlm = text.match(/\n/g);
216
+ if (nlm !== null) {
217
+ this._line += nlm.length;
218
+ this._col = len - text.lastIndexOf("\n") - 1;
219
+ } else {
220
+ this._col += len;
221
+ }
222
+ return result!;
223
+ }
224
+ }
225
+
226
+ export class JSLex<T> {
227
+ public states: string[];
228
+ public specification: Record<string, IRule<T>[]>;
229
+
230
+ constructor(spec: ILexSpec<T>) {
231
+ this.states = Object.keys(spec);
232
+ this.specification = {};
233
+
234
+ // build out internal representation of the provided spec
235
+ for (const s of this.states) {
236
+ // e.g. s = "start"
237
+ const rule_map = spec[s] as Map<string, (arg0: Scanner<T>) => T | T[]>;
238
+
239
+ if (s in this.specification) {
240
+ throw "Duplicate state declaration encountered for state '" + s + "'";
241
+ }
242
+
243
+ this.specification[s] = [] as IRule<T>[];
244
+
245
+ for (const [k,v] of rule_map.entries()) {
246
+ let re: RegExp;
247
+ try {
248
+ re = new RegExp('^' + k);
249
+ } catch (err) {
250
+ throw "Invalid regexp '" + k + "' in state '" + s + "' (" + (err as Error).message + ")";
251
+ }
252
+ this.specification[s].push({
253
+ re: re,
254
+ action: v
255
+ });
256
+ }
257
+ }
258
+ }
259
+
260
+ /**
261
+ * Scanner function - makes a new scanner object which is used to get tokens one at a time.
262
+ * @param {string} input Input text to tokenize.
263
+ * @return {function} Scanner function.
264
+ */
265
+ public scanner(input: string): Scanner<T> {
266
+ return new Scanner(input, this);
267
+ }
268
+
269
+ /**
270
+ * Similar to lex's yylex() function, consumes all input, calling calback for each token.
271
+ * @param {string} input Text to lex.
272
+ * @param {function} callback Function to execute for each token.
273
+ */
274
+ public lex(input: string, callback: (arg0: T | T[]) => void) {
275
+ const scanner = this.scanner(input);
276
+ while (true) {
277
+ const token = scanner.scan();
278
+ if (token === EOF) {
279
+ return;
280
+ }
281
+ if (token !== undefined) {
282
+ callback(token);
283
+ }
284
+ }
285
+ }
286
+
287
+ /**
288
+ * Consumes all input, collecting tokens along the way.
289
+ * @param {string} input Text to lex.
290
+ * @return {array} List of tokens, may contain an Error at the end.
291
+ */
292
+ public collect(input: string): T[] {
293
+ const tokens: T[] = [];
294
+ const callback = function(item: T | T[]) {
295
+ if (Array.isArray(item)) {
296
+ tokens.push(...item);
297
+ } else {
298
+ tokens.push(item);
299
+ }
300
+ };
301
+ this.lex(input, callback);
302
+ return tokens;
303
+ }
304
+ };
package/src/tex-parser.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { symbolMap } from "./map";
2
2
  import { TexNode, TexSupsubData, TexToken, TexTokenType } from "./types";
3
- import { isalpha, isdigit, assert } from "./util";
4
-
3
+ import { assert } from "./util";
4
+ import { JSLex, Scanner } from "./jslex";
5
5
 
6
6
  const UNARY_COMMANDS = [
7
7
  'sqrt',
@@ -33,6 +33,8 @@ const UNARY_COMMANDS = [
33
33
  'vec',
34
34
  'widehat',
35
35
  'widetilde',
36
+ 'overleftarrow',
37
+ 'overrightarrow',
36
38
  ]
37
39
 
38
40
  const BINARY_COMMANDS = [
@@ -95,15 +97,6 @@ function eat_primes(tokens: TexToken[], start: number): number {
95
97
  }
96
98
 
97
99
 
98
- function eat_command_name(latex: string, start: number): string {
99
- let pos = start;
100
- while (pos < latex.length && isalpha(latex[pos])) {
101
- pos += 1;
102
- }
103
- return latex.substring(start, pos);
104
- }
105
-
106
-
107
100
  function find_closing_match(tokens: TexToken[], start: number, leftToken: TexToken, rightToken: TexToken): number {
108
101
  assert(tokens[start].eq(leftToken));
109
102
  let count = 1;
@@ -141,135 +134,49 @@ function find_closing_end_command(tokens: TexToken[], start: number): number {
141
134
  return find_closing_match(tokens, start, BEGIN_COMMAND, END_COMMAND);
142
135
  }
143
136
 
144
- function find_closing_curly_bracket_char(latex: string, start: number): number {
145
- assert(latex[start] === '{');
146
- let count = 1;
147
- let pos = start + 1;
148
137
 
149
- while (count > 0) {
150
- if (pos >= latex.length) {
151
- throw new LatexParserError('Unmatched curly brackets');
152
- }
153
- if(pos + 1 < latex.length && (['\\{', '\\}'].includes(latex.substring(pos, pos + 2)))) {
154
- pos += 2;
155
- continue;
156
- }
157
- if (latex[pos] === '{') {
158
- count += 1;
159
- } else if (latex[pos] === '}') {
160
- count -= 1;
161
- }
162
- pos += 1;
138
+ function unescape(str: string): string {
139
+ const chars = ['{', '}', '\\', '$', '&', '#', '_', '%'];
140
+ for (const char of chars) {
141
+ str = str.replaceAll('\\' + char, char);
163
142
  }
164
-
165
- return pos - 1;
143
+ return str;
166
144
  }
167
145
 
168
-
169
- export function tokenize(latex: string): TexToken[] {
170
- const tokens: TexToken[] = [];
171
- let pos = 0;
172
-
173
- while (pos < latex.length) {
174
- const firstChar = latex[pos];
175
- let token: TexToken;
176
- switch (firstChar) {
177
- case '%': {
178
- let newPos = pos + 1;
179
- while (newPos < latex.length && latex[newPos] !== '\n') {
180
- newPos += 1;
181
- }
182
- token = new TexToken(TexTokenType.COMMENT, latex.slice(pos + 1, newPos));
183
- pos = newPos;
184
- break;
185
- }
186
- case '{':
187
- case '}':
188
- case '_':
189
- case '^':
190
- case '&':
191
- token = new TexToken(TexTokenType.CONTROL, firstChar);
192
- pos++;
193
- break;
194
- case '\n':
195
- token = new TexToken(TexTokenType.NEWLINE, firstChar);
196
- pos++;
197
- break;
198
- case '\r': {
199
- if (pos + 1 < latex.length && latex[pos + 1] === '\n') {
200
- token = new TexToken(TexTokenType.NEWLINE, '\n');
201
- pos += 2;
202
- } else {
203
- token = new TexToken(TexTokenType.NEWLINE, '\n');
204
- pos ++;
205
- }
206
- break;
207
- }
208
- case ' ': {
209
- let newPos = pos;
210
- while (newPos < latex.length && latex[newPos] === ' ') {
211
- newPos += 1;
212
- }
213
- token = new TexToken(TexTokenType.SPACE, latex.slice(pos, newPos));
214
- pos = newPos;
215
- break;
216
- }
217
- case '\\': {
218
- if (pos + 1 >= latex.length) {
219
- throw new LatexParserError('Expecting command name after \\');
220
- }
221
- const firstTwoChars = latex.slice(pos, pos + 2);
222
- if (['\\\\', '\\,'].includes(firstTwoChars)) {
223
- token = new TexToken(TexTokenType.CONTROL, firstTwoChars);
224
- } else if (['\\{','\\}', '\\%', '\\$', '\\&', '\\#', '\\_', '\\|'].includes(firstTwoChars)) {
225
- // \| is double vertical bar, not the same as just |
226
- token = new TexToken(TexTokenType.ELEMENT, firstTwoChars);
227
- } else {
228
- const command = eat_command_name(latex, pos + 1);
229
- token = new TexToken(TexTokenType.COMMAND, '\\' + command);
230
- }
231
- pos += token.value.length;
232
- break;
233
- }
234
- default: {
235
- if (isdigit(firstChar)) {
236
- let newPos = pos;
237
- while (newPos < latex.length && isdigit(latex[newPos])) {
238
- newPos += 1;
239
- }
240
- token = new TexToken(TexTokenType.ELEMENT, latex.slice(pos, newPos));
241
- } else if (isalpha(firstChar)) {
242
- token = new TexToken(TexTokenType.ELEMENT, firstChar);
243
- } else if ('+-*/=\'<>!.,;:?()[]|'.includes(firstChar)) {
244
- token = new TexToken(TexTokenType.ELEMENT, firstChar)
245
- } else {
246
- token = new TexToken(TexTokenType.UNKNOWN, firstChar);
247
- }
248
- pos += token.value.length;
249
- }
250
- }
251
-
252
- tokens.push(token);
253
-
254
- if (token.type === TexTokenType.COMMAND && ['\\text', '\\operatorname', '\\begin', '\\end'].includes(token.value)) {
255
- if (pos >= latex.length || latex[pos] !== '{') {
256
- throw new LatexParserError(`No content for ${token.value} command`);
257
- }
258
- tokens.push(new TexToken(TexTokenType.CONTROL, '{'));
259
- const posClosingBracket = find_closing_curly_bracket_char(latex, pos);
260
- pos++;
261
- let textInside = latex.slice(pos, posClosingBracket);
262
- // replace all escape characters with their actual characters
263
- const chars = ['{', '}', '\\', '$', '&', '#', '_', '%'];
264
- for (const char of chars) {
265
- textInside = textInside.replaceAll('\\' + char, char);
266
- }
267
- tokens.push(new TexToken(TexTokenType.TEXT, textInside));
268
- tokens.push(new TexToken(TexTokenType.CONTROL, '}'));
269
- pos = posClosingBracket + 1;
146
+ const rules_map = new Map<string, (a: Scanner<TexToken>) => TexToken | TexToken[]>([
147
+ [
148
+ String.raw`\\(text|operatorname|begin|end){.+?}`, (s) => {
149
+ const text = s.text()!;
150
+ const command = text.substring(0, text.indexOf('{'));
151
+ const text_inside = text.substring(text.indexOf('{') + 1, text.lastIndexOf('}'));
152
+ return [
153
+ new TexToken(TexTokenType.COMMAND, command),
154
+ new TexToken(TexTokenType.CONTROL, '{'),
155
+ new TexToken(TexTokenType.TEXT, unescape(text_inside)),
156
+ new TexToken(TexTokenType.CONTROL, '}')
157
+ ]
270
158
  }
271
- }
272
- return tokens;
159
+ ],
160
+ [String.raw`%[^\n]*`, (s) => new TexToken(TexTokenType.COMMENT, s.text()!.substring(1))],
161
+ [String.raw`[{}_^&]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
162
+ [String.raw`\r?\n`, (_s) => new TexToken(TexTokenType.NEWLINE, "\n")],
163
+ [String.raw`\s+`, (s) => new TexToken(TexTokenType.SPACE, s.text()!)],
164
+ [String.raw`\\[\\,]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
165
+ [String.raw`\\[{}%$&#_|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
166
+ [String.raw`\\[a-zA-Z]+`, (s) => new TexToken(TexTokenType.COMMAND, s.text()!)],
167
+ [String.raw`[0-9]+`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
168
+ [String.raw`[a-zA-Z]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
169
+ [String.raw`[+\-*/='<>!.,;:?()\[\]|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
170
+ [String.raw`.`, (s) => new TexToken(TexTokenType.UNKNOWN, s.text()!)],
171
+ ]);
172
+
173
+ const spec = {
174
+ "start": rules_map
175
+ };
176
+
177
+ export function tokenize_tex(input: string): TexToken[] {
178
+ const lexer = new JSLex<TexToken>(spec);
179
+ return lexer.collect(input);
273
180
  }
274
181
 
275
182
 
@@ -633,7 +540,7 @@ function passExpandCustomTexMacros(tokens: TexToken[], customTexMacros: {[key: s
633
540
  let out_tokens: TexToken[] = [];
634
541
  for (const token of tokens) {
635
542
  if (token.type === TexTokenType.COMMAND && customTexMacros[token.value]) {
636
- const expanded_tokens = tokenize(customTexMacros[token.value]);
543
+ const expanded_tokens = tokenize_tex(customTexMacros[token.value]);
637
544
  out_tokens = out_tokens.concat(expanded_tokens);
638
545
  } else {
639
546
  out_tokens.push(token);
@@ -644,7 +551,7 @@ function passExpandCustomTexMacros(tokens: TexToken[], customTexMacros: {[key: s
644
551
 
645
552
  export function parseTex(tex: string, customTexMacros: {[key: string]: string}): TexNode {
646
553
  const parser = new LatexParser();
647
- let tokens = tokenize(tex);
554
+ let tokens = tokenize_tex(tex);
648
555
  tokens = passIgnoreWhitespaceBeforeScriptMark(tokens);
649
556
  tokens = passExpandCustomTexMacros(tokens, customTexMacros);
650
557
  return parser.parse(tokens);