npm - @clickhouse/datatype-parser - Versions diffs - 0.1.0 - Mend

@clickhouse/datatype-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/json.js ADDED Viewed

@@ -0,0 +1,246 @@
+/// Serialize a node tree to JSON, matching the server's `formatASTAsJSON`
+/// shape for data types. This is a faithful TypeScript port of the C++
+/// `src/json.cpp`. `indent` < 0 produces compact output; >= 0 produces pretty
+/// output with that many spaces per level.
+///
+/// Byte-faithfulness: the C++ `escapeTo` iterates over the `unsigned char`
+/// bytes of the input. To match it exactly for multibyte UTF-8 content we
+/// accumulate the whole document into a byte buffer (`number[]` of byte
+/// values): structural characters are ASCII, strings are encoded to their
+/// UTF-8 bytes, and only bytes < 0x20 are special-cased (the ASCII escapes plus
+/// the `\u%04x` fallback). All bytes >= 0x20 are pushed through verbatim. At
+/// the end we decode the buffer back to a (byte-identical) JS string.
+import { NodeKind } from "./ast.js";
+const encoder = new TextEncoder(); /// shared encoder; escapeTo uses it to get the UTF-8 bytes of each string
+const decoder = new TextDecoder("utf-8"); /// shared decoder; toJSON uses it to turn the finished byte buffer into a string
+function escapeTo(out, s) { /// Append `s` to the byte buffer `out` as a double-quoted, escaped JSON string.
+    out.push(0x22 /* '"' */);
+    for (const c of encoder.encode(s)) { /// walk the UTF-8 bytes of `s`, not its UTF-16 code units
+        switch (c) {
+            case 0x22 /* '"' */:
+                pushAscii(out, '\\"');
+                break;
+            case 0x5c /* '\\' */:
+                pushAscii(out, "\\\\");
+                break;
+            case 0x08 /* '\b' */:
+                pushAscii(out, "\\b");
+                break;
+            case 0x0c /* '\f' */:
+                pushAscii(out, "\\f");
+                break;
+            case 0x0a /* '\n' */:
+                pushAscii(out, "\\n");
+                break;
+            case 0x0d /* '\r' */:
+                pushAscii(out, "\\r");
+                break;
+            case 0x09 /* '\t' */:
+                pushAscii(out, "\\t");
+                break;
+            default:
+                if (c < 0x20) { /// any other control byte: no short escape exists, use the numeric form
+                    /// \u%04x — lowercase hex, 4 digits.
+                    pushAscii(out, "\\u" + c.toString(16).padStart(4, "0"));
+                }
+                else {
+                    out.push(c); /// every byte >= 0x20 (multibyte UTF-8 sequences included) is copied unchanged
+                }
+        }
+    }
+    out.push(0x22 /* '"' */);
+}
+/// Append to `out` one number per UTF-16 code unit of `s`. Intended for structural/escape
+/// text, which is always ASCII, so that each pushed number is a single byte value.
+function pushAscii(out, s) {
+    for (let i = 0; i < s.length; i++) {
+        out.push(s.charCodeAt(i)); /// no range check here: a non-ASCII code unit would be pushed as-is
+    }
+}
+class Writer { /// Output state for one serialization: the byte buffer plus the indentation setting.
+    out = []; /// accumulated output, one number per byte
+    indent; /// spaces per level, or < 0 for compact
+    constructor(indent) {
+        this.indent = indent;
+    }
+    newlineIndent(depth) { /// Start a new line indented for nesting level `depth`; does nothing in compact mode.
+        if (this.indent < 0)
+            return;
+        this.out.push(0x0a /* '\n' */);
+        const spaces = this.indent * depth; /// indent 0 therefore still emits the newline, just no spaces
+        for (let i = 0; i < spaces; i++) {
+            this.out.push(0x20 /* ' ' */);
+        }
+    }
+    colon() { /// Key/value separator: ":" in compact mode, ": " otherwise.
+        pushAscii(this.out, this.indent < 0 ? ":" : ": ");
+    }
+}
+/// Emit the `"key": ` prefix of one member of an object nested at `depth`, preceded by a comma
+/// unless `flag.first` is set; then sets `flag.first` to false (mirrors the C++ `bool & first`).
+function writeKey(w, key, flag, depth) {
+    if (!flag.first)
+        w.out.push(0x2c /* ',' */);
+    flag.first = false;
+    w.newlineIndent(depth + 1); /// members sit one level deeper than the braces of their object
+    escapeTo(w.out, key);
+    w.colon();
+}
+function writeArray(w, items, depth) { /// Emit `items` as a JSON array of node objects; `depth` is the level of the closing bracket.
+    if (items.length === 0) {
+        pushAscii(w.out, "[]"); /// an empty array is written inline, with no line break even in pretty mode
+        return;
+    }
+    w.out.push(0x5b /* '[' */);
+    let first = true; /// suppresses the comma before the first element
+    for (const item of items) {
+        if (!first)
+            w.out.push(0x2c /* ',' */);
+        first = false;
+        w.newlineIndent(depth + 1);
+        writeNode(w, item, depth + 1); /// each element is a full node object, one level deeper
+    }
+    w.newlineIndent(depth);
+    w.out.push(0x5d /* ']' */);
+}
+function writeStringArray(w, items, depth) { /// Emit `items` as a JSON array of strings; same layout rules as writeArray.
+    if (items.length === 0) {
+        pushAscii(w.out, "[]"); /// an empty array is written inline, with no line break even in pretty mode
+        return;
+    }
+    w.out.push(0x5b /* '[' */);
+    let first = true; /// suppresses the comma before the first element
+    for (const item of items) {
+        if (!first)
+            w.out.push(0x2c /* ',' */);
+        first = false;
+        w.newlineIndent(depth + 1);
+        escapeTo(w.out, item); /// each element is written as an escaped JSON string
+    }
+    w.newlineIndent(depth);
+    w.out.push(0x5d /* ']' */);
+}
+function writeLiteralValue(w, node) { /// Emit the value of a Literal node, choosing the JSON form from `node.value_type`.
+    /// 64-bit integers are emitted as JSON strings (the server's contract:
+    /// values above 2^53 lose precision under JS `JSON.parse`). Float64 is a
+    /// JSON number; String is a JSON string.
+    if (node.value_type === "Float64") {
+        pushAscii(w.out, node.value); /// written verbatim and unquoted; assumes the text is already a valid JSON number — TODO confirm against the parser
+    }
+    else if (node.value_type === "String") {
+        escapeTo(w.out, node.value);
+    } /// UInt64 / Int64 / fallback
+    else {
+        escapeTo(w.out, node.value); /// NOTE(review): identical to the String branch above; the two differ only in intent
+    }
+}
+/// Emit one node as a JSON object whose braces sit at level `depth`; members are written inline, in a fixed order per `node.kind`.
+function writeNode(w, node, depth) {
+    w.out.push(0x7b /* '{' */);
+    const flag = { first: true }; /// shared comma state for all members of this object
+    const key = (k) => writeKey(w, k, flag, depth); /// shorthand: emit the `"k": ` prefix of the next member
+    switch (node.kind) { /// no default case: an unrecognized kind emits only the braces
+        case NodeKind.DataType:
+            key("type");
+            escapeTo(w.out, "DataType");
+            key("name");
+            escapeTo(w.out, node.name);
+            if (node.has_argument_list) { /// "arguments" is omitted entirely when there is no argument list
+                key("arguments");
+                writeArray(w, node.arguments, depth + 1);
+            }
+            break;
+        case NodeKind.EnumDataType: {
+            key("type");
+            escapeTo(w.out, "EnumDataType");
+            key("name");
+            escapeTo(w.out, node.name);
+            key("values"); /// always present, unlike the optional members of the other kinds
+            if (node.values.length === 0) {
+                pushAscii(w.out, "[]");
+            }
+            else {
+                w.out.push(0x5b /* '[' */); /// hand-written array of {name, value} objects (elements are not nodes)
+                let vfirst = true;
+                for (const v of node.values) {
+                    if (!vfirst)
+                        w.out.push(0x2c /* ',' */);
+                    vfirst = false;
+                    w.newlineIndent(depth + 2); /// elements sit at depth + 2; writeKey places their members at depth + 3
+                    w.out.push(0x7b /* '{' */);
+                    const mflag = { first: true };
+                    writeKey(w, "name", mflag, depth + 2);
+                    escapeTo(w.out, v.name);
+                    writeKey(w, "value", mflag, depth + 2);
+                    pushAscii(w.out, v.value.toString()); /// written unquoted, i.e. as a JSON number
+                    w.newlineIndent(depth + 2);
+                    w.out.push(0x7d /* '}' */);
+                }
+                w.newlineIndent(depth + 1); /// closing bracket lines up with the "values" key
+                w.out.push(0x5d /* ']' */);
+            }
+            break;
+        }
+        case NodeKind.TupleDataType:
+            key("type");
+            escapeTo(w.out, "TupleDataType");
+            key("name");
+            escapeTo(w.out, node.name);
+            if (node.has_argument_list) {
+                key("arguments");
+                writeArray(w, node.arguments, depth + 1);
+            }
+            if (node.element_names.length > 0) { /// "element_names" is omitted when empty
+                key("element_names");
+                writeStringArray(w, node.element_names, depth + 1);
+            }
+            break;
+        case NodeKind.NameTypePair:
+            key("type");
+            escapeTo(w.out, "NameTypePair");
+            key("name");
+            escapeTo(w.out, node.name);
+            if (node.data_type) { /// "data_type" is omitted when the field is falsy
+                key("data_type");
+                writeNode(w, node.data_type, depth + 1);
+            }
+            break;
+        case NodeKind.Literal:
+            key("type");
+            escapeTo(w.out, "Literal");
+            key("value_type");
+            escapeTo(w.out, node.value_type);
+            key("value");
+            writeLiteralValue(w, node);
+            break;
+        case NodeKind.Function:
+            key("type");
+            escapeTo(w.out, "Function");
+            key("name");
+            escapeTo(w.out, node.name);
+            if (node.is_operator) { /// emitted only when true; never written as false
+                key("is_operator");
+                pushAscii(w.out, "true");
+            }
+            key("arguments"); /// unconditional here, unlike DataType / TupleDataType
+            writeArray(w, node.arguments, depth + 1);
+            break;
+        case NodeKind.Identifier:
+            key("type");
+            escapeTo(w.out, "Identifier");
+            key("name");
+            escapeTo(w.out, node.name);
+            if (node.name_parts.length > 0) { /// "name_parts" is omitted when empty
+                key("name_parts");
+                writeStringArray(w, node.name_parts, depth + 1);
+            }
+            break;
+    }
+    w.newlineIndent(depth);
+    w.out.push(0x7d /* '}' */);
+}
+export function toJSON(node, indent = 2) { /// Serialize the tree rooted at `node` to a JSON string; `indent` < 0 is compact, otherwise spaces per level.
+    const w = new Writer(indent);
+    writeNode(w, node, 0); /// the root object sits at nesting level 0
+    return decoder.decode(Uint8Array.from(w.out)); /// the byte buffer is converted to a string once, at the end
+}

package/dist/lexer.d.ts ADDED Viewed

@@ -0,0 +1,22 @@
+export declare const TokenType: { /// token kinds; each property's value is the same string as its name
+    readonly End: "End"; /// end of input
+    readonly Word: "Word"; /// bare identifier / keyword
+    readonly QuotedIdent: "QuotedIdent"; /// backtick- or double-quoted identifier
+    readonly Number: "Number"; /// numeric literal
+    readonly String: "String"; /// single-quoted string literal
+    readonly OpeningParen: "OpeningParen"; /// (
+    readonly ClosingParen: "ClosingParen"; /// )
+    readonly Comma: "Comma"; /// ,
+    readonly Equals: "Equals"; /// =
+    readonly Minus: "Minus"; /// -
+    readonly Dot: "Dot"; /// .
+    readonly Error: "Error"; /// malformed token
+};
+export type TokenType = (typeof TokenType)[keyof typeof TokenType]; /// union of the string values above, so the name works as a type too
+export interface Token { /// one element of the array returned by `tokenize`
+    type: TokenType; /// token kind
+    text: string; /// punctuation / Word / Number: the matched input text; QuotedIdent / String: the decoded body; Error: the message; End: ""
+    is_float: boolean; /// true only for a Number token that has a fraction or an exponent
+    begin: number; /// index in the input string where the token starts (for End: where scanning stopped)
+}
+export declare function tokenize(input: string): Token[]; /// tokenizes the whole of `input`; the result always ends with an End token

package/dist/lexer.js ADDED Viewed

@@ -0,0 +1,273 @@
+/// A small purpose-built tokenizer for ClickHouse data-type strings.
+///
+/// Type strings use a tiny slice of the SQL grammar — identifiers (bare,
+/// backtick- or double-quoted), single-quoted string literals, numbers, and a
+/// handful of punctuation tokens. Rather than vendor the full ClickHouse
+/// `Lexer` (and its `UTF8Helpers` / `find_symbols` dependencies), this covers
+/// exactly that slice, keeping the library free of any ClickHouse headers.
+///
+/// This is a faithful TypeScript port of the original C++ lexer.
+/// Token kinds. A plain `const` object rather than a TS `enum`, so the source is erasable
+/// and runs under Node's native type-stripping (which rejects `enum`). The companion
+/// `TokenType` type (see lexer.d.ts) makes the name usable as both a value and a type.
+export const TokenType = {
+    End: "End", /// end of input
+    Word: "Word", /// bare identifier / keyword, e.g. UInt8, Array, SIGNED
+    QuotedIdent: "QuotedIdent", /// `backtick` or "double"-quoted identifier (decoded)
+    Number: "Number", /// numeric literal (raw text, no sign)
+    String: "String", /// single-quoted string literal (decoded)
+    OpeningParen: "OpeningParen", /// (
+    ClosingParen: "ClosingParen", /// )
+    Comma: "Comma", /// ,
+    Equals: "Equals", /// =
+    Minus: "Minus", /// -
+    Dot: "Dot", /// .
+    Error: "Error", /// malformed token; `text` holds the message
+};
+function isSpace(c) { /// True if `c` is one of the six ASCII whitespace characters listed below.
+    return (c === " " ||
+        c === "\t" ||
+        c === "\n" ||
+        c === "\r" ||
+        c === "\f" ||
+        c === "\v");
+}
+function isDigit(c) { /// True if `c` is an ASCII decimal digit.
+    return c >= "0" && c <= "9"; /// string comparison; callers pass a single character
+}
+function isWordFirst(c) { /// True if `c` may start a bare word: an ASCII letter, `_`, or `$`.
+    return ((c >= "a" && c <= "z") || (c >= "A" && c <= "Z") || c === "_" || c === "$");
+}
+function isWordChar(c) { /// True if `c` may continue a bare word: any word-start character or an ASCII digit.
+    return isWordFirst(c) || isDigit(c);
+}
+/// Decode the body of a quoted token (string literal or quoted identifier). `pos` is the
+/// index of the opening `quote` character. Handles C-style backslash escapes and the SQL
+/// doubled-quote escape (e.g. '' inside '...'). Returns `{ ok, out, error, pos }`. Mirrors
+/// the relevant behaviour of `tryReadQuotedStringWithSQLStyle`.
+function decodeQuoted(input, pos, quote) {
+    const n = input.length;
+    let out = ""; /// decoded text accumulated so far; also returned on failure
+    /// pos points at the opening quote.
+    ++pos;
+    while (pos < n) {
+        const c = input[pos];
+        if (c === quote) {
+            /// Doubled quote -> literal quote.
+            if (pos + 1 < n && input[pos + 1] === quote) {
+                out += quote;
+                pos += 2;
+                continue;
+            }
+            ++pos; /// consume the closing quote
+            return { ok: true, out, error: "", pos }; /// success: `pos` is the index just past the closing quote
+        }
+        if (c === "\\") {
+            if (pos + 1 >= n) { /// backslash is the last character of the input
+                return {
+                    ok: false,
+                    out,
+                    error: "unterminated escape in quoted literal",
+                    pos, /// still the index of the dangling backslash
+                };
+            }
+            const e = input[pos + 1]; /// the character being escaped
+            switch (e) {
+                case "b":
+                    out += "\b";
+                    break;
+                case "f":
+                    out += "\f";
+                    break;
+                case "n":
+                    out += "\n";
+                    break;
+                case "r":
+                    out += "\r";
+                    break;
+                case "t":
+                    out += "\t";
+                    break;
+                case "0":
+                    out += "\0";
+                    break;
+                case "a":
+                    out += "\x07"; /// bell character
+                    break;
+                case "v":
+                    out += "\v";
+                    break;
+                /// \\, \', \", \`, and any other char: keep the literal char.
+                default:
+                    out += e;
+                    break;
+            }
+            pos += 2; /// skip the backslash and the escaped character
+            continue;
+        }
+        out += c; /// ordinary character: copied through unchanged
+        ++pos;
+    }
+    return { ok: false, out, error: "unterminated quoted literal", pos }; /// input ended before a closing quote; `pos` equals the input length
+}
+/// Tokenize the whole input. The returned array always ends with an `End` token. A malformed
+/// token stops the scan and yields one `Error` token, placed immediately before that `End`.
+export function tokenize(input) {
+    const tokens = [];
+    let pos = 0; /// index of the next unread character
+    const n = input.length;
+    const fail = (at, msg) => { /// append an Error token starting at `at`; the caller then leaves the loop
+        tokens.push({
+            type: TokenType.Error,
+            text: msg,
+            is_float: false,
+            begin: at,
+        });
+    };
+    while (pos < n) {
+        const c = input[pos];
+        if (isSpace(c)) { /// whitespace separates tokens and produces none
+            ++pos;
+            continue;
+        }
+        const start = pos; /// becomes `begin` of whatever token starts here
+        switch (c) { /// single-character punctuation; each case continues the outer loop
+            case "(":
+                tokens.push({
+                    type: TokenType.OpeningParen,
+                    text: "(",
+                    is_float: false,
+                    begin: start,
+                });
+                ++pos;
+                continue;
+            case ")":
+                tokens.push({
+                    type: TokenType.ClosingParen,
+                    text: ")",
+                    is_float: false,
+                    begin: start,
+                });
+                ++pos;
+                continue;
+            case ",":
+                tokens.push({
+                    type: TokenType.Comma,
+                    text: ",",
+                    is_float: false,
+                    begin: start,
+                });
+                ++pos;
+                continue;
+            case "=":
+                tokens.push({
+                    type: TokenType.Equals,
+                    text: "=",
+                    is_float: false,
+                    begin: start,
+                });
+                ++pos;
+                continue;
+            case "-":
+                tokens.push({
+                    type: TokenType.Minus,
+                    text: "-",
+                    is_float: false,
+                    begin: start,
+                });
+                ++pos;
+                continue;
+            default:
+                break; /// leaves only the switch; the checks below handle the remaining token kinds
+        }
+        /// A dot may start a fractional number (.5) or be a standalone separator.
+        if (c === "." && !(pos + 1 < n && isDigit(input[pos + 1]))) {
+            tokens.push({
+                type: TokenType.Dot,
+                text: ".",
+                is_float: false,
+                begin: start,
+            });
+            ++pos;
+            continue;
+        }
+        /// Quoted identifiers.
+        if (c === "`" || c === '"') {
+            const r = decodeQuoted(input, pos, c);
+            pos = r.pos; /// taken even on failure, so the End token records where decoding stopped
+            if (!r.ok) {
+                fail(start, r.error);
+                break; /// exits the while loop: nothing is tokenized after an error
+            }
+            tokens.push({
+                type: TokenType.QuotedIdent,
+                text: r.out,
+                is_float: false,
+                begin: start,
+            });
+            continue;
+        }
+        /// String literal.
+        if (c === "'") {
+            const r = decodeQuoted(input, pos, c);
+            pos = r.pos; /// taken even on failure, so the End token records where decoding stopped
+            if (!r.ok) {
+                fail(start, r.error);
+                break; /// exits the while loop: nothing is tokenized after an error
+            }
+            tokens.push({
+                type: TokenType.String,
+                text: r.out,
+                is_float: false,
+                begin: start,
+            });
+            continue;
+        }
+        /// Number.
+        if (isDigit(c) || c === ".") { /// a "." reaching this point is followed by a digit (a lone dot was emitted above)
+            let is_float = false;
+            /// integer part
+            while (pos < n && isDigit(input[pos]))
+                ++pos;
+            /// fraction
+            if (pos < n && input[pos] === ".") {
+                is_float = true;
+                ++pos;
+                while (pos < n && isDigit(input[pos]))
+                    ++pos;
+            }
+            /// exponent
+            if (pos < n && (input[pos] === "e" || input[pos] === "E")) {
+                is_float = true;
+                ++pos;
+                if (pos < n && (input[pos] === "+" || input[pos] === "-"))
+                    ++pos;
+                while (pos < n && isDigit(input[pos])) /// NOTE(review): no digit is required here, so e.g. "1e" is accepted as a Number
+                    ++pos;
+            }
+            tokens.push({
+                type: TokenType.Number,
+                text: input.substring(start, pos), /// raw text exactly as written
+                is_float,
+                begin: start,
+            });
+            continue;
+        }
+        /// Bare word / identifier / keyword.
+        if (isWordFirst(c)) {
+            while (pos < n && isWordChar(input[pos]))
+                ++pos;
+            tokens.push({
+                type: TokenType.Word,
+                text: input.substring(start, pos),
+                is_float: false,
+                begin: start,
+            });
+            continue;
+        }
+        fail(start, "unexpected character '" + c + "'"); /// `c` is one UTF-16 code unit, so a character outside the BMP is reported as half a surrogate pair
+        break; /// `pos` is left on the offending character
+    }
+    tokens.push({ type: TokenType.End, text: "", is_float: false, begin: pos }); /// appended on every path, including after an Error token
+    return tokens;
+}

package/dist/parser.d.ts ADDED Viewed

@@ -0,0 +1,11 @@
+import { type Node } from "./ast.ts";
+export interface ParseError { /// failure details carried by ParseResult.error
+    message: string; /// description of the failure
+    position: number; /// presumably an index into the input string where the failure was detected — TODO confirm in parser.js
+}
+export interface ParseResult { /// value returned by parseDataType
+    ast: Node | null; /// root of the parsed tree; presumably null when parsing failed — TODO confirm in parser.js
+    error: ParseError | null; /// presumably null when parsing succeeded — TODO confirm in parser.js
+    ok(): boolean; /// presumably reports whether parsing succeeded — TODO confirm in parser.js
+}
+export declare function parseDataType(input: string): ParseResult; /// parses a data-type string; whether every failure is reported via `error` rather than thrown is not visible here