@clickhouse/datatype-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/json.js ADDED
@@ -0,0 +1,246 @@
1
+ /// Serialize a node tree to JSON, matching the server's `formatASTAsJSON`
2
+ /// shape for data types. This is a faithful TypeScript port of the C++
3
+ /// `src/json.cpp`. `indent` < 0 produces compact output; >= 0 produces pretty
4
+ /// output with that many spaces per level.
5
+ ///
6
+ /// Byte-faithfulness: the C++ `escapeTo` iterates over the `unsigned char`
7
+ /// bytes of the input. To match it exactly for multibyte UTF-8 content we
8
+ /// accumulate the whole document into a byte buffer (`number[]` of byte
9
+ /// values): structural characters are ASCII, strings are encoded to their
10
+ /// UTF-8 bytes, `"` and `\` are backslash-escaped, and bytes < 0x20 are
11
+ /// special-cased (the short ASCII escapes plus the `\u%04x` fallback). Every
12
+ /// other byte is pushed through verbatim. At the end we decode the buffer back to a (byte-identical) JS string.
13
+ import { NodeKind } from "./ast.js";
14
+ const encoder = new TextEncoder();
15
+ const decoder = new TextDecoder("utf-8");
16
/// Append `s` to the byte buffer `out` as a double-quoted JSON string literal.
///
/// `s` is encoded to UTF-8 and handled one byte at a time: `"` and `\` get a
/// backslash, the control bytes \b \f \n \r \t use their short escapes, any
/// other byte below 0x20 becomes `\u00xx` (lowercase hex, 4 digits), and every
/// remaining byte (including UTF-8 lead/continuation bytes) is copied as-is.
function escapeTo(out, s) {
    const DOUBLE_QUOTE = 0x22;
    const BACKSLASH = 0x5c;
    out.push(DOUBLE_QUOTE);
    const utf8 = encoder.encode(s);
    for (let i = 0; i < utf8.length; i++) {
        const byte = utf8[i];
        if (byte === DOUBLE_QUOTE) {
            pushAscii(out, '\\"');
        }
        else if (byte === BACKSLASH) {
            pushAscii(out, "\\\\");
        }
        else if (byte === 0x08) {
            pushAscii(out, "\\b");
        }
        else if (byte === 0x0c) {
            pushAscii(out, "\\f");
        }
        else if (byte === 0x0a) {
            pushAscii(out, "\\n");
        }
        else if (byte === 0x0d) {
            pushAscii(out, "\\r");
        }
        else if (byte === 0x09) {
            pushAscii(out, "\\t");
        }
        else if (byte < 0x20) {
            /// Remaining control bytes: \u%04x with lowercase hex digits.
            pushAscii(out, "\\u" + byte.toString(16).padStart(4, "0"));
        }
        else {
            out.push(byte);
        }
    }
    out.push(DOUBLE_QUOTE);
}
53
+ /// Append the ASCII bytes of `s` (used only for structural/escape text, which
54
+ /// is always ASCII).
55
+ function pushAscii(out, s) {
56
+ for (let i = 0; i < s.length; i++) {
57
+ out.push(s.charCodeAt(i));
58
+ }
59
+ }
60
/// Output sink for the serializer: the byte buffer being built plus the
/// layout policy (pretty vs. compact).
class Writer {
    /// Bytes emitted so far.
    out;
    /// Spaces per nesting level; any negative value selects compact output.
    indent;
    constructor(indent) {
        this.out = [];
        this.indent = indent;
    }
    /// Pretty mode: emit '\n' followed by `indent * depth` spaces.
    /// Compact mode: emit nothing.
    newlineIndent(depth) {
        const compact = this.indent < 0;
        if (compact) {
            return;
        }
        this.out.push(0x0a /* '\n' */);
        for (let remaining = this.indent * depth; remaining > 0; remaining--) {
            this.out.push(0x20 /* ' ' */);
        }
    }
    /// Key/value separator: ":" in compact mode, ": " otherwise.
    colon() {
        this.out.push(0x3a /* ':' */);
        if (!(this.indent < 0)) {
            this.out.push(0x20 /* ' ' */);
        }
    }
}
79
/// Start an object member: emit a ',' unless this is the first member of the
/// object, then the line break/indent for `depth + 1`, the quoted `key`, and
/// the colon separator. `flag.first` is always left as false afterwards
/// (mirrors the C++ `bool & first`).
function writeKey(w, key, flag, depth) {
    const isFirstMember = flag.first;
    if (!isFirstMember) {
        w.out.push(0x2c /* ',' */);
    }
    flag.first = false;
    w.newlineIndent(depth + 1);
    escapeTo(w.out, key);
    w.colon();
}
89
/// Emit `items` as a JSON array of node objects. An empty list is written as
/// `[]` with no inner line breaks; otherwise each element goes on its own
/// line at `depth + 1` (pretty mode) and the closing bracket at `depth`.
function writeArray(w, items, depth) {
    const buf = w.out;
    if (items.length === 0) {
        buf.push(0x5b /* '[' */, 0x5d /* ']' */);
        return;
    }
    const childDepth = depth + 1;
    buf.push(0x5b /* '[' */);
    let needsComma = false;
    for (const child of items) {
        if (needsComma) {
            buf.push(0x2c /* ',' */);
        }
        needsComma = true;
        w.newlineIndent(childDepth);
        writeNode(w, child, childDepth);
    }
    w.newlineIndent(depth);
    buf.push(0x5d /* ']' */);
}
106
/// Emit `items` as a JSON array of escaped strings. Layout matches
/// `writeArray`: `[]` for an empty list, otherwise one element per line at
/// `depth + 1` (pretty mode) with the closing bracket at `depth`.
function writeStringArray(w, items, depth) {
    const buf = w.out;
    if (items.length === 0) {
        buf.push(0x5b /* '[' */, 0x5d /* ']' */);
        return;
    }
    buf.push(0x5b /* '[' */);
    let needsComma = false;
    for (const text of items) {
        if (needsComma) {
            buf.push(0x2c /* ',' */);
        }
        needsComma = true;
        w.newlineIndent(depth + 1);
        escapeTo(buf, text);
    }
    w.newlineIndent(depth);
    buf.push(0x5d /* ']' */);
}
123
/// Emit the `value` of a literal node.
///
/// Only `Float64` is written as a bare JSON number: its text is copied
/// through unquoted, so it is assumed to already be a valid JSON number.
/// Every other value type (String, UInt64, Int64, and any fallback) is
/// written as an escaped JSON string. 64-bit integers are strings because
/// values above 2^53 lose precision under JS `JSON.parse`.
function writeLiteralValue(w, node) {
    const isFloat = node.value_type === "Float64";
    if (isFloat) {
        pushAscii(w.out, node.value);
        return;
    }
    escapeTo(w.out, node.value);
}
137
/// Emit one AST node as a JSON object at nesting level `depth`.
///
/// Members are written inline, in a fixed order per node kind, starting with
/// the "type" discriminator. Optional members (argument lists, element names,
/// name parts, `is_operator`, `data_type`) are omitted when absent/empty.
/// A node whose kind matches none of the branches produces an empty object.
function writeNode(w, node, depth) {
    const buf = w.out;
    const childDepth = depth + 1;
    const members = { first: true };
    /// Start a member of this node's object.
    const member = (name) => writeKey(w, name, members, depth);
    /// Emit a member whose value is an escaped string.
    const stringMember = (name, text) => {
        member(name);
        escapeTo(buf, text);
    };
    buf.push(0x7b /* '{' */);
    const kind = node.kind;
    if (kind === NodeKind.DataType) {
        stringMember("type", "DataType");
        stringMember("name", node.name);
        if (node.has_argument_list) {
            member("arguments");
            writeArray(w, node.arguments, childDepth);
        }
    }
    else if (kind === NodeKind.EnumDataType) {
        stringMember("type", "EnumDataType");
        stringMember("name", node.name);
        member("values");
        if (node.values.length === 0) {
            buf.push(0x5b /* '[' */, 0x5d /* ']' */);
        }
        else {
            /// Each enum entry is a small {name, value} object one level
            /// below the "values" array.
            const entryDepth = depth + 2;
            buf.push(0x5b /* '[' */);
            let needsComma = false;
            for (const entry of node.values) {
                if (needsComma) {
                    buf.push(0x2c /* ',' */);
                }
                needsComma = true;
                w.newlineIndent(entryDepth);
                buf.push(0x7b /* '{' */);
                const entryMembers = { first: true };
                writeKey(w, "name", entryMembers, entryDepth);
                escapeTo(buf, entry.name);
                writeKey(w, "value", entryMembers, entryDepth);
                pushAscii(buf, entry.value.toString());
                w.newlineIndent(entryDepth);
                buf.push(0x7d /* '}' */);
            }
            w.newlineIndent(childDepth);
            buf.push(0x5d /* ']' */);
        }
    }
    else if (kind === NodeKind.TupleDataType) {
        stringMember("type", "TupleDataType");
        stringMember("name", node.name);
        if (node.has_argument_list) {
            member("arguments");
            writeArray(w, node.arguments, childDepth);
        }
        if (node.element_names.length > 0) {
            member("element_names");
            writeStringArray(w, node.element_names, childDepth);
        }
    }
    else if (kind === NodeKind.NameTypePair) {
        stringMember("type", "NameTypePair");
        stringMember("name", node.name);
        if (node.data_type) {
            member("data_type");
            writeNode(w, node.data_type, childDepth);
        }
    }
    else if (kind === NodeKind.Literal) {
        stringMember("type", "Literal");
        stringMember("value_type", node.value_type);
        member("value");
        writeLiteralValue(w, node);
    }
    else if (kind === NodeKind.Function) {
        stringMember("type", "Function");
        stringMember("name", node.name);
        if (node.is_operator) {
            member("is_operator");
            pushAscii(buf, "true");
        }
        member("arguments");
        writeArray(w, node.arguments, childDepth);
    }
    else if (kind === NodeKind.Identifier) {
        stringMember("type", "Identifier");
        stringMember("name", node.name);
        if (node.name_parts.length > 0) {
            member("name_parts");
            writeStringArray(w, node.name_parts, childDepth);
        }
    }
    w.newlineIndent(depth);
    buf.push(0x7d /* '}' */);
}
242
/// Serialize the tree rooted at `node` to a JSON string.
///
/// `indent` is the number of spaces per nesting level (default 2); a negative
/// value produces compact single-line output. The document is built as UTF-8
/// bytes and decoded to a JS string at the end.
export function toJSON(node, indent = 2) {
    const writer = new Writer(indent);
    writeNode(writer, node, 0);
    const bytes = new Uint8Array(writer.out);
    return decoder.decode(bytes);
}
@@ -0,0 +1,22 @@
1
/**
 * Token kinds produced by the lexer. Every member's value is its own name.
 */
export declare const TokenType: {
    /** End of input. */
    readonly End: "End";
    /** Bare identifier or keyword. */
    readonly Word: "Word";
    /** Backtick- or double-quoted identifier. */
    readonly QuotedIdent: "QuotedIdent";
    /** Numeric literal. */
    readonly Number: "Number";
    /** Single-quoted string literal. */
    readonly String: "String";
    /** `(` */
    readonly OpeningParen: "OpeningParen";
    /** `)` */
    readonly ClosingParen: "ClosingParen";
    /** `,` */
    readonly Comma: "Comma";
    /** `=` */
    readonly Equals: "Equals";
    /** `-` */
    readonly Minus: "Minus";
    /** `.` */
    readonly Dot: "Dot";
    /** Malformed token. */
    readonly Error: "Error";
};
/** Union of all token-kind strings; shares its name with the value above. */
export type TokenType = (typeof TokenType)[keyof typeof TokenType];
16
/** A single lexed token. */
export interface Token {
    /** Kind of token. */
    type: TokenType;
    /**
     * Token text. For `Error` tokens this holds the error message; quoted
     * tokens carry their decoded content.
     */
    text: string;
    /** True for a numeric token that has a fraction or an exponent part. */
    is_float: boolean;
    /** Index in the input string at which the token starts. */
    begin: number;
}
22
/**
 * Tokenize a data-type string.
 *
 * @param input - The text to split into tokens.
 * @returns The tokens in input order; the last element is an `End` token.
 */
export declare function tokenize(input: string): Token[];
package/dist/lexer.js ADDED
@@ -0,0 +1,273 @@
1
+ /// A small purpose-built tokenizer for ClickHouse data-type strings.
2
+ ///
3
+ /// Type strings use a tiny slice of the SQL grammar — identifiers (bare,
4
+ /// backtick- or double-quoted), single-quoted string literals, numbers, and a
5
+ /// handful of punctuation tokens. Rather than vendor the full ClickHouse
6
+ /// `Lexer` (and its `UTF8Helpers` / `find_symbols` dependencies), this covers
7
+ /// exactly that slice, keeping the library free of any ClickHouse headers.
8
+ ///
9
+ /// This is a faithful TypeScript port of the original C++ lexer.
10
/// Token kinds, as a plain object of string constants (each value equals its
/// key). It is deliberately not a TS `enum`: the source stays erasable and
/// runs under Node's native type-stripping, which rejects `enum`. A companion
/// type of the same name makes `TokenType` usable as both a value and a type.
export const TokenType = {
    /// end of input
    End: "End",
    /// bare identifier / keyword, e.g. UInt8, Array, SIGNED
    Word: "Word",
    /// `backtick` or "double"-quoted identifier (decoded)
    QuotedIdent: "QuotedIdent",
    /// numeric literal (raw text, no sign)
    Number: "Number",
    /// single-quoted string literal (decoded)
    String: "String",
    /// (
    OpeningParen: "OpeningParen",
    /// )
    ClosingParen: "ClosingParen",
    /// ,
    Comma: "Comma",
    /// =
    Equals: "Equals",
    /// -
    Minus: "Minus",
    /// .
    Dot: "Dot",
    /// malformed token; `text` holds the message
    Error: "Error",
};
27
/// True when `c` is one of the six ASCII whitespace characters:
/// space, \t, \n, \r, \f, \v.
function isSpace(c) {
    switch (c) {
        case " ":
        case "\t":
        case "\n":
        case "\r":
        case "\f":
        case "\v":
            return true;
        default:
            return false;
    }
}
35
/// True when `c` is an ASCII decimal digit ('0'..'9').
function isDigit(c) {
    const atLeastZero = "0" <= c;
    const atMostNine = "9" >= c;
    return atLeastZero && atMostNine;
}
38
/// True when `c` may start a bare word: an ASCII letter, '_' or '$'.
function isWordFirst(c) {
    if (c === "_" || c === "$") {
        return true;
    }
    const isLower = "a" <= c && c <= "z";
    const isUpper = "A" <= c && c <= "Z";
    return isLower || isUpper;
}
41
/// True when `c` may continue a bare word: a decimal digit or any character
/// that may start one.
function isWordChar(c) {
    if (isDigit(c)) {
        return true;
    }
    return isWordFirst(c);
}
44
/// Decode a quoted token (string literal or quoted identifier) whose opening
/// quote is at `input[pos]`; `quote` is that quote character.
///
/// Inside the body, a doubled quote stands for one literal quote (the SQL
/// style, e.g. '' inside '...'), and a backslash introduces a C-style escape:
/// \b \f \n \r \t \0 \a \v map to their control characters, and a backslash
/// before any other character yields that character unchanged.
///
/// Returns `{ ok, out, error, pos }`. On success `pos` is just past the
/// closing quote. On failure `error` is the message, `out` is what was
/// decoded so far, and `pos` is where decoding stopped.
function decodeQuoted(input, pos, quote) {
    const controlEscapes = new Map([
        ["b", "\b"],
        ["f", "\f"],
        ["n", "\n"],
        ["r", "\r"],
        ["t", "\t"],
        ["0", "\0"],
        ["a", "\x07"],
        ["v", "\v"],
    ]);
    const end = input.length;
    let decoded = "";
    /// Skip the opening quote.
    let cursor = pos + 1;
    while (cursor < end) {
        const ch = input[cursor];
        if (ch === quote) {
            const doubled = cursor + 1 < end && input[cursor + 1] === quote;
            if (!doubled) {
                /// Closing quote: report the position just past it.
                return { ok: true, out: decoded, error: "", pos: cursor + 1 };
            }
            decoded += quote;
            cursor += 2;
        }
        else if (ch === "\\") {
            if (cursor + 1 >= end) {
                return {
                    ok: false,
                    out: decoded,
                    error: "unterminated escape in quoted literal",
                    pos: cursor,
                };
            }
            const escaped = input[cursor + 1];
            const control = controlEscapes.get(escaped);
            /// \\, \', \", \`, and any other char: keep the literal char.
            decoded += control === undefined ? escaped : control;
            cursor += 2;
        }
        else {
            decoded += ch;
            cursor += 1;
        }
    }
    return { ok: false, out: decoded, error: "unterminated quoted literal", pos: cursor };
}
113
/// Tokenize the whole input.
///
/// The returned array always ends with an `End` token whose `begin` is the
/// position at which lexing stopped. When a malformed token is met, lexing
/// stops there and one `Error` token (its `text` is the message) is emitted
/// immediately before that final `End`.
///
/// Positions (`begin`) are indices into the JS string `input`.
export function tokenize(input) {
    const tokens = [];
    const n = input.length;
    let pos = 0;
    /// Single-character tokens that need no lookahead.
    const punctuation = new Map([
        ["(", TokenType.OpeningParen],
        [")", TokenType.ClosingParen],
        [",", TokenType.Comma],
        ["=", TokenType.Equals],
        ["-", TokenType.Minus],
    ]);
    /// Append one token; only numbers ever pass `is_float`.
    const emit = (type, text, begin, is_float = false) => {
        tokens.push({ type, text, is_float, begin });
    };
    while (pos < n) {
        const c = input[pos];
        if (isSpace(c)) {
            ++pos;
            continue;
        }
        const start = pos;
        const simple = punctuation.get(c);
        if (simple !== undefined) {
            emit(simple, c, start);
            ++pos;
            continue;
        }
        /// A dot may start a fractional number (.5) or be a standalone separator.
        if (c === "." && !(pos + 1 < n && isDigit(input[pos + 1]))) {
            emit(TokenType.Dot, ".", start);
            ++pos;
            continue;
        }
        /// Quoted identifiers (` or ") and string literals (') share one
        /// decoder; only the resulting token type differs.
        if (c === "`" || c === '"' || c === "'") {
            const r = decodeQuoted(input, pos, c);
            pos = r.pos;
            if (!r.ok) {
                emit(TokenType.Error, r.error, start);
                break;
            }
            emit(c === "'" ? TokenType.String : TokenType.QuotedIdent, r.out, start);
            continue;
        }
        /// Number: digits, optional fraction, optional exponent.
        if (isDigit(c) || c === ".") {
            let is_float = false;
            /// integer part
            while (pos < n && isDigit(input[pos]))
                ++pos;
            /// fraction
            if (pos < n && input[pos] === ".") {
                is_float = true;
                ++pos;
                while (pos < n && isDigit(input[pos]))
                    ++pos;
            }
            /// exponent
            /// NOTE(review): the marker and sign are consumed even when no
            /// digits follow ("1e", "1e+"); kept unchanged, since this file
            /// is described as a faithful port of the C++ lexer. Confirm.
            if (pos < n && (input[pos] === "e" || input[pos] === "E")) {
                is_float = true;
                ++pos;
                if (pos < n && (input[pos] === "+" || input[pos] === "-"))
                    ++pos;
                while (pos < n && isDigit(input[pos]))
                    ++pos;
            }
            emit(TokenType.Number, input.substring(start, pos), start, is_float);
            continue;
        }
        /// Bare word / identifier / keyword.
        if (isWordFirst(c)) {
            while (pos < n && isWordChar(input[pos]))
                ++pos;
            emit(TokenType.Word, input.substring(start, pos), start);
            continue;
        }
        /// Report the whole code point. `input[pos]` is one UTF-16 code unit,
        /// which for a non-BMP character would put a lone surrogate half into
        /// the message.
        const offending = String.fromCodePoint(input.codePointAt(pos));
        emit(TokenType.Error, "unexpected character '" + offending + "'", start);
        break;
    }
    emit(TokenType.End, "", pos);
    return tokens;
}
@@ -0,0 +1,11 @@
1
+ import { type Node } from "./ast.ts";
2
/** Description of a parse failure. */
export interface ParseError {
    /** Human-readable error message. */
    message: string;
    /**
     * Location of the error.
     * NOTE(review): presumably an index into the input string; confirm
     * against the parser implementation.
     */
    position: number;
}
6
/** Outcome of `parseDataType`. */
export interface ParseResult {
    /** Root of the parsed tree, or `null`. */
    ast: Node | null;
    /** Failure details, or `null`. */
    error: ParseError | null;
    /**
     * Success predicate.
     * NOTE(review): presumably true exactly when `error` is `null` and `ast`
     * is set; confirm against the parser implementation.
     */
    ok(): boolean;
}
11
/**
 * Parse a data-type string.
 *
 * @param input - The type text to parse.
 * @returns A `ParseResult` carrying the tree and/or the error.
 *   NOTE(review): the signature suggests failures are reported through
 *   `error` rather than thrown; confirm against the implementation.
 */
export declare function parseDataType(input: string): ParseResult;