@clickhouse/datatype-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/parser.js ADDED
@@ -0,0 +1,460 @@
1
+ /// A faithful port of ClickHouse's `ParserDataType::parseImpl`
2
+ /// (src/Parsers/ParserDataType.cpp) onto the self-contained AST in `ast.ts`.
3
+ /// The control flow deliberately tracks the original: identifier + SQL-standard
4
+ /// multi-word aliases, the Enum and Tuple special cases, then the generic
5
+ /// parametric-argument loop. AggregateFunction/SimpleAggregateFunction and the
6
+ /// JSON object-argument syntax are reported as unsupported (see parser.h).
7
+ ///
8
+ /// This is the TypeScript port of the C++ `chdt/parser.cpp`.
9
+ import { makeNode, NodeKind } from "./ast.js";
10
+ import { tokenize, TokenType } from "./lexer.js";
11
/// Upper-case the ASCII letters `a`-`z` of `s`.
/// Only those 26 letters are mapped; every other UTF-16 code unit is copied
/// through unchanged, so the result never depends on locale or Unicode case
/// mapping rules.
function toUpper(s) {
    const shift = "A".charCodeAt(0) - "a".charCodeAt(0);
    return s.replace(/[a-z]/g, (letter) => String.fromCharCode(letter.charCodeAt(0) + shift));
}
22
/// Lower-case the ASCII letters `A`-`Z` of `s`.
/// Only those 26 letters are mapped; every other UTF-16 code unit is copied
/// through unchanged, so the result never depends on locale or Unicode case
/// mapping rules.
function toLower(s) {
    const shift = "a".charCodeAt(0) - "A".charCodeAt(0);
    return s.replace(/[A-Z]/g, (letter) => String.fromCharCode(letter.charCodeAt(0) + shift));
}
33
/// True when `c` is an ASCII letter, an ASCII digit, `_` or `$`.
function isWordCharOrDollar(c) {
    if (c === "_" || c === "$")
        return true;
    const within = (lo, hi) => c >= lo && c <= hi;
    return within("0", "9") || within("A", "Z") || within("a", "z");
}
40
/// True when `u` (an already upper-cased type name) is one of the enum type
/// names `ENUM`, `ENUM8` or `ENUM16`.
function isEnumTypeUpper(u) {
    switch (u) {
        case "ENUM":
        case "ENUM8":
        case "ENUM16":
            return true;
        default:
            return false;
    }
}
43
/// Recursive-descent parser for one ClickHouse data-type string, operating on
/// the token array handed to the constructor. The last token is expected to
/// be an `End` token; `advance()` never moves past it.
///
/// Backtracking contract: every `parse*` method that reports failure (a `null`
/// node, or `ok: false`) leaves `pos` exactly where it was on entry. Callers
/// treat a failed element as "no more elements here" and then require a
/// closing paren, so an element parser that failed *after* consuming tokens
/// would make the enclosing list silently drop input (for instance the `-`
/// in `Foo(-String)`). Restoring the position on failure prevents that.
class Parser {
    /// Keywords that the column-declaration parser claims before the type;
    /// they are never accepted as a type name (compared upper-cased).
    static reserved_words = new Set([
        "NOT",
        "NULL",
        "DEFAULT",
        "MATERIALIZED",
        "EPHEMERAL",
        "ALIAS",
        "AUTO",
        "PRIMARY",
        "COMMENT",
        "CODEC",
    ]);
    tokens;
    pos = 0;
    /// First "unsupported construct" error recorded via `setHardError`; when
    /// set it takes precedence over the generic message in `run()`.
    hard_error = null;
    /// Highest token index at which any sub-parser gave up. Because failures
    /// rewind `pos`, this is what `run()` uses to point the generic error at
    /// the place parsing actually stopped.
    furthest = 0;
    constructor(tokens) {
        this.tokens = tokens;
    }
    /// Parse the whole token stream as a single data type and return a result
    /// object (see `makeResult`): the AST on success, otherwise an error with
    /// `message` and `position`.
    run() {
        /// A lexing error surfaces as an Error token; report it verbatim.
        for (const tok of this.tokens)
            if (tok.type === TokenType.Error)
                return Parser.fail(tok.begin, tok.text);
        const node = this.parseType();
        if (!node) {
            if (this.hard_error)
                return makeResult(null, this.hard_error);
            const at = Math.max(this.pos, this.furthest);
            return Parser.fail(this.tokens[at].begin, "expected a data type");
        }
        if (this.cur().type !== TokenType.End)
            return Parser.fail(this.cur().begin, "unexpected trailing input after the data type");
        return makeResult(node, null);
    }
    /// The token at the current position.
    cur() {
        return this.tokens[this.pos];
    }
    /// The type of the token at the current position.
    type() {
        return this.tokens[this.pos].type;
    }
    /// Step to the next token; a no-op once the End token is reached.
    advance() {
        if (this.tokens[this.pos].type !== TokenType.End)
            ++this.pos;
    }
    /// Build a failed result carrying `msg` at source position `at`.
    static fail(at, msg) {
        return makeResult(null, { message: msg, position: at });
    }
    /// Record an "unsupported" error; only the first one is kept.
    setHardError(at, msg) {
        if (!this.hard_error)
            this.hard_error = { message: msg, position: at };
    }
    /// Failure exit shared by the `parse*` methods: remember how far parsing
    /// got (for error reporting), rewind to `start`, and return null.
    backtrack(start) {
        if (this.pos > this.furthest)
            this.furthest = this.pos;
        this.pos = start;
        return null;
    }
    isIdentifier() {
        return (this.type() === TokenType.Word || this.type() === TokenType.QuotedIdent);
    }
    /// Consume consecutive Word tokens iff they all match `words`
    /// (case-insensitive). On a full match, advances `pos` past them and returns
    /// true; otherwise leaves `pos` unchanged and returns false.
    matchWords(words) {
        let p = this.pos;
        for (const w of words) {
            const tok = this.tokens[p];
            if (tok.type !== TokenType.Word || toUpper(tok.text) !== toUpper(w))
                return false;
            ++p;
        }
        this.pos = p;
        return true;
    }
    /// Read a single identifier (bare or quoted). Returns `{ ok, name }`;
    /// nothing is consumed when `ok` is false.
    parseIdentifier() {
        if (!this.isIdentifier())
            return { ok: false, name: "" };
        const name = this.cur().text;
        this.advance();
        return { ok: true, name };
    }
    /// Parse one data type at the current position. Returns a DataType,
    /// EnumDataType or TupleDataType node, or null with `pos` restored.
    parseType() {
        const start = this.pos;
        const id = this.parseIdentifier();
        if (!id.ok)
            return this.backtrack(start);
        let type_name = id.name;
        /// Reject quoted garbage that cannot be a type name (e.g. `x.y`).
        for (let i = 0; i < type_name.length; ++i)
            if (!isWordCharOrDollar(type_name[i]))
                return this.backtrack(start);
        const type_name_upper = toUpper(type_name);
        if (Parser.reserved_words.has(type_name_upper))
            return this.backtrack(start);
        /// SQL-standard multi-word type names.
        const suffix = this.parseTypeNameSuffix(type_name_upper);
        if (suffix !== "")
            type_name = type_name_upper + " " + suffix;
        this.skipTrailingComma();
        /// Enum special case -> EnumDataType with explicit values. If the
        /// body is not a plain `'name' = value` list, rewind to the paren and
        /// let the generic argument path handle it.
        if (isEnumTypeUpper(type_name_upper) &&
            this.type() === TokenType.OpeningParen) {
            const saved = this.pos;
            this.advance();
            const values = [];
            if (this.parseEnumValues(values) &&
                this.type() === TokenType.ClosingParen) {
                this.advance();
                const enum_node = makeNode(NodeKind.EnumDataType);
                enum_node.name = type_name;
                enum_node.values = values;
                return enum_node;
            }
            this.pos = saved;
        }
        /// Tuple special case -> TupleDataType with optional element names.
        if (type_name === "Tuple" && this.type() === TokenType.OpeningParen) {
            const tuple = this.parseTuple(type_name);
            if (tuple)
                return tuple;
            /// else: fall through to the generic path
        }
        const node = makeNode(NodeKind.DataType);
        node.name = type_name;
        if (this.type() !== TokenType.OpeningParen)
            return node;
        this.advance();
        if (!this.parseArgumentList(type_name, node.arguments))
            return this.backtrack(start);
        if (this.type() !== TokenType.ClosingParen)
            return this.backtrack(start);
        this.advance();
        node.has_argument_list = true;
        return node;
    }
    /// Returns the suffix to append for SQL-standard multi-word names, or "".
    /// `u` is the upper-cased first word; matched suffix words are consumed.
    parseTypeNameSuffix(u) {
        if (u === "NATIONAL") {
            if (this.matchWords(["CHARACTER", "LARGE", "OBJECT"]))
                return "CHARACTER LARGE OBJECT";
            if (this.matchWords(["CHARACTER", "VARYING"]))
                return "CHARACTER VARYING";
            if (this.matchWords(["CHAR", "VARYING"]))
                return "CHAR VARYING";
            if (this.matchWords(["CHARACTER"]))
                return "CHARACTER";
            if (this.matchWords(["CHAR"]))
                return "CHAR";
        }
        else if (u === "BINARY" ||
            u === "CHARACTER" ||
            u === "CHAR" ||
            u === "NCHAR") {
            if (this.matchWords(["LARGE", "OBJECT"]))
                return "LARGE OBJECT";
            if (this.matchWords(["VARYING"]))
                return "VARYING";
        }
        else if (u === "DOUBLE") {
            if (this.matchWords(["PRECISION"]))
                return "PRECISION";
        }
        else if (u.includes("INT")) {
            /// MySQL-compatible SIGNED / UNSIGNED, optionally after `(width)`.
            if (this.matchWords(["SIGNED"]))
                return "SIGNED";
            if (this.matchWords(["UNSIGNED"]))
                return "UNSIGNED";
            if (this.type() === TokenType.OpeningParen) {
                const saved = this.pos;
                this.advance();
                if (this.type() === TokenType.Number)
                    this.advance();
                if (this.type() === TokenType.ClosingParen) {
                    /// A `(width)` / `()` group is consumed and dropped even
                    /// when no SIGNED / UNSIGNED follows it.
                    this.advance();
                    if (this.matchWords(["SIGNED"]))
                        return "SIGNED";
                    if (this.matchWords(["UNSIGNED"]))
                        return "UNSIGNED";
                }
                else {
                    /// not the width form; leave the paren for generic args
                    this.pos = saved;
                }
            }
        }
        return "";
    }
    /// Skip a trailing comma right before a closing paren: `Tuple(Int, String,)`.
    skipTrailingComma() {
        if (this.type() === TokenType.Comma &&
            this.tokens[this.pos + 1].type === TokenType.ClosingParen)
            this.advance();
    }
    /// Explicit-only enum body: 'name' = value, ... appended to `values` as
    /// `{ name, value }` with a BigInt value. Returns false for auto-assigned
    /// or otherwise non-trivial enums; the caller restores `pos` in that case.
    parseEnumValues(values) {
        let first = true;
        while (true) {
            if (!first) {
                if (this.type() !== TokenType.Comma)
                    break;
                this.advance();
            }
            first = false;
            if (this.type() !== TokenType.String)
                return false;
            const name = this.cur().text;
            this.advance();
            if (this.type() !== TokenType.Equals)
                return false;
            this.advance();
            let negative = false;
            if (this.type() === TokenType.Minus) {
                negative = true;
                this.advance();
            }
            if (this.type() !== TokenType.Number || this.cur().is_float)
                return false;
            let v;
            try {
                v = BigInt(this.cur().text);
            }
            catch {
                /// BigInt() throws SyntaxError on spellings it does not
                /// accept; treat that as "not a simple enum" instead of
                /// letting the exception escape the parser.
                return false;
            }
            this.advance();
            values.push({ name, value: negative ? -v : v });
        }
        return values.length !== 0;
    }
    /// Parse a Tuple body into element types + names. Returns null (with the
    /// position restored) if it cannot, so the caller can try the generic path.
    parseTuple(type_name) {
        const saved = this.pos;
        this.advance(); /// consume '('
        const node = makeNode(NodeKind.TupleDataType);
        node.name = type_name;
        const names = [];
        let has_named = false;
        let first = true;
        while (true) {
            if (!first) {
                if (this.type() === TokenType.Comma)
                    this.advance();
                else
                    break;
            }
            first = false;
            const element_pos = this.pos;
            /// Try: identifier Type (named element)
            const id = this.parseIdentifier();
            if (id.ok) {
                const named_type = this.parseType();
                if (named_type) {
                    names.push(id.name);
                    node.arguments.push(named_type);
                    has_named = true;
                    continue;
                }
            }
            /// Else: just Type (unnamed element)
            this.pos = element_pos;
            const plain_type = this.parseType();
            if (!plain_type)
                break; /// `pos` is back at the start of this element
            names.push("");
            node.arguments.push(plain_type);
        }
        if (this.type() === TokenType.ClosingParen && node.arguments.length !== 0) {
            this.advance();
            node.has_argument_list = true;
            if (has_named)
                node.element_names = names;
            return node;
        }
        return this.backtrack(saved);
    }
    /// The generic comma-separated argument list inside `Type(...)`; parsed
    /// arguments are appended to `out`. Returns false only for constructs
    /// reported as unsupported (a hard error is recorded). Stops at the first
    /// position where no further argument parses; the caller checks for ')'.
    parseArgumentList(type_name, out) {
        const lower = toLower(type_name);
        if (type_name === "AggregateFunction" ||
            type_name === "SimpleAggregateFunction") {
            this.setHardError(this.cur().begin, type_name + " is not supported by this parser yet");
            return false;
        }
        if (lower === "json") {
            this.setHardError(this.cur().begin, "JSON typed/object arguments are not supported by this parser yet");
            return false;
        }
        let arg_num = 0;
        while (true) {
            if (arg_num > 0) {
                if (this.type() === TokenType.Comma)
                    this.advance();
                else
                    break;
            }
            let arg;
            if (type_name === "Dynamic")
                arg = this.parseEqualsArgument();
            else if (type_name === "Nested")
                arg = this.parseNameTypePair();
            else if (type_name === "Tuple")
                arg = this.parseNameTypePairOrType();
            else
                arg = this.parseGenericArgument();
            /// A failed element parser has rewound to the element's first
            /// token, so anything other than ')' there fails in the caller.
            if (!arg)
                break;
            out.push(arg);
            ++arg_num;
        }
        return true;
    }
    /// `identifier = number` -> Function equals(Identifier, Literal).
    /// Returns null with `pos` restored when the shape does not match.
    parseEqualsArgument() {
        const start = this.pos;
        const id = this.parseIdentifier();
        if (!id.ok)
            return this.backtrack(start);
        if (this.type() !== TokenType.Equals)
            return this.backtrack(start);
        this.advance();
        const number = this.parseNumberLiteral();
        if (!number)
            return this.backtrack(start);
        const idNode = makeNode(NodeKind.Identifier);
        idNode.name = id.name;
        const fn = makeNode(NodeKind.Function);
        fn.name = "equals";
        fn.is_operator = true;
        fn.arguments = [idNode, number];
        return fn;
    }
    /// `name Type` -> NameTypePair (Nested elements).
    /// Returns null with `pos` restored when either part is missing.
    parseNameTypePair() {
        const start = this.pos;
        const id = this.parseIdentifier();
        if (!id.ok)
            return this.backtrack(start);
        const t = this.parseType();
        if (!t)
            return this.backtrack(start);
        const node = makeNode(NodeKind.NameTypePair);
        node.name = id.name;
        node.data_type = t;
        return node;
    }
    /// `name Type` if that parses, otherwise a bare type.
    parseNameTypePairOrType() {
        /// parseNameTypePair rewinds on failure, so the bare-type attempt
        /// starts from the same token.
        return this.parseNameTypePair() ?? this.parseType();
    }
    /// Generic argument: a scalar literal (optionally `lit = lit`), or a type.
    /// Returns null with `pos` restored when neither form parses.
    parseGenericArgument() {
        const start = this.pos;
        const lit = this.parseScalarLiteral();
        if (!lit)
            return this.parseType();
        if (this.type() !== TokenType.Equals)
            return lit;
        this.advance();
        const rhs = this.parseScalarLiteral();
        if (!rhs)
            return this.backtrack(start);
        const fn = makeNode(NodeKind.Function);
        fn.name = "equals";
        fn.is_operator = true;
        fn.arguments = [lit, rhs];
        return fn;
    }
    /// An optionally negated number -> Literal node whose `value` is the
    /// token text (with a leading "-" when negated) and whose `value_type` is
    /// "Float64", "Int64" (negated integer) or "UInt64". Returns null with
    /// `pos` restored otherwise, so a lone `-` is never swallowed.
    parseNumberLiteral() {
        const start = this.pos;
        let negative = false;
        if (this.type() === TokenType.Minus) {
            negative = true;
            this.advance();
        }
        if (this.type() !== TokenType.Number)
            return this.backtrack(start);
        const node = makeNode(NodeKind.Literal);
        if (this.cur().is_float)
            node.value_type = "Float64";
        else
            node.value_type = negative ? "Int64" : "UInt64";
        node.value = (negative ? "-" : "") + this.cur().text;
        this.advance();
        return node;
    }
    /// A scalar literal: number (optionally signed) or string.
    parseScalarLiteral() {
        if (this.type() === TokenType.Number || this.type() === TokenType.Minus)
            return this.parseNumberLiteral();
        if (this.type() === TokenType.String) {
            const node = makeNode(NodeKind.Literal);
            node.value_type = "String";
            node.value = this.cur().text;
            this.advance();
            return node;
        }
        return null;
    }
}
446
/// Bundle a parse outcome: `ast` (the node, or null on failure), `error`
/// (null on success) and an `ok()` method that is true iff `ast` is not null.
function makeResult(ast, error) {
    const outcome = { ast, error };
    return Object.assign(outcome, {
        ok() {
            return this.ast !== null;
        },
    });
}
455
/// Tokenize `input` and parse it as exactly one data type, returning whatever
/// `Parser.run()` produces for that token stream. The entire input must be one
/// type: tokens left over after a complete type are reported as an error.
export function parseDataType(input) {
    const tokens = tokenize(input);
    return new Parser(tokens).run();
}
package/package.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "name": "@clickhouse/datatype-parser",
3
+ "version": "0.1.0",
4
+ "description": "Standalone ClickHouse data-type string parser — a TypeScript port of the chdt C++ library.",
5
+ "license": "Apache-2.0",
6
+ "type": "module",
7
+ "main": "dist/index.js",
8
+ "types": "dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "types": "./dist/index.d.ts",
12
+ "import": "./dist/index.js"
13
+ }
14
+ },
15
+ "files": [
16
+ "dist",
17
+ "README.md",
18
+ "LICENSE"
19
+ ],
20
+ "sideEffects": false,
21
+ "engines": {
22
+ "node": ">=18.0.0"
23
+ },
24
+ "repository": {
25
+ "type": "git",
26
+ "url": "git+https://github.com/ClickHouse/clickhouse-js.git",
27
+ "directory": "packages/datatype-parser"
28
+ },
29
+ "keywords": [
30
+ "clickhouse",
31
+ "data-type",
32
+ "parser",
33
+ "ast",
34
+ "rowbinary"
35
+ ],
36
+ "publishConfig": {
37
+ "access": "public"
38
+ },
39
+ "scripts": {
40
+ "build": "tsc -p tsconfig.build.json",
41
+ "typecheck": "tsc -p tsconfig.json --noEmit",
42
+ "lint": "eslint --max-warnings=0 .",
43
+ "lint:fix": "eslint . --fix",
44
+ "prepack": "npm run build",
45
+ "parse": "node tool/main.ts",
46
+ "test": "node --import tsx --test test/*.test.ts",
47
+ "test:oracle": "tsx test/oracle_compare.ts",
48
+ "test:unsupported": "tsx test/check_unsupported.ts",
49
+ "snapshot:update": "tsx test/update_snapshots.ts",
50
+ "validate:live": "tsx test/validate_types_live.ts"
51
+ },
52
+ "devDependencies": {
53
+ "tsx": "^4.22.4",
54
+ "typescript": "^5.6.0"
55
+ }
56
+ }