tinybird 0.0.1.dev14__py3-none-any.whl → 0.0.1.dev16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tinybird might be problematic; see the registry's advisory page for this release for details.
- tinybird/client.py +3 -1
- tinybird/feedback_manager.py +12 -1
- tinybird/{tb/modules/prompts.py → prompts.py} +26 -42
- tinybird/tb/cli.py +1 -0
- tinybird/tb/modules/build.py +17 -93
- tinybird/tb/modules/build_shell.py +133 -0
- tinybird/tb/modules/cli.py +14 -21
- tinybird/tb/modules/create.py +245 -10
- tinybird/tb/modules/datafile/build.py +29 -21
- tinybird/tb/modules/datafile/build_pipe.py +4 -0
- tinybird/tb/modules/datafile/common.py +989 -49
- tinybird/tb/modules/datafile/parse_datasource.py +1 -0
- tinybird/tb/modules/llm.py +57 -35
- tinybird/tb/modules/local.py +2 -47
- tinybird/tb/modules/local_common.py +54 -0
- tinybird/tb/modules/login.py +0 -1
- tinybird/tb/modules/mock.py +6 -5
- tinybird/tb/modules/test.py +104 -73
- {tinybird-0.0.1.dev14.dist-info → tinybird-0.0.1.dev16.dist-info}/METADATA +1 -1
- {tinybird-0.0.1.dev14.dist-info → tinybird-0.0.1.dev16.dist-info}/RECORD +23 -22
- tinybird/check_pypi.py +0 -25
- {tinybird-0.0.1.dev14.dist-info → tinybird-0.0.1.dev16.dist-info}/WHEEL +0 -0
- {tinybird-0.0.1.dev14.dist-info → tinybird-0.0.1.dev16.dist-info}/entry_points.txt +0 -0
- {tinybird-0.0.1.dev14.dist-info → tinybird-0.0.1.dev16.dist-info}/top_level.txt +0 -0
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import functools
|
|
1
2
|
import glob
|
|
2
3
|
import itertools
|
|
3
4
|
import os
|
|
@@ -5,23 +6,55 @@ import os.path
|
|
|
5
6
|
import pprint
|
|
6
7
|
import re
|
|
7
8
|
import shlex
|
|
9
|
+
import string
|
|
8
10
|
import textwrap
|
|
9
11
|
import traceback
|
|
10
12
|
from collections import namedtuple
|
|
13
|
+
from dataclasses import dataclass
|
|
11
14
|
from io import StringIO
|
|
12
15
|
from pathlib import Path
|
|
13
16
|
from string import Template
|
|
14
|
-
from typing import Any, Callable, Dict, List, Optional, Tuple, cast
|
|
17
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
|
|
15
18
|
|
|
16
19
|
import click
|
|
17
20
|
from mypy_extensions import KwArg, VarArg
|
|
18
21
|
|
|
19
22
|
from tinybird.ch_utils.engine import ENABLED_ENGINES
|
|
20
23
|
from tinybird.feedback_manager import FeedbackManager
|
|
21
|
-
from tinybird.sql import parse_indexes_structure, parse_table_structure, schema_to_sql_columns
|
|
22
24
|
from tinybird.tb.modules.datafile.exceptions import IncludeFileNotFoundException, ParseException, ValidationException
|
|
23
25
|
from tinybird.tb.modules.exceptions import CLIPipeException
|
|
24
26
|
|
|
27
|
+
# Code from sql.py has been duplicated so I can change it without breaking absolutely everything in the app
|
|
28
|
+
# I'll try not to make logic changes, just error reporting changes
|
|
29
|
+
# from tinybird.sql import parse_indexes_structure, parse_table_structure, schema_to_sql_columns
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class DatafileSyntaxError(Exception):
    """Base error for syntax problems found while parsing a datafile.

    Carries the 1-based line number and position where the problem was
    detected, plus an optional human-readable hint that is appended to the
    rendered message.
    """

    def __init__(self, message: str, lineno: int, pos: int, hint: Optional[str] = None):
        super().__init__(message)
        self.message = message
        self.hint = hint
        self.lineno = lineno
        self.pos = pos

    def __str__(self) -> str:
        rendered = f"{self.message} at {self.lineno}:{self.pos}."
        if self.hint:
            rendered += f" {self.hint}."
        return rendered
|
+
|
|
45
|
+
class SchemaSyntaxError(DatafileSyntaxError):
    """Syntax error found while parsing the SCHEMA section of a datafile.

    The previous ``__init__`` override only forwarded its arguments unchanged
    to ``DatafileSyntaxError``, so it has been removed: construction signature
    and rendering behaviour are inherited untouched.
    """
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class IndexesSyntaxError(DatafileSyntaxError):
    """Syntax error found while parsing the INDEXES section of a datafile.

    The previous ``__init__`` override only forwarded its arguments unchanged
    to ``DatafileSyntaxError``, so it has been removed: construction signature
    and rendering behaviour are inherited untouched.
    """
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class MalformedColumnError(Exception):
    # Marker exception for malformed column definitions. Not raised anywhere
    # in this chunk — presumably raised/caught by parsing callers; TODO confirm.
    pass
|
57
|
+
|
|
25
58
|
|
|
26
59
|
class PipeTypes:
|
|
27
60
|
MATERIALIZED = "materialized"
|
|
@@ -89,11 +122,13 @@ TB_LOCAL_WORKSPACE_NAME = "Tinybird_Local_Testing"
|
|
|
89
122
|
|
|
90
123
|
pp = pprint.PrettyPrinter()
|
|
91
124
|
|
|
125
|
+
# Characters accepted in a (non-backticked) column name by the schema parser.
valid_chars_name: str = string.ascii_letters + string.digits + "._`*<>+-'"
# Characters accepted inside expressions (types, DEFAULT/CODEC bodies): the
# name characters plus brackets, punctuation, and whitespace.
valid_chars_fn: str = valid_chars_name + "[](),=!?:/ \n\t\r"
|
|
127
|
+
|
|
92
128
|
|
|
93
129
|
class Datafile:
|
|
94
130
|
def __init__(self) -> None:
|
|
95
131
|
self.maintainer: Optional[str] = None
|
|
96
|
-
self.sources: List[str] = []
|
|
97
132
|
self.nodes: List[Dict[str, Any]] = []
|
|
98
133
|
self.tokens: List[Dict[str, Any]] = []
|
|
99
134
|
self.version: Optional[int] = None
|
|
@@ -104,15 +139,6 @@ class Datafile:
|
|
|
104
139
|
self.warnings: List[str] = []
|
|
105
140
|
self.filtering_tags: Optional[List[str]] = None
|
|
106
141
|
|
|
107
|
-
def validate(self) -> None:
|
|
108
|
-
for x in self.nodes:
|
|
109
|
-
if not x["name"].strip():
|
|
110
|
-
raise ValidationException("invalid node name, can't be empty")
|
|
111
|
-
if "sql" not in x:
|
|
112
|
-
raise ValidationException("node %s must have a SQL query" % x["name"])
|
|
113
|
-
if self.version is not None and (not isinstance(self.version, int) or self.version < 0):
|
|
114
|
-
raise ValidationException("version must be a positive integer")
|
|
115
|
-
|
|
116
142
|
def is_equal(self, other):
|
|
117
143
|
if len(self.nodes) != len(other.nodes):
|
|
118
144
|
return False
|
|
@@ -166,6 +192,848 @@ def parse_tags(tags: str) -> Tuple[str, List[str]]:
|
|
|
166
192
|
return all_kv_tags, filtering_tags
|
|
167
193
|
|
|
168
194
|
|
|
195
|
+
@dataclass
class TableIndex:
    """Defines a CH table INDEX.

    Attributes mirror the ClickHouse skipping-index clause:
    ``name expr TYPE type_full [GRANULARITY granularity]``.
    """

    name: str
    expr: str
    type_full: str
    granularity: Optional[str] = None

    def to_datafile(self) -> str:
        """Render the index as it appears in a datafile INDEXES section.

        Fix: the previous implementation always appended the GRANULARITY slot,
        leaving a trailing space when ``granularity`` was None. Joining only
        the present parts avoids the stray whitespace.
        """
        parts = [self.name, self.expr, "TYPE", self.type_full]
        if self.granularity:
            parts.append(f"GRANULARITY {self.granularity}")
        return " ".join(parts)

    def to_sql(self) -> str:
        """Render the full SQL INDEX clause."""
        return f"INDEX {self.to_datafile()}"

    def add_index_sql(self) -> str:
        """ALTER fragment to add this index."""
        return f"ADD {self.to_sql()}"

    def drop_index_sql(self) -> str:
        """ALTER fragment to drop this index if it exists."""
        return f"DROP INDEX IF EXISTS {self.name}"

    def materialize_index_sql(self) -> str:
        """ALTER fragment to materialize this index for existing data."""
        return f"MATERIALIZE INDEX IF EXISTS {self.name}"

    def clear_index_sql(self) -> str:
        """ALTER fragment to clear this index's materialized data."""
        return f"CLEAR INDEX IF EXISTS {self.name}"
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def parse_indexes_structure(indexes: Optional[List[str]]) -> List[TableIndex]:
    """
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["INDEX index_name a TYPE set(100) GRANULARITY 100", " INDEX index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name type TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='type', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100,", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name a TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter(0.001)"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity=None)]
    >>> parse_indexes_structure(["index_name u64 * length(s) TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter"])
    [TableIndex(name='index_name', expr='u64 * length(s)', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter', granularity=None)]
    >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4,1024,1,42) GRANULARITY 1"])
    [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4,1024,1,42)', granularity='1')]
    >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["index_name u64 * length(s)"])
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.IndexesSyntaxError: Invalid INDEX syntax at 1:1. Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`.
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100, index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.IndexesSyntaxError: Invalid INDEX syntax at 1:1. Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`.
    >>> parse_indexes_structure(["", " ", "     wrong_index_syntax,"])
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.IndexesSyntaxError: Invalid INDEX syntax at 3:6. Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`.
    >>> parse_indexes_structure(["my_index m['key'] TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='my_index', expr="m['key']", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["my_index_lambda arrayMap(x -> tupleElement(x,'message'), column_name) TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='my_index_lambda', expr="arrayMap(x -> tupleElement(x,'message'), column_name)", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["ip_range_minmax_idx (toIPv6(ip_range_start), toIPv6(ip_range_end)) TYPE minmax GRANULARITY 1"])
    [TableIndex(name='ip_range_minmax_idx', expr='(toIPv6(ip_range_start), toIPv6(ip_range_end))', type_full='minmax', granularity='1')]
    """
    parsed_indices: List[TableIndex] = []
    if not indexes:
        return parsed_indices

    # TODO(eclbg): It might not be obvious that we only allow one index per line.
    for i, index in enumerate(indexes):
        lineno = i + 1
        if not index.strip():
            continue
        leading_whitespaces = len(index) - len(index.lstrip())
        index = index.strip().rstrip(",")
        # Fix: the previous `index.lstrip("INDEX")` stripped any leading run of
        # the characters I/N/D/E/X, mangling index names such as "NDX_foo" or
        # "EXPR_idx". Strip only an explicit "INDEX " keyword prefix instead.
        if index.startswith("INDEX "):
            index = index[len("INDEX"):].strip()
        if index.count("TYPE") != 1:
            raise IndexesSyntaxError(
                message="Invalid INDEX syntax",
                hint="Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`",
                lineno=lineno,
                pos=leading_whitespaces + 1,
            )

        match = re.match(
            r"(\w+)\s+([\w\s*\[\]\*\(\),\'\"-><.]+)\s+TYPE\s+(\w+)(?:\(([\w\s*.,]+)\))?(?:\s+GRANULARITY\s+(\d+))?",
            index,
        )
        if match:
            index_name, a, index_type, value, granularity = match.groups()
            index_expr = f"{index_type}({value})" if value else index_type
            parsed_indices.append(TableIndex(index_name, a.strip(), f"{index_expr}", granularity))
        else:
            raise IndexesSyntaxError(
                message="Invalid INDEX syntax",
                hint="Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`",
                # Fix: lineno was hard-coded to 1 here, misreporting errors on
                # any line past the first.
                lineno=lineno,
                pos=leading_whitespaces + 1,
            )
    return parsed_indices
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def clean_comments_rstrip_keep_empty_lines(schema_to_clean: Optional[str]) -> Optional[str]:
    """Remove the comments from the schema

    If the comments are between backticks, they will not be removed.
    Lines that are empty after removing comments are also removed. Lines are only rstripped of whitespaces

    Fix: the return annotation previously claimed ``Tuple[Optional[str], bool]``
    but the function returns a plain ``Optional[str]`` (see doctests).

    >>> clean_comments_rstrip_keep_empty_lines(None) is None
    True
    >>> clean_comments_rstrip_keep_empty_lines('')
    ''
    >>> clean_comments_rstrip_keep_empty_lines(' ')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('\\n')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('\\n\\n\\n\\n')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('c Float32')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\t-- this is a comment\\t\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment\\r\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment\\n--this is a comment2\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a ```comment\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32, -- comment\\nd Float32 -- comment2')
    'c Float32,\\nd Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32, -- comment\\n -- comment \\nd Float32 -- comment2')
    'c Float32,\\n\\nd Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32 `json:$.aa--aa`\\n--this is a ```comment\\n')
    'c Float32 `json:$.aa--aa`'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`\\n--this is a ```comment\\n')
    'c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`'
    >>> clean_comments_rstrip_keep_empty_lines('c--c Float32 `json:$.cc--cc`\\n')
    'c'
    >>> clean_comments_rstrip_keep_empty_lines('`c--c` Float32 `json:$.cc--cc`\\n')
    '`c'
    """

    def clean_line_comments(line: str) -> str:
        # Drop a trailing `-- ...` comment from one line, but never inside a
        # backticked `json:...` path, where `--` is legal path content.
        if not line:
            return line
        i = 0
        inside_json_path = False
        while i < len(line):
            if i + 1 < len(line) and line[i] == "-" and line[i + 1] == "-" and not inside_json_path:
                return line[:i].rstrip()

            if not inside_json_path and line[i:].startswith("`json:"):
                inside_json_path = True
            elif inside_json_path and line[i] == "`":
                inside_json_path = False
            i += 1
        return line

    if schema_to_clean is None:
        return schema_to_clean

    cleaned_schema = ""
    for line in schema_to_clean.splitlines():
        cleaned_line = clean_line_comments(line)
        cleaned_schema += cleaned_line + "\n"
    return cleaned_schema.rstrip()
|
369
|
+
|
|
370
|
+
|
|
371
|
+
# Token definitions for the schema parser: each SyntaxExpr pairs a token name
# with the regex that recognises it at the current parsing offset.
SyntaxExpr = namedtuple("SyntaxExpr", ["name", "regex"])

# Column-modifier keywords. Each pattern requires leading whitespace and a
# non-identifier character (or end of string) after the keyword, so e.g.
# "DEFAULTS" does not match DEFAULT.
NULL = SyntaxExpr("NULL", re.compile(r"\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
NOTNULL = SyntaxExpr("NOTNULL", re.compile(r"\s+NOT\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
DEFAULT = SyntaxExpr("DEFAULT", re.compile(r"\s+DEFAULT([^a-z0-9_]|$)", re.IGNORECASE))
MATERIALIZED = SyntaxExpr("MATERIALIZED", re.compile(r"\s+MATERIALIZED([^a-z0-9_]|$)", re.IGNORECASE))
ALIAS = SyntaxExpr("ALIAS", re.compile(r"\s+ALIAS([^a-z0-9_]|$)", re.IGNORECASE))
CODEC = SyntaxExpr("CODEC", re.compile(r"\s+CODEC([^a-z0-9_]|$)", re.IGNORECASE))
TTL = SyntaxExpr("TTL", re.compile(r"\s+TTL([^a-z0-9_]|$)", re.IGNORECASE))
# Start of a backticked `json:$...` path annotation.
JSONPATH = SyntaxExpr("JSONPATH", re.compile(r"\s+`json:", re.IGNORECASE))
COMMA = SyntaxExpr("COMMA", re.compile(r",", re.IGNORECASE))
NEW_LINE = SyntaxExpr("NEW_LINE", re.compile(r"\s$"))
TYPE = SyntaxExpr("TYPE", re.compile(r""))  # TYPE doesn't have a fixed initial string

REGEX_WHITESPACE = re.compile(r"\s*")
# A `--` comment up to (and including) the end-of-line character.
REGEX_COMMENT = re.compile(r"\-\-[^\n\r]*[\n\r]")
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def mark_error_string(s: str, i: int, line: int = 1) -> str:
    """Return line number `line` of `s` with a `^---` marker under column `i`.

    >>> mark_error_string('0123456789', 0)
    '0123456789\\n^---'
    >>> mark_error_string('0123456789', 9)
    '0123456789\\n         ^---'
    >>> mark_error_string('01234\\n56789', 1)
    '01234\\n ^---'
    """
    target = s.splitlines()[line - 1] if s else ""
    padding = " " * i
    return f"{target}\n{padding}^---"
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def format_parse_error(
    table_structure: str,
    i: int,
    position: int,
    hint: Optional[str] = None,
    line: int = 0,
    keyword: Optional[str] = None,
) -> str:
    """Build a human-readable parse-error message with a ^--- marker.

    Renders the optional hint, then the offending line of `table_structure`
    with a caret under the error column, then a "found ... at position ..."
    trailer.
    """
    offset = len(keyword) if keyword else 0
    adjusted_position = position - offset
    parts = []
    if hint:
        parts.append(f"{hint}\n")
    parts.append(mark_error_string(table_structure, adjusted_position - 1, line=line))
    if keyword:
        # NOTE(review): the keyword length is subtracted a second time here
        # (it was already removed from adjusted_position above) — preserved
        # as-is; confirm whether the double subtraction is intentional.
        parts.append(f" found at position {adjusted_position - len(keyword)}")
    else:
        found_char = repr(table_structure[i]) if len(table_structure) > i else "EOF"
        parts.append(f" found {found_char} at position {adjusted_position}")
    return "".join(parts)
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def clean_line_comments(line: str) -> str:
    """Strip a trailing `--` comment from one line and strip whitespace.

    A `--` inside a backticked `json:...` span is part of the JSONPath and is
    kept verbatim; the whole line is returned unchanged when no comment is
    found outside such a span.
    """
    if not line:
        return line
    in_jsonpath = False
    for idx in range(len(line)):
        ch = line[idx]
        if not in_jsonpath and ch == "-" and line[idx + 1 : idx + 2] == "-":
            return line[:idx].strip()

        if not in_jsonpath and line.startswith("`json:", idx):
            in_jsonpath = True
        elif in_jsonpath and ch == "`":
            in_jsonpath = False
    return line
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _parse_table_structure(schema: str) -> List[Dict[str, Any]]:
    """Parse the body of a SCHEMA section into a list of column dicts.

    Each returned dict carries: name, normalized_name, type, nullable, codec,
    default_value, jsonpath. Raises SchemaSyntaxError (with 1-based line/pos)
    for unsupported or invalid syntax, and ValueError on some legacy error
    paths (see the TODOs inline).
    """
    # CH syntax from https://clickhouse.com/docs/en/sql-reference/statements/create/table/
    # name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1]
    try:
        # This removes lines that are empty after removing comments, which might make it hard to locate errors properly.
        # The parsing code afterwards seems to be mostly robust to empty lines.
        # Perhaps I'll deliberately not support reporting errors correctly when empty lines have been removed to start
        # with, and later I can see how to support it.
        # It also removes the indentation of the lines, which might make it hard to locate errors properly.
        # schema = clean_comments(schema + "\n")

        # I've swapped the above with this. A first test didn't show any side effects in parsing a schema, and it should
        # allow us to keep track of the line numbers in the error messages.
        schema = clean_comments_rstrip_keep_empty_lines(schema + "\n")
    except Exception:
        # Best-effort fallback: strip comments with a plain regex.
        # logging.exception(f"Error cleaning comments: {e}")
        schema = REGEX_COMMENT.sub(" ", schema + "\n").strip()

    if REGEX_WHITESPACE.fullmatch(schema):
        return []

    # Absolute offset into `schema`.
    i: int = 0

    # For error feedback only
    line: int = 1
    pos: int = 1

    # Find the first SyntaxExpr in lookup that matches the schema at the current offset
    def lookahead_matches(lookup: Iterable) -> Optional[SyntaxExpr]:
        s = schema[i:]
        match = next((x for x in lookup if x.regex.match(s)), None)
        return match

    # Move forward one character, keeping line/pos in sync for error reports.
    def advance_single_char() -> None:
        nonlocal i, line, pos
        if schema[i] == "\n":
            line += 1
            pos = 1
        else:
            pos += 1
        i += 1

    # Advance all whitespaces characters and then len(s) more chars
    def advance(s: str) -> None:
        if i < len(schema):
            while schema[i] in " \t\r\n":
                advance_single_char()
            for _ in s:
                advance_single_char()

    # Consume up to (and including) the next closing backtick; return the
    # content without the backtick. Whitespace before the backtick is an error.
    def get_backticked() -> str:
        begin = i
        while i < len(schema):
            c = schema[i]
            advance_single_char()
            if c == "`":
                return schema[begin : i - 1]
            if c in " \t\r\n":
                raise SchemaSyntaxError(message="Expected closing backtick", lineno=line, pos=pos - 1)
        raise SchemaSyntaxError(message="Expected closing backtick", lineno=line, pos=pos)

    # Parse a column name, either plain (restricted charset) or backticked.
    def parse_name() -> str:
        nonlocal i, line, pos
        if schema[i] != "`":
            # regular name
            begin = i
            while i < len(schema):
                c = schema[i]
                if c in " \t\r\n":
                    return schema[begin:i]
                if c not in valid_chars_name:
                    raise SchemaSyntaxError(
                        message=f"Column name contains invalid character {repr(c)}",
                        hint="Tip: use backticks",
                        lineno=line,
                        pos=i + 1,
                    )
                advance_single_char()
            return schema[begin:i]
        else:
            # backticked name
            advance_single_char()
            return get_backticked()

    # Consume an expression (a type, DEFAULT body, CODEC body, ...) until one
    # of the `lookup` tokens appears outside any quote/paren context.
    def parse_expr(lookup: Iterable[SyntaxExpr]) -> str:
        nonlocal i, line, pos

        begin: int = i
        # Stack of open contexts: None (top level), "'", '"', or "(".
        context_stack: List[Optional[str]] = [None]
        while i < len(schema):
            context = context_stack[-1]
            c = schema[i]

            if (context == "'" and c == "'") or (context == '"' and c == '"') or (context == "(" and c == ")"):
                context_stack.pop()
            elif c == "'" and (context is None or context == "("):
                context_stack.append("'")
            elif c == '"' and (context is None or context == "("):
                context_stack.append('"')
            elif c == "(" and (context is None or context == "("):
                context_stack.append("(")
            elif context is None and lookahead_matches(lookup):
                return schema[begin:i].strip(" \t\r\n")
            elif (context is None and c not in valid_chars_fn) or (context == "(" and c not in valid_chars_fn):
                raise SchemaSyntaxError(message=f"Invalid character {repr(c)}", lineno=line, pos=pos)
            advance_single_char()
        if i == begin:
            # TODO(eclbg): Turn this into a SchemaSyntaxError. I don't know when it happens
            raise ValueError(format_parse_error(schema, i, pos, "wrong value", line=line))
        return schema[begin:].strip(" \t\r\n")

    columns: List[Dict[str, Any]] = []

    # Accumulators for the column currently being parsed.
    name: str = ""
    _type: str = ""
    default: str = ""
    materialized: str = ""
    codec: str = ""
    jsonpath: str = ""
    last: Optional[SyntaxExpr] = None
    col_start: Tuple[int, int] = (0, 0)  # (0, 0) means not set. It's not a valid line/pos as they start at 1
    col_end: Tuple[int, int] = (0, 0)  # (0, 0) means not set. It's not a valid line/pos as they start at 1

    # Flush the accumulated column fields into `columns` and reset them.
    # `found` names the token that terminated the column (for error messages).
    def add_column(found: str) -> None:
        nonlocal name, _type, default, materialized, codec, jsonpath, col_start, col_end
        if not name:
            # TODO(eclbg): get rid of this ValueError and replace it with a custom one so it can be handled by the
            # caller
            raise ValueError(
                format_parse_error(schema, i, pos, f"Syntax error: expecting NAME, found {found}", line=line)
            )
        default = "" if not default else f"DEFAULT {default}"
        materialized = "" if not materialized else f"MATERIALIZED {materialized}"
        codec = "" if not codec else f"CODEC{codec}"
        # TODO(eclbg): We should validate the column as a whole. Name is mandatory, and one of type, default_value or
        # materialized (I think).
        columns.append(
            {
                "name": name,
                "type": _type,
                "codec": codec,
                "default_value": default or materialized,
                "jsonpath": jsonpath,
                # "col_start": col_start,
                # "col_end": col_end,
            }
        )
        name = ""
        _type = ""
        default = ""
        materialized = ""
        codec = ""
        jsonpath = ""

    # Main token loop: parse a name, then accept modifiers until a comma/EOF.
    valid_next: List[SyntaxExpr] = [TYPE]
    while i < len(schema):
        if not name:
            advance("")
            valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, TYPE]
            col_start = (line, pos)
            name = parse_name()
            if name == "INDEX":
                raise SchemaSyntaxError(
                    message="Forbidden INDEX definition",
                    hint="Indexes are not allowed in SCHEMA section. Use the INDEXES section instead",
                    lineno=line,
                    pos=pos - len(name),  # We've already advanced the name
                )
            continue
        found = lookahead_matches(
            [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE, TYPE]
        )
        if found and found not in valid_next:
            after = f" after {last.name}" if last else ""
            raise SchemaSyntaxError(message=f"Unexpected {found.name}{after}", lineno=line, pos=pos)
        if found == TYPE:
            advance("")
            valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE]
            type_start_pos = pos  # Save the position of the type start to use it in the error message
            detected_type = parse_expr([NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
            try:
                # Imported in the body to be compatible with the CLI
                from chtoolset.query import check_compatible_types

                # Check compatibility of the type with itself to verify it's a known type
                check_compatible_types(detected_type, detected_type)
            except ValueError as e:
                if (
                    "unknown data type family" in str(e).lower()
                    or "incompatible data types between aggregate function" in str(e).lower()
                ):
                    raise SchemaSyntaxError(message=str(e), lineno=line, pos=type_start_pos)
                else:
                    raise e
            except ModuleNotFoundError:
                # chtoolset is optional in the CLI; skip the type check.
                pass
            _type = detected_type
        elif found == NULL:
            # Not implemented
            advance("")  # We need to advance to get the correct position
            raise SchemaSyntaxError(
                message="NULL column syntax not supported",
                hint="Hint: use Nullable(...)",
                lineno=line,
                pos=pos,
            )
        elif found == NOTNULL:
            advance("")  # We need to advance to get the correct position
            raise SchemaSyntaxError(
                message="NOT NULL column syntax not supported",
                hint="Hint: Columns are not nullable by default",
                lineno=line,
                pos=pos,
            )
        elif found == DEFAULT:
            advance("DEFAULT")
            valid_next = [
                CODEC,
                TTL,
                COMMA,
                # The three matches below are never valid. We're adding them here to avoid just complaining about their placement.
                MATERIALIZED,
                NULL,
                NOTNULL,
            ]
            default = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
        elif found == MATERIALIZED:
            advance("")
            raise SchemaSyntaxError(
                message="MATERIALIZED columns are not supported",
                lineno=line,
                pos=pos,
            )
        elif found == ALIAS:
            # Not implemented
            advance("")  # We need to advance to get the correct position
            raise SchemaSyntaxError(
                message="ALIAS columns are not supported",
                lineno=line,
                pos=pos,
            )
        elif found == CODEC:
            advance("CODEC")
            valid_next = [
                TTL,
                COMMA,
                JSONPATH,
                # The three matches below are never valid. We're adding them here to avoid just complaining about their placement.
                MATERIALIZED,
                NULL,
                NOTNULL,
            ]
            codec = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
        elif found == TTL:
            advance("")  # We need to advance to get the correct position
            # Not implemented
            advance("")
            raise SchemaSyntaxError(
                message="column TTL is not supported",
                lineno=line,
                pos=pos,
            )
        elif found == JSONPATH:
            advance("`json:")
            jsonpath = get_backticked()
        elif found == COMMA:
            if name == "INDEX":
                advance(",")
                continue
            advance(",")
            valid_next = []
            col_end = (line, pos)
            add_column("COMMA")
        elif found == NEW_LINE or (name == "INDEX" and not found):
            # NOTE(review): this bypasses advance_single_char(), so `line`/`pos`
            # are not updated when the newline is skipped — confirm whether
            # reported positions drift after blank lines.
            i += 1
        else:
            raise ValueError(
                format_parse_error(
                    schema,
                    i,
                    pos,
                    "wrong value, DEFAULT, MATERIALIZED, CODEC, TTL expressions, a column data type, a comma, a new line or a jsonpath",
                    line=line,
                )
            )
        last = found
        col_end = (line, i + 1)
    # Only add the last column if we've parsed something. This allows for a trailing comma after the last column.
    if name:
        add_column("EOF")

    # normalize columns
    for column in columns:
        nullable = column["type"].lower().startswith("nullable")
        column["type"] = column["type"] if not nullable else column["type"][len("Nullable(") : -1]  # ')'
        column["nullable"] = nullable
        column["codec"] = column["codec"] if column["codec"] else None
        column["name"] = column["name"]
        column["normalized_name"] = column["name"]
        column["jsonpath"] = column["jsonpath"] if column["jsonpath"] else None
        default_value = column["default_value"] if column["default_value"] else None
        if nullable and default_value and default_value.lower() == "default null":
            default_value = None
        column["default_value"] = default_value
    return columns
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
def try_to_fix_nullable_in_simple_aggregating_function(t: str) -> Optional[str]:
|
|
751
|
+
# This workaround is to fix: https://github.com/ClickHouse/ClickHouse/issues/34407.
|
|
752
|
+
# In the case of nullable columns and SimpleAggregateFunction Clickhouse returns
|
|
753
|
+
# Nullable(SimpleAggregateFunction(sum, Int32)) instead of SimpleAggregateFunction(sum, Nullable(Int32))
|
|
754
|
+
# as it is done with other aggregate functions.
|
|
755
|
+
# If not, the aggregation could return incorrect results.
|
|
756
|
+
result = None
|
|
757
|
+
if match := re.search(r"SimpleAggregateFunction\((\w+),\s*(?!(?:Nullable))([\w,.()]+)\)", t):
|
|
758
|
+
fn = match.group(1)
|
|
759
|
+
inner_type = match.group(2)
|
|
760
|
+
result = f"SimpleAggregateFunction({fn}, Nullable({inner_type}))"
|
|
761
|
+
return result
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def col_name(name: str, backquotes: bool = True) -> str:
|
|
765
|
+
"""
|
|
766
|
+
>>> col_name('`test`', True)
|
|
767
|
+
'`test`'
|
|
768
|
+
>>> col_name('`test`', False)
|
|
769
|
+
'test'
|
|
770
|
+
>>> col_name('test', True)
|
|
771
|
+
'`test`'
|
|
772
|
+
>>> col_name('test', False)
|
|
773
|
+
'test'
|
|
774
|
+
>>> col_name('', True)
|
|
775
|
+
''
|
|
776
|
+
>>> col_name('', False)
|
|
777
|
+
''
|
|
778
|
+
"""
|
|
779
|
+
if not name:
|
|
780
|
+
return name
|
|
781
|
+
if name[0] == "`" and name[-1] == "`":
|
|
782
|
+
return name if backquotes else name[1:-1]
|
|
783
|
+
return f"`{name}`" if backquotes else name
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
def schema_to_sql_columns(schema: List[Dict[str, Any]]) -> List[str]:
|
|
787
|
+
"""return an array with each column in SQL
|
|
788
|
+
>>> schema_to_sql_columns([{'name': 'temperature', 'type': 'Float32', 'codec': None, 'default_value': None, 'nullable': False, 'normalized_name': 'temperature'}, {'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
|
|
789
|
+
['`temperature` Float32', '`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4))']
|
|
790
|
+
>>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': '', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
|
|
791
|
+
['`temperature_delta` Float32 MATERIALIZED temperature']
|
|
792
|
+
>>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': '', 'nullable': False, 'normalized_name': 'temperature_delta'}])
|
|
793
|
+
['`temperature_delta` Float32 CODEC(Delta(4), LZ4))']
|
|
794
|
+
>>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta'}])
|
|
795
|
+
['`temperature_delta` Float32']
|
|
796
|
+
>>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta', 'jsonpath': '$.temperature_delta'}])
|
|
797
|
+
['`temperature_delta` Float32 `json:$.temperature_delta`']
|
|
798
|
+
>>> schema_to_sql_columns([{'name': 'aggregation', 'type': 'SimpleAggregateFunction(sum, Int32)', 'nullable': True, 'normalized_name': 'aggregation', 'jsonpath': '$.aggregation'}])
|
|
799
|
+
['`aggregation` SimpleAggregateFunction(sum, Nullable(Int32)) `json:$.aggregation`']
|
|
800
|
+
"""
|
|
801
|
+
columns: List[str] = []
|
|
802
|
+
for x in schema:
|
|
803
|
+
name = x["normalized_name"] if "normalized_name" in x else x["name"]
|
|
804
|
+
if x["nullable"]:
|
|
805
|
+
if (_type := try_to_fix_nullable_in_simple_aggregating_function(x["type"])) is None:
|
|
806
|
+
_type = "Nullable(%s)" % x["type"]
|
|
807
|
+
else:
|
|
808
|
+
_type = x["type"]
|
|
809
|
+
parts = [col_name(name, backquotes=True), _type]
|
|
810
|
+
if x.get("jsonpath", None):
|
|
811
|
+
parts.append(f"`json:{x['jsonpath']}`")
|
|
812
|
+
if "default_value" in x and x["default_value"] not in ("", None):
|
|
813
|
+
parts.append(x["default_value"])
|
|
814
|
+
if "codec" in x and x["codec"] not in ("", None):
|
|
815
|
+
parts.append(x["codec"])
|
|
816
|
+
c = " ".join([x for x in parts if x]).strip()
|
|
817
|
+
columns.append(c)
|
|
818
|
+
return columns
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
def parse_table_structure(schema: str) -> List[Dict[str, Any]]:
|
|
822
|
+
"""This parses the SQL schema for a CREATE TABLE
|
|
823
|
+
Columns follow the syntax: name1 [type1] [DEFAULT expr1] [CODEC compression_codec] [TTL expr1] [JSONPATH `json:jsonpath`] [,]
|
|
824
|
+
|
|
825
|
+
The ClickHouse reference is followed pretty loosely at this point.
|
|
826
|
+
Reference: https://clickhouse.tech/docs/en/sql-reference/statements/create/table/#syntax-forms
|
|
827
|
+
|
|
828
|
+
>>> parse_table_structure('potato') # doctest: +SKIP
|
|
829
|
+
Traceback (most recent call last):
|
|
830
|
+
...
|
|
831
|
+
tinybird.sql.MalformedColumnError: Column name and either type or default_value are required
|
|
832
|
+
|
|
833
|
+
>>> parse_table_structure(' potato Int32')
|
|
834
|
+
[{'name': 'potato', 'type': 'Int32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'potato'}]
|
|
835
|
+
|
|
836
|
+
>>> parse_table_structure('`c Int32')
|
|
837
|
+
Traceback (most recent call last):
|
|
838
|
+
...
|
|
839
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Expected closing backtick at 1:3.
|
|
840
|
+
|
|
841
|
+
>>> parse_table_structure('c Float32, b String')
|
|
842
|
+
[{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
|
|
843
|
+
|
|
844
|
+
>>> parse_table_structure('c Float32,--comment\\nb String')
|
|
845
|
+
[{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
|
|
846
|
+
|
|
847
|
+
>>> parse_table_structure('c Float32,--comment\\nb String --another-comment')
|
|
848
|
+
[{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
|
|
849
|
+
|
|
850
|
+
>>> parse_table_structure('c Float32 --first-comment\\n,--comment\\nb String --another-comment')
|
|
851
|
+
[{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
|
|
852
|
+
|
|
853
|
+
>>> parse_table_structure('--random comment here\\nc Float32 --another comment\\n,--another one\\nb String --this is the last one')
|
|
854
|
+
[{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
|
|
855
|
+
|
|
856
|
+
>>> parse_table_structure('--extra comment\\nc--extra comment\\nFloat32--extra comment\\n,--extra comment\\nb--extra comment\\nString--extra comment')
|
|
857
|
+
[{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
|
|
858
|
+
|
|
859
|
+
>>> parse_table_structure('c Nullable(Float32)')
|
|
860
|
+
[{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]
|
|
861
|
+
|
|
862
|
+
>>> parse_table_structure('c Nullable(Float32) DEFAULT NULL')
|
|
863
|
+
[{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]
|
|
864
|
+
|
|
865
|
+
>>> parse_table_structure("c String DEFAULT 'bla'")
|
|
866
|
+
[{'name': 'c', 'type': 'String', 'codec': None, 'default_value': "DEFAULT 'bla'", 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
|
|
867
|
+
|
|
868
|
+
>>> parse_table_structure('`foo.bar` UInt64')
|
|
869
|
+
[{'name': 'foo.bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo.bar'}]
|
|
870
|
+
|
|
871
|
+
>>> parse_table_structure('double_value Float64 CODEC(LZ4HC(2))')
|
|
872
|
+
[{'name': 'double_value', 'type': 'Float64', 'codec': 'CODEC(LZ4HC(2))', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'double_value'}]
|
|
873
|
+
|
|
874
|
+
>>> parse_table_structure('doubl/e_value Float64 CODEC(LZ4HC(2))')
|
|
875
|
+
Traceback (most recent call last):
|
|
876
|
+
...
|
|
877
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Column name contains invalid character '/' at 1:6. Tip: use backticks.
|
|
878
|
+
|
|
879
|
+
>>> parse_table_structure('`c` Nullable(Float32)')
|
|
880
|
+
[{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]
|
|
881
|
+
|
|
882
|
+
>>> parse_table_structure('wadus INT UNSIGNED')
|
|
883
|
+
[{'name': 'wadus', 'type': 'INT UNSIGNED', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'wadus'}]
|
|
884
|
+
|
|
885
|
+
>>> parse_table_structure('c Int32 CODEC(Delta, LZ4)\\n')
|
|
886
|
+
[{'name': 'c', 'type': 'Int32', 'codec': 'CODEC(Delta, LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
|
|
887
|
+
|
|
888
|
+
>>> parse_table_structure('c SimpleAggregateFunction(sum, Int32),\\np SimpleAggregateFunction(sum, Int32)')
|
|
889
|
+
Traceback (most recent call last):
|
|
890
|
+
...
|
|
891
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Incompatible data types between aggregate function 'sum' which returns Int64 and column storage type Int32 at 1:4.
|
|
892
|
+
|
|
893
|
+
>>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized b*2\\n')
|
|
894
|
+
Traceback (most recent call last):
|
|
895
|
+
...
|
|
896
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:27.
|
|
897
|
+
|
|
898
|
+
>>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized ifNull(b*2, 0)\\n')
|
|
899
|
+
Traceback (most recent call last):
|
|
900
|
+
...
|
|
901
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:27.
|
|
902
|
+
|
|
903
|
+
>>> parse_table_structure('c Int32 Materialized b*2\\n')
|
|
904
|
+
Traceback (most recent call last):
|
|
905
|
+
...
|
|
906
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.
|
|
907
|
+
|
|
908
|
+
>>> parse_table_structure('c Int32 Materialized b != 1 ? b*2: pow(b, 3)\\n')
|
|
909
|
+
Traceback (most recent call last):
|
|
910
|
+
...
|
|
911
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.
|
|
912
|
+
|
|
913
|
+
>>> parse_table_structure('')
|
|
914
|
+
[]
|
|
915
|
+
|
|
916
|
+
>>> parse_table_structure('`date` Date,`timezone` String,`offset` Int32')
|
|
917
|
+
[{'name': 'date', 'type': 'Date', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'date'}, {'name': 'timezone', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'timezone'}, {'name': 'offset', 'type': 'Int32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'offset'}]
|
|
918
|
+
|
|
919
|
+
>>> parse_table_structure('c Int32 Materialized b*2 CODEC(Delta, LZ4)\\n')
|
|
920
|
+
Traceback (most recent call last):
|
|
921
|
+
...
|
|
922
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.
|
|
923
|
+
|
|
924
|
+
>>> parse_table_structure('c Int32 Materialized ifNull(b*2, 0) CODEC(Delta, LZ4)\\n')
|
|
925
|
+
Traceback (most recent call last):
|
|
926
|
+
...
|
|
927
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.
|
|
928
|
+
|
|
929
|
+
>>> parse_table_structure('`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)')
|
|
930
|
+
Traceback (most recent call last):
|
|
931
|
+
...
|
|
932
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:29.
|
|
933
|
+
|
|
934
|
+
>>> parse_table_structure('foo^bar Float32')
|
|
935
|
+
Traceback (most recent call last):
|
|
936
|
+
...
|
|
937
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Column name contains invalid character '^' at 1:4. Tip: use backticks.
|
|
938
|
+
|
|
939
|
+
>>> parse_table_structure('foo Float#32')
|
|
940
|
+
Traceback (most recent call last):
|
|
941
|
+
...
|
|
942
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Invalid character '#' at 1:10.
|
|
943
|
+
|
|
944
|
+
>>> parse_table_structure('foo Float32 DEFAULT 13, bar UInt64')
|
|
945
|
+
[{'name': 'foo', 'type': 'Float32', 'codec': None, 'default_value': 'DEFAULT 13', 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo'}, {'name': 'bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'bar'}]
|
|
946
|
+
|
|
947
|
+
>>> parse_table_structure('foo Float32 DEFAULT 1$$$3')
|
|
948
|
+
Traceback (most recent call last):
|
|
949
|
+
...
|
|
950
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Invalid character '$' at 1:22.
|
|
951
|
+
|
|
952
|
+
>>> parse_table_structure('foo Float32 CODEC(Delta(4), LZ#4)')
|
|
953
|
+
Traceback (most recent call last):
|
|
954
|
+
...
|
|
955
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Invalid character '#' at 1:31.
|
|
956
|
+
|
|
957
|
+
>>> parse_table_structure('\\n `temperature` Float32,\\n `temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)\\n ')
|
|
958
|
+
Traceback (most recent call last):
|
|
959
|
+
...
|
|
960
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 3:33.
|
|
961
|
+
|
|
962
|
+
>>> parse_table_structure('temperature Float32, temperature_delta Float32 MATERIALIZED temperature Codec(Delta(4)), temperature_doubledelta Float32 MATERIALIZED temperature Codec(DoubleDelta), temperature_doubledelta_lz4 Float32 MATERIALIZED temperature Codec(DoubleDelta, LZ4)')
|
|
963
|
+
Traceback (most recent call last):
|
|
964
|
+
...
|
|
965
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:48.
|
|
966
|
+
|
|
967
|
+
>>> parse_table_structure('t UInt8 CODEC(Delta(1), LZ4)')
|
|
968
|
+
[{'name': 't', 'type': 'UInt8', 'codec': 'CODEC(Delta(1), LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 't'}]
|
|
969
|
+
|
|
970
|
+
>>> parse_table_structure('tt UInt8 MATERIALIZED t')
|
|
971
|
+
Traceback (most recent call last):
|
|
972
|
+
...
|
|
973
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:11.
|
|
974
|
+
|
|
975
|
+
>>> parse_table_structure('tt UInt8 MATERIALIZED t CODEC(Delta(1), LZ4)')
|
|
976
|
+
Traceback (most recent call last):
|
|
977
|
+
...
|
|
978
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:11.
|
|
979
|
+
|
|
980
|
+
>>> parse_table_structure('tt SimpleAggregateFunction(any, Nullable(UInt8))')
|
|
981
|
+
[{'name': 'tt', 'type': 'SimpleAggregateFunction(any, Nullable(UInt8))', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'tt'}]
|
|
982
|
+
|
|
983
|
+
>>> parse_table_structure("timestamp DateTime MATERIALIZED toDateTime(JSONExtractInt(JSONExtractRaw(record, 'payload'), 'timestamp') / 1000)")
|
|
984
|
+
Traceback (most recent call last):
|
|
985
|
+
...
|
|
986
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:20.
|
|
987
|
+
|
|
988
|
+
>>> parse_table_structure("`test_default_cast` DEFAULT plus(13,1)")
|
|
989
|
+
[{'name': 'test_default_cast', 'type': '', 'codec': None, 'default_value': 'DEFAULT plus(13,1)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'test_default_cast'}]
|
|
990
|
+
|
|
991
|
+
>>> parse_table_structure("hola Int, `materialized` String MATERIALIZED upper(no_nullable_string)")
|
|
992
|
+
Traceback (most recent call last):
|
|
993
|
+
...
|
|
994
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:33.
|
|
995
|
+
|
|
996
|
+
>>> parse_table_structure('`a2` String `json:$.a2`, `a3` String `json:$.a3`\\n')
|
|
997
|
+
[{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]
|
|
998
|
+
|
|
999
|
+
>>> parse_table_structure("`arr` Array(String) DEFAULT ['-']")
|
|
1000
|
+
[{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT ['-']", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]
|
|
1001
|
+
|
|
1002
|
+
>>> parse_table_structure("`arr` Array(String) DEFAULT array('-')")
|
|
1003
|
+
[{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT array('-')", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]
|
|
1004
|
+
|
|
1005
|
+
>>> parse_table_structure('`a2` Float32 CODEC(Delta, ZSTD(4)) `json:$.a2`, `a3` String `json:$.a3`\\n')
|
|
1006
|
+
[{'name': 'a2', 'type': 'Float32', 'codec': 'CODEC(Delta, ZSTD(4))', 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]
|
|
1007
|
+
|
|
1008
|
+
>>> parse_table_structure('`a` String, INDEX index_name a TYPE set(100) GRANULARITY 100')
|
|
1009
|
+
Traceback (most recent call last):
|
|
1010
|
+
...
|
|
1011
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Forbidden INDEX definition at 1:13. Indexes are not allowed in SCHEMA section. Use the INDEXES section instead.
|
|
1012
|
+
|
|
1013
|
+
>>> parse_table_structure(' `a` String,\\n INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
|
|
1014
|
+
Traceback (most recent call last):
|
|
1015
|
+
...
|
|
1016
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Forbidden INDEX definition at 2:5. Indexes are not allowed in SCHEMA section. Use the INDEXES section instead.
|
|
1017
|
+
|
|
1018
|
+
>>> parse_table_structure('`index` String, INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
|
|
1019
|
+
Traceback (most recent call last):
|
|
1020
|
+
...
|
|
1021
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Forbidden INDEX definition at 1:17. Indexes are not allowed in SCHEMA section. Use the INDEXES section instead.
|
|
1022
|
+
|
|
1023
|
+
>>> parse_table_structure('`a2` String `json:$.a--2`, `a3` String `json:$.a3`\\n')
|
|
1024
|
+
[{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a--2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]
|
|
1025
|
+
|
|
1026
|
+
>>> parse_table_structure('a InvalidType')
|
|
1027
|
+
Traceback (most recent call last):
|
|
1028
|
+
...
|
|
1029
|
+
tinybird.tb.modules.datafile.common.SchemaSyntaxError: Unknown data type family: InvalidType at 1:3.
|
|
1030
|
+
|
|
1031
|
+
>>> parse_table_structure('a Int32 DEFAULT 'a') # doctest: +SKIP
|
|
1032
|
+
# should fail as the type and default expr are incompatible
|
|
1033
|
+
"""
|
|
1034
|
+
return _parse_table_structure(schema)
|
|
1035
|
+
|
|
1036
|
+
|
|
169
1037
|
def parse(
|
|
170
1038
|
s: str,
|
|
171
1039
|
default_node: Optional[str] = None,
|
|
@@ -175,11 +1043,9 @@ def parse(
|
|
|
175
1043
|
) -> Datafile:
|
|
176
1044
|
"""
|
|
177
1045
|
Parses `s` string into a document
|
|
178
|
-
>>> d = parse("
|
|
1046
|
+
>>> d = parse("MAINTAINER 'rambo' #this is me\\nNODE \\"test_01\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_00\\n\\n\\nNODE \\"test_02\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_01\\n WHERE a > 1\\n GROUP by a\\n")
|
|
179
1047
|
>>> d.maintainer
|
|
180
1048
|
'rambo'
|
|
181
|
-
>>> d.sources
|
|
182
|
-
['https://example.com']
|
|
183
1049
|
>>> len(d.nodes)
|
|
184
1050
|
2
|
|
185
1051
|
>>> d.nodes[0]
|
|
@@ -192,12 +1058,43 @@ def parse(
|
|
|
192
1058
|
doc = Datafile()
|
|
193
1059
|
doc.raw = list(StringIO(s, newline=None))
|
|
194
1060
|
|
|
195
|
-
parser_state = namedtuple(
|
|
1061
|
+
parser_state = namedtuple(
|
|
1062
|
+
"parser_state", ["multiline", "current_node", "command", "multiline_string", "is_sql", "start_lineno"]
|
|
1063
|
+
)
|
|
196
1064
|
|
|
197
1065
|
parser_state.multiline = False
|
|
198
1066
|
parser_state.current_node = False
|
|
1067
|
+
parser_state.start_lineno = None
|
|
1068
|
+
|
|
1069
|
+
def multiline_not_supported(func: Callable[..., Any]) -> Callable[..., Any]:
|
|
1070
|
+
@functools.wraps(func)
|
|
1071
|
+
def error_if_multiline(*args: Any, **kwargs: Any) -> Any:
|
|
1072
|
+
if parser_state.multiline:
|
|
1073
|
+
parser_state.multiline = (
|
|
1074
|
+
False # So we don't offset the line number when processing the exception. A bit hacky
|
|
1075
|
+
)
|
|
1076
|
+
raise DatafileSyntaxError(
|
|
1077
|
+
f"{kwargs['cmd'].upper()} does not support multiline arguments",
|
|
1078
|
+
lineno=parser_state.start_lineno, # We want to report the line where the command starts
|
|
1079
|
+
pos=1,
|
|
1080
|
+
)
|
|
1081
|
+
return func(*args, **kwargs)
|
|
1082
|
+
|
|
1083
|
+
return error_if_multiline
|
|
1084
|
+
|
|
1085
|
+
def deprecated(func: Callable[..., Any]) -> Callable[..., Any]:
|
|
1086
|
+
@functools.wraps(func)
|
|
1087
|
+
def raise_deprecation_error(*args: Any, **kwargs: Any) -> Any:
|
|
1088
|
+
raise DatafileSyntaxError(
|
|
1089
|
+
f"{kwargs['cmd'].upper()} has been deprecated",
|
|
1090
|
+
lineno=kwargs["lineno"],
|
|
1091
|
+
pos=1,
|
|
1092
|
+
)
|
|
1093
|
+
|
|
1094
|
+
return raise_deprecation_error
|
|
199
1095
|
|
|
200
1096
|
def assign(attr):
|
|
1097
|
+
@multiline_not_supported
|
|
201
1098
|
def _fn(x, **kwargs):
|
|
202
1099
|
setattr(doc, attr, _unquote(x))
|
|
203
1100
|
|
|
@@ -207,7 +1104,10 @@ def parse(
|
|
|
207
1104
|
s = _unquote("".join(args))
|
|
208
1105
|
try:
|
|
209
1106
|
sh = parse_table_structure(s)
|
|
1107
|
+
except SchemaSyntaxError as e:
|
|
1108
|
+
raise e
|
|
210
1109
|
except Exception as e:
|
|
1110
|
+
# TODO(eclbg): Does it make sense to keep this exception? I'd like to get rid of all ParseException
|
|
211
1111
|
raise ParseException(FeedbackManager.error_parsing_schema(line=kwargs["lineno"], error=e))
|
|
212
1112
|
|
|
213
1113
|
parser_state.current_node["schema"] = ",".join(schema_to_sql_columns(sh))
|
|
@@ -219,26 +1119,33 @@ def parse(
|
|
|
219
1119
|
return
|
|
220
1120
|
try:
|
|
221
1121
|
indexes = parse_indexes_structure(s.splitlines())
|
|
1122
|
+
except IndexesSyntaxError as e:
|
|
1123
|
+
raise e
|
|
222
1124
|
except Exception as e:
|
|
1125
|
+
# TODO(eclbg): We get here when an unidentified error happens but we still report a parsing error. We could rething this.
|
|
223
1126
|
raise ParseException(FeedbackManager.error_parsing_indices(line=kwargs["lineno"], error=e))
|
|
224
1127
|
|
|
225
1128
|
parser_state.current_node["indexes"] = indexes
|
|
226
1129
|
|
|
227
1130
|
def assign_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
|
|
1131
|
+
@multiline_not_supported
|
|
228
1132
|
def _f(*args: str, **kwargs: Any):
|
|
229
1133
|
s = _unquote((" ".join(args)).strip())
|
|
230
1134
|
parser_state.current_node[v.lower()] = eval_var(s, skip=skip_eval)
|
|
231
1135
|
|
|
232
1136
|
return _f
|
|
233
1137
|
|
|
1138
|
+
@deprecated
|
|
234
1139
|
def sources(x: str, **kwargs: Any) -> None:
|
|
235
|
-
|
|
1140
|
+
pass # Deprecated
|
|
236
1141
|
|
|
1142
|
+
@multiline_not_supported
|
|
237
1143
|
def node(*args: str, **kwargs: Any) -> None:
|
|
238
1144
|
node = {"name": eval_var(_unquote(args[0]))}
|
|
239
1145
|
doc.nodes.append(node)
|
|
240
1146
|
parser_state.current_node = node
|
|
241
1147
|
|
|
1148
|
+
@multiline_not_supported
|
|
242
1149
|
def scope(*args: str, **kwargs: Any) -> None:
|
|
243
1150
|
scope = {"name": eval_var(_unquote(args[0]))}
|
|
244
1151
|
doc.nodes.append(scope)
|
|
@@ -255,9 +1162,21 @@ def parse(
|
|
|
255
1162
|
doc.description = description
|
|
256
1163
|
|
|
257
1164
|
def sql(var_name: str, **kwargs: Any) -> Callable[[str, KwArg(Any)], None]:
|
|
258
|
-
|
|
1165
|
+
# TODO(eclbg): We shouldn't allow SQL in datasource files
|
|
1166
|
+
def _f(sql: str, *args: Any, **kwargs: Any) -> None:
|
|
1167
|
+
if not parser_state.multiline:
|
|
1168
|
+
raise DatafileSyntaxError(
|
|
1169
|
+
"SQL must be multiline",
|
|
1170
|
+
hint="Use > to start a multiline SQL block",
|
|
1171
|
+
lineno=kwargs["lineno"],
|
|
1172
|
+
pos=1,
|
|
1173
|
+
)
|
|
259
1174
|
if not parser_state.current_node:
|
|
260
|
-
raise
|
|
1175
|
+
raise DatafileSyntaxError(
|
|
1176
|
+
"SQL must be called after a NODE command",
|
|
1177
|
+
lineno=kwargs["lineno"],
|
|
1178
|
+
pos=1,
|
|
1179
|
+
)
|
|
261
1180
|
parser_state.current_node[var_name] = (
|
|
262
1181
|
textwrap.dedent(sql).rstrip() if "%" not in sql.strip()[0] else sql.strip()
|
|
263
1182
|
)
|
|
@@ -268,20 +1187,28 @@ def parse(
|
|
|
268
1187
|
def assign_node_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
|
|
269
1188
|
def _f(*args: str, **kwargs: Any) -> None:
|
|
270
1189
|
if not parser_state.current_node:
|
|
271
|
-
raise
|
|
1190
|
+
raise DatafileSyntaxError(
|
|
1191
|
+
f"{v} must be called after a NODE command",
|
|
1192
|
+
lineno=kwargs["lineno"],
|
|
1193
|
+
pos=1,
|
|
1194
|
+
)
|
|
272
1195
|
return assign_var(v)(*args, **kwargs)
|
|
273
1196
|
|
|
274
1197
|
return _f
|
|
275
1198
|
|
|
1199
|
+
@multiline_not_supported
|
|
276
1200
|
def add_token(*args: str, **kwargs: Any) -> None: # token_name, permissions):
|
|
1201
|
+
# lineno = kwargs["lineno"]
|
|
277
1202
|
if len(args) < 2:
|
|
278
|
-
raise
|
|
1203
|
+
raise DatafileSyntaxError(
|
|
1204
|
+
message='TOKEN takes two params: token name and permissions e.g TOKEN "read api token" READ',
|
|
1205
|
+
lineno=lineno,
|
|
1206
|
+
pos=1,
|
|
1207
|
+
)
|
|
1208
|
+
# TODO(eclbg): We should validate that the permissions are a valid string. We only support READ for pipes and
|
|
1209
|
+
# APPEND for datasources
|
|
279
1210
|
doc.tokens.append({"token_name": _unquote(args[0]), "permissions": args[1]})
|
|
280
1211
|
|
|
281
|
-
def test(*args: str, **kwargs: Any) -> None:
|
|
282
|
-
# TODO: Should be removed?
|
|
283
|
-
print("test", args, kwargs) # noqa: T201
|
|
284
|
-
|
|
285
1212
|
def include(*args: str, **kwargs: Any) -> None:
|
|
286
1213
|
f = _unquote(args[0])
|
|
287
1214
|
f = eval_var(f)
|
|
@@ -330,16 +1257,9 @@ def parse(
|
|
|
330
1257
|
except FileNotFoundError:
|
|
331
1258
|
raise IncludeFileNotFoundException(f, lineno)
|
|
332
1259
|
|
|
1260
|
+
@deprecated
|
|
333
1261
|
def version(*args: str, **kwargs: Any) -> None:
|
|
334
|
-
|
|
335
|
-
raise ParseException("VERSION gets one positive integer param")
|
|
336
|
-
try:
|
|
337
|
-
version = int(args[0])
|
|
338
|
-
if version < 0:
|
|
339
|
-
raise ValidationException("version must be a positive integer e.g VERSION 2")
|
|
340
|
-
doc.version = version
|
|
341
|
-
except ValueError:
|
|
342
|
-
raise ValidationException("version must be a positive integer e.g VERSION 2")
|
|
1262
|
+
pass # whatever, it's deprecated
|
|
343
1263
|
|
|
344
1264
|
def shared_with(*args: str, **kwargs: Any) -> None:
|
|
345
1265
|
for entries in args:
|
|
@@ -381,13 +1301,10 @@ def parse(
|
|
|
381
1301
|
doc.filtering_tags += filtering_tags
|
|
382
1302
|
|
|
383
1303
|
cmds = {
|
|
384
|
-
"from": assign("from"),
|
|
385
1304
|
"source": sources,
|
|
386
1305
|
"maintainer": assign("maintainer"),
|
|
387
1306
|
"schema": schema,
|
|
388
1307
|
"indexes": indexes,
|
|
389
|
-
# TODO: Added to be able to merge MR 11347, let's remove it afterwards
|
|
390
|
-
"indices": indexes,
|
|
391
1308
|
"engine": set_engine,
|
|
392
1309
|
"partition_key": assign_var("partition_key"),
|
|
393
1310
|
"sorting_key": assign_var("sorting_key"),
|
|
@@ -408,7 +1325,6 @@ def parse(
|
|
|
408
1325
|
"resource": assign_node_var("resource"),
|
|
409
1326
|
"filter": assign_node_var("filter"),
|
|
410
1327
|
"token": add_token,
|
|
411
|
-
"test": test,
|
|
412
1328
|
"include": include,
|
|
413
1329
|
"sql": sql("sql"),
|
|
414
1330
|
"version": version,
|
|
@@ -462,10 +1378,11 @@ def parse(
|
|
|
462
1378
|
if default_node:
|
|
463
1379
|
node(default_node)
|
|
464
1380
|
|
|
465
|
-
lineno =
|
|
1381
|
+
lineno = 1
|
|
466
1382
|
try:
|
|
467
|
-
while lineno
|
|
468
|
-
line = lines[lineno]
|
|
1383
|
+
while lineno <= len(lines):
|
|
1384
|
+
line = lines[lineno - 1]
|
|
1385
|
+
# shlex.shlex(line) removes comments that start with #. This doesn't affect multiline commands
|
|
469
1386
|
try:
|
|
470
1387
|
sa = shlex.shlex(line)
|
|
471
1388
|
sa.whitespace_split = True
|
|
@@ -479,23 +1396,37 @@ def parse(
|
|
|
479
1396
|
if (
|
|
480
1397
|
parser_state.multiline
|
|
481
1398
|
and cmd.lower() in cmds
|
|
482
|
-
and not (line.startswith(" ") or line.startswith("\t")
|
|
1399
|
+
and not (line.startswith(" ") or line.startswith("\t"))
|
|
483
1400
|
):
|
|
484
|
-
parser_state.multiline = False
|
|
485
1401
|
cmds[parser_state.command](
|
|
486
|
-
parser_state.multiline_string,
|
|
1402
|
+
parser_state.multiline_string,
|
|
1403
|
+
lineno=lineno,
|
|
1404
|
+
replace_includes=replace_includes,
|
|
1405
|
+
cmd=parser_state.command,
|
|
487
1406
|
)
|
|
1407
|
+
parser_state.multiline = False
|
|
488
1408
|
|
|
489
1409
|
if not parser_state.multiline:
|
|
490
1410
|
if len(args) >= 1 and args[0] == ">":
|
|
491
1411
|
parser_state.multiline = True
|
|
492
1412
|
parser_state.command = cmd.lower()
|
|
1413
|
+
parser_state.start_lineno = lineno
|
|
493
1414
|
parser_state.multiline_string = ""
|
|
494
1415
|
else:
|
|
495
1416
|
if cmd.lower() == "settings":
|
|
496
|
-
|
|
1417
|
+
msg = (
|
|
1418
|
+
"SETTINGS option is not allowed, use ENGINE_SETTINGS instead. See "
|
|
1419
|
+
"https://www.tinybird.co/docs/cli/datafiles#data-source for more information."
|
|
1420
|
+
)
|
|
1421
|
+
raise DatafileSyntaxError(
|
|
1422
|
+
# TODO(eclbg): add surrounding lines as context to the error so we can print it
|
|
1423
|
+
# offending_line=line,
|
|
1424
|
+
message=msg,
|
|
1425
|
+
lineno=lineno,
|
|
1426
|
+
pos=0,
|
|
1427
|
+
)
|
|
497
1428
|
if cmd.lower() in cmds:
|
|
498
|
-
cmds[cmd.lower()](*args, lineno=lineno, replace_includes=replace_includes)
|
|
1429
|
+
cmds[cmd.lower()](*args, lineno=lineno, replace_includes=replace_includes, cmd=cmd)
|
|
499
1430
|
else:
|
|
500
1431
|
raise click.ClickException(FeedbackManager.error_option(option=cmd.upper()))
|
|
501
1432
|
else:
|
|
@@ -503,11 +1434,20 @@ def parse(
|
|
|
503
1434
|
lineno += 1
|
|
504
1435
|
# close final state
|
|
505
1436
|
if parser_state.multiline:
|
|
506
|
-
cmds[parser_state.command](
|
|
1437
|
+
cmds[parser_state.command](
|
|
1438
|
+
parser_state.multiline_string,
|
|
1439
|
+
lineno=lineno,
|
|
1440
|
+
replace_includes=replace_includes,
|
|
1441
|
+
cmd=parser_state.command,
|
|
1442
|
+
)
|
|
1443
|
+
except DatafileSyntaxError as e:
|
|
1444
|
+
# When the error is in a multiline block, add the start lineno to the error lineno so the error location is in
|
|
1445
|
+
# respect to the whole file
|
|
1446
|
+
if parser_state.multiline:
|
|
1447
|
+
e.lineno += parser_state.start_lineno
|
|
1448
|
+
raise e
|
|
507
1449
|
except ParseException as e:
|
|
508
1450
|
raise ParseException(str(e), lineno=lineno)
|
|
509
|
-
except ValidationException as e:
|
|
510
|
-
raise ValidationException(str(e), lineno=lineno)
|
|
511
1451
|
except IndexError as e:
|
|
512
1452
|
if "node" in line.lower():
|
|
513
1453
|
raise click.ClickException(FeedbackManager.error_missing_node_name())
|