tinybird 0.0.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tinybird might be problematic.
- tinybird/__cli__.py +8 -0
- tinybird/ch_utils/constants.py +244 -0
- tinybird/ch_utils/engine.py +855 -0
- tinybird/check_pypi.py +25 -0
- tinybird/client.py +1281 -0
- tinybird/config.py +117 -0
- tinybird/connectors.py +428 -0
- tinybird/context.py +23 -0
- tinybird/datafile.py +5589 -0
- tinybird/datatypes.py +434 -0
- tinybird/feedback_manager.py +1022 -0
- tinybird/git_settings.py +145 -0
- tinybird/sql.py +865 -0
- tinybird/sql_template.py +2343 -0
- tinybird/sql_template_fmt.py +281 -0
- tinybird/sql_toolset.py +350 -0
- tinybird/syncasync.py +682 -0
- tinybird/tb_cli.py +25 -0
- tinybird/tb_cli_modules/auth.py +252 -0
- tinybird/tb_cli_modules/branch.py +1043 -0
- tinybird/tb_cli_modules/cicd.py +434 -0
- tinybird/tb_cli_modules/cli.py +1571 -0
- tinybird/tb_cli_modules/common.py +2082 -0
- tinybird/tb_cli_modules/config.py +344 -0
- tinybird/tb_cli_modules/connection.py +803 -0
- tinybird/tb_cli_modules/datasource.py +900 -0
- tinybird/tb_cli_modules/exceptions.py +91 -0
- tinybird/tb_cli_modules/fmt.py +91 -0
- tinybird/tb_cli_modules/job.py +85 -0
- tinybird/tb_cli_modules/pipe.py +858 -0
- tinybird/tb_cli_modules/regions.py +9 -0
- tinybird/tb_cli_modules/tag.py +100 -0
- tinybird/tb_cli_modules/telemetry.py +310 -0
- tinybird/tb_cli_modules/test.py +107 -0
- tinybird/tb_cli_modules/tinyunit/tinyunit.py +340 -0
- tinybird/tb_cli_modules/tinyunit/tinyunit_lib.py +71 -0
- tinybird/tb_cli_modules/token.py +349 -0
- tinybird/tb_cli_modules/workspace.py +269 -0
- tinybird/tb_cli_modules/workspace_members.py +212 -0
- tinybird/tornado_template.py +1194 -0
- tinybird-0.0.1.dev0.dist-info/METADATA +2815 -0
- tinybird-0.0.1.dev0.dist-info/RECORD +45 -0
- tinybird-0.0.1.dev0.dist-info/WHEEL +5 -0
- tinybird-0.0.1.dev0.dist-info/entry_points.txt +2 -0
- tinybird-0.0.1.dev0.dist-info/top_level.txt +4 -0
tinybird/sql.py
ADDED
@@ -0,0 +1,865 @@
import logging
import re
import string
from collections import namedtuple
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional

valid_chars_name: str = string.ascii_letters + string.digits + "._`*<>+-'"
valid_chars_fn: str = valid_chars_name + "[](),=!?:/ \n\t\r"


@dataclass
class TableIndex:
    """Defines a CH table INDEX"""

    name: str
    expr: str
    type_full: str
    granularity: Optional[str] = None

    def to_datafile(self):
        granularity_expr = f"GRANULARITY {self.granularity}" if self.granularity else ""
        return f"{self.name} {self.expr} TYPE {self.type_full} {granularity_expr}"

    def to_sql(self):
        return f"INDEX {self.to_datafile()}"

    def add_index_sql(self):
        return f"ADD {self.to_sql()}"

    def drop_index_sql(self):
        return f"DROP INDEX IF EXISTS {self.name}"

    def materialize_index_sql(self):
        return f"MATERIALIZE INDEX IF EXISTS {self.name}"

    def clear_index_sql(self):
        return f"CLEAR INDEX IF EXISTS {self.name}"


@dataclass
class TableProjection:
    """Defines a CH table PROJECTION"""

    name: str
    expr: str

    def to_datafile(self):
        return f"{self.name} ({self.expr})"

    def to_sql(self):
        return f"PROJECTION {self.to_datafile()}"

    def add_projection_sql(self):
        return f"ADD {self.to_sql()}"

    def drop_projection_sql(self):
        return f"DROP PROJECTION IF EXISTS {self.name}"

    def materialize_projection_sql(self):
        return f"MATERIALIZE PROJECTION IF EXISTS {self.name}"

    def clear_projection_sql(self):
        return f"CLEAR PROJECTION IF EXISTS {self.name}"


def as_subquery(sql: str) -> str:
    return f"""(\n{sql}\n)"""


def get_format(sql: str) -> Optional[str]:
    """
    retrieves FORMAT from CH sql
    >>> get_format('select * from test')
    >>> get_format('select * from test formAt JSON')
    'JSON'
    """
    FORMAT_RE = r"\s+format\s+(\w+)\s*$"
    sql = sql.strip()
    format = re.findall(FORMAT_RE, sql, re.I)
    return format[0] if format else None


def get_format_group(sql: str) -> str:
    """
    retrieves FORMAT group from CH sql
    >>> get_format_group('select * from test')
    ''
    >>> get_format_group('select * from test formAt JSON')
    ' formAt JSON'
    """
    FORMAT_RE = r"\s+format\s+(\w+)\s*$"
    sql = sql.strip()
    format = re.search(FORMAT_RE, sql, re.I)
    return format.group() if format else ""


def wrap_finalize_aggregation(sql: str, describe_result: Dict[str, Any], fm_group: Optional[str] = None) -> str:
    if not fm_group:
        fm_group = get_format_group(sql)
    sql = sql[0 : -len(fm_group)] if fm_group else sql

    qq: str
    if describe_result:
        columns: List[str] = [
            f"finalizeAggregation({c['name']}) as {c['name']}"
            if "Aggregate" in c["type"] and "SimpleAggregate" not in c["type"]
            else f"{c['name']}"
            for c in describe_result["data"]
        ]
        columns_as_string: str = ",\n\t".join(columns)
        sql = sql.replace("\n", "\n\t")
        qq = f"SELECT \n\t{columns_as_string} \nFROM ({sql} \n) {fm_group}"
    else:
        qq = sql
    return qq
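
# Illustrative sketch (added by the editor; not part of the original file): with a
# DESCRIBE result containing one AggregateFunction column, the query is wrapped so
# the aggregate state is finalized on read:
#   wrap_finalize_aggregation(
#       "select c from t",
#       {"data": [{"name": "c", "type": "AggregateFunction(sum, Int32)"}]},
#   )
#   -> 'SELECT \n\tfinalizeAggregation(c) as c \nFROM (select c from t \n) '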


def remove_format(sql: str) -> str:
    """
    removes FORMAT from CH sql
    >>> remove_format('select * from test')
    'select * from test'
    >>> remove_format('select * from test formAt JSON')
    'select * from test'
    """
    FORMAT_RE = r"\s+(format)\s+(\w+)\s*$"
    sql = sql.strip()
    return re.sub(FORMAT_RE, "", sql, flags=re.I)


def col_name(name: str, backquotes: bool = True) -> str:
    """
    >>> col_name('`test`', True)
    '`test`'
    >>> col_name('`test`', False)
    'test'
    >>> col_name('test', True)
    '`test`'
    >>> col_name('test', False)
    'test'
    >>> col_name('', True)
    ''
    >>> col_name('', False)
    ''
    """
    if not name:
        return name
    if name[0] == "`" and name[-1] == "`":
        return name if backquotes else name[1:-1]
    return f"`{name}`" if backquotes else name


def try_to_fix_nullable_in_simple_aggregating_function(t: str) -> Optional[str]:
    # This workaround is to fix: https://github.com/ClickHouse/ClickHouse/issues/34407.
    # In the case of nullable columns and SimpleAggregateFunction Clickhouse returns
    # Nullable(SimpleAggregateFunction(sum, Int32)) instead of SimpleAggregateFunction(sum, Nullable(Int32))
    # as it is done with other aggregate functions.
    # If not, the aggregation could return incorrect results.
    result = None
    if match := re.search(r"SimpleAggregateFunction\((\w+),\s*(?!(?:Nullable))([\w,.()]+)\)", t):
        fn = match.group(1)
        inner_type = match.group(2)
        result = f"SimpleAggregateFunction({fn}, Nullable({inner_type}))"
    return result
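
# Illustrative sketch (added by the editor; not part of the original file):
#   try_to_fix_nullable_in_simple_aggregating_function("SimpleAggregateFunction(sum, Int32)")
#     -> 'SimpleAggregateFunction(sum, Nullable(Int32))'
#   try_to_fix_nullable_in_simple_aggregating_function("SimpleAggregateFunction(sum, Nullable(Int32))")
#     -> None  (the negative lookahead skips types that are already Nullable)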


def schema_to_sql_columns(schema: List[Dict[str, Any]]) -> List[str]:
    """return an array with each column in SQL
    >>> schema_to_sql_columns([{'name': 'temperature', 'type': 'Float32', 'codec': None, 'default_value': None, 'nullable': False, 'normalized_name': 'temperature'}, {'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature` Float32', '`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4))']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': '', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature_delta` Float32 MATERIALIZED temperature']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': '', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature_delta` Float32 CODEC(Delta(4), LZ4))']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature_delta` Float32']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta', 'jsonpath': '$.temperature_delta'}])
    ['`temperature_delta` Float32 `json:$.temperature_delta`']
    >>> schema_to_sql_columns([{'name': 'aggregation', 'type': 'SimpleAggregateFunction(sum, Int32)', 'nullable': True, 'normalized_name': 'aggregation', 'jsonpath': '$.aggregation'}])
    ['`aggregation` SimpleAggregateFunction(sum, Nullable(Int32)) `json:$.aggregation`']
    """
    columns: List[str] = []
    for x in schema:
        name = x["normalized_name"] if "normalized_name" in x else x["name"]
        if x["nullable"]:
            if (_type := try_to_fix_nullable_in_simple_aggregating_function(x["type"])) is None:
                _type = "Nullable(%s)" % x["type"]
        else:
            _type = x["type"]
        parts = [col_name(name, backquotes=True), _type]
        if x.get("jsonpath", None):
            parts.append(f"`json:{x['jsonpath']}`")
        if "default_value" in x and x["default_value"] not in ("", None):
            parts.append(x["default_value"])
        if "codec" in x and x["codec"] not in ("", None):
            parts.append(x["codec"])
        c = " ".join([x for x in parts if x]).strip()
        columns.append(c)
    return columns


def mark_error_string(s: str, i: int, line: int = 1) -> str:
    """
    >>> mark_error_string('0123456789', 0)
    '0123456789\\n^---'
    >>> mark_error_string('0123456789', 9)
    '0123456789\\n         ^---'
    >>> mark_error_string('01234\\n56789', 1)
    '01234\\n ^---'
    """
    marker = "^---"
    ss = s.splitlines()[line - 1] if s else ""
    start = 0
    end = len(ss)
    return ss[start:end] + "\n" + (" " * (i - start)) + marker


def format_parse_error(
    table_structure: str,
    i: int,
    position: int,
    hint: Optional[str] = None,
    line: int = 0,
    keyword: Optional[str] = None,
) -> str:
    adjusted_position = position - (len(keyword) if keyword else 0)
    message = f"{hint}\n" if hint else ""
    message += mark_error_string(table_structure, adjusted_position - 1, line=line)

    if keyword:
        message += f" found at position {adjusted_position - len(keyword)}"
    else:
        message += (
            f" found {repr(table_structure[i]) if len(table_structure)>i else 'EOF'} at position {adjusted_position}"
        )
    return message


def parse_indexes_structure(indexes: Optional[List[str]]) -> List[TableIndex]:
    """
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["INDEX index_name a TYPE set(100) GRANULARITY 100", " INDEX index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name type TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='type', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100,", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name a TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter(0.001)"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity=None)]
    >>> parse_indexes_structure(["index_name u64 * length(s) TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter"])
    [TableIndex(name='index_name', expr='u64 * length(s)', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter', granularity=None)]
    >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4,1024,1,42) GRANULARITY 1"])
    [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4,1024,1,42)', granularity='1')]
    >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["index_name u64 * length(s)"])
    Traceback (most recent call last):
    ...
    ValueError: invalid INDEX format. Usage: `name expr TYPE type_full GRANULARITY granularity`
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100, index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    Traceback (most recent call last):
    ...
    ValueError: invalid INDEX format. Usage: `name expr TYPE type_full GRANULARITY granularity`
    >>> parse_indexes_structure(["my_index m['key'] TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='my_index', expr="m['key']", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["my_index_lambda arrayMap(x -> tupleElement(x,'message'), column_name) TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='my_index_lambda', expr="arrayMap(x -> tupleElement(x,'message'), column_name)", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["ip_range_minmax_idx (toIPv6(ip_range_start), toIPv6(ip_range_end)) TYPE minmax GRANULARITY 1"])
    [TableIndex(name='ip_range_minmax_idx', expr='(toIPv6(ip_range_start), toIPv6(ip_range_end))', type_full='minmax', granularity='1')]
    """
    parsed_indices: List[TableIndex] = []
    if not indexes:
        return parsed_indices

    for index in indexes:
        index = index.strip().rstrip(",")
        index = index.lstrip("INDEX").strip()
        if index.count("TYPE") != 1:
            raise ValueError("invalid INDEX format. Usage: `name expr TYPE type_full GRANULARITY granularity`")

        match = re.match(
            r"(\w+)\s+([\w\s*\[\]\*\(\),\'\"-><.]+)\s+TYPE\s+(\w+)(?:\(([\w\s*.,]+)\))?(?:\s+GRANULARITY\s+(\d+))?",
            index,
        )
        if match:
            index_name, a, index_type, value, granularity = match.groups()
            index_expr = f"{index_type}({value})" if value else index_type
            parsed_indices.append(TableIndex(index_name, a.strip(), f"{index_expr}", granularity))
        else:
            raise ValueError("invalid INDEX format. Usage: `name expr TYPE type_full GRANULARITY granularity`")
    return parsed_indices


def parse_table_structure(schema: str) -> List[Dict[str, Any]]:
    """This parses the SQL schema for a CREATE TABLE
    Columns follow the syntax: name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1][,]
    Reference: https://clickhouse.tech/docs/en/sql-reference/statements/create/table/#syntax-forms

    >>> parse_table_structure('c Float32, b String')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Float32,--comment\\nb String')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Float32,--comment\\nb String --another-comment')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Float32 --first-comment\\n,--comment\\nb String --another-comment')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('--random comment here\\nc Float32 --another comment\\n,--another one\\nb String --this is the last one')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('--extra comment\\nc--extra comment\\nFloat32--extra comment\\n,--extra comment\\nb--extra comment\\nString--extra comment')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Nullable(Float32)')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]

    >>> parse_table_structure('c Nullable(Float32) DEFAULT NULL')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]

    >>> parse_table_structure("c String DEFAULT 'bla'")
    [{'name': 'c', 'type': 'String', 'codec': None, 'default_value': "DEFAULT 'bla'", 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]

    >>> parse_table_structure('`foo.bar` UInt64')
    [{'name': 'foo.bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo.bar'}]

    >>> parse_table_structure('double_value Float64 CODEC(LZ4HC(2))')
    [{'name': 'double_value', 'type': 'Float64', 'codec': 'CODEC(LZ4HC(2))', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'double_value'}]
    >>> parse_table_structure('doubl/e_value Float64 CODEC(LZ4HC(2))')
    Traceback (most recent call last):
    ...
    ValueError: wrong value, please check the schema syntax
    doubl/e_value Float64 CODEC(LZ4HC(2))
         ^--- found '/' at position 6
    >>> parse_table_structure('`c` Nullable(Float32)')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]
    >>> parse_table_structure('wadus INT UNSIGNED')
    [{'name': 'wadus', 'type': 'INT UNSIGNED', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'wadus'}]
    >>> parse_table_structure('c Int32 CODEC(Delta, LZ4)\\n')
    [{'name': 'c', 'type': 'Int32', 'codec': 'CODEC(Delta, LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
    >>> parse_table_structure('c SimpleAggregateFunction(sum, Int32),\\np SimpleAggregateFunction(sum, Int32)')
    Traceback (most recent call last):
    ...
    ValueError: Incompatible data types between aggregate function 'sum' which returns Int64 and column storage type Int32
    >>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized b*2\\n')
    Traceback (most recent call last):
    ...
    ValueError: Unexpected MATERIALIZED after CODEC
    c Int32 CODEC(Delta, LZ4) Materialized b*2
                             ^--- found ' ' at position 26
    >>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized ifNull(b*2, 0)\\n')
    Traceback (most recent call last):
    ...
    ValueError: Unexpected MATERIALIZED after CODEC
    c Int32 CODEC(Delta, LZ4) Materialized ifNull(b*2, 0)
                             ^--- found ' ' at position 26
    >>> parse_table_structure('c Int32 Materialized b*2\\n')
    [{'name': 'c', 'type': 'Int32', 'codec': None, 'default_value': 'MATERIALIZED b*2', 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
    >>> parse_table_structure('c Int32 Materialized b != 1 ? b*2: pow(b, 3)\\n')
    [{'name': 'c', 'type': 'Int32', 'codec': None, 'default_value': 'MATERIALIZED b != 1 ? b*2: pow(b, 3)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
    >>> parse_table_structure('')
    []
    >>> parse_table_structure('`date` Date,`timezone` String,`offset` Int32')
    [{'name': 'date', 'type': 'Date', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'date'}, {'name': 'timezone', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'timezone'}, {'name': 'offset', 'type': 'Int32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'offset'}]
    >>> parse_table_structure('c Int32 Materialized b*2 CODEC(Delta, LZ4)\\n')
    [{'name': 'c', 'type': 'Int32', 'codec': 'CODEC(Delta, LZ4)', 'default_value': 'MATERIALIZED b*2', 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
    >>> parse_table_structure('c Int32 Materialized ifNull(b*2, 0) CODEC(Delta, LZ4)\\n')
    [{'name': 'c', 'type': 'Int32', 'codec': 'CODEC(Delta, LZ4)', 'default_value': 'MATERIALIZED ifNull(b*2, 0)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
    >>> parse_table_structure('`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)')
    [{'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4)', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_delta'}]
    >>> parse_table_structure('foo^bar Float32')
    Traceback (most recent call last):
    ...
    ValueError: wrong value, please check the schema syntax
    foo^bar Float32
       ^--- found '^' at position 4
    >>> parse_table_structure('foo Float#32')
    Traceback (most recent call last):
    ...
    ValueError: wrong value, please check the schema syntax
    foo Float#32
             ^--- found '#' at position 10
    >>> parse_table_structure('foo Float32 DEFAULT 13, bar UInt64')
    [{'name': 'foo', 'type': 'Float32', 'codec': None, 'default_value': 'DEFAULT 13', 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo'}, {'name': 'bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'bar'}]
    >>> parse_table_structure('foo Float32 DEFAULT 1$$$3')
    Traceback (most recent call last):
    ...
    ValueError: wrong value, please check the schema syntax
    foo Float32 DEFAULT 1$$$3
                         ^--- found '$' at position 22
    >>> parse_table_structure('foo Float32 CODEC(Delta(4), LZ#4)')
    Traceback (most recent call last):
    ...
    ValueError: wrong value, please check the schema syntax
    foo Float32 CODEC(Delta(4), LZ#4)
                                  ^--- found '#' at position 31
    >>> parse_table_structure('\\n `temperature` Float32,\\n `temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)\\n ')
    [{'name': 'temperature', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature'}, {'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4)', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_delta'}]
    >>> parse_table_structure('temperature Float32, temperature_delta Float32 MATERIALIZED temperature Codec(Delta(4)), temperature_doubledelta Float32 MATERIALIZED temperature Codec(DoubleDelta), temperature_doubledelta_lz4 Float32 MATERIALIZED temperature Codec(DoubleDelta, LZ4)')
    [{'name': 'temperature', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature'}, {'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4))', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_delta'}, {'name': 'temperature_doubledelta', 'type': 'Float32', 'codec': 'CODEC(DoubleDelta)', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_doubledelta'}, {'name': 'temperature_doubledelta_lz4', 'type': 'Float32', 'codec': 'CODEC(DoubleDelta, LZ4)', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_doubledelta_lz4'}]
    >>> parse_table_structure('t UInt8 CODEC(Delta(1), LZ4)')
    [{'name': 't', 'type': 'UInt8', 'codec': 'CODEC(Delta(1), LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 't'}]
    >>> parse_table_structure('tt UInt8 MATERIALIZED t')
    [{'name': 'tt', 'type': 'UInt8', 'codec': None, 'default_value': 'MATERIALIZED t', 'jsonpath': None, 'nullable': False, 'normalized_name': 'tt'}]
    >>> parse_table_structure('tt UInt8 MATERIALIZED t CODEC(Delta(1), LZ4)')
    [{'name': 'tt', 'type': 'UInt8', 'codec': 'CODEC(Delta(1), LZ4)', 'default_value': 'MATERIALIZED t', 'jsonpath': None, 'nullable': False, 'normalized_name': 'tt'}]
    >>> parse_table_structure('tt SimpleAggregateFunction(any, Nullable(UInt8))')
    [{'name': 'tt', 'type': 'SimpleAggregateFunction(any, Nullable(UInt8))', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'tt'}]
    >>> parse_table_structure("timestamp DateTime MATERIALIZED toDateTime(JSONExtractInt(JSONExtractRaw(record, 'payload'), 'timestamp') / 1000)")
    [{'name': 'timestamp', 'type': 'DateTime', 'codec': None, 'default_value': "MATERIALIZED toDateTime(JSONExtractInt(JSONExtractRaw(record, 'payload'), 'timestamp') / 1000)", 'jsonpath': None, 'nullable': False, 'normalized_name': 'timestamp'}]
    >>> parse_table_structure("`test_default_cast` DEFAULT plus(13,1)")
    [{'name': 'test_default_cast', 'type': '', 'codec': None, 'default_value': 'DEFAULT plus(13,1)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'test_default_cast'}]
    >>> parse_table_structure("hola Int, `materialized` String MATERIALIZED upper(no_nullable_string)")
    [{'name': 'hola', 'type': 'Int', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'hola'}, {'name': 'materialized', 'type': 'String', 'codec': None, 'default_value': 'MATERIALIZED upper(no_nullable_string)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'materialized'}]
    >>> parse_table_structure('`a2` String `json:$.a2`, `a3` String `json:$.a3`\\n')
    [{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]
    >>> parse_table_structure("`arr` Array(String) DEFAULT ['-']")
    [{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT ['-']", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]
    >>> parse_table_structure("`arr` Array(String) DEFAULT array('-')")
    [{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT array('-')", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]
    >>> parse_table_structure('`a2` Float32 CODEC(Delta, ZSTD(4)) `json:$.a2`, `a3` String `json:$.a3`\\n')
    [{'name': 'a2', 'type': 'Float32', 'codec': 'CODEC(Delta, ZSTD(4))', 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]
    >>> parse_table_structure('`a` String, INDEX index_name a TYPE set(100) GRANULARITY 100')
    [{'name': 'a', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'a'}]
    >>> parse_table_structure('`a` String, INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
    [{'name': 'a', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'a'}]
    >>> parse_table_structure('`index` String, INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
    [{'name': 'index', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'index'}]
    >>> parse_table_structure('`a2` String `json:$.a--2`, `a3` String `json:$.a3`\\n')
    [{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a--2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]
    """
    return _parse_table_structure(schema)


def clean_comments(schema_to_clean: str) -> str:
    """Remove the comments from the schema
    if the comments are between backticks, they will not be removed
    >>> clean_comments(None) is None
    True
    >>> clean_comments('')
    ''
    >>> clean_comments(' ')
    ''
    >>> clean_comments('\\n')
    ''
    >>> clean_comments('\\n\\n\\n\\n')
    ''
    >>> clean_comments('c Float32')
    'c Float32'
    >>> clean_comments('c Float32\\n')
    'c Float32'
    >>> clean_comments('c Float32\\n--this is a comment')
    'c Float32'
    >>> clean_comments('c Float32\\n--this is a comment\\n')
    'c Float32'
    >>> clean_comments('c Float32\\t-- this is a comment\\t\\n')
    'c Float32'
    >>> clean_comments('c Float32\\n--this is a comment\\r\\n')
    'c Float32'
    >>> clean_comments('c Float32\\n--this is a comment\\n--this is a comment2\\n')
    'c Float32'
    >>> clean_comments('c Float32\\n--this is a ```comment\\n')
    'c Float32'
    >>> clean_comments('c Float32\\n--this is a ```comment\\n')
    'c Float32'
    >>> clean_comments('c Float32, -- comment\\nd Float32 -- comment2')
    'c Float32,\\nd Float32'
    >>> clean_comments('c Float32, -- comment\\n -- comment \\nd Float32 -- comment2')
    'c Float32,\\nd Float32'
    >>> clean_comments('c Float32 `json:$.aa--aa`\\n--this is a ```comment\\n')
    'c Float32 `json:$.aa--aa`'
    >>> clean_comments('c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`\\n--this is a ```comment\\n')
    'c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`'
    >>> clean_comments('c--c Float32 `json:$.cc--cc`\\n')
    'c'
    >>> clean_comments('`c--c` Float32 `json:$.cc--cc`\\n')
    '`c'
    """

    def clean_line_comments(line: str) -> str:
        if not line:
            return line
        i = 0
        inside_json_path = False
        while i < len(line):
            if i + 1 < len(line) and line[i] == "-" and line[i + 1] == "-" and not inside_json_path:
                return line[:i].strip()

            if not inside_json_path and line[i:].startswith("`json:"):
                inside_json_path = True
            elif inside_json_path and line[i] == "`":
                inside_json_path = False
            i += 1
        return line

    if schema_to_clean is None:
        return schema_to_clean

    cleaned_schema = ""
    for line in schema_to_clean.splitlines():
        cleaned_line = clean_line_comments(line)
        if cleaned_line:
            cleaned_schema += cleaned_line + "\n"
    return cleaned_schema.strip()


SyntaxExpr = namedtuple("SyntaxExpr", ["name", "regex"])

NULL = SyntaxExpr("NULL", re.compile(r"\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
NOTNULL = SyntaxExpr("NOTNULL", re.compile(r"\s+NOT\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
DEFAULT = SyntaxExpr("DEFAULT", re.compile(r"\s+DEFAULT([^a-z0-9_]|$)", re.IGNORECASE))
MATERIALIZED = SyntaxExpr("MATERIALIZED", re.compile(r"\s+MATERIALIZED([^a-z0-9_]|$)", re.IGNORECASE))
ALIAS = SyntaxExpr("ALIAS", re.compile(r"\s+ALIAS([^a-z0-9_]|$)", re.IGNORECASE))
CODEC = SyntaxExpr("CODEC", re.compile(r"\s+CODEC([^a-z0-9_]|$)", re.IGNORECASE))
TTL = SyntaxExpr("TTL", re.compile(r"\s+TTL([^a-z0-9_]|$)", re.IGNORECASE))
JSONPATH = SyntaxExpr("JSONPATH", re.compile(r"\s+`json:", re.IGNORECASE))
COMMA = SyntaxExpr("COMMA", re.compile(r",", re.IGNORECASE))
NEW_LINE = SyntaxExpr("NEW_LINE", re.compile(r"\s$"))
TYPE = SyntaxExpr("TYPE", re.compile(r""))  # TYPE doesn't have a fixed initial string

REGEX_WHITESPACE = re.compile(r"\s*")
REGEX_COMMENT = re.compile(r"\-\-[^\n\r]*[\n\r]")
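
# Illustrative note (added by the editor; not part of the original file): each of
# these regexes is applied with .match() at the current parse offset, so, for
# example, DEFAULT.regex matches a tail like " DEFAULT 13" but not "mydefault":
# the leading \s+ requires whitespace before the keyword, and the ([^a-z0-9_]|$)
# group requires a non-identifier character (or end of input) right after it.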


def _parse_table_structure(schema: str) -> List[Dict[str, Any]]:
    # CH syntax from https://clickhouse.com/docs/en/sql-reference/statements/create/table/
    # name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1]
    try:
        schema = clean_comments(schema + "\n")
    except Exception as e:
        logging.exception(f"Error cleaning comments: {e}")
        schema = REGEX_COMMENT.sub(" ", schema + "\n").strip()

    if REGEX_WHITESPACE.fullmatch(schema):
        return []

    i: int = 0

    # For error feedback only
    line: int = 1
    pos: int = 1

    # Find the first SyntaxExpr in lookup that matches the schema at the current offset
    def lookahead_matches(lookup: Iterable) -> Optional[SyntaxExpr]:
        s = schema[i:]
        match = next((x for x in lookup if x.regex.match(s)), None)
        return match

    def advance_single_char() -> None:
        nonlocal i, line, pos
        if schema[i] == "\n":
            line += 1
            pos = 1
        else:
            pos += 1
        i += 1

    # Advance all whitespaces characters and then len(s) more chars
    def advance(s: str) -> None:
        if i < len(schema):
            while schema[i] in " \t\r\n":
                advance_single_char()
            for _ in s:
                advance_single_char()

    def get_backticked() -> str:
        begin = i
        while i < len(schema):
            c = schema[i]
            advance_single_char()
            if c == "`":
                return schema[begin : i - 1]
        raise ValueError(format_parse_error(schema, i, pos, "expecting ending backtick", line=line))

    def parse_name() -> str:
        nonlocal i, line, pos
        if schema[i] != "`":
            # regular name
            begin = i
            while i < len(schema):
                c = schema[i]
                if c in " \t\r\n":
                    return schema[begin:i]
                if c not in valid_chars_name:
                    raise ValueError(
                        format_parse_error(schema, i, pos, "wrong value, please check the schema syntax", line=line)
                    )
                advance_single_char()
            return schema[begin:i]
        else:
            # backticked name
            advance_single_char()
            return get_backticked()

    def parse_expr(lookup: Iterable[SyntaxExpr]) -> str:
        nonlocal i, line, pos

        begin: int = i
        context_stack: List[Optional[str]] = [None]
        while i < len(schema):
            context = context_stack[-1]
            c = schema[i]

            if (context == "'" and c == "'") or (context == '"' and c == '"') or (context == "(" and c == ")"):
                context_stack.pop()
            elif c == "'" and (context is None or context == "("):
                context_stack.append("'")
            elif c == '"' and (context is None or context == "("):
                context_stack.append('"')
            elif c == "(" and (context is None or context == "("):
                context_stack.append("(")
            elif context is None and lookahead_matches(lookup):
                return schema[begin:i].strip(" \t\r\n")
            elif (context is None and c not in valid_chars_fn) or (context == "(" and c not in valid_chars_fn):
                raise ValueError(
                    format_parse_error(schema, i, pos, "wrong value, please check the schema syntax", line=line)
                )
            advance_single_char()
        if i == begin:
            raise ValueError(format_parse_error(schema, i, pos, "wrong value", line=line))
        return schema[begin:].strip(" \t\r\n")
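
    # Illustrative note (added by the editor; not part of the original file):
    # parse_expr tracks quote/parenthesis context so that lookahead keywords and
    # commas inside strings or parentheses do not end the expression; e.g. in
    # "c String DEFAULT 'a,b'" the comma sits in a "'" context and is skipped,
    # while a top-level comma would terminate the DEFAULT expression.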

    columns: List[Dict[str, Any]] = []

    name: str = ""
    _type: str = ""
    default: str = ""
    materialized: str = ""
    codec: str = ""
    jsonpath: str = ""
    last: Optional[SyntaxExpr] = None

    def add_column(found: str) -> None:
        nonlocal name, _type, default, materialized, codec, jsonpath
        if name == "INDEX":
            return
        if not name:
            raise ValueError(
                format_parse_error(schema, i, pos, f"Syntax error: expecting NAME, found {found}", line=line)
            )
        default = "" if not default else f"DEFAULT {default}"
        materialized = "" if not materialized else f"MATERIALIZED {materialized}"
        codec = "" if not codec else f"CODEC{codec}"
        columns.append(
            {
                "name": name,
                "type": _type,
                "codec": codec,
                "default_value": default or materialized,
                "jsonpath": jsonpath,
            }
        )
        name = ""
        _type = ""
        default = ""
        materialized = ""
        codec = ""
        jsonpath = ""

    valid_next: List[SyntaxExpr] = [TYPE]
    while i < len(schema):
        if not name:
            advance("")
            valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, TYPE]
            name = parse_name()
            continue
        found = lookahead_matches(
            [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE, TYPE]
            if name != "INDEX"
            else [COMMA, NEW_LINE]
        )
        if found and found not in valid_next:
            after = f" after {last.name}" if last else ""
            raise ValueError(format_parse_error(schema, i, pos, f"Unexpected {found.name}{after}", line=line))
        if found == TYPE:
            advance("")
            valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE]
            detected_type = parse_expr([NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
            try:
                # Imported in the body to be compatible with the CLI
                from chtoolset.query import check_compatible_types

                # Check compatibility of the type with itself to verify it's a known type
                check_compatible_types(detected_type, detected_type)
            except ModuleNotFoundError:
                pass
            _type = detected_type
        elif found == NULL:
            # Not implemented
            raise ValueError(
                format_parse_error(schema, i, pos, "NULL column syntax not supported", line=line, keyword="NULL")
            )
        elif found == NOTNULL:
            # Not implemented
            raise ValueError(
                format_parse_error(
                    schema, i, pos, "NOT NULL column syntax not supported", line=line, keyword="NOT NULL"
                )
            )
        elif found == DEFAULT:
            advance("DEFAULT")
            valid_next = [CODEC, TTL, COMMA]
            default = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
        elif found == MATERIALIZED:
            advance("MATERIALIZED")
            valid_next = [CODEC, TTL, COMMA]
            materialized = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
        elif found == ALIAS:
            # Not implemented
            raise ValueError(format_parse_error(schema, i, pos, "ALIAS not supported", line=line, keyword="ALIAS"))
        elif found == CODEC:
            advance("CODEC")
            valid_next = [TTL, COMMA, JSONPATH]
            codec = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
        elif found == TTL:
            advance("TTL")
            # Not implemented
            raise ValueError(format_parse_error(schema, i, pos, "column TTL not supported", line=line, keyword="TTL"))
        elif found == JSONPATH:
            advance("`json:")
            jsonpath = get_backticked()
        elif found == COMMA:
            if name == "INDEX":
                advance(",")
                continue
            advance(",")
            valid_next = []
            add_column("COMMA")
        elif found == NEW_LINE or (name == "INDEX" and not found):
            i += 1
        else:
            raise ValueError(
                format_parse_error(
                    schema,
                    i,
                    pos,
                    "wrong value, expected a NULL, NOT NULL, DEFAULT, MATERIALIZED, CODEC, TTL expressions, a column data type, a comma, a new line or a jsonpath",
                    line=line,
                )
            )
        last = found
    add_column("EOF")

    # normalize columns
    for column in columns:
        nullable = column["type"].lower().startswith("nullable")
        column["type"] = column["type"] if not nullable else column["type"][len("Nullable(") : -1]  # ')'
        column["nullable"] = nullable
        column["codec"] = column["codec"] if column["codec"] else None
        column["name"] = column["name"]
        column["normalized_name"] = column["name"]
        column["jsonpath"] = column["jsonpath"] if column["jsonpath"] else None
        default_value = column["default_value"] if column["default_value"] else None
        if nullable and default_value and default_value.lower() == "default null":
            default_value = None
        column["default_value"] = default_value

    return columns


def engine_can_be_replicated(engine: Optional[str]) -> bool:
    """
    >>> engine_can_be_replicated('MergeTree() order by tuple()')
    True
    >>> engine_can_be_replicated('JOIN(ANY, LEFT, foo)')
    False
    >>> engine_can_be_replicated('ReplicatingMergeTree() order by tuple()')
    True
    >>> engine_can_be_replicated(None)
    False
    >>> engine_can_be_replicated("ReplicatedReplacingMergeTree('/clickhouse/tables/{layer}-{shard}/d_e7a588.t_ba45cd49a39c4649a1c7c2ec7adcaf43_t_be23eb990c394399854f8271c550fc36_staging', '{replica}', insert_date)")
    False
    """
    if not engine:
        return False
    lower_engine = engine.lower()
    return not lower_engine.startswith("Replicated".lower()) and "mergetree" in lower_engine


def engine_supports_delete(engine: Optional[str]) -> bool:
    """
    >>> engine_supports_delete('MergeTree() order by tuple()')
    True
    >>> engine_supports_delete('JOIN(ANY, LEFT, foo)')
    False
    >>> engine_supports_delete('ReplicatingMergeTree() order by tuple()')
    True
    >>> engine_supports_delete(None)
    False
    """
    if not engine:
        return False
    return "mergetree" in engine.lower()


def engine_replicated_to_local(engine: str) -> str:
    """
    >>> engine_replicated_to_local("ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/test.foo','{replica}') order by (test)")
    'MergeTree() order by (test)'
    >>> engine_replicated_to_local("ReplicatedReplacingMergeTree('/clickhouse/tables/{layer}-{shard}/test.foo', '{replica}', timestamp) order by (test)")
    'ReplacingMergeTree(timestamp) order by (test)'
    >>> engine_replicated_to_local("Join(ANY, LEFT, test)")
    'Join(ANY, LEFT, test)'
    >>> engine_replicated_to_local("ReplicatedVersionedCollapsingMergeTree('/clickhouse/tables/{layer}-{shard}/test.foo', '{replica}', sign,version) ORDER BY pk TTL toDate(local_timeplaced) + toIntervalDay(3) SETTINGS index_granularity = 8192")
    'VersionedCollapsingMergeTree(sign, version) ORDER BY pk TTL toDate(local_timeplaced) + toIntervalDay(3) SETTINGS index_granularity = 8192'
    """

    def _replace(m):
        parts = m.groups()
        s = parts[0] + "MergeTree("
        if parts[1]:
            tk = parts[1].split(",")
            if len(tk) > 2:  # remove key and {replica} part
                s += ", ".join([x.strip() for x in tk[2:]])
        s += ")" + parts[2]
        return s

    if "Replicated" not in engine:
        return engine

    return re.sub(r"Replicated(.*)MergeTree\(([^\)]*)\)(.*)", _replace, engine.strip())


def engine_patch_replicated_engine(engine: str, engine_full: Optional[str], new_table_name: str) -> Optional[str]:
    """
    >>> engine_patch_replicated_engine("ReplicatedMergeTree", "ReplicatedMergeTree('/clickhouse/tables/1-1/table_name', 'replica') PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192", 'table_name_staging')
    "ReplicatedMergeTree('/clickhouse/tables/1-1/table_name_staging', 'replica') PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
    >>> engine_patch_replicated_engine("ReplicatedMergeTree", "ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/sales_product_rank_rt_replicated_2', '{replica}') PARTITION BY toYYYYMM(date) ORDER BY (purchase_location, sku_rank_lc, date)", 'sales_product_rank_rt_replicated_2_staging')
    "ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/sales_product_rank_rt_replicated_2_staging', '{replica}') PARTITION BY toYYYYMM(date) ORDER BY (purchase_location, sku_rank_lc, date)"
    >>> engine_patch_replicated_engine("ReplicatedMergeTree", None, 't_000') is None
    True
    >>> engine_patch_replicated_engine("Log", "Log()", 't_000')
    'Log()'
    >>> engine_patch_replicated_engine("MergeTree", "MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024", 't_000')
    'MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024'
    """
    if not engine_full:
        return None
    if engine.lower().startswith("Replicated".lower()):
        parts = re.split(r"(Replicated.*MergeTree\(')([^']*)('.*)", engine_full)
        paths = parts[2].split("/")
        paths[-1] = new_table_name
        zoo_path = "/".join(paths)
        return "".join(parts[:2] + [zoo_path] + parts[3:])
    return engine_full


if __name__ == "__main__":
    print(  # noqa: T201
        _parse_table_structure(
            """hola Int --comment\n, `materialized` String --otro comment\n MATERIALIZED upper(no_nullable_string)"""
        )
    )
    # print(_parse_table_structure('@'))
    # print(mark_error_string('012345678901234567890123456789', 30))
    # print(_parse_table_structure('@'))
    # print(_parse_table_structure('`test_default_cast DEFAULT plus(13,1)'))
    # print(_parse_table_structure('`test_default_cast` DEFAULT plus(13,1)'))
    # print(_parse_table_structure('hola Int32'))
    # _parse_table_structure('hola')
    # print(parse_table_structure("timestamp DateTime MATERIALIZED toDateTime(JSONExtractInt(JSONExtractRaw(record, 'payload'), 'timestamp') / 1000)"))