tinybird 0.0.1.dev0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release.

This version of tinybird might be problematic.

Files changed (45)
  1. tinybird/__cli__.py +8 -0
  2. tinybird/ch_utils/constants.py +244 -0
  3. tinybird/ch_utils/engine.py +855 -0
  4. tinybird/check_pypi.py +25 -0
  5. tinybird/client.py +1281 -0
  6. tinybird/config.py +117 -0
  7. tinybird/connectors.py +428 -0
  8. tinybird/context.py +23 -0
  9. tinybird/datafile.py +5589 -0
  10. tinybird/datatypes.py +434 -0
  11. tinybird/feedback_manager.py +1022 -0
  12. tinybird/git_settings.py +145 -0
  13. tinybird/sql.py +865 -0
  14. tinybird/sql_template.py +2343 -0
  15. tinybird/sql_template_fmt.py +281 -0
  16. tinybird/sql_toolset.py +350 -0
  17. tinybird/syncasync.py +682 -0
  18. tinybird/tb_cli.py +25 -0
  19. tinybird/tb_cli_modules/auth.py +252 -0
  20. tinybird/tb_cli_modules/branch.py +1043 -0
  21. tinybird/tb_cli_modules/cicd.py +434 -0
  22. tinybird/tb_cli_modules/cli.py +1571 -0
  23. tinybird/tb_cli_modules/common.py +2082 -0
  24. tinybird/tb_cli_modules/config.py +344 -0
  25. tinybird/tb_cli_modules/connection.py +803 -0
  26. tinybird/tb_cli_modules/datasource.py +900 -0
  27. tinybird/tb_cli_modules/exceptions.py +91 -0
  28. tinybird/tb_cli_modules/fmt.py +91 -0
  29. tinybird/tb_cli_modules/job.py +85 -0
  30. tinybird/tb_cli_modules/pipe.py +858 -0
  31. tinybird/tb_cli_modules/regions.py +9 -0
  32. tinybird/tb_cli_modules/tag.py +100 -0
  33. tinybird/tb_cli_modules/telemetry.py +310 -0
  34. tinybird/tb_cli_modules/test.py +107 -0
  35. tinybird/tb_cli_modules/tinyunit/tinyunit.py +340 -0
  36. tinybird/tb_cli_modules/tinyunit/tinyunit_lib.py +71 -0
  37. tinybird/tb_cli_modules/token.py +349 -0
  38. tinybird/tb_cli_modules/workspace.py +269 -0
  39. tinybird/tb_cli_modules/workspace_members.py +212 -0
  40. tinybird/tornado_template.py +1194 -0
  41. tinybird-0.0.1.dev0.dist-info/METADATA +2815 -0
  42. tinybird-0.0.1.dev0.dist-info/RECORD +45 -0
  43. tinybird-0.0.1.dev0.dist-info/WHEEL +5 -0
  44. tinybird-0.0.1.dev0.dist-info/entry_points.txt +2 -0
  45. tinybird-0.0.1.dev0.dist-info/top_level.txt +4 -0
tinybird/sql.py ADDED
@@ -0,0 +1,865 @@
+ import logging
+ import re
+ import string
+ from collections import namedtuple
+ from dataclasses import dataclass
+ from typing import Any, Dict, Iterable, List, Optional
+ 
+ valid_chars_name: str = string.ascii_letters + string.digits + "._`*<>+-'"
+ valid_chars_fn: str = valid_chars_name + "[](),=!?:/ \n\t\r"
+ 
+ 
+ @dataclass
+ class TableIndex:
+     """Defines a CH table INDEX"""
+ 
+     name: str
+     expr: str
+     type_full: str
+     granularity: Optional[str] = None
+ 
+     def to_datafile(self):
+         granularity_expr = f"GRANULARITY {self.granularity}" if self.granularity else ""
+         return f"{self.name} {self.expr} TYPE {self.type_full} {granularity_expr}"
+ 
+     def to_sql(self):
+         return f"INDEX {self.to_datafile()}"
+ 
+     def add_index_sql(self):
+         return f"ADD {self.to_sql()}"
+ 
+     def drop_index_sql(self):
+         return f"DROP INDEX IF EXISTS {self.name}"
+ 
+     def materialize_index_sql(self):
+         return f"MATERIALIZE INDEX IF EXISTS {self.name}"
+ 
+     def clear_index_sql(self):
+         return f"CLEAR INDEX IF EXISTS {self.name}"
+ 
+ 
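For reference, a minimal sketch of the DDL these helpers produce (the ALTER TABLE target `events` is hypothetical, not part of this module):

    idx = TableIndex(name="idx_url", expr="url", type_full="bloom_filter(0.001)", granularity="4")
    print(f"ALTER TABLE events {idx.add_index_sql()}")
    # ALTER TABLE events ADD INDEX idx_url url TYPE bloom_filter(0.001) GRANULARITY 4
    print(f"ALTER TABLE events {idx.drop_index_sql()}")
    # ALTER TABLE events DROP INDEX IF EXISTS idx_url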
+ @dataclass
+ class TableProjection:
+     """Defines a CH table PROJECTION"""
+ 
+     name: str
+     expr: str
+ 
+     def to_datafile(self):
+         return f"{self.name} ({self.expr})"
+ 
+     def to_sql(self):
+         return f"PROJECTION {self.to_datafile()}"
+ 
+     def add_projection_sql(self):
+         return f"ADD {self.to_sql()}"
+ 
+     def drop_projection_sql(self):
+         return f"DROP PROJECTION IF EXISTS {self.name}"
+ 
+     def materialize_projection_sql(self):
+         return f"MATERIALIZE PROJECTION IF EXISTS {self.name}"
+ 
+     def clear_projection_sql(self):
+         return f"CLEAR PROJECTION IF EXISTS {self.name}"
+ 
+ 
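And the equivalent sketch for projections (the projection itself is made up):

    proj = TableProjection(name="p_by_user", expr="SELECT user_id, count() GROUP BY user_id")
    print(proj.to_sql())
    # PROJECTION p_by_user (SELECT user_id, count() GROUP BY user_id)
    print(proj.materialize_projection_sql())
    # MATERIALIZE PROJECTION IF EXISTS p_by_user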
+ def as_subquery(sql: str) -> str:
+     return f"""(\n{sql}\n)"""
+ 
+ 
+ def get_format(sql: str) -> Optional[str]:
+     """
+     retrieves FORMAT from CH sql
+     >>> get_format('select * from test')
+     >>> get_format('select * from test formAt JSON')
+     'JSON'
+     """
+     FORMAT_RE = r"\s+format\s+(\w+)\s*$"
+     sql = sql.strip()
+     format = re.findall(FORMAT_RE, sql, re.I)
+     return format[0] if format else None
+ 
+ 
+ def get_format_group(sql: str) -> str:
+     """
+     retrieves FORMAT group from CH sql
+     >>> get_format_group('select * from test')
+     ''
+     >>> get_format_group('select * from test formAt JSON')
+     ' formAt JSON'
+     """
+     FORMAT_RE = r"\s+format\s+(\w+)\s*$"
+     sql = sql.strip()
+     format = re.search(FORMAT_RE, sql, re.I)
+     return format.group() if format else ""
+ 
+ 
+ def wrap_finalize_aggregation(sql: str, describe_result: Dict[str, Any], fm_group: Optional[str] = None) -> str:
+     if not fm_group:
+         fm_group = get_format_group(sql)
+         sql = sql[0 : -len(fm_group)] if fm_group else sql
+ 
+     qq: str
+     if describe_result:
+         columns: List[str] = [
+             f"finalizeAggregation({c['name']}) as {c['name']}"
+             if "Aggregate" in c["type"] and "SimpleAggregate" not in c["type"]
+             else f"{c['name']}"
+             for c in describe_result["data"]
+         ]
+         columns_as_string: str = ",\n\t".join(columns)
+         sql = sql.replace("\n", "\n\t")
+         qq = f"SELECT \n\t{columns_as_string} \nFROM ({sql} \n) {fm_group}"
+     else:
+         qq = sql
+     return qq
+ 
+ 
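wrap_finalize_aggregation wraps AggregateFunction columns in finalizeAggregation() so a query over -State columns returns finished values. A sketch with a hand-built describe_result; the {'data': [{'name': ..., 'type': ...}]} shape is inferred from the code and would normally come from a DESCRIBE call, and the output below is shown approximately (the real string uses tab indentation):

    describe = {"data": [{"name": "day", "type": "Date"}, {"name": "visits", "type": "AggregateFunction(count)"}]}
    print(wrap_finalize_aggregation("SELECT day, visits FROM daily_mv", describe))
    # SELECT
    #     day,
    #     finalizeAggregation(visits) as visits
    # FROM (SELECT day, visits FROM daily_mv
    # )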
+ def remove_format(sql: str) -> str:
+     """
+     removes FORMAT from CH sql
+     >>> remove_format('select * from test')
+     'select * from test'
+     >>> remove_format('select * from test formAt JSON')
+     'select * from test'
+     """
+     FORMAT_RE = r"\s+(format)\s+(\w+)\s*$"
+     sql = sql.strip()
+     return re.sub(FORMAT_RE, "", sql, flags=re.I)
+ 
+ 
+ def col_name(name: str, backquotes: bool = True) -> str:
+     """
+     >>> col_name('`test`', True)
+     '`test`'
+     >>> col_name('`test`', False)
+     'test'
+     >>> col_name('test', True)
+     '`test`'
+     >>> col_name('test', False)
+     'test'
+     >>> col_name('', True)
+     ''
+     >>> col_name('', False)
+     ''
+     """
+     if not name:
+         return name
+     if name[0] == "`" and name[-1] == "`":
+         return name if backquotes else name[1:-1]
+     return f"`{name}`" if backquotes else name
+ 
+ 
+ def try_to_fix_nullable_in_simple_aggregating_function(t: str) -> Optional[str]:
+     # Workaround for https://github.com/ClickHouse/ClickHouse/issues/34407: for nullable columns
+     # with SimpleAggregateFunction, ClickHouse returns Nullable(SimpleAggregateFunction(sum, Int32))
+     # instead of SimpleAggregateFunction(sum, Nullable(Int32)), as it does for other aggregate
+     # functions. Without this fix, the aggregation could return incorrect results.
+     result = None
+     if match := re.search(r"SimpleAggregateFunction\((\w+),\s*(?!(?:Nullable))([\w,.()]+)\)", t):
+         fn = match.group(1)
+         inner_type = match.group(2)
+         result = f"SimpleAggregateFunction({fn}, Nullable({inner_type}))"
+     return result
+ 
+ 
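For illustration, the rewrite above only fires when the inner type is not already Nullable:

    try_to_fix_nullable_in_simple_aggregating_function("SimpleAggregateFunction(sum, Int32)")
    # -> 'SimpleAggregateFunction(sum, Nullable(Int32))'
    try_to_fix_nullable_in_simple_aggregating_function("SimpleAggregateFunction(sum, Nullable(Int32))")
    # -> None (the negative lookahead rejects an already-Nullable inner type)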
+ def schema_to_sql_columns(schema: List[Dict[str, Any]]) -> List[str]:
+     """return an array with each column in SQL
+     >>> schema_to_sql_columns([{'name': 'temperature', 'type': 'Float32', 'codec': None, 'default_value': None, 'nullable': False, 'normalized_name': 'temperature'}, {'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
+     ['`temperature` Float32', '`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4))']
+     >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': '', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
+     ['`temperature_delta` Float32 MATERIALIZED temperature']
+     >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': '', 'nullable': False, 'normalized_name': 'temperature_delta'}])
+     ['`temperature_delta` Float32 CODEC(Delta(4), LZ4))']
+     >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta'}])
+     ['`temperature_delta` Float32']
+     >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta', 'jsonpath': '$.temperature_delta'}])
+     ['`temperature_delta` Float32 `json:$.temperature_delta`']
+     >>> schema_to_sql_columns([{'name': 'aggregation', 'type': 'SimpleAggregateFunction(sum, Int32)', 'nullable': True, 'normalized_name': 'aggregation', 'jsonpath': '$.aggregation'}])
+     ['`aggregation` SimpleAggregateFunction(sum, Nullable(Int32)) `json:$.aggregation`']
+     """
+     columns: List[str] = []
+     for x in schema:
+         name = x["normalized_name"] if "normalized_name" in x else x["name"]
+         if x["nullable"]:
+             if (_type := try_to_fix_nullable_in_simple_aggregating_function(x["type"])) is None:
+                 _type = "Nullable(%s)" % x["type"]
+         else:
+             _type = x["type"]
+         parts = [col_name(name, backquotes=True), _type]
+         if x.get("jsonpath", None):
+             parts.append(f"`json:{x['jsonpath']}`")
+         if "default_value" in x and x["default_value"] not in ("", None):
+             parts.append(x["default_value"])
+         if "codec" in x and x["codec"] not in ("", None):
+             parts.append(x["codec"])
+         c = " ".join([x for x in parts if x]).strip()
+         columns.append(c)
+     return columns
+ 
+ 
+ def mark_error_string(s: str, i: int, line: int = 1) -> str:
+     """
+     >>> mark_error_string('0123456789', 0)
+     '0123456789\\n^---'
+     >>> mark_error_string('0123456789', 9)
+     '0123456789\\n         ^---'
+     >>> mark_error_string('01234\\n56789', 1)
+     '01234\\n ^---'
+     """
+     marker = "^---"
+     ss = s.splitlines()[line - 1] if s else ""
+     start = 0
+     end = len(ss)
+     return ss[start:end] + "\n" + (" " * (i - start)) + marker
+ 
+ 
+ def format_parse_error(
+     table_structure: str,
+     i: int,
+     position: int,
+     hint: Optional[str] = None,
+     line: int = 0,
+     keyword: Optional[str] = None,
+ ) -> str:
+     adjusted_position = position - (len(keyword) if keyword else 0)
+     message = f"{hint}\n" if hint else ""
+     message += mark_error_string(table_structure, adjusted_position - 1, line=line)
+ 
+     if keyword:
+         message += f" found at position {adjusted_position - len(keyword)}"
+     else:
+         message += (
+             f" found {repr(table_structure[i]) if len(table_structure) > i else 'EOF'} at position {adjusted_position}"
+         )
+     return message
+ 
+ 
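format_parse_error glues an optional hint onto mark_error_string's caret line; positions are 1-based, hence the -1 adjustment above. A quick illustration (default arguments, single-line input):

    print(format_parse_error("foo Float#32", 9, 10))
    # foo Float#32
    #          ^--- found '#' at position 10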
+ def parse_indexes_structure(indexes: Optional[List[str]]) -> List[TableIndex]:
+     """
+     >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
+     [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
+     >>> parse_indexes_structure(["INDEX index_name a TYPE set(100) GRANULARITY 100", " INDEX index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
+     [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
+     >>> parse_indexes_structure(["index_name type TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
+     [TableIndex(name='index_name', expr='type', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
+     >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100,", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
+     [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
+     >>> parse_indexes_structure(["index_name a TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter(0.001)"])
+     [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity=None)]
+     >>> parse_indexes_structure(["index_name u64 * length(s) TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter"])
+     [TableIndex(name='index_name', expr='u64 * length(s)', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter', granularity=None)]
+     >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4,1024,1,42) GRANULARITY 1"])
+     [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4,1024,1,42)', granularity='1')]
+     >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4, 1024, 1, 42) GRANULARITY 1"])
+     [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4, 1024, 1, 42)', granularity='1')]
+     >>> parse_indexes_structure(["index_name u64 * length(s)"])
+     Traceback (most recent call last):
+     ...
+     ValueError: invalid INDEX format. Usage: `name expr TYPE type_full GRANULARITY granularity`
+     >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100, index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
+     Traceback (most recent call last):
+     ...
+     ValueError: invalid INDEX format. Usage: `name expr TYPE type_full GRANULARITY granularity`
+     >>> parse_indexes_structure(["my_index m['key'] TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
+     [TableIndex(name='my_index', expr="m['key']", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
+     >>> parse_indexes_structure(["my_index_lambda arrayMap(x -> tupleElement(x,'message'), column_name) TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
+     [TableIndex(name='my_index_lambda', expr="arrayMap(x -> tupleElement(x,'message'), column_name)", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
+     >>> parse_indexes_structure(["ip_range_minmax_idx (toIPv6(ip_range_start), toIPv6(ip_range_end)) TYPE minmax GRANULARITY 1"])
+     [TableIndex(name='ip_range_minmax_idx', expr='(toIPv6(ip_range_start), toIPv6(ip_range_end))', type_full='minmax', granularity='1')]
+     """
+     parsed_indices: List[TableIndex] = []
+     if not indexes:
+         return parsed_indices
+ 
+     for index in indexes:
+         index = index.strip().rstrip(",")
+         # Strip a leading INDEX keyword; str.lstrip("INDEX") would also eat name chars in {I,N,D,E,X}
+         index = re.sub(r"^INDEX\s+", "", index)
+         if index.count("TYPE") != 1:
+             raise ValueError("invalid INDEX format. Usage: `name expr TYPE type_full GRANULARITY granularity`")
+ 
+         match = re.match(
+             r"(\w+)\s+([\w\s*\[\]\*\(\),\'\"-><.]+)\s+TYPE\s+(\w+)(?:\(([\w\s*.,]+)\))?(?:\s+GRANULARITY\s+(\d+))?",
+             index,
+         )
+         if match:
+             index_name, a, index_type, value, granularity = match.groups()
+             index_expr = f"{index_type}({value})" if value else index_type
+             parsed_indices.append(TableIndex(index_name, a.strip(), f"{index_expr}", granularity))
+         else:
+             raise ValueError("invalid INDEX format. Usage: `name expr TYPE type_full GRANULARITY granularity`")
+     return parsed_indices
+ 
+ 
+ def parse_table_structure(schema: str) -> List[Dict[str, Any]]:
+     """This parses the SQL schema for a CREATE TABLE
+     Columns follow the syntax: name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1][,]
+     Reference: https://clickhouse.tech/docs/en/sql-reference/statements/create/table/#syntax-forms
+ 
+     >>> parse_table_structure('c Float32, b String')
+     [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
+ 
+     >>> parse_table_structure('c Float32,--comment\\nb String')
+     [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
+ 
+     >>> parse_table_structure('c Float32,--comment\\nb String --another-comment')
+     [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
+ 
+     >>> parse_table_structure('c Float32 --first-comment\\n,--comment\\nb String --another-comment')
+     [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
+ 
+     >>> parse_table_structure('--random comment here\\nc Float32 --another comment\\n,--another one\\nb String --this is the last one')
+     [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
+ 
+     >>> parse_table_structure('--extra comment\\nc--extra comment\\nFloat32--extra comment\\n,--extra comment\\nb--extra comment\\nString--extra comment')
+     [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]
+ 
+     >>> parse_table_structure('c Nullable(Float32)')
+     [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]
+ 
+     >>> parse_table_structure('c Nullable(Float32) DEFAULT NULL')
+     [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]
+ 
+     >>> parse_table_structure("c String DEFAULT 'bla'")
+     [{'name': 'c', 'type': 'String', 'codec': None, 'default_value': "DEFAULT 'bla'", 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
+ 
+     >>> parse_table_structure('`foo.bar` UInt64')
+     [{'name': 'foo.bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo.bar'}]
+ 
+     >>> parse_table_structure('double_value Float64 CODEC(LZ4HC(2))')
+     [{'name': 'double_value', 'type': 'Float64', 'codec': 'CODEC(LZ4HC(2))', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'double_value'}]
+     >>> parse_table_structure('doubl/e_value Float64 CODEC(LZ4HC(2))')
+     Traceback (most recent call last):
+     ...
+     ValueError: wrong value, please check the schema syntax
+     doubl/e_value Float64 CODEC(LZ4HC(2))
+          ^--- found '/' at position 6
+     >>> parse_table_structure('`c` Nullable(Float32)')
+     [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]
+     >>> parse_table_structure('wadus INT UNSIGNED')
+     [{'name': 'wadus', 'type': 'INT UNSIGNED', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'wadus'}]
+     >>> parse_table_structure('c Int32 CODEC(Delta, LZ4)\\n')
+     [{'name': 'c', 'type': 'Int32', 'codec': 'CODEC(Delta, LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
+     >>> parse_table_structure('c SimpleAggregateFunction(sum, Int32),\\np SimpleAggregateFunction(sum, Int32)')
+     Traceback (most recent call last):
+     ...
+     ValueError: Incompatible data types between aggregate function 'sum' which returns Int64 and column storage type Int32
+     >>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized b*2\\n')
+     Traceback (most recent call last):
+     ...
+     ValueError: Unexpected MATERIALIZED after CODEC
+     c Int32 CODEC(Delta, LZ4) Materialized b*2
+                              ^--- found ' ' at position 26
+     >>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized ifNull(b*2, 0)\\n')
+     Traceback (most recent call last):
+     ...
+     ValueError: Unexpected MATERIALIZED after CODEC
+     c Int32 CODEC(Delta, LZ4) Materialized ifNull(b*2, 0)
+                              ^--- found ' ' at position 26
+     >>> parse_table_structure('c Int32 Materialized b*2\\n')
+     [{'name': 'c', 'type': 'Int32', 'codec': None, 'default_value': 'MATERIALIZED b*2', 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
+     >>> parse_table_structure('c Int32 Materialized b != 1 ? b*2: pow(b, 3)\\n')
+     [{'name': 'c', 'type': 'Int32', 'codec': None, 'default_value': 'MATERIALIZED b != 1 ? b*2: pow(b, 3)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
+     >>> parse_table_structure('')
+     []
+     >>> parse_table_structure('`date` Date,`timezone` String,`offset` Int32')
+     [{'name': 'date', 'type': 'Date', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'date'}, {'name': 'timezone', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'timezone'}, {'name': 'offset', 'type': 'Int32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'offset'}]
+     >>> parse_table_structure('c Int32 Materialized b*2 CODEC(Delta, LZ4)\\n')
+     [{'name': 'c', 'type': 'Int32', 'codec': 'CODEC(Delta, LZ4)', 'default_value': 'MATERIALIZED b*2', 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
+     >>> parse_table_structure('c Int32 Materialized ifNull(b*2, 0) CODEC(Delta, LZ4)\\n')
+     [{'name': 'c', 'type': 'Int32', 'codec': 'CODEC(Delta, LZ4)', 'default_value': 'MATERIALIZED ifNull(b*2, 0)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]
+     >>> parse_table_structure('`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)')
+     [{'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4)', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_delta'}]
+     >>> parse_table_structure('foo^bar Float32')
+     Traceback (most recent call last):
+     ...
+     ValueError: wrong value, please check the schema syntax
+     foo^bar Float32
+        ^--- found '^' at position 4
+     >>> parse_table_structure('foo Float#32')
+     Traceback (most recent call last):
+     ...
+     ValueError: wrong value, please check the schema syntax
+     foo Float#32
+              ^--- found '#' at position 10
+     >>> parse_table_structure('foo Float32 DEFAULT 13, bar UInt64')
+     [{'name': 'foo', 'type': 'Float32', 'codec': None, 'default_value': 'DEFAULT 13', 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo'}, {'name': 'bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'bar'}]
+     >>> parse_table_structure('foo Float32 DEFAULT 1$$$3')
+     Traceback (most recent call last):
+     ...
+     ValueError: wrong value, please check the schema syntax
+     foo Float32 DEFAULT 1$$$3
+                          ^--- found '$' at position 22
+     >>> parse_table_structure('foo Float32 CODEC(Delta(4), LZ#4)')
+     Traceback (most recent call last):
+     ...
+     ValueError: wrong value, please check the schema syntax
+     foo Float32 CODEC(Delta(4), LZ#4)
+                                   ^--- found '#' at position 31
+     >>> parse_table_structure('\\n `temperature` Float32,\\n `temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)\\n ')
+     [{'name': 'temperature', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature'}, {'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4)', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_delta'}]
+     >>> parse_table_structure('temperature Float32, temperature_delta Float32 MATERIALIZED temperature Codec(Delta(4)), temperature_doubledelta Float32 MATERIALIZED temperature Codec(DoubleDelta), temperature_doubledelta_lz4 Float32 MATERIALIZED temperature Codec(DoubleDelta, LZ4)')
+     [{'name': 'temperature', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature'}, {'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4))', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_delta'}, {'name': 'temperature_doubledelta', 'type': 'Float32', 'codec': 'CODEC(DoubleDelta)', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_doubledelta'}, {'name': 'temperature_doubledelta_lz4', 'type': 'Float32', 'codec': 'CODEC(DoubleDelta, LZ4)', 'default_value': 'MATERIALIZED temperature', 'jsonpath': None, 'nullable': False, 'normalized_name': 'temperature_doubledelta_lz4'}]
+     >>> parse_table_structure('t UInt8 CODEC(Delta(1), LZ4)')
+     [{'name': 't', 'type': 'UInt8', 'codec': 'CODEC(Delta(1), LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 't'}]
+     >>> parse_table_structure('tt UInt8 MATERIALIZED t')
+     [{'name': 'tt', 'type': 'UInt8', 'codec': None, 'default_value': 'MATERIALIZED t', 'jsonpath': None, 'nullable': False, 'normalized_name': 'tt'}]
+     >>> parse_table_structure('tt UInt8 MATERIALIZED t CODEC(Delta(1), LZ4)')
+     [{'name': 'tt', 'type': 'UInt8', 'codec': 'CODEC(Delta(1), LZ4)', 'default_value': 'MATERIALIZED t', 'jsonpath': None, 'nullable': False, 'normalized_name': 'tt'}]
+     >>> parse_table_structure('tt SimpleAggregateFunction(any, Nullable(UInt8))')
+     [{'name': 'tt', 'type': 'SimpleAggregateFunction(any, Nullable(UInt8))', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'tt'}]
+     >>> parse_table_structure("timestamp DateTime MATERIALIZED toDateTime(JSONExtractInt(JSONExtractRaw(record, 'payload'), 'timestamp') / 1000)")
+     [{'name': 'timestamp', 'type': 'DateTime', 'codec': None, 'default_value': "MATERIALIZED toDateTime(JSONExtractInt(JSONExtractRaw(record, 'payload'), 'timestamp') / 1000)", 'jsonpath': None, 'nullable': False, 'normalized_name': 'timestamp'}]
+     >>> parse_table_structure("`test_default_cast` DEFAULT plus(13,1)")
+     [{'name': 'test_default_cast', 'type': '', 'codec': None, 'default_value': 'DEFAULT plus(13,1)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'test_default_cast'}]
+     >>> parse_table_structure("hola Int, `materialized` String MATERIALIZED upper(no_nullable_string)")
+     [{'name': 'hola', 'type': 'Int', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'hola'}, {'name': 'materialized', 'type': 'String', 'codec': None, 'default_value': 'MATERIALIZED upper(no_nullable_string)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'materialized'}]
+     >>> parse_table_structure('`a2` String `json:$.a2`, `a3` String `json:$.a3`\\n')
+     [{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]
+     >>> parse_table_structure("`arr` Array(String) DEFAULT ['-']")
+     [{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT ['-']", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]
+     >>> parse_table_structure("`arr` Array(String) DEFAULT array('-')")
+     [{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT array('-')", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]
+     >>> parse_table_structure('`a2` Float32 CODEC(Delta, ZSTD(4)) `json:$.a2`, `a3` String `json:$.a3`\\n')
+     [{'name': 'a2', 'type': 'Float32', 'codec': 'CODEC(Delta, ZSTD(4))', 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]
+     >>> parse_table_structure('`a` String, INDEX index_name a TYPE set(100) GRANULARITY 100')
+     [{'name': 'a', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'a'}]
+     >>> parse_table_structure('`a` String, INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
+     [{'name': 'a', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'a'}]
+     >>> parse_table_structure('`index` String, INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
+     [{'name': 'index', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'index'}]
+     >>> parse_table_structure('`a2` String `json:$.a--2`, `a3` String `json:$.a3`\\n')
+     [{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a--2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]
+     """
+     return _parse_table_structure(schema)
+ 
+ 
+ def clean_comments(schema_to_clean: Optional[str]) -> Optional[str]:
+     """Remove comments from the schema.
+     Comments inside backticks (e.g. jsonpaths) are preserved.
+     >>> clean_comments(None) is None
+     True
+     >>> clean_comments('')
+     ''
+     >>> clean_comments(' ')
+     ''
+     >>> clean_comments('\\n')
+     ''
+     >>> clean_comments('\\n\\n\\n\\n')
+     ''
+     >>> clean_comments('c Float32')
+     'c Float32'
+     >>> clean_comments('c Float32\\n')
+     'c Float32'
+     >>> clean_comments('c Float32\\n--this is a comment')
+     'c Float32'
+     >>> clean_comments('c Float32\\n--this is a comment\\n')
+     'c Float32'
+     >>> clean_comments('c Float32\\t-- this is a comment\\t\\n')
+     'c Float32'
+     >>> clean_comments('c Float32\\n--this is a comment\\r\\n')
+     'c Float32'
+     >>> clean_comments('c Float32\\n--this is a comment\\n--this is a comment2\\n')
+     'c Float32'
+     >>> clean_comments('c Float32\\n--this is a ```comment\\n')
+     'c Float32'
+     >>> clean_comments('c Float32\\n--this is a ```comment\\n')
+     'c Float32'
+     >>> clean_comments('c Float32, -- comment\\nd Float32 -- comment2')
+     'c Float32,\\nd Float32'
+     >>> clean_comments('c Float32, -- comment\\n -- comment \\nd Float32 -- comment2')
+     'c Float32,\\nd Float32'
+     >>> clean_comments('c Float32 `json:$.aa--aa`\\n--this is a ```comment\\n')
+     'c Float32 `json:$.aa--aa`'
+     >>> clean_comments('c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`\\n--this is a ```comment\\n')
+     'c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`'
+     >>> clean_comments('c--c Float32 `json:$.cc--cc`\\n')
+     'c'
+     >>> clean_comments('`c--c` Float32 `json:$.cc--cc`\\n')
+     '`c'
+     """
+ 
+     def clean_line_comments(line: str) -> str:
+         if not line:
+             return line
+         i = 0
+         inside_json_path = False
+         while i < len(line):
+             if i + 1 < len(line) and line[i] == "-" and line[i + 1] == "-" and not inside_json_path:
+                 return line[:i].strip()
+ 
+             if not inside_json_path and line[i:].startswith("`json:"):
+                 inside_json_path = True
+             elif inside_json_path and line[i] == "`":
+                 inside_json_path = False
+             i += 1
+         return line
+ 
+     if schema_to_clean is None:
+         return schema_to_clean
+ 
+     cleaned_schema = ""
+     for line in schema_to_clean.splitlines():
+         cleaned_line = clean_line_comments(line)
+         if cleaned_line:
+             cleaned_schema += cleaned_line + "\n"
+     return cleaned_schema.strip()
+ 
+ 
+ SyntaxExpr = namedtuple("SyntaxExpr", ["name", "regex"])
+ 
+ NULL = SyntaxExpr("NULL", re.compile(r"\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
+ NOTNULL = SyntaxExpr("NOTNULL", re.compile(r"\s+NOT\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
+ DEFAULT = SyntaxExpr("DEFAULT", re.compile(r"\s+DEFAULT([^a-z0-9_]|$)", re.IGNORECASE))
+ MATERIALIZED = SyntaxExpr("MATERIALIZED", re.compile(r"\s+MATERIALIZED([^a-z0-9_]|$)", re.IGNORECASE))
+ ALIAS = SyntaxExpr("ALIAS", re.compile(r"\s+ALIAS([^a-z0-9_]|$)", re.IGNORECASE))
+ CODEC = SyntaxExpr("CODEC", re.compile(r"\s+CODEC([^a-z0-9_]|$)", re.IGNORECASE))
+ TTL = SyntaxExpr("TTL", re.compile(r"\s+TTL([^a-z0-9_]|$)", re.IGNORECASE))
+ JSONPATH = SyntaxExpr("JSONPATH", re.compile(r"\s+`json:", re.IGNORECASE))
+ COMMA = SyntaxExpr("COMMA", re.compile(r",", re.IGNORECASE))
+ NEW_LINE = SyntaxExpr("NEW_LINE", re.compile(r"\s$"))
+ TYPE = SyntaxExpr("TYPE", re.compile(r""))  # TYPE doesn't have a fixed initial string
+ 
+ REGEX_WHITESPACE = re.compile(r"\s*")
+ REGEX_COMMENT = re.compile(r"\-\-[^\n\r]*[\n\r]")
+ 
+ 
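Each SyntaxExpr regex is anchored at the parser's current offset, so lookahead is a cheap .match(); the trailing ([^a-z0-9_]|$) group keeps keywords from matching identifier prefixes. For instance:

    DEFAULT.regex.match(" DEFAULT 13") is not None   # True
    DEFAULT.regex.match(" DEFAULT_13") is not None   # False: '_' continues an identifier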
+ def _parse_table_structure(schema: str) -> List[Dict[str, Any]]:
+     # CH syntax from https://clickhouse.com/docs/en/sql-reference/statements/create/table/
+     # name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1]
+     try:
+         schema = clean_comments(schema + "\n")
+     except Exception as e:
+         logging.exception(f"Error cleaning comments: {e}")
+         schema = REGEX_COMMENT.sub(" ", schema + "\n").strip()
+ 
+     if REGEX_WHITESPACE.fullmatch(schema):
+         return []
+ 
+     i: int = 0
+ 
+     # For error feedback only
+     line: int = 1
+     pos: int = 1
+ 
+     # Find the first SyntaxExpr in lookup that matches the schema at the current offset
+     def lookahead_matches(lookup: Iterable) -> Optional[SyntaxExpr]:
+         s = schema[i:]
+         match = next((x for x in lookup if x.regex.match(s)), None)
+         return match
+ 
+     def advance_single_char() -> None:
+         nonlocal i, line, pos
+         if schema[i] == "\n":
+             line += 1
+             pos = 1
+         else:
+             pos += 1
+         i += 1
+ 
+     # Advance all whitespace characters and then len(s) more chars
+     def advance(s: str) -> None:
+         if i < len(schema):
+             while schema[i] in " \t\r\n":
+                 advance_single_char()
+             for _ in s:
+                 advance_single_char()
+ 
+     def get_backticked() -> str:
+         begin = i
+         while i < len(schema):
+             c = schema[i]
+             advance_single_char()
+             if c == "`":
+                 return schema[begin : i - 1]
+         raise ValueError(format_parse_error(schema, i, pos, "expecting ending backtick", line=line))
+ 
+     def parse_name() -> str:
+         nonlocal i, line, pos
+         if schema[i] != "`":
+             # regular name
+             begin = i
+             while i < len(schema):
+                 c = schema[i]
+                 if c in " \t\r\n":
+                     return schema[begin:i]
+                 if c not in valid_chars_name:
+                     raise ValueError(
+                         format_parse_error(schema, i, pos, "wrong value, please check the schema syntax", line=line)
+                     )
+                 advance_single_char()
+             return schema[begin:i]
+         else:
+             # backticked name
+             advance_single_char()
+             return get_backticked()
+ 
+     def parse_expr(lookup: Iterable[SyntaxExpr]) -> str:
+         nonlocal i, line, pos
+ 
+         begin: int = i
+         context_stack: List[Optional[str]] = [None]
+         while i < len(schema):
+             context = context_stack[-1]
+             c = schema[i]
+ 
+             if (context == "'" and c == "'") or (context == '"' and c == '"') or (context == "(" and c == ")"):
+                 context_stack.pop()
+             elif c == "'" and (context is None or context == "("):
+                 context_stack.append("'")
+             elif c == '"' and (context is None or context == "("):
+                 context_stack.append('"')
+             elif c == "(" and (context is None or context == "("):
+                 context_stack.append("(")
+             elif context is None and lookahead_matches(lookup):
+                 return schema[begin:i].strip(" \t\r\n")
+             elif (context is None and c not in valid_chars_fn) or (context == "(" and c not in valid_chars_fn):
+                 raise ValueError(
+                     format_parse_error(schema, i, pos, "wrong value, please check the schema syntax", line=line)
+                 )
+             advance_single_char()
+         if i == begin:
+             raise ValueError(format_parse_error(schema, i, pos, "wrong value", line=line))
+         return schema[begin:].strip(" \t\r\n")
+ 
+     columns: List[Dict[str, Any]] = []
+ 
+     name: str = ""
+     _type: str = ""
+     default: str = ""
+     materialized: str = ""
+     codec: str = ""
+     jsonpath: str = ""
+     last: Optional[SyntaxExpr] = None
+ 
+     def add_column(found: str) -> None:
+         nonlocal name, _type, default, materialized, codec, jsonpath
+         if name == "INDEX":
+             return
+         if not name:
+             raise ValueError(
+                 format_parse_error(schema, i, pos, f"Syntax error: expecting NAME, found {found}", line=line)
+             )
+         default = "" if not default else f"DEFAULT {default}"
+         materialized = "" if not materialized else f"MATERIALIZED {materialized}"
+         codec = "" if not codec else f"CODEC{codec}"
+         columns.append(
+             {
+                 "name": name,
+                 "type": _type,
+                 "codec": codec,
+                 "default_value": default or materialized,
+                 "jsonpath": jsonpath,
+             }
+         )
+         name = ""
+         _type = ""
+         default = ""
+         materialized = ""
+         codec = ""
+         jsonpath = ""
+ 
+     valid_next: List[SyntaxExpr] = [TYPE]
+     while i < len(schema):
+         if not name:
+             advance("")
+             valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, TYPE]
+             name = parse_name()
+             continue
+         found = lookahead_matches(
+             [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE, TYPE]
+             if name != "INDEX"
+             else [COMMA, NEW_LINE]
+         )
+         if found and found not in valid_next:
+             after = f" after {last.name}" if last else ""
+             raise ValueError(format_parse_error(schema, i, pos, f"Unexpected {found.name}{after}", line=line))
+         if found == TYPE:
+             advance("")
+             valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE]
+             detected_type = parse_expr([NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
+             try:
+                 # Imported in the body to be compatible with the CLI
+                 from chtoolset.query import check_compatible_types
+ 
+                 # Check compatibility of the type with itself to verify it's a known type
+                 check_compatible_types(detected_type, detected_type)
+             except ModuleNotFoundError:
+                 pass
+             _type = detected_type
+         elif found == NULL:
+             # Not implemented
+             raise ValueError(
+                 format_parse_error(schema, i, pos, "NULL column syntax not supported", line=line, keyword="NULL")
+             )
+         elif found == NOTNULL:
+             # Not implemented
+             raise ValueError(
+                 format_parse_error(
+                     schema, i, pos, "NOT NULL column syntax not supported", line=line, keyword="NOT NULL"
+                 )
+             )
+         elif found == DEFAULT:
+             advance("DEFAULT")
+             valid_next = [CODEC, TTL, COMMA]
+             default = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
+         elif found == MATERIALIZED:
+             advance("MATERIALIZED")
+             valid_next = [CODEC, TTL, COMMA]
+             materialized = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
+         elif found == ALIAS:
+             # Not implemented
+             raise ValueError(format_parse_error(schema, i, pos, "ALIAS not supported", line=line, keyword="ALIAS"))
+         elif found == CODEC:
+             advance("CODEC")
+             valid_next = [TTL, COMMA, JSONPATH]
+             codec = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
+         elif found == TTL:
+             advance("TTL")
+             # Not implemented
+             raise ValueError(format_parse_error(schema, i, pos, "column TTL not supported", line=line, keyword="TTL"))
+         elif found == JSONPATH:
+             advance("`json:")
+             jsonpath = get_backticked()
+         elif found == COMMA:
+             if name == "INDEX":
+                 advance(",")
+                 continue
+             advance(",")
+             valid_next = []
+             add_column("COMMA")
+         elif found == NEW_LINE or (name == "INDEX" and not found):
+             i += 1
+         else:
+             raise ValueError(
+                 format_parse_error(
+                     schema,
+                     i,
+                     pos,
+                     "wrong value, expected a NULL, NOT NULL, DEFAULT, MATERIALIZED, CODEC or TTL expression, a column data type, a comma, a new line or a jsonpath",
+                     line=line,
+                 )
+             )
+         last = found
+     add_column("EOF")
+ 
+     # normalize columns
+     for column in columns:
+         nullable = column["type"].lower().startswith("nullable")
+         column["type"] = column["type"] if not nullable else column["type"][len("Nullable(") : -1]  # ')'
+         column["nullable"] = nullable
+         column["codec"] = column["codec"] if column["codec"] else None
+         column["normalized_name"] = column["name"]
+         column["jsonpath"] = column["jsonpath"] if column["jsonpath"] else None
+         default_value = column["default_value"] if column["default_value"] else None
+         if nullable and default_value and default_value.lower() == "default null":
+             default_value = None
+         column["default_value"] = default_value
+ 
+     return columns
+ 
+ 
+ def engine_can_be_replicated(engine: Optional[str]) -> bool:
+     """
+     >>> engine_can_be_replicated('MergeTree() order by tuple()')
+     True
+     >>> engine_can_be_replicated('JOIN(ANY, LEFT, foo)')
+     False
+     >>> engine_can_be_replicated('ReplicatingMergeTree() order by tuple()')
+     True
+     >>> engine_can_be_replicated(None)
+     False
+     >>> engine_can_be_replicated("ReplicatedReplacingMergeTree('/clickhouse/tables/{layer}-{shard}/d_e7a588.t_ba45cd49a39c4649a1c7c2ec7adcaf43_t_be23eb990c394399854f8271c550fc36_staging', '{replica}', insert_date)")
+     False
+     """
+     if not engine:
+         return False
+     lower_engine = engine.lower()
+     return not lower_engine.startswith("Replicated".lower()) and "mergetree" in lower_engine
+ 
+ 
+ def engine_supports_delete(engine: Optional[str]) -> bool:
+     """
+     >>> engine_supports_delete('MergeTree() order by tuple()')
+     True
+     >>> engine_supports_delete('JOIN(ANY, LEFT, foo)')
+     False
+     >>> engine_supports_delete('ReplicatingMergeTree() order by tuple()')
+     True
+     >>> engine_supports_delete(None)
+     False
+     """
+     if not engine:
+         return False
+     return "mergetree" in engine.lower()
+ 
+ 
+ def engine_replicated_to_local(engine: str) -> str:
+     """
+     >>> engine_replicated_to_local("ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/test.foo','{replica}') order by (test)")
+     'MergeTree() order by (test)'
+     >>> engine_replicated_to_local("ReplicatedReplacingMergeTree('/clickhouse/tables/{layer}-{shard}/test.foo', '{replica}', timestamp) order by (test)")
+     'ReplacingMergeTree(timestamp) order by (test)'
+     >>> engine_replicated_to_local("Join(ANY, LEFT, test)")
+     'Join(ANY, LEFT, test)'
+     >>> engine_replicated_to_local("ReplicatedVersionedCollapsingMergeTree('/clickhouse/tables/{layer}-{shard}/test.foo', '{replica}', sign,version) ORDER BY pk TTL toDate(local_timeplaced) + toIntervalDay(3) SETTINGS index_granularity = 8192")
+     'VersionedCollapsingMergeTree(sign, version) ORDER BY pk TTL toDate(local_timeplaced) + toIntervalDay(3) SETTINGS index_granularity = 8192'
+     """
+ 
+     def _replace(m):
+         parts = m.groups()
+         s = parts[0] + "MergeTree("
+         if parts[1]:
+             tk = parts[1].split(",")
+             if len(tk) > 2:  # remove key and {replica} part
+                 s += ", ".join([x.strip() for x in tk[2:]])
+         s += ")" + parts[2]
+         return s
+ 
+     if "Replicated" not in engine:
+         return engine
+ 
+     return re.sub(r"Replicated(.*)MergeTree\(([^\)]*)\)(.*)", _replace, engine.strip())
+ 
+ 
+ def engine_patch_replicated_engine(engine: str, engine_full: Optional[str], new_table_name: str) -> Optional[str]:
+     """
+     >>> engine_patch_replicated_engine("ReplicatedMergeTree", "ReplicatedMergeTree('/clickhouse/tables/1-1/table_name', 'replica') PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192", 'table_name_staging')
+     "ReplicatedMergeTree('/clickhouse/tables/1-1/table_name_staging', 'replica') PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
+     >>> engine_patch_replicated_engine("ReplicatedMergeTree", "ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/sales_product_rank_rt_replicated_2', '{replica}') PARTITION BY toYYYYMM(date) ORDER BY (purchase_location, sku_rank_lc, date)", 'sales_product_rank_rt_replicated_2_staging')
+     "ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/sales_product_rank_rt_replicated_2_staging', '{replica}') PARTITION BY toYYYYMM(date) ORDER BY (purchase_location, sku_rank_lc, date)"
+     >>> engine_patch_replicated_engine("ReplicatedMergeTree", None, 't_000') is None
+     True
+     >>> engine_patch_replicated_engine("Log", "Log()", 't_000')
+     'Log()'
+     >>> engine_patch_replicated_engine("MergeTree", "MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024", 't_000')
+     'MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024'
+     """
+     if not engine_full:
+         return None
+     if engine.lower().startswith("Replicated".lower()):
+         parts = re.split(r"(Replicated.*MergeTree\(')([^']*)('.*)", engine_full)
+         paths = parts[2].split("/")
+         paths[-1] = new_table_name
+         zoo_path = "/".join(paths)
+         return "".join(parts[:2] + [zoo_path] + parts[3:])
+     return engine_full
+ 
+ 
+ if __name__ == "__main__":
+     print(  # noqa: T201
+         _parse_table_structure(
+             """hola Int --comment\n, `materialized` String --otro comment\n MATERIALIZED upper(no_nullable_string)"""
+         )
+     )
+     # print(_parse_table_structure('@'))
+     # print(mark_error_string('012345678901234567890123456789', 30))
+     # print(_parse_table_structure('@'))
+     # print(_parse_table_structure('`test_default_cast DEFAULT plus(13,1)'))
+     # print(_parse_table_structure('`test_default_cast` DEFAULT plus(13,1)'))
+     # print(_parse_table_structure('hola Int32'))
+     # _parse_table_structure('hola')
+     # print(parse_table_structure("timestamp DateTime MATERIALIZED toDateTime(JSONExtractInt(JSONExtractRaw(record, 'payload'), 'timestamp') / 1000)"))
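Taken together, a minimal round trip from a datafile-style schema to SQL column definitions (a sketch; the schema and engine strings below are made up):

    cols = parse_table_structure("`user_id` UInt64 `json:$.user_id`, visits AggregateFunction(count)")
    print(", ".join(schema_to_sql_columns(cols)))
    # `user_id` UInt64 `json:$.user_id`, `visits` AggregateFunction(count)
    print(engine_replicated_to_local("ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/db.t', '{replica}') ORDER BY user_id"))
    # MergeTree() ORDER BY user_id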