tinybird 0.0.1.dev14__py3-none-any.whl → 0.0.1.dev15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic. See the package registry's advisory page for this version of tinybird for details.

@@ -1,3 +1,4 @@
1
+ import functools
1
2
  import glob
2
3
  import itertools
3
4
  import os
@@ -5,23 +6,55 @@ import os.path
5
6
  import pprint
6
7
  import re
7
8
  import shlex
9
+ import string
8
10
  import textwrap
9
11
  import traceback
10
12
  from collections import namedtuple
13
+ from dataclasses import dataclass
11
14
  from io import StringIO
12
15
  from pathlib import Path
13
16
  from string import Template
14
- from typing import Any, Callable, Dict, List, Optional, Tuple, cast
17
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
15
18
 
16
19
  import click
17
20
  from mypy_extensions import KwArg, VarArg
18
21
 
19
22
  from tinybird.ch_utils.engine import ENABLED_ENGINES
20
23
  from tinybird.feedback_manager import FeedbackManager
21
- from tinybird.sql import parse_indexes_structure, parse_table_structure, schema_to_sql_columns
22
24
  from tinybird.tb.modules.datafile.exceptions import IncludeFileNotFoundException, ParseException, ValidationException
23
25
  from tinybird.tb.modules.exceptions import CLIPipeException
24
26
 
27
+ # Code from sql.py has been duplicated so I can change it without breaking absolutely everything in the app
28
+ # I'll try not to make logic changes, just error reporting changes
29
+ # from tinybird.sql import parse_indexes_structure, parse_table_structure, schema_to_sql_columns
30
+
31
+
32
class DatafileSyntaxError(Exception):
    """Base error for syntax problems found in a datafile, carrying the 1-based
    line/column where the problem was detected and an optional hint for the user."""

    def __init__(self, message: str, lineno: int, pos: int, hint: Optional[str] = None):
        super().__init__(message)
        self.message = message
        self.hint = hint
        self.lineno = lineno
        self.pos = pos

    def __str__(self) -> str:
        # Render as "<message> at <line>:<pos>." plus the hint sentence, if any.
        located = f"{self.message} at {self.lineno}:{self.pos}."
        if self.hint:
            return f"{located} {self.hint}."
        return located
43
+
44
+
45
class SchemaSyntaxError(DatafileSyntaxError):
    """Syntax error raised while parsing the SCHEMA section of a datafile.

    The previous explicit ``__init__`` only forwarded its arguments to
    ``DatafileSyntaxError`` with an identical signature, so it was redundant;
    the inherited constructor behaves identically.
    """
48
+
49
+
50
class IndexesSyntaxError(DatafileSyntaxError):
    """Syntax error raised while parsing the INDEXES section of a datafile.

    The previous explicit ``__init__`` only forwarded its arguments to
    ``DatafileSyntaxError`` with an identical signature, so it was redundant;
    the inherited constructor behaves identically.
    """
53
+
54
+
55
# NOTE(review): no raise sites are visible in this chunk; presumably used by the
# column-parsing helpers for structurally invalid columns — confirm against callers.
class MalformedColumnError(Exception):
    pass
57
+
25
58
 
26
59
  class PipeTypes:
27
60
  MATERIALIZED = "materialized"
@@ -89,11 +122,13 @@ TB_LOCAL_WORKSPACE_NAME = "Tinybird_Local_Testing"
89
122
 
90
123
  pp = pprint.PrettyPrinter()
91
124
 
125
# Characters accepted in a bare (un-backticked) column name by parse_name().
valid_chars_name: str = string.ascii_letters + string.digits + "._`*<>+-'"
# Characters accepted inside expressions (types, DEFAULT/CODEC bodies) by parse_expr():
# the name characters plus brackets, operators, separators and whitespace.
valid_chars_fn: str = valid_chars_name + "[](),=!?:/ \n\t\r"
127
+
92
128
 
93
129
  class Datafile:
94
130
  def __init__(self) -> None:
95
131
  self.maintainer: Optional[str] = None
96
- self.sources: List[str] = []
97
132
  self.nodes: List[Dict[str, Any]] = []
98
133
  self.tokens: List[Dict[str, Any]] = []
99
134
  self.version: Optional[int] = None
@@ -104,15 +139,6 @@ class Datafile:
104
139
  self.warnings: List[str] = []
105
140
  self.filtering_tags: Optional[List[str]] = None
106
141
 
107
- def validate(self) -> None:
108
- for x in self.nodes:
109
- if not x["name"].strip():
110
- raise ValidationException("invalid node name, can't be empty")
111
- if "sql" not in x:
112
- raise ValidationException("node %s must have a SQL query" % x["name"])
113
- if self.version is not None and (not isinstance(self.version, int) or self.version < 0):
114
- raise ValidationException("version must be a positive integer")
115
-
116
142
  def is_equal(self, other):
117
143
  if len(self.nodes) != len(other.nodes):
118
144
  return False
@@ -166,6 +192,848 @@ def parse_tags(tags: str) -> Tuple[str, List[str]]:
166
192
  return all_kv_tags, filtering_tags
167
193
 
168
194
 
195
@dataclass
class TableIndex:
    """Defines a CH table INDEX.

    Attributes:
        name: index identifier.
        expr: the indexed expression.
        type_full: index type including its parameters, e.g. ``set(100)``.
        granularity: optional GRANULARITY value, kept as a string.
    """

    name: str
    expr: str
    type_full: str
    granularity: Optional[str] = None

    def to_datafile(self) -> str:
        """Render the index as written in a datafile INDEXES section.

        Fix: the previous f-string always appended a space before the (possibly
        empty) granularity clause, leaving a trailing space when ``granularity``
        is None. Joining only the present parts avoids that.
        """
        parts = [self.name, self.expr, "TYPE", self.type_full]
        if self.granularity:
            parts.append(f"GRANULARITY {self.granularity}")
        return " ".join(parts)

    def to_sql(self) -> str:
        """Full SQL index definition (INDEX keyword included)."""
        return f"INDEX {self.to_datafile()}"

    def add_index_sql(self) -> str:
        """ALTER TABLE clause to add this index."""
        return f"ADD {self.to_sql()}"

    def drop_index_sql(self) -> str:
        """ALTER TABLE clause to drop this index (idempotent via IF EXISTS)."""
        return f"DROP INDEX IF EXISTS {self.name}"

    def materialize_index_sql(self) -> str:
        """ALTER TABLE clause to materialize this index for existing data."""
        return f"MATERIALIZE INDEX IF EXISTS {self.name}"

    def clear_index_sql(self) -> str:
        """ALTER TABLE clause to clear this index's files without dropping it."""
        return f"CLEAR INDEX IF EXISTS {self.name}"
222
+
223
+
224
def parse_indexes_structure(indexes: Optional[List[str]]) -> List[TableIndex]:
    """Parse the lines of an INDEXES section into TableIndex objects.

    One index per line; an optional leading ``INDEX`` keyword and a trailing
    comma are tolerated. Raises IndexesSyntaxError with 1-based line/column
    information on invalid input.

    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["INDEX index_name a TYPE set(100) GRANULARITY 100", " INDEX index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name type TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='type', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100,", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name a TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter(0.001)"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity=None)]
    >>> parse_indexes_structure(["index_name u64 * length(s) TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter"])
    [TableIndex(name='index_name', expr='u64 * length(s)', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter', granularity=None)]
    >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4,1024,1,42) GRANULARITY 1"])
    [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4,1024,1,42)', granularity='1')]
    >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["index_name u64 * length(s)"])
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.IndexesSyntaxError: Invalid INDEX syntax at 1:1. Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`.
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100, index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.IndexesSyntaxError: Invalid INDEX syntax at 1:1. Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`.
    >>> parse_indexes_structure(["", "   ", "     wrong_index_syntax,"])
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.IndexesSyntaxError: Invalid INDEX syntax at 3:6. Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`.
    >>> parse_indexes_structure(["my_index m['key'] TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='my_index', expr="m['key']", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["my_index_lambda arrayMap(x -> tupleElement(x,'message'), column_name) TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='my_index_lambda', expr="arrayMap(x -> tupleElement(x,'message'), column_name)", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["ip_range_minmax_idx (toIPv6(ip_range_start), toIPv6(ip_range_end)) TYPE minmax GRANULARITY 1"])
    [TableIndex(name='ip_range_minmax_idx', expr='(toIPv6(ip_range_start), toIPv6(ip_range_end))', type_full='minmax', granularity='1')]
    """
    parsed_indices: List[TableIndex] = []
    if not indexes:
        return parsed_indices

    # TODO(eclbg): It might not be obvious that we only allow one index per line.
    for i, index in enumerate(indexes):
        lineno = i + 1
        if not index.strip():
            continue
        leading_whitespaces = len(index) - len(index.lstrip())
        index = index.strip().rstrip(",")
        # Drop an optional leading INDEX keyword.
        # Fix: the previous `index.lstrip("INDEX")` stripped the *character set*
        # {I, N, D, E, X}, mangling index names starting with any of those letters
        # (e.g. "IDX_a" became "_a"). A word-boundary regex removes only the keyword.
        index = re.sub(r"^INDEX\b\s*", "", index)
        if index.count("TYPE") != 1:
            raise IndexesSyntaxError(
                message="Invalid INDEX syntax",
                hint="Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`",
                lineno=lineno,
                pos=leading_whitespaces + 1,
            )

        match = re.match(
            r"(\w+)\s+([\w\s*\[\]\*\(\),\'\"-><.]+)\s+TYPE\s+(\w+)(?:\(([\w\s*.,]+)\))?(?:\s+GRANULARITY\s+(\d+))?",
            index,
        )
        if match:
            index_name, a, index_type, value, granularity = match.groups()
            index_expr = f"{index_type}({value})" if value else index_type
            parsed_indices.append(TableIndex(index_name, a.strip(), f"{index_expr}", granularity))
        else:
            raise IndexesSyntaxError(
                message="Invalid INDEX syntax",
                hint="Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`",
                # Fix: this branch previously hardcoded lineno=1, pointing every
                # error at the first line regardless of where it occurred.
                lineno=lineno,
                pos=leading_whitespaces + 1,
            )
    return parsed_indices
297
+
298
+
299
def clean_comments_rstrip_keep_empty_lines(schema_to_clean: Optional[str]) -> Optional[str]:
    """Remove the comments from the schema.

    If the comments are between backticks (inside a `json:...` path), they will not
    be removed. Lines that become empty after removing comments are kept (as empty
    lines) so line numbers are preserved; lines are only rstripped of whitespace.

    Fix: the return annotation previously claimed ``Tuple[Optional[str], bool]``,
    but the function returns a plain ``Optional[str]`` (see every return statement
    and the doctests below).

    >>> clean_comments_rstrip_keep_empty_lines(None) is None
    True
    >>> clean_comments_rstrip_keep_empty_lines('')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('   ')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('\\n')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('\\n\\n\\n\\n')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('c Float32')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\t-- this is a comment\\t\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment\\r\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment\\n--this is a comment2\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a ```comment\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32, -- comment\\nd Float32 -- comment2')
    'c Float32,\\nd Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32, -- comment\\n -- comment \\nd Float32 -- comment2')
    'c Float32,\\n\\nd Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32 `json:$.aa--aa`\\n--this is a ```comment\\n')
    'c Float32 `json:$.aa--aa`'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`\\n--this is a ```comment\\n')
    'c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`'
    >>> clean_comments_rstrip_keep_empty_lines('c--c Float32 `json:$.cc--cc`\\n')
    'c'
    >>> clean_comments_rstrip_keep_empty_lines('`c--c` Float32 `json:$.cc--cc`\\n')
    '`c'
    """

    def clean_line_comments(line: str) -> str:
        # Scan for a "--" that is not inside a `json:...` backticked path.
        if not line:
            return line
        i = 0
        inside_json_path = False
        while i < len(line):
            if i + 1 < len(line) and line[i] == "-" and line[i + 1] == "-" and not inside_json_path:
                return line[:i].rstrip()

            if not inside_json_path and line[i:].startswith("`json:"):
                inside_json_path = True
            elif inside_json_path and line[i] == "`":
                inside_json_path = False
            i += 1
        return line

    if schema_to_clean is None:
        return schema_to_clean

    cleaned_schema = ""
    for line in schema_to_clean.splitlines():
        cleaned_line = clean_line_comments(line)
        cleaned_schema += cleaned_line + "\n"
    return cleaned_schema.rstrip()
369
+
370
+
371
# Token descriptor: a human-readable name plus the regex that detects the token
# at the current lookahead position.
SyntaxExpr = namedtuple("SyntaxExpr", ["name", "regex"])

# Column-modifier keywords recognized while parsing a column definition. Each
# regex requires leading whitespace and a non-identifier character (or end of
# string) after the keyword, so e.g. a name like "DEFAULTS" does not match DEFAULT.
NULL = SyntaxExpr("NULL", re.compile(r"\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
NOTNULL = SyntaxExpr("NOTNULL", re.compile(r"\s+NOT\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
DEFAULT = SyntaxExpr("DEFAULT", re.compile(r"\s+DEFAULT([^a-z0-9_]|$)", re.IGNORECASE))
MATERIALIZED = SyntaxExpr("MATERIALIZED", re.compile(r"\s+MATERIALIZED([^a-z0-9_]|$)", re.IGNORECASE))
ALIAS = SyntaxExpr("ALIAS", re.compile(r"\s+ALIAS([^a-z0-9_]|$)", re.IGNORECASE))
CODEC = SyntaxExpr("CODEC", re.compile(r"\s+CODEC([^a-z0-9_]|$)", re.IGNORECASE))
TTL = SyntaxExpr("TTL", re.compile(r"\s+TTL([^a-z0-9_]|$)", re.IGNORECASE))
# A jsonpath annotation such as `json:$.field` (opening backtick included).
JSONPATH = SyntaxExpr("JSONPATH", re.compile(r"\s+`json:", re.IGNORECASE))
COMMA = SyntaxExpr("COMMA", re.compile(r",", re.IGNORECASE))
NEW_LINE = SyntaxExpr("NEW_LINE", re.compile(r"\s$"))
TYPE = SyntaxExpr("TYPE", re.compile(r""))  # TYPE doesn't have a fixed initial string

# Matches a schema consisting only of whitespace (i.e. effectively empty).
REGEX_WHITESPACE = re.compile(r"\s*")
# Matches a "--" comment together with its line terminator.
REGEX_COMMENT = re.compile(r"\-\-[^\n\r]*[\n\r]")
387
+
388
+
389
def mark_error_string(s: str, i: int, line: int = 1) -> str:
    """Return line number *line* (1-based) of *s* with a '^---' marker placed
    under 0-based column *i* on the following line.

    >>> mark_error_string('0123456789', 0)
    '0123456789\\n^---'
    >>> mark_error_string('0123456789', 9)
    '0123456789\\n         ^---'
    >>> mark_error_string('01234\\n56789', 1)
    '01234\\n ^---'
    """
    target = s.splitlines()[line - 1] if s else ""
    return "{}\n{}^---".format(target, " " * i)
403
+
404
+
405
def format_parse_error(
    table_structure: str,
    i: int,
    position: int,
    hint: Optional[str] = None,
    line: int = 0,
    keyword: Optional[str] = None,
) -> str:
    """Build a human-readable parse-error message: optional hint, the offending
    line with a '^---' marker (via mark_error_string), and a "found ... at
    position N" trailer.

    NOTE(review): when `keyword` is given, len(keyword) is subtracted twice —
    once into `adjusted_position` and again in the trailer — presumably to point
    at the keyword's start, but this looks suspicious; confirm against callers.
    NOTE(review): the default line=0 makes mark_error_string index splitlines()[-1]
    (the last line); callers in this file always pass line= explicitly — confirm
    the default is intentional.
    """
    adjusted_position = position - (len(keyword) if keyword else 0)
    message = f"{hint}\n" if hint else ""
    message += mark_error_string(table_structure, adjusted_position - 1, line=line)

    if keyword:
        message += f" found at position {adjusted_position - len(keyword)}"
    else:
        # Show the offending character, or 'EOF' when i is past the end of the input.
        message += (
            f" found {repr(table_structure[i]) if len(table_structure)>i else 'EOF'} at position {adjusted_position}"
        )
    return message
424
+
425
+
426
def clean_line_comments(line: str) -> str:
    """Strip a trailing '--' comment from *line*, ignoring any '--' that appears
    inside a `json:...` backticked path; the kept part is stripped of whitespace."""
    if not line:
        return line
    in_json_path = False
    for idx in range(len(line)):
        if not in_json_path and line.startswith("--", idx):
            return line[:idx].strip()
        if not in_json_path and line.startswith("`json:", idx):
            in_json_path = True
        elif in_json_path and line[idx] == "`":
            in_json_path = False
    return line
441
+
442
+
443
def _parse_table_structure(schema: str) -> List[Dict[str, Any]]:
    """Parse a SCHEMA section into a list of column dicts.

    Each returned dict has keys: name, type, codec, default_value, jsonpath,
    plus nullable/normalized_name added during the final normalization pass.
    Raises SchemaSyntaxError (with 1-based line/pos info) on invalid input, and
    ValueError for a few legacy error paths (see TODOs below).
    """
    # CH syntax from https://clickhouse.com/docs/en/sql-reference/statements/create/table/
    # name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1]
    try:
        # This removes lines that are empty after removing comments, which might make it hard to locate errors properly.
        # The parsing code afterwards seems to be mostly robust to empty lines.
        # Perhaps I'll deliberately not support reporting errors correctly when empty lines have been removed to start
        # with, and later I can see how to support it.
        # It also removes the indentation of the lines, which might make it hard to locate errors properly.
        # schema = clean_comments(schema + "\n")

        # I've swapped the above with this. A first test didn't show any side effects in parsing a schema, and it should
        # allow us to keep track of the line numbers in the error messages.
        schema = clean_comments_rstrip_keep_empty_lines(schema + "\n")
    except Exception:
        # logging.exception(f"Error cleaning comments: {e}")
        schema = REGEX_COMMENT.sub(" ", schema + "\n").strip()

    # Whitespace-only schema parses to an empty column list.
    if REGEX_WHITESPACE.fullmatch(schema):
        return []

    # Cursor into `schema`, shared with the nested helpers via closure.
    i: int = 0

    # For error feedback only
    line: int = 1
    pos: int = 1

    # Find the first SyntaxExpr in lookup that matches the schema at the current offset
    def lookahead_matches(lookup: Iterable) -> Optional[SyntaxExpr]:
        s = schema[i:]
        match = next((x for x in lookup if x.regex.match(s)), None)
        return match

    # Advance the cursor one char, keeping the line/pos error-reporting counters in sync.
    def advance_single_char() -> None:
        nonlocal i, line, pos
        if schema[i] == "\n":
            line += 1
            pos = 1
        else:
            pos += 1
        i += 1

    # Advance all whitespaces characters and then len(s) more chars
    def advance(s: str) -> None:
        # NOTE(review): if whitespace runs to the very end of `schema`, the inner
        # while can index past the end — presumably inputs always end with a
        # non-space char after cleaning; confirm.
        if i < len(schema):
            while schema[i] in " \t\r\n":
                advance_single_char()
            for _ in s:
                advance_single_char()

    # Consume up to and including a closing backtick, returning the enclosed text
    # (the opening backtick must already have been consumed).
    def get_backticked() -> str:
        begin = i
        while i < len(schema):
            c = schema[i]
            advance_single_char()
            if c == "`":
                return schema[begin : i - 1]
            if c in " \t\r\n":
                raise SchemaSyntaxError(message="Expected closing backtick", lineno=line, pos=pos - 1)
        raise SchemaSyntaxError(message="Expected closing backtick", lineno=line, pos=pos)

    # Parse a column name: either a bare identifier limited to valid_chars_name,
    # or an arbitrary backticked name.
    def parse_name() -> str:
        nonlocal i, line, pos
        if schema[i] != "`":
            # regular name
            begin = i
            while i < len(schema):
                c = schema[i]
                if c in " \t\r\n":
                    return schema[begin:i]
                if c not in valid_chars_name:
                    raise SchemaSyntaxError(
                        message=f"Column name contains invalid character {repr(c)}",
                        hint="Tip: use backticks",
                        lineno=line,
                        pos=i + 1,
                    )
                advance_single_char()
            return schema[begin:i]
        else:
            # backticked name
            advance_single_char()
            return get_backticked()

    # Parse an expression (type, DEFAULT body, CODEC body...) until one of the
    # `lookup` tokens appears outside of any quote/parenthesis context.
    def parse_expr(lookup: Iterable[SyntaxExpr]) -> str:
        nonlocal i, line, pos

        begin: int = i
        # Stack of open contexts: None (top level), "'", '"', or "(".
        context_stack: List[Optional[str]] = [None]
        while i < len(schema):
            context = context_stack[-1]
            c = schema[i]

            if (context == "'" and c == "'") or (context == '"' and c == '"') or (context == "(" and c == ")"):
                context_stack.pop()
            elif c == "'" and (context is None or context == "("):
                context_stack.append("'")
            elif c == '"' and (context is None or context == "("):
                context_stack.append('"')
            elif c == "(" and (context is None or context == "("):
                context_stack.append("(")
            elif context is None and lookahead_matches(lookup):
                return schema[begin:i].strip(" \t\r\n")
            elif (context is None and c not in valid_chars_fn) or (context == "(" and c not in valid_chars_fn):
                raise SchemaSyntaxError(message=f"Invalid character {repr(c)}", lineno=line, pos=pos)
            advance_single_char()
        if i == begin:
            # TODO(eclbg): Turn this into a SchemaSyntaxError. I don't know when it happens
            raise ValueError(format_parse_error(schema, i, pos, "wrong value", line=line))
        return schema[begin:].strip(" \t\r\n")

    columns: List[Dict[str, Any]] = []

    # Accumulators for the column currently being parsed; flushed by add_column().
    name: str = ""
    _type: str = ""
    default: str = ""
    materialized: str = ""
    codec: str = ""
    jsonpath: str = ""
    last: Optional[SyntaxExpr] = None
    col_start: Tuple[int, int] = (0, 0)  # (0, 0) means not set. It's not a valid line/pos as they start at 1
    col_end: Tuple[int, int] = (0, 0)  # (0, 0) means not set. It's not a valid line/pos as they start at 1

    # Flush the accumulated column parts into `columns` and reset the accumulators.
    # `found` names the token that terminated the column (for error messages).
    def add_column(found: str) -> None:
        nonlocal name, _type, default, materialized, codec, jsonpath, col_start, col_end
        if not name:
            # TODO(eclbg): get rid of this ValueError and replace it with a custom one so it can be handled by the
            # caller
            raise ValueError(
                format_parse_error(schema, i, pos, f"Syntax error: expecting NAME, found {found}", line=line)
            )
        default = "" if not default else f"DEFAULT {default}"
        materialized = "" if not materialized else f"MATERIALIZED {materialized}"
        codec = "" if not codec else f"CODEC{codec}"
        # TODO(eclbg): We should validate the column as a whole. Name is mandatory, and one of type, default_value or
        # materialized (I think).
        columns.append(
            {
                "name": name,
                "type": _type,
                "codec": codec,
                "default_value": default or materialized,
                "jsonpath": jsonpath,
                # "col_start": col_start,
                # "col_end": col_end,
            }
        )
        name = ""
        _type = ""
        default = ""
        materialized = ""
        codec = ""
        jsonpath = ""

    # Main token loop: parse a name, then dispatch on the next recognized token.
    valid_next: List[SyntaxExpr] = [TYPE]
    while i < len(schema):
        if not name:
            advance("")
            valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, TYPE]
            col_start = (line, pos)
            name = parse_name()
            if name == "INDEX":
                raise SchemaSyntaxError(
                    message="Forbidden INDEX definition",
                    hint="Indexes are not allowed in SCHEMA section. Use the INDEXES section instead",
                    lineno=line,
                    pos=pos - len(name),  # We've already advanced the name
                )
            continue
        found = lookahead_matches(
            [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE, TYPE]
        )
        if found and found not in valid_next:
            after = f" after {last.name}" if last else ""
            raise SchemaSyntaxError(message=f"Unexpected {found.name}{after}", lineno=line, pos=pos)
        if found == TYPE:
            advance("")
            valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE]
            type_start_pos = pos  # Save the position of the type start to use it in the error message
            detected_type = parse_expr([NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
            try:
                # Imported in the body to be compatible with the CLI
                from chtoolset.query import check_compatible_types

                # Check compatibility of the type with itself to verify it's a known type
                check_compatible_types(detected_type, detected_type)
            except ValueError as e:
                if (
                    "unknown data type family" in str(e).lower()
                    or "incompatible data types between aggregate function" in str(e).lower()
                ):
                    raise SchemaSyntaxError(message=str(e), lineno=line, pos=type_start_pos)
                else:
                    raise e
            except ModuleNotFoundError:
                # chtoolset is optional in the CLI environment: skip type validation.
                pass
            _type = detected_type
        elif found == NULL:
            # Not implemented
            advance("")  # We need to advance to get the correct position
            raise SchemaSyntaxError(
                message="NULL column syntax not supported",
                hint="Hint: use Nullable(...)",
                lineno=line,
                pos=pos,
            )
        elif found == NOTNULL:
            advance("")  # We need to advance to get the correct position
            raise SchemaSyntaxError(
                message="NOT NULL column syntax not supported",
                hint="Hint: Columns are not nullable by default",
                lineno=line,
                pos=pos,
            )
        elif found == DEFAULT:
            advance("DEFAULT")
            valid_next = [
                CODEC,
                TTL,
                COMMA,
                # The three matches below are never valid. We're adding them here to avoid just complaining about their placement.
                MATERIALIZED,
                NULL,
                NOTNULL,
            ]
            default = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
        elif found == MATERIALIZED:
            advance("")
            raise SchemaSyntaxError(
                message="MATERIALIZED columns are not supported",
                lineno=line,
                pos=pos,
            )
        elif found == ALIAS:
            # Not implemented
            advance("")  # We need to advance to get the correct position
            raise SchemaSyntaxError(
                message="ALIAS columns are not supported",
                lineno=line,
                pos=pos,
            )
        elif found == CODEC:
            advance("CODEC")
            valid_next = [
                TTL,
                COMMA,
                JSONPATH,
                # The three matches below are never valid. We're adding them here to avoid just complaining about their placement.
                MATERIALIZED,
                NULL,
                NOTNULL,
            ]
            codec = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
        elif found == TTL:
            advance("")  # We need to advance to get the correct position
            # Not implemented
            advance("")
            raise SchemaSyntaxError(
                message="column TTL is not supported",
                lineno=line,
                pos=pos,
            )
        elif found == JSONPATH:
            advance("`json:")
            jsonpath = get_backticked()
        elif found == COMMA:
            # NOTE(review): this INDEX sub-branch looks unreachable — parse_name()
            # raises on name == "INDEX" before any COMMA can be seen; confirm.
            if name == "INDEX":
                advance(",")
                continue
            advance(",")
            valid_next = []
            col_end = (line, pos)
            add_column("COMMA")
        elif found == NEW_LINE or (name == "INDEX" and not found):
            # NOTE(review): bypasses advance_single_char(), so the line/pos
            # counters are not updated here; positions reported after this point
            # may drift — confirm whether this is intended.
            i += 1
        else:
            raise ValueError(
                format_parse_error(
                    schema,
                    i,
                    pos,
                    "wrong value, DEFAULT, MATERIALIZED, CODEC, TTL expressions, a column data type, a comma, a new line or a jsonpath",
                    line=line,
                )
            )
        last = found
        col_end = (line, i + 1)
    # Only add the last column if we've parsed something. This allows for a trailing comma after the last column.
    if name:
        add_column("EOF")

    # normalize columns
    for column in columns:
        nullable = column["type"].lower().startswith("nullable")
        column["type"] = column["type"] if not nullable else column["type"][len("Nullable(") : -1]  # ')'
        column["nullable"] = nullable
        column["codec"] = column["codec"] if column["codec"] else None
        column["name"] = column["name"]
        column["normalized_name"] = column["name"]
        column["jsonpath"] = column["jsonpath"] if column["jsonpath"] else None
        default_value = column["default_value"] if column["default_value"] else None
        # A "DEFAULT NULL" on a Nullable column is the implicit default: drop it.
        if nullable and default_value and default_value.lower() == "default null":
            default_value = None
        column["default_value"] = default_value
    return columns
748
+
749
+
750
def try_to_fix_nullable_in_simple_aggregating_function(t: str) -> Optional[str]:
    """Rewrite a SimpleAggregateFunction type so the Nullable wraps the inner type.

    Workaround for https://github.com/ClickHouse/ClickHouse/issues/34407: for
    nullable columns ClickHouse returns Nullable(SimpleAggregateFunction(sum, Int32))
    instead of SimpleAggregateFunction(sum, Nullable(Int32)) as it does for other
    aggregate functions; without this fix the aggregation could return incorrect
    results. Returns None when *t* does not need rewriting.
    """
    match = re.search(r"SimpleAggregateFunction\((\w+),\s*(?!(?:Nullable))([\w,.()]+)\)", t)
    if match is None:
        return None
    fn, inner_type = match.group(1), match.group(2)
    return f"SimpleAggregateFunction({fn}, Nullable({inner_type}))"
762
+
763
+
764
def col_name(name: str, backquotes: bool = True) -> str:
    """Normalize a column name to be backquoted (or not) regardless of how it came in.

    >>> col_name('`test`', True)
    '`test`'
    >>> col_name('`test`', False)
    'test'
    >>> col_name('test', True)
    '`test`'
    >>> col_name('test', False)
    'test'
    >>> col_name('', True)
    ''
    >>> col_name('', False)
    ''
    """
    if not name:
        return name
    already_quoted = name[0] == "`" and name[-1] == "`"
    if already_quoted:
        return name if backquotes else name[1:-1]
    if backquotes:
        return f"`{name}`"
    return name
784
+
785
+
786
def schema_to_sql_columns(schema: List[Dict[str, Any]]) -> List[str]:
    """return an array with each column in SQL
    >>> schema_to_sql_columns([{'name': 'temperature', 'type': 'Float32', 'codec': None, 'default_value': None, 'nullable': False, 'normalized_name': 'temperature'}, {'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature` Float32', '`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4))']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': '', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature_delta` Float32 MATERIALIZED temperature']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': '', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature_delta` Float32 CODEC(Delta(4), LZ4))']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature_delta` Float32']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta', 'jsonpath': '$.temperature_delta'}])
    ['`temperature_delta` Float32 `json:$.temperature_delta`']
    >>> schema_to_sql_columns([{'name': 'aggregation', 'type': 'SimpleAggregateFunction(sum, Int32)', 'nullable': True, 'normalized_name': 'aggregation', 'jsonpath': '$.aggregation'}])
    ['`aggregation` SimpleAggregateFunction(sum, Nullable(Int32)) `json:$.aggregation`']
    """
    rendered: List[str] = []
    for column in schema:
        name = column["normalized_name"] if "normalized_name" in column else column["name"]
        # Nullable columns: try the SimpleAggregateFunction workaround first,
        # otherwise wrap the type in Nullable(...).
        if column["nullable"]:
            fixed = try_to_fix_nullable_in_simple_aggregating_function(column["type"])
            column_type = "Nullable(%s)" % column["type"] if fixed is None else fixed
        else:
            column_type = column["type"]
        parts = [col_name(name, backquotes=True), column_type]
        jsonpath = column.get("jsonpath", None)
        if jsonpath:
            parts.append(f"`json:{jsonpath}`")
        # default_value / codec are optional and may be missing, empty or None.
        for optional_key in ("default_value", "codec"):
            value = column.get(optional_key)
            if value not in ("", None):
                parts.append(value)
        rendered.append(" ".join(part for part in parts if part).strip())
    return rendered
819
+
820
+
821
def parse_table_structure(schema: str) -> List[Dict[str, Any]]:
    """This parses the SQL schema for a CREATE TABLE
    Columns follow the syntax: name1 [type1] [DEFAULT expr1] [CODEC compression_codec] [TTL expr1] [JSONPATH `json:jsonpath`] [,]

    The ClickHouse reference is followed pretty loosely at this point.
    Reference: https://clickhouse.tech/docs/en/sql-reference/statements/create/table/#syntax-forms

    Returns a list of column dicts with the keys: ``name``, ``type``, ``codec``,
    ``default_value``, ``jsonpath``, ``nullable`` and ``normalized_name``.

    Raises ``SchemaSyntaxError`` (with line/position information) when the
    schema cannot be parsed, as shown in the doctests below.

    >>> parse_table_structure('potato') # doctest: +SKIP
    Traceback (most recent call last):
    ...
    tinybird.sql.MalformedColumnError: Column name and either type or default_value are required

    >>> parse_table_structure(' potato Int32')
    [{'name': 'potato', 'type': 'Int32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'potato'}]

    >>> parse_table_structure('`c Int32')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Expected closing backtick at 1:3.

    >>> parse_table_structure('c Float32, b String')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Float32,--comment\\nb String')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Float32,--comment\\nb String --another-comment')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Float32 --first-comment\\n,--comment\\nb String --another-comment')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('--random comment here\\nc Float32 --another comment\\n,--another one\\nb String --this is the last one')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('--extra comment\\nc--extra comment\\nFloat32--extra comment\\n,--extra comment\\nb--extra comment\\nString--extra comment')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Nullable(Float32)')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]

    >>> parse_table_structure('c Nullable(Float32) DEFAULT NULL')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]

    >>> parse_table_structure("c String DEFAULT 'bla'")
    [{'name': 'c', 'type': 'String', 'codec': None, 'default_value': "DEFAULT 'bla'", 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]

    >>> parse_table_structure('`foo.bar` UInt64')
    [{'name': 'foo.bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo.bar'}]

    >>> parse_table_structure('double_value Float64 CODEC(LZ4HC(2))')
    [{'name': 'double_value', 'type': 'Float64', 'codec': 'CODEC(LZ4HC(2))', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'double_value'}]

    >>> parse_table_structure('doubl/e_value Float64 CODEC(LZ4HC(2))')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Column name contains invalid character '/' at 1:6. Tip: use backticks.

    >>> parse_table_structure('`c` Nullable(Float32)')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]

    >>> parse_table_structure('wadus INT UNSIGNED')
    [{'name': 'wadus', 'type': 'INT UNSIGNED', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'wadus'}]

    >>> parse_table_structure('c Int32 CODEC(Delta, LZ4)\\n')
    [{'name': 'c', 'type': 'Int32', 'codec': 'CODEC(Delta, LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]

    >>> parse_table_structure('c SimpleAggregateFunction(sum, Int32),\\np SimpleAggregateFunction(sum, Int32)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Incompatible data types between aggregate function 'sum' which returns Int64 and column storage type Int32 at 1:4.

    >>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized b*2\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:27.

    >>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized ifNull(b*2, 0)\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:27.

    >>> parse_table_structure('c Int32 Materialized b*2\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.

    >>> parse_table_structure('c Int32 Materialized b != 1 ? b*2: pow(b, 3)\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.

    >>> parse_table_structure('')
    []

    >>> parse_table_structure('`date` Date,`timezone` String,`offset` Int32')
    [{'name': 'date', 'type': 'Date', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'date'}, {'name': 'timezone', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'timezone'}, {'name': 'offset', 'type': 'Int32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'offset'}]

    >>> parse_table_structure('c Int32 Materialized b*2 CODEC(Delta, LZ4)\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.

    >>> parse_table_structure('c Int32 Materialized ifNull(b*2, 0) CODEC(Delta, LZ4)\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.

    >>> parse_table_structure('`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:29.

    >>> parse_table_structure('foo^bar Float32')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Column name contains invalid character '^' at 1:4. Tip: use backticks.

    >>> parse_table_structure('foo Float#32')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Invalid character '#' at 1:10.

    >>> parse_table_structure('foo Float32 DEFAULT 13, bar UInt64')
    [{'name': 'foo', 'type': 'Float32', 'codec': None, 'default_value': 'DEFAULT 13', 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo'}, {'name': 'bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'bar'}]

    >>> parse_table_structure('foo Float32 DEFAULT 1$$$3')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Invalid character '$' at 1:22.

    >>> parse_table_structure('foo Float32 CODEC(Delta(4), LZ#4)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Invalid character '#' at 1:31.

    >>> parse_table_structure('\\n    `temperature` Float32,\\n    `temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)\\n    ')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 3:33.

    >>> parse_table_structure('temperature Float32, temperature_delta Float32 MATERIALIZED temperature Codec(Delta(4)), temperature_doubledelta Float32 MATERIALIZED temperature Codec(DoubleDelta), temperature_doubledelta_lz4 Float32 MATERIALIZED temperature Codec(DoubleDelta, LZ4)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:48.

    >>> parse_table_structure('t UInt8 CODEC(Delta(1), LZ4)')
    [{'name': 't', 'type': 'UInt8', 'codec': 'CODEC(Delta(1), LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 't'}]

    >>> parse_table_structure('tt UInt8 MATERIALIZED t')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:11.

    >>> parse_table_structure('tt UInt8 MATERIALIZED t CODEC(Delta(1), LZ4)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:11.

    >>> parse_table_structure('tt SimpleAggregateFunction(any, Nullable(UInt8))')
    [{'name': 'tt', 'type': 'SimpleAggregateFunction(any, Nullable(UInt8))', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'tt'}]

    >>> parse_table_structure("timestamp DateTime MATERIALIZED toDateTime(JSONExtractInt(JSONExtractRaw(record, 'payload'), 'timestamp') / 1000)")
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:20.

    >>> parse_table_structure("`test_default_cast` DEFAULT plus(13,1)")
    [{'name': 'test_default_cast', 'type': '', 'codec': None, 'default_value': 'DEFAULT plus(13,1)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'test_default_cast'}]

    >>> parse_table_structure("hola Int, `materialized` String MATERIALIZED upper(no_nullable_string)")
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:33.

    >>> parse_table_structure('`a2` String `json:$.a2`, `a3` String `json:$.a3`\\n')
    [{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]

    >>> parse_table_structure("`arr` Array(String) DEFAULT ['-']")
    [{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT ['-']", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]

    >>> parse_table_structure("`arr` Array(String) DEFAULT array('-')")
    [{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT array('-')", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]

    >>> parse_table_structure('`a2` Float32 CODEC(Delta, ZSTD(4)) `json:$.a2`, `a3` String `json:$.a3`\\n')
    [{'name': 'a2', 'type': 'Float32', 'codec': 'CODEC(Delta, ZSTD(4))', 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]

    >>> parse_table_structure('`a` String, INDEX index_name a TYPE set(100) GRANULARITY 100')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Forbidden INDEX definition at 1:13. Indexes are not allowed in SCHEMA section. Use the INDEXES section instead.

    >>> parse_table_structure('    `a` String,\\n    INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Forbidden INDEX definition at 2:5. Indexes are not allowed in SCHEMA section. Use the INDEXES section instead.

    >>> parse_table_structure('`index` String, INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Forbidden INDEX definition at 1:17. Indexes are not allowed in SCHEMA section. Use the INDEXES section instead.

    >>> parse_table_structure('`a2` String `json:$.a--2`, `a3` String `json:$.a3`\\n')
    [{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a--2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]

    >>> parse_table_structure('a InvalidType')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Unknown data type family: InvalidType at 1:3.

    >>> parse_table_structure("a Int32 DEFAULT 'a'") # doctest: +SKIP
    # should fail as the type and default expr are incompatible
    """
    # Thin public wrapper: the doctests above pin the contract in one place,
    # while the actual parsing lives in _parse_table_structure.
    return _parse_table_structure(schema)
1035
+
1036
+
169
1037
  def parse(
170
1038
  s: str,
171
1039
  default_node: Optional[str] = None,
@@ -175,11 +1043,9 @@ def parse(
175
1043
  ) -> Datafile:
176
1044
  """
177
1045
  Parses `s` string into a document
178
- >>> d = parse("FROM SCRATCH\\nSOURCE 'https://example.com'\\n#this is a comment\\nMAINTAINER 'rambo' #this is me\\nNODE \\"test_01\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_00\\n\\n\\nNODE \\"test_02\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_01\\n WHERE a > 1\\n GROUP by a\\n")
1046
+ >>> d = parse("MAINTAINER 'rambo' #this is me\\nNODE \\"test_01\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_00\\n\\n\\nNODE \\"test_02\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_01\\n WHERE a > 1\\n GROUP by a\\n")
179
1047
  >>> d.maintainer
180
1048
  'rambo'
181
- >>> d.sources
182
- ['https://example.com']
183
1049
  >>> len(d.nodes)
184
1050
  2
185
1051
  >>> d.nodes[0]
@@ -192,12 +1058,43 @@ def parse(
192
1058
  doc = Datafile()
193
1059
  doc.raw = list(StringIO(s, newline=None))
194
1060
 
195
- parser_state = namedtuple("parser_state", ["multiline", "current_node", "command", "multiline_string", "is_sql"])
1061
+ parser_state = namedtuple(
1062
+ "parser_state", ["multiline", "current_node", "command", "multiline_string", "is_sql", "start_lineno"]
1063
+ )
196
1064
 
197
1065
  parser_state.multiline = False
198
1066
  parser_state.current_node = False
1067
+ parser_state.start_lineno = None
1068
+
1069
+ def multiline_not_supported(func: Callable[..., Any]) -> Callable[..., Any]:
1070
+ @functools.wraps(func)
1071
+ def error_if_multiline(*args: Any, **kwargs: Any) -> Any:
1072
+ if parser_state.multiline:
1073
+ parser_state.multiline = (
1074
+ False # So we don't offset the line number when processing the exception. A bit hacky
1075
+ )
1076
+ raise DatafileSyntaxError(
1077
+ f"{kwargs['cmd'].upper()} does not support multiline arguments",
1078
+ lineno=parser_state.start_lineno, # We want to report the line where the command starts
1079
+ pos=1,
1080
+ )
1081
+ return func(*args, **kwargs)
1082
+
1083
+ return error_if_multiline
1084
+
1085
+ def deprecated(func: Callable[..., Any]) -> Callable[..., Any]:
1086
+ @functools.wraps(func)
1087
+ def raise_deprecation_error(*args: Any, **kwargs: Any) -> Any:
1088
+ raise DatafileSyntaxError(
1089
+ f"{kwargs['cmd'].upper()} has been deprecated",
1090
+ lineno=kwargs["lineno"],
1091
+ pos=1,
1092
+ )
1093
+
1094
+ return raise_deprecation_error
199
1095
 
200
1096
  def assign(attr):
1097
+ @multiline_not_supported
201
1098
  def _fn(x, **kwargs):
202
1099
  setattr(doc, attr, _unquote(x))
203
1100
 
@@ -207,7 +1104,10 @@ def parse(
207
1104
  s = _unquote("".join(args))
208
1105
  try:
209
1106
  sh = parse_table_structure(s)
1107
+ except SchemaSyntaxError as e:
1108
+ raise e
210
1109
  except Exception as e:
1110
+ # TODO(eclbg): Does it make sense to keep this exception? I'd like to get rid of all ParseException
211
1111
  raise ParseException(FeedbackManager.error_parsing_schema(line=kwargs["lineno"], error=e))
212
1112
 
213
1113
  parser_state.current_node["schema"] = ",".join(schema_to_sql_columns(sh))
@@ -219,26 +1119,33 @@ def parse(
219
1119
  return
220
1120
  try:
221
1121
  indexes = parse_indexes_structure(s.splitlines())
1122
+ except IndexesSyntaxError as e:
1123
+ raise e
222
1124
  except Exception as e:
1125
+ # TODO(eclbg): We get here when an unidentified error happens but we still report a parsing error. We could rething this.
223
1126
  raise ParseException(FeedbackManager.error_parsing_indices(line=kwargs["lineno"], error=e))
224
1127
 
225
1128
  parser_state.current_node["indexes"] = indexes
226
1129
 
227
1130
  def assign_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
1131
+ @multiline_not_supported
228
1132
  def _f(*args: str, **kwargs: Any):
229
1133
  s = _unquote((" ".join(args)).strip())
230
1134
  parser_state.current_node[v.lower()] = eval_var(s, skip=skip_eval)
231
1135
 
232
1136
  return _f
233
1137
 
1138
+ @deprecated
234
1139
  def sources(x: str, **kwargs: Any) -> None:
235
- doc.sources.append(_unquote(x))
1140
+ pass # Deprecated
236
1141
 
1142
+ @multiline_not_supported
237
1143
  def node(*args: str, **kwargs: Any) -> None:
238
1144
  node = {"name": eval_var(_unquote(args[0]))}
239
1145
  doc.nodes.append(node)
240
1146
  parser_state.current_node = node
241
1147
 
1148
+ @multiline_not_supported
242
1149
  def scope(*args: str, **kwargs: Any) -> None:
243
1150
  scope = {"name": eval_var(_unquote(args[0]))}
244
1151
  doc.nodes.append(scope)
@@ -255,9 +1162,21 @@ def parse(
255
1162
  doc.description = description
256
1163
 
257
1164
  def sql(var_name: str, **kwargs: Any) -> Callable[[str, KwArg(Any)], None]:
258
- def _f(sql: str, **kwargs: Any) -> None:
1165
+ # TODO(eclbg): We shouldn't allow SQL in datasource files
1166
+ def _f(sql: str, *args: Any, **kwargs: Any) -> None:
1167
+ if not parser_state.multiline:
1168
+ raise DatafileSyntaxError(
1169
+ "SQL must be multiline",
1170
+ hint="Use > to start a multiline SQL block",
1171
+ lineno=kwargs["lineno"],
1172
+ pos=1,
1173
+ )
259
1174
  if not parser_state.current_node:
260
- raise ParseException("SQL must be called after a NODE command")
1175
+ raise DatafileSyntaxError(
1176
+ "SQL must be called after a NODE command",
1177
+ lineno=kwargs["lineno"],
1178
+ pos=1,
1179
+ )
261
1180
  parser_state.current_node[var_name] = (
262
1181
  textwrap.dedent(sql).rstrip() if "%" not in sql.strip()[0] else sql.strip()
263
1182
  )
@@ -268,20 +1187,28 @@ def parse(
268
1187
  def assign_node_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
269
1188
  def _f(*args: str, **kwargs: Any) -> None:
270
1189
  if not parser_state.current_node:
271
- raise ParseException("%s must be called after a NODE command" % v)
1190
+ raise DatafileSyntaxError(
1191
+ f"{v} must be called after a NODE command",
1192
+ lineno=kwargs["lineno"],
1193
+ pos=1,
1194
+ )
272
1195
  return assign_var(v)(*args, **kwargs)
273
1196
 
274
1197
  return _f
275
1198
 
1199
+ @multiline_not_supported
276
1200
  def add_token(*args: str, **kwargs: Any) -> None: # token_name, permissions):
1201
+ # lineno = kwargs["lineno"]
277
1202
  if len(args) < 2:
278
- raise ParseException('TOKEN gets two params, token name and permissions e.g TOKEN "read api token" READ')
1203
+ raise DatafileSyntaxError(
1204
+ message='TOKEN takes two params: token name and permissions e.g TOKEN "read api token" READ',
1205
+ lineno=lineno,
1206
+ pos=1,
1207
+ )
1208
+ # TODO(eclbg): We should validate that the permissions are a valid string. We only support READ for pipes and
1209
+ # APPEND for datasources
279
1210
  doc.tokens.append({"token_name": _unquote(args[0]), "permissions": args[1]})
280
1211
 
281
- def test(*args: str, **kwargs: Any) -> None:
282
- # TODO: Should be removed?
283
- print("test", args, kwargs) # noqa: T201
284
-
285
1212
  def include(*args: str, **kwargs: Any) -> None:
286
1213
  f = _unquote(args[0])
287
1214
  f = eval_var(f)
@@ -330,16 +1257,9 @@ def parse(
330
1257
  except FileNotFoundError:
331
1258
  raise IncludeFileNotFoundException(f, lineno)
332
1259
 
1260
+ @deprecated
333
1261
  def version(*args: str, **kwargs: Any) -> None:
334
- if len(args) < 1:
335
- raise ParseException("VERSION gets one positive integer param")
336
- try:
337
- version = int(args[0])
338
- if version < 0:
339
- raise ValidationException("version must be a positive integer e.g VERSION 2")
340
- doc.version = version
341
- except ValueError:
342
- raise ValidationException("version must be a positive integer e.g VERSION 2")
1262
+ pass # whatever, it's deprecated
343
1263
 
344
1264
  def shared_with(*args: str, **kwargs: Any) -> None:
345
1265
  for entries in args:
@@ -381,13 +1301,10 @@ def parse(
381
1301
  doc.filtering_tags += filtering_tags
382
1302
 
383
1303
  cmds = {
384
- "from": assign("from"),
385
1304
  "source": sources,
386
1305
  "maintainer": assign("maintainer"),
387
1306
  "schema": schema,
388
1307
  "indexes": indexes,
389
- # TODO: Added to be able to merge MR 11347, let's remove it afterwards
390
- "indices": indexes,
391
1308
  "engine": set_engine,
392
1309
  "partition_key": assign_var("partition_key"),
393
1310
  "sorting_key": assign_var("sorting_key"),
@@ -408,7 +1325,6 @@ def parse(
408
1325
  "resource": assign_node_var("resource"),
409
1326
  "filter": assign_node_var("filter"),
410
1327
  "token": add_token,
411
- "test": test,
412
1328
  "include": include,
413
1329
  "sql": sql("sql"),
414
1330
  "version": version,
@@ -462,10 +1378,11 @@ def parse(
462
1378
  if default_node:
463
1379
  node(default_node)
464
1380
 
465
- lineno = 0
1381
+ lineno = 1
466
1382
  try:
467
- while lineno < len(lines):
468
- line = lines[lineno]
1383
+ while lineno <= len(lines):
1384
+ line = lines[lineno - 1]
1385
+ # shlex.shlex(line) removes comments that start with #. This doesn't affect multiline commands
469
1386
  try:
470
1387
  sa = shlex.shlex(line)
471
1388
  sa.whitespace_split = True
@@ -479,23 +1396,37 @@ def parse(
479
1396
  if (
480
1397
  parser_state.multiline
481
1398
  and cmd.lower() in cmds
482
- and not (line.startswith(" ") or line.startswith("\t") or line.lower().startswith("from"))
1399
+ and not (line.startswith(" ") or line.startswith("\t"))
483
1400
  ):
484
- parser_state.multiline = False
485
1401
  cmds[parser_state.command](
486
- parser_state.multiline_string, lineno=lineno, replace_includes=replace_includes
1402
+ parser_state.multiline_string,
1403
+ lineno=lineno,
1404
+ replace_includes=replace_includes,
1405
+ cmd=parser_state.command,
487
1406
  )
1407
+ parser_state.multiline = False
488
1408
 
489
1409
  if not parser_state.multiline:
490
1410
  if len(args) >= 1 and args[0] == ">":
491
1411
  parser_state.multiline = True
492
1412
  parser_state.command = cmd.lower()
1413
+ parser_state.start_lineno = lineno
493
1414
  parser_state.multiline_string = ""
494
1415
  else:
495
1416
  if cmd.lower() == "settings":
496
- raise click.ClickException(FeedbackManager.error_settings_not_allowed())
1417
+ msg = (
1418
+ "SETTINGS option is not allowed, use ENGINE_SETTINGS instead. See "
1419
+ "https://www.tinybird.co/docs/cli/datafiles#data-source for more information."
1420
+ )
1421
+ raise DatafileSyntaxError(
1422
+ # TODO(eclbg): add surrounding lines as context to the error so we can print it
1423
+ # offending_line=line,
1424
+ message=msg,
1425
+ lineno=lineno,
1426
+ pos=0,
1427
+ )
497
1428
  if cmd.lower() in cmds:
498
- cmds[cmd.lower()](*args, lineno=lineno, replace_includes=replace_includes)
1429
+ cmds[cmd.lower()](*args, lineno=lineno, replace_includes=replace_includes, cmd=cmd)
499
1430
  else:
500
1431
  raise click.ClickException(FeedbackManager.error_option(option=cmd.upper()))
501
1432
  else:
@@ -503,11 +1434,20 @@ def parse(
503
1434
  lineno += 1
504
1435
  # close final state
505
1436
  if parser_state.multiline:
506
- cmds[parser_state.command](parser_state.multiline_string, lineno=lineno, replace_includes=replace_includes)
1437
+ cmds[parser_state.command](
1438
+ parser_state.multiline_string,
1439
+ lineno=lineno,
1440
+ replace_includes=replace_includes,
1441
+ cmd=parser_state.command,
1442
+ )
1443
+ except DatafileSyntaxError as e:
1444
+ # When the error is in a multiline block, add the start lineno to the error lineno so the error location is in
1445
+ # respect to the whole file
1446
+ if parser_state.multiline:
1447
+ e.lineno += parser_state.start_lineno
1448
+ raise e
507
1449
  except ParseException as e:
508
1450
  raise ParseException(str(e), lineno=lineno)
509
- except ValidationException as e:
510
- raise ValidationException(str(e), lineno=lineno)
511
1451
  except IndexError as e:
512
1452
  if "node" in line.lower():
513
1453
  raise click.ClickException(FeedbackManager.error_missing_node_name())