tinybird 0.0.1.dev14__py3-none-any.whl → 0.0.1.dev15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic. See the package registry's advisory page for this version of tinybird for details.

@@ -1,3 +1,4 @@
1
+ import functools
1
2
  import glob
2
3
  import itertools
3
4
  import os
@@ -5,23 +6,55 @@ import os.path
5
6
  import pprint
6
7
  import re
7
8
  import shlex
9
+ import string
8
10
  import textwrap
9
11
  import traceback
10
12
  from collections import namedtuple
13
+ from dataclasses import dataclass
11
14
  from io import StringIO
12
15
  from pathlib import Path
13
16
  from string import Template
14
- from typing import Any, Callable, Dict, List, Optional, Tuple, cast
17
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
15
18
 
16
19
  import click
17
20
  from mypy_extensions import KwArg, VarArg
18
21
 
19
22
  from tinybird.ch_utils.engine import ENABLED_ENGINES
20
23
  from tinybird.feedback_manager import FeedbackManager
21
- from tinybird.sql import parse_indexes_structure, parse_table_structure, schema_to_sql_columns
22
24
  from tinybird.tb.modules.datafile.exceptions import IncludeFileNotFoundException, ParseException, ValidationException
23
25
  from tinybird.tb.modules.exceptions import CLIPipeException
24
26
 
27
+ # Code from sql.py has been duplicated so I can change it without breaking absolutely everything in the app
28
+ # I'll try not to make logic changes, just error reporting changes
29
+ # from tinybird.sql import parse_indexes_structure, parse_table_structure, schema_to_sql_columns
30
+
31
+
32
class DatafileSyntaxError(Exception):
    """Base error for syntax problems found in a datafile, carrying the 1-based
    line/column where the problem was detected and an optional hint for the user."""

    def __init__(self, message: str, lineno: int, pos: int, hint: Optional[str] = None):
        super().__init__(message)
        self.message = message
        self.hint = hint
        self.lineno = lineno
        self.pos = pos

    def __str__(self) -> str:
        # Render as "<message> at <line>:<pos>." plus the hint sentence, if any.
        located = f"{self.message} at {self.lineno}:{self.pos}."
        if self.hint:
            return f"{located} {self.hint}."
        return located
43
+
44
+
45
class SchemaSyntaxError(DatafileSyntaxError):
    """Syntax error raised while parsing the SCHEMA section of a datafile.

    The previous explicit ``__init__`` only forwarded its arguments to
    ``DatafileSyntaxError`` with an identical signature, so it was redundant;
    the inherited constructor behaves identically.
    """
48
+
49
+
50
class IndexesSyntaxError(DatafileSyntaxError):
    """Syntax error raised while parsing the INDEXES section of a datafile.

    The previous explicit ``__init__`` only forwarded its arguments to
    ``DatafileSyntaxError`` with an identical signature, so it was redundant;
    the inherited constructor behaves identically.
    """
53
+
54
+
55
# NOTE(review): no raise sites are visible in this chunk; presumably used by the
# column-parsing helpers for structurally invalid columns — confirm against callers.
class MalformedColumnError(Exception):
    pass
57
+
25
58
 
26
59
  class PipeTypes:
27
60
  MATERIALIZED = "materialized"
@@ -89,11 +122,13 @@ TB_LOCAL_WORKSPACE_NAME = "Tinybird_Local_Testing"
89
122
 
90
123
  pp = pprint.PrettyPrinter()
91
124
 
125
# Characters accepted in a bare (un-backticked) column name by parse_name().
valid_chars_name: str = string.ascii_letters + string.digits + "._`*<>+-'"
# Characters accepted inside expressions (types, DEFAULT/CODEC bodies) by parse_expr():
# the name characters plus brackets, operators, separators and whitespace.
valid_chars_fn: str = valid_chars_name + "[](),=!?:/ \n\t\r"
127
+
92
128
 
93
129
  class Datafile:
94
130
  def __init__(self) -> None:
95
131
  self.maintainer: Optional[str] = None
96
- self.sources: List[str] = []
97
132
  self.nodes: List[Dict[str, Any]] = []
98
133
  self.tokens: List[Dict[str, Any]] = []
99
134
  self.version: Optional[int] = None
@@ -104,15 +139,6 @@ class Datafile:
104
139
  self.warnings: List[str] = []
105
140
  self.filtering_tags: Optional[List[str]] = None
106
141
 
107
- def validate(self) -> None:
108
- for x in self.nodes:
109
- if not x["name"].strip():
110
- raise ValidationException("invalid node name, can't be empty")
111
- if "sql" not in x:
112
- raise ValidationException("node %s must have a SQL query" % x["name"])
113
- if self.version is not None and (not isinstance(self.version, int) or self.version < 0):
114
- raise ValidationException("version must be a positive integer")
115
-
116
142
  def is_equal(self, other):
117
143
  if len(self.nodes) != len(other.nodes):
118
144
  return False
@@ -166,6 +192,848 @@ def parse_tags(tags: str) -> Tuple[str, List[str]]:
166
192
  return all_kv_tags, filtering_tags
167
193
 
168
194
 
195
@dataclass
class TableIndex:
    """Defines a CH table INDEX.

    Attributes:
        name: index identifier.
        expr: the indexed expression.
        type_full: index type including its parameters, e.g. ``set(100)``.
        granularity: optional GRANULARITY value, kept as a string.
    """

    name: str
    expr: str
    type_full: str
    granularity: Optional[str] = None

    def to_datafile(self) -> str:
        """Render the index as written in a datafile INDEXES section.

        Fix: the previous f-string always appended a space before the (possibly
        empty) granularity clause, leaving a trailing space when ``granularity``
        is None. Joining only the present parts avoids that.
        """
        parts = [self.name, self.expr, "TYPE", self.type_full]
        if self.granularity:
            parts.append(f"GRANULARITY {self.granularity}")
        return " ".join(parts)

    def to_sql(self) -> str:
        """Full SQL index definition (INDEX keyword included)."""
        return f"INDEX {self.to_datafile()}"

    def add_index_sql(self) -> str:
        """ALTER TABLE clause to add this index."""
        return f"ADD {self.to_sql()}"

    def drop_index_sql(self) -> str:
        """ALTER TABLE clause to drop this index (idempotent via IF EXISTS)."""
        return f"DROP INDEX IF EXISTS {self.name}"

    def materialize_index_sql(self) -> str:
        """ALTER TABLE clause to materialize this index for existing data."""
        return f"MATERIALIZE INDEX IF EXISTS {self.name}"

    def clear_index_sql(self) -> str:
        """ALTER TABLE clause to clear this index's files without dropping it."""
        return f"CLEAR INDEX IF EXISTS {self.name}"
222
+
223
+
224
def parse_indexes_structure(indexes: Optional[List[str]]) -> List[TableIndex]:
    """Parse the lines of an INDEXES section into TableIndex objects.

    One index per line; an optional leading ``INDEX`` keyword and a trailing
    comma are tolerated. Raises IndexesSyntaxError with 1-based line/column
    information on invalid input.

    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["INDEX index_name a TYPE set(100) GRANULARITY 100", " INDEX index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name type TYPE set(100) GRANULARITY 100", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='type', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100,", "index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity='100'), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity='16')]
    >>> parse_indexes_structure(["index_name a TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter(0.001)"])
    [TableIndex(name='index_name', expr='a', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter(0.001)', granularity=None)]
    >>> parse_indexes_structure(["index_name u64 * length(s) TYPE set(100)", "index_name_bf mapValues(d) TYPE bloom_filter"])
    [TableIndex(name='index_name', expr='u64 * length(s)', type_full='set(100)', granularity=None), TableIndex(name='index_name_bf', expr='mapValues(d)', type_full='bloom_filter', granularity=None)]
    >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4,1024,1,42) GRANULARITY 1"])
    [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4,1024,1,42)', granularity='1')]
    >>> parse_indexes_structure(["index_name path TYPE ngrambf_v1(4, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='index_name', expr='path', type_full='ngrambf_v1(4, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["index_name u64 * length(s)"])
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.IndexesSyntaxError: Invalid INDEX syntax at 1:1. Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`.
    >>> parse_indexes_structure(["index_name a TYPE set(100) GRANULARITY 100, index_name_bf mapValues(d) TYPE bloom_filter(0.001) GRANULARITY 16"])
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.IndexesSyntaxError: Invalid INDEX syntax at 1:1. Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`.
    >>> parse_indexes_structure(["", "   ", "     wrong_index_syntax,"])
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.IndexesSyntaxError: Invalid INDEX syntax at 3:6. Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`.
    >>> parse_indexes_structure(["my_index m['key'] TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='my_index', expr="m['key']", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["my_index_lambda arrayMap(x -> tupleElement(x,'message'), column_name) TYPE ngrambf_v1(1, 1024, 1, 42) GRANULARITY 1"])
    [TableIndex(name='my_index_lambda', expr="arrayMap(x -> tupleElement(x,'message'), column_name)", type_full='ngrambf_v1(1, 1024, 1, 42)', granularity='1')]
    >>> parse_indexes_structure(["ip_range_minmax_idx (toIPv6(ip_range_start), toIPv6(ip_range_end)) TYPE minmax GRANULARITY 1"])
    [TableIndex(name='ip_range_minmax_idx', expr='(toIPv6(ip_range_start), toIPv6(ip_range_end))', type_full='minmax', granularity='1')]
    """
    parsed_indices: List[TableIndex] = []
    if not indexes:
        return parsed_indices

    # TODO(eclbg): It might not be obvious that we only allow one index per line.
    for i, index in enumerate(indexes):
        lineno = i + 1
        if not index.strip():
            continue
        leading_whitespaces = len(index) - len(index.lstrip())
        index = index.strip().rstrip(",")
        # Drop an optional leading INDEX keyword.
        # Fix: the previous `index.lstrip("INDEX")` stripped the *character set*
        # {I, N, D, E, X}, mangling index names starting with any of those letters
        # (e.g. "IDX_a" became "_a"). A word-boundary regex removes only the keyword.
        index = re.sub(r"^INDEX\b\s*", "", index)
        if index.count("TYPE") != 1:
            raise IndexesSyntaxError(
                message="Invalid INDEX syntax",
                hint="Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`",
                lineno=lineno,
                pos=leading_whitespaces + 1,
            )

        match = re.match(
            r"(\w+)\s+([\w\s*\[\]\*\(\),\'\"-><.]+)\s+TYPE\s+(\w+)(?:\(([\w\s*.,]+)\))?(?:\s+GRANULARITY\s+(\d+))?",
            index,
        )
        if match:
            index_name, a, index_type, value, granularity = match.groups()
            index_expr = f"{index_type}({value})" if value else index_type
            parsed_indices.append(TableIndex(index_name, a.strip(), f"{index_expr}", granularity))
        else:
            raise IndexesSyntaxError(
                message="Invalid INDEX syntax",
                hint="Usage: `[INDEX] name expr TYPE type_full GRANULARITY granularity`",
                # Fix: this branch previously hardcoded lineno=1, pointing every
                # error at the first line regardless of where it occurred.
                lineno=lineno,
                pos=leading_whitespaces + 1,
            )
    return parsed_indices
297
+
298
+
299
def clean_comments_rstrip_keep_empty_lines(schema_to_clean: Optional[str]) -> Optional[str]:
    """Remove the comments from the schema.

    If the comments are between backticks (inside a `json:...` path), they will not
    be removed. Lines that become empty after removing comments are kept (as empty
    lines) so line numbers are preserved; lines are only rstripped of whitespace.

    Fix: the return annotation previously claimed ``Tuple[Optional[str], bool]``,
    but the function returns a plain ``Optional[str]`` (see every return statement
    and the doctests below).

    >>> clean_comments_rstrip_keep_empty_lines(None) is None
    True
    >>> clean_comments_rstrip_keep_empty_lines('')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('   ')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('\\n')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('\\n\\n\\n\\n')
    ''
    >>> clean_comments_rstrip_keep_empty_lines('c Float32')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\t-- this is a comment\\t\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment\\r\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a comment\\n--this is a comment2\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32\\n--this is a ```comment\\n')
    'c Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32, -- comment\\nd Float32 -- comment2')
    'c Float32,\\nd Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32, -- comment\\n -- comment \\nd Float32 -- comment2')
    'c Float32,\\n\\nd Float32'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32 `json:$.aa--aa`\\n--this is a ```comment\\n')
    'c Float32 `json:$.aa--aa`'
    >>> clean_comments_rstrip_keep_empty_lines('c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`\\n--this is a ```comment\\n')
    'c Float32 `json:$.cc--cc`\\nd Float32 `json:$.dd--dd`'
    >>> clean_comments_rstrip_keep_empty_lines('c--c Float32 `json:$.cc--cc`\\n')
    'c'
    >>> clean_comments_rstrip_keep_empty_lines('`c--c` Float32 `json:$.cc--cc`\\n')
    '`c'
    """

    def clean_line_comments(line: str) -> str:
        # Scan for a "--" that is not inside a `json:...` backticked path.
        if not line:
            return line
        i = 0
        inside_json_path = False
        while i < len(line):
            if i + 1 < len(line) and line[i] == "-" and line[i + 1] == "-" and not inside_json_path:
                return line[:i].rstrip()

            if not inside_json_path and line[i:].startswith("`json:"):
                inside_json_path = True
            elif inside_json_path and line[i] == "`":
                inside_json_path = False
            i += 1
        return line

    if schema_to_clean is None:
        return schema_to_clean

    cleaned_schema = ""
    for line in schema_to_clean.splitlines():
        cleaned_line = clean_line_comments(line)
        cleaned_schema += cleaned_line + "\n"
    return cleaned_schema.rstrip()
369
+
370
+
371
# Token descriptor: a human-readable name plus the regex that detects the token
# at the current lookahead position.
SyntaxExpr = namedtuple("SyntaxExpr", ["name", "regex"])

# Column-modifier keywords recognized while parsing a column definition. Each
# regex requires leading whitespace and a non-identifier character (or end of
# string) after the keyword, so e.g. a name like "DEFAULTS" does not match DEFAULT.
NULL = SyntaxExpr("NULL", re.compile(r"\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
NOTNULL = SyntaxExpr("NOTNULL", re.compile(r"\s+NOT\s+NULL([^a-z0-9_]|$)", re.IGNORECASE))
DEFAULT = SyntaxExpr("DEFAULT", re.compile(r"\s+DEFAULT([^a-z0-9_]|$)", re.IGNORECASE))
MATERIALIZED = SyntaxExpr("MATERIALIZED", re.compile(r"\s+MATERIALIZED([^a-z0-9_]|$)", re.IGNORECASE))
ALIAS = SyntaxExpr("ALIAS", re.compile(r"\s+ALIAS([^a-z0-9_]|$)", re.IGNORECASE))
CODEC = SyntaxExpr("CODEC", re.compile(r"\s+CODEC([^a-z0-9_]|$)", re.IGNORECASE))
TTL = SyntaxExpr("TTL", re.compile(r"\s+TTL([^a-z0-9_]|$)", re.IGNORECASE))
# A jsonpath annotation such as `json:$.field` (opening backtick included).
JSONPATH = SyntaxExpr("JSONPATH", re.compile(r"\s+`json:", re.IGNORECASE))
COMMA = SyntaxExpr("COMMA", re.compile(r",", re.IGNORECASE))
NEW_LINE = SyntaxExpr("NEW_LINE", re.compile(r"\s$"))
TYPE = SyntaxExpr("TYPE", re.compile(r""))  # TYPE doesn't have a fixed initial string

# Matches a schema consisting only of whitespace (i.e. effectively empty).
REGEX_WHITESPACE = re.compile(r"\s*")
# Matches a "--" comment together with its line terminator.
REGEX_COMMENT = re.compile(r"\-\-[^\n\r]*[\n\r]")
387
+
388
+
389
def mark_error_string(s: str, i: int, line: int = 1) -> str:
    """Return line number *line* (1-based) of *s* with a '^---' marker placed
    under 0-based column *i* on the following line.

    >>> mark_error_string('0123456789', 0)
    '0123456789\\n^---'
    >>> mark_error_string('0123456789', 9)
    '0123456789\\n         ^---'
    >>> mark_error_string('01234\\n56789', 1)
    '01234\\n ^---'
    """
    target = s.splitlines()[line - 1] if s else ""
    return "{}\n{}^---".format(target, " " * i)
403
+
404
+
405
def format_parse_error(
    table_structure: str,
    i: int,
    position: int,
    hint: Optional[str] = None,
    line: int = 0,
    keyword: Optional[str] = None,
) -> str:
    """Build a human-readable parse-error message: optional hint, the offending
    line with a '^---' marker (via mark_error_string), and a "found ... at
    position N" trailer.

    NOTE(review): when `keyword` is given, len(keyword) is subtracted twice —
    once into `adjusted_position` and again in the trailer — presumably to point
    at the keyword's start, but this looks suspicious; confirm against callers.
    NOTE(review): the default line=0 makes mark_error_string index splitlines()[-1]
    (the last line); callers in this file always pass line= explicitly — confirm
    the default is intentional.
    """
    adjusted_position = position - (len(keyword) if keyword else 0)
    message = f"{hint}\n" if hint else ""
    message += mark_error_string(table_structure, adjusted_position - 1, line=line)

    if keyword:
        message += f" found at position {adjusted_position - len(keyword)}"
    else:
        # Show the offending character, or 'EOF' when i is past the end of the input.
        message += (
            f" found {repr(table_structure[i]) if len(table_structure)>i else 'EOF'} at position {adjusted_position}"
        )
    return message
424
+
425
+
426
def clean_line_comments(line: str) -> str:
    """Strip a trailing '--' comment from *line*, ignoring any '--' that appears
    inside a `json:...` backticked path; the kept part is stripped of whitespace."""
    if not line:
        return line
    in_json_path = False
    for idx in range(len(line)):
        if not in_json_path and line.startswith("--", idx):
            return line[:idx].strip()
        if not in_json_path and line.startswith("`json:", idx):
            in_json_path = True
        elif in_json_path and line[idx] == "`":
            in_json_path = False
    return line
441
+
442
+
443
def _parse_table_structure(schema: str) -> List[Dict[str, Any]]:
    """Parse a SCHEMA section into a list of column dicts.

    Each returned dict has keys: name, type, codec, default_value, jsonpath,
    plus nullable/normalized_name added during the final normalization pass.
    Raises SchemaSyntaxError (with 1-based line/pos info) on invalid input, and
    ValueError for a few legacy error paths (see TODOs below).
    """
    # CH syntax from https://clickhouse.com/docs/en/sql-reference/statements/create/table/
    # name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1]
    try:
        # This removes lines that are empty after removing comments, which might make it hard to locate errors properly.
        # The parsing code afterwards seems to be mostly robust to empty lines.
        # Perhaps I'll deliberately not support reporting errors correctly when empty lines have been removed to start
        # with, and later I can see how to support it.
        # It also removes the indentation of the lines, which might make it hard to locate errors properly.
        # schema = clean_comments(schema + "\n")

        # I've swapped the above with this. A first test didn't show any side effects in parsing a schema, and it should
        # allow us to keep track of the line numbers in the error messages.
        schema = clean_comments_rstrip_keep_empty_lines(schema + "\n")
    except Exception:
        # logging.exception(f"Error cleaning comments: {e}")
        schema = REGEX_COMMENT.sub(" ", schema + "\n").strip()

    # Whitespace-only schema parses to an empty column list.
    if REGEX_WHITESPACE.fullmatch(schema):
        return []

    # Cursor into `schema`, shared with the nested helpers via closure.
    i: int = 0

    # For error feedback only
    line: int = 1
    pos: int = 1

    # Find the first SyntaxExpr in lookup that matches the schema at the current offset
    def lookahead_matches(lookup: Iterable) -> Optional[SyntaxExpr]:
        s = schema[i:]
        match = next((x for x in lookup if x.regex.match(s)), None)
        return match

    # Advance the cursor one char, keeping the line/pos error-reporting counters in sync.
    def advance_single_char() -> None:
        nonlocal i, line, pos
        if schema[i] == "\n":
            line += 1
            pos = 1
        else:
            pos += 1
        i += 1

    # Advance all whitespaces characters and then len(s) more chars
    def advance(s: str) -> None:
        # NOTE(review): if whitespace runs to the very end of `schema`, the inner
        # while can index past the end — presumably inputs always end with a
        # non-space char after cleaning; confirm.
        if i < len(schema):
            while schema[i] in " \t\r\n":
                advance_single_char()
            for _ in s:
                advance_single_char()

    # Consume up to and including a closing backtick, returning the enclosed text
    # (the opening backtick must already have been consumed).
    def get_backticked() -> str:
        begin = i
        while i < len(schema):
            c = schema[i]
            advance_single_char()
            if c == "`":
                return schema[begin : i - 1]
            if c in " \t\r\n":
                raise SchemaSyntaxError(message="Expected closing backtick", lineno=line, pos=pos - 1)
        raise SchemaSyntaxError(message="Expected closing backtick", lineno=line, pos=pos)

    # Parse a column name: either a bare identifier limited to valid_chars_name,
    # or an arbitrary backticked name.
    def parse_name() -> str:
        nonlocal i, line, pos
        if schema[i] != "`":
            # regular name
            begin = i
            while i < len(schema):
                c = schema[i]
                if c in " \t\r\n":
                    return schema[begin:i]
                if c not in valid_chars_name:
                    raise SchemaSyntaxError(
                        message=f"Column name contains invalid character {repr(c)}",
                        hint="Tip: use backticks",
                        lineno=line,
                        pos=i + 1,
                    )
                advance_single_char()
            return schema[begin:i]
        else:
            # backticked name
            advance_single_char()
            return get_backticked()

    # Parse an expression (type, DEFAULT body, CODEC body...) until one of the
    # `lookup` tokens appears outside of any quote/parenthesis context.
    def parse_expr(lookup: Iterable[SyntaxExpr]) -> str:
        nonlocal i, line, pos

        begin: int = i
        # Stack of open contexts: None (top level), "'", '"', or "(".
        context_stack: List[Optional[str]] = [None]
        while i < len(schema):
            context = context_stack[-1]
            c = schema[i]

            if (context == "'" and c == "'") or (context == '"' and c == '"') or (context == "(" and c == ")"):
                context_stack.pop()
            elif c == "'" and (context is None or context == "("):
                context_stack.append("'")
            elif c == '"' and (context is None or context == "("):
                context_stack.append('"')
            elif c == "(" and (context is None or context == "("):
                context_stack.append("(")
            elif context is None and lookahead_matches(lookup):
                return schema[begin:i].strip(" \t\r\n")
            elif (context is None and c not in valid_chars_fn) or (context == "(" and c not in valid_chars_fn):
                raise SchemaSyntaxError(message=f"Invalid character {repr(c)}", lineno=line, pos=pos)
            advance_single_char()
        if i == begin:
            # TODO(eclbg): Turn this into a SchemaSyntaxError. I don't know when it happens
            raise ValueError(format_parse_error(schema, i, pos, "wrong value", line=line))
        return schema[begin:].strip(" \t\r\n")

    columns: List[Dict[str, Any]] = []

    # Accumulators for the column currently being parsed; flushed by add_column().
    name: str = ""
    _type: str = ""
    default: str = ""
    materialized: str = ""
    codec: str = ""
    jsonpath: str = ""
    last: Optional[SyntaxExpr] = None
    col_start: Tuple[int, int] = (0, 0)  # (0, 0) means not set. It's not a valid line/pos as they start at 1
    col_end: Tuple[int, int] = (0, 0)  # (0, 0) means not set. It's not a valid line/pos as they start at 1

    # Flush the accumulated column parts into `columns` and reset the accumulators.
    # `found` names the token that terminated the column (for error messages).
    def add_column(found: str) -> None:
        nonlocal name, _type, default, materialized, codec, jsonpath, col_start, col_end
        if not name:
            # TODO(eclbg): get rid of this ValueError and replace it with a custom one so it can be handled by the
            # caller
            raise ValueError(
                format_parse_error(schema, i, pos, f"Syntax error: expecting NAME, found {found}", line=line)
            )
        default = "" if not default else f"DEFAULT {default}"
        materialized = "" if not materialized else f"MATERIALIZED {materialized}"
        codec = "" if not codec else f"CODEC{codec}"
        # TODO(eclbg): We should validate the column as a whole. Name is mandatory, and one of type, default_value or
        # materialized (I think).
        columns.append(
            {
                "name": name,
                "type": _type,
                "codec": codec,
                "default_value": default or materialized,
                "jsonpath": jsonpath,
                # "col_start": col_start,
                # "col_end": col_end,
            }
        )
        name = ""
        _type = ""
        default = ""
        materialized = ""
        codec = ""
        jsonpath = ""

    # Main token loop: parse a name, then dispatch on the next recognized token.
    valid_next: List[SyntaxExpr] = [TYPE]
    while i < len(schema):
        if not name:
            advance("")
            valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, TYPE]
            col_start = (line, pos)
            name = parse_name()
            if name == "INDEX":
                raise SchemaSyntaxError(
                    message="Forbidden INDEX definition",
                    hint="Indexes are not allowed in SCHEMA section. Use the INDEXES section instead",
                    lineno=line,
                    pos=pos - len(name),  # We've already advanced the name
                )
            continue
        found = lookahead_matches(
            [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE, TYPE]
        )
        if found and found not in valid_next:
            after = f" after {last.name}" if last else ""
            raise SchemaSyntaxError(message=f"Unexpected {found.name}{after}", lineno=line, pos=pos)
        if found == TYPE:
            advance("")
            valid_next = [NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA, NEW_LINE]
            type_start_pos = pos  # Save the position of the type start to use it in the error message
            detected_type = parse_expr([NULL, NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
            try:
                # Imported in the body to be compatible with the CLI
                from chtoolset.query import check_compatible_types

                # Check compatibility of the type with itself to verify it's a known type
                check_compatible_types(detected_type, detected_type)
            except ValueError as e:
                if (
                    "unknown data type family" in str(e).lower()
                    or "incompatible data types between aggregate function" in str(e).lower()
                ):
                    raise SchemaSyntaxError(message=str(e), lineno=line, pos=type_start_pos)
                else:
                    raise e
            except ModuleNotFoundError:
                # chtoolset is optional in the CLI environment: skip type validation.
                pass
            _type = detected_type
        elif found == NULL:
            # Not implemented
            advance("")  # We need to advance to get the correct position
            raise SchemaSyntaxError(
                message="NULL column syntax not supported",
                hint="Hint: use Nullable(...)",
                lineno=line,
                pos=pos,
            )
        elif found == NOTNULL:
            advance("")  # We need to advance to get the correct position
            raise SchemaSyntaxError(
                message="NOT NULL column syntax not supported",
                hint="Hint: Columns are not nullable by default",
                lineno=line,
                pos=pos,
            )
        elif found == DEFAULT:
            advance("DEFAULT")
            valid_next = [
                CODEC,
                TTL,
                COMMA,
                # The three matches below are never valid. We're adding them here to avoid just complaining about their placement.
                MATERIALIZED,
                NULL,
                NOTNULL,
            ]
            default = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
        elif found == MATERIALIZED:
            advance("")
            raise SchemaSyntaxError(
                message="MATERIALIZED columns are not supported",
                lineno=line,
                pos=pos,
            )
        elif found == ALIAS:
            # Not implemented
            advance("")  # We need to advance to get the correct position
            raise SchemaSyntaxError(
                message="ALIAS columns are not supported",
                lineno=line,
                pos=pos,
            )
        elif found == CODEC:
            advance("CODEC")
            valid_next = [
                TTL,
                COMMA,
                JSONPATH,
                # The three matches below are never valid. We're adding them here to avoid just complaining about their placement.
                MATERIALIZED,
                NULL,
                NOTNULL,
            ]
            codec = parse_expr([NOTNULL, DEFAULT, MATERIALIZED, ALIAS, CODEC, TTL, JSONPATH, COMMA])
        elif found == TTL:
            advance("")  # We need to advance to get the correct position
            # Not implemented
            advance("")
            raise SchemaSyntaxError(
                message="column TTL is not supported",
                lineno=line,
                pos=pos,
            )
        elif found == JSONPATH:
            advance("`json:")
            jsonpath = get_backticked()
        elif found == COMMA:
            # NOTE(review): this INDEX sub-branch looks unreachable — parse_name()
            # raises on name == "INDEX" before any COMMA can be seen; confirm.
            if name == "INDEX":
                advance(",")
                continue
            advance(",")
            valid_next = []
            col_end = (line, pos)
            add_column("COMMA")
        elif found == NEW_LINE or (name == "INDEX" and not found):
            # NOTE(review): bypasses advance_single_char(), so the line/pos
            # counters are not updated here; positions reported after this point
            # may drift — confirm whether this is intended.
            i += 1
        else:
            raise ValueError(
                format_parse_error(
                    schema,
                    i,
                    pos,
                    "wrong value, DEFAULT, MATERIALIZED, CODEC, TTL expressions, a column data type, a comma, a new line or a jsonpath",
                    line=line,
                )
            )
        last = found
        col_end = (line, i + 1)
    # Only add the last column if we've parsed something. This allows for a trailing comma after the last column.
    if name:
        add_column("EOF")

    # normalize columns
    for column in columns:
        nullable = column["type"].lower().startswith("nullable")
        column["type"] = column["type"] if not nullable else column["type"][len("Nullable(") : -1]  # ')'
        column["nullable"] = nullable
        column["codec"] = column["codec"] if column["codec"] else None
        column["name"] = column["name"]
        column["normalized_name"] = column["name"]
        column["jsonpath"] = column["jsonpath"] if column["jsonpath"] else None
        default_value = column["default_value"] if column["default_value"] else None
        # A "DEFAULT NULL" on a Nullable column is the implicit default: drop it.
        if nullable and default_value and default_value.lower() == "default null":
            default_value = None
        column["default_value"] = default_value
    return columns
748
+
749
+
750
def try_to_fix_nullable_in_simple_aggregating_function(t: str) -> Optional[str]:
    """Rewrite a SimpleAggregateFunction type so the Nullable wraps the inner type.

    Workaround for https://github.com/ClickHouse/ClickHouse/issues/34407: for
    nullable columns ClickHouse returns Nullable(SimpleAggregateFunction(sum, Int32))
    instead of SimpleAggregateFunction(sum, Nullable(Int32)) as it does for other
    aggregate functions; without this fix the aggregation could return incorrect
    results. Returns None when *t* does not need rewriting.
    """
    match = re.search(r"SimpleAggregateFunction\((\w+),\s*(?!(?:Nullable))([\w,.()]+)\)", t)
    if match is None:
        return None
    fn, inner_type = match.group(1), match.group(2)
    return f"SimpleAggregateFunction({fn}, Nullable({inner_type}))"
762
+
763
+
764
def col_name(name: str, backquotes: bool = True) -> str:
    """Normalize a column name to be backquoted (or not) regardless of how it came in.

    >>> col_name('`test`', True)
    '`test`'
    >>> col_name('`test`', False)
    'test'
    >>> col_name('test', True)
    '`test`'
    >>> col_name('test', False)
    'test'
    >>> col_name('', True)
    ''
    >>> col_name('', False)
    ''
    """
    if not name:
        return name
    already_quoted = name[0] == "`" and name[-1] == "`"
    if already_quoted:
        return name if backquotes else name[1:-1]
    if backquotes:
        return f"`{name}`"
    return name
784
+
785
+
786
def schema_to_sql_columns(schema: List[Dict[str, Any]]) -> List[str]:
    """return an array with each column in SQL
    >>> schema_to_sql_columns([{'name': 'temperature', 'type': 'Float32', 'codec': None, 'default_value': None, 'nullable': False, 'normalized_name': 'temperature'}, {'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature` Float32', '`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4))']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': '', 'default_value': 'MATERIALIZED temperature', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature_delta` Float32 MATERIALIZED temperature']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'codec': 'CODEC(Delta(4), LZ4))', 'default_value': '', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature_delta` Float32 CODEC(Delta(4), LZ4))']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta'}])
    ['`temperature_delta` Float32']
    >>> schema_to_sql_columns([{'name': 'temperature_delta', 'type': 'Float32', 'nullable': False, 'normalized_name': 'temperature_delta', 'jsonpath': '$.temperature_delta'}])
    ['`temperature_delta` Float32 `json:$.temperature_delta`']
    >>> schema_to_sql_columns([{'name': 'aggregation', 'type': 'SimpleAggregateFunction(sum, Int32)', 'nullable': True, 'normalized_name': 'aggregation', 'jsonpath': '$.aggregation'}])
    ['`aggregation` SimpleAggregateFunction(sum, Nullable(Int32)) `json:$.aggregation`']
    """
    rendered: List[str] = []
    for column in schema:
        name = column["normalized_name"] if "normalized_name" in column else column["name"]
        # Nullable columns: try the SimpleAggregateFunction workaround first,
        # otherwise wrap the type in Nullable(...).
        if column["nullable"]:
            fixed = try_to_fix_nullable_in_simple_aggregating_function(column["type"])
            column_type = "Nullable(%s)" % column["type"] if fixed is None else fixed
        else:
            column_type = column["type"]
        parts = [col_name(name, backquotes=True), column_type]
        jsonpath = column.get("jsonpath", None)
        if jsonpath:
            parts.append(f"`json:{jsonpath}`")
        # default_value / codec are optional and may be missing, empty or None.
        for optional_key in ("default_value", "codec"):
            value = column.get(optional_key)
            if value not in ("", None):
                parts.append(value)
        rendered.append(" ".join(part for part in parts if part).strip())
    return rendered
819
+
820
+
821
def parse_table_structure(schema: str) -> List[Dict[str, Any]]:
    """This parses the SQL schema for a CREATE TABLE
    Columns follow the syntax: name1 [type1] [DEFAULT expr1] [CODEC compression_codec] [TTL expr1] [JSONPATH `json:jsonpath`] [,]

    The ClickHouse reference is followed pretty loosely at this point.
    Reference: https://clickhouse.tech/docs/en/sql-reference/statements/create/table/#syntax-forms

    Returns a list of column dicts with the keys: ``name``, ``type``, ``codec``,
    ``default_value``, ``jsonpath``, ``nullable`` and ``normalized_name``.

    Raises ``SchemaSyntaxError`` (with line/position information) when the
    schema cannot be parsed, as shown in the doctests below.

    >>> parse_table_structure('potato') # doctest: +SKIP
    Traceback (most recent call last):
    ...
    tinybird.sql.MalformedColumnError: Column name and either type or default_value are required

    >>> parse_table_structure(' potato Int32')
    [{'name': 'potato', 'type': 'Int32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'potato'}]

    >>> parse_table_structure('`c Int32')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Expected closing backtick at 1:3.

    >>> parse_table_structure('c Float32, b String')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Float32,--comment\\nb String')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Float32,--comment\\nb String --another-comment')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Float32 --first-comment\\n,--comment\\nb String --another-comment')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('--random comment here\\nc Float32 --another comment\\n,--another one\\nb String --this is the last one')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('--extra comment\\nc--extra comment\\nFloat32--extra comment\\n,--extra comment\\nb--extra comment\\nString--extra comment')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}, {'name': 'b', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'b'}]

    >>> parse_table_structure('c Nullable(Float32)')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]

    >>> parse_table_structure('c Nullable(Float32) DEFAULT NULL')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]

    >>> parse_table_structure("c String DEFAULT 'bla'")
    [{'name': 'c', 'type': 'String', 'codec': None, 'default_value': "DEFAULT 'bla'", 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]

    >>> parse_table_structure('`foo.bar` UInt64')
    [{'name': 'foo.bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo.bar'}]

    >>> parse_table_structure('double_value Float64 CODEC(LZ4HC(2))')
    [{'name': 'double_value', 'type': 'Float64', 'codec': 'CODEC(LZ4HC(2))', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'double_value'}]

    >>> parse_table_structure('doubl/e_value Float64 CODEC(LZ4HC(2))')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Column name contains invalid character '/' at 1:6. Tip: use backticks.

    >>> parse_table_structure('`c` Nullable(Float32)')
    [{'name': 'c', 'type': 'Float32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': True, 'normalized_name': 'c'}]

    >>> parse_table_structure('wadus INT UNSIGNED')
    [{'name': 'wadus', 'type': 'INT UNSIGNED', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'wadus'}]

    >>> parse_table_structure('c Int32 CODEC(Delta, LZ4)\\n')
    [{'name': 'c', 'type': 'Int32', 'codec': 'CODEC(Delta, LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'c'}]

    >>> parse_table_structure('c SimpleAggregateFunction(sum, Int32),\\np SimpleAggregateFunction(sum, Int32)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Incompatible data types between aggregate function 'sum' which returns Int64 and column storage type Int32 at 1:4.

    >>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized b*2\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:27.

    >>> parse_table_structure('c Int32 CODEC(Delta, LZ4) Materialized ifNull(b*2, 0)\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:27.

    >>> parse_table_structure('c Int32 Materialized b*2\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.

    >>> parse_table_structure('c Int32 Materialized b != 1 ? b*2: pow(b, 3)\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.

    >>> parse_table_structure('')
    []

    >>> parse_table_structure('`date` Date,`timezone` String,`offset` Int32')
    [{'name': 'date', 'type': 'Date', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'date'}, {'name': 'timezone', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'timezone'}, {'name': 'offset', 'type': 'Int32', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'offset'}]

    >>> parse_table_structure('c Int32 Materialized b*2 CODEC(Delta, LZ4)\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.

    >>> parse_table_structure('c Int32 Materialized ifNull(b*2, 0) CODEC(Delta, LZ4)\\n')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:9.

    >>> parse_table_structure('`temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:29.

    >>> parse_table_structure('foo^bar Float32')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Column name contains invalid character '^' at 1:4. Tip: use backticks.

    >>> parse_table_structure('foo Float#32')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Invalid character '#' at 1:10.

    >>> parse_table_structure('foo Float32 DEFAULT 13, bar UInt64')
    [{'name': 'foo', 'type': 'Float32', 'codec': None, 'default_value': 'DEFAULT 13', 'jsonpath': None, 'nullable': False, 'normalized_name': 'foo'}, {'name': 'bar', 'type': 'UInt64', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'bar'}]

    >>> parse_table_structure('foo Float32 DEFAULT 1$$$3')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Invalid character '$' at 1:22.

    >>> parse_table_structure('foo Float32 CODEC(Delta(4), LZ#4)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Invalid character '#' at 1:31.

    >>> parse_table_structure('\\n    `temperature` Float32,\\n    `temperature_delta` Float32 MATERIALIZED temperature CODEC(Delta(4), LZ4)\\n    ')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 3:33.

    >>> parse_table_structure('temperature Float32, temperature_delta Float32 MATERIALIZED temperature Codec(Delta(4)), temperature_doubledelta Float32 MATERIALIZED temperature Codec(DoubleDelta), temperature_doubledelta_lz4 Float32 MATERIALIZED temperature Codec(DoubleDelta, LZ4)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:48.

    >>> parse_table_structure('t UInt8 CODEC(Delta(1), LZ4)')
    [{'name': 't', 'type': 'UInt8', 'codec': 'CODEC(Delta(1), LZ4)', 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 't'}]

    >>> parse_table_structure('tt UInt8 MATERIALIZED t')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:11.

    >>> parse_table_structure('tt UInt8 MATERIALIZED t CODEC(Delta(1), LZ4)')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:11.

    >>> parse_table_structure('tt SimpleAggregateFunction(any, Nullable(UInt8))')
    [{'name': 'tt', 'type': 'SimpleAggregateFunction(any, Nullable(UInt8))', 'codec': None, 'default_value': None, 'jsonpath': None, 'nullable': False, 'normalized_name': 'tt'}]

    >>> parse_table_structure("timestamp DateTime MATERIALIZED toDateTime(JSONExtractInt(JSONExtractRaw(record, 'payload'), 'timestamp') / 1000)")
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:20.

    >>> parse_table_structure("`test_default_cast` DEFAULT plus(13,1)")
    [{'name': 'test_default_cast', 'type': '', 'codec': None, 'default_value': 'DEFAULT plus(13,1)', 'jsonpath': None, 'nullable': False, 'normalized_name': 'test_default_cast'}]

    >>> parse_table_structure("hola Int, `materialized` String MATERIALIZED upper(no_nullable_string)")
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: MATERIALIZED columns are not supported at 1:33.

    >>> parse_table_structure('`a2` String `json:$.a2`, `a3` String `json:$.a3`\\n')
    [{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]

    >>> parse_table_structure("`arr` Array(String) DEFAULT ['-']")
    [{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT ['-']", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]

    >>> parse_table_structure("`arr` Array(String) DEFAULT array('-')")
    [{'name': 'arr', 'type': 'Array(String)', 'codec': None, 'default_value': "DEFAULT array('-')", 'jsonpath': None, 'nullable': False, 'normalized_name': 'arr'}]

    >>> parse_table_structure('`a2` Float32 CODEC(Delta, ZSTD(4)) `json:$.a2`, `a3` String `json:$.a3`\\n')
    [{'name': 'a2', 'type': 'Float32', 'codec': 'CODEC(Delta, ZSTD(4))', 'default_value': None, 'jsonpath': '$.a2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]

    >>> parse_table_structure('`a` String, INDEX index_name a TYPE set(100) GRANULARITY 100')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Forbidden INDEX definition at 1:13. Indexes are not allowed in SCHEMA section. Use the INDEXES section instead.

    >>> parse_table_structure('    `a` String,\\n    INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Forbidden INDEX definition at 2:5. Indexes are not allowed in SCHEMA section. Use the INDEXES section instead.

    >>> parse_table_structure('`index` String, INDEX index_name a TYPE set(100, 1) GRANULARITY 100')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Forbidden INDEX definition at 1:17. Indexes are not allowed in SCHEMA section. Use the INDEXES section instead.

    >>> parse_table_structure('`a2` String `json:$.a--2`, `a3` String `json:$.a3`\\n')
    [{'name': 'a2', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a--2', 'nullable': False, 'normalized_name': 'a2'}, {'name': 'a3', 'type': 'String', 'codec': None, 'default_value': None, 'jsonpath': '$.a3', 'nullable': False, 'normalized_name': 'a3'}]

    >>> parse_table_structure('a InvalidType')
    Traceback (most recent call last):
    ...
    tinybird.tb.modules.datafile.common.SchemaSyntaxError: Unknown data type family: InvalidType at 1:3.

    >>> parse_table_structure("a Int32 DEFAULT 'a'") # doctest: +SKIP
    # should fail as the type and default expr are incompatible
    """
    # Thin public wrapper: the doctests above pin the contract in one place,
    # while the actual parsing lives in _parse_table_structure.
    return _parse_table_structure(schema)
1035
+
1036
+
169
1037
  def parse(
170
1038
  s: str,
171
1039
  default_node: Optional[str] = None,
@@ -175,11 +1043,9 @@ def parse(
175
1043
  ) -> Datafile:
176
1044
  """
177
1045
  Parses `s` string into a document
178
- >>> d = parse("FROM SCRATCH\\nSOURCE 'https://example.com'\\n#this is a comment\\nMAINTAINER 'rambo' #this is me\\nNODE \\"test_01\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_00\\n\\n\\nNODE \\"test_02\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_01\\n WHERE a > 1\\n GROUP by a\\n")
1046
+ >>> d = parse("MAINTAINER 'rambo' #this is me\\nNODE \\"test_01\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_00\\n\\n\\nNODE \\"test_02\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_01\\n WHERE a > 1\\n GROUP by a\\n")
179
1047
  >>> d.maintainer
180
1048
  'rambo'
181
- >>> d.sources
182
- ['https://example.com']
183
1049
  >>> len(d.nodes)
184
1050
  2
185
1051
  >>> d.nodes[0]
@@ -192,12 +1058,43 @@ def parse(
192
1058
  doc = Datafile()
193
1059
  doc.raw = list(StringIO(s, newline=None))
194
1060
 
195
- parser_state = namedtuple("parser_state", ["multiline", "current_node", "command", "multiline_string", "is_sql"])
1061
+ parser_state = namedtuple(
1062
+ "parser_state", ["multiline", "current_node", "command", "multiline_string", "is_sql", "start_lineno"]
1063
+ )
196
1064
 
197
1065
  parser_state.multiline = False
198
1066
  parser_state.current_node = False
1067
+ parser_state.start_lineno = None
1068
+
1069
+ def multiline_not_supported(func: Callable[..., Any]) -> Callable[..., Any]:
1070
+ @functools.wraps(func)
1071
+ def error_if_multiline(*args: Any, **kwargs: Any) -> Any:
1072
+ if parser_state.multiline:
1073
+ parser_state.multiline = (
1074
+ False # So we don't offset the line number when processing the exception. A bit hacky
1075
+ )
1076
+ raise DatafileSyntaxError(
1077
+ f"{kwargs['cmd'].upper()} does not support multiline arguments",
1078
+ lineno=parser_state.start_lineno, # We want to report the line where the command starts
1079
+ pos=1,
1080
+ )
1081
+ return func(*args, **kwargs)
1082
+
1083
+ return error_if_multiline
1084
+
1085
+ def deprecated(func: Callable[..., Any]) -> Callable[..., Any]:
1086
+ @functools.wraps(func)
1087
+ def raise_deprecation_error(*args: Any, **kwargs: Any) -> Any:
1088
+ raise DatafileSyntaxError(
1089
+ f"{kwargs['cmd'].upper()} has been deprecated",
1090
+ lineno=kwargs["lineno"],
1091
+ pos=1,
1092
+ )
1093
+
1094
+ return raise_deprecation_error
199
1095
 
200
1096
  def assign(attr):
1097
+ @multiline_not_supported
201
1098
  def _fn(x, **kwargs):
202
1099
  setattr(doc, attr, _unquote(x))
203
1100
 
@@ -207,7 +1104,10 @@ def parse(
207
1104
  s = _unquote("".join(args))
208
1105
  try:
209
1106
  sh = parse_table_structure(s)
1107
+ except SchemaSyntaxError as e:
1108
+ raise e
210
1109
  except Exception as e:
1110
+ # TODO(eclbg): Does it make sense to keep this exception? I'd like to get rid of all ParseException
211
1111
  raise ParseException(FeedbackManager.error_parsing_schema(line=kwargs["lineno"], error=e))
212
1112
 
213
1113
  parser_state.current_node["schema"] = ",".join(schema_to_sql_columns(sh))
@@ -219,26 +1119,33 @@ def parse(
219
1119
  return
220
1120
  try:
221
1121
  indexes = parse_indexes_structure(s.splitlines())
1122
+ except IndexesSyntaxError as e:
1123
+ raise e
222
1124
  except Exception as e:
1125
+ # TODO(eclbg): We get here when an unidentified error happens but we still report a parsing error. We could rething this.
223
1126
  raise ParseException(FeedbackManager.error_parsing_indices(line=kwargs["lineno"], error=e))
224
1127
 
225
1128
  parser_state.current_node["indexes"] = indexes
226
1129
 
227
1130
  def assign_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
1131
+ @multiline_not_supported
228
1132
  def _f(*args: str, **kwargs: Any):
229
1133
  s = _unquote((" ".join(args)).strip())
230
1134
  parser_state.current_node[v.lower()] = eval_var(s, skip=skip_eval)
231
1135
 
232
1136
  return _f
233
1137
 
1138
+ @deprecated
234
1139
  def sources(x: str, **kwargs: Any) -> None:
235
- doc.sources.append(_unquote(x))
1140
+ pass # Deprecated
236
1141
 
1142
+ @multiline_not_supported
237
1143
  def node(*args: str, **kwargs: Any) -> None:
238
1144
  node = {"name": eval_var(_unquote(args[0]))}
239
1145
  doc.nodes.append(node)
240
1146
  parser_state.current_node = node
241
1147
 
1148
+ @multiline_not_supported
242
1149
  def scope(*args: str, **kwargs: Any) -> None:
243
1150
  scope = {"name": eval_var(_unquote(args[0]))}
244
1151
  doc.nodes.append(scope)
@@ -255,9 +1162,21 @@ def parse(
255
1162
  doc.description = description
256
1163
 
257
1164
  def sql(var_name: str, **kwargs: Any) -> Callable[[str, KwArg(Any)], None]:
258
- def _f(sql: str, **kwargs: Any) -> None:
1165
+ # TODO(eclbg): We shouldn't allow SQL in datasource files
1166
+ def _f(sql: str, *args: Any, **kwargs: Any) -> None:
1167
+ if not parser_state.multiline:
1168
+ raise DatafileSyntaxError(
1169
+ "SQL must be multiline",
1170
+ hint="Use > to start a multiline SQL block",
1171
+ lineno=kwargs["lineno"],
1172
+ pos=1,
1173
+ )
259
1174
  if not parser_state.current_node:
260
- raise ParseException("SQL must be called after a NODE command")
1175
+ raise DatafileSyntaxError(
1176
+ "SQL must be called after a NODE command",
1177
+ lineno=kwargs["lineno"],
1178
+ pos=1,
1179
+ )
261
1180
  parser_state.current_node[var_name] = (
262
1181
  textwrap.dedent(sql).rstrip() if "%" not in sql.strip()[0] else sql.strip()
263
1182
  )
@@ -268,20 +1187,28 @@ def parse(
268
1187
  def assign_node_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
269
1188
  def _f(*args: str, **kwargs: Any) -> None:
270
1189
  if not parser_state.current_node:
271
- raise ParseException("%s must be called after a NODE command" % v)
1190
+ raise DatafileSyntaxError(
1191
+ f"{v} must be called after a NODE command",
1192
+ lineno=kwargs["lineno"],
1193
+ pos=1,
1194
+ )
272
1195
  return assign_var(v)(*args, **kwargs)
273
1196
 
274
1197
  return _f
275
1198
 
1199
+ @multiline_not_supported
276
1200
  def add_token(*args: str, **kwargs: Any) -> None: # token_name, permissions):
1201
+ # lineno = kwargs["lineno"]
277
1202
  if len(args) < 2:
278
- raise ParseException('TOKEN gets two params, token name and permissions e.g TOKEN "read api token" READ')
1203
+ raise DatafileSyntaxError(
1204
+ message='TOKEN takes two params: token name and permissions e.g TOKEN "read api token" READ',
1205
+ lineno=lineno,
1206
+ pos=1,
1207
+ )
1208
+ # TODO(eclbg): We should validate that the permissions are a valid string. We only support READ for pipes and
1209
+ # APPEND for datasources
279
1210
  doc.tokens.append({"token_name": _unquote(args[0]), "permissions": args[1]})
280
1211
 
281
- def test(*args: str, **kwargs: Any) -> None:
282
- # TODO: Should be removed?
283
- print("test", args, kwargs) # noqa: T201
284
-
285
1212
  def include(*args: str, **kwargs: Any) -> None:
286
1213
  f = _unquote(args[0])
287
1214
  f = eval_var(f)
@@ -330,16 +1257,9 @@ def parse(
330
1257
  except FileNotFoundError:
331
1258
  raise IncludeFileNotFoundException(f, lineno)
332
1259
 
1260
+ @deprecated
333
1261
  def version(*args: str, **kwargs: Any) -> None:
334
- if len(args) < 1:
335
- raise ParseException("VERSION gets one positive integer param")
336
- try:
337
- version = int(args[0])
338
- if version < 0:
339
- raise ValidationException("version must be a positive integer e.g VERSION 2")
340
- doc.version = version
341
- except ValueError:
342
- raise ValidationException("version must be a positive integer e.g VERSION 2")
1262
+ pass # whatever, it's deprecated
343
1263
 
344
1264
  def shared_with(*args: str, **kwargs: Any) -> None:
345
1265
  for entries in args:
@@ -381,13 +1301,10 @@ def parse(
381
1301
  doc.filtering_tags += filtering_tags
382
1302
 
383
1303
  cmds = {
384
- "from": assign("from"),
385
1304
  "source": sources,
386
1305
  "maintainer": assign("maintainer"),
387
1306
  "schema": schema,
388
1307
  "indexes": indexes,
389
- # TODO: Added to be able to merge MR 11347, let's remove it afterwards
390
- "indices": indexes,
391
1308
  "engine": set_engine,
392
1309
  "partition_key": assign_var("partition_key"),
393
1310
  "sorting_key": assign_var("sorting_key"),
@@ -408,7 +1325,6 @@ def parse(
408
1325
  "resource": assign_node_var("resource"),
409
1326
  "filter": assign_node_var("filter"),
410
1327
  "token": add_token,
411
- "test": test,
412
1328
  "include": include,
413
1329
  "sql": sql("sql"),
414
1330
  "version": version,
@@ -462,10 +1378,11 @@ def parse(
462
1378
  if default_node:
463
1379
  node(default_node)
464
1380
 
465
- lineno = 0
1381
+ lineno = 1
466
1382
  try:
467
- while lineno < len(lines):
468
- line = lines[lineno]
1383
+ while lineno <= len(lines):
1384
+ line = lines[lineno - 1]
1385
+ # shlex.shlex(line) removes comments that start with #. This doesn't affect multiline commands
469
1386
  try:
470
1387
  sa = shlex.shlex(line)
471
1388
  sa.whitespace_split = True
@@ -479,23 +1396,37 @@ def parse(
479
1396
  if (
480
1397
  parser_state.multiline
481
1398
  and cmd.lower() in cmds
482
- and not (line.startswith(" ") or line.startswith("\t") or line.lower().startswith("from"))
1399
+ and not (line.startswith(" ") or line.startswith("\t"))
483
1400
  ):
484
- parser_state.multiline = False
485
1401
  cmds[parser_state.command](
486
- parser_state.multiline_string, lineno=lineno, replace_includes=replace_includes
1402
+ parser_state.multiline_string,
1403
+ lineno=lineno,
1404
+ replace_includes=replace_includes,
1405
+ cmd=parser_state.command,
487
1406
  )
1407
+ parser_state.multiline = False
488
1408
 
489
1409
  if not parser_state.multiline:
490
1410
  if len(args) >= 1 and args[0] == ">":
491
1411
  parser_state.multiline = True
492
1412
  parser_state.command = cmd.lower()
1413
+ parser_state.start_lineno = lineno
493
1414
  parser_state.multiline_string = ""
494
1415
  else:
495
1416
  if cmd.lower() == "settings":
496
- raise click.ClickException(FeedbackManager.error_settings_not_allowed())
1417
+ msg = (
1418
+ "SETTINGS option is not allowed, use ENGINE_SETTINGS instead. See "
1419
+ "https://www.tinybird.co/docs/cli/datafiles#data-source for more information."
1420
+ )
1421
+ raise DatafileSyntaxError(
1422
+ # TODO(eclbg): add surrounding lines as context to the error so we can print it
1423
+ # offending_line=line,
1424
+ message=msg,
1425
+ lineno=lineno,
1426
+ pos=0,
1427
+ )
497
1428
  if cmd.lower() in cmds:
498
- cmds[cmd.lower()](*args, lineno=lineno, replace_includes=replace_includes)
1429
+ cmds[cmd.lower()](*args, lineno=lineno, replace_includes=replace_includes, cmd=cmd)
499
1430
  else:
500
1431
  raise click.ClickException(FeedbackManager.error_option(option=cmd.upper()))
501
1432
  else:
@@ -503,11 +1434,20 @@ def parse(
503
1434
  lineno += 1
504
1435
  # close final state
505
1436
  if parser_state.multiline:
506
- cmds[parser_state.command](parser_state.multiline_string, lineno=lineno, replace_includes=replace_includes)
1437
+ cmds[parser_state.command](
1438
+ parser_state.multiline_string,
1439
+ lineno=lineno,
1440
+ replace_includes=replace_includes,
1441
+ cmd=parser_state.command,
1442
+ )
1443
+ except DatafileSyntaxError as e:
1444
+ # When the error is in a multiline block, add the start lineno to the error lineno so the error location is in
1445
+ # respect to the whole file
1446
+ if parser_state.multiline:
1447
+ e.lineno += parser_state.start_lineno
1448
+ raise e
507
1449
  except ParseException as e:
508
1450
  raise ParseException(str(e), lineno=lineno)
509
- except ValidationException as e:
510
- raise ValidationException(str(e), lineno=lineno)
511
1451
  except IndexError as e:
512
1452
  if "node" in line.lower():
513
1453
  raise click.ClickException(FeedbackManager.error_missing_node_name())