PyPI - sqlglot - Versions diffs - 28.4.1__py3-none-any.whl → 28.8.0__py3-none-any.whl - Mend

sqlglot 28.4.1__py3-none-any.whl → 28.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

sqlglot/_version.py +2 -2
sqlglot/dialects/bigquery.py +20 -23
sqlglot/dialects/clickhouse.py +2 -0
sqlglot/dialects/dialect.py +355 -18
sqlglot/dialects/doris.py +38 -90
sqlglot/dialects/druid.py +1 -0
sqlglot/dialects/duckdb.py +1739 -163
sqlglot/dialects/exasol.py +17 -1
sqlglot/dialects/hive.py +27 -2
sqlglot/dialects/mysql.py +103 -11
sqlglot/dialects/oracle.py +38 -1
sqlglot/dialects/postgres.py +142 -33
sqlglot/dialects/presto.py +6 -2
sqlglot/dialects/redshift.py +7 -1
sqlglot/dialects/singlestore.py +13 -3
sqlglot/dialects/snowflake.py +271 -21
sqlglot/dialects/spark.py +25 -0
sqlglot/dialects/spark2.py +4 -3
sqlglot/dialects/starrocks.py +152 -17
sqlglot/dialects/trino.py +1 -0
sqlglot/dialects/tsql.py +5 -0
sqlglot/diff.py +1 -1
sqlglot/expressions.py +239 -47
sqlglot/generator.py +173 -44
sqlglot/optimizer/annotate_types.py +129 -60
sqlglot/optimizer/merge_subqueries.py +13 -2
sqlglot/optimizer/qualify_columns.py +7 -0
sqlglot/optimizer/resolver.py +19 -0
sqlglot/optimizer/scope.py +12 -0
sqlglot/optimizer/unnest_subqueries.py +7 -0
sqlglot/parser.py +251 -58
sqlglot/schema.py +186 -14
sqlglot/tokens.py +36 -6
sqlglot/transforms.py +6 -5
sqlglot/typing/__init__.py +29 -10
sqlglot/typing/bigquery.py +5 -10
sqlglot/typing/duckdb.py +39 -0
sqlglot/typing/hive.py +50 -1
sqlglot/typing/mysql.py +32 -0
sqlglot/typing/presto.py +0 -1
sqlglot/typing/snowflake.py +80 -17
sqlglot/typing/spark.py +29 -0
sqlglot/typing/spark2.py +9 -1
sqlglot/typing/tsql.py +21 -0
{sqlglot-28.4.1.dist-info → sqlglot-28.8.0.dist-info}/METADATA +47 -2
sqlglot-28.8.0.dist-info/RECORD +95 -0
{sqlglot-28.4.1.dist-info → sqlglot-28.8.0.dist-info}/WHEEL +1 -1
sqlglot-28.4.1.dist-info/RECORD +0 -92
{sqlglot-28.4.1.dist-info → sqlglot-28.8.0.dist-info}/licenses/LICENSE +0 -0
{sqlglot-28.4.1.dist-info → sqlglot-28.8.0.dist-info}/top_level.txt +0 -0

sqlglot/optimizer/annotate_types.py CHANGED Viewed

@@ -30,6 +30,15 @@ if t.TYPE_CHECKING:
 logger = logging.getLogger("sqlglot")
+# EXTRACT/DATE_PART specifiers that return BIGINT instead of INT
+BIGINT_EXTRACT_DATE_PARTS = {
+    "EPOCH_SECOND",
+    "EPOCH_MILLISECOND",
+    "EPOCH_MICROSECOND",
+    "EPOCH_NANOSECOND",
+    "NANOSECOND",
+}
 def annotate_types(
     expression: E,
@@ -213,10 +222,14 @@ class TypeAnnotator(metaclass=_TypeAnnotator):
         # When set to False, this enables partial annotation by skipping already-annotated nodes
         self._overwrite_types = overwrite_types
+        # Maps Scope to its corresponding selected sources
+        self._scope_selects: t.Dict[Scope, t.Dict[str, t.Dict[str, t.Any]]] = {}
     def clear(self) -> None:
         self._visited.clear()
         self._null_expressions.clear()
         self._setop_column_types.clear()
+        self._scope_selects.clear()
     def _set_type(
         self, expression: E, target_type: t.Optional[exp.DataType | exp.DataType.Type]
@@ -268,53 +281,58 @@ class TypeAnnotator(metaclass=_TypeAnnotator):
         return expression
-    def annotate_scope(self, scope: Scope) -> None:
-        selects = {}
-        for name, source in scope.sources.items():
-            if not isinstance(source, Scope):
-                continue
-            expression = source.expression
-            if isinstance(expression, exp.UDTF):
-                values = []
+    def _get_scope_selects(self, scope: Scope) -> t.Dict[str, t.Dict[str, t.Any]]:
+        if scope not in self._scope_selects:
+            selects = {}
+            for name, source in scope.sources.items():
+                if not isinstance(source, Scope):
+                    continue
-                if isinstance(expression, exp.Lateral):
-                    if isinstance(expression.this, exp.Explode):
-                        values = [expression.this.this]
-                elif isinstance(expression, exp.Unnest):
-                    values = [expression]
-                elif not isinstance(expression, exp.TableFromRows):
-                    values = expression.expressions[0].expressions
+                expression = source.expression
+                if isinstance(expression, exp.UDTF):
+                    values = []
-                if not values:
-                    continue
+                    if isinstance(expression, exp.Lateral):
+                        if isinstance(expression.this, exp.Explode):
+                            values = [expression.this.this]
+                    elif isinstance(expression, exp.Unnest):
+                        values = [expression]
+                    elif not isinstance(expression, exp.TableFromRows):
+                        values = expression.expressions[0].expressions
-                alias_column_names = expression.alias_column_names
+                    if not values:
+                        continue
-                if (
-                    isinstance(expression, exp.Unnest)
-                    and not alias_column_names
-                    and expression.type
-                    and expression.type.is_type(exp.DataType.Type.STRUCT)
-                ):
-                    selects[name] = {
-                        col_def.name: t.cast(t.Union[exp.DataType, exp.DataType.Type], col_def.kind)
-                        for col_def in expression.type.expressions
-                        if isinstance(col_def, exp.ColumnDef) and col_def.kind
-                    }
+                    alias_column_names = expression.alias_column_names
+                    if (
+                        isinstance(expression, exp.Unnest)
+                        and expression.type
+                        and expression.type.is_type(exp.DataType.Type.STRUCT)
+                    ):
+                        selects[name] = {
+                            col_def.name: t.cast(
+                                t.Union[exp.DataType, exp.DataType.Type], col_def.kind
+                            )
+                            for col_def in expression.type.expressions
+                            if isinstance(col_def, exp.ColumnDef) and col_def.kind
+                        }
+                    else:
+                        selects[name] = {
+                            alias: column.type for alias, column in zip(alias_column_names, values)
+                        }
+                elif isinstance(expression, exp.SetOperation) and len(
+                    expression.left.selects
+                ) == len(expression.right.selects):
+                    selects[name] = self._get_setop_column_types(expression)
                 else:
-                    selects[name] = {
-                        alias: column.type for alias, column in zip(alias_column_names, values)
-                    }
-            elif isinstance(expression, exp.SetOperation) and len(expression.left.selects) == len(
-                expression.right.selects
-            ):
-                selects[name] = self._get_setop_column_types(expression)
+                    selects[name] = {s.alias_or_name: s.type for s in expression.selects}
-            else:
-                selects[name] = {s.alias_or_name: s.type for s in expression.selects}
+            self._scope_selects[scope] = selects
+        return self._scope_selects[scope]
+    def annotate_scope(self, scope: Scope) -> None:
         if isinstance(self.schema, MappingSchema):
             for table_column in scope.table_columns:
                 source = scope.sources.get(table_column.name)
@@ -345,7 +363,7 @@ class TypeAnnotator(metaclass=_TypeAnnotator):
                     self._set_type(table_column, source.expression.meta["query_type"])
         # Iterate through all the expressions of the current scope in post-order, and annotate
-        self._annotate_expression(scope.expression, scope, selects)
+        self._annotate_expression(scope.expression, scope)
         if self.dialect.QUERY_RESULTS_ARE_STRUCTS and isinstance(scope.expression, exp.Query):
             struct_type = exp.DataType(
@@ -374,10 +392,8 @@ class TypeAnnotator(metaclass=_TypeAnnotator):
         self,
         expression: exp.Expression,
         scope: t.Optional[Scope] = None,
-        selects: t.Optional[t.Dict[str, t.Dict[str, t.Any]]] = None,
     ) -> None:
         stack = [(expression, False)]
-        selects = selects or {}
         while stack:
             expr, children_annotated = stack.pop()
@@ -396,12 +412,21 @@ class TypeAnnotator(metaclass=_TypeAnnotator):
                 continue
             if scope and isinstance(expr, exp.Column) and expr.table:
-                source = scope.sources.get(expr.table)
+                source = None
+                source_scope = scope
+                while source_scope and not source:
+                    source = source_scope.sources.get(expr.table)
+                    if not source:
+                        source_scope = source_scope.parent
                 if isinstance(source, exp.Table):
                     self._set_type(expr, self.schema.get_column_type(source, expr))
                 elif source:
-                    if expr.table in selects and expr.name in selects[expr.table]:
-                        self._set_type(expr, selects[expr.table][expr.name])
+                    col_type = (
+                        self._get_scope_selects(source_scope).get(expr.table, {}).get(expr.name)
+                    )
+                    if col_type:
+                        self._set_type(expr, col_type)
                     elif isinstance(source.expression, exp.Unnest):
                         self._set_type(expr, source.expression.type)
                     else:
@@ -536,7 +561,7 @@ class TypeAnnotator(metaclass=_TypeAnnotator):
         elif (left_type, right_type) in self.binary_coercions:
             self._set_type(expression, self.binary_coercions[(left_type, right_type)](left, right))
         else:
-            self._set_type(expression, self._maybe_coerce(left_type, right_type))
+            self._annotate_by_args(expression, left, right)
         if isinstance(expression, exp.Is) or (
             left.meta.get("nonnull") is True and right.meta.get("nonnull") is True
@@ -572,28 +597,64 @@ class TypeAnnotator(metaclass=_TypeAnnotator):
     def _annotate_by_args(
         self,
         expression: E,
-        *args: str,
+        *args: str | exp.Expression,
         promote: bool = False,
         array: bool = False,
     ) -> E:
-        expressions: t.List[exp.Expression] = []
+        literal_type = None
+        non_literal_type = None
+        nested_type = None
         for arg in args:
-            arg_expr = expression.args.get(arg)
-            expressions.extend(expr for expr in ensure_list(arg_expr) if expr)
+            if isinstance(arg, str):
+                expressions = expression.args.get(arg)
+            else:
+                expressions = arg
-        last_datatype = None
-        for expr in expressions:
-            expr_type = expr.type
+            for expr in ensure_list(expressions):
+                expr_type = expr.type
-            # Stop at the first nested data type found - we don't want to _maybe_coerce nested types
-            if expr_type.args.get("nested"):
-                last_datatype = expr_type
+                # Stop at the first nested data type found - we don't want to _maybe_coerce nested types
+                if expr_type.args.get("nested"):
+                    nested_type = expr_type
+                    break
+                if isinstance(expr, exp.Literal):
+                    literal_type = self._maybe_coerce(literal_type or expr_type, expr_type)
+                else:
+                    non_literal_type = self._maybe_coerce(non_literal_type or expr_type, expr_type)
+            if nested_type:
                 break
-            if not expr_type.is_type(exp.DataType.Type.UNKNOWN):
-                last_datatype = self._maybe_coerce(last_datatype or expr_type, expr_type)
+        result_type = None
+        if nested_type:
+            result_type = nested_type
+        elif literal_type and non_literal_type:
+            if self.dialect.PRIORITIZE_NON_LITERAL_TYPES:
+                literal_this_type = (
+                    literal_type.this if isinstance(literal_type, exp.DataType) else literal_type
+                )
+                non_literal_this_type = (
+                    non_literal_type.this
+                    if isinstance(non_literal_type, exp.DataType)
+                    else non_literal_type
+                )
+                if (
+                    literal_this_type in exp.DataType.INTEGER_TYPES
+                    and non_literal_this_type in exp.DataType.INTEGER_TYPES
+                ) or (
+                    literal_this_type in exp.DataType.REAL_TYPES
+                    and non_literal_this_type in exp.DataType.REAL_TYPES
+                ):
+                    result_type = non_literal_type
+        else:
+            result_type = literal_type or non_literal_type or exp.DataType.Type.UNKNOWN
-        self._set_type(expression, last_datatype)
+        self._set_type(
+            expression, result_type or self._maybe_coerce(non_literal_type, literal_type)
+        )
         if promote:
             if expression.type.this in exp.DataType.INTEGER_TYPES:
@@ -661,6 +722,12 @@ class TypeAnnotator(metaclass=_TypeAnnotator):
     def _annotate_dot(self, expression: exp.Dot) -> exp.Dot:
         self._set_type(expression, None)
+        # Propagate type from qualified UDF calls (e.g., db.my_udf(...))
+        if isinstance(expression.expression, exp.Anonymous):
+            self._set_type(expression, expression.expression.type)
+            return expression
         this_type = expression.this.type
         if this_type and this_type.is_type(exp.DataType.Type.STRUCT):
@@ -784,6 +851,8 @@ class TypeAnnotator(metaclass=_TypeAnnotator):
             self._set_type(expression, exp.DataType.Type.TIME)
         elif part == "DATE":
             self._set_type(expression, exp.DataType.Type.DATE)
+        elif part in BIGINT_EXTRACT_DATE_PARTS:
+            self._set_type(expression, exp.DataType.Type.BIGINT)
         else:
             self._set_type(expression, exp.DataType.Type.INT)
         return expression

sqlglot/optimizer/merge_subqueries.py CHANGED Viewed

@@ -326,14 +326,25 @@ def _merge_expressions(outer_scope: Scope, inner_scope: Scope, alias: str) -> No
         expression = expression.unalias()
         must_wrap_expression = not isinstance(expression, SAFE_TO_REPLACE_UNWRAPPED)
+        is_number = expression.is_number
         for column in columns_to_replace:
+            parent = column.parent
+            # Ensures that we don't merge literal numbers in GROUP BY as they have positional context
+            # e.g. don't transform `SELECT a FROM (SELECT 6 AS a) GROUP BY a` to `SELECT 6 AS a GROUP BY 6`,
+            # as this would attempt to GROUP BY the 6th projection instead of the column `a`
+            if is_number and isinstance(parent, exp.Group):
+                column.replace(exp.to_identifier(column.name))
+                continue
             # Ensures we don't alter the intended operator precedence if there's additional
             # context surrounding the outer expression (i.e. it's not a simple projection).
-            if isinstance(column.parent, (exp.Unary, exp.Binary)) and must_wrap_expression:
+            if isinstance(parent, (exp.Unary, exp.Binary)) and must_wrap_expression:
                 expression = exp.paren(expression, copy=False)
             # make sure we do not accidentally change the name of the column
-            if isinstance(column.parent, exp.Select) and column.name != expression.name:
+            if isinstance(parent, exp.Select) and column.name != expression.name:
                 expression = exp.alias_(expression, column.name)
             column.replace(expression.copy())

sqlglot/optimizer/qualify_columns.py CHANGED Viewed

@@ -610,6 +610,13 @@ def _qualify_columns(
             # column_table can be a '' because bigquery unnest has no table alias
             column_table = resolver.get_table(column)
+            if (
+                column_table
+                and isinstance(source := scope.sources.get(column_table.name), Scope)
+                and id(column) in source.column_index
+            ):
+                continue
             if column_table:
                 column.set("table", column_table)
             elif (

sqlglot/optimizer/resolver.py CHANGED Viewed

@@ -305,6 +305,21 @@ class Resolver:
             # Performance optimization - avoid copying first_columns if there is only one table.
             return SingleValuedMapping(first_columns, first_table)
+        # For BigQuery UNNEST_COLUMN_ONLY, build a mapping of original UNNEST aliases
+        # from alias.columns[0] to their source names. This is used to resolve shadowing
+        # where an UNNEST alias shadows a column name from another table.
+        unnest_original_aliases: t.Dict[str, str] = {}
+        if self.dialect.UNNEST_COLUMN_ONLY:
+            unnest_original_aliases = {
+                alias_arg.columns[0].name: source_name
+                for source_name, source in self.scope.sources.items()
+                if (
+                    isinstance(source.expression, exp.Unnest)
+                    and (alias_arg := source.expression.args.get("alias"))
+                    and alias_arg.columns
+                )
+            }
         unambiguous_columns = {col: first_table for col in first_columns}
         all_columns = set(unambiguous_columns)
@@ -314,6 +329,10 @@ class Resolver:
             all_columns.update(columns)
             for column in ambiguous:
+                if column in unnest_original_aliases:
+                    unambiguous_columns[column] = unnest_original_aliases[column]
+                    continue
                 unambiguous_columns.pop(column, None)
             for column in unique.difference(ambiguous):
                 unambiguous_columns[column] = table

sqlglot/optimizer/scope.py CHANGED Viewed

@@ -103,6 +103,7 @@ class Scope:
         self._pivots = None
         self._references = None
         self._semi_anti_join_tables = None
+        self._column_index = None
     def branch(
         self, expression, scope_type, sources=None, cte_sources=None, lateral_sources=None, **kwargs
@@ -131,6 +132,7 @@ class Scope:
         self._stars = []
         self._join_hints = []
         self._semi_anti_join_tables = set()
+        self._column_index = set()
         for node in self.walk(bfs=False):
             if node is self.expression:
@@ -139,6 +141,8 @@ class Scope:
             if isinstance(node, exp.Dot) and node.is_star:
                 self._stars.append(node)
             elif isinstance(node, exp.Column) and not isinstance(node, exp.Pseudocolumn):
+                self._column_index.add(id(node))
                 if isinstance(node.this, exp.Star):
                     self._stars.append(node)
                 else:
@@ -259,6 +263,14 @@ class Scope:
         self._ensure_collected()
         return self._stars
+    @property
+    def column_index(self) -> t.Set[int]:
+        """
+        Set of column object IDs that belong to this scope's expression.
+        """
+        self._ensure_collected()
+        return self._column_index
     @property
     def columns(self):
         """

sqlglot/optimizer/unnest_subqueries.py CHANGED Viewed

@@ -43,6 +43,12 @@ def unnest(select, parent_select, next_alias_name):
     predicate = select.find_ancestor(exp.Condition)
     if (
         not predicate
+        # Do not unnest subqueries inside table-valued functions such as
+        # FROM GENERATE_SERIES(...), FROM UNNEST(...) etc in order to preserve join order
+        or (
+            isinstance(predicate, exp.Func)
+            and isinstance(predicate.parent, (exp.Table, exp.From, exp.Join))
+        )
         or parent_select is not predicate.parent_select
         or not parent_select.args.get("from_")
     ):
@@ -83,6 +89,7 @@ def unnest(select, parent_select, next_alias_name):
         _replace(select.parent, column)
         parent_select.join(select, on=on_clause, join_type=join_type, join_alias=alias, copy=False)
         return
     if select.find(exp.Limit, exp.Offset):

sqlglot 28.4.1__py3-none-any.whl → 28.8.0__py3-none-any.whl

sqlglot 28.4.1__py3-none-any.whl → 28.8.0__py3-none-any.whl