sqlglot 27.28.1__py3-none-any.whl → 27.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlglot/__init__.py +1 -0
- sqlglot/_version.py +2 -2
- sqlglot/dialects/bigquery.py +1 -0
- sqlglot/dialects/dialect.py +45 -7
- sqlglot/dialects/duckdb.py +17 -3
- sqlglot/dialects/mysql.py +1 -0
- sqlglot/dialects/postgres.py +14 -2
- sqlglot/dialects/snowflake.py +55 -18
- sqlglot/dialects/spark.py +3 -0
- sqlglot/dialects/sqlite.py +1 -0
- sqlglot/executor/__init__.py +5 -10
- sqlglot/executor/python.py +1 -29
- sqlglot/expressions.py +102 -12
- sqlglot/generator.py +16 -2
- sqlglot/helper.py +0 -42
- sqlglot/lineage.py +1 -1
- sqlglot/optimizer/qualify.py +5 -5
- sqlglot/optimizer/qualify_columns.py +89 -9
- sqlglot/optimizer/qualify_tables.py +33 -23
- sqlglot/optimizer/simplify.py +12 -7
- sqlglot/parser.py +16 -8
- {sqlglot-27.28.1.dist-info → sqlglot-27.29.0.dist-info}/METADATA +1 -1
- {sqlglot-27.28.1.dist-info → sqlglot-27.29.0.dist-info}/RECORD +26 -26
- {sqlglot-27.28.1.dist-info → sqlglot-27.29.0.dist-info}/WHEEL +0 -0
- {sqlglot-27.28.1.dist-info → sqlglot-27.29.0.dist-info}/licenses/LICENSE +0 -0
- {sqlglot-27.28.1.dist-info → sqlglot-27.29.0.dist-info}/top_level.txt +0 -0
sqlglot/expressions.py
CHANGED
|
@@ -118,7 +118,7 @@ class Expression(metaclass=_Expression):
|
|
|
118
118
|
self._set_parent(arg_key, value)
|
|
119
119
|
|
|
120
120
|
def __eq__(self, other) -> bool:
|
|
121
|
-
return type(self) is type(other) and hash(self) == hash(other)
|
|
121
|
+
return self is other or (type(self) is type(other) and hash(self) == hash(other))
|
|
122
122
|
|
|
123
123
|
def __hash__(self) -> int:
|
|
124
124
|
if self._hash is None:
|
|
@@ -1893,7 +1893,13 @@ class Comment(Expression):
|
|
|
1893
1893
|
|
|
1894
1894
|
|
|
1895
1895
|
class Comprehension(Expression):
|
|
1896
|
-
arg_types = {
|
|
1896
|
+
arg_types = {
|
|
1897
|
+
"this": True,
|
|
1898
|
+
"expression": True,
|
|
1899
|
+
"position": False,
|
|
1900
|
+
"iterator": True,
|
|
1901
|
+
"condition": False,
|
|
1902
|
+
}
|
|
1897
1903
|
|
|
1898
1904
|
|
|
1899
1905
|
# https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree#mergetree-table-ttl
|
|
@@ -5620,6 +5626,14 @@ class Boolnot(Func):
|
|
|
5620
5626
|
pass
|
|
5621
5627
|
|
|
5622
5628
|
|
|
5629
|
+
class Booland(Func):
|
|
5630
|
+
arg_types = {"this": True, "expression": True}
|
|
5631
|
+
|
|
5632
|
+
|
|
5633
|
+
class Boolor(Func):
|
|
5634
|
+
arg_types = {"this": True, "expression": True}
|
|
5635
|
+
|
|
5636
|
+
|
|
5623
5637
|
# https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#bool_for_json
|
|
5624
5638
|
class JSONBool(Func):
|
|
5625
5639
|
pass
|
|
@@ -5974,11 +5988,11 @@ class Lead(AggFunc):
|
|
|
5974
5988
|
# some dialects have a distinction between first and first_value, usually first is an aggregate func
|
|
5975
5989
|
# and first_value is a window func
|
|
5976
5990
|
class First(AggFunc):
|
|
5977
|
-
|
|
5991
|
+
arg_types = {"this": True, "expression": False}
|
|
5978
5992
|
|
|
5979
5993
|
|
|
5980
5994
|
class Last(AggFunc):
|
|
5981
|
-
|
|
5995
|
+
arg_types = {"this": True, "expression": False}
|
|
5982
5996
|
|
|
5983
5997
|
|
|
5984
5998
|
class FirstValue(AggFunc):
|
|
@@ -6276,6 +6290,14 @@ class WeekOfYear(Func):
|
|
|
6276
6290
|
_sql_names = ["WEEK_OF_YEAR", "WEEKOFYEAR"]
|
|
6277
6291
|
|
|
6278
6292
|
|
|
6293
|
+
class YearOfWeek(Func):
|
|
6294
|
+
_sql_names = ["YEAR_OF_WEEK", "YEAROFWEEK"]
|
|
6295
|
+
|
|
6296
|
+
|
|
6297
|
+
class YearOfWeekIso(Func):
|
|
6298
|
+
_sql_names = ["YEAR_OF_WEEK_ISO", "YEAROFWEEKISO"]
|
|
6299
|
+
|
|
6300
|
+
|
|
6279
6301
|
class MonthsBetween(Func):
|
|
6280
6302
|
arg_types = {"this": True, "expression": True, "roundoff": False}
|
|
6281
6303
|
|
|
@@ -6426,6 +6448,10 @@ class Encode(Func):
|
|
|
6426
6448
|
arg_types = {"this": True, "charset": True}
|
|
6427
6449
|
|
|
6428
6450
|
|
|
6451
|
+
class EqualNull(Func):
|
|
6452
|
+
arg_types = {"this": True, "expression": True}
|
|
6453
|
+
|
|
6454
|
+
|
|
6429
6455
|
class Exp(Func):
|
|
6430
6456
|
pass
|
|
6431
6457
|
|
|
@@ -6570,6 +6596,16 @@ class Greatest(Func):
|
|
|
6570
6596
|
is_var_len_args = True
|
|
6571
6597
|
|
|
6572
6598
|
|
|
6599
|
+
class GreatestIgnoreNulls(Func):
|
|
6600
|
+
arg_types = {"expressions": True}
|
|
6601
|
+
is_var_len_args = True
|
|
6602
|
+
|
|
6603
|
+
|
|
6604
|
+
class LeastIgnoreNulls(Func):
|
|
6605
|
+
arg_types = {"expressions": True}
|
|
6606
|
+
is_var_len_args = True
|
|
6607
|
+
|
|
6608
|
+
|
|
6573
6609
|
# Trino's `ON OVERFLOW TRUNCATE [filler_string] {WITH | WITHOUT} COUNT`
|
|
6574
6610
|
# https://trino.io/docs/current/functions/aggregate.html#listagg
|
|
6575
6611
|
class OverflowTruncateBehavior(Expression):
|
|
@@ -6668,6 +6704,10 @@ class IsInf(Func):
|
|
|
6668
6704
|
_sql_names = ["IS_INF", "ISINF"]
|
|
6669
6705
|
|
|
6670
6706
|
|
|
6707
|
+
class IsNullValue(Func):
|
|
6708
|
+
pass
|
|
6709
|
+
|
|
6710
|
+
|
|
6671
6711
|
# https://www.postgresql.org/docs/current/functions-json.html
|
|
6672
6712
|
class JSON(Expression):
|
|
6673
6713
|
arg_types = {"this": False, "with": False, "unique": False}
|
|
@@ -7349,6 +7389,7 @@ class RegexpReplace(Func):
|
|
|
7349
7389
|
"position": False,
|
|
7350
7390
|
"occurrence": False,
|
|
7351
7391
|
"modifiers": False,
|
|
7392
|
+
"single_replace": False,
|
|
7352
7393
|
}
|
|
7353
7394
|
|
|
7354
7395
|
|
|
@@ -7391,6 +7432,14 @@ class RegexpCount(Func):
|
|
|
7391
7432
|
}
|
|
7392
7433
|
|
|
7393
7434
|
|
|
7435
|
+
class RegrValx(Func):
|
|
7436
|
+
arg_types = {"this": True, "expression": True}
|
|
7437
|
+
|
|
7438
|
+
|
|
7439
|
+
class RegrValy(Func):
|
|
7440
|
+
arg_types = {"this": True, "expression": True}
|
|
7441
|
+
|
|
7442
|
+
|
|
7394
7443
|
class Repeat(Func):
|
|
7395
7444
|
arg_types = {"this": True, "times": True}
|
|
7396
7445
|
|
|
@@ -7754,18 +7803,38 @@ class Uuid(Func):
|
|
|
7754
7803
|
arg_types = {"this": False, "name": False}
|
|
7755
7804
|
|
|
7756
7805
|
|
|
7806
|
+
TIMESTAMP_PARTS = {
|
|
7807
|
+
"year": False,
|
|
7808
|
+
"month": False,
|
|
7809
|
+
"day": False,
|
|
7810
|
+
"hour": False,
|
|
7811
|
+
"min": False,
|
|
7812
|
+
"sec": False,
|
|
7813
|
+
"nano": False,
|
|
7814
|
+
}
|
|
7815
|
+
|
|
7816
|
+
|
|
7757
7817
|
class TimestampFromParts(Func):
|
|
7758
7818
|
_sql_names = ["TIMESTAMP_FROM_PARTS", "TIMESTAMPFROMPARTS"]
|
|
7759
7819
|
arg_types = {
|
|
7760
|
-
|
|
7761
|
-
"month": True,
|
|
7762
|
-
"day": True,
|
|
7763
|
-
"hour": True,
|
|
7764
|
-
"min": True,
|
|
7765
|
-
"sec": True,
|
|
7766
|
-
"nano": False,
|
|
7820
|
+
**TIMESTAMP_PARTS,
|
|
7767
7821
|
"zone": False,
|
|
7768
7822
|
"milli": False,
|
|
7823
|
+
"this": False,
|
|
7824
|
+
"expression": False,
|
|
7825
|
+
}
|
|
7826
|
+
|
|
7827
|
+
|
|
7828
|
+
class TimestampLtzFromParts(Func):
|
|
7829
|
+
_sql_names = ["TIMESTAMP_LTZ_FROM_PARTS", "TIMESTAMPLTZFROMPARTS"]
|
|
7830
|
+
arg_types = TIMESTAMP_PARTS.copy()
|
|
7831
|
+
|
|
7832
|
+
|
|
7833
|
+
class TimestampTzFromParts(Func):
|
|
7834
|
+
_sql_names = ["TIMESTAMP_TZ_FROM_PARTS", "TIMESTAMPTZFROMPARTS"]
|
|
7835
|
+
arg_types = {
|
|
7836
|
+
**TIMESTAMP_PARTS,
|
|
7837
|
+
"zone": False,
|
|
7769
7838
|
}
|
|
7770
7839
|
|
|
7771
7840
|
|
|
@@ -7851,7 +7920,8 @@ class Merge(DML):
|
|
|
7851
7920
|
arg_types = {
|
|
7852
7921
|
"this": True,
|
|
7853
7922
|
"using": True,
|
|
7854
|
-
"on":
|
|
7923
|
+
"on": False,
|
|
7924
|
+
"using_cond": False,
|
|
7855
7925
|
"whens": True,
|
|
7856
7926
|
"with": False,
|
|
7857
7927
|
"returning": False,
|
|
@@ -9355,6 +9425,26 @@ def replace_tree(
|
|
|
9355
9425
|
return new_node
|
|
9356
9426
|
|
|
9357
9427
|
|
|
9428
|
+
def find_tables(expression: Expression) -> t.Set[Table]:
|
|
9429
|
+
"""
|
|
9430
|
+
Find all tables referenced in a query.
|
|
9431
|
+
|
|
9432
|
+
Args:
|
|
9433
|
+
expressions: The query to find the tables in.
|
|
9434
|
+
|
|
9435
|
+
Returns:
|
|
9436
|
+
A set of all the tables.
|
|
9437
|
+
"""
|
|
9438
|
+
from sqlglot.optimizer.scope import traverse_scope
|
|
9439
|
+
|
|
9440
|
+
return {
|
|
9441
|
+
table
|
|
9442
|
+
for scope in traverse_scope(expression)
|
|
9443
|
+
for table in scope.tables
|
|
9444
|
+
if table.name and table.name not in scope.cte_sources
|
|
9445
|
+
}
|
|
9446
|
+
|
|
9447
|
+
|
|
9358
9448
|
def column_table_names(expression: Expression, exclude: str = "") -> t.Set[str]:
|
|
9359
9449
|
"""
|
|
9360
9450
|
Return all table names referenced through columns in an expression.
|
sqlglot/generator.py
CHANGED
|
@@ -2531,6 +2531,12 @@ class Generator(metaclass=_Generator):
|
|
|
2531
2531
|
def boolean_sql(self, expression: exp.Boolean) -> str:
|
|
2532
2532
|
return "TRUE" if expression.this else "FALSE"
|
|
2533
2533
|
|
|
2534
|
+
def booland_sql(self, expression: exp.Booland) -> str:
|
|
2535
|
+
return f"(({self.sql(expression, 'this')}) AND ({self.sql(expression, 'expression')}))"
|
|
2536
|
+
|
|
2537
|
+
def boolor_sql(self, expression: exp.Boolor) -> str:
|
|
2538
|
+
return f"(({self.sql(expression, 'this')}) OR ({self.sql(expression, 'expression')}))"
|
|
2539
|
+
|
|
2534
2540
|
def order_sql(self, expression: exp.Order, flat: bool = False) -> str:
|
|
2535
2541
|
this = self.sql(expression, "this")
|
|
2536
2542
|
this = f"{this} " if this else this
|
|
@@ -4078,9 +4084,15 @@ class Generator(metaclass=_Generator):
|
|
|
4078
4084
|
|
|
4079
4085
|
this = self.sql(table)
|
|
4080
4086
|
using = f"USING {self.sql(expression, 'using')}"
|
|
4081
|
-
on = f"ON {self.sql(expression, 'on')}"
|
|
4082
4087
|
whens = self.sql(expression, "whens")
|
|
4083
4088
|
|
|
4089
|
+
on = self.sql(expression, "on")
|
|
4090
|
+
on = f"ON {on}" if on else ""
|
|
4091
|
+
|
|
4092
|
+
if not on:
|
|
4093
|
+
on = self.expressions(expression, key="using_cond")
|
|
4094
|
+
on = f"USING ({on})" if on else ""
|
|
4095
|
+
|
|
4084
4096
|
returning = self.sql(expression, "returning")
|
|
4085
4097
|
if returning:
|
|
4086
4098
|
whens = f"{whens}{returning}"
|
|
@@ -4244,10 +4256,12 @@ class Generator(metaclass=_Generator):
|
|
|
4244
4256
|
def comprehension_sql(self, expression: exp.Comprehension) -> str:
|
|
4245
4257
|
this = self.sql(expression, "this")
|
|
4246
4258
|
expr = self.sql(expression, "expression")
|
|
4259
|
+
position = self.sql(expression, "position")
|
|
4260
|
+
position = f", {position}" if position else ""
|
|
4247
4261
|
iterator = self.sql(expression, "iterator")
|
|
4248
4262
|
condition = self.sql(expression, "condition")
|
|
4249
4263
|
condition = f" IF {condition}" if condition else ""
|
|
4250
|
-
return f"{this} FOR {expr} IN {iterator}{condition}"
|
|
4264
|
+
return f"{this} FOR {expr}{position} IN {iterator}{condition}"
|
|
4251
4265
|
|
|
4252
4266
|
def columnprefix_sql(self, expression: exp.ColumnPrefix) -> str:
|
|
4253
4267
|
return f"{self.sql(expression, 'this')}({self.sql(expression, 'expression')})"
|
sqlglot/helper.py
CHANGED
|
@@ -7,7 +7,6 @@ import re
|
|
|
7
7
|
import sys
|
|
8
8
|
import typing as t
|
|
9
9
|
from collections.abc import Collection, Set
|
|
10
|
-
from contextlib import contextmanager
|
|
11
10
|
from copy import copy
|
|
12
11
|
from difflib import get_close_matches
|
|
13
12
|
from enum import Enum
|
|
@@ -272,47 +271,6 @@ def tsort(dag: t.Dict[T, t.Set[T]]) -> t.List[T]:
|
|
|
272
271
|
return result
|
|
273
272
|
|
|
274
273
|
|
|
275
|
-
def open_file(file_name: str) -> t.TextIO:
|
|
276
|
-
"""Open a file that may be compressed as gzip and return it in universal newline mode."""
|
|
277
|
-
with open(file_name, "rb") as f:
|
|
278
|
-
gzipped = f.read(2) == b"\x1f\x8b"
|
|
279
|
-
|
|
280
|
-
if gzipped:
|
|
281
|
-
import gzip
|
|
282
|
-
|
|
283
|
-
return gzip.open(file_name, "rt", newline="")
|
|
284
|
-
|
|
285
|
-
return open(file_name, encoding="utf-8", newline="")
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
@contextmanager
|
|
289
|
-
def csv_reader(read_csv: exp.ReadCSV) -> t.Any:
|
|
290
|
-
"""
|
|
291
|
-
Returns a csv reader given the expression `READ_CSV(name, ['delimiter', '|', ...])`.
|
|
292
|
-
|
|
293
|
-
Args:
|
|
294
|
-
read_csv: A `ReadCSV` function call.
|
|
295
|
-
|
|
296
|
-
Yields:
|
|
297
|
-
A python csv reader.
|
|
298
|
-
"""
|
|
299
|
-
args = read_csv.expressions
|
|
300
|
-
file = open_file(read_csv.name)
|
|
301
|
-
|
|
302
|
-
delimiter = ","
|
|
303
|
-
args = iter(arg.name for arg in args) # type: ignore
|
|
304
|
-
for k, v in zip(args, args):
|
|
305
|
-
if k == "delimiter":
|
|
306
|
-
delimiter = v
|
|
307
|
-
|
|
308
|
-
try:
|
|
309
|
-
import csv as csv_
|
|
310
|
-
|
|
311
|
-
yield csv_.reader(file, delimiter=delimiter)
|
|
312
|
-
finally:
|
|
313
|
-
file.close()
|
|
314
|
-
|
|
315
|
-
|
|
316
274
|
def find_new_name(taken: t.Collection[str], base: str) -> str:
|
|
317
275
|
"""
|
|
318
276
|
Searches for a new name.
|
sqlglot/lineage.py
CHANGED
|
@@ -232,7 +232,7 @@ def to_node(
|
|
|
232
232
|
)
|
|
233
233
|
|
|
234
234
|
# if the select is a star add all scope sources as downstreams
|
|
235
|
-
if select.is_star:
|
|
235
|
+
if isinstance(select, exp.Star):
|
|
236
236
|
for source in scope.sources.values():
|
|
237
237
|
if isinstance(source, Scope):
|
|
238
238
|
source = source.expression
|
sqlglot/optimizer/qualify.py
CHANGED
|
@@ -31,7 +31,7 @@ def qualify(
|
|
|
31
31
|
validate_qualify_columns: bool = True,
|
|
32
32
|
quote_identifiers: bool = True,
|
|
33
33
|
identify: bool = True,
|
|
34
|
-
infer_csv_schemas: bool = False,
|
|
34
|
+
on_qualify: t.Optional[t.Callable[[exp.Expression], None]] = None,
|
|
35
35
|
) -> exp.Expression:
|
|
36
36
|
"""
|
|
37
37
|
Rewrite sqlglot AST to have normalized and qualified tables and columns.
|
|
@@ -63,21 +63,21 @@ def qualify(
|
|
|
63
63
|
This step is necessary to ensure correctness for case sensitive queries.
|
|
64
64
|
But this flag is provided in case this step is performed at a later time.
|
|
65
65
|
identify: If True, quote all identifiers, else only necessary ones.
|
|
66
|
-
infer_csv_schemas: Whether to scan READ_CSV calls in order to infer the CSVs' schemas.
|
|
66
|
+
on_qualify: Callback after a table has been qualified.
|
|
67
67
|
|
|
68
68
|
Returns:
|
|
69
69
|
The qualified expression.
|
|
70
70
|
"""
|
|
71
71
|
schema = ensure_schema(schema, dialect=dialect)
|
|
72
|
+
|
|
73
|
+
expression = normalize_identifiers(expression, dialect=dialect)
|
|
72
74
|
expression = qualify_tables(
|
|
73
75
|
expression,
|
|
74
76
|
db=db,
|
|
75
77
|
catalog=catalog,
|
|
76
|
-
schema=schema,
|
|
77
78
|
dialect=dialect,
|
|
78
|
-
infer_csv_schemas=infer_csv_schemas,
|
|
79
|
+
on_qualify=on_qualify,
|
|
79
80
|
)
|
|
80
|
-
expression = normalize_identifiers(expression, dialect=dialect)
|
|
81
81
|
|
|
82
82
|
if isolate_tables:
|
|
83
83
|
expression = isolate_table_selects(expression, schema=schema)
|
|
@@ -551,7 +551,8 @@ def _qualify_columns(scope: Scope, resolver: Resolver, allow_partial_qualificati
|
|
|
551
551
|
continue
|
|
552
552
|
|
|
553
553
|
# column_table can be a '' because bigquery unnest has no table alias
|
|
554
|
-
column_table = resolver.get_table(column_name)
|
|
554
|
+
column_table = resolver.get_table(column)
|
|
555
|
+
|
|
555
556
|
if column_table:
|
|
556
557
|
column.set("table", column_table)
|
|
557
558
|
elif (
|
|
@@ -948,21 +949,29 @@ class Resolver:
|
|
|
948
949
|
self._infer_schema = infer_schema
|
|
949
950
|
self._get_source_columns_cache: t.Dict[t.Tuple[str, bool], t.Sequence[str]] = {}
|
|
950
951
|
|
|
951
|
-
def get_table(self, column_name: str) -> t.Optional[exp.Identifier]:
|
|
952
|
+
def get_table(self, column: str | exp.Column) -> t.Optional[exp.Identifier]:
|
|
952
953
|
"""
|
|
953
954
|
Get the table for a column name.
|
|
954
955
|
|
|
955
956
|
Args:
|
|
956
|
-
column_name: The column name to find the table for.
|
|
957
|
+
column: The column expression (or column name) to find the table for.
|
|
957
958
|
Returns:
|
|
958
959
|
The table name if it can be found/inferred.
|
|
959
960
|
"""
|
|
960
|
-
if self._unambiguous_columns is None:
|
|
961
|
-
self._unambiguous_columns = self._get_unambiguous_columns(
|
|
962
|
-
self._get_all_source_columns()
|
|
963
|
-
)
|
|
964
|
-
|
|
965
|
-
table_name = self._unambiguous_columns.get(column_name)
|
|
961
|
+
column_name = column if isinstance(column, str) else column.name
|
|
962
|
+
|
|
963
|
+
table_name = self._get_table_name_from_sources(column_name)
|
|
964
|
+
|
|
965
|
+
if not table_name and isinstance(column, exp.Column):
|
|
966
|
+
# Fall-back case: If we couldn't find the `table_name` from ALL of the sources,
|
|
967
|
+
# attempt to disambiguate the column based on other characteristics e.g if this column is in a join condition,
|
|
968
|
+
# we may be able to disambiguate based on the source order.
|
|
969
|
+
if join_context := self._get_column_join_context(column):
|
|
970
|
+
# In this case, the return value will be the join that _may_ be able to disambiguate the column
|
|
971
|
+
# and we can use the source columns available at that join to get the table name
|
|
972
|
+
table_name = self._get_table_name_from_sources(
|
|
973
|
+
column_name, self._get_available_source_columns(join_context)
|
|
974
|
+
)
|
|
966
975
|
|
|
967
976
|
if not table_name and self._infer_schema:
|
|
968
977
|
sources_without_schema = tuple(
|
|
@@ -1101,6 +1110,77 @@ class Resolver:
|
|
|
1101
1110
|
}
|
|
1102
1111
|
return self._source_columns
|
|
1103
1112
|
|
|
1113
|
+
def _get_table_name_from_sources(
|
|
1114
|
+
self, column_name: str, source_columns: t.Optional[t.Dict[str, t.Sequence[str]]] = None
|
|
1115
|
+
) -> t.Optional[str]:
|
|
1116
|
+
if not source_columns:
|
|
1117
|
+
# If not supplied, get all sources to calculate unambiguous columns
|
|
1118
|
+
if self._unambiguous_columns is None:
|
|
1119
|
+
self._unambiguous_columns = self._get_unambiguous_columns(
|
|
1120
|
+
self._get_all_source_columns()
|
|
1121
|
+
)
|
|
1122
|
+
|
|
1123
|
+
unambiguous_columns = self._unambiguous_columns
|
|
1124
|
+
else:
|
|
1125
|
+
unambiguous_columns = self._get_unambiguous_columns(source_columns)
|
|
1126
|
+
|
|
1127
|
+
return unambiguous_columns.get(column_name)
|
|
1128
|
+
|
|
1129
|
+
def _get_column_join_context(self, column: exp.Column) -> t.Optional[exp.Join]:
|
|
1130
|
+
"""
|
|
1131
|
+
Check if a column participating in a join can be qualified based on the source order.
|
|
1132
|
+
"""
|
|
1133
|
+
args = self.scope.expression.args
|
|
1134
|
+
joins = args.get("joins")
|
|
1135
|
+
|
|
1136
|
+
if not joins or args.get("laterals") or args.get("pivots"):
|
|
1137
|
+
# Feature gap: We currently don't try to disambiguate columns if other sources
|
|
1138
|
+
# (e.g laterals, pivots) exist alongside joins
|
|
1139
|
+
return None
|
|
1140
|
+
|
|
1141
|
+
join_ancestor = column.find_ancestor(exp.Join, exp.Select)
|
|
1142
|
+
|
|
1143
|
+
if (
|
|
1144
|
+
isinstance(join_ancestor, exp.Join)
|
|
1145
|
+
and join_ancestor.alias_or_name in self.scope.selected_sources
|
|
1146
|
+
):
|
|
1147
|
+
# Ensure that the found ancestor is a join that contains an actual source,
|
|
1148
|
+
# e.g in Clickhouse `b` is an array expression in `a ARRAY JOIN b`
|
|
1149
|
+
return join_ancestor
|
|
1150
|
+
|
|
1151
|
+
return None
|
|
1152
|
+
|
|
1153
|
+
def _get_available_source_columns(
|
|
1154
|
+
self, join_ancestor: exp.Join
|
|
1155
|
+
) -> t.Dict[str, t.Sequence[str]]:
|
|
1156
|
+
"""
|
|
1157
|
+
Get the source columns that are available at the point where a column is referenced.
|
|
1158
|
+
|
|
1159
|
+
For columns in JOIN conditions, this only includes tables that have been joined
|
|
1160
|
+
up to that point. Example:
|
|
1161
|
+
|
|
1162
|
+
```
|
|
1163
|
+
SELECT * FROM t_1 INNER JOIN ... INNER JOIN t_n ON t_1.a = c INNER JOIN t_n+1 ON ...
|
|
1164
|
+
``` ^
|
|
1165
|
+
|
|
|
1166
|
+
+----------------------------------+
|
|
1167
|
+
|
|
|
1168
|
+
⌄
|
|
1169
|
+
The unqualified column `c` is not ambiguous if no other sources up until that
|
|
1170
|
+
join i.e t_1, ..., t_n, contain a column named `c`.
|
|
1171
|
+
|
|
1172
|
+
"""
|
|
1173
|
+
args = self.scope.expression.args
|
|
1174
|
+
|
|
1175
|
+
# Collect tables in order: FROM clause tables + joined tables up to current join
|
|
1176
|
+
from_name = args["from"].alias_or_name
|
|
1177
|
+
available_sources = {from_name: self.get_source_columns(from_name)}
|
|
1178
|
+
|
|
1179
|
+
for join in args["joins"][: t.cast(int, join_ancestor.index) + 1]:
|
|
1180
|
+
available_sources[join.alias_or_name] = self.get_source_columns(join.alias_or_name)
|
|
1181
|
+
|
|
1182
|
+
return available_sources
|
|
1183
|
+
|
|
1104
1184
|
def _get_unambiguous_columns(
|
|
1105
1185
|
self, source_columns: t.Dict[str, t.Sequence[str]]
|
|
1106
1186
|
) -> t.Mapping[str, str]:
|
|
@@ -4,11 +4,10 @@ import itertools
|
|
|
4
4
|
import typing as t
|
|
5
5
|
|
|
6
6
|
from sqlglot import alias, exp
|
|
7
|
-
from sqlglot.dialects.dialect import DialectType
|
|
8
|
-
from sqlglot.helper import csv_reader, name_sequence
|
|
7
|
+
from sqlglot.dialects.dialect import Dialect, DialectType
|
|
8
|
+
from sqlglot.helper import name_sequence
|
|
9
|
+
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
|
|
9
10
|
from sqlglot.optimizer.scope import Scope, traverse_scope
|
|
10
|
-
from sqlglot.schema import Schema
|
|
11
|
-
from sqlglot.dialects.dialect import Dialect
|
|
12
11
|
|
|
13
12
|
if t.TYPE_CHECKING:
|
|
14
13
|
from sqlglot._typing import E
|
|
@@ -18,8 +17,7 @@ def qualify_tables(
|
|
|
18
17
|
expression: E,
|
|
19
18
|
db: t.Optional[str | exp.Identifier] = None,
|
|
20
19
|
catalog: t.Optional[str | exp.Identifier] = None,
|
|
21
|
-
schema: t.Optional[Schema] = None,
|
|
22
|
-
infer_csv_schemas: bool = False,
|
|
20
|
+
on_qualify: t.Optional[t.Callable[[exp.Expression], None]] = None,
|
|
23
21
|
dialect: DialectType = None,
|
|
24
22
|
) -> E:
|
|
25
23
|
"""
|
|
@@ -40,18 +38,28 @@ def qualify_tables(
|
|
|
40
38
|
expression: Expression to qualify
|
|
41
39
|
db: Database name
|
|
42
40
|
catalog: Catalog name
|
|
43
|
-
|
|
44
|
-
infer_csv_schemas: Whether to scan READ_CSV calls in order to infer the CSVs' schemas.
|
|
41
|
+
on_qualify: Callback after a table has been qualified.
|
|
45
42
|
dialect: The dialect to parse catalog and schema into.
|
|
46
43
|
|
|
47
44
|
Returns:
|
|
48
45
|
The qualified expression.
|
|
49
46
|
"""
|
|
50
|
-
next_alias_name = name_sequence("_q_")
|
|
51
|
-
db = exp.parse_identifier(db, dialect=dialect) if db else None
|
|
52
|
-
catalog = exp.parse_identifier(catalog, dialect=dialect) if catalog else None
|
|
53
47
|
dialect = Dialect.get_or_raise(dialect)
|
|
54
48
|
|
|
49
|
+
alias_sequence = name_sequence("_q_")
|
|
50
|
+
|
|
51
|
+
def next_alias_name() -> str:
|
|
52
|
+
return normalize_identifiers(alias_sequence(), dialect=dialect).name
|
|
53
|
+
|
|
54
|
+
if db := db or None:
|
|
55
|
+
db = exp.parse_identifier(db, dialect=dialect)
|
|
56
|
+
db.meta["is_table"] = True
|
|
57
|
+
db = normalize_identifiers(db, dialect=dialect)
|
|
58
|
+
if catalog := catalog or None:
|
|
59
|
+
catalog = exp.parse_identifier(catalog, dialect=dialect)
|
|
60
|
+
catalog.meta["is_table"] = True
|
|
61
|
+
catalog = normalize_identifiers(catalog, dialect=dialect)
|
|
62
|
+
|
|
55
63
|
def _qualify(table: exp.Table) -> None:
|
|
56
64
|
if isinstance(table.this, exp.Identifier):
|
|
57
65
|
if db and not table.args.get("db"):
|
|
@@ -97,7 +105,10 @@ def qualify_tables(
|
|
|
97
105
|
name = source.name
|
|
98
106
|
|
|
99
107
|
# Mutates the source by attaching an alias to it
|
|
100
|
-
alias(source, name or source.name or next_alias_name(), copy=False, table=True)
|
|
108
|
+
normalized_alias = normalize_identifiers(
|
|
109
|
+
name or source.name or alias_sequence(), dialect=dialect
|
|
110
|
+
)
|
|
111
|
+
alias(source, normalized_alias, copy=False, table=True)
|
|
101
112
|
|
|
102
113
|
table_aliases[".".join(p.name for p in source.parts)] = exp.to_identifier(
|
|
103
114
|
source.alias
|
|
@@ -106,7 +117,10 @@ def qualify_tables(
|
|
|
106
117
|
if pivots:
|
|
107
118
|
pivot = pivots[0]
|
|
108
119
|
if not pivot.alias:
|
|
109
|
-
pivot_alias = source.alias if pivot.unpivot else next_alias_name()
|
|
120
|
+
pivot_alias = normalize_identifiers(
|
|
121
|
+
source.alias if pivot.unpivot else alias_sequence(),
|
|
122
|
+
dialect=dialect,
|
|
123
|
+
)
|
|
110
124
|
pivot.set("alias", exp.TableAlias(this=exp.to_identifier(pivot_alias)))
|
|
111
125
|
|
|
112
126
|
# This case corresponds to a pivoted CTE, we don't want to qualify that
|
|
@@ -115,15 +129,8 @@ def qualify_tables(
|
|
|
115
129
|
|
|
116
130
|
_qualify(source)
|
|
117
131
|
|
|
118
|
-
if infer_csv_schemas and isinstance(source.this, exp.ReadCSV):
|
|
119
|
-
with csv_reader(source.this) as reader:
|
|
120
|
-
header = next(reader)
|
|
121
|
-
columns = next(reader)
|
|
122
|
-
schema.add_table(
|
|
123
|
-
source,
|
|
124
|
-
{k: type(v).__name__ for k, v in zip(header, columns)},
|
|
125
|
-
match_depth=False,
|
|
126
|
-
)
|
|
132
|
+
if on_qualify:
|
|
133
|
+
on_qualify(source)
|
|
127
134
|
elif isinstance(source, Scope) and source.is_udtf:
|
|
128
135
|
udtf = source.expression
|
|
129
136
|
table_alias = udtf.args.get("alias") or exp.TableAlias(
|
|
@@ -134,7 +141,10 @@ def qualify_tables(
|
|
|
134
141
|
if not table_alias.name:
|
|
135
142
|
table_alias.set("this", exp.to_identifier(next_alias_name()))
|
|
136
143
|
if isinstance(udtf, exp.Values) and not table_alias.columns:
|
|
137
|
-
column_aliases = dialect.generate_values_aliases(udtf)
|
|
144
|
+
column_aliases = [
|
|
145
|
+
normalize_identifiers(i, dialect=dialect)
|
|
146
|
+
for i in dialect.generate_values_aliases(udtf)
|
|
147
|
+
]
|
|
138
148
|
table_alias.set("columns", column_aliases)
|
|
139
149
|
else:
|
|
140
150
|
for node in scope.walk():
|
sqlglot/optimizer/simplify.py
CHANGED
|
@@ -125,7 +125,7 @@ def simplify(
|
|
|
125
125
|
node.set(k, v)
|
|
126
126
|
|
|
127
127
|
# Post-order transformations
|
|
128
|
-
new_node = simplify_not(node)
|
|
128
|
+
new_node = simplify_not(node, dialect)
|
|
129
129
|
new_node = flatten(new_node)
|
|
130
130
|
new_node = simplify_connectors(new_node, root)
|
|
131
131
|
new_node = remove_complements(new_node, root)
|
|
@@ -202,7 +202,7 @@ COMPLEMENT_SUBQUERY_PREDICATES = {
|
|
|
202
202
|
}
|
|
203
203
|
|
|
204
204
|
|
|
205
|
-
def simplify_not(expression):
|
|
205
|
+
def simplify_not(expression: exp.Expression, dialect: Dialect) -> exp.Expression:
|
|
206
206
|
"""
|
|
207
207
|
Demorgan's Law
|
|
208
208
|
NOT (x OR y) -> NOT x AND NOT y
|
|
@@ -243,10 +243,12 @@ def simplify_not(expression):
|
|
|
243
243
|
return exp.false()
|
|
244
244
|
if is_false(this):
|
|
245
245
|
return exp.true()
|
|
246
|
-
if isinstance(this, exp.Not):
|
|
247
|
-
# double negation
|
|
248
|
-
# NOT NOT x -> x
|
|
249
|
-
return this.this
|
|
246
|
+
if isinstance(this, exp.Not) and dialect.SAFE_TO_ELIMINATE_DOUBLE_NEGATION:
|
|
247
|
+
inner = this.this
|
|
248
|
+
if inner.is_type(exp.DataType.Type.BOOLEAN) or isinstance(inner, exp.Predicate):
|
|
249
|
+
# double negation
|
|
250
|
+
# NOT NOT x -> x, if x is BOOLEAN type
|
|
251
|
+
return inner
|
|
250
252
|
return expression
|
|
251
253
|
|
|
252
254
|
|
|
@@ -760,7 +762,10 @@ def simplify_parens(expression: exp.Expression, dialect: DialectType = None) ->
|
|
|
760
762
|
not isinstance(this, exp.Binary)
|
|
761
763
|
and not (isinstance(this, (exp.Not, exp.Is)) and parent_is_predicate)
|
|
762
764
|
)
|
|
763
|
-
or (isinstance(this, exp.Predicate) and not parent_is_predicate)
|
|
765
|
+
or (
|
|
766
|
+
isinstance(this, exp.Predicate)
|
|
767
|
+
and not (parent_is_predicate or isinstance(parent, exp.Neg))
|
|
768
|
+
)
|
|
764
769
|
or (isinstance(this, exp.Add) and isinstance(parent, exp.Add))
|
|
765
770
|
or (isinstance(this, exp.Mul) and isinstance(parent, exp.Mul))
|
|
766
771
|
or (isinstance(this, exp.Mul) and isinstance(parent, (exp.Add, exp.Sub)))
|