sqlglot 27.27.0__py3-none-any.whl → 28.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. sqlglot/__init__.py +1 -0
  2. sqlglot/__main__.py +6 -4
  3. sqlglot/_version.py +2 -2
  4. sqlglot/dialects/bigquery.py +118 -279
  5. sqlglot/dialects/clickhouse.py +73 -5
  6. sqlglot/dialects/databricks.py +38 -1
  7. sqlglot/dialects/dialect.py +354 -275
  8. sqlglot/dialects/dremio.py +4 -1
  9. sqlglot/dialects/duckdb.py +754 -25
  10. sqlglot/dialects/exasol.py +243 -10
  11. sqlglot/dialects/hive.py +8 -8
  12. sqlglot/dialects/mysql.py +14 -4
  13. sqlglot/dialects/oracle.py +29 -0
  14. sqlglot/dialects/postgres.py +60 -26
  15. sqlglot/dialects/presto.py +47 -16
  16. sqlglot/dialects/redshift.py +16 -0
  17. sqlglot/dialects/risingwave.py +3 -0
  18. sqlglot/dialects/singlestore.py +12 -3
  19. sqlglot/dialects/snowflake.py +239 -218
  20. sqlglot/dialects/spark.py +15 -4
  21. sqlglot/dialects/spark2.py +11 -48
  22. sqlglot/dialects/sqlite.py +10 -0
  23. sqlglot/dialects/starrocks.py +3 -0
  24. sqlglot/dialects/teradata.py +5 -8
  25. sqlglot/dialects/trino.py +6 -0
  26. sqlglot/dialects/tsql.py +61 -22
  27. sqlglot/diff.py +4 -2
  28. sqlglot/errors.py +69 -0
  29. sqlglot/executor/__init__.py +5 -10
  30. sqlglot/executor/python.py +1 -29
  31. sqlglot/expressions.py +637 -100
  32. sqlglot/generator.py +160 -43
  33. sqlglot/helper.py +2 -44
  34. sqlglot/lineage.py +10 -4
  35. sqlglot/optimizer/annotate_types.py +247 -140
  36. sqlglot/optimizer/canonicalize.py +6 -1
  37. sqlglot/optimizer/eliminate_joins.py +1 -1
  38. sqlglot/optimizer/eliminate_subqueries.py +2 -2
  39. sqlglot/optimizer/merge_subqueries.py +5 -5
  40. sqlglot/optimizer/normalize.py +20 -13
  41. sqlglot/optimizer/normalize_identifiers.py +17 -3
  42. sqlglot/optimizer/optimizer.py +4 -0
  43. sqlglot/optimizer/pushdown_predicates.py +1 -1
  44. sqlglot/optimizer/qualify.py +18 -10
  45. sqlglot/optimizer/qualify_columns.py +122 -275
  46. sqlglot/optimizer/qualify_tables.py +128 -76
  47. sqlglot/optimizer/resolver.py +374 -0
  48. sqlglot/optimizer/scope.py +27 -16
  49. sqlglot/optimizer/simplify.py +1075 -959
  50. sqlglot/optimizer/unnest_subqueries.py +12 -2
  51. sqlglot/parser.py +296 -170
  52. sqlglot/planner.py +2 -2
  53. sqlglot/schema.py +15 -4
  54. sqlglot/tokens.py +42 -7
  55. sqlglot/transforms.py +77 -22
  56. sqlglot/typing/__init__.py +316 -0
  57. sqlglot/typing/bigquery.py +376 -0
  58. sqlglot/typing/hive.py +12 -0
  59. sqlglot/typing/presto.py +24 -0
  60. sqlglot/typing/snowflake.py +505 -0
  61. sqlglot/typing/spark2.py +58 -0
  62. sqlglot/typing/tsql.py +9 -0
  63. {sqlglot-27.27.0.dist-info → sqlglot-28.4.0.dist-info}/METADATA +2 -2
  64. sqlglot-28.4.0.dist-info/RECORD +92 -0
  65. sqlglot-27.27.0.dist-info/RECORD +0 -84
  66. {sqlglot-27.27.0.dist-info → sqlglot-28.4.0.dist-info}/WHEEL +0 -0
  67. {sqlglot-27.27.0.dist-info → sqlglot-28.4.0.dist-info}/licenses/LICENSE +0 -0
  68. {sqlglot-27.27.0.dist-info → sqlglot-28.4.0.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,6 @@ from sqlglot.helper import (
     flatten,
     is_int,
     seq_get,
-    subclasses,
     suggest_closest_match_and_fail,
     to_bool,
 )
@@ -26,6 +25,7 @@ from sqlglot.parser import Parser
 from sqlglot.time import TIMEZONES, format_time, subsecond_precision
 from sqlglot.tokens import Token, Tokenizer, TokenType
 from sqlglot.trie import new_trie
+from sqlglot.typing import EXPRESSION_METADATA

 DATE_ADD_OR_DIFF = t.Union[
     exp.DateAdd,
@@ -44,17 +44,15 @@ DATETIME_DELTA = t.Union[
     exp.DatetimeSub,
     exp.TimeAdd,
     exp.TimeSub,
+    exp.TimestampAdd,
     exp.TimestampSub,
     exp.TsOrDsAdd,
 ]
+DATETIME_ADD = (exp.DateAdd, exp.TimeAdd, exp.DatetimeAdd, exp.TsOrDsAdd, exp.TimestampAdd)

 if t.TYPE_CHECKING:
     from sqlglot._typing import B, E, F

-    from sqlglot.optimizer.annotate_types import TypeAnnotator
-
-    AnnotatorsType = t.Dict[t.Type[E], t.Callable[[TypeAnnotator, E], E]]
-
 logger = logging.getLogger("sqlglot")

 UNESCAPED_SEQUENCES = {
@@ -69,10 +67,6 @@ UNESCAPED_SEQUENCES = {
 }


-def annotate_with_type_lambda(data_type: exp.DataType.Type) -> t.Callable[[TypeAnnotator, E], E]:
-    return lambda self, e: self._annotate_with_type(e, data_type)
-
-
 class Dialects(str, Enum):
     """Dialects supported by SQLGLot."""

@@ -130,20 +124,6 @@ class NormalizationStrategy(str, AutoName):
     """Always case-insensitive (uppercase), regardless of quotes."""


-class Version(int):
-    def __new__(cls, version_str: t.Optional[str], *args, **kwargs):
-        if version_str:
-            parts = version_str.split(".")
-            parts.extend(["0"] * (3 - len(parts)))
-            v = int("".join([p.zfill(3) for p in parts]))
-        else:
-            # No version defined means we should support the latest engine semantics, so
-            # the comparison to any specific version should yield that latest is greater
-            v = sys.maxsize
-
-        return super(Version, cls).__new__(cls, v)
-
-
 class _Dialect(type):
     _classes: t.Dict[str, t.Type[Dialect]] = {}

@@ -205,7 +185,11 @@ class _Dialect(type):
         klass.FORMAT_TRIE = (
             new_trie(klass.FORMAT_MAPPING) if klass.FORMAT_MAPPING else klass.TIME_TRIE
         )
-        klass.INVERSE_TIME_MAPPING = {v: k for k, v in klass.TIME_MAPPING.items()}
+        # Merge class-defined INVERSE_TIME_MAPPING with auto-generated mappings
+        # This allows dialects to define custom inverse mappings for roundtrip correctness
+        klass.INVERSE_TIME_MAPPING = {v: k for k, v in klass.TIME_MAPPING.items()} | (
+            klass.__dict__.get("INVERSE_TIME_MAPPING") or {}
+        )
         klass.INVERSE_TIME_TRIE = new_trie(klass.INVERSE_TIME_MAPPING)
         klass.INVERSE_FORMAT_MAPPING = {v: k for k, v in klass.FORMAT_MAPPING.items()}
         klass.INVERSE_FORMAT_TRIE = new_trie(klass.INVERSE_FORMAT_MAPPING)
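
For illustration (not part of the diff): the merge above uses PEP 584 dict union, where the right-hand operand wins on key conflicts, so an INVERSE_TIME_MAPPING declared on a dialect class body overrides the auto-generated inverse of TIME_MAPPING. A standalone sketch with made-up mappings:

    # Toy TIME_MAPPING and class-level override; not taken from any real dialect.
    time_mapping = {"%Y": "yyyy", "%m": "MM", "%d": "dd"}

    auto_inverse = {v: k for k, v in time_mapping.items()}   # format token -> strftime token
    class_override = {"MM": "%m", "mon": "%b"}                # hand-written inverse entries

    merged = auto_inverse | class_override                    # right-hand side wins on conflicts
    assert merged == {"yyyy": "%Y", "MM": "%m", "dd": "%d", "mon": "%b"}
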
@@ -261,6 +245,9 @@ class _Dialect(type):

         klass.SUPPORTS_COLUMN_JOIN_MARKS = "(+)" in klass.tokenizer_class.KEYWORDS

+        if enum not in ("", "bigquery", "snowflake"):
+            klass.INITCAP_SUPPORTS_CUSTOM_DELIMITERS = False
+
         if enum not in ("", "bigquery"):
             klass.generator_class.SELECT_KINDS = ()

@@ -292,6 +279,54 @@ class _Dialect(type):
             TokenType.SEMI,
         }

+        if enum not in (
+            "",
+            "postgres",
+            "duckdb",
+            "redshift",
+            "snowflake",
+            "presto",
+            "trino",
+            "mysql",
+            "singlestore",
+        ):
+            no_paren_functions = klass.parser_class.NO_PAREN_FUNCTIONS.copy()
+            no_paren_functions.pop(TokenType.LOCALTIME, None)
+            if enum != "oracle":
+                no_paren_functions.pop(TokenType.LOCALTIMESTAMP, None)
+            klass.parser_class.NO_PAREN_FUNCTIONS = no_paren_functions
+
+        if enum in (
+            "",
+            "postgres",
+            "duckdb",
+            "trino",
+        ):
+            no_paren_functions = klass.parser_class.NO_PAREN_FUNCTIONS.copy()
+            no_paren_functions[TokenType.CURRENT_CATALOG] = exp.CurrentCatalog
+            klass.parser_class.NO_PAREN_FUNCTIONS = no_paren_functions
+        else:
+            # For dialects that don't support this keyword, treat it as a regular identifier
+            # This fixes the "Unexpected token" error in BQ, Spark, etc.
+            klass.parser_class.ID_VAR_TOKENS = klass.parser_class.ID_VAR_TOKENS | {
+                TokenType.CURRENT_CATALOG,
+            }
+
+        if enum in (
+            "",
+            "duckdb",
+            "spark",
+            "postgres",
+            "tsql",
+        ):
+            no_paren_functions = klass.parser_class.NO_PAREN_FUNCTIONS.copy()
+            no_paren_functions[TokenType.SESSION_USER] = exp.SessionUser
+            klass.parser_class.NO_PAREN_FUNCTIONS = no_paren_functions
+        else:
+            klass.parser_class.ID_VAR_TOKENS = klass.parser_class.ID_VAR_TOKENS | {
+                TokenType.SESSION_USER,
+            }
+
         klass.VALID_INTERVAL_UNITS = {
             *klass.VALID_INTERVAL_UNITS,
             *klass.DATE_PART_MAPPING.keys(),
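
For illustration (not part of the diff): the blocks above always copy NO_PAREN_FUNCTIONS before mutating it, because the dict is a class attribute shared with the base parser class; the sketch below reproduces that copy-then-reassign pattern with toy classes (names mirror the diff, but nothing here touches sqlglot itself):

    from enum import Enum, auto

    class TokenType(Enum):
        SESSION_USER = auto()
        LOCALTIME = auto()

    class BaseParser:
        # Shared class-level mapping, analogous to Parser.NO_PAREN_FUNCTIONS
        NO_PAREN_FUNCTIONS = {TokenType.LOCALTIME: "CurrentTime"}
        ID_VAR_TOKENS: set = set()

    class DialectParser(BaseParser):
        pass

    enum = "postgres"  # hypothetical dialect name for this sketch
    if enum in ("", "duckdb", "spark", "postgres", "tsql"):
        # Copy first so the base class (and sibling dialects) keep their own mapping
        no_paren_functions = DialectParser.NO_PAREN_FUNCTIONS.copy()
        no_paren_functions[TokenType.SESSION_USER] = "SessionUser"
        DialectParser.NO_PAREN_FUNCTIONS = no_paren_functions
    else:
        # Otherwise SESSION_USER stays parseable as a plain identifier
        DialectParser.ID_VAR_TOKENS = DialectParser.ID_VAR_TOKENS | {TokenType.SESSION_USER}

    assert TokenType.SESSION_USER not in BaseParser.NO_PAREN_FUNCTIONS
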
@@ -460,14 +495,139 @@ class Dialect(metaclass=_Dialect):
     to "WHERE id = 1 GROUP BY id HAVING id = 1"
     """

-    EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = False
+    EXPAND_ONLY_GROUP_ALIAS_REF = False
     """Whether alias reference expansion before qualification should only happen for the GROUP BY clause."""

+    ANNOTATE_ALL_SCOPES = False
+    """Whether to annotate all scopes during optimization. Used by BigQuery for UNNEST support."""
+
+    DISABLES_ALIAS_REF_EXPANSION = False
+    """
+    Whether alias reference expansion is disabled for this dialect.
+
+    Some dialects like Oracle do NOT support referencing aliases in projections or WHERE clauses.
+    The original expression must be repeated instead.
+
+    For example, in Oracle:
+        SELECT y.foo AS bar, bar * 2 AS baz FROM y    -- INVALID
+        SELECT y.foo AS bar, y.foo * 2 AS baz FROM y  -- VALID
+    """
+
+    SUPPORTS_ALIAS_REFS_IN_JOIN_CONDITIONS = False
+    """
+    Whether alias references are allowed in JOIN ... ON clauses.
+
+    Most dialects do not support this, but Snowflake allows alias expansion in the JOIN ... ON
+    clause (and almost everywhere else)
+
+    For example, in Snowflake:
+        SELECT a.id AS user_id FROM a JOIN b ON user_id = b.id  -- VALID
+
+    Reference: https://docs.snowflake.com/en/sql-reference/sql/select#usage-notes
+    """
+
     SUPPORTS_ORDER_BY_ALL = False
     """
     Whether ORDER BY ALL is supported (expands to all the selected columns) as in DuckDB, Spark3/Databricks
     """

+    PROJECTION_ALIASES_SHADOW_SOURCE_NAMES = False
+    """
+    Whether projection alias names can shadow table/source names in GROUP BY and HAVING clauses.
+
+    In BigQuery, when a projection alias has the same name as a source table, the alias takes
+    precedence in GROUP BY and HAVING clauses, and the table becomes inaccessible by that name.
+
+    For example, in BigQuery:
+        SELECT id, ARRAY_AGG(col) AS custom_fields
+        FROM custom_fields
+        GROUP BY id
+        HAVING id >= 1
+
+    The "custom_fields" source is shadowed by the projection alias, so we cannot qualify "id"
+    with "custom_fields" in GROUP BY/HAVING.
+    """
+
+    TABLES_REFERENCEABLE_AS_COLUMNS = False
+    """
+    Whether table names can be referenced as columns (treated as structs).
+
+    BigQuery allows tables to be referenced as columns in queries, automatically treating
+    them as struct values containing all the table's columns.
+
+    For example, in BigQuery:
+        SELECT t FROM my_table AS t  -- Returns entire row as a struct
+    """
+
+    SUPPORTS_STRUCT_STAR_EXPANSION = False
+    """
+    Whether the dialect supports expanding struct fields using star notation (e.g., struct_col.*).
+
+    BigQuery allows struct fields to be expanded with the star operator:
+        SELECT t.struct_col.* FROM table t
+    RisingWave also allows struct field expansion with the star operator using parentheses:
+        SELECT (t.struct_col).* FROM table t
+
+    This expands to all fields within the struct.
+    """
+
+    EXCLUDES_PSEUDOCOLUMNS_FROM_STAR = False
+    """
+    Whether pseudocolumns should be excluded from star expansion (SELECT *).
+
+    Pseudocolumns are special dialect-specific columns (e.g., Oracle's ROWNUM, ROWID, LEVEL,
+    or BigQuery's _PARTITIONTIME, _PARTITIONDATE) that are implicitly available but not part
+    of the table schema. When this is True, SELECT * will not include these pseudocolumns;
+    they must be explicitly selected.
+    """
+
+    QUERY_RESULTS_ARE_STRUCTS = False
+    """
+    Whether query results are typed as structs in metadata for type inference.
+
+    In BigQuery, subqueries store their column types as a STRUCT in metadata,
+    enabling special type inference for ARRAY(SELECT ...) expressions:
+        ARRAY(SELECT x, y FROM t) → ARRAY<STRUCT<...>>
+
+    For single column subqueries, BigQuery unwraps the struct:
+        ARRAY(SELECT x FROM t) → ARRAY<type_of_x>
+
+    This is metadata-only for type inference.
+    """
+
+    REQUIRES_PARENTHESIZED_STRUCT_ACCESS = False
+    """
+    Whether struct field access requires parentheses around the expression.
+
+    RisingWave requires parentheses for struct field access in certain contexts:
+        SELECT (col.field).subfield FROM table  -- Parentheses required
+
+    Without parentheses, the parser may not correctly interpret nested struct access.
+
+    Reference: https://docs.risingwave.com/sql/data-types/struct#retrieve-data-in-a-struct
+    """
+
+    SUPPORTS_NULL_TYPE = False
+    """
+    Whether NULL/VOID is supported as a valid data type (not just a value).
+
+    Databricks and Spark v3+ support NULL as an actual type, allowing expressions like:
+        SELECT NULL AS col  -- Has type NULL, not just value NULL
+        CAST(x AS VOID)     -- Valid type cast
+    """
+
+    COALESCE_COMPARISON_NON_STANDARD = False
+    """
+    Whether COALESCE in comparisons has non-standard NULL semantics.
+
+    We can't convert `COALESCE(x, 1) = 2` into `NOT x IS NULL AND x = 2` for redshift,
+    because they are not always equivalent. For example, if `x` is `NULL` and it comes
+    from a table, then the result is `NULL`, despite `FALSE AND NULL` evaluating to `FALSE`.
+
+    In standard SQL and most dialects, these expressions are equivalent, but Redshift treats
+    table NULLs differently in this context.
+    """
+
     HAS_DISTINCT_ARRAY_CONSTRUCTORS = False
     """
     Whether the ARRAY constructor is context-sensitive, i.e in Redshift ARRAY[1, 2, 3] != ARRAY(1, 2, 3)
@@ -509,6 +669,9 @@ class Dialect(metaclass=_Dialect):
     REGEXP_EXTRACT_DEFAULT_GROUP = 0
     """The default value for the capturing group."""

+    REGEXP_EXTRACT_POSITION_OVERFLOW_RETURNS_NULL = True
+    """Whether REGEXP_EXTRACT returns NULL when the position arg exceeds the string length."""
+
     SET_OP_DISTINCT_BY_DEFAULT: t.Dict[t.Type[exp.Expression], t.Optional[bool]] = {
         exp.Except: True,
         exp.Intersect: True,
@@ -539,6 +702,45 @@ class Dialect(metaclass=_Dialect):
     # STRING type (Snowflake's case) or can be of any type
     TRY_CAST_REQUIRES_STRING: t.Optional[bool] = None

+    # Whether the double negation can be applied
+    # Not safe with MySQL and SQLite due to type coercion (may not return boolean)
+    SAFE_TO_ELIMINATE_DOUBLE_NEGATION = True
+
+    # Whether the INITCAP function supports custom delimiter characters as the second argument
+    # Default delimiter characters for INITCAP function: whitespace and non-alphanumeric characters
+    INITCAP_SUPPORTS_CUSTOM_DELIMITERS = True
+    INITCAP_DEFAULT_DELIMITER_CHARS = " \t\n\r\f\v!\"#$%&'()*+,\\-./:;<=>?@\\[\\]^_`{|}~"
+
+    BYTE_STRING_IS_BYTES_TYPE: bool = False
+    """
+    Whether byte string literals (ex: BigQuery's b'...') are typed as BYTES/BINARY
+    """
+
+    UUID_IS_STRING_TYPE: bool = False
+    """
+    Whether a UUID is considered a string or a UUID type.
+    """
+
+    JSON_EXTRACT_SCALAR_SCALAR_ONLY = False
+    """
+    Whether JSON_EXTRACT_SCALAR returns null if a non-scalar value is selected.
+    """
+
+    DEFAULT_FUNCTIONS_COLUMN_NAMES: t.Dict[t.Type[exp.Func], t.Union[str, t.Tuple[str, ...]]] = {}
+    """
+    Maps function expressions to their default output column name(s).
+
+    For example, in Postgres, generate_series function outputs a column named "generate_series" by default,
+    so we map the ExplodingGenerateSeries expression to "generate_series" string.
+    """
+
+    DEFAULT_NULL_TYPE = exp.DataType.Type.UNKNOWN
+    """
+    The default type of NULL for producing the correct projection type.
+
+    For example, in BigQuery the default type of the NULL value is INT64.
+    """
+
     # --- Autofilled ---

     tokenizer_class = Tokenizer
@@ -600,6 +802,7 @@ class Dialect(metaclass=_Dialect):
         "WEEKDAY_ISO": "DAYOFWEEKISO",
         "DOW_ISO": "DAYOFWEEKISO",
         "DW_ISO": "DAYOFWEEKISO",
+        "DAYOFWEEK_ISO": "DAYOFWEEKISO",
         "DAY OF YEAR": "DAYOFYEAR",
         "DOY": "DAYOFYEAR",
         "DY": "DAYOFYEAR",
@@ -662,232 +865,21 @@ class Dialect(metaclass=_Dialect):
         "DEC": "DECADE",
         "DECS": "DECADE",
         "DECADES": "DECADE",
-        "MIL": "MILLENIUM",
-        "MILS": "MILLENIUM",
-        "MILLENIA": "MILLENIUM",
+        "MIL": "MILLENNIUM",
+        "MILS": "MILLENNIUM",
+        "MILLENIA": "MILLENNIUM",
         "C": "CENTURY",
         "CENT": "CENTURY",
         "CENTS": "CENTURY",
         "CENTURIES": "CENTURY",
     }

-    TYPE_TO_EXPRESSIONS: t.Dict[exp.DataType.Type, t.Set[t.Type[exp.Expression]]] = {
-        exp.DataType.Type.BIGINT: {
-            exp.ApproxDistinct,
-            exp.ArraySize,
-            exp.CountIf,
-            exp.Int64,
-            exp.Length,
-            exp.UnixDate,
-            exp.UnixSeconds,
-            exp.UnixMicros,
-            exp.UnixMillis,
-        },
-        exp.DataType.Type.BINARY: {
-            exp.FromBase32,
-            exp.FromBase64,
-        },
-        exp.DataType.Type.BOOLEAN: {
-            exp.Between,
-            exp.Boolean,
-            exp.Contains,
-            exp.EndsWith,
-            exp.In,
-            exp.LogicalAnd,
-            exp.LogicalOr,
-            exp.RegexpLike,
-            exp.StartsWith,
-        },
-        exp.DataType.Type.DATE: {
-            exp.CurrentDate,
-            exp.Date,
-            exp.DateFromParts,
-            exp.DateStrToDate,
-            exp.DiToDate,
-            exp.LastDay,
-            exp.StrToDate,
-            exp.TimeStrToDate,
-            exp.TsOrDsToDate,
-        },
-        exp.DataType.Type.DATETIME: {
-            exp.CurrentDatetime,
-            exp.Datetime,
-            exp.DatetimeAdd,
-            exp.DatetimeSub,
-        },
-        exp.DataType.Type.DOUBLE: {
-            exp.ApproxQuantile,
-            exp.Avg,
-            exp.Exp,
-            exp.Ln,
-            exp.Log,
-            exp.Pi,
-            exp.Pow,
-            exp.Quantile,
-            exp.Radians,
-            exp.Round,
-            exp.SafeDivide,
-            exp.Sqrt,
-            exp.Stddev,
-            exp.StddevPop,
-            exp.StddevSamp,
-            exp.ToDouble,
-            exp.Variance,
-            exp.VariancePop,
-        },
-        exp.DataType.Type.INT: {
-            exp.Ascii,
-            exp.Ceil,
-            exp.DatetimeDiff,
-            exp.DateDiff,
-            exp.TimestampDiff,
-            exp.TimeDiff,
-            exp.Unicode,
-            exp.DateToDi,
-            exp.Levenshtein,
-            exp.Sign,
-            exp.StrPosition,
-            exp.TsOrDiToDi,
-        },
-        exp.DataType.Type.INTERVAL: {
-            exp.Interval,
-            exp.JustifyDays,
-            exp.JustifyHours,
-            exp.JustifyInterval,
-            exp.MakeInterval,
-        },
-        exp.DataType.Type.JSON: {
-            exp.ParseJSON,
-        },
-        exp.DataType.Type.TIME: {
-            exp.CurrentTime,
-            exp.Time,
-            exp.TimeAdd,
-            exp.TimeSub,
-        },
-        exp.DataType.Type.TIMESTAMPTZ: {
-            exp.CurrentTimestampLTZ,
-        },
-        exp.DataType.Type.TIMESTAMP: {
-            exp.CurrentTimestamp,
-            exp.StrToTime,
-            exp.TimeStrToTime,
-            exp.TimestampAdd,
-            exp.TimestampSub,
-            exp.UnixToTime,
-        },
-        exp.DataType.Type.TINYINT: {
-            exp.Day,
-            exp.Month,
-            exp.Week,
-            exp.Year,
-            exp.Quarter,
-        },
-        exp.DataType.Type.VARCHAR: {
-            exp.ArrayConcat,
-            exp.ArrayToString,
-            exp.Concat,
-            exp.ConcatWs,
-            exp.Chr,
-            exp.DateToDateStr,
-            exp.DPipe,
-            exp.GroupConcat,
-            exp.Initcap,
-            exp.Lower,
-            exp.Substring,
-            exp.String,
-            exp.TimeToStr,
-            exp.TimeToTimeStr,
-            exp.Trim,
-            exp.ToBase32,
-            exp.ToBase64,
-            exp.TsOrDsToDateStr,
-            exp.UnixToStr,
-            exp.UnixToTimeStr,
-            exp.Upper,
-        },
-    }
-
-    ANNOTATORS: AnnotatorsType = {
-        **{
-            expr_type: lambda self, e: self._annotate_unary(e)
-            for expr_type in subclasses(exp.__name__, (exp.Unary, exp.Alias))
-        },
-        **{
-            expr_type: lambda self, e: self._annotate_binary(e)
-            for expr_type in subclasses(exp.__name__, exp.Binary)
-        },
-        **{
-            expr_type: annotate_with_type_lambda(data_type)
-            for data_type, expressions in TYPE_TO_EXPRESSIONS.items()
-            for expr_type in expressions
-        },
-        exp.Abs: lambda self, e: self._annotate_by_args(e, "this"),
-        exp.Anonymous: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.UNKNOWN),
-        exp.Array: lambda self, e: self._annotate_by_args(e, "expressions", array=True),
-        exp.AnyValue: lambda self, e: self._annotate_by_args(e, "this"),
-        exp.ArrayAgg: lambda self, e: self._annotate_by_args(e, "this", array=True),
-        exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
-        exp.ArrayConcatAgg: lambda self, e: self._annotate_by_args(e, "this"),
-        exp.ArrayFirst: lambda self, e: self._annotate_by_array_element(e),
-        exp.ArrayLast: lambda self, e: self._annotate_by_array_element(e),
-        exp.ArrayReverse: lambda self, e: self._annotate_by_args(e, "this"),
-        exp.ArraySlice: lambda self, e: self._annotate_by_args(e, "this"),
-        exp.Bracket: lambda self, e: self._annotate_bracket(e),
-        exp.Cast: lambda self, e: self._annotate_with_type(e, e.args["to"]),
-        exp.Case: lambda self, e: self._annotate_by_args(e, "default", "ifs"),
-        exp.Coalesce: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
-        exp.Count: lambda self, e: self._annotate_with_type(
-            e, exp.DataType.Type.BIGINT if e.args.get("big_int") else exp.DataType.Type.INT
-        ),
-        exp.DataType: lambda self, e: self._annotate_with_type(e, e.copy()),
-        exp.DateAdd: lambda self, e: self._annotate_timeunit(e),
-        exp.DateSub: lambda self, e: self._annotate_timeunit(e),
-        exp.DateTrunc: lambda self, e: self._annotate_timeunit(e),
-        exp.Distinct: lambda self, e: self._annotate_by_args(e, "expressions"),
-        exp.Div: lambda self, e: self._annotate_div(e),
-        exp.Dot: lambda self, e: self._annotate_dot(e),
-        exp.Explode: lambda self, e: self._annotate_explode(e),
-        exp.Extract: lambda self, e: self._annotate_extract(e),
-        exp.Filter: lambda self, e: self._annotate_by_args(e, "this"),
-        exp.GenerateSeries: lambda self, e: self._annotate_by_args(
-            e, "start", "end", "step", array=True
-        ),
-        exp.GenerateDateArray: lambda self, e: self._annotate_with_type(
-            e, exp.DataType.build("ARRAY<DATE>")
-        ),
-        exp.GenerateTimestampArray: lambda self, e: self._annotate_with_type(
-            e, exp.DataType.build("ARRAY<TIMESTAMP>")
-        ),
-        exp.Greatest: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
-        exp.If: lambda self, e: self._annotate_by_args(e, "true", "false"),
-        exp.Least: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
-        exp.Literal: lambda self, e: self._annotate_literal(e),
-        exp.LastValue: lambda self, e: self._annotate_by_args(e, "this"),
-        exp.Map: lambda self, e: self._annotate_map(e),
-        exp.Max: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
-        exp.Min: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
-        exp.Null: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.NULL),
-        exp.Nullif: lambda self, e: self._annotate_by_args(e, "this", "expression"),
-        exp.PropertyEQ: lambda self, e: self._annotate_by_args(e, "expression"),
-        exp.Slice: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.UNKNOWN),
-        exp.Struct: lambda self, e: self._annotate_struct(e),
-        exp.Sum: lambda self, e: self._annotate_by_args(e, "this", "expressions", promote=True),
-        exp.SortArray: lambda self, e: self._annotate_by_args(e, "this"),
-        exp.Timestamp: lambda self, e: self._annotate_with_type(
-            e,
-            exp.DataType.Type.TIMESTAMPTZ if e.args.get("with_tz") else exp.DataType.Type.TIMESTAMP,
-        ),
-        exp.ToMap: lambda self, e: self._annotate_to_map(e),
-        exp.TryCast: lambda self, e: self._annotate_with_type(e, e.args["to"]),
-        exp.Unnest: lambda self, e: self._annotate_unnest(e),
-        exp.VarMap: lambda self, e: self._annotate_map(e),
-        exp.Window: lambda self, e: self._annotate_by_args(e, "this"),
-    }
-
     # Specifies what types a given type can be coerced into
     COERCES_TO: t.Dict[exp.DataType.Type, t.Set[exp.DataType.Type]] = {}

+    # Specifies type inference & validation rules for expressions
+    EXPRESSION_METADATA = EXPRESSION_METADATA.copy()
+
     # Determines the supported Dialect instance settings
     SUPPORTED_SETTINGS = {
         "normalization_strategy",
@@ -967,7 +959,9 @@ class Dialect(metaclass=_Dialect):
         return expression

     def __init__(self, **kwargs) -> None:
-        self.version = Version(kwargs.pop("version", None))
+        parts = str(kwargs.pop("version", sys.maxsize)).split(".")
+        parts.extend(["0"] * (3 - len(parts)))
+        self.version = tuple(int(p) for p in parts[:3])

         normalization_strategy = kwargs.pop("normalization_strategy", None)
         if normalization_strategy is None:
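
For illustration (not part of the diff): with the Version class removed, Dialect.version is now a plain tuple of three ints, padded with zeros, and an unset version falls back to sys.maxsize so it compares greater than any concrete release. A sketch of just that padding logic:

    import sys

    def parse_version(version=sys.maxsize):
        # Mirrors Dialect.__init__ above: no version means "latest engine semantics"
        parts = str(version).split(".")
        parts.extend(["0"] * (3 - len(parts)))
        return tuple(int(p) for p in parts[:3])

    assert parse_version("8.0") == (8, 0, 0)      # two-part versions are zero-padded
    assert parse_version("3.10.2") == (3, 10, 2)
    assert parse_version() > (999, 0, 0)          # unset version behaves as "latest"
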
@@ -1044,42 +1038,50 @@ class Dialect(metaclass=_Dialect):
         )
         return any(unsafe(char) for char in text)

-    def can_identify(self, text: str, identify: str | bool = "safe") -> bool:
-        """Checks if text can be identified given an identify option.
+    def can_quote(self, identifier: exp.Identifier, identify: str | bool = "safe") -> bool:
+        """Checks if an identifier can be quoted

         Args:
-            text: The text to check.
+            identifier: The identifier to check.
             identify:
-                `"always"` or `True`: Always returns `True`.
+                `True`: Always returns `True` except for certain cases.
                 `"safe"`: Only returns `True` if the identifier is case-insensitive.
+                `"unsafe"`: Only returns `True` if the identifier is case-sensitive.

         Returns:
             Whether the given text can be identified.
         """
-        if identify is True or identify == "always":
+        if identifier.quoted:
+            return True
+        if not identify:
+            return False
+        if isinstance(identifier.parent, exp.Func):
+            return False
+        if identify is True:
             return True

+        is_safe = not self.case_sensitive(identifier.this) and bool(
+            exp.SAFE_IDENTIFIER_RE.match(identifier.this)
+        )
+
         if identify == "safe":
-            return not self.case_sensitive(text)
+            return is_safe
+        if identify == "unsafe":
+            return not is_safe

-        return False
+        raise ValueError(f"Unexpected argument for identify: '{identify}'")

     def quote_identifier(self, expression: E, identify: bool = True) -> E:
         """
-        Adds quotes to a given identifier.
+        Adds quotes to a given expression if it is an identifier.

         Args:
             expression: The expression of interest. If it's not an `Identifier`, this method is a no-op.
             identify: If set to `False`, the quotes will only be added if the identifier is deemed
                 "unsafe", with respect to its characters and this dialect's normalization strategy.
         """
-        if isinstance(expression, exp.Identifier) and not isinstance(expression.parent, exp.Func):
-            name = expression.this
-            expression.set(
-                "quoted",
-                identify or self.case_sensitive(name) or not exp.SAFE_IDENTIFIER_RE.match(name),
-            )
-
+        if isinstance(expression, exp.Identifier):
+            expression.set("quoted", self.can_quote(expression, identify or "unsafe"))
         return expression

     def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
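
For illustration (not part of the diff): quote_identifier now routes through can_quote, so identify=False quotes only identifiers that are case-sensitive for the dialect or fail SAFE_IDENTIFIER_RE, while identify=True forces quoting. A hedged usage sketch (dialect and identifier names are arbitrary; the expected output is inferred from the code above, not verified):

    from sqlglot import exp
    from sqlglot.dialects.dialect import Dialect

    dialect = Dialect.get_or_raise("duckdb")

    safe = dialect.quote_identifier(exp.to_identifier("my_col"), identify=False)
    unsafe = dialect.quote_identifier(exp.to_identifier("my col"), identify=False)
    forced = dialect.quote_identifier(exp.to_identifier("my_col"), identify=True)

    # expected: my_col  "my col"  "my_col"
    print(safe.sql(), unsafe.sql(), forced.sql())
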
@@ -1170,11 +1172,11 @@ def arrow_json_extract_sql(self: Generator, expression: JSON_EXTRACT_TYPE) -> str:
     return self.binary(expression, "->" if isinstance(expression, exp.JSONExtract) else "->>")


-def inline_array_sql(self: Generator, expression: exp.Array) -> str:
+def inline_array_sql(self: Generator, expression: exp.Expression) -> str:
     return f"[{self.expressions(expression, dynamic=True, new_line=True, skip_first=True, skip_last=True)}]"


-def inline_array_unless_query(self: Generator, expression: exp.Array) -> str:
+def inline_array_unless_query(self: Generator, expression: exp.Expression) -> str:
     elem = seq_get(expression.expressions, 0)
     if isinstance(elem, exp.Expression) and elem.find(exp.Query):
         return self.func("ARRAY", elem)
@@ -1397,12 +1399,14 @@ def date_add_interval_sql(
     return func


-def timestamptrunc_sql(zone: bool = False) -> t.Callable[[Generator, exp.TimestampTrunc], str]:
+def timestamptrunc_sql(
+    func: str = "DATE_TRUNC", zone: bool = False
+) -> t.Callable[[Generator, exp.TimestampTrunc], str]:
     def _timestamptrunc_sql(self: Generator, expression: exp.TimestampTrunc) -> str:
         args = [unit_to_str(expression), expression.this]
         if zone:
             args.append(expression.args.get("zone"))
-        return self.func("DATE_TRUNC", *args)
+        return self.func(func, *args)

     return _timestamptrunc_sql

@@ -1682,11 +1686,7 @@ def date_delta_to_binary_interval_op(
     def date_delta_to_binary_interval_op_sql(self: Generator, expression: DATETIME_DELTA) -> str:
         this = expression.this
         unit = unit_to_var(expression)
-        op = (
-            "+"
-            if isinstance(expression, (exp.DateAdd, exp.TimeAdd, exp.DatetimeAdd, exp.TsOrDsAdd))
-            else "-"
-        )
+        op = "+" if isinstance(expression, DATETIME_ADD) else "-"

         to_type: t.Optional[exp.DATA_TYPE] = None
         if cast:
@@ -1944,6 +1944,10 @@ def sha256_sql(self: Generator, expression: exp.SHA2) -> str:
     return self.func(f"SHA{expression.text('length') or '256'}", expression.this)


+def sha2_digest_sql(self: Generator, expression: exp.SHA2Digest) -> str:
+    return self.func(f"SHA{expression.text('length') or '256'}", expression.this)
+
+
 def sequence_sql(self: Generator, expression: exp.GenerateSeries | exp.GenerateDateArray) -> str:
     start = expression.args.get("start")
     end = expression.args.get("end")
@@ -1956,22 +1960,76 @@ def sequence_sql(self: Generator, expression: exp.GenerateSeries | exp.GenerateDateArray) -> str:
     else:
         target_type = None

-    if start and end and target_type and target_type.is_type("date", "timestamp"):
-        if isinstance(start, exp.Cast) and target_type is start.to:
-            end = exp.cast(end, target_type)
-        else:
-            start = exp.cast(start, target_type)
+    if start and end:
+        if target_type and target_type.is_type("date", "timestamp"):
+            if isinstance(start, exp.Cast) and target_type is start.to:
+                end = exp.cast(end, target_type)
+            else:
+                start = exp.cast(start, target_type)
+
+        if expression.args.get("is_end_exclusive"):
+            step_value = step or exp.Literal.number(1)
+            end = exp.paren(exp.Sub(this=end, expression=step_value), copy=False)
+
+            sequence_call = exp.Anonymous(
+                this="SEQUENCE", expressions=[e for e in (start, end, step) if e]
+            )
+            zero = exp.Literal.number(0)
+            should_return_empty = exp.or_(
+                exp.EQ(this=step_value.copy(), expression=zero.copy()),
+                exp.and_(
+                    exp.GT(this=step_value.copy(), expression=zero.copy()),
+                    exp.GTE(this=start.copy(), expression=end.copy()),
+                ),
+                exp.and_(
+                    exp.LT(this=step_value.copy(), expression=zero.copy()),
+                    exp.LTE(this=start.copy(), expression=end.copy()),
+                ),
+            )
+            empty_array_or_sequence = exp.If(
+                this=should_return_empty,
+                true=exp.Array(expressions=[]),
+                false=sequence_call,
+            )
+            return self.sql(self._simplify_unless_literal(empty_array_or_sequence))

     return self.func("SEQUENCE", start, end, step)


+def build_like(
+    expr_type: t.Type[E], not_like: bool = False
+) -> t.Callable[[t.List], exp.Expression]:
+    def _builder(args: t.List) -> exp.Expression:
+        like_expr: exp.Expression = expr_type(this=seq_get(args, 0), expression=seq_get(args, 1))
+
+        if escape := seq_get(args, 2):
+            like_expr = exp.Escape(this=like_expr, expression=escape)
+
+        if not_like:
+            like_expr = exp.Not(this=like_expr)
+
+        return like_expr
+
+    return _builder
+
+
 def build_regexp_extract(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
     def _builder(args: t.List, dialect: Dialect) -> E:
+        # The "position" argument specifies the index of the string character to start matching from.
+        # `null_if_pos_overflow` reflects the dialect's behavior when position is greater than the string
+        # length. If true, returns NULL. If false, returns an empty string. `null_if_pos_overflow` is
+        # only needed for exp.RegexpExtract - exp.RegexpExtractAll always returns an empty array if
+        # position overflows.
         return expr_type(
             this=seq_get(args, 0),
             expression=seq_get(args, 1),
             group=seq_get(args, 2) or exp.Literal.number(dialect.REGEXP_EXTRACT_DEFAULT_GROUP),
             parameters=seq_get(args, 3),
+            **(
+                {"null_if_pos_overflow": dialect.REGEXP_EXTRACT_POSITION_OVERFLOW_RETURNS_NULL}
+                if expr_type is exp.RegexpExtract
+                else {}
+            ),
         )

     return _builder
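
For illustration (not part of the diff): build_like is a positional-args builder (haystack, pattern, optional escape) in the style of the parser's FUNCTIONS entries; the sketch below only shows the ASTs it constructs, with arbitrary literals and the default generator:

    from sqlglot import exp
    from sqlglot.dialects.dialect import build_like

    like = build_like(exp.Like)([exp.column("name"), exp.Literal.string("a%")])
    print(like.sql())  # expected: name LIKE 'a%'

    not_ilike = build_like(exp.ILike, not_like=True)(
        [exp.column("name"), exp.Literal.string("a!%"), exp.Literal.string("!")]
    )
    print(not_ilike.sql())  # expected: NOT name ILIKE 'a!%' ESCAPE '!'
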
@@ -2016,12 +2074,14 @@ def groupconcat_sql(
     self: Generator,
     expression: exp.GroupConcat,
     func_name="LISTAGG",
-    sep: str = ",",
+    sep: t.Optional[str] = ",",
     within_group: bool = True,
     on_overflow: bool = False,
 ) -> str:
     this = expression.this
-    separator = self.sql(expression.args.get("separator") or exp.Literal.string(sep))
+    separator = self.sql(
+        expression.args.get("separator") or (exp.Literal.string(sep) if sep else None)
+    )

     on_overflow_sql = self.sql(expression, "on_overflow")
     on_overflow_sql = f" ON OVERFLOW {on_overflow_sql}" if (on_overflow and on_overflow_sql) else ""
@@ -2037,7 +2097,10 @@ groupconcat_sql(
     if order and order.this:
         this = order.this.pop()

-    args = self.format_args(this, f"{separator}{on_overflow_sql}")
+    args = self.format_args(
+        this, f"{separator}{on_overflow_sql}" if separator or on_overflow_sql else None
+    )
+
     listagg: exp.Expression = exp.Anonymous(this=func_name, expressions=[args])

     modifiers = self.sql(limit)
@@ -2075,3 +2138,19 @@ def build_replace_with_optional_replacement(args: t.List) -> exp.Replace:
         expression=seq_get(args, 1),
         replacement=seq_get(args, 2) or exp.Literal.string(""),
     )
+
+
+def regexp_replace_global_modifier(expression: exp.RegexpReplace) -> exp.Expression | None:
+    modifiers = expression.args.get("modifiers")
+    single_replace = expression.args.get("single_replace")
+    occurrence = expression.args.get("occurrence")
+
+    if not single_replace and (not occurrence or (occurrence.is_int and occurrence.to_py() == 0)):
+        if not modifiers or modifiers.is_string:
+            # Append 'g' to the modifiers if they are not provided since
+            # the semantics of REGEXP_REPLACE from the input dialect
+            # is to replace all occurrences of the pattern.
+            value = "" if not modifiers else modifiers.name
+            modifiers = exp.Literal.string(value + "g")
+
+    return modifiers
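
For illustration (not part of the diff): regexp_replace_global_modifier computes the modifiers argument a transpiled REGEXP_REPLACE should carry when the source dialect's semantics are replace-all. A sketch that constructs the nodes by hand (argument names follow the diff; real dialects populate them during parsing):

    from sqlglot import exp
    from sqlglot.dialects.dialect import regexp_replace_global_modifier

    node = exp.RegexpReplace(
        this=exp.column("s"),
        expression=exp.Literal.string("a+"),
        replacement=exp.Literal.string("b"),
    )
    # No occurrence/single_replace args: replace-all semantics, so 'g' is produced.
    print(regexp_replace_global_modifier(node))   # expected: string literal 'g'

    # Existing string modifiers get 'g' appended rather than replaced.
    node.set("modifiers", exp.Literal.string("i"))
    print(regexp_replace_global_modifier(node))   # expected: string literal 'ig'
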