sqlglot 27.29.0__py3-none-any.whl → 28.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. sqlglot/__main__.py +6 -4
  2. sqlglot/_version.py +2 -2
  3. sqlglot/dialects/bigquery.py +116 -295
  4. sqlglot/dialects/clickhouse.py +67 -2
  5. sqlglot/dialects/databricks.py +38 -1
  6. sqlglot/dialects/dialect.py +327 -286
  7. sqlglot/dialects/dremio.py +4 -1
  8. sqlglot/dialects/duckdb.py +718 -22
  9. sqlglot/dialects/exasol.py +243 -10
  10. sqlglot/dialects/hive.py +8 -8
  11. sqlglot/dialects/mysql.py +11 -2
  12. sqlglot/dialects/oracle.py +29 -0
  13. sqlglot/dialects/postgres.py +46 -24
  14. sqlglot/dialects/presto.py +47 -16
  15. sqlglot/dialects/redshift.py +16 -0
  16. sqlglot/dialects/risingwave.py +3 -0
  17. sqlglot/dialects/singlestore.py +12 -3
  18. sqlglot/dialects/snowflake.py +199 -271
  19. sqlglot/dialects/spark.py +2 -2
  20. sqlglot/dialects/spark2.py +11 -48
  21. sqlglot/dialects/sqlite.py +9 -0
  22. sqlglot/dialects/teradata.py +5 -8
  23. sqlglot/dialects/trino.py +6 -0
  24. sqlglot/dialects/tsql.py +61 -25
  25. sqlglot/diff.py +4 -2
  26. sqlglot/errors.py +69 -0
  27. sqlglot/expressions.py +484 -84
  28. sqlglot/generator.py +143 -41
  29. sqlglot/helper.py +2 -2
  30. sqlglot/optimizer/annotate_types.py +247 -140
  31. sqlglot/optimizer/canonicalize.py +6 -1
  32. sqlglot/optimizer/eliminate_joins.py +1 -1
  33. sqlglot/optimizer/eliminate_subqueries.py +2 -2
  34. sqlglot/optimizer/merge_subqueries.py +5 -5
  35. sqlglot/optimizer/normalize.py +20 -13
  36. sqlglot/optimizer/normalize_identifiers.py +17 -3
  37. sqlglot/optimizer/optimizer.py +4 -0
  38. sqlglot/optimizer/pushdown_predicates.py +1 -1
  39. sqlglot/optimizer/qualify.py +14 -6
  40. sqlglot/optimizer/qualify_columns.py +113 -352
  41. sqlglot/optimizer/qualify_tables.py +112 -70
  42. sqlglot/optimizer/resolver.py +374 -0
  43. sqlglot/optimizer/scope.py +27 -16
  44. sqlglot/optimizer/simplify.py +1074 -964
  45. sqlglot/optimizer/unnest_subqueries.py +12 -2
  46. sqlglot/parser.py +276 -160
  47. sqlglot/planner.py +2 -2
  48. sqlglot/schema.py +15 -4
  49. sqlglot/tokens.py +42 -7
  50. sqlglot/transforms.py +77 -22
  51. sqlglot/typing/__init__.py +316 -0
  52. sqlglot/typing/bigquery.py +376 -0
  53. sqlglot/typing/hive.py +12 -0
  54. sqlglot/typing/presto.py +24 -0
  55. sqlglot/typing/snowflake.py +505 -0
  56. sqlglot/typing/spark2.py +58 -0
  57. sqlglot/typing/tsql.py +9 -0
  58. {sqlglot-27.29.0.dist-info → sqlglot-28.4.1.dist-info}/METADATA +2 -2
  59. sqlglot-28.4.1.dist-info/RECORD +92 -0
  60. sqlglot-27.29.0.dist-info/RECORD +0 -84
  61. {sqlglot-27.29.0.dist-info → sqlglot-28.4.1.dist-info}/WHEEL +0 -0
  62. {sqlglot-27.29.0.dist-info → sqlglot-28.4.1.dist-info}/licenses/LICENSE +0 -0
  63. {sqlglot-27.29.0.dist-info → sqlglot-28.4.1.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from decimal import Decimal
4
+ from itertools import groupby
3
5
  import re
4
6
  import typing as t
5
7
 
@@ -9,7 +11,6 @@ from sqlglot.dialects.dialect import (
9
11
  Dialect,
10
12
  JSON_EXTRACT_TYPE,
11
13
  NormalizationStrategy,
12
- Version,
13
14
  approx_count_distinct_sql,
14
15
  arrow_json_extract_sql,
15
16
  binary_from_function,
@@ -22,7 +23,6 @@ from sqlglot.dialects.dialect import (
22
23
  no_datetime_sql,
23
24
  encode_decode_sql,
24
25
  build_formatted_time,
25
- inline_array_unless_query,
26
26
  no_comment_column_constraint_sql,
27
27
  no_time_sql,
28
28
  no_timestamp_sql,
@@ -31,7 +31,6 @@ from sqlglot.dialects.dialect import (
31
31
  remove_from_array_using_filter,
32
32
  strposition_sql,
33
33
  str_to_time_sql,
34
- timestamptrunc_sql,
35
34
  timestrtotime_sql,
36
35
  unit_to_str,
37
36
  sha256_sql,
@@ -39,10 +38,12 @@ from sqlglot.dialects.dialect import (
39
38
  explode_to_unnest_sql,
40
39
  no_make_interval_sql,
41
40
  groupconcat_sql,
41
+ inline_array_unless_query,
42
42
  regexp_replace_global_modifier,
43
+ sha2_digest_sql,
43
44
  )
44
45
  from sqlglot.generator import unsupported_args
45
- from sqlglot.helper import seq_get
46
+ from sqlglot.helper import is_date_unit, seq_get
46
47
  from sqlglot.tokens import TokenType
47
48
  from sqlglot.parser import binary_range_parser
48
49
 
@@ -50,6 +51,86 @@ from sqlglot.parser import binary_range_parser
50
51
  # The pattern matches timezone offsets that appear after the time portion
51
52
  TIMEZONE_PATTERN = re.compile(r":\d{2}.*?[+\-]\d{2}(?::\d{2})?")
52
53
 
54
+ # Characters that must be escaped when building regex expressions in INITCAP
55
+ REGEX_ESCAPE_REPLACEMENTS = {
56
+ "\\": "\\\\",
57
+ "-": r"\-",
58
+ "^": r"\^",
59
+ "[": r"\[",
60
+ "]": r"\]",
61
+ }
62
+
63
+ # Used in RANDSTR transpilation
64
+ RANDSTR_CHAR_POOL = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
65
+ RANDSTR_SEED = 123456
66
+
67
+ # Whitespace control characters that DuckDB must process with `CHR({val})` calls
68
+ WS_CONTROL_CHARS_TO_DUCK = {
69
+ "\u000b": 11,
70
+ "\u001c": 28,
71
+ "\u001d": 29,
72
+ "\u001e": 30,
73
+ "\u001f": 31,
74
+ }
75
+
76
+ # Days of week to ISO 8601 day-of-week numbers
77
+ # ISO 8601 standard: Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6, Sunday=7
78
+ WEEK_START_DAY_TO_DOW = {
79
+ "MONDAY": 1,
80
+ "TUESDAY": 2,
81
+ "WEDNESDAY": 3,
82
+ "THURSDAY": 4,
83
+ "FRIDAY": 5,
84
+ "SATURDAY": 6,
85
+ "SUNDAY": 7,
86
+ }
87
+
88
+ MAX_BIT_POSITION = exp.Literal.number(32768)
89
+
90
+
91
+ def _to_boolean_sql(self: DuckDB.Generator, expression: exp.ToBoolean) -> str:
92
+ """
93
+ Transpile TO_BOOLEAN function from Snowflake to DuckDB equivalent.
94
+
95
+ DuckDB's CAST to BOOLEAN supports most of Snowflake's TO_BOOLEAN strings except 'on'/'off'.
96
+ We need to handle the 'on'/'off' cases explicitly, plus NaN/INF error cases.
97
+
98
+ In Snowflake, NaN and INF values cause errors. We use DuckDB's native ERROR()
99
+ function to replicate this behavior with a clear error message.
100
+ """
101
+ arg = expression.this
102
+
103
+ cast_to_real = exp.func("TRY_CAST", arg, exp.DataType.build("REAL"))
104
+
105
+ # Check for NaN and INF values
106
+ nan_inf_check = exp.Or(
107
+ this=exp.func("ISNAN", cast_to_real), expression=exp.func("ISINF", cast_to_real)
108
+ )
109
+
110
+ case_expr = (
111
+ exp.case()
112
+ .when(
113
+ nan_inf_check,
114
+ exp.func(
115
+ "ERROR",
116
+ exp.Literal.string("TO_BOOLEAN: Non-numeric values NaN and INF are not supported"),
117
+ ),
118
+ )
119
+ # Handle 'on' -> TRUE (case insensitive) - only for string literals
120
+ .when(
121
+ exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("ON")),
122
+ exp.true(),
123
+ )
124
+ # Handle 'off' -> FALSE (case insensitive) - only for string literals
125
+ .when(
126
+ exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("OFF")),
127
+ exp.false(),
128
+ )
129
+ .else_(exp.cast(arg, exp.DataType.Type.BOOLEAN))
130
+ )
131
+
132
+ return self.sql(case_expr)
133
+
53
134
 
54
135
  # BigQuery -> DuckDB conversion for the DATE function
55
136
  def _date_sql(self: DuckDB.Generator, expression: exp.Date) -> str:
@@ -231,9 +312,86 @@ def _implicit_datetime_cast(
231
312
  return arg
232
313
 
233
314
 
315
+ def _week_unit_to_dow(unit: t.Optional[exp.Expression]) -> t.Optional[int]:
316
+ """
317
+ Compute the Monday-based day shift to align DATE_DIFF('WEEK', ...) coming
318
+ from other dialects, e.g. BigQuery's WEEK(<day>) or ISOWEEK unit parts.
319
+
320
+ Args:
321
+ unit: The unit expression (Var for ISOWEEK or WeekStart)
322
+
323
+ Returns:
324
+ The ISO 8601 day number (Monday=1, Sunday=7 etc) or None if not a week unit or if day is dynamic (not a constant).
325
+
326
+ Examples:
327
+ "WEEK(SUNDAY)" -> 7
328
+ "WEEK(MONDAY)" -> 1
329
+ "ISOWEEK" -> 1
330
+ """
331
+ # Handle plain Var expressions for ISOWEEK only
332
+ if isinstance(unit, exp.Var) and unit.name.upper() in "ISOWEEK":
333
+ return 1
334
+
335
+ # Handle WeekStart expressions with explicit day
336
+ if isinstance(unit, exp.WeekStart):
337
+ return WEEK_START_DAY_TO_DOW.get(unit.name.upper())
338
+
339
+ return None
340
+
341
+
342
+ def _build_week_trunc_expression(date_expr: exp.Expression, start_dow: int) -> exp.Expression:
343
+ """
344
+ Build DATE_TRUNC expression for week boundaries with custom start day.
345
+
346
+ Args:
347
+ date_expr: The date expression to truncate
348
+ start_dow: ISO 8601 day-of-week number (Monday=1, ..., Sunday=7)
349
+
350
+ DuckDB's DATE_TRUNC('WEEK', ...) aligns weeks to Monday (ISO standard).
351
+ To align to a different start day, we shift the date before truncating.
352
+
353
+ Shift formula: Sunday (7) gets +1, others get (1 - start_dow)
354
+ Examples:
355
+ Monday (1): shift = 0 (no shift needed)
356
+ Tuesday (2): shift = -1 (shift back 1 day) ...
357
+ Sunday (7): shift = +1 (shift forward 1 day, wraps to next Monday-based week)
358
+ """
359
+ shift_days = 1 if start_dow == 7 else 1 - start_dow
360
+
361
+ # Shift date to align week boundaries with the desired start day
362
+ # No shift needed for Monday-based weeks (shift_days == 0)
363
+ shifted_date = (
364
+ exp.DateAdd(
365
+ this=date_expr,
366
+ expression=exp.Interval(this=exp.Literal.string(str(shift_days)), unit=exp.var("DAY")),
367
+ )
368
+ if shift_days != 0
369
+ else date_expr
370
+ )
371
+
372
+ return exp.DateTrunc(unit=exp.var("WEEK"), this=shifted_date)
373
+
374
+
234
375
  def _date_diff_sql(self: DuckDB.Generator, expression: exp.DateDiff) -> str:
235
376
  this = _implicit_datetime_cast(expression.this)
236
377
  expr = _implicit_datetime_cast(expression.expression)
378
+ unit = expression.args.get("unit")
379
+
380
+ # DuckDB's WEEK diff does not respect Monday crossing (week boundaries), it checks (end_day - start_day) / 7:
381
+ # SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-17' AS DATE)) --> 0 (Monday crossed)
382
+ # SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-20' AS DATE)) --> 1 (7 days difference)
383
+ # Whereas for other units such as MONTH it does respect month boundaries:
384
+ # SELECT DATE_DIFF('MONTH', CAST('2024-11-30' AS DATE), CAST('2024-12-01' AS DATE)) --> 1 (Month crossed)
385
+ date_part_boundary = expression.args.get("date_part_boundary")
386
+
387
+ # Extract week start day; returns None if day is dynamic (column/placeholder)
388
+ week_start = _week_unit_to_dow(unit)
389
+ if date_part_boundary and week_start and this and expr:
390
+ expression.set("unit", exp.Literal.string("WEEK"))
391
+
392
+ # Truncate both dates to week boundaries to respect input dialect semantics
393
+ this = _build_week_trunc_expression(this, week_start)
394
+ expr = _build_week_trunc_expression(expr, week_start)
237
395
 
238
396
  return self.func("DATE_DIFF", unit_to_str(expression), expr, this)
239
397
 
@@ -268,6 +426,228 @@ def _json_extract_value_array_sql(
268
426
  return self.sql(exp.cast(json_extract, to=exp.DataType.build(data_type)))
269
427
 
270
428
 
429
+ def _cast_to_varchar(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
430
+ if arg and arg.type and not arg.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN):
431
+ return exp.cast(arg, exp.DataType.Type.VARCHAR)
432
+ return arg
433
+
434
+
435
+ def _is_binary(arg: exp.Expression) -> bool:
436
+ return arg.is_type(
437
+ exp.DataType.Type.BINARY,
438
+ exp.DataType.Type.VARBINARY,
439
+ exp.DataType.Type.BLOB,
440
+ )
441
+
442
+
443
+ def _gen_with_cast_to_blob(
444
+ self: DuckDB.Generator, expression: exp.Expression, result_sql: str
445
+ ) -> str:
446
+ if _is_binary(expression):
447
+ blob = exp.DataType.build("BLOB", dialect="duckdb")
448
+ result_sql = self.sql(exp.Cast(this=result_sql, to=blob))
449
+ return result_sql
450
+
451
+
452
+ def _cast_to_bit(arg: exp.Expression) -> exp.Expression:
453
+ if not _is_binary(arg):
454
+ return arg
455
+
456
+ if isinstance(arg, exp.HexString):
457
+ arg = exp.Unhex(this=exp.Literal.string(arg.this))
458
+
459
+ return exp.cast(arg, exp.DataType.Type.BIT)
460
+
461
+
462
+ def _prepare_binary_bitwise_args(expression: exp.Binary) -> None:
463
+ if _is_binary(expression.this):
464
+ expression.set("this", _cast_to_bit(expression.this))
465
+ if _is_binary(expression.expression):
466
+ expression.set("expression", _cast_to_bit(expression.expression))
467
+
468
+
469
+ def _anyvalue_sql(self: DuckDB.Generator, expression: exp.AnyValue) -> str:
470
+ # Transform ANY_VALUE(expr HAVING MAX/MIN having_expr) to ARG_MAX_NULL/ARG_MIN_NULL
471
+ having = expression.this
472
+ if isinstance(having, exp.HavingMax):
473
+ func_name = "ARG_MAX_NULL" if having.args.get("max") else "ARG_MIN_NULL"
474
+ return self.func(func_name, having.this, having.expression)
475
+ return self.function_fallback_sql(expression)
476
+
477
+
478
+ def _literal_sql_with_ws_chr(self: DuckDB.Generator, literal: str) -> str:
479
+ # DuckDB does not support \uXXXX escapes, so we must use CHR() instead of replacing them directly
480
+ if not any(ch in WS_CONTROL_CHARS_TO_DUCK for ch in literal):
481
+ return self.sql(exp.Literal.string(literal))
482
+
483
+ sql_segments: t.List[str] = []
484
+ for is_ws_control, group in groupby(literal, key=lambda ch: ch in WS_CONTROL_CHARS_TO_DUCK):
485
+ if is_ws_control:
486
+ for ch in group:
487
+ duckdb_char_code = WS_CONTROL_CHARS_TO_DUCK[ch]
488
+ sql_segments.append(self.func("CHR", exp.Literal.number(str(duckdb_char_code))))
489
+ else:
490
+ sql_segments.append(self.sql(exp.Literal.string("".join(group))))
491
+
492
+ sql = " || ".join(sql_segments)
493
+ return sql if len(sql_segments) == 1 else f"({sql})"
494
+
495
+
496
+ def _escape_regex_metachars(
497
+ self: DuckDB.Generator, delimiters: t.Optional[exp.Expression], delimiters_sql: str
498
+ ) -> str:
499
+ r"""
500
+ Escapes regex metacharacters \ - ^ [ ] for use in character classes regex expressions.
501
+
502
+ Literal strings are escaped at transpile time, expressions handled with REPLACE() calls.
503
+ """
504
+ if not delimiters:
505
+ return delimiters_sql
506
+
507
+ if delimiters.is_string:
508
+ literal_value = delimiters.this
509
+ escaped_literal = "".join(REGEX_ESCAPE_REPLACEMENTS.get(ch, ch) for ch in literal_value)
510
+ return _literal_sql_with_ws_chr(self, escaped_literal)
511
+
512
+ escaped_sql = delimiters_sql
513
+ for raw, escaped in REGEX_ESCAPE_REPLACEMENTS.items():
514
+ escaped_sql = self.func(
515
+ "REPLACE",
516
+ escaped_sql,
517
+ self.sql(exp.Literal.string(raw)),
518
+ self.sql(exp.Literal.string(escaped)),
519
+ )
520
+
521
+ return escaped_sql
522
+
523
+
524
+ def _build_capitalization_sql(
525
+ self: DuckDB.Generator,
526
+ value_to_split: str,
527
+ delimiters_sql: str,
528
+ ) -> str:
529
+ # empty string delimiter --> treat value as one word, no need to split
530
+ if delimiters_sql == "''":
531
+ return f"UPPER(LEFT({value_to_split}, 1)) || LOWER(SUBSTRING({value_to_split}, 2))"
532
+
533
+ delim_regex_sql = f"CONCAT('[', {delimiters_sql}, ']')"
534
+ split_regex_sql = f"CONCAT('([', {delimiters_sql}, ']+|[^', {delimiters_sql}, ']+)')"
535
+
536
+ # REGEXP_EXTRACT_ALL produces a list of string segments, alternating between delimiter and non-delimiter segments.
537
+ # We do not know whether the first segment is a delimiter or not, so we check the first character of the string
538
+ # with REGEXP_MATCHES. If the first char is a delimiter, we capitalize even list indexes, otherwise capitalize odd.
539
+ return self.func(
540
+ "ARRAY_TO_STRING",
541
+ exp.case()
542
+ .when(
543
+ f"REGEXP_MATCHES(LEFT({value_to_split}, 1), {delim_regex_sql})",
544
+ self.func(
545
+ "LIST_TRANSFORM",
546
+ self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
547
+ "(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
548
+ ),
549
+ )
550
+ .else_(
551
+ self.func(
552
+ "LIST_TRANSFORM",
553
+ self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
554
+ "(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
555
+ ),
556
+ ),
557
+ "''",
558
+ )
559
+
560
+
561
+ def _initcap_sql(self: DuckDB.Generator, expression: exp.Initcap) -> str:
562
+ this_sql = self.sql(expression, "this")
563
+ delimiters = expression.args.get("expression")
564
+ if delimiters is None:
565
+ # fallback for manually created exp.Initcap w/o delimiters arg
566
+ delimiters = exp.Literal.string(self.dialect.INITCAP_DEFAULT_DELIMITER_CHARS)
567
+ delimiters_sql = self.sql(delimiters)
568
+
569
+ escaped_delimiters_sql = _escape_regex_metachars(self, delimiters, delimiters_sql)
570
+
571
+ return _build_capitalization_sql(self, this_sql, escaped_delimiters_sql)
572
+
573
+
574
+ def _floor_sql(self: DuckDB.Generator, expression: exp.Floor) -> str:
575
+ decimals = expression.args.get("decimals")
576
+
577
+ if decimals is not None and expression.args.get("to") is None:
578
+ this = expression.this
579
+ if isinstance(this, exp.Binary):
580
+ this = exp.Paren(this=this)
581
+
582
+ n_int = decimals
583
+ if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
584
+ n_int = exp.cast(decimals, exp.DataType.Type.INT)
585
+
586
+ pow_ = exp.Pow(this=exp.Literal.number("10"), expression=n_int)
587
+ floored = exp.Floor(this=exp.Mul(this=this, expression=pow_))
588
+ result = exp.Div(this=floored, expression=pow_.copy())
589
+
590
+ return self.round_sql(
591
+ exp.Round(this=result, decimals=decimals, casts_non_integer_decimals=True)
592
+ )
593
+
594
+ return self.ceil_floor(expression)
595
+
596
+
597
+ def _regr_val_sql(
598
+ self: DuckDB.Generator,
599
+ expression: exp.RegrValx | exp.RegrValy,
600
+ ) -> str:
601
+ """
602
+ Transpile Snowflake's REGR_VALX/REGR_VALY to DuckDB equivalent.
603
+
604
+ REGR_VALX(y, x) returns NULL if y is NULL; otherwise returns x.
605
+ REGR_VALY(y, x) returns NULL if x is NULL; otherwise returns y.
606
+ """
607
+ from sqlglot.optimizer.annotate_types import annotate_types
608
+
609
+ y = expression.this
610
+ x = expression.expression
611
+
612
+ # Determine which argument to check for NULL and which to return based on expression type
613
+ if isinstance(expression, exp.RegrValx):
614
+ # REGR_VALX: check y for NULL, return x
615
+ check_for_null = y
616
+ return_value = x
617
+ return_value_attr = "expression"
618
+ else:
619
+ # REGR_VALY: check x for NULL, return y
620
+ check_for_null = x
621
+ return_value = y
622
+ return_value_attr = "this"
623
+
624
+ # Get the type from the return argument
625
+ result_type = return_value.type
626
+
627
+ # If no type info, annotate the expression to infer types
628
+ if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
629
+ try:
630
+ annotated = annotate_types(expression.copy(), dialect=self.dialect)
631
+ result_type = getattr(annotated, return_value_attr).type
632
+ except Exception:
633
+ pass
634
+
635
+ # Default to DOUBLE for regression functions if type still unknown
636
+ if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
637
+ result_type = exp.DataType.build("DOUBLE")
638
+
639
+ # Cast NULL to the same type as return_value to avoid DuckDB type inference issues
640
+ typed_null = exp.Cast(this=exp.Null(), to=result_type)
641
+
642
+ return self.sql(
643
+ exp.If(
644
+ this=exp.Is(this=check_for_null.copy(), expression=exp.Null()),
645
+ true=typed_null,
646
+ false=return_value.copy(),
647
+ )
648
+ )
649
+
650
+
271
651
  class DuckDB(Dialect):
272
652
  NULL_ORDERING = "nulls_are_last"
273
653
  SUPPORTS_USER_DEFINED_TYPES = True
@@ -286,8 +666,13 @@ class DuckDB(Dialect):
286
666
  **Dialect.DATE_PART_MAPPING,
287
667
  "DAYOFWEEKISO": "ISODOW",
288
668
  }
669
+
289
670
  DATE_PART_MAPPING.pop("WEEKDAY")
290
671
 
672
+ INVERSE_TIME_MAPPING = {
673
+ "%e": "%-d", # BigQuery's space-padded day (%e) -> DuckDB's no-padding day (%-d)
674
+ }
675
+
291
676
  def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
292
677
  if isinstance(path, exp.Literal):
293
678
  # DuckDB also supports the JSON pointer syntax, where every path starts with a `/`.
@@ -323,7 +708,9 @@ class DuckDB(Dialect):
323
708
  "DETACH": TokenType.DETACH,
324
709
  "FORCE": TokenType.FORCE,
325
710
  "INSTALL": TokenType.INSTALL,
711
+ "INT8": TokenType.BIGINT,
326
712
  "LOGICAL": TokenType.BOOLEAN,
713
+ "MACRO": TokenType.FUNCTION,
327
714
  "ONLY": TokenType.ONLY,
328
715
  "PIVOT_WIDER": TokenType.PIVOT,
329
716
  "POSITIONAL": TokenType.POSITIONAL,
@@ -580,7 +967,7 @@ class DuckDB(Dialect):
580
967
  ) -> t.Optional[exp.Expression]:
581
968
  bracket = super()._parse_bracket(this)
582
969
 
583
- if self.dialect.version < Version("1.2.0") and isinstance(bracket, exp.Bracket):
970
+ if self.dialect.version < (1, 2) and isinstance(bracket, exp.Bracket):
584
971
  # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
585
972
  bracket.set("returns_list_for_maps", True)
586
973
 
@@ -638,11 +1025,9 @@ class DuckDB(Dialect):
638
1025
  def _parse_install(self, force: bool = False) -> exp.Install:
639
1026
  return self.expression(
640
1027
  exp.Install,
641
- **{ # type: ignore
642
- "this": self._parse_id_var(),
643
- "from": self._parse_var_or_string() if self._match(TokenType.FROM) else None,
644
- "force": force,
645
- },
1028
+ this=self._parse_id_var(),
1029
+ from_=self._parse_var_or_string() if self._match(TokenType.FROM) else None,
1030
+ force=force,
646
1031
  )
647
1032
 
648
1033
  def _parse_primary(self) -> t.Optional[exp.Expression]:
@@ -680,11 +1065,16 @@ class DuckDB(Dialect):
680
1065
  ARRAY_SIZE_DIM_REQUIRED = False
681
1066
  NORMALIZE_EXTRACT_DATE_PARTS = True
682
1067
  SUPPORTS_LIKE_QUANTIFIERS = False
1068
+ SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True
683
1069
 
684
1070
  TRANSFORMS = {
685
1071
  **generator.Generator.TRANSFORMS,
1072
+ exp.AnyValue: _anyvalue_sql,
686
1073
  exp.ApproxDistinct: approx_count_distinct_sql,
687
- exp.Array: inline_array_unless_query,
1074
+ exp.Array: transforms.preprocess(
1075
+ [transforms.inherit_struct_field_names],
1076
+ generator=inline_array_unless_query,
1077
+ ),
688
1078
  exp.ArrayFilter: rename_func("LIST_FILTER"),
689
1079
  exp.ArrayRemove: remove_from_array_using_filter,
690
1080
  exp.ArraySort: _array_sort_sql,
@@ -692,9 +1082,10 @@ class DuckDB(Dialect):
692
1082
  exp.ArrayUniqueAgg: lambda self, e: self.func(
693
1083
  "LIST", exp.Distinct(expressions=[e.this])
694
1084
  ),
1085
+ exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"),
695
1086
  exp.BitwiseAndAgg: rename_func("BIT_AND"),
1087
+ exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"),
696
1088
  exp.BitwiseOrAgg: rename_func("BIT_OR"),
697
- exp.BitwiseXor: rename_func("XOR"),
698
1089
  exp.BitwiseXorAgg: rename_func("BIT_XOR"),
699
1090
  exp.CommentColumnConstraint: no_comment_column_constraint_sql,
700
1091
  exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"),
@@ -729,17 +1120,20 @@ class DuckDB(Dialect):
729
1120
  exp.IntDiv: lambda self, e: self.binary(e, "//"),
730
1121
  exp.IsInf: rename_func("ISINF"),
731
1122
  exp.IsNan: rename_func("ISNAN"),
1123
+ exp.Floor: _floor_sql,
732
1124
  exp.JSONBExists: rename_func("JSON_EXISTS"),
733
1125
  exp.JSONExtract: _arrow_json_extract_sql,
734
1126
  exp.JSONExtractArray: _json_extract_value_array_sql,
735
- exp.JSONExtractScalar: _arrow_json_extract_sql,
736
1127
  exp.JSONFormat: _json_format_sql,
737
1128
  exp.JSONValueArray: _json_extract_value_array_sql,
738
1129
  exp.Lateral: explode_to_unnest_sql,
739
1130
  exp.LogicalOr: rename_func("BOOL_OR"),
740
1131
  exp.LogicalAnd: rename_func("BOOL_AND"),
741
1132
  exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "),
1133
+ exp.Initcap: _initcap_sql,
742
1134
  exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)),
1135
+ exp.SHA1Digest: lambda self, e: self.func("UNHEX", self.func("SHA1", e.this)),
1136
+ exp.SHA2Digest: lambda self, e: self.func("UNHEX", sha2_digest_sql(self, e)),
743
1137
  exp.MonthsBetween: lambda self, e: self.func(
744
1138
  "DATEDIFF",
745
1139
  "'month'",
@@ -763,6 +1157,8 @@ class DuckDB(Dialect):
763
1157
  "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i")
764
1158
  ),
765
1159
  exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"),
1160
+ exp.RegrValx: _regr_val_sql,
1161
+ exp.RegrValy: _regr_val_sql,
766
1162
  exp.Return: lambda self, e: self.sql(e, "this"),
767
1163
  exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "",
768
1164
  exp.Rand: rename_func("RANDOM"),
@@ -786,13 +1182,13 @@ class DuckDB(Dialect):
786
1182
  "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this
787
1183
  ),
788
1184
  exp.TimestampSub: date_delta_to_binary_interval_op(),
789
- exp.TimestampTrunc: timestamptrunc_sql(),
790
1185
  exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DataType.Type.DATE)),
791
1186
  exp.TimeStrToTime: timestrtotime_sql,
792
1187
  exp.TimeStrToUnix: lambda self, e: self.func(
793
1188
  "EPOCH", exp.cast(e.this, exp.DataType.Type.TIMESTAMP)
794
1189
  ),
795
1190
  exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)),
1191
+ exp.ToBoolean: _to_boolean_sql,
796
1192
  exp.TimeToUnix: rename_func("EPOCH"),
797
1193
  exp.TsOrDiToDi: lambda self,
798
1194
  e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)",
@@ -804,6 +1200,12 @@ class DuckDB(Dialect):
804
1200
  exp.cast(e.this, exp.DataType.Type.TIMESTAMP),
805
1201
  ),
806
1202
  exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)),
1203
+ exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)),
1204
+ exp.UnixSeconds: lambda self, e: self.sql(
1205
+ exp.cast(
1206
+ self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DataType.Type.BIGINT
1207
+ )
1208
+ ),
807
1209
  exp.UnixToStr: lambda self, e: self.func(
808
1210
  "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e)
809
1211
  ),
@@ -836,6 +1238,7 @@ class DuckDB(Dialect):
836
1238
  exp.DataType.Type.BPCHAR: "TEXT",
837
1239
  exp.DataType.Type.CHAR: "TEXT",
838
1240
  exp.DataType.Type.DATETIME: "TIMESTAMP",
1241
+ exp.DataType.Type.DECFLOAT: "DECIMAL(38, 5)",
839
1242
  exp.DataType.Type.FLOAT: "REAL",
840
1243
  exp.DataType.Type.JSONB: "JSON",
841
1244
  exp.DataType.Type.NCHAR: "TEXT",
@@ -848,6 +1251,7 @@ class DuckDB(Dialect):
848
1251
  exp.DataType.Type.TIMESTAMP_S: "TIMESTAMP_S",
849
1252
  exp.DataType.Type.TIMESTAMP_MS: "TIMESTAMP_MS",
850
1253
  exp.DataType.Type.TIMESTAMP_NS: "TIMESTAMP_NS",
1254
+ exp.DataType.Type.BIGDECIMAL: "DECIMAL(38, 5)",
851
1255
  }
852
1256
 
853
1257
  # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77
@@ -955,6 +1359,135 @@ class DuckDB(Dialect):
955
1359
  exp.NthValue,
956
1360
  )
957
1361
 
1362
+ def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str:
1363
+ """
1364
+ Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression.
1365
+
1366
+ Snowflake's BITMAP_BIT_POSITION behavior:
1367
+ - For n <= 0: returns ABS(n) % 32768
1368
+ - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767)
1369
+ """
1370
+ this = expression.this
1371
+
1372
+ return self.sql(
1373
+ exp.Mod(
1374
+ this=exp.Paren(
1375
+ this=exp.If(
1376
+ this=exp.GT(this=this, expression=exp.Literal.number(0)),
1377
+ true=this - exp.Literal.number(1),
1378
+ false=exp.Abs(this=this),
1379
+ )
1380
+ ),
1381
+ expression=MAX_BIT_POSITION,
1382
+ )
1383
+ )
1384
+
1385
+ def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str:
1386
+ """
1387
+ Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random.
1388
+
1389
+ RANDSTR(length, generator) generates a random string of specified length.
1390
+ - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result)
1391
+ - With RANDOM(): Use RANDOM() in the hash for non-deterministic output
1392
+ - No generator: Use default seed value
1393
+ """
1394
+ length = expression.this
1395
+ generator = expression.args.get("generator")
1396
+
1397
+ if generator:
1398
+ if isinstance(generator, exp.Rand):
1399
+ # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself
1400
+ seed_value = generator.this or generator
1401
+ else:
1402
+ # Const/int or other expression - use as seed directly
1403
+ seed_value = generator
1404
+ else:
1405
+ # No generator specified, use default seed (arbitrary but deterministic)
1406
+ seed_value = exp.Literal.number(RANDSTR_SEED)
1407
+
1408
+ length_sql = self.sql(length)
1409
+ seed_sql = self.sql(seed_value)
1410
+
1411
+ query: exp.Select = exp.maybe_parse(
1412
+ f"""
1413
+ SELECT LISTAGG(
1414
+ SUBSTRING(
1415
+ '{RANDSTR_CHAR_POOL}',
1416
+ 1 + CAST(FLOOR(random_value * 62) AS INT),
1417
+ 1
1418
+ ),
1419
+ ''
1420
+ )
1421
+ FROM (
1422
+ SELECT (ABS(HASH(i + {seed_sql})) % 1000) / 1000.0 AS random_value
1423
+ FROM RANGE({length_sql}) AS t(i)
1424
+ )
1425
+ """,
1426
+ dialect="duckdb",
1427
+ )
1428
+ return f"({self.sql(query)})"
1429
+
1430
+ def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str:
1431
+ """
1432
+ TO_BINARY(value, format) transpilation if the return type is BINARY:
1433
+ - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50')
1434
+ - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST')
1435
+ - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==')
1436
+
1437
+ format can be 'HEX', 'UTF-8' or 'BASE64'
1438
+ return type can be either VARCHAR or BINARY
1439
+ """
1440
+ value = expression.this
1441
+ format_arg = expression.args.get("format")
1442
+
1443
+ fmt = "HEX"
1444
+ if format_arg:
1445
+ fmt = format_arg.name.upper()
1446
+
1447
+ if expression.is_type(exp.DataType.Type.BINARY):
1448
+ if fmt == "UTF-8":
1449
+ return self.func("ENCODE", value)
1450
+ if fmt == "BASE64":
1451
+ return self.func("FROM_BASE64", value)
1452
+
1453
+ # Hex
1454
+ return self.func("UNHEX", value)
1455
+
1456
+ # Fallback, which needs to be updated if we want to support transpilation from other dialects than Snowflake
1457
+ return self.func("TO_BINARY", value)
1458
+
1459
+ def _greatest_least_sql(
1460
+ self: DuckDB.Generator, expression: exp.Greatest | exp.Least
1461
+ ) -> str:
1462
+ """
1463
+ Handle GREATEST/LEAST functions with dialect-aware NULL behavior.
1464
+
1465
+ - If null_if_any_null=True (BigQuery-style): return NULL if any argument is NULL
1466
+ - If null_if_any_null=False (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value
1467
+ """
1468
+ # Get all arguments
1469
+ all_args = [expression.this, *expression.expressions]
1470
+ fallback_sql = self.function_fallback_sql(expression)
1471
+
1472
+ if expression.args.get("null_if_any_null"):
1473
+ # BigQuery behavior: NULL if any argument is NULL
1474
+ case_expr = exp.case().when(
1475
+ exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False),
1476
+ exp.null(),
1477
+ copy=False,
1478
+ )
1479
+ case_expr.set("default", fallback_sql)
1480
+ return self.sql(case_expr)
1481
+
1482
+ # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs)
1483
+ return self.sql(fallback_sql)
1484
+
1485
+ def greatest_sql(self: DuckDB.Generator, expression: exp.Greatest) -> str:
1486
+ return self._greatest_least_sql(expression)
1487
+
1488
+ def least_sql(self: DuckDB.Generator, expression: exp.Least) -> str:
1489
+ return self._greatest_least_sql(expression)
1490
+
958
1491
  def lambda_sql(
959
1492
  self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True
960
1493
  ) -> str:
@@ -974,10 +1507,16 @@ class DuckDB(Dialect):
974
1507
  def install_sql(self, expression: exp.Install) -> str:
975
1508
  force = "FORCE " if expression.args.get("force") else ""
976
1509
  this = self.sql(expression, "this")
977
- from_clause = expression.args.get("from")
1510
+ from_clause = expression.args.get("from_")
978
1511
  from_clause = f" FROM {from_clause}" if from_clause else ""
979
1512
  return f"{force}INSTALL {this}{from_clause}"
980
1513
 
1514
+ def approxtopk_sql(self, expression: exp.ApproxTopK) -> str:
1515
+ self.unsupported(
1516
+ "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. "
1517
+ )
1518
+ return self.function_fallback_sql(expression)
1519
+
981
1520
  def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str:
982
1521
  return self.sql(exp.cast(expression.this, exp.DataType.Type.TIMESTAMPTZ))
983
1522
 
@@ -1084,14 +1623,14 @@ class DuckDB(Dialect):
1084
1623
  return self.function_fallback_sql(expression)
1085
1624
 
1086
1625
  def countif_sql(self, expression: exp.CountIf) -> str:
1087
- if self.dialect.version >= Version("1.2"):
1626
+ if self.dialect.version >= (1, 2):
1088
1627
  return self.function_fallback_sql(expression)
1089
1628
 
1090
1629
  # https://github.com/tobymao/sqlglot/pull/4749
1091
1630
  return count_if_to_sum(self, expression)
1092
1631
 
1093
1632
  def bracket_sql(self, expression: exp.Bracket) -> str:
1094
- if self.dialect.version >= Version("1.2"):
1633
+ if self.dialect.version >= (1, 2):
1095
1634
  return super().bracket_sql(expression)
1096
1635
 
1097
1636
  # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
@@ -1158,6 +1697,33 @@ class DuckDB(Dialect):
1158
1697
 
1159
1698
  return self.sql(case)
1160
1699
 
1700
+ def lower_sql(self, expression: exp.Lower) -> str:
1701
+ result_sql = self.func("LOWER", _cast_to_varchar(expression.this))
1702
+ return _gen_with_cast_to_blob(self, expression, result_sql)
1703
+
1704
+ def upper_sql(self, expression: exp.Upper) -> str:
1705
+ result_sql = self.func("UPPER", _cast_to_varchar(expression.this))
1706
+ return _gen_with_cast_to_blob(self, expression, result_sql)
1707
+
1708
+ def replace_sql(self, expression: exp.Replace) -> str:
1709
+ result_sql = self.func(
1710
+ "REPLACE",
1711
+ _cast_to_varchar(expression.this),
1712
+ _cast_to_varchar(expression.expression),
1713
+ _cast_to_varchar(expression.args.get("replacement")),
1714
+ )
1715
+ return _gen_with_cast_to_blob(self, expression, result_sql)
1716
+
1717
+ def _bitwise_op(self, expression: exp.Binary, op: str) -> str:
1718
+ _prepare_binary_bitwise_args(expression)
1719
+ result_sql = self.binary(expression, op)
1720
+ return _gen_with_cast_to_blob(self, expression, result_sql)
1721
+
1722
+ def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str:
1723
+ _prepare_binary_bitwise_args(expression)
1724
+ result_sql = self.func("XOR", expression.this, expression.expression)
1725
+ return _gen_with_cast_to_blob(self, expression, result_sql)
1726
+
1161
1727
  def objectinsert_sql(self, expression: exp.ObjectInsert) -> str:
1162
1728
  this = expression.this
1163
1729
  key = expression.args.get("key")
@@ -1173,6 +1739,13 @@ class DuckDB(Dialect):
1173
1739
 
1174
1740
  return self.func("STRUCT_INSERT", this, kv_sql)
1175
1741
 
1742
+ def startswith_sql(self, expression: exp.StartsWith) -> str:
1743
+ return self.func(
1744
+ "STARTS_WITH",
1745
+ _cast_to_varchar(expression.this),
1746
+ _cast_to_varchar(expression.expression),
1747
+ )
1748
+
1176
1749
  def unnest_sql(self, expression: exp.Unnest) -> str:
1177
1750
  explode_array = expression.args.get("explode_array")
1178
1751
  if explode_array:
@@ -1206,7 +1779,7 @@ class DuckDB(Dialect):
1206
1779
  if isinstance(this, exp.First):
1207
1780
  this = exp.AnyValue(this=this.this)
1208
1781
 
1209
- if not isinstance(this, exp.AnyValue):
1782
+ if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)):
1210
1783
  self.unsupported("IGNORE NULLS is not supported for non-window functions.")
1211
1784
 
1212
1785
  return self.sql(this)
@@ -1229,10 +1802,19 @@ class DuckDB(Dialect):
1229
1802
 
1230
1803
  return self.func("ARRAY_TO_STRING", this, expression.expression)
1231
1804
 
1232
- @unsupported_args("position", "occurrence")
1233
1805
  def regexpextract_sql(self, expression: exp.RegexpExtract) -> str:
1806
+ this = expression.this
1234
1807
  group = expression.args.get("group")
1235
1808
  params = expression.args.get("parameters")
1809
+ position = expression.args.get("position")
1810
+ occurrence = expression.args.get("occurrence")
1811
+ null_if_pos_overflow = expression.args.get("null_if_pos_overflow")
1812
+
1813
+ if position and (not position.is_int or position.to_py() > 1):
1814
+ this = exp.Substring(this=this, start=position)
1815
+
1816
+ if null_if_pos_overflow:
1817
+ this = exp.Nullif(this=this, expression=exp.Literal.string(""))
1236
1818
 
1237
1819
  # Do not render group if there is no following argument,
1238
1820
  # and it's the default value for this dialect
@@ -1242,9 +1824,15 @@ class DuckDB(Dialect):
1242
1824
  and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP)
1243
1825
  ):
1244
1826
  group = None
1245
- return self.func(
1246
- "REGEXP_EXTRACT", expression.this, expression.expression, group, params
1247
- )
1827
+
1828
+ if occurrence and (not occurrence.is_int or occurrence.to_py() > 1):
1829
+ return self.func(
1830
+ "ARRAY_EXTRACT",
1831
+ self.func("REGEXP_EXTRACT_ALL", this, expression.expression, group, params),
1832
+ exp.Literal.number(occurrence),
1833
+ )
1834
+
1835
+ return self.func("REGEXP_EXTRACT", this, expression.expression, group, params)
1248
1836
 
1249
1837
  @unsupported_args("culture")
1250
1838
  def numbertostr_sql(self, expression: exp.NumberToStr) -> str:
@@ -1347,3 +1935,111 @@ class DuckDB(Dialect):
1347
1935
  to_hex = exp.cast(self.func("TO_HEX", from_hex), exp.DataType.Type.BLOB)
1348
1936
 
1349
1937
  return self.sql(to_hex)
1938
+
1939
+ def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str:
1940
+ unit = unit_to_str(expression)
1941
+ zone = expression.args.get("zone")
1942
+ timestamp = expression.this
1943
+
1944
+ if is_date_unit(unit) and zone:
1945
+ # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC.
1946
+ # Double AT TIME ZONE needed for BigQuery compatibility:
1947
+ # 1. First AT TIME ZONE: ensures truncation happens in the target timezone
1948
+ # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component)
1949
+ timestamp = exp.AtTimeZone(this=timestamp, zone=zone)
1950
+ result_sql = self.func("DATE_TRUNC", unit, timestamp)
1951
+ return self.sql(exp.AtTimeZone(this=result_sql, zone=zone))
1952
+
1953
+ return self.func("DATE_TRUNC", unit, timestamp)
1954
+
1955
+ def trim_sql(self, expression: exp.Trim) -> str:
1956
+ result_sql = self.func(
1957
+ "TRIM",
1958
+ _cast_to_varchar(expression.this),
1959
+ _cast_to_varchar(expression.expression),
1960
+ )
1961
+ return _gen_with_cast_to_blob(self, expression, result_sql)
1962
+
1963
+ def round_sql(self, expression: exp.Round) -> str:
1964
+ this = expression.this
1965
+ decimals = expression.args.get("decimals")
1966
+ truncate = expression.args.get("truncate")
1967
+
1968
+ # DuckDB requires the scale (decimals) argument to be an INT
1969
+ # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally
1970
+ if decimals is not None and expression.args.get("casts_non_integer_decimals"):
1971
+ if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
1972
+ decimals = exp.cast(decimals, exp.DataType.Type.INT)
1973
+
1974
+ func = "ROUND"
1975
+ if truncate:
1976
+ # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN
1977
+ if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"):
1978
+ func = "ROUND_EVEN"
1979
+ truncate = None
1980
+ # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO
1981
+ elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"):
1982
+ truncate = None
1983
+
1984
+ return self.func(func, this, decimals, truncate)
1985
+
1986
+ def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
1987
+ """
1988
+ BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
1989
+ dividing the input distribution into n equal-sized buckets.
1990
+
1991
+ Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery
1992
+ does not document the specific algorithm used so results may differ. DuckDB does not
1993
+ support RESPECT NULLS.
1994
+ """
1995
+ this = expression.this
1996
+ if isinstance(this, exp.Distinct):
1997
+ # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both
1998
+ if len(this.expressions) < 2:
1999
+ self.unsupported("APPROX_QUANTILES requires a bucket count argument")
2000
+ return self.function_fallback_sql(expression)
2001
+ num_quantiles_expr = this.expressions[1].pop()
2002
+ else:
2003
+ num_quantiles_expr = expression.expression
2004
+
2005
+ if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
2006
+ self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
2007
+ return self.function_fallback_sql(expression)
2008
+
2009
+ num_quantiles = t.cast(int, num_quantiles_expr.to_py())
2010
+ if num_quantiles <= 0:
2011
+ self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
2012
+ return self.function_fallback_sql(expression)
2013
+
2014
+ quantiles = [
2015
+ exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
2016
+ for i in range(num_quantiles + 1)
2017
+ ]
2018
+
2019
+ return self.sql(
2020
+ exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))
2021
+ )
2022
+
2023
+ def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str:
2024
+ if expression.args.get("scalar_only"):
2025
+ expression = exp.JSONExtractScalar(
2026
+ this=rename_func("JSON_VALUE")(self, expression), expression="'$'"
2027
+ )
2028
+ return _arrow_json_extract_sql(self, expression)
2029
+
2030
+ def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str:
2031
+ this = expression.this
2032
+
2033
+ if _is_binary(this):
2034
+ expression.type = exp.DataType.build("BINARY")
2035
+
2036
+ arg = _cast_to_bit(this)
2037
+
2038
+ if isinstance(this, exp.Neg):
2039
+ arg = exp.Paren(this=arg)
2040
+
2041
+ expression.set("this", arg)
2042
+
2043
+ result_sql = f"~{self.sql(expression, 'this')}"
2044
+
2045
+ return _gen_with_cast_to_blob(self, expression, result_sql)