sqlglot 27.27.0__py3-none-any.whl → 28.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. sqlglot/__init__.py +1 -0
  2. sqlglot/__main__.py +6 -4
  3. sqlglot/_version.py +2 -2
  4. sqlglot/dialects/bigquery.py +118 -279
  5. sqlglot/dialects/clickhouse.py +73 -5
  6. sqlglot/dialects/databricks.py +38 -1
  7. sqlglot/dialects/dialect.py +354 -275
  8. sqlglot/dialects/dremio.py +4 -1
  9. sqlglot/dialects/duckdb.py +754 -25
  10. sqlglot/dialects/exasol.py +243 -10
  11. sqlglot/dialects/hive.py +8 -8
  12. sqlglot/dialects/mysql.py +14 -4
  13. sqlglot/dialects/oracle.py +29 -0
  14. sqlglot/dialects/postgres.py +60 -26
  15. sqlglot/dialects/presto.py +47 -16
  16. sqlglot/dialects/redshift.py +16 -0
  17. sqlglot/dialects/risingwave.py +3 -0
  18. sqlglot/dialects/singlestore.py +12 -3
  19. sqlglot/dialects/snowflake.py +239 -218
  20. sqlglot/dialects/spark.py +15 -4
  21. sqlglot/dialects/spark2.py +11 -48
  22. sqlglot/dialects/sqlite.py +10 -0
  23. sqlglot/dialects/starrocks.py +3 -0
  24. sqlglot/dialects/teradata.py +5 -8
  25. sqlglot/dialects/trino.py +6 -0
  26. sqlglot/dialects/tsql.py +61 -22
  27. sqlglot/diff.py +4 -2
  28. sqlglot/errors.py +69 -0
  29. sqlglot/executor/__init__.py +5 -10
  30. sqlglot/executor/python.py +1 -29
  31. sqlglot/expressions.py +637 -100
  32. sqlglot/generator.py +160 -43
  33. sqlglot/helper.py +2 -44
  34. sqlglot/lineage.py +10 -4
  35. sqlglot/optimizer/annotate_types.py +247 -140
  36. sqlglot/optimizer/canonicalize.py +6 -1
  37. sqlglot/optimizer/eliminate_joins.py +1 -1
  38. sqlglot/optimizer/eliminate_subqueries.py +2 -2
  39. sqlglot/optimizer/merge_subqueries.py +5 -5
  40. sqlglot/optimizer/normalize.py +20 -13
  41. sqlglot/optimizer/normalize_identifiers.py +17 -3
  42. sqlglot/optimizer/optimizer.py +4 -0
  43. sqlglot/optimizer/pushdown_predicates.py +1 -1
  44. sqlglot/optimizer/qualify.py +18 -10
  45. sqlglot/optimizer/qualify_columns.py +122 -275
  46. sqlglot/optimizer/qualify_tables.py +128 -76
  47. sqlglot/optimizer/resolver.py +374 -0
  48. sqlglot/optimizer/scope.py +27 -16
  49. sqlglot/optimizer/simplify.py +1075 -959
  50. sqlglot/optimizer/unnest_subqueries.py +12 -2
  51. sqlglot/parser.py +296 -170
  52. sqlglot/planner.py +2 -2
  53. sqlglot/schema.py +15 -4
  54. sqlglot/tokens.py +42 -7
  55. sqlglot/transforms.py +77 -22
  56. sqlglot/typing/__init__.py +316 -0
  57. sqlglot/typing/bigquery.py +376 -0
  58. sqlglot/typing/hive.py +12 -0
  59. sqlglot/typing/presto.py +24 -0
  60. sqlglot/typing/snowflake.py +505 -0
  61. sqlglot/typing/spark2.py +58 -0
  62. sqlglot/typing/tsql.py +9 -0
  63. {sqlglot-27.27.0.dist-info → sqlglot-28.4.0.dist-info}/METADATA +2 -2
  64. sqlglot-28.4.0.dist-info/RECORD +92 -0
  65. sqlglot-27.27.0.dist-info/RECORD +0 -84
  66. {sqlglot-27.27.0.dist-info → sqlglot-28.4.0.dist-info}/WHEEL +0 -0
  67. {sqlglot-27.27.0.dist-info → sqlglot-28.4.0.dist-info}/licenses/LICENSE +0 -0
  68. {sqlglot-27.27.0.dist-info → sqlglot-28.4.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from decimal import Decimal
4
+ from itertools import groupby
5
+ import re
3
6
  import typing as t
4
7
 
5
8
  from sqlglot import exp, generator, parser, tokens, transforms
@@ -8,7 +11,6 @@ from sqlglot.dialects.dialect import (
8
11
  Dialect,
9
12
  JSON_EXTRACT_TYPE,
10
13
  NormalizationStrategy,
11
- Version,
12
14
  approx_count_distinct_sql,
13
15
  arrow_json_extract_sql,
14
16
  binary_from_function,
@@ -21,7 +23,6 @@ from sqlglot.dialects.dialect import (
21
23
  no_datetime_sql,
22
24
  encode_decode_sql,
23
25
  build_formatted_time,
24
- inline_array_unless_query,
25
26
  no_comment_column_constraint_sql,
26
27
  no_time_sql,
27
28
  no_timestamp_sql,
@@ -30,7 +31,6 @@ from sqlglot.dialects.dialect import (
30
31
  remove_from_array_using_filter,
31
32
  strposition_sql,
32
33
  str_to_time_sql,
33
- timestamptrunc_sql,
34
34
  timestrtotime_sql,
35
35
  unit_to_str,
36
36
  sha256_sql,
@@ -38,12 +38,99 @@ from sqlglot.dialects.dialect import (
38
38
  explode_to_unnest_sql,
39
39
  no_make_interval_sql,
40
40
  groupconcat_sql,
41
+ inline_array_unless_query,
42
+ regexp_replace_global_modifier,
43
+ sha2_digest_sql,
41
44
  )
42
45
  from sqlglot.generator import unsupported_args
43
- from sqlglot.helper import seq_get
46
+ from sqlglot.helper import is_date_unit, seq_get
44
47
  from sqlglot.tokens import TokenType
45
48
  from sqlglot.parser import binary_range_parser
46
49
 
50
+ # Regex to detect time zones in timestamps of the form [+|-]TT[:tt]
51
+ # The pattern matches timezone offsets that appear after the time portion
52
+ TIMEZONE_PATTERN = re.compile(r":\d{2}.*?[+\-]\d{2}(?::\d{2})?")
53
+
54
+ # Characters that must be escaped when building regex expressions in INITCAP
55
+ REGEX_ESCAPE_REPLACEMENTS = {
56
+ "\\": "\\\\",
57
+ "-": r"\-",
58
+ "^": r"\^",
59
+ "[": r"\[",
60
+ "]": r"\]",
61
+ }
62
+
63
+ # Used in RANDSTR transpilation
64
+ RANDSTR_CHAR_POOL = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
65
+ RANDSTR_SEED = 123456
66
+
67
+ # Whitespace control characters that DuckDB must process with `CHR({val})` calls
68
+ WS_CONTROL_CHARS_TO_DUCK = {
69
+ "\u000b": 11,
70
+ "\u001c": 28,
71
+ "\u001d": 29,
72
+ "\u001e": 30,
73
+ "\u001f": 31,
74
+ }
75
+
76
+ # Days of week to ISO 8601 day-of-week numbers
77
+ # ISO 8601 standard: Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6, Sunday=7
78
+ WEEK_START_DAY_TO_DOW = {
79
+ "MONDAY": 1,
80
+ "TUESDAY": 2,
81
+ "WEDNESDAY": 3,
82
+ "THURSDAY": 4,
83
+ "FRIDAY": 5,
84
+ "SATURDAY": 6,
85
+ "SUNDAY": 7,
86
+ }
87
+
88
+ MAX_BIT_POSITION = exp.Literal.number(32768)
89
+
90
+
91
+ def _to_boolean_sql(self: DuckDB.Generator, expression: exp.ToBoolean) -> str:
92
+ """
93
+ Transpile TO_BOOLEAN function from Snowflake to DuckDB equivalent.
94
+
95
+ DuckDB's CAST to BOOLEAN supports most of Snowflake's TO_BOOLEAN strings except 'on'/'off'.
96
+ We need to handle the 'on'/'off' cases explicitly, plus NaN/INF error cases.
97
+
98
+ In Snowflake, NaN and INF values cause errors. We use DuckDB's native ERROR()
99
+ function to replicate this behavior with a clear error message.
100
+ """
101
+ arg = expression.this
102
+
103
+ cast_to_real = exp.func("TRY_CAST", arg, exp.DataType.build("REAL"))
104
+
105
+ # Check for NaN and INF values
106
+ nan_inf_check = exp.Or(
107
+ this=exp.func("ISNAN", cast_to_real), expression=exp.func("ISINF", cast_to_real)
108
+ )
109
+
110
+ case_expr = (
111
+ exp.case()
112
+ .when(
113
+ nan_inf_check,
114
+ exp.func(
115
+ "ERROR",
116
+ exp.Literal.string("TO_BOOLEAN: Non-numeric values NaN and INF are not supported"),
117
+ ),
118
+ )
119
+ # Handle 'on' -> TRUE (case insensitive) - only for string literals
120
+ .when(
121
+ exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("ON")),
122
+ exp.true(),
123
+ )
124
+ # Handle 'off' -> FALSE (case insensitive) - only for string literals
125
+ .when(
126
+ exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("OFF")),
127
+ exp.false(),
128
+ )
129
+ .else_(exp.cast(arg, exp.DataType.Type.BOOLEAN))
130
+ )
131
+
132
+ return self.sql(case_expr)
133
+
47
134
 
48
135
  # BigQuery -> DuckDB conversion for the DATE function
49
136
  def _date_sql(self: DuckDB.Generator, expression: exp.Date) -> str:
@@ -211,12 +298,100 @@ def _arrow_json_extract_sql(self: DuckDB.Generator, expression: JSON_EXTRACT_TYP
211
298
  def _implicit_datetime_cast(
212
299
  arg: t.Optional[exp.Expression], type: exp.DataType.Type = exp.DataType.Type.DATE
213
300
  ) -> t.Optional[exp.Expression]:
214
- return exp.cast(arg, type) if isinstance(arg, exp.Literal) else arg
301
+ if isinstance(arg, exp.Literal) and arg.is_string:
302
+ ts = arg.name
303
+ if type == exp.DataType.Type.DATE and ":" in ts:
304
+ type = (
305
+ exp.DataType.Type.TIMESTAMPTZ
306
+ if TIMEZONE_PATTERN.search(ts)
307
+ else exp.DataType.Type.TIMESTAMP
308
+ )
309
+
310
+ arg = exp.cast(arg, type)
311
+
312
+ return arg
313
+
314
+
315
+ def _week_unit_to_dow(unit: t.Optional[exp.Expression]) -> t.Optional[int]:
316
+ """
317
+ Compute the Monday-based day shift to align DATE_DIFF('WEEK', ...) coming
318
+ from other dialects, e.g. BigQuery's WEEK(<day>) or ISOWEEK unit parts.
319
+
320
+ Args:
321
+ unit: The unit expression (Var for ISOWEEK or WeekStart)
322
+
323
+ Returns:
324
+ The ISO 8601 day number (Monday=1, Sunday=7 etc) or None if not a week unit or if day is dynamic (not a constant).
325
+
326
+ Examples:
327
+ "WEEK(SUNDAY)" -> 7
328
+ "WEEK(MONDAY)" -> 1
329
+ "ISOWEEK" -> 1
330
+ """
331
+ # Handle plain Var expressions for ISOWEEK only
332
+ if isinstance(unit, exp.Var) and unit.name.upper() in "ISOWEEK":
333
+ return 1
334
+
335
+ # Handle WeekStart expressions with explicit day
336
+ if isinstance(unit, exp.WeekStart):
337
+ return WEEK_START_DAY_TO_DOW.get(unit.name.upper())
338
+
339
+ return None
340
+
341
+
342
+ def _build_week_trunc_expression(date_expr: exp.Expression, start_dow: int) -> exp.Expression:
343
+ """
344
+ Build DATE_TRUNC expression for week boundaries with custom start day.
345
+
346
+ Args:
347
+ date_expr: The date expression to truncate
348
+ start_dow: ISO 8601 day-of-week number (Monday=1, ..., Sunday=7)
349
+
350
+ DuckDB's DATE_TRUNC('WEEK', ...) aligns weeks to Monday (ISO standard).
351
+ To align to a different start day, we shift the date before truncating.
352
+
353
+ Shift formula: Sunday (7) gets +1, others get (1 - start_dow)
354
+ Examples:
355
+ Monday (1): shift = 0 (no shift needed)
356
+ Tuesday (2): shift = -1 (shift back 1 day) ...
357
+ Sunday (7): shift = +1 (shift forward 1 day, wraps to next Monday-based week)
358
+ """
359
+ shift_days = 1 if start_dow == 7 else 1 - start_dow
360
+
361
+ # Shift date to align week boundaries with the desired start day
362
+ # No shift needed for Monday-based weeks (shift_days == 0)
363
+ shifted_date = (
364
+ exp.DateAdd(
365
+ this=date_expr,
366
+ expression=exp.Interval(this=exp.Literal.string(str(shift_days)), unit=exp.var("DAY")),
367
+ )
368
+ if shift_days != 0
369
+ else date_expr
370
+ )
371
+
372
+ return exp.DateTrunc(unit=exp.var("WEEK"), this=shifted_date)
215
373
 
216
374
 
217
375
  def _date_diff_sql(self: DuckDB.Generator, expression: exp.DateDiff) -> str:
218
376
  this = _implicit_datetime_cast(expression.this)
219
377
  expr = _implicit_datetime_cast(expression.expression)
378
+ unit = expression.args.get("unit")
379
+
380
+ # DuckDB's WEEK diff does not respect Monday crossing (week boundaries), it checks (end_day - start_day) / 7:
381
+ # SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-17' AS DATE)) --> 0 (Monday crossed)
382
+ # SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-20' AS DATE)) --> 1 (7 days difference)
383
+ # Whereas for other units such as MONTH it does respect month boundaries:
384
+ # SELECT DATE_DIFF('MONTH', CAST('2024-11-30' AS DATE), CAST('2024-12-01' AS DATE)) --> 1 (Month crossed)
385
+ date_part_boundary = expression.args.get("date_part_boundary")
386
+
387
+ # Extract week start day; returns None if day is dynamic (column/placeholder)
388
+ week_start = _week_unit_to_dow(unit)
389
+ if date_part_boundary and week_start and this and expr:
390
+ expression.set("unit", exp.Literal.string("WEEK"))
391
+
392
+ # Truncate both dates to week boundaries to respect input dialect semantics
393
+ this = _build_week_trunc_expression(this, week_start)
394
+ expr = _build_week_trunc_expression(expr, week_start)
220
395
 
221
396
  return self.func("DATE_DIFF", unit_to_str(expression), expr, this)
222
397
 
@@ -251,6 +426,228 @@ def _json_extract_value_array_sql(
251
426
  return self.sql(exp.cast(json_extract, to=exp.DataType.build(data_type)))
252
427
 
253
428
 
429
+ def _cast_to_varchar(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
430
+ if arg and arg.type and not arg.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN):
431
+ return exp.cast(arg, exp.DataType.Type.VARCHAR)
432
+ return arg
433
+
434
+
435
+ def _is_binary(arg: exp.Expression) -> bool:
436
+ return arg.is_type(
437
+ exp.DataType.Type.BINARY,
438
+ exp.DataType.Type.VARBINARY,
439
+ exp.DataType.Type.BLOB,
440
+ )
441
+
442
+
443
+ def _gen_with_cast_to_blob(
444
+ self: DuckDB.Generator, expression: exp.Expression, result_sql: str
445
+ ) -> str:
446
+ if _is_binary(expression):
447
+ blob = exp.DataType.build("BLOB", dialect="duckdb")
448
+ result_sql = self.sql(exp.Cast(this=result_sql, to=blob))
449
+ return result_sql
450
+
451
+
452
+ def _cast_to_bit(arg: exp.Expression) -> exp.Expression:
453
+ if not _is_binary(arg):
454
+ return arg
455
+
456
+ if isinstance(arg, exp.HexString):
457
+ arg = exp.Unhex(this=exp.Literal.string(arg.this))
458
+
459
+ return exp.cast(arg, exp.DataType.Type.BIT)
460
+
461
+
462
+ def _prepare_binary_bitwise_args(expression: exp.Binary) -> None:
463
+ if _is_binary(expression.this):
464
+ expression.set("this", _cast_to_bit(expression.this))
465
+ if _is_binary(expression.expression):
466
+ expression.set("expression", _cast_to_bit(expression.expression))
467
+
468
+
469
+ def _anyvalue_sql(self: DuckDB.Generator, expression: exp.AnyValue) -> str:
470
+ # Transform ANY_VALUE(expr HAVING MAX/MIN having_expr) to ARG_MAX_NULL/ARG_MIN_NULL
471
+ having = expression.this
472
+ if isinstance(having, exp.HavingMax):
473
+ func_name = "ARG_MAX_NULL" if having.args.get("max") else "ARG_MIN_NULL"
474
+ return self.func(func_name, having.this, having.expression)
475
+ return self.function_fallback_sql(expression)
476
+
477
+
478
+ def _literal_sql_with_ws_chr(self: DuckDB.Generator, literal: str) -> str:
479
+ # DuckDB does not support \uXXXX escapes, so we must use CHR() instead of replacing them directly
480
+ if not any(ch in WS_CONTROL_CHARS_TO_DUCK for ch in literal):
481
+ return self.sql(exp.Literal.string(literal))
482
+
483
+ sql_segments: t.List[str] = []
484
+ for is_ws_control, group in groupby(literal, key=lambda ch: ch in WS_CONTROL_CHARS_TO_DUCK):
485
+ if is_ws_control:
486
+ for ch in group:
487
+ duckdb_char_code = WS_CONTROL_CHARS_TO_DUCK[ch]
488
+ sql_segments.append(self.func("CHR", exp.Literal.number(str(duckdb_char_code))))
489
+ else:
490
+ sql_segments.append(self.sql(exp.Literal.string("".join(group))))
491
+
492
+ sql = " || ".join(sql_segments)
493
+ return sql if len(sql_segments) == 1 else f"({sql})"
494
+
495
+
496
+ def _escape_regex_metachars(
497
+ self: DuckDB.Generator, delimiters: t.Optional[exp.Expression], delimiters_sql: str
498
+ ) -> str:
499
+ r"""
500
+ Escapes the regex metacharacters \ - ^ [ ] for use inside regex character classes.
501
+
502
+ Literal strings are escaped at transpile time, expressions handled with REPLACE() calls.
503
+ """
504
+ if not delimiters:
505
+ return delimiters_sql
506
+
507
+ if delimiters.is_string:
508
+ literal_value = delimiters.this
509
+ escaped_literal = "".join(REGEX_ESCAPE_REPLACEMENTS.get(ch, ch) for ch in literal_value)
510
+ return _literal_sql_with_ws_chr(self, escaped_literal)
511
+
512
+ escaped_sql = delimiters_sql
513
+ for raw, escaped in REGEX_ESCAPE_REPLACEMENTS.items():
514
+ escaped_sql = self.func(
515
+ "REPLACE",
516
+ escaped_sql,
517
+ self.sql(exp.Literal.string(raw)),
518
+ self.sql(exp.Literal.string(escaped)),
519
+ )
520
+
521
+ return escaped_sql
522
+
523
+
524
+ def _build_capitalization_sql(
525
+ self: DuckDB.Generator,
526
+ value_to_split: str,
527
+ delimiters_sql: str,
528
+ ) -> str:
529
+ # empty string delimiter --> treat value as one word, no need to split
530
+ if delimiters_sql == "''":
531
+ return f"UPPER(LEFT({value_to_split}, 1)) || LOWER(SUBSTRING({value_to_split}, 2))"
532
+
533
+ delim_regex_sql = f"CONCAT('[', {delimiters_sql}, ']')"
534
+ split_regex_sql = f"CONCAT('([', {delimiters_sql}, ']+|[^', {delimiters_sql}, ']+)')"
535
+
536
+ # REGEXP_EXTRACT_ALL produces a list of string segments, alternating between delimiter and non-delimiter segments.
537
+ # We do not know whether the first segment is a delimiter or not, so we check the first character of the string
538
+ # with REGEXP_MATCHES. If the first char is a delimiter, we capitalize even list indexes, otherwise capitalize odd.
539
+ return self.func(
540
+ "ARRAY_TO_STRING",
541
+ exp.case()
542
+ .when(
543
+ f"REGEXP_MATCHES(LEFT({value_to_split}, 1), {delim_regex_sql})",
544
+ self.func(
545
+ "LIST_TRANSFORM",
546
+ self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
547
+ "(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
548
+ ),
549
+ )
550
+ .else_(
551
+ self.func(
552
+ "LIST_TRANSFORM",
553
+ self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
554
+ "(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
555
+ ),
556
+ ),
557
+ "''",
558
+ )
559
+
560
+
561
+ def _initcap_sql(self: DuckDB.Generator, expression: exp.Initcap) -> str:
562
+ this_sql = self.sql(expression, "this")
563
+ delimiters = expression.args.get("expression")
564
+ if delimiters is None:
565
+ # fallback for manually created exp.Initcap w/o delimiters arg
566
+ delimiters = exp.Literal.string(self.dialect.INITCAP_DEFAULT_DELIMITER_CHARS)
567
+ delimiters_sql = self.sql(delimiters)
568
+
569
+ escaped_delimiters_sql = _escape_regex_metachars(self, delimiters, delimiters_sql)
570
+
571
+ return _build_capitalization_sql(self, this_sql, escaped_delimiters_sql)
572
+
573
+
574
+ def _floor_sql(self: DuckDB.Generator, expression: exp.Floor) -> str:
575
+ decimals = expression.args.get("decimals")
576
+
577
+ if decimals is not None and expression.args.get("to") is None:
578
+ this = expression.this
579
+ if isinstance(this, exp.Binary):
580
+ this = exp.Paren(this=this)
581
+
582
+ n_int = decimals
583
+ if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
584
+ n_int = exp.cast(decimals, exp.DataType.Type.INT)
585
+
586
+ pow_ = exp.Pow(this=exp.Literal.number("10"), expression=n_int)
587
+ floored = exp.Floor(this=exp.Mul(this=this, expression=pow_))
588
+ result = exp.Div(this=floored, expression=pow_.copy())
589
+
590
+ return self.round_sql(
591
+ exp.Round(this=result, decimals=decimals, casts_non_integer_decimals=True)
592
+ )
593
+
594
+ return self.ceil_floor(expression)
595
+
596
+
597
+ def _regr_val_sql(
598
+ self: DuckDB.Generator,
599
+ expression: exp.RegrValx | exp.RegrValy,
600
+ ) -> str:
601
+ """
602
+ Transpile Snowflake's REGR_VALX/REGR_VALY to DuckDB equivalent.
603
+
604
+ REGR_VALX(y, x) returns NULL if y is NULL; otherwise returns x.
605
+ REGR_VALY(y, x) returns NULL if x is NULL; otherwise returns y.
606
+ """
607
+ from sqlglot.optimizer.annotate_types import annotate_types
608
+
609
+ y = expression.this
610
+ x = expression.expression
611
+
612
+ # Determine which argument to check for NULL and which to return based on expression type
613
+ if isinstance(expression, exp.RegrValx):
614
+ # REGR_VALX: check y for NULL, return x
615
+ check_for_null = y
616
+ return_value = x
617
+ return_value_attr = "expression"
618
+ else:
619
+ # REGR_VALY: check x for NULL, return y
620
+ check_for_null = x
621
+ return_value = y
622
+ return_value_attr = "this"
623
+
624
+ # Get the type from the return argument
625
+ result_type = return_value.type
626
+
627
+ # If no type info, annotate the expression to infer types
628
+ if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
629
+ try:
630
+ annotated = annotate_types(expression.copy(), dialect=self.dialect)
631
+ result_type = getattr(annotated, return_value_attr).type
632
+ except Exception:
633
+ pass
634
+
635
+ # Default to DOUBLE for regression functions if type still unknown
636
+ if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
637
+ result_type = exp.DataType.build("DOUBLE")
638
+
639
+ # Cast NULL to the same type as return_value to avoid DuckDB type inference issues
640
+ typed_null = exp.Cast(this=exp.Null(), to=result_type)
641
+
642
+ return self.sql(
643
+ exp.If(
644
+ this=exp.Is(this=check_for_null.copy(), expression=exp.Null()),
645
+ true=typed_null,
646
+ false=return_value.copy(),
647
+ )
648
+ )
649
+
650
+
254
651
  class DuckDB(Dialect):
255
652
  NULL_ORDERING = "nulls_are_last"
256
653
  SUPPORTS_USER_DEFINED_TYPES = True
@@ -269,8 +666,13 @@ class DuckDB(Dialect):
269
666
  **Dialect.DATE_PART_MAPPING,
270
667
  "DAYOFWEEKISO": "ISODOW",
271
668
  }
669
+
272
670
  DATE_PART_MAPPING.pop("WEEKDAY")
273
671
 
672
+ INVERSE_TIME_MAPPING = {
673
+ "%e": "%-d", # BigQuery's space-padded day (%e) -> DuckDB's no-padding day (%-d)
674
+ }
675
+
274
676
  def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
275
677
  if isinstance(path, exp.Literal):
276
678
  # DuckDB also supports the JSON pointer syntax, where every path starts with a `/`.
@@ -306,7 +708,9 @@ class DuckDB(Dialect):
306
708
  "DETACH": TokenType.DETACH,
307
709
  "FORCE": TokenType.FORCE,
308
710
  "INSTALL": TokenType.INSTALL,
711
+ "INT8": TokenType.BIGINT,
309
712
  "LOGICAL": TokenType.BOOLEAN,
713
+ "MACRO": TokenType.FUNCTION,
310
714
  "ONLY": TokenType.ONLY,
311
715
  "PIVOT_WIDER": TokenType.PIVOT,
312
716
  "POSITIONAL": TokenType.POSITIONAL,
@@ -398,6 +802,7 @@ class DuckDB(Dialect):
398
802
  "LIST_SORT": exp.SortArray.from_arg_list,
399
803
  "LIST_TRANSFORM": exp.Transform.from_arg_list,
400
804
  "LIST_VALUE": lambda args: exp.Array(expressions=args),
805
+ "MAKE_DATE": exp.DateFromParts.from_arg_list,
401
806
  "MAKE_TIME": exp.TimeFromParts.from_arg_list,
402
807
  "MAKE_TIMESTAMP": _build_make_timestamp,
403
808
  "QUANTILE_CONT": exp.PercentileCont.from_arg_list,
@@ -411,6 +816,7 @@ class DuckDB(Dialect):
411
816
  expression=seq_get(args, 1),
412
817
  replacement=seq_get(args, 2),
413
818
  modifiers=seq_get(args, 3),
819
+ single_replace=True,
414
820
  ),
415
821
  "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
416
822
  "STRFTIME": build_formatted_time(exp.TimeToStr, "duckdb"),
@@ -561,7 +967,7 @@ class DuckDB(Dialect):
561
967
  ) -> t.Optional[exp.Expression]:
562
968
  bracket = super()._parse_bracket(this)
563
969
 
564
- if self.dialect.version < Version("1.2.0") and isinstance(bracket, exp.Bracket):
970
+ if self.dialect.version < (1, 2) and isinstance(bracket, exp.Bracket):
565
971
  # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
566
972
  bracket.set("returns_list_for_maps", True)
567
973
 
@@ -619,11 +1025,9 @@ class DuckDB(Dialect):
619
1025
  def _parse_install(self, force: bool = False) -> exp.Install:
620
1026
  return self.expression(
621
1027
  exp.Install,
622
- **{ # type: ignore
623
- "this": self._parse_id_var(),
624
- "from": self._parse_var_or_string() if self._match(TokenType.FROM) else None,
625
- "force": force,
626
- },
1028
+ this=self._parse_id_var(),
1029
+ from_=self._parse_var_or_string() if self._match(TokenType.FROM) else None,
1030
+ force=force,
627
1031
  )
628
1032
 
629
1033
  def _parse_primary(self) -> t.Optional[exp.Expression]:
@@ -661,11 +1065,16 @@ class DuckDB(Dialect):
661
1065
  ARRAY_SIZE_DIM_REQUIRED = False
662
1066
  NORMALIZE_EXTRACT_DATE_PARTS = True
663
1067
  SUPPORTS_LIKE_QUANTIFIERS = False
1068
+ SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True
664
1069
 
665
1070
  TRANSFORMS = {
666
1071
  **generator.Generator.TRANSFORMS,
1072
+ exp.AnyValue: _anyvalue_sql,
667
1073
  exp.ApproxDistinct: approx_count_distinct_sql,
668
- exp.Array: inline_array_unless_query,
1074
+ exp.Array: transforms.preprocess(
1075
+ [transforms.inherit_struct_field_names],
1076
+ generator=inline_array_unless_query,
1077
+ ),
669
1078
  exp.ArrayFilter: rename_func("LIST_FILTER"),
670
1079
  exp.ArrayRemove: remove_from_array_using_filter,
671
1080
  exp.ArraySort: _array_sort_sql,
@@ -673,13 +1082,13 @@ class DuckDB(Dialect):
673
1082
  exp.ArrayUniqueAgg: lambda self, e: self.func(
674
1083
  "LIST", exp.Distinct(expressions=[e.this])
675
1084
  ),
1085
+ exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"),
676
1086
  exp.BitwiseAndAgg: rename_func("BIT_AND"),
1087
+ exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"),
677
1088
  exp.BitwiseOrAgg: rename_func("BIT_OR"),
678
- exp.BitwiseXor: rename_func("XOR"),
679
1089
  exp.BitwiseXorAgg: rename_func("BIT_XOR"),
680
1090
  exp.CommentColumnConstraint: no_comment_column_constraint_sql,
681
1091
  exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"),
682
- exp.CurrentDate: lambda *_: "CURRENT_DATE",
683
1092
  exp.CurrentTime: lambda *_: "CURRENT_TIME",
684
1093
  exp.CurrentTimestamp: lambda *_: "CURRENT_TIMESTAMP",
685
1094
  exp.DayOfMonth: rename_func("DAYOFMONTH"),
@@ -694,6 +1103,7 @@ class DuckDB(Dialect):
694
1103
  exp.DateDiff: _date_diff_sql,
695
1104
  exp.DateStrToDate: datestrtodate_sql,
696
1105
  exp.Datetime: no_datetime_sql,
1106
+ exp.DatetimeDiff: _date_diff_sql,
697
1107
  exp.DatetimeSub: date_delta_to_binary_interval_op(),
698
1108
  exp.DatetimeAdd: date_delta_to_binary_interval_op(),
699
1109
  exp.DateToDi: lambda self,
@@ -710,17 +1120,20 @@ class DuckDB(Dialect):
710
1120
  exp.IntDiv: lambda self, e: self.binary(e, "//"),
711
1121
  exp.IsInf: rename_func("ISINF"),
712
1122
  exp.IsNan: rename_func("ISNAN"),
1123
+ exp.Floor: _floor_sql,
713
1124
  exp.JSONBExists: rename_func("JSON_EXISTS"),
714
1125
  exp.JSONExtract: _arrow_json_extract_sql,
715
1126
  exp.JSONExtractArray: _json_extract_value_array_sql,
716
- exp.JSONExtractScalar: _arrow_json_extract_sql,
717
1127
  exp.JSONFormat: _json_format_sql,
718
1128
  exp.JSONValueArray: _json_extract_value_array_sql,
719
1129
  exp.Lateral: explode_to_unnest_sql,
720
1130
  exp.LogicalOr: rename_func("BOOL_OR"),
721
1131
  exp.LogicalAnd: rename_func("BOOL_AND"),
722
1132
  exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "),
1133
+ exp.Initcap: _initcap_sql,
723
1134
  exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)),
1135
+ exp.SHA1Digest: lambda self, e: self.func("UNHEX", self.func("SHA1", e.this)),
1136
+ exp.SHA2Digest: lambda self, e: self.func("UNHEX", sha2_digest_sql(self, e)),
724
1137
  exp.MonthsBetween: lambda self, e: self.func(
725
1138
  "DATEDIFF",
726
1139
  "'month'",
@@ -737,13 +1150,15 @@ class DuckDB(Dialect):
737
1150
  e.this,
738
1151
  e.expression,
739
1152
  e.args.get("replacement"),
740
- e.args.get("modifiers"),
1153
+ regexp_replace_global_modifier(e),
741
1154
  ),
742
1155
  exp.RegexpLike: rename_func("REGEXP_MATCHES"),
743
1156
  exp.RegexpILike: lambda self, e: self.func(
744
1157
  "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i")
745
1158
  ),
746
1159
  exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"),
1160
+ exp.RegrValx: _regr_val_sql,
1161
+ exp.RegrValy: _regr_val_sql,
747
1162
  exp.Return: lambda self, e: self.sql(e, "this"),
748
1163
  exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "",
749
1164
  exp.Rand: rename_func("RANDOM"),
@@ -758,19 +1173,22 @@ class DuckDB(Dialect):
758
1173
  exp.Struct: _struct_sql,
759
1174
  exp.Transform: rename_func("LIST_TRANSFORM"),
760
1175
  exp.TimeAdd: date_delta_to_binary_interval_op(),
1176
+ exp.TimeSub: date_delta_to_binary_interval_op(),
761
1177
  exp.Time: no_time_sql,
762
1178
  exp.TimeDiff: _timediff_sql,
763
1179
  exp.Timestamp: no_timestamp_sql,
1180
+ exp.TimestampAdd: date_delta_to_binary_interval_op(),
764
1181
  exp.TimestampDiff: lambda self, e: self.func(
765
1182
  "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this
766
1183
  ),
767
- exp.TimestampTrunc: timestamptrunc_sql(),
1184
+ exp.TimestampSub: date_delta_to_binary_interval_op(),
768
1185
  exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DataType.Type.DATE)),
769
1186
  exp.TimeStrToTime: timestrtotime_sql,
770
1187
  exp.TimeStrToUnix: lambda self, e: self.func(
771
1188
  "EPOCH", exp.cast(e.this, exp.DataType.Type.TIMESTAMP)
772
1189
  ),
773
1190
  exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)),
1191
+ exp.ToBoolean: _to_boolean_sql,
774
1192
  exp.TimeToUnix: rename_func("EPOCH"),
775
1193
  exp.TsOrDiToDi: lambda self,
776
1194
  e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)",
@@ -781,6 +1199,13 @@ class DuckDB(Dialect):
781
1199
  exp.cast(e.expression, exp.DataType.Type.TIMESTAMP),
782
1200
  exp.cast(e.this, exp.DataType.Type.TIMESTAMP),
783
1201
  ),
1202
+ exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)),
1203
+ exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)),
1204
+ exp.UnixSeconds: lambda self, e: self.sql(
1205
+ exp.cast(
1206
+ self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DataType.Type.BIGINT
1207
+ )
1208
+ ),
784
1209
  exp.UnixToStr: lambda self, e: self.func(
785
1210
  "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e)
786
1211
  ),
@@ -813,6 +1238,7 @@ class DuckDB(Dialect):
813
1238
  exp.DataType.Type.BPCHAR: "TEXT",
814
1239
  exp.DataType.Type.CHAR: "TEXT",
815
1240
  exp.DataType.Type.DATETIME: "TIMESTAMP",
1241
+ exp.DataType.Type.DECFLOAT: "DECIMAL(38, 5)",
816
1242
  exp.DataType.Type.FLOAT: "REAL",
817
1243
  exp.DataType.Type.JSONB: "JSON",
818
1244
  exp.DataType.Type.NCHAR: "TEXT",
@@ -825,6 +1251,7 @@ class DuckDB(Dialect):
825
1251
  exp.DataType.Type.TIMESTAMP_S: "TIMESTAMP_S",
826
1252
  exp.DataType.Type.TIMESTAMP_MS: "TIMESTAMP_MS",
827
1253
  exp.DataType.Type.TIMESTAMP_NS: "TIMESTAMP_NS",
1254
+ exp.DataType.Type.BIGDECIMAL: "DECIMAL(38, 5)",
828
1255
  }
829
1256
 
830
1257
  # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77
@@ -932,6 +1359,135 @@ class DuckDB(Dialect):
932
1359
  exp.NthValue,
933
1360
  )
934
1361
 
1362
def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str:
    """
    Render Snowflake's BITMAP_BIT_POSITION(n) as modular arithmetic for DuckDB.

    The generated expression is a parenthesized conditional taken modulo
    MAX_BIT_POSITION:
    - n > 0: the conditional yields n - 1
    - otherwise: the conditional yields ABS(n)
    """
    operand = expression.this

    # Branches are built in the order they get attached to the conditional
    is_positive = exp.GT(this=operand, expression=exp.Literal.number(0))
    shifted = operand - exp.Literal.number(1)
    magnitude = exp.Abs(this=operand)

    position = exp.Paren(this=exp.If(this=is_positive, true=shifted, false=magnitude))
    return self.sql(exp.Mod(this=position, expression=MAX_BIT_POSITION))
1384
+
1385
def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str:
    """
    Render Snowflake's RANDSTR(length, generator) as a parenthesized DuckDB subquery.

    One character of RANDSTR_CHAR_POOL is selected per row of RANGE(length), driven by
    a hash of the row index plus a seed; LISTAGG concatenates the characters.

    Seed selection:
    - RANDOM(seed) generator: the seed argument is reused
    - RANDOM() generator without a seed: the RANDOM() call itself is embedded
    - Any other generator expression: used directly as the seed
    - No generator: RANDSTR_SEED is used (arbitrary but deterministic)
    """
    generator = expression.args.get("generator")

    if not generator:
        seed_value = exp.Literal.number(RANDSTR_SEED)
    elif isinstance(generator, exp.Rand):
        seed_value = generator.this or generator
    else:
        seed_value = generator

    # The length is rendered before the seed, so any generator warnings keep their order
    length_sql = self.sql(expression.this)
    seed_sql = self.sql(seed_value)

    query: exp.Select = exp.maybe_parse(
        f"""
        SELECT LISTAGG(
            SUBSTRING(
                '{RANDSTR_CHAR_POOL}',
                1 + CAST(FLOOR(random_value * 62) AS INT),
                1
            ),
            ''
        )
        FROM (
            SELECT (ABS(HASH(i + {seed_sql})) % 1000) / 1000.0 AS random_value
            FROM RANGE({length_sql}) AS t(i)
        )
        """,
        dialect="duckdb",
    )
    return f"({self.sql(query)})"
1429
+
1430
def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str:
    """
    Render TO_BINARY(value[, format]) for DuckDB.

    When the expression is typed as BINARY, the (upper-cased) format picks the function:
    - 'UTF-8'  -> ENCODE(value)
    - 'BASE64' -> FROM_BASE64(value)
    - anything else, including a missing format (defaults to 'HEX') -> UNHEX(value)

    For any other return type the call is emitted as TO_BINARY(value).
    """
    format_arg = expression.args.get("format")
    fmt = format_arg.name.upper() if format_arg else "HEX"

    if not expression.is_type(exp.DataType.Type.BINARY):
        # Fallback; needs revisiting to support transpilation from dialects other than Snowflake
        return self.func("TO_BINARY", expression.this)

    decoders = {"UTF-8": "ENCODE", "BASE64": "FROM_BASE64"}
    return self.func(decoders.get(fmt, "UNHEX"), expression.this)
1458
+
1459
def _greatest_least_sql(
    self: DuckDB.Generator, expression: exp.Greatest | exp.Least
) -> str:
    """
    Render GREATEST/LEAST, honoring the NULL semantics requested by the expression.

    - null_if_any_null unset: emit the native function call as-is
    - null_if_any_null set (BigQuery-style): wrap the native call in a CASE that
      yields NULL when any argument IS NULL, and the native call otherwise
    """
    native_sql = self.function_fallback_sql(expression)

    if not expression.args.get("null_if_any_null"):
        return self.sql(native_sql)

    null_checks = [arg.is_(exp.null()) for arg in (expression.this, *expression.expressions)]
    guarded = exp.case().when(exp.or_(*null_checks, copy=False), exp.null(), copy=False)
    guarded.set("default", native_sql)
    return self.sql(guarded)
1484
+
1485
def greatest_sql(self: DuckDB.Generator, expression: exp.Greatest) -> str:
    """Render GREATEST by delegating to the shared GREATEST/LEAST generator."""
    return self._greatest_least_sql(expression)
1487
+
1488
def least_sql(self: DuckDB.Generator, expression: exp.Least) -> str:
    """Render LEAST by delegating to the shared GREATEST/LEAST generator."""
    return self._greatest_least_sql(expression)
1490
+
935
1491
  def lambda_sql(
936
1492
  self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True
937
1493
  ) -> str:
@@ -951,10 +1507,16 @@ class DuckDB(Dialect):
951
1507
  def install_sql(self, expression: exp.Install) -> str:
952
1508
  force = "FORCE " if expression.args.get("force") else ""
953
1509
  this = self.sql(expression, "this")
954
- from_clause = expression.args.get("from")
1510
+ from_clause = expression.args.get("from_")
955
1511
  from_clause = f" FROM {from_clause}" if from_clause else ""
956
1512
  return f"{force}INSTALL {this}{from_clause}"
957
1513
 
1514
def approxtopk_sql(self, expression: exp.ApproxTopK) -> str:
    """Flag APPROX_TOP_K as unsupported, then emit the call unchanged as a fallback."""
    message = "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. "
    self.unsupported(message)
    return self.function_fallback_sql(expression)
1519
+
958
1520
  def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str:
959
1521
  return self.sql(exp.cast(expression.this, exp.DataType.Type.TIMESTAMPTZ))
960
1522
 
@@ -970,6 +1532,16 @@ class DuckDB(Dialect):
970
1532
  return f"CAST({self.func('TRY_STRPTIME', expression.this, formatted_time)} AS DATE)"
971
1533
  return f"CAST({str_to_time_sql(self, expression)} AS DATE)"
972
1534
 
1535
def currentdate_sql(self, expression: exp.CurrentDate) -> str:
    """
    Render CURRENT_DATE, optionally evaluated in a given time zone.

    Without an argument this is plain CURRENT_DATE. With one, the result is
    CURRENT_TIMESTAMP shifted to that zone via AT TIME ZONE and cast to DATE.
    """
    zone = expression.this
    if zone:
        localized_now = exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=zone)
        as_date = exp.Cast(this=localized_now, to=exp.DataType(this=exp.DataType.Type.DATE))
        return self.sql(as_date)

    return "CURRENT_DATE"
1544
+
973
1545
  def parsejson_sql(self, expression: exp.ParseJSON) -> str:
974
1546
  arg = expression.this
975
1547
  if expression.args.get("safe"):
@@ -1051,14 +1623,14 @@ class DuckDB(Dialect):
1051
1623
  return self.function_fallback_sql(expression)
1052
1624
 
1053
1625
  def countif_sql(self, expression: exp.CountIf) -> str:
1054
- if self.dialect.version >= Version("1.2"):
1626
+ if self.dialect.version >= (1, 2):
1055
1627
  return self.function_fallback_sql(expression)
1056
1628
 
1057
1629
  # https://github.com/tobymao/sqlglot/pull/4749
1058
1630
  return count_if_to_sum(self, expression)
1059
1631
 
1060
1632
  def bracket_sql(self, expression: exp.Bracket) -> str:
1061
- if self.dialect.version >= Version("1.2"):
1633
+ if self.dialect.version >= (1, 2):
1062
1634
  return super().bracket_sql(expression)
1063
1635
 
1064
1636
  # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
@@ -1125,6 +1697,33 @@ class DuckDB(Dialect):
1125
1697
 
1126
1698
  return self.sql(case)
1127
1699
 
1700
def lower_sql(self, expression: exp.Lower) -> str:
    """Render LOWER over the VARCHAR-cast operand, then hand off to the BLOB-cast helper."""
    operand = _cast_to_varchar(expression.this)
    lowered_sql = self.func("LOWER", operand)
    return _gen_with_cast_to_blob(self, expression, lowered_sql)
1703
+
1704
def upper_sql(self, expression: exp.Upper) -> str:
    """Render UPPER over the VARCHAR-cast operand, then hand off to the BLOB-cast helper."""
    operand = _cast_to_varchar(expression.this)
    uppered_sql = self.func("UPPER", operand)
    return _gen_with_cast_to_blob(self, expression, uppered_sql)
1707
+
1708
def replace_sql(self, expression: exp.Replace) -> str:
    """
    Render REPLACE(subject, pattern, replacement) with each argument passed
    through _cast_to_varchar, then hand off to the BLOB-cast helper.
    """
    # Arguments are converted in call order: subject, pattern, replacement
    subject = _cast_to_varchar(expression.this)
    pattern = _cast_to_varchar(expression.expression)
    replacement = _cast_to_varchar(expression.args.get("replacement"))

    replaced_sql = self.func("REPLACE", subject, pattern, replacement)
    return _gen_with_cast_to_blob(self, expression, replaced_sql)
1716
+
1717
def _bitwise_op(self, expression: exp.Binary, op: str) -> str:
    """
    Render a binary bitwise operation using the infix operator `op`.

    The operands are first adjusted in place by _prepare_binary_bitwise_args, and
    the rendered SQL is handed off to the BLOB-cast helper.
    """
    _prepare_binary_bitwise_args(expression)
    infix_sql = self.binary(expression, op)
    return _gen_with_cast_to_blob(self, expression, infix_sql)
1721
+
1722
def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str:
    """
    Render bitwise XOR as the XOR(a, b) function call.

    The operands are first adjusted in place by _prepare_binary_bitwise_args, and
    the rendered SQL is handed off to the BLOB-cast helper.
    """
    _prepare_binary_bitwise_args(expression)
    xor_sql = self.func("XOR", expression.this, expression.expression)
    return _gen_with_cast_to_blob(self, expression, xor_sql)
1726
+
1128
1727
  def objectinsert_sql(self, expression: exp.ObjectInsert) -> str:
1129
1728
  this = expression.this
1130
1729
  key = expression.args.get("key")
@@ -1140,6 +1739,13 @@ class DuckDB(Dialect):
1140
1739
 
1141
1740
  return self.func("STRUCT_INSERT", this, kv_sql)
1142
1741
 
1742
def startswith_sql(self, expression: exp.StartsWith) -> str:
    """Render STARTS_WITH with both operands passed through _cast_to_varchar."""
    subject = _cast_to_varchar(expression.this)
    prefix = _cast_to_varchar(expression.expression)
    return self.func("STARTS_WITH", subject, prefix)
1748
+
1143
1749
  def unnest_sql(self, expression: exp.Unnest) -> str:
1144
1750
  explode_array = expression.args.get("explode_array")
1145
1751
  if explode_array:
@@ -1173,7 +1779,7 @@ class DuckDB(Dialect):
1173
1779
  if isinstance(this, exp.First):
1174
1780
  this = exp.AnyValue(this=this.this)
1175
1781
 
1176
- if not isinstance(this, exp.AnyValue):
1782
+ if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)):
1177
1783
  self.unsupported("IGNORE NULLS is not supported for non-window functions.")
1178
1784
 
1179
1785
  return self.sql(this)
@@ -1196,10 +1802,19 @@ class DuckDB(Dialect):
1196
1802
 
1197
1803
  return self.func("ARRAY_TO_STRING", this, expression.expression)
1198
1804
 
1199
- @unsupported_args("position", "occurrence")
1200
1805
  def regexpextract_sql(self, expression: exp.RegexpExtract) -> str:
1806
+ this = expression.this
1201
1807
  group = expression.args.get("group")
1202
1808
  params = expression.args.get("parameters")
1809
+ position = expression.args.get("position")
1810
+ occurrence = expression.args.get("occurrence")
1811
+ null_if_pos_overflow = expression.args.get("null_if_pos_overflow")
1812
+
1813
+ if position and (not position.is_int or position.to_py() > 1):
1814
+ this = exp.Substring(this=this, start=position)
1815
+
1816
+ if null_if_pos_overflow:
1817
+ this = exp.Nullif(this=this, expression=exp.Literal.string(""))
1203
1818
 
1204
1819
  # Do not render group if there is no following argument,
1205
1820
  # and it's the default value for this dialect
@@ -1209,9 +1824,15 @@ class DuckDB(Dialect):
1209
1824
  and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP)
1210
1825
  ):
1211
1826
  group = None
1212
- return self.func(
1213
- "REGEXP_EXTRACT", expression.this, expression.expression, group, params
1214
- )
1827
+
1828
+ if occurrence and (not occurrence.is_int or occurrence.to_py() > 1):
1829
+ return self.func(
1830
+ "ARRAY_EXTRACT",
1831
+ self.func("REGEXP_EXTRACT_ALL", this, expression.expression, group, params),
1832
+ exp.Literal.number(occurrence),
1833
+ )
1834
+
1835
+ return self.func("REGEXP_EXTRACT", this, expression.expression, group, params)
1215
1836
 
1216
1837
  @unsupported_args("culture")
1217
1838
  def numbertostr_sql(self, expression: exp.NumberToStr) -> str:
@@ -1314,3 +1935,111 @@ class DuckDB(Dialect):
1314
1935
  to_hex = exp.cast(self.func("TO_HEX", from_hex), exp.DataType.Type.BLOB)
1315
1936
 
1316
1937
  return self.sql(to_hex)
1938
+
1939
def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str:
    """
    Render TIMESTAMP_TRUNC as DATE_TRUNC, applying the zone for date-level units.

    A zone is only applied when the unit is a date unit; in every other case the
    result is a plain DATE_TRUNC(unit, timestamp).
    """
    unit = unit_to_str(expression)
    zone = expression.args.get("zone")
    timestamp = expression.this

    if not (is_date_unit(unit) and zone):
        return self.func("DATE_TRUNC", unit, timestamp)

    # BigQuery's TIMESTAMP_TRUNC with a timezone truncates in the target timezone and
    # returns the result as UTC, which takes two AT TIME ZONE applications:
    # 1. Inner: makes the truncation happen in the target timezone
    # 2. Outer: converts the truncated result back to TIMESTAMPTZ
    truncated_sql = self.func("DATE_TRUNC", unit, exp.AtTimeZone(this=timestamp, zone=zone))
    return self.sql(exp.AtTimeZone(this=truncated_sql, zone=zone))
1954
+
1955
def trim_sql(self, expression: exp.Trim) -> str:
    """
    Render TRIM(subject, characters) with both arguments passed through
    _cast_to_varchar, then hand off to the BLOB-cast helper.
    """
    subject = _cast_to_varchar(expression.this)
    characters = _cast_to_varchar(expression.expression)

    trimmed_sql = self.func("TRIM", subject, characters)
    return _gen_with_cast_to_blob(self, expression, trimmed_sql)
1962
+
1963
def round_sql(self, expression: exp.Round) -> str:
    """
    Render ROUND for DuckDB, adapting the scale and rounding-mode arguments.

    - A scale that is neither an integer literal nor integer-typed is cast to INT
      when the expression is flagged with casts_non_integer_decimals
    - Half-to-even modes switch the function to ROUND_EVEN and drop the mode
    - Half-away-from-zero modes are dropped and plain ROUND is used
    - Any other mode is passed through as a third argument
    """
    # BigQuery spells the modes with a ROUND_ prefix; Snowflake omits it
    half_even_modes = ("ROUND_HALF_EVEN", "HALF_TO_EVEN")
    half_away_modes = ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO")

    scale = expression.args.get("decimals")
    mode = expression.args.get("truncate")

    # DuckDB requires the scale to be an INT, whereas some dialects (e.g., Snowflake)
    # accept non-integer scales and cast them internally
    needs_int_cast = (
        scale is not None
        and expression.args.get("casts_non_integer_decimals")
        and not scale.is_int
        and not scale.is_type(*exp.DataType.INTEGER_TYPES)
    )
    if needs_int_cast:
        scale = exp.cast(scale, exp.DataType.Type.INT)

    func_name = "ROUND"
    if mode and mode.this in half_even_modes:
        func_name, mode = "ROUND_EVEN", None
    elif mode and mode.this in half_away_modes:
        mode = None

    return self.func(func_name, expression.this, scale, mode)
1985
+
1986
def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
    """
    Render BigQuery's APPROX_QUANTILES(expr, n) as an APPROX_QUANTILE call over an
    array of n + 1 evenly spaced quantiles (0, 1/n, ..., 1).

    Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but
    BigQuery does not document the specific algorithm used so results may differ.
    DuckDB does not support RESPECT NULLS.

    If the bucket count is missing, is not an integer literal, or is not positive,
    the call is flagged as unsupported and emitted unchanged.
    """
    this = expression.this
    has_distinct = isinstance(this, exp.Distinct)

    if has_distinct:
        # APPROX_QUANTILES requires 2 args and the DISTINCT node grabs both
        if len(this.expressions) < 2:
            self.unsupported("APPROX_QUANTILES requires a bucket count argument")
            return self.function_fallback_sql(expression)
        num_quantiles_expr = this.expressions[1]
    else:
        num_quantiles_expr = expression.expression

    if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    num_quantiles = t.cast(int, num_quantiles_expr.to_py())
    if num_quantiles <= 0:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    if has_distinct:
        # Detach the bucket count from DISTINCT only once it is known to be valid, so
        # that the fallback paths above still render the call with all its arguments
        num_quantiles_expr.pop()

    quantiles = [
        exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
        for i in range(num_quantiles + 1)
    ]

    return self.sql(
        exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))
    )
2022
+
2023
def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str:
    """
    Render a scalar JSON extraction with the arrow-based generator.

    When scalar_only is set, the extraction is first rendered as a JSON_VALUE call,
    and the arrow-based generator is then applied to that result with the '$' path.
    """
    if not expression.args.get("scalar_only"):
        return _arrow_json_extract_sql(self, expression)

    json_value_sql = rename_func("JSON_VALUE")(self, expression)
    wrapped = exp.JSONExtractScalar(this=json_value_sql, expression="'$'")
    return _arrow_json_extract_sql(self, wrapped)
2029
+
2030
def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str:
    """
    Render bitwise NOT as the prefix `~` operator.

    The operand is passed through _cast_to_bit, and parenthesized when the original
    operand is a negation. If _is_binary reports the operand as binary, the
    expression is typed as BINARY before the result goes to the BLOB-cast helper.
    """
    operand = expression.this

    if _is_binary(operand):
        expression.type = exp.DataType.build("BINARY")

    bit_arg = _cast_to_bit(operand)

    # The Neg check is made against the original operand, not the converted one
    expression.set("this", exp.Paren(this=bit_arg) if isinstance(operand, exp.Neg) else bit_arg)

    negated_sql = "~" + self.sql(expression, "this")
    return _gen_with_cast_to_blob(self, expression, negated_sql)