sqlglot 28.4.0__py3-none-any.whl → 28.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. sqlglot/_version.py +2 -2
  2. sqlglot/dialects/bigquery.py +20 -23
  3. sqlglot/dialects/clickhouse.py +2 -0
  4. sqlglot/dialects/dialect.py +355 -18
  5. sqlglot/dialects/doris.py +38 -90
  6. sqlglot/dialects/druid.py +1 -0
  7. sqlglot/dialects/duckdb.py +1739 -163
  8. sqlglot/dialects/exasol.py +17 -1
  9. sqlglot/dialects/hive.py +27 -2
  10. sqlglot/dialects/mysql.py +103 -11
  11. sqlglot/dialects/oracle.py +38 -1
  12. sqlglot/dialects/postgres.py +142 -33
  13. sqlglot/dialects/presto.py +6 -2
  14. sqlglot/dialects/redshift.py +7 -1
  15. sqlglot/dialects/singlestore.py +13 -3
  16. sqlglot/dialects/snowflake.py +271 -21
  17. sqlglot/dialects/spark.py +25 -0
  18. sqlglot/dialects/spark2.py +4 -3
  19. sqlglot/dialects/starrocks.py +152 -17
  20. sqlglot/dialects/trino.py +1 -0
  21. sqlglot/dialects/tsql.py +5 -0
  22. sqlglot/diff.py +1 -1
  23. sqlglot/expressions.py +239 -47
  24. sqlglot/generator.py +173 -44
  25. sqlglot/optimizer/annotate_types.py +129 -60
  26. sqlglot/optimizer/merge_subqueries.py +13 -2
  27. sqlglot/optimizer/qualify_columns.py +7 -0
  28. sqlglot/optimizer/resolver.py +19 -0
  29. sqlglot/optimizer/scope.py +12 -0
  30. sqlglot/optimizer/unnest_subqueries.py +7 -0
  31. sqlglot/parser.py +251 -58
  32. sqlglot/schema.py +186 -14
  33. sqlglot/tokens.py +36 -6
  34. sqlglot/transforms.py +6 -5
  35. sqlglot/typing/__init__.py +29 -10
  36. sqlglot/typing/bigquery.py +5 -10
  37. sqlglot/typing/duckdb.py +39 -0
  38. sqlglot/typing/hive.py +50 -1
  39. sqlglot/typing/mysql.py +32 -0
  40. sqlglot/typing/presto.py +0 -1
  41. sqlglot/typing/snowflake.py +80 -17
  42. sqlglot/typing/spark.py +29 -0
  43. sqlglot/typing/spark2.py +9 -1
  44. sqlglot/typing/tsql.py +21 -0
  45. {sqlglot-28.4.0.dist-info → sqlglot-28.8.0.dist-info}/METADATA +47 -2
  46. sqlglot-28.8.0.dist-info/RECORD +95 -0
  47. {sqlglot-28.4.0.dist-info → sqlglot-28.8.0.dist-info}/WHEEL +1 -1
  48. sqlglot-28.4.0.dist-info/RECORD +0 -92
  49. {sqlglot-28.4.0.dist-info → sqlglot-28.8.0.dist-info}/licenses/LICENSE +0 -0
  50. {sqlglot-28.4.0.dist-info → sqlglot-28.8.0.dist-info}/top_level.txt +0 -0
@@ -8,44 +8,50 @@ import typing as t
 from sqlglot import exp, generator, parser, tokens, transforms
 
 from sqlglot.dialects.dialect import (
+    DATETIME_DELTA,
     Dialect,
     JSON_EXTRACT_TYPE,
     NormalizationStrategy,
     approx_count_distinct_sql,
+    array_append_sql,
+    array_compact_sql,
+    array_concat_sql,
     arrow_json_extract_sql,
     binary_from_function,
-    bool_xor_sql,
     build_default_decimal_type,
+    build_formatted_time,
+    build_regexp_extract,
     count_if_to_sum,
     date_delta_to_binary_interval_op,
     date_trunc_to_time,
     datestrtodate_sql,
-    no_datetime_sql,
     encode_decode_sql,
-    build_formatted_time,
+    explode_to_unnest_sql,
+    getbit_sql,
+    groupconcat_sql,
+    inline_array_unless_query,
+    months_between_sql,
+    no_datetime_sql,
     no_comment_column_constraint_sql,
+    no_make_interval_sql,
     no_time_sql,
     no_timestamp_sql,
     pivot_column_names,
+    regexp_replace_global_modifier,
     rename_func,
     remove_from_array_using_filter,
+    sha2_digest_sql,
+    sha256_sql,
     strposition_sql,
     str_to_time_sql,
     timestrtotime_sql,
     unit_to_str,
-    sha256_sql,
-    build_regexp_extract,
-    explode_to_unnest_sql,
-    no_make_interval_sql,
-    groupconcat_sql,
-    inline_array_unless_query,
-    regexp_replace_global_modifier,
-    sha2_digest_sql,
 )
 from sqlglot.generator import unsupported_args
 from sqlglot.helper import is_date_unit, seq_get
 from sqlglot.tokens import TokenType
 from sqlglot.parser import binary_range_parser
+from sqlglot.typing.duckdb import EXPRESSION_METADATA
 
 # Regex to detect time zones in timestamps of the form [+|-]TT[:tt]
 # The pattern matches timezone offsets that appear after the time portion
@@ -87,68 +93,212 @@ WEEK_START_DAY_TO_DOW = {
 
 MAX_BIT_POSITION = exp.Literal.number(32768)
 
+# SEQ function constants
+_SEQ_BASE = "(ROW_NUMBER() OVER (ORDER BY 1) - 1)"
+_SEQ_RESTRICTED = (exp.Where, exp.Having, exp.AggFunc, exp.Order, exp.Select)
+
+
+def _apply_base64_alphabet_replacements(
+    result: exp.Expression,
+    alphabet: t.Optional[exp.Expression],
+    reverse: bool = False,
+) -> exp.Expression:
+    """
+    Apply base64 alphabet character replacements.
+
+    Base64 alphabet can be 1-3 chars: 1st = index 62 ('+'), 2nd = index 63 ('/'), 3rd = padding ('=').
+    zip truncates to the shorter string, so a 1-char alphabet only replaces '+', a 2-char replaces '+/', etc.
+
+    Args:
+        result: The expression to apply replacements to
+        alphabet: Custom alphabet literal (expected chars for +/=)
+        reverse: If False, replace default with custom (encode)
+            If True, replace custom with default (decode)
+    """
+    if isinstance(alphabet, exp.Literal) and alphabet.is_string:
+        for default_char, new_char in zip("+/=", alphabet.this):
+            if new_char != default_char:
+                find, replace = (new_char, default_char) if reverse else (default_char, new_char)
+                result = exp.Replace(
+                    this=result,
+                    expression=exp.Literal.string(find),
+                    replacement=exp.Literal.string(replace),
+                )
+    return result
+
+
+def _base64_decode_sql(self: DuckDB.Generator, expression: exp.Expression, to_string: bool) -> str:
+    """
+    Transpile Snowflake BASE64_DECODE_STRING/BINARY to DuckDB.
+
+    DuckDB uses FROM_BASE64(), which returns BLOB. For string output, wrap with DECODE().
+    Custom alphabets require REPLACE() calls to convert to standard base64.
+    """
+    input_expr = expression.this
+    alphabet = expression.args.get("alphabet")
+
+    # Handle custom alphabet by replacing non-standard chars with standard ones
+    input_expr = _apply_base64_alphabet_replacements(input_expr, alphabet, reverse=True)
+
+    # FROM_BASE64 returns BLOB
+    input_expr = exp.FromBase64(this=input_expr)
+
+    if to_string:
+        input_expr = exp.Decode(this=input_expr)
+
+    return self.sql(input_expr)
+
+
+def _last_day_sql(self: DuckDB.Generator, expression: exp.LastDay) -> str:
+    """
+    DuckDB's LAST_DAY only supports finding the last day of a month.
+    For other date parts (year, quarter, week), we need to implement equivalent logic.
+    """
+    date_expr = expression.this
+    unit = expression.text("unit")
+
+    if not unit or unit.upper() == "MONTH":
+        # Default behavior - use DuckDB's native LAST_DAY
+        return self.func("LAST_DAY", date_expr)
+
+    if unit.upper() == "YEAR":
+        # Last day of year: December 31st of the same year
+        year_expr = exp.func("EXTRACT", "YEAR", date_expr)
+        make_date_expr = exp.func(
+            "MAKE_DATE", year_expr, exp.Literal.number(12), exp.Literal.number(31)
+        )
+        return self.sql(make_date_expr)
+
+    if unit.upper() == "QUARTER":
+        # Last day of quarter
+        year_expr = exp.func("EXTRACT", "YEAR", date_expr)
+        quarter_expr = exp.func("EXTRACT", "QUARTER", date_expr)
+
+        # Calculate last month of quarter: quarter * 3. Quarter can be 1 to 4
+        last_month_expr = exp.Mul(this=quarter_expr, expression=exp.Literal.number(3))
+        first_day_last_month_expr = exp.func(
+            "MAKE_DATE", year_expr, last_month_expr, exp.Literal.number(1)
+        )
+
+        # Last day of the last month of the quarter
+        last_day_expr = exp.func("LAST_DAY", first_day_last_month_expr)
+        return self.sql(last_day_expr)
+
+    if unit.upper() == "WEEK":
+        # DuckDB DAYOFWEEK: Sunday=0, Monday=1, ..., Saturday=6
+        dow = exp.func("EXTRACT", "DAYOFWEEK", date_expr)
+        # Days to the last day of week: (7 - dayofweek) % 7, assuming the last day of week is Sunday (Snowflake)
+        # Wrap in parentheses to ensure correct precedence
+        days_to_sunday_expr = exp.Mod(
+            this=exp.Paren(this=exp.Sub(this=exp.Literal.number(7), expression=dow)),
+            expression=exp.Literal.number(7),
+        )
+        interval_expr = exp.Interval(this=days_to_sunday_expr, unit=exp.var("DAY"))
+        add_expr = exp.Add(this=date_expr, expression=interval_expr)
+        cast_expr = exp.cast(add_expr, exp.DataType.Type.DATE)
+        return self.sql(cast_expr)
+
+    self.unsupported(f"Unsupported date part '{unit}' in LAST_DAY function")
+    return self.function_fallback_sql(expression)
+
+
+def _is_nanosecond_unit(unit: t.Optional[exp.Expression]) -> bool:
+    return isinstance(unit, (exp.Var, exp.Literal)) and unit.name.upper() == "NANOSECOND"
+
+
+def _handle_nanosecond_diff(
+    self: DuckDB.Generator,
+    end_time: exp.Expression,
+    start_time: exp.Expression,
+) -> str:
+    """Generate NANOSECOND diff using EPOCH_NS since DATE_DIFF doesn't support it."""
+    end_ns = exp.cast(end_time, exp.DataType.Type.TIMESTAMP_NS)
+    start_ns = exp.cast(start_time, exp.DataType.Type.TIMESTAMP_NS)
+
+    # Build expression tree: EPOCH_NS(end) - EPOCH_NS(start)
+    return self.sql(
+        exp.Sub(this=exp.func("EPOCH_NS", end_ns), expression=exp.func("EPOCH_NS", start_ns))
+    )
+
223
 
91
224
  def _to_boolean_sql(self: DuckDB.Generator, expression: exp.ToBoolean) -> str:
92
225
  """
93
- Transpile TO_BOOLEAN function from Snowflake to DuckDB equivalent.
226
+ Transpile TO_BOOLEAN and TRY_TO_BOOLEAN functions from Snowflake to DuckDB equivalent.
94
227
 
95
228
  DuckDB's CAST to BOOLEAN supports most of Snowflake's TO_BOOLEAN strings except 'on'/'off'.
96
- We need to handle the 'on'/'off' cases explicitly, plus NaN/INF error cases.
229
+ We need to handle the 'on'/'off' cases explicitly.
97
230
 
98
- In Snowflake, NaN and INF values cause errors. We use DuckDB's native ERROR()
231
+ For TO_BOOLEAN (safe=False): NaN and INF values cause errors. We use DuckDB's native ERROR()
99
232
  function to replicate this behavior with a clear error message.
233
+
234
+ For TRY_TO_BOOLEAN (safe=True): Use DuckDB's TRY_CAST for conversion, which returns NULL
235
+ for invalid inputs instead of throwing errors.
100
236
  """
101
237
  arg = expression.this
238
+ is_safe = expression.args.get("safe", False)
102
239
 
103
- cast_to_real = exp.func("TRY_CAST", arg, exp.DataType.build("REAL"))
104
-
105
- # Check for NaN and INF values
106
- nan_inf_check = exp.Or(
107
- this=exp.func("ISNAN", cast_to_real), expression=exp.func("ISINF", cast_to_real)
108
- )
109
-
110
- case_expr = (
240
+ base_case_expr = (
111
241
  exp.case()
112
242
  .when(
113
- nan_inf_check,
114
- exp.func(
115
- "ERROR",
116
- exp.Literal.string("TO_BOOLEAN: Non-numeric values NaN and INF are not supported"),
117
- ),
118
- )
119
- # Handle 'on' -> TRUE (case insensitive) - only for string literals
120
- .when(
243
+ # Handle 'on' -> TRUE (case insensitive)
121
244
  exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("ON")),
122
245
  exp.true(),
123
246
  )
124
- # Handle 'off' -> FALSE (case insensitive) - only for string literals
125
247
  .when(
248
+ # Handle 'off' -> FALSE (case insensitive)
126
249
  exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("OFF")),
127
250
  exp.false(),
128
251
  )
129
- .else_(exp.cast(arg, exp.DataType.Type.BOOLEAN))
130
252
  )
131
253
 
254
+ if is_safe:
255
+ # TRY_TO_BOOLEAN: handle 'on'/'off' and use TRY_CAST for everything else
256
+ case_expr = base_case_expr.else_(exp.func("TRY_CAST", arg, exp.DataType.build("BOOLEAN")))
257
+ else:
258
+ # TO_BOOLEAN: handle NaN/INF errors, 'on'/'off', and use regular CAST
259
+ cast_to_real = exp.func("TRY_CAST", arg, exp.DataType.build("REAL"))
260
+
261
+ # Check for NaN and INF values
262
+ nan_inf_check = exp.Or(
263
+ this=exp.func("ISNAN", cast_to_real), expression=exp.func("ISINF", cast_to_real)
264
+ )
265
+
266
+ case_expr = base_case_expr.when(
267
+ nan_inf_check,
268
+ exp.func(
269
+ "ERROR",
270
+ exp.Literal.string("TO_BOOLEAN: Non-numeric values NaN and INF are not supported"),
271
+ ),
272
+ ).else_(exp.cast(arg, exp.DataType.Type.BOOLEAN))
273
+
132
274
  return self.sql(case_expr)
133
275
 
134
276
 
135
277
  # BigQuery -> DuckDB conversion for the DATE function
136
278
  def _date_sql(self: DuckDB.Generator, expression: exp.Date) -> str:
137
- result = f"CAST({self.sql(expression, 'this')} AS DATE)"
279
+ this = expression.this
138
280
  zone = self.sql(expression, "zone")
139
281
 
140
282
  if zone:
141
- date_str = self.func("STRFTIME", result, "'%d/%m/%Y'")
142
- date_str = f"{date_str} || ' ' || {zone}"
143
-
144
- # This will create a TIMESTAMP with time zone information
145
- result = self.func("STRPTIME", date_str, "'%d/%m/%Y %Z'")
283
+ # BigQuery considers "this" at UTC, converts it to the specified
284
+ # time zone and then keeps only the DATE part
285
+ # To micmic that, we:
286
+ # (1) Cast to TIMESTAMP to remove DuckDB's local tz
287
+ # (2) Apply consecutive AtTimeZone calls for UTC -> zone conversion
288
+ this = exp.cast(this, exp.DataType.Type.TIMESTAMP)
289
+ at_utc = exp.AtTimeZone(this=this, zone=exp.Literal.string("UTC"))
290
+ this = exp.AtTimeZone(this=at_utc, zone=zone)
146
291
 
147
- return result
292
+ return self.sql(exp.cast(expression=this, to=exp.DataType.Type.DATE))
148
293
 
149
294
 
150
295
  # BigQuery -> DuckDB conversion for the TIME_DIFF function
151
296
  def _timediff_sql(self: DuckDB.Generator, expression: exp.TimeDiff) -> str:
297
+ unit = expression.unit
298
+
299
+ if _is_nanosecond_unit(unit):
300
+ return _handle_nanosecond_diff(self, expression.expression, expression.this)
301
+
152
302
  this = exp.cast(expression.this, exp.DataType.Type.TIME)
153
303
  expr = exp.cast(expression.expression, exp.DataType.Type.TIME)
154
304
 
@@ -157,6 +307,140 @@ def _timediff_sql(self: DuckDB.Generator, expression: exp.TimeDiff) -> str:
     return self.func("DATE_DIFF", unit_to_str(expression), expr, this)
 
 
+def _date_delta_to_binary_interval_op(
+    cast: bool = True,
+) -> t.Callable[[DuckDB.Generator, DATETIME_DELTA], str]:
+    """
+    DuckDB override to handle:
+    1. NANOSECOND operations (DuckDB doesn't support INTERVAL ... NANOSECOND)
+    2. Float/decimal interval values (DuckDB's INTERVAL requires integers)
+    """
+    base_impl = date_delta_to_binary_interval_op(cast=cast)
+
+    def _duckdb_date_delta_sql(self: DuckDB.Generator, expression: DATETIME_DELTA) -> str:
+        unit = expression.unit
+        interval_value = expression.expression
+
+        # Handle the NANOSECOND unit (DuckDB doesn't support INTERVAL ... NANOSECOND)
+        if _is_nanosecond_unit(unit):
+            if isinstance(interval_value, exp.Interval):
+                interval_value = interval_value.this
+
+            timestamp_ns = exp.cast(expression.this, exp.DataType.Type.TIMESTAMP_NS)
+
+            return self.sql(
+                exp.func(
+                    "MAKE_TIMESTAMP_NS",
+                    exp.Add(this=exp.func("EPOCH_NS", timestamp_ns), expression=interval_value),
+                )
+            )
+
+        # Handle float/decimal interval values, as DuckDB's INTERVAL requires integer expressions
+        if not interval_value or isinstance(interval_value, exp.Interval):
+            return base_impl(self, expression)
+
+        if interval_value.is_type(*exp.DataType.REAL_TYPES):
+            expression.set("expression", exp.cast(exp.func("ROUND", interval_value), "INT"))
+
+        return base_impl(self, expression)
+
+    return _duckdb_date_delta_sql
+
+
+def _array_insert_sql(self: DuckDB.Generator, expression: exp.ArrayInsert) -> str:
+    """
+    Transpile ARRAY_INSERT to DuckDB using LIST_CONCAT and slicing.
+
+    Handles:
+    - 0-based and 1-based indexing (normalizes to 0-based for calculations)
+    - Negative position conversion (requires the array length)
+    - NULL propagation (source dialects return NULL, DuckDB creates a single-element array)
+    - Assumes the position is within bounds per user constraint
+
+    Note: All dialects that support ARRAY_INSERT (Snowflake, Spark, Databricks) have
+    ARRAY_FUNCS_PROPAGATES_NULLS=True, so we always assume the source propagates NULLs.
+
+    Args:
+        expression: The ArrayInsert expression to transpile.
+
+    Returns:
+        SQL string implementing the ARRAY_INSERT behavior.
+    """
+    this = expression.this
+    position = expression.args.get("position")
+    element = expression.expression
+    element_array = exp.Array(expressions=[element])
+    index_offset = expression.args.get("offset", 0)
+
+    if not position or not position.is_int:
+        self.unsupported("ARRAY_INSERT can only be transpiled with a literal position")
+        return self.func("ARRAY_INSERT", this, position, element)
+
+    pos_value = position.to_py()
+
+    # Normalize one-based indexing to zero-based for slice calculations
+    # Spark (1-based) → Snowflake (0-based):
+    #   Positive: pos=1 → pos=0 (subtract 1)
+    #   Negative: pos=-2 → pos=-1 (add 1)
+    # Example: Spark array_insert([a,b,c], -2, d) → [a,b,d,c] is the same as Snowflake pos=-1
+    if pos_value > 0:
+        pos_value = pos_value - index_offset
+    elif pos_value < 0:
+        pos_value = pos_value + index_offset
+
+    # Build the appropriate LIST_CONCAT expression based on the position
+    if pos_value == 0:
+        # Insert at the beginning
+        concat_exprs = [element_array, this]
+    elif pos_value > 0:
+        # Positive position: LIST_CONCAT(arr[1:pos], [elem], arr[pos+1:])
+        # 0-based -> DuckDB 1-based slicing
+
+        # Left slice: arr[1:pos]
+        slice_start = exp.Bracket(
+            this=this,
+            expressions=[
+                exp.Slice(this=exp.Literal.number(1), expression=exp.Literal.number(pos_value))
+            ],
+        )
+
+        # Right slice: arr[pos+1:]
+        slice_end = exp.Bracket(
+            this=this, expressions=[exp.Slice(this=exp.Literal.number(pos_value + 1))]
+        )
+
+        concat_exprs = [slice_start, element_array, slice_end]
+    else:
+        # Negative position: arr[1:LEN(arr)+pos], [elem], arr[LEN(arr)+pos+1:]
+        # pos=-1 means insert before the last element
+        arr_len = exp.Length(this=this)
+
+        # Calculate the slice position: LEN(arr) + pos (e.g., LEN(arr) + (-1) = LEN(arr) - 1)
+        slice_end_pos = arr_len + exp.Literal.number(pos_value)
+        slice_start_pos = slice_end_pos + exp.Literal.number(1)
+
+        # Left slice: arr[1:LEN(arr)+pos]
+        slice_start = exp.Bracket(
+            this=this,
+            expressions=[exp.Slice(this=exp.Literal.number(1), expression=slice_end_pos)],
+        )
+
+        # Right slice: arr[LEN(arr)+pos+1:]
+        slice_end = exp.Bracket(this=this, expressions=[exp.Slice(this=slice_start_pos)])
+
+        concat_exprs = [slice_start, element_array, slice_end]
+
+    # All dialects that support ARRAY_INSERT propagate NULLs (Snowflake/Spark/Databricks)
+    # Wrap in CASE WHEN array IS NULL THEN NULL ELSE func_expr END
+    return self.sql(
+        exp.If(
+            this=exp.Is(this=this, expression=exp.Null()),
+            true=exp.Null(),
+            false=self.func("LIST_CONCAT", *concat_exprs),
+        )
+    )
+
+
 @unsupported_args(("expression", "DuckDB's ARRAY_SORT does not support a comparator."))
 def _array_sort_sql(self: DuckDB.Generator, expression: exp.ArraySort) -> str:
     return self.func("ARRAY_SORT", expression.this)
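The slice arithmetic in `_array_insert_sql` is easiest to verify against plain Python lists; a minimal model of the normalization and splicing (0-based Python slices standing in for DuckDB's 1-based ones, not sqlglot code):

```python
def array_insert(arr, pos, elem, index_offset=0):
    # Normalize 1-based (Spark-style) positions to 0-based (Snowflake-style) ones
    if pos > 0:
        pos -= index_offset
    elif pos < 0:
        pos += index_offset
    if pos == 0:
        return [elem] + arr                    # insert at the beginning
    if pos > 0:
        return arr[:pos] + [elem] + arr[pos:]  # arr[1:pos] ++ [elem] ++ arr[pos+1:]
    cut = len(arr) + pos                       # negative position counts from the end
    return arr[:cut] + [elem] + arr[cut:]

assert array_insert(["a", "b", "c"], -2, "d", index_offset=1) == ["a", "b", "d", "c"]
assert array_insert(["a", "b", "c"], 0, "d") == ["d", "a", "b", "c"]
```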
@@ -171,6 +455,10 @@ def _build_sort_array_desc(args: t.List) -> exp.Expression:
     return exp.SortArray(this=seq_get(args, 0), asc=exp.false())
 
 
+def _build_array_prepend(args: t.List) -> exp.Expression:
+    return exp.ArrayPrepend(this=seq_get(args, 1), expression=seq_get(args, 0))
+
+
 def _build_date_diff(args: t.List) -> exp.Expression:
     return exp.DateDiff(this=seq_get(args, 2), expression=seq_get(args, 1), unit=seq_get(args, 0))
 
@@ -212,6 +500,14 @@ def _show_parser(*args: t.Any, **kwargs: t.Any) -> t.Callable[[DuckDB.Parser], e
 
 
 def _struct_sql(self: DuckDB.Generator, expression: exp.Struct) -> str:
+    ancestor_cast = expression.find_ancestor(exp.Cast, exp.Select)
+    ancestor_cast = None if isinstance(ancestor_cast, exp.Select) else ancestor_cast
+
+    # An empty struct cast works with MAP(), since DuckDB can't parse {}
+    if not expression.expressions:
+        if isinstance(ancestor_cast, exp.Cast) and ancestor_cast.to.is_type(exp.DataType.Type.MAP):
+            return "MAP()"
+
     args: t.List[str] = []
 
     # BigQuery allows inline construction such as "STRUCT<a STRING, b INTEGER>('str', 1)" which is
@@ -219,7 +515,6 @@ def _struct_sql(self: DuckDB.Generator, expression: exp.Struct) -> str:
     # The transformation to ROW will take place if:
     #  1. The STRUCT itself does not have proper fields (key := value) as a "proper" STRUCT would
     #  2. A cast to STRUCT / ARRAY of STRUCTs is found
-    ancestor_cast = expression.find_ancestor(exp.Cast)
     is_bq_inline_struct = (
         (expression.find(exp.PropertyEQ) is None)
         and ancestor_cast
@@ -231,16 +526,16 @@ def _struct_sql(self: DuckDB.Generator, expression: exp.Struct) -> str:
 
     for i, expr in enumerate(expression.expressions):
         is_property_eq = isinstance(expr, exp.PropertyEQ)
+        this = expr.this
         value = expr.expression if is_property_eq else expr
 
         if is_bq_inline_struct:
             args.append(self.sql(value))
         else:
-            if is_property_eq:
-                if isinstance(expr.this, exp.Identifier):
-                    key = self.sql(exp.Literal.string(expr.name))
-                else:
-                    key = self.sql(expr.this)
+            if isinstance(this, exp.Identifier):
+                key = self.sql(exp.Literal.string(expr.name))
+            elif is_property_eq:
+                key = self.sql(this)
             else:
                 key = self.sql(exp.Literal.string(f"_{i}"))
 
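A small note on the key fallback above: fields that are neither identifiers nor explicit `key := value` pairs get positional `_<i>` names. A one-line model (plain Python, not sqlglot code):

```python
# Positional struct fields receive synthetic keys _0, _1, ... per the else branch above.
values = ["a", 1, True]
print({f"_{i}": v for i, v in enumerate(values)})  # {'_0': 'a', '_1': 1, '_2': True}
```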
@@ -269,21 +564,77 @@ def _json_format_sql(self: DuckDB.Generator, expression: exp.JSONFormat) -> str:
     return f"CAST({sql} AS TEXT)"
 
 
+def _seq_sql(self: DuckDB.Generator, expression: exp.Func, byte_width: int) -> str:
+    """
+    Transpile Snowflake SEQ1/SEQ2/SEQ4/SEQ8 to DuckDB.
+
+    Generates monotonically increasing integers starting from 0.
+    The signed parameter (0 or 1) affects the wrap-around behavior:
+    - Unsigned (0): wraps at 2^(bits) - 1
+    - Signed (1): wraps at 2^(bits-1) - 1, then goes negative
+
+    Note: SEQ in WHERE, HAVING, aggregates, or a window ORDER BY is not supported,
+    because these contexts don't allow window functions. Users should rewrite
+    using CTEs or subqueries.
+
+    Args:
+        expression: The SEQ function expression (may have a 'this' arg for the signed param)
+        byte_width: 1, 2, 4, or 8 bytes
+
+    Returns:
+        SQL string using ROW_NUMBER() with modulo for wrap-around
+    """
+    # Warn if SEQ is in a restricted context (Select stops the search at the current scope)
+    ancestor = expression.find_ancestor(*_SEQ_RESTRICTED)
+    if ancestor and (
+        (not isinstance(ancestor, (exp.Order, exp.Select)))
+        or (isinstance(ancestor, exp.Order) and isinstance(ancestor.parent, exp.Window))
+    ):
+        self.unsupported("SEQ in restricted context is not supported - use CTE or subquery")
+
+    bits = byte_width * 8
+    max_val = exp.Literal.number(2**bits)
+
+    if expression.name == "1":
+        half = exp.Literal.number(2 ** (bits - 1))
+        result = exp.replace_placeholders(self.SEQ_SIGNED.copy(), max_val=max_val, half=half)
+    else:
+        result = exp.replace_placeholders(self.SEQ_UNSIGNED.copy(), max_val=max_val)
+
+    return self.sql(result)
+
+
 def _unix_to_time_sql(self: DuckDB.Generator, expression: exp.UnixToTime) -> str:
     scale = expression.args.get("scale")
     timestamp = expression.this
+    target_type = expression.args.get("target_type")
+
+    # Check if we need NTZ (a naive timestamp in UTC)
+    is_ntz = target_type and target_type.this in (
+        exp.DataType.Type.TIMESTAMP,
+        exp.DataType.Type.TIMESTAMPNTZ,
+    )
 
-    if scale in (None, exp.UnixToTime.SECONDS):
-        return self.func("TO_TIMESTAMP", timestamp)
     if scale == exp.UnixToTime.MILLIS:
+        # EPOCH_MS already returns TIMESTAMP (naive, UTC)
         return self.func("EPOCH_MS", timestamp)
     if scale == exp.UnixToTime.MICROS:
+        # MAKE_TIMESTAMP already returns TIMESTAMP (naive, UTC)
         return self.func("MAKE_TIMESTAMP", timestamp)
 
-    return self.func("TO_TIMESTAMP", exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)))
+    # Other scales: divide and use TO_TIMESTAMP
+    if scale not in (None, exp.UnixToTime.SECONDS):
+        timestamp = exp.Div(this=timestamp, expression=exp.func("POW", 10, scale))
+
+    to_timestamp: exp.Expression = exp.Anonymous(this="TO_TIMESTAMP", expressions=[timestamp])
+
+    if is_ntz:
+        to_timestamp = exp.AtTimeZone(this=to_timestamp, zone=exp.Literal.string("UTC"))
+
+    return self.sql(to_timestamp)
 
 
-WRAPPED_JSON_EXTRACT_EXPRESSIONS = (exp.Binary, exp.Bracket, exp.In)
+WRAPPED_JSON_EXTRACT_EXPRESSIONS = (exp.Binary, exp.Bracket, exp.In, exp.Not)
 
 
 def _arrow_json_extract_sql(self: DuckDB.Generator, expression: JSON_EXTRACT_TYPE) -> str:
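The wrap-around rule in `_seq_sql`'s docstring can be checked numerically; a plain-Python model of the generated `ROW_NUMBER()`/modulo expression (not sqlglot code):

```python
def seq_value(row_number: int, byte_width: int, signed: bool) -> int:
    bits = byte_width * 8
    max_val = 2**bits
    v = (row_number - 1) % max_val  # (ROW_NUMBER() OVER (ORDER BY 1) - 1) % max_val
    if signed and v >= 2 ** (bits - 1):
        v -= max_val  # signed values wrap into the negative range
    return v

assert seq_value(row_number=257, byte_width=1, signed=False) == 0    # unsigned wraps at 256
assert seq_value(row_number=129, byte_width=1, signed=True) == -128  # signed goes negative
```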
@@ -373,9 +724,13 @@ def _build_week_trunc_expression(date_expr: exp.Expression, start_dow: int) -> e
 
 
 def _date_diff_sql(self: DuckDB.Generator, expression: exp.DateDiff) -> str:
+    unit = expression.unit
+
+    if _is_nanosecond_unit(unit):
+        return _handle_nanosecond_diff(self, expression.this, expression.expression)
+
     this = _implicit_datetime_cast(expression.this)
     expr = _implicit_datetime_cast(expression.expression)
-    unit = expression.args.get("unit")
 
     # DuckDB's WEEK diff does not respect Monday crossing (week boundaries), it checks (end_day - start_day) / 7:
     # SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-17' AS DATE)) --> 0 (Monday crossed)
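For the NANOSECOND path above, the rewrite amounts to integer subtraction of nanosecond epochs, since DATE_DIFF lacks that unit. A quick model with `datetime` (plain Python, not sqlglot code):

```python
from datetime import datetime

start = datetime(2024, 1, 1, 0, 0, 0)
end = datetime(2024, 1, 1, 0, 0, 1, microsecond=500)

# EPOCH_NS(end) - EPOCH_NS(start), modeled via an exact timedelta
delta = end - start
diff_ns = (delta.days * 86_400 + delta.seconds) * 1_000_000_000 + delta.microseconds * 1_000
print(diff_ns)  # 1000500000
```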
@@ -427,11 +782,17 @@ def _json_extract_value_array_sql(
 
 
 def _cast_to_varchar(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
-    if arg and arg.type and not arg.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN):
+    if arg and arg.type and not arg.is_type(*exp.DataType.TEXT_TYPES, exp.DataType.Type.UNKNOWN):
         return exp.cast(arg, exp.DataType.Type.VARCHAR)
     return arg
 
 
+def _cast_to_boolean(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
+    if arg and not arg.is_type(exp.DataType.Type.BOOLEAN):
+        return exp.cast(arg, exp.DataType.Type.BOOLEAN)
+    return arg
+
+
 def _is_binary(arg: exp.Expression) -> bool:
     return arg.is_type(
         exp.DataType.Type.BINARY,
@@ -466,6 +827,76 @@ def _prepare_binary_bitwise_args(expression: exp.Binary) -> None:
         expression.set("expression", _cast_to_bit(expression.expression))
 
 
+def _day_navigation_sql(
+    self: DuckDB.Generator, expression: t.Union[exp.NextDay, exp.PreviousDay]
+) -> str:
+    """
+    Transpile Snowflake's NEXT_DAY / PREVIOUS_DAY to DuckDB using date arithmetic.
+
+    Returns the DATE of the next/previous occurrence of the specified weekday.
+
+    Formulas:
+    - NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1
+    - PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1
+
+    Supports both literal and non-literal day names:
+    - Literal: direct lookup (e.g., 'Monday' → 1)
+    - Non-literal: a CASE statement for runtime evaluation
+
+    Examples:
+        NEXT_DAY('2024-01-01' (Monday), 'Monday')
+            → (1 - 1 + 6) % 7 + 1 = 6 % 7 + 1 = 7 days → 2024-01-08
+
+        PREVIOUS_DAY('2024-01-15' (Monday), 'Friday')
+            → (1 - 5 + 6) % 7 + 1 = 2 % 7 + 1 = 3 days → 2024-01-12
+    """
+    date_expr = expression.this
+    day_name_expr = expression.expression
+
+    # Build the ISODOW call for the current day of the week
+    isodow_call = exp.func("ISODOW", date_expr)
+
+    # Determine the target day of the week
+    if isinstance(day_name_expr, exp.Literal):
+        # Literal day name: look up target_dow directly
+        day_name_str = day_name_expr.name.upper()
+        matching_day = next(
+            (day for day in WEEK_START_DAY_TO_DOW if day.startswith(day_name_str)), None
+        )
+        if matching_day:
+            target_dow: exp.Expression = exp.Literal.number(WEEK_START_DAY_TO_DOW[matching_day])
+        else:
+            # Unrecognized day name, use the fallback
+            return self.function_fallback_sql(expression)
+    else:
+        # Non-literal day name: build a CASE statement for the runtime mapping
+        upper_day_name = exp.Upper(this=day_name_expr)
+        target_dow = exp.Case(
+            ifs=[
+                exp.If(
+                    this=exp.func(
+                        "STARTS_WITH", upper_day_name.copy(), exp.Literal.string(day[:2])
+                    ),
+                    true=exp.Literal.number(dow_num),
+                )
+                for day, dow_num in WEEK_START_DAY_TO_DOW.items()
+            ]
+        )
+
+    # Calculate the days offset and apply the interval based on the direction
+    if isinstance(expression, exp.NextDay):
+        # NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1
+        days_offset = exp.paren(target_dow - isodow_call + 6, copy=False) % 7 + 1
+        date_with_offset = date_expr + exp.Interval(this=days_offset, unit=exp.var("DAY"))
+    else:  # exp.PreviousDay
+        # PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1
+        days_offset = exp.paren(isodow_call - target_dow + 6, copy=False) % 7 + 1
+        date_with_offset = date_expr - exp.Interval(this=days_offset, unit=exp.var("DAY"))
+
+    # Build the final expression: CAST(date_with_offset AS DATE)
+    return self.sql(exp.cast(date_with_offset, exp.DataType.Type.DATE))
+
+
 def _anyvalue_sql(self: DuckDB.Generator, expression: exp.AnyValue) -> str:
     # Transform ANY_VALUE(expr HAVING MAX/MIN having_expr) to ARG_MAX_NULL/ARG_MIN_NULL
     having = expression.this
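The `(target_dow - current_dow + 6) % 7 + 1` formulas above are easy to sanity-check with Python's ISO weekday numbering (Monday=1..Sunday=7, matching DuckDB's ISODOW); a standalone check, not sqlglot code:

```python
from datetime import date, timedelta

def next_day(d: date, target_dow: int) -> date:
    offset = (target_dow - d.isoweekday() + 6) % 7 + 1
    return d + timedelta(days=offset)

def previous_day(d: date, target_dow: int) -> date:
    offset = (d.isoweekday() - target_dow + 6) % 7 + 1
    return d - timedelta(days=offset)

assert next_day(date(2024, 1, 1), 1) == date(2024, 1, 8)        # Monday -> next Monday
assert previous_day(date(2024, 1, 15), 5) == date(2024, 1, 12)  # Monday -> previous Friday
```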
@@ -475,6 +906,39 @@ def _anyvalue_sql(self: DuckDB.Generator, expression: exp.AnyValue) -> str:
     return self.function_fallback_sql(expression)
 
 
+def _bitwise_agg_sql(
+    self: DuckDB.Generator,
+    expression: t.Union[exp.BitwiseOrAgg, exp.BitwiseAndAgg, exp.BitwiseXorAgg],
+) -> str:
+    """
+    DuckDB's bitwise aggregate functions only accept integer types. For other types:
+    - DECIMAL/STRING: Use CAST(arg AS INT) to convert directly, will round to nearest int
+    - FLOAT/DOUBLE: Use ROUND(arg)::INT to round to nearest integer, required due to float precision loss
+    """
+    if isinstance(expression, exp.BitwiseOrAgg):
+        func_name = "BIT_OR"
+    elif isinstance(expression, exp.BitwiseAndAgg):
+        func_name = "BIT_AND"
+    else:  # exp.BitwiseXorAgg
+        func_name = "BIT_XOR"
+
+    arg = expression.this
+
+    if not arg.type:
+        from sqlglot.optimizer.annotate_types import annotate_types
+
+        arg = annotate_types(arg, dialect=self.dialect)
+
+    if arg.is_type(*exp.DataType.REAL_TYPES, *exp.DataType.TEXT_TYPES):
+        if arg.is_type(*exp.DataType.FLOAT_TYPES):
+            # Float types need to be rounded first due to precision loss
+            arg = exp.func("ROUND", arg)
+
+        arg = exp.cast(arg, exp.DataType.Type.INT)
+
+    return self.func(func_name, arg)
+
+
 def _literal_sql_with_ws_chr(self: DuckDB.Generator, literal: str) -> str:
     # DuckDB does not support \uXXXX escapes, so we must use CHR() instead of replacing them directly
     if not any(ch in WS_CONTROL_CHARS_TO_DUCK for ch in literal):
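Why `_bitwise_agg_sql` rounds floats before the integer cast: truncating a value that sits just below an integer changes the aggregate's bits. A plain-Python illustration (Python's truncating `int()` stands in for a truncating cast):

```python
from functools import reduce
from operator import or_

values = [2.9999999999999996, 4.0]                      # the first "should" be 3
truncated = reduce(or_, (int(v) for v in values))       # int() truncates: 2 | 4 = 6
rounded = reduce(or_, (int(round(v)) for v in values))  # round first:     3 | 4 = 7
print(truncated, rounded)  # 6 7
```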
@@ -571,26 +1035,102 @@ def _initcap_sql(self: DuckDB.Generator, expression: exp.Initcap) -> str:
     return _build_capitalization_sql(self, this_sql, escaped_delimiters_sql)
 
 
-def _floor_sql(self: DuckDB.Generator, expression: exp.Floor) -> str:
-    decimals = expression.args.get("decimals")
+def _boolxor_agg_sql(self: DuckDB.Generator, expression: exp.BoolxorAgg) -> str:
+    """
+    Snowflake's `BOOLXOR_AGG(col)` returns TRUE if exactly one input in `col` is TRUE, FALSE otherwise.
+    Since DuckDB does not have a matching function, we mimic the behavior by generating `COUNT_IF(col) = 1`.
+
+    DuckDB's COUNT_IF strictly requires boolean inputs, so cast if not already boolean.
+    """
+    return self.sql(
+        exp.EQ(
+            this=exp.CountIf(this=_cast_to_boolean(expression.this)),
+            expression=exp.Literal.number(1),
+        )
+    )
 
-    if decimals is not None and expression.args.get("to") is None:
-        this = expression.this
-        if isinstance(this, exp.Binary):
-            this = exp.Paren(this=this)
 
-        n_int = decimals
-        if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
-            n_int = exp.cast(decimals, exp.DataType.Type.INT)
+def _bitshift_sql(
+    self: DuckDB.Generator, expression: exp.BitwiseLeftShift | exp.BitwiseRightShift
+) -> str:
+    """
+    Transform bitshift expressions for DuckDB by injecting BIT/INT128 casts.
 
-        pow_ = exp.Pow(this=exp.Literal.number("10"), expression=n_int)
-        floored = exp.Floor(this=exp.Mul(this=this, expression=pow_))
-        result = exp.Div(this=floored, expression=pow_.copy())
+    DuckDB's bitwise shift operators don't work with BLOB/BINARY types, so we cast
+    them to BIT for the operation, then cast the result back to the original type.
 
-        return self.round_sql(
-            exp.Round(this=result, decimals=decimals, casts_non_integer_decimals=True)
+    Note: Assumes type annotation has been applied with the source dialect.
+    """
+    operator = "<<" if isinstance(expression, exp.BitwiseLeftShift) else ">>"
+    result_is_blob = False
+    this = expression.this
+
+    if _is_binary(this):
+        result_is_blob = True
+        expression.set("this", exp.cast(this, exp.DataType.Type.BIT))
+    elif expression.args.get("requires_int128"):
+        this.replace(exp.cast(this, exp.DataType.Type.INT128))
+
+    result_sql = self.binary(expression, operator)
+
+    # Wrap in parentheses if the parent is a bitwise operator to "fix" a DuckDB precedence issue:
+    # DuckDB parses a << b | c << d as (a << b | c) << d
+    if isinstance(expression.parent, exp.Binary):
+        result_sql = self.sql(exp.Paren(this=result_sql))
+
+    if result_is_blob:
+        result_sql = self.sql(
+            exp.Cast(this=result_sql, to=exp.DataType.build("BLOB", dialect="duckdb"))
         )
 
+    return result_sql
+
+
+def _scale_rounding_sql(
+    self: DuckDB.Generator,
+    expression: exp.Expression,
+    rounding_func: type[exp.Expression],
+) -> str | None:
+    """
+    Handle the scale parameter transformation for rounding functions.
+
+    DuckDB doesn't support the scale parameter for certain functions (e.g., FLOOR, CEIL),
+    so we transform FUNC(x, n) to ROUND(FUNC(x * 10^n) / 10^n, n).
+
+    Args:
+        self: The DuckDB generator instance
+        expression: The expression to transform (must have 'this', 'decimals', and 'to' args)
+        rounding_func: The rounding function class to use in the transformation
+
+    Returns:
+        The transformed SQL string if the decimals parameter exists, None otherwise
+    """
+    decimals = expression.args.get("decimals")
+
+    if decimals is None or expression.args.get("to") is not None:
+        return None
+
+    this = expression.this
+    if isinstance(this, exp.Binary):
+        this = exp.Paren(this=this)
+
+    n_int = decimals
+    if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
+        n_int = exp.cast(decimals, exp.DataType.Type.INT)
+
+    pow_ = exp.Pow(this=exp.Literal.number("10"), expression=n_int)
+    rounded = rounding_func(this=exp.Mul(this=this, expression=pow_))
+    result = exp.Div(this=rounded, expression=pow_.copy())
+
+    return self.round_sql(
+        exp.Round(this=result, decimals=decimals, casts_non_integer_decimals=True)
+    )
+
+
+def _ceil_floor(self: DuckDB.Generator, expression: exp.Floor | exp.Ceil) -> str:
+    scaled_sql = _scale_rounding_sql(self, expression, type(expression))
+    if scaled_sql is not None:
+        return scaled_sql
     return self.ceil_floor(expression)
 
 
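The `FUNC(x, n) -> ROUND(FUNC(x * 10^n) / 10^n, n)` rewrite in `_scale_rounding_sql`, modeled in plain Python for FLOOR (a standalone sketch, not sqlglot code):

```python
import math

def floor_with_scale(x: float, n: int) -> float:
    p = 10**n
    # Scale up, floor at integer precision, scale back down, then round to n places
    return round(math.floor(x * p) / p, n)

assert floor_with_scale(3.14159, 2) == 3.14
assert floor_with_scale(-2.718, 1) == -2.8  # FLOOR goes toward negative infinity
```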
@@ -648,6 +1188,89 @@ def _regr_val_sql(
 
     )
 
+def _maybe_corr_null_to_false(
+    expression: t.Union[exp.Filter, exp.Window, exp.Corr],
+) -> t.Optional[t.Union[exp.Filter, exp.Window, exp.Corr]]:
+    corr = expression
+    while isinstance(corr, (exp.Window, exp.Filter)):
+        corr = corr.this
+
+    if not isinstance(corr, exp.Corr) or not corr.args.get("null_on_zero_variance"):
+        return None
+
+    corr.set("null_on_zero_variance", False)
+    return expression
+
+
+def _date_from_parts_sql(self, expression: exp.DateFromParts) -> str:
+    """
+    Snowflake's DATE_FROM_PARTS allows out-of-range values for the month and day inputs,
+    e.g. larger values (month=13, day=100), zero values (month=0, day=0), negative values (month=-13, day=-100).
+
+    DuckDB's MAKE_DATE does not support out-of-range values, but DuckDB's INTERVAL type does.
+
+    We convert to date arithmetic:
+        DATE_FROM_PARTS(year, month, day)
+        -> MAKE_DATE(year, 1, 1) + INTERVAL (month-1) MONTH + INTERVAL (day-1) DAY
+    """
+    year_expr = expression.args.get("year")
+    month_expr = expression.args.get("month")
+    day_expr = expression.args.get("day")
+
+    if expression.args.get("allow_overflow"):
+        base_date: exp.Expression = exp.func(
+            "MAKE_DATE", year_expr, exp.Literal.number(1), exp.Literal.number(1)
+        )
+
+        if month_expr:
+            base_date = base_date + exp.Interval(this=month_expr - 1, unit=exp.var("MONTH"))
+
+        if day_expr:
+            base_date = base_date + exp.Interval(this=day_expr - 1, unit=exp.var("DAY"))
+
+        return self.sql(exp.cast(expression=base_date, to=exp.DataType.Type.DATE))
+
+    return self.func("MAKE_DATE", year_expr, month_expr, day_expr)
+
+
+def _round_arg(arg: exp.Expression, round_input: t.Optional[bool] = None) -> exp.Expression:
+    if round_input:
+        return exp.func("ROUND", arg, exp.Literal.number(0))
+    return arg
+
+
+def _boolnot_sql(self: DuckDB.Generator, expression: exp.Boolnot) -> str:
+    arg = _round_arg(expression.this, expression.args.get("round_input"))
+    return self.sql(exp.not_(exp.paren(arg)))
+
+
+def _booland_sql(self: DuckDB.Generator, expression: exp.Booland) -> str:
+    round_input = expression.args.get("round_input")
+    left = _round_arg(expression.this, round_input)
+    right = _round_arg(expression.expression, round_input)
+    return self.sql(exp.paren(exp.and_(exp.paren(left), exp.paren(right), wrap=False)))
+
+
+def _boolor_sql(self: DuckDB.Generator, expression: exp.Boolor) -> str:
+    round_input = expression.args.get("round_input")
+    left = _round_arg(expression.this, round_input)
+    right = _round_arg(expression.expression, round_input)
+    return self.sql(exp.paren(exp.or_(exp.paren(left), exp.paren(right), wrap=False)))
+
+
+def _xor_sql(self: DuckDB.Generator, expression: exp.Xor) -> str:
+    round_input = expression.args.get("round_input")
+    left = _round_arg(expression.this, round_input)
+    right = _round_arg(expression.expression, round_input)
+    return self.sql(
+        exp.or_(
+            exp.paren(exp.and_(left.copy(), exp.paren(right.not_()), wrap=False)),
+            exp.paren(exp.and_(exp.paren(left.not_()), right.copy(), wrap=False)),
+            wrap=False,
+        )
+    )
+
+
 class DuckDB(Dialect):
     NULL_ORDERING = "nulls_are_last"
     SUPPORTS_USER_DEFINED_TYPES = True
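`_xor_sql` expands XOR into AND/OR/NOT; the identity it relies on can be checked with a four-row truth table (plain Python, not sqlglot code):

```python
from itertools import product

# x XOR y == (x AND NOT y) OR (NOT x AND y)
for x, y in product([False, True], repeat=2):
    assert (x != y) == ((x and not y) or (not x and y))
```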
@@ -667,10 +1290,23 @@ class DuckDB(Dialect):
         "DAYOFWEEKISO": "ISODOW",
     }
 
+    EXPRESSION_METADATA = EXPRESSION_METADATA.copy()
+
     DATE_PART_MAPPING.pop("WEEKDAY")
 
     INVERSE_TIME_MAPPING = {
         "%e": "%-d",  # BigQuery's space-padded day (%e) -> DuckDB's no-padding day (%-d)
+        "%:z": "%z",  # In DuckDB %z can represent ±HH:MM, ±HHMM, or ±HH.
+        "%-z": "%z",
+        "%f_zero": "%n",
+        "%f_one": "%n",
+        "%f_two": "%n",
+        "%f_three": "%g",
+        "%f_four": "%n",
+        "%f_five": "%n",
+        "%f_seven": "%n",
+        "%f_eight": "%n",
+        "%f_nine": "%n",
     }
 
     def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
@@ -687,6 +1323,7 @@ class DuckDB(Dialect):
 
     class Tokenizer(tokens.Tokenizer):
         BYTE_STRINGS = [("e'", "'"), ("E'", "'")]
+        BYTE_STRING_ESCAPES = ["'", "\\"]
         HEREDOC_STRINGS = ["$"]
 
         HEREDOC_TAG_IS_IDENTIFIER = True
@@ -749,7 +1386,7 @@ class DuckDB(Dialect):
             **parser.Parser.RANGE_PARSERS,
             TokenType.DAMP: binary_range_parser(exp.ArrayOverlaps),
             TokenType.CARET_AT: binary_range_parser(exp.StartsWith),
-            TokenType.TILDA: binary_range_parser(exp.RegexpFullMatch),
+            TokenType.TILDE: binary_range_parser(exp.RegexpFullMatch),
         }
 
         EXPONENT = {
@@ -768,6 +1405,7 @@ class DuckDB(Dialect):
         FUNCTIONS = {
             **parser.Parser.FUNCTIONS,
             "ANY_VALUE": lambda args: exp.IgnoreNulls(this=exp.AnyValue.from_arg_list(args)),
+            "ARRAY_PREPEND": _build_array_prepend,
             "ARRAY_REVERSE_SORT": _build_sort_array_desc,
             "ARRAY_SORT": exp.SortArray.from_arg_list,
             "BIT_AND": exp.BitwiseAndAgg.from_arg_list,
@@ -789,15 +1427,21 @@ class DuckDB(Dialect):
                 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
             ),
             "GENERATE_SERIES": _build_generate_series(),
+            "GET_BIT": lambda args: exp.Getbit(
+                this=seq_get(args, 0), expression=seq_get(args, 1), zero_is_msb=True
+            ),
             "JSON": exp.ParseJSON.from_arg_list,
             "JSON_EXTRACT_PATH": parser.build_extract_json_with_path(exp.JSONExtract),
             "JSON_EXTRACT_STRING": parser.build_extract_json_with_path(exp.JSONExtractScalar),
+            "LIST_APPEND": exp.ArrayAppend.from_arg_list,
+            "LIST_CONCAT": parser.build_array_concat,
             "LIST_CONTAINS": exp.ArrayContains.from_arg_list,
             "LIST_COSINE_DISTANCE": exp.CosineDistance.from_arg_list,
             "LIST_DISTANCE": exp.EuclideanDistance.from_arg_list,
             "LIST_FILTER": exp.ArrayFilter.from_arg_list,
             "LIST_HAS": exp.ArrayContains.from_arg_list,
             "LIST_HAS_ANY": exp.ArrayOverlaps.from_arg_list,
+            "LIST_PREPEND": _build_array_prepend,
             "LIST_REVERSE_SORT": _build_sort_array_desc,
             "LIST_SORT": exp.SortArray.from_arg_list,
             "LIST_TRANSFORM": exp.Transform.from_arg_list,
@@ -1061,7 +1705,6 @@ class DuckDB(Dialect):
         COPY_HAS_INTO_KEYWORD = False
         STAR_EXCEPT = "EXCLUDE"
         PAD_FILL_PATTERN_IS_REQUIRED = True
-        ARRAY_CONCAT_IS_VAR_LEN = False
         ARRAY_SIZE_DIM_REQUIRED = False
         NORMALIZE_EXTRACT_DATE_PARTS = True
         SUPPORTS_LIKE_QUANTIFIERS = False
@@ -1071,80 +1714,124 @@ class DuckDB(Dialect):
             **generator.Generator.TRANSFORMS,
             exp.AnyValue: _anyvalue_sql,
             exp.ApproxDistinct: approx_count_distinct_sql,
+            exp.Boolnot: _boolnot_sql,
+            exp.Booland: _booland_sql,
+            exp.Boolor: _boolor_sql,
             exp.Array: transforms.preprocess(
                 [transforms.inherit_struct_field_names],
                 generator=inline_array_unless_query,
             ),
+            exp.ArrayAppend: array_append_sql("LIST_APPEND"),
+            exp.ArrayCompact: array_compact_sql,
+            exp.ArrayConstructCompact: lambda self, e: self.sql(
+                exp.ArrayCompact(this=exp.Array(expressions=e.expressions))
+            ),
+            exp.ArrayConcat: array_concat_sql("LIST_CONCAT"),
             exp.ArrayFilter: rename_func("LIST_FILTER"),
+            exp.ArrayInsert: _array_insert_sql,
             exp.ArrayRemove: remove_from_array_using_filter,
             exp.ArraySort: _array_sort_sql,
+            exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True),
             exp.ArraySum: rename_func("LIST_SUM"),
             exp.ArrayUniqueAgg: lambda self, e: self.func(
                 "LIST", exp.Distinct(expressions=[e.this])
             ),
+            exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False),
+            exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True),
             exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"),
-            exp.BitwiseAndAgg: rename_func("BIT_AND"),
+            exp.BitwiseAndAgg: _bitwise_agg_sql,
+            exp.BitwiseLeftShift: _bitshift_sql,
             exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"),
-            exp.BitwiseOrAgg: rename_func("BIT_OR"),
-            exp.BitwiseXorAgg: rename_func("BIT_XOR"),
+            exp.BitwiseOrAgg: _bitwise_agg_sql,
+            exp.BitwiseRightShift: _bitshift_sql,
+            exp.BitwiseXorAgg: _bitwise_agg_sql,
             exp.CommentColumnConstraint: no_comment_column_constraint_sql,
+            exp.Corr: lambda self, e: self._corr_sql(e),
             exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"),
             exp.CurrentTime: lambda *_: "CURRENT_TIME",
-            exp.CurrentTimestamp: lambda *_: "CURRENT_TIMESTAMP",
+            exp.CurrentTimestamp: lambda self, e: self.sql(
+                exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC"))
+            )
+            if e.args.get("sysdate")
+            else "CURRENT_TIMESTAMP",
+            exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"),
             exp.DayOfMonth: rename_func("DAYOFMONTH"),
             exp.DayOfWeek: rename_func("DAYOFWEEK"),
             exp.DayOfWeekIso: rename_func("ISODOW"),
             exp.DayOfYear: rename_func("DAYOFYEAR"),
+            exp.Dayname: lambda self, e: (
+                self.func("STRFTIME", e.this, exp.Literal.string("%a"))
+                if e.args.get("abbreviated")
+                else self.func("DAYNAME", e.this)
+            ),
+            exp.Monthname: lambda self, e: (
+                self.func("STRFTIME", e.this, exp.Literal.string("%b"))
+                if e.args.get("abbreviated")
+                else self.func("MONTHNAME", e.this)
+            ),
             exp.DataType: _datatype_sql,
             exp.Date: _date_sql,
-            exp.DateAdd: date_delta_to_binary_interval_op(),
-            exp.DateFromParts: rename_func("MAKE_DATE"),
-            exp.DateSub: date_delta_to_binary_interval_op(),
+            exp.DateAdd: _date_delta_to_binary_interval_op(),
+            exp.DateFromParts: _date_from_parts_sql,
+            exp.DateSub: _date_delta_to_binary_interval_op(),
             exp.DateDiff: _date_diff_sql,
             exp.DateStrToDate: datestrtodate_sql,
             exp.Datetime: no_datetime_sql,
             exp.DatetimeDiff: _date_diff_sql,
-            exp.DatetimeSub: date_delta_to_binary_interval_op(),
-            exp.DatetimeAdd: date_delta_to_binary_interval_op(),
+            exp.DatetimeSub: _date_delta_to_binary_interval_op(),
+            exp.DatetimeAdd: _date_delta_to_binary_interval_op(),
             exp.DateToDi: lambda self,
             e: f"CAST(STRFTIME({self.sql(e, 'this')}, {DuckDB.DATEINT_FORMAT}) AS INT)",
             exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False),
             exp.DiToDate: lambda self,
             e: f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {DuckDB.DATEINT_FORMAT}) AS DATE)",
             exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False),
+            exp.EqualNull: lambda self, e: self.sql(
+                exp.NullSafeEQ(this=e.this, expression=e.expression)
+            ),
             exp.EuclideanDistance: rename_func("LIST_DISTANCE"),
             exp.GenerateDateArray: _generate_datetime_array_sql,
             exp.GenerateTimestampArray: _generate_datetime_array_sql,
+            exp.Getbit: getbit_sql,
             exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False),
             exp.Explode: rename_func("UNNEST"),
             exp.IntDiv: lambda self, e: self.binary(e, "//"),
             exp.IsInf: rename_func("ISINF"),
             exp.IsNan: rename_func("ISNAN"),
-            exp.Floor: _floor_sql,
+            exp.IsNullValue: lambda self, e: self.sql(
+                exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL"))
+            ),
+            exp.IsArray: lambda self, e: self.sql(
+                exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY"))
+            ),
+            exp.Ceil: _ceil_floor,
+            exp.Floor: _ceil_floor,
             exp.JSONBExists: rename_func("JSON_EXISTS"),
             exp.JSONExtract: _arrow_json_extract_sql,
             exp.JSONExtractArray: _json_extract_value_array_sql,
             exp.JSONFormat: _json_format_sql,
             exp.JSONValueArray: _json_extract_value_array_sql,
             exp.Lateral: explode_to_unnest_sql,
-            exp.LogicalOr: rename_func("BOOL_OR"),
-            exp.LogicalAnd: rename_func("BOOL_AND"),
+            exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)),
+            exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)),
+            exp.Seq1: lambda self, e: _seq_sql(self, e, 1),
+            exp.Seq2: lambda self, e: _seq_sql(self, e, 2),
+            exp.Seq4: lambda self, e: _seq_sql(self, e, 4),
+            exp.Seq8: lambda self, e: _seq_sql(self, e, 8),
+            exp.BoolxorAgg: _boolxor_agg_sql,
             exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "),
             exp.Initcap: _initcap_sql,
             exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)),
             exp.SHA1Digest: lambda self, e: self.func("UNHEX", self.func("SHA1", e.this)),
             exp.SHA2Digest: lambda self, e: self.func("UNHEX", sha2_digest_sql(self, e)),
-            exp.MonthsBetween: lambda self, e: self.func(
-                "DATEDIFF",
-                "'month'",
-                exp.cast(e.expression, exp.DataType.Type.TIMESTAMP, copy=True),
-                exp.cast(e.this, exp.DataType.Type.TIMESTAMP, copy=True),
-            ),
+            exp.MonthsBetween: months_between_sql,
+            exp.NextDay: _day_navigation_sql,
             exp.PercentileCont: rename_func("QUANTILE_CONT"),
             exp.PercentileDisc: rename_func("QUANTILE_DISC"),
             # DuckDB doesn't allow qualified columns inside of PIVOT expressions.
             # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62
             exp.Pivot: transforms.preprocess([transforms.unqualify_columns]),
+            exp.PreviousDay: _day_navigation_sql,
             exp.RegexpReplace: lambda self, e: self.func(
                 "REGEXP_REPLACE",
                 e.this,
@@ -1172,16 +1859,16 @@ class DuckDB(Dialect):
             ),
             exp.Struct: _struct_sql,
             exp.Transform: rename_func("LIST_TRANSFORM"),
-            exp.TimeAdd: date_delta_to_binary_interval_op(),
-            exp.TimeSub: date_delta_to_binary_interval_op(),
+            exp.TimeAdd: _date_delta_to_binary_interval_op(),
+            exp.TimeSub: _date_delta_to_binary_interval_op(),
             exp.Time: no_time_sql,
             exp.TimeDiff: _timediff_sql,
             exp.Timestamp: no_timestamp_sql,
-            exp.TimestampAdd: date_delta_to_binary_interval_op(),
+            exp.TimestampAdd: _date_delta_to_binary_interval_op(),
             exp.TimestampDiff: lambda self, e: self.func(
                 "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this
             ),
-            exp.TimestampSub: date_delta_to_binary_interval_op(),
+            exp.TimestampSub: _date_delta_to_binary_interval_op(),
             exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DataType.Type.DATE)),
             exp.TimeStrToTime: timestrtotime_sql,
             exp.TimeStrToUnix: lambda self, e: self.func(
@@ -1192,7 +1879,7 @@ class DuckDB(Dialect):
             exp.TimeToUnix: rename_func("EPOCH"),
             exp.TsOrDiToDi: lambda self,
             e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)",
-            exp.TsOrDsAdd: date_delta_to_binary_interval_op(),
+            exp.TsOrDsAdd: _date_delta_to_binary_interval_op(),
             exp.TsOrDsDiff: lambda self, e: self.func(
                 "DATE_DIFF",
                 f"'{e.args.get('unit') or 'DAY'}'",
@@ -1216,13 +1903,23 @@ class DuckDB(Dialect):
             exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)",
             exp.VariancePop: rename_func("VAR_POP"),
             exp.WeekOfYear: rename_func("WEEKOFYEAR"),
-            exp.Xor: bool_xor_sql,
-            exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")(
-                rename_func("LEVENSHTEIN")
+            exp.YearOfWeek: lambda self, e: self.sql(
+                exp.Extract(
+                    this=exp.Var(this="ISOYEAR"),
+                    expression=e.this,
+                )
             ),
+            exp.YearOfWeekIso: lambda self, e: self.sql(
+                exp.Extract(
+                    this=exp.Var(this="ISOYEAR"),
+                    expression=e.this,
+                )
+            ),
+            exp.Xor: _xor_sql,
             exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"),
             exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"),
             exp.DateBin: rename_func("TIME_BUCKET"),
+            exp.LastDay: _last_day_sql,
         }
 
         SUPPORTED_JSON_PATH_PARTS = {
@@ -1247,6 +1944,7 @@ class DuckDB(Dialect):
             exp.DataType.Type.VARBINARY: "BLOB",
             exp.DataType.Type.ROWVERSION: "BLOB",
             exp.DataType.Type.VARCHAR: "TEXT",
+            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMPTZ",
             exp.DataType.Type.TIMESTAMPNTZ: "TIMESTAMP",
             exp.DataType.Type.TIMESTAMP_S: "TIMESTAMP_S",
             exp.DataType.Type.TIMESTAMP_MS: "TIMESTAMP_MS",
@@ -1359,6 +2057,277 @@ class DuckDB(Dialect):
             exp.NthValue,
         )
 
+        # Template for ZIPF transpilation - placeholders get replaced with the actual parameters
+        ZIPF_TEMPLATE: exp.Expression = exp.maybe_parse(
+            """
+            WITH rand AS (SELECT :random_expr AS r),
+            weights AS (
+                SELECT i, 1.0 / POWER(i, :s) AS w
+                FROM RANGE(1, :n + 1) AS t(i)
+            ),
+            cdf AS (
+                SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p
+                FROM weights
+            )
+            SELECT MIN(i)
+            FROM cdf
+            WHERE p >= (SELECT r FROM rand)
+            """
+        )
+
+        # Template for NORMAL transpilation using the Box-Muller transform:
+        # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2))
+        NORMAL_TEMPLATE: exp.Expression = exp.maybe_parse(
+            ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))"
+        )
+
+        # Template for generating a seeded pseudo-random value in [0, 1) from a hash
+        SEEDED_RANDOM_TEMPLATE: exp.Expression = exp.maybe_parse(
+            "(ABS(HASH(:seed)) % 1000000) / 1000000.0"
+        )
+
+        # Templates for generating signed and unsigned SEQ values within a specified range
+        SEQ_UNSIGNED: exp.Expression = exp.maybe_parse(f"{_SEQ_BASE} % :max_val")
+        SEQ_SIGNED: exp.Expression = exp.maybe_parse(
+            f"(CASE WHEN {_SEQ_BASE} % :max_val >= :half "
+            f"THEN {_SEQ_BASE} % :max_val - :max_val "
+            f"ELSE {_SEQ_BASE} % :max_val END)"
+        )
+
+        # Template for MAP_CAT transpilation - Snowflake semantics:
+        # 1. Returns NULL if either input is NULL
+        # 2. For duplicate keys, prefers the non-NULL value (COALESCE(m2[k], m1[k]))
+        # 3. Filters out entries with NULL values from the result
+        MAPCAT_TEMPLATE: exp.Expression = exp.maybe_parse(
+            """
+            CASE
+                WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL
+                ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM(
+                    LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))),
+                    __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k]))
+                ), __x -> __x.value IS NOT NULL))
+            END
+            """
+        )
+
+        # Mappings for EXTRACT/DATE_PART transpilation
+        # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes
+        EXTRACT_STRFTIME_MAPPINGS: t.Dict[str, t.Tuple[str, str]] = {
+            "WEEKISO": ("%V", "INTEGER"),
+            "YEAROFWEEK": ("%G", "INTEGER"),
+            "YEAROFWEEKISO": ("%G", "INTEGER"),
+            "NANOSECOND": ("%n", "BIGINT"),
+        }
+
+        # Maps epoch-based specifiers to DuckDB epoch functions
+        EXTRACT_EPOCH_MAPPINGS: t.Dict[str, str] = {
+            "EPOCH_SECOND": "EPOCH",
+            "EPOCH_MILLISECOND": "EPOCH_MS",
+            "EPOCH_MICROSECOND": "EPOCH_US",
+            "EPOCH_NANOSECOND": "EPOCH_NS",
+        }
+
+        # Template for BITMAP_CONSTRUCT_AGG transpilation
+        #
+        # BACKGROUND:
+        # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap.
+        # It supports values in the range 0-32767; this version returns NULL if any value is out of range.
+        # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg
+        # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts
+        #
+        # Snowflake uses two different formats based on the number of unique values:
+        #
+        # Format 1 - Small bitmap (< 5 unique values): length of 10 bytes
+        #   Bytes 0-1: Count of values as a 2-byte big-endian integer (e.g., 3 values = 0x0003)
+        #   Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes
+        #   Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex)
+        #                                 count v1   v2   v3   pad
+        #
+        # Format 2 - Large bitmap (>= 5 unique values): length of 10 + (2 * count) bytes
+        #   Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes
+        #   Bytes 10+: Each value as a 2-byte little-endian integer (no padding)
+        #   Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500
+        #                                  hdr  ----9 zero bytes---- v1   v2   v3   v4   v5
+        #
+        # TEMPLATE STRUCTURE
+        #
+        # Phase 1 - Innermost subquery: data preparation
+        #   SELECT LIST_SORT(...) AS l
+        #   - Aggregates all input values into a list; removes NULLs and duplicates, and sorts
+        #   Result: clean, sorted list of unique non-null integers stored as 'l'
+        #
+        # Phase 2 - Middle subquery: hex string construction
+        #   LIST_TRANSFORM(...)
+        #   - Converts each integer to its 2-byte little-endian hex representation
+        #   - & 255 extracts the low byte, >> 8 extracts the high byte
+        #   - LIST_REDUCE: concatenates all hex pairs into the single string 'h'
+        #   Result: hex string of all values
+        #
+        # Phase 3 - Outer SELECT: final bitmap assembly
+        #   LENGTH(l) < 5:
+        #   - Small format: 2-byte count (big-endian via %04X) + values + zero padding
+        #   LENGTH(l) >= 5:
+        #   - Large format: fixed 10-byte header + values (no padding needed)
+        #   Result: complete binary bitmap as a BLOB
+        #
+        BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expression = exp.maybe_parse(
+            """
+            SELECT CASE
+                WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL
+                WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL
+                WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2))
2179
+ ELSE UNHEX('08000000000000000000' || h)
2180
+ END
2181
+ FROM (
2182
+ SELECT l, COALESCE(LIST_REDUCE(
2183
+ LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)),
2184
+ (__a, __b) -> __a || __b, ''
2185
+ ), '') AS h
2186
+ FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l)
2187
+ )
2188
+ """
2189
+ )
2190
+
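The byte layouts documented above are easy to sanity-check outside SQL. Below is a minimal Python sketch (not part of sqlglot; the helper name is made up) that reproduces the small-format example from the comment:

```python
import struct

def small_bitmap(values: list[int]) -> bytes:
    # 2-byte big-endian count, then up to four 2-byte little-endian values,
    # zero-padded so the payload is always 8 bytes (10 bytes total).
    assert len(values) < 5
    payload = b"".join(struct.pack("<H", v) for v in values)
    return struct.pack(">H", len(values)) + payload.ljust(8, b"\x00")

# Matches the worked example above: [1, 2, 3] -> 0x0003 0100 0200 0300 0000
assert small_bitmap([1, 2, 3]).hex() == "00030100020003000000"
```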
+ # Template for RANDSTR transpilation - placeholders get replaced with actual parameters
+ RANDSTR_TEMPLATE: exp.Expression = exp.maybe_parse(
+ f"""
+ SELECT LISTAGG(
+ SUBSTRING(
+ '{RANDSTR_CHAR_POOL}',
+ 1 + CAST(FLOOR(random_value * 62) AS INT),
+ 1
+ ),
+ ''
+ )
+ FROM (
+ SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value
+ FROM RANGE(:length) AS t(i)
+ )
+ """,
+ )
+
+ # Template for MINHASH transpilation
+ # Computes k minimum hash values across the aggregated data using DuckDB list functions
+ # Returns JSON matching the Snowflake format: {"state": [...], "type": "minhash", "version": 1}
+ MINHASH_TEMPLATE: exp.Expression = exp.maybe_parse(
+ """
+ SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1)
+ FROM (
+ SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h
+ FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed)
+ )
+ """,
+ )
+
+ # Template for MINHASH_COMBINE transpilation
+ # Combines multiple minhash signatures by taking the element-wise minimum
+ MINHASH_COMBINE_TEMPLATE: exp.Expression = exp.maybe_parse(
+ """
+ SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1)
+ FROM (
+ SELECT
+ pos AS idx,
+ MIN(val) AS min_h
+ FROM
+ UNNEST(LIST(:expr)) AS _(sig),
+ UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos)
+ GROUP BY pos
+ )
+ """,
+ )
+
+ # Template for APPROXIMATE_SIMILARITY transpilation
+ # Computes multi-way Jaccard similarity: the fraction of positions where ALL signatures agree
+ APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expression = exp.maybe_parse(
+ """
+ SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*)
+ FROM (
+ SELECT pos, COUNT(DISTINCT h) AS num_distinct
+ FROM (
+ SELECT h, pos
+ FROM UNNEST(LIST(:expr)) AS _(sig),
+ UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos)
+ )
+ GROUP BY pos
+ )
+ """,
+ )
+
+ # Template for ARRAYS_ZIP transpilation
+ # Snowflake pads to the longest array; DuckDB's LIST_ZIP truncates to the shortest
+ # Uses RANGE + indexing to match the Snowflake behavior
+ ARRAYS_ZIP_TEMPLATE: exp.Expression = exp.maybe_parse(
+ """
+ CASE WHEN :null_check THEN NULL
+ WHEN :all_empty_check THEN [:empty_struct]
+ ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct)
+ END
+ """,
+ )
+
+ def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str:
+ """
+ Transform Snowflake's TIME_SLICE to DuckDB's time_bucket.
+
+ Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END'])
+ DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr)
+
+ For the 'END' kind, add the interval to get the end of the slice.
+ For a DATE input with 'END', cast the result back to DATE to preserve the type.
+ """
+ date_expr = expression.this
+ slice_length = expression.expression
+ unit = expression.unit
+ kind = expression.text("kind").upper()
+
+ # Create INTERVAL expression: INTERVAL 'N' UNIT
+ interval_expr = exp.Interval(this=slice_length, unit=unit)
+
+ # Create base time_bucket expression
+ time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr)
+
+ # Check if we need the end of the slice (the default is the start)
+ if kind != "END":
+ # For 'START', return time_bucket directly
+ return self.sql(time_bucket_expr)
+
+ # For 'END', add the interval to get the end of the slice
+ add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy())
+
+ # If the input is a DATE, cast the result back to DATE to preserve the type;
+ # DuckDB converts DATE to TIMESTAMP when adding intervals
+ if date_expr.is_type(exp.DataType.Type.DATE):
+ return self.sql(exp.cast(add_expr, exp.DataType.Type.DATE))
+
+ return self.sql(add_expr)
+
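A quick way to exercise this path is a round trip through the transpiler. A sketch assuming this build parses Snowflake's TIME_SLICE; the emitted SQL is printed rather than asserted because it can vary between versions:

```python
import sqlglot

# 'END' should produce time_bucket(...) plus the slice interval.
sql = "SELECT TIME_SLICE(created_at, 15, 'MINUTE', 'END') FROM events"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])
```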
+ def bitmapbucketnumber_sql(
+ self: DuckDB.Generator, expression: exp.BitmapBucketNumber
+ ) -> str:
+ """
+ Transpile Snowflake's BITMAP_BUCKET_NUMBER to its DuckDB equivalent.
+
+ Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
+ - Each bucket covers 32,768 values
+ - Bucket numbering starts at 1
+ - Formula: ((value - 1) // 32768) + 1 for positive values
+
+ For non-positive values (0 and negative), we use value // 32768 to avoid
+ producing bucket 0 or positive bucket IDs for negative inputs.
+ """
+ value = expression.this
+
+ positive_formula = ((value - 1) // 32768) + 1
+ non_positive_formula = value // 32768
+
+ # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END
+ case_expr = (
+ exp.case()
+ .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula)
+ .else_(non_positive_formula)
+ )
+ return self.sql(case_expr)
+
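The docstring's formula can be verified with plain integer arithmetic. A check (not sqlglot code) using Python's floor division, which is the behavior the generated // is assumed to have:

```python
def bucket(v: int) -> int:
    # Mirrors the generated CASE expression above.
    return ((v - 1) // 32768) + 1 if v > 0 else v // 32768

# 1..32768 share bucket 1, 32769 opens bucket 2, and non-positive
# inputs floor into bucket 0 and below.
assert [bucket(v) for v in (1, 32768, 32769, 0, -5)] == [1, 1, 2, 0, -1]
```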
  def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str:
  """
  Transpile Snowflake's BITMAP_BIT_POSITION to a DuckDB CASE expression.
@@ -1382,9 +2351,24 @@ class DuckDB(Dialect):
  )
  )

+ def bitmapconstructagg_sql(
+ self: DuckDB.Generator, expression: exp.BitmapConstructAgg
+ ) -> str:
+ """
+ Transpile Snowflake's BITMAP_CONSTRUCT_AGG to its DuckDB equivalent.
+ Uses a pre-parsed template with placeholders replaced by expression nodes.
+
+ Snowflake bitmap format:
+ - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
+ - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
+ """
+ arg = expression.this
+ return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})"
+
  def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str:
  """
  Transpile Snowflake's RANDSTR to a DuckDB equivalent using deterministic hash-based randomness.
+ Uses a pre-parsed template with placeholders replaced by expression nodes.

  RANDSTR(length, generator) generates a random string of the specified length.
  - With a numeric seed: use HASH(i + seed) for deterministic output (same seed = same result)
@@ -1405,40 +2389,49 @@ class DuckDB(Dialect):
  # No generator specified, use a default seed (arbitrary but deterministic)
  seed_value = exp.Literal.number(RANDSTR_SEED)

- length_sql = self.sql(length)
- seed_sql = self.sql(seed_value)
+ replacements = {"seed": seed_value, "length": length}
+ return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})"
+
+ def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str:
+ """
+ Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling.
+ Uses a pre-parsed template with placeholders replaced by expression nodes.
+ """
+ s = expression.this
+ n = expression.args["elementcount"]
+ gen = expression.args["gen"]

- query: exp.Select = exp.maybe_parse(
- f"""
- SELECT LISTAGG(
- SUBSTRING(
- '{RANDSTR_CHAR_POOL}',
- 1 + CAST(FLOOR(random_value * 62) AS INT),
- 1
+ if not isinstance(gen, exp.Rand):
+ # (ABS(HASH(seed)) % 1000000) / 1000000.0
+ random_expr: exp.Expression = exp.Div(
+ this=exp.Paren(
+ this=exp.Mod(
+ this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])),
+ expression=exp.Literal.number(1000000),
+ )
  ),
- ''
+ expression=exp.Literal.number(1000000.0),
  )
- FROM (
- SELECT (ABS(HASH(i + {seed_sql})) % 1000) / 1000.0 AS random_value
- FROM RANGE({length_sql}) AS t(i)
- )
- """,
- dialect="duckdb",
- )
- return f"({self.sql(query)})"
+ else:
+ # Use RANDOM() for non-deterministic output
+ random_expr = exp.Rand()
+
+ replacements = {"s": s, "n": n, "random_expr": random_expr}
+ return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})"
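As with RANDSTR, the seeded branch makes ZIPF deterministic by hashing the generator argument. A hedged smoke test, assuming this build parses ZIPF with a literal generator argument (output printed, not asserted):

```python
import sqlglot

# A literal seed takes the hash-based branch; RANDOM() would stay non-deterministic.
sql = "SELECT ZIPF(1.5, 10, 42)"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])
```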
 
  def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str:
  """
- TO_BINARY(value, format) transpilation if the return type is BINARY:
+ TO_BINARY and TRY_TO_BINARY transpilation:
  - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50')
  - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST')
  - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==')

- format can be 'HEX', 'UTF-8' or 'BASE64'
- return type can be either VARCHAR or BINARY
+ For TRY_TO_BINARY (safe=True), wrap with TRY():
+ - 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid'))
  """
  value = expression.this
  format_arg = expression.args.get("format")
+ is_safe = expression.args.get("safe")

  fmt = "HEX"
  if format_arg:
@@ -1446,12 +2439,23 @@ class DuckDB(Dialect):

  if expression.is_type(exp.DataType.Type.BINARY):
  if fmt == "UTF-8":
- return self.func("ENCODE", value)
- if fmt == "BASE64":
- return self.func("FROM_BASE64", value)
+ result = self.func("ENCODE", value)
+ elif fmt == "BASE64":
+ result = self.func("FROM_BASE64", value)
+ elif fmt == "HEX":
+ result = self.func("UNHEX", value)
+ else:
+ if is_safe:
+ return self.sql(exp.null())
+ else:
+ self.unsupported(f"format {fmt} is not supported")
+ result = self.func("TO_BINARY", value)

- # Hex
- return self.func("UNHEX", value)
+ # Wrap with TRY() for TRY_TO_BINARY
+ if is_safe:
+ result = self.func("TRY", result)
+
+ return result

  # Fallback; this needs updating if we want to support transpilation from dialects other than Snowflake
  return self.func("TO_BINARY", value)
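Round-tripping the docstring's examples is a convenient check. A sketch assuming this build is installed; the printed SQL is illustrative:

```python
import sqlglot

for sql in (
    "SELECT TO_BINARY('48454C50', 'HEX')",
    "SELECT TRY_TO_BINARY('invalid', 'HEX')",
):
    print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])
```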
@@ -1462,25 +2466,39 @@ class DuckDB(Dialect):
  """
  Handle GREATEST/LEAST functions with dialect-aware NULL behavior.

- - If null_if_any_null=True (BigQuery-style): return NULL if any argument is NULL
- - If null_if_any_null=False (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value
+ - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL
+ - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return the greatest/least non-NULL value
  """
  # Get all arguments
  all_args = [expression.this, *expression.expressions]
  fallback_sql = self.function_fallback_sql(expression)

- if expression.args.get("null_if_any_null"):
- # BigQuery behavior: NULL if any argument is NULL
- case_expr = exp.case().when(
- exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False),
- exp.null(),
- copy=False,
- )
- case_expr.set("default", fallback_sql)
- return self.sql(case_expr)
+ if expression.args.get("ignore_nulls"):
+ # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs)
+ return self.sql(fallback_sql)

- # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs)
- return self.sql(fallback_sql)
+ # Return NULL if any argument is NULL
+ case_expr = exp.case().when(
+ exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False),
+ exp.null(),
+ copy=False,
+ )
+ case_expr.set("default", fallback_sql)
+ return self.sql(case_expr)
+
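The two NULL behaviors are easiest to see by transpiling the same call from each kind of source dialect. A sketch (outputs printed since the exact SQL may vary by version):

```python
import sqlglot

# BigQuery GREATEST propagates NULLs, so the DuckDB output gains a CASE
# guard; Postgres GREATEST ignores NULLs, so the native function suffices.
for read in ("bigquery", "postgres"):
    out = sqlglot.transpile("SELECT GREATEST(a, b)", read=read, write="duckdb")[0]
    print(read, "->", out)
```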
+ def generator_sql(self, expression: exp.Generator) -> str:
+ # Transpile Snowflake GENERATOR to DuckDB range()
+ rowcount = expression.args.get("rowcount")
+ time_limit = expression.args.get("time_limit")
+
+ if time_limit:
+ self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB")
+
+ if not rowcount:
+ self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB")
+ return self.func("range", exp.Literal.number(0))
+
+ return self.func("range", rowcount)
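Paired with tablefromrows_sql further down, this rewrites Snowflake's row generator into a range() table function. A hedged sketch, assuming this build parses the TABLE(GENERATOR(...)) form:

```python
import sqlglot

sql = "SELECT 1 FROM TABLE(GENERATOR(ROWCOUNT => 5))"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])
```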

  def greatest_sql(self: DuckDB.Generator, expression: exp.Greatest) -> str:
  return self._greatest_least_sql(expression)
@@ -1521,16 +2539,58 @@ class DuckDB(Dialect):
  return self.sql(exp.cast(expression.this, exp.DataType.Type.TIMESTAMPTZ))

  def strtotime_sql(self, expression: exp.StrToTime) -> str:
+ # Check if target_type requires TIMESTAMPTZ (for the LTZ/TZ variants)
+ target_type = expression.args.get("target_type")
+ needs_tz = target_type and target_type.this in (
+ exp.DataType.Type.TIMESTAMPLTZ,
+ exp.DataType.Type.TIMESTAMPTZ,
+ )
+
  if expression.args.get("safe"):
  formatted_time = self.format_time(expression)
- return f"CAST({self.func('TRY_STRPTIME', expression.this, formatted_time)} AS TIMESTAMP)"
- return str_to_time_sql(self, expression)
+ cast_type = (
+ exp.DataType.Type.TIMESTAMPTZ if needs_tz else exp.DataType.Type.TIMESTAMP
+ )
+ return self.sql(
+ exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type)
+ )
+
+ base_sql = str_to_time_sql(self, expression)
+ if needs_tz:
+ return self.sql(
+ exp.cast(
+ base_sql,
+ exp.DataType(this=exp.DataType.Type.TIMESTAMPTZ),
+ )
+ )
+ return base_sql

  def strtodate_sql(self, expression: exp.StrToDate) -> str:
- if expression.args.get("safe"):
- formatted_time = self.format_time(expression)
- return f"CAST({self.func('TRY_STRPTIME', expression.this, formatted_time)} AS DATE)"
- return f"CAST({str_to_time_sql(self, expression)} AS DATE)"
+ formatted_time = self.format_time(expression)
+ function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME"
+ return self.sql(
+ exp.cast(
+ self.func(function_name, expression.this, formatted_time),
+ exp.DataType(this=exp.DataType.Type.DATE),
+ )
+ )
+
+ def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str:
+ this = expression.this
+ time_format = self.format_time(expression)
+ safe = expression.args.get("safe")
+ time_type = exp.DataType.build("TIME", dialect="duckdb")
+ cast_expr = exp.TryCast if safe else exp.Cast
+
+ if time_format:
+ func_name = "TRY_STRPTIME" if safe else "STRPTIME"
+ strptime = exp.Anonymous(this=func_name, expressions=[this, time_format])
+ return self.sql(cast_expr(this=strptime, to=time_type))
+
+ if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DataType.Type.TIME):
+ return self.sql(this)
+
+ return self.sql(cast_expr(this=this, to=time_type))

  def currentdate_sql(self, expression: exp.CurrentDate) -> str:
  if not expression.this:
@@ -1548,17 +2608,210 @@ class DuckDB(Dialect):
  return self.sql(exp.case().when(exp.func("json_valid", arg), arg).else_(exp.null()))
  return self.func("JSON", arg)

+ def normal_sql(self, expression: exp.Normal) -> str:
+ """
+ Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB.
+
+ Uses the Box-Muller transform via NORMAL_TEMPLATE.
+ """
+ mean = expression.this
+ stddev = expression.args["stddev"]
+ gen: exp.Expression = expression.args["gen"]
+
+ # Build two uniform random values in [0, 1) for the Box-Muller transform
+ if isinstance(gen, exp.Rand) and gen.this is None:
+ u1: exp.Expression = exp.Rand()
+ u2: exp.Expression = exp.Rand()
+ else:
+ # Seeded: derive two values using HASH with different inputs
+ seed = gen.this if isinstance(gen, exp.Rand) else gen
+ u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed)
+ u2 = exp.replace_placeholders(
+ self.SEEDED_RANDOM_TEMPLATE,
+ seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)),
+ )
+
+ replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2}
+ return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements))
+
+ def uniform_sql(self, expression: exp.Uniform) -> str:
+ """
+ Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB.
+
+ UNIFORM returns a random value in [min, max]:
+ - Integer result if both min and max are integers
+ - Float result if either min or max is a float
+ """
+ min_val = expression.this
+ max_val = expression.expression
+ gen = expression.args.get("gen")
+
+ # Determine if the result should be an integer (both bounds are integers).
+ # We do this to emulate Snowflake's behavior: INT -> INT, FLOAT -> FLOAT
+ is_int_result = min_val.is_int and max_val.is_int
+
+ # Build the random value expression in [0, 1)
+ if not isinstance(gen, exp.Rand):
+ # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0
+ random_expr: exp.Expression = exp.Div(
+ this=exp.Paren(
+ this=exp.Mod(
+ this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])),
+ expression=exp.Literal.number(1000000),
+ )
+ ),
+ expression=exp.Literal.number(1000000.0),
+ )
+ else:
+ random_expr = exp.Rand()
+
+ # Build: min + random * (max - min [+ 1 for int])
+ range_expr: exp.Expression = exp.Sub(this=max_val, expression=min_val)
+ if is_int_result:
+ range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1))
+
+ result: exp.Expression = exp.Add(
+ this=min_val,
+ expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)),
+ )
+
+ if is_int_result:
+ result = exp.Cast(
+ this=exp.Floor(this=result),
+ to=exp.DataType.build("BIGINT"),
+ )
+
+ return self.sql(result)
+
  def timefromparts_sql(self, expression: exp.TimeFromParts) -> str:
  nano = expression.args.get("nano")
- if nano is not None:
+ overflow = expression.args.get("overflow")
+
+ # Snowflake's TIME_FROM_PARTS supports overflow
+ if overflow:
+ hour = expression.args["hour"]
+ minute = expression.args["min"]
+ sec = expression.args["sec"]
+
+ # Check if values are within normal ranges - use MAKE_TIME for efficiency
+ if not nano and all(arg.is_int for arg in [hour, minute, sec]):
+ try:
+ h_val = hour.to_py()
+ m_val = minute.to_py()
+ s_val = sec.to_py()
+ if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59:
+ return rename_func("MAKE_TIME")(self, expression)
+ except ValueError:
+ pass
+
+ # Overflow or nanoseconds detected - use INTERVAL arithmetic
+ if nano:
+ sec = sec + nano.pop() / exp.Literal.number(1000000000.0)
+
+ total_seconds = (
+ hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec
+ )
+
+ return self.sql(
+ exp.Add(
+ this=exp.Cast(
+ this=exp.Literal.string("00:00:00"), to=exp.DataType.build("TIME")
+ ),
+ expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")),
+ )
+ )
+
+ # Default: MAKE_TIME
+ if nano:
  expression.set(
  "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0)
  )

  return rename_func("MAKE_TIME")(self, expression)
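The overflow branch reduces all components to seconds before adding an interval to TIME '00:00:00'. The arithmetic itself can be checked in plain Python (a sketch of the intent, assuming, as the Snowflake documentation describes, that overflow wraps modulo 24 hours):

```python
# TIME_FROM_PARTS(25, 90, 90): out-of-range parts roll over instead of erroring.
hour, minute, sec = 25, 90, 90
total_seconds = hour * 3600 + minute * 60 + sec  # 95490, i.e. INTERVAL 95490 SECOND
h, rem = divmod(total_seconds % 86400, 3600)
m, s = divmod(rem, 60)
assert (h, m, s) == (2, 31, 30)
```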

+ def extract_sql(self, expression: exp.Extract) -> str:
+ """
+ Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported.
+
+ DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND,
+ EPOCH_SECOND (as an integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND
+ """
+ this = expression.this
+ datetime_expr = expression.expression
+
+ # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB
+ # because Snowflake applies the server timezone while DuckDB uses the local timezone
+ if datetime_expr.is_type(exp.DataType.Type.TIMESTAMPTZ, exp.DataType.Type.TIMESTAMPLTZ):
+ self.unsupported(
+ "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences"
+ )
+
+ part_name = this.name.upper()
+
+ if part_name in self.EXTRACT_STRFTIME_MAPPINGS:
+ fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name]
+
+ # Problem: strftime doesn't accept TIME and there's no NANOSECOND function.
+ # So, for NANOSECOND with TIME, fall back to MICROSECOND * 1000
+ is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type(
+ exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
+ )
+
+ if is_nano_time:
+ self.unsupported(
+ "Parameter NANOSECOND is not supported with TIME type in DuckDB"
+ )
+ return self.sql(
+ exp.cast(
+ exp.Mul(
+ this=exp.Extract(
+ this=exp.var("MICROSECOND"), expression=datetime_expr
+ ),
+ expression=exp.Literal.number(1000),
+ ),
+ exp.DataType.build(cast_type, dialect="duckdb"),
+ )
+ )
+
+ # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision
+ strftime_input = datetime_expr
+ if part_name == "NANOSECOND":
+ strftime_input = exp.cast(datetime_expr, exp.DataType.Type.TIMESTAMP_NS)
+
+ return self.sql(
+ exp.cast(
+ exp.Anonymous(
+ this="STRFTIME",
+ expressions=[strftime_input, exp.Literal.string(fmt)],
+ ),
+ exp.DataType.build(cast_type, dialect="duckdb"),
+ )
+ )
+
+ if part_name in self.EXTRACT_EPOCH_MAPPINGS:
+ func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name]
+ result: exp.Expression = exp.Anonymous(this=func_name, expressions=[datetime_expr])
+ # EPOCH returns a float, so cast to BIGINT for an integer result
+ if part_name == "EPOCH_SECOND":
+ result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb"))
+ return self.sql(result)
+
+ return super().extract_sql(expression)
+
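A sketch of the strftime fallback in action: per the mapping table above, YEAROFWEEKISO should come out as STRFTIME(..., '%G') cast to INTEGER (printed rather than asserted, since output can vary by version):

```python
import sqlglot

sql = "SELECT DATE_PART('YEAROFWEEKISO', d) FROM t"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])
```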
  def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str:
- sec = expression.args["sec"]
+ # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr)
+ date_expr = expression.this
+ time_expr = expression.expression
+
+ if date_expr is not None and time_expr is not None:
+ # In DuckDB, DATE + TIME produces TIMESTAMP
+ return self.sql(exp.Add(this=date_expr, expression=time_expr))
+
+ # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...)
+ sec = expression.args.get("sec")
+ if sec is None:
+ # This shouldn't happen with valid input, but handle gracefully
+ return rename_func("MAKE_TIMESTAMP")(self, expression)

  milli = expression.args.get("milli")
  if milli is not None:
@@ -1573,6 +2826,34 @@ class DuckDB(Dialect):

  return rename_func("MAKE_TIMESTAMP")(self, expression)

+ @unsupported_args("nano")
+ def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str:
+ # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts
+ if nano := expression.args.get("nano"):
+ nano.pop()
+
+ timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)
+ return f"CAST({timestamp} AS TIMESTAMPTZ)"
+
+ @unsupported_args("nano")
+ def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str:
+ # Extract zone before popping
+ zone = expression.args.get("zone")
+ # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts
+ if zone:
+ zone = zone.pop()
+
+ if nano := expression.args.get("nano"):
+ nano.pop()
+
+ timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)
+
+ if zone:
+ # Use AT TIME ZONE to apply the explicit timezone
+ return f"{timestamp} AT TIME ZONE {self.sql(zone)}"
+
+ return timestamp
+
  def tablesample_sql(
  self,
  expression: exp.TableSample,
@@ -1652,9 +2933,35 @@ class DuckDB(Dialect):
  return bracket

  def withingroup_sql(self, expression: exp.WithinGroup) -> str:
+ func = expression.this
+
+ # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP
+ # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y)
+ if isinstance(func, exp.ArrayAgg):
+ if not isinstance(order := expression.expression, exp.Order):
+ return self.sql(func)
+
+ # Save the original column for the FILTER clause (before wrapping with Order)
+ original_this = func.this
+
+ # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order
+ # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions)
+ func.set(
+ "this",
+ exp.Order(
+ this=func.this.copy(),
+ expressions=order.expressions,
+ ),
+ )
+
+ # Generate the ARRAY_AGG function with ORDER BY and add a FILTER clause if needed
+ # Use original_this (not the Order-wrapped version) for the FILTER condition
+ array_agg_sql = self.function_fallback_sql(func)
+ return self._add_arrayagg_null_filter(array_agg_sql, func, original_this)
+
+ # For other functions (like PERCENTILES), use the existing logic
  expression_sql = self.sql(expression, "expression")

- func = expression.this
  if isinstance(func, exp.PERCENTILES):
  # Make the order key the first arg and slide the fraction to the right
  # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions
@@ -1697,6 +3004,98 @@ class DuckDB(Dialect):

  return self.sql(case)

+ @unsupported_args("ins_cost", "del_cost", "sub_cost")
+ def levenshtein_sql(self, expression: exp.Levenshtein) -> str:
+ this = expression.this
+ expr = expression.expression
+ max_dist = expression.args.get("max_dist")
+
+ if max_dist is None:
+ return self.func("LEVENSHTEIN", this, expr)
+
+ # Emulate Snowflake semantics: if distance > max_dist, return max_dist
+ levenshtein = exp.Levenshtein(this=this, expression=expr)
+ return self.sql(exp.Least(this=levenshtein, expressions=[max_dist]))
+
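Capping with LEAST mirrors the max_dist semantics described in the comment. The same rule in plain Python terms (illustrative, not sqlglot code):

```python
def capped_distance(distance: int, max_dist: int) -> int:
    # LEAST(LEVENSHTEIN(a, b), max_dist)
    return min(distance, max_dist)

assert capped_distance(7, 3) == 3  # beyond the cap -> the cap itself
assert capped_distance(2, 3) == 2  # within the cap -> the real distance
```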
+ def minhash_sql(self, expression: exp.Minhash) -> str:
+ k = expression.this
+ exprs = expression.expressions
+
+ if len(exprs) != 1 or isinstance(exprs[0], exp.Star):
+ self.unsupported(
+ "MINHASH with multiple expressions or * requires manual query restructuring"
+ )
+ return self.func("MINHASH", k, *exprs)
+
+ expr = exprs[0]
+ result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k)
+ return f"({self.sql(result)})"
+
+ def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str:
+ expr = expression.this
+ result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr)
+ return f"({self.sql(result)})"
+
+ def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str:
+ expr = expression.this
+ result = exp.replace_placeholders(
+ self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr
+ )
+ return f"({self.sql(result)})"
+
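The intuition behind these templates: for a given seed, the probability that two sets share the same minimum hash equals their Jaccard similarity, so agreement across k seeds estimates it. A toy pure-Python sketch (illustrative only; DuckDB's HASH differs from Python's hash):

```python
def signature(values: set[str], k: int = 256) -> list[int]:
    # One min-hash per seed, seeds baked into the hashed string.
    return [min(hash(f"{v}:{seed}") for v in values) for seed in range(k)]

a = signature({"a", "b", "c"})
b = signature({"b", "c", "d"})
estimate = sum(x == y for x, y in zip(a, b)) / len(a)
print(estimate)  # close to the true Jaccard similarity, 2/4 = 0.5
```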
+ def arrayszip_sql(self, expression: exp.ArraysZip) -> str:
+ args = expression.expressions
+
+ if not args:
+ # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs
+ return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array())))
+
+ # Build placeholder values for the template
+ lengths = [exp.Length(this=arg) for arg in args]
+ max_len = (
+ lengths[0]
+ if len(lengths) == 1
+ else exp.Greatest(this=lengths[0], expressions=lengths[1:])
+ )
+
+ # Empty struct with the same schema: {'$1': NULL, '$2': NULL, ...}
+ empty_struct = exp.func(
+ "STRUCT",
+ *[
+ exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null())
+ for i in range(len(args))
+ ],
+ )
+
+ # Struct for the transform: {'$1': COALESCE(arr1, [])[__i + 1], ...}
+ # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax
+ index = exp.column("__i") + 1
+ transform_struct = exp.func(
+ "STRUCT",
+ *[
+ exp.PropertyEQ(
+ this=exp.Literal.string(f"${i + 1}"),
+ expression=exp.func("COALESCE", arg, exp.array())[index],
+ )
+ for i, arg in enumerate(args)
+ ],
+ )
+
+ result = exp.replace_placeholders(
+ self.ARRAYS_ZIP_TEMPLATE.copy(),
+ null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]),
+ all_empty_check=exp.and_(
+ *[
+ exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0))
+ for arg in args
+ ]
+ ),
+ empty_struct=empty_struct,
+ max_len=max_len,
+ transform_struct=transform_struct,
+ )
+ return self.sql(result)
+
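The padding behavior the template emulates, in plain Python: zip to the longest input and fill the gaps with NULL, where DuckDB's LIST_ZIP would stop at the shortest (a sketch, not sqlglot code):

```python
from itertools import zip_longest

a, b = [1, 2, 3], ["x"]
rows = [{"$1": x, "$2": y} for x, y in zip_longest(a, b)]
assert rows == [
    {"$1": 1, "$2": "x"},
    {"$1": 2, "$2": None},
    {"$1": 3, "$2": None},
]
```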
  def lower_sql(self, expression: exp.Lower) -> str:
  result_sql = self.func("LOWER", _cast_to_varchar(expression.this))
  return _gen_with_cast_to_blob(self, expression, result_sql)
@@ -1705,6 +3104,50 @@ class DuckDB(Dialect):
  result_sql = self.func("UPPER", _cast_to_varchar(expression.this))
  return _gen_with_cast_to_blob(self, expression, result_sql)

+ def reverse_sql(self, expression: exp.Reverse) -> str:
+ result_sql = self.func("REVERSE", _cast_to_varchar(expression.this))
+ return _gen_with_cast_to_blob(self, expression, result_sql)
+
+ def base64encode_sql(self, expression: exp.Base64Encode) -> str:
+ # DuckDB TO_BASE64 requires BLOB input
+ # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly
+ # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type.
+ result = expression.this
+
+ # Check if input is a string type - ENCODE only accepts VARCHAR
+ if result.is_type(*exp.DataType.TEXT_TYPES):
+ result = exp.Encode(this=result)
+
+ result = exp.ToBase64(this=result)
+
+ max_line_length = expression.args.get("max_line_length")
+ alphabet = expression.args.get("alphabet")
+
+ # Handle a custom alphabet by replacing standard chars with custom ones
+ result = _apply_base64_alphabet_replacements(result, alphabet)
+
+ # Handle max_line_length by inserting newlines every N characters
+ line_length = (
+ t.cast(int, max_line_length.to_py())
+ if isinstance(max_line_length, exp.Literal) and max_line_length.is_number
+ else 0
+ )
+ if line_length > 0:
+ newline = exp.Chr(expressions=[exp.Literal.number(10)])
+ result = exp.Trim(
+ this=exp.RegexpReplace(
+ this=result,
+ expression=exp.Literal.string(f"(.{{{line_length}}})"),
+ replacement=exp.Concat(
+ expressions=[exp.Literal.string("\\1"), newline.copy()]
+ ),
+ ),
+ expression=newline,
+ position="TRAILING",
+ )
+
+ return self.sql(result)
+
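The max_line_length handling is a regex trick: capture every run of N characters, re-emit it with a trailing newline, then trim the final newline. The same idea in plain Python (illustrative values):

```python
import re

encoded, n = "QUJDREVGRw==", 4
# "(.{4})" matches each full 4-character run; "\1\n" re-emits it with a newline.
wrapped = re.sub(f"(.{{{n}}})", r"\1\n", encoded).rstrip("\n")
print(wrapped)  # QUJD / REVG / Rw== on separate lines
```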
  def replace_sql(self, expression: exp.Replace) -> str:
  result_sql = self.func(
  "REPLACE",
@@ -1739,6 +3182,14 @@ class DuckDB(Dialect):

  return self.func("STRUCT_INSERT", this, kv_sql)

+ def mapcat_sql(self, expression: exp.MapCat) -> str:
+ result = exp.replace_placeholders(
+ self.MAPCAT_TEMPLATE.copy(),
+ map1=expression.this,
+ map2=expression.expression,
+ )
+ return self.sql(result)
+
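The MAPCAT_TEMPLATE semantics in plain Python terms (a sketch of the Snowflake behavior documented alongside the template, not sqlglot code):

```python
from typing import Optional

def map_cat(m1: Optional[dict], m2: Optional[dict]) -> Optional[dict]:
    # NULL input -> NULL result
    if m1 is None or m2 is None:
        return None
    keys = {*m1, *m2}
    # COALESCE(m2[k], m1[k]): prefer m2's value unless it is NULL, then
    # drop any entry whose value is still NULL.
    merged = {k: m2.get(k) if m2.get(k) is not None else m1.get(k) for k in keys}
    return {k: v for k, v in merged.items() if v is not None}

assert map_cat({"a": 1, "b": None}, {"b": 2, "c": None}) == {"a": 1, "b": 2}
```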
  def startswith_sql(self, expression: exp.StartsWith) -> str:
  return self.func(
  "STARTS_WITH",
@@ -1746,6 +3197,28 @@ class DuckDB(Dialect):
  _cast_to_varchar(expression.expression),
  )

+ def space_sql(self, expression: exp.Space) -> str:
+ # DuckDB's REPEAT requires BIGINT for the count parameter
+ return self.sql(
+ exp.Repeat(
+ this=exp.Literal.string(" "),
+ times=exp.cast(expression.this, exp.DataType.Type.BIGINT),
+ )
+ )
+
+ def tablefromrows_sql(self, expression: exp.TableFromRows) -> str:
+ # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE)
+ if isinstance(expression.this, exp.Generator):
+ # Preserve alias, joins, and other table-level args
+ table = exp.Table(
+ this=expression.this,
+ alias=expression.args.get("alias"),
+ joins=expression.args.get("joins"),
+ )
+ return self.sql(table)
+
+ return super().tablefromrows_sql(expression)
+
  def unnest_sql(self, expression: exp.Unnest) -> str:
  explode_array = expression.args.get("explode_array")
  if explode_array:
@@ -1893,18 +3366,54 @@ class DuckDB(Dialect):
  return posexplode_sql

  def addmonths_sql(self, expression: exp.AddMonths) -> str:
- this = expression.this
+ """
+ Handles three key issues:
+ 1. Float/decimal months: Snowflake rounds them, whereas DuckDB INTERVAL requires integers
+ 2. End-of-month preservation: if the input is the last day of a month, the result is the last day of the result month
+ 3. Type preservation: maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
+ """
+ from sqlglot.optimizer.annotate_types import annotate_types

+ this = expression.this
  if not this.type:
- from sqlglot.optimizer.annotate_types import annotate_types
-
  this = annotate_types(this, dialect=self.dialect)

  if this.is_type(*exp.DataType.TEXT_TYPES):
  this = exp.Cast(this=this, to=exp.DataType(this=exp.DataType.Type.TIMESTAMP))

- func = self.func(
- "DATE_ADD", this, exp.Interval(this=expression.expression, unit=exp.var("MONTH"))
+ # Detect float/decimal months to apply rounding (Snowflake behavior)
+ # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS
+ months_expr = expression.expression
+ if not months_expr.type:
+ months_expr = annotate_types(months_expr, dialect=self.dialect)
+
+ # Build an interval or to_months expression based on type
+ # Float/decimal case: round and use TO_MONTHS(CAST(ROUND(value) AS INT))
+ interval_or_to_months = (
+ exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT"))
+ if months_expr.is_type(
+ exp.DataType.Type.FLOAT,
+ exp.DataType.Type.DOUBLE,
+ exp.DataType.Type.DECIMAL,
+ )
+ # Integer case: standard INTERVAL N MONTH syntax
+ else exp.Interval(this=months_expr, unit=exp.var("MONTH"))
+ )
+
+ date_add_expr = exp.Add(this=this, expression=interval_or_to_months)
+
+ # Apply end-of-month preservation if the Snowflake flag is set
+ # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END
+ preserve_eom = expression.args.get("preserve_end_of_month")
+ result_expr = (
+ exp.case()
+ .when(
+ exp.EQ(this=exp.func("LAST_DAY", this), expression=this),
+ exp.func("LAST_DAY", date_add_expr),
+ )
+ .else_(date_add_expr)
+ if preserve_eom
+ else date_add_expr
  )

  # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE
@@ -1912,9 +3421,8 @@ class DuckDB(Dialect):
  # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ
  # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP
  if this.is_type(exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ):
- return self.sql(exp.Cast(this=func, to=this.type))
-
- return self.sql(func)
+ return self.sql(exp.Cast(this=result_expr, to=this.type))
+ return self.sql(result_expr)
+ return self.sql(result_expr)
1918
3426
 
1919
3427
  def format_sql(self, expression: exp.Format) -> str:
1920
3428
  if expression.name.lower() == "%s" and len(expression.expressions) == 1:
@@ -1925,23 +3433,30 @@ class DuckDB(Dialect):
1925
3433
  def hexstring_sql(
1926
3434
  self, expression: exp.HexString, binary_function_repr: t.Optional[str] = None
1927
3435
  ) -> str:
1928
- from_hex = super().hexstring_sql(expression, binary_function_repr="FROM_HEX")
3436
+ # UNHEX('FF') correctly produces blob \xFF in DuckDB
3437
+ return super().hexstring_sql(expression, binary_function_repr="UNHEX")
1929
3438
 
1930
- if expression.args.get("is_integer"):
1931
- return from_hex
3439
+ def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
3440
+ unit = unit_to_str(expression)
3441
+ date = expression.this
3442
+ result = self.func("DATE_TRUNC", unit, date)
1932
3443
 
1933
- # `from_hex` has transpiled x'ABCD' (BINARY) to DuckDB's '\xAB\xCD' (BINARY)
1934
- # `to_hex` & CASTing transforms it to "ABCD" (BINARY) to match representation
1935
- to_hex = exp.cast(self.func("TO_HEX", from_hex), exp.DataType.Type.BLOB)
3444
+ if (
3445
+ expression.args.get("input_type_preserved")
3446
+ and date.is_type(*exp.DataType.TEMPORAL_TYPES)
3447
+ and not (is_date_unit(unit) and date.is_type(exp.DataType.Type.DATE))
3448
+ ):
3449
+ return self.sql(exp.Cast(this=result, to=date.type))
1936
3450
 
1937
- return self.sql(to_hex)
3451
+ return result
1938
3452
 
1939
3453
  def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str:
1940
3454
  unit = unit_to_str(expression)
1941
3455
  zone = expression.args.get("zone")
1942
3456
  timestamp = expression.this
3457
+ date_unit = is_date_unit(unit)
1943
3458
 
1944
- if is_date_unit(unit) and zone:
3459
+ if date_unit and zone:
1945
3460
  # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC.
1946
3461
  # Double AT TIME ZONE needed for BigQuery compatibility:
1947
3462
  # 1. First AT TIME ZONE: ensures truncation happens in the target timezone
@@ -1950,14 +3465,32 @@ class DuckDB(Dialect):
1950
3465
  result_sql = self.func("DATE_TRUNC", unit, timestamp)
1951
3466
  return self.sql(exp.AtTimeZone(this=result_sql, zone=zone))
1952
3467
 
1953
- return self.func("DATE_TRUNC", unit, timestamp)
3468
+ result = self.func("DATE_TRUNC", unit, timestamp)
3469
+ if expression.args.get("input_type_preserved"):
3470
+ if timestamp.type and timestamp.is_type(
3471
+ exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
3472
+ ):
3473
+ dummy_date = exp.Cast(
3474
+ this=exp.Literal.string("1970-01-01"),
3475
+ to=exp.DataType(this=exp.DataType.Type.DATE),
3476
+ )
3477
+ date_time = exp.Add(this=dummy_date, expression=timestamp)
3478
+ result = self.func("DATE_TRUNC", unit, date_time)
3479
+ return self.sql(exp.Cast(this=result, to=timestamp.type))
3480
+
3481
+ if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not (
3482
+ date_unit and timestamp.is_type(exp.DataType.Type.DATE)
3483
+ ):
3484
+ return self.sql(exp.Cast(this=result, to=timestamp.type))
3485
+
3486
+ return result
1954
3487
 
1955
3488
  def trim_sql(self, expression: exp.Trim) -> str:
1956
- result_sql = self.func(
1957
- "TRIM",
1958
- _cast_to_varchar(expression.this),
1959
- _cast_to_varchar(expression.expression),
1960
- )
3489
+ expression.this.replace(_cast_to_varchar(expression.this))
3490
+ if expression.expression:
3491
+ expression.expression.replace(_cast_to_varchar(expression.expression))
3492
+
3493
+ result_sql = super().trim_sql(expression)
1961
3494
  return _gen_with_cast_to_blob(self, expression, result_sql)
1962
3495
 
1963
3496
  def round_sql(self, expression: exp.Round) -> str:
@@ -1983,6 +3516,15 @@ class DuckDB(Dialect):
1983
3516
 
1984
3517
  return self.func(func, this, decimals, truncate)
1985
3518
 
3519
+ def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str:
3520
+ result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile"))
3521
+
3522
+ # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type
3523
+ if expression.is_type(*exp.DataType.REAL_TYPES):
3524
+ result = f"CAST({result} AS DOUBLE)"
3525
+
3526
+ return result
3527
+
1986
3528
  def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
1987
3529
  """
1988
3530
  BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
@@ -2043,3 +3585,37 @@ class DuckDB(Dialect):
2043
3585
  result_sql = f"~{self.sql(expression, 'this')}"
2044
3586
 
2045
3587
  return _gen_with_cast_to_blob(self, expression, result_sql)
3588
+
3589
+ def window_sql(self, expression: exp.Window) -> str:
3590
+ this = expression.this
3591
+ if isinstance(this, exp.Corr) or (
3592
+ isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr)
3593
+ ):
3594
+ return self._corr_sql(expression)
3595
+
3596
+ return super().window_sql(expression)
3597
+
3598
+ def filter_sql(self, expression: exp.Filter) -> str:
3599
+ if isinstance(expression.this, exp.Corr):
3600
+ return self._corr_sql(expression)
3601
+
3602
+ return super().filter_sql(expression)
3603
+
3604
+ def _corr_sql(
3605
+ self,
3606
+ expression: t.Union[exp.Filter, exp.Window, exp.Corr],
3607
+ ) -> str:
3608
+ if isinstance(expression, exp.Corr) and not expression.args.get(
3609
+ "null_on_zero_variance"
3610
+ ):
3611
+ return self.func("CORR", expression.this, expression.expression)
3612
+
3613
+ corr_expr = _maybe_corr_null_to_false(expression)
3614
+ if corr_expr is None:
3615
+ if isinstance(expression, exp.Window):
3616
+ return super().window_sql(expression)
3617
+ if isinstance(expression, exp.Filter):
3618
+ return super().filter_sql(expression)
3619
+ corr_expr = expression # make mypy happy
3620
+
3621
+ return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr))