sqlglot 26.30.0__py3-none-any.whl → 26.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqlglot/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '26.30.0'
-__version_tuple__ = version_tuple = (26, 30, 0)
+__version__ = version = '26.32.0'
+__version_tuple__ = version_tuple = (26, 32, 0)
sqlglot/dialects/__init__.py CHANGED
@@ -70,6 +70,7 @@ DIALECTS = [
     "ClickHouse",
     "Databricks",
     "Doris",
+    "Dremio",
     "Drill",
     "Druid",
     "DuckDB",
@@ -93,6 +94,7 @@ DIALECTS = [
     "Teradata",
     "Trino",
     "TSQL",
+    "Exasol",
 ]
 
 MODULE_BY_DIALECT = {name: name.lower() for name in DIALECTS}
sqlglot/dialects/athena.py CHANGED
@@ -2,46 +2,218 @@ from __future__ import annotations
 
 import typing as t
 
-from sqlglot import exp
-from sqlglot.dialects.trino import Trino
-from sqlglot.dialects.hive import Hive
-from sqlglot.tokens import TokenType
+from sqlglot import exp, generator, parser, tokens
+from sqlglot.dialects import Dialect, Hive, Trino
+from sqlglot.tokens import TokenType, Token
+
+
+class Athena(Dialect):
+    """
+    Over the years, it looks like AWS has taken various execution engines, bolted on AWS-specific
+    modifications and then built the Athena service around them.
+
+    Thus, Athena is not simply hosted Trino, it's more like a router that routes SQL queries to an
+    execution engine depending on the query type.
+
+    As at 2024-09-10, assuming your Athena workgroup is configured to use "Athena engine version 3",
+    the following engines exist:
+
+    Hive:
+     - Accepts mostly the same syntax as Hadoop / Hive
+     - Uses backticks to quote identifiers
+     - Has a distinctive DDL syntax (around things like setting table properties, storage locations etc)
+       that is different from Trino
+     - Used for *most* DDL, with some exceptions that get routed to the Trino engine instead:
+        - CREATE [EXTERNAL] TABLE (without AS SELECT)
+        - ALTER
+        - DROP
+
+    Trino:
+     - Uses double quotes to quote identifiers
+     - Used for DDL operations that involve SELECT queries, eg:
+        - CREATE VIEW / DROP VIEW
+        - CREATE TABLE... AS SELECT
+     - Used for DML operations
+        - SELECT, INSERT, UPDATE, DELETE, MERGE
+
+    The SQLGlot Athena dialect tries to identify which engine a query would be routed to and then uses the
+    tokenizer / parser / generator for that engine. This is unfortunately necessary, as there are certain
+    incompatibilities between the engines' dialects and thus can't be handled by a single, unifying dialect.
+
+    References:
+    - https://docs.aws.amazon.com/athena/latest/ug/ddl-reference.html
+    - https://docs.aws.amazon.com/athena/latest/ug/dml-queries-functions-operators.html
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self._hive = Hive(**kwargs)
+        self._trino = Trino(**kwargs)
+
+    def tokenize(self, sql: str, **opts) -> t.List[Token]:
+        opts["hive"] = self._hive
+        opts["trino"] = self._trino
+        return super().tokenize(sql, **opts)
+
+    def parse(self, sql: str, **opts) -> t.List[t.Optional[exp.Expression]]:
+        opts["hive"] = self._hive
+        opts["trino"] = self._trino
+        return super().parse(sql, **opts)
+
+    def parse_into(
+        self, expression_type: exp.IntoType, sql: str, **opts
+    ) -> t.List[t.Optional[exp.Expression]]:
+        opts["hive"] = self._hive
+        opts["trino"] = self._trino
+        return super().parse_into(expression_type, sql, **opts)
+
+    def generate(self, expression: exp.Expression, copy: bool = True, **opts) -> str:
+        opts["hive"] = self._hive
+        opts["trino"] = self._trino
+        return super().generate(expression, copy=copy, **opts)
+
+    # This Tokenizer consumes a combination of HiveQL and Trino SQL and then processes the tokens
+    # to disambiguate which dialect needs to be actually used in order to tokenize correctly.
+    class Tokenizer(tokens.Tokenizer):
+        IDENTIFIERS = Trino.Tokenizer.IDENTIFIERS + Hive.Tokenizer.IDENTIFIERS
+        STRING_ESCAPES = Trino.Tokenizer.STRING_ESCAPES + Hive.Tokenizer.STRING_ESCAPES
+        HEX_STRINGS = Trino.Tokenizer.HEX_STRINGS + Hive.Tokenizer.HEX_STRINGS
+        UNICODE_STRINGS = Trino.Tokenizer.UNICODE_STRINGS + Hive.Tokenizer.UNICODE_STRINGS
+
+        NUMERIC_LITERALS = {
+            **Trino.Tokenizer.NUMERIC_LITERALS,
+            **Hive.Tokenizer.NUMERIC_LITERALS,
+        }
+
+        KEYWORDS = {
+            **Hive.Tokenizer.KEYWORDS,
+            **Trino.Tokenizer.KEYWORDS,
+            "UNLOAD": TokenType.COMMAND,
+        }
+
+        def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
+            hive = kwargs.pop("hive", None) or Hive()
+            trino = kwargs.pop("trino", None) or Trino()
+
+            super().__init__(*args, **kwargs)
+
+            self._hive_tokenizer = hive.tokenizer(*args, **{**kwargs, "dialect": hive})
+            self._trino_tokenizer = _TrinoTokenizer(*args, **{**kwargs, "dialect": trino})
+
+        def tokenize(self, sql: str) -> t.List[Token]:
+            tokens = super().tokenize(sql)
+
+            if _tokenize_as_hive(tokens):
+                return [Token(TokenType.HIVE_TOKEN_STREAM, "")] + self._hive_tokenizer.tokenize(sql)
+
+            return self._trino_tokenizer.tokenize(sql)
+
+    class Parser(parser.Parser):
+        def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
+            hive = kwargs.pop("hive", None) or Hive()
+            trino = kwargs.pop("trino", None) or Trino()
+
+            super().__init__(*args, **kwargs)
+
+            self._hive_parser = hive.parser(*args, **{**kwargs, "dialect": hive})
+            self._trino_parser = _TrinoParser(*args, **{**kwargs, "dialect": trino})
+
+        def parse(
+            self, raw_tokens: t.List[Token], sql: t.Optional[str] = None
+        ) -> t.List[t.Optional[exp.Expression]]:
+            if raw_tokens and raw_tokens[0].token_type == TokenType.HIVE_TOKEN_STREAM:
+                return self._hive_parser.parse(raw_tokens[1:], sql)
+
+            return self._trino_parser.parse(raw_tokens, sql)
+
+        def parse_into(
+            self,
+            expression_types: exp.IntoType,
+            raw_tokens: t.List[Token],
+            sql: t.Optional[str] = None,
+        ) -> t.List[t.Optional[exp.Expression]]:
+            if raw_tokens and raw_tokens[0].token_type == TokenType.HIVE_TOKEN_STREAM:
+                return self._hive_parser.parse_into(expression_types, raw_tokens[1:], sql)
+
+            return self._trino_parser.parse_into(expression_types, raw_tokens, sql)
+
+    class Generator(generator.Generator):
+        def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
+            hive = kwargs.pop("hive", None) or Hive()
+            trino = kwargs.pop("trino", None) or Trino()
+
+            super().__init__(*args, **kwargs)
+
+            self._hive_generator = _HiveGenerator(*args, **{**kwargs, "dialect": hive})
+            self._trino_generator = _TrinoGenerator(*args, **{**kwargs, "dialect": trino})
+
+        def generate(self, expression: exp.Expression, copy: bool = True) -> str:
+            if _generate_as_hive(expression):
+                generator = self._hive_generator
+            else:
+                generator = self._trino_generator
+
+            return generator.generate(expression, copy=copy)
+
+
+def _tokenize_as_hive(tokens: t.List[Token]) -> bool:
+    if len(tokens) < 2:
+        return False
+
+    first, second, *rest = tokens
+
+    first_type = first.token_type
+    first_text = first.text.upper()
+    second_type = second.token_type
+    second_text = second.text.upper()
+
+    if first_type in (TokenType.DESCRIBE, TokenType.SHOW) or first_text == "MSCK REPAIR":
+        return True
+
+    if first_type in (TokenType.ALTER, TokenType.CREATE, TokenType.DROP):
+        if second_text in ("DATABASE", "EXTERNAL", "SCHEMA"):
+            return True
+        if second_type == TokenType.VIEW:
+            return False
+
+        return all(t.token_type != TokenType.SELECT for t in rest)
+
+    return False
 
 
 def _generate_as_hive(expression: exp.Expression) -> bool:
     if isinstance(expression, exp.Create):
         if expression.kind == "TABLE":
-            properties: t.Optional[exp.Properties] = expression.args.get("properties")
+            properties = expression.args.get("properties")
+
+            # CREATE EXTERNAL TABLE is Hive
             if properties and properties.find(exp.ExternalProperty):
-                return True  # CREATE EXTERNAL TABLE is Hive
+                return True
 
+            # Any CREATE TABLE other than CREATE TABLE ... AS <query> is Hive
             if not isinstance(expression.expression, exp.Query):
-                return True  # any CREATE TABLE other than CREATE TABLE AS SELECT is Hive
+                return True
         else:
-            return expression.kind != "VIEW"  # CREATE VIEW is never Hive but CREATE SCHEMA etc is
-
-    # https://docs.aws.amazon.com/athena/latest/ug/ddl-reference.html
-    elif isinstance(expression, (exp.Alter, exp.Drop, exp.Describe)):
+            # CREATE VIEW is Trino, but CREATE SCHEMA, CREATE DATABASE, etc, is Hive
+            return expression.kind != "VIEW"
+    elif isinstance(expression, (exp.Alter, exp.Drop, exp.Describe, exp.Show)):
         if isinstance(expression, exp.Drop) and expression.kind == "VIEW":
-            # DROP VIEW is Trino (I guess because CREATE VIEW is)
+            # DROP VIEW is Trino, because CREATE VIEW is as well
             return False
 
-        # Everything else is Hive
+        # Everything else, e.g., ALTER statements, is Hive
        return True
 
     return False
 
 
 def _is_iceberg_table(properties: exp.Properties) -> bool:
-    table_type_property = next(
-        (
-            p
-            for p in properties.expressions
-            if isinstance(p, exp.Property) and p.name == "table_type"
-        ),
-        None,
-    )
-    return bool(table_type_property and table_type_property.text("value").lower() == "iceberg")
+    for p in properties.expressions:
+        if isinstance(p, exp.Property) and p.name == "table_type":
+            return p.text("value").lower() == "iceberg"
+
+    return False
 
 
 def _location_property_sql(self: Athena.Generator, e: exp.LocationProperty):
@@ -64,6 +236,7 @@ def _partitioned_by_property_sql(self: Athena.Generator, e: exp.PartitionedByPro
     # ref: https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html#ctas-table-properties
 
     prop_name = "partitioned_by"
+
     if isinstance(e.parent, exp.Properties):
         if _is_iceberg_table(e.parent):
             prop_name = "partitioning"
@@ -71,97 +244,45 @@ def _partitioned_by_property_sql(self: Athena.Generator, e: exp.PartitionedByPro
     return f"{prop_name}={self.sql(e, 'this')}"
 
 
-class Athena(Trino):
-    """
-    Over the years, it looks like AWS has taken various execution engines, bolted on AWS-specific modifications and then
-    built the Athena service around them.
-
-    Thus, Athena is not simply hosted Trino, it's more like a router that routes SQL queries to an execution engine depending
-    on the query type.
-
-    As at 2024-09-10, assuming your Athena workgroup is configured to use "Athena engine version 3", the following engines exist:
-
-    Hive:
-     - Accepts mostly the same syntax as Hadoop / Hive
-     - Uses backticks to quote identifiers
-     - Has a distinctive DDL syntax (around things like setting table properties, storage locations etc) that is different from Trino
-     - Used for *most* DDL, with some exceptions that get routed to the Trino engine instead:
-        - CREATE [EXTERNAL] TABLE (without AS SELECT)
-        - ALTER
-        - DROP
-
-    Trino:
-     - Uses double quotes to quote identifiers
-     - Used for DDL operations that involve SELECT queries, eg:
-        - CREATE VIEW / DROP VIEW
-        - CREATE TABLE... AS SELECT
-     - Used for DML operations
-        - SELECT, INSERT, UPDATE, DELETE, MERGE
-
-    The SQLGlot Athena dialect tries to identify which engine a query would be routed to and then uses the parser / generator for that engine
-    rather than trying to create a universal syntax that can handle both types.
-    """
-
-    class Tokenizer(Trino.Tokenizer):
-        """
-        The Tokenizer is flexible enough to tokenize queries across both the Hive and Trino engines
-        """
-
-        IDENTIFIERS = ['"', "`"]
-        STRING_ESCAPES = ["'", "\\"]
-        KEYWORDS = {
-            **Hive.Tokenizer.KEYWORDS,
-            **Trino.Tokenizer.KEYWORDS,
-            "UNLOAD": TokenType.COMMAND,
-        }
-
-    class Parser(Trino.Parser):
-        """
-        Parse queries for the Athena Trino execution engine
-        """
-
-        STATEMENT_PARSERS = {
-            **Trino.Parser.STATEMENT_PARSERS,
-            TokenType.USING: lambda self: self._parse_as_command(self._prev),
-        }
-
-    class _HiveGenerator(Hive.Generator):
-        def alter_sql(self, expression: exp.Alter) -> str:
-            # package any ALTER TABLE ADD actions into a Schema object
-            # so it gets generated as `ALTER TABLE .. ADD COLUMNS(...)`
-            # instead of `ALTER TABLE ... ADD COLUMN` which is invalid syntax on Athena
-            if isinstance(expression, exp.Alter) and expression.kind == "TABLE":
-                if expression.actions and isinstance(expression.actions[0], exp.ColumnDef):
-                    new_actions = exp.Schema(expressions=expression.actions)
-                    expression.set("actions", [new_actions])
-
-            return super().alter_sql(expression)
-
-    class Generator(Trino.Generator):
-        """
-        Generate queries for the Athena Trino execution engine
-        """
-
-        PROPERTIES_LOCATION = {
-            **Trino.Generator.PROPERTIES_LOCATION,
-            exp.LocationProperty: exp.Properties.Location.POST_WITH,
-        }
-
-        TRANSFORMS = {
-            **Trino.Generator.TRANSFORMS,
-            exp.PartitionedByProperty: _partitioned_by_property_sql,
-            exp.LocationProperty: _location_property_sql,
-        }
-
-        def __init__(self, *args, **kwargs):
-            super().__init__(*args, **kwargs)
-
-            hive_kwargs = {**kwargs, "dialect": "hive"}
-
-            self._hive_generator = Athena._HiveGenerator(*args, **hive_kwargs)
-
-        def generate(self, expression: exp.Expression, copy: bool = True) -> str:
-            if _generate_as_hive(expression):
-                return self._hive_generator.generate(expression, copy)
-
-            return super().generate(expression, copy)
+# Athena extensions to Hive's generator
+class _HiveGenerator(Hive.Generator):
+    def alter_sql(self, expression: exp.Alter) -> str:
+        # Package any ALTER TABLE ADD actions into a Schema object, so it gets generated as
+        # `ALTER TABLE .. ADD COLUMNS(...)`, instead of `ALTER TABLE ... ADD COLUMN`, which
+        # is invalid syntax on Athena
+        if isinstance(expression, exp.Alter) and expression.kind == "TABLE":
+            if expression.actions and isinstance(expression.actions[0], exp.ColumnDef):
+                new_actions = exp.Schema(expressions=expression.actions)
+                expression.set("actions", [new_actions])
+
+        return super().alter_sql(expression)
+
+
+# Athena extensions to Trino's tokenizer
+class _TrinoTokenizer(Trino.Tokenizer):
+    KEYWORDS = {
+        **Trino.Tokenizer.KEYWORDS,
+        "UNLOAD": TokenType.COMMAND,
+    }
+
+
+# Athena extensions to Trino's parser
+class _TrinoParser(Trino.Parser):
+    STATEMENT_PARSERS = {
+        **Trino.Parser.STATEMENT_PARSERS,
+        TokenType.USING: lambda self: self._parse_as_command(self._prev),
+    }
+
+
+# Athena extensions to Trino's generator
+class _TrinoGenerator(Trino.Generator):
+    PROPERTIES_LOCATION = {
+        **Trino.Generator.PROPERTIES_LOCATION,
+        exp.LocationProperty: exp.Properties.Location.POST_WITH,
+    }
+
+    TRANSFORMS = {
+        **Trino.Generator.TRANSFORMS,
+        exp.PartitionedByProperty: _partitioned_by_property_sql,
+        exp.LocationProperty: _location_property_sql,
+    }
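
The engine routing described in the Athena docstring above is visible through the public API. A minimal sketch (not part of the package diff, assuming sqlglot 26.32.0); the statements are illustrative:

    import sqlglot

    # DDL with no SELECT routes to the Hive engine
    print(sqlglot.transpile("ALTER TABLE t ADD COLUMNS (y INT)", read="athena", write="athena")[0])

    # CREATE VIEW routes to the Trino engine
    print(sqlglot.transpile("CREATE VIEW v AS SELECT 1 AS c", read="athena", write="athena")[0])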
sqlglot/dialects/bigquery.py CHANGED
@@ -30,6 +30,7 @@ from sqlglot.dialects.dialect import (
     unit_to_var,
     strposition_sql,
     groupconcat_sql,
+    space_sql,
 )
 from sqlglot.helper import seq_get, split_num_words
 from sqlglot.tokens import TokenType
@@ -444,6 +445,7 @@ class BigQuery(Dialect):
                 exp.Substring,
             )
         },
+        exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
         exp.Concat: _annotate_concat,
         exp.Sign: lambda self, e: self._annotate_by_args(e, "this"),
         exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
@@ -543,7 +545,7 @@ class BigQuery(Dialect):
             "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
             "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
             "DATE_TRUNC": lambda args: exp.DateTrunc(
-                unit=exp.Literal.string(str(seq_get(args, 1))),
+                unit=seq_get(args, 1),
                 this=seq_get(args, 0),
                 zone=seq_get(args, 2),
             ),
@@ -963,9 +965,6 @@ class BigQuery(Dialect):
             exp.DateSub: date_add_interval_sql("DATE", "SUB"),
             exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
             exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
-            exp.DateTrunc: lambda self, e: self.func(
-                "DATE_TRUNC", e.this, e.text("unit"), e.args.get("zone")
-            ),
             exp.FromTimeZone: lambda self, e: self.func(
                 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
             ),
@@ -1014,6 +1013,7 @@ class BigQuery(Dialect):
             ),
             exp.SHA: rename_func("SHA1"),
             exp.SHA2: sha256_sql,
+            exp.Space: space_sql,
             exp.StabilityProperty: lambda self, e: (
                 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
             ),
@@ -1195,6 +1195,11 @@ class BigQuery(Dialect):
             "within",
         }
 
+        def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
+            unit = expression.unit
+            unit_sql = unit.name if unit.is_string else self.sql(unit)
+            return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone"))
+
         def mod_sql(self, expression: exp.Mod) -> str:
             this = expression.this
             expr = expression.expression
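
Taken together, the DATE_TRUNC changes above stop coercing the unit to a string literal at parse time: the parser keeps the raw unit expression, and the new datetrunc_sql emits it back out. A rough round-trip sketch (illustrative, not part of the diff):

    import sqlglot

    # The unit survives as a bare identifier instead of a quoted string
    print(sqlglot.transpile("SELECT DATE_TRUNC(d, MONTH)", read="bigquery", write="bigquery")[0])
    # Expected: SELECT DATE_TRUNC(d, MONTH)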
sqlglot/dialects/clickhouse.py CHANGED
@@ -303,6 +303,8 @@ class ClickHouse(Dialect):
             **parser.Parser.FUNCTIONS,
             "ANY": exp.AnyValue.from_arg_list,
             "ARRAYSUM": exp.ArraySum.from_arg_list,
+            "ARRAYREVERSE": exp.ArrayReverse.from_arg_list,
+            "ARRAYSLICE": exp.ArraySlice.from_arg_list,
             "COUNTIF": _build_count_if,
             "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None),
             "DATEADD": build_date_delta(exp.DateAdd, default_unit=None),
@@ -330,6 +332,7 @@ class ClickHouse(Dialect):
             "MD5": exp.MD5Digest.from_arg_list,
             "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
             "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
+            "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list,  # alias for camel-case substringIndex
             "EDITDISTANCE": exp.Levenshtein.from_arg_list,
             "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list,
         }
@@ -1065,6 +1068,8 @@ class ClickHouse(Dialect):
             exp.ArrayConcat: rename_func("arrayConcat"),
             exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this),
             exp.ArrayRemove: remove_from_array_using_filter,
+            exp.ArrayReverse: rename_func("arrayReverse"),
+            exp.ArraySlice: rename_func("arraySlice"),
             exp.ArraySum: rename_func("arraySum"),
             exp.ArgMax: arg_max_or_min_no_count("argMax"),
             exp.ArgMin: arg_max_or_min_no_count("argMin"),
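
The parser and generator entries above give the camelCase ClickHouse array helpers first-class AST nodes (exp.ArrayReverse, exp.ArraySlice) instead of anonymous-function fallbacks. A quick round-trip sketch (illustrative):

    import sqlglot

    print(sqlglot.transpile("SELECT arraySlice(arr, 2, 3)", read="clickhouse", write="clickhouse")[0])
    # Expected: SELECT arraySlice(arr, 2, 3)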
sqlglot/dialects/databricks.py CHANGED
@@ -9,6 +9,7 @@ from sqlglot.dialects.dialect import (
     build_date_delta,
     timestamptrunc_sql,
     build_formatted_time,
+    groupconcat_sql,
 )
 from sqlglot.dialects.spark import Spark
 from sqlglot.tokens import TokenType
@@ -87,6 +88,7 @@ class Databricks(Spark):
                 e.this,
             ),
             exp.DatetimeTrunc: timestamptrunc_sql(),
+            exp.GroupConcat: groupconcat_sql,
             exp.Select: transforms.preprocess(
                 [
                     transforms.eliminate_distinct_on,
sqlglot/dialects/dialect.py CHANGED
@@ -73,6 +73,7 @@ class Dialects(str, Enum):
     CLICKHOUSE = "clickhouse"
     DATABRICKS = "databricks"
     DORIS = "doris"
+    DREMIO = "dremio"
     DRILL = "drill"
     DRUID = "druid"
     DUCKDB = "duckdb"
@@ -96,6 +97,7 @@ class Dialects(str, Enum):
     TERADATA = "teradata"
     TRINO = "trino"
     TSQL = "tsql"
+    EXASOL = "exasol"
 
 
 class NormalizationStrategy(str, AutoName):
@@ -700,6 +702,9 @@ class Dialect(metaclass=_Dialect):
             exp.TimeAdd,
             exp.TimeSub,
         },
+        exp.DataType.Type.TIMESTAMPTZ: {
+            exp.CurrentTimestampLTZ,
+        },
         exp.DataType.Type.TIMESTAMP: {
             exp.CurrentTimestamp,
             exp.StrToTime,
@@ -755,6 +760,12 @@ class Dialect(metaclass=_Dialect):
         exp.Array: lambda self, e: self._annotate_by_args(e, "expressions", array=True),
         exp.ArrayAgg: lambda self, e: self._annotate_by_args(e, "this", array=True),
         exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
+        exp.ArrayConcatAgg: lambda self, e: self._annotate_by_args(e, "this"),
+        exp.ArrayToString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TEXT),
+        exp.ArrayFirst: lambda self, e: self._annotate_by_array_element(e),
+        exp.ArrayLast: lambda self, e: self._annotate_by_array_element(e),
+        exp.ArrayReverse: lambda self, e: self._annotate_by_args(e, "this"),
+        exp.ArraySlice: lambda self, e: self._annotate_by_args(e, "this"),
         exp.Bracket: lambda self, e: self._annotate_bracket(e),
         exp.Cast: lambda self, e: self._annotate_with_type(e, e.args["to"]),
         exp.Case: lambda self, e: self._annotate_by_args(e, "default", "ifs"),
@@ -1024,22 +1035,20 @@ class Dialect(metaclass=_Dialect):
             for expression in self.parse(sql)
         ]
 
-    def tokenize(self, sql: str) -> t.List[Token]:
-        return self.tokenizer.tokenize(sql)
+    def tokenize(self, sql: str, **opts) -> t.List[Token]:
+        return self.tokenizer(**opts).tokenize(sql)
 
-    @property
-    def tokenizer(self) -> Tokenizer:
-        return self.tokenizer_class(dialect=self)
+    def tokenizer(self, **opts) -> Tokenizer:
+        return self.tokenizer_class(**{"dialect": self, **opts})
 
-    @property
-    def jsonpath_tokenizer(self) -> JSONPathTokenizer:
-        return self.jsonpath_tokenizer_class(dialect=self)
+    def jsonpath_tokenizer(self, **opts) -> JSONPathTokenizer:
+        return self.jsonpath_tokenizer_class(**{"dialect": self, **opts})
 
     def parser(self, **opts) -> Parser:
-        return self.parser_class(dialect=self, **opts)
+        return self.parser_class(**{"dialect": self, **opts})
 
     def generator(self, **opts) -> Generator:
-        return self.generator_class(dialect=self, **opts)
+        return self.generator_class(**{"dialect": self, **opts})
 
     def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]:
         return [
1054
  return [
@@ -1906,21 +1915,32 @@ def groupconcat_sql(
1906
1915
 
1907
1916
 
1908
1917
  def build_timetostr_or_tochar(args: t.List, dialect: Dialect) -> exp.TimeToStr | exp.ToChar:
1909
- this = seq_get(args, 0)
1910
- format = seq_get(args, 1)
1911
-
1912
- if this:
1918
+ if len(args) == 2:
1919
+ this = args[0]
1913
1920
  if not this.type:
1914
1921
  from sqlglot.optimizer.annotate_types import annotate_types
1915
1922
 
1916
1923
  annotate_types(this, dialect=dialect)
1917
1924
 
1918
- from sqlglot.dialects import Snowflake
1919
-
1920
- if this.is_type(*exp.DataType.TEMPORAL_TYPES) or (
1921
- isinstance(format, exp.Literal) and format.name in Snowflake.TIME_MAPPING
1922
- ):
1925
+ if this.is_type(*exp.DataType.TEMPORAL_TYPES):
1923
1926
  dialect_name = dialect.__class__.__name__.lower()
1924
1927
  return build_formatted_time(exp.TimeToStr, dialect_name, default=True)(args)
1925
1928
 
1926
1929
  return exp.ToChar.from_arg_list(args)
1930
+
1931
+
1932
+ def build_replace_with_optional_replacement(args: t.List) -> exp.Replace:
1933
+ return exp.Replace(
1934
+ this=seq_get(args, 0),
1935
+ expression=seq_get(args, 1),
1936
+ replacement=seq_get(args, 2) or exp.Literal.string(""),
1937
+ )
1938
+
1939
+
1940
+ def space_sql(self: Generator, expression: exp.Space) -> str:
1941
+ return self.sql(
1942
+ exp.Repeat(
1943
+ this=exp.Literal.string(" "),
1944
+ times=expression.this,
1945
+ )
1946
+ )
sqlglot/dialects/dremio.py ADDED
@@ -0,0 +1,53 @@
+from sqlglot import expressions as exp
+from sqlglot import parser, generator, tokens
+from sqlglot.dialects.dialect import Dialect
+
+
+class Dremio(Dialect):
+    SUPPORTS_USER_DEFINED_TYPES = False
+    CONCAT_COALESCE = True
+    TYPED_DIVISION = True
+    SUPPORTS_SEMI_ANTI_JOIN = False
+    NULL_ORDERING = "nulls_are_last"
+    SUPPORTS_VALUES_DEFAULT = False
+
+    class Parser(parser.Parser):
+        LOG_DEFAULTS_TO_LN = True
+
+    class Generator(generator.Generator):
+        NVL2_SUPPORTED = False
+        SUPPORTS_CONVERT_TIMEZONE = True
+        INTERVAL_ALLOWS_PLURAL_FORM = False
+        JOIN_HINTS = False
+        LIMIT_ONLY_LITERALS = True
+        MULTI_ARG_DISTINCT = False
+
+        # https://docs.dremio.com/current/reference/sql/data-types/
+        TYPE_MAPPING = {
+            **generator.Generator.TYPE_MAPPING,
+            exp.DataType.Type.SMALLINT: "INT",
+            exp.DataType.Type.TINYINT: "INT",
+            exp.DataType.Type.BINARY: "VARBINARY",
+            exp.DataType.Type.TEXT: "VARCHAR",
+            exp.DataType.Type.NCHAR: "VARCHAR",
+            exp.DataType.Type.CHAR: "VARCHAR",
+            exp.DataType.Type.TIMESTAMPNTZ: "TIMESTAMP",
+            exp.DataType.Type.DATETIME: "TIMESTAMP",
+            exp.DataType.Type.ARRAY: "LIST",
+            exp.DataType.Type.BIT: "BOOLEAN",
+        }
+
+        def datatype_sql(self, expression: exp.DataType) -> str:
+            """
+            Reject time-zone–aware TIMESTAMPs, which Dremio does not accept
+            """
+            if expression.is_type(
+                exp.DataType.Type.TIMESTAMPTZ,
+                exp.DataType.Type.TIMESTAMPLTZ,
+            ):
+                self.unsupported("Dremio does not support time-zone-aware TIMESTAMP")
+
+            return super().datatype_sql(expression)
+
+    class Tokenizer(tokens.Tokenizer):
+        COMMENTS = ["--", "//", ("/*", "*/")]
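
Because the module is also registered in DIALECTS above, the new dialect is addressable by name. A quick sketch (illustrative) of the TYPE_MAPPING in action:

    import sqlglot

    print(sqlglot.transpile("CREATE TABLE t (a TEXT, b TINYINT)", write="dremio")[0])
    # Expected: CREATE TABLE t (a VARCHAR, b INT)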