altimate-code 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. package/CHANGELOG.md +12 -0
  2. package/bin/altimate +6 -0
  3. package/bin/altimate-code +6 -0
  4. package/dbt-tools/bin/altimate-dbt +2 -0
  5. package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/__init__.py +0 -0
  6. package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/fetch_schema.py +35 -0
  7. package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/utils.py +353 -0
  8. package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/validate_sql.py +114 -0
  9. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/__init__.py +178 -0
  10. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/__main__.py +96 -0
  11. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/_typing.py +17 -0
  12. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/__init__.py +3 -0
  13. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/__init__.py +18 -0
  14. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/_typing.py +18 -0
  15. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/column.py +332 -0
  16. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/dataframe.py +866 -0
  17. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/functions.py +1267 -0
  18. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/group.py +59 -0
  19. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/normalize.py +78 -0
  20. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/operations.py +53 -0
  21. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/readwriter.py +108 -0
  22. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/session.py +190 -0
  23. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/transforms.py +9 -0
  24. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/types.py +212 -0
  25. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/util.py +32 -0
  26. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/window.py +134 -0
  27. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/__init__.py +118 -0
  28. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/athena.py +166 -0
  29. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/bigquery.py +1331 -0
  30. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/clickhouse.py +1393 -0
  31. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/databricks.py +131 -0
  32. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/dialect.py +1915 -0
  33. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/doris.py +561 -0
  34. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/drill.py +157 -0
  35. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/druid.py +20 -0
  36. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/duckdb.py +1159 -0
  37. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/dune.py +16 -0
  38. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/hive.py +787 -0
  39. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/materialize.py +94 -0
  40. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/mysql.py +1324 -0
  41. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/oracle.py +378 -0
  42. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/postgres.py +778 -0
  43. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/presto.py +788 -0
  44. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/prql.py +203 -0
  45. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/redshift.py +448 -0
  46. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/risingwave.py +78 -0
  47. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/snowflake.py +1464 -0
  48. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/spark.py +202 -0
  49. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/spark2.py +349 -0
  50. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/sqlite.py +320 -0
  51. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/starrocks.py +343 -0
  52. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/tableau.py +61 -0
  53. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/teradata.py +356 -0
  54. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/trino.py +115 -0
  55. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/tsql.py +1403 -0
  56. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/diff.py +456 -0
  57. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/errors.py +93 -0
  58. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/__init__.py +95 -0
  59. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/context.py +101 -0
  60. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/env.py +246 -0
  61. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/python.py +460 -0
  62. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/table.py +155 -0
  63. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/expressions.py +8870 -0
  64. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/generator.py +4993 -0
  65. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/helper.py +582 -0
  66. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/jsonpath.py +227 -0
  67. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/lineage.py +423 -0
  68. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/__init__.py +11 -0
  69. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/annotate_types.py +589 -0
  70. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/canonicalize.py +222 -0
  71. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/eliminate_ctes.py +43 -0
  72. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/eliminate_joins.py +181 -0
  73. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/eliminate_subqueries.py +189 -0
  74. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/isolate_table_selects.py +50 -0
  75. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/merge_subqueries.py +415 -0
  76. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/normalize.py +200 -0
  77. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/normalize_identifiers.py +64 -0
  78. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/optimize_joins.py +91 -0
  79. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/optimizer.py +94 -0
  80. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/pushdown_predicates.py +222 -0
  81. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/pushdown_projections.py +172 -0
  82. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/qualify.py +104 -0
  83. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/qualify_columns.py +1024 -0
  84. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/qualify_tables.py +155 -0
  85. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/scope.py +904 -0
  86. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/simplify.py +1587 -0
  87. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/unnest_subqueries.py +302 -0
  88. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/parser.py +8501 -0
  89. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/planner.py +463 -0
  90. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/schema.py +588 -0
  91. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/serde.py +68 -0
  92. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/time.py +687 -0
  93. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/tokens.py +1520 -0
  94. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/transforms.py +1020 -0
  95. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/trie.py +81 -0
  96. package/dbt-tools/dist/altimate_python_packages/dbt_core_integration.py +825 -0
  97. package/dbt-tools/dist/altimate_python_packages/dbt_utils.py +157 -0
  98. package/dbt-tools/dist/index.js +23859 -0
  99. package/package.json +13 -13
  100. package/postinstall.mjs +42 -0
  101. package/skills/altimate-setup/SKILL.md +31 -0
package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/tokens.py
@@ -0,0 +1,1520 @@
+from __future__ import annotations
+
+import os
+import typing as t
+from enum import auto
+
+from sqlglot.errors import SqlglotError, TokenError
+from sqlglot.helper import AutoName
+from sqlglot.trie import TrieResult, in_trie, new_trie
+
+if t.TYPE_CHECKING:
+    from sqlglot.dialects.dialect import DialectType
+
+
+try:
+    from sqlglotrs import (  # type: ignore
+        Tokenizer as RsTokenizer,
+        TokenizerDialectSettings as RsTokenizerDialectSettings,
+        TokenizerSettings as RsTokenizerSettings,
+        TokenTypeSettings as RsTokenTypeSettings,
+    )
+
+    USE_RS_TOKENIZER = os.environ.get("SQLGLOTRS_TOKENIZER", "1") == "1"
+except ImportError:
+    USE_RS_TOKENIZER = False
+
+
+class TokenType(AutoName):
+    L_PAREN = auto()
+    R_PAREN = auto()
+    L_BRACKET = auto()
+    R_BRACKET = auto()
+    L_BRACE = auto()
+    R_BRACE = auto()
+    COMMA = auto()
+    DOT = auto()
+    DASH = auto()
+    PLUS = auto()
+    COLON = auto()
+    DOTCOLON = auto()
+    DCOLON = auto()
+    DQMARK = auto()
+    SEMICOLON = auto()
+    STAR = auto()
+    BACKSLASH = auto()
+    SLASH = auto()
+    LT = auto()
+    LTE = auto()
+    GT = auto()
+    GTE = auto()
+    NOT = auto()
+    EQ = auto()
+    NEQ = auto()
+    NULLSAFE_EQ = auto()
+    COLON_EQ = auto()
+    AND = auto()
+    OR = auto()
+    AMP = auto()
+    DPIPE = auto()
+    PIPE_GT = auto()
+    PIPE = auto()
+    PIPE_SLASH = auto()
+    DPIPE_SLASH = auto()
+    CARET = auto()
+    CARET_AT = auto()
+    TILDA = auto()
+    ARROW = auto()
+    DARROW = auto()
+    FARROW = auto()
+    HASH = auto()
+    HASH_ARROW = auto()
+    DHASH_ARROW = auto()
+    LR_ARROW = auto()
+    DAT = auto()
+    LT_AT = auto()
+    AT_GT = auto()
+    DOLLAR = auto()
+    PARAMETER = auto()
+    SESSION_PARAMETER = auto()
+    DAMP = auto()
+    XOR = auto()
+    DSTAR = auto()
+
+    URI_START = auto()
+
+    BLOCK_START = auto()
+    BLOCK_END = auto()
+
+    SPACE = auto()
+    BREAK = auto()
+
+    STRING = auto()
+    NUMBER = auto()
+    IDENTIFIER = auto()
+    DATABASE = auto()
+    COLUMN = auto()
+    COLUMN_DEF = auto()
+    SCHEMA = auto()
+    TABLE = auto()
+    WAREHOUSE = auto()
+    STAGE = auto()
+    STREAMLIT = auto()
+    VAR = auto()
+    BIT_STRING = auto()
+    HEX_STRING = auto()
+    BYTE_STRING = auto()
+    NATIONAL_STRING = auto()
+    RAW_STRING = auto()
+    HEREDOC_STRING = auto()
+    UNICODE_STRING = auto()
+
+    # types
+    BIT = auto()
+    BOOLEAN = auto()
+    TINYINT = auto()
+    UTINYINT = auto()
+    SMALLINT = auto()
+    USMALLINT = auto()
+    MEDIUMINT = auto()
+    UMEDIUMINT = auto()
+    INT = auto()
+    UINT = auto()
+    BIGINT = auto()
+    UBIGINT = auto()
+    INT128 = auto()
+    UINT128 = auto()
+    INT256 = auto()
+    UINT256 = auto()
+    FLOAT = auto()
+    DOUBLE = auto()
+    UDOUBLE = auto()
+    DECIMAL = auto()
+    DECIMAL32 = auto()
+    DECIMAL64 = auto()
+    DECIMAL128 = auto()
+    DECIMAL256 = auto()
+    UDECIMAL = auto()
+    BIGDECIMAL = auto()
+    CHAR = auto()
+    NCHAR = auto()
+    VARCHAR = auto()
+    NVARCHAR = auto()
+    BPCHAR = auto()
+    TEXT = auto()
+    MEDIUMTEXT = auto()
+    LONGTEXT = auto()
+    BLOB = auto()
+    MEDIUMBLOB = auto()
+    LONGBLOB = auto()
+    TINYBLOB = auto()
+    TINYTEXT = auto()
+    NAME = auto()
+    BINARY = auto()
+    VARBINARY = auto()
+    JSON = auto()
+    JSONB = auto()
+    TIME = auto()
+    TIMETZ = auto()
+    TIMESTAMP = auto()
+    TIMESTAMPTZ = auto()
+    TIMESTAMPLTZ = auto()
+    TIMESTAMPNTZ = auto()
+    TIMESTAMP_S = auto()
+    TIMESTAMP_MS = auto()
+    TIMESTAMP_NS = auto()
+    DATETIME = auto()
+    DATETIME2 = auto()
+    DATETIME64 = auto()
+    SMALLDATETIME = auto()
+    DATE = auto()
+    DATE32 = auto()
+    INT4RANGE = auto()
+    INT4MULTIRANGE = auto()
+    INT8RANGE = auto()
+    INT8MULTIRANGE = auto()
+    NUMRANGE = auto()
+    NUMMULTIRANGE = auto()
+    TSRANGE = auto()
+    TSMULTIRANGE = auto()
+    TSTZRANGE = auto()
+    TSTZMULTIRANGE = auto()
+    DATERANGE = auto()
+    DATEMULTIRANGE = auto()
+    UUID = auto()
+    GEOGRAPHY = auto()
+    NULLABLE = auto()
+    GEOMETRY = auto()
+    POINT = auto()
+    RING = auto()
+    LINESTRING = auto()
+    MULTILINESTRING = auto()
+    POLYGON = auto()
+    MULTIPOLYGON = auto()
+    HLLSKETCH = auto()
+    HSTORE = auto()
+    SUPER = auto()
+    SERIAL = auto()
+    SMALLSERIAL = auto()
+    BIGSERIAL = auto()
+    XML = auto()
+    YEAR = auto()
+    USERDEFINED = auto()
+    MONEY = auto()
+    SMALLMONEY = auto()
+    ROWVERSION = auto()
+    IMAGE = auto()
+    VARIANT = auto()
+    OBJECT = auto()
+    INET = auto()
+    IPADDRESS = auto()
+    IPPREFIX = auto()
+    IPV4 = auto()
+    IPV6 = auto()
+    ENUM = auto()
+    ENUM8 = auto()
+    ENUM16 = auto()
+    FIXEDSTRING = auto()
+    LOWCARDINALITY = auto()
+    NESTED = auto()
+    AGGREGATEFUNCTION = auto()
+    SIMPLEAGGREGATEFUNCTION = auto()
+    TDIGEST = auto()
+    UNKNOWN = auto()
+    VECTOR = auto()
+    DYNAMIC = auto()
+    VOID = auto()
+
+    # keywords
+    ALIAS = auto()
+    ALTER = auto()
+    ALWAYS = auto()
+    ALL = auto()
+    ANTI = auto()
+    ANY = auto()
+    APPLY = auto()
+    ARRAY = auto()
+    ASC = auto()
+    ASOF = auto()
+    ATTACH = auto()
+    AUTO_INCREMENT = auto()
+    BEGIN = auto()
+    BETWEEN = auto()
+    BULK_COLLECT_INTO = auto()
+    CACHE = auto()
+    CASE = auto()
+    CHARACTER_SET = auto()
+    CLUSTER_BY = auto()
+    COLLATE = auto()
+    COMMAND = auto()
+    COMMENT = auto()
+    COMMIT = auto()
+    CONNECT_BY = auto()
+    CONSTRAINT = auto()
+    COPY = auto()
+    CREATE = auto()
+    CROSS = auto()
+    CUBE = auto()
+    CURRENT_DATE = auto()
+    CURRENT_DATETIME = auto()
+    CURRENT_SCHEMA = auto()
+    CURRENT_TIME = auto()
+    CURRENT_TIMESTAMP = auto()
+    CURRENT_USER = auto()
+    DECLARE = auto()
+    DEFAULT = auto()
+    DELETE = auto()
+    DESC = auto()
+    DESCRIBE = auto()
+    DETACH = auto()
+    DICTIONARY = auto()
+    DISTINCT = auto()
+    DISTRIBUTE_BY = auto()
+    DIV = auto()
+    DROP = auto()
+    ELSE = auto()
+    END = auto()
+    ESCAPE = auto()
+    EXCEPT = auto()
+    EXECUTE = auto()
+    EXISTS = auto()
+    FALSE = auto()
+    FETCH = auto()
+    FILE_FORMAT = auto()
+    FILTER = auto()
+    FINAL = auto()
+    FIRST = auto()
+    FOR = auto()
+    FORCE = auto()
+    FOREIGN_KEY = auto()
+    FORMAT = auto()
+    FROM = auto()
+    FULL = auto()
+    FUNCTION = auto()
+    GET = auto()
+    GLOB = auto()
+    GLOBAL = auto()
+    GRANT = auto()
+    GROUP_BY = auto()
+    GROUPING_SETS = auto()
+    HAVING = auto()
+    HINT = auto()
+    IGNORE = auto()
+    ILIKE = auto()
+    ILIKE_ANY = auto()
+    IN = auto()
+    INDEX = auto()
+    INNER = auto()
+    INSERT = auto()
+    INTERSECT = auto()
+    INTERVAL = auto()
+    INTO = auto()
+    INTRODUCER = auto()
+    IRLIKE = auto()
+    IS = auto()
+    ISNULL = auto()
+    JOIN = auto()
+    JOIN_MARKER = auto()
+    KEEP = auto()
+    KEY = auto()
+    KILL = auto()
+    LANGUAGE = auto()
+    LATERAL = auto()
+    LEFT = auto()
+    LIKE = auto()
+    LIKE_ANY = auto()
+    LIMIT = auto()
+    LIST = auto()
+    LOAD = auto()
+    LOCK = auto()
+    MAP = auto()
+    MATCH_CONDITION = auto()
+    MATCH_RECOGNIZE = auto()
+    MEMBER_OF = auto()
+    MERGE = auto()
+    MOD = auto()
+    MODEL = auto()
+    NATURAL = auto()
+    NEXT = auto()
+    NOTHING = auto()
+    NOTNULL = auto()
+    NULL = auto()
+    OBJECT_IDENTIFIER = auto()
+    OFFSET = auto()
+    ON = auto()
+    ONLY = auto()
+    OPERATOR = auto()
+    ORDER_BY = auto()
+    ORDER_SIBLINGS_BY = auto()
+    ORDERED = auto()
+    ORDINALITY = auto()
+    OUTER = auto()
+    OVER = auto()
+    OVERLAPS = auto()
+    OVERWRITE = auto()
+    PARTITION = auto()
+    PARTITION_BY = auto()
+    PERCENT = auto()
+    PIVOT = auto()
+    PLACEHOLDER = auto()
+    POSITIONAL = auto()
+    PRAGMA = auto()
+    PREWHERE = auto()
+    PRIMARY_KEY = auto()
+    PROCEDURE = auto()
+    PROPERTIES = auto()
+    PSEUDO_TYPE = auto()
+    PUT = auto()
+    QUALIFY = auto()
+    QUOTE = auto()
+    RANGE = auto()
+    RECURSIVE = auto()
+    REFRESH = auto()
+    RENAME = auto()
+    REPLACE = auto()
+    RETURNING = auto()
+    REFERENCES = auto()
+    RIGHT = auto()
+    RLIKE = auto()
+    ROLLBACK = auto()
+    ROLLUP = auto()
+    ROW = auto()
+    ROWS = auto()
+    SELECT = auto()
+    SEMI = auto()
+    SEPARATOR = auto()
+    SEQUENCE = auto()
+    SERDE_PROPERTIES = auto()
+    SET = auto()
+    SETTINGS = auto()
+    SHOW = auto()
+    SIMILAR_TO = auto()
+    SOME = auto()
+    SORT_BY = auto()
+    START_WITH = auto()
+    STORAGE_INTEGRATION = auto()
+    STRAIGHT_JOIN = auto()
+    STRUCT = auto()
+    SUMMARIZE = auto()
+    TABLE_SAMPLE = auto()
+    TAG = auto()
+    TEMPORARY = auto()
+    TOP = auto()
+    THEN = auto()
+    TRUE = auto()
+    TRUNCATE = auto()
+    UNCACHE = auto()
+    UNION = auto()
+    UNNEST = auto()
+    UNPIVOT = auto()
+    UPDATE = auto()
+    USE = auto()
+    USING = auto()
+    VALUES = auto()
+    VIEW = auto()
+    VOLATILE = auto()
+    WHEN = auto()
+    WHERE = auto()
+    WINDOW = auto()
+    WITH = auto()
+    UNIQUE = auto()
+    VERSION_SNAPSHOT = auto()
+    TIMESTAMP_SNAPSHOT = auto()
+    OPTION = auto()
+    SINK = auto()
+    SOURCE = auto()
+    ANALYZE = auto()
+    NAMESPACE = auto()
+    EXPORT = auto()
+
+
+_ALL_TOKEN_TYPES = list(TokenType)
+_TOKEN_TYPE_TO_INDEX = {token_type: i for i, token_type in enumerate(_ALL_TOKEN_TYPES)}
+
+
+class Token:
+    __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")
+
+    @classmethod
+    def number(cls, number: int) -> Token:
+        """Returns a NUMBER token with `number` as its text."""
+        return cls(TokenType.NUMBER, str(number))
+
+    @classmethod
+    def string(cls, string: str) -> Token:
+        """Returns a STRING token with `string` as its text."""
+        return cls(TokenType.STRING, string)
+
+    @classmethod
+    def identifier(cls, identifier: str) -> Token:
+        """Returns an IDENTIFIER token with `identifier` as its text."""
+        return cls(TokenType.IDENTIFIER, identifier)
+
+    @classmethod
+    def var(cls, var: str) -> Token:
+        """Returns a VAR token with `var` as its text."""
+        return cls(TokenType.VAR, var)
+
+    def __init__(
+        self,
+        token_type: TokenType,
+        text: str,
+        line: int = 1,
+        col: int = 1,
+        start: int = 0,
+        end: int = 0,
+        comments: t.Optional[t.List[str]] = None,
+    ) -> None:
+        """Token initializer.
+
+        Args:
+            token_type: The TokenType Enum.
+            text: The text of the token.
+            line: The line that the token ends on.
+            col: The column that the token ends on.
+            start: The start index of the token.
+            end: The ending index of the token.
+            comments: The comments to attach to the token.
+        """
+        self.token_type = token_type
+        self.text = text
+        self.line = line
+        self.col = col
+        self.start = start
+        self.end = end
+        self.comments = [] if comments is None else comments
+
+    def __repr__(self) -> str:
+        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
+        return f"<Token {attributes}>"
+
+
+class _Tokenizer(type):
+    def __new__(cls, clsname, bases, attrs):
+        klass = super().__new__(cls, clsname, bases, attrs)
+
+        def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
+            return dict(
+                (item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr
+            )
+
+        def _quotes_to_format(
+            token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
+        ) -> t.Dict[str, t.Tuple[str, TokenType]]:
+            return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}
+
+        klass._QUOTES = _convert_quotes(klass.QUOTES)
+        klass._IDENTIFIERS = _convert_quotes(klass.IDENTIFIERS)
+
+        klass._FORMAT_STRINGS = {
+            **{
+                p + s: (e, TokenType.NATIONAL_STRING)
+                for s, e in klass._QUOTES.items()
+                for p in ("n", "N")
+            },
+            **_quotes_to_format(TokenType.BIT_STRING, klass.BIT_STRINGS),
+            **_quotes_to_format(TokenType.BYTE_STRING, klass.BYTE_STRINGS),
+            **_quotes_to_format(TokenType.HEX_STRING, klass.HEX_STRINGS),
+            **_quotes_to_format(TokenType.RAW_STRING, klass.RAW_STRINGS),
+            **_quotes_to_format(TokenType.HEREDOC_STRING, klass.HEREDOC_STRINGS),
+            **_quotes_to_format(TokenType.UNICODE_STRING, klass.UNICODE_STRINGS),
+        }
+
+        klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
+        klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
+        klass._COMMENTS = {
+            **dict(
+                (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
+                for comment in klass.COMMENTS
+            ),
+            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
+        }
+        if klass.HINT_START in klass.KEYWORDS:
+            klass._COMMENTS[klass.HINT_START] = "*/"
+
+        klass._KEYWORD_TRIE = new_trie(
+            key.upper()
+            for key in (
+                *klass.KEYWORDS,
+                *klass._COMMENTS,
+                *klass._QUOTES,
+                *klass._FORMAT_STRINGS,
+            )
+            if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
+        )
+
+        if USE_RS_TOKENIZER:
+            settings = RsTokenizerSettings(
+                white_space={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.WHITE_SPACE.items()},
+                single_tokens={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.SINGLE_TOKENS.items()},
+                keywords={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.KEYWORDS.items()},
+                numeric_literals=klass.NUMERIC_LITERALS,
+                identifiers=klass._IDENTIFIERS,
+                identifier_escapes=klass._IDENTIFIER_ESCAPES,
+                string_escapes=klass._STRING_ESCAPES,
+                quotes=klass._QUOTES,
+                format_strings={
+                    k: (v1, _TOKEN_TYPE_TO_INDEX[v2])
+                    for k, (v1, v2) in klass._FORMAT_STRINGS.items()
+                },
+                has_bit_strings=bool(klass.BIT_STRINGS),
+                has_hex_strings=bool(klass.HEX_STRINGS),
+                comments=klass._COMMENTS,
+                var_single_tokens=klass.VAR_SINGLE_TOKENS,
+                commands={_TOKEN_TYPE_TO_INDEX[v] for v in klass.COMMANDS},
+                command_prefix_tokens={
+                    _TOKEN_TYPE_TO_INDEX[v] for v in klass.COMMAND_PREFIX_TOKENS
+                },
+                heredoc_tag_is_identifier=klass.HEREDOC_TAG_IS_IDENTIFIER,
+                string_escapes_allowed_in_raw_strings=klass.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
+                nested_comments=klass.NESTED_COMMENTS,
+                hint_start=klass.HINT_START,
+                tokens_preceding_hint={
+                    _TOKEN_TYPE_TO_INDEX[v] for v in klass.TOKENS_PRECEDING_HINT
+                },
+            )
+            token_types = RsTokenTypeSettings(
+                bit_string=_TOKEN_TYPE_TO_INDEX[TokenType.BIT_STRING],
+                break_=_TOKEN_TYPE_TO_INDEX[TokenType.BREAK],
+                dcolon=_TOKEN_TYPE_TO_INDEX[TokenType.DCOLON],
+                heredoc_string=_TOKEN_TYPE_TO_INDEX[TokenType.HEREDOC_STRING],
+                raw_string=_TOKEN_TYPE_TO_INDEX[TokenType.RAW_STRING],
+                hex_string=_TOKEN_TYPE_TO_INDEX[TokenType.HEX_STRING],
+                identifier=_TOKEN_TYPE_TO_INDEX[TokenType.IDENTIFIER],
+                number=_TOKEN_TYPE_TO_INDEX[TokenType.NUMBER],
+                parameter=_TOKEN_TYPE_TO_INDEX[TokenType.PARAMETER],
+                semicolon=_TOKEN_TYPE_TO_INDEX[TokenType.SEMICOLON],
+                string=_TOKEN_TYPE_TO_INDEX[TokenType.STRING],
+                var=_TOKEN_TYPE_TO_INDEX[TokenType.VAR],
+                heredoc_string_alternative=_TOKEN_TYPE_TO_INDEX[klass.HEREDOC_STRING_ALTERNATIVE],
+                hint=_TOKEN_TYPE_TO_INDEX[TokenType.HINT],
+            )
+            klass._RS_TOKENIZER = RsTokenizer(settings, token_types)
+        else:
+            klass._RS_TOKENIZER = None
+
+        return klass
+
+
+class Tokenizer(metaclass=_Tokenizer):
+    SINGLE_TOKENS = {
+        "(": TokenType.L_PAREN,
+        ")": TokenType.R_PAREN,
+        "[": TokenType.L_BRACKET,
+        "]": TokenType.R_BRACKET,
+        "{": TokenType.L_BRACE,
+        "}": TokenType.R_BRACE,
+        "&": TokenType.AMP,
+        "^": TokenType.CARET,
+        ":": TokenType.COLON,
+        ",": TokenType.COMMA,
+        ".": TokenType.DOT,
+        "-": TokenType.DASH,
+        "=": TokenType.EQ,
+        ">": TokenType.GT,
+        "<": TokenType.LT,
+        "%": TokenType.MOD,
+        "!": TokenType.NOT,
+        "|": TokenType.PIPE,
+        "+": TokenType.PLUS,
+        ";": TokenType.SEMICOLON,
+        "/": TokenType.SLASH,
+        "\\": TokenType.BACKSLASH,
+        "*": TokenType.STAR,
+        "~": TokenType.TILDA,
+        "?": TokenType.PLACEHOLDER,
+        "@": TokenType.PARAMETER,
+        "#": TokenType.HASH,
+        # Used for breaking a var like x'y' but nothing else; the token type doesn't matter
+        "'": TokenType.UNKNOWN,
+        "`": TokenType.UNKNOWN,
+        '"': TokenType.UNKNOWN,
+    }
+
+    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
+    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
+    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
+    RAW_STRINGS: t.List[str | t.Tuple[str, str]] = []
+    HEREDOC_STRINGS: t.List[str | t.Tuple[str, str]] = []
+    UNICODE_STRINGS: t.List[str | t.Tuple[str, str]] = []
+    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
+    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
+    STRING_ESCAPES = ["'"]
+    VAR_SINGLE_TOKENS: t.Set[str] = set()
+
+    # The strings in this list can always be used as escapes, regardless of the surrounding
+    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
+    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
+    IDENTIFIER_ESCAPES: t.List[str] = []
+
+    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
+    HEREDOC_TAG_IS_IDENTIFIER = False
+
+    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
+    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR
+
+    # Whether string escape characters function as such when placed within raw strings
+    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
+
+    NESTED_COMMENTS = True
+
+    HINT_START = "/*+"
+
+    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}
+
+    # Autofilled
+    _COMMENTS: t.Dict[str, str] = {}
+    _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
+    _IDENTIFIERS: t.Dict[str, str] = {}
+    _IDENTIFIER_ESCAPES: t.Set[str] = set()
+    _QUOTES: t.Dict[str, str] = {}
+    _STRING_ESCAPES: t.Set[str] = set()
+    _KEYWORD_TRIE: t.Dict = {}
+    _RS_TOKENIZER: t.Optional[t.Any] = None
+
+    KEYWORDS: t.Dict[str, TokenType] = {
+        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
+        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
+        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
+        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
+        HINT_START: TokenType.HINT,
+        "==": TokenType.EQ,
+        "::": TokenType.DCOLON,
+        "||": TokenType.DPIPE,
+        "|>": TokenType.PIPE_GT,
+        ">=": TokenType.GTE,
+        "<=": TokenType.LTE,
+        "<>": TokenType.NEQ,
+        "!=": TokenType.NEQ,
+        ":=": TokenType.COLON_EQ,
+        "<=>": TokenType.NULLSAFE_EQ,
+        "->": TokenType.ARROW,
+        "->>": TokenType.DARROW,
+        "=>": TokenType.FARROW,
+        "#>": TokenType.HASH_ARROW,
+        "#>>": TokenType.DHASH_ARROW,
+        "<->": TokenType.LR_ARROW,
+        "&&": TokenType.DAMP,
+        "??": TokenType.DQMARK,
+        "~~~": TokenType.GLOB,
+        "~~": TokenType.LIKE,
+        "~~*": TokenType.ILIKE,
+        "~*": TokenType.IRLIKE,
+        "ALL": TokenType.ALL,
+        "ALWAYS": TokenType.ALWAYS,
+        "AND": TokenType.AND,
+        "ANTI": TokenType.ANTI,
+        "ANY": TokenType.ANY,
+        "ASC": TokenType.ASC,
+        "AS": TokenType.ALIAS,
+        "ASOF": TokenType.ASOF,
+        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
+        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
+        "BEGIN": TokenType.BEGIN,
+        "BETWEEN": TokenType.BETWEEN,
+        "CACHE": TokenType.CACHE,
+        "UNCACHE": TokenType.UNCACHE,
+        "CASE": TokenType.CASE,
+        "CHARACTER SET": TokenType.CHARACTER_SET,
+        "CLUSTER BY": TokenType.CLUSTER_BY,
+        "COLLATE": TokenType.COLLATE,
+        "COLUMN": TokenType.COLUMN,
+        "COMMIT": TokenType.COMMIT,
+        "CONNECT BY": TokenType.CONNECT_BY,
+        "CONSTRAINT": TokenType.CONSTRAINT,
+        "COPY": TokenType.COPY,
+        "CREATE": TokenType.CREATE,
+        "CROSS": TokenType.CROSS,
+        "CUBE": TokenType.CUBE,
+        "CURRENT_DATE": TokenType.CURRENT_DATE,
+        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
+        "CURRENT_TIME": TokenType.CURRENT_TIME,
+        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
+        "CURRENT_USER": TokenType.CURRENT_USER,
+        "DATABASE": TokenType.DATABASE,
+        "DEFAULT": TokenType.DEFAULT,
+        "DELETE": TokenType.DELETE,
+        "DESC": TokenType.DESC,
+        "DESCRIBE": TokenType.DESCRIBE,
+        "DISTINCT": TokenType.DISTINCT,
+        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
+        "DIV": TokenType.DIV,
+        "DROP": TokenType.DROP,
+        "ELSE": TokenType.ELSE,
+        "END": TokenType.END,
+        "ENUM": TokenType.ENUM,
+        "ESCAPE": TokenType.ESCAPE,
+        "EXCEPT": TokenType.EXCEPT,
+        "EXECUTE": TokenType.EXECUTE,
+        "EXISTS": TokenType.EXISTS,
+        "FALSE": TokenType.FALSE,
+        "FETCH": TokenType.FETCH,
+        "FILTER": TokenType.FILTER,
+        "FIRST": TokenType.FIRST,
+        "FULL": TokenType.FULL,
+        "FUNCTION": TokenType.FUNCTION,
+        "FOR": TokenType.FOR,
+        "FOREIGN KEY": TokenType.FOREIGN_KEY,
+        "FORMAT": TokenType.FORMAT,
+        "FROM": TokenType.FROM,
+        "GEOGRAPHY": TokenType.GEOGRAPHY,
+        "GEOMETRY": TokenType.GEOMETRY,
+        "GLOB": TokenType.GLOB,
+        "GROUP BY": TokenType.GROUP_BY,
+        "GROUPING SETS": TokenType.GROUPING_SETS,
+        "HAVING": TokenType.HAVING,
+        "ILIKE": TokenType.ILIKE,
+        "IN": TokenType.IN,
+        "INDEX": TokenType.INDEX,
+        "INET": TokenType.INET,
+        "INNER": TokenType.INNER,
+        "INSERT": TokenType.INSERT,
+        "INTERVAL": TokenType.INTERVAL,
+        "INTERSECT": TokenType.INTERSECT,
+        "INTO": TokenType.INTO,
+        "IS": TokenType.IS,
+        "ISNULL": TokenType.ISNULL,
+        "JOIN": TokenType.JOIN,
+        "KEEP": TokenType.KEEP,
+        "KILL": TokenType.KILL,
+        "LATERAL": TokenType.LATERAL,
+        "LEFT": TokenType.LEFT,
+        "LIKE": TokenType.LIKE,
+        "LIMIT": TokenType.LIMIT,
+        "LOAD": TokenType.LOAD,
+        "LOCK": TokenType.LOCK,
+        "MERGE": TokenType.MERGE,
+        "NAMESPACE": TokenType.NAMESPACE,
+        "NATURAL": TokenType.NATURAL,
+        "NEXT": TokenType.NEXT,
+        "NOT": TokenType.NOT,
+        "NOTNULL": TokenType.NOTNULL,
+        "NULL": TokenType.NULL,
+        "OBJECT": TokenType.OBJECT,
+        "OFFSET": TokenType.OFFSET,
+        "ON": TokenType.ON,
+        "OR": TokenType.OR,
+        "XOR": TokenType.XOR,
+        "ORDER BY": TokenType.ORDER_BY,
+        "ORDINALITY": TokenType.ORDINALITY,
+        "OUTER": TokenType.OUTER,
+        "OVER": TokenType.OVER,
+        "OVERLAPS": TokenType.OVERLAPS,
+        "OVERWRITE": TokenType.OVERWRITE,
+        "PARTITION": TokenType.PARTITION,
+        "PARTITION BY": TokenType.PARTITION_BY,
+        "PARTITIONED BY": TokenType.PARTITION_BY,
+        "PARTITIONED_BY": TokenType.PARTITION_BY,
+        "PERCENT": TokenType.PERCENT,
+        "PIVOT": TokenType.PIVOT,
+        "PRAGMA": TokenType.PRAGMA,
+        "PRIMARY KEY": TokenType.PRIMARY_KEY,
+        "PROCEDURE": TokenType.PROCEDURE,
+        "QUALIFY": TokenType.QUALIFY,
+        "RANGE": TokenType.RANGE,
+        "RECURSIVE": TokenType.RECURSIVE,
+        "REGEXP": TokenType.RLIKE,
+        "RENAME": TokenType.RENAME,
+        "REPLACE": TokenType.REPLACE,
+        "RETURNING": TokenType.RETURNING,
+        "REFERENCES": TokenType.REFERENCES,
+        "RIGHT": TokenType.RIGHT,
+        "RLIKE": TokenType.RLIKE,
+        "ROLLBACK": TokenType.ROLLBACK,
+        "ROLLUP": TokenType.ROLLUP,
+        "ROW": TokenType.ROW,
+        "ROWS": TokenType.ROWS,
+        "SCHEMA": TokenType.SCHEMA,
+        "SELECT": TokenType.SELECT,
+        "SEMI": TokenType.SEMI,
+        "SET": TokenType.SET,
+        "SETTINGS": TokenType.SETTINGS,
+        "SHOW": TokenType.SHOW,
+        "SIMILAR TO": TokenType.SIMILAR_TO,
+        "SOME": TokenType.SOME,
+        "SORT BY": TokenType.SORT_BY,
+        "START WITH": TokenType.START_WITH,
+        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
+        "TABLE": TokenType.TABLE,
+        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
+        "TEMP": TokenType.TEMPORARY,
+        "TEMPORARY": TokenType.TEMPORARY,
+        "THEN": TokenType.THEN,
+        "TRUE": TokenType.TRUE,
+        "TRUNCATE": TokenType.TRUNCATE,
+        "UNION": TokenType.UNION,
+        "UNKNOWN": TokenType.UNKNOWN,
+        "UNNEST": TokenType.UNNEST,
+        "UNPIVOT": TokenType.UNPIVOT,
+        "UPDATE": TokenType.UPDATE,
+        "USE": TokenType.USE,
+        "USING": TokenType.USING,
+        "UUID": TokenType.UUID,
+        "VALUES": TokenType.VALUES,
+        "VIEW": TokenType.VIEW,
+        "VOLATILE": TokenType.VOLATILE,
+        "WHEN": TokenType.WHEN,
+        "WHERE": TokenType.WHERE,
+        "WINDOW": TokenType.WINDOW,
+        "WITH": TokenType.WITH,
+        "APPLY": TokenType.APPLY,
+        "ARRAY": TokenType.ARRAY,
+        "BIT": TokenType.BIT,
+        "BOOL": TokenType.BOOLEAN,
+        "BOOLEAN": TokenType.BOOLEAN,
+        "BYTE": TokenType.TINYINT,
+        "MEDIUMINT": TokenType.MEDIUMINT,
+        "INT1": TokenType.TINYINT,
+        "TINYINT": TokenType.TINYINT,
+        "INT16": TokenType.SMALLINT,
+        "SHORT": TokenType.SMALLINT,
+        "SMALLINT": TokenType.SMALLINT,
+        "HUGEINT": TokenType.INT128,
+        "UHUGEINT": TokenType.UINT128,
+        "INT2": TokenType.SMALLINT,
+        "INTEGER": TokenType.INT,
+        "INT": TokenType.INT,
+        "INT4": TokenType.INT,
+        "INT32": TokenType.INT,
+        "INT64": TokenType.BIGINT,
+        "INT128": TokenType.INT128,
+        "INT256": TokenType.INT256,
+        "LONG": TokenType.BIGINT,
+        "BIGINT": TokenType.BIGINT,
+        "INT8": TokenType.TINYINT,
+        "UINT": TokenType.UINT,
+        "UINT128": TokenType.UINT128,
+        "UINT256": TokenType.UINT256,
+        "DEC": TokenType.DECIMAL,
+        "DECIMAL": TokenType.DECIMAL,
+        "DECIMAL32": TokenType.DECIMAL32,
+        "DECIMAL64": TokenType.DECIMAL64,
+        "DECIMAL128": TokenType.DECIMAL128,
+        "DECIMAL256": TokenType.DECIMAL256,
+        "BIGDECIMAL": TokenType.BIGDECIMAL,
+        "BIGNUMERIC": TokenType.BIGDECIMAL,
+        "LIST": TokenType.LIST,
+        "MAP": TokenType.MAP,
+        "NULLABLE": TokenType.NULLABLE,
+        "NUMBER": TokenType.DECIMAL,
+        "NUMERIC": TokenType.DECIMAL,
+        "FIXED": TokenType.DECIMAL,
+        "REAL": TokenType.FLOAT,
+        "FLOAT": TokenType.FLOAT,
+        "FLOAT4": TokenType.FLOAT,
+        "FLOAT8": TokenType.DOUBLE,
+        "DOUBLE": TokenType.DOUBLE,
+        "DOUBLE PRECISION": TokenType.DOUBLE,
+        "JSON": TokenType.JSON,
+        "JSONB": TokenType.JSONB,
+        "CHAR": TokenType.CHAR,
+        "CHARACTER": TokenType.CHAR,
+        "CHAR VARYING": TokenType.VARCHAR,
+        "CHARACTER VARYING": TokenType.VARCHAR,
+        "NCHAR": TokenType.NCHAR,
+        "VARCHAR": TokenType.VARCHAR,
+        "VARCHAR2": TokenType.VARCHAR,
+        "NVARCHAR": TokenType.NVARCHAR,
+        "NVARCHAR2": TokenType.NVARCHAR,
+        "BPCHAR": TokenType.BPCHAR,
+        "STR": TokenType.TEXT,
+        "STRING": TokenType.TEXT,
+        "TEXT": TokenType.TEXT,
+        "LONGTEXT": TokenType.LONGTEXT,
+        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
+        "TINYTEXT": TokenType.TINYTEXT,
+        "CLOB": TokenType.TEXT,
+        "LONGVARCHAR": TokenType.TEXT,
+        "BINARY": TokenType.BINARY,
+        "BLOB": TokenType.VARBINARY,
+        "LONGBLOB": TokenType.LONGBLOB,
+        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
+        "TINYBLOB": TokenType.TINYBLOB,
+        "BYTEA": TokenType.VARBINARY,
+        "VARBINARY": TokenType.VARBINARY,
+        "TIME": TokenType.TIME,
+        "TIMETZ": TokenType.TIMETZ,
+        "TIMESTAMP": TokenType.TIMESTAMP,
+        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
+        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
+        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
+        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
+        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
+        "DATE": TokenType.DATE,
+        "DATETIME": TokenType.DATETIME,
+        "INT4RANGE": TokenType.INT4RANGE,
+        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
+        "INT8RANGE": TokenType.INT8RANGE,
+        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
+        "NUMRANGE": TokenType.NUMRANGE,
+        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
+        "TSRANGE": TokenType.TSRANGE,
+        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
+        "TSTZRANGE": TokenType.TSTZRANGE,
+        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
+        "DATERANGE": TokenType.DATERANGE,
+        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
+        "UNIQUE": TokenType.UNIQUE,
+        "VECTOR": TokenType.VECTOR,
+        "STRUCT": TokenType.STRUCT,
+        "SEQUENCE": TokenType.SEQUENCE,
+        "VARIANT": TokenType.VARIANT,
+        "ALTER": TokenType.ALTER,
+        "ANALYZE": TokenType.ANALYZE,
+        "CALL": TokenType.COMMAND,
+        "COMMENT": TokenType.COMMENT,
+        "EXPLAIN": TokenType.COMMAND,
+        "GRANT": TokenType.GRANT,
+        "OPTIMIZE": TokenType.COMMAND,
+        "PREPARE": TokenType.COMMAND,
+        "VACUUM": TokenType.COMMAND,
+        "USER-DEFINED": TokenType.USERDEFINED,
+        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
+        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
+    }
+
+    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
+        " ": TokenType.SPACE,
+        "\t": TokenType.SPACE,
+        "\n": TokenType.BREAK,
+        "\r": TokenType.BREAK,
+    }
+
+    COMMANDS = {
+        TokenType.COMMAND,
+        TokenType.EXECUTE,
+        TokenType.FETCH,
+        TokenType.SHOW,
+        TokenType.RENAME,
+    }
+
+    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
+
+    # Handle numeric literals like in hive (3L = BIGINT)
+    NUMERIC_LITERALS: t.Dict[str, str] = {}
+
+    COMMENTS = ["--", ("/*", "*/")]
+
+    __slots__ = (
+        "sql",
+        "size",
+        "tokens",
+        "dialect",
+        "use_rs_tokenizer",
+        "_start",
+        "_current",
+        "_line",
+        "_col",
+        "_comments",
+        "_char",
+        "_end",
+        "_peek",
+        "_prev_token_line",
+        "_rs_dialect_settings",
+    )
+
+    def __init__(
+        self, dialect: DialectType = None, use_rs_tokenizer: t.Optional[bool] = None
+    ) -> None:
+        from sqlglot.dialects import Dialect
+
+        self.dialect = Dialect.get_or_raise(dialect)
+
+        # initialize `use_rs_tokenizer`, and allow it to be overwritten per Tokenizer instance
+        self.use_rs_tokenizer = (
+            use_rs_tokenizer if use_rs_tokenizer is not None else USE_RS_TOKENIZER
+        )
+
+        if self.use_rs_tokenizer:
+            self._rs_dialect_settings = RsTokenizerDialectSettings(
+                unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
+                identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
+                numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
+            )
+
+        self.reset()
+
+    def reset(self) -> None:
+        self.sql = ""
+        self.size = 0
+        self.tokens: t.List[Token] = []
+        self._start = 0
+        self._current = 0
+        self._line = 1
+        self._col = 0
+        self._comments: t.List[str] = []
+
+        self._char = ""
+        self._end = False
+        self._peek = ""
+        self._prev_token_line = -1
+
+    def tokenize(self, sql: str) -> t.List[Token]:
+        """Returns a list of tokens corresponding to the SQL string `sql`."""
+        if self.use_rs_tokenizer:
+            return self.tokenize_rs(sql)
+
+        self.reset()
+        self.sql = sql
+        self.size = len(sql)
+
+        try:
+            self._scan()
+        except Exception as e:
+            start = max(self._current - 50, 0)
+            end = min(self._current + 50, self.size - 1)
+            context = self.sql[start:end]
+            raise TokenError(f"Error tokenizing '{context}'") from e
+
+        return self.tokens
+
+    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
+        while self.size and not self._end:
+            current = self._current
+
+            # Skip spaces here rather than iteratively calling advance() for performance reasons
+            while current < self.size:
+                char = self.sql[current]
+
+                if char.isspace() and (char == " " or char == "\t"):
+                    current += 1
+                else:
+                    break
+
+            offset = current - self._current if current > self._current else 1
+
+            self._start = current
+            self._advance(offset)
+
+            if not self._char.isspace():
+                if self._char.isdigit():
+                    self._scan_number()
+                elif self._char in self._IDENTIFIERS:
+                    self._scan_identifier(self._IDENTIFIERS[self._char])
+                else:
+                    self._scan_keywords()
+
+            if until and until():
+                break
+
+        if self.tokens and self._comments:
+            self.tokens[-1].comments.extend(self._comments)
+
+    def _chars(self, size: int) -> str:
+        if size == 1:
+            return self._char
+
+        start = self._current - 1
+        end = start + size
+
+        return self.sql[start:end] if end <= self.size else ""
+
+    def _advance(self, i: int = 1, alnum: bool = False) -> None:
+        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
+            # Ensures we don't count an extra line if we get a \r\n line break sequence
+            if not (self._char == "\r" and self._peek == "\n"):
+                self._col = i
+                self._line += 1
+        else:
+            self._col += i
+
+        self._current += i
+        self._end = self._current >= self.size
+        self._char = self.sql[self._current - 1]
+        self._peek = "" if self._end else self.sql[self._current]
+
+        if alnum and self._char.isalnum():
+            # Here we use local variables instead of attributes for better performance
+            _col = self._col
+            _current = self._current
+            _end = self._end
+            _peek = self._peek
+
+            while _peek.isalnum():
+                _col += 1
+                _current += 1
+                _end = _current >= self.size
+                _peek = "" if _end else self.sql[_current]
+
+            self._col = _col
+            self._current = _current
+            self._end = _end
+            self._peek = _peek
+            self._char = self.sql[_current - 1]
+
+    @property
+    def _text(self) -> str:
+        return self.sql[self._start : self._current]
+
+    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
+        self._prev_token_line = self._line
+
+        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
+            self.tokens[-1].comments.extend(self._comments)
+            self._comments = []
+
+        self.tokens.append(
+            Token(
+                token_type,
+                text=self._text if text is None else text,
+                line=self._line,
+                col=self._col,
+                start=self._start,
+                end=self._current - 1,
+                comments=self._comments,
+            )
+        )
+        self._comments = []
+
+        # If we have either a semicolon or a begin token before the command's token, we'll parse
+        # whatever follows the command's token as a string
+        if (
+            token_type in self.COMMANDS
+            and self._peek != ";"
+            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
+        ):
+            start = self._current
+            tokens = len(self.tokens)
+            self._scan(lambda: self._peek == ";")
+            self.tokens = self.tokens[:tokens]
+            text = self.sql[start : self._current].strip()
+            if text:
+                self._add(TokenType.STRING, text)
+
+    def _scan_keywords(self) -> None:
+        size = 0
+        word = None
+        chars = self._text
+        char = chars
+        prev_space = False
+        skip = False
+        trie = self._KEYWORD_TRIE
+        single_token = char in self.SINGLE_TOKENS
+
+        while chars:
+            if skip:
+                result = TrieResult.PREFIX
+            else:
+                result, trie = in_trie(trie, char.upper())
+
+            if result == TrieResult.FAILED:
+                break
+            if result == TrieResult.EXISTS:
+                word = chars
+
+            end = self._current + size
+            size += 1
+
+            if end < self.size:
+                char = self.sql[end]
+                single_token = single_token or char in self.SINGLE_TOKENS
+                is_space = char.isspace()
+
+                if not is_space or not prev_space:
+                    if is_space:
+                        char = " "
+                    chars += char
+                    prev_space = is_space
+                    skip = False
+                else:
+                    skip = True
+            else:
+                char = ""
+                break
+
+        if word:
+            if self._scan_string(word):
+                return
+            if self._scan_comment(word):
+                return
+            if prev_space or single_token or not char:
+                self._advance(size - 1)
+                word = word.upper()
+                self._add(self.KEYWORDS[word], text=word)
+                return
+
+        if self._char in self.SINGLE_TOKENS:
+            self._add(self.SINGLE_TOKENS[self._char], text=self._char)
+            return
+
+        self._scan_var()
+
+    def _scan_comment(self, comment_start: str) -> bool:
+        if comment_start not in self._COMMENTS:
+            return False
+
+        comment_start_line = self._line
+        comment_start_size = len(comment_start)
+        comment_end = self._COMMENTS[comment_start]
+
+        if comment_end:
+            # Skip the comment's start delimiter
+            self._advance(comment_start_size)
+
+            comment_count = 1
+            comment_end_size = len(comment_end)
+
+            while not self._end:
+                if self._chars(comment_end_size) == comment_end:
+                    comment_count -= 1
+                    if not comment_count:
+                        break
+
+                self._advance(alnum=True)
+
+                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
+                if (
+                    self.NESTED_COMMENTS
+                    and not self._end
+                    and self._chars(comment_end_size) == comment_start
+                ):
+                    self._advance(comment_start_size)
+                    comment_count += 1
+
+            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
+            self._advance(comment_end_size - 1)
+        else:
+            while not self._end and self.WHITE_SPACE.get(self._peek) is not TokenType.BREAK:
+                self._advance(alnum=True)
+            self._comments.append(self._text[comment_start_size:])
+
+        if (
+            comment_start == self.HINT_START
+            and self.tokens
+            and self.tokens[-1].token_type in self.TOKENS_PRECEDING_HINT
+        ):
+            self._add(TokenType.HINT)
+
+        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
+        # Multiple consecutive comments are preserved by appending them to the current comments list.
+        if comment_start_line == self._prev_token_line:
+            self.tokens[-1].comments.extend(self._comments)
+            self._comments = []
+            self._prev_token_line = self._line
+
+        return True
+
+    def _scan_number(self) -> None:
+        if self._char == "0":
+            peek = self._peek.upper()
+            if peek == "B":
+                return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER)
+            elif peek == "X":
+                return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER)
+
+        decimal = False
+        scientific = 0
+
+        while True:
+            if self._peek.isdigit():
+                self._advance()
+            elif self._peek == "." and not decimal:
+                if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER:
+                    return self._add(TokenType.NUMBER)
+                decimal = True
+                self._advance()
+            elif self._peek in ("-", "+") and scientific == 1:
+                scientific += 1
+                self._advance()
+            elif self._peek.upper() == "E" and not scientific:
+                scientific += 1
+                self._advance()
+            elif self._peek.isidentifier():
+                number_text = self._text
+                literal = ""
+
+                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
+                    literal += self._peek
+                    self._advance()
+
+                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal.upper(), ""))
+
+                if token_type:
+                    self._add(TokenType.NUMBER, number_text)
+                    self._add(TokenType.DCOLON, "::")
+                    return self._add(token_type, literal)
+                else:
+                    replaced = literal.replace("_", "")
+                    if self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED and replaced.isdigit():
+                        return self._add(TokenType.NUMBER, number_text + replaced)
+                    if self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
+                        return self._add(TokenType.VAR)
+
+                    self._advance(-len(literal))
+                    return self._add(TokenType.NUMBER, number_text)
+            else:
+                return self._add(TokenType.NUMBER)
+
+    def _scan_bits(self) -> None:
+        self._advance()
+        value = self._extract_value()
+        try:
+            # If `value` can't be converted to binary, fall back to tokenizing it as an identifier
+            int(value, 2)
+            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
+        except ValueError:
+            self._add(TokenType.IDENTIFIER)
+
+    def _scan_hex(self) -> None:
+        self._advance()
+        value = self._extract_value()
+        try:
+            # If `value` can't be converted to hex, fall back to tokenizing it as an identifier
1363
+ int(value, 16)
1364
+ self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x
1365
+ except ValueError:
1366
+ self._add(TokenType.IDENTIFIER)
1367
+
1368
+ def _extract_value(self) -> str:
1369
+ while True:
1370
+ char = self._peek.strip()
1371
+ if char and char not in self.SINGLE_TOKENS:
1372
+ self._advance(alnum=True)
1373
+ else:
1374
+ break
1375
+
1376
+ return self._text
1377
+
1378
+ def _scan_string(self, start: str) -> bool:
1379
+ base = None
1380
+ token_type = TokenType.STRING
1381
+
1382
+ if start in self._QUOTES:
1383
+ end = self._QUOTES[start]
1384
+ elif start in self._FORMAT_STRINGS:
1385
+ end, token_type = self._FORMAT_STRINGS[start]
1386
+
1387
+ if token_type == TokenType.HEX_STRING:
1388
+ base = 16
1389
+ elif token_type == TokenType.BIT_STRING:
1390
+ base = 2
1391
+ elif token_type == TokenType.HEREDOC_STRING:
1392
+ self._advance()
1393
+
1394
+ if self._char == end:
1395
+ tag = ""
1396
+ else:
1397
+ tag = self._extract_string(
1398
+ end,
1399
+ raw_string=True,
1400
+ raise_unmatched=not self.HEREDOC_TAG_IS_IDENTIFIER,
1401
+ )
1402
+
1403
+ if tag and self.HEREDOC_TAG_IS_IDENTIFIER and (self._end or not tag.isidentifier()):
1404
+ if not self._end:
1405
+ self._advance(-1)
1406
+
1407
+ self._advance(-len(tag))
1408
+ self._add(self.HEREDOC_STRING_ALTERNATIVE)
1409
+ return True
1410
+
1411
+ end = f"{start}{tag}{end}"
1412
+ else:
1413
+ return False
1414
+
1415
+ self._advance(len(start))
1416
+ text = self._extract_string(end, raw_string=token_type == TokenType.RAW_STRING)
1417
+
1418
+ if base:
1419
+ try:
1420
+ int(text, base)
1421
+ except Exception:
1422
+ raise TokenError(
1423
+ f"Numeric string contains invalid characters from {self._line}:{self._start}"
1424
+ )
1425
+
1426
+ self._add(token_type, text)
1427
+ return True
1428
+
1429
+ def _scan_identifier(self, identifier_end: str) -> None:
1430
+ self._advance()
1431
+ text = self._extract_string(
1432
+ identifier_end, escapes=self._IDENTIFIER_ESCAPES | {identifier_end}
1433
+ )
1434
+ self._add(TokenType.IDENTIFIER, text)
1435
+
1436
+ def _scan_var(self) -> None:
1437
+ while True:
1438
+ char = self._peek.strip()
1439
+ if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
1440
+ self._advance(alnum=True)
1441
+ else:
1442
+ break
1443
+
1444
+ self._add(
1445
+ TokenType.VAR
1446
+ if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1447
+ else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
1448
+ )
1449
+
1450
+ def _extract_string(
1451
+ self,
1452
+ delimiter: str,
1453
+ escapes: t.Optional[t.Set[str]] = None,
1454
+ raw_string: bool = False,
1455
+ raise_unmatched: bool = True,
1456
+ ) -> str:
1457
+ text = ""
1458
+ delim_size = len(delimiter)
1459
+ escapes = self._STRING_ESCAPES if escapes is None else escapes
1460
+
1461
+ while True:
1462
+ if (
1463
+ not raw_string
1464
+ and self.dialect.UNESCAPED_SEQUENCES
1465
+ and self._peek
1466
+ and self._char in self.STRING_ESCAPES
1467
+ ):
1468
+ unescaped_sequence = self.dialect.UNESCAPED_SEQUENCES.get(self._char + self._peek)
1469
+ if unescaped_sequence:
1470
+ self._advance(2)
1471
+ text += unescaped_sequence
1472
+ continue
1473
+ if (
1474
+ (self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS or not raw_string)
1475
+ and self._char in escapes
1476
+ and (self._peek == delimiter or self._peek in escapes)
1477
+ and (self._char not in self._QUOTES or self._char == self._peek)
1478
+ ):
1479
+ if self._peek == delimiter:
1480
+ text += self._peek
1481
+ else:
1482
+ text += self._char + self._peek
1483
+
1484
+ if self._current + 1 < self.size:
1485
+ self._advance(2)
1486
+ else:
1487
+ raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
1488
+ else:
1489
+ if self._chars(delim_size) == delimiter:
1490
+ if delim_size > 1:
1491
+ self._advance(delim_size - 1)
1492
+ break
1493
+
1494
+ if self._end:
1495
+ if not raise_unmatched:
1496
+ return text + self._char
1497
+
1498
+ raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
1499
+
1500
+ current = self._current - 1
1501
+ self._advance(alnum=True)
1502
+ text += self.sql[current : self._current - 1]
1503
+
1504
+ return text
1505
+
1506
+ def tokenize_rs(self, sql: str) -> t.List[Token]:
1507
+ if not self._RS_TOKENIZER:
1508
+ raise SqlglotError("Rust tokenizer is not available")
1509
+
1510
+ tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
1511
+ for token in tokens:
1512
+ token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
1513
+
1514
+ # Setting this here so partial token lists can be inspected even if there is a failure
1515
+ self.tokens = tokens
1516
+
1517
+ if error_msg is not None:
1518
+ raise TokenError(error_msg)
1519
+
1520
+ return tokens
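
For orientation, here is a minimal sketch of how the tokenizer vendored above could be exercised. The import path is an assumption based on this package's dist layout: the module refers to itself as `sqlglot`, so its parent directory (package/dbt-tools/dist/altimate_python_packages/altimate_packages) would need to be on `sys.path`. Nothing in the diff itself prescribes this usage.

    # Hypothetical usage of the vendored sqlglot tokenizer shipped in 0.5.3.
    # Assumes altimate_packages/ is importable so the vendored package resolves
    # as `sqlglot`, matching this module's own imports.
    from sqlglot.tokens import Tokenizer, TokenType

    tokens = Tokenizer().tokenize("SELECT a + 1 AS b FROM t")

    # Each Token carries its type, text, and line/col/start/end positions.
    assert tokens[0].token_type is TokenType.SELECT
    print([(tok.token_type.name, tok.text) for tok in tokens])

Since the package does not ship `sqlglotrs`, USE_RS_TOKENIZER is False and this path exercises the pure-Python scanner defined in this file.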